pixeltable 0.2.21__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

This version of pixeltable has been flagged as potentially problematic.

Files changed (82)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/__init__.py +1 -1
  3. pixeltable/catalog/column.py +37 -11
  4. pixeltable/catalog/globals.py +18 -0
  5. pixeltable/catalog/insertable_table.py +6 -4
  6. pixeltable/catalog/table.py +19 -3
  7. pixeltable/catalog/table_version.py +34 -14
  8. pixeltable/catalog/view.py +16 -17
  9. pixeltable/dataframe.py +7 -8
  10. pixeltable/env.py +5 -0
  11. pixeltable/exec/__init__.py +0 -1
  12. pixeltable/exec/aggregation_node.py +6 -3
  13. pixeltable/exec/cache_prefetch_node.py +1 -1
  14. pixeltable/exec/data_row_batch.py +2 -19
  15. pixeltable/exec/exec_node.py +2 -1
  16. pixeltable/exec/expr_eval_node.py +17 -10
  17. pixeltable/exec/in_memory_data_node.py +6 -3
  18. pixeltable/exec/sql_node.py +24 -25
  19. pixeltable/exprs/arithmetic_expr.py +3 -1
  20. pixeltable/exprs/array_slice.py +7 -7
  21. pixeltable/exprs/column_property_ref.py +37 -10
  22. pixeltable/exprs/column_ref.py +93 -14
  23. pixeltable/exprs/comparison.py +5 -5
  24. pixeltable/exprs/compound_predicate.py +8 -7
  25. pixeltable/exprs/data_row.py +27 -18
  26. pixeltable/exprs/expr.py +53 -52
  27. pixeltable/exprs/expr_set.py +5 -0
  28. pixeltable/exprs/function_call.py +32 -16
  29. pixeltable/exprs/globals.py +4 -1
  30. pixeltable/exprs/in_predicate.py +8 -7
  31. pixeltable/exprs/inline_expr.py +4 -4
  32. pixeltable/exprs/is_null.py +4 -4
  33. pixeltable/exprs/json_mapper.py +11 -12
  34. pixeltable/exprs/json_path.py +5 -10
  35. pixeltable/exprs/literal.py +5 -5
  36. pixeltable/exprs/method_ref.py +5 -4
  37. pixeltable/exprs/object_ref.py +2 -1
  38. pixeltable/exprs/row_builder.py +88 -36
  39. pixeltable/exprs/rowid_ref.py +12 -11
  40. pixeltable/exprs/similarity_expr.py +12 -7
  41. pixeltable/exprs/sql_element_cache.py +7 -5
  42. pixeltable/exprs/type_cast.py +8 -6
  43. pixeltable/exprs/variable.py +5 -4
  44. pixeltable/func/aggregate_function.py +1 -1
  45. pixeltable/func/function.py +11 -10
  46. pixeltable/functions/__init__.py +2 -2
  47. pixeltable/functions/globals.py +5 -7
  48. pixeltable/functions/huggingface.py +19 -20
  49. pixeltable/functions/llama_cpp.py +106 -0
  50. pixeltable/functions/ollama.py +147 -0
  51. pixeltable/functions/replicate.py +72 -0
  52. pixeltable/functions/string.py +9 -0
  53. pixeltable/globals.py +12 -20
  54. pixeltable/index/btree.py +16 -3
  55. pixeltable/index/embedding_index.py +4 -4
  56. pixeltable/io/__init__.py +1 -2
  57. pixeltable/io/fiftyone.py +178 -0
  58. pixeltable/io/globals.py +96 -2
  59. pixeltable/iterators/base.py +3 -2
  60. pixeltable/iterators/document.py +1 -1
  61. pixeltable/iterators/video.py +120 -63
  62. pixeltable/metadata/__init__.py +1 -1
  63. pixeltable/metadata/converters/convert_21.py +34 -0
  64. pixeltable/metadata/converters/util.py +45 -4
  65. pixeltable/metadata/notes.py +1 -0
  66. pixeltable/metadata/schema.py +8 -0
  67. pixeltable/plan.py +16 -14
  68. pixeltable/py.typed +0 -0
  69. pixeltable/store.py +7 -2
  70. pixeltable/tool/create_test_video.py +1 -1
  71. pixeltable/tool/embed_udf.py +1 -1
  72. pixeltable/tool/mypy_plugin.py +28 -5
  73. pixeltable/type_system.py +17 -1
  74. pixeltable/utils/documents.py +15 -1
  75. pixeltable/utils/formatter.py +9 -10
  76. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/METADATA +46 -10
  77. pixeltable-0.2.22.dist-info/RECORD +153 -0
  78. pixeltable/exec/media_validation_node.py +0 -43
  79. pixeltable-0.2.21.dist-info/RECORD +0 -148
  80. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
  81. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
  82. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0

pixeltable/exec/expr_eval_node.py
@@ -7,7 +7,7 @@ from typing import Iterable, List, Optional
 
 from tqdm import TqdmWarning, tqdm
 
-import pixeltable.exprs as exprs
+from pixeltable import exprs
 from pixeltable.func import CallableFunction
 
 from .data_row_batch import DataRowBatch
@@ -22,7 +22,7 @@ class ExprEvalNode(ExecNode):
     @dataclass
     class Cohort:
         """List of exprs that form an evaluation context and contain calls to at most one external function"""
-        exprs: List[exprs.Expr]
+        exprs_: List[exprs.Expr]
         batched_fn: Optional[CallableFunction]
         segment_ctxs: List['exprs.RowBuilder.EvalCtx']
         target_slot_idxs: List[int]
@@ -38,7 +38,7 @@ class ExprEvalNode(ExecNode):
         # we're only materializing exprs that are not already in the input
         self.target_exprs = [e for e in output_exprs if e.slot_idx not in input_slot_idxs]
         self.pbar: Optional[tqdm] = None
-        self.cohorts: List[List[ExprEvalNode.Cohort]] = []
+        self.cohorts: List[ExprEvalNode.Cohort] = []
         self._create_cohorts()
 
     def __next__(self) -> DataRowBatch:
@@ -88,6 +88,8 @@ class ExprEvalNode(ExecNode):
         for e in all_exprs:
             if not self._is_batched_fn_call(e):
                 continue
+            assert isinstance(e, exprs.FunctionCall)
+            assert isinstance(e.fn, CallableFunction)
             if current_batched_fn is None or current_batched_fn != e.fn:
                 # create a new cohort
                 cohorts.append([])
@@ -96,8 +98,8 @@ class ExprEvalNode(ExecNode):
 
         # expand the cohorts to include all exprs that are in the same evaluation context as the external calls;
         # cohorts are evaluated in order, so we can exclude the target slots from preceding cohorts and input slots
-        exclude = set([e.slot_idx for e in self.input_exprs])
-        all_target_slot_idxs = set([e.slot_idx for e in self.target_exprs])
+        exclude = set(e.slot_idx for e in self.input_exprs)
+        all_target_slot_idxs = set(e.slot_idx for e in self.target_exprs)
         target_slot_idxs: List[List[int]] = []  # the ones materialized by each cohort
         for i in range(len(cohorts)):
             cohorts[i] = self.row_builder.get_dependencies(
@@ -106,7 +108,7 @@ class ExprEvalNode(ExecNode):
                 [e.slot_idx for e in cohorts[i] if e.slot_idx in all_target_slot_idxs])
             exclude.update(target_slot_idxs[-1])
 
-        all_cohort_slot_idxs = set([e.slot_idx for cohort in cohorts for e in cohort])
+        all_cohort_slot_idxs = set(e.slot_idx for cohort in cohorts for e in cohort)
         remaining_slot_idxs = set(all_target_slot_idxs) - all_cohort_slot_idxs
         if len(remaining_slot_idxs) > 0:
             cohorts.append(self.row_builder.get_dependencies(
@@ -164,9 +166,10 @@ class ExprEvalNode(ExecNode):
                     rows[row_idx], segment_ctx, self.ctx.profile, ignore_errors=self.ctx.ignore_errors)
             else:
                 fn_call = segment_ctx.exprs[0]
+                assert isinstance(fn_call, exprs.FunctionCall)
                 # make a batched external function call
-                arg_batches = [[] for _ in range(len(fn_call.args))]
-                kwarg_batches = {k: [] for k in fn_call.kwargs.keys()}
+                arg_batches: list[list[exprs.Expr]] = [[] for _ in range(len(fn_call.args))]
+                kwarg_batches: dict[str, list[exprs.Expr]] = {k: [] for k in fn_call.kwargs.keys()}
 
                 valid_batch_idxs: List[int] = []  # rows with exceptions are not valid
                 for row_idx in range(batch_start_idx, batch_start_idx + num_batch_rows):
@@ -176,12 +179,15 @@ class ExprEvalNode(ExecNode):
                        continue
                    valid_batch_idxs.append(row_idx)
                    args, kwargs = fn_call._make_args(row)
-                    [arg_batches[i].append(args[i]) for i in range(len(args))]
-                    [kwarg_batches[k].append(kwargs[k]) for k in kwargs.keys()]
+                    for i in range(len(args)):
+                        arg_batches[i].append(args[i])
+                    for k in kwargs.keys():
+                        kwarg_batches[k].append(kwargs[k])
                 num_valid_batch_rows = len(valid_batch_idxs)
 
                 if ext_batch_size is None:
                     # we need to choose a batch size based on the args
+                    assert isinstance(fn_call.fn, CallableFunction)
                     sample_args = [arg_batches[i][0] for i in range(len(arg_batches))]
                     ext_batch_size = fn_call.fn.get_batch_size(*sample_args)
 
@@ -201,6 +207,7 @@ class ExprEvalNode(ExecNode):
                        for k in kwarg_batches.keys()
                    }
                    start_ts = time.perf_counter()
+                   assert isinstance(fn_call.fn, CallableFunction)
                    result_batch = fn_call.fn.exec_batch(*call_args, **call_kwargs)
                    self.ctx.profile.eval_time[fn_call.slot_idx] += time.perf_counter() - start_ts
                    self.ctx.profile.eval_count[fn_call.slot_idx] += num_ext_batch_rows
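
The change above also replaces list-comprehension side effects with plain loops when assembling the per-argument batches for a batched UDF call. The pattern itself is simple: collect each row's argument values into one list per argument, then make a single batched call instead of one call per row. A minimal standalone sketch of that pattern (the exec_batch stand-in and the row structure below are illustrative, not Pixeltable's actual API):

from typing import Any

def exec_batch(texts: list[str]) -> list[int]:
    # stand-in for a batched UDF: processes all rows in one call
    return [len(t) for t in texts]

rows: list[dict[str, Any]] = [{'text': 'a'}, {'text': 'bcd'}, {'text': 'ef'}]

# one list per positional argument, mirroring arg_batches above
arg_batches: list[list[Any]] = [[]]
for row in rows:
    arg_batches[0].append(row['text'])

# a single batched call replaces one call per row
results = exec_batch(arg_batches[0])
assert results == [1, 3, 2]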

pixeltable/exec/in_memory_data_node.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Optional, Iterator
+from typing import Any, Iterator, Optional
 
 import pixeltable.catalog as catalog
 import pixeltable.exprs as exprs
@@ -23,12 +23,15 @@ class InMemoryDataNode(ExecNode):
     start_row_id: int
     output_rows: Optional[DataRowBatch]
 
+    # output_exprs is declared in the superclass, but we redeclare it here with a more specific type
+    output_exprs: list[exprs.ColumnRef]
+
     def __init__(
         self, tbl: catalog.TableVersion, rows: list[dict[str, Any]],
         row_builder: exprs.RowBuilder, start_row_id: int,
     ):
-        # we materialize all output slots
-        output_exprs = [e for e in row_builder.get_output_exprs() if isinstance(e, exprs.ColumnRef)]
+        # we materialize the input slots
+        output_exprs = list(row_builder.input_exprs)
         super().__init__(row_builder, output_exprs, [], None)
         assert tbl.is_insertable()
         self.tbl = tbl

pixeltable/exec/sql_node.py
@@ -1,13 +1,14 @@
 import logging
 import warnings
 from decimal import Decimal
-from typing import Optional, Iterable, Iterator, NamedTuple
+from typing import Iterable, Iterator, NamedTuple, Optional
 from uuid import UUID
 
 import sqlalchemy as sql
 
 import pixeltable.catalog as catalog
 import pixeltable.exprs as exprs
+
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
 
@@ -100,7 +101,7 @@ class SqlNode(ExecNode):
         # minimize the number of tables that need to be joined to the target table
         self.retarget_rowid_refs(tbl, self.select_list)
 
-        assert self.sql_elements.contains(self.select_list)
+        assert self.sql_elements.contains_all(self.select_list)
         self.set_pk = set_pk
         self.num_pk_cols = 0
         if set_pk:
@@ -120,13 +121,13 @@ class SqlNode(ExecNode):
     def _create_stmt(self) -> sql.Select:
         """Create Select from local state"""
 
-        assert self.sql_elements.contains(self.select_list)
+        assert self.sql_elements.contains_all(self.select_list)
         sql_select_list = [self.sql_elements.get(e) for e in self.select_list]
         if self.set_pk:
             sql_select_list += self.tbl.tbl_version.store_tbl.pk_columns()
         stmt = sql.select(*sql_select_list)
 
-        order_by_clause: list[sql.ClauseElement] = []
+        order_by_clause: list[sql.ColumnElement] = []
         for e, asc in self.order_by_clause:
             if isinstance(e, exprs.SimilarityExpr):
                 order_by_clause.append(e.as_order_by_clause(asc))
@@ -141,7 +142,7 @@ class SqlNode(ExecNode):
         return stmt
 
     def _ordering_tbl_ids(self) -> set[UUID]:
-        return exprs.Expr.list_tbl_ids(e for e, _ in self.order_by_clause)
+        return exprs.Expr.all_tbl_ids(e for e, _ in self.order_by_clause)
 
     def to_cte(self) -> Optional[tuple[sql.CTE, exprs.ExprDict[sql.ColumnElement]]]:
         """
@@ -182,9 +183,9 @@ class SqlNode(ExecNode):
         """
         # we need to include at least the root
         if refd_tbl_ids is None:
-            refd_tbl_ids = {}
+            refd_tbl_ids = set()
         if exact_version_only is None:
-            exact_version_only = {}
+            exact_version_only = set()
         candidates = tbl.get_tbl_versions()
         assert len(candidates) > 0
         joined_tbls: list[catalog.TableVersion] = [candidates[0]]
@@ -193,6 +194,7 @@ class SqlNode(ExecNode):
             joined_tbls.append(tbl)
 
         first = True
+        prev_tbl: catalog.TableVersion
         for tbl in joined_tbls[::-1]:
            if first:
                stmt = stmt.select_from(tbl.store_tbl.sa_tbl)
@@ -239,22 +241,19 @@ class SqlNode(ExecNode):
     def __iter__(self) -> Iterator[DataRowBatch]:
         # run the query; do this here rather than in _open(), exceptions are only expected during iteration
         assert self.ctx.conn is not None
-        try:
-            with warnings.catch_warnings(record=True) as w:
-                stmt = self._create_stmt()
-                try:
-                    # log stmt, if possible
-                    stmt_str = str(stmt.compile(compile_kwargs={'literal_binds': True}))
-                    _logger.debug(f'SqlLookupNode stmt:\n{stmt_str}')
-                except Exception as e:
-                    pass
-                self._log_explain(stmt)
-
-                result_cursor = self.ctx.conn.execute(stmt)
-                for warning in w:
-                    pass
-        except Exception as e:
-            raise e
+        with warnings.catch_warnings(record=True) as w:
+            stmt = self._create_stmt()
+            try:
+                # log stmt, if possible
+                stmt_str = str(stmt.compile(compile_kwargs={'literal_binds': True}))
+                _logger.debug(f'SqlLookupNode stmt:\n{stmt_str}')
+            except Exception:
+                pass
+            self._log_explain(stmt)
+
+            result_cursor = self.ctx.conn.execute(stmt)
+            for warning in w:
+                pass
 
         tbl_version = self.tbl.tbl_version if self.tbl is not None else None
         output_batch = DataRowBatch(tbl_version, self.row_builder)
@@ -350,7 +349,7 @@ class SqlScanNode(SqlNode):
     def _create_stmt(self) -> sql.Select:
         stmt = super()._create_stmt()
         where_clause_tbl_ids = self.where_clause.tbl_ids() if self.where_clause is not None else set()
-        refd_tbl_ids = exprs.Expr.list_tbl_ids(self.select_list) | where_clause_tbl_ids | self._ordering_tbl_ids()
+        refd_tbl_ids = exprs.Expr.all_tbl_ids(self.select_list) | where_clause_tbl_ids | self._ordering_tbl_ids()
         stmt = self.create_from_clause(
             self.tbl, stmt, refd_tbl_ids, exact_version_only={t.id for t in self.exact_version_only})
 
@@ -386,7 +385,7 @@ class SqlLookupNode(SqlNode):
 
     def _create_stmt(self) -> sql.Select:
         stmt = super()._create_stmt()
-        refd_tbl_ids = exprs.Expr.list_tbl_ids(self.select_list) | self._ordering_tbl_ids()
+        refd_tbl_ids = exprs.Expr.all_tbl_ids(self.select_list) | self._ordering_tbl_ids()
         stmt = self.create_from_clause(self.tbl, stmt, refd_tbl_ids)
         stmt = stmt.where(self.where_clause)
         return stmt
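
Note that replacing the {} defaults with set() in create_from_clause is a genuine bug fix: a bare {} literal in Python creates an empty dict, not an empty set, so set operations on the default value would misbehave. A quick illustration:

empty = {}
assert isinstance(empty, dict)      # {} is an empty dict ...
assert not isinstance(empty, set)   # ... not an empty set

refd_tbl_ids = set()                # what the fixed code uses
assert isinstance(refd_tbl_ids, set)
assert refd_tbl_ids | {1, 2} == {1, 2}   # set operators now work as intended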

pixeltable/exprs/arithmetic_expr.py
@@ -6,6 +6,7 @@ import sqlalchemy as sql
 
 import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
+
 from .data_row import DataRow
 from .expr import Expr
 from .globals import ArithmeticOperator
@@ -86,6 +87,7 @@ class ArithmeticExpr(Expr):
             return sql.sql.expression.cast(sql.func.floor(left / right), sql.Integer)
         if self.col_type.is_float_type():
             return sql.sql.expression.cast(sql.func.floor(left / right), sql.Float)
+        assert False
 
     def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
         op1_val = data_row[self._op1.slot_idx]
@@ -121,7 +123,7 @@ class ArithmeticExpr(Expr):
         return {'operator': self.operator.value, **super()._as_dict()}
 
     @classmethod
-    def _from_dict(cls, d: dict, components: list[Expr]) -> Expr:
+    def _from_dict(cls, d: dict, components: list[Expr]) -> ArithmeticExpr:
         assert 'operator' in d
         assert len(components) == 2
         return cls(ArithmeticOperator(d['operator']), components[0], components[1])

pixeltable/exprs/array_slice.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Optional, Union
 
 import sqlalchemy as sql
 
@@ -15,7 +15,7 @@ class ArraySlice(Expr):
     """
     Slice operation on an array, eg, t.array_col[:, 1:2].
     """
-    def __init__(self, arr: Expr, index: Tuple):
+    def __init__(self, arr: Expr, index: tuple[Union[int, slice], ...]):
         assert arr.col_type.is_array_type()
         # determine result type
         super().__init__(arr.col_type)
@@ -24,7 +24,7 @@ class ArraySlice(Expr):
         self.id = self._create_id()
 
     def __str__(self) -> str:
-        index_strs: List[str] = []
+        index_strs: list[str] = []
         for el in self.index:
             if isinstance(el, int):
                 index_strs.append(str(el))
@@ -39,7 +39,7 @@ class ArraySlice(Expr):
     def _equals(self, other: ArraySlice) -> bool:
         return self.index == other.index
 
-    def _id_attrs(self) -> List[Tuple[str, Any]]:
+    def _id_attrs(self) -> list[tuple[str, Any]]:
         return super()._id_attrs() + [('index', self.index)]
 
     def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
@@ -49,8 +49,8 @@ class ArraySlice(Expr):
         val = data_row[self._array.slot_idx]
         data_row[self.slot_idx] = val[self.index]
 
-    def _as_dict(self) -> Dict:
-        index = []
+    def _as_dict(self) -> dict:
+        index: list[Any] = []
         for el in self.index:
             if isinstance(el, slice):
                 index.append([el.start, el.stop, el.step])
@@ -59,7 +59,7 @@ class ArraySlice(Expr):
         return {'index': index, **super()._as_dict()}
 
     @classmethod
-    def _from_dict(cls, d: Dict, components: List[Expr]) -> Expr:
+    def _from_dict(cls, d: dict, components: list[Expr]) -> ArraySlice:
         assert 'index' in d
         index = []
         for el in d['index']:
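
The new annotation on ArraySlice.__init__ spells out what an index tuple contains: a mix of ints and slice objects, which is exactly what Python builds for an expression like t.array_col[:, 1:2]. A short numpy illustration of such tuples (this uses numpy directly and is not Pixeltable code):

from typing import Union

import numpy as np

arr = np.arange(12).reshape(3, 4)

# arr[:, 1:2] is equivalent to indexing with a tuple of slices
index: tuple[Union[int, slice], ...] = (slice(None), slice(1, 2))
assert np.array_equal(arr[index], arr[:, 1:2])

# ints and slices can be mixed in the same tuple
index2: tuple[Union[int, slice], ...] = (0, slice(1, 3))
assert np.array_equal(arr[index2], arr[0, 1:3])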

pixeltable/exprs/column_property_ref.py
@@ -1,11 +1,12 @@
 from __future__ import annotations
 
 import enum
-from typing import Optional, List, Any, Dict, Tuple
+from typing import Any, Optional
 
 import sqlalchemy as sql
 
 import pixeltable.type_system as ts
+from pixeltable import catalog
 from .column_ref import ColumnRef
 from .data_row import DataRow
 from .expr import Expr
@@ -33,22 +34,36 @@ class ColumnPropertyRef(Expr):
     def default_column_name(self) -> Optional[str]:
         return str(self).replace('.', '_')
 
-    def _equals(self, other: ColumnRef) -> bool:
+    def _equals(self, other: ColumnPropertyRef) -> bool:
         return self.prop == other.prop
 
-    def _id_attrs(self) -> List[Tuple[str, Any]]:
+    def _id_attrs(self) -> list[tuple[str, Any]]:
         return super()._id_attrs() + [('prop', self.prop.value)]
 
     @property
     def _col_ref(self) -> ColumnRef:
-        return self.components[0]
+        col_ref = self.components[0]
+        assert isinstance(col_ref, ColumnRef)
+        return col_ref
 
     def __str__(self) -> str:
         return f'{self._col_ref}.{self.prop.name.lower()}'
 
+    def is_error_prop(self) -> bool:
+        return self.prop == self.Property.ERRORTYPE or self.prop == self.Property.ERRORMSG
+
     def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
         if not self._col_ref.col.is_stored:
             return None
+
+        # the errortype/-msg properties of a read-validated media column need to be extracted from the DataRow
+        if (
+            self._col_ref.col.col_type.is_media_type()
+            and self._col_ref.col.media_validation == catalog.MediaValidation.ON_READ
+            and self.is_error_prop()
+        ):
+            return None
+
         if self.prop == self.Property.ERRORTYPE:
             assert self._col_ref.col.sa_errortype_col is not None
             return self._col_ref.col.sa_errortype_col
@@ -61,18 +76,30 @@ class ColumnPropertyRef(Expr):
             return None
 
     def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
-        assert self.prop == self.Property.FILEURL or self.prop == self.Property.LOCALPATH
-        assert data_row.has_val[self._col_ref.slot_idx]
         if self.prop == self.Property.FILEURL:
+            assert data_row.has_val[self._col_ref.slot_idx]
             data_row[self.slot_idx] = data_row.file_urls[self._col_ref.slot_idx]
-        if self.prop == self.Property.LOCALPATH:
+            return
+        elif self.prop == self.Property.LOCALPATH:
+            assert data_row.has_val[self._col_ref.slot_idx]
             data_row[self.slot_idx] = data_row.file_paths[self._col_ref.slot_idx]
-
-    def _as_dict(self) -> Dict:
+            return
+        elif self.is_error_prop():
+            exc = data_row.get_exc(self._col_ref.slot_idx)
+            if exc is None:
+                data_row[self.slot_idx] = None
+            elif self.prop == self.Property.ERRORTYPE:
+                data_row[self.slot_idx] = type(exc).__name__
+            else:
+                data_row[self.slot_idx] = str(exc)
+        else:
+            assert False
+
+    def _as_dict(self) -> dict:
         return {'prop': self.prop.value, **super()._as_dict()}
 
     @classmethod
-    def _from_dict(cls, d: Dict, components: List[Expr]) -> Expr:
+    def _from_dict(cls, d: dict, components: list[Expr]) -> ColumnPropertyRef:
         assert 'prop' in d
         assert isinstance(components[0], ColumnRef)
         return cls(components[0], cls.Property(d['prop']))
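
The new is_error_prop() branch in eval() derives both error properties from the exception recorded for the column's slot: errortype is the exception's class name and errormsg is its string form, with both evaluating to None when no exception was recorded. A tiny standalone illustration of that mapping:

exc = ValueError('corrupt image header')

errortype = type(exc).__name__   # 'ValueError'
errormsg = str(exc)              # 'corrupt image header'
assert (errortype, errormsg) == ('ValueError', 'corrupt image header')

# with no recorded exception, both properties are None
exc = None
assert (None if exc is None else type(exc).__name__) is None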

pixeltable/exprs/column_ref.py
@@ -1,16 +1,18 @@
 from __future__ import annotations
-from typing import Optional, Any, Tuple
+
+from typing import Any, Optional, Sequence
 from uuid import UUID
 
 import sqlalchemy as sql
 
-from .expr import Expr
+import pixeltable.catalog as catalog
+import pixeltable.exceptions as excs
+import pixeltable.iterators as iters
+
 from .data_row import DataRow
+from .expr import Expr
 from .row_builder import RowBuilder
 from .sql_element_cache import SqlElementCache
-import pixeltable.iterators as iters
-import pixeltable.exceptions as excs
-import pixeltable.catalog as catalog
 
 
 class ColumnRef(Expr):
@@ -19,18 +21,31 @@ class ColumnRef(Expr):
     When this reference is created in the context of a view, it can also refer to a column of the view base.
     For that reason, a ColumnRef needs to be serialized with the qualifying table id (column ids are only
     unique in the context of a particular table).
+
+    Media validation:
+    - media validation is potentially cpu-intensive, and it's desirable to schedule and parallelize it during
+      general expr evaluation
+    - media validation on read is done in ColumnRef.eval()
+    - a validating ColumnRef cannot be translated to SQL (because the validation is done in Python)
+    - in that case, the ColumnRef also instantiates a second non-validating ColumnRef as a component (= dependency)
+    - the non-validating ColumnRef is used for SQL translation
+
+    TODO:
+    separate Exprs (like validating ColumnRefs) from the logical expression tree and instead have RowBuilder
+    insert them into the EvalCtxs as needed
     """
 
     col: catalog.Column
     is_unstored_iter_col: bool
     iter_arg_ctx: Optional[RowBuilder.EvalCtx]
     base_rowid_len: int
-    base_rowid: list[Optional[Any]]
+    base_rowid: Sequence[Optional[Any]]
     iterator: Optional[iters.ComponentIterator]
     pos_idx: Optional[int]
     id: int
+    perform_validation: bool  # if True, performs media validation
 
-    def __init__(self, col: catalog.Column):
+    def __init__(self, col: catalog.Column, perform_validation: Optional[bool] = None):
         super().__init__(col.col_type)
         assert col.tbl is not None
         self.col = col
@@ -43,17 +58,44 @@ class ColumnRef(Expr):
         self.iterator = None
         # index of the position column in the view's primary key; don't try to reference tbl.store_tbl here
         self.pos_idx = col.tbl.num_rowid_columns() - 1 if self.is_unstored_iter_col else None
+
+        self.perform_validation = False
+        if col.col_type.is_media_type():
+            # we perform media validation if the column is a media type and the validation is set to ON_READ,
+            # unless we're told not to
+            if perform_validation is not None:
+                self.perform_validation = perform_validation
+            else:
+                self.perform_validation = (
+                    col.col_type.is_media_type() and col.media_validation == catalog.MediaValidation.ON_READ
+                )
+        else:
+            assert perform_validation is None or not perform_validation
+        if self.perform_validation:
+            non_validating_col_ref = ColumnRef(col, perform_validation=False)
+            self.components = [non_validating_col_ref]
         self.id = self._create_id()
 
     def set_iter_arg_ctx(self, iter_arg_ctx: RowBuilder.EvalCtx) -> None:
         self.iter_arg_ctx = iter_arg_ctx
         assert len(self.iter_arg_ctx.target_slot_idxs) == 1  # a single inline dict
 
-    def _id_attrs(self) -> list[Tuple[str, Any]]:
-        return super()._id_attrs() + [('tbl_id', self.col.tbl.id), ('col_id', self.col.id)]
+    def _id_attrs(self) -> list[tuple[str, Any]]:
+        return (
+            super()._id_attrs()
+            + [('tbl_id', self.col.tbl.id), ('col_id', self.col.id), ('perform_validation', self.perform_validation)]
+        )
+
+    # override
+    def _retarget(self, tbl_versions: dict[UUID, catalog.TableVersion]) -> ColumnRef:
+        target = tbl_versions[self.col.tbl.id]
+        assert self.col.id in target.cols_by_id
+        col = target.cols_by_id[self.col.id]
+        return ColumnRef(col)
 
     def __getattr__(self, name: str) -> Expr:
         from .column_property_ref import ColumnPropertyRef
+
         # resolve column properties
         if name == ColumnPropertyRef.Property.ERRORTYPE.name.lower() \
                 or name == ColumnPropertyRef.Property.ERRORMSG.name.lower():
@@ -82,7 +124,7 @@ class ColumnRef(Expr):
         return str(self)
 
     def _equals(self, other: ColumnRef) -> bool:
-        return self.col == other.col
+        return self.col == other.col and self.perform_validation == other.perform_validation
 
     def __str__(self) -> str:
         if self.col.name is None:
@@ -94,9 +136,38 @@ class ColumnRef(Expr):
         return f'ColumnRef({self.col!r})'
 
     def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
-        return self.col.sa_col
+        return None if self.perform_validation else self.col.sa_col
 
     def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
+        if self.perform_validation:
+            # validate media file of our input ColumnRef and if successful, replicate the state of that slot
+            # to our slot
+            unvalidated_slot_idx = self.components[0].slot_idx
+            if data_row.file_paths[unvalidated_slot_idx] is None:
+                # no media file to validate, we still need to replicate the value
+                assert data_row.file_urls[unvalidated_slot_idx] is None
+                val = data_row.vals[unvalidated_slot_idx]
+                data_row.vals[self.slot_idx] = val
+                data_row.has_val[self.slot_idx] = True
+                return
+
+            try:
+                self.col.col_type.validate_media(data_row.file_paths[unvalidated_slot_idx])
+                # access the value only after successful validation
+                val = data_row[unvalidated_slot_idx]
+                data_row.vals[self.slot_idx] = val
+                data_row.has_val[self.slot_idx] = True
+                # make sure that the validated slot points to the same file as the unvalidated slot
+                data_row.file_paths[self.slot_idx] = data_row.file_paths[unvalidated_slot_idx]
+                data_row.file_urls[self.slot_idx] = data_row.file_urls[unvalidated_slot_idx]
+                return
+            except excs.Error as exc:
+                # propagate the exception, but ignore it otherwise;
+                # media validation errors don't cause exceptions during query execution
+                # TODO: allow for different error-handling behavior
+                row_builder.set_exc(data_row, self.slot_idx, exc)
+                return
+
         if not self.is_unstored_iter_col:
             # supply default
             data_row[self.slot_idx] = None
@@ -115,7 +186,14 @@ class ColumnRef(Expr):
     def _as_dict(self) -> dict:
         tbl = self.col.tbl
         version = tbl.version if tbl.is_snapshot else None
-        return {'tbl_id': str(tbl.id), 'tbl_version': version, 'col_id': self.col.id}
+        # we omit self.components, even if this is a validating ColumnRef, because init() will recreate the
+        # non-validating component ColumnRef
+        return {
+            'tbl_id': str(tbl.id),
+            'tbl_version': version,
+            'col_id': self.col.id,
+            'perform_validation': self.perform_validation
+        }
 
     @classmethod
     def get_column(cls, d: dict) -> catalog.Column:
@@ -126,6 +204,7 @@ class ColumnRef(Expr):
         return col
 
     @classmethod
-    def _from_dict(cls, d: dict, _: list[Expr]) -> Expr:
+    def _from_dict(cls, d: dict, _: list[Expr]) -> ColumnRef:
         col = cls.get_column(d)
-        return cls(col)
+        perform_validation = d['perform_validation']
+        return cls(col, perform_validation=perform_validation)
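
The docstring added to ColumnRef describes the wrapper pattern used for validation on read: a validating ColumnRef holds a non-validating ColumnRef as its only component, returns None from sql_expr() so the SQL layer uses the component instead, and in eval() copies the component's slot state into its own slot once validation succeeds, recording the exception instead of raising on failure. A minimal sketch of that pattern outside Pixeltable (SlotState, read_validated and validate_jpg are illustrative names, not the library's API):

from typing import Callable, Optional

class SlotState:
    """Simplified stand-in for a DataRow slot: a value plus an optional media file path."""
    def __init__(self, val: object, file_path: Optional[str]) -> None:
        self.val = val
        self.file_path = file_path
        self.exc: Optional[Exception] = None

def read_validated(src: SlotState, validate: Callable[[str], None]) -> SlotState:
    """Replicate src into a new slot, validating its media file first."""
    dst = SlotState(None, None)
    if src.file_path is None:
        # no media file to validate; just replicate the value
        dst.val = src.val
        return dst
    try:
        validate(src.file_path)
        dst.val, dst.file_path = src.val, src.file_path
    except Exception as exc:
        # record the error instead of raising, as the diff's eval() does
        dst.exc = exc
    return dst

def validate_jpg(path: str) -> None:
    if not path.endswith('.jpg'):
        raise ValueError(f'not a jpg: {path}')

ok = read_validated(SlotState(b'image-bytes', 'photo.jpg'), validate_jpg)
bad = read_validated(SlotState(b'image-bytes', 'notes.txt'), validate_jpg)
assert ok.exc is None and ok.val == b'image-bytes'
assert isinstance(bad.exc, ValueError) and bad.val is None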

pixeltable/exprs/comparison.py
@@ -1,12 +1,13 @@
 from __future__ import annotations
 
-from typing import Optional, List, Any, Dict
+from typing import Any, Optional
 
 import sqlalchemy as sql
 
 import pixeltable.exceptions as excs
 import pixeltable.index as index
 import pixeltable.type_system as ts
+
 from .column_ref import ColumnRef
 from .data_row import DataRow
 from .expr import Expr
@@ -65,7 +66,7 @@ class Comparison(Expr):
     def _op2(self) -> Expr:
         return self.components[1]
 
-    def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ClauseElement]:
+    def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
         left = sql_elements.get(self._op1)
         if self.is_search_arg_comparison:
             # reference the index value column if there is an index and this is not a snapshot
@@ -113,11 +114,10 @@ class Comparison(Expr):
         elif self.operator == ComparisonOperator.GE:
             data_row[self.slot_idx] = left >= right
 
-    def _as_dict(self) -> Dict:
+    def _as_dict(self) -> dict:
         return {'operator': self.operator.value, **super()._as_dict()}
 
     @classmethod
-    def _from_dict(cls, d: Dict, components: List[Expr]) -> Expr:
+    def _from_dict(cls, d: dict, components: list[Expr]) -> Comparison:
         assert 'operator' in d
         return cls(ComparisonOperator(d['operator']), components[0], components[1])
-