pixeltable 0.4.15__py3-none-any.whl → 0.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (57)
  1. pixeltable/__init__.py +4 -0
  2. pixeltable/catalog/catalog.py +105 -51
  3. pixeltable/catalog/column.py +7 -2
  4. pixeltable/catalog/table.py +1 -0
  5. pixeltable/catalog/table_metadata.py +4 -0
  6. pixeltable/catalog/table_version.py +99 -78
  7. pixeltable/catalog/table_version_handle.py +4 -1
  8. pixeltable/config.py +6 -0
  9. pixeltable/dataframe.py +10 -5
  10. pixeltable/env.py +48 -19
  11. pixeltable/exec/__init__.py +2 -0
  12. pixeltable/exec/cell_materialization_node.py +231 -0
  13. pixeltable/exec/cell_reconstruction_node.py +135 -0
  14. pixeltable/exec/exec_node.py +1 -1
  15. pixeltable/exec/expr_eval/evaluators.py +1 -0
  16. pixeltable/exec/expr_eval/expr_eval_node.py +3 -0
  17. pixeltable/exec/expr_eval/globals.py +2 -0
  18. pixeltable/exec/globals.py +32 -0
  19. pixeltable/exec/object_store_save_node.py +1 -4
  20. pixeltable/exec/row_update_node.py +16 -9
  21. pixeltable/exec/sql_node.py +107 -14
  22. pixeltable/exprs/__init__.py +1 -1
  23. pixeltable/exprs/arithmetic_expr.py +10 -11
  24. pixeltable/exprs/column_property_ref.py +10 -10
  25. pixeltable/exprs/column_ref.py +2 -2
  26. pixeltable/exprs/data_row.py +106 -37
  27. pixeltable/exprs/expr.py +9 -0
  28. pixeltable/exprs/expr_set.py +14 -7
  29. pixeltable/exprs/inline_expr.py +2 -19
  30. pixeltable/exprs/json_path.py +45 -12
  31. pixeltable/exprs/row_builder.py +54 -22
  32. pixeltable/functions/__init__.py +1 -0
  33. pixeltable/functions/bedrock.py +7 -0
  34. pixeltable/functions/deepseek.py +11 -4
  35. pixeltable/functions/llama_cpp.py +7 -0
  36. pixeltable/functions/math.py +1 -1
  37. pixeltable/functions/ollama.py +7 -0
  38. pixeltable/functions/openai.py +4 -4
  39. pixeltable/functions/openrouter.py +143 -0
  40. pixeltable/globals.py +10 -4
  41. pixeltable/io/globals.py +16 -15
  42. pixeltable/io/table_data_conduit.py +46 -21
  43. pixeltable/metadata/__init__.py +1 -1
  44. pixeltable/metadata/converters/convert_40.py +73 -0
  45. pixeltable/metadata/notes.py +1 -0
  46. pixeltable/plan.py +175 -46
  47. pixeltable/store.py +1 -1
  48. pixeltable/type_system.py +5 -3
  49. pixeltable/utils/console_output.py +4 -1
  50. pixeltable/utils/exception_handler.py +5 -28
  51. pixeltable/utils/image.py +7 -0
  52. pixeltable/utils/misc.py +5 -0
  53. {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/METADATA +2 -1
  54. {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/RECORD +57 -50
  55. {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/WHEEL +0 -0
  56. {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/entry_points.txt +0 -0
  57. {pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/licenses/LICENSE +0 -0
@@ -1,3 +1,4 @@
1
+ import datetime
1
2
  import logging
2
3
  import warnings
3
4
  from decimal import Decimal
@@ -65,7 +66,7 @@ def print_order_by_clause(clause: OrderByClause) -> str:
65
66
 
66
67
  class SqlNode(ExecNode):
67
68
  """
68
- Materializes data from the store via an SQL statement.
69
+ Materializes data from the store via a SQL statement.
69
70
  This only provides the select list. The subclasses are responsible for the From clause and any additional clauses.
70
71
  The pk columns are not included in the select list.
71
72
  If set_pk is True, they are added to the end of the result set when creating the SQL statement
@@ -82,6 +83,8 @@ class SqlNode(ExecNode):
82
83
 
83
84
  tbl: Optional[catalog.TableVersionPath]
84
85
  select_list: exprs.ExprSet
86
+ columns: list[catalog.Column] # for which columns to populate DataRow.cell_vals/cell_md
87
+ cell_md_refs: list[exprs.ColumnPropertyRef] # of ColumnRefs which also need DataRow.slot_cellmd for evaluation
85
88
  set_pk: bool
86
89
  num_pk_cols: int
87
90
  py_filter: Optional[exprs.Expr] # a predicate that can only be run in Python
@@ -89,6 +92,12 @@ class SqlNode(ExecNode):
89
92
  cte: Optional[sql.CTE]
90
93
  sql_elements: exprs.SqlElementCache
91
94
 
95
+ # execution state
96
+ cellmd_item_idxs: exprs.ExprDict[int] # cellmd expr -> idx in sql select list
97
+ column_item_idxs: dict[catalog.Column, int] # column -> idx in sql select list
98
+ column_cellmd_item_idxs: dict[catalog.Column, int] # column -> idx in sql select list
99
+ result_cursor: sql.engine.CursorResult | None
100
+
92
101
  # where_clause/-_element: allow subclass to set one or the other (but not both)
93
102
  where_clause: Optional[exprs.Expr]
94
103
  where_clause_element: Optional[sql.ColumnElement]
@@ -101,12 +110,22 @@ class SqlNode(ExecNode):
101
110
  tbl: Optional[catalog.TableVersionPath],
102
111
  row_builder: exprs.RowBuilder,
103
112
  select_list: Iterable[exprs.Expr],
113
+ columns: list[catalog.Column],
104
114
  sql_elements: exprs.SqlElementCache,
115
+ cell_md_col_refs: list[exprs.ColumnRef] | None = None,
105
116
  set_pk: bool = False,
106
117
  ):
107
118
  # create Select stmt
108
119
  self.sql_elements = sql_elements
109
120
  self.tbl = tbl
121
+ self.columns = columns
122
+ if cell_md_col_refs is not None:
123
+ assert all(ref.col.stores_cellmd for ref in cell_md_col_refs)
124
+ self.cell_md_refs = [
125
+ exprs.ColumnPropertyRef(ref, exprs.ColumnPropertyRef.Property.CELLMD) for ref in cell_md_col_refs
126
+ ]
127
+ else:
128
+ self.cell_md_refs = []
110
129
  self.select_list = exprs.ExprSet(select_list)
111
130
  # unstored iter columns: we also need to retrieve whatever is needed to materialize the iter args
112
131
  for iter_arg in row_builder.unstored_iter_args.values():
@@ -129,6 +148,9 @@ class SqlNode(ExecNode):
129
148
  assert self.num_pk_cols > 1
130
149
 
131
150
  # additional state
151
+ self.cellmd_item_idxs = exprs.ExprDict()
152
+ self.column_item_idxs = {}
153
+ self.column_cellmd_item_idxs = {}
132
154
  self.result_cursor = None
133
155
  # the filter is provided by the subclass
134
156
  self.py_filter = None
@@ -144,10 +166,9 @@ class SqlNode(ExecNode):
144
166
  if tv is not None:
145
167
  assert tv.is_validated
146
168
 
147
- def _create_pk_cols(self) -> list[sql.Column]:
148
- """Create a list of pk columns"""
149
- # we need to retrieve the pk columns
169
+ def _pk_col_items(self) -> list[sql.Column]:
150
170
  if self.set_pk:
171
+ # we need to retrieve the pk columns
151
172
  assert self.tbl is not None
152
173
  assert self.tbl.tbl_version.get().is_validated
153
174
  return self.tbl.tbl_version.get().store_tbl.pk_columns()
@@ -157,7 +178,19 @@ class SqlNode(ExecNode):
157
178
  """Create Select from local state"""
158
179
 
159
180
  assert self.sql_elements.contains_all(self.select_list)
160
- sql_select_list = [self.sql_elements.get(e) for e in self.select_list] + self._create_pk_cols()
181
+ sql_select_list_exprs = exprs.ExprSet(self.select_list)
182
+ self.cellmd_item_idxs = exprs.ExprDict((ref, sql_select_list_exprs.add(ref)) for ref in self.cell_md_refs)
183
+ column_refs = [exprs.ColumnRef(col) for col in self.columns]
184
+ self.column_item_idxs = {col_ref.col: sql_select_list_exprs.add(col_ref) for col_ref in column_refs}
185
+ column_cellmd_refs = [
186
+ exprs.ColumnPropertyRef(col_ref, exprs.ColumnPropertyRef.Property.CELLMD)
187
+ for col_ref in column_refs
188
+ if col_ref.col.stores_cellmd
189
+ ]
190
+ self.column_cellmd_item_idxs = {
191
+ cellmd_ref.col_ref.col: sql_select_list_exprs.add(cellmd_ref) for cellmd_ref in column_cellmd_refs
192
+ }
193
+ sql_select_list = [self.sql_elements.get(e) for e in sql_select_list_exprs] + self._pk_col_items()
161
194
  stmt = sql.select(*sql_select_list)
162
195
 
163
196
  where_clause_element = (
@@ -198,9 +231,7 @@ class SqlNode(ExecNode):
198
231
  if not keep_pk:
199
232
  self.set_pk = False # we don't need the PK if we use this SqlNode as a CTE
200
233
  self.cte = self._create_stmt().cte()
201
- pk_count = self.num_pk_cols if self.set_pk else 0
202
- assert len(self.select_list) + pk_count == len(self.cte.c)
203
- return self.cte, exprs.ExprDict(zip(self.select_list, self.cte.c)) # skip pk cols
234
+ return self.cte, exprs.ExprDict(zip(list(self.select_list) + self.cell_md_refs, self.cte.c)) # skip pk cols
204
235
 
205
236
  @classmethod
206
237
  def retarget_rowid_refs(cls, target: catalog.TableVersionPath, expr_seq: Iterable[exprs.Expr]) -> None:
@@ -318,24 +349,53 @@ class SqlNode(ExecNode):
318
349
  output_batch = DataRowBatch(self.row_builder)
319
350
  output_row: Optional[exprs.DataRow] = None
320
351
  num_rows_returned = 0
352
+ is_using_cockroachdb = Env.get().is_using_cockroachdb
353
+ tzinfo = Env.get().default_time_zone
321
354
 
322
355
  for sql_row in result_cursor:
323
356
  output_row = output_batch.add_row(output_row)
324
357
 
325
358
  # populate output_row
359
+
326
360
  if self.num_pk_cols > 0:
327
361
  output_row.set_pk(tuple(sql_row[-self.num_pk_cols :]))
362
+
363
+ # column copies
364
+ for col, item_idx in self.column_item_idxs.items():
365
+ output_row.cell_vals[col.id] = sql_row[item_idx]
366
+ for col, item_idx in self.column_cellmd_item_idxs.items():
367
+ cell_md_dict = sql_row[item_idx]
368
+ output_row.cell_md[col.id] = exprs.CellMd(**cell_md_dict) if cell_md_dict is not None else None
369
+
370
+ # populate DataRow.slot_cellmd, where requested
371
+ for cellmd_ref, item_idx in self.cellmd_item_idxs.items():
372
+ cell_md_dict = sql_row[item_idx]
373
+ output_row.slot_md[cellmd_ref.col_ref.slot_idx] = (
374
+ exprs.CellMd.from_dict(cell_md_dict) if cell_md_dict is not None else None
375
+ )
376
+
328
377
  # copy the output of the SQL query into the output row
329
378
  for i, e in enumerate(self.select_list):
330
379
  slot_idx = e.slot_idx
331
- # certain numerical operations can produce Decimals (eg, SUM(<int column>)); we need to convert them
332
380
  if isinstance(sql_row[i], Decimal):
381
+ # certain numerical operations can produce Decimals (eg, SUM(<int column>)); we need to convert them
333
382
  if e.col_type.is_int_type():
334
383
  output_row[slot_idx] = int(sql_row[i])
335
384
  elif e.col_type.is_float_type():
336
385
  output_row[slot_idx] = float(sql_row[i])
337
386
  else:
338
387
  raise RuntimeError(f'Unexpected Decimal value for {e}')
388
+ elif is_using_cockroachdb and isinstance(sql_row[i], datetime.datetime):
389
+ # Ensure that the datetime is timezone-aware and in the session time zone
390
+ # cockroachDB returns timestamps in the session time zone, with numeric offset,
391
+ # convert to the session time zone with the requested tzinfo for DST handling
392
+ if e.col_type.is_timestamp_type():
393
+ if isinstance(sql_row[i].tzinfo, datetime.timezone):
394
+ output_row[slot_idx] = sql_row[i].astimezone(tz=tzinfo)
395
+ else:
396
+ output_row[slot_idx] = sql_row[i]
397
+ else:
398
+ raise RuntimeError(f'Unexpected datetime value for {e}')
339
399
  else:
340
400
  output_row[slot_idx] = sql_row[i]
341
401
 
@@ -387,11 +447,21 @@ class SqlScanNode(SqlNode):
387
447
  tbl: catalog.TableVersionPath,
388
448
  row_builder: exprs.RowBuilder,
389
449
  select_list: Iterable[exprs.Expr],
450
+ columns: list[catalog.Column],
451
+ cell_md_col_refs: list[exprs.ColumnRef] | None = None,
390
452
  set_pk: bool = False,
391
453
  exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
392
454
  ):
393
455
  sql_elements = exprs.SqlElementCache()
394
- super().__init__(tbl, row_builder, select_list, sql_elements, set_pk=set_pk)
456
+ super().__init__(
457
+ tbl,
458
+ row_builder,
459
+ select_list,
460
+ columns=columns,
461
+ sql_elements=sql_elements,
462
+ set_pk=set_pk,
463
+ cell_md_col_refs=cell_md_col_refs,
464
+ )
395
465
  # create Select stmt
396
466
  if exact_version_only is None:
397
467
  exact_version_only = []
@@ -423,11 +493,21 @@ class SqlLookupNode(SqlNode):
423
493
  tbl: catalog.TableVersionPath,
424
494
  row_builder: exprs.RowBuilder,
425
495
  select_list: Iterable[exprs.Expr],
496
+ columns: list[catalog.Column],
426
497
  sa_key_cols: list[sql.Column],
427
498
  key_vals: list[tuple],
499
+ cell_md_col_refs: list[exprs.ColumnRef] | None = None,
428
500
  ):
429
501
  sql_elements = exprs.SqlElementCache()
430
- super().__init__(tbl, row_builder, select_list, sql_elements, set_pk=True)
502
+ super().__init__(
503
+ tbl,
504
+ row_builder,
505
+ select_list,
506
+ columns=columns,
507
+ sql_elements=sql_elements,
508
+ set_pk=True,
509
+ cell_md_col_refs=cell_md_col_refs,
510
+ )
431
511
  # Where clause: (key-col-1, key-col-2, ...) IN ((val-1, val-2, ...), ...)
432
512
  self.where_clause_element = sql.tuple_(*sa_key_cols).in_(key_vals)
433
513
 
@@ -460,9 +540,10 @@ class SqlAggregationNode(SqlNode):
460
540
  limit: Optional[int] = None,
461
541
  exact_version_only: Optional[list[catalog.TableVersion]] = None,
462
542
  ):
543
+ assert len(input.cell_md_refs) == 0 # there's no aggregation over json or arrays in SQL
463
544
  self.input_cte, input_col_map = input.to_cte()
464
545
  sql_elements = exprs.SqlElementCache(input_col_map)
465
- super().__init__(None, row_builder, select_list, sql_elements)
546
+ super().__init__(None, row_builder, select_list, columns=[], sql_elements=sql_elements)
466
547
  self.group_by_items = group_by_items
467
548
 
468
549
  def _create_stmt(self) -> sql.Select:
@@ -498,7 +579,10 @@ class SqlJoinNode(SqlNode):
498
579
  input_cte, input_col_map = input_node.to_cte()
499
580
  self.input_ctes.append(input_cte)
500
581
  sql_elements.extend(input_col_map)
501
- super().__init__(None, row_builder, select_list, sql_elements)
582
+ cell_md_col_refs = [cell_md_ref.col_ref for input in inputs for cell_md_ref in input.cell_md_refs]
583
+ super().__init__(
584
+ None, row_builder, select_list, columns=[], sql_elements=sql_elements, cell_md_col_refs=cell_md_col_refs
585
+ )
502
586
 
503
587
  def _create_stmt(self) -> sql.Select:
504
588
  from pixeltable import plan
@@ -552,7 +636,16 @@ class SqlSampleNode(SqlNode):
552
636
  assert self.pk_count > 1
553
637
  sql_elements = exprs.SqlElementCache(input_col_map)
554
638
  assert sql_elements.contains_all(stratify_exprs)
555
- super().__init__(input.tbl, row_builder, select_list, sql_elements, set_pk=True)
639
+ cell_md_col_refs = [cell_md_ref.col_ref for cell_md_ref in input.cell_md_refs]
640
+ super().__init__(
641
+ input.tbl,
642
+ row_builder,
643
+ select_list,
644
+ columns=[],
645
+ sql_elements=sql_elements,
646
+ cell_md_col_refs=cell_md_col_refs,
647
+ set_pk=True,
648
+ )
556
649
  self.stratify_exprs = stratify_exprs
557
650
  self.sample_clause = sample_clause
558
651
  assert isinstance(self.sample_clause.seed, int)
@@ -6,7 +6,7 @@ from .column_property_ref import ColumnPropertyRef
6
6
  from .column_ref import ColumnRef
7
7
  from .comparison import Comparison
8
8
  from .compound_predicate import CompoundPredicate
9
- from .data_row import DataRow
9
+ from .data_row import ArrayMd, CellMd, DataRow
10
10
  from .expr import Expr
11
11
  from .expr_dict import ExprDict
12
12
  from .expr_set import ExprSet
@@ -72,15 +72,16 @@ class ArithmeticExpr(Expr):
72
72
  return left * right
73
73
  if self.operator == ArithmeticOperator.DIV:
74
74
  assert self.col_type.is_float_type()
75
- # Avoid DivisionByZero: if right is 0, make this a NULL
75
+ # Avoid division by zero errors by converting any zero divisor to NULL.
76
76
  # TODO: Should we cast the NULLs to NaNs when they are retrieved back into Python?
77
- nullif = sql.sql.func.nullif(right, 0)
78
- # We have to cast to a `float`, or else we'll get a `Decimal`
79
- return sql.sql.expression.cast(left / nullif, self.col_type.to_sa_type())
77
+ # These casts cause the computation to take place in float units, rather than DECIMAL.
78
+ nullif = sql.cast(sql.func.nullif(right, 0), self.col_type.to_sa_type())
79
+ return sql.cast(left, self.col_type.to_sa_type()) / nullif
80
80
  if self.operator == ArithmeticOperator.MOD:
81
81
  if self.col_type.is_int_type():
82
- nullif = sql.sql.func.nullif(right, 0)
83
- return left % nullif
82
+ # Avoid division by zero errors by converting any zero divisor to NULL.
83
+ nullif1 = sql.cast(sql.func.nullif(right, 0), self.col_type.to_sa_type())
84
+ return left % nullif1
84
85
  if self.col_type.is_float_type():
85
86
  # Postgres does not support modulus for floats
86
87
  return None
@@ -90,11 +91,9 @@ class ArithmeticExpr(Expr):
90
91
  # We need the behavior to be consistent, so that expressions will evaluate the same way
91
92
  # whether or not their operands can be translated to SQL. These SQL clauses should
92
93
  # mimic the behavior of Python's // operator.
93
- nullif = sql.sql.func.nullif(right, 0)
94
- if self.col_type.is_int_type():
95
- return sql.sql.expression.cast(sql.func.floor(left / nullif), self.col_type.to_sa_type())
96
- if self.col_type.is_float_type():
97
- return sql.sql.expression.cast(sql.func.floor(left / nullif), self.col_type.to_sa_type())
94
+ # Avoid division by zero errors by converting any zero divisor to NULL.
95
+ nullif = sql.cast(sql.func.nullif(right, 0), self.col_type.to_sa_type())
96
+ return sql.func.floor(sql.cast(left, self.col_type.to_sa_type()) / nullif)
98
97
  raise AssertionError()
99
98
 
100
99
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
@@ -44,21 +44,21 @@ class ColumnPropertyRef(Expr):
44
44
  return [*super()._id_attrs(), ('prop', self.prop.value)]
45
45
 
46
46
  @property
47
- def _col_ref(self) -> ColumnRef:
47
+ def col_ref(self) -> ColumnRef:
48
48
  col_ref = self.components[0]
49
49
  assert isinstance(col_ref, ColumnRef)
50
50
  return col_ref
51
51
 
52
52
  def __repr__(self) -> str:
53
- return f'{self._col_ref}.{self.prop.name.lower()}'
53
+ return f'{self.col_ref}.{self.prop.name.lower()}'
54
54
 
55
55
  def is_cellmd_prop(self) -> bool:
56
56
  return self.prop in (self.Property.ERRORTYPE, self.Property.ERRORMSG, self.Property.CELLMD)
57
57
 
58
58
  def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
59
- if not self._col_ref.col_handle.get().is_stored:
59
+ if not self.col_ref.col_handle.get().is_stored:
60
60
  return None
61
- col = self._col_ref.col_handle.get()
61
+ col = self.col_ref.col_handle.get()
62
62
 
63
63
  # the errortype/-msg properties of a read-validated media column need to be extracted from the DataRow
64
64
  if (
@@ -77,7 +77,7 @@ class ColumnPropertyRef(Expr):
77
77
  return col.sa_cellmd_col
78
78
  if self.prop == self.Property.FILEURL:
79
79
  # the file url is stored as the column value
80
- return sql_elements.get(self._col_ref)
80
+ return sql_elements.get(self.col_ref)
81
81
  return None
82
82
 
83
83
  @classmethod
@@ -87,15 +87,15 @@ class ColumnPropertyRef(Expr):
87
87
 
88
88
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
89
89
  if self.prop == self.Property.FILEURL:
90
- assert data_row.has_val[self._col_ref.slot_idx]
91
- data_row[self.slot_idx] = data_row.file_urls[self._col_ref.slot_idx]
90
+ assert data_row.has_val[self.col_ref.slot_idx]
91
+ data_row[self.slot_idx] = data_row.file_urls[self.col_ref.slot_idx]
92
92
  return
93
93
  elif self.prop == self.Property.LOCALPATH:
94
- assert data_row.has_val[self._col_ref.slot_idx]
95
- data_row[self.slot_idx] = data_row.file_paths[self._col_ref.slot_idx]
94
+ assert data_row.has_val[self.col_ref.slot_idx]
95
+ data_row[self.slot_idx] = data_row.file_paths[self.col_ref.slot_idx]
96
96
  return
97
97
  elif self.is_cellmd_prop():
98
- exc = data_row.get_exc(self._col_ref.slot_idx)
98
+ exc = data_row.get_exc(self.col_ref.slot_idx)
99
99
  if exc is None:
100
100
  data_row[self.slot_idx] = None
101
101
  elif self.prop == self.Property.ERRORTYPE:
@@ -123,8 +123,8 @@ class ColumnRef(Expr):
123
123
  name == ColumnPropertyRef.Property.ERRORTYPE.name.lower()
124
124
  or name == ColumnPropertyRef.Property.ERRORMSG.name.lower()
125
125
  ):
126
- property_is_present = self.col.stores_cellmd
127
- if not property_is_present:
126
+ is_valid = (self.col.is_computed or self.col.col_type.is_media_type()) and self.col.is_stored
127
+ if not is_valid:
128
128
  raise excs.Error(f'{name} only valid for a stored computed or media column: {self}')
129
129
  return ColumnPropertyRef(self, ColumnPropertyRef.Property[name.upper()])
130
130
  if (
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import dataclasses
3
4
  import datetime
4
5
  import io
5
6
  import urllib.parse
@@ -13,15 +14,72 @@ import PIL
13
14
  import PIL.Image
14
15
  import sqlalchemy as sql
15
16
 
17
+ import pixeltable.utils.image as image_utils
16
18
  from pixeltable import catalog, env
17
19
  from pixeltable.utils.local_store import TempStore
20
+ from pixeltable.utils.misc import non_none_dict_factory
21
+
22
+
23
+ @dataclasses.dataclass
24
+ class ArrayMd:
25
+ """
26
+ Metadata for array cells that are stored externally.
27
+ """
28
+
29
+ start: int
30
+ end: int
31
+
32
+ # we store bool arrays as packed bits (uint8 arrays), and need to record the shape to reconstruct the array
33
+ is_bool: bool = False
34
+ shape: tuple[int, ...] | None = None
35
+
36
+ def as_dict(self) -> dict:
37
+ # dict_factory: suppress Nones
38
+ x = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
39
+ return x
40
+
41
+
42
+ @dataclasses.dataclass
43
+ class CellMd:
44
+ """
45
+ Content of the cellmd column.
46
+
47
+ All fields are optional, to minimize storage.
48
+ """
49
+
50
+ errortype: str | None = None
51
+ errormsg: str | None = None
52
+
53
+ # a list of file urls that are used to store images and arrays; only set for json and array columns
54
+ # for json columns: a list of all urls referenced in the column value
55
+ # for array columns: a single url
56
+ file_urls: list[str] | None = None
57
+
58
+ array_md: ArrayMd | None = None
59
+
60
+ @classmethod
61
+ def from_dict(cls, d: dict) -> CellMd:
62
+ x: CellMd
63
+ if 'array_md' in d:
64
+ d2 = d.copy()
65
+ del d2['array_md']
66
+ x = cls(**d2, array_md=ArrayMd(**d['array_md']))
67
+ else:
68
+ x = cls(**d)
69
+ return x
70
+
71
+ def as_dict(self) -> dict:
72
+ x = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
73
+ return x
18
74
 
19
75
 
20
76
  class DataRow:
21
77
  """
22
78
  Encapsulates all data and execution state needed by RowBuilder and DataRowBatch:
23
79
  - state for in-memory computation
24
- - state for storing the data
80
+ - state needed for expression evaluation
81
+ - containers for output column values
82
+
25
83
  This is not meant to be a black-box abstraction.
26
84
 
27
85
  In-memory representations by column type:
@@ -39,79 +97,92 @@ class DataRow:
39
97
  - DocumentType: local path if available, otherwise url
40
98
  """
41
99
 
100
+ # expr evaluation state; indexed by slot idx
42
101
  vals: np.ndarray # of object
43
102
  has_val: np.ndarray # of bool
44
103
  excs: np.ndarray # of object
45
-
46
- # If `may_have_exc` is False, then we guarantee that no slot has an exception set. This is used to optimize
47
- # exception handling under normal operation.
48
- _may_have_exc: bool
49
-
50
- # expr evaluation state; indexed by slot idx
51
104
  missing_slots: np.ndarray # of bool; number of missing dependencies
52
105
  missing_dependents: np.ndarray # of int16; number of missing dependents
53
106
  is_scheduled: np.ndarray # of bool; True if this slot is scheduled for evaluation
54
107
 
55
- # control structures that are shared across all DataRows in a batch
56
- img_slot_idxs: list[int]
57
- media_slot_idxs: list[int]
58
- array_slot_idxs: list[int]
59
-
60
- # the primary key of a store row is a sequence of ints (the number is different for table vs view)
61
- pk: Optional[tuple[int, ...]]
108
+ # CellMd needed for query execution; needs to be indexed by slot idx, not column id, to work for joins
109
+ slot_md: dict[int, CellMd]
62
110
 
63
111
  # file_urls:
64
112
  # - stored url of file for media in vals[i]
65
113
  # - None if vals[i] is not media type
66
114
  # - not None if file_paths[i] is not None
115
+ # TODO: this is a sparse vector; should it be a dict[int, str]?
67
116
  file_urls: np.ndarray # of str
68
117
 
69
118
  # file_paths:
70
119
  # - local path of media file in vals[i]; points to the file cache if file_urls[i] is remote
71
120
  # - None if vals[i] is not a media type or if there is no local file yet for file_urls[i]
121
+ # TODO: this is a sparse vector; should it be a dict[int, str]?
72
122
  file_paths: np.ndarray # of str
73
123
 
124
+ # If `may_have_exc` is False, then we guarantee that no slot has an exception set. This is used to optimize
125
+ # exception handling under normal operation.
126
+ _may_have_exc: bool
127
+
128
+ # the primary key of a store row is a sequence of ints (the number is different for table vs view)
129
+ pk: Optional[tuple[int, ...]]
74
130
  # for nested rows (ie, those produced by JsonMapperDispatcher)
75
131
  parent_row: Optional[DataRow]
76
132
  parent_slot_idx: Optional[int]
77
133
 
134
+ # state for table output (insert()/update()); key: column id
135
+ cell_vals: dict[int, Any] # materialized values of output columns, in the format required for the column
136
+ cell_md: dict[int, CellMd]
137
+
138
+ # control structures that are shared across all DataRows in a batch
139
+ img_slot_idxs: list[int]
140
+ media_slot_idxs: list[int]
141
+ array_slot_idxs: list[int]
142
+ json_slot_idxs: list[int]
143
+
78
144
  def __init__(
79
145
  self,
80
146
  size: int,
81
147
  img_slot_idxs: list[int],
82
148
  media_slot_idxs: list[int],
83
149
  array_slot_idxs: list[int],
150
+ json_slot_idxs: list[int],
84
151
  parent_row: Optional[DataRow] = None,
85
152
  parent_slot_idx: Optional[int] = None,
86
153
  ):
87
- self.img_slot_idxs = img_slot_idxs
88
- self.media_slot_idxs = media_slot_idxs
89
- self.array_slot_idxs = array_slot_idxs
90
154
  self.init(size)
91
155
  self.parent_row = parent_row
92
156
  self.parent_slot_idx = parent_slot_idx
93
-
94
- def init(self, num_slots: int) -> None:
95
- self.vals = np.full(num_slots, None, dtype=object)
96
- self.has_val = np.zeros(num_slots, dtype=bool)
97
- self.excs = np.full(num_slots, None, dtype=object)
157
+ self.img_slot_idxs = img_slot_idxs
158
+ self.media_slot_idxs = media_slot_idxs
159
+ self.array_slot_idxs = array_slot_idxs
160
+ self.json_slot_idxs = json_slot_idxs
161
+
162
+ def init(self, size: int) -> None:
163
+ self.vals = np.full(size, None, dtype=object)
164
+ self.has_val = np.zeros(size, dtype=bool)
165
+ self.excs = np.full(size, None, dtype=object)
166
+ self.missing_slots = np.zeros(size, dtype=bool)
167
+ self.missing_dependents = np.zeros(size, dtype=np.int16)
168
+ self.is_scheduled = np.zeros(size, dtype=bool)
169
+ self.slot_md = {}
170
+ self.file_urls = np.full(size, None, dtype=object)
171
+ self.file_paths = np.full(size, None, dtype=object)
98
172
  self._may_have_exc = False
99
- self.missing_slots = np.zeros(num_slots, dtype=bool)
100
- self.missing_dependents = np.zeros(num_slots, dtype=np.int16)
101
- self.is_scheduled = np.zeros(num_slots, dtype=bool)
173
+ self.cell_vals = {}
174
+ self.cell_md = {}
102
175
  self.pk = None
103
- self.file_urls = np.full(num_slots, None, dtype=object)
104
- self.file_paths = np.full(num_slots, None, dtype=object)
105
176
  self.parent_row = None
106
177
  self.parent_slot_idx = None
107
178
 
108
- def clear(self, idxs: Optional[np.ndarray] = None) -> None:
109
- if idxs is not None:
110
- self.has_val[idxs] = False
111
- self.vals[idxs] = None
112
- self.excs[idxs] = None
113
- self.file_urls[idxs] = None
114
- self.file_paths[idxs] = None
179
+ def clear(self, slot_idxs: Optional[np.ndarray] = None) -> None:
180
+ if slot_idxs is not None:
181
+ self.has_val[slot_idxs] = False
182
+ self.vals[slot_idxs] = None
183
+ self.excs[slot_idxs] = None
184
+ self.file_urls[slot_idxs] = None
185
+ self.file_paths[slot_idxs] = None
115
186
  else:
116
187
  self.init(len(self.vals))
117
188
 
@@ -292,9 +363,7 @@ class DataRow:
292
363
  val = self.vals[index]
293
364
  format = None
294
365
  if isinstance(val, PIL.Image.Image):
295
- # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
296
- # In that case, use WebP instead.
297
- format = 'webp' if val.has_transparency_data else 'jpeg'
366
+ format = image_utils.default_format(val)
298
367
  filepath, url = TempStore.save_media_object(val, col, format=format)
299
368
  self.file_paths[index] = str(filepath) if filepath is not None else None
300
369
  self.vals[index] = None
pixeltable/exprs/expr.py CHANGED
@@ -368,6 +368,15 @@ class Expr(abc.ABC):
368
368
  for e in expr_list:
369
369
  yield from e.subexprs(expr_class=expr_class, filter=filter, traverse_matches=traverse_matches)
370
370
 
371
+ @classmethod
372
+ def list_contains(
373
+ cls,
374
+ expr_list: Iterable[Expr],
375
+ expr_class: type[Expr] | None = None,
376
+ filter: Callable[[Expr], bool] | None = None,
377
+ ) -> bool:
378
+ return any(e._contains(expr_class, filter) for e in expr_list)
379
+
371
380
  def _contains(self, cls: Optional[type[Expr]] = None, filter: Optional[Callable[[Expr], bool]] = None) -> bool:
372
381
  """
373
382
  Returns True if any subexpr is an instance of cls and/or matches filter.
@@ -9,26 +9,33 @@ T = TypeVar('T', bound='Expr')
9
9
 
10
10
  class ExprSet(Generic[T]):
11
11
  """
12
- A set that also supports indexed lookup (by slot_idx and Expr.id). Exprs are uniquely identified by Expr.id.
12
+ An ordered set that also supports indexed lookup (by slot_idx and Expr.id). Exprs are uniquely identified by
13
+ Expr.id.
13
14
  """
14
15
 
15
16
  exprs: dict[int, T] # key: Expr.id
17
+ expr_offsets: dict[int, int] # key: Expr.id, value: offset into self.exprs.keys()
16
18
  exprs_by_idx: dict[int, T] # key: slot_idx
17
19
 
18
20
  def __init__(self, elements: Optional[Iterable[T]] = None):
19
21
  self.exprs = {}
22
+ self.expr_offsets = {}
20
23
  self.exprs_by_idx = {}
21
24
  if elements is not None:
22
25
  for e in elements:
23
26
  self.add(e)
24
27
 
25
- def add(self, expr: T) -> None:
26
- if expr.id in self.exprs:
27
- return
28
+ def add(self, expr: T) -> int:
29
+ """Returns offset corresponding to iteration order"""
30
+ offset = self.expr_offsets.get(expr.id)
31
+ if offset is not None:
32
+ return offset
33
+ offset = len(self.exprs)
28
34
  self.exprs[expr.id] = expr
29
- if expr.slot_idx is None:
30
- return
31
- self.exprs_by_idx[expr.slot_idx] = expr
35
+ self.expr_offsets[expr.id] = offset
36
+ if expr.slot_idx is not None:
37
+ self.exprs_by_idx[expr.slot_idx] = expr
38
+ return offset
32
39
 
33
40
  def update(self, *others: Iterable[T]) -> None:
34
41
  for other in others: