pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (202)
  1. pixeltable/__init__.py +23 -5
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/__init__.py +5 -3
  4. pixeltable/catalog/catalog.py +1318 -404
  5. pixeltable/catalog/column.py +186 -115
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +11 -43
  8. pixeltable/catalog/insertable_table.py +167 -79
  9. pixeltable/catalog/path.py +61 -23
  10. pixeltable/catalog/schema_object.py +9 -10
  11. pixeltable/catalog/table.py +626 -308
  12. pixeltable/catalog/table_metadata.py +101 -0
  13. pixeltable/catalog/table_version.py +713 -569
  14. pixeltable/catalog/table_version_handle.py +37 -6
  15. pixeltable/catalog/table_version_path.py +42 -29
  16. pixeltable/catalog/tbl_ops.py +50 -0
  17. pixeltable/catalog/update_status.py +191 -0
  18. pixeltable/catalog/view.py +108 -94
  19. pixeltable/config.py +128 -22
  20. pixeltable/dataframe.py +188 -100
  21. pixeltable/env.py +407 -136
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +3 -0
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +231 -0
  27. pixeltable/exec/cell_reconstruction_node.py +135 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +7 -6
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +190 -30
  37. pixeltable/exec/globals.py +32 -0
  38. pixeltable/exec/in_memory_data_node.py +18 -18
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +206 -101
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +34 -30
  46. pixeltable/exprs/column_ref.py +92 -96
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +152 -55
  50. pixeltable/exprs/expr.py +62 -43
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +75 -37
  54. pixeltable/exprs/globals.py +1 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +10 -27
  57. pixeltable/exprs/is_null.py +1 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +5 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +127 -53
  64. pixeltable/exprs/rowid_ref.py +8 -12
  65. pixeltable/exprs/similarity_expr.py +50 -25
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +10 -10
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +20 -18
  78. pixeltable/func/signature.py +43 -16
  79. pixeltable/func/tools.py +23 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +6 -0
  82. pixeltable/functions/anthropic.py +93 -33
  83. pixeltable/functions/audio.py +114 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +1 -1
  86. pixeltable/functions/deepseek.py +20 -9
  87. pixeltable/functions/fireworks.py +2 -2
  88. pixeltable/functions/gemini.py +28 -11
  89. pixeltable/functions/globals.py +13 -13
  90. pixeltable/functions/groq.py +108 -0
  91. pixeltable/functions/huggingface.py +1046 -23
  92. pixeltable/functions/image.py +9 -18
  93. pixeltable/functions/llama_cpp.py +23 -8
  94. pixeltable/functions/math.py +3 -4
  95. pixeltable/functions/mistralai.py +4 -15
  96. pixeltable/functions/ollama.py +16 -9
  97. pixeltable/functions/openai.py +104 -82
  98. pixeltable/functions/openrouter.py +143 -0
  99. pixeltable/functions/replicate.py +2 -2
  100. pixeltable/functions/reve.py +250 -0
  101. pixeltable/functions/string.py +21 -28
  102. pixeltable/functions/timestamp.py +13 -14
  103. pixeltable/functions/together.py +4 -6
  104. pixeltable/functions/twelvelabs.py +92 -0
  105. pixeltable/functions/util.py +6 -1
  106. pixeltable/functions/video.py +1388 -106
  107. pixeltable/functions/vision.py +7 -7
  108. pixeltable/functions/whisper.py +15 -7
  109. pixeltable/functions/whisperx.py +179 -0
  110. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  111. pixeltable/globals.py +332 -105
  112. pixeltable/index/base.py +13 -22
  113. pixeltable/index/btree.py +23 -22
  114. pixeltable/index/embedding_index.py +32 -44
  115. pixeltable/io/__init__.py +4 -2
  116. pixeltable/io/datarows.py +7 -6
  117. pixeltable/io/external_store.py +49 -77
  118. pixeltable/io/fiftyone.py +11 -11
  119. pixeltable/io/globals.py +29 -28
  120. pixeltable/io/hf_datasets.py +17 -9
  121. pixeltable/io/label_studio.py +70 -66
  122. pixeltable/io/lancedb.py +3 -0
  123. pixeltable/io/pandas.py +12 -11
  124. pixeltable/io/parquet.py +13 -93
  125. pixeltable/io/table_data_conduit.py +71 -47
  126. pixeltable/io/utils.py +3 -3
  127. pixeltable/iterators/__init__.py +2 -1
  128. pixeltable/iterators/audio.py +21 -11
  129. pixeltable/iterators/document.py +116 -55
  130. pixeltable/iterators/image.py +5 -2
  131. pixeltable/iterators/video.py +293 -13
  132. pixeltable/metadata/__init__.py +4 -2
  133. pixeltable/metadata/converters/convert_18.py +2 -2
  134. pixeltable/metadata/converters/convert_19.py +2 -2
  135. pixeltable/metadata/converters/convert_20.py +2 -2
  136. pixeltable/metadata/converters/convert_21.py +2 -2
  137. pixeltable/metadata/converters/convert_22.py +2 -2
  138. pixeltable/metadata/converters/convert_24.py +2 -2
  139. pixeltable/metadata/converters/convert_25.py +2 -2
  140. pixeltable/metadata/converters/convert_26.py +2 -2
  141. pixeltable/metadata/converters/convert_29.py +4 -4
  142. pixeltable/metadata/converters/convert_34.py +2 -2
  143. pixeltable/metadata/converters/convert_36.py +2 -2
  144. pixeltable/metadata/converters/convert_37.py +15 -0
  145. pixeltable/metadata/converters/convert_38.py +39 -0
  146. pixeltable/metadata/converters/convert_39.py +124 -0
  147. pixeltable/metadata/converters/convert_40.py +73 -0
  148. pixeltable/metadata/converters/util.py +13 -12
  149. pixeltable/metadata/notes.py +4 -0
  150. pixeltable/metadata/schema.py +79 -42
  151. pixeltable/metadata/utils.py +74 -0
  152. pixeltable/mypy/__init__.py +3 -0
  153. pixeltable/mypy/mypy_plugin.py +123 -0
  154. pixeltable/plan.py +274 -223
  155. pixeltable/share/__init__.py +1 -1
  156. pixeltable/share/packager.py +259 -129
  157. pixeltable/share/protocol/__init__.py +34 -0
  158. pixeltable/share/protocol/common.py +170 -0
  159. pixeltable/share/protocol/operation_types.py +33 -0
  160. pixeltable/share/protocol/replica.py +109 -0
  161. pixeltable/share/publish.py +213 -57
  162. pixeltable/store.py +238 -175
  163. pixeltable/type_system.py +104 -63
  164. pixeltable/utils/__init__.py +2 -3
  165. pixeltable/utils/arrow.py +108 -13
  166. pixeltable/utils/av.py +298 -0
  167. pixeltable/utils/azure_store.py +305 -0
  168. pixeltable/utils/code.py +3 -3
  169. pixeltable/utils/console_output.py +4 -1
  170. pixeltable/utils/coroutine.py +6 -23
  171. pixeltable/utils/dbms.py +31 -5
  172. pixeltable/utils/description_helper.py +4 -5
  173. pixeltable/utils/documents.py +5 -6
  174. pixeltable/utils/exception_handler.py +7 -30
  175. pixeltable/utils/filecache.py +6 -6
  176. pixeltable/utils/formatter.py +4 -6
  177. pixeltable/utils/gcs_store.py +283 -0
  178. pixeltable/utils/http_server.py +2 -3
  179. pixeltable/utils/iceberg.py +1 -2
  180. pixeltable/utils/image.py +17 -0
  181. pixeltable/utils/lancedb.py +88 -0
  182. pixeltable/utils/local_store.py +316 -0
  183. pixeltable/utils/misc.py +5 -0
  184. pixeltable/utils/object_stores.py +528 -0
  185. pixeltable/utils/pydantic.py +60 -0
  186. pixeltable/utils/pytorch.py +5 -6
  187. pixeltable/utils/s3_store.py +392 -0
  188. pixeltable-0.4.20.dist-info/METADATA +587 -0
  189. pixeltable-0.4.20.dist-info/RECORD +218 -0
  190. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
  191. pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
  192. pixeltable/__version__.py +0 -3
  193. pixeltable/ext/__init__.py +0 -17
  194. pixeltable/ext/functions/__init__.py +0 -11
  195. pixeltable/ext/functions/whisperx.py +0 -77
  196. pixeltable/utils/media_store.py +0 -77
  197. pixeltable/utils/s3.py +0 -17
  198. pixeltable/utils/sample.py +0 -25
  199. pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
  200. pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
  201. pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
  202. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import datetime
4
- from typing import Any, Optional
4
+ from typing import Any
5
5
 
6
6
  import numpy as np
7
7
  import sqlalchemy as sql
@@ -18,7 +18,7 @@ from .sql_element_cache import SqlElementCache
18
18
  class Literal(Expr):
19
19
  val: Any
20
20
 
21
- def __init__(self, val: Any, col_type: Optional[ts.ColumnType] = None):
21
+ def __init__(self, val: Any, col_type: ts.ColumnType | None = None):
22
22
  if col_type is not None:
23
23
  val = col_type.create_literal(val)
24
24
  else:
@@ -42,7 +42,7 @@ class Literal(Expr):
42
42
  self.val = val
43
43
  self.id = self._create_id()
44
44
 
45
- def default_column_name(self) -> Optional[str]:
45
+ def default_column_name(self) -> str | None:
46
46
  return 'Literal'
47
47
 
48
48
  def __str__(self) -> str:
@@ -69,7 +69,7 @@ class Literal(Expr):
69
69
  def _id_attrs(self) -> list[tuple[str, Any]]:
70
70
  return [*super()._id_attrs(), ('val', self.val)]
71
71
 
72
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
72
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
73
73
  # Return a sql object so that constants can participate in SQL expressions
74
74
  return sql.sql.expression.literal(self.val, type_=self.col_type.to_sa_type())
75
75
 
@@ -97,7 +97,7 @@ class Literal(Expr):
97
97
  else:
98
98
  return {'val': self.val, **super()._as_dict()}
99
99
 
100
- def as_literal(self) -> Optional[Literal]:
100
+ def as_literal(self) -> Literal | None:
101
101
  return self
102
102
 
103
103
  @classmethod
@@ -1,4 +1,4 @@
1
- from typing import Any, Optional
1
+ from typing import Any
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
@@ -55,7 +55,7 @@ class MethodRef(Expr):
55
55
  def _id_attrs(self) -> list[tuple[str, Any]]:
56
56
  return [*super()._id_attrs(), ('method_name', self.method_name)]
57
57
 
58
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
58
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
59
59
  return None
60
60
 
61
61
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, Optional
3
+ from typing import Any
4
4
 
5
5
  import sqlalchemy as sql
6
6
 
@@ -43,7 +43,7 @@ class ObjectRef(Expr):
43
43
  def _equals(self, other: ObjectRef) -> bool:
44
44
  return self.id == other.id
45
45
 
46
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
46
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
47
47
  return None
48
48
 
49
49
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
@@ -1,16 +1,17 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import dataclasses
3
4
  import sys
4
5
  import time
5
- from dataclasses import dataclass
6
- from typing import Any, Iterable, Optional, Sequence
6
+ from typing import Any, Iterable, NamedTuple, Sequence
7
7
  from uuid import UUID
8
8
 
9
9
  import numpy as np
10
+ import sqlalchemy as sql
10
11
 
11
- from pixeltable import catalog, exceptions as excs, utils
12
+ from pixeltable import catalog, exceptions as excs, exprs, utils
12
13
  from pixeltable.env import Env
13
- from pixeltable.utils.media_store import MediaStore
14
+ from pixeltable.utils.misc import non_none_dict_factory
14
15
 
15
16
  from .data_row import DataRow
16
17
  from .expr import Expr, ExprScope
@@ -35,8 +36,7 @@ class ExecProfile:
35
36
  )
36
37
 
37
38
 
38
- @dataclass
39
- class ColumnSlotIdx:
39
+ class ColumnSlotIdx(NamedTuple):
40
40
  """Info for how to locate materialized column in DataRow
41
41
  TODO: can this be integrated into RowBuilder directly?
42
42
  """
@@ -50,6 +50,12 @@ class RowBuilder:
50
50
 
51
51
  For ColumnRefs to unstored iterator columns:
52
52
  - in order for them to be executable, we also record the iterator args and pass them to the ColumnRef
53
+
54
+ Args:
55
+ output_exprs: list of Exprs to be evaluated
56
+ columns: list of columns to be materialized
57
+ input_exprs: list of Exprs that are excluded from evaluation (because they're already materialized)
58
+ TODO: enforce that output_exprs doesn't overlap with input_exprs?
53
59
  """
54
60
 
55
61
  unique_exprs: ExprSet
@@ -63,7 +69,8 @@ class RowBuilder:
63
69
 
64
70
  input_exprs: ExprSet
65
71
 
66
- table_columns: list[ColumnSlotIdx]
72
+ tbl: catalog.TableVersion | None # reference table of the RowBuilder; used to identify pk columns for writes
73
+ table_columns: dict[catalog.Column, int | None] # value: slot idx, if the result of an expr
67
74
  default_eval_ctx: EvalCtx
68
75
  unstored_iter_args: dict[UUID, Expr]
69
76
 
@@ -84,7 +91,12 @@ class RowBuilder:
84
91
  # (a subexpr can be shared across multiple output exprs)
85
92
  output_expr_ids: list[set[int]]
86
93
 
87
- @dataclass
94
+ img_slot_idxs: list[int] # Indices of image slots
95
+ media_slot_idxs: list[int] # Indices of non-image media slots
96
+ array_slot_idxs: list[int] # Indices of array slots
97
+ json_slot_idxs: list[int] # Indices of json slots
98
+
99
+ @dataclasses.dataclass
88
100
  class EvalCtx:
89
101
  """Context for evaluating a set of target exprs"""
90
102
 
@@ -93,14 +105,13 @@ class RowBuilder:
93
105
  target_slot_idxs: list[int] # slot idxs of target exprs; might contain duplicates
94
106
  target_exprs: list[Expr] # exprs corresponding to target_slot_idxs
95
107
 
96
- def __init__(self, output_exprs: Sequence[Expr], columns: Sequence[catalog.Column], input_exprs: Iterable[Expr]):
97
- """
98
- Args:
99
- output_exprs: list of Exprs to be evaluated
100
- columns: list of columns to be materialized
101
- input_exprs: list of Exprs that are excluded from evaluation (because they're already materialized)
102
- TODO: enforce that output_exprs doesn't overlap with input_exprs?
103
- """
108
+ def __init__(
109
+ self,
110
+ output_exprs: Sequence[Expr],
111
+ columns: Sequence[catalog.Column],
112
+ input_exprs: Iterable[Expr],
113
+ tbl: catalog.TableVersion | None = None,
114
+ ):
104
115
  self.unique_exprs: ExprSet[Expr] = ExprSet() # dependencies precede their dependents
105
116
  self.next_slot_idx = 0
106
117
 
@@ -117,7 +128,7 @@ class RowBuilder:
117
128
  )
118
129
 
119
130
  # if init(columns):
120
- # - we are creating table rows and need to record columns for create_table_row()
131
+ # - we are creating table rows and need to record columns for create_store_table_row()
121
132
  # - output_exprs materialize those columns
122
133
  # - input_exprs are ColumnRefs of the non-computed columns (ie, what needs to be provided as input)
123
134
  # - media validation:
@@ -125,7 +136,8 @@ class RowBuilder:
125
136
  # * further references to that column (eg, computed cols) need to resolve to the validating ColumnRef
126
137
  from .column_ref import ColumnRef
127
138
 
128
- self.table_columns: list[ColumnSlotIdx] = []
139
+ self.tbl = tbl
140
+ self.table_columns = {}
129
141
  self.input_exprs = ExprSet()
130
142
  validating_colrefs: dict[Expr, Expr] = {} # key: non-validating colref, value: corresp. validating colref
131
143
  for col in columns:
@@ -171,18 +183,18 @@ class RowBuilder:
171
183
  col_refs = [e for e in self.unique_exprs if isinstance(e, ColumnRef)]
172
184
 
173
185
  def refs_unstored_iter_col(col_ref: ColumnRef) -> bool:
174
- tbl = col_ref.col.tbl
186
+ tbl = col_ref.col.get_tbl()
175
187
  return tbl.is_component_view and tbl.is_iterator_column(col_ref.col) and not col_ref.col.is_stored
176
188
 
177
189
  unstored_iter_col_refs = [col_ref for col_ref in col_refs if refs_unstored_iter_col(col_ref)]
178
- component_views = [col_ref.col.tbl for col_ref in unstored_iter_col_refs]
190
+ component_views = [col_ref.col.get_tbl() for col_ref in unstored_iter_col_refs]
179
191
  unstored_iter_args = {view.id: view.iterator_args.copy() for view in component_views}
180
192
  self.unstored_iter_args = {
181
193
  id: self._record_unique_expr(arg, recursive=True) for id, arg in unstored_iter_args.items()
182
194
  }
183
195
 
184
196
  for col_ref in unstored_iter_col_refs:
185
- iter_arg_ctx = self.create_eval_ctx([unstored_iter_args[col_ref.col.tbl.id]])
197
+ iter_arg_ctx = self.create_eval_ctx([unstored_iter_args[col_ref.col.get_tbl().id]])
186
198
  col_ref.set_iter_arg_ctx(iter_arg_ctx)
187
199
 
188
200
  # we guarantee that we can compute the expr DAG in a single front-to-back pass
@@ -201,7 +213,7 @@ class RowBuilder:
201
213
  # this is input and therefore doesn't depend on other exprs
202
214
  continue
203
215
  # error properties don't have exceptions themselves
204
- if isinstance(expr, ColumnPropertyRef) and expr.is_error_prop():
216
+ if isinstance(expr, ColumnPropertyRef) and expr.is_cellmd_prop():
205
217
  continue
206
218
  dependency_idxs = [d.slot_idx for d in expr.dependencies()]
207
219
  self.dependencies[expr.slot_idx, dependency_idxs] = True
@@ -227,13 +239,32 @@ class RowBuilder:
227
239
  for e in self.output_exprs:
228
240
  self._record_output_expr_id(e, e.slot_idx)
229
241
 
242
+ self.img_slot_idxs = [e.slot_idx for e in self.unique_exprs if e.col_type.is_image_type()]
243
+ self.media_slot_idxs = [
244
+ e.slot_idx for e in self.unique_exprs if e.col_type.is_media_type() and not e.col_type.is_image_type()
245
+ ]
246
+ self.array_slot_idxs = [e.slot_idx for e in self.unique_exprs if e.col_type.is_array_type()]
247
+ self.json_slot_idxs = [e.slot_idx for e in self.unique_exprs if e.col_type.is_json_type()]
248
+
230
249
  def add_table_column(self, col: catalog.Column, slot_idx: int) -> None:
231
- """Record a column that is part of the table row"""
232
- self.table_columns.append(ColumnSlotIdx(col, slot_idx))
250
+ """Record an output column for which the value is produced via expr evaluation"""
251
+ assert self.tbl is not None
252
+ assert col.is_stored
253
+ self.table_columns[col] = slot_idx
233
254
 
234
- def output_slot_idxs(self) -> list[ColumnSlotIdx]:
235
- """Return ColumnSlotIdx for output columns"""
236
- return self.table_columns
255
+ def add_table_columns(self, cols: list[catalog.Column]) -> None:
256
+ """Record output columns whose values are materialized into DataRow.cell_vals"""
257
+ for col in cols:
258
+ self.table_columns[col] = None
259
+
260
+ @property
261
+ def media_output_col_info(self) -> list[ColumnSlotIdx]:
262
+ """Return slot idxs for media output columns whose values are produced by expr evaluation"""
263
+ return [
264
+ ColumnSlotIdx(col, slot_idx)
265
+ for col, slot_idx in self.table_columns.items()
266
+ if col.col_type.is_media_type() and slot_idx is not None
267
+ ]
237
268
 
238
269
  @property
239
270
  def num_materialized(self) -> int:
@@ -277,7 +308,7 @@ class RowBuilder:
277
308
  self._record_output_expr_id(d, output_expr_id)
278
309
 
279
310
  def _compute_dependencies(
280
- self, target_slot_idxs: list[int], excluded_slot_idxs: list[int], target_scope: Optional[ExprScope] = None
311
+ self, target_slot_idxs: list[int], excluded_slot_idxs: list[int], target_scope: ExprScope | None = None
281
312
  ) -> list[int]:
282
313
  """Compute exprs needed to materialize the given target slots, excluding 'excluded_slot_idxs'
283
314
 
@@ -331,7 +362,7 @@ class RowBuilder:
331
362
  self.__set_slot_idxs_aux(c)
332
363
 
333
364
  def get_dependencies(
334
- self, targets: Iterable[Expr], exclude: Optional[Iterable[Expr]] = None, limit_scope: bool = True
365
+ self, targets: Iterable[Expr], exclude: Iterable[Expr] | None = None, limit_scope: bool = True
335
366
  ) -> list[Expr]:
336
367
  """
337
368
  Return list of dependencies needed to evaluate the given target exprs (expressed as slot idxs).
@@ -349,7 +380,7 @@ class RowBuilder:
349
380
  return []
350
381
  # make sure we only refer to recorded exprs
351
382
  targets = [self.unique_exprs[e] for e in targets]
352
- target_scope: Optional[ExprScope] = None
383
+ target_scope: ExprScope | None = None
353
384
  if limit_scope:
354
385
  # make sure all targets are from the same scope
355
386
  target_scopes = {e.scope() for e in targets}
@@ -367,7 +398,7 @@ class RowBuilder:
367
398
  return [self.unique_exprs[id] for id in result_ids]
368
399
 
369
400
  def create_eval_ctx(
370
- self, targets: Iterable[Expr], exclude: Optional[Iterable[Expr]] = None, limit_scope: bool = True
401
+ self, targets: Iterable[Expr], exclude: Iterable[Expr] | None = None, limit_scope: bool = True
371
402
  ) -> EvalCtx:
372
403
  """Return EvalCtx for targets"""
373
404
  targets = list(targets)
@@ -396,9 +427,9 @@ class RowBuilder:
396
427
  self,
397
428
  data_row: DataRow,
398
429
  ctx: EvalCtx,
399
- profile: Optional[ExecProfile] = None,
430
+ profile: ExecProfile | None = None,
400
431
  ignore_errors: bool = False,
401
- force_eval: Optional[ExprScope] = None,
432
+ force_eval: ExprScope | None = None,
402
433
  ) -> None:
403
434
  """
404
435
  Populates the slots in data_row given in ctx.
@@ -427,33 +458,76 @@ class RowBuilder:
427
458
  expr, f'expression {expr}', data_row.get_exc(expr.slot_idx), exc_tb, input_vals, 0
428
459
  ) from exc
429
460
 
430
- def create_table_row(self, data_row: DataRow, exc_col_ids: set[int]) -> tuple[dict[str, Any], int]:
431
- """Create a table row from the slots that have an output column assigned
461
+ def create_store_table_row(
462
+ self, data_row: DataRow, cols_with_excs: set[int] | None, pk: tuple[int, ...]
463
+ ) -> tuple[list[Any], int]:
464
+ """Create a store table row from the slots that have an output column assigned
432
465
 
433
- Return tuple[dict that represents a stored row (can be passed to sql.insert()), # of exceptions]
466
+ Return tuple[list of row values in `self.table_columns` order, # of exceptions]
434
467
  This excludes system columns.
468
+ Row values are converted to their store type.
435
469
  """
470
+ from pixeltable.exprs.column_property_ref import ColumnPropertyRef
471
+
436
472
  num_excs = 0
437
- table_row: dict[str, Any] = {}
438
- for info in self.table_columns:
439
- col, slot_idx = info.col, info.slot_idx
473
+ table_row: list[Any] = list(pk)
474
+ # Nulls in JSONB columns need to be stored as sql.sql.null(), otherwise it stores a json 'null'
475
+ for col, slot_idx in self.table_columns.items():
476
+ if col.id in data_row.cell_vals:
477
+ table_row.append(data_row.cell_vals[col.id])
478
+ if col.stores_cellmd:
479
+ if data_row.cell_md[col.id] is None:
480
+ table_row.append(sql.sql.null())
481
+ else:
482
+ # we want to minimize the size of the stored dict and use dict_factory to remove Nones
483
+ md = dataclasses.asdict(data_row.cell_md[col.id], dict_factory=non_none_dict_factory)
484
+ assert len(md) > 0
485
+ table_row.append(md)
486
+ if slot_idx is not None and data_row.has_exc(slot_idx):
487
+ num_excs += 1
488
+ if cols_with_excs is not None:
489
+ cols_with_excs.add(col.id)
490
+ continue
491
+
440
492
  if data_row.has_exc(slot_idx):
441
- # exceptions get stored in the errortype/-msg columns
442
493
  exc = data_row.get_exc(slot_idx)
443
494
  num_excs += 1
444
- exc_col_ids.add(col.id)
445
- table_row[col.store_name()] = None
446
- table_row[col.errortype_store_name()] = type(exc).__name__
447
- table_row[col.errormsg_store_name()] = str(exc)
495
+ if cols_with_excs is not None:
496
+ cols_with_excs.add(col.id)
497
+ table_row.append(sql.sql.null() if col.col_type.is_json_type() else None)
498
+ if col.stores_cellmd:
499
+ # exceptions get stored in the errortype/-msg properties of the cellmd column
500
+ table_row.append(ColumnPropertyRef.create_cellmd_exc(exc))
448
501
  else:
449
- if col.col_type.is_image_type() and data_row.file_urls[slot_idx] is None:
450
- # we have yet to store this image
451
- filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
452
- data_row.flush_img(slot_idx, filepath)
453
- val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
454
- table_row[col.store_name()] = val
455
- # we unfortunately need to set these, even if there are no errors
456
- table_row[col.errortype_store_name()] = None
457
- table_row[col.errormsg_store_name()] = None
502
+ val = data_row.get_stored_val(slot_idx, col.sa_col_type)
503
+ table_row.append(val)
504
+ if col.stores_cellmd:
505
+ table_row.append(sql.sql.null()) # placeholder for cellmd column
458
506
 
459
507
  return table_row, num_excs
508
+
509
+ def store_column_names(self) -> list[str]:
510
+ """
511
+ Returns the list of store column names corresponding to the table_columns of this RowBuilder.
512
+ The second tuple element of the return value is a dictionary containing all media columns in the
513
+ table; it's the mapping {list_index: column}.
514
+ """
515
+ assert self.tbl is not None, self.table_columns
516
+ store_col_names: list[str] = [pk_col.name for pk_col in self.tbl.store_tbl.pk_columns()]
517
+
518
+ for col in self.table_columns:
519
+ store_col_names.append(col.store_name())
520
+ if col.stores_cellmd:
521
+ store_col_names.append(col.cellmd_store_name())
522
+
523
+ return store_col_names
524
+
525
+ def make_row(self) -> exprs.DataRow:
526
+ """Creates a new DataRow with the current row_builder's configuration."""
527
+ return exprs.DataRow(
528
+ size=self.num_materialized,
529
+ img_slot_idxs=self.img_slot_idxs,
530
+ media_slot_idxs=self.media_slot_idxs,
531
+ array_slot_idxs=self.array_slot_idxs,
532
+ json_slot_idxs=self.json_slot_idxs,
533
+ )
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import logging
4
- from typing import Any, Optional, cast
4
+ from typing import Any, cast
5
5
  from uuid import UUID
6
6
 
7
7
  import sqlalchemy as sql
@@ -25,18 +25,18 @@ class RowidRef(Expr):
25
25
  (with and without a TableVersion).
26
26
  """
27
27
 
28
- tbl: Optional[catalog.TableVersionHandle]
29
- normalized_base: Optional[catalog.TableVersionHandle]
28
+ tbl: catalog.TableVersionHandle | None
29
+ normalized_base: catalog.TableVersionHandle | None
30
30
  tbl_id: UUID
31
31
  normalized_base_id: UUID
32
32
  rowid_component_idx: int
33
33
 
34
34
  def __init__(
35
35
  self,
36
- tbl: Optional[catalog.TableVersionHandle],
36
+ tbl: catalog.TableVersionHandle | None,
37
37
  idx: int,
38
- tbl_id: Optional[UUID] = None,
39
- normalized_base_id: Optional[UUID] = None,
38
+ tbl_id: UUID | None = None,
39
+ normalized_base_id: UUID | None = None,
40
40
  ):
41
41
  super().__init__(ts.IntType(nullable=False))
42
42
  self.tbl = tbl
@@ -57,7 +57,7 @@ class RowidRef(Expr):
57
57
  self.rowid_component_idx = idx
58
58
  self.id = self._create_id()
59
59
 
60
- def default_column_name(self) -> Optional[str]:
60
+ def default_column_name(self) -> str | None:
61
61
  return str(self)
62
62
 
63
63
  def _equals(self, other: RowidRef) -> bool:
@@ -98,17 +98,13 @@ class RowidRef(Expr):
98
98
  self.tbl = tbl.tbl_version
99
99
  self.tbl_id = self.tbl.id
100
100
 
101
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
101
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
102
102
  tbl = self.tbl.get() if self.tbl is not None else catalog.Catalog.get().get_tbl_version(self.tbl_id, None)
103
103
  assert tbl.is_validated
104
104
  rowid_cols = tbl.store_tbl.rowid_columns()
105
105
  assert self.rowid_component_idx <= len(rowid_cols), (
106
106
  f'{self.rowid_component_idx} not consistent with {rowid_cols}'
107
107
  )
108
- # _logger.debug(
109
- # f'RowidRef.sql_expr: tbl={tbl.id}{tbl.effective_version} sa_tbl={id(tbl.store_tbl.sa_tbl):x} '
110
- # f'tv={id(tbl):x}'
111
- # )
112
108
  return rowid_cols[self.rowid_component_idx]
113
109
 
114
110
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
@@ -1,4 +1,6 @@
1
- from typing import Any, Optional
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
2
4
 
3
5
  import sqlalchemy as sql
4
6
 
@@ -12,34 +14,43 @@ from .literal import Literal
12
14
  from .row_builder import RowBuilder
13
15
  from .sql_element_cache import SqlElementCache
14
16
 
17
+ if TYPE_CHECKING:
18
+ from pixeltable.catalog.table_version import TableVersion
19
+
15
20
 
16
21
  class SimilarityExpr(Expr):
17
- def __init__(self, col_ref: ColumnRef, item: Any, idx_name: Optional[str] = None):
22
+ """
23
+ A similarity expression against an embedding index.
24
+ """
25
+
26
+ idx_id: int
27
+ idx_name: str
28
+
29
+ def __init__(self, col_ref: ColumnRef, item: Any, idx_name: str | None = None):
30
+ from pixeltable.index import EmbeddingIndex
31
+
18
32
  super().__init__(ts.FloatType())
19
33
  item_expr = Expr.from_object(item)
20
34
  if item_expr is None or not (item_expr.col_type.is_string_type() or item_expr.col_type.is_image_type()):
21
35
  raise excs.Error(f'similarity(): requires a string or a PIL.Image.Image object, not a {type(item)}')
22
- assert item_expr.col_type.is_string_type() or item_expr.col_type.is_image_type()
23
36
 
24
37
  self.components = [col_ref, item_expr]
25
38
 
26
- from pixeltable import index
27
-
28
39
  # determine index to use
29
- idx_dict = col_ref.find_embedding_index(idx_name, 'similarity')
30
- assert len(idx_dict) == 1
31
- self.idx_info = next(iter(idx_dict.values()))
32
- idx = self.idx_info.idx
33
- assert isinstance(idx, index.EmbeddingIndex)
40
+ idx_info = col_ref.tbl.get().get_idx(col_ref.col, idx_name, EmbeddingIndex)
41
+ self.idx_id = idx_info.id
42
+ self.idx_name = idx_info.name
43
+ idx = idx_info.idx
44
+ assert isinstance(idx, EmbeddingIndex)
34
45
 
35
46
  if item_expr.col_type.is_string_type() and idx.string_embed is None:
36
47
  raise excs.Error(
37
- f'Embedding index {self.idx_info.name!r} on column {self.idx_info.col.name!r} does not have a '
48
+ f'Embedding index {idx_info.name!r} on column {idx_info.col.name!r} does not have a '
38
49
  f'string embedding and does not support string queries'
39
50
  )
40
51
  if item_expr.col_type.is_image_type() and idx.image_embed is None:
41
52
  raise excs.Error(
42
- f'Embedding index {self.idx_info.name!r} on column {self.idx_info.col.name!r} does not have an '
53
+ f'Embedding index {idx_info.name!r} on column {idx_info.col.name!r} does not have an '
43
54
  f'image embedding and does not support image queries'
44
55
  )
45
56
  self.id = self._create_id()
@@ -48,39 +59,53 @@ class SimilarityExpr(Expr):
48
59
  return f'{self.components[0]}.similarity({self.components[1]})'
49
60
 
50
61
  def _id_attrs(self) -> list[tuple[str, Any]]:
51
- return [*super()._id_attrs(), ('idx_name', self.idx_info.name)]
62
+ return [*super()._id_attrs(), ('idx_id', self.idx_id)]
52
63
 
53
64
  def default_column_name(self) -> str:
54
65
  return 'similarity'
55
66
 
56
- def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
57
- # TODO: validate that the index still exists
67
+ def sql_expr(self, _: SqlElementCache) -> sql.ColumnElement | None:
68
+ from pixeltable.index import EmbeddingIndex
69
+
70
+ # check for a literal here, instead of the c'tor: needed for ExprTemplateFunctions
58
71
  if not isinstance(self.components[1], Literal):
59
72
  raise excs.Error('similarity(): requires a string or a PIL.Image.Image object, not an expression')
60
73
  item = self.components[1].val
61
- from pixeltable import index
74
+ idx_info = self._resolve_idx()
75
+ assert isinstance(idx_info.idx, EmbeddingIndex)
76
+ return idx_info.idx.similarity_clause(idx_info.val_col, item)
62
77
 
63
- assert isinstance(self.idx_info.idx, index.EmbeddingIndex)
64
- return self.idx_info.idx.similarity_clause(self.idx_info.val_col, item)
78
+ def as_order_by_clause(self, is_asc: bool) -> sql.ColumnElement | None:
79
+ from pixeltable.index import EmbeddingIndex
65
80
 
66
- def as_order_by_clause(self, is_asc: bool) -> Optional[sql.ColumnElement]:
81
+ # check for a literal here, instead of the c'tor: needed for ExprTemplateFunctions
67
82
  if not isinstance(self.components[1], Literal):
68
83
  raise excs.Error('similarity(): requires a string or a PIL.Image.Image object, not an expression')
69
84
  item = self.components[1].val
70
- from pixeltable import index
85
+ idx_info = self._resolve_idx()
86
+ assert isinstance(idx_info.idx, EmbeddingIndex)
87
+ return idx_info.idx.order_by_clause(idx_info.val_col, item, is_asc)
88
+
89
+ def _resolve_idx(self) -> 'TableVersion.IndexInfo':
90
+ from pixeltable.index import EmbeddingIndex
71
91
 
72
- assert isinstance(self.idx_info.idx, index.EmbeddingIndex)
73
- return self.idx_info.idx.order_by_clause(self.idx_info.val_col, item, is_asc)
92
+ # resolve idx_id
93
+ col_ref = self.components[0]
94
+ if self.idx_id not in col_ref.tbl.get().idxs:
95
+ raise excs.Error(f'Index {self.idx_name!r} not found')
96
+ idx_info = col_ref.tbl.get().idxs[self.idx_id]
97
+ assert isinstance(idx_info.idx, EmbeddingIndex)
98
+ return idx_info
74
99
 
75
100
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
76
101
  raise excs.Error('similarity(): cannot be used in a computed column')
77
102
 
78
103
  def _as_dict(self) -> dict:
79
- return {'idx_name': self.idx_info.name, **super()._as_dict()}
104
+ return {'idx_name': self.idx_name, **super()._as_dict()}
80
105
 
81
106
  @classmethod
82
107
  def _from_dict(cls, d: dict, components: list[Expr]) -> 'SimilarityExpr':
83
- iname = d.get('idx_name')
108
+ idx_name = d.get('idx_name')
84
109
  assert len(components) == 2
85
110
  assert isinstance(components[0], ColumnRef)
86
- return cls(components[0], components[1], idx_name=iname)
111
+ return cls(components[0], components[1], idx_name=idx_name)
@@ -1,4 +1,4 @@
1
- from typing import Iterable, Optional
1
+ from typing import Iterable
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
@@ -9,9 +9,9 @@ from .expr_dict import ExprDict
9
9
  class SqlElementCache:
10
10
  """Cache of sql.ColumnElements for exprs"""
11
11
 
12
- cache: dict[int, Optional[sql.ColumnElement]] # key: Expr.id
12
+ cache: dict[int, sql.ColumnElement | None] # key: Expr.id
13
13
 
14
- def __init__(self, elements: Optional[ExprDict[sql.ColumnElement]] = None):
14
+ def __init__(self, elements: ExprDict[sql.ColumnElement] | None = None):
15
15
  self.cache = {}
16
16
  if elements is not None:
17
17
  for e, el in elements.items():
@@ -21,7 +21,7 @@ class SqlElementCache:
21
21
  for e, el in elements.items():
22
22
  self.cache[e.id] = el
23
23
 
24
- def get(self, e: Expr) -> Optional[sql.ColumnElement]:
24
+ def get(self, e: Expr) -> sql.ColumnElement | None:
25
25
  """Returns the sql.ColumnElement for the given Expr, or None if Expr.to_sql() returns None."""
26
26
  try:
27
27
  return self.cache[e.id]
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, Optional, Union
3
+ from typing import Any
4
4
 
5
5
  import sqlalchemy as sql
6
6
 
@@ -60,7 +60,7 @@ class StringOp(Expr):
60
60
  def _id_attrs(self) -> list[tuple[str, Any]]:
61
61
  return [*super()._id_attrs(), ('operator', self.operator.value)]
62
62
 
63
- def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
63
+ def sql_expr(self, sql_elements: SqlElementCache) -> sql.ColumnElement | None:
64
64
  left = sql_elements.get(self._op1)
65
65
  right = sql_elements.get(self._op2)
66
66
  if left is None or right is None:
@@ -68,7 +68,7 @@ class StringOp(Expr):
68
68
  if self.operator == StringOperator.CONCAT:
69
69
  return left.concat(right)
70
70
  if self.operator == StringOperator.REPEAT:
71
- return sql.func.repeat(sql.cast(left, sql.String), sql.cast(right, sql.Integer))
71
+ return sql.func.repeat(left.cast(sql.String), right.cast(sql.Integer))
72
72
  return None
73
73
 
74
74
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
@@ -76,7 +76,7 @@ class StringOp(Expr):
76
76
  op2_val = data_row[self._op2.slot_idx]
77
77
  data_row[self.slot_idx] = self.eval_nullable(op1_val, op2_val)
78
78
 
79
- def eval_nullable(self, op1_val: Union[str, None], op2_val: Union[int, str, None]) -> Union[str, None]:
79
+ def eval_nullable(self, op1_val: str | None, op2_val: int | str | None) -> str | None:
80
80
  """
81
81
  Return the result of evaluating the expression on two nullable int/float operands,
82
82
  None is interpreted as SQL NULL
@@ -85,7 +85,7 @@ class StringOp(Expr):
85
85
  return None
86
86
  return self.eval_non_null(op1_val, op2_val)
87
87
 
88
- def eval_non_null(self, op1_val: str, op2_val: Union[int, str]) -> str:
88
+ def eval_non_null(self, op1_val: str, op2_val: int | str) -> str:
89
89
  """
90
90
  Return the result of evaluating the expression on two int/float operands
91
91
  """