pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (202)
  1. pixeltable/__init__.py +23 -5
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/__init__.py +5 -3
  4. pixeltable/catalog/catalog.py +1318 -404
  5. pixeltable/catalog/column.py +186 -115
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +11 -43
  8. pixeltable/catalog/insertable_table.py +167 -79
  9. pixeltable/catalog/path.py +61 -23
  10. pixeltable/catalog/schema_object.py +9 -10
  11. pixeltable/catalog/table.py +626 -308
  12. pixeltable/catalog/table_metadata.py +101 -0
  13. pixeltable/catalog/table_version.py +713 -569
  14. pixeltable/catalog/table_version_handle.py +37 -6
  15. pixeltable/catalog/table_version_path.py +42 -29
  16. pixeltable/catalog/tbl_ops.py +50 -0
  17. pixeltable/catalog/update_status.py +191 -0
  18. pixeltable/catalog/view.py +108 -94
  19. pixeltable/config.py +128 -22
  20. pixeltable/dataframe.py +188 -100
  21. pixeltable/env.py +407 -136
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +3 -0
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +231 -0
  27. pixeltable/exec/cell_reconstruction_node.py +135 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +7 -6
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +190 -30
  37. pixeltable/exec/globals.py +32 -0
  38. pixeltable/exec/in_memory_data_node.py +18 -18
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +206 -101
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +34 -30
  46. pixeltable/exprs/column_ref.py +92 -96
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +152 -55
  50. pixeltable/exprs/expr.py +62 -43
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +75 -37
  54. pixeltable/exprs/globals.py +1 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +10 -27
  57. pixeltable/exprs/is_null.py +1 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +5 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +127 -53
  64. pixeltable/exprs/rowid_ref.py +8 -12
  65. pixeltable/exprs/similarity_expr.py +50 -25
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +10 -10
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +20 -18
  78. pixeltable/func/signature.py +43 -16
  79. pixeltable/func/tools.py +23 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +6 -0
  82. pixeltable/functions/anthropic.py +93 -33
  83. pixeltable/functions/audio.py +114 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +1 -1
  86. pixeltable/functions/deepseek.py +20 -9
  87. pixeltable/functions/fireworks.py +2 -2
  88. pixeltable/functions/gemini.py +28 -11
  89. pixeltable/functions/globals.py +13 -13
  90. pixeltable/functions/groq.py +108 -0
  91. pixeltable/functions/huggingface.py +1046 -23
  92. pixeltable/functions/image.py +9 -18
  93. pixeltable/functions/llama_cpp.py +23 -8
  94. pixeltable/functions/math.py +3 -4
  95. pixeltable/functions/mistralai.py +4 -15
  96. pixeltable/functions/ollama.py +16 -9
  97. pixeltable/functions/openai.py +104 -82
  98. pixeltable/functions/openrouter.py +143 -0
  99. pixeltable/functions/replicate.py +2 -2
  100. pixeltable/functions/reve.py +250 -0
  101. pixeltable/functions/string.py +21 -28
  102. pixeltable/functions/timestamp.py +13 -14
  103. pixeltable/functions/together.py +4 -6
  104. pixeltable/functions/twelvelabs.py +92 -0
  105. pixeltable/functions/util.py +6 -1
  106. pixeltable/functions/video.py +1388 -106
  107. pixeltable/functions/vision.py +7 -7
  108. pixeltable/functions/whisper.py +15 -7
  109. pixeltable/functions/whisperx.py +179 -0
  110. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  111. pixeltable/globals.py +332 -105
  112. pixeltable/index/base.py +13 -22
  113. pixeltable/index/btree.py +23 -22
  114. pixeltable/index/embedding_index.py +32 -44
  115. pixeltable/io/__init__.py +4 -2
  116. pixeltable/io/datarows.py +7 -6
  117. pixeltable/io/external_store.py +49 -77
  118. pixeltable/io/fiftyone.py +11 -11
  119. pixeltable/io/globals.py +29 -28
  120. pixeltable/io/hf_datasets.py +17 -9
  121. pixeltable/io/label_studio.py +70 -66
  122. pixeltable/io/lancedb.py +3 -0
  123. pixeltable/io/pandas.py +12 -11
  124. pixeltable/io/parquet.py +13 -93
  125. pixeltable/io/table_data_conduit.py +71 -47
  126. pixeltable/io/utils.py +3 -3
  127. pixeltable/iterators/__init__.py +2 -1
  128. pixeltable/iterators/audio.py +21 -11
  129. pixeltable/iterators/document.py +116 -55
  130. pixeltable/iterators/image.py +5 -2
  131. pixeltable/iterators/video.py +293 -13
  132. pixeltable/metadata/__init__.py +4 -2
  133. pixeltable/metadata/converters/convert_18.py +2 -2
  134. pixeltable/metadata/converters/convert_19.py +2 -2
  135. pixeltable/metadata/converters/convert_20.py +2 -2
  136. pixeltable/metadata/converters/convert_21.py +2 -2
  137. pixeltable/metadata/converters/convert_22.py +2 -2
  138. pixeltable/metadata/converters/convert_24.py +2 -2
  139. pixeltable/metadata/converters/convert_25.py +2 -2
  140. pixeltable/metadata/converters/convert_26.py +2 -2
  141. pixeltable/metadata/converters/convert_29.py +4 -4
  142. pixeltable/metadata/converters/convert_34.py +2 -2
  143. pixeltable/metadata/converters/convert_36.py +2 -2
  144. pixeltable/metadata/converters/convert_37.py +15 -0
  145. pixeltable/metadata/converters/convert_38.py +39 -0
  146. pixeltable/metadata/converters/convert_39.py +124 -0
  147. pixeltable/metadata/converters/convert_40.py +73 -0
  148. pixeltable/metadata/converters/util.py +13 -12
  149. pixeltable/metadata/notes.py +4 -0
  150. pixeltable/metadata/schema.py +79 -42
  151. pixeltable/metadata/utils.py +74 -0
  152. pixeltable/mypy/__init__.py +3 -0
  153. pixeltable/mypy/mypy_plugin.py +123 -0
  154. pixeltable/plan.py +274 -223
  155. pixeltable/share/__init__.py +1 -1
  156. pixeltable/share/packager.py +259 -129
  157. pixeltable/share/protocol/__init__.py +34 -0
  158. pixeltable/share/protocol/common.py +170 -0
  159. pixeltable/share/protocol/operation_types.py +33 -0
  160. pixeltable/share/protocol/replica.py +109 -0
  161. pixeltable/share/publish.py +213 -57
  162. pixeltable/store.py +238 -175
  163. pixeltable/type_system.py +104 -63
  164. pixeltable/utils/__init__.py +2 -3
  165. pixeltable/utils/arrow.py +108 -13
  166. pixeltable/utils/av.py +298 -0
  167. pixeltable/utils/azure_store.py +305 -0
  168. pixeltable/utils/code.py +3 -3
  169. pixeltable/utils/console_output.py +4 -1
  170. pixeltable/utils/coroutine.py +6 -23
  171. pixeltable/utils/dbms.py +31 -5
  172. pixeltable/utils/description_helper.py +4 -5
  173. pixeltable/utils/documents.py +5 -6
  174. pixeltable/utils/exception_handler.py +7 -30
  175. pixeltable/utils/filecache.py +6 -6
  176. pixeltable/utils/formatter.py +4 -6
  177. pixeltable/utils/gcs_store.py +283 -0
  178. pixeltable/utils/http_server.py +2 -3
  179. pixeltable/utils/iceberg.py +1 -2
  180. pixeltable/utils/image.py +17 -0
  181. pixeltable/utils/lancedb.py +88 -0
  182. pixeltable/utils/local_store.py +316 -0
  183. pixeltable/utils/misc.py +5 -0
  184. pixeltable/utils/object_stores.py +528 -0
  185. pixeltable/utils/pydantic.py +60 -0
  186. pixeltable/utils/pytorch.py +5 -6
  187. pixeltable/utils/s3_store.py +392 -0
  188. pixeltable-0.4.20.dist-info/METADATA +587 -0
  189. pixeltable-0.4.20.dist-info/RECORD +218 -0
  190. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
  191. pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
  192. pixeltable/__version__.py +0 -3
  193. pixeltable/ext/__init__.py +0 -17
  194. pixeltable/ext/functions/__init__.py +0 -11
  195. pixeltable/ext/functions/whisperx.py +0 -77
  196. pixeltable/utils/media_store.py +0 -77
  197. pixeltable/utils/s3.py +0 -17
  198. pixeltable/utils/sample.py +0 -25
  199. pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
  200. pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
  201. pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
  202. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
pixeltable/dataframe.py CHANGED
@@ -8,14 +8,15 @@ import json
 import logging
 import traceback
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Hashable, Iterator, NoReturn, Optional, Sequence, Union
+from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Hashable, Iterator, NoReturn, Sequence, TypeVar

 import pandas as pd
-import sqlalchemy as sql
+import pydantic
+import sqlalchemy.exc as sql_exc

 from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
 from pixeltable.catalog import Catalog, is_valid_identifier
-from pixeltable.catalog.globals import UpdateStatus
+from pixeltable.catalog.update_status import UpdateStatus
 from pixeltable.env import Env
 from pixeltable.plan import Planner, SampleClause
 from pixeltable.type_system import ColumnType
@@ -32,6 +33,11 @@ _logger = logging.getLogger('pixeltable')


 class DataFrameResultSet:
+    _rows: list[list[Any]]
+    _col_names: list[str]
+    __schema: dict[str, ColumnType]
+    __formatter: Formatter
+
     def __init__(self, rows: list[list[Any]], schema: dict[str, ColumnType]):
         self._rows = rows
         self._col_names = list(schema.keys())
@@ -66,6 +72,44 @@ class DataFrameResultSet:
     def to_pandas(self) -> pd.DataFrame:
         return pd.DataFrame.from_records(self._rows, columns=self._col_names)

+    BaseModelT = TypeVar('BaseModelT', bound=pydantic.BaseModel)
+
+    def to_pydantic(self, model: type[BaseModelT]) -> Iterator[BaseModelT]:
+        """
+        Convert the DataFrameResultSet to a list of Pydantic model instances.
+
+        Args:
+            model: A Pydantic model class.
+
+        Returns:
+            An iterator over Pydantic model instances, one for each row in the result set.
+
+        Raises:
+            Error: If the row data doesn't match the model schema.
+        """
+        model_fields = model.model_fields
+        model_config = getattr(model, 'model_config', {})
+        forbid_extra_fields = model_config.get('extra') == 'forbid'
+
+        # schema validation
+        required_fields = {name for name, field in model_fields.items() if field.is_required()}
+        col_names = set(self._col_names)
+        missing_fields = required_fields - col_names
+        if len(missing_fields) > 0:
+            raise excs.Error(
+                f'Required model fields {missing_fields} are missing from result set columns {self._col_names}'
+            )
+        if forbid_extra_fields:
+            extra_fields = col_names - set(model_fields.keys())
+            if len(extra_fields) > 0:
+                raise excs.Error(f"Extra fields {extra_fields} are not allowed in model with extra='forbid'")
+
+        for row in self:
+            try:
+                yield model(**row)
+            except pydantic.ValidationError as e:
+                raise excs.Error(str(e)) from e
+
     def _row_to_dict(self, row_idx: int) -> dict[str, Any]:
         return {self._col_names[i]: self._rows[row_idx][i] for i in range(len(self._col_names))}

@@ -107,14 +151,14 @@ class DataFrameResultSet:
 # # output of the agg stage
 # self.agg_output_exprs: list[exprs.Expr] = []
 # # Where clause of the Select stmt of the SQL scan stage
-# self.sql_where_clause: Optional[sql.ClauseElement] = None
+# self.sql_where_clause: sql.ClauseElement | None = None
 # # filter predicate applied to input rows of the SQL scan stage
-# self.filter: Optional[exprs.Predicate] = None
-# self.similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None
+# self.filter: exprs.Predicate | None = None
+# self.similarity_clause: exprs.ImageSimilarityPredicate | None = None
 # self.agg_fn_calls: list[exprs.FunctionCall] = [] # derived from unique_exprs
 # self.has_frame_col: bool = False # True if we're referencing the frame col
 #
-# self.evaluator: Optional[exprs.Evaluator] = None
+# self.evaluator: exprs.Evaluator | None = None
 # self.sql_scan_eval_ctx: list[exprs.Expr] = [] # needed to materialize output of SQL scan stage
 # self.agg_eval_ctx: list[exprs.Expr] = [] # needed to materialize output of agg stage
 # self.filter_eval_ctx: list[exprs.Expr] = []
@@ -131,27 +175,29 @@ class DataFrameResultSet:


 class DataFrame:
+    """Represents a query for retrieving and transforming data from Pixeltable tables."""
+
     _from_clause: plan.FromClause
     _select_list_exprs: list[exprs.Expr]
     _schema: dict[str, ts.ColumnType]
-    select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]]
-    where_clause: Optional[exprs.Expr]
-    group_by_clause: Optional[list[exprs.Expr]]
-    grouping_tbl: Optional[catalog.TableVersion]
-    order_by_clause: Optional[list[tuple[exprs.Expr, bool]]]
-    limit_val: Optional[exprs.Expr]
-    sample_clause: Optional[SampleClause]
+    select_list: list[tuple[exprs.Expr, str | None]] | None
+    where_clause: exprs.Expr | None
+    group_by_clause: list[exprs.Expr] | None
+    grouping_tbl: catalog.TableVersion | None
+    order_by_clause: list[tuple[exprs.Expr, bool]] | None
+    limit_val: exprs.Expr | None
+    sample_clause: SampleClause | None

     def __init__(
         self,
-        from_clause: Optional[plan.FromClause] = None,
-        select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]] = None,
-        where_clause: Optional[exprs.Expr] = None,
-        group_by_clause: Optional[list[exprs.Expr]] = None,
-        grouping_tbl: Optional[catalog.TableVersion] = None,
-        order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None, # list[(expr, asc)]
-        limit: Optional[exprs.Expr] = None,
-        sample_clause: Optional[SampleClause] = None,
+        from_clause: plan.FromClause | None = None,
+        select_list: list[tuple[exprs.Expr, str | None]] | None = None,
+        where_clause: exprs.Expr | None = None,
+        group_by_clause: list[exprs.Expr] | None = None,
+        grouping_tbl: catalog.TableVersion | None = None,
+        order_by_clause: list[tuple[exprs.Expr, bool]] | None = None, # list[(expr, asc)]
+        limit: exprs.Expr | None = None,
+        sample_clause: SampleClause | None = None,
     ):
         self._from_clause = from_clause

@@ -175,7 +221,7 @@ class DataFrame:

     @classmethod
     def _normalize_select_list(
-        cls, tbls: list[catalog.TableVersionPath], select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]]
+        cls, tbls: list[catalog.TableVersionPath], select_list: list[tuple[exprs.Expr, str | None]] | None
     ) -> tuple[list[exprs.Expr], list[str]]:
         """
         Expand select list information with all columns and their names
@@ -236,23 +282,23 @@ class DataFrame:
             if var.name not in unique_vars:
                 unique_vars[var.name] = var
             elif unique_vars[var.name].col_type != var.col_type:
-                raise excs.Error(f'Multiple definitions of parameter {var.name}')
+                raise excs.Error(f'Multiple definitions of parameter {var.name!r}')
         return unique_vars

     @classmethod
     def _convert_param_to_typed_expr(
-        cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: Optional[tuple[Any, Any]] = None
-    ) -> Optional[exprs.Expr]:
+        cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: tuple[Any, Any] | None = None
+    ) -> exprs.Expr | None:
         if v is None:
             if required:
                 raise excs.Error(f'{name!r} parameter must be present')
             return v
         v_expr = exprs.Expr.from_object(v)
         if not v_expr.col_type.matches(required_type):
-            raise excs.Error(f'{name!r} parameter must be of type {required_type!r}, instead of {v_expr.col_type}')
+            raise excs.Error(f'{name!r} parameter must be of type `{required_type}`; got `{v_expr.col_type}`')
         if range is not None:
             if not isinstance(v_expr, exprs.Literal):
-                raise excs.Error(f'{name!r} parameter must be a constant, not {v_expr}')
+                raise excs.Error(f'{name!r} parameter must be a constant; got: {v_expr}')
             if range[0] is not None and not (v_expr.val >= range[0]):
                 raise excs.Error(f'{name!r} parameter must be >= {range[0]}')
             if range[1] is not None and not (v_expr.val <= range[1]):
@@ -261,7 +307,7 @@ class DataFrame:

     @classmethod
     def validate_constant_type_range(
-        cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: Optional[tuple[Any, Any]] = None
+        cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: tuple[Any, Any] | None = None
     ) -> Any:
         """Validate that the given named parameter is a constant of the required type and within the specified range."""
         v_expr = cls._convert_param_to_typed_expr(v, required_type, required, name, range)
@@ -307,7 +353,7 @@ class DataFrame:

     def _create_query_plan(self) -> exec.ExecNode:
         # construct a group-by clause if we're grouping by a table
-        group_by_clause: Optional[list[exprs.Expr]] = None
+        group_by_clause: list[exprs.Expr] | None = None
         if self.grouping_tbl is not None:
             assert self.group_by_clause is None
             num_rowid_cols = len(self.grouping_tbl.store_tbl.rowid_columns())
@@ -330,7 +376,7 @@ class DataFrame:
             sample_clause=self.sample_clause,
         )

-    def __rowid_columns(self, num_rowid_cols: Optional[int] = None) -> list[exprs.Expr]:
+    def __rowid_columns(self, num_rowid_cols: int | None = None) -> list[exprs.Expr]:
         """Return list of RowidRef for the given number of associated rowids"""
         return Planner.rowid_columns(self._first_tbl.tbl_version, num_rowid_cols)

@@ -401,6 +447,7 @@ class DataFrame:

     @property
     def schema(self) -> dict[str, ColumnType]:
+        """Column names and types in this DataFrame."""
         return self._schema

     def bind(self, args: dict[str, Any]) -> DataFrame:
@@ -425,7 +472,7 @@ class DataFrame:
             var_expr = vars[arg_name]
             arg_expr = exprs.Expr.from_object(arg_val)
             if arg_expr is None:
-                raise excs.Error(f'Cannot convert argument {arg_val} to a Pixeltable expression')
+                raise excs.Error(f'That argument cannot be converted to a Pixeltable expression: {arg_val}')
             var_exprs[var_expr] = arg_expr

         exprs.Expr.list_substitute(select_list_exprs, var_exprs)
@@ -437,7 +484,7 @@ class DataFrame:
             exprs.Expr.list_substitute(order_by_exprs, var_exprs)

         select_list = list(zip(select_list_exprs, self.schema.keys()))
-        order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None
+        order_by_clause: list[tuple[exprs.Expr, bool]] | None = None
         if order_by_exprs is not None:
             order_by_clause = [
                 (expr, asc) for expr, asc in zip(order_by_exprs, [asc for _, asc in self.order_by_clause])
@@ -445,7 +492,7 @@ class DataFrame:
         if limit_val is not None:
             limit_val = limit_val.substitute(var_exprs)
         if limit_val is not None and not isinstance(limit_val, exprs.Literal):
-            raise excs.Error(f'limit(): parameter must be a constant, but got {limit_val}')
+            raise excs.Error(f'limit(): parameter must be a constant; got: {limit_val}')

         return DataFrame(
             from_clause=self._from_clause,
@@ -475,26 +522,31 @@ class DataFrame:
         raise excs.Error(msg) from e

     def _output_row_iterator(self) -> Iterator[list]:
-        with Catalog.get().begin_xact(for_write=False):
+        # TODO: extend begin_xact() to accept multiple TVPs for joins
+        single_tbl = self._first_tbl if len(self._from_clause.tbls) == 1 else None
+        with Catalog.get().begin_xact(tbl=single_tbl, for_write=False):
             try:
                 for data_row in self._exec():
                     yield [data_row[e.slot_idx] for e in self._select_list_exprs]
             except excs.ExprEvalError as e:
                 self._raise_expr_eval_err(e)
-            except sql.exc.DBAPIError as e:
-                raise excs.Error(f'Error during SQL execution:\n{e}') from e
+            except (sql_exc.DBAPIError, sql_exc.OperationalError, sql_exc.InternalError) as e:
+                Catalog.get().convert_sql_exc(e, tbl=(single_tbl.tbl_version if single_tbl is not None else None))
+                raise # just re-raise if not converted to a Pixeltable error

     def collect(self) -> DataFrameResultSet:
         return DataFrameResultSet(list(self._output_row_iterator()), self.schema)

     async def _acollect(self) -> DataFrameResultSet:
+        single_tbl = self._first_tbl if len(self._from_clause.tbls) == 1 else None
         try:
             result = [[row[e.slot_idx] for e in self._select_list_exprs] async for row in self._aexec()]
             return DataFrameResultSet(result, self.schema)
         except excs.ExprEvalError as e:
             self._raise_expr_eval_err(e)
-        except sql.exc.DBAPIError as e:
-            raise excs.Error(f'Error during SQL execution:\n{e}') from e
+        except (sql_exc.DBAPIError, sql_exc.OperationalError, sql_exc.InternalError) as e:
+            Catalog.get().convert_sql_exc(e, tbl=(single_tbl.tbl_version if single_tbl is not None else None))
+            raise # just re-raise if not converted to a Pixeltable error

     def count(self) -> int:
         """Return the number of rows in the DataFrame.
@@ -507,7 +559,7 @@ class DataFrame:

         from pixeltable.plan import Planner

-        with Catalog.get().begin_xact(for_write=False) as conn:
+        with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=False) as conn:
             stmt = Planner.create_count_stmt(self._first_tbl, self.where_clause)
             result: int = conn.execute(stmt).scalar_one()
             assert isinstance(result, int)
@@ -620,7 +672,7 @@ class DataFrame:
             return self

         # analyze select list; wrap literals with the corresponding expressions
-        select_list: list[tuple[exprs.Expr, Optional[str]]] = []
+        select_list: list[tuple[exprs.Expr, str | None]] = []
         for raw_expr, name in base_list:
             expr = exprs.Expr.from_object(raw_expr)
             if expr is None:
@@ -640,8 +692,8 @@ class DataFrame:
                 pass
             if not expr.is_bound_by(self._from_clause.tbls):
                 raise excs.Error(
-                    f"Expression '{expr}' cannot be evaluated in the context of this query's tables "
-                    f'({",".join(tbl.tbl_version.get().versioned_name for tbl in self._from_clause.tbls)})'
+                    f"That expression cannot be evaluated in the context of this query's tables "
+                    f'({",".join(tbl.tbl_version.get().versioned_name for tbl in self._from_clause.tbls)}): {expr}'
                 )
             select_list.append((expr, name))

@@ -652,7 +704,7 @@ class DataFrame:
             if name in seen:
                 repeated_names = [j for j, x in enumerate(names) if x == name]
                 pretty = ', '.join(map(str, repeated_names))
-                raise excs.Error(f'Repeated column name "{name}" in select() at positions: {pretty}')
+                raise excs.Error(f'Repeated column name {name!r} in select() at positions: {pretty}')
             seen.add(name)

         return DataFrame(
@@ -690,13 +742,13 @@ class DataFrame:
             >>> df = person.where(t.age > 30)
         """
         if self.where_clause is not None:
-            raise excs.Error('Where clause already specified')
+            raise excs.Error('where() clause already specified')
         if self.sample_clause is not None:
-            raise excs.Error('where cannot be used after sample()')
+            raise excs.Error('where() cannot be used after sample()')
         if not isinstance(pred, exprs.Expr):
-            raise excs.Error(f'Where() requires a Pixeltable expression, but instead got {type(pred)}')
+            raise excs.Error(f'where() expects a Pixeltable expression; got: {pred}')
         if not pred.col_type.is_bool_type():
-            raise excs.Error(f'Where(): expression needs to return bool, but instead returns {pred.col_type}')
+            raise excs.Error(f'where() expression needs to return `Bool`, but instead returns `{pred.col_type}`')
         return DataFrame(
             from_clause=self._from_clause,
             select_list=self.select_list,
@@ -708,7 +760,7 @@ class DataFrame:
         )

     def _create_join_predicate(
-        self, other: catalog.TableVersionPath, on: Union[exprs.Expr, Sequence[exprs.ColumnRef]]
+        self, other: catalog.TableVersionPath, on: exprs.Expr | Sequence[exprs.ColumnRef]
     ) -> exprs.Expr:
         """Verifies user-specified 'on' argument and converts it into a join predicate."""
         col_refs: list[exprs.ColumnRef] = []
@@ -718,19 +770,21 @@ class DataFrame:
             on = [on]
         elif isinstance(on, exprs.Expr):
             if not on.is_bound_by(joined_tbls):
-                raise excs.Error(f"'on': expression cannot be evaluated in the context of the joined tables: {on}")
+                raise excs.Error(f'`on` expression cannot be evaluated in the context of the joined tables: {on}')
             if not on.col_type.is_bool_type():
-                raise excs.Error(f"'on': boolean expression expected, but got {on.col_type}: {on}")
+                raise excs.Error(
+                    f'`on` expects an expression of type `Bool`, but got one of type `{on.col_type}`: {on}'
+                )
             return on
         elif not isinstance(on, Sequence) or len(on) == 0:
-            raise excs.Error("'on': must be a sequence of column references or a boolean expression")
+            raise excs.Error('`on` must be a sequence of column references or a boolean expression')

         assert isinstance(on, Sequence)
         for col_ref in on:
             if not isinstance(col_ref, exprs.ColumnRef):
-                raise excs.Error("'on': must be a sequence of column references or a boolean expression")
+                raise excs.Error('`on` must be a sequence of column references or a boolean expression')
             if not col_ref.is_bound_by(joined_tbls):
-                raise excs.Error(f"'on': expression cannot be evaluated in the context of the joined tables: {col_ref}")
+                raise excs.Error(f'`on` expression cannot be evaluated in the context of the joined tables: {col_ref}')
             col_refs.append(col_ref)

         predicates: list[exprs.Expr] = []
@@ -738,27 +792,27 @@ class DataFrame:
         assert len(col_refs) > 0 and len(joined_tbls) >= 2
         for col_ref in col_refs:
             # identify the referenced column by name in 'other'
-            rhs_col = other.get_column(col_ref.col.name, include_bases=True)
+            rhs_col = other.get_column(col_ref.col.name)
             if rhs_col is None:
-                raise excs.Error(f"'on': column {col_ref.col.name!r} not found in joined table")
+                raise excs.Error(f'`on` column {col_ref.col.name!r} not found in joined table')
             rhs_col_ref = exprs.ColumnRef(rhs_col)

-            lhs_col_ref: Optional[exprs.ColumnRef] = None
-            if any(tbl.has_column(col_ref.col, include_bases=True) for tbl in self._from_clause.tbls):
+            lhs_col_ref: exprs.ColumnRef | None = None
+            if any(tbl.has_column(col_ref.col) for tbl in self._from_clause.tbls):
                 # col_ref comes from the existing from_clause, we use that directly
                 lhs_col_ref = col_ref
             else:
                 # col_ref comes from other, we need to look for a match in the existing from_clause by name
                 for tbl in self._from_clause.tbls:
-                    col = tbl.get_column(col_ref.col.name, include_bases=True)
+                    col = tbl.get_column(col_ref.col.name)
                     if col is None:
                         continue
                     if lhs_col_ref is not None:
-                        raise excs.Error(f"'on': ambiguous column reference: {col_ref.col.name!r}")
+                        raise excs.Error(f'`on`: ambiguous column reference: {col_ref.col.name}')
                     lhs_col_ref = exprs.ColumnRef(col)
                 if lhs_col_ref is None:
                     tbl_names = [tbl.tbl_name() for tbl in self._from_clause.tbls]
-                    raise excs.Error(f"'on': column {col_ref.col.name!r} not found in any of: {' '.join(tbl_names)}")
+                    raise excs.Error(f'`on`: column {col_ref.col.name!r} not found in any of: {" ".join(tbl_names)}')
             pred = exprs.Comparison(exprs.ComparisonOperator.EQ, lhs_col_ref, rhs_col_ref)
             predicates.append(pred)

@@ -771,7 +825,7 @@ class DataFrame:
     def join(
         self,
         other: catalog.Table,
-        on: Optional[Union[exprs.Expr, Sequence[exprs.ColumnRef]]] = None,
+        on: exprs.Expr | Sequence[exprs.ColumnRef] | None = None,
         how: plan.JoinType.LiteralType = 'inner',
     ) -> DataFrame:
         """
@@ -822,16 +876,16 @@ class DataFrame:
         """
         if self.sample_clause is not None:
             raise excs.Error('join() cannot be used with sample()')
-        join_pred: Optional[exprs.Expr]
+        join_pred: exprs.Expr | None
         if how == 'cross':
             if on is not None:
-                raise excs.Error("'on' not allowed for cross join")
+                raise excs.Error('`on` not allowed for cross join')
             join_pred = None
         else:
             if on is None:
-                raise excs.Error(f"how={how!r} requires 'on'")
+                raise excs.Error(f'`how={how!r}` requires `on` to be present')
             join_pred = self._create_join_predicate(other._tbl_version_path, on)
-        join_clause = plan.JoinClause(join_type=plan.JoinType.validated(how, "'how'"), join_predicate=join_pred)
+        join_clause = plan.JoinClause(join_type=plan.JoinType.validated(how, '`how`'), join_predicate=join_pred)
         from_clause = plan.FromClause(
             tbls=[*self._from_clause.tbls, other._tbl_version_path],
             join_clauses=[*self._from_clause.join_clauses, join_clause],
@@ -888,24 +942,24 @@ class DataFrame:
             >>> df = book.group_by(t.genre).select(t.genre, total=sum(t.price)).show()
         """
         if self.group_by_clause is not None:
-            raise excs.Error('Group-by already specified')
+            raise excs.Error('group_by() already specified')
         if self.sample_clause is not None:
             raise excs.Error('group_by() cannot be used with sample()')

-        grouping_tbl: Optional[catalog.TableVersion] = None
-        group_by_clause: Optional[list[exprs.Expr]] = None
+        grouping_tbl: catalog.TableVersion | None = None
+        group_by_clause: list[exprs.Expr] | None = None
         for item in grouping_items:
             if isinstance(item, (catalog.Table, catalog.TableVersion)):
                 if len(grouping_items) > 1:
-                    raise excs.Error('group_by(): only one table can be specified')
+                    raise excs.Error('group_by(): only one Table can be specified')
                 if len(self._from_clause.tbls) > 1:
                     raise excs.Error('group_by() with Table not supported for joins')
                 grouping_tbl = item if isinstance(item, catalog.TableVersion) else item._tbl_version.get()
                 # we need to make sure that the grouping table is a base of self.tbl
                 base = self._first_tbl.find_tbl_version(grouping_tbl.id)
-                if base is None or base.id == self._first_tbl.tbl_id():
+                if base is None or base.id == self._first_tbl.tbl_id:
                     raise excs.Error(
-                        f'group_by(): {grouping_tbl.name} is not a base table of {self._first_tbl.tbl_name()}'
+                        f'group_by(): {grouping_tbl.name!r} is not a base table of {self._first_tbl.tbl_name()!r}'
                     )
                 break
             if not isinstance(item, exprs.Expr):
@@ -976,7 +1030,7 @@ class DataFrame:
             >>> df = book.order_by(t.price, asc=False).order_by(t.pages)
         """
         if self.sample_clause is not None:
-            raise excs.Error('group_by() cannot be used with sample()')
+            raise excs.Error('order_by() cannot be used with sample()')
         for e in expr_list:
             if not isinstance(e, exprs.Expr):
                 raise excs.Error(f'Invalid expression in order_by(): {e}')
@@ -1017,10 +1071,10 @@ class DataFrame:

     def sample(
         self,
-        n: Optional[int] = None,
-        n_per_stratum: Optional[int] = None,
-        fraction: Optional[float] = None,
-        seed: Optional[int] = None,
+        n: int | None = None,
+        n_per_stratum: int | None = None,
+        fraction: float | None = None,
+        seed: int | None = None,
         stratify_by: Any = None,
     ) -> DataFrame:
         """
@@ -1074,7 +1128,7 @@ class DataFrame:
         """
         # Check context of usage
         if self.sample_clause is not None:
-            raise excs.Error('sample() cannot be used with sample()')
+            raise excs.Error('Multiple sample() clauses not allowed')
         if self.group_by_clause is not None:
             raise excs.Error('sample() cannot be used with group_by()')
         if self.order_by_clause is not None:
@@ -1111,11 +1165,11 @@ class DataFrame:
             if expr is None or not isinstance(expr, exprs.Expr):
                 raise excs.Error(f'Invalid expression: {expr}')
             if not expr.col_type.is_scalar_type():
-                raise excs.Error(f'Invalid type: expression must be a scalar type (not {expr.col_type})')
+                raise excs.Error(f'Invalid type: expression must be a scalar type (not `{expr.col_type}`)')
             if not expr.is_bound_by(self._from_clause.tbls):
                 raise excs.Error(
-                    f"Expression '{expr}' cannot be evaluated in the context of this query's tables "
-                    f'({",".join(tbl.tbl_name() for tbl in self._from_clause.tbls)})'
+                    f"That expression cannot be evaluated in the context of this query's tables "
+                    f'({",".join(tbl.tbl_name() for tbl in self._from_clause.tbls)}): {expr}'
                 )
             stratify_exprs.append(expr)

@@ -1153,18 +1207,42 @@ class DataFrame:
             Via the above DataFrame person, update the column 'city' to 'Oakland'
             and 'state' to 'CA' in the table t:

-            >>> df = person.update({'city': 'Oakland', 'state': 'CA'})
+            >>> person.update({'city': 'Oakland', 'state': 'CA'})

             Via the above DataFrame person, update the column 'age' to 30 for any
             rows where 'year' is 2014 in the table t:

-            >>> df = person.where(t.year == 2014).update({'age': 30})
+            >>> person.where(t.year == 2014).update({'age': 30})
         """
         self._validate_mutable('update', False)
-        tbl_id = self._first_tbl.tbl_id()
-        with Catalog.get().begin_xact(tbl_id=tbl_id, for_write=True):
+        with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
             return self._first_tbl.tbl_version.get().update(value_spec, where=self.where_clause, cascade=cascade)

+    def recompute_columns(
+        self, *columns: str | exprs.ColumnRef, errors_only: bool = False, cascade: bool = True
+    ) -> UpdateStatus:
+        """Recompute one or more computed columns of the underlying table of the DataFrame.
+
+        Args:
+            columns: The names or references of the computed columns to recompute.
+            errors_only: If True, only run the recomputation for rows that have errors in the column (ie, the column's
+                `errortype` property indicates that an error occurred). Only allowed for recomputing a single column.
+            cascade: if True, also update all computed columns that transitively depend on the recomputed columns.
+
+        Returns:
+            UpdateStatus: the status of the operation.
+
+        Example:
+            For table `person` with column `age` and computed column `height`, recompute the value of `height` for all
+            rows where `age` is less than 18:
+
+            >>> df = person.where(t.age < 18).recompute_columns(person.height)
+        """
+        self._validate_mutable('recompute_columns', False)
+        with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
+            tbl = Catalog.get().get_table_by_id(self._first_tbl.tbl_id)
+            return tbl.recompute_columns(*columns, where=self.where_clause, errors_only=errors_only, cascade=cascade)
+
     def delete(self) -> UpdateStatus:
         """Delete rows form the underlying table of the DataFrame.

@@ -1174,19 +1252,14 @@ class DataFrame:
             UpdateStatus: the status of the delete operation.

         Example:
-            Given the DataFrame person from a table t with all its columns and rows:
+            For a table `person` with column `age`, delete all rows where 'age' is less than 18:

-            >>> person = t.select()
-
-            Via the above DataFrame person, delete all rows from the table t where the column 'age' is less than 18:
-
-            >>> df = person.where(t.age < 18).delete()
+            >>> person.where(t.age < 18).delete()
         """
         self._validate_mutable('delete', False)
         if not self._first_tbl.is_insertable():
-            raise excs.Error('Cannot delete from view')
-        tbl_id = self._first_tbl.tbl_id()
-        with Catalog.get().begin_xact(tbl_id=tbl_id, for_write=True):
+            raise excs.Error('Cannot use `delete` on a view.')
+        with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
             return self._first_tbl.tbl_version.get().delete(where=self.where_clause)

     def _validate_mutable(self, op_name: str, allow_select: bool) -> None:
@@ -1196,14 +1269,28 @@ class DataFrame:
             op_name: The name of the operation for which the test is being performed.
             allow_select: If True, allow a select() specification in the Dataframe.
         """
+        self._validate_mutable_op_sequence(op_name, allow_select)
+
+        # TODO: Reconcile these with Table.__check_mutable()
+        assert len(self._from_clause.tbls) == 1
+        # First check if it's a replica, since every replica handle is also a snapshot
+        if self._first_tbl.is_replica():
+            raise excs.Error(f'Cannot use `{op_name}` on a replica.')
+        if self._first_tbl.is_snapshot():
+            raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
+
+    def _validate_mutable_op_sequence(self, op_name: str, allow_select: bool) -> None:
+        """Tests whether the sequence of operations on this DataFrame is valid for a mutation operation."""
         if self.group_by_clause is not None or self.grouping_tbl is not None:
-            raise excs.Error(f'Cannot use `{op_name}` after `group_by`')
+            raise excs.Error(f'Cannot use `{op_name}` after `group_by`.')
         if self.order_by_clause is not None:
-            raise excs.Error(f'Cannot use `{op_name}` after `order_by`')
+            raise excs.Error(f'Cannot use `{op_name}` after `order_by`.')
         if self.select_list is not None and not allow_select:
-            raise excs.Error(f'Cannot use `{op_name}` after `select`')
+            raise excs.Error(f'Cannot use `{op_name}` after `select`.')
         if self.limit_val is not None:
-            raise excs.Error(f'Cannot use `{op_name}` after `limit`')
+            raise excs.Error(f'Cannot use `{op_name}` after `limit`.')
+        if self._has_joins():
+            raise excs.Error(f'Cannot use `{op_name}` after `join`.')

     def as_dict(self) -> dict[str, Any]:
         """
@@ -1307,7 +1394,8 @@ class DataFrame:
             assert data_file_path.is_file()
             return data_file_path
         else:
-            with Catalog.get().begin_xact(for_write=False):
+            # TODO: extend begin_xact() to accept multiple TVPs for joins
+            with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=False):
                 return write_coco_dataset(self, dest_path)

     def to_pytorch_dataset(self, image_format: str = 'pt') -> 'torch.utils.data.IterableDataset':
@@ -1352,7 +1440,7 @@ class DataFrame:
         if dest_path.exists(): # fast path: use cache
             assert dest_path.is_dir()
         else:
-            with Catalog.get().begin_xact(for_write=False):
+            with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=False):
                 export_parquet(self, dest_path, inline_images=True)

         return PixeltablePytorchDataset(path=dest_path, image_format=image_format)
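
Usage note: the diff above adds DataFrameResultSet.to_pydantic(). Below is a minimal, hypothetical sketch of how it might be called; the table name, schema, and sample rows are invented for illustration, and only the method signature shown in the diff is assumed.

    import pydantic
    import pixeltable as pxt

    class Person(pydantic.BaseModel):
        name: str
        age: int

    # hypothetical table whose selected columns match the model's fields
    t = pxt.create_table('people', {'name': pxt.String, 'age': pxt.Int})
    t.insert([{'name': 'Alice', 'age': 31}, {'name': 'Bob', 'age': 17}])

    res = t.select(t.name, t.age).collect()   # DataFrameResultSet
    for person in res.to_pydantic(Person):    # yields one validated Person per row
        print(person.name, person.age)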