pixeltable 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (79)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +9 -1
  4. pixeltable/catalog/catalog.py +559 -134
  5. pixeltable/catalog/column.py +36 -32
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +12 -0
  8. pixeltable/catalog/insertable_table.py +30 -25
  9. pixeltable/catalog/schema_object.py +9 -6
  10. pixeltable/catalog/table.py +334 -267
  11. pixeltable/catalog/table_version.py +360 -241
  12. pixeltable/catalog/table_version_handle.py +18 -2
  13. pixeltable/catalog/table_version_path.py +86 -23
  14. pixeltable/catalog/view.py +47 -23
  15. pixeltable/dataframe.py +198 -19
  16. pixeltable/env.py +6 -4
  17. pixeltable/exceptions.py +6 -0
  18. pixeltable/exec/__init__.py +1 -1
  19. pixeltable/exec/exec_node.py +2 -0
  20. pixeltable/exec/expr_eval/evaluators.py +4 -1
  21. pixeltable/exec/expr_eval/expr_eval_node.py +4 -4
  22. pixeltable/exec/in_memory_data_node.py +1 -1
  23. pixeltable/exec/sql_node.py +188 -22
  24. pixeltable/exprs/column_property_ref.py +16 -6
  25. pixeltable/exprs/column_ref.py +33 -11
  26. pixeltable/exprs/comparison.py +1 -1
  27. pixeltable/exprs/data_row.py +5 -3
  28. pixeltable/exprs/expr.py +11 -4
  29. pixeltable/exprs/literal.py +2 -0
  30. pixeltable/exprs/row_builder.py +4 -6
  31. pixeltable/exprs/rowid_ref.py +8 -0
  32. pixeltable/exprs/similarity_expr.py +1 -0
  33. pixeltable/func/__init__.py +1 -0
  34. pixeltable/func/mcp.py +74 -0
  35. pixeltable/func/query_template_function.py +5 -3
  36. pixeltable/func/tools.py +12 -2
  37. pixeltable/func/udf.py +2 -2
  38. pixeltable/functions/__init__.py +1 -0
  39. pixeltable/functions/anthropic.py +19 -45
  40. pixeltable/functions/deepseek.py +19 -38
  41. pixeltable/functions/fireworks.py +9 -18
  42. pixeltable/functions/gemini.py +165 -33
  43. pixeltable/functions/groq.py +108 -0
  44. pixeltable/functions/llama_cpp.py +6 -6
  45. pixeltable/functions/math.py +63 -0
  46. pixeltable/functions/mistralai.py +16 -53
  47. pixeltable/functions/ollama.py +1 -1
  48. pixeltable/functions/openai.py +82 -165
  49. pixeltable/functions/string.py +212 -58
  50. pixeltable/functions/together.py +22 -80
  51. pixeltable/globals.py +10 -4
  52. pixeltable/index/base.py +5 -0
  53. pixeltable/index/btree.py +5 -0
  54. pixeltable/index/embedding_index.py +5 -0
  55. pixeltable/io/external_store.py +10 -31
  56. pixeltable/io/label_studio.py +5 -5
  57. pixeltable/io/parquet.py +4 -4
  58. pixeltable/io/table_data_conduit.py +1 -32
  59. pixeltable/metadata/__init__.py +11 -2
  60. pixeltable/metadata/converters/convert_13.py +2 -2
  61. pixeltable/metadata/converters/convert_30.py +6 -11
  62. pixeltable/metadata/converters/convert_35.py +9 -0
  63. pixeltable/metadata/converters/convert_36.py +38 -0
  64. pixeltable/metadata/converters/convert_37.py +15 -0
  65. pixeltable/metadata/converters/util.py +3 -9
  66. pixeltable/metadata/notes.py +3 -0
  67. pixeltable/metadata/schema.py +13 -1
  68. pixeltable/plan.py +135 -12
  69. pixeltable/share/packager.py +321 -20
  70. pixeltable/share/publish.py +2 -2
  71. pixeltable/store.py +31 -13
  72. pixeltable/type_system.py +30 -0
  73. pixeltable/utils/dbms.py +1 -1
  74. pixeltable/utils/formatter.py +64 -42
  75. {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/METADATA +2 -1
  76. {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/RECORD +79 -74
  77. {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/LICENSE +0 -0
  78. {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/WHEEL +0 -0
  79. {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/entry_points.txt +0 -0
pixeltable/dataframe.py CHANGED
@@ -14,9 +14,10 @@ import pandas as pd
14
14
  import sqlalchemy as sql
15
15
 
16
16
  from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
17
- from pixeltable.catalog import is_valid_identifier
17
+ from pixeltable.catalog import Catalog, is_valid_identifier
18
18
  from pixeltable.catalog.globals import UpdateStatus
19
19
  from pixeltable.env import Env
20
+ from pixeltable.plan import Planner, SampleClause
20
21
  from pixeltable.type_system import ColumnType
21
22
  from pixeltable.utils.description_helper import DescriptionHelper
22
23
  from pixeltable.utils.formatter import Formatter
@@ -139,6 +140,7 @@ class DataFrame:
139
140
  grouping_tbl: Optional[catalog.TableVersion]
140
141
  order_by_clause: Optional[list[tuple[exprs.Expr, bool]]]
141
142
  limit_val: Optional[exprs.Expr]
143
+ sample_clause: Optional[SampleClause]
142
144
 
143
145
  def __init__(
144
146
  self,
@@ -149,6 +151,7 @@ class DataFrame:
149
151
  grouping_tbl: Optional[catalog.TableVersion] = None,
150
152
  order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None, # list[(expr, asc)]
151
153
  limit: Optional[exprs.Expr] = None,
154
+ sample_clause: Optional[SampleClause] = None,
152
155
  ):
153
156
  self._from_clause = from_clause
154
157
 
@@ -168,6 +171,7 @@ class DataFrame:
168
171
  self.grouping_tbl = grouping_tbl
169
172
  self.order_by_clause = copy.deepcopy(order_by_clause)
170
173
  self.limit_val = limit
174
+ self.sample_clause = sample_clause
171
175
 
172
176
  @classmethod
173
177
  def _normalize_select_list(
@@ -210,8 +214,7 @@ class DataFrame:
210
214
 
211
215
  @property
212
216
  def _first_tbl(self) -> catalog.TableVersionPath:
213
- assert len(self._from_clause.tbls) == 1
214
- return self._from_clause.tbls[0]
217
+ return self._from_clause._first_tbl
215
218
 
216
219
  def _vars(self) -> dict[str, exprs.Variable]:
217
220
  """
@@ -236,6 +239,36 @@ class DataFrame:
236
239
  raise excs.Error(f'Multiple definitions of parameter {var.name}')
237
240
  return unique_vars
238
241
 
242
+ @classmethod
243
+ def _convert_param_to_typed_expr(
244
+ cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: Optional[tuple[Any, Any]] = None
245
+ ) -> Optional[exprs.Expr]:
246
+ if v is None:
247
+ if required:
248
+ raise excs.Error(f'{name!r} parameter must be present')
249
+ return v
250
+ v_expr = exprs.Expr.from_object(v)
251
+ if not v_expr.col_type.matches(required_type):
252
+ raise excs.Error(f'{name!r} parameter must be of type {required_type!r}, instead of {v_expr.col_type}')
253
+ if range is not None:
254
+ if not isinstance(v_expr, exprs.Literal):
255
+ raise excs.Error(f'{name!r} parameter must be a constant, not {v_expr}')
256
+ if range[0] is not None and not (v_expr.val >= range[0]):
257
+ raise excs.Error(f'{name!r} parameter must be >= {range[0]}')
258
+ if range[1] is not None and not (v_expr.val <= range[1]):
259
+ raise excs.Error(f'{name!r} parameter must be <= {range[1]}')
260
+ return v_expr
261
+
262
+ @classmethod
263
+ def validate_constant_type_range(
264
+ cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: Optional[tuple[Any, Any]] = None
265
+ ) -> Any:
266
+ """Validate that the given named parameter is a constant of the required type and within the specified range."""
267
+ v_expr = cls._convert_param_to_typed_expr(v, required_type, required, name, range)
268
+ if v_expr is None:
269
+ return None
270
+ return v_expr.val
271
+
239
272
  def parameters(self) -> dict[str, ColumnType]:
240
273
  """Return a dict mapping parameter name to parameter type.
241
274
 
@@ -280,7 +313,7 @@ class DataFrame:
280
313
  num_rowid_cols = len(self.grouping_tbl.store_tbl.rowid_columns())
281
314
  # the grouping table must be a base of self.tbl
282
315
  assert num_rowid_cols <= len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
283
- group_by_clause = [exprs.RowidRef(self._first_tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
316
+ group_by_clause = self.__rowid_columns(num_rowid_cols)
284
317
  elif self.group_by_clause is not None:
285
318
  group_by_clause = self.group_by_clause
286
319
 
@@ -292,14 +325,21 @@ class DataFrame:
292
325
  self._select_list_exprs,
293
326
  where_clause=self.where_clause,
294
327
  group_by_clause=group_by_clause,
295
- order_by_clause=self.order_by_clause if self.order_by_clause is not None else [],
328
+ order_by_clause=self.order_by_clause,
296
329
  limit=self.limit_val,
330
+ sample_clause=self.sample_clause,
297
331
  )
298
332
 
333
+ def __rowid_columns(self, num_rowid_cols: Optional[int] = None) -> list[exprs.Expr]:
334
+ """Return list of RowidRef for the given number of associated rowids"""
335
+ return Planner.rowid_columns(self._first_tbl.tbl_version, num_rowid_cols)
336
+
299
337
  def _has_joins(self) -> bool:
300
338
  return len(self._from_clause.join_clauses) > 0
301
339
 
302
340
  def show(self, n: int = 20) -> DataFrameResultSet:
341
+ if self.sample_clause is not None:
342
+ raise excs.Error('show() cannot be used with sample()')
303
343
  assert n is not None
304
344
  return self.limit(n).collect()
305
345
 
@@ -322,6 +362,8 @@ class DataFrame:
322
362
  raise excs.Error('head() cannot be used with order_by()')
323
363
  if self._has_joins():
324
364
  raise excs.Error('head() not supported for joins')
365
+ if self.sample_clause is not None:
366
+ raise excs.Error('head() cannot be used with sample()')
325
367
  if self.group_by_clause is not None:
326
368
  raise excs.Error('head() cannot be used with group_by()')
327
369
  num_rowid_cols = len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
@@ -347,6 +389,8 @@ class DataFrame:
347
389
  raise excs.Error('tail() cannot be used with order_by()')
348
390
  if self._has_joins():
349
391
  raise excs.Error('tail() not supported for joins')
392
+ if self.sample_clause is not None:
393
+ raise excs.Error('tail() cannot be used with sample()')
350
394
  if self.group_by_clause is not None:
351
395
  raise excs.Error('tail() cannot be used with group_by()')
352
396
  num_rowid_cols = len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
@@ -431,7 +475,9 @@ class DataFrame:
431
475
  raise excs.Error(msg) from e
432
476
 
433
477
  def _output_row_iterator(self) -> Iterator[list]:
434
- with Env.get().begin_xact():
478
+ # TODO: extend begin_xact() to accept multiple TVPs for joins
479
+ single_tbl = self._first_tbl if len(self._from_clause.tbls) == 1 else None
480
+ with Catalog.get().begin_xact(tbl=single_tbl, for_write=False):
435
481
  try:
436
482
  for data_row in self._exec():
437
483
  yield [data_row[e.slot_idx] for e in self._select_list_exprs]
@@ -463,8 +509,8 @@ class DataFrame:
463
509
 
464
510
  from pixeltable.plan import Planner
465
511
 
466
- stmt = Planner.create_count_stmt(self._first_tbl, self.where_clause)
467
- with Env.get().begin_xact() as conn:
512
+ with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=False) as conn:
513
+ stmt = Planner.create_count_stmt(self._first_tbl, self.where_clause)
468
514
  result: int = conn.execute(stmt).scalar_one()
469
515
  assert isinstance(result, int)
470
516
  return result
@@ -510,6 +556,9 @@ class DataFrame:
510
556
  if self.limit_val is not None:
511
557
  heading_vals.append('Limit')
512
558
  info_vals.append(self.limit_val.display_str(inline=False))
559
+ if self.sample_clause is not None:
560
+ heading_vals.append('Sample')
561
+ info_vals.append(self.sample_clause.display_str(inline=False))
513
562
  assert len(heading_vals) == len(info_vals)
514
563
  return pd.DataFrame(info_vals, index=heading_vals)
515
564
 
@@ -644,6 +693,8 @@ class DataFrame:
644
693
  """
645
694
  if self.where_clause is not None:
646
695
  raise excs.Error('Where clause already specified')
696
+ if self.sample_clause is not None:
697
+ raise excs.Error('where cannot be used after sample()')
647
698
  if not isinstance(pred, exprs.Expr):
648
699
  raise excs.Error(f'Where() requires a Pixeltable expression, but instead got {type(pred)}')
649
700
  if not pred.col_type.is_bool_type():
@@ -771,6 +822,8 @@ class DataFrame:
771
822
 
772
823
  >>> df = t.join(d, on=(t.d1 == d.pk1) & (t.d2 == d.pk2), how='left')
773
824
  """
825
+ if self.sample_clause is not None:
826
+ raise excs.Error('join() cannot be used with sample()')
774
827
  join_pred: Optional[exprs.Expr]
775
828
  if how == 'cross':
776
829
  if on is not None:
@@ -838,6 +891,9 @@ class DataFrame:
838
891
  """
839
892
  if self.group_by_clause is not None:
840
893
  raise excs.Error('Group-by already specified')
894
+ if self.sample_clause is not None:
895
+ raise excs.Error('group_by() cannot be used with sample()')
896
+
841
897
  grouping_tbl: Optional[catalog.TableVersion] = None
842
898
  group_by_clause: Optional[list[exprs.Expr]] = None
843
899
  for item in grouping_items:
@@ -849,7 +905,7 @@ class DataFrame:
849
905
  grouping_tbl = item if isinstance(item, catalog.TableVersion) else item._tbl_version.get()
850
906
  # we need to make sure that the grouping table is a base of self.tbl
851
907
  base = self._first_tbl.find_tbl_version(grouping_tbl.id)
852
- if base is None or base.id == self._first_tbl.tbl_id():
908
+ if base is None or base.id == self._first_tbl.tbl_id:
853
909
  raise excs.Error(
854
910
  f'group_by(): {grouping_tbl.name} is not a base table of {self._first_tbl.tbl_name()}'
855
911
  )
@@ -921,6 +977,8 @@ class DataFrame:
921
977
 
922
978
  >>> df = book.order_by(t.price, asc=False).order_by(t.pages)
923
979
  """
980
+ if self.sample_clause is not None:
981
+ raise excs.Error('group_by() cannot be used with sample()')
924
982
  for e in expr_list:
925
983
  if not isinstance(e, exprs.Expr):
926
984
  raise excs.Error(f'Invalid expression in order_by(): {e}')
@@ -945,10 +1003,10 @@ class DataFrame:
945
1003
  Returns:
946
1004
  A new DataFrame with the specified limited rows.
947
1005
  """
948
- assert n is not None
949
- n = exprs.Expr.from_object(n)
950
- if not n.col_type.is_int_type():
951
- raise excs.Error(f'limit(): parameter must be of type int, instead of {n.col_type}')
1006
+ if self.sample_clause is not None:
1007
+ raise excs.Error('limit() cannot be used with sample()')
1008
+
1009
+ limit_expr = self._convert_param_to_typed_expr(n, ts.IntType(nullable=False), True, 'limit()')
952
1010
  return DataFrame(
953
1011
  from_clause=self._from_clause,
954
1012
  select_list=self.select_list,
@@ -956,7 +1014,124 @@ class DataFrame:
956
1014
  group_by_clause=self.group_by_clause,
957
1015
  grouping_tbl=self.grouping_tbl,
958
1016
  order_by_clause=self.order_by_clause,
959
- limit=n,
1017
+ limit=limit_expr,
1018
+ )
1019
+
1020
+ def sample(
1021
+ self,
1022
+ n: Optional[int] = None,
1023
+ n_per_stratum: Optional[int] = None,
1024
+ fraction: Optional[float] = None,
1025
+ seed: Optional[int] = None,
1026
+ stratify_by: Any = None,
1027
+ ) -> DataFrame:
1028
+ """
1029
+ Return a new DataFrame specifying a sample of rows from the DataFrame, considered in a shuffled order.
1030
+
1031
+ The size of the sample can be specified in three ways:
1032
+
1033
+ - `n`: the total number of rows to produce as a sample
1034
+ - `n_per_stratum`: the number of rows to produce per stratum as a sample
1035
+ - `fraction`: the fraction of available rows to produce as a sample
1036
+
1037
+ The sample can be stratified by one or more columns, which means that the sample will
1038
+ be selected from each stratum separately.
1039
+
1040
+ The data is shuffled before creating the sample.
1041
+
1042
+ Args:
1043
+ n: Total number of rows to produce as a sample.
1044
+ n_per_stratum: Number of rows to produce per stratum as a sample. This parameter is only valid if
1045
+ `stratify_by` is specified. Only one of `n` or `n_per_stratum` can be specified.
1046
+ fraction: Fraction of available rows to produce as a sample. This parameter is not usable with `n` or
1047
+ `n_per_stratum`. The fraction must be between 0.0 and 1.0.
1048
+ seed: Random seed for reproducible shuffling
1049
+ stratify_by: If specified, the sample will be stratified by these values.
1050
+
1051
+ Returns:
1052
+ A new DataFrame which specifies the sampled rows
1053
+
1054
+ Examples:
1055
+ Given the Table `person` containing the field 'age', we can create samples of the table in various ways:
1056
+
1057
+ Sample 100 rows from the above Table:
1058
+
1059
+ >>> df = person.sample(n=100)
1060
+
1061
+ Sample 10% of the rows from the above Table:
1062
+
1063
+ >>> df = person.sample(fraction=0.1)
1064
+
1065
+ Sample 10% of the rows from the above Table, stratified by the column 'age':
1066
+
1067
+ >>> df = person.sample(fraction=0.1, stratify_by=t.age)
1068
+
1069
+ Equal allocation sampling: Sample 2 rows from each age present in the above Table:
1070
+
1071
+ >>> df = person.sample(n_per_stratum=2, stratify_by=t.age)
1072
+
1073
+ Sampling is compatible with the where clause, so we can also sample from a filtered DataFrame:
1074
+
1075
+ >>> df = person.where(t.age > 30).sample(n=100)
1076
+ """
1077
+ # Check context of usage
1078
+ if self.sample_clause is not None:
1079
+ raise excs.Error('sample() cannot be used with sample()')
1080
+ if self.group_by_clause is not None:
1081
+ raise excs.Error('sample() cannot be used with group_by()')
1082
+ if self.order_by_clause is not None:
1083
+ raise excs.Error('sample() cannot be used with order_by()')
1084
+ if self.limit_val is not None:
1085
+ raise excs.Error('sample() cannot be used with limit()')
1086
+ if self._has_joins():
1087
+ raise excs.Error('sample() cannot be used with join()')
1088
+
1089
+ # Check parameter combinations
1090
+ if (n is not None) + (n_per_stratum is not None) + (fraction is not None) != 1:
1091
+ raise excs.Error('Exactly one of `n`, `n_per_stratum`, or `fraction` must be specified.')
1092
+ if n_per_stratum is not None and stratify_by is None:
1093
+ raise excs.Error('Must specify `stratify_by` to use `n_per_stratum`')
1094
+
1095
+ # Check parameter types and values
1096
+ n = self.validate_constant_type_range(n, ts.IntType(nullable=False), False, 'n', (1, None))
1097
+ n_per_stratum = self.validate_constant_type_range(
1098
+ n_per_stratum, ts.IntType(nullable=False), False, 'n_per_stratum', (1, None)
1099
+ )
1100
+ fraction = self.validate_constant_type_range(
1101
+ fraction, ts.FloatType(nullable=False), False, 'fraction', (0.0, 1.0)
1102
+ )
1103
+ seed = self.validate_constant_type_range(seed, ts.IntType(nullable=False), False, 'seed')
1104
+
1105
+ # analyze stratify list
1106
+ stratify_exprs: list[exprs.Expr] = []
1107
+ if stratify_by is not None:
1108
+ if isinstance(stratify_by, exprs.Expr):
1109
+ stratify_by = [stratify_by]
1110
+ if not isinstance(stratify_by, (list, tuple)):
1111
+ raise excs.Error('`stratify_by` must be a list of scalar expressions')
1112
+ for expr in stratify_by:
1113
+ if expr is None or not isinstance(expr, exprs.Expr):
1114
+ raise excs.Error(f'Invalid expression: {expr}')
1115
+ if not expr.col_type.is_scalar_type():
1116
+ raise excs.Error(f'Invalid type: expression must be a scalar type (not {expr.col_type})')
1117
+ if not expr.is_bound_by(self._from_clause.tbls):
1118
+ raise excs.Error(
1119
+ f"Expression '{expr}' cannot be evaluated in the context of this query's tables "
1120
+ f'({",".join(tbl.tbl_name() for tbl in self._from_clause.tbls)})'
1121
+ )
1122
+ stratify_exprs.append(expr)
1123
+
1124
+ sample_clause = SampleClause(None, n, n_per_stratum, fraction, seed, stratify_exprs)
1125
+
1126
+ return DataFrame(
1127
+ from_clause=self._from_clause,
1128
+ select_list=self.select_list,
1129
+ where_clause=self.where_clause,
1130
+ group_by_clause=self.group_by_clause,
1131
+ grouping_tbl=self.grouping_tbl,
1132
+ order_by_clause=self.order_by_clause,
1133
+ limit=self.limit_val,
1134
+ sample_clause=sample_clause,
960
1135
  )
961
1136
 
962
1137
  def update(self, value_spec: dict[str, Any], cascade: bool = True) -> UpdateStatus:
@@ -988,7 +1163,7 @@ class DataFrame:
988
1163
  >>> df = person.where(t.year == 2014).update({'age': 30})
989
1164
  """
990
1165
  self._validate_mutable('update', False)
991
- with Env.get().begin_xact():
1166
+ with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
992
1167
  return self._first_tbl.tbl_version.get().update(value_spec, where=self.where_clause, cascade=cascade)
993
1168
 
994
1169
  def delete(self) -> UpdateStatus:
@@ -1011,7 +1186,7 @@ class DataFrame:
1011
1186
  self._validate_mutable('delete', False)
1012
1187
  if not self._first_tbl.is_insertable():
1013
1188
  raise excs.Error('Cannot delete from view')
1014
- with Env.get().begin_xact():
1189
+ with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
1015
1190
  return self._first_tbl.tbl_version.get().delete(where=self.where_clause)
1016
1191
 
1017
1192
  def _validate_mutable(self, op_name: str, allow_select: bool) -> None:
@@ -1053,13 +1228,14 @@ class DataFrame:
1053
1228
  if self.order_by_clause is not None
1054
1229
  else None,
1055
1230
  'limit_val': self.limit_val.as_dict() if self.limit_val is not None else None,
1231
+ 'sample_clause': self.sample_clause.as_dict() if self.sample_clause is not None else None,
1056
1232
  }
1057
1233
  return d
1058
1234
 
1059
1235
  @classmethod
1060
1236
  def from_dict(cls, d: dict[str, Any]) -> 'DataFrame':
1061
1237
  # we need to wrap the construction with a transaction, because it might need to load metadata
1062
- with Env.get().begin_xact():
1238
+ with Catalog.get().begin_xact(for_write=False):
1063
1239
  tbls = [catalog.TableVersionPath.from_dict(tbl_dict) for tbl_dict in d['from_clause']['tbls']]
1064
1240
  join_clauses = [plan.JoinClause(**clause_dict) for clause_dict in d['from_clause']['join_clauses']]
1065
1241
  from_clause = plan.FromClause(tbls=tbls, join_clauses=join_clauses)
@@ -1079,6 +1255,7 @@ class DataFrame:
1079
1255
  else None
1080
1256
  )
1081
1257
  limit_val = exprs.Expr.from_dict(d['limit_val']) if d['limit_val'] is not None else None
1258
+ sample_clause = SampleClause.from_dict(d['sample_clause']) if d['sample_clause'] is not None else None
1082
1259
 
1083
1260
  return DataFrame(
1084
1261
  from_clause=from_clause,
@@ -1088,6 +1265,7 @@ class DataFrame:
1088
1265
  grouping_tbl=grouping_tbl,
1089
1266
  order_by_clause=order_by_clause,
1090
1267
  limit=limit_val,
1268
+ sample_clause=sample_clause,
1091
1269
  )
1092
1270
 
1093
1271
  def _hash_result_set(self) -> str:
@@ -1129,7 +1307,8 @@ class DataFrame:
1129
1307
  assert data_file_path.is_file()
1130
1308
  return data_file_path
1131
1309
  else:
1132
- with Env.get().begin_xact():
1310
+ # TODO: extend begin_xact() to accept multiple TVPs for joins
1311
+ with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=False):
1133
1312
  return write_coco_dataset(self, dest_path)
1134
1313
 
1135
1314
  def to_pytorch_dataset(self, image_format: str = 'pt') -> 'torch.utils.data.IterableDataset':
@@ -1174,7 +1353,7 @@ class DataFrame:
1174
1353
  if dest_path.exists(): # fast path: use cache
1175
1354
  assert dest_path.is_dir()
1176
1355
  else:
1177
- with Env.get().begin_xact():
1356
+ with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=False):
1178
1357
  export_parquet(self, dest_path, inline_images=True)
1179
1358
 
1180
1359
  return PixeltablePytorchDataset(path=dest_path, image_format=image_format)
pixeltable/env.py CHANGED
@@ -25,6 +25,7 @@ from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
25
25
 
26
26
  import pixeltable_pgserver
27
27
  import sqlalchemy as sql
28
+ from pillow_heif import register_heif_opener # type: ignore[import-untyped]
28
29
  from tqdm import TqdmWarning
29
30
 
30
31
  from pixeltable import exceptions as excs
@@ -191,6 +192,7 @@ class Env:
191
192
  assert self._dbms is not None
192
193
  return self._dbms
193
194
 
195
+ @property
194
196
  def in_xact(self) -> bool:
195
197
  return self._current_conn is not None
196
198
 
@@ -201,20 +203,17 @@ class Env:
201
203
 
202
204
  @contextmanager
203
205
  def begin_xact(self) -> Iterator[sql.Connection]:
204
- """Return a context manager that yields a connection to the database. Idempotent."""
206
+ """Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly."""
205
207
  if self._current_conn is None:
206
208
  assert self._current_session is None
207
209
  try:
208
210
  with self.engine.begin() as conn, sql.orm.Session(conn) as session:
209
- # TODO: remove print() once we're done with debugging the concurrent update behavior
210
- # print(f'{datetime.datetime.now()}: start xact')
211
211
  self._current_conn = conn
212
212
  self._current_session = session
213
213
  yield conn
214
214
  finally:
215
215
  self._current_session = None
216
216
  self._current_conn = None
217
- # print(f'{datetime.datetime.now()}: end xact')
218
217
  else:
219
218
  assert self._current_session is not None
220
219
  yield self._current_conn
@@ -600,6 +599,7 @@ class Env:
600
599
 
601
600
  def _set_up_runtime(self) -> None:
602
601
  """Check for and start runtime services"""
602
+ register_heif_opener()
603
603
  self._start_web_server()
604
604
  self.__register_packages()
605
605
 
@@ -611,9 +611,11 @@ class Env:
611
611
  self.__register_package('fiftyone')
612
612
  self.__register_package('fireworks', library_name='fireworks-ai')
613
613
  self.__register_package('google.genai', library_name='google-genai')
614
+ self.__register_package('groq')
614
615
  self.__register_package('huggingface_hub', library_name='huggingface-hub')
615
616
  self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
616
617
  self.__register_package('llama_cpp', library_name='llama-cpp-python')
618
+ self.__register_package('mcp')
617
619
  self.__register_package('mistralai')
618
620
  self.__register_package('mistune')
619
621
  self.__register_package('ollama')
pixeltable/exceptions.py CHANGED
@@ -10,6 +10,12 @@ class Error(Exception):
10
10
 
11
11
 
12
12
  class ExprEvalError(Exception):
13
+ """
14
+ Used during query execution to signal expr evaluation failures.
15
+
16
+ NOT A USER-FACING EXCEPTION. All ExprEvalError instances need to be converted into Error instances.
17
+ """
18
+
13
19
  expr: 'exprs.Expr'
14
20
  expr_msg: str
15
21
  exc: Exception
@@ -9,4 +9,4 @@ from .exec_node import ExecNode
9
9
  from .expr_eval import ExprEvalNode
10
10
  from .in_memory_data_node import InMemoryDataNode
11
11
  from .row_update_node import RowUpdateNode
12
- from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlScanNode
12
+ from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlSampleNode, SqlScanNode
@@ -73,6 +73,8 @@ class ExecNode(abc.ABC):
73
73
  except RuntimeError:
74
74
  loop = asyncio.new_event_loop()
75
75
  asyncio.set_event_loop(loop)
76
+ # we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
77
+ loop.slow_callback_duration = 3600
76
78
 
77
79
  if _logger.isEnabledFor(logging.DEBUG):
78
80
  loop.set_debug(True)
@@ -317,7 +317,10 @@ class JsonMapperDispatcher(Evaluator):
317
317
  for _ in src
318
318
  ]
319
319
  for nested_row, anchor_val in zip(nested_rows, src):
320
- nested_row[self.scope_anchor.slot_idx] = anchor_val
320
+ # It's possible that self.scope_anchor.slot_idx is None; this corresponds to the case where the
321
+ # mapper expression doesn't actually contain references to RELATIVE_PATH_ROOT.
322
+ if self.scope_anchor.slot_idx is not None:
323
+ nested_row[self.scope_anchor.slot_idx] = anchor_val
321
324
  for slot_idx_, nested_slot_idx in self.external_slot_map.items():
322
325
  nested_row[nested_slot_idx] = row[slot_idx_]
323
326
  self.nested_exec_ctx.init_rows(nested_rows)
@@ -49,7 +49,7 @@ class ExprEvalNode(ExecNode):
49
49
  # execution state
50
50
  tasks: set[asyncio.Task] # collects all running tasks to prevent them from getting gc'd
51
51
  exc_event: asyncio.Event # set if an exception needs to be propagated
52
- error: Optional[Union[excs.Error, excs.ExprEvalError]] # exception that needs to be propagated
52
+ error: Optional[Union[Exception]] # exception that needs to be propagated
53
53
  completed_rows: asyncio.Queue[exprs.DataRow] # rows that have completed evaluation
54
54
  completed_event: asyncio.Event # set when completed_rows is non-empty
55
55
  input_iter: AsyncIterator[DataRowBatch]
@@ -133,10 +133,10 @@ class ExprEvalNode(ExecNode):
133
133
  except StopAsyncIteration:
134
134
  self.input_complete = True
135
135
  _logger.debug(f'finished input: #input_rows={self.num_input_rows}, #avail={self.avail_input_rows}')
136
- except excs.Error as err:
137
- self.error = err
136
+ # make sure to pass DBAPIError through, so the transaction handling logic sees it
137
+ except Exception as exc:
138
+ self.error = exc
138
139
  self.exc_event.set()
139
- # TODO: should we also handle Exception here and create an excs.Error from it?
140
140
 
141
141
  @property
142
142
  def total_buffered(self) -> int:
@@ -38,7 +38,7 @@ class InMemoryDataNode(ExecNode):
38
38
  # we materialize the input slots
39
39
  output_exprs = list(row_builder.input_exprs)
40
40
  super().__init__(row_builder, output_exprs, [], None)
41
- assert tbl.get().is_insertable()
41
+ assert tbl.get().is_insertable
42
42
  self.tbl = tbl
43
43
  self.input_rows = rows
44
44
  self.start_row_id = start_row_id