pixeltable 0.3.15__py3-none-any.whl → 0.4.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (58)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/catalog.py +296 -105
  3. pixeltable/catalog/column.py +10 -8
  4. pixeltable/catalog/dir.py +1 -2
  5. pixeltable/catalog/insertable_table.py +25 -20
  6. pixeltable/catalog/schema_object.py +3 -6
  7. pixeltable/catalog/table.py +261 -189
  8. pixeltable/catalog/table_version.py +333 -202
  9. pixeltable/catalog/table_version_handle.py +15 -2
  10. pixeltable/catalog/table_version_path.py +60 -14
  11. pixeltable/catalog/view.py +38 -6
  12. pixeltable/dataframe.py +196 -18
  13. pixeltable/env.py +4 -4
  14. pixeltable/exec/__init__.py +1 -1
  15. pixeltable/exec/expr_eval/evaluators.py +4 -1
  16. pixeltable/exec/in_memory_data_node.py +1 -1
  17. pixeltable/exec/sql_node.py +171 -22
  18. pixeltable/exprs/column_property_ref.py +15 -6
  19. pixeltable/exprs/column_ref.py +32 -11
  20. pixeltable/exprs/comparison.py +1 -1
  21. pixeltable/exprs/data_row.py +5 -3
  22. pixeltable/exprs/expr.py +7 -0
  23. pixeltable/exprs/literal.py +2 -0
  24. pixeltable/exprs/row_builder.py +4 -6
  25. pixeltable/exprs/rowid_ref.py +8 -0
  26. pixeltable/exprs/similarity_expr.py +1 -0
  27. pixeltable/func/query_template_function.py +1 -1
  28. pixeltable/func/tools.py +1 -1
  29. pixeltable/functions/gemini.py +0 -1
  30. pixeltable/functions/string.py +212 -58
  31. pixeltable/globals.py +12 -4
  32. pixeltable/index/base.py +5 -0
  33. pixeltable/index/btree.py +5 -0
  34. pixeltable/index/embedding_index.py +5 -0
  35. pixeltable/io/external_store.py +8 -29
  36. pixeltable/io/label_studio.py +1 -1
  37. pixeltable/io/parquet.py +2 -2
  38. pixeltable/io/table_data_conduit.py +0 -31
  39. pixeltable/metadata/__init__.py +11 -2
  40. pixeltable/metadata/converters/convert_13.py +2 -2
  41. pixeltable/metadata/converters/convert_30.py +6 -11
  42. pixeltable/metadata/converters/convert_35.py +9 -0
  43. pixeltable/metadata/converters/convert_36.py +38 -0
  44. pixeltable/metadata/converters/util.py +3 -9
  45. pixeltable/metadata/notes.py +2 -0
  46. pixeltable/metadata/schema.py +8 -1
  47. pixeltable/plan.py +221 -14
  48. pixeltable/share/packager.py +137 -13
  49. pixeltable/share/publish.py +2 -2
  50. pixeltable/store.py +19 -13
  51. pixeltable/utils/dbms.py +1 -1
  52. pixeltable/utils/formatter.py +64 -42
  53. pixeltable/utils/sample.py +25 -0
  54. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/METADATA +2 -1
  55. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/RECORD +58 -55
  56. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/LICENSE +0 -0
  57. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/WHEEL +0 -0
  58. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/entry_points.txt +0 -0
pixeltable/dataframe.py CHANGED
@@ -14,9 +14,10 @@ import pandas as pd
14
14
  import sqlalchemy as sql
15
15
 
16
16
  from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
17
- from pixeltable.catalog import is_valid_identifier
17
+ from pixeltable.catalog import Catalog, is_valid_identifier
18
18
  from pixeltable.catalog.globals import UpdateStatus
19
19
  from pixeltable.env import Env
20
+ from pixeltable.plan import Planner, SampleClause
20
21
  from pixeltable.type_system import ColumnType
21
22
  from pixeltable.utils.description_helper import DescriptionHelper
22
23
  from pixeltable.utils.formatter import Formatter
@@ -139,6 +140,7 @@ class DataFrame:
139
140
  grouping_tbl: Optional[catalog.TableVersion]
140
141
  order_by_clause: Optional[list[tuple[exprs.Expr, bool]]]
141
142
  limit_val: Optional[exprs.Expr]
143
+ sample_clause: Optional[SampleClause]
142
144
 
143
145
  def __init__(
144
146
  self,
@@ -149,6 +151,7 @@ class DataFrame:
149
151
  grouping_tbl: Optional[catalog.TableVersion] = None,
150
152
  order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None, # list[(expr, asc)]
151
153
  limit: Optional[exprs.Expr] = None,
154
+ sample_clause: Optional[SampleClause] = None,
152
155
  ):
153
156
  self._from_clause = from_clause
154
157
 
@@ -168,6 +171,7 @@ class DataFrame:
168
171
  self.grouping_tbl = grouping_tbl
169
172
  self.order_by_clause = copy.deepcopy(order_by_clause)
170
173
  self.limit_val = limit
174
+ self.sample_clause = sample_clause
171
175
 
172
176
  @classmethod
173
177
  def _normalize_select_list(
@@ -210,8 +214,7 @@ class DataFrame:
210
214
 
211
215
  @property
212
216
  def _first_tbl(self) -> catalog.TableVersionPath:
213
- assert len(self._from_clause.tbls) == 1
214
- return self._from_clause.tbls[0]
217
+ return self._from_clause._first_tbl
215
218
 
216
219
  def _vars(self) -> dict[str, exprs.Variable]:
217
220
  """
@@ -236,6 +239,36 @@ class DataFrame:
236
239
  raise excs.Error(f'Multiple definitions of parameter {var.name}')
237
240
  return unique_vars
238
241
 
242
+ @classmethod
243
+ def _convert_param_to_typed_expr(
244
+ cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: Optional[tuple[Any, Any]] = None
245
+ ) -> Optional[exprs.Expr]:
246
+ if v is None:
247
+ if required:
248
+ raise excs.Error(f'{name!r} parameter must be present')
249
+ return v
250
+ v_expr = exprs.Expr.from_object(v)
251
+ if not v_expr.col_type.matches(required_type):
252
+ raise excs.Error(f'{name!r} parameter must be of type {required_type!r}, instead of {v_expr.col_type}')
253
+ if range is not None:
254
+ if not isinstance(v_expr, exprs.Literal):
255
+ raise excs.Error(f'{name!r} parameter must be a constant, not {v_expr}')
256
+ if range[0] is not None and not (v_expr.val >= range[0]):
257
+ raise excs.Error(f'{name!r} parameter must be >= {range[0]}')
258
+ if range[1] is not None and not (v_expr.val <= range[1]):
259
+ raise excs.Error(f'{name!r} parameter must be <= {range[1]}')
260
+ return v_expr
261
+
262
+ @classmethod
263
+ def validate_constant_type_range(
264
+ cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: Optional[tuple[Any, Any]] = None
265
+ ) -> Any:
266
+ """Validate that the given named parameter is a constant of the required type and within the specified range."""
267
+ v_expr = cls._convert_param_to_typed_expr(v, required_type, required, name, range)
268
+ if v_expr is None:
269
+ return None
270
+ return v_expr.val
271
+
239
272
  def parameters(self) -> dict[str, ColumnType]:
240
273
  """Return a dict mapping parameter name to parameter type.
241
274
 
@@ -280,7 +313,7 @@ class DataFrame:
280
313
  num_rowid_cols = len(self.grouping_tbl.store_tbl.rowid_columns())
281
314
  # the grouping table must be a base of self.tbl
282
315
  assert num_rowid_cols <= len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
283
- group_by_clause = [exprs.RowidRef(self._first_tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
316
+ group_by_clause = self.__rowid_columns(num_rowid_cols)
284
317
  elif self.group_by_clause is not None:
285
318
  group_by_clause = self.group_by_clause
286
319
 
@@ -292,14 +325,21 @@ class DataFrame:
292
325
  self._select_list_exprs,
293
326
  where_clause=self.where_clause,
294
327
  group_by_clause=group_by_clause,
295
- order_by_clause=self.order_by_clause if self.order_by_clause is not None else [],
328
+ order_by_clause=self.order_by_clause,
296
329
  limit=self.limit_val,
330
+ sample_clause=self.sample_clause,
297
331
  )
298
332
 
333
+ def __rowid_columns(self, num_rowid_cols: Optional[int] = None) -> list[exprs.Expr]:
334
+ """Return list of RowidRef for the given number of associated rowids"""
335
+ return Planner.rowid_columns(self._first_tbl.tbl_version, num_rowid_cols)
336
+
299
337
  def _has_joins(self) -> bool:
300
338
  return len(self._from_clause.join_clauses) > 0
301
339
 
302
340
  def show(self, n: int = 20) -> DataFrameResultSet:
341
+ if self.sample_clause is not None:
342
+ raise excs.Error('show() cannot be used with sample()')
303
343
  assert n is not None
304
344
  return self.limit(n).collect()
305
345
 
@@ -322,6 +362,8 @@ class DataFrame:
322
362
  raise excs.Error('head() cannot be used with order_by()')
323
363
  if self._has_joins():
324
364
  raise excs.Error('head() not supported for joins')
365
+ if self.sample_clause is not None:
366
+ raise excs.Error('head() cannot be used with sample()')
325
367
  if self.group_by_clause is not None:
326
368
  raise excs.Error('head() cannot be used with group_by()')
327
369
  num_rowid_cols = len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
@@ -347,6 +389,8 @@ class DataFrame:
347
389
  raise excs.Error('tail() cannot be used with order_by()')
348
390
  if self._has_joins():
349
391
  raise excs.Error('tail() not supported for joins')
392
+ if self.sample_clause is not None:
393
+ raise excs.Error('tail() cannot be used with sample()')
350
394
  if self.group_by_clause is not None:
351
395
  raise excs.Error('tail() cannot be used with group_by()')
352
396
  num_rowid_cols = len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
@@ -431,7 +475,7 @@ class DataFrame:
431
475
  raise excs.Error(msg) from e
432
476
 
433
477
  def _output_row_iterator(self) -> Iterator[list]:
434
- with Env.get().begin_xact():
478
+ with Catalog.get().begin_xact(for_write=False):
435
479
  try:
436
480
  for data_row in self._exec():
437
481
  yield [data_row[e.slot_idx] for e in self._select_list_exprs]
@@ -463,8 +507,8 @@ class DataFrame:
463
507
 
464
508
  from pixeltable.plan import Planner
465
509
 
466
- stmt = Planner.create_count_stmt(self._first_tbl, self.where_clause)
467
- with Env.get().begin_xact() as conn:
510
+ with Catalog.get().begin_xact(for_write=False) as conn:
511
+ stmt = Planner.create_count_stmt(self._first_tbl, self.where_clause)
468
512
  result: int = conn.execute(stmt).scalar_one()
469
513
  assert isinstance(result, int)
470
514
  return result
@@ -510,6 +554,9 @@ class DataFrame:
510
554
  if self.limit_val is not None:
511
555
  heading_vals.append('Limit')
512
556
  info_vals.append(self.limit_val.display_str(inline=False))
557
+ if self.sample_clause is not None:
558
+ heading_vals.append('Sample')
559
+ info_vals.append(self.sample_clause.display_str(inline=False))
513
560
  assert len(heading_vals) == len(info_vals)
514
561
  return pd.DataFrame(info_vals, index=heading_vals)
515
562
 
@@ -644,6 +691,8 @@ class DataFrame:
644
691
  """
645
692
  if self.where_clause is not None:
646
693
  raise excs.Error('Where clause already specified')
694
+ if self.sample_clause is not None:
695
+ raise excs.Error('where cannot be used after sample()')
647
696
  if not isinstance(pred, exprs.Expr):
648
697
  raise excs.Error(f'Where() requires a Pixeltable expression, but instead got {type(pred)}')
649
698
  if not pred.col_type.is_bool_type():
@@ -771,6 +820,8 @@ class DataFrame:
771
820
 
772
821
  >>> df = t.join(d, on=(t.d1 == d.pk1) & (t.d2 == d.pk2), how='left')
773
822
  """
823
+ if self.sample_clause is not None:
824
+ raise excs.Error('join() cannot be used with sample()')
774
825
  join_pred: Optional[exprs.Expr]
775
826
  if how == 'cross':
776
827
  if on is not None:
@@ -838,6 +889,9 @@ class DataFrame:
838
889
  """
839
890
  if self.group_by_clause is not None:
840
891
  raise excs.Error('Group-by already specified')
892
+ if self.sample_clause is not None:
893
+ raise excs.Error('group_by() cannot be used with sample()')
894
+
841
895
  grouping_tbl: Optional[catalog.TableVersion] = None
842
896
  group_by_clause: Optional[list[exprs.Expr]] = None
843
897
  for item in grouping_items:
@@ -921,6 +975,8 @@ class DataFrame:
921
975
 
922
976
  >>> df = book.order_by(t.price, asc=False).order_by(t.pages)
923
977
  """
978
+ if self.sample_clause is not None:
979
+ raise excs.Error('group_by() cannot be used with sample()')
924
980
  for e in expr_list:
925
981
  if not isinstance(e, exprs.Expr):
926
982
  raise excs.Error(f'Invalid expression in order_by(): {e}')
@@ -945,10 +1001,10 @@ class DataFrame:
945
1001
  Returns:
946
1002
  A new DataFrame with the specified limited rows.
947
1003
  """
948
- assert n is not None
949
- n = exprs.Expr.from_object(n)
950
- if not n.col_type.is_int_type():
951
- raise excs.Error(f'limit(): parameter must be of type int, instead of {n.col_type}')
1004
+ if self.sample_clause is not None:
1005
+ raise excs.Error('limit() cannot be used with sample()')
1006
+
1007
+ limit_expr = self._convert_param_to_typed_expr(n, ts.IntType(nullable=False), True, 'limit()')
952
1008
  return DataFrame(
953
1009
  from_clause=self._from_clause,
954
1010
  select_list=self.select_list,
@@ -956,7 +1012,124 @@ class DataFrame:
956
1012
  group_by_clause=self.group_by_clause,
957
1013
  grouping_tbl=self.grouping_tbl,
958
1014
  order_by_clause=self.order_by_clause,
959
- limit=n,
1015
+ limit=limit_expr,
1016
+ )
1017
+
1018
+ def sample(
1019
+ self,
1020
+ n: Optional[int] = None,
1021
+ n_per_stratum: Optional[int] = None,
1022
+ fraction: Optional[float] = None,
1023
+ seed: Optional[int] = None,
1024
+ stratify_by: Any = None,
1025
+ ) -> DataFrame:
1026
+ """
1027
+ Return a new DataFrame specifying a sample of rows from the DataFrame, considered in a shuffled order.
1028
+
1029
+ The size of the sample can be specified in three ways:
1030
+
1031
+ - `n`: the total number of rows to produce as a sample
1032
+ - `n_per_stratum`: the number of rows to produce per stratum as a sample
1033
+ - `fraction`: the fraction of available rows to produce as a sample
1034
+
1035
+ The sample can be stratified by one or more columns, which means that the sample will
1036
+ be selected from each stratum separately.
1037
+
1038
+ The data is shuffled before creating the sample.
1039
+
1040
+ Args:
1041
+ n: Total number of rows to produce as a sample.
1042
+ n_per_stratum: Number of rows to produce per stratum as a sample. This parameter is only valid if
1043
+ `stratify_by` is specified. Only one of `n` or `n_per_stratum` can be specified.
1044
+ fraction: Fraction of available rows to produce as a sample. This parameter is not usable with `n` or
1045
+ `n_per_stratum`. The fraction must be between 0.0 and 1.0.
1046
+ seed: Random seed for reproducible shuffling
1047
+ stratify_by: If specified, the sample will be stratified by these values.
1048
+
1049
+ Returns:
1050
+ A new DataFrame which specifies the sampled rows
1051
+
1052
+ Examples:
1053
+ Given the Table `person` containing the field 'age', we can create samples of the table in various ways:
1054
+
1055
+ Sample 100 rows from the above Table:
1056
+
1057
+ >>> df = person.sample(n=100)
1058
+
1059
+ Sample 10% of the rows from the above Table:
1060
+
1061
+ >>> df = person.sample(fraction=0.1)
1062
+
1063
+ Sample 10% of the rows from the above Table, stratified by the column 'age':
1064
+
1065
+ >>> df = person.sample(fraction=0.1, stratify_by=t.age)
1066
+
1067
+ Equal allocation sampling: Sample 2 rows from each age present in the above Table:
1068
+
1069
+ >>> df = person.sample(n_per_stratum=2, stratify_by=t.age)
1070
+
1071
+ Sampling is compatible with the where clause, so we can also sample from a filtered DataFrame:
1072
+
1073
+ >>> df = person.where(t.age > 30).sample(n=100)
1074
+ """
1075
+ # Check context of usage
1076
+ if self.sample_clause is not None:
1077
+ raise excs.Error('sample() cannot be used with sample()')
1078
+ if self.group_by_clause is not None:
1079
+ raise excs.Error('sample() cannot be used with group_by()')
1080
+ if self.order_by_clause is not None:
1081
+ raise excs.Error('sample() cannot be used with order_by()')
1082
+ if self.limit_val is not None:
1083
+ raise excs.Error('sample() cannot be used with limit()')
1084
+ if self._has_joins():
1085
+ raise excs.Error('sample() cannot be used with join()')
1086
+
1087
+ # Check parameter combinations
1088
+ if (n is not None) + (n_per_stratum is not None) + (fraction is not None) != 1:
1089
+ raise excs.Error('Exactly one of `n`, `n_per_stratum`, or `fraction` must be specified.')
1090
+ if n_per_stratum is not None and stratify_by is None:
1091
+ raise excs.Error('Must specify `stratify_by` to use `n_per_stratum`')
1092
+
1093
+ # Check parameter types and values
1094
+ n = self.validate_constant_type_range(n, ts.IntType(nullable=False), False, 'n', (1, None))
1095
+ n_per_stratum = self.validate_constant_type_range(
1096
+ n_per_stratum, ts.IntType(nullable=False), False, 'n_per_stratum', (1, None)
1097
+ )
1098
+ fraction = self.validate_constant_type_range(
1099
+ fraction, ts.FloatType(nullable=False), False, 'fraction', (0.0, 1.0)
1100
+ )
1101
+ seed = self.validate_constant_type_range(seed, ts.IntType(nullable=False), False, 'seed')
1102
+
1103
+ # analyze stratify list
1104
+ stratify_exprs: list[exprs.Expr] = []
1105
+ if stratify_by is not None:
1106
+ if isinstance(stratify_by, exprs.Expr):
1107
+ stratify_by = [stratify_by]
1108
+ if not isinstance(stratify_by, (list, tuple)):
1109
+ raise excs.Error('`stratify_by` must be a list of scalar expressions')
1110
+ for expr in stratify_by:
1111
+ if expr is None or not isinstance(expr, exprs.Expr):
1112
+ raise excs.Error(f'Invalid expression: {expr}')
1113
+ if not expr.col_type.is_scalar_type():
1114
+ raise excs.Error(f'Invalid type: expression must be a scalar type (not {expr.col_type})')
1115
+ if not expr.is_bound_by(self._from_clause.tbls):
1116
+ raise excs.Error(
1117
+ f"Expression '{expr}' cannot be evaluated in the context of this query's tables "
1118
+ f'({",".join(tbl.tbl_name() for tbl in self._from_clause.tbls)})'
1119
+ )
1120
+ stratify_exprs.append(expr)
1121
+
1122
+ sample_clause = SampleClause(None, n, n_per_stratum, fraction, seed, stratify_exprs)
1123
+
1124
+ return DataFrame(
1125
+ from_clause=self._from_clause,
1126
+ select_list=self.select_list,
1127
+ where_clause=self.where_clause,
1128
+ group_by_clause=self.group_by_clause,
1129
+ grouping_tbl=self.grouping_tbl,
1130
+ order_by_clause=self.order_by_clause,
1131
+ limit=self.limit_val,
1132
+ sample_clause=sample_clause,
960
1133
  )
961
1134
 
962
1135
  def update(self, value_spec: dict[str, Any], cascade: bool = True) -> UpdateStatus:
@@ -988,7 +1161,8 @@ class DataFrame:
988
1161
  >>> df = person.where(t.year == 2014).update({'age': 30})
989
1162
  """
990
1163
  self._validate_mutable('update', False)
991
- with Env.get().begin_xact():
1164
+ tbl_id = self._first_tbl.tbl_id()
1165
+ with Catalog.get().begin_xact(tbl_id=tbl_id, for_write=True):
992
1166
  return self._first_tbl.tbl_version.get().update(value_spec, where=self.where_clause, cascade=cascade)
993
1167
 
994
1168
  def delete(self) -> UpdateStatus:
@@ -1011,7 +1185,8 @@ class DataFrame:
1011
1185
  self._validate_mutable('delete', False)
1012
1186
  if not self._first_tbl.is_insertable():
1013
1187
  raise excs.Error('Cannot delete from view')
1014
- with Env.get().begin_xact():
1188
+ tbl_id = self._first_tbl.tbl_id()
1189
+ with Catalog.get().begin_xact(tbl_id=tbl_id, for_write=True):
1015
1190
  return self._first_tbl.tbl_version.get().delete(where=self.where_clause)
1016
1191
 
1017
1192
  def _validate_mutable(self, op_name: str, allow_select: bool) -> None:
@@ -1053,13 +1228,14 @@ class DataFrame:
1053
1228
  if self.order_by_clause is not None
1054
1229
  else None,
1055
1230
  'limit_val': self.limit_val.as_dict() if self.limit_val is not None else None,
1231
+ 'sample_clause': self.sample_clause.as_dict() if self.sample_clause is not None else None,
1056
1232
  }
1057
1233
  return d
1058
1234
 
1059
1235
  @classmethod
1060
1236
  def from_dict(cls, d: dict[str, Any]) -> 'DataFrame':
1061
1237
  # we need to wrap the construction with a transaction, because it might need to load metadata
1062
- with Env.get().begin_xact():
1238
+ with Catalog.get().begin_xact(for_write=False):
1063
1239
  tbls = [catalog.TableVersionPath.from_dict(tbl_dict) for tbl_dict in d['from_clause']['tbls']]
1064
1240
  join_clauses = [plan.JoinClause(**clause_dict) for clause_dict in d['from_clause']['join_clauses']]
1065
1241
  from_clause = plan.FromClause(tbls=tbls, join_clauses=join_clauses)
@@ -1079,6 +1255,7 @@ class DataFrame:
1079
1255
  else None
1080
1256
  )
1081
1257
  limit_val = exprs.Expr.from_dict(d['limit_val']) if d['limit_val'] is not None else None
1258
+ sample_clause = SampleClause.from_dict(d['sample_clause']) if d['sample_clause'] is not None else None
1082
1259
 
1083
1260
  return DataFrame(
1084
1261
  from_clause=from_clause,
@@ -1088,6 +1265,7 @@ class DataFrame:
1088
1265
  grouping_tbl=grouping_tbl,
1089
1266
  order_by_clause=order_by_clause,
1090
1267
  limit=limit_val,
1268
+ sample_clause=sample_clause,
1091
1269
  )
1092
1270
 
1093
1271
  def _hash_result_set(self) -> str:
@@ -1129,7 +1307,7 @@ class DataFrame:
1129
1307
  assert data_file_path.is_file()
1130
1308
  return data_file_path
1131
1309
  else:
1132
- with Env.get().begin_xact():
1310
+ with Catalog.get().begin_xact(for_write=False):
1133
1311
  return write_coco_dataset(self, dest_path)
1134
1312
 
1135
1313
  def to_pytorch_dataset(self, image_format: str = 'pt') -> 'torch.utils.data.IterableDataset':
@@ -1174,7 +1352,7 @@ class DataFrame:
1174
1352
  if dest_path.exists(): # fast path: use cache
1175
1353
  assert dest_path.is_dir()
1176
1354
  else:
1177
- with Env.get().begin_xact():
1355
+ with Catalog.get().begin_xact(for_write=False):
1178
1356
  export_parquet(self, dest_path, inline_images=True)
1179
1357
 
1180
1358
  return PixeltablePytorchDataset(path=dest_path, image_format=image_format)
pixeltable/env.py CHANGED
@@ -25,6 +25,7 @@ from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
25
25
 
26
26
  import pixeltable_pgserver
27
27
  import sqlalchemy as sql
28
+ from pillow_heif import register_heif_opener # type: ignore[import-untyped]
28
29
  from tqdm import TqdmWarning
29
30
 
30
31
  from pixeltable import exceptions as excs
@@ -191,6 +192,7 @@ class Env:
191
192
  assert self._dbms is not None
192
193
  return self._dbms
193
194
 
195
+ @property
194
196
  def in_xact(self) -> bool:
195
197
  return self._current_conn is not None
196
198
 
@@ -201,20 +203,17 @@ class Env:
201
203
 
202
204
  @contextmanager
203
205
  def begin_xact(self) -> Iterator[sql.Connection]:
204
- """Return a context manager that yields a connection to the database. Idempotent."""
206
+ """Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly."""
205
207
  if self._current_conn is None:
206
208
  assert self._current_session is None
207
209
  try:
208
210
  with self.engine.begin() as conn, sql.orm.Session(conn) as session:
209
- # TODO: remove print() once we're done with debugging the concurrent update behavior
210
- # print(f'{datetime.datetime.now()}: start xact')
211
211
  self._current_conn = conn
212
212
  self._current_session = session
213
213
  yield conn
214
214
  finally:
215
215
  self._current_session = None
216
216
  self._current_conn = None
217
- # print(f'{datetime.datetime.now()}: end xact')
218
217
  else:
219
218
  assert self._current_session is not None
220
219
  yield self._current_conn
@@ -600,6 +599,7 @@ class Env:
600
599
 
601
600
  def _set_up_runtime(self) -> None:
602
601
  """Check for and start runtime services"""
602
+ register_heif_opener()
603
603
  self._start_web_server()
604
604
  self.__register_packages()
605
605
 
@@ -9,4 +9,4 @@ from .exec_node import ExecNode
9
9
  from .expr_eval import ExprEvalNode
10
10
  from .in_memory_data_node import InMemoryDataNode
11
11
  from .row_update_node import RowUpdateNode
12
- from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlScanNode
12
+ from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlSampleNode, SqlScanNode
@@ -317,7 +317,10 @@ class JsonMapperDispatcher(Evaluator):
317
317
  for _ in src
318
318
  ]
319
319
  for nested_row, anchor_val in zip(nested_rows, src):
320
- nested_row[self.scope_anchor.slot_idx] = anchor_val
320
+ # It's possible that self.scope_anchor.slot_idx is None; this corresponds to the case where the
321
+ # mapper expression doesn't actually contain references to RELATIVE_PATH_ROOT.
322
+ if self.scope_anchor.slot_idx is not None:
323
+ nested_row[self.scope_anchor.slot_idx] = anchor_val
321
324
  for slot_idx_, nested_slot_idx in self.external_slot_map.items():
322
325
  nested_row[nested_slot_idx] = row[slot_idx_]
323
326
  self.nested_exec_ctx.init_rows(nested_rows)
@@ -38,7 +38,7 @@ class InMemoryDataNode(ExecNode):
38
38
  # we materialize the input slots
39
39
  output_exprs = list(row_builder.input_exprs)
40
40
  super().__init__(row_builder, output_exprs, [], None)
41
- assert tbl.get().is_insertable()
41
+ assert tbl.get().is_insertable
42
42
  self.tbl = tbl
43
43
  self.input_rows = rows
44
44
  self.start_row_id = start_row_id