pixeltable 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable has been flagged as potentially problematic; consult the registry's advisory page for details.

Files changed (127):
  1. pixeltable/__init__.py +5 -3
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -0
  4. pixeltable/catalog/catalog.py +335 -128
  5. pixeltable/catalog/column.py +22 -5
  6. pixeltable/catalog/dir.py +19 -6
  7. pixeltable/catalog/insertable_table.py +34 -37
  8. pixeltable/catalog/named_function.py +0 -4
  9. pixeltable/catalog/schema_object.py +28 -42
  10. pixeltable/catalog/table.py +193 -158
  11. pixeltable/catalog/table_version.py +191 -232
  12. pixeltable/catalog/table_version_handle.py +50 -0
  13. pixeltable/catalog/table_version_path.py +49 -33
  14. pixeltable/catalog/view.py +56 -96
  15. pixeltable/config.py +103 -0
  16. pixeltable/dataframe.py +89 -89
  17. pixeltable/env.py +98 -168
  18. pixeltable/exec/aggregation_node.py +5 -4
  19. pixeltable/exec/cache_prefetch_node.py +1 -1
  20. pixeltable/exec/component_iteration_node.py +13 -9
  21. pixeltable/exec/data_row_batch.py +3 -3
  22. pixeltable/exec/exec_context.py +0 -4
  23. pixeltable/exec/exec_node.py +3 -2
  24. pixeltable/exec/expr_eval/schedulers.py +2 -1
  25. pixeltable/exec/in_memory_data_node.py +9 -4
  26. pixeltable/exec/row_update_node.py +1 -2
  27. pixeltable/exec/sql_node.py +20 -16
  28. pixeltable/exprs/__init__.py +2 -0
  29. pixeltable/exprs/arithmetic_expr.py +7 -11
  30. pixeltable/exprs/array_slice.py +1 -1
  31. pixeltable/exprs/column_property_ref.py +3 -3
  32. pixeltable/exprs/column_ref.py +12 -13
  33. pixeltable/exprs/comparison.py +3 -6
  34. pixeltable/exprs/compound_predicate.py +4 -4
  35. pixeltable/exprs/expr.py +31 -22
  36. pixeltable/exprs/expr_dict.py +3 -3
  37. pixeltable/exprs/expr_set.py +1 -1
  38. pixeltable/exprs/function_call.py +110 -80
  39. pixeltable/exprs/globals.py +3 -3
  40. pixeltable/exprs/in_predicate.py +1 -1
  41. pixeltable/exprs/inline_expr.py +3 -3
  42. pixeltable/exprs/is_null.py +1 -1
  43. pixeltable/exprs/json_mapper.py +2 -2
  44. pixeltable/exprs/json_path.py +17 -10
  45. pixeltable/exprs/literal.py +1 -1
  46. pixeltable/exprs/method_ref.py +2 -2
  47. pixeltable/exprs/row_builder.py +8 -17
  48. pixeltable/exprs/rowid_ref.py +21 -10
  49. pixeltable/exprs/similarity_expr.py +5 -5
  50. pixeltable/exprs/sql_element_cache.py +1 -1
  51. pixeltable/exprs/type_cast.py +2 -3
  52. pixeltable/exprs/variable.py +2 -2
  53. pixeltable/ext/__init__.py +2 -0
  54. pixeltable/ext/functions/__init__.py +2 -0
  55. pixeltable/ext/functions/yolox.py +3 -3
  56. pixeltable/func/__init__.py +3 -1
  57. pixeltable/func/aggregate_function.py +9 -9
  58. pixeltable/func/callable_function.py +3 -4
  59. pixeltable/func/expr_template_function.py +6 -16
  60. pixeltable/func/function.py +48 -14
  61. pixeltable/func/function_registry.py +1 -3
  62. pixeltable/func/query_template_function.py +5 -12
  63. pixeltable/func/signature.py +23 -22
  64. pixeltable/func/tools.py +3 -3
  65. pixeltable/func/udf.py +6 -4
  66. pixeltable/functions/__init__.py +2 -0
  67. pixeltable/functions/fireworks.py +7 -4
  68. pixeltable/functions/globals.py +4 -5
  69. pixeltable/functions/huggingface.py +1 -5
  70. pixeltable/functions/image.py +17 -7
  71. pixeltable/functions/llama_cpp.py +1 -1
  72. pixeltable/functions/mistralai.py +1 -1
  73. pixeltable/functions/ollama.py +4 -4
  74. pixeltable/functions/openai.py +19 -19
  75. pixeltable/functions/string.py +23 -30
  76. pixeltable/functions/timestamp.py +11 -6
  77. pixeltable/functions/together.py +14 -12
  78. pixeltable/functions/util.py +1 -1
  79. pixeltable/functions/video.py +5 -4
  80. pixeltable/functions/vision.py +6 -9
  81. pixeltable/functions/whisper.py +3 -3
  82. pixeltable/globals.py +246 -260
  83. pixeltable/index/__init__.py +2 -0
  84. pixeltable/index/base.py +1 -1
  85. pixeltable/index/btree.py +3 -1
  86. pixeltable/index/embedding_index.py +11 -5
  87. pixeltable/io/external_store.py +11 -12
  88. pixeltable/io/label_studio.py +4 -3
  89. pixeltable/io/parquet.py +57 -56
  90. pixeltable/iterators/__init__.py +4 -2
  91. pixeltable/iterators/audio.py +11 -11
  92. pixeltable/iterators/document.py +10 -10
  93. pixeltable/iterators/string.py +1 -2
  94. pixeltable/iterators/video.py +14 -15
  95. pixeltable/metadata/__init__.py +9 -5
  96. pixeltable/metadata/converters/convert_10.py +0 -1
  97. pixeltable/metadata/converters/convert_15.py +0 -2
  98. pixeltable/metadata/converters/convert_23.py +0 -2
  99. pixeltable/metadata/converters/convert_24.py +3 -3
  100. pixeltable/metadata/converters/convert_25.py +1 -1
  101. pixeltable/metadata/converters/convert_27.py +0 -2
  102. pixeltable/metadata/converters/convert_28.py +0 -2
  103. pixeltable/metadata/converters/convert_29.py +7 -8
  104. pixeltable/metadata/converters/util.py +7 -7
  105. pixeltable/metadata/schema.py +27 -19
  106. pixeltable/plan.py +68 -40
  107. pixeltable/share/__init__.py +2 -0
  108. pixeltable/share/packager.py +15 -12
  109. pixeltable/share/publish.py +3 -5
  110. pixeltable/store.py +37 -38
  111. pixeltable/type_system.py +41 -28
  112. pixeltable/utils/coco.py +4 -4
  113. pixeltable/utils/console_output.py +1 -3
  114. pixeltable/utils/description_helper.py +1 -1
  115. pixeltable/utils/documents.py +3 -3
  116. pixeltable/utils/filecache.py +20 -9
  117. pixeltable/utils/formatter.py +2 -3
  118. pixeltable/utils/media_store.py +1 -1
  119. pixeltable/utils/pytorch.py +1 -1
  120. pixeltable/utils/sql.py +4 -4
  121. pixeltable/utils/transactional_directory.py +2 -1
  122. {pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/METADATA +1 -1
  123. pixeltable-0.3.8.dist-info/RECORD +174 -0
  124. pixeltable-0.3.6.dist-info/RECORD +0 -172
  125. {pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/LICENSE +0 -0
  126. {pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/WHEEL +0 -0
  127. {pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/entry_points.txt +0 -0
pixeltable/dataframe.py CHANGED
@@ -10,13 +10,10 @@ import traceback
10
10
  from pathlib import Path
11
11
  from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Hashable, Iterator, NoReturn, Optional, Sequence, Union
12
12
 
13
- import numpy as np
14
13
  import pandas as pd
15
14
  import sqlalchemy as sql
16
15
 
17
- import pixeltable.exceptions as excs
18
- import pixeltable.type_system as ts
19
- from pixeltable import catalog, exec, exprs, plan
16
+ from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
20
17
  from pixeltable.catalog import is_valid_identifier
21
18
  from pixeltable.catalog.globals import UpdateStatus
22
19
  from pixeltable.env import Env
@@ -80,7 +77,7 @@ class DataFrameResultSet:
80
77
  if isinstance(index, int):
81
78
  return self._row_to_dict(index)
82
79
  if isinstance(index, tuple) and len(index) == 2:
83
- if not isinstance(index[0], int) or not (isinstance(index[1], str) or isinstance(index[1], int)):
80
+ if not isinstance(index[0], int) or not isinstance(index[1], (str, int)):
84
81
  raise excs.Error(f'Bad index, expected [<row idx>, <column name | column index>]: {index}')
85
82
  if isinstance(index[1], str) and index[1] not in self._col_names:
86
83
  raise excs.Error(f'Invalid column name: {index[1]}')
@@ -96,6 +93,9 @@ class DataFrameResultSet:
96
93
  return False
97
94
  return self.to_pandas().equals(other.to_pandas())
98
95
 
96
+ def __hash__(self):
97
+ return hash(self.to_pandas())
98
+
99
99
 
100
100
  # # TODO: remove this; it's only here as a reminder that we still need to call release() in the current implementation
101
101
  # class AnalysisInfo:
@@ -232,9 +232,8 @@ class DataFrame:
232
232
  for var in vars:
233
233
  if var.name not in unique_vars:
234
234
  unique_vars[var.name] = var
235
- else:
236
- if unique_vars[var.name].col_type != var.col_type:
237
- raise excs.Error(f'Multiple definitions of parameter {var.name}')
235
+ elif unique_vars[var.name].col_type != var.col_type:
236
+ raise excs.Error(f'Multiple definitions of parameter {var.name}')
238
237
  return unique_vars
239
238
 
240
239
  def parameters(self) -> dict[str, ColumnType]:
@@ -242,17 +241,15 @@ class DataFrame:
242
241
 
243
242
  Parameters are Variables contained in any component of the DataFrame.
244
243
  """
245
- vars = self._vars()
246
- return {name: var.col_type for name, var in vars.items()}
244
+ return {name: var.col_type for name, var in self._vars().items()}
247
245
 
248
- def _exec(self, conn: Optional[sql.engine.Connection] = None) -> Iterator[exprs.DataRow]:
246
+ def _exec(self) -> Iterator[exprs.DataRow]:
249
247
  """Run the query and return rows as a generator.
250
248
  This function must not modify the state of the DataFrame, otherwise it breaks dataset caching.
251
249
  """
252
250
  plan = self._create_query_plan()
253
251
 
254
- def exec_plan(conn: sql.engine.Connection) -> Iterator[exprs.DataRow]:
255
- plan.ctx.set_conn(conn)
252
+ def exec_plan() -> Iterator[exprs.DataRow]:
256
253
  plan.open()
257
254
  try:
258
255
  for row_batch in plan:
@@ -260,18 +257,13 @@ class DataFrame:
260
257
  finally:
261
258
  plan.close()
262
259
 
263
- if conn is None:
264
- with Env.get().engine.begin() as conn:
265
- yield from exec_plan(conn)
266
- else:
267
- yield from exec_plan(conn)
260
+ yield from exec_plan()
268
261
 
269
- async def _aexec(self, conn: sql.engine.Connection) -> AsyncIterator[exprs.DataRow]:
262
+ async def _aexec(self) -> AsyncIterator[exprs.DataRow]:
270
263
  """Run the query and return rows as a generator.
271
264
  This function must not modify the state of the DataFrame, otherwise it breaks dataset caching.
272
265
  """
273
266
  plan = self._create_query_plan()
274
- plan.ctx.set_conn(conn)
275
267
  plan.open()
276
268
  try:
277
269
  async for row_batch in plan:
@@ -287,7 +279,7 @@ class DataFrame:
287
279
  assert self.group_by_clause is None
288
280
  num_rowid_cols = len(self.grouping_tbl.store_tbl.rowid_columns())
289
281
  # the grouping table must be a base of self.tbl
290
- assert num_rowid_cols <= len(self._first_tbl.tbl_version.store_tbl.rowid_columns())
282
+ assert num_rowid_cols <= len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
291
283
  group_by_clause = [exprs.RowidRef(self._first_tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
292
284
  elif self.group_by_clause is not None:
293
285
  group_by_clause = self.group_by_clause
@@ -327,10 +319,10 @@ class DataFrame:
327
319
  if the DataFrame has an order_by clause.
328
320
  """
329
321
  if self.order_by_clause is not None:
330
- raise excs.Error(f'head() cannot be used with order_by()')
322
+ raise excs.Error('head() cannot be used with order_by()')
331
323
  if self._has_joins():
332
- raise excs.Error(f'head() not supported for joins')
333
- num_rowid_cols = len(self._first_tbl.tbl_version.store_tbl.rowid_columns())
324
+ raise excs.Error('head() not supported for joins')
325
+ num_rowid_cols = len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
334
326
  order_by_clause = [exprs.RowidRef(self._first_tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
335
327
  return self.order_by(*order_by_clause, asc=True).limit(n).collect()
336
328
 
@@ -350,10 +342,10 @@ class DataFrame:
350
342
  if the DataFrame has an order_by clause.
351
343
  """
352
344
  if self.order_by_clause is not None:
353
- raise excs.Error(f'tail() cannot be used with order_by()')
345
+ raise excs.Error('tail() cannot be used with order_by()')
354
346
  if self._has_joins():
355
- raise excs.Error(f'tail() not supported for joins')
356
- num_rowid_cols = len(self._first_tbl.tbl_version.store_tbl.rowid_columns())
347
+ raise excs.Error('tail() not supported for joins')
348
+ num_rowid_cols = len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
357
349
  order_by_clause = [exprs.RowidRef(self._first_tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
358
350
  result = self.order_by(*order_by_clause, asc=False).limit(n).collect()
359
351
  result._reverse()
@@ -418,7 +410,7 @@ class DataFrame:
418
410
  )
419
411
 
420
412
  def _raise_expr_eval_err(self, e: excs.ExprEvalError) -> NoReturn:
421
- msg = f'In row {e.row_num} the {e.expr_msg} encountered exception {type(e.exc).__name__}:\n{str(e.exc)}'
413
+ msg = f'In row {e.row_num} the {e.expr_msg} encountered exception {type(e.exc).__name__}:\n{e.exc}'
422
414
  if len(e.input_vals) > 0:
423
415
  input_msgs = [
424
416
  f"'{d}' = {d.col_type.print_value(e.input_vals[i])}" for i, d in enumerate(e.expr.dependencies())
@@ -434,29 +426,27 @@ class DataFrame:
434
426
  msg += f'\nStack:\n{nl.join(stack_trace[-1:1:-1])}'
435
427
  raise excs.Error(msg) from e
436
428
 
437
- def _output_row_iterator(self, conn: Optional[sql.engine.Connection] = None) -> Iterator[list]:
438
- try:
439
- for data_row in self._exec(conn):
440
- yield [data_row[e.slot_idx] for e in self._select_list_exprs]
441
- except excs.ExprEvalError as e:
442
- self._raise_expr_eval_err(e)
443
- except sql.exc.DBAPIError as e:
444
- raise excs.Error(f'Error during SQL execution:\n{e}')
429
+ def _output_row_iterator(self) -> Iterator[list]:
430
+ with Env.get().begin_xact():
431
+ try:
432
+ for data_row in self._exec():
433
+ yield [data_row[e.slot_idx] for e in self._select_list_exprs]
434
+ except excs.ExprEvalError as e:
435
+ self._raise_expr_eval_err(e)
436
+ except sql.exc.DBAPIError as e:
437
+ raise excs.Error(f'Error during SQL execution:\n{e}') from e
445
438
 
446
439
  def collect(self) -> DataFrameResultSet:
447
- return self._collect()
440
+ return DataFrameResultSet(list(self._output_row_iterator()), self.schema)
448
441
 
449
- def _collect(self, conn: Optional[sql.engine.Connection] = None) -> DataFrameResultSet:
450
- return DataFrameResultSet(list(self._output_row_iterator(conn)), self.schema)
451
-
452
- async def _acollect(self, conn: sql.engine.Connection) -> DataFrameResultSet:
442
+ async def _acollect(self) -> DataFrameResultSet:
453
443
  try:
454
- result = [[row[e.slot_idx] for e in self._select_list_exprs] async for row in self._aexec(conn)]
444
+ result = [[row[e.slot_idx] for e in self._select_list_exprs] async for row in self._aexec()]
455
445
  return DataFrameResultSet(result, self.schema)
456
446
  except excs.ExprEvalError as e:
457
447
  self._raise_expr_eval_err(e)
458
448
  except sql.exc.DBAPIError as e:
459
- raise excs.Error(f'Error during SQL execution:\n{e}')
449
+ raise excs.Error(f'Error during SQL execution:\n{e}') from e
460
450
 
461
451
  def count(self) -> int:
462
452
  """Return the number of rows in the DataFrame.
@@ -467,7 +457,7 @@ class DataFrame:
467
457
  from pixeltable.plan import Planner
468
458
 
469
459
  stmt = Planner.create_count_stmt(self._first_tbl, self.where_clause)
470
- with Env.get().engine.connect() as conn:
460
+ with Env.get().begin_xact() as conn:
471
461
  result: int = conn.execute(stmt).scalar_one()
472
462
  assert isinstance(result, int)
473
463
  return result
@@ -567,7 +557,7 @@ class DataFrame:
567
557
 
568
558
  """
569
559
  if self.select_list is not None:
570
- raise excs.Error(f'Select list already specified')
560
+ raise excs.Error('Select list already specified')
571
561
  for name, _ in named_items.items():
572
562
  if not isinstance(name, str) or not is_valid_identifier(name):
573
563
  raise excs.Error(f'Invalid name: {name}')
@@ -653,7 +643,7 @@ class DataFrame:
653
643
  ) -> exprs.Expr:
654
644
  """Verifies user-specified 'on' argument and converts it into a join predicate."""
655
645
  col_refs: list[exprs.ColumnRef] = []
656
- joined_tbls = self._from_clause.tbls + [other]
646
+ joined_tbls = [*self._from_clause.tbls, other]
657
647
 
658
648
  if isinstance(on, exprs.ColumnRef):
659
649
  on = [on]
@@ -663,14 +653,13 @@ class DataFrame:
663
653
  if not on.col_type.is_bool_type():
664
654
  raise excs.Error(f"'on': boolean expression expected, but got {on.col_type}: {on}")
665
655
  return on
666
- else:
667
- if not isinstance(on, Sequence) or len(on) == 0:
668
- raise excs.Error(f"'on': must be a sequence of column references or a boolean expression")
656
+ elif not isinstance(on, Sequence) or len(on) == 0:
657
+ raise excs.Error("'on': must be a sequence of column references or a boolean expression")
669
658
 
670
659
  assert isinstance(on, Sequence)
671
660
  for col_ref in on:
672
661
  if not isinstance(col_ref, exprs.ColumnRef):
673
- raise excs.Error(f"'on': must be a sequence of column references or a boolean expression")
662
+ raise excs.Error("'on': must be a sequence of column references or a boolean expression")
674
663
  if not col_ref.is_bound_by(joined_tbls):
675
664
  raise excs.Error(f"'on': expression cannot be evaluated in the context of the joined tables: {col_ref}")
676
665
  col_refs.append(col_ref)
@@ -765,7 +754,7 @@ class DataFrame:
765
754
  join_pred: Optional[exprs.Expr]
766
755
  if how == 'cross':
767
756
  if on is not None:
768
- raise excs.Error(f"'on' not allowed for cross join")
757
+ raise excs.Error("'on' not allowed for cross join")
769
758
  join_pred = None
770
759
  else:
771
760
  if on is None:
@@ -828,20 +817,20 @@ class DataFrame:
828
817
  >>> df = book.group_by(t.genre).select(t.genre, total=sum(t.price)).show()
829
818
  """
830
819
  if self.group_by_clause is not None:
831
- raise excs.Error(f'Group-by already specified')
820
+ raise excs.Error('Group-by already specified')
832
821
  grouping_tbl: Optional[catalog.TableVersion] = None
833
822
  group_by_clause: Optional[list[exprs.Expr]] = None
834
823
  for item in grouping_items:
835
824
  if isinstance(item, catalog.Table):
836
825
  if len(grouping_items) > 1:
837
- raise excs.Error(f'group_by(): only one table can be specified')
826
+ raise excs.Error('group_by(): only one table can be specified')
838
827
  if len(self._from_clause.tbls) > 1:
839
- raise excs.Error(f'group_by() with Table not supported for joins')
828
+ raise excs.Error('group_by() with Table not supported for joins')
840
829
  # we need to make sure that the grouping table is a base of self.tbl
841
830
  base = self._first_tbl.find_tbl_version(item._tbl_version_path.tbl_id())
842
831
  if base is None or base.id == self._first_tbl.tbl_id():
843
832
  raise excs.Error(f'group_by(): {item._name} is not a base table of {self._first_tbl.tbl_name()}')
844
- grouping_tbl = item._tbl_version_path.tbl_version
833
+ grouping_tbl = item._tbl_version_path.tbl_version.get()
845
834
  break
846
835
  if not isinstance(item, exprs.Expr):
847
836
  raise excs.Error(f'Invalid expression in group_by(): {item}')
@@ -943,16 +932,19 @@ class DataFrame:
943
932
 
944
933
  >>> person = t.select()
945
934
 
946
- Via the above DataFrame person, update the column 'city' to 'Oakland' and 'state' to 'CA' in the table t:
935
+ Via the above DataFrame person, update the column 'city' to 'Oakland'
936
+ and 'state' to 'CA' in the table t:
947
937
 
948
938
  >>> df = person.update({'city': 'Oakland', 'state': 'CA'})
949
939
 
950
- Via the above DataFrame person, update the column 'age' to 30 for any rows where 'year' is 2014 in the table t:
940
+ Via the above DataFrame person, update the column 'age' to 30 for any
941
+ rows where 'year' is 2014 in the table t:
951
942
 
952
943
  >>> df = person.where(t.year == 2014).update({'age': 30})
953
944
  """
954
945
  self._validate_mutable('update', False)
955
- return self._first_tbl.tbl_version.update(value_spec, where=self.where_clause, cascade=cascade)
946
+ with Env.get().begin_xact():
947
+ return self._first_tbl.tbl_version.get().update(value_spec, where=self.where_clause, cascade=cascade)
956
948
 
957
949
  def delete(self) -> UpdateStatus:
958
950
  """Delete rows form the underlying table of the DataFrame.
@@ -973,8 +965,9 @@ class DataFrame:
973
965
  """
974
966
  self._validate_mutable('delete', False)
975
967
  if not self._first_tbl.is_insertable():
976
- raise excs.Error(f'Cannot delete from view')
977
- return self._first_tbl.tbl_version.delete(where=self.where_clause)
968
+ raise excs.Error('Cannot delete from view')
969
+ with Env.get().begin_xact():
970
+ return self._first_tbl.tbl_version.get().delete(where=self.where_clause)
978
971
 
979
972
  def _validate_mutable(self, op_name: str, allow_select: bool) -> None:
980
973
  """Tests whether this DataFrame can be mutated (such as by an update operation).
@@ -1020,32 +1013,37 @@ class DataFrame:
1020
1013
 
1021
1014
  @classmethod
1022
1015
  def from_dict(cls, d: dict[str, Any]) -> 'DataFrame':
1023
- tbls = [catalog.TableVersionPath.from_dict(tbl_dict) for tbl_dict in d['from_clause']['tbls']]
1024
- join_clauses = [plan.JoinClause(**clause_dict) for clause_dict in d['from_clause']['join_clauses']]
1025
- from_clause = plan.FromClause(tbls=tbls, join_clauses=join_clauses)
1026
- select_list = (
1027
- [(exprs.Expr.from_dict(e), name) for e, name in d['select_list']] if d['select_list'] is not None else None
1028
- )
1029
- where_clause = exprs.Expr.from_dict(d['where_clause']) if d['where_clause'] is not None else None
1030
- group_by_clause = (
1031
- [exprs.Expr.from_dict(e) for e in d['group_by_clause']] if d['group_by_clause'] is not None else None
1032
- )
1033
- grouping_tbl = catalog.TableVersion.from_dict(d['grouping_tbl']) if d['grouping_tbl'] is not None else None
1034
- order_by_clause = (
1035
- [(exprs.Expr.from_dict(e), asc) for e, asc in d['order_by_clause']]
1036
- if d['order_by_clause'] is not None
1037
- else None
1038
- )
1039
- limit_val = exprs.Expr.from_dict(d['limit_val']) if d['limit_val'] is not None else None
1040
- return DataFrame(
1041
- from_clause=from_clause,
1042
- select_list=select_list,
1043
- where_clause=where_clause,
1044
- group_by_clause=group_by_clause,
1045
- grouping_tbl=grouping_tbl,
1046
- order_by_clause=order_by_clause,
1047
- limit=limit_val,
1048
- )
1016
+ # we need to wrap the construction with a transaction, because it might need to load metadata
1017
+ with Env.get().begin_xact():
1018
+ tbls = [catalog.TableVersionPath.from_dict(tbl_dict) for tbl_dict in d['from_clause']['tbls']]
1019
+ join_clauses = [plan.JoinClause(**clause_dict) for clause_dict in d['from_clause']['join_clauses']]
1020
+ from_clause = plan.FromClause(tbls=tbls, join_clauses=join_clauses)
1021
+ select_list = (
1022
+ [(exprs.Expr.from_dict(e), name) for e, name in d['select_list']]
1023
+ if d['select_list'] is not None
1024
+ else None
1025
+ )
1026
+ where_clause = exprs.Expr.from_dict(d['where_clause']) if d['where_clause'] is not None else None
1027
+ group_by_clause = (
1028
+ [exprs.Expr.from_dict(e) for e in d['group_by_clause']] if d['group_by_clause'] is not None else None
1029
+ )
1030
+ grouping_tbl = catalog.TableVersion.from_dict(d['grouping_tbl']) if d['grouping_tbl'] is not None else None
1031
+ order_by_clause = (
1032
+ [(exprs.Expr.from_dict(e), asc) for e, asc in d['order_by_clause']]
1033
+ if d['order_by_clause'] is not None
1034
+ else None
1035
+ )
1036
+ limit_val = exprs.Expr.from_dict(d['limit_val']) if d['limit_val'] is not None else None
1037
+
1038
+ return DataFrame(
1039
+ from_clause=from_clause,
1040
+ select_list=select_list,
1041
+ where_clause=where_clause,
1042
+ group_by_clause=group_by_clause,
1043
+ grouping_tbl=grouping_tbl,
1044
+ order_by_clause=order_by_clause,
1045
+ limit=limit_val,
1046
+ )
1049
1047
 
1050
1048
  def _hash_result_set(self) -> str:
1051
1049
  """Return a hash that changes when the result set changes."""
@@ -1053,7 +1051,7 @@ class DataFrame:
1053
1051
  # add list of referenced table versions (the actual versions, not the effective ones) in order to force cache
1054
1052
  # invalidation when any of the referenced tables changes
1055
1053
  d['tbl_versions'] = [
1056
- tbl_version.version for tbl in self._from_clause.tbls for tbl_version in tbl.get_tbl_versions()
1054
+ tbl_version.get().version for tbl in self._from_clause.tbls for tbl_version in tbl.get_tbl_versions()
1057
1055
  ]
1058
1056
  summary_string = json.dumps(d)
1059
1057
  return hashlib.sha256(summary_string.encode()).hexdigest()
@@ -1086,7 +1084,8 @@ class DataFrame:
1086
1084
  assert data_file_path.is_file()
1087
1085
  return data_file_path
1088
1086
  else:
1089
- return write_coco_dataset(self, dest_path)
1087
+ with Env.get().begin_xact():
1088
+ return write_coco_dataset(self, dest_path)
1090
1089
 
1091
1090
  def to_pytorch_dataset(self, image_format: str = 'pt') -> 'torch.utils.data.IterableDataset':
1092
1091
  """
@@ -1130,6 +1129,7 @@ class DataFrame:
1130
1129
  if dest_path.exists(): # fast path: use cache
1131
1130
  assert dest_path.is_dir()
1132
1131
  else:
1133
- export_parquet(self, dest_path, inline_images=True)
1132
+ with Env.get().begin_xact():
1133
+ export_parquet(self, dest_path, inline_images=True)
1134
1134
 
1135
1135
  return PixeltablePytorchDataset(path=dest_path, image_format=image_format)