pixeltable 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of pixeltable might be problematic.
Files changed (127)
  1. pixeltable/__init__.py +5 -3
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -0
  4. pixeltable/catalog/catalog.py +335 -128
  5. pixeltable/catalog/column.py +22 -5
  6. pixeltable/catalog/dir.py +19 -6
  7. pixeltable/catalog/insertable_table.py +34 -37
  8. pixeltable/catalog/named_function.py +0 -4
  9. pixeltable/catalog/schema_object.py +28 -42
  10. pixeltable/catalog/table.py +193 -158
  11. pixeltable/catalog/table_version.py +191 -232
  12. pixeltable/catalog/table_version_handle.py +50 -0
  13. pixeltable/catalog/table_version_path.py +49 -33
  14. pixeltable/catalog/view.py +56 -96
  15. pixeltable/config.py +103 -0
  16. pixeltable/dataframe.py +89 -89
  17. pixeltable/env.py +98 -168
  18. pixeltable/exec/aggregation_node.py +5 -4
  19. pixeltable/exec/cache_prefetch_node.py +1 -1
  20. pixeltable/exec/component_iteration_node.py +13 -9
  21. pixeltable/exec/data_row_batch.py +3 -3
  22. pixeltable/exec/exec_context.py +0 -4
  23. pixeltable/exec/exec_node.py +3 -2
  24. pixeltable/exec/expr_eval/schedulers.py +2 -1
  25. pixeltable/exec/in_memory_data_node.py +9 -4
  26. pixeltable/exec/row_update_node.py +1 -2
  27. pixeltable/exec/sql_node.py +20 -16
  28. pixeltable/exprs/__init__.py +2 -0
  29. pixeltable/exprs/arithmetic_expr.py +7 -11
  30. pixeltable/exprs/array_slice.py +1 -1
  31. pixeltable/exprs/column_property_ref.py +3 -3
  32. pixeltable/exprs/column_ref.py +12 -13
  33. pixeltable/exprs/comparison.py +3 -6
  34. pixeltable/exprs/compound_predicate.py +4 -4
  35. pixeltable/exprs/expr.py +31 -22
  36. pixeltable/exprs/expr_dict.py +3 -3
  37. pixeltable/exprs/expr_set.py +1 -1
  38. pixeltable/exprs/function_call.py +110 -80
  39. pixeltable/exprs/globals.py +3 -3
  40. pixeltable/exprs/in_predicate.py +1 -1
  41. pixeltable/exprs/inline_expr.py +3 -3
  42. pixeltable/exprs/is_null.py +1 -1
  43. pixeltable/exprs/json_mapper.py +2 -2
  44. pixeltable/exprs/json_path.py +17 -10
  45. pixeltable/exprs/literal.py +1 -1
  46. pixeltable/exprs/method_ref.py +2 -2
  47. pixeltable/exprs/row_builder.py +8 -17
  48. pixeltable/exprs/rowid_ref.py +21 -10
  49. pixeltable/exprs/similarity_expr.py +5 -5
  50. pixeltable/exprs/sql_element_cache.py +1 -1
  51. pixeltable/exprs/type_cast.py +2 -3
  52. pixeltable/exprs/variable.py +2 -2
  53. pixeltable/ext/__init__.py +2 -0
  54. pixeltable/ext/functions/__init__.py +2 -0
  55. pixeltable/ext/functions/yolox.py +3 -3
  56. pixeltable/func/__init__.py +3 -1
  57. pixeltable/func/aggregate_function.py +9 -9
  58. pixeltable/func/callable_function.py +3 -4
  59. pixeltable/func/expr_template_function.py +6 -16
  60. pixeltable/func/function.py +48 -14
  61. pixeltable/func/function_registry.py +1 -3
  62. pixeltable/func/query_template_function.py +5 -12
  63. pixeltable/func/signature.py +23 -22
  64. pixeltable/func/tools.py +3 -3
  65. pixeltable/func/udf.py +6 -4
  66. pixeltable/functions/__init__.py +2 -0
  67. pixeltable/functions/fireworks.py +7 -4
  68. pixeltable/functions/globals.py +4 -5
  69. pixeltable/functions/huggingface.py +1 -5
  70. pixeltable/functions/image.py +17 -7
  71. pixeltable/functions/llama_cpp.py +1 -1
  72. pixeltable/functions/mistralai.py +1 -1
  73. pixeltable/functions/ollama.py +4 -4
  74. pixeltable/functions/openai.py +19 -19
  75. pixeltable/functions/string.py +23 -30
  76. pixeltable/functions/timestamp.py +11 -6
  77. pixeltable/functions/together.py +14 -12
  78. pixeltable/functions/util.py +1 -1
  79. pixeltable/functions/video.py +5 -4
  80. pixeltable/functions/vision.py +6 -9
  81. pixeltable/functions/whisper.py +3 -3
  82. pixeltable/globals.py +246 -260
  83. pixeltable/index/__init__.py +2 -0
  84. pixeltable/index/base.py +1 -1
  85. pixeltable/index/btree.py +3 -1
  86. pixeltable/index/embedding_index.py +11 -5
  87. pixeltable/io/external_store.py +11 -12
  88. pixeltable/io/label_studio.py +4 -3
  89. pixeltable/io/parquet.py +57 -56
  90. pixeltable/iterators/__init__.py +4 -2
  91. pixeltable/iterators/audio.py +11 -11
  92. pixeltable/iterators/document.py +10 -10
  93. pixeltable/iterators/string.py +1 -2
  94. pixeltable/iterators/video.py +14 -15
  95. pixeltable/metadata/__init__.py +9 -5
  96. pixeltable/metadata/converters/convert_10.py +0 -1
  97. pixeltable/metadata/converters/convert_15.py +0 -2
  98. pixeltable/metadata/converters/convert_23.py +0 -2
  99. pixeltable/metadata/converters/convert_24.py +3 -3
  100. pixeltable/metadata/converters/convert_25.py +1 -1
  101. pixeltable/metadata/converters/convert_27.py +0 -2
  102. pixeltable/metadata/converters/convert_28.py +0 -2
  103. pixeltable/metadata/converters/convert_29.py +7 -8
  104. pixeltable/metadata/converters/util.py +7 -7
  105. pixeltable/metadata/schema.py +27 -19
  106. pixeltable/plan.py +68 -40
  107. pixeltable/share/__init__.py +2 -0
  108. pixeltable/share/packager.py +15 -12
  109. pixeltable/share/publish.py +3 -5
  110. pixeltable/store.py +37 -38
  111. pixeltable/type_system.py +41 -28
  112. pixeltable/utils/coco.py +4 -4
  113. pixeltable/utils/console_output.py +1 -3
  114. pixeltable/utils/description_helper.py +1 -1
  115. pixeltable/utils/documents.py +3 -3
  116. pixeltable/utils/filecache.py +20 -9
  117. pixeltable/utils/formatter.py +2 -3
  118. pixeltable/utils/media_store.py +1 -1
  119. pixeltable/utils/pytorch.py +1 -1
  120. pixeltable/utils/sql.py +4 -4
  121. pixeltable/utils/transactional_directory.py +2 -1
  122. {pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/METADATA +1 -1
  123. pixeltable-0.3.8.dist-info/RECORD +174 -0
  124. pixeltable-0.3.6.dist-info/RECORD +0 -172
  125. {pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/LICENSE +0 -0
  126. {pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/WHEEL +0 -0
  127. {pixeltable-0.3.6.dist-info → pixeltable-0.3.8.dist-info}/entry_points.txt +0 -0
@@ -13,7 +13,7 @@ def _(engine: sql.engine.Engine) -> None:
 
 def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
     if k == 'path' and (
-        v in ['pixeltable.functions.huggingface.clip_text', 'pixeltable.functions.huggingface.clip_image']
+        v in {'pixeltable.functions.huggingface.clip_text', 'pixeltable.functions.huggingface.clip_image'}
     ):
         return 'path', 'pixeltable.functions.huggingface.clip'
     return None
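The substitution above rewrites stored references to the retired clip_text/clip_image UDFs; the list-to-set change only tightens the membership test. As a rough, self-contained sketch of how such a hook gets applied over stored metadata (the recursive walker below is a hypothetical stand-in, not Pixeltable's actual __substitute_md_rec):

from typing import Any, Callable, Optional

def substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
    # rewrite stored UDF paths: both retired CLIP entry points now resolve to
    # the merged `clip` function; the set literal makes the fixed set of
    # alternatives explicit and gives O(1) membership tests
    if k == 'path' and v in {
        'pixeltable.functions.huggingface.clip_text',
        'pixeltable.functions.huggingface.clip_image',
    }:
        return 'path', 'pixeltable.functions.huggingface.clip'
    return None  # no substitution for this key/value pair

def substitute_rec(md: Any, fn: Callable) -> Any:
    # apply fn to every key/value pair, recursing into dicts and lists
    if isinstance(md, dict):
        out = {}
        for k, v in md.items():
            subst = fn(k, v)
            if subst is not None:
                k, v = subst
            out[k] = substitute_rec(v, fn)
        return out
    if isinstance(md, list):
        return [substitute_rec(v, fn) for v in md]
    return md

md = {'fn': {'path': 'pixeltable.functions.huggingface.clip_text'}}
assert substitute_rec(md, substitute_md) == {'fn': {'path': 'pixeltable.functions.huggingface.clip'}}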
@@ -1,12 +1,10 @@
 import logging
-from typing import Any, Optional
 from uuid import UUID
 
 import sqlalchemy as sql
 
 from pixeltable.metadata import register_converter
 from pixeltable.metadata.converters.util import convert_table_md
-from pixeltable.metadata.schema import Table
 
 _logger = logging.getLogger('pixeltable')
 
@@ -1,5 +1,3 @@
-import logging
-
 import sqlalchemy as sql
 
 from pixeltable.metadata import register_converter
@@ -63,13 +63,12 @@ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
     # is an edge case that won't migrate properly.
     parameters: list[dict] = v['fn']['signature']['parameters']
     for i, param in enumerate(parameters):
-        if param['kind'] == 'VAR_POSITIONAL':
-            if new_args_len > i:
-                # For peculiar historical reasons, variable kwargs might show up in args. Thus variable
-                # positional args is not necessarily the last element of args; it might be the second-to-last.
-                assert new_args_len <= i + 2, new_args
-                rolled_args = new_args[i]
-                new_args = new_args[:i] + new_args[i + 1 :]
+        if param['kind'] == 'VAR_POSITIONAL' and new_args_len > i:
+            # For peculiar historical reasons, variable kwargs might show up in args. Thus variable
+            # positional args is not necessarily the last element of args; it might be the second-to-last.
+            assert new_args_len <= i + 2, new_args
+            rolled_args = new_args[i]
+            new_args = new_args[:i] + new_args[i + 1 :]
         if param['kind'] == 'VAR_KEYWORD':
             # As noted above, variable kwargs might show up either in args or in kwargs. If it's in args, it
             # is necessarily the last element.
@@ -81,7 +80,7 @@ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
                 rolled_kwargs = kwargs.pop(param['name'])
 
     if rolled_args is not None:
-        assert rolled_args['_classname'] in ('InlineArray', 'InlineList')
+        assert rolled_args['_classname'] in {'InlineArray', 'InlineList'}
        	new_args.extend(rolled_args['components'])
     if rolled_kwargs is not None:
         assert rolled_kwargs['_classname'] == 'InlineDict'
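To make the unrolling above concrete, here is a minimal runnable model of it (the data shapes are hypothetical simplifications of the stored FunctionCall metadata): a VAR_POSITIONAL parameter was historically stored "rolled up" as a single InlineList element inside args, and the converter splices its components back in.

parameters = [
    {'name': 'x', 'kind': 'POSITIONAL_OR_KEYWORD'},
    {'name': 'args', 'kind': 'VAR_POSITIONAL'},
]
new_args = [
    {'_classname': 'Literal', 'val': 1},                     # the argument bound to x
    {'_classname': 'InlineList', 'components': ['a', 'b']},  # *args, rolled into one element
]

rolled_args = None
new_args_len = len(new_args)
for i, param in enumerate(parameters):
    if param['kind'] == 'VAR_POSITIONAL' and new_args_len > i:
        assert new_args_len <= i + 2, new_args  # rolled element is last or second-to-last
        rolled_args = new_args[i]
        new_args = new_args[:i] + new_args[i + 1 :]

if rolled_args is not None:
    assert rolled_args['_classname'] in {'InlineArray', 'InlineList'}
    new_args.extend(rolled_args['components'])  # splice the components back in

assert [a if isinstance(a, str) else a['_classname'] for a in new_args] == ['Literal', 'a', 'b']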
pixeltable/metadata/converters/util.py CHANGED
@@ -34,12 +34,12 @@ def convert_table_md(
     """
     with engine.begin() as conn:
         for row in conn.execute(sql.select(Table)):
-            id = row[0]
+            tbl_id = row[0]
             table_md = row[2]
             assert isinstance(table_md, dict)
             updated_table_md = copy.deepcopy(table_md)
             if table_md_updater is not None:
-                table_md_updater(updated_table_md, id)
+                table_md_updater(updated_table_md, tbl_id)
             if column_md_updater is not None:
                 __update_column_md(updated_table_md, column_md_updater)
             if external_store_md_updater is not None:
@@ -47,19 +47,19 @@ def convert_table_md(
             if substitution_fn is not None:
                 updated_table_md = __substitute_md_rec(updated_table_md, substitution_fn)
             if updated_table_md != table_md:
-                __logger.info(f'Updating schema for table: {id}')
-                conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_table_md))
+                __logger.info(f'Updating schema for table: {tbl_id}')
+                conn.execute(sql.update(Table).where(Table.id == tbl_id).values(md=updated_table_md))
 
         for row in conn.execute(sql.select(Function)):
-            id = row[0]
+            fn_id = row[0]
             function_md = row[2]
             assert isinstance(function_md, dict)
             updated_function_md = copy.deepcopy(function_md)
             if substitution_fn is not None:
                 updated_function_md = __substitute_md_rec(updated_function_md, substitution_fn)
             if updated_function_md != function_md:
-                __logger.info(f'Updating function: {id}')
-                conn.execute(sql.update(Function).where(Function.id == id).values(md=updated_function_md))
+                __logger.info(f'Updating function: {fn_id}')
+                conn.execute(sql.update(Function).where(Function.id == fn_id).values(md=updated_function_md))
 
 
 def __update_column_md(table_md: dict, column_md_updater: Callable[[dict], None]) -> None:
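The renames in convert_table_md (id to tbl_id/fn_id) are not only cosmetic: binding id shadows the builtin for the rest of the scope. A short demonstration:

row = ('11111111-2222-3333-4444-555555555555', None, {})
id = row[0]       # shadows builtins.id from here on in this scope
try:
    id(object())  # TypeError: 'str' object is not callable
except TypeError as exc:
    print(exc)
tbl_id = row[0]   # same value, no shadowing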
pixeltable/metadata/schema.py CHANGED
@@ -4,16 +4,14 @@ import uuid
 from typing import Any, Optional, TypeVar, Union, get_type_hints
 
 import sqlalchemy as sql
-import sqlalchemy.orm as orm
-from sqlalchemy import BigInteger, ForeignKey, Integer, LargeBinary
+from sqlalchemy import BigInteger, ForeignKey, Integer, LargeBinary, orm
 from sqlalchemy.dialects.postgresql import JSONB, UUID
-from sqlalchemy.orm import declarative_base
 from sqlalchemy.orm.decl_api import DeclarativeMeta
 
 # Base has to be marked explicitly as a type, in order to be used elsewhere as a type hint. But in addition to being
 # a type, it's also a `DeclarativeMeta`. The following pattern enables us to expose both `Base` and `Base.metadata`
 # outside of the module in a typesafe way.
-Base: type = declarative_base()
+Base: type = orm.declarative_base()
 assert isinstance(Base, DeclarativeMeta)
 base_metadata = Base.metadata
 
@@ -23,7 +21,7 @@ T = TypeVar('T')
 def md_from_dict(data_class_type: type[T], data: Any) -> T:
     """Re-instantiate a dataclass instance that contains nested dataclasses from a dict."""
     if dataclasses.is_dataclass(data_class_type):
-        fieldtypes = {f: t for f, t in get_type_hints(data_class_type).items()}
+        fieldtypes = get_type_hints(data_class_type)
         return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})  # type: ignore[return-value]
 
     origin = typing.get_origin(data_class_type)
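For context on the fieldtypes simplification: get_type_hints() already returns a dict, so the comprehension around it was a no-op. A self-contained model of the rehydration pattern (a stripped-down sketch; the real md_from_dict also handles Optional, Union, and tuple types):

import dataclasses
import typing
from typing import get_type_hints

@dataclasses.dataclass
class ColumnMd:
    name: str
    col_type: str

@dataclasses.dataclass
class TableMd:
    name: str
    columns: list[ColumnMd]

def md_from_dict(data_class_type, data):
    # simplified rehydrator: recurse into nested dataclasses and lists
    if dataclasses.is_dataclass(data_class_type):
        fieldtypes = get_type_hints(data_class_type)  # already a dict
        return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})
    if typing.get_origin(data_class_type) is list:
        (elem_type,) = typing.get_args(data_class_type)
        return [md_from_dict(elem_type, v) for v in data]
    return data

md = {'name': 't', 'columns': [{'name': 'c1', 'col_type': 'int'}]}
tbl = md_from_dict(TableMd, md)
assert tbl.columns[0] == ColumnMd(name='c1', col_type='int')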
@@ -43,7 +41,7 @@ def md_from_dict(data_class_type: type[T], data: Any) -> T:
         elif origin is tuple:
             return tuple(md_from_dict(arg_type, elem) for arg_type, elem in zip(type_args, data))  # type: ignore[return-value]
         else:
-            assert False
+            raise AssertionError(origin)
     else:
         return data
 
@@ -85,7 +83,7 @@ class Dir(Base):
         UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False
     )
     parent_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
-    md = sql.Column(JSONB, nullable=False)
+    md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # DirMd
 
 
 @dataclasses.dataclass
@@ -131,13 +129,17 @@ class IndexMd:
     init_args: dict[str, Any]
 
 
+# a stored table version path is a list of (table id as str, effective table version)
+TableVersionPath = list[tuple[str, Optional[int]]]
+
+
 @dataclasses.dataclass
 class ViewMd:
     is_snapshot: bool
     include_base_columns: bool
 
     # (table id, version); for mutable views, all versions are None
-    base_versions: list[tuple[str, Optional[int]]]
+    base_versions: TableVersionPath
 
     # filter predicate applied to the base table; view-only
     predicate: Optional[dict[str, Any]]
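Illustrative values for the new alias (the base-table id is hypothetical): a snapshot pins each base to a concrete version, while a mutable view stores None for every version.

import uuid
from typing import Optional

# the new alias from the diff: (table id as str, effective table version)
TableVersionPath = list[tuple[str, Optional[int]]]

base_id = str(uuid.uuid4())  # hypothetical base-table id
snapshot_bases: TableVersionPath = [(base_id, 3)]         # snapshot pinned to version 3
mutable_view_bases: TableVersionPath = [(base_id, None)]  # mutable view: all versions None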
@@ -192,7 +194,7 @@ class Table(Base):
 
     id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), primary_key=True, nullable=False)
     dir_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=False)
-    md = sql.Column(JSONB, nullable=False)  # TableMd
+    md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # TableMd
 
 
 @dataclasses.dataclass
@@ -205,9 +207,11 @@ class TableVersionMd:
 
 class TableVersion(Base):
     __tablename__ = 'tableversions'
-    tbl_id = sql.Column(UUID(as_uuid=True), ForeignKey('tables.id'), primary_key=True, nullable=False)
-    version = sql.Column(BigInteger, primary_key=True, nullable=False)
-    md = sql.Column(JSONB, nullable=False)  # TableVersionMd
+    tbl_id: orm.Mapped[uuid.UUID] = orm.mapped_column(
+        UUID(as_uuid=True), ForeignKey('tables.id'), primary_key=True, nullable=False
+    )
+    version: orm.Mapped[int] = orm.mapped_column(BigInteger, primary_key=True, nullable=False)
+    md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)
 
 
 @dataclasses.dataclass
@@ -246,9 +250,11 @@ class TableSchemaVersionMd:
 
 class TableSchemaVersion(Base):
     __tablename__ = 'tableschemaversions'
 
-    tbl_id = sql.Column(UUID(as_uuid=True), ForeignKey('tables.id'), primary_key=True, nullable=False)
-    schema_version = sql.Column(BigInteger, primary_key=True, nullable=False)
-    md = sql.Column(JSONB, nullable=False)  # TableSchemaVersionMd
+    tbl_id: orm.Mapped[uuid.UUID] = orm.mapped_column(
+        UUID(as_uuid=True), ForeignKey('tables.id'), primary_key=True, nullable=False
+    )
+    schema_version: orm.Mapped[int] = orm.mapped_column(BigInteger, primary_key=True, nullable=False)
+    md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # TableSchemaVersionMd
 
 
 @dataclasses.dataclass
@@ -271,7 +277,9 @@ class Function(Base):
 
     __tablename__ = 'functions'
 
-    id = sql.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False)
-    dir_id = sql.Column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
-    md = sql.Column(JSONB, nullable=False)  # FunctionMd
-    binary_obj = sql.Column(LargeBinary, nullable=True)
+    id: orm.Mapped[uuid.UUID] = orm.mapped_column(
+        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False
+    )
+    dir_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
+    md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # FunctionMd
+    binary_obj: orm.Mapped[Optional[bytes]] = orm.mapped_column(LargeBinary, nullable=True)
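All of the schema.py changes above follow the same migration: legacy untyped sql.Column attributes become SQLAlchemy 2.0-style orm.Mapped[...] annotations backed by orm.mapped_column(...), so static type checkers see accurate attribute types. A minimal runnable sketch of the pattern (using SQLite and the generic JSON type instead of Postgres JSONB, purely to keep it self-contained):

import uuid

from sqlalchemy import JSON, BigInteger, create_engine, orm

Base = orm.declarative_base()

class TableVersionDemo(Base):
    __tablename__ = 'tableversions_demo'
    # the Mapped[...] annotation types the attribute for static checkers;
    # mapped_column() carries the SQL-level details
    tbl_id: orm.Mapped[str] = orm.mapped_column(primary_key=True)
    version: orm.Mapped[int] = orm.mapped_column(BigInteger, primary_key=True)
    md: orm.Mapped[dict] = orm.mapped_column(JSON, nullable=False)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
with orm.Session(engine) as session:
    session.add(TableVersionDemo(tbl_id=str(uuid.uuid4()), version=0, md={'created_at': 0}))
    session.commit()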
pixeltable/plan.py CHANGED
@@ -2,14 +2,15 @@ from __future__ import annotations
 
 import dataclasses
 import enum
+from textwrap import dedent
 from typing import Any, Iterable, Literal, Optional, Sequence
 from uuid import UUID
 
 import sqlalchemy as sql
 
 import pixeltable as pxt
-import pixeltable.exec as exec
-from pixeltable import catalog, exceptions as excs, exprs
+from pixeltable import catalog, exceptions as excs, exec, exprs
+from pixeltable.catalog import Column, TableVersionHandle
 from pixeltable.exec.sql_node import OrderByClause, OrderByItem, combine_order_by_clauses, print_order_by_clause
 
 
@@ -54,9 +55,9 @@ class JoinType(enum.Enum):
     def validated(cls, name: str, error_prefix: str) -> JoinType:
         try:
             return cls[name.upper()]
-        except KeyError:
-            val_strs = ', '.join(f'{s.lower()!r}' for s in cls.__members__.keys())
-            raise excs.Error(f'{error_prefix} must be one of: [{val_strs}]')
+        except KeyError as exc:
+            val_strs = ', '.join(f'{s.lower()!r}' for s in cls.__members__)
+            raise excs.Error(f'{error_prefix} must be one of: [{val_strs}]') from exc
 
 
 @dataclasses.dataclass
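The from exc added to JoinType.validated chains the original KeyError as __cause__, so tracebacks show the root cause rather than "During handling of the above exception, another exception occurred". A minimal model of the pattern:

class Error(Exception):
    pass

def validated(name: str, members: dict) -> str:
    try:
        return members[name.upper()]
    except KeyError as exc:
        # `from exc` records the KeyError as the new error's __cause__
        raise Error(f'join type must be one of: {sorted(members)}') from exc

try:
    validated('sideways', {'INNER': 'inner', 'LEFT': 'left'})
except Error as e:
    assert isinstance(e.__cause__, KeyError)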
@@ -177,19 +178,21 @@ class Analyzer:
         )
 
         # check that Where clause and filter doesn't contain aggregates
-        if self.sql_where_clause is not None:
-            if any(_is_agg_fn_call(e) for e in self.sql_where_clause.subexprs(expr_class=exprs.FunctionCall)):
-                raise excs.Error(f'where() cannot contain aggregate functions: {self.sql_where_clause}')
-        if self.filter is not None:
-            if any(_is_agg_fn_call(e) for e in self.filter.subexprs(expr_class=exprs.FunctionCall)):
-                raise excs.Error(f'where() cannot contain aggregate functions: {self.filter}')
+        if self.sql_where_clause is not None and any(
+            _is_agg_fn_call(e) for e in self.sql_where_clause.subexprs(expr_class=exprs.FunctionCall)
+        ):
+            raise excs.Error(f'where() cannot contain aggregate functions: {self.sql_where_clause}')
+        if self.filter is not None and any(
+            _is_agg_fn_call(e) for e in self.filter.subexprs(expr_class=exprs.FunctionCall)
+        ):
+            raise excs.Error(f'where() cannot contain aggregate functions: {self.filter}')
 
         # check that grouping exprs don't contain aggregates and can be expressed as SQL (we perform sort-based
         # aggregation and rely on the SqlScanNode returning data in the correct order)
         for e in self.group_by_clause:
             if not self.sql_elements.contains(e):
                 raise excs.Error(f'Invalid grouping expression, needs to be expressible in SQL: {e}')
-            if e._contains(filter=lambda e: _is_agg_fn_call(e)):
+            if e._contains(filter=_is_agg_fn_call):
                 raise excs.Error(f'Grouping expression contains aggregate function: {e}')
 
     def _determine_agg_status(self, e: exprs.Expr, grouping_expr_ids: set[int]) -> tuple[bool, bool]:
@@ -207,7 +210,7 @@
             return True, False
         elif isinstance(e, exprs.Literal):
             return True, True
-        elif isinstance(e, exprs.ColumnRef) or isinstance(e, exprs.RowidRef):
+        elif isinstance(e, (exprs.ColumnRef, exprs.RowidRef)):
             # we already know that this isn't a grouping expr
             return False, True
         else:
@@ -275,14 +278,19 @@
         cls, tbl: catalog.TableVersion, rows: list[dict[str, Any]], ignore_errors: bool
     ) -> exec.ExecNode:
         """Creates a plan for TableVersion.insert()"""
-        assert not tbl.is_view()
+        assert not tbl.is_view
         # stored_cols: all cols we need to store, incl computed cols (and indices)
         stored_cols = [c for c in tbl.cols_by_id.values() if c.is_stored]
         assert len(stored_cols) > 0  # there needs to be something to store
+
+        cls.__check_valid_columns(tbl, stored_cols, 'inserted into')
+
         row_builder = exprs.RowBuilder([], stored_cols, [])
 
         # create InMemoryDataNode for 'rows'
-        plan: exec.ExecNode = exec.InMemoryDataNode(tbl, rows, row_builder, tbl.next_rowid)
+        plan: exec.ExecNode = exec.InMemoryDataNode(
+            TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_rowid
+        )
 
         media_input_col_info = [
             exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)
@@ -318,7 +326,7 @@
     def create_df_insert_plan(
         cls, tbl: catalog.TableVersion, df: 'pxt.DataFrame', ignore_errors: bool
     ) -> exec.ExecNode:
-        assert not tbl.is_view()
+        assert not tbl.is_view
         plan = df._create_query_plan()  # ExecNode constructed by the DataFrame
 
         # Modify the plan RowBuilder to register the output columns
@@ -363,7 +371,7 @@
         """
         # retrieve all stored cols and all target exprs
         assert isinstance(tbl, catalog.TableVersionPath)
-        target = tbl.tbl_version  # the one we need to update
+        target = tbl.tbl_version.get()  # the one we need to update
         updated_cols = list(update_targets.keys())
         if len(recompute_targets) > 0:
             recomputed_cols = set(recompute_targets)
@@ -374,11 +382,14 @@
             recomputed_cols.update(idx_val_cols)
         # we only need to recompute stored columns (unstored ones are substituted away)
         recomputed_cols = {c for c in recomputed_cols if c.is_stored}
-        recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
+
+        cls.__check_valid_columns(tbl.tbl_version.get(), recomputed_cols, 'updated in')
+
+        recomputed_base_cols = {col for col in recomputed_cols if col.tbl == tbl.tbl_version}
         copied_cols = [
             col
             for col in target.cols_by_id.values()
-            if col.is_stored and not col in updated_cols and not col in recomputed_base_cols
+            if col.is_stored and col not in updated_cols and col not in recomputed_base_cols
         ]
         select_list: list[exprs.Expr] = [exprs.ColumnRef(col) for col in copied_cols]
         select_list.extend(update_targets.values())
@@ -398,7 +409,25 @@
         for i, col in enumerate(all_base_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
-        return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
+        return plan, [f'{c.tbl.get().name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
+
+    @classmethod
+    def __check_valid_columns(
+        cls, tbl: catalog.TableVersion, cols: Iterable[Column], op_name: Literal['inserted into', 'updated in']
+    ) -> None:
+        for col in cols:
+            if col.value_expr is not None and not col.value_expr.is_valid:
+                raise excs.Error(
+                    dedent(
+                        f"""
+                        Data cannot be {op_name} the table {tbl.name!r},
+                        because the column {col.name!r} is currently invalid:
+                        {{validation_error}}
+                        """
+                    )
+                    .strip()
+                    .format(validation_error=col.value_expr.validation_error)
+                )
 
     @classmethod
     def create_batch_update_plan(
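The error-message construction in the new __check_valid_columns is worth unpacking: the doubled braces escape the placeholder inside the f-string, dedent() strips the template's indentation, and .format() injects the (possibly multi-line) validation error afterwards, so it is not subject to the dedent. A standalone rendering of the same pattern, with hypothetical values:

from textwrap import dedent

op_name, table_name, col_name = 'inserted into', 'films', 'summary'
validation_error = 'function not found:\nmy_udfs.summarize'  # hypothetical, multi-line

msg = (
    dedent(
        f"""
        Data cannot be {op_name} the table {table_name!r},
        because the column {col_name!r} is currently invalid:
        {{validation_error}}
        """
    )
    .strip()
    .format(validation_error=validation_error)
)
print(msg)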
@@ -417,7 +446,7 @@
         - list of user-visible columns that are being recomputed
         """
         assert isinstance(tbl, catalog.TableVersionPath)
-        target = tbl.tbl_version  # the one we need to update
+        target = tbl.tbl_version.get()  # the one we need to update
         sa_key_cols: list[sql.Column] = []
         key_vals: list[tuple] = []
         if len(rowids) > 0:
@@ -440,7 +469,7 @@
         copied_cols = [
             col
             for col in target.cols_by_id.values()
-            if col.is_stored and not col in updated_cols and not col in recomputed_base_cols
+            if col.is_stored and col not in updated_cols and col not in recomputed_base_cols
         ]
         select_list: list[exprs.Expr] = [exprs.ColumnRef(col) for col in copied_cols]
         select_list.extend(exprs.ColumnRef(col) for col in updated_cols)
@@ -507,11 +536,11 @@
         - list of columns that are being recomputed
         """
         assert isinstance(view, catalog.TableVersionPath)
-        assert view.is_view()
-        target = view.tbl_version  # the one we need to update
+        assert view.is_view
+        target = view.tbl_version.get()  # the one we need to update
         # retrieve all stored cols and all target exprs
         recomputed_cols = set(recompute_targets.copy())
-        copied_cols = [col for col in target.cols_by_id.values() if col.is_stored and not col in recomputed_cols]
+        copied_cols = [col for col in target.cols_by_id.values() if col.is_stored and col not in recomputed_cols]
         select_list: list[exprs.Expr] = [exprs.ColumnRef(col) for col in copied_cols]
         # resolve recomputed exprs to stored columns in the base
         recomputed_exprs = [
@@ -551,13 +580,13 @@
         - number of materialized values per row
         """
         assert isinstance(view, catalog.TableVersionPath)
-        assert view.is_view()
+        assert view.is_view
         # things we need to materialize as DataRows:
         # 1. stored computed cols
         #    - iterator columns are effectively computed, just not with a value_expr
         #    - we can ignore stored non-computed columns because they have a default value that is supplied directly by
         #      the store
-        target = view.tbl_version  # the one we need to populate
+        target = view.tbl_version.get()  # the one we need to populate
         stored_cols = [c for c in target.cols_by_id.values() if c.is_stored]
         # 2. for component views: iterator args
         iterator_args = [target.iterator_args] if target.iterator_args is not None else []
@@ -585,8 +614,8 @@
             exact_version_only=view.get_bases() if propagates_insert else [],
         )
         exec_ctx = plan.ctx
-        if target.is_component_view():
-            plan = exec.ComponentIterationNode(target, plan)
+        if target.is_component_view:
+            plan = exec.ComponentIterationNode(view.tbl_version, plan)
         if len(view_output_exprs) > 0:
             plan = exec.ExprEvalNode(
                 row_builder, output_exprs=view_output_exprs, input_exprs=base_output_exprs, input=plan
@@ -639,11 +668,12 @@
     @classmethod
     def _is_contained_in(cls, l1: Iterable[exprs.Expr], l2: Iterable[exprs.Expr]) -> bool:
         """Returns True if l1 is contained in l2"""
-        s1, s2 = set(e.id for e in l1), set(e.id for e in l2)
-        return s1 <= s2
+        return {e.id for e in l1} <= {e.id for e in l2}
 
     @classmethod
-    def _insert_prefetch_node(cls, tbl_id: UUID, row_builder: exprs.RowBuilder, input: exec.ExecNode) -> exec.ExecNode:
+    def _insert_prefetch_node(
+        cls, tbl_id: UUID, row_builder: exprs.RowBuilder, input_node: exec.ExecNode
+    ) -> exec.ExecNode:
         """Returns a CachePrefetchNode into the plan if needed, otherwise returns input"""
         # we prefetch external files for all media ColumnRefs, even those that aren't part of the dependencies
         # of output_exprs: if unstored iterator columns are present, we might need to materialize ColumnRefs that
@@ -652,10 +682,10 @@
             e for e in list(row_builder.unique_exprs) if isinstance(e, exprs.ColumnRef) and e.col_type.is_media_type()
         ]
         if len(media_col_refs) == 0:
-            return input
+            return input_node
         # we need to prefetch external files for media column types
         file_col_info = [exprs.ColumnSlotIdx(e.col, e.slot_idx) for e in media_col_refs]
-        prefetch_node = exec.CachePrefetchNode(tbl_id, file_col_info, input)
+        prefetch_node = exec.CachePrefetchNode(tbl_id, file_col_info, input_node)
         return prefetch_node
 
     @classmethod
@@ -668,7 +698,7 @@
         order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
         limit: Optional[exprs.Expr] = None,
         ignore_errors: bool = False,
-        exact_version_only: Optional[list[catalog.TableVersion]] = None,
+        exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
     ) -> exec.ExecNode:
         """Return plan for executing a query.
         Updates 'select_list' in place to make it executable.
@@ -714,7 +744,7 @@
         eval_ctx: exprs.RowBuilder.EvalCtx,
         limit: Optional[exprs.Expr] = None,
         with_pk: bool = False,
-        exact_version_only: Optional[list[catalog.TableVersion]] = None,
+        exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
     ) -> exec.ExecNode:
         """
         Create plan to materialize eval_ctx.
@@ -752,13 +782,11 @@
         )
         if analyzer.filter is not None:
             candidates.extend(
-                exprs.Expr.subexprs(analyzer.filter, filter=lambda e: sql_elements.contains(e), traverse_matches=False)
+                exprs.Expr.subexprs(analyzer.filter, filter=sql_elements.contains, traverse_matches=False)
             )
         if is_python_agg and analyzer.group_by_clause is not None:
             candidates.extend(
-                exprs.Expr.list_subexprs(
-                    analyzer.group_by_clause, filter=lambda e: sql_elements.contains(e), traverse_matches=False
-                )
+                exprs.Expr.list_subexprs(analyzer.group_by_clause, filter=sql_elements.contains, traverse_matches=False)
             )
         # not isinstance(...): we don't want to materialize Literals via a Select
         sql_exprs = exprs.ExprSet(e for e in candidates if not isinstance(e, exprs.Literal))
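A recurring cleanup in this file replaces lambda e: f(e) wrappers with the callable itself (filter=sql_elements.contains, filter=_is_agg_fn_call); the two are equivalent, but the direct reference avoids an extra call frame per element and reads more clearly. A small self-contained check, with a stand-in cache class:

class Cache:
    def __init__(self, items):
        self._items = set(items)

    def contains(self, x) -> bool:
        return x in self._items

cache = Cache({1, 2, 3})
candidates = [0, 1, 2, 5]
# the bound method and the lambda wrapper select the same elements
assert list(filter(cache.contains, candidates)) == [1, 2]
assert list(filter(lambda e: cache.contains(e), candidates)) == [1, 2]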
pixeltable/share/__init__.py CHANGED
@@ -1 +1,3 @@
+# ruff: noqa: F401
+
 from .publish import publish_snapshot
pixeltable/share/packager.py CHANGED
@@ -66,13 +66,15 @@ class TablePackager:
             'tables': [
                 {
                     'table_id': str(t._tbl_version.id),
-                    # These are temporary; will replace with a better solution once the concurrency changes to catalog have
-                    # been merged
-                    'table_md': dataclasses.asdict(t._tbl_version._create_tbl_md()),
+                    # These are temporary; will replace with a better solution once the concurrency
+                    # changes to catalog have been merged
+                    'table_md': dataclasses.asdict(t._tbl_version.get()._create_tbl_md()),
                     'table_version_md': dataclasses.asdict(
-                        t._tbl_version._create_version_md(datetime.now().timestamp())
+                        t._tbl_version.get()._create_version_md(datetime.now().timestamp())
+                    ),
+                    'table_schema_version_md': dataclasses.asdict(
+                        t._tbl_version.get()._create_schema_version_md(0)
                     ),
-                    'table_schema_version_md': dataclasses.asdict(t._tbl_version._create_schema_version_md(0)),
                 }
                 for t in (table, *table._bases)
             ]
@@ -91,11 +93,12 @@ class TablePackager:
         with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
             json.dump(self.md, fp)
         self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
-        ancestors = (self.table, *self.table._bases)
-        for t in ancestors:
-            _logger.info(f"Exporting table '{t._path}'.")
-            self.__export_table(t)
-        _logger.info(f'Building archive.')
+        with Env.get().begin_xact():
+            ancestors = (self.table, *self.table._bases)
+            for t in ancestors:
+                _logger.info(f"Exporting table '{t._path}'.")
+                self.__export_table(t)
+        _logger.info('Building archive.')
         bundle_path = self.__build_tarball()
         _logger.info(f'Packaging complete: {bundle_path}')
         return bundle_path
@@ -117,7 +120,7 @@
         # to get the column types, since we'll be substituting `fileurl`s for media columns.
         actual_col_types: list[ts.ColumnType] = []
 
-        for col_name, col in t._tbl_version.cols_by_name.items():
+        for col_name, col in t._tbl_version.get().cols_by_name.items():
             if not col.is_stored:
                 continue
             if col.col_type.is_media_type():
@@ -150,7 +153,7 @@
         """
         Iceberg tables must have a namespace, which cannot be the empty string, so we prepend `pxt` to the table path.
         """
-        parent_path = table._parent._path
+        parent_path = table._parent()._path()
         if len(parent_path) == 0:
             return 'pxt'
         else:
pixeltable/share/publish.py CHANGED
@@ -1,16 +1,14 @@
-import dataclasses
 import os
 import sys
 import urllib.parse
 import urllib.request
-from datetime import datetime
 from pathlib import Path
 
 import requests
 from tqdm import tqdm
 
 import pixeltable as pxt
-from pixeltable import exceptions as excs, metadata
+from pixeltable import exceptions as excs
 from pixeltable.env import Env
 from pixeltable.utils import sha256sum
 
@@ -46,7 +44,7 @@ def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     else:
         raise excs.Error(f'Unsupported destination: {destination_uri}')
 
-    Env.get().console_logger.info(f'Finalizing snapshot ...')
+    Env.get().console_logger.info('Finalizing snapshot ...')
 
     finalize_request_json = {
         'upload_id': upload_id,
@@ -83,7 +81,7 @@ def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult
     upload_args = {'ChecksumAlgorithm': 'SHA256'}
 
     progress_bar = tqdm(
-        desc=f'Uploading',
+        desc='Uploading',
         total=bundle.stat().st_size,
        	unit='B',
        	unit_scale=True,
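The last two hunks fix Ruff F541 findings: f-strings with no placeholders become plain strings. For reference, a minimal runnable version of the same tqdm progress-bar setup, with a hypothetical byte count standing in for the bundle size:

from tqdm import tqdm

total = 1_000_000  # hypothetical bundle size in bytes
progress_bar = tqdm(
    desc='Uploading',  # plain string: no placeholders, so no f-prefix needed
    total=total,
    unit='B',
    unit_scale=True,
)
for offset in range(0, total, 64_000):
    progress_bar.update(min(64_000, total - offset))
progress_bar.close()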