pixeltable 0.3.11__py3-none-any.whl → 0.3.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53)
  1. pixeltable/__init__.py +2 -27
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +309 -59
  4. pixeltable/catalog/globals.py +5 -5
  5. pixeltable/catalog/insertable_table.py +13 -1
  6. pixeltable/catalog/path.py +13 -6
  7. pixeltable/catalog/table.py +28 -41
  8. pixeltable/catalog/table_version.py +100 -72
  9. pixeltable/catalog/view.py +35 -9
  10. pixeltable/dataframe.py +2 -2
  11. pixeltable/exceptions.py +20 -2
  12. pixeltable/exec/expr_eval/evaluators.py +0 -4
  13. pixeltable/exec/expr_eval/expr_eval_node.py +0 -1
  14. pixeltable/exec/sql_node.py +3 -3
  15. pixeltable/exprs/json_path.py +1 -5
  16. pixeltable/func/__init__.py +1 -1
  17. pixeltable/func/aggregate_function.py +1 -1
  18. pixeltable/func/callable_function.py +1 -1
  19. pixeltable/func/expr_template_function.py +2 -2
  20. pixeltable/func/function.py +3 -4
  21. pixeltable/func/query_template_function.py +87 -4
  22. pixeltable/func/tools.py +1 -1
  23. pixeltable/func/udf.py +1 -1
  24. pixeltable/functions/__init__.py +1 -0
  25. pixeltable/functions/anthropic.py +1 -1
  26. pixeltable/functions/bedrock.py +130 -0
  27. pixeltable/functions/huggingface.py +7 -6
  28. pixeltable/functions/image.py +15 -16
  29. pixeltable/functions/mistralai.py +3 -2
  30. pixeltable/functions/openai.py +9 -8
  31. pixeltable/functions/together.py +4 -3
  32. pixeltable/globals.py +7 -2
  33. pixeltable/io/datarows.py +4 -3
  34. pixeltable/io/label_studio.py +17 -17
  35. pixeltable/io/pandas.py +13 -12
  36. pixeltable/io/table_data_conduit.py +8 -2
  37. pixeltable/metadata/__init__.py +1 -1
  38. pixeltable/metadata/converters/convert_19.py +2 -2
  39. pixeltable/metadata/converters/convert_31.py +11 -0
  40. pixeltable/metadata/converters/convert_32.py +15 -0
  41. pixeltable/metadata/converters/convert_33.py +17 -0
  42. pixeltable/metadata/notes.py +3 -0
  43. pixeltable/metadata/schema.py +26 -1
  44. pixeltable/plan.py +2 -3
  45. pixeltable/share/packager.py +9 -25
  46. pixeltable/share/publish.py +20 -9
  47. pixeltable/store.py +7 -4
  48. pixeltable/utils/exception_handler.py +59 -0
  49. {pixeltable-0.3.11.dist-info → pixeltable-0.3.13.dist-info}/METADATA +1 -1
  50. {pixeltable-0.3.11.dist-info → pixeltable-0.3.13.dist-info}/RECORD +53 -48
  51. {pixeltable-0.3.11.dist-info → pixeltable-0.3.13.dist-info}/WHEEL +1 -1
  52. {pixeltable-0.3.11.dist-info → pixeltable-0.3.13.dist-info}/LICENSE +0 -0
  53. {pixeltable-0.3.11.dist-info → pixeltable-0.3.13.dist-info}/entry_points.txt +0 -0
pixeltable/globals.py CHANGED
@@ -616,9 +616,14 @@ def _extract_paths(
         matches = [name for name, entry in dir_entries.items() if entry.dir is not None]
     else:
         matches = [name for name, entry in dir_entries.items() if entry.table is not None]
+
+    # Filter out system paths
+    matches = [name for name in matches if catalog.is_valid_identifier(name)]
     result = [parent.append(name) for name in matches]
-    for name, entry in [(name, entry) for name, entry in dir_entries.items() if len(entry.dir_entries) > 0]:
-        result.extend(_extract_paths(entry.dir_entries, parent=parent.append(name), entry_type=entry_type))
+
+    for name, entry in dir_entries.items():
+        if len(entry.dir_entries) > 0 and catalog.is_valid_identifier(name):
+            result.extend(_extract_paths(entry.dir_entries, parent=parent.append(name), entry_type=entry_type))
     return result
 
 
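The effect of the new `catalog.is_valid_identifier()` filter is that system-owned entries no longer surface in directory and table listings, at the current level or when recursing into subdirectories. A minimal sketch of the user-visible behavior (the directory and table names are invented for illustration):

```python
import pixeltable as pxt

pxt.create_dir('demo', if_exists='ignore')
pxt.create_table('demo.films', {'title': pxt.String}, if_exists='ignore')

# Listing helpers route through _extract_paths(); entries whose names fail
# catalog.is_valid_identifier() (i.e. internal system paths) are now skipped.
print(pxt.list_tables())  # ['demo.films'] -- no system paths
```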
pixeltable/io/datarows.py CHANGED
@@ -3,13 +3,14 @@ from __future__ import annotations
 from typing import Any, Iterable, Optional, Union
 
 import pixeltable as pxt
+import pixeltable.type_system as ts
 from pixeltable import exceptions as excs
 
 
 def _infer_schema_from_rows(
     rows: Iterable[dict[str, Any]], schema_overrides: dict[str, Any], primary_key: list[str]
-) -> dict[str, pxt.ColumnType]:
-    schema: dict[str, pxt.ColumnType] = {}
+) -> dict[str, ts.ColumnType]:
+    schema: dict[str, ts.ColumnType] = {}
     cols_with_nones: set[str] = set()
 
     for n, row in enumerate(rows):
@@ -23,7 +24,7 @@ def _infer_schema_from_rows(
         elif value is not None:
             # If `key` is not in `schema_overrides`, then we infer its type from the data.
             # The column type will always be nullable by default.
-            col_type = pxt.ColumnType.infer_literal_type(value, nullable=col_name not in primary_key)
+            col_type = ts.ColumnType.infer_literal_type(value, nullable=col_name not in primary_key)
             if col_type is None:
                 raise excs.Error(
                     f'Could not infer type for column `{col_name}`; the value in row {n} '
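For reference, the inference rules visible in this hunk: non-primary-key columns are nullable by default, and each value's type comes from `infer_literal_type()`. A minimal sketch calling the private helper directly (illustration only; the rows are invented):

```python
from pixeltable.io.datarows import _infer_schema_from_rows

rows = [
    {'id': 1, 'note': 'first'},
    {'id': 2, 'note': None},  # None values are tolerated; 'note' stays nullable
]
# 'id' is part of the primary key, so its inferred type is non-nullable;
# 'note' is inferred from its literals and is nullable by default.
schema = _infer_schema_from_rows(rows, schema_overrides={}, primary_key=['id'])
```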
pixeltable/io/label_studio.py CHANGED
@@ -11,7 +11,7 @@ import label_studio_sdk  # type: ignore[import-untyped]
 import PIL.Image
 from requests.exceptions import HTTPError
 
-import pixeltable as pxt
+import pixeltable.type_system as ts
 from pixeltable import Column, Table, env, exceptions as excs
 from pixeltable.config import Config
 from pixeltable.exprs import ColumnRef, DataRow, Expr
@@ -89,21 +89,21 @@ class LabelStudioProject(Project):
     def __project_config(self) -> '_LabelStudioConfig':
         return self.__parse_project_config(self.project_params['label_config'])
 
-    def get_export_columns(self) -> dict[str, pxt.ColumnType]:
+    def get_export_columns(self) -> dict[str, ts.ColumnType]:
         """
         The data keys and preannotation fields specified in this Label Studio project.
         """
         return self.__project_config.export_columns
 
-    def get_import_columns(self) -> dict[str, pxt.ColumnType]:
+    def get_import_columns(self) -> dict[str, ts.ColumnType]:
         """
         Always contains a single entry:
 
         ```
-        {"annotations": pxt.JsonType(nullable=True)}
+        {"annotations": ts.JsonType(nullable=True)}
         ```
         """
-        return {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}
+        return {ANNOTATIONS_COLUMN: ts.JsonType(nullable=True)}
 
     def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
         _logger.info(
@@ -412,8 +412,8 @@ class LabelStudioProject(Project):
         # TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
         ancestor = t
         while local_annotations_col not in ancestor._tbl_version.get().cols:
-            assert ancestor._base is not None
-            ancestor = ancestor._base
+            assert ancestor._base_table is not None
+            ancestor = ancestor._base_table
         update_status = ancestor.batch_update(updates)
         env.Env.get().console_logger.info(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
         return SyncStatus(pxt_rows_updated=update_status.num_rows, num_excs=update_status.num_excs)
@@ -577,10 +577,10 @@ class LabelStudioProject(Project):
         else:
             local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
             if local_annotations_column not in t._schema:
-                t.add_columns({local_annotations_column: pxt.JsonType(nullable=True)})
+                t.add_columns({local_annotations_column: ts.JsonType(nullable=True)})
 
         resolved_col_mapping = cls.validate_columns(
-            t, config.export_columns, {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}, col_mapping
+            t, config.export_columns, {ANNOTATIONS_COLUMN: ts.JsonType(nullable=True)}, col_mapping
         )
 
         # Perform some additional validation
@@ -649,7 +649,7 @@ class LabelStudioProject(Project):
 @dataclass(frozen=True)
 class _DataKey:
     name: Optional[str]  # The 'name' attribute of the data key; may differ from the field name
-    column_type: pxt.ColumnType
+    column_type: ts.ColumnType
 
 
 @dataclass(frozen=True)
@@ -673,18 +673,18 @@ class _LabelStudioConfig:
     )
 
     @property
-    def export_columns(self) -> dict[str, pxt.ColumnType]:
+    def export_columns(self) -> dict[str, ts.ColumnType]:
         data_key_cols = {key_id: key_info.column_type for key_id, key_info in self.data_keys.items()}
-        rl_cols = {name: pxt.JsonType() for name in self.rectangle_labels}
+        rl_cols = {name: ts.JsonType() for name in self.rectangle_labels}
         return {**data_key_cols, **rl_cols}
 
 
 ANNOTATIONS_COLUMN = 'annotations'
 _PAGE_SIZE = 100  # This is the default used in the LS SDK
 _LS_TAG_MAP = {
-    'header': pxt.StringType(),
-    'text': pxt.StringType(),
-    'image': pxt.ImageType(),
-    'video': pxt.VideoType(),
-    'audio': pxt.AudioType(),
+    'header': ts.StringType(),
+    'text': ts.StringType(),
+    'image': ts.ImageType(),
+    'video': ts.VideoType(),
+    'audio': ts.AudioType(),
 }
pixeltable/io/pandas.py CHANGED
@@ -8,6 +8,7 @@ from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype
 
 import pixeltable as pxt
 import pixeltable.exceptions as excs
+import pixeltable.type_system as ts
 
 
 def import_pandas(
@@ -119,15 +120,15 @@ def _df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
 
 
 def df_infer_schema(
-    df: pd.DataFrame, schema_overrides: dict[str, pxt.ColumnType], primary_key: list[str]
-) -> dict[str, pxt.ColumnType]:
+    df: pd.DataFrame, schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
+) -> dict[str, ts.ColumnType]:
     """
     Infers a Pixeltable schema from a Pandas DataFrame.
 
     Returns:
         A tuple containing a Pixeltable schema and a list of primary key column names.
     """
-    pd_schema: dict[str, pxt.ColumnType] = {}
+    pd_schema: dict[str, ts.ColumnType] = {}
     for pd_name, pd_dtype in zip(df.columns, df.dtypes):
         if pd_name in schema_overrides:
             pxt_type = schema_overrides[pd_name]
@@ -138,7 +139,7 @@ def df_infer_schema(
     return pd_schema
 
 
-def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
+def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[ts.ColumnType]:
     """
     Determines a pixeltable ColumnType from a pandas dtype
 
@@ -146,21 +147,21 @@ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
         pd_dtype: A pandas dtype object
 
     Returns:
-        pxt.ColumnType: A pixeltable ColumnType
+        ts.ColumnType: A pixeltable ColumnType
     """
     # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly
     # compatible with NumPy dtypes
     # The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
     if is_datetime64_any_dtype(pd_dtype):
-        return pxt.TimestampType(nullable=nullable)
+        return ts.TimestampType(nullable=nullable)
     if is_extension_array_dtype(pd_dtype):
         return None
     # Most other pandas dtypes are directly NumPy compatible
     assert isinstance(pd_dtype, np.dtype)
-    return pxt.ArrayType.from_np_dtype(pd_dtype, nullable)
+    return ts.ArrayType.from_np_dtype(pd_dtype, nullable)
 
 
-def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
+def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> ts.ColumnType:
     """
     Infers a Pixeltable type based on a pandas dtype.
     """
@@ -176,12 +177,12 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
 
     if len(data_col) == 0:
         # No non-null values; default to FloatType (the Pandas type of an all-NaN column)
-        return pxt.FloatType(nullable=nullable)
+        return ts.FloatType(nullable=nullable)
 
-    inferred_type = pxt.ColumnType.infer_common_literal_type(data_col)
+    inferred_type = ts.ColumnType.infer_common_literal_type(data_col)
     if inferred_type is None:
         # Fallback on StringType if everything else fails
-        return pxt.StringType(nullable=nullable)
+        return ts.StringType(nullable=nullable)
     else:
         return inferred_type.copy(nullable=nullable)
 
@@ -189,7 +190,7 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
 
 
 def _df_row_to_pxt_row(
-    row: tuple[Any, ...], schema: dict[str, pxt.ColumnType], col_mapping: Optional[dict[str, str]]
+    row: tuple[Any, ...], schema: dict[str, ts.ColumnType], col_mapping: Optional[dict[str, str]]
 ) -> dict[str, Any]:
     """Convert a row to insertable format"""
     pxt_row: dict[str, Any] = {}
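As context for the `pxt` → `ts` renames above, a minimal sketch of the inference path through `df_infer_schema()` (the DataFrame is invented; the dtype mappings follow the code above):

```python
import pandas as pd

from pixeltable.io.pandas import df_infer_schema

df = pd.DataFrame({
    'ts': pd.to_datetime(['2024-01-01', '2024-01-02']),  # datetime64 -> ts.TimestampType
    'x': [1.5, 2.5],                                      # np.float64 -> ts.ArrayType.from_np_dtype(...)
})
schema = df_infer_schema(df, schema_overrides={}, primary_key=[])
```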
pixeltable/io/table_data_conduit.py CHANGED
@@ -15,6 +15,7 @@ from pyarrow.parquet import ParquetDataset
 
 import pixeltable as pxt
 import pixeltable.exceptions as excs
+import pixeltable.type_system as ts
 from pixeltable.io.pandas import _df_check_primary_key_values, _df_row_to_pxt_row, df_infer_schema
 from pixeltable.utils import parse_local_file_path
 
@@ -72,6 +73,11 @@ class TableDataConduit:
     def check_source_format(self) -> None:
         assert self.source_format is None or TableDataConduitFormat.is_valid(self.source_format)
 
+    def __post_init__(self) -> None:
+        """If no extra_fields were provided, initialize to empty dict"""
+        if self.extra_fields is None:
+            self.extra_fields = {}
+
     @classmethod
     def is_rowdata_structure(cls, d: TableDataSource) -> bool:
         if not isinstance(d, list) or len(d) == 0:
@@ -83,7 +89,7 @@ class TableDataConduit:
 
     def normalize_pxt_schema_types(self) -> None:
         for name, coltype in self.pxt_schema.items():
-            self.pxt_schema[name] = pxt.ColumnType.normalize_type(coltype)
+            self.pxt_schema[name] = ts.ColumnType.normalize_type(coltype)
 
     def infer_schema(self) -> dict[str, Any]:
         raise NotImplementedError
@@ -393,7 +399,7 @@ class HFTableDataConduit(TableDataConduit):
                 f'Column name `{self.column_name_for_split}` already exists in dataset schema;'
                 f'provide a different `column_name_for_split`'
             )
-        self.src_schema[self.column_name_for_split] = pxt.StringType(nullable=True)
+        self.src_schema[self.column_name_for_split] = ts.StringType(nullable=True)
 
         inferred_schema, inferred_pk, self.source_column_map = normalize_schema_names(
             self.src_schema, self.src_pk, self.src_schema_overrides, True
pixeltable/metadata/__init__.py CHANGED
@@ -16,7 +16,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
 
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 31
+VERSION = 34
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
pixeltable/metadata/converters/convert_19.py CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Optional
 
 import sqlalchemy as sql
 
-import pixeltable as pxt
+import pixeltable.type_system as ts
 from pixeltable.metadata import register_converter, schema
 from pixeltable.metadata.converters.util import convert_table_md
 
@@ -34,7 +34,7 @@ def __update_timestamp_literals(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
     # timestamp literal, which (in version 19) is stored in the DB as a naive datetime.
     # We convert it to an aware datetime, stored in UTC.
     assert v['_classname'] == 'Literal'
-    assert v['val_t'] == pxt.ColumnType.Type.TIMESTAMP.name
+    assert v['val_t'] == ts.ColumnType.Type.TIMESTAMP.name
     assert isinstance(v['val'], str)
     dt = datetime.datetime.fromisoformat(v['val'])
     assert dt.tzinfo is None  # In version 19 all timestamps are naive
pixeltable/metadata/converters/convert_31.py ADDED
@@ -0,0 +1,11 @@
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+
+
+@register_converter(version=31)
+def _(engine: sql.engine.Engine) -> None:
+    # Add a column "lock_dummy: int8" to the dirs table in the store.
+    # This column is the target of an UPDATE operation to synchronize directory operations.
+    with engine.begin() as conn:
+        conn.execute(sql.text('ALTER TABLE dirs ADD COLUMN lock_dummy int8'))
pixeltable/metadata/converters/convert_32.py ADDED
@@ -0,0 +1,15 @@
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=32)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_md_updater=__update_table_md)
+
+
+def __update_table_md(table_md: dict, table_id: UUID) -> None:
+    table_md['is_replica'] = False
pixeltable/metadata/converters/convert_33.py ADDED
@@ -0,0 +1,17 @@
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=33)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_md_updater=__update_table_md)
+
+
+def __update_table_md(table_md: dict, table_id: UUID) -> None:
+    """Set default value of 'is_pk' field in column metadata to False"""
+    for col_md in table_md['column_md'].values():
+        col_md['is_pk'] = False if col_md['is_pk'] is None else col_md['is_pk']
pixeltable/metadata/notes.py CHANGED
@@ -2,6 +2,9 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    34: 'Set default value for is_pk field in column metadata to False',
+    33: 'Add is_replica field to table metadata',
+    32: 'Add the lock_dummy BIGINT column to the dirs table',
     31: 'Add table ids to metadata structs',
     30: 'Store default values and constant arguments as literals',
     29: 'Add user and additional_md fields to metadata structs',
pixeltable/metadata/schema.py CHANGED
@@ -1,7 +1,7 @@
 import dataclasses
 import typing
 import uuid
-from typing import Any, Optional, TypeVar, Union, get_type_hints
+from typing import Any, NamedTuple, Optional, TypeVar, Union, get_type_hints
 
 import sqlalchemy as sql
 from sqlalchemy import BigInteger, ForeignKey, Integer, LargeBinary, orm
@@ -84,6 +84,8 @@ class Dir(Base):
     )
     parent_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
     md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # DirMd
+    # This field is updated to synchronize database operations across multiple sessions
+    lock_dummy: orm.Mapped[int] = orm.mapped_column(BigInteger, nullable=True)
 
 
 @dataclasses.dataclass
@@ -155,6 +157,7 @@ class ViewMd:
 class TableMd:
     tbl_id: str  # uuid.UUID
     name: str
+    is_replica: bool
 
     user: Optional[str]
 
@@ -286,3 +289,25 @@ class Function(Base):
     dir_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
     md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # FunctionMd
     binary_obj: orm.Mapped[Optional[bytes]] = orm.mapped_column(LargeBinary, nullable=True)
+
+
+class FullTableMd(NamedTuple):
+    tbl_md: TableMd
+    version_md: TableVersionMd
+    schema_version_md: TableSchemaVersionMd
+
+    def as_dict(self) -> dict[str, Any]:
+        return {
+            'table_id': self.tbl_md.tbl_id,
+            'table_md': dataclasses.asdict(self.tbl_md),
+            'table_version_md': dataclasses.asdict(self.version_md),
+            'table_schema_version_md': dataclasses.asdict(self.schema_version_md),
+        }
+
+    @classmethod
+    def from_dict(cls, data_dict: dict[str, Any]) -> 'FullTableMd':
+        return FullTableMd(
+            tbl_md=md_from_dict(TableMd, data_dict['table_md']),
+            version_md=md_from_dict(TableVersionMd, data_dict['table_version_md']),
+            schema_version_md=md_from_dict(TableSchemaVersionMd, data_dict['table_schema_version_md']),
+        )
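`FullTableMd` bundles the three per-table metadata records that `TablePackager` previously assembled by hand, and `as_dict()`/`from_dict()` give a JSON-safe round trip. A minimal sketch (assuming `tbl_md`, `version_md`, and `schema_version_md` are existing `TableMd`, `TableVersionMd`, and `TableSchemaVersionMd` instances):

```python
import json

full_md = FullTableMd(tbl_md, version_md, schema_version_md)
payload = json.dumps(full_md.as_dict())  # JSON-serializable, safe to ship over the wire
restored = FullTableMd.from_dict(json.loads(payload))
assert restored.tbl_md.tbl_id == tbl_md.tbl_id
```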
pixeltable/plan.py CHANGED
@@ -768,8 +768,7 @@ class Planner:
         # - select list subexprs that aren't aggregates
         # - join clause subexprs
         # - subexprs of Where clause conjuncts that can't be run in SQL
-        # - all grouping exprs, if any aggregate function call can't be run in SQL (in that case, they all have to be
-        #   run in Python)
+        # - all grouping exprs
         candidates = list(
             exprs.Expr.list_subexprs(
                 analyzer.select_list,
@@ -784,7 +783,7 @@
             candidates.extend(
                 exprs.Expr.subexprs(analyzer.filter, filter=sql_elements.contains, traverse_matches=False)
             )
-        if is_python_agg and analyzer.group_by_clause is not None:
+        if analyzer.group_by_clause is not None:
             candidates.extend(
                 exprs.Expr.list_subexprs(analyzer.group_by_clause, filter=sql_elements.contains, traverse_matches=False)
             )
pixeltable/share/packager.py CHANGED
@@ -1,4 +1,3 @@
-import dataclasses
 import io
 import json
 import logging
@@ -6,7 +5,6 @@ import tarfile
 import urllib.parse
 import urllib.request
 import uuid
-from datetime import datetime
 from pathlib import Path
 from typing import Any, Iterator, Optional
 
@@ -58,28 +56,14 @@ class TablePackager:
         self.tmp_dir = Path(Env.get().create_tmp_path())
         self.media_files = {}
 
-        # Generate metadata
-        self.md = {
-            'pxt_version': pxt.__version__,
-            'pxt_md_version': metadata.VERSION,
-            'md': {
-                'tables': [
-                    {
-                        'table_id': str(t._tbl_version.id),
-                        # These are temporary; will replace with a better solution once the concurrency
-                        # changes to catalog have been merged
-                        'table_md': dataclasses.asdict(t._tbl_version.get()._create_tbl_md()),
-                        'table_version_md': dataclasses.asdict(
-                            t._tbl_version.get()._create_version_md(datetime.now().timestamp())
-                        ),
-                        'table_schema_version_md': dataclasses.asdict(
-                            t._tbl_version.get()._create_schema_version_md(0)
-                        ),
-                    }
-                    for t in (table, *table._bases)
-                ]
-            },
-        }
+        # Load metadata
+        with Env.get().begin_xact():
+            tbl_md = catalog.Catalog.get().load_replica_md(table)
+        self.md = {
+            'pxt_version': pxt.__version__,
+            'pxt_md_version': metadata.VERSION,
+            'md': {'tables': [md.as_dict() for md in tbl_md]},
+        }
         if additional_md is not None:
             self.md.update(additional_md)
 
@@ -94,7 +78,7 @@ class TablePackager:
             json.dump(self.md, fp)
         self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
         with Env.get().begin_xact():
-            ancestors = (self.table, *self.table._bases)
+            ancestors = (self.table, *self.table._base_tables)
             for t in ancestors:
                 _logger.info(f"Exporting table '{t._path}'.")
                 self.__export_table(t)
pixeltable/share/publish.py CHANGED
@@ -1,4 +1,3 @@
-import os
 import sys
 import urllib.parse
 import urllib.request
@@ -10,22 +9,22 @@ from tqdm import tqdm
 import pixeltable as pxt
 from pixeltable import exceptions as excs
 from pixeltable.env import Env
+from pixeltable.metadata.schema import FullTableMd
 from pixeltable.utils import sha256sum
 
 from .packager import TablePackager
 
 # These URLs are abstracted out for now, but will be replaced with actual (hard-coded) URLs once the
 # pixeltable.com URLs are available.
-_PUBLISH_URL = os.environ.get('PIXELTABLE_PUBLISH_URL')
-_FINALIZE_URL = os.environ.get('PIXELTABLE_FINALIZE_URL')
+
+PIXELTABLE_API_URL = 'https://internal-api.pixeltable.com'
 
 
 def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     packager = TablePackager(src_tbl, additional_md={'table_uri': dest_tbl_uri})
-    request_json = packager.md
-    headers_json = {'X-api-key': Env.get().pxt_api_key}
-
-    response = requests.post(_PUBLISH_URL, json=request_json, headers=headers_json)
+    request_json = packager.md | {'operation_type': 'publish_snapshot'}
+    headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
+    response = requests.post(PIXELTABLE_API_URL, json=request_json, headers=headers_json)
     if response.status_code != 200:
         raise excs.Error(f'Error publishing snapshot: {response.text}')
     response_json = response.json()
@@ -47,14 +46,14 @@ def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     Env.get().console_logger.info('Finalizing snapshot ...')
 
     finalize_request_json = {
+        'operation_type': 'finalize_snapshot',
         'upload_id': upload_id,
        'datafile': bundle.name,
        'size': bundle.stat().st_size,
        'sha256': sha256sum(bundle),  # Generate our own SHA for independent verification
    }
-
     # TODO: Use Pydantic for validation
-    finalize_response = requests.post(_FINALIZE_URL, json=finalize_request_json, headers=headers_json)
+    finalize_response = requests.post(PIXELTABLE_API_URL, json=finalize_request_json, headers=headers_json)
     if finalize_response.status_code != 200:
         raise excs.Error(f'Error finalizing snapshot: {finalize_response.text}')
     finalize_response_json = finalize_response.json()
@@ -66,6 +65,18 @@ def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     return confirmed_tbl_uri
 
 
+def clone_snapshot(dest_tbl_uri: str) -> list[FullTableMd]:
+    headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
+    clone_request_json = {'operation_type': 'clone_snapshot', 'table_uri': dest_tbl_uri}
+    response = requests.post(PIXELTABLE_API_URL, json=clone_request_json, headers=headers_json)
+    if response.status_code != 200:
+        raise excs.Error(f'Error cloning snapshot: {response.text}')
+    response_json = response.json()
+    if not isinstance(response_json, dict) or 'table_uri' not in response_json:
+        raise excs.Error(f'Unexpected response from server.\n{response_json}')
+    return [FullTableMd.from_dict(t) for t in response_json['md']['tables']]
+
+
 def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult) -> None:
     from pixeltable.utils.s3 import get_client
 
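Taken together, `publish_snapshot()` and the new `clone_snapshot()` form a round trip against the single `PIXELTABLE_API_URL` endpoint, dispatched via `operation_type`. A sketch of the intended flow (the table path and `pxt://` URI are placeholders):

```python
import pixeltable as pxt

from pixeltable.share.publish import clone_snapshot, publish_snapshot

snap = pxt.get_table('my_dir.my_snapshot')                # an existing snapshot
uri = publish_snapshot('pxt://my-org/my-snapshot', snap)  # package, upload, finalize

# Elsewhere: fetch the published metadata as FullTableMd records for replication.
tables_md = clone_snapshot(uri)
```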
pixeltable/store.py CHANGED
@@ -16,6 +16,7 @@ from pixeltable import catalog, exceptions as excs, exprs
 from pixeltable.env import Env
 from pixeltable.exec import ExecNode
 from pixeltable.metadata import schema
+from pixeltable.utils.exception_handler import run_cleanup
 from pixeltable.utils.media_store import MediaStore
 from pixeltable.utils.sql import log_explain, log_stmt
 
@@ -232,7 +233,6 @@ class StoreBase:
         assert col.tbl.id == self.tbl_version.id
         num_excs = 0
         num_rows = 0
-
         # create temp table to store output of exec_plan, with the same primary key as the store table
         tmp_name = f'temp_{self._storage_name()}'
         tmp_pk_cols = [sql.Column(col.name, col.type, primary_key=True) for col in self.pk_columns()]
@@ -301,10 +301,13 @@ class StoreBase:
             )
             log_explain(_logger, update_stmt, conn)
             conn.execute(update_stmt)
-
         finally:
-            tmp_tbl.drop(bind=conn)
-            self.sa_md.remove(tmp_tbl)
+
+            def remove_tmp_tbl() -> None:
+                self.sa_md.remove(tmp_tbl)
+                tmp_tbl.drop(bind=conn)
+
+            run_cleanup(remove_tmp_tbl, raise_error=True)
         return num_excs
 
     def insert_rows(
pixeltable/utils/exception_handler.py ADDED
@@ -0,0 +1,59 @@
+import logging
+import sys
+from typing import Any, Callable, Optional, TypeVar
+
+R = TypeVar('R')
+
+
+def _is_in_exception() -> bool:
+    """
+    Check if code is currently executing within an exception context.
+    """
+    current_exception = sys.exc_info()[1]
+    return current_exception is not None
+
+
+def run_cleanup_on_exception(cleanup_func: Callable[..., R], *args: Any, **kwargs: Any) -> Optional[R]:
+    """
+    Runs cleanup only when running in exception context.
+
+    The function `run_cleanup_on_exception()` should be used to clean up resources when an operation fails.
+    This is typically done using a try, except, and finally block, with the resource cleanup logic placed within
+    the except block. However, this pattern may not handle KeyboardInterrupt exceptions.
+    To ensure that resources are always cleaned up at least once when an exception or KeyboardInterrupt occurs,
+    create an idempotent function for cleaning up resources and pass it to the `run_cleanup_on_exception()` function
+    from the finally block.
+    """
+    if _is_in_exception():
+        return run_cleanup(cleanup_func, *args, raise_error=False, **kwargs)
+    return None
+
+
+def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool = True, **kwargs: Any) -> Optional[R]:
+    """
+    Runs a cleanup function. If interrupted, retry cleanup.
+    The `run_cleanup()` function ensures that the `cleanup_func()` function executes at least once.
+    If the `cleanup_func()` is interrupted during execution, it will be retried.
+
+    Args:
+        cleanup_func: an idempotent function
+        raise_error: raise an exception if an error occurs during cleanup.
+    """
+    try:
+        logging.debug(f'Running cleanup function: {cleanup_func.__name__!r}')
+        return cleanup_func(*args, **kwargs)
+    except KeyboardInterrupt as interrupt:
+        # Save original exception and re-attempt cleanup
+        original_exception = interrupt
+        logging.debug(f'Cleanup {cleanup_func.__name__!r} interrupted, retrying')
+        try:
+            return cleanup_func(*args, **kwargs)
+        except Exception as e:
+            # Suppress this exception
+            logging.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e}')
+        raise KeyboardInterrupt from original_exception
+    except Exception as e:
+        logging.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e}')
+        if raise_error:
+            raise e
+        return None
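The store.py hunk above is the first caller of `run_cleanup()`; the docstrings also describe the companion pattern for `run_cleanup_on_exception()`. A sketch of that pattern (the temp-file example is invented):

```python
import os
import tempfile

from pixeltable.utils.exception_handler import run_cleanup_on_exception

def export_data(rows: list[bytes]) -> str:
    fd, path = tempfile.mkstemp()
    os.close(fd)

    def delete_tmp_file() -> None:
        # Idempotent: a second call finds the file already gone and does nothing.
        if os.path.exists(path):
            os.remove(path)

    try:
        with open(path, 'wb') as fp:
            for row in rows:
                fp.write(row)  # may raise, or be hit by KeyboardInterrupt
        return path
    finally:
        # Deletes the temp file only if we are unwinding an exception
        # (including KeyboardInterrupt); on success the caller keeps the file.
        run_cleanup_on_exception(delete_tmp_file)
```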
{pixeltable-0.3.11.dist-info → pixeltable-0.3.13.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: pixeltable
-Version: 0.3.11
+Version: 0.3.13
 Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
 License: Apache-2.0
 Keywords: data-science,machine-learning,database,ai,computer-vision,chatbot,ml,artificial-intelligence,feature-engineering,multimodal,mlops,feature-store,vector-database,llm,genai