pixeltable 0.4.17__py3-none-any.whl → 0.4.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic.

@@ -280,7 +280,7 @@ class Catalog:
  - this needs to be done in a retry loop, because Postgres can decide to abort the transaction
  (SerializationFailure, LockNotAvailable)
  - for that reason, we do all lock acquisition prior to doing any real work (eg, compute column values),
- to minimize the probability of loosing that work due to a forced abort
+ to minimize the probability of losing that work due to a forced abort

  If convert_db_excs == True, converts DBAPIErrors into excs.Errors.
  """
@@ -433,7 +433,7 @@ class Catalog:

  The function should not raise exceptions; if it does, they are logged and ignored.
  """
- assert Env.get().in_xact
+ assert self.in_write_xact
  self._undo_actions.append(func)
  return func

@@ -792,19 +792,25 @@ class Catalog:
  return result

  @retry_loop(for_write=True)
- def move(self, path: Path, new_path: Path) -> None:
- self._move(path, new_path)
+ def move(self, path: Path, new_path: Path, if_exists: IfExistsParam, if_not_exists: IfNotExistsParam) -> None:
+ self._move(path, new_path, if_exists, if_not_exists)

- def _move(self, path: Path, new_path: Path) -> None:
- _, dest_dir, src_obj = self._prepare_dir_op(
+ def _move(self, path: Path, new_path: Path, if_exists: IfExistsParam, if_not_exists: IfNotExistsParam) -> None:
+ dest_obj, dest_dir, src_obj = self._prepare_dir_op(
  add_dir_path=new_path.parent,
  add_name=new_path.name,
  drop_dir_path=path.parent,
  drop_name=path.name,
- raise_if_exists=True,
- raise_if_not_exists=True,
+ raise_if_exists=(if_exists == IfExistsParam.ERROR),
+ raise_if_not_exists=(if_not_exists == IfNotExistsParam.ERROR),
  )
- src_obj._move(new_path.name, dest_dir._id)
+ assert dest_obj is None or if_exists == IfExistsParam.IGNORE
+ assert src_obj is not None or if_not_exists == IfNotExistsParam.IGNORE
+ if dest_obj is None and src_obj is not None:
+ # If dest_obj is not None, it means `if_exists='ignore'` and the destination already exists.
+ # If src_obj is None, it means `if_not_exists='ignore'` and the source doesn't exist.
+ # If dest_obj is None and src_obj is not None, then we can proceed with the move.
+ src_obj._move(new_path.name, dest_dir._id)

  def _prepare_dir_op(
  self,
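Note: the hunk above threads `if_exists` / `if_not_exists` through `Catalog.move` instead of always raising. As a rough illustration of the resulting decision logic, here is a self-contained sketch; the enum stand-ins below include only the members exercised in this hunk, and their string values plus the helper function are assumptions for illustration, not Pixeltable's actual API.

```
from enum import Enum

# Hypothetical stand-ins for the enums referenced in the diff.
class IfExistsParam(Enum):
    ERROR = 'error'
    IGNORE = 'ignore'

class IfNotExistsParam(Enum):
    ERROR = 'error'
    IGNORE = 'ignore'

def resolve_move(
    dest_exists: bool, src_exists: bool, if_exists: IfExistsParam, if_not_exists: IfNotExistsParam
) -> str:
    # Mirrors the branches added to Catalog._move (sketch only).
    if dest_exists and if_exists == IfExistsParam.ERROR:
        return 'error: destination already exists'
    if not src_exists and if_not_exists == IfNotExistsParam.ERROR:
        return 'error: source does not exist'
    if not dest_exists and src_exists:
        return 'move performed'
    return 'no-op (ignored)'
```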
@@ -815,7 +821,7 @@ class Catalog:
  drop_expected: Optional[type[SchemaObject]] = None,
  raise_if_exists: bool = False,
  raise_if_not_exists: bool = False,
- ) -> tuple[Optional[SchemaObject], Optional[SchemaObject], Optional[SchemaObject]]:
+ ) -> tuple[Optional[SchemaObject], Optional[Dir], Optional[SchemaObject]]:
  """
  Validates paths and acquires locks needed for a directory operation, ie, add/drop/rename (add + drop) of a
  directory entry.
@@ -902,9 +908,10 @@ class Catalog:
  schema.Table.md['name'].astext == name,
  schema.Table.md['user'].astext == user,
  )
- tbl_id = conn.execute(q).scalar_one_or_none()
- if tbl_id is not None:
- return self.get_table_by_id(tbl_id, version)
+ tbl_id = conn.execute(q).scalars().all()
+ assert len(tbl_id) <= 1, name
+ if len(tbl_id) == 1:
+ return self.get_table_by_id(tbl_id[0], version)

  return None

@@ -1084,7 +1091,7 @@ class Catalog:
  The metadata should be presented in standard "ancestor order", with the table being replicated at
  list position 0 and the (root) base table at list position -1.
  """
- assert Env.get().in_xact
+ assert self.in_write_xact

  tbl_id = UUID(md[0].tbl_md.tbl_id)

@@ -1150,11 +1157,11 @@ class Catalog:
  # We need to do this at the end, since `existing_path` needs to first have a non-fragment table version in
  # order to be instantiated as a schema object.
  existing = self.get_table_by_id(tbl_id)
- if existing is not None:
- existing_path = Path.parse(existing._path(), allow_system_path=True)
- if existing_path != path:
- assert existing_path.is_system_path
- self._move(existing_path, path)
+ assert existing is not None
+ existing_path = Path.parse(existing._path(), allow_system_path=True)
+ if existing_path != path:
+ assert existing_path.is_system_path
+ self._move(existing_path, path, IfExistsParam.ERROR, IfNotExistsParam.ERROR)

  def __ensure_system_dir_exists(self) -> Dir:
  system_path = Path.parse('_system', allow_system_path=True)
@@ -77,6 +77,17 @@ class Table(SchemaObject):
  self._tbl_version = None

  def _move(self, new_name: str, new_dir_id: UUID) -> None:
+ old_name = self._name
+ old_dir_id = self._dir_id
+
+ cat = catalog.Catalog.get()
+
+ @cat.register_undo_action
+ def _() -> None:
+ # TODO: We should really be invalidating the Table instance and forcing a reload.
+ self._name = old_name
+ self._dir_id = old_dir_id
+
  super()._move(new_name, new_dir_id)
  conn = env.Env.get().conn
  stmt = sql.text(
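The hunk above registers an undo action before mutating the in-memory Table object, so a forced transaction abort can roll the rename back. Below is a minimal, self-contained illustration of that decorator pattern; the `UndoLog` class is an assumption for illustration only, not Pixeltable's Catalog.

```
from typing import Callable

class UndoLog:
    def __init__(self) -> None:
        self._undo_actions: list[Callable[[], None]] = []

    def register_undo_action(self, func: Callable[[], None]) -> Callable[[], None]:
        # Used as a decorator: record func and return it unchanged.
        self._undo_actions.append(func)
        return func

    def rollback(self) -> None:
        # Run registered actions in reverse registration order.
        for func in reversed(self._undo_actions):
            func()
        self._undo_actions.clear()

log = UndoLog()
state = {'name': 'old'}
old_name = state['name']

@log.register_undo_action
def _() -> None:
    state['name'] = old_name  # restore the captured pre-move value

state['name'] = 'new'
log.rollback()
assert state['name'] == 'old'
```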
@@ -625,7 +636,7 @@ class Table(SchemaObject):
  - `'abort'`: an exception will be raised and the column will not be added.
  - `'ignore'`: execution will continue and the column will be added. Any rows
  with errors will have a `None` value for the column, with information about the error stored in the
- corresponding `tbl.col_name.errormsg` tbl.col_name.errortype` fields.
+ corresponding `tbl.col_name.errormsg` and `tbl.col_name.errortype` fields.
  if_exists: Determines the behavior if the column already exists. Must be one of the following:

  - `'error'`: an exception will be raised.
@@ -986,22 +997,28 @@ class Table(SchemaObject):
  Only `String` and `Image` columns are currently supported. Here's an example that uses a
  [CLIP embedding][pixeltable.functions.huggingface.clip] to index an image column:

+ ```
  >>> from pixeltable.functions.huggingface import clip
- ... embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
- ... tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
+ >>> embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
+ >>> tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
+ ```

- Once the index is created, similiarity lookups can be performed using the `similarity` pseudo-function.
+ Once the index is created, similarity lookups can be performed using the `similarity` pseudo-function:

+ ```
  >>> reference_img = PIL.Image.open('my_image.jpg')
- ... sim = tbl.img.similarity(reference_img)
- ... tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
+ >>> sim = tbl.img.similarity(reference_img)
+ >>> tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
+ ```

  If the embedding UDF is a multimodal embedding (supporting more than one data type), then lookups may be
  performed using any of its supported types. In our example, CLIP supports both text and images, so we can
  also search for images using a text description:

+ ```
  >>> sim = tbl.img.similarity('a picture of a train')
- ... tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
+ >>> tbl.select(tbl.img, sim).order_by(sim, asc=False).limit(5)
+ ```

  Args:
  column: The name of, or reference to, the column to be indexed; must be a `String` or `Image` column.
@@ -1032,9 +1049,9 @@ class Table(SchemaObject):
  Add an index to the `img` column of the table `my_table`:

  >>> from pixeltable.functions.huggingface import clip
- ... tbl = pxt.get_table('my_table')
- ... embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
- ... tbl.add_embedding_index(tbl.img, embedding=embedding_fn)
+ >>> tbl = pxt.get_table('my_table')
+ >>> embedding_fn = clip.using(model_id='openai/clip-vit-base-patch32')
+ >>> tbl.add_embedding_index(tbl.img, embedding=embedding_fn)

  Alternatively, the `img` column may be specified by name:

@@ -1328,7 +1345,8 @@ class Table(SchemaObject):
  on_error: Literal['abort', 'ignore'] = 'abort',
  print_stats: bool = False,
  **kwargs: Any,
- )```
+ )
+ ```

  To insert just a single row, you can use the more concise syntax:

@@ -1338,7 +1356,8 @@ class Table(SchemaObject):
  on_error: Literal['abort', 'ignore'] = 'abort',
  print_stats: bool = False,
  **kwargs: Any
- )```
+ )
+ ```

  Args:
  source: A data source from which data can be imported.
@@ -1459,8 +1478,8 @@ class Table(SchemaObject):
  the row with new `id` 3 (assuming this key does not exist):

  >>> tbl.update(
- [{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 3, 'name': 'Bob', 'age': 40}],
- if_not_exists='insert')
+ ... [{'id': 1, 'name': 'Alice', 'age': 30}, {'id': 3, 'name': 'Bob', 'age': 40}],
+ ... if_not_exists='insert')
  """
  from pixeltable.catalog import Catalog

@@ -96,6 +96,8 @@ class TableVersion:
  cols_by_name: dict[str, Column]
  # contains only columns visible in this version, both system and user
  cols_by_id: dict[int, Column]
+ # all indices defined on this table
+ all_idxs: dict[str, TableVersion.IndexInfo]
  # contains only actively maintained indices
  idxs_by_name: dict[str, TableVersion.IndexInfo]

@@ -129,6 +131,12 @@ class TableVersion:
  base_path: Optional[pxt.catalog.TableVersionPath] = None,
  base: Optional[TableVersionHandle] = None,
  ):
+ from pixeltable import exprs
+ from pixeltable.plan import SampleClause
+
+ from .table_version_handle import TableVersionHandle
+ from .table_version_path import TableVersionPath
+
  self.is_validated = True # a freshly constructed instance is always valid
  self.is_initialized = False
  self.id = id
@@ -141,9 +149,6 @@ class TableVersion:
  self.store_tbl = None

  # mutable tables need their TableVersionPath for expr eval during updates
- from .table_version_handle import TableVersionHandle
- from .table_version_path import TableVersionPath
-
  if self.is_snapshot:
  self.path = None
  else:
@@ -153,9 +158,6 @@ class TableVersion:
  self.path = TableVersionPath(self_handle, base=base_path)

  # view-specific initialization
- from pixeltable import exprs
- from pixeltable.plan import SampleClause
-
  predicate_dict = None if self.view_md is None or self.view_md.predicate is None else self.view_md.predicate
  self.predicate = exprs.Expr.from_dict(predicate_dict) if predicate_dict is not None else None
  sample_dict = None if self.view_md is None or self.view_md.sample_clause is None else self.view_md.sample_clause
@@ -180,6 +182,7 @@ class TableVersion:
  self.cols = []
  self.cols_by_name = {}
  self.cols_by_id = {}
+ self.all_idxs = {}
  self.idxs_by_name = {}
  self.external_stores = {}

@@ -373,7 +376,7 @@ class TableVersion:
  cat._tbl_versions[tbl_version.id, tbl_version.effective_version] = tbl_version
  tbl_version.init()
  tbl_version.store_tbl.create()
- tbl_version.store_tbl.ensure_columns_exist(col for col in tbl_version.cols if col.is_stored)
+ tbl_version.store_tbl.ensure_updated_schema()
  return tbl_version

  def delete_media(self, tbl_version: Optional[int] = None) -> None:
@@ -463,13 +466,17 @@ class TableVersion:
  idx_col = self._lookup_column(QColumnId(UUID(md.indexed_col_tbl_id), md.indexed_col_id))
  assert idx_col is not None
  idx = cls.from_dict(idx_col, md.init_args)
+ assert isinstance(idx, index.IndexBase)
+
+ val_col = next(col for col in self.cols if col.id == md.index_val_col_id)
+ undo_col = next(col for col in self.cols if col.id == md.index_val_undo_col_id)
+ idx_info = self.IndexInfo(id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col)
+ self.all_idxs[md.name] = idx_info

  # fix up the sa column type of the index value and undo columns
  # we need to do this for all indices, not just those that are active in this TableVersion, to ensure we get
  # the correct SA schema in the StoreTable.
- val_col = next(col for col in self.cols if col.id == md.index_val_col_id)
  val_col.sa_col_type = idx.index_sa_type()
- undo_col = next(col for col in self.cols if col.id == md.index_val_undo_col_id)
  undo_col.sa_col_type = idx.index_sa_type()
  if not isinstance(idx, index.EmbeddingIndex):
  # Historically, the intent has been not to store cellmd data, even for embedding indices. However,
@@ -501,9 +508,6 @@ class TableVersion:
  assert md.indexed_col_id in self.cols_by_id
  assert md.index_val_col_id in self.cols_by_id
  assert md.index_val_undo_col_id in self.cols_by_id
- idx_info = self.IndexInfo(
- id=md.id, name=md.name, idx=idx, col=idx_col, val_col=val_col, undo_col=undo_col
- )
  self.idxs_by_name[md.name] = idx_info

  def _lookup_column(self, id: QColumnId) -> Column | None:
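Net effect of the TableVersion hunks above: index metadata is now recorded in two maps, `all_idxs` (every index defined on the table) and `idxs_by_name` (only the actively maintained ones), so the store schema can be derived from the full set. A toy sketch of the intended relationship; names and types below are simplified assumptions, not the actual `IndexInfo` definition.

```
from dataclasses import dataclass

@dataclass
class IndexInfo:
    id: int
    name: str
    active: bool  # simplification; the real code tracks activity via the two maps

all_idxs: dict[str, IndexInfo] = {
    'idx0': IndexInfo(0, 'idx0', active=True),
    'idx1': IndexInfo(1, 'idx1', active=False),  # e.g. no longer maintained in this version
}
idxs_by_name = {name: info for name, info in all_idxs.items() if info.active}

# Invariant: actively maintained indices are a subset of all indices.
assert set(idxs_by_name) <= set(all_idxs)
```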
pixeltable/dataframe.py CHANGED
@@ -1039,7 +1039,7 @@ class DataFrame:
  >>> df = book.order_by(t.price, asc=False).order_by(t.pages)
  """
  if self.sample_clause is not None:
- raise excs.Error('group_by() cannot be used with sample()')
+ raise excs.Error('order_by() cannot be used with sample()')
  for e in expr_list:
  if not isinstance(e, exprs.Expr):
  raise excs.Error(f'Invalid expression in order_by(): {e}')
pixeltable/env.py CHANGED
@@ -760,10 +760,12 @@ class Env:

  def __register_packages(self) -> None:
  """Declare optional packages that are utilized by some parts of the code."""
+ self.__register_package('accelerate')
  self.__register_package('anthropic')
  self.__register_package('azure.storage.blob', library_name='azure-storage-blob')
  self.__register_package('boto3')
  self.__register_package('datasets')
+ self.__register_package('diffusers')
  self.__register_package('fiftyone')
  self.__register_package('fireworks', library_name='fireworks-ai')
  self.__register_package('google.cloud.storage', library_name='google-cloud-storage')
@@ -771,6 +773,7 @@ class Env:
  self.__register_package('groq')
  self.__register_package('huggingface_hub', library_name='huggingface-hub')
  self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
+ self.__register_package('librosa')
  self.__register_package('llama_cpp', library_name='llama-cpp-python')
  self.__register_package('mcp')
  self.__register_package('mistralai')
@@ -783,6 +786,7 @@ class Env:
  self.__register_package('replicate')
  self.__register_package('sentencepiece')
  self.__register_package('sentence_transformers', library_name='sentence-transformers')
+ self.__register_package('soundfile')
  self.__register_package('spacy')
  self.__register_package('tiktoken')
  self.__register_package('together')
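These entries extend Env's registry of optional dependencies (audio and diffusion-model support, among others), mapping an importable module name to the pip package that provides it. A rough sketch of that kind of registry is shown below; the helper functions are illustrative assumptions, not Env's actual implementation.

```
import importlib.util
from typing import Optional

_registered: dict[str, str] = {}

def register_package(module_name: str, library_name: Optional[str] = None) -> None:
    # e.g. module 'huggingface_hub' is installed via the 'huggingface-hub' distribution
    _registered[module_name] = library_name or module_name

def is_installed(module_name: str) -> bool:
    # True if the module can be imported in the current environment.
    return importlib.util.find_spec(module_name) is not None

register_package('librosa')
register_package('soundfile')
register_package('huggingface_hub', library_name='huggingface-hub')
```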
@@ -1,3 +1,4 @@
+ import random
  from typing import Optional

  import sqlalchemy as sql
@@ -8,6 +9,17 @@ from pixeltable import exprs
  class ExecContext:
  """Class for execution runtime constants"""

+ row_builder: exprs.RowBuilder
+ profile: exprs.ExecProfile
+ show_pbar: bool
+ batch_size: int
+ num_rows: Optional[int]
+ conn: Optional[sql.engine.Connection]
+ pk_clause: Optional[list[sql.ClauseElement]]
+ num_computed_exprs: int
+ ignore_errors: bool
+ random_seed: int # general-purpose source of randomness with execution scope
+
  def __init__(
  self,
  row_builder: exprs.RowBuilder,
@@ -23,8 +35,9 @@ class ExecContext:
  self.row_builder = row_builder
  self.profile = exprs.ExecProfile(row_builder)
  # num_rows is used to compute the total number of computed cells used for the progress bar
- self.num_rows: Optional[int] = None
- self.conn: Optional[sql.engine.Connection] = None # if present, use this to execute SQL queries
+ self.num_rows = None
+ self.conn = None # if present, use this to execute SQL queries
  self.pk_clause = pk_clause
  self.num_computed_exprs = num_computed_exprs
  self.ignore_errors = ignore_errors
+ self.random_seed = random.randint(0, 1 << 63)
@@ -648,7 +648,6 @@ class SqlSampleNode(SqlNode):
  )
  self.stratify_exprs = stratify_exprs
  self.sample_clause = sample_clause
- assert isinstance(self.sample_clause.seed, int)

  @classmethod
  def key_sql_expr(cls, seed: sql.ColumnElement, sql_cols: Iterable[sql.ColumnElement]) -> sql.ColumnElement:
@@ -667,7 +666,9 @@ class SqlSampleNode(SqlNode):
  """Create an expression for randomly ordering rows with a given seed"""
  rowid_cols = [*cte.c[-self.pk_count : -1]] # exclude the version column
  assert len(rowid_cols) > 0
- return self.key_sql_expr(sql.literal_column(str(self.sample_clause.seed)), rowid_cols)
+ # If seed is not set in the sample clause, use the random seed given by the execution context
+ seed = self.sample_clause.seed if self.sample_clause.seed is not None else self.ctx.random_seed
+ return self.key_sql_expr(sql.literal_column(str(seed)), rowid_cols)

  def _create_stmt(self) -> sql.Select:
  from pixeltable.plan import SampleClause
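Taken together, the ExecContext and SqlSampleNode changes above make unseeded sampling well defined within a single query execution: one seed is drawn per execution and reused wherever the plan needs randomness, while an explicit seed in the sample clause still takes precedence. A condensed sketch of that fallback; the class and function names below (other than `random_seed` and `seed`) are placeholders, not the actual Pixeltable classes.

```
import random
from typing import Optional

class ExecCtx:
    def __init__(self) -> None:
        # One general-purpose seed per query execution.
        self.random_seed = random.randint(0, 1 << 63)

def effective_seed(clause_seed: Optional[int], ctx: ExecCtx) -> int:
    # Mirrors the fallback added in SqlSampleNode: an explicit seed wins,
    # otherwise the execution-scoped seed keeps row ordering stable for this run.
    return clause_seed if clause_seed is not None else ctx.random_seed

ctx = ExecCtx()
assert effective_seed(42, ctx) == 42
assert effective_seed(None, ctx) == ctx.random_seed
```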