pixeltable 0.3.13__py3-none-any.whl → 0.3.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +2 -2
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +9 -7
- pixeltable/catalog/column.py +6 -2
- pixeltable/catalog/dir.py +2 -1
- pixeltable/catalog/insertable_table.py +1 -1
- pixeltable/catalog/schema_object.py +2 -1
- pixeltable/catalog/table.py +12 -8
- pixeltable/catalog/table_version.py +21 -0
- pixeltable/catalog/view.py +3 -3
- pixeltable/dataframe.py +48 -5
- pixeltable/env.py +1 -1
- pixeltable/exec/aggregation_node.py +14 -0
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/expr_eval/expr_eval_node.py +1 -1
- pixeltable/exprs/column_ref.py +42 -17
- pixeltable/exprs/data_row.py +3 -0
- pixeltable/exprs/globals.py +1 -1
- pixeltable/exprs/literal.py +11 -1
- pixeltable/exprs/rowid_ref.py +4 -1
- pixeltable/exprs/similarity_expr.py +1 -1
- pixeltable/func/function.py +1 -1
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/date.py +185 -0
- pixeltable/functions/gemini.py +184 -49
- pixeltable/functions/globals.py +1 -16
- pixeltable/functions/json.py +2 -1
- pixeltable/functions/math.py +103 -0
- pixeltable/functions/string.py +1 -2
- pixeltable/functions/video.py +2 -2
- pixeltable/globals.py +26 -9
- pixeltable/io/hf_datasets.py +2 -2
- pixeltable/io/pandas.py +16 -4
- pixeltable/io/parquet.py +4 -2
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_34.py +21 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/plan.py +12 -5
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +397 -120
- pixeltable/share/publish.py +61 -16
- pixeltable/store.py +57 -20
- pixeltable/type_system.py +46 -2
- pixeltable/utils/arrow.py +8 -2
- pixeltable/utils/pytorch.py +4 -0
- {pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/METADATA +2 -4
- {pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/RECORD +50 -48
- {pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/entry_points.txt +0 -0
pixeltable/__init__.py
CHANGED
|
@@ -9,6 +9,7 @@ from .globals import (
|
|
|
9
9
|
array,
|
|
10
10
|
configure_logging,
|
|
11
11
|
create_dir,
|
|
12
|
+
create_replica,
|
|
12
13
|
create_snapshot,
|
|
13
14
|
create_table,
|
|
14
15
|
create_view,
|
|
@@ -20,11 +21,10 @@ from .globals import (
|
|
|
20
21
|
list_functions,
|
|
21
22
|
list_tables,
|
|
22
23
|
move,
|
|
23
|
-
publish_snapshot,
|
|
24
24
|
tool,
|
|
25
25
|
tools,
|
|
26
26
|
)
|
|
27
|
-
from .type_system import Array, Audio, Bool, Document, Float, Image, Int, Json, Required, String, Timestamp, Video
|
|
27
|
+
from .type_system import Array, Audio, Bool, Date, Document, Float, Image, Int, Json, Required, String, Timestamp, Video
|
|
28
28
|
|
|
29
29
|
# This import must go last to avoid circular imports.
|
|
30
30
|
from . import ext, functions, io, iterators # isort: skip
|
pixeltable/__version__.py
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
1
|
# These version placeholders will be replaced during build.
|
|
2
|
-
__version__ = '0.3.13'
|
|
3
|
-
__version_tuple__ = (0, 3, 13)
|
|
2
|
+
__version__ = '0.3.15'
|
|
3
|
+
__version_tuple__ = (0, 3, 15)
|
pixeltable/catalog/catalog.py
CHANGED
|
@@ -432,7 +432,9 @@ class Catalog:
|
|
|
432
432
|
return view
|
|
433
433
|
|
|
434
434
|
@_retry_loop
|
|
435
|
-
def create_replica(
|
|
435
|
+
def create_replica(
|
|
436
|
+
self, path: Path, md: list[schema.FullTableMd], if_exists: IfExistsParam = IfExistsParam.ERROR
|
|
437
|
+
) -> Table:
|
|
436
438
|
"""
|
|
437
439
|
Creates table, table_version, and table_schema_version records for a replica with the given metadata.
|
|
438
440
|
The metadata should be presented in standard "ancestor order", with the table being replicated at
|
|
@@ -458,11 +460,11 @@ class Catalog:
|
|
|
458
460
|
# TODO: Handle concurrency in create_replica()
|
|
459
461
|
existing = Catalog.get().get_table_by_id(tbl_id)
|
|
460
462
|
if existing is not None:
|
|
461
|
-
existing_path = Path(existing._path
|
|
463
|
+
existing_path = Path(existing._path, allow_system_paths=True)
|
|
462
464
|
# It does exist. If it's a non-system table, that's an error: it's already been replicated.
|
|
463
465
|
if not existing_path.is_system_path:
|
|
464
466
|
raise excs.Error(
|
|
465
|
-
f'That table has already been replicated as {existing._path
|
|
467
|
+
f'That table has already been replicated as {existing._path!r}. \n'
|
|
466
468
|
f'Drop the existing replica if you wish to re-create it.'
|
|
467
469
|
)
|
|
468
470
|
# If it's a system table, then this means it was created at some point as the ancestor of some other
|
|
@@ -487,7 +489,7 @@ class Catalog:
|
|
|
487
489
|
# The table already exists in the catalog. The existing path might be a system path (if the table
|
|
488
490
|
# was created as an anonymous base table of some other table), or it might not (if it's a snapshot
|
|
489
491
|
# that was directly replicated by the user at some point). In either case, use the existing path.
|
|
490
|
-
replica_path = Path(replica._path
|
|
492
|
+
replica_path = Path(replica._path, allow_system_paths=True)
|
|
491
493
|
|
|
492
494
|
# Store the metadata; it could be a new version (in which case a new record will be created) or a
|
|
493
495
|
# known version (in which case the newly received metadata will be validated as identical).
|
|
@@ -619,11 +621,11 @@ class Catalog:
|
|
|
619
621
|
msg: str
|
|
620
622
|
if is_replace:
|
|
621
623
|
msg = (
|
|
622
|
-
f'{obj_type_str} {tbl._path
|
|
624
|
+
f'{obj_type_str} {tbl._path} already exists and has dependents. '
|
|
623
625
|
"Use `if_exists='replace_force'` to replace it."
|
|
624
626
|
)
|
|
625
627
|
else:
|
|
626
|
-
msg = f'{obj_type_str} {tbl._path
|
|
628
|
+
msg = f'{obj_type_str} {tbl._path} has dependents.'
|
|
627
629
|
raise excs.Error(msg)
|
|
628
630
|
|
|
629
631
|
for view_id in view_ids:
|
|
@@ -634,7 +636,7 @@ class Catalog:
|
|
|
634
636
|
tbl._drop()
|
|
635
637
|
assert tbl._id in self._tbls
|
|
636
638
|
del self._tbls[tbl._id]
|
|
637
|
-
_logger.info(f'Dropped table `{tbl._path
|
|
639
|
+
_logger.info(f'Dropped table `{tbl._path}`.')
|
|
638
640
|
|
|
639
641
|
@_retry_loop
|
|
640
642
|
def create_dir(self, path: Path, if_exists: IfExistsParam, parents: bool) -> Dir:
|
pixeltable/catalog/column.py
CHANGED
|
@@ -16,6 +16,7 @@ from .globals import MediaValidation, is_valid_identifier
|
|
|
16
16
|
if TYPE_CHECKING:
|
|
17
17
|
from .table_version import TableVersion
|
|
18
18
|
from .table_version_handle import TableVersionHandle
|
|
19
|
+
from .table_version_path import TableVersionPath
|
|
19
20
|
|
|
20
21
|
_logger = logging.getLogger('pixeltable')
|
|
21
22
|
|
|
@@ -170,9 +171,12 @@ class Column:
|
|
|
170
171
|
)
|
|
171
172
|
return len(window_fn_calls) > 0
|
|
172
173
|
|
|
173
|
-
|
|
174
|
+
# TODO: This should be moved out of `Column` (its presence in `Column` doesn't anticipate indices being defined on
|
|
175
|
+
# multiple dependents)
|
|
176
|
+
def get_idx_info(self, reference_tbl: Optional['TableVersionPath'] = None) -> dict[str, 'TableVersion.IndexInfo']:
|
|
174
177
|
assert self.tbl is not None
|
|
175
|
-
|
|
178
|
+
tbl = reference_tbl.tbl_version if reference_tbl is not None else self.tbl
|
|
179
|
+
return {name: info for name, info in tbl.get().idxs_by_name.items() if info.col == self}
|
|
176
180
|
|
|
177
181
|
@property
|
|
178
182
|
def is_computed(self) -> bool:
|
pixeltable/catalog/dir.py
CHANGED
|
@@ -38,12 +38,13 @@ class Dir(SchemaObject):
|
|
|
38
38
|
def _display_name(cls) -> str:
|
|
39
39
|
return 'directory'
|
|
40
40
|
|
|
41
|
+
@property
|
|
41
42
|
def _path(self) -> str:
|
|
42
43
|
"""Returns the path to this schema object."""
|
|
43
44
|
if self._dir_id is None:
|
|
44
45
|
# we're the root dir
|
|
45
46
|
return ''
|
|
46
|
-
return super()._path
|
|
47
|
+
return super()._path
|
|
47
48
|
|
|
48
49
|
def _move(self, new_name: str, new_dir_id: UUID) -> None:
|
|
49
50
|
# print(
|
|
@@ -33,6 +33,7 @@ class SchemaObject:
|
|
|
33
33
|
return None
|
|
34
34
|
return Catalog.get().get_dir(self._dir_id)
|
|
35
35
|
|
|
36
|
+
@property
|
|
36
37
|
def _path(self) -> str:
|
|
37
38
|
"""Returns the path to this schema object."""
|
|
38
39
|
from .catalog import Catalog
|
|
@@ -44,7 +45,7 @@ class SchemaObject:
|
|
|
44
45
|
|
|
45
46
|
def get_metadata(self) -> dict[str, Any]:
|
|
46
47
|
"""Returns metadata associated with this schema object."""
|
|
47
|
-
return {'name': self._name, 'path': self._path
|
|
48
|
+
return {'name': self._name, 'path': self._path}
|
|
48
49
|
|
|
49
50
|
@classmethod
|
|
50
51
|
@abstractmethod
|
pixeltable/catalog/table.py
CHANGED
|
@@ -109,7 +109,7 @@ class Table(SchemaObject):
|
|
|
109
109
|
self._check_is_dropped()
|
|
110
110
|
with env.Env.get().begin_xact():
|
|
111
111
|
md = super().get_metadata()
|
|
112
|
-
md['base'] = self._base_table._path
|
|
112
|
+
md['base'] = self._base_table._path if self._base_table is not None else None
|
|
113
113
|
md['schema'] = self._schema
|
|
114
114
|
md['is_replica'] = self._tbl_version.get().is_replica
|
|
115
115
|
md['version'] = self._version
|
|
@@ -146,7 +146,7 @@ class Table(SchemaObject):
|
|
|
146
146
|
col = self._tbl_version_path.get_column(name)
|
|
147
147
|
if col is None:
|
|
148
148
|
raise AttributeError(f'Column {name!r} unknown')
|
|
149
|
-
return ColumnRef(col)
|
|
149
|
+
return ColumnRef(col, reference_tbl=self._tbl_version_path)
|
|
150
150
|
|
|
151
151
|
def __getitem__(self, name: str) -> 'exprs.ColumnRef':
|
|
152
152
|
"""Return a ColumnRef for the given name."""
|
|
@@ -165,7 +165,7 @@ class Table(SchemaObject):
|
|
|
165
165
|
"""
|
|
166
166
|
self._check_is_dropped()
|
|
167
167
|
with env.Env.get().begin_xact():
|
|
168
|
-
return [t._path
|
|
168
|
+
return [t._path for t in self._get_views(recursive=recursive)]
|
|
169
169
|
|
|
170
170
|
def _get_views(self, *, recursive: bool = True) -> list['Table']:
|
|
171
171
|
cat = catalog.Catalog.get()
|
|
@@ -220,6 +220,10 @@ class Table(SchemaObject):
|
|
|
220
220
|
"""
|
|
221
221
|
return self._df().group_by(*items)
|
|
222
222
|
|
|
223
|
+
def distinct(self) -> 'pxt.DataFrame':
|
|
224
|
+
"""Remove duplicate rows from table."""
|
|
225
|
+
return self._df().distinct()
|
|
226
|
+
|
|
223
227
|
def limit(self, n: int) -> 'pxt.DataFrame':
|
|
224
228
|
return self._df().limit(n)
|
|
225
229
|
|
|
@@ -254,11 +258,15 @@ class Table(SchemaObject):
|
|
|
254
258
|
"""Return the schema (column names and column types) of this table."""
|
|
255
259
|
return {c.name: c.col_type for c in self._tbl_version_path.columns()}
|
|
256
260
|
|
|
261
|
+
@property
|
|
262
|
+
def base_table(self) -> Optional['Table']:
|
|
263
|
+
with env.Env.get().begin_xact():
|
|
264
|
+
return self._base_table
|
|
265
|
+
|
|
257
266
|
@property
|
|
258
267
|
@abc.abstractmethod
|
|
259
268
|
def _base_table(self) -> Optional['Table']:
|
|
260
269
|
"""The base's Table instance"""
|
|
261
|
-
...
|
|
262
270
|
|
|
263
271
|
@property
|
|
264
272
|
def _base_tables(self) -> list['Table']:
|
|
@@ -274,7 +282,6 @@ class Table(SchemaObject):
|
|
|
274
282
|
@abc.abstractmethod
|
|
275
283
|
def _effective_base_versions(self) -> list[Optional[int]]:
|
|
276
284
|
"""The effective versions of the ancestor bases, starting with its immediate base."""
|
|
277
|
-
...
|
|
278
285
|
|
|
279
286
|
@property
|
|
280
287
|
def _comment(self) -> str:
|
|
@@ -311,9 +318,6 @@ class Table(SchemaObject):
|
|
|
311
318
|
helper.append(f'COMMENT: {self._comment}')
|
|
312
319
|
return helper
|
|
313
320
|
|
|
314
|
-
@abc.abstractmethod
|
|
315
|
-
def _table_descriptor(self) -> str: ...
|
|
316
|
-
|
|
317
321
|
def _col_descriptor(self, columns: Optional[list[str]] = None) -> pd.DataFrame:
|
|
318
322
|
return pd.DataFrame(
|
|
319
323
|
{
|
|
@@ -202,6 +202,13 @@ class TableVersion:
|
|
|
202
202
|
|
|
203
203
|
return TableVersionHandle(self.id, self.effective_version, tbl_version=self)
|
|
204
204
|
|
|
205
|
+
@property
|
|
206
|
+
def versioned_name(self) -> str:
|
|
207
|
+
if self.effective_version is None:
|
|
208
|
+
return self.name
|
|
209
|
+
else:
|
|
210
|
+
return f'{self.name}:{self.effective_version}'
|
|
211
|
+
|
|
205
212
|
@classmethod
|
|
206
213
|
def create(
|
|
207
214
|
cls,
|
|
@@ -314,6 +321,20 @@ class TableVersion:
|
|
|
314
321
|
session.add(schema_version_record)
|
|
315
322
|
return tbl_record.id, tbl_version
|
|
316
323
|
|
|
324
|
+
@classmethod
|
|
325
|
+
def create_replica(cls, md: schema.FullTableMd) -> TableVersion:
|
|
326
|
+
tbl_id = UUID(md.tbl_md.tbl_id)
|
|
327
|
+
_logger.info(f'Creating replica table version {tbl_id}:{md.version_md.version}.')
|
|
328
|
+
view_md = md.tbl_md.view_md
|
|
329
|
+
base_path = pxt.catalog.TableVersionPath.from_md(view_md.base_versions) if view_md is not None else None
|
|
330
|
+
base = base_path.tbl_version if base_path is not None else None
|
|
331
|
+
tbl_version = cls(
|
|
332
|
+
tbl_id, md.tbl_md, md.version_md.version, md.schema_version_md, [], base_path=base_path, base=base
|
|
333
|
+
)
|
|
334
|
+
tbl_version.store_tbl.create()
|
|
335
|
+
tbl_version.store_tbl.ensure_columns_exist(col for col in tbl_version.cols if col.is_stored)
|
|
336
|
+
return tbl_version
|
|
337
|
+
|
|
317
338
|
def drop(self) -> None:
|
|
318
339
|
from .catalog import Catalog
|
|
319
340
|
|
pixeltable/catalog/view.py
CHANGED
|
@@ -285,13 +285,13 @@ class View(Table):
|
|
|
285
285
|
|
|
286
286
|
def _table_descriptor(self) -> str:
|
|
287
287
|
display_name = 'Snapshot' if self._snapshot_only else 'View'
|
|
288
|
-
result = [f'{display_name} {self._path
|
|
288
|
+
result = [f'{display_name} {self._path!r}']
|
|
289
289
|
bases_descrs: list[str] = []
|
|
290
290
|
for base, effective_version in zip(self._base_tables, self._effective_base_versions):
|
|
291
291
|
if effective_version is None:
|
|
292
|
-
bases_descrs.append(f'{base._path
|
|
292
|
+
bases_descrs.append(f'{base._path!r}')
|
|
293
293
|
else:
|
|
294
|
-
base_descr = f'{base._path
|
|
294
|
+
base_descr = f'{base._path}:{effective_version}'
|
|
295
295
|
bases_descrs.append(f'{base_descr!r}')
|
|
296
296
|
result.append(f' (of {", ".join(bases_descrs)})')
|
|
297
297
|
|
pixeltable/dataframe.py
CHANGED
|
@@ -322,6 +322,8 @@ class DataFrame:
|
|
|
322
322
|
raise excs.Error('head() cannot be used with order_by()')
|
|
323
323
|
if self._has_joins():
|
|
324
324
|
raise excs.Error('head() not supported for joins')
|
|
325
|
+
if self.group_by_clause is not None:
|
|
326
|
+
raise excs.Error('head() cannot be used with group_by()')
|
|
325
327
|
num_rowid_cols = len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
|
|
326
328
|
order_by_clause = [exprs.RowidRef(self._first_tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
|
|
327
329
|
return self.order_by(*order_by_clause, asc=True).limit(n).collect()
|
|
@@ -345,6 +347,8 @@ class DataFrame:
|
|
|
345
347
|
raise excs.Error('tail() cannot be used with order_by()')
|
|
346
348
|
if self._has_joins():
|
|
347
349
|
raise excs.Error('tail() not supported for joins')
|
|
350
|
+
if self.group_by_clause is not None:
|
|
351
|
+
raise excs.Error('tail() cannot be used with group_by()')
|
|
348
352
|
num_rowid_cols = len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
|
|
349
353
|
order_by_clause = [exprs.RowidRef(self._first_tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
|
|
350
354
|
result = self.order_by(*order_by_clause, asc=False).limit(n).collect()
|
|
@@ -454,6 +458,9 @@ class DataFrame:
|
|
|
454
458
|
Returns:
|
|
455
459
|
The number of rows in the DataFrame.
|
|
456
460
|
"""
|
|
461
|
+
if self.group_by_clause is not None:
|
|
462
|
+
raise excs.Error('count() cannot be used with group_by()')
|
|
463
|
+
|
|
457
464
|
from pixeltable.plan import Planner
|
|
458
465
|
|
|
459
466
|
stmt = Planner.create_count_stmt(self._first_tbl, self.where_clause)
|
|
@@ -573,10 +580,21 @@ class DataFrame:
|
|
|
573
580
|
raise excs.Error(f'Invalid expression: {raw_expr}')
|
|
574
581
|
if expr.col_type.is_invalid_type() and not (isinstance(expr, exprs.Literal) and expr.val is None):
|
|
575
582
|
raise excs.Error(f'Invalid type: {raw_expr}')
|
|
583
|
+
if len(self._from_clause.tbls) == 1:
|
|
584
|
+
# Select expressions need to be retargeted in order to handle snapshots correctly, as in expressions
|
|
585
|
+
# such as `snapshot.select(base_tbl.col)`
|
|
586
|
+
# TODO: For joins involving snapshots, we need a more sophisticated retarget() that can handle
|
|
587
|
+
# multiple TableVersionPaths.
|
|
588
|
+
expr = expr.copy()
|
|
589
|
+
try:
|
|
590
|
+
expr.retarget(self._from_clause.tbls[0])
|
|
591
|
+
except Exception:
|
|
592
|
+
# If retarget() fails, then the succeeding is_bound_by() will raise an error.
|
|
593
|
+
pass
|
|
576
594
|
if not expr.is_bound_by(self._from_clause.tbls):
|
|
577
595
|
raise excs.Error(
|
|
578
596
|
f"Expression '{expr}' cannot be evaluated in the context of this query's tables "
|
|
579
|
-
f'({",".join(tbl.
|
|
597
|
+
f'({",".join(tbl.tbl_version.get().versioned_name for tbl in self._from_clause.tbls)})'
|
|
580
598
|
)
|
|
581
599
|
select_list.append((expr, name))
|
|
582
600
|
|
|
@@ -823,16 +841,18 @@ class DataFrame:
|
|
|
823
841
|
grouping_tbl: Optional[catalog.TableVersion] = None
|
|
824
842
|
group_by_clause: Optional[list[exprs.Expr]] = None
|
|
825
843
|
for item in grouping_items:
|
|
826
|
-
if isinstance(item, catalog.Table):
|
|
844
|
+
if isinstance(item, (catalog.Table, catalog.TableVersion)):
|
|
827
845
|
if len(grouping_items) > 1:
|
|
828
846
|
raise excs.Error('group_by(): only one table can be specified')
|
|
829
847
|
if len(self._from_clause.tbls) > 1:
|
|
830
848
|
raise excs.Error('group_by() with Table not supported for joins')
|
|
849
|
+
grouping_tbl = item if isinstance(item, catalog.TableVersion) else item._tbl_version.get()
|
|
831
850
|
# we need to make sure that the grouping table is a base of self.tbl
|
|
832
|
-
base = self._first_tbl.find_tbl_version(
|
|
851
|
+
base = self._first_tbl.find_tbl_version(grouping_tbl.id)
|
|
833
852
|
if base is None or base.id == self._first_tbl.tbl_id():
|
|
834
|
-
raise excs.Error(
|
|
835
|
-
|
|
853
|
+
raise excs.Error(
|
|
854
|
+
f'group_by(): {grouping_tbl.name} is not a base table of {self._first_tbl.tbl_name()}'
|
|
855
|
+
)
|
|
836
856
|
break
|
|
837
857
|
if not isinstance(item, exprs.Expr):
|
|
838
858
|
raise excs.Error(f'Invalid expression in group_by(): {item}')
|
|
@@ -848,6 +868,29 @@ class DataFrame:
|
|
|
848
868
|
limit=self.limit_val,
|
|
849
869
|
)
|
|
850
870
|
|
|
871
|
+
def distinct(self) -> DataFrame:
|
|
872
|
+
"""
|
|
873
|
+
Remove duplicate rows from this DataFrame.
|
|
874
|
+
|
|
875
|
+
Note that grouping will be applied to the rows based on the select clause of this Dataframe.
|
|
876
|
+
In the absence of a select clause, by default, all columns are selected in the grouping.
|
|
877
|
+
|
|
878
|
+
Examples:
|
|
879
|
+
Select unique addresses from table `addresses`.
|
|
880
|
+
|
|
881
|
+
>>> results = addresses.distinct()
|
|
882
|
+
|
|
883
|
+
Select unique cities in table `addresses`
|
|
884
|
+
|
|
885
|
+
>>> results = addresses.city.distinct()
|
|
886
|
+
|
|
887
|
+
Select unique locations (street, city) in the state of `CA`
|
|
888
|
+
|
|
889
|
+
>>> results = addresses.select(addresses.street, addresses.city).where(addresses.state == 'CA').distinct()
|
|
890
|
+
"""
|
|
891
|
+
exps, _ = self._normalize_select_list(self._from_clause.tbls, self.select_list)
|
|
892
|
+
return self.group_by(*exps)
|
|
893
|
+
|
|
851
894
|
def order_by(self, *expr_list: exprs.Expr, asc: bool = True) -> DataFrame:
|
|
852
895
|
"""Add an order-by clause to this DataFrame.
|
|
853
896
|
|
pixeltable/env.py
CHANGED
|
@@ -610,7 +610,7 @@ class Env:
|
|
|
610
610
|
self.__register_package('datasets')
|
|
611
611
|
self.__register_package('fiftyone')
|
|
612
612
|
self.__register_package('fireworks', library_name='fireworks-ai')
|
|
613
|
-
self.__register_package('google.
|
|
613
|
+
self.__register_package('google.genai', library_name='google-genai')
|
|
614
614
|
self.__register_package('huggingface_hub', library_name='huggingface-hub')
|
|
615
615
|
self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
|
|
616
616
|
self.__register_package('llama_cpp', library_name='llama-cpp-python')
|
|
@@ -24,6 +24,7 @@ class AggregationNode(ExecNode):
|
|
|
24
24
|
agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
|
|
25
25
|
agg_fn_calls: list[exprs.FunctionCall]
|
|
26
26
|
output_batch: DataRowBatch
|
|
27
|
+
limit: Optional[int]
|
|
27
28
|
|
|
28
29
|
def __init__(
|
|
29
30
|
self,
|
|
@@ -45,6 +46,11 @@ class AggregationNode(ExecNode):
|
|
|
45
46
|
self.agg_fn_calls = [cast(exprs.FunctionCall, e) for e in self.agg_fn_eval_ctx.target_exprs]
|
|
46
47
|
# create output_batch here, rather than in __iter__(), so we don't need to remember tbl and row_builder
|
|
47
48
|
self.output_batch = DataRowBatch(tbl, row_builder, 0)
|
|
49
|
+
self.limit = None
|
|
50
|
+
|
|
51
|
+
def set_limit(self, limit: int) -> None:
|
|
52
|
+
# we can't propagate the limit to our input
|
|
53
|
+
self.limit = limit
|
|
48
54
|
|
|
49
55
|
def _reset_agg_state(self, row_num: int) -> None:
|
|
50
56
|
for fn_call in self.agg_fn_calls:
|
|
@@ -69,21 +75,29 @@ class AggregationNode(ExecNode):
|
|
|
69
75
|
prev_row: Optional[exprs.DataRow] = None
|
|
70
76
|
current_group: Optional[list[Any]] = None # the values of the group-by exprs
|
|
71
77
|
num_input_rows = 0
|
|
78
|
+
num_output_rows = 0
|
|
72
79
|
async for row_batch in self.input:
|
|
73
80
|
num_input_rows += len(row_batch)
|
|
74
81
|
for row in row_batch:
|
|
75
82
|
group = [row[e.slot_idx] for e in self.group_by] if self.group_by is not None else None
|
|
83
|
+
|
|
76
84
|
if current_group is None:
|
|
77
85
|
current_group = group
|
|
78
86
|
self._reset_agg_state(0)
|
|
87
|
+
|
|
79
88
|
if group != current_group:
|
|
80
89
|
# we're entering a new group, emit a row for the previous one
|
|
81
90
|
self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
|
|
82
91
|
self.output_batch.add_row(prev_row)
|
|
92
|
+
num_output_rows += 1
|
|
93
|
+
if self.limit is not None and num_output_rows == self.limit:
|
|
94
|
+
yield self.output_batch
|
|
95
|
+
return
|
|
83
96
|
current_group = group
|
|
84
97
|
self._reset_agg_state(0)
|
|
85
98
|
self._update_agg_state(row, 0)
|
|
86
99
|
prev_row = row
|
|
100
|
+
|
|
87
101
|
if prev_row is not None:
|
|
88
102
|
# emit the last group
|
|
89
103
|
self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
|
|
@@ -167,7 +167,7 @@ class CachePrefetchNode(ExecNode):
|
|
|
167
167
|
assert not self.input_finished
|
|
168
168
|
input_batch: Optional[DataRowBatch]
|
|
169
169
|
try:
|
|
170
|
-
input_batch = await input
|
|
170
|
+
input_batch = await anext(input)
|
|
171
171
|
except StopAsyncIteration:
|
|
172
172
|
input_batch = None
|
|
173
173
|
if input_batch is None:
|
|
@@ -115,7 +115,7 @@ class ExprEvalNode(ExecNode):
|
|
|
115
115
|
"""
|
|
116
116
|
assert not self.input_complete
|
|
117
117
|
try:
|
|
118
|
-
batch = await self.input_iter
|
|
118
|
+
batch = await anext(self.input_iter)
|
|
119
119
|
assert self.next_input_batch is None
|
|
120
120
|
if self.current_input_batch is None:
|
|
121
121
|
self.current_input_batch = batch
|
pixeltable/exprs/column_ref.py
CHANGED
|
@@ -31,12 +31,18 @@ class ColumnRef(Expr):
|
|
|
31
31
|
- in that case, the ColumnRef also instantiates a second non-validating ColumnRef as a component (= dependency)
|
|
32
32
|
- the non-validating ColumnRef is used for SQL translation
|
|
33
33
|
|
|
34
|
+
A ColumnRef may have an optional reference table, which carries the context of the ColumnRef resolution. Thus
|
|
35
|
+
if `v` is a view of `t` (for example), then `v.my_col` and `t.my_col` refer to the same underlying column, but
|
|
36
|
+
their reference tables will be `v` and `t`, respectively. This is to ensure correct behavior of expressions such
|
|
37
|
+
as `v.my_col.head()`.
|
|
38
|
+
|
|
34
39
|
TODO:
|
|
35
40
|
separate Exprs (like validating ColumnRefs) from the logical expression tree and instead have RowBuilder
|
|
36
41
|
insert them into the EvalCtxs as needed
|
|
37
42
|
"""
|
|
38
43
|
|
|
39
44
|
col: catalog.Column
|
|
45
|
+
reference_tbl: Optional[catalog.TableVersionPath]
|
|
40
46
|
is_unstored_iter_col: bool
|
|
41
47
|
iter_arg_ctx: Optional[RowBuilder.EvalCtx]
|
|
42
48
|
base_rowid_len: int
|
|
@@ -46,10 +52,16 @@ class ColumnRef(Expr):
|
|
|
46
52
|
id: int
|
|
47
53
|
perform_validation: bool # if True, performs media validation
|
|
48
54
|
|
|
49
|
-
def __init__(
|
|
55
|
+
def __init__(
|
|
56
|
+
self,
|
|
57
|
+
col: catalog.Column,
|
|
58
|
+
reference_tbl: Optional[catalog.TableVersionPath] = None,
|
|
59
|
+
perform_validation: Optional[bool] = None,
|
|
60
|
+
):
|
|
50
61
|
super().__init__(col.col_type)
|
|
51
62
|
assert col.tbl is not None
|
|
52
63
|
self.col = col
|
|
64
|
+
self.reference_tbl = reference_tbl
|
|
53
65
|
self.is_unstored_iter_col = (
|
|
54
66
|
col.tbl.get().is_component_view and col.tbl.get().is_iterator_column(col) and not col.is_stored
|
|
55
67
|
)
|
|
@@ -95,7 +107,7 @@ class ColumnRef(Expr):
|
|
|
95
107
|
target = tbl_versions[self.col.tbl.id]
|
|
96
108
|
assert self.col.id in target.cols_by_id
|
|
97
109
|
col = target.cols_by_id[self.col.id]
|
|
98
|
-
return ColumnRef(col)
|
|
110
|
+
return ColumnRef(col, self.reference_tbl)
|
|
99
111
|
|
|
100
112
|
def __getattr__(self, name: str) -> Expr:
|
|
101
113
|
from .column_property_ref import ColumnPropertyRef
|
|
@@ -126,26 +138,26 @@ class ColumnRef(Expr):
|
|
|
126
138
|
|
|
127
139
|
return super().__getattr__(name)
|
|
128
140
|
|
|
129
|
-
@classmethod
|
|
130
141
|
def find_embedding_index(
|
|
131
|
-
|
|
142
|
+
self, idx_name: Optional[str], method_name: str
|
|
132
143
|
) -> dict[str, catalog.TableVersion.IndexInfo]:
|
|
133
144
|
"""Return IndexInfo for a column, with an optional given name"""
|
|
134
|
-
# determine index to use
|
|
135
|
-
idx_info_dict = col.get_idx_info()
|
|
136
145
|
from pixeltable import index
|
|
137
146
|
|
|
147
|
+
# determine index to use
|
|
148
|
+
idx_info_dict = self.col.get_idx_info(self.reference_tbl)
|
|
149
|
+
|
|
138
150
|
embedding_idx_info = {
|
|
139
151
|
info: value for info, value in idx_info_dict.items() if isinstance(value.idx, index.EmbeddingIndex)
|
|
140
152
|
}
|
|
141
153
|
if len(embedding_idx_info) == 0:
|
|
142
|
-
raise excs.Error(f'No indices found for {method_name!r} on column {col.name!r}')
|
|
154
|
+
raise excs.Error(f'No indices found for {method_name!r} on column {self.col.name!r}')
|
|
143
155
|
if idx_name is not None and idx_name not in embedding_idx_info:
|
|
144
|
-
raise excs.Error(f'Index {idx_name!r} not found for {method_name!r} on column {col.name!r}')
|
|
156
|
+
raise excs.Error(f'Index {idx_name!r} not found for {method_name!r} on column {self.col.name!r}')
|
|
145
157
|
if len(embedding_idx_info) > 1:
|
|
146
158
|
if idx_name is None:
|
|
147
159
|
raise excs.Error(
|
|
148
|
-
f'Column {col.name!r} has multiple indices; use the index name to disambiguate: '
|
|
160
|
+
f'Column {self.col.name!r} has multiple indices; use the index name to disambiguate: '
|
|
149
161
|
f'`{method_name}(..., idx=<index_name>)`'
|
|
150
162
|
)
|
|
151
163
|
idx_info = {idx_name: embedding_idx_info[idx_name]}
|
|
@@ -159,7 +171,7 @@ class ColumnRef(Expr):
|
|
|
159
171
|
return SimilarityExpr(self, item, idx_name=idx)
|
|
160
172
|
|
|
161
173
|
def embedding(self, *, idx: Optional[str] = None) -> ColumnRef:
|
|
162
|
-
idx_info =
|
|
174
|
+
idx_info = self.find_embedding_index(idx, 'embedding')
|
|
163
175
|
assert len(idx_info) == 1
|
|
164
176
|
col = copy.copy(next(iter(idx_info.values())).val_col)
|
|
165
177
|
col.name = f'{self.col.name}_embedding_{idx if idx is not None else ""}'
|
|
@@ -167,14 +179,21 @@ class ColumnRef(Expr):
|
|
|
167
179
|
return ColumnRef(col)
|
|
168
180
|
|
|
169
181
|
def default_column_name(self) -> Optional[str]:
|
|
170
|
-
return
|
|
182
|
+
return self.col.name if self.col is not None else None
|
|
171
183
|
|
|
172
184
|
def _equals(self, other: ColumnRef) -> bool:
|
|
173
185
|
return self.col == other.col and self.perform_validation == other.perform_validation
|
|
174
186
|
|
|
175
187
|
def _df(self) -> 'pxt.dataframe.DataFrame':
|
|
176
|
-
|
|
177
|
-
|
|
188
|
+
from pixeltable import plan
|
|
189
|
+
|
|
190
|
+
if self.reference_tbl is None:
|
|
191
|
+
# No reference table; use the current version of the table to which the column belongs
|
|
192
|
+
tbl = catalog.Catalog.get().get_table_by_id(self.col.tbl.id)
|
|
193
|
+
return tbl.select(self)
|
|
194
|
+
else:
|
|
195
|
+
# Explicit reference table; construct a DataFrame directly from it
|
|
196
|
+
return pxt.DataFrame(plan.FromClause([self.reference_tbl])).select(self)
|
|
178
197
|
|
|
179
198
|
def show(self, *args: Any, **kwargs: Any) -> 'pxt.dataframe.DataFrameResultSet':
|
|
180
199
|
return self._df().show(*args, **kwargs)
|
|
@@ -188,6 +207,10 @@ class ColumnRef(Expr):
|
|
|
188
207
|
def count(self) -> int:
|
|
189
208
|
return self._df().count()
|
|
190
209
|
|
|
210
|
+
def distinct(self) -> 'pxt.dataframe.DataFrame':
|
|
211
|
+
"""Return distinct values in this column."""
|
|
212
|
+
return self._df().distinct()
|
|
213
|
+
|
|
191
214
|
def __str__(self) -> str:
|
|
192
215
|
if self.col.name is None:
|
|
193
216
|
return f'<unnamed column {self.col.id}>'
|
|
@@ -203,7 +226,7 @@ class ColumnRef(Expr):
|
|
|
203
226
|
def _descriptors(self) -> DescriptionHelper:
|
|
204
227
|
tbl = catalog.Catalog.get().get_table_by_id(self.col.tbl.id)
|
|
205
228
|
helper = DescriptionHelper()
|
|
206
|
-
helper.append(f'Column\n{self.col.name!r}\n(of table {tbl._path
|
|
229
|
+
helper.append(f'Column\n{self.col.name!r}\n(of table {tbl._path!r})')
|
|
207
230
|
helper.append(tbl._col_descriptor([self.col.name]))
|
|
208
231
|
idxs = tbl._index_descriptor([self.col.name])
|
|
209
232
|
if len(idxs) > 0:
|
|
@@ -260,13 +283,14 @@ class ColumnRef(Expr):
|
|
|
260
283
|
|
|
261
284
|
def _as_dict(self) -> dict:
|
|
262
285
|
tbl = self.col.tbl
|
|
263
|
-
|
|
286
|
+
tbl_version = tbl.get().version if tbl.get().is_snapshot else None
|
|
264
287
|
# we omit self.components, even if this is a validating ColumnRef, because init() will recreate the
|
|
265
288
|
# non-validating component ColumnRef
|
|
266
289
|
return {
|
|
267
290
|
'tbl_id': str(tbl.id),
|
|
268
|
-
'tbl_version':
|
|
291
|
+
'tbl_version': tbl_version,
|
|
269
292
|
'col_id': self.col.id,
|
|
293
|
+
'reference_tbl': self.reference_tbl.as_dict() if self.reference_tbl is not None else None,
|
|
270
294
|
'perform_validation': self.perform_validation,
|
|
271
295
|
}
|
|
272
296
|
|
|
@@ -281,5 +305,6 @@ class ColumnRef(Expr):
|
|
|
281
305
|
@classmethod
|
|
282
306
|
def _from_dict(cls, d: dict, _: list[Expr]) -> ColumnRef:
|
|
283
307
|
col = cls.get_column(d)
|
|
308
|
+
reference_tbl = None if d['reference_tbl'] is None else catalog.TableVersionPath.from_dict(d['reference_tbl'])
|
|
284
309
|
perform_validation = d['perform_validation']
|
|
285
|
-
return cls(col, perform_validation=perform_validation)
|
|
310
|
+
return cls(col, reference_tbl, perform_validation=perform_validation)
|
pixeltable/exprs/data_row.py
CHANGED
|
@@ -29,10 +29,13 @@ class DataRow:
|
|
|
29
29
|
- FloatType: float
|
|
30
30
|
- BoolType: bool
|
|
31
31
|
- TimestampType: datetime.datetime
|
|
32
|
+
- DateType: datetime.date
|
|
32
33
|
- JsonType: json-serializable object
|
|
33
34
|
- ArrayType: numpy.ndarray
|
|
34
35
|
- ImageType: PIL.Image.Image
|
|
35
36
|
- VideoType: local path if available, otherwise url
|
|
37
|
+
- AudioType: local path if available, otherwise url
|
|
38
|
+
- DocumentType: local path if available, otherwise url
|
|
36
39
|
"""
|
|
37
40
|
|
|
38
41
|
vals: np.ndarray # of object
|
pixeltable/exprs/globals.py
CHANGED
|
@@ -5,7 +5,7 @@ import enum
|
|
|
5
5
|
from typing import Union
|
|
6
6
|
|
|
7
7
|
# Python types corresponding to our literal types
|
|
8
|
-
LiteralPythonTypes = Union[str, int, float, bool, datetime.datetime]
|
|
8
|
+
LiteralPythonTypes = Union[str, int, float, bool, datetime.datetime, datetime.date]
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def print_slice(s: slice) -> str:
|