PyPI - pixeltable - Versions diffs - 0.3.13__py3-none-any.whl → 0.3.15__py3-none-any.whl - Mend

pixeltable 0.3.13__py3-none-any.whl → 0.3.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (50) hide show

pixeltable/__init__.py +2 -2
pixeltable/__version__.py +2 -2
pixeltable/catalog/catalog.py +9 -7
pixeltable/catalog/column.py +6 -2
pixeltable/catalog/dir.py +2 -1
pixeltable/catalog/insertable_table.py +1 -1
pixeltable/catalog/schema_object.py +2 -1
pixeltable/catalog/table.py +12 -8
pixeltable/catalog/table_version.py +21 -0
pixeltable/catalog/view.py +3 -3
pixeltable/dataframe.py +48 -5
pixeltable/env.py +1 -1
pixeltable/exec/aggregation_node.py +14 -0
pixeltable/exec/cache_prefetch_node.py +1 -1
pixeltable/exec/expr_eval/expr_eval_node.py +1 -1
pixeltable/exprs/column_ref.py +42 -17
pixeltable/exprs/data_row.py +3 -0
pixeltable/exprs/globals.py +1 -1
pixeltable/exprs/literal.py +11 -1
pixeltable/exprs/rowid_ref.py +4 -1
pixeltable/exprs/similarity_expr.py +1 -1
pixeltable/func/function.py +1 -1
pixeltable/functions/__init__.py +1 -0
pixeltable/functions/date.py +185 -0
pixeltable/functions/gemini.py +184 -49
pixeltable/functions/globals.py +1 -16
pixeltable/functions/json.py +2 -1
pixeltable/functions/math.py +103 -0
pixeltable/functions/string.py +1 -2
pixeltable/functions/video.py +2 -2
pixeltable/globals.py +26 -9
pixeltable/io/hf_datasets.py +2 -2
pixeltable/io/pandas.py +16 -4
pixeltable/io/parquet.py +4 -2
pixeltable/metadata/__init__.py +1 -1
pixeltable/metadata/converters/convert_34.py +21 -0
pixeltable/metadata/notes.py +1 -0
pixeltable/plan.py +12 -5
pixeltable/share/__init__.py +1 -1
pixeltable/share/packager.py +397 -120
pixeltable/share/publish.py +61 -16
pixeltable/store.py +57 -20
pixeltable/type_system.py +46 -2
pixeltable/utils/arrow.py +8 -2
pixeltable/utils/pytorch.py +4 -0
{pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/METADATA +2 -4
{pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/RECORD +50 -48
{pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/LICENSE +0 -0
{pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/WHEEL +0 -0
{pixeltable-0.3.13.dist-info → pixeltable-0.3.15.dist-info}/entry_points.txt +0 -0

pixeltable/__init__.py CHANGED Viewed

@@ -9,6 +9,7 @@ from .globals import (
     array,
     configure_logging,
     create_dir,
+    create_replica,
     create_snapshot,
     create_table,
     create_view,
@@ -20,11 +21,10 @@ from .globals import (
     list_functions,
     list_tables,
     move,
-    publish_snapshot,
     tool,
     tools,
 )
-from .type_system import Array, Audio, Bool, Document, Float, Image, Int, Json, Required, String, Timestamp, Video
+from .type_system import Array, Audio, Bool, Date, Document, Float, Image, Int, Json, Required, String, Timestamp, Video
 # This import must go last to avoid circular imports.
 from . import ext, functions, io, iterators  # isort: skip

pixeltable/__version__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 # These version placeholders will be replaced during build.
-__version__ = '0.3.13'
-__version_tuple__ = (0, 3, 13)
+__version__ = '0.3.15'
+__version_tuple__ = (0, 3, 15)

pixeltable/catalog/catalog.py CHANGED Viewed

@@ -432,7 +432,9 @@ class Catalog:
         return view
     @_retry_loop
-    def create_replica(self, path: Path, md: list[schema.FullTableMd], if_exists: IfExistsParam) -> Table:
+    def create_replica(
+        self, path: Path, md: list[schema.FullTableMd], if_exists: IfExistsParam = IfExistsParam.ERROR
+    ) -> Table:
         """
         Creates table, table_version, and table_schema_version records for a replica with the given metadata.
         The metadata should be presented in standard "ancestor order", with the table being replicated at
@@ -458,11 +460,11 @@ class Catalog:
         # TODO: Handle concurrency in create_replica()
         existing = Catalog.get().get_table_by_id(tbl_id)
         if existing is not None:
-            existing_path = Path(existing._path(), allow_system_paths=True)
+            existing_path = Path(existing._path, allow_system_paths=True)
             # It does exist. If it's a non-system table, that's an error: it's already been replicated.
             if not existing_path.is_system_path:
                 raise excs.Error(
-                    f'That table has already been replicated as {existing._path()!r}. \n'
+                    f'That table has already been replicated as {existing._path!r}. \n'
                     f'Drop the existing replica if you wish to re-create it.'
                 )
             # If it's a system table, then this means it was created at some point as the ancestor of some other
@@ -487,7 +489,7 @@ class Catalog:
                 # The table already exists in the catalog. The existing path might be a system path (if the table
                 # was created as an anonymous base table of some other table), or it might not (if it's a snapshot
                 # that was directly replicated by the user at some point). In either case, use the existing path.
-                replica_path = Path(replica._path(), allow_system_paths=True)
+                replica_path = Path(replica._path, allow_system_paths=True)
             # Store the metadata; it could be a new version (in which case a new record will be created) or a
             # known version (in which case the newly received metadata will be validated as identical).
@@ -619,11 +621,11 @@ class Catalog:
                 msg: str
                 if is_replace:
                     msg = (
-                        f'{obj_type_str} {tbl._path()} already exists and has dependents. '
+                        f'{obj_type_str} {tbl._path} already exists and has dependents. '
                         "Use `if_exists='replace_force'` to replace it."
                     )
                 else:
-                    msg = f'{obj_type_str} {tbl._path()} has dependents.'
+                    msg = f'{obj_type_str} {tbl._path} has dependents.'
                 raise excs.Error(msg)
             for view_id in view_ids:
@@ -634,7 +636,7 @@ class Catalog:
         tbl._drop()
         assert tbl._id in self._tbls
         del self._tbls[tbl._id]
-        _logger.info(f'Dropped table `{tbl._path()}`.')
+        _logger.info(f'Dropped table `{tbl._path}`.')
     @_retry_loop
     def create_dir(self, path: Path, if_exists: IfExistsParam, parents: bool) -> Dir:

pixeltable/catalog/column.py CHANGED Viewed

@@ -16,6 +16,7 @@ from .globals import MediaValidation, is_valid_identifier
 if TYPE_CHECKING:
     from .table_version import TableVersion
     from .table_version_handle import TableVersionHandle
+    from .table_version_path import TableVersionPath
 _logger = logging.getLogger('pixeltable')
@@ -170,9 +171,12 @@ class Column:
         )
         return len(window_fn_calls) > 0
-    def get_idx_info(self) -> dict[str, 'TableVersion.IndexInfo']:
+    # TODO: This should be moved out of `Column` (its presence in `Column` doesn't anticipate indices being defined on
+    #     multiple dependents)
+    def get_idx_info(self, reference_tbl: Optional['TableVersionPath'] = None) -> dict[str, 'TableVersion.IndexInfo']:
         assert self.tbl is not None
-        return {name: info for name, info in self.tbl.get().idxs_by_name.items() if info.col == self}
+        tbl = reference_tbl.tbl_version if reference_tbl is not None else self.tbl
+        return {name: info for name, info in tbl.get().idxs_by_name.items() if info.col == self}
     @property
     def is_computed(self) -> bool:

pixeltable/catalog/dir.py CHANGED Viewed

@@ -38,12 +38,13 @@ class Dir(SchemaObject):
     def _display_name(cls) -> str:
         return 'directory'
+    @property
     def _path(self) -> str:
         """Returns the path to this schema object."""
         if self._dir_id is None:
             # we're the root dir
             return ''
-        return super()._path()
+        return super()._path
     def _move(self, new_name: str, new_dir_id: UUID) -> None:
         # print(

pixeltable/catalog/insertable_table.py CHANGED Viewed

@@ -238,4 +238,4 @@ class InsertableTable(Table):
         return []
     def _table_descriptor(self) -> str:
-        return f'Table {self._path()!r}'
+        return f'Table {self._path!r}'

pixeltable/catalog/schema_object.py CHANGED Viewed

@@ -33,6 +33,7 @@ class SchemaObject:
                 return None
             return Catalog.get().get_dir(self._dir_id)
+    @property
     def _path(self) -> str:
         """Returns the path to this schema object."""
         from .catalog import Catalog
@@ -44,7 +45,7 @@ class SchemaObject:
     def get_metadata(self) -> dict[str, Any]:
         """Returns metadata associated with this schema object."""
-        return {'name': self._name, 'path': self._path()}
+        return {'name': self._name, 'path': self._path}
     @classmethod
     @abstractmethod

pixeltable/catalog/table.py CHANGED Viewed

@@ -109,7 +109,7 @@ class Table(SchemaObject):
         self._check_is_dropped()
         with env.Env.get().begin_xact():
             md = super().get_metadata()
-            md['base'] = self._base_table._path() if self._base_table is not None else None
+            md['base'] = self._base_table._path if self._base_table is not None else None
             md['schema'] = self._schema
             md['is_replica'] = self._tbl_version.get().is_replica
             md['version'] = self._version
@@ -146,7 +146,7 @@ class Table(SchemaObject):
         col = self._tbl_version_path.get_column(name)
         if col is None:
             raise AttributeError(f'Column {name!r} unknown')
-        return ColumnRef(col)
+        return ColumnRef(col, reference_tbl=self._tbl_version_path)
     def __getitem__(self, name: str) -> 'exprs.ColumnRef':
         """Return a ColumnRef for the given name."""
@@ -165,7 +165,7 @@ class Table(SchemaObject):
         """
         self._check_is_dropped()
         with env.Env.get().begin_xact():
-            return [t._path() for t in self._get_views(recursive=recursive)]
+            return [t._path for t in self._get_views(recursive=recursive)]
     def _get_views(self, *, recursive: bool = True) -> list['Table']:
         cat = catalog.Catalog.get()
@@ -220,6 +220,10 @@ class Table(SchemaObject):
         """
         return self._df().group_by(*items)
+    def distinct(self) -> 'pxt.DataFrame':
+        """Remove duplicate rows from table."""
+        return self._df().distinct()
     def limit(self, n: int) -> 'pxt.DataFrame':
         return self._df().limit(n)
@@ -254,11 +258,15 @@ class Table(SchemaObject):
         """Return the schema (column names and column types) of this table."""
         return {c.name: c.col_type for c in self._tbl_version_path.columns()}
+    @property
+    def base_table(self) -> Optional['Table']:
+        with env.Env.get().begin_xact():
+            return self._base_table
     @property
     @abc.abstractmethod
     def _base_table(self) -> Optional['Table']:
         """The base's Table instance"""
-        ...
     @property
     def _base_tables(self) -> list['Table']:
@@ -274,7 +282,6 @@ class Table(SchemaObject):
     @abc.abstractmethod
     def _effective_base_versions(self) -> list[Optional[int]]:
         """The effective versions of the ancestor bases, starting with its immediate base."""
-        ...
     @property
     def _comment(self) -> str:
@@ -311,9 +318,6 @@ class Table(SchemaObject):
             helper.append(f'COMMENT: {self._comment}')
         return helper
-    @abc.abstractmethod
-    def _table_descriptor(self) -> str: ...
     def _col_descriptor(self, columns: Optional[list[str]] = None) -> pd.DataFrame:
         return pd.DataFrame(
             {

pixeltable/catalog/table_version.py CHANGED Viewed

@@ -202,6 +202,13 @@ class TableVersion:
         return TableVersionHandle(self.id, self.effective_version, tbl_version=self)
+    @property
+    def versioned_name(self) -> str:
+        if self.effective_version is None:
+            return self.name
+        else:
+            return f'{self.name}:{self.effective_version}'
     @classmethod
     def create(
         cls,
@@ -314,6 +321,20 @@ class TableVersion:
         session.add(schema_version_record)
         return tbl_record.id, tbl_version
+    @classmethod
+    def create_replica(cls, md: schema.FullTableMd) -> TableVersion:
+        tbl_id = UUID(md.tbl_md.tbl_id)
+        _logger.info(f'Creating replica table version {tbl_id}:{md.version_md.version}.')
+        view_md = md.tbl_md.view_md
+        base_path = pxt.catalog.TableVersionPath.from_md(view_md.base_versions) if view_md is not None else None
+        base = base_path.tbl_version if base_path is not None else None
+        tbl_version = cls(
+            tbl_id, md.tbl_md, md.version_md.version, md.schema_version_md, [], base_path=base_path, base=base
+        )
+        tbl_version.store_tbl.create()
+        tbl_version.store_tbl.ensure_columns_exist(col for col in tbl_version.cols if col.is_stored)
+        return tbl_version
     def drop(self) -> None:
         from .catalog import Catalog

pixeltable/catalog/view.py CHANGED Viewed

@@ -285,13 +285,13 @@ class View(Table):
     def _table_descriptor(self) -> str:
         display_name = 'Snapshot' if self._snapshot_only else 'View'
-        result = [f'{display_name} {self._path()!r}']
+        result = [f'{display_name} {self._path!r}']
         bases_descrs: list[str] = []
         for base, effective_version in zip(self._base_tables, self._effective_base_versions):
             if effective_version is None:
-                bases_descrs.append(f'{base._path()!r}')
+                bases_descrs.append(f'{base._path!r}')
             else:
-                base_descr = f'{base._path()}:{effective_version}'
+                base_descr = f'{base._path}:{effective_version}'
                 bases_descrs.append(f'{base_descr!r}')
         result.append(f' (of {", ".join(bases_descrs)})')

pixeltable/dataframe.py CHANGED Viewed

@@ -322,6 +322,8 @@ class DataFrame:
             raise excs.Error('head() cannot be used with order_by()')
         if self._has_joins():
             raise excs.Error('head() not supported for joins')
+        if self.group_by_clause is not None:
+            raise excs.Error('head() cannot be used with group_by()')
         num_rowid_cols = len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
         order_by_clause = [exprs.RowidRef(self._first_tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
         return self.order_by(*order_by_clause, asc=True).limit(n).collect()
@@ -345,6 +347,8 @@ class DataFrame:
             raise excs.Error('tail() cannot be used with order_by()')
         if self._has_joins():
             raise excs.Error('tail() not supported for joins')
+        if self.group_by_clause is not None:
+            raise excs.Error('tail() cannot be used with group_by()')
         num_rowid_cols = len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
         order_by_clause = [exprs.RowidRef(self._first_tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
         result = self.order_by(*order_by_clause, asc=False).limit(n).collect()
@@ -454,6 +458,9 @@ class DataFrame:
         Returns:
             The number of rows in the DataFrame.
         """
+        if self.group_by_clause is not None:
+            raise excs.Error('count() cannot be used with group_by()')
         from pixeltable.plan import Planner
         stmt = Planner.create_count_stmt(self._first_tbl, self.where_clause)
@@ -573,10 +580,21 @@ class DataFrame:
                 raise excs.Error(f'Invalid expression: {raw_expr}')
             if expr.col_type.is_invalid_type() and not (isinstance(expr, exprs.Literal) and expr.val is None):
                 raise excs.Error(f'Invalid type: {raw_expr}')
+            if len(self._from_clause.tbls) == 1:
+                # Select expressions need to be retargeted in order to handle snapshots correctly, as in expressions
+                # such as `snapshot.select(base_tbl.col)`
+                # TODO: For joins involving snapshots, we need a more sophisticated retarget() that can handle
+                #     multiple TableVersionPaths.
+                expr = expr.copy()
+                try:
+                    expr.retarget(self._from_clause.tbls[0])
+                except Exception:
+                    # If retarget() fails, then the succeeding is_bound_by() will raise an error.
+                    pass
             if not expr.is_bound_by(self._from_clause.tbls):
                 raise excs.Error(
                     f"Expression '{expr}' cannot be evaluated in the context of this query's tables "
-                    f'({",".join(tbl.tbl_name() for tbl in self._from_clause.tbls)})'
+                    f'({",".join(tbl.tbl_version.get().versioned_name for tbl in self._from_clause.tbls)})'
                 )
             select_list.append((expr, name))
@@ -823,16 +841,18 @@ class DataFrame:
         grouping_tbl: Optional[catalog.TableVersion] = None
         group_by_clause: Optional[list[exprs.Expr]] = None
         for item in grouping_items:
-            if isinstance(item, catalog.Table):
+            if isinstance(item, (catalog.Table, catalog.TableVersion)):
                 if len(grouping_items) > 1:
                     raise excs.Error('group_by(): only one table can be specified')
                 if len(self._from_clause.tbls) > 1:
                     raise excs.Error('group_by() with Table not supported for joins')
+                grouping_tbl = item if isinstance(item, catalog.TableVersion) else item._tbl_version.get()
                 # we need to make sure that the grouping table is a base of self.tbl
-                base = self._first_tbl.find_tbl_version(item._tbl_version_path.tbl_id())
+                base = self._first_tbl.find_tbl_version(grouping_tbl.id)
                 if base is None or base.id == self._first_tbl.tbl_id():
-                    raise excs.Error(f'group_by(): {item._name} is not a base table of {self._first_tbl.tbl_name()}')
-                grouping_tbl = item._tbl_version_path.tbl_version.get()
+                    raise excs.Error(
+                        f'group_by(): {grouping_tbl.name} is not a base table of {self._first_tbl.tbl_name()}'
+                    )
                 break
             if not isinstance(item, exprs.Expr):
                 raise excs.Error(f'Invalid expression in group_by(): {item}')
@@ -848,6 +868,29 @@ class DataFrame:
             limit=self.limit_val,
         )
+    def distinct(self) -> DataFrame:
+        """
+        Remove duplicate rows from this DataFrame.
+        Note that grouping will be applied to the rows based on the select clause of this Dataframe.
+        In the absence of a select clause, by default, all columns are selected in the grouping.
+        Examples:
+            Select unique addresses from table `addresses`.
+            >>> results = addresses.distinct()
+            Select unique cities in table `addresses`
+            >>> results = addresses.city.distinct()
+            Select unique locations (street, city) in the state of `CA`
+            >>> results = addresses.select(addresses.street, addresses.city).where(addresses.state == 'CA').distinct()
+        """
+        exps, _ = self._normalize_select_list(self._from_clause.tbls, self.select_list)
+        return self.group_by(*exps)
     def order_by(self, *expr_list: exprs.Expr, asc: bool = True) -> DataFrame:
         """Add an order-by clause to this DataFrame.

pixeltable/env.py CHANGED Viewed

@@ -610,7 +610,7 @@ class Env:
         self.__register_package('datasets')
         self.__register_package('fiftyone')
         self.__register_package('fireworks', library_name='fireworks-ai')
-        self.__register_package('google.generativeai', library_name='google-generativeai')
+        self.__register_package('google.genai', library_name='google-genai')
         self.__register_package('huggingface_hub', library_name='huggingface-hub')
         self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
         self.__register_package('llama_cpp', library_name='llama-cpp-python')

pixeltable/exec/aggregation_node.py CHANGED Viewed

@@ -24,6 +24,7 @@ class AggregationNode(ExecNode):
     agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
     agg_fn_calls: list[exprs.FunctionCall]
     output_batch: DataRowBatch
+    limit: Optional[int]
     def __init__(
         self,
@@ -45,6 +46,11 @@ class AggregationNode(ExecNode):
         self.agg_fn_calls = [cast(exprs.FunctionCall, e) for e in self.agg_fn_eval_ctx.target_exprs]
         # create output_batch here, rather than in __iter__(), so we don't need to remember tbl and row_builder
         self.output_batch = DataRowBatch(tbl, row_builder, 0)
+        self.limit = None
+    def set_limit(self, limit: int) -> None:
+        # we can't propagate the limit to our input
+        self.limit = limit
     def _reset_agg_state(self, row_num: int) -> None:
         for fn_call in self.agg_fn_calls:
@@ -69,21 +75,29 @@ class AggregationNode(ExecNode):
         prev_row: Optional[exprs.DataRow] = None
         current_group: Optional[list[Any]] = None  # the values of the group-by exprs
         num_input_rows = 0
+        num_output_rows = 0
         async for row_batch in self.input:
             num_input_rows += len(row_batch)
             for row in row_batch:
                 group = [row[e.slot_idx] for e in self.group_by] if self.group_by is not None else None
                 if current_group is None:
                     current_group = group
                     self._reset_agg_state(0)
                 if group != current_group:
                     # we're entering a new group, emit a row for the previous one
                     self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
                     self.output_batch.add_row(prev_row)
+                    num_output_rows += 1
+                    if self.limit is not None and num_output_rows == self.limit:
+                        yield self.output_batch
+                        return
                     current_group = group
                     self._reset_agg_state(0)
                 self._update_agg_state(row, 0)
                 prev_row = row
         if prev_row is not None:
             # emit the last group
             self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)

pixeltable/exec/cache_prefetch_node.py CHANGED Viewed

@@ -167,7 +167,7 @@ class CachePrefetchNode(ExecNode):
         assert not self.input_finished
         input_batch: Optional[DataRowBatch]
         try:
-            input_batch = await input.__anext__()
+            input_batch = await anext(input)
         except StopAsyncIteration:
             input_batch = None
         if input_batch is None:

pixeltable/exec/expr_eval/expr_eval_node.py CHANGED Viewed

@@ -115,7 +115,7 @@ class ExprEvalNode(ExecNode):
         """
         assert not self.input_complete
         try:
-            batch = await self.input_iter.__anext__()
+            batch = await anext(self.input_iter)
             assert self.next_input_batch is None
             if self.current_input_batch is None:
                 self.current_input_batch = batch

pixeltable/exprs/column_ref.py CHANGED Viewed

@@ -31,12 +31,18 @@ class ColumnRef(Expr):
     - in that case, the ColumnRef also instantiates a second non-validating ColumnRef as a component (= dependency)
     - the non-validating ColumnRef is used for SQL translation
+    A ColumnRef may have an optional reference table, which carries the context of the ColumnRef resolution. Thus
+    if `v` is a view of `t` (for example), then `v.my_col` and `t.my_col` refer to the same underlying column, but
+    their reference tables will be `v` and `t`, respectively. This is to ensure correct behavior of expressions such
+    as `v.my_col.head()`.
     TODO:
     separate Exprs (like validating ColumnRefs) from the logical expression tree and instead have RowBuilder
     insert them into the EvalCtxs as needed
     """
     col: catalog.Column
+    reference_tbl: Optional[catalog.TableVersionPath]
     is_unstored_iter_col: bool
     iter_arg_ctx: Optional[RowBuilder.EvalCtx]
     base_rowid_len: int
@@ -46,10 +52,16 @@ class ColumnRef(Expr):
     id: int
     perform_validation: bool  # if True, performs media validation
-    def __init__(self, col: catalog.Column, perform_validation: Optional[bool] = None):
+    def __init__(
+        self,
+        col: catalog.Column,
+        reference_tbl: Optional[catalog.TableVersionPath] = None,
+        perform_validation: Optional[bool] = None,
+    ):
         super().__init__(col.col_type)
         assert col.tbl is not None
         self.col = col
+        self.reference_tbl = reference_tbl
         self.is_unstored_iter_col = (
             col.tbl.get().is_component_view and col.tbl.get().is_iterator_column(col) and not col.is_stored
         )
@@ -95,7 +107,7 @@ class ColumnRef(Expr):
         target = tbl_versions[self.col.tbl.id]
         assert self.col.id in target.cols_by_id
         col = target.cols_by_id[self.col.id]
-        return ColumnRef(col)
+        return ColumnRef(col, self.reference_tbl)
     def __getattr__(self, name: str) -> Expr:
         from .column_property_ref import ColumnPropertyRef
@@ -126,26 +138,26 @@ class ColumnRef(Expr):
         return super().__getattr__(name)
-    @classmethod
     def find_embedding_index(
-        cls, col: catalog.Column, idx_name: Optional[str], method_name: str
+        self, idx_name: Optional[str], method_name: str
     ) -> dict[str, catalog.TableVersion.IndexInfo]:
         """Return IndexInfo for a column, with an optional given name"""
-        # determine index to use
-        idx_info_dict = col.get_idx_info()
         from pixeltable import index
+        # determine index to use
+        idx_info_dict = self.col.get_idx_info(self.reference_tbl)
         embedding_idx_info = {
             info: value for info, value in idx_info_dict.items() if isinstance(value.idx, index.EmbeddingIndex)
         }
         if len(embedding_idx_info) == 0:
-            raise excs.Error(f'No indices found for {method_name!r} on column {col.name!r}')
+            raise excs.Error(f'No indices found for {method_name!r} on column {self.col.name!r}')
         if idx_name is not None and idx_name not in embedding_idx_info:
-            raise excs.Error(f'Index {idx_name!r} not found for {method_name!r} on column {col.name!r}')
+            raise excs.Error(f'Index {idx_name!r} not found for {method_name!r} on column {self.col.name!r}')
         if len(embedding_idx_info) > 1:
             if idx_name is None:
                 raise excs.Error(
-                    f'Column {col.name!r} has multiple indices; use the index name to disambiguate: '
+                    f'Column {self.col.name!r} has multiple indices; use the index name to disambiguate: '
                     f'`{method_name}(..., idx=<index_name>)`'
                 )
             idx_info = {idx_name: embedding_idx_info[idx_name]}
@@ -159,7 +171,7 @@ class ColumnRef(Expr):
         return SimilarityExpr(self, item, idx_name=idx)
     def embedding(self, *, idx: Optional[str] = None) -> ColumnRef:
-        idx_info = ColumnRef.find_embedding_index(self.col, idx, 'embedding')
+        idx_info = self.find_embedding_index(idx, 'embedding')
         assert len(idx_info) == 1
         col = copy.copy(next(iter(idx_info.values())).val_col)
         col.name = f'{self.col.name}_embedding_{idx if idx is not None else ""}'
@@ -167,14 +179,21 @@ class ColumnRef(Expr):
         return ColumnRef(col)
     def default_column_name(self) -> Optional[str]:
-        return str(self)
+        return self.col.name if self.col is not None else None
     def _equals(self, other: ColumnRef) -> bool:
         return self.col == other.col and self.perform_validation == other.perform_validation
     def _df(self) -> 'pxt.dataframe.DataFrame':
-        tbl = catalog.Catalog.get().get_table_by_id(self.col.tbl.id)
-        return tbl.select(self)
+        from pixeltable import plan
+        if self.reference_tbl is None:
+            # No reference table; use the current version of the table to which the column belongs
+            tbl = catalog.Catalog.get().get_table_by_id(self.col.tbl.id)
+            return tbl.select(self)
+        else:
+            # Explicit reference table; construct a DataFrame directly from it
+            return pxt.DataFrame(plan.FromClause([self.reference_tbl])).select(self)
     def show(self, *args: Any, **kwargs: Any) -> 'pxt.dataframe.DataFrameResultSet':
         return self._df().show(*args, **kwargs)
@@ -188,6 +207,10 @@ class ColumnRef(Expr):
     def count(self) -> int:
         return self._df().count()
+    def distinct(self) -> 'pxt.dataframe.DataFrame':
+        """Return distinct values in this column."""
+        return self._df().distinct()
     def __str__(self) -> str:
         if self.col.name is None:
             return f'<unnamed column {self.col.id}>'
@@ -203,7 +226,7 @@ class ColumnRef(Expr):
     def _descriptors(self) -> DescriptionHelper:
         tbl = catalog.Catalog.get().get_table_by_id(self.col.tbl.id)
         helper = DescriptionHelper()
-        helper.append(f'Column\n{self.col.name!r}\n(of table {tbl._path()!r})')
+        helper.append(f'Column\n{self.col.name!r}\n(of table {tbl._path!r})')
         helper.append(tbl._col_descriptor([self.col.name]))
         idxs = tbl._index_descriptor([self.col.name])
         if len(idxs) > 0:
@@ -260,13 +283,14 @@ class ColumnRef(Expr):
     def _as_dict(self) -> dict:
         tbl = self.col.tbl
-        version = tbl.get().version if tbl.get().is_snapshot else None
+        tbl_version = tbl.get().version if tbl.get().is_snapshot else None
         # we omit self.components, even if this is a validating ColumnRef, because init() will recreate the
         # non-validating component ColumnRef
         return {
             'tbl_id': str(tbl.id),
-            'tbl_version': version,
+            'tbl_version': tbl_version,
             'col_id': self.col.id,
+            'reference_tbl': self.reference_tbl.as_dict() if self.reference_tbl is not None else None,
             'perform_validation': self.perform_validation,
         }
@@ -281,5 +305,6 @@ class ColumnRef(Expr):
     @classmethod
     def _from_dict(cls, d: dict, _: list[Expr]) -> ColumnRef:
         col = cls.get_column(d)
+        reference_tbl = None if d['reference_tbl'] is None else catalog.TableVersionPath.from_dict(d['reference_tbl'])
         perform_validation = d['perform_validation']
-        return cls(col, perform_validation=perform_validation)
+        return cls(col, reference_tbl, perform_validation=perform_validation)

pixeltable/exprs/data_row.py CHANGED Viewed

@@ -29,10 +29,13 @@ class DataRow:
     - FloatType: float
     - BoolType: bool
     - TimestampType: datetime.datetime
+    - DateType: datetime.date
     - JsonType: json-serializable object
     - ArrayType: numpy.ndarray
     - ImageType: PIL.Image.Image
     - VideoType: local path if available, otherwise url
+    - AudioType: local path if available, otherwise url
+    - DocumentType: local path if available, otherwise url
     """
     vals: np.ndarray  # of object

pixeltable/exprs/globals.py CHANGED Viewed

@@ -5,7 +5,7 @@ import enum
 from typing import Union
 # Python types corresponding to our literal types
-LiteralPythonTypes = Union[str, int, float, bool, datetime.datetime]
+LiteralPythonTypes = Union[str, int, float, bool, datetime.datetime, datetime.date]
 def print_slice(s: slice) -> str:

pixeltable 0.3.13__py3-none-any.whl → 0.3.15__py3-none-any.whl

Potentially problematic release.

pixeltable 0.3.13__py3-none-any.whl → 0.3.15__py3-none-any.whl