PyPI - pixeltable - Versions diffs - 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl - Mend

pixeltable-0.3.14-py3-none-any.whl → pixeltable-0.5.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220)

pixeltable/__init__.py +42 -8
pixeltable/{dataframe.py → _query.py} +470 -206
pixeltable/_version.py +1 -0
pixeltable/catalog/__init__.py +5 -4
pixeltable/catalog/catalog.py +1785 -432
pixeltable/catalog/column.py +190 -113
pixeltable/catalog/dir.py +2 -4
pixeltable/catalog/globals.py +19 -46
pixeltable/catalog/insertable_table.py +191 -98
pixeltable/catalog/path.py +63 -23
pixeltable/catalog/schema_object.py +11 -15
pixeltable/catalog/table.py +843 -436
pixeltable/catalog/table_metadata.py +103 -0
pixeltable/catalog/table_version.py +978 -657
pixeltable/catalog/table_version_handle.py +72 -16
pixeltable/catalog/table_version_path.py +112 -43
pixeltable/catalog/tbl_ops.py +53 -0
pixeltable/catalog/update_status.py +191 -0
pixeltable/catalog/view.py +134 -90
pixeltable/config.py +134 -22
pixeltable/env.py +471 -157
pixeltable/exceptions.py +6 -0
pixeltable/exec/__init__.py +4 -1
pixeltable/exec/aggregation_node.py +7 -8
pixeltable/exec/cache_prefetch_node.py +83 -110
pixeltable/exec/cell_materialization_node.py +268 -0
pixeltable/exec/cell_reconstruction_node.py +168 -0
pixeltable/exec/component_iteration_node.py +4 -3
pixeltable/exec/data_row_batch.py +8 -65
pixeltable/exec/exec_context.py +16 -4
pixeltable/exec/exec_node.py +13 -36
pixeltable/exec/expr_eval/evaluators.py +11 -7
pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
pixeltable/exec/expr_eval/globals.py +8 -5
pixeltable/exec/expr_eval/row_buffer.py +1 -2
pixeltable/exec/expr_eval/schedulers.py +106 -56
pixeltable/exec/globals.py +35 -0
pixeltable/exec/in_memory_data_node.py +19 -19
pixeltable/exec/object_store_save_node.py +293 -0
pixeltable/exec/row_update_node.py +16 -9
pixeltable/exec/sql_node.py +351 -84
pixeltable/exprs/__init__.py +1 -1
pixeltable/exprs/arithmetic_expr.py +27 -22
pixeltable/exprs/array_slice.py +3 -3
pixeltable/exprs/column_property_ref.py +36 -23
pixeltable/exprs/column_ref.py +213 -89
pixeltable/exprs/comparison.py +5 -5
pixeltable/exprs/compound_predicate.py +5 -4
pixeltable/exprs/data_row.py +164 -54
pixeltable/exprs/expr.py +70 -44
pixeltable/exprs/expr_dict.py +3 -3
pixeltable/exprs/expr_set.py +17 -10
pixeltable/exprs/function_call.py +100 -40
pixeltable/exprs/globals.py +2 -2
pixeltable/exprs/in_predicate.py +4 -4
pixeltable/exprs/inline_expr.py +18 -32
pixeltable/exprs/is_null.py +7 -3
pixeltable/exprs/json_mapper.py +8 -8
pixeltable/exprs/json_path.py +56 -22
pixeltable/exprs/literal.py +27 -5
pixeltable/exprs/method_ref.py +2 -2
pixeltable/exprs/object_ref.py +2 -2
pixeltable/exprs/row_builder.py +167 -67
pixeltable/exprs/rowid_ref.py +25 -10
pixeltable/exprs/similarity_expr.py +58 -40
pixeltable/exprs/sql_element_cache.py +4 -4
pixeltable/exprs/string_op.py +5 -5
pixeltable/exprs/type_cast.py +3 -5
pixeltable/func/__init__.py +1 -0
pixeltable/func/aggregate_function.py +8 -8
pixeltable/func/callable_function.py +9 -9
pixeltable/func/expr_template_function.py +17 -11
pixeltable/func/function.py +18 -20
pixeltable/func/function_registry.py +6 -7
pixeltable/func/globals.py +2 -3
pixeltable/func/mcp.py +74 -0
pixeltable/func/query_template_function.py +29 -27
pixeltable/func/signature.py +46 -19
pixeltable/func/tools.py +31 -13
pixeltable/func/udf.py +18 -20
pixeltable/functions/__init__.py +16 -0
pixeltable/functions/anthropic.py +123 -77
pixeltable/functions/audio.py +147 -10
pixeltable/functions/bedrock.py +13 -6
pixeltable/functions/date.py +7 -4
pixeltable/functions/deepseek.py +35 -43
pixeltable/functions/document.py +81 -0
pixeltable/functions/fal.py +76 -0
pixeltable/functions/fireworks.py +11 -20
pixeltable/functions/gemini.py +195 -39
pixeltable/functions/globals.py +142 -14
pixeltable/functions/groq.py +108 -0
pixeltable/functions/huggingface.py +1056 -24
pixeltable/functions/image.py +115 -57
pixeltable/functions/json.py +1 -1
pixeltable/functions/llama_cpp.py +28 -13
pixeltable/functions/math.py +67 -5
pixeltable/functions/mistralai.py +18 -55
pixeltable/functions/net.py +70 -0
pixeltable/functions/ollama.py +20 -13
pixeltable/functions/openai.py +240 -226
pixeltable/functions/openrouter.py +143 -0
pixeltable/functions/replicate.py +4 -4
pixeltable/functions/reve.py +250 -0
pixeltable/functions/string.py +239 -69
pixeltable/functions/timestamp.py +16 -16
pixeltable/functions/together.py +24 -84
pixeltable/functions/twelvelabs.py +188 -0
pixeltable/functions/util.py +6 -1
pixeltable/functions/uuid.py +30 -0
pixeltable/functions/video.py +1515 -107
pixeltable/functions/vision.py +8 -8
pixeltable/functions/voyageai.py +289 -0
pixeltable/functions/whisper.py +16 -8
pixeltable/functions/whisperx.py +179 -0
pixeltable/{ext/functions → functions}/yolox.py +2 -4
pixeltable/globals.py +362 -115
pixeltable/index/base.py +17 -21
pixeltable/index/btree.py +28 -22
pixeltable/index/embedding_index.py +100 -118
pixeltable/io/__init__.py +4 -2
pixeltable/io/datarows.py +8 -7
pixeltable/io/external_store.py +56 -105
pixeltable/io/fiftyone.py +13 -13
pixeltable/io/globals.py +31 -30
pixeltable/io/hf_datasets.py +61 -16
pixeltable/io/label_studio.py +74 -70
pixeltable/io/lancedb.py +3 -0
pixeltable/io/pandas.py +21 -12
pixeltable/io/parquet.py +25 -105
pixeltable/io/table_data_conduit.py +250 -123
pixeltable/io/utils.py +4 -4
pixeltable/iterators/__init__.py +2 -1
pixeltable/iterators/audio.py +26 -25
pixeltable/iterators/base.py +9 -3
pixeltable/iterators/document.py +112 -78
pixeltable/iterators/image.py +12 -15
pixeltable/iterators/string.py +11 -4
pixeltable/iterators/video.py +523 -120
pixeltable/metadata/__init__.py +14 -3
pixeltable/metadata/converters/convert_13.py +2 -2
pixeltable/metadata/converters/convert_18.py +2 -2
pixeltable/metadata/converters/convert_19.py +2 -2
pixeltable/metadata/converters/convert_20.py +2 -2
pixeltable/metadata/converters/convert_21.py +2 -2
pixeltable/metadata/converters/convert_22.py +2 -2
pixeltable/metadata/converters/convert_24.py +2 -2
pixeltable/metadata/converters/convert_25.py +2 -2
pixeltable/metadata/converters/convert_26.py +2 -2
pixeltable/metadata/converters/convert_29.py +4 -4
pixeltable/metadata/converters/convert_30.py +34 -21
pixeltable/metadata/converters/convert_34.py +2 -2
pixeltable/metadata/converters/convert_35.py +9 -0
pixeltable/metadata/converters/convert_36.py +38 -0
pixeltable/metadata/converters/convert_37.py +15 -0
pixeltable/metadata/converters/convert_38.py +39 -0
pixeltable/metadata/converters/convert_39.py +124 -0
pixeltable/metadata/converters/convert_40.py +73 -0
pixeltable/metadata/converters/convert_41.py +12 -0
pixeltable/metadata/converters/convert_42.py +9 -0
pixeltable/metadata/converters/convert_43.py +44 -0
pixeltable/metadata/converters/util.py +20 -31
pixeltable/metadata/notes.py +9 -0
pixeltable/metadata/schema.py +140 -53
pixeltable/metadata/utils.py +74 -0
pixeltable/mypy/__init__.py +3 -0
pixeltable/mypy/mypy_plugin.py +123 -0
pixeltable/plan.py +382 -115
pixeltable/share/__init__.py +1 -1
pixeltable/share/packager.py +547 -83
pixeltable/share/protocol/__init__.py +33 -0
pixeltable/share/protocol/common.py +165 -0
pixeltable/share/protocol/operation_types.py +33 -0
pixeltable/share/protocol/replica.py +119 -0
pixeltable/share/publish.py +257 -59
pixeltable/store.py +311 -194
pixeltable/type_system.py +373 -211
pixeltable/utils/__init__.py +2 -3
pixeltable/utils/arrow.py +131 -17
pixeltable/utils/av.py +298 -0
pixeltable/utils/azure_store.py +346 -0
pixeltable/utils/coco.py +6 -6
pixeltable/utils/code.py +3 -3
pixeltable/utils/console_output.py +4 -1
pixeltable/utils/coroutine.py +6 -23
pixeltable/utils/dbms.py +32 -6
pixeltable/utils/description_helper.py +4 -5
pixeltable/utils/documents.py +7 -18
pixeltable/utils/exception_handler.py +7 -30
pixeltable/utils/filecache.py +6 -6
pixeltable/utils/formatter.py +86 -48
pixeltable/utils/gcs_store.py +295 -0
pixeltable/utils/http.py +133 -0
pixeltable/utils/http_server.py +2 -3
pixeltable/utils/iceberg.py +1 -2
pixeltable/utils/image.py +17 -0
pixeltable/utils/lancedb.py +90 -0
pixeltable/utils/local_store.py +322 -0
pixeltable/utils/misc.py +5 -0
pixeltable/utils/object_stores.py +573 -0
pixeltable/utils/pydantic.py +60 -0
pixeltable/utils/pytorch.py +5 -6
pixeltable/utils/s3_store.py +527 -0
pixeltable/utils/sql.py +26 -0
pixeltable/utils/system.py +30 -0
pixeltable-0.5.7.dist-info/METADATA +579 -0
pixeltable-0.5.7.dist-info/RECORD +227 -0
{pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
pixeltable/__version__.py +0 -3
pixeltable/catalog/named_function.py +0 -40
pixeltable/ext/__init__.py +0 -17
pixeltable/ext/functions/__init__.py +0 -11
pixeltable/ext/functions/whisperx.py +0 -77
pixeltable/utils/media_store.py +0 -77
pixeltable/utils/s3.py +0 -17
pixeltable-0.3.14.dist-info/METADATA +0 -434
pixeltable-0.3.14.dist-info/RECORD +0 -186
pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
{pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0

pixeltable/{dataframe.py → _query.py} RENAMED Viewed

@@ -8,15 +8,17 @@ import json
 import logging
 import traceback
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Hashable, Iterator, NoReturn, Optional, Sequence, Union
+from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Hashable, Iterator, NoReturn, Sequence, TypeVar
 import pandas as pd
-import sqlalchemy as sql
+import pydantic
+import sqlalchemy.exc as sql_exc
 from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
-from pixeltable.catalog import is_valid_identifier
-from pixeltable.catalog.globals import UpdateStatus
+from pixeltable.catalog import Catalog, is_valid_identifier
+from pixeltable.catalog.update_status import UpdateStatus
 from pixeltable.env import Env
+from pixeltable.plan import Planner, SampleClause
 from pixeltable.type_system import ColumnType
 from pixeltable.utils.description_helper import DescriptionHelper
 from pixeltable.utils.formatter import Formatter
@@ -25,12 +27,17 @@ if TYPE_CHECKING:
     import torch
     import torch.utils.data
-__all__ = ['DataFrame']
+__all__ = ['Query']
 _logger = logging.getLogger('pixeltable')
-class DataFrameResultSet:
+class ResultSet:
+    _rows: list[list[Any]]
+    _col_names: list[str]
+    __schema: dict[str, ColumnType]
+    __formatter: Formatter
     def __init__(self, rows: list[list[Any]], schema: dict[str, ColumnType]):
         self._rows = rows
         self._col_names = list(schema.keys())
@@ -65,6 +72,44 @@ class DataFrameResultSet:
     def to_pandas(self) -> pd.DataFrame:
         return pd.DataFrame.from_records(self._rows, columns=self._col_names)
+    BaseModelT = TypeVar('BaseModelT', bound=pydantic.BaseModel)
+    def to_pydantic(self, model: type[BaseModelT]) -> Iterator[BaseModelT]:
+        """
+        Convert the ResultSet to a list of Pydantic model instances.
+        Args:
+            model: A Pydantic model class.
+        Returns:
+            An iterator over Pydantic model instances, one for each row in the result set.
+        Raises:
+            Error: If the row data doesn't match the model schema.
+        """
+        model_fields = model.model_fields
+        model_config = getattr(model, 'model_config', {})
+        forbid_extra_fields = model_config.get('extra') == 'forbid'
+        # schema validation
+        required_fields = {name for name, field in model_fields.items() if field.is_required()}
+        col_names = set(self._col_names)
+        missing_fields = required_fields - col_names
+        if len(missing_fields) > 0:
+            raise excs.Error(
+                f'Required model fields {missing_fields} are missing from result set columns {self._col_names}'
+            )
+        if forbid_extra_fields:
+            extra_fields = col_names - set(model_fields.keys())
+            if len(extra_fields) > 0:
+                raise excs.Error(f"Extra fields {extra_fields} are not allowed in model with extra='forbid'")
+        for row in self:
+            try:
+                yield model(**row)
+            except pydantic.ValidationError as e:
+                raise excs.Error(str(e)) from e
     def _row_to_dict(self, row_idx: int) -> dict[str, Any]:
         return {self._col_names[i]: self._rows[row_idx][i] for i in range(len(self._col_names))}
@@ -89,7 +134,7 @@ class DataFrameResultSet:
         return (self._row_to_dict(i) for i in range(len(self)))
     def __eq__(self, other: object) -> bool:
-        if not isinstance(other, DataFrameResultSet):
+        if not isinstance(other, ResultSet):
             return False
         return self.to_pandas().equals(other.to_pandas())
@@ -106,14 +151,14 @@ class DataFrameResultSet:
 #         # output of the agg stage
 #         self.agg_output_exprs: list[exprs.Expr] = []
 #         # Where clause of the Select stmt of the SQL scan stage
-#         self.sql_where_clause: Optional[sql.ClauseElement] = None
+#         self.sql_where_clause: sql.ClauseElement | None = None
 #         # filter predicate applied to input rows of the SQL scan stage
-#         self.filter: Optional[exprs.Predicate] = None
-#         self.similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None
+#         self.filter: exprs.Predicate | None = None
+#         self.similarity_clause: exprs.ImageSimilarityPredicate | None = None
 #         self.agg_fn_calls: list[exprs.FunctionCall] = []  # derived from unique_exprs
 #         self.has_frame_col: bool = False  # True if we're referencing the frame col
 #
-#         self.evaluator: Optional[exprs.Evaluator] = None
+#         self.evaluator: exprs.Evaluator | None = None
 #         self.sql_scan_eval_ctx: list[exprs.Expr] = []  # needed to materialize output of SQL scan stage
 #         self.agg_eval_ctx: list[exprs.Expr] = []  # needed to materialize output of agg stage
 #         self.filter_eval_ctx: list[exprs.Expr] = []
@@ -129,32 +174,36 @@ class DataFrameResultSet:
 #             self.filter.release()
-class DataFrame:
+class Query:
+    """Represents a query for retrieving and transforming data from Pixeltable tables."""
     _from_clause: plan.FromClause
     _select_list_exprs: list[exprs.Expr]
     _schema: dict[str, ts.ColumnType]
-    select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]]
-    where_clause: Optional[exprs.Expr]
-    group_by_clause: Optional[list[exprs.Expr]]
-    grouping_tbl: Optional[catalog.TableVersion]
-    order_by_clause: Optional[list[tuple[exprs.Expr, bool]]]
-    limit_val: Optional[exprs.Expr]
+    select_list: list[tuple[exprs.Expr, str | None]] | None
+    where_clause: exprs.Expr | None
+    group_by_clause: list[exprs.Expr] | None
+    grouping_tbl: catalog.TableVersion | None
+    order_by_clause: list[tuple[exprs.Expr, bool]] | None
+    limit_val: exprs.Expr | None
+    sample_clause: SampleClause | None
     def __init__(
         self,
-        from_clause: Optional[plan.FromClause] = None,
-        select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]] = None,
-        where_clause: Optional[exprs.Expr] = None,
-        group_by_clause: Optional[list[exprs.Expr]] = None,
-        grouping_tbl: Optional[catalog.TableVersion] = None,
-        order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,  # list[(expr, asc)]
-        limit: Optional[exprs.Expr] = None,
+        from_clause: plan.FromClause | None = None,
+        select_list: list[tuple[exprs.Expr, str | None]] | None = None,
+        where_clause: exprs.Expr | None = None,
+        group_by_clause: list[exprs.Expr] | None = None,
+        grouping_tbl: catalog.TableVersion | None = None,
+        order_by_clause: list[tuple[exprs.Expr, bool]] | None = None,  # list[(expr, asc)]
+        limit: exprs.Expr | None = None,
+        sample_clause: SampleClause | None = None,
     ):
         self._from_clause = from_clause
         # exprs contain execution state and therefore cannot be shared
         select_list = copy.deepcopy(select_list)
-        select_list_exprs, column_names = DataFrame._normalize_select_list(self._from_clause.tbls, select_list)
+        select_list_exprs, column_names = Query._normalize_select_list(self._from_clause.tbls, select_list)
         # check select list after expansion to catch early
         # the following two lists are always non empty, even if select list is None.
         assert len(column_names) == len(select_list_exprs)
@@ -168,10 +217,11 @@ class DataFrame:
         self.grouping_tbl = grouping_tbl
         self.order_by_clause = copy.deepcopy(order_by_clause)
         self.limit_val = limit
+        self.sample_clause = sample_clause
     @classmethod
     def _normalize_select_list(
-        cls, tbls: list[catalog.TableVersionPath], select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]]
+        cls, tbls: list[catalog.TableVersionPath], select_list: list[tuple[exprs.Expr, str | None]] | None
     ) -> tuple[list[exprs.Expr], list[str]]:
         """
         Expand select list information with all columns and their names
@@ -210,12 +260,11 @@ class DataFrame:
     @property
     def _first_tbl(self) -> catalog.TableVersionPath:
-        assert len(self._from_clause.tbls) == 1
-        return self._from_clause.tbls[0]
+        return self._from_clause._first_tbl
     def _vars(self) -> dict[str, exprs.Variable]:
         """
-        Return a dict mapping variable name to Variable for all Variables contained in any component of the DataFrame
+        Return a dict mapping variable name to Variable for all Variables contained in any component of the Query
         """
         all_exprs: list[exprs.Expr] = []
         all_exprs.extend(self._select_list_exprs)
@@ -233,19 +282,49 @@ class DataFrame:
             if var.name not in unique_vars:
                 unique_vars[var.name] = var
             elif unique_vars[var.name].col_type != var.col_type:
-                raise excs.Error(f'Multiple definitions of parameter {var.name}')
+                raise excs.Error(f'Multiple definitions of parameter {var.name!r}')
         return unique_vars
+    @classmethod
+    def _convert_param_to_typed_expr(
+        cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: tuple[Any, Any] | None = None
+    ) -> exprs.Expr | None:
+        if v is None:
+            if required:
+                raise excs.Error(f'{name!r} parameter must be present')
+            return v
+        v_expr = exprs.Expr.from_object(v)
+        if not v_expr.col_type.matches(required_type):
+            raise excs.Error(f'{name!r} parameter must be of type `{required_type}`; got `{v_expr.col_type}`')
+        if range is not None:
+            if not isinstance(v_expr, exprs.Literal):
+                raise excs.Error(f'{name!r} parameter must be a constant; got: {v_expr}')
+            if range[0] is not None and not (v_expr.val >= range[0]):
+                raise excs.Error(f'{name!r} parameter must be >= {range[0]}')
+            if range[1] is not None and not (v_expr.val <= range[1]):
+                raise excs.Error(f'{name!r} parameter must be <= {range[1]}')
+        return v_expr
+    @classmethod
+    def validate_constant_type_range(
+        cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: tuple[Any, Any] | None = None
+    ) -> Any:
+        """Validate that the given named parameter is a constant of the required type and within the specified range."""
+        v_expr = cls._convert_param_to_typed_expr(v, required_type, required, name, range)
+        if v_expr is None:
+            return None
+        return v_expr.val
     def parameters(self) -> dict[str, ColumnType]:
         """Return a dict mapping parameter name to parameter type.
-        Parameters are Variables contained in any component of the DataFrame.
+        Parameters are Variables contained in any component of the Query.
         """
         return {name: var.col_type for name, var in self._vars().items()}
     def _exec(self) -> Iterator[exprs.DataRow]:
         """Run the query and return rows as a generator.
-        This function must not modify the state of the DataFrame, otherwise it breaks dataset caching.
+        This function must not modify the state of the Query, otherwise it breaks dataset caching.
         """
         plan = self._create_query_plan()
@@ -261,7 +340,7 @@ class DataFrame:
     async def _aexec(self) -> AsyncIterator[exprs.DataRow]:
         """Run the query and return rows as a generator.
-        This function must not modify the state of the DataFrame, otherwise it breaks dataset caching.
+        This function must not modify the state of the Query, otherwise it breaks dataset caching.
         """
         plan = self._create_query_plan()
         plan.open()
@@ -274,37 +353,44 @@ class DataFrame:
     def _create_query_plan(self) -> exec.ExecNode:
         # construct a group-by clause if we're grouping by a table
-        group_by_clause: Optional[list[exprs.Expr]] = None
+        group_by_clause: list[exprs.Expr] | None = None
         if self.grouping_tbl is not None:
             assert self.group_by_clause is None
             num_rowid_cols = len(self.grouping_tbl.store_tbl.rowid_columns())
             # the grouping table must be a base of self.tbl
             assert num_rowid_cols <= len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
-            group_by_clause = [exprs.RowidRef(self._first_tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
+            group_by_clause = self.__rowid_columns(num_rowid_cols)
         elif self.group_by_clause is not None:
             group_by_clause = self.group_by_clause
         for item in self._select_list_exprs:
             item.bind_rel_paths()
-        return plan.Planner.create_query_plan(
+        return Planner.create_query_plan(
             self._from_clause,
             self._select_list_exprs,
             where_clause=self.where_clause,
             group_by_clause=group_by_clause,
-            order_by_clause=self.order_by_clause if self.order_by_clause is not None else [],
+            order_by_clause=self.order_by_clause,
             limit=self.limit_val,
+            sample_clause=self.sample_clause,
         )
+    def __rowid_columns(self, num_rowid_cols: int | None = None) -> list[exprs.Expr]:
+        """Return list of RowidRef for the given number of associated rowids"""
+        return Planner.rowid_columns(self._first_tbl.tbl_version, num_rowid_cols)
     def _has_joins(self) -> bool:
         return len(self._from_clause.join_clauses) > 0
-    def show(self, n: int = 20) -> DataFrameResultSet:
+    def show(self, n: int = 20) -> ResultSet:
+        if self.sample_clause is not None:
+            raise excs.Error('show() cannot be used with sample()')
         assert n is not None
         return self.limit(n).collect()
-    def head(self, n: int = 10) -> DataFrameResultSet:
-        """Return the first n rows of the DataFrame, in insertion order of the underlying Table.
+    def head(self, n: int = 10) -> ResultSet:
+        """Return the first n rows of the Query, in insertion order of the underlying Table.
         head() is not supported for joins.
@@ -312,24 +398,26 @@ class DataFrame:
             n: Number of rows to select. Default is 10.
         Returns:
-            A DataFrameResultSet with the first n rows of the DataFrame.
+            A ResultSet with the first n rows of the Query.
         Raises:
-            Error: If the DataFrame is the result of a join or
-                if the DataFrame has an order_by clause.
+            Error: If the Query is the result of a join or
+                if the Query has an order_by clause.
         """
         if self.order_by_clause is not None:
             raise excs.Error('head() cannot be used with order_by()')
         if self._has_joins():
             raise excs.Error('head() not supported for joins')
+        if self.sample_clause is not None:
+            raise excs.Error('head() cannot be used with sample()')
         if self.group_by_clause is not None:
             raise excs.Error('head() cannot be used with group_by()')
         num_rowid_cols = len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
         order_by_clause = [exprs.RowidRef(self._first_tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
         return self.order_by(*order_by_clause, asc=True).limit(n).collect()
-    def tail(self, n: int = 10) -> DataFrameResultSet:
-        """Return the last n rows of the DataFrame, in insertion order of the underlying Table.
+    def tail(self, n: int = 10) -> ResultSet:
+        """Return the last n rows of the Query, in insertion order of the underlying Table.
         tail() is not supported for joins.
@@ -337,16 +425,18 @@ class DataFrame:
             n: Number of rows to select. Default is 10.
         Returns:
-            A DataFrameResultSet with the last n rows of the DataFrame.
+            A ResultSet with the last n rows of the Query.
         Raises:
-            Error: If the DataFrame is the result of a join or
-                if the DataFrame has an order_by clause.
+            Error: If the Query is the result of a join or
+                if the Query has an order_by clause.
         """
         if self.order_by_clause is not None:
             raise excs.Error('tail() cannot be used with order_by()')
         if self._has_joins():
             raise excs.Error('tail() not supported for joins')
+        if self.sample_clause is not None:
+            raise excs.Error('tail() cannot be used with sample()')
         if self.group_by_clause is not None:
             raise excs.Error('tail() cannot be used with group_by()')
         num_rowid_cols = len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
@@ -357,10 +447,11 @@ class DataFrame:
     @property
     def schema(self) -> dict[str, ColumnType]:
+        """Column names and types in this Query."""
         return self._schema
-    def bind(self, args: dict[str, Any]) -> DataFrame:
-        """Bind arguments to parameters and return a new DataFrame."""
+    def bind(self, args: dict[str, Any]) -> Query:
+        """Bind arguments to parameters and return a new Query."""
         # substitute Variables with the corresponding values according to 'args', converted to Literals
         select_list_exprs = copy.deepcopy(self._select_list_exprs)
         where_clause = copy.deepcopy(self.where_clause)
@@ -381,7 +472,7 @@ class DataFrame:
             var_expr = vars[arg_name]
             arg_expr = exprs.Expr.from_object(arg_val)
             if arg_expr is None:
-                raise excs.Error(f'Cannot convert argument {arg_val} to a Pixeltable expression')
+                raise excs.Error(f'That argument cannot be converted to a Pixeltable expression: {arg_val}')
             var_exprs[var_expr] = arg_expr
         exprs.Expr.list_substitute(select_list_exprs, var_exprs)
@@ -393,7 +484,7 @@ class DataFrame:
             exprs.Expr.list_substitute(order_by_exprs, var_exprs)
         select_list = list(zip(select_list_exprs, self.schema.keys()))
-        order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None
+        order_by_clause: list[tuple[exprs.Expr, bool]] | None = None
         if order_by_exprs is not None:
             order_by_clause = [
                 (expr, asc) for expr, asc in zip(order_by_exprs, [asc for _, asc in self.order_by_clause])
@@ -401,9 +492,9 @@ class DataFrame:
         if limit_val is not None:
             limit_val = limit_val.substitute(var_exprs)
             if limit_val is not None and not isinstance(limit_val, exprs.Literal):
-                raise excs.Error(f'limit(): parameter must be a constant, but got {limit_val}')
+                raise excs.Error(f'limit(): parameter must be a constant; got: {limit_val}')
-        return DataFrame(
+        return Query(
             from_clause=self._from_clause,
             select_list=select_list,
             where_clause=where_clause,
@@ -431,41 +522,41 @@ class DataFrame:
         raise excs.Error(msg) from e
     def _output_row_iterator(self) -> Iterator[list]:
-        with Env.get().begin_xact():
+        # TODO: extend begin_xact() to accept multiple TVPs for joins
+        single_tbl = self._first_tbl if len(self._from_clause.tbls) == 1 else None
+        with Catalog.get().begin_xact(tbl=single_tbl, for_write=False):
             try:
                 for data_row in self._exec():
                     yield [data_row[e.slot_idx] for e in self._select_list_exprs]
             except excs.ExprEvalError as e:
                 self._raise_expr_eval_err(e)
-            except sql.exc.DBAPIError as e:
-                raise excs.Error(f'Error during SQL execution:\n{e}') from e
+            except (sql_exc.DBAPIError, sql_exc.OperationalError, sql_exc.InternalError) as e:
+                Catalog.get().convert_sql_exc(e, tbl=(single_tbl.tbl_version if single_tbl is not None else None))
+                raise  # just re-raise if not converted to a Pixeltable error
-    def collect(self) -> DataFrameResultSet:
-        return DataFrameResultSet(list(self._output_row_iterator()), self.schema)
+    def collect(self) -> ResultSet:
+        return ResultSet(list(self._output_row_iterator()), self.schema)
-    async def _acollect(self) -> DataFrameResultSet:
+    async def _acollect(self) -> ResultSet:
+        single_tbl = self._first_tbl if len(self._from_clause.tbls) == 1 else None
         try:
             result = [[row[e.slot_idx] for e in self._select_list_exprs] async for row in self._aexec()]
-            return DataFrameResultSet(result, self.schema)
+            return ResultSet(result, self.schema)
         except excs.ExprEvalError as e:
             self._raise_expr_eval_err(e)
-        except sql.exc.DBAPIError as e:
-            raise excs.Error(f'Error during SQL execution:\n{e}') from e
+        except (sql_exc.DBAPIError, sql_exc.OperationalError, sql_exc.InternalError) as e:
+            Catalog.get().convert_sql_exc(e, tbl=(single_tbl.tbl_version if single_tbl is not None else None))
+            raise  # just re-raise if not converted to a Pixeltable error
     def count(self) -> int:
-        """Return the number of rows in the DataFrame.
+        """Return the number of rows in the Query.
         Returns:
-            The number of rows in the DataFrame.
+            The number of rows in the Query.
         """
-        if self.group_by_clause is not None:
-            raise excs.Error('count() cannot be used with group_by()')
-        from pixeltable.plan import Planner
-        stmt = Planner.create_count_stmt(self._first_tbl, self.where_clause)
-        with Env.get().begin_xact() as conn:
-            result: int = conn.execute(stmt).scalar_one()
+        with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=False) as conn:
+            count_stmt = Planner.create_count_stmt(self)
+            result: int = conn.execute(count_stmt).scalar_one()
             assert isinstance(result, int)
             return result
@@ -510,12 +601,15 @@ class DataFrame:
         if self.limit_val is not None:
             heading_vals.append('Limit')
             info_vals.append(self.limit_val.display_str(inline=False))
+        if self.sample_clause is not None:
+            heading_vals.append('Sample')
+            info_vals.append(self.sample_clause.display_str(inline=False))
         assert len(heading_vals) == len(info_vals)
         return pd.DataFrame(info_vals, index=heading_vals)
     def describe(self) -> None:
         """
-        Prints a tabular description of this DataFrame.
+        Prints a tabular description of this Query.
         The description has two columns, heading and info, which list the contents of each 'component'
                 (select list, where clause, ...) vertically.
         """
@@ -532,35 +626,35 @@ class DataFrame:
     def _repr_html_(self) -> str:
         return self._descriptors().to_html()
-    def select(self, *items: Any, **named_items: Any) -> DataFrame:
-        """Select columns or expressions from the DataFrame.
+    def select(self, *items: Any, **named_items: Any) -> Query:
+        """Select columns or expressions from the Query.
         Args:
             items: expressions to be selected
             named_items: named expressions to be selected
         Returns:
-            A new DataFrame with the specified select list.
+            A new Query with the specified select list.
         Raises:
             Error: If the select list is already specified,
                 or if any of the specified expressions are invalid,
-                or refer to tables not in the DataFrame.
+                or refer to tables not in the Query.
         Examples:
-            Given the DataFrame person from a table t with all its columns and rows:
+            Given the Query person from a table t with all its columns and rows:
             >>> person = t.select()
-            Select the columns 'name' and 'age' (referenced in table t) from the DataFrame person:
+            Select the columns 'name' and 'age' (referenced in table t) from the Query person:
-            >>> df = person.select(t.name, t.age)
+            >>> query = person.select(t.name, t.age)
-            Select the columns 'name' (referenced in table t) from the DataFrame person,
+            Select the columns 'name' (referenced in table t) from the Query person,
             and a named column 'is_adult' from the expression `age >= 18` where 'age' is
             another column in table t:
-            >>> df = person.select(t.name, is_adult=(t.age >= 18))
+            >>> query = person.select(t.name, is_adult=(t.age >= 18))
         """
         if self.select_list is not None:
@@ -573,7 +667,7 @@ class DataFrame:
             return self
         # analyze select list; wrap literals with the corresponding expressions
-        select_list: list[tuple[exprs.Expr, Optional[str]]] = []
+        select_list: list[tuple[exprs.Expr, str | None]] = []
         for raw_expr, name in base_list:
             expr = exprs.Expr.from_object(raw_expr)
             if expr is None:
@@ -593,22 +687,22 @@ class DataFrame:
                     pass
             if not expr.is_bound_by(self._from_clause.tbls):
                 raise excs.Error(
-                    f"Expression '{expr}' cannot be evaluated in the context of this query's tables "
-                    f'({",".join(tbl.tbl_version.get().versioned_name for tbl in self._from_clause.tbls)})'
+                    f"That expression cannot be evaluated in the context of this query's tables "
+                    f'({",".join(tbl.tbl_version.get().versioned_name for tbl in self._from_clause.tbls)}): {expr}'
                 )
             select_list.append((expr, name))
         # check user provided names do not conflict among themselves or with auto-generated ones
         seen: set[str] = set()
-        _, names = DataFrame._normalize_select_list(self._from_clause.tbls, select_list)
+        _, names = Query._normalize_select_list(self._from_clause.tbls, select_list)
         for name in names:
             if name in seen:
                 repeated_names = [j for j, x in enumerate(names) if x == name]
                 pretty = ', '.join(map(str, repeated_names))
-                raise excs.Error(f'Repeated column name "{name}" in select() at positions: {pretty}')
+                raise excs.Error(f'Repeated column name {name!r} in select() at positions: {pretty}')
             seen.add(name)
-        return DataFrame(
+        return Query(
             from_clause=self._from_clause,
             select_list=select_list,
             where_clause=self.where_clause,
@@ -618,37 +712,39 @@ class DataFrame:
             limit=self.limit_val,
         )
-    def where(self, pred: exprs.Expr) -> DataFrame:
+    def where(self, pred: exprs.Expr) -> Query:
         """Filter rows based on a predicate.
         Args:
             pred: the predicate to filter rows
         Returns:
-            A new DataFrame with the specified predicates replacing the where-clause.
+            A new Query with the specified predicates replacing the where-clause.
         Raises:
             Error: If the predicate is not a Pixeltable expression,
                 or if it does not return a boolean value,
-                or refers to tables not in the DataFrame.
+                or refers to tables not in the Query.
         Examples:
-            Given the DataFrame person from a table t with all its columns and rows:
+            Given the Query person from a table t with all its columns and rows:
             >>> person = t.select()
-            Filter the above DataFrame person to only include rows where the column 'age'
+            Filter the above Query person to only include rows where the column 'age'
             (referenced in table t) is greater than 30:
-            >>> df = person.where(t.age > 30)
+            >>> query = person.where(t.age > 30)
         """
         if self.where_clause is not None:
-            raise excs.Error('Where clause already specified')
+            raise excs.Error('where() clause already specified')
+        if self.sample_clause is not None:
+            raise excs.Error('where() cannot be used after sample()')
         if not isinstance(pred, exprs.Expr):
-            raise excs.Error(f'Where() requires a Pixeltable expression, but instead got {type(pred)}')
+            raise excs.Error(f'where() expects a Pixeltable expression; got: {pred}')
         if not pred.col_type.is_bool_type():
-            raise excs.Error(f'Where(): expression needs to return bool, but instead returns {pred.col_type}')
-        return DataFrame(
+            raise excs.Error(f'where() expression needs to return `Bool`, but instead returns `{pred.col_type}`')
+        return Query(
             from_clause=self._from_clause,
             select_list=self.select_list,
             where_clause=pred,
@@ -659,7 +755,7 @@ class DataFrame:
         )
     def _create_join_predicate(
-        self, other: catalog.TableVersionPath, on: Union[exprs.Expr, Sequence[exprs.ColumnRef]]
+        self, other: catalog.TableVersionPath, on: exprs.Expr | Sequence[exprs.ColumnRef]
     ) -> exprs.Expr:
         """Verifies user-specified 'on' argument and converts it into a join predicate."""
         col_refs: list[exprs.ColumnRef] = []
@@ -669,19 +765,21 @@ class DataFrame:
             on = [on]
         elif isinstance(on, exprs.Expr):
             if not on.is_bound_by(joined_tbls):
-                raise excs.Error(f"'on': expression cannot be evaluated in the context of the joined tables: {on}")
+                raise excs.Error(f'`on` expression cannot be evaluated in the context of the joined tables: {on}')
             if not on.col_type.is_bool_type():
-                raise excs.Error(f"'on': boolean expression expected, but got {on.col_type}: {on}")
+                raise excs.Error(
+                    f'`on` expects an expression of type `Bool`, but got one of type `{on.col_type}`: {on}'
+                )
             return on
         elif not isinstance(on, Sequence) or len(on) == 0:
-            raise excs.Error("'on': must be a sequence of column references or a boolean expression")
+            raise excs.Error('`on` must be a sequence of column references or a boolean expression')
         assert isinstance(on, Sequence)
         for col_ref in on:
             if not isinstance(col_ref, exprs.ColumnRef):
-                raise excs.Error("'on': must be a sequence of column references or a boolean expression")
+                raise excs.Error('`on` must be a sequence of column references or a boolean expression')
             if not col_ref.is_bound_by(joined_tbls):
-                raise excs.Error(f"'on': expression cannot be evaluated in the context of the joined tables: {col_ref}")
+                raise excs.Error(f'`on` expression cannot be evaluated in the context of the joined tables: {col_ref}')
             col_refs.append(col_ref)
         predicates: list[exprs.Expr] = []
@@ -689,27 +787,27 @@ class DataFrame:
         assert len(col_refs) > 0 and len(joined_tbls) >= 2
         for col_ref in col_refs:
             # identify the referenced column by name in 'other'
-            rhs_col = other.get_column(col_ref.col.name, include_bases=True)
+            rhs_col = other.get_column(col_ref.col.name)
             if rhs_col is None:
-                raise excs.Error(f"'on': column {col_ref.col.name!r} not found in joined table")
+                raise excs.Error(f'`on` column {col_ref.col.name!r} not found in joined table')
             rhs_col_ref = exprs.ColumnRef(rhs_col)
-            lhs_col_ref: Optional[exprs.ColumnRef] = None
-            if any(tbl.has_column(col_ref.col, include_bases=True) for tbl in self._from_clause.tbls):
+            lhs_col_ref: exprs.ColumnRef | None = None
+            if any(tbl.has_column(col_ref.col) for tbl in self._from_clause.tbls):
                 # col_ref comes from the existing from_clause, we use that directly
                 lhs_col_ref = col_ref
             else:
                 # col_ref comes from other, we need to look for a match in the existing from_clause by name
                 for tbl in self._from_clause.tbls:
-                    col = tbl.get_column(col_ref.col.name, include_bases=True)
+                    col = tbl.get_column(col_ref.col.name)
                     if col is None:
                         continue
                     if lhs_col_ref is not None:
-                        raise excs.Error(f"'on': ambiguous column reference: {col_ref.col.name!r}")
+                        raise excs.Error(f'`on`: ambiguous column reference: {col_ref.col.name}')
                     lhs_col_ref = exprs.ColumnRef(col)
                 if lhs_col_ref is None:
                     tbl_names = [tbl.tbl_name() for tbl in self._from_clause.tbls]
-                    raise excs.Error(f"'on': column {col_ref.col.name!r} not found in any of: {' '.join(tbl_names)}")
+                    raise excs.Error(f'`on`: column {col_ref.col.name!r} not found in any of: {" ".join(tbl_names)}')
             pred = exprs.Comparison(exprs.ComparisonOperator.EQ, lhs_col_ref, rhs_col_ref)
             predicates.append(pred)
@@ -722,11 +820,11 @@ class DataFrame:
     def join(
         self,
         other: catalog.Table,
-        on: Optional[Union[exprs.Expr, Sequence[exprs.ColumnRef]]] = None,
+        on: exprs.Expr | Sequence[exprs.ColumnRef] | None = None,
         how: plan.JoinType.LiteralType = 'inner',
-    ) -> DataFrame:
+    ) -> Query:
         """
-        Join this DataFrame with a table.
+        Join this Query with a table.
         Args:
             other: the table to join with
@@ -734,23 +832,23 @@ class DataFrame:
                 expression.
                 - column references: implies an equality predicate that matches columns in both this
-                    DataFrame and `other` by name.
+                    Query and `other` by name.
-                    - column in `other`: A column with that same name must be present in this DataFrame, and **it must
+                    - column in `other`: A column with that same name must be present in this Query, and **it must
                         be unique** (otherwise the join is ambiguous).
-                    - column in this DataFrame: A column with that same name must be present in `other`.
+                    - column in this Query: A column with that same name must be present in `other`.
                 - boolean expression: The expressions must be valid in the context of the joined tables.
             how: the type of join to perform.
                 - `'inner'`: only keep rows that have a match in both
-                - `'left'`: keep all rows from this DataFrame and only matching rows from the other table
-                - `'right'`: keep all rows from the other table and only matching rows from this DataFrame
-                - `'full_outer'`: keep all rows from both this DataFrame and the other table
+                - `'left'`: keep all rows from this Query and only matching rows from the other table
+                - `'right'`: keep all rows from the other table and only matching rows from this Query
+                - `'full_outer'`: keep all rows from both this Query and the other table
                 - `'cross'`: Cartesian product; no `on` condition allowed
         Returns:
-            A new DataFrame.
+            A new Query.
         Examples:
             Perform an inner join between t1 and t2 on the column id:
@@ -769,23 +867,25 @@ class DataFrame:
             Join t with d, which has a composite primary key (columns pk1 and pk2, with corresponding foreign
             key columns d1 and d2 in t):
-            >>> df = t.join(d, on=(t.d1 == d.pk1) & (t.d2 == d.pk2), how='left')
+            >>> query = t.join(d, on=(t.d1 == d.pk1) & (t.d2 == d.pk2), how='left')
         """
-        join_pred: Optional[exprs.Expr]
+        if self.sample_clause is not None:
+            raise excs.Error('join() cannot be used with sample()')
+        join_pred: exprs.Expr | None
         if how == 'cross':
             if on is not None:
-                raise excs.Error("'on' not allowed for cross join")
+                raise excs.Error('`on` not allowed for cross join')
             join_pred = None
         else:
             if on is None:
-                raise excs.Error(f"how={how!r} requires 'on'")
+                raise excs.Error(f'`how={how!r}` requires `on` to be present')
             join_pred = self._create_join_predicate(other._tbl_version_path, on)
-        join_clause = plan.JoinClause(join_type=plan.JoinType.validated(how, "'how'"), join_predicate=join_pred)
+        join_clause = plan.JoinClause(join_type=plan.JoinType.validated(how, '`how`'), join_predicate=join_pred)
         from_clause = plan.FromClause(
             tbls=[*self._from_clause.tbls, other._tbl_version_path],
             join_clauses=[*self._from_clause.join_clauses, join_clause],
         )
-        return DataFrame(
+        return Query(
             from_clause=from_clause,
             select_list=self.select_list,
             where_clause=self.where_clause,
@@ -795,70 +895,73 @@ class DataFrame:
             limit=self.limit_val,
         )
-    def group_by(self, *grouping_items: Any) -> DataFrame:
-        """Add a group-by clause to this DataFrame.
+    def group_by(self, *grouping_items: Any) -> Query:
+        """Add a group-by clause to this Query.
         Variants:
-        - group_by(<base table>): group a component view by their respective base table rows
-        - group_by(<expr>, ...): group by the given expressions
+        - group_by(base_tbl): group a component view by their respective base table rows
+        - group_by(expr1, expr2, expr3): group by the given expressions
-        Note, that grouping will be applied to the rows and take effect when
+        Note that grouping will be applied to the rows and take effect when
         used with an aggregation function like sum(), count() etc.
         Args:
             grouping_items: expressions to group by
         Returns:
-            A new DataFrame with the specified group-by clause.
+            A new Query with the specified group-by clause.
         Raises:
             Error: If the group-by clause is already specified,
                 or if the specified expression is invalid,
-                or refer to tables not in the DataFrame,
-                or if the DataFrame is a result of a join.
+                or refers to tables not in the Query,
+                or if the Query is a result of a join.
         Examples:
-            Given the DataFrame book from a table t with all its columns and rows:
+            Given the Query book from a table t with all its columns and rows:
             >>> book = t.select()
-            Group the above DataFrame book by the 'genre' column (referenced in table t):
+            Group the above Query book by the 'genre' column (referenced in table t):
-            >>> df = book.group_by(t.genre)
+            >>> query = book.group_by(t.genre)
-            Use the above DataFrame df grouped by genre to count the number of
+            Use the above Query grouped by genre to count the number of
             books for each 'genre':
-            >>> df = book.group_by(t.genre).select(t.genre, count=count(t.genre)).show()
+            >>> query = book.group_by(t.genre).select(t.genre, count=count(t.genre)).show()
+            Use the above Query grouped by genre to compute the total price of
+            Use the above Query grouped by genre to the total price of
             books for each 'genre':
-            >>> df = book.group_by(t.genre).select(t.genre, total=sum(t.price)).show()
+            >>> query = book.group_by(t.genre).select(t.genre, total=sum(t.price)).show()
         """
         if self.group_by_clause is not None:
-            raise excs.Error('Group-by already specified')
-        grouping_tbl: Optional[catalog.TableVersion] = None
-        group_by_clause: Optional[list[exprs.Expr]] = None
+            raise excs.Error('group_by() already specified')
+        if self.sample_clause is not None:
+            raise excs.Error('group_by() cannot be used with sample()')
+        grouping_tbl: catalog.TableVersion | None = None
+        group_by_clause: list[exprs.Expr] | None = None
         for item in grouping_items:
             if isinstance(item, (catalog.Table, catalog.TableVersion)):
                 if len(grouping_items) > 1:
-                    raise excs.Error('group_by(): only one table can be specified')
+                    raise excs.Error('group_by(): only one Table can be specified')
                 if len(self._from_clause.tbls) > 1:
                     raise excs.Error('group_by() with Table not supported for joins')
                 grouping_tbl = item if isinstance(item, catalog.TableVersion) else item._tbl_version.get()
                 # we need to make sure that the grouping table is a base of self.tbl
                 base = self._first_tbl.find_tbl_version(grouping_tbl.id)
-                if base is None or base.id == self._first_tbl.tbl_id():
+                if base is None or base.id == self._first_tbl.tbl_id:
                     raise excs.Error(
-                        f'group_by(): {grouping_tbl.name} is not a base table of {self._first_tbl.tbl_name()}'
+                        f'group_by(): {grouping_tbl.name!r} is not a base table of {self._first_tbl.tbl_name()!r}'
                     )
                 break
             if not isinstance(item, exprs.Expr):
                 raise excs.Error(f'Invalid expression in group_by(): {item}')
         if grouping_tbl is None:
             group_by_clause = list(grouping_items)
-        return DataFrame(
+        return Query(
             from_clause=self._from_clause,
             select_list=self.select_list,
             where_clause=self.where_clause,
@@ -868,11 +971,11 @@ class DataFrame:
             limit=self.limit_val,
         )
-    def distinct(self) -> DataFrame:
+    def distinct(self) -> Query:
         """
-        Remove duplicate rows from this DataFrame.
+        Remove duplicate rows from this Query.
-        Note that grouping will be applied to the rows based on the select clause of this Dataframe.
+        Note that grouping will be applied to the rows based on the select clause of this Query.
         In the absence of a select clause, by default, all columns are selected in the grouping.
         Examples:
@@ -891,8 +994,8 @@ class DataFrame:
         exps, _ = self._normalize_select_list(self._from_clause.tbls, self.select_list)
         return self.group_by(*exps)
-    def order_by(self, *expr_list: exprs.Expr, asc: bool = True) -> DataFrame:
-        """Add an order-by clause to this DataFrame.
+    def order_by(self, *expr_list: exprs.Expr, asc: bool = True) -> Query:
+        """Add an order-by clause to this Query.
         Args:
             expr_list: expressions to order by
@@ -900,33 +1003,35 @@ class DataFrame:
                 Default is True.
         Returns:
-            A new DataFrame with the specified order-by clause.
+            A new Query with the specified order-by clause.
         Raises:
             Error: If the order-by clause is already specified,
                 or if the specified expression is invalid,
-                or refer to tables not in the DataFrame.
+                or refers to tables not in the Query.
         Examples:
-            Given the DataFrame book from a table t with all its columns and rows:
+            Given the Query book from a table t with all its columns and rows:
             >>> book = t.select()
-            Order the above DataFrame book by two columns (price, pages) in descending order:
+            Order the above Query book by two columns (price, pages) in descending order:
-            >>> df = book.order_by(t.price, t.pages, asc=False)
+            >>> query = book.order_by(t.price, t.pages, asc=False)
-            Order the above DataFrame book by price in descending order, but order the pages
+            Order the above Query book by price in descending order, but order the pages
             in ascending order:
-            >>> df = book.order_by(t.price, asc=False).order_by(t.pages)
+            >>> query = book.order_by(t.price, asc=False).order_by(t.pages)
         """
+        if self.sample_clause is not None:
+            raise excs.Error('order_by() cannot be used with sample()')
         for e in expr_list:
             if not isinstance(e, exprs.Expr):
                 raise excs.Error(f'Invalid expression in order_by(): {e}')
         order_by_clause = self.order_by_clause if self.order_by_clause is not None else []
         order_by_clause.extend([(e.copy(), asc) for e in expr_list])
-        return DataFrame(
+        return Query(
             from_clause=self._from_clause,
             select_list=self.select_list,
             where_clause=self.where_clause,
@@ -936,31 +1041,148 @@ class DataFrame:
             limit=self.limit_val,
         )
-    def limit(self, n: int) -> DataFrame:
-        """Limit the number of rows in the DataFrame.
+    def limit(self, n: int) -> Query:
+        """Limit the number of rows in the Query.
         Args:
             n: Number of rows to select.
         Returns:
-            A new DataFrame with the specified limited rows.
+            A new Query with the specified limited rows.
         """
-        assert n is not None
-        n = exprs.Expr.from_object(n)
-        if not n.col_type.is_int_type():
-            raise excs.Error(f'limit(): parameter must be of type int, instead of {n.col_type}')
-        return DataFrame(
+        if self.sample_clause is not None:
+            raise excs.Error('limit() cannot be used with sample()')
+        limit_expr = self._convert_param_to_typed_expr(n, ts.IntType(nullable=False), True, 'limit()')
+        return Query(
+            from_clause=self._from_clause,
+            select_list=self.select_list,
+            where_clause=self.where_clause,
+            group_by_clause=self.group_by_clause,
+            grouping_tbl=self.grouping_tbl,
+            order_by_clause=self.order_by_clause,
+            limit=limit_expr,
+        )
+    def sample(
+        self,
+        n: int | None = None,
+        n_per_stratum: int | None = None,
+        fraction: float | None = None,
+        seed: int | None = None,
+        stratify_by: Any = None,
+    ) -> Query:
+        """
+        Return a new Query specifying a sample of rows from the Query, considered in a shuffled order.
+        The size of the sample can be specified in three ways:
+        - `n`: the total number of rows to produce as a sample
+        - `n_per_stratum`: the number of rows to produce per stratum as a sample
+        - `fraction`: the fraction of available rows to produce as a sample
+        The sample can be stratified by one or more columns, which means that the sample will
+        be selected from each stratum separately.
+        The data is shuffled before creating the sample.
+        Args:
+            n: Total number of rows to produce as a sample.
+            n_per_stratum: Number of rows to produce per stratum as a sample. This parameter is only valid if
+                `stratify_by` is specified. Only one of `n` or `n_per_stratum` can be specified.
+            fraction: Fraction of available rows to produce as a sample. This parameter is not usable with `n` or
+                `n_per_stratum`. The fraction must be between 0.0 and 1.0.
+            seed: Random seed for reproducible shuffling
+            stratify_by: If specified, the sample will be stratified by these values.
+        Returns:
+            A new Query which specifies the sampled rows
+        Examples:
+            Given the Table `person` containing the field 'age', we can create samples of the table in various ways:
+            Sample 100 rows from the above Table:
+            >>> query = person.sample(n=100)
+            Sample 10% of the rows from the above Table:
+            >>> query = person.sample(fraction=0.1)
+            Sample 10% of the rows from the above Table, stratified by the column 'age':
+            >>> query = person.sample(fraction=0.1, stratify_by=person.age)
+            Equal allocation sampling: Sample 2 rows from each age present in the above Table:
+            >>> query = person.sample(n_per_stratum=2, stratify_by=person.age)
+            Sampling is compatible with the where clause, so we can also sample from a filtered Query:
+            >>> query = person.where(person.age > 30).sample(n=100)
+        """
+        # Check context of usage
+        if self.sample_clause is not None:
+            raise excs.Error('Multiple sample() clauses not allowed')
+        if self.group_by_clause is not None:
+            raise excs.Error('sample() cannot be used with group_by()')
+        if self.order_by_clause is not None:
+            raise excs.Error('sample() cannot be used with order_by()')
+        if self.limit_val is not None:
+            raise excs.Error('sample() cannot be used with limit()')
+        if self._has_joins():
+            raise excs.Error('sample() cannot be used with join()')
+        # Check parameter combinations
+        if (n is not None) + (n_per_stratum is not None) + (fraction is not None) != 1:
+            raise excs.Error('Exactly one of `n`, `n_per_stratum`, or `fraction` must be specified.')
+        if n_per_stratum is not None and stratify_by is None:
+            raise excs.Error('Must specify `stratify_by` to use `n_per_stratum`')
+        # Check parameter types and values
+        n = self.validate_constant_type_range(n, ts.IntType(nullable=False), False, 'n', (1, None))
+        n_per_stratum = self.validate_constant_type_range(
+            n_per_stratum, ts.IntType(nullable=False), False, 'n_per_stratum', (1, None)
+        )
+        fraction = self.validate_constant_type_range(
+            fraction, ts.FloatType(nullable=False), False, 'fraction', (0.0, 1.0)
+        )
+        seed = self.validate_constant_type_range(seed, ts.IntType(nullable=False), False, 'seed')
+        # analyze stratify list
+        stratify_exprs: list[exprs.Expr] = []
+        if stratify_by is not None:
+            if isinstance(stratify_by, exprs.Expr):
+                stratify_by = [stratify_by]
+            if not isinstance(stratify_by, (list, tuple)):
+                raise excs.Error('`stratify_by` must be a list of scalar expressions')
+            for expr in stratify_by:
+                if expr is None or not isinstance(expr, exprs.Expr):
+                    raise excs.Error(f'Invalid expression: {expr}')
+                if not expr.col_type.is_scalar_type():
+                    raise excs.Error(f'Invalid type: expression must be a scalar type (not `{expr.col_type}`)')
+                if not expr.is_bound_by(self._from_clause.tbls):
+                    raise excs.Error(
+                        f"That expression cannot be evaluated in the context of this query's tables "
+                        f'({",".join(tbl.tbl_name() for tbl in self._from_clause.tbls)}): {expr}'
+                    )
+                stratify_exprs.append(expr)
+        sample_clause = SampleClause(None, n, n_per_stratum, fraction, seed, stratify_exprs)
+        return Query(
             from_clause=self._from_clause,
             select_list=self.select_list,
             where_clause=self.where_clause,
             group_by_clause=self.group_by_clause,
             grouping_tbl=self.grouping_tbl,
             order_by_clause=self.order_by_clause,
-            limit=n,
+            limit=self.limit_val,
+            sample_clause=sample_clause,
         )
     def update(self, value_spec: dict[str, Any], cascade: bool = True) -> UpdateStatus:
-        """Update rows in the underlying table of the DataFrame.
+        """Update rows in the underlying table of the Query.
         Update rows in the table with the specified value_spec.
@@ -973,70 +1195,105 @@ class DataFrame:
             UpdateStatus: the status of the update operation.
         Example:
-            Given the DataFrame person from a table t with all its columns and rows:
+            Given the Query person from a table t with all its columns and rows:
             >>> person = t.select()
-            Via the above DataFrame person, update the column 'city' to 'Oakland'
+            Via the above Query person, update the column 'city' to 'Oakland'
             and 'state' to 'CA' in the table t:
-            >>> df = person.update({'city': 'Oakland', 'state': 'CA'})
+            >>> person.update({'city': 'Oakland', 'state': 'CA'})
-            Via the above DataFrame person, update the column 'age' to 30 for any
+            Via the above Query person, update the column 'age' to 30 for any
             rows where 'year' is 2014 in the table t:
-            >>> df = person.where(t.year == 2014).update({'age': 30})
+            >>> person.where(t.year == 2014).update({'age': 30})
         """
         self._validate_mutable('update', False)
-        with Env.get().begin_xact():
+        with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
             return self._first_tbl.tbl_version.get().update(value_spec, where=self.where_clause, cascade=cascade)
-    def delete(self) -> UpdateStatus:
-        """Delete rows form the underlying table of the DataFrame.
+    def recompute_columns(
+        self, *columns: str | exprs.ColumnRef, errors_only: bool = False, cascade: bool = True
+    ) -> UpdateStatus:
+        """Recompute one or more computed columns of the underlying table of the Query.
-        The delete operation is only allowed for DataFrames on base tables.
+        Args:
+            columns: The names or references of the computed columns to recompute.
+            errors_only: If True, only run the recomputation for rows that have errors in the column (ie, the column's
+                `errortype` property indicates that an error occurred). Only allowed for recomputing a single column.
+            cascade: if True, also update all computed columns that transitively depend on the recomputed columns.
         Returns:
-            UpdateStatus: the status of the delete operation.
+            UpdateStatus: the status of the operation.
         Example:
-            Given the DataFrame person from a table t with all its columns and rows:
+            For table `person` with column `age` and computed column `height`, recompute the value of `height` for all
+            rows where `age` is less than 18:
-            >>> person = t.select()
+            >>> query = person.where(t.age < 18).recompute_columns(person.height)
+        """
+        self._validate_mutable('recompute_columns', False)
+        with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
+            tbl = Catalog.get().get_table_by_id(self._first_tbl.tbl_id)
+            return tbl.recompute_columns(*columns, where=self.where_clause, errors_only=errors_only, cascade=cascade)
-            Via the above DataFrame person, delete all rows from the table t where the column 'age' is less than 18:
+    def delete(self) -> UpdateStatus:
+        """Delete rows form the underlying table of the Query.
+        The delete operation is only allowed for Queries on base tables.
+        Returns:
+            UpdateStatus: the status of the delete operation.
-            >>> df = person.where(t.age < 18).delete()
+        Example:
+            For a table `person` with column `age`, delete all rows where 'age' is less than 18:
+            >>> person.where(t.age < 18).delete()
         """
         self._validate_mutable('delete', False)
         if not self._first_tbl.is_insertable():
-            raise excs.Error('Cannot delete from view')
-        with Env.get().begin_xact():
+            raise excs.Error('Cannot use `delete` on a view.')
+        with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
             return self._first_tbl.tbl_version.get().delete(where=self.where_clause)
     def _validate_mutable(self, op_name: str, allow_select: bool) -> None:
-        """Tests whether this DataFrame can be mutated (such as by an update operation).
+        """Tests whether this Query can be mutated (such as by an update operation).
         Args:
             op_name: The name of the operation for which the test is being performed.
-            allow_select: If True, allow a select() specification in the Dataframe.
+            allow_select: If True, allow a select() specification in the Query.
         """
+        self._validate_mutable_op_sequence(op_name, allow_select)
+        # TODO: Reconcile these with Table.__check_mutable()
+        assert len(self._from_clause.tbls) == 1
+        # First check if it's a replica, since every replica handle is also a snapshot
+        if self._first_tbl.is_replica():
+            raise excs.Error(f'Cannot use `{op_name}` on a replica.')
+        if self._first_tbl.is_snapshot():
+            raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
+    def _validate_mutable_op_sequence(self, op_name: str, allow_select: bool) -> None:
+        """Tests whether the sequence of operations on this Query is valid for a mutation operation."""
         if self.group_by_clause is not None or self.grouping_tbl is not None:
-            raise excs.Error(f'Cannot use `{op_name}` after `group_by`')
+            raise excs.Error(f'Cannot use `{op_name}` after `group_by`.')
         if self.order_by_clause is not None:
-            raise excs.Error(f'Cannot use `{op_name}` after `order_by`')
+            raise excs.Error(f'Cannot use `{op_name}` after `order_by`.')
         if self.select_list is not None and not allow_select:
-            raise excs.Error(f'Cannot use `{op_name}` after `select`')
+            raise excs.Error(f'Cannot use `{op_name}` after `select`.')
         if self.limit_val is not None:
-            raise excs.Error(f'Cannot use `{op_name}` after `limit`')
+            raise excs.Error(f'Cannot use `{op_name}` after `limit`.')
+        if self._has_joins():
+            raise excs.Error(f'Cannot use `{op_name}` after `join`.')
     def as_dict(self) -> dict[str, Any]:
         """
         Returns:
-            Dictionary representing this dataframe.
+            Dictionary representing this Query.
         """
         d = {
-            '_classname': 'DataFrame',
+            '_classname': 'Query',
             'from_clause': {
                 'tbls': [tbl.as_dict() for tbl in self._from_clause.tbls],
                 'join_clauses': [dataclasses.asdict(clause) for clause in self._from_clause.join_clauses],
@@ -1053,13 +1310,14 @@ class DataFrame:
             if self.order_by_clause is not None
             else None,
             'limit_val': self.limit_val.as_dict() if self.limit_val is not None else None,
+            'sample_clause': self.sample_clause.as_dict() if self.sample_clause is not None else None,
         }
         return d
     @classmethod
-    def from_dict(cls, d: dict[str, Any]) -> 'DataFrame':
+    def from_dict(cls, d: dict[str, Any]) -> 'Query':
         # we need to wrap the construction with a transaction, because it might need to load metadata
-        with Env.get().begin_xact():
+        with Catalog.get().begin_xact(for_write=False):
             tbls = [catalog.TableVersionPath.from_dict(tbl_dict) for tbl_dict in d['from_clause']['tbls']]
             join_clauses = [plan.JoinClause(**clause_dict) for clause_dict in d['from_clause']['join_clauses']]
             from_clause = plan.FromClause(tbls=tbls, join_clauses=join_clauses)
@@ -1079,8 +1337,9 @@ class DataFrame:
                 else None
             )
             limit_val = exprs.Expr.from_dict(d['limit_val']) if d['limit_val'] is not None else None
+            sample_clause = SampleClause.from_dict(d['sample_clause']) if d['sample_clause'] is not None else None
-            return DataFrame(
+            return Query(
                 from_clause=from_clause,
                 select_list=select_list,
                 where_clause=where_clause,
@@ -1088,6 +1347,7 @@ class DataFrame:
                 grouping_tbl=grouping_tbl,
                 order_by_clause=order_by_clause,
                 limit=limit_val,
+                sample_clause=sample_clause,
             )
     def _hash_result_set(self) -> str:
@@ -1102,8 +1362,10 @@ class DataFrame:
         return hashlib.sha256(summary_string.encode()).hexdigest()
     def to_coco_dataset(self) -> Path:
-        """Convert the dataframe to a COCO dataset.
-        This dataframe must return a single json-typed output column in the following format:
+        """Convert the Query to a COCO dataset.
+        This Query must return a single json-typed output column in the following format:
+        ```python
         {
             'image': PIL.Image.Image,
             'annotations': [
@@ -1114,6 +1376,7 @@ class DataFrame:
                 ...
             ],
         }
+        ```
         Returns:
             Path to the COCO dataset file.
@@ -1129,12 +1392,13 @@ class DataFrame:
             assert data_file_path.is_file()
             return data_file_path
         else:
-            with Env.get().begin_xact():
+            # TODO: extend begin_xact() to accept multiple TVPs for joins
+            with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=False):
                 return write_coco_dataset(self, dest_path)
     def to_pytorch_dataset(self, image_format: str = 'pt') -> 'torch.utils.data.IterableDataset':
         """
-        Convert the dataframe to a pytorch IterableDataset suitable for parallel loading
+        Convert the Query to a pytorch IterableDataset suitable for parallel loading
         with torch.utils.data.DataLoader.
         This method requires pyarrow >= 13, torch and torchvision to work.
@@ -1174,7 +1438,7 @@ class DataFrame:
         if dest_path.exists():  # fast path: use cache
             assert dest_path.is_dir()
         else:
-            with Env.get().begin_xact():
+            with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=False):
                 export_parquet(self, dest_path, inline_images=True)
         return PixeltablePytorchDataset(path=dest_path, image_format=image_format)

pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl