pixeltable 0.3.15__py3-none-any.whl → 0.4.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (58)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/catalog.py +296 -105
  3. pixeltable/catalog/column.py +10 -8
  4. pixeltable/catalog/dir.py +1 -2
  5. pixeltable/catalog/insertable_table.py +25 -20
  6. pixeltable/catalog/schema_object.py +3 -6
  7. pixeltable/catalog/table.py +261 -189
  8. pixeltable/catalog/table_version.py +333 -202
  9. pixeltable/catalog/table_version_handle.py +15 -2
  10. pixeltable/catalog/table_version_path.py +60 -14
  11. pixeltable/catalog/view.py +38 -6
  12. pixeltable/dataframe.py +196 -18
  13. pixeltable/env.py +4 -4
  14. pixeltable/exec/__init__.py +1 -1
  15. pixeltable/exec/expr_eval/evaluators.py +4 -1
  16. pixeltable/exec/in_memory_data_node.py +1 -1
  17. pixeltable/exec/sql_node.py +171 -22
  18. pixeltable/exprs/column_property_ref.py +15 -6
  19. pixeltable/exprs/column_ref.py +32 -11
  20. pixeltable/exprs/comparison.py +1 -1
  21. pixeltable/exprs/data_row.py +5 -3
  22. pixeltable/exprs/expr.py +7 -0
  23. pixeltable/exprs/literal.py +2 -0
  24. pixeltable/exprs/row_builder.py +4 -6
  25. pixeltable/exprs/rowid_ref.py +8 -0
  26. pixeltable/exprs/similarity_expr.py +1 -0
  27. pixeltable/func/query_template_function.py +1 -1
  28. pixeltable/func/tools.py +1 -1
  29. pixeltable/functions/gemini.py +0 -1
  30. pixeltable/functions/string.py +212 -58
  31. pixeltable/globals.py +12 -4
  32. pixeltable/index/base.py +5 -0
  33. pixeltable/index/btree.py +5 -0
  34. pixeltable/index/embedding_index.py +5 -0
  35. pixeltable/io/external_store.py +8 -29
  36. pixeltable/io/label_studio.py +1 -1
  37. pixeltable/io/parquet.py +2 -2
  38. pixeltable/io/table_data_conduit.py +0 -31
  39. pixeltable/metadata/__init__.py +11 -2
  40. pixeltable/metadata/converters/convert_13.py +2 -2
  41. pixeltable/metadata/converters/convert_30.py +6 -11
  42. pixeltable/metadata/converters/convert_35.py +9 -0
  43. pixeltable/metadata/converters/convert_36.py +38 -0
  44. pixeltable/metadata/converters/util.py +3 -9
  45. pixeltable/metadata/notes.py +2 -0
  46. pixeltable/metadata/schema.py +8 -1
  47. pixeltable/plan.py +221 -14
  48. pixeltable/share/packager.py +137 -13
  49. pixeltable/share/publish.py +2 -2
  50. pixeltable/store.py +19 -13
  51. pixeltable/utils/dbms.py +1 -1
  52. pixeltable/utils/formatter.py +64 -42
  53. pixeltable/utils/sample.py +25 -0
  54. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/METADATA +2 -1
  55. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/RECORD +58 -55
  56. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/LICENSE +0 -0
  57. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/WHEEL +0 -0
  58. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/entry_points.txt +0 -0
pixeltable/io/table_data_conduit.py CHANGED
@@ -23,7 +23,6 @@ from .utils import normalize_schema_names
 
 _logger = logging.getLogger('pixeltable')
 
-# ---------------------------------------------------------------------------------------------------------
 
 if TYPE_CHECKING:
     import datasets  # type: ignore[import-untyped]
@@ -46,9 +45,6 @@ class TableDataConduitFormat(str, enum.Enum):
         return False
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 @dataclass
 class TableDataConduit:
     source: TableDataSource
@@ -129,9 +125,6 @@ class TableDataConduit:
            raise excs.Error(f'Missing required column(s) ({", ".join(missing_cols)})')
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class DFTableDataConduit(TableDataConduit):
     pxt_df: pxt.DataFrame = None
 
@@ -155,9 +148,6 @@ class DFTableDataConduit(TableDataConduit):
         self.check_source_columns_are_insertable(self.pxt_df.schema.keys())
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class RowDataTableDataConduit(TableDataConduit):
     raw_rows: Optional[RowData] = None
     disable_mapping: bool = True
@@ -235,9 +225,6 @@ class RowDataTableDataConduit(TableDataConduit):
            yield self.valid_rows
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class PandasTableDataConduit(TableDataConduit):
     pd_df: pd.DataFrame = None
     batch_count: int = 0
@@ -293,9 +280,6 @@ class PandasTableDataConduit(TableDataConduit):
            yield self.valid_rows
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class CSVTableDataConduit(TableDataConduit):
     @classmethod
     def from_tds(cls, tds: TableDataConduit) -> 'PandasTableDataConduit':
@@ -307,9 +291,6 @@ class CSVTableDataConduit(TableDataConduit):
         return PandasTableDataConduit.from_tds(t)
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class ExcelTableDataConduit(TableDataConduit):
     @classmethod
     def from_tds(cls, tds: TableDataConduit) -> 'PandasTableDataConduit':
@@ -321,9 +302,6 @@ class ExcelTableDataConduit(TableDataConduit):
         return PandasTableDataConduit.from_tds(t)
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class JsonTableDataConduit(TableDataConduit):
     @classmethod
     def from_tds(cls, tds: TableDataConduit) -> RowDataTableDataConduit:
@@ -346,9 +324,6 @@ class JsonTableDataConduit(TableDataConduit):
         return t2
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class HFTableDataConduit(TableDataConduit):
     hf_ds: Optional[Union[datasets.Dataset, datasets.DatasetDict]] = None
     column_name_for_split: Optional[str] = None
@@ -478,9 +453,6 @@ class HFTableDataConduit(TableDataConduit):
            yield batch
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class ParquetTableDataConduit(TableDataConduit):
     pq_ds: Optional[ParquetDataset] = None
 
@@ -542,9 +514,6 @@ class ParquetTableDataConduit(TableDataConduit):
            raise e
 
 
-# ---------------------------------------------------------------------------------------------------------
-
-
 class UnkTableDataConduit(TableDataConduit):
     """Source type is not known at the time of creation"""
 
pixeltable/metadata/__init__.py CHANGED
@@ -8,15 +8,17 @@ from typing import Callable
 import sqlalchemy as sql
 from sqlalchemy import orm
 
+import pixeltable as pxt
+import pixeltable.exceptions as excs
 from pixeltable.utils.console_output import ConsoleLogger
 
 from .schema import SystemInfo, SystemInfoMd
 
 _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
-
+_logger = logging.getLogger('pixeltable')
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 35
+VERSION = 37
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
@@ -55,6 +57,13 @@ def upgrade_md(engine: sql.engine.Engine) -> None:
         system_info = session.query(SystemInfo).one().md
         md_version = system_info['schema_version']
         assert isinstance(md_version, int)
+        _logger.info(f'Current database version: {md_version}, installed version: {VERSION}')
+        if md_version > VERSION:
+            raise excs.Error(
+                'This Pixeltable database was created with a newer Pixeltable version '
+                f'than the one currently installed ({pxt.__version__}).\n'
+                'Please update to the latest Pixeltable version by running: pip install --upgrade pixeltable'
+            )
         if md_version == VERSION:
             return
         while md_version < VERSION:
pixeltable/metadata/converters/convert_13.py CHANGED
@@ -12,9 +12,9 @@ _logger = logging.getLogger('pixeltable')
 @register_converter(version=13)
 def _(engine: sql.engine.Engine) -> None:
     with engine.begin() as conn:
-        for row in conn.execute(sql.select(Table)):
+        for row in conn.execute(sql.select(Table.id, Table.md)):
            id = row[0]
-            md = row[2]
+            md = row[1]
            updated_md = __update_md(md)
            if updated_md != md:
                _logger.info(f'Updating schema for table: {id}')
pixeltable/metadata/converters/convert_30.py CHANGED
@@ -1,33 +1,28 @@
 import copy
+from uuid import UUID
 
 import sqlalchemy as sql
 
 from pixeltable.metadata import register_converter
 from pixeltable.metadata.converters.util import (
-    convert_table_record,
+    convert_table_md,
     convert_table_schema_version_record,
     convert_table_version_record,
 )
-from pixeltable.metadata.schema import Table, TableSchemaVersion, TableVersion
+from pixeltable.metadata.schema import TableSchemaVersion, TableVersion
 
 
 @register_converter(version=30)
 def _(engine: sql.engine.Engine) -> None:
-    convert_table_record(engine, table_record_updater=__update_table_record)
+    convert_table_md(engine, table_md_updater=__update_table_md)
     convert_table_version_record(engine, table_version_record_updater=__update_table_version_record)
     convert_table_schema_version_record(
         engine, table_schema_version_record_updater=__update_table_schema_version_record
     )
 
 
-def __update_table_record(record: Table) -> None:
-    """
-    Update TableMd with table_id
-    """
-    assert isinstance(record.md, dict)
-    md = copy.copy(record.md)
-    md['tbl_id'] = str(record.id)
-    record.md = md
+def __update_table_md(md: dict, tbl_id: UUID) -> None:
+    md['tbl_id'] = str(tbl_id)
 
 
 def __update_table_version_record(record: TableVersion) -> None:
pixeltable/metadata/converters/convert_35.py ADDED
@@ -0,0 +1,9 @@
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+
+
+@register_converter(version=35)
+def _(engine: sql.engine.Engine) -> None:
+    with engine.begin() as conn:
+        conn.execute(sql.text('ALTER TABLE tables ADD COLUMN lock_dummy int8'))
pixeltable/metadata/converters/convert_36.py ADDED
@@ -0,0 +1,38 @@
+import logging
+from typing import Any, Optional
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+_logger = logging.getLogger('pixeltable')
+
+
+@register_converter(version=36)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_md_updater=__update_table_md, substitution_fn=__substitute_md)
+
+
+def __update_table_md(table_md: dict, table_id: UUID) -> None:
+    """Update the view metadata to add the sample_clause field if it is missing.
+
+    Args:
+        table_md (dict): copy of the original table metadata; updated in place.
+        table_id (UUID): the table id.
+
+    """
+    if table_md['view_md'] is None:
+        return
+    if 'sample_clause' not in table_md['view_md']:
+        table_md['view_md']['sample_clause'] = None
+        _logger.info(f'Updating view metadata for table: {table_id}')
+
+
+def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+    if isinstance(v, dict) and (v.get('_classname') == 'DataFrame'):
+        if 'sample_clause' not in v:
+            v['sample_clause'] = None
+        return k, v
+    return None
pixeltable/metadata/converters/util.py CHANGED
@@ -33,9 +33,10 @@ def convert_table_md(
     the original entry will be replaced, and the traversal will continue with `v'`.
     """
     with engine.begin() as conn:
-        for row in conn.execute(sql.select(Table)):
+        # avoid a SELECT * here, which breaks when we add new columns to Table
+        for row in conn.execute(sql.select(Table.id, Table.md)):
            tbl_id = row[0]
-            table_md = row[2]
+            table_md = row[1]
            assert isinstance(table_md, dict)
            updated_table_md = copy.deepcopy(table_md)
            if table_md_updater is not None:
@@ -145,13 +146,6 @@ def __update_schema_column(table_schema_version_md: dict, schema_column_updater:
         schema_column_updater(schema_col)
 
 
-def convert_table_record(engine: sql.engine.Engine, table_record_updater: Optional[Callable[[Table], None]]) -> None:
-    with sql.orm.Session(engine, future=True) as session:
-        for record in session.query(Table).all():
-            table_record_updater(record)
-        session.commit()
-
-
 def convert_table_version_record(
     engine: sql.engine.Engine, table_version_record_updater: Optional[Callable[[TableVersion], None]]
 ) -> None:
pixeltable/metadata/notes.py CHANGED
@@ -2,6 +2,8 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    37: 'Add support for the sample() method on DataFrames',
+    36: 'Added Table.lock_dummy',
     35: 'Track reference_tbl in ColumnRef',
     34: 'Set default value for is_pk field in column metadata to False',
     33: 'Add is_replica field to table metadata',
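The two new version notes correspond to the sampling feature wired through `pixeltable/plan.py` below. A minimal usage sketch; the keyword names are inferred from the `SampleClause` fields (`n`, `n_per_stratum`, `fraction`, `seed`, stratification) and are an assumption, not something this diff confirms:

```python
import pixeltable as pxt

t = pxt.get_table('films')  # hypothetical existing table

# Hypothetical keywords: they mirror SampleClause(n, n_per_stratum, fraction,
# seed, stratify_exprs) and are not confirmed by this diff.
fixed_size = t.select().sample(n=100, seed=42)        # pseudo-random 100-row sample
one_tenth = t.select().sample(fraction=0.1, seed=42)  # repeatable 10% sample
```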
pixeltable/metadata/schema.py CHANGED
@@ -84,7 +84,8 @@ class Dir(Base):
     )
     parent_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
     md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # DirMd
-    # This field is updated to synchronize database operations across multiple sessions
+
+    # used to force acquisition of an X-lock via an Update stmt
     lock_dummy: orm.Mapped[int] = orm.mapped_column(BigInteger, nullable=True)
 
 
@@ -146,6 +147,9 @@ class ViewMd:
     # filter predicate applied to the base table; view-only
     predicate: Optional[dict[str, Any]]
 
+    # sampling clause applied to the base table; view-only
+    sample_clause: Optional[dict[str, Any]]
+
     # ComponentIterator subclass; only for component views
     iterator_class_fqn: Optional[str]
 
@@ -200,6 +204,9 @@ class Table(Base):
     dir_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=False)
     md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # TableMd
 
+    # used to force acquisition of an X-lock via an Update stmt
+    lock_dummy: orm.Mapped[int] = orm.mapped_column(BigInteger, nullable=True)
+
 
 @dataclasses.dataclass
 class TableVersionMd:
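Both `Dir.lock_dummy` and the new `Table.lock_dummy` exist only so that a transaction can force a row-level exclusive lock through an UPDATE statement instead of `SELECT ... FOR UPDATE`. A minimal sketch of that pattern, assuming Postgres and SQLAlchemy; this illustrates the technique and is not code from this release:

```python
import sqlalchemy as sql

def x_lock_table_row(conn: sql.engine.Connection, tbl_id: str) -> None:
    # The UPDATE takes an exclusive row lock that is held until the enclosing
    # transaction commits or rolls back; the value written is irrelevant.
    conn.execute(
        sql.text('UPDATE tables SET lock_dummy = 1 WHERE id = :id'),
        {'id': tbl_id},
    )
```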
pixeltable/plan.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
 import dataclasses
 import enum
 from textwrap import dedent
-from typing import Any, Iterable, Literal, Optional, Sequence
+from typing import Any, Iterable, Literal, NamedTuple, Optional, Sequence
 from uuid import UUID
 
 import sqlalchemy as sql
@@ -12,6 +12,7 @@ import pixeltable as pxt
 from pixeltable import catalog, exceptions as excs, exec, exprs
 from pixeltable.catalog import Column, TableVersionHandle
 from pixeltable.exec.sql_node import OrderByClause, OrderByItem, combine_order_by_clauses, print_order_by_clause
+from pixeltable.utils.sample import sample_key
 
 
 def _is_agg_fn_call(e: exprs.Expr) -> bool:
@@ -75,6 +76,98 @@ class FromClause:
     tbls: list[catalog.TableVersionPath]
     join_clauses: list[JoinClause] = dataclasses.field(default_factory=list)
 
+    @property
+    def _first_tbl(self) -> catalog.TableVersionPath:
+        assert len(self.tbls) == 1
+        return self.tbls[0]
+
+
+@dataclasses.dataclass
+class SampleClause:
+    """Defines a sampling clause for a table."""
+
+    version: Optional[int]
+    n: Optional[int]
+    n_per_stratum: Optional[int]
+    fraction: Optional[float]
+    seed: Optional[int]
+    stratify_exprs: Optional[list[exprs.Expr]]
+
+    # This seed value is used if one is not supplied
+    DEFAULT_SEED = 0
+
+    # The version of the hashing algorithm used for ordering and fractional sampling
+    CURRENT_VERSION = 1
+
+    def __post_init__(self) -> None:
+        """If no version or seed was provided, fall back to the defaults"""
+        if self.version is None:
+            self.version = self.CURRENT_VERSION
+        if self.seed is None:
+            self.seed = self.DEFAULT_SEED
+
+    @property
+    def is_stratified(self) -> bool:
+        """Check if the sampling is stratified"""
+        return self.stratify_exprs is not None and len(self.stratify_exprs) > 0
+
+    @property
+    def is_repeatable(self) -> bool:
+        """Return True if the same rows will continue to be sampled if source rows are added or deleted."""
+        return not self.is_stratified and self.fraction is not None
+
+    def display_str(self, inline: bool = False) -> str:
+        return str(self)
+
+    def as_dict(self) -> dict:
+        """Return a dictionary representation of the object"""
+        d = dataclasses.asdict(self)
+        d['_classname'] = self.__class__.__name__
+        if self.is_stratified:
+            d['stratify_exprs'] = [e.as_dict() for e in self.stratify_exprs]
+        return d
+
+    @classmethod
+    def from_dict(cls, d: dict) -> SampleClause:
+        """Create a SampleClause from a dictionary representation"""
+        d_cleaned = {key: value for key, value in d.items() if key != '_classname'}
+        s = cls(**d_cleaned)
+        if s.is_stratified:
+            s.stratify_exprs = [exprs.Expr.from_dict(e) for e in d_cleaned.get('stratify_exprs', [])]
+        return s
+
+    def __repr__(self) -> str:
+        s = ','.join(e.display_str(inline=True) for e in self.stratify_exprs)
+        return (
+            f'sample_{self.version}(n={self.n}, n_per_stratum={self.n_per_stratum}, '
+            f'fraction={self.fraction}, seed={self.seed}, [{s}])'
+        )
+
+    @classmethod
+    def fraction_to_md5_hex(cls, fraction: float) -> str:
+        """Return the string representation of an approximation (to ~1e-9) of a fraction of the total space
+        of md5 hash values.
+        This is used for fractional sampling.
+        """
+        # maximum value of the upper 32 bits of MD5: 2^32 - 1
+        max_md5_value = (2**32) - 1
+
+        # calculate the fraction of this value
+        threshold_int = max_md5_value * int(1_000_000_000 * fraction) // 1_000_000_000
+
+        # convert to a hexadecimal string, padded out to the full 128-bit width
+        return format(threshold_int, '08x') + 'ffffffffffffffffffffffff'
+
+
+class SamplingClauses(NamedTuple):
+    """Clauses produced when rewriting a SampleClause"""
+
+    where: Optional[exprs.Expr]
+    group_by_clause: Optional[list[exprs.Expr]]
+    order_by_clause: Optional[list[tuple[exprs.Expr, bool]]]
+    limit: Optional[exprs.Expr]
+    sample_clause: Optional[SampleClause]
+
 
 class Analyzer:
     """
@@ -260,7 +353,7 @@ class Planner:
     # TODO: create an exec.CountNode and change this to create_count_plan()
     @classmethod
     def create_count_stmt(cls, tbl: catalog.TableVersionPath, where_clause: Optional[exprs.Expr] = None) -> sql.Select:
-        stmt = sql.select(sql.func.count())
+        stmt = sql.select(sql.func.count().label('all_count'))
         refd_tbl_ids: set[UUID] = set()
         if where_clause is not None:
            analyzer = cls.analyze(tbl, where_clause)
@@ -289,7 +382,7 @@
 
         # create InMemoryDataNode for 'rows'
         plan: exec.ExecNode = exec.InMemoryDataNode(
-            TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_rowid
+            TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_row_id
         )
 
         media_input_col_info = [
@@ -322,6 +415,13 @@
         )
         return plan
 
+    @classmethod
+    def rowid_columns(cls, target: TableVersionHandle, num_rowid_cols: Optional[int] = None) -> list[exprs.Expr]:
+        """Return a list of RowidRefs for the given number of associated rowids"""
+        if num_rowid_cols is None:
+            num_rowid_cols = target.get().num_rowid_columns()
+        return [exprs.RowidRef(target, i) for i in range(num_rowid_cols)]
+
     @classmethod
     def create_df_insert_plan(
         cls, tbl: catalog.TableVersion, df: 'pxt.DataFrame', ignore_errors: bool
@@ -385,7 +485,7 @@
 
         cls.__check_valid_columns(tbl.tbl_version.get(), recomputed_cols, 'updated in')
 
-        recomputed_base_cols = {col for col in recomputed_cols if col.tbl == tbl.tbl_version}
+        recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == tbl.tbl_version.id}
         copied_cols = [
            col
            for col in target.cols_by_id.values()
@@ -409,7 +509,7 @@
         for i, col in enumerate(all_base_cols):
            plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
-        return plan, [f'{c.tbl.get().name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
+        return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
 
     @classmethod
     def __check_valid_columns(
@@ -465,7 +565,7 @@
         recomputed_cols.update(idx_val_cols)
         # we only need to recompute stored columns (unstored ones are substituted away)
         recomputed_cols = {c for c in recomputed_cols if c.is_stored}
-        recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
+        recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == target.id}
         copied_cols = [
            col
            for col in target.cols_by_id.values()
@@ -591,7 +691,24 @@
         # 2. for component views: iterator args
         iterator_args = [target.iterator_args] if target.iterator_args is not None else []
 
-        row_builder = exprs.RowBuilder(iterator_args, stored_cols, [])
+        # if this view contains a sample specification, modify/create where, group_by, order_by, and limit clauses
+        from_clause = FromClause(tbls=[view.base])
+        where, group_by_clause, order_by_clause, limit, sample_clause = cls.create_sample_clauses(
+            from_clause, target.sample_clause, target.predicate, None, [], None
+        )
+
+        # if we're propagating an insert, we only want to see those base rows that were created for the current version
+        base_analyzer = Analyzer(
+            from_clause,
+            iterator_args,
+            where_clause=where,
+            group_by_clause=group_by_clause,
+            order_by_clause=order_by_clause,
+        )
+        row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [])
+
+        if target.sample_clause is not None and base_analyzer.filter is not None:
+            raise excs.Error(f'Filter {base_analyzer.filter} not expressible in SQL')
 
         # execution plan:
         # 1. materialize exprs computed from the base that are needed for stored view columns
@@ -603,13 +720,22 @@
            for e in row_builder.default_eval_ctx.target_exprs
            if e.is_bound_by([view]) and not e.is_bound_by([view.base])
         ]
-        # if we're propagating an insert, we only want to see those base rows that were created for the current version
-        base_analyzer = Analyzer(FromClause(tbls=[view.base]), base_output_exprs, where_clause=target.predicate)
+
+        # create a new analyzer reflecting exactly what is required from the base table
+        base_analyzer = Analyzer(
+            from_clause,
+            base_output_exprs,
+            where_clause=where,
+            group_by_clause=group_by_clause,
+            order_by_clause=order_by_clause,
+        )
         base_eval_ctx = row_builder.create_eval_ctx(base_analyzer.all_exprs)
         plan = cls._create_query_plan(
            row_builder=row_builder,
            analyzer=base_analyzer,
            eval_ctx=base_eval_ctx,
+            limit=limit,
+            sample_clause=sample_clause,
            with_pk=True,
            exact_version_only=view.get_bases() if propagates_insert else [],
         )
@@ -692,6 +818,62 @@ class Planner:
         prefetch_node = exec.CachePrefetchNode(tbl_id, file_col_info, input_node)
         return prefetch_node
 
+    @classmethod
+    def create_sample_clauses(
+        cls,
+        from_clause: FromClause,
+        sample_clause: Optional[SampleClause],
+        where_clause: Optional[exprs.Expr],
+        group_by_clause: Optional[list[exprs.Expr]],
+        order_by_clause: Optional[list[tuple[exprs.Expr, bool]]],
+        limit: Optional[exprs.Expr],
+    ) -> SamplingClauses:
+        """Construct the clauses required for sampling under various conditions.
+
+        If there is no sampling, return the original clauses.
+        If the sample is stratified, return only the group_by clause; the rest of the mechanism
+        for stratified sampling is provided by the SqlSampleNode.
+        If the sample is non-stratified, rewrite the query to accommodate the supplied where clause,
+        and provide the other clauses required for sampling.
+        """
+        # if there is no sample clause, return the original clauses
+        if sample_clause is None:
+            return SamplingClauses(where_clause, group_by_clause, order_by_clause, limit, None)
+
+        if sample_clause.is_stratified:
+            # stratified sampling: the stratification exprs become the group_by clause
+            group_by = sample_clause.stratify_exprs
+            # note that a limit is not possible here
+            return SamplingClauses(where_clause, group_by, order_by_clause, None, sample_clause)
+        else:
+            # non-stratified sampling: construct where, order_by, and limit clauses.
+            # construct an expression for ordering rows and limiting row counts
+            s_key = sample_key(
+                exprs.Literal(sample_clause.seed), *cls.rowid_columns(from_clause._first_tbl.tbl_version)
+            )
+
+            # construct a suitable where clause
+            where = where_clause
+            if sample_clause.fraction is not None:
+                fraction_md5_hex = exprs.Expr.from_object(
+                    sample_clause.fraction_to_md5_hex(float(sample_clause.fraction))
+                )
+                f_where = s_key < fraction_md5_hex
+                where = where & f_where if where is not None else f_where
+
+            order_by: list[tuple[exprs.Expr, bool]] = [(s_key, True)]
+            limit = exprs.Literal(sample_clause.n)
+            # note that a group_by is not possible here
+            return SamplingClauses(where, None, order_by, limit, None)
+
     @classmethod
     def create_query_plan(
         cls,
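For the non-stratified case, the rewrite above turns sampling into ordinary SQL clauses keyed on a seeded hash of each row's id: the fraction filter, the deterministic ordering, and the row limit are all expressed through that key. A sketch of the resulting SQL shape, assuming a single rowid column; the concrete key expression comes from `pixeltable.utils.sample.sample_key`, and the names here are illustrative:

```python
# Equivalent SQL shape for sample(fraction=0.1, n=100, seed=42) over a table 'tbl'
# with one rowid column (illustrative; the real key is built by sample_key()):
SAMPLE_SQL = """
SELECT *
FROM tbl
WHERE md5('42' || rowid) < '19999999ffffffffffffffffffffffff'  -- fraction filter
ORDER BY md5('42' || rowid)                                    -- seeded pseudo-random order
LIMIT 100                                                      -- only when n is given
"""
```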
@@ -701,6 +883,7 @@
         group_by_clause: Optional[list[exprs.Expr]] = None,
         order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
         limit: Optional[exprs.Expr] = None,
+        sample_clause: Optional[SampleClause] = None,
         ignore_errors: bool = False,
         exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
     ) -> exec.ExecNode:
@@ -714,14 +897,22 @@
            order_by_clause = []
         if exact_version_only is None:
            exact_version_only = []
+
+        # modify clauses to include the sample clause
+        where, group_by_clause, order_by_clause, limit, sample = cls.create_sample_clauses(
+            from_clause, sample_clause, where_clause, group_by_clause, order_by_clause, limit
+        )
+
         analyzer = Analyzer(
            from_clause,
            select_list,
-            where_clause=where_clause,
+            where_clause=where,
            group_by_clause=group_by_clause,
            order_by_clause=order_by_clause,
         )
         row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [])
+        if sample_clause is not None and analyzer.filter is not None:
+            raise excs.Error(f'Filter {analyzer.filter} not expressible in SQL')
 
         analyzer.finalize(row_builder)
         # select_list: we need to materialize everything that's been collected
@@ -732,6 +923,7 @@
            analyzer=analyzer,
            eval_ctx=eval_ctx,
            limit=limit,
+            sample_clause=sample,
            with_pk=True,
            exact_version_only=exact_version_only,
         )
@@ -747,6 +939,7 @@
         analyzer: Analyzer,
         eval_ctx: exprs.RowBuilder.EvalCtx,
         limit: Optional[exprs.Expr] = None,
+        sample_clause: Optional[SampleClause] = None,
         with_pk: bool = False,
         exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
     ) -> exec.ExecNode:
@@ -857,12 +1050,26 @@
            sql_elements.contains_all(analyzer.select_list)
            and sql_elements.contains_all(analyzer.grouping_exprs)
            and isinstance(plan, exec.SqlNode)
-            and plan.to_cte() is not None
+            and plan.to_cte(keep_pk=(sample_clause is not None)) is not None
         ):
-            plan = exec.SqlAggregationNode(
-                row_builder, input=plan, select_list=analyzer.select_list, group_by_items=analyzer.group_by_clause
-            )
+            if sample_clause is not None:
+                plan = exec.SqlSampleNode(
+                    row_builder,
+                    input=plan,
+                    select_list=analyzer.select_list,
+                    stratify_exprs=analyzer.group_by_clause,
+                    sample_clause=sample_clause,
+                )
+            else:
+                plan = exec.SqlAggregationNode(
+                    row_builder,
+                    input=plan,
+                    select_list=analyzer.select_list,
+                    group_by_items=analyzer.group_by_clause,
+                )
         else:
+            if sample_clause is not None:
+                raise excs.Error('Sample clause not supported with Python aggregation')
            input_sql_node = plan.get_node(exec.SqlNode)
            assert combined_ordering is not None
            input_sql_node.set_order_by(combined_ordering)