pixeltable 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (79)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +9 -1
  4. pixeltable/catalog/catalog.py +559 -134
  5. pixeltable/catalog/column.py +36 -32
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +12 -0
  8. pixeltable/catalog/insertable_table.py +30 -25
  9. pixeltable/catalog/schema_object.py +9 -6
  10. pixeltable/catalog/table.py +334 -267
  11. pixeltable/catalog/table_version.py +360 -241
  12. pixeltable/catalog/table_version_handle.py +18 -2
  13. pixeltable/catalog/table_version_path.py +86 -23
  14. pixeltable/catalog/view.py +47 -23
  15. pixeltable/dataframe.py +198 -19
  16. pixeltable/env.py +6 -4
  17. pixeltable/exceptions.py +6 -0
  18. pixeltable/exec/__init__.py +1 -1
  19. pixeltable/exec/exec_node.py +2 -0
  20. pixeltable/exec/expr_eval/evaluators.py +4 -1
  21. pixeltable/exec/expr_eval/expr_eval_node.py +4 -4
  22. pixeltable/exec/in_memory_data_node.py +1 -1
  23. pixeltable/exec/sql_node.py +188 -22
  24. pixeltable/exprs/column_property_ref.py +16 -6
  25. pixeltable/exprs/column_ref.py +33 -11
  26. pixeltable/exprs/comparison.py +1 -1
  27. pixeltable/exprs/data_row.py +5 -3
  28. pixeltable/exprs/expr.py +11 -4
  29. pixeltable/exprs/literal.py +2 -0
  30. pixeltable/exprs/row_builder.py +4 -6
  31. pixeltable/exprs/rowid_ref.py +8 -0
  32. pixeltable/exprs/similarity_expr.py +1 -0
  33. pixeltable/func/__init__.py +1 -0
  34. pixeltable/func/mcp.py +74 -0
  35. pixeltable/func/query_template_function.py +5 -3
  36. pixeltable/func/tools.py +12 -2
  37. pixeltable/func/udf.py +2 -2
  38. pixeltable/functions/__init__.py +1 -0
  39. pixeltable/functions/anthropic.py +19 -45
  40. pixeltable/functions/deepseek.py +19 -38
  41. pixeltable/functions/fireworks.py +9 -18
  42. pixeltable/functions/gemini.py +165 -33
  43. pixeltable/functions/groq.py +108 -0
  44. pixeltable/functions/llama_cpp.py +6 -6
  45. pixeltable/functions/math.py +63 -0
  46. pixeltable/functions/mistralai.py +16 -53
  47. pixeltable/functions/ollama.py +1 -1
  48. pixeltable/functions/openai.py +82 -165
  49. pixeltable/functions/string.py +212 -58
  50. pixeltable/functions/together.py +22 -80
  51. pixeltable/globals.py +10 -4
  52. pixeltable/index/base.py +5 -0
  53. pixeltable/index/btree.py +5 -0
  54. pixeltable/index/embedding_index.py +5 -0
  55. pixeltable/io/external_store.py +10 -31
  56. pixeltable/io/label_studio.py +5 -5
  57. pixeltable/io/parquet.py +4 -4
  58. pixeltable/io/table_data_conduit.py +1 -32
  59. pixeltable/metadata/__init__.py +11 -2
  60. pixeltable/metadata/converters/convert_13.py +2 -2
  61. pixeltable/metadata/converters/convert_30.py +6 -11
  62. pixeltable/metadata/converters/convert_35.py +9 -0
  63. pixeltable/metadata/converters/convert_36.py +38 -0
  64. pixeltable/metadata/converters/convert_37.py +15 -0
  65. pixeltable/metadata/converters/util.py +3 -9
  66. pixeltable/metadata/notes.py +3 -0
  67. pixeltable/metadata/schema.py +13 -1
  68. pixeltable/plan.py +135 -12
  69. pixeltable/share/packager.py +321 -20
  70. pixeltable/share/publish.py +2 -2
  71. pixeltable/store.py +31 -13
  72. pixeltable/type_system.py +30 -0
  73. pixeltable/utils/dbms.py +1 -1
  74. pixeltable/utils/formatter.py +64 -42
  75. {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/METADATA +2 -1
  76. {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/RECORD +79 -74
  77. {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/LICENSE +0 -0
  78. {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/WHEEL +0 -0
  79. {pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/entry_points.txt +0 -0
pixeltable/plan.py CHANGED
@@ -75,6 +75,88 @@ class FromClause:
75
75
  tbls: list[catalog.TableVersionPath]
76
76
  join_clauses: list[JoinClause] = dataclasses.field(default_factory=list)
77
77
 
78
+ @property
79
+ def _first_tbl(self) -> catalog.TableVersionPath:
80
+ assert len(self.tbls) == 1
81
+ return self.tbls[0]
82
+
83
+
84
+ @dataclasses.dataclass
85
+ class SampleClause:
86
+ """Defines a sampling clause for a table."""
87
+
88
+ version: Optional[int]
89
+ n: Optional[int]
90
+ n_per_stratum: Optional[int]
91
+ fraction: Optional[float]
92
+ seed: Optional[int]
93
+ stratify_exprs: Optional[list[exprs.Expr]]
94
+
95
+ # This seed value is used if one is not supplied
96
+ DEFAULT_SEED = 0
97
+
98
+ # The version of the hashing algorithm used for ordering and fractional sampling.
99
+ CURRENT_VERSION = 1
100
+
101
+ def __post_init__(self) -> None:
102
+ """If no version was provided, provide the default version"""
103
+ if self.version is None:
104
+ self.version = self.CURRENT_VERSION
105
+ if self.seed is None:
106
+ self.seed = self.DEFAULT_SEED
107
+
108
+ @property
109
+ def is_stratified(self) -> bool:
110
+ """Check if the sampling is stratified"""
111
+ return self.stratify_exprs is not None and len(self.stratify_exprs) > 0
112
+
113
+ @property
114
+ def is_repeatable(self) -> bool:
115
+ """Return true if the same rows will continue to be sampled if source rows are added or deleted."""
116
+ return not self.is_stratified and self.fraction is not None
117
+
118
+ def display_str(self, inline: bool = False) -> str:
119
+ return str(self)
120
+
121
+ def as_dict(self) -> dict:
122
+ """Return a dictionary representation of the object"""
123
+ d = dataclasses.asdict(self)
124
+ d['_classname'] = self.__class__.__name__
125
+ if self.is_stratified:
126
+ d['stratify_exprs'] = [e.as_dict() for e in self.stratify_exprs]
127
+ return d
128
+
129
+ @classmethod
130
+ def from_dict(cls, d: dict) -> SampleClause:
131
+ """Create a SampleClause from a dictionary representation"""
132
+ d_cleaned = {key: value for key, value in d.items() if key != '_classname'}
133
+ s = cls(**d_cleaned)
134
+ if s.is_stratified:
135
+ s.stratify_exprs = [exprs.Expr.from_dict(e) for e in d_cleaned.get('stratify_exprs', [])]
136
+ return s
137
+
138
+ def __repr__(self) -> str:
139
+ s = ','.join(e.display_str(inline=True) for e in self.stratify_exprs)
140
+ return (
141
+ f'sample_{self.version}(n={self.n}, n_per_stratum={self.n_per_stratum}, '
142
+ f'fraction={self.fraction}, seed={self.seed}, [{s}])'
143
+ )
144
+
145
+ @classmethod
146
+ def fraction_to_md5_hex(cls, fraction: float) -> str:
147
+ """Return the string representation of an approximation (to ~1e-9) of a fraction of the total space
148
+ of md5 hash values.
149
+ This is used for fractional sampling.
150
+ """
151
+ # Maximum count for the upper 32 bits of MD5: 2^32
152
+ max_md5_value = (2**32) - 1
153
+
154
+ # Calculate the fraction of this value
155
+ threshold_int = max_md5_value * int(1_000_000_000 * fraction) // 1_000_000_000
156
+
157
+ # Convert to hexadecimal string with padding
158
+ return format(threshold_int, '08x') + 'ffffffffffffffffffffffff'
159
+
78
160
 
79
161
  class Analyzer:
80
162
  """
@@ -87,6 +169,8 @@ class Analyzer:
87
169
  group_by_clause: Optional[list[exprs.Expr]] # None for non-aggregate queries; [] for agg query w/o grouping
88
170
  grouping_exprs: list[exprs.Expr] # [] for non-aggregate queries or agg query w/o grouping
89
171
  order_by_clause: OrderByClause
172
+ stratify_exprs: list[exprs.Expr] # [] if no stratiifcation is required
173
+ sample_clause: Optional[SampleClause] # None if no sampling clause is present
90
174
 
91
175
  sql_elements: exprs.SqlElementCache
92
176
 
@@ -107,6 +191,7 @@ class Analyzer:
107
191
  where_clause: Optional[exprs.Expr] = None,
108
192
  group_by_clause: Optional[list[exprs.Expr]] = None,
109
193
  order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
194
+ sample_clause: Optional[SampleClause] = None,
110
195
  ):
111
196
  if order_by_clause is None:
112
197
  order_by_clause = []
@@ -120,6 +205,11 @@ class Analyzer:
120
205
  self.group_by_clause = (
121
206
  [e.resolve_computed_cols() for e in group_by_clause] if group_by_clause is not None else None
122
207
  )
208
+ self.sample_clause = sample_clause
209
+ if self.sample_clause is not None and self.sample_clause.is_stratified:
210
+ self.stratify_exprs = [e.resolve_computed_cols() for e in sample_clause.stratify_exprs]
211
+ else:
212
+ self.stratify_exprs = []
123
213
  self.order_by_clause = [OrderByItem(e.resolve_computed_cols(), asc) for e, asc in order_by_clause]
124
214
 
125
215
  self.sql_where_clause = None
@@ -135,8 +225,11 @@ class Analyzer:
135
225
  self.all_exprs.append(join_clause.join_predicate)
136
226
  if self.group_by_clause is not None:
137
227
  self.all_exprs.extend(self.group_by_clause)
228
+ self.all_exprs.extend(self.stratify_exprs)
138
229
  self.all_exprs.extend(e for e, _ in self.order_by_clause)
139
230
  if self.filter is not None:
231
+ if sample_clause is not None:
232
+ raise excs.Error(f'Filter {self.filter} not expressible in SQL')
140
233
  self.all_exprs.append(self.filter)
141
234
 
142
235
  self.agg_order_by = []
@@ -260,7 +353,7 @@ class Planner:
260
353
  # TODO: create an exec.CountNode and change this to create_count_plan()
261
354
  @classmethod
262
355
  def create_count_stmt(cls, tbl: catalog.TableVersionPath, where_clause: Optional[exprs.Expr] = None) -> sql.Select:
263
- stmt = sql.select(sql.func.count())
356
+ stmt = sql.select(sql.func.count().label('all_count'))
264
357
  refd_tbl_ids: set[UUID] = set()
265
358
  if where_clause is not None:
266
359
  analyzer = cls.analyze(tbl, where_clause)
@@ -289,7 +382,7 @@ class Planner:
289
382
 
290
383
  # create InMemoryDataNode for 'rows'
291
384
  plan: exec.ExecNode = exec.InMemoryDataNode(
292
- TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_rowid
385
+ TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_row_id
293
386
  )
294
387
 
295
388
  media_input_col_info = [
@@ -322,6 +415,13 @@ class Planner:
322
415
  )
323
416
  return plan
324
417
 
418
+ @classmethod
419
+ def rowid_columns(cls, target: TableVersionHandle, num_rowid_cols: Optional[int] = None) -> list[exprs.Expr]:
420
+ """Return list of RowidRef for the given number of associated rowids"""
421
+ if num_rowid_cols is None:
422
+ num_rowid_cols = target.get().num_rowid_columns()
423
+ return [exprs.RowidRef(target, i) for i in range(num_rowid_cols)]
424
+
325
425
  @classmethod
326
426
  def create_df_insert_plan(
327
427
  cls, tbl: catalog.TableVersion, df: 'pxt.DataFrame', ignore_errors: bool
@@ -385,7 +485,7 @@ class Planner:
385
485
 
386
486
  cls.__check_valid_columns(tbl.tbl_version.get(), recomputed_cols, 'updated in')
387
487
 
388
- recomputed_base_cols = {col for col in recomputed_cols if col.tbl == tbl.tbl_version}
488
+ recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == tbl.tbl_version.id}
389
489
  copied_cols = [
390
490
  col
391
491
  for col in target.cols_by_id.values()
@@ -409,7 +509,7 @@ class Planner:
409
509
  for i, col in enumerate(all_base_cols):
410
510
  plan.row_builder.add_table_column(col, select_list[i].slot_idx)
411
511
  recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
412
- return plan, [f'{c.tbl.get().name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
512
+ return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
413
513
 
414
514
  @classmethod
415
515
  def __check_valid_columns(
@@ -465,7 +565,7 @@ class Planner:
465
565
  recomputed_cols.update(idx_val_cols)
466
566
  # we only need to recompute stored columns (unstored ones are substituted away)
467
567
  recomputed_cols = {c for c in recomputed_cols if c.is_stored}
468
- recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
568
+ recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == target.id}
469
569
  copied_cols = [
470
570
  col
471
571
  for col in target.cols_by_id.values()
@@ -591,8 +691,13 @@ class Planner:
591
691
  # 2. for component views: iterator args
592
692
  iterator_args = [target.iterator_args] if target.iterator_args is not None else []
593
693
 
594
- row_builder = exprs.RowBuilder(iterator_args, stored_cols, [])
694
+ from_clause = FromClause(tbls=[view.base])
695
+ base_analyzer = Analyzer(
696
+ from_clause, iterator_args, where_clause=target.predicate, sample_clause=target.sample_clause
697
+ )
698
+ row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [])
595
699
 
700
+ # if we're propagating an insert, we only want to see those base rows that were created for the current version
596
701
  # execution plan:
597
702
  # 1. materialize exprs computed from the base that are needed for stored view columns
598
703
  # 2. if it's an iterator view, expand the base rows into component rows
@@ -603,8 +708,11 @@ class Planner:
603
708
  for e in row_builder.default_eval_ctx.target_exprs
604
709
  if e.is_bound_by([view]) and not e.is_bound_by([view.base])
605
710
  ]
606
- # if we're propagating an insert, we only want to see those base rows that were created for the current version
607
- base_analyzer = Analyzer(FromClause(tbls=[view.base]), base_output_exprs, where_clause=target.predicate)
711
+
712
+ # Create a new analyzer reflecting exactly what is required from the base table
713
+ base_analyzer = Analyzer(
714
+ from_clause, base_output_exprs, where_clause=target.predicate, sample_clause=target.sample_clause
715
+ )
608
716
  base_eval_ctx = row_builder.create_eval_ctx(base_analyzer.all_exprs)
609
717
  plan = cls._create_query_plan(
610
718
  row_builder=row_builder,
@@ -701,6 +809,7 @@ class Planner:
701
809
  group_by_clause: Optional[list[exprs.Expr]] = None,
702
810
  order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
703
811
  limit: Optional[exprs.Expr] = None,
812
+ sample_clause: Optional[SampleClause] = None,
704
813
  ignore_errors: bool = False,
705
814
  exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
706
815
  ) -> exec.ExecNode:
@@ -714,12 +823,14 @@ class Planner:
714
823
  order_by_clause = []
715
824
  if exact_version_only is None:
716
825
  exact_version_only = []
826
+
717
827
  analyzer = Analyzer(
718
828
  from_clause,
719
829
  select_list,
720
830
  where_clause=where_clause,
721
831
  group_by_clause=group_by_clause,
722
832
  order_by_clause=order_by_clause,
833
+ sample_clause=sample_clause,
723
834
  )
724
835
  row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [])
725
836
 
@@ -773,6 +884,7 @@ class Planner:
773
884
  # - join clause subexprs
774
885
  # - subexprs of Where clause conjuncts that can't be run in SQL
775
886
  # - all grouping exprs
887
+ # - all stratify exprs
776
888
  candidates = list(
777
889
  exprs.Expr.list_subexprs(
778
890
  analyzer.select_list,
@@ -787,10 +899,12 @@ class Planner:
787
899
  candidates.extend(
788
900
  exprs.Expr.subexprs(analyzer.filter, filter=sql_elements.contains, traverse_matches=False)
789
901
  )
790
- if analyzer.group_by_clause is not None:
791
- candidates.extend(
792
- exprs.Expr.list_subexprs(analyzer.group_by_clause, filter=sql_elements.contains, traverse_matches=False)
793
- )
902
+ candidates.extend(
903
+ exprs.Expr.list_subexprs(analyzer.grouping_exprs, filter=sql_elements.contains, traverse_matches=False)
904
+ )
905
+ candidates.extend(
906
+ exprs.Expr.list_subexprs(analyzer.stratify_exprs, filter=sql_elements.contains, traverse_matches=False)
907
+ )
794
908
  # not isinstance(...): we don't want to materialize Literals via a Select
795
909
  sql_exprs = exprs.ExprSet(e for e in candidates if not isinstance(e, exprs.Literal))
796
910
 
@@ -835,6 +949,15 @@ class Planner:
835
949
  # we need to order the input for window functions
836
950
  plan.set_order_by(analyzer.get_window_fn_ob_clause())
837
951
 
952
+ if analyzer.sample_clause is not None:
953
+ plan = exec.SqlSampleNode(
954
+ row_builder,
955
+ input=plan,
956
+ select_list=tbl_scan_exprs,
957
+ sample_clause=analyzer.sample_clause,
958
+ stratify_exprs=analyzer.stratify_exprs,
959
+ )
960
+
838
961
  plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder, plan)
839
962
 
840
963
  if analyzer.group_by_clause is not None: