PyPI - pixeltable - Versions diffs - 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

pixeltable 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (79) hide show

pixeltable/__init__.py +1 -1
pixeltable/__version__.py +2 -2
pixeltable/catalog/__init__.py +9 -1
pixeltable/catalog/catalog.py +559 -134
pixeltable/catalog/column.py +36 -32
pixeltable/catalog/dir.py +1 -2
pixeltable/catalog/globals.py +12 -0
pixeltable/catalog/insertable_table.py +30 -25
pixeltable/catalog/schema_object.py +9 -6
pixeltable/catalog/table.py +334 -267
pixeltable/catalog/table_version.py +360 -241
pixeltable/catalog/table_version_handle.py +18 -2
pixeltable/catalog/table_version_path.py +86 -23
pixeltable/catalog/view.py +47 -23
pixeltable/dataframe.py +198 -19
pixeltable/env.py +6 -4
pixeltable/exceptions.py +6 -0
pixeltable/exec/__init__.py +1 -1
pixeltable/exec/exec_node.py +2 -0
pixeltable/exec/expr_eval/evaluators.py +4 -1
pixeltable/exec/expr_eval/expr_eval_node.py +4 -4
pixeltable/exec/in_memory_data_node.py +1 -1
pixeltable/exec/sql_node.py +188 -22
pixeltable/exprs/column_property_ref.py +16 -6
pixeltable/exprs/column_ref.py +33 -11
pixeltable/exprs/comparison.py +1 -1
pixeltable/exprs/data_row.py +5 -3
pixeltable/exprs/expr.py +11 -4
pixeltable/exprs/literal.py +2 -0
pixeltable/exprs/row_builder.py +4 -6
pixeltable/exprs/rowid_ref.py +8 -0
pixeltable/exprs/similarity_expr.py +1 -0
pixeltable/func/__init__.py +1 -0
pixeltable/func/mcp.py +74 -0
pixeltable/func/query_template_function.py +5 -3
pixeltable/func/tools.py +12 -2
pixeltable/func/udf.py +2 -2
pixeltable/functions/__init__.py +1 -0
pixeltable/functions/anthropic.py +19 -45
pixeltable/functions/deepseek.py +19 -38
pixeltable/functions/fireworks.py +9 -18
pixeltable/functions/gemini.py +165 -33
pixeltable/functions/groq.py +108 -0
pixeltable/functions/llama_cpp.py +6 -6
pixeltable/functions/math.py +63 -0
pixeltable/functions/mistralai.py +16 -53
pixeltable/functions/ollama.py +1 -1
pixeltable/functions/openai.py +82 -165
pixeltable/functions/string.py +212 -58
pixeltable/functions/together.py +22 -80
pixeltable/globals.py +10 -4
pixeltable/index/base.py +5 -0
pixeltable/index/btree.py +5 -0
pixeltable/index/embedding_index.py +5 -0
pixeltable/io/external_store.py +10 -31
pixeltable/io/label_studio.py +5 -5
pixeltable/io/parquet.py +4 -4
pixeltable/io/table_data_conduit.py +1 -32
pixeltable/metadata/__init__.py +11 -2
pixeltable/metadata/converters/convert_13.py +2 -2
pixeltable/metadata/converters/convert_30.py +6 -11
pixeltable/metadata/converters/convert_35.py +9 -0
pixeltable/metadata/converters/convert_36.py +38 -0
pixeltable/metadata/converters/convert_37.py +15 -0
pixeltable/metadata/converters/util.py +3 -9
pixeltable/metadata/notes.py +3 -0
pixeltable/metadata/schema.py +13 -1
pixeltable/plan.py +135 -12
pixeltable/share/packager.py +321 -20
pixeltable/share/publish.py +2 -2
pixeltable/store.py +31 -13
pixeltable/type_system.py +30 -0
pixeltable/utils/dbms.py +1 -1
pixeltable/utils/formatter.py +64 -42
{pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/METADATA +2 -1
{pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/RECORD +79 -74
{pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/LICENSE +0 -0
{pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/WHEEL +0 -0
{pixeltable-0.3.14.dist-info → pixeltable-0.4.0.dist-info}/entry_points.txt +0 -0

pixeltable/exec/sql_node.py CHANGED Viewed

@@ -14,6 +14,7 @@ from .exec_node import ExecNode
 if TYPE_CHECKING:
     import pixeltable.plan
+    from pixeltable.plan import SampleClause
 _logger = logging.getLogger('pixeltable')
@@ -64,8 +65,12 @@ def print_order_by_clause(clause: OrderByClause) -> str:
 class SqlNode(ExecNode):
     """
-    Materializes data from the store via a Select stmt.
+    Materializes data from the store via an SQL statement.
     This only provides the select list. The subclasses are responsible for the From clause and any additional clauses.
+    The pk columns are not included in the select list.
+    If set_pk is True, they are added to the end of the result set when creating the SQL statement
+    so they can always be referenced as cols[-num_pk_cols:] in the result set.
+    The pk_columns consist of the rowid columns of the target table followed by the version number.
     """
     tbl: Optional[catalog.TableVersionPath]
@@ -122,6 +127,7 @@ class SqlNode(ExecNode):
             # we also need to retrieve the pk columns
             assert tbl is not None
             self.num_pk_cols = len(tbl.tbl_version.get().store_tbl.pk_columns())
+            assert self.num_pk_cols > 1
         # additional state
         self.result_cursor = None
@@ -134,14 +140,25 @@ class SqlNode(ExecNode):
         self.where_clause_element = None
         self.order_by_clause = []
+        if self.tbl is not None:
+            tv = self.tbl.tbl_version._tbl_version
+            if tv is not None:
+                assert tv.is_validated
+    def _create_pk_cols(self) -> list[sql.Column]:
+        """Create a list of pk columns"""
+        # we need to retrieve the pk columns
+        if self.set_pk:
+            assert self.tbl is not None
+            assert self.tbl.tbl_version.get().is_validated
+            return self.tbl.tbl_version.get().store_tbl.pk_columns()
+        return []
     def _create_stmt(self) -> sql.Select:
         """Create Select from local state"""
         assert self.sql_elements.contains_all(self.select_list)
-        sql_select_list = [self.sql_elements.get(e) for e in self.select_list]
-        if self.set_pk:
-            assert self.tbl is not None
-            sql_select_list += self.tbl.tbl_version.get().store_tbl.pk_columns()
+        sql_select_list = [self.sql_elements.get(e) for e in self.select_list] + self._create_pk_cols()
         stmt = sql.select(*sql_select_list)
         where_clause_element = (
@@ -167,9 +184,10 @@ class SqlNode(ExecNode):
     def _ordering_tbl_ids(self) -> set[UUID]:
         return exprs.Expr.all_tbl_ids(e for e, _ in self.order_by_clause)
-    def to_cte(self) -> Optional[tuple[sql.CTE, exprs.ExprDict[sql.ColumnElement]]]:
+    def to_cte(self, keep_pk: bool = False) -> Optional[tuple[sql.CTE, exprs.ExprDict[sql.ColumnElement]]]:
         """
-        Returns a CTE that materializes the output of this node plus a mapping from select list expr to output column
+        Creates a CTE that materializes the output of this node plus a mapping from select list expr to output column.
+        keep_pk: if True, the PK columns are included in the CTE Select statement
         Returns:
             (CTE, dict from Expr to output column)
@@ -177,11 +195,13 @@ class SqlNode(ExecNode):
         if self.py_filter is not None:
             # the filter needs to run in Python
             return None
-        self.set_pk = False  # we don't need the PK if we use this SqlNode as a CTE
         if self.cte is None:
+            if not keep_pk:
+                self.set_pk = False  # we don't need the PK if we use this SqlNode as a CTE
             self.cte = self._create_stmt().cte()
-            assert len(self.cte.c) == len(self.select_list)
-        return self.cte, exprs.ExprDict(zip(self.select_list, self.cte.c))
+        pk_count = self.num_pk_cols if self.set_pk else 0
+        assert len(self.select_list) + pk_count == len(self.cte.c)
+        return self.cte, exprs.ExprDict(zip(self.select_list, self.cte.c))  # skip pk cols
     @classmethod
     def retarget_rowid_refs(cls, target: catalog.TableVersionPath, expr_seq: Iterable[exprs.Expr]) -> None:
@@ -220,26 +240,29 @@ class SqlNode(ExecNode):
                 joined_tbls.append(t)
         first = True
-        prev_tbl: Optional[catalog.TableVersionHandle] = None
+        prev_tv: Optional[catalog.TableVersion] = None
         for t in joined_tbls[::-1]:
+            tv = t.get()
+            # _logger.debug(f'create_from_clause: tbl_id={tv.id} {id(tv.store_tbl.sa_tbl)}')
             if first:
-                stmt = stmt.select_from(t.get().store_tbl.sa_tbl)
+                stmt = stmt.select_from(tv.store_tbl.sa_tbl)
                 first = False
             else:
-                # join tbl to prev_tbl on prev_tbl's rowid cols
-                prev_tbl_rowid_cols = prev_tbl.get().store_tbl.rowid_columns()
-                tbl_rowid_cols = t.get().store_tbl.rowid_columns()
+                # join tv to prev_tv on prev_tv's rowid cols
+                prev_tbl_rowid_cols = prev_tv.store_tbl.rowid_columns()
+                tbl_rowid_cols = tv.store_tbl.rowid_columns()
                 rowid_clauses = [
                     c1 == c2 for c1, c2 in zip(prev_tbl_rowid_cols, tbl_rowid_cols[: len(prev_tbl_rowid_cols)])
                 ]
-                stmt = stmt.join(t.get().store_tbl.sa_tbl, sql.and_(*rowid_clauses))
+                stmt = stmt.join(tv.store_tbl.sa_tbl, sql.and_(*rowid_clauses))
             if t.id in exact_version_only:
-                stmt = stmt.where(t.get().store_tbl.v_min_col == t.get().version)
+                stmt = stmt.where(tv.store_tbl.v_min_col == tv.version)
             else:
-                stmt = stmt.where(t.get().store_tbl.v_min_col <= t.get().version).where(
-                    t.get().store_tbl.v_max_col > t.get().version
-                )
-            prev_tbl = t
+                stmt = stmt.where(tv.store_tbl.sa_tbl.c.v_min <= tv.version)
+                stmt = stmt.where(tv.store_tbl.sa_tbl.c.v_max > tv.version)
+            prev_tv = tv
         return stmt
     def set_where(self, where_clause: exprs.Expr) -> None:
@@ -284,7 +307,8 @@ class SqlNode(ExecNode):
                 stmt_str = str(stmt.compile(compile_kwargs={'literal_binds': True}))
                 _logger.debug(f'SqlLookupNode stmt:\n{stmt_str}')
             except Exception:
-                pass
+                # log something if we can't log the compiled stmt
+                _logger.debug(f'SqlLookupNode proto-stmt:\n{stmt}')
             self._log_explain(stmt)
             conn = Env.get().conn
@@ -501,3 +525,145 @@ class SqlJoinNode(SqlNode):
                 full=join_clause == plan.JoinType.FULL_OUTER,
             )
         return stmt
+class SqlSampleNode(SqlNode):
+    """
+    Returns rows sampled from the input node.
+    """
+    input_cte: Optional[sql.CTE]
+    pk_count: int
+    stratify_exprs: Optional[list[exprs.Expr]]
+    sample_clause: 'SampleClause'
+    def __init__(
+        self,
+        row_builder: exprs.RowBuilder,
+        input: SqlNode,
+        select_list: Iterable[exprs.Expr],
+        sample_clause: 'SampleClause',
+        stratify_exprs: list[exprs.Expr],
+    ):
+        """
+        Args:
+            input: SqlNode to sample from
+            select_list: can contain calls to AggregateFunctions
+            sample_clause: specifies the sampling method
+            stratify_exprs: Analyzer processed list of expressions to stratify by.
+        """
+        assert isinstance(input, SqlNode)
+        self.input_cte, input_col_map = input.to_cte(keep_pk=True)
+        self.pk_count = input.num_pk_cols
+        assert self.pk_count > 1
+        sql_elements = exprs.SqlElementCache(input_col_map)
+        assert sql_elements.contains_all(stratify_exprs)
+        super().__init__(input.tbl, row_builder, select_list, sql_elements, set_pk=True)
+        self.stratify_exprs = stratify_exprs
+        self.sample_clause = sample_clause
+        assert isinstance(self.sample_clause.seed, int)
+    @classmethod
+    def key_sql_expr(cls, seed: sql.ColumnElement, sql_cols: Iterable[sql.ColumnElement]) -> sql.ColumnElement:
+        """Construct expression which is the ordering key for rows to be sampled
+        General SQL form is:
+        - MD5(<seed::text> [ + '___' + <rowid_col_val>::text]+
+        """
+        sql_expr: sql.ColumnElement = sql.cast(seed, sql.Text)
+        for e in sql_cols:
+            # Quotes are required below to guarantee that the string is properly presented in SQL
+            sql_expr = sql_expr + sql.literal_column("'___'", sql.Text) + sql.cast(e, sql.Text)
+        sql_expr = sql.func.md5(sql_expr)
+        return sql_expr
+    def _create_key_sql(self, cte: sql.CTE) -> sql.ColumnElement:
+        """Create an expression for randomly ordering rows with a given seed"""
+        rowid_cols = [*cte.c[-self.pk_count : -1]]  # exclude the version column
+        assert len(rowid_cols) > 0
+        return self.key_sql_expr(sql.literal_column(str(self.sample_clause.seed)), rowid_cols)
+    def _create_stmt(self) -> sql.Select:
+        from pixeltable.plan import SampleClause
+        if self.sample_clause.fraction is not None:
+            if len(self.stratify_exprs) == 0:
+                # If non-stratified sampling, construct a where clause, order_by, and limit clauses
+                s_key = self._create_key_sql(self.input_cte)
+                # Construct a suitable where clause
+                fraction_sql = sql.cast(SampleClause.fraction_to_md5_hex(float(self.sample_clause.fraction)), sql.Text)
+                order_by = self._create_key_sql(self.input_cte)
+                return sql.select(*self.input_cte.c).where(s_key < fraction_sql).order_by(order_by)
+            return self._create_stmt_stratified_fraction(self.sample_clause.fraction)
+        else:
+            if len(self.stratify_exprs) == 0:
+                # No stratification, just return n samples from the input CTE
+                order_by = self._create_key_sql(self.input_cte)
+                return sql.select(*self.input_cte.c).order_by(order_by).limit(self.sample_clause.n)
+            return self._create_stmt_stratified_n(self.sample_clause.n, self.sample_clause.n_per_stratum)
+    def _create_stmt_stratified_n(self, n: Optional[int], n_per_stratum: Optional[int]) -> sql.Select:
+        """Create a Select stmt that returns n samples across all strata or n_per_stratum samples per stratum"""
+        sql_strata_exprs = [self.sql_elements.get(e) for e in self.stratify_exprs]
+        order_by = self._create_key_sql(self.input_cte)
+        # Create a list of all columns plus the rank
+        # Get all columns from the input CTE dynamically
+        select_columns = [*self.input_cte.c]
+        select_columns.append(
+            sql.func.row_number().over(partition_by=sql_strata_exprs, order_by=order_by).label('rank')
+        )
+        row_rank_cte = sql.select(*select_columns).select_from(self.input_cte).cte('row_rank_cte')
+        final_columns = [*row_rank_cte.c[:-1]]  # exclude the rank column
+        if n_per_stratum is not None:
+            return sql.select(*final_columns).filter(row_rank_cte.c.rank <= n_per_stratum)
+        else:
+            secondary_order = self._create_key_sql(row_rank_cte)
+            return sql.select(*final_columns).order_by(row_rank_cte.c.rank, secondary_order).limit(n)
+    def _create_stmt_stratified_fraction(self, fraction_samples: float) -> sql.Select:
+        """Create a Select stmt that returns a fraction of the rows per strata"""
+        # Build the strata count CTE
+        # Produces a table of the form:
+        #   (*stratify_exprs, s_s_size)
+        # where s_s_size is the number of samples to take from each stratum
+        sql_strata_exprs = [self.sql_elements.get(e) for e in self.stratify_exprs]
+        per_strata_count_cte = (
+            sql.select(
+                *sql_strata_exprs,
+                sql.func.ceil(fraction_samples * sql.func.count(1).cast(sql.Integer)).label('s_s_size'),
+            )
+            .select_from(self.input_cte)
+            .group_by(*sql_strata_exprs)
+            .cte('per_strata_count_cte')
+        )
+        # Build a CTE that ranks the rows within each stratum
+        # Include all columns from the input CTE dynamically
+        order_by = self._create_key_sql(self.input_cte)
+        select_columns = [*self.input_cte.c]
+        select_columns.append(
+            sql.func.row_number().over(partition_by=sql_strata_exprs, order_by=order_by).label('rank')
+        )
+        row_rank_cte = sql.select(*select_columns).select_from(self.input_cte).cte('row_rank_cte')
+        # Build the join criterion dynamically to accommodate any number of stratify_by expressions
+        join_c = sql.true()
+        for col in per_strata_count_cte.c[:-1]:
+            join_c &= row_rank_cte.c[col.name].isnot_distinct_from(col)
+        # Join with per_strata_count_cte to limit returns to the requested fraction of rows
+        final_columns = [*row_rank_cte.c[:-1]]  # exclude the rank column
+        stmt = (
+            sql.select(*final_columns)
+            .select_from(row_rank_cte)
+            .join(per_strata_count_cte, join_c)
+            .where(row_rank_cte.c.rank <= per_strata_count_cte.c.s_s_size)
+        )
+        return stmt

pixeltable/exprs/column_property_ref.py CHANGED Viewed

@@ -58,20 +58,30 @@ class ColumnPropertyRef(Expr):
         if not self._col_ref.col.is_stored:
             return None
+        # we need to reestablish that we have the correct Column instance, there could have been a metadata
+        # reload since init()
+        # TODO: add an explicit prepare phase (ie, Expr.prepare()) that gives every subclass instance a chance to
+        # perform runtime checks and update state
+        tv = self._col_ref.tbl_version.get()
+        assert tv.is_validated
+        # we can assume at this point during query execution that the column exists
+        assert self._col_ref.col_id in tv.cols_by_id
+        col = tv.cols_by_id[self._col_ref.col_id]
         # the errortype/-msg properties of a read-validated media column need to be extracted from the DataRow
         if (
-            self._col_ref.col.col_type.is_media_type()
-            and self._col_ref.col.media_validation == catalog.MediaValidation.ON_READ
+            col.col_type.is_media_type()
+            and col.media_validation == catalog.MediaValidation.ON_READ
             and self.is_error_prop()
         ):
             return None
         if self.prop == self.Property.ERRORTYPE:
-            assert self._col_ref.col.sa_errortype_col is not None
-            return self._col_ref.col.sa_errortype_col
+            assert col.sa_errortype_col is not None
+            return col.sa_errortype_col
         if self.prop == self.Property.ERRORMSG:
-            assert self._col_ref.col.sa_errormsg_col is not None
-            return self._col_ref.col.sa_errormsg_col
+            assert col.sa_errormsg_col is not None
+            return col.sa_errormsg_col
         if self.prop == self.Property.FILEURL:
             # the file url is stored as the column value
             return sql_elements.get(self._col_ref)

pixeltable/exprs/column_ref.py CHANGED Viewed

@@ -52,6 +52,10 @@ class ColumnRef(Expr):
     id: int
     perform_validation: bool  # if True, performs media validation
+    # needed by sql_expr() to re-resolve Column instance after a metadata reload
+    tbl_version: catalog.TableVersionHandle
+    col_id: int
     def __init__(
         self,
         col: catalog.Column,
@@ -62,16 +66,17 @@ class ColumnRef(Expr):
         assert col.tbl is not None
         self.col = col
         self.reference_tbl = reference_tbl
-        self.is_unstored_iter_col = (
-            col.tbl.get().is_component_view and col.tbl.get().is_iterator_column(col) and not col.is_stored
-        )
+        self.tbl_version = catalog.TableVersionHandle(col.tbl.id, col.tbl.effective_version)
+        self.col_id = col.id
+        self.is_unstored_iter_col = col.tbl.is_component_view and col.tbl.is_iterator_column(col) and not col.is_stored
         self.iter_arg_ctx = None
         # number of rowid columns in the base table
-        self.base_rowid_len = col.tbl.get().base.get().num_rowid_columns() if self.is_unstored_iter_col else 0
+        self.base_rowid_len = col.tbl.base.get().num_rowid_columns() if self.is_unstored_iter_col else 0
         self.base_rowid = [None] * self.base_rowid_len
         self.iterator = None
         # index of the position column in the view's primary key; don't try to reference tbl.store_tbl here
-        self.pos_idx = col.tbl.get().num_rowid_columns() - 1 if self.is_unstored_iter_col else None
+        self.pos_idx = col.tbl.num_rowid_columns() - 1 if self.is_unstored_iter_col else None
         self.perform_validation = False
         if col.col_type.is_media_type():
@@ -175,7 +180,7 @@ class ColumnRef(Expr):
         assert len(idx_info) == 1
         col = copy.copy(next(iter(idx_info.values())).val_col)
         col.name = f'{self.col.name}_embedding_{idx if idx is not None else ""}'
-        col.create_sa_cols()
+        # col.create_sa_cols()
         return ColumnRef(col)
     def default_column_name(self) -> Optional[str]:
@@ -226,7 +231,7 @@ class ColumnRef(Expr):
     def _descriptors(self) -> DescriptionHelper:
         tbl = catalog.Catalog.get().get_table_by_id(self.col.tbl.id)
         helper = DescriptionHelper()
-        helper.append(f'Column\n{self.col.name!r}\n(of table {tbl._path!r})')
+        helper.append(f'Column\n{self.col.name!r}\n(of table {tbl._path()!r})')
         helper.append(tbl._col_descriptor([self.col.name]))
         idxs = tbl._index_descriptor([self.col.name])
         if len(idxs) > 0:
@@ -234,7 +239,19 @@ class ColumnRef(Expr):
         return helper
     def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
-        return None if self.perform_validation else self.col.sa_col
+        if self.perform_validation:
+            return None
+        # we need to reestablish that we have the correct Column instance, there could have been a metadata
+        # reload since init()
+        # TODO: add an explicit prepare phase (ie, Expr.prepare()) that gives every subclass instance a chance to
+        # perform runtime checks and update state
+        tv = self.tbl_version.get()
+        assert tv.is_validated
+        # we can assume at this point during query execution that the column exists
+        assert self.col_id in tv.cols_by_id
+        self.col = tv.cols_by_id[self.col_id]
+        assert self.col.tbl is tv
+        return self.col.sa_col
     def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
         if self.perform_validation:
@@ -275,7 +292,7 @@ class ColumnRef(Expr):
         if self.base_rowid != data_row.pk[: self.base_rowid_len]:
             row_builder.eval(data_row, self.iter_arg_ctx)
             iterator_args = data_row[self.iter_arg_ctx.target_slot_idxs[0]]
-            self.iterator = self.col.tbl.get().iterator_cls(**iterator_args)
+            self.iterator = self.col.tbl.iterator_cls(**iterator_args)
             self.base_rowid = data_row.pk[: self.base_rowid_len]
         self.iterator.set_pos(data_row.pk[self.pos_idx])
         res = next(self.iterator)
@@ -283,17 +300,22 @@ class ColumnRef(Expr):
     def _as_dict(self) -> dict:
         tbl = self.col.tbl
-        tbl_version = tbl.get().version if tbl.get().is_snapshot else None
+        version = tbl.version if tbl.is_snapshot else None
         # we omit self.components, even if this is a validating ColumnRef, because init() will recreate the
         # non-validating component ColumnRef
         return {
             'tbl_id': str(tbl.id),
-            'tbl_version': tbl_version,
+            'tbl_version': version,
             'col_id': self.col.id,
             'reference_tbl': self.reference_tbl.as_dict() if self.reference_tbl is not None else None,
             'perform_validation': self.perform_validation,
         }
+    @classmethod
+    def get_column_id(cls, d: dict) -> catalog.QColumnId:
+        tbl_id, col_id = UUID(d['tbl_id']), d['col_id']
+        return catalog.QColumnId(tbl_id, col_id)
     @classmethod
     def get_column(cls, d: dict) -> catalog.Column:
         tbl_id, version, col_id = UUID(d['tbl_id']), d['tbl_version'], d['col_id']

pixeltable/exprs/comparison.py CHANGED Viewed

@@ -81,7 +81,7 @@ class Comparison(Expr):
         if self.is_search_arg_comparison:
             # reference the index value column if there is an index and this is not a snapshot
             # (indices don't apply to snapshots)
-            tbl = self._op1.col.tbl.get()
+            tbl = self._op1.col.tbl
             idx_info = [
                 info for info in self._op1.col.get_idx_info().values() if isinstance(info.idx, index.BtreeIndex)
             ]

pixeltable/exprs/data_row.py CHANGED Viewed

@@ -214,6 +214,7 @@ class DataRow:
         """Assign in-memory cell value
         This allows overwriting
         """
+        assert isinstance(idx, int)
         assert self.excs[idx] is None
         if (idx in self.img_slot_idxs or idx in self.media_slot_idxs) and isinstance(val, str):
@@ -253,14 +254,15 @@ class DataRow:
         assert self.excs[index] is None
         if self.file_paths[index] is None:
             if filepath is not None:
-                # we want to save this to a file
-                self.file_paths[index] = filepath
-                self.file_urls[index] = urllib.parse.urljoin('file:', urllib.request.pathname2url(filepath))
                 image = self.vals[index]
                 assert isinstance(image, PIL.Image.Image)
                 # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
                 # In that case, use WebP instead.
                 format = 'webp' if image.has_transparency_data else 'jpeg'
+                if not filepath.endswith(f'.{format}'):
+                    filepath += f'.{format}'
+                self.file_paths[index] = filepath
+                self.file_urls[index] = urllib.parse.urljoin('file:', urllib.request.pathname2url(filepath))
                 image.save(filepath, format=format)
             else:
                 # we discard the content of this cell

pixeltable/exprs/expr.py CHANGED Viewed

@@ -276,6 +276,13 @@ class Expr(abc.ABC):
         tbl_versions = {tbl_version.id: tbl_version.get() for tbl_version in tbl.get_tbl_versions()}
         return self._retarget(tbl_versions)
+    @classmethod
+    def retarget_list(cls, expr_list: list[Expr], tbl: catalog.TableVersionPath) -> None:
+        """Retarget ColumnRefs in expr_list to the specific TableVersions in tbl."""
+        tbl_versions = {tbl_version.id: tbl_version.get() for tbl_version in tbl.get_tbl_versions()}
+        for i in range(len(expr_list)):
+            expr_list[i] = expr_list[i]._retarget(tbl_versions)
     def _retarget(self, tbl_versions: dict[UUID, catalog.TableVersion]) -> Self:
         for i in range(len(self.components)):
             self.components[i] = self.components[i]._retarget(tbl_versions)
@@ -387,17 +394,17 @@ class Expr(abc.ABC):
         return {tbl_id for e in exprs_ for tbl_id in e.tbl_ids()}
     @classmethod
-    def get_refd_columns(cls, expr_dict: dict[str, Any]) -> list[catalog.Column]:
+    def get_refd_column_ids(cls, expr_dict: dict[str, Any]) -> set[catalog.QColumnId]:
         """Return Columns referenced by expr_dict."""
-        result: list[catalog.Column] = []
+        result: set[catalog.QColumnId] = set()
         assert '_classname' in expr_dict
         from .column_ref import ColumnRef
         if expr_dict['_classname'] == 'ColumnRef':
-            result.append(ColumnRef.get_column(expr_dict))
+            result.add(ColumnRef.get_column_id(expr_dict))
         if 'components' in expr_dict:
             for component_dict in expr_dict['components']:
-                result.extend(cls.get_refd_columns(component_dict))
+                result.update(cls.get_refd_column_ids(component_dict))
         return result
     def as_literal(self) -> Optional[Expr]:

pixeltable/exprs/literal.py CHANGED Viewed

@@ -16,6 +16,8 @@ from .sql_element_cache import SqlElementCache
 class Literal(Expr):
+    val: Any
     def __init__(self, val: Any, col_type: Optional[ts.ColumnType] = None):
         if col_type is not None:
             val = col_type.create_literal(val)

pixeltable/exprs/row_builder.py CHANGED Viewed

@@ -172,13 +172,11 @@ class RowBuilder:
         def refs_unstored_iter_col(col_ref: ColumnRef) -> bool:
             tbl = col_ref.col.tbl
-            return (
-                tbl.get().is_component_view and tbl.get().is_iterator_column(col_ref.col) and not col_ref.col.is_stored
-            )
+            return tbl.is_component_view and tbl.is_iterator_column(col_ref.col) and not col_ref.col.is_stored
         unstored_iter_col_refs = [col_ref for col_ref in col_refs if refs_unstored_iter_col(col_ref)]
         component_views = [col_ref.col.tbl for col_ref in unstored_iter_col_refs]
-        unstored_iter_args = {view.id: view.get().iterator_args.copy() for view in component_views}
+        unstored_iter_args = {view.id: view.iterator_args.copy() for view in component_views}
         self.unstored_iter_args = {
             id: self._record_unique_expr(arg, recursive=True) for id, arg in unstored_iter_args.items()
         }
@@ -450,9 +448,9 @@ class RowBuilder:
             else:
                 if col.col_type.is_image_type() and data_row.file_urls[slot_idx] is None:
                     # we have yet to store this image
-                    filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.get().version))
+                    filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
                     data_row.flush_img(slot_idx, filepath)
-                val = data_row.get_stored_val(slot_idx, col.sa_col.type)
+                val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
                 table_row[col.store_name()] = val
                 # we unfortunately need to set these, even if there are no errors
                 table_row[col.errortype_store_name()] = None

pixeltable/exprs/rowid_ref.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import logging
 from typing import Any, Optional, cast
 from uuid import UUID
@@ -12,6 +13,8 @@ from .expr import Expr
 from .row_builder import RowBuilder
 from .sql_element_cache import SqlElementCache
+_logger = logging.getLogger('pixeltable')
 class RowidRef(Expr):
     """A reference to a part of a table rowid
@@ -97,10 +100,15 @@ class RowidRef(Expr):
     def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
         tbl = self.tbl.get() if self.tbl is not None else catalog.Catalog.get().get_tbl_version(self.tbl_id, None)
+        assert tbl.is_validated
         rowid_cols = tbl.store_tbl.rowid_columns()
         assert self.rowid_component_idx <= len(rowid_cols), (
             f'{self.rowid_component_idx} not consistent with {rowid_cols}'
         )
+        # _logger.debug(
+        #     f'RowidRef.sql_expr: tbl={tbl.id}{tbl.effective_version} sa_tbl={id(tbl.store_tbl.sa_tbl):x} '
+        #     f'tv={id(tbl):x}'
+        # )
         return rowid_cols[self.rowid_component_idx]
     def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:

pixeltable/exprs/similarity_expr.py CHANGED Viewed

@@ -54,6 +54,7 @@ class SimilarityExpr(Expr):
         return 'similarity'
     def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
+        # TODO: validate that the index still exists
         if not isinstance(self.components[1], Literal):
             raise excs.Error('similarity(): requires a string or a PIL.Image.Image object, not an expression')
         item = self.components[1].val

pixeltable/func/__init__.py CHANGED Viewed

@@ -5,6 +5,7 @@ from .callable_function import CallableFunction
 from .expr_template_function import ExprTemplateFunction
 from .function import Function, InvalidFunction
 from .function_registry import FunctionRegistry
+from .mcp import mcp_udfs
 from .query_template_function import QueryTemplateFunction, query, retrieval_udf
 from .signature import Batch, Parameter, Signature
 from .tools import Tool, ToolChoice, Tools

pixeltable/func/mcp.py ADDED Viewed

@@ -0,0 +1,74 @@
+import asyncio
+import inspect
+from typing import TYPE_CHECKING, Any, Optional
+import pixeltable as pxt
+from pixeltable import exceptions as excs, type_system as ts
+from pixeltable.func.signature import Parameter
+if TYPE_CHECKING:
+    import mcp
+def mcp_udfs(url: str) -> list['pxt.func.Function']:
+    return asyncio.run(mcp_udfs_async(url))
+async def mcp_udfs_async(url: str) -> list['pxt.func.Function']:
+    import mcp
+    from mcp.client.streamable_http import streamablehttp_client
+    list_tools_result: Optional[mcp.types.ListToolsResult] = None
+    async with (
+        streamablehttp_client(url) as (read_stream, write_stream, _),
+        mcp.ClientSession(read_stream, write_stream) as session,
+    ):
+        await session.initialize()
+        list_tools_result = await session.list_tools()
+    assert list_tools_result is not None
+    return [mcp_tool_to_udf(url, tool) for tool in list_tools_result.tools]
+def mcp_tool_to_udf(url: str, mcp_tool: 'mcp.types.Tool') -> 'pxt.func.Function':
+    import mcp
+    from mcp.client.streamable_http import streamablehttp_client
+    async def invoke(**kwargs: Any) -> str:
+        # TODO: Cache session objects rather than creating a new one each time?
+        async with (
+            streamablehttp_client(url) as (read_stream, write_stream, _),
+            mcp.ClientSession(read_stream, write_stream) as session,
+        ):
+            await session.initialize()
+            res = await session.call_tool(name=mcp_tool.name, arguments=kwargs)
+            # TODO Handle image/audio responses?
+            return res.content[0].text  # type: ignore[union-attr]
+    if mcp_tool.description is not None:
+        invoke.__doc__ = mcp_tool.description
+    input_schema = mcp_tool.inputSchema
+    params = {
+        name: __mcp_param_to_pxt_type(mcp_tool.name, name, param) for name, param in input_schema['properties'].items()
+    }
+    required = input_schema.get('required', [])
+    # Ensure that any params not appearing in `required` are nullable.
+    # (A required param might or might not be nullable, since its type might be an 'anyOf' containing a null.)
+    for name in params.keys() - required:
+        params[name] = params[name].copy(nullable=True)
+    signature = pxt.func.Signature(
+        return_type=ts.StringType(),  # Return type is always string
+        parameters=[Parameter(name, col_type, inspect.Parameter.KEYWORD_ONLY) for name, col_type in params.items()],
+    )
+    return pxt.func.CallableFunction(signatures=[signature], py_fns=[invoke], self_name=mcp_tool.name)
+def __mcp_param_to_pxt_type(tool_name: str, name: str, param: dict[str, Any]) -> ts.ColumnType:
+    pxt_type = ts.ColumnType.from_json_schema(param)
+    if pxt_type is None:
+        raise excs.Error(f'Unknown type schema for MCP parameter {name!r} of tool {tool_name!r}: {param}')
+    return pxt_type

pixeltable 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl

Potentially problematic release.

pixeltable 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl