PyPI - pixeltable - Versions diffs - 0.4.15__py3-none-any.whl → 0.4.16__py3-none-any.whl - Mend

pixeltable 0.4.15__py3-none-any.whl → 0.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (57)

pixeltable/__init__.py +4 -0
pixeltable/catalog/catalog.py +105 -51
pixeltable/catalog/column.py +7 -2
pixeltable/catalog/table.py +1 -0
pixeltable/catalog/table_metadata.py +4 -0
pixeltable/catalog/table_version.py +99 -78
pixeltable/catalog/table_version_handle.py +4 -1
pixeltable/config.py +6 -0
pixeltable/dataframe.py +10 -5
pixeltable/env.py +48 -19
pixeltable/exec/__init__.py +2 -0
pixeltable/exec/cell_materialization_node.py +231 -0
pixeltable/exec/cell_reconstruction_node.py +135 -0
pixeltable/exec/exec_node.py +1 -1
pixeltable/exec/expr_eval/evaluators.py +1 -0
pixeltable/exec/expr_eval/expr_eval_node.py +3 -0
pixeltable/exec/expr_eval/globals.py +2 -0
pixeltable/exec/globals.py +32 -0
pixeltable/exec/object_store_save_node.py +1 -4
pixeltable/exec/row_update_node.py +16 -9
pixeltable/exec/sql_node.py +107 -14
pixeltable/exprs/__init__.py +1 -1
pixeltable/exprs/arithmetic_expr.py +10 -11
pixeltable/exprs/column_property_ref.py +10 -10
pixeltable/exprs/column_ref.py +2 -2
pixeltable/exprs/data_row.py +106 -37
pixeltable/exprs/expr.py +9 -0
pixeltable/exprs/expr_set.py +14 -7
pixeltable/exprs/inline_expr.py +2 -19
pixeltable/exprs/json_path.py +45 -12
pixeltable/exprs/row_builder.py +54 -22
pixeltable/functions/__init__.py +1 -0
pixeltable/functions/bedrock.py +7 -0
pixeltable/functions/deepseek.py +11 -4
pixeltable/functions/llama_cpp.py +7 -0
pixeltable/functions/math.py +1 -1
pixeltable/functions/ollama.py +7 -0
pixeltable/functions/openai.py +4 -4
pixeltable/functions/openrouter.py +143 -0
pixeltable/globals.py +10 -4
pixeltable/io/globals.py +16 -15
pixeltable/io/table_data_conduit.py +46 -21
pixeltable/metadata/__init__.py +1 -1
pixeltable/metadata/converters/convert_40.py +73 -0
pixeltable/metadata/notes.py +1 -0
pixeltable/plan.py +175 -46
pixeltable/store.py +1 -1
pixeltable/type_system.py +5 -3
pixeltable/utils/console_output.py +4 -1
pixeltable/utils/exception_handler.py +5 -28
pixeltable/utils/image.py +7 -0
pixeltable/utils/misc.py +5 -0
{pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/METADATA +2 -1
{pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/RECORD +57 -50
{pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/WHEEL +0 -0
{pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/entry_points.txt +0 -0
{pixeltable-0.4.15.dist-info → pixeltable-0.4.16.dist-info}/licenses/LICENSE +0 -0

pixeltable/exec/sql_node.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import datetime
 import logging
 import warnings
 from decimal import Decimal
@@ -65,7 +66,7 @@ def print_order_by_clause(clause: OrderByClause) -> str:
 class SqlNode(ExecNode):
     """
-    Materializes data from the store via an SQL statement.
+    Materializes data from the store via a SQL statement.
     This only provides the select list. The subclasses are responsible for the From clause and any additional clauses.
     The pk columns are not included in the select list.
     If set_pk is True, they are added to the end of the result set when creating the SQL statement
@@ -82,6 +83,8 @@ class SqlNode(ExecNode):
     tbl: Optional[catalog.TableVersionPath]
     select_list: exprs.ExprSet
+    columns: list[catalog.Column]  # for which columns to populate DataRow.cell_vals/cell_md
+    cell_md_refs: list[exprs.ColumnPropertyRef]  # of ColumnRefs which also need DataRow.slot_cellmd for evaluation
     set_pk: bool
     num_pk_cols: int
     py_filter: Optional[exprs.Expr]  # a predicate that can only be run in Python
@@ -89,6 +92,12 @@ class SqlNode(ExecNode):
     cte: Optional[sql.CTE]
     sql_elements: exprs.SqlElementCache
+    # execution state
+    cellmd_item_idxs: exprs.ExprDict[int]  # cellmd expr -> idx in sql select list
+    column_item_idxs: dict[catalog.Column, int]  # column -> idx in sql select list
+    column_cellmd_item_idxs: dict[catalog.Column, int]  # column -> idx in sql select list
+    result_cursor: sql.engine.CursorResult | None
     # where_clause/-_element: allow subclass to set one or the other (but not both)
     where_clause: Optional[exprs.Expr]
     where_clause_element: Optional[sql.ColumnElement]
@@ -101,12 +110,22 @@ class SqlNode(ExecNode):
         tbl: Optional[catalog.TableVersionPath],
         row_builder: exprs.RowBuilder,
         select_list: Iterable[exprs.Expr],
+        columns: list[catalog.Column],
         sql_elements: exprs.SqlElementCache,
+        cell_md_col_refs: list[exprs.ColumnRef] | None = None,
         set_pk: bool = False,
     ):
         # create Select stmt
         self.sql_elements = sql_elements
         self.tbl = tbl
+        self.columns = columns
+        if cell_md_col_refs is not None:
+            assert all(ref.col.stores_cellmd for ref in cell_md_col_refs)
+            self.cell_md_refs = [
+                exprs.ColumnPropertyRef(ref, exprs.ColumnPropertyRef.Property.CELLMD) for ref in cell_md_col_refs
+            ]
+        else:
+            self.cell_md_refs = []
         self.select_list = exprs.ExprSet(select_list)
         # unstored iter columns: we also need to retrieve whatever is needed to materialize the iter args
         for iter_arg in row_builder.unstored_iter_args.values():
@@ -129,6 +148,9 @@ class SqlNode(ExecNode):
             assert self.num_pk_cols > 1
         # additional state
+        self.cellmd_item_idxs = exprs.ExprDict()
+        self.column_item_idxs = {}
+        self.column_cellmd_item_idxs = {}
         self.result_cursor = None
         # the filter is provided by the subclass
         self.py_filter = None
@@ -144,10 +166,9 @@ class SqlNode(ExecNode):
             if tv is not None:
                 assert tv.is_validated
-    def _create_pk_cols(self) -> list[sql.Column]:
-        """Create a list of pk columns"""
-        # we need to retrieve the pk columns
+    def _pk_col_items(self) -> list[sql.Column]:
         if self.set_pk:
+            # we need to retrieve the pk columns
             assert self.tbl is not None
             assert self.tbl.tbl_version.get().is_validated
             return self.tbl.tbl_version.get().store_tbl.pk_columns()
@@ -157,7 +178,19 @@ class SqlNode(ExecNode):
         """Create Select from local state"""
         assert self.sql_elements.contains_all(self.select_list)
-        sql_select_list = [self.sql_elements.get(e) for e in self.select_list] + self._create_pk_cols()
+        sql_select_list_exprs = exprs.ExprSet(self.select_list)
+        self.cellmd_item_idxs = exprs.ExprDict((ref, sql_select_list_exprs.add(ref)) for ref in self.cell_md_refs)
+        column_refs = [exprs.ColumnRef(col) for col in self.columns]
+        self.column_item_idxs = {col_ref.col: sql_select_list_exprs.add(col_ref) for col_ref in column_refs}
+        column_cellmd_refs = [
+            exprs.ColumnPropertyRef(col_ref, exprs.ColumnPropertyRef.Property.CELLMD)
+            for col_ref in column_refs
+            if col_ref.col.stores_cellmd
+        ]
+        self.column_cellmd_item_idxs = {
+            cellmd_ref.col_ref.col: sql_select_list_exprs.add(cellmd_ref) for cellmd_ref in column_cellmd_refs
+        }
+        sql_select_list = [self.sql_elements.get(e) for e in sql_select_list_exprs] + self._pk_col_items()
         stmt = sql.select(*sql_select_list)
         where_clause_element = (
@@ -198,9 +231,7 @@ class SqlNode(ExecNode):
             if not keep_pk:
                 self.set_pk = False  # we don't need the PK if we use this SqlNode as a CTE
             self.cte = self._create_stmt().cte()
-        pk_count = self.num_pk_cols if self.set_pk else 0
-        assert len(self.select_list) + pk_count == len(self.cte.c)
-        return self.cte, exprs.ExprDict(zip(self.select_list, self.cte.c))  # skip pk cols
+        return self.cte, exprs.ExprDict(zip(list(self.select_list) + self.cell_md_refs, self.cte.c))  # skip pk cols
     @classmethod
     def retarget_rowid_refs(cls, target: catalog.TableVersionPath, expr_seq: Iterable[exprs.Expr]) -> None:
@@ -318,24 +349,53 @@ class SqlNode(ExecNode):
         output_batch = DataRowBatch(self.row_builder)
         output_row: Optional[exprs.DataRow] = None
         num_rows_returned = 0
+        is_using_cockroachdb = Env.get().is_using_cockroachdb
+        tzinfo = Env.get().default_time_zone
         for sql_row in result_cursor:
             output_row = output_batch.add_row(output_row)
             # populate output_row
             if self.num_pk_cols > 0:
                 output_row.set_pk(tuple(sql_row[-self.num_pk_cols :]))
+            # column copies
+            for col, item_idx in self.column_item_idxs.items():
+                output_row.cell_vals[col.id] = sql_row[item_idx]
+            for col, item_idx in self.column_cellmd_item_idxs.items():
+                cell_md_dict = sql_row[item_idx]
+                output_row.cell_md[col.id] = exprs.CellMd(**cell_md_dict) if cell_md_dict is not None else None
+            # populate DataRow.slot_cellmd, where requested
+            for cellmd_ref, item_idx in self.cellmd_item_idxs.items():
+                cell_md_dict = sql_row[item_idx]
+                output_row.slot_md[cellmd_ref.col_ref.slot_idx] = (
+                    exprs.CellMd.from_dict(cell_md_dict) if cell_md_dict is not None else None
+                )
             # copy the output of the SQL query into the output row
             for i, e in enumerate(self.select_list):
                 slot_idx = e.slot_idx
-                # certain numerical operations can produce Decimals (eg, SUM(<int column>)); we need to convert them
                 if isinstance(sql_row[i], Decimal):
+                    # certain numerical operations can produce Decimals (eg, SUM(<int column>)); we need to convert them
                     if e.col_type.is_int_type():
                         output_row[slot_idx] = int(sql_row[i])
                     elif e.col_type.is_float_type():
                         output_row[slot_idx] = float(sql_row[i])
                     else:
                         raise RuntimeError(f'Unexpected Decimal value for {e}')
+                elif is_using_cockroachdb and isinstance(sql_row[i], datetime.datetime):
+                    # Ensure that the datetime is timezone-aware and in the session time zone
+                    # cockroachDB returns timestamps in the session time zone, with numeric offset,
+                    # convert to the session time zone with the requested tzinfo for DST handling
+                    if e.col_type.is_timestamp_type():
+                        if isinstance(sql_row[i].tzinfo, datetime.timezone):
+                            output_row[slot_idx] = sql_row[i].astimezone(tz=tzinfo)
+                        else:
+                            output_row[slot_idx] = sql_row[i]
+                    else:
+                        raise RuntimeError(f'Unexpected datetime value for {e}')
                 else:
                     output_row[slot_idx] = sql_row[i]
@@ -387,11 +447,21 @@ class SqlScanNode(SqlNode):
         tbl: catalog.TableVersionPath,
         row_builder: exprs.RowBuilder,
         select_list: Iterable[exprs.Expr],
+        columns: list[catalog.Column],
+        cell_md_col_refs: list[exprs.ColumnRef] | None = None,
         set_pk: bool = False,
         exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
     ):
         sql_elements = exprs.SqlElementCache()
-        super().__init__(tbl, row_builder, select_list, sql_elements, set_pk=set_pk)
+        super().__init__(
+            tbl,
+            row_builder,
+            select_list,
+            columns=columns,
+            sql_elements=sql_elements,
+            set_pk=set_pk,
+            cell_md_col_refs=cell_md_col_refs,
+        )
         # create Select stmt
         if exact_version_only is None:
             exact_version_only = []
@@ -423,11 +493,21 @@ class SqlLookupNode(SqlNode):
         tbl: catalog.TableVersionPath,
         row_builder: exprs.RowBuilder,
         select_list: Iterable[exprs.Expr],
+        columns: list[catalog.Column],
         sa_key_cols: list[sql.Column],
         key_vals: list[tuple],
+        cell_md_col_refs: list[exprs.ColumnRef] | None = None,
     ):
         sql_elements = exprs.SqlElementCache()
-        super().__init__(tbl, row_builder, select_list, sql_elements, set_pk=True)
+        super().__init__(
+            tbl,
+            row_builder,
+            select_list,
+            columns=columns,
+            sql_elements=sql_elements,
+            set_pk=True,
+            cell_md_col_refs=cell_md_col_refs,
+        )
         # Where clause: (key-col-1, key-col-2, ...) IN ((val-1, val-2, ...), ...)
         self.where_clause_element = sql.tuple_(*sa_key_cols).in_(key_vals)
@@ -460,9 +540,10 @@ class SqlAggregationNode(SqlNode):
         limit: Optional[int] = None,
         exact_version_only: Optional[list[catalog.TableVersion]] = None,
     ):
+        assert len(input.cell_md_refs) == 0  # there's no aggregation over json or arrays in SQL
         self.input_cte, input_col_map = input.to_cte()
         sql_elements = exprs.SqlElementCache(input_col_map)
-        super().__init__(None, row_builder, select_list, sql_elements)
+        super().__init__(None, row_builder, select_list, columns=[], sql_elements=sql_elements)
         self.group_by_items = group_by_items
     def _create_stmt(self) -> sql.Select:
@@ -498,7 +579,10 @@ class SqlJoinNode(SqlNode):
             input_cte, input_col_map = input_node.to_cte()
             self.input_ctes.append(input_cte)
             sql_elements.extend(input_col_map)
-        super().__init__(None, row_builder, select_list, sql_elements)
+        cell_md_col_refs = [cell_md_ref.col_ref for input in inputs for cell_md_ref in input.cell_md_refs]
+        super().__init__(
+            None, row_builder, select_list, columns=[], sql_elements=sql_elements, cell_md_col_refs=cell_md_col_refs
+        )
     def _create_stmt(self) -> sql.Select:
         from pixeltable import plan
@@ -552,7 +636,16 @@ class SqlSampleNode(SqlNode):
         assert self.pk_count > 1
         sql_elements = exprs.SqlElementCache(input_col_map)
         assert sql_elements.contains_all(stratify_exprs)
-        super().__init__(input.tbl, row_builder, select_list, sql_elements, set_pk=True)
+        cell_md_col_refs = [cell_md_ref.col_ref for cell_md_ref in input.cell_md_refs]
+        super().__init__(
+            input.tbl,
+            row_builder,
+            select_list,
+            columns=[],
+            sql_elements=sql_elements,
+            cell_md_col_refs=cell_md_col_refs,
+            set_pk=True,
+        )
         self.stratify_exprs = stratify_exprs
         self.sample_clause = sample_clause
         assert isinstance(self.sample_clause.seed, int)

pixeltable/exprs/__init__.py CHANGED Viewed

@@ -6,7 +6,7 @@ from .column_property_ref import ColumnPropertyRef
 from .column_ref import ColumnRef
 from .comparison import Comparison
 from .compound_predicate import CompoundPredicate
-from .data_row import DataRow
+from .data_row import ArrayMd, CellMd, DataRow
 from .expr import Expr
 from .expr_dict import ExprDict
 from .expr_set import ExprSet

pixeltable/exprs/arithmetic_expr.py CHANGED Viewed

@@ -72,15 +72,16 @@ class ArithmeticExpr(Expr):
             return left * right
         if self.operator == ArithmeticOperator.DIV:
             assert self.col_type.is_float_type()
-            # Avoid DivisionByZero: if right is 0, make this a NULL
+            # Avoid division by zero errors by converting any zero divisor to NULL.
             # TODO: Should we cast the NULLs to NaNs when they are retrieved back into Python?
-            nullif = sql.sql.func.nullif(right, 0)
-            # We have to cast to a `float`, or else we'll get a `Decimal`
-            return sql.sql.expression.cast(left / nullif, self.col_type.to_sa_type())
+            # These casts cause the computation to take place in float units, rather than DECIMAL.
+            nullif = sql.cast(sql.func.nullif(right, 0), self.col_type.to_sa_type())
+            return sql.cast(left, self.col_type.to_sa_type()) / nullif
         if self.operator == ArithmeticOperator.MOD:
             if self.col_type.is_int_type():
-                nullif = sql.sql.func.nullif(right, 0)
-                return left % nullif
+                # Avoid division by zero errors by converting any zero divisor to NULL.
+                nullif1 = sql.cast(sql.func.nullif(right, 0), self.col_type.to_sa_type())
+                return left % nullif1
             if self.col_type.is_float_type():
                 # Postgres does not support modulus for floats
                 return None
@@ -90,11 +91,9 @@ class ArithmeticExpr(Expr):
             # We need the behavior to be consistent, so that expressions will evaluate the same way
             # whether or not their operands can be translated to SQL. These SQL clauses should
             # mimic the behavior of Python's // operator.
-            nullif = sql.sql.func.nullif(right, 0)
-            if self.col_type.is_int_type():
-                return sql.sql.expression.cast(sql.func.floor(left / nullif), self.col_type.to_sa_type())
-            if self.col_type.is_float_type():
-                return sql.sql.expression.cast(sql.func.floor(left / nullif), self.col_type.to_sa_type())
+            # Avoid division by zero errors by converting any zero divisor to NULL.
+            nullif = sql.cast(sql.func.nullif(right, 0), self.col_type.to_sa_type())
+            return sql.func.floor(sql.cast(left, self.col_type.to_sa_type()) / nullif)
         raise AssertionError()
     def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:

pixeltable/exprs/column_property_ref.py CHANGED Viewed

@@ -44,21 +44,21 @@ class ColumnPropertyRef(Expr):
         return [*super()._id_attrs(), ('prop', self.prop.value)]
     @property
-    def _col_ref(self) -> ColumnRef:
+    def col_ref(self) -> ColumnRef:
         col_ref = self.components[0]
         assert isinstance(col_ref, ColumnRef)
         return col_ref
     def __repr__(self) -> str:
-        return f'{self._col_ref}.{self.prop.name.lower()}'
+        return f'{self.col_ref}.{self.prop.name.lower()}'
     def is_cellmd_prop(self) -> bool:
         return self.prop in (self.Property.ERRORTYPE, self.Property.ERRORMSG, self.Property.CELLMD)
     def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
-        if not self._col_ref.col_handle.get().is_stored:
+        if not self.col_ref.col_handle.get().is_stored:
             return None
-        col = self._col_ref.col_handle.get()
+        col = self.col_ref.col_handle.get()
         # the errortype/-msg properties of a read-validated media column need to be extracted from the DataRow
         if (
@@ -77,7 +77,7 @@ class ColumnPropertyRef(Expr):
             return col.sa_cellmd_col
         if self.prop == self.Property.FILEURL:
             # the file url is stored as the column value
-            return sql_elements.get(self._col_ref)
+            return sql_elements.get(self.col_ref)
         return None
     @classmethod
@@ -87,15 +87,15 @@ class ColumnPropertyRef(Expr):
     def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
         if self.prop == self.Property.FILEURL:
-            assert data_row.has_val[self._col_ref.slot_idx]
-            data_row[self.slot_idx] = data_row.file_urls[self._col_ref.slot_idx]
+            assert data_row.has_val[self.col_ref.slot_idx]
+            data_row[self.slot_idx] = data_row.file_urls[self.col_ref.slot_idx]
             return
         elif self.prop == self.Property.LOCALPATH:
-            assert data_row.has_val[self._col_ref.slot_idx]
-            data_row[self.slot_idx] = data_row.file_paths[self._col_ref.slot_idx]
+            assert data_row.has_val[self.col_ref.slot_idx]
+            data_row[self.slot_idx] = data_row.file_paths[self.col_ref.slot_idx]
             return
         elif self.is_cellmd_prop():
-            exc = data_row.get_exc(self._col_ref.slot_idx)
+            exc = data_row.get_exc(self.col_ref.slot_idx)
             if exc is None:
                 data_row[self.slot_idx] = None
             elif self.prop == self.Property.ERRORTYPE:

pixeltable/exprs/column_ref.py CHANGED Viewed

@@ -123,8 +123,8 @@ class ColumnRef(Expr):
             name == ColumnPropertyRef.Property.ERRORTYPE.name.lower()
             or name == ColumnPropertyRef.Property.ERRORMSG.name.lower()
         ):
-            property_is_present = self.col.stores_cellmd
-            if not property_is_present:
+            is_valid = (self.col.is_computed or self.col.col_type.is_media_type()) and self.col.is_stored
+            if not is_valid:
                 raise excs.Error(f'{name} only valid for a stored computed or media column: {self}')
             return ColumnPropertyRef(self, ColumnPropertyRef.Property[name.upper()])
         if (

pixeltable/exprs/data_row.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import dataclasses
 import datetime
 import io
 import urllib.parse
@@ -13,15 +14,72 @@ import PIL
 import PIL.Image
 import sqlalchemy as sql
+import pixeltable.utils.image as image_utils
 from pixeltable import catalog, env
 from pixeltable.utils.local_store import TempStore
+from pixeltable.utils.misc import non_none_dict_factory
+@dataclasses.dataclass
+class ArrayMd:
+    """
+    Metadata for array cells that are stored externally.
+    """
+    start: int
+    end: int
+    # we store bool arrays as packed bits (uint8 arrays), and need to record the shape to reconstruct the array
+    is_bool: bool = False
+    shape: tuple[int, ...] | None = None
+    def as_dict(self) -> dict:
+        # dict_factory: suppress Nones
+        x = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
+        return x
+@dataclasses.dataclass
+class CellMd:
+    """
+    Content of the cellmd column.
+    All fields are optional, to minimize storage.
+    """
+    errortype: str | None = None
+    errormsg: str | None = None
+    # a list of file urls that are used to store images and arrays; only set for json and array columns
+    # for json columns: a list of all urls referenced in the column value
+    # for array columns: a single url
+    file_urls: list[str] | None = None
+    array_md: ArrayMd | None = None
+    @classmethod
+    def from_dict(cls, d: dict) -> CellMd:
+        x: CellMd
+        if 'array_md' in d:
+            d2 = d.copy()
+            del d2['array_md']
+            x = cls(**d2, array_md=ArrayMd(**d['array_md']))
+        else:
+            x = cls(**d)
+        return x
+    def as_dict(self) -> dict:
+        x = dataclasses.asdict(self, dict_factory=non_none_dict_factory)
+        return x
 class DataRow:
     """
     Encapsulates all data and execution state needed by RowBuilder and DataRowBatch:
     - state for in-memory computation
-    - state for storing the data
+    - state needed for expression evaluation
+    - containers for output column values
     This is not meant to be a black-box abstraction.
     In-memory representations by column type:
@@ -39,79 +97,92 @@ class DataRow:
     - DocumentType: local path if available, otherwise url
     """
+    # expr evaluation state; indexed by slot idx
     vals: np.ndarray  # of object
     has_val: np.ndarray  # of bool
     excs: np.ndarray  # of object
-    # If `may_have_exc` is False, then we guarantee that no slot has an exception set. This is used to optimize
-    # exception handling under normal operation.
-    _may_have_exc: bool
-    # expr evaluation state; indexed by slot idx
     missing_slots: np.ndarray  # of bool; number of missing dependencies
     missing_dependents: np.ndarray  # of int16; number of missing dependents
     is_scheduled: np.ndarray  # of bool; True if this slot is scheduled for evaluation
-    # control structures that are shared across all DataRows in a batch
-    img_slot_idxs: list[int]
-    media_slot_idxs: list[int]
-    array_slot_idxs: list[int]
-    # the primary key of a store row is a sequence of ints (the number is different for table vs view)
-    pk: Optional[tuple[int, ...]]
+    # CellMd needed for query execution; needs to be indexed by slot idx, not column id, to work for joins
+    slot_md: dict[int, CellMd]
     # file_urls:
     # - stored url of file for media in vals[i]
     # - None if vals[i] is not media type
     # - not None if file_paths[i] is not None
+    # TODO: this is a sparse vector; should it be a dict[int, str]?
     file_urls: np.ndarray  # of str
     # file_paths:
     # - local path of media file in vals[i]; points to the file cache if file_urls[i] is remote
     # - None if vals[i] is not a media type or if there is no local file yet for file_urls[i]
+    # TODO: this is a sparse vector; should it be a dict[int, str]?
     file_paths: np.ndarray  # of str
+    # If `may_have_exc` is False, then we guarantee that no slot has an exception set. This is used to optimize
+    # exception handling under normal operation.
+    _may_have_exc: bool
+    # the primary key of a store row is a sequence of ints (the number is different for table vs view)
+    pk: Optional[tuple[int, ...]]
     # for nested rows (ie, those produced by JsonMapperDispatcher)
     parent_row: Optional[DataRow]
     parent_slot_idx: Optional[int]
+    # state for table output (insert()/update()); key: column id
+    cell_vals: dict[int, Any]  # materialized values of output columns, in the format required for the column
+    cell_md: dict[int, CellMd]
+    # control structures that are shared across all DataRows in a batch
+    img_slot_idxs: list[int]
+    media_slot_idxs: list[int]
+    array_slot_idxs: list[int]
+    json_slot_idxs: list[int]
     def __init__(
         self,
         size: int,
         img_slot_idxs: list[int],
         media_slot_idxs: list[int],
         array_slot_idxs: list[int],
+        json_slot_idxs: list[int],
         parent_row: Optional[DataRow] = None,
         parent_slot_idx: Optional[int] = None,
     ):
-        self.img_slot_idxs = img_slot_idxs
-        self.media_slot_idxs = media_slot_idxs
-        self.array_slot_idxs = array_slot_idxs
         self.init(size)
         self.parent_row = parent_row
         self.parent_slot_idx = parent_slot_idx
-    def init(self, num_slots: int) -> None:
-        self.vals = np.full(num_slots, None, dtype=object)
-        self.has_val = np.zeros(num_slots, dtype=bool)
-        self.excs = np.full(num_slots, None, dtype=object)
+        self.img_slot_idxs = img_slot_idxs
+        self.media_slot_idxs = media_slot_idxs
+        self.array_slot_idxs = array_slot_idxs
+        self.json_slot_idxs = json_slot_idxs
+    def init(self, size: int) -> None:
+        self.vals = np.full(size, None, dtype=object)
+        self.has_val = np.zeros(size, dtype=bool)
+        self.excs = np.full(size, None, dtype=object)
+        self.missing_slots = np.zeros(size, dtype=bool)
+        self.missing_dependents = np.zeros(size, dtype=np.int16)
+        self.is_scheduled = np.zeros(size, dtype=bool)
+        self.slot_md = {}
+        self.file_urls = np.full(size, None, dtype=object)
+        self.file_paths = np.full(size, None, dtype=object)
         self._may_have_exc = False
-        self.missing_slots = np.zeros(num_slots, dtype=bool)
-        self.missing_dependents = np.zeros(num_slots, dtype=np.int16)
-        self.is_scheduled = np.zeros(num_slots, dtype=bool)
+        self.cell_vals = {}
+        self.cell_md = {}
         self.pk = None
-        self.file_urls = np.full(num_slots, None, dtype=object)
-        self.file_paths = np.full(num_slots, None, dtype=object)
         self.parent_row = None
         self.parent_slot_idx = None
-    def clear(self, idxs: Optional[np.ndarray] = None) -> None:
-        if idxs is not None:
-            self.has_val[idxs] = False
-            self.vals[idxs] = None
-            self.excs[idxs] = None
-            self.file_urls[idxs] = None
-            self.file_paths[idxs] = None
+    def clear(self, slot_idxs: Optional[np.ndarray] = None) -> None:
+        if slot_idxs is not None:
+            self.has_val[slot_idxs] = False
+            self.vals[slot_idxs] = None
+            self.excs[slot_idxs] = None
+            self.file_urls[slot_idxs] = None
+            self.file_paths[slot_idxs] = None
         else:
             self.init(len(self.vals))
@@ -292,9 +363,7 @@ class DataRow:
         val = self.vals[index]
         format = None
         if isinstance(val, PIL.Image.Image):
-            # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
-            # In that case, use WebP instead.
-            format = 'webp' if val.has_transparency_data else 'jpeg'
+            format = image_utils.default_format(val)
         filepath, url = TempStore.save_media_object(val, col, format=format)
         self.file_paths[index] = str(filepath) if filepath is not None else None
         self.vals[index] = None

pixeltable/exprs/expr.py CHANGED Viewed

@@ -368,6 +368,15 @@ class Expr(abc.ABC):
         for e in expr_list:
             yield from e.subexprs(expr_class=expr_class, filter=filter, traverse_matches=traverse_matches)
+    @classmethod
+    def list_contains(
+        cls,
+        expr_list: Iterable[Expr],
+        expr_class: type[Expr] | None = None,
+        filter: Callable[[Expr], bool] | None = None,
+    ) -> bool:
+        return any(e._contains(expr_class, filter) for e in expr_list)
     def _contains(self, cls: Optional[type[Expr]] = None, filter: Optional[Callable[[Expr], bool]] = None) -> bool:
         """
         Returns True if any subexpr is an instance of cls and/or matches filter.

pixeltable/exprs/expr_set.py CHANGED Viewed

@@ -9,26 +9,33 @@ T = TypeVar('T', bound='Expr')
 class ExprSet(Generic[T]):
     """
-    A set that also supports indexed lookup (by slot_idx and Expr.id). Exprs are uniquely identified by Expr.id.
+    An ordered set that also supports indexed lookup (by slot_idx and Expr.id). Exprs are uniquely identified by
+    Expr.id.
     """
     exprs: dict[int, T]  # key: Expr.id
+    expr_offsets: dict[int, int]  # key: Expr.id, value: offset into self.exprs.keys()
     exprs_by_idx: dict[int, T]  # key: slot_idx
     def __init__(self, elements: Optional[Iterable[T]] = None):
         self.exprs = {}
+        self.expr_offsets = {}
         self.exprs_by_idx = {}
         if elements is not None:
             for e in elements:
                 self.add(e)
-    def add(self, expr: T) -> None:
-        if expr.id in self.exprs:
-            return
+    def add(self, expr: T) -> int:
+        """Returns offset corresponding to iteration order"""
+        offset = self.expr_offsets.get(expr.id)
+        if offset is not None:
+            return offset
+        offset = len(self.exprs)
         self.exprs[expr.id] = expr
-        if expr.slot_idx is None:
-            return
-        self.exprs_by_idx[expr.slot_idx] = expr
+        self.expr_offsets[expr.id] = offset
+        if expr.slot_idx is not None:
+            self.exprs_by_idx[expr.slot_idx] = expr
+        return offset
     def update(self, *others: Iterable[T]) -> None:
         for other in others:

pixeltable 0.4.15__py3-none-any.whl → 0.4.16__py3-none-any.whl

Potentially problematic release.

pixeltable 0.4.15__py3-none-any.whl → 0.4.16__py3-none-any.whl