pixeltable 0.2.21__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

This version of pixeltable has been flagged as potentially problematic.

Files changed (82)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/__init__.py +1 -1
  3. pixeltable/catalog/column.py +37 -11
  4. pixeltable/catalog/globals.py +18 -0
  5. pixeltable/catalog/insertable_table.py +6 -4
  6. pixeltable/catalog/table.py +19 -3
  7. pixeltable/catalog/table_version.py +34 -14
  8. pixeltable/catalog/view.py +16 -17
  9. pixeltable/dataframe.py +7 -8
  10. pixeltable/env.py +5 -0
  11. pixeltable/exec/__init__.py +0 -1
  12. pixeltable/exec/aggregation_node.py +6 -3
  13. pixeltable/exec/cache_prefetch_node.py +1 -1
  14. pixeltable/exec/data_row_batch.py +2 -19
  15. pixeltable/exec/exec_node.py +2 -1
  16. pixeltable/exec/expr_eval_node.py +17 -10
  17. pixeltable/exec/in_memory_data_node.py +6 -3
  18. pixeltable/exec/sql_node.py +24 -25
  19. pixeltable/exprs/arithmetic_expr.py +3 -1
  20. pixeltable/exprs/array_slice.py +7 -7
  21. pixeltable/exprs/column_property_ref.py +37 -10
  22. pixeltable/exprs/column_ref.py +93 -14
  23. pixeltable/exprs/comparison.py +5 -5
  24. pixeltable/exprs/compound_predicate.py +8 -7
  25. pixeltable/exprs/data_row.py +27 -18
  26. pixeltable/exprs/expr.py +53 -52
  27. pixeltable/exprs/expr_set.py +5 -0
  28. pixeltable/exprs/function_call.py +32 -16
  29. pixeltable/exprs/globals.py +4 -1
  30. pixeltable/exprs/in_predicate.py +8 -7
  31. pixeltable/exprs/inline_expr.py +4 -4
  32. pixeltable/exprs/is_null.py +4 -4
  33. pixeltable/exprs/json_mapper.py +11 -12
  34. pixeltable/exprs/json_path.py +5 -10
  35. pixeltable/exprs/literal.py +5 -5
  36. pixeltable/exprs/method_ref.py +5 -4
  37. pixeltable/exprs/object_ref.py +2 -1
  38. pixeltable/exprs/row_builder.py +88 -36
  39. pixeltable/exprs/rowid_ref.py +12 -11
  40. pixeltable/exprs/similarity_expr.py +12 -7
  41. pixeltable/exprs/sql_element_cache.py +7 -5
  42. pixeltable/exprs/type_cast.py +8 -6
  43. pixeltable/exprs/variable.py +5 -4
  44. pixeltable/func/aggregate_function.py +1 -1
  45. pixeltable/func/function.py +11 -10
  46. pixeltable/functions/__init__.py +2 -2
  47. pixeltable/functions/globals.py +5 -7
  48. pixeltable/functions/huggingface.py +19 -20
  49. pixeltable/functions/llama_cpp.py +106 -0
  50. pixeltable/functions/ollama.py +147 -0
  51. pixeltable/functions/replicate.py +72 -0
  52. pixeltable/functions/string.py +9 -0
  53. pixeltable/globals.py +12 -20
  54. pixeltable/index/btree.py +16 -3
  55. pixeltable/index/embedding_index.py +4 -4
  56. pixeltable/io/__init__.py +1 -2
  57. pixeltable/io/fiftyone.py +178 -0
  58. pixeltable/io/globals.py +96 -2
  59. pixeltable/iterators/base.py +3 -2
  60. pixeltable/iterators/document.py +1 -1
  61. pixeltable/iterators/video.py +120 -63
  62. pixeltable/metadata/__init__.py +1 -1
  63. pixeltable/metadata/converters/convert_21.py +34 -0
  64. pixeltable/metadata/converters/util.py +45 -4
  65. pixeltable/metadata/notes.py +1 -0
  66. pixeltable/metadata/schema.py +8 -0
  67. pixeltable/plan.py +16 -14
  68. pixeltable/py.typed +0 -0
  69. pixeltable/store.py +7 -2
  70. pixeltable/tool/create_test_video.py +1 -1
  71. pixeltable/tool/embed_udf.py +1 -1
  72. pixeltable/tool/mypy_plugin.py +28 -5
  73. pixeltable/type_system.py +17 -1
  74. pixeltable/utils/documents.py +15 -1
  75. pixeltable/utils/formatter.py +9 -10
  76. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/METADATA +46 -10
  77. pixeltable-0.2.22.dist-info/RECORD +153 -0
  78. pixeltable/exec/media_validation_node.py +0 -43
  79. pixeltable-0.2.21.dist-info/RECORD +0 -148
  80. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
  81. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
  82. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0

pixeltable/exec/expr_eval_node.py
@@ -7,7 +7,7 @@ from typing import Iterable, List, Optional
 
 from tqdm import TqdmWarning, tqdm
 
-import pixeltable.exprs as exprs
+from pixeltable import exprs
 from pixeltable.func import CallableFunction
 
 from .data_row_batch import DataRowBatch
@@ -22,7 +22,7 @@ class ExprEvalNode(ExecNode):
     @dataclass
     class Cohort:
         """List of exprs that form an evaluation context and contain calls to at most one external function"""
-        exprs: List[exprs.Expr]
+        exprs_: List[exprs.Expr]
         batched_fn: Optional[CallableFunction]
         segment_ctxs: List['exprs.RowBuilder.EvalCtx']
         target_slot_idxs: List[int]
@@ -38,7 +38,7 @@ class ExprEvalNode(ExecNode):
         # we're only materializing exprs that are not already in the input
         self.target_exprs = [e for e in output_exprs if e.slot_idx not in input_slot_idxs]
         self.pbar: Optional[tqdm] = None
-        self.cohorts: List[List[ExprEvalNode.Cohort]] = []
+        self.cohorts: List[ExprEvalNode.Cohort] = []
         self._create_cohorts()
 
     def __next__(self) -> DataRowBatch:
@@ -88,6 +88,8 @@ class ExprEvalNode(ExecNode):
         for e in all_exprs:
             if not self._is_batched_fn_call(e):
                 continue
+            assert isinstance(e, exprs.FunctionCall)
+            assert isinstance(e.fn, CallableFunction)
             if current_batched_fn is None or current_batched_fn != e.fn:
                 # create a new cohort
                 cohorts.append([])
@@ -96,8 +98,8 @@ class ExprEvalNode(ExecNode):
 
         # expand the cohorts to include all exprs that are in the same evaluation context as the external calls;
         # cohorts are evaluated in order, so we can exclude the target slots from preceding cohorts and input slots
-        exclude = set([e.slot_idx for e in self.input_exprs])
-        all_target_slot_idxs = set([e.slot_idx for e in self.target_exprs])
+        exclude = set(e.slot_idx for e in self.input_exprs)
+        all_target_slot_idxs = set(e.slot_idx for e in self.target_exprs)
         target_slot_idxs: List[List[int]] = []  # the ones materialized by each cohort
         for i in range(len(cohorts)):
             cohorts[i] = self.row_builder.get_dependencies(
@@ -106,7 +108,7 @@ class ExprEvalNode(ExecNode):
                 [e.slot_idx for e in cohorts[i] if e.slot_idx in all_target_slot_idxs])
             exclude.update(target_slot_idxs[-1])
 
-        all_cohort_slot_idxs = set([e.slot_idx for cohort in cohorts for e in cohort])
+        all_cohort_slot_idxs = set(e.slot_idx for cohort in cohorts for e in cohort)
         remaining_slot_idxs = set(all_target_slot_idxs) - all_cohort_slot_idxs
         if len(remaining_slot_idxs) > 0:
             cohorts.append(self.row_builder.get_dependencies(
@@ -164,9 +166,10 @@ class ExprEvalNode(ExecNode):
                     rows[row_idx], segment_ctx, self.ctx.profile, ignore_errors=self.ctx.ignore_errors)
             else:
                 fn_call = segment_ctx.exprs[0]
+                assert isinstance(fn_call, exprs.FunctionCall)
                 # make a batched external function call
-                arg_batches = [[] for _ in range(len(fn_call.args))]
-                kwarg_batches = {k: [] for k in fn_call.kwargs.keys()}
+                arg_batches: list[list[exprs.Expr]] = [[] for _ in range(len(fn_call.args))]
+                kwarg_batches: dict[str, list[exprs.Expr]] = {k: [] for k in fn_call.kwargs.keys()}
 
                 valid_batch_idxs: List[int] = []  # rows with exceptions are not valid
                 for row_idx in range(batch_start_idx, batch_start_idx + num_batch_rows):
@@ -176,12 +179,15 @@ class ExprEvalNode(ExecNode):
                        continue
                    valid_batch_idxs.append(row_idx)
                    args, kwargs = fn_call._make_args(row)
-                    [arg_batches[i].append(args[i]) for i in range(len(args))]
-                    [kwarg_batches[k].append(kwargs[k]) for k in kwargs.keys()]
+                    for i in range(len(args)):
+                        arg_batches[i].append(args[i])
+                    for k in kwargs.keys():
+                        kwarg_batches[k].append(kwargs[k])
                 num_valid_batch_rows = len(valid_batch_idxs)
 
                 if ext_batch_size is None:
                     # we need to choose a batch size based on the args
+                    assert isinstance(fn_call.fn, CallableFunction)
                     sample_args = [arg_batches[i][0] for i in range(len(arg_batches))]
                     ext_batch_size = fn_call.fn.get_batch_size(*sample_args)
 
@@ -201,6 +207,7 @@ class ExprEvalNode(ExecNode):
                        for k in kwarg_batches.keys()
                    }
                    start_ts = time.perf_counter()
+                   assert isinstance(fn_call.fn, CallableFunction)
                    result_batch = fn_call.fn.exec_batch(*call_args, **call_kwargs)
                    self.ctx.profile.eval_time[fn_call.slot_idx] += time.perf_counter() - start_ts
                    self.ctx.profile.eval_count[fn_call.slot_idx] += num_ext_batch_rows
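
The change above also replaces list-comprehension side effects with plain loops when assembling the per-argument batches for a batched UDF call. The pattern itself is simple: collect each row's argument values into one list per argument, then make a single batched call instead of one call per row. A minimal standalone sketch of that pattern (the exec_batch stand-in and the row structure below are illustrative, not Pixeltable's actual API):

from typing import Any

def exec_batch(texts: list[str]) -> list[int]:
    # stand-in for a batched UDF: processes all rows in one call
    return [len(t) for t in texts]

rows: list[dict[str, Any]] = [{'text': 'a'}, {'text': 'bcd'}, {'text': 'ef'}]

# one list per positional argument, mirroring arg_batches above
arg_batches: list[list[Any]] = [[]]
for row in rows:
    arg_batches[0].append(row['text'])

# a single batched call replaces one call per row
results = exec_batch(arg_batches[0])
assert results == [1, 3, 2]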

pixeltable/exec/in_memory_data_node.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Optional, Iterator
+from typing import Any, Iterator, Optional
 
 import pixeltable.catalog as catalog
 import pixeltable.exprs as exprs
@@ -23,12 +23,15 @@ class InMemoryDataNode(ExecNode):
     start_row_id: int
     output_rows: Optional[DataRowBatch]
 
+    # output_exprs is declared in the superclass, but we redeclare it here with a more specific type
+    output_exprs: list[exprs.ColumnRef]
+
     def __init__(
         self, tbl: catalog.TableVersion, rows: list[dict[str, Any]],
         row_builder: exprs.RowBuilder, start_row_id: int,
     ):
-        # we materialize all output slots
-        output_exprs = [e for e in row_builder.get_output_exprs() if isinstance(e, exprs.ColumnRef)]
+        # we materialize the input slots
+        output_exprs = list(row_builder.input_exprs)
         super().__init__(row_builder, output_exprs, [], None)
         assert tbl.is_insertable()
         self.tbl = tbl

pixeltable/exec/sql_node.py
@@ -1,13 +1,14 @@
 import logging
 import warnings
 from decimal import Decimal
-from typing import Optional, Iterable, Iterator, NamedTuple
+from typing import Iterable, Iterator, NamedTuple, Optional
 from uuid import UUID
 
 import sqlalchemy as sql
 
 import pixeltable.catalog as catalog
 import pixeltable.exprs as exprs
+
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
 
@@ -100,7 +101,7 @@ class SqlNode(ExecNode):
         # minimize the number of tables that need to be joined to the target table
         self.retarget_rowid_refs(tbl, self.select_list)
 
-        assert self.sql_elements.contains(self.select_list)
+        assert self.sql_elements.contains_all(self.select_list)
         self.set_pk = set_pk
         self.num_pk_cols = 0
         if set_pk:
@@ -120,13 +121,13 @@ class SqlNode(ExecNode):
     def _create_stmt(self) -> sql.Select:
         """Create Select from local state"""
 
-        assert self.sql_elements.contains(self.select_list)
+        assert self.sql_elements.contains_all(self.select_list)
         sql_select_list = [self.sql_elements.get(e) for e in self.select_list]
         if self.set_pk:
             sql_select_list += self.tbl.tbl_version.store_tbl.pk_columns()
         stmt = sql.select(*sql_select_list)
 
-        order_by_clause: list[sql.ClauseElement] = []
+        order_by_clause: list[sql.ColumnElement] = []
         for e, asc in self.order_by_clause:
             if isinstance(e, exprs.SimilarityExpr):
                 order_by_clause.append(e.as_order_by_clause(asc))
@@ -141,7 +142,7 @@ class SqlNode(ExecNode):
         return stmt
 
     def _ordering_tbl_ids(self) -> set[UUID]:
-        return exprs.Expr.list_tbl_ids(e for e, _ in self.order_by_clause)
+        return exprs.Expr.all_tbl_ids(e for e, _ in self.order_by_clause)
 
     def to_cte(self) -> Optional[tuple[sql.CTE, exprs.ExprDict[sql.ColumnElement]]]:
         """
@@ -182,9 +183,9 @@ class SqlNode(ExecNode):
         """
         # we need to include at least the root
         if refd_tbl_ids is None:
-            refd_tbl_ids = {}
+            refd_tbl_ids = set()
         if exact_version_only is None:
-            exact_version_only = {}
+            exact_version_only = set()
         candidates = tbl.get_tbl_versions()
         assert len(candidates) > 0
         joined_tbls: list[catalog.TableVersion] = [candidates[0]]
@@ -193,6 +194,7 @@ class SqlNode(ExecNode):
             joined_tbls.append(tbl)
 
         first = True
+        prev_tbl: catalog.TableVersion
         for tbl in joined_tbls[::-1]:
            if first:
                stmt = stmt.select_from(tbl.store_tbl.sa_tbl)
@@ -239,22 +241,19 @@ class SqlNode(ExecNode):
     def __iter__(self) -> Iterator[DataRowBatch]:
         # run the query; do this here rather than in _open(), exceptions are only expected during iteration
         assert self.ctx.conn is not None
-        try:
-            with warnings.catch_warnings(record=True) as w:
-                stmt = self._create_stmt()
-                try:
-                    # log stmt, if possible
-                    stmt_str = str(stmt.compile(compile_kwargs={'literal_binds': True}))
-                    _logger.debug(f'SqlLookupNode stmt:\n{stmt_str}')
-                except Exception as e:
-                    pass
-                self._log_explain(stmt)
-
-                result_cursor = self.ctx.conn.execute(stmt)
-                for warning in w:
-                    pass
-        except Exception as e:
-            raise e
+        with warnings.catch_warnings(record=True) as w:
+            stmt = self._create_stmt()
+            try:
+                # log stmt, if possible
+                stmt_str = str(stmt.compile(compile_kwargs={'literal_binds': True}))
+                _logger.debug(f'SqlLookupNode stmt:\n{stmt_str}')
+            except Exception:
+                pass
+            self._log_explain(stmt)
+
+            result_cursor = self.ctx.conn.execute(stmt)
+            for warning in w:
+                pass
 
         tbl_version = self.tbl.tbl_version if self.tbl is not None else None
         output_batch = DataRowBatch(tbl_version, self.row_builder)
@@ -350,7 +349,7 @@ class SqlScanNode(SqlNode):
     def _create_stmt(self) -> sql.Select:
         stmt = super()._create_stmt()
         where_clause_tbl_ids = self.where_clause.tbl_ids() if self.where_clause is not None else set()
-        refd_tbl_ids = exprs.Expr.list_tbl_ids(self.select_list) | where_clause_tbl_ids | self._ordering_tbl_ids()
+        refd_tbl_ids = exprs.Expr.all_tbl_ids(self.select_list) | where_clause_tbl_ids | self._ordering_tbl_ids()
         stmt = self.create_from_clause(
             self.tbl, stmt, refd_tbl_ids, exact_version_only={t.id for t in self.exact_version_only})
 
@@ -386,7 +385,7 @@ class SqlLookupNode(SqlNode):
 
     def _create_stmt(self) -> sql.Select:
         stmt = super()._create_stmt()
-        refd_tbl_ids = exprs.Expr.list_tbl_ids(self.select_list) | self._ordering_tbl_ids()
+        refd_tbl_ids = exprs.Expr.all_tbl_ids(self.select_list) | self._ordering_tbl_ids()
         stmt = self.create_from_clause(self.tbl, stmt, refd_tbl_ids)
         stmt = stmt.where(self.where_clause)
         return stmt
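
Note that replacing the {} defaults with set() in create_from_clause is a genuine bug fix: a bare {} literal in Python creates an empty dict, not an empty set, so set operations on the default value would misbehave. A quick illustration:

empty = {}
assert isinstance(empty, dict)      # {} is an empty dict ...
assert not isinstance(empty, set)   # ... not an empty set

refd_tbl_ids = set()                # what the fixed code uses
assert isinstance(refd_tbl_ids, set)
assert refd_tbl_ids | {1, 2} == {1, 2}   # set operators now work as intended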

pixeltable/exprs/arithmetic_expr.py
@@ -6,6 +6,7 @@ import sqlalchemy as sql
 
 import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
+
 from .data_row import DataRow
 from .expr import Expr
 from .globals import ArithmeticOperator
@@ -86,6 +87,7 @@ class ArithmeticExpr(Expr):
             return sql.sql.expression.cast(sql.func.floor(left / right), sql.Integer)
         if self.col_type.is_float_type():
             return sql.sql.expression.cast(sql.func.floor(left / right), sql.Float)
+        assert False
 
     def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
         op1_val = data_row[self._op1.slot_idx]
@@ -121,7 +123,7 @@ class ArithmeticExpr(Expr):
         return {'operator': self.operator.value, **super()._as_dict()}
 
     @classmethod
-    def _from_dict(cls, d: dict, components: list[Expr]) -> Expr:
+    def _from_dict(cls, d: dict, components: list[Expr]) -> ArithmeticExpr:
         assert 'operator' in d
         assert len(components) == 2
         return cls(ArithmeticOperator(d['operator']), components[0], components[1])

pixeltable/exprs/array_slice.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Optional, Union
 
 import sqlalchemy as sql
 
@@ -15,7 +15,7 @@ class ArraySlice(Expr):
     """
     Slice operation on an array, eg, t.array_col[:, 1:2].
     """
-    def __init__(self, arr: Expr, index: Tuple):
+    def __init__(self, arr: Expr, index: tuple[Union[int, slice], ...]):
         assert arr.col_type.is_array_type()
         # determine result type
         super().__init__(arr.col_type)
@@ -24,7 +24,7 @@ class ArraySlice(Expr):
         self.id = self._create_id()
 
     def __str__(self) -> str:
-        index_strs: List[str] = []
+        index_strs: list[str] = []
         for el in self.index:
             if isinstance(el, int):
                 index_strs.append(str(el))
@@ -39,7 +39,7 @@ class ArraySlice(Expr):
     def _equals(self, other: ArraySlice) -> bool:
         return self.index == other.index
 
-    def _id_attrs(self) -> List[Tuple[str, Any]]:
+    def _id_attrs(self) -> list[tuple[str, Any]]:
         return super()._id_attrs() + [('index', self.index)]
 
     def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
@@ -49,8 +49,8 @@ class ArraySlice(Expr):
         val = data_row[self._array.slot_idx]
         data_row[self.slot_idx] = val[self.index]
 
-    def _as_dict(self) -> Dict:
-        index = []
+    def _as_dict(self) -> dict:
+        index: list[Any] = []
         for el in self.index:
             if isinstance(el, slice):
                 index.append([el.start, el.stop, el.step])
@@ -59,7 +59,7 @@ class ArraySlice(Expr):
         return {'index': index, **super()._as_dict()}
 
     @classmethod
-    def _from_dict(cls, d: Dict, components: List[Expr]) -> Expr:
+    def _from_dict(cls, d: dict, components: list[Expr]) -> ArraySlice:
         assert 'index' in d
         index = []
         for el in d['index']:
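
The new annotation on ArraySlice.__init__ spells out what an index tuple contains: a mix of ints and slice objects, which is exactly what Python builds for an expression like t.array_col[:, 1:2]. A short numpy illustration of such tuples (this uses numpy directly and is not Pixeltable code):

from typing import Union

import numpy as np

arr = np.arange(12).reshape(3, 4)

# arr[:, 1:2] is equivalent to indexing with a tuple of slices
index: tuple[Union[int, slice], ...] = (slice(None), slice(1, 2))
assert np.array_equal(arr[index], arr[:, 1:2])

# ints and slices can be mixed in the same tuple
index2: tuple[Union[int, slice], ...] = (0, slice(1, 3))
assert np.array_equal(arr[index2], arr[0, 1:3])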

pixeltable/exprs/column_property_ref.py
@@ -1,11 +1,12 @@
 from __future__ import annotations
 
 import enum
-from typing import Optional, List, Any, Dict, Tuple
+from typing import Any, Optional
 
 import sqlalchemy as sql
 
 import pixeltable.type_system as ts
+from pixeltable import catalog
 from .column_ref import ColumnRef
 from .data_row import DataRow
 from .expr import Expr
@@ -33,22 +34,36 @@ class ColumnPropertyRef(Expr):
     def default_column_name(self) -> Optional[str]:
         return str(self).replace('.', '_')
 
-    def _equals(self, other: ColumnRef) -> bool:
+    def _equals(self, other: ColumnPropertyRef) -> bool:
         return self.prop == other.prop
 
-    def _id_attrs(self) -> List[Tuple[str, Any]]:
+    def _id_attrs(self) -> list[tuple[str, Any]]:
         return super()._id_attrs() + [('prop', self.prop.value)]
 
     @property
     def _col_ref(self) -> ColumnRef:
-        return self.components[0]
+        col_ref = self.components[0]
+        assert isinstance(col_ref, ColumnRef)
+        return col_ref
 
     def __str__(self) -> str:
         return f'{self._col_ref}.{self.prop.name.lower()}'
 
+    def is_error_prop(self) -> bool:
+        return self.prop == self.Property.ERRORTYPE or self.prop == self.Property.ERRORMSG
+
     def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
         if not self._col_ref.col.is_stored:
             return None
+
+        # the errortype/-msg properties of a read-validated media column need to be extracted from the DataRow
+        if (
+            self._col_ref.col.col_type.is_media_type()
+            and self._col_ref.col.media_validation == catalog.MediaValidation.ON_READ
+            and self.is_error_prop()
+        ):
+            return None
+
         if self.prop == self.Property.ERRORTYPE:
             assert self._col_ref.col.sa_errortype_col is not None
             return self._col_ref.col.sa_errortype_col
@@ -61,18 +76,30 @@ class ColumnPropertyRef(Expr):
             return None
 
     def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
-        assert self.prop == self.Property.FILEURL or self.prop == self.Property.LOCALPATH
-        assert data_row.has_val[self._col_ref.slot_idx]
         if self.prop == self.Property.FILEURL:
+            assert data_row.has_val[self._col_ref.slot_idx]
             data_row[self.slot_idx] = data_row.file_urls[self._col_ref.slot_idx]
-        if self.prop == self.Property.LOCALPATH:
+            return
+        elif self.prop == self.Property.LOCALPATH:
+            assert data_row.has_val[self._col_ref.slot_idx]
             data_row[self.slot_idx] = data_row.file_paths[self._col_ref.slot_idx]
-
-    def _as_dict(self) -> Dict:
+            return
+        elif self.is_error_prop():
+            exc = data_row.get_exc(self._col_ref.slot_idx)
+            if exc is None:
+                data_row[self.slot_idx] = None
+            elif self.prop == self.Property.ERRORTYPE:
+                data_row[self.slot_idx] = type(exc).__name__
+            else:
+                data_row[self.slot_idx] = str(exc)
+        else:
+            assert False
+
+    def _as_dict(self) -> dict:
         return {'prop': self.prop.value, **super()._as_dict()}
 
     @classmethod
-    def _from_dict(cls, d: Dict, components: List[Expr]) -> Expr:
+    def _from_dict(cls, d: dict, components: list[Expr]) -> ColumnPropertyRef:
         assert 'prop' in d
         assert isinstance(components[0], ColumnRef)
         return cls(components[0], cls.Property(d['prop']))
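
The new is_error_prop() branch in eval() derives both error properties from the exception recorded for the column's slot: errortype is the exception's class name and errormsg is its string form, with both evaluating to None when no exception was recorded. A tiny standalone illustration of that mapping:

exc = ValueError('corrupt image header')

errortype = type(exc).__name__   # 'ValueError'
errormsg = str(exc)              # 'corrupt image header'
assert (errortype, errormsg) == ('ValueError', 'corrupt image header')

# with no recorded exception, both properties are None
exc = None
assert (None if exc is None else type(exc).__name__) is None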

pixeltable/exprs/column_ref.py
@@ -1,16 +1,18 @@
 from __future__ import annotations
-from typing import Optional, Any, Tuple
+
+from typing import Any, Optional, Sequence
 from uuid import UUID
 
 import sqlalchemy as sql
 
-from .expr import Expr
+import pixeltable.catalog as catalog
+import pixeltable.exceptions as excs
+import pixeltable.iterators as iters
+
 from .data_row import DataRow
+from .expr import Expr
 from .row_builder import RowBuilder
 from .sql_element_cache import SqlElementCache
-import pixeltable.iterators as iters
-import pixeltable.exceptions as excs
-import pixeltable.catalog as catalog
 
 
 class ColumnRef(Expr):
@@ -19,18 +21,31 @@ class ColumnRef(Expr):
     When this reference is created in the context of a view, it can also refer to a column of the view base.
     For that reason, a ColumnRef needs to be serialized with the qualifying table id (column ids are only
     unique in the context of a particular table).
+
+    Media validation:
+    - media validation is potentially cpu-intensive, and it's desirable to schedule and parallelize it during
+      general expr evaluation
+    - media validation on read is done in ColumnRef.eval()
+    - a validating ColumnRef cannot be translated to SQL (because the validation is done in Python)
+    - in that case, the ColumnRef also instantiates a second non-validating ColumnRef as a component (= dependency)
+    - the non-validating ColumnRef is used for SQL translation
+
+    TODO:
+    separate Exprs (like validating ColumnRefs) from the logical expression tree and instead have RowBuilder
+    insert them into the EvalCtxs as needed
     """
 
     col: catalog.Column
     is_unstored_iter_col: bool
     iter_arg_ctx: Optional[RowBuilder.EvalCtx]
     base_rowid_len: int
-    base_rowid: list[Optional[Any]]
+    base_rowid: Sequence[Optional[Any]]
     iterator: Optional[iters.ComponentIterator]
     pos_idx: Optional[int]
     id: int
+    perform_validation: bool  # if True, performs media validation
 
-    def __init__(self, col: catalog.Column):
+    def __init__(self, col: catalog.Column, perform_validation: Optional[bool] = None):
         super().__init__(col.col_type)
         assert col.tbl is not None
         self.col = col
@@ -43,17 +58,44 @@ class ColumnRef(Expr):
         self.iterator = None
         # index of the position column in the view's primary key; don't try to reference tbl.store_tbl here
         self.pos_idx = col.tbl.num_rowid_columns() - 1 if self.is_unstored_iter_col else None
+
+        self.perform_validation = False
+        if col.col_type.is_media_type():
+            # we perform media validation if the column is a media type and the validation is set to ON_READ,
+            # unless we're told not to
+            if perform_validation is not None:
+                self.perform_validation = perform_validation
+            else:
+                self.perform_validation = (
+                    col.col_type.is_media_type() and col.media_validation == catalog.MediaValidation.ON_READ
+                )
+        else:
+            assert perform_validation is None or not perform_validation
+        if self.perform_validation:
+            non_validating_col_ref = ColumnRef(col, perform_validation=False)
+            self.components = [non_validating_col_ref]
         self.id = self._create_id()
 
     def set_iter_arg_ctx(self, iter_arg_ctx: RowBuilder.EvalCtx) -> None:
         self.iter_arg_ctx = iter_arg_ctx
         assert len(self.iter_arg_ctx.target_slot_idxs) == 1  # a single inline dict
 
-    def _id_attrs(self) -> list[Tuple[str, Any]]:
-        return super()._id_attrs() + [('tbl_id', self.col.tbl.id), ('col_id', self.col.id)]
+    def _id_attrs(self) -> list[tuple[str, Any]]:
+        return (
+            super()._id_attrs()
+            + [('tbl_id', self.col.tbl.id), ('col_id', self.col.id), ('perform_validation', self.perform_validation)]
+        )
+
+    # override
+    def _retarget(self, tbl_versions: dict[UUID, catalog.TableVersion]) -> ColumnRef:
+        target = tbl_versions[self.col.tbl.id]
+        assert self.col.id in target.cols_by_id
+        col = target.cols_by_id[self.col.id]
+        return ColumnRef(col)
 
     def __getattr__(self, name: str) -> Expr:
         from .column_property_ref import ColumnPropertyRef
+
         # resolve column properties
         if name == ColumnPropertyRef.Property.ERRORTYPE.name.lower() \
                 or name == ColumnPropertyRef.Property.ERRORMSG.name.lower():
@@ -82,7 +124,7 @@ class ColumnRef(Expr):
         return str(self)
 
     def _equals(self, other: ColumnRef) -> bool:
-        return self.col == other.col
+        return self.col == other.col and self.perform_validation == other.perform_validation
 
     def __str__(self) -> str:
         if self.col.name is None:
@@ -94,9 +136,38 @@ class ColumnRef(Expr):
         return f'ColumnRef({self.col!r})'
 
     def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
-        return self.col.sa_col
+        return None if self.perform_validation else self.col.sa_col
 
     def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
+        if self.perform_validation:
+            # validate media file of our input ColumnRef and if successful, replicate the state of that slot
+            # to our slot
+            unvalidated_slot_idx = self.components[0].slot_idx
+            if data_row.file_paths[unvalidated_slot_idx] is None:
+                # no media file to validate, we still need to replicate the value
+                assert data_row.file_urls[unvalidated_slot_idx] is None
+                val = data_row.vals[unvalidated_slot_idx]
+                data_row.vals[self.slot_idx] = val
+                data_row.has_val[self.slot_idx] = True
+                return
+
+            try:
+                self.col.col_type.validate_media(data_row.file_paths[unvalidated_slot_idx])
+                # access the value only after successful validation
+                val = data_row[unvalidated_slot_idx]
+                data_row.vals[self.slot_idx] = val
+                data_row.has_val[self.slot_idx] = True
+                # make sure that the validated slot points to the same file as the unvalidated slot
+                data_row.file_paths[self.slot_idx] = data_row.file_paths[unvalidated_slot_idx]
+                data_row.file_urls[self.slot_idx] = data_row.file_urls[unvalidated_slot_idx]
+                return
+            except excs.Error as exc:
+                # propagate the exception, but ignore it otherwise;
+                # media validation errors don't cause exceptions during query execution
+                # TODO: allow for different error-handling behavior
+                row_builder.set_exc(data_row, self.slot_idx, exc)
+                return
+
         if not self.is_unstored_iter_col:
             # supply default
             data_row[self.slot_idx] = None
@@ -115,7 +186,14 @@ class ColumnRef(Expr):
     def _as_dict(self) -> dict:
         tbl = self.col.tbl
         version = tbl.version if tbl.is_snapshot else None
-        return {'tbl_id': str(tbl.id), 'tbl_version': version, 'col_id': self.col.id}
+        # we omit self.components, even if this is a validating ColumnRef, because init() will recreate the
+        # non-validating component ColumnRef
+        return {
+            'tbl_id': str(tbl.id),
+            'tbl_version': version,
+            'col_id': self.col.id,
+            'perform_validation': self.perform_validation
+        }
 
     @classmethod
     def get_column(cls, d: dict) -> catalog.Column:
@@ -126,6 +204,7 @@ class ColumnRef(Expr):
         return col
 
     @classmethod
-    def _from_dict(cls, d: dict, _: list[Expr]) -> Expr:
+    def _from_dict(cls, d: dict, _: list[Expr]) -> ColumnRef:
         col = cls.get_column(d)
-        return cls(col)
+        perform_validation = d['perform_validation']
+        return cls(col, perform_validation=perform_validation)
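
The docstring added to ColumnRef describes the wrapper pattern used for validation on read: a validating ColumnRef holds a non-validating ColumnRef as its only component, returns None from sql_expr() so the SQL layer uses the component instead, and in eval() copies the component's slot state into its own slot once validation succeeds, recording the exception instead of raising on failure. A minimal sketch of that pattern outside Pixeltable (SlotState, read_validated and validate_jpg are illustrative names, not the library's API):

from typing import Callable, Optional

class SlotState:
    """Simplified stand-in for a DataRow slot: a value plus an optional media file path."""
    def __init__(self, val: object, file_path: Optional[str]) -> None:
        self.val = val
        self.file_path = file_path
        self.exc: Optional[Exception] = None

def read_validated(src: SlotState, validate: Callable[[str], None]) -> SlotState:
    """Replicate src into a new slot, validating its media file first."""
    dst = SlotState(None, None)
    if src.file_path is None:
        # no media file to validate; just replicate the value
        dst.val = src.val
        return dst
    try:
        validate(src.file_path)
        dst.val, dst.file_path = src.val, src.file_path
    except Exception as exc:
        # record the error instead of raising, as the diff's eval() does
        dst.exc = exc
    return dst

def validate_jpg(path: str) -> None:
    if not path.endswith('.jpg'):
        raise ValueError(f'not a jpg: {path}')

ok = read_validated(SlotState(b'image-bytes', 'photo.jpg'), validate_jpg)
bad = read_validated(SlotState(b'image-bytes', 'notes.txt'), validate_jpg)
assert ok.exc is None and ok.val == b'image-bytes'
assert isinstance(bad.exc, ValueError) and bad.val is None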

pixeltable/exprs/comparison.py
@@ -1,12 +1,13 @@
 from __future__ import annotations
 
-from typing import Optional, List, Any, Dict
+from typing import Any, Optional
 
 import sqlalchemy as sql
 
 import pixeltable.exceptions as excs
 import pixeltable.index as index
 import pixeltable.type_system as ts
+
 from .column_ref import ColumnRef
 from .data_row import DataRow
 from .expr import Expr
@@ -65,7 +66,7 @@ class Comparison(Expr):
     def _op2(self) -> Expr:
         return self.components[1]
 
-    def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ClauseElement]:
+    def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
         left = sql_elements.get(self._op1)
         if self.is_search_arg_comparison:
             # reference the index value column if there is an index and this is not a snapshot
@@ -113,11 +114,10 @@ class Comparison(Expr):
         elif self.operator == ComparisonOperator.GE:
             data_row[self.slot_idx] = left >= right
 
-    def _as_dict(self) -> Dict:
+    def _as_dict(self) -> dict:
         return {'operator': self.operator.value, **super()._as_dict()}
 
     @classmethod
-    def _from_dict(cls, d: Dict, components: List[Expr]) -> Expr:
+    def _from_dict(cls, d: dict, components: list[Expr]) -> Comparison:
         assert 'operator' in d
         return cls(ComparisonOperator(d['operator']), components[0], components[1])
-