pixeltable 0.2.19__py3-none-any.whl → 0.2.21__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (88)
  1. pixeltable/__init__.py +7 -19
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +7 -7
  4. pixeltable/catalog/globals.py +3 -0
  5. pixeltable/catalog/insertable_table.py +9 -7
  6. pixeltable/catalog/table.py +220 -143
  7. pixeltable/catalog/table_version.py +36 -18
  8. pixeltable/catalog/table_version_path.py +0 -8
  9. pixeltable/catalog/view.py +3 -3
  10. pixeltable/dataframe.py +9 -24
  11. pixeltable/env.py +107 -36
  12. pixeltable/exceptions.py +7 -4
  13. pixeltable/exec/__init__.py +1 -1
  14. pixeltable/exec/aggregation_node.py +22 -15
  15. pixeltable/exec/component_iteration_node.py +62 -41
  16. pixeltable/exec/data_row_batch.py +7 -7
  17. pixeltable/exec/exec_node.py +35 -7
  18. pixeltable/exec/expr_eval_node.py +2 -1
  19. pixeltable/exec/in_memory_data_node.py +9 -9
  20. pixeltable/exec/sql_node.py +265 -136
  21. pixeltable/exprs/__init__.py +1 -0
  22. pixeltable/exprs/data_row.py +30 -19
  23. pixeltable/exprs/expr.py +15 -14
  24. pixeltable/exprs/expr_dict.py +55 -0
  25. pixeltable/exprs/expr_set.py +21 -15
  26. pixeltable/exprs/function_call.py +21 -8
  27. pixeltable/exprs/json_path.py +3 -6
  28. pixeltable/exprs/rowid_ref.py +2 -2
  29. pixeltable/exprs/sql_element_cache.py +5 -1
  30. pixeltable/ext/functions/whisperx.py +7 -2
  31. pixeltable/func/callable_function.py +2 -2
  32. pixeltable/func/function_registry.py +6 -7
  33. pixeltable/func/query_template_function.py +11 -12
  34. pixeltable/func/signature.py +17 -15
  35. pixeltable/func/udf.py +0 -4
  36. pixeltable/functions/__init__.py +1 -1
  37. pixeltable/functions/audio.py +4 -6
  38. pixeltable/functions/globals.py +86 -42
  39. pixeltable/functions/huggingface.py +12 -14
  40. pixeltable/functions/image.py +59 -45
  41. pixeltable/functions/json.py +0 -1
  42. pixeltable/functions/mistralai.py +2 -2
  43. pixeltable/functions/openai.py +22 -25
  44. pixeltable/functions/string.py +50 -50
  45. pixeltable/functions/timestamp.py +20 -20
  46. pixeltable/functions/together.py +26 -12
  47. pixeltable/functions/video.py +11 -20
  48. pixeltable/functions/whisper.py +2 -20
  49. pixeltable/globals.py +57 -56
  50. pixeltable/index/base.py +2 -2
  51. pixeltable/index/btree.py +7 -7
  52. pixeltable/index/embedding_index.py +8 -10
  53. pixeltable/io/external_store.py +11 -5
  54. pixeltable/io/globals.py +3 -1
  55. pixeltable/io/hf_datasets.py +4 -4
  56. pixeltable/io/label_studio.py +6 -6
  57. pixeltable/io/parquet.py +14 -13
  58. pixeltable/iterators/document.py +10 -8
  59. pixeltable/iterators/video.py +10 -1
  60. pixeltable/metadata/__init__.py +3 -2
  61. pixeltable/metadata/converters/convert_14.py +4 -2
  62. pixeltable/metadata/converters/convert_15.py +1 -1
  63. pixeltable/metadata/converters/convert_19.py +1 -0
  64. pixeltable/metadata/converters/convert_20.py +1 -1
  65. pixeltable/metadata/converters/util.py +9 -8
  66. pixeltable/metadata/schema.py +32 -21
  67. pixeltable/plan.py +136 -154
  68. pixeltable/store.py +51 -36
  69. pixeltable/tool/create_test_db_dump.py +7 -7
  70. pixeltable/tool/doc_plugins/griffe.py +3 -34
  71. pixeltable/tool/mypy_plugin.py +32 -0
  72. pixeltable/type_system.py +243 -60
  73. pixeltable/utils/arrow.py +10 -9
  74. pixeltable/utils/coco.py +4 -4
  75. pixeltable/utils/documents.py +1 -1
  76. pixeltable/utils/filecache.py +131 -84
  77. pixeltable/utils/formatter.py +1 -1
  78. pixeltable/utils/http_server.py +2 -5
  79. pixeltable/utils/media_store.py +6 -6
  80. pixeltable/utils/pytorch.py +10 -11
  81. pixeltable/utils/sql.py +2 -1
  82. {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/METADATA +16 -7
  83. pixeltable-0.2.21.dist-info/RECORD +148 -0
  84. pixeltable/utils/help.py +0 -11
  85. pixeltable-0.2.19.dist-info/RECORD +0 -147
  86. {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/LICENSE +0 -0
  87. {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/WHEEL +0 -0
  88. {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/entry_points.txt +0 -0
pixeltable/plan.py CHANGED
@@ -1,5 +1,4 @@
- import itertools
- from typing import Any, Iterable, Optional, Sequence
+ from typing import Any, Iterable, Optional, Sequence, cast
  from uuid import UUID

  import sqlalchemy as sql
@@ -9,6 +8,7 @@ import pixeltable.exec as exec
  from pixeltable import catalog
  from pixeltable import exceptions as excs
  from pixeltable import exprs
+ from pixeltable.exec.sql_node import OrderByItem, OrderByClause, combine_order_by_clauses, print_order_by_clause


  def _is_agg_fn_call(e: exprs.Expr) -> bool:
@@ -46,11 +46,9 @@ class Analyzer:
  tbl: catalog.TableVersionPath
  all_exprs: list[exprs.Expr]
  select_list: list[exprs.Expr]
- group_by_clause: list[exprs.Expr]
- order_by_clause: list[tuple[exprs.Expr, bool]]
-
- # exprs that can be expressed in SQL and are retrieved directly from the store
- #sql_exprs: list[exprs.Expr]
+ group_by_clause: Optional[list[exprs.Expr]] # None for non-aggregate queries; [] for agg query w/o grouping
+ grouping_exprs: list[exprs.Expr] # [] for non-aggregate queries or agg query w/o grouping
+ order_by_clause: OrderByClause

  sql_elements: exprs.SqlElementCache

@@ -60,15 +58,14 @@ class Analyzer:
  # filter predicate applied to output rows of the SQL scan
  filter: Optional[exprs.Expr]

- agg_fn_calls: list[exprs.FunctionCall]
+ agg_fn_calls: list[exprs.FunctionCall] # grouping aggregation (ie, not window functions)
+ window_fn_calls: list[exprs.FunctionCall]
  agg_order_by: list[exprs.Expr]

  def __init__(
  self, tbl: catalog.TableVersionPath, select_list: Sequence[exprs.Expr],
  where_clause: Optional[exprs.Expr] = None, group_by_clause: Optional[list[exprs.Expr]] = None,
  order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None):
- if group_by_clause is None:
- group_by_clause = []
  if order_by_clause is None:
  order_by_clause = []
  self.tbl = tbl
@@ -78,8 +75,10 @@ class Analyzer:
  self.select_list = [e.resolve_computed_cols() for e in select_list]
  if where_clause is not None:
  where_clause = where_clause.resolve_computed_cols()
- self.group_by_clause = [e.resolve_computed_cols() for e in group_by_clause]
- self.order_by_clause = [(e.resolve_computed_cols(), asc) for e, asc in order_by_clause]
+ self.group_by_clause = (
+ [e.resolve_computed_cols() for e in group_by_clause] if group_by_clause is not None else None
+ )
+ self.order_by_clause = [OrderByItem(e.resolve_computed_cols(), asc) for e, asc in order_by_clause]

  self.sql_where_clause = None
  self.filter = None
@@ -89,20 +88,36 @@ class Analyzer:

  # all exprs that are evaluated in Python; not executable
  self.all_exprs = self.select_list.copy()
- self.all_exprs.extend(self.group_by_clause)
+ if self.group_by_clause is not None:
+ self.all_exprs.extend(self.group_by_clause)
  self.all_exprs.extend(e for e, _ in self.order_by_clause)
  if self.filter is not None:
  self.all_exprs.append(self.filter)

  self.agg_order_by = []
+ self.agg_fn_calls = []
+ self.window_fn_calls = []
  self._analyze_agg()
+ self.grouping_exprs = self.group_by_clause if self.group_by_clause is not None else []

  def _analyze_agg(self) -> None:
  """Check semantic correctness of aggregation and fill in agg-specific fields of Analyzer"""
- self.agg_fn_calls = [e for e in self.all_exprs if isinstance(e, exprs.FunctionCall) and _is_agg_fn_call(e)]
+ candidates = self.select_list
+ agg_fn_calls = exprs.ExprSet(
+ exprs.Expr.list_subexprs(
+ candidates, expr_class=exprs.FunctionCall,
+ filter=lambda e: bool(e.is_agg_fn_call and not e.is_window_fn_call)))
+ self.agg_fn_calls = list(agg_fn_calls)
+ window_fn_calls = exprs.ExprSet(
+ exprs.Expr.list_subexprs(
+ candidates, expr_class=exprs.FunctionCall, filter=lambda e: bool(e.is_window_fn_call)))
+ self.window_fn_calls = list(window_fn_calls)
  if len(self.agg_fn_calls) == 0:
  # nothing to do
  return
+ # if we're doing grouping aggregation and don't have an explicit Group By clause, we're creating a single group
+ if self.group_by_clause is None:
+ self.group_by_clause = []

  # check that select list only contains aggregate output
  grouping_expr_ids = {e.id for e in self.group_by_clause}
@@ -113,8 +128,7 @@ class Analyzer:

  # check that filter doesn't contain aggregates
  if self.filter is not None:
- agg_fn_calls = [e for e in self.filter.subexprs(expr_class=exprs.FunctionCall, filter=lambda e: _is_agg_fn_call(e))]
- if len(agg_fn_calls) > 0:
+ if any(_is_agg_fn_call(e) for e in self.filter.subexprs(expr_class=exprs.FunctionCall)):
  raise excs.Error(f'Filter cannot contain aggregate functions: {self.filter}')

  # check that grouping exprs don't contain aggregates and can be expressed as SQL (we perform sort-based
@@ -125,27 +139,6 @@ class Analyzer:
  if e._contains(filter=lambda e: _is_agg_fn_call(e)):
  raise excs.Error(f'Grouping expression contains aggregate function: {e}')

- # check that agg fn calls don't have contradicting ordering requirements
- order_by: list[exprs.Expr] = []
- order_by_origin: Optional[exprs.Expr] = None # the expr that determines the ordering
- for agg_fn_call in self.agg_fn_calls:
- fn_call_order_by = agg_fn_call.get_agg_order_by()
- if len(fn_call_order_by) == 0:
- continue
- if len(order_by) == 0:
- order_by = fn_call_order_by
- order_by_origin = agg_fn_call
- else:
- combined = _get_combined_ordering(
- [(e, True) for e in order_by], [(e, True) for e in fn_call_order_by])
- if len(combined) == 0:
- raise excs.Error((
- f"Incompatible ordering requirements between expressions '{order_by_origin}' and "
- f"'{agg_fn_call}':\n"
- f"{exprs.Expr.print_list(order_by)} vs {exprs.Expr.print_list(fn_call_order_by)}"
- ))
- self.agg_order_by = order_by
-
  def _determine_agg_status(self, e: exprs.Expr, grouping_expr_ids: set[int]) -> tuple[bool, bool]:
  """Determine whether expr is the input to or output of an aggregate function.
  Returns:
@@ -175,14 +168,14 @@ class Analyzer:
  raise excs.Error(f'Invalid expression, mixes aggregate with non-aggregate: {e}')
  return is_output, is_input

-
  def finalize(self, row_builder: exprs.RowBuilder) -> None:
  """Make all exprs executable
  TODO: add EvalCtx for each expr list?
  """
  # maintain original composition of select list
  row_builder.set_slot_idxs(self.select_list, remove_duplicates=False)
- row_builder.set_slot_idxs(self.group_by_clause)
+ if self.group_by_clause is not None:
+ row_builder.set_slot_idxs(self.group_by_clause)
  order_by_exprs = [e for e, _ in self.order_by_clause]
  row_builder.set_slot_idxs(order_by_exprs)
  row_builder.set_slot_idxs(self.all_exprs)
@@ -191,6 +184,19 @@ class Analyzer:
  row_builder.set_slot_idxs(self.agg_fn_calls)
  row_builder.set_slot_idxs(self.agg_order_by)

+ def get_window_fn_ob_clause(self) -> Optional[OrderByClause]:
+ clause: list[OrderByClause] = []
+ for fn_call in self.window_fn_calls:
+ # window functions require ordering by the group_by/order_by clauses
+ group_by_exprs, order_by_exprs = fn_call.get_window_sort_exprs()
+ clause.append(
+ [OrderByItem(e, None) for e in group_by_exprs] + [OrderByItem(e, True) for e in order_by_exprs])
+ return combine_order_by_clauses(clause)
+
+ def has_agg(self) -> bool:
+ """True if there is any kind of aggregation in the query"""
+ return self.group_by_clause is not None or len(self.agg_fn_calls) > 0 or len(self.window_fn_calls) > 0
+

  class Planner:
  # TODO: create an exec.CountNode and change this to create_count_plan()
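Note: the new Analyzer members above (and _verify_ordering further down) lean on OrderByItem, OrderByClause, combine_order_by_clauses and print_order_by_clause, imported from pixeltable/exec/sql_node.py; that module's diff is not part of this section. A minimal sketch of the shapes these names appear to have, inferred only from their usage in plan.py (an assumption, not the actual sql_node.py definitions):

# Hedged sketch: plausible shapes for the ordering types used above, inferred from usage;
# not the real pixeltable.exec.sql_node definitions.
from typing import Any, NamedTuple, Optional

class OrderByItem(NamedTuple):
    expr: Any               # an exprs.Expr in the real code
    asc: Optional[bool]     # None = direction unspecified, True = ascending, False = descending

OrderByClause = list[OrderByItem]   # one ordering requirement, e.g. from a single window function

Read this way, get_window_fn_ob_clause() collects one clause per window function and asks combine_order_by_clauses() for a single clause that satisfies all of them, returning None when they cannot be reconciled.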
@@ -507,93 +513,35 @@ class Planner:
  return plan, len(row_builder.default_eval_ctx.target_exprs)

  @classmethod
- def _determine_ordering(cls, analyzer: Analyzer) -> list[tuple[exprs.Expr, bool]]:
- """Returns the exprs for the ORDER BY clause of the SqlScanNode"""
- order_by_items: list[tuple[exprs.Expr, Optional[bool]]] = []
- order_by_origin: Optional[exprs.Expr] = None # the expr that determines the ordering
-
-
- # window functions require ordering by the group_by/order_by clauses
- window_fn_calls = [
- e for e in analyzer.all_exprs if isinstance(e, exprs.FunctionCall) and e.is_window_fn_call
- ]
- if len(window_fn_calls) > 0:
- for fn_call in window_fn_calls:
+ def _verify_ordering(cls, analyzer: Analyzer, verify_agg: bool) -> None:
+ """Verify that the various ordering requirements don't conflict"""
+ ob_clauses: list[OrderByClause] = [analyzer.order_by_clause.copy()]
+
+ if verify_agg:
+ ordering: OrderByClause
+ for fn_call in analyzer.window_fn_calls:
+ # window functions require ordering by the group_by/order_by clauses
  gb, ob = fn_call.get_window_sort_exprs()
- # for now, the ordering is implicitly ascending
- fn_call_ordering = [(e, None) for e in gb] + [(e, True) for e in ob]
- if len(order_by_items) == 0:
- order_by_items = fn_call_ordering
- order_by_origin = fn_call
- else:
- # check for compatibility
- other_order_by_clauses = fn_call_ordering
- combined = _get_combined_ordering(order_by_items, other_order_by_clauses)
- if len(combined) == 0:
- raise excs.Error((
- f"Incompatible ordering requirements between expressions '{order_by_origin}' and "
- f"'{fn_call}':\n"
- f"{exprs.Expr.print_list(order_by_items)} vs {exprs.Expr.print_list(other_order_by_clauses)}"
- ))
- order_by_items = combined
-
- if len(analyzer.group_by_clause) > 0:
- agg_ordering = [(e, None) for e in analyzer.group_by_clause] + [(e, True) for e in analyzer.agg_order_by]
- if len(order_by_items) > 0:
- # check for compatibility
- combined = _get_combined_ordering(order_by_items, agg_ordering)
- if len(combined) == 0:
- raise excs.Error((
- f"Incompatible ordering requirements between expressions '{order_by_origin}' and "
- f"grouping expressions:\n"
- f"{exprs.Expr.print_list([e for e, _ in order_by_items])} vs "
- f"{exprs.Expr.print_list([e for e, _ in agg_ordering])}"
- ))
- order_by_items = combined
- else:
- order_by_items = agg_ordering
+ ordering = [OrderByItem(e, None) for e in gb] + [OrderByItem(e, True) for e in ob]
+ ob_clauses.append(ordering)
+ for fn_call in analyzer.agg_fn_calls:
+ # agg functions with an ordering requirement are implicitly ascending
+ ordering = (
+ [OrderByItem(e, None) for e in analyzer.group_by_clause]
+ + [OrderByItem(e, True) for e in fn_call.get_agg_order_by()]
+ )
+ ob_clauses.append(ordering)
+ if len(ob_clauses) <= 1:
+ return

- if len(analyzer.order_by_clause) > 0:
- if len(order_by_items) > 0:
- # check for compatibility
- combined = _get_combined_ordering(order_by_items, analyzer.order_by_clause)
- if len(combined) == 0:
- raise excs.Error((
- f"Incompatible ordering requirements between expressions '{order_by_origin}' and "
- f"order-by expressions:\n"
- f"{exprs.Expr.print_list([e for e, _ in order_by_items])} vs "
- f"{exprs.Expr.print_list([e for e, _ in analyzer.order_by_clause])}"
- ))
- order_by_items = combined
- else:
- order_by_items = analyzer.order_by_clause
-
- # TODO: can this be unified with the same logic in RowBuilder
- def refs_unstored_iter_col(e: exprs.Expr) -> bool:
- if not isinstance(e, exprs.ColumnRef):
- return False
- tbl = e.col.tbl
- return tbl.is_component_view() and tbl.is_iterator_column(e.col) and not e.col.is_stored
- unstored_iter_col_refs = list(exprs.Expr.list_subexprs(analyzer.all_exprs, expr_class=exprs.ColumnRef, filter=refs_unstored_iter_col))
- if len(unstored_iter_col_refs) > 0 and len(order_by_items) == 0:
- # we don't already have a user-requested ordering and we access unstored iterator columns:
- # order by the primary key of the component view, which minimizes the number of iterator instantiations
- component_views = {e.col.tbl for e in unstored_iter_col_refs}
- # TODO: generalize this to multi-level iteration
- assert len(component_views) == 1
- component_view = list(component_views)[0]
- order_by_items = [
- (exprs.RowidRef(component_view, idx), None)
- for idx in range(len(component_view.store_tbl.rowid_columns()))
- ]
- order_by_origin = unstored_iter_col_refs[0]
-
- for e in [e for e, _ in order_by_items]:
- if not analyzer.sql_elements.contains(e):
- raise excs.Error(f'order_by element cannot be expressed in SQL: {e}')
- # we do ascending ordering by default, if not specified otherwise
- order_by_items = [(e, True) if asc is None else (e, asc) for e, asc in order_by_items]
- return order_by_items
+ combined_ordering = ob_clauses[0]
+ for ordering in ob_clauses[1:]:
+ combined = combine_order_by_clauses([combined_ordering, ordering])
+ if combined is None:
+ raise excs.Error(
+ f'Incompatible ordering requirements: '
+ f'{print_order_by_clause(combined_ordering)} vs {print_order_by_clause(ordering)}')
+ combined_ordering = combined

  @classmethod
  def _is_contained_in(cls, l1: Iterable[exprs.Expr], l2: Iterable[exprs.Expr]) -> bool:
@@ -632,8 +580,6 @@ class Planner:
  """
  if select_list is None:
  select_list = []
- if group_by_clause is None:
- group_by_clause = []
  if order_by_clause is None:
  order_by_clause = []
  if exact_version_only is None:
@@ -641,16 +587,12 @@ class Planner:
  analyzer = Analyzer(
  tbl, select_list, where_clause=where_clause, group_by_clause=group_by_clause,
  order_by_clause=order_by_clause)
- input_exprs = exprs.ExprSet(exprs.Expr.list_subexprs(
- analyzer.all_exprs, filter=analyzer.sql_elements.contains, traverse_matches=False))
- # remove Literals from sql_exprs, we don't want to materialize them via a Select
- input_exprs = exprs.ExprSet(e for e in input_exprs if not isinstance(e, exprs.Literal))
- row_builder = exprs.RowBuilder(analyzer.all_exprs, [], input_exprs)
+ row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [])

  analyzer.finalize(row_builder)
  # select_list: we need to materialize everything that's been collected
  # with_pk: for now, we always retrieve the PK, because we need it for the file cache
- eval_ctx = row_builder.create_eval_ctx(analyzer.all_exprs)
+ eval_ctx = row_builder.create_eval_ctx(analyzer.select_list)
  plan = cls._create_query_plan(
  tbl, row_builder, analyzer=analyzer, eval_ctx=eval_ctx, limit=limit, with_pk=True,
  exact_version_only=exact_version_only)
@@ -677,48 +619,88 @@ class Planner:
  if exact_version_only is None:
  exact_version_only = []
  assert isinstance(tbl, catalog.TableVersionPath)
- is_agg_query = len(analyzer.group_by_clause) > 0 or len(analyzer.agg_fn_calls) > 0
+ sql_elements = analyzer.sql_elements
+ is_python_agg = (
+ not sql_elements.contains(analyzer.agg_fn_calls) or not sql_elements.contains(analyzer.window_fn_calls)
+ )
  ctx = exec.ExecContext(row_builder)
+ cls._verify_ordering(analyzer, verify_agg=is_python_agg)
+
+ # materialized with SQL scan:
+ # - select list subexprs that aren't aggregates
+ # - Where clause conjuncts that can't be run in SQL
+ # - all grouping exprs, if any aggregate function call can't be run in SQL (in that case, they all have to be
+ # run in Python)
+ candidates = list(exprs.Expr.list_subexprs(
+ analyzer.select_list,
+ filter=lambda e: (
+ sql_elements.contains(e)
+ and not e._contains(cls=exprs.FunctionCall, filter=lambda e: bool(e.is_agg_fn_call))
+ ),
+ traverse_matches=False))
+ if analyzer.filter is not None:
+ candidates.extend(exprs.Expr.subexprs(
+ analyzer.filter, filter=lambda e: sql_elements.contains(e), traverse_matches=False))
+ if is_python_agg and analyzer.group_by_clause is not None:
+ candidates.extend(exprs.Expr.list_subexprs(
+ analyzer.group_by_clause, filter=lambda e: sql_elements.contains(e), traverse_matches=False))
+ # not isinstance(...): we don't want to materialize Literals via a Select
+ sql_scan_exprs = exprs.ExprSet(e for e in candidates if not isinstance(e, exprs.Literal))

- order_by_items = cls._determine_ordering(analyzer)
- sql_limit = 0 if is_agg_query else limit # if we're aggregating, the limit applies to the agg output
- sql_exprs = [
- e for e in eval_ctx.exprs if analyzer.sql_elements.contains(e) and not isinstance(e, exprs.Literal)
- ]
  plan = exec.SqlScanNode(
- tbl, row_builder, select_list=sql_exprs, where_clause=analyzer.sql_where_clause,
- filter=analyzer.filter, order_by_items=order_by_items,
- limit=sql_limit, set_pk=with_pk, exact_version_only=exact_version_only)
+ tbl, row_builder, select_list=sql_scan_exprs, where_clause=analyzer.sql_where_clause,
+ filter=analyzer.filter, set_pk=with_pk, exact_version_only=exact_version_only)
+ if len(analyzer.window_fn_calls) > 0:
+ # we need to order the input for window functions
+ plan.add_order_by(analyzer.get_window_fn_ob_clause())
  plan = cls._insert_prefetch_node(tbl.tbl_version.id, analyzer.select_list, row_builder, plan)

- if len(analyzer.group_by_clause) > 0 or len(analyzer.agg_fn_calls) > 0:
- # we're doing aggregation; the input of the AggregateNode are the grouping exprs plus the
+ if analyzer.group_by_clause is not None:
+ # we're doing grouping aggregation; the input of the AggregateNode are the grouping exprs plus the
  # args of the agg fn calls
- agg_input = exprs.ExprSet(analyzer.group_by_clause.copy())
+ agg_input = exprs.ExprSet(analyzer.grouping_exprs.copy())
  for fn_call in analyzer.agg_fn_calls:
  agg_input.update(fn_call.components)
- if not exprs.ExprSet(sql_exprs).issuperset(agg_input):
+ if not sql_scan_exprs.issuperset(agg_input):
  # we need an ExprEvalNode
- plan = exec.ExprEvalNode(row_builder, agg_input, sql_exprs, input=plan)
+ plan = exec.ExprEvalNode(row_builder, agg_input, sql_scan_exprs, input=plan)

  # batch size for aggregation input: this could be the entire table, so we need to divide it into
  # smaller batches; at the same time, we need to make the batches large enough to amortize the
  # function call overhead
  ctx.batch_size = 16

- plan = exec.AggregationNode(
- tbl.tbl_version, row_builder, analyzer.group_by_clause, analyzer.agg_fn_calls, agg_input, input=plan)
- agg_output = exprs.ExprSet(itertools.chain(analyzer.group_by_clause, analyzer.agg_fn_calls))
- if not agg_output.issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
- # we need an ExprEvalNode to evaluate the remaining output exprs
- plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, agg_output, input=plan)
+ # do aggregation in SQL if all agg exprs can be translated
+ if (sql_elements.contains(analyzer.select_list)
+ and sql_elements.contains(analyzer.grouping_exprs)
+ and isinstance(plan, exec.SqlNode)
+ and plan.to_cte() is not None):
+ plan = exec.SqlAggregationNode(
+ row_builder, input=plan, select_list=analyzer.select_list, group_by_items=analyzer.group_by_clause)
+ else:
+ plan = exec.AggregationNode(
+ tbl.tbl_version, row_builder, analyzer.group_by_clause,
+ analyzer.agg_fn_calls + analyzer.window_fn_calls, agg_input, input=plan)
+ typecheck_dummy = analyzer.grouping_exprs + analyzer.agg_fn_calls + analyzer.window_fn_calls
+ agg_output = exprs.ExprSet(typecheck_dummy)
+ if not agg_output.issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
+ # we need an ExprEvalNode to evaluate the remaining output exprs
+ plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, agg_output, input=plan)
  else:
- if not exprs.ExprSet(sql_exprs).issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
+ if not exprs.ExprSet(sql_scan_exprs).issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
  # we need an ExprEvalNode to evaluate the remaining output exprs
- plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, sql_exprs, input=plan)
+ plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, sql_scan_exprs, input=plan)
  # we're returning everything to the user, so we might as well do it in a single batch
  ctx.batch_size = 0

+ sql_node = plan.get_sql_node()
+ assert sql_node is not None
+ if len(analyzer.order_by_clause) > 0:
+ sql_node.add_order_by(analyzer.order_by_clause)
+
+ if limit is not None:
+ plan.set_limit(limit)
+
  plan.set_ctx(ctx)
  return plan

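Note: with this change, _create_query_plan() no longer computes a single SQL ORDER BY up front via _determine_ordering(); _verify_ordering() only checks that the user's order_by clause, the window functions' ordering requirements and the agg functions' ordering requirements can coexist, and ORDER BY / LIMIT are pushed into the underlying SQL node afterwards. A small self-contained illustration of what "compatible" plausibly means here, merging two requirements by common prefix (names and exact semantics are assumptions, not the combine_order_by_clauses implementation):

# Hedged illustration of prefix-compatible merging of ordering requirements, in the spirit of
# combine_order_by_clauses() used above; this is not the pixeltable implementation.
from typing import Optional

Clause = list[tuple[str, Optional[bool]]]   # (sort key, asc); asc=None means "no explicit direction"

def combine(a: Clause, b: Clause) -> Optional[Clause]:
    """Merge two ordering requirements; return None if they conflict."""
    result: Clause = []
    for (e1, asc1), (e2, asc2) in zip(a, b):
        if e1 != e2:
            return None                     # different sort keys at the same position
        if asc1 is not None and asc2 is not None and asc1 != asc2:
            return None                     # explicit asc vs. desc conflict
        result.append((e1, asc1 if asc1 is not None else asc2))
    result.extend(a[len(result):] or b[len(result):])  # the longer clause keeps its tail
    return result

# a window function needs (user, ts asc); an explicit order_by asks for (user asc): compatible
print(combine([('user', None), ('ts', True)], [('user', True)]))   # [('user', True), ('ts', True)]
# requirements that disagree on the leading sort key cannot be merged
print(combine([('user', None)], [('ts', True)]))                   # None

When no such merge exists, the planner raises the "Incompatible ordering requirements" error shown in _verify_ordering().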
pixeltable/store.py CHANGED
@@ -7,18 +7,19 @@ import sys
  import urllib.parse
  import urllib.request
  import warnings
- from typing import Optional, Dict, Any, List, Tuple, Set, Union
+ from typing import Any, Iterator, Literal, Optional, Union

  import sqlalchemy as sql
- from tqdm import tqdm, TqdmWarning
+ from tqdm import TqdmWarning, tqdm

  import pixeltable.catalog as catalog
  import pixeltable.env as env
+ import pixeltable.exceptions as excs
  from pixeltable import exprs
  from pixeltable.exec import ExecNode
  from pixeltable.metadata import schema
  from pixeltable.utils.media_store import MediaStore
- from pixeltable.utils.sql import log_stmt, log_explain
+ from pixeltable.utils.sql import log_explain, log_stmt

  _logger = logging.getLogger('pixeltable')

@@ -31,35 +32,42 @@ class StoreBase:
  - v_min: version at which the row was created
  - v_max: version at which the row was deleted (or MAX_VERSION if it's still live)
  """
+ tbl_version: catalog.TableVersion
+ sa_md: sql.MetaData
+ sa_tbl: Optional[sql.Table]
+ _pk_cols: list[sql.Column]
+ v_min_col: sql.Column
+ v_max_col: sql.Column
+ base: Optional[StoreBase]

  __INSERT_BATCH_SIZE = 1000

  def __init__(self, tbl_version: catalog.TableVersion):
  self.tbl_version = tbl_version
  self.sa_md = sql.MetaData()
- self.sa_tbl: Optional[sql.Table] = None
+ self.sa_tbl = None
  # We need to declare a `base` variable here, even though it's only defined for instances of `StoreView`,
  # since it's referenced by various methods of `StoreBase`
  self.base = None if tbl_version.base is None else tbl_version.base.store_tbl
  self.create_sa_tbl()

- def pk_columns(self) -> List[sql.Column]:
- return self._pk_columns
+ def pk_columns(self) -> list[sql.Column]:
+ return self._pk_cols

- def rowid_columns(self) -> List[sql.Column]:
- return self._pk_columns[:-1]
+ def rowid_columns(self) -> list[sql.Column]:
+ return self._pk_cols[:-1]

  @abc.abstractmethod
- def _create_rowid_columns(self) -> List[sql.Column]:
+ def _create_rowid_columns(self) -> list[sql.Column]:
  """Create and return rowid columns"""

- def _create_system_columns(self) -> List[sql.Column]:
+ def _create_system_columns(self) -> list[sql.Column]:
  """Create and return system columns"""
  rowid_cols = self._create_rowid_columns()
  self.v_min_col = sql.Column('v_min', sql.BigInteger, nullable=False)
  self.v_max_col = \
  sql.Column('v_max', sql.BigInteger, nullable=False, server_default=str(schema.Table.MAX_VERSION))
- self._pk_columns = [*rowid_cols, self.v_min_col]
+ self._pk_cols = [*rowid_cols, self.v_min_col]
  return [*rowid_cols, self.v_min_col, self.v_max_col]

  def create_sa_tbl(self) -> None:
@@ -79,7 +87,7 @@ class StoreBase:
  # if we're called in response to a schema change, we need to remove the old table first
  self.sa_md.remove(self.sa_tbl)

- idxs: List[sql.Index] = []
+ idxs: list[sql.Index] = []
  # index for all system columns:
  # - base x view joins can be executed as merge joins
  # - speeds up ORDER BY rowid DESC
@@ -126,7 +134,7 @@ class StoreBase:
  return new_file_url

  def _move_tmp_media_files(
- self, table_rows: List[Dict[str, Any]], media_cols: List[catalog.Column], v_min: int
+ self, table_rows: list[dict[str, Any]], media_cols: list[catalog.Column], v_min: int
  ) -> None:
  """Move tmp media files that we generated to a permanent location"""
  for c in media_cols:
@@ -135,23 +143,17 @@ class StoreBase:
  table_row[c.store_name()] = self._move_tmp_media_file(file_url, c, v_min)

  def _create_table_row(
- self, input_row: exprs.DataRow, row_builder: exprs.RowBuilder, media_cols: List[catalog.Column],
- exc_col_ids: Set[int], v_min: int
- ) -> Tuple[Dict[str, Any], int]:
+ self, input_row: exprs.DataRow, row_builder: exprs.RowBuilder, exc_col_ids: set[int], pk: tuple[int, ...]
+ ) -> tuple[dict[str, Any], int]:
  """Return Tuple[complete table row, # of exceptions] for insert()
  Creates a row that includes the PK columns, with the values from input_row.pk.
  Returns:
  Tuple[complete table row, # of exceptions]
  """
  table_row, num_excs = row_builder.create_table_row(input_row, exc_col_ids)
-
- assert input_row.pk is not None and len(input_row.pk) == len(self._pk_columns)
- for pk_col, pk_val in zip(self._pk_columns, input_row.pk):
- if pk_col == self.v_min_col:
- table_row[pk_col.name] = v_min
- else:
- table_row[pk_col.name] = pk_val
-
+ assert len(pk) == len(self._pk_cols)
+ for pk_col, pk_val in zip(self._pk_cols, pk):
+ table_row[pk_col.name] = pk_val
  return table_row, num_excs

  def count(self, conn: Optional[sql.engine.Connection] = None) -> int:
@@ -212,14 +214,20 @@ class StoreBase:
  conn.execute(sql.text(stmt))

  def load_column(
- self, col: catalog.Column, exec_plan: ExecNode, value_expr_slot_idx: int, conn: sql.engine.Connection
+ self,
+ col: catalog.Column,
+ exec_plan: ExecNode,
+ value_expr_slot_idx: int,
+ conn: sql.engine.Connection,
+ on_error: Literal['abort', 'ignore']
  ) -> int:
  """Update store column of a computed column with values produced by an execution plan

  Returns:
  number of rows with exceptions
  Raises:
- sql.exc.DBAPIError if there was an error during SQL execution
+ sql.exc.DBAPIError if there was a SQL error during execution
+ excs.Error if on_error='abort' and there was an exception during row evaluation
  """
  num_excs = 0
  num_rows = 0
@@ -253,6 +261,10 @@ class StoreBase:
  if result_row.has_exc(value_expr_slot_idx):
  num_excs += 1
  value_exc = result_row.get_exc(value_expr_slot_idx)
+ if on_error == 'abort':
+ raise excs.Error(
+ f'Error while evaluating computed column `{col.name}`:\n{value_exc}'
+ ) from value_exc
  # we store a NULL value and record the exception/exc type
  error_type = type(value_exc).__name__
  error_msg = str(value_exc)
@@ -291,8 +303,8 @@ class StoreBase:

  def insert_rows(
  self, exec_plan: ExecNode, conn: sql.engine.Connection, v_min: Optional[int] = None,
- show_progress: bool = True
- ) -> Tuple[int, int, Set[int]]:
+ show_progress: bool = True, rowids: Optional[Iterator[int]] = None
+ ) -> tuple[int, int, set[int]]:
  """Insert rows into the store table and update the catalog table's md
  Returns:
  number of inserted rows, number of exceptions, set of column ids that have exceptions
@@ -302,7 +314,7 @@ class StoreBase:
  # TODO: total?
  num_excs = 0
  num_rows = 0
- cols_with_excs: Set[int] = set()
+ cols_with_excs: set[int] = set()
  progress_bar: Optional[tqdm] = None # create this only after we started executing
  row_builder = exec_plan.row_builder
  media_cols = [info.col for info in row_builder.table_columns if info.col.col_type.is_media_type()]
@@ -312,13 +324,16 @@ class StoreBase:
  num_rows += len(row_batch)
  for batch_start_idx in range(0, len(row_batch), self.__INSERT_BATCH_SIZE):
  # compute batch of rows and convert them into table rows
- table_rows: List[Dict[str, Any]] = []
+ table_rows: list[dict[str, Any]] = []
  for row_idx in range(batch_start_idx, min(batch_start_idx + self.__INSERT_BATCH_SIZE, len(row_batch))):
  row = row_batch[row_idx]
- table_row, num_row_exc = \
- self._create_table_row(row, row_builder, media_cols, cols_with_excs, v_min=v_min)
+
+ rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
+ pk = rowid + (v_min,)
+ table_row, num_row_exc = self._create_table_row(row, row_builder, cols_with_excs, pk=pk)
  num_excs += num_row_exc
  table_rows.append(table_row)
+
  if show_progress:
  if progress_bar is None:
  warnings.simplefilter("ignore", category=TqdmWarning)
@@ -353,7 +368,7 @@ class StoreBase:
  return sql.and_(clause, self.base._versions_clause(versions[1:], match_on_vmin))

  def delete_rows(
- self, current_version: int, base_versions: List[Optional[int]], match_on_vmin: bool,
+ self, current_version: int, base_versions: list[Optional[int]], match_on_vmin: bool,
  where_clause: Optional[sql.ColumnElement[bool]], conn: sql.engine.Connection) -> int:
  """Mark rows as deleted that are live and were created prior to current_version.
  Also: populate the undo columns
@@ -397,7 +412,7 @@ class StoreTable(StoreBase):
  assert not tbl_version.is_view()
  super().__init__(tbl_version)

- def _create_rowid_columns(self) -> List[sql.Column]:
+ def _create_rowid_columns(self) -> list[sql.Column]:
  self.rowid_col = sql.Column('rowid', sql.BigInteger, nullable=False)
  return [self.rowid_col]

@@ -413,7 +428,7 @@ class StoreView(StoreBase):
  assert catalog_view.is_view()
  super().__init__(catalog_view)

- def _create_rowid_columns(self) -> List[sql.Column]:
+ def _create_rowid_columns(self) -> list[sql.Column]:
  # a view row corresponds directly to a single base row, which means it needs to duplicate its rowid columns
  self.rowid_cols = [sql.Column(c.name, c.type) for c in self.base.rowid_columns()]
  return self.rowid_cols
@@ -439,7 +454,7 @@ class StoreComponentView(StoreView):
  def __init__(self, catalog_view: catalog.TableVersion):
  super().__init__(catalog_view)

- def _create_rowid_columns(self) -> List[sql.Column]:
+ def _create_rowid_columns(self) -> list[sql.Column]:
  # each base row is expanded into n view rows
  self.rowid_cols = [sql.Column(c.name, c.type) for c in self.base.rowid_columns()]
  # name of pos column: avoid collisions with bases' pos columns
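Note: insert_rows() now assembles each row's primary key before calling _create_table_row(): the rowid part comes either from the optional rowids iterator or from the input row's existing pk with its trailing version component dropped, and v_min is appended as the last element (matching _pk_cols = rowid columns + v_min). A tiny standalone sketch of that assembly, using plain tuples as stand-ins for pixeltable's row objects:

# Hedged sketch of the pk assembly done in insert_rows() above; plain-Python stand-ins only.
from typing import Iterator, Optional

def make_pk(row_pk: tuple[int, ...], v_min: int, rowids: Optional[Iterator[int]] = None) -> tuple[int, ...]:
    # rowid: next value from an explicit rowid iterator if one was passed in,
    # otherwise the input row's rowid columns (its pk without the trailing version)
    rowid = (next(rowids),) if rowids is not None else row_pk[:-1]
    return rowid + (v_min,)   # pk = rowid columns + v_min

# a row whose existing pk is (rowid=7, version=2), re-inserted at table version 3
print(make_pk(row_pk=(7, 2), v_min=3))                  # (7, 3)
# fresh rowids supplied by the caller
print(make_pk(row_pk=(), v_min=3, rowids=iter([42])))   # (42, 3)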