PyPI - pixeltable - Versions diffs - 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl - Mend

pixeltable 0.3.10-py3-none-any.whl → 0.3.12-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (119)

pixeltable/__init__.py +2 -2
pixeltable/__version__.py +2 -2
pixeltable/catalog/__init__.py +2 -1
pixeltable/catalog/catalog.py +370 -93
pixeltable/catalog/column.py +6 -4
pixeltable/catalog/dir.py +5 -5
pixeltable/catalog/globals.py +14 -16
pixeltable/catalog/insertable_table.py +6 -8
pixeltable/catalog/path.py +14 -7
pixeltable/catalog/table.py +72 -62
pixeltable/catalog/table_version.py +137 -107
pixeltable/catalog/table_version_handle.py +3 -0
pixeltable/catalog/table_version_path.py +1 -1
pixeltable/catalog/view.py +10 -14
pixeltable/dataframe.py +5 -3
pixeltable/env.py +108 -42
pixeltable/exec/__init__.py +2 -0
pixeltable/exec/aggregation_node.py +6 -8
pixeltable/exec/cache_prefetch_node.py +4 -7
pixeltable/exec/component_iteration_node.py +1 -3
pixeltable/exec/data_row_batch.py +1 -2
pixeltable/exec/exec_context.py +1 -1
pixeltable/exec/exec_node.py +1 -2
pixeltable/exec/expr_eval/__init__.py +2 -0
pixeltable/exec/expr_eval/evaluators.py +137 -20
pixeltable/exec/expr_eval/expr_eval_node.py +43 -64
pixeltable/exec/expr_eval/globals.py +68 -7
pixeltable/exec/expr_eval/schedulers.py +25 -23
pixeltable/exec/in_memory_data_node.py +8 -6
pixeltable/exec/row_update_node.py +3 -4
pixeltable/exec/sql_node.py +16 -18
pixeltable/exprs/__init__.py +1 -1
pixeltable/exprs/column_property_ref.py +1 -1
pixeltable/exprs/column_ref.py +3 -3
pixeltable/exprs/compound_predicate.py +1 -1
pixeltable/exprs/data_row.py +17 -1
pixeltable/exprs/expr.py +12 -12
pixeltable/exprs/function_call.py +34 -2
pixeltable/exprs/json_mapper.py +95 -48
pixeltable/exprs/json_path.py +4 -9
pixeltable/exprs/method_ref.py +2 -2
pixeltable/exprs/object_ref.py +2 -2
pixeltable/exprs/row_builder.py +33 -6
pixeltable/exprs/similarity_expr.py +1 -1
pixeltable/exprs/sql_element_cache.py +1 -1
pixeltable/exprs/string_op.py +2 -2
pixeltable/ext/__init__.py +1 -1
pixeltable/ext/functions/__init__.py +1 -1
pixeltable/ext/functions/whisperx.py +1 -1
pixeltable/ext/functions/yolox.py +1 -1
pixeltable/func/__init__.py +1 -1
pixeltable/func/aggregate_function.py +2 -2
pixeltable/func/callable_function.py +3 -6
pixeltable/func/expr_template_function.py +24 -4
pixeltable/func/function.py +7 -9
pixeltable/func/function_registry.py +1 -1
pixeltable/func/query_template_function.py +87 -4
pixeltable/func/signature.py +1 -1
pixeltable/func/tools.py +1 -1
pixeltable/func/udf.py +2 -2
pixeltable/functions/__init__.py +1 -1
pixeltable/functions/anthropic.py +2 -2
pixeltable/functions/audio.py +1 -1
pixeltable/functions/deepseek.py +1 -1
pixeltable/functions/fireworks.py +1 -1
pixeltable/functions/globals.py +6 -6
pixeltable/functions/huggingface.py +1 -1
pixeltable/functions/image.py +1 -1
pixeltable/functions/json.py +1 -1
pixeltable/functions/llama_cpp.py +1 -1
pixeltable/functions/math.py +1 -1
pixeltable/functions/mistralai.py +1 -1
pixeltable/functions/ollama.py +1 -1
pixeltable/functions/openai.py +2 -2
pixeltable/functions/replicate.py +1 -1
pixeltable/functions/string.py +1 -1
pixeltable/functions/timestamp.py +1 -1
pixeltable/functions/together.py +1 -1
pixeltable/functions/util.py +1 -1
pixeltable/functions/video.py +2 -2
pixeltable/functions/vision.py +2 -2
pixeltable/globals.py +7 -2
pixeltable/index/embedding_index.py +12 -1
pixeltable/io/__init__.py +5 -3
pixeltable/io/fiftyone.py +6 -7
pixeltable/io/label_studio.py +21 -20
pixeltable/io/pandas.py +6 -5
pixeltable/iterators/__init__.py +1 -1
pixeltable/metadata/__init__.py +6 -4
pixeltable/metadata/converters/convert_24.py +3 -3
pixeltable/metadata/converters/convert_25.py +1 -1
pixeltable/metadata/converters/convert_29.py +1 -1
pixeltable/metadata/converters/convert_31.py +11 -0
pixeltable/metadata/converters/convert_32.py +15 -0
pixeltable/metadata/converters/convert_33.py +17 -0
pixeltable/metadata/notes.py +3 -0
pixeltable/metadata/schema.py +26 -1
pixeltable/plan.py +2 -3
pixeltable/share/packager.py +8 -24
pixeltable/share/publish.py +20 -9
pixeltable/store.py +9 -6
pixeltable/type_system.py +19 -7
pixeltable/utils/console_output.py +3 -2
pixeltable/utils/coroutine.py +3 -3
pixeltable/utils/dbms.py +66 -0
pixeltable/utils/documents.py +61 -67
pixeltable/utils/exception_handler.py +59 -0
pixeltable/utils/filecache.py +1 -1
pixeltable/utils/http_server.py +3 -2
pixeltable/utils/pytorch.py +1 -1
pixeltable/utils/sql.py +1 -1
pixeltable-0.3.12.dist-info/METADATA +436 -0
pixeltable-0.3.12.dist-info/RECORD +183 -0
pixeltable/catalog/path_dict.py +0 -169
pixeltable-0.3.10.dist-info/METADATA +0 -382
pixeltable-0.3.10.dist-info/RECORD +0 -179
{pixeltable-0.3.10.dist-info → pixeltable-0.3.12.dist-info}/LICENSE +0 -0
{pixeltable-0.3.10.dist-info → pixeltable-0.3.12.dist-info}/WHEEL +0 -0
{pixeltable-0.3.10.dist-info → pixeltable-0.3.12.dist-info}/entry_points.txt +0 -0

pixeltable/exec/expr_eval/evaluators.py CHANGED

@@ -9,7 +9,7 @@ from typing import Any, Callable, Iterator, Optional, cast
 from pixeltable import exprs, func
-from .globals import Dispatcher, Evaluator, FnCallArgs
+from .globals import Dispatcher, Evaluator, ExecCtx, FnCallArgs
 _logger = logging.getLogger('pixeltable')
@@ -26,8 +26,8 @@ class DefaultExprEvaluator(Evaluator):
     e: exprs.Expr
-    def __init__(self, e: exprs.Expr, dispatcher: Dispatcher):
-        super().__init__(dispatcher)
+    def __init__(self, e: exprs.Expr, dispatcher: Dispatcher, exec_ctx: ExecCtx):
+        super().__init__(dispatcher, exec_ctx)
         self.e = e
     def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
@@ -47,8 +47,8 @@ class DefaultExprEvaluator(Evaluator):
                 _, _, exc_tb = sys.exc_info()
                 row.set_exc(self.e.slot_idx, exc)
                 rows_with_excs.add(idx)
-                self.dispatcher.dispatch_exc([row], self.e.slot_idx, exc_tb)
-        self.dispatcher.dispatch([rows[i] for i in range(len(rows)) if i not in rows_with_excs])
+                self.dispatcher.dispatch_exc([row], self.e.slot_idx, exc_tb, self.exec_ctx)
+        self.dispatcher.dispatch([rows[i] for i in range(len(rows)) if i not in rows_with_excs], self.exec_ctx)
 class FnCallEvaluator(Evaluator):
@@ -70,8 +70,8 @@ class FnCallEvaluator(Evaluator):
     call_args_queue: Optional[asyncio.Queue[FnCallArgs]]  # FnCallArgs waiting for execution
     batch_size: Optional[int]
-    def __init__(self, fn_call: exprs.FunctionCall, dispatcher: Dispatcher):
-        super().__init__(dispatcher)
+    def __init__(self, fn_call: exprs.FunctionCall, dispatcher: Dispatcher, exec_ctx: ExecCtx):
+        super().__init__(dispatcher, exec_ctx)
         self.fn_call = fn_call
         self.fn = cast(func.CallableFunction, fn_call.fn)
         if isinstance(self.fn, func.CallableFunction) and self.fn.is_batched:
@@ -104,7 +104,7 @@ class FnCallEvaluator(Evaluator):
                 rows_call_args.append(FnCallArgs(self.fn_call, [row], args=args, kwargs=kwargs))
         if len(skip_rows) > 0:
-            self.dispatcher.dispatch(skip_rows)
+            self.dispatcher.dispatch(skip_rows, self.exec_ctx)
         if self.batch_size is not None:
             if not self.is_closed and (len(rows_call_args) + self.call_args_queue.qsize() < self.batch_size):
@@ -132,7 +132,7 @@ class FnCallEvaluator(Evaluator):
                 if self.fn_call.resource_pool is not None:
                     # hand the call off to the resource pool's scheduler
                     scheduler = self.dispatcher.schedulers[self.fn_call.resource_pool]
-                    scheduler.submit(batched_call_args)
+                    scheduler.submit(batched_call_args, self.exec_ctx)
                 else:
                     task = asyncio.create_task(self.eval_batch(batched_call_args))
                     self.dispatcher.register_task(task)
@@ -142,7 +142,7 @@ class FnCallEvaluator(Evaluator):
                 # hand the call off to the resource pool's scheduler
                 scheduler = self.dispatcher.schedulers[self.fn_call.resource_pool]
                 for item in rows_call_args:
-                    scheduler.submit(item)
+                    scheduler.submit(item, self.exec_ctx)
             else:
                 # create one task per call
                 for item in rows_call_args:
@@ -161,14 +161,12 @@ class FnCallEvaluator(Evaluator):
     def _create_batch_call_args(self, call_args: list[FnCallArgs]) -> FnCallArgs:
         """Roll call_args into a single batched FnCallArgs"""
         batch_args: list[list[Optional[Any]]] = [[None] * len(call_args) for _ in range(len(self.fn_call.arg_idxs))]
-        batch_kwargs: dict[str, list[Optional[Any]]] = {
-            k: [None] * len(call_args) for k in self.fn_call.kwarg_idxs.keys()
-        }
+        batch_kwargs: dict[str, list[Optional[Any]]] = {k: [None] * len(call_args) for k in self.fn_call.kwarg_idxs}
         assert isinstance(self.fn, func.CallableFunction)
         for i, item in enumerate(call_args):
             for j in range(len(item.args)):
                 batch_args[j][i] = item.args[j]
-            for k in item.kwargs.keys():
+            for k in item.kwargs:
                 batch_kwargs[k][i] = item.kwargs[k]
         return FnCallArgs(
             self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs
@@ -190,12 +188,12 @@ class FnCallEvaluator(Evaluator):
             _, _, exc_tb = sys.exc_info()
             for row in batched_call_args.rows:
                 row.set_exc(self.fn_call.slot_idx, exc)
-            self.dispatcher.dispatch_exc(batched_call_args.rows, self.fn_call.slot_idx, exc_tb)
+            self.dispatcher.dispatch_exc(batched_call_args.rows, self.fn_call.slot_idx, exc_tb, self.exec_ctx)
             return
         for i, row in enumerate(batched_call_args.rows):
             row[self.fn_call.slot_idx] = result_batch[i]
-        self.dispatcher.dispatch(batched_call_args.rows)
+        self.dispatcher.dispatch(batched_call_args.rows, self.exec_ctx)
     async def eval_async(self, call_args: FnCallArgs) -> None:
         assert len(call_args.rows) == 1
@@ -208,7 +206,7 @@ class FnCallEvaluator(Evaluator):
             call_args.row[self.fn_call.slot_idx] = await self.fn.aexec(*call_args.args, **call_args.kwargs)
             end_ts = datetime.datetime.now()
             _logger.debug(f'Evaluated slot {self.fn_call.slot_idx} in {end_ts - start_ts}')
-            self.dispatcher.dispatch([call_args.row])
+            self.dispatcher.dispatch([call_args.row], self.exec_ctx)
         except Exception as exc:
             import anthropic
@@ -216,7 +214,7 @@ class FnCallEvaluator(Evaluator):
                 _logger.debug(f'RateLimitError: {exc}')
             _, _, exc_tb = sys.exc_info()
             call_args.row.set_exc(self.fn_call.slot_idx, exc)
-            self.dispatcher.dispatch_exc(call_args.rows, self.fn_call.slot_idx, exc_tb)
+            self.dispatcher.dispatch_exc(call_args.rows, self.fn_call.slot_idx, exc_tb, self.exec_ctx)
     async def eval(self, call_args_batch: list[FnCallArgs]) -> None:
         rows_with_excs: set[int] = set()  # records idxs into 'rows'
@@ -233,9 +231,9 @@ class FnCallEvaluator(Evaluator):
                 _, _, exc_tb = sys.exc_info()
                 item.row.set_exc(self.fn_call.slot_idx, exc)
                 rows_with_excs.add(idx)
-                self.dispatcher.dispatch_exc(item.rows, self.fn_call.slot_idx, exc_tb)
+                self.dispatcher.dispatch_exc(item.rows, self.fn_call.slot_idx, exc_tb, self.exec_ctx)
         self.dispatcher.dispatch(
-            [call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs]
+            [call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs], self.exec_ctx
         )
     def _close(self) -> None:
@@ -246,3 +244,122 @@ class FnCallEvaluator(Evaluator):
         batched_call_args = self._create_batch_call_args(list(self._queued_call_args_iter()))
         task = asyncio.create_task(self.eval_batch(batched_call_args))
         self.dispatcher.register_task(task)
+class NestedRowList:
+    """
+    A list of nested rows, used by JsonMapperDispatcher to store the rows corresponding to the elements of the
+    JsonMapper source list and make completion awaitable.
+    """
+    rows: list[exprs.DataRow]
+    num_completed: int
+    completion: asyncio.Event
+    def __init__(self, rows: list[exprs.DataRow]):
+        self.num_completed = 0
+        self.rows = rows
+        self.completion = asyncio.Event()
+    def complete_row(self) -> None:
+        self.num_completed += 1
+        if self.num_completed == len(self.rows):
+            self.completion.set()
+class JsonMapperDispatcher(Evaluator):
+    """
+    The execution logic for materializing the nested DataRows of a JsonMapper/JsonMapperDispatch.
+    The rows are stored in a NestedRowList, which itself is stored in the JsonMapperDispatch instance's slot.
+    """
+    e: exprs.JsonMapperDispatch
+    target_expr: exprs.Expr
+    scope_anchor: exprs.ObjectRef
+    nested_exec_ctx: ExecCtx  # ExecCtx needed to evaluate the nested rows
+    external_slot_map: dict[int, int]  # slot idx in parent row -> slot idx in nested row
+    has_async_calls: bool  # True if target_expr contains any async FunctionCalls
+    def __init__(self, e: exprs.JsonMapperDispatch, dispatcher: Dispatcher, exec_ctx: ExecCtx):
+        super().__init__(dispatcher, exec_ctx)
+        self.e = e
+        self.target_expr = e.target_expr.copy()  # we need new slot idxs
+        self.scope_anchor = e.scope_anchor.copy()
+        nested_row_builder = exprs.RowBuilder(output_exprs=[self.target_expr], columns=[], input_exprs=[])
+        nested_row_builder.set_slot_idxs([self.target_expr, self.scope_anchor])
+        target_expr_ctx = nested_row_builder.create_eval_ctx([self.target_expr], limit_scope=True)
+        self.has_async_calls = any(isinstance(e, exprs.FunctionCall) and e.is_async for e in target_expr_ctx.exprs)
+        target_scope = self.target_expr.scope()
+        # we need to pre-populated nested rows with slot values that are produced in an outer scope (literals excluded)
+        parent_exprs = [
+            e for e in target_expr_ctx.exprs if e.scope() != target_scope and not isinstance(e, exprs.Literal)
+        ]
+        self.external_slot_map = {exec_ctx.row_builder.unique_exprs[e].slot_idx: e.slot_idx for e in parent_exprs}
+        self.nested_exec_ctx = ExecCtx(dispatcher, nested_row_builder, [self.target_expr], parent_exprs)
+    def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
+        """Create nested rows for all source list elements and dispatch them"""
+        assert self.e.slot_idx >= 0
+        all_nested_rows: list[exprs.DataRow] = []
+        for row in rows:
+            src = row[self.e.src_expr.slot_idx]
+            if not isinstance(src, list):
+                # invalid/non-list src path
+                row[self.e.slot_idx] = None
+                continue
+            nested_rows = [
+                exprs.DataRow(
+                    size=self.nested_exec_ctx.row_builder.num_materialized,
+                    img_slot_idxs=[],
+                    media_slot_idxs=[],
+                    array_slot_idxs=[],
+                    parent_row=row,
+                    parent_slot_idx=self.e.slot_idx,
+                )
+                for _ in src
+            ]
+            for nested_row, anchor_val in zip(nested_rows, src):
+                nested_row[self.scope_anchor.slot_idx] = anchor_val
+                for slot_idx_, nested_slot_idx in self.external_slot_map.items():
+                    nested_row[nested_slot_idx] = row[slot_idx_]
+            self.nested_exec_ctx.init_rows(nested_rows)
+            # we modify DataRow.vals here directly, rather than going through __getitem__(), because we don't have
+            # an official "value" yet (the nested rows are not yet materialized)
+            row.vals[self.e.slot_idx] = NestedRowList(nested_rows)
+            all_nested_rows.extend(nested_rows)
+        self.dispatcher.dispatch(all_nested_rows, self.nested_exec_ctx)
+        task = asyncio.create_task(self.gather(rows))
+        self.dispatcher.register_task(task)
+    async def gather(self, rows: list[exprs.DataRow]) -> None:
+        """Wait for nested rows to complete, then signal completion to the parent rows"""
+        if self.has_async_calls:
+            # if our target expr contains async FunctionCalls, they typically get completed out-of-order, and it's
+            # more effective to dispatch them as they complete
+            remaining = {
+                asyncio.create_task(row.vals[self.e.slot_idx].completion.wait()): row
+                for row in rows
+                if not row.has_val[self.e.slot_idx]
+            }
+            while len(remaining) > 0:
+                done, _ = await asyncio.wait(remaining.keys(), return_when=asyncio.FIRST_COMPLETED)
+                done_rows = [remaining.pop(task) for task in done]
+                for row in done_rows:
+                    row.has_val[self.e.slot_idx] = True
+                self.dispatcher.dispatch(done_rows, self.exec_ctx)
+        else:
+            # our target expr doesn't contain async FunctionCalls, which means they will get completed in-order
+            for row in rows:
+                if row.has_val[self.e.slot_idx]:
+                    # the source_expr's value is not a list
+                    assert row.vals[self.e.slot_idx] is None
+                    continue
+                assert row.vals[self.e.slot_idx] is not None and isinstance(row.vals[self.e.slot_idx], NestedRowList)
+                await row.vals[self.e.slot_idx].completion.wait()
+                row.has_val[self.e.slot_idx] = True
+            self.dispatcher.dispatch(rows, self.exec_ctx)

pixeltable/exec/expr_eval/expr_eval_node.py CHANGED

@@ -9,12 +9,12 @@ from typing import AsyncIterator, Iterable, Optional, Union
 import numpy as np
 import pixeltable.exceptions as excs
-from pixeltable import exprs, func
+from pixeltable import exprs
 from ..data_row_batch import DataRowBatch
 from ..exec_node import ExecNode
-from .evaluators import DefaultExprEvaluator, FnCallEvaluator
-from .globals import Evaluator, Scheduler
+from .evaluators import FnCallEvaluator, NestedRowList
+from .globals import ExecCtx, Scheduler
 from .row_buffer import RowBuffer
 from .schedulers import SCHEDULERS
@@ -42,12 +42,9 @@ class ExprEvalNode(ExecNode):
     """
     maintain_input_order: bool  # True if we're returning rows in the order we received them from our input
-    num_dependencies: np.ndarray  # number of dependencies for our output slots; indexed by slot idx
     outputs: np.ndarray  # bool per slot; True if this slot is part of our output
-    slot_evaluators: dict[int, Evaluator]  # key: slot idx
     schedulers: dict[str, Scheduler]  # key: resource pool name
-    gc_targets: np.ndarray  # bool per slot; True if this is an intermediate expr (ie, not part of our output)
-    eval_ctx: np.ndarray  # bool per slot; EvalCtx.slot_idxs as a mask
+    exec_ctx: ExecCtx  # for input/output rows
     # execution state
     tasks: set[asyncio.Task]  # collects all running tasks to prevent them from getting gc'd
@@ -82,19 +79,10 @@ class ExprEvalNode(ExecNode):
     ):
         super().__init__(row_builder, output_exprs, input_exprs, input)
         self.maintain_input_order = maintain_input_order
-        self.num_dependencies = np.sum(row_builder.dependencies, axis=1)
         self.outputs = np.zeros(row_builder.num_materialized, dtype=bool)
         output_slot_idxs = [e.slot_idx for e in output_exprs]
         self.outputs[output_slot_idxs] = True
         self.tasks = set()
-        self.gc_targets = np.ones(row_builder.num_materialized, dtype=bool)
-        # we need to retain all slots that are part of the output
-        self.gc_targets[[e.slot_idx for e in row_builder.output_exprs]] = False
-        output_ctx = self.row_builder.create_eval_ctx(output_exprs, exclude=input_exprs)
-        self.eval_ctx = np.zeros(row_builder.num_materialized, dtype=bool)
-        self.eval_ctx[output_ctx.slot_idxs] = True
         self.error = None
         self.input_iter = self.input.__aiter__()
@@ -110,30 +98,14 @@ class ExprEvalNode(ExecNode):
         self.num_input_rows = 0
         self.num_output_rows = 0
-        self.slot_evaluators = {}
+        # self.slot_evaluators = {}
         self.schedulers = {}
-        self._init_slot_evaluators()
+        # self._init_slot_evaluators()
+        self.exec_ctx = ExecCtx(self, self.row_builder, output_exprs, input_exprs)
     def set_input_order(self, maintain_input_order: bool) -> None:
         self.maintain_input_order = maintain_input_order
-    def _init_slot_evaluators(self) -> None:
-        """Create slot evaluators and resource pool schedulers"""
-        resource_pools: set[str] = set()
-        for slot_idx in range(self.row_builder.num_materialized):
-            expr = self.row_builder.unique_exprs[slot_idx]
-            if (
-                isinstance(expr, exprs.FunctionCall)
-                # ExprTemplateFunction and AggregateFunction calls are best handled by FunctionCall.eval()
-                and not isinstance(expr.fn, func.ExprTemplateFunction)
-                and not isinstance(expr.fn, func.AggregateFunction)
-            ):
-                if expr.resource_pool is not None:
-                    resource_pools.add(expr.resource_pool)
-                self.slot_evaluators[slot_idx] = FnCallEvaluator(expr, self)
-            else:
-                self.slot_evaluators[slot_idx] = DefaultExprEvaluator(expr, self)
     async def _fetch_input_batch(self) -> None:
         """
         Fetches another batch from our input or sets input_complete to True if there are no more batches.
@@ -155,7 +127,8 @@ class ExprEvalNode(ExecNode):
             self.num_input_rows += len(batch)
             self.avail_input_rows += len(batch)
             _logger.debug(
-                f'adding input: batch_size={len(batch)} #input_rows={self.num_input_rows} #avail={self.avail_input_rows}'
+                f'adding input: batch_size={len(batch)} #input_rows={self.num_input_rows} '
+                f'#avail={self.avail_input_rows}'
             )
         except StopAsyncIteration:
             self.input_complete = True
@@ -199,8 +172,8 @@ class ExprEvalNode(ExecNode):
         self.num_in_flight += num_rows
         self._log_state(f'dispatch input ({num_rows})')
-        self._init_input_rows(rows)
-        self.dispatch(rows)
+        self.exec_ctx.init_rows(rows)
+        self.dispatch(rows, self.exec_ctx)
     def _log_state(self, prefix: str) -> None:
         _logger.debug(
@@ -212,7 +185,9 @@ class ExprEvalNode(ExecNode):
     def _init_schedulers(self) -> None:
         resource_pools = {
-            eval.fn_call.resource_pool for eval in self.slot_evaluators.values() if isinstance(eval, FnCallEvaluator)
+            eval.fn_call.resource_pool
+            for eval in self.exec_ctx.slot_evaluators.values()
+            if isinstance(eval, FnCallEvaluator)
         }
         resource_pools = {pool for pool in resource_pools if pool is not None}
         for pool_name in resource_pools:
@@ -287,7 +262,7 @@ class ExprEvalNode(ExecNode):
                 if self.input_complete and self.avail_input_rows == 0 and not closed_evaluators:
                     # no more input rows to dispatch, but we're still waiting for rows to finish:
                     # close  all slot evaluators to flush queued rows
-                    for evaluator in self.slot_evaluators.values():
+                    for evaluator in self.exec_ctx.slot_evaluators.values():
                         evaluator.close()
                     closed_evaluators = True
@@ -303,7 +278,7 @@ class ExprEvalNode(ExecNode):
                     if completed_aw is None:
                         completed_aw = asyncio.create_task(self.completed_event.wait(), name='completed.wait()')
                     aws.add(completed_aw)
-                done, pending = await asyncio.wait(aws, return_when=asyncio.FIRST_COMPLETED)
+                done, _ = await asyncio.wait(aws, return_when=asyncio.FIRST_COMPLETED)
                 if self.exc_event.is_set():
                     # we got an exception that we need to propagate through __iter__()
@@ -332,22 +307,18 @@ class ExprEvalNode(ExecNode):
                     task.cancel()
             _ = await asyncio.gather(*active_tasks, return_exceptions=True)
-    def _init_input_rows(self, rows: list[exprs.DataRow]) -> None:
-        """Set execution state in DataRow"""
-        for row in rows:
-            row.missing_dependents = np.sum(self.row_builder.dependencies[row.has_val == False], axis=0)
-            row.missing_slots = self.eval_ctx & (row.has_val == False)
-    def dispatch_exc(self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType) -> None:
+    def dispatch_exc(
+        self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType, exec_ctx: ExecCtx
+    ) -> None:
         """Propagate exception to main event loop or to dependent slots, depending on ignore_errors"""
         if len(rows) == 0 or self.exc_event.is_set():
             return
         if not self.ctx.ignore_errors:
-            dependency_idxs = [e.slot_idx for e in self.row_builder.unique_exprs[slot_with_exc].dependencies()]
+            dependency_idxs = [e.slot_idx for e in exec_ctx.row_builder.unique_exprs[slot_with_exc].dependencies()]
             first_row = rows[0]
             input_vals = [first_row[idx] for idx in dependency_idxs]
-            e = self.row_builder.unique_exprs[slot_with_exc]
+            e = exec_ctx.row_builder.unique_exprs[slot_with_exc]
             self.error = excs.ExprEvalError(e, f'expression {e}', first_row.get_exc(e.slot_idx), exc_tb, input_vals, 0)
             self.exc_event.set()
             return
@@ -356,17 +327,17 @@ class ExprEvalNode(ExecNode):
             assert row.has_exc(slot_with_exc)
             exc = row.get_exc(slot_with_exc)
             # propagate exception
-            for slot_idx in np.nonzero(self.row_builder.transitive_dependents[slot_with_exc])[0].tolist():
+            for slot_idx in np.nonzero(exec_ctx.row_builder.transitive_dependents[slot_with_exc])[0].tolist():
                 row.set_exc(slot_idx, exc)
-        self.dispatch(rows)
+        self.dispatch(rows, exec_ctx)
-    def dispatch(self, rows: list[exprs.DataRow]) -> None:
+    def dispatch(self, rows: list[exprs.DataRow], exec_ctx: ExecCtx) -> None:
         """Dispatch rows to slot evaluators, based on materialized dependencies"""
         if len(rows) == 0 or self.exc_event.is_set():
             return
         # slots ready for evaluation; rows x slots
-        ready_slots = np.zeros((len(rows), self.row_builder.num_materialized), dtype=bool)
+        ready_slots = np.zeros((len(rows), exec_ctx.row_builder.num_materialized), dtype=bool)
         completed_rows = np.zeros(len(rows), dtype=bool)
         for i, row in enumerate(rows):
             row.missing_slots &= row.has_val == False
@@ -375,25 +346,33 @@ class ExprEvalNode(ExecNode):
                 completed_rows[i] = True
             else:
                 # dependencies of missing slots
-                missing_dependencies = self.num_dependencies * row.missing_slots
+                missing_dependencies = exec_ctx.row_builder.num_dependencies * row.missing_slots
                 # determine ready slots that are not yet materialized and not yet scheduled
-                num_mat_dependencies = np.sum(self.row_builder.dependencies * row.has_val, axis=1)
+                num_mat_dependencies = np.sum(exec_ctx.row_builder.dependencies * row.has_val, axis=1)
                 num_missing = missing_dependencies - num_mat_dependencies
                 ready_slots[i] = (num_missing == 0) & (row.is_scheduled == False) & row.missing_slots
-                row.is_scheduled = row.is_scheduled | ready_slots[i]
+                row.is_scheduled |= ready_slots[i]
             # clear intermediate values that are no longer needed (ie, all dependents are materialized)
-            missing_dependents = np.sum(self.row_builder.dependencies[row.has_val == False], axis=0)
-            gc_targets = (missing_dependents == 0) & (row.missing_dependents > 0) & self.gc_targets
+            missing_dependents = np.sum(exec_ctx.row_builder.dependencies[row.has_val == False], axis=0)
+            gc_targets = (missing_dependents == 0) & (row.missing_dependents > 0) & exec_ctx.gc_targets
             row.clear(gc_targets)
             row.missing_dependents = missing_dependents
         if np.any(completed_rows):
             completed_idxs = list(completed_rows.nonzero()[0])
-            for i in completed_idxs:
-                self.completed_rows.put_nowait(rows[i])
-            self.completed_event.set()
-            self.num_in_flight -= len(completed_idxs)
+            if rows[i].parent_row is not None:
+                # these are nested rows
+                for i in completed_idxs:
+                    row = rows[i]
+                    assert row.parent_row is not None and row.parent_slot_idx is not None
+                    assert isinstance(row.parent_row.vals[row.parent_slot_idx], NestedRowList)
+                    row.parent_row.vals[row.parent_slot_idx].complete_row()
+            else:
+                for i in completed_idxs:
+                    self.completed_rows.put_nowait(rows[i])
+                self.completed_event.set()
+                self.num_in_flight -= len(completed_idxs)
         # schedule all ready slots
         for slot_idx in np.sum(ready_slots, axis=0).nonzero()[0]:
@@ -401,7 +380,7 @@ class ExprEvalNode(ExecNode):
             _ = ready_rows_v.nonzero()
             ready_rows = [rows[i] for i in ready_rows_v.nonzero()[0]]
             _logger.debug(f'Scheduling {len(ready_rows)} rows for slot {slot_idx}')
-            self.slot_evaluators[slot_idx].schedule(ready_rows, slot_idx)
+            exec_ctx.slot_evaluators[slot_idx].schedule(ready_rows, slot_idx)
     def register_task(self, t: asyncio.Task) -> None:
         self.tasks.add(t)

pixeltable/exec/expr_eval/globals.py CHANGED

@@ -4,7 +4,9 @@ import abc
 import asyncio
 from dataclasses import dataclass
 from types import TracebackType
-from typing import Any, Optional, Protocol
+from typing import Any, Iterable, Optional, Protocol
+import numpy as np
 from pixeltable import exprs, func
@@ -53,6 +55,7 @@ class Scheduler(abc.ABC):
         request: FnCallArgs
         num_retries: int
+        exec_ctx: ExecCtx
         def __lt__(self, other: Scheduler.QueueItem) -> bool:
             # prioritize by number of retries (more retries = higher priority)
@@ -67,8 +70,8 @@ class Scheduler(abc.ABC):
         self.queue = asyncio.PriorityQueue()
         self.dispatcher = dispatcher
-    def submit(self, item: FnCallArgs) -> None:
-        self.queue.put_nowait(self.QueueItem(item, 0))
+    def submit(self, item: FnCallArgs, exec_ctx: ExecCtx) -> None:
+        self.queue.put_nowait(self.QueueItem(item, 0, exec_ctx))
     @classmethod
     @abc.abstractmethod
@@ -90,11 +93,11 @@ class Dispatcher(Protocol):
     exc_event: asyncio.Event
     schedulers: dict[str, Scheduler]  # key: resource pool id
-    def dispatch(self, rows: list[exprs.DataRow]) -> None:
+    def dispatch(self, rows: list[exprs.DataRow], exec_ctx: Any) -> None:
         """Dispatches row slots to the appropriate schedulers; does not block"""
         ...
-    def dispatch_exc(self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType) -> None:
+    def dispatch_exc(self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType, exec_ctx: Any) -> None:
         """Propagates exception in slot_with_exc to all dependent slots and dispatches the rest; does not block"""
         ...
@@ -116,15 +119,16 @@ class Evaluator(abc.ABC):
     dispatcher: Dispatcher
     is_closed: bool
+    exec_ctx: 'ExecCtx'
-    def __init__(self, dispatcher: Dispatcher):
+    def __init__(self, dispatcher: Dispatcher, exec_ctx: 'ExecCtx') -> None:
         self.dispatcher = dispatcher
         self.is_closed = False
+        self.exec_ctx = exec_ctx
     @abc.abstractmethod
     def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
         """Create tasks to evaluate the expression in the given slot for the given rows; must not block."""
-        ...
     def _close(self) -> None:
         """Close the evaluator; must not block"""
@@ -134,3 +138,60 @@ class Evaluator(abc.ABC):
         """Indicates that there may not be any more rows getting scheduled"""
         self.is_closed = True
         self._close()
+class ExecCtx:
+    """DataRow-specific state needed by ExprEvalNode"""
+    row_builder: exprs.RowBuilder
+    slot_evaluators: dict[int, Evaluator]  # key: slot idx
+    gc_targets: np.ndarray  # bool per slot; True if this is an intermediate expr (ie, not part of our output)
+    eval_ctx: np.ndarray  # bool per slot; EvalCtx.slot_idxs as a mask
+    literals: dict[int, Any]  # key: slot idx; value: literal value for this slot; used to pre-populate rows
+    def __init__(
+        self,
+        dispatcher: Dispatcher,
+        row_builder: exprs.RowBuilder,
+        output_exprs: Iterable[exprs.Expr],
+        input_exprs: Iterable[exprs.Expr],
+    ):
+        self.row_builder = row_builder
+        self.slot_evaluators = {}
+        # TODO: only include output_exprs dependencies
+        self.gc_targets = np.ones(self.row_builder.num_materialized, dtype=bool)
+        # we need to retain all slots that are part of the output
+        self.gc_targets[[e.slot_idx for e in self.row_builder.output_exprs]] = False
+        output_ctx = self.row_builder.create_eval_ctx(output_exprs, exclude=input_exprs)
+        self.literals = {e.slot_idx: e.val for e in output_ctx.exprs if isinstance(e, exprs.Literal)}
+        self.eval_ctx = np.zeros(self.row_builder.num_materialized, dtype=bool)
+        non_literal_slot_idxs = [e.slot_idx for e in output_ctx.exprs if not isinstance(e, exprs.Literal)]
+        self.eval_ctx[non_literal_slot_idxs] = True
+        self._init_slot_evaluators(dispatcher, non_literal_slot_idxs)
+    def _init_slot_evaluators(self, dispatcher: Dispatcher, target_slot_idxs: list[int]) -> None:
+        from .evaluators import DefaultExprEvaluator, FnCallEvaluator, JsonMapperDispatcher
+        for slot_idx in target_slot_idxs:
+            expr = self.row_builder.unique_exprs[slot_idx]
+            if (
+                isinstance(expr, exprs.FunctionCall)
+                # ExprTemplateFunction and AggregateFunction calls are best handled by FunctionCall.eval()
+                and not isinstance(expr.fn, func.ExprTemplateFunction)
+                and not isinstance(expr.fn, func.AggregateFunction)
+            ):
+                self.slot_evaluators[slot_idx] = FnCallEvaluator(expr, dispatcher, self)
+            elif isinstance(expr, exprs.JsonMapperDispatch):
+                self.slot_evaluators[slot_idx] = JsonMapperDispatcher(expr, dispatcher, self)
+            else:
+                self.slot_evaluators[slot_idx] = DefaultExprEvaluator(expr, dispatcher, self)
+    def init_rows(self, rows: list[exprs.DataRow]) -> None:
+        """Pre-populate rows with literals and initialize execution state"""
+        for row in rows:
+            # set literals before missing_dependents/slots
+            for slot_idx, val in self.literals.items():
+                row[slot_idx] = val
+            row.missing_dependents = np.sum(self.row_builder.dependencies[row.has_val == False], axis=0)
+            row.missing_slots = self.eval_ctx & (row.has_val == False)

pixeltable 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl

Potentially problematic release.

pixeltable 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl