pixeltable 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries; it is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/insertable_table.py +3 -3
- pixeltable/catalog/table.py +2 -2
- pixeltable/catalog/table_version.py +3 -2
- pixeltable/catalog/view.py +1 -1
- pixeltable/dataframe.py +52 -27
- pixeltable/env.py +109 -4
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/aggregation_node.py +3 -3
- pixeltable/exec/cache_prefetch_node.py +13 -7
- pixeltable/exec/component_iteration_node.py +3 -9
- pixeltable/exec/data_row_batch.py +17 -5
- pixeltable/exec/exec_node.py +32 -12
- pixeltable/exec/expr_eval/__init__.py +1 -0
- pixeltable/exec/expr_eval/evaluators.py +240 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +408 -0
- pixeltable/exec/expr_eval/globals.py +113 -0
- pixeltable/exec/expr_eval/row_buffer.py +76 -0
- pixeltable/exec/expr_eval/schedulers.py +240 -0
- pixeltable/exec/in_memory_data_node.py +2 -2
- pixeltable/exec/row_update_node.py +14 -14
- pixeltable/exec/sql_node.py +2 -2
- pixeltable/exprs/column_ref.py +5 -1
- pixeltable/exprs/data_row.py +50 -40
- pixeltable/exprs/expr.py +57 -12
- pixeltable/exprs/function_call.py +54 -19
- pixeltable/exprs/inline_expr.py +12 -21
- pixeltable/exprs/literal.py +25 -8
- pixeltable/exprs/row_builder.py +25 -2
- pixeltable/func/aggregate_function.py +4 -0
- pixeltable/func/callable_function.py +54 -4
- pixeltable/func/expr_template_function.py +5 -1
- pixeltable/func/function.py +48 -7
- pixeltable/func/query_template_function.py +16 -7
- pixeltable/func/udf.py +7 -1
- pixeltable/functions/__init__.py +1 -1
- pixeltable/functions/anthropic.py +97 -21
- pixeltable/functions/gemini.py +2 -6
- pixeltable/functions/openai.py +219 -28
- pixeltable/globals.py +2 -3
- pixeltable/io/hf_datasets.py +1 -1
- pixeltable/io/label_studio.py +5 -5
- pixeltable/io/parquet.py +1 -1
- pixeltable/metadata/__init__.py +2 -1
- pixeltable/plan.py +24 -9
- pixeltable/store.py +6 -0
- pixeltable/type_system.py +73 -36
- pixeltable/utils/arrow.py +3 -8
- pixeltable/utils/console_output.py +41 -0
- pixeltable/utils/filecache.py +1 -1
- {pixeltable-0.3.0.dist-info → pixeltable-0.3.2.dist-info}/METADATA +4 -1
- {pixeltable-0.3.0.dist-info → pixeltable-0.3.2.dist-info}/RECORD +55 -49
- pixeltable/exec/expr_eval_node.py +0 -232
- {pixeltable-0.3.0.dist-info → pixeltable-0.3.2.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.0.dist-info → pixeltable-0.3.2.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.0.dist-info → pixeltable-0.3.2.dist-info}/entry_points.txt +0 -0
pixeltable/exec/expr_eval/globals.py
ADDED

@@ -0,0 +1,113 @@
+import abc
+import asyncio
+from dataclasses import dataclass
+from types import TracebackType
+from typing import Any, Protocol, Optional
+
+from pixeltable import exprs
+from pixeltable import func
+
+
+@dataclass
+class FnCallArgs:
+    """Container for everything needed to execute a FunctionCall against one or more DataRows"""
+    fn_call: exprs.FunctionCall
+    rows: list[exprs.DataRow]
+    # single call
+    args: Optional[list[Any]] = None
+    kwargs: Optional[dict[str, Any]] = None
+    # batch call
+    batch_args: Optional[list[list[Optional[Any]]]] = None
+    batch_kwargs: Optional[dict[str, list[Optional[Any]]]] = None
+
+    @property
+    def pxt_fn(self) -> func.CallableFunction:
+        assert isinstance(self.fn_call.fn, func.CallableFunction)
+        return self.fn_call.fn
+
+    @property
+    def is_batched(self) -> bool:
+        return self.batch_args is not None
+
+    @property
+    def row(self) -> exprs.DataRow:
+        assert len(self.rows) == 1
+        return self.rows[0]
+
+
+class Scheduler(abc.ABC):
+    """
+    Base class for schedulers. A scheduler executes FunctionCalls against a limited resource pool.
+
+    Expected behavior:
+    - all created tasks must be recorded in dispatcher.tasks
+    - schedulers are responsible for aborting execution when a) the task is cancelled or b) when an exception occurred
+      elsewhere (indicated by dispatcher.exc_event)
+    """
+    @abc.abstractmethod
+    def submit(self, item: FnCallArgs) -> None:
+        pass
+
+    @classmethod
+    @abc.abstractmethod
+    def matches(cls, resource_pool: str) -> bool:
+        """Returns True if the scheduler can handle the given resource pool"""
+        pass
+
+
+class Dispatcher(Protocol):
+    """
+    Row dispatcher used by Evaluators/Schedulers for post-processing after slot materialization and for task management.
+
+    Task management: all tasks need to be registered via register_task()
+    Exceptions: evaluators/schedulers need to check exc_event prior to starting long-running (non-interruptible)
+    computations
+    """
+    row_builder: exprs.RowBuilder
+    exc_event: asyncio.Event
+    schedulers: dict[str, Scheduler]  # key: resource pool id
+
+    def dispatch(self, rows: list[exprs.DataRow]) -> None:
+        """Dispatches row slots to the appropriate schedulers; does not block"""
+        ...
+
+    def dispatch_exc(self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType) -> None:
+        """Propagates exception in slot_with_exc to all dependent slots and dispatches the rest; does not block"""
+        ...
+
+    def register_task(self, f: asyncio.Task) -> None:
+        """Register task with dispatcher for subsequent cleanup; does not block"""
+        ...
+
+
+class Evaluator(abc.ABC):
+    """
+    Base class for expression evaluators. Each DataRow slot is assigned an evaluator, which is responsible for the
+    execution of the expression evaluation logic as well as the scheduling/task breakdown of that execution.
+
+    Expected behavior:
+    - all created tasks must be recorded in dispatcher.tasks
+    - evaluators are responsible for aborting execution when a) the task is cancelled or b) when an exception occurred
+      elsewhere (indicated by dispatcher.exc_event)
+    """
+    dispatcher: Dispatcher
+    is_closed: bool
+
+    def __init__(self, dispatcher: Dispatcher):
+        self.dispatcher = dispatcher
+        self.is_closed = False
+
+    @abc.abstractmethod
+    def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
+        """Create tasks to evaluate the expression in the given slot for the given rows; must not block."""
+        ...
+
+    def _close(self) -> None:
+        """Close the evaluator; must not block"""
+        pass
+
+    def close(self) -> None:
+        """Indicates that there may not be any more rows getting scheduled"""
+        self.is_closed = True
+        self._close()
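Taken together, these interfaces split expression execution into three roles: an Evaluator breaks a slot's evaluation into tasks, a Scheduler runs FunctionCalls against one resource pool, and the Dispatcher routes finished rows and tracks tasks. As a reading aid only, here is a minimal sketch of a scheduler written against this contract; the class name and the 'unbounded:' pool prefix are invented for illustration and are not part of this release:

import asyncio

from pixeltable.exec.expr_eval.globals import Dispatcher, FnCallArgs, Scheduler


class UnboundedScheduler(Scheduler):
    """Hypothetical scheduler: starts every submitted call immediately, with no rate limiting."""

    def __init__(self, dispatcher: Dispatcher):
        self.dispatcher = dispatcher

    @classmethod
    def matches(cls, resource_pool: str) -> bool:
        # assumed pool-id convention, mirroring RateLimitsScheduler's 'rate-limits:' prefix
        return resource_pool.startswith('unbounded:')

    def submit(self, item: FnCallArgs) -> None:
        # per the contract: submit() must not block, and every created task must be registered
        task = asyncio.create_task(self._exec(item))
        self.dispatcher.register_task(task)

    async def _exec(self, item: FnCallArgs) -> None:
        # assumes a non-batched call, i.e. item.args/item.kwargs are set
        if self.dispatcher.exc_event.is_set():
            return  # an exception occurred elsewhere; abort instead of starting new work
        result = await item.pxt_fn.aexec(*item.args, **item.kwargs)
        item.row[item.fn_call.slot_idx] = result
        self.dispatcher.dispatch(item.rows)  # hand the materialized slot back for post-processing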
pixeltable/exec/expr_eval/row_buffer.py
ADDED

@@ -0,0 +1,76 @@
+from __future__ import annotations
+
+import logging
+from typing import Optional
+
+import numpy as np
+
+from pixeltable import exprs
+
+_logger = logging.getLogger('pixeltable')
+
+
+class RowBuffer:
+    """Fixed-length circular buffer of DataRows; knows how to maintain input order"""
+
+    size: int
+    row_pos_map: Optional[dict[int, int]]  # id(row) -> position of row in output; None if not maintaining order
+    num_rows: int  # number of rows in the buffer
+    num_ready: int  # number of consecutive non-None rows at head
+    buffer: np.ndarray  # of object
+    head_idx: int  # index of beginning of the buffer
+    head_pos: int  # row position of the beginning of the buffer
+
+    def __init__(self, size: int):
+        self.size = size
+        self.row_pos_map = None
+        self.num_rows = 0
+        self.num_ready = 0
+        self.buffer = np.full(size, None, dtype=object)
+        self.head_pos = 0
+        self.head_idx = 0
+
+    def set_row_pos_map(self, row_pos_map: dict[int, int]) -> None:
+        self.row_pos_map = row_pos_map
+
+    def add_row(self, row: exprs.DataRow) -> None:
+        offset: int  # of new row from head
+        if self.row_pos_map is not None:
+            pos = self.row_pos_map.get(id(row))
+            assert pos is not None and (pos - self.head_pos < self.size), f'{pos} {self.head_pos} {self.size}'
+            offset = pos - self.head_pos
+        else:
+            offset = self.num_rows
+        idx = (self.head_idx + offset) % self.size
+        assert self.buffer[idx] is None
+
+        self.buffer[idx] = row
+        self.num_rows += 1
+        if self.row_pos_map is not None:
+            if offset == self.num_ready:
+                # we have new ready rows; find out how many
+                while offset < self.size and self.buffer[(self.head_idx + offset) % self.size] is not None:
+                    offset += 1
+                self.num_ready = offset
+        else:
+            self.num_ready += 1
+
+    def get_rows(self, n: int) -> list[exprs.DataRow]:
+        """Get up to n ready rows from head"""
+        n = min(n, self.num_ready)
+        if n == 0:
+            return []
+        rows: list[exprs.DataRow]
+        if self.head_idx + n <= self.size:
+            rows = self.buffer[self.head_idx:self.head_idx + n].tolist()
+            self.buffer[self.head_idx:self.head_idx + n] = None
+        else:
+            rows = np.concatenate([self.buffer[self.head_idx:], self.buffer[:self.head_idx + n - self.size]]).tolist()
+            self.buffer[self.head_idx:] = None
+            self.buffer[:self.head_idx + n - self.size] = None
+        self.head_pos += n
+        self.head_idx = (self.head_idx + n) % self.size
+        self.num_rows -= n
+        self.num_ready -= n
+        return rows
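The ordering logic is the subtle part: with a row_pos_map installed, num_ready only advances while the prefix at the head of the buffer is contiguous, so rows may arrive out of order but always leave in input order. A small usage sketch, under the assumption that RowBuffer is content-agnostic (it only uses id() and object identity), with plain objects standing in for DataRows:

buf = RowBuffer(size=4)
rows = [object() for _ in range(3)]                      # stand-ins for DataRows
buf.set_row_pos_map({id(r): pos for pos, r in enumerate(rows)})

buf.add_row(rows[2])             # arrives first, but belongs at position 2
assert buf.get_rows(4) == []     # position 0 still missing: nothing is ready yet

buf.add_row(rows[0])             # head arrives; ready prefix is [rows[0]]
buf.add_row(rows[1])             # gap closes; positions 0-2 are now contiguous
assert buf.get_rows(4) == [rows[0], rows[1], rows[2]]    # returned in input order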
pixeltable/exec/expr_eval/schedulers.py
ADDED

@@ -0,0 +1,240 @@
+from __future__ import annotations
+
+import asyncio
+import datetime
+import inspect
+import logging
+import sys
+from dataclasses import dataclass
+from typing import Optional, Awaitable, Collection
+
+from pixeltable import env
+from pixeltable import func
+from .globals import Scheduler, FnCallArgs, Dispatcher
+
+_logger = logging.getLogger('pixeltable')
+
+
+class RateLimitsScheduler(Scheduler):
+    """
+    Scheduler for FunctionCalls with a RateLimitsInfo pool, which provides information about actual resource usage.
+
+    Scheduling strategy:
+    - try to stay below resource limits by utilizing reported RateLimitInfo.remaining
+    - also take into account the estimated resource usage for in-flight requests
+      (obtained via RateLimitsInfo.get_request_resources())
+    - issue synchronous requests when we don't have a RateLimitsInfo yet or when we depleted a resource and need to
+      wait for a reset
+
+    TODO:
+    - limit the number of in-flight requests based on the open file limit
+    """
+    @dataclass(frozen=True)
+    class QueueItem:
+        request: FnCallArgs
+        num_retries: int
+
+        def __lt__(self, other: RateLimitsScheduler.QueueItem) -> bool:
+            # prioritize by number of retries
+            return self.num_retries > other.num_retries
+
+    resource_pool: str
+    queue: asyncio.PriorityQueue[QueueItem]  # prioritizes retries
+    loop_task: asyncio.Task
+    dispatcher: Dispatcher
+    get_request_resources_param_names: list[str]  # names of parameters of RateLimitsInfo.get_request_resources()
+
+    # scheduling-related state
+    pool_info: Optional[env.RateLimitsInfo]
+    est_usage: dict[str, int]  # value per resource; accumulated estimates since the last util. report
+
+    num_in_flight: int  # unfinished tasks
+    request_completed: asyncio.Event
+
+    total_requests: int
+    total_retried: int
+
+    TIME_FORMAT = '%H:%M.%S %f'
+    MAX_RETRIES = 10
+
+    def __init__(self, resource_pool: str, dispatcher: Dispatcher):
+        self.resource_pool = resource_pool
+        self.queue = asyncio.PriorityQueue()
+        self.dispatcher = dispatcher
+        self.loop_task = asyncio.create_task(self._main_loop())
+        self.dispatcher.register_task(self.loop_task)
+        self.pool_info = None  # initialized in _main_loop by the first request
+        self.est_usage = {}
+        self.num_in_flight = 0
+        self.request_completed = asyncio.Event()
+        self.total_requests = 0
+        self.total_retried = 0
+        self.get_request_resources_param_names = []
+
+    @classmethod
+    def matches(cls, resource_pool: str) -> bool:
+        return resource_pool.startswith('rate-limits:')
+
+    def submit(self, item: FnCallArgs) -> None:
+        self.queue.put_nowait(self.QueueItem(item, 0))
+
+    def _set_pool_info(self) -> None:
+        """Initialize pool_info with the RateLimitsInfo for the resource pool, if available"""
+        if self.pool_info is not None:
+            return
+        self.pool_info = env.Env.get().get_resource_pool_info(self.resource_pool, None)
+        if self.pool_info is None:
+            return
+        assert isinstance(self.pool_info, env.RateLimitsInfo)
+        assert hasattr(self.pool_info, 'get_request_resources')
+        sig = inspect.signature(self.pool_info.get_request_resources)
+        self.get_request_resources_param_names = [p.name for p in sig.parameters.values()]
+        self.est_usage = {r: 0 for r in self._resources}
+
+    async def _main_loop(self) -> None:
+        item: Optional[RateLimitsScheduler.QueueItem] = None
+        while True:
+            if item is None:
+                item = await self.queue.get()
+                if item.num_retries > 0:
+                    self.total_retried += 1
+
+            now = datetime.datetime.now(tz=datetime.timezone.utc)
+            if self.pool_info is None or not self.pool_info.is_initialized():
+                # wait for a single request to get rate limits
+                _logger.debug(f'initializing rate limits for {self.resource_pool}')
+                await self._exec(item.request, item.num_retries, is_task=False)
+                item = None
+                # if this was the first request, it created the pool_info
+                if self.pool_info is None:
+                    self._set_pool_info()
+                continue
+
+            # check rate limits
+            request_resources = self._get_request_resources(item.request)
+            limits_info = self._check_resource_limits(request_resources)
+            aws: list[Awaitable[None]] = []
+            completed_aw: Optional[asyncio.Task] = None
+            wait_for_reset: Optional[asyncio.Task] = None
+            if limits_info is not None:
+                # limits_info's resource is depleted, wait for capacity to free up
+
+                if self.num_in_flight > 0:
+                    # a completed request can free up capacity
+                    self.request_completed.clear()
+                    completed_aw = asyncio.create_task(self.request_completed.wait())
+                    aws.append(completed_aw)
+                    _logger.debug(f'waiting for completed request for {self.resource_pool}')
+
+                reset_at = limits_info.reset_at
+                if reset_at > now:
+                    # we're waiting for the rate limit to reset
+                    wait_for_reset = asyncio.create_task(asyncio.sleep((reset_at - now).total_seconds()))
+                    aws.append(wait_for_reset)
+                    _logger.debug(f'waiting for rate limit reset for {self.resource_pool}')
+
+            if len(aws) > 0:
+                # we have something to wait for
+                done, pending = await asyncio.wait(aws, return_when=asyncio.FIRST_COMPLETED)
+                for task in pending:
+                    task.cancel()
+                if completed_aw in done:
+                    _logger.debug(f'wait(): completed request for {self.resource_pool}')
+                if wait_for_reset in done:
+                    _logger.debug(f'wait(): rate limit reset for {self.resource_pool}')
+                    # force waiting for another rate limit report before making any scheduling decisions
+                    self.pool_info.reset()
+                # re-evaluate current capacity for current item
+                continue
+
+            # we have a new in-flight request
+            for resource, val in request_resources.items():
+                self.est_usage[resource] += val
+            _logger.debug(f'creating task for {self.resource_pool}')
+            self.num_in_flight += 1
+            task = asyncio.create_task(self._exec(item.request, item.num_retries, is_task=True))
+            self.dispatcher.register_task(task)
+            item = None
+
+    @property
+    def _resources(self) -> Collection[str]:
+        return self.pool_info.resource_limits.keys() if self.pool_info is not None else []
+
+    def _get_request_resources(self, request: FnCallArgs) -> dict[str, int]:
+        kwargs_batch = request.fn_call.get_param_values(self.get_request_resources_param_names, request.rows)
+        if not request.is_batched:
+            return self.pool_info.get_request_resources(**kwargs_batch[0])
+        else:
+            batch_kwargs = {k: [d[k] for d in kwargs_batch] for k in kwargs_batch[0]}
+            constant_kwargs, batch_kwargs = request.pxt_fn.create_batch_kwargs(batch_kwargs)
+            return self.pool_info.get_request_resources(**constant_kwargs, **batch_kwargs)
+
+    def _check_resource_limits(self, request_resources: dict[str, int]) -> Optional[env.RateLimitInfo]:
+        """Returns the most depleted resource, relative to its limit, or None if all resources are within limits"""
+        candidates: list[tuple[env.RateLimitInfo, float]] = []  # (info, relative usage)
+        for resource, usage in request_resources.items():
+            # 0.05: leave some headroom, we don't have perfect information
+            info = self.pool_info.resource_limits[resource]
+            est_remaining = info.remaining - self.est_usage[resource] - usage
+            if est_remaining < 0.05 * info.limit:
+                candidates.append((info, est_remaining / info.limit))
+        if len(candidates) == 0:
+            return None
+        return min(candidates, key=lambda x: x[1])[0]
+
+    async def _exec(self, request: FnCallArgs, num_retries: int, is_task: bool) -> None:
+        assert all(not row.has_val[request.fn_call.slot_idx] for row in request.rows)
+        assert all(not row.has_exc(request.fn_call.slot_idx) for row in request.rows)
+
+        try:
+            start_ts = datetime.datetime.now(tz=datetime.timezone.utc)
+            pxt_fn = request.fn_call.fn
+            assert isinstance(pxt_fn, func.CallableFunction)
+            _logger.debug(f'scheduler {self.resource_pool}: start evaluating slot {request.fn_call.slot_idx}, batch_size={len(request.rows)}')
+            self.total_requests += 1
+            if request.is_batched:
+                batch_result = await pxt_fn.aexec_batch(*request.batch_args, **request.batch_kwargs)
+                assert len(batch_result) == len(request.rows)
+                for row, result in zip(request.rows, batch_result):
+                    row[request.fn_call.slot_idx] = result
+            else:
+                result = await pxt_fn.aexec(*request.args, **request.kwargs)
+                request.row[request.fn_call.slot_idx] = result
+            end_ts = datetime.datetime.now(tz=datetime.timezone.utc)
+            _logger.debug(f'scheduler {self.resource_pool}: evaluated slot {request.fn_call.slot_idx} in {end_ts - start_ts}, batch_size={len(request.rows)}')
+
+            # purge accumulated usage estimate, now that we have a new report
+            self.est_usage = {r: 0 for r in self._resources}
+
+            self.dispatcher.dispatch(request.rows)
+        except Exception as exc:
+            _logger.debug(f'scheduler {self.resource_pool}: exception in slot {request.fn_call.slot_idx}: {exc}')
+            if self.pool_info is None:
+                # our pool info should be available at this point
+                self._set_pool_info()
+            if num_retries < self.MAX_RETRIES and self.pool_info is not None:
+                retry_delay = self.pool_info.get_retry_delay(exc)
+                if retry_delay is not None:
+                    self.total_retried += 1
+                    _logger.debug(f'scheduler {self.resource_pool}: retrying in {retry_delay} seconds')
+                    await asyncio.sleep(retry_delay)
+                    self.queue.put_nowait(self.QueueItem(request, num_retries + 1))
+                    return
+            # TODO: update resource limits reported in exc.response.headers, if present
+
+            # record the exception
+            _, _, exc_tb = sys.exc_info()
+            for row in request.rows:
+                row.set_exc(request.fn_call.slot_idx, exc)
+            self.dispatcher.dispatch_exc(request.rows, request.fn_call.slot_idx, exc_tb)
+        finally:
+            _logger.debug(
+                f'Scheduler stats: #requests={self.total_requests}, #retried={self.total_retried}')
+            if is_task:
+                self.num_in_flight -= 1
+                self.request_completed.set()
+
+
+# all concrete Scheduler subclasses that implement matches()
+SCHEDULERS = [RateLimitsScheduler]
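The scheduling decision in _check_resource_limits comes down to simple arithmetic: a resource counts as depleted when the last reported remaining capacity, minus the estimated cost of in-flight requests and of the candidate request, falls below a 5% headroom margin. A standalone sketch of that check, with made-up numbers:

limit = 10_000        # e.g. tokens per minute, as reported by the provider
remaining = 900       # RateLimitInfo.remaining from the last usage report
est_usage = 500       # accumulated estimates for in-flight requests since that report
request = 250         # estimated cost of the candidate request

est_remaining = remaining - est_usage - request   # 150
depleted = est_remaining < 0.05 * limit           # 150 < 500 -> True: wait for a reset or a completion

When several resources fall below the margin, the candidates are ranked by est_remaining / limit and the most depleted one drives the wait.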
pixeltable/exec/in_memory_data_node.py
CHANGED

@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Iterator, Optional
+from typing import Any, Iterator, Optional, AsyncIterator
 
 import pixeltable.catalog as catalog
 import pixeltable.exprs as exprs
@@ -76,6 +76,6 @@ class InMemoryDataNode(ExecNode):
 
         self.ctx.num_rows = len(self.output_rows)
 
-    def __iter__(self) -> Iterator[DataRowBatch]:
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
         _logger.debug(f'InMemoryDataNode: created row batch with {len(self.output_rows)} output_rows')
         yield self.output_rows
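This one-line change is part of a release-wide conversion of ExecNode iteration from sync to async: nodes now implement __aiter__ as an async generator and pull from their input with async for, which is what lets the new expr_eval schedulers overlap I/O-bound UDF calls with row production. A self-contained sketch of the pattern with toy classes (not the pixeltable API):

import asyncio
from typing import AsyncIterator

class Source:
    """Toy stand-in for a producer ExecNode."""
    async def __aiter__(self) -> AsyncIterator[list[int]]:
        for batch in ([1, 2], [3, 4]):
            yield batch

class Transform:
    """Toy stand-in for a pass-through node like RowUpdateNode below."""
    def __init__(self, input_node: Source):
        self.input = input_node

    async def __aiter__(self) -> AsyncIterator[list[int]]:
        async for batch in self.input:        # pulls batches from the upstream node
            yield [v * 10 for v in batch]     # rewrites rows, re-yields the batch

async def main() -> None:
    async for batch in Transform(Source()):
        print(batch)                          # [10, 20], then [30, 40]

asyncio.run(main())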
pixeltable/exec/row_update_node.py
CHANGED

@@ -1,5 +1,5 @@
 import logging
-from typing import Any
+from typing import Any, AsyncIterator
 
 import pixeltable.catalog as catalog
 import pixeltable.exprs as exprs
@@ -34,19 +34,19 @@ class RowUpdateNode(ExecNode):
         self.key_slot_idxs = {col: all_col_slot_idxs[col] for col in tbl.tbl_version.primary_key_columns()}
         self.matched_key_vals: set[tuple] = set()
 
-    def …
-    batch …
-    …
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        async for batch in self.input:
+            for row in batch:
+                key_vals = row.rowid if self.is_rowid_key else \
+                    tuple(row[slot_idx] for slot_idx in self.key_slot_idxs.values())
+                if key_vals not in self.updates:
+                    continue
+                self.matched_key_vals.add(key_vals)
+                col_vals = self.updates[key_vals]
+                for col, val in col_vals.items():
+                    slot_idx = self.col_slot_idxs[col]
+                    row[slot_idx] = val
+            yield batch
 
     def unmatched_rows(self) -> list[dict[str, Any]]:
         """Return rows that didn't get used in the updates as a list of dicts compatible with TableVersion.insert()."""
pixeltable/exec/sql_node.py
CHANGED

@@ -1,7 +1,7 @@
 import logging
 import warnings
 from decimal import Decimal
-from typing import Iterable, Iterator, NamedTuple, Optional, TYPE_CHECKING, Sequence
+from typing import Iterable, Iterator, NamedTuple, Optional, TYPE_CHECKING, Sequence, AsyncIterator
 from uuid import UUID
 
 import sqlalchemy as sql
@@ -264,7 +264,7 @@ class SqlNode(ExecNode):
         except Exception as e:
             _logger.warning(f'EXPLAIN failed with error: {e}')
 
-    def __iter__(self) -> Iterator[DataRowBatch]:
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
         # run the query; do this here rather than in _open(), exceptions are only expected during iteration
         assert self.ctx.conn is not None
         with warnings.catch_warnings(record=True) as w:
pixeltable/exprs/column_ref.py
CHANGED

@@ -101,7 +101,8 @@ class ColumnRef(Expr):
         # resolve column properties
         if name == ColumnPropertyRef.Property.ERRORTYPE.name.lower() \
                 or name == ColumnPropertyRef.Property.ERRORMSG.name.lower():
-            …
+            property_is_present = self.col.is_stored and (self.col.is_computed or self.col_type.is_media_type())
+            if not property_is_present:
                 raise excs.Error(f'{name} only valid for a stored computed or media column: {self}')
             return ColumnPropertyRef(self, ColumnPropertyRef.Property[name.upper()])
         if name == ColumnPropertyRef.Property.FILEURL.name.lower() \
@@ -239,3 +240,6 @@
         col = cls.get_column(d)
         perform_validation = d['perform_validation']
         return cls(col, perform_validation=perform_validation)
+
+    def is_constant(self) -> bool:
+        return False
pixeltable/exprs/data_row.py
CHANGED

@@ -6,10 +6,10 @@ import urllib.parse
 import urllib.request
 from typing import Any, Optional
 
-import numpy as np
-import pgvector.sqlalchemy  # type: ignore[import-untyped]
 import PIL
 import PIL.Image
+import numpy as np
+import pgvector.sqlalchemy  # type: ignore[import-untyped]
 import sqlalchemy as sql
 
 from pixeltable import env
@@ -34,9 +34,14 @@
     - VideoType: local path if available, otherwise url
     """
 
-    vals: …
-    has_val: …
-    excs: …
+    vals: np.ndarray  # of object
+    has_val: np.ndarray  # of bool
+    excs: np.ndarray  # of object
+
+    # expr evaluation state; indexed by slot idx
+    missing_slots: np.ndarray  # of bool; number of missing dependencies
+    missing_dependents: np.ndarray  # of int16; number of missing dependents
+    is_scheduled: np.ndarray  # of bool; True if this slot is scheduled for evaluation
 
     # control structures that are shared across all DataRows in a batch
     img_slot_idxs: list[int]
@@ -50,32 +55,47 @@
     # - stored url of file for media in vals[i]
    # - None if vals[i] is not media type
     # - not None if file_paths[i] is not None
-    file_urls: …
+    file_urls: np.ndarray  # of str
 
     # file_paths:
     # - local path of media file in vals[i]; points to the file cache if file_urls[i] is remote
     # - None if vals[i] is not a media type or if there is no local file yet for file_urls[i]
-    file_paths: …
+    file_paths: np.ndarray  # of str
 
     def __init__(self, size: int, img_slot_idxs: list[int], media_slot_idxs: list[int], array_slot_idxs: list[int]):
-        self.vals = [None] * size
-        self.has_val = [False] * size
-        self.excs = [None] * size
         self.img_slot_idxs = img_slot_idxs
         self.media_slot_idxs = media_slot_idxs
         self.array_slot_idxs = array_slot_idxs
+        self.init(size)
+
+    def init(self, num_slots: int) -> None:
+        self.vals = np.full(num_slots, None, dtype=object)
+        self.has_val = np.zeros(num_slots, dtype=bool)
+        self.excs = np.full(num_slots, None, dtype=object)
+        self.missing_slots = np.zeros(num_slots, dtype=bool)
+        self.missing_dependents = np.zeros(num_slots, dtype=np.int16)
+        self.is_scheduled = np.zeros(num_slots, dtype=bool)
         self.pk = None
-        self.file_urls = …
-        self.file_paths = …
-
-    def clear(self) -> None:
-        …
+        self.file_urls = np.full(num_slots, None, dtype=object)
+        self.file_paths = np.full(num_slots, None, dtype=object)
+
+    def clear(self, idxs: Optional[np.ndarray] = None) -> None:
+        if idxs is not None:
+            self.has_val[idxs] = False
+            self.vals[idxs] = None
+            self.excs[idxs] = None
+            self.file_urls[idxs] = None
+            self.file_paths[idxs] = None
+        else:
+            self.init(len(self.vals))
+
+    def set_file_path(self, idx: int, path: str) -> None:
+        """Augment an existing url with a local file path"""
+        assert self.has_val[idx]
+        assert idx in self.img_slot_idxs or idx in self.media_slot_idxs
+        self.file_paths[idx] = path
+        if idx in self.media_slot_idxs:
+            self.vals[idx] = path
 
     def copy(self, target: DataRow) -> None:
         """Create a copy of the contents of this DataRow in target
@@ -98,16 +118,18 @@
         """
         if slot_idx is not None:
             return self.excs[slot_idx] is not None
-        return …
+        return (self.excs != None).any()
 
     def get_exc(self, slot_idx: int) -> Optional[Exception]:
-        …
+        exc = self.excs[slot_idx]
+        assert exc is None or isinstance(exc, Exception)
+        return exc
 
     def get_first_exc(self) -> Optional[Exception]:
-        …
-        return …
+        mask = self.excs != None
+        if not mask.any():
+            return None
+        return self.excs[mask][0]
 
     def set_exc(self, slot_idx: int, exc: Exception) -> None:
         assert self.excs[slot_idx] is None
@@ -119,9 +141,6 @@
         self.file_paths[slot_idx] = None
         self.file_urls[slot_idx] = None
 
-    def __len__(self) -> int:
-        return len(self.vals)
-
     def __getitem__(self, index: object) -> Any:
         """Returns in-memory value, ie, what is needed for expr evaluation"""
         assert isinstance(index, int)
@@ -171,11 +190,10 @@
 
         return self.vals[index]
 
-    def __setitem__(self, idx: …
+    def __setitem__(self, idx: int, val: Any) -> None:
         """Assign in-memory cell value
         This allows overwriting
         """
-        assert isinstance(idx, int)
         assert self.excs[idx] is None
 
         if (idx in self.img_slot_idxs or idx in self.media_slot_idxs) and isinstance(val, str):
@@ -207,14 +225,6 @@
             self.vals[idx] = val
             self.has_val[idx] = True
 
-    def set_file_path(self, idx: int, path: str) -> None:
-        """Augment an existing url with a local file path"""
-        assert self.has_val[idx]
-        assert idx in self.img_slot_idxs or idx in self.media_slot_idxs
-        self.file_paths[idx] = path
-        if idx in self.media_slot_idxs:
-            self.vals[idx] = path
-
     def flush_img(self, index: int, filepath: Optional[str] = None) -> None:
         """Discard the in-memory value and save it to a local file, if filepath is not None"""
         if self.vals[index] is None: