pixeltable 0.2.21__py3-none-any.whl → 0.2.23__py3-none-any.whl
This diff compares the contents of two publicly available versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- pixeltable/__init__.py +2 -2
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/column.py +41 -29
- pixeltable/catalog/globals.py +18 -0
- pixeltable/catalog/insertable_table.py +30 -10
- pixeltable/catalog/table.py +198 -86
- pixeltable/catalog/table_version.py +47 -53
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +17 -18
- pixeltable/dataframe.py +27 -36
- pixeltable/env.py +7 -0
- pixeltable/exec/__init__.py +0 -1
- pixeltable/exec/aggregation_node.py +6 -3
- pixeltable/exec/cache_prefetch_node.py +189 -43
- pixeltable/exec/data_row_batch.py +5 -22
- pixeltable/exec/exec_context.py +2 -2
- pixeltable/exec/exec_node.py +3 -2
- pixeltable/exec/expr_eval_node.py +23 -16
- pixeltable/exec/in_memory_data_node.py +6 -3
- pixeltable/exec/sql_node.py +24 -25
- pixeltable/exprs/arithmetic_expr.py +12 -5
- pixeltable/exprs/array_slice.py +7 -7
- pixeltable/exprs/column_property_ref.py +37 -10
- pixeltable/exprs/column_ref.py +97 -14
- pixeltable/exprs/comparison.py +10 -5
- pixeltable/exprs/compound_predicate.py +8 -7
- pixeltable/exprs/data_row.py +27 -18
- pixeltable/exprs/expr.py +53 -52
- pixeltable/exprs/expr_set.py +5 -0
- pixeltable/exprs/function_call.py +32 -16
- pixeltable/exprs/globals.py +4 -1
- pixeltable/exprs/in_predicate.py +8 -7
- pixeltable/exprs/inline_expr.py +4 -4
- pixeltable/exprs/is_null.py +4 -4
- pixeltable/exprs/json_mapper.py +11 -12
- pixeltable/exprs/json_path.py +6 -11
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +5 -4
- pixeltable/exprs/object_ref.py +2 -1
- pixeltable/exprs/row_builder.py +88 -36
- pixeltable/exprs/rowid_ref.py +12 -11
- pixeltable/exprs/similarity_expr.py +12 -7
- pixeltable/exprs/sql_element_cache.py +7 -5
- pixeltable/exprs/type_cast.py +8 -6
- pixeltable/exprs/variable.py +5 -4
- pixeltable/func/aggregate_function.py +9 -9
- pixeltable/func/expr_template_function.py +6 -5
- pixeltable/func/function.py +11 -10
- pixeltable/func/udf.py +6 -11
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/globals.py +5 -7
- pixeltable/functions/huggingface.py +155 -45
- pixeltable/functions/llama_cpp.py +107 -0
- pixeltable/functions/mistralai.py +1 -1
- pixeltable/functions/ollama.py +147 -0
- pixeltable/functions/openai.py +1 -1
- pixeltable/functions/replicate.py +72 -0
- pixeltable/functions/string.py +9 -0
- pixeltable/functions/together.py +1 -1
- pixeltable/functions/util.py +5 -2
- pixeltable/globals.py +67 -26
- pixeltable/index/btree.py +16 -3
- pixeltable/index/embedding_index.py +4 -4
- pixeltable/io/__init__.py +1 -2
- pixeltable/io/fiftyone.py +178 -0
- pixeltable/io/globals.py +96 -2
- pixeltable/iterators/base.py +3 -2
- pixeltable/iterators/document.py +1 -1
- pixeltable/iterators/video.py +120 -63
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_21.py +34 -0
- pixeltable/metadata/converters/util.py +45 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +8 -0
- pixeltable/plan.py +17 -15
- pixeltable/py.typed +0 -0
- pixeltable/store.py +7 -2
- pixeltable/tool/create_test_db_dump.py +1 -1
- pixeltable/tool/create_test_video.py +1 -1
- pixeltable/tool/embed_udf.py +1 -1
- pixeltable/tool/mypy_plugin.py +28 -5
- pixeltable/type_system.py +100 -36
- pixeltable/utils/coco.py +5 -5
- pixeltable/utils/documents.py +15 -1
- pixeltable/utils/formatter.py +12 -13
- pixeltable/utils/s3.py +6 -3
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/METADATA +158 -49
- pixeltable-0.2.23.dist-info/RECORD +153 -0
- pixeltable/exec/media_validation_node.py +0 -43
- pixeltable-0.2.21.dist-info/RECORD +0 -148
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/entry_points.txt +0 -0
pixeltable/exec/cache_prefetch_node.py
CHANGED

@@ -1,87 +1,226 @@
 from __future__ import annotations
 
-import
+import dataclasses
+import itertools
 import logging
 import threading
 import urllib.parse
 import urllib.request
-from collections import
+from collections import deque
+from concurrent import futures
 from pathlib import Path
-from typing import
+from typing import Optional, Any, Iterator
 from uuid import UUID
 
 import pixeltable.env as env
 import pixeltable.exceptions as excs
 import pixeltable.exprs as exprs
+from pixeltable import catalog
 from pixeltable.utils.filecache import FileCache
+
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
 
 _logger = logging.getLogger('pixeltable')
 
+
 class CachePrefetchNode(ExecNode):
     """Brings files with external URLs into the cache
 
     TODO:
-    - maintain a queue of row batches, in order to overlap download and evaluation
     - adapting the number of download threads at runtime to maximize throughput
     """
-
-
+    BATCH_SIZE = 16
+    NUM_EXECUTOR_THREADS = 16
+
+    retain_input_order: bool  # if True, return rows in the exact order they were received
+    file_col_info: list[exprs.ColumnSlotIdx]
+    boto_client: Optional[Any]
+    boto_client_lock: threading.Lock
+
+    # execution state
+    batch_tbl_version: Optional[catalog.TableVersion]  # needed to construct output batches
+    num_returned_rows: int
+
+    # ready_rows: rows that are ready to be returned, ordered by row idx;
+    # the implied row idx of ready_rows[0] is num_returned_rows
+    ready_rows: deque[Optional[exprs.DataRow]]
+
+    in_flight_rows: dict[int, CachePrefetchNode.RowState]  # rows with in-flight urls; id(row) -> RowState
+    in_flight_requests: dict[futures.Future, str]  # in-flight requests for urls; future -> URL
+    in_flight_urls: dict[str, list[tuple[exprs.DataRow, exprs.ColumnSlotIdx]]]  # URL -> [(row, info)]
+    input_finished: bool
+    row_idx: Iterator[Optional[int]]
+
+    @dataclasses.dataclass
+    class RowState:
+        row: exprs.DataRow
+        idx: Optional[int]  # position in input stream; None if we don't retain input order
+        num_missing: int  # number of missing URLs in this row
+
+    def __init__(
+            self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode,
+            retain_input_order: bool = True):
+        # input_/output_exprs=[]: we don't have anything to evaluate
         super().__init__(input.row_builder, [], [], input)
-        self.
+        self.retain_input_order = retain_input_order
         self.file_col_info = file_col_info
 
         # clients for specific services are constructed as needed, because it's time-consuming
-        self.boto_client
+        self.boto_client = None
         self.boto_client_lock = threading.Lock()
 
-
-
+        self.batch_tbl_version = None
+        self.num_returned_rows = 0
+        self.ready_rows = deque()
+        self.in_flight_rows = {}
+        self.in_flight_requests = {}
+        self.in_flight_urls = {}
+        self.input_finished = False
+        self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+
+    def __iter__(self) -> Iterator[DataRowBatch]:
+        input_iter = iter(self.input)
+        with futures.ThreadPoolExecutor(max_workers=self.NUM_EXECUTOR_THREADS) as executor:
+            # we create enough in-flight requests to fill the first batch
+            while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
+                self.__submit_input_batch(input_iter, executor)
+
+            while True:
+                # try to assemble a full batch of output rows
+                if not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
+                    self.__wait_for_requests()
+
+                # try to create enough in-flight requests to fill the next batch
+                while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
+                    self.__submit_input_batch(input_iter, executor)
+
+                if len(self.ready_rows) > 0:
+                    # create DataRowBatch from the first BATCH_SIZE ready rows
+                    batch = DataRowBatch(self.batch_tbl_version, self.row_builder)
+                    rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
+                    for row in rows:
+                        assert row is not None
+                        batch.add_row(row)
+                    self.num_returned_rows += len(rows)
+                    _logger.debug(f'returning {len(rows)} rows')
+                    yield batch
+
+                if self.input_finished and self.__num_pending_rows() == 0:
+                    return
+
+    def __num_pending_rows(self) -> int:
+        return len(self.in_flight_rows) + len(self.ready_rows)
+
+    def __has_ready_batch(self) -> bool:
+        """True if there are >= BATCH_SIZES entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
+        return (
+            sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
+        )
+
+    def __ready_prefix_len(self) -> int:
+        """Length of the non-None prefix of ready_rows (= what we can return right now)"""
+        return sum(1 for _ in itertools.takewhile(lambda x: x is not None, self.ready_rows))
+
+    def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
+        if row_idx is None:
+            self.ready_rows.append(row)
+        else:
+            # extend ready_rows to accommodate row_idx
+            idx = row_idx - self.num_returned_rows
+            if idx >= len(self.ready_rows):
+                self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
+            self.ready_rows[idx] = row
+
+    def __wait_for_requests(self) -> None:
+        """Wait for in-flight requests to complete until we have a full batch of rows"""
+        file_cache = FileCache.get()
+        _logger.debug(f'waiting for requests; ready_batch_size={self.__ready_prefix_len()}')
+        while not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
+            done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+            for f in done:
+                url = self.in_flight_requests.pop(f)
+                tmp_path, exc = f.result()
+                local_path: Optional[Path] = None
+                if tmp_path is not None:
+                    # register the file with the cache for the first column in which it's missing
+                    assert url in self.in_flight_urls
+                    _, info = self.in_flight_urls[url][0]
+                    local_path = file_cache.add(info.col.tbl.id, info.col.id, url, tmp_path)
+                    _logger.debug(f'cached {url} as {local_path}')
+
+                # add the local path/exception to the slots that reference the url
+                for row, info in self.in_flight_urls.pop(url):
+                    if exc is not None:
+                        self.row_builder.set_exc(row, info.slot_idx, exc)
+                    else:
+                        assert local_path is not None
+                        row.set_file_path(info.slot_idx, str(local_path))
+                    state = self.in_flight_rows[id(row)]
+                    state.num_missing -= 1
+                    if state.num_missing == 0:
+                        del self.in_flight_rows[id(row)]
+                        self.__add_ready_row(row, state.idx)
+                        _logger.debug(f'row {state.idx} is ready (ready_batch_size={self.__ready_prefix_len()})')
+
+    def __submit_input_batch(self, input: Iterator[DataRowBatch], executor: futures.ThreadPoolExecutor) -> None:
+        assert not self.input_finished
+        input_batch = next(input, None)
+        if input_batch is None:
+            self.input_finished = True
+            return
+        if self.batch_tbl_version is None:
+            self.batch_tbl_version = input_batch.tbl
 
-        # collect external URLs that aren't already cached, and set DataRow.file_paths for those that are
         file_cache = FileCache.get()
-
-
+
+        # URLs from this input batch that aren't already in the file cache;
+        # we use a list to make sure we submit urls in the order in which they appear in the output, which minimizes
+        # the time it takes to get the next batch together
+        cache_misses: list[str] = []
+
+        url_pos: dict[str, int] = {}  # url -> row_idx; used for logging
         for row in input_batch:
+            # identify missing local files in input batch, or fill in their paths if they're already cached
+            num_missing = 0
+            row_idx = next(self.row_idx)
+
             for info in self.file_col_info:
                 url = row.file_urls[info.slot_idx]
                 if url is None or row.file_paths[info.slot_idx] is not None:
                     # nothing to do
                     continue
-
-
+                locations = self.in_flight_urls.get(url)
+                if locations is not None:
+                    # we've already seen this
+                    locations.append((row, info))
+                    num_missing += 1
                     continue
+
                 local_path = file_cache.lookup(url)
                 if local_path is None:
-                    cache_misses.append(
-
+                    cache_misses.append(url)
+                    self.in_flight_urls[url] = [(row, info)]
+                    num_missing += 1
+                    if url not in url_pos:
+                        url_pos[url] = row_idx
                 else:
                     row.set_file_path(info.slot_idx, str(local_path))
 
-
-
-
-
-        for row, info in cache_misses:
-            futures[executor.submit(self._fetch_url, row, info.slot_idx)] = (row, info)
-        for future in concurrent.futures.as_completed(futures):
-            # TODO: does this need to deal with recoverable errors (such as retry after throttling)?
-            tmp_path = future.result()
-            if tmp_path is None:
-                continue
-            row, info = futures[future]
-            url = row.file_urls[info.slot_idx]
-            local_path = file_cache.add(self.tbl_id, info.col.id, url, tmp_path)
-            _logger.debug(f'PrefetchNode: cached {url} as {local_path}')
-            for row in missing_url_rows[url]:
-                row.set_file_path(info.slot_idx, str(local_path))
+            if num_missing > 0:
+                self.in_flight_rows[id(row)] = self.RowState(row, row_idx, num_missing)
+            else:
+                self.__add_ready_row(row, row_idx)
 
-
+        _logger.debug(f'submitting {len(cache_misses)} urls')
+        for url in cache_misses:
+            f = executor.submit(self.__fetch_url, url)
+            _logger.debug(f'submitted {url} for idx {url_pos[url]}')
+            self.in_flight_requests[f] = url
 
-    def
+    def __fetch_url(self, url: str) -> tuple[Optional[Path], Optional[Exception]]:
         """Fetches a remote URL into Env.tmp_dir and returns its path"""
-        url =
+        _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
         parsed = urllib.parse.urlparse(url)
         # Use len(parsed.scheme) > 1 here to ensure we're not being passed
         # a Windows filename
@@ -93,24 +232,31 @@ class CachePrefetchNode(ExecNode):
         extension = p.suffix
         tmp_path = env.Env.get().create_tmp_path(extension=extension)
         try:
+            _logger.debug(f'Downloading {url} to {tmp_path}')
            if parsed.scheme == 's3':
                from pixeltable.utils.s3 import get_client
                with self.boto_client_lock:
                    if self.boto_client is None:
-
-
+                        config = {
+                            'max_pool_connections': self.NUM_EXECUTOR_THREADS + 4,  # +4: leave some headroom
+                            'connect_timeout': 5,
+                            'read_timeout': 30,
+                            'retries': {'max_attempts': 3, 'mode': 'adaptive'},
+                        }
+                        self.boto_client = get_client(**config)
+                self.boto_client.download_file(parsed.netloc, parsed.path.lstrip('/'), str(tmp_path))
            elif parsed.scheme == 'http' or parsed.scheme == 'https':
                with urllib.request.urlopen(url) as resp, open(tmp_path, 'wb') as f:
                    data = resp.read()
                    f.write(data)
            else:
                assert False, f'Unsupported URL scheme: {parsed.scheme}'
-
+            _logger.debug(f'Downloaded {url} to {tmp_path}')
+            return tmp_path, None
        except Exception as e:
            # we want to add the file url to the exception message
            exc = excs.Error(f'Failed to download {url}: {e}')
+            _logger.debug(f'Failed to download {url}: {e}', exc_info=e)
            if not self.ctx.ignore_errors:
                raise exc from None  # suppress original exception
-
-
+            return None, exc
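The rewritten `CachePrefetchNode` above overlaps downloads with evaluation while still being able to return rows in input order when `retain_input_order` is set: a completed row is slotted into `ready_rows` at position `row_idx - num_returned_rows`, rows still in flight leave `None` placeholders, and batches are only emitted from the fully populated prefix. A minimal, self-contained sketch of that bookkeeping pattern (illustrative only, not code from the package; `add_ready`, `pop_batch`, and `BATCH_SIZE = 4` are invented for the example):

```python
from collections import deque
from typing import Optional

BATCH_SIZE = 4

ready: deque[Optional[str]] = deque()  # slot i holds the item with index num_returned + i
num_returned = 0

def add_ready(item: str, idx: int) -> None:
    # place a completed item at its input position, padding with None for items still in flight
    pos = idx - num_returned
    if pos >= len(ready):
        ready.extend([None] * (pos - len(ready) + 1))
    ready[pos] = item

def pop_batch() -> list[str]:
    # return only the leading fully-populated prefix, up to BATCH_SIZE items
    global num_returned
    batch: list[str] = []
    while ready and ready[0] is not None and len(batch) < BATCH_SIZE:
        batch.append(ready.popleft())
    num_returned += len(batch)
    return batch

# completions arrive out of order, but the batch comes back in input order
for idx in [2, 0, 1, 3]:
    add_ready(f'row{idx}', idx)
print(pop_batch())  # ['row0', 'row1', 'row2', 'row3']
```

When ordering is not required, the node simply appends completed rows, which is what the `itertools.repeat(None)` row-index iterator in the constructor provides.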
pixeltable/exec/data_row_batch.py
CHANGED

@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import
+from typing import Iterator, Optional
 import logging
 
 import pixeltable.exprs as exprs
@@ -49,12 +49,12 @@ class DataRowBatch:
     def __len__(self) -> int:
         return len(self.rows)
 
-    def __getitem__(self, index:
+    def __getitem__(self, index: int) -> exprs.DataRow:
         return self.rows[index]
 
     def flush_imgs(
-            self, idx_range: Optional[slice] = None, stored_img_info: Optional[
-            flushed_slot_idxs: Optional[
+            self, idx_range: Optional[slice] = None, stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
+            flushed_slot_idxs: Optional[list[int]] = None
     ) -> None:
         """Flushes images in the given range of rows."""
         assert self.tbl is not None
@@ -74,21 +74,4 @@ class DataRowBatch:
                 row.flush_img(slot_idx)
 
     def __iter__(self) -> Iterator[exprs.DataRow]:
-        return
-
-
-class DataRowBatchIterator:
-    """
-    Iterator over a DataRowBatch.
-    """
-    def __init__(self, batch: DataRowBatch):
-        self.row_batch = batch
-        self.index = 0
-
-    def __next__(self) -> exprs.DataRow:
-        if self.index >= len(self.row_batch.rows):
-            raise StopIteration
-        row = self.row_batch.rows[self.index]
-        self.index += 1
-        return row
-
+        return iter(self.rows)
pixeltable/exec/exec_context.py
CHANGED

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional
 
 import sqlalchemy as sql
 
@@ -8,7 +8,7 @@ class ExecContext:
     """Class for execution runtime constants"""
     def __init__(
             self, row_builder: exprs.RowBuilder, *, show_pbar: bool = False, batch_size: int = 0,
-            pk_clause: Optional[
+            pk_clause: Optional[list[sql.ClauseElement]] = None, num_computed_exprs: int = 0,
            ignore_errors: bool = False
     ):
         self.show_pbar = show_pbar
pixeltable/exec/exec_node.py
CHANGED

@@ -1,9 +1,10 @@
 from __future__ import annotations
 
 import abc
-from typing import
+from typing import TYPE_CHECKING, Iterable, Iterator, Optional
 
 import pixeltable.exprs as exprs
+
 from .data_row_batch import DataRowBatch
 from .exec_context import ExecContext
 
@@ -42,7 +43,7 @@ class ExecNode(abc.ABC):
         if self.input is not None:
             self.input.set_ctx(ctx)
 
-    def set_stored_img_cols(self, stored_img_cols:
+    def set_stored_img_cols(self, stored_img_cols: list[exprs.ColumnSlotIdx]) -> None:
         self.stored_img_cols = stored_img_cols
         # propagate batch size to the source
         if self.input is not None:
pixeltable/exec/expr_eval_node.py
CHANGED

@@ -3,11 +3,11 @@ import sys
 import time
 import warnings
 from dataclasses import dataclass
-from typing import Iterable,
+from typing import Iterable, Optional
 
 from tqdm import TqdmWarning, tqdm
 
-
+from pixeltable import exprs
 from pixeltable.func import CallableFunction
 
 from .data_row_batch import DataRowBatch
@@ -22,10 +22,10 @@ class ExprEvalNode(ExecNode):
     @dataclass
     class Cohort:
         """List of exprs that form an evaluation context and contain calls to at most one external function"""
-
+        exprs_: list[exprs.Expr]
         batched_fn: Optional[CallableFunction]
-        segment_ctxs:
-        target_slot_idxs:
+        segment_ctxs: list['exprs.RowBuilder.EvalCtx']
+        target_slot_idxs: list[int]
         batch_size: int = 8
 
     def __init__(
@@ -38,7 +38,7 @@ class ExprEvalNode(ExecNode):
         # we're only materializing exprs that are not already in the input
         self.target_exprs = [e for e in output_exprs if e.slot_idx not in input_slot_idxs]
         self.pbar: Optional[tqdm] = None
-        self.cohorts:
+        self.cohorts: list[ExprEvalNode.Cohort] = []
         self._create_cohorts()
 
     def __next__(self) -> DataRowBatch:
@@ -83,11 +83,13 @@ class ExprEvalNode(ExecNode):
         all_exprs = self.row_builder.get_dependencies(self.target_exprs)
         # break up all_exprs into cohorts such that each cohort contains calls to at most one external function;
         # seed the cohorts with only the ext fn calls
-        cohorts:
+        cohorts: list[list[exprs.Expr]] = []
         current_batched_fn: Optional[CallableFunction] = None
         for e in all_exprs:
             if not self._is_batched_fn_call(e):
                 continue
+            assert isinstance(e, exprs.FunctionCall)
+            assert isinstance(e.fn, CallableFunction)
             if current_batched_fn is None or current_batched_fn != e.fn:
                 # create a new cohort
                 cohorts.append([])
@@ -96,9 +98,9 @@ class ExprEvalNode(ExecNode):
 
         # expand the cohorts to include all exprs that are in the same evaluation context as the external calls;
         # cohorts are evaluated in order, so we can exclude the target slots from preceding cohorts and input slots
-        exclude = set(
-        all_target_slot_idxs = set(
-        target_slot_idxs:
+        exclude = set(e.slot_idx for e in self.input_exprs)
+        all_target_slot_idxs = set(e.slot_idx for e in self.target_exprs)
+        target_slot_idxs: list[list[int]] = []  # the ones materialized by each cohort
         for i in range(len(cohorts)):
             cohorts[i] = self.row_builder.get_dependencies(
                 cohorts[i], exclude=[self.row_builder.unique_exprs[slot_idx] for slot_idx in exclude])
@@ -106,7 +108,7 @@ class ExprEvalNode(ExecNode):
                 [e.slot_idx for e in cohorts[i] if e.slot_idx in all_target_slot_idxs])
             exclude.update(target_slot_idxs[-1])
 
-        all_cohort_slot_idxs = set(
+        all_cohort_slot_idxs = set(e.slot_idx for cohort in cohorts for e in cohort)
         remaining_slot_idxs = set(all_target_slot_idxs) - all_cohort_slot_idxs
         if len(remaining_slot_idxs) > 0:
             cohorts.append(self.row_builder.get_dependencies(
@@ -164,11 +166,12 @@ class ExprEvalNode(ExecNode):
                         rows[row_idx], segment_ctx, self.ctx.profile, ignore_errors=self.ctx.ignore_errors)
             else:
                 fn_call = segment_ctx.exprs[0]
+                assert isinstance(fn_call, exprs.FunctionCall)
                 # make a batched external function call
-                arg_batches = [[] for _ in range(len(fn_call.args))]
-                kwarg_batches = {k: [] for k in fn_call.kwargs.keys()}
+                arg_batches: list[list[exprs.Expr]] = [[] for _ in range(len(fn_call.args))]
+                kwarg_batches: dict[str, list[exprs.Expr]] = {k: [] for k in fn_call.kwargs.keys()}
 
-                valid_batch_idxs:
+                valid_batch_idxs: list[int] = []  # rows with exceptions are not valid
                 for row_idx in range(batch_start_idx, batch_start_idx + num_batch_rows):
                     row = rows[row_idx]
                     if row.has_exc(fn_call.slot_idx):
@@ -176,12 +179,15 @@ class ExprEvalNode(ExecNode):
                         continue
                     valid_batch_idxs.append(row_idx)
                     args, kwargs = fn_call._make_args(row)
-
-
+                    for i in range(len(args)):
+                        arg_batches[i].append(args[i])
+                    for k in kwargs.keys():
+                        kwarg_batches[k].append(kwargs[k])
                 num_valid_batch_rows = len(valid_batch_idxs)
 
                 if ext_batch_size is None:
                     # we need to choose a batch size based on the args
+                    assert isinstance(fn_call.fn, CallableFunction)
                     sample_args = [arg_batches[i][0] for i in range(len(arg_batches))]
                     ext_batch_size = fn_call.fn.get_batch_size(*sample_args)
 
@@ -201,6 +207,7 @@ class ExprEvalNode(ExecNode):
                         for k in kwarg_batches.keys()
                     }
                    start_ts = time.perf_counter()
+                    assert isinstance(fn_call.fn, CallableFunction)
                    result_batch = fn_call.fn.exec_batch(*call_args, **call_kwargs)
                    self.ctx.profile.eval_time[fn_call.slot_idx] += time.perf_counter() - start_ts
                    self.ctx.profile.eval_count[fn_call.slot_idx] += num_ext_batch_rows
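The `ExprEvalNode` hunks above mostly tighten the typing around batched external function calls, but they also show the underlying pattern: per-row arguments are transposed into per-parameter batches, rows that already carry an exception are skipped (tracked via `valid_batch_idxs`), and a single batched call then evaluates everything at once before results are scattered back by position. A small standalone sketch of that transpose-and-scatter pattern (hypothetical data and `exec_batch`; not package code):

```python
from typing import Any

# stand-ins for rows: positional args plus an exception flag set by an upstream failure
rows: list[dict[str, Any]] = [
    {'args': (1, 2), 'exc': None},
    {'args': (3, 4), 'exc': ValueError('bad row')},  # skipped: already failed upstream
    {'args': (5, 6), 'exc': None},
]

def exec_batch(xs: list[int], ys: list[int]) -> list[int]:
    # one call evaluates the whole batch (e.g. a model forward pass)
    return [x + y for x, y in zip(xs, ys)]

arg_batches: list[list[Any]] = [[], []]  # one column per positional parameter
valid_idxs: list[int] = []               # which rows actually went into the batch

for i, row in enumerate(rows):
    if row['exc'] is not None:
        continue
    valid_idxs.append(i)
    for j, arg in enumerate(row['args']):
        arg_batches[j].append(arg)

results = exec_batch(*arg_batches)
for i, res in zip(valid_idxs, results):  # scatter results back to their source rows
    rows[i]['result'] = res

print([row.get('result') for row in rows])  # [3, None, 11]
```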
pixeltable/exec/in_memory_data_node.py
CHANGED

@@ -1,5 +1,5 @@
 import logging
-from typing import Any,
+from typing import Any, Iterator, Optional
 
 import pixeltable.catalog as catalog
 import pixeltable.exprs as exprs
@@ -23,12 +23,15 @@ class InMemoryDataNode(ExecNode):
     start_row_id: int
     output_rows: Optional[DataRowBatch]
 
+    # output_exprs is declared in the superclass, but we redeclare it here with a more specific type
+    output_exprs: list[exprs.ColumnRef]
+
     def __init__(
             self, tbl: catalog.TableVersion, rows: list[dict[str, Any]],
             row_builder: exprs.RowBuilder, start_row_id: int,
     ):
-        # we materialize
-        output_exprs =
+        # we materialize the input slots
+        output_exprs = list(row_builder.input_exprs)
         super().__init__(row_builder, output_exprs, [], None)
         assert tbl.is_insertable()
         self.tbl = tbl
pixeltable/exec/sql_node.py
CHANGED

@@ -1,13 +1,14 @@
 import logging
 import warnings
 from decimal import Decimal
-from typing import
+from typing import Iterable, Iterator, NamedTuple, Optional
 from uuid import UUID
 
 import sqlalchemy as sql
 
 import pixeltable.catalog as catalog
 import pixeltable.exprs as exprs
+
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
 
@@ -100,7 +101,7 @@ class SqlNode(ExecNode):
         # minimize the number of tables that need to be joined to the target table
         self.retarget_rowid_refs(tbl, self.select_list)
 
-        assert self.sql_elements.
+        assert self.sql_elements.contains_all(self.select_list)
         self.set_pk = set_pk
         self.num_pk_cols = 0
         if set_pk:
@@ -120,13 +121,13 @@ class SqlNode(ExecNode):
     def _create_stmt(self) -> sql.Select:
         """Create Select from local state"""
 
-        assert self.sql_elements.
+        assert self.sql_elements.contains_all(self.select_list)
         sql_select_list = [self.sql_elements.get(e) for e in self.select_list]
         if self.set_pk:
             sql_select_list += self.tbl.tbl_version.store_tbl.pk_columns()
         stmt = sql.select(*sql_select_list)
 
-        order_by_clause: list[sql.
+        order_by_clause: list[sql.ColumnElement] = []
         for e, asc in self.order_by_clause:
             if isinstance(e, exprs.SimilarityExpr):
                 order_by_clause.append(e.as_order_by_clause(asc))
@@ -141,7 +142,7 @@ class SqlNode(ExecNode):
         return stmt
 
     def _ordering_tbl_ids(self) -> set[UUID]:
-        return exprs.Expr.
+        return exprs.Expr.all_tbl_ids(e for e, _ in self.order_by_clause)
 
     def to_cte(self) -> Optional[tuple[sql.CTE, exprs.ExprDict[sql.ColumnElement]]]:
         """
@@ -182,9 +183,9 @@ class SqlNode(ExecNode):
         """
         # we need to include at least the root
         if refd_tbl_ids is None:
-            refd_tbl_ids =
+            refd_tbl_ids = set()
         if exact_version_only is None:
-            exact_version_only =
+            exact_version_only = set()
         candidates = tbl.get_tbl_versions()
         assert len(candidates) > 0
         joined_tbls: list[catalog.TableVersion] = [candidates[0]]
@@ -193,6 +194,7 @@ class SqlNode(ExecNode):
                 joined_tbls.append(tbl)
 
         first = True
+        prev_tbl: catalog.TableVersion
         for tbl in joined_tbls[::-1]:
             if first:
                 stmt = stmt.select_from(tbl.store_tbl.sa_tbl)
@@ -239,22 +241,19 @@
     def __iter__(self) -> Iterator[DataRowBatch]:
         # run the query; do this here rather than in _open(), exceptions are only expected during iteration
         assert self.ctx.conn is not None
-
-
-
-
-
-
-
-
-
-
-
-
-
-            pass
-        except Exception as e:
-            raise e
+        with warnings.catch_warnings(record=True) as w:
+            stmt = self._create_stmt()
+            try:
+                # log stmt, if possible
+                stmt_str = str(stmt.compile(compile_kwargs={'literal_binds': True}))
+                _logger.debug(f'SqlLookupNode stmt:\n{stmt_str}')
+            except Exception:
+                pass
+            self._log_explain(stmt)
+
+            result_cursor = self.ctx.conn.execute(stmt)
+            for warning in w:
+                pass
 
         tbl_version = self.tbl.tbl_version if self.tbl is not None else None
         output_batch = DataRowBatch(tbl_version, self.row_builder)
@@ -350,7 +349,7 @@ class SqlScanNode(SqlNode):
     def _create_stmt(self) -> sql.Select:
         stmt = super()._create_stmt()
         where_clause_tbl_ids = self.where_clause.tbl_ids() if self.where_clause is not None else set()
-        refd_tbl_ids = exprs.Expr.
+        refd_tbl_ids = exprs.Expr.all_tbl_ids(self.select_list) | where_clause_tbl_ids | self._ordering_tbl_ids()
         stmt = self.create_from_clause(
             self.tbl, stmt, refd_tbl_ids, exact_version_only={t.id for t in self.exact_version_only})
 
@@ -386,7 +385,7 @@ class SqlLookupNode(SqlNode):
 
     def _create_stmt(self) -> sql.Select:
         stmt = super()._create_stmt()
-        refd_tbl_ids = exprs.Expr.
+        refd_tbl_ids = exprs.Expr.all_tbl_ids(self.select_list) | self._ordering_tbl_ids()
         stmt = self.create_from_clause(self.tbl, stmt, refd_tbl_ids)
         stmt = stmt.where(self.where_clause)
         return stmt
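The new `SqlNode.__iter__` above compiles the statement with literal binds purely so the full SQL can be logged, guards that compilation with a try/except (literal rendering is best-effort and can fail for some parameter types), and captures warnings raised inside the block while the query is prepared and executed. A hedged, standalone illustration of that logging pattern with SQLAlchemy (table, column, and logger names are invented for the example):

```python
import logging
import warnings

import sqlalchemy as sql

logging.basicConfig(level=logging.DEBUG)
_logger = logging.getLogger('example')

metadata = sql.MetaData()
t = sql.Table('media', metadata, sql.Column('id', sql.Integer), sql.Column('url', sql.String))
stmt = sql.select(t.c.id, t.c.url).where(t.c.id > 5)

with warnings.catch_warnings(record=True) as w:
    try:
        # literal_binds inlines parameter values into the SQL string; guard it, since
        # some bind types cannot be rendered literally
        stmt_str = str(stmt.compile(compile_kwargs={'literal_binds': True}))
        _logger.debug('stmt:\n%s', stmt_str)
    except Exception:
        pass
    # ... execute stmt on a connection here; warnings raised inside the block end up in `w`

print(f'captured {len(w)} warnings')
```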