pixeltable 0.2.22__py3-none-any.whl → 0.2.24__py3-none-any.whl

This diff shows the changes between package versions as published to their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (40)
  1. pixeltable/__init__.py +2 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/column.py +8 -22
  4. pixeltable/catalog/insertable_table.py +26 -8
  5. pixeltable/catalog/table.py +179 -83
  6. pixeltable/catalog/table_version.py +13 -39
  7. pixeltable/catalog/table_version_path.py +2 -2
  8. pixeltable/catalog/view.py +2 -2
  9. pixeltable/dataframe.py +20 -28
  10. pixeltable/env.py +2 -0
  11. pixeltable/exec/cache_prefetch_node.py +189 -43
  12. pixeltable/exec/data_row_batch.py +3 -3
  13. pixeltable/exec/exec_context.py +2 -2
  14. pixeltable/exec/exec_node.py +2 -2
  15. pixeltable/exec/expr_eval_node.py +8 -8
  16. pixeltable/exprs/arithmetic_expr.py +9 -4
  17. pixeltable/exprs/column_ref.py +4 -0
  18. pixeltable/exprs/comparison.py +5 -0
  19. pixeltable/exprs/json_path.py +1 -1
  20. pixeltable/func/aggregate_function.py +8 -8
  21. pixeltable/func/expr_template_function.py +6 -5
  22. pixeltable/func/udf.py +6 -11
  23. pixeltable/functions/huggingface.py +145 -25
  24. pixeltable/functions/llama_cpp.py +3 -2
  25. pixeltable/functions/mistralai.py +1 -1
  26. pixeltable/functions/openai.py +1 -1
  27. pixeltable/functions/together.py +1 -1
  28. pixeltable/functions/util.py +5 -2
  29. pixeltable/globals.py +55 -6
  30. pixeltable/plan.py +1 -1
  31. pixeltable/tool/create_test_db_dump.py +1 -1
  32. pixeltable/type_system.py +83 -35
  33. pixeltable/utils/coco.py +5 -5
  34. pixeltable/utils/formatter.py +3 -3
  35. pixeltable/utils/s3.py +6 -3
  36. {pixeltable-0.2.22.dist-info → pixeltable-0.2.24.dist-info}/METADATA +119 -46
  37. {pixeltable-0.2.22.dist-info → pixeltable-0.2.24.dist-info}/RECORD +40 -40
  38. {pixeltable-0.2.22.dist-info → pixeltable-0.2.24.dist-info}/LICENSE +0 -0
  39. {pixeltable-0.2.22.dist-info → pixeltable-0.2.24.dist-info}/WHEEL +0 -0
  40. {pixeltable-0.2.22.dist-info → pixeltable-0.2.24.dist-info}/entry_points.txt +0 -0
pixeltable/catalog/table_version.py CHANGED
@@ -193,8 +193,6 @@ class TableVersion:
             col.id = pos
             col.schema_version_add = 0
             cols_by_name[col.name] = col
-            if col.value_expr is None and col.compute_func is not None:
-                cls._create_value_expr(col, base_path)
             if col.is_computed:
                 col.check_value_expr()
 
@@ -494,37 +492,35 @@ class TableVersion:
         self._update_md(time.time(), conn, preceding_schema_version=preceding_schema_version)
         _logger.info(f'Dropped index {idx_md.name} on table {self.name}')
 
-    def add_column(self, col: Column, print_stats: bool, on_error: Literal['abort', 'ignore']) -> UpdateStatus:
+    def add_columns(self, cols: Iterable[Column], print_stats: bool, on_error: Literal['abort', 'ignore']) -> UpdateStatus:
         """Adds a column to the table.
         """
         assert not self.is_snapshot
-        assert is_valid_identifier(col.name)
-        assert col.stored is not None
-        assert col.name not in self.cols_by_name
-        col.tbl = self
-        col.id = self.next_col_id
-        self.next_col_id += 1
-
-        if col.compute_func is not None:
-            # create value_expr from compute_func
-            self._create_value_expr(col, self.path)
+        assert all(is_valid_identifier(col.name) for col in cols)
+        assert all(col.stored is not None for col in cols)
+        assert all(col.name not in self.cols_by_name for col in cols)
+        for col in cols:
+            col.tbl = self
+            col.id = self.next_col_id
+            self.next_col_id += 1
 
         # we're creating a new schema version
         self.version += 1
         preceding_schema_version = self.schema_version
         self.schema_version = self.version
         with Env.get().engine.begin() as conn:
-            status = self._add_columns([col], conn, print_stats=print_stats, on_error=on_error)
-            _ = self._add_default_index(col, conn)
+            status = self._add_columns(cols, conn, print_stats=print_stats, on_error=on_error)
+            for col in cols:
+                _ = self._add_default_index(col, conn)
             self._update_md(time.time(), conn, preceding_schema_version=preceding_schema_version)
-        _logger.info(f'Added column {col.name} to table {self.name}, new version: {self.version}')
+        _logger.info(f'Added columns {[col.name for col in cols]} to table {self.name}, new version: {self.version}')
 
         msg = (
             f'Added {status.num_rows} column value{"" if status.num_rows == 1 else "s"} '
             f'with {status.num_excs} error{"" if status.num_excs == 1 else "s"}.'
         )
         print(msg)
-        _logger.info(f'Column {col.name}: {msg}')
+        _logger.info(f'Columns {[col.name for col in cols]}: {msg}')
         return status
 
     def _add_columns(
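
The rename from add_column to add_columns turns the one-column API into a bulk one: all columns share a single schema-version bump, one transaction, and one backfill pass. A minimal caller sketch, assuming this TableVersion API; the Column constructor arguments and names below are illustrative, not taken from the diff:

    new_cols = [
        Column('summary', StringType(nullable=True)),   # hypothetical columns
        Column('n_tokens', IntType(nullable=True)),
    ]
    status = tbl_version.add_columns(new_cols, print_stats=False, on_error='abort')
    # status is an UpdateStatus; its num_rows/num_excs feed the message printed above
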
@@ -1140,28 +1136,6 @@ class TableVersion:
         names = [c.name for c in self.cols_by_name.values() if c.is_computed]
         return names
 
-    @classmethod
-    def _create_value_expr(cls, col: Column, path: pxt.catalog.TableVersionPath) -> None:
-        """
-        Create col.value_expr, given col.compute_func.
-        Interprets compute_func's parameters to be references to columns and construct ColumnRefs as args.
-        Does not update Column.dependent_cols.
-        """
-        assert col.value_expr is None
-        assert col.compute_func is not None
-        from pixeltable import exprs
-        params = inspect.signature(col.compute_func).parameters
-        args: list[exprs.ColumnRef] = []
-        for param_name in params:
-            param = path.get_column(param_name)
-            if param is None:
-                raise excs.Error(
-                    f'Column {col.name}: Callable parameter refers to an unknown column: {param_name}')
-            args.append(exprs.ColumnRef(param))
-        fn = func.make_function(
-            col.compute_func, return_type=col.col_type, param_types=[arg.col_type for arg in args])
-        col.set_value_expr(fn(*args))
-
     def _record_refd_columns(self, col: Column) -> None:
         """Update Column.dependent_cols for all cols referenced in col.value_expr.
         """
pixeltable/catalog/table_version_path.py CHANGED
@@ -81,13 +81,13 @@ class TableVersionPath:
             return None
         return self.base.find_tbl_version(id)
 
-    def __getattr__(self, col_name: str) -> exprs.ColumnRef:
+    def get_column_ref(self, col_name: str) -> exprs.ColumnRef:
         """Return a ColumnRef for the given column name."""
         from pixeltable.exprs import ColumnRef
         if col_name not in self.tbl_version.cols_by_name:
             if self.base is None:
                 raise AttributeError(f'Column {col_name} unknown')
-            return getattr(self.base, col_name)
+            return self.base.get_column_ref(col_name)
         col = self.tbl_version.cols_by_name[col_name]
         return ColumnRef(col)
 
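
Replacing __getattr__ with an explicit get_column_ref takes column lookup off the attribute-resolution path, so a misspelled attribute now fails fast instead of silently being treated as a column name. An illustrative before/after, assuming a TableVersionPath instance named path:

    ref = path.get_column_ref('img')   # returns exprs.ColumnRef, per the hunk above
    # previously: path.img -- __getattr__ resolved any unknown attribute as a column
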
pixeltable/catalog/view.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import inspect
 import logging
-from typing import TYPE_CHECKING, Any, Iterable, Optional
+from typing import TYPE_CHECKING, Any, Iterable, Literal, Optional
 from uuid import UUID
 
 import sqlalchemy.orm as orm
@@ -216,7 +216,7 @@ class View(Table):
 
     def insert(
         self, rows: Optional[Iterable[dict[str, Any]]] = None, /, *, print_stats: bool = False,
-        fail_on_exception: bool = True, **kwargs: Any
+        on_error: Literal['abort', 'ignore'] = 'abort', **kwargs: Any
    ) -> UpdateStatus:
         raise excs.Error(f'{self._display_name()} {self._name!r}: cannot insert into view')
 
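
View.insert still raises unconditionally; the signature change only keeps it aligned with the writable-table API, where the boolean fail_on_exception becomes on_error: Literal['abort', 'ignore'] (matching add_columns above). A hedged sketch of the corresponding call on a regular table, with illustrative table and row data:

    t = pxt.get_table('films')   # hypothetical table
    # 'ignore' records per-row errors instead of aborting the whole insert
    t.insert([{'title': 'Heat'}, {'title': 'Alien'}], on_error='ignore')
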
pixeltable/dataframe.py CHANGED
@@ -8,7 +8,7 @@ import logging
 import mimetypes
 import traceback
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Dict, Hashable, Iterator, List, Optional, Sequence, Set, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Hashable, Iterator, Optional, Sequence, Union
 
 import pandas as pd
 import pandas.io.formats.style
@@ -34,14 +34,6 @@ __all__ = ['DataFrame']
 _logger = logging.getLogger('pixeltable')
 
 
-def _create_source_tag(file_path: str) -> str:
-    src_url = get_file_uri(Env.get().http_address, file_path)
-    mime = mimetypes.guess_type(src_url)[0]
-    # if mime is None, the attribute string would not be valid html.
-    mime_attr = f'type="{mime}"' if mime is not None else ''
-    return f'<source src="{src_url}" {mime_attr} />'
-
-
 class DataFrameResultSet:
     def __init__(self, rows: list[list[Any]], schema: dict[str, ColumnType]):
         self._rows = rows
@@ -77,7 +69,7 @@ class DataFrameResultSet:
     def to_pandas(self) -> pd.DataFrame:
         return pd.DataFrame.from_records(self._rows, columns=self._col_names)
 
-    def _row_to_dict(self, row_idx: int) -> Dict[str, Any]:
+    def _row_to_dict(self, row_idx: int) -> dict[str, Any]:
         return {self._col_names[i]: self._rows[row_idx][i] for i in range(len(self._col_names))}
 
     def __getitem__(self, index: Any) -> Any:
@@ -111,22 +103,22 @@
 #     def __init__(self, tbl: catalog.TableVersion):
 #         self.tbl = tbl
 #         # output of the SQL scan stage
-#         self.sql_scan_output_exprs: List[exprs.Expr] = []
+#         self.sql_scan_output_exprs: list[exprs.Expr] = []
 #         # output of the agg stage
-#         self.agg_output_exprs: List[exprs.Expr] = []
+#         self.agg_output_exprs: list[exprs.Expr] = []
 #         # Where clause of the Select stmt of the SQL scan stage
 #         self.sql_where_clause: Optional[sql.ClauseElement] = None
 #         # filter predicate applied to input rows of the SQL scan stage
 #         self.filter: Optional[exprs.Predicate] = None
 #         self.similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None
-#         self.agg_fn_calls: List[exprs.FunctionCall] = []  # derived from unique_exprs
+#         self.agg_fn_calls: list[exprs.FunctionCall] = []  # derived from unique_exprs
 #         self.has_frame_col: bool = False  # True if we're referencing the frame col
 #
 #         self.evaluator: Optional[exprs.Evaluator] = None
-#         self.sql_scan_eval_ctx: List[exprs.Expr] = []  # needed to materialize output of SQL scan stage
-#         self.agg_eval_ctx: List[exprs.Expr] = []  # needed to materialize output of agg stage
-#         self.filter_eval_ctx: List[exprs.Expr] = []
-#         self.group_by_eval_ctx: List[exprs.Expr] = []
+#         self.sql_scan_eval_ctx: list[exprs.Expr] = []  # needed to materialize output of SQL scan stage
+#         self.agg_eval_ctx: list[exprs.Expr] = []  # needed to materialize output of agg stage
+#         self.filter_eval_ctx: list[exprs.Expr] = []
+#         self.group_by_eval_ctx: list[exprs.Expr] = []
 #
 #     def finalize_exec(self) -> None:
 #         """
@@ -142,11 +134,11 @@ class DataFrame:
     def __init__(
         self,
         tbl: catalog.TableVersionPath,
-        select_list: Optional[List[Tuple[exprs.Expr, Optional[str]]]] = None,
+        select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]] = None,
         where_clause: Optional[exprs.Expr] = None,
-        group_by_clause: Optional[List[exprs.Expr]] = None,
+        group_by_clause: Optional[list[exprs.Expr]] = None,
         grouping_tbl: Optional[catalog.TableVersion] = None,
-        order_by_clause: Optional[List[Tuple[exprs.Expr, bool]]] = None,  # List[(expr, asc)]
+        order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,  # list[(expr, asc)]
         limit: Optional[int] = None,
     ):
         self.tbl = tbl
@@ -174,7 +166,7 @@
     @classmethod
     def _select_list_check_rep(
         cls,
-        select_list: Optional[List[Tuple[exprs.Expr, Optional[str]]]],
+        select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]],
     ) -> None:
         """Validate basic select list types."""
         if select_list is None:  # basic check for valid select list
@@ -411,8 +403,8 @@
 
     def _description(self) -> pd.DataFrame:
         """see DataFrame.describe()"""
-        heading_vals: List[str] = []
-        info_vals: List[str] = []
+        heading_vals: list[str] = []
+        info_vals: list[str] = []
         if self.select_list is not None:
             assert len(self.select_list) > 0
             heading_vals.append('Select')
@@ -497,7 +489,7 @@
 
         # check user provided names do not conflict among themselves
         # or with auto-generated ones
-        seen: Set[str] = set()
+        seen: set[str] = set()
         _, names = DataFrame._normalize_select_list(self.tbl, select_list)
         for name in names:
             if name in seen:
@@ -540,7 +532,7 @@
         if self.group_by_clause is not None:
             raise excs.Error(f'Group-by already specified')
         grouping_tbl: Optional[catalog.TableVersion] = None
-        group_by_clause: Optional[List[exprs.Expr]] = None
+        group_by_clause: Optional[list[exprs.Expr]] = None
         for item in grouping_items:
             if isinstance(item, catalog.Table):
                 if len(grouping_items) > 1:
@@ -618,7 +610,7 @@
    def __getitem__(self, index: Union[exprs.Expr, Sequence[exprs.Expr]]) -> DataFrame:
         """
         Allowed:
-        - [List[Expr]]/[Tuple[Expr]]: setting the select list
+        - [list[Expr]]/[tuple[Expr]]: setting the select list
         - [Expr]: setting a single-col select list
         """
         if isinstance(index, exprs.Expr):
@@ -627,7 +619,7 @@
             return self.select(*index)
         raise TypeError(f'Invalid index type: {type(index)}')
 
-    def as_dict(self) -> Dict[str, Any]:
+    def as_dict(self) -> dict[str, Any]:
         """
         Returns:
             Dictionary representing this dataframe.
@@ -649,7 +641,7 @@
         return d
 
     @classmethod
-    def from_dict(cls, d: Dict[str, Any]) -> 'DataFrame':
+    def from_dict(cls, d: dict[str, Any]) -> 'DataFrame':
         tbl = catalog.TableVersionPath.from_dict(d['tbl'])
         select_list = [(exprs.Expr.from_dict(e), name) for e, name in d['select_list']] \
             if d['select_list'] is not None else None
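
Most of the dataframe.py churn is mechanical: typing.List/Dict/Set/Tuple annotations become the builtin generics available since Python 3.9 (PEP 585), shrinking the typing imports accordingly. The two spellings are equivalent at runtime:

    from typing import List, Optional, Tuple
    old_style: Optional[List[Tuple[str, bool]]] = None
    new_style: Optional[list[tuple[str, bool]]] = None   # no List/Tuple imports needed
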
pixeltable/env.py CHANGED
@@ -506,11 +506,13 @@
         self.__register_package('openpyxl')
         self.__register_package('pyarrow')
         self.__register_package('replicate')
+        self.__register_package('sentencepiece')
         self.__register_package('sentence_transformers', library_name='sentence-transformers')
         self.__register_package('spacy')
         self.__register_package('tiktoken')
         self.__register_package('together')
         self.__register_package('torch')
+        self.__register_package('torchaudio')
         self.__register_package('torchvision')
         self.__register_package('transformers')
         self.__register_package('whisper', library_name='openai-whisper')
pixeltable/exec/cache_prefetch_node.py CHANGED
@@ -1,87 +1,226 @@
 from __future__ import annotations
 
-import concurrent.futures
+import dataclasses
+import itertools
 import logging
 import threading
 import urllib.parse
 import urllib.request
-from collections import defaultdict
+from collections import deque
+from concurrent import futures
 from pathlib import Path
-from typing import List, Optional, Any, Tuple, Dict
+from typing import Optional, Any, Iterator
 from uuid import UUID
 
 import pixeltable.env as env
 import pixeltable.exceptions as excs
 import pixeltable.exprs as exprs
+from pixeltable import catalog
 from pixeltable.utils.filecache import FileCache
+
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
 
 _logger = logging.getLogger('pixeltable')
 
+
 class CachePrefetchNode(ExecNode):
     """Brings files with external URLs into the cache
 
     TODO:
-    - maintain a queue of row batches, in order to overlap download and evaluation
     - adapting the number of download threads at runtime to maximize throughput
     """
-    def __init__(self, tbl_id: UUID, file_col_info: List[exprs.ColumnSlotIdx], input: ExecNode):
-        # []: we don't have anything to evaluate
+    BATCH_SIZE = 16
+    NUM_EXECUTOR_THREADS = 16
+
+    retain_input_order: bool  # if True, return rows in the exact order they were received
+    file_col_info: list[exprs.ColumnSlotIdx]
+    boto_client: Optional[Any]
+    boto_client_lock: threading.Lock
+
+    # execution state
+    batch_tbl_version: Optional[catalog.TableVersion]  # needed to construct output batches
+    num_returned_rows: int
+
+    # ready_rows: rows that are ready to be returned, ordered by row idx;
+    # the implied row idx of ready_rows[0] is num_returned_rows
+    ready_rows: deque[Optional[exprs.DataRow]]
+
+    in_flight_rows: dict[int, CachePrefetchNode.RowState]  # rows with in-flight urls; id(row) -> RowState
+    in_flight_requests: dict[futures.Future, str]  # in-flight requests for urls; future -> URL
+    in_flight_urls: dict[str, list[tuple[exprs.DataRow, exprs.ColumnSlotIdx]]]  # URL -> [(row, info)]
+    input_finished: bool
+    row_idx: Iterator[Optional[int]]
+
+    @dataclasses.dataclass
+    class RowState:
+        row: exprs.DataRow
+        idx: Optional[int]  # position in input stream; None if we don't retain input order
+        num_missing: int  # number of missing URLs in this row
+
+    def __init__(
+            self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode,
+            retain_input_order: bool = True):
+        # input_/output_exprs=[]: we don't have anything to evaluate
         super().__init__(input.row_builder, [], [], input)
-        self.tbl_id = tbl_id
+        self.retain_input_order = retain_input_order
         self.file_col_info = file_col_info
 
         # clients for specific services are constructed as needed, because it's time-consuming
-        self.boto_client: Optional[Any] = None
+        self.boto_client = None
         self.boto_client_lock = threading.Lock()
 
-    def __next__(self) -> DataRowBatch:
-        input_batch = next(self.input)
+        self.batch_tbl_version = None
+        self.num_returned_rows = 0
+        self.ready_rows = deque()
+        self.in_flight_rows = {}
+        self.in_flight_requests = {}
+        self.in_flight_urls = {}
+        self.input_finished = False
+        self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+
+    def __iter__(self) -> Iterator[DataRowBatch]:
+        input_iter = iter(self.input)
+        with futures.ThreadPoolExecutor(max_workers=self.NUM_EXECUTOR_THREADS) as executor:
+            # we create enough in-flight requests to fill the first batch
+            while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
+                self.__submit_input_batch(input_iter, executor)
+
+            while True:
+                # try to assemble a full batch of output rows
+                if not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
+                    self.__wait_for_requests()
+
+                # try to create enough in-flight requests to fill the next batch
+                while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
+                    self.__submit_input_batch(input_iter, executor)
+
+                if len(self.ready_rows) > 0:
+                    # create DataRowBatch from the first BATCH_SIZE ready rows
+                    batch = DataRowBatch(self.batch_tbl_version, self.row_builder)
+                    rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
+                    for row in rows:
+                        assert row is not None
+                        batch.add_row(row)
+                    self.num_returned_rows += len(rows)
+                    _logger.debug(f'returning {len(rows)} rows')
+                    yield batch
+
+                if self.input_finished and self.__num_pending_rows() == 0:
+                    return
+
+    def __num_pending_rows(self) -> int:
+        return len(self.in_flight_rows) + len(self.ready_rows)
+
+    def __has_ready_batch(self) -> bool:
+        """True if there are >= BATCH_SIZES entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
+        return (
+            sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
+        )
+
+    def __ready_prefix_len(self) -> int:
+        """Length of the non-None prefix of ready_rows (= what we can return right now)"""
+        return sum(1 for _ in itertools.takewhile(lambda x: x is not None, self.ready_rows))
+
+    def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
+        if row_idx is None:
+            self.ready_rows.append(row)
+        else:
+            # extend ready_rows to accommodate row_idx
+            idx = row_idx - self.num_returned_rows
+            if idx >= len(self.ready_rows):
+                self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
+            self.ready_rows[idx] = row
+
+    def __wait_for_requests(self) -> None:
+        """Wait for in-flight requests to complete until we have a full batch of rows"""
+        file_cache = FileCache.get()
+        _logger.debug(f'waiting for requests; ready_batch_size={self.__ready_prefix_len()}')
+        while not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
+            done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+            for f in done:
+                url = self.in_flight_requests.pop(f)
+                tmp_path, exc = f.result()
+                local_path: Optional[Path] = None
+                if tmp_path is not None:
+                    # register the file with the cache for the first column in which it's missing
+                    assert url in self.in_flight_urls
+                    _, info = self.in_flight_urls[url][0]
+                    local_path = file_cache.add(info.col.tbl.id, info.col.id, url, tmp_path)
+                    _logger.debug(f'cached {url} as {local_path}')
+
+                # add the local path/exception to the slots that reference the url
+                for row, info in self.in_flight_urls.pop(url):
+                    if exc is not None:
+                        self.row_builder.set_exc(row, info.slot_idx, exc)
+                    else:
+                        assert local_path is not None
+                        row.set_file_path(info.slot_idx, str(local_path))
+                    state = self.in_flight_rows[id(row)]
+                    state.num_missing -= 1
+                    if state.num_missing == 0:
+                        del self.in_flight_rows[id(row)]
+                        self.__add_ready_row(row, state.idx)
+                        _logger.debug(f'row {state.idx} is ready (ready_batch_size={self.__ready_prefix_len()})')
+
+    def __submit_input_batch(self, input: Iterator[DataRowBatch], executor: futures.ThreadPoolExecutor) -> None:
+        assert not self.input_finished
+        input_batch = next(input, None)
+        if input_batch is None:
+            self.input_finished = True
+            return
+        if self.batch_tbl_version is None:
+            self.batch_tbl_version = input_batch.tbl
 
-        # collect external URLs that aren't already cached, and set DataRow.file_paths for those that are
         file_cache = FileCache.get()
-        cache_misses: List[Tuple[exprs.DataRow, exprs.ColumnSlotIdx]] = []
-        missing_url_rows: Dict[str, List[exprs.DataRow]] = defaultdict(list)  # URL -> rows in which it's missing
+
+        # URLs from this input batch that aren't already in the file cache;
+        # we use a list to make sure we submit urls in the order in which they appear in the output, which minimizes
+        # the time it takes to get the next batch together
+        cache_misses: list[str] = []
+
+        url_pos: dict[str, int] = {}  # url -> row_idx; used for logging
         for row in input_batch:
+            # identify missing local files in input batch, or fill in their paths if they're already cached
+            num_missing = 0
+            row_idx = next(self.row_idx)
+
             for info in self.file_col_info:
                 url = row.file_urls[info.slot_idx]
                 if url is None or row.file_paths[info.slot_idx] is not None:
                     # nothing to do
                     continue
-                if url in missing_url_rows:
-                    missing_url_rows[url].append(row)
+                locations = self.in_flight_urls.get(url)
+                if locations is not None:
+                    # we've already seen this
+                    locations.append((row, info))
+                    num_missing += 1
                     continue
+
                 local_path = file_cache.lookup(url)
                 if local_path is None:
-                    cache_misses.append((row, info))
-                    missing_url_rows[url].append(row)
+                    cache_misses.append(url)
+                    self.in_flight_urls[url] = [(row, info)]
+                    num_missing += 1
+                    if url not in url_pos:
+                        url_pos[url] = row_idx
                 else:
                     row.set_file_path(info.slot_idx, str(local_path))
 
-        # download the cache misses in parallel
-        # TODO: set max_workers to maximize throughput
-        futures: Dict[concurrent.futures.Future, Tuple[exprs.DataRow, exprs.ColumnSlotIdx]] = {}
-        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
-            for row, info in cache_misses:
-                futures[executor.submit(self._fetch_url, row, info.slot_idx)] = (row, info)
-            for future in concurrent.futures.as_completed(futures):
-                # TODO: does this need to deal with recoverable errors (such as retry after throttling)?
-                tmp_path = future.result()
-                if tmp_path is None:
-                    continue
-                row, info = futures[future]
-                url = row.file_urls[info.slot_idx]
-                local_path = file_cache.add(self.tbl_id, info.col.id, url, tmp_path)
-                _logger.debug(f'PrefetchNode: cached {url} as {local_path}')
-                for row in missing_url_rows[url]:
-                    row.set_file_path(info.slot_idx, str(local_path))
+            if num_missing > 0:
+                self.in_flight_rows[id(row)] = self.RowState(row, row_idx, num_missing)
+            else:
+                self.__add_ready_row(row, row_idx)
 
-        return input_batch
+        _logger.debug(f'submitting {len(cache_misses)} urls')
+        for url in cache_misses:
+            f = executor.submit(self.__fetch_url, url)
+            _logger.debug(f'submitted {url} for idx {url_pos[url]}')
+            self.in_flight_requests[f] = url
 
-    def _fetch_url(self, row: exprs.DataRow, slot_idx: int) -> Optional[Path]:
+    def __fetch_url(self, url: str) -> tuple[Optional[Path], Optional[Exception]]:
         """Fetches a remote URL into Env.tmp_dir and returns its path"""
-        url = row.file_urls[slot_idx]
+        _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
         parsed = urllib.parse.urlparse(url)
         # Use len(parsed.scheme) > 1 here to ensure we're not being passed
         # a Windows filename
@@ -93,24 +232,31 @@ class CachePrefetchNode(ExecNode):
         extension = p.suffix
         tmp_path = env.Env.get().create_tmp_path(extension=extension)
         try:
+            _logger.debug(f'Downloading {url} to {tmp_path}')
             if parsed.scheme == 's3':
                 from pixeltable.utils.s3 import get_client
                 with self.boto_client_lock:
                     if self.boto_client is None:
-                        self.boto_client = get_client()
-                self.boto_client.download_file(parsed.netloc, parsed.path.lstrip('/'), str(tmp_path))
+                        config = {
+                            'max_pool_connections': self.NUM_EXECUTOR_THREADS + 4,  # +4: leave some headroom
+                            'connect_timeout': 5,
+                            'read_timeout': 30,
+                            'retries': {'max_attempts': 3, 'mode': 'adaptive'},
+                        }
+                        self.boto_client = get_client(**config)
+                self.boto_client.download_file(parsed.netloc, parsed.path.lstrip('/'), str(tmp_path))
             elif parsed.scheme == 'http' or parsed.scheme == 'https':
                 with urllib.request.urlopen(url) as resp, open(tmp_path, 'wb') as f:
                     data = resp.read()
                     f.write(data)
             else:
                 assert False, f'Unsupported URL scheme: {parsed.scheme}'
-            return tmp_path
+            _logger.debug(f'Downloaded {url} to {tmp_path}')
+            return tmp_path, None
         except Exception as e:
             # we want to add the file url to the exception message
             exc = excs.Error(f'Failed to download {url}: {e}')
-            self.row_builder.set_exc(row, slot_idx, exc)
+            _logger.debug(f'Failed to download {url}: {e}', exc_info=e)
             if not self.ctx.ignore_errors:
                 raise exc from None  # suppress original exception
-            return None
-
+            return None, exc
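
The rewritten CachePrefetchNode overlaps downloads with evaluation instead of blocking per input batch: __submit_input_batch keeps enough URLs in flight to fill the next batch, and completed rows land in ready_rows, a deque indexed relative to num_returned_rows with None holes for rows whose downloads are still pending, which is what lets retain_input_order emit rows in their original order. A self-contained sketch of just that reordering scheme (all names here are mine, not the module's):

    from collections import deque

    ready: deque = deque()   # None marks a slot whose item is still in flight
    num_returned = 0

    def add_ready(item, idx: int) -> None:
        pos = idx - num_returned                  # position relative to the emitted prefix
        if pos >= len(ready):
            ready.extend([None] * (pos - len(ready) + 1))
        ready[pos] = item

    def pop_ready_prefix() -> list:
        global num_returned
        out = []
        while ready and ready[0] is not None:     # only a hole-free prefix may be emitted
            out.append(ready.popleft())
        num_returned += len(out)
        return out

    add_ready('b', 1)                             # finished out of order: nothing emittable yet
    assert pop_ready_prefix() == []
    add_ready('a', 0)
    assert pop_ready_prefix() == ['a', 'b']
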
pixeltable/exec/data_row_batch.py CHANGED
@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import List, Iterator, Optional
+from typing import Iterator, Optional
 import logging
 
 import pixeltable.exprs as exprs
@@ -53,8 +53,8 @@ class DataRowBatch:
         return self.rows[index]
 
     def flush_imgs(
-        self, idx_range: Optional[slice] = None, stored_img_info: Optional[List[exprs.ColumnSlotIdx]] = None,
-        flushed_slot_idxs: Optional[List[int]] = None
+        self, idx_range: Optional[slice] = None, stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
+        flushed_slot_idxs: Optional[list[int]] = None
     ) -> None:
         """Flushes images in the given range of rows."""
         assert self.tbl is not None
pixeltable/exec/exec_context.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional, List
+from typing import Optional
 
 import sqlalchemy as sql
 
@@ -8,7 +8,7 @@ class ExecContext:
     """Class for execution runtime constants"""
     def __init__(
         self, row_builder: exprs.RowBuilder, *, show_pbar: bool = False, batch_size: int = 0,
-        pk_clause: Optional[List[sql.ClauseElement]] = None, num_computed_exprs: int = 0,
+        pk_clause: Optional[list[sql.ClauseElement]] = None, num_computed_exprs: int = 0,
         ignore_errors: bool = False
     ):
         self.show_pbar = show_pbar
pixeltable/exec/exec_node.py CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import abc
-from typing import TYPE_CHECKING, Iterable, Iterator, List, Optional
+from typing import TYPE_CHECKING, Iterable, Iterator, Optional
 
 import pixeltable.exprs as exprs
 
@@ -43,7 +43,7 @@ class ExecNode(abc.ABC):
         if self.input is not None:
             self.input.set_ctx(ctx)
 
-    def set_stored_img_cols(self, stored_img_cols: List[exprs.ColumnSlotIdx]) -> None:
+    def set_stored_img_cols(self, stored_img_cols: list[exprs.ColumnSlotIdx]) -> None:
         self.stored_img_cols = stored_img_cols
         # propagate batch size to the source
         if self.input is not None: