pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.
Files changed (202)
  1. pixeltable/__init__.py +23 -5
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/__init__.py +5 -3
  4. pixeltable/catalog/catalog.py +1318 -404
  5. pixeltable/catalog/column.py +186 -115
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +11 -43
  8. pixeltable/catalog/insertable_table.py +167 -79
  9. pixeltable/catalog/path.py +61 -23
  10. pixeltable/catalog/schema_object.py +9 -10
  11. pixeltable/catalog/table.py +626 -308
  12. pixeltable/catalog/table_metadata.py +101 -0
  13. pixeltable/catalog/table_version.py +713 -569
  14. pixeltable/catalog/table_version_handle.py +37 -6
  15. pixeltable/catalog/table_version_path.py +42 -29
  16. pixeltable/catalog/tbl_ops.py +50 -0
  17. pixeltable/catalog/update_status.py +191 -0
  18. pixeltable/catalog/view.py +108 -94
  19. pixeltable/config.py +128 -22
  20. pixeltable/dataframe.py +188 -100
  21. pixeltable/env.py +407 -136
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +3 -0
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +231 -0
  27. pixeltable/exec/cell_reconstruction_node.py +135 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +7 -6
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +190 -30
  37. pixeltable/exec/globals.py +32 -0
  38. pixeltable/exec/in_memory_data_node.py +18 -18
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +206 -101
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +34 -30
  46. pixeltable/exprs/column_ref.py +92 -96
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +152 -55
  50. pixeltable/exprs/expr.py +62 -43
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +75 -37
  54. pixeltable/exprs/globals.py +1 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +10 -27
  57. pixeltable/exprs/is_null.py +1 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +5 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +127 -53
  64. pixeltable/exprs/rowid_ref.py +8 -12
  65. pixeltable/exprs/similarity_expr.py +50 -25
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +10 -10
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +20 -18
  78. pixeltable/func/signature.py +43 -16
  79. pixeltable/func/tools.py +23 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +6 -0
  82. pixeltable/functions/anthropic.py +93 -33
  83. pixeltable/functions/audio.py +114 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +1 -1
  86. pixeltable/functions/deepseek.py +20 -9
  87. pixeltable/functions/fireworks.py +2 -2
  88. pixeltable/functions/gemini.py +28 -11
  89. pixeltable/functions/globals.py +13 -13
  90. pixeltable/functions/groq.py +108 -0
  91. pixeltable/functions/huggingface.py +1046 -23
  92. pixeltable/functions/image.py +9 -18
  93. pixeltable/functions/llama_cpp.py +23 -8
  94. pixeltable/functions/math.py +3 -4
  95. pixeltable/functions/mistralai.py +4 -15
  96. pixeltable/functions/ollama.py +16 -9
  97. pixeltable/functions/openai.py +104 -82
  98. pixeltable/functions/openrouter.py +143 -0
  99. pixeltable/functions/replicate.py +2 -2
  100. pixeltable/functions/reve.py +250 -0
  101. pixeltable/functions/string.py +21 -28
  102. pixeltable/functions/timestamp.py +13 -14
  103. pixeltable/functions/together.py +4 -6
  104. pixeltable/functions/twelvelabs.py +92 -0
  105. pixeltable/functions/util.py +6 -1
  106. pixeltable/functions/video.py +1388 -106
  107. pixeltable/functions/vision.py +7 -7
  108. pixeltable/functions/whisper.py +15 -7
  109. pixeltable/functions/whisperx.py +179 -0
  110. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  111. pixeltable/globals.py +332 -105
  112. pixeltable/index/base.py +13 -22
  113. pixeltable/index/btree.py +23 -22
  114. pixeltable/index/embedding_index.py +32 -44
  115. pixeltable/io/__init__.py +4 -2
  116. pixeltable/io/datarows.py +7 -6
  117. pixeltable/io/external_store.py +49 -77
  118. pixeltable/io/fiftyone.py +11 -11
  119. pixeltable/io/globals.py +29 -28
  120. pixeltable/io/hf_datasets.py +17 -9
  121. pixeltable/io/label_studio.py +70 -66
  122. pixeltable/io/lancedb.py +3 -0
  123. pixeltable/io/pandas.py +12 -11
  124. pixeltable/io/parquet.py +13 -93
  125. pixeltable/io/table_data_conduit.py +71 -47
  126. pixeltable/io/utils.py +3 -3
  127. pixeltable/iterators/__init__.py +2 -1
  128. pixeltable/iterators/audio.py +21 -11
  129. pixeltable/iterators/document.py +116 -55
  130. pixeltable/iterators/image.py +5 -2
  131. pixeltable/iterators/video.py +293 -13
  132. pixeltable/metadata/__init__.py +4 -2
  133. pixeltable/metadata/converters/convert_18.py +2 -2
  134. pixeltable/metadata/converters/convert_19.py +2 -2
  135. pixeltable/metadata/converters/convert_20.py +2 -2
  136. pixeltable/metadata/converters/convert_21.py +2 -2
  137. pixeltable/metadata/converters/convert_22.py +2 -2
  138. pixeltable/metadata/converters/convert_24.py +2 -2
  139. pixeltable/metadata/converters/convert_25.py +2 -2
  140. pixeltable/metadata/converters/convert_26.py +2 -2
  141. pixeltable/metadata/converters/convert_29.py +4 -4
  142. pixeltable/metadata/converters/convert_34.py +2 -2
  143. pixeltable/metadata/converters/convert_36.py +2 -2
  144. pixeltable/metadata/converters/convert_37.py +15 -0
  145. pixeltable/metadata/converters/convert_38.py +39 -0
  146. pixeltable/metadata/converters/convert_39.py +124 -0
  147. pixeltable/metadata/converters/convert_40.py +73 -0
  148. pixeltable/metadata/converters/util.py +13 -12
  149. pixeltable/metadata/notes.py +4 -0
  150. pixeltable/metadata/schema.py +79 -42
  151. pixeltable/metadata/utils.py +74 -0
  152. pixeltable/mypy/__init__.py +3 -0
  153. pixeltable/mypy/mypy_plugin.py +123 -0
  154. pixeltable/plan.py +274 -223
  155. pixeltable/share/__init__.py +1 -1
  156. pixeltable/share/packager.py +259 -129
  157. pixeltable/share/protocol/__init__.py +34 -0
  158. pixeltable/share/protocol/common.py +170 -0
  159. pixeltable/share/protocol/operation_types.py +33 -0
  160. pixeltable/share/protocol/replica.py +109 -0
  161. pixeltable/share/publish.py +213 -57
  162. pixeltable/store.py +238 -175
  163. pixeltable/type_system.py +104 -63
  164. pixeltable/utils/__init__.py +2 -3
  165. pixeltable/utils/arrow.py +108 -13
  166. pixeltable/utils/av.py +298 -0
  167. pixeltable/utils/azure_store.py +305 -0
  168. pixeltable/utils/code.py +3 -3
  169. pixeltable/utils/console_output.py +4 -1
  170. pixeltable/utils/coroutine.py +6 -23
  171. pixeltable/utils/dbms.py +31 -5
  172. pixeltable/utils/description_helper.py +4 -5
  173. pixeltable/utils/documents.py +5 -6
  174. pixeltable/utils/exception_handler.py +7 -30
  175. pixeltable/utils/filecache.py +6 -6
  176. pixeltable/utils/formatter.py +4 -6
  177. pixeltable/utils/gcs_store.py +283 -0
  178. pixeltable/utils/http_server.py +2 -3
  179. pixeltable/utils/iceberg.py +1 -2
  180. pixeltable/utils/image.py +17 -0
  181. pixeltable/utils/lancedb.py +88 -0
  182. pixeltable/utils/local_store.py +316 -0
  183. pixeltable/utils/misc.py +5 -0
  184. pixeltable/utils/object_stores.py +528 -0
  185. pixeltable/utils/pydantic.py +60 -0
  186. pixeltable/utils/pytorch.py +5 -6
  187. pixeltable/utils/s3_store.py +392 -0
  188. pixeltable-0.4.20.dist-info/METADATA +587 -0
  189. pixeltable-0.4.20.dist-info/RECORD +218 -0
  190. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
  191. pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
  192. pixeltable/__version__.py +0 -3
  193. pixeltable/ext/__init__.py +0 -17
  194. pixeltable/ext/functions/__init__.py +0 -11
  195. pixeltable/ext/functions/whisperx.py +0 -77
  196. pixeltable/utils/media_store.py +0 -77
  197. pixeltable/utils/s3.py +0 -17
  198. pixeltable/utils/sample.py +0 -25
  199. pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
  200. pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
  201. pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
  202. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
pixeltable/exceptions.py CHANGED
@@ -10,6 +10,12 @@ class Error(Exception):
 
 
 class ExprEvalError(Exception):
+    """
+    Used during query execution to signal expr evaluation failures.
+
+    NOT A USER-FACING EXCEPTION. All ExprEvalError instances need to be converted into Error instances.
+    """
+
     expr: 'exprs.Expr'
     expr_msg: str
     exc: Exception
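The docstring added above makes ExprEvalError an internal signal only. A hypothetical sketch of the conversion a caller is expected to perform (the fields expr_msg and exc come from the declaration above; the wrapping message and the helper name are illustrative, not pixeltable code):

    from pixeltable import exceptions as excs

    def eval_row(row) -> None:
        try:
            ...  # evaluate the row's expressions
        except excs.ExprEvalError as e:
            # surface a user-facing Error and chain the original cause
            raise excs.Error(f'error evaluating {e.expr_msg}: {e.exc}') from e.exc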
pixeltable/exec/__init__.py CHANGED
@@ -2,11 +2,14 @@
 
 from .aggregation_node import AggregationNode
 from .cache_prefetch_node import CachePrefetchNode
+from .cell_materialization_node import CellMaterializationNode
+from .cell_reconstruction_node import CellReconstructionNode
 from .component_iteration_node import ComponentIterationNode
 from .data_row_batch import DataRowBatch
 from .exec_context import ExecContext
 from .exec_node import ExecNode
 from .expr_eval import ExprEvalNode
 from .in_memory_data_node import InMemoryDataNode
+from .object_store_save_node import ObjectStoreSaveNode
 from .row_update_node import RowUpdateNode
 from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlSampleNode, SqlScanNode
pixeltable/exec/aggregation_node.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import logging
 import sys
-from typing import Any, AsyncIterator, Iterable, Optional, cast
+from typing import Any, AsyncIterator, Iterable, cast
 
 from pixeltable import catalog, exceptions as excs, exprs
 
@@ -19,18 +19,18 @@ class AggregationNode(ExecNode):
     At the moment, this returns all results in a single DataRowBatch.
     """
 
-    group_by: Optional[list[exprs.Expr]]
+    group_by: list[exprs.Expr] | None
     input_exprs: list[exprs.Expr]
     agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
     agg_fn_calls: list[exprs.FunctionCall]
     output_batch: DataRowBatch
-    limit: Optional[int]
+    limit: int | None
 
     def __init__(
        self,
        tbl: catalog.TableVersionHandle,
        row_builder: exprs.RowBuilder,
-       group_by: Optional[list[exprs.Expr]],
+       group_by: list[exprs.Expr] | None,
        agg_fn_calls: list[exprs.FunctionCall],
        input_exprs: Iterable[exprs.Expr],
        input: ExecNode,
@@ -45,7 +45,7 @@ class AggregationNode(ExecNode):
         # we need to make sure to refer to the same exprs that RowBuilder.eval() will use
         self.agg_fn_calls = [cast(exprs.FunctionCall, e) for e in self.agg_fn_eval_ctx.target_exprs]
         # create output_batch here, rather than in __iter__(), so we don't need to remember tbl and row_builder
-        self.output_batch = DataRowBatch(tbl, row_builder, 0)
+        self.output_batch = DataRowBatch(row_builder)
         self.limit = None
 
     def set_limit(self, limit: int) -> None:
@@ -72,8 +72,8 @@ class AggregationNode(ExecNode):
             raise excs.ExprEvalError(fn_call, expr_msg, exc, exc_tb, input_vals, row_num) from exc
 
     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
-        prev_row: Optional[exprs.DataRow] = None
-        current_group: Optional[list[Any]] = None  # the values of the group-by exprs
+        prev_row: exprs.DataRow | None = None
+        current_group: list[Any] | None = None  # the values of the group-by exprs
         num_input_rows = 0
         num_output_rows = 0
         async for row_batch in self.input:
@@ -103,6 +103,5 @@ class AggregationNode(ExecNode):
             self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
             self.output_batch.add_row(prev_row)
 
-        self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
         _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
         yield self.output_batch
pixeltable/exec/cache_prefetch_node.py CHANGED
@@ -9,11 +9,12 @@ import urllib.request
 from collections import deque
 from concurrent import futures
 from pathlib import Path
-from typing import Any, AsyncIterator, Iterator, Optional
+from typing import AsyncIterator, Iterator
 from uuid import UUID
 
-from pixeltable import catalog, env, exceptions as excs, exprs
+from pixeltable import exceptions as excs, exprs
 from pixeltable.utils.filecache import FileCache
+from pixeltable.utils.object_stores import ObjectOps
 
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
@@ -25,35 +26,35 @@ class CachePrefetchNode(ExecNode):
     """Brings files with external URLs into the cache
 
     TODO:
-    adapting the number of download threads at runtime to maximize throughput
+    Process a row at a time and limit the number of in-flight rows to control memory usage
+    Create asyncio.Tasks to consume our input in order to increase concurrency.
     """
 
+    QUEUE_DEPTH_HIGH_WATER = 50  # target number of in-flight requests
+    QUEUE_DEPTH_LOW_WATER = 20  # target number of in-flight requests
     BATCH_SIZE = 16
-    NUM_EXECUTOR_THREADS = 16
+    MAX_WORKERS = 15
 
     retain_input_order: bool  # if True, return rows in the exact order they were received
     file_col_info: list[exprs.ColumnSlotIdx]
-    boto_client: Optional[Any]
-    boto_client_lock: threading.Lock
 
     # execution state
-    batch_tbl_version: Optional[catalog.TableVersionHandle]  # needed to construct output batches
     num_returned_rows: int
 
     # ready_rows: rows that are ready to be returned, ordered by row idx;
     # the implied row idx of ready_rows[0] is num_returned_rows
-    ready_rows: deque[Optional[exprs.DataRow]]
+    ready_rows: deque[exprs.DataRow | None]
 
     in_flight_rows: dict[int, CachePrefetchNode.RowState]  # rows with in-flight urls; id(row) -> RowState
     in_flight_requests: dict[futures.Future, str]  # in-flight requests for urls; future -> URL
     in_flight_urls: dict[str, list[tuple[exprs.DataRow, exprs.ColumnSlotIdx]]]  # URL -> [(row, info)]
     input_finished: bool
-    row_idx: Iterator[Optional[int]]
+    row_idx: Iterator[int | None]
 
     @dataclasses.dataclass
     class RowState:
         row: exprs.DataRow
-        idx: Optional[int]  # position in input stream; None if we don't retain input order
+        idx: int | None  # position in input stream; None if we don't retain input order
         num_missing: int  # number of missing URLs in this row
 
     def __init__(
@@ -64,11 +65,6 @@ class CachePrefetchNode(ExecNode):
         self.retain_input_order = retain_input_order
         self.file_col_info = file_col_info
 
-        # clients for specific services are constructed as needed, because it's time-consuming
-        self.boto_client = None
-        self.boto_client_lock = threading.Lock()
-
-        self.batch_tbl_version = None
         self.num_returned_rows = 0
         self.ready_rows = deque()
         self.in_flight_rows = {}
@@ -76,26 +72,44 @@ class CachePrefetchNode(ExecNode):
         self.in_flight_urls = {}
         self.input_finished = False
         self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+        assert self.QUEUE_DEPTH_HIGH_WATER > self.QUEUE_DEPTH_LOW_WATER
 
-    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
-        input_iter = self.input.__aiter__()
-        with futures.ThreadPoolExecutor(max_workers=self.NUM_EXECUTOR_THREADS) as executor:
-            # we create enough in-flight requests to fill the first batch
-            while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
-                await self.__submit_input_batch(input_iter, executor)
+    @property
+    def queued_work(self) -> int:
+        return len(self.in_flight_requests)
 
-            while True:
-                # try to assemble a full batch of output rows
-                if not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
-                    self.__wait_for_requests()
-
-                # try to create enough in-flight requests to fill the next batch
-                while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
-                    await self.__submit_input_batch(input_iter, executor)
+    async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> DataRowBatch | None:
+        """Get the next batch of input rows, or None if there are no more rows"""
+        try:
+            input_batch = await anext(input_iter)
+            if input_batch is None:
+                self.input_finished = True
+            return input_batch
+        except StopAsyncIteration:
+            self.input_finished = True
+            return None
 
-                if len(self.ready_rows) > 0:
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        input_iter = aiter(self.input)
+        with futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            while True:
+                # Create work to fill the queue to the high water mark ... ?without overrunning the in-flight row limit.
+                while not self.input_finished and self.queued_work < self.QUEUE_DEPTH_HIGH_WATER:
+                    input_batch = await self.get_input_batch(input_iter)
+                    if input_batch is not None:
+                        self.__process_input_batch(input_batch, executor)
+
+                # Wait for enough completions to enable more queueing or if we're done
+                while self.queued_work > self.QUEUE_DEPTH_LOW_WATER or (self.input_finished and self.queued_work > 0):
+                    done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+                    self.__process_completions(done, ignore_errors=self.ctx.ignore_errors)
+
+                # Emit results to meet batch size requirements or empty the in-flight row queue
+                if self.__has_ready_batch() or (
+                    len(self.ready_rows) > 0 and self.input_finished and self.queued_work == 0
+                ):
                     # create DataRowBatch from the first BATCH_SIZE ready rows
-                    batch = DataRowBatch(self.batch_tbl_version, self.row_builder)
+                    batch = DataRowBatch(self.row_builder)
                     rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
                     for row in rows:
                         assert row is not None
@@ -104,23 +118,16 @@ class CachePrefetchNode(ExecNode):
                     _logger.debug(f'returning {len(rows)} rows')
                     yield batch
 
-                if self.input_finished and self.__num_pending_rows() == 0:
+                if self.input_finished and self.queued_work == 0 and len(self.ready_rows) == 0:
                     return
 
-    def __num_pending_rows(self) -> int:
-        return len(self.in_flight_rows) + len(self.ready_rows)
-
     def __has_ready_batch(self) -> bool:
        """True if there are >= BATCH_SIZES entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
        return (
            sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
        )
 
-    def __ready_prefix_len(self) -> int:
-        """Length of the non-None prefix of ready_rows (= what we can return right now)"""
-        return sum(1 for _ in itertools.takewhile(lambda x: x is not None, self.ready_rows))
-
-    def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
+    def __add_ready_row(self, row: exprs.DataRow, row_idx: int | None) -> None:
         if row_idx is None:
             self.ready_rows.append(row)
         else:
@@ -130,52 +137,36 @@ class CachePrefetchNode(ExecNode):
                 self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
             self.ready_rows[idx] = row
 
-    def __wait_for_requests(self) -> None:
-        """Wait for in-flight requests to complete until we have a full batch of rows"""
+    def __process_completions(self, done: set[futures.Future], ignore_errors: bool) -> None:
         file_cache = FileCache.get()
-        _logger.debug(f'waiting for requests; ready_batch_size={self.__ready_prefix_len()}')
-        while not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
-            done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
-            for f in done:
-                url = self.in_flight_requests.pop(f)
-                tmp_path, exc = f.result()
-                local_path: Optional[Path] = None
-                if tmp_path is not None:
-                    # register the file with the cache for the first column in which it's missing
-                    assert url in self.in_flight_urls
-                    _, info = self.in_flight_urls[url][0]
-                    local_path = file_cache.add(info.col.tbl.id, info.col.id, url, tmp_path)
-                    _logger.debug(f'cached {url} as {local_path}')
-
-                # add the local path/exception to the slots that reference the url
-                for row, info in self.in_flight_urls.pop(url):
-                    if exc is not None:
-                        self.row_builder.set_exc(row, info.slot_idx, exc)
-                    else:
-                        assert local_path is not None
-                        row.set_file_path(info.slot_idx, str(local_path))
-                    state = self.in_flight_rows[id(row)]
-                    state.num_missing -= 1
-                    if state.num_missing == 0:
-                        del self.in_flight_rows[id(row)]
-                        self.__add_ready_row(row, state.idx)
-                        _logger.debug(f'row {state.idx} is ready (ready_batch_size={self.__ready_prefix_len()})')
-
-    async def __submit_input_batch(
-        self, input: AsyncIterator[DataRowBatch], executor: futures.ThreadPoolExecutor
-    ) -> None:
-        assert not self.input_finished
-        input_batch: Optional[DataRowBatch]
-        try:
-            input_batch = await anext(input)
-        except StopAsyncIteration:
-            input_batch = None
-        if input_batch is None:
-            self.input_finished = True
-            return
-        if self.batch_tbl_version is None:
-            self.batch_tbl_version = input_batch.tbl
-
+        for f in done:
+            url = self.in_flight_requests.pop(f)
+            tmp_path, exc = f.result()
+            if exc is not None and not ignore_errors:
+                raise exc
+            local_path: Path | None = None
+            if tmp_path is not None:
+                # register the file with the cache for the first column in which it's missing
+                assert url in self.in_flight_urls
+                _, info = self.in_flight_urls[url][0]
+                local_path = file_cache.add(info.col.get_tbl().id, info.col.id, url, tmp_path)
+                _logger.debug(f'cached {url} as {local_path}')
+
+            # add the local path/exception to the slots that reference the url
+            for row, info in self.in_flight_urls.pop(url):
+                if exc is not None:
+                    self.row_builder.set_exc(row, info.slot_idx, exc)
+                else:
+                    assert local_path is not None
+                    row.set_file_path(info.slot_idx, str(local_path))
+                state = self.in_flight_rows[id(row)]
+                state.num_missing -= 1
+                if state.num_missing == 0:
+                    del self.in_flight_rows[id(row)]
+                    self.__add_ready_row(row, state.idx)
+
+    def __process_input_batch(self, input_batch: DataRowBatch, executor: futures.ThreadPoolExecutor) -> None:
+        """Process a batch of input rows, submitting URLs for download and adding ready rows to ready_rows"""
         file_cache = FileCache.get()
 
         # URLs from this input batch that aren't already in the file cache;
@@ -183,7 +174,7 @@
         # the time it takes to get the next batch together
         cache_misses: list[str] = []
 
-        url_pos: dict[str, int] = {}  # url -> row_idx; used for logging
+        url_pos: dict[str, int | None] = {}  # url -> row_idx; used for logging
         for row in input_batch:
             # identify missing local files in input batch, or fill in their paths if they're already cached
             num_missing = 0
@@ -222,8 +213,10 @@
             _logger.debug(f'submitted {url} for idx {url_pos[url]}')
             self.in_flight_requests[f] = url
 
-    def __fetch_url(self, url: str) -> tuple[Optional[Path], Optional[Exception]]:
-        """Fetches a remote URL into Env.tmp_dir and returns its path"""
+    def __fetch_url(self, url: str) -> tuple[Path | None, Exception | None]:
+        """Fetches a remote URL into the TempStore and returns its path"""
+        from pixeltable.utils.local_store import TempStore
+
         _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
         parsed = urllib.parse.urlparse(url)
         # Use len(parsed.scheme) > 1 here to ensure we're not being passed
@@ -234,34 +227,14 @@
         if parsed.path:
             p = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
             extension = p.suffix
-        tmp_path = env.Env.get().create_tmp_path(extension=extension)
+        tmp_path = TempStore.create_path(extension=extension)
         try:
             _logger.debug(f'Downloading {url} to {tmp_path}')
-            if parsed.scheme == 's3':
-                from pixeltable.utils.s3 import get_client
-
-                with self.boto_client_lock:
-                    if self.boto_client is None:
-                        config = {
-                            'max_pool_connections': self.NUM_EXECUTOR_THREADS + 4,  # +4: leave some headroom
-                            'connect_timeout': 5,
-                            'read_timeout': 30,
-                            'retries': {'max_attempts': 3, 'mode': 'adaptive'},
-                        }
-                        self.boto_client = get_client(**config)
-                self.boto_client.download_file(parsed.netloc, parsed.path.lstrip('/'), str(tmp_path))
-            elif parsed.scheme in ('http', 'https'):
-                with urllib.request.urlopen(url) as resp, open(tmp_path, 'wb') as f:
-                    data = resp.read()
-                    f.write(data)
-            else:
-                raise AssertionError(f'Unsupported URL scheme: {parsed.scheme}')
+            ObjectOps.copy_object_to_local_file(url, tmp_path)
             _logger.debug(f'Downloaded {url} to {tmp_path}')
             return tmp_path, None
         except Exception as e:
             # we want to add the file url to the exception message
             exc = excs.Error(f'Failed to download {url}: {e}')
             _logger.debug(f'Failed to download {url}: {e}', exc_info=e)
-            if not self.ctx.ignore_errors:
-                raise exc from None  # suppress original exception
             return None, exc
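The rewritten __aiter__ above keeps a bounded queue of download futures between a high and a low water mark. A self-contained sketch of that scheduling pattern, using illustrative names and a dummy fetch function rather than the pixeltable API:

    from concurrent import futures
    import time

    HIGH_WATER, LOW_WATER = 50, 20

    def fetch(url: str) -> str:
        time.sleep(0.01)  # stand-in for a download
        return url

    urls = iter(f'https://example.com/{i}' for i in range(200))
    in_flight: dict[futures.Future, str] = {}
    done_urls: list[str] = []

    with futures.ThreadPoolExecutor(max_workers=15) as executor:
        exhausted = False
        while not exhausted or in_flight:
            # fill the queue up to the high-water mark
            while not exhausted and len(in_flight) < HIGH_WATER:
                url = next(urls, None)
                if url is None:
                    exhausted = True
                    break
                in_flight[executor.submit(fetch, url)] = url
            # drain until we drop below the low-water mark (or finish the remaining work)
            while len(in_flight) > LOW_WATER or (exhausted and in_flight):
                done, _ = futures.wait(in_flight, return_when=futures.FIRST_COMPLETED)
                for f in done:
                    done_urls.append(f.result())
                    del in_flight[f]

    assert len(done_urls) == 200

Keeping the two thresholds apart means the executor is refilled in bursts instead of one future at a time, which is the same trade-off the node makes between throughput and the number of in-flight rows.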
pixeltable/exec/cell_materialization_node.py ADDED
@@ -0,0 +1,231 @@
+from __future__ import annotations
+
+import io
+import logging
+import os
+from pathlib import Path
+from typing import Any, AsyncIterator
+
+import numpy as np
+import pgvector.sqlalchemy  # type: ignore[import-untyped]
+import PIL.Image
+import sqlalchemy as sql
+
+import pixeltable.type_system as ts
+import pixeltable.utils.image as image_utils
+from pixeltable import catalog, exprs
+from pixeltable.env import Env
+from pixeltable.utils.local_store import LocalStore
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+from .globals import INLINED_OBJECT_MD_KEY, InlinedObjectMd
+
+_logger = logging.getLogger('pixeltable')
+
+
+class CellMaterializationNode(ExecNode):
+    """
+    Node to populate DataRow.cell_vals/cell_md.
+
+    For now, the scope is limited to populating DataRow.cells_vals for json and array columns.
+
+    Array values:
+    - Arrays < MAX_DB_ARRAY_SIZE are stored inline in the db column
+    - Larger arrays are written to inlined_obj_files
+    - Bool arrays are stored as packed bits (uint8)
+    - cell_md: holds the url of the file, plus start and end offsets, plus bool flag and shape for bool arrays
+      (this allows us to query cell_md to get the total external storage size of an array column)
+
+    Json values:
+    - Inlined images and arrays are written to inlined_obj_files and replaced with a dict containing the object
+      location
+    - Bool arrays are also stored as packed bits; the dict also contains the shape and bool flag
+    - cell_md contains the list of urls for the inlined objects.
+
+    TODO:
+    - execute file IO via asyncio Tasks in a thread pool?
+      (we already seem to be getting 90% of hardware IO throughput)
+    - subsume all cell materialization
+    """
+
+    output_col_info: dict[catalog.Column, int]  # value: slot idx
+
+    # execution state
+    inlined_obj_files: list[Path]  # only [-1] is open for writing
+    buffered_writer: io.BufferedWriter | None  # BufferedWriter for inlined_obj_files[-1]
+
+    MIN_FILE_SIZE = 8 * 2**20  # 8MB
+    MAX_DB_ARRAY_SIZE = 512  # max size of array stored in table column; in bytes
+
+    def __init__(self, input: ExecNode):
+        super().__init__(input.row_builder, [], [], input)
+        self.output_col_info = {
+            col: slot_idx
+            for col, slot_idx in input.row_builder.table_columns.items()
+            if slot_idx is not None and (col.col_type.is_json_type() or col.col_type.is_array_type())
+        }
+        self.inlined_obj_files = []
+        self.buffered_writer = None
+
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        async for batch in self.input:
+            for row in batch:
+                for col, slot_idx in self.output_col_info.items():
+                    if row.has_exc(slot_idx):
+                        # Nulls in JSONB columns need to be stored as sql.sql.null(), otherwise it stores a json 'null'
+                        row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
+                        exc = row.get_exc(slot_idx)
+                        row.cell_md[col.id] = exprs.CellMd(errortype=type(exc).__name__, errormsg=str(exc))
+                        continue
+
+                    val = row[slot_idx]
+                    if val is None:
+                        row.cell_vals[col.id] = sql.sql.null() if col.col_type.is_json_type() else None
+                        row.cell_md[col.id] = None
+                        continue
+
+                    if col.col_type.is_json_type():
+                        self._materialize_json_cell(row, col, val)
+                    else:
+                        assert col.col_type.is_array_type()
+                        assert isinstance(val, np.ndarray)
+                        self._materialize_array_cell(row, col, val)
+
+            # continue with only the currently open file
+            self.inlined_obj_files = self.inlined_obj_files[-1:]
+
+            yield batch
+
+        self._flush_buffer(finalize=True)
+
+    def init_writer(self) -> None:
+        if self.buffered_writer is None:
+            self._reset_buffer()
+        assert self.buffered_writer is not None
+
+    def close(self) -> None:
+        if self.buffered_writer is not None:
+            # there must have been an error, otherwise _flush_full_buffer(finalize=True) would have set this to None
+            self.buffered_writer.close()
+            self.buffered_writer = None
+
+    def _materialize_json_cell(self, row: exprs.DataRow, col: catalog.Column, val: Any) -> None:
+        if self._json_has_inlined_objs(val):
+            row.cell_vals[col.id] = self._rewrite_json(val)
+            row.cell_md[col.id] = exprs.CellMd(file_urls=[local_path.as_uri() for local_path in self.inlined_obj_files])
+        else:
+            row.cell_vals[col.id] = val
+            row.cell_md[col.id] = None
+
+    def _materialize_array_cell(self, row: exprs.DataRow, col: catalog.Column, val: np.ndarray) -> None:
+        if isinstance(col.sa_col_type, pgvector.sqlalchemy.Vector):
+            # this is a vector column (ie, used for a vector index): store the array itself
+            row.cell_vals[col.id] = val
+            row.cell_md[col.id] = None
+        elif val.nbytes <= self.MAX_DB_ARRAY_SIZE:
+            # this array is small enough to store in the db column (type: binary) directly
+            buffer = io.BytesIO()
+            np.save(buffer, val, allow_pickle=False)
+            row.cell_vals[col.id] = buffer.getvalue()
+            row.cell_md[col.id] = None
+        else:
+            # append this array to the buffer and store its location in the cell md
+            ar: np.ndarray
+            if np.issubdtype(val.dtype, np.bool_):
+                # for bool arrays, store as packed bits, otherwise it's 1 byte per element
+                ar = np.packbits(val)
+            else:
+                ar = val
+            self.init_writer()
+            start = self.buffered_writer.tell()
+            np.save(self.buffered_writer, ar, allow_pickle=False)
+            end = self.buffered_writer.tell()
+            row.cell_vals[col.id] = None
+            cell_md = exprs.CellMd(
+                file_urls=[self.inlined_obj_files[-1].as_uri()], array_md=exprs.ArrayMd(start=start, end=end)
+            )
+            if np.issubdtype(val.dtype, np.bool_):
+                cell_md.array_md.is_bool = True
+                cell_md.array_md.shape = val.shape
+            row.cell_md[col.id] = cell_md
+            self._flush_buffer()
+
+        assert row.cell_vals[col.id] is not None or row.cell_md[col.id] is not None
+
+    def _json_has_inlined_objs(self, element: Any) -> bool:
+        if isinstance(element, list):
+            return any(self._json_has_inlined_objs(v) for v in element)
+        if isinstance(element, dict):
+            return any(self._json_has_inlined_objs(v) for v in element.values())
+        return isinstance(element, (np.ndarray, PIL.Image.Image))
+
+    def _rewrite_json(self, element: Any) -> Any:
+        """Recursively rewrites a JSON structure by writing any inlined arrays or images to self.buffered_writer."""
+        if isinstance(element, list):
+            return [self._rewrite_json(v) for v in element]
+        if isinstance(element, dict):
+            return {k: self._rewrite_json(v) for k, v in element.items()}
+        if isinstance(element, np.ndarray):
+            obj_md = self._write_inlined_array(element)
+            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+        if isinstance(element, PIL.Image.Image):
+            obj_md = self._write_inlined_image(element)
+            return {INLINED_OBJECT_MD_KEY: obj_md.as_dict()}
+        return element
+
+    def _write_inlined_array(self, ar: np.ndarray) -> InlinedObjectMd:
+        """Write an ndarray to buffered_writer and return its metadata."""
+        self.init_writer()
+        url_idx = len(self.inlined_obj_files) - 1
+        start = self.buffered_writer.tell()
+        shape: tuple[int, ...] | None
+        is_bool_array: bool
+        if np.issubdtype(ar.dtype, np.bool_):
+            shape = ar.shape
+            ar = np.packbits(ar)
+            is_bool_array = True
+        else:
+            shape = None
+            is_bool_array = False
+        np.save(self.buffered_writer, ar, allow_pickle=False)
+        end = self.buffered_writer.tell()
+        self._flush_buffer()
+        return InlinedObjectMd(
+            type=ts.ColumnType.Type.ARRAY.name,
+            url_idx=url_idx,
+            array_md=exprs.ArrayMd(start=start, end=end, is_bool=is_bool_array, shape=shape),
+        )
+
+    def _write_inlined_image(self, img: PIL.Image.Image) -> InlinedObjectMd:
+        """Write a PIL image to buffered_writer and return: index into inlined_obj_files, start offset, end offset"""
+        self.init_writer()
+        url_idx = len(self.inlined_obj_files) - 1
+        start = self.buffered_writer.tell()
+        img.save(self.buffered_writer, format=image_utils.default_format(img))
+        end = self.buffered_writer.tell()
+        self._flush_buffer()
+        return InlinedObjectMd(type=ts.ColumnType.Type.IMAGE.name, url_idx=url_idx, img_start=start, img_end=end)
+
+    def _reset_buffer(self) -> None:
+        local_path = LocalStore(Env.get().media_dir)._prepare_path_raw(
+            self.row_builder.tbl.id, 0, self.row_builder.tbl.version
+        )
+        self.inlined_obj_files.append(local_path)
+        fh = open(local_path, 'wb', buffering=self.MIN_FILE_SIZE * 2)  # noqa: SIM115
+        assert isinstance(fh, io.BufferedWriter)
+        self.buffered_writer = fh
+
+    def _flush_buffer(self, finalize: bool = False) -> None:
+        """Flush buffered_writer to storage if it exceeds its minimum size or finalize is True."""
+        if self.buffered_writer is None:
+            return
+        if self.buffered_writer.tell() < self.MIN_FILE_SIZE and not finalize:
+            return
+        self.buffered_writer.flush()
+        os.fsync(self.buffered_writer.fileno())  # needed to force bytes cached by OS to storage
+        self.buffered_writer.close()
+        if finalize:
+            self.buffered_writer = None
+        else:
+            self._reset_buffer()
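The bool-array handling above (packed bits plus recorded byte offsets) can be seen end to end in a small round trip. This is a minimal illustration using an in-memory buffer in place of an inlined_obj_file; reading the data back is the job of the new CellReconstructionNode, which is not shown here:

    import io

    import numpy as np

    buf = io.BytesIO()                     # stand-in for one of the inlined_obj_files
    mask = np.random.rand(4, 1000) > 0.5   # bool array: 4000 elements, 4000 bytes raw

    start = buf.tell()
    np.save(buf, np.packbits(mask), allow_pickle=False)  # ~500 bytes of payload
    end = buf.tell()                       # (start, end) is what the cell metadata records

    buf.seek(start)
    packed = np.load(buf, allow_pickle=False)
    restored = np.unpackbits(packed, count=mask.size).reshape(mask.shape).astype(bool)
    assert np.array_equal(mask, restored)

Because packbits drops the original shape and dtype, the shape and is_bool flag recorded in the cell metadata are what make the reconstruction possible.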