pixeltable 0.4.13__py3-none-any.whl → 0.4.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46)
  1. pixeltable/catalog/catalog.py +179 -63
  2. pixeltable/catalog/column.py +24 -20
  3. pixeltable/catalog/table.py +24 -8
  4. pixeltable/catalog/table_version.py +15 -6
  5. pixeltable/catalog/view.py +22 -22
  6. pixeltable/config.py +2 -0
  7. pixeltable/dataframe.py +3 -2
  8. pixeltable/env.py +42 -21
  9. pixeltable/exec/__init__.py +1 -0
  10. pixeltable/exec/aggregation_node.py +0 -1
  11. pixeltable/exec/cache_prefetch_node.py +74 -98
  12. pixeltable/exec/data_row_batch.py +2 -18
  13. pixeltable/exec/in_memory_data_node.py +1 -1
  14. pixeltable/exec/object_store_save_node.py +299 -0
  15. pixeltable/exec/sql_node.py +28 -33
  16. pixeltable/exprs/data_row.py +31 -25
  17. pixeltable/exprs/json_path.py +6 -5
  18. pixeltable/exprs/row_builder.py +6 -12
  19. pixeltable/functions/gemini.py +1 -1
  20. pixeltable/functions/openai.py +1 -1
  21. pixeltable/functions/video.py +5 -6
  22. pixeltable/globals.py +3 -3
  23. pixeltable/index/embedding_index.py +5 -8
  24. pixeltable/io/fiftyone.py +1 -1
  25. pixeltable/io/label_studio.py +4 -5
  26. pixeltable/iterators/audio.py +1 -1
  27. pixeltable/iterators/document.py +10 -12
  28. pixeltable/iterators/video.py +1 -1
  29. pixeltable/metadata/schema.py +7 -0
  30. pixeltable/plan.py +26 -1
  31. pixeltable/share/packager.py +8 -2
  32. pixeltable/share/publish.py +3 -9
  33. pixeltable/type_system.py +1 -3
  34. pixeltable/utils/dbms.py +31 -5
  35. pixeltable/utils/gcs_store.py +283 -0
  36. pixeltable/utils/local_store.py +316 -0
  37. pixeltable/utils/object_stores.py +497 -0
  38. pixeltable/utils/pytorch.py +5 -6
  39. pixeltable/utils/s3_store.py +354 -0
  40. {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/METADATA +1 -1
  41. {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/RECORD +44 -41
  42. pixeltable/utils/media_store.py +0 -248
  43. pixeltable/utils/s3.py +0 -17
  44. {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/WHEEL +0 -0
  45. {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/entry_points.txt +0 -0
  46. {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/licenses/LICENSE +0 -0
pixeltable/catalog/table_version.py CHANGED
@@ -20,7 +20,7 @@ from pixeltable.iterators import ComponentIterator
 from pixeltable.metadata import schema
 from pixeltable.utils.exception_handler import run_cleanup_on_exception
 from pixeltable.utils.filecache import FileCache
-from pixeltable.utils.media_store import MediaStore
+from pixeltable.utils.object_stores import ObjectOps

 from .tbl_ops import TableOp

@@ -327,7 +327,7 @@ class TableVersion:
         from .table_version_path import TableVersionPath

         # clear out any remaining media files from an aborted previous attempt
-        MediaStore.get().delete(self.id)
+        self.delete_media()
         view_path = TableVersionPath.from_dict(op.load_view_op.view_path)
         plan, _ = Planner.create_view_load_plan(view_path)
         _, row_counts = self.store_tbl.insert_rows(plan, v_min=self.version)
@@ -356,14 +356,23 @@ class TableVersion:
         cat = pxt.catalog.Catalog.get()
         # We're creating a new TableVersion replica, so we should never have seen this particular
         # TableVersion instance before.
-        assert tbl_version.effective_version is not None
-        assert (tbl_version.id, tbl_version.effective_version) not in cat._tbl_versions
+        # Actually this isn't true, because we might be re-creating a dropped replica.
+        # TODO: Understand why old TableVersions are kept around even for a dropped table.
+        # assert tbl_version.effective_version is not None
+        # assert (tbl_version.id, tbl_version.effective_version) not in cat._tbl_versions
         cat._tbl_versions[tbl_version.id, tbl_version.effective_version] = tbl_version
         tbl_version.init()
         tbl_version.store_tbl.create()
         tbl_version.store_tbl.ensure_columns_exist(col for col in tbl_version.cols if col.is_stored)
         return tbl_version

+    def delete_media(self, tbl_version: Optional[int] = None) -> None:
+        # Assemble a set of column destinations and delete objects from all of them
+        # None is a valid column destination which refers to the default object location
+        destinations = {col.destination for col in self.cols if col.is_stored}
+        for dest in destinations:
+            ObjectOps.delete(dest, self.id, tbl_version=tbl_version)
+
     def drop(self) -> None:
         # if self.is_view and self.is_mutable:
         #     # update mutable_views
@@ -374,7 +383,7 @@
         # if self.base.get().is_mutable:
         #     self.base.get().mutable_views.remove(TableVersionHandle.create(self))

-        MediaStore.get().delete(self.id)
+        self.delete_media()
         FileCache.get().clear(tbl_id=self.id)
         self.store_tbl.drop()

@@ -1236,7 +1245,7 @@
         )

         # delete newly-added data
-        MediaStore.get().delete(self.id, tbl_version=self.version)
+        self.delete_media(tbl_version=self.version)
         conn.execute(sql.delete(self.store_tbl.sa_tbl).where(self.store_tbl.sa_tbl.c.v_min == self.version))

         # revert new deletions
pixeltable/catalog/view.py CHANGED
@@ -47,17 +47,13 @@ class View(Table):
         self._tbl_version = tbl_version_path.tbl_version

     def _display_name(self) -> str:
-        name: str
-        if self._tbl_version_path.is_snapshot():
-            name = 'snapshot'
-        elif self._tbl_version_path.is_view():
-            name = 'view'
-        else:
-            assert self._tbl_version_path.is_replica()
-            name = 'table'
         if self._tbl_version_path.is_replica():
-            name = f'{name}-replica'
-        return name
+            return 'replica'
+        if self._tbl_version_path.is_snapshot():
+            return 'snapshot'
+        if self._tbl_version_path.is_view():
+            return 'view'
+        return 'table'

     @classmethod
     def select_list_to_additional_columns(cls, select_list: list[tuple[exprs.Expr, Optional[str]]]) -> dict[str, dict]:
@@ -270,12 +266,12 @@
         # Update name and path with version qualifiers.
         md['name'] = f'{self._name}:{self._tbl_version_path.version()}'
         md['path'] = f'{self._path()}:{self._tbl_version_path.version()}'
-        base_tbl = self._get_base_table()
-        if base_tbl is None:
-            md['base'] = None
-        else:
+        base_tbl_id = self._base_tbl_id
+        if base_tbl_id is not None:
+            base_tbl = self._get_base_table()
+            base_path = '<anonymous base table>' if base_tbl is None else base_tbl._path()
             base_version = self._effective_base_versions[0]
-            md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
+            md['base'] = base_path if base_version is None else f'{base_path}:{base_version}'
         return md

     def insert(
@@ -294,17 +290,21 @@
     def delete(self, where: Optional[exprs.Expr] = None) -> UpdateStatus:
         raise excs.Error(f'{self._display_str()}: Cannot delete from a {self._display_name()}.')

-    def _get_base_table(self) -> Optional['Table']:
+    @property
+    def _base_tbl_id(self) -> Optional[UUID]:
         if self._tbl_version_path.tbl_id != self._id:
             # _tbl_version_path represents a different schema object from this one. This can only happen if this is a
             # named pure snapshot.
-            base_id = self._tbl_version_path.tbl_id
-        elif self._tbl_version_path.base is None:
+            return self._tbl_version_path.tbl_id
+        if self._tbl_version_path.base is None:
             return None
-        else:
-            base_id = self._tbl_version_path.base.tbl_id
-        with catalog.Catalog.get().begin_xact(tbl_id=base_id, for_write=False):
-            return catalog.Catalog.get().get_table_by_id(base_id)
+        return self._tbl_version_path.base.tbl_id
+
+    def _get_base_table(self) -> Optional['Table']:
+        """Returns None if there is no base table, or if the base table is hidden."""
+        base_tbl_id = self._base_tbl_id
+        with catalog.Catalog.get().begin_xact(tbl_id=base_tbl_id, for_write=False):
+            return catalog.Catalog.get().get_table_by_id(base_tbl_id)

     @property
     def _effective_base_versions(self) -> list[Optional[int]]:
pixeltable/config.py CHANGED
@@ -161,6 +161,8 @@ KNOWN_CONFIG_OPTIONS = {
         'hide_warnings': 'Hide warnings from the console',
         'verbosity': 'Verbosity level for console output',
         'api_key': 'API key for Pixeltable cloud',
+        'r2_profile': 'AWS config profile name used to access R2 storage',
+        's3_profile': 'AWS config profile name used to access S3 storage',
     },
    'anthropic': {'api_key': 'Anthropic API key'},
    'bedrock': {'api_key': 'AWS Bedrock API key'},
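The new keys register profile names for object storage. A minimal, illustrative sketch of reading them back through the `Config.get().get_value(name, type, section=...)` call that appears in the env.py hunk further down; the section name `'pixeltable'` is an assumption inferred from the neighboring `api_key`/`verbosity` keys, not something this diff states:

```python
# Hypothetical usage sketch, not part of the release. Assumes the new options sit in
# the 'pixeltable' section of $PIXELTABLE_HOME/config.toml (or the matching env vars).
from pixeltable.config import Config

s3_profile = Config.get().get_value('s3_profile', str, section='pixeltable')
r2_profile = Config.get().get_value('r2_profile', str, section='pixeltable')
print(f's3_profile={s3_profile!r}, r2_profile={r2_profile!r}')  # None if not configured
```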
pixeltable/dataframe.py CHANGED
@@ -1276,10 +1276,11 @@ class DataFrame:

         # TODO: Reconcile these with Table.__check_mutable()
         assert len(self._from_clause.tbls) == 1
-        if self._first_tbl.is_snapshot():
-            raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
+        # First check if it's a replica, since every replica handle is also a snapshot
         if self._first_tbl.is_replica():
             raise excs.Error(f'Cannot use `{op_name}` on a replica.')
+        if self._first_tbl.is_snapshot():
+            raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')

     def _validate_mutable_op_sequence(self, op_name: str, allow_select: bool) -> None:
         """Tests whether the sequence of operations on this DataFrame is valid for a mutation operation."""
pixeltable/env.py CHANGED
@@ -28,6 +28,7 @@ import nest_asyncio  # type: ignore[import-untyped]
 import pixeltable_pgserver
 import sqlalchemy as sql
 from pillow_heif import register_heif_opener  # type: ignore[import-untyped]
+from sqlalchemy import orm
 from tenacity import retry, stop_after_attempt, wait_exponential_jitter
 from tqdm import TqdmWarning

@@ -36,6 +37,7 @@ from pixeltable.config import Config
 from pixeltable.utils.console_output import ConsoleLogger, ConsoleMessageFilter, ConsoleOutputHandler, map_level
 from pixeltable.utils.dbms import CockroachDbms, Dbms, PostgresqlDbms
 from pixeltable.utils.http_server import make_server
+from pixeltable.utils.object_stores import ObjectPath, StorageObjectAddress

 if TYPE_CHECKING:
     import spacy
@@ -58,7 +60,8 @@ class Env:
     _log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'

     _media_dir: Optional[Path]
-    _file_cache_dir: Optional[Path]  # cached media files with external URL
+    _object_soa: Optional[StorageObjectAddress]
+    _file_cache_dir: Optional[Path]  # cached object files with external URL
     _dataset_cache_dir: Optional[Path]  # cached datasets (eg, pytorch or COCO)
     _log_dir: Optional[Path]  # log files
     _tmp_dir: Optional[Path]  # any tmp files
@@ -88,7 +91,7 @@ class Env:

     _resource_pool_info: dict[str, Any]
     _current_conn: Optional[sql.Connection]
-    _current_session: Optional[sql.orm.Session]
+    _current_session: Optional[orm.Session]
     _current_isolation_level: Optional[Literal['REPEATABLE_READ', 'SERIALIZABLE']]
     _dbms: Optional[Dbms]
     _event_loop: Optional[asyncio.AbstractEventLoop]  # event loop for ExecNode
@@ -120,7 +123,8 @@ class Env:
         assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'

         self._media_dir = None  # computed media files
-        self._file_cache_dir = None  # cached media files with external URL
+        self._object_soa = None  # computed object files in StorageObjectAddress format
+        self._file_cache_dir = None  # cached object files with external URL
         self._dataset_cache_dir = None  # cached datasets (eg, pytorch or COCO)
         self._log_dir = None  # log files
         self._tmp_dir = None  # any tmp files
@@ -224,7 +228,7 @@ class Env:
         return self._current_conn

     @property
-    def session(self) -> Optional[sql.orm.Session]:
+    def session(self) -> Optional[orm.Session]:
         assert self._current_session is not None
         return self._current_session

@@ -258,7 +262,7 @@ class Env:
         self._current_isolation_level = 'SERIALIZABLE'
         with (
             self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
-            sql.orm.Session(conn) as session,
+            orm.Session(conn) as session,
             conn.begin(),
         ):
             self._current_conn = conn
@@ -363,6 +367,7 @@

         if not self._media_dir.exists():
             self._media_dir.mkdir()
+        self._object_soa = ObjectPath.parse_object_storage_addr(str(self._media_dir), may_contain_object_name=False)
         if not self._file_cache_dir.exists():
             self._file_cache_dir.mkdir()
         if not self._dataset_cache_dir.exists():
@@ -615,15 +620,17 @@
         Args:
        - name: The name of the client
         """
-        cl = _registered_clients[name]
-        if cl.client_obj is not None:
-            return cl.client_obj  # Already initialized
-
-        # Construct a client, retrieving each parameter from config.
+        # Return the existing client if it has already been constructed
+        with _registered_clients_lock:
+            cl = _registered_clients[name]
+            if cl.client_obj is not None:
+                return cl.client_obj  # Already initialized

+        # Retrieve parameters required to construct the requested client.
         init_kwargs: dict[str, Any] = {}
         for param in cl.params.values():
             # Determine the type of the parameter for proper config parsing.
+            pname = param.name
             t = param.annotation
             # Deference Optional[T]
             if typing.get_origin(t) in (typing.Union, types.UnionType):
@@ -633,27 +640,31 @@
                 elif args[1] is type(None):
                     t = args[0]
             assert isinstance(t, type), t
-            arg: Any = Config.get().get_value(param.name, t, section=name)
+            arg: Any = Config.get().get_value(pname, t, section=name)
             if arg is not None:
-                init_kwargs[param.name] = arg
+                init_kwargs[pname] = arg
             elif param.default is inspect.Parameter.empty:
                 raise excs.Error(
-                    f'`{name}` client not initialized: parameter `{param.name}` is not configured.\n'
-                    f'To fix this, specify the `{name.upper()}_{param.name.upper()}` environment variable, '
-                    f'or put `{param.name.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
+                    f'`{name}` client not initialized: parameter `{pname}` is not configured.\n'
+                    f'To fix this, specify the `{name.upper()}_{pname.upper()}` environment variable, '
+                    f'or put `{pname.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
                )

-        cl.client_obj = cl.init_fn(**init_kwargs)
-        self._logger.info(f'Initialized `{name}` client.')
-        return cl.client_obj
+        # Construct the requested client
+        with _registered_clients_lock:
+            if cl.client_obj is not None:
+                return cl.client_obj  # Already initialized
+            cl.client_obj = cl.init_fn(**init_kwargs)
+            self._logger.info(f'Initialized `{name}` client with parameters: {init_kwargs}.')
+            return cl.client_obj

     def _start_web_server(self) -> None:
         """
         The http server root is the file system root.
         eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
         On Windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
-        This arrangement enables serving media hosted within _home,
-        as well as external media inserted into pixeltable or produced by pixeltable.
+        This arrangement enables serving objects hosted within _home,
+        as well as external objects inserted into pixeltable or produced by pixeltable.
         The port is chosen dynamically to prevent conflicts.
         """
         # Port 0 means OS picks one for us.
@@ -713,10 +724,12 @@
     def __register_packages(self) -> None:
         """Declare optional packages that are utilized by some parts of the code."""
         self.__register_package('anthropic')
+        self.__register_package('azure.storage.blob', library_name='azure-storage-blob')
         self.__register_package('boto3')
         self.__register_package('datasets')
         self.__register_package('fiftyone')
         self.__register_package('fireworks', library_name='fireworks-ai')
+        self.__register_package('google.cloud.storage', library_name='google-cloud-storage')
         self.__register_package('google.genai', library_name='google-genai')
         self.__register_package('groq')
         self.__register_package('huggingface_hub', library_name='huggingface-hub')
@@ -815,6 +828,12 @@
         assert self._media_dir is not None
         return self._media_dir

+    @property
+    def object_soa(self) -> StorageObjectAddress:
+        assert self._media_dir is not None
+        assert self._object_soa is not None
+        return self._object_soa
+
     @property
     def file_cache_dir(self) -> Path:
         assert self._file_cache_dir is not None
@@ -947,11 +966,13 @@ def register_client(name: str) -> Callable:
     def decorator(fn: Callable) -> None:
         sig = inspect.signature(fn)
         params = dict(sig.parameters)
-        _registered_clients[name] = ApiClient(init_fn=fn, params=params)
+        with _registered_clients_lock:
+            _registered_clients[name] = ApiClient(init_fn=fn, params=params)

     return decorator


+_registered_clients_lock: threading.Lock = threading.Lock()
 _registered_clients: dict[str, ApiClient] = {}

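The reworked client lookup amounts to double-checked locking: the lock is held only to read or publish the cached client object, while the slow parameter lookup runs unlocked and the cache is re-checked before construction. A stripped-down sketch of the same pattern with hypothetical names (not Pixeltable's API):

```python
# Minimal double-checked lazy initialization sketch, assuming a single shared client.
import threading
from typing import Any, Callable, Optional

_lock = threading.Lock()
_client: Optional[Any] = None

def get_or_create(init_fn: Callable[..., Any], gather_params: Callable[[], dict]) -> Any:
    """Lazily construct a shared client without holding the lock during slow work."""
    global _client
    with _lock:                  # fast path: client already constructed
        if _client is not None:
            return _client
    kwargs = gather_params()     # slow config lookups happen outside the lock
    with _lock:                  # re-check: another thread may have won the race
        if _client is None:
            _client = init_fn(**kwargs)
        return _client
```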
 
pixeltable/exec/__init__.py CHANGED
@@ -8,5 +8,6 @@ from .exec_context import ExecContext
 from .exec_node import ExecNode
 from .expr_eval import ExprEvalNode
 from .in_memory_data_node import InMemoryDataNode
+from .object_store_save_node import ObjectStoreSaveNode
 from .row_update_node import RowUpdateNode
 from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlSampleNode, SqlScanNode
pixeltable/exec/aggregation_node.py CHANGED
@@ -103,6 +103,5 @@ class AggregationNode(ExecNode):
             self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
             self.output_batch.add_row(prev_row)

-        self.output_batch.flush_imgs(None, self.row_builder.stored_img_cols, self.flushed_img_slots)
         _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
         yield self.output_batch
pixeltable/exec/cache_prefetch_node.py CHANGED
@@ -9,12 +9,12 @@ import urllib.request
 from collections import deque
 from concurrent import futures
 from pathlib import Path
-from typing import Any, AsyncIterator, Iterator, Optional
+from typing import AsyncIterator, Iterator, Optional
 from uuid import UUID

 from pixeltable import exceptions as excs, exprs
 from pixeltable.utils.filecache import FileCache
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.object_stores import ObjectOps

 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
@@ -26,16 +26,17 @@ class CachePrefetchNode(ExecNode):
     """Brings files with external URLs into the cache

     TODO:
-    - adapting the number of download threads at runtime to maximize throughput
+    - Process a row at a time and limit the number of in-flight rows to control memory usage
+    - Create asyncio.Tasks to consume our input in order to increase concurrency.
     """

+    QUEUE_DEPTH_HIGH_WATER = 50  # target number of in-flight requests
+    QUEUE_DEPTH_LOW_WATER = 20  # target number of in-flight requests
     BATCH_SIZE = 16
-    NUM_EXECUTOR_THREADS = 16
+    MAX_WORKERS = 15

     retain_input_order: bool  # if True, return rows in the exact order they were received
     file_col_info: list[exprs.ColumnSlotIdx]
-    boto_client: Optional[Any]
-    boto_client_lock: threading.Lock

     # execution state
     num_returned_rows: int
@@ -64,10 +65,6 @@ class CachePrefetchNode(ExecNode):
         self.retain_input_order = retain_input_order
         self.file_col_info = file_col_info

-        # clients for specific services are constructed as needed, because it's time-consuming
-        self.boto_client = None
-        self.boto_client_lock = threading.Lock()
-
         self.num_returned_rows = 0
         self.ready_rows = deque()
         self.in_flight_rows = {}
@@ -75,24 +72,42 @@ class CachePrefetchNode(ExecNode):
         self.in_flight_urls = {}
         self.input_finished = False
         self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+        assert self.QUEUE_DEPTH_HIGH_WATER > self.QUEUE_DEPTH_LOW_WATER

-    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
-        input_iter = self.input.__aiter__()
-        with futures.ThreadPoolExecutor(max_workers=self.NUM_EXECUTOR_THREADS) as executor:
-            # we create enough in-flight requests to fill the first batch
-            while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
-                await self.__submit_input_batch(input_iter, executor)
+    @property
+    def queued_work(self) -> int:
+        return len(self.in_flight_requests)

-            while True:
-                # try to assemble a full batch of output rows
-                if not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
-                    self.__wait_for_requests()
-
-                # try to create enough in-flight requests to fill the next batch
-                while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
-                    await self.__submit_input_batch(input_iter, executor)
+    async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> Optional[DataRowBatch]:
+        """Get the next batch of input rows, or None if there are no more rows"""
+        try:
+            input_batch = await anext(input_iter)
+            if input_batch is None:
+                self.input_finished = True
+            return input_batch
+        except StopAsyncIteration:
+            self.input_finished = True
+            return None

-                if len(self.ready_rows) > 0:
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        input_iter = aiter(self.input)
+        with futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            while True:
+                # Create work to fill the queue to the high water mark ... ?without overrunning the in-flight row limit.
+                while not self.input_finished and self.queued_work < self.QUEUE_DEPTH_HIGH_WATER:
+                    input_batch = await self.get_input_batch(input_iter)
+                    if input_batch is not None:
+                        self.__process_input_batch(input_batch, executor)
+
+                # Wait for enough completions to enable more queueing or if we're done
+                while self.queued_work > self.QUEUE_DEPTH_LOW_WATER or (self.input_finished and self.queued_work > 0):
+                    done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+                    self.__process_completions(done, ignore_errors=self.ctx.ignore_errors)
+
+                # Emit results to meet batch size requirements or empty the in-flight row queue
+                if self.__has_ready_batch() or (
+                    len(self.ready_rows) > 0 and self.input_finished and self.queued_work == 0
+                ):
                     # create DataRowBatch from the first BATCH_SIZE ready rows
                     batch = DataRowBatch(self.row_builder)
                     rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
@@ -103,22 +118,15 @@ class CachePrefetchNode(ExecNode):
                     _logger.debug(f'returning {len(rows)} rows')
                     yield batch

-                if self.input_finished and self.__num_pending_rows() == 0:
+                if self.input_finished and self.queued_work == 0 and len(self.ready_rows) == 0:
                     return

-    def __num_pending_rows(self) -> int:
-        return len(self.in_flight_rows) + len(self.ready_rows)
-
     def __has_ready_batch(self) -> bool:
         """True if there are >= BATCH_SIZES entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
         return (
             sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
         )

-    def __ready_prefix_len(self) -> int:
-        """Length of the non-None prefix of ready_rows (= what we can return right now)"""
-        return sum(1 for _ in itertools.takewhile(lambda x: x is not None, self.ready_rows))
-
     def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
         if row_idx is None:
             self.ready_rows.append(row)
@@ -129,50 +137,36 @@ class CachePrefetchNode(ExecNode):
             self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
         self.ready_rows[idx] = row

-    def __wait_for_requests(self) -> None:
-        """Wait for in-flight requests to complete until we have a full batch of rows"""
+    def __process_completions(self, done: set[futures.Future], ignore_errors: bool) -> None:
         file_cache = FileCache.get()
-        _logger.debug(f'waiting for requests; ready_batch_size={self.__ready_prefix_len()}')
-        while not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
-            done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
-            for f in done:
-                url = self.in_flight_requests.pop(f)
-                tmp_path, exc = f.result()
-                local_path: Optional[Path] = None
-                if tmp_path is not None:
-                    # register the file with the cache for the first column in which it's missing
-                    assert url in self.in_flight_urls
-                    _, info = self.in_flight_urls[url][0]
-                    local_path = file_cache.add(info.col.tbl.id, info.col.id, url, tmp_path)
-                    _logger.debug(f'cached {url} as {local_path}')
-
-                # add the local path/exception to the slots that reference the url
-                for row, info in self.in_flight_urls.pop(url):
-                    if exc is not None:
-                        self.row_builder.set_exc(row, info.slot_idx, exc)
-                    else:
-                        assert local_path is not None
-                        row.set_file_path(info.slot_idx, str(local_path))
-                    state = self.in_flight_rows[id(row)]
-                    state.num_missing -= 1
-                    if state.num_missing == 0:
-                        del self.in_flight_rows[id(row)]
-                        self.__add_ready_row(row, state.idx)
-                        _logger.debug(f'row {state.idx} is ready (ready_batch_size={self.__ready_prefix_len()})')
-
-    async def __submit_input_batch(
-        self, input: AsyncIterator[DataRowBatch], executor: futures.ThreadPoolExecutor
-    ) -> None:
-        assert not self.input_finished
-        input_batch: Optional[DataRowBatch]
-        try:
-            input_batch = await anext(input)
-        except StopAsyncIteration:
-            input_batch = None
-        if input_batch is None:
-            self.input_finished = True
-            return
-
+        for f in done:
+            url = self.in_flight_requests.pop(f)
+            tmp_path, exc = f.result()
+            if exc is not None and not ignore_errors:
+                raise exc
+            local_path: Optional[Path] = None
+            if tmp_path is not None:
+                # register the file with the cache for the first column in which it's missing
+                assert url in self.in_flight_urls
+                _, info = self.in_flight_urls[url][0]
+                local_path = file_cache.add(info.col.tbl.id, info.col.id, url, tmp_path)
+                _logger.debug(f'cached {url} as {local_path}')
+
+            # add the local path/exception to the slots that reference the url
+            for row, info in self.in_flight_urls.pop(url):
+                if exc is not None:
+                    self.row_builder.set_exc(row, info.slot_idx, exc)
+                else:
+                    assert local_path is not None
+                    row.set_file_path(info.slot_idx, str(local_path))
+                state = self.in_flight_rows[id(row)]
+                state.num_missing -= 1
+                if state.num_missing == 0:
+                    del self.in_flight_rows[id(row)]
+                    self.__add_ready_row(row, state.idx)
+
+    def __process_input_batch(self, input_batch: DataRowBatch, executor: futures.ThreadPoolExecutor) -> None:
+        """Process a batch of input rows, submitting URLs for download and adding ready rows to ready_rows"""
         file_cache = FileCache.get()

         # URLs from this input batch that aren't already in the file cache;
@@ -180,7 +174,7 @@
         # the time it takes to get the next batch together
         cache_misses: list[str] = []

-        url_pos: dict[str, int] = {}  # url -> row_idx; used for logging
+        url_pos: dict[str, Optional[int]] = {}  # url -> row_idx; used for logging
         for row in input_batch:
             # identify missing local files in input batch, or fill in their paths if they're already cached
             num_missing = 0
@@ -221,6 +215,8 @@

     def __fetch_url(self, url: str) -> tuple[Optional[Path], Optional[Exception]]:
         """Fetches a remote URL into the TempStore and returns its path"""
+        from pixeltable.utils.local_store import TempStore
+
         _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
         parsed = urllib.parse.urlparse(url)
         # Use len(parsed.scheme) > 1 here to ensure we're not being passed
@@ -234,31 +230,11 @@
         tmp_path = TempStore.create_path(extension=extension)
         try:
             _logger.debug(f'Downloading {url} to {tmp_path}')
-            if parsed.scheme == 's3':
-                from pixeltable.utils.s3 import get_client
-
-                with self.boto_client_lock:
-                    if self.boto_client is None:
-                        config = {
-                            'max_pool_connections': self.NUM_EXECUTOR_THREADS + 4,  # +4: leave some headroom
-                            'connect_timeout': 5,
-                            'read_timeout': 30,
-                            'retries': {'max_attempts': 3, 'mode': 'adaptive'},
-                        }
-                        self.boto_client = get_client(**config)
-                self.boto_client.download_file(parsed.netloc, parsed.path.lstrip('/'), str(tmp_path))
-            elif parsed.scheme in ('http', 'https'):
-                with urllib.request.urlopen(url) as resp, open(tmp_path, 'wb') as f:
-                    data = resp.read()
-                    f.write(data)
-            else:
-                raise AssertionError(f'Unsupported URL scheme: {parsed.scheme}')
+            ObjectOps.copy_object_to_local_file(url, tmp_path)
             _logger.debug(f'Downloaded {url} to {tmp_path}')
             return tmp_path, None
         except Exception as e:
             # we want to add the file url to the exception message
             exc = excs.Error(f'Failed to download {url}: {e}')
             _logger.debug(f'Failed to download {url}: {e}', exc_info=e)
-            if not self.ctx.ignore_errors:
-                raise exc from None  # suppress original exception
             return None, exc
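The rewritten control flow is essentially a high/low water mark pump around a `ThreadPoolExecutor`: keep submitting downloads until roughly `QUEUE_DEPTH_HIGH_WATER` futures are outstanding, then block on `futures.wait(..., FIRST_COMPLETED)` until the backlog drops below `QUEUE_DEPTH_LOW_WATER`. A self-contained sketch of the same pattern, simplified and with hypothetical `jobs`/`work_fn` stand-ins for the node's URL submissions:

```python
# Illustrative only; mirrors the loop structure above without the row/cache bookkeeping.
from concurrent import futures
from typing import Callable, Iterable, Iterator, TypeVar

T = TypeVar('T')
R = TypeVar('R')

HIGH_WATER, LOW_WATER, MAX_WORKERS = 50, 20, 15

def pump(jobs: Iterable[T], work_fn: Callable[[T], R]) -> Iterator[R]:
    """Yield results while keeping roughly 20-50 requests in flight."""
    job_iter = iter(jobs)
    in_flight: set[futures.Future] = set()
    exhausted = False
    with futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        while True:
            # top up to the high water mark (or until the input runs dry)
            while not exhausted and len(in_flight) < HIGH_WATER:
                try:
                    in_flight.add(executor.submit(work_fn, next(job_iter)))
                except StopIteration:
                    exhausted = True
            # drain completions until we can submit again, or until everything is done
            while len(in_flight) > LOW_WATER or (exhausted and in_flight):
                done, in_flight = futures.wait(in_flight, return_when=futures.FIRST_COMPLETED)
                for f in done:
                    yield f.result()
            if exhausted and not in_flight:
                return

# usage sketch: for local_path in pump(urls, download_one): ...
```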
pixeltable/exec/data_row_batch.py CHANGED
@@ -12,15 +12,14 @@ class DataRowBatch:
     """Set of DataRows, indexed by rowid.

     Contains the metadata needed to initialize DataRows.
+
+    Requires either num_rows or rows to be specified, but not both.
     """

     row_builder: exprs.RowBuilder
     rows: list[exprs.DataRow]

     def __init__(self, row_builder: exprs.RowBuilder, rows: Optional[list[exprs.DataRow]] = None):
-        """
-        Requires either num_rows or rows to be specified, but not both.
-        """
         self.row_builder = row_builder
         self.rows = [] if rows is None else rows

@@ -39,20 +38,5 @@
     def __getitem__(self, index: int) -> exprs.DataRow:
         return self.rows[index]

-    def flush_imgs(
-        self, idx_range: Optional[slice], stored_img_info: list[exprs.ColumnSlotIdx], flushed_img_slots: list[int]
-    ) -> None:
-        """Flushes images in the given range of rows."""
-        if len(stored_img_info) == 0 and len(flushed_img_slots) == 0:
-            return
-
-        if idx_range is None:
-            idx_range = slice(0, len(self.rows))
-        for row in self.rows[idx_range]:
-            for info in stored_img_info:
-                row.flush_img(info.slot_idx, info.col)
-            for slot_idx in flushed_img_slots:
-                row.flush_img(slot_idx)
-
     def __iter__(self) -> Iterator[exprs.DataRow]:
         return iter(self.rows)