pixeltable 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (147)
  1. pixeltable/__init__.py +64 -11
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -1
  4. pixeltable/catalog/catalog.py +50 -27
  5. pixeltable/catalog/column.py +27 -11
  6. pixeltable/catalog/dir.py +6 -4
  7. pixeltable/catalog/globals.py +8 -1
  8. pixeltable/catalog/insertable_table.py +22 -12
  9. pixeltable/catalog/named_function.py +10 -6
  10. pixeltable/catalog/path.py +3 -2
  11. pixeltable/catalog/path_dict.py +8 -6
  12. pixeltable/catalog/schema_object.py +2 -1
  13. pixeltable/catalog/table.py +121 -101
  14. pixeltable/catalog/table_version.py +291 -142
  15. pixeltable/catalog/table_version_path.py +8 -5
  16. pixeltable/catalog/view.py +67 -26
  17. pixeltable/dataframe.py +102 -72
  18. pixeltable/env.py +20 -21
  19. pixeltable/exec/__init__.py +2 -2
  20. pixeltable/exec/aggregation_node.py +10 -4
  21. pixeltable/exec/cache_prefetch_node.py +5 -3
  22. pixeltable/exec/component_iteration_node.py +9 -8
  23. pixeltable/exec/data_row_batch.py +21 -10
  24. pixeltable/exec/exec_context.py +10 -3
  25. pixeltable/exec/exec_node.py +23 -12
  26. pixeltable/exec/expr_eval/evaluators.py +13 -7
  27. pixeltable/exec/expr_eval/expr_eval_node.py +24 -15
  28. pixeltable/exec/expr_eval/globals.py +30 -7
  29. pixeltable/exec/expr_eval/row_buffer.py +5 -6
  30. pixeltable/exec/expr_eval/schedulers.py +151 -31
  31. pixeltable/exec/in_memory_data_node.py +8 -7
  32. pixeltable/exec/row_update_node.py +15 -5
  33. pixeltable/exec/sql_node.py +56 -27
  34. pixeltable/exprs/__init__.py +2 -2
  35. pixeltable/exprs/arithmetic_expr.py +57 -26
  36. pixeltable/exprs/array_slice.py +1 -1
  37. pixeltable/exprs/column_property_ref.py +2 -1
  38. pixeltable/exprs/column_ref.py +20 -15
  39. pixeltable/exprs/comparison.py +6 -2
  40. pixeltable/exprs/compound_predicate.py +1 -3
  41. pixeltable/exprs/data_row.py +2 -2
  42. pixeltable/exprs/expr.py +101 -72
  43. pixeltable/exprs/expr_dict.py +2 -1
  44. pixeltable/exprs/expr_set.py +3 -1
  45. pixeltable/exprs/function_call.py +39 -41
  46. pixeltable/exprs/globals.py +1 -0
  47. pixeltable/exprs/in_predicate.py +2 -2
  48. pixeltable/exprs/inline_expr.py +20 -17
  49. pixeltable/exprs/json_mapper.py +4 -2
  50. pixeltable/exprs/json_path.py +12 -18
  51. pixeltable/exprs/literal.py +5 -9
  52. pixeltable/exprs/method_ref.py +1 -0
  53. pixeltable/exprs/object_ref.py +1 -1
  54. pixeltable/exprs/row_builder.py +32 -17
  55. pixeltable/exprs/rowid_ref.py +14 -5
  56. pixeltable/exprs/similarity_expr.py +11 -6
  57. pixeltable/exprs/sql_element_cache.py +1 -1
  58. pixeltable/exprs/type_cast.py +24 -9
  59. pixeltable/ext/__init__.py +1 -0
  60. pixeltable/ext/functions/__init__.py +1 -0
  61. pixeltable/ext/functions/whisperx.py +2 -2
  62. pixeltable/ext/functions/yolox.py +11 -11
  63. pixeltable/func/aggregate_function.py +17 -13
  64. pixeltable/func/callable_function.py +6 -6
  65. pixeltable/func/expr_template_function.py +15 -14
  66. pixeltable/func/function.py +16 -16
  67. pixeltable/func/function_registry.py +11 -8
  68. pixeltable/func/globals.py +4 -2
  69. pixeltable/func/query_template_function.py +12 -13
  70. pixeltable/func/signature.py +18 -9
  71. pixeltable/func/tools.py +10 -17
  72. pixeltable/func/udf.py +106 -11
  73. pixeltable/functions/__init__.py +21 -2
  74. pixeltable/functions/anthropic.py +16 -12
  75. pixeltable/functions/fireworks.py +63 -5
  76. pixeltable/functions/gemini.py +13 -3
  77. pixeltable/functions/globals.py +18 -6
  78. pixeltable/functions/huggingface.py +20 -38
  79. pixeltable/functions/image.py +7 -3
  80. pixeltable/functions/json.py +1 -0
  81. pixeltable/functions/llama_cpp.py +1 -4
  82. pixeltable/functions/mistralai.py +31 -20
  83. pixeltable/functions/ollama.py +4 -18
  84. pixeltable/functions/openai.py +201 -108
  85. pixeltable/functions/replicate.py +11 -10
  86. pixeltable/functions/string.py +70 -7
  87. pixeltable/functions/timestamp.py +21 -8
  88. pixeltable/functions/together.py +66 -52
  89. pixeltable/functions/video.py +1 -0
  90. pixeltable/functions/vision.py +14 -11
  91. pixeltable/functions/whisper.py +2 -1
  92. pixeltable/globals.py +60 -26
  93. pixeltable/index/__init__.py +1 -1
  94. pixeltable/index/btree.py +5 -3
  95. pixeltable/index/embedding_index.py +15 -14
  96. pixeltable/io/__init__.py +1 -1
  97. pixeltable/io/external_store.py +30 -25
  98. pixeltable/io/fiftyone.py +6 -14
  99. pixeltable/io/globals.py +33 -27
  100. pixeltable/io/hf_datasets.py +2 -1
  101. pixeltable/io/label_studio.py +77 -68
  102. pixeltable/io/pandas.py +33 -9
  103. pixeltable/io/parquet.py +9 -12
  104. pixeltable/iterators/__init__.py +1 -0
  105. pixeltable/iterators/audio.py +205 -0
  106. pixeltable/iterators/document.py +19 -8
  107. pixeltable/iterators/image.py +6 -24
  108. pixeltable/iterators/string.py +3 -6
  109. pixeltable/iterators/video.py +1 -7
  110. pixeltable/metadata/__init__.py +7 -1
  111. pixeltable/metadata/converters/convert_10.py +2 -2
  112. pixeltable/metadata/converters/convert_15.py +1 -5
  113. pixeltable/metadata/converters/convert_16.py +2 -4
  114. pixeltable/metadata/converters/convert_17.py +2 -4
  115. pixeltable/metadata/converters/convert_18.py +2 -4
  116. pixeltable/metadata/converters/convert_19.py +2 -5
  117. pixeltable/metadata/converters/convert_20.py +1 -4
  118. pixeltable/metadata/converters/convert_21.py +4 -6
  119. pixeltable/metadata/converters/convert_22.py +1 -0
  120. pixeltable/metadata/converters/convert_23.py +5 -5
  121. pixeltable/metadata/converters/convert_24.py +12 -13
  122. pixeltable/metadata/converters/convert_26.py +23 -0
  123. pixeltable/metadata/converters/util.py +3 -4
  124. pixeltable/metadata/notes.py +1 -0
  125. pixeltable/metadata/schema.py +13 -2
  126. pixeltable/plan.py +173 -98
  127. pixeltable/store.py +42 -26
  128. pixeltable/type_system.py +62 -54
  129. pixeltable/utils/arrow.py +1 -2
  130. pixeltable/utils/coco.py +16 -17
  131. pixeltable/utils/code.py +1 -1
  132. pixeltable/utils/console_output.py +6 -3
  133. pixeltable/utils/description_helper.py +7 -7
  134. pixeltable/utils/documents.py +3 -1
  135. pixeltable/utils/filecache.py +12 -7
  136. pixeltable/utils/http_server.py +9 -8
  137. pixeltable/utils/media_store.py +2 -1
  138. pixeltable/utils/pytorch.py +11 -14
  139. pixeltable/utils/s3.py +1 -0
  140. pixeltable/utils/sql.py +1 -0
  141. pixeltable/utils/transactional_directory.py +2 -2
  142. {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/METADATA +6 -8
  143. pixeltable-0.3.3.dist-info/RECORD +163 -0
  144. pixeltable-0.3.2.dist-info/RECORD +0 -161
  145. {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/LICENSE +0 -0
  146. {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/WHEEL +0 -0
  147. {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/entry_points.txt +0 -0
pixeltable/env.py CHANGED
@@ -1,6 +1,5 @@
  from __future__ import annotations
 
- from abc import abstractmethod
  import datetime
  import glob
  import http.server
@@ -16,6 +15,7 @@ import sys
  import threading
  import uuid
  import warnings
+ from abc import abstractmethod
  from dataclasses import dataclass, field
  from pathlib import Path
  from sys import stdout
@@ -375,6 +375,7 @@ class Env:
 
  if create_db:
  from pixeltable.metadata import schema
+
  schema.base_metadata.create_all(self._sa_engine)
  metadata.create_system_info(self._sa_engine)
 
@@ -387,11 +388,7 @@ class Env:
  def _create_engine(self, time_zone_name: Optional[str], echo: bool = False) -> None:
  connect_args = {} if time_zone_name is None else {'options': f'-c timezone={time_zone_name}'}
  self._sa_engine = sql.create_engine(
- self.db_url,
- echo=echo,
- future=True,
- isolation_level='REPEATABLE READ',
- connect_args=connect_args,
+ self.db_url, echo=echo, future=True, isolation_level='REPEATABLE READ', connect_args=connect_args
  )
  self._logger.info(f'Created SQLAlchemy engine at: {self.db_url}')
  with self.engine.begin() as conn:
@@ -424,7 +421,7 @@ class Env:
  with engine.begin() as conn:
  # use C collation to get standard C/Python-style sorting
  stmt = (
- f"CREATE DATABASE {preparer.quote(self._db_name)} "
+ f'CREATE DATABASE {preparer.quote(self._db_name)} '
  "ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0"
  )
  conn.execute(sql.text(stmt))
@@ -448,12 +445,12 @@ class Env:
  try:
  with engine.begin() as conn:
  # terminate active connections
- stmt = (f"""
+ stmt = f"""
  SELECT pg_terminate_backend(pg_stat_activity.pid)
  FROM pg_stat_activity
  WHERE pg_stat_activity.datname = '{self._db_name}'
  AND pid <> pg_backend_pid()
- """)
+ """
  conn.execute(sql.text(stmt))
  # drop db
  stmt = f'DROP DATABASE {preparer.quote(self._db_name)}'
@@ -563,7 +560,7 @@ class Env:
  is_installed = False
  self.__optional_packages[package_name] = PackageInfo(
  is_installed=is_installed,
- library_name=library_name or package_name # defaults to package_name unless specified otherwise
+ library_name=library_name or package_name, # defaults to package_name unless specified otherwise
  )
 
  def require_package(self, package_name: str, min_version: Optional[list[int]] = None) -> None:
@@ -609,6 +606,7 @@ class Env:
  """
  import spacy
  from spacy.cli.download import get_model_filename
+
  spacy_model = 'en_core_web_sm'
  spacy_model_version = '3.7.1'
  filename = get_model_filename(spacy_model, spacy_model_version, sdist=False)
@@ -626,7 +624,7 @@ class Env:
  self._logger.warn(f'Failed to load spaCy model: {spacy_model}', exc_info=exc)
  warnings.warn(
  f"Failed to load spaCy model '{spacy_model}'. spaCy features will not be available.",
- excs.PixeltableWarning
+ excs.PixeltableWarning,
  )
  self.__optional_packages['spacy'].is_installed = False
 
@@ -636,8 +634,7 @@ class Env:
  def create_tmp_path(self, extension: str = '') -> Path:
  return self._tmp_dir / f'{uuid.uuid4()}{extension}'
 
-
- #def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
+ # def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
  def get_resource_pool_info(self, pool_id: str, make_pool_info: Optional[Callable[[], T]] = None) -> T:
  """Returns the info object for the given id, creating it if necessary."""
  info = self._resource_pool_info.get(pool_id)
@@ -707,6 +704,7 @@ def register_client(name: str) -> Callable:
  Args:
  - name (str): The name of the API client (e.g., 'openai' or 'label-studio').
  """
+
  def decorator(fn: Callable) -> None:
  global _registered_clients
  sig = inspect.signature(fn)
@@ -721,6 +719,7 @@ class Config:
  The (global) Pixeltable configuration, as loaded from `config.toml`. Provides methods for retrieving
  configuration values, which can be set in the config file or as environment variables.
  """
+
  __config: dict[str, Any]
 
  @classmethod
@@ -750,12 +749,7 @@ class Config:
  free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
  # Default cache size is 1/5 of free disk space
  file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
- return {
- 'pixeltable': {
- 'file_cache_size_g': round(file_cache_size_g, 1),
- 'hide_warnings': False,
- }
- }
+ return {'pixeltable': {'file_cache_size_g': round(file_cache_size_g, 1), 'hide_warnings': False}}
 
  def __init__(self, config: dict[str, Any]) -> None:
  self.__config = config
@@ -840,7 +834,9 @@ class RateLimitsInfo:
  self.resource_limits = {k: RateLimitInfo(k, now, *v) for k, v in kwargs.items() if v is not None}
  # TODO: remove
  for info in self.resource_limits.values():
- _logger.debug(f'Init {info.resource} rate limit: rem={info.remaining} reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}')
+ _logger.debug(
+ f'Init {info.resource} rate limit: rem={info.remaining} reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}'
+ )
  else:
  for k, v in kwargs.items():
  if v is not None:
@@ -855,6 +851,7 @@ class RateLimitsInfo:
  @dataclass
  class RateLimitInfo:
  """Container for rate limit-related information for a single resource."""
+
  resource: str
  recorded_at: datetime.datetime
  limit: int
@@ -871,4 +868,6 @@ class RateLimitInfo:
  reset_delta = reset_at - self.reset_at
  self.reset_at = reset_at
  # TODO: remove
- _logger.debug(f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}')
+ _logger.debug(
+ f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}'
+ )
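
Note: aside from the formatting cleanup, one detail of this file worth calling out is that the default file cache size is derived from free disk space (one fifth of it, expressed in GiB) rather than being a fixed constant. A minimal standalone sketch of that calculation, using only the stdlib calls visible in the diff (the config path in the usage line is an assumption, not taken from the diff):

    import shutil
    from pathlib import Path

    def default_file_cache_size_g(config_path: Path) -> float:
        # mirrors the Config default above: 1/5 of free disk space, converted to GiB
        free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
        file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
        return round(file_cache_size_g, 1)

    # hypothetical usage; the actual config location may differ
    print(default_file_cache_size_g(Path.home() / '.pixeltable' / 'config.toml'))
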
pixeltable/exec/__init__.py CHANGED
@@ -4,7 +4,7 @@ from .component_iteration_node import ComponentIterationNode
  from .data_row_batch import DataRowBatch
  from .exec_context import ExecContext
  from .exec_node import ExecNode
+ from .expr_eval import ExprEvalNode
  from .in_memory_data_node import InMemoryDataNode
  from .row_update_node import RowUpdateNode
- from .sql_node import SqlLookupNode, SqlScanNode, SqlAggregationNode, SqlNode, SqlJoinNode
- from .expr_eval import ExprEvalNode
+ from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlScanNode
pixeltable/exec/aggregation_node.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
  import logging
  import sys
- from typing import Any, Iterable, Iterator, Optional, cast, AsyncIterator
+ from typing import Any, AsyncIterator, Iterable, Iterator, Optional, cast
 
  import pixeltable.catalog as catalog
  import pixeltable.exceptions as excs
@@ -13,12 +13,14 @@ from .exec_node import ExecNode
 
  _logger = logging.getLogger('pixeltable')
 
+
  class AggregationNode(ExecNode):
  """
  In-memory aggregation for UDAs.
 
  At the moment, this returns all results in a single DataRowBatch.
  """
+
  group_by: Optional[list[exprs.Expr]]
  input_exprs: list[exprs.Expr]
  agg_fn_eval_ctx: exprs.RowBuilder.EvalCtx
@@ -26,8 +28,13 @@ class AggregationNode(ExecNode):
  output_batch: DataRowBatch
 
  def __init__(
- self, tbl: catalog.TableVersion, row_builder: exprs.RowBuilder, group_by: Optional[list[exprs.Expr]],
- agg_fn_calls: list[exprs.FunctionCall], input_exprs: Iterable[exprs.Expr], input: ExecNode
+ self,
+ tbl: catalog.TableVersion,
+ row_builder: exprs.RowBuilder,
+ group_by: Optional[list[exprs.Expr]],
+ agg_fn_calls: list[exprs.FunctionCall],
+ input_exprs: Iterable[exprs.Expr],
+ input: ExecNode,
  ):
  output_exprs: list[exprs.Expr] = [] if group_by is None else list(group_by)
  output_exprs.extend(agg_fn_calls)
@@ -86,4 +93,3 @@ class AggregationNode(ExecNode):
  self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
  _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
  yield self.output_batch
-
pixeltable/exec/cache_prefetch_node.py CHANGED
@@ -9,7 +9,7 @@ import urllib.request
  from collections import deque
  from concurrent import futures
  from pathlib import Path
- from typing import Optional, Any, Iterator, AsyncIterator
+ from typing import Any, AsyncIterator, Iterator, Optional
  from uuid import UUID
 
  import pixeltable.env as env
@@ -30,6 +30,7 @@ class CachePrefetchNode(ExecNode):
  TODO:
  - adapting the number of download threads at runtime to maximize throughput
  """
+
  BATCH_SIZE = 16
  NUM_EXECUTOR_THREADS = 16
 
@@ -59,8 +60,8 @@ class CachePrefetchNode(ExecNode):
  num_missing: int # number of missing URLs in this row
 
  def __init__(
- self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode,
- retain_input_order: bool = True):
+ self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True
+ ):
  # input_/output_exprs=[]: we don't have anything to evaluate
  super().__init__(input.row_builder, [], [], input)
  self.retain_input_order = retain_input_order
@@ -241,6 +242,7 @@ class CachePrefetchNode(ExecNode):
  _logger.debug(f'Downloading {url} to {tmp_path}')
  if parsed.scheme == 's3':
  from pixeltable.utils.s3 import get_client
+
  with self.boto_client_lock:
  if self.boto_client is None:
  config = {
pixeltable/exec/component_iteration_node.py CHANGED
@@ -1,5 +1,5 @@
  import inspect
- from typing import Iterator, Optional, AsyncIterator
+ from typing import AsyncIterator, Iterator, Optional
 
  import pixeltable.catalog as catalog
  import pixeltable.exceptions as excs
@@ -14,6 +14,7 @@ class ComponentIterationNode(ExecNode):
 
  Returns row batches of OUTPUT_BATCH_SIZE size.
  """
+
  __OUTPUT_BATCH_SIZE = 1024
 
  def __init__(self, view: catalog.TableVersion, input: ExecNode):
@@ -25,8 +26,8 @@ class ComponentIterationNode(ExecNode):
  self.iterator_args = iterator_args[0]
  assert isinstance(self.iterator_args, exprs.InlineDict)
  self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
- self.iterator_output_schema, self.unstored_column_names = (
- self.view.iterator_cls.output_schema(**self.iterator_args.to_kwargs())
+ self.iterator_output_schema, self.unstored_column_names = self.view.iterator_cls.output_schema(
+ **self.iterator_args.to_kwargs()
  )
  self.iterator_output_fields = list(self.iterator_output_schema.keys())
  self.iterator_output_cols = {
@@ -34,7 +35,8 @@ class ComponentIterationNode(ExecNode):
  }
  # referenced iterator output fields
  self.refd_output_slot_idxs = {
- e.col.name: e.slot_idx for e in self.row_builder.unique_exprs
+ e.col.name: e.slot_idx
+ for e in self.row_builder.unique_exprs
  if isinstance(e, exprs.ColumnRef) and e.col.name in self.iterator_output_fields
  }
 
@@ -79,8 +81,7 @@ class ComponentIterationNode(ExecNode):
  # verify and copy component_dict fields to their respective slots in output_row
  for field_name, field_val in component_dict.items():
  if field_name not in self.iterator_output_fields:
- raise excs.Error(
- f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
+ raise excs.Error(f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
  if field_name not in self.refd_output_slot_idxs:
  # we can ignore this
  continue
@@ -90,5 +91,5 @@ class ComponentIterationNode(ExecNode):
  if len(component_dict) != len(self.iterator_output_fields):
  missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
  raise excs.Error(
- f'Invalid output of {self.view.iterator_cls.__name__}: '
- f'missing fields {", ".join(missing_fields)}')
+ f'Invalid output of {self.view.iterator_cls.__name__}: missing fields {", ".join(missing_fields)}'
+ )
pixeltable/exec/data_row_batch.py CHANGED
@@ -1,19 +1,21 @@
  from __future__ import annotations
- from typing import Iterator, Optional
+
  import logging
+ from typing import Iterator, Optional
 
- import pixeltable.exprs as exprs
  import pixeltable.catalog as catalog
+ import pixeltable.exprs as exprs
  from pixeltable.utils.media_store import MediaStore
 
-
  _logger = logging.getLogger('pixeltable')
 
+
  class DataRowBatch:
  """Set of DataRows, indexed by rowid.
 
  Contains the metadata needed to initialize DataRows.
  """
+
  tbl: Optional[catalog.TableVersion]
  row_builder: exprs.RowBuilder
  img_slot_idxs: list[int]
@@ -22,8 +24,11 @@ class DataRowBatch:
  rows: list[exprs.DataRow]
 
  def __init__(
- self, tbl: Optional[catalog.TableVersion], row_builder: exprs.RowBuilder, num_rows: Optional[int] = None,
- rows: Optional[list[exprs.DataRow]] = None
+ self,
+ tbl: Optional[catalog.TableVersion],
+ row_builder: exprs.RowBuilder,
+ num_rows: Optional[int] = None,
+ rows: Optional[list[exprs.DataRow]] = None,
  ):
  """
  Requires either num_rows or rows to be specified, but not both.
@@ -34,7 +39,8 @@ class DataRowBatch:
  self.img_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_image_type()]
  # non-image media slots
  self.media_slot_idxs = [
- e.slot_idx for e in row_builder.unique_exprs
+ e.slot_idx
+ for e in row_builder.unique_exprs
  if e.col_type.is_media_type() and not e.col_type.is_image_type()
  ]
  self.array_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_array_type()]
@@ -44,14 +50,17 @@ class DataRowBatch:
  if num_rows is None:
  num_rows = 0
  self.rows = [
- exprs.DataRow(row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
+ exprs.DataRow(
+ row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
+ )
  for _ in range(num_rows)
  ]
 
  def add_row(self, row: Optional[exprs.DataRow] = None) -> exprs.DataRow:
  if row is None:
  row = exprs.DataRow(
- self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs)
+ self.row_builder.num_materialized, self.img_slot_idxs, self.media_slot_idxs, self.array_slot_idxs
+ )
  self.rows.append(row)
  return row
 
@@ -65,8 +74,10 @@ class DataRowBatch:
  return self.rows[index]
 
  def flush_imgs(
- self, idx_range: Optional[slice] = None, stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
- flushed_slot_idxs: Optional[list[int]] = None
+ self,
+ idx_range: Optional[slice] = None,
+ stored_img_info: Optional[list[exprs.ColumnSlotIdx]] = None,
+ flushed_slot_idxs: Optional[list[int]] = None,
  ) -> None:
  """Flushes images in the given range of rows."""
  assert self.tbl is not None
pixeltable/exec/exec_context.py CHANGED
@@ -4,12 +4,19 @@ import sqlalchemy as sql
 
  import pixeltable.exprs as exprs
 
+
  class ExecContext:
  """Class for execution runtime constants"""
+
  def __init__(
- self, row_builder: exprs.RowBuilder, *, show_pbar: bool = False, batch_size: int = 0,
- pk_clause: Optional[list[sql.ClauseElement]] = None, num_computed_exprs: int = 0,
- ignore_errors: bool = False
+ self,
+ row_builder: exprs.RowBuilder,
+ *,
+ show_pbar: bool = False,
+ batch_size: int = 0,
+ pk_clause: Optional[list[sql.ClauseElement]] = None,
+ num_computed_exprs: int = 0,
+ ignore_errors: bool = False,
  ):
  self.show_pbar = show_pbar
  self.batch_size = batch_size
pixeltable/exec/exec_node.py CHANGED
@@ -4,16 +4,19 @@ import abc
  import asyncio
  import logging
  import sys
- from typing import Iterable, Iterator, Optional, TypeVar, AsyncIterator
+ from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar
 
  import pixeltable.exprs as exprs
+
  from .data_row_batch import DataRowBatch
  from .exec_context import ExecContext
 
  _logger = logging.getLogger('pixeltable')
 
+
  class ExecNode(abc.ABC):
  """Base class of all execution nodes"""
+
  output_exprs: Iterable[exprs.Expr]
  row_builder: exprs.RowBuilder
  input: Optional[ExecNode]
@@ -22,8 +25,12 @@ class ExecNode(abc.ABC):
  ctx: Optional[ExecContext]
 
  def __init__(
- self, row_builder: exprs.RowBuilder, output_exprs: Iterable[exprs.Expr],
- input_exprs: Iterable[exprs.Expr], input: Optional[ExecNode] = None):
+ self,
+ row_builder: exprs.RowBuilder,
+ output_exprs: Iterable[exprs.Expr],
+ input_exprs: Iterable[exprs.Expr],
+ input: Optional[ExecNode] = None,
+ ):
  self.output_exprs = output_exprs
  self.row_builder = row_builder
  self.input = input
@@ -31,8 +38,7 @@ class ExecNode(abc.ABC):
  output_slot_idxs = {e.slot_idx for e in output_exprs}
  output_dependencies = row_builder.get_dependencies(output_exprs, exclude=input_exprs)
  self.flushed_img_slots = [
- e.slot_idx for e in output_dependencies
- if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
+ e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
  ]
  self.stored_img_cols = []
  self.ctx = None # all nodes of a tree share the same context
@@ -53,16 +59,20 @@ class ExecNode(abc.ABC):
  pass
 
  def __iter__(self) -> Iterator[DataRowBatch]:
+ running_loop: Optional[asyncio.AbstractEventLoop] = None
+ loop: asyncio.AbstractEventLoop
  try:
- # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow nested event loops
- _ = asyncio.get_event_loop()
+ # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
+ # multiple run_until_complete()
+ running_loop = asyncio.get_running_loop()
  import nest_asyncio # type: ignore
+
  nest_asyncio.apply()
+ loop = running_loop
+ _logger.debug(f'Patched running loop')
  except RuntimeError:
- pass
-
- loop = asyncio.new_event_loop()
- asyncio.set_event_loop(loop)
+ loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(loop)
 
  if 'pytest' in sys.modules:
  loop.set_debug(True)
@@ -75,7 +85,8 @@ class ExecNode(abc.ABC):
  except StopAsyncIteration:
  pass
  finally:
- loop.close()
+ if loop != running_loop:
+ loop.close()
 
  def open(self) -> None:
  """Bottom-up initialization of nodes for execution. Must be called before __next__."""
pixeltable/exec/expr_eval/evaluators.py CHANGED
@@ -5,10 +5,10 @@ import datetime
  import itertools
  import logging
  import sys
- from typing import Iterator, Any, Optional, Callable, cast
+ from typing import Any, Callable, Iterator, Optional, cast
+
+ from pixeltable import exprs, func
 
- from pixeltable import exprs
- from pixeltable import func
  from .globals import Dispatcher, Evaluator, FnCallArgs
 
  _logger = logging.getLogger('pixeltable')
@@ -23,6 +23,7 @@ class DefaultExprEvaluator(Evaluator):
  TODO:
  - parallelize via Ray
  """
+
  e: exprs.Expr
 
  def __init__(self, e: exprs.Expr, dispatcher: Dispatcher):
@@ -60,6 +61,7 @@ class FnCallEvaluator(Evaluator):
  TODO:
  - adaptive batching: finding the optimal batch size based on observed execution times
  """
+
  fn_call: exprs.FunctionCall
  fn: func.CallableFunction
  scalar_py_fn: Optional[Callable] # only set for non-batching CallableFunctions
@@ -73,7 +75,7 @@ class FnCallEvaluator(Evaluator):
  self.fn_call = fn_call
  self.fn = cast(func.CallableFunction, fn_call.fn)
  if isinstance(self.fn, func.CallableFunction) and self.fn.is_batched:
- self.call_args_queue = asyncio.Queue[FnCallArgs]()
+ self.call_args_queue = asyncio.Queue[FnCallArgs]()
  # we're not supplying sample arguments there, they're ignored anyway
  self.batch_size = self.fn.get_batch_size()
  self.scalar_py_fn = None
@@ -167,14 +169,16 @@ class FnCallEvaluator(Evaluator):
  for k in item.kwargs.keys():
  batch_kwargs[k][i] = item.kwargs[k]
  return FnCallArgs(
- self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs)
+ self.fn_call, [item.row for item in call_args], batch_args=batch_args, batch_kwargs=batch_kwargs
+ )
 
  async def eval_batch(self, batched_call_args: FnCallArgs) -> None:
  result_batch: list[Any]
  try:
  if self.fn.is_async:
  result_batch = await self.fn.aexec_batch(
- *batched_call_args.batch_args, **batched_call_args.batch_kwargs)
+ *batched_call_args.batch_args, **batched_call_args.batch_kwargs
+ )
  else:
  # check for cancellation before starting something potentially long-running
  if asyncio.current_task().cancelled() or self.dispatcher.exc_event.is_set():
@@ -205,6 +209,7 @@ class FnCallEvaluator(Evaluator):
  self.dispatcher.dispatch([call_args.row])
  except Exception as exc:
  import anthropic
+
  if isinstance(exc, anthropic.RateLimitError):
  _logger.debug(f'RateLimitError: {exc}')
  _, _, exc_tb = sys.exc_info()
@@ -228,7 +233,8 @@ class FnCallEvaluator(Evaluator):
  rows_with_excs.add(idx)
  self.dispatcher.dispatch_exc(item.rows, self.fn_call.slot_idx, exc_tb)
  self.dispatcher.dispatch(
- [call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs])
+ [call_args_batch[i].row for i in range(len(call_args_batch)) if i not in rows_with_excs]
+ )
 
  def _close(self) -> None:
  """Create a task for the incomplete batch of queued FnCallArgs, if any"""
pixeltable/exec/expr_eval/expr_eval_node.py CHANGED
@@ -4,24 +4,23 @@ import asyncio
  import logging
  import traceback
  from types import TracebackType
- from typing import Iterable, AsyncIterator, Optional, Union
+ from typing import AsyncIterator, Iterable, Optional, Union
 
  import numpy as np
 
  import pixeltable.exceptions as excs
- from pixeltable import exprs
- from pixeltable import func
+ from pixeltable import exprs, func
+
+ from ..data_row_batch import DataRowBatch
+ from ..exec_node import ExecNode
  from .evaluators import DefaultExprEvaluator, FnCallEvaluator
  from .globals import Evaluator, Scheduler
  from .row_buffer import RowBuffer
  from .schedulers import SCHEDULERS
- from ..data_row_batch import DataRowBatch
- from ..exec_node import ExecNode
 
  _logger = logging.getLogger('pixeltable')
 
 
-
  class ExprEvalNode(ExecNode):
  """
  Expression evaluation
@@ -35,10 +34,13 @@ class ExprEvalNode(ExecNode):
  TODO:
  - Literal handling: currently, Literal values are copied into slots via the normal evaluation mechanism, which is
  needless overhead; instead: pre-populate Literal slots in _init_row()
+ - dynamically determine MAX_BUFFERED_ROWS, based on the avg memory consumption of a row and our configured memory
+ limit
  - local model inference on gpu: currently, no attempt is made to ensure that models can fit onto the gpu
  simultaneously, which will cause errors; instead, the execution should be divided into sequential phases, each
  of which only contains a subset of the models which is known to fit onto the gpu simultaneously
  """
+
  maintain_input_order: bool # True if we're returning rows in the order we received them from our input
  num_dependencies: np.ndarray # number of dependencies for our output slots; indexed by slot idx
  outputs: np.ndarray # bool per slot; True if this slot is part of our output
@@ -68,11 +70,15 @@ class ExprEvalNode(ExecNode):
  num_output_rows: int
 
  BATCH_SIZE = 64
- MAX_BUFFERED_ROWS = 512 # maximum number of rows that have been dispatched but not yet returned
+ MAX_BUFFERED_ROWS = 2048 # maximum number of rows that have been dispatched but not yet returned
 
  def __init__(
- self, row_builder: exprs.RowBuilder, output_exprs: Iterable[exprs.Expr], input_exprs: Iterable[exprs.Expr],
- input: ExecNode, maintain_input_order: bool = True
+ self,
+ row_builder: exprs.RowBuilder,
+ output_exprs: Iterable[exprs.Expr],
+ input_exprs: Iterable[exprs.Expr],
+ input: ExecNode,
+ maintain_input_order: bool = True,
  ):
  super().__init__(row_builder, output_exprs, input_exprs, input)
  self.maintain_input_order = maintain_input_order
@@ -148,7 +154,9 @@ class ExprEvalNode(ExecNode):
  self.row_pos_map[id(row)] = self.num_input_rows + idx
  self.num_input_rows += len(batch)
  self.avail_input_rows += len(batch)
- _logger.debug(f'adding input: batch_size={len(batch)} #input_rows={self.num_input_rows} #avail={self.avail_input_rows}')
+ _logger.debug(
+ f'adding input: batch_size={len(batch)} #input_rows={self.num_input_rows} #avail={self.avail_input_rows}'
+ )
  except StopAsyncIteration:
  self.input_complete = True
  _logger.debug(f'finished input: #input_rows={self.num_input_rows}, #avail={self.avail_input_rows}')
@@ -175,11 +183,11 @@ class ExprEvalNode(ExecNode):
  rows: list[exprs.DataRow]
  if avail_current_batch_rows > num_rows:
  # we only need rows from current_input_batch
- rows = self.current_input_batch.rows[self.input_row_idx:self.input_row_idx + num_rows]
+ rows = self.current_input_batch.rows[self.input_row_idx : self.input_row_idx + num_rows]
  self.input_row_idx += num_rows
  else:
  # we need rows from both current_/next_input_batch
- rows = self.current_input_batch.rows[self.input_row_idx:]
+ rows = self.current_input_batch.rows[self.input_row_idx :]
  self.current_input_batch = self.next_input_batch
  self.next_input_batch = None
  self.input_row_idx = 0
@@ -236,6 +244,7 @@ class ExprEvalNode(ExecNode):
  exc_event_aw = asyncio.create_task(self.exc_event.wait(), name='exc_event.wait()')
  input_batch_aw: Optional[asyncio.Task] = None
  completed_aw: Optional[asyncio.Task] = None
+ closed_evaluators = False # True after calling Evaluator.close()
 
  try:
  while True:
@@ -275,11 +284,12 @@ class ExprEvalNode(ExecNode):
  assert self.output_buffer.num_rows == 0
  return
 
- if self.input_complete and self.avail_input_rows == 0:
+ if self.input_complete and self.avail_input_rows == 0 and not closed_evaluators:
  # no more input rows to dispatch, but we're still waiting for rows to finish:
  # close all slot evaluators to flush queued rows
  for evaluator in self.slot_evaluators.values():
  evaluator.close()
+ closed_evaluators = True
 
  # we don't have a full batch of rows at this point and need to wait
  aws = {exc_event_aw} # always wait for an exception
@@ -335,8 +345,7 @@ class ExprEvalNode(ExecNode):
  first_row = rows[0]
  input_vals = [first_row[idx] for idx in dependency_idxs]
  e = self.row_builder.unique_exprs[slot_with_exc]
- self.error = excs.ExprEvalError(
- e, f'expression {e}', first_row.get_exc(e.slot_idx), exc_tb, input_vals, 0)
+ self.error = excs.ExprEvalError(e, f'expression {e}', first_row.get_exc(e.slot_idx), exc_tb, input_vals, 0)
  self.exc_event.set()
  return
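
Beyond the reformatting, two behavioral changes stand out in this file: MAX_BUFFERED_ROWS grows from 512 to 2048, and the new closed_evaluators flag ensures Evaluator.close() runs at most once while the main loop keeps draining in-flight rows. A toy reduction of what the flag prevents (all names here are hypothetical stand-ins, not the real classes):

    class SlotEvaluator:
        """Stand-in for the real evaluators; close() flushes partially filled batches."""
        def __init__(self) -> None:
            self.close_count = 0

        def close(self) -> None:
            self.close_count += 1

    def drain(evaluators: list[SlotEvaluator], in_flight_rows: int) -> None:
        input_complete = True  # no more input left to dispatch
        closed_evaluators = False
        while in_flight_rows > 0:
            # without the flag, every loop iteration would re-close the evaluators
            if input_complete and not closed_evaluators:
                for ev in evaluators:
                    ev.close()
                closed_evaluators = True
            in_flight_rows -= 1  # a dispatched row completes

    evs = [SlotEvaluator(), SlotEvaluator()]
    drain(evs, in_flight_rows=5)
    assert all(ev.close_count == 1 for ev in evs)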