pixeltable 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +3 -10
- pixeltable/catalog/catalog.py +139 -59
- pixeltable/catalog/column.py +32 -23
- pixeltable/catalog/globals.py +2 -45
- pixeltable/catalog/insertable_table.py +5 -2
- pixeltable/catalog/path.py +6 -0
- pixeltable/catalog/table.py +173 -23
- pixeltable/catalog/table_version.py +156 -92
- pixeltable/catalog/table_version_handle.py +26 -1
- pixeltable/catalog/update_status.py +179 -0
- pixeltable/catalog/view.py +12 -3
- pixeltable/config.py +76 -12
- pixeltable/dataframe.py +1 -1
- pixeltable/env.py +29 -0
- pixeltable/exec/exec_node.py +7 -24
- pixeltable/exec/expr_eval/schedulers.py +134 -7
- pixeltable/exprs/column_property_ref.py +23 -20
- pixeltable/exprs/column_ref.py +24 -18
- pixeltable/exprs/data_row.py +9 -0
- pixeltable/exprs/function_call.py +2 -2
- pixeltable/exprs/row_builder.py +46 -14
- pixeltable/exprs/rowid_ref.py +0 -4
- pixeltable/func/function.py +3 -3
- pixeltable/functions/audio.py +36 -9
- pixeltable/functions/video.py +57 -10
- pixeltable/globals.py +61 -1
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +39 -64
- pixeltable/io/globals.py +4 -4
- pixeltable/io/hf_datasets.py +10 -2
- pixeltable/io/label_studio.py +52 -48
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +125 -0
- pixeltable/metadata/converters/util.py +3 -0
- pixeltable/metadata/notes.py +2 -0
- pixeltable/metadata/schema.py +14 -2
- pixeltable/metadata/utils.py +78 -0
- pixeltable/plan.py +26 -18
- pixeltable/share/packager.py +20 -38
- pixeltable/store.py +121 -142
- pixeltable/type_system.py +2 -2
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/media_store.py +39 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/METADATA +1 -1
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/RECORD +51 -47
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/entry_points.txt +0 -0
pixeltable/catalog/update_status.py
ADDED
@@ -0,0 +1,179 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from IPython.lib.pretty import RepresentationPrinter
+
+
+@dataclass(frozen=True)
+class RowCountStats:
+    """
+    Statistics about the counts of rows affected by a table operation.
+    """
+
+    ins_rows: int = 0  # rows inserted
+    del_rows: int = 0  # rows deleted
+    upd_rows: int = 0  # rows updated
+    num_excs: int = 0  # total number of exceptions
+    # TODO: disambiguate what this means: # of slots computed or # of columns computed?
+    computed_values: int = 0  # number of computed values (e.g., computed columns) affected by the operation
+
+    @property
+    def num_rows(self) -> int:
+        return self.ins_rows + self.del_rows + self.upd_rows
+
+    def insert_to_update(self) -> 'RowCountStats':
+        """
+        Convert insert row count stats to update row count stats.
+        This is used when an insert operation is treated as an update.
+        """
+        return RowCountStats(
+            ins_rows=0,
+            del_rows=self.del_rows,
+            upd_rows=self.upd_rows + self.ins_rows,
+            num_excs=self.num_excs,
+            computed_values=self.computed_values,
+        )
+
+    def __add__(self, other: 'RowCountStats') -> 'RowCountStats':
+        """
+        Add the stats from two RowCountStats objects together.
+        """
+        return RowCountStats(
+            ins_rows=self.ins_rows + other.ins_rows,
+            del_rows=self.del_rows + other.del_rows,
+            upd_rows=self.upd_rows + other.upd_rows,
+            num_excs=self.num_excs + other.num_excs,
+            computed_values=self.computed_values + other.computed_values,
+        )
+
+
+@dataclass(frozen=True)
+class UpdateStatus:
+    """
+    Information about changes to table data or table schema
+    """
+
+    updated_cols: list[str] = field(default_factory=list)
+    cols_with_excs: list[str] = field(default_factory=list)
+
+    # stats for the rows affected by the operation
+    row_count_stats: RowCountStats = field(default_factory=RowCountStats)
+
+    # stats for changes cascaded to other tables
+    cascade_row_count_stats: RowCountStats = field(default_factory=RowCountStats)
+
+    # stats for the rows affected by the operation in an external store
+    ext_row_count_stats: RowCountStats = field(default_factory=RowCountStats)
+
+    @property
+    def num_rows(self) -> int:
+        return self.row_count_stats.num_rows + self.cascade_row_count_stats.num_rows
+
+    @property
+    def num_excs(self) -> int:
+        return self.row_count_stats.num_excs + self.cascade_row_count_stats.num_excs
+
+    @property
+    def num_computed_values(self) -> int:
+        return self.row_count_stats.computed_values + self.cascade_row_count_stats.computed_values
+
+    def insert_to_update(self) -> 'UpdateStatus':
+        """
+        Convert the update status from an insert operation to an update operation.
+        This is used when an insert operation is treated as an update.
+        """
+        return UpdateStatus(
+            updated_cols=self.updated_cols,
+            cols_with_excs=self.cols_with_excs,
+            row_count_stats=self.row_count_stats.insert_to_update(),
+            cascade_row_count_stats=self.cascade_row_count_stats.insert_to_update(),
+            ext_row_count_stats=self.ext_row_count_stats,
+        )
+
+    def to_cascade(self) -> 'UpdateStatus':
+        """
+        Convert the update status to a cascade update status.
+        This is used when an operation cascades changes to other tables.
+        """
+        return UpdateStatus(
+            updated_cols=self.updated_cols,
+            cols_with_excs=self.cols_with_excs,
+            row_count_stats=RowCountStats(),
+            cascade_row_count_stats=self.cascade_row_count_stats + self.row_count_stats,
+            ext_row_count_stats=self.ext_row_count_stats,
+        )
+
+    def __add__(self, other: 'UpdateStatus') -> UpdateStatus:
+        """
+        Add the update status from two UpdateStatus objects together.
+        """
+        return UpdateStatus(
+            updated_cols=list(dict.fromkeys(self.updated_cols + other.updated_cols)),
+            cols_with_excs=list(dict.fromkeys(self.cols_with_excs + other.cols_with_excs)),
+            row_count_stats=self.row_count_stats + other.row_count_stats,
+            cascade_row_count_stats=self.cascade_row_count_stats + other.cascade_row_count_stats,
+            ext_row_count_stats=self.ext_row_count_stats + other.ext_row_count_stats,
+        )
+
+    @property
+    def insert_msg(self) -> str:
+        """Return a message describing the results of an insert operation."""
+        if self.num_excs == 0:
+            cols_with_excs_str = ''
+        else:
+            cols_with_excs_str = (
+                f' across {len(self.cols_with_excs)} column{"" if len(self.cols_with_excs) == 1 else "s"}'
+            )
+            cols_with_excs_str += f' ({", ".join(self.cols_with_excs)})'
+        msg = (
+            f'Inserted {self.num_rows} row{"" if self.num_rows == 1 else "s"} '
+            f'with {self.num_excs} error{"" if self.num_excs == 1 else "s"}{cols_with_excs_str}.'
+        )
+        return msg
+
+    @classmethod
+    def __cnt_str(cls, cnt: int, item: str) -> str:
+        assert cnt > 0
+        return f'{cnt} {item}{"" if cnt == 1 else "s"}'
+
+    def _repr_pretty_(self, p: 'RepresentationPrinter', cycle: bool) -> None:
+        messages = []
+        # Combine row count stats and cascade row count stats
+        stats = self.row_count_stats + self.cascade_row_count_stats
+        if stats.ins_rows > 0:
+            messages.append(f'{self.__cnt_str(stats.ins_rows, "row")} inserted')
+        if stats.del_rows > 0:
+            messages.append(f'{self.__cnt_str(stats.del_rows, "row")} deleted')
+        if stats.upd_rows > 0:
+            messages.append(f'{self.__cnt_str(stats.upd_rows, "row")} updated')
+        if stats.computed_values > 0:
+            messages.append(f'{self.__cnt_str(stats.computed_values, "value")} computed')
+        if stats.num_excs > 0:
+            messages.append(self.__cnt_str(stats.num_excs, 'exception'))
+        p.text(', '.join(messages) + '.' if len(messages) > 0 else 'No rows affected.')
+
+    @property
+    def pxt_rows_updated(self) -> int:
+        """
+        Returns the number of Pixeltable rows that were updated as a result of the operation.
+        """
+        return (self.row_count_stats + self.cascade_row_count_stats).upd_rows
+
+    @property
+    def external_rows_updated(self) -> int:
+        return self.ext_row_count_stats.upd_rows
+
+    @property
+    def external_rows_created(self) -> int:
+        return self.ext_row_count_stats.ins_rows
+
+    @property
+    def external_rows_deleted(self) -> int:
+        return self.ext_row_count_stats.del_rows
+
+    @property
+    def ext_num_rows(self) -> int:
+        return self.ext_row_count_stats.num_rows
pixeltable/catalog/view.py
CHANGED
@@ -17,11 +17,12 @@ if TYPE_CHECKING:
 
 
 from .column import Column
-from .globals import _POS_COLUMN_NAME, MediaValidation
+from .globals import _POS_COLUMN_NAME, MediaValidation
 from .table import Table
 from .table_version import TableVersion
 from .table_version_handle import TableVersionHandle
 from .table_version_path import TableVersionPath
+from .update_status import UpdateStatus
 
 if TYPE_CHECKING:
     from pixeltable.globals import TableDataSource
@@ -229,7 +230,10 @@ class View(Table):
 
         try:
             plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
-
+            _, row_counts = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
+            status = UpdateStatus(row_count_stats=row_counts)
+            tbl_version._write_md_update_status(0, update_status=status)
+
         except:
             # we need to remove the orphaned TableVersion instance
             del catalog.Catalog.get()._tbl_versions[tbl_version.id, tbl_version.effective_version]
@@ -238,7 +242,9 @@ class View(Table):
             # also remove tbl_version from the base
             base_tbl_version.mutable_views.remove(TableVersionHandle.create(tbl_version))
             raise
-        Env.get().console_logger.info(
+        Env.get().console_logger.info(
+            f'Created view `{name}` with {status.num_rows} rows, {status.num_excs} exceptions.'
+        )
 
         session.commit()
         return view
@@ -273,6 +279,9 @@ class View(Table):
         md = super()._get_metadata()
         md['is_view'] = True
         md['is_snapshot'] = self._tbl_version_path.is_snapshot()
+        base_tbl = self._get_base_table()
+        base_version = self._effective_base_versions[0]
+        md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
         return md
 
     def insert(
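The practical effect of the `view.py` change is that view creation now loads the view through `insert_rows()`, wraps the resulting row counts in an `UpdateStatus`, persists it via `_write_md_update_status()`, and logs the totals. A hypothetical session (table, column, and view names are illustrative):

```python
import pixeltable as pxt

t = pxt.create_table('films', {'title': pxt.String, 'rating': pxt.Float})
t.insert([{'title': 'Alien', 'rating': 8.5}, {'title': 'Gigli', 'rating': 2.5}])

# View creation runs Planner.create_view_load_plan() and records the row
# counts returned by StoreTable.insert_rows() in the view's metadata.
v = pxt.create_view('good_films', t.where(t.rating >= 7.0))
# console output, verbatim from the logging call above:
#   Created view `good_films` with 1 rows, 0 exceptions.
```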
pixeltable/config.py
CHANGED
@@ -25,19 +25,26 @@ class Config:
 
     __home: Path
     __config_file: Path
+    __config_overrides: dict[str, Any]
    __config_dict: dict[str, Any]
 
-    def __init__(self) -> None:
+    def __init__(self, config_overrides: dict[str, Any]) -> None:
         assert self.__instance is None, 'Config is a singleton; use Config.get() to access the instance'
 
-
+        for var in config_overrides:
+            if var not in KNOWN_CONFIG_OVERRIDES:
+                raise excs.Error(f'Unrecognized configuration variable: {var}')
+
+        self.__config_overrides = config_overrides
+
+        self.__home = Path(self.lookup_env('pixeltable', 'home', str(Path.home() / '.pixeltable')))
         if self.__home.exists() and not self.__home.is_dir():
-            raise
+            raise excs.Error(f'Not a directory: {self.__home}')
         if not self.__home.exists():
             print(f'Creating a Pixeltable instance at: {self.__home}')
             self.__home.mkdir()
 
-        self.__config_file = Path(
+        self.__config_file = Path(self.lookup_env('pixeltable', 'config', str(self.__home / 'config.toml')))
 
         self.__config_dict: dict[str, Any]
         if os.path.isfile(self.__config_file):
@@ -46,6 +53,12 @@ class Config:
                 self.__config_dict = toml.load(stream)
             except Exception as exc:
                 raise excs.Error(f'Could not read config file: {self.__config_file}') from exc
+            for section, section_dict in self.__config_dict.items():
+                if section not in KNOWN_CONFIG_OPTIONS:
+                    raise excs.Error(f'Unrecognized section {section!r} in config file: {self.__config_file}')
+                for key in section_dict:
+                    if key not in KNOWN_CONFIG_OPTIONS[section]:
+                        raise excs.Error(f"Unrecognized option '{section}.{key}' in config file: {self.__config_file}")
         else:
             self.__config_dict = self.__create_default_config(self.__config_file)
             with open(self.__config_file, 'w', encoding='utf-8') as stream:
@@ -65,10 +78,18 @@ class Config:
 
     @classmethod
     def get(cls) -> Config:
-
-        cls.__instance = cls()
+        cls.init({})
         return cls.__instance
 
+    @classmethod
+    def init(cls, config_overrides: dict[str, Any]) -> None:
+        if cls.__instance is None:
+            cls.__instance = cls(config_overrides)
+        elif len(config_overrides) > 0:
+            raise excs.Error(
+                'Pixeltable has already been initialized; cannot specify new config values in the same session'
+            )
+
     @classmethod
     def __create_default_config(cls, config_path: Path) -> dict[str, Any]:
         free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
@@ -76,14 +97,23 @@ class Config:
         file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
         return {'pixeltable': {'file_cache_size_g': round(file_cache_size_g, 1), 'hide_warnings': False}}
 
-    def
+    def lookup_env(self, section: str, key: str, default: Any = None) -> Any:
+        override_var = f'{section}.{key}'
         env_var = f'{section.upper()}_{key.upper()}'
+        if override_var in self.__config_overrides:
+            return self.__config_overrides[override_var]
         if env_var in os.environ:
-
-
+            return os.environ[env_var]
+        return default
+
+    def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
+        value = self.lookup_env(section, key)  # Try to get from environment first
+        # Next try the config file
+        if value is None and section in self.__config_dict and key in self.__config_dict[section]:
             value = self.__config_dict[section][key]
-
-
+
+        if value is None:
+            return None  # Not specified
 
         try:
             if expected_type is bool and isinstance(value, str):
@@ -91,7 +121,7 @@ class Config:
                     raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}')
                 return value.lower() == 'true'  # type: ignore[return-value]
             return expected_type(value)  # type: ignore[call-arg]
-        except ValueError as exc:
+        except (ValueError, TypeError) as exc:
            raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}') from exc
 
     def get_string_value(self, key: str, section: str = 'pixeltable') -> Optional[str]:
@@ -105,3 +135,37 @@ class Config:
 
     def get_bool_value(self, key: str, section: str = 'pixeltable') -> Optional[bool]:
         return self.get_value(key, bool, section)
+
+
+KNOWN_CONFIG_OPTIONS = {
+    'pixeltable': {
+        'home': 'Path to the Pixeltable home directory',
+        'config': 'Path to the Pixeltable config file',
+        'pgdata': 'Path to the Pixeltable postgres data directory',
+        'db': 'Postgres database name',
+        'file_cache_size_g': 'Size of the file cache in GB',
+        'time_zone': 'Default time zone for timestamps',
+        'hide_warnings': 'Hide warnings from the console',
+        'verbosity': 'Verbosity level for console output',
+        'api_key': 'API key for Pixeltable cloud',
+    },
+    'anthropic': {'api_key': 'Anthropic API key'},
+    'bedrock': {'api_key': 'AWS Bedrock API key'},
+    'deepseek': {'api_key': 'Deepseek API key'},
+    'fireworks': {'api_key': 'Fireworks API key'},
+    'gemini': {'api_key': 'Gemini API key'},
+    'groq': {'api_key': 'Groq API key'},
+    'label_studio': {'api_key': 'Label Studio API key', 'url': 'Label Studio server URL'},
+    'mistral': {'api_key': 'Mistral API key'},
+    'openai': {'api_key': 'OpenAI API key'},
+    'replicate': {'api_token': 'Replicate API token'},
+    'together': {'api_key': 'Together API key'},
+    'pypi': {'api_key': 'PyPI API key (for internal use only)'},
+}
+
+
+KNOWN_CONFIG_OVERRIDES = {
+    f'{section}.{key}': info
+    for section, section_dict in KNOWN_CONFIG_OPTIONS.items()
+    for key, info in section_dict.items()
+}
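Taken together, the Config changes pin down one resolution order for every `section.key`: an override passed to `Config.init()` wins, then the `SECTION_KEY` environment variable, then the `[section]` table of `config.toml`, then the built-in default. A standalone sketch of that precedence (not the pixeltable implementation itself):

```python
import os
from typing import Any, Optional

def lookup(section: str, key: str, overrides: dict[str, Any],
           config_file: dict[str, dict[str, Any]], default: Any = None) -> Optional[Any]:
    """Resolution order mirroring Config: override > env var > config file > default."""
    if f'{section}.{key}' in overrides:           # e.g. {'pixeltable.verbosity': 2}
        return overrides[f'{section}.{key}']
    env_var = f'{section.upper()}_{key.upper()}'  # e.g. OPENAI_API_KEY
    if env_var in os.environ:
        return os.environ[env_var]
    if section in config_file and key in config_file[section]:
        return config_file[section][key]
    return default

os.environ['OPENAI_API_KEY'] = 'sk-from-env'
assert lookup('openai', 'api_key', {}, {'openai': {'api_key': 'sk-from-file'}}) == 'sk-from-env'
assert lookup('openai', 'api_key', {'openai.api_key': 'sk-override'}, {}) == 'sk-override'
```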
pixeltable/dataframe.py
CHANGED
@@ -15,7 +15,7 @@ import sqlalchemy as sql
 
 from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
 from pixeltable.catalog import Catalog, is_valid_identifier
-from pixeltable.catalog.
+from pixeltable.catalog.update_status import UpdateStatus
 from pixeltable.env import Env
 from pixeltable.plan import Planner, SampleClause
 from pixeltable.type_system import ColumnType
pixeltable/env.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 import datetime
 import glob
 import http.server
@@ -22,6 +23,7 @@ from sys import stdout
 from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
 from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
 
+import nest_asyncio  # type: ignore[import-untyped]
 import pixeltable_pgserver
 import sqlalchemy as sql
 from pillow_heif import register_heif_opener  # type: ignore[import-untyped]
@@ -85,6 +87,7 @@ class Env:
     _current_conn: Optional[sql.Connection]
     _current_session: Optional[sql.orm.Session]
     _dbms: Optional[Dbms]
+    _event_loop: Optional[asyncio.AbstractEventLoop]  # event loop for ExecNode
 
     @classmethod
     def get(cls) -> Env:
@@ -140,6 +143,32 @@ class Env:
         self._current_conn = None
         self._current_session = None
         self._dbms = None
+        self._event_loop = None
+
+    def _init_event_loop(self) -> None:
+        try:
+            # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
+            # multiple run_until_complete()
+            running_loop = asyncio.get_running_loop()
+            self._event_loop = running_loop
+            _logger.debug('Patched running loop')
+        except RuntimeError:
+            self._event_loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self._event_loop)
+        # we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
+        self._event_loop.slow_callback_duration = 3600
+
+        # always allow nested event loops, we need that to run async udfs synchronously (eg, for SimilarityExpr);
+        # see run_coroutine_synchronously()
+        nest_asyncio.apply()
+        if _logger.isEnabledFor(logging.DEBUG):
+            self._event_loop.set_debug(True)
+
+    @property
+    def event_loop(self) -> asyncio.AbstractEventLoop:
+        if self._event_loop is None:
+            self._init_event_loop()
+        return self._event_loop
 
     @property
     def db_url(self) -> str:
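The loop management that used to live in `ExecNode.__iter__` is now owned by `Env` and created lazily. The pattern itself is a common one for libraries that drive async code from synchronous entry points; a self-contained sketch (`acquire_loop` is an illustrative name, not pixeltable API):

```python
import asyncio

import nest_asyncio  # allows re-entrant run_until_complete(), e.g. under Jupyter

def acquire_loop() -> asyncio.AbstractEventLoop:
    """Adopt the already-running loop if there is one (Jupyter), else create one."""
    try:
        loop = asyncio.get_running_loop()  # raises RuntimeError outside a loop
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    loop.slow_callback_duration = 3600     # suppress slow-callback warnings in debug mode
    nest_asyncio.apply()                   # permit nested loops for sync-over-async calls
    return loop

async def work() -> int:
    return 42

assert acquire_loop().run_until_complete(work()) == 42
```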
pixeltable/exec/exec_node.py
CHANGED
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
 import abc
-import asyncio
 import logging
 from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar
 
 from pixeltable import exprs
+from pixeltable.env import Env
 
 from .data_row_batch import DataRowBatch
 from .exec_context import ExecContext
@@ -59,26 +59,7 @@ class ExecNode(abc.ABC):
         pass
 
     def __iter__(self) -> Iterator[DataRowBatch]:
-
-        loop: asyncio.AbstractEventLoop
-        try:
-            # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
-            # multiple run_until_complete()
-            running_loop = asyncio.get_running_loop()
-            import nest_asyncio  # type: ignore[import-untyped]
-
-            nest_asyncio.apply()
-            loop = running_loop
-            _logger.debug('Patched running loop')
-        except RuntimeError:
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)
-            # we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
-            loop.slow_callback_duration = 3600
-
-            if _logger.isEnabledFor(logging.DEBUG):
-                loop.set_debug(True)
-
+        loop = Env.get().event_loop
         aiter = self.__aiter__()
         try:
             while True:
@@ -86,9 +67,11 @@ class ExecNode(abc.ABC):
                 yield batch
         except StopAsyncIteration:
             pass
-
+        # TODO:
+        # - we seem to have some tasks that aren't accounted for by ExprEvalNode and don't get cancelled by the time
+        #   we end up here
+        # - however, blindly cancelling all pending tasks doesn't work when running in a jupyter environment, which
+        #   creates tasks on its own
 
     def open(self) -> None:
         """Bottom-up initialization of nodes for execution. Must be called before __next__."""
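With the loop setup gone, `ExecNode.__iter__` is reduced to a sync-over-async bridge: it borrows the shared loop and pumps its own async iterator one batch at a time via `run_until_complete()`. The same shape in miniature, with `DataRowBatch` stood in by ints and a private loop in place of `Env.get().event_loop`:

```python
import asyncio
from typing import AsyncIterator, Iterator

class Node:
    async def __aiter__(self) -> AsyncIterator[int]:  # stands in for ExecNode.__aiter__
        for batch in (1, 2, 3):
            await asyncio.sleep(0)                    # simulate async work per batch
            yield batch

    def __iter__(self) -> Iterator[int]:
        loop = asyncio.new_event_loop()               # pixeltable uses Env.get().event_loop
        aiter = self.__aiter__()
        try:
            while True:
                # drive the async iterator synchronously, one item per run_until_complete()
                yield loop.run_until_complete(aiter.__anext__())
        except StopAsyncIteration:
            pass
        finally:
            loop.close()

assert list(Node()) == [1, 2, 3]
```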
pixeltable/exec/expr_eval/schedulers.py
CHANGED
@@ -4,9 +4,10 @@ import asyncio
 import datetime
 import inspect
 import logging
+import re
 import sys
 import time
-from typing import Awaitable, Collection, Optional
+from typing import Any, Awaitable, Collection, Optional
 
 from pixeltable import env, func
 from pixeltable.config import Config
@@ -250,8 +251,20 @@ class RequestRateScheduler(Scheduler):
     total_retried: int
 
     TIME_FORMAT = '%H:%M.%S %f'
-    MAX_RETRIES =
+    MAX_RETRIES = 3
     DEFAULT_RATE_LIMIT = 600  # requests per minute
+    RATE_LIMIT_INDICATORS = ('rate limit', 'too many requests', '429', 'quota exceeded', 'throttled', 'rate exceeded')
+    RETRY_AFTER_PATTERNS = (
+        r'retry after (\d+(?:\.\d+)?)\s*seconds?',
+        r'try again in (\d+(?:\.\d+)?)\s*seconds?',
+        r'wait (\d+(?:\.\d+)?)\s*seconds?',
+        r'retry-after:\s*(\d+(?:\.\d+)?)',
+    )
+
+    # Exponential backoff defaults
+    BASE_RETRY_DELAY = 1.0  # in seconds
+    MAX_RETRY_DELAY = 60.0  # in seconds
+    RETRY_BACKOFF_MULTIPLIER = 2.0
 
     def __init__(self, resource_pool: str, dispatcher: Dispatcher):
         super().__init__(resource_pool, dispatcher)
@@ -337,11 +350,12 @@ class RequestRateScheduler(Scheduler):
                 self.dispatcher.dispatch(request.rows, exec_ctx)
 
             except Exception as exc:
-
-
-
-
-
+                _logger.debug(f'exception for {self.resource_pool}: type={type(exc)}\n{exc}')
+                is_rate_limit_error, retry_after = self._is_rate_limit_error(exc)
+                if is_rate_limit_error and num_retries < self.MAX_RETRIES:
+                    retry_delay = self._compute_retry_delay(num_retries, retry_after)
+                    _logger.debug(f'scheduler {self.resource_pool}: retrying after {retry_delay}')
+                    await asyncio.sleep(retry_delay)
                     self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx))
                     return
 
@@ -358,6 +372,119 @@ class RequestRateScheduler(Scheduler):
         if is_task:
             self.num_in_flight -= 1
 
+    def _is_rate_limit_error(self, exc: Exception) -> tuple[bool, Optional[float]]:
+        """Returns True if the exception indicates a rate limit error, and the retry delay in seconds."""
+        from http import HTTPStatus
+
+        # Check for HTTP status TOO_MANY_REQUESTS in various exception classes.
+        # We look for attributes that contain status codes, instead of checking the type of the exception,
+        # in order to handle a wider variety of exception classes.
+        is_rate_limit_error = False
+        retry_delay: Optional[float] = None
+
+        # requests.HTTPError/httpx.HTTPStatusError
+        if (
+            hasattr(exc, 'response')
+            and hasattr(exc.response, 'status_code')
+            and exc.response.status_code == HTTPStatus.TOO_MANY_REQUESTS.value
+        ):
+            is_rate_limit_error = True
+            retry_delay = self._extract_retry_delay_from_headers(exc.response.headers)
+        elif (
+            # urllib.error.HTTPError
+            (hasattr(exc, 'code') and exc.code == HTTPStatus.TOO_MANY_REQUESTS.value)
+            # aiohttp.ClientResponseError
+            or (hasattr(exc, 'status') and exc.status == HTTPStatus.TOO_MANY_REQUESTS.value)
+        ) and hasattr(exc, 'headers'):
+            is_rate_limit_error = True
+            retry_delay = self._extract_retry_delay_from_headers(exc.headers)
+
+        if is_rate_limit_error:
+            return True, retry_delay
+
+        # Check common rate limit keywords in exception message
+        error_msg = str(exc).lower()
+        if any(indicator in error_msg for indicator in self.RATE_LIMIT_INDICATORS):
+            retry_delay = self._extract_retry_delay_from_message(error_msg)
+            return True, retry_delay
+
+        return False, None
+
+    def _extract_retry_delay_from_headers(self, headers: Optional[Any]) -> Optional[float]:
+        """Extract retry delay from HTTP headers."""
+        if headers is None:
+            return None
+
+        # convert headers to dict-like object for consistent access
+        header_dict: dict
+        if hasattr(headers, 'get'):
+            header_dict = headers
+        else:
+            # headers are a list of tuples or other format
+            try:
+                header_dict = dict(headers)
+            except (TypeError, ValueError):
+                return None
+        # normalize dict keys: lowercase and remove dashes
+        header_dict = {k.lower().replace('-', ''): v for k, v in header_dict.items()}
+
+        # check Retry-After header
+        retry_after = header_dict.get('retryafter')
+        if retry_after is not None:
+            try:
+                return float(retry_after)
+            except (ValueError, TypeError):
+                pass
+
+        # check X-RateLimit-Reset (Unix timestamp)
+        reset_time = header_dict.get('xratelimitreset')
+        if reset_time is not None:
+            try:
+                reset_timestamp = float(reset_time)
+                delay = max(0, reset_timestamp - time.time())
+                return delay
+            except (ValueError, TypeError):
+                pass
+
+        # check X-RateLimit-Reset-After (seconds from now)
+        reset_after = header_dict.get('xratelimitresetafter')
+        if reset_after is not None:
+            try:
+                return float(reset_after)
+            except (ValueError, TypeError):
+                pass
+
+        return None
+
+    def _extract_retry_delay_from_message(self, msg: str) -> Optional[float]:
+        msg_lower = msg.lower()
+        for pattern in self.RETRY_AFTER_PATTERNS:
+            match = re.search(pattern, msg_lower)
+            if match is not None:
+                try:
+                    return float(match.group(1))
+                except (ValueError, TypeError):
+                    continue
+        return None
+
+    def _compute_retry_delay(self, num_retries: int, retry_after: Optional[float] = None) -> float:
+        """
+        Calculate exponential backoff delay for rate limit errors.
+
+        Args:
+            retry_count: Number of retries attempted (0-based)
+            retry_after: Suggested delay from Retry-After header
+
+        Returns:
+            Delay in seconds
+        """
+        if retry_after is not None and retry_after > 0:
+            # Use server-suggested delay, but cap it at max_delay
+            return max(min(retry_after, self.MAX_RETRY_DELAY), self.BASE_RETRY_DELAY)
+        else:
+            delay = self.BASE_RETRY_DELAY * (self.RETRY_BACKOFF_MULTIPLIER**num_retries)
+            return max(min(delay, self.MAX_RETRY_DELAY), self.BASE_RETRY_DELAY)
+
 
 # all concrete Scheduler subclasses that implement matches()
 SCHEDULERS = [RateLimitsScheduler, RequestRateScheduler]
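With the defaults above, the fallback schedule is delay(n) = clamp(BASE_RETRY_DELAY * RETRY_BACKOFF_MULTIPLIER**n, 1.0, 60.0) for the n-th retry, and a server-supplied Retry-After hint is clamped into the same range. A quick standalone check of that arithmetic (mirroring `_compute_retry_delay`, not importing it):

```python
from typing import Optional

BASE, MULT, CAP = 1.0, 2.0, 60.0  # BASE_RETRY_DELAY, RETRY_BACKOFF_MULTIPLIER, MAX_RETRY_DELAY

def retry_delay(num_retries: int, retry_after: Optional[float] = None) -> float:
    # a server-suggested delay wins; either way the result is clamped to [BASE, CAP]
    delay = retry_after if retry_after is not None and retry_after > 0 else BASE * MULT**num_retries
    return max(min(delay, CAP), BASE)

assert [retry_delay(n) for n in range(3)] == [1.0, 2.0, 4.0]  # the schedule MAX_RETRIES = 3 allows
assert retry_delay(10) == 60.0                    # 1024s of backoff, capped at 60s
assert retry_delay(0, retry_after=120.0) == 60.0  # Retry-After hints are capped too
assert retry_delay(0, retry_after=0.2) == 1.0     # sub-second hints raised to the base delay
```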