pixeltable 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +2 -10
- pixeltable/catalog/catalog.py +64 -38
- pixeltable/catalog/column.py +22 -23
- pixeltable/catalog/globals.py +2 -148
- pixeltable/catalog/insertable_table.py +6 -4
- pixeltable/catalog/path.py +6 -0
- pixeltable/catalog/table.py +51 -32
- pixeltable/catalog/table_version.py +69 -45
- pixeltable/catalog/update_status.py +179 -0
- pixeltable/catalog/view.py +9 -2
- pixeltable/config.py +76 -12
- pixeltable/dataframe.py +1 -1
- pixeltable/env.py +29 -0
- pixeltable/exec/exec_node.py +7 -24
- pixeltable/exec/expr_eval/schedulers.py +134 -7
- pixeltable/exprs/column_property_ref.py +21 -9
- pixeltable/exprs/column_ref.py +5 -1
- pixeltable/exprs/function_call.py +2 -2
- pixeltable/exprs/row_builder.py +10 -9
- pixeltable/exprs/rowid_ref.py +0 -4
- pixeltable/func/function.py +3 -3
- pixeltable/functions/audio.py +36 -9
- pixeltable/functions/video.py +57 -10
- pixeltable/globals.py +61 -1
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +3 -55
- pixeltable/io/globals.py +4 -4
- pixeltable/io/hf_datasets.py +10 -2
- pixeltable/io/label_studio.py +16 -16
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_39.py +125 -0
- pixeltable/metadata/converters/util.py +3 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +14 -2
- pixeltable/plan.py +4 -0
- pixeltable/share/packager.py +20 -38
- pixeltable/store.py +18 -50
- pixeltable/type_system.py +2 -2
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/media_store.py +39 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/METADATA +1 -1
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/RECORD +47 -45
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/entry_points.txt +0 -0
pixeltable/catalog/view.py
CHANGED
|
@@ -17,11 +17,12 @@ if TYPE_CHECKING:
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
from .column import Column
|
|
20
|
-
from .globals import _POS_COLUMN_NAME, MediaValidation
|
|
20
|
+
from .globals import _POS_COLUMN_NAME, MediaValidation
|
|
21
21
|
from .table import Table
|
|
22
22
|
from .table_version import TableVersion
|
|
23
23
|
from .table_version_handle import TableVersionHandle
|
|
24
24
|
from .table_version_path import TableVersionPath
|
|
25
|
+
from .update_status import UpdateStatus
|
|
25
26
|
|
|
26
27
|
if TYPE_CHECKING:
|
|
27
28
|
from pixeltable.globals import TableDataSource
|
|
@@ -229,7 +230,10 @@ class View(Table):
|
|
|
229
230
|
|
|
230
231
|
try:
|
|
231
232
|
plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
|
|
232
|
-
_,
|
|
233
|
+
_, row_counts = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
|
|
234
|
+
status = UpdateStatus(row_count_stats=row_counts)
|
|
235
|
+
tbl_version._write_md_update_status(0, update_status=status)
|
|
236
|
+
|
|
233
237
|
except:
|
|
234
238
|
# we need to remove the orphaned TableVersion instance
|
|
235
239
|
del catalog.Catalog.get()._tbl_versions[tbl_version.id, tbl_version.effective_version]
|
|
@@ -275,6 +279,9 @@ class View(Table):
|
|
|
275
279
|
md = super()._get_metadata()
|
|
276
280
|
md['is_view'] = True
|
|
277
281
|
md['is_snapshot'] = self._tbl_version_path.is_snapshot()
|
|
282
|
+
base_tbl = self._get_base_table()
|
|
283
|
+
base_version = self._effective_base_versions[0]
|
|
284
|
+
md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
|
|
278
285
|
return md
|
|
279
286
|
|
|
280
287
|
def insert(
|
pixeltable/config.py
CHANGED
|
@@ -25,19 +25,26 @@ class Config:
|
|
|
25
25
|
|
|
26
26
|
__home: Path
|
|
27
27
|
__config_file: Path
|
|
28
|
+
__config_overrides: dict[str, Any]
|
|
28
29
|
__config_dict: dict[str, Any]
|
|
29
30
|
|
|
30
|
-
def __init__(self) -> None:
|
|
31
|
+
def __init__(self, config_overrides: dict[str, Any]) -> None:
|
|
31
32
|
assert self.__instance is None, 'Config is a singleton; use Config.get() to access the instance'
|
|
32
33
|
|
|
33
|
-
|
|
34
|
+
for var in config_overrides:
|
|
35
|
+
if var not in KNOWN_CONFIG_OVERRIDES:
|
|
36
|
+
raise excs.Error(f'Unrecognized configuration variable: {var}')
|
|
37
|
+
|
|
38
|
+
self.__config_overrides = config_overrides
|
|
39
|
+
|
|
40
|
+
self.__home = Path(self.lookup_env('pixeltable', 'home', str(Path.home() / '.pixeltable')))
|
|
34
41
|
if self.__home.exists() and not self.__home.is_dir():
|
|
35
|
-
raise
|
|
42
|
+
raise excs.Error(f'Not a directory: {self.__home}')
|
|
36
43
|
if not self.__home.exists():
|
|
37
44
|
print(f'Creating a Pixeltable instance at: {self.__home}')
|
|
38
45
|
self.__home.mkdir()
|
|
39
46
|
|
|
40
|
-
self.__config_file = Path(
|
|
47
|
+
self.__config_file = Path(self.lookup_env('pixeltable', 'config', str(self.__home / 'config.toml')))
|
|
41
48
|
|
|
42
49
|
self.__config_dict: dict[str, Any]
|
|
43
50
|
if os.path.isfile(self.__config_file):
|
|
@@ -46,6 +53,12 @@ class Config:
|
|
|
46
53
|
self.__config_dict = toml.load(stream)
|
|
47
54
|
except Exception as exc:
|
|
48
55
|
raise excs.Error(f'Could not read config file: {self.__config_file}') from exc
|
|
56
|
+
for section, section_dict in self.__config_dict.items():
|
|
57
|
+
if section not in KNOWN_CONFIG_OPTIONS:
|
|
58
|
+
raise excs.Error(f'Unrecognized section {section!r} in config file: {self.__config_file}')
|
|
59
|
+
for key in section_dict:
|
|
60
|
+
if key not in KNOWN_CONFIG_OPTIONS[section]:
|
|
61
|
+
raise excs.Error(f"Unrecognized option '{section}.{key}' in config file: {self.__config_file}")
|
|
49
62
|
else:
|
|
50
63
|
self.__config_dict = self.__create_default_config(self.__config_file)
|
|
51
64
|
with open(self.__config_file, 'w', encoding='utf-8') as stream:
|
|
@@ -65,10 +78,18 @@ class Config:
|
|
|
65
78
|
|
|
66
79
|
@classmethod
|
|
67
80
|
def get(cls) -> Config:
|
|
68
|
-
|
|
69
|
-
cls.__instance = cls()
|
|
81
|
+
cls.init({})
|
|
70
82
|
return cls.__instance
|
|
71
83
|
|
|
84
|
+
@classmethod
def init(cls, config_overrides: dict[str, Any]) -> None:
    """Create the singleton Config on first use.

    Args:
        config_overrides: override values handed to the constructor when the
            singleton does not exist yet.

    Raises:
        excs.Error: if the singleton already exists and `config_overrides` is non-empty.
    """
    if cls.__instance is None:
        # first call in this session: the overrides take effect
        cls.__instance = cls(config_overrides)
        return

    if len(config_overrides) == 0:
        # already initialized and nothing new requested: nothing to do
        return

    raise excs.Error(
        'Pixeltable has already been initialized; cannot specify new config values in the same session'
    )
|
|
92
|
+
|
|
72
93
|
@classmethod
|
|
73
94
|
def __create_default_config(cls, config_path: Path) -> dict[str, Any]:
|
|
74
95
|
free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
|
|
@@ -76,14 +97,23 @@ class Config:
|
|
|
76
97
|
file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
|
|
77
98
|
return {'pixeltable': {'file_cache_size_g': round(file_cache_size_g, 1), 'hide_warnings': False}}
|
|
78
99
|
|
|
79
|
-
def
|
|
100
|
+
def lookup_env(self, section: str, key: str, default: Any = None) -> Any:
|
|
101
|
+
override_var = f'{section}.{key}'
|
|
80
102
|
env_var = f'{section.upper()}_{key.upper()}'
|
|
103
|
+
if override_var in self.__config_overrides:
|
|
104
|
+
return self.__config_overrides[override_var]
|
|
81
105
|
if env_var in os.environ:
|
|
82
|
-
|
|
83
|
-
|
|
106
|
+
return os.environ[env_var]
|
|
107
|
+
return default
|
|
108
|
+
|
|
109
|
+
def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
|
|
110
|
+
value = self.lookup_env(section, key) # Try to get from environment first
|
|
111
|
+
# Next try the config file
|
|
112
|
+
if value is None and section in self.__config_dict and key in self.__config_dict[section]:
|
|
84
113
|
value = self.__config_dict[section][key]
|
|
85
|
-
|
|
86
|
-
|
|
114
|
+
|
|
115
|
+
if value is None:
|
|
116
|
+
return None # Not specified
|
|
87
117
|
|
|
88
118
|
try:
|
|
89
119
|
if expected_type is bool and isinstance(value, str):
|
|
@@ -91,7 +121,7 @@ class Config:
|
|
|
91
121
|
raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}')
|
|
92
122
|
return value.lower() == 'true' # type: ignore[return-value]
|
|
93
123
|
return expected_type(value) # type: ignore[call-arg]
|
|
94
|
-
except ValueError as exc:
|
|
124
|
+
except (ValueError, TypeError) as exc:
|
|
95
125
|
raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}') from exc
|
|
96
126
|
|
|
97
127
|
def get_string_value(self, key: str, section: str = 'pixeltable') -> Optional[str]:
|
|
@@ -105,3 +135,37 @@ class Config:
|
|
|
105
135
|
|
|
106
136
|
def get_bool_value(self, key: str, section: str = 'pixeltable') -> Optional[bool]:
|
|
107
137
|
return self.get_value(key, bool, section)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# Recognized configuration options: section name -> {option name -> human-readable description}.
KNOWN_CONFIG_OPTIONS = {
    'pixeltable': {
        'home': 'Path to the Pixeltable home directory',
        'config': 'Path to the Pixeltable config file',
        'pgdata': 'Path to the Pixeltable postgres data directory',
        'db': 'Postgres database name',
        'file_cache_size_g': 'Size of the file cache in GB',
        'time_zone': 'Default time zone for timestamps',
        'hide_warnings': 'Hide warnings from the console',
        'verbosity': 'Verbosity level for console output',
        'api_key': 'API key for Pixeltable cloud',
    },
    'anthropic': {
        'api_key': 'Anthropic API key',
    },
    'bedrock': {
        'api_key': 'AWS Bedrock API key',
    },
    'deepseek': {
        'api_key': 'Deepseek API key',
    },
    'fireworks': {
        'api_key': 'Fireworks API key',
    },
    'gemini': {
        'api_key': 'Gemini API key',
    },
    'groq': {
        'api_key': 'Groq API key',
    },
    'label_studio': {
        'api_key': 'Label Studio API key',
        'url': 'Label Studio server URL',
    },
    'mistral': {
        'api_key': 'Mistral API key',
    },
    'openai': {
        'api_key': 'OpenAI API key',
    },
    'replicate': {
        'api_token': 'Replicate API token',
    },
    'together': {
        'api_key': 'Together API key',
    },
    'pypi': {
        'api_key': 'PyPI API key (for internal use only)',
    },
}


# Flattened view of KNOWN_CONFIG_OPTIONS keyed by '<section>.<option>'.
KNOWN_CONFIG_OVERRIDES = {
    '.'.join((section, key)): info
    for section, section_dict in KNOWN_CONFIG_OPTIONS.items()
    for key, info in section_dict.items()
}
|
pixeltable/dataframe.py
CHANGED
|
@@ -15,7 +15,7 @@ import sqlalchemy as sql
|
|
|
15
15
|
|
|
16
16
|
from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
|
|
17
17
|
from pixeltable.catalog import Catalog, is_valid_identifier
|
|
18
|
-
from pixeltable.catalog.
|
|
18
|
+
from pixeltable.catalog.update_status import UpdateStatus
|
|
19
19
|
from pixeltable.env import Env
|
|
20
20
|
from pixeltable.plan import Planner, SampleClause
|
|
21
21
|
from pixeltable.type_system import ColumnType
|
pixeltable/env.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
3
4
|
import datetime
|
|
4
5
|
import glob
|
|
5
6
|
import http.server
|
|
@@ -22,6 +23,7 @@ from sys import stdout
|
|
|
22
23
|
from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
|
|
23
24
|
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
|
|
24
25
|
|
|
26
|
+
import nest_asyncio # type: ignore[import-untyped]
|
|
25
27
|
import pixeltable_pgserver
|
|
26
28
|
import sqlalchemy as sql
|
|
27
29
|
from pillow_heif import register_heif_opener # type: ignore[import-untyped]
|
|
@@ -85,6 +87,7 @@ class Env:
|
|
|
85
87
|
_current_conn: Optional[sql.Connection]
|
|
86
88
|
_current_session: Optional[sql.orm.Session]
|
|
87
89
|
_dbms: Optional[Dbms]
|
|
90
|
+
_event_loop: Optional[asyncio.AbstractEventLoop] # event loop for ExecNode
|
|
88
91
|
|
|
89
92
|
@classmethod
|
|
90
93
|
def get(cls) -> Env:
|
|
@@ -140,6 +143,32 @@ class Env:
|
|
|
140
143
|
self._current_conn = None
|
|
141
144
|
self._current_session = None
|
|
142
145
|
self._dbms = None
|
|
146
|
+
self._event_loop = None
|
|
147
|
+
|
|
148
|
+
def _init_event_loop(self) -> None:
|
|
149
|
+
try:
|
|
150
|
+
# check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
|
|
151
|
+
# multiple run_until_complete()
|
|
152
|
+
running_loop = asyncio.get_running_loop()
|
|
153
|
+
self._event_loop = running_loop
|
|
154
|
+
_logger.debug('Patched running loop')
|
|
155
|
+
except RuntimeError:
|
|
156
|
+
self._event_loop = asyncio.new_event_loop()
|
|
157
|
+
asyncio.set_event_loop(self._event_loop)
|
|
158
|
+
# we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
|
|
159
|
+
self._event_loop.slow_callback_duration = 3600
|
|
160
|
+
|
|
161
|
+
# always allow nested event loops, we need that to run async udfs synchronously (eg, for SimilarityExpr);
|
|
162
|
+
# see run_coroutine_synchronously()
|
|
163
|
+
nest_asyncio.apply()
|
|
164
|
+
if _logger.isEnabledFor(logging.DEBUG):
|
|
165
|
+
self._event_loop.set_debug(True)
|
|
166
|
+
|
|
167
|
+
@property
def event_loop(self) -> asyncio.AbstractEventLoop:
    """Return the cached event loop, calling `_init_event_loop()` first if none is set yet."""
    loop = self._event_loop
    if loop is None:
        # lazy initialization: _init_event_loop() is responsible for populating self._event_loop
        self._init_event_loop()
        loop = self._event_loop
    return loop
|
|
143
172
|
|
|
144
173
|
@property
|
|
145
174
|
def db_url(self) -> str:
|
pixeltable/exec/exec_node.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import abc
|
|
4
|
-
import asyncio
|
|
5
4
|
import logging
|
|
6
5
|
from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar
|
|
7
6
|
|
|
8
7
|
from pixeltable import exprs
|
|
8
|
+
from pixeltable.env import Env
|
|
9
9
|
|
|
10
10
|
from .data_row_batch import DataRowBatch
|
|
11
11
|
from .exec_context import ExecContext
|
|
@@ -59,26 +59,7 @@ class ExecNode(abc.ABC):
|
|
|
59
59
|
pass
|
|
60
60
|
|
|
61
61
|
def __iter__(self) -> Iterator[DataRowBatch]:
|
|
62
|
-
|
|
63
|
-
loop: asyncio.AbstractEventLoop
|
|
64
|
-
try:
|
|
65
|
-
# check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
|
|
66
|
-
# multiple run_until_complete()
|
|
67
|
-
running_loop = asyncio.get_running_loop()
|
|
68
|
-
import nest_asyncio # type: ignore[import-untyped]
|
|
69
|
-
|
|
70
|
-
nest_asyncio.apply()
|
|
71
|
-
loop = running_loop
|
|
72
|
-
_logger.debug('Patched running loop')
|
|
73
|
-
except RuntimeError:
|
|
74
|
-
loop = asyncio.new_event_loop()
|
|
75
|
-
asyncio.set_event_loop(loop)
|
|
76
|
-
# we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
|
|
77
|
-
loop.slow_callback_duration = 3600
|
|
78
|
-
|
|
79
|
-
if _logger.isEnabledFor(logging.DEBUG):
|
|
80
|
-
loop.set_debug(True)
|
|
81
|
-
|
|
62
|
+
loop = Env.get().event_loop
|
|
82
63
|
aiter = self.__aiter__()
|
|
83
64
|
try:
|
|
84
65
|
while True:
|
|
@@ -86,9 +67,11 @@ class ExecNode(abc.ABC):
|
|
|
86
67
|
yield batch
|
|
87
68
|
except StopAsyncIteration:
|
|
88
69
|
pass
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
70
|
+
# TODO:
|
|
71
|
+
# - we seem to have some tasks that aren't accounted for by ExprEvalNode and don't get cancelled by the time
|
|
72
|
+
# we end up here
|
|
73
|
+
# - however, blindly cancelling all pending tasks doesn't work when running in a jupyter environment, which
|
|
74
|
+
# creates tasks on its own
|
|
92
75
|
|
|
93
76
|
def open(self) -> None:
|
|
94
77
|
"""Bottom-up initialization of nodes for execution. Must be called before __next__."""
|
|
@@ -4,9 +4,10 @@ import asyncio
|
|
|
4
4
|
import datetime
|
|
5
5
|
import inspect
|
|
6
6
|
import logging
|
|
7
|
+
import re
|
|
7
8
|
import sys
|
|
8
9
|
import time
|
|
9
|
-
from typing import Awaitable, Collection, Optional
|
|
10
|
+
from typing import Any, Awaitable, Collection, Optional
|
|
10
11
|
|
|
11
12
|
from pixeltable import env, func
|
|
12
13
|
from pixeltable.config import Config
|
|
@@ -250,8 +251,20 @@ class RequestRateScheduler(Scheduler):
|
|
|
250
251
|
total_retried: int
|
|
251
252
|
|
|
252
253
|
TIME_FORMAT = '%H:%M.%S %f'
|
|
253
|
-
MAX_RETRIES =
|
|
254
|
+
MAX_RETRIES = 3
|
|
254
255
|
DEFAULT_RATE_LIMIT = 600 # requests per minute
|
|
256
|
+
RATE_LIMIT_INDICATORS = ('rate limit', 'too many requests', '429', 'quota exceeded', 'throttled', 'rate exceeded')
|
|
257
|
+
RETRY_AFTER_PATTERNS = (
|
|
258
|
+
r'retry after (\d+(?:\.\d+)?)\s*seconds?',
|
|
259
|
+
r'try again in (\d+(?:\.\d+)?)\s*seconds?',
|
|
260
|
+
r'wait (\d+(?:\.\d+)?)\s*seconds?',
|
|
261
|
+
r'retry-after:\s*(\d+(?:\.\d+)?)',
|
|
262
|
+
)
|
|
263
|
+
|
|
264
|
+
# Exponential backoff defaults
|
|
265
|
+
BASE_RETRY_DELAY = 1.0 # in seconds
|
|
266
|
+
MAX_RETRY_DELAY = 60.0 # in seconds
|
|
267
|
+
RETRY_BACKOFF_MULTIPLIER = 2.0
|
|
255
268
|
|
|
256
269
|
def __init__(self, resource_pool: str, dispatcher: Dispatcher):
|
|
257
270
|
super().__init__(resource_pool, dispatcher)
|
|
@@ -337,11 +350,12 @@ class RequestRateScheduler(Scheduler):
|
|
|
337
350
|
self.dispatcher.dispatch(request.rows, exec_ctx)
|
|
338
351
|
|
|
339
352
|
except Exception as exc:
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
353
|
+
_logger.debug(f'exception for {self.resource_pool}: type={type(exc)}\n{exc}')
|
|
354
|
+
is_rate_limit_error, retry_after = self._is_rate_limit_error(exc)
|
|
355
|
+
if is_rate_limit_error and num_retries < self.MAX_RETRIES:
|
|
356
|
+
retry_delay = self._compute_retry_delay(num_retries, retry_after)
|
|
357
|
+
_logger.debug(f'scheduler {self.resource_pool}: retrying after {retry_delay}')
|
|
358
|
+
await asyncio.sleep(retry_delay)
|
|
345
359
|
self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx))
|
|
346
360
|
return
|
|
347
361
|
|
|
@@ -358,6 +372,119 @@ class RequestRateScheduler(Scheduler):
|
|
|
358
372
|
if is_task:
|
|
359
373
|
self.num_in_flight -= 1
|
|
360
374
|
|
|
375
|
+
def _is_rate_limit_error(self, exc: Exception) -> tuple[bool, Optional[float]]:
|
|
376
|
+
"""Returns True if the exception indicates a rate limit error, and the retry delay in seconds."""
|
|
377
|
+
from http import HTTPStatus
|
|
378
|
+
|
|
379
|
+
# Check for HTTP status TOO_MANY_REQUESTS in various exception classes.
|
|
380
|
+
# We look for attributes that contain status codes, instead of checking the type of the exception,
|
|
381
|
+
# in order to handle a wider variety of exception classes.
|
|
382
|
+
is_rate_limit_error = False
|
|
383
|
+
retry_delay: Optional[float] = None
|
|
384
|
+
|
|
385
|
+
# requests.HTTPError/httpx.HTTPStatusError
|
|
386
|
+
if (
|
|
387
|
+
hasattr(exc, 'response')
|
|
388
|
+
and hasattr(exc.response, 'status_code')
|
|
389
|
+
and exc.response.status_code == HTTPStatus.TOO_MANY_REQUESTS.value
|
|
390
|
+
):
|
|
391
|
+
is_rate_limit_error = True
|
|
392
|
+
retry_delay = self._extract_retry_delay_from_headers(exc.response.headers)
|
|
393
|
+
elif (
|
|
394
|
+
# urllib.error.HTTPError
|
|
395
|
+
(hasattr(exc, 'code') and exc.code == HTTPStatus.TOO_MANY_REQUESTS.value)
|
|
396
|
+
# aiohttp.ClientResponseError
|
|
397
|
+
or (hasattr(exc, 'status') and exc.status == HTTPStatus.TOO_MANY_REQUESTS.value)
|
|
398
|
+
) and hasattr(exc, 'headers'):
|
|
399
|
+
is_rate_limit_error = True
|
|
400
|
+
retry_delay = self._extract_retry_delay_from_headers(exc.headers)
|
|
401
|
+
|
|
402
|
+
if is_rate_limit_error:
|
|
403
|
+
return True, retry_delay
|
|
404
|
+
|
|
405
|
+
# Check common rate limit keywords in exception message
|
|
406
|
+
error_msg = str(exc).lower()
|
|
407
|
+
if any(indicator in error_msg for indicator in self.RATE_LIMIT_INDICATORS):
|
|
408
|
+
retry_delay = self._extract_retry_delay_from_message(error_msg)
|
|
409
|
+
return True, retry_delay
|
|
410
|
+
|
|
411
|
+
return False, None
|
|
412
|
+
|
|
413
|
+
def _extract_retry_delay_from_headers(self, headers: Optional[Any]) -> Optional[float]:
|
|
414
|
+
"""Extract retry delay from HTTP headers."""
|
|
415
|
+
if headers is None:
|
|
416
|
+
return None
|
|
417
|
+
|
|
418
|
+
# convert headers to dict-like object for consistent access
|
|
419
|
+
header_dict: dict
|
|
420
|
+
if hasattr(headers, 'get'):
|
|
421
|
+
header_dict = headers
|
|
422
|
+
else:
|
|
423
|
+
# headers are a list of tuples or other format
|
|
424
|
+
try:
|
|
425
|
+
header_dict = dict(headers)
|
|
426
|
+
except (TypeError, ValueError):
|
|
427
|
+
return None
|
|
428
|
+
# normalize dict keys: lowercase and remove dashes
|
|
429
|
+
header_dict = {k.lower().replace('-', ''): v for k, v in header_dict.items()}
|
|
430
|
+
|
|
431
|
+
# check Retry-After header
|
|
432
|
+
retry_after = header_dict.get('retryafter')
|
|
433
|
+
if retry_after is not None:
|
|
434
|
+
try:
|
|
435
|
+
return float(retry_after)
|
|
436
|
+
except (ValueError, TypeError):
|
|
437
|
+
pass
|
|
438
|
+
|
|
439
|
+
# check X-RateLimit-Reset (Unix timestamp)
|
|
440
|
+
reset_time = header_dict.get('xratelimitreset')
|
|
441
|
+
if reset_time is not None:
|
|
442
|
+
try:
|
|
443
|
+
reset_timestamp = float(reset_time)
|
|
444
|
+
delay = max(0, reset_timestamp - time.time())
|
|
445
|
+
return delay
|
|
446
|
+
except (ValueError, TypeError):
|
|
447
|
+
pass
|
|
448
|
+
|
|
449
|
+
# check X-RateLimit-Reset-After (seconds from now)
|
|
450
|
+
reset_after = header_dict.get('xratelimitresetafter')
|
|
451
|
+
if reset_after is not None:
|
|
452
|
+
try:
|
|
453
|
+
return float(reset_after)
|
|
454
|
+
except (ValueError, TypeError):
|
|
455
|
+
pass
|
|
456
|
+
|
|
457
|
+
return None
|
|
458
|
+
|
|
459
|
+
def _extract_retry_delay_from_message(self, msg: str) -> Optional[float]:
|
|
460
|
+
msg_lower = msg.lower()
|
|
461
|
+
for pattern in self.RETRY_AFTER_PATTERNS:
|
|
462
|
+
match = re.search(pattern, msg_lower)
|
|
463
|
+
if match is not None:
|
|
464
|
+
try:
|
|
465
|
+
return float(match.group(1))
|
|
466
|
+
except (ValueError, TypeError):
|
|
467
|
+
continue
|
|
468
|
+
return None
|
|
469
|
+
|
|
470
|
+
def _compute_retry_delay(self, num_retries: int, retry_after: Optional[float] = None) -> float:
|
|
471
|
+
"""
|
|
472
|
+
Calculate exponential backoff delay for rate limit errors.
|
|
473
|
+
|
|
474
|
+
Args:
|
|
475
|
+
retry_count: Number of retries attempted (0-based)
|
|
476
|
+
retry_after: Suggested delay from Retry-After header
|
|
477
|
+
|
|
478
|
+
Returns:
|
|
479
|
+
Delay in seconds
|
|
480
|
+
"""
|
|
481
|
+
if retry_after is not None and retry_after > 0:
|
|
482
|
+
# Use server-suggested delay, but cap it at max_delay
|
|
483
|
+
return max(min(retry_after, self.MAX_RETRY_DELAY), self.BASE_RETRY_DELAY)
|
|
484
|
+
else:
|
|
485
|
+
delay = self.BASE_RETRY_DELAY * (self.RETRY_BACKOFF_MULTIPLIER**num_retries)
|
|
486
|
+
return max(min(delay, self.MAX_RETRY_DELAY), self.BASE_RETRY_DELAY)
|
|
487
|
+
|
|
361
488
|
|
|
362
489
|
# all concrete Scheduler subclasses that implement matches()
|
|
363
490
|
SCHEDULERS = [RateLimitsScheduler, RequestRateScheduler]
|
|
@@ -26,6 +26,7 @@ class ColumnPropertyRef(Expr):
|
|
|
26
26
|
ERRORMSG = 1
|
|
27
27
|
FILEURL = 2
|
|
28
28
|
LOCALPATH = 3
|
|
29
|
+
CELLMD = 4 # JSON metadata for the cell, e.g. errortype, errormsg for media columns
|
|
29
30
|
|
|
30
31
|
def __init__(self, col_ref: ColumnRef, prop: Property):
|
|
31
32
|
super().__init__(ts.StringType(nullable=True))
|
|
@@ -51,8 +52,8 @@ class ColumnPropertyRef(Expr):
|
|
|
51
52
|
def __repr__(self) -> str:
|
|
52
53
|
return f'{self._col_ref}.{self.prop.name.lower()}'
|
|
53
54
|
|
|
54
|
-
def
|
|
55
|
-
return self.prop in (self.Property.ERRORTYPE, self.Property.ERRORMSG)
|
|
55
|
+
def is_cellmd_prop(self) -> bool:
    """Return True if this property is one of ERRORTYPE, ERRORMSG or CELLMD."""
    props = self.Property
    cellmd_props = (props.ERRORTYPE, props.ERRORMSG, props.CELLMD)
    return self.prop in cellmd_props
|
|
56
57
|
|
|
57
58
|
def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
|
|
58
59
|
if not self._col_ref.col_handle.get().is_stored:
|
|
@@ -63,21 +64,27 @@ class ColumnPropertyRef(Expr):
|
|
|
63
64
|
if (
|
|
64
65
|
col.col_type.is_media_type()
|
|
65
66
|
and col.media_validation == catalog.MediaValidation.ON_READ
|
|
66
|
-
and self.
|
|
67
|
+
and self.is_cellmd_prop()
|
|
67
68
|
):
|
|
68
69
|
return None
|
|
69
70
|
|
|
70
71
|
if self.prop == self.Property.ERRORTYPE:
|
|
71
|
-
|
|
72
|
-
return col.sa_errortype_col
|
|
72
|
+
return col.sa_cellmd_col.op('->>')('errortype')
|
|
73
73
|
if self.prop == self.Property.ERRORMSG:
|
|
74
|
-
|
|
75
|
-
|
|
74
|
+
return col.sa_cellmd_col.op('->>')('errormsg')
|
|
75
|
+
if self.prop == self.Property.CELLMD:
|
|
76
|
+
assert col.sa_cellmd_col is not None
|
|
77
|
+
return col.sa_cellmd_col
|
|
76
78
|
if self.prop == self.Property.FILEURL:
|
|
77
79
|
# the file url is stored as the column value
|
|
78
80
|
return sql_elements.get(self._col_ref)
|
|
79
81
|
return None
|
|
80
82
|
|
|
83
|
+
@classmethod
def create_cellmd_exc(cls, exc: Exception) -> dict[str, str]:
    """Build a cellmd value describing an exception.

    Returns:
        A dict with 'errortype' (the exception's class name) and 'errormsg' (its string form).
    """
    exc_type_name = type(exc).__name__
    exc_text = str(exc)
    return {'errortype': exc_type_name, 'errormsg': exc_text}
|
|
87
|
+
|
|
81
88
|
def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
|
|
82
89
|
if self.prop == self.Property.FILEURL:
|
|
83
90
|
assert data_row.has_val[self._col_ref.slot_idx]
|
|
@@ -87,14 +94,19 @@ class ColumnPropertyRef(Expr):
|
|
|
87
94
|
assert data_row.has_val[self._col_ref.slot_idx]
|
|
88
95
|
data_row[self.slot_idx] = data_row.file_paths[self._col_ref.slot_idx]
|
|
89
96
|
return
|
|
90
|
-
elif self.
|
|
97
|
+
elif self.is_cellmd_prop():
|
|
91
98
|
exc = data_row.get_exc(self._col_ref.slot_idx)
|
|
92
99
|
if exc is None:
|
|
93
100
|
data_row[self.slot_idx] = None
|
|
94
101
|
elif self.prop == self.Property.ERRORTYPE:
|
|
95
102
|
data_row[self.slot_idx] = type(exc).__name__
|
|
96
|
-
|
|
103
|
+
elif self.prop == self.Property.ERRORMSG:
|
|
97
104
|
data_row[self.slot_idx] = str(exc)
|
|
105
|
+
elif self.prop == self.Property.CELLMD:
|
|
106
|
+
data_row[self.slot_idx] = self.create_cellmd_exc(exc)
|
|
107
|
+
else:
|
|
108
|
+
raise AssertionError(f'Unknown property {self.prop}')
|
|
109
|
+
return
|
|
98
110
|
else:
|
|
99
111
|
raise AssertionError()
|
|
100
112
|
|
pixeltable/exprs/column_ref.py
CHANGED
|
@@ -115,11 +115,15 @@ class ColumnRef(Expr):
|
|
|
115
115
|
from .column_property_ref import ColumnPropertyRef
|
|
116
116
|
|
|
117
117
|
# resolve column properties
|
|
118
|
+
if name == ColumnPropertyRef.Property.CELLMD.name.lower():
|
|
119
|
+
# This is not user accessible, but used internally to store cell metadata
|
|
120
|
+
return super().__getattr__(name)
|
|
121
|
+
|
|
118
122
|
if (
|
|
119
123
|
name == ColumnPropertyRef.Property.ERRORTYPE.name.lower()
|
|
120
124
|
or name == ColumnPropertyRef.Property.ERRORMSG.name.lower()
|
|
121
125
|
):
|
|
122
|
-
property_is_present = self.col.
|
|
126
|
+
property_is_present = self.col.stores_cellmd
|
|
123
127
|
if not property_is_present:
|
|
124
128
|
raise excs.Error(f'{name} only valid for a stored computed or media column: {self}')
|
|
125
129
|
return ColumnPropertyRef(self, ColumnPropertyRef.Property[name.upper()])
|
|
@@ -446,11 +446,11 @@ class FunctionCall(Expr):
|
|
|
446
446
|
dedent(
|
|
447
447
|
f"""
|
|
448
448
|
The UDF '{fn.self_path}' cannot be located, because
|
|
449
|
-
{{
|
|
449
|
+
{{error_msg}}
|
|
450
450
|
"""
|
|
451
451
|
)
|
|
452
452
|
.strip()
|
|
453
|
-
.format(
|
|
453
|
+
.format(error_msg=fn.error_msg)
|
|
454
454
|
)
|
|
455
455
|
return cls(fn, args, kwargs, return_type, is_method_call=is_method_call, validation_error=validation_error)
|
|
456
456
|
|
pixeltable/exprs/row_builder.py
CHANGED
|
@@ -209,7 +209,7 @@ class RowBuilder:
|
|
|
209
209
|
# this is input and therefore doesn't depend on other exprs
|
|
210
210
|
continue
|
|
211
211
|
# error properties don't have exceptions themselves
|
|
212
|
-
if isinstance(expr, ColumnPropertyRef) and expr.
|
|
212
|
+
if isinstance(expr, ColumnPropertyRef) and expr.is_cellmd_prop():
|
|
213
213
|
continue
|
|
214
214
|
dependency_idxs = [d.slot_idx for d in expr.dependencies()]
|
|
215
215
|
self.dependencies[expr.slot_idx, dependency_idxs] = True
|
|
@@ -444,6 +444,8 @@ class RowBuilder:
|
|
|
444
444
|
Return tuple[list of row values in `self.table_columns` order, # of exceptions]
|
|
445
445
|
This excludes system columns.
|
|
446
446
|
"""
|
|
447
|
+
from pixeltable.exprs.column_property_ref import ColumnPropertyRef
|
|
448
|
+
|
|
447
449
|
num_excs = 0
|
|
448
450
|
table_row: list[Any] = list(pk)
|
|
449
451
|
for info in self.table_columns:
|
|
@@ -454,9 +456,9 @@ class RowBuilder:
|
|
|
454
456
|
if cols_with_excs is not None:
|
|
455
457
|
cols_with_excs.add(col.id)
|
|
456
458
|
table_row.append(None)
|
|
457
|
-
if col.
|
|
458
|
-
# exceptions get stored in the errortype/-msg
|
|
459
|
-
table_row.
|
|
459
|
+
if col.stores_cellmd:
|
|
460
|
+
# exceptions get stored in the errortype/-msg properties of the cellmd column
|
|
461
|
+
table_row.append(ColumnPropertyRef.create_cellmd_exc(exc))
|
|
460
462
|
else:
|
|
461
463
|
if col.col_type.is_image_type() and data_row.file_urls[slot_idx] is None:
|
|
462
464
|
# we have yet to store this image
|
|
@@ -464,8 +466,8 @@ class RowBuilder:
|
|
|
464
466
|
data_row.flush_img(slot_idx, filepath)
|
|
465
467
|
val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
|
|
466
468
|
table_row.append(val)
|
|
467
|
-
if col.
|
|
468
|
-
table_row.
|
|
469
|
+
if col.stores_cellmd:
|
|
470
|
+
table_row.append(None) # placeholder for cellmd column
|
|
469
471
|
|
|
470
472
|
return table_row, num_excs
|
|
471
473
|
|
|
@@ -483,8 +485,7 @@ class RowBuilder:
|
|
|
483
485
|
if col.col.col_type.is_media_type():
|
|
484
486
|
media_cols[len(store_col_names)] = col.col
|
|
485
487
|
store_col_names.append(col.col.store_name())
|
|
486
|
-
if col.col.
|
|
487
|
-
store_col_names.append(col.col.
|
|
488
|
-
store_col_names.append(col.col.errormsg_store_name())
|
|
488
|
+
if col.col.stores_cellmd:
|
|
489
|
+
store_col_names.append(col.col.cellmd_store_name())
|
|
489
490
|
|
|
490
491
|
return store_col_names, media_cols
|
pixeltable/exprs/rowid_ref.py
CHANGED
|
@@ -105,10 +105,6 @@ class RowidRef(Expr):
|
|
|
105
105
|
assert self.rowid_component_idx <= len(rowid_cols), (
|
|
106
106
|
f'{self.rowid_component_idx} not consistent with {rowid_cols}'
|
|
107
107
|
)
|
|
108
|
-
# _logger.debug(
|
|
109
|
-
# f'RowidRef.sql_expr: tbl={tbl.id}{tbl.effective_version} sa_tbl={id(tbl.store_tbl.sa_tbl):x} '
|
|
110
|
-
# f'tv={id(tbl):x}'
|
|
111
|
-
# )
|
|
112
108
|
return rowid_cols[self.rowid_component_idx]
|
|
113
109
|
|
|
114
110
|
def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
|
pixeltable/func/function.py
CHANGED
|
@@ -504,12 +504,12 @@ class Function(ABC):
|
|
|
504
504
|
|
|
505
505
|
class InvalidFunction(Function):
|
|
506
506
|
fn_dict: dict[str, Any]
|
|
507
|
-
|
|
507
|
+
error_msg: str
|
|
508
508
|
|
|
509
|
-
def __init__(self, self_path: str, fn_dict: dict[str, Any],
|
|
509
|
+
def __init__(self, self_path: str, fn_dict: dict[str, Any], error_msg: str):
|
|
510
510
|
super().__init__([], self_path)
|
|
511
511
|
self.fn_dict = fn_dict
|
|
512
|
-
self.
|
|
512
|
+
self.error_msg = error_msg
|
|
513
513
|
|
|
514
514
|
def _as_dict(self) -> dict:
|
|
515
515
|
"""
|