pixeltable 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (47) hide show
  1. pixeltable/__init__.py +1 -0
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +2 -10
  4. pixeltable/catalog/catalog.py +64 -38
  5. pixeltable/catalog/column.py +22 -23
  6. pixeltable/catalog/globals.py +2 -148
  7. pixeltable/catalog/insertable_table.py +6 -4
  8. pixeltable/catalog/path.py +6 -0
  9. pixeltable/catalog/table.py +51 -32
  10. pixeltable/catalog/table_version.py +69 -45
  11. pixeltable/catalog/update_status.py +179 -0
  12. pixeltable/catalog/view.py +9 -2
  13. pixeltable/config.py +76 -12
  14. pixeltable/dataframe.py +1 -1
  15. pixeltable/env.py +29 -0
  16. pixeltable/exec/exec_node.py +7 -24
  17. pixeltable/exec/expr_eval/schedulers.py +134 -7
  18. pixeltable/exprs/column_property_ref.py +21 -9
  19. pixeltable/exprs/column_ref.py +5 -1
  20. pixeltable/exprs/function_call.py +2 -2
  21. pixeltable/exprs/row_builder.py +10 -9
  22. pixeltable/exprs/rowid_ref.py +0 -4
  23. pixeltable/func/function.py +3 -3
  24. pixeltable/functions/audio.py +36 -9
  25. pixeltable/functions/video.py +57 -10
  26. pixeltable/globals.py +61 -1
  27. pixeltable/io/__init__.py +1 -1
  28. pixeltable/io/external_store.py +3 -55
  29. pixeltable/io/globals.py +4 -4
  30. pixeltable/io/hf_datasets.py +10 -2
  31. pixeltable/io/label_studio.py +16 -16
  32. pixeltable/metadata/__init__.py +1 -1
  33. pixeltable/metadata/converters/convert_39.py +125 -0
  34. pixeltable/metadata/converters/util.py +3 -0
  35. pixeltable/metadata/notes.py +1 -0
  36. pixeltable/metadata/schema.py +14 -2
  37. pixeltable/plan.py +4 -0
  38. pixeltable/share/packager.py +20 -38
  39. pixeltable/store.py +18 -50
  40. pixeltable/type_system.py +2 -2
  41. pixeltable/utils/coroutine.py +6 -23
  42. pixeltable/utils/media_store.py +39 -0
  43. {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/METADATA +1 -1
  44. {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/RECORD +47 -45
  45. {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/LICENSE +0 -0
  46. {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/WHEEL +0 -0
  47. {pixeltable-0.4.2.dist-info → pixeltable-0.4.3.dist-info}/entry_points.txt +0 -0
@@ -17,11 +17,12 @@ if TYPE_CHECKING:
17
17
 
18
18
 
19
19
  from .column import Column
20
- from .globals import _POS_COLUMN_NAME, MediaValidation, UpdateStatus
20
+ from .globals import _POS_COLUMN_NAME, MediaValidation
21
21
  from .table import Table
22
22
  from .table_version import TableVersion
23
23
  from .table_version_handle import TableVersionHandle
24
24
  from .table_version_path import TableVersionPath
25
+ from .update_status import UpdateStatus
25
26
 
26
27
  if TYPE_CHECKING:
27
28
  from pixeltable.globals import TableDataSource
@@ -229,7 +230,10 @@ class View(Table):
229
230
 
230
231
  try:
231
232
  plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
232
- _, status = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
233
+ _, row_counts = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
234
+ status = UpdateStatus(row_count_stats=row_counts)
235
+ tbl_version._write_md_update_status(0, update_status=status)
236
+
233
237
  except:
234
238
  # we need to remove the orphaned TableVersion instance
235
239
  del catalog.Catalog.get()._tbl_versions[tbl_version.id, tbl_version.effective_version]
@@ -275,6 +279,9 @@ class View(Table):
275
279
  md = super()._get_metadata()
276
280
  md['is_view'] = True
277
281
  md['is_snapshot'] = self._tbl_version_path.is_snapshot()
282
+ base_tbl = self._get_base_table()
283
+ base_version = self._effective_base_versions[0]
284
+ md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
278
285
  return md
279
286
 
280
287
  def insert(
pixeltable/config.py CHANGED
@@ -25,19 +25,26 @@ class Config:
25
25
 
26
26
  __home: Path
27
27
  __config_file: Path
28
+ __config_overrides: dict[str, Any]
28
29
  __config_dict: dict[str, Any]
29
30
 
30
- def __init__(self) -> None:
31
+ def __init__(self, config_overrides: dict[str, Any]) -> None:
31
32
  assert self.__instance is None, 'Config is a singleton; use Config.get() to access the instance'
32
33
 
33
- self.__home = Path(os.environ.get('PIXELTABLE_HOME', str(Path.home() / '.pixeltable')))
34
+ for var in config_overrides:
35
+ if var not in KNOWN_CONFIG_OVERRIDES:
36
+ raise excs.Error(f'Unrecognized configuration variable: {var}')
37
+
38
+ self.__config_overrides = config_overrides
39
+
40
+ self.__home = Path(self.lookup_env('pixeltable', 'home', str(Path.home() / '.pixeltable')))
34
41
  if self.__home.exists() and not self.__home.is_dir():
35
- raise RuntimeError(f'{self.__home} is not a directory')
42
+ raise excs.Error(f'Not a directory: {self.__home}')
36
43
  if not self.__home.exists():
37
44
  print(f'Creating a Pixeltable instance at: {self.__home}')
38
45
  self.__home.mkdir()
39
46
 
40
- self.__config_file = Path(os.environ.get('PIXELTABLE_CONFIG', str(self.__home / 'config.toml')))
47
+ self.__config_file = Path(self.lookup_env('pixeltable', 'config', str(self.__home / 'config.toml')))
41
48
 
42
49
  self.__config_dict: dict[str, Any]
43
50
  if os.path.isfile(self.__config_file):
@@ -46,6 +53,12 @@ class Config:
46
53
  self.__config_dict = toml.load(stream)
47
54
  except Exception as exc:
48
55
  raise excs.Error(f'Could not read config file: {self.__config_file}') from exc
56
+ for section, section_dict in self.__config_dict.items():
57
+ if section not in KNOWN_CONFIG_OPTIONS:
58
+ raise excs.Error(f'Unrecognized section {section!r} in config file: {self.__config_file}')
59
+ for key in section_dict:
60
+ if key not in KNOWN_CONFIG_OPTIONS[section]:
61
+ raise excs.Error(f"Unrecognized option '{section}.{key}' in config file: {self.__config_file}")
49
62
  else:
50
63
  self.__config_dict = self.__create_default_config(self.__config_file)
51
64
  with open(self.__config_file, 'w', encoding='utf-8') as stream:
@@ -65,10 +78,18 @@ class Config:
65
78
 
66
79
  @classmethod
67
80
  def get(cls) -> Config:
68
- if cls.__instance is None:
69
- cls.__instance = cls()
81
+ cls.init({})
70
82
  return cls.__instance
71
83
 
84
+ @classmethod
85
+ def init(cls, config_overrides: dict[str, Any]) -> None:
86
+ if cls.__instance is None:
87
+ cls.__instance = cls(config_overrides)
88
+ elif len(config_overrides) > 0:
89
+ raise excs.Error(
90
+ 'Pixeltable has already been initialized; cannot specify new config values in the same session'
91
+ )
92
+
72
93
  @classmethod
73
94
  def __create_default_config(cls, config_path: Path) -> dict[str, Any]:
74
95
  free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
@@ -76,14 +97,23 @@ class Config:
76
97
  file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
77
98
  return {'pixeltable': {'file_cache_size_g': round(file_cache_size_g, 1), 'hide_warnings': False}}
78
99
 
79
- def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
100
+ def lookup_env(self, section: str, key: str, default: Any = None) -> Any:
101
+ override_var = f'{section}.{key}'
80
102
  env_var = f'{section.upper()}_{key.upper()}'
103
+ if override_var in self.__config_overrides:
104
+ return self.__config_overrides[override_var]
81
105
  if env_var in os.environ:
82
- value = os.environ[env_var]
83
- elif section in self.__config_dict and key in self.__config_dict[section]:
106
+ return os.environ[env_var]
107
+ return default
108
+
109
+ def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
110
+ value = self.lookup_env(section, key) # Try to get from environment first
111
+ # Next try the config file
112
+ if value is None and section in self.__config_dict and key in self.__config_dict[section]:
84
113
  value = self.__config_dict[section][key]
85
- else:
86
- return None
114
+
115
+ if value is None:
116
+ return None # Not specified
87
117
 
88
118
  try:
89
119
  if expected_type is bool and isinstance(value, str):
@@ -91,7 +121,7 @@ class Config:
91
121
  raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}')
92
122
  return value.lower() == 'true' # type: ignore[return-value]
93
123
  return expected_type(value) # type: ignore[call-arg]
94
- except ValueError as exc:
124
+ except (ValueError, TypeError) as exc:
95
125
  raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}') from exc
96
126
 
97
127
  def get_string_value(self, key: str, section: str = 'pixeltable') -> Optional[str]:
@@ -105,3 +135,37 @@ class Config:
105
135
 
106
136
  def get_bool_value(self, key: str, section: str = 'pixeltable') -> Optional[bool]:
107
137
  return self.get_value(key, bool, section)
138
+
139
+
140
+ KNOWN_CONFIG_OPTIONS = {
141
+ 'pixeltable': {
142
+ 'home': 'Path to the Pixeltable home directory',
143
+ 'config': 'Path to the Pixeltable config file',
144
+ 'pgdata': 'Path to the Pixeltable postgres data directory',
145
+ 'db': 'Postgres database name',
146
+ 'file_cache_size_g': 'Size of the file cache in GB',
147
+ 'time_zone': 'Default time zone for timestamps',
148
+ 'hide_warnings': 'Hide warnings from the console',
149
+ 'verbosity': 'Verbosity level for console output',
150
+ 'api_key': 'API key for Pixeltable cloud',
151
+ },
152
+ 'anthropic': {'api_key': 'Anthropic API key'},
153
+ 'bedrock': {'api_key': 'AWS Bedrock API key'},
154
+ 'deepseek': {'api_key': 'Deepseek API key'},
155
+ 'fireworks': {'api_key': 'Fireworks API key'},
156
+ 'gemini': {'api_key': 'Gemini API key'},
157
+ 'groq': {'api_key': 'Groq API key'},
158
+ 'label_studio': {'api_key': 'Label Studio API key', 'url': 'Label Studio server URL'},
159
+ 'mistral': {'api_key': 'Mistral API key'},
160
+ 'openai': {'api_key': 'OpenAI API key'},
161
+ 'replicate': {'api_token': 'Replicate API token'},
162
+ 'together': {'api_key': 'Together API key'},
163
+ 'pypi': {'api_key': 'PyPI API key (for internal use only)'},
164
+ }
165
+
166
+
167
+ KNOWN_CONFIG_OVERRIDES = {
168
+ f'{section}.{key}': info
169
+ for section, section_dict in KNOWN_CONFIG_OPTIONS.items()
170
+ for key, info in section_dict.items()
171
+ }
pixeltable/dataframe.py CHANGED
@@ -15,7 +15,7 @@ import sqlalchemy as sql
15
15
 
16
16
  from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
17
17
  from pixeltable.catalog import Catalog, is_valid_identifier
18
- from pixeltable.catalog.globals import UpdateStatus
18
+ from pixeltable.catalog.update_status import UpdateStatus
19
19
  from pixeltable.env import Env
20
20
  from pixeltable.plan import Planner, SampleClause
21
21
  from pixeltable.type_system import ColumnType
pixeltable/env.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import asyncio
3
4
  import datetime
4
5
  import glob
5
6
  import http.server
@@ -22,6 +23,7 @@ from sys import stdout
22
23
  from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
23
24
  from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
24
25
 
26
+ import nest_asyncio # type: ignore[import-untyped]
25
27
  import pixeltable_pgserver
26
28
  import sqlalchemy as sql
27
29
  from pillow_heif import register_heif_opener # type: ignore[import-untyped]
@@ -85,6 +87,7 @@ class Env:
85
87
  _current_conn: Optional[sql.Connection]
86
88
  _current_session: Optional[sql.orm.Session]
87
89
  _dbms: Optional[Dbms]
90
+ _event_loop: Optional[asyncio.AbstractEventLoop] # event loop for ExecNode
88
91
 
89
92
  @classmethod
90
93
  def get(cls) -> Env:
@@ -140,6 +143,32 @@ class Env:
140
143
  self._current_conn = None
141
144
  self._current_session = None
142
145
  self._dbms = None
146
+ self._event_loop = None
147
+
148
+ def _init_event_loop(self) -> None:
149
+ try:
150
+ # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
151
+ # multiple run_until_complete()
152
+ running_loop = asyncio.get_running_loop()
153
+ self._event_loop = running_loop
154
+ _logger.debug('Patched running loop')
155
+ except RuntimeError:
156
+ self._event_loop = asyncio.new_event_loop()
157
+ asyncio.set_event_loop(self._event_loop)
158
+ # we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
159
+ self._event_loop.slow_callback_duration = 3600
160
+
161
+ # always allow nested event loops, we need that to run async udfs synchronously (eg, for SimilarityExpr);
162
+ # see run_coroutine_synchronously()
163
+ nest_asyncio.apply()
164
+ if _logger.isEnabledFor(logging.DEBUG):
165
+ self._event_loop.set_debug(True)
166
+
167
+ @property
168
+ def event_loop(self) -> asyncio.AbstractEventLoop:
169
+ if self._event_loop is None:
170
+ self._init_event_loop()
171
+ return self._event_loop
143
172
 
144
173
  @property
145
174
  def db_url(self) -> str:
@@ -1,11 +1,11 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import abc
4
- import asyncio
5
4
  import logging
6
5
  from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar
7
6
 
8
7
  from pixeltable import exprs
8
+ from pixeltable.env import Env
9
9
 
10
10
  from .data_row_batch import DataRowBatch
11
11
  from .exec_context import ExecContext
@@ -59,26 +59,7 @@ class ExecNode(abc.ABC):
59
59
  pass
60
60
 
61
61
  def __iter__(self) -> Iterator[DataRowBatch]:
62
- running_loop: Optional[asyncio.AbstractEventLoop] = None
63
- loop: asyncio.AbstractEventLoop
64
- try:
65
- # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
66
- # multiple run_until_complete()
67
- running_loop = asyncio.get_running_loop()
68
- import nest_asyncio # type: ignore[import-untyped]
69
-
70
- nest_asyncio.apply()
71
- loop = running_loop
72
- _logger.debug('Patched running loop')
73
- except RuntimeError:
74
- loop = asyncio.new_event_loop()
75
- asyncio.set_event_loop(loop)
76
- # we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
77
- loop.slow_callback_duration = 3600
78
-
79
- if _logger.isEnabledFor(logging.DEBUG):
80
- loop.set_debug(True)
81
-
62
+ loop = Env.get().event_loop
82
63
  aiter = self.__aiter__()
83
64
  try:
84
65
  while True:
@@ -86,9 +67,11 @@ class ExecNode(abc.ABC):
86
67
  yield batch
87
68
  except StopAsyncIteration:
88
69
  pass
89
- finally:
90
- if loop != running_loop:
91
- loop.close()
70
+ # TODO:
71
+ # - we seem to have some tasks that aren't accounted for by ExprEvalNode and don't get cancelled by the time
72
+ # we end up here
73
+ # - however, blindly cancelling all pending tasks doesn't work when running in a jupyter environment, which
74
+ # creates tasks on its own
92
75
 
93
76
  def open(self) -> None:
94
77
  """Bottom-up initialization of nodes for execution. Must be called before __next__."""
@@ -4,9 +4,10 @@ import asyncio
4
4
  import datetime
5
5
  import inspect
6
6
  import logging
7
+ import re
7
8
  import sys
8
9
  import time
9
- from typing import Awaitable, Collection, Optional
10
+ from typing import Any, Awaitable, Collection, Optional
10
11
 
11
12
  from pixeltable import env, func
12
13
  from pixeltable.config import Config
@@ -250,8 +251,20 @@ class RequestRateScheduler(Scheduler):
250
251
  total_retried: int
251
252
 
252
253
  TIME_FORMAT = '%H:%M.%S %f'
253
- MAX_RETRIES = 10
254
+ MAX_RETRIES = 3
254
255
  DEFAULT_RATE_LIMIT = 600 # requests per minute
256
+ RATE_LIMIT_INDICATORS = ('rate limit', 'too many requests', '429', 'quota exceeded', 'throttled', 'rate exceeded')
257
+ RETRY_AFTER_PATTERNS = (
258
+ r'retry after (\d+(?:\.\d+)?)\s*seconds?',
259
+ r'try again in (\d+(?:\.\d+)?)\s*seconds?',
260
+ r'wait (\d+(?:\.\d+)?)\s*seconds?',
261
+ r'retry-after:\s*(\d+(?:\.\d+)?)',
262
+ )
263
+
264
+ # Exponential backoff defaults
265
+ BASE_RETRY_DELAY = 1.0 # in seconds
266
+ MAX_RETRY_DELAY = 60.0 # in seconds
267
+ RETRY_BACKOFF_MULTIPLIER = 2.0
255
268
 
256
269
  def __init__(self, resource_pool: str, dispatcher: Dispatcher):
257
270
  super().__init__(resource_pool, dispatcher)
@@ -337,11 +350,12 @@ class RequestRateScheduler(Scheduler):
337
350
  self.dispatcher.dispatch(request.rows, exec_ctx)
338
351
 
339
352
  except Exception as exc:
340
- # TODO: which exception can be retried?
341
- _logger.debug(f'exception for {self.resource_pool}: {exc}')
342
- status = getattr(exc, 'status', None)
343
- _logger.debug(f'type={type(exc)} has_status={hasattr(exc, "status")} status={status}')
344
- if num_retries < self.MAX_RETRIES:
353
+ _logger.debug(f'exception for {self.resource_pool}: type={type(exc)}\n{exc}')
354
+ is_rate_limit_error, retry_after = self._is_rate_limit_error(exc)
355
+ if is_rate_limit_error and num_retries < self.MAX_RETRIES:
356
+ retry_delay = self._compute_retry_delay(num_retries, retry_after)
357
+ _logger.debug(f'scheduler {self.resource_pool}: retrying after {retry_delay}')
358
+ await asyncio.sleep(retry_delay)
345
359
  self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx))
346
360
  return
347
361
 
@@ -358,6 +372,119 @@ class RequestRateScheduler(Scheduler):
358
372
  if is_task:
359
373
  self.num_in_flight -= 1
360
374
 
375
+ def _is_rate_limit_error(self, exc: Exception) -> tuple[bool, Optional[float]]:
376
+ """Returns True if the exception indicates a rate limit error, and the retry delay in seconds."""
377
+ from http import HTTPStatus
378
+
379
+ # Check for HTTP status TOO_MANY_REQUESTS in various exception classes.
380
+ # We look for attributes that contain status codes, instead of checking the type of the exception,
381
+ # in order to handle a wider variety of exception classes.
382
+ is_rate_limit_error = False
383
+ retry_delay: Optional[float] = None
384
+
385
+ # requests.HTTPError/httpx.HTTPStatusError
386
+ if (
387
+ hasattr(exc, 'response')
388
+ and hasattr(exc.response, 'status_code')
389
+ and exc.response.status_code == HTTPStatus.TOO_MANY_REQUESTS.value
390
+ ):
391
+ is_rate_limit_error = True
392
+ retry_delay = self._extract_retry_delay_from_headers(exc.response.headers)
393
+ elif (
394
+ # urllib.error.HTTPError
395
+ (hasattr(exc, 'code') and exc.code == HTTPStatus.TOO_MANY_REQUESTS.value)
396
+ # aiohttp.ClientResponseError
397
+ or (hasattr(exc, 'status') and exc.status == HTTPStatus.TOO_MANY_REQUESTS.value)
398
+ ) and hasattr(exc, 'headers'):
399
+ is_rate_limit_error = True
400
+ retry_delay = self._extract_retry_delay_from_headers(exc.headers)
401
+
402
+ if is_rate_limit_error:
403
+ return True, retry_delay
404
+
405
+ # Check common rate limit keywords in exception message
406
+ error_msg = str(exc).lower()
407
+ if any(indicator in error_msg for indicator in self.RATE_LIMIT_INDICATORS):
408
+ retry_delay = self._extract_retry_delay_from_message(error_msg)
409
+ return True, retry_delay
410
+
411
+ return False, None
412
+
413
+ def _extract_retry_delay_from_headers(self, headers: Optional[Any]) -> Optional[float]:
414
+ """Extract retry delay from HTTP headers."""
415
+ if headers is None:
416
+ return None
417
+
418
+ # convert headers to dict-like object for consistent access
419
+ header_dict: dict
420
+ if hasattr(headers, 'get'):
421
+ header_dict = headers
422
+ else:
423
+ # headers are a list of tuples or other format
424
+ try:
425
+ header_dict = dict(headers)
426
+ except (TypeError, ValueError):
427
+ return None
428
+ # normalize dict keys: lowercase and remove dashes
429
+ header_dict = {k.lower().replace('-', ''): v for k, v in header_dict.items()}
430
+
431
+ # check Retry-After header
432
+ retry_after = header_dict.get('retryafter')
433
+ if retry_after is not None:
434
+ try:
435
+ return float(retry_after)
436
+ except (ValueError, TypeError):
437
+ pass
438
+
439
+ # check X-RateLimit-Reset (Unix timestamp)
440
+ reset_time = header_dict.get('xratelimitreset')
441
+ if reset_time is not None:
442
+ try:
443
+ reset_timestamp = float(reset_time)
444
+ delay = max(0, reset_timestamp - time.time())
445
+ return delay
446
+ except (ValueError, TypeError):
447
+ pass
448
+
449
+ # check X-RateLimit-Reset-After (seconds from now)
450
+ reset_after = header_dict.get('xratelimitresetafter')
451
+ if reset_after is not None:
452
+ try:
453
+ return float(reset_after)
454
+ except (ValueError, TypeError):
455
+ pass
456
+
457
+ return None
458
+
459
+ def _extract_retry_delay_from_message(self, msg: str) -> Optional[float]:
460
+ msg_lower = msg.lower()
461
+ for pattern in self.RETRY_AFTER_PATTERNS:
462
+ match = re.search(pattern, msg_lower)
463
+ if match is not None:
464
+ try:
465
+ return float(match.group(1))
466
+ except (ValueError, TypeError):
467
+ continue
468
+ return None
469
+
470
+ def _compute_retry_delay(self, num_retries: int, retry_after: Optional[float] = None) -> float:
471
+ """
472
+ Calculate exponential backoff delay for rate limit errors.
473
+
474
+ Args:
475
+ retry_count: Number of retries attempted (0-based)
476
+ retry_after: Suggested delay from Retry-After header
477
+
478
+ Returns:
479
+ Delay in seconds
480
+ """
481
+ if retry_after is not None and retry_after > 0:
482
+ # Use server-suggested delay, but cap it at max_delay
483
+ return max(min(retry_after, self.MAX_RETRY_DELAY), self.BASE_RETRY_DELAY)
484
+ else:
485
+ delay = self.BASE_RETRY_DELAY * (self.RETRY_BACKOFF_MULTIPLIER**num_retries)
486
+ return max(min(delay, self.MAX_RETRY_DELAY), self.BASE_RETRY_DELAY)
487
+
361
488
 
362
489
  # all concrete Scheduler subclasses that implement matches()
363
490
  SCHEDULERS = [RateLimitsScheduler, RequestRateScheduler]
@@ -26,6 +26,7 @@ class ColumnPropertyRef(Expr):
26
26
  ERRORMSG = 1
27
27
  FILEURL = 2
28
28
  LOCALPATH = 3
29
+ CELLMD = 4 # JSON metadata for the cell, e.g. errortype, errormsg for media columns
29
30
 
30
31
  def __init__(self, col_ref: ColumnRef, prop: Property):
31
32
  super().__init__(ts.StringType(nullable=True))
@@ -51,8 +52,8 @@ class ColumnPropertyRef(Expr):
51
52
  def __repr__(self) -> str:
52
53
  return f'{self._col_ref}.{self.prop.name.lower()}'
53
54
 
54
- def is_error_prop(self) -> bool:
55
- return self.prop in (self.Property.ERRORTYPE, self.Property.ERRORMSG)
55
+ def is_cellmd_prop(self) -> bool:
56
+ return self.prop in (self.Property.ERRORTYPE, self.Property.ERRORMSG, self.Property.CELLMD)
56
57
 
57
58
  def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
58
59
  if not self._col_ref.col_handle.get().is_stored:
@@ -63,21 +64,27 @@ class ColumnPropertyRef(Expr):
63
64
  if (
64
65
  col.col_type.is_media_type()
65
66
  and col.media_validation == catalog.MediaValidation.ON_READ
66
- and self.is_error_prop()
67
+ and self.is_cellmd_prop()
67
68
  ):
68
69
  return None
69
70
 
70
71
  if self.prop == self.Property.ERRORTYPE:
71
- assert col.sa_errortype_col is not None
72
- return col.sa_errortype_col
72
+ return col.sa_cellmd_col.op('->>')('errortype')
73
73
  if self.prop == self.Property.ERRORMSG:
74
- assert col.sa_errormsg_col is not None
75
- return col.sa_errormsg_col
74
+ return col.sa_cellmd_col.op('->>')('errormsg')
75
+ if self.prop == self.Property.CELLMD:
76
+ assert col.sa_cellmd_col is not None
77
+ return col.sa_cellmd_col
76
78
  if self.prop == self.Property.FILEURL:
77
79
  # the file url is stored as the column value
78
80
  return sql_elements.get(self._col_ref)
79
81
  return None
80
82
 
83
+ @classmethod
84
+ def create_cellmd_exc(cls, exc: Exception) -> dict[str, str]:
85
+ """Create a cellmd value from an exception."""
86
+ return {'errortype': type(exc).__name__, 'errormsg': str(exc)}
87
+
81
88
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
82
89
  if self.prop == self.Property.FILEURL:
83
90
  assert data_row.has_val[self._col_ref.slot_idx]
@@ -87,14 +94,19 @@ class ColumnPropertyRef(Expr):
87
94
  assert data_row.has_val[self._col_ref.slot_idx]
88
95
  data_row[self.slot_idx] = data_row.file_paths[self._col_ref.slot_idx]
89
96
  return
90
- elif self.is_error_prop():
97
+ elif self.is_cellmd_prop():
91
98
  exc = data_row.get_exc(self._col_ref.slot_idx)
92
99
  if exc is None:
93
100
  data_row[self.slot_idx] = None
94
101
  elif self.prop == self.Property.ERRORTYPE:
95
102
  data_row[self.slot_idx] = type(exc).__name__
96
- else:
103
+ elif self.prop == self.Property.ERRORMSG:
97
104
  data_row[self.slot_idx] = str(exc)
105
+ elif self.prop == self.Property.CELLMD:
106
+ data_row[self.slot_idx] = self.create_cellmd_exc(exc)
107
+ else:
108
+ raise AssertionError(f'Unknown property {self.prop}')
109
+ return
98
110
  else:
99
111
  raise AssertionError()
100
112
 
@@ -115,11 +115,15 @@ class ColumnRef(Expr):
115
115
  from .column_property_ref import ColumnPropertyRef
116
116
 
117
117
  # resolve column properties
118
+ if name == ColumnPropertyRef.Property.CELLMD.name.lower():
119
+ # This is not user accessible, but used internally to store cell metadata
120
+ return super().__getattr__(name)
121
+
118
122
  if (
119
123
  name == ColumnPropertyRef.Property.ERRORTYPE.name.lower()
120
124
  or name == ColumnPropertyRef.Property.ERRORMSG.name.lower()
121
125
  ):
122
- property_is_present = self.col.is_stored and (self.col.is_computed or self.col_type.is_media_type())
126
+ property_is_present = self.col.stores_cellmd
123
127
  if not property_is_present:
124
128
  raise excs.Error(f'{name} only valid for a stored computed or media column: {self}')
125
129
  return ColumnPropertyRef(self, ColumnPropertyRef.Property[name.upper()])
@@ -446,11 +446,11 @@ class FunctionCall(Expr):
446
446
  dedent(
447
447
  f"""
448
448
  The UDF '{fn.self_path}' cannot be located, because
449
- {{errormsg}}
449
+ {{error_msg}}
450
450
  """
451
451
  )
452
452
  .strip()
453
- .format(errormsg=fn.errormsg)
453
+ .format(error_msg=fn.error_msg)
454
454
  )
455
455
  return cls(fn, args, kwargs, return_type, is_method_call=is_method_call, validation_error=validation_error)
456
456
 
@@ -209,7 +209,7 @@ class RowBuilder:
209
209
  # this is input and therefore doesn't depend on other exprs
210
210
  continue
211
211
  # error properties don't have exceptions themselves
212
- if isinstance(expr, ColumnPropertyRef) and expr.is_error_prop():
212
+ if isinstance(expr, ColumnPropertyRef) and expr.is_cellmd_prop():
213
213
  continue
214
214
  dependency_idxs = [d.slot_idx for d in expr.dependencies()]
215
215
  self.dependencies[expr.slot_idx, dependency_idxs] = True
@@ -444,6 +444,8 @@ class RowBuilder:
444
444
  Return tuple[list of row values in `self.table_columns` order, # of exceptions]
445
445
  This excludes system columns.
446
446
  """
447
+ from pixeltable.exprs.column_property_ref import ColumnPropertyRef
448
+
447
449
  num_excs = 0
448
450
  table_row: list[Any] = list(pk)
449
451
  for info in self.table_columns:
@@ -454,9 +456,9 @@ class RowBuilder:
454
456
  if cols_with_excs is not None:
455
457
  cols_with_excs.add(col.id)
456
458
  table_row.append(None)
457
- if col.records_errors:
458
- # exceptions get stored in the errortype/-msg columns
459
- table_row.extend((type(exc).__name__, str(exc)))
459
+ if col.stores_cellmd:
460
+ # exceptions get stored in the errortype/-msg properties of the cellmd column
461
+ table_row.append(ColumnPropertyRef.create_cellmd_exc(exc))
460
462
  else:
461
463
  if col.col_type.is_image_type() and data_row.file_urls[slot_idx] is None:
462
464
  # we have yet to store this image
@@ -464,8 +466,8 @@ class RowBuilder:
464
466
  data_row.flush_img(slot_idx, filepath)
465
467
  val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
466
468
  table_row.append(val)
467
- if col.records_errors:
468
- table_row.extend((None, None))
469
+ if col.stores_cellmd:
470
+ table_row.append(None) # placeholder for cellmd column
469
471
 
470
472
  return table_row, num_excs
471
473
 
@@ -483,8 +485,7 @@ class RowBuilder:
483
485
  if col.col.col_type.is_media_type():
484
486
  media_cols[len(store_col_names)] = col.col
485
487
  store_col_names.append(col.col.store_name())
486
- if col.col.records_errors:
487
- store_col_names.append(col.col.errortype_store_name())
488
- store_col_names.append(col.col.errormsg_store_name())
488
+ if col.col.stores_cellmd:
489
+ store_col_names.append(col.col.cellmd_store_name())
489
490
 
490
491
  return store_col_names, media_cols
@@ -105,10 +105,6 @@ class RowidRef(Expr):
105
105
  assert self.rowid_component_idx <= len(rowid_cols), (
106
106
  f'{self.rowid_component_idx} not consistent with {rowid_cols}'
107
107
  )
108
- # _logger.debug(
109
- # f'RowidRef.sql_expr: tbl={tbl.id}{tbl.effective_version} sa_tbl={id(tbl.store_tbl.sa_tbl):x} '
110
- # f'tv={id(tbl):x}'
111
- # )
112
108
  return rowid_cols[self.rowid_component_idx]
113
109
 
114
110
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
@@ -504,12 +504,12 @@ class Function(ABC):
504
504
 
505
505
  class InvalidFunction(Function):
506
506
  fn_dict: dict[str, Any]
507
- errormsg: str
507
+ error_msg: str
508
508
 
509
- def __init__(self, self_path: str, fn_dict: dict[str, Any], errormsg: str):
509
+ def __init__(self, self_path: str, fn_dict: dict[str, Any], error_msg: str):
510
510
  super().__init__([], self_path)
511
511
  self.fn_dict = fn_dict
512
- self.errormsg = errormsg
512
+ self.error_msg = error_msg
513
513
 
514
514
  def _as_dict(self) -> dict:
515
515
  """