pixeltable 0.4.7__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic.

Files changed (50)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/catalog/catalog.py +4 -6
  3. pixeltable/catalog/insertable_table.py +125 -28
  4. pixeltable/catalog/table.py +51 -15
  5. pixeltable/catalog/table_version.py +12 -8
  6. pixeltable/catalog/table_version_path.py +6 -5
  7. pixeltable/config.py +25 -9
  8. pixeltable/dataframe.py +3 -3
  9. pixeltable/env.py +89 -20
  10. pixeltable/exec/aggregation_node.py +1 -1
  11. pixeltable/exec/cache_prefetch_node.py +4 -3
  12. pixeltable/exec/exec_node.py +0 -8
  13. pixeltable/exec/expr_eval/globals.py +1 -0
  14. pixeltable/exec/expr_eval/schedulers.py +16 -4
  15. pixeltable/exec/in_memory_data_node.py +2 -3
  16. pixeltable/exprs/data_row.py +5 -5
  17. pixeltable/exprs/function_call.py +59 -21
  18. pixeltable/exprs/row_builder.py +11 -5
  19. pixeltable/func/expr_template_function.py +6 -3
  20. pixeltable/functions/__init__.py +2 -0
  21. pixeltable/functions/anthropic.py +1 -2
  22. pixeltable/functions/deepseek.py +5 -1
  23. pixeltable/functions/gemini.py +11 -2
  24. pixeltable/functions/huggingface.py +6 -12
  25. pixeltable/functions/openai.py +2 -1
  26. pixeltable/functions/video.py +5 -5
  27. pixeltable/functions/whisperx.py +177 -0
  28. pixeltable/{ext/functions → functions}/yolox.py +0 -4
  29. pixeltable/globals.py +16 -3
  30. pixeltable/io/fiftyone.py +3 -3
  31. pixeltable/io/label_studio.py +2 -1
  32. pixeltable/iterators/audio.py +3 -2
  33. pixeltable/iterators/document.py +0 -6
  34. pixeltable/metadata/__init__.py +3 -1
  35. pixeltable/mypy/__init__.py +3 -0
  36. pixeltable/mypy/mypy_plugin.py +123 -0
  37. pixeltable/plan.py +0 -16
  38. pixeltable/share/packager.py +6 -6
  39. pixeltable/share/publish.py +134 -7
  40. pixeltable/type_system.py +20 -4
  41. pixeltable/utils/media_store.py +131 -66
  42. pixeltable/utils/pydantic.py +60 -0
  43. {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/METADATA +186 -121
  44. {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/RECORD +47 -46
  45. pixeltable/ext/__init__.py +0 -17
  46. pixeltable/ext/functions/__init__.py +0 -11
  47. pixeltable/ext/functions/whisperx.py +0 -77
  48. {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/WHEEL +0 -0
  49. {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/entry_points.txt +0 -0
  50. {pixeltable-0.4.7.dist-info → pixeltable-0.4.9.dist-info}/licenses/LICENSE +0 -0
pixeltable/env.py CHANGED
@@ -15,7 +15,6 @@ import sys
 import threading
 import types
 import typing
-import uuid
 import warnings
 from contextlib import contextmanager
 from dataclasses import dataclass, field
@@ -28,6 +27,7 @@ import nest_asyncio  # type: ignore[import-untyped]
 import pixeltable_pgserver
 import sqlalchemy as sql
 from pillow_heif import register_heif_opener  # type: ignore[import-untyped]
+from tenacity import retry, stop_after_attempt, wait_exponential_jitter
 from tqdm import TqdmWarning
 
 from pixeltable import exceptions as excs
@@ -101,12 +101,18 @@ class Env:
     def _init_env(cls, reinit_db: bool = False) -> None:
         assert not cls.__initializing, 'Circular env initialization detected.'
         cls.__initializing = True
+        if cls._instance is not None:
+            cls._instance._clean_up()
         cls._instance = None
         env = Env()
-        env._set_up(reinit_db=reinit_db)
-        env._upgrade_metadata()
-        cls._instance = env
-        cls.__initializing = False
+        try:
+            env._set_up(reinit_db=reinit_db)
+            env._upgrade_metadata()
+            cls._instance = env
+        finally:
+            # Reset the initializing flag, even if setup fails.
+            # This prevents the environment from being left in a broken state.
+            cls.__initializing = False
 
     def __init__(self) -> None:
         assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'
@@ -246,7 +252,7 @@
         if self._current_conn is None:
            assert self._current_session is None
            try:
-                self._current_isolation_level = 'SERIALIZABLE' if for_write else 'REPEATABLE_READ'
+                self._current_isolation_level = 'SERIALIZABLE'
                with (
                    self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
                    sql.orm.Session(conn) as session,
@@ -485,7 +491,7 @@
                raise excs.Error(error)
            self._logger.info(f'Using database at: {self.db_url}')
        else:
-            self._db_name = os.environ.get('PIXELTABLE_DB', 'pixeltable')
+            self._db_name = config.get_string_value('db') or 'pixeltable'
            self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(Config.get().home / 'pgdata')))
            # cleanup_mode=None will leave the postgres process running after Python exits
            # cleanup_mode='stop' will terminate the postgres process when Python exits
@@ -499,14 +505,24 @@
         assert self._db_url is not None
         assert self._db_name is not None
 
+    @retry(
+        stop=stop_after_attempt(3),  # Stop after 3 attempts
+        wait=wait_exponential_jitter(initial=0.2, max=1.0, jitter=0.2),  # Exponential backoff with jitter
+    )
     def _init_metadata(self) -> None:
         """
         Create pixeltable metadata tables and system metadata.
         This is an idempotent operation.
+
+        Retry logic handles race conditions when multiple Pixeltable processes
+        attempt to initialize metadata tables simultaneously. The first process may succeed
+        in creating tables while others encounter database constraints (e.g., "table already exists").
+        Exponential backoff with jitter reduces contention between competing processes.
         """
         assert self._sa_engine is not None
         from pixeltable import metadata
 
+        self._logger.debug('Creating pixeltable metadata')
         metadata.schema.base_metadata.create_all(self._sa_engine, checkfirst=True)
         metadata.create_system_info(self._sa_engine)
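The `tenacity` decorator added above retries `_init_metadata` up to three times with exponentially growing, jittered waits, per the docstring's note about concurrent initialization. A minimal standalone sketch of the same retry pattern (the failing function and counter below are invented for illustration, not Pixeltable code):

    # Standalone sketch of the tenacity pattern used by _init_metadata; illustrative only.
    from tenacity import retry, stop_after_attempt, wait_exponential_jitter

    attempts = 0

    @retry(
        stop=stop_after_attempt(3),  # give up after the 3rd attempt
        wait=wait_exponential_jitter(initial=0.2, max=1.0, jitter=0.2),  # ~0.2s, ~0.4s, capped at 1s
    )
    def create_tables() -> None:
        global attempts
        attempts += 1
        if attempts < 3:
            # stands in for the cross-process race: another process created the tables first
            raise RuntimeError('relation "tables" already exists')

    create_tables()
    print(attempts)  # 3: the first two attempts were retried, the third succeeded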
 
@@ -557,6 +573,14 @@
         finally:
             engine.dispose()
 
+    def _pgserver_terminate_connections_stmt(self) -> str:
+        return f"""
+            SELECT pg_terminate_backend(pg_stat_activity.pid)
+            FROM pg_stat_activity
+            WHERE pg_stat_activity.datname = '{self._db_name}'
+            AND pid <> pg_backend_pid()
+        """
+
     def _drop_store_db(self) -> None:
         assert self._db_name is not None
         engine = sql.create_engine(self._dbms.default_system_db_url(), future=True, isolation_level='AUTOCOMMIT')
@@ -565,13 +589,7 @@
         with engine.begin() as conn:
             # terminate active connections
             if self._db_server is not None:
-                stmt = f"""
-                    SELECT pg_terminate_backend(pg_stat_activity.pid)
-                    FROM pg_stat_activity
-                    WHERE pg_stat_activity.datname = '{self._db_name}'
-                    AND pid <> pg_backend_pid()
-                """
-                conn.execute(sql.text(stmt))
+                conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
             # drop db
             stmt = self._dbms.drop_db_stmt(preparer.quote(self._db_name))
             conn.execute(sql.text(stmt))
@@ -749,12 +767,6 @@
         else:
             os.remove(path)
 
-    def num_tmp_files(self) -> int:
-        return len(glob.glob(f'{self._tmp_dir}/*'))
-
-    def create_tmp_path(self, extension: str = '') -> Path:
-        return self._tmp_dir / f'{uuid.uuid4()}{extension}'
-
     # def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
     def get_resource_pool_info(self, pool_id: str, make_pool_info: Optional[Callable[[], T]] = None) -> T:
         """Returns the info object for the given id, creating it if necessary."""
@@ -815,6 +827,63 @@
         except Exception as exc:
             raise excs.Error(f'Failed to load spaCy model: {spacy_model}') from exc
 
+    def _clean_up(self) -> None:
+        """
+        Internal cleanup method that properly closes all resources and resets state.
+        This is called before destroying the singleton instance.
+        """
+        assert self._current_session is None
+        assert self._current_conn is None
+
+        # Stop HTTP server
+        if self._httpd is not None:
+            try:
+                self._httpd.shutdown()
+                self._httpd.server_close()
+            except Exception as e:
+                _logger.warning(f'Error stopping HTTP server: {e}')
+
+        # First terminate all connections to the database
+        if self._db_server is not None:
+            assert self._dbms is not None
+            assert self._db_name is not None
+            try:
+                temp_engine = sql.create_engine(self._dbms.default_system_db_url(), isolation_level='AUTOCOMMIT')
+                try:
+                    with temp_engine.begin() as conn:
+                        conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
+                    _logger.info(f"Terminated all connections to database '{self._db_name}'")
+                except Exception as e:
+                    _logger.warning(f'Error terminating database connections: {e}')
+                finally:
+                    temp_engine.dispose()
+            except Exception as e:
+                _logger.warning(f'Error stopping database server: {e}')
+
+        # Dispose of SQLAlchemy engine (after stopping db server)
+        if self._sa_engine is not None:
+            try:
+                self._sa_engine.dispose()
+            except Exception as e:
+                _logger.warning(f'Error disposing engine: {e}')
+
+        # Close event loop
+        if self._event_loop is not None:
+            try:
+                if self._event_loop.is_running():
+                    self._event_loop.stop()
+                self._event_loop.close()
+            except Exception as e:
+                _logger.warning(f'Error closing event loop: {e}')
+
+        # Remove logging handlers
+        for handler in self._logger.handlers[:]:
+            try:
+                handler.close()
+                self._logger.removeHandler(handler)
+            except Exception as e:
+                _logger.warning(f'Error removing handler: {e}')
+
 
 def register_client(name: str) -> Callable:
     """Decorator that registers a third-party API client for use by Pixeltable.

pixeltable/exec/aggregation_node.py CHANGED
@@ -103,6 +103,6 @@ class AggregationNode(ExecNode):
             self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
             self.output_batch.add_row(prev_row)
 
-        self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
+        self.output_batch.flush_imgs(None, self.row_builder.stored_img_cols, self.flushed_img_slots)
         _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
         yield self.output_batch

pixeltable/exec/cache_prefetch_node.py CHANGED
@@ -12,8 +12,9 @@ from pathlib import Path
 from typing import Any, AsyncIterator, Iterator, Optional
 from uuid import UUID
 
-from pixeltable import env, exceptions as excs, exprs
+from pixeltable import exceptions as excs, exprs
 from pixeltable.utils.filecache import FileCache
+from pixeltable.utils.media_store import TempStore
 
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
@@ -219,7 +220,7 @@ class CachePrefetchNode(ExecNode):
             self.in_flight_requests[f] = url
 
     def __fetch_url(self, url: str) -> tuple[Optional[Path], Optional[Exception]]:
-        """Fetches a remote URL into Env.tmp_dir and returns its path"""
+        """Fetches a remote URL into the TempStore and returns its path"""
         _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
         parsed = urllib.parse.urlparse(url)
         # Use len(parsed.scheme) > 1 here to ensure we're not being passed
@@ -230,7 +231,7 @@
         if parsed.path:
             p = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
             extension = p.suffix
-        tmp_path = env.Env.get().create_tmp_path(extension=extension)
+        tmp_path = TempStore.create_path(extension=extension)
         try:
             _logger.debug(f'Downloading {url} to {tmp_path}')
             if parsed.scheme == 's3':
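This release replaces `Env.create_tmp_path` (removed from `env.py` above) with `TempStore` from `pixeltable/utils/media_store.py` at every call site. The `TempStore` implementation itself is not part of this diff; a minimal sketch of what `create_path` plausibly does, modeled on the removed `Env` code (the tmp-dir location below is an assumption, not the actual one):

    # Assumed sketch of TempStore.create_path, mirroring the removed Env.create_tmp_path;
    # not the actual implementation.
    import uuid
    from pathlib import Path

    class TempStore:
        _tmp_dir = Path.home() / '.pixeltable' / 'tmp'  # assumed location

        @classmethod
        def create_path(cls, extension: str = '') -> Path:
            # uuid4 guarantees a collision-free filename; extension includes the dot, e.g. '.mp4'
            return cls._tmp_dir / f'{uuid.uuid4()}{extension}'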

pixeltable/exec/exec_node.py CHANGED
@@ -20,7 +20,6 @@ class ExecNode(abc.ABC):
     row_builder: exprs.RowBuilder
     input: Optional[ExecNode]
     flushed_img_slots: list[int]  # idxs of image slots of our output_exprs dependencies
-    stored_img_cols: list[exprs.ColumnSlotIdx]
     ctx: Optional[ExecContext]
 
     def __init__(
@@ -40,7 +39,6 @@
         self.flushed_img_slots = [
             e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
         ]
-        self.stored_img_cols = []
         self.ctx = None  # all nodes of a tree share the same context
 
     def set_ctx(self, ctx: ExecContext) -> None:
@@ -48,12 +46,6 @@
         if self.input is not None:
             self.input.set_ctx(ctx)
 
-    def set_stored_img_cols(self, stored_img_cols: list[exprs.ColumnSlotIdx]) -> None:
-        self.stored_img_cols = stored_img_cols
-        # propagate batch size to the source
-        if self.input is not None:
-            self.input.set_stored_img_cols(stored_img_cols)
-
     @abc.abstractmethod
     def __aiter__(self) -> AsyncIterator[DataRowBatch]:
         pass

pixeltable/exec/expr_eval/schedulers.py CHANGED
@@ -56,6 +56,7 @@ class Scheduler(abc.ABC):
         request: FnCallArgs
         num_retries: int
         exec_ctx: ExecCtx
+        retry_after: Optional[float] = None  # time.monotonic()
 
         def __lt__(self, other: Scheduler.QueueItem) -> bool:
             # prioritize by number of retries (more retries = higher priority)
@@ -270,6 +270,7 @@ class RequestRateScheduler(Scheduler):
     num_in_flight: int
     total_requests: int
     total_retried: int
+    total_errors: int
 
     TIME_FORMAT = '%H:%M.%S %f'
     MAX_RETRIES = 3
@@ -294,6 +295,7 @@
         self.num_in_flight = 0
         self.total_requests = 0
         self.total_retried = 0
+        self.total_errors = 0
 
         # try to get the rate limit from the config
         elems = resource_pool.split(':')
@@ -312,6 +314,7 @@
             key = model
         requests_per_min = Config.get().get_int_value(key, section=section)
         requests_per_min = requests_per_min or self.DEFAULT_RATE_LIMIT
+        _logger.debug(f'rate limit for {self.resource_pool}: {requests_per_min} RPM')
         self.secs_per_request = 1 / (requests_per_min / 60)
 
     @classmethod
@@ -325,8 +328,12 @@
             if item.num_retries > 0:
                 self.total_retried += 1
             now = time.monotonic()
+            wait_duration = 0.0
+            if item.retry_after is not None:
+                wait_duration = item.retry_after - now
             if now - last_request_ts < self.secs_per_request:
-                wait_duration = self.secs_per_request - (now - last_request_ts)
+                wait_duration = max(wait_duration, self.secs_per_request - (now - last_request_ts))
+            if wait_duration > 0:
                 _logger.debug(f'waiting for {wait_duration} for {self.resource_pool}')
                 await asyncio.sleep(wait_duration)
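For intuition on the pacing above: `secs_per_request = 1 / (requests_per_min / 60)`, so the default of 600 RPM works out to 0.1 s between dispatches, and a pending `retry_after` deadline raises the wait via `max()`. A standalone rerun of the arithmetic with made-up timestamps:

    # Standalone recomputation of wait_duration; all timestamps are made up.
    requests_per_min = 600
    secs_per_request = 1 / (requests_per_min / 60)  # 0.1s between requests at 600 RPM

    now, last_request_ts = 100.0, 99.97  # hypothetical time.monotonic() readings
    retry_after = 100.25                 # hypothetical deadline set by an earlier rate-limit error

    wait_duration = retry_after - now    # 0.25s until the retry deadline
    if now - last_request_ts < secs_per_request:
        # pacing floor is 0.1 - 0.03 = ~0.07s; the retry deadline is later, so it wins
        wait_duration = max(wait_duration, secs_per_request - (now - last_request_ts))
    print(round(wait_duration, 2))  # 0.25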
 
@@ -372,15 +379,20 @@
 
         except Exception as exc:
             _logger.debug(f'exception for {self.resource_pool}: type={type(exc)}\n{exc}')
+            if hasattr(exc, 'response') and hasattr(exc.response, 'headers'):
+                _logger.debug(f'scheduler {self.resource_pool}: exception headers: {exc.response.headers}')
             is_rate_limit_error, retry_after = self._is_rate_limit_error(exc)
             if is_rate_limit_error and num_retries < self.MAX_RETRIES:
                 retry_delay = self._compute_retry_delay(num_retries, retry_after)
                 _logger.debug(f'scheduler {self.resource_pool}: retrying after {retry_delay}')
-                await asyncio.sleep(retry_delay)
-                self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx))
+                now = time.monotonic()
+                # put the request back in the queue right away, which prevents new requests from being generated until
+                # this one succeeds or exceeds its retry limit
+                self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx, retry_after=now + retry_delay))
                 return
 
             # record the exception
+            self.total_errors += 1
             _, _, exc_tb = sys.exc_info()
             for row in request.rows:
                 row.set_exc(request.fn_call.slot_idx, exc)
@@ -388,7 +400,7 @@
         finally:
             _logger.debug(
                 f'Scheduler stats: #in-flight={self.num_in_flight} #requests={self.total_requests}, '
-                f'#retried={self.total_retried}'
+                f'#retried={self.total_retried} #errors={self.total_errors}'
             )
             if is_task:
                 self.num_in_flight -= 1
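Note the behavioral shift in the retry path: instead of sleeping inline (which held the worker), a failed request is now re-enqueued immediately with a future `retry_after` deadline, and `QueueItem.__lt__` (first hunk of this file) sorts retried items ahead of fresh ones. A self-contained sketch of that ordering, using a plain heap and placeholder payloads:

    # Sketch of the retry-first queue ordering; payloads stand in for FnCallArgs.
    import heapq
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class QueueItem:
        payload: str
        num_retries: int = 0
        retry_after: Optional[float] = None  # time.monotonic() deadline; None for fresh work

        def __lt__(self, other: 'QueueItem') -> bool:
            # mirrors the scheduler: more retries = higher priority
            return self.num_retries > other.num_retries

    heap: list[QueueItem] = []
    heapq.heappush(heap, QueueItem('fresh request'))
    heapq.heappush(heap, QueueItem('rate-limited retry', num_retries=1, retry_after=103.5))
    print(heapq.heappop(heap).payload)  # 'rate-limited retry' is dispatched first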

pixeltable/exec/in_memory_data_node.py CHANGED
@@ -2,7 +2,7 @@ import logging
 from typing import Any, AsyncIterator, Optional
 
 from pixeltable import catalog, exprs
-from pixeltable.utils.media_store import MediaStore
+from pixeltable.utils.media_store import TempStore
 
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
@@ -67,8 +67,7 @@ class InMemoryDataNode(ExecNode):
             col = col_info.col
             if col.col_type.is_image_type() and isinstance(val, bytes):
                 # this is a literal media file, ie, a sequence of bytes; save it as a binary file and store the path
-                assert col.tbl.id == self.tbl.id
-                filepath, _ = MediaStore.save_media_object(val, col, format=None)
+                filepath, _ = TempStore.save_media_object(val, col, format=None)
                 output_row[col_info.slot_idx] = str(filepath)
             else:
                 output_row[col_info.slot_idx] = val

pixeltable/exprs/data_row.py CHANGED
@@ -14,7 +14,7 @@ import PIL.Image
 import sqlalchemy as sql
 
 from pixeltable import catalog, env
-from pixeltable.utils.media_store import MediaStore
+from pixeltable.utils.media_store import MediaStore, TempStore
 
 
 class DataRow:
@@ -270,7 +270,7 @@ class DataRow:
             # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
             # In that case, use WebP instead.
             format = 'webp' if image.has_transparency_data else 'jpeg'
-            filepath, url = MediaStore.save_media_object(image, col, format=format)
+            filepath, url = MediaStore.get().save_media_object(image, col, format=format)
             self.file_paths[index] = str(filepath)
             self.file_urls[index] = url
         else:
@@ -282,16 +282,16 @@ class DataRow:
         self.vals[index] = None
 
     def move_tmp_media_file(self, index: int, col: catalog.Column) -> None:
-        """If a media url refers to data in a temporary file, move the data to the MediaStore"""
+        """If a media url refers to data in a temporary file, move the data to a MediaStore"""
         if self.file_urls[index] is None:
             return
         assert self.excs[index] is None
         assert col.col_type.is_media_type()
-        src_path = MediaStore.resolve_tmp_url(self.file_urls[index])
+        src_path = TempStore.resolve_url(self.file_urls[index])
         if src_path is None:
             # The media url does not point to a temporary file, leave it as is
             return
-        new_file_url = MediaStore.relocate_local_media_file(src_path, col)
+        new_file_url = MediaStore.get().relocate_local_media_file(src_path, col)
         self.file_urls[index] = new_file_url
 
     @property

pixeltable/exprs/function_call.py CHANGED
@@ -115,6 +115,7 @@ class FunctionCall(Expr):
         self._validation_error = validation_error
 
         if validation_error is not None:
+            self.bound_idxs = {}
             self.resource_pool = None
             return
 
@@ -300,8 +301,16 @@ class FunctionCall(Expr):
         """
         res = super().substitute(spec)
         assert res is self
-        self.return_type = self.fn.call_return_type(self.bound_args)
-        self.col_type = self.return_type
+        if self.is_valid:
+            # If this FunctionCall is valid, re-evaluate the call_return_type of the substituted expression. If the
+            # FunctionCall is not valid, it isn't safe to do this. (Really we should be asserting that it *is* valid,
+            # but we still need to be able to do substitutions on invalid FunctionCalls, because loading an
+            # EmbeddingIndex from the db involves reconstructing the requisite (substituted) FunctionCalls. We could
+            # fix this by separately persisting the FunctionCall instances held by EmbeddingIndex to the db. That's
+            # probably a good idea, but it's also probably not urgent, since it only affects Functions that have a
+            # conditional_return_type implemented.)
+            self.return_type = self.fn.call_return_type(self.bound_args)
+            self.col_type = self.return_type
         return self
 
     def update(self, data_row: DataRow) -> None:
@@ -480,25 +489,54 @@
             ).strip()
         else:
             # Evaluate the call_return_type as defined in the current codebase.
-            call_return_type = resolved_fn.call_return_type(bound_args)
-            if return_type is None:
-                # Schema versions prior to 25 did not store the return_type in metadata, and there is no obvious way to
-                # infer it during DB migration, so we might encounter a stored return_type of None. In that case, we use
-                # the call_return_type that we just inferred (which matches the deserialization behavior prior to
-                # version 25).
-                return_type = call_return_type
-            elif not return_type.is_supertype_of(call_return_type, ignore_nullable=True):
-                # There is a return_type stored in metadata (schema version >= 25),
-                # and the stored return_type of the UDF call doesn't match the column type of the FunctionCall.
-                validation_error = dedent(
-                    f"""
-                    The return type stored in the database for a UDF call to {fn.self_path!r} no longer
-                    matches its return type as currently defined in the code. This probably means that the
-                    code for {fn.self_path!r} has changed in a backward-incompatible way.
-                    Return type of UDF call in the database: {return_type}
-                    Return type of UDF as currently defined in code: {call_return_type}
-                    """
-                ).strip()
+            call_return_type: Optional[ts.ColumnType] = None
+
+            if isinstance(resolved_fn, func.ExprTemplateFunction) and not resolved_fn.template.expr.is_valid:
+                # The FunctionCall is based on an ExprTemplateFunction, but the template expression is not valid
+                # (because it in turn contains an invalid FunctionCall). In this case, inherit the validation error
+                # from the template expression.
+                validation_error = resolved_fn.template.expr.validation_error
+            else:
+                try:
+                    call_return_type = resolved_fn.call_return_type(bound_args)
+                except ImportError as exc:
+                    validation_error = dedent(
+                        f"""
+                        A UDF call to {fn.self_path!r} could not be fully resolved, because a module required
+                        by the UDF could not be imported:
+                        {exc}
+                        """
+                    )
+
+            assert (call_return_type is None) != (validation_error is None)
+
+            if call_return_type is None and return_type is None:
+                # Schema versions prior to 25 did not store the return_type in metadata, and there is no obvious
+                # way to infer it during DB migration, so we might encounter a stored return_type of None. If the
+                # resolution of call_return_type also fails, then we're out of luck; we have no choice but to
+                # fail-fast.
+                raise excs.Error(validation_error)
+
+            if call_return_type is not None:
+                # call_return_type resolution succeeded.
+                if return_type is None:
+                    # Schema versions prior to 25 did not store the return_type in metadata (as mentioned above), so
+                    # fall back on the call_return_type.
+                    return_type = call_return_type
+                elif not return_type.is_supertype_of(call_return_type, ignore_nullable=True):
+                    # There is a return_type stored in metadata (schema version >= 25),
+                    # and the stored return_type of the UDF call doesn't match the column type of the FunctionCall.
+                    validation_error = dedent(
+                        f"""
+                        The return type stored in the database for a UDF call to {fn.self_path!r} no longer
+                        matches its return type as currently defined in the code. This probably means that the
+                        code for {fn.self_path!r} has changed in a backward-incompatible way.
+                        Return type of UDF call in the database: {return_type}
+                        Return type of UDF as currently defined in code: {call_return_type}
+                        """
+                    ).strip()
+
+            assert return_type is not None  # Guaranteed by the above logic.
 
         fn_call = cls(
             resolved_fn,

pixeltable/exprs/row_builder.py CHANGED
@@ -86,6 +86,8 @@ class RowBuilder:
     img_slot_idxs: list[int]  # Indices of image slots
     media_slot_idxs: list[int]  # Indices of non-image media slots
     array_slot_idxs: list[int]  # Indices of array slots
+    stored_img_cols: list[exprs.ColumnSlotIdx]
+    stored_media_cols: list[exprs.ColumnSlotIdx]
 
     @dataclass
     class EvalCtx:
@@ -112,6 +114,8 @@
         """
         self.unique_exprs: ExprSet[Expr] = ExprSet()  # dependencies precede their dependents
         self.next_slot_idx = 0
+        self.stored_img_cols = []
+        self.stored_media_cols = []
 
         # record input and output exprs; make copies to avoid reusing execution state
         unique_input_exprs = [self._record_unique_expr(e.copy(), recursive=False) for e in input_exprs]
@@ -246,11 +250,13 @@
     def add_table_column(self, col: catalog.Column, slot_idx: int) -> None:
         """Record a column that is part of the table row"""
         assert self.tbl is not None
-        self.table_columns.append(ColumnSlotIdx(col, slot_idx))
-
-    def output_slot_idxs(self) -> list[ColumnSlotIdx]:
-        """Return ColumnSlotIdx for output columns"""
-        return self.table_columns
+        assert col.is_stored
+        info = ColumnSlotIdx(col, slot_idx)
+        self.table_columns.append(info)
+        if col.col_type.is_media_type():
+            self.stored_media_cols.append(info)
+        if col.col_type.is_image_type():
+            self.stored_img_cols.append(info)
 
     @property
     def num_materialized(self) -> int:

pixeltable/func/expr_template_function.py CHANGED
@@ -85,13 +85,16 @@ class ExprTemplateFunction(Function):
         conditional_return_type).
         """
         assert not self.is_polymorphic
-        template = self.template
         with_defaults = bound_args.copy()
         with_defaults.update(
-            {param_name: default for param_name, default in template.defaults.items() if param_name not in bound_args}
+            {
+                param_name: default
+                for param_name, default in self.template.defaults.items()
+                if param_name not in bound_args
+            }
         )
         substituted_expr = self.template.expr.copy().substitute(
-            {template.param_exprs[name]: expr for name, expr in with_defaults.items()}
+            {self.template.param_exprs[name]: expr for name, expr in with_defaults.items()}
         )
         return substituted_expr.col_type

pixeltable/functions/__init__.py CHANGED
@@ -26,6 +26,8 @@ from . import (
     video,
     vision,
     whisper,
+    whisperx,
+    yolox,
 )
 from .globals import count, map, max, mean, min, sum
 

pixeltable/functions/anthropic.py CHANGED
@@ -132,8 +132,7 @@ class AnthropicRateLimitsInfo(env.RateLimitsInfo):
         should_retry_str = exc.response.headers.get('x-should-retry', '')
         if should_retry_str.lower() != 'true':
             return None
-        retry_after_str = exc.response.headers.get('retry-after', '1')
-        return int(retry_after_str)
+        return super().get_retry_delay(exc)
 
 
 @pxt.udf

pixeltable/functions/deepseek.py CHANGED
@@ -26,7 +26,7 @@ def _deepseek_client() -> 'openai.AsyncOpenAI':
     return env.Env.get().get_client('deepseek')
 
 
-@pxt.udf
+@pxt.udf(resource_pool='request-rate:deepseek')
 async def chat_completions(
     messages: list,
     *,
@@ -43,6 +43,10 @@ async def chat_completions(
 
     Deepseek uses the OpenAI SDK, so you will need to install the `openai` package to use this UDF.
 
+    Request throttling:
+        Applies the rate limit set in the config (section `deepseek`, key `rate_limit`). If no rate
+        limit is configured, uses a default of 600 RPM.
+
     __Requirements:__
 
     - `pip install openai`
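To make the new docstring concrete: the throttle can be set in the Pixeltable config file (typically `~/.pixeltable/config.toml`); the 300 RPM value below is an arbitrary example, not a default:

    # Illustrative snippet; section and key names follow the docstring above.
    [deepseek]
    rate_limit = 300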

pixeltable/functions/gemini.py CHANGED
@@ -14,6 +14,7 @@ import PIL.Image
 
 import pixeltable as pxt
 from pixeltable import env, exceptions as excs, exprs
+from pixeltable.utils.media_store import TempStore
 
 if TYPE_CHECKING:
     from google import genai
@@ -39,7 +40,7 @@ async def generate_content(
     <https://ai.google.dev/gemini-api/docs/text-generation>
 
     Request throttling:
-        Applies the rate limit set in the config (section `gemini`, key `rate_limit`). If no rate
+        Applies the rate limit set in the config (section `gemini.rate_limits`; use the model id as the key). If no rate
         limit is configured, uses a default of 600 RPM.
 
     __Requirements:__
@@ -126,6 +127,10 @@ async def generate_images(prompt: str, *, model: str, config: Optional[dict] = N
     Generates images based on a text description and configuration. For additional details, see:
     <https://ai.google.dev/gemini-api/docs/image-generation>
 
+    Request throttling:
+        Applies the rate limit set in the config (section `imagen.rate_limits`; use the model id as the key). If no rate
+        limit is configured, uses a default of 600 RPM.
+
     __Requirements:__
 
     - `pip install google-genai`
@@ -167,6 +172,10 @@ async def generate_videos(
     Generates videos based on a text description and configuration. For additional details, see:
     <https://ai.google.dev/gemini-api/docs/video-generation>
 
+    Request throttling:
+        Applies the rate limit set in the config (section `veo.rate_limits`; use the model id as the key). If no rate
+        limit is configured, uses a default of 600 RPM.
+
     __Requirements:__
 
     - `pip install google-genai`
@@ -215,7 +224,7 @@
     assert video_bytes is not None
 
     # Create a temporary file to store the video bytes
-    output_path = env.Env.get().create_tmp_path('.mp4')
+    output_path = TempStore.create_path(extension='.mp4')
     Path(output_path).write_bytes(video_bytes)
     return str(output_path)
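The Gemini, Imagen, and Veo docstrings above all point at per-model config sections (`gemini.rate_limits`, `imagen.rate_limits`, `veo.rate_limits`) keyed by model id. An illustrative snippet, again assuming the standard config file location; the model ids and values are examples, not defaults:

    # Illustrative per-model rate limits; keys are model ids per the docstrings above.
    [gemini.rate_limits]
    "gemini-2.0-flash" = 1000

    [veo.rate_limits]
    "veo-2.0-generate-001" = 10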