pixeltable 0.4.5__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (55)
  1. pixeltable/__init__.py +4 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -1
  4. pixeltable/catalog/catalog.py +3 -3
  5. pixeltable/catalog/column.py +49 -0
  6. pixeltable/catalog/insertable_table.py +0 -7
  7. pixeltable/catalog/schema_object.py +1 -14
  8. pixeltable/catalog/table.py +139 -53
  9. pixeltable/catalog/table_version.py +30 -138
  10. pixeltable/catalog/view.py +2 -1
  11. pixeltable/dataframe.py +2 -3
  12. pixeltable/env.py +43 -5
  13. pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
  14. pixeltable/exec/expr_eval/schedulers.py +36 -15
  15. pixeltable/exprs/array_slice.py +2 -2
  16. pixeltable/exprs/data_row.py +13 -0
  17. pixeltable/exprs/expr.py +9 -9
  18. pixeltable/exprs/function_call.py +2 -2
  19. pixeltable/exprs/globals.py +1 -2
  20. pixeltable/exprs/json_path.py +3 -3
  21. pixeltable/exprs/row_builder.py +14 -16
  22. pixeltable/exprs/string_op.py +3 -3
  23. pixeltable/func/query_template_function.py +2 -2
  24. pixeltable/func/signature.py +30 -3
  25. pixeltable/func/tools.py +2 -2
  26. pixeltable/functions/anthropic.py +75 -25
  27. pixeltable/functions/globals.py +2 -2
  28. pixeltable/functions/llama_cpp.py +9 -1
  29. pixeltable/functions/openai.py +74 -54
  30. pixeltable/functions/video.py +54 -1
  31. pixeltable/functions/vision.py +2 -2
  32. pixeltable/globals.py +74 -12
  33. pixeltable/io/datarows.py +3 -3
  34. pixeltable/io/fiftyone.py +4 -4
  35. pixeltable/io/globals.py +3 -3
  36. pixeltable/io/hf_datasets.py +4 -4
  37. pixeltable/io/pandas.py +6 -6
  38. pixeltable/io/parquet.py +3 -3
  39. pixeltable/io/table_data_conduit.py +2 -2
  40. pixeltable/io/utils.py +2 -2
  41. pixeltable/iterators/document.py +2 -2
  42. pixeltable/iterators/video.py +49 -9
  43. pixeltable/share/packager.py +45 -36
  44. pixeltable/store.py +5 -25
  45. pixeltable/type_system.py +5 -8
  46. pixeltable/utils/__init__.py +2 -2
  47. pixeltable/utils/arrow.py +5 -5
  48. pixeltable/utils/description_helper.py +3 -3
  49. pixeltable/utils/iceberg.py +1 -2
  50. {pixeltable-0.4.5.dist-info → pixeltable-0.4.7.dist-info}/METADATA +109 -59
  51. {pixeltable-0.4.5.dist-info → pixeltable-0.4.7.dist-info}/RECORD +64 -64
  52. {pixeltable-0.4.5.dist-info → pixeltable-0.4.7.dist-info}/WHEEL +1 -1
  53. pixeltable-0.4.7.dist-info/entry_points.txt +2 -0
  54. pixeltable-0.4.5.dist-info/entry_points.txt +0 -3
  55. {pixeltable-0.4.5.dist-info → pixeltable-0.4.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/catalog/table_version.py CHANGED
@@ -14,7 +14,6 @@ import sqlalchemy as sql
 
 import pixeltable as pxt
 import pixeltable.exceptions as excs
-import pixeltable.type_system as ts
 from pixeltable import exprs, index
 from pixeltable.env import Env
 from pixeltable.iterators import ComponentIterator
@@ -223,18 +222,23 @@ class TableVersion:
         view_md: Optional[schema.ViewMd] = None,
     ) -> TableVersionMd:
         user = Env.get().user
+        timestamp = time.time()
 
-        # assign ids
+        # assign ids, create metadata
         cols_by_name: dict[str, Column] = {}
+        column_md: dict[int, schema.ColumnMd] = {}
+        schema_col_md: dict[int, schema.SchemaColumn] = {}
         for pos, col in enumerate(cols):
             col.id = pos
             col.schema_version_add = 0
             cols_by_name[col.name] = col
             if col.is_computed:
                 col.check_value_expr()
+            col_md, sch_md = col.to_md(pos)
+            assert sch_md is not None
+            column_md[col.id] = col_md
+            schema_col_md[col.id] = sch_md
 
-        timestamp = time.time()
-        column_md = cls._create_column_md(cols)
         tbl_id = uuid.uuid4()
         tbl_id_str = str(tbl_id)
         tbl_md = schema.TableMd(
@@ -256,18 +260,15 @@
         )
 
         table_version_md = schema.TableVersionMd(
-            tbl_id=tbl_id_str, created_at=timestamp, version=0, schema_version=0, additional_md={}
+            tbl_id=tbl_id_str,
+            created_at=timestamp,
+            version=0,
+            schema_version=0,
+            user=user,
+            update_status=None,
+            additional_md={},
        )
 
-        schema_col_md: dict[int, schema.SchemaColumn] = {}
-        for pos, col in enumerate(cols):
-            md = schema.SchemaColumn(
-                pos=pos,
-                name=col.name,
-                media_validation=col._media_validation.name.lower() if col._media_validation is not None else None,
-            )
-            schema_col_md[col.id] = md
-
         schema_version_md = schema.TableSchemaVersionMd(
             tbl_id=tbl_id_str,
             schema_version=0,
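
Both hunks above replace inline construction of schema.ColumnMd and schema.SchemaColumn with a single call to Column.to_md(), newly added in pixeltable/catalog/column.py (+49 -0 in this release). A minimal sketch of the contract implied by the call sites — the body shown here is inferred from the deleted code, not taken from the actual implementation:

    # Hypothetical sketch of Column.to_md(); signature and return shape inferred from its call sites.
    def to_md(self, pos: Optional[int] = None) -> tuple[schema.ColumnMd, Optional[schema.SchemaColumn]]:
        col_md = schema.ColumnMd(
            id=self.id,
            col_type=self.col_type.as_dict(),
            is_pk=self.is_pk,
            schema_version_add=self.schema_version_add,
            schema_version_drop=self.schema_version_drop,
            value_expr=self.value_expr.as_dict() if self.value_expr is not None else None,
            stored=self.stored,
        )
        if self.name is None or pos is None:
            return col_md, None  # anonymous (system) columns have no user-facing schema entry
        sch_md = schema.SchemaColumn(
            pos=pos,
            name=self.name,
            media_validation=self._media_validation.name.lower() if self._media_validation is not None else None,
        )
        return col_md, sch_md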
@@ -290,76 +291,11 @@
         comment: str,
         media_validation: MediaValidation,
     ) -> tuple[UUID, Optional[TableVersion]]:
-        user = Env.get().user
-
-        # assign ids
-        cols_by_name: dict[str, Column] = {}
-        for pos, col in enumerate(cols):
-            col.id = pos
-            col.schema_version_add = 0
-            cols_by_name[col.name] = col
-            if col.is_computed:
-                col.check_value_expr()
-
-        timestamp = time.time()
-        # create schema.Table
-        # Column.dependent_cols for existing cols is wrong at this point, but init() will set it correctly
-        column_md = cls._create_column_md(cols)
-        tbl_id = uuid.uuid4()
-        tbl_id_str = str(tbl_id)
-        table_md = schema.TableMd(
-            tbl_id=tbl_id_str,
-            name=name,
-            user=user,
-            is_replica=False,
-            current_version=0,
-            current_schema_version=0,
-            next_col_id=len(cols),
-            next_idx_id=0,
-            next_row_id=0,
-            view_sn=0,
-            column_md=column_md,
-            index_md={},
-            external_stores=[],
-            view_md=None,
-            additional_md={},
-        )
-
-        # create schema.TableVersion of the initial version
-        table_version_md = schema.TableVersionMd(
-            tbl_id=tbl_id_str,
-            created_at=timestamp,
-            version=0,
-            schema_version=0,
-            user=user,
-            update_status=None,
-            additional_md={},
-        )
-
-        # create schema.TableSchemaVersion
-        schema_col_md: dict[int, schema.SchemaColumn] = {}
-        for pos, col in enumerate(cols):
-            md = schema.SchemaColumn(
-                pos=pos,
-                name=col.name,
-                media_validation=col._media_validation.name.lower() if col._media_validation is not None else None,
-            )
-            schema_col_md[col.id] = md
-
-        schema_version_md = schema.TableSchemaVersionMd(
-            tbl_id=tbl_id_str,
-            schema_version=0,
-            preceding_schema_version=None,
-            columns=schema_col_md,
-            num_retained_versions=num_retained_versions,
-            comment=comment,
-            media_validation=media_validation.name.lower(),
-            additional_md={},
-        )
-
+        inital_md = cls.create_initial_md(name, cols, num_retained_versions, comment, media_validation, view_md=None)
         cat = pxt.catalog.Catalog.get()
 
-        tbl_version = cls(tbl_id, table_md, table_version_md, None, schema_version_md, [])
+        tbl_id = UUID(hex=inital_md.tbl_md.tbl_id)
+        tbl_version = cls(tbl_id, inital_md.tbl_md, inital_md.version_md, None, inital_md.schema_version_md, [])
         # TODO: break this up, so that Catalog.create_table() registers tbl_version
         cat._tbl_versions[tbl_id, None] = tbl_version
         tbl_version.init()
@@ -373,8 +309,8 @@
             tbl_id=tbl_id,
             dir_id=dir_id,
             tbl_md=tbl_version.tbl_md,
-            version_md=table_version_md,
-            schema_version_md=schema_version_md,
+            version_md=inital_md.version_md,
+            schema_version_md=inital_md.schema_version_md,
         )
         return tbl_id, tbl_version
 
@@ -480,25 +416,7 @@
         sorted_column_md = sorted(self.tbl_md.column_md.values(), key=lambda item: item.id)
         for col_md in sorted_column_md:
             schema_col_md = self.schema_version_md.columns.get(col_md.id)
-            col_name = schema_col_md.name if schema_col_md is not None else None
-            media_val = (
-                MediaValidation[schema_col_md.media_validation.upper()]
-                if schema_col_md is not None and schema_col_md.media_validation is not None
-                else None
-            )
-            col = Column(
-                col_id=col_md.id,
-                name=col_name,
-                col_type=ts.ColumnType.from_dict(col_md.col_type),
-                is_pk=col_md.is_pk,
-                stored=col_md.stored,
-                media_validation=media_val,
-                schema_version_add=col_md.schema_version_add,
-                schema_version_drop=col_md.schema_version_drop,
-                value_expr_dict=col_md.value_expr,
-                tbl=self,
-            )
-            col.tbl = self
+            col = Column.from_md(col_md, self, schema_col_md)
             self.cols.append(col)
 
         # populate the lookup structures before Expr.from_dict()
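
Column.from_md() (also part of the new code in column.py) is the deserialization counterpart. Judging from this call site, it folds the name and media-validation handling that used to be inlined here into something like the following — again an inferred sketch, with the field mapping taken from the deleted code:

    # Hypothetical sketch of Column.from_md(), reconstructed from the code it replaces.
    @classmethod
    def from_md(cls, col_md: schema.ColumnMd, tbl: TableVersion, schema_col_md: Optional[schema.SchemaColumn]) -> Column:
        media_val = (
            MediaValidation[schema_col_md.media_validation.upper()]
            if schema_col_md is not None and schema_col_md.media_validation is not None
            else None
        )
        return cls(
            col_id=col_md.id,
            name=schema_col_md.name if schema_col_md is not None else None,
            col_type=ts.ColumnType.from_dict(col_md.col_type),
            is_pk=col_md.is_pk,
            stored=col_md.stored,
            media_validation=media_val,
            schema_version_add=col_md.schema_version_add,
            schema_version_drop=col_md.schema_version_drop,
            value_expr_dict=col_md.value_expr,
            tbl=tbl,
        )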
@@ -783,31 +701,22 @@
         num_excs = 0
         cols_with_excs: list[Column] = []
         for col in cols_to_add:
+            assert col.id is not None, 'Column id must be set before adding the column'
             excs_per_col = 0
             col.schema_version_add = self.schema_version
             # add the column to the lookup structures now, rather than after the store changes executed successfully,
             # because it might be referenced by the next column's value_expr
             self.cols.append(col)
-            if col.name is not None:
-                self.cols_by_name[col.name] = col
             self.cols_by_id[col.id] = col
-
-            # also add to stored md
-            self._tbl_md.column_md[col.id] = schema.ColumnMd(
-                id=col.id,
-                col_type=col.col_type.as_dict(),
-                is_pk=col.is_pk,
-                schema_version_add=col.schema_version_add,
-                schema_version_drop=col.schema_version_drop,
-                value_expr=col.value_expr.as_dict() if col.value_expr is not None else None,
-                stored=col.stored,
-            )
             if col.name is not None:
-                self._schema_version_md.columns[col.id] = schema.SchemaColumn(
-                    name=col.name,
-                    pos=len(self.cols_by_name),
-                    media_validation=col._media_validation.name.lower() if col._media_validation is not None else None,
-                )
+                self.cols_by_name[col.name] = col
+                col_md, sch_md = col.to_md(len(self.cols_by_name))
+                assert sch_md is not None, 'Schema column metadata must be created for user-facing columns'
+                self._tbl_md.column_md[col.id] = col_md
+                self._schema_version_md.columns[col.id] = sch_md
+            else:
+                col_md, _ = col.to_md()
+                self._tbl_md.column_md[col.id] = col_md
 
             if col.is_stored:
                 self.store_tbl.add_column(col)
@@ -1628,23 +1537,6 @@
             return 1 + self.base.get().num_rowid_columns()
         return 1
 
-    @classmethod
-    def _create_column_md(cls, cols: list[Column]) -> dict[int, schema.ColumnMd]:
-        column_md: dict[int, schema.ColumnMd] = {}
-        for col in cols:
-            value_expr_dict = col.value_expr.as_dict() if col.value_expr is not None else None
-            assert col.is_pk is not None
-            column_md[col.id] = schema.ColumnMd(
-                id=col.id,
-                col_type=col.col_type.as_dict(),
-                is_pk=col.is_pk,
-                schema_version_add=col.schema_version_add,
-                schema_version_drop=col.schema_version_drop,
-                value_expr=value_expr_dict,
-                stored=col.stored,
-            )
-        return column_md
-
     @classmethod
     def _create_stores_md(cls, stores: Iterable[pxt.io.ExternalStore]) -> list[dict[str, Any]]:
         return [
pixeltable/catalog/view.py CHANGED
@@ -25,6 +25,7 @@ from .tbl_ops import CreateStoreTableOp, LoadViewOp, TableOp
 from .update_status import UpdateStatus
 
 if TYPE_CHECKING:
+    from pixeltable.catalog.table import TableMetadata
     from pixeltable.globals import TableDataSource
 
 _logger = logging.getLogger('pixeltable')
@@ -261,7 +262,7 @@ class View(Table):
         """
         return self._snapshot_only and self._id == self._tbl_version_path.tbl_id
 
-    def _get_metadata(self) -> dict[str, Any]:
+    def _get_metadata(self) -> 'TableMetadata':
         md = super()._get_metadata()
         md['is_view'] = True
         md['is_snapshot'] = self._tbl_version_path.is_snapshot()
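
The loose dict[str, Any] gives way to the TableMetadata type imported under TYPE_CHECKING above. The dict-style md['is_view'] assignments alongside the stricter annotation suggest a TypedDict defined in pixeltable/catalog/table.py (part of its +139 -53 in this release); a hedged sketch, showing only the two keys visible in this hunk:

    # Hypothetical sketch; the real TableMetadata likely carries many more fields.
    from typing import TypedDict

    class TableMetadata(TypedDict, total=False):
        is_view: bool
        is_snapshot: bool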
pixeltable/dataframe.py CHANGED
@@ -19,7 +19,6 @@ from typing import (
     Optional,
     Sequence,
     TypeVar,
-    Union,
 )
 
 import pandas as pd
@@ -766,7 +765,7 @@
         )
 
     def _create_join_predicate(
-        self, other: catalog.TableVersionPath, on: Union[exprs.Expr, Sequence[exprs.ColumnRef]]
+        self, other: catalog.TableVersionPath, on: exprs.Expr | Sequence[exprs.ColumnRef]
     ) -> exprs.Expr:
         """Verifies user-specified 'on' argument and converts it into a join predicate."""
         col_refs: list[exprs.ColumnRef] = []
@@ -829,7 +828,7 @@
     def join(
         self,
         other: catalog.Table,
-        on: Optional[Union[exprs.Expr, Sequence[exprs.ColumnRef]]] = None,
+        on: exprs.Expr | Sequence[exprs.ColumnRef] | None = None,
         how: plan.JoinType.LiteralType = 'inner',
     ) -> DataFrame:
         """
pixeltable/env.py CHANGED
@@ -17,7 +17,6 @@ import types
 import typing
 import uuid
 import warnings
-from abc import abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -890,6 +889,10 @@ class RateLimitsInfo:
     get_request_resources: Callable[..., dict[str, int]]
 
     resource_limits: dict[str, RateLimitInfo] = field(default_factory=dict)
+    has_exc: bool = False
+
+    def debug_str(self) -> str:
+        return ','.join(info.debug_str() for info in self.resource_limits.values())
 
     def is_initialized(self) -> bool:
         return len(self.resource_limits) > 0
@@ -897,7 +900,7 @@
     def reset(self) -> None:
         self.resource_limits.clear()
 
-    def record(self, **kwargs: Any) -> None:
+    def record(self, reset_exc: bool = False, **kwargs: Any) -> None:
         now = datetime.datetime.now(tz=datetime.timezone.utc)
         if len(self.resource_limits) == 0:
             self.resource_limits = {k: RateLimitInfo(k, now, *v) for k, v in kwargs.items() if v is not None}
@@ -908,14 +911,30 @@
                 f'reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}'
             )
         else:
+            if self.has_exc and not reset_exc:
+                # ignore updates until we're asked to reset
+                _logger.debug(f'rate_limits.record(): ignoring update {kwargs}')
+                return
+            self.has_exc = False
             for k, v in kwargs.items():
                 if v is not None:
                     self.resource_limits[k].update(now, *v)
 
-    @abstractmethod
+    def record_exc(self, exc: Exception) -> None:
+        """Update self.resource_limits based on the exception headers"""
+        self.has_exc = True
+
     def get_retry_delay(self, exc: Exception) -> Optional[float]:
         """Returns number of seconds to wait before retry, or None if not retryable"""
-        pass
+        if len(self.resource_limits) == 0:
+            return 1.0
+        # we're looking for the maximum delay across all depleted resources
+        max_delay = 0.0
+        now = datetime.datetime.now(tz=datetime.timezone.utc)
+        for limit_info in self.resource_limits.values():
+            if limit_info.remaining < 0.05 * limit_info.limit:
+                max_delay = max(max_delay, (limit_info.reset_at - now).total_seconds())
+        return max_delay if max_delay > 0 else None
 
 
 @dataclass
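
The formerly abstract get_retry_delay() now has a concrete default policy: retry after 1s if no limits have been recorded yet; otherwise wait until the latest reset_at among resources whose remaining capacity has dropped below 5% of their limit, and treat the error as non-retryable (None) if nothing is depleted. A worked example with illustrative numbers:

    # requests:      limit=10_000,    remaining=9_500  -> 9_500 >= 0.05 * 10_000, not depleted
    # input_tokens:  limit=2_000_000, remaining=50_000 -> 50_000 < 0.05 * 2_000_000, depleted,
    #                and its reset_at is 30s away
    # get_retry_delay(exc) returns 30.0, the maximum (reset_at - now) across depleted resources;
    # with no depleted resources it would return None (don't retry)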
@@ -928,9 +947,15 @@
     remaining: int
     reset_at: datetime.datetime
 
+    def debug_str(self) -> str:
+        return (
+            f'{self.resource}@{self.recorded_at.strftime(TIME_FORMAT)}: '
+            f'{self.limit}/{self.remaining}/{self.reset_at.strftime(TIME_FORMAT)}'
+        )
+
     def update(self, recorded_at: datetime.datetime, limit: int, remaining: int, reset_at: datetime.datetime) -> None:
         # we always update everything, even though responses may come back out-of-order: we can't use reset_at to
-        # determine order, because it doesn't increase monotonically (the reeset duration shortens as output_tokens
+        # determine order, because it doesn't increase monotonically (the reset duration shortens as output_tokens
         # are freed up - going from max to actual)
         self.recorded_at = recorded_at
         self.limit = limit
@@ -942,3 +967,16 @@
             f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} '
             f'reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}'
         )
+
+
+@dataclass
+class RuntimeCtx:
+    """
+    Container for runtime data provided by the execution system to udfs.
+
+    Udfs that accept the special _runtime_ctx parameter receive an instance of this class.
+    """
+
+    # Indicates a retry attempt following a rate limit error (error code: 429). Requires a 'rate-limits' resource pool.
+    # If True, call RateLimitsInfo.record() with reset_exc=True.
+    is_retry: bool = False
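
RuntimeCtx is what RateLimitsScheduler._exec() injects as the _runtime_ctx keyword argument (see the schedulers.py hunks below). A sketch of a UDF that might consume it; the client call and pool-info lookup are invented for illustration, and only the _runtime_ctx parameter and RateLimitsInfo.record(reset_exc=...) come from this diff:

    import pixeltable as pxt
    from pixeltable import env

    @pxt.udf
    async def call_service(prompt: str, _runtime_ctx: env.RuntimeCtx = None) -> str:
        is_retry = _runtime_ctx is not None and _runtime_ctx.is_retry
        resp = await my_client.complete(prompt)  # hypothetical client call
        pool_info = my_pool_info_lookup()  # hypothetical accessor returning env.RateLimitsInfo
        # after a 429-driven retry, has_exc is set; reset_exc=True re-enables limit updates
        pool_info.record(reset_exc=is_retry, requests=(resp.limit, resp.remaining, resp.reset_at))
        return resp.text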
pixeltable/exec/expr_eval/expr_eval_node.py CHANGED
@@ -4,7 +4,7 @@ import asyncio
 import logging
 import traceback
 from types import TracebackType
-from typing import AsyncIterator, Iterable, Optional, Union
+from typing import AsyncIterator, Iterable, Optional
 
 import numpy as np
 
@@ -49,7 +49,7 @@ class ExprEvalNode(ExecNode):
     # execution state
     tasks: set[asyncio.Task]  # collects all running tasks to prevent them from getting gc'd
     exc_event: asyncio.Event  # set if an exception needs to be propagated
-    error: Optional[Union[Exception]]  # exception that needs to be propagated
+    error: Optional[Exception]  # exception that needs to be propagated
     completed_rows: asyncio.Queue[exprs.DataRow]  # rows that have completed evaluation
     completed_event: asyncio.Event  # set when completed_rows is non-empty
     input_iter: AsyncIterator[DataRowBatch]
pixeltable/exec/expr_eval/schedulers.py CHANGED
@@ -81,6 +81,8 @@ class RateLimitsScheduler(Scheduler):
         while True:
             if item is None:
                 item = await self.queue.get()
+            assert isinstance(item.request.fn_call.fn, func.CallableFunction)
+            assert '_runtime_ctx' in item.request.fn_call.fn.signature.system_parameters
             if item.num_retries > 0:
                 self.total_retried += 1
 
@@ -97,7 +99,6 @@
                 continue
 
             # check rate limits
-            _logger.debug(f'checking rate limits for {self.resource_pool}')
             request_resources = self._get_request_resources(item.request)
             limits_info = self._check_resource_limits(request_resources)
             aws: list[Awaitable[None]] = []
@@ -116,21 +117,31 @@
                 reset_at = limits_info.reset_at
                 if reset_at > now:
                     # we're waiting for the rate limit to reset
-                    wait_for_reset = asyncio.create_task(asyncio.sleep((reset_at - now).total_seconds()))
+                    wait_duration = (reset_at - now).total_seconds()
+                    wait_for_reset = asyncio.create_task(asyncio.sleep(wait_duration))
                     aws.append(wait_for_reset)
-                    _logger.debug(f'waiting for rate limit reset for {self.resource_pool}')
+                    _logger.debug(
+                        f'waiting {wait_duration:.2f}s for rate limit reset of '
+                        f'{self.resource_pool}:{limits_info.resource} (remaining={limits_info.remaining})'
+                    )
 
             if len(aws) > 0:
                 # we have something to wait for
+                report_ts = limits_info.recorded_at
                 done, pending = await asyncio.wait(aws, return_when=asyncio.FIRST_COMPLETED)
                 for task in pending:
                     task.cancel()
                 if completed_aw in done:
                     _logger.debug(f'wait(): completed request for {self.resource_pool}')
                 if wait_for_reset in done:
-                    _logger.debug(f'wait(): rate limit reset for {self.resource_pool}')
-                    # force waiting for another rate limit report before making any scheduling decisions
-                    self.pool_info.reset()
+                    _logger.debug(f'wait(): rate limit reset for {self.resource_pool}:{limits_info.resource}')
+                    last_report_ts = self.pool_info.resource_limits[limits_info.resource].recorded_at
+                    if report_ts == last_report_ts:
+                        # if we haven't seen a new report since we started waiting, force waiting for another rate limit
+                        # report before making any scheduling decisions
+                        # TODO: is it a good idea to discard the information we have?
+                        _logger.debug(f'resetting {self.resource_pool}: currently at {self.pool_info.debug_str()}')
+                        self.pool_info.reset()
                 # re-evaluate current capacity for current item
                 continue
 
@@ -158,16 +169,22 @@
 
     def _check_resource_limits(self, request_resources: dict[str, int]) -> Optional[env.RateLimitInfo]:
         """Returns the most depleted resource, relative to its limit, or None if all resources are within limits"""
-        candidates: list[tuple[env.RateLimitInfo, float]] = []  # (info, relative usage)
+        candidates: list[tuple[env.RateLimitInfo, float]] = []  # (info, relative remaining)
         for resource, usage in request_resources.items():
-            # 0.05: leave some headroom, we don't have perfect information
             info = self.pool_info.resource_limits[resource]
             est_remaining = info.remaining - self.est_usage[resource] - usage
-            if est_remaining < 0.05 * info.limit:
-                candidates.append((info, est_remaining / info.limit))
-        if len(candidates) == 0:
-            return None
-        return min(candidates, key=lambda x: x[1])[0]
+            candidates.append((info, est_remaining / info.limit))
+        assert len(candidates) > 0
+        candidates.sort(key=lambda x: x[1])  # most depleted first
+        most_depleted = candidates[0]
+        _logger.debug(
+            f'check_resource_limits({request_resources}): '
+            f'most_depleted={most_depleted[0].resource}, rel_remaining={most_depleted[1]}'
+        )
+        # 0.05: leave some headroom, we don't have perfect information
+        if most_depleted[1] < 0.05:
+            return most_depleted[0]
+        return None
 
     async def _exec(self, request: FnCallArgs, exec_ctx: ExecCtx, num_retries: int, is_task: bool) -> None:
         assert all(not row.has_val[request.fn_call.slot_idx] for row in request.rows)
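
_check_resource_limits() now normalizes every resource to its fraction of remaining capacity before applying the 5% headroom threshold to the single most depleted one. A worked example with illustrative numbers:

    # est_remaining = info.remaining - self.est_usage[resource] - usage
    # tokens:   (12_000 - 6_000 - 5_500) / 200_000 = 0.0025   <- most depleted
    # requests: (450 - 10 - 1) / 500               = 0.878
    # most_depleted[1] == 0.0025 < 0.05, so the tokens RateLimitInfo is returned
    # and the caller waits on that resource's reset_at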
@@ -188,7 +205,8 @@
                 for row, result in zip(request.rows, batch_result):
                     row[request.fn_call.slot_idx] = result
             else:
-                result = await pxt_fn.aexec(*request.args, **request.kwargs)
+                request_kwargs = {**request.kwargs, '_runtime_ctx': env.RuntimeCtx(is_retry=num_retries > 0)}
+                result = await pxt_fn.aexec(*request.args, **request_kwargs)
                 request.row[request.fn_call.slot_idx] = result
             end_ts = datetime.datetime.now(tz=datetime.timezone.utc)
             _logger.debug(
@@ -202,10 +220,14 @@
             self.dispatcher.dispatch(request.rows, exec_ctx)
         except Exception as exc:
             _logger.debug(f'scheduler {self.resource_pool}: exception in slot {request.fn_call.slot_idx}: {exc}')
+            if hasattr(exc, 'response') and hasattr(exc.response, 'headers'):
+                _logger.debug(f'scheduler {self.resource_pool}: exception headers: {exc.response.headers}')
             if self.pool_info is None:
                 # our pool info should be available at this point
                 self._set_pool_info()
             assert self.pool_info is not None
+            self.pool_info.record_exc(exc)
+
             if num_retries < self.MAX_RETRIES:
                 retry_delay = self.pool_info.get_retry_delay(exc)
                 if retry_delay is not None:
@@ -214,7 +236,6 @@
                     await asyncio.sleep(retry_delay)
                     self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx))
                     return
-                # TODO: update resource limits reported in exc.response.headers, if present
 
             # record the exception
             _, _, exc_tb = sys.exc_info()
pixeltable/exprs/array_slice.py CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Any, Optional, Union
+from typing import Any, Optional
 
 import sqlalchemy as sql
 
@@ -16,7 +16,7 @@ class ArraySlice(Expr):
     Slice operation on an array, eg, t.array_col[:, 1:2].
     """
 
-    def __init__(self, arr: Expr, index: tuple[Union[int, slice], ...]):
+    def __init__(self, arr: Expr, index: tuple[int | slice, ...]):
         assert arr.col_type.is_array_type()
         # determine result type
         super().__init__(arr.col_type)
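
For reference, the index tuple in the new signature is exactly what Python's indexing protocol produces; for example:

    t.array_col[:, 1:2]  # reaches ArraySlice with index == (slice(None, None, None), slice(1, 2, None))
    t.array_col[0, :]    # index == (0, slice(None, None, None))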
pixeltable/exprs/data_row.py CHANGED
@@ -281,6 +281,19 @@ class DataRow:
             pass
         self.vals[index] = None
 
+    def move_tmp_media_file(self, index: int, col: catalog.Column) -> None:
+        """If a media url refers to data in a temporary file, move the data to the MediaStore"""
+        if self.file_urls[index] is None:
+            return
+        assert self.excs[index] is None
+        assert col.col_type.is_media_type()
+        src_path = MediaStore.resolve_tmp_url(self.file_urls[index])
+        if src_path is None:
+            # The media url does not point to a temporary file, leave it as is
+            return
+        new_file_url = MediaStore.relocate_local_media_file(src_path, col)
+        self.file_urls[index] = new_file_url
+
     @property
     def rowid(self) -> tuple[int, ...]:
         return self.pk[:-1]
pixeltable/exprs/expr.py CHANGED
@@ -7,7 +7,7 @@ import inspect
 import json
 import sys
 import typing
-from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Optional, TypeVar, Union, overload
+from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Optional, TypeVar, overload
 from uuid import UUID
 
 import numpy as np
@@ -550,7 +550,7 @@
         else:
             return InPredicate(self, value_set_literal=value_set)
 
-    def astype(self, new_type: Union[ts.ColumnType, type, _AnnotatedAlias]) -> 'exprs.TypeCast':
+    def astype(self, new_type: ts.ColumnType | type | _AnnotatedAlias) -> 'exprs.TypeCast':
         from pixeltable.exprs import TypeCast
 
         # Interpret the type argument the same way we would if given in a schema
@@ -562,7 +562,7 @@
         return TypeCast(self, col_type)
 
     def apply(
-        self, fn: Callable, *, col_type: Union[ts.ColumnType, type, _AnnotatedAlias, None] = None
+        self, fn: Callable, *, col_type: ts.ColumnType | type | _AnnotatedAlias | None = None
     ) -> 'exprs.FunctionCall':
         if col_type is not None:
             col_type = ts.ColumnType.normalize_type(col_type)
@@ -646,7 +646,7 @@
 
     def _make_comparison(self, op: ComparisonOperator, other: object) -> 'exprs.Comparison':
         """
-        other: Union[Expr, LiteralPythonTypes]
+        other: Expr | LiteralPythonTypes
         """
         # TODO: check for compatibility
         from .comparison import Comparison
@@ -661,7 +661,7 @@
     def __neg__(self) -> 'exprs.ArithmeticExpr':
         return self._make_arithmetic_expr(ArithmeticOperator.MUL, -1)
 
-    def __add__(self, other: object) -> Union[exprs.ArithmeticExpr, exprs.StringOp]:
+    def __add__(self, other: object) -> exprs.ArithmeticExpr | exprs.StringOp:
         if isinstance(self, str) or (isinstance(self, Expr) and self.col_type.is_string_type()):
             return self._make_string_expr(StringOperator.CONCAT, other)
         return self._make_arithmetic_expr(ArithmeticOperator.ADD, other)
@@ -669,7 +669,7 @@
     def __sub__(self, other: object) -> 'exprs.ArithmeticExpr':
         return self._make_arithmetic_expr(ArithmeticOperator.SUB, other)
 
-    def __mul__(self, other: object) -> Union['exprs.ArithmeticExpr', 'exprs.StringOp']:
+    def __mul__(self, other: object) -> 'exprs.ArithmeticExpr' | 'exprs.StringOp':
         if isinstance(self, str) or (isinstance(self, Expr) and self.col_type.is_string_type()):
             return self._make_string_expr(StringOperator.REPEAT, other)
         return self._make_arithmetic_expr(ArithmeticOperator.MUL, other)
@@ -683,7 +683,7 @@
     def __floordiv__(self, other: object) -> 'exprs.ArithmeticExpr':
         return self._make_arithmetic_expr(ArithmeticOperator.FLOORDIV, other)
 
-    def __radd__(self, other: object) -> Union['exprs.ArithmeticExpr', 'exprs.StringOp']:
+    def __radd__(self, other: object) -> 'exprs.ArithmeticExpr' | 'exprs.StringOp':
         if isinstance(other, str) or (isinstance(other, Expr) and other.col_type.is_string_type()):
             return self._rmake_string_expr(StringOperator.CONCAT, other)
         return self._rmake_arithmetic_expr(ArithmeticOperator.ADD, other)
@@ -691,7 +691,7 @@
     def __rsub__(self, other: object) -> 'exprs.ArithmeticExpr':
         return self._rmake_arithmetic_expr(ArithmeticOperator.SUB, other)
 
-    def __rmul__(self, other: object) -> Union['exprs.ArithmeticExpr', 'exprs.StringOp']:
+    def __rmul__(self, other: object) -> 'exprs.ArithmeticExpr' | 'exprs.StringOp':
         if isinstance(other, str) or (isinstance(other, Expr) and other.col_type.is_string_type()):
             return self._rmake_string_expr(StringOperator.REPEAT, other)
         return self._rmake_arithmetic_expr(ArithmeticOperator.MUL, other)
@@ -733,7 +733,7 @@
 
     def _make_arithmetic_expr(self, op: ArithmeticOperator, other: object) -> 'exprs.ArithmeticExpr':
         """
-        other: Union[Expr, LiteralPythonTypes]
+        other: Expr | LiteralPythonTypes
         """
         # TODO: check for compatibility
         from .arithmetic_expr import ArithmeticExpr
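
These union return types reflect real runtime dispatch: + and * mean concatenation/repetition on string expressions and arithmetic otherwise. A usage sketch against a hypothetical table t with a string column s and an int column n:

    t.s + '!'  # string operand  -> StringOp (CONCAT)
    t.s * 3    # string operand  -> StringOp (REPEAT)
    t.n + 1    # numeric operand -> ArithmeticExpr (ADD)
    3 * t.n    # via __rmul__    -> ArithmeticExpr (MUL)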
pixeltable/exprs/function_call.py CHANGED
@@ -4,7 +4,7 @@ import inspect
 import logging
 import sys
 from textwrap import dedent
-from typing import Any, Optional, Sequence, Union
+from typing import Any, Optional, Sequence
 
 import sqlalchemy as sql
 
@@ -36,7 +36,7 @@ class FunctionCall(Expr):
     # - a component index, if the parameter is a non-variadic parameter
     # - a list of component indices, if the parameter is a variadic positional parameter
     # - a dict mapping keyword names to component indices, if the parameter is a variadic keyword parameter
-    bound_idxs: dict[str, Union[int, list[int], dict[str, int]]]
+    bound_idxs: dict[str, int | list[int] | dict[str, int]]
 
     return_type: ts.ColumnType
     group_by_start_idx: int
pixeltable/exprs/globals.py CHANGED
@@ -2,10 +2,9 @@ from __future__ import annotations
 
 import datetime
 import enum
-from typing import Union
 
 # Python types corresponding to our literal types
-LiteralPythonTypes = Union[str, int, float, bool, datetime.datetime, datetime.date]
+LiteralPythonTypes = str | int | float | bool | datetime.datetime | datetime.date
 
 
 def print_slice(s: slice) -> str:
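
This is the same PEP 604 cleanup applied throughout the release: X | Y replaces Union[X, Y] and X | None replaces Optional[X]. Unlike the annotation-only changes above, a module-level alias like this one is evaluated eagerly, so it assumes a Python 3.10+ runtime. The two spellings produce equal types:

    import datetime
    from typing import Union

    Old = Union[str, int, float, bool, datetime.datetime, datetime.date]
    New = str | int | float | bool | datetime.datetime | datetime.date
    assert Old == New  # typing.Union and types.UnionType compare equal member-wise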