pixeltable 0.4.5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +4 -2
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +3 -3
- pixeltable/catalog/column.py +49 -0
- pixeltable/catalog/insertable_table.py +0 -7
- pixeltable/catalog/schema_object.py +1 -14
- pixeltable/catalog/table.py +139 -53
- pixeltable/catalog/table_version.py +30 -138
- pixeltable/catalog/view.py +2 -1
- pixeltable/dataframe.py +2 -3
- pixeltable/env.py +43 -5
- pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
- pixeltable/exec/expr_eval/schedulers.py +36 -15
- pixeltable/exprs/array_slice.py +2 -2
- pixeltable/exprs/data_row.py +13 -0
- pixeltable/exprs/expr.py +9 -9
- pixeltable/exprs/function_call.py +2 -2
- pixeltable/exprs/globals.py +1 -2
- pixeltable/exprs/json_path.py +3 -3
- pixeltable/exprs/row_builder.py +14 -16
- pixeltable/exprs/string_op.py +3 -3
- pixeltable/func/query_template_function.py +2 -2
- pixeltable/func/signature.py +30 -3
- pixeltable/func/tools.py +2 -2
- pixeltable/functions/anthropic.py +75 -25
- pixeltable/functions/globals.py +2 -2
- pixeltable/functions/llama_cpp.py +9 -1
- pixeltable/functions/openai.py +74 -54
- pixeltable/functions/video.py +54 -1
- pixeltable/functions/vision.py +2 -2
- pixeltable/globals.py +74 -12
- pixeltable/io/datarows.py +3 -3
- pixeltable/io/fiftyone.py +4 -4
- pixeltable/io/globals.py +3 -3
- pixeltable/io/hf_datasets.py +4 -4
- pixeltable/io/pandas.py +6 -6
- pixeltable/io/parquet.py +3 -3
- pixeltable/io/table_data_conduit.py +2 -2
- pixeltable/io/utils.py +2 -2
- pixeltable/iterators/document.py +2 -2
- pixeltable/iterators/video.py +49 -9
- pixeltable/share/packager.py +45 -36
- pixeltable/store.py +5 -25
- pixeltable/type_system.py +5 -8
- pixeltable/utils/__init__.py +2 -2
- pixeltable/utils/arrow.py +5 -5
- pixeltable/utils/description_helper.py +3 -3
- pixeltable/utils/iceberg.py +1 -2
- {pixeltable-0.4.5.dist-info → pixeltable-0.4.7.dist-info}/METADATA +109 -59
- {pixeltable-0.4.5.dist-info → pixeltable-0.4.7.dist-info}/RECORD +64 -64
- {pixeltable-0.4.5.dist-info → pixeltable-0.4.7.dist-info}/WHEEL +1 -1
- pixeltable-0.4.7.dist-info/entry_points.txt +2 -0
- pixeltable-0.4.5.dist-info/entry_points.txt +0 -3
- {pixeltable-0.4.5.dist-info → pixeltable-0.4.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/catalog/table_version.py
CHANGED

@@ -14,7 +14,6 @@ import sqlalchemy as sql
 
 import pixeltable as pxt
 import pixeltable.exceptions as excs
-import pixeltable.type_system as ts
 from pixeltable import exprs, index
 from pixeltable.env import Env
 from pixeltable.iterators import ComponentIterator
@@ -223,18 +222,23 @@ class TableVersion:
         view_md: Optional[schema.ViewMd] = None,
     ) -> TableVersionMd:
         user = Env.get().user
+        timestamp = time.time()
 
-        # assign ids
+        # assign ids, create metadata
         cols_by_name: dict[str, Column] = {}
+        column_md: dict[int, schema.ColumnMd] = {}
+        schema_col_md: dict[int, schema.SchemaColumn] = {}
         for pos, col in enumerate(cols):
             col.id = pos
             col.schema_version_add = 0
             cols_by_name[col.name] = col
             if col.is_computed:
                 col.check_value_expr()
+            col_md, sch_md = col.to_md(pos)
+            assert sch_md is not None
+            column_md[col.id] = col_md
+            schema_col_md[col.id] = sch_md
 
-        timestamp = time.time()
-        column_md = cls._create_column_md(cols)
         tbl_id = uuid.uuid4()
         tbl_id_str = str(tbl_id)
         tbl_md = schema.TableMd(
@@ -256,18 +260,15 @@ class TableVersion:
         )
 
         table_version_md = schema.TableVersionMd(
-            tbl_id=tbl_id_str,
+            tbl_id=tbl_id_str,
+            created_at=timestamp,
+            version=0,
+            schema_version=0,
+            user=user,
+            update_status=None,
+            additional_md={},
         )
 
-        schema_col_md: dict[int, schema.SchemaColumn] = {}
-        for pos, col in enumerate(cols):
-            md = schema.SchemaColumn(
-                pos=pos,
-                name=col.name,
-                media_validation=col._media_validation.name.lower() if col._media_validation is not None else None,
-            )
-            schema_col_md[col.id] = md
-
         schema_version_md = schema.TableSchemaVersionMd(
             tbl_id=tbl_id_str,
             schema_version=0,
@@ -290,76 +291,11 @@ class TableVersion:
         comment: str,
         media_validation: MediaValidation,
     ) -> tuple[UUID, Optional[TableVersion]]:
-
-
-        # assign ids
-        cols_by_name: dict[str, Column] = {}
-        for pos, col in enumerate(cols):
-            col.id = pos
-            col.schema_version_add = 0
-            cols_by_name[col.name] = col
-            if col.is_computed:
-                col.check_value_expr()
-
-        timestamp = time.time()
-        # create schema.Table
-        # Column.dependent_cols for existing cols is wrong at this point, but init() will set it correctly
-        column_md = cls._create_column_md(cols)
-        tbl_id = uuid.uuid4()
-        tbl_id_str = str(tbl_id)
-        table_md = schema.TableMd(
-            tbl_id=tbl_id_str,
-            name=name,
-            user=user,
-            is_replica=False,
-            current_version=0,
-            current_schema_version=0,
-            next_col_id=len(cols),
-            next_idx_id=0,
-            next_row_id=0,
-            view_sn=0,
-            column_md=column_md,
-            index_md={},
-            external_stores=[],
-            view_md=None,
-            additional_md={},
-        )
-
-        # create schema.TableVersion of the initial version
-        table_version_md = schema.TableVersionMd(
-            tbl_id=tbl_id_str,
-            created_at=timestamp,
-            version=0,
-            schema_version=0,
-            user=user,
-            update_status=None,
-            additional_md={},
-        )
-
-        # create schema.TableSchemaVersion
-        schema_col_md: dict[int, schema.SchemaColumn] = {}
-        for pos, col in enumerate(cols):
-            md = schema.SchemaColumn(
-                pos=pos,
-                name=col.name,
-                media_validation=col._media_validation.name.lower() if col._media_validation is not None else None,
-            )
-            schema_col_md[col.id] = md
-
-        schema_version_md = schema.TableSchemaVersionMd(
-            tbl_id=tbl_id_str,
-            schema_version=0,
-            preceding_schema_version=None,
-            columns=schema_col_md,
-            num_retained_versions=num_retained_versions,
-            comment=comment,
-            media_validation=media_validation.name.lower(),
-            additional_md={},
-        )
-
+        inital_md = cls.create_initial_md(name, cols, num_retained_versions, comment, media_validation, view_md=None)
         cat = pxt.catalog.Catalog.get()
 
-
+        tbl_id = UUID(hex=inital_md.tbl_md.tbl_id)
+        tbl_version = cls(tbl_id, inital_md.tbl_md, inital_md.version_md, None, inital_md.schema_version_md, [])
         # TODO: break this up, so that Catalog.create_table() registers tbl_version
         cat._tbl_versions[tbl_id, None] = tbl_version
         tbl_version.init()
@@ -373,8 +309,8 @@ class TableVersion:
             tbl_id=tbl_id,
             dir_id=dir_id,
             tbl_md=tbl_version.tbl_md,
-            version_md=table_version_md,
-            schema_version_md=schema_version_md,
+            version_md=inital_md.version_md,
+            schema_version_md=inital_md.schema_version_md,
         )
         return tbl_id, tbl_version
 
@@ -480,25 +416,7 @@ class TableVersion:
         sorted_column_md = sorted(self.tbl_md.column_md.values(), key=lambda item: item.id)
         for col_md in sorted_column_md:
             schema_col_md = self.schema_version_md.columns.get(col_md.id)
-
-            media_val = (
-                MediaValidation[schema_col_md.media_validation.upper()]
-                if schema_col_md is not None and schema_col_md.media_validation is not None
-                else None
-            )
-            col = Column(
-                col_id=col_md.id,
-                name=col_name,
-                col_type=ts.ColumnType.from_dict(col_md.col_type),
-                is_pk=col_md.is_pk,
-                stored=col_md.stored,
-                media_validation=media_val,
-                schema_version_add=col_md.schema_version_add,
-                schema_version_drop=col_md.schema_version_drop,
-                value_expr_dict=col_md.value_expr,
-                tbl=self,
-            )
-            col.tbl = self
+            col = Column.from_md(col_md, self, schema_col_md)
             self.cols.append(col)
 
         # populate the lookup structures before Expr.from_dict()
@@ -783,31 +701,22 @@ class TableVersion:
         num_excs = 0
         cols_with_excs: list[Column] = []
         for col in cols_to_add:
+            assert col.id is not None, 'Column id must be set before adding the column'
             excs_per_col = 0
             col.schema_version_add = self.schema_version
             # add the column to the lookup structures now, rather than after the store changes executed successfully,
             # because it might be referenced by the next column's value_expr
             self.cols.append(col)
-            if col.name is not None:
-                self.cols_by_name[col.name] = col
             self.cols_by_id[col.id] = col
-
-            # also add to stored md
-            self._tbl_md.column_md[col.id] = schema.ColumnMd(
-                id=col.id,
-                col_type=col.col_type.as_dict(),
-                is_pk=col.is_pk,
-                schema_version_add=col.schema_version_add,
-                schema_version_drop=col.schema_version_drop,
-                value_expr=col.value_expr.as_dict() if col.value_expr is not None else None,
-                stored=col.stored,
-            )
             if col.name is not None:
-                self.
-
-
-
-
+                self.cols_by_name[col.name] = col
+                col_md, sch_md = col.to_md(len(self.cols_by_name))
+                assert sch_md is not None, 'Schema column metadata must be created for user-facing columns'
+                self._tbl_md.column_md[col.id] = col_md
+                self._schema_version_md.columns[col.id] = sch_md
+            else:
+                col_md, _ = col.to_md()
+                self._tbl_md.column_md[col.id] = col_md
 
             if col.is_stored:
                 self.store_tbl.add_column(col)
@@ -1628,23 +1537,6 @@ class TableVersion:
             return 1 + self.base.get().num_rowid_columns()
         return 1
 
-    @classmethod
-    def _create_column_md(cls, cols: list[Column]) -> dict[int, schema.ColumnMd]:
-        column_md: dict[int, schema.ColumnMd] = {}
-        for col in cols:
-            value_expr_dict = col.value_expr.as_dict() if col.value_expr is not None else None
-            assert col.is_pk is not None
-            column_md[col.id] = schema.ColumnMd(
-                id=col.id,
-                col_type=col.col_type.as_dict(),
-                is_pk=col.is_pk,
-                schema_version_add=col.schema_version_add,
-                schema_version_drop=col.schema_version_drop,
-                value_expr=value_expr_dict,
-                stored=col.stored,
-            )
-        return column_md
-
     @classmethod
     def _create_stores_md(cls, stores: Iterable[pxt.io.ExternalStore]) -> list[dict[str, Any]]:
         return [
pixeltable/catalog/view.py
CHANGED

@@ -25,6 +25,7 @@ from .tbl_ops import CreateStoreTableOp, LoadViewOp, TableOp
 from .update_status import UpdateStatus
 
 if TYPE_CHECKING:
+    from pixeltable.catalog.table import TableMetadata
     from pixeltable.globals import TableDataSource
 
 _logger = logging.getLogger('pixeltable')
@@ -261,7 +262,7 @@ class View(Table):
         """
         return self._snapshot_only and self._id == self._tbl_version_path.tbl_id
 
-    def _get_metadata(self) ->
+    def _get_metadata(self) -> 'TableMetadata':
         md = super()._get_metadata()
         md['is_view'] = True
         md['is_snapshot'] = self._tbl_version_path.is_snapshot()
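
Note: the TableMetadata import sits under TYPE_CHECKING, so it is evaluated only by static type checkers; the quoted return annotation is what makes that safe at runtime. The general pattern, as a standalone sketch (the dict-style body is an illustration, not View's actual implementation):

from __future__ import annotations

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # imported only for static analysis; never executed at runtime,
    # which avoids circular-import problems
    from pixeltable.catalog.table import TableMetadata

def _get_metadata() -> 'TableMetadata':
    # the string annotation is resolved lazily, so this runs without the import
    md: dict[str, Any] = {'is_view': True}
    return md  # type: ignore[return-value]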
pixeltable/dataframe.py
CHANGED

@@ -19,7 +19,6 @@ from typing import (
     Optional,
     Sequence,
     TypeVar,
-    Union,
 )
 
 import pandas as pd
@@ -766,7 +765,7 @@ class DataFrame:
         )
 
     def _create_join_predicate(
-        self, other: catalog.TableVersionPath, on:
+        self, other: catalog.TableVersionPath, on: exprs.Expr | Sequence[exprs.ColumnRef]
     ) -> exprs.Expr:
         """Verifies user-specified 'on' argument and converts it into a join predicate."""
         col_refs: list[exprs.ColumnRef] = []
@@ -829,7 +828,7 @@ class DataFrame:
     def join(
         self,
         other: catalog.Table,
-        on:
+        on: exprs.Expr | Sequence[exprs.ColumnRef] | None = None,
         how: plan.JoinType.LiteralType = 'inner',
     ) -> DataFrame:
         """
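
Note: the restored annotations document the two accepted forms of the `on` argument: a boolean join-predicate expression, or a sequence of column references. A hypothetical usage sketch (the table and column names are invented for illustration):

import pixeltable as pxt

orders = pxt.get_table('orders')        # hypothetical tables
customers = pxt.get_table('customers')

# 'on' as a sequence of column references (match by column)
df1 = orders.join(customers, on=[orders.customer_id], how='inner')

# 'on' as a boolean predicate expression
df2 = orders.join(customers, on=orders.customer_id == customers.id, how='left')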
pixeltable/env.py
CHANGED

@@ -17,7 +17,6 @@ import types
 import typing
 import uuid
 import warnings
-from abc import abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -890,6 +889,10 @@ class RateLimitsInfo:
     get_request_resources: Callable[..., dict[str, int]]
 
     resource_limits: dict[str, RateLimitInfo] = field(default_factory=dict)
+    has_exc: bool = False
+
+    def debug_str(self) -> str:
+        return ','.join(info.debug_str() for info in self.resource_limits.values())
 
     def is_initialized(self) -> bool:
         return len(self.resource_limits) > 0
@@ -897,7 +900,7 @@ class RateLimitsInfo:
     def reset(self) -> None:
         self.resource_limits.clear()
 
-    def record(self, **kwargs: Any) -> None:
+    def record(self, reset_exc: bool = False, **kwargs: Any) -> None:
         now = datetime.datetime.now(tz=datetime.timezone.utc)
         if len(self.resource_limits) == 0:
             self.resource_limits = {k: RateLimitInfo(k, now, *v) for k, v in kwargs.items() if v is not None}
@@ -908,14 +911,30 @@ class RateLimitsInfo:
                 f'reset={info.reset_at.strftime(TIME_FORMAT)} delta={(info.reset_at - now).total_seconds()}'
             )
         else:
+            if self.has_exc and not reset_exc:
+                # ignore updates until we're asked to reset
+                _logger.debug(f'rate_limits.record(): ignoring update {kwargs}')
+                return
+            self.has_exc = False
             for k, v in kwargs.items():
                 if v is not None:
                     self.resource_limits[k].update(now, *v)
 
-
+    def record_exc(self, exc: Exception) -> None:
+        """Update self.resource_limits based on the exception headers"""
+        self.has_exc = True
+
     def get_retry_delay(self, exc: Exception) -> Optional[float]:
         """Returns number of seconds to wait before retry, or None if not retryable"""
-
+        if len(self.resource_limits) == 0:
+            return 1.0
+        # we're looking for the maximum delay across all depleted resources
+        max_delay = 0.0
+        now = datetime.datetime.now(tz=datetime.timezone.utc)
+        for limit_info in self.resource_limits.values():
+            if limit_info.remaining < 0.05 * limit_info.limit:
+                max_delay = max(max_delay, (limit_info.reset_at - now).total_seconds())
+        return max_delay if max_delay > 0 else None
 
 
 @dataclass
@@ -928,9 +947,15 @@ class RateLimitInfo:
     remaining: int
     reset_at: datetime.datetime
 
+    def debug_str(self) -> str:
+        return (
+            f'{self.resource}@{self.recorded_at.strftime(TIME_FORMAT)}: '
+            f'{self.limit}/{self.remaining}/{self.reset_at.strftime(TIME_FORMAT)}'
+        )
+
     def update(self, recorded_at: datetime.datetime, limit: int, remaining: int, reset_at: datetime.datetime) -> None:
         # we always update everything, even though responses may come back out-of-order: we can't use reset_at to
-        # determine order, because it doesn't increase monotonically (the
+        # determine order, because it doesn't increase monotonically (the reset duration shortens as output_tokens
         # are freed up - going from max to actual)
         self.recorded_at = recorded_at
         self.limit = limit
@@ -942,3 +967,16 @@ class RateLimitInfo:
             f'Update {self.resource} rate limit: rem={self.remaining} reset={self.reset_at.strftime(TIME_FORMAT)} '
             f'reset_delta={reset_delta.total_seconds()} recorded_delta={(self.reset_at - recorded_at).total_seconds()}'
         )
+
+
+@dataclass
+class RuntimeCtx:
+    """
+    Container for runtime data provided by the execution system to udfs.
+
+    Udfs that accept the special _runtime_ctx parameter receive an instance of this class.
+    """
+
+    # Indicates a retry attempt following a rate limit error (error code: 429). Requires a 'rate-limits' resource pool.
+    # If True, call RateLimitsInfo.record() with reset_exc=True.
+    is_retry: bool = False
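
Note: taken together, these hunks make RateLimitsInfo exception-aware: record_exc() latches has_exc, record() then ignores header updates until called with reset_exc=True, and get_retry_delay() picks the longest time-to-reset among resources with less than 5% headroom (defaulting to one second when no limits have been reported yet). A self-contained sketch of that delay calculation, using a simplified stand-in for RateLimitInfo:

import datetime
from dataclasses import dataclass
from typing import Optional

@dataclass
class Limit:  # simplified stand-in for env.RateLimitInfo
    limit: int
    remaining: int
    reset_at: datetime.datetime

def get_retry_delay(limits: dict[str, Limit]) -> Optional[float]:
    if len(limits) == 0:
        return 1.0  # no rate-limit report seen yet: retry after a nominal delay
    now = datetime.datetime.now(tz=datetime.timezone.utc)
    # wait for the slowest reset among depleted resources (< 5% headroom)
    max_delay = 0.0
    for info in limits.values():
        if info.remaining < 0.05 * info.limit:
            max_delay = max(max_delay, (info.reset_at - now).total_seconds())
    return max_delay if max_delay > 0 else None

now = datetime.datetime.now(tz=datetime.timezone.utc)
limits = {'tokens': Limit(10_000, 100, now + datetime.timedelta(seconds=12))}
print(get_retry_delay(limits))  # ~12.0: the token budget is nearly exhausted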
pixeltable/exec/expr_eval/expr_eval_node.py
CHANGED

@@ -4,7 +4,7 @@ import asyncio
 import logging
 import traceback
 from types import TracebackType
-from typing import AsyncIterator, Iterable, Optional
+from typing import AsyncIterator, Iterable, Optional
 
 import numpy as np
 
@@ -49,7 +49,7 @@ class ExprEvalNode(ExecNode):
     # execution state
     tasks: set[asyncio.Task]  # collects all running tasks to prevent them from getting gc'd
     exc_event: asyncio.Event  # set if an exception needs to be propagated
-    error: Optional[
+    error: Optional[Exception]  # exception that needs to be propagated
     completed_rows: asyncio.Queue[exprs.DataRow]  # rows that have completed evaluation
     completed_event: asyncio.Event  # set when completed_rows is non-empty
     input_iter: AsyncIterator[DataRowBatch]
pixeltable/exec/expr_eval/schedulers.py
CHANGED

@@ -81,6 +81,8 @@ class RateLimitsScheduler(Scheduler):
         while True:
             if item is None:
                 item = await self.queue.get()
+            assert isinstance(item.request.fn_call.fn, func.CallableFunction)
+            assert '_runtime_ctx' in item.request.fn_call.fn.signature.system_parameters
             if item.num_retries > 0:
                 self.total_retried += 1
 
@@ -97,7 +99,6 @@ class RateLimitsScheduler(Scheduler):
                 continue
 
             # check rate limits
-            _logger.debug(f'checking rate limits for {self.resource_pool}')
            request_resources = self._get_request_resources(item.request)
             limits_info = self._check_resource_limits(request_resources)
             aws: list[Awaitable[None]] = []
@@ -116,21 +117,31 @@ class RateLimitsScheduler(Scheduler):
                 reset_at = limits_info.reset_at
                 if reset_at > now:
                     # we're waiting for the rate limit to reset
-
+                    wait_duration = (reset_at - now).total_seconds()
+                    wait_for_reset = asyncio.create_task(asyncio.sleep(wait_duration))
                     aws.append(wait_for_reset)
-                    _logger.debug(
+                    _logger.debug(
+                        f'waiting {wait_duration:.2f}s for rate limit reset of '
+                        f'{self.resource_pool}:{limits_info.resource} (remaining={limits_info.remaining})'
+                    )
 
             if len(aws) > 0:
                 # we have something to wait for
+                report_ts = limits_info.recorded_at
                 done, pending = await asyncio.wait(aws, return_when=asyncio.FIRST_COMPLETED)
                 for task in pending:
                     task.cancel()
                 if completed_aw in done:
                     _logger.debug(f'wait(): completed request for {self.resource_pool}')
                 if wait_for_reset in done:
-                    _logger.debug(f'wait(): rate limit reset for {self.resource_pool}')
-
-
+                    _logger.debug(f'wait(): rate limit reset for {self.resource_pool}:{limits_info.resource}')
+                    last_report_ts = self.pool_info.resource_limits[limits_info.resource].recorded_at
+                    if report_ts == last_report_ts:
+                        # if we haven't seen a new report since we started waiting, force waiting for another rate limit
+                        # report before making any scheduling decisions
+                        # TODO: is it a good idea to discard the information we have?
+                        _logger.debug(f'resetting {self.resource_pool}: currently at {self.pool_info.debug_str()}')
+                        self.pool_info.reset()
                     # re-evaluate current capacity for current item
                     continue
 
@@ -158,16 +169,22 @@ class RateLimitsScheduler(Scheduler):
 
     def _check_resource_limits(self, request_resources: dict[str, int]) -> Optional[env.RateLimitInfo]:
         """Returns the most depleted resource, relative to its limit, or None if all resources are within limits"""
-        candidates: list[tuple[env.RateLimitInfo, float]] = []  # (info, relative
+        candidates: list[tuple[env.RateLimitInfo, float]] = []  # (info, relative remaining)
         for resource, usage in request_resources.items():
-            # 0.05: leave some headroom, we don't have perfect information
             info = self.pool_info.resource_limits[resource]
             est_remaining = info.remaining - self.est_usage[resource] - usage
-
-
-
-
-
+            candidates.append((info, est_remaining / info.limit))
+        assert len(candidates) > 0
+        candidates.sort(key=lambda x: x[1])  # most depleted first
+        most_depleted = candidates[0]
+        _logger.debug(
+            f'check_resource_limits({request_resources}): '
+            f'most_depleted={most_depleted[0].resource}, rel_remaining={most_depleted[1]}'
+        )
+        # 0.05: leave some headroom, we don't have perfect information
+        if most_depleted[1] < 0.05:
+            return most_depleted[0]
+        return None
 
     async def _exec(self, request: FnCallArgs, exec_ctx: ExecCtx, num_retries: int, is_task: bool) -> None:
         assert all(not row.has_val[request.fn_call.slot_idx] for row in request.rows)
@@ -188,7 +205,8 @@ class RateLimitsScheduler(Scheduler):
                 for row, result in zip(request.rows, batch_result):
                     row[request.fn_call.slot_idx] = result
             else:
-
+                request_kwargs = {**request.kwargs, '_runtime_ctx': env.RuntimeCtx(is_retry=num_retries > 0)}
+                result = await pxt_fn.aexec(*request.args, **request_kwargs)
                 request.row[request.fn_call.slot_idx] = result
             end_ts = datetime.datetime.now(tz=datetime.timezone.utc)
             _logger.debug(
@@ -202,10 +220,14 @@ class RateLimitsScheduler(Scheduler):
             self.dispatcher.dispatch(request.rows, exec_ctx)
         except Exception as exc:
             _logger.debug(f'scheduler {self.resource_pool}: exception in slot {request.fn_call.slot_idx}: {exc}')
+            if hasattr(exc, 'response') and hasattr(exc.response, 'headers'):
+                _logger.debug(f'scheduler {self.resource_pool}: exception headers: {exc.response.headers}')
             if self.pool_info is None:
                 # our pool info should be available at this point
                 self._set_pool_info()
             assert self.pool_info is not None
+            self.pool_info.record_exc(exc)
+
             if num_retries < self.MAX_RETRIES:
                 retry_delay = self.pool_info.get_retry_delay(exc)
                 if retry_delay is not None:
@@ -214,7 +236,6 @@ class RateLimitsScheduler(Scheduler):
                     await asyncio.sleep(retry_delay)
                     self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx))
                     return
-            # TODO: update resource limits reported in exc.response.headers, if present
 
             # record the exception
             _, _, exc_tb = sys.exc_info()
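
Note: the scheduler now injects the special `_runtime_ctx` keyword argument into UDF calls so a UDF can distinguish a first attempt from a retry after a rate-limit error. A minimal sketch of that handshake (RuntimeCtx mirrors the env.py hunk above; the UDF itself is invented for illustration):

import asyncio
from dataclasses import dataclass

@dataclass
class RuntimeCtx:
    is_retry: bool = False

async def call_api(prompt: str, _runtime_ctx: RuntimeCtx | None = None) -> str:
    # on a retry, a UDF would tell the rate limiter to accept fresh limit reports
    if _runtime_ctx is not None and _runtime_ctx.is_retry:
        print('retry attempt: would call record() with reset_exc=True')
    return f'response to {prompt!r}'

async def exec_once(num_retries: int) -> str:
    # mirrors the hunk: request_kwargs = {**request.kwargs, '_runtime_ctx': env.RuntimeCtx(is_retry=num_retries > 0)}
    kwargs = {'_runtime_ctx': RuntimeCtx(is_retry=num_retries > 0)}
    return await call_api('hello', **kwargs)

print(asyncio.run(exec_once(num_retries=1)))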
pixeltable/exprs/array_slice.py
CHANGED

@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Any, Optional
+from typing import Any, Optional
 
 import sqlalchemy as sql
 

@@ -16,7 +16,7 @@ class ArraySlice(Expr):
     Slice operation on an array, eg, t.array_col[:, 1:2].
     """
 
-    def __init__(self, arr: Expr, index: tuple[
+    def __init__(self, arr: Expr, index: tuple[int | slice, ...]):
         assert arr.col_type.is_array_type()
         # determine result type
         super().__init__(arr.col_type)
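
Note: the completed annotation spells out what an expression like t.array_col[:, 1:2] actually passes to __init__: Python converts a multi-dimensional subscript into a tuple of ints and slices, as this standalone probe shows:

class Probe:
    def __getitem__(self, index):
        return index  # echo back whatever the subscript syntax produced

p = Probe()
print(p[:, 1:2])  # (slice(None, None, None), slice(1, 2, None))
print(p[0, 1:2])  # (0, slice(1, 2, None))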
pixeltable/exprs/data_row.py
CHANGED

@@ -281,6 +281,19 @@ class DataRow:
             pass
         self.vals[index] = None
 
+    def move_tmp_media_file(self, index: int, col: catalog.Column) -> None:
+        """If a media url refers to data in a temporary file, move the data to the MediaStore"""
+        if self.file_urls[index] is None:
+            return
+        assert self.excs[index] is None
+        assert col.col_type.is_media_type()
+        src_path = MediaStore.resolve_tmp_url(self.file_urls[index])
+        if src_path is None:
+            # The media url does not point to a temporary file, leave it as is
+            return
+        new_file_url = MediaStore.relocate_local_media_file(src_path, col)
+        self.file_urls[index] = new_file_url
+
     @property
     def rowid(self) -> tuple[int, ...]:
         return self.pk[:-1]
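
Note: move_tmp_media_file() leaves non-temporary URLs untouched and rewrites temporary ones to point at their MediaStore destination. A minimal sketch of that control flow with stand-in helpers (resolve_tmp_url/relocate here are simplifications, not Pixeltable's actual MediaStore API):

from pathlib import Path
from typing import Optional

TMP_DIR = Path('/tmp/pxt_tmp')  # assumed temp-file location for this sketch

def resolve_tmp_url(url: str) -> Optional[Path]:
    # return a local path only if the URL points into the temp dir
    if not url.startswith('file://'):
        return None
    path = Path(url[len('file://'):])
    return path if TMP_DIR in path.parents else None

def relocate(src: Path, dest_dir: Path) -> str:
    dest = dest_dir / src.name  # a real store would also dedupe/rename
    # src.rename(dest)          # the actual move would happen here
    return f'file://{dest}'

url = 'file:///tmp/pxt_tmp/frame_001.jpg'
src = resolve_tmp_url(url)
if src is not None:
    url = relocate(src, Path('/var/media_store'))
print(url)  # file:///var/media_store/frame_001.jpg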
pixeltable/exprs/expr.py
CHANGED

@@ -7,7 +7,7 @@ import inspect
 import json
 import sys
 import typing
-from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Optional, TypeVar,
+from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Optional, TypeVar, overload
 from uuid import UUID
 
 import numpy as np
@@ -550,7 +550,7 @@ class Expr(abc.ABC):
         else:
             return InPredicate(self, value_set_literal=value_set)
 
-    def astype(self, new_type:
+    def astype(self, new_type: ts.ColumnType | type | _AnnotatedAlias) -> 'exprs.TypeCast':
         from pixeltable.exprs import TypeCast
 
         # Interpret the type argument the same way we would if given in a schema
@@ -562,7 +562,7 @@ class Expr(abc.ABC):
         return TypeCast(self, col_type)
 
     def apply(
-        self, fn: Callable, *, col_type:
+        self, fn: Callable, *, col_type: ts.ColumnType | type | _AnnotatedAlias | None = None
     ) -> 'exprs.FunctionCall':
         if col_type is not None:
             col_type = ts.ColumnType.normalize_type(col_type)
@@ -646,7 +646,7 @@ class Expr(abc.ABC):
 
     def _make_comparison(self, op: ComparisonOperator, other: object) -> 'exprs.Comparison':
         """
-        other:
+        other: Expr | LiteralPythonTypes
         """
         # TODO: check for compatibility
         from .comparison import Comparison
@@ -661,7 +661,7 @@ class Expr(abc.ABC):
     def __neg__(self) -> 'exprs.ArithmeticExpr':
         return self._make_arithmetic_expr(ArithmeticOperator.MUL, -1)
 
-    def __add__(self, other: object) ->
+    def __add__(self, other: object) -> exprs.ArithmeticExpr | exprs.StringOp:
         if isinstance(self, str) or (isinstance(self, Expr) and self.col_type.is_string_type()):
             return self._make_string_expr(StringOperator.CONCAT, other)
         return self._make_arithmetic_expr(ArithmeticOperator.ADD, other)
@@ -669,7 +669,7 @@ class Expr(abc.ABC):
     def __sub__(self, other: object) -> 'exprs.ArithmeticExpr':
         return self._make_arithmetic_expr(ArithmeticOperator.SUB, other)
 
-    def __mul__(self, other: object) ->
+    def __mul__(self, other: object) -> 'exprs.ArithmeticExpr' | 'exprs.StringOp':
         if isinstance(self, str) or (isinstance(self, Expr) and self.col_type.is_string_type()):
             return self._make_string_expr(StringOperator.REPEAT, other)
         return self._make_arithmetic_expr(ArithmeticOperator.MUL, other)
@@ -683,7 +683,7 @@ class Expr(abc.ABC):
     def __floordiv__(self, other: object) -> 'exprs.ArithmeticExpr':
         return self._make_arithmetic_expr(ArithmeticOperator.FLOORDIV, other)
 
-    def __radd__(self, other: object) ->
+    def __radd__(self, other: object) -> 'exprs.ArithmeticExpr' | 'exprs.StringOp':
         if isinstance(other, str) or (isinstance(other, Expr) and other.col_type.is_string_type()):
             return self._rmake_string_expr(StringOperator.CONCAT, other)
         return self._rmake_arithmetic_expr(ArithmeticOperator.ADD, other)
@@ -691,7 +691,7 @@ class Expr(abc.ABC):
     def __rsub__(self, other: object) -> 'exprs.ArithmeticExpr':
         return self._rmake_arithmetic_expr(ArithmeticOperator.SUB, other)
 
-    def __rmul__(self, other: object) ->
+    def __rmul__(self, other: object) -> 'exprs.ArithmeticExpr' | 'exprs.StringOp':
         if isinstance(other, str) or (isinstance(other, Expr) and other.col_type.is_string_type()):
             return self._rmake_string_expr(StringOperator.REPEAT, other)
         return self._rmake_arithmetic_expr(ArithmeticOperator.MUL, other)
@@ -733,7 +733,7 @@ class Expr(abc.ABC):
 
     def _make_arithmetic_expr(self, op: ArithmeticOperator, other: object) -> 'exprs.ArithmeticExpr':
         """
-        other:
+        other: Expr | LiteralPythonTypes
        """
         # TODO: check for compatibility
         from .arithmetic_expr import ArithmeticExpr
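
Note: the widened return annotations reflect the dispatch already visible in the method bodies: on string-typed expressions, + builds a CONCAT StringOp and * a REPEAT StringOp; otherwise both build ArithmeticExprs. The same double dispatch in a plain-Python sketch (types reduced to a flag, operator nodes to tuples):

class Expr:
    def __init__(self, is_string: bool):
        self.is_string = is_string

    def __add__(self, other):
        # string expressions concatenate; everything else does arithmetic
        return ('CONCAT', self, other) if self.is_string else ('ADD', self, other)

    def __mul__(self, other):
        return ('REPEAT', self, other) if self.is_string else ('MUL', self, other)

print(Expr(is_string=True) + '!')  # ('CONCAT', ...)
print(Expr(is_string=False) * 2)   # ('MUL', ...)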
pixeltable/exprs/function_call.py
CHANGED

@@ -4,7 +4,7 @@ import inspect
 import logging
 import sys
 from textwrap import dedent
-from typing import Any, Optional, Sequence
+from typing import Any, Optional, Sequence
 
 import sqlalchemy as sql
 
@@ -36,7 +36,7 @@ class FunctionCall(Expr):
     # - a component index, if the parameter is a non-variadic parameter
     # - a list of component indices, if the parameter is a variadic positional parameter
     # - a dict mapping keyword names to component indices, if the parameter is a variadic keyword parameter
-    bound_idxs: dict[str,
+    bound_idxs: dict[str, int | list[int] | dict[str, int]]
 
     return_type: ts.ColumnType
     group_by_start_idx: int
pixeltable/exprs/globals.py
CHANGED

@@ -2,10 +2,9 @@ from __future__ import annotations
 
 import datetime
 import enum
-from typing import Union
 
 # Python types corresponding to our literal types
-LiteralPythonTypes =
+LiteralPythonTypes = str | int | float | bool | datetime.datetime | datetime.date
 
 
 def print_slice(s: slice) -> str:
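
Note: this rewrite (like the other annotation changes above) adopts PEP 604 union syntax. Evaluating such a union as a runtime expression, as in this alias, requires Python 3.10+; annotation-only uses can additionally rely on `from __future__ import annotations` on older interpreters. For example:

import datetime

# PEP 604 spelling, equivalent to Union[str, int, float, bool, datetime.datetime, datetime.date]
LiteralPythonTypes = str | int | float | bool | datetime.datetime | datetime.date

def describe(v: LiteralPythonTypes) -> str:
    return f'{v!r}: {type(v).__name__}'

print(describe(3.14))                        # 3.14: float
print(describe(datetime.date(2024, 1, 1)))   # datetime.date(2024, 1, 1): date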
|