pixeltable 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +3 -10
- pixeltable/catalog/catalog.py +139 -59
- pixeltable/catalog/column.py +32 -23
- pixeltable/catalog/globals.py +2 -45
- pixeltable/catalog/insertable_table.py +5 -2
- pixeltable/catalog/path.py +6 -0
- pixeltable/catalog/table.py +173 -23
- pixeltable/catalog/table_version.py +156 -92
- pixeltable/catalog/table_version_handle.py +26 -1
- pixeltable/catalog/update_status.py +179 -0
- pixeltable/catalog/view.py +12 -3
- pixeltable/config.py +76 -12
- pixeltable/dataframe.py +1 -1
- pixeltable/env.py +29 -0
- pixeltable/exec/exec_node.py +7 -24
- pixeltable/exec/expr_eval/schedulers.py +134 -7
- pixeltable/exprs/column_property_ref.py +23 -20
- pixeltable/exprs/column_ref.py +24 -18
- pixeltable/exprs/data_row.py +9 -0
- pixeltable/exprs/function_call.py +2 -2
- pixeltable/exprs/row_builder.py +46 -14
- pixeltable/exprs/rowid_ref.py +0 -4
- pixeltable/func/function.py +3 -3
- pixeltable/functions/audio.py +36 -9
- pixeltable/functions/video.py +57 -10
- pixeltable/globals.py +61 -1
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +39 -64
- pixeltable/io/globals.py +4 -4
- pixeltable/io/hf_datasets.py +10 -2
- pixeltable/io/label_studio.py +52 -48
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +125 -0
- pixeltable/metadata/converters/util.py +3 -0
- pixeltable/metadata/notes.py +2 -0
- pixeltable/metadata/schema.py +14 -2
- pixeltable/metadata/utils.py +78 -0
- pixeltable/plan.py +26 -18
- pixeltable/share/packager.py +20 -38
- pixeltable/store.py +121 -142
- pixeltable/type_system.py +2 -2
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/media_store.py +39 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/METADATA +1 -1
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/RECORD +51 -47
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/entry_points.txt +0 -0
pixeltable/catalog/update_status.py
ADDED
@@ -0,0 +1,179 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from IPython.lib.pretty import RepresentationPrinter
+
+
+@dataclass(frozen=True)
+class RowCountStats:
+    """
+    Statistics about the counts of rows affected by a table operation.
+    """
+
+    ins_rows: int = 0  # rows inserted
+    del_rows: int = 0  # rows deleted
+    upd_rows: int = 0  # rows updated
+    num_excs: int = 0  # total number of exceptions
+    # TODO: disambiguate what this means: # of slots computed or # of columns computed?
+    computed_values: int = 0  # number of computed values (e.g., computed columns) affected by the operation
+
+    @property
+    def num_rows(self) -> int:
+        return self.ins_rows + self.del_rows + self.upd_rows
+
+    def insert_to_update(self) -> 'RowCountStats':
+        """
+        Convert insert row count stats to update row count stats.
+        This is used when an insert operation is treated as an update.
+        """
+        return RowCountStats(
+            ins_rows=0,
+            del_rows=self.del_rows,
+            upd_rows=self.upd_rows + self.ins_rows,
+            num_excs=self.num_excs,
+            computed_values=self.computed_values,
+        )
+
+    def __add__(self, other: 'RowCountStats') -> 'RowCountStats':
+        """
+        Add the stats from two RowCountStats objects together.
+        """
+        return RowCountStats(
+            ins_rows=self.ins_rows + other.ins_rows,
+            del_rows=self.del_rows + other.del_rows,
+            upd_rows=self.upd_rows + other.upd_rows,
+            num_excs=self.num_excs + other.num_excs,
+            computed_values=self.computed_values + other.computed_values,
+        )
+
+
+@dataclass(frozen=True)
+class UpdateStatus:
+    """
+    Information about changes to table data or table schema
+    """
+
+    updated_cols: list[str] = field(default_factory=list)
+    cols_with_excs: list[str] = field(default_factory=list)
+
+    # stats for the rows affected by the operation
+    row_count_stats: RowCountStats = field(default_factory=RowCountStats)
+
+    # stats for changes cascaded to other tables
+    cascade_row_count_stats: RowCountStats = field(default_factory=RowCountStats)
+
+    # stats for the rows affected by the operation in an external store
+    ext_row_count_stats: RowCountStats = field(default_factory=RowCountStats)
+
+    @property
+    def num_rows(self) -> int:
+        return self.row_count_stats.num_rows + self.cascade_row_count_stats.num_rows
+
+    @property
+    def num_excs(self) -> int:
+        return self.row_count_stats.num_excs + self.cascade_row_count_stats.num_excs
+
+    @property
+    def num_computed_values(self) -> int:
+        return self.row_count_stats.computed_values + self.cascade_row_count_stats.computed_values
+
+    def insert_to_update(self) -> 'UpdateStatus':
+        """
+        Convert the update status from an insert operation to an update operation.
+        This is used when an insert operation is treated as an update.
+        """
+        return UpdateStatus(
+            updated_cols=self.updated_cols,
+            cols_with_excs=self.cols_with_excs,
+            row_count_stats=self.row_count_stats.insert_to_update(),
+            cascade_row_count_stats=self.cascade_row_count_stats.insert_to_update(),
+            ext_row_count_stats=self.ext_row_count_stats,
+        )
+
+    def to_cascade(self) -> 'UpdateStatus':
+        """
+        Convert the update status to a cascade update status.
+        This is used when an operation cascades changes to other tables.
+        """
+        return UpdateStatus(
+            updated_cols=self.updated_cols,
+            cols_with_excs=self.cols_with_excs,
+            row_count_stats=RowCountStats(),
+            cascade_row_count_stats=self.cascade_row_count_stats + self.row_count_stats,
+            ext_row_count_stats=self.ext_row_count_stats,
+        )
+
+    def __add__(self, other: 'UpdateStatus') -> UpdateStatus:
+        """
+        Add the update status from two UpdateStatus objects together.
+        """
+        return UpdateStatus(
+            updated_cols=list(dict.fromkeys(self.updated_cols + other.updated_cols)),
+            cols_with_excs=list(dict.fromkeys(self.cols_with_excs + other.cols_with_excs)),
+            row_count_stats=self.row_count_stats + other.row_count_stats,
+            cascade_row_count_stats=self.cascade_row_count_stats + other.cascade_row_count_stats,
+            ext_row_count_stats=self.ext_row_count_stats + other.ext_row_count_stats,
+        )
+
+    @property
+    def insert_msg(self) -> str:
+        """Return a message describing the results of an insert operation."""
+        if self.num_excs == 0:
+            cols_with_excs_str = ''
+        else:
+            cols_with_excs_str = (
+                f' across {len(self.cols_with_excs)} column{"" if len(self.cols_with_excs) == 1 else "s"}'
+            )
+            cols_with_excs_str += f' ({", ".join(self.cols_with_excs)})'
+        msg = (
+            f'Inserted {self.num_rows} row{"" if self.num_rows == 1 else "s"} '
+            f'with {self.num_excs} error{"" if self.num_excs == 1 else "s"}{cols_with_excs_str}.'
+        )
+        return msg
+
+    @classmethod
+    def __cnt_str(cls, cnt: int, item: str) -> str:
+        assert cnt > 0
+        return f'{cnt} {item}{"" if cnt == 1 else "s"}'
+
+    def _repr_pretty_(self, p: 'RepresentationPrinter', cycle: bool) -> None:
+        messages = []
+        # Combine row count stats and cascade row count stats
+        stats = self.row_count_stats + self.cascade_row_count_stats
+        if stats.ins_rows > 0:
+            messages.append(f'{self.__cnt_str(stats.ins_rows, "row")} inserted')
+        if stats.del_rows > 0:
+            messages.append(f'{self.__cnt_str(stats.del_rows, "row")} deleted')
+        if stats.upd_rows > 0:
+            messages.append(f'{self.__cnt_str(stats.upd_rows, "row")} updated')
+        if stats.computed_values > 0:
+            messages.append(f'{self.__cnt_str(stats.computed_values, "value")} computed')
+        if stats.num_excs > 0:
+            messages.append(self.__cnt_str(stats.num_excs, 'exception'))
+        p.text(', '.join(messages) + '.' if len(messages) > 0 else 'No rows affected.')
+
+    @property
+    def pxt_rows_updated(self) -> int:
+        """
+        Returns the number of Pixeltable rows that were updated as a result of the operation.
+        """
+        return (self.row_count_stats + self.cascade_row_count_stats).upd_rows
+
+    @property
+    def external_rows_updated(self) -> int:
+        return self.ext_row_count_stats.upd_rows
+
+    @property
+    def external_rows_created(self) -> int:
+        return self.ext_row_count_stats.ins_rows
+
+    @property
+    def external_rows_deleted(self) -> int:
+        return self.ext_row_count_stats.del_rows
+
+    @property
+    def ext_num_rows(self) -> int:
+        return self.ext_row_count_stats.num_rows
pixeltable/catalog/view.py
CHANGED
@@ -17,11 +17,12 @@ if TYPE_CHECKING:
 
 
 from .column import Column
-from .globals import _POS_COLUMN_NAME, MediaValidation
+from .globals import _POS_COLUMN_NAME, MediaValidation
 from .table import Table
 from .table_version import TableVersion
 from .table_version_handle import TableVersionHandle
 from .table_version_path import TableVersionPath
+from .update_status import UpdateStatus
 
 if TYPE_CHECKING:
     from pixeltable.globals import TableDataSource
@@ -229,7 +230,10 @@ class View(Table):
 
         try:
             plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
-
+            _, row_counts = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
+            status = UpdateStatus(row_count_stats=row_counts)
+            tbl_version._write_md_update_status(0, update_status=status)
+
         except:
             # we need to remove the orphaned TableVersion instance
             del catalog.Catalog.get()._tbl_versions[tbl_version.id, tbl_version.effective_version]
@@ -238,7 +242,9 @@ class View(Table):
             # also remove tbl_version from the base
             base_tbl_version.mutable_views.remove(TableVersionHandle.create(tbl_version))
             raise
-        Env.get().console_logger.info(
+        Env.get().console_logger.info(
+            f'Created view `{name}` with {status.num_rows} rows, {status.num_excs} exceptions.'
+        )
 
         session.commit()
         return view
@@ -273,6 +279,9 @@ class View(Table):
         md = super()._get_metadata()
         md['is_view'] = True
         md['is_snapshot'] = self._tbl_version_path.is_snapshot()
+        base_tbl = self._get_base_table()
+        base_version = self._effective_base_versions[0]
+        md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
         return md
 
     def insert(
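The practical effect of the `view.py` change is that view creation now loads the view through `insert_rows()`, wraps the resulting row counts in an `UpdateStatus`, persists it via `_write_md_update_status()`, and logs the totals. A hypothetical session (table, column, and view names are illustrative):

```python
import pixeltable as pxt

t = pxt.create_table('films', {'title': pxt.String, 'rating': pxt.Float})
t.insert([{'title': 'Alien', 'rating': 8.5}, {'title': 'Gigli', 'rating': 2.5}])

# View creation runs Planner.create_view_load_plan() and records the row
# counts returned by StoreTable.insert_rows() in the view's metadata.
v = pxt.create_view('good_films', t.where(t.rating >= 7.0))
# console output, verbatim from the logging call above:
#   Created view `good_films` with 1 rows, 0 exceptions.
```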
pixeltable/config.py
CHANGED
@@ -25,19 +25,26 @@ class Config:
 
     __home: Path
     __config_file: Path
+    __config_overrides: dict[str, Any]
    __config_dict: dict[str, Any]
 
-    def __init__(self) -> None:
+    def __init__(self, config_overrides: dict[str, Any]) -> None:
         assert self.__instance is None, 'Config is a singleton; use Config.get() to access the instance'
 
-
+        for var in config_overrides:
+            if var not in KNOWN_CONFIG_OVERRIDES:
+                raise excs.Error(f'Unrecognized configuration variable: {var}')
+
+        self.__config_overrides = config_overrides
+
+        self.__home = Path(self.lookup_env('pixeltable', 'home', str(Path.home() / '.pixeltable')))
         if self.__home.exists() and not self.__home.is_dir():
-            raise
+            raise excs.Error(f'Not a directory: {self.__home}')
         if not self.__home.exists():
             print(f'Creating a Pixeltable instance at: {self.__home}')
             self.__home.mkdir()
 
-        self.__config_file = Path(
+        self.__config_file = Path(self.lookup_env('pixeltable', 'config', str(self.__home / 'config.toml')))
 
         self.__config_dict: dict[str, Any]
         if os.path.isfile(self.__config_file):
@@ -46,6 +53,12 @@ class Config:
                 self.__config_dict = toml.load(stream)
             except Exception as exc:
                 raise excs.Error(f'Could not read config file: {self.__config_file}') from exc
+            for section, section_dict in self.__config_dict.items():
+                if section not in KNOWN_CONFIG_OPTIONS:
+                    raise excs.Error(f'Unrecognized section {section!r} in config file: {self.__config_file}')
+                for key in section_dict:
+                    if key not in KNOWN_CONFIG_OPTIONS[section]:
+                        raise excs.Error(f"Unrecognized option '{section}.{key}' in config file: {self.__config_file}")
         else:
             self.__config_dict = self.__create_default_config(self.__config_file)
             with open(self.__config_file, 'w', encoding='utf-8') as stream:
@@ -65,10 +78,18 @@ class Config:
 
     @classmethod
     def get(cls) -> Config:
-
-        cls.__instance = cls()
+        cls.init({})
         return cls.__instance
 
+    @classmethod
+    def init(cls, config_overrides: dict[str, Any]) -> None:
+        if cls.__instance is None:
+            cls.__instance = cls(config_overrides)
+        elif len(config_overrides) > 0:
+            raise excs.Error(
+                'Pixeltable has already been initialized; cannot specify new config values in the same session'
+            )
+
     @classmethod
     def __create_default_config(cls, config_path: Path) -> dict[str, Any]:
         free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
@@ -76,14 +97,23 @@ class Config:
         file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
         return {'pixeltable': {'file_cache_size_g': round(file_cache_size_g, 1), 'hide_warnings': False}}
 
-    def
+    def lookup_env(self, section: str, key: str, default: Any = None) -> Any:
+        override_var = f'{section}.{key}'
         env_var = f'{section.upper()}_{key.upper()}'
+        if override_var in self.__config_overrides:
+            return self.__config_overrides[override_var]
         if env_var in os.environ:
-
-
+            return os.environ[env_var]
+        return default
+
+    def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
+        value = self.lookup_env(section, key)  # Try to get from environment first
+        # Next try the config file
+        if value is None and section in self.__config_dict and key in self.__config_dict[section]:
             value = self.__config_dict[section][key]
-
-
+
+        if value is None:
+            return None  # Not specified
 
         try:
             if expected_type is bool and isinstance(value, str):
@@ -91,7 +121,7 @@ class Config:
                     raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}')
                 return value.lower() == 'true'  # type: ignore[return-value]
             return expected_type(value)  # type: ignore[call-arg]
-        except ValueError as exc:
+        except (ValueError, TypeError) as exc:
            raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}') from exc
 
     def get_string_value(self, key: str, section: str = 'pixeltable') -> Optional[str]:
@@ -105,3 +135,37 @@ class Config:
 
     def get_bool_value(self, key: str, section: str = 'pixeltable') -> Optional[bool]:
         return self.get_value(key, bool, section)
+
+
+KNOWN_CONFIG_OPTIONS = {
+    'pixeltable': {
+        'home': 'Path to the Pixeltable home directory',
+        'config': 'Path to the Pixeltable config file',
+        'pgdata': 'Path to the Pixeltable postgres data directory',
+        'db': 'Postgres database name',
+        'file_cache_size_g': 'Size of the file cache in GB',
+        'time_zone': 'Default time zone for timestamps',
+        'hide_warnings': 'Hide warnings from the console',
+        'verbosity': 'Verbosity level for console output',
+        'api_key': 'API key for Pixeltable cloud',
+    },
+    'anthropic': {'api_key': 'Anthropic API key'},
+    'bedrock': {'api_key': 'AWS Bedrock API key'},
+    'deepseek': {'api_key': 'Deepseek API key'},
+    'fireworks': {'api_key': 'Fireworks API key'},
+    'gemini': {'api_key': 'Gemini API key'},
+    'groq': {'api_key': 'Groq API key'},
+    'label_studio': {'api_key': 'Label Studio API key', 'url': 'Label Studio server URL'},
+    'mistral': {'api_key': 'Mistral API key'},
+    'openai': {'api_key': 'OpenAI API key'},
+    'replicate': {'api_token': 'Replicate API token'},
+    'together': {'api_key': 'Together API key'},
+    'pypi': {'api_key': 'PyPI API key (for internal use only)'},
+}
+
+
+KNOWN_CONFIG_OVERRIDES = {
+    f'{section}.{key}': info
+    for section, section_dict in KNOWN_CONFIG_OPTIONS.items()
+    for key, info in section_dict.items()
+}
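Taken together, the Config changes pin down one resolution order for every `section.key`: an override passed to `Config.init()` wins, then the `SECTION_KEY` environment variable, then the `[section]` table of `config.toml`, then the built-in default. A standalone sketch of that precedence (not the pixeltable implementation itself):

```python
import os
from typing import Any, Optional

def lookup(section: str, key: str, overrides: dict[str, Any],
           config_file: dict[str, dict[str, Any]], default: Any = None) -> Optional[Any]:
    """Resolution order mirroring Config: override > env var > config file > default."""
    if f'{section}.{key}' in overrides:           # e.g. {'pixeltable.verbosity': 2}
        return overrides[f'{section}.{key}']
    env_var = f'{section.upper()}_{key.upper()}'  # e.g. OPENAI_API_KEY
    if env_var in os.environ:
        return os.environ[env_var]
    if section in config_file and key in config_file[section]:
        return config_file[section][key]
    return default

os.environ['OPENAI_API_KEY'] = 'sk-from-env'
assert lookup('openai', 'api_key', {}, {'openai': {'api_key': 'sk-from-file'}}) == 'sk-from-env'
assert lookup('openai', 'api_key', {'openai.api_key': 'sk-override'}, {}) == 'sk-override'
```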
pixeltable/dataframe.py
CHANGED
@@ -15,7 +15,7 @@ import sqlalchemy as sql
 
 from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
 from pixeltable.catalog import Catalog, is_valid_identifier
-from pixeltable.catalog.
+from pixeltable.catalog.update_status import UpdateStatus
 from pixeltable.env import Env
 from pixeltable.plan import Planner, SampleClause
 from pixeltable.type_system import ColumnType
pixeltable/env.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 import datetime
 import glob
 import http.server
@@ -22,6 +23,7 @@ from sys import stdout
 from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
 from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
 
+import nest_asyncio  # type: ignore[import-untyped]
 import pixeltable_pgserver
 import sqlalchemy as sql
 from pillow_heif import register_heif_opener  # type: ignore[import-untyped]
@@ -85,6 +87,7 @@ class Env:
     _current_conn: Optional[sql.Connection]
     _current_session: Optional[sql.orm.Session]
     _dbms: Optional[Dbms]
+    _event_loop: Optional[asyncio.AbstractEventLoop]  # event loop for ExecNode
 
     @classmethod
     def get(cls) -> Env:
@@ -140,6 +143,32 @@ class Env:
         self._current_conn = None
         self._current_session = None
         self._dbms = None
+        self._event_loop = None
+
+    def _init_event_loop(self) -> None:
+        try:
+            # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
+            # multiple run_until_complete()
+            running_loop = asyncio.get_running_loop()
+            self._event_loop = running_loop
+            _logger.debug('Patched running loop')
+        except RuntimeError:
+            self._event_loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self._event_loop)
+        # we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
+        self._event_loop.slow_callback_duration = 3600
+
+        # always allow nested event loops, we need that to run async udfs synchronously (eg, for SimilarityExpr);
+        # see run_coroutine_synchronously()
+        nest_asyncio.apply()
+        if _logger.isEnabledFor(logging.DEBUG):
+            self._event_loop.set_debug(True)
+
+    @property
+    def event_loop(self) -> asyncio.AbstractEventLoop:
+        if self._event_loop is None:
+            self._init_event_loop()
+        return self._event_loop
 
     @property
     def db_url(self) -> str:
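The loop management that used to live in `ExecNode.__iter__` is now owned by `Env` and created lazily. The pattern itself is a common one for libraries that drive async code from synchronous entry points; a self-contained sketch (`acquire_loop` is an illustrative name, not pixeltable API):

```python
import asyncio

import nest_asyncio  # allows re-entrant run_until_complete(), e.g. under Jupyter

def acquire_loop() -> asyncio.AbstractEventLoop:
    """Adopt the already-running loop if there is one (Jupyter), else create one."""
    try:
        loop = asyncio.get_running_loop()  # raises RuntimeError outside a loop
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    loop.slow_callback_duration = 3600     # suppress slow-callback warnings in debug mode
    nest_asyncio.apply()                   # permit nested loops for sync-over-async calls
    return loop

async def work() -> int:
    return 42

assert acquire_loop().run_until_complete(work()) == 42
```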
pixeltable/exec/exec_node.py
CHANGED
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
 import abc
-import asyncio
 import logging
 from typing import AsyncIterator, Iterable, Iterator, Optional, TypeVar
 
 from pixeltable import exprs
+from pixeltable.env import Env
 
 from .data_row_batch import DataRowBatch
 from .exec_context import ExecContext
@@ -59,26 +59,7 @@ class ExecNode(abc.ABC):
         pass
 
     def __iter__(self) -> Iterator[DataRowBatch]:
-
-        loop: asyncio.AbstractEventLoop
-        try:
-            # check if we are already in an event loop (eg, Jupyter's); if so, patch it to allow
-            # multiple run_until_complete()
-            running_loop = asyncio.get_running_loop()
-            import nest_asyncio  # type: ignore[import-untyped]
-
-            nest_asyncio.apply()
-            loop = running_loop
-            _logger.debug('Patched running loop')
-        except RuntimeError:
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)
-            # we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
-            loop.slow_callback_duration = 3600
-
-            if _logger.isEnabledFor(logging.DEBUG):
-                loop.set_debug(True)
-
+        loop = Env.get().event_loop
         aiter = self.__aiter__()
         try:
             while True:
@@ -86,9 +67,11 @@ class ExecNode(abc.ABC):
                 yield batch
         except StopAsyncIteration:
             pass
-
+        # TODO:
+        # - we seem to have some tasks that aren't accounted for by ExprEvalNode and don't get cancelled by the time
+        #   we end up here
+        # - however, blindly cancelling all pending tasks doesn't work when running in a jupyter environment, which
+        #   creates tasks on its own
 
     def open(self) -> None:
         """Bottom-up initialization of nodes for execution. Must be called before __next__."""
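With the loop setup gone, `ExecNode.__iter__` is reduced to a sync-over-async bridge: it borrows the shared loop and pumps its own async iterator one batch at a time via `run_until_complete()`. The same shape in miniature, with `DataRowBatch` stood in by ints and a private loop in place of `Env.get().event_loop`:

```python
import asyncio
from typing import AsyncIterator, Iterator

class Node:
    async def __aiter__(self) -> AsyncIterator[int]:  # stands in for ExecNode.__aiter__
        for batch in (1, 2, 3):
            await asyncio.sleep(0)                    # simulate async work per batch
            yield batch

    def __iter__(self) -> Iterator[int]:
        loop = asyncio.new_event_loop()               # pixeltable uses Env.get().event_loop
        aiter = self.__aiter__()
        try:
            while True:
                # drive the async iterator synchronously, one item per run_until_complete()
                yield loop.run_until_complete(aiter.__anext__())
        except StopAsyncIteration:
            pass
        finally:
            loop.close()

assert list(Node()) == [1, 2, 3]
```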
pixeltable/exec/expr_eval/schedulers.py
CHANGED
@@ -4,9 +4,10 @@ import asyncio
 import datetime
 import inspect
 import logging
+import re
 import sys
 import time
-from typing import Awaitable, Collection, Optional
+from typing import Any, Awaitable, Collection, Optional
 
 from pixeltable import env, func
 from pixeltable.config import Config
@@ -250,8 +251,20 @@ class RequestRateScheduler(Scheduler):
     total_retried: int
 
     TIME_FORMAT = '%H:%M.%S %f'
-    MAX_RETRIES =
+    MAX_RETRIES = 3
     DEFAULT_RATE_LIMIT = 600  # requests per minute
+    RATE_LIMIT_INDICATORS = ('rate limit', 'too many requests', '429', 'quota exceeded', 'throttled', 'rate exceeded')
+    RETRY_AFTER_PATTERNS = (
+        r'retry after (\d+(?:\.\d+)?)\s*seconds?',
+        r'try again in (\d+(?:\.\d+)?)\s*seconds?',
+        r'wait (\d+(?:\.\d+)?)\s*seconds?',
+        r'retry-after:\s*(\d+(?:\.\d+)?)',
+    )
+
+    # Exponential backoff defaults
+    BASE_RETRY_DELAY = 1.0  # in seconds
+    MAX_RETRY_DELAY = 60.0  # in seconds
+    RETRY_BACKOFF_MULTIPLIER = 2.0
 
     def __init__(self, resource_pool: str, dispatcher: Dispatcher):
         super().__init__(resource_pool, dispatcher)
@@ -337,11 +350,12 @@ class RequestRateScheduler(Scheduler):
                 self.dispatcher.dispatch(request.rows, exec_ctx)
 
             except Exception as exc:
-
-
-
-
-
+                _logger.debug(f'exception for {self.resource_pool}: type={type(exc)}\n{exc}')
+                is_rate_limit_error, retry_after = self._is_rate_limit_error(exc)
+                if is_rate_limit_error and num_retries < self.MAX_RETRIES:
+                    retry_delay = self._compute_retry_delay(num_retries, retry_after)
+                    _logger.debug(f'scheduler {self.resource_pool}: retrying after {retry_delay}')
+                    await asyncio.sleep(retry_delay)
                     self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx))
                     return
 
@@ -358,6 +372,119 @@ class RequestRateScheduler(Scheduler):
         if is_task:
             self.num_in_flight -= 1
 
+    def _is_rate_limit_error(self, exc: Exception) -> tuple[bool, Optional[float]]:
+        """Returns True if the exception indicates a rate limit error, and the retry delay in seconds."""
+        from http import HTTPStatus
+
+        # Check for HTTP status TOO_MANY_REQUESTS in various exception classes.
+        # We look for attributes that contain status codes, instead of checking the type of the exception,
+        # in order to handle a wider variety of exception classes.
+        is_rate_limit_error = False
+        retry_delay: Optional[float] = None
+
+        # requests.HTTPError/httpx.HTTPStatusError
+        if (
+            hasattr(exc, 'response')
+            and hasattr(exc.response, 'status_code')
+            and exc.response.status_code == HTTPStatus.TOO_MANY_REQUESTS.value
+        ):
+            is_rate_limit_error = True
+            retry_delay = self._extract_retry_delay_from_headers(exc.response.headers)
+        elif (
+            # urllib.error.HTTPError
+            (hasattr(exc, 'code') and exc.code == HTTPStatus.TOO_MANY_REQUESTS.value)
+            # aiohttp.ClientResponseError
+            or (hasattr(exc, 'status') and exc.status == HTTPStatus.TOO_MANY_REQUESTS.value)
+        ) and hasattr(exc, 'headers'):
+            is_rate_limit_error = True
+            retry_delay = self._extract_retry_delay_from_headers(exc.headers)
+
+        if is_rate_limit_error:
+            return True, retry_delay
+
+        # Check common rate limit keywords in exception message
+        error_msg = str(exc).lower()
+        if any(indicator in error_msg for indicator in self.RATE_LIMIT_INDICATORS):
+            retry_delay = self._extract_retry_delay_from_message(error_msg)
+            return True, retry_delay
+
+        return False, None
+
+    def _extract_retry_delay_from_headers(self, headers: Optional[Any]) -> Optional[float]:
+        """Extract retry delay from HTTP headers."""
+        if headers is None:
+            return None
+
+        # convert headers to dict-like object for consistent access
+        header_dict: dict
+        if hasattr(headers, 'get'):
+            header_dict = headers
+        else:
+            # headers are a list of tuples or other format
+            try:
+                header_dict = dict(headers)
+            except (TypeError, ValueError):
+                return None
+        # normalize dict keys: lowercase and remove dashes
+        header_dict = {k.lower().replace('-', ''): v for k, v in header_dict.items()}
+
+        # check Retry-After header
+        retry_after = header_dict.get('retryafter')
+        if retry_after is not None:
+            try:
+                return float(retry_after)
+            except (ValueError, TypeError):
+                pass
+
+        # check X-RateLimit-Reset (Unix timestamp)
+        reset_time = header_dict.get('xratelimitreset')
+        if reset_time is not None:
+            try:
+                reset_timestamp = float(reset_time)
+                delay = max(0, reset_timestamp - time.time())
+                return delay
+            except (ValueError, TypeError):
+                pass
+
+        # check X-RateLimit-Reset-After (seconds from now)
+        reset_after = header_dict.get('xratelimitresetafter')
+        if reset_after is not None:
+            try:
+                return float(reset_after)
+            except (ValueError, TypeError):
+                pass
+
+        return None
+
+    def _extract_retry_delay_from_message(self, msg: str) -> Optional[float]:
+        msg_lower = msg.lower()
+        for pattern in self.RETRY_AFTER_PATTERNS:
+            match = re.search(pattern, msg_lower)
+            if match is not None:
+                try:
+                    return float(match.group(1))
+                except (ValueError, TypeError):
+                    continue
+        return None
+
+    def _compute_retry_delay(self, num_retries: int, retry_after: Optional[float] = None) -> float:
+        """
+        Calculate exponential backoff delay for rate limit errors.
+
+        Args:
+            retry_count: Number of retries attempted (0-based)
+            retry_after: Suggested delay from Retry-After header
+
+        Returns:
+            Delay in seconds
+        """
+        if retry_after is not None and retry_after > 0:
+            # Use server-suggested delay, but cap it at max_delay
+            return max(min(retry_after, self.MAX_RETRY_DELAY), self.BASE_RETRY_DELAY)
+        else:
+            delay = self.BASE_RETRY_DELAY * (self.RETRY_BACKOFF_MULTIPLIER**num_retries)
+            return max(min(delay, self.MAX_RETRY_DELAY), self.BASE_RETRY_DELAY)
+
 
 # all concrete Scheduler subclasses that implement matches()
 SCHEDULERS = [RateLimitsScheduler, RequestRateScheduler]
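With the defaults above, the fallback schedule is delay(n) = clamp(BASE_RETRY_DELAY * RETRY_BACKOFF_MULTIPLIER**n, 1.0, 60.0) for the n-th retry, and a server-supplied Retry-After hint is clamped into the same range. A quick standalone check of that arithmetic (mirroring `_compute_retry_delay`, not importing it):

```python
from typing import Optional

BASE, MULT, CAP = 1.0, 2.0, 60.0  # BASE_RETRY_DELAY, RETRY_BACKOFF_MULTIPLIER, MAX_RETRY_DELAY

def retry_delay(num_retries: int, retry_after: Optional[float] = None) -> float:
    # a server-suggested delay wins; either way the result is clamped to [BASE, CAP]
    delay = retry_after if retry_after is not None and retry_after > 0 else BASE * MULT**num_retries
    return max(min(delay, CAP), BASE)

assert [retry_delay(n) for n in range(3)] == [1.0, 2.0, 4.0]  # the schedule MAX_RETRIES = 3 allows
assert retry_delay(10) == 60.0                    # 1024s of backoff, capped at 60s
assert retry_delay(0, retry_after=120.0) == 60.0  # Retry-After hints are capped too
assert retry_delay(0, retry_after=0.2) == 1.0     # sub-second hints raised to the base delay
```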