pixeltable 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pixeltable has been flagged for review.
- pixeltable/catalog/catalog.py +4 -6
- pixeltable/catalog/table.py +41 -14
- pixeltable/catalog/table_version.py +12 -8
- pixeltable/catalog/table_version_path.py +6 -5
- pixeltable/config.py +24 -9
- pixeltable/dataframe.py +3 -3
- pixeltable/env.py +70 -16
- pixeltable/exec/aggregation_node.py +1 -1
- pixeltable/exec/cache_prefetch_node.py +4 -3
- pixeltable/exec/exec_node.py +0 -8
- pixeltable/exec/expr_eval/globals.py +1 -0
- pixeltable/exec/expr_eval/schedulers.py +16 -4
- pixeltable/exec/in_memory_data_node.py +2 -3
- pixeltable/exprs/data_row.py +5 -5
- pixeltable/exprs/function_call.py +59 -21
- pixeltable/exprs/row_builder.py +11 -5
- pixeltable/func/expr_template_function.py +6 -3
- pixeltable/functions/anthropic.py +1 -2
- pixeltable/functions/deepseek.py +5 -1
- pixeltable/functions/gemini.py +11 -2
- pixeltable/functions/huggingface.py +6 -12
- pixeltable/functions/openai.py +2 -1
- pixeltable/functions/video.py +5 -5
- pixeltable/globals.py +13 -2
- pixeltable/io/fiftyone.py +3 -3
- pixeltable/io/label_studio.py +2 -1
- pixeltable/iterators/audio.py +3 -2
- pixeltable/iterators/document.py +0 -6
- pixeltable/plan.py +0 -16
- pixeltable/share/packager.py +6 -6
- pixeltable/share/publish.py +134 -7
- pixeltable/utils/media_store.py +131 -66
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.8.dist-info}/METADATA +186 -121
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.8.dist-info}/RECORD +37 -37
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.8.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.8.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.7.dist-info → pixeltable-0.4.8.dist-info}/licenses/LICENSE +0 -0
pixeltable/catalog/catalog.py
CHANGED

@@ -189,12 +189,10 @@ class Catalog:
     @classmethod
     def clear(cls) -> None:
         """Remove the instance. Used for testing."""
-
-
-
-
-        # )
-        tbl_version.is_validated = False
+        if cls._instance is not None:
+            # invalidate all existing instances to force reloading of metadata
+            for tbl_version in cls._instance._tbl_versions.values():
+                tbl_version.is_validated = False
         cls._instance = None

     def __init__(self) -> None:
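The rewritten `clear()` invalidates every cached `TableVersion` before dropping the singleton, so any handles that survive the reset are forced to reload their metadata. A minimal sketch of this invalidate-before-reset pattern, using hypothetical names in place of the catalog internals:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Entry:
        is_validated: bool = True

    class Registry:
        _instance: Optional['Registry'] = None

        def __init__(self) -> None:
            self.entries: dict[int, Entry] = {}

        @classmethod
        def clear(cls) -> None:
            if cls._instance is not None:
                # anyone still holding an Entry must re-validate it before use
                for entry in cls._instance.entries.values():
                    entry.is_validated = False
            cls._instance = None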
pixeltable/catalog/table.py
CHANGED

@@ -183,16 +183,14 @@ class Table(SchemaObject):

         return op()

-    def _get_views(self, *, recursive: bool = True,
+    def _get_views(self, *, recursive: bool = True, mutable_only: bool = False) -> list['Table']:
         cat = catalog.Catalog.get()
         view_ids = cat.get_view_ids(self._id)
         views = [cat.get_table_by_id(id) for id in view_ids]
-        if
-        views = [t for t in views if
+        if mutable_only:
+            views = [t for t in views if t._tbl_version_path.is_mutable()]
         if recursive:
-            views.extend(
-                t for view in views for t in view._get_views(recursive=True, include_snapshots=include_snapshots)
-            )
+            views.extend(t for view in views for t in view._get_views(recursive=True, mutable_only=mutable_only))
         return views

     def _df(self) -> 'pxt.dataframe.DataFrame':

@@ -836,21 +834,25 @@ class Table(SchemaObject):
         if_not_exists_ = IfNotExistsParam.validated(if_not_exists, 'if_not_exists')

         if isinstance(column, str):
-            col = self._tbl_version_path.get_column(column
+            col = self._tbl_version_path.get_column(column)
             if col is None:
                 if if_not_exists_ == IfNotExistsParam.ERROR:
                     raise excs.Error(f'Column {column!r} unknown')
                 assert if_not_exists_ == IfNotExistsParam.IGNORE
                 return
+            if col.tbl.id != self._tbl_version_path.tbl_id:
+                raise excs.Error(f'Cannot drop base table column {col.name!r}')
             col = self._tbl_version.get().cols_by_name[column]
         else:
-            exists = self._tbl_version_path.has_column(column.col
+            exists = self._tbl_version_path.has_column(column.col)
             if not exists:
                 if if_not_exists_ == IfNotExistsParam.ERROR:
                     raise excs.Error(f'Unknown column: {column.col.qualified_name}')
                 assert if_not_exists_ == IfNotExistsParam.IGNORE
                 return
             col = column.col
+            if col.tbl.id != self._tbl_version_path.tbl_id:
+                raise excs.Error(f'Cannot drop base table column {col.name!r}')

         dependent_user_cols = [c for c in cat.get_column_dependents(col.tbl.id, col.id) if c.name is not None]
         if len(dependent_user_cols) > 0:

@@ -859,13 +861,32 @@ class Table(SchemaObject):
                 f'{", ".join(c.name for c in dependent_user_cols)}'
             )

-
+        views = self._get_views(recursive=True, mutable_only=True)
+
+        # See if any view predicates depend on this column
+        dependent_views = []
+        for view in views:
+            if view._tbl_version is not None:
+                predicate = view._tbl_version.get().predicate
+                if predicate is not None:
+                    for predicate_col in exprs.Expr.get_refd_column_ids(predicate.as_dict()):
+                        if predicate_col.tbl_id == col.tbl.id and predicate_col.col_id == col.id:
+                            dependent_views.append((view, predicate))
+
+        if len(dependent_views) > 0:
+            dependent_views_str = '\n'.join(
+                f'view: {view._path()}, predicate: {predicate!s}' for view, predicate in dependent_views
+            )
+            raise excs.Error(
+                f'Cannot drop column `{col.name}` because the following views depend on it:\n{dependent_views_str}'
+            )
+
         # See if this column has a dependent store. We need to look through all stores in all
         # (transitive) views of this table.
         col_handle = col.handle
         dependent_stores = [
             (view, store)
-            for view in (self, *
+            for view in (self, *views)
             for store in view._tbl_version.get().external_stores.values()
             if col_handle in store.get_local_columns()
         ]

@@ -878,6 +899,12 @@ class Table(SchemaObject):
                 f'Cannot drop column `{col.name}` because the following external stores depend on it:\n'
                 f'{", ".join(dependent_store_names)}'
             )
+        all_columns = self.columns()
+        if len(all_columns) == 1 and col.name == all_columns[0]:
+            raise excs.Error(
+                f'Cannot drop column `{col.name}` because it is the last remaining column in this table.'
+                f' Tables must have at least one column.'
+            )

         self._tbl_version.get().drop_column(col)

@@ -1108,11 +1135,11 @@ class Table(SchemaObject):
         """Resolve a column parameter to a Column object"""
         col: Column = None
         if isinstance(column, str):
-            col = self._tbl_version_path.get_column(column
+            col = self._tbl_version_path.get_column(column)
             if col is None:
                 raise excs.Error(f'Column {column!r} unknown')
         elif isinstance(column, ColumnRef):
-            exists = self._tbl_version_path.has_column(column.col
+            exists = self._tbl_version_path.has_column(column.col)
             if not exists:
                 raise excs.Error(f'Unknown column: {column.col.qualified_name}')
             col = column.col

@@ -1483,14 +1510,14 @@ class Table(SchemaObject):
         col_name: str
         col: Column
         if isinstance(column, str):
-            col = self._tbl_version_path.get_column(column
+            col = self._tbl_version_path.get_column(column)
             if col is None:
                 raise excs.Error(f'Unknown column: {column!r}')
             col_name = column
         else:
             assert isinstance(column, ColumnRef)
             col = column.col
-            if not self._tbl_version_path.has_column(col
+            if not self._tbl_version_path.has_column(col):
                 raise excs.Error(f'Unknown column: {col.name!r}')
             col_name = col.name
         if not col.is_computed:
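Together, these hunks add three guards to column dropping: a base table's column can no longer be dropped through a view, a column referenced by a mutable view's predicate is protected, and a table's last remaining column cannot be removed. A hedged illustration of how this might surface through the public API (the directory, table, and column names are made up):

    import pixeltable as pxt

    pxt.create_dir('demo')
    t = pxt.create_table('demo.t', {'a': pxt.Int, 'b': pxt.Int})
    v = pxt.create_view('demo.v', t.where(t.a > 0))

    t.drop_column('a')  # now raises an error: the predicate of view demo.v depends on 'a'

    t2 = pxt.create_table('demo.single', {'only_col': pxt.Int})
    t2.drop_column('only_col')  # now raises an error: a table must keep at least one column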
pixeltable/catalog/table_version.py
CHANGED

@@ -327,7 +327,7 @@ class TableVersion:
         from .table_version_path import TableVersionPath

         # clear out any remaining media files from an aborted previous attempt
-        MediaStore.delete(self.id)
+        MediaStore.get().delete(self.id)
         view_path = TableVersionPath.from_dict(op.load_view_op.view_path)
         plan, _ = Planner.create_view_load_plan(view_path)
         _, row_counts = self.store_tbl.insert_rows(plan, v_min=self.version)

@@ -374,7 +374,7 @@ class TableVersion:
         # if self.base.get().is_mutable:
         #     self.base.get().mutable_views.remove(TableVersionHandle.create(self))

-        MediaStore.delete(self.id)
+        MediaStore.get().delete(self.id)
         FileCache.get().clear(tbl_id=self.id)
         self.store_tbl.drop()

@@ -827,14 +827,17 @@ class TableVersion:

     def rename_column(self, old_name: str, new_name: str) -> None:
         """Rename a column."""
-
-
+        if not self.is_mutable:
+            raise excs.Error(f'Cannot rename column for immutable table {self.name!r}')
+        col = self.path.get_column(old_name)
+        if col is None:
             raise excs.Error(f'Unknown column: {old_name}')
+        if col.tbl.id != self.id:
+            raise excs.Error(f'Cannot rename base table column {col.name!r}')
         if not is_valid_identifier(new_name):
             raise excs.Error(f"Invalid column name: '{new_name}'")
         if new_name in self.cols_by_name:
             raise excs.Error(f'Column {new_name} already exists')
-        col = self.cols_by_name[old_name]
         del self.cols_by_name[old_name]
         col.name = new_name
         self.cols_by_name[new_name] = col

@@ -1024,10 +1027,11 @@ class TableVersion:
             for el in val:
                 assert isinstance(el, int)
             continue
-        col = self.path.get_column(col_name
+        col = self.path.get_column(col_name)
         if col is None:
-            # TODO: return more informative error if this is trying to update a base column
             raise excs.Error(f'Column {col_name} unknown')
+        if col.tbl.id != self.id:
+            raise excs.Error(f'Column {col.name!r} is a base table column and cannot be updated')
         if col.is_computed:
             raise excs.Error(f'Column {col_name} is computed and cannot be updated')
         if col.is_pk and not allow_pk:

@@ -1235,7 +1239,7 @@ class TableVersion:
         )

         # delete newly-added data
-        MediaStore.delete(self.id, tbl_version=self.version)
+        MediaStore.get().delete(self.id, tbl_version=self.version)
         conn.execute(sql.delete(self.store_tbl.sa_tbl).where(self.store_tbl.sa_tbl.c.v_min == self.version))

         # revert new deletions
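Two themes run through these hunks: `MediaStore` call sites move from classmethods to an instance reached via `MediaStore.get()`, and `rename_column` and row updates now reject base-table columns explicitly. A sketch of the accessor pattern the new call sites imply; the internals are assumptions, not the actual media_store.py:

    from typing import Optional
    from uuid import UUID

    class MediaStore:
        _instance: Optional['MediaStore'] = None

        @classmethod
        def get(cls) -> 'MediaStore':
            # lazily construct the singleton on first use
            if cls._instance is None:
                cls._instance = cls()
            return cls._instance

        def delete(self, tbl_id: UUID, tbl_version: Optional[int] = None) -> None:
            """Remove media files for a table, optionally restricted to one version."""
            ...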
pixeltable/catalog/table_version_path.py
CHANGED

@@ -184,13 +184,13 @@ class TableVersionPath:
         cols = self.columns()
         return {col.id: col for col in cols}

-    def get_column(self, name: str
+    def get_column(self, name: str) -> Optional[Column]:
         """Return the column with the given name, or None if not found"""
         self.refresh_cached_md()
         col = self._cached_tbl_version.cols_by_name.get(name)
         if col is not None:
             return col
-        elif self.base is not None and
+        elif self.base is not None and self._cached_tbl_version.include_base_columns:
             return self.base.get_column(name)
         else:
             return None

@@ -206,10 +206,11 @@ class TableVersionPath:
         else:
             return None

-    def has_column(self, col: Column
+    def has_column(self, col: Column) -> bool:
         """Return True if this table has the given column."""
-        self.refresh_cached_md()
         assert col.tbl is not None
+        self.refresh_cached_md()
+
         if (
             col.tbl.id == self.tbl_version.id
             and col.tbl.effective_version == self.tbl_version.effective_version

@@ -217,7 +218,7 @@ class TableVersionPath:
         ):
             # the column is visible in this table version
             return True
-        elif self.base is not None
+        elif self.base is not None:
             return self.base.has_column(col)
         else:
             return False
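`get_column` now consults the base element of the path only when the cached table version exposes base columns. A self-contained sketch of that chained lookup, with simplified, hypothetical classes:

    from typing import Optional

    class PathElem:
        def __init__(self, cols: dict[str, object], base: Optional['PathElem'] = None,
                     include_base_columns: bool = True) -> None:
            self.cols = cols
            self.base = base
            self.include_base_columns = include_base_columns

        def get_column(self, name: str) -> Optional[object]:
            if name in self.cols:
                return self.cols[name]
            if self.base is not None and self.include_base_columns:
                return self.base.get_column(name)  # walk up the base chain
            return None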
pixeltable/config.py
CHANGED

@@ -111,10 +111,19 @@ class Config:
             return default

     def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
-        value = self.lookup_env(section, key)  # Try to get from environment first
+        value: Any = self.lookup_env(section, key)  # Try to get from environment first
         # Next try the config file
-        if value is None
-
+        if value is None:
+            # Resolve nested section dicts
+            lookup_elems = [*section.split('.'), key]
+            value = self.__config_dict
+            for el in lookup_elems:
+                if isinstance(value, dict):
+                    if el not in value:
+                        return None
+                    value = value[el]
+                else:
+                    return None

         if value is None:
             return None  # Not specified

@@ -155,19 +164,25 @@ KNOWN_CONFIG_OPTIONS = {
     },
     'anthropic': {'api_key': 'Anthropic API key'},
     'bedrock': {'api_key': 'AWS Bedrock API key'},
-    'deepseek': {'api_key': 'Deepseek API key'},
-    'fireworks': {'api_key': 'Fireworks API key'},
-    'gemini': {'api_key': 'Gemini API key'},
-    '
+    'deepseek': {'api_key': 'Deepseek API key', 'rate_limit': 'Rate limit for Deepseek API requests'},
+    'fireworks': {'api_key': 'Fireworks API key', 'rate_limit': 'Rate limit for Fireworks API requests'},
+    'gemini': {'api_key': 'Gemini API key', 'rate_limits': 'Per-model rate limits for Gemini API requests'},
+    'imagen': {'rate_limits': 'Per-model rate limits for Imagen API requests'},
+    'veo': {'rate_limits': 'Per-model rate limits for Veo API requests'},
+    'groq': {'api_key': 'Groq API key', 'rate_limit': 'Rate limit for Groq API requests'},
     'label_studio': {'api_key': 'Label Studio API key', 'url': 'Label Studio server URL'},
-    'mistral': {'api_key': 'Mistral API key'},
+    'mistral': {'api_key': 'Mistral API key', 'rate_limit': 'Rate limit for Mistral API requests'},
     'openai': {
         'api_key': 'OpenAI API key',
         'base_url': 'OpenAI API base URL',
         'api_version': 'API version if using Azure OpenAI',
+        'rate_limits': 'Per-model rate limits for OpenAI API requests',
     },
     'replicate': {'api_token': 'Replicate API token'},
-    'together': {
+    'together': {
+        'api_key': 'Together API key',
+        'rate_limits': 'Per-model category rate limits for Together API requests',
+    },
     'pypi': {'api_key': 'PyPI API key (for internal use only)'},
 }
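With the nested resolution above, a dotted section such as `openai.rate_limits` maps onto nested tables in the config file. A worked sketch of the walk `get_value` performs (the model name and limit are made up):

    config_dict = {'openai': {'rate_limits': {'gpt-4o-mini': 500}}}

    # Config.get().get_value('gpt-4o-mini', int, section='openai.rate_limits')
    # effectively performs:
    lookup_elems = [*'openai.rate_limits'.split('.'), 'gpt-4o-mini']
    value = config_dict
    for el in lookup_elems:
        if not (isinstance(value, dict) and el in value):
            value = None
            break
        value = value[el]
    assert value == 500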
pixeltable/dataframe.py
CHANGED

@@ -795,19 +795,19 @@ class DataFrame:
         assert len(col_refs) > 0 and len(joined_tbls) >= 2
         for col_ref in col_refs:
             # identify the referenced column by name in 'other'
-            rhs_col = other.get_column(col_ref.col.name
+            rhs_col = other.get_column(col_ref.col.name)
             if rhs_col is None:
                 raise excs.Error(f"'on': column {col_ref.col.name!r} not found in joined table")
             rhs_col_ref = exprs.ColumnRef(rhs_col)

             lhs_col_ref: Optional[exprs.ColumnRef] = None
-            if any(tbl.has_column(col_ref.col
+            if any(tbl.has_column(col_ref.col) for tbl in self._from_clause.tbls):
                 # col_ref comes from the existing from_clause, we use that directly
                 lhs_col_ref = col_ref
             else:
                 # col_ref comes from other, we need to look for a match in the existing from_clause by name
                 for tbl in self._from_clause.tbls:
-                    col = tbl.get_column(col_ref.col.name
+                    col = tbl.get_column(col_ref.col.name)
                     if col is None:
                         continue
                     if lhs_col_ref is not None:
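The restored lines implement single-column `on` resolution for joins: the referenced column is looked up by name in the joined table, and on the left side it is either reused directly or matched by name across the existing from-clause. A hedged usage sketch (the table and column names are hypothetical):

    import pixeltable as pxt

    orders = pxt.get_table('orders')
    users = pxt.get_table('users')

    # 'on=orders.user_id' resolves 'user_id' by name in users for the right-hand side
    joined = orders.join(users, on=orders.user_id)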
pixeltable/env.py
CHANGED

@@ -15,7 +15,6 @@ import sys
 import threading
 import types
 import typing
-import uuid
 import warnings
 from contextlib import contextmanager
 from dataclasses import dataclass, field

@@ -101,6 +100,8 @@ class Env:
     def _init_env(cls, reinit_db: bool = False) -> None:
         assert not cls.__initializing, 'Circular env initialization detected.'
         cls.__initializing = True
+        if cls._instance is not None:
+            cls._instance._clean_up()
         cls._instance = None
         env = Env()
         env._set_up(reinit_db=reinit_db)

@@ -246,7 +247,7 @@ class Env:
         if self._current_conn is None:
             assert self._current_session is None
             try:
-                self._current_isolation_level = 'SERIALIZABLE'
+                self._current_isolation_level = 'SERIALIZABLE'
                 with (
                     self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
                     sql.orm.Session(conn) as session,

@@ -485,7 +486,7 @@ class Env:
                 raise excs.Error(error)
             self._logger.info(f'Using database at: {self.db_url}')
         else:
-            self._db_name =
+            self._db_name = config.get_string_value('db') or 'pixeltable'
             self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(Config.get().home / 'pgdata')))
             # cleanup_mode=None will leave the postgres process running after Python exits
             # cleanup_mode='stop' will terminate the postgres process when Python exits

@@ -557,6 +558,14 @@ class Env:
         finally:
             engine.dispose()

+    def _pgserver_terminate_connections_stmt(self) -> str:
+        return f"""
+            SELECT pg_terminate_backend(pg_stat_activity.pid)
+            FROM pg_stat_activity
+            WHERE pg_stat_activity.datname = '{self._db_name}'
+            AND pid <> pg_backend_pid()
+        """
+
     def _drop_store_db(self) -> None:
         assert self._db_name is not None
         engine = sql.create_engine(self._dbms.default_system_db_url(), future=True, isolation_level='AUTOCOMMIT')

@@ -565,13 +574,7 @@ class Env:
         with engine.begin() as conn:
             # terminate active connections
             if self._db_server is not None:
-                stmt = f"""
-                    SELECT pg_terminate_backend(pg_stat_activity.pid)
-                    FROM pg_stat_activity
-                    WHERE pg_stat_activity.datname = '{self._db_name}'
-                    AND pid <> pg_backend_pid()
-                """
-                conn.execute(sql.text(stmt))
+                conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
             # drop db
             stmt = self._dbms.drop_db_stmt(preparer.quote(self._db_name))
             conn.execute(sql.text(stmt))

@@ -749,12 +752,6 @@ class Env:
         else:
             os.remove(path)

-    def num_tmp_files(self) -> int:
-        return len(glob.glob(f'{self._tmp_dir}/*'))
-
-    def create_tmp_path(self, extension: str = '') -> Path:
-        return self._tmp_dir / f'{uuid.uuid4()}{extension}'
-
     # def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
     def get_resource_pool_info(self, pool_id: str, make_pool_info: Optional[Callable[[], T]] = None) -> T:
         """Returns the info object for the given id, creating it if necessary."""

@@ -815,6 +812,63 @@ class Env:
         except Exception as exc:
             raise excs.Error(f'Failed to load spaCy model: {spacy_model}') from exc

+    def _clean_up(self) -> None:
+        """
+        Internal cleanup method that properly closes all resources and resets state.
+        This is called before destroying the singleton instance.
+        """
+        assert self._current_session is None
+        assert self._current_conn is None
+
+        # Stop HTTP server
+        if self._httpd is not None:
+            try:
+                self._httpd.shutdown()
+                self._httpd.server_close()
+            except Exception as e:
+                _logger.warning(f'Error stopping HTTP server: {e}')
+
+        # First terminate all connections to the database
+        if self._db_server is not None:
+            assert self._dbms is not None
+            assert self._db_name is not None
+            try:
+                temp_engine = sql.create_engine(self._dbms.default_system_db_url(), isolation_level='AUTOCOMMIT')
+                try:
+                    with temp_engine.begin() as conn:
+                        conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
+                    _logger.info(f"Terminated all connections to database '{self._db_name}'")
+                except Exception as e:
+                    _logger.warning(f'Error terminating database connections: {e}')
+                finally:
+                    temp_engine.dispose()
+            except Exception as e:
+                _logger.warning(f'Error stopping database server: {e}')
+
+        # Dispose of SQLAlchemy engine (after stopping db server)
+        if self._sa_engine is not None:
+            try:
+                self._sa_engine.dispose()
+            except Exception as e:
+                _logger.warning(f'Error disposing engine: {e}')
+
+        # Close event loop
+        if self._event_loop is not None:
+            try:
+                if self._event_loop.is_running():
+                    self._event_loop.stop()
+                self._event_loop.close()
+            except Exception as e:
+                _logger.warning(f'Error closing event loop: {e}')
+
+        # Remove logging handlers
+        for handler in self._logger.handlers[:]:
+            try:
+                handler.close()
+                self._logger.removeHandler(handler)
+            except Exception as e:
+                _logger.warning(f'Error removing handler: {e}')
+

 def register_client(name: str) -> Callable:
     """Decorator that registers a third-party API client for use by Pixeltable.
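`_clean_up` terminates stray backend connections before touching the database because Postgres refuses to drop or cleanly shut down a database that still has sessions attached. A minimal standalone sketch of that terminate-then-drop ordering against a scratch database (the connection URL and database name are assumptions):

    import sqlalchemy as sql

    engine = sql.create_engine('postgresql://localhost/postgres', isolation_level='AUTOCOMMIT')
    with engine.begin() as conn:
        # kick out every other session attached to 'scratch'
        conn.execute(sql.text(
            "SELECT pg_terminate_backend(pid) FROM pg_stat_activity "
            "WHERE datname = 'scratch' AND pid <> pg_backend_pid()"
        ))
        # only now can the database be dropped
        conn.execute(sql.text('DROP DATABASE IF EXISTS scratch'))
    engine.dispose()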
pixeltable/exec/aggregation_node.py
CHANGED

@@ -103,6 +103,6 @@ class AggregationNode(ExecNode):
         self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
         self.output_batch.add_row(prev_row)

-        self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
+        self.output_batch.flush_imgs(None, self.row_builder.stored_img_cols, self.flushed_img_slots)
         _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
         yield self.output_batch
pixeltable/exec/cache_prefetch_node.py
CHANGED

@@ -12,8 +12,9 @@ from pathlib import Path
 from typing import Any, AsyncIterator, Iterator, Optional
 from uuid import UUID

-from pixeltable import
+from pixeltable import exceptions as excs, exprs
 from pixeltable.utils.filecache import FileCache
+from pixeltable.utils.media_store import TempStore

 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode

@@ -219,7 +220,7 @@ class CachePrefetchNode(ExecNode):
             self.in_flight_requests[f] = url

     def __fetch_url(self, url: str) -> tuple[Optional[Path], Optional[Exception]]:
-        """Fetches a remote URL into
+        """Fetches a remote URL into the TempStore and returns its path"""
         _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
         parsed = urllib.parse.urlparse(url)
         # Use len(parsed.scheme) > 1 here to ensure we're not being passed

@@ -230,7 +231,7 @@ class CachePrefetchNode(ExecNode):
         if parsed.path:
             p = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
             extension = p.suffix
-        tmp_path =
+        tmp_path = TempStore.create_path(extension=extension)
         try:
             _logger.debug(f'Downloading {url} to {tmp_path}')
             if parsed.scheme == 's3':
pixeltable/exec/exec_node.py
CHANGED

@@ -20,7 +20,6 @@ class ExecNode(abc.ABC):
     row_builder: exprs.RowBuilder
     input: Optional[ExecNode]
     flushed_img_slots: list[int]  # idxs of image slots of our output_exprs dependencies
-    stored_img_cols: list[exprs.ColumnSlotIdx]
     ctx: Optional[ExecContext]

     def __init__(

@@ -40,7 +39,6 @@ class ExecNode(abc.ABC):
         self.flushed_img_slots = [
             e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
         ]
-        self.stored_img_cols = []
         self.ctx = None  # all nodes of a tree share the same context

     def set_ctx(self, ctx: ExecContext) -> None:

@@ -48,12 +46,6 @@ class ExecNode(abc.ABC):
         if self.input is not None:
             self.input.set_ctx(ctx)

-    def set_stored_img_cols(self, stored_img_cols: list[exprs.ColumnSlotIdx]) -> None:
-        self.stored_img_cols = stored_img_cols
-        # propagate batch size to the source
-        if self.input is not None:
-            self.input.set_stored_img_cols(stored_img_cols)
-
     @abc.abstractmethod
     def __aiter__(self) -> AsyncIterator[DataRowBatch]:
         pass
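Removing `stored_img_cols` and `set_stored_img_cols` from `ExecNode`, paired with the `AggregationNode` hunk above reading `self.row_builder.stored_img_cols`, moves that state to the `RowBuilder` that every node in a plan already shares. A schematic of the resulting ownership, with hypothetical minimal shapes:

    class RowBuilder:
        def __init__(self) -> None:
            self.stored_img_cols: list = []  # populated once, when the plan is built

    class ExecNode:
        def __init__(self, row_builder: RowBuilder) -> None:
            # shared reference: no per-node copy and no set_stored_img_cols() propagation chain
            self.row_builder = row_builder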
pixeltable/exec/expr_eval/schedulers.py
CHANGED

@@ -56,6 +56,7 @@ class Scheduler(abc.ABC):
         request: FnCallArgs
         num_retries: int
         exec_ctx: ExecCtx
+        retry_after: Optional[float] = None  # time.monotonic()

         def __lt__(self, other: Scheduler.QueueItem) -> bool:
             # prioritize by number of retries (more retries = higher priority)

@@ -270,6 +270,7 @@ class RequestRateScheduler(Scheduler):
     num_in_flight: int
     total_requests: int
     total_retried: int
+    total_errors: int

     TIME_FORMAT = '%H:%M.%S %f'
     MAX_RETRIES = 3

@@ -294,6 +295,7 @@ class RequestRateScheduler(Scheduler):
         self.num_in_flight = 0
         self.total_requests = 0
         self.total_retried = 0
+        self.total_errors = 0

         # try to get the rate limit from the config
         elems = resource_pool.split(':')

@@ -312,6 +314,7 @@ class RequestRateScheduler(Scheduler):
             key = model
         requests_per_min = Config.get().get_int_value(key, section=section)
         requests_per_min = requests_per_min or self.DEFAULT_RATE_LIMIT
+        _logger.debug(f'rate limit for {self.resource_pool}: {requests_per_min} RPM')
         self.secs_per_request = 1 / (requests_per_min / 60)

     @classmethod

@@ -325,8 +328,12 @@ class RequestRateScheduler(Scheduler):
         if item.num_retries > 0:
             self.total_retried += 1
         now = time.monotonic()
+        wait_duration = 0.0
+        if item.retry_after is not None:
+            wait_duration = item.retry_after - now
         if now - last_request_ts < self.secs_per_request:
-            wait_duration = self.secs_per_request - (now - last_request_ts)
+            wait_duration = max(wait_duration, self.secs_per_request - (now - last_request_ts))
+        if wait_duration > 0:
             _logger.debug(f'waiting for {wait_duration} for {self.resource_pool}')
             await asyncio.sleep(wait_duration)

@@ -372,15 +379,20 @@ class RequestRateScheduler(Scheduler):

         except Exception as exc:
             _logger.debug(f'exception for {self.resource_pool}: type={type(exc)}\n{exc}')
+            if hasattr(exc, 'response') and hasattr(exc.response, 'headers'):
+                _logger.debug(f'scheduler {self.resource_pool}: exception headers: {exc.response.headers}')
             is_rate_limit_error, retry_after = self._is_rate_limit_error(exc)
             if is_rate_limit_error and num_retries < self.MAX_RETRIES:
                 retry_delay = self._compute_retry_delay(num_retries, retry_after)
                 _logger.debug(f'scheduler {self.resource_pool}: retrying after {retry_delay}')
-
-
+                now = time.monotonic()
+                # put the request back in the queue right away, which prevents new requests from being generated until
+                # this one succeeds or exceeds its retry limit
+                self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx, retry_after=now + retry_delay))
                 return

             # record the exception
+            self.total_errors += 1
             _, _, exc_tb = sys.exc_info()
             for row in request.rows:
                 row.set_exc(request.fn_call.slot_idx, exc)

@@ -388,7 +400,7 @@ class RequestRateScheduler(Scheduler):
         finally:
             _logger.debug(
                 f'Scheduler stats: #in-flight={self.num_in_flight} #requests={self.total_requests}, '
-                f'#retried={self.total_retried}'
+                f'#retried={self.total_retried} #errors={self.total_errors}'
             )
             if is_task:
                 self.num_in_flight -= 1
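With `retry_after` attached to queue items, the dispatch wait becomes the later of two deadlines: the spacing derived from the requests-per-minute limit and the item's retry deadline. A worked sketch of that computation:

    import time

    secs_per_request = 60 / 600          # 600 requests/min -> 0.1s between requests
    now = time.monotonic()
    last_request_ts = now - 0.02         # last request fired 20ms ago
    retry_deadline = now + 1.5           # rate-limited item: retry 1.5s from now

    wait_duration = retry_deadline - now                       # 1.5
    if now - last_request_ts < secs_per_request:
        wait_duration = max(wait_duration, secs_per_request - (now - last_request_ts))
    # wait_duration == 1.5: the Retry-After deadline dominates the 0.08s spacing gap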
pixeltable/exec/in_memory_data_node.py
CHANGED

@@ -2,7 +2,7 @@ import logging
 from typing import Any, AsyncIterator, Optional

 from pixeltable import catalog, exprs
-from pixeltable.utils.media_store import
+from pixeltable.utils.media_store import TempStore

 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode

@@ -67,8 +67,7 @@ class InMemoryDataNode(ExecNode):
             col = col_info.col
             if col.col_type.is_image_type() and isinstance(val, bytes):
                 # this is a literal media file, ie, a sequence of bytes; save it as a binary file and store the path
-
-                filepath, _ = MediaStore.save_media_object(val, col, format=None)
+                filepath, _ = TempStore.save_media_object(val, col, format=None)
                 output_row[col_info.slot_idx] = str(filepath)
             else:
                 output_row[col_info.slot_idx] = val
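`save_media_object` is the second `TempStore` entry point this release relies on: it persists literal media bytes and returns the path that gets stored in the row. A hypothetical sketch consistent with the call site above, extending the `create_path` sketch shown after the cache_prefetch_node.py diff (the return shape and internals are assumptions):

    import uuid
    from pathlib import Path

    class TempStore:
        _tmp_dir = Path.home() / '.pixeltable' / 'tmp'  # assumption: the real dir comes from Env/Config

        @classmethod
        def create_path(cls, extension: str = '') -> Path:
            cls._tmp_dir.mkdir(parents=True, exist_ok=True)
            return cls._tmp_dir / f'{uuid.uuid4()}{extension}'

        @classmethod
        def save_media_object(cls, data: bytes, col, format: 'str | None') -> 'tuple[Path, None]':
            # 'col' would drive format selection/validation in the real implementation (assumption)
            ext = f'.{format}' if format is not None else ''
            path = cls.create_path(extension=ext)
            path.write_bytes(data)
            return path, None  # the call site unpacks: filepath, _ = ...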