pixeltable 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (37)
  1. pixeltable/catalog/catalog.py +4 -6
  2. pixeltable/catalog/table.py +41 -14
  3. pixeltable/catalog/table_version.py +12 -8
  4. pixeltable/catalog/table_version_path.py +6 -5
  5. pixeltable/config.py +24 -9
  6. pixeltable/dataframe.py +3 -3
  7. pixeltable/env.py +70 -16
  8. pixeltable/exec/aggregation_node.py +1 -1
  9. pixeltable/exec/cache_prefetch_node.py +4 -3
  10. pixeltable/exec/exec_node.py +0 -8
  11. pixeltable/exec/expr_eval/globals.py +1 -0
  12. pixeltable/exec/expr_eval/schedulers.py +16 -4
  13. pixeltable/exec/in_memory_data_node.py +2 -3
  14. pixeltable/exprs/data_row.py +5 -5
  15. pixeltable/exprs/function_call.py +59 -21
  16. pixeltable/exprs/row_builder.py +11 -5
  17. pixeltable/func/expr_template_function.py +6 -3
  18. pixeltable/functions/anthropic.py +1 -2
  19. pixeltable/functions/deepseek.py +5 -1
  20. pixeltable/functions/gemini.py +11 -2
  21. pixeltable/functions/huggingface.py +6 -12
  22. pixeltable/functions/openai.py +2 -1
  23. pixeltable/functions/video.py +5 -5
  24. pixeltable/globals.py +13 -2
  25. pixeltable/io/fiftyone.py +3 -3
  26. pixeltable/io/label_studio.py +2 -1
  27. pixeltable/iterators/audio.py +3 -2
  28. pixeltable/iterators/document.py +0 -6
  29. pixeltable/plan.py +0 -16
  30. pixeltable/share/packager.py +6 -6
  31. pixeltable/share/publish.py +134 -7
  32. pixeltable/utils/media_store.py +131 -66
  33. {pixeltable-0.4.7.dist-info → pixeltable-0.4.8.dist-info}/METADATA +186 -121
  34. {pixeltable-0.4.7.dist-info → pixeltable-0.4.8.dist-info}/RECORD +37 -37
  35. {pixeltable-0.4.7.dist-info → pixeltable-0.4.8.dist-info}/WHEEL +0 -0
  36. {pixeltable-0.4.7.dist-info → pixeltable-0.4.8.dist-info}/entry_points.txt +0 -0
  37. {pixeltable-0.4.7.dist-info → pixeltable-0.4.8.dist-info}/licenses/LICENSE +0 -0
pixeltable/catalog/catalog.py CHANGED
@@ -189,12 +189,10 @@ class Catalog:
     @classmethod
     def clear(cls) -> None:
         """Remove the instance. Used for testing."""
-        # invalidate all existing instances to force reloading of metadata
-        for tbl_version in cls._instance._tbl_versions.values():
-            # _logger.debug(
-            #     f'Invalidating table version {tbl_version.id}:{tbl_version.effective_version} ({id(tbl_version):x})'
-            # )
-            tbl_version.is_validated = False
+        if cls._instance is not None:
+            # invalidate all existing instances to force reloading of metadata
+            for tbl_version in cls._instance._tbl_versions.values():
+                tbl_version.is_validated = False
         cls._instance = None

     def __init__(self) -> None:
pixeltable/catalog/table.py CHANGED
@@ -183,16 +183,14 @@ class Table(SchemaObject):

         return op()

-    def _get_views(self, *, recursive: bool = True, include_snapshots: bool = True) -> list['Table']:
+    def _get_views(self, *, recursive: bool = True, mutable_only: bool = False) -> list['Table']:
         cat = catalog.Catalog.get()
         view_ids = cat.get_view_ids(self._id)
         views = [cat.get_table_by_id(id) for id in view_ids]
-        if not include_snapshots:
-            views = [t for t in views if not t._tbl_version_path.is_snapshot()]
+        if mutable_only:
+            views = [t for t in views if t._tbl_version_path.is_mutable()]
         if recursive:
-            views.extend(
-                t for view in views for t in view._get_views(recursive=True, include_snapshots=include_snapshots)
-            )
+            views.extend(t for view in views for t in view._get_views(recursive=True, mutable_only=mutable_only))
         return views

     def _df(self) -> 'pxt.dataframe.DataFrame':
@@ -836,21 +834,25 @@ class Table(SchemaObject):
         if_not_exists_ = IfNotExistsParam.validated(if_not_exists, 'if_not_exists')

         if isinstance(column, str):
-            col = self._tbl_version_path.get_column(column, include_bases=False)
+            col = self._tbl_version_path.get_column(column)
             if col is None:
                 if if_not_exists_ == IfNotExistsParam.ERROR:
                     raise excs.Error(f'Column {column!r} unknown')
                 assert if_not_exists_ == IfNotExistsParam.IGNORE
                 return
+            if col.tbl.id != self._tbl_version_path.tbl_id:
+                raise excs.Error(f'Cannot drop base table column {col.name!r}')
             col = self._tbl_version.get().cols_by_name[column]
         else:
-            exists = self._tbl_version_path.has_column(column.col, include_bases=False)
+            exists = self._tbl_version_path.has_column(column.col)
             if not exists:
                 if if_not_exists_ == IfNotExistsParam.ERROR:
                     raise excs.Error(f'Unknown column: {column.col.qualified_name}')
                 assert if_not_exists_ == IfNotExistsParam.IGNORE
                 return
             col = column.col
+            if col.tbl.id != self._tbl_version_path.tbl_id:
+                raise excs.Error(f'Cannot drop base table column {col.name!r}')

         dependent_user_cols = [c for c in cat.get_column_dependents(col.tbl.id, col.id) if c.name is not None]
         if len(dependent_user_cols) > 0:
@@ -859,13 +861,32 @@ class Table(SchemaObject):
                 f'{", ".join(c.name for c in dependent_user_cols)}'
             )

-        _ = self._get_views(recursive=True, include_snapshots=False)
+        views = self._get_views(recursive=True, mutable_only=True)
+
+        # See if any view predicates depend on this column
+        dependent_views = []
+        for view in views:
+            if view._tbl_version is not None:
+                predicate = view._tbl_version.get().predicate
+                if predicate is not None:
+                    for predicate_col in exprs.Expr.get_refd_column_ids(predicate.as_dict()):
+                        if predicate_col.tbl_id == col.tbl.id and predicate_col.col_id == col.id:
+                            dependent_views.append((view, predicate))
+
+        if len(dependent_views) > 0:
+            dependent_views_str = '\n'.join(
+                f'view: {view._path()}, predicate: {predicate!s}' for view, predicate in dependent_views
+            )
+            raise excs.Error(
+                f'Cannot drop column `{col.name}` because the following views depend on it:\n{dependent_views_str}'
+            )
+
         # See if this column has a dependent store. We need to look through all stores in all
         # (transitive) views of this table.
         col_handle = col.handle
         dependent_stores = [
             (view, store)
-            for view in (self, *self._get_views(recursive=True, include_snapshots=False))
+            for view in (self, *views)
             for store in view._tbl_version.get().external_stores.values()
             if col_handle in store.get_local_columns()
         ]
@@ -878,6 +899,12 @@ class Table(SchemaObject):
                 f'Cannot drop column `{col.name}` because the following external stores depend on it:\n'
                 f'{", ".join(dependent_store_names)}'
             )
+        all_columns = self.columns()
+        if len(all_columns) == 1 and col.name == all_columns[0]:
+            raise excs.Error(
+                f'Cannot drop column `{col.name}` because it is the last remaining column in this table.'
+                f' Tables must have at least one column.'
+            )

         self._tbl_version.get().drop_column(col)
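Note: taken together, the new drop_column guardrails reject three cases that previously failed with less specific errors: dropping a base table's column through a view, dropping a column that a view's where-clause predicate references, and dropping a table's last remaining column. A sketch of how these surface (hypothetical table and view names; the create/drop calls are standard Pixeltable API, but the scenarios are illustrative):

    import pixeltable as pxt

    t = pxt.create_table('films', {'title': pxt.String, 'budget': pxt.Float})
    v = pxt.create_view('expensive', t.where(t.budget > 100.0))

    v.drop_column('budget')  # raises: Cannot drop base table column 'budget'
    t.drop_column('budget')  # raises: Cannot drop column `budget` because the following views depend on it: ...
    # and on a single-column table, dropping that column raises:
    # Cannot drop column `...` because it is the last remaining column in this table.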
@@ -1108,11 +1135,11 @@ class Table(SchemaObject):
         """Resolve a column parameter to a Column object"""
         col: Column = None
         if isinstance(column, str):
-            col = self._tbl_version_path.get_column(column, include_bases=True)
+            col = self._tbl_version_path.get_column(column)
             if col is None:
                 raise excs.Error(f'Column {column!r} unknown')
         elif isinstance(column, ColumnRef):
-            exists = self._tbl_version_path.has_column(column.col, include_bases=True)
+            exists = self._tbl_version_path.has_column(column.col)
             if not exists:
                 raise excs.Error(f'Unknown column: {column.col.qualified_name}')
             col = column.col
@@ -1483,14 +1510,14 @@ class Table(SchemaObject):
         col_name: str
         col: Column
         if isinstance(column, str):
-            col = self._tbl_version_path.get_column(column, include_bases=True)
+            col = self._tbl_version_path.get_column(column)
             if col is None:
                 raise excs.Error(f'Unknown column: {column!r}')
             col_name = column
         else:
             assert isinstance(column, ColumnRef)
             col = column.col
-            if not self._tbl_version_path.has_column(col, include_bases=True):
+            if not self._tbl_version_path.has_column(col):
                 raise excs.Error(f'Unknown column: {col.name!r}')
             col_name = col.name
         if not col.is_computed:
pixeltable/catalog/table_version.py CHANGED
@@ -327,7 +327,7 @@ class TableVersion:
        from .table_version_path import TableVersionPath

        # clear out any remaining media files from an aborted previous attempt
-       MediaStore.delete(self.id)
+       MediaStore.get().delete(self.id)
        view_path = TableVersionPath.from_dict(op.load_view_op.view_path)
        plan, _ = Planner.create_view_load_plan(view_path)
        _, row_counts = self.store_tbl.insert_rows(plan, v_min=self.version)
@@ -374,7 +374,7 @@ class TableVersion:
        # if self.base.get().is_mutable:
        #     self.base.get().mutable_views.remove(TableVersionHandle.create(self))

-       MediaStore.delete(self.id)
+       MediaStore.get().delete(self.id)
        FileCache.get().clear(tbl_id=self.id)
        self.store_tbl.drop()
@@ -827,14 +827,17 @@ class TableVersion:

    def rename_column(self, old_name: str, new_name: str) -> None:
        """Rename a column."""
-       assert self.is_mutable
-       if old_name not in self.cols_by_name:
+       if not self.is_mutable:
+           raise excs.Error(f'Cannot rename column for immutable table {self.name!r}')
+       col = self.path.get_column(old_name)
+       if col is None:
            raise excs.Error(f'Unknown column: {old_name}')
+       if col.tbl.id != self.id:
+           raise excs.Error(f'Cannot rename base table column {col.name!r}')
        if not is_valid_identifier(new_name):
            raise excs.Error(f"Invalid column name: '{new_name}'")
        if new_name in self.cols_by_name:
            raise excs.Error(f'Column {new_name} already exists')
-       col = self.cols_by_name[old_name]
        del self.cols_by_name[old_name]
        col.name = new_name
        self.cols_by_name[new_name] = col
@@ -1024,10 +1027,11 @@ class TableVersion:
            for el in val:
                assert isinstance(el, int)
            continue
-       col = self.path.get_column(col_name, include_bases=False)
+       col = self.path.get_column(col_name)
        if col is None:
-           # TODO: return more informative error if this is trying to update a base column
            raise excs.Error(f'Column {col_name} unknown')
+       if col.tbl.id != self.id:
+           raise excs.Error(f'Column {col.name!r} is a base table column and cannot be updated')
        if col.is_computed:
            raise excs.Error(f'Column {col_name} is computed and cannot be updated')
        if col.is_pk and not allow_pk:
@@ -1235,7 +1239,7 @@ class TableVersion:
            )

        # delete newly-added data
-       MediaStore.delete(self.id, tbl_version=self.version)
+       MediaStore.get().delete(self.id, tbl_version=self.version)
        conn.execute(sql.delete(self.store_tbl.sa_tbl).where(self.store_tbl.sa_tbl.c.v_min == self.version))

        # revert new deletions
pixeltable/catalog/table_version_path.py CHANGED
@@ -184,13 +184,13 @@ class TableVersionPath:
        cols = self.columns()
        return {col.id: col for col in cols}

-   def get_column(self, name: str, include_bases: Optional[bool] = None) -> Optional[Column]:
+   def get_column(self, name: str) -> Optional[Column]:
        """Return the column with the given name, or None if not found"""
        self.refresh_cached_md()
        col = self._cached_tbl_version.cols_by_name.get(name)
        if col is not None:
            return col
-       elif self.base is not None and (include_bases or self._cached_tbl_version.include_base_columns):
+       elif self.base is not None and self._cached_tbl_version.include_base_columns:
            return self.base.get_column(name)
        else:
            return None
@@ -206,10 +206,11 @@ class TableVersionPath:
        else:
            return None

-   def has_column(self, col: Column, include_bases: bool = True) -> bool:
+   def has_column(self, col: Column) -> bool:
        """Return True if this table has the given column."""
-       self.refresh_cached_md()
        assert col.tbl is not None
+       self.refresh_cached_md()
+
        if (
            col.tbl.id == self.tbl_version.id
            and col.tbl.effective_version == self.tbl_version.effective_version
@@ -217,7 +218,7 @@ class TableVersionPath:
        ):
            # the column is visible in this table version
            return True
-       elif self.base is not None and include_bases:
+       elif self.base is not None:
            return self.base.has_column(col)
        else:
            return False
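Note: with the include_bases flag gone from both lookups, whether a lookup falls through to a base table is decided solely by the view's own include_base_columns metadata. A standalone miniature of the new resolution order (hypothetical helper types, not the pixeltable API):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Path:
        cols_by_name: dict[str, str]
        base: Optional['Path'] = None
        include_base_columns: bool = True

    def get_column(p: Path, name: str) -> Optional[str]:
        if name in p.cols_by_name:
            return p.cols_by_name[name]      # defined at this level
        if p.base is not None and p.include_base_columns:
            return get_column(p.base, name)  # fall through to the base; no caller-supplied flag
        return None

    base = Path({'title': 'base.title'})
    view = Path({'score': 'view.score'}, base=base)
    assert get_column(view, 'title') == 'base.title'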
pixeltable/config.py CHANGED
@@ -111,10 +111,19 @@ class Config:
            return default

    def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
-       value = self.lookup_env(section, key)  # Try to get from environment first
+       value: Any = self.lookup_env(section, key)  # Try to get from environment first
        # Next try the config file
-       if value is None and section in self.__config_dict and key in self.__config_dict[section]:
-           value = self.__config_dict[section][key]
+       if value is None:
+           # Resolve nested section dicts
+           lookup_elems = [*section.split('.'), key]
+           value = self.__config_dict
+           for el in lookup_elems:
+               if isinstance(value, dict):
+                   if el not in value:
+                       return None
+                   value = value[el]
+               else:
+                   return None

        if value is None:
            return None  # Not specified
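Note: get_value() now resolves dotted section names through nested config dicts, so a model entry under [openai.rate_limits] is reachable without flattening the TOML structure. A standalone re-implementation of the lookup, for illustration ('gpt-4o' is a hypothetical entry):

    from typing import Any, Optional

    def resolve(config_dict: dict[str, Any], section: str, key: str) -> Optional[Any]:
        # 'openai.rate_limits' + 'gpt-4o' walks config_dict['openai']['rate_limits']['gpt-4o']
        value: Any = config_dict
        for el in [*section.split('.'), key]:
            if not isinstance(value, dict) or el not in value:
                return None
            value = value[el]
        return value

    cfg = {'openai': {'rate_limits': {'gpt-4o': 500}}}
    assert resolve(cfg, 'openai.rate_limits', 'gpt-4o') == 500
    assert resolve(cfg, 'openai', 'api_key') is None  # absent keys resolve to None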
@@ -155,19 +164,25 @@ KNOWN_CONFIG_OPTIONS = {
    },
    'anthropic': {'api_key': 'Anthropic API key'},
    'bedrock': {'api_key': 'AWS Bedrock API key'},
-   'deepseek': {'api_key': 'Deepseek API key'},
-   'fireworks': {'api_key': 'Fireworks API key'},
-   'gemini': {'api_key': 'Gemini API key'},
-   'groq': {'api_key': 'Groq API key'},
+   'deepseek': {'api_key': 'Deepseek API key', 'rate_limit': 'Rate limit for Deepseek API requests'},
+   'fireworks': {'api_key': 'Fireworks API key', 'rate_limit': 'Rate limit for Fireworks API requests'},
+   'gemini': {'api_key': 'Gemini API key', 'rate_limits': 'Per-model rate limits for Gemini API requests'},
+   'imagen': {'rate_limits': 'Per-model rate limits for Imagen API requests'},
+   'veo': {'rate_limits': 'Per-model rate limits for Veo API requests'},
+   'groq': {'api_key': 'Groq API key', 'rate_limit': 'Rate limit for Groq API requests'},
    'label_studio': {'api_key': 'Label Studio API key', 'url': 'Label Studio server URL'},
-   'mistral': {'api_key': 'Mistral API key'},
+   'mistral': {'api_key': 'Mistral API key', 'rate_limit': 'Rate limit for Mistral API requests'},
    'openai': {
        'api_key': 'OpenAI API key',
        'base_url': 'OpenAI API base URL',
        'api_version': 'API version if using Azure OpenAI',
+       'rate_limits': 'Per-model rate limits for OpenAI API requests',
    },
    'replicate': {'api_token': 'Replicate API token'},
-   'together': {'api_key': 'Together API key'},
+   'together': {
+       'api_key': 'Together API key',
+       'rate_limits': 'Per-model category rate limits for Together API requests',
+   },
    'pypi': {'api_key': 'PyPI API key (for internal use only)'},
}
pixeltable/dataframe.py CHANGED
@@ -795,19 +795,19 @@ class DataFrame:
        assert len(col_refs) > 0 and len(joined_tbls) >= 2
        for col_ref in col_refs:
            # identify the referenced column by name in 'other'
-           rhs_col = other.get_column(col_ref.col.name, include_bases=True)
+           rhs_col = other.get_column(col_ref.col.name)
            if rhs_col is None:
                raise excs.Error(f"'on': column {col_ref.col.name!r} not found in joined table")
            rhs_col_ref = exprs.ColumnRef(rhs_col)

            lhs_col_ref: Optional[exprs.ColumnRef] = None
-           if any(tbl.has_column(col_ref.col, include_bases=True) for tbl in self._from_clause.tbls):
+           if any(tbl.has_column(col_ref.col) for tbl in self._from_clause.tbls):
                # col_ref comes from the existing from_clause, we use that directly
                lhs_col_ref = col_ref
            else:
                # col_ref comes from other, we need to look for a match in the existing from_clause by name
                for tbl in self._from_clause.tbls:
-                   col = tbl.get_column(col_ref.col.name, include_bases=True)
+                   col = tbl.get_column(col_ref.col.name)
                    if col is None:
                        continue
                    if lhs_col_ref is not None:
pixeltable/env.py CHANGED
@@ -15,7 +15,6 @@ import sys
 import threading
 import types
 import typing
-import uuid
 import warnings
 from contextlib import contextmanager
 from dataclasses import dataclass, field
@@ -101,6 +100,8 @@ class Env:
    def _init_env(cls, reinit_db: bool = False) -> None:
        assert not cls.__initializing, 'Circular env initialization detected.'
        cls.__initializing = True
+       if cls._instance is not None:
+           cls._instance._clean_up()
        cls._instance = None
        env = Env()
        env._set_up(reinit_db=reinit_db)
@@ -246,7 +247,7 @@ class Env:
        if self._current_conn is None:
            assert self._current_session is None
            try:
-               self._current_isolation_level = 'SERIALIZABLE' if for_write else 'REPEATABLE_READ'
+               self._current_isolation_level = 'SERIALIZABLE'
                with (
                    self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
                    sql.orm.Session(conn) as session,
@@ -485,7 +486,7 @@ class Env:
                raise excs.Error(error)
            self._logger.info(f'Using database at: {self.db_url}')
        else:
-           self._db_name = os.environ.get('PIXELTABLE_DB', 'pixeltable')
+           self._db_name = config.get_string_value('db') or 'pixeltable'
            self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(Config.get().home / 'pgdata')))
            # cleanup_mode=None will leave the postgres process running after Python exits
            # cleanup_mode='stop' will terminate the postgres process when Python exits
@@ -557,6 +558,14 @@ class Env:
        finally:
            engine.dispose()

+   def _pgserver_terminate_connections_stmt(self) -> str:
+       return f"""
+           SELECT pg_terminate_backend(pg_stat_activity.pid)
+           FROM pg_stat_activity
+           WHERE pg_stat_activity.datname = '{self._db_name}'
+           AND pid <> pg_backend_pid()
+       """
+
    def _drop_store_db(self) -> None:
        assert self._db_name is not None
        engine = sql.create_engine(self._dbms.default_system_db_url(), future=True, isolation_level='AUTOCOMMIT')
@@ -565,13 +574,7 @@ class Env:
        with engine.begin() as conn:
            # terminate active connections
            if self._db_server is not None:
-               stmt = f"""
-                   SELECT pg_terminate_backend(pg_stat_activity.pid)
-                   FROM pg_stat_activity
-                   WHERE pg_stat_activity.datname = '{self._db_name}'
-                   AND pid <> pg_backend_pid()
-               """
-               conn.execute(sql.text(stmt))
+               conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
            # drop db
            stmt = self._dbms.drop_db_stmt(preparer.quote(self._db_name))
            conn.execute(sql.text(stmt))
@@ -749,12 +752,6 @@ class Env:
        else:
            os.remove(path)

-   def num_tmp_files(self) -> int:
-       return len(glob.glob(f'{self._tmp_dir}/*'))
-
-   def create_tmp_path(self, extension: str = '') -> Path:
-       return self._tmp_dir / f'{uuid.uuid4()}{extension}'
-
    # def get_resource_pool_info(self, pool_id: str, pool_info_cls: Optional[Type[T]]) -> T:
    def get_resource_pool_info(self, pool_id: str, make_pool_info: Optional[Callable[[], T]] = None) -> T:
        """Returns the info object for the given id, creating it if necessary."""
@@ -815,6 +812,63 @@ class Env:
        except Exception as exc:
            raise excs.Error(f'Failed to load spaCy model: {spacy_model}') from exc

+   def _clean_up(self) -> None:
+       """
+       Internal cleanup method that properly closes all resources and resets state.
+       This is called before destroying the singleton instance.
+       """
+       assert self._current_session is None
+       assert self._current_conn is None
+
+       # Stop HTTP server
+       if self._httpd is not None:
+           try:
+               self._httpd.shutdown()
+               self._httpd.server_close()
+           except Exception as e:
+               _logger.warning(f'Error stopping HTTP server: {e}')
+
+       # First terminate all connections to the database
+       if self._db_server is not None:
+           assert self._dbms is not None
+           assert self._db_name is not None
+           try:
+               temp_engine = sql.create_engine(self._dbms.default_system_db_url(), isolation_level='AUTOCOMMIT')
+               try:
+                   with temp_engine.begin() as conn:
+                       conn.execute(sql.text(self._pgserver_terminate_connections_stmt()))
+                   _logger.info(f"Terminated all connections to database '{self._db_name}'")
+               except Exception as e:
+                   _logger.warning(f'Error terminating database connections: {e}')
+               finally:
+                   temp_engine.dispose()
+           except Exception as e:
+               _logger.warning(f'Error stopping database server: {e}')
+
+       # Dispose of SQLAlchemy engine (after stopping db server)
+       if self._sa_engine is not None:
+           try:
+               self._sa_engine.dispose()
+           except Exception as e:
+               _logger.warning(f'Error disposing engine: {e}')
+
+       # Close event loop
+       if self._event_loop is not None:
+           try:
+               if self._event_loop.is_running():
+                   self._event_loop.stop()
+               self._event_loop.close()
+           except Exception as e:
+               _logger.warning(f'Error closing event loop: {e}')
+
+       # Remove logging handlers
+       for handler in self._logger.handlers[:]:
+           try:
+               handler.close()
+               self._logger.removeHandler(handler)
+           except Exception as e:
+               _logger.warning(f'Error removing handler: {e}')
+

 def register_client(name: str) -> Callable:
    """Decorator that registers a third-party API client for use by Pixeltable.
pixeltable/exec/aggregation_node.py CHANGED
@@ -103,6 +103,6 @@ class AggregationNode(ExecNode):
            self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
            self.output_batch.add_row(prev_row)

-       self.output_batch.flush_imgs(None, self.stored_img_cols, self.flushed_img_slots)
+       self.output_batch.flush_imgs(None, self.row_builder.stored_img_cols, self.flushed_img_slots)
        _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
        yield self.output_batch
pixeltable/exec/cache_prefetch_node.py CHANGED
@@ -12,8 +12,9 @@ from pathlib import Path
 from typing import Any, AsyncIterator, Iterator, Optional
 from uuid import UUID

-from pixeltable import env, exceptions as excs, exprs
+from pixeltable import exceptions as excs, exprs
 from pixeltable.utils.filecache import FileCache
+from pixeltable.utils.media_store import TempStore

 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
@@ -219,7 +220,7 @@ class CachePrefetchNode(ExecNode):
            self.in_flight_requests[f] = url

    def __fetch_url(self, url: str) -> tuple[Optional[Path], Optional[Exception]]:
-       """Fetches a remote URL into Env.tmp_dir and returns its path"""
+       """Fetches a remote URL into the TempStore and returns its path"""
        _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
        parsed = urllib.parse.urlparse(url)
        # Use len(parsed.scheme) > 1 here to ensure we're not being passed
@@ -230,7 +231,7 @@ class CachePrefetchNode(ExecNode):
        if parsed.path:
            p = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
            extension = p.suffix
-       tmp_path = env.Env.get().create_tmp_path(extension=extension)
+       tmp_path = TempStore.create_path(extension=extension)
        try:
            _logger.debug(f'Downloading {url} to {tmp_path}')
            if parsed.scheme == 's3':
pixeltable/exec/exec_node.py CHANGED
@@ -20,7 +20,6 @@ class ExecNode(abc.ABC):
    row_builder: exprs.RowBuilder
    input: Optional[ExecNode]
    flushed_img_slots: list[int]  # idxs of image slots of our output_exprs dependencies
-   stored_img_cols: list[exprs.ColumnSlotIdx]
    ctx: Optional[ExecContext]

    def __init__(
@@ -40,7 +39,6 @@ class ExecNode(abc.ABC):
        self.flushed_img_slots = [
            e.slot_idx for e in output_dependencies if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
        ]
-       self.stored_img_cols = []
        self.ctx = None  # all nodes of a tree share the same context

    def set_ctx(self, ctx: ExecContext) -> None:
@@ -48,12 +46,6 @@ class ExecNode(abc.ABC):
        if self.input is not None:
            self.input.set_ctx(ctx)

-   def set_stored_img_cols(self, stored_img_cols: list[exprs.ColumnSlotIdx]) -> None:
-       self.stored_img_cols = stored_img_cols
-       # propagate batch size to the source
-       if self.input is not None:
-           self.input.set_stored_img_cols(stored_img_cols)
-
    @abc.abstractmethod
    def __aiter__(self) -> AsyncIterator[DataRowBatch]:
        pass
pixeltable/exec/expr_eval/schedulers.py CHANGED
@@ -56,6 +56,7 @@ class Scheduler(abc.ABC):
        request: FnCallArgs
        num_retries: int
        exec_ctx: ExecCtx
+       retry_after: Optional[float] = None  # time.monotonic()

        def __lt__(self, other: Scheduler.QueueItem) -> bool:
            # prioritize by number of retries (more retries = higher priority)
@@ -270,6 +270,7 @@ class RequestRateScheduler(Scheduler):
    num_in_flight: int
    total_requests: int
    total_retried: int
+   total_errors: int

    TIME_FORMAT = '%H:%M.%S %f'
    MAX_RETRIES = 3
@@ -294,6 +295,7 @@ class RequestRateScheduler(Scheduler):
        self.num_in_flight = 0
        self.total_requests = 0
        self.total_retried = 0
+       self.total_errors = 0

        # try to get the rate limit from the config
        elems = resource_pool.split(':')
@@ -312,6 +314,7 @@ class RequestRateScheduler(Scheduler):
            key = model
        requests_per_min = Config.get().get_int_value(key, section=section)
        requests_per_min = requests_per_min or self.DEFAULT_RATE_LIMIT
+       _logger.debug(f'rate limit for {self.resource_pool}: {requests_per_min} RPM')
        self.secs_per_request = 1 / (requests_per_min / 60)

    @classmethod
@@ -325,8 +328,12 @@ class RequestRateScheduler(Scheduler):
            if item.num_retries > 0:
                self.total_retried += 1
            now = time.monotonic()
+           wait_duration = 0.0
+           if item.retry_after is not None:
+               wait_duration = item.retry_after - now
            if now - last_request_ts < self.secs_per_request:
-               wait_duration = self.secs_per_request - (now - last_request_ts)
+               wait_duration = max(wait_duration, self.secs_per_request - (now - last_request_ts))
+           if wait_duration > 0:
                _logger.debug(f'waiting for {wait_duration} for {self.resource_pool}')
                await asyncio.sleep(wait_duration)
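Note: the scheduler now folds a failed request's retry-after deadline into the same wait computation as the requests-per-minute pacing and sleeps for whichever is longer. A pure-function sketch of that decision (standalone, not the pixeltable API):

    from typing import Optional

    def compute_wait(now: float, last_request_ts: float, secs_per_request: float,
                     retry_after: Optional[float]) -> float:
        # wait at least until the retry-after deadline, if one was set ...
        wait = 0.0 if retry_after is None else retry_after - now
        # ... and at least until the next slot allowed by the request-rate budget
        if now - last_request_ts < secs_per_request:
            wait = max(wait, secs_per_request - (now - last_request_ts))
        return max(wait, 0.0)

    now = 1000.0  # illustrative monotonic timestamp
    # 600 RPM -> 0.1s between requests; a retry-after 0.5s out dominates the pacing gap
    assert compute_wait(now, now - 0.02, 0.1, now + 0.5) == 0.5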
@@ -372,15 +379,20 @@ class RequestRateScheduler(Scheduler):

        except Exception as exc:
            _logger.debug(f'exception for {self.resource_pool}: type={type(exc)}\n{exc}')
+           if hasattr(exc, 'response') and hasattr(exc.response, 'headers'):
+               _logger.debug(f'scheduler {self.resource_pool}: exception headers: {exc.response.headers}')
            is_rate_limit_error, retry_after = self._is_rate_limit_error(exc)
            if is_rate_limit_error and num_retries < self.MAX_RETRIES:
                retry_delay = self._compute_retry_delay(num_retries, retry_after)
                _logger.debug(f'scheduler {self.resource_pool}: retrying after {retry_delay}')
-               await asyncio.sleep(retry_delay)
-               self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx))
+               now = time.monotonic()
+               # put the request back in the queue right away, which prevents new requests from being generated until
+               # this one succeeds or exceeds its retry limit
+               self.queue.put_nowait(self.QueueItem(request, num_retries + 1, exec_ctx, retry_after=now + retry_delay))
                return

            # record the exception
+           self.total_errors += 1
            _, _, exc_tb = sys.exc_info()
            for row in request.rows:
                row.set_exc(request.fn_call.slot_idx, exc)
@@ -388,7 +400,7 @@ class RequestRateScheduler(Scheduler):
        finally:
            _logger.debug(
                f'Scheduler stats: #in-flight={self.num_in_flight} #requests={self.total_requests}, '
-               f'#retried={self.total_retried}'
+               f'#retried={self.total_retried} #errors={self.total_errors}'
            )
            if is_task:
                self.num_in_flight -= 1
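Note: the per-model limit that RequestRateScheduler reads via Config.get().get_int_value(key, section=section) can now come from a nested config section (see the config.py changes above), with the environment lookup still taking precedence. A hedged sketch, assuming a hypothetical model entry in config.toml:

    from pixeltable.config import Config

    # assumes ~/.pixeltable/config.toml contains something like:
    #   [openai.rate_limits]
    #   "gpt-4o" = 500    # hypothetical model name and RPM limit
    rpm = Config.get().get_int_value('gpt-4o', section='openai.rate_limits')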
pixeltable/exec/in_memory_data_node.py CHANGED
@@ -2,7 +2,7 @@ import logging
 from typing import Any, AsyncIterator, Optional

 from pixeltable import catalog, exprs
-from pixeltable.utils.media_store import MediaStore
+from pixeltable.utils.media_store import TempStore

 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
@@ -67,8 +67,7 @@ class InMemoryDataNode(ExecNode):
            col = col_info.col
            if col.col_type.is_image_type() and isinstance(val, bytes):
                # this is a literal media file, ie, a sequence of bytes; save it as a binary file and store the path
-               assert col.tbl.id == self.tbl.id
-               filepath, _ = MediaStore.save_media_object(val, col, format=None)
+               filepath, _ = TempStore.save_media_object(val, col, format=None)
                output_row[col_info.slot_idx] = str(filepath)
            else:
                output_row[col_info.slot_idx] = val
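Note: the removed Env.create_tmp_path() and num_tmp_files() helpers (see the env.py hunk above) are superseded by the new TempStore in pixeltable/utils/media_store.py (+131 -66). Call sites migrate roughly as follows (a sketch based only on the changes shown here):

    from pixeltable.utils.media_store import TempStore

    # was: env.Env.get().create_tmp_path(extension='.jpg')
    tmp_path = TempStore.create_path(extension='.jpg')

    # literal media bytes are likewise staged via the TempStore now:
    # filepath, _ = TempStore.save_media_object(data, col, format=None)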