pixeltable 0.4.13__py3-none-any.whl → 0.4.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable has been flagged as potentially problematic; see the package registry's advisory page for more details.

Files changed (55):
  1. pixeltable/__init__.py +2 -1
  2. pixeltable/catalog/catalog.py +187 -63
  3. pixeltable/catalog/column.py +24 -20
  4. pixeltable/catalog/table.py +24 -8
  5. pixeltable/catalog/table_metadata.py +1 -0
  6. pixeltable/catalog/table_version.py +16 -34
  7. pixeltable/catalog/update_status.py +12 -0
  8. pixeltable/catalog/view.py +22 -22
  9. pixeltable/config.py +2 -0
  10. pixeltable/dataframe.py +4 -2
  11. pixeltable/env.py +46 -21
  12. pixeltable/exec/__init__.py +1 -0
  13. pixeltable/exec/aggregation_node.py +0 -1
  14. pixeltable/exec/cache_prefetch_node.py +74 -98
  15. pixeltable/exec/data_row_batch.py +2 -18
  16. pixeltable/exec/expr_eval/expr_eval_node.py +11 -0
  17. pixeltable/exec/in_memory_data_node.py +1 -1
  18. pixeltable/exec/object_store_save_node.py +299 -0
  19. pixeltable/exec/sql_node.py +28 -33
  20. pixeltable/exprs/data_row.py +31 -25
  21. pixeltable/exprs/json_path.py +6 -5
  22. pixeltable/exprs/row_builder.py +6 -12
  23. pixeltable/functions/gemini.py +1 -1
  24. pixeltable/functions/openai.py +1 -1
  25. pixeltable/functions/video.py +128 -15
  26. pixeltable/functions/whisperx.py +2 -0
  27. pixeltable/functions/yolox.py +2 -0
  28. pixeltable/globals.py +49 -30
  29. pixeltable/index/embedding_index.py +5 -8
  30. pixeltable/io/__init__.py +1 -0
  31. pixeltable/io/fiftyone.py +1 -1
  32. pixeltable/io/label_studio.py +4 -5
  33. pixeltable/iterators/__init__.py +1 -0
  34. pixeltable/iterators/audio.py +1 -1
  35. pixeltable/iterators/document.py +10 -12
  36. pixeltable/iterators/video.py +1 -1
  37. pixeltable/metadata/schema.py +7 -0
  38. pixeltable/plan.py +26 -1
  39. pixeltable/share/packager.py +8 -2
  40. pixeltable/share/publish.py +3 -10
  41. pixeltable/store.py +1 -1
  42. pixeltable/type_system.py +1 -3
  43. pixeltable/utils/dbms.py +31 -5
  44. pixeltable/utils/gcs_store.py +283 -0
  45. pixeltable/utils/local_store.py +316 -0
  46. pixeltable/utils/object_stores.py +497 -0
  47. pixeltable/utils/pytorch.py +5 -6
  48. pixeltable/utils/s3_store.py +354 -0
  49. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/METADATA +1 -1
  50. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/RECORD +53 -50
  51. pixeltable/utils/media_store.py +0 -248
  52. pixeltable/utils/s3.py +0 -17
  53. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/WHEEL +0 -0
  54. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/entry_points.txt +0 -0
  55. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/licenses/LICENSE +0 -0
@@ -18,9 +18,8 @@ from pixeltable import exprs, index
18
18
  from pixeltable.env import Env
19
19
  from pixeltable.iterators import ComponentIterator
20
20
  from pixeltable.metadata import schema
21
- from pixeltable.utils.exception_handler import run_cleanup_on_exception
22
21
  from pixeltable.utils.filecache import FileCache
23
- from pixeltable.utils.media_store import MediaStore
22
+ from pixeltable.utils.object_stores import ObjectOps
24
23
 
25
24
  from .tbl_ops import TableOp
26
25
 
@@ -327,7 +326,7 @@ class TableVersion:
327
326
  from .table_version_path import TableVersionPath
328
327
 
329
328
  # clear out any remaining media files from an aborted previous attempt
330
- MediaStore.get().delete(self.id)
329
+ self.delete_media()
331
330
  view_path = TableVersionPath.from_dict(op.load_view_op.view_path)
332
331
  plan, _ = Planner.create_view_load_plan(view_path)
333
332
  _, row_counts = self.store_tbl.insert_rows(plan, v_min=self.version)
@@ -356,14 +355,23 @@ class TableVersion:
356
355
  cat = pxt.catalog.Catalog.get()
357
356
  # We're creating a new TableVersion replica, so we should never have seen this particular
358
357
  # TableVersion instance before.
359
- assert tbl_version.effective_version is not None
360
- assert (tbl_version.id, tbl_version.effective_version) not in cat._tbl_versions
358
+ # Actually this isn't true, because we might be re-creating a dropped replica.
359
+ # TODO: Understand why old TableVersions are kept around even for a dropped table.
360
+ # assert tbl_version.effective_version is not None
361
+ # assert (tbl_version.id, tbl_version.effective_version) not in cat._tbl_versions
361
362
  cat._tbl_versions[tbl_version.id, tbl_version.effective_version] = tbl_version
362
363
  tbl_version.init()
363
364
  tbl_version.store_tbl.create()
364
365
  tbl_version.store_tbl.ensure_columns_exist(col for col in tbl_version.cols if col.is_stored)
365
366
  return tbl_version
366
367
 
368
+ def delete_media(self, tbl_version: Optional[int] = None) -> None:
369
+ # Assemble a set of column destinations and delete objects from all of them
370
+ # None is a valid column destination which refers to the default object location
371
+ destinations = {col.destination for col in self.cols if col.is_stored}
372
+ for dest in destinations:
373
+ ObjectOps.delete(dest, self.id, tbl_version=tbl_version)
374
+
367
375
  def drop(self) -> None:
368
376
  # if self.is_view and self.is_mutable:
369
377
  # # update mutable_views
@@ -374,7 +382,7 @@ class TableVersion:
374
382
  # if self.base.get().is_mutable:
375
383
  # self.base.get().mutable_views.remove(TableVersionHandle.create(self))
376
384
 
377
- MediaStore.get().delete(self.id)
385
+ self.delete_media()
378
386
  FileCache.get().clear(tbl_id=self.id)
379
387
  self.store_tbl.drop()
380
388
 
@@ -595,18 +603,7 @@ class TableVersion:
595
603
  idx_info = self.IndexInfo(id=idx_id, name=idx_name, idx=idx, col=col, val_col=val_col, undo_col=undo_col)
596
604
  self._tbl_md.index_md[idx_id] = idx_md
597
605
  self.idxs_by_name[idx_name] = idx_info
598
- try:
599
- idx.create_index(self._store_idx_name(idx_id), val_col)
600
- finally:
601
-
602
- def cleanup_index() -> None:
603
- """Delete the newly added in-memory index structure"""
604
- del self.idxs_by_name[idx_name]
605
- del self._tbl_md.index_md[idx_id]
606
- self.next_idx_id = idx_id
607
-
608
- # Run cleanup only if there has been an exception; otherwise, skip cleanup.
609
- run_cleanup_on_exception(cleanup_index)
606
+ idx.create_index(self._store_idx_name(idx_id), val_col)
610
607
 
611
608
  def _add_index(self, col: Column, idx_name: Optional[str], idx: index.IndexBase) -> UpdateStatus:
612
609
  val_col, undo_vol = self._create_index_columns(idx)
@@ -741,21 +738,6 @@ class TableVersion:
741
738
  num_excs += excs_per_col
742
739
  computed_values += plan.ctx.num_computed_exprs * row_count
743
740
  finally:
744
- # Ensure cleanup occurs if an exception or keyboard interruption happens during `load_column()`.
745
- def cleanup_on_error() -> None:
746
- """Delete columns that are added as part of current add_columns operation and re-initialize
747
- the sqlalchemy schema"""
748
- self.cols = [col for col in self.cols if col not in cols_to_add]
749
- for col in cols_to_add:
750
- # remove columns that we already added
751
- if col.id in self.cols_by_id:
752
- del self.cols_by_id[col.id]
753
- if col.name is not None and col.name in self.cols_by_name:
754
- del self.cols_by_name[col.name]
755
- self.store_tbl.create_sa_tbl()
756
-
757
- # Run cleanup only if there has been an exception; otherwise, skip cleanup.
758
- run_cleanup_on_exception(cleanup_on_error)
759
741
  plan.close()
760
742
 
761
743
  pxt.catalog.Catalog.get().record_column_dependencies(self)
@@ -1236,7 +1218,7 @@ class TableVersion:
1236
1218
  )
1237
1219
 
1238
1220
  # delete newly-added data
1239
- MediaStore.get().delete(self.id, tbl_version=self.version)
1221
+ self.delete_media(tbl_version=self.version)
1240
1222
  conn.execute(sql.delete(self.store_tbl.sa_tbl).where(self.store_tbl.sa_tbl.c.v_min == self.version))
1241
1223
 
1242
1224
  # revert new deletions
@@ -57,27 +57,35 @@ class UpdateStatus:
57
57
  """
58
58
 
59
59
  updated_cols: list[str] = field(default_factory=list)
60
+ """Columns that were updated."""
60
61
  cols_with_excs: list[str] = field(default_factory=list)
62
+ """Columns that encountered exceptions."""
61
63
 
62
64
  # stats for the rows affected by the operation
63
65
  row_count_stats: RowCountStats = field(default_factory=RowCountStats)
66
+ """Row count statistics for rows affected by this operation."""
64
67
 
65
68
  # stats for changes cascaded to other tables
66
69
  cascade_row_count_stats: RowCountStats = field(default_factory=RowCountStats)
70
+ """Row count statistics for changes cascaded to other tables."""
67
71
 
68
72
  # stats for the rows affected by the operation in an external store
69
73
  ext_row_count_stats: RowCountStats = field(default_factory=RowCountStats)
74
+ """Row count statistics for rows affected in an external store."""
70
75
 
71
76
  @property
72
77
  def num_rows(self) -> int:
78
+ """Total number of rows affected (including cascaded changes)."""
73
79
  return self.row_count_stats.num_rows + self.cascade_row_count_stats.num_rows
74
80
 
75
81
  @property
76
82
  def num_excs(self) -> int:
83
+ """Total number of exceptions encountered (including cascaded changes)."""
77
84
  return self.row_count_stats.num_excs + self.cascade_row_count_stats.num_excs
78
85
 
79
86
  @property
80
87
  def num_computed_values(self) -> int:
88
+ """Total number of computed values affected (including cascaded changes)."""
81
89
  return self.row_count_stats.computed_values + self.cascade_row_count_stats.computed_values
82
90
 
83
91
  def insert_to_update(self) -> 'UpdateStatus':
@@ -164,16 +172,20 @@ class UpdateStatus:
164
172
 
165
173
  @property
166
174
  def external_rows_updated(self) -> int:
175
+ """Number of rows updated in an external store."""
167
176
  return self.ext_row_count_stats.upd_rows
168
177
 
169
178
  @property
170
179
  def external_rows_created(self) -> int:
180
+ """Number of rows created in an external store."""
171
181
  return self.ext_row_count_stats.ins_rows
172
182
 
173
183
  @property
174
184
  def external_rows_deleted(self) -> int:
185
+ """Number of rows deleted from an external store."""
175
186
  return self.ext_row_count_stats.del_rows
176
187
 
177
188
  @property
178
189
  def ext_num_rows(self) -> int:
190
+ """Total number of rows affected in an external store."""
179
191
  return self.ext_row_count_stats.num_rows
@@ -47,17 +47,13 @@ class View(Table):
47
47
  self._tbl_version = tbl_version_path.tbl_version
48
48
 
49
49
  def _display_name(self) -> str:
50
- name: str
51
- if self._tbl_version_path.is_snapshot():
52
- name = 'snapshot'
53
- elif self._tbl_version_path.is_view():
54
- name = 'view'
55
- else:
56
- assert self._tbl_version_path.is_replica()
57
- name = 'table'
58
50
  if self._tbl_version_path.is_replica():
59
- name = f'{name}-replica'
60
- return name
51
+ return 'replica'
52
+ if self._tbl_version_path.is_snapshot():
53
+ return 'snapshot'
54
+ if self._tbl_version_path.is_view():
55
+ return 'view'
56
+ return 'table'
61
57
 
62
58
  @classmethod
63
59
  def select_list_to_additional_columns(cls, select_list: list[tuple[exprs.Expr, Optional[str]]]) -> dict[str, dict]:
@@ -270,12 +266,12 @@ class View(Table):
270
266
  # Update name and path with version qualifiers.
271
267
  md['name'] = f'{self._name}:{self._tbl_version_path.version()}'
272
268
  md['path'] = f'{self._path()}:{self._tbl_version_path.version()}'
273
- base_tbl = self._get_base_table()
274
- if base_tbl is None:
275
- md['base'] = None
276
- else:
269
+ base_tbl_id = self._base_tbl_id
270
+ if base_tbl_id is not None:
271
+ base_tbl = self._get_base_table()
272
+ base_path = '<anonymous base table>' if base_tbl is None else base_tbl._path()
277
273
  base_version = self._effective_base_versions[0]
278
- md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
274
+ md['base'] = base_path if base_version is None else f'{base_path}:{base_version}'
279
275
  return md
280
276
 
281
277
  def insert(
@@ -294,17 +290,21 @@ class View(Table):
294
290
  def delete(self, where: Optional[exprs.Expr] = None) -> UpdateStatus:
295
291
  raise excs.Error(f'{self._display_str()}: Cannot delete from a {self._display_name()}.')
296
292
 
297
- def _get_base_table(self) -> Optional['Table']:
293
+ @property
294
+ def _base_tbl_id(self) -> Optional[UUID]:
298
295
  if self._tbl_version_path.tbl_id != self._id:
299
296
  # _tbl_version_path represents a different schema object from this one. This can only happen if this is a
300
297
  # named pure snapshot.
301
- base_id = self._tbl_version_path.tbl_id
302
- elif self._tbl_version_path.base is None:
298
+ return self._tbl_version_path.tbl_id
299
+ if self._tbl_version_path.base is None:
303
300
  return None
304
- else:
305
- base_id = self._tbl_version_path.base.tbl_id
306
- with catalog.Catalog.get().begin_xact(tbl_id=base_id, for_write=False):
307
- return catalog.Catalog.get().get_table_by_id(base_id)
301
+ return self._tbl_version_path.base.tbl_id
302
+
303
+ def _get_base_table(self) -> Optional['Table']:
304
+ """Returns None if there is no base table, or if the base table is hidden."""
305
+ base_tbl_id = self._base_tbl_id
306
+ with catalog.Catalog.get().begin_xact(tbl_id=base_tbl_id, for_write=False):
307
+ return catalog.Catalog.get().get_table_by_id(base_tbl_id)
308
308
 
309
309
  @property
310
310
  def _effective_base_versions(self) -> list[Optional[int]]:
pixeltable/config.py CHANGED
@@ -161,6 +161,8 @@ KNOWN_CONFIG_OPTIONS = {
161
161
  'hide_warnings': 'Hide warnings from the console',
162
162
  'verbosity': 'Verbosity level for console output',
163
163
  'api_key': 'API key for Pixeltable cloud',
164
+ 'r2_profile': 'AWS config profile name used to access R2 storage',
165
+ 's3_profile': 'AWS config profile name used to access S3 storage',
164
166
  },
165
167
  'anthropic': {'api_key': 'Anthropic API key'},
166
168
  'bedrock': {'api_key': 'AWS Bedrock API key'},
pixeltable/dataframe.py CHANGED
@@ -456,6 +456,7 @@ class DataFrame:
456
456
 
457
457
  @property
458
458
  def schema(self) -> dict[str, ColumnType]:
459
+ """Column names and types in this DataFrame."""
459
460
  return self._schema
460
461
 
461
462
  def bind(self, args: dict[str, Any]) -> DataFrame:
@@ -1276,10 +1277,11 @@ class DataFrame:
1276
1277
 
1277
1278
  # TODO: Reconcile these with Table.__check_mutable()
1278
1279
  assert len(self._from_clause.tbls) == 1
1279
- if self._first_tbl.is_snapshot():
1280
- raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
1280
+ # First check if it's a replica, since every replica handle is also a snapshot
1281
1281
  if self._first_tbl.is_replica():
1282
1282
  raise excs.Error(f'Cannot use `{op_name}` on a replica.')
1283
+ if self._first_tbl.is_snapshot():
1284
+ raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
1283
1285
 
1284
1286
  def _validate_mutable_op_sequence(self, op_name: str, allow_select: bool) -> None:
1285
1287
  """Tests whether the sequence of operations on this DataFrame is valid for a mutation operation."""
pixeltable/env.py CHANGED
@@ -28,6 +28,7 @@ import nest_asyncio # type: ignore[import-untyped]
28
28
  import pixeltable_pgserver
29
29
  import sqlalchemy as sql
30
30
  from pillow_heif import register_heif_opener # type: ignore[import-untyped]
31
+ from sqlalchemy import orm
31
32
  from tenacity import retry, stop_after_attempt, wait_exponential_jitter
32
33
  from tqdm import TqdmWarning
33
34
 
@@ -36,6 +37,7 @@ from pixeltable.config import Config
36
37
  from pixeltable.utils.console_output import ConsoleLogger, ConsoleMessageFilter, ConsoleOutputHandler, map_level
37
38
  from pixeltable.utils.dbms import CockroachDbms, Dbms, PostgresqlDbms
38
39
  from pixeltable.utils.http_server import make_server
40
+ from pixeltable.utils.object_stores import ObjectPath, StorageObjectAddress
39
41
 
40
42
  if TYPE_CHECKING:
41
43
  import spacy
@@ -58,7 +60,8 @@ class Env:
58
60
  _log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'
59
61
 
60
62
  _media_dir: Optional[Path]
61
- _file_cache_dir: Optional[Path] # cached media files with external URL
63
+ _object_soa: Optional[StorageObjectAddress]
64
+ _file_cache_dir: Optional[Path] # cached object files with external URL
62
65
  _dataset_cache_dir: Optional[Path] # cached datasets (eg, pytorch or COCO)
63
66
  _log_dir: Optional[Path] # log files
64
67
  _tmp_dir: Optional[Path] # any tmp files
@@ -88,7 +91,7 @@ class Env:
88
91
 
89
92
  _resource_pool_info: dict[str, Any]
90
93
  _current_conn: Optional[sql.Connection]
91
- _current_session: Optional[sql.orm.Session]
94
+ _current_session: Optional[orm.Session]
92
95
  _current_isolation_level: Optional[Literal['REPEATABLE_READ', 'SERIALIZABLE']]
93
96
  _dbms: Optional[Dbms]
94
97
  _event_loop: Optional[asyncio.AbstractEventLoop] # event loop for ExecNode
@@ -120,7 +123,8 @@ class Env:
120
123
  assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'
121
124
 
122
125
  self._media_dir = None # computed media files
123
- self._file_cache_dir = None # cached media files with external URL
126
+ self._object_soa = None # computed object files in StorageObjectAddress format
127
+ self._file_cache_dir = None # cached object files with external URL
124
128
  self._dataset_cache_dir = None # cached datasets (eg, pytorch or COCO)
125
129
  self._log_dir = None # log files
126
130
  self._tmp_dir = None # any tmp files
@@ -224,7 +228,7 @@ class Env:
224
228
  return self._current_conn
225
229
 
226
230
  @property
227
- def session(self) -> Optional[sql.orm.Session]:
231
+ def session(self) -> Optional[orm.Session]:
228
232
  assert self._current_session is not None
229
233
  return self._current_session
230
234
 
@@ -258,7 +262,7 @@ class Env:
258
262
  self._current_isolation_level = 'SERIALIZABLE'
259
263
  with (
260
264
  self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
261
- sql.orm.Session(conn) as session,
265
+ orm.Session(conn) as session,
262
266
  conn.begin(),
263
267
  ):
264
268
  self._current_conn = conn
@@ -363,6 +367,7 @@ class Env:
363
367
 
364
368
  if not self._media_dir.exists():
365
369
  self._media_dir.mkdir()
370
+ self._object_soa = ObjectPath.parse_object_storage_addr(str(self._media_dir), may_contain_object_name=False)
366
371
  if not self._file_cache_dir.exists():
367
372
  self._file_cache_dir.mkdir()
368
373
  if not self._dataset_cache_dir.exists():
@@ -615,15 +620,17 @@ class Env:
615
620
  Args:
616
621
  - name: The name of the client
617
622
  """
618
- cl = _registered_clients[name]
619
- if cl.client_obj is not None:
620
- return cl.client_obj # Already initialized
621
-
622
- # Construct a client, retrieving each parameter from config.
623
+ # Return the existing client if it has already been constructed
624
+ with _registered_clients_lock:
625
+ cl = _registered_clients[name]
626
+ if cl.client_obj is not None:
627
+ return cl.client_obj # Already initialized
623
628
 
629
+ # Retrieve parameters required to construct the requested client.
624
630
  init_kwargs: dict[str, Any] = {}
625
631
  for param in cl.params.values():
626
632
  # Determine the type of the parameter for proper config parsing.
633
+ pname = param.name
627
634
  t = param.annotation
628
635
  # Deference Optional[T]
629
636
  if typing.get_origin(t) in (typing.Union, types.UnionType):
@@ -633,27 +640,31 @@ class Env:
633
640
  elif args[1] is type(None):
634
641
  t = args[0]
635
642
  assert isinstance(t, type), t
636
- arg: Any = Config.get().get_value(param.name, t, section=name)
643
+ arg: Any = Config.get().get_value(pname, t, section=name)
637
644
  if arg is not None:
638
- init_kwargs[param.name] = arg
645
+ init_kwargs[pname] = arg
639
646
  elif param.default is inspect.Parameter.empty:
640
647
  raise excs.Error(
641
- f'`{name}` client not initialized: parameter `{param.name}` is not configured.\n'
642
- f'To fix this, specify the `{name.upper()}_{param.name.upper()}` environment variable, '
643
- f'or put `{param.name.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
648
+ f'`{name}` client not initialized: parameter `{pname}` is not configured.\n'
649
+ f'To fix this, specify the `{name.upper()}_{pname.upper()}` environment variable, '
650
+ f'or put `{pname.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
644
651
  )
645
652
 
646
- cl.client_obj = cl.init_fn(**init_kwargs)
647
- self._logger.info(f'Initialized `{name}` client.')
648
- return cl.client_obj
653
+ # Construct the requested client
654
+ with _registered_clients_lock:
655
+ if cl.client_obj is not None:
656
+ return cl.client_obj # Already initialized
657
+ cl.client_obj = cl.init_fn(**init_kwargs)
658
+ self._logger.info(f'Initialized `{name}` client with parameters: {init_kwargs}.')
659
+ return cl.client_obj
649
660
 
650
661
  def _start_web_server(self) -> None:
651
662
  """
652
663
  The http server root is the file system root.
653
664
  eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
654
665
  On Windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
655
- This arrangement enables serving media hosted within _home,
656
- as well as external media inserted into pixeltable or produced by pixeltable.
666
+ This arrangement enables serving objects hosted within _home,
667
+ as well as external objects inserted into pixeltable or produced by pixeltable.
657
668
  The port is chosen dynamically to prevent conflicts.
658
669
  """
659
670
  # Port 0 means OS picks one for us.
@@ -713,10 +724,12 @@ class Env:
713
724
  def __register_packages(self) -> None:
714
725
  """Declare optional packages that are utilized by some parts of the code."""
715
726
  self.__register_package('anthropic')
727
+ self.__register_package('azure.storage.blob', library_name='azure-storage-blob')
716
728
  self.__register_package('boto3')
717
729
  self.__register_package('datasets')
718
730
  self.__register_package('fiftyone')
719
731
  self.__register_package('fireworks', library_name='fireworks-ai')
732
+ self.__register_package('google.cloud.storage', library_name='google-cloud-storage')
720
733
  self.__register_package('google.genai', library_name='google-genai')
721
734
  self.__register_package('groq')
722
735
  self.__register_package('huggingface_hub', library_name='huggingface-hub')
@@ -757,6 +770,10 @@ class Env:
757
770
  library_name=library_name or package_name, # defaults to package_name unless specified otherwise
758
771
  )
759
772
 
773
+ def require_binary(self, binary_name: str) -> None:
774
+ if not shutil.which(binary_name):
775
+ raise excs.Error(f'{binary_name} is not installed or not in PATH. Please install it to use this feature.')
776
+
760
777
  def require_package(self, package_name: str, min_version: Optional[list[int]] = None) -> None:
761
778
  """
762
779
  Checks whether the specified optional package is available. If not, raises an exception
@@ -815,6 +832,12 @@ class Env:
815
832
  assert self._media_dir is not None
816
833
  return self._media_dir
817
834
 
835
+ @property
836
+ def object_soa(self) -> StorageObjectAddress:
837
+ assert self._media_dir is not None
838
+ assert self._object_soa is not None
839
+ return self._object_soa
840
+
818
841
  @property
819
842
  def file_cache_dir(self) -> Path:
820
843
  assert self._file_cache_dir is not None
@@ -947,11 +970,13 @@ def register_client(name: str) -> Callable:
947
970
  def decorator(fn: Callable) -> None:
948
971
  sig = inspect.signature(fn)
949
972
  params = dict(sig.parameters)
950
- _registered_clients[name] = ApiClient(init_fn=fn, params=params)
973
+ with _registered_clients_lock:
974
+ _registered_clients[name] = ApiClient(init_fn=fn, params=params)
951
975
 
952
976
  return decorator
953
977
 
954
978
 
979
+ _registered_clients_lock: threading.Lock = threading.Lock()
955
980
  _registered_clients: dict[str, ApiClient] = {}
956
981
 
957
982
 
@@ -8,5 +8,6 @@ from .exec_context import ExecContext
8
8
  from .exec_node import ExecNode
9
9
  from .expr_eval import ExprEvalNode
10
10
  from .in_memory_data_node import InMemoryDataNode
11
+ from .object_store_save_node import ObjectStoreSaveNode
11
12
  from .row_update_node import RowUpdateNode
12
13
  from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlSampleNode, SqlScanNode
@@ -103,6 +103,5 @@ class AggregationNode(ExecNode):
103
103
  self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
104
104
  self.output_batch.add_row(prev_row)
105
105
 
106
- self.output_batch.flush_imgs(None, self.row_builder.stored_img_cols, self.flushed_img_slots)
107
106
  _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
108
107
  yield self.output_batch