pixeltable 0.4.13__py3-none-any.whl → 0.4.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46)
  1. pixeltable/catalog/catalog.py +179 -63
  2. pixeltable/catalog/column.py +24 -20
  3. pixeltable/catalog/table.py +24 -8
  4. pixeltable/catalog/table_version.py +15 -6
  5. pixeltable/catalog/view.py +22 -22
  6. pixeltable/config.py +2 -0
  7. pixeltable/dataframe.py +3 -2
  8. pixeltable/env.py +42 -21
  9. pixeltable/exec/__init__.py +1 -0
  10. pixeltable/exec/aggregation_node.py +0 -1
  11. pixeltable/exec/cache_prefetch_node.py +74 -98
  12. pixeltable/exec/data_row_batch.py +2 -18
  13. pixeltable/exec/in_memory_data_node.py +1 -1
  14. pixeltable/exec/object_store_save_node.py +299 -0
  15. pixeltable/exec/sql_node.py +28 -33
  16. pixeltable/exprs/data_row.py +31 -25
  17. pixeltable/exprs/json_path.py +6 -5
  18. pixeltable/exprs/row_builder.py +6 -12
  19. pixeltable/functions/gemini.py +1 -1
  20. pixeltable/functions/openai.py +1 -1
  21. pixeltable/functions/video.py +5 -6
  22. pixeltable/globals.py +3 -3
  23. pixeltable/index/embedding_index.py +5 -8
  24. pixeltable/io/fiftyone.py +1 -1
  25. pixeltable/io/label_studio.py +4 -5
  26. pixeltable/iterators/audio.py +1 -1
  27. pixeltable/iterators/document.py +10 -12
  28. pixeltable/iterators/video.py +1 -1
  29. pixeltable/metadata/schema.py +7 -0
  30. pixeltable/plan.py +26 -1
  31. pixeltable/share/packager.py +8 -2
  32. pixeltable/share/publish.py +3 -9
  33. pixeltable/type_system.py +1 -3
  34. pixeltable/utils/dbms.py +31 -5
  35. pixeltable/utils/gcs_store.py +283 -0
  36. pixeltable/utils/local_store.py +316 -0
  37. pixeltable/utils/object_stores.py +497 -0
  38. pixeltable/utils/pytorch.py +5 -6
  39. pixeltable/utils/s3_store.py +354 -0
  40. {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/METADATA +1 -1
  41. {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/RECORD +44 -41
  42. pixeltable/utils/media_store.py +0 -248
  43. pixeltable/utils/s3.py +0 -17
  44. {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/WHEEL +0 -0
  45. {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/entry_points.txt +0 -0
  46. {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/licenses/LICENSE +0 -0
pixeltable/catalog/table_version.py CHANGED
@@ -20,7 +20,7 @@ from pixeltable.iterators import ComponentIterator
 from pixeltable.metadata import schema
 from pixeltable.utils.exception_handler import run_cleanup_on_exception
 from pixeltable.utils.filecache import FileCache
-from pixeltable.utils.media_store import MediaStore
+from pixeltable.utils.object_stores import ObjectOps

 from .tbl_ops import TableOp

@@ -327,7 +327,7 @@ class TableVersion:
         from .table_version_path import TableVersionPath

         # clear out any remaining media files from an aborted previous attempt
-        MediaStore.get().delete(self.id)
+        self.delete_media()
         view_path = TableVersionPath.from_dict(op.load_view_op.view_path)
         plan, _ = Planner.create_view_load_plan(view_path)
         _, row_counts = self.store_tbl.insert_rows(plan, v_min=self.version)
@@ -356,14 +356,23 @@ class TableVersion:
         cat = pxt.catalog.Catalog.get()
         # We're creating a new TableVersion replica, so we should never have seen this particular
         # TableVersion instance before.
-        assert tbl_version.effective_version is not None
-        assert (tbl_version.id, tbl_version.effective_version) not in cat._tbl_versions
+        # Actually this isn't true, because we might be re-creating a dropped replica.
+        # TODO: Understand why old TableVersions are kept around even for a dropped table.
+        # assert tbl_version.effective_version is not None
+        # assert (tbl_version.id, tbl_version.effective_version) not in cat._tbl_versions
         cat._tbl_versions[tbl_version.id, tbl_version.effective_version] = tbl_version
         tbl_version.init()
         tbl_version.store_tbl.create()
         tbl_version.store_tbl.ensure_columns_exist(col for col in tbl_version.cols if col.is_stored)
         return tbl_version

+    def delete_media(self, tbl_version: Optional[int] = None) -> None:
+        # Assemble a set of column destinations and delete objects from all of them
+        # None is a valid column destination which refers to the default object location
+        destinations = {col.destination for col in self.cols if col.is_stored}
+        for dest in destinations:
+            ObjectOps.delete(dest, self.id, tbl_version=tbl_version)
+
     def drop(self) -> None:
         # if self.is_view and self.is_mutable:
         #     # update mutable_views
@@ -374,7 +383,7 @@
         # if self.base.get().is_mutable:
         #     self.base.get().mutable_views.remove(TableVersionHandle.create(self))

-        MediaStore.get().delete(self.id)
+        self.delete_media()
         FileCache.get().clear(tbl_id=self.id)
         self.store_tbl.drop()

@@ -1236,7 +1245,7 @@
         )

         # delete newly-added data
-        MediaStore.get().delete(self.id, tbl_version=self.version)
+        self.delete_media(tbl_version=self.version)
         conn.execute(sql.delete(self.store_tbl.sa_tbl).where(self.store_tbl.sa_tbl.c.v_min == self.version))

         # revert new deletions
pixeltable/catalog/view.py CHANGED
@@ -47,17 +47,13 @@ class View(Table):
         self._tbl_version = tbl_version_path.tbl_version

     def _display_name(self) -> str:
-        name: str
-        if self._tbl_version_path.is_snapshot():
-            name = 'snapshot'
-        elif self._tbl_version_path.is_view():
-            name = 'view'
-        else:
-            assert self._tbl_version_path.is_replica()
-            name = 'table'
         if self._tbl_version_path.is_replica():
-            name = f'{name}-replica'
-        return name
+            return 'replica'
+        if self._tbl_version_path.is_snapshot():
+            return 'snapshot'
+        if self._tbl_version_path.is_view():
+            return 'view'
+        return 'table'

     @classmethod
     def select_list_to_additional_columns(cls, select_list: list[tuple[exprs.Expr, Optional[str]]]) -> dict[str, dict]:
@@ -270,12 +266,12 @@
         # Update name and path with version qualifiers.
         md['name'] = f'{self._name}:{self._tbl_version_path.version()}'
         md['path'] = f'{self._path()}:{self._tbl_version_path.version()}'
-        base_tbl = self._get_base_table()
-        if base_tbl is None:
-            md['base'] = None
-        else:
+        base_tbl_id = self._base_tbl_id
+        if base_tbl_id is not None:
+            base_tbl = self._get_base_table()
+            base_path = '<anonymous base table>' if base_tbl is None else base_tbl._path()
             base_version = self._effective_base_versions[0]
-            md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
+            md['base'] = base_path if base_version is None else f'{base_path}:{base_version}'
         return md

     def insert(
@@ -294,17 +290,21 @@
     def delete(self, where: Optional[exprs.Expr] = None) -> UpdateStatus:
         raise excs.Error(f'{self._display_str()}: Cannot delete from a {self._display_name()}.')

-    def _get_base_table(self) -> Optional['Table']:
+    @property
+    def _base_tbl_id(self) -> Optional[UUID]:
         if self._tbl_version_path.tbl_id != self._id:
             # _tbl_version_path represents a different schema object from this one. This can only happen if this is a
             # named pure snapshot.
-            base_id = self._tbl_version_path.tbl_id
-        elif self._tbl_version_path.base is None:
+            return self._tbl_version_path.tbl_id
+        if self._tbl_version_path.base is None:
             return None
-        else:
-            base_id = self._tbl_version_path.base.tbl_id
-        with catalog.Catalog.get().begin_xact(tbl_id=base_id, for_write=False):
-            return catalog.Catalog.get().get_table_by_id(base_id)
+        return self._tbl_version_path.base.tbl_id
+
+    def _get_base_table(self) -> Optional['Table']:
+        """Returns None if there is no base table, or if the base table is hidden."""
+        base_tbl_id = self._base_tbl_id
+        with catalog.Catalog.get().begin_xact(tbl_id=base_tbl_id, for_write=False):
+            return catalog.Catalog.get().get_table_by_id(base_tbl_id)

     @property
     def _effective_base_versions(self) -> list[Optional[int]]:
pixeltable/config.py CHANGED
@@ -161,6 +161,8 @@ KNOWN_CONFIG_OPTIONS = {
         'hide_warnings': 'Hide warnings from the console',
         'verbosity': 'Verbosity level for console output',
         'api_key': 'API key for Pixeltable cloud',
+        'r2_profile': 'AWS config profile name used to access R2 storage',
+        's3_profile': 'AWS config profile name used to access S3 storage',
     },
    'anthropic': {'api_key': 'Anthropic API key'},
    'bedrock': {'api_key': 'AWS Bedrock API key'},
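The new keys register profile names for object storage. A minimal, illustrative sketch of reading them back through the `Config.get().get_value(name, type, section=...)` call that appears in the env.py hunk further down; the section name `'pixeltable'` is an assumption inferred from the neighboring `api_key`/`verbosity` keys, not something this diff states:

```python
# Hypothetical usage sketch, not part of the release. Assumes the new options sit in
# the 'pixeltable' section of $PIXELTABLE_HOME/config.toml (or the matching env vars).
from pixeltable.config import Config

s3_profile = Config.get().get_value('s3_profile', str, section='pixeltable')
r2_profile = Config.get().get_value('r2_profile', str, section='pixeltable')
print(f's3_profile={s3_profile!r}, r2_profile={r2_profile!r}')  # None if not configured
```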
pixeltable/dataframe.py CHANGED
@@ -1276,10 +1276,11 @@ class DataFrame:

         # TODO: Reconcile these with Table.__check_mutable()
         assert len(self._from_clause.tbls) == 1
-        if self._first_tbl.is_snapshot():
-            raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
+        # First check if it's a replica, since every replica handle is also a snapshot
         if self._first_tbl.is_replica():
             raise excs.Error(f'Cannot use `{op_name}` on a replica.')
+        if self._first_tbl.is_snapshot():
+            raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')

     def _validate_mutable_op_sequence(self, op_name: str, allow_select: bool) -> None:
         """Tests whether the sequence of operations on this DataFrame is valid for a mutation operation."""
pixeltable/env.py CHANGED
@@ -28,6 +28,7 @@ import nest_asyncio  # type: ignore[import-untyped]
 import pixeltable_pgserver
 import sqlalchemy as sql
 from pillow_heif import register_heif_opener  # type: ignore[import-untyped]
+from sqlalchemy import orm
 from tenacity import retry, stop_after_attempt, wait_exponential_jitter
 from tqdm import TqdmWarning

@@ -36,6 +37,7 @@ from pixeltable.config import Config
 from pixeltable.utils.console_output import ConsoleLogger, ConsoleMessageFilter, ConsoleOutputHandler, map_level
 from pixeltable.utils.dbms import CockroachDbms, Dbms, PostgresqlDbms
 from pixeltable.utils.http_server import make_server
+from pixeltable.utils.object_stores import ObjectPath, StorageObjectAddress

 if TYPE_CHECKING:
     import spacy
@@ -58,7 +60,8 @@ class Env:
     _log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'

     _media_dir: Optional[Path]
-    _file_cache_dir: Optional[Path]  # cached media files with external URL
+    _object_soa: Optional[StorageObjectAddress]
+    _file_cache_dir: Optional[Path]  # cached object files with external URL
     _dataset_cache_dir: Optional[Path]  # cached datasets (eg, pytorch or COCO)
     _log_dir: Optional[Path]  # log files
     _tmp_dir: Optional[Path]  # any tmp files
@@ -88,7 +91,7 @@ class Env:

     _resource_pool_info: dict[str, Any]
     _current_conn: Optional[sql.Connection]
-    _current_session: Optional[sql.orm.Session]
+    _current_session: Optional[orm.Session]
     _current_isolation_level: Optional[Literal['REPEATABLE_READ', 'SERIALIZABLE']]
     _dbms: Optional[Dbms]
     _event_loop: Optional[asyncio.AbstractEventLoop]  # event loop for ExecNode
@@ -120,7 +123,8 @@ class Env:
         assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'

         self._media_dir = None  # computed media files
-        self._file_cache_dir = None  # cached media files with external URL
+        self._object_soa = None  # computed object files in StorageObjectAddress format
+        self._file_cache_dir = None  # cached object files with external URL
         self._dataset_cache_dir = None  # cached datasets (eg, pytorch or COCO)
         self._log_dir = None  # log files
         self._tmp_dir = None  # any tmp files
@@ -224,7 +228,7 @@ class Env:
         return self._current_conn

     @property
-    def session(self) -> Optional[sql.orm.Session]:
+    def session(self) -> Optional[orm.Session]:
         assert self._current_session is not None
         return self._current_session

@@ -258,7 +262,7 @@ class Env:
         self._current_isolation_level = 'SERIALIZABLE'
         with (
             self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
-            sql.orm.Session(conn) as session,
+            orm.Session(conn) as session,
             conn.begin(),
         ):
             self._current_conn = conn
@@ -363,6 +367,7 @@

         if not self._media_dir.exists():
             self._media_dir.mkdir()
+        self._object_soa = ObjectPath.parse_object_storage_addr(str(self._media_dir), may_contain_object_name=False)
         if not self._file_cache_dir.exists():
             self._file_cache_dir.mkdir()
         if not self._dataset_cache_dir.exists():
@@ -615,15 +620,17 @@
         Args:
        - name: The name of the client
         """
-        cl = _registered_clients[name]
-        if cl.client_obj is not None:
-            return cl.client_obj  # Already initialized
-
-        # Construct a client, retrieving each parameter from config.
+        # Return the existing client if it has already been constructed
+        with _registered_clients_lock:
+            cl = _registered_clients[name]
+            if cl.client_obj is not None:
+                return cl.client_obj  # Already initialized

+        # Retrieve parameters required to construct the requested client.
         init_kwargs: dict[str, Any] = {}
         for param in cl.params.values():
             # Determine the type of the parameter for proper config parsing.
+            pname = param.name
             t = param.annotation
             # Deference Optional[T]
             if typing.get_origin(t) in (typing.Union, types.UnionType):
@@ -633,27 +640,31 @@
                 elif args[1] is type(None):
                     t = args[0]
             assert isinstance(t, type), t
-            arg: Any = Config.get().get_value(param.name, t, section=name)
+            arg: Any = Config.get().get_value(pname, t, section=name)
             if arg is not None:
-                init_kwargs[param.name] = arg
+                init_kwargs[pname] = arg
             elif param.default is inspect.Parameter.empty:
                 raise excs.Error(
-                    f'`{name}` client not initialized: parameter `{param.name}` is not configured.\n'
-                    f'To fix this, specify the `{name.upper()}_{param.name.upper()}` environment variable, '
-                    f'or put `{param.name.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
+                    f'`{name}` client not initialized: parameter `{pname}` is not configured.\n'
+                    f'To fix this, specify the `{name.upper()}_{pname.upper()}` environment variable, '
+                    f'or put `{pname.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
                )

-        cl.client_obj = cl.init_fn(**init_kwargs)
-        self._logger.info(f'Initialized `{name}` client.')
-        return cl.client_obj
+        # Construct the requested client
+        with _registered_clients_lock:
+            if cl.client_obj is not None:
+                return cl.client_obj  # Already initialized
+            cl.client_obj = cl.init_fn(**init_kwargs)
+            self._logger.info(f'Initialized `{name}` client with parameters: {init_kwargs}.')
+            return cl.client_obj

     def _start_web_server(self) -> None:
         """
         The http server root is the file system root.
         eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
         On Windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
-        This arrangement enables serving media hosted within _home,
-        as well as external media inserted into pixeltable or produced by pixeltable.
+        This arrangement enables serving objects hosted within _home,
+        as well as external objects inserted into pixeltable or produced by pixeltable.
         The port is chosen dynamically to prevent conflicts.
         """
         # Port 0 means OS picks one for us.
@@ -713,10 +724,12 @@
     def __register_packages(self) -> None:
         """Declare optional packages that are utilized by some parts of the code."""
         self.__register_package('anthropic')
+        self.__register_package('azure.storage.blob', library_name='azure-storage-blob')
         self.__register_package('boto3')
         self.__register_package('datasets')
         self.__register_package('fiftyone')
         self.__register_package('fireworks', library_name='fireworks-ai')
+        self.__register_package('google.cloud.storage', library_name='google-cloud-storage')
         self.__register_package('google.genai', library_name='google-genai')
         self.__register_package('groq')
         self.__register_package('huggingface_hub', library_name='huggingface-hub')
@@ -815,6 +828,12 @@
         assert self._media_dir is not None
         return self._media_dir

+    @property
+    def object_soa(self) -> StorageObjectAddress:
+        assert self._media_dir is not None
+        assert self._object_soa is not None
+        return self._object_soa
+
     @property
     def file_cache_dir(self) -> Path:
         assert self._file_cache_dir is not None
@@ -947,11 +966,13 @@ def register_client(name: str) -> Callable:
     def decorator(fn: Callable) -> None:
         sig = inspect.signature(fn)
         params = dict(sig.parameters)
-        _registered_clients[name] = ApiClient(init_fn=fn, params=params)
+        with _registered_clients_lock:
+            _registered_clients[name] = ApiClient(init_fn=fn, params=params)

     return decorator


+_registered_clients_lock: threading.Lock = threading.Lock()
 _registered_clients: dict[str, ApiClient] = {}

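The reworked client lookup amounts to double-checked locking: the lock is held only to read or publish the cached client object, while the slow parameter lookup runs unlocked and the cache is re-checked before construction. A stripped-down sketch of the same pattern with hypothetical names (not Pixeltable's API):

```python
# Minimal double-checked lazy initialization sketch, assuming a single shared client.
import threading
from typing import Any, Callable, Optional

_lock = threading.Lock()
_client: Optional[Any] = None

def get_or_create(init_fn: Callable[..., Any], gather_params: Callable[[], dict]) -> Any:
    """Lazily construct a shared client without holding the lock during slow work."""
    global _client
    with _lock:                  # fast path: client already constructed
        if _client is not None:
            return _client
    kwargs = gather_params()     # slow config lookups happen outside the lock
    with _lock:                  # re-check: another thread may have won the race
        if _client is None:
            _client = init_fn(**kwargs)
        return _client
```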
 
pixeltable/exec/__init__.py CHANGED
@@ -8,5 +8,6 @@ from .exec_context import ExecContext
 from .exec_node import ExecNode
 from .expr_eval import ExprEvalNode
 from .in_memory_data_node import InMemoryDataNode
+from .object_store_save_node import ObjectStoreSaveNode
 from .row_update_node import RowUpdateNode
 from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlSampleNode, SqlScanNode
pixeltable/exec/aggregation_node.py CHANGED
@@ -103,6 +103,5 @@ class AggregationNode(ExecNode):
             self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
             self.output_batch.add_row(prev_row)

-        self.output_batch.flush_imgs(None, self.row_builder.stored_img_cols, self.flushed_img_slots)
         _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
         yield self.output_batch
pixeltable/exec/cache_prefetch_node.py CHANGED
@@ -9,12 +9,12 @@ import urllib.request
 from collections import deque
 from concurrent import futures
 from pathlib import Path
-from typing import Any, AsyncIterator, Iterator, Optional
+from typing import AsyncIterator, Iterator, Optional
 from uuid import UUID

 from pixeltable import exceptions as excs, exprs
 from pixeltable.utils.filecache import FileCache
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.object_stores import ObjectOps

 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
@@ -26,16 +26,17 @@ class CachePrefetchNode(ExecNode):
     """Brings files with external URLs into the cache

     TODO:
-    - adapting the number of download threads at runtime to maximize throughput
+    - Process a row at a time and limit the number of in-flight rows to control memory usage
+    - Create asyncio.Tasks to consume our input in order to increase concurrency.
     """

+    QUEUE_DEPTH_HIGH_WATER = 50  # target number of in-flight requests
+    QUEUE_DEPTH_LOW_WATER = 20  # target number of in-flight requests
     BATCH_SIZE = 16
-    NUM_EXECUTOR_THREADS = 16
+    MAX_WORKERS = 15

     retain_input_order: bool  # if True, return rows in the exact order they were received
     file_col_info: list[exprs.ColumnSlotIdx]
-    boto_client: Optional[Any]
-    boto_client_lock: threading.Lock

     # execution state
     num_returned_rows: int
@@ -64,10 +65,6 @@ class CachePrefetchNode(ExecNode):
         self.retain_input_order = retain_input_order
         self.file_col_info = file_col_info

-        # clients for specific services are constructed as needed, because it's time-consuming
-        self.boto_client = None
-        self.boto_client_lock = threading.Lock()
-
         self.num_returned_rows = 0
         self.ready_rows = deque()
         self.in_flight_rows = {}
@@ -75,24 +72,42 @@ class CachePrefetchNode(ExecNode):
         self.in_flight_urls = {}
         self.input_finished = False
         self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+        assert self.QUEUE_DEPTH_HIGH_WATER > self.QUEUE_DEPTH_LOW_WATER

-    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
-        input_iter = self.input.__aiter__()
-        with futures.ThreadPoolExecutor(max_workers=self.NUM_EXECUTOR_THREADS) as executor:
-            # we create enough in-flight requests to fill the first batch
-            while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
-                await self.__submit_input_batch(input_iter, executor)
+    @property
+    def queued_work(self) -> int:
+        return len(self.in_flight_requests)

-            while True:
-                # try to assemble a full batch of output rows
-                if not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
-                    self.__wait_for_requests()
-
-                # try to create enough in-flight requests to fill the next batch
-                while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
-                    await self.__submit_input_batch(input_iter, executor)
+    async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> Optional[DataRowBatch]:
+        """Get the next batch of input rows, or None if there are no more rows"""
+        try:
+            input_batch = await anext(input_iter)
+            if input_batch is None:
+                self.input_finished = True
+            return input_batch
+        except StopAsyncIteration:
+            self.input_finished = True
+            return None

-                if len(self.ready_rows) > 0:
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        input_iter = aiter(self.input)
+        with futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            while True:
+                # Create work to fill the queue to the high water mark ... ?without overrunning the in-flight row limit.
+                while not self.input_finished and self.queued_work < self.QUEUE_DEPTH_HIGH_WATER:
+                    input_batch = await self.get_input_batch(input_iter)
+                    if input_batch is not None:
+                        self.__process_input_batch(input_batch, executor)
+
+                # Wait for enough completions to enable more queueing or if we're done
+                while self.queued_work > self.QUEUE_DEPTH_LOW_WATER or (self.input_finished and self.queued_work > 0):
+                    done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+                    self.__process_completions(done, ignore_errors=self.ctx.ignore_errors)
+
+                # Emit results to meet batch size requirements or empty the in-flight row queue
+                if self.__has_ready_batch() or (
+                    len(self.ready_rows) > 0 and self.input_finished and self.queued_work == 0
+                ):
                     # create DataRowBatch from the first BATCH_SIZE ready rows
                     batch = DataRowBatch(self.row_builder)
                     rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
@@ -103,22 +118,15 @@ class CachePrefetchNode(ExecNode):
                     _logger.debug(f'returning {len(rows)} rows')
                     yield batch

-                if self.input_finished and self.__num_pending_rows() == 0:
+                if self.input_finished and self.queued_work == 0 and len(self.ready_rows) == 0:
                     return

-    def __num_pending_rows(self) -> int:
-        return len(self.in_flight_rows) + len(self.ready_rows)
-
     def __has_ready_batch(self) -> bool:
         """True if there are >= BATCH_SIZES entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
         return (
             sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
         )

-    def __ready_prefix_len(self) -> int:
-        """Length of the non-None prefix of ready_rows (= what we can return right now)"""
-        return sum(1 for _ in itertools.takewhile(lambda x: x is not None, self.ready_rows))
-
     def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
         if row_idx is None:
             self.ready_rows.append(row)
@@ -129,50 +137,36 @@ class CachePrefetchNode(ExecNode):
             self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
         self.ready_rows[idx] = row

-    def __wait_for_requests(self) -> None:
-        """Wait for in-flight requests to complete until we have a full batch of rows"""
+    def __process_completions(self, done: set[futures.Future], ignore_errors: bool) -> None:
         file_cache = FileCache.get()
-        _logger.debug(f'waiting for requests; ready_batch_size={self.__ready_prefix_len()}')
-        while not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
-            done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
-            for f in done:
-                url = self.in_flight_requests.pop(f)
-                tmp_path, exc = f.result()
-                local_path: Optional[Path] = None
-                if tmp_path is not None:
-                    # register the file with the cache for the first column in which it's missing
-                    assert url in self.in_flight_urls
-                    _, info = self.in_flight_urls[url][0]
-                    local_path = file_cache.add(info.col.tbl.id, info.col.id, url, tmp_path)
-                    _logger.debug(f'cached {url} as {local_path}')
-
-                # add the local path/exception to the slots that reference the url
-                for row, info in self.in_flight_urls.pop(url):
-                    if exc is not None:
-                        self.row_builder.set_exc(row, info.slot_idx, exc)
-                    else:
-                        assert local_path is not None
-                        row.set_file_path(info.slot_idx, str(local_path))
-                    state = self.in_flight_rows[id(row)]
-                    state.num_missing -= 1
-                    if state.num_missing == 0:
-                        del self.in_flight_rows[id(row)]
-                        self.__add_ready_row(row, state.idx)
-                        _logger.debug(f'row {state.idx} is ready (ready_batch_size={self.__ready_prefix_len()})')
-
-    async def __submit_input_batch(
-        self, input: AsyncIterator[DataRowBatch], executor: futures.ThreadPoolExecutor
-    ) -> None:
-        assert not self.input_finished
-        input_batch: Optional[DataRowBatch]
-        try:
-            input_batch = await anext(input)
-        except StopAsyncIteration:
-            input_batch = None
-        if input_batch is None:
-            self.input_finished = True
-            return
-
+        for f in done:
+            url = self.in_flight_requests.pop(f)
+            tmp_path, exc = f.result()
+            if exc is not None and not ignore_errors:
+                raise exc
+            local_path: Optional[Path] = None
+            if tmp_path is not None:
+                # register the file with the cache for the first column in which it's missing
+                assert url in self.in_flight_urls
+                _, info = self.in_flight_urls[url][0]
+                local_path = file_cache.add(info.col.tbl.id, info.col.id, url, tmp_path)
+                _logger.debug(f'cached {url} as {local_path}')
+
+            # add the local path/exception to the slots that reference the url
+            for row, info in self.in_flight_urls.pop(url):
+                if exc is not None:
+                    self.row_builder.set_exc(row, info.slot_idx, exc)
+                else:
+                    assert local_path is not None
+                    row.set_file_path(info.slot_idx, str(local_path))
+                state = self.in_flight_rows[id(row)]
+                state.num_missing -= 1
+                if state.num_missing == 0:
+                    del self.in_flight_rows[id(row)]
+                    self.__add_ready_row(row, state.idx)
+
+    def __process_input_batch(self, input_batch: DataRowBatch, executor: futures.ThreadPoolExecutor) -> None:
+        """Process a batch of input rows, submitting URLs for download and adding ready rows to ready_rows"""
         file_cache = FileCache.get()

         # URLs from this input batch that aren't already in the file cache;
@@ -180,7 +174,7 @@
         # the time it takes to get the next batch together
         cache_misses: list[str] = []

-        url_pos: dict[str, int] = {}  # url -> row_idx; used for logging
+        url_pos: dict[str, Optional[int]] = {}  # url -> row_idx; used for logging
         for row in input_batch:
             # identify missing local files in input batch, or fill in their paths if they're already cached
             num_missing = 0
@@ -221,6 +215,8 @@

     def __fetch_url(self, url: str) -> tuple[Optional[Path], Optional[Exception]]:
         """Fetches a remote URL into the TempStore and returns its path"""
+        from pixeltable.utils.local_store import TempStore
+
         _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
         parsed = urllib.parse.urlparse(url)
         # Use len(parsed.scheme) > 1 here to ensure we're not being passed
@@ -234,31 +230,11 @@
         tmp_path = TempStore.create_path(extension=extension)
         try:
             _logger.debug(f'Downloading {url} to {tmp_path}')
-            if parsed.scheme == 's3':
-                from pixeltable.utils.s3 import get_client
-
-                with self.boto_client_lock:
-                    if self.boto_client is None:
-                        config = {
-                            'max_pool_connections': self.NUM_EXECUTOR_THREADS + 4,  # +4: leave some headroom
-                            'connect_timeout': 5,
-                            'read_timeout': 30,
-                            'retries': {'max_attempts': 3, 'mode': 'adaptive'},
-                        }
-                        self.boto_client = get_client(**config)
-                self.boto_client.download_file(parsed.netloc, parsed.path.lstrip('/'), str(tmp_path))
-            elif parsed.scheme in ('http', 'https'):
-                with urllib.request.urlopen(url) as resp, open(tmp_path, 'wb') as f:
-                    data = resp.read()
-                    f.write(data)
-            else:
-                raise AssertionError(f'Unsupported URL scheme: {parsed.scheme}')
+            ObjectOps.copy_object_to_local_file(url, tmp_path)
             _logger.debug(f'Downloaded {url} to {tmp_path}')
             return tmp_path, None
         except Exception as e:
             # we want to add the file url to the exception message
             exc = excs.Error(f'Failed to download {url}: {e}')
             _logger.debug(f'Failed to download {url}: {e}', exc_info=e)
-            if not self.ctx.ignore_errors:
-                raise exc from None  # suppress original exception
             return None, exc
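The rewritten control flow is essentially a high/low water mark pump around a `ThreadPoolExecutor`: keep submitting downloads until roughly `QUEUE_DEPTH_HIGH_WATER` futures are outstanding, then block on `futures.wait(..., FIRST_COMPLETED)` until the backlog drops below `QUEUE_DEPTH_LOW_WATER`. A self-contained sketch of the same pattern, simplified and with hypothetical `jobs`/`work_fn` stand-ins for the node's URL submissions:

```python
# Illustrative only; mirrors the loop structure above without the row/cache bookkeeping.
from concurrent import futures
from typing import Callable, Iterable, Iterator, TypeVar

T = TypeVar('T')
R = TypeVar('R')

HIGH_WATER, LOW_WATER, MAX_WORKERS = 50, 20, 15

def pump(jobs: Iterable[T], work_fn: Callable[[T], R]) -> Iterator[R]:
    """Yield results while keeping roughly 20-50 requests in flight."""
    job_iter = iter(jobs)
    in_flight: set[futures.Future] = set()
    exhausted = False
    with futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        while True:
            # top up to the high water mark (or until the input runs dry)
            while not exhausted and len(in_flight) < HIGH_WATER:
                try:
                    in_flight.add(executor.submit(work_fn, next(job_iter)))
                except StopIteration:
                    exhausted = True
            # drain completions until we can submit again, or until everything is done
            while len(in_flight) > LOW_WATER or (exhausted and in_flight):
                done, in_flight = futures.wait(in_flight, return_when=futures.FIRST_COMPLETED)
                for f in done:
                    yield f.result()
            if exhausted and not in_flight:
                return

# usage sketch: for local_path in pump(urls, download_one): ...
```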
pixeltable/exec/data_row_batch.py CHANGED
@@ -12,15 +12,14 @@ class DataRowBatch:
     """Set of DataRows, indexed by rowid.

     Contains the metadata needed to initialize DataRows.
+
+    Requires either num_rows or rows to be specified, but not both.
     """

     row_builder: exprs.RowBuilder
     rows: list[exprs.DataRow]

     def __init__(self, row_builder: exprs.RowBuilder, rows: Optional[list[exprs.DataRow]] = None):
-        """
-        Requires either num_rows or rows to be specified, but not both.
-        """
         self.row_builder = row_builder
         self.rows = [] if rows is None else rows

@@ -39,20 +38,5 @@
     def __getitem__(self, index: int) -> exprs.DataRow:
         return self.rows[index]

-    def flush_imgs(
-        self, idx_range: Optional[slice], stored_img_info: list[exprs.ColumnSlotIdx], flushed_img_slots: list[int]
-    ) -> None:
-        """Flushes images in the given range of rows."""
-        if len(stored_img_info) == 0 and len(flushed_img_slots) == 0:
-            return
-
-        if idx_range is None:
-            idx_range = slice(0, len(self.rows))
-        for row in self.rows[idx_range]:
-            for info in stored_img_info:
-                row.flush_img(info.slot_idx, info.col)
-            for slot_idx in flushed_img_slots:
-                row.flush_img(slot_idx)
-
     def __iter__(self) -> Iterator[exprs.DataRow]:
         return iter(self.rows)