pixeltable 0.4.13__py3-none-any.whl → 0.4.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/catalog/catalog.py +179 -63
- pixeltable/catalog/column.py +24 -20
- pixeltable/catalog/table.py +24 -8
- pixeltable/catalog/table_version.py +15 -6
- pixeltable/catalog/view.py +22 -22
- pixeltable/config.py +2 -0
- pixeltable/dataframe.py +3 -2
- pixeltable/env.py +42 -21
- pixeltable/exec/__init__.py +1 -0
- pixeltable/exec/aggregation_node.py +0 -1
- pixeltable/exec/cache_prefetch_node.py +74 -98
- pixeltable/exec/data_row_batch.py +2 -18
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/object_store_save_node.py +299 -0
- pixeltable/exec/sql_node.py +28 -33
- pixeltable/exprs/data_row.py +31 -25
- pixeltable/exprs/json_path.py +6 -5
- pixeltable/exprs/row_builder.py +6 -12
- pixeltable/functions/gemini.py +1 -1
- pixeltable/functions/openai.py +1 -1
- pixeltable/functions/video.py +5 -6
- pixeltable/globals.py +3 -3
- pixeltable/index/embedding_index.py +5 -8
- pixeltable/io/fiftyone.py +1 -1
- pixeltable/io/label_studio.py +4 -5
- pixeltable/iterators/audio.py +1 -1
- pixeltable/iterators/document.py +10 -12
- pixeltable/iterators/video.py +1 -1
- pixeltable/metadata/schema.py +7 -0
- pixeltable/plan.py +26 -1
- pixeltable/share/packager.py +8 -2
- pixeltable/share/publish.py +3 -9
- pixeltable/type_system.py +1 -3
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/object_stores.py +497 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +354 -0
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/METADATA +1 -1
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/RECORD +44 -41
- pixeltable/utils/media_store.py +0 -248
- pixeltable/utils/s3.py +0 -17
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.14.dist-info}/licenses/LICENSE +0 -0
pixeltable/catalog/table_version.py
CHANGED

@@ -20,7 +20,7 @@ from pixeltable.iterators import ComponentIterator
 from pixeltable.metadata import schema
 from pixeltable.utils.exception_handler import run_cleanup_on_exception
 from pixeltable.utils.filecache import FileCache
-from pixeltable.utils.
+from pixeltable.utils.object_stores import ObjectOps

 from .tbl_ops import TableOp

@@ -327,7 +327,7 @@ class TableVersion:
         from .table_version_path import TableVersionPath

         # clear out any remaining media files from an aborted previous attempt
-
+        self.delete_media()
         view_path = TableVersionPath.from_dict(op.load_view_op.view_path)
         plan, _ = Planner.create_view_load_plan(view_path)
         _, row_counts = self.store_tbl.insert_rows(plan, v_min=self.version)

@@ -356,14 +356,23 @@ class TableVersion:
         cat = pxt.catalog.Catalog.get()
         # We're creating a new TableVersion replica, so we should never have seen this particular
         # TableVersion instance before.
-
-
+        # Actually this isn't true, because we might be re-creating a dropped replica.
+        # TODO: Understand why old TableVersions are kept around even for a dropped table.
+        # assert tbl_version.effective_version is not None
+        # assert (tbl_version.id, tbl_version.effective_version) not in cat._tbl_versions
         cat._tbl_versions[tbl_version.id, tbl_version.effective_version] = tbl_version
         tbl_version.init()
         tbl_version.store_tbl.create()
         tbl_version.store_tbl.ensure_columns_exist(col for col in tbl_version.cols if col.is_stored)
         return tbl_version

+    def delete_media(self, tbl_version: Optional[int] = None) -> None:
+        # Assemble a set of column destinations and delete objects from all of them
+        # None is a valid column destination which refers to the default object location
+        destinations = {col.destination for col in self.cols if col.is_stored}
+        for dest in destinations:
+            ObjectOps.delete(dest, self.id, tbl_version=tbl_version)
+
     def drop(self) -> None:
         # if self.is_view and self.is_mutable:
         # # update mutable_views

@@ -374,7 +383,7 @@ class TableVersion:
         # if self.base.get().is_mutable:
         # self.base.get().mutable_views.remove(TableVersionHandle.create(self))

-
+        self.delete_media()
         FileCache.get().clear(tbl_id=self.id)
         self.store_tbl.drop()

@@ -1236,7 +1245,7 @@ class TableVersion:
         )

         # delete newly-added data
-
+        self.delete_media(tbl_version=self.version)
         conn.execute(sql.delete(self.store_tbl.sa_tbl).where(self.store_tbl.sa_tbl.c.v_min == self.version))

         # revert new deletions
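The delete_media helper added above first dedupes the stored columns' destinations (None is a legitimate entry meaning the default object location) and then calls ObjectOps.delete once per destination. Below is a minimal standalone sketch of that dedup-then-delete idea, using illustrative stand-ins rather than pixeltable's Column and ObjectOps classes:

from dataclasses import dataclass
from typing import Callable, Optional

@dataclass
class Column:                    # illustrative stand-in, not pixeltable's catalog Column
    is_stored: bool
    destination: Optional[str]   # None -> default object location

def purge_media(cols: list[Column], delete_fn: Callable[[Optional[str]], None]) -> None:
    # a set keeps None alongside real destinations, so each store is purged exactly once
    destinations = {c.destination for c in cols if c.is_stored}
    for dest in destinations:
        delete_fn(dest)

purge_media(
    [Column(True, None), Column(True, 's3://bucket/media'), Column(True, None)],
    delete_fn=lambda dest: print(f'deleting objects in {dest or "<default location>"}'),
)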
pixeltable/catalog/view.py
CHANGED
@@ -47,17 +47,13 @@ class View(Table):
         self._tbl_version = tbl_version_path.tbl_version

     def _display_name(self) -> str:
-        name: str
-        if self._tbl_version_path.is_snapshot():
-            name = 'snapshot'
-        elif self._tbl_version_path.is_view():
-            name = 'view'
-        else:
-            assert self._tbl_version_path.is_replica()
-            name = 'table'
         if self._tbl_version_path.is_replica():
-
-
+            return 'replica'
+        if self._tbl_version_path.is_snapshot():
+            return 'snapshot'
+        if self._tbl_version_path.is_view():
+            return 'view'
+        return 'table'

     @classmethod
     def select_list_to_additional_columns(cls, select_list: list[tuple[exprs.Expr, Optional[str]]]) -> dict[str, dict]:

@@ -270,12 +266,12 @@ class View(Table):
         # Update name and path with version qualifiers.
         md['name'] = f'{self._name}:{self._tbl_version_path.version()}'
         md['path'] = f'{self._path()}:{self._tbl_version_path.version()}'
-
-        if
-
-
+        base_tbl_id = self._base_tbl_id
+        if base_tbl_id is not None:
+            base_tbl = self._get_base_table()
+            base_path = '<anonymous base table>' if base_tbl is None else base_tbl._path()
             base_version = self._effective_base_versions[0]
-            md['base'] =
+            md['base'] = base_path if base_version is None else f'{base_path}:{base_version}'
         return md

     def insert(

@@ -294,17 +290,21 @@ class View(Table):
     def delete(self, where: Optional[exprs.Expr] = None) -> UpdateStatus:
         raise excs.Error(f'{self._display_str()}: Cannot delete from a {self._display_name()}.')

-
+    @property
+    def _base_tbl_id(self) -> Optional[UUID]:
         if self._tbl_version_path.tbl_id != self._id:
             # _tbl_version_path represents a different schema object from this one. This can only happen if this is a
             # named pure snapshot.
-
-
+            return self._tbl_version_path.tbl_id
+        if self._tbl_version_path.base is None:
             return None
-
-
-
-
+        return self._tbl_version_path.base.tbl_id
+
+    def _get_base_table(self) -> Optional['Table']:
+        """Returns None if there is no base table, or if the base table is hidden."""
+        base_tbl_id = self._base_tbl_id
+        with catalog.Catalog.get().begin_xact(tbl_id=base_tbl_id, for_write=False):
+            return catalog.Catalog.get().get_table_by_id(base_tbl_id)

     @property
     def _effective_base_versions(self) -> list[Optional[int]]:
pixeltable/config.py
CHANGED
@@ -161,6 +161,8 @@ KNOWN_CONFIG_OPTIONS = {
         'hide_warnings': 'Hide warnings from the console',
         'verbosity': 'Verbosity level for console output',
         'api_key': 'API key for Pixeltable cloud',
+        'r2_profile': 'AWS config profile name used to access R2 storage',
+        's3_profile': 'AWS config profile name used to access S3 storage',
     },
     'anthropic': {'api_key': 'Anthropic API key'},
     'bedrock': {'api_key': 'AWS Bedrock API key'},
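The two new options sit next to the existing Pixeltable-cloud settings and name AWS config profiles for S3 and R2 access. Below is a hedged sketch of how they could be read back with Config.get_value, whose (key, type, section) call signature appears in the env.py hunk further down; the 'pixeltable' section name is inferred from the neighboring options and is an assumption, not something this diff confirms:

from typing import Optional

from pixeltable.config import Config

def storage_profiles() -> tuple[Optional[str], Optional[str]]:
    # section='pixeltable' is assumed; the option keys come from the hunk above
    cfg = Config.get()
    s3_profile = cfg.get_value('s3_profile', str, section='pixeltable')
    r2_profile = cfg.get_value('r2_profile', str, section='pixeltable')
    return s3_profile, r2_profile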
pixeltable/dataframe.py
CHANGED
@@ -1276,10 +1276,11 @@ class DataFrame:

         # TODO: Reconcile these with Table.__check_mutable()
         assert len(self._from_clause.tbls) == 1
-        if
-            raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
+        # First check if it's a replica, since every replica handle is also a snapshot
         if self._first_tbl.is_replica():
             raise excs.Error(f'Cannot use `{op_name}` on a replica.')
+        if self._first_tbl.is_snapshot():
+            raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')

     def _validate_mutable_op_sequence(self, op_name: str, allow_select: bool) -> None:
         """Tests whether the sequence of operations on this DataFrame is valid for a mutation operation."""
pixeltable/env.py
CHANGED
@@ -28,6 +28,7 @@ import nest_asyncio  # type: ignore[import-untyped]
 import pixeltable_pgserver
 import sqlalchemy as sql
 from pillow_heif import register_heif_opener  # type: ignore[import-untyped]
+from sqlalchemy import orm
 from tenacity import retry, stop_after_attempt, wait_exponential_jitter
 from tqdm import TqdmWarning

@@ -36,6 +37,7 @@ from pixeltable.config import Config
 from pixeltable.utils.console_output import ConsoleLogger, ConsoleMessageFilter, ConsoleOutputHandler, map_level
 from pixeltable.utils.dbms import CockroachDbms, Dbms, PostgresqlDbms
 from pixeltable.utils.http_server import make_server
+from pixeltable.utils.object_stores import ObjectPath, StorageObjectAddress

 if TYPE_CHECKING:
     import spacy

@@ -58,7 +60,8 @@ class Env:
     _log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'

     _media_dir: Optional[Path]
-
+    _object_soa: Optional[StorageObjectAddress]
+    _file_cache_dir: Optional[Path]  # cached object files with external URL
     _dataset_cache_dir: Optional[Path]  # cached datasets (eg, pytorch or COCO)
     _log_dir: Optional[Path]  # log files
     _tmp_dir: Optional[Path]  # any tmp files

@@ -88,7 +91,7 @@ class Env:

     _resource_pool_info: dict[str, Any]
     _current_conn: Optional[sql.Connection]
-    _current_session: Optional[
+    _current_session: Optional[orm.Session]
     _current_isolation_level: Optional[Literal['REPEATABLE_READ', 'SERIALIZABLE']]
     _dbms: Optional[Dbms]
     _event_loop: Optional[asyncio.AbstractEventLoop]  # event loop for ExecNode

@@ -120,7 +123,8 @@ class Env:
         assert self._instance is None, 'Env is a singleton; use Env.get() to access the instance'

         self._media_dir = None  # computed media files
-        self.
+        self._object_soa = None  # computed object files in StorageObjectAddress format
+        self._file_cache_dir = None  # cached object files with external URL
         self._dataset_cache_dir = None  # cached datasets (eg, pytorch or COCO)
         self._log_dir = None  # log files
         self._tmp_dir = None  # any tmp files

@@ -224,7 +228,7 @@ class Env:
         return self._current_conn

     @property
-    def session(self) -> Optional[
+    def session(self) -> Optional[orm.Session]:
         assert self._current_session is not None
         return self._current_session

@@ -258,7 +262,7 @@ class Env:
         self._current_isolation_level = 'SERIALIZABLE'
         with (
             self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
-
+            orm.Session(conn) as session,
             conn.begin(),
         ):
             self._current_conn = conn

@@ -363,6 +367,7 @@ class Env:

         if not self._media_dir.exists():
             self._media_dir.mkdir()
+        self._object_soa = ObjectPath.parse_object_storage_addr(str(self._media_dir), may_contain_object_name=False)
         if not self._file_cache_dir.exists():
             self._file_cache_dir.mkdir()
         if not self._dataset_cache_dir.exists():

@@ -615,15 +620,17 @@ class Env:
         Args:
            - name: The name of the client
         """
-
-
-
-
-
+        # Return the existing client if it has already been constructed
+        with _registered_clients_lock:
+            cl = _registered_clients[name]
+            if cl.client_obj is not None:
+                return cl.client_obj  # Already initialized

+        # Retrieve parameters required to construct the requested client.
         init_kwargs: dict[str, Any] = {}
         for param in cl.params.values():
             # Determine the type of the parameter for proper config parsing.
+            pname = param.name
             t = param.annotation
             # Deference Optional[T]
             if typing.get_origin(t) in (typing.Union, types.UnionType):

@@ -633,27 +640,31 @@ class Env:
                 elif args[1] is type(None):
                     t = args[0]
             assert isinstance(t, type), t
-            arg: Any = Config.get().get_value(
+            arg: Any = Config.get().get_value(pname, t, section=name)
             if arg is not None:
-                init_kwargs[
+                init_kwargs[pname] = arg
             elif param.default is inspect.Parameter.empty:
                 raise excs.Error(
-                    f'`{name}` client not initialized: parameter `{
-                    f'To fix this, specify the `{name.upper()}_{
-                    f'or put `{
+                    f'`{name}` client not initialized: parameter `{pname}` is not configured.\n'
+                    f'To fix this, specify the `{name.upper()}_{pname.upper()}` environment variable, '
+                    f'or put `{pname.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
                 )

-
-
-
+        # Construct the requested client
+        with _registered_clients_lock:
+            if cl.client_obj is not None:
+                return cl.client_obj  # Already initialized
+            cl.client_obj = cl.init_fn(**init_kwargs)
+            self._logger.info(f'Initialized `{name}` client with parameters: {init_kwargs}.')
+            return cl.client_obj

     def _start_web_server(self) -> None:
         """
         The http server root is the file system root.
         eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
         On Windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
-        This arrangement enables serving
-        as well as external
+        This arrangement enables serving objects hosted within _home,
+        as well as external objects inserted into pixeltable or produced by pixeltable.
         The port is chosen dynamically to prevent conflicts.
         """
         # Port 0 means OS picks one for us.

@@ -713,10 +724,12 @@ class Env:
     def __register_packages(self) -> None:
         """Declare optional packages that are utilized by some parts of the code."""
         self.__register_package('anthropic')
+        self.__register_package('azure.storage.blob', library_name='azure-storage-blob')
         self.__register_package('boto3')
         self.__register_package('datasets')
         self.__register_package('fiftyone')
         self.__register_package('fireworks', library_name='fireworks-ai')
+        self.__register_package('google.cloud.storage', library_name='google-cloud-storage')
         self.__register_package('google.genai', library_name='google-genai')
         self.__register_package('groq')
         self.__register_package('huggingface_hub', library_name='huggingface-hub')

@@ -815,6 +828,12 @@ class Env:
         assert self._media_dir is not None
         return self._media_dir

+    @property
+    def object_soa(self) -> StorageObjectAddress:
+        assert self._media_dir is not None
+        assert self._object_soa is not None
+        return self._object_soa
+
     @property
     def file_cache_dir(self) -> Path:
         assert self._file_cache_dir is not None

@@ -947,11 +966,13 @@ def register_client(name: str) -> Callable:
     def decorator(fn: Callable) -> None:
         sig = inspect.signature(fn)
         params = dict(sig.parameters)
-
+        with _registered_clients_lock:
+            _registered_clients[name] = ApiClient(init_fn=fn, params=params)

     return decorator


+_registered_clients_lock: threading.Lock = threading.Lock()
 _registered_clients: dict[str, ApiClient] = {}
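The client-initialization hunk above takes _registered_clients_lock twice: once to return an already-built client, and again to re-check client_obj before constructing and publishing it, so the config/env parameter resolution runs with the lock released. Below is a standalone sketch of that double-checked pattern, with illustrative names rather than pixeltable's registry:

import threading
from typing import Any, Callable, Optional

_lock = threading.Lock()
_client: Optional[Any] = None

def get_or_init_client(resolve_kwargs: Callable[[], dict], factory: Callable[..., Any]) -> Any:
    global _client
    with _lock:
        if _client is not None:
            return _client              # already initialized
    kwargs = resolve_kwargs()           # config/env lookup runs without holding the lock
    with _lock:
        if _client is None:             # another thread may have constructed it meanwhile
            _client = factory(**kwargs)
        return _client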
pixeltable/exec/__init__.py
CHANGED
@@ -8,5 +8,6 @@ from .exec_context import ExecContext
 from .exec_node import ExecNode
 from .expr_eval import ExprEvalNode
 from .in_memory_data_node import InMemoryDataNode
+from .object_store_save_node import ObjectStoreSaveNode
 from .row_update_node import RowUpdateNode
 from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlSampleNode, SqlScanNode
pixeltable/exec/aggregation_node.py
CHANGED

@@ -103,6 +103,5 @@ class AggregationNode(ExecNode):
             self.row_builder.eval(prev_row, self.agg_fn_eval_ctx, profile=self.ctx.profile)
             self.output_batch.add_row(prev_row)

-        self.output_batch.flush_imgs(None, self.row_builder.stored_img_cols, self.flushed_img_slots)
         _logger.debug(f'AggregateNode: consumed {num_input_rows} rows, returning {len(self.output_batch.rows)} rows')
         yield self.output_batch
pixeltable/exec/cache_prefetch_node.py
CHANGED

@@ -9,12 +9,12 @@ import urllib.request
 from collections import deque
 from concurrent import futures
 from pathlib import Path
-from typing import
+from typing import AsyncIterator, Iterator, Optional
 from uuid import UUID

 from pixeltable import exceptions as excs, exprs
 from pixeltable.utils.filecache import FileCache
-from pixeltable.utils.
+from pixeltable.utils.object_stores import ObjectOps

 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode

@@ -26,16 +26,17 @@ class CachePrefetchNode(ExecNode):
     """Brings files with external URLs into the cache

     TODO:
-    -
+    - Process a row at a time and limit the number of in-flight rows to control memory usage
+    - Create asyncio.Tasks to consume our input in order to increase concurrency.
     """

+    QUEUE_DEPTH_HIGH_WATER = 50  # target number of in-flight requests
+    QUEUE_DEPTH_LOW_WATER = 20  # target number of in-flight requests
     BATCH_SIZE = 16
-
+    MAX_WORKERS = 15

     retain_input_order: bool  # if True, return rows in the exact order they were received
     file_col_info: list[exprs.ColumnSlotIdx]
-    boto_client: Optional[Any]
-    boto_client_lock: threading.Lock

     # execution state
     num_returned_rows: int

@@ -64,10 +65,6 @@ class CachePrefetchNode(ExecNode):
         self.retain_input_order = retain_input_order
         self.file_col_info = file_col_info

-        # clients for specific services are constructed as needed, because it's time-consuming
-        self.boto_client = None
-        self.boto_client_lock = threading.Lock()
-
         self.num_returned_rows = 0
         self.ready_rows = deque()
         self.in_flight_rows = {}

@@ -75,24 +72,42 @@ class CachePrefetchNode(ExecNode):
         self.in_flight_urls = {}
         self.input_finished = False
         self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+        assert self.QUEUE_DEPTH_HIGH_WATER > self.QUEUE_DEPTH_LOW_WATER

-
-
-
-        # we create enough in-flight requests to fill the first batch
-        while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
-            await self.__submit_input_batch(input_iter, executor)
+    @property
+    def queued_work(self) -> int:
+        return len(self.in_flight_requests)

-
-
-
-
-
-
-
-
+    async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> Optional[DataRowBatch]:
+        """Get the next batch of input rows, or None if there are no more rows"""
+        try:
+            input_batch = await anext(input_iter)
+            if input_batch is None:
+                self.input_finished = True
+            return input_batch
+        except StopAsyncIteration:
+            self.input_finished = True
+            return None

-
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        input_iter = aiter(self.input)
+        with futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            while True:
+                # Create work to fill the queue to the high water mark without overrunning the in-flight row limit
+                while not self.input_finished and self.queued_work < self.QUEUE_DEPTH_HIGH_WATER:
+                    input_batch = await self.get_input_batch(input_iter)
+                    if input_batch is not None:
+                        self.__process_input_batch(input_batch, executor)
+
+                # Wait for enough completions to enable more queueing or if we're done
+                while self.queued_work > self.QUEUE_DEPTH_LOW_WATER or (self.input_finished and self.queued_work > 0):
+                    done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+                    self.__process_completions(done, ignore_errors=self.ctx.ignore_errors)
+
+                # Emit results to meet batch size requirements or empty the in-flight row queue
+                if self.__has_ready_batch() or (
+                    len(self.ready_rows) > 0 and self.input_finished and self.queued_work == 0
+                ):
                     # create DataRowBatch from the first BATCH_SIZE ready rows
                     batch = DataRowBatch(self.row_builder)
                     rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]

@@ -103,22 +118,15 @@ class CachePrefetchNode(ExecNode):
                     _logger.debug(f'returning {len(rows)} rows')
                     yield batch

-                if self.input_finished and self.
+                if self.input_finished and self.queued_work == 0 and len(self.ready_rows) == 0:
                     return

-    def __num_pending_rows(self) -> int:
-        return len(self.in_flight_rows) + len(self.ready_rows)
-
     def __has_ready_batch(self) -> bool:
         """True if there are >= BATCH_SIZES entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
         return (
             sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
         )

-    def __ready_prefix_len(self) -> int:
-        """Length of the non-None prefix of ready_rows (= what we can return right now)"""
-        return sum(1 for _ in itertools.takewhile(lambda x: x is not None, self.ready_rows))
-
     def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
         if row_idx is None:
             self.ready_rows.append(row)

@@ -129,50 +137,36 @@ class CachePrefetchNode(ExecNode):
             self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
             self.ready_rows[idx] = row

-    def
-        """Wait for in-flight requests to complete until we have a full batch of rows"""
+    def __process_completions(self, done: set[futures.Future], ignore_errors: bool) -> None:
         file_cache = FileCache.get()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    async def __submit_input_batch(
-        self, input: AsyncIterator[DataRowBatch], executor: futures.ThreadPoolExecutor
-    ) -> None:
-        assert not self.input_finished
-        input_batch: Optional[DataRowBatch]
-        try:
-            input_batch = await anext(input)
-        except StopAsyncIteration:
-            input_batch = None
-        if input_batch is None:
-            self.input_finished = True
-            return
-
+        for f in done:
+            url = self.in_flight_requests.pop(f)
+            tmp_path, exc = f.result()
+            if exc is not None and not ignore_errors:
+                raise exc
+            local_path: Optional[Path] = None
+            if tmp_path is not None:
+                # register the file with the cache for the first column in which it's missing
+                assert url in self.in_flight_urls
+                _, info = self.in_flight_urls[url][0]
+                local_path = file_cache.add(info.col.tbl.id, info.col.id, url, tmp_path)
+                _logger.debug(f'cached {url} as {local_path}')
+
+            # add the local path/exception to the slots that reference the url
+            for row, info in self.in_flight_urls.pop(url):
+                if exc is not None:
+                    self.row_builder.set_exc(row, info.slot_idx, exc)
+                else:
+                    assert local_path is not None
+                    row.set_file_path(info.slot_idx, str(local_path))
+                state = self.in_flight_rows[id(row)]
+                state.num_missing -= 1
+                if state.num_missing == 0:
+                    del self.in_flight_rows[id(row)]
+                    self.__add_ready_row(row, state.idx)
+
+    def __process_input_batch(self, input_batch: DataRowBatch, executor: futures.ThreadPoolExecutor) -> None:
+        """Process a batch of input rows, submitting URLs for download and adding ready rows to ready_rows"""
         file_cache = FileCache.get()

         # URLs from this input batch that aren't already in the file cache;

@@ -180,7 +174,7 @@ class CachePrefetchNode(ExecNode):
         # the time it takes to get the next batch together
         cache_misses: list[str] = []

-        url_pos: dict[str, int] = {}  # url -> row_idx; used for logging
+        url_pos: dict[str, Optional[int]] = {}  # url -> row_idx; used for logging
         for row in input_batch:
             # identify missing local files in input batch, or fill in their paths if they're already cached
             num_missing = 0

@@ -221,6 +215,8 @@ class CachePrefetchNode(ExecNode):

     def __fetch_url(self, url: str) -> tuple[Optional[Path], Optional[Exception]]:
         """Fetches a remote URL into the TempStore and returns its path"""
+        from pixeltable.utils.local_store import TempStore
+
         _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
         parsed = urllib.parse.urlparse(url)
         # Use len(parsed.scheme) > 1 here to ensure we're not being passed

@@ -234,31 +230,11 @@ class CachePrefetchNode(ExecNode):
         tmp_path = TempStore.create_path(extension=extension)
         try:
             _logger.debug(f'Downloading {url} to {tmp_path}')
-
-                from pixeltable.utils.s3 import get_client
-
-                with self.boto_client_lock:
-                    if self.boto_client is None:
-                        config = {
-                            'max_pool_connections': self.NUM_EXECUTOR_THREADS + 4,  # +4: leave some headroom
-                            'connect_timeout': 5,
-                            'read_timeout': 30,
-                            'retries': {'max_attempts': 3, 'mode': 'adaptive'},
-                        }
-                        self.boto_client = get_client(**config)
-                self.boto_client.download_file(parsed.netloc, parsed.path.lstrip('/'), str(tmp_path))
-            elif parsed.scheme in ('http', 'https'):
-                with urllib.request.urlopen(url) as resp, open(tmp_path, 'wb') as f:
-                    data = resp.read()
-                    f.write(data)
-            else:
-                raise AssertionError(f'Unsupported URL scheme: {parsed.scheme}')
+            ObjectOps.copy_object_to_local_file(url, tmp_path)
             _logger.debug(f'Downloaded {url} to {tmp_path}')
             return tmp_path, None
         except Exception as e:
             # we want to add the file url to the exception message
             exc = excs.Error(f'Failed to download {url}: {e}')
             _logger.debug(f'Failed to download {url}: {e}', exc_info=e)
-            if not self.ctx.ignore_errors:
-                raise exc from None  # suppress original exception
             return None, exc
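The rewritten CachePrefetchNode keeps its download pool filled up to QUEUE_DEPTH_HIGH_WATER in-flight requests, then blocks on completions until the backlog drops below QUEUE_DEPTH_LOW_WATER before topping it up again. Below is a standalone sketch of that high/low-water loop over a ThreadPoolExecutor, with illustrative names rather than pixeltable's node internals:

from concurrent import futures
from typing import Callable, Iterable

HIGH_WATER, LOW_WATER, MAX_WORKERS = 50, 20, 15

def prefetch(urls: Iterable[str], fetch: Callable[[str], None]) -> None:
    it = iter(urls)
    in_flight: set[futures.Future] = set()
    exhausted = False
    with futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        while True:
            # submit work until the high-water mark is reached or the input runs dry
            while not exhausted and len(in_flight) < HIGH_WATER:
                try:
                    in_flight.add(pool.submit(fetch, next(it)))
                except StopIteration:
                    exhausted = True
            if not in_flight:
                return
            # drain completions until the backlog falls below the low-water mark
            while len(in_flight) > LOW_WATER or (exhausted and in_flight):
                done, in_flight = futures.wait(in_flight, return_when=futures.FIRST_COMPLETED)
                for f in done:
                    f.result()  # surface download errors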
pixeltable/exec/data_row_batch.py
CHANGED

@@ -12,15 +12,14 @@ class DataRowBatch:
     """Set of DataRows, indexed by rowid.

     Contains the metadata needed to initialize DataRows.
+
+    Requires either num_rows or rows to be specified, but not both.
     """

     row_builder: exprs.RowBuilder
     rows: list[exprs.DataRow]

     def __init__(self, row_builder: exprs.RowBuilder, rows: Optional[list[exprs.DataRow]] = None):
-        """
-        Requires either num_rows or rows to be specified, but not both.
-        """
         self.row_builder = row_builder
         self.rows = [] if rows is None else rows

@@ -39,20 +38,5 @@ class DataRowBatch:
     def __getitem__(self, index: int) -> exprs.DataRow:
         return self.rows[index]

-    def flush_imgs(
-        self, idx_range: Optional[slice], stored_img_info: list[exprs.ColumnSlotIdx], flushed_img_slots: list[int]
-    ) -> None:
-        """Flushes images in the given range of rows."""
-        if len(stored_img_info) == 0 and len(flushed_img_slots) == 0:
-            return
-
-        if idx_range is None:
-            idx_range = slice(0, len(self.rows))
-        for row in self.rows[idx_range]:
-            for info in stored_img_info:
-                row.flush_img(info.slot_idx, info.col)
-            for slot_idx in flushed_img_slots:
-                row.flush_img(slot_idx)
-
     def __iter__(self) -> Iterator[exprs.DataRow]:
         return iter(self.rows)