pixeltable 0.4.13__py3-none-any.whl → 0.4.15__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of pixeltable might be problematic.
- pixeltable/__init__.py +2 -1
- pixeltable/catalog/catalog.py +187 -63
- pixeltable/catalog/column.py +24 -20
- pixeltable/catalog/table.py +24 -8
- pixeltable/catalog/table_metadata.py +1 -0
- pixeltable/catalog/table_version.py +16 -34
- pixeltable/catalog/update_status.py +12 -0
- pixeltable/catalog/view.py +22 -22
- pixeltable/config.py +2 -0
- pixeltable/dataframe.py +4 -2
- pixeltable/env.py +46 -21
- pixeltable/exec/__init__.py +1 -0
- pixeltable/exec/aggregation_node.py +0 -1
- pixeltable/exec/cache_prefetch_node.py +74 -98
- pixeltable/exec/data_row_batch.py +2 -18
- pixeltable/exec/expr_eval/expr_eval_node.py +11 -0
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/object_store_save_node.py +299 -0
- pixeltable/exec/sql_node.py +28 -33
- pixeltable/exprs/data_row.py +31 -25
- pixeltable/exprs/json_path.py +6 -5
- pixeltable/exprs/row_builder.py +6 -12
- pixeltable/functions/gemini.py +1 -1
- pixeltable/functions/openai.py +1 -1
- pixeltable/functions/video.py +128 -15
- pixeltable/functions/whisperx.py +2 -0
- pixeltable/functions/yolox.py +2 -0
- pixeltable/globals.py +49 -30
- pixeltable/index/embedding_index.py +5 -8
- pixeltable/io/__init__.py +1 -0
- pixeltable/io/fiftyone.py +1 -1
- pixeltable/io/label_studio.py +4 -5
- pixeltable/iterators/__init__.py +1 -0
- pixeltable/iterators/audio.py +1 -1
- pixeltable/iterators/document.py +10 -12
- pixeltable/iterators/video.py +1 -1
- pixeltable/metadata/schema.py +7 -0
- pixeltable/plan.py +26 -1
- pixeltable/share/packager.py +8 -2
- pixeltable/share/publish.py +3 -10
- pixeltable/store.py +1 -1
- pixeltable/type_system.py +1 -3
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/object_stores.py +497 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +354 -0
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/METADATA +1 -1
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/RECORD +53 -50
- pixeltable/utils/media_store.py +0 -248
- pixeltable/utils/s3.py +0 -17
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/licenses/LICENSE +0 -0
--- a/pixeltable/exec/cache_prefetch_node.py
+++ b/pixeltable/exec/cache_prefetch_node.py
@@ -9,12 +9,12 @@ import urllib.request
 from collections import deque
 from concurrent import futures
 from pathlib import Path
-from typing import
+from typing import AsyncIterator, Iterator, Optional
 from uuid import UUID
 
 from pixeltable import exceptions as excs, exprs
 from pixeltable.utils.filecache import FileCache
-from pixeltable.utils.
+from pixeltable.utils.object_stores import ObjectOps
 
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
@@ -26,16 +26,17 @@ class CachePrefetchNode(ExecNode):
     """Brings files with external URLs into the cache
 
     TODO:
-    -
+    - Process a row at a time and limit the number of in-flight rows to control memory usage
+    - Create asyncio.Tasks to consume our input in order to increase concurrency.
     """
 
+    QUEUE_DEPTH_HIGH_WATER = 50  # target number of in-flight requests
+    QUEUE_DEPTH_LOW_WATER = 20  # target number of in-flight requests
     BATCH_SIZE = 16
-
+    MAX_WORKERS = 15
 
     retain_input_order: bool  # if True, return rows in the exact order they were received
     file_col_info: list[exprs.ColumnSlotIdx]
-    boto_client: Optional[Any]
-    boto_client_lock: threading.Lock
 
     # execution state
     num_returned_rows: int
@@ -64,10 +65,6 @@ class CachePrefetchNode(ExecNode):
         self.retain_input_order = retain_input_order
         self.file_col_info = file_col_info
 
-        # clients for specific services are constructed as needed, because it's time-consuming
-        self.boto_client = None
-        self.boto_client_lock = threading.Lock()
-
        self.num_returned_rows = 0
         self.ready_rows = deque()
         self.in_flight_rows = {}
@@ -75,24 +72,42 @@ class CachePrefetchNode(ExecNode):
         self.in_flight_urls = {}
         self.input_finished = False
         self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+        assert self.QUEUE_DEPTH_HIGH_WATER > self.QUEUE_DEPTH_LOW_WATER
 
-
-
-
-            # we create enough in-flight requests to fill the first batch
-            while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
-                await self.__submit_input_batch(input_iter, executor)
+    @property
+    def queued_work(self) -> int:
+        return len(self.in_flight_requests)
 
-
-
-
-
-
-
-
-
+    async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> Optional[DataRowBatch]:
+        """Get the next batch of input rows, or None if there are no more rows"""
+        try:
+            input_batch = await anext(input_iter)
+            if input_batch is None:
+                self.input_finished = True
+            return input_batch
+        except StopAsyncIteration:
+            self.input_finished = True
+            return None
 
-
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        input_iter = aiter(self.input)
+        with futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            while True:
+                # Create work to fill the queue to the high water mark ... ?without overrunning the in-flight row limit.
+                while not self.input_finished and self.queued_work < self.QUEUE_DEPTH_HIGH_WATER:
+                    input_batch = await self.get_input_batch(input_iter)
+                    if input_batch is not None:
+                        self.__process_input_batch(input_batch, executor)
+
+                # Wait for enough completions to enable more queueing or if we're done
+                while self.queued_work > self.QUEUE_DEPTH_LOW_WATER or (self.input_finished and self.queued_work > 0):
+                    done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+                    self.__process_completions(done, ignore_errors=self.ctx.ignore_errors)
+
+                # Emit results to meet batch size requirements or empty the in-flight row queue
+                if self.__has_ready_batch() or (
+                    len(self.ready_rows) > 0 and self.input_finished and self.queued_work == 0
+                ):
                     # create DataRowBatch from the first BATCH_SIZE ready rows
                     batch = DataRowBatch(self.row_builder)
                     rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
@@ -103,22 +118,15 @@ class CachePrefetchNode(ExecNode):
                     _logger.debug(f'returning {len(rows)} rows')
                     yield batch
 
-                if self.input_finished and self.
+                if self.input_finished and self.queued_work == 0 and len(self.ready_rows) == 0:
                     return
 
-    def __num_pending_rows(self) -> int:
-        return len(self.in_flight_rows) + len(self.ready_rows)
-
     def __has_ready_batch(self) -> bool:
         """True if there are >= BATCH_SIZES entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
         return (
             sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
         )
 
-    def __ready_prefix_len(self) -> int:
-        """Length of the non-None prefix of ready_rows (= what we can return right now)"""
-        return sum(1 for _ in itertools.takewhile(lambda x: x is not None, self.ready_rows))
-
     def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
         if row_idx is None:
             self.ready_rows.append(row)
@@ -129,50 +137,36 @@ class CachePrefetchNode(ExecNode):
                 self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
             self.ready_rows[idx] = row
 
-    def
-        """Wait for in-flight requests to complete until we have a full batch of rows"""
+    def __process_completions(self, done: set[futures.Future], ignore_errors: bool) -> None:
         file_cache = FileCache.get()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    async def __submit_input_batch(
-        self, input: AsyncIterator[DataRowBatch], executor: futures.ThreadPoolExecutor
-    ) -> None:
-        assert not self.input_finished
-        input_batch: Optional[DataRowBatch]
-        try:
-            input_batch = await anext(input)
-        except StopAsyncIteration:
-            input_batch = None
-        if input_batch is None:
-            self.input_finished = True
-            return
-
+        for f in done:
+            url = self.in_flight_requests.pop(f)
+            tmp_path, exc = f.result()
+            if exc is not None and not ignore_errors:
+                raise exc
+            local_path: Optional[Path] = None
+            if tmp_path is not None:
+                # register the file with the cache for the first column in which it's missing
+                assert url in self.in_flight_urls
+                _, info = self.in_flight_urls[url][0]
+                local_path = file_cache.add(info.col.tbl.id, info.col.id, url, tmp_path)
+                _logger.debug(f'cached {url} as {local_path}')
+
+            # add the local path/exception to the slots that reference the url
+            for row, info in self.in_flight_urls.pop(url):
+                if exc is not None:
+                    self.row_builder.set_exc(row, info.slot_idx, exc)
+                else:
+                    assert local_path is not None
+                    row.set_file_path(info.slot_idx, str(local_path))
+                state = self.in_flight_rows[id(row)]
+                state.num_missing -= 1
+                if state.num_missing == 0:
+                    del self.in_flight_rows[id(row)]
+                    self.__add_ready_row(row, state.idx)
+
+    def __process_input_batch(self, input_batch: DataRowBatch, executor: futures.ThreadPoolExecutor) -> None:
+        """Process a batch of input rows, submitting URLs for download and adding ready rows to ready_rows"""
         file_cache = FileCache.get()
 
         # URLs from this input batch that aren't already in the file cache;
@@ -180,7 +174,7 @@ class CachePrefetchNode(ExecNode):
         # the time it takes to get the next batch together
         cache_misses: list[str] = []
 
-        url_pos: dict[str, int] = {}  # url -> row_idx; used for logging
+        url_pos: dict[str, Optional[int]] = {}  # url -> row_idx; used for logging
         for row in input_batch:
             # identify missing local files in input batch, or fill in their paths if they're already cached
             num_missing = 0
@@ -221,6 +215,8 @@ class CachePrefetchNode(ExecNode):
 
     def __fetch_url(self, url: str) -> tuple[Optional[Path], Optional[Exception]]:
         """Fetches a remote URL into the TempStore and returns its path"""
+        from pixeltable.utils.local_store import TempStore
+
         _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
         parsed = urllib.parse.urlparse(url)
         # Use len(parsed.scheme) > 1 here to ensure we're not being passed
@@ -234,31 +230,11 @@ class CachePrefetchNode(ExecNode):
         tmp_path = TempStore.create_path(extension=extension)
         try:
             _logger.debug(f'Downloading {url} to {tmp_path}')
-
-                from pixeltable.utils.s3 import get_client
-
-                with self.boto_client_lock:
-                    if self.boto_client is None:
-                        config = {
-                            'max_pool_connections': self.NUM_EXECUTOR_THREADS + 4,  # +4: leave some headroom
-                            'connect_timeout': 5,
-                            'read_timeout': 30,
-                            'retries': {'max_attempts': 3, 'mode': 'adaptive'},
-                        }
-                        self.boto_client = get_client(**config)
-                self.boto_client.download_file(parsed.netloc, parsed.path.lstrip('/'), str(tmp_path))
-            elif parsed.scheme in ('http', 'https'):
-                with urllib.request.urlopen(url) as resp, open(tmp_path, 'wb') as f:
-                    data = resp.read()
-                    f.write(data)
-            else:
-                raise AssertionError(f'Unsupported URL scheme: {parsed.scheme}')
+            ObjectOps.copy_object_to_local_file(url, tmp_path)
             _logger.debug(f'Downloaded {url} to {tmp_path}')
             return tmp_path, None
         except Exception as e:
             # we want to add the file url to the exception message
             exc = excs.Error(f'Failed to download {url}: {e}')
             _logger.debug(f'Failed to download {url}: {e}', exc_info=e)
-            if not self.ctx.ignore_errors:
-                raise exc from None  # suppress original exception
             return None, exc
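
Note: the rewrite above drops the node's hand-rolled boto3 client in favor of ObjectOps.copy_object_to_local_file() and throttles downloads with a high/low-water-mark loop over a ThreadPoolExecutor. The following is a minimal, self-contained sketch of that back-pressure pattern only; fetch(), run(), and the example URLs are hypothetical stand-ins rather than pixeltable APIs.

    # Sketch of the high/low-water-mark throttling used in the diff above.
    # `fetch` and `urls` are placeholders; the pixeltable node applies the same
    # pattern to ObjectOps.copy_object_to_local_file() calls.
    from concurrent import futures

    HIGH_WATER = 50  # stop submitting new work once this many requests are in flight
    LOW_WATER = 20   # resume submitting once in-flight work drains below this

    def fetch(url: str) -> str:
        # placeholder for a real download
        return f'downloaded {url}'

    def run(urls: list[str], max_workers: int = 15) -> list[str]:
        results: list[str] = []
        pending = iter(urls)
        in_flight: dict[futures.Future, str] = {}  # Future -> url, mirrors in_flight_requests
        exhausted = False
        with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            while True:
                # fill the queue up to the high-water mark
                while not exhausted and len(in_flight) < HIGH_WATER:
                    url = next(pending, None)
                    if url is None:
                        exhausted = True
                    else:
                        in_flight[executor.submit(fetch, url)] = url
                # drain until we drop below the low-water mark (or all work is done)
                while len(in_flight) > LOW_WATER or (exhausted and in_flight):
                    done, _ = futures.wait(in_flight, return_when=futures.FIRST_COMPLETED)
                    for f in done:
                        in_flight.pop(f)
                        results.append(f.result())
                if exhausted and not in_flight:
                    return results

    print(len(run([f'https://example.com/{i}' for i in range(200)])))

Using futures.wait(..., FIRST_COMPLETED) between the two thresholds keeps the number of queued downloads bounded, so memory use stays flat regardless of how many rows the upstream node produces.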
--- a/pixeltable/exec/data_row_batch.py
+++ b/pixeltable/exec/data_row_batch.py
@@ -12,15 +12,14 @@ class DataRowBatch:
     """Set of DataRows, indexed by rowid.
 
     Contains the metadata needed to initialize DataRows.
+
+    Requires either num_rows or rows to be specified, but not both.
     """
 
     row_builder: exprs.RowBuilder
     rows: list[exprs.DataRow]
 
     def __init__(self, row_builder: exprs.RowBuilder, rows: Optional[list[exprs.DataRow]] = None):
-        """
-        Requires either num_rows or rows to be specified, but not both.
-        """
         self.row_builder = row_builder
         self.rows = [] if rows is None else rows
 
@@ -39,20 +38,5 @@ class DataRowBatch:
     def __getitem__(self, index: int) -> exprs.DataRow:
         return self.rows[index]
 
-    def flush_imgs(
-        self, idx_range: Optional[slice], stored_img_info: list[exprs.ColumnSlotIdx], flushed_img_slots: list[int]
-    ) -> None:
-        """Flushes images in the given range of rows."""
-        if len(stored_img_info) == 0 and len(flushed_img_slots) == 0:
-            return
-
-        if idx_range is None:
-            idx_range = slice(0, len(self.rows))
-        for row in self.rows[idx_range]:
-            for info in stored_img_info:
-                row.flush_img(info.slot_idx, info.col)
-            for slot_idx in flushed_img_slots:
-                row.flush_img(slot_idx)
-
     def __iter__(self) -> Iterator[exprs.DataRow]:
         return iter(self.rows)
--- a/pixeltable/exec/expr_eval/expr_eval_node.py
+++ b/pixeltable/exec/expr_eval/expr_eval_node.py
@@ -390,6 +390,17 @@ class ExprEvalNode(ExecNode):
         # end the main loop if we had an unhandled exception
         try:
             t.result()
+        except KeyboardInterrupt:
+            # ExprEvalNode instances are long-running and reused across multiple operations.
+            # When a user interrupts an operation (Ctrl+C), the main evaluation loop properly
+            # handles the KeyboardInterrupt and terminates the current operation. However,
+            # background tasks spawned by evaluators may complete asynchronously after the
+            # operation has ended, and their done callbacks will fire during subsequent
+            # operations. These "phantom" KeyboardInterrupt exceptions from previous
+            # operations' background tasks should not interfere with new operations, so we
+            # absorb them here rather than propagating them via self.error/self.exc_event.
+            _logger.debug('Task completed with KeyboardInterrupt (user cancellation)')
+            pass
         except asyncio.CancelledError:
             pass
         except Exception as exc:
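
Note: the new except KeyboardInterrupt branch absorbs interrupts that surface when a finished task's result() is read in a done callback. Below is a minimal sketch of that situation using only the standard library; on_done() and the simulated future are illustrative, not pixeltable code.

    # Inspecting a finished future in a done callback and absorbing a stale
    # KeyboardInterrupt, in the same shape as the hunk above.
    import asyncio
    import logging

    logging.basicConfig(level=logging.DEBUG)
    _logger = logging.getLogger('example')

    def on_done(fut: asyncio.Future) -> None:
        try:
            fut.result()  # re-raises whatever the task/future ended with
        except KeyboardInterrupt:
            # stale interrupt from a previous operation: log it and move on
            _logger.debug('Task completed with KeyboardInterrupt (user cancellation)')
        except asyncio.CancelledError:
            pass
        except Exception as exc:
            _logger.error('task failed: %s', exc)

    async def main() -> None:
        loop = asyncio.get_running_loop()
        fut: asyncio.Future = loop.create_future()
        fut.add_done_callback(on_done)
        fut.set_exception(KeyboardInterrupt())  # simulate a task that ended with Ctrl+C
        await asyncio.sleep(0)  # give the loop a chance to run the callback

    asyncio.run(main())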
--- a/pixeltable/exec/in_memory_data_node.py
+++ b/pixeltable/exec/in_memory_data_node.py
@@ -2,7 +2,7 @@ import logging
 from typing import Any, AsyncIterator, Optional
 
 from pixeltable import catalog, exprs
-from pixeltable.utils.
+from pixeltable.utils.local_store import TempStore
 
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
--- /dev/null
+++ b/pixeltable/exec/object_store_save_node.py
@@ -0,0 +1,299 @@
+from __future__ import annotations
+
+import dataclasses
+import itertools
+import logging
+from collections import defaultdict, deque
+from concurrent import futures
+from pathlib import Path
+from typing import AsyncIterator, Iterator, NamedTuple, Optional
+from uuid import UUID
+
+from pixeltable import exprs
+from pixeltable.utils.object_stores import ObjectOps, ObjectPath, StorageTarget
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+
+_logger = logging.getLogger('pixeltable')
+
+
+class ObjectStoreSaveNode(ExecNode):
+    """Save files into designated object store(s).
+
+    Each row may have multiple files that need to be saved to a destination.
+    Each file may be referenced by more than one column in the row.
+    Each file may have multiple destinations, e.g., S3 bucket and local file system.
+    If there are multiple destinations, the file cannot be moved to any destination
+    until it has been copied to all of the other destinations.
+    Diagrammatically:
+    Row -> [src_path1, src_path2, ...]
+    src_path -> [dest1, dest2, ...]
+    dest1: [row_location1, row_location2, ...]
+    Paths with multiple destinations are removed from the TempStore only after all destination copies are complete.
+
+    TODO:
+    - Process a row at a time and limit the number of in-flight rows to control memory usage
+    """
+
+    QUEUE_DEPTH_HIGH_WATER = 50  # target number of in-flight requests
+    QUEUE_DEPTH_LOW_WATER = 20  # target number of in-flight requests
+    BATCH_SIZE = 16
+    MAX_WORKERS = 15
+
+    class WorkDesignator(NamedTuple):
+        """Specify the source and destination for a WorkItem"""
+
+        src_path: str  # source of the file to be processed
+        destination: Optional[str]  # destination URI for the file to be processed
+
+    class WorkItem(NamedTuple):
+        src_path: Path
+        destination: Optional[str]
+        info: exprs.ColumnSlotIdx  # column info for the file being processed
+        destination_count: int = 1  # number of unique destinations for this file
+
+    retain_input_order: bool  # if True, return rows in the exact order they were received
+    file_col_info: list[exprs.ColumnSlotIdx]
+
+    # execution state
+    num_returned_rows: int
+
+    # ready_rows: rows that are ready to be returned, ordered by row idx;
+    # the implied row idx of ready_rows[0] is num_returned_rows
+    ready_rows: deque[Optional[exprs.DataRow]]
+
+    in_flight_rows: dict[int, ObjectStoreSaveNode.RowState]  # rows with in-flight work; id(row) -> RowState
+    in_flight_requests: dict[
+        futures.Future, WorkDesignator
+    ]  # in-flight requests to save paths: Future -> WorkDesignator
+    in_flight_work: dict[
+        WorkDesignator, list[tuple[exprs.DataRow, exprs.ColumnSlotIdx]]
+    ]  # WorkDesignator -> [(row, info)]
+
+    input_finished: bool
+    row_idx: Iterator[Optional[int]]
+
+    @dataclasses.dataclass
+    class RowState:
+        row: exprs.DataRow
+        idx: Optional[int]  # position in input stream; None if we don't retain input order
+        num_missing: int  # number of references to media files in this row
+        delete_destinations: list[Path]  # paths to delete after all copies are complete
+
+    def __init__(
+        self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True
+    ):
+        # input_/output_exprs=[]: we don't have anything to evaluate
+        super().__init__(input.row_builder, [], [], input)
+        self.retain_input_order = retain_input_order
+        self.file_col_info = file_col_info
+
+        self.num_returned_rows = 0
+        self.ready_rows = deque()
+        self.in_flight_rows = {}
+        self.in_flight_requests = {}
+        self.in_flight_work = {}
+        self.input_finished = False
+        self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+        assert self.QUEUE_DEPTH_HIGH_WATER > self.QUEUE_DEPTH_LOW_WATER
+
+    @property
+    def queued_work(self) -> int:
+        return len(self.in_flight_requests)
+
+    async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> Optional[DataRowBatch]:
+        """Get the next batch of input rows, or None if there are no more rows"""
+        try:
+            input_batch = await anext(input_iter)
+            if input_batch is None:
+                self.input_finished = True
+            return input_batch
+        except StopAsyncIteration:
+            self.input_finished = True
+            return None
+
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        input_iter = aiter(self.input)
+        with futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            while True:
+                # Create work to fill the queue to the high water mark ... ?without overrunning the in-flight row limit.
+                while not self.input_finished and self.queued_work < self.QUEUE_DEPTH_HIGH_WATER:
+                    input_batch = await self.get_input_batch(input_iter)
+                    if input_batch is not None:
+                        self.__process_input_batch(input_batch, executor)
+
+                # Wait for enough completions to enable more queueing or if we're done
+                while self.queued_work > self.QUEUE_DEPTH_LOW_WATER or (self.input_finished and self.queued_work > 0):
+                    done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+                    self.__process_completions(done, ignore_errors=self.ctx.ignore_errors)
+
+                # Emit results to meet batch size requirements or empty the in-flight row queue
+                if self.__has_ready_batch() or (
+                    len(self.ready_rows) > 0 and self.input_finished and self.queued_work == 0
+                ):
+                    # create DataRowBatch from the first BATCH_SIZE ready rows
+                    batch = DataRowBatch(self.row_builder)
+                    rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
+                    for row in rows:
+                        assert row is not None
+                        batch.add_row(row)
+                    self.num_returned_rows += len(rows)
+                    _logger.debug(f'returning {len(rows)} rows')
+                    yield batch
+
+                if self.input_finished and self.queued_work == 0 and len(self.ready_rows) == 0:
+                    return
+
+    def __has_ready_batch(self) -> bool:
+        """True if there are >= BATCH_SIZES entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
+        return (
+            sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
+        )
+
+    def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
+        if row_idx is None:
+            self.ready_rows.append(row)
+        else:
+            # extend ready_rows to accommodate row_idx
+            idx = row_idx - self.num_returned_rows
+            if idx >= len(self.ready_rows):
+                self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
+            self.ready_rows[idx] = row
+
+    def __process_completions(self, done: set[futures.Future], ignore_errors: bool) -> None:
+        from pixeltable.utils.local_store import TempStore
+
+        for f in done:
+            work_designator = self.in_flight_requests.pop(f)
+            new_file_url, exc = f.result()
+            if exc is not None and not ignore_errors:
+                raise exc
+            assert new_file_url is not None
+
+            # add the local path/exception to the slots that reference the url
+            for row, info in self.in_flight_work.pop(work_designator):
+                if exc is not None:
+                    self.row_builder.set_exc(row, info.slot_idx, exc)
+                else:
+                    row.file_urls[info.slot_idx] = new_file_url
+
+                state = self.in_flight_rows[id(row)]
+                state.num_missing -= 1
+                if state.num_missing == 0:
+                    # All operations for this row are complete. Delete all files which had multiple destinations
+                    for src_path in state.delete_destinations:
+                        TempStore.delete_media_file(src_path)
+                    del self.in_flight_rows[id(row)]
+                    self.__add_ready_row(row, state.idx)
+
+    def __process_input_row(self, row: exprs.DataRow) -> list[ObjectStoreSaveNode.WorkItem]:
+        """Process a batch of input rows, generating a list of work"""
+        from pixeltable.utils.local_store import LocalStore, TempStore
+
+        # Create a list of work to do for media storage in this row
+        row_idx = next(self.row_idx)
+        row_to_do: list[ObjectStoreSaveNode.WorkItem] = []
+        num_missing = 0
+        unique_destinations: dict[Path, int] = defaultdict(int)  # destination -> count of unique destinations
+
+        for info in self.file_col_info:
+            col, index = info
+            # we may need to store this imagehave yet to store this image
+            if row.prepare_col_val_for_save(index, col):
+                row.file_urls[index] = row.save_media_to_temp(index, col)
+
+            url = row.file_urls[index]
+            if url is None:
+                # nothing to do
+                continue
+
+            assert row.excs[index] is None
+            assert col.col_type.is_media_type()
+
+            destination = info.col.destination
+            soa = None if destination is None else ObjectPath.parse_object_storage_addr(destination, False)
+            if (
+                soa is not None
+                and soa.storage_target == StorageTarget.LOCAL_STORE
+                and LocalStore(soa).resolve_url(url) is not None
+            ):
+                # A local non-default destination was specified, and the url already points there
+                continue
+
+            src_path = LocalStore.file_url_to_path(url)
+            if src_path is None:
+                # The url does not point to a local file, do not attempt to copy/move it
+                continue
+
+            if destination is None and not TempStore.contains_path(src_path):
+                # Do not copy local file URLs to the LocalStore
+                continue
+
+            work_designator = ObjectStoreSaveNode.WorkDesignator(str(src_path), destination)
+            locations = self.in_flight_work.get(work_designator)
+            if locations is not None:
+                # we've already seen this
+                locations.append((row, info))
+                num_missing += 1
+                continue
+
+            work_item = ObjectStoreSaveNode.WorkItem(src_path, destination, info)
+            row_to_do.append(work_item)
+            self.in_flight_work[work_designator] = [(row, info)]
+            num_missing += 1
+            unique_destinations[src_path] += 1
+
+        # Update work items to reflect the number of unique destinations
+        new_to_do = []
+        for work_item in row_to_do:
+            if unique_destinations[work_item.src_path] == 1 and TempStore.contains_path(work_item.src_path):
+                new_to_do.append(work_item)
+            else:
+                new_to_do.append(
+                    ObjectStoreSaveNode.WorkItem(
+                        work_item.src_path,
+                        work_item.destination,
+                        work_item.info,
+                        destination_count=unique_destinations[work_item.src_path] + 1,
+                        # +1 for the TempStore destination
+                    )
+                )
+        delete_destinations = [k for k, v in unique_destinations.items() if v > 1 and TempStore.contains_path(k)]
+        row_to_do = new_to_do
+
+        if len(row_to_do) > 0:
+            self.in_flight_rows[id(row)] = self.RowState(
+                row, row_idx, num_missing, delete_destinations=delete_destinations
+            )
+        else:
+            self.__add_ready_row(row, row_idx)
+        return row_to_do
+
+    def __process_input_batch(self, input_batch: DataRowBatch, executor: futures.ThreadPoolExecutor) -> None:
+        """Process a batch of input rows, submitting temporary files for upload"""
+        work_to_do: list[ObjectStoreSaveNode.WorkItem] = []
+
+        for row in input_batch:
+            row_to_do = self.__process_input_row(row)
+            if len(row_to_do) > 0:
+                work_to_do.extend(row_to_do)
+
+        for work_item in work_to_do:
+            f = executor.submit(self.__persist_media_file, work_item)
+            self.in_flight_requests[f] = ObjectStoreSaveNode.WorkDesignator(
+                str(work_item.src_path), work_item.destination
+            )
+            _logger.debug(f'submitted {work_item}')
+
+    def __persist_media_file(self, work_item: WorkItem) -> tuple[Optional[str], Optional[Exception]]:
+        """Move data from the TempStore to another location"""
+        src_path = work_item.src_path
+        col = work_item.info.col
+        assert col.destination == work_item.destination
+        try:
+            new_file_url = ObjectOps.put_file(col, src_path, work_item.destination_count == 1)
+            return new_file_url, None
+        except Exception as e:
+            _logger.debug(f'Failed to move/copy {src_path}: {e}', exc_info=e)
+            return None, e