pixeltable 0.4.13__py3-none-any.whl → 0.4.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of pixeltable has been flagged as a potentially problematic release.

Files changed (55)
  1. pixeltable/__init__.py +2 -1
  2. pixeltable/catalog/catalog.py +187 -63
  3. pixeltable/catalog/column.py +24 -20
  4. pixeltable/catalog/table.py +24 -8
  5. pixeltable/catalog/table_metadata.py +1 -0
  6. pixeltable/catalog/table_version.py +16 -34
  7. pixeltable/catalog/update_status.py +12 -0
  8. pixeltable/catalog/view.py +22 -22
  9. pixeltable/config.py +2 -0
  10. pixeltable/dataframe.py +4 -2
  11. pixeltable/env.py +46 -21
  12. pixeltable/exec/__init__.py +1 -0
  13. pixeltable/exec/aggregation_node.py +0 -1
  14. pixeltable/exec/cache_prefetch_node.py +74 -98
  15. pixeltable/exec/data_row_batch.py +2 -18
  16. pixeltable/exec/expr_eval/expr_eval_node.py +11 -0
  17. pixeltable/exec/in_memory_data_node.py +1 -1
  18. pixeltable/exec/object_store_save_node.py +299 -0
  19. pixeltable/exec/sql_node.py +28 -33
  20. pixeltable/exprs/data_row.py +31 -25
  21. pixeltable/exprs/json_path.py +6 -5
  22. pixeltable/exprs/row_builder.py +6 -12
  23. pixeltable/functions/gemini.py +1 -1
  24. pixeltable/functions/openai.py +1 -1
  25. pixeltable/functions/video.py +128 -15
  26. pixeltable/functions/whisperx.py +2 -0
  27. pixeltable/functions/yolox.py +2 -0
  28. pixeltable/globals.py +49 -30
  29. pixeltable/index/embedding_index.py +5 -8
  30. pixeltable/io/__init__.py +1 -0
  31. pixeltable/io/fiftyone.py +1 -1
  32. pixeltable/io/label_studio.py +4 -5
  33. pixeltable/iterators/__init__.py +1 -0
  34. pixeltable/iterators/audio.py +1 -1
  35. pixeltable/iterators/document.py +10 -12
  36. pixeltable/iterators/video.py +1 -1
  37. pixeltable/metadata/schema.py +7 -0
  38. pixeltable/plan.py +26 -1
  39. pixeltable/share/packager.py +8 -2
  40. pixeltable/share/publish.py +3 -10
  41. pixeltable/store.py +1 -1
  42. pixeltable/type_system.py +1 -3
  43. pixeltable/utils/dbms.py +31 -5
  44. pixeltable/utils/gcs_store.py +283 -0
  45. pixeltable/utils/local_store.py +316 -0
  46. pixeltable/utils/object_stores.py +497 -0
  47. pixeltable/utils/pytorch.py +5 -6
  48. pixeltable/utils/s3_store.py +354 -0
  49. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/METADATA +1 -1
  50. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/RECORD +53 -50
  51. pixeltable/utils/media_store.py +0 -248
  52. pixeltable/utils/s3.py +0 -17
  53. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/WHEEL +0 -0
  54. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/entry_points.txt +0 -0
  55. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/licenses/LICENSE +0 -0
pixeltable/exec/cache_prefetch_node.py

@@ -9,12 +9,12 @@ import urllib.request
 from collections import deque
 from concurrent import futures
 from pathlib import Path
-from typing import Any, AsyncIterator, Iterator, Optional
+from typing import AsyncIterator, Iterator, Optional
 from uuid import UUID
 
 from pixeltable import exceptions as excs, exprs
 from pixeltable.utils.filecache import FileCache
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.object_stores import ObjectOps
 
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
@@ -26,16 +26,17 @@ class CachePrefetchNode(ExecNode):
     """Brings files with external URLs into the cache
 
     TODO:
-    - adapting the number of download threads at runtime to maximize throughput
+    - Process a row at a time and limit the number of in-flight rows to control memory usage
+    - Create asyncio.Tasks to consume our input in order to increase concurrency.
     """
 
+    QUEUE_DEPTH_HIGH_WATER = 50  # target number of in-flight requests
+    QUEUE_DEPTH_LOW_WATER = 20  # target number of in-flight requests
     BATCH_SIZE = 16
-    NUM_EXECUTOR_THREADS = 16
+    MAX_WORKERS = 15
 
     retain_input_order: bool  # if True, return rows in the exact order they were received
     file_col_info: list[exprs.ColumnSlotIdx]
-    boto_client: Optional[Any]
-    boto_client_lock: threading.Lock
 
     # execution state
     num_returned_rows: int
@@ -64,10 +65,6 @@ class CachePrefetchNode(ExecNode):
         self.retain_input_order = retain_input_order
         self.file_col_info = file_col_info
 
-        # clients for specific services are constructed as needed, because it's time-consuming
-        self.boto_client = None
-        self.boto_client_lock = threading.Lock()
-
         self.num_returned_rows = 0
         self.ready_rows = deque()
         self.in_flight_rows = {}
@@ -75,24 +72,42 @@ class CachePrefetchNode(ExecNode):
         self.in_flight_urls = {}
         self.input_finished = False
         self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+        assert self.QUEUE_DEPTH_HIGH_WATER > self.QUEUE_DEPTH_LOW_WATER
 
-    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
-        input_iter = self.input.__aiter__()
-        with futures.ThreadPoolExecutor(max_workers=self.NUM_EXECUTOR_THREADS) as executor:
-            # we create enough in-flight requests to fill the first batch
-            while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
-                await self.__submit_input_batch(input_iter, executor)
+    @property
+    def queued_work(self) -> int:
+        return len(self.in_flight_requests)
 
-            while True:
-                # try to assemble a full batch of output rows
-                if not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
-                    self.__wait_for_requests()
-
-                # try to create enough in-flight requests to fill the next batch
-                while not self.input_finished and self.__num_pending_rows() < self.BATCH_SIZE:
-                    await self.__submit_input_batch(input_iter, executor)
+    async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> Optional[DataRowBatch]:
+        """Get the next batch of input rows, or None if there are no more rows"""
+        try:
+            input_batch = await anext(input_iter)
+            if input_batch is None:
+                self.input_finished = True
+            return input_batch
+        except StopAsyncIteration:
+            self.input_finished = True
+            return None
 
-                if len(self.ready_rows) > 0:
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        input_iter = aiter(self.input)
+        with futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            while True:
+                # Create work to fill the queue to the high water mark
+                while not self.input_finished and self.queued_work < self.QUEUE_DEPTH_HIGH_WATER:
+                    input_batch = await self.get_input_batch(input_iter)
+                    if input_batch is not None:
+                        self.__process_input_batch(input_batch, executor)
+
+                # Wait for enough completions to enable more queueing or if we're done
+                while self.queued_work > self.QUEUE_DEPTH_LOW_WATER or (self.input_finished and self.queued_work > 0):
+                    done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+                    self.__process_completions(done, ignore_errors=self.ctx.ignore_errors)
+
+                # Emit results to meet batch size requirements or empty the in-flight row queue
+                if self.__has_ready_batch() or (
+                    len(self.ready_rows) > 0 and self.input_finished and self.queued_work == 0
+                ):
                     # create DataRowBatch from the first BATCH_SIZE ready rows
                     batch = DataRowBatch(self.row_builder)
                     rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
@@ -103,22 +118,15 @@ class CachePrefetchNode(ExecNode):
                     _logger.debug(f'returning {len(rows)} rows')
                     yield batch
 
-                if self.input_finished and self.__num_pending_rows() == 0:
+                if self.input_finished and self.queued_work == 0 and len(self.ready_rows) == 0:
                     return
 
-    def __num_pending_rows(self) -> int:
-        return len(self.in_flight_rows) + len(self.ready_rows)
-
     def __has_ready_batch(self) -> bool:
         """True if there are >= BATCH_SIZES entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
         return (
            sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
         )
 
-    def __ready_prefix_len(self) -> int:
-        """Length of the non-None prefix of ready_rows (= what we can return right now)"""
-        return sum(1 for _ in itertools.takewhile(lambda x: x is not None, self.ready_rows))
-
     def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
         if row_idx is None:
             self.ready_rows.append(row)
@@ -129,50 +137,36 @@ class CachePrefetchNode(ExecNode):
                 self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
             self.ready_rows[idx] = row
 
-    def __wait_for_requests(self) -> None:
-        """Wait for in-flight requests to complete until we have a full batch of rows"""
+    def __process_completions(self, done: set[futures.Future], ignore_errors: bool) -> None:
         file_cache = FileCache.get()
-        _logger.debug(f'waiting for requests; ready_batch_size={self.__ready_prefix_len()}')
-        while not self.__has_ready_batch() and len(self.in_flight_requests) > 0:
-            done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
-            for f in done:
-                url = self.in_flight_requests.pop(f)
-                tmp_path, exc = f.result()
-                local_path: Optional[Path] = None
-                if tmp_path is not None:
-                    # register the file with the cache for the first column in which it's missing
-                    assert url in self.in_flight_urls
-                    _, info = self.in_flight_urls[url][0]
-                    local_path = file_cache.add(info.col.tbl.id, info.col.id, url, tmp_path)
-                    _logger.debug(f'cached {url} as {local_path}')
-
-                # add the local path/exception to the slots that reference the url
-                for row, info in self.in_flight_urls.pop(url):
-                    if exc is not None:
-                        self.row_builder.set_exc(row, info.slot_idx, exc)
-                    else:
-                        assert local_path is not None
-                        row.set_file_path(info.slot_idx, str(local_path))
-                    state = self.in_flight_rows[id(row)]
-                    state.num_missing -= 1
-                    if state.num_missing == 0:
-                        del self.in_flight_rows[id(row)]
-                        self.__add_ready_row(row, state.idx)
-                        _logger.debug(f'row {state.idx} is ready (ready_batch_size={self.__ready_prefix_len()})')
-
-    async def __submit_input_batch(
-        self, input: AsyncIterator[DataRowBatch], executor: futures.ThreadPoolExecutor
-    ) -> None:
-        assert not self.input_finished
-        input_batch: Optional[DataRowBatch]
-        try:
-            input_batch = await anext(input)
-        except StopAsyncIteration:
-            input_batch = None
-        if input_batch is None:
-            self.input_finished = True
-            return
-
+        for f in done:
+            url = self.in_flight_requests.pop(f)
+            tmp_path, exc = f.result()
+            if exc is not None and not ignore_errors:
+                raise exc
+            local_path: Optional[Path] = None
+            if tmp_path is not None:
+                # register the file with the cache for the first column in which it's missing
+                assert url in self.in_flight_urls
+                _, info = self.in_flight_urls[url][0]
+                local_path = file_cache.add(info.col.tbl.id, info.col.id, url, tmp_path)
+                _logger.debug(f'cached {url} as {local_path}')
+
+            # add the local path/exception to the slots that reference the url
+            for row, info in self.in_flight_urls.pop(url):
+                if exc is not None:
+                    self.row_builder.set_exc(row, info.slot_idx, exc)
+                else:
+                    assert local_path is not None
+                    row.set_file_path(info.slot_idx, str(local_path))
+                state = self.in_flight_rows[id(row)]
+                state.num_missing -= 1
+                if state.num_missing == 0:
+                    del self.in_flight_rows[id(row)]
+                    self.__add_ready_row(row, state.idx)
+
+    def __process_input_batch(self, input_batch: DataRowBatch, executor: futures.ThreadPoolExecutor) -> None:
+        """Process a batch of input rows, submitting URLs for download and adding ready rows to ready_rows"""
 
         file_cache = FileCache.get()
 
@@ -180,7 +174,7 @@ class CachePrefetchNode(ExecNode):
        # the time it takes to get the next batch together
        cache_misses: list[str] = []
 
-        url_pos: dict[str, int] = {}  # url -> row_idx; used for logging
+        url_pos: dict[str, Optional[int]] = {}  # url -> row_idx; used for logging
        for row in input_batch:
            # identify missing local files in input batch, or fill in their paths if they're already cached
            num_missing = 0
@@ -221,6 +215,8 @@ class CachePrefetchNode(ExecNode):
 
     def __fetch_url(self, url: str) -> tuple[Optional[Path], Optional[Exception]]:
         """Fetches a remote URL into the TempStore and returns its path"""
+        from pixeltable.utils.local_store import TempStore
+
         _logger.debug(f'fetching url={url} thread_name={threading.current_thread().name}')
         parsed = urllib.parse.urlparse(url)
         # Use len(parsed.scheme) > 1 here to ensure we're not being passed
@@ -234,31 +230,11 @@ class CachePrefetchNode(ExecNode):
         tmp_path = TempStore.create_path(extension=extension)
         try:
             _logger.debug(f'Downloading {url} to {tmp_path}')
-            if parsed.scheme == 's3':
-                from pixeltable.utils.s3 import get_client
-
-                with self.boto_client_lock:
-                    if self.boto_client is None:
-                        config = {
-                            'max_pool_connections': self.NUM_EXECUTOR_THREADS + 4,  # +4: leave some headroom
-                            'connect_timeout': 5,
-                            'read_timeout': 30,
-                            'retries': {'max_attempts': 3, 'mode': 'adaptive'},
-                        }
-                        self.boto_client = get_client(**config)
-                self.boto_client.download_file(parsed.netloc, parsed.path.lstrip('/'), str(tmp_path))
-            elif parsed.scheme in ('http', 'https'):
-                with urllib.request.urlopen(url) as resp, open(tmp_path, 'wb') as f:
-                    data = resp.read()
-                    f.write(data)
-            else:
-                raise AssertionError(f'Unsupported URL scheme: {parsed.scheme}')
+            ObjectOps.copy_object_to_local_file(url, tmp_path)
             _logger.debug(f'Downloaded {url} to {tmp_path}')
             return tmp_path, None
         except Exception as e:
             # we want to add the file url to the exception message
             exc = excs.Error(f'Failed to download {url}: {e}')
             _logger.debug(f'Failed to download {url}: {e}', exc_info=e)
-            if not self.ctx.ignore_errors:
-                raise exc from None  # suppress original exception
             return None, exc
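
The rewritten __aiter__ above replaces the old batch-at-a-time submission with high/low water marks on the number of in-flight downloads. For orientation, here is a minimal, self-contained sketch of that scheduling pattern; the fetch_one helper and the plain list of URLs are hypothetical stand-ins for the node's DataRowBatch input and ObjectOps-based download, not pixeltable's actual API:

    # Sketch only: fill the executor up to the high water mark, drain down to the
    # low water mark, repeat until the input is exhausted and all work has finished.
    import urllib.request
    from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait

    QUEUE_DEPTH_HIGH_WATER = 50  # stop submitting new downloads above this many in flight
    QUEUE_DEPTH_LOW_WATER = 20  # resume submitting once in-flight work drops to this level
    MAX_WORKERS = 15

    def fetch_one(url: str) -> bytes:
        with urllib.request.urlopen(url) as resp:
            return resp.read()

    def fetch_all(urls: list[str]) -> dict[str, bytes]:
        pending = iter(urls)
        in_flight: dict = {}  # Future -> url
        results: dict[str, bytes] = {}
        exhausted = False
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            while True:
                # fill the queue up to the high water mark
                while not exhausted and len(in_flight) < QUEUE_DEPTH_HIGH_WATER:
                    url = next(pending, None)
                    if url is None:
                        exhausted = True
                    else:
                        in_flight[executor.submit(fetch_one, url)] = url
                # drain completions until we drop to the low water mark (or finish)
                while len(in_flight) > QUEUE_DEPTH_LOW_WATER or (exhausted and in_flight):
                    done, _ = wait(in_flight, return_when=FIRST_COMPLETED)
                    for f in done:
                        results[in_flight.pop(f)] = f.result()
                if exhausted and not in_flight:
                    return results

Unlike the node, this sketch collects all results before returning; the node instead emits DataRowBatches of BATCH_SIZE rows as soon as a full prefix of rows is ready.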
pixeltable/exec/data_row_batch.py

@@ -12,15 +12,14 @@ class DataRowBatch:
     """Set of DataRows, indexed by rowid.
 
     Contains the metadata needed to initialize DataRows.
+
+    Requires either num_rows or rows to be specified, but not both.
     """
 
     row_builder: exprs.RowBuilder
     rows: list[exprs.DataRow]
 
     def __init__(self, row_builder: exprs.RowBuilder, rows: Optional[list[exprs.DataRow]] = None):
-        """
-        Requires either num_rows or rows to be specified, but not both.
-        """
         self.row_builder = row_builder
         self.rows = [] if rows is None else rows
 
@@ -39,20 +38,5 @@ class DataRowBatch:
     def __getitem__(self, index: int) -> exprs.DataRow:
         return self.rows[index]
 
-    def flush_imgs(
-        self, idx_range: Optional[slice], stored_img_info: list[exprs.ColumnSlotIdx], flushed_img_slots: list[int]
-    ) -> None:
-        """Flushes images in the given range of rows."""
-        if len(stored_img_info) == 0 and len(flushed_img_slots) == 0:
-            return
-
-        if idx_range is None:
-            idx_range = slice(0, len(self.rows))
-        for row in self.rows[idx_range]:
-            for info in stored_img_info:
-                row.flush_img(info.slot_idx, info.col)
-            for slot_idx in flushed_img_slots:
-                row.flush_img(slot_idx)
-
     def __iter__(self) -> Iterator[exprs.DataRow]:
         return iter(self.rows)
pixeltable/exec/expr_eval/expr_eval_node.py

@@ -390,6 +390,17 @@ class ExprEvalNode(ExecNode):
         # end the main loop if we had an unhandled exception
         try:
             t.result()
+        except KeyboardInterrupt:
+            # ExprEvalNode instances are long-running and reused across multiple operations.
+            # When a user interrupts an operation (Ctrl+C), the main evaluation loop properly
+            # handles the KeyboardInterrupt and terminates the current operation. However,
+            # background tasks spawned by evaluators may complete asynchronously after the
+            # operation has ended, and their done callbacks will fire during subsequent
+            # operations. These "phantom" KeyboardInterrupt exceptions from previous
+            # operations' background tasks should not interfere with new operations, so we
+            # absorb them here rather than propagating them via self.error/self.exc_event.
+            _logger.debug('Task completed with KeyboardInterrupt (user cancellation)')
+            pass
         except asyncio.CancelledError:
             pass
         except Exception as exc:
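
The new except KeyboardInterrupt branch exists because done callbacks of background tasks can fire after the interrupted operation has already been torn down. As a rough, stand-alone illustration of that callback pattern (the TaskRunner class and its error/exc_event fields below are hypothetical stand-ins, not pixeltable's actual API):

    # Sketch only: absorb a late KeyboardInterrupt in a done callback while still
    # surfacing genuine task failures to the main loop.
    import asyncio
    import logging
    from typing import Optional

    _logger = logging.getLogger('example')

    class TaskRunner:
        def __init__(self) -> None:
            self.error: Optional[BaseException] = None
            self.exc_event = asyncio.Event()

        def _on_task_done(self, t: asyncio.Task) -> None:
            try:
                t.result()  # re-raises whatever the task raised
            except KeyboardInterrupt:
                # phantom interrupt left over from a previously cancelled operation: absorb it
                _logger.debug('Task completed with KeyboardInterrupt (user cancellation)')
            except asyncio.CancelledError:
                pass
            except Exception as exc:
                # genuine failure: record it and wake up the main loop
                self.error = exc
                self.exc_event.set()

        def spawn(self, coro) -> asyncio.Task:
            t = asyncio.create_task(coro)
            t.add_done_callback(self._on_task_done)
            return t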
pixeltable/exec/in_memory_data_node.py

@@ -2,7 +2,7 @@ import logging
 from typing import Any, AsyncIterator, Optional
 
 from pixeltable import catalog, exprs
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore
 
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
pixeltable/exec/object_store_save_node.py

@@ -0,0 +1,299 @@
+from __future__ import annotations
+
+import dataclasses
+import itertools
+import logging
+from collections import defaultdict, deque
+from concurrent import futures
+from pathlib import Path
+from typing import AsyncIterator, Iterator, NamedTuple, Optional
+from uuid import UUID
+
+from pixeltable import exprs
+from pixeltable.utils.object_stores import ObjectOps, ObjectPath, StorageTarget
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+
+_logger = logging.getLogger('pixeltable')
+
+
+class ObjectStoreSaveNode(ExecNode):
+    """Save files into designated object store(s).
+
+    Each row may have multiple files that need to be saved to a destination.
+    Each file may be referenced by more than one column in the row.
+    Each file may have multiple destinations, e.g., S3 bucket and local file system.
+    If there are multiple destinations, the file cannot be moved to any destination
+    until it has been copied to all of the other destinations.
+    Diagrammatically:
+    Row -> [src_path1, src_path2, ...]
+    src_path -> [dest1, dest2, ...]
+    dest1: [row_location1, row_location2, ...]
+    Paths with multiple destinations are removed from the TempStore only after all destination copies are complete.
+
+    TODO:
+    - Process a row at a time and limit the number of in-flight rows to control memory usage
+    """
+
+    QUEUE_DEPTH_HIGH_WATER = 50  # target number of in-flight requests
+    QUEUE_DEPTH_LOW_WATER = 20  # target number of in-flight requests
+    BATCH_SIZE = 16
+    MAX_WORKERS = 15
+
+    class WorkDesignator(NamedTuple):
+        """Specify the source and destination for a WorkItem"""
+
+        src_path: str  # source of the file to be processed
+        destination: Optional[str]  # destination URI for the file to be processed
+
+    class WorkItem(NamedTuple):
+        src_path: Path
+        destination: Optional[str]
+        info: exprs.ColumnSlotIdx  # column info for the file being processed
+        destination_count: int = 1  # number of unique destinations for this file
+
+    retain_input_order: bool  # if True, return rows in the exact order they were received
+    file_col_info: list[exprs.ColumnSlotIdx]
+
+    # execution state
+    num_returned_rows: int
+
+    # ready_rows: rows that are ready to be returned, ordered by row idx;
+    # the implied row idx of ready_rows[0] is num_returned_rows
+    ready_rows: deque[Optional[exprs.DataRow]]
+
+    in_flight_rows: dict[int, ObjectStoreSaveNode.RowState]  # rows with in-flight work; id(row) -> RowState
+    in_flight_requests: dict[
+        futures.Future, WorkDesignator
+    ]  # in-flight requests to save paths: Future -> WorkDesignator
+    in_flight_work: dict[
+        WorkDesignator, list[tuple[exprs.DataRow, exprs.ColumnSlotIdx]]
+    ]  # WorkDesignator -> [(row, info)]
+
+    input_finished: bool
+    row_idx: Iterator[Optional[int]]
+
+    @dataclasses.dataclass
+    class RowState:
+        row: exprs.DataRow
+        idx: Optional[int]  # position in input stream; None if we don't retain input order
+        num_missing: int  # number of references to media files in this row
+        delete_destinations: list[Path]  # paths to delete after all copies are complete
+
+    def __init__(
+        self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True
+    ):
+        # input_/output_exprs=[]: we don't have anything to evaluate
+        super().__init__(input.row_builder, [], [], input)
+        self.retain_input_order = retain_input_order
+        self.file_col_info = file_col_info
+
+        self.num_returned_rows = 0
+        self.ready_rows = deque()
+        self.in_flight_rows = {}
+        self.in_flight_requests = {}
+        self.in_flight_work = {}
+        self.input_finished = False
+        self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+        assert self.QUEUE_DEPTH_HIGH_WATER > self.QUEUE_DEPTH_LOW_WATER
+
+    @property
+    def queued_work(self) -> int:
+        return len(self.in_flight_requests)
+
+    async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> Optional[DataRowBatch]:
+        """Get the next batch of input rows, or None if there are no more rows"""
+        try:
+            input_batch = await anext(input_iter)
+            if input_batch is None:
+                self.input_finished = True
+            return input_batch
+        except StopAsyncIteration:
+            self.input_finished = True
+            return None
+
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        input_iter = aiter(self.input)
+        with futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            while True:
+                # Create work to fill the queue to the high water mark
+                while not self.input_finished and self.queued_work < self.QUEUE_DEPTH_HIGH_WATER:
+                    input_batch = await self.get_input_batch(input_iter)
+                    if input_batch is not None:
+                        self.__process_input_batch(input_batch, executor)
+
+                # Wait for enough completions to enable more queueing or if we're done
+                while self.queued_work > self.QUEUE_DEPTH_LOW_WATER or (self.input_finished and self.queued_work > 0):
+                    done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+                    self.__process_completions(done, ignore_errors=self.ctx.ignore_errors)
+
+                # Emit results to meet batch size requirements or empty the in-flight row queue
+                if self.__has_ready_batch() or (
+                    len(self.ready_rows) > 0 and self.input_finished and self.queued_work == 0
+                ):
+                    # create DataRowBatch from the first BATCH_SIZE ready rows
+                    batch = DataRowBatch(self.row_builder)
+                    rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
+                    for row in rows:
+                        assert row is not None
+                        batch.add_row(row)
+                    self.num_returned_rows += len(rows)
+                    _logger.debug(f'returning {len(rows)} rows')
+                    yield batch
+
+                if self.input_finished and self.queued_work == 0 and len(self.ready_rows) == 0:
+                    return
+
+    def __has_ready_batch(self) -> bool:
+        """True if there are >= BATCH_SIZES entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
+        return (
+            sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
+        )
+
+    def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
+        if row_idx is None:
+            self.ready_rows.append(row)
+        else:
+            # extend ready_rows to accommodate row_idx
+            idx = row_idx - self.num_returned_rows
+            if idx >= len(self.ready_rows):
+                self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
+            self.ready_rows[idx] = row
+
+    def __process_completions(self, done: set[futures.Future], ignore_errors: bool) -> None:
+        from pixeltable.utils.local_store import TempStore
+
+        for f in done:
+            work_designator = self.in_flight_requests.pop(f)
+            new_file_url, exc = f.result()
+            if exc is not None and not ignore_errors:
+                raise exc
+            assert new_file_url is not None
+
+            # add the local path/exception to the slots that reference the url
+            for row, info in self.in_flight_work.pop(work_designator):
+                if exc is not None:
+                    self.row_builder.set_exc(row, info.slot_idx, exc)
+                else:
+                    row.file_urls[info.slot_idx] = new_file_url
+
+                state = self.in_flight_rows[id(row)]
+                state.num_missing -= 1
+                if state.num_missing == 0:
+                    # All operations for this row are complete. Delete all files which had multiple destinations
+                    for src_path in state.delete_destinations:
+                        TempStore.delete_media_file(src_path)
+                    del self.in_flight_rows[id(row)]
+                    self.__add_ready_row(row, state.idx)
+
+    def __process_input_row(self, row: exprs.DataRow) -> list[ObjectStoreSaveNode.WorkItem]:
+        """Process a batch of input rows, generating a list of work"""
+        from pixeltable.utils.local_store import LocalStore, TempStore
+
+        # Create a list of work to do for media storage in this row
+        row_idx = next(self.row_idx)
+        row_to_do: list[ObjectStoreSaveNode.WorkItem] = []
+        num_missing = 0
+        unique_destinations: dict[Path, int] = defaultdict(int)  # destination -> count of unique destinations
+
+        for info in self.file_col_info:
+            col, index = info
+            # we may need to store this image
+            if row.prepare_col_val_for_save(index, col):
+                row.file_urls[index] = row.save_media_to_temp(index, col)
+
+            url = row.file_urls[index]
+            if url is None:
+                # nothing to do
+                continue
+
+            assert row.excs[index] is None
+            assert col.col_type.is_media_type()
+
+            destination = info.col.destination
+            soa = None if destination is None else ObjectPath.parse_object_storage_addr(destination, False)
+            if (
+                soa is not None
+                and soa.storage_target == StorageTarget.LOCAL_STORE
+                and LocalStore(soa).resolve_url(url) is not None
+            ):
+                # A local non-default destination was specified, and the url already points there
+                continue
+
+            src_path = LocalStore.file_url_to_path(url)
+            if src_path is None:
+                # The url does not point to a local file, do not attempt to copy/move it
+                continue
+
+            if destination is None and not TempStore.contains_path(src_path):
+                # Do not copy local file URLs to the LocalStore
+                continue
+
+            work_designator = ObjectStoreSaveNode.WorkDesignator(str(src_path), destination)
+            locations = self.in_flight_work.get(work_designator)
+            if locations is not None:
+                # we've already seen this
+                locations.append((row, info))
+                num_missing += 1
+                continue
+
+            work_item = ObjectStoreSaveNode.WorkItem(src_path, destination, info)
+            row_to_do.append(work_item)
+            self.in_flight_work[work_designator] = [(row, info)]
+            num_missing += 1
+            unique_destinations[src_path] += 1
+
+        # Update work items to reflect the number of unique destinations
+        new_to_do = []
+        for work_item in row_to_do:
+            if unique_destinations[work_item.src_path] == 1 and TempStore.contains_path(work_item.src_path):
+                new_to_do.append(work_item)
+            else:
+                new_to_do.append(
+                    ObjectStoreSaveNode.WorkItem(
+                        work_item.src_path,
+                        work_item.destination,
+                        work_item.info,
+                        destination_count=unique_destinations[work_item.src_path] + 1,
+                        # +1 for the TempStore destination
+                    )
+                )
+        delete_destinations = [k for k, v in unique_destinations.items() if v > 1 and TempStore.contains_path(k)]
+        row_to_do = new_to_do
+
+        if len(row_to_do) > 0:
+            self.in_flight_rows[id(row)] = self.RowState(
+                row, row_idx, num_missing, delete_destinations=delete_destinations
+            )
+        else:
+            self.__add_ready_row(row, row_idx)
+        return row_to_do
+
+    def __process_input_batch(self, input_batch: DataRowBatch, executor: futures.ThreadPoolExecutor) -> None:
+        """Process a batch of input rows, submitting temporary files for upload"""
+        work_to_do: list[ObjectStoreSaveNode.WorkItem] = []
+
+        for row in input_batch:
+            row_to_do = self.__process_input_row(row)
+            if len(row_to_do) > 0:
+                work_to_do.extend(row_to_do)
+
+        for work_item in work_to_do:
+            f = executor.submit(self.__persist_media_file, work_item)
+            self.in_flight_requests[f] = ObjectStoreSaveNode.WorkDesignator(
+                str(work_item.src_path), work_item.destination
+            )
+            _logger.debug(f'submitted {work_item}')
+
+    def __persist_media_file(self, work_item: WorkItem) -> tuple[Optional[str], Optional[Exception]]:
+        """Move data from the TempStore to another location"""
+        src_path = work_item.src_path
+        col = work_item.info.col
+        assert col.destination == work_item.destination
+        try:
+            new_file_url = ObjectOps.put_file(col, src_path, work_item.destination_count == 1)
+            return new_file_url, None
+        except Exception as e:
+            _logger.debug(f'Failed to move/copy {src_path}: {e}', exc_info=e)
+            return None, e
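
The ObjectStoreSaveNode docstring above describes the fan-out from rows to source paths to destinations: identical (src_path, destination) pairs share one upload, and a temp file may be moved to its destination only when it has a single destination; otherwise it is copied and deleted after all copies finish. A rough, stand-alone sketch of that per-row planning step (the plan_row helper and its plain (path, destination) inputs are hypothetical simplifications; the real node also tracks column slots and in-flight rows):

    # Sketch only: group one row's files by (src_path, destination) and decide which
    # uploads can move the temp file vs. which must copy and defer deletion.
    from collections import defaultdict
    from pathlib import Path
    from typing import NamedTuple, Optional

    class WorkDesignator(NamedTuple):
        src_path: str
        destination: Optional[str]

    def plan_row(files: list[tuple[Path, Optional[str]]]):
        """files: (local temp path, destination URI or None) pairs for one row."""
        seen: dict[WorkDesignator, int] = {}  # dedup: one upload per (path, destination)
        dest_count: dict[Path, int] = defaultdict(int)  # distinct destinations per path
        work: list[WorkDesignator] = []
        for src_path, destination in files:
            key = WorkDesignator(str(src_path), destination)
            if key in seen:
                seen[key] += 1  # same file + destination referenced by another column: no new upload
                continue
            seen[key] = 1
            dest_count[src_path] += 1
            work.append(key)
        # a file with a single destination can simply be moved there; files with several
        # destinations are copied, and the temp file is deleted only after all copies finish
        moves = [w for w in work if dest_count[Path(w.src_path)] == 1]
        copies = [w for w in work if dest_count[Path(w.src_path)] > 1]
        deferred_deletes = [p for p, n in dest_count.items() if n > 1]
        return moves, copies, deferred_deletes

In this simplified model, a temp file referenced by both an S3 destination and the default store would land in copies, with its deletion deferred until every copy completes, which mirrors the delete_destinations bookkeeping in RowState.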