pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/exec/object_store_save_node.py (new file)
@@ -0,0 +1,293 @@
+ from __future__ import annotations
+
+ import dataclasses
+ import itertools
+ import logging
+ from collections import defaultdict, deque
+ from concurrent import futures
+ from pathlib import Path
+ from typing import AsyncIterator, Iterator, NamedTuple
+
+ from pixeltable import exprs
+ from pixeltable.utils.object_stores import ObjectOps, ObjectPath, StorageTarget
+
+ from .data_row_batch import DataRowBatch
+ from .exec_node import ExecNode
+
+ _logger = logging.getLogger('pixeltable')
+
+
+ class ObjectStoreSaveNode(ExecNode):
+     """Save files into designated object store(s).
+
+     Each row may have multiple files that need to be saved to a destination.
+     Each file may be referenced by more than one column in the row.
+     Each file may have multiple destinations, e.g., an S3 bucket and the local file system.
+     If there are multiple destinations, the file cannot be moved to any destination
+     until it has been copied to all of the other destinations.
+     Diagrammatically:
+         Row -> [src_path1, src_path2, ...]
+         src_path -> [dest1, dest2, ...]
+         dest1: [row_location1, row_location2, ...]
+     Paths with multiple destinations are removed from the TempStore only after all destination copies are complete.
+
+     TODO:
+     - Process a row at a time and limit the number of in-flight rows to control memory usage
+     """
+
+     QUEUE_DEPTH_HIGH_WATER = 50  # target number of in-flight requests
+     QUEUE_DEPTH_LOW_WATER = 20  # refill threshold for in-flight requests
+     BATCH_SIZE = 16
+     MAX_WORKERS = 15
+
+     class WorkDesignator(NamedTuple):
+         """Specify the source and destination for a WorkItem"""
+
+         src_path: str  # source of the file to be processed
+         destination: str  # destination URI for the file to be processed
+
+     class WorkItem(NamedTuple):
+         src_path: Path
+         destination: str | None
+         info: exprs.ColumnSlotIdx  # column info for the file being processed
+         destination_count: int = 1  # number of unique destinations for this file
+
+     retain_input_order: bool  # if True, return rows in the exact order they were received
+     file_col_info: list[exprs.ColumnSlotIdx]
+
+     # execution state
+     num_returned_rows: int
+
+     # ready_rows: rows that are ready to be returned, ordered by row idx;
+     # the implied row idx of ready_rows[0] is num_returned_rows
+     ready_rows: deque[exprs.DataRow | None]
+
+     in_flight_rows: dict[int, ObjectStoreSaveNode.RowState]  # rows with in-flight work; id(row) -> RowState
+     in_flight_requests: dict[
+         futures.Future, WorkDesignator
+     ]  # in-flight requests to save paths: Future -> WorkDesignator
+     in_flight_work: dict[
+         WorkDesignator, list[tuple[exprs.DataRow, exprs.ColumnSlotIdx]]
+     ]  # WorkDesignator -> [(row, info)]
+
+     input_finished: bool
+     row_idx: Iterator[int | None]
+
+     @dataclasses.dataclass
+     class RowState:
+         row: exprs.DataRow
+         idx: int | None  # position in input stream; None if we don't retain input order
+         num_missing: int  # number of references to media files in this row
+         delete_destinations: list[Path]  # paths to delete after all copies are complete
+
+     def __init__(self, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True):
+         # input_/output_exprs=[]: we don't have anything to evaluate
+         super().__init__(input.row_builder, [], [], input)
+         self.retain_input_order = retain_input_order
+         self.file_col_info = file_col_info
+
+         self.num_returned_rows = 0
+         self.ready_rows = deque()
+         self.in_flight_rows = {}
+         self.in_flight_requests = {}
+         self.in_flight_work = {}
+         self.input_finished = False
+         self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+         assert self.QUEUE_DEPTH_HIGH_WATER > self.QUEUE_DEPTH_LOW_WATER
+
+     @property
+     def queued_work(self) -> int:
+         return len(self.in_flight_requests)
+
+     async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> DataRowBatch | None:
+         """Get the next batch of input rows, or None if there are no more rows"""
+         try:
+             input_batch = await anext(input_iter)
+             if input_batch is None:
+                 self.input_finished = True
+             return input_batch
+         except StopAsyncIteration:
+             self.input_finished = True
+             return None
+
+     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+         input_iter = aiter(self.input)
+         with futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+             while True:
+                 # Create work to fill the queue to the high water mark without overrunning the in-flight row limit
+                 while not self.input_finished and self.queued_work < self.QUEUE_DEPTH_HIGH_WATER:
+                     input_batch = await self.get_input_batch(input_iter)
+                     if input_batch is not None:
+                         self.__process_input_batch(input_batch, executor)
+
+                 # Wait for enough completions to enable more queueing, or drain the queue once the input is finished
+                 while self.queued_work > self.QUEUE_DEPTH_LOW_WATER or (self.input_finished and self.queued_work > 0):
+                     done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+                     self.__process_completions(done, ignore_errors=self.ctx.ignore_errors)
+
+                 # Emit results to meet batch size requirements or empty the in-flight row queue
+                 if self.__has_ready_batch() or (
+                     len(self.ready_rows) > 0 and self.input_finished and self.queued_work == 0
+                 ):
+                     # create DataRowBatch from the first BATCH_SIZE ready rows
+                     batch = DataRowBatch(self.row_builder)
+                     rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
+                     for row in rows:
+                         assert row is not None
+                         batch.add_row(row)
+                     self.num_returned_rows += len(rows)
+                     _logger.debug(f'returning {len(rows)} rows')
+                     yield batch
+
+                 if self.input_finished and self.queued_work == 0 and len(self.ready_rows) == 0:
+                     return
+
+     def __has_ready_batch(self) -> bool:
+         """True if there are >= BATCH_SIZE entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
+         return (
+             sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
+         )
+
+     def __add_ready_row(self, row: exprs.DataRow, row_idx: int | None) -> None:
+         if row_idx is None:
+             self.ready_rows.append(row)
+         else:
+             # extend ready_rows to accommodate row_idx
+             idx = row_idx - self.num_returned_rows
+             if idx >= len(self.ready_rows):
+                 self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
+             self.ready_rows[idx] = row
+
+     def __process_completions(self, done: set[futures.Future], ignore_errors: bool) -> None:
+         from pixeltable.utils.local_store import TempStore
+
+         for f in done:
+             work_designator = self.in_flight_requests.pop(f)
+             new_file_url, exc = f.result()
+             if exc is not None and not ignore_errors:
+                 raise exc
+             assert new_file_url is not None or exc is not None
+
+             # add the new file URL / exception to the slots that reference the url
+             for row, info in self.in_flight_work.pop(work_designator):
+                 if exc is not None:
+                     self.row_builder.set_exc(row, info.slot_idx, exc)
+                 else:
+                     row.file_urls[info.slot_idx] = new_file_url
+
+                 state = self.in_flight_rows[id(row)]
+                 state.num_missing -= 1
+                 if state.num_missing == 0:
+                     # All operations for this row are complete. Delete all files which had multiple destinations
+                     for src_path in state.delete_destinations:
+                         TempStore.delete_media_file(src_path)
+                     del self.in_flight_rows[id(row)]
+                     self.__add_ready_row(row, state.idx)
+
+     def __process_input_row(self, row: exprs.DataRow) -> list[ObjectStoreSaveNode.WorkItem]:
+         """Process a single input row, generating a list of work items"""
+         from pixeltable.utils.local_store import LocalStore, TempStore
+
+         # Create a list of work to do for media storage in this row
+         row_idx = next(self.row_idx)
+         row_to_do: list[ObjectStoreSaveNode.WorkItem] = []
+         num_missing = 0
+         unique_destinations: dict[Path, int] = defaultdict(int)  # src_path -> count of unique destinations
+
+         for info in self.file_col_info:
+             col, index = info
+             # we may need to store this image
+             if row.prepare_col_val_for_save(index, col):
+                 row.file_urls[index] = row.save_media_to_temp(index, col)
+
+             url = row.file_urls[index]
+             if url is None:
+                 # nothing to do
+                 continue
+
+             assert row.excs[index] is None
+             assert col.col_type.is_media_type()
+
+             destination = info.col.destination
+             if destination is not None:
+                 soa = ObjectPath.parse_object_storage_addr(destination, False)
+                 if soa.storage_target == StorageTarget.LOCAL_STORE and LocalStore(soa).resolve_url(url) is not None:
+                     # A local non-default destination was specified, and the url already points there
+                     continue
+
+             src_path = LocalStore.file_url_to_path(url)
+             if src_path is None:
+                 # The url does not point to a local file; do not attempt to copy/move it
+                 continue
+
+             if destination is None and not TempStore.contains_path(src_path):
+                 # Do not copy local file URLs to the LocalStore
+                 continue
+
+             work_designator = ObjectStoreSaveNode.WorkDesignator(str(src_path), destination)
+             locations = self.in_flight_work.get(work_designator)
+             if locations is not None:
+                 # we've already seen this source/destination pair
+                 locations.append((row, info))
+                 num_missing += 1
+                 continue
+
+             work_item = ObjectStoreSaveNode.WorkItem(src_path, destination, info)
+             row_to_do.append(work_item)
+             self.in_flight_work[work_designator] = [(row, info)]
+             num_missing += 1
+             unique_destinations[src_path] += 1
+
+         # Update work items to reflect the number of unique destinations
+         new_to_do = []
+         for work_item in row_to_do:
+             if unique_destinations[work_item.src_path] == 1 and TempStore.contains_path(work_item.src_path):
+                 new_to_do.append(work_item)
+             else:
+                 new_to_do.append(
+                     ObjectStoreSaveNode.WorkItem(
+                         work_item.src_path,
+                         work_item.destination,
+                         work_item.info,
+                         destination_count=unique_destinations[work_item.src_path] + 1,
+                         # +1 for the TempStore destination
+                     )
+                 )
+         delete_destinations = [k for k, v in unique_destinations.items() if v > 1 and TempStore.contains_path(k)]
+         row_to_do = new_to_do
+
+         if len(row_to_do) > 0:
+             self.in_flight_rows[id(row)] = self.RowState(
+                 row, row_idx, num_missing, delete_destinations=delete_destinations
+             )
+         else:
+             self.__add_ready_row(row, row_idx)
+         return row_to_do
+
+     def __process_input_batch(self, input_batch: DataRowBatch, executor: futures.ThreadPoolExecutor) -> None:
+         """Process a batch of input rows, submitting temporary files for upload"""
+         work_to_do: list[ObjectStoreSaveNode.WorkItem] = []
+
+         for row in input_batch:
+             row_to_do = self.__process_input_row(row)
+             if len(row_to_do) > 0:
+                 work_to_do.extend(row_to_do)
+
+         for work_item in work_to_do:
+             f = executor.submit(self.__persist_media_file, work_item)
+             self.in_flight_requests[f] = ObjectStoreSaveNode.WorkDesignator(
+                 str(work_item.src_path), work_item.destination
+             )
+             _logger.debug(f'submitted {work_item}')
+
+     def __persist_media_file(self, work_item: WorkItem) -> tuple[str | None, Exception | None]:
+         """Move data from the TempStore to another location"""
+         src_path = work_item.src_path
+         col = work_item.info.col
+         assert col.destination == work_item.destination
+         try:
+             new_file_url = ObjectOps.put_file(col, src_path, work_item.destination_count == 1)
+             return new_file_url, None
+         except Exception as e:
+             _logger.debug(f'Failed to move/copy {src_path}: {e}', exc_info=e)
+             return None, e
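The core of this new node is a high/low water-mark scheduling loop over a ThreadPoolExecutor: refill the request queue up to QUEUE_DEPTH_HIGH_WATER, then block on completions until it drains below QUEUE_DEPTH_LOW_WATER. Below is a minimal sketch of that pattern in isolation; do_work, handle_result, and run_pipeline are hypothetical stand-ins for the node's upload and completion handlers, and only the numeric thresholds are taken from the diff.

    from concurrent import futures

    HIGH_WATER, LOW_WATER, MAX_WORKERS = 50, 20, 15  # thresholds as in ObjectStoreSaveNode

    def run_pipeline(work_items, do_work, handle_result):
        """Keep roughly 20-50 requests in flight: fill to HIGH_WATER, drain to LOW_WATER."""
        it = iter(work_items)
        pending: set[futures.Future] = set()
        exhausted = False
        with futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            while True:
                # refill phase: submit new work until the high water mark is reached
                while not exhausted and len(pending) < HIGH_WATER:
                    item = next(it, None)
                    if item is None:
                        exhausted = True
                    else:
                        pending.add(executor.submit(do_work, item))
                if exhausted and not pending:
                    return
                # drain phase: block on completions until back under the low water mark,
                # or until empty once the input is exhausted
                while len(pending) > LOW_WATER or (exhausted and pending):
                    done, pending = futures.wait(pending, return_when=futures.FIRST_COMPLETED)
                    for f in done:
                        handle_result(f.result())

The wide gap between the two marks amortizes each futures.wait() call over many completions instead of waking up once per request.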
pixeltable/exec/row_update_node.py
@@ -1,61 +1,76 @@
  import logging
- from typing import Any
+ from typing import Any, AsyncIterator
+
+ from pixeltable import catalog, exprs

- import pixeltable.catalog as catalog
- import pixeltable.exprs as exprs
- from pixeltable.utils.media_store import MediaStore
  from .data_row_batch import DataRowBatch
  from .exec_node import ExecNode

  _logger = logging.getLogger('pixeltable')

+
  class RowUpdateNode(ExecNode):
      """
      Update individual rows in the input batches, identified by key columns.

      The updates for a row are provided as a dict of column names to new values.
-     The node assumes that all update dicts contain the same keys, and it populates the slots of the columns present in
-     the update list.
+     Populates the slots of the columns present in the update list.
+     Assumptions:
+     - all update dicts contain the same keys
+     - the input node populates DataRow.cell_vals for all primary key columns
      """
+
+     updates: dict[tuple, dict[catalog.Column, Any]]
+     is_rowid_key: bool  # if True, key_vals_batch contains rowids rather than primary key values
+     col_slot_idxs: dict[catalog.Column, int]
+     pk_columns: list[catalog.Column]
+     matched_key_vals: set[tuple]
+
      def __init__(
-         self, tbl: catalog.TableVersionPath, key_vals_batch: list[tuple], is_rowid_key: bool,
-         col_vals_batch: list[dict[catalog.Column, Any]], row_builder: exprs.RowBuilder, input: ExecNode,
+         self,
+         tbl: catalog.TableVersionPath,
+         key_vals_batch: list[tuple],
+         is_rowid_key: bool,
+         col_vals_batch: list[dict[catalog.Column, Any]],
+         row_builder: exprs.RowBuilder,
+         input: ExecNode,
      ):
          super().__init__(row_builder, [], [], input)
-         self.updates = {key_vals: col_vals for key_vals, col_vals in zip(key_vals_batch, col_vals_batch)}
+         self.updates = dict(zip(key_vals_batch, col_vals_batch))
          self.is_rowid_key = is_rowid_key
          # determine slot idxs of all columns we need to read or write
          # retrieve ColumnRefs from the RowBuilder (has slot_idx set)
          all_col_slot_idxs = {
              col_ref.col: col_ref.slot_idx
-             for col_ref in row_builder.unique_exprs if isinstance(col_ref, exprs.ColumnRef)
+             for col_ref in row_builder.unique_exprs
+             if isinstance(col_ref, exprs.ColumnRef)
          }
-         self.col_slot_idxs = {col: all_col_slot_idxs[col] for col in col_vals_batch[0].keys()}
-         self.key_slot_idxs = {col: all_col_slot_idxs[col] for col in tbl.tbl_version.primary_key_columns()}
-         self.matched_key_vals: set[tuple] = set()
-
-     def __next__(self) -> DataRowBatch:
-         batch = next(self.input)
-         for row in batch:
-             key_vals = row.rowid if self.is_rowid_key else \
-                 tuple(row[slot_idx] for slot_idx in self.key_slot_idxs.values())
-             if key_vals not in self.updates:
-                 continue
-             self.matched_key_vals.add(key_vals)
-             col_vals = self.updates[key_vals]
-             for col, val in col_vals.items():
-                 slot_idx = self.col_slot_idxs[col]
-                 row[slot_idx] = val
-         return batch
+         # all update target columns should have assigned slot idxs
+         assert all(col in all_col_slot_idxs for col in col_vals_batch[0])
+         self.col_slot_idxs = {col: all_col_slot_idxs[col] for col in col_vals_batch[0]}
+         self.pk_columns = tbl.tbl_version.get().primary_key_columns()
+         self.matched_key_vals = set()
+
+     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+         async for batch in self.input:
+             for row in batch:
+                 key_vals = row.rowid if self.is_rowid_key else tuple(row.cell_vals[col.id] for col in self.pk_columns)
+                 if key_vals not in self.updates:
+                     continue
+                 self.matched_key_vals.add(key_vals)
+                 col_vals = self.updates[key_vals]
+                 for col, val in col_vals.items():
+                     slot_idx = self.col_slot_idxs[col]
+                     row[slot_idx] = val
+             yield batch

      def unmatched_rows(self) -> list[dict[str, Any]]:
          """Return rows that didn't get used in the updates as a list of dicts compatible with TableVersion.insert()."""
          result: list[dict[str, Any]] = []
-         key_cols = self.key_slot_idxs.keys()
          for key_vals, col_vals in self.updates.items():
              if key_vals in self.matched_key_vals:
                  continue
-             row = {col.name: val for col, val in zip(key_cols, key_vals)}
+             row = {col.name: val for col, val in zip(self.pk_columns, key_vals)}
              row.update({col.name: val for col, val in col_vals.items()})
              result.append(row)
          return result
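The reworked RowUpdateNode boils down to: build a dict keyed on primary-key (or rowid) tuples, overwrite matching slots as batches stream through, and report keys that never matched via unmatched_rows() so the caller can turn them into inserts. A sketch of that key-matching idea with plain dicts; the names below are illustrative only, not the pixeltable API.

    from typing import Any

    rows: list[dict[str, Any]] = [{'id': 1, 'name': 'a'}, {'id': 2, 'name': 'b'}]
    key_cols = ['id']
    # key tuple -> column updates, analogous to dict(zip(key_vals_batch, col_vals_batch))
    updates: dict[tuple, dict[str, Any]] = {(1,): {'name': 'A'}, (3,): {'name': 'C'}}

    matched: set[tuple] = set()
    for row in rows:  # the node does this per DataRowBatch, yielding each batch downstream
        key = tuple(row[c] for c in key_cols)
        if key in updates:
            matched.add(key)
            row.update(updates[key])  # overwrite the targeted slots

    # analogous to unmatched_rows(): never-matched keys become candidate inserts
    unmatched = [dict(zip(key_cols, k)) | v for k, v in updates.items() if k not in matched]
    assert rows == [{'id': 1, 'name': 'A'}, {'id': 2, 'name': 'b'}]
    assert unmatched == [{'id': 3, 'name': 'C'}]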