pixeltable 0.4.12__py3-none-any.whl → 0.4.14__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their public registries; it is provided for informational purposes only.
Potentially problematic release.
This version of pixeltable might be problematic.
- pixeltable/__init__.py +11 -1
- pixeltable/catalog/__init__.py +2 -1
- pixeltable/catalog/catalog.py +179 -63
- pixeltable/catalog/column.py +24 -20
- pixeltable/catalog/table.py +96 -124
- pixeltable/catalog/table_metadata.py +96 -0
- pixeltable/catalog/table_version.py +15 -6
- pixeltable/catalog/view.py +22 -22
- pixeltable/config.py +2 -0
- pixeltable/dataframe.py +3 -2
- pixeltable/env.py +43 -21
- pixeltable/exec/__init__.py +1 -0
- pixeltable/exec/aggregation_node.py +0 -1
- pixeltable/exec/cache_prefetch_node.py +74 -98
- pixeltable/exec/data_row_batch.py +2 -18
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/object_store_save_node.py +299 -0
- pixeltable/exec/sql_node.py +28 -33
- pixeltable/exprs/data_row.py +31 -25
- pixeltable/exprs/json_path.py +6 -5
- pixeltable/exprs/row_builder.py +6 -12
- pixeltable/functions/gemini.py +1 -1
- pixeltable/functions/openai.py +1 -1
- pixeltable/functions/video.py +5 -6
- pixeltable/globals.py +6 -7
- pixeltable/index/embedding_index.py +5 -8
- pixeltable/io/__init__.py +2 -1
- pixeltable/io/fiftyone.py +1 -1
- pixeltable/io/label_studio.py +4 -5
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/parquet.py +9 -89
- pixeltable/io/table_data_conduit.py +2 -2
- pixeltable/iterators/audio.py +1 -1
- pixeltable/iterators/document.py +10 -12
- pixeltable/iterators/video.py +1 -1
- pixeltable/metadata/schema.py +7 -0
- pixeltable/plan.py +26 -1
- pixeltable/share/packager.py +8 -2
- pixeltable/share/publish.py +3 -9
- pixeltable/type_system.py +1 -3
- pixeltable/utils/arrow.py +97 -2
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/lancedb.py +88 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/object_stores.py +497 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +354 -0
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/METADATA +162 -127
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/RECORD +53 -47
- pixeltable/utils/media_store.py +0 -248
- pixeltable/utils/s3.py +0 -17
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/licenses/LICENSE +0 -0
pixeltable/exec/object_store_save_node.py
ADDED
@@ -0,0 +1,299 @@
+from __future__ import annotations
+
+import dataclasses
+import itertools
+import logging
+from collections import defaultdict, deque
+from concurrent import futures
+from pathlib import Path
+from typing import AsyncIterator, Iterator, NamedTuple, Optional
+from uuid import UUID
+
+from pixeltable import exprs
+from pixeltable.utils.object_stores import ObjectOps, ObjectPath, StorageTarget
+
+from .data_row_batch import DataRowBatch
+from .exec_node import ExecNode
+
+_logger = logging.getLogger('pixeltable')
+
+
+class ObjectStoreSaveNode(ExecNode):
+    """Save files into designated object store(s).
+
+    Each row may have multiple files that need to be saved to a destination.
+    Each file may be referenced by more than one column in the row.
+    Each file may have multiple destinations, e.g., an S3 bucket and the local file system.
+    If there are multiple destinations, the file cannot be moved to any destination
+    until it has been copied to all of the other destinations.
+    Diagrammatically:
+        Row -> [src_path1, src_path2, ...]
+        src_path -> [dest1, dest2, ...]
+        dest1: [row_location1, row_location2, ...]
+    Paths with multiple destinations are removed from the TempStore only after all destination copies are complete.
+
+    TODO:
+    - Process a row at a time and limit the number of in-flight rows to control memory usage
+    """
+
+    QUEUE_DEPTH_HIGH_WATER = 50  # target number of in-flight requests
+    QUEUE_DEPTH_LOW_WATER = 20  # target number of in-flight requests
+    BATCH_SIZE = 16
+    MAX_WORKERS = 15
+
+    class WorkDesignator(NamedTuple):
+        """Specify the source and destination for a WorkItem"""
+
+        src_path: str  # source of the file to be processed
+        destination: Optional[str]  # destination URI for the file to be processed
+
+    class WorkItem(NamedTuple):
+        src_path: Path
+        destination: Optional[str]
+        info: exprs.ColumnSlotIdx  # column info for the file being processed
+        destination_count: int = 1  # number of unique destinations for this file
+
+    retain_input_order: bool  # if True, return rows in the exact order they were received
+    file_col_info: list[exprs.ColumnSlotIdx]
+
+    # execution state
+    num_returned_rows: int
+
+    # ready_rows: rows that are ready to be returned, ordered by row idx;
+    # the implied row idx of ready_rows[0] is num_returned_rows
+    ready_rows: deque[Optional[exprs.DataRow]]
+
+    in_flight_rows: dict[int, ObjectStoreSaveNode.RowState]  # rows with in-flight work; id(row) -> RowState
+    in_flight_requests: dict[
+        futures.Future, WorkDesignator
+    ]  # in-flight requests to save paths: Future -> WorkDesignator
+    in_flight_work: dict[
+        WorkDesignator, list[tuple[exprs.DataRow, exprs.ColumnSlotIdx]]
+    ]  # WorkDesignator -> [(row, info)]
+
+    input_finished: bool
+    row_idx: Iterator[Optional[int]]
+
+    @dataclasses.dataclass
+    class RowState:
+        row: exprs.DataRow
+        idx: Optional[int]  # position in input stream; None if we don't retain input order
+        num_missing: int  # number of references to media files in this row
+        delete_destinations: list[Path]  # paths to delete after all copies are complete
+
+    def __init__(
+        self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True
+    ):
+        # input_/output_exprs=[]: we don't have anything to evaluate
+        super().__init__(input.row_builder, [], [], input)
+        self.retain_input_order = retain_input_order
+        self.file_col_info = file_col_info
+
+        self.num_returned_rows = 0
+        self.ready_rows = deque()
+        self.in_flight_rows = {}
+        self.in_flight_requests = {}
+        self.in_flight_work = {}
+        self.input_finished = False
+        self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+        assert self.QUEUE_DEPTH_HIGH_WATER > self.QUEUE_DEPTH_LOW_WATER
+
+    @property
+    def queued_work(self) -> int:
+        return len(self.in_flight_requests)
+
+    async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> Optional[DataRowBatch]:
+        """Get the next batch of input rows, or None if there are no more rows"""
+        try:
+            input_batch = await anext(input_iter)
+            if input_batch is None:
+                self.input_finished = True
+            return input_batch
+        except StopAsyncIteration:
+            self.input_finished = True
+            return None
+
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        input_iter = aiter(self.input)
+        with futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+            while True:
+                # Create work to fill the queue to the high water mark without overrunning the in-flight row limit
+                while not self.input_finished and self.queued_work < self.QUEUE_DEPTH_HIGH_WATER:
+                    input_batch = await self.get_input_batch(input_iter)
+                    if input_batch is not None:
+                        self.__process_input_batch(input_batch, executor)
+
+                # Wait for enough completions to enable more queueing or if we're done
+                while self.queued_work > self.QUEUE_DEPTH_LOW_WATER or (self.input_finished and self.queued_work > 0):
+                    done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+                    self.__process_completions(done, ignore_errors=self.ctx.ignore_errors)
+
+                # Emit results to meet batch size requirements or empty the in-flight row queue
+                if self.__has_ready_batch() or (
+                    len(self.ready_rows) > 0 and self.input_finished and self.queued_work == 0
+                ):
+                    # create DataRowBatch from the first BATCH_SIZE ready rows
+                    batch = DataRowBatch(self.row_builder)
+                    rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
+                    for row in rows:
+                        assert row is not None
+                        batch.add_row(row)
+                    self.num_returned_rows += len(rows)
+                    _logger.debug(f'returning {len(rows)} rows')
+                    yield batch
+
+                if self.input_finished and self.queued_work == 0 and len(self.ready_rows) == 0:
+                    return
+
+    def __has_ready_batch(self) -> bool:
+        """True if there are >= BATCH_SIZE entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
+        return (
+            sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
+        )
+
+    def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
+        if row_idx is None:
+            self.ready_rows.append(row)
+        else:
+            # extend ready_rows to accommodate row_idx
+            idx = row_idx - self.num_returned_rows
+            if idx >= len(self.ready_rows):
+                self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
+            self.ready_rows[idx] = row
+
+    def __process_completions(self, done: set[futures.Future], ignore_errors: bool) -> None:
+        from pixeltable.utils.local_store import TempStore
+
+        for f in done:
+            work_designator = self.in_flight_requests.pop(f)
+            new_file_url, exc = f.result()
+            if exc is not None and not ignore_errors:
+                raise exc
+            assert new_file_url is not None
+
+            # add the local path/exception to the slots that reference the url
+            for row, info in self.in_flight_work.pop(work_designator):
+                if exc is not None:
+                    self.row_builder.set_exc(row, info.slot_idx, exc)
+                else:
+                    row.file_urls[info.slot_idx] = new_file_url
+
+                state = self.in_flight_rows[id(row)]
+                state.num_missing -= 1
+                if state.num_missing == 0:
+                    # All operations for this row are complete. Delete all files which had multiple destinations
+                    for src_path in state.delete_destinations:
+                        TempStore.delete_media_file(src_path)
+                    del self.in_flight_rows[id(row)]
+                    self.__add_ready_row(row, state.idx)
+
+    def __process_input_row(self, row: exprs.DataRow) -> list[ObjectStoreSaveNode.WorkItem]:
+        """Process an input row, generating a list of work"""
+        from pixeltable.utils.local_store import LocalStore, TempStore
+
+        # Create a list of work to do for media storage in this row
+        row_idx = next(self.row_idx)
+        row_to_do: list[ObjectStoreSaveNode.WorkItem] = []
+        num_missing = 0
+        unique_destinations: dict[Path, int] = defaultdict(int)  # src_path -> count of unique destinations
+
+        for info in self.file_col_info:
+            col, index = info
+            # we may need to store this image
+            if row.prepare_col_val_for_save(index, col):
+                row.file_urls[index] = row.save_media_to_temp(index, col)
+
+            url = row.file_urls[index]
+            if url is None:
+                # nothing to do
+                continue
+
+            assert row.excs[index] is None
+            assert col.col_type.is_media_type()
+
+            destination = info.col.destination
+            soa = None if destination is None else ObjectPath.parse_object_storage_addr(destination, False)
+            if (
+                soa is not None
+                and soa.storage_target == StorageTarget.LOCAL_STORE
+                and LocalStore(soa).resolve_url(url) is not None
+            ):
+                # A local non-default destination was specified, and the url already points there
+                continue
+
+            src_path = LocalStore.file_url_to_path(url)
+            if src_path is None:
+                # The url does not point to a local file, do not attempt to copy/move it
+                continue
+
+            if destination is None and not TempStore.contains_path(src_path):
+                # Do not copy local file URLs to the LocalStore
+                continue
+
+            work_designator = ObjectStoreSaveNode.WorkDesignator(str(src_path), destination)
+            locations = self.in_flight_work.get(work_designator)
+            if locations is not None:
+                # we've already seen this
+                locations.append((row, info))
+                num_missing += 1
+                continue
+
+            work_item = ObjectStoreSaveNode.WorkItem(src_path, destination, info)
+            row_to_do.append(work_item)
+            self.in_flight_work[work_designator] = [(row, info)]
+            num_missing += 1
+            unique_destinations[src_path] += 1
+
+        # Update work items to reflect the number of unique destinations
+        new_to_do = []
+        for work_item in row_to_do:
+            if unique_destinations[work_item.src_path] == 1 and TempStore.contains_path(work_item.src_path):
+                new_to_do.append(work_item)
+            else:
+                new_to_do.append(
+                    ObjectStoreSaveNode.WorkItem(
+                        work_item.src_path,
+                        work_item.destination,
+                        work_item.info,
+                        destination_count=unique_destinations[work_item.src_path] + 1,
+                        # +1 for the TempStore destination
+                    )
+                )
+        delete_destinations = [k for k, v in unique_destinations.items() if v > 1 and TempStore.contains_path(k)]
+        row_to_do = new_to_do
+
+        if len(row_to_do) > 0:
+            self.in_flight_rows[id(row)] = self.RowState(
+                row, row_idx, num_missing, delete_destinations=delete_destinations
+            )
+        else:
+            self.__add_ready_row(row, row_idx)
+        return row_to_do
+
+    def __process_input_batch(self, input_batch: DataRowBatch, executor: futures.ThreadPoolExecutor) -> None:
+        """Process a batch of input rows, submitting temporary files for upload"""
+        work_to_do: list[ObjectStoreSaveNode.WorkItem] = []
+
+        for row in input_batch:
+            row_to_do = self.__process_input_row(row)
+            if len(row_to_do) > 0:
+                work_to_do.extend(row_to_do)
+
+        for work_item in work_to_do:
+            f = executor.submit(self.__persist_media_file, work_item)
+            self.in_flight_requests[f] = ObjectStoreSaveNode.WorkDesignator(
+                str(work_item.src_path), work_item.destination
+            )
+            _logger.debug(f'submitted {work_item}')
+
+    def __persist_media_file(self, work_item: WorkItem) -> tuple[Optional[str], Optional[Exception]]:
+        """Move data from the TempStore to another location"""
+        src_path = work_item.src_path
+        col = work_item.info.col
+        assert col.destination == work_item.destination
+        try:
+            new_file_url = ObjectOps.put_file(col, src_path, work_item.destination_count == 1)
+            return new_file_url, None
+        except Exception as e:
+            _logger.debug(f'Failed to move/copy {src_path}: {e}', exc_info=e)
+            return None, e
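The heart of the new node is the bounded producer/consumer loop in __aiter__: work is submitted to a thread pool until the number of in-flight futures reaches QUEUE_DEPTH_HIGH_WATER, then completions are drained until the count falls below QUEUE_DEPTH_LOW_WATER (or to zero once input is exhausted). A minimal standalone sketch of that watermark pattern; the upload stub and all names here are illustrative, not Pixeltable APIs:

import time
from concurrent import futures

HIGH_WATER, LOW_WATER, MAX_WORKERS = 50, 20, 15

def upload(path: str) -> str:
    time.sleep(0.01)  # stand-in for a copy/move to an object store (illustrative)
    return f's3://bucket/{path}'

def run(paths: list[str]) -> list[str]:
    results: list[str] = []
    pending: set[futures.Future] = set()
    it = iter(paths)
    exhausted = False
    with futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        while pending or not exhausted:
            # fill the queue up to the high-water mark
            while not exhausted and len(pending) < HIGH_WATER:
                path = next(it, None)
                if path is None:
                    exhausted = True
                else:
                    pending.add(executor.submit(upload, path))
            # drain completions until submissions may resume (fully, once input is done)
            while len(pending) > LOW_WATER or (exhausted and pending):
                done, pending = futures.wait(pending, return_when=futures.FIRST_COMPLETED)
                results.extend(f.result() for f in done)
    return results

assert len(run([f'file_{i}.jpg' for i in range(200)])) == 200

Keeping the pool between the two watermarks amortizes submission overhead while bounding memory; the node additionally reassembles completed rows in input order via its ready_rows deque.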
pixeltable/exec/sql_node.py
CHANGED
@@ -71,6 +71,13 @@ class SqlNode(ExecNode):
     If set_pk is True, they are added to the end of the result set when creating the SQL statement
     so they can always be referenced as cols[-num_pk_cols:] in the result set.
     The pk_columns consist of the rowid columns of the target table followed by the version number.
+
+    If row_builder contains references to unstored iter columns, expands the select list to include their
+    SQL-materializable subexpressions.
+
+    Args:
+        select_list: output of the query
+        set_pk: if True, sets the primary key for each DataRow
     """

     tbl: Optional[catalog.TableVersionPath]
@@ -97,14 +104,6 @@ class SqlNode(ExecNode):
         sql_elements: exprs.SqlElementCache,
         set_pk: bool = False,
     ):
-        """
-        If row_builder contains references to unstored iter columns, expands the select list to include their
-        SQL-materializable subexpressions.
-
-        Args:
-            select_list: output of the query
-            set_pk: if True, sets the primary key for each DataRow
-        """
         # create Select stmt
         self.sql_elements = sql_elements
         self.tbl = tbl
@@ -374,6 +373,11 @@ class SqlScanNode(SqlNode):
     Materializes data from the store via a Select stmt.

     Supports filtering and ordering.
+
+    Args:
+        select_list: output of the query
+        set_pk: if True, sets the primary key for each DataRow
+        exact_version_only: tables for which we only want to see rows created at the current version
     """

     exact_version_only: list[catalog.TableVersionHandle]
@@ -386,12 +390,6 @@ class SqlScanNode(SqlNode):
         set_pk: bool = False,
         exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
     ):
-        """
-        Args:
-            select_list: output of the query
-            set_pk: if True, sets the primary key for each DataRow
-            exact_version_only: tables for which we only want to see rows created at the current version
-        """
         sql_elements = exprs.SqlElementCache()
         super().__init__(tbl, row_builder, select_list, sql_elements, set_pk=set_pk)
         # create Select stmt
@@ -413,6 +411,11 @@ class SqlScanNode(SqlNode):
 class SqlLookupNode(SqlNode):
     """
     Materializes data from the store via a Select stmt with a WHERE clause that matches a list of key values
+
+    Args:
+        select_list: output of the query
+        sa_key_cols: list of key columns in the store table
+        key_vals: list of key values to look up
     """

     def __init__(
@@ -423,12 +426,6 @@ class SqlLookupNode(SqlNode):
         sa_key_cols: list[sql.Column],
         key_vals: list[tuple],
     ):
-        """
-        Args:
-            select_list: output of the query
-            sa_key_cols: list of key columns in the store table
-            key_vals: list of key values to look up
-        """
         sql_elements = exprs.SqlElementCache()
         super().__init__(tbl, row_builder, select_list, sql_elements, set_pk=True)
         # Where clause: (key-col-1, key-col-2, ...) IN ((val-1, val-2, ...), ...)
@@ -444,6 +441,11 @@ class SqlAggregationNode(SqlNode):
 class SqlAggregationNode(SqlNode):
     """
     Materializes data from the store via a Select stmt with a WHERE clause that matches a list of key values
+
+    Args:
+        select_list: can contain calls to AggregateFunctions
+        group_by_items: list of expressions to group by
+        limit: max number of rows to return; None = no limit
     """

     group_by_items: Optional[list[exprs.Expr]]
@@ -458,12 +460,6 @@ class SqlAggregationNode(SqlNode):
         limit: Optional[int] = None,
         exact_version_only: Optional[list[catalog.TableVersion]] = None,
     ):
-        """
-        Args:
-            select_list: can contain calls to AggregateFunctions
-            group_by_items: list of expressions to group by
-            limit: max number of rows to return; None = no limit
-        """
         self.input_cte, input_col_map = input.to_cte()
         sql_elements = exprs.SqlElementCache(input_col_map)
         super().__init__(None, row_builder, select_list, sql_elements)
@@ -529,6 +525,12 @@ class SqlJoinNode(SqlNode):
 class SqlSampleNode(SqlNode):
     """
     Returns rows sampled from the input node.
+
+    Args:
+        input: SqlNode to sample from
+        select_list: can contain calls to AggregateFunctions
+        sample_clause: specifies the sampling method
+        stratify_exprs: Analyzer-processed list of expressions to stratify by
     """

     input_cte: Optional[sql.CTE]
@@ -544,13 +546,6 @@ class SqlSampleNode(SqlNode):
         sample_clause: 'SampleClause',
         stratify_exprs: list[exprs.Expr],
     ):
-        """
-        Args:
-            input: SqlNode to sample from
-            select_list: can contain calls to AggregateFunctions
-            sample_clause: specifies the sampling method
-            stratify_exprs: Analyzer-processed list of expressions to stratify by
-        """
         assert isinstance(input, SqlNode)
         self.input_cte, input_col_map = input.to_cte(keep_pk=True)
         self.pk_count = input.num_pk_cols
pixeltable/exprs/data_row.py
CHANGED
@@ -14,7 +14,7 @@ import PIL.Image
 import sqlalchemy as sql

 from pixeltable import catalog, env
-from pixeltable.utils.media_store import MediaStore
+from pixeltable.utils.local_store import TempStore


 class DataRow:
@@ -257,42 +257,48 @@ class DataRow:
         self.vals[idx] = val
         self.has_val[idx] = True

-    def
-        """
+    def prepare_col_val_for_save(self, index: int, col: Optional[catalog.Column] = None) -> bool:
+        """
+        Prepare to save a column's value into the appropriate store. Discard unneeded values.
+
+        Return:
+            True if the media object in the column needs to be saved.
+        """
         if self.vals[index] is None:
-            return
+            return False
+
+        if self.file_urls[index] is not None:
+            return False
+
         assert self.excs[index] is None
         if self.file_paths[index] is None:
             if col is not None:
-
-
-                if isinstance(image, PIL.Image.Image):
-                    # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
-                    # In that case, use WebP instead.
-                    format = 'webp' if image.has_transparency_data else 'jpeg'
-                    filepath, url = MediaStore.get().save_media_object(image, col, format=format)
-                    self.file_paths[index] = str(filepath)
-                    self.file_urls[index] = url
+                # This is a media object that needs to be saved
+                return True
             else:
-                # we
+                # This is a media object that we don't care about, so we discard it
                 self.has_val[index] = False
         else:
             # we already have a file for this image, nothing left to do
             pass
+
         self.vals[index] = None
+        return False

-    def
-        """
-
-        return
-        assert self.excs[index] is None
+    def save_media_to_temp(self, index: int, col: catalog.Column) -> str:
+        """Save the media object in the column to the TempStore.
+        Objects cannot be saved directly to general destinations."""
         assert col.col_type.is_media_type()
-
-
-
-
-
-
+        val = self.vals[index]
+        format = None
+        if isinstance(val, PIL.Image.Image):
+            # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
+            # In that case, use WebP instead.
+            format = 'webp' if val.has_transparency_data else 'jpeg'
+        filepath, url = TempStore.save_media_object(val, col, format=format)
+        self.file_paths[index] = str(filepath) if filepath is not None else None
+        self.vals[index] = None
+        return url

     @property
     def rowid(self) -> tuple[int, ...]:
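save_media_to_temp keeps the 0.4.12 format rule: default to JPEG, but switch to WebP when the image carries transparency, since JPEG has no alpha channel. A quick self-contained check of that rule (assumes Pillow >= 10.1, which introduced Image.has_transparency_data):

import PIL.Image

def pick_format(img: PIL.Image.Image) -> str:
    # JPEG cannot store an alpha channel, so images with transparency go to WebP
    return 'webp' if img.has_transparency_data else 'jpeg'

print(pick_format(PIL.Image.new('RGB', (4, 4))))   # -> jpeg
print(pick_format(PIL.Image.new('RGBA', (4, 4))))  # -> webp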
pixeltable/exprs/json_path.py
CHANGED
@@ -17,14 +17,15 @@ from .sql_element_cache import SqlElementCache


 class JsonPath(Expr):
+    """
+    anchor can be None, in which case this is a relative JsonPath and the anchor is set later via set_anchor().
+    scope_idx: for relative paths, index of referenced JsonMapper
+        (0: indicates the immediately preceding JsonMapper, -1: the parent of the immediately preceding mapper, ...)
+    """
+
     def __init__(
         self, anchor: Optional[Expr], path_elements: Optional[list[str | int | slice]] = None, scope_idx: int = 0
     ) -> None:
-        """
-        anchor can be None, in which case this is a relative JsonPath and the anchor is set later via set_anchor().
-        scope_idx: for relative paths, index of referenced JsonMapper
-            (0: indicates the immediately preceding JsonMapper, -1: the parent of the immediately preceding mapper, ...)
-        """
         if path_elements is None:
             path_elements = []
         super().__init__(ts.JsonType(nullable=True))  # JsonPath expressions are always nullable
pixeltable/exprs/row_builder.py
CHANGED
@@ -48,6 +48,12 @@ class RowBuilder:

     For ColumnRefs to unstored iterator columns:
     - in order for them to be executable, we also record the iterator args and pass them to the ColumnRef
+
+    Args:
+        output_exprs: list of Exprs to be evaluated
+        columns: list of columns to be materialized
+        input_exprs: list of Exprs that are excluded from evaluation (because they're already materialized)
+        TODO: enforce that output_exprs doesn't overlap with input_exprs?
     """

     unique_exprs: ExprSet
@@ -105,13 +111,6 @@ class RowBuilder:
         input_exprs: Iterable[Expr],
         tbl: Optional[catalog.TableVersion] = None,
     ):
-        """
-        Args:
-            output_exprs: list of Exprs to be evaluated
-            columns: list of columns to be materialized
-            input_exprs: list of Exprs that are excluded from evaluation (because they're already materialized)
-            TODO: enforce that output_exprs doesn't overlap with input_exprs?
-        """
         self.unique_exprs: ExprSet[Expr] = ExprSet()  # dependencies precede their dependents
         self.next_slot_idx = 0
         self.stored_img_cols = []
@@ -474,11 +473,6 @@ class RowBuilder:
                 # exceptions get stored in the errortype/-msg properties of the cellmd column
                 table_row.append(ColumnPropertyRef.create_cellmd_exc(exc))
             else:
-                if col.col_type.is_media_type():
-                    if col.col_type.is_image_type() and data_row.file_urls[slot_idx] is None:
-                        # we have yet to store this image
-                        data_row.flush_img(slot_idx, col)
-                    data_row.move_tmp_media_file(slot_idx, col)
                 val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
                 table_row.append(val)
                 if col.stores_cellmd:
pixeltable/functions/gemini.py
CHANGED
@@ -15,7 +15,7 @@ import PIL.Image
 import pixeltable as pxt
 from pixeltable import env, exceptions as excs, exprs
 from pixeltable.utils.code import local_public_names
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore

 if TYPE_CHECKING:
     from google import genai
pixeltable/functions/openai.py
CHANGED
@@ -23,7 +23,7 @@ import pixeltable as pxt
 from pixeltable import env, exprs, type_system as ts
 from pixeltable.func import Batch, Tools
 from pixeltable.utils.code import local_public_names
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore

 if TYPE_CHECKING:
     import openai
pixeltable/functions/video.py
CHANGED
@@ -17,7 +17,7 @@ import pixeltable as pxt
 import pixeltable.utils.av as av_utils
 from pixeltable.env import Env
 from pixeltable.utils.code import local_public_names
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore

 _logger = logging.getLogger('pixeltable')
 _format_defaults: dict[str, tuple[str, str]] = {  # format -> (codec, ext)
@@ -49,6 +49,10 @@ class make_video(pxt.Aggregator):
     """
     Aggregator that creates a video from a sequence of images, using the default video encoder and yuv420p pixel format.

+    Follows https://pyav.org/docs/develop/cookbook/numpy.html#generating-video
+
+    TODO: provide parameters for video_encoder and pix_fmt
+
     Args:
         fps: Frames per second for the output video.

@@ -98,11 +102,6 @@ class make_video(pxt.Aggregator):
     fps: int

     def __init__(self, fps: int = 25):
-        """
-        Follows https://pyav.org/docs/develop/cookbook/numpy.html#generating-video
-
-        TODO: provide parameters for video_encoder and pix_fmt
-        """
         self.container = None
         self.stream = None
         self.fps = fps
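make_video's docstring now points at the PyAV cookbook recipe it follows. Condensed, that recipe looks like the sketch below; the output name, frame count, and random frames are illustrative, and mpeg4/yuv420p mirror the cookbook's choices:

import av
import numpy as np

container = av.open('out.mp4', mode='w')
stream = container.add_stream('mpeg4', rate=25)
stream.width, stream.height, stream.pix_fmt = 320, 240, 'yuv420p'
for _ in range(50):
    img = np.random.randint(0, 256, (240, 320, 3), dtype=np.uint8)
    frame = av.VideoFrame.from_ndarray(img, format='rgb24')
    for packet in stream.encode(frame):  # encode may buffer; mux whatever it emits
        container.mux(packet)
for packet in stream.encode():  # flush buffered frames
    container.mux(packet)
container.close()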
pixeltable/globals.py
CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
 import logging
 import os
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Iterable,
+from typing import TYPE_CHECKING, Any, Iterable, Literal, NamedTuple, Optional, Union

 import pandas as pd
 import pydantic
@@ -24,9 +24,8 @@ if TYPE_CHECKING:
     str,
     os.PathLike,
     Path,  # OS paths, filenames, URLs
-
-
-    Sequence[pydantic.BaseModel],  # list of Pydantic models
+    Iterable[dict[str, Any]],  # dictionaries of values
+    Iterable[pydantic.BaseModel],  # Pydantic model instances
     DataFrame,  # Pixeltable DataFrame
     pd.DataFrame,  # pandas DataFrame
     datasets.Dataset,
@@ -764,15 +763,15 @@ def ls(path: str = '') -> pd.DataFrame:
         base = md['base'] or ''
         if base.startswith('_'):
             base = '<anonymous base table>'
-        if md['is_snapshot']:
+        if md['is_replica']:
+            kind = 'replica'
+        elif md['is_snapshot']:
             kind = 'snapshot'
         elif md['is_view']:
             kind = 'view'
         else:
             kind = 'table'
         version = '' if kind == 'snapshot' else str(md['version'])
-        if md['is_replica']:
-            kind = f'{kind}-replica'
         rows.append([name, kind, version, base])
     return rows

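The ls() change promotes replicas to a kind of their own: 0.4.12 appended a '-replica' suffix (e.g. 'snapshot-replica'), while 0.4.14 reports just 'replica'. The new classification, condensed (md stands in for the table-metadata dict):

def kind_of(md: dict) -> str:
    # 0.4.14: a replica is its own kind, not a '<kind>-replica' suffix
    if md['is_replica']:
        return 'replica'
    if md['is_snapshot']:
        return 'snapshot'
    if md['is_view']:
        return 'view'
    return 'table'

assert kind_of({'is_replica': True, 'is_snapshot': True, 'is_view': False}) == 'replica'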