pixeltable 0.4.12__py3-none-any.whl → 0.4.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic.

Files changed (55)
  1. pixeltable/__init__.py +11 -1
  2. pixeltable/catalog/__init__.py +2 -1
  3. pixeltable/catalog/catalog.py +179 -63
  4. pixeltable/catalog/column.py +24 -20
  5. pixeltable/catalog/table.py +96 -124
  6. pixeltable/catalog/table_metadata.py +96 -0
  7. pixeltable/catalog/table_version.py +15 -6
  8. pixeltable/catalog/view.py +22 -22
  9. pixeltable/config.py +2 -0
  10. pixeltable/dataframe.py +3 -2
  11. pixeltable/env.py +43 -21
  12. pixeltable/exec/__init__.py +1 -0
  13. pixeltable/exec/aggregation_node.py +0 -1
  14. pixeltable/exec/cache_prefetch_node.py +74 -98
  15. pixeltable/exec/data_row_batch.py +2 -18
  16. pixeltable/exec/in_memory_data_node.py +1 -1
  17. pixeltable/exec/object_store_save_node.py +299 -0
  18. pixeltable/exec/sql_node.py +28 -33
  19. pixeltable/exprs/data_row.py +31 -25
  20. pixeltable/exprs/json_path.py +6 -5
  21. pixeltable/exprs/row_builder.py +6 -12
  22. pixeltable/functions/gemini.py +1 -1
  23. pixeltable/functions/openai.py +1 -1
  24. pixeltable/functions/video.py +5 -6
  25. pixeltable/globals.py +6 -7
  26. pixeltable/index/embedding_index.py +5 -8
  27. pixeltable/io/__init__.py +2 -1
  28. pixeltable/io/fiftyone.py +1 -1
  29. pixeltable/io/label_studio.py +4 -5
  30. pixeltable/io/lancedb.py +3 -0
  31. pixeltable/io/parquet.py +9 -89
  32. pixeltable/io/table_data_conduit.py +2 -2
  33. pixeltable/iterators/audio.py +1 -1
  34. pixeltable/iterators/document.py +10 -12
  35. pixeltable/iterators/video.py +1 -1
  36. pixeltable/metadata/schema.py +7 -0
  37. pixeltable/plan.py +26 -1
  38. pixeltable/share/packager.py +8 -2
  39. pixeltable/share/publish.py +3 -9
  40. pixeltable/type_system.py +1 -3
  41. pixeltable/utils/arrow.py +97 -2
  42. pixeltable/utils/dbms.py +31 -5
  43. pixeltable/utils/gcs_store.py +283 -0
  44. pixeltable/utils/lancedb.py +88 -0
  45. pixeltable/utils/local_store.py +316 -0
  46. pixeltable/utils/object_stores.py +497 -0
  47. pixeltable/utils/pytorch.py +5 -6
  48. pixeltable/utils/s3_store.py +354 -0
  49. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/METADATA +162 -127
  50. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/RECORD +53 -47
  51. pixeltable/utils/media_store.py +0 -248
  52. pixeltable/utils/s3.py +0 -17
  53. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/WHEEL +0 -0
  54. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/entry_points.txt +0 -0
  55. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/licenses/LICENSE +0 -0
pixeltable/exec/object_store_save_node.py ADDED
@@ -0,0 +1,299 @@
+ from __future__ import annotations
+
+ import dataclasses
+ import itertools
+ import logging
+ from collections import defaultdict, deque
+ from concurrent import futures
+ from pathlib import Path
+ from typing import AsyncIterator, Iterator, NamedTuple, Optional
+ from uuid import UUID
+
+ from pixeltable import exprs
+ from pixeltable.utils.object_stores import ObjectOps, ObjectPath, StorageTarget
+
+ from .data_row_batch import DataRowBatch
+ from .exec_node import ExecNode
+
+ _logger = logging.getLogger('pixeltable')
+
+
+ class ObjectStoreSaveNode(ExecNode):
+ """Save files into designated object store(s).
+
+ Each row may have multiple files that need to be saved to a destination.
+ Each file may be referenced by more than one column in the row.
+ Each file may have multiple destinations, e.g., S3 bucket and local file system.
+ If there are multiple destinations, the file cannot be moved to any destination
+ until it has been copied to all of the other destinations.
+ Diagrammatically:
+ Row -> [src_path1, src_path2, ...]
+ src_path -> [dest1, dest2, ...]
+ dest1: [row_location1, row_location2, ...]
+ Paths with multiple destinations are removed from the TempStore only after all destination copies are complete.
+
+ TODO:
+ - Process a row at a time and limit the number of in-flight rows to control memory usage
+ """
+
+ QUEUE_DEPTH_HIGH_WATER = 50 # target number of in-flight requests
+ QUEUE_DEPTH_LOW_WATER = 20 # target number of in-flight requests
+ BATCH_SIZE = 16
+ MAX_WORKERS = 15
+
+ class WorkDesignator(NamedTuple):
+ """Specify the source and destination for a WorkItem"""
+
+ src_path: str # source of the file to be processed
+ destination: Optional[str] # destination URI for the file to be processed
+
+ class WorkItem(NamedTuple):
+ src_path: Path
+ destination: Optional[str]
+ info: exprs.ColumnSlotIdx # column info for the file being processed
+ destination_count: int = 1 # number of unique destinations for this file
+
+ retain_input_order: bool # if True, return rows in the exact order they were received
+ file_col_info: list[exprs.ColumnSlotIdx]
+
+ # execution state
+ num_returned_rows: int
+
+ # ready_rows: rows that are ready to be returned, ordered by row idx;
+ # the implied row idx of ready_rows[0] is num_returned_rows
+ ready_rows: deque[Optional[exprs.DataRow]]
+
+ in_flight_rows: dict[int, ObjectStoreSaveNode.RowState] # rows with in-flight work; id(row) -> RowState
+ in_flight_requests: dict[
+ futures.Future, WorkDesignator
+ ] # in-flight requests to save paths: Future -> WorkDesignator
+ in_flight_work: dict[
+ WorkDesignator, list[tuple[exprs.DataRow, exprs.ColumnSlotIdx]]
+ ] # WorkDesignator -> [(row, info)]
+
+ input_finished: bool
+ row_idx: Iterator[Optional[int]]
+
+ @dataclasses.dataclass
+ class RowState:
+ row: exprs.DataRow
+ idx: Optional[int] # position in input stream; None if we don't retain input order
+ num_missing: int # number of references to media files in this row
+ delete_destinations: list[Path] # paths to delete after all copies are complete
+
+ def __init__(
+ self, tbl_id: UUID, file_col_info: list[exprs.ColumnSlotIdx], input: ExecNode, retain_input_order: bool = True
+ ):
+ # input_/output_exprs=[]: we don't have anything to evaluate
+ super().__init__(input.row_builder, [], [], input)
+ self.retain_input_order = retain_input_order
+ self.file_col_info = file_col_info
+
+ self.num_returned_rows = 0
+ self.ready_rows = deque()
+ self.in_flight_rows = {}
+ self.in_flight_requests = {}
+ self.in_flight_work = {}
+ self.input_finished = False
+ self.row_idx = itertools.count() if retain_input_order else itertools.repeat(None)
+ assert self.QUEUE_DEPTH_HIGH_WATER > self.QUEUE_DEPTH_LOW_WATER
+
+ @property
+ def queued_work(self) -> int:
+ return len(self.in_flight_requests)
+
+ async def get_input_batch(self, input_iter: AsyncIterator[DataRowBatch]) -> Optional[DataRowBatch]:
+ """Get the next batch of input rows, or None if there are no more rows"""
+ try:
+ input_batch = await anext(input_iter)
+ if input_batch is None:
+ self.input_finished = True
+ return input_batch
+ except StopAsyncIteration:
+ self.input_finished = True
+ return None
+
+ async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+ input_iter = aiter(self.input)
+ with futures.ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+ while True:
+ # Create work to fill the queue to the high water mark without overrunning the in-flight row limit
+ while not self.input_finished and self.queued_work < self.QUEUE_DEPTH_HIGH_WATER:
+ input_batch = await self.get_input_batch(input_iter)
+ if input_batch is not None:
+ self.__process_input_batch(input_batch, executor)
+
+ # Wait for enough completions to enable more queueing or if we're done
+ while self.queued_work > self.QUEUE_DEPTH_LOW_WATER or (self.input_finished and self.queued_work > 0):
+ done, _ = futures.wait(self.in_flight_requests, return_when=futures.FIRST_COMPLETED)
+ self.__process_completions(done, ignore_errors=self.ctx.ignore_errors)
+
+ # Emit results to meet batch size requirements or empty the in-flight row queue
+ if self.__has_ready_batch() or (
+ len(self.ready_rows) > 0 and self.input_finished and self.queued_work == 0
+ ):
+ # create DataRowBatch from the first BATCH_SIZE ready rows
+ batch = DataRowBatch(self.row_builder)
+ rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
+ for row in rows:
+ assert row is not None
+ batch.add_row(row)
+ self.num_returned_rows += len(rows)
+ _logger.debug(f'returning {len(rows)} rows')
+ yield batch
+
+ if self.input_finished and self.queued_work == 0 and len(self.ready_rows) == 0:
+ return
+
+ def __has_ready_batch(self) -> bool:
+ """True if there are >= BATCH_SIZE entries in ready_rows and the first BATCH_SIZE ones are all non-None"""
+ return (
+ sum(int(row is not None) for row in itertools.islice(self.ready_rows, self.BATCH_SIZE)) == self.BATCH_SIZE
+ )
+
+ def __add_ready_row(self, row: exprs.DataRow, row_idx: Optional[int]) -> None:
+ if row_idx is None:
+ self.ready_rows.append(row)
+ else:
+ # extend ready_rows to accommodate row_idx
+ idx = row_idx - self.num_returned_rows
+ if idx >= len(self.ready_rows):
+ self.ready_rows.extend([None] * (idx - len(self.ready_rows) + 1))
+ self.ready_rows[idx] = row
+
+ def __process_completions(self, done: set[futures.Future], ignore_errors: bool) -> None:
+ from pixeltable.utils.local_store import TempStore
+
+ for f in done:
+ work_designator = self.in_flight_requests.pop(f)
+ new_file_url, exc = f.result()
+ if exc is not None and not ignore_errors:
+ raise exc
+ assert new_file_url is not None
+
+ # add the local path/exception to the slots that reference the url
+ for row, info in self.in_flight_work.pop(work_designator):
+ if exc is not None:
+ self.row_builder.set_exc(row, info.slot_idx, exc)
+ else:
+ row.file_urls[info.slot_idx] = new_file_url
+
+ state = self.in_flight_rows[id(row)]
+ state.num_missing -= 1
+ if state.num_missing == 0:
+ # All operations for this row are complete. Delete all files which had multiple destinations
+ for src_path in state.delete_destinations:
+ TempStore.delete_media_file(src_path)
+ del self.in_flight_rows[id(row)]
+ self.__add_ready_row(row, state.idx)
+
+ def __process_input_row(self, row: exprs.DataRow) -> list[ObjectStoreSaveNode.WorkItem]:
+ """Process a single input row, generating a list of work"""
+ from pixeltable.utils.local_store import LocalStore, TempStore
+
+ # Create a list of work to do for media storage in this row
+ row_idx = next(self.row_idx)
+ row_to_do: list[ObjectStoreSaveNode.WorkItem] = []
+ num_missing = 0
+ unique_destinations: dict[Path, int] = defaultdict(int) # destination -> count of unique destinations
+
+ for info in self.file_col_info:
+ col, index = info
+ # we may need to store this image
+ if row.prepare_col_val_for_save(index, col):
+ row.file_urls[index] = row.save_media_to_temp(index, col)
+
+ url = row.file_urls[index]
+ if url is None:
+ # nothing to do
+ continue
+
+ assert row.excs[index] is None
+ assert col.col_type.is_media_type()
+
+ destination = info.col.destination
+ soa = None if destination is None else ObjectPath.parse_object_storage_addr(destination, False)
+ if (
+ soa is not None
+ and soa.storage_target == StorageTarget.LOCAL_STORE
+ and LocalStore(soa).resolve_url(url) is not None
+ ):
+ # A local non-default destination was specified, and the url already points there
+ continue
+
+ src_path = LocalStore.file_url_to_path(url)
+ if src_path is None:
+ # The url does not point to a local file, do not attempt to copy/move it
+ continue
+
+ if destination is None and not TempStore.contains_path(src_path):
+ # Do not copy local file URLs to the LocalStore
+ continue
+
+ work_designator = ObjectStoreSaveNode.WorkDesignator(str(src_path), destination)
+ locations = self.in_flight_work.get(work_designator)
+ if locations is not None:
+ # we've already seen this
+ locations.append((row, info))
+ num_missing += 1
+ continue
+
+ work_item = ObjectStoreSaveNode.WorkItem(src_path, destination, info)
+ row_to_do.append(work_item)
+ self.in_flight_work[work_designator] = [(row, info)]
+ num_missing += 1
+ unique_destinations[src_path] += 1
+
+ # Update work items to reflect the number of unique destinations
+ new_to_do = []
+ for work_item in row_to_do:
+ if unique_destinations[work_item.src_path] == 1 and TempStore.contains_path(work_item.src_path):
+ new_to_do.append(work_item)
+ else:
+ new_to_do.append(
+ ObjectStoreSaveNode.WorkItem(
+ work_item.src_path,
+ work_item.destination,
+ work_item.info,
+ destination_count=unique_destinations[work_item.src_path] + 1,
+ # +1 for the TempStore destination
+ )
+ )
+ delete_destinations = [k for k, v in unique_destinations.items() if v > 1 and TempStore.contains_path(k)]
+ row_to_do = new_to_do
+
+ if len(row_to_do) > 0:
+ self.in_flight_rows[id(row)] = self.RowState(
+ row, row_idx, num_missing, delete_destinations=delete_destinations
+ )
+ else:
+ self.__add_ready_row(row, row_idx)
+ return row_to_do
+
+ def __process_input_batch(self, input_batch: DataRowBatch, executor: futures.ThreadPoolExecutor) -> None:
+ """Process a batch of input rows, submitting temporary files for upload"""
+ work_to_do: list[ObjectStoreSaveNode.WorkItem] = []
+
+ for row in input_batch:
+ row_to_do = self.__process_input_row(row)
+ if len(row_to_do) > 0:
+ work_to_do.extend(row_to_do)
+
+ for work_item in work_to_do:
+ f = executor.submit(self.__persist_media_file, work_item)
+ self.in_flight_requests[f] = ObjectStoreSaveNode.WorkDesignator(
+ str(work_item.src_path), work_item.destination
+ )
+ _logger.debug(f'submitted {work_item}')
+
+ def __persist_media_file(self, work_item: WorkItem) -> tuple[Optional[str], Optional[Exception]]:
+ """Move data from the TempStore to another location"""
+ src_path = work_item.src_path
+ col = work_item.info.col
+ assert col.destination == work_item.destination
+ try:
+ new_file_url = ObjectOps.put_file(col, src_path, work_item.destination_count == 1)
+ return new_file_url, None
+ except Exception as e:
+ _logger.debug(f'Failed to move/copy {src_path}: {e}', exc_info=e)
+ return None, e
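
Note on the pattern above: ObjectStoreSaveNode.__aiter__ keeps between QUEUE_DEPTH_LOW_WATER and QUEUE_DEPTH_HIGH_WATER uploads in flight by alternating between submitting work to a ThreadPoolExecutor and draining completed futures with futures.wait(..., FIRST_COMPLETED). A minimal, self-contained sketch of that high/low water-mark loop (illustrative only; upload(), run(), and the constants below are stand-ins, not pixeltable code):

# Sketch of the high/low water-mark submission loop; upload() stands in for any blocking task.
import concurrent.futures as futures
import time

HIGH_WATER, LOW_WATER = 50, 20

def upload(item: int) -> int:
    time.sleep(0.01)  # stand-in for the actual file copy/move
    return item

def run(items: list[int]) -> list[int]:
    results: list[int] = []
    pending: dict[futures.Future, int] = {}  # analogous to in_flight_requests
    it = iter(items)
    done_feeding = False
    with futures.ThreadPoolExecutor(max_workers=15) as executor:
        while True:
            # fill the queue up to the high water mark
            while not done_feeding and len(pending) < HIGH_WATER:
                try:
                    item = next(it)
                except StopIteration:
                    done_feeding = True
                    break
                pending[executor.submit(upload, item)] = item
            # drain until we drop below the low water mark (or everything is done)
            while len(pending) > LOW_WATER or (done_feeding and pending):
                done, _ = futures.wait(pending, return_when=futures.FIRST_COMPLETED)
                for f in done:
                    pending.pop(f)
                    results.append(f.result())
            if done_feeding and not pending:
                return results

print(len(run(list(range(200)))))  # 200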
pixeltable/exec/sql_node.py CHANGED
@@ -71,6 +71,13 @@ class SqlNode(ExecNode):
  If set_pk is True, they are added to the end of the result set when creating the SQL statement
  so they can always be referenced as cols[-num_pk_cols:] in the result set.
  The pk_columns consist of the rowid columns of the target table followed by the version number.
+
+ If row_builder contains references to unstored iter columns, expands the select list to include their
+ SQL-materializable subexpressions.
+
+ Args:
+ select_list: output of the query
+ set_pk: if True, sets the primary for each DataRow
  """

  tbl: Optional[catalog.TableVersionPath]
@@ -97,14 +104,6 @@ class SqlNode(ExecNode):
  sql_elements: exprs.SqlElementCache,
  set_pk: bool = False,
  ):
- """
- If row_builder contains references to unstored iter columns, expands the select list to include their
- SQL-materializable subexpressions.
-
- Args:
- select_list: output of the query
- set_pk: if True, sets the primary for each DataRow
- """
  # create Select stmt
  self.sql_elements = sql_elements
  self.tbl = tbl
@@ -374,6 +373,11 @@ class SqlScanNode(SqlNode):
  Materializes data from the store via a Select stmt.

  Supports filtering and ordering.
+
+ Args:
+ select_list: output of the query
+ set_pk: if True, sets the primary for each DataRow
+ exact_version_only: tables for which we only want to see rows created at the current version
  """

  exact_version_only: list[catalog.TableVersionHandle]
@@ -386,12 +390,6 @@
  set_pk: bool = False,
  exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
  ):
- """
- Args:
- select_list: output of the query
- set_pk: if True, sets the primary for each DataRow
- exact_version_only: tables for which we only want to see rows created at the current version
- """
  sql_elements = exprs.SqlElementCache()
  super().__init__(tbl, row_builder, select_list, sql_elements, set_pk=set_pk)
  # create Select stmt
@@ -413,6 +411,11 @@
  class SqlLookupNode(SqlNode):
  """
  Materializes data from the store via a Select stmt with a WHERE clause that matches a list of key values
+
+ Args:
+ select_list: output of the query
+ sa_key_cols: list of key columns in the store table
+ key_vals: list of key values to look up
  """

  def __init__(
@@ -423,12 +426,6 @@ class SqlLookupNode(SqlNode):
  sa_key_cols: list[sql.Column],
  key_vals: list[tuple],
  ):
- """
- Args:
- select_list: output of the query
- sa_key_cols: list of key columns in the store table
- key_vals: list of key values to look up
- """
  sql_elements = exprs.SqlElementCache()
  super().__init__(tbl, row_builder, select_list, sql_elements, set_pk=True)
  # Where clause: (key-col-1, key-col-2, ...) IN ((val-1, val-2, ...), ...)
@@ -444,6 +441,11 @@
  class SqlAggregationNode(SqlNode):
  """
  Materializes data from the store via a Select stmt with a WHERE clause that matches a list of key values
+
+ Args:
+ select_list: can contain calls to AggregateFunctions
+ group_by_items: list of expressions to group by
+ limit: max number of rows to return: None = no limit
  """

  group_by_items: Optional[list[exprs.Expr]]
@@ -458,12 +460,6 @@
  limit: Optional[int] = None,
  exact_version_only: Optional[list[catalog.TableVersion]] = None,
  ):
- """
- Args:
- select_list: can contain calls to AggregateFunctions
- group_by_items: list of expressions to group by
- limit: max number of rows to return: None = no limit
- """
  self.input_cte, input_col_map = input.to_cte()
  sql_elements = exprs.SqlElementCache(input_col_map)
  super().__init__(None, row_builder, select_list, sql_elements)
@@ -529,6 +525,12 @@ class SqlJoinNode(SqlNode):
  class SqlSampleNode(SqlNode):
  """
  Returns rows sampled from the input node.
+
+ Args:
+ input: SqlNode to sample from
+ select_list: can contain calls to AggregateFunctions
+ sample_clause: specifies the sampling method
+ stratify_exprs: Analyzer processed list of expressions to stratify by.
  """

  input_cte: Optional[sql.CTE]
@@ -544,13 +546,6 @@
  sample_clause: 'SampleClause',
  stratify_exprs: list[exprs.Expr],
  ):
- """
- Args:
- input: SqlNode to sample from
- select_list: can contain calls to AggregateFunctions
- sample_clause: specifies the sampling method
- stratify_exprs: Analyzer processed list of expressions to stratify by.
- """
  assert isinstance(input, SqlNode)
  self.input_cte, input_col_map = input.to_cte(keep_pk=True)
  self.pk_count = input.num_pk_cols
pixeltable/exprs/data_row.py CHANGED
@@ -14,7 +14,7 @@ import PIL.Image
  import sqlalchemy as sql

  from pixeltable import catalog, env
- from pixeltable.utils.media_store import MediaStore, TempStore
+ from pixeltable.utils.local_store import TempStore


  class DataRow:
@@ -257,42 +257,48 @@ class DataRow:
  self.vals[idx] = val
  self.has_val[idx] = True

- def flush_img(self, index: int, col: Optional[catalog.Column] = None) -> None:
- """Save or discard the in-memory value (required to be a PIL.Image.Image)"""
+ def prepare_col_val_for_save(self, index: int, col: Optional[catalog.Column] = None) -> bool:
+ """
+ Prepare to save a column's value into the appropriate store. Discard unneeded values.
+
+ Return:
+ True if the media object in the column needs to be saved.
+ """
  if self.vals[index] is None:
- return
+ return False
+
+ if self.file_urls[index] is not None:
+ return False
+
  assert self.excs[index] is None
  if self.file_paths[index] is None:
  if col is not None:
- image = self.vals[index]
- format = None
- if isinstance(image, PIL.Image.Image):
- # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
- # In that case, use WebP instead.
- format = 'webp' if image.has_transparency_data else 'jpeg'
- filepath, url = MediaStore.get().save_media_object(image, col, format=format)
- self.file_paths[index] = str(filepath)
- self.file_urls[index] = url
+ # This is a media object that needs to be saved
+ return True
  else:
- # we discard the content of this cell
+ # This is a media object that we don't care about, so we discard it
  self.has_val[index] = False
  else:
  # we already have a file for this image, nothing left to do
  pass
+
  self.vals[index] = None
+ return False

- def move_tmp_media_file(self, index: int, col: catalog.Column) -> None:
- """If a media url refers to data in a temporary file, move the data to a MediaStore"""
- if self.file_urls[index] is None:
- return
- assert self.excs[index] is None
+ def save_media_to_temp(self, index: int, col: catalog.Column) -> str:
+ """Save the media object in the column to the TempStore.
+ Objects cannot be saved directly to general destinations."""
  assert col.col_type.is_media_type()
- src_path = TempStore.resolve_url(self.file_urls[index])
- if src_path is None:
- # The media url does not point to a temporary file, leave it as is
- return
- new_file_url = MediaStore.get().relocate_local_media_file(src_path, col)
- self.file_urls[index] = new_file_url
+ val = self.vals[index]
+ format = None
+ if isinstance(val, PIL.Image.Image):
+ # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
+ # In that case, use WebP instead.
+ format = 'webp' if val.has_transparency_data else 'jpeg'
+ filepath, url = TempStore.save_media_object(val, col, format=format)
+ self.file_paths[index] = str(filepath) if filepath is not None else None
+ self.vals[index] = None
+ return url

  @property
  def rowid(self) -> tuple[int, ...]:
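
For reference, the format heuristic that moved from flush_img into save_media_to_temp picks JPEG by default and falls back to WebP when the image carries transparency, since JPEG has no alpha channel. A standalone illustration of just that choice (not pixeltable API; has_transparency_data needs a recent Pillow, and save_to_temp below is a hypothetical helper):

# Illustrates the JPEG-vs-WebP decision used when materializing in-memory images.
import tempfile
from pathlib import Path

import PIL.Image

def save_to_temp(image: PIL.Image.Image) -> Path:
    # JPEG cannot store an alpha channel, so use WebP for images with transparency
    format = 'webp' if image.has_transparency_data else 'jpeg'
    fd, name = tempfile.mkstemp(suffix=f'.{format}')
    with open(fd, 'wb') as fh:
        image.save(fh, format=format)
    return Path(name)

print(save_to_temp(PIL.Image.new('RGBA', (8, 8))))  # -> .webp (alpha present)
print(save_to_temp(PIL.Image.new('RGB', (8, 8))))   # -> .jpeg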
pixeltable/exprs/json_path.py CHANGED
@@ -17,14 +17,15 @@ from .sql_element_cache import SqlElementCache


  class JsonPath(Expr):
+ """
+ anchor can be None, in which case this is a relative JsonPath and the anchor is set later via set_anchor().
+ scope_idx: for relative paths, index of referenced JsonMapper
+ (0: indicates the immediately preceding JsonMapper, -1: the parent of the immediately preceding mapper, ...)
+ """
+
  def __init__(
  self, anchor: Optional[Expr], path_elements: Optional[list[str | int | slice]] = None, scope_idx: int = 0
  ) -> None:
- """
- anchor can be None, in which case this is a relative JsonPath and the anchor is set later via set_anchor().
- scope_idx: for relative paths, index of referenced JsonMapper
- (0: indicates the immediately preceding JsonMapper, -1: the parent of the immediately preceding mapper, ...)
- """
  if path_elements is None:
  path_elements = []
  super().__init__(ts.JsonType(nullable=True)) # JsonPath expressions are always nullable
pixeltable/exprs/row_builder.py CHANGED
@@ -48,6 +48,12 @@ class RowBuilder:

  For ColumnRefs to unstored iterator columns:
  - in order for them to be executable, we also record the iterator args and pass them to the ColumnRef
+
+ Args:
+ output_exprs: list of Exprs to be evaluated
+ columns: list of columns to be materialized
+ input_exprs: list of Exprs that are excluded from evaluation (because they're already materialized)
+ TODO: enforce that output_exprs doesn't overlap with input_exprs?
  """

  unique_exprs: ExprSet
@@ -105,13 +111,6 @@ class RowBuilder:
  input_exprs: Iterable[Expr],
  tbl: Optional[catalog.TableVersion] = None,
  ):
- """
- Args:
- output_exprs: list of Exprs to be evaluated
- columns: list of columns to be materialized
- input_exprs: list of Exprs that are excluded from evaluation (because they're already materialized)
- TODO: enforce that output_exprs doesn't overlap with input_exprs?
- """
  self.unique_exprs: ExprSet[Expr] = ExprSet() # dependencies precede their dependents
  self.next_slot_idx = 0
  self.stored_img_cols = []
@@ -474,11 +473,6 @@ class RowBuilder:
  # exceptions get stored in the errortype/-msg properties of the cellmd column
  table_row.append(ColumnPropertyRef.create_cellmd_exc(exc))
  else:
- if col.col_type.is_media_type():
- if col.col_type.is_image_type() and data_row.file_urls[slot_idx] is None:
- # we have yet to store this image
- data_row.flush_img(slot_idx, col)
- data_row.move_tmp_media_file(slot_idx, col)
  val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
  table_row.append(val)
  if col.stores_cellmd:
pixeltable/functions/gemini.py CHANGED
@@ -15,7 +15,7 @@ import PIL.Image
  import pixeltable as pxt
  from pixeltable import env, exceptions as excs, exprs
  from pixeltable.utils.code import local_public_names
- from pixeltable.utils.media_store import TempStore
+ from pixeltable.utils.local_store import TempStore

  if TYPE_CHECKING:
  from google import genai
pixeltable/functions/openai.py CHANGED
@@ -23,7 +23,7 @@ import pixeltable as pxt
  from pixeltable import env, exprs, type_system as ts
  from pixeltable.func import Batch, Tools
  from pixeltable.utils.code import local_public_names
- from pixeltable.utils.media_store import TempStore
+ from pixeltable.utils.local_store import TempStore

  if TYPE_CHECKING:
  import openai
pixeltable/functions/video.py CHANGED
@@ -17,7 +17,7 @@ import pixeltable as pxt
  import pixeltable.utils.av as av_utils
  from pixeltable.env import Env
  from pixeltable.utils.code import local_public_names
- from pixeltable.utils.media_store import TempStore
+ from pixeltable.utils.local_store import TempStore

  _logger = logging.getLogger('pixeltable')
  _format_defaults: dict[str, tuple[str, str]] = { # format -> (codec, ext)
@@ -49,6 +49,10 @@ class make_video(pxt.Aggregator):
  """
  Aggregator that creates a video from a sequence of images, using the default video encoder and yuv420p pixel format.

+ Follows https://pyav.org/docs/develop/cookbook/numpy.html#generating-video
+
+ TODO: provide parameters for video_encoder and pix_fmt
+
  Args:
  fps: Frames per second for the output video.

@@ -98,11 +102,6 @@ class make_video(pxt.Aggregator):
  fps: int

  def __init__(self, fps: int = 25):
- """
- Follows https://pyav.org/docs/develop/cookbook/numpy.html#generating-video
-
- TODO: provide parameters for video_encoder and pix_fmt
- """
  self.container = None
  self.stream = None
  self.fps = fps
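
The make_video docstring now points at PyAV's "generating video" cookbook recipe. For orientation, the shape of that recipe looks roughly like the following (a sketch of the upstream PyAV example, independent of pixeltable; the encoder, frame size, and output name are arbitrary choices here):

# Rough sketch of the PyAV pattern referenced above (pyav.org cookbook, "Generating Video").
import av
import numpy as np

fps = 25
with av.open('out.mp4', mode='w') as container:
    stream = container.add_stream('mpeg4', rate=fps)
    stream.width, stream.height = 320, 240
    stream.pix_fmt = 'yuv420p'
    for i in range(fps):  # one second of synthetic frames
        img = np.full((240, 320, 3), i * 10, dtype=np.uint8)
        frame = av.VideoFrame.from_ndarray(img, format='rgb24')
        for packet in stream.encode(frame):  # the encoder may buffer frames
            container.mux(packet)
    for packet in stream.encode():  # flush buffered packets
        container.mux(packet)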
pixeltable/globals.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
  import logging
  import os
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, NamedTuple, Optional, Sequence, Union
+ from typing import TYPE_CHECKING, Any, Iterable, Literal, NamedTuple, Optional, Union

  import pandas as pd
  import pydantic
@@ -24,9 +24,8 @@ if TYPE_CHECKING:
  str,
  os.PathLike,
  Path, # OS paths, filenames, URLs
- Iterator[dict[str, Any]], # iterator producing dictionaries of values
- RowData, # list of dictionaries
- Sequence[pydantic.BaseModel], # list of Pydantic models
+ Iterable[dict[str, Any]], # dictionaries of values
+ Iterable[pydantic.BaseModel], # Pydantic model instances
  DataFrame, # Pixeltable DataFrame
  pd.DataFrame, # pandas DataFrame
  datasets.Dataset,
@@ -764,15 +763,15 @@ def ls(path: str = '') -> pd.DataFrame:
  base = md['base'] or ''
  if base.startswith('_'):
  base = '<anonymous base table>'
- if md['is_snapshot']:
+ if md['is_replica']:
+ kind = 'replica'
+ elif md['is_snapshot']:
  kind = 'snapshot'
  elif md['is_view']:
  kind = 'view'
  else:
  kind = 'table'
  version = '' if kind == 'snapshot' else str(md['version'])
- if md['is_replica']:
- kind = f'{kind}-replica'
  rows.append([name, kind, version, base])
  return rows