pixeltable 0.4.13__py3-none-any.whl → 0.4.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (55)
  1. pixeltable/__init__.py +2 -1
  2. pixeltable/catalog/catalog.py +187 -63
  3. pixeltable/catalog/column.py +24 -20
  4. pixeltable/catalog/table.py +24 -8
  5. pixeltable/catalog/table_metadata.py +1 -0
  6. pixeltable/catalog/table_version.py +16 -34
  7. pixeltable/catalog/update_status.py +12 -0
  8. pixeltable/catalog/view.py +22 -22
  9. pixeltable/config.py +2 -0
  10. pixeltable/dataframe.py +4 -2
  11. pixeltable/env.py +46 -21
  12. pixeltable/exec/__init__.py +1 -0
  13. pixeltable/exec/aggregation_node.py +0 -1
  14. pixeltable/exec/cache_prefetch_node.py +74 -98
  15. pixeltable/exec/data_row_batch.py +2 -18
  16. pixeltable/exec/expr_eval/expr_eval_node.py +11 -0
  17. pixeltable/exec/in_memory_data_node.py +1 -1
  18. pixeltable/exec/object_store_save_node.py +299 -0
  19. pixeltable/exec/sql_node.py +28 -33
  20. pixeltable/exprs/data_row.py +31 -25
  21. pixeltable/exprs/json_path.py +6 -5
  22. pixeltable/exprs/row_builder.py +6 -12
  23. pixeltable/functions/gemini.py +1 -1
  24. pixeltable/functions/openai.py +1 -1
  25. pixeltable/functions/video.py +128 -15
  26. pixeltable/functions/whisperx.py +2 -0
  27. pixeltable/functions/yolox.py +2 -0
  28. pixeltable/globals.py +49 -30
  29. pixeltable/index/embedding_index.py +5 -8
  30. pixeltable/io/__init__.py +1 -0
  31. pixeltable/io/fiftyone.py +1 -1
  32. pixeltable/io/label_studio.py +4 -5
  33. pixeltable/iterators/__init__.py +1 -0
  34. pixeltable/iterators/audio.py +1 -1
  35. pixeltable/iterators/document.py +10 -12
  36. pixeltable/iterators/video.py +1 -1
  37. pixeltable/metadata/schema.py +7 -0
  38. pixeltable/plan.py +26 -1
  39. pixeltable/share/packager.py +8 -2
  40. pixeltable/share/publish.py +3 -10
  41. pixeltable/store.py +1 -1
  42. pixeltable/type_system.py +1 -3
  43. pixeltable/utils/dbms.py +31 -5
  44. pixeltable/utils/gcs_store.py +283 -0
  45. pixeltable/utils/local_store.py +316 -0
  46. pixeltable/utils/object_stores.py +497 -0
  47. pixeltable/utils/pytorch.py +5 -6
  48. pixeltable/utils/s3_store.py +354 -0
  49. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/METADATA +1 -1
  50. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/RECORD +53 -50
  51. pixeltable/utils/media_store.py +0 -248
  52. pixeltable/utils/s3.py +0 -17
  53. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/WHEEL +0 -0
  54. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/entry_points.txt +0 -0
  55. {pixeltable-0.4.13.dist-info → pixeltable-0.4.15.dist-info}/licenses/LICENSE +0 -0
pixeltable/exec/sql_node.py CHANGED
@@ -71,6 +71,13 @@ class SqlNode(ExecNode):
     If set_pk is True, they are added to the end of the result set when creating the SQL statement
     so they can always be referenced as cols[-num_pk_cols:] in the result set.
     The pk_columns consist of the rowid columns of the target table followed by the version number.
+
+    If row_builder contains references to unstored iter columns, expands the select list to include their
+    SQL-materializable subexpressions.
+
+    Args:
+        select_list: output of the query
+        set_pk: if True, sets the primary key for each DataRow
     """

     tbl: Optional[catalog.TableVersionPath]
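
For orientation, the pk layout described in this docstring can be sketched standalone; the row contents and num_pk_cols value below are invented for illustration:

    # Data columns come first; rowid columns and the version number are appended,
    # so the pk is always the trailing slice of the result row.
    row = ('img_0001.jpg', 0.87, 42, 7, 3)  # 2 data cols, 2 rowid cols, 1 version
    num_pk_cols = 3
    data_vals = row[:-num_pk_cols]    # ('img_0001.jpg', 0.87)
    pk = row[-num_pk_cols:]           # (42, 7, 3)
    rowid, version = pk[:-1], pk[-1]  # ((42, 7), 3)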
@@ -97,14 +104,6 @@ class SqlNode(ExecNode):
         sql_elements: exprs.SqlElementCache,
         set_pk: bool = False,
     ):
-        """
-        If row_builder contains references to unstored iter columns, expands the select list to include their
-        SQL-materializable subexpressions.
-
-        Args:
-            select_list: output of the query
-            set_pk: if True, sets the primary key for each DataRow
-        """
         # create Select stmt
         self.sql_elements = sql_elements
         self.tbl = tbl
@@ -374,6 +373,11 @@ class SqlScanNode(SqlNode):
     Materializes data from the store via a Select stmt.

     Supports filtering and ordering.
+
+    Args:
+        select_list: output of the query
+        set_pk: if True, sets the primary key for each DataRow
+        exact_version_only: tables for which we only want to see rows created at the current version
     """

     exact_version_only: list[catalog.TableVersionHandle]
@@ -386,12 +390,6 @@
         set_pk: bool = False,
         exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
     ):
-        """
-        Args:
-            select_list: output of the query
-            set_pk: if True, sets the primary key for each DataRow
-            exact_version_only: tables for which we only want to see rows created at the current version
-        """
         sql_elements = exprs.SqlElementCache()
         super().__init__(tbl, row_builder, select_list, sql_elements, set_pk=set_pk)
         # create Select stmt
@@ -413,6 +411,11 @@
 class SqlLookupNode(SqlNode):
     """
     Materializes data from the store via a Select stmt with a WHERE clause that matches a list of key values
+
+    Args:
+        select_list: output of the query
+        sa_key_cols: list of key columns in the store table
+        key_vals: list of key values to look up
     """

     def __init__(
@@ -423,12 +426,6 @@ class SqlLookupNode(SqlNode):
         sa_key_cols: list[sql.Column],
         key_vals: list[tuple],
     ):
-        """
-        Args:
-            select_list: output of the query
-            sa_key_cols: list of key columns in the store table
-            key_vals: list of key values to look up
-        """
         sql_elements = exprs.SqlElementCache()
         super().__init__(tbl, row_builder, select_list, sql_elements, set_pk=True)
         # Where clause: (key-col-1, key-col-2, ...) IN ((val-1, val-2, ...), ...)
@@ -444,6 +441,11 @@
 class SqlAggregationNode(SqlNode):
     """
     Materializes data from the store via a Select stmt with a WHERE clause that matches a list of key values
+
+    Args:
+        select_list: can contain calls to AggregateFunctions
+        group_by_items: list of expressions to group by
+        limit: max number of rows to return: None = no limit
     """

     group_by_items: Optional[list[exprs.Expr]]
@@ -458,12 +460,6 @@
         limit: Optional[int] = None,
         exact_version_only: Optional[list[catalog.TableVersion]] = None,
     ):
-        """
-        Args:
-            select_list: can contain calls to AggregateFunctions
-            group_by_items: list of expressions to group by
-            limit: max number of rows to return: None = no limit
-        """
         self.input_cte, input_col_map = input.to_cte()
         sql_elements = exprs.SqlElementCache(input_col_map)
         super().__init__(None, row_builder, select_list, sql_elements)
@@ -529,6 +525,12 @@ class SqlJoinNode(SqlNode):
 class SqlSampleNode(SqlNode):
     """
     Returns rows sampled from the input node.
+
+    Args:
+        input: SqlNode to sample from
+        select_list: can contain calls to AggregateFunctions
+        sample_clause: specifies the sampling method
+        stratify_exprs: Analyzer processed list of expressions to stratify by.
     """

     input_cte: Optional[sql.CTE]
@@ -544,13 +546,6 @@
         sample_clause: 'SampleClause',
         stratify_exprs: list[exprs.Expr],
     ):
-        """
-        Args:
-            input: SqlNode to sample from
-            select_list: can contain calls to AggregateFunctions
-            sample_clause: specifies the sampling method
-            stratify_exprs: Analyzer processed list of expressions to stratify by.
-        """
         assert isinstance(input, SqlNode)
         self.input_cte, input_col_map = input.to_cte(keep_pk=True)
         self.pk_count = input.num_pk_cols
pixeltable/exprs/data_row.py CHANGED
@@ -14,7 +14,7 @@ import PIL.Image
 import sqlalchemy as sql

 from pixeltable import catalog, env
-from pixeltable.utils.media_store import MediaStore, TempStore
+from pixeltable.utils.local_store import TempStore


 class DataRow:
@@ -257,42 +257,48 @@ class DataRow:
         self.vals[idx] = val
         self.has_val[idx] = True

-    def flush_img(self, index: int, col: Optional[catalog.Column] = None) -> None:
-        """Save or discard the in-memory value (required to be a PIL.Image.Image)"""
+    def prepare_col_val_for_save(self, index: int, col: Optional[catalog.Column] = None) -> bool:
+        """
+        Prepare to save a column's value into the appropriate store. Discard unneeded values.
+
+        Return:
+            True if the media object in the column needs to be saved.
+        """
         if self.vals[index] is None:
-            return
+            return False
+
+        if self.file_urls[index] is not None:
+            return False
+
         assert self.excs[index] is None
         if self.file_paths[index] is None:
             if col is not None:
-                image = self.vals[index]
-                format = None
-                if isinstance(image, PIL.Image.Image):
-                    # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
-                    # In that case, use WebP instead.
-                    format = 'webp' if image.has_transparency_data else 'jpeg'
-                filepath, url = MediaStore.get().save_media_object(image, col, format=format)
-                self.file_paths[index] = str(filepath)
-                self.file_urls[index] = url
+                # This is a media object that needs to be saved
+                return True
             else:
-                # we discard the content of this cell
+                # This is a media object that we don't care about, so we discard it
                 self.has_val[index] = False
         else:
             # we already have a file for this image, nothing left to do
             pass
+
         self.vals[index] = None
+        return False

-    def move_tmp_media_file(self, index: int, col: catalog.Column) -> None:
-        """If a media url refers to data in a temporary file, move the data to a MediaStore"""
-        if self.file_urls[index] is None:
-            return
-        assert self.excs[index] is None
+    def save_media_to_temp(self, index: int, col: catalog.Column) -> str:
+        """Save the media object in the column to the TempStore.
+        Objects cannot be saved directly to general destinations."""
         assert col.col_type.is_media_type()
-        src_path = TempStore.resolve_url(self.file_urls[index])
-        if src_path is None:
-            # The media url does not point to a temporary file, leave it as is
-            return
-        new_file_url = MediaStore.get().relocate_local_media_file(src_path, col)
-        self.file_urls[index] = new_file_url
+        val = self.vals[index]
+        format = None
+        if isinstance(val, PIL.Image.Image):
+            # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
+            # In that case, use WebP instead.
+            format = 'webp' if val.has_transparency_data else 'jpeg'
+        filepath, url = TempStore.save_media_object(val, col, format=format)
+        self.file_paths[index] = str(filepath) if filepath is not None else None
+        self.vals[index] = None
+        return url

     @property
     def rowid(self) -> tuple[int, ...]:
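
The two new methods split media persistence into a decision step (prepare_col_val_for_save) and a local materialization step (save_media_to_temp), leaving the final upload to a downstream stage such as the new object_store_save_node. A hedged sketch of the intended call sequence; row, slot_idx, col, and upload_to_object_store are illustrative stand-ins, not names from the package:

    if row.prepare_col_val_for_save(slot_idx, col):
        # The in-memory media value must be persisted; per the docstring it can
        # only be written to the TempStore first, not directly to a destination.
        tmp_url = row.save_media_to_temp(slot_idx, col)
        # A later stage would relocate the temp file to its final object store
        # and record the resulting URL (upload_to_object_store is hypothetical).
        row.file_urls[slot_idx] = upload_to_object_store(tmp_url, col)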
pixeltable/exprs/json_path.py CHANGED
@@ -17,14 +17,15 @@ from .sql_element_cache import SqlElementCache


 class JsonPath(Expr):
+    """
+    anchor can be None, in which case this is a relative JsonPath and the anchor is set later via set_anchor().
+    scope_idx: for relative paths, index of referenced JsonMapper
+        (0: indicates the immediately preceding JsonMapper, -1: the parent of the immediately preceding mapper, ...)
+    """
+
     def __init__(
         self, anchor: Optional[Expr], path_elements: Optional[list[str | int | slice]] = None, scope_idx: int = 0
     ) -> None:
-        """
-        anchor can be None, in which case this is a relative JsonPath and the anchor is set later via set_anchor().
-        scope_idx: for relative paths, index of referenced JsonMapper
-        (0: indicates the immediately preceding JsonMapper, -1: the parent of the immediately preceding mapper, ...)
-        """
         if path_elements is None:
             path_elements = []
         super().__init__(ts.JsonType(nullable=True))  # JsonPath expressions are always nullable
pixeltable/exprs/row_builder.py CHANGED
@@ -48,6 +48,12 @@ class RowBuilder:

     For ColumnRefs to unstored iterator columns:
     - in order for them to be executable, we also record the iterator args and pass them to the ColumnRef
+
+    Args:
+        output_exprs: list of Exprs to be evaluated
+        columns: list of columns to be materialized
+        input_exprs: list of Exprs that are excluded from evaluation (because they're already materialized)
+    TODO: enforce that output_exprs doesn't overlap with input_exprs?
     """

     unique_exprs: ExprSet
@@ -105,13 +111,6 @@
         input_exprs: Iterable[Expr],
         tbl: Optional[catalog.TableVersion] = None,
     ):
-        """
-        Args:
-            output_exprs: list of Exprs to be evaluated
-            columns: list of columns to be materialized
-            input_exprs: list of Exprs that are excluded from evaluation (because they're already materialized)
-        TODO: enforce that output_exprs doesn't overlap with input_exprs?
-        """
         self.unique_exprs: ExprSet[Expr] = ExprSet()  # dependencies precede their dependents
         self.next_slot_idx = 0
         self.stored_img_cols = []
@@ -474,11 +473,6 @@
                 # exceptions get stored in the errortype/-msg properties of the cellmd column
                 table_row.append(ColumnPropertyRef.create_cellmd_exc(exc))
             else:
-                if col.col_type.is_media_type():
-                    if col.col_type.is_image_type() and data_row.file_urls[slot_idx] is None:
-                        # we have yet to store this image
-                        data_row.flush_img(slot_idx, col)
-                    data_row.move_tmp_media_file(slot_idx, col)
                 val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
                 table_row.append(val)
                 if col.stores_cellmd:
pixeltable/functions/gemini.py CHANGED
@@ -15,7 +15,7 @@ import PIL.Image
 import pixeltable as pxt
 from pixeltable import env, exceptions as excs, exprs
 from pixeltable.utils.code import local_public_names
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore

 if TYPE_CHECKING:
     from google import genai
pixeltable/functions/openai.py CHANGED
@@ -23,7 +23,7 @@ import pixeltable as pxt
 from pixeltable import env, exprs, type_system as ts
 from pixeltable.func import Batch, Tools
 from pixeltable.utils.code import local_public_names
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore

 if TYPE_CHECKING:
     import openai
pixeltable/functions/video.py CHANGED
@@ -4,7 +4,6 @@ Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs)

 import logging
 import pathlib
-import shutil
 import subprocess
 from typing import Literal, NoReturn

@@ -17,7 +16,7 @@ import pixeltable as pxt
 import pixeltable.utils.av as av_utils
 from pixeltable.env import Env
 from pixeltable.utils.code import local_public_names
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore

 _logger = logging.getLogger('pixeltable')
 _format_defaults: dict[str, tuple[str, str]] = {  # format -> (codec, ext)
@@ -49,6 +48,10 @@ class make_video(pxt.Aggregator):
     """
     Aggregator that creates a video from a sequence of images, using the default video encoder and yuv420p pixel format.

+    Follows https://pyav.org/docs/develop/cookbook/numpy.html#generating-video
+
+    TODO: provide parameters for video_encoder and pix_fmt
+
     Args:
         fps: Frames per second for the output video.

@@ -98,11 +101,6 @@
     fps: int

     def __init__(self, fps: int = 25):
-        """
-        Follows https://pyav.org/docs/develop/cookbook/numpy.html#generating-video
-
-        TODO: provide parameters for video_encoder and pix_fmt
-        """
         self.container = None
         self.stream = None
         self.fps = fps
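
The cookbook pattern that make_video follows can be sketched on its own. A minimal PyAV example (not the aggregator's actual code; the h264/yuv420p choices mirror the defaults named in the docstring):

    import av
    import numpy as np

    container = av.open('out.mp4', mode='w')
    stream = container.add_stream('h264', rate=25)  # rate = fps
    stream.width, stream.height = 640, 480
    stream.pix_fmt = 'yuv420p'

    for _ in range(100):
        img = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)
        frame = av.VideoFrame.from_ndarray(img, format='rgb24')
        for packet in stream.encode(frame):
            container.mux(packet)

    for packet in stream.encode():  # flush the encoder
        container.mux(packet)
    container.close()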
@@ -328,6 +326,7 @@ def clip(
     Returns:
         New video containing only the specified time range or None if start_time is beyond the end of the video.
     """
+    Env.get().require_binary('ffmpeg')
     if start_time < 0:
         raise pxt.Error(f'start_time must be non-negative, got {start_time}')
     if end_time is not None and end_time <= start_time:
@@ -336,8 +335,6 @@ def clip(
         raise pxt.Error(f'duration must be positive, got {duration}')
     if end_time is not None and duration is not None:
         raise pxt.Error('end_time and duration cannot both be specified')
-    if not shutil.which('ffmpeg'):
-        raise pxt.Error('ffmpeg is not installed or not in PATH. Please install ffmpeg to use get_clip().')

     video_duration = av_utils.get_video_duration(video)
     if video_duration is not None and start_time > video_duration:
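
Throughout this file, the per-UDF shutil.which() checks are replaced by Env.get().require_binary('ffmpeg'). The helper's implementation is not part of this diff; a plausible sketch of the behavior it centralizes, assuming it raises the same kind of error the old checks did:

    import shutil

    import pixeltable as pxt

    class Env:
        # (existing Env machinery elided; this method is an assumption, not the
        # package's actual implementation)
        def require_binary(self, name: str) -> None:
            # Centralize the PATH check each UDF previously duplicated.
            if shutil.which(name) is None:
                raise pxt.Error(f'{name} is not installed or not in PATH. Please install {name}.')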
@@ -389,10 +386,9 @@ def segment_video(video: pxt.Video, *, duration: float) -> list[str]:
         >>> duration = tbl.video.get_duration()
         >>> tbl.select(segment_paths=tbl.video.segment_video(duration=duration / 2 + 1)).collect()
     """
+    Env.get().require_binary('ffmpeg')
     if duration <= 0:
         raise pxt.Error(f'duration must be positive, got {duration}')
-    if not shutil.which('ffmpeg'):
-        raise pxt.Error('ffmpeg is not installed or not in PATH. Please install ffmpeg to use segment_video().')

     base_path = TempStore.create_path(extension='')

@@ -437,10 +433,9 @@ def concat_videos(videos: list[pxt.Video]) -> pxt.Video:
     Returns:
         A new video containing the merged videos.
     """
+    Env.get().require_binary('ffmpeg')
     if len(videos) == 0:
         raise pxt.Error('concat_videos(): empty argument list')
-    if not shutil.which('ffmpeg'):
-        raise pxt.Error('ffmpeg is not installed or not in PATH. Please install ffmpeg to use concat_videos().')

     # Check that all videos have the same resolution
     resolutions: list[tuple[int, int]] = []
@@ -529,6 +524,125 @@ def concat_videos(videos: list[pxt.Video]) -> pxt.Video:
     filelist_path.unlink()


+@pxt.udf
+def with_audio(
+    video: pxt.Video,
+    audio: pxt.Audio,
+    *,
+    video_start_time: float = 0.0,
+    video_duration: float | None = None,
+    audio_start_time: float = 0.0,
+    audio_duration: float | None = None,
+) -> pxt.Video:
+    """
+    Creates a new video that combines the video stream from `video` and the audio stream from `audio`.
+    The `start_time` and `duration` parameters can be used to select a specific time range from each input.
+    If the audio input (or selected time range) is longer than the video, the audio will be truncated.
+
+    __Requirements:__
+
+    - `ffmpeg` needs to be installed and in PATH
+
+    Args:
+        video: Input video.
+        audio: Input audio.
+        video_start_time: Start time in the video input (in seconds).
+        video_duration: Duration of video segment (in seconds). If None, uses the remainder of the video after
+            `video_start_time`. `video_duration` determines the duration of the output video.
+        audio_start_time: Start time in the audio input (in seconds).
+        audio_duration: Duration of audio segment (in seconds). If None, uses the remainder of the audio after
+            `audio_start_time`. If the audio is longer than the output video, it will be truncated.
+
+    Returns:
+        A new video file with the audio track added.
+
+    Examples:
+        Add background music to a video:
+
+        >>> tbl.select(tbl.video.with_audio(tbl.music_track)).collect()
+
+        Add audio starting 5 seconds into both files:
+
+        >>> tbl.select(
+        ...     tbl.video.with_audio(
+        ...         tbl.music_track,
+        ...         video_start_time=5.0,
+        ...         audio_start_time=5.0
+        ...     )
+        ... ).collect()
+
+        Use a 10-second clip from the middle of both files:
+
+        >>> tbl.select(
+        ...     tbl.video.with_audio(
+        ...         tbl.music_track,
+        ...         video_start_time=30.0,
+        ...         video_duration=10.0,
+        ...         audio_start_time=15.0,
+        ...         audio_duration=10.0
+        ...     )
+        ... ).collect()
+    """
+    Env.get().require_binary('ffmpeg')
+    if video_start_time < 0:
+        raise pxt.Error(f'video_start_time must be non-negative, got {video_start_time}')
+    if audio_start_time < 0:
+        raise pxt.Error(f'audio_start_time must be non-negative, got {audio_start_time}')
+    if video_duration is not None and video_duration <= 0:
+        raise pxt.Error(f'video_duration must be positive, got {video_duration}')
+    if audio_duration is not None and audio_duration <= 0:
+        raise pxt.Error(f'audio_duration must be positive, got {audio_duration}')
+
+    output_path = str(TempStore.create_path(extension='.mp4'))
+
+    cmd = ['ffmpeg']
+    if video_start_time > 0:
+        # fast seek, must precede -i
+        cmd.extend(['-ss', str(video_start_time)])
+    if video_duration is not None:
+        cmd.extend(['-t', str(video_duration)])
+    else:
+        video_duration = av_utils.get_video_duration(video)
+    cmd.extend(['-i', str(video)])
+
+    if audio_start_time > 0:
+        cmd.extend(['-ss', str(audio_start_time)])
+    if audio_duration is not None:
+        cmd.extend(['-t', str(audio_duration)])
+    cmd.extend(['-i', str(audio)])
+
+    cmd.extend(
+        [
+            '-map',
+            '0:v:0',  # video from first input
+            '-map',
+            '1:a:0',  # audio from second input
+            '-c:v',
+            'copy',  # avoid re-encoding
+            '-c:a',
+            'copy',  # avoid re-encoding
+            '-t',
+            str(video_duration),  # limit output duration to video duration
+            '-loglevel',
+            'error',  # only show errors
+            output_path,
+        ]
+    )
+
+    _logger.debug(f'with_audio(): {" ".join(cmd)}')
+
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+        output_file = pathlib.Path(output_path)
+        if not output_file.exists() or output_file.stat().st_size == 0:
+            stderr_output = result.stderr.strip() if result.stderr is not None else ''
+            raise pxt.Error(f'ffmpeg failed to create output file for commandline: {" ".join(cmd)}\n{stderr_output}')
+        return output_path
+    except subprocess.CalledProcessError as e:
+        _handle_ffmpeg_error(e)
+
+
 @pxt.udf(is_method=True)
 def overlay_text(
     video: pxt.Video,
@@ -615,8 +729,7 @@ def overlay_text(
         ...     )
         ... ).collect()
     """
-    if not shutil.which('ffmpeg'):
-        raise pxt.Error('ffmpeg is not installed or not in PATH. Please install ffmpeg to use overlay_text().')
+    Env.get().require_binary('ffmpeg')
     if font_size <= 0:
         raise pxt.Error(f'font_size must be positive, got {font_size}')
     if opacity < 0.0 or opacity > 1.0:
pixeltable/functions/whisperx.py CHANGED
@@ -1,3 +1,5 @@
+"""WhisperX audio transcription and diarization functions."""
+
 from typing import TYPE_CHECKING, Any, Optional

 import numpy as np
pixeltable/functions/yolox.py CHANGED
@@ -1,3 +1,5 @@
+"""YOLOX object detection functions."""
+
 import logging
 from typing import TYPE_CHECKING

pixeltable/globals.py CHANGED
@@ -397,40 +397,54 @@ def create_snapshot(
     )


-def create_replica(
-    destination: str,
+def publish(
     source: str | catalog.Table,
+    destination_uri: str,
     bucket_name: str | None = None,
     access: Literal['public', 'private'] = 'private',
-) -> Optional[catalog.Table]:
+) -> None:
     """
-    Create a replica of a table. Can be used either to create a remote replica of a local table, or to create a local
-    replica of a remote table. A given table can have at most one replica per Pixeltable instance.
+    Publishes a replica of a local Pixeltable table to Pixeltable cloud. A given table can be published to at most one
+    URI per Pixeltable cloud database.

     Args:
-        destination: Path where the replica will be created. Can be either a local path such as `'my_dir.my_table'`, or
-            a remote URI such as `'pxt://username/mydir.my_table'`.
-        source: Path to the source table, or (if the source table is a local table) a handle to the source table.
-        bucket_name: The name of the pixeltable cloud-registered bucket to use to store replica's data.
-            If no `bucket_name` is provided, the default Pixeltable storage bucket will be used.
+        source: Path or table handle of the local table to be published.
+        destination_uri: Remote URI where the replica will be published, such as `'pxt://org_name/my_dir/my_table'`.
+        bucket_name: The name of the bucket to use to store the replica's data. The bucket must be registered with
+            Pixeltable cloud. If no `bucket_name` is provided, the default storage bucket for the destination
+            database will be used.
         access: Access control for the replica.

             - `'public'`: Anyone can access this replica.
-            - `'private'`: Only the owner can access.
+            - `'private'`: Only the host organization can access.
     """
-    remote_dest = destination.startswith('pxt://')
-    remote_source = isinstance(source, str) and source.startswith('pxt://')
-    if remote_dest == remote_source:
-        raise excs.Error('Exactly one of `destination` or `source` must be a remote URI.')
-
-    if remote_dest:
-        if isinstance(source, str):
-            source = get_table(source)
-        share.push_replica(destination, source, bucket_name, access)
-        return None
-    else:
-        assert isinstance(source, str)
-        return share.pull_replica(destination, source)
+    if not destination_uri.startswith('pxt://'):
+        raise excs.Error("`destination_uri` must be a remote Pixeltable URI with the prefix 'pxt://'")
+
+    if isinstance(source, str):
+        source = get_table(source)
+
+    share.push_replica(destination_uri, source, bucket_name, access)
+
+
+def replicate(remote_uri: str, local_path: str) -> catalog.Table:
+    """
+    Retrieve a replica from Pixeltable cloud as a local table. This will create a full local copy of the replica in a
+    way that preserves the table structure of the original source data. Once replicated, the local table can be
+    queried offline just as any other Pixeltable table.
+
+    Args:
+        remote_uri: Remote URI of the table to be replicated, such as `'pxt://org_name/my_dir/my_table'`.
+        local_path: Local table path where the replica will be created, such as `'my_new_dir.my_new_tbl'`. It can be
+            the same or different from the cloud table name.
+
+    Returns:
+        A handle to the newly created local replica table.
+    """
+    if not remote_uri.startswith('pxt://'):
+        raise excs.Error("`remote_uri` must be a remote Pixeltable URI with the prefix 'pxt://'")
+
+    return share.pull_replica(local_path, remote_uri)


 def get_table(path: str) -> catalog.Table:
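
A usage sketch of the new publish/replicate pair, based only on the signatures above; the table paths and org name are invented, and this assumes both functions are re-exported at the package level like the other globals:

    import pixeltable as pxt

    # Publish a local table to Pixeltable cloud.
    films = pxt.get_table('my_dir.films')
    pxt.publish(films, 'pxt://my_org/demos/films', access='public')

    # Elsewhere: pull the published replica down as a queryable local table.
    local_copy = pxt.replicate('pxt://my_org/demos/films', 'my_dir.films_replica')
    print(local_copy.collect())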
@@ -498,10 +512,11 @@ def move(path: str, new_path: str) -> None:
 def drop_table(
     table: str | catalog.Table, force: bool = False, if_not_exists: Literal['error', 'ignore'] = 'error'
 ) -> None:
-    """Drop a table, view, or snapshot.
+    """Drop a table, view, snapshot, or replica.

     Args:
-        table: Fully qualified name, or handle, of the table to be dropped.
+        table: Fully qualified name or table handle of the table to be dropped; or a remote URI of a cloud replica to
+            be deleted.
         force: If `True`, will also drop all views and sub-views of this table.
         if_not_exists: Directive regarding how to handle if the path does not exist.
             Must be one of the following:
@@ -541,13 +556,17 @@ def drop_table(
         assert isinstance(table, str)
         tbl_path = table

+    if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
+
     if tbl_path.startswith('pxt://'):
         # Remote table
+        if force:
+            raise excs.Error('Cannot use `force=True` with a cloud replica URI.')
+        # TODO: Handle if_not_exists properly
         share.delete_replica(tbl_path)
     else:
         # Local table
         path_obj = catalog.Path.parse(tbl_path)
-        if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
         Catalog.get().drop_table(path_obj, force=force, if_not_exists=if_not_exists_)


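With this change, a cloud replica is dropped by passing its remote URI, and force=True is rejected for remote URIs. A small example based on the code above (paths and URI invented):

    import pixeltable as pxt

    # Local table: force=True also drops dependent views.
    pxt.drop_table('my_dir.films', force=True)

    # Cloud replica: identified by its URI; force=True here would raise.
    pxt.drop_table('pxt://my_org/demos/films')
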
@@ -763,15 +782,15 @@ def ls(path: str = '') -> pd.DataFrame:
         base = md['base'] or ''
         if base.startswith('_'):
             base = '<anonymous base table>'
-        if md['is_snapshot']:
+        if md['is_replica']:
+            kind = 'replica'
+        elif md['is_snapshot']:
             kind = 'snapshot'
         elif md['is_view']:
             kind = 'view'
         else:
             kind = 'table'
         version = '' if kind == 'snapshot' else str(md['version'])
-        if md['is_replica']:
-            kind = f'{kind}-replica'
         rows.append([name, kind, version, base])
     return rows