pixeltable 0.4.17__py3-none-any.whl → 0.4.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (153) hide show
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/catalog.py +144 -118
  4. pixeltable/catalog/column.py +104 -115
  5. pixeltable/catalog/globals.py +1 -2
  6. pixeltable/catalog/insertable_table.py +44 -49
  7. pixeltable/catalog/path.py +3 -4
  8. pixeltable/catalog/schema_object.py +4 -4
  9. pixeltable/catalog/table.py +139 -124
  10. pixeltable/catalog/table_metadata.py +6 -6
  11. pixeltable/catalog/table_version.py +315 -246
  12. pixeltable/catalog/table_version_handle.py +4 -4
  13. pixeltable/catalog/table_version_path.py +9 -10
  14. pixeltable/catalog/tbl_ops.py +9 -3
  15. pixeltable/catalog/view.py +34 -28
  16. pixeltable/config.py +14 -10
  17. pixeltable/dataframe.py +69 -78
  18. pixeltable/env.py +78 -64
  19. pixeltable/exec/aggregation_node.py +6 -6
  20. pixeltable/exec/cache_prefetch_node.py +10 -10
  21. pixeltable/exec/data_row_batch.py +3 -3
  22. pixeltable/exec/exec_context.py +16 -4
  23. pixeltable/exec/exec_node.py +5 -5
  24. pixeltable/exec/expr_eval/evaluators.py +6 -6
  25. pixeltable/exec/expr_eval/expr_eval_node.py +8 -7
  26. pixeltable/exec/expr_eval/globals.py +6 -6
  27. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  28. pixeltable/exec/expr_eval/schedulers.py +11 -11
  29. pixeltable/exec/in_memory_data_node.py +2 -2
  30. pixeltable/exec/object_store_save_node.py +14 -17
  31. pixeltable/exec/sql_node.py +28 -27
  32. pixeltable/exprs/arithmetic_expr.py +4 -4
  33. pixeltable/exprs/array_slice.py +2 -2
  34. pixeltable/exprs/column_property_ref.py +3 -3
  35. pixeltable/exprs/column_ref.py +61 -74
  36. pixeltable/exprs/comparison.py +5 -5
  37. pixeltable/exprs/compound_predicate.py +3 -3
  38. pixeltable/exprs/data_row.py +12 -12
  39. pixeltable/exprs/expr.py +41 -31
  40. pixeltable/exprs/expr_dict.py +3 -3
  41. pixeltable/exprs/expr_set.py +3 -3
  42. pixeltable/exprs/function_call.py +14 -14
  43. pixeltable/exprs/in_predicate.py +4 -4
  44. pixeltable/exprs/inline_expr.py +8 -8
  45. pixeltable/exprs/is_null.py +1 -3
  46. pixeltable/exprs/json_mapper.py +8 -8
  47. pixeltable/exprs/json_path.py +6 -6
  48. pixeltable/exprs/literal.py +5 -5
  49. pixeltable/exprs/method_ref.py +2 -2
  50. pixeltable/exprs/object_ref.py +2 -2
  51. pixeltable/exprs/row_builder.py +14 -14
  52. pixeltable/exprs/rowid_ref.py +8 -8
  53. pixeltable/exprs/similarity_expr.py +50 -25
  54. pixeltable/exprs/sql_element_cache.py +4 -4
  55. pixeltable/exprs/string_op.py +2 -2
  56. pixeltable/exprs/type_cast.py +3 -5
  57. pixeltable/func/aggregate_function.py +8 -8
  58. pixeltable/func/callable_function.py +9 -9
  59. pixeltable/func/expr_template_function.py +3 -3
  60. pixeltable/func/function.py +15 -17
  61. pixeltable/func/function_registry.py +6 -7
  62. pixeltable/func/globals.py +2 -3
  63. pixeltable/func/mcp.py +2 -2
  64. pixeltable/func/query_template_function.py +16 -16
  65. pixeltable/func/signature.py +14 -14
  66. pixeltable/func/tools.py +11 -11
  67. pixeltable/func/udf.py +16 -18
  68. pixeltable/functions/__init__.py +1 -0
  69. pixeltable/functions/anthropic.py +7 -7
  70. pixeltable/functions/audio.py +76 -0
  71. pixeltable/functions/bedrock.py +6 -6
  72. pixeltable/functions/deepseek.py +4 -4
  73. pixeltable/functions/fireworks.py +2 -2
  74. pixeltable/functions/gemini.py +6 -6
  75. pixeltable/functions/globals.py +12 -12
  76. pixeltable/functions/groq.py +4 -4
  77. pixeltable/functions/huggingface.py +1033 -6
  78. pixeltable/functions/image.py +7 -10
  79. pixeltable/functions/llama_cpp.py +7 -7
  80. pixeltable/functions/math.py +2 -3
  81. pixeltable/functions/mistralai.py +3 -3
  82. pixeltable/functions/ollama.py +9 -9
  83. pixeltable/functions/openai.py +21 -21
  84. pixeltable/functions/openrouter.py +7 -7
  85. pixeltable/functions/string.py +21 -28
  86. pixeltable/functions/timestamp.py +7 -8
  87. pixeltable/functions/together.py +4 -6
  88. pixeltable/functions/twelvelabs.py +92 -0
  89. pixeltable/functions/video.py +36 -31
  90. pixeltable/functions/vision.py +6 -6
  91. pixeltable/functions/whisper.py +7 -7
  92. pixeltable/functions/whisperx.py +16 -16
  93. pixeltable/globals.py +75 -40
  94. pixeltable/index/base.py +12 -8
  95. pixeltable/index/btree.py +19 -22
  96. pixeltable/index/embedding_index.py +30 -39
  97. pixeltable/io/datarows.py +3 -3
  98. pixeltable/io/external_store.py +13 -16
  99. pixeltable/io/fiftyone.py +5 -5
  100. pixeltable/io/globals.py +5 -5
  101. pixeltable/io/hf_datasets.py +4 -4
  102. pixeltable/io/label_studio.py +12 -12
  103. pixeltable/io/pandas.py +6 -6
  104. pixeltable/io/parquet.py +2 -2
  105. pixeltable/io/table_data_conduit.py +12 -12
  106. pixeltable/io/utils.py +2 -2
  107. pixeltable/iterators/audio.py +2 -2
  108. pixeltable/iterators/document.py +88 -57
  109. pixeltable/iterators/video.py +66 -37
  110. pixeltable/metadata/converters/convert_18.py +2 -2
  111. pixeltable/metadata/converters/convert_19.py +2 -2
  112. pixeltable/metadata/converters/convert_20.py +2 -2
  113. pixeltable/metadata/converters/convert_21.py +2 -2
  114. pixeltable/metadata/converters/convert_22.py +2 -2
  115. pixeltable/metadata/converters/convert_24.py +2 -2
  116. pixeltable/metadata/converters/convert_25.py +2 -2
  117. pixeltable/metadata/converters/convert_26.py +2 -2
  118. pixeltable/metadata/converters/convert_29.py +4 -4
  119. pixeltable/metadata/converters/convert_34.py +2 -2
  120. pixeltable/metadata/converters/convert_36.py +2 -2
  121. pixeltable/metadata/converters/convert_38.py +2 -2
  122. pixeltable/metadata/converters/convert_39.py +1 -2
  123. pixeltable/metadata/converters/util.py +11 -13
  124. pixeltable/metadata/schema.py +22 -21
  125. pixeltable/metadata/utils.py +2 -6
  126. pixeltable/mypy/mypy_plugin.py +5 -5
  127. pixeltable/plan.py +32 -34
  128. pixeltable/share/packager.py +7 -7
  129. pixeltable/share/publish.py +3 -3
  130. pixeltable/store.py +126 -41
  131. pixeltable/type_system.py +43 -46
  132. pixeltable/utils/__init__.py +1 -2
  133. pixeltable/utils/arrow.py +4 -4
  134. pixeltable/utils/av.py +74 -38
  135. pixeltable/utils/azure_store.py +305 -0
  136. pixeltable/utils/code.py +1 -2
  137. pixeltable/utils/dbms.py +15 -19
  138. pixeltable/utils/description_helper.py +2 -3
  139. pixeltable/utils/documents.py +5 -6
  140. pixeltable/utils/exception_handler.py +2 -2
  141. pixeltable/utils/filecache.py +5 -5
  142. pixeltable/utils/formatter.py +4 -6
  143. pixeltable/utils/gcs_store.py +9 -9
  144. pixeltable/utils/local_store.py +17 -17
  145. pixeltable/utils/object_stores.py +59 -43
  146. pixeltable/utils/s3_store.py +35 -30
  147. {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/METADATA +4 -4
  148. pixeltable-0.4.19.dist-info/RECORD +213 -0
  149. pixeltable/__version__.py +0 -3
  150. pixeltable-0.4.17.dist-info/RECORD +0 -211
  151. {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/WHEEL +0 -0
  152. {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/entry_points.txt +0 -0
  153. {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,92 @@
1
+ """
2
+ Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs)
3
+ that wrap various endpoints from the TwelveLabs API. In order to use them, you must
4
+ first `pip install twelvelabs` and configure your TwelveLabs credentials, as described in
5
+ the [Working with TwelveLabs](https://pixeltable.readme.io/docs/working-with-twelvelabs) tutorial.
6
+ """
7
+
8
+ from typing import TYPE_CHECKING, Any, Literal
9
+
10
+ import numpy as np
11
+
12
+ import pixeltable as pxt
13
+ from pixeltable import env
14
+ from pixeltable.utils.code import local_public_names
15
+
16
+ if TYPE_CHECKING:
17
+ from twelvelabs import AsyncTwelveLabs
18
+
19
+
20
+ @env.register_client('twelvelabs')
21
+ def _(api_key: str) -> 'AsyncTwelveLabs':
22
+ from twelvelabs import AsyncTwelveLabs
23
+
24
+ return AsyncTwelveLabs(api_key=api_key)
25
+
26
+
27
+ def _twelvelabs_client() -> 'AsyncTwelveLabs':
28
+ return env.Env.get().get_client('twelvelabs')
29
+
30
+
31
+ @pxt.udf(resource_pool='request-rate:twelvelabs')
32
+ async def embed(
33
+ model_name: str,
34
+ *,
35
+ text: str | None = None,
36
+ text_truncate: Literal['none', 'start', 'end'] | None = None,
37
+ audio: pxt.Audio | None = None,
38
+ # TODO: support images
39
+ # image: pxt.Image | None = None,
40
+ **kwargs: Any,
41
+ ) -> pxt.Array[(1024,), pxt.Float]:
42
+ """
43
+ Creates an embedding vector for the given `text`, `audio`, or `image` parameter. Only one of `text`, `audio`, or
44
+ `image` may be specified.
45
+
46
+ Equivalent to the TwelveLabs Embed API.
47
+ https://docs.twelvelabs.io/v1.3/docs/guides/create-embeddings
48
+
49
+ Request throttling:
50
+ Applies the rate limit set in the config (section `twelvelabs`, key `rate_limit`). If no rate
51
+ limit is configured, uses a default of 600 RPM.
52
+
53
+ __Requirements:__
54
+
55
+ - `pip install twelvelabs`
56
+
57
+ Args:
58
+ model_name: The name of the model to use. Check
59
+ [the TwelveLabs documentation](https://docs.twelvelabs.io/v1.3/sdk-reference/python/create-text-image-and-audio-embeddings)
60
+ for available models.
61
+ text: The text to embed.
62
+ text_truncate: Truncation mode for the text.
63
+ audio: The audio to embed.
64
+
65
+ Returns:
66
+ The embedding.
67
+
68
+ Examples:
69
+ Add a computed column `embed` for an embedding of a string column `input`:
70
+
71
+ >>> tbl.add_computed_column(
72
+ ... embed=embed(model_name='Marengo-retrieval-2.7', text=tbl.input)
73
+ ... )
74
+ """
75
+ cl = _twelvelabs_client()
76
+ res = await cl.embed.create(
77
+ model_name=model_name, text=text, text_truncate=text_truncate, audio_file=audio, **kwargs
78
+ )
79
+ if text is not None:
80
+ if res.text_embedding is None:
81
+ raise pxt.Error(f"Didn't receive embedding for text: {text}")
82
+ vector = res.text_embedding.segments[0].float_
83
+ return np.array(vector, dtype=np.float64)
84
+ # TODO: handle audio and image, once we know how to get a non-error response
85
+ return None
86
+
87
+
88
+ __all__ = local_public_names(__name__)
89
+
90
+
91
+ def __dir__() -> list[str]:
92
+ return __all__
@@ -20,28 +20,6 @@ from pixeltable.utils.code import local_public_names
20
20
  from pixeltable.utils.local_store import TempStore
21
21
 
22
22
  _logger = logging.getLogger('pixeltable')
23
- _format_defaults: dict[str, tuple[str, str]] = { # format -> (codec, ext)
24
- 'wav': ('pcm_s16le', 'wav'),
25
- 'mp3': ('libmp3lame', 'mp3'),
26
- 'flac': ('flac', 'flac'),
27
- # 'mp4': ('aac', 'm4a'),
28
- }
29
-
30
- # for mp4:
31
- # - extract_audio() fails with
32
- # "Application provided invalid, non monotonically increasing dts to muxer in stream 0: 1146 >= 290"
33
- # - chatgpt suggests this can be fixed in the following manner
34
- # for packet in container.demux(audio_stream):
35
- # packet.pts = None # Reset the PTS and DTS to allow FFmpeg to set them automatically
36
- # packet.dts = None
37
- # for frame in packet.decode():
38
- # frame.pts = None
39
- # for packet in output_stream.encode(frame):
40
- # output_container.mux(packet)
41
- #
42
- # # Flush remaining packets
43
- # for packet in output_stream.encode():
44
- # output_container.mux(packet)
45
23
 
46
24
 
47
25
  @pxt.uda(requires_order_by=True)
@@ -150,9 +128,9 @@ def extract_audio(
150
128
  ... extracted_audio=tbl.video_col.extract_audio(format='flac')
151
129
  ... )
152
130
  """
153
- if format not in _format_defaults:
131
+ if format not in av_utils.AUDIO_FORMATS:
154
132
  raise ValueError(f'extract_audio(): unsupported audio format: {format}')
155
- default_codec, ext = _format_defaults[format]
133
+ default_codec, ext = av_utils.AUDIO_FORMATS[format]
156
134
 
157
135
  with av.open(video_path) as container:
158
136
  if len(container.streams.audio) <= stream_idx:
@@ -306,7 +284,14 @@ def _handle_ffmpeg_error(e: subprocess.CalledProcessError) -> NoReturn:
306
284
 
307
285
  @pxt.udf(is_method=True)
308
286
  def clip(
309
- video: pxt.Video, *, start_time: float, end_time: float | None = None, duration: float | None = None
287
+ video: pxt.Video,
288
+ *,
289
+ start_time: float,
290
+ end_time: float | None = None,
291
+ duration: float | None = None,
292
+ mode: Literal['fast', 'accurate'] = 'accurate',
293
+ video_encoder: str | None = None,
294
+ video_encoder_args: dict[str, Any] | None = None,
310
295
  ) -> pxt.Video | None:
311
296
  """
312
297
  Extract a clip from a video, specified by `start_time` and either `end_time` or `duration` (in seconds).
@@ -323,6 +308,14 @@ def clip(
323
308
  start_time: Start time in seconds
324
309
  end_time: End time in seconds
325
310
  duration: Duration of the clip in seconds
311
+ mode:
312
+
313
+ - `'fast'`: avoids re-encoding but starts the clip at the nearest keyframes and as a result, the clip
314
+ duration will be slightly longer than requested
315
+ - `'accurate'`: extracts a frame-accurate clip, but requires re-encoding
316
+ video_encoder: Video encoder to use. If not specified, uses the default encoder for the current platform.
317
+ Only available for `mode='accurate'`.
318
+ video_encoder_args: Additional arguments to pass to the video encoder. Only available for `mode='accurate'`.
326
319
 
327
320
  Returns:
328
321
  New video containing only the specified time range or None if start_time is beyond the end of the video.
@@ -336,6 +329,11 @@ def clip(
336
329
  raise pxt.Error(f'duration must be positive, got {duration}')
337
330
  if end_time is not None and duration is not None:
338
331
  raise pxt.Error('end_time and duration cannot both be specified')
332
+ if mode == 'fast':
333
+ if video_encoder is not None:
334
+ raise pxt.Error("video_encoder is not supported for mode='fast'")
335
+ if video_encoder_args is not None:
336
+ raise pxt.Error("video_encoder_args is not supported for mode='fast'")
339
337
 
340
338
  video_duration = av_utils.get_video_duration(video)
341
339
  if video_duration is not None and start_time > video_duration:
@@ -345,7 +343,15 @@ def clip(
345
343
 
346
344
  if end_time is not None:
347
345
  duration = end_time - start_time
348
- cmd = av_utils.ffmpeg_clip_cmd(str(video), output_path, start_time, duration)
346
+ cmd = av_utils.ffmpeg_clip_cmd(
347
+ str(video),
348
+ output_path,
349
+ start_time,
350
+ duration,
351
+ fast=(mode == 'fast'),
352
+ video_encoder=video_encoder,
353
+ video_encoder_args=video_encoder_args,
354
+ )
349
355
 
350
356
  try:
351
357
  result = subprocess.run(cmd, capture_output=True, text=True, check=True)
@@ -364,7 +370,7 @@ def segment_video(
364
370
  *,
365
371
  duration: float | None = None,
366
372
  segment_times: list[float] | None = None,
367
- mode: Literal['fast', 'accurate'] = 'fast',
373
+ mode: Literal['fast', 'accurate'] = 'accurate',
368
374
  video_encoder: str | None = None,
369
375
  video_encoder_args: dict[str, Any] | None = None,
370
376
  ) -> list[str]:
@@ -400,15 +406,14 @@ def segment_video(
400
406
  Examples:
401
407
  Split a video at 1 minute intervals using fast mode:
402
408
 
403
- >>> tbl.select(segment_paths=tbl.video.segment_video(duration=60)).collect()
409
+ >>> tbl.select(segment_paths=tbl.video.segment_video(duration=60, mode='fast')).collect()
404
410
 
405
- Split video into exact 10-second segments with accurate mode, using the libx264 encoder with a CRF of 23 and
406
- slow preset (for smaller output files):
411
+ Split video into exact 10-second segments with default accurate mode, using the libx264 encoder with a CRF of 23
412
+ and slow preset (for smaller output files):
407
413
 
408
414
  >>> tbl.select(
409
415
  ... segment_paths=tbl.video.segment_video(
410
416
  ... duration=10,
411
- ... mode='accurate',
412
417
  ... video_encoder='libx264',
413
418
  ... video_encoder_args={'crf': 23, 'preset': 'slow'}
414
419
  ... )
@@ -14,7 +14,7 @@ t.select(pxtv.draw_bounding_boxes(t.img, boxes=t.boxes, label=t.labels)).collect
14
14
  import colorsys
15
15
  import hashlib
16
16
  from collections import defaultdict
17
- from typing import Any, Optional
17
+ from typing import Any
18
18
 
19
19
  import numpy as np
20
20
  import PIL.Image
@@ -293,13 +293,13 @@ def __create_label_colors(labels: list[Any]) -> dict[Any, str]:
293
293
  def draw_bounding_boxes(
294
294
  img: PIL.Image.Image,
295
295
  boxes: list[list[int]],
296
- labels: Optional[list[Any]] = None,
297
- color: Optional[str] = None,
298
- box_colors: Optional[list[str]] = None,
296
+ labels: list[Any] | None = None,
297
+ color: str | None = None,
298
+ box_colors: list[str] | None = None,
299
299
  fill: bool = False,
300
300
  width: int = 1,
301
- font: Optional[str] = None,
302
- font_size: Optional[int] = None,
301
+ font: str | None = None,
302
+ font_size: int | None = None,
303
303
  ) -> PIL.Image.Image:
304
304
  """
305
305
  Draws bounding boxes on the given image.
@@ -6,7 +6,7 @@ This UDF will cause Pixeltable to invoke the relevant model locally. In order to
6
6
  first `pip install openai-whisper`.
7
7
  """
8
8
 
9
- from typing import TYPE_CHECKING, Optional, Sequence
9
+ from typing import TYPE_CHECKING, Sequence
10
10
 
11
11
  import pixeltable as pxt
12
12
  from pixeltable.env import Env
@@ -21,16 +21,16 @@ def transcribe(
21
21
  audio: pxt.Audio,
22
22
  *,
23
23
  model: str,
24
- temperature: Optional[Sequence[float]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
25
- compression_ratio_threshold: Optional[float] = 2.4,
26
- logprob_threshold: Optional[float] = -1.0,
27
- no_speech_threshold: Optional[float] = 0.6,
24
+ temperature: Sequence[float] | None = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
25
+ compression_ratio_threshold: float | None = 2.4,
26
+ logprob_threshold: float | None = -1.0,
27
+ no_speech_threshold: float | None = 0.6,
28
28
  condition_on_previous_text: bool = True,
29
- initial_prompt: Optional[str] = None,
29
+ initial_prompt: str | None = None,
30
30
  word_timestamps: bool = False,
31
31
  prepend_punctuations: str = '"\'“¿([{-',
32
32
  append_punctuations: str = '"\'.。,,!!??::”)]}、', # noqa: RUF001
33
- decode_options: Optional[dict] = None,
33
+ decode_options: dict | None = None,
34
34
  ) -> dict:
35
35
  """
36
36
  Transcribe an audio file using Whisper.
@@ -1,6 +1,6 @@
1
1
  """WhisperX audio transcription and diarization functions."""
2
2
 
3
- from typing import TYPE_CHECKING, Any, Optional
3
+ from typing import TYPE_CHECKING, Any
4
4
 
5
5
  import numpy as np
6
6
 
@@ -21,17 +21,17 @@ def transcribe(
21
21
  *,
22
22
  model: str,
23
23
  diarize: bool = False,
24
- compute_type: Optional[str] = None,
25
- language: Optional[str] = None,
26
- task: Optional[str] = None,
27
- chunk_size: Optional[int] = None,
28
- alignment_model_name: Optional[str] = None,
29
- interpolate_method: Optional[str] = None,
30
- return_char_alignments: Optional[bool] = None,
31
- diarization_model_name: Optional[str] = None,
32
- num_speakers: Optional[int] = None,
33
- min_speakers: Optional[int] = None,
34
- max_speakers: Optional[int] = None,
24
+ compute_type: str | None = None,
25
+ language: str | None = None,
26
+ task: str | None = None,
27
+ chunk_size: int | None = None,
28
+ alignment_model_name: str | None = None,
29
+ interpolate_method: str | None = None,
30
+ return_char_alignments: bool | None = None,
31
+ diarization_model_name: str | None = None,
32
+ num_speakers: int | None = None,
33
+ min_speakers: int | None = None,
34
+ max_speakers: int | None = None,
35
35
  ) -> dict:
36
36
  """
37
37
  Transcribe an audio file using WhisperX.
@@ -144,7 +144,7 @@ def _lookup_transcription_model(model: str, device: str, compute_type: str) -> '
144
144
  return _model_cache[key]
145
145
 
146
146
 
147
- def _lookup_alignment_model(language_code: str, device: str, model_name: Optional[str]) -> tuple['Wav2Vec2Model', dict]:
147
+ def _lookup_alignment_model(language_code: str, device: str, model_name: str | None) -> tuple['Wav2Vec2Model', dict]:
148
148
  import whisperx
149
149
 
150
150
  key = (language_code, device, model_name)
@@ -154,7 +154,7 @@ def _lookup_alignment_model(language_code: str, device: str, model_name: Optiona
154
154
  return _alignment_model_cache[key]
155
155
 
156
156
 
157
- def _lookup_diarization_model(device: str, model_name: Optional[str]) -> 'DiarizationPipeline':
157
+ def _lookup_diarization_model(device: str, model_name: str | None) -> 'DiarizationPipeline':
158
158
  from whisperx.diarize import DiarizationPipeline
159
159
 
160
160
  key = (device, model_name)
@@ -168,8 +168,8 @@ def _lookup_diarization_model(device: str, model_name: Optional[str]) -> 'Diariz
168
168
 
169
169
 
170
170
  _model_cache: dict[tuple[str, str, str], 'FasterWhisperPipeline'] = {}
171
- _alignment_model_cache: dict[tuple[str, str, Optional[str]], tuple['Wav2Vec2Model', dict]] = {}
172
- _diarization_model_cache: dict[tuple[str, Optional[str]], 'DiarizationPipeline'] = {}
171
+ _alignment_model_cache: dict[tuple[str, str, str | None], tuple['Wav2Vec2Model', dict]] = {}
172
+ _diarization_model_cache: dict[tuple[str, str | None], 'DiarizationPipeline'] = {}
173
173
 
174
174
 
175
175
  __all__ = local_public_names(__name__)
pixeltable/globals.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
3
3
  import logging
4
4
  import os
5
5
  from pathlib import Path
6
- from typing import TYPE_CHECKING, Any, Iterable, Literal, NamedTuple, Optional, Union
6
+ from typing import TYPE_CHECKING, Any, Iterable, Literal, NamedTuple, Union
7
7
 
8
8
  import pandas as pd
9
9
  import pydantic
@@ -14,6 +14,7 @@ from pixeltable.catalog import Catalog, TableVersionPath
14
14
  from pixeltable.catalog.insertable_table import OnErrorParameter
15
15
  from pixeltable.config import Config
16
16
  from pixeltable.env import Env
17
+ from pixeltable.io.table_data_conduit import DFTableDataConduit, TableDataConduit
17
18
  from pixeltable.iterators import ComponentIterator
18
19
 
19
20
  if TYPE_CHECKING:
@@ -36,7 +37,7 @@ if TYPE_CHECKING:
36
37
  _logger = logging.getLogger('pixeltable')
37
38
 
38
39
 
39
- def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
40
+ def init(config_overrides: dict[str, Any] | None = None) -> None:
40
41
  """Initializes the Pixeltable environment."""
41
42
  if config_overrides is None:
42
43
  config_overrides = {}
@@ -46,18 +47,19 @@ def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
46
47
 
47
48
  def create_table(
48
49
  path: str,
49
- schema: Optional[dict[str, Any]] = None,
50
+ schema: dict[str, Any] | None = None,
50
51
  *,
51
- source: Optional[TableDataSource] = None,
52
- source_format: Optional[Literal['csv', 'excel', 'parquet', 'json']] = None,
53
- schema_overrides: Optional[dict[str, Any]] = None,
52
+ source: TableDataSource | None = None,
53
+ source_format: Literal['csv', 'excel', 'parquet', 'json'] | None = None,
54
+ schema_overrides: dict[str, Any] | None = None,
55
+ create_default_idxs: bool = True,
54
56
  on_error: Literal['abort', 'ignore'] = 'abort',
55
57
  primary_key: str | list[str] | None = None,
56
58
  num_retained_versions: int = 10,
57
59
  comment: str = '',
58
60
  media_validation: Literal['on_read', 'on_write'] = 'on_write',
59
61
  if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
60
- extra_args: Optional[dict[str, Any]] = None, # Additional arguments to data source provider
62
+ extra_args: dict[str, Any] | None = None, # Additional arguments to data source provider
61
63
  ) -> catalog.Table:
62
64
  """Create a new base table. Exactly one of `schema` or `source` must be provided.
63
65
 
@@ -77,6 +79,8 @@ def create_table(
77
79
  schema_overrides: Must be used in conjunction with a `source`.
78
80
  If specified, then columns in `schema_overrides` will be given the specified types.
79
81
  (Pixeltable will attempt to infer the types of any columns not specified.)
82
+ create_default_idxs: If True, creates a B-tree index on every scalar and media column that is not computed,
83
+ except for boolean columns.
80
84
  on_error: Determines the behavior if an error occurs while evaluating a computed column or detecting an
81
85
  invalid media file (such as a corrupt image) for one of the inserted rows.
82
86
 
@@ -138,7 +142,7 @@ def create_table(
138
142
 
139
143
  >>> tbl = pxt.create_table('my_table', source='data.csv')
140
144
  """
141
- from pixeltable.io.table_data_conduit import DFTableDataConduit, UnkTableDataConduit
145
+ from pixeltable.io.table_data_conduit import UnkTableDataConduit
142
146
  from pixeltable.io.utils import normalize_primary_key_parameter
143
147
 
144
148
  if (schema is None) == (source is None):
@@ -150,11 +154,16 @@ def create_table(
150
154
  path_obj = catalog.Path.parse(path)
151
155
  if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
152
156
  media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
153
- primary_key: Optional[list[str]] = normalize_primary_key_parameter(primary_key)
154
- table: catalog.Table = None
155
- tds = None
156
- data_source = None
157
+ primary_key: list[str] | None = normalize_primary_key_parameter(primary_key)
158
+ data_source: TableDataConduit | None = None
157
159
  if source is not None:
160
+ if isinstance(source, str) and source.strip().startswith('pxt://'):
161
+ raise excs.Error(
162
+ 'create_table(): Creating a table directly from a cloud URI is not supported.'
163
+ ' Please replicate the table locally first using `pxt.replicate()`:\n'
164
+ "replica_tbl = pxt.replicate('pxt://path/to/remote_table', 'local_replica_name')\n"
165
+ "pxt.create_table('new_table_name', source=replica_tbl)"
166
+ )
158
167
  tds = UnkTableDataConduit(source, source_format=source_format, extra_fields=extra_args)
159
168
  tds.check_source_format()
160
169
  data_source = tds.specialize()
@@ -179,35 +188,43 @@ def create_table(
179
188
  'Unable to create a proper schema from supplied `source`. Please use appropriate `schema_overrides`.'
180
189
  )
181
190
 
182
- table, was_created = Catalog.get().create_table(
191
+ tbl, was_created = Catalog.get().create_table(
183
192
  path_obj,
184
193
  schema,
185
- data_source.pxt_df if isinstance(data_source, DFTableDataConduit) else None,
186
194
  if_exists=if_exists_,
187
195
  primary_key=primary_key,
188
196
  comment=comment,
189
197
  media_validation=media_validation_,
190
198
  num_retained_versions=num_retained_versions,
199
+ create_default_idxs=create_default_idxs,
191
200
  )
192
- if was_created and data_source is not None and not is_direct_df:
201
+
202
+ # TODO: combine data loading with table creation into a single transaction
203
+ if was_created:
193
204
  fail_on_exception = OnErrorParameter.fail_on_exception(on_error)
194
- table.insert_table_data_source(data_source=data_source, fail_on_exception=fail_on_exception)
205
+ if isinstance(data_source, DFTableDataConduit):
206
+ df = data_source.pxt_df
207
+ with Catalog.get().begin_xact(tbl=tbl._tbl_version_path, for_write=True, lock_mutable_tree=True):
208
+ tbl._tbl_version.get().insert(None, df, fail_on_exception=fail_on_exception)
209
+ elif data_source is not None and not is_direct_df:
210
+ tbl.insert_table_data_source(data_source=data_source, fail_on_exception=fail_on_exception)
195
211
 
196
- return table
212
+ return tbl
197
213
 
198
214
 
199
215
  def create_view(
200
216
  path: str,
201
217
  base: catalog.Table | DataFrame,
202
218
  *,
203
- additional_columns: Optional[dict[str, Any]] = None,
219
+ additional_columns: dict[str, Any] | None = None,
204
220
  is_snapshot: bool = False,
205
- iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
221
+ create_default_idxs: bool = False,
222
+ iterator: tuple[type[ComponentIterator], dict[str, Any]] | None = None,
206
223
  num_retained_versions: int = 10,
207
224
  comment: str = '',
208
225
  media_validation: Literal['on_read', 'on_write'] = 'on_write',
209
226
  if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
210
- ) -> Optional[catalog.Table]:
227
+ ) -> catalog.Table | None:
211
228
  """Create a view of an existing table object (which itself can be a view or a snapshot or a base table).
212
229
 
213
230
  Args:
@@ -220,6 +237,8 @@ def create_view(
220
237
  [`create_table`][pixeltable.create_table].
221
238
  is_snapshot: Whether the view is a snapshot. Setting this to `True` is equivalent to calling
222
239
  [`create_snapshot`][pixeltable.create_snapshot].
240
+ create_default_idxs: Whether to create default indexes on the view's columns (the base's columns are excluded).
241
+ Cannot be `True` for snapshots.
223
242
  iterator: The iterator to use for this view. If specified, then this view will be a one-to-many view of
224
243
  the base table.
225
244
  num_retained_versions: Number of versions of the view to retain.
@@ -267,9 +286,11 @@ def create_view(
267
286
  >>> tbl = pxt.get_table('my_table')
268
287
  ... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 100), if_exists='replace_force')
269
288
  """
289
+ if is_snapshot and create_default_idxs is True:
290
+ raise excs.Error('Cannot create default indexes on a snapshot')
270
291
  tbl_version_path: TableVersionPath
271
- select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]] = None
272
- where: Optional[exprs.Expr] = None
292
+ select_list: list[tuple[exprs.Expr, str | None]] | None = None
293
+ where: exprs.Expr | None = None
273
294
  if isinstance(base, catalog.Table):
274
295
  tbl_version_path = base._tbl_version_path
275
296
  sample_clause = None
@@ -297,7 +318,7 @@ def create_view(
297
318
  if col_name in [c.name for c in tbl_version_path.columns()]:
298
319
  raise excs.Error(
299
320
  f'Column {col_name!r} already exists in the base table '
300
- f'{tbl_version_path.get_column(col_name).tbl.name}.'
321
+ f'{tbl_version_path.get_column(col_name).get_tbl().name}.'
301
322
  )
302
323
 
303
324
  return Catalog.get().create_view(
@@ -308,6 +329,7 @@ def create_view(
308
329
  sample_clause=sample_clause,
309
330
  additional_columns=additional_columns,
310
331
  is_snapshot=is_snapshot,
332
+ create_default_idxs=create_default_idxs,
311
333
  iterator=iterator,
312
334
  num_retained_versions=num_retained_versions,
313
335
  comment=comment,
@@ -320,13 +342,13 @@ def create_snapshot(
320
342
  path_str: str,
321
343
  base: catalog.Table | DataFrame,
322
344
  *,
323
- additional_columns: Optional[dict[str, Any]] = None,
324
- iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
345
+ additional_columns: dict[str, Any] | None = None,
346
+ iterator: tuple[type[ComponentIterator], dict[str, Any]] | None = None,
325
347
  num_retained_versions: int = 10,
326
348
  comment: str = '',
327
349
  media_validation: Literal['on_read', 'on_write'] = 'on_write',
328
350
  if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
329
- ) -> Optional[catalog.Table]:
351
+ ) -> catalog.Table | None:
330
352
  """Create a snapshot of an existing table object (which itself can be a view or a snapshot or a base table).
331
353
 
332
354
  Args:
@@ -487,12 +509,28 @@ def get_table(path: str, if_not_exists: Literal['error', 'ignore'] = 'error') ->
487
509
  return tbl
488
510
 
489
511
 
490
- def move(path: str, new_path: str) -> None:
512
+ def move(
513
+ path: str,
514
+ new_path: str,
515
+ *,
516
+ if_exists: Literal['error', 'ignore'] = 'error',
517
+ if_not_exists: Literal['error', 'ignore'] = 'error',
518
+ ) -> None:
491
519
  """Move a schema object to a new directory and/or rename a schema object.
492
520
 
493
521
  Args:
494
522
  path: absolute path to the existing schema object.
495
523
  new_path: absolute new path for the schema object.
524
+ if_exists: Directive regarding how to handle if a schema object already exists at the new path.
525
+ Must be one of the following:
526
+
527
+ - `'error'`: raise an error
528
+ - `'ignore'`: do nothing and return
529
+ if_not_exists: Directive regarding how to handle if the source path does not exist.
530
+ Must be one of the following:
531
+
532
+ - `'error'`: raise an error
533
+ - `'ignore'`: do nothing and return
496
534
 
497
535
  Raises:
498
536
  Error: If path does not exist or new_path already exists.
@@ -506,13 +544,16 @@ def move(path: str, new_path: str) -> None:
506
544
 
507
545
  >>>> pxt.move('dir1.my_table', 'dir1.new_name')
508
546
  """
547
+ if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
548
+ if if_exists_ not in (catalog.IfExistsParam.ERROR, catalog.IfExistsParam.IGNORE):
549
+ raise excs.Error("`if_exists` must be one of 'error' or 'ignore'")
550
+ if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
509
551
  if path == new_path:
510
552
  raise excs.Error('move(): source and destination cannot be identical')
511
553
  path_obj, new_path_obj = catalog.Path.parse(path), catalog.Path.parse(new_path)
512
554
  if path_obj.is_ancestor(new_path_obj):
513
555
  raise excs.Error(f'move(): cannot move {path!r} into its own subdirectory')
514
- cat = Catalog.get()
515
- cat.move(path_obj, new_path_obj)
556
+ Catalog.get().move(path_obj, new_path_obj, if_exists_, if_not_exists_)
516
557
 
517
558
 
518
559
  def drop_table(
@@ -660,8 +701,8 @@ def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths:
660
701
 
661
702
 
662
703
  def create_dir(
663
- path: str, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error', parents: bool = False
664
- ) -> Optional[catalog.Dir]:
704
+ path: str, *, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error', parents: bool = False
705
+ ) -> catalog.Dir | None:
665
706
  """Create a directory.
666
707
 
667
708
  Args:
@@ -816,9 +857,7 @@ def ls(path: str = '') -> pd.DataFrame:
816
857
 
817
858
 
818
859
  def _extract_paths(
819
- dir_entries: dict[str, Catalog.DirEntry],
820
- parent: catalog.Path,
821
- entry_type: Optional[type[catalog.SchemaObject]] = None,
860
+ dir_entries: dict[str, Catalog.DirEntry], parent: catalog.Path, entry_type: type[catalog.SchemaObject] | None = None
822
861
  ) -> list[catalog.Path]:
823
862
  """Convert nested dir_entries structure to a flattened list of paths."""
824
863
  matches: list[str]
@@ -928,7 +967,7 @@ def tools(*args: func.Function | func.tools.Tool) -> func.tools.Tools:
928
967
  return func.tools.Tools(tools=[arg if isinstance(arg, func.tools.Tool) else tool(arg) for arg in args])
929
968
 
930
969
 
931
- def tool(fn: func.Function, name: Optional[str] = None, description: Optional[str] = None) -> func.tools.Tool:
970
+ def tool(fn: func.Function, name: str | None = None, description: str | None = None) -> func.tools.Tool:
932
971
  """
933
972
  Specifies a Pixeltable UDF to be used as an LLM tool with customizable metadata. See the documentation for
934
973
  [pxt.tools()][pixeltable.tools] for more details.
@@ -949,11 +988,7 @@ def tool(fn: func.Function, name: Optional[str] = None, description: Optional[st
949
988
 
950
989
 
951
990
  def configure_logging(
952
- *,
953
- to_stdout: Optional[bool] = None,
954
- level: Optional[int] = None,
955
- add: Optional[str] = None,
956
- remove: Optional[str] = None,
991
+ *, to_stdout: bool | None = None, level: int | None = None, add: str | None = None, remove: str | None = None
957
992
  ) -> None:
958
993
  """Configure logging.
959
994