pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (202) hide show
  1. pixeltable/__init__.py +23 -5
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/__init__.py +5 -3
  4. pixeltable/catalog/catalog.py +1318 -404
  5. pixeltable/catalog/column.py +186 -115
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +11 -43
  8. pixeltable/catalog/insertable_table.py +167 -79
  9. pixeltable/catalog/path.py +61 -23
  10. pixeltable/catalog/schema_object.py +9 -10
  11. pixeltable/catalog/table.py +626 -308
  12. pixeltable/catalog/table_metadata.py +101 -0
  13. pixeltable/catalog/table_version.py +713 -569
  14. pixeltable/catalog/table_version_handle.py +37 -6
  15. pixeltable/catalog/table_version_path.py +42 -29
  16. pixeltable/catalog/tbl_ops.py +50 -0
  17. pixeltable/catalog/update_status.py +191 -0
  18. pixeltable/catalog/view.py +108 -94
  19. pixeltable/config.py +128 -22
  20. pixeltable/dataframe.py +188 -100
  21. pixeltable/env.py +407 -136
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +3 -0
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +231 -0
  27. pixeltable/exec/cell_reconstruction_node.py +135 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +7 -6
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +190 -30
  37. pixeltable/exec/globals.py +32 -0
  38. pixeltable/exec/in_memory_data_node.py +18 -18
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +206 -101
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +34 -30
  46. pixeltable/exprs/column_ref.py +92 -96
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +152 -55
  50. pixeltable/exprs/expr.py +62 -43
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +75 -37
  54. pixeltable/exprs/globals.py +1 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +10 -27
  57. pixeltable/exprs/is_null.py +1 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +5 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +127 -53
  64. pixeltable/exprs/rowid_ref.py +8 -12
  65. pixeltable/exprs/similarity_expr.py +50 -25
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +10 -10
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +20 -18
  78. pixeltable/func/signature.py +43 -16
  79. pixeltable/func/tools.py +23 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +6 -0
  82. pixeltable/functions/anthropic.py +93 -33
  83. pixeltable/functions/audio.py +114 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +1 -1
  86. pixeltable/functions/deepseek.py +20 -9
  87. pixeltable/functions/fireworks.py +2 -2
  88. pixeltable/functions/gemini.py +28 -11
  89. pixeltable/functions/globals.py +13 -13
  90. pixeltable/functions/groq.py +108 -0
  91. pixeltable/functions/huggingface.py +1046 -23
  92. pixeltable/functions/image.py +9 -18
  93. pixeltable/functions/llama_cpp.py +23 -8
  94. pixeltable/functions/math.py +3 -4
  95. pixeltable/functions/mistralai.py +4 -15
  96. pixeltable/functions/ollama.py +16 -9
  97. pixeltable/functions/openai.py +104 -82
  98. pixeltable/functions/openrouter.py +143 -0
  99. pixeltable/functions/replicate.py +2 -2
  100. pixeltable/functions/reve.py +250 -0
  101. pixeltable/functions/string.py +21 -28
  102. pixeltable/functions/timestamp.py +13 -14
  103. pixeltable/functions/together.py +4 -6
  104. pixeltable/functions/twelvelabs.py +92 -0
  105. pixeltable/functions/util.py +6 -1
  106. pixeltable/functions/video.py +1388 -106
  107. pixeltable/functions/vision.py +7 -7
  108. pixeltable/functions/whisper.py +15 -7
  109. pixeltable/functions/whisperx.py +179 -0
  110. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  111. pixeltable/globals.py +332 -105
  112. pixeltable/index/base.py +13 -22
  113. pixeltable/index/btree.py +23 -22
  114. pixeltable/index/embedding_index.py +32 -44
  115. pixeltable/io/__init__.py +4 -2
  116. pixeltable/io/datarows.py +7 -6
  117. pixeltable/io/external_store.py +49 -77
  118. pixeltable/io/fiftyone.py +11 -11
  119. pixeltable/io/globals.py +29 -28
  120. pixeltable/io/hf_datasets.py +17 -9
  121. pixeltable/io/label_studio.py +70 -66
  122. pixeltable/io/lancedb.py +3 -0
  123. pixeltable/io/pandas.py +12 -11
  124. pixeltable/io/parquet.py +13 -93
  125. pixeltable/io/table_data_conduit.py +71 -47
  126. pixeltable/io/utils.py +3 -3
  127. pixeltable/iterators/__init__.py +2 -1
  128. pixeltable/iterators/audio.py +21 -11
  129. pixeltable/iterators/document.py +116 -55
  130. pixeltable/iterators/image.py +5 -2
  131. pixeltable/iterators/video.py +293 -13
  132. pixeltable/metadata/__init__.py +4 -2
  133. pixeltable/metadata/converters/convert_18.py +2 -2
  134. pixeltable/metadata/converters/convert_19.py +2 -2
  135. pixeltable/metadata/converters/convert_20.py +2 -2
  136. pixeltable/metadata/converters/convert_21.py +2 -2
  137. pixeltable/metadata/converters/convert_22.py +2 -2
  138. pixeltable/metadata/converters/convert_24.py +2 -2
  139. pixeltable/metadata/converters/convert_25.py +2 -2
  140. pixeltable/metadata/converters/convert_26.py +2 -2
  141. pixeltable/metadata/converters/convert_29.py +4 -4
  142. pixeltable/metadata/converters/convert_34.py +2 -2
  143. pixeltable/metadata/converters/convert_36.py +2 -2
  144. pixeltable/metadata/converters/convert_37.py +15 -0
  145. pixeltable/metadata/converters/convert_38.py +39 -0
  146. pixeltable/metadata/converters/convert_39.py +124 -0
  147. pixeltable/metadata/converters/convert_40.py +73 -0
  148. pixeltable/metadata/converters/util.py +13 -12
  149. pixeltable/metadata/notes.py +4 -0
  150. pixeltable/metadata/schema.py +79 -42
  151. pixeltable/metadata/utils.py +74 -0
  152. pixeltable/mypy/__init__.py +3 -0
  153. pixeltable/mypy/mypy_plugin.py +123 -0
  154. pixeltable/plan.py +274 -223
  155. pixeltable/share/__init__.py +1 -1
  156. pixeltable/share/packager.py +259 -129
  157. pixeltable/share/protocol/__init__.py +34 -0
  158. pixeltable/share/protocol/common.py +170 -0
  159. pixeltable/share/protocol/operation_types.py +33 -0
  160. pixeltable/share/protocol/replica.py +109 -0
  161. pixeltable/share/publish.py +213 -57
  162. pixeltable/store.py +238 -175
  163. pixeltable/type_system.py +104 -63
  164. pixeltable/utils/__init__.py +2 -3
  165. pixeltable/utils/arrow.py +108 -13
  166. pixeltable/utils/av.py +298 -0
  167. pixeltable/utils/azure_store.py +305 -0
  168. pixeltable/utils/code.py +3 -3
  169. pixeltable/utils/console_output.py +4 -1
  170. pixeltable/utils/coroutine.py +6 -23
  171. pixeltable/utils/dbms.py +31 -5
  172. pixeltable/utils/description_helper.py +4 -5
  173. pixeltable/utils/documents.py +5 -6
  174. pixeltable/utils/exception_handler.py +7 -30
  175. pixeltable/utils/filecache.py +6 -6
  176. pixeltable/utils/formatter.py +4 -6
  177. pixeltable/utils/gcs_store.py +283 -0
  178. pixeltable/utils/http_server.py +2 -3
  179. pixeltable/utils/iceberg.py +1 -2
  180. pixeltable/utils/image.py +17 -0
  181. pixeltable/utils/lancedb.py +88 -0
  182. pixeltable/utils/local_store.py +316 -0
  183. pixeltable/utils/misc.py +5 -0
  184. pixeltable/utils/object_stores.py +528 -0
  185. pixeltable/utils/pydantic.py +60 -0
  186. pixeltable/utils/pytorch.py +5 -6
  187. pixeltable/utils/s3_store.py +392 -0
  188. pixeltable-0.4.20.dist-info/METADATA +587 -0
  189. pixeltable-0.4.20.dist-info/RECORD +218 -0
  190. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
  191. pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
  192. pixeltable/__version__.py +0 -3
  193. pixeltable/ext/__init__.py +0 -17
  194. pixeltable/ext/functions/__init__.py +0 -11
  195. pixeltable/ext/functions/whisperx.py +0 -77
  196. pixeltable/utils/media_store.py +0 -77
  197. pixeltable/utils/s3.py +0 -17
  198. pixeltable/utils/sample.py +0 -25
  199. pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
  200. pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
  201. pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
  202. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
@@ -1,15 +1,21 @@
1
+ import glob
1
2
  import logging
2
3
  import math
4
+ import subprocess
3
5
  from fractions import Fraction
4
6
  from pathlib import Path
5
- from typing import Any, Optional
7
+ from typing import Any, Iterator, Literal
6
8
 
7
9
  import av
8
10
  import pandas as pd
9
11
  import PIL.Image
10
12
 
13
+ import pixeltable as pxt
11
14
  import pixeltable.exceptions as excs
12
15
  import pixeltable.type_system as ts
16
+ import pixeltable.utils.av as av_utils
17
+ from pixeltable.env import Env
18
+ from pixeltable.utils.local_store import TempStore
13
19
 
14
20
  from .base import ComponentIterator
15
21
 
@@ -29,12 +35,29 @@ class FrameIterator(ComponentIterator):
29
35
  extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
30
36
  num_frames: Exact number of frames to extract. The frames will be spaced as evenly as possible. If
31
37
  `num_frames` is greater than the number of frames in the video, all frames will be extracted.
38
+ all_frame_attrs:
39
+ If True, outputs a `pxt.Json` column `frame_attrs` with the following `pyav`-provided attributes
40
+ (for more information, see `pyav`'s documentation on
41
+ [VideoFrame](https://pyav.org/docs/develop/api/video.html#module-av.video.frame) and
42
+ [Frame](https://pyav.org/docs/develop/api/frame.html)):
43
+
44
+ * `index` (`int`)
45
+ * `pts` (`int | None`)
46
+ * `dts` (`int | None`)
47
+ * `time` (`float | None`)
48
+ * `is_corrupt` (`bool`)
49
+ * `key_frame` (`bool`)
50
+ * `pict_type` (`int`)
51
+ * `interlaced_frame` (`bool`)
52
+
53
+ If False, only outputs frame attributes `frame_idx`, `pos_msec`, and `pos_frame` as separate columns.
32
54
  """
33
55
 
34
56
  # Input parameters
35
57
  video_path: Path
36
- fps: Optional[float]
37
- num_frames: Optional[int]
58
+ fps: float | None
59
+ num_frames: int | None
60
+ all_frame_attrs: bool
38
61
 
39
62
  # Video info
40
63
  container: av.container.input.InputContainer
@@ -44,13 +67,15 @@ class FrameIterator(ComponentIterator):
44
67
  video_start_time: int
45
68
 
46
69
  # List of frame indices to be extracted, or None to extract all frames
47
- frames_to_extract: Optional[list[int]]
70
+ frames_to_extract: list[int] | None
48
71
 
49
72
  # Next frame to extract, as an iterator `pos` index. If `frames_to_extract` is None, this is the same as the
50
73
  # frame index in the video. Otherwise, the corresponding video index is `frames_to_extract[next_pos]`.
51
74
  next_pos: int
52
75
 
53
- def __init__(self, video: str, *, fps: Optional[float] = None, num_frames: Optional[int] = None):
76
+ def __init__(
77
+ self, video: str, *, fps: float | None = None, num_frames: int | None = None, all_frame_attrs: bool = False
78
+ ):
54
79
  if fps is not None and num_frames is not None:
55
80
  raise excs.Error('At most one of `fps` or `num_frames` may be specified')
56
81
 
@@ -60,6 +85,7 @@ class FrameIterator(ComponentIterator):
60
85
  self.container = av.open(str(video_path))
61
86
  self.fps = fps
62
87
  self.num_frames = num_frames
88
+ self.all_frame_attrs = all_frame_attrs
63
89
 
64
90
  self.video_framerate = self.container.streams.video[0].average_rate
65
91
  self.video_time_base = self.container.streams.video[0].time_base
@@ -115,16 +141,17 @@ class FrameIterator(ComponentIterator):
115
141
  'video': ts.VideoType(nullable=False),
116
142
  'fps': ts.FloatType(nullable=True),
117
143
  'num_frames': ts.IntType(nullable=True),
144
+ 'all_frame_attrs': ts.BoolType(nullable=False),
118
145
  }
119
146
 
120
147
  @classmethod
121
148
  def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
122
- return {
123
- 'frame_idx': ts.IntType(),
124
- 'pos_msec': ts.FloatType(),
125
- 'pos_frame': ts.IntType(),
126
- 'frame': ts.ImageType(),
127
- }, ['frame']
149
+ attrs: dict[str, ts.ColumnType]
150
+ if kwargs.get('all_frame_attrs'):
151
+ attrs = {'frame_attrs': ts.JsonType()}
152
+ else:
153
+ attrs = {'frame_idx': ts.IntType(), 'pos_msec': ts.FloatType(), 'pos_frame': ts.IntType()}
154
+ return {**attrs, 'frame': ts.ImageType()}, ['frame']
128
155
 
129
156
  def __next__(self) -> dict[str, Any]:
130
157
  # Determine the frame index in the video corresponding to the iterator index `next_pos`;
@@ -164,8 +191,22 @@ class FrameIterator(ComponentIterator):
164
191
  raise excs.Error(f'Frame {next_video_idx} is missing from the video (video file is corrupt)')
165
192
  img = frame.to_image()
166
193
  assert isinstance(img, PIL.Image.Image)
167
- pos_msec = float(pts * self.video_time_base * 1000)
168
- result = {'frame_idx': self.next_pos, 'pos_msec': pos_msec, 'pos_frame': video_idx, 'frame': img}
194
+ pts_msec = float(pts * self.video_time_base * 1000)
195
+ result: dict[str, Any] = {'frame': img}
196
+ if self.all_frame_attrs:
197
+ attrs = {
198
+ 'index': video_idx,
199
+ 'pts': frame.pts,
200
+ 'dts': frame.dts,
201
+ 'time': frame.time,
202
+ 'is_corrupt': frame.is_corrupt,
203
+ 'key_frame': frame.key_frame,
204
+ 'pict_type': frame.pict_type,
205
+ 'interlaced_frame': frame.interlaced_frame,
206
+ }
207
+ result['frame_attrs'] = attrs
208
+ else:
209
+ result.update({'frame_idx': self.next_pos, 'pos_msec': pts_msec, 'pos_frame': video_idx})
169
210
  self.next_pos += 1
170
211
  return result
171
212
 
@@ -184,3 +225,242 @@ class FrameIterator(ComponentIterator):
184
225
  # then the iterator will step forward to the desired frame on the subsequent call to next().
185
226
  self.container.seek(seek_pos, backward=True, stream=self.container.streams.video[0])
186
227
  self.next_pos = pos
228
+
229
+
230
class VideoSplitter(ComponentIterator):
    """
    Iterator over segments of a video file.

    The video is split either into fixed-length segments of `duration` seconds or at the explicit split points
    given in `segment_times`; exactly one of the two must be specified. Segments are produced by running `ffmpeg`
    in a subprocess and are written to temporary `.mp4` files.

    Each output row contains `segment_start`/`segment_end` (seconds), `segment_start_pts`/`segment_end_pts`
    (in units of the video stream's time base) and `video_segment` (path of the segment file).

    Args:
        video: Path of the video file to split.
        duration: Video segment duration in seconds.
        overlap: Overlap between consecutive segments in seconds. Only available for `mode='fast'`; cannot be
            combined with `segment_times`.
        min_segment_duration: Minimum segment length in seconds. Iteration stops at the first segment that is
            shorter than this (normally the last one), and that segment is dropped.
        segment_times: Timestamps (in seconds) at which to split the video. One additional segment is produced
            for the remainder of the video after the last timestamp.
        mode: Segmentation mode:
            - `'fast'`: Quick segmentation using stream copy (splits only at keyframes, approximate durations)
            - `'accurate'`: Precise segmentation with re-encoding (exact durations, slower)
        video_encoder: Video encoder to use. If not specified, uses the default encoder for the current platform.
            Only available for `mode='accurate'`.
        video_encoder_args: Additional arguments to pass to the video encoder. Only available for `mode='accurate'`.
    """

    # Input parameters
    video_path: Path
    segment_duration: float | None
    segment_times: list[float] | None
    overlap: float
    min_segment_duration: float
    video_encoder: str | None
    video_encoder_args: dict[str, Any] | None

    # Video metadata
    video_duration: float  # NOTE(review): declared but never assigned in this class
    video_time_base: Fraction
    video_start_time: int

    # Generator producing the output rows; chosen in __init__ according to `mode`
    output_iter: Iterator[dict[str, Any]]

    def __init__(
        self,
        video: str,
        *,
        duration: float | None = None,
        overlap: float | None = None,
        min_segment_duration: float | None = None,
        segment_times: list[float] | None = None,
        mode: Literal['fast', 'accurate'] = 'accurate',
        video_encoder: str | None = None,
        video_encoder_args: dict[str, Any] | None = None,
    ):
        Env.get().require_binary('ffmpeg')

        # These asserts mirror the checks that output_schema() reports as excs.Error.
        assert (duration is not None) != (segment_times is not None)
        if segment_times is not None:
            assert len(segment_times) > 0
        if duration is not None:
            assert duration > 0.0
            # min_segment_duration defaults to None; comparing a float against None would raise TypeError
            assert min_segment_duration is None or duration >= min_segment_duration
            assert overlap is None or overlap < duration

        video_path = Path(video)
        assert video_path.exists() and video_path.is_file()

        self.video_path = video_path
        self.segment_duration = duration
        self.overlap = overlap if overlap is not None else 0.0
        self.min_segment_duration = min_segment_duration if min_segment_duration is not None else 0.0
        self.segment_times = segment_times
        self.video_encoder = video_encoder
        self.video_encoder_args = video_encoder_args

        # Only the stream's timing metadata is needed; the container is closed again right away.
        with av.open(str(video_path)) as container:
            video_stream = container.streams.video[0]
            self.video_time_base = video_stream.time_base
            self.video_start_time = video_stream.start_time or 0

        self.output_iter = self.fast_iter() if mode == 'fast' else self.accurate_iter()

    @classmethod
    def input_schema(cls) -> dict[str, ts.ColumnType]:
        """Return the column types of the constructor arguments."""
        return {
            'video': ts.VideoType(nullable=False),
            'duration': ts.FloatType(nullable=True),
            'overlap': ts.FloatType(nullable=True),
            'min_segment_duration': ts.FloatType(nullable=True),
            'segment_times': ts.JsonType(nullable=True),
            'mode': ts.StringType(nullable=False),
            'video_encoder': ts.StringType(nullable=True),
            'video_encoder_args': ts.JsonType(nullable=True),
        }

    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
        """
        Validate the iterator arguments and return the output columns.

        Returns:
            A tuple of the output column types and an empty list.

        Raises:
            excs.Error: if the combination of arguments is invalid.
        """
        # Positional arguments are matched to these names in order; keyword arguments take precedence.
        param_names = ['duration', 'overlap', 'min_segment_duration', 'segment_times']
        params = dict(zip(param_names, args))
        params.update(kwargs)

        segment_duration = params.get('duration')
        segment_times = params.get('segment_times')
        overlap = params.get('overlap')
        min_segment_duration = params.get('min_segment_duration')
        # Must match the default of `mode` in __init__; otherwise the mode-specific checks below would be applied
        # to a different mode than the one that actually runs.
        mode = params.get('mode', 'accurate')

        if segment_duration is None and segment_times is None:
            raise excs.Error('Must specify either duration or segment_times')
        if segment_duration is not None and segment_times is not None:
            raise excs.Error('duration and segment_times cannot both be specified')
        if segment_times is not None:
            if len(segment_times) == 0:
                raise excs.Error('segment_times cannot be empty')
            if overlap is not None:
                raise excs.Error('overlap cannot be specified with segment_times')
        if segment_duration is not None:
            if segment_duration <= 0.0:
                raise excs.Error('duration must be a positive number')
            if min_segment_duration is not None and segment_duration < min_segment_duration:
                raise excs.Error('duration must be at least min_segment_duration')
            if overlap is not None and overlap >= segment_duration:
                raise excs.Error('overlap must be less than duration')
        if mode == 'accurate' and overlap is not None:
            raise excs.Error("Cannot specify overlap for mode='accurate'")
        if mode == 'fast':
            if params.get('video_encoder') is not None:
                raise excs.Error("Cannot specify video_encoder for mode='fast'")
            if params.get('video_encoder_args') is not None:
                raise excs.Error("Cannot specify video_encoder_args for mode='fast'")

        return {
            'segment_start': ts.FloatType(nullable=False),
            'segment_start_pts': ts.IntType(nullable=False),
            'segment_end': ts.FloatType(nullable=False),
            'segment_end_pts': ts.IntType(nullable=False),
            'video_segment': ts.VideoType(nullable=False),
        }, []

    def fast_iter(self) -> Iterator[dict[str, Any]]:
        """
        Yield one row per segment, clipping the segments one at a time with a separate ffmpeg invocation.

        Raises:
            pxt.Error: if ffmpeg exits with a non-zero return code.
        """
        segment_path: str = ''
        try:
            start_time = 0.0
            start_pts = 0
            segment_idx = 0
            while True:
                target_duration: float | None
                if self.segment_duration is not None:
                    target_duration = self.segment_duration
                elif self.segment_times is not None and segment_idx < len(self.segment_times):
                    target_duration = self.segment_times[segment_idx] - start_time
                else:
                    target_duration = None  # the rest of the video

                segment_path = str(TempStore.create_path(extension='.mp4'))
                cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time, target_duration)
                _ = subprocess.run(cmd, capture_output=True, text=True, check=True)

                # use the actual duration
                segment_duration = av_utils.get_video_duration(segment_path)
                # A clip no longer than the overlap adds nothing beyond the previous segment, and the next start
                # time would not advance past the current one; treat it as the end of the video.
                if segment_duration <= self.overlap or segment_duration < self.min_segment_duration:
                    # we're done
                    Path(segment_path).unlink()
                    return

                segment_end = start_time + segment_duration
                segment_end_pts = start_pts + round(segment_duration / self.video_time_base)
                result = {
                    'segment_start': start_time,
                    'segment_start_pts': start_pts,
                    'segment_end': segment_end,
                    'segment_end_pts': segment_end_pts,
                    'video_segment': segment_path,
                }
                yield result

                # The next segment starts `overlap` seconds before the end of this one.
                start_time = segment_end - self.overlap
                start_pts = segment_end_pts - round(self.overlap / self.video_time_base)

                segment_idx += 1
                if self.segment_times is not None and segment_idx > len(self.segment_times):
                    # We've created all segments including the final segment after the last segment_time
                    break

        except subprocess.CalledProcessError as e:
            # Remove the partially written output of the failed ffmpeg run, if there is one.
            if segment_path and Path(segment_path).exists():
                Path(segment_path).unlink()
            error_msg = f'ffmpeg failed with return code {e.returncode}'
            if e.stderr:
                error_msg += f': {e.stderr.strip()}'
            raise pxt.Error(error_msg) from e

    def accurate_iter(self) -> Iterator[dict[str, Any]]:
        """
        Yield one row per segment, creating all segment files up front with a single ffmpeg invocation.

        Raises:
            pxt.Error: if ffmpeg exits with a non-zero return code.
        """
        base_path = TempStore.create_path(extension='')
        # Use ffmpeg -f segment for accurate segmentation with re-encoding
        output_pattern = f'{base_path}_segment_%04d.mp4'
        cmd = av_utils.ffmpeg_segment_cmd(
            str(self.video_path),
            output_pattern,
            segment_duration=self.segment_duration,
            segment_times=self.segment_times,
            video_encoder=self.video_encoder,
            video_encoder_args=self.video_encoder_args,
        )

        try:
            _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
            # The zero-padded index in the file name makes lexicographic order equal to segment order.
            output_paths = sorted(glob.glob(f'{base_path}_segment_*.mp4'))
            # TODO: decide whether ffmpeg producing no output files should be reported as an error
            start_time = 0.0
            start_pts = 0
            for segment_path in output_paths:
                segment_duration = av_utils.get_video_duration(segment_path)
                if segment_duration < self.min_segment_duration:
                    Path(segment_path).unlink()
                    return

                result = {
                    'segment_start': start_time,
                    'segment_start_pts': start_pts,
                    'segment_end': start_time + segment_duration,
                    'segment_end_pts': start_pts + round(segment_duration / self.video_time_base),
                    'video_segment': segment_path,
                }
                yield result
                start_time += segment_duration
                start_pts += round(segment_duration / self.video_time_base)

        except subprocess.CalledProcessError as e:
            error_msg = f'ffmpeg failed with return code {e.returncode}'
            if e.stderr:
                error_msg += f': {e.stderr.strip()}'
            raise pxt.Error(error_msg) from e

    def __next__(self) -> dict[str, Any]:
        """Return the next segment row from the generator selected in __init__."""
        return next(self.output_iter)

    def close(self) -> None:
        """No-op: no resources are held open between calls."""
        pass

    def set_pos(self, pos: int) -> None:
        """No-op: repositioning is not implemented for this iterator."""
        pass
@@ -18,13 +18,14 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
18
18
  _logger = logging.getLogger('pixeltable')
19
19
 
20
20
  # current version of the metadata; this is incremented whenever the metadata schema changes
21
- VERSION = 37
21
+ VERSION = 41
22
22
 
23
23
 
24
24
  def create_system_info(engine: sql.engine.Engine) -> None:
25
25
  """Create the system metadata record"""
26
26
  system_md = SystemInfoMd(schema_version=VERSION)
27
27
  record = SystemInfo(md=dataclasses.asdict(system_md))
28
+ _logger.debug(f'Creating pixeltable system info record {record}')
28
29
  with orm.Session(engine, future=True) as session:
29
30
  # Write system metadata only once for idempotency
30
31
  if session.query(SystemInfo).count() == 0:
@@ -54,7 +55,8 @@ for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/convert
54
55
  def upgrade_md(engine: sql.engine.Engine) -> None:
55
56
  """Upgrade the metadata schema to the current version"""
56
57
  with orm.Session(engine) as session:
57
- system_info = session.query(SystemInfo).one().md
58
+ # Get exclusive lock on SystemInfo row
59
+ system_info = session.query(SystemInfo).with_for_update().one().md
58
60
  md_version = system_info['schema_version']
59
61
  assert isinstance(md_version, int)
60
62
  _logger.info(f'Current database version: {md_version}, installed version: {VERSION}')
@@ -1,4 +1,4 @@
1
- from typing import Any, Optional
1
+ from typing import Any
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
11
11
  convert_table_md(engine, substitution_fn=__substitute_md)
12
12
 
13
13
 
14
- def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
14
+ def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
15
15
  # Migrate a few changed function names
16
16
  if k == 'path' and v == 'pixeltable.functions.string.str_format':
17
17
  return 'path', 'pixeltable.functions.string.format'
@@ -1,5 +1,5 @@
1
1
  import datetime
2
- from typing import Any, Optional
2
+ from typing import Any
3
3
 
4
4
  import sqlalchemy as sql
5
5
 
@@ -28,7 +28,7 @@ def _(engine: sql.engine.Engine) -> None:
28
28
  conn.execute(sql.text(f'ALTER TABLE {store_name} ALTER COLUMN col_{col_id} TYPE TIMESTAMPTZ'))
29
29
 
30
30
 
31
- def __update_timestamp_literals(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
31
+ def __update_timestamp_literals(k: Any, v: Any) -> tuple[Any, Any] | None:
32
32
  if isinstance(v, dict) and 'val_t' in v:
33
33
  # It's a literal with an explicit 'val_t' field. In version 19 this can only mean a
34
34
  # timestamp literal, which (in version 19) is stored in the DB as a naive datetime.
@@ -1,4 +1,4 @@
1
- from typing import Any, Optional
1
+ from typing import Any
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
11
11
  convert_table_md(engine, substitution_fn=__substitute_md)
12
12
 
13
13
 
14
- def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
14
+ def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
15
15
  if isinstance(v, dict) and '_classname' in v:
16
16
  # The way InlineArray is represented changed in v20. Previously, literal values were stored
17
17
  # directly in the Inline expr; now we store them in Literal sub-exprs. This converter
@@ -1,4 +1,4 @@
1
- from typing import Any, Optional
1
+ from typing import Any
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
@@ -24,7 +24,7 @@ def __update_schema_column(schema_column: dict) -> None:
24
24
  schema_column['media_validation'] = None
25
25
 
26
26
 
27
- def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
27
+ def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
28
28
  if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ColumnRef':
29
29
  if 'perform_validation' not in v:
30
30
  v['perform_validation'] = False
@@ -1,4 +1,4 @@
1
- from typing import Any, Optional
1
+ from typing import Any
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
11
11
  convert_table_md(engine, substitution_fn=__substitute_md)
12
12
 
13
13
 
14
- def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
14
+ def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
15
15
  if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'DataFrame':
16
16
  v['from_clause'] = {'tbls': [v['tbl']], 'join_clauses': []}
17
17
  return k, v
@@ -1,4 +1,4 @@
1
- from typing import Any, Optional
1
+ from typing import Any
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
11
11
  convert_table_md(engine, substitution_fn=__substitute_md)
12
12
 
13
13
 
14
- def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
14
+ def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
15
15
  from pixeltable import func
16
16
  from pixeltable.func.globals import resolve_symbol
17
17
 
@@ -1,4 +1,4 @@
1
- from typing import Any, Optional
1
+ from typing import Any
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
11
11
  convert_table_md(engine, substitution_fn=__substitute_md)
12
12
 
13
13
 
14
- def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
14
+ def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
15
15
  if k == 'path' and (
16
16
  v in ('pixeltable.functions.huggingface.clip_text', 'pixeltable.functions.huggingface.clip_image')
17
17
  ):
@@ -1,4 +1,4 @@
1
- from typing import Any, Optional
1
+ from typing import Any
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
11
11
  convert_table_md(engine, substitution_fn=__substitute_md)
12
12
 
13
13
 
14
- def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
14
+ def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
15
15
  import pixeltable.type_system as ts
16
16
  from pixeltable.exprs.literal import Literal
17
17
 
@@ -1,4 +1,4 @@
1
- from typing import Any, Optional
1
+ from typing import Any
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
@@ -12,7 +12,7 @@ def _(engine: sql.engine.Engine) -> None:
12
12
  convert_table_md(engine, substitution_fn=__substitute_md)
13
13
 
14
14
 
15
- def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
15
+ def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
16
16
  # Defaults are now stored as literals in signatures
17
17
  if k == 'parameters':
18
18
  for param in v:
@@ -55,8 +55,8 @@ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], A
55
55
  # We need to expand ("unroll") any var-args or var-kwargs.
56
56
 
57
57
  new_args_len = len(new_args)
58
- rolled_args: Optional[dict] = None
59
- rolled_kwargs: Optional[dict] = None
58
+ rolled_args: dict | None = None
59
+ rolled_kwargs: dict | None = None
60
60
 
61
61
  if 'signature' in v['fn']:
62
62
  # If it's a pickled function, there's no signature, so we're out of luck; varargs in a pickled function
@@ -1,4 +1,4 @@
1
- from typing import Any, Optional
1
+ from typing import Any
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
11
11
  convert_table_md(engine, substitution_fn=__substitute_md)
12
12
 
13
13
 
14
- def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
14
+ def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
15
15
  if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ColumnRef':
16
16
  # Add reference_tbl to ColumnRef; for historical metadata it is always equal to tbl
17
17
  assert 'reference_tbl' not in v
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import Any, Optional
2
+ from typing import Any
3
3
  from uuid import UUID
4
4
 
5
5
  import sqlalchemy as sql
@@ -30,7 +30,7 @@ def __update_table_md(table_md: dict, table_id: UUID) -> None:
30
30
  _logger.info(f'Updating view metadata for table: {table_id}')
31
31
 
32
32
 
33
- def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
33
+ def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
34
34
  if isinstance(v, dict) and (v.get('_classname') == 'DataFrame'):
35
35
  if 'sample_clause' not in v:
36
36
  v['sample_clause'] = None
@@ -0,0 +1,15 @@
1
+ from uuid import UUID
2
+
3
+ import sqlalchemy as sql
4
+
5
+ from pixeltable.metadata import register_converter
6
+ from pixeltable.metadata.converters.util import convert_table_md
7
+
8
+
9
@register_converter(version=37)
def _(engine: sql.engine.Engine) -> None:
    """Converter registered for metadata version 37.

    Hands `__update_table_md` to `convert_table_md` as the table-metadata updater.
    """
    convert_table_md(engine, table_md_updater=__update_table_md)
12
+
13
+
14
def __update_table_md(table_md: dict, _: UUID) -> None:
    """Set the `view_sn` entry of `table_md` to 0, in place; the UUID argument is not used."""
    table_md.update(view_sn=0)
@@ -0,0 +1,39 @@
1
+ from typing import Any
2
+
3
+ import sqlalchemy as sql
4
+
5
+ from pixeltable.metadata import register_converter
6
+ from pixeltable.metadata.converters.util import convert_table_md
7
+
8
+
9
@register_converter(version=38)
def _(engine: sql.engine.Engine) -> None:
    """Converter registered for metadata version 38.

    Hands `__substitute_md` to `convert_table_md` as the substitution function.
    """
    convert_table_md(engine, substitution_fn=__substitute_md)
12
+
13
+
14
+ def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
15
+ if k == 'col_mapping':
16
+ assert isinstance(v, list)
17
+ return k, [__col_mapping_entry(e) for e in v]
18
+ if k == 'stored_proxies':
19
+ assert isinstance(v, list)
20
+ return k, [__stored_proxies_entry(e) for e in v]
21
+ return None
22
+
23
+
24
def __col_mapping_entry(e: list) -> list:
    """Convert a `[column dict, str]` pair: the dict becomes a column handle, the string is kept as is."""
    assert isinstance(e, list)
    col_md = e[0]
    assert isinstance(col_md, dict)
    mapped_name = e[1]
    assert isinstance(mapped_name, str)
    return [__col_handle(col_md), mapped_name]
29
+
30
+
31
def __stored_proxies_entry(e: list) -> list:
    """Convert a `[column dict, column dict]` pair so that both elements become column handles."""
    assert isinstance(e, list)
    first_col = e[0]
    assert isinstance(first_col, dict)
    second_col = e[1]
    assert isinstance(second_col, dict)
    return [__col_handle(first_col), __col_handle(second_col)]
36
+
37
+
38
def __col_handle(e: dict) -> dict:
    """Build a column handle dict from a column dict that has `tbl_id` and `col_id` keys.

    The handle nests the table id under `tbl_version` with `effective_version` set to None.
    """
    tbl_version = {'id': e['tbl_id'], 'effective_version': None}
    return {'tbl_version': tbl_version, 'col_id': e['col_id']}