pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +23 -5
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -3
- pixeltable/catalog/catalog.py +1318 -404
- pixeltable/catalog/column.py +186 -115
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +11 -43
- pixeltable/catalog/insertable_table.py +167 -79
- pixeltable/catalog/path.py +61 -23
- pixeltable/catalog/schema_object.py +9 -10
- pixeltable/catalog/table.py +626 -308
- pixeltable/catalog/table_metadata.py +101 -0
- pixeltable/catalog/table_version.py +713 -569
- pixeltable/catalog/table_version_handle.py +37 -6
- pixeltable/catalog/table_version_path.py +42 -29
- pixeltable/catalog/tbl_ops.py +50 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +108 -94
- pixeltable/config.py +128 -22
- pixeltable/dataframe.py +188 -100
- pixeltable/env.py +407 -136
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +3 -0
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +7 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +190 -30
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/in_memory_data_node.py +18 -18
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +206 -101
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +34 -30
- pixeltable/exprs/column_ref.py +92 -96
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +152 -55
- pixeltable/exprs/expr.py +62 -43
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +75 -37
- pixeltable/exprs/globals.py +1 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +10 -27
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +127 -53
- pixeltable/exprs/rowid_ref.py +8 -12
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +10 -10
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +20 -18
- pixeltable/func/signature.py +43 -16
- pixeltable/func/tools.py +23 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +6 -0
- pixeltable/functions/anthropic.py +93 -33
- pixeltable/functions/audio.py +114 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +1 -1
- pixeltable/functions/deepseek.py +20 -9
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +28 -11
- pixeltable/functions/globals.py +13 -13
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1046 -23
- pixeltable/functions/image.py +9 -18
- pixeltable/functions/llama_cpp.py +23 -8
- pixeltable/functions/math.py +3 -4
- pixeltable/functions/mistralai.py +4 -15
- pixeltable/functions/ollama.py +16 -9
- pixeltable/functions/openai.py +104 -82
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +2 -2
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +13 -14
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/video.py +1388 -106
- pixeltable/functions/vision.py +7 -7
- pixeltable/functions/whisper.py +15 -7
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +332 -105
- pixeltable/index/base.py +13 -22
- pixeltable/index/btree.py +23 -22
- pixeltable/index/embedding_index.py +32 -44
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +7 -6
- pixeltable/io/external_store.py +49 -77
- pixeltable/io/fiftyone.py +11 -11
- pixeltable/io/globals.py +29 -28
- pixeltable/io/hf_datasets.py +17 -9
- pixeltable/io/label_studio.py +70 -66
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +12 -11
- pixeltable/io/parquet.py +13 -93
- pixeltable/io/table_data_conduit.py +71 -47
- pixeltable/io/utils.py +3 -3
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +21 -11
- pixeltable/iterators/document.py +116 -55
- pixeltable/iterators/image.py +5 -2
- pixeltable/iterators/video.py +293 -13
- pixeltable/metadata/__init__.py +4 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/util.py +13 -12
- pixeltable/metadata/notes.py +4 -0
- pixeltable/metadata/schema.py +79 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +274 -223
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +259 -129
- pixeltable/share/protocol/__init__.py +34 -0
- pixeltable/share/protocol/common.py +170 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +109 -0
- pixeltable/share/publish.py +213 -57
- pixeltable/store.py +238 -175
- pixeltable/type_system.py +104 -63
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +108 -13
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +88 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +528 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +392 -0
- pixeltable-0.4.20.dist-info/METADATA +587 -0
- pixeltable-0.4.20.dist-info/RECORD +218 -0
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
- pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable/utils/sample.py +0 -25
- pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
- pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
- pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
pixeltable/iterators/video.py
CHANGED
|
@@ -1,15 +1,21 @@
|
|
|
1
|
+
import glob
|
|
1
2
|
import logging
|
|
2
3
|
import math
|
|
4
|
+
import subprocess
|
|
3
5
|
from fractions import Fraction
|
|
4
6
|
from pathlib import Path
|
|
5
|
-
from typing import Any,
|
|
7
|
+
from typing import Any, Iterator, Literal
|
|
6
8
|
|
|
7
9
|
import av
|
|
8
10
|
import pandas as pd
|
|
9
11
|
import PIL.Image
|
|
10
12
|
|
|
13
|
+
import pixeltable as pxt
|
|
11
14
|
import pixeltable.exceptions as excs
|
|
12
15
|
import pixeltable.type_system as ts
|
|
16
|
+
import pixeltable.utils.av as av_utils
|
|
17
|
+
from pixeltable.env import Env
|
|
18
|
+
from pixeltable.utils.local_store import TempStore
|
|
13
19
|
|
|
14
20
|
from .base import ComponentIterator
|
|
15
21
|
|
|
@@ -29,12 +35,29 @@ class FrameIterator(ComponentIterator):
|
|
|
29
35
|
extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
|
|
30
36
|
num_frames: Exact number of frames to extract. The frames will be spaced as evenly as possible. If
|
|
31
37
|
`num_frames` is greater than the number of frames in the video, all frames will be extracted.
|
|
38
|
+
all_frame_attrs:
|
|
39
|
+
If True, outputs a `pxt.Json` column `frame_attrs` with the following `pyav`-provided attributes
|
|
40
|
+
(for more information, see `pyav`'s documentation on
|
|
41
|
+
[VideoFrame](https://pyav.org/docs/develop/api/video.html#module-av.video.frame) and
|
|
42
|
+
[Frame](https://pyav.org/docs/develop/api/frame.html)):
|
|
43
|
+
|
|
44
|
+
* `index` (`int`)
|
|
45
|
+
* `pts` (`int | None`)
|
|
46
|
+
* `dts` (`int | None`)
|
|
47
|
+
* `time` (`float | None`)
|
|
48
|
+
* `is_corrupt` (`bool`)
|
|
49
|
+
* `key_frame` (`bool`)
|
|
50
|
+
* `pict_type` (`int`)
|
|
51
|
+
* `interlaced_frame` (`bool`)
|
|
52
|
+
|
|
53
|
+
If False, only outputs frame attributes `frame_idx`, `pos_msec`, and `pos_frame` as separate columns.
|
|
32
54
|
"""
|
|
33
55
|
|
|
34
56
|
# Input parameters
|
|
35
57
|
video_path: Path
|
|
36
|
-
fps:
|
|
37
|
-
num_frames:
|
|
58
|
+
fps: float | None
|
|
59
|
+
num_frames: int | None
|
|
60
|
+
all_frame_attrs: bool
|
|
38
61
|
|
|
39
62
|
# Video info
|
|
40
63
|
container: av.container.input.InputContainer
|
|
@@ -44,13 +67,15 @@ class FrameIterator(ComponentIterator):
|
|
|
44
67
|
video_start_time: int
|
|
45
68
|
|
|
46
69
|
# List of frame indices to be extracted, or None to extract all frames
|
|
47
|
-
frames_to_extract:
|
|
70
|
+
frames_to_extract: list[int] | None
|
|
48
71
|
|
|
49
72
|
# Next frame to extract, as an iterator `pos` index. If `frames_to_extract` is None, this is the same as the
|
|
50
73
|
# frame index in the video. Otherwise, the corresponding video index is `frames_to_extract[next_pos]`.
|
|
51
74
|
next_pos: int
|
|
52
75
|
|
|
53
|
-
def __init__(
|
|
76
|
+
def __init__(
|
|
77
|
+
self, video: str, *, fps: float | None = None, num_frames: int | None = None, all_frame_attrs: bool = False
|
|
78
|
+
):
|
|
54
79
|
if fps is not None and num_frames is not None:
|
|
55
80
|
raise excs.Error('At most one of `fps` or `num_frames` may be specified')
|
|
56
81
|
|
|
@@ -60,6 +85,7 @@ class FrameIterator(ComponentIterator):
|
|
|
60
85
|
self.container = av.open(str(video_path))
|
|
61
86
|
self.fps = fps
|
|
62
87
|
self.num_frames = num_frames
|
|
88
|
+
self.all_frame_attrs = all_frame_attrs
|
|
63
89
|
|
|
64
90
|
self.video_framerate = self.container.streams.video[0].average_rate
|
|
65
91
|
self.video_time_base = self.container.streams.video[0].time_base
|
|
@@ -115,16 +141,17 @@ class FrameIterator(ComponentIterator):
|
|
|
115
141
|
'video': ts.VideoType(nullable=False),
|
|
116
142
|
'fps': ts.FloatType(nullable=True),
|
|
117
143
|
'num_frames': ts.IntType(nullable=True),
|
|
144
|
+
'all_frame_attrs': ts.BoolType(nullable=False),
|
|
118
145
|
}
|
|
119
146
|
|
|
120
147
|
@classmethod
|
|
121
148
|
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
'
|
|
125
|
-
|
|
126
|
-
'
|
|
127
|
-
}, ['frame']
|
|
149
|
+
attrs: dict[str, ts.ColumnType]
|
|
150
|
+
if kwargs.get('all_frame_attrs'):
|
|
151
|
+
attrs = {'frame_attrs': ts.JsonType()}
|
|
152
|
+
else:
|
|
153
|
+
attrs = {'frame_idx': ts.IntType(), 'pos_msec': ts.FloatType(), 'pos_frame': ts.IntType()}
|
|
154
|
+
return {**attrs, 'frame': ts.ImageType()}, ['frame']
|
|
128
155
|
|
|
129
156
|
def __next__(self) -> dict[str, Any]:
|
|
130
157
|
# Determine the frame index in the video corresponding to the iterator index `next_pos`;
|
|
@@ -164,8 +191,22 @@ class FrameIterator(ComponentIterator):
|
|
|
164
191
|
raise excs.Error(f'Frame {next_video_idx} is missing from the video (video file is corrupt)')
|
|
165
192
|
img = frame.to_image()
|
|
166
193
|
assert isinstance(img, PIL.Image.Image)
|
|
167
|
-
|
|
168
|
-
result
|
|
194
|
+
pts_msec = float(pts * self.video_time_base * 1000)
|
|
195
|
+
result: dict[str, Any] = {'frame': img}
|
|
196
|
+
if self.all_frame_attrs:
|
|
197
|
+
attrs = {
|
|
198
|
+
'index': video_idx,
|
|
199
|
+
'pts': frame.pts,
|
|
200
|
+
'dts': frame.dts,
|
|
201
|
+
'time': frame.time,
|
|
202
|
+
'is_corrupt': frame.is_corrupt,
|
|
203
|
+
'key_frame': frame.key_frame,
|
|
204
|
+
'pict_type': frame.pict_type,
|
|
205
|
+
'interlaced_frame': frame.interlaced_frame,
|
|
206
|
+
}
|
|
207
|
+
result['frame_attrs'] = attrs
|
|
208
|
+
else:
|
|
209
|
+
result.update({'frame_idx': self.next_pos, 'pos_msec': pts_msec, 'pos_frame': video_idx})
|
|
169
210
|
self.next_pos += 1
|
|
170
211
|
return result
|
|
171
212
|
|
|
@@ -184,3 +225,242 @@ class FrameIterator(ComponentIterator):
|
|
|
184
225
|
# then the iterator will step forward to the desired frame on the subsequent call to next().
|
|
185
226
|
self.container.seek(seek_pos, backward=True, stream=self.container.streams.video[0])
|
|
186
227
|
self.next_pos = pos
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
class VideoSplitter(ComponentIterator):
|
|
231
|
+
"""
|
|
232
|
+
Iterator over segments of a video file, which is split into fixed-size segments of length `segment_duration`
|
|
233
|
+
seconds.
|
|
234
|
+
|
|
235
|
+
Args:
|
|
236
|
+
duration: Video segment duration in seconds
|
|
237
|
+
overlap: Overlap between consecutive segments in seconds. Only available for `mode='fast'`.
|
|
238
|
+
min_segment_duration: Drop the last segment if it is smaller than min_segment_duration.
|
|
239
|
+
mode: Segmentation mode:
|
|
240
|
+
- `'fast'`: Quick segmentation using stream copy (splits only at keyframes, approximate durations)
|
|
241
|
+
- `'accurate'`: Precise segmentation with re-encoding (exact durations, slower)
|
|
242
|
+
video_encoder: Video encoder to use. If not specified, uses the default encoder for the current platform.
|
|
243
|
+
Only available for `mode='accurate'`.
|
|
244
|
+
video_encoder_args: Additional arguments to pass to the video encoder. Only available for `mode='accurate'`.
|
|
245
|
+
"""
|
|
246
|
+
|
|
247
|
+
# Input parameters
|
|
248
|
+
video_path: Path
|
|
249
|
+
segment_duration: float | None
|
|
250
|
+
segment_times: list[float] | None
|
|
251
|
+
overlap: float
|
|
252
|
+
min_segment_duration: float
|
|
253
|
+
video_encoder: str | None
|
|
254
|
+
video_encoder_args: dict[str, Any] | None
|
|
255
|
+
|
|
256
|
+
# Video metadata
|
|
257
|
+
video_duration: float
|
|
258
|
+
video_time_base: Fraction
|
|
259
|
+
video_start_time: int
|
|
260
|
+
|
|
261
|
+
output_iter: Iterator[dict[str, Any]]
|
|
262
|
+
|
|
263
|
+
def __init__(
|
|
264
|
+
self,
|
|
265
|
+
video: str,
|
|
266
|
+
*,
|
|
267
|
+
duration: float | None = None,
|
|
268
|
+
overlap: float | None = None,
|
|
269
|
+
min_segment_duration: float | None = None,
|
|
270
|
+
segment_times: list[float] | None = None,
|
|
271
|
+
mode: Literal['fast', 'accurate'] = 'accurate',
|
|
272
|
+
video_encoder: str | None = None,
|
|
273
|
+
video_encoder_args: dict[str, Any] | None = None,
|
|
274
|
+
):
|
|
275
|
+
Env.get().require_binary('ffmpeg')
|
|
276
|
+
assert (duration is not None) != (segment_times is not None)
|
|
277
|
+
if segment_times is not None:
|
|
278
|
+
assert len(segment_times) > 0
|
|
279
|
+
if duration is not None:
|
|
280
|
+
assert duration > 0.0
|
|
281
|
+
assert duration >= min_segment_duration
|
|
282
|
+
assert overlap is None or overlap < duration
|
|
283
|
+
|
|
284
|
+
video_path = Path(video)
|
|
285
|
+
assert video_path.exists() and video_path.is_file()
|
|
286
|
+
|
|
287
|
+
self.video_path = video_path
|
|
288
|
+
self.segment_duration = duration
|
|
289
|
+
self.overlap = overlap if overlap is not None else 0.0
|
|
290
|
+
self.min_segment_duration = min_segment_duration if min_segment_duration is not None else 0.0
|
|
291
|
+
self.segment_times = segment_times
|
|
292
|
+
self.video_encoder = video_encoder
|
|
293
|
+
self.video_encoder_args = video_encoder_args
|
|
294
|
+
|
|
295
|
+
with av.open(str(video_path)) as container:
|
|
296
|
+
video_stream = container.streams.video[0]
|
|
297
|
+
self.video_time_base = video_stream.time_base
|
|
298
|
+
self.video_start_time = video_stream.start_time or 0
|
|
299
|
+
|
|
300
|
+
self.output_iter = self.fast_iter() if mode == 'fast' else self.accurate_iter()
|
|
301
|
+
|
|
302
|
+
@classmethod
|
|
303
|
+
def input_schema(cls) -> dict[str, ts.ColumnType]:
|
|
304
|
+
return {
|
|
305
|
+
'video': ts.VideoType(nullable=False),
|
|
306
|
+
'duration': ts.FloatType(nullable=True),
|
|
307
|
+
'overlap': ts.FloatType(nullable=True),
|
|
308
|
+
'min_segment_duration': ts.FloatType(nullable=True),
|
|
309
|
+
'segment_times': ts.JsonType(nullable=True),
|
|
310
|
+
'mode': ts.StringType(nullable=False),
|
|
311
|
+
'video_encoder': ts.StringType(nullable=True),
|
|
312
|
+
'video_encoder_args': ts.JsonType(nullable=True),
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
@classmethod
|
|
316
|
+
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
317
|
+
param_names = ['duration', 'overlap', 'min_segment_duration', 'segment_times']
|
|
318
|
+
params = dict(zip(param_names, args))
|
|
319
|
+
params.update(kwargs)
|
|
320
|
+
|
|
321
|
+
segment_duration = params.get('duration')
|
|
322
|
+
segment_times = params.get('segment_times')
|
|
323
|
+
overlap = params.get('overlap')
|
|
324
|
+
min_segment_duration = params.get('min_segment_duration')
|
|
325
|
+
mode = params.get('mode', 'fast')
|
|
326
|
+
|
|
327
|
+
if segment_duration is None and segment_times is None:
|
|
328
|
+
raise excs.Error('Must specify either duration or segment_times')
|
|
329
|
+
if segment_duration is not None and segment_times is not None:
|
|
330
|
+
raise excs.Error('duration and segment_times cannot both be specified')
|
|
331
|
+
if segment_times is not None:
|
|
332
|
+
if len(segment_times) == 0:
|
|
333
|
+
raise excs.Error('segment_times cannot be empty')
|
|
334
|
+
if overlap is not None:
|
|
335
|
+
raise excs.Error('overlap cannot be specified with segment_times')
|
|
336
|
+
if segment_duration is not None:
|
|
337
|
+
if segment_duration <= 0.0:
|
|
338
|
+
raise excs.Error('duration must be a positive number')
|
|
339
|
+
if min_segment_duration is not None and segment_duration < min_segment_duration:
|
|
340
|
+
raise excs.Error('duration must be at least min_segment_duration')
|
|
341
|
+
if overlap is not None and overlap >= segment_duration:
|
|
342
|
+
raise excs.Error('overlap must be less than duration')
|
|
343
|
+
if mode == 'accurate' and overlap is not None:
|
|
344
|
+
raise excs.Error("Cannot specify overlap for mode='accurate'")
|
|
345
|
+
if mode == 'fast':
|
|
346
|
+
if params.get('video_encoder') is not None:
|
|
347
|
+
raise excs.Error("Cannot specify video_encoder for mode='fast'")
|
|
348
|
+
if params.get('video_encoder_args') is not None:
|
|
349
|
+
raise excs.Error("Cannot specify video_encoder_args for mode='fast'")
|
|
350
|
+
|
|
351
|
+
return {
|
|
352
|
+
'segment_start': ts.FloatType(nullable=False),
|
|
353
|
+
'segment_start_pts': ts.IntType(nullable=False),
|
|
354
|
+
'segment_end': ts.FloatType(nullable=False),
|
|
355
|
+
'segment_end_pts': ts.IntType(nullable=False),
|
|
356
|
+
'video_segment': ts.VideoType(nullable=False),
|
|
357
|
+
}, []
|
|
358
|
+
|
|
359
|
+
def fast_iter(self) -> Iterator[dict[str, Any]]:
|
|
360
|
+
segment_path: str = ''
|
|
361
|
+
try:
|
|
362
|
+
start_time = 0.0
|
|
363
|
+
start_pts = 0
|
|
364
|
+
segment_idx = 0
|
|
365
|
+
while True:
|
|
366
|
+
target_duration: float | None
|
|
367
|
+
if self.segment_duration is not None:
|
|
368
|
+
target_duration = self.segment_duration
|
|
369
|
+
elif self.segment_times is not None and segment_idx < len(self.segment_times):
|
|
370
|
+
target_duration = self.segment_times[segment_idx] - start_time
|
|
371
|
+
else:
|
|
372
|
+
target_duration = None # the rest of the video
|
|
373
|
+
|
|
374
|
+
segment_path = str(TempStore.create_path(extension='.mp4'))
|
|
375
|
+
cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time, target_duration)
|
|
376
|
+
_ = subprocess.run(cmd, capture_output=True, text=True, check=True)
|
|
377
|
+
|
|
378
|
+
# use the actual duration
|
|
379
|
+
segment_duration = av_utils.get_video_duration(segment_path)
|
|
380
|
+
if segment_duration - self.overlap == 0.0 or segment_duration < self.min_segment_duration:
|
|
381
|
+
# we're done
|
|
382
|
+
Path(segment_path).unlink()
|
|
383
|
+
return
|
|
384
|
+
|
|
385
|
+
segment_end = start_time + segment_duration
|
|
386
|
+
segment_end_pts = start_pts + round(segment_duration / self.video_time_base)
|
|
387
|
+
result = {
|
|
388
|
+
'segment_start': start_time,
|
|
389
|
+
'segment_start_pts': start_pts,
|
|
390
|
+
'segment_end': segment_end,
|
|
391
|
+
'segment_end_pts': segment_end_pts,
|
|
392
|
+
'video_segment': segment_path,
|
|
393
|
+
}
|
|
394
|
+
yield result
|
|
395
|
+
|
|
396
|
+
start_time = segment_end - self.overlap
|
|
397
|
+
start_pts = segment_end_pts - round(self.overlap / self.video_time_base)
|
|
398
|
+
|
|
399
|
+
segment_idx += 1
|
|
400
|
+
if self.segment_times is not None and segment_idx > len(self.segment_times):
|
|
401
|
+
# We've created all segments including the final segment after the last segment_time
|
|
402
|
+
break
|
|
403
|
+
|
|
404
|
+
except subprocess.CalledProcessError as e:
|
|
405
|
+
if segment_path and Path(segment_path).exists():
|
|
406
|
+
Path(segment_path).unlink()
|
|
407
|
+
error_msg = f'ffmpeg failed with return code {e.returncode}'
|
|
408
|
+
if e.stderr:
|
|
409
|
+
error_msg += f': {e.stderr.strip()}'
|
|
410
|
+
raise pxt.Error(error_msg) from e
|
|
411
|
+
|
|
412
|
+
def accurate_iter(self) -> Iterator[dict[str, Any]]:
|
|
413
|
+
base_path = TempStore.create_path(extension='')
|
|
414
|
+
# Use ffmpeg -f segment for accurate segmentation with re-encoding
|
|
415
|
+
output_pattern = f'{base_path}_segment_%04d.mp4'
|
|
416
|
+
cmd = av_utils.ffmpeg_segment_cmd(
|
|
417
|
+
str(self.video_path),
|
|
418
|
+
output_pattern,
|
|
419
|
+
segment_duration=self.segment_duration,
|
|
420
|
+
segment_times=self.segment_times,
|
|
421
|
+
video_encoder=self.video_encoder,
|
|
422
|
+
video_encoder_args=self.video_encoder_args,
|
|
423
|
+
)
|
|
424
|
+
|
|
425
|
+
try:
|
|
426
|
+
_ = subprocess.run(cmd, capture_output=True, text=True, check=True)
|
|
427
|
+
output_paths = sorted(glob.glob(f'{base_path}_segment_*.mp4'))
|
|
428
|
+
# TODO: is this actually an error?
|
|
429
|
+
# if len(output_paths) == 0:
|
|
430
|
+
# stderr_output = result.stderr.strip() if result.stderr is not None else ''
|
|
431
|
+
# raise pxt.Error(
|
|
432
|
+
# f'ffmpeg failed to create output files for commandline: {" ".join(cmd)}\n{stderr_output}'
|
|
433
|
+
# )
|
|
434
|
+
start_time = 0.0
|
|
435
|
+
start_pts = 0
|
|
436
|
+
for segment_path in output_paths:
|
|
437
|
+
segment_duration = av_utils.get_video_duration(segment_path)
|
|
438
|
+
if segment_duration < self.min_segment_duration:
|
|
439
|
+
Path(segment_path).unlink()
|
|
440
|
+
return
|
|
441
|
+
|
|
442
|
+
result = {
|
|
443
|
+
'segment_start': start_time,
|
|
444
|
+
'segment_start_pts': start_pts,
|
|
445
|
+
'segment_end': start_time + segment_duration,
|
|
446
|
+
'segment_end_pts': start_pts + round(segment_duration / self.video_time_base),
|
|
447
|
+
'video_segment': segment_path,
|
|
448
|
+
}
|
|
449
|
+
yield result
|
|
450
|
+
start_time += segment_duration
|
|
451
|
+
start_pts += round(segment_duration / self.video_time_base)
|
|
452
|
+
|
|
453
|
+
except subprocess.CalledProcessError as e:
|
|
454
|
+
error_msg = f'ffmpeg failed with return code {e.returncode}'
|
|
455
|
+
if e.stderr:
|
|
456
|
+
error_msg += f': {e.stderr.strip()}'
|
|
457
|
+
raise pxt.Error(error_msg) from e
|
|
458
|
+
|
|
459
|
+
def __next__(self) -> dict[str, Any]:
|
|
460
|
+
return next(self.output_iter)
|
|
461
|
+
|
|
462
|
+
def close(self) -> None:
|
|
463
|
+
pass
|
|
464
|
+
|
|
465
|
+
def set_pos(self, pos: int) -> None:
|
|
466
|
+
pass
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -18,13 +18,14 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
|
|
|
18
18
|
_logger = logging.getLogger('pixeltable')
|
|
19
19
|
|
|
20
20
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
21
|
-
VERSION =
|
|
21
|
+
VERSION = 41
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
def create_system_info(engine: sql.engine.Engine) -> None:
|
|
25
25
|
"""Create the system metadata record"""
|
|
26
26
|
system_md = SystemInfoMd(schema_version=VERSION)
|
|
27
27
|
record = SystemInfo(md=dataclasses.asdict(system_md))
|
|
28
|
+
_logger.debug(f'Creating pixeltable system info record {record}')
|
|
28
29
|
with orm.Session(engine, future=True) as session:
|
|
29
30
|
# Write system metadata only once for idempotency
|
|
30
31
|
if session.query(SystemInfo).count() == 0:
|
|
@@ -54,7 +55,8 @@ for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/convert
|
|
|
54
55
|
def upgrade_md(engine: sql.engine.Engine) -> None:
|
|
55
56
|
"""Upgrade the metadata schema to the current version"""
|
|
56
57
|
with orm.Session(engine) as session:
|
|
57
|
-
|
|
58
|
+
# Get exclusive lock on SystemInfo row
|
|
59
|
+
system_info = session.query(SystemInfo).with_for_update().one().md
|
|
58
60
|
md_version = system_info['schema_version']
|
|
59
61
|
assert isinstance(md_version, int)
|
|
60
62
|
_logger.info(f'Current database version: {md_version}, installed version: {VERSION}')
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Any
|
|
1
|
+
from typing import Any
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
|
|
|
11
11
|
convert_table_md(engine, substitution_fn=__substitute_md)
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
def __substitute_md(k:
|
|
14
|
+
def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
|
|
15
15
|
# Migrate a few changed function names
|
|
16
16
|
if k == 'path' and v == 'pixeltable.functions.string.str_format':
|
|
17
17
|
return 'path', 'pixeltable.functions.string.format'
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import datetime
|
|
2
|
-
from typing import Any
|
|
2
|
+
from typing import Any
|
|
3
3
|
|
|
4
4
|
import sqlalchemy as sql
|
|
5
5
|
|
|
@@ -28,7 +28,7 @@ def _(engine: sql.engine.Engine) -> None:
|
|
|
28
28
|
conn.execute(sql.text(f'ALTER TABLE {store_name} ALTER COLUMN col_{col_id} TYPE TIMESTAMPTZ'))
|
|
29
29
|
|
|
30
30
|
|
|
31
|
-
def __update_timestamp_literals(k: Any, v: Any) ->
|
|
31
|
+
def __update_timestamp_literals(k: Any, v: Any) -> tuple[Any, Any] | None:
|
|
32
32
|
if isinstance(v, dict) and 'val_t' in v:
|
|
33
33
|
# It's a literal with an explicit 'val_t' field. In version 19 this can only mean a
|
|
34
34
|
# timestamp literal, which (in version 19) is stored in the DB as a naive datetime.
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Any
|
|
1
|
+
from typing import Any
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
|
|
|
11
11
|
convert_table_md(engine, substitution_fn=__substitute_md)
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
def __substitute_md(k:
|
|
14
|
+
def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
|
|
15
15
|
if isinstance(v, dict) and '_classname' in v:
|
|
16
16
|
# The way InlineArray is represented changed in v20. Previously, literal values were stored
|
|
17
17
|
# directly in the Inline expr; now we store them in Literal sub-exprs. This converter
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Any
|
|
1
|
+
from typing import Any
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
@@ -24,7 +24,7 @@ def __update_schema_column(schema_column: dict) -> None:
|
|
|
24
24
|
schema_column['media_validation'] = None
|
|
25
25
|
|
|
26
26
|
|
|
27
|
-
def __substitute_md(k:
|
|
27
|
+
def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
|
|
28
28
|
if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ColumnRef':
|
|
29
29
|
if 'perform_validation' not in v:
|
|
30
30
|
v['perform_validation'] = False
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Any
|
|
1
|
+
from typing import Any
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
|
|
|
11
11
|
convert_table_md(engine, substitution_fn=__substitute_md)
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
def __substitute_md(k:
|
|
14
|
+
def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
|
|
15
15
|
if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'DataFrame':
|
|
16
16
|
v['from_clause'] = {'tbls': [v['tbl']], 'join_clauses': []}
|
|
17
17
|
return k, v
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Any
|
|
1
|
+
from typing import Any
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
|
|
|
11
11
|
convert_table_md(engine, substitution_fn=__substitute_md)
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
def __substitute_md(k:
|
|
14
|
+
def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
|
|
15
15
|
from pixeltable import func
|
|
16
16
|
from pixeltable.func.globals import resolve_symbol
|
|
17
17
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Any
|
|
1
|
+
from typing import Any
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
|
|
|
11
11
|
convert_table_md(engine, substitution_fn=__substitute_md)
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
def __substitute_md(k:
|
|
14
|
+
def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
|
|
15
15
|
if k == 'path' and (
|
|
16
16
|
v in ('pixeltable.functions.huggingface.clip_text', 'pixeltable.functions.huggingface.clip_image')
|
|
17
17
|
):
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Any
|
|
1
|
+
from typing import Any
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
|
|
|
11
11
|
convert_table_md(engine, substitution_fn=__substitute_md)
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
def __substitute_md(k:
|
|
14
|
+
def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
|
|
15
15
|
import pixeltable.type_system as ts
|
|
16
16
|
from pixeltable.exprs.literal import Literal
|
|
17
17
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Any
|
|
1
|
+
from typing import Any
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
@@ -12,7 +12,7 @@ def _(engine: sql.engine.Engine) -> None:
|
|
|
12
12
|
convert_table_md(engine, substitution_fn=__substitute_md)
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
def __substitute_md(k:
|
|
15
|
+
def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
|
|
16
16
|
# Defaults are now stored as literals in signatures
|
|
17
17
|
if k == 'parameters':
|
|
18
18
|
for param in v:
|
|
@@ -55,8 +55,8 @@ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], A
|
|
|
55
55
|
# We need to expand ("unroll") any var-args or var-kwargs.
|
|
56
56
|
|
|
57
57
|
new_args_len = len(new_args)
|
|
58
|
-
rolled_args:
|
|
59
|
-
rolled_kwargs:
|
|
58
|
+
rolled_args: dict | None = None
|
|
59
|
+
rolled_kwargs: dict | None = None
|
|
60
60
|
|
|
61
61
|
if 'signature' in v['fn']:
|
|
62
62
|
# If it's a pickled function, there's no signature, so we're out of luck; varargs in a pickled function
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Any
|
|
1
|
+
from typing import Any
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
|
|
|
11
11
|
convert_table_md(engine, substitution_fn=__substitute_md)
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
def __substitute_md(k:
|
|
14
|
+
def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
|
|
15
15
|
if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ColumnRef':
|
|
16
16
|
# Add reference_tbl to ColumnRef; for historical metadata it is always equal to tbl
|
|
17
17
|
assert 'reference_tbl' not in v
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Any
|
|
2
|
+
from typing import Any
|
|
3
3
|
from uuid import UUID
|
|
4
4
|
|
|
5
5
|
import sqlalchemy as sql
|
|
@@ -30,7 +30,7 @@ def __update_table_md(table_md: dict, table_id: UUID) -> None:
|
|
|
30
30
|
_logger.info(f'Updating view metadata for table: {table_id}')
|
|
31
31
|
|
|
32
32
|
|
|
33
|
-
def __substitute_md(k:
|
|
33
|
+
def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
|
|
34
34
|
if isinstance(v, dict) and (v.get('_classname') == 'DataFrame'):
|
|
35
35
|
if 'sample_clause' not in v:
|
|
36
36
|
v['sample_clause'] = None
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from uuid import UUID
|
|
2
|
+
|
|
3
|
+
import sqlalchemy as sql
|
|
4
|
+
|
|
5
|
+
from pixeltable.metadata import register_converter
|
|
6
|
+
from pixeltable.metadata.converters.util import convert_table_md
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@register_converter(version=37)
def _(engine: sql.engine.Engine) -> None:
    """Converter registered for metadata version 37.

    Hands the engine to ``convert_table_md`` and supplies
    ``__update_table_md`` as the per-table metadata updater.
    """
    table_md_updater = __update_table_md
    convert_table_md(engine, table_md_updater=table_md_updater)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def __update_table_md(table_md: dict, _: UUID) -> None:
    """Set the ``view_sn`` entry of ``table_md`` to 0, in place.

    The second positional argument (a UUID) is accepted but not used.
    """
    table_md.update(view_sn=0)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
import sqlalchemy as sql
|
|
4
|
+
|
|
5
|
+
from pixeltable.metadata import register_converter
|
|
6
|
+
from pixeltable.metadata.converters.util import convert_table_md
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@register_converter(version=38)
def _(engine: sql.engine.Engine) -> None:
    """Converter registered for metadata version 38.

    Hands the engine to ``convert_table_md`` and supplies
    ``__substitute_md`` as the key/value substitution function.
    """
    substitution_fn = __substitute_md
    convert_table_md(engine, substitution_fn=substitution_fn)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
|
|
15
|
+
if k == 'col_mapping':
|
|
16
|
+
assert isinstance(v, list)
|
|
17
|
+
return k, [__col_mapping_entry(e) for e in v]
|
|
18
|
+
if k == 'stored_proxies':
|
|
19
|
+
assert isinstance(v, list)
|
|
20
|
+
return k, [__stored_proxies_entry(e) for e in v]
|
|
21
|
+
return None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def __col_mapping_entry(e: list) -> list:
    """Convert one 'col_mapping' entry.

    ``e`` is a two-element list ``[dict, str]``. The dict is replaced by the
    result of ``__col_handle``; the string is carried over unchanged.
    """
    assert isinstance(e, list)
    col_md = e[0]
    assert isinstance(col_md, dict)
    mapped_name = e[1]
    assert isinstance(mapped_name, str)
    return [__col_handle(col_md), mapped_name]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def __stored_proxies_entry(e: list) -> list:
    """Convert one 'stored_proxies' entry.

    ``e`` is a two-element list ``[dict, dict]``. Each dict is replaced by
    the result of ``__col_handle``.
    """
    assert isinstance(e, list)
    first_col = e[0]
    assert isinstance(first_col, dict)
    second_col = e[1]
    assert isinstance(second_col, dict)
    return [__col_handle(first_col), __col_handle(second_col)]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def __col_handle(e: dict) -> dict:
    """Build a column-handle dict from a dict holding 'tbl_id' and 'col_id'.

    The result has a 'tbl_version' dict (``id`` taken from ``e['tbl_id']``,
    ``effective_version`` set to ``None``) and a 'col_id' copied from
    ``e['col_id']``.
    """
    tbl_version = {'id': e['tbl_id'], 'effective_version': None}
    return {'tbl_version': tbl_version, 'col_id': e['col_id']}
|