pixeltable 0.4.15__py3-none-any.whl → 0.4.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +4 -0
- pixeltable/catalog/catalog.py +125 -63
- pixeltable/catalog/column.py +7 -2
- pixeltable/catalog/table.py +1 -0
- pixeltable/catalog/table_metadata.py +4 -0
- pixeltable/catalog/table_version.py +174 -117
- pixeltable/catalog/table_version_handle.py +4 -1
- pixeltable/catalog/table_version_path.py +0 -11
- pixeltable/catalog/view.py +6 -0
- pixeltable/config.py +7 -0
- pixeltable/dataframe.py +10 -5
- pixeltable/env.py +56 -19
- pixeltable/exec/__init__.py +2 -0
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/exec_node.py +1 -1
- pixeltable/exec/expr_eval/evaluators.py +1 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +3 -0
- pixeltable/exec/expr_eval/globals.py +2 -0
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/object_store_save_node.py +1 -4
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +107 -14
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +23 -18
- pixeltable/exprs/column_property_ref.py +10 -10
- pixeltable/exprs/column_ref.py +2 -2
- pixeltable/exprs/data_row.py +106 -37
- pixeltable/exprs/expr.py +9 -0
- pixeltable/exprs/expr_set.py +14 -7
- pixeltable/exprs/inline_expr.py +2 -19
- pixeltable/exprs/json_path.py +45 -12
- pixeltable/exprs/row_builder.py +54 -22
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/bedrock.py +7 -0
- pixeltable/functions/deepseek.py +11 -4
- pixeltable/functions/llama_cpp.py +7 -0
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/ollama.py +7 -0
- pixeltable/functions/openai.py +4 -4
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/video.py +110 -28
- pixeltable/globals.py +10 -4
- pixeltable/io/globals.py +18 -17
- pixeltable/io/parquet.py +1 -1
- pixeltable/io/table_data_conduit.py +47 -22
- pixeltable/iterators/document.py +61 -23
- pixeltable/iterators/video.py +126 -53
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/plan.py +175 -46
- pixeltable/share/packager.py +155 -26
- pixeltable/store.py +2 -3
- pixeltable/type_system.py +5 -3
- pixeltable/utils/arrow.py +6 -6
- pixeltable/utils/av.py +65 -0
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/exception_handler.py +5 -28
- pixeltable/utils/image.py +7 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +16 -1
- pixeltable/utils/s3_store.py +44 -11
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/METADATA +29 -28
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/RECORD +68 -61
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/licenses/LICENSE +0 -0
pixeltable/iterators/document.py
CHANGED
|
@@ -1,13 +1,17 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import enum
|
|
3
|
+
import io
|
|
3
4
|
import logging
|
|
4
5
|
from typing import Any, ClassVar, Iterable, Iterator, Optional
|
|
5
6
|
|
|
7
|
+
import fitz # type: ignore[import-untyped]
|
|
6
8
|
import ftfy
|
|
9
|
+
import PIL.Image
|
|
10
|
+
from bs4.element import NavigableString, Tag
|
|
7
11
|
|
|
8
12
|
from pixeltable.env import Env
|
|
9
13
|
from pixeltable.exceptions import Error
|
|
10
|
-
from pixeltable.type_system import ColumnType, DocumentType, IntType, JsonType, StringType
|
|
14
|
+
from pixeltable.type_system import BoolType, ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
|
|
11
15
|
from pixeltable.utils.documents import get_document_handle
|
|
12
16
|
|
|
13
17
|
from .base import ComponentIterator
|
|
@@ -54,6 +58,7 @@ class DocumentSection:
|
|
|
54
58
|
|
|
55
59
|
text: Optional[str]
|
|
56
60
|
metadata: Optional[DocumentSectionMetadata]
|
|
61
|
+
image: Optional[PIL.Image.Image] = None
|
|
57
62
|
|
|
58
63
|
|
|
59
64
|
def _parse_separators(separators: str) -> list[Separator]:
|
|
@@ -95,6 +100,8 @@ class DocumentSplitter(ComponentIterator):
|
|
|
95
100
|
|
|
96
101
|
Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
|
|
97
102
|
|
|
103
|
+
How to init the `DocumentSplitter` class?
|
|
104
|
+
|
|
98
105
|
Args:
|
|
99
106
|
separators: separators to use to chunk the document. Options are:
|
|
100
107
|
`'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
|
|
@@ -125,13 +132,23 @@ class DocumentSplitter(ComponentIterator):
|
|
|
125
132
|
html_skip_tags: Optional[list[str]] = None,
|
|
126
133
|
tiktoken_encoding: Optional[str] = 'cl100k_base',
|
|
127
134
|
tiktoken_target_model: Optional[str] = None,
|
|
135
|
+
# (PDF-processing-only)
|
|
136
|
+
include_page_image: bool = False,
|
|
137
|
+
page_image_dpi: int = 300,
|
|
138
|
+
page_image_format: str = 'png',
|
|
128
139
|
):
|
|
129
140
|
if html_skip_tags is None:
|
|
130
141
|
html_skip_tags = ['nav']
|
|
131
142
|
self._doc_handle = get_document_handle(document)
|
|
132
143
|
assert self._doc_handle is not None
|
|
133
144
|
# calling the output_schema method to validate the input arguments
|
|
134
|
-
self.output_schema(
|
|
145
|
+
self.output_schema(
|
|
146
|
+
separators=separators,
|
|
147
|
+
metadata=metadata,
|
|
148
|
+
limit=limit,
|
|
149
|
+
overlap=overlap,
|
|
150
|
+
include_page_image=include_page_image,
|
|
151
|
+
)
|
|
135
152
|
self._separators = _parse_separators(separators)
|
|
136
153
|
self._metadata_fields = _parse_metadata(metadata)
|
|
137
154
|
if self._doc_handle.bs_doc is not None:
|
|
@@ -148,6 +165,10 @@ class DocumentSplitter(ComponentIterator):
|
|
|
148
165
|
self._tiktoken_encoding = tiktoken_encoding
|
|
149
166
|
self._tiktoken_target_model = tiktoken_target_model
|
|
150
167
|
|
|
168
|
+
self._include_page_image = include_page_image
|
|
169
|
+
self._page_image_dpi = page_image_dpi
|
|
170
|
+
self._page_image_format = page_image_format
|
|
171
|
+
|
|
151
172
|
# set up processing pipeline
|
|
152
173
|
if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
|
|
153
174
|
assert self._doc_handle.bs_doc is not None
|
|
@@ -182,6 +203,10 @@ class DocumentSplitter(ComponentIterator):
|
|
|
182
203
|
'skip_tags': StringType(nullable=True),
|
|
183
204
|
'tiktoken_encoding': StringType(nullable=True),
|
|
184
205
|
'tiktoken_target_model': StringType(nullable=True),
|
|
206
|
+
# PDF options must be declared so validation accepts them:
|
|
207
|
+
'include_page_image': BoolType(nullable=True),
|
|
208
|
+
'page_image_dpi': IntType(nullable=True),
|
|
209
|
+
'page_image_format': StringType(nullable=True),
|
|
185
210
|
}
|
|
186
211
|
|
|
187
212
|
@classmethod
|
|
@@ -211,6 +236,15 @@ class DocumentSplitter(ComponentIterator):
|
|
|
211
236
|
if kwargs.get('limit') is None:
|
|
212
237
|
raise Error('limit is required with "token_limit"/"char_limit" separators')
|
|
213
238
|
|
|
239
|
+
# check dependencies at the end
|
|
240
|
+
if Separator.SENTENCE in separators:
|
|
241
|
+
_ = Env.get().spacy_nlp
|
|
242
|
+
if Separator.TOKEN_LIMIT in separators:
|
|
243
|
+
Env.get().require_package('tiktoken')
|
|
244
|
+
|
|
245
|
+
if kwargs.get('include_page_image'):
|
|
246
|
+
schema['image'] = ImageType(nullable=True)
|
|
247
|
+
|
|
214
248
|
return schema, []
|
|
215
249
|
|
|
216
250
|
def __next__(self) -> dict[str, Any]:
|
|
@@ -230,6 +264,11 @@ class DocumentSplitter(ComponentIterator):
|
|
|
230
264
|
result[md_field.name.lower()] = section.metadata.page
|
|
231
265
|
elif md_field == ChunkMetadata.BOUNDING_BOX:
|
|
232
266
|
result[md_field.name.lower()] = section.metadata.bounding_box
|
|
267
|
+
|
|
268
|
+
# FIX: only include image if schema supports it
|
|
269
|
+
if self._include_page_image:
|
|
270
|
+
result['image'] = section.image
|
|
271
|
+
|
|
233
272
|
return result
|
|
234
273
|
|
|
235
274
|
def _html_sections(self) -> Iterator[DocumentSection]:
|
|
@@ -265,7 +304,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
265
304
|
yield DocumentSection(text=full_text, metadata=md)
|
|
266
305
|
accumulated_text = []
|
|
267
306
|
|
|
268
|
-
def process_element(el:
|
|
307
|
+
def process_element(el: Tag | NavigableString) -> Iterator[DocumentSection]:
|
|
269
308
|
# process the element and emit sections as necessary
|
|
270
309
|
nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
|
|
271
310
|
|
|
@@ -353,46 +392,45 @@ class DocumentSplitter(ComponentIterator):
|
|
|
353
392
|
yield from emit()
|
|
354
393
|
|
|
355
394
|
def _pdf_sections(self) -> Iterator[DocumentSection]:
|
|
356
|
-
"""Create DocumentSections reflecting the pdf-specific separators"""
|
|
357
|
-
import fitz # type: ignore[import-untyped]
|
|
358
|
-
|
|
359
395
|
doc: fitz.Document = self._doc_handle.pdf_doc
|
|
360
396
|
assert doc is not None
|
|
361
397
|
|
|
362
398
|
emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
|
|
363
399
|
emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph
|
|
364
400
|
|
|
365
|
-
accumulated_text = []
|
|
401
|
+
accumulated_text: list[str] = []
|
|
366
402
|
|
|
367
|
-
def
|
|
368
|
-
fixed = ftfy.fix_text(
|
|
403
|
+
def _add_cleaned(raw: str) -> None:
|
|
404
|
+
fixed = ftfy.fix_text(raw)
|
|
369
405
|
if fixed:
|
|
370
406
|
accumulated_text.append(fixed)
|
|
371
407
|
|
|
372
408
|
def _emit_text() -> str:
|
|
373
|
-
|
|
409
|
+
txt = ''.join(accumulated_text)
|
|
374
410
|
accumulated_text.clear()
|
|
375
|
-
return
|
|
411
|
+
return txt
|
|
412
|
+
|
|
413
|
+
for page_idx, page in enumerate(doc.pages()):
|
|
414
|
+
# render once per page if requested
|
|
415
|
+
page_image = None
|
|
416
|
+
if self._include_page_image:
|
|
417
|
+
pix = page.get_pixmap(dpi=self._page_image_dpi) # ← single render
|
|
418
|
+
page_image = PIL.Image.open(io.BytesIO(pix.tobytes(self._page_image_format)))
|
|
376
419
|
|
|
377
|
-
for page_number, page in enumerate(doc.pages()):
|
|
378
420
|
for block in page.get_text('blocks'):
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
# see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
|
|
382
|
-
# other libraries like pdfminer also lack an explicit paragraph concept
|
|
383
|
-
x1, y1, x2, y2, text, _, _ = block
|
|
384
|
-
_add_cleaned_text(text)
|
|
421
|
+
x1, y1, x2, y2, text, *_ = block
|
|
422
|
+
_add_cleaned(text)
|
|
385
423
|
if accumulated_text and emit_on_paragraph:
|
|
386
424
|
bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
|
|
387
|
-
|
|
388
|
-
yield DocumentSection(text=_emit_text(), metadata=
|
|
425
|
+
md = DocumentSectionMetadata(page=page_idx, bounding_box=bbox)
|
|
426
|
+
yield DocumentSection(text=_emit_text(), metadata=md, image=page_image)
|
|
389
427
|
|
|
390
428
|
if accumulated_text and emit_on_page and not emit_on_paragraph:
|
|
391
|
-
|
|
392
|
-
|
|
429
|
+
md = DocumentSectionMetadata(page=page_idx)
|
|
430
|
+
yield DocumentSection(text=_emit_text(), metadata=md, image=page_image)
|
|
393
431
|
|
|
394
432
|
if accumulated_text and not emit_on_page:
|
|
395
|
-
yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
|
|
433
|
+
yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(), image=None)
|
|
396
434
|
|
|
397
435
|
def _txt_sections(self) -> Iterator[DocumentSection]:
|
|
398
436
|
"""Create DocumentSections for text files.
|
pixeltable/iterators/video.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
+
import glob
|
|
1
2
|
import logging
|
|
2
3
|
import math
|
|
3
|
-
import shutil
|
|
4
4
|
import subprocess
|
|
5
5
|
from fractions import Fraction
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import Any, Optional
|
|
7
|
+
from typing import Any, Iterator, Literal, Optional
|
|
8
8
|
|
|
9
9
|
import av
|
|
10
10
|
import pandas as pd
|
|
@@ -14,6 +14,7 @@ import pixeltable as pxt
|
|
|
14
14
|
import pixeltable.exceptions as excs
|
|
15
15
|
import pixeltable.type_system as ts
|
|
16
16
|
import pixeltable.utils.av as av_utils
|
|
17
|
+
from pixeltable.env import Env
|
|
17
18
|
from pixeltable.utils.local_store import TempStore
|
|
18
19
|
|
|
19
20
|
from .base import ComponentIterator
|
|
@@ -237,9 +238,15 @@ class VideoSplitter(ComponentIterator):
|
|
|
237
238
|
seconds.
|
|
238
239
|
|
|
239
240
|
Args:
|
|
240
|
-
|
|
241
|
-
overlap: Overlap between consecutive segments in seconds.
|
|
242
|
-
min_segment_duration: Drop the last segment if it is smaller than min_segment_duration
|
|
241
|
+
duration: Video segment duration in seconds
|
|
242
|
+
overlap: Overlap between consecutive segments in seconds. Only available for `mode='fast'`.
|
|
243
|
+
min_segment_duration: Drop the last segment if it is smaller than min_segment_duration.
|
|
244
|
+
mode: Segmentation mode:
|
|
245
|
+
- `'fast'`: Quick segmentation using stream copy (splits only at keyframes, approximate durations)
|
|
246
|
+
- `'accurate'`: Precise segmentation with re-encoding (exact durations, slower)
|
|
247
|
+
video_encoder: Video encoder to use. If not specified, uses the default encoder for the current platform.
|
|
248
|
+
Only available for `mode='accurate'`.
|
|
249
|
+
video_encoder_args: Additional arguments to pass to the video encoder. Only available for `mode='accurate'`.
|
|
243
250
|
"""
|
|
244
251
|
|
|
245
252
|
# Input parameters
|
|
@@ -247,65 +254,85 @@ class VideoSplitter(ComponentIterator):
|
|
|
247
254
|
segment_duration: float
|
|
248
255
|
overlap: float
|
|
249
256
|
min_segment_duration: float
|
|
257
|
+
video_encoder: str | None
|
|
258
|
+
video_encoder_args: dict[str, Any] | None
|
|
250
259
|
|
|
251
260
|
# Video metadata
|
|
252
261
|
video_duration: float
|
|
253
262
|
video_time_base: Fraction
|
|
254
263
|
video_start_time: int
|
|
255
264
|
|
|
256
|
-
|
|
257
|
-
next_segment_start: float
|
|
258
|
-
next_segment_start_pts: int
|
|
265
|
+
output_iter: Iterator[dict[str, Any]]
|
|
259
266
|
|
|
260
|
-
def __init__(
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
267
|
+
def __init__(
|
|
268
|
+
self,
|
|
269
|
+
video: str,
|
|
270
|
+
*,
|
|
271
|
+
duration: float,
|
|
272
|
+
overlap: float = 0.0,
|
|
273
|
+
min_segment_duration: float = 0.0,
|
|
274
|
+
mode: Literal['fast', 'accurate'] = 'fast',
|
|
275
|
+
video_encoder: str | None = None,
|
|
276
|
+
video_encoder_args: dict[str, Any] | None = None,
|
|
277
|
+
):
|
|
278
|
+
Env.get().require_binary('ffmpeg')
|
|
279
|
+
assert duration > 0.0
|
|
280
|
+
assert duration >= min_segment_duration
|
|
281
|
+
assert overlap < duration
|
|
264
282
|
|
|
265
283
|
video_path = Path(video)
|
|
266
284
|
assert video_path.exists() and video_path.is_file()
|
|
267
285
|
|
|
268
|
-
if not shutil.which('ffmpeg'):
|
|
269
|
-
raise pxt.Error('ffmpeg is not installed or not in PATH. Please install ffmpeg to use VideoSplitter.')
|
|
270
|
-
|
|
271
286
|
self.video_path = video_path
|
|
272
|
-
self.segment_duration =
|
|
287
|
+
self.segment_duration = duration
|
|
273
288
|
self.overlap = overlap
|
|
274
289
|
self.min_segment_duration = min_segment_duration
|
|
290
|
+
self.video_encoder = video_encoder
|
|
291
|
+
self.video_encoder_args = video_encoder_args
|
|
275
292
|
|
|
276
293
|
with av.open(str(video_path)) as container:
|
|
277
294
|
video_stream = container.streams.video[0]
|
|
278
295
|
self.video_time_base = video_stream.time_base
|
|
279
296
|
self.video_start_time = video_stream.start_time or 0
|
|
280
297
|
|
|
281
|
-
self.
|
|
282
|
-
self.next_segment_start_pts = self.video_start_time
|
|
298
|
+
self.output_iter = self.fast_iter() if mode == 'fast' else self.accurate_iter()
|
|
283
299
|
|
|
284
300
|
@classmethod
|
|
285
301
|
def input_schema(cls) -> dict[str, ts.ColumnType]:
|
|
286
302
|
return {
|
|
287
303
|
'video': ts.VideoType(nullable=False),
|
|
288
|
-
'
|
|
304
|
+
'duration': ts.FloatType(nullable=True),
|
|
289
305
|
'overlap': ts.FloatType(nullable=True),
|
|
290
306
|
'min_segment_duration': ts.FloatType(nullable=True),
|
|
307
|
+
'mode': ts.StringType(nullable=False),
|
|
308
|
+
'video_encoder': ts.StringType(nullable=True),
|
|
309
|
+
'video_encoder_args': ts.JsonType(nullable=True),
|
|
291
310
|
}
|
|
292
311
|
|
|
293
312
|
@classmethod
|
|
294
313
|
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
295
|
-
param_names = ['
|
|
314
|
+
param_names = ['duration', 'overlap', 'min_segment_duration']
|
|
296
315
|
params = dict(zip(param_names, args))
|
|
297
316
|
params.update(kwargs)
|
|
298
317
|
|
|
299
|
-
segment_duration = params['
|
|
318
|
+
segment_duration = params['duration']
|
|
300
319
|
min_segment_duration = params.get('min_segment_duration', 0.0)
|
|
301
320
|
overlap = params.get('overlap', 0.0)
|
|
321
|
+
mode = params.get('mode', 'fast')
|
|
302
322
|
|
|
303
323
|
if segment_duration <= 0.0:
|
|
304
|
-
raise excs.Error('
|
|
324
|
+
raise excs.Error('duration must be a positive number')
|
|
305
325
|
if segment_duration < min_segment_duration:
|
|
306
|
-
raise excs.Error('
|
|
326
|
+
raise excs.Error('duration must be at least min_segment_duration')
|
|
327
|
+
if mode == 'accurate' and overlap > 0:
|
|
328
|
+
raise excs.Error("Cannot specify overlap for mode='accurate'")
|
|
307
329
|
if overlap >= segment_duration:
|
|
308
|
-
raise excs.Error('overlap must be less than
|
|
330
|
+
raise excs.Error('overlap must be less than duration')
|
|
331
|
+
if mode == 'fast':
|
|
332
|
+
if params.get('video_encoder') is not None:
|
|
333
|
+
raise excs.Error("Cannot specify video_encoder for mode='fast'")
|
|
334
|
+
if params.get('video_encoder_args') is not None:
|
|
335
|
+
raise excs.Error("Cannot specify video_encoder_args for mode='fast'")
|
|
309
336
|
|
|
310
337
|
return {
|
|
311
338
|
'segment_start': ts.FloatType(nullable=False),
|
|
@@ -315,48 +342,94 @@ class VideoSplitter(ComponentIterator):
|
|
|
315
342
|
'video_segment': ts.VideoType(nullable=False),
|
|
316
343
|
}, []
|
|
317
344
|
|
|
318
|
-
def
|
|
319
|
-
segment_path
|
|
345
|
+
def fast_iter(self) -> Iterator[dict[str, Any]]:
|
|
346
|
+
segment_path: str
|
|
320
347
|
try:
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
348
|
+
start_time = 0.0
|
|
349
|
+
start_pts = 0
|
|
350
|
+
while True:
|
|
351
|
+
segment_path = str(TempStore.create_path(extension='.mp4'))
|
|
352
|
+
cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time, self.segment_duration)
|
|
353
|
+
_ = subprocess.run(cmd, capture_output=True, text=True, check=True)
|
|
354
|
+
|
|
355
|
+
# use the actual duration
|
|
356
|
+
segment_duration = av_utils.get_video_duration(segment_path)
|
|
357
|
+
if segment_duration - self.overlap == 0.0 or segment_duration < self.min_segment_duration:
|
|
358
|
+
# we're done
|
|
359
|
+
Path(segment_path).unlink()
|
|
360
|
+
return
|
|
361
|
+
|
|
362
|
+
segment_end = start_time + segment_duration
|
|
363
|
+
segment_end_pts = start_pts + round(segment_duration / self.video_time_base)
|
|
364
|
+
result = {
|
|
365
|
+
'segment_start': start_time,
|
|
366
|
+
'segment_start_pts': start_pts,
|
|
367
|
+
'segment_end': segment_end,
|
|
368
|
+
'segment_end_pts': segment_end_pts,
|
|
369
|
+
'video_segment': segment_path,
|
|
370
|
+
}
|
|
371
|
+
yield result
|
|
325
372
|
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
if segment_duration - self.overlap == 0.0:
|
|
329
|
-
# we're done
|
|
330
|
-
Path(segment_path).unlink()
|
|
331
|
-
raise StopIteration
|
|
373
|
+
start_time = segment_end - self.overlap
|
|
374
|
+
start_pts = segment_end_pts - round(self.overlap / self.video_time_base)
|
|
332
375
|
|
|
333
|
-
|
|
376
|
+
except subprocess.CalledProcessError as e:
|
|
377
|
+
if Path(segment_path).exists():
|
|
334
378
|
Path(segment_path).unlink()
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
379
|
+
error_msg = f'ffmpeg failed with return code {e.returncode}'
|
|
380
|
+
if e.stderr:
|
|
381
|
+
error_msg += f': {e.stderr.strip()}'
|
|
382
|
+
raise pxt.Error(error_msg) from e
|
|
339
383
|
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
384
|
+
def accurate_iter(self) -> Iterator[dict[str, Any]]:
|
|
385
|
+
base_path = TempStore.create_path(extension='')
|
|
386
|
+
# Use ffmpeg -f segment for accurate segmentation with re-encoding
|
|
387
|
+
output_pattern = f'{base_path}_segment_%04d.mp4'
|
|
388
|
+
cmd = av_utils.ffmpeg_segment_cmd(
|
|
389
|
+
str(self.video_path),
|
|
390
|
+
output_pattern,
|
|
391
|
+
segment_duration=self.segment_duration,
|
|
392
|
+
video_encoder=self.video_encoder,
|
|
393
|
+
video_encoder_args=self.video_encoder_args,
|
|
394
|
+
)
|
|
349
395
|
|
|
350
|
-
|
|
396
|
+
try:
|
|
397
|
+
_ = subprocess.run(cmd, capture_output=True, text=True, check=True)
|
|
398
|
+
output_paths = sorted(glob.glob(f'{base_path}_segment_*.mp4'))
|
|
399
|
+
# TODO: is this actually an error?
|
|
400
|
+
# if len(output_paths) == 0:
|
|
401
|
+
# stderr_output = result.stderr.strip() if result.stderr is not None else ''
|
|
402
|
+
# raise pxt.Error(
|
|
403
|
+
# f'ffmpeg failed to create output files for commandline: {" ".join(cmd)}\n{stderr_output}'
|
|
404
|
+
# )
|
|
405
|
+
start_time = 0.0
|
|
406
|
+
start_pts = 0
|
|
407
|
+
for segment_path in output_paths:
|
|
408
|
+
segment_duration = av_utils.get_video_duration(segment_path)
|
|
409
|
+
if segment_duration < self.min_segment_duration:
|
|
410
|
+
Path(segment_path).unlink()
|
|
411
|
+
return
|
|
412
|
+
|
|
413
|
+
result = {
|
|
414
|
+
'segment_start': start_time,
|
|
415
|
+
'segment_start_pts': start_pts,
|
|
416
|
+
'segment_end': start_time + segment_duration,
|
|
417
|
+
'segment_end_pts': start_pts + round(segment_duration / self.video_time_base),
|
|
418
|
+
'video_segment': segment_path,
|
|
419
|
+
}
|
|
420
|
+
yield result
|
|
421
|
+
start_time += segment_duration
|
|
422
|
+
start_pts += round(segment_duration / self.video_time_base)
|
|
351
423
|
|
|
352
424
|
except subprocess.CalledProcessError as e:
|
|
353
|
-
if Path(segment_path).exists():
|
|
354
|
-
Path(segment_path).unlink()
|
|
355
425
|
error_msg = f'ffmpeg failed with return code {e.returncode}'
|
|
356
426
|
if e.stderr:
|
|
357
427
|
error_msg += f': {e.stderr.strip()}'
|
|
358
428
|
raise pxt.Error(error_msg) from e
|
|
359
429
|
|
|
430
|
+
def __next__(self) -> dict[str, Any]:
|
|
431
|
+
return next(self.output_iter)
|
|
432
|
+
|
|
360
433
|
def close(self) -> None:
|
|
361
434
|
pass
|
|
362
435
|
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -18,7 +18,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
|
|
|
18
18
|
_logger = logging.getLogger('pixeltable')
|
|
19
19
|
|
|
20
20
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
21
|
-
VERSION =
|
|
21
|
+
VERSION = 41
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
def create_system_info(engine: sql.engine.Engine) -> None:
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from uuid import UUID
|
|
3
|
+
|
|
4
|
+
import sqlalchemy as sql
|
|
5
|
+
|
|
6
|
+
from pixeltable.metadata import register_converter
|
|
7
|
+
from pixeltable.metadata.converters.util import convert_table_md
|
|
8
|
+
|
|
9
|
+
_logger = logging.getLogger('pixeltable')
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@register_converter(version=40)
|
|
13
|
+
def _(engine: sql.engine.Engine) -> None:
|
|
14
|
+
convert_table_md(engine, table_modifier=__table_modifier)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def __table_modifier(conn: sql.Connection, tbl_id: UUID, orig_table_md: dict, updated_table_md: dict) -> None:
|
|
18
|
+
store_prefix = 'view' if orig_table_md['view_md'] is not None else 'tbl'
|
|
19
|
+
store_name = f'{store_prefix}_{tbl_id.hex}'
|
|
20
|
+
|
|
21
|
+
# Get the list of column names that need _cellmd columns
|
|
22
|
+
_logger.info(f'Checking table {orig_table_md["name"]} ({store_name})')
|
|
23
|
+
col_ids = find_target_columns(orig_table_md)
|
|
24
|
+
if len(col_ids) == 0:
|
|
25
|
+
_logger.info(f'No Array or Json columns found in table {orig_table_md["name"]}. Skipping migration.')
|
|
26
|
+
return
|
|
27
|
+
|
|
28
|
+
# Check which columns already exist in the table
|
|
29
|
+
check_columns_sql = sql.text(f"""
|
|
30
|
+
SELECT column_name
|
|
31
|
+
FROM information_schema.columns
|
|
32
|
+
WHERE table_name = '{store_name}'
|
|
33
|
+
""")
|
|
34
|
+
existing_columns = {row[0] for row in conn.execute(check_columns_sql)}
|
|
35
|
+
|
|
36
|
+
# Filter out columns that already have _cellmd
|
|
37
|
+
col_ids_to_add: list[int] = []
|
|
38
|
+
for col_id in col_ids:
|
|
39
|
+
cellmd_col = f'col_{col_id}_cellmd'
|
|
40
|
+
if cellmd_col not in existing_columns:
|
|
41
|
+
col_ids_to_add.append(col_id)
|
|
42
|
+
else:
|
|
43
|
+
_logger.info(f'Column {cellmd_col} already exists in table {orig_table_md["name"]}. Skipping.')
|
|
44
|
+
|
|
45
|
+
if len(col_ids_to_add) == 0:
|
|
46
|
+
_logger.info(f'All _cellmd columns already exist in table {orig_table_md["name"]}. Skipping migration.')
|
|
47
|
+
return
|
|
48
|
+
|
|
49
|
+
return add_cellmd_columns(conn, store_name, col_ids_to_add)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def find_target_columns(table_md: dict) -> list[int]:
|
|
53
|
+
"""Returns ids of stored array and json columns"""
|
|
54
|
+
result: list[int] = []
|
|
55
|
+
for col_id, col_md in table_md['column_md'].items():
|
|
56
|
+
col_type = col_md['col_type']
|
|
57
|
+
classname = col_type.get('_classname')
|
|
58
|
+
if classname in ['ArrayType', 'JsonType'] and col_md.get('stored', False):
|
|
59
|
+
result.append(col_id)
|
|
60
|
+
_logger.info(f'Found {classname} column: {col_id}')
|
|
61
|
+
return result
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def add_cellmd_columns(conn: sql.Connection, store_name: str, col_ids: list[int]) -> None:
|
|
65
|
+
try:
|
|
66
|
+
# Add new columns
|
|
67
|
+
add_column_str = ', '.join(f'ADD COLUMN col_{col_id}_cellmd JSONB DEFAULT NULL' for col_id in col_ids)
|
|
68
|
+
add_column_sql = sql.text(f'ALTER TABLE {store_name} {add_column_str}')
|
|
69
|
+
conn.execute(add_column_sql)
|
|
70
|
+
_logger.info(f'Added columns to {store_name}: {", ".join(f"col_{col_id}_cellmd" for col_id in col_ids)}')
|
|
71
|
+
except sql.exc.SQLAlchemyError as e:
|
|
72
|
+
_logger.error(f'Migration for table {store_name} failed: {e}')
|
|
73
|
+
raise
|
pixeltable/metadata/notes.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
# rather than as a comment, so that the existence of a description can be enforced by
|
|
3
3
|
# the unit tests when new versions are added.
|
|
4
4
|
VERSION_NOTES = {
|
|
5
|
+
41: 'Cellmd columns for array and json columns',
|
|
5
6
|
40: 'Convert error property columns to cellmd columns',
|
|
6
7
|
39: 'ColumnHandles in external stores',
|
|
7
8
|
38: 'Added TableMd.view_sn',
|