pixeltable 0.4.17__py3-none-any.whl → 0.4.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/catalog/catalog.py +26 -19
- pixeltable/catalog/table.py +33 -14
- pixeltable/catalog/table_version.py +16 -12
- pixeltable/dataframe.py +1 -1
- pixeltable/env.py +4 -0
- pixeltable/exec/exec_context.py +15 -2
- pixeltable/exec/sql_node.py +3 -2
- pixeltable/functions/huggingface.py +1031 -2
- pixeltable/functions/video.py +34 -7
- pixeltable/globals.py +23 -4
- pixeltable/iterators/document.py +88 -57
- pixeltable/iterators/video.py +58 -24
- pixeltable/plan.py +2 -6
- pixeltable/store.py +24 -3
- pixeltable/utils/av.py +66 -38
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.18.dist-info}/METADATA +4 -4
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.18.dist-info}/RECORD +20 -20
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.18.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.18.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.18.dist-info}/licenses/LICENSE +0 -0
pixeltable/functions/video.py
CHANGED
|
@@ -306,7 +306,14 @@ def _handle_ffmpeg_error(e: subprocess.CalledProcessError) -> NoReturn:
|
|
|
306
306
|
|
|
307
307
|
@pxt.udf(is_method=True)
|
|
308
308
|
def clip(
|
|
309
|
-
video: pxt.Video,
|
|
309
|
+
video: pxt.Video,
|
|
310
|
+
*,
|
|
311
|
+
start_time: float,
|
|
312
|
+
end_time: float | None = None,
|
|
313
|
+
duration: float | None = None,
|
|
314
|
+
mode: Literal['fast', 'accurate'] = 'accurate',
|
|
315
|
+
video_encoder: str | None = None,
|
|
316
|
+
video_encoder_args: dict[str, Any] | None = None,
|
|
310
317
|
) -> pxt.Video | None:
|
|
311
318
|
"""
|
|
312
319
|
Extract a clip from a video, specified by `start_time` and either `end_time` or `duration` (in seconds).
|
|
@@ -323,6 +330,14 @@ def clip(
|
|
|
323
330
|
start_time: Start time in seconds
|
|
324
331
|
end_time: End time in seconds
|
|
325
332
|
duration: Duration of the clip in seconds
|
|
333
|
+
mode:
|
|
334
|
+
|
|
335
|
+
- `'fast'`: avoids re-encoding but starts the clip at the nearest keyframes and as a result, the clip
|
|
336
|
+
duration will be slightly longer than requested
|
|
337
|
+
- `'accurate'`: extracts a frame-accurate clip, but requires re-encoding
|
|
338
|
+
video_encoder: Video encoder to use. If not specified, uses the default encoder for the current platform.
|
|
339
|
+
Only available for `mode='accurate'`.
|
|
340
|
+
video_encoder_args: Additional arguments to pass to the video encoder. Only available for `mode='accurate'`.
|
|
326
341
|
|
|
327
342
|
Returns:
|
|
328
343
|
New video containing only the specified time range or None if start_time is beyond the end of the video.
|
|
@@ -336,6 +351,11 @@ def clip(
|
|
|
336
351
|
raise pxt.Error(f'duration must be positive, got {duration}')
|
|
337
352
|
if end_time is not None and duration is not None:
|
|
338
353
|
raise pxt.Error('end_time and duration cannot both be specified')
|
|
354
|
+
if mode == 'fast':
|
|
355
|
+
if video_encoder is not None:
|
|
356
|
+
raise pxt.Error("video_encoder is not supported for mode='fast'")
|
|
357
|
+
if video_encoder_args is not None:
|
|
358
|
+
raise pxt.Error("video_encoder_args is not supported for mode='fast'")
|
|
339
359
|
|
|
340
360
|
video_duration = av_utils.get_video_duration(video)
|
|
341
361
|
if video_duration is not None and start_time > video_duration:
|
|
@@ -345,7 +365,15 @@ def clip(
|
|
|
345
365
|
|
|
346
366
|
if end_time is not None:
|
|
347
367
|
duration = end_time - start_time
|
|
348
|
-
cmd = av_utils.ffmpeg_clip_cmd(
|
|
368
|
+
cmd = av_utils.ffmpeg_clip_cmd(
|
|
369
|
+
str(video),
|
|
370
|
+
output_path,
|
|
371
|
+
start_time,
|
|
372
|
+
duration,
|
|
373
|
+
fast=(mode == 'fast'),
|
|
374
|
+
video_encoder=video_encoder,
|
|
375
|
+
video_encoder_args=video_encoder_args,
|
|
376
|
+
)
|
|
349
377
|
|
|
350
378
|
try:
|
|
351
379
|
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
|
|
@@ -364,7 +392,7 @@ def segment_video(
|
|
|
364
392
|
*,
|
|
365
393
|
duration: float | None = None,
|
|
366
394
|
segment_times: list[float] | None = None,
|
|
367
|
-
mode: Literal['fast', 'accurate'] = '
|
|
395
|
+
mode: Literal['fast', 'accurate'] = 'accurate',
|
|
368
396
|
video_encoder: str | None = None,
|
|
369
397
|
video_encoder_args: dict[str, Any] | None = None,
|
|
370
398
|
) -> list[str]:
|
|
@@ -400,15 +428,14 @@ def segment_video(
|
|
|
400
428
|
Examples:
|
|
401
429
|
Split a video at 1 minute intervals using fast mode:
|
|
402
430
|
|
|
403
|
-
>>> tbl.select(segment_paths=tbl.video.segment_video(duration=60)).collect()
|
|
431
|
+
>>> tbl.select(segment_paths=tbl.video.segment_video(duration=60, mode='fast')).collect()
|
|
404
432
|
|
|
405
|
-
Split video into exact 10-second segments with accurate mode, using the libx264 encoder with a CRF of 23
|
|
406
|
-
slow preset (for smaller output files):
|
|
433
|
+
Split video into exact 10-second segments with default accurate mode, using the libx264 encoder with a CRF of 23
|
|
434
|
+
and slow preset (for smaller output files):
|
|
407
435
|
|
|
408
436
|
>>> tbl.select(
|
|
409
437
|
... segment_paths=tbl.video.segment_video(
|
|
410
438
|
... duration=10,
|
|
411
|
-
... mode='accurate',
|
|
412
439
|
... video_encoder='libx264',
|
|
413
440
|
... video_encoder_args={'crf': 23, 'preset': 'slow'}
|
|
414
441
|
... )
|
pixeltable/globals.py
CHANGED
|
@@ -487,12 +487,28 @@ def get_table(path: str, if_not_exists: Literal['error', 'ignore'] = 'error') ->
|
|
|
487
487
|
return tbl
|
|
488
488
|
|
|
489
489
|
|
|
490
|
-
def move(
|
|
490
|
+
def move(
|
|
491
|
+
path: str,
|
|
492
|
+
new_path: str,
|
|
493
|
+
*,
|
|
494
|
+
if_exists: Literal['error', 'ignore'] = 'error',
|
|
495
|
+
if_not_exists: Literal['error', 'ignore'] = 'error',
|
|
496
|
+
) -> None:
|
|
491
497
|
"""Move a schema object to a new directory and/or rename a schema object.
|
|
492
498
|
|
|
493
499
|
Args:
|
|
494
500
|
path: absolute path to the existing schema object.
|
|
495
501
|
new_path: absolute new path for the schema object.
|
|
502
|
+
if_exists: Directive for how to handle the case where a schema object already exists at the new path.
|
|
503
|
+
Must be one of the following:
|
|
504
|
+
|
|
505
|
+
- `'error'`: raise an error
|
|
506
|
+
- `'ignore'`: do nothing and return
|
|
507
|
+
if_not_exists: Directive for how to handle the case where the source path does not exist.
|
|
508
|
+
Must be one of the following:
|
|
509
|
+
|
|
510
|
+
- `'error'`: raise an error
|
|
511
|
+
- `'ignore'`: do nothing and return
|
|
496
512
|
|
|
497
513
|
Raises:
|
|
498
514
|
Error: If path does not exist (unless `if_not_exists='ignore'`) or new_path already exists (unless `if_exists='ignore'`).
|
|
@@ -506,13 +522,16 @@ def move(path: str, new_path: str) -> None:
|
|
|
506
522
|
|
|
507
523
|
>>> pxt.move('dir1.my_table', 'dir1.new_name')
|
|
508
524
|
"""
|
|
525
|
+
if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
|
|
526
|
+
if if_exists_ not in (catalog.IfExistsParam.ERROR, catalog.IfExistsParam.IGNORE):
|
|
527
|
+
raise excs.Error("`if_exists` must be one of 'error' or 'ignore'")
|
|
528
|
+
if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
|
|
509
529
|
if path == new_path:
|
|
510
530
|
raise excs.Error('move(): source and destination cannot be identical')
|
|
511
531
|
path_obj, new_path_obj = catalog.Path.parse(path), catalog.Path.parse(new_path)
|
|
512
532
|
if path_obj.is_ancestor(new_path_obj):
|
|
513
533
|
raise excs.Error(f'move(): cannot move {path!r} into its own subdirectory')
|
|
514
|
-
|
|
515
|
-
cat.move(path_obj, new_path_obj)
|
|
534
|
+
Catalog.get().move(path_obj, new_path_obj, if_exists_, if_not_exists_)
|
|
516
535
|
|
|
517
536
|
|
|
518
537
|
def drop_table(
|
|
@@ -660,7 +679,7 @@ def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths:
|
|
|
660
679
|
|
|
661
680
|
|
|
662
681
|
def create_dir(
|
|
663
|
-
path: str, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error', parents: bool = False
|
|
682
|
+
path: str, *, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error', parents: bool = False
|
|
664
683
|
) -> Optional[catalog.Dir]:
|
|
665
684
|
"""Create a directory.
|
|
666
685
|
|
pixeltable/iterators/document.py
CHANGED
|
@@ -2,7 +2,7 @@ import dataclasses
|
|
|
2
2
|
import enum
|
|
3
3
|
import io
|
|
4
4
|
import logging
|
|
5
|
-
from typing import Any, ClassVar, Iterable, Iterator,
|
|
5
|
+
from typing import Any, ClassVar, Iterable, Iterator, Literal
|
|
6
6
|
|
|
7
7
|
import fitz # type: ignore[import-untyped]
|
|
8
8
|
import ftfy
|
|
@@ -11,7 +11,7 @@ from bs4.element import NavigableString, Tag
|
|
|
11
11
|
|
|
12
12
|
from pixeltable.env import Env
|
|
13
13
|
from pixeltable.exceptions import Error
|
|
14
|
-
from pixeltable.type_system import
|
|
14
|
+
from pixeltable.type_system import ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
|
|
15
15
|
from pixeltable.utils.documents import get_document_handle
|
|
16
16
|
|
|
17
17
|
from .base import ComponentIterator
|
|
@@ -19,6 +19,11 @@ from .base import ComponentIterator
|
|
|
19
19
|
_logger = logging.getLogger('pixeltable')
|
|
20
20
|
|
|
21
21
|
|
|
22
|
+
class Element(enum.Enum):
|
|
23
|
+
TEXT = 1
|
|
24
|
+
IMAGE = 2
|
|
25
|
+
|
|
26
|
+
|
|
22
27
|
class ChunkMetadata(enum.Enum):
|
|
23
28
|
TITLE = 1
|
|
24
29
|
HEADING = 2
|
|
@@ -41,28 +46,28 @@ class DocumentSectionMetadata:
|
|
|
41
46
|
"""Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
|
|
42
47
|
|
|
43
48
|
# html and markdown metadata
|
|
44
|
-
sourceline:
|
|
49
|
+
sourceline: int | None = None
|
|
45
50
|
# the stack of headings up to the most recently observed one;
|
|
46
51
|
# e.g., if the most recent one was an h2, 'heading' would contain keys 1 and 2, but nothing below that
|
|
47
|
-
heading:
|
|
52
|
+
heading: dict[str, str] | None = None
|
|
48
53
|
|
|
49
54
|
# pdf-specific metadata
|
|
50
|
-
page:
|
|
55
|
+
page: int | None = None
|
|
51
56
|
# bounding box as an {x1, y1, x2, y2} dictionary
|
|
52
|
-
bounding_box:
|
|
57
|
+
bounding_box: dict[str, float] | None = None
|
|
53
58
|
|
|
54
59
|
|
|
55
60
|
@dataclasses.dataclass
|
|
56
61
|
class DocumentSection:
|
|
57
62
|
"""A single document chunk, according to some of the splitting criteria"""
|
|
58
63
|
|
|
59
|
-
text:
|
|
60
|
-
|
|
61
|
-
|
|
64
|
+
text: str | None = None
|
|
65
|
+
image: PIL.Image.Image | None = None
|
|
66
|
+
metadata: DocumentSectionMetadata | None = None
|
|
62
67
|
|
|
63
68
|
|
|
64
69
|
def _parse_separators(separators: str) -> list[Separator]:
|
|
65
|
-
ret = []
|
|
70
|
+
ret: list[Separator] = []
|
|
66
71
|
for s in separators.split(','):
|
|
67
72
|
clean_s = s.strip().upper()
|
|
68
73
|
if not clean_s:
|
|
@@ -76,7 +81,7 @@ def _parse_separators(separators: str) -> list[Separator]:
|
|
|
76
81
|
|
|
77
82
|
|
|
78
83
|
def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
|
|
79
|
-
ret = []
|
|
84
|
+
ret: list[ChunkMetadata] = []
|
|
80
85
|
for m in metadata.split(','):
|
|
81
86
|
clean_m = m.strip().upper()
|
|
82
87
|
if not clean_m:
|
|
@@ -89,6 +94,18 @@ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
|
|
|
89
94
|
return ret
|
|
90
95
|
|
|
91
96
|
|
|
97
|
+
def _parse_elements(elements: list[Literal['text', 'image']]) -> list[Element]:
|
|
98
|
+
result: list[Element] = []
|
|
99
|
+
for e in elements:
|
|
100
|
+
clean_e = e.strip().upper()
|
|
101
|
+
if clean_e not in Element.__members__:
|
|
102
|
+
raise Error(f'Invalid element: `{e}`. Valid elements are: {", ".join(Element.__members__).lower()}')
|
|
103
|
+
result.append(Element[clean_e])
|
|
104
|
+
if len(result) == 0:
|
|
105
|
+
raise Error('elements cannot be empty')
|
|
106
|
+
return result
|
|
107
|
+
|
|
108
|
+
|
|
92
109
|
_HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
|
|
93
110
|
|
|
94
111
|
|
|
@@ -106,11 +123,16 @@ class DocumentSplitter(ComponentIterator):
|
|
|
106
123
|
separators: separators to use to chunk the document. Options are:
|
|
107
124
|
`'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
|
|
108
125
|
This may be a comma-separated string, e.g., `'heading,token_limit'`.
|
|
126
|
+
elements: list of elements to extract from the document. Options are:
|
|
127
|
+
`'text'`, `'image'`. Defaults to `['text']` if not specified. The `'image'` element is only supported
|
|
128
|
+
for the `'page'` separator on PDF documents.
|
|
109
129
|
limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
|
|
110
130
|
or `'char_limit'` is specified.
|
|
111
131
|
metadata: additional metadata fields to include in the output. Options are:
|
|
112
132
|
`'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
|
|
113
133
|
(PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
|
|
134
|
+
image_dpi: DPI to use when extracting images from PDFs. Defaults to 300.
|
|
135
|
+
image_format: format to use when extracting images from PDFs. Defaults to 'png'.
|
|
114
136
|
"""
|
|
115
137
|
|
|
116
138
|
METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
|
|
@@ -121,34 +143,41 @@ class DocumentSplitter(ComponentIterator):
|
|
|
121
143
|
ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
|
|
122
144
|
}
|
|
123
145
|
|
|
146
|
+
_doc_handle: Any
|
|
147
|
+
_separators: list[Separator]
|
|
148
|
+
_elements: list[Element]
|
|
149
|
+
_metadata_fields: list[ChunkMetadata]
|
|
150
|
+
_doc_title: str
|
|
151
|
+
_limit: int
|
|
152
|
+
_skip_tags: list[str]
|
|
153
|
+
_overlap: int
|
|
154
|
+
_tiktoken_encoding: str | None
|
|
155
|
+
_tiktoken_target_model: str | None
|
|
156
|
+
_image_dpi: int
|
|
157
|
+
_image_format: str
|
|
158
|
+
|
|
159
|
+
_sections: Iterator[DocumentSection]
|
|
160
|
+
|
|
124
161
|
def __init__(
|
|
125
162
|
self,
|
|
126
163
|
document: str,
|
|
127
164
|
*,
|
|
128
165
|
separators: str,
|
|
129
|
-
|
|
130
|
-
|
|
166
|
+
elements: list[Literal['text', 'image']] | None = None,
|
|
167
|
+
limit: int | None = None,
|
|
168
|
+
overlap: int | None = None,
|
|
131
169
|
metadata: str = '',
|
|
132
|
-
html_skip_tags:
|
|
133
|
-
tiktoken_encoding:
|
|
134
|
-
tiktoken_target_model:
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
page_image_dpi: int = 300,
|
|
138
|
-
page_image_format: str = 'png',
|
|
170
|
+
html_skip_tags: list[str] | None = None,
|
|
171
|
+
tiktoken_encoding: str | None = 'cl100k_base',
|
|
172
|
+
tiktoken_target_model: str | None = None,
|
|
173
|
+
image_dpi: int = 300,
|
|
174
|
+
image_format: str = 'png',
|
|
139
175
|
):
|
|
140
176
|
if html_skip_tags is None:
|
|
141
177
|
html_skip_tags = ['nav']
|
|
142
178
|
self._doc_handle = get_document_handle(document)
|
|
179
|
+
self._elements = _parse_elements(elements.copy()) if elements is not None else [Element.TEXT]
|
|
143
180
|
assert self._doc_handle is not None
|
|
144
|
-
# calling the output_schema method to validate the input arguments
|
|
145
|
-
self.output_schema(
|
|
146
|
-
separators=separators,
|
|
147
|
-
metadata=metadata,
|
|
148
|
-
limit=limit,
|
|
149
|
-
overlap=overlap,
|
|
150
|
-
include_page_image=include_page_image,
|
|
151
|
-
)
|
|
152
181
|
self._separators = _parse_separators(separators)
|
|
153
182
|
self._metadata_fields = _parse_metadata(metadata)
|
|
154
183
|
if self._doc_handle.bs_doc is not None:
|
|
@@ -164,10 +193,8 @@ class DocumentSplitter(ComponentIterator):
|
|
|
164
193
|
self._overlap = 0 if overlap is None else overlap
|
|
165
194
|
self._tiktoken_encoding = tiktoken_encoding
|
|
166
195
|
self._tiktoken_target_model = tiktoken_target_model
|
|
167
|
-
|
|
168
|
-
self.
|
|
169
|
-
self._page_image_dpi = page_image_dpi
|
|
170
|
-
self._page_image_format = page_image_format
|
|
196
|
+
self._image_dpi = image_dpi
|
|
197
|
+
self._image_format = image_format
|
|
171
198
|
|
|
172
199
|
# set up processing pipeline
|
|
173
200
|
if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
|
|
@@ -197,23 +224,28 @@ class DocumentSplitter(ComponentIterator):
|
|
|
197
224
|
return {
|
|
198
225
|
'document': DocumentType(nullable=False),
|
|
199
226
|
'separators': StringType(nullable=False),
|
|
227
|
+
'elements': JsonType(nullable=False),
|
|
200
228
|
'metadata': StringType(nullable=False),
|
|
201
229
|
'limit': IntType(nullable=True),
|
|
202
230
|
'overlap': IntType(nullable=True),
|
|
203
231
|
'skip_tags': StringType(nullable=True),
|
|
204
232
|
'tiktoken_encoding': StringType(nullable=True),
|
|
205
233
|
'tiktoken_target_model': StringType(nullable=True),
|
|
206
|
-
|
|
207
|
-
'
|
|
208
|
-
'page_image_dpi': IntType(nullable=True),
|
|
209
|
-
'page_image_format': StringType(nullable=True),
|
|
234
|
+
'image_dpi': IntType(nullable=True),
|
|
235
|
+
'image_format': StringType(nullable=True),
|
|
210
236
|
}
|
|
211
237
|
|
|
212
238
|
@classmethod
|
|
213
239
|
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
|
|
214
|
-
schema: dict[str, ColumnType] = {
|
|
215
|
-
|
|
216
|
-
|
|
240
|
+
schema: dict[str, ColumnType] = {}
|
|
241
|
+
elements = _parse_elements(kwargs.get('elements', ['text']))
|
|
242
|
+
for element in elements:
|
|
243
|
+
if element == Element.TEXT:
|
|
244
|
+
schema['text'] = StringType(nullable=False)
|
|
245
|
+
elif element == Element.IMAGE:
|
|
246
|
+
schema['image'] = ImageType(nullable=False)
|
|
247
|
+
|
|
248
|
+
md_fields = _parse_metadata(kwargs.get('metadata', ''))
|
|
217
249
|
for md_field in md_fields:
|
|
218
250
|
schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
|
|
219
251
|
|
|
@@ -223,6 +255,8 @@ class DocumentSplitter(ComponentIterator):
|
|
|
223
255
|
limit = kwargs.get('limit')
|
|
224
256
|
overlap = kwargs.get('overlap')
|
|
225
257
|
|
|
258
|
+
if Element.IMAGE in elements and separators != [Separator.PAGE]:
|
|
259
|
+
raise Error('Image elements are only supported for the "page" separator on PDF documents')
|
|
226
260
|
if limit is not None or overlap is not None:
|
|
227
261
|
if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
|
|
228
262
|
raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
|
|
@@ -236,23 +270,25 @@ class DocumentSplitter(ComponentIterator):
|
|
|
236
270
|
if kwargs.get('limit') is None:
|
|
237
271
|
raise Error('limit is required with "token_limit"/"char_limit" separators')
|
|
238
272
|
|
|
239
|
-
# check dependencies at the end
|
|
240
273
|
if Separator.SENTENCE in separators:
|
|
241
274
|
_ = Env.get().spacy_nlp
|
|
242
275
|
if Separator.TOKEN_LIMIT in separators:
|
|
243
276
|
Env.get().require_package('tiktoken')
|
|
244
277
|
|
|
245
|
-
if kwargs.get('include_page_image'):
|
|
246
|
-
schema['image'] = ImageType(nullable=True)
|
|
247
|
-
|
|
248
278
|
return schema, []
|
|
249
279
|
|
|
250
280
|
def __next__(self) -> dict[str, Any]:
|
|
251
281
|
while True:
|
|
252
282
|
section = next(self._sections)
|
|
253
|
-
if section.text is None:
|
|
283
|
+
if section.text is None and section.image is None:
|
|
254
284
|
continue
|
|
255
|
-
result: dict[str, Any] = {
|
|
285
|
+
result: dict[str, Any] = {}
|
|
286
|
+
for element in self._elements:
|
|
287
|
+
if element == Element.TEXT:
|
|
288
|
+
result['text'] = section.text
|
|
289
|
+
elif element == Element.IMAGE:
|
|
290
|
+
result['image'] = section.image
|
|
291
|
+
|
|
256
292
|
for md_field in self._metadata_fields:
|
|
257
293
|
if md_field == ChunkMetadata.TITLE:
|
|
258
294
|
result[md_field.name.lower()] = self._doc_title
|
|
@@ -265,10 +301,6 @@ class DocumentSplitter(ComponentIterator):
|
|
|
265
301
|
elif md_field == ChunkMetadata.BOUNDING_BOX:
|
|
266
302
|
result[md_field.name.lower()] = section.metadata.bounding_box
|
|
267
303
|
|
|
268
|
-
# FIX: only include image if schema supports it
|
|
269
|
-
if self._include_page_image:
|
|
270
|
-
result['image'] = section.image
|
|
271
|
-
|
|
272
304
|
return result
|
|
273
305
|
|
|
274
306
|
def _html_sections(self) -> Iterator[DocumentSection]:
|
|
@@ -411,11 +443,10 @@ class DocumentSplitter(ComponentIterator):
|
|
|
411
443
|
return txt
|
|
412
444
|
|
|
413
445
|
for page_idx, page in enumerate(doc.pages()):
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
page_image = PIL.Image.open(io.BytesIO(pix.tobytes(self._page_image_format)))
|
|
446
|
+
img: PIL.Image.Image | None = None
|
|
447
|
+
if Element.IMAGE in self._elements:
|
|
448
|
+
pix = page.get_pixmap(dpi=self._image_dpi)
|
|
449
|
+
img = PIL.Image.open(io.BytesIO(pix.tobytes(self._image_format)))
|
|
419
450
|
|
|
420
451
|
for block in page.get_text('blocks'):
|
|
421
452
|
x1, y1, x2, y2, text, *_ = block
|
|
@@ -423,14 +454,14 @@ class DocumentSplitter(ComponentIterator):
|
|
|
423
454
|
if accumulated_text and emit_on_paragraph:
|
|
424
455
|
bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
|
|
425
456
|
md = DocumentSectionMetadata(page=page_idx, bounding_box=bbox)
|
|
426
|
-
yield DocumentSection(text=_emit_text(), metadata=md
|
|
457
|
+
yield DocumentSection(text=_emit_text(), metadata=md)
|
|
427
458
|
|
|
428
459
|
if accumulated_text and emit_on_page and not emit_on_paragraph:
|
|
429
460
|
md = DocumentSectionMetadata(page=page_idx)
|
|
430
|
-
yield DocumentSection(text=_emit_text(),
|
|
461
|
+
yield DocumentSection(text=_emit_text(), image=img, metadata=md)
|
|
431
462
|
|
|
432
463
|
if accumulated_text and not emit_on_page:
|
|
433
|
-
yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata()
|
|
464
|
+
yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
|
|
434
465
|
|
|
435
466
|
def _txt_sections(self) -> Iterator[DocumentSection]:
|
|
436
467
|
"""Create DocumentSections for text files.
|
pixeltable/iterators/video.py
CHANGED
|
@@ -251,7 +251,8 @@ class VideoSplitter(ComponentIterator):
|
|
|
251
251
|
|
|
252
252
|
# Input parameters
|
|
253
253
|
video_path: Path
|
|
254
|
-
segment_duration: float
|
|
254
|
+
segment_duration: float | None
|
|
255
|
+
segment_times: list[float] | None
|
|
255
256
|
overlap: float
|
|
256
257
|
min_segment_duration: float
|
|
257
258
|
video_encoder: str | None
|
|
@@ -268,25 +269,31 @@ class VideoSplitter(ComponentIterator):
|
|
|
268
269
|
self,
|
|
269
270
|
video: str,
|
|
270
271
|
*,
|
|
271
|
-
duration: float,
|
|
272
|
-
overlap: float =
|
|
273
|
-
min_segment_duration: float =
|
|
274
|
-
|
|
272
|
+
duration: float | None = None,
|
|
273
|
+
overlap: float | None = None,
|
|
274
|
+
min_segment_duration: float | None = None,
|
|
275
|
+
segment_times: list[float] | None = None,
|
|
276
|
+
mode: Literal['fast', 'accurate'] = 'accurate',
|
|
275
277
|
video_encoder: str | None = None,
|
|
276
278
|
video_encoder_args: dict[str, Any] | None = None,
|
|
277
279
|
):
|
|
278
280
|
Env.get().require_binary('ffmpeg')
|
|
279
|
-
assert duration
|
|
280
|
-
|
|
281
|
-
|
|
281
|
+
assert (duration is not None) != (segment_times is not None)
|
|
282
|
+
if segment_times is not None:
|
|
283
|
+
assert len(segment_times) > 0
|
|
284
|
+
if duration is not None:
|
|
285
|
+
assert duration > 0.0
|
|
286
|
+
assert duration >= min_segment_duration
|
|
287
|
+
assert overlap is None or overlap < duration
|
|
282
288
|
|
|
283
289
|
video_path = Path(video)
|
|
284
290
|
assert video_path.exists() and video_path.is_file()
|
|
285
291
|
|
|
286
292
|
self.video_path = video_path
|
|
287
293
|
self.segment_duration = duration
|
|
288
|
-
self.overlap = overlap
|
|
289
|
-
self.min_segment_duration = min_segment_duration
|
|
294
|
+
self.overlap = overlap if overlap is not None else 0.0
|
|
295
|
+
self.min_segment_duration = min_segment_duration if min_segment_duration is not None else 0.0
|
|
296
|
+
self.segment_times = segment_times
|
|
290
297
|
self.video_encoder = video_encoder
|
|
291
298
|
self.video_encoder_args = video_encoder_args
|
|
292
299
|
|
|
@@ -304,6 +311,7 @@ class VideoSplitter(ComponentIterator):
|
|
|
304
311
|
'duration': ts.FloatType(nullable=True),
|
|
305
312
|
'overlap': ts.FloatType(nullable=True),
|
|
306
313
|
'min_segment_duration': ts.FloatType(nullable=True),
|
|
314
|
+
'segment_times': ts.JsonType(nullable=True),
|
|
307
315
|
'mode': ts.StringType(nullable=False),
|
|
308
316
|
'video_encoder': ts.StringType(nullable=True),
|
|
309
317
|
'video_encoder_args': ts.JsonType(nullable=True),
|
|
@@ -311,23 +319,34 @@ class VideoSplitter(ComponentIterator):
|
|
|
311
319
|
|
|
312
320
|
@classmethod
|
|
313
321
|
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
314
|
-
param_names = ['duration', 'overlap', 'min_segment_duration']
|
|
322
|
+
param_names = ['duration', 'overlap', 'min_segment_duration', 'segment_times']
|
|
315
323
|
params = dict(zip(param_names, args))
|
|
316
324
|
params.update(kwargs)
|
|
317
325
|
|
|
318
|
-
segment_duration = params
|
|
319
|
-
|
|
320
|
-
overlap = params.get('overlap'
|
|
326
|
+
segment_duration = params.get('duration')
|
|
327
|
+
segment_times = params.get('segment_times')
|
|
328
|
+
overlap = params.get('overlap')
|
|
329
|
+
min_segment_duration = params.get('min_segment_duration')
|
|
321
330
|
mode = params.get('mode', 'fast')
|
|
322
331
|
|
|
323
|
-
if segment_duration
|
|
324
|
-
raise excs.Error('
|
|
325
|
-
if segment_duration
|
|
326
|
-
raise excs.Error('duration
|
|
327
|
-
if
|
|
332
|
+
if segment_duration is None and segment_times is None:
|
|
333
|
+
raise excs.Error('Must specify either duration or segment_times')
|
|
334
|
+
if segment_duration is not None and segment_times is not None:
|
|
335
|
+
raise excs.Error('duration and segment_times cannot both be specified')
|
|
336
|
+
if segment_times is not None:
|
|
337
|
+
if len(segment_times) == 0:
|
|
338
|
+
raise excs.Error('segment_times cannot be empty')
|
|
339
|
+
if overlap is not None:
|
|
340
|
+
raise excs.Error('overlap cannot be specified with segment_times')
|
|
341
|
+
if segment_duration is not None:
|
|
342
|
+
if segment_duration <= 0.0:
|
|
343
|
+
raise excs.Error('duration must be a positive number')
|
|
344
|
+
if min_segment_duration is not None and segment_duration < min_segment_duration:
|
|
345
|
+
raise excs.Error('duration must be at least min_segment_duration')
|
|
346
|
+
if overlap is not None and overlap >= segment_duration:
|
|
347
|
+
raise excs.Error('overlap must be less than duration')
|
|
348
|
+
if mode == 'accurate' and overlap is not None:
|
|
328
349
|
raise excs.Error("Cannot specify overlap for mode='accurate'")
|
|
329
|
-
if overlap >= segment_duration:
|
|
330
|
-
raise excs.Error('overlap must be less than duration')
|
|
331
350
|
if mode == 'fast':
|
|
332
351
|
if params.get('video_encoder') is not None:
|
|
333
352
|
raise excs.Error("Cannot specify video_encoder for mode='fast'")
|
|
@@ -343,13 +362,22 @@ class VideoSplitter(ComponentIterator):
|
|
|
343
362
|
}, []
|
|
344
363
|
|
|
345
364
|
def fast_iter(self) -> Iterator[dict[str, Any]]:
|
|
346
|
-
segment_path: str
|
|
365
|
+
segment_path: str = ''
|
|
347
366
|
try:
|
|
348
367
|
start_time = 0.0
|
|
349
368
|
start_pts = 0
|
|
369
|
+
segment_idx = 0
|
|
350
370
|
while True:
|
|
371
|
+
target_duration: float | None
|
|
372
|
+
if self.segment_duration is not None:
|
|
373
|
+
target_duration = self.segment_duration
|
|
374
|
+
elif self.segment_times is not None and segment_idx < len(self.segment_times):
|
|
375
|
+
target_duration = self.segment_times[segment_idx] - start_time
|
|
376
|
+
else:
|
|
377
|
+
target_duration = None # the rest of the video
|
|
378
|
+
|
|
351
379
|
segment_path = str(TempStore.create_path(extension='.mp4'))
|
|
352
|
-
cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time,
|
|
380
|
+
cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time, target_duration)
|
|
353
381
|
_ = subprocess.run(cmd, capture_output=True, text=True, check=True)
|
|
354
382
|
|
|
355
383
|
# use the actual duration
|
|
@@ -373,8 +401,13 @@ class VideoSplitter(ComponentIterator):
|
|
|
373
401
|
start_time = segment_end - self.overlap
|
|
374
402
|
start_pts = segment_end_pts - round(self.overlap / self.video_time_base)
|
|
375
403
|
|
|
404
|
+
segment_idx += 1
|
|
405
|
+
if self.segment_times is not None and segment_idx > len(self.segment_times):
|
|
406
|
+
# We've created all segments including the final segment after the last segment_time
|
|
407
|
+
break
|
|
408
|
+
|
|
376
409
|
except subprocess.CalledProcessError as e:
|
|
377
|
-
if Path(segment_path).exists():
|
|
410
|
+
if segment_path and Path(segment_path).exists():
|
|
378
411
|
Path(segment_path).unlink()
|
|
379
412
|
error_msg = f'ffmpeg failed with return code {e.returncode}'
|
|
380
413
|
if e.stderr:
|
|
@@ -389,6 +422,7 @@ class VideoSplitter(ComponentIterator):
|
|
|
389
422
|
str(self.video_path),
|
|
390
423
|
output_pattern,
|
|
391
424
|
segment_duration=self.segment_duration,
|
|
425
|
+
segment_times=self.segment_times,
|
|
392
426
|
video_encoder=self.video_encoder,
|
|
393
427
|
video_encoder_args=self.video_encoder_args,
|
|
394
428
|
)
|
pixeltable/plan.py
CHANGED
|
@@ -93,18 +93,13 @@ class SampleClause:
|
|
|
93
93
|
seed: Optional[int]
|
|
94
94
|
stratify_exprs: Optional[list[exprs.Expr]]
|
|
95
95
|
|
|
96
|
-
# This seed value is used if one is not supplied
|
|
97
|
-
DEFAULT_SEED = 0
|
|
98
|
-
|
|
99
96
|
# The version of the hashing algorithm used for ordering and fractional sampling.
|
|
100
97
|
CURRENT_VERSION = 1
|
|
101
98
|
|
|
102
99
|
def __post_init__(self) -> None:
|
|
103
|
-
|
|
100
|
+
# If no version was provided, provide the default version
|
|
104
101
|
if self.version is None:
|
|
105
102
|
self.version = self.CURRENT_VERSION
|
|
106
|
-
if self.seed is None:
|
|
107
|
-
self.seed = self.DEFAULT_SEED
|
|
108
103
|
|
|
109
104
|
@property
|
|
110
105
|
def is_stratified(self) -> bool:
|
|
@@ -1006,6 +1001,7 @@ class Planner:
|
|
|
1006
1001
|
analyzer.window_fn_calls
|
|
1007
1002
|
)
|
|
1008
1003
|
ctx = exec.ExecContext(row_builder)
|
|
1004
|
+
|
|
1009
1005
|
combined_ordering = cls._create_combined_ordering(analyzer, verify_agg=is_python_agg)
|
|
1010
1006
|
cls._verify_join_clauses(analyzer)
|
|
1011
1007
|
|