pixeltable 0.4.17__py3-none-any.whl → 0.4.18__py3-none-any.whl

@@ -306,7 +306,14 @@ def _handle_ffmpeg_error(e: subprocess.CalledProcessError) -> NoReturn:
 
 @pxt.udf(is_method=True)
 def clip(
-    video: pxt.Video, *, start_time: float, end_time: float | None = None, duration: float | None = None
+    video: pxt.Video,
+    *,
+    start_time: float,
+    end_time: float | None = None,
+    duration: float | None = None,
+    mode: Literal['fast', 'accurate'] = 'accurate',
+    video_encoder: str | None = None,
+    video_encoder_args: dict[str, Any] | None = None,
 ) -> pxt.Video | None:
     """
     Extract a clip from a video, specified by `start_time` and either `end_time` or `duration` (in seconds).
@@ -323,6 +330,14 @@ def clip(
         start_time: Start time in seconds
         end_time: End time in seconds
         duration: Duration of the clip in seconds
+        mode:
+
+            - `'fast'`: avoids re-encoding but starts the clip at the nearest keyframes and as a result, the clip
+              duration will be slightly longer than requested
+            - `'accurate'`: extracts a frame-accurate clip, but requires re-encoding
+        video_encoder: Video encoder to use. If not specified, uses the default encoder for the current platform.
+            Only available for `mode='accurate'`.
+        video_encoder_args: Additional arguments to pass to the video encoder. Only available for `mode='accurate'`.
 
     Returns:
         New video containing only the specified time range or None if start_time is beyond the end of the video.
@@ -336,6 +351,11 @@ def clip(
         raise pxt.Error(f'duration must be positive, got {duration}')
     if end_time is not None and duration is not None:
         raise pxt.Error('end_time and duration cannot both be specified')
+    if mode == 'fast':
+        if video_encoder is not None:
+            raise pxt.Error("video_encoder is not supported for mode='fast'")
+        if video_encoder_args is not None:
+            raise pxt.Error("video_encoder_args is not supported for mode='fast'")
 
     video_duration = av_utils.get_video_duration(video)
     if video_duration is not None and start_time > video_duration:
@@ -345,7 +365,15 @@ def clip(
 
     if end_time is not None:
         duration = end_time - start_time
-    cmd = av_utils.ffmpeg_clip_cmd(str(video), output_path, start_time, duration)
+    cmd = av_utils.ffmpeg_clip_cmd(
+        str(video),
+        output_path,
+        start_time,
+        duration,
+        fast=(mode == 'fast'),
+        video_encoder=video_encoder,
+        video_encoder_args=video_encoder_args,
+    )
 
     try:
         result = subprocess.run(cmd, capture_output=True, text=True, check=True)
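A usage sketch of the widened `clip()` signature (hedged: the table name `videos` and its `video` column are assumptions, not part of this diff):

import pixeltable as pxt

videos = pxt.get_table('videos')  # hypothetical table with a `video` column

# Default mode='accurate' re-encodes and produces a frame-accurate clip.
accurate = videos.select(part=videos.video.clip(start_time=30.0, duration=10.0)).collect()

# mode='fast' cuts at the nearest keyframe without re-encoding, so the clip may run slightly long;
# video_encoder / video_encoder_args are rejected in this mode.
fast = videos.select(part=videos.video.clip(start_time=30.0, duration=10.0, mode='fast')).collect()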
@@ -364,7 +392,7 @@ def segment_video(
     *,
     duration: float | None = None,
     segment_times: list[float] | None = None,
-    mode: Literal['fast', 'accurate'] = 'fast',
+    mode: Literal['fast', 'accurate'] = 'accurate',
     video_encoder: str | None = None,
     video_encoder_args: dict[str, Any] | None = None,
 ) -> list[str]:
@@ -400,15 +428,14 @@ def segment_video(
     Examples:
         Split a video at 1 minute intervals using fast mode:
 
-        >>> tbl.select(segment_paths=tbl.video.segment_video(duration=60)).collect()
+        >>> tbl.select(segment_paths=tbl.video.segment_video(duration=60, mode='fast')).collect()
 
-        Split video into exact 10-second segments with accurate mode, using the libx264 encoder with a CRF of 23 and
-        slow preset (for smaller output files):
+        Split video into exact 10-second segments with default accurate mode, using the libx264 encoder with a CRF of 23
+        and slow preset (for smaller output files):
 
         >>> tbl.select(
         ...     segment_paths=tbl.video.segment_video(
         ...         duration=10,
-        ...         mode='accurate',
         ...         video_encoder='libx264',
         ...         video_encoder_args={'crf': 23, 'preset': 'slow'}
         ...     )
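The fast/accurate distinction in the docstring maps onto the usual ffmpeg trade-off between stream copying and re-encoding. The sketch below only illustrates that trade-off; the actual flags emitted by `av_utils.ffmpeg_clip_cmd` are not shown in this diff and may differ:

def sketch_clip_cmd(src: str, dst: str, start: float, duration: float, fast: bool) -> list[str]:
    # fast: seek to the nearest keyframe and copy streams without re-encoding
    if fast:
        return ['ffmpeg', '-ss', str(start), '-i', src, '-t', str(duration), '-c', 'copy', dst]
    # accurate: decode and re-encode, which allows frame-exact cuts (encoder is configurable)
    return ['ffmpeg', '-ss', str(start), '-i', src, '-t', str(duration), '-c:v', 'libx264', dst]

print(' '.join(sketch_clip_cmd('in.mp4', 'out.mp4', 30.0, 10.0, fast=True)))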
pixeltable/globals.py CHANGED
@@ -487,12 +487,28 @@ def get_table(path: str, if_not_exists: Literal['error', 'ignore'] = 'error') ->
     return tbl
 
 
-def move(path: str, new_path: str) -> None:
+def move(
+    path: str,
+    new_path: str,
+    *,
+    if_exists: Literal['error', 'ignore'] = 'error',
+    if_not_exists: Literal['error', 'ignore'] = 'error',
+) -> None:
     """Move a schema object to a new directory and/or rename a schema object.
 
     Args:
         path: absolute path to the existing schema object.
         new_path: absolute new path for the schema object.
+        if_exists: Directive regarding how to handle if a schema object already exists at the new path.
+            Must be one of the following:
+
+            - `'error'`: raise an error
+            - `'ignore'`: do nothing and return
+        if_not_exists: Directive regarding how to handle if the source path does not exist.
+            Must be one of the following:
+
+            - `'error'`: raise an error
+            - `'ignore'`: do nothing and return
 
     Raises:
         Error: If path does not exist or new_path already exists.
@@ -506,13 +522,16 @@ def move(path: str, new_path: str) -> None:
 
     >>>> pxt.move('dir1.my_table', 'dir1.new_name')
     """
+    if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
+    if if_exists_ not in (catalog.IfExistsParam.ERROR, catalog.IfExistsParam.IGNORE):
+        raise excs.Error("`if_exists` must be one of 'error' or 'ignore'")
+    if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
     if path == new_path:
         raise excs.Error('move(): source and destination cannot be identical')
     path_obj, new_path_obj = catalog.Path.parse(path), catalog.Path.parse(new_path)
     if path_obj.is_ancestor(new_path_obj):
         raise excs.Error(f'move(): cannot move {path!r} into its own subdirectory')
-    cat = Catalog.get()
-    cat.move(path_obj, new_path_obj)
+    Catalog.get().move(path_obj, new_path_obj, if_exists_, if_not_exists_)
 
 
 def drop_table(
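A usage sketch of the new directives (the paths are hypothetical, taken from the docstring example):

import pixeltable as pxt

pxt.move('dir1.my_table', 'dir1.new_name')

# With if_not_exists='ignore', re-running the same move after the source is gone is a no-op
# instead of raising; if_exists='ignore' behaves the same way when the destination already exists.
pxt.move('dir1.my_table', 'dir1.new_name', if_not_exists='ignore')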
@@ -660,7 +679,7 @@ def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths:
 
 
 def create_dir(
-    path: str, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error', parents: bool = False
+    path: str, *, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error', parents: bool = False
 ) -> Optional[catalog.Dir]:
     """Create a directory.
 
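Since `if_exists` and `parents` are now keyword-only, callers must pass them by name. A minimal sketch (the directory path is hypothetical):

import pixeltable as pxt

# Creates 'projects' and 'projects.demo' as needed; a second call is a no-op.
pxt.create_dir('projects.demo', if_exists='ignore', parents=True)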
@@ -2,7 +2,7 @@ import dataclasses
 import enum
 import io
 import logging
-from typing import Any, ClassVar, Iterable, Iterator, Optional
+from typing import Any, ClassVar, Iterable, Iterator, Literal
 
 import fitz  # type: ignore[import-untyped]
 import ftfy
@@ -11,7 +11,7 @@ from bs4.element import NavigableString, Tag
 
 from pixeltable.env import Env
 from pixeltable.exceptions import Error
-from pixeltable.type_system import BoolType, ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
+from pixeltable.type_system import ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
 from pixeltable.utils.documents import get_document_handle
 
 from .base import ComponentIterator
@@ -19,6 +19,11 @@ from .base import ComponentIterator
 _logger = logging.getLogger('pixeltable')
 
 
+class Element(enum.Enum):
+    TEXT = 1
+    IMAGE = 2
+
+
 class ChunkMetadata(enum.Enum):
     TITLE = 1
     HEADING = 2
@@ -41,28 +46,28 @@ class DocumentSectionMetadata:
     """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
 
     # html and markdown metadata
-    sourceline: Optional[int] = None
+    sourceline: int | None = None
     # the stack of headings up to the most recently observed one;
     # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-    heading: Optional[dict[str, str]] = None
+    heading: dict[str, str] | None = None
 
     # pdf-specific metadata
-    page: Optional[int] = None
+    page: int | None = None
     # bounding box as an {x1, y1, x2, y2} dictionary
-    bounding_box: Optional[dict[str, float]] = None
+    bounding_box: dict[str, float] | None = None
 
 
 @dataclasses.dataclass
 class DocumentSection:
     """A single document chunk, according to some of the splitting criteria"""
 
-    text: Optional[str]
-    metadata: Optional[DocumentSectionMetadata]
-    image: Optional[PIL.Image.Image] = None
+    text: str | None = None
+    image: PIL.Image.Image | None = None
+    metadata: DocumentSectionMetadata | None = None
 
 
 def _parse_separators(separators: str) -> list[Separator]:
-    ret = []
+    ret: list[Separator] = []
     for s in separators.split(','):
         clean_s = s.strip().upper()
         if not clean_s:
@@ -76,7 +81,7 @@ def _parse_separators(separators: str) -> list[Separator]:
 
 
 def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
-    ret = []
+    ret: list[ChunkMetadata] = []
     for m in metadata.split(','):
         clean_m = m.strip().upper()
         if not clean_m:
@@ -89,6 +94,18 @@ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
     return ret
 
 
+def _parse_elements(elements: list[Literal['text', 'image']]) -> list[Element]:
+    result: list[Element] = []
+    for e in elements:
+        clean_e = e.strip().upper()
+        if clean_e not in Element.__members__:
+            raise Error(f'Invalid element: `{e}`. Valid elements are: {", ".join(Element.__members__).lower()}')
+        result.append(Element[clean_e])
+    if len(result) == 0:
+        raise Error('elements cannot be empty')
+    return result
+
+
 _HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
 
 
@@ -106,11 +123,16 @@ class DocumentSplitter(ComponentIterator):
         separators: separators to use to chunk the document. Options are:
             `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
             This may be a comma-separated string, e.g., `'heading,token_limit'`.
+        elements: list of elements to extract from the document. Options are:
+            `'text'`, `'image'`. Defaults to `['text']` if not specified. The `'image'` element is only supported
+            for the `'page'` separator on PDF documents.
         limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
             or `'char_limit'` is specified.
         metadata: additional metadata fields to include in the output. Options are:
             `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
             (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
+        image_dpi: DPI to use when extracting images from PDFs. Defaults to 300.
+        image_format: format to use when extracting images from PDFs. Defaults to 'png'.
     """
 
     METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
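A hedged sketch of how the new `elements` option would typically be used from the iterator API (the table and column names are assumptions; `DocumentSplitter.create` is the usual way to bind iterator arguments when creating a view):

import pixeltable as pxt
from pixeltable.iterators import DocumentSplitter

docs = pxt.get_table('docs')  # hypothetical table with a `document` column

# One row per PDF page, carrying both the page text and a rendered page image.
chunks = pxt.create_view(
    'doc_chunks',
    docs,
    iterator=DocumentSplitter.create(
        document=docs.document, separators='page', elements=['text', 'image'], metadata='page'
    ),
)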
@@ -121,34 +143,41 @@ class DocumentSplitter(ComponentIterator):
         ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
     }
 
+    _doc_handle: Any
+    _separators: list[Separator]
+    _elements: list[Element]
+    _metadata_fields: list[ChunkMetadata]
+    _doc_title: str
+    _limit: int
+    _skip_tags: list[str]
+    _overlap: int
+    _tiktoken_encoding: str | None
+    _tiktoken_target_model: str | None
+    _image_dpi: int
+    _image_format: str
+
+    _sections: Iterator[DocumentSection]
+
     def __init__(
         self,
         document: str,
         *,
         separators: str,
-        limit: Optional[int] = None,
-        overlap: Optional[int] = None,
+        elements: list[Literal['text', 'image']] | None = None,
+        limit: int | None = None,
+        overlap: int | None = None,
         metadata: str = '',
-        html_skip_tags: Optional[list[str]] = None,
-        tiktoken_encoding: Optional[str] = 'cl100k_base',
-        tiktoken_target_model: Optional[str] = None,
-        # (PDF-processing-only)
-        include_page_image: bool = False,
-        page_image_dpi: int = 300,
-        page_image_format: str = 'png',
+        html_skip_tags: list[str] | None = None,
+        tiktoken_encoding: str | None = 'cl100k_base',
+        tiktoken_target_model: str | None = None,
+        image_dpi: int = 300,
+        image_format: str = 'png',
     ):
         if html_skip_tags is None:
             html_skip_tags = ['nav']
         self._doc_handle = get_document_handle(document)
+        self._elements = _parse_elements(elements.copy()) if elements is not None else [Element.TEXT]
         assert self._doc_handle is not None
-        # calling the output_schema method to validate the input arguments
-        self.output_schema(
-            separators=separators,
-            metadata=metadata,
-            limit=limit,
-            overlap=overlap,
-            include_page_image=include_page_image,
-        )
         self._separators = _parse_separators(separators)
         self._metadata_fields = _parse_metadata(metadata)
         if self._doc_handle.bs_doc is not None:
@@ -164,10 +193,8 @@ class DocumentSplitter(ComponentIterator):
         self._overlap = 0 if overlap is None else overlap
         self._tiktoken_encoding = tiktoken_encoding
         self._tiktoken_target_model = tiktoken_target_model
-
-        self._include_page_image = include_page_image
-        self._page_image_dpi = page_image_dpi
-        self._page_image_format = page_image_format
+        self._image_dpi = image_dpi
+        self._image_format = image_format
 
         # set up processing pipeline
         if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
@@ -197,23 +224,28 @@ class DocumentSplitter(ComponentIterator):
         return {
             'document': DocumentType(nullable=False),
             'separators': StringType(nullable=False),
+            'elements': JsonType(nullable=False),
             'metadata': StringType(nullable=False),
             'limit': IntType(nullable=True),
             'overlap': IntType(nullable=True),
             'skip_tags': StringType(nullable=True),
             'tiktoken_encoding': StringType(nullable=True),
             'tiktoken_target_model': StringType(nullable=True),
-            # PDF options must be declared so validation accepts them:
-            'include_page_image': BoolType(nullable=True),
-            'page_image_dpi': IntType(nullable=True),
-            'page_image_format': StringType(nullable=True),
+            'image_dpi': IntType(nullable=True),
+            'image_format': StringType(nullable=True),
         }
 
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
-        schema: dict[str, ColumnType] = {'text': StringType()}
-        md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
-
+        schema: dict[str, ColumnType] = {}
+        elements = _parse_elements(kwargs.get('elements', ['text']))
+        for element in elements:
+            if element == Element.TEXT:
+                schema['text'] = StringType(nullable=False)
+            elif element == Element.IMAGE:
+                schema['image'] = ImageType(nullable=False)
+
+        md_fields = _parse_metadata(kwargs.get('metadata', ''))
         for md_field in md_fields:
             schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
 
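Based on the schema construction above, requesting both elements should yield both columns plus any requested metadata fields. A small check along these lines (hedged: it relies on parts of `output_schema` that are not shown in this diff):

from pixeltable.iterators import DocumentSplitter

schema, _ = DocumentSplitter.output_schema(
    separators='page', elements=['text', 'image'], metadata='page,bounding_box'
)
print(sorted(schema))  # expected: ['bounding_box', 'image', 'page', 'text']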
@@ -223,6 +255,8 @@ class DocumentSplitter(ComponentIterator):
         limit = kwargs.get('limit')
         overlap = kwargs.get('overlap')
 
+        if Element.IMAGE in elements and separators != [Separator.PAGE]:
+            raise Error('Image elements are only supported for the "page" separator on PDF documents')
         if limit is not None or overlap is not None:
             if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
                 raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
@@ -236,23 +270,25 @@ class DocumentSplitter(ComponentIterator):
             if kwargs.get('limit') is None:
                 raise Error('limit is required with "token_limit"/"char_limit" separators')
 
-        # check dependencies at the end
         if Separator.SENTENCE in separators:
             _ = Env.get().spacy_nlp
         if Separator.TOKEN_LIMIT in separators:
             Env.get().require_package('tiktoken')
 
-        if kwargs.get('include_page_image'):
-            schema['image'] = ImageType(nullable=True)
-
         return schema, []
 
     def __next__(self) -> dict[str, Any]:
         while True:
             section = next(self._sections)
-            if section.text is None:
+            if section.text is None and section.image is None:
                 continue
-            result: dict[str, Any] = {'text': section.text}
+            result: dict[str, Any] = {}
+            for element in self._elements:
+                if element == Element.TEXT:
+                    result['text'] = section.text
+                elif element == Element.IMAGE:
+                    result['image'] = section.image
+
             for md_field in self._metadata_fields:
                 if md_field == ChunkMetadata.TITLE:
                     result[md_field.name.lower()] = self._doc_title
@@ -265,10 +301,6 @@ class DocumentSplitter(ComponentIterator):
                 elif md_field == ChunkMetadata.BOUNDING_BOX:
                     result[md_field.name.lower()] = section.metadata.bounding_box
 
-            # FIX: only include image if schema supports it
-            if self._include_page_image:
-                result['image'] = section.image
-
             return result
 
     def _html_sections(self) -> Iterator[DocumentSection]:
@@ -411,11 +443,10 @@ class DocumentSplitter(ComponentIterator):
             return txt
 
         for page_idx, page in enumerate(doc.pages()):
-            # render once per page if requested
-            page_image = None
-            if self._include_page_image:
-                pix = page.get_pixmap(dpi=self._page_image_dpi)  # ← single render
-                page_image = PIL.Image.open(io.BytesIO(pix.tobytes(self._page_image_format)))
+            img: PIL.Image.Image | None = None
+            if Element.IMAGE in self._elements:
+                pix = page.get_pixmap(dpi=self._image_dpi)
+                img = PIL.Image.open(io.BytesIO(pix.tobytes(self._image_format)))
 
             for block in page.get_text('blocks'):
                 x1, y1, x2, y2, text, *_ = block
@@ -423,14 +454,14 @@ class DocumentSplitter(ComponentIterator):
                 if accumulated_text and emit_on_paragraph:
                     bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
                     md = DocumentSectionMetadata(page=page_idx, bounding_box=bbox)
-                    yield DocumentSection(text=_emit_text(), metadata=md, image=page_image)
+                    yield DocumentSection(text=_emit_text(), metadata=md)
 
             if accumulated_text and emit_on_page and not emit_on_paragraph:
                 md = DocumentSectionMetadata(page=page_idx)
-                yield DocumentSection(text=_emit_text(), metadata=md, image=page_image)
+                yield DocumentSection(text=_emit_text(), image=img, metadata=md)
 
         if accumulated_text and not emit_on_page:
-            yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(), image=None)
+            yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
 
     def _txt_sections(self) -> Iterator[DocumentSection]:
         """Create DocumentSections for text files.
@@ -251,7 +251,8 @@ class VideoSplitter(ComponentIterator):
 
     # Input parameters
     video_path: Path
-    segment_duration: float
+    segment_duration: float | None
+    segment_times: list[float] | None
     overlap: float
     min_segment_duration: float
     video_encoder: str | None
@@ -268,25 +269,31 @@ class VideoSplitter(ComponentIterator):
         self,
         video: str,
         *,
-        duration: float,
-        overlap: float = 0.0,
-        min_segment_duration: float = 0.0,
-        mode: Literal['fast', 'accurate'] = 'fast',
+        duration: float | None = None,
+        overlap: float | None = None,
+        min_segment_duration: float | None = None,
+        segment_times: list[float] | None = None,
+        mode: Literal['fast', 'accurate'] = 'accurate',
         video_encoder: str | None = None,
         video_encoder_args: dict[str, Any] | None = None,
     ):
         Env.get().require_binary('ffmpeg')
-        assert duration > 0.0
-        assert duration >= min_segment_duration
-        assert overlap < duration
+        assert (duration is not None) != (segment_times is not None)
+        if segment_times is not None:
+            assert len(segment_times) > 0
+        if duration is not None:
+            assert duration > 0.0
+            assert duration >= min_segment_duration
+            assert overlap is None or overlap < duration
 
         video_path = Path(video)
         assert video_path.exists() and video_path.is_file()
 
         self.video_path = video_path
         self.segment_duration = duration
-        self.overlap = overlap
-        self.min_segment_duration = min_segment_duration
+        self.overlap = overlap if overlap is not None else 0.0
+        self.min_segment_duration = min_segment_duration if min_segment_duration is not None else 0.0
+        self.segment_times = segment_times
         self.video_encoder = video_encoder
         self.video_encoder_args = video_encoder_args
 
@@ -304,6 +311,7 @@ class VideoSplitter(ComponentIterator):
             'duration': ts.FloatType(nullable=True),
             'overlap': ts.FloatType(nullable=True),
             'min_segment_duration': ts.FloatType(nullable=True),
+            'segment_times': ts.JsonType(nullable=True),
             'mode': ts.StringType(nullable=False),
             'video_encoder': ts.StringType(nullable=True),
             'video_encoder_args': ts.JsonType(nullable=True),
@@ -311,23 +319,34 @@ class VideoSplitter(ComponentIterator):
 
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
-        param_names = ['duration', 'overlap', 'min_segment_duration']
+        param_names = ['duration', 'overlap', 'min_segment_duration', 'segment_times']
         params = dict(zip(param_names, args))
         params.update(kwargs)
 
-        segment_duration = params['duration']
-        min_segment_duration = params.get('min_segment_duration', 0.0)
-        overlap = params.get('overlap', 0.0)
+        segment_duration = params.get('duration')
+        segment_times = params.get('segment_times')
+        overlap = params.get('overlap')
+        min_segment_duration = params.get('min_segment_duration')
         mode = params.get('mode', 'fast')
 
-        if segment_duration <= 0.0:
-            raise excs.Error('duration must be a positive number')
-        if segment_duration < min_segment_duration:
-            raise excs.Error('duration must be at least min_segment_duration')
-        if mode == 'accurate' and overlap > 0:
+        if segment_duration is None and segment_times is None:
+            raise excs.Error('Must specify either duration or segment_times')
+        if segment_duration is not None and segment_times is not None:
+            raise excs.Error('duration and segment_times cannot both be specified')
+        if segment_times is not None:
+            if len(segment_times) == 0:
+                raise excs.Error('segment_times cannot be empty')
+            if overlap is not None:
+                raise excs.Error('overlap cannot be specified with segment_times')
+        if segment_duration is not None:
+            if segment_duration <= 0.0:
+                raise excs.Error('duration must be a positive number')
+            if min_segment_duration is not None and segment_duration < min_segment_duration:
+                raise excs.Error('duration must be at least min_segment_duration')
+            if overlap is not None and overlap >= segment_duration:
+                raise excs.Error('overlap must be less than duration')
+        if mode == 'accurate' and overlap is not None:
             raise excs.Error("Cannot specify overlap for mode='accurate'")
-        if overlap >= segment_duration:
-            raise excs.Error('overlap must be less than duration')
         if mode == 'fast':
             if params.get('video_encoder') is not None:
                 raise excs.Error("Cannot specify video_encoder for mode='fast'")
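A usage sketch for the new `segment_times` option via `segment_video` (the table name is an assumption, not part of this diff):

import pixeltable as pxt

videos = pxt.get_table('videos')  # hypothetical table with a `video` column

# Split at explicit timestamps (in seconds); a final segment covers the remainder of the video.
videos.select(
    parts=videos.video.segment_video(segment_times=[60.0, 180.0, 300.0], mode='fast')
).collect()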
@@ -343,13 +362,22 @@ class VideoSplitter(ComponentIterator):
         }, []
 
     def fast_iter(self) -> Iterator[dict[str, Any]]:
-        segment_path: str
+        segment_path: str = ''
         try:
             start_time = 0.0
             start_pts = 0
+            segment_idx = 0
             while True:
+                target_duration: float | None
+                if self.segment_duration is not None:
+                    target_duration = self.segment_duration
+                elif self.segment_times is not None and segment_idx < len(self.segment_times):
+                    target_duration = self.segment_times[segment_idx] - start_time
+                else:
+                    target_duration = None  # the rest of the video
+
                 segment_path = str(TempStore.create_path(extension='.mp4'))
-                cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time, self.segment_duration)
+                cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time, target_duration)
                 _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
 
                 # use the actual duration
@@ -373,8 +401,13 @@ class VideoSplitter(ComponentIterator):
                 start_time = segment_end - self.overlap
                 start_pts = segment_end_pts - round(self.overlap / self.video_time_base)
 
+                segment_idx += 1
+                if self.segment_times is not None and segment_idx > len(self.segment_times):
+                    # We've created all segments including the final segment after the last segment_time
+                    break
+
         except subprocess.CalledProcessError as e:
-            if Path(segment_path).exists():
+            if segment_path and Path(segment_path).exists():
                 Path(segment_path).unlink()
             error_msg = f'ffmpeg failed with return code {e.returncode}'
             if e.stderr:
@@ -389,6 +422,7 @@ class VideoSplitter(ComponentIterator):
             str(self.video_path),
             output_pattern,
             segment_duration=self.segment_duration,
+            segment_times=self.segment_times,
             video_encoder=self.video_encoder,
             video_encoder_args=self.video_encoder_args,
         )
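An illustration of the fast-mode scheduling implied above: each iteration clips `target_duration` seconds starting at `start_time`, and the final iteration (target_duration=None) takes the rest of the video. This sketch uses the nominal timestamps only; the real loop advances `start_time` from the actual, keyframe-aligned end of each segment.

def plan_fast_segments(segment_times: list[float]) -> list[tuple[float, float | None]]:
    plan: list[tuple[float, float | None]] = []
    start_time = 0.0
    for t in segment_times:
        plan.append((start_time, t - start_time))  # clip up to the next requested boundary
        start_time = t
    plan.append((start_time, None))  # remainder of the video
    return plan

print(plan_fast_segments([60.0, 180.0, 300.0]))
# [(0.0, 60.0), (60.0, 120.0), (180.0, 120.0), (300.0, None)]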
pixeltable/plan.py CHANGED
@@ -93,18 +93,13 @@ class SampleClause:
     seed: Optional[int]
     stratify_exprs: Optional[list[exprs.Expr]]
 
-    # This seed value is used if one is not supplied
-    DEFAULT_SEED = 0
-
     # The version of the hashing algorithm used for ordering and fractional sampling.
     CURRENT_VERSION = 1
 
     def __post_init__(self) -> None:
-        """If no version was provided, provide the default version"""
+        # If no version was provided, provide the default version
         if self.version is None:
             self.version = self.CURRENT_VERSION
-        if self.seed is None:
-            self.seed = self.DEFAULT_SEED
 
     @property
     def is_stratified(self) -> bool:
@@ -1006,6 +1001,7 @@ class Planner:
             analyzer.window_fn_calls
         )
         ctx = exec.ExecContext(row_builder)
+
         combined_ordering = cls._create_combined_ordering(analyzer, verify_agg=is_python_agg)
         cls._verify_join_clauses(analyzer)