pixeltable 0.4.16__py3-none-any.whl → 0.4.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

@@ -1,13 +1,17 @@
  import dataclasses
  import enum
+ import io
  import logging
  from typing import Any, ClassVar, Iterable, Iterator, Optional

+ import fitz  # type: ignore[import-untyped]
  import ftfy
+ import PIL.Image
+ from bs4.element import NavigableString, Tag

  from pixeltable.env import Env
  from pixeltable.exceptions import Error
- from pixeltable.type_system import ColumnType, DocumentType, IntType, JsonType, StringType
+ from pixeltable.type_system import BoolType, ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
  from pixeltable.utils.documents import get_document_handle

  from .base import ComponentIterator
@@ -54,6 +58,7 @@ class DocumentSection:

      text: Optional[str]
      metadata: Optional[DocumentSectionMetadata]
+     image: Optional[PIL.Image.Image] = None


  def _parse_separators(separators: str) -> list[Separator]:
@@ -95,6 +100,8 @@ class DocumentSplitter(ComponentIterator):

      Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.

+     How to init the `DocumentSplitter` class?
+
      Args:
          separators: separators to use to chunk the document. Options are:
              `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
@@ -125,13 +132,23 @@ class DocumentSplitter(ComponentIterator):
          html_skip_tags: Optional[list[str]] = None,
          tiktoken_encoding: Optional[str] = 'cl100k_base',
          tiktoken_target_model: Optional[str] = None,
+         # (PDF-processing-only)
+         include_page_image: bool = False,
+         page_image_dpi: int = 300,
+         page_image_format: str = 'png',
      ):
          if html_skip_tags is None:
              html_skip_tags = ['nav']
          self._doc_handle = get_document_handle(document)
          assert self._doc_handle is not None
          # calling the output_schema method to validate the input arguments
-         self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
+         self.output_schema(
+             separators=separators,
+             metadata=metadata,
+             limit=limit,
+             overlap=overlap,
+             include_page_image=include_page_image,
+         )
          self._separators = _parse_separators(separators)
          self._metadata_fields = _parse_metadata(metadata)
          if self._doc_handle.bs_doc is not None:
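Taken together, these document.py changes add optional page-image output for PDF inputs: `include_page_image` (with `page_image_dpi` and `page_image_format`) is threaded from `__init__` through `output_schema`, so each emitted chunk can carry the rendered page. A minimal usage sketch; the table and column names are illustrative, and it assumes the usual `create_view()` / `DocumentSplitter.create()` pattern from the pixeltable API:

    import pixeltable as pxt
    from pixeltable.iterators import DocumentSplitter

    docs = pxt.get_table('docs')  # hypothetical table with a 'document' column

    # New in 0.4.17 (PDF inputs only): include_page_image, page_image_dpi, page_image_format.
    # With include_page_image=True, each chunk row also exposes an 'image' column
    # holding the page rendered by PyMuPDF.
    chunks = pxt.create_view(
        'doc_chunks',
        docs,
        iterator=DocumentSplitter.create(
            document=docs.document,
            separators='page',
            metadata='page,bounding_box',
            include_page_image=True,
            page_image_dpi=150,
        ),
    )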
@@ -148,6 +165,10 @@ class DocumentSplitter(ComponentIterator):
          self._tiktoken_encoding = tiktoken_encoding
          self._tiktoken_target_model = tiktoken_target_model

+         self._include_page_image = include_page_image
+         self._page_image_dpi = page_image_dpi
+         self._page_image_format = page_image_format
+
          # set up processing pipeline
          if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
              assert self._doc_handle.bs_doc is not None
@@ -182,6 +203,10 @@ class DocumentSplitter(ComponentIterator):
              'skip_tags': StringType(nullable=True),
              'tiktoken_encoding': StringType(nullable=True),
              'tiktoken_target_model': StringType(nullable=True),
+             # PDF options must be declared so validation accepts them:
+             'include_page_image': BoolType(nullable=True),
+             'page_image_dpi': IntType(nullable=True),
+             'page_image_format': StringType(nullable=True),
          }

      @classmethod
@@ -211,6 +236,15 @@ class DocumentSplitter(ComponentIterator):
          if kwargs.get('limit') is None:
              raise Error('limit is required with "token_limit"/"char_limit" separators')

+         # check dependencies at the end
+         if Separator.SENTENCE in separators:
+             _ = Env.get().spacy_nlp
+         if Separator.TOKEN_LIMIT in separators:
+             Env.get().require_package('tiktoken')
+
+         if kwargs.get('include_page_image'):
+             schema['image'] = ImageType(nullable=True)
+
          return schema, []

      def __next__(self) -> dict[str, Any]:
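With `include_page_image=True`, the validated output schema gains a nullable `image` column (`ImageType`). A minimal sketch of the classmethod call that `__init__` performs, assuming the remaining keyword arguments can be left at their defaults:

    from pixeltable.iterators import DocumentSplitter

    schema, _ = DocumentSplitter.output_schema(
        separators='page', metadata='page,bounding_box', include_page_image=True
    )
    assert 'image' in schema  # rendered PDF page, nullable ImageType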
@@ -230,6 +264,11 @@ class DocumentSplitter(ComponentIterator):
                  result[md_field.name.lower()] = section.metadata.page
              elif md_field == ChunkMetadata.BOUNDING_BOX:
                  result[md_field.name.lower()] = section.metadata.bounding_box
+
+         # FIX: only include image if schema supports it
+         if self._include_page_image:
+             result['image'] = section.image
+
          return result

      def _html_sections(self) -> Iterator[DocumentSection]:
@@ -265,7 +304,7 @@ class DocumentSplitter(ComponentIterator):
              yield DocumentSection(text=full_text, metadata=md)
              accumulated_text = []

-         def process_element(el: bs4.element.Tag | bs4.NavigableString) -> Iterator[DocumentSection]:
+         def process_element(el: Tag | NavigableString) -> Iterator[DocumentSection]:
              # process the element and emit sections as necessary
              nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph

@@ -353,46 +392,45 @@
          yield from emit()

      def _pdf_sections(self) -> Iterator[DocumentSection]:
-         """Create DocumentSections reflecting the pdf-specific separators"""
-         import fitz  # type: ignore[import-untyped]
-
          doc: fitz.Document = self._doc_handle.pdf_doc
          assert doc is not None

          emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
          emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph

-         accumulated_text = []  # invariant: all elements are ftfy clean and non-empty
+         accumulated_text: list[str] = []

-         def _add_cleaned_text(raw_text: str) -> None:
-             fixed = ftfy.fix_text(raw_text)
+         def _add_cleaned(raw: str) -> None:
+             fixed = ftfy.fix_text(raw)
              if fixed:
                  accumulated_text.append(fixed)

          def _emit_text() -> str:
-             full_text = ''.join(accumulated_text)
+             txt = ''.join(accumulated_text)
              accumulated_text.clear()
-             return full_text
+             return txt
+
+         for page_idx, page in enumerate(doc.pages()):
+             # render once per page if requested
+             page_image = None
+             if self._include_page_image:
+                 pix = page.get_pixmap(dpi=self._page_image_dpi)  # ← single render
+                 page_image = PIL.Image.open(io.BytesIO(pix.tobytes(self._page_image_format)))

-         for page_number, page in enumerate(doc.pages()):
              for block in page.get_text('blocks'):
-                 # there is no concept of paragraph in pdf, block is the closest thing
-                 # we can get (eg a paragraph in text may cut across pages)
-                 # see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
-                 # other libraries like pdfminer also lack an explicit paragraph concept
-                 x1, y1, x2, y2, text, _, _ = block
-                 _add_cleaned_text(text)
+                 x1, y1, x2, y2, text, *_ = block
+                 _add_cleaned(text)
                  if accumulated_text and emit_on_paragraph:
                      bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
-                     metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
-                     yield DocumentSection(text=_emit_text(), metadata=metadata)
+                     md = DocumentSectionMetadata(page=page_idx, bounding_box=bbox)
+                     yield DocumentSection(text=_emit_text(), metadata=md, image=page_image)

              if accumulated_text and emit_on_page and not emit_on_paragraph:
-                 yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(page=page_number))
-                 accumulated_text = []
+                 md = DocumentSectionMetadata(page=page_idx)
+                 yield DocumentSection(text=_emit_text(), metadata=md, image=page_image)

          if accumulated_text and not emit_on_page:
-             yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
+             yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(), image=None)

      def _txt_sections(self) -> Iterator[DocumentSection]:
          """Create DocumentSections for text files.
@@ -1,10 +1,10 @@
+ import glob
  import logging
  import math
- import shutil
  import subprocess
  from fractions import Fraction
  from pathlib import Path
- from typing import Any, Optional
+ from typing import Any, Iterator, Literal, Optional

  import av
  import pandas as pd
@@ -14,6 +14,7 @@ import pixeltable as pxt
  import pixeltable.exceptions as excs
  import pixeltable.type_system as ts
  import pixeltable.utils.av as av_utils
+ from pixeltable.env import Env
  from pixeltable.utils.local_store import TempStore

  from .base import ComponentIterator
@@ -237,9 +238,15 @@ class VideoSplitter(ComponentIterator):
      seconds.

      Args:
-         segment_duration: Video segment duration in seconds
-         overlap: Overlap between consecutive segments in seconds.
-         min_segment_duration: Drop the last segment if it is smaller than min_segment_duration
+         duration: Video segment duration in seconds
+         overlap: Overlap between consecutive segments in seconds. Only available for `mode='fast'`.
+         min_segment_duration: Drop the last segment if it is smaller than min_segment_duration.
+         mode: Segmentation mode:
+             - `'fast'`: Quick segmentation using stream copy (splits only at keyframes, approximate durations)
+             - `'accurate'`: Precise segmentation with re-encoding (exact durations, slower)
+         video_encoder: Video encoder to use. If not specified, uses the default encoder for the current platform.
+             Only available for `mode='accurate'`.
+         video_encoder_args: Additional arguments to pass to the video encoder. Only available for `mode='accurate'`.
      """

      # Input parameters
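On the video side, `segment_duration` is renamed to `duration`, the ffmpeg check moves to `Env.get().require_binary('ffmpeg')`, and a new `mode` parameter chooses between keyframe-based stream copy (`'fast'`) and re-encoded exact cuts (`'accurate'`); as the validation below shows, `overlap` is rejected in accurate mode and `video_encoder`/`video_encoder_args` are rejected in fast mode. A usage sketch with illustrative table and column names, assuming `VideoSplitter` is exposed from `pixeltable.iterators` and used through the usual `create_view()` / `create()` pattern:

    import pixeltable as pxt
    from pixeltable.iterators import VideoSplitter

    videos = pxt.get_table('videos')  # hypothetical table with a 'video' column

    # 'fast' (default): stream copy, cuts land on keyframes, overlap allowed.
    clips_fast = pxt.create_view(
        'clips_fast',
        videos,
        iterator=VideoSplitter.create(video=videos.video, duration=30.0, overlap=2.0),
    )

    # 'accurate': re-encodes for exact durations; overlap is rejected, but an
    # explicit encoder and encoder args may be passed through to ffmpeg.
    clips_exact = pxt.create_view(
        'clips_exact',
        videos,
        iterator=VideoSplitter.create(
            video=videos.video,
            duration=30.0,
            mode='accurate',
            video_encoder='libx264',  # assumption: any encoder name ffmpeg accepts
        ),
    )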
@@ -247,65 +254,85 @@ class VideoSplitter(ComponentIterator):
      segment_duration: float
      overlap: float
      min_segment_duration: float
+     video_encoder: str | None
+     video_encoder_args: dict[str, Any] | None

      # Video metadata
      video_duration: float
      video_time_base: Fraction
      video_start_time: int

-     # position tracking
-     next_segment_start: float
-     next_segment_start_pts: int
+     output_iter: Iterator[dict[str, Any]]

-     def __init__(self, video: str, segment_duration: float, *, overlap: float = 0.0, min_segment_duration: float = 0.0):
-         assert segment_duration > 0.0
-         assert segment_duration >= min_segment_duration
-         assert overlap < segment_duration
+     def __init__(
+         self,
+         video: str,
+         *,
+         duration: float,
+         overlap: float = 0.0,
+         min_segment_duration: float = 0.0,
+         mode: Literal['fast', 'accurate'] = 'fast',
+         video_encoder: str | None = None,
+         video_encoder_args: dict[str, Any] | None = None,
+     ):
+         Env.get().require_binary('ffmpeg')
+         assert duration > 0.0
+         assert duration >= min_segment_duration
+         assert overlap < duration

          video_path = Path(video)
          assert video_path.exists() and video_path.is_file()

-         if not shutil.which('ffmpeg'):
-             raise pxt.Error('ffmpeg is not installed or not in PATH. Please install ffmpeg to use VideoSplitter.')
-
          self.video_path = video_path
-         self.segment_duration = segment_duration
+         self.segment_duration = duration
          self.overlap = overlap
          self.min_segment_duration = min_segment_duration
+         self.video_encoder = video_encoder
+         self.video_encoder_args = video_encoder_args

          with av.open(str(video_path)) as container:
              video_stream = container.streams.video[0]
              self.video_time_base = video_stream.time_base
              self.video_start_time = video_stream.start_time or 0

-         self.next_segment_start = float(self.video_start_time * self.video_time_base)
-         self.next_segment_start_pts = self.video_start_time
+         self.output_iter = self.fast_iter() if mode == 'fast' else self.accurate_iter()

      @classmethod
      def input_schema(cls) -> dict[str, ts.ColumnType]:
          return {
              'video': ts.VideoType(nullable=False),
-             'segment_duration': ts.FloatType(nullable=False),
+             'duration': ts.FloatType(nullable=True),
              'overlap': ts.FloatType(nullable=True),
              'min_segment_duration': ts.FloatType(nullable=True),
+             'mode': ts.StringType(nullable=False),
+             'video_encoder': ts.StringType(nullable=True),
+             'video_encoder_args': ts.JsonType(nullable=True),
          }

      @classmethod
      def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
-         param_names = ['segment_duration', 'overlap', 'min_segment_duration']
+         param_names = ['duration', 'overlap', 'min_segment_duration']
          params = dict(zip(param_names, args))
          params.update(kwargs)

-         segment_duration = params['segment_duration']
+         segment_duration = params['duration']
          min_segment_duration = params.get('min_segment_duration', 0.0)
          overlap = params.get('overlap', 0.0)
+         mode = params.get('mode', 'fast')

          if segment_duration <= 0.0:
-             raise excs.Error('segment_duration must be a positive number')
+             raise excs.Error('duration must be a positive number')
          if segment_duration < min_segment_duration:
-             raise excs.Error('segment_duration must be at least min_segment_duration')
+             raise excs.Error('duration must be at least min_segment_duration')
+         if mode == 'accurate' and overlap > 0:
+             raise excs.Error("Cannot specify overlap for mode='accurate'")
          if overlap >= segment_duration:
-             raise excs.Error('overlap must be less than segment_duration')
+             raise excs.Error('overlap must be less than duration')
+         if mode == 'fast':
+             if params.get('video_encoder') is not None:
+                 raise excs.Error("Cannot specify video_encoder for mode='fast'")
+             if params.get('video_encoder_args') is not None:
+                 raise excs.Error("Cannot specify video_encoder_args for mode='fast'")


          return {
@@ -315,48 +342,94 @@
              'video_segment': ts.VideoType(nullable=False),
          }, []

-     def __next__(self) -> dict[str, Any]:
-         segment_path = str(TempStore.create_path(extension='.mp4'))
+     def fast_iter(self) -> Iterator[dict[str, Any]]:
+         segment_path: str
          try:
-             cmd = av_utils.ffmpeg_clip_cmd(
-                 str(self.video_path), segment_path, self.next_segment_start, self.segment_duration
-             )
-             _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
+             start_time = 0.0
+             start_pts = 0
+             while True:
+                 segment_path = str(TempStore.create_path(extension='.mp4'))
+                 cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time, self.segment_duration)
+                 _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
+
+                 # use the actual duration
+                 segment_duration = av_utils.get_video_duration(segment_path)
+                 if segment_duration - self.overlap == 0.0 or segment_duration < self.min_segment_duration:
+                     # we're done
+                     Path(segment_path).unlink()
+                     return
+
+                 segment_end = start_time + segment_duration
+                 segment_end_pts = start_pts + round(segment_duration / self.video_time_base)
+                 result = {
+                     'segment_start': start_time,
+                     'segment_start_pts': start_pts,
+                     'segment_end': segment_end,
+                     'segment_end_pts': segment_end_pts,
+                     'video_segment': segment_path,
+                 }
+                 yield result

-             # use the actual duration
-             segment_duration = av_utils.get_video_duration(segment_path)
-             if segment_duration - self.overlap == 0.0:
-                 # we're done
-                 Path(segment_path).unlink()
-                 raise StopIteration
+                 start_time = segment_end - self.overlap
+                 start_pts = segment_end_pts - round(self.overlap / self.video_time_base)

-             if segment_duration < self.min_segment_duration:
+         except subprocess.CalledProcessError as e:
+             if Path(segment_path).exists():
                  Path(segment_path).unlink()
-                 raise StopIteration
-
-             segment_end = self.next_segment_start + segment_duration
-             segment_end_pts = self.next_segment_start_pts + round(segment_duration / self.video_time_base)
+             error_msg = f'ffmpeg failed with return code {e.returncode}'
+             if e.stderr:
+                 error_msg += f': {e.stderr.strip()}'
+             raise pxt.Error(error_msg) from e

-             result = {
-                 'segment_start': self.next_segment_start,
-                 'segment_start_pts': self.next_segment_start_pts,
-                 'segment_end': segment_end,
-                 'segment_end_pts': segment_end_pts,
-                 'video_segment': segment_path,
-             }
-             self.next_segment_start = segment_end - self.overlap
-             self.next_segment_start_pts = segment_end_pts - round(self.overlap / self.video_time_base)
+     def accurate_iter(self) -> Iterator[dict[str, Any]]:
+         base_path = TempStore.create_path(extension='')
+         # Use ffmpeg -f segment for accurate segmentation with re-encoding
+         output_pattern = f'{base_path}_segment_%04d.mp4'
+         cmd = av_utils.ffmpeg_segment_cmd(
+             str(self.video_path),
+             output_pattern,
+             segment_duration=self.segment_duration,
+             video_encoder=self.video_encoder,
+             video_encoder_args=self.video_encoder_args,
+         )

-             return result
+         try:
+             _ = subprocess.run(cmd, capture_output=True, text=True, check=True)
+             output_paths = sorted(glob.glob(f'{base_path}_segment_*.mp4'))
+             # TODO: is this actually an error?
+             # if len(output_paths) == 0:
+             #     stderr_output = result.stderr.strip() if result.stderr is not None else ''
+             #     raise pxt.Error(
+             #         f'ffmpeg failed to create output files for commandline: {" ".join(cmd)}\n{stderr_output}'
+             #     )
+             start_time = 0.0
+             start_pts = 0
+             for segment_path in output_paths:
+                 segment_duration = av_utils.get_video_duration(segment_path)
+                 if segment_duration < self.min_segment_duration:
+                     Path(segment_path).unlink()
+                     return
+
+                 result = {
+                     'segment_start': start_time,
+                     'segment_start_pts': start_pts,
+                     'segment_end': start_time + segment_duration,
+                     'segment_end_pts': start_pts + round(segment_duration / self.video_time_base),
+                     'video_segment': segment_path,
+                 }
+                 yield result
+                 start_time += segment_duration
+                 start_pts += round(segment_duration / self.video_time_base)

          except subprocess.CalledProcessError as e:
-             if Path(segment_path).exists():
-                 Path(segment_path).unlink()
              error_msg = f'ffmpeg failed with return code {e.returncode}'
              if e.stderr:
                  error_msg += f': {e.stderr.strip()}'
              raise pxt.Error(error_msg) from e

+     def __next__(self) -> dict[str, Any]:
+         return next(self.output_iter)
+
      def close(self) -> None:
          pass
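The helpers `ffmpeg_clip_cmd` and `ffmpeg_segment_cmd` live in `pixeltable.utils.av` and are not part of this diff, so their exact command lines are unknown. Purely as an illustration of the 'accurate' path, an equivalent fixed-length split with re-encoding can be expressed with ffmpeg's segment muxer; the flag choices below are assumptions, not the helper's actual output:

    import subprocess

    def segment_cmd(src: str, out_pattern: str, seg_dur: float, encoder: str = 'libx264') -> list[str]:
        # Illustrative only: re-encode and cut into fixed-length pieces with the
        # ffmpeg segment muxer; the real ffmpeg_segment_cmd may differ.
        return [
            'ffmpeg', '-i', src,
            '-c:v', encoder, '-c:a', 'aac',
            '-f', 'segment', '-segment_time', str(seg_dur),
            '-reset_timestamps', '1',
            out_pattern,  # e.g. '/tmp/clip_segment_%04d.mp4'
        ]

    subprocess.run(segment_cmd('input.mp4', '/tmp/clip_segment_%04d.mp4', 30.0), check=True)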