pixeltable 0.4.17__py3-none-any.whl → 0.4.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (153)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/catalog.py +144 -118
  4. pixeltable/catalog/column.py +104 -115
  5. pixeltable/catalog/globals.py +1 -2
  6. pixeltable/catalog/insertable_table.py +44 -49
  7. pixeltable/catalog/path.py +3 -4
  8. pixeltable/catalog/schema_object.py +4 -4
  9. pixeltable/catalog/table.py +139 -124
  10. pixeltable/catalog/table_metadata.py +6 -6
  11. pixeltable/catalog/table_version.py +315 -246
  12. pixeltable/catalog/table_version_handle.py +4 -4
  13. pixeltable/catalog/table_version_path.py +9 -10
  14. pixeltable/catalog/tbl_ops.py +9 -3
  15. pixeltable/catalog/view.py +34 -28
  16. pixeltable/config.py +14 -10
  17. pixeltable/dataframe.py +69 -78
  18. pixeltable/env.py +78 -64
  19. pixeltable/exec/aggregation_node.py +6 -6
  20. pixeltable/exec/cache_prefetch_node.py +10 -10
  21. pixeltable/exec/data_row_batch.py +3 -3
  22. pixeltable/exec/exec_context.py +16 -4
  23. pixeltable/exec/exec_node.py +5 -5
  24. pixeltable/exec/expr_eval/evaluators.py +6 -6
  25. pixeltable/exec/expr_eval/expr_eval_node.py +8 -7
  26. pixeltable/exec/expr_eval/globals.py +6 -6
  27. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  28. pixeltable/exec/expr_eval/schedulers.py +11 -11
  29. pixeltable/exec/in_memory_data_node.py +2 -2
  30. pixeltable/exec/object_store_save_node.py +14 -17
  31. pixeltable/exec/sql_node.py +28 -27
  32. pixeltable/exprs/arithmetic_expr.py +4 -4
  33. pixeltable/exprs/array_slice.py +2 -2
  34. pixeltable/exprs/column_property_ref.py +3 -3
  35. pixeltable/exprs/column_ref.py +61 -74
  36. pixeltable/exprs/comparison.py +5 -5
  37. pixeltable/exprs/compound_predicate.py +3 -3
  38. pixeltable/exprs/data_row.py +12 -12
  39. pixeltable/exprs/expr.py +41 -31
  40. pixeltable/exprs/expr_dict.py +3 -3
  41. pixeltable/exprs/expr_set.py +3 -3
  42. pixeltable/exprs/function_call.py +14 -14
  43. pixeltable/exprs/in_predicate.py +4 -4
  44. pixeltable/exprs/inline_expr.py +8 -8
  45. pixeltable/exprs/is_null.py +1 -3
  46. pixeltable/exprs/json_mapper.py +8 -8
  47. pixeltable/exprs/json_path.py +6 -6
  48. pixeltable/exprs/literal.py +5 -5
  49. pixeltable/exprs/method_ref.py +2 -2
  50. pixeltable/exprs/object_ref.py +2 -2
  51. pixeltable/exprs/row_builder.py +14 -14
  52. pixeltable/exprs/rowid_ref.py +8 -8
  53. pixeltable/exprs/similarity_expr.py +50 -25
  54. pixeltable/exprs/sql_element_cache.py +4 -4
  55. pixeltable/exprs/string_op.py +2 -2
  56. pixeltable/exprs/type_cast.py +3 -5
  57. pixeltable/func/aggregate_function.py +8 -8
  58. pixeltable/func/callable_function.py +9 -9
  59. pixeltable/func/expr_template_function.py +3 -3
  60. pixeltable/func/function.py +15 -17
  61. pixeltable/func/function_registry.py +6 -7
  62. pixeltable/func/globals.py +2 -3
  63. pixeltable/func/mcp.py +2 -2
  64. pixeltable/func/query_template_function.py +16 -16
  65. pixeltable/func/signature.py +14 -14
  66. pixeltable/func/tools.py +11 -11
  67. pixeltable/func/udf.py +16 -18
  68. pixeltable/functions/__init__.py +1 -0
  69. pixeltable/functions/anthropic.py +7 -7
  70. pixeltable/functions/audio.py +76 -0
  71. pixeltable/functions/bedrock.py +6 -6
  72. pixeltable/functions/deepseek.py +4 -4
  73. pixeltable/functions/fireworks.py +2 -2
  74. pixeltable/functions/gemini.py +6 -6
  75. pixeltable/functions/globals.py +12 -12
  76. pixeltable/functions/groq.py +4 -4
  77. pixeltable/functions/huggingface.py +1033 -6
  78. pixeltable/functions/image.py +7 -10
  79. pixeltable/functions/llama_cpp.py +7 -7
  80. pixeltable/functions/math.py +2 -3
  81. pixeltable/functions/mistralai.py +3 -3
  82. pixeltable/functions/ollama.py +9 -9
  83. pixeltable/functions/openai.py +21 -21
  84. pixeltable/functions/openrouter.py +7 -7
  85. pixeltable/functions/string.py +21 -28
  86. pixeltable/functions/timestamp.py +7 -8
  87. pixeltable/functions/together.py +4 -6
  88. pixeltable/functions/twelvelabs.py +92 -0
  89. pixeltable/functions/video.py +36 -31
  90. pixeltable/functions/vision.py +6 -6
  91. pixeltable/functions/whisper.py +7 -7
  92. pixeltable/functions/whisperx.py +16 -16
  93. pixeltable/globals.py +75 -40
  94. pixeltable/index/base.py +12 -8
  95. pixeltable/index/btree.py +19 -22
  96. pixeltable/index/embedding_index.py +30 -39
  97. pixeltable/io/datarows.py +3 -3
  98. pixeltable/io/external_store.py +13 -16
  99. pixeltable/io/fiftyone.py +5 -5
  100. pixeltable/io/globals.py +5 -5
  101. pixeltable/io/hf_datasets.py +4 -4
  102. pixeltable/io/label_studio.py +12 -12
  103. pixeltable/io/pandas.py +6 -6
  104. pixeltable/io/parquet.py +2 -2
  105. pixeltable/io/table_data_conduit.py +12 -12
  106. pixeltable/io/utils.py +2 -2
  107. pixeltable/iterators/audio.py +2 -2
  108. pixeltable/iterators/document.py +88 -57
  109. pixeltable/iterators/video.py +66 -37
  110. pixeltable/metadata/converters/convert_18.py +2 -2
  111. pixeltable/metadata/converters/convert_19.py +2 -2
  112. pixeltable/metadata/converters/convert_20.py +2 -2
  113. pixeltable/metadata/converters/convert_21.py +2 -2
  114. pixeltable/metadata/converters/convert_22.py +2 -2
  115. pixeltable/metadata/converters/convert_24.py +2 -2
  116. pixeltable/metadata/converters/convert_25.py +2 -2
  117. pixeltable/metadata/converters/convert_26.py +2 -2
  118. pixeltable/metadata/converters/convert_29.py +4 -4
  119. pixeltable/metadata/converters/convert_34.py +2 -2
  120. pixeltable/metadata/converters/convert_36.py +2 -2
  121. pixeltable/metadata/converters/convert_38.py +2 -2
  122. pixeltable/metadata/converters/convert_39.py +1 -2
  123. pixeltable/metadata/converters/util.py +11 -13
  124. pixeltable/metadata/schema.py +22 -21
  125. pixeltable/metadata/utils.py +2 -6
  126. pixeltable/mypy/mypy_plugin.py +5 -5
  127. pixeltable/plan.py +32 -34
  128. pixeltable/share/packager.py +7 -7
  129. pixeltable/share/publish.py +3 -3
  130. pixeltable/store.py +126 -41
  131. pixeltable/type_system.py +43 -46
  132. pixeltable/utils/__init__.py +1 -2
  133. pixeltable/utils/arrow.py +4 -4
  134. pixeltable/utils/av.py +74 -38
  135. pixeltable/utils/azure_store.py +305 -0
  136. pixeltable/utils/code.py +1 -2
  137. pixeltable/utils/dbms.py +15 -19
  138. pixeltable/utils/description_helper.py +2 -3
  139. pixeltable/utils/documents.py +5 -6
  140. pixeltable/utils/exception_handler.py +2 -2
  141. pixeltable/utils/filecache.py +5 -5
  142. pixeltable/utils/formatter.py +4 -6
  143. pixeltable/utils/gcs_store.py +9 -9
  144. pixeltable/utils/local_store.py +17 -17
  145. pixeltable/utils/object_stores.py +59 -43
  146. pixeltable/utils/s3_store.py +35 -30
  147. {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/METADATA +4 -4
  148. pixeltable-0.4.19.dist-info/RECORD +213 -0
  149. pixeltable/__version__.py +0 -3
  150. pixeltable-0.4.17.dist-info/RECORD +0 -211
  151. {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/WHEEL +0 -0
  152. {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/entry_points.txt +0 -0
  153. {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/licenses/LICENSE +0 -0
pixeltable/io/table_data_conduit.py CHANGED
@@ -8,7 +8,7 @@ import urllib.parse
  import urllib.request
  from dataclasses import dataclass, field, fields
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, cast
+ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, cast

  import numpy as np
  import pandas as pd
@@ -50,15 +50,15 @@ class TableDataConduitFormat(str, enum.Enum):
  @dataclass
  class TableDataConduit:
  source: 'TableDataSource'
- source_format: Optional[str] = None
- source_column_map: Optional[dict[str, str]] = None
+ source_format: str | None = None
+ source_column_map: dict[str, str] | None = None
  if_row_exists: Literal['update', 'ignore', 'error'] = 'error'
- pxt_schema: Optional[dict[str, ts.ColumnType]] = None
- src_schema_overrides: Optional[dict[str, ts.ColumnType]] = None
- src_schema: Optional[dict[str, ts.ColumnType]] = None
- pxt_pk: Optional[list[str]] = None
- src_pk: Optional[list[str]] = None
- valid_rows: Optional[RowData] = None
+ pxt_schema: dict[str, ts.ColumnType] | None = None
+ src_schema_overrides: dict[str, ts.ColumnType] | None = None
+ src_schema: dict[str, ts.ColumnType] | None = None
+ pxt_pk: list[str] | None = None
+ src_pk: list[str] | None = None
+ valid_rows: RowData | None = None
  extra_fields: dict[str, Any] = field(default_factory=dict)

  reqd_col_names: set[str] = field(default_factory=set)
@@ -151,7 +151,7 @@ class DFTableDataConduit(TableDataConduit):


  class RowDataTableDataConduit(TableDataConduit):
- raw_rows: Optional[RowData] = None
+ raw_rows: RowData | None = None
  disable_mapping: bool = True
  batch_count: int = 0

@@ -332,7 +332,7 @@ class HFTableDataConduit(TableDataConduit):
  - use set_format('arrow') and convert ChunkedArrays to PIL.Image.Image instead of going through numpy, which is slow
  """

- column_name_for_split: Optional[str] = None
+ column_name_for_split: str | None = None
  categorical_features: dict[str, dict[int, str]]
  dataset_dict: dict[str, datasets.Dataset] = None
  hf_schema_source: dict[str, Any] = None
@@ -478,7 +478,7 @@ class HFTableDataConduit(TableDataConduit):


  class ParquetTableDataConduit(TableDataConduit):
- pq_ds: Optional[ParquetDataset] = None
+ pq_ds: ParquetDataset | None = None

  @classmethod
  def from_tds(cls, tds: TableDataConduit) -> 'ParquetTableDataConduit':
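The changes above are typical of this release: `Optional[X]` annotations are rewritten to the PEP 604 union form `X | None` across nearly every module. A minimal sketch of the equivalence (function names are illustrative, not from pixeltable):

    from __future__ import annotations  # allows the X | None spelling in annotations on Python < 3.10

    from typing import Optional

    def old_style(limit: Optional[int] = None) -> Optional[str]:
        # pre-change annotation style
        return None if limit is None else str(limit)

    def new_style(limit: int | None = None) -> str | None:
        # equivalent PEP 604 form used throughout this release
        return None if limit is None else str(limit)

Both spellings produce the same runtime behavior; only the annotation syntax changes.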
pixeltable/io/utils.py CHANGED
@@ -1,5 +1,5 @@
  from keyword import iskeyword as is_python_keyword
- from typing import Any, Optional
+ from typing import Any

  import pixeltable as pxt
  import pixeltable.exceptions as excs
@@ -40,7 +40,7 @@ def normalize_schema_names(
  primary_key: list[str],
  schema_overrides: dict[str, Any],
  require_valid_pxt_column_names: bool = False,
- ) -> tuple[dict[str, Any], list[str], Optional[dict[str, str]]]:
+ ) -> tuple[dict[str, Any], list[str], dict[str, str] | None]:
  """
  Convert all names in the input schema from source names to valid Pixeltable identifiers
  - Ensure that all names are unique.
pixeltable/iterators/audio.py CHANGED
@@ -1,7 +1,7 @@
  import logging
  from fractions import Fraction
  from pathlib import Path
- from typing import Any, ClassVar, Optional
+ from typing import Any, ClassVar

  import av

@@ -37,7 +37,7 @@ class AudioSplitter(ComponentIterator):

  # List of chunks to extract
  # Each chunk is defined by start and end presentation timestamps in audio file (int)
- chunks_to_extract_in_pts: Optional[list[tuple[int, int]]]
+ chunks_to_extract_in_pts: list[tuple[int, int]] | None
  # next chunk to extract
  next_pos: int

pixeltable/iterators/document.py CHANGED
@@ -2,7 +2,7 @@ import dataclasses
  import enum
  import io
  import logging
- from typing import Any, ClassVar, Iterable, Iterator, Optional
+ from typing import Any, ClassVar, Iterable, Iterator, Literal

  import fitz # type: ignore[import-untyped]
  import ftfy
@@ -11,7 +11,7 @@ from bs4.element import NavigableString, Tag

  from pixeltable.env import Env
  from pixeltable.exceptions import Error
- from pixeltable.type_system import BoolType, ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
+ from pixeltable.type_system import ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
  from pixeltable.utils.documents import get_document_handle

  from .base import ComponentIterator
@@ -19,6 +19,11 @@ from .base import ComponentIterator
  _logger = logging.getLogger('pixeltable')


+ class Element(enum.Enum):
+ TEXT = 1
+ IMAGE = 2
+
+
  class ChunkMetadata(enum.Enum):
  TITLE = 1
  HEADING = 2
@@ -41,28 +46,28 @@ class DocumentSectionMetadata:
  """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""

  # html and markdown metadata
- sourceline: Optional[int] = None
+ sourceline: int | None = None
  # the stack of headings up to the most recently observed one;
  # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
- heading: Optional[dict[str, str]] = None
+ heading: dict[str, str] | None = None

  # pdf-specific metadata
- page: Optional[int] = None
+ page: int | None = None
  # bounding box as an {x1, y1, x2, y2} dictionary
- bounding_box: Optional[dict[str, float]] = None
+ bounding_box: dict[str, float] | None = None


  @dataclasses.dataclass
  class DocumentSection:
  """A single document chunk, according to some of the splitting criteria"""

- text: Optional[str]
- metadata: Optional[DocumentSectionMetadata]
- image: Optional[PIL.Image.Image] = None
+ text: str | None = None
+ image: PIL.Image.Image | None = None
+ metadata: DocumentSectionMetadata | None = None


  def _parse_separators(separators: str) -> list[Separator]:
- ret = []
+ ret: list[Separator] = []
  for s in separators.split(','):
  clean_s = s.strip().upper()
  if not clean_s:
@@ -76,7 +81,7 @@ def _parse_separators(separators: str) -> list[Separator]:


  def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
- ret = []
+ ret: list[ChunkMetadata] = []
  for m in metadata.split(','):
  clean_m = m.strip().upper()
  if not clean_m:
@@ -89,6 +94,18 @@ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
  return ret


+ def _parse_elements(elements: list[Literal['text', 'image']]) -> list[Element]:
+ result: list[Element] = []
+ for e in elements:
+ clean_e = e.strip().upper()
+ if clean_e not in Element.__members__:
+ raise Error(f'Invalid element: `{e}`. Valid elements are: {", ".join(Element.__members__).lower()}')
+ result.append(Element[clean_e])
+ if len(result) == 0:
+ raise Error('elements cannot be empty')
+ return result
+
+
  _HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}


@@ -106,11 +123,16 @@ class DocumentSplitter(ComponentIterator):
  separators: separators to use to chunk the document. Options are:
  `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
  This may be a comma-separated string, e.g., `'heading,token_limit'`.
+ elements: list of elements to extract from the document. Options are:
+ `'text'`, `'image'`. Defaults to `['text']` if not specified. The `'image'` element is only supported
+ for the `'page'` separator on PDF documents.
  limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
  or `'char_limit'` is specified.
  metadata: additional metadata fields to include in the output. Options are:
  `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
  (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
+ image_dpi: DPI to use when extracting images from PDFs. Defaults to 300.
+ image_format: format to use when extracting images from PDFs. Defaults to 'png'.
  """

  METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
@@ -121,34 +143,41 @@ class DocumentSplitter(ComponentIterator):
  ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
  }

+ _doc_handle: Any
+ _separators: list[Separator]
+ _elements: list[Element]
+ _metadata_fields: list[ChunkMetadata]
+ _doc_title: str
+ _limit: int
+ _skip_tags: list[str]
+ _overlap: int
+ _tiktoken_encoding: str | None
+ _tiktoken_target_model: str | None
+ _image_dpi: int
+ _image_format: str
+
+ _sections: Iterator[DocumentSection]
+
  def __init__(
  self,
  document: str,
  *,
  separators: str,
- limit: Optional[int] = None,
- overlap: Optional[int] = None,
+ elements: list[Literal['text', 'image']] | None = None,
+ limit: int | None = None,
+ overlap: int | None = None,
  metadata: str = '',
- html_skip_tags: Optional[list[str]] = None,
- tiktoken_encoding: Optional[str] = 'cl100k_base',
- tiktoken_target_model: Optional[str] = None,
- # (PDF-processing-only)
- include_page_image: bool = False,
- page_image_dpi: int = 300,
- page_image_format: str = 'png',
+ html_skip_tags: list[str] | None = None,
+ tiktoken_encoding: str | None = 'cl100k_base',
+ tiktoken_target_model: str | None = None,
+ image_dpi: int = 300,
+ image_format: str = 'png',
  ):
  if html_skip_tags is None:
  html_skip_tags = ['nav']
  self._doc_handle = get_document_handle(document)
+ self._elements = _parse_elements(elements.copy()) if elements is not None else [Element.TEXT]
  assert self._doc_handle is not None
- # calling the output_schema method to validate the input arguments
- self.output_schema(
- separators=separators,
- metadata=metadata,
- limit=limit,
- overlap=overlap,
- include_page_image=include_page_image,
- )
  self._separators = _parse_separators(separators)
  self._metadata_fields = _parse_metadata(metadata)
  if self._doc_handle.bs_doc is not None:
@@ -164,10 +193,8 @@ class DocumentSplitter(ComponentIterator):
  self._overlap = 0 if overlap is None else overlap
  self._tiktoken_encoding = tiktoken_encoding
  self._tiktoken_target_model = tiktoken_target_model
-
- self._include_page_image = include_page_image
- self._page_image_dpi = page_image_dpi
- self._page_image_format = page_image_format
+ self._image_dpi = image_dpi
+ self._image_format = image_format

  # set up processing pipeline
  if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
@@ -197,23 +224,28 @@ class DocumentSplitter(ComponentIterator):
  return {
  'document': DocumentType(nullable=False),
  'separators': StringType(nullable=False),
+ 'elements': JsonType(nullable=False),
  'metadata': StringType(nullable=False),
  'limit': IntType(nullable=True),
  'overlap': IntType(nullable=True),
  'skip_tags': StringType(nullable=True),
  'tiktoken_encoding': StringType(nullable=True),
  'tiktoken_target_model': StringType(nullable=True),
- # PDF options must be declared so validation accepts them:
- 'include_page_image': BoolType(nullable=True),
- 'page_image_dpi': IntType(nullable=True),
- 'page_image_format': StringType(nullable=True),
+ 'image_dpi': IntType(nullable=True),
+ 'image_format': StringType(nullable=True),
  }

  @classmethod
  def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
- schema: dict[str, ColumnType] = {'text': StringType()}
- md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
-
+ schema: dict[str, ColumnType] = {}
+ elements = _parse_elements(kwargs.get('elements', ['text']))
+ for element in elements:
+ if element == Element.TEXT:
+ schema['text'] = StringType(nullable=False)
+ elif element == Element.IMAGE:
+ schema['image'] = ImageType(nullable=False)
+
+ md_fields = _parse_metadata(kwargs.get('metadata', ''))
  for md_field in md_fields:
  schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]

@@ -223,6 +255,8 @@ class DocumentSplitter(ComponentIterator):
  limit = kwargs.get('limit')
  overlap = kwargs.get('overlap')

+ if Element.IMAGE in elements and separators != [Separator.PAGE]:
+ raise Error('Image elements are only supported for the "page" separator on PDF documents')
  if limit is not None or overlap is not None:
  if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
  raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
@@ -236,23 +270,25 @@ class DocumentSplitter(ComponentIterator):
  if kwargs.get('limit') is None:
  raise Error('limit is required with "token_limit"/"char_limit" separators')

- # check dependencies at the end
  if Separator.SENTENCE in separators:
  _ = Env.get().spacy_nlp
  if Separator.TOKEN_LIMIT in separators:
  Env.get().require_package('tiktoken')

- if kwargs.get('include_page_image'):
- schema['image'] = ImageType(nullable=True)
-
  return schema, []

  def __next__(self) -> dict[str, Any]:
  while True:
  section = next(self._sections)
- if section.text is None:
+ if section.text is None and section.image is None:
  continue
- result: dict[str, Any] = {'text': section.text}
+ result: dict[str, Any] = {}
+ for element in self._elements:
+ if element == Element.TEXT:
+ result['text'] = section.text
+ elif element == Element.IMAGE:
+ result['image'] = section.image
+
  for md_field in self._metadata_fields:
  if md_field == ChunkMetadata.TITLE:
  result[md_field.name.lower()] = self._doc_title
@@ -265,10 +301,6 @@ class DocumentSplitter(ComponentIterator):
  elif md_field == ChunkMetadata.BOUNDING_BOX:
  result[md_field.name.lower()] = section.metadata.bounding_box

- # FIX: only include image if schema supports it
- if self._include_page_image:
- result['image'] = section.image
-
  return result

  def _html_sections(self) -> Iterator[DocumentSection]:
@@ -411,11 +443,10 @@ class DocumentSplitter(ComponentIterator):
  return txt

  for page_idx, page in enumerate(doc.pages()):
- # render once per page if requested
- page_image = None
- if self._include_page_image:
- pix = page.get_pixmap(dpi=self._page_image_dpi) # ← single render
- page_image = PIL.Image.open(io.BytesIO(pix.tobytes(self._page_image_format)))
+ img: PIL.Image.Image | None = None
+ if Element.IMAGE in self._elements:
+ pix = page.get_pixmap(dpi=self._image_dpi)
+ img = PIL.Image.open(io.BytesIO(pix.tobytes(self._image_format)))

  for block in page.get_text('blocks'):
  x1, y1, x2, y2, text, *_ = block
@@ -423,14 +454,14 @@ class DocumentSplitter(ComponentIterator):
  if accumulated_text and emit_on_paragraph:
  bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
  md = DocumentSectionMetadata(page=page_idx, bounding_box=bbox)
- yield DocumentSection(text=_emit_text(), metadata=md, image=page_image)
+ yield DocumentSection(text=_emit_text(), metadata=md)

  if accumulated_text and emit_on_page and not emit_on_paragraph:
  md = DocumentSectionMetadata(page=page_idx)
- yield DocumentSection(text=_emit_text(), metadata=md, image=page_image)
+ yield DocumentSection(text=_emit_text(), image=img, metadata=md)

  if accumulated_text and not emit_on_page:
- yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(), image=None)
+ yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())

  def _txt_sections(self) -> Iterator[DocumentSection]:
  """Create DocumentSections for text files.
pixeltable/iterators/video.py CHANGED
@@ -4,7 +4,7 @@ import math
  import subprocess
  from fractions import Fraction
  from pathlib import Path
- from typing import Any, Iterator, Literal, Optional
+ from typing import Any, Iterator, Literal

  import av
  import pandas as pd
@@ -42,9 +42,9 @@ class FrameIterator(ComponentIterator):
  [Frame](https://pyav.org/docs/develop/api/frame.html)):

  * `index` (`int`)
- * `pts` (`Optional[int]`)
- * `dts` (`Optional[int]`)
- * `time` (`Optional[float]`)
+ * `pts` (`int | None`)
+ * `dts` (`int | None`)
+ * `time` (`float | None`)
  * `is_corrupt` (`bool`)
  * `key_frame` (`bool`)
  * `pict_type` (`int`)
@@ -55,8 +55,8 @@ class FrameIterator(ComponentIterator):

  # Input parameters
  video_path: Path
- fps: Optional[float]
- num_frames: Optional[int]
+ fps: float | None
+ num_frames: int | None
  all_frame_attrs: bool

  # Video info
@@ -67,19 +67,14 @@ class FrameIterator(ComponentIterator):

  video_start_time: int

  # List of frame indices to be extracted, or None to extract all frames
- frames_to_extract: Optional[list[int]]
+ frames_to_extract: list[int] | None

  # Next frame to extract, as an iterator `pos` index. If `frames_to_extract` is None, this is the same as the
  # frame index in the video. Otherwise, the corresponding video index is `frames_to_extract[next_pos]`.
  next_pos: int

  def __init__(
- self,
- video: str,
- *,
- fps: Optional[float] = None,
- num_frames: Optional[int] = None,
- all_frame_attrs: bool = False,
+ self, video: str, *, fps: float | None = None, num_frames: int | None = None, all_frame_attrs: bool = False
  ):
  if fps is not None and num_frames is not None:
  raise excs.Error('At most one of `fps` or `num_frames` may be specified')
@@ -251,7 +246,7 @@ class VideoSplitter(ComponentIterator):

  # Input parameters
  video_path: Path
- segment_duration: float
+ segment_duration: float | None
+ segment_times: list[float] | None
  overlap: float
  min_segment_duration: float
  video_encoder: str | None
@@ -268,25 +264,31 @@ class VideoSplitter(ComponentIterator):
  self,
  video: str,
  *,
- duration: float,
- overlap: float = 0.0,
- min_segment_duration: float = 0.0,
- mode: Literal['fast', 'accurate'] = 'fast',
+ duration: float | None = None,
+ overlap: float | None = None,
+ min_segment_duration: float | None = None,
+ segment_times: list[float] | None = None,
+ mode: Literal['fast', 'accurate'] = 'accurate',
  video_encoder: str | None = None,
  video_encoder_args: dict[str, Any] | None = None,
  ):
  Env.get().require_binary('ffmpeg')
- assert duration > 0.0
- assert duration >= min_segment_duration
- assert overlap < duration
+ assert (duration is not None) != (segment_times is not None)
+ if segment_times is not None:
+ assert len(segment_times) > 0
+ if duration is not None:
+ assert duration > 0.0
+ assert duration >= min_segment_duration
+ assert overlap is None or overlap < duration

  video_path = Path(video)
  assert video_path.exists() and video_path.is_file()

  self.video_path = video_path
  self.segment_duration = duration
- self.overlap = overlap
- self.min_segment_duration = min_segment_duration
+ self.overlap = overlap if overlap is not None else 0.0
+ self.min_segment_duration = min_segment_duration if min_segment_duration is not None else 0.0
+ self.segment_times = segment_times
  self.video_encoder = video_encoder
  self.video_encoder_args = video_encoder_args

@@ -304,6 +306,7 @@ class VideoSplitter(ComponentIterator):
  'duration': ts.FloatType(nullable=True),
  'overlap': ts.FloatType(nullable=True),
  'min_segment_duration': ts.FloatType(nullable=True),
+ 'segment_times': ts.JsonType(nullable=True),
  'mode': ts.StringType(nullable=False),
  'video_encoder': ts.StringType(nullable=True),
  'video_encoder_args': ts.JsonType(nullable=True),
@@ -311,23 +314,34 @@ class VideoSplitter(ComponentIterator):
  @classmethod
  def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
- param_names = ['duration', 'overlap', 'min_segment_duration']
+ param_names = ['duration', 'overlap', 'min_segment_duration', 'segment_times']
  params = dict(zip(param_names, args))
  params.update(kwargs)

- segment_duration = params['duration']
- min_segment_duration = params.get('min_segment_duration', 0.0)
- overlap = params.get('overlap', 0.0)
+ segment_duration = params.get('duration')
+ segment_times = params.get('segment_times')
+ overlap = params.get('overlap')
+ min_segment_duration = params.get('min_segment_duration')
  mode = params.get('mode', 'fast')

- if segment_duration <= 0.0:
- raise excs.Error('duration must be a positive number')
- if segment_duration < min_segment_duration:
- raise excs.Error('duration must be at least min_segment_duration')
- if mode == 'accurate' and overlap > 0:
+ if segment_duration is None and segment_times is None:
+ raise excs.Error('Must specify either duration or segment_times')
+ if segment_duration is not None and segment_times is not None:
+ raise excs.Error('duration and segment_times cannot both be specified')
+ if segment_times is not None:
+ if len(segment_times) == 0:
+ raise excs.Error('segment_times cannot be empty')
+ if overlap is not None:
+ raise excs.Error('overlap cannot be specified with segment_times')
+ if segment_duration is not None:
+ if segment_duration <= 0.0:
+ raise excs.Error('duration must be a positive number')
+ if min_segment_duration is not None and segment_duration < min_segment_duration:
+ raise excs.Error('duration must be at least min_segment_duration')
+ if overlap is not None and overlap >= segment_duration:
+ raise excs.Error('overlap must be less than duration')
+ if mode == 'accurate' and overlap is not None:
  raise excs.Error("Cannot specify overlap for mode='accurate'")
- if overlap >= segment_duration:
- raise excs.Error('overlap must be less than duration')
  if mode == 'fast':
  if params.get('video_encoder') is not None:
  raise excs.Error("Cannot specify video_encoder for mode='fast'")
@@ -343,13 +357,22 @@ class VideoSplitter(ComponentIterator):
  }, []

  def fast_iter(self) -> Iterator[dict[str, Any]]:
- segment_path: str
+ segment_path: str = ''
  try:
  start_time = 0.0
  start_pts = 0
+ segment_idx = 0
  while True:
+ target_duration: float | None
+ if self.segment_duration is not None:
+ target_duration = self.segment_duration
+ elif self.segment_times is not None and segment_idx < len(self.segment_times):
+ target_duration = self.segment_times[segment_idx] - start_time
+ else:
+ target_duration = None # the rest of the video
+
  segment_path = str(TempStore.create_path(extension='.mp4'))
- cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time, self.segment_duration)
+ cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time, target_duration)
  _ = subprocess.run(cmd, capture_output=True, text=True, check=True)

  # use the actual duration
@@ -373,8 +396,13 @@ class VideoSplitter(ComponentIterator):
  start_time = segment_end - self.overlap
  start_pts = segment_end_pts - round(self.overlap / self.video_time_base)

+ segment_idx += 1
+ if self.segment_times is not None and segment_idx > len(self.segment_times):
+ # We've created all segments including the final segment after the last segment_time
+ break
+
  except subprocess.CalledProcessError as e:
- if Path(segment_path).exists():
+ if segment_path and Path(segment_path).exists():
  Path(segment_path).unlink()
  error_msg = f'ffmpeg failed with return code {e.returncode}'
  if e.stderr:
@@ -389,6 +417,7 @@ class VideoSplitter(ComponentIterator):
  str(self.video_path),
  output_pattern,
  segment_duration=self.segment_duration,
+ segment_times=self.segment_times,
  video_encoder=self.video_encoder,
  video_encoder_args=self.video_encoder_args,
  )
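VideoSplitter now accepts explicit cut points via `segment_times` as an alternative to a fixed `duration`; the two are mutually exclusive, `overlap` cannot be combined with `segment_times`, and a trailing segment covers the remainder of the video. A hedged sketch, assuming VideoSplitter is exposed from `pixeltable.iterators` alongside FrameIterator; table and view names are illustrative:

    import pixeltable as pxt
    from pixeltable.iterators import VideoSplitter

    videos = pxt.create_table('videos', {'video': pxt.Video})

    # Cut at 30s, 60s, and 90s; the final segment runs from 90s to the end of the video.
    segments = pxt.create_view(
        'video_segments',
        videos,
        iterator=VideoSplitter.create(
            video=videos.video,
            segment_times=[30.0, 60.0, 90.0],
        ),
    )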
pixeltable/metadata/converters/convert_18.py CHANGED
@@ -1,4 +1,4 @@
- from typing import Any, Optional
+ from typing import Any

  import sqlalchemy as sql

@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
  convert_table_md(engine, substitution_fn=__substitute_md)


- def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+ def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
  # Migrate a few changed function names
  if k == 'path' and v == 'pixeltable.functions.string.str_format':
  return 'path', 'pixeltable.functions.string.format'
pixeltable/metadata/converters/convert_19.py CHANGED
@@ -1,5 +1,5 @@
  import datetime
- from typing import Any, Optional
+ from typing import Any

  import sqlalchemy as sql

@@ -28,7 +28,7 @@ def _(engine: sql.engine.Engine) -> None:
  conn.execute(sql.text(f'ALTER TABLE {store_name} ALTER COLUMN col_{col_id} TYPE TIMESTAMPTZ'))


- def __update_timestamp_literals(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
+ def __update_timestamp_literals(k: Any, v: Any) -> tuple[Any, Any] | None:
  if isinstance(v, dict) and 'val_t' in v:
  # It's a literal with an explicit 'val_t' field. In version 19 this can only mean a
  # timestamp literal, which (in version 19) is stored in the DB as a naive datetime.
pixeltable/metadata/converters/convert_20.py CHANGED
@@ -1,4 +1,4 @@
- from typing import Any, Optional
+ from typing import Any

  import sqlalchemy as sql

@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
  convert_table_md(engine, substitution_fn=__substitute_md)


- def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+ def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
  if isinstance(v, dict) and '_classname' in v:
  # The way InlineArray is represented changed in v20. Previously, literal values were stored
  # directly in the Inline expr; now we store them in Literal sub-exprs. This converter
pixeltable/metadata/converters/convert_21.py CHANGED
@@ -1,4 +1,4 @@
- from typing import Any, Optional
+ from typing import Any

  import sqlalchemy as sql

@@ -24,7 +24,7 @@ def __update_schema_column(schema_column: dict) -> None:
  schema_column['media_validation'] = None


- def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+ def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
  if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ColumnRef':
  if 'perform_validation' not in v:
  v['perform_validation'] = False