pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (202)
  1. pixeltable/__init__.py +23 -5
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/__init__.py +5 -3
  4. pixeltable/catalog/catalog.py +1318 -404
  5. pixeltable/catalog/column.py +186 -115
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +11 -43
  8. pixeltable/catalog/insertable_table.py +167 -79
  9. pixeltable/catalog/path.py +61 -23
  10. pixeltable/catalog/schema_object.py +9 -10
  11. pixeltable/catalog/table.py +626 -308
  12. pixeltable/catalog/table_metadata.py +101 -0
  13. pixeltable/catalog/table_version.py +713 -569
  14. pixeltable/catalog/table_version_handle.py +37 -6
  15. pixeltable/catalog/table_version_path.py +42 -29
  16. pixeltable/catalog/tbl_ops.py +50 -0
  17. pixeltable/catalog/update_status.py +191 -0
  18. pixeltable/catalog/view.py +108 -94
  19. pixeltable/config.py +128 -22
  20. pixeltable/dataframe.py +188 -100
  21. pixeltable/env.py +407 -136
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +3 -0
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +231 -0
  27. pixeltable/exec/cell_reconstruction_node.py +135 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +7 -6
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +190 -30
  37. pixeltable/exec/globals.py +32 -0
  38. pixeltable/exec/in_memory_data_node.py +18 -18
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +206 -101
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +34 -30
  46. pixeltable/exprs/column_ref.py +92 -96
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +152 -55
  50. pixeltable/exprs/expr.py +62 -43
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +75 -37
  54. pixeltable/exprs/globals.py +1 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +10 -27
  57. pixeltable/exprs/is_null.py +1 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +5 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +127 -53
  64. pixeltable/exprs/rowid_ref.py +8 -12
  65. pixeltable/exprs/similarity_expr.py +50 -25
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +10 -10
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +20 -18
  78. pixeltable/func/signature.py +43 -16
  79. pixeltable/func/tools.py +23 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +6 -0
  82. pixeltable/functions/anthropic.py +93 -33
  83. pixeltable/functions/audio.py +114 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +1 -1
  86. pixeltable/functions/deepseek.py +20 -9
  87. pixeltable/functions/fireworks.py +2 -2
  88. pixeltable/functions/gemini.py +28 -11
  89. pixeltable/functions/globals.py +13 -13
  90. pixeltable/functions/groq.py +108 -0
  91. pixeltable/functions/huggingface.py +1046 -23
  92. pixeltable/functions/image.py +9 -18
  93. pixeltable/functions/llama_cpp.py +23 -8
  94. pixeltable/functions/math.py +3 -4
  95. pixeltable/functions/mistralai.py +4 -15
  96. pixeltable/functions/ollama.py +16 -9
  97. pixeltable/functions/openai.py +104 -82
  98. pixeltable/functions/openrouter.py +143 -0
  99. pixeltable/functions/replicate.py +2 -2
  100. pixeltable/functions/reve.py +250 -0
  101. pixeltable/functions/string.py +21 -28
  102. pixeltable/functions/timestamp.py +13 -14
  103. pixeltable/functions/together.py +4 -6
  104. pixeltable/functions/twelvelabs.py +92 -0
  105. pixeltable/functions/util.py +6 -1
  106. pixeltable/functions/video.py +1388 -106
  107. pixeltable/functions/vision.py +7 -7
  108. pixeltable/functions/whisper.py +15 -7
  109. pixeltable/functions/whisperx.py +179 -0
  110. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  111. pixeltable/globals.py +332 -105
  112. pixeltable/index/base.py +13 -22
  113. pixeltable/index/btree.py +23 -22
  114. pixeltable/index/embedding_index.py +32 -44
  115. pixeltable/io/__init__.py +4 -2
  116. pixeltable/io/datarows.py +7 -6
  117. pixeltable/io/external_store.py +49 -77
  118. pixeltable/io/fiftyone.py +11 -11
  119. pixeltable/io/globals.py +29 -28
  120. pixeltable/io/hf_datasets.py +17 -9
  121. pixeltable/io/label_studio.py +70 -66
  122. pixeltable/io/lancedb.py +3 -0
  123. pixeltable/io/pandas.py +12 -11
  124. pixeltable/io/parquet.py +13 -93
  125. pixeltable/io/table_data_conduit.py +71 -47
  126. pixeltable/io/utils.py +3 -3
  127. pixeltable/iterators/__init__.py +2 -1
  128. pixeltable/iterators/audio.py +21 -11
  129. pixeltable/iterators/document.py +116 -55
  130. pixeltable/iterators/image.py +5 -2
  131. pixeltable/iterators/video.py +293 -13
  132. pixeltable/metadata/__init__.py +4 -2
  133. pixeltable/metadata/converters/convert_18.py +2 -2
  134. pixeltable/metadata/converters/convert_19.py +2 -2
  135. pixeltable/metadata/converters/convert_20.py +2 -2
  136. pixeltable/metadata/converters/convert_21.py +2 -2
  137. pixeltable/metadata/converters/convert_22.py +2 -2
  138. pixeltable/metadata/converters/convert_24.py +2 -2
  139. pixeltable/metadata/converters/convert_25.py +2 -2
  140. pixeltable/metadata/converters/convert_26.py +2 -2
  141. pixeltable/metadata/converters/convert_29.py +4 -4
  142. pixeltable/metadata/converters/convert_34.py +2 -2
  143. pixeltable/metadata/converters/convert_36.py +2 -2
  144. pixeltable/metadata/converters/convert_37.py +15 -0
  145. pixeltable/metadata/converters/convert_38.py +39 -0
  146. pixeltable/metadata/converters/convert_39.py +124 -0
  147. pixeltable/metadata/converters/convert_40.py +73 -0
  148. pixeltable/metadata/converters/util.py +13 -12
  149. pixeltable/metadata/notes.py +4 -0
  150. pixeltable/metadata/schema.py +79 -42
  151. pixeltable/metadata/utils.py +74 -0
  152. pixeltable/mypy/__init__.py +3 -0
  153. pixeltable/mypy/mypy_plugin.py +123 -0
  154. pixeltable/plan.py +274 -223
  155. pixeltable/share/__init__.py +1 -1
  156. pixeltable/share/packager.py +259 -129
  157. pixeltable/share/protocol/__init__.py +34 -0
  158. pixeltable/share/protocol/common.py +170 -0
  159. pixeltable/share/protocol/operation_types.py +33 -0
  160. pixeltable/share/protocol/replica.py +109 -0
  161. pixeltable/share/publish.py +213 -57
  162. pixeltable/store.py +238 -175
  163. pixeltable/type_system.py +104 -63
  164. pixeltable/utils/__init__.py +2 -3
  165. pixeltable/utils/arrow.py +108 -13
  166. pixeltable/utils/av.py +298 -0
  167. pixeltable/utils/azure_store.py +305 -0
  168. pixeltable/utils/code.py +3 -3
  169. pixeltable/utils/console_output.py +4 -1
  170. pixeltable/utils/coroutine.py +6 -23
  171. pixeltable/utils/dbms.py +31 -5
  172. pixeltable/utils/description_helper.py +4 -5
  173. pixeltable/utils/documents.py +5 -6
  174. pixeltable/utils/exception_handler.py +7 -30
  175. pixeltable/utils/filecache.py +6 -6
  176. pixeltable/utils/formatter.py +4 -6
  177. pixeltable/utils/gcs_store.py +283 -0
  178. pixeltable/utils/http_server.py +2 -3
  179. pixeltable/utils/iceberg.py +1 -2
  180. pixeltable/utils/image.py +17 -0
  181. pixeltable/utils/lancedb.py +88 -0
  182. pixeltable/utils/local_store.py +316 -0
  183. pixeltable/utils/misc.py +5 -0
  184. pixeltable/utils/object_stores.py +528 -0
  185. pixeltable/utils/pydantic.py +60 -0
  186. pixeltable/utils/pytorch.py +5 -6
  187. pixeltable/utils/s3_store.py +392 -0
  188. pixeltable-0.4.20.dist-info/METADATA +587 -0
  189. pixeltable-0.4.20.dist-info/RECORD +218 -0
  190. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
  191. pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
  192. pixeltable/__version__.py +0 -3
  193. pixeltable/ext/__init__.py +0 -17
  194. pixeltable/ext/functions/__init__.py +0 -11
  195. pixeltable/ext/functions/whisperx.py +0 -77
  196. pixeltable/utils/media_store.py +0 -77
  197. pixeltable/utils/s3.py +0 -17
  198. pixeltable/utils/sample.py +0 -25
  199. pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
  200. pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
  201. pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
  202. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
@@ -1,13 +1,17 @@
1
1
  import dataclasses
2
2
  import enum
3
+ import io
3
4
  import logging
4
- from typing import Any, ClassVar, Iterable, Iterator, Optional, Union
5
+ from typing import Any, ClassVar, Iterable, Iterator, Literal
5
6
 
7
+ import fitz # type: ignore[import-untyped]
6
8
  import ftfy
9
+ import PIL.Image
10
+ from bs4.element import NavigableString, Tag
7
11
 
8
12
  from pixeltable.env import Env
9
13
  from pixeltable.exceptions import Error
10
- from pixeltable.type_system import ColumnType, DocumentType, IntType, JsonType, StringType
14
+ from pixeltable.type_system import ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
11
15
  from pixeltable.utils.documents import get_document_handle
12
16
 
13
17
  from .base import ComponentIterator
@@ -15,6 +19,11 @@ from .base import ComponentIterator
15
19
  _logger = logging.getLogger('pixeltable')
16
20
 
17
21
 
22
+ class Element(enum.Enum):
23
+ TEXT = 1
24
+ IMAGE = 2
25
+
26
+
18
27
  class ChunkMetadata(enum.Enum):
19
28
  TITLE = 1
20
29
  HEADING = 2
@@ -37,27 +46,28 @@ class DocumentSectionMetadata:
37
46
  """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
38
47
 
39
48
  # html and markdown metadata
40
- sourceline: Optional[int] = None
49
+ sourceline: int | None = None
41
50
  # the stack of headings up to the most recently observed one;
42
51
  # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
43
- heading: Optional[dict[str, str]] = None
52
+ heading: dict[str, str] | None = None
44
53
 
45
54
  # pdf-specific metadata
46
- page: Optional[int] = None
55
+ page: int | None = None
47
56
  # bounding box as an {x1, y1, x2, y2} dictionary
48
- bounding_box: Optional[dict[str, float]] = None
57
+ bounding_box: dict[str, float] | None = None
49
58
 
50
59
 
51
60
  @dataclasses.dataclass
52
61
  class DocumentSection:
53
62
  """A single document chunk, according to some of the splitting criteria"""
54
63
 
55
- text: Optional[str]
56
- metadata: Optional[DocumentSectionMetadata]
64
+ text: str | None = None
65
+ image: PIL.Image.Image | None = None
66
+ metadata: DocumentSectionMetadata | None = None
57
67
 
58
68
 
59
69
  def _parse_separators(separators: str) -> list[Separator]:
60
- ret = []
70
+ ret: list[Separator] = []
61
71
  for s in separators.split(','):
62
72
  clean_s = s.strip().upper()
63
73
  if not clean_s:
@@ -71,7 +81,7 @@ def _parse_separators(separators: str) -> list[Separator]:
71
81
 
72
82
 
73
83
  def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
74
- ret = []
84
+ ret: list[ChunkMetadata] = []
75
85
  for m in metadata.split(','):
76
86
  clean_m = m.strip().upper()
77
87
  if not clean_m:
@@ -84,6 +94,18 @@ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
84
94
  return ret
85
95
 
86
96
 
97
+ def _parse_elements(elements: list[Literal['text', 'image']]) -> list[Element]:
98
+ result: list[Element] = []
99
+ for e in elements:
100
+ clean_e = e.strip().upper()
101
+ if clean_e not in Element.__members__:
102
+ raise Error(f'Invalid element: `{e}`. Valid elements are: {", ".join(Element.__members__).lower()}')
103
+ result.append(Element[clean_e])
104
+ if len(result) == 0:
105
+ raise Error('elements cannot be empty')
106
+ return result
107
+
108
+
87
109
  _HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
88
110
 
89
111
 
@@ -94,6 +116,23 @@ class DocumentSplitter(ComponentIterator):
94
116
  include additional metadata fields if specified in the `metadata` parameter, as explained below.
95
117
 
96
118
  Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
119
+
120
+ How to init the `DocumentSplitter` class?
121
+
122
+ Args:
123
+ separators: separators to use to chunk the document. Options are:
124
+ `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
125
+ This may be a comma-separated string, e.g., `'heading,token_limit'`.
126
+ elements: list of elements to extract from the document. Options are:
127
+ `'text'`, `'image'`. Defaults to `['text']` if not specified. The `'image'` element is only supported
128
+ for the `'page'` separator on PDF documents.
129
+ limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
130
+ or `'char_limit'` is specified.
131
+ metadata: additional metadata fields to include in the output. Options are:
132
+ `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
133
+ (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
134
+ image_dpi: DPI to use when extracting images from PDFs. Defaults to 300.
135
+ image_format: format to use when extracting images from PDFs. Defaults to 'png'.
97
136
  """
98
137
 
99
138
  METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
@@ -104,36 +143,41 @@ class DocumentSplitter(ComponentIterator):
104
143
  ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
105
144
  }
106
145
 
146
+ _doc_handle: Any
147
+ _separators: list[Separator]
148
+ _elements: list[Element]
149
+ _metadata_fields: list[ChunkMetadata]
150
+ _doc_title: str
151
+ _limit: int
152
+ _skip_tags: list[str]
153
+ _overlap: int
154
+ _tiktoken_encoding: str | None
155
+ _tiktoken_target_model: str | None
156
+ _image_dpi: int
157
+ _image_format: str
158
+
159
+ _sections: Iterator[DocumentSection]
160
+
107
161
  def __init__(
108
162
  self,
109
163
  document: str,
110
164
  *,
111
165
  separators: str,
112
- limit: Optional[int] = None,
113
- overlap: Optional[int] = None,
166
+ elements: list[Literal['text', 'image']] | None = None,
167
+ limit: int | None = None,
168
+ overlap: int | None = None,
114
169
  metadata: str = '',
115
- html_skip_tags: Optional[list[str]] = None,
116
- tiktoken_encoding: Optional[str] = 'cl100k_base',
117
- tiktoken_target_model: Optional[str] = None,
170
+ html_skip_tags: list[str] | None = None,
171
+ tiktoken_encoding: str | None = 'cl100k_base',
172
+ tiktoken_target_model: str | None = None,
173
+ image_dpi: int = 300,
174
+ image_format: str = 'png',
118
175
  ):
119
- """Init method for `DocumentSplitter` class.
120
-
121
- Args:
122
- separators: separators to use to chunk the document. Options are:
123
- `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
124
- This may be a comma-separated string, e.g., `'heading,token_limit'`.
125
- limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
126
- or `'char_limit'` is specified.
127
- metadata: additional metadata fields to include in the output. Options are:
128
- `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
129
- (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
130
- """
131
176
  if html_skip_tags is None:
132
177
  html_skip_tags = ['nav']
133
178
  self._doc_handle = get_document_handle(document)
179
+ self._elements = _parse_elements(elements.copy()) if elements is not None else [Element.TEXT]
134
180
  assert self._doc_handle is not None
135
- # calling the output_schema method to validate the input arguments
136
- self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
137
181
  self._separators = _parse_separators(separators)
138
182
  self._metadata_fields = _parse_metadata(metadata)
139
183
  if self._doc_handle.bs_doc is not None:
@@ -149,6 +193,8 @@ class DocumentSplitter(ComponentIterator):
149
193
  self._overlap = 0 if overlap is None else overlap
150
194
  self._tiktoken_encoding = tiktoken_encoding
151
195
  self._tiktoken_target_model = tiktoken_target_model
196
+ self._image_dpi = image_dpi
197
+ self._image_format = image_format
152
198
 
153
199
  # set up processing pipeline
154
200
  if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
@@ -178,19 +224,28 @@ class DocumentSplitter(ComponentIterator):
178
224
  return {
179
225
  'document': DocumentType(nullable=False),
180
226
  'separators': StringType(nullable=False),
227
+ 'elements': JsonType(nullable=False),
181
228
  'metadata': StringType(nullable=False),
182
229
  'limit': IntType(nullable=True),
183
230
  'overlap': IntType(nullable=True),
184
231
  'skip_tags': StringType(nullable=True),
185
232
  'tiktoken_encoding': StringType(nullable=True),
186
233
  'tiktoken_target_model': StringType(nullable=True),
234
+ 'image_dpi': IntType(nullable=True),
235
+ 'image_format': StringType(nullable=True),
187
236
  }
188
237
 
189
238
  @classmethod
190
239
  def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
191
- schema: dict[str, ColumnType] = {'text': StringType()}
192
- md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
193
-
240
+ schema: dict[str, ColumnType] = {}
241
+ elements = _parse_elements(kwargs.get('elements', ['text']))
242
+ for element in elements:
243
+ if element == Element.TEXT:
244
+ schema['text'] = StringType(nullable=False)
245
+ elif element == Element.IMAGE:
246
+ schema['image'] = ImageType(nullable=False)
247
+
248
+ md_fields = _parse_metadata(kwargs.get('metadata', ''))
194
249
  for md_field in md_fields:
195
250
  schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
196
251
 
@@ -200,6 +255,8 @@ class DocumentSplitter(ComponentIterator):
200
255
  limit = kwargs.get('limit')
201
256
  overlap = kwargs.get('overlap')
202
257
 
258
+ if Element.IMAGE in elements and separators != [Separator.PAGE]:
259
+ raise Error('Image elements are only supported for the "page" separator on PDF documents')
203
260
  if limit is not None or overlap is not None:
204
261
  if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
205
262
  raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
@@ -213,7 +270,6 @@ class DocumentSplitter(ComponentIterator):
213
270
  if kwargs.get('limit') is None:
214
271
  raise Error('limit is required with "token_limit"/"char_limit" separators')
215
272
 
216
- # check dependencies at the end
217
273
  if Separator.SENTENCE in separators:
218
274
  _ = Env.get().spacy_nlp
219
275
  if Separator.TOKEN_LIMIT in separators:
@@ -224,9 +280,15 @@ class DocumentSplitter(ComponentIterator):
224
280
  def __next__(self) -> dict[str, Any]:
225
281
  while True:
226
282
  section = next(self._sections)
227
- if section.text is None:
283
+ if section.text is None and section.image is None:
228
284
  continue
229
- result: dict[str, Any] = {'text': section.text}
285
+ result: dict[str, Any] = {}
286
+ for element in self._elements:
287
+ if element == Element.TEXT:
288
+ result['text'] = section.text
289
+ elif element == Element.IMAGE:
290
+ result['image'] = section.image
291
+
230
292
  for md_field in self._metadata_fields:
231
293
  if md_field == ChunkMetadata.TITLE:
232
294
  result[md_field.name.lower()] = self._doc_title
@@ -238,6 +300,7 @@ class DocumentSplitter(ComponentIterator):
238
300
  result[md_field.name.lower()] = section.metadata.page
239
301
  elif md_field == ChunkMetadata.BOUNDING_BOX:
240
302
  result[md_field.name.lower()] = section.metadata.bounding_box
303
+
241
304
  return result
242
305
 
243
306
  def _html_sections(self) -> Iterator[DocumentSection]:
@@ -273,7 +336,7 @@ class DocumentSplitter(ComponentIterator):
273
336
  yield DocumentSection(text=full_text, metadata=md)
274
337
  accumulated_text = []
275
338
 
276
- def process_element(el: Union[bs4.element.Tag, bs4.NavigableString]) -> Iterator[DocumentSection]:
339
+ def process_element(el: Tag | NavigableString) -> Iterator[DocumentSection]:
277
340
  # process the element and emit sections as necessary
278
341
  nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
279
342
 
@@ -361,43 +424,41 @@ class DocumentSplitter(ComponentIterator):
361
424
  yield from emit()
362
425
 
363
426
  def _pdf_sections(self) -> Iterator[DocumentSection]:
364
- """Create DocumentSections reflecting the pdf-specific separators"""
365
- import fitz # type: ignore[import-untyped]
366
-
367
427
  doc: fitz.Document = self._doc_handle.pdf_doc
368
428
  assert doc is not None
369
429
 
370
430
  emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
371
431
  emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph
372
432
 
373
- accumulated_text = [] # invariant: all elements are ftfy clean and non-empty
433
+ accumulated_text: list[str] = []
374
434
 
375
- def _add_cleaned_text(raw_text: str) -> None:
376
- fixed = ftfy.fix_text(raw_text)
435
+ def _add_cleaned(raw: str) -> None:
436
+ fixed = ftfy.fix_text(raw)
377
437
  if fixed:
378
438
  accumulated_text.append(fixed)
379
439
 
380
440
  def _emit_text() -> str:
381
- full_text = ''.join(accumulated_text)
441
+ txt = ''.join(accumulated_text)
382
442
  accumulated_text.clear()
383
- return full_text
443
+ return txt
444
+
445
+ for page_idx, page in enumerate(doc.pages()):
446
+ img: PIL.Image.Image | None = None
447
+ if Element.IMAGE in self._elements:
448
+ pix = page.get_pixmap(dpi=self._image_dpi)
449
+ img = PIL.Image.open(io.BytesIO(pix.tobytes(self._image_format)))
384
450
 
385
- for page_number, page in enumerate(doc.pages()):
386
451
  for block in page.get_text('blocks'):
387
- # there is no concept of paragraph in pdf, block is the closest thing
388
- # we can get (eg a paragraph in text may cut across pages)
389
- # see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
390
- # other libraries like pdfminer also lack an explicit paragraph concept
391
- x1, y1, x2, y2, text, _, _ = block
392
- _add_cleaned_text(text)
452
+ x1, y1, x2, y2, text, *_ = block
453
+ _add_cleaned(text)
393
454
  if accumulated_text and emit_on_paragraph:
394
455
  bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
395
- metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
396
- yield DocumentSection(text=_emit_text(), metadata=metadata)
456
+ md = DocumentSectionMetadata(page=page_idx, bounding_box=bbox)
457
+ yield DocumentSection(text=_emit_text(), metadata=md)
397
458
 
398
459
  if accumulated_text and emit_on_page and not emit_on_paragraph:
399
- yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(page=page_number))
400
- accumulated_text = []
460
+ md = DocumentSectionMetadata(page=page_idx)
461
+ yield DocumentSection(text=_emit_text(), image=img, metadata=md)
401
462
 
402
463
  if accumulated_text and not emit_on_page:
403
464
  yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
@@ -31,8 +31,7 @@ class TileIterator(ComponentIterator):
31
31
  __j: int
32
32
 
33
33
  def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
34
- if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
35
- raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
34
+ assert overlap[0] < tile_size[0] and overlap[1] < tile_size[1]
36
35
 
37
36
  self.__image = image
38
37
  self.__image.load()
@@ -79,4 +78,8 @@ class TileIterator(ComponentIterator):
79
78
 
80
79
  @classmethod
81
80
  def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
81
+ tile_size = kwargs.get('tile_size')
82
+ overlap = kwargs.get('overlap', (0, 0))
83
+ if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
84
+ raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
82
85
  return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']