pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +23 -5
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -3
- pixeltable/catalog/catalog.py +1318 -404
- pixeltable/catalog/column.py +186 -115
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +11 -43
- pixeltable/catalog/insertable_table.py +167 -79
- pixeltable/catalog/path.py +61 -23
- pixeltable/catalog/schema_object.py +9 -10
- pixeltable/catalog/table.py +626 -308
- pixeltable/catalog/table_metadata.py +101 -0
- pixeltable/catalog/table_version.py +713 -569
- pixeltable/catalog/table_version_handle.py +37 -6
- pixeltable/catalog/table_version_path.py +42 -29
- pixeltable/catalog/tbl_ops.py +50 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +108 -94
- pixeltable/config.py +128 -22
- pixeltable/dataframe.py +188 -100
- pixeltable/env.py +407 -136
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +3 -0
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +7 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +190 -30
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/in_memory_data_node.py +18 -18
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +206 -101
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +34 -30
- pixeltable/exprs/column_ref.py +92 -96
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +152 -55
- pixeltable/exprs/expr.py +62 -43
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +75 -37
- pixeltable/exprs/globals.py +1 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +10 -27
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +127 -53
- pixeltable/exprs/rowid_ref.py +8 -12
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +10 -10
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +20 -18
- pixeltable/func/signature.py +43 -16
- pixeltable/func/tools.py +23 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +6 -0
- pixeltable/functions/anthropic.py +93 -33
- pixeltable/functions/audio.py +114 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +1 -1
- pixeltable/functions/deepseek.py +20 -9
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +28 -11
- pixeltable/functions/globals.py +13 -13
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1046 -23
- pixeltable/functions/image.py +9 -18
- pixeltable/functions/llama_cpp.py +23 -8
- pixeltable/functions/math.py +3 -4
- pixeltable/functions/mistralai.py +4 -15
- pixeltable/functions/ollama.py +16 -9
- pixeltable/functions/openai.py +104 -82
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +2 -2
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +13 -14
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/video.py +1388 -106
- pixeltable/functions/vision.py +7 -7
- pixeltable/functions/whisper.py +15 -7
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +332 -105
- pixeltable/index/base.py +13 -22
- pixeltable/index/btree.py +23 -22
- pixeltable/index/embedding_index.py +32 -44
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +7 -6
- pixeltable/io/external_store.py +49 -77
- pixeltable/io/fiftyone.py +11 -11
- pixeltable/io/globals.py +29 -28
- pixeltable/io/hf_datasets.py +17 -9
- pixeltable/io/label_studio.py +70 -66
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +12 -11
- pixeltable/io/parquet.py +13 -93
- pixeltable/io/table_data_conduit.py +71 -47
- pixeltable/io/utils.py +3 -3
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +21 -11
- pixeltable/iterators/document.py +116 -55
- pixeltable/iterators/image.py +5 -2
- pixeltable/iterators/video.py +293 -13
- pixeltable/metadata/__init__.py +4 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/util.py +13 -12
- pixeltable/metadata/notes.py +4 -0
- pixeltable/metadata/schema.py +79 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +274 -223
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +259 -129
- pixeltable/share/protocol/__init__.py +34 -0
- pixeltable/share/protocol/common.py +170 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +109 -0
- pixeltable/share/publish.py +213 -57
- pixeltable/store.py +238 -175
- pixeltable/type_system.py +104 -63
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +108 -13
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +88 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +528 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +392 -0
- pixeltable-0.4.20.dist-info/METADATA +587 -0
- pixeltable-0.4.20.dist-info/RECORD +218 -0
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
- pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable/utils/sample.py +0 -25
- pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
- pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
- pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
pixeltable/iterators/document.py
CHANGED
|
@@ -1,13 +1,17 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import enum
|
|
3
|
+
import io
|
|
3
4
|
import logging
|
|
4
|
-
from typing import Any, ClassVar, Iterable, Iterator,
|
|
5
|
+
from typing import Any, ClassVar, Iterable, Iterator, Literal
|
|
5
6
|
|
|
7
|
+
import fitz # type: ignore[import-untyped]
|
|
6
8
|
import ftfy
|
|
9
|
+
import PIL.Image
|
|
10
|
+
from bs4.element import NavigableString, Tag
|
|
7
11
|
|
|
8
12
|
from pixeltable.env import Env
|
|
9
13
|
from pixeltable.exceptions import Error
|
|
10
|
-
from pixeltable.type_system import ColumnType, DocumentType, IntType, JsonType, StringType
|
|
14
|
+
from pixeltable.type_system import ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
|
|
11
15
|
from pixeltable.utils.documents import get_document_handle
|
|
12
16
|
|
|
13
17
|
from .base import ComponentIterator
|
|
@@ -15,6 +19,11 @@ from .base import ComponentIterator
|
|
|
15
19
|
_logger = logging.getLogger('pixeltable')
|
|
16
20
|
|
|
17
21
|
|
|
22
|
+
class Element(enum.Enum):
|
|
23
|
+
TEXT = 1
|
|
24
|
+
IMAGE = 2
|
|
25
|
+
|
|
26
|
+
|
|
18
27
|
class ChunkMetadata(enum.Enum):
|
|
19
28
|
TITLE = 1
|
|
20
29
|
HEADING = 2
|
|
@@ -37,27 +46,28 @@ class DocumentSectionMetadata:
|
|
|
37
46
|
"""Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
|
|
38
47
|
|
|
39
48
|
# html and markdown metadata
|
|
40
|
-
sourceline:
|
|
49
|
+
sourceline: int | None = None
|
|
41
50
|
# the stack of headings up to the most recently observed one;
|
|
42
51
|
# eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
|
|
43
|
-
heading:
|
|
52
|
+
heading: dict[str, str] | None = None
|
|
44
53
|
|
|
45
54
|
# pdf-specific metadata
|
|
46
|
-
page:
|
|
55
|
+
page: int | None = None
|
|
47
56
|
# bounding box as an {x1, y1, x2, y2} dictionary
|
|
48
|
-
bounding_box:
|
|
57
|
+
bounding_box: dict[str, float] | None = None
|
|
49
58
|
|
|
50
59
|
|
|
51
60
|
@dataclasses.dataclass
|
|
52
61
|
class DocumentSection:
|
|
53
62
|
"""A single document chunk, according to some of the splitting criteria"""
|
|
54
63
|
|
|
55
|
-
text:
|
|
56
|
-
|
|
64
|
+
text: str | None = None
|
|
65
|
+
image: PIL.Image.Image | None = None
|
|
66
|
+
metadata: DocumentSectionMetadata | None = None
|
|
57
67
|
|
|
58
68
|
|
|
59
69
|
def _parse_separators(separators: str) -> list[Separator]:
|
|
60
|
-
ret = []
|
|
70
|
+
ret: list[Separator] = []
|
|
61
71
|
for s in separators.split(','):
|
|
62
72
|
clean_s = s.strip().upper()
|
|
63
73
|
if not clean_s:
|
|
@@ -71,7 +81,7 @@ def _parse_separators(separators: str) -> list[Separator]:
|
|
|
71
81
|
|
|
72
82
|
|
|
73
83
|
def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
|
|
74
|
-
ret = []
|
|
84
|
+
ret: list[ChunkMetadata] = []
|
|
75
85
|
for m in metadata.split(','):
|
|
76
86
|
clean_m = m.strip().upper()
|
|
77
87
|
if not clean_m:
|
|
@@ -84,6 +94,18 @@ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
|
|
|
84
94
|
return ret
|
|
85
95
|
|
|
86
96
|
|
|
97
|
+
def _parse_elements(elements: list[Literal['text', 'image']]) -> list[Element]:
|
|
98
|
+
result: list[Element] = []
|
|
99
|
+
for e in elements:
|
|
100
|
+
clean_e = e.strip().upper()
|
|
101
|
+
if clean_e not in Element.__members__:
|
|
102
|
+
raise Error(f'Invalid element: `{e}`. Valid elements are: {", ".join(Element.__members__).lower()}')
|
|
103
|
+
result.append(Element[clean_e])
|
|
104
|
+
if len(result) == 0:
|
|
105
|
+
raise Error('elements cannot be empty')
|
|
106
|
+
return result
|
|
107
|
+
|
|
108
|
+
|
|
87
109
|
_HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
|
|
88
110
|
|
|
89
111
|
|
|
@@ -94,6 +116,23 @@ class DocumentSplitter(ComponentIterator):
|
|
|
94
116
|
include additional metadata fields if specified in the `metadata` parameter, as explained below.
|
|
95
117
|
|
|
96
118
|
Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
|
|
119
|
+
|
|
120
|
+
How to init the `DocumentSplitter` class?
|
|
121
|
+
|
|
122
|
+
Args:
|
|
123
|
+
separators: separators to use to chunk the document. Options are:
|
|
124
|
+
`'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
|
|
125
|
+
This may be a comma-separated string, e.g., `'heading,token_limit'`.
|
|
126
|
+
elements: list of elements to extract from the document. Options are:
|
|
127
|
+
`'text'`, `'image'`. Defaults to `['text']` if not specified. The `'image'` element is only supported
|
|
128
|
+
for the `'page'` separator on PDF documents.
|
|
129
|
+
limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
|
|
130
|
+
or `'char_limit'` is specified.
|
|
131
|
+
metadata: additional metadata fields to include in the output. Options are:
|
|
132
|
+
`'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
|
|
133
|
+
(PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
|
|
134
|
+
image_dpi: DPI to use when extracting images from PDFs. Defaults to 300.
|
|
135
|
+
image_format: format to use when extracting images from PDFs. Defaults to 'png'.
|
|
97
136
|
"""
|
|
98
137
|
|
|
99
138
|
METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
|
|
@@ -104,36 +143,41 @@ class DocumentSplitter(ComponentIterator):
|
|
|
104
143
|
ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
|
|
105
144
|
}
|
|
106
145
|
|
|
146
|
+
_doc_handle: Any
|
|
147
|
+
_separators: list[Separator]
|
|
148
|
+
_elements: list[Element]
|
|
149
|
+
_metadata_fields: list[ChunkMetadata]
|
|
150
|
+
_doc_title: str
|
|
151
|
+
_limit: int
|
|
152
|
+
_skip_tags: list[str]
|
|
153
|
+
_overlap: int
|
|
154
|
+
_tiktoken_encoding: str | None
|
|
155
|
+
_tiktoken_target_model: str | None
|
|
156
|
+
_image_dpi: int
|
|
157
|
+
_image_format: str
|
|
158
|
+
|
|
159
|
+
_sections: Iterator[DocumentSection]
|
|
160
|
+
|
|
107
161
|
def __init__(
|
|
108
162
|
self,
|
|
109
163
|
document: str,
|
|
110
164
|
*,
|
|
111
165
|
separators: str,
|
|
112
|
-
|
|
113
|
-
|
|
166
|
+
elements: list[Literal['text', 'image']] | None = None,
|
|
167
|
+
limit: int | None = None,
|
|
168
|
+
overlap: int | None = None,
|
|
114
169
|
metadata: str = '',
|
|
115
|
-
html_skip_tags:
|
|
116
|
-
tiktoken_encoding:
|
|
117
|
-
tiktoken_target_model:
|
|
170
|
+
html_skip_tags: list[str] | None = None,
|
|
171
|
+
tiktoken_encoding: str | None = 'cl100k_base',
|
|
172
|
+
tiktoken_target_model: str | None = None,
|
|
173
|
+
image_dpi: int = 300,
|
|
174
|
+
image_format: str = 'png',
|
|
118
175
|
):
|
|
119
|
-
"""Init method for `DocumentSplitter` class.
|
|
120
|
-
|
|
121
|
-
Args:
|
|
122
|
-
separators: separators to use to chunk the document. Options are:
|
|
123
|
-
`'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
|
|
124
|
-
This may be a comma-separated string, e.g., `'heading,token_limit'`.
|
|
125
|
-
limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
|
|
126
|
-
or `'char_limit'` is specified.
|
|
127
|
-
metadata: additional metadata fields to include in the output. Options are:
|
|
128
|
-
`'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
|
|
129
|
-
(PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
|
|
130
|
-
"""
|
|
131
176
|
if html_skip_tags is None:
|
|
132
177
|
html_skip_tags = ['nav']
|
|
133
178
|
self._doc_handle = get_document_handle(document)
|
|
179
|
+
self._elements = _parse_elements(elements.copy()) if elements is not None else [Element.TEXT]
|
|
134
180
|
assert self._doc_handle is not None
|
|
135
|
-
# calling the output_schema method to validate the input arguments
|
|
136
|
-
self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
|
|
137
181
|
self._separators = _parse_separators(separators)
|
|
138
182
|
self._metadata_fields = _parse_metadata(metadata)
|
|
139
183
|
if self._doc_handle.bs_doc is not None:
|
|
@@ -149,6 +193,8 @@ class DocumentSplitter(ComponentIterator):
|
|
|
149
193
|
self._overlap = 0 if overlap is None else overlap
|
|
150
194
|
self._tiktoken_encoding = tiktoken_encoding
|
|
151
195
|
self._tiktoken_target_model = tiktoken_target_model
|
|
196
|
+
self._image_dpi = image_dpi
|
|
197
|
+
self._image_format = image_format
|
|
152
198
|
|
|
153
199
|
# set up processing pipeline
|
|
154
200
|
if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
|
|
@@ -178,19 +224,28 @@ class DocumentSplitter(ComponentIterator):
|
|
|
178
224
|
return {
|
|
179
225
|
'document': DocumentType(nullable=False),
|
|
180
226
|
'separators': StringType(nullable=False),
|
|
227
|
+
'elements': JsonType(nullable=False),
|
|
181
228
|
'metadata': StringType(nullable=False),
|
|
182
229
|
'limit': IntType(nullable=True),
|
|
183
230
|
'overlap': IntType(nullable=True),
|
|
184
231
|
'skip_tags': StringType(nullable=True),
|
|
185
232
|
'tiktoken_encoding': StringType(nullable=True),
|
|
186
233
|
'tiktoken_target_model': StringType(nullable=True),
|
|
234
|
+
'image_dpi': IntType(nullable=True),
|
|
235
|
+
'image_format': StringType(nullable=True),
|
|
187
236
|
}
|
|
188
237
|
|
|
189
238
|
@classmethod
|
|
190
239
|
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
|
|
191
|
-
schema: dict[str, ColumnType] = {
|
|
192
|
-
|
|
193
|
-
|
|
240
|
+
schema: dict[str, ColumnType] = {}
|
|
241
|
+
elements = _parse_elements(kwargs.get('elements', ['text']))
|
|
242
|
+
for element in elements:
|
|
243
|
+
if element == Element.TEXT:
|
|
244
|
+
schema['text'] = StringType(nullable=False)
|
|
245
|
+
elif element == Element.IMAGE:
|
|
246
|
+
schema['image'] = ImageType(nullable=False)
|
|
247
|
+
|
|
248
|
+
md_fields = _parse_metadata(kwargs.get('metadata', ''))
|
|
194
249
|
for md_field in md_fields:
|
|
195
250
|
schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
|
|
196
251
|
|
|
@@ -200,6 +255,8 @@ class DocumentSplitter(ComponentIterator):
|
|
|
200
255
|
limit = kwargs.get('limit')
|
|
201
256
|
overlap = kwargs.get('overlap')
|
|
202
257
|
|
|
258
|
+
if Element.IMAGE in elements and separators != [Separator.PAGE]:
|
|
259
|
+
raise Error('Image elements are only supported for the "page" separator on PDF documents')
|
|
203
260
|
if limit is not None or overlap is not None:
|
|
204
261
|
if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
|
|
205
262
|
raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
|
|
@@ -213,7 +270,6 @@ class DocumentSplitter(ComponentIterator):
|
|
|
213
270
|
if kwargs.get('limit') is None:
|
|
214
271
|
raise Error('limit is required with "token_limit"/"char_limit" separators')
|
|
215
272
|
|
|
216
|
-
# check dependencies at the end
|
|
217
273
|
if Separator.SENTENCE in separators:
|
|
218
274
|
_ = Env.get().spacy_nlp
|
|
219
275
|
if Separator.TOKEN_LIMIT in separators:
|
|
@@ -224,9 +280,15 @@ class DocumentSplitter(ComponentIterator):
|
|
|
224
280
|
def __next__(self) -> dict[str, Any]:
|
|
225
281
|
while True:
|
|
226
282
|
section = next(self._sections)
|
|
227
|
-
if section.text is None:
|
|
283
|
+
if section.text is None and section.image is None:
|
|
228
284
|
continue
|
|
229
|
-
result: dict[str, Any] = {
|
|
285
|
+
result: dict[str, Any] = {}
|
|
286
|
+
for element in self._elements:
|
|
287
|
+
if element == Element.TEXT:
|
|
288
|
+
result['text'] = section.text
|
|
289
|
+
elif element == Element.IMAGE:
|
|
290
|
+
result['image'] = section.image
|
|
291
|
+
|
|
230
292
|
for md_field in self._metadata_fields:
|
|
231
293
|
if md_field == ChunkMetadata.TITLE:
|
|
232
294
|
result[md_field.name.lower()] = self._doc_title
|
|
@@ -238,6 +300,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
238
300
|
result[md_field.name.lower()] = section.metadata.page
|
|
239
301
|
elif md_field == ChunkMetadata.BOUNDING_BOX:
|
|
240
302
|
result[md_field.name.lower()] = section.metadata.bounding_box
|
|
303
|
+
|
|
241
304
|
return result
|
|
242
305
|
|
|
243
306
|
def _html_sections(self) -> Iterator[DocumentSection]:
|
|
@@ -273,7 +336,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
273
336
|
yield DocumentSection(text=full_text, metadata=md)
|
|
274
337
|
accumulated_text = []
|
|
275
338
|
|
|
276
|
-
def process_element(el:
|
|
339
|
+
def process_element(el: Tag | NavigableString) -> Iterator[DocumentSection]:
|
|
277
340
|
# process the element and emit sections as necessary
|
|
278
341
|
nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
|
|
279
342
|
|
|
@@ -361,43 +424,41 @@ class DocumentSplitter(ComponentIterator):
|
|
|
361
424
|
yield from emit()
|
|
362
425
|
|
|
363
426
|
def _pdf_sections(self) -> Iterator[DocumentSection]:
|
|
364
|
-
"""Create DocumentSections reflecting the pdf-specific separators"""
|
|
365
|
-
import fitz # type: ignore[import-untyped]
|
|
366
|
-
|
|
367
427
|
doc: fitz.Document = self._doc_handle.pdf_doc
|
|
368
428
|
assert doc is not None
|
|
369
429
|
|
|
370
430
|
emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
|
|
371
431
|
emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph
|
|
372
432
|
|
|
373
|
-
accumulated_text = []
|
|
433
|
+
accumulated_text: list[str] = []
|
|
374
434
|
|
|
375
|
-
def
|
|
376
|
-
fixed = ftfy.fix_text(
|
|
435
|
+
def _add_cleaned(raw: str) -> None:
|
|
436
|
+
fixed = ftfy.fix_text(raw)
|
|
377
437
|
if fixed:
|
|
378
438
|
accumulated_text.append(fixed)
|
|
379
439
|
|
|
380
440
|
def _emit_text() -> str:
|
|
381
|
-
|
|
441
|
+
txt = ''.join(accumulated_text)
|
|
382
442
|
accumulated_text.clear()
|
|
383
|
-
return
|
|
443
|
+
return txt
|
|
444
|
+
|
|
445
|
+
for page_idx, page in enumerate(doc.pages()):
|
|
446
|
+
img: PIL.Image.Image | None = None
|
|
447
|
+
if Element.IMAGE in self._elements:
|
|
448
|
+
pix = page.get_pixmap(dpi=self._image_dpi)
|
|
449
|
+
img = PIL.Image.open(io.BytesIO(pix.tobytes(self._image_format)))
|
|
384
450
|
|
|
385
|
-
for page_number, page in enumerate(doc.pages()):
|
|
386
451
|
for block in page.get_text('blocks'):
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
# see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
|
|
390
|
-
# other libraries like pdfminer also lack an explicit paragraph concept
|
|
391
|
-
x1, y1, x2, y2, text, _, _ = block
|
|
392
|
-
_add_cleaned_text(text)
|
|
452
|
+
x1, y1, x2, y2, text, *_ = block
|
|
453
|
+
_add_cleaned(text)
|
|
393
454
|
if accumulated_text and emit_on_paragraph:
|
|
394
455
|
bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
|
|
395
|
-
|
|
396
|
-
yield DocumentSection(text=_emit_text(), metadata=
|
|
456
|
+
md = DocumentSectionMetadata(page=page_idx, bounding_box=bbox)
|
|
457
|
+
yield DocumentSection(text=_emit_text(), metadata=md)
|
|
397
458
|
|
|
398
459
|
if accumulated_text and emit_on_page and not emit_on_paragraph:
|
|
399
|
-
|
|
400
|
-
|
|
460
|
+
md = DocumentSectionMetadata(page=page_idx)
|
|
461
|
+
yield DocumentSection(text=_emit_text(), image=img, metadata=md)
|
|
401
462
|
|
|
402
463
|
if accumulated_text and not emit_on_page:
|
|
403
464
|
yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
|
pixeltable/iterators/image.py
CHANGED
|
@@ -31,8 +31,7 @@ class TileIterator(ComponentIterator):
|
|
|
31
31
|
__j: int
|
|
32
32
|
|
|
33
33
|
def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
|
|
34
|
-
|
|
35
|
-
raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
|
|
34
|
+
assert overlap[0] < tile_size[0] and overlap[1] < tile_size[1]
|
|
36
35
|
|
|
37
36
|
self.__image = image
|
|
38
37
|
self.__image.load()
|
|
@@ -79,4 +78,8 @@ class TileIterator(ComponentIterator):
|
|
|
79
78
|
|
|
80
79
|
@classmethod
|
|
81
80
|
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
81
|
+
tile_size = kwargs.get('tile_size')
|
|
82
|
+
overlap = kwargs.get('overlap', (0, 0))
|
|
83
|
+
if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
|
|
84
|
+
raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
|
|
82
85
|
return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']
|