pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +83 -19
- pixeltable/_query.py +1444 -0
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +7 -4
- pixeltable/catalog/catalog.py +2394 -119
- pixeltable/catalog/column.py +225 -104
- pixeltable/catalog/dir.py +38 -9
- pixeltable/catalog/globals.py +53 -34
- pixeltable/catalog/insertable_table.py +265 -115
- pixeltable/catalog/path.py +80 -17
- pixeltable/catalog/schema_object.py +28 -43
- pixeltable/catalog/table.py +1270 -677
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +1270 -751
- pixeltable/catalog/table_version_handle.py +109 -0
- pixeltable/catalog/table_version_path.py +137 -42
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +251 -134
- pixeltable/config.py +215 -0
- pixeltable/env.py +736 -285
- pixeltable/exceptions.py +26 -2
- pixeltable/exec/__init__.py +7 -2
- pixeltable/exec/aggregation_node.py +39 -21
- pixeltable/exec/cache_prefetch_node.py +87 -109
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +25 -28
- pixeltable/exec/data_row_batch.py +11 -46
- pixeltable/exec/exec_context.py +26 -11
- pixeltable/exec/exec_node.py +35 -27
- pixeltable/exec/expr_eval/__init__.py +3 -0
- pixeltable/exec/expr_eval/evaluators.py +365 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
- pixeltable/exec/expr_eval/globals.py +200 -0
- pixeltable/exec/expr_eval/row_buffer.py +74 -0
- pixeltable/exec/expr_eval/schedulers.py +413 -0
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +35 -27
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +44 -29
- pixeltable/exec/sql_node.py +414 -115
- pixeltable/exprs/__init__.py +8 -5
- pixeltable/exprs/arithmetic_expr.py +79 -45
- pixeltable/exprs/array_slice.py +5 -5
- pixeltable/exprs/column_property_ref.py +40 -26
- pixeltable/exprs/column_ref.py +254 -61
- pixeltable/exprs/comparison.py +14 -9
- pixeltable/exprs/compound_predicate.py +9 -10
- pixeltable/exprs/data_row.py +213 -72
- pixeltable/exprs/expr.py +270 -104
- pixeltable/exprs/expr_dict.py +6 -5
- pixeltable/exprs/expr_set.py +20 -11
- pixeltable/exprs/function_call.py +383 -284
- pixeltable/exprs/globals.py +18 -5
- pixeltable/exprs/in_predicate.py +7 -7
- pixeltable/exprs/inline_expr.py +37 -37
- pixeltable/exprs/is_null.py +8 -4
- pixeltable/exprs/json_mapper.py +120 -54
- pixeltable/exprs/json_path.py +90 -60
- pixeltable/exprs/literal.py +61 -16
- pixeltable/exprs/method_ref.py +7 -6
- pixeltable/exprs/object_ref.py +19 -8
- pixeltable/exprs/row_builder.py +238 -75
- pixeltable/exprs/rowid_ref.py +53 -15
- pixeltable/exprs/similarity_expr.py +65 -50
- pixeltable/exprs/sql_element_cache.py +5 -5
- pixeltable/exprs/string_op.py +107 -0
- pixeltable/exprs/type_cast.py +25 -13
- pixeltable/exprs/variable.py +2 -2
- pixeltable/func/__init__.py +9 -5
- pixeltable/func/aggregate_function.py +197 -92
- pixeltable/func/callable_function.py +119 -35
- pixeltable/func/expr_template_function.py +101 -48
- pixeltable/func/function.py +375 -62
- pixeltable/func/function_registry.py +20 -19
- pixeltable/func/globals.py +6 -5
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +151 -35
- pixeltable/func/signature.py +178 -49
- pixeltable/func/tools.py +164 -0
- pixeltable/func/udf.py +176 -53
- pixeltable/functions/__init__.py +44 -4
- pixeltable/functions/anthropic.py +226 -47
- pixeltable/functions/audio.py +148 -11
- pixeltable/functions/bedrock.py +137 -0
- pixeltable/functions/date.py +188 -0
- pixeltable/functions/deepseek.py +113 -0
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +72 -20
- pixeltable/functions/gemini.py +249 -0
- pixeltable/functions/globals.py +208 -53
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1088 -95
- pixeltable/functions/image.py +155 -84
- pixeltable/functions/json.py +8 -11
- pixeltable/functions/llama_cpp.py +31 -19
- pixeltable/functions/math.py +169 -0
- pixeltable/functions/mistralai.py +50 -75
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +29 -36
- pixeltable/functions/openai.py +548 -160
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +15 -14
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +310 -85
- pixeltable/functions/timestamp.py +37 -19
- pixeltable/functions/together.py +77 -120
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +7 -2
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1528 -117
- pixeltable/functions/vision.py +26 -26
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +19 -10
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/functions/yolox.py +112 -0
- pixeltable/globals.py +716 -236
- pixeltable/index/__init__.py +3 -1
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +32 -22
- pixeltable/index/embedding_index.py +155 -92
- pixeltable/io/__init__.py +12 -7
- pixeltable/io/datarows.py +140 -0
- pixeltable/io/external_store.py +83 -125
- pixeltable/io/fiftyone.py +24 -33
- pixeltable/io/globals.py +47 -182
- pixeltable/io/hf_datasets.py +96 -127
- pixeltable/io/label_studio.py +171 -156
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +136 -115
- pixeltable/io/parquet.py +40 -153
- pixeltable/io/table_data_conduit.py +702 -0
- pixeltable/io/utils.py +100 -0
- pixeltable/iterators/__init__.py +8 -4
- pixeltable/iterators/audio.py +207 -0
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +144 -87
- pixeltable/iterators/image.py +17 -38
- pixeltable/iterators/string.py +15 -12
- pixeltable/iterators/video.py +523 -127
- pixeltable/metadata/__init__.py +33 -8
- pixeltable/metadata/converters/convert_10.py +2 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_15.py +15 -11
- pixeltable/metadata/converters/convert_16.py +4 -5
- pixeltable/metadata/converters/convert_17.py +4 -5
- pixeltable/metadata/converters/convert_18.py +4 -6
- pixeltable/metadata/converters/convert_19.py +6 -9
- pixeltable/metadata/converters/convert_20.py +3 -6
- pixeltable/metadata/converters/convert_21.py +6 -8
- pixeltable/metadata/converters/convert_22.py +3 -2
- pixeltable/metadata/converters/convert_23.py +33 -0
- pixeltable/metadata/converters/convert_24.py +55 -0
- pixeltable/metadata/converters/convert_25.py +19 -0
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/convert_27.py +29 -0
- pixeltable/metadata/converters/convert_28.py +13 -0
- pixeltable/metadata/converters/convert_29.py +110 -0
- pixeltable/metadata/converters/convert_30.py +63 -0
- pixeltable/metadata/converters/convert_31.py +11 -0
- pixeltable/metadata/converters/convert_32.py +15 -0
- pixeltable/metadata/converters/convert_33.py +17 -0
- pixeltable/metadata/converters/convert_34.py +21 -0
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +44 -18
- pixeltable/metadata/notes.py +21 -0
- pixeltable/metadata/schema.py +185 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +616 -225
- pixeltable/share/__init__.py +3 -0
- pixeltable/share/packager.py +797 -0
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +349 -0
- pixeltable/store.py +398 -232
- pixeltable/type_system.py +730 -267
- pixeltable/utils/__init__.py +40 -0
- pixeltable/utils/arrow.py +201 -29
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +26 -27
- pixeltable/utils/code.py +4 -4
- pixeltable/utils/console_output.py +46 -0
- pixeltable/utils/coroutine.py +24 -0
- pixeltable/utils/dbms.py +92 -0
- pixeltable/utils/description_helper.py +11 -12
- pixeltable/utils/documents.py +60 -61
- pixeltable/utils/exception_handler.py +36 -0
- pixeltable/utils/filecache.py +38 -22
- pixeltable/utils/formatter.py +88 -51
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +14 -13
- pixeltable/utils/iceberg.py +13 -0
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +20 -20
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +32 -5
- pixeltable/utils/system.py +30 -0
- pixeltable/utils/transactional_directory.py +4 -3
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -36
- pixeltable/catalog/path_dict.py +0 -141
- pixeltable/dataframe.py +0 -894
- pixeltable/exec/expr_eval_node.py +0 -232
- pixeltable/ext/__init__.py +0 -14
- pixeltable/ext/functions/__init__.py +0 -8
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/ext/functions/yolox.py +0 -157
- pixeltable/tool/create_test_db_dump.py +0 -311
- pixeltable/tool/create_test_video.py +0 -81
- pixeltable/tool/doc_plugins/griffe.py +0 -50
- pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
- pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
- pixeltable/tool/embed_udf.py +0 -9
- pixeltable/tool/mypy_plugin.py +0 -55
- pixeltable/utils/media_store.py +0 -76
- pixeltable/utils/s3.py +0 -16
- pixeltable-0.2.26.dist-info/METADATA +0 -400
- pixeltable-0.2.26.dist-info/RECORD +0 -156
- pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
- {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/iterators/document.py
CHANGED
|
@@ -1,13 +1,17 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import enum
|
|
3
3
|
import logging
|
|
4
|
-
from typing import Any, Iterable, Iterator,
|
|
4
|
+
from typing import Any, ClassVar, Iterable, Iterator, Literal
|
|
5
5
|
|
|
6
6
|
import ftfy
|
|
7
|
+
import PIL.Image
|
|
8
|
+
from bs4.element import NavigableString, Tag
|
|
9
|
+
from deprecated import deprecated
|
|
10
|
+
from pypdfium2 import PdfDocument # type: ignore[import-untyped]
|
|
7
11
|
|
|
8
12
|
from pixeltable.env import Env
|
|
9
13
|
from pixeltable.exceptions import Error
|
|
10
|
-
from pixeltable.type_system import ColumnType, DocumentType, IntType, JsonType, StringType
|
|
14
|
+
from pixeltable.type_system import ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
|
|
11
15
|
from pixeltable.utils.documents import get_document_handle
|
|
12
16
|
|
|
13
17
|
from .base import ComponentIterator
|
|
@@ -15,6 +19,11 @@ from .base import ComponentIterator
|
|
|
15
19
|
_logger = logging.getLogger('pixeltable')
|
|
16
20
|
|
|
17
21
|
|
|
22
|
+
class Element(enum.Enum):
|
|
23
|
+
TEXT = 1
|
|
24
|
+
IMAGE = 2
|
|
25
|
+
|
|
26
|
+
|
|
18
27
|
class ChunkMetadata(enum.Enum):
|
|
19
28
|
TITLE = 1
|
|
20
29
|
HEADING = 2
|
|
@@ -35,27 +44,30 @@ class Separator(enum.Enum):
|
|
|
35
44
|
@dataclasses.dataclass
|
|
36
45
|
class DocumentSectionMetadata:
|
|
37
46
|
"""Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
|
|
47
|
+
|
|
38
48
|
# html and markdown metadata
|
|
39
|
-
sourceline:
|
|
49
|
+
sourceline: int | None = None
|
|
40
50
|
# the stack of headings up to the most recently observed one;
|
|
41
51
|
# eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
|
|
42
|
-
heading:
|
|
52
|
+
heading: dict[str, str] | None = None
|
|
43
53
|
|
|
44
54
|
# pdf-specific metadata
|
|
45
|
-
page:
|
|
55
|
+
page: int | None = None
|
|
46
56
|
# bounding box as an {x1, y1, x2, y2} dictionary
|
|
47
|
-
bounding_box:
|
|
57
|
+
bounding_box: dict[str, float] | None = None
|
|
48
58
|
|
|
49
59
|
|
|
50
60
|
@dataclasses.dataclass
|
|
51
61
|
class DocumentSection:
|
|
52
62
|
"""A single document chunk, according to some of the splitting criteria"""
|
|
53
|
-
|
|
54
|
-
|
|
63
|
+
|
|
64
|
+
text: str | None = None
|
|
65
|
+
image: PIL.Image.Image | None = None
|
|
66
|
+
metadata: DocumentSectionMetadata | None = None
|
|
55
67
|
|
|
56
68
|
|
|
57
69
|
def _parse_separators(separators: str) -> list[Separator]:
|
|
58
|
-
ret = []
|
|
70
|
+
ret: list[Separator] = []
|
|
59
71
|
for s in separators.split(','):
|
|
60
72
|
clean_s = s.strip().upper()
|
|
61
73
|
if not clean_s:
|
|
@@ -69,7 +81,7 @@ def _parse_separators(separators: str) -> list[Separator]:
|
|
|
69
81
|
|
|
70
82
|
|
|
71
83
|
def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
|
|
72
|
-
ret = []
|
|
84
|
+
ret: list[ChunkMetadata] = []
|
|
73
85
|
for m in metadata.split(','):
|
|
74
86
|
clean_m = m.strip().upper()
|
|
75
87
|
if not clean_m:
|
|
@@ -82,18 +94,23 @@ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
|
|
|
82
94
|
return ret
|
|
83
95
|
|
|
84
96
|
|
|
85
|
-
|
|
97
|
+
def _parse_elements(elements: list[Literal['text', 'image']]) -> list[Element]:
|
|
98
|
+
result: list[Element] = []
|
|
99
|
+
for e in elements:
|
|
100
|
+
clean_e = e.strip().upper()
|
|
101
|
+
if clean_e not in Element.__members__:
|
|
102
|
+
raise Error(f'Invalid element: `{e}`. Valid elements are: {", ".join(Element.__members__).lower()}')
|
|
103
|
+
result.append(Element[clean_e])
|
|
104
|
+
if len(result) == 0:
|
|
105
|
+
raise Error('elements cannot be empty')
|
|
106
|
+
return result
|
|
86
107
|
|
|
87
108
|
|
|
88
|
-
|
|
89
|
-
"""Iterator over chunks of a document. The document is chunked according to the specified `separators`.
|
|
109
|
+
_HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
|
|
90
110
|
|
|
91
|
-
The iterator yields a `text` field containing the text of the chunk, and it may also
|
|
92
|
-
include additional metadata fields if specified in the `metadata` parameter, as explained below.
|
|
93
111
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
METADATA_COLUMN_TYPES = {
|
|
112
|
+
class DocumentSplitter(ComponentIterator):
|
|
113
|
+
METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
|
|
97
114
|
ChunkMetadata.TITLE: StringType(nullable=True),
|
|
98
115
|
ChunkMetadata.HEADING: JsonType(nullable=True),
|
|
99
116
|
ChunkMetadata.SOURCELINE: IntType(nullable=True),
|
|
@@ -101,30 +118,41 @@ class DocumentSplitter(ComponentIterator):
|
|
|
101
118
|
ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
|
|
102
119
|
}
|
|
103
120
|
|
|
121
|
+
_doc_handle: Any
|
|
122
|
+
_separators: list[Separator]
|
|
123
|
+
_elements: list[Element]
|
|
124
|
+
_metadata_fields: list[ChunkMetadata]
|
|
125
|
+
_doc_title: str
|
|
126
|
+
_limit: int
|
|
127
|
+
_skip_tags: list[str]
|
|
128
|
+
_overlap: int
|
|
129
|
+
_tiktoken_encoding: str | None
|
|
130
|
+
_tiktoken_target_model: str | None
|
|
131
|
+
_image_dpi: int
|
|
132
|
+
_image_format: str
|
|
133
|
+
|
|
134
|
+
_sections: Iterator[DocumentSection]
|
|
135
|
+
|
|
104
136
|
def __init__(
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
137
|
+
self,
|
|
138
|
+
document: str,
|
|
139
|
+
*,
|
|
140
|
+
separators: str,
|
|
141
|
+
elements: list[Literal['text', 'image']] | None = None,
|
|
142
|
+
limit: int | None = None,
|
|
143
|
+
overlap: int | None = None,
|
|
144
|
+
metadata: str = '',
|
|
145
|
+
skip_tags: list[str] | None = None,
|
|
146
|
+
tiktoken_encoding: str | None = 'cl100k_base',
|
|
147
|
+
tiktoken_target_model: str | None = None,
|
|
148
|
+
image_dpi: int = 300,
|
|
149
|
+
image_format: str = 'png',
|
|
109
150
|
):
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
Args:
|
|
113
|
-
separators: separators to use to chunk the document. Options are:
|
|
114
|
-
`'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
|
|
115
|
-
This may be a comma-separated string, e.g., `'heading,token_limit'`.
|
|
116
|
-
limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
|
|
117
|
-
or `'char_limit'` is specified.
|
|
118
|
-
metadata: additional metadata fields to include in the output. Options are:
|
|
119
|
-
`'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
|
|
120
|
-
(PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
|
|
121
|
-
"""
|
|
122
|
-
if html_skip_tags is None:
|
|
123
|
-
html_skip_tags = ['nav']
|
|
151
|
+
if skip_tags is None:
|
|
152
|
+
skip_tags = ['nav']
|
|
124
153
|
self._doc_handle = get_document_handle(document)
|
|
154
|
+
self._elements = _parse_elements(elements.copy()) if elements is not None else [Element.TEXT]
|
|
125
155
|
assert self._doc_handle is not None
|
|
126
|
-
# calling the output_schema method to validate the input arguments
|
|
127
|
-
self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
|
|
128
156
|
self._separators = _parse_separators(separators)
|
|
129
157
|
self._metadata_fields = _parse_metadata(metadata)
|
|
130
158
|
if self._doc_handle.bs_doc is not None:
|
|
@@ -136,10 +164,12 @@ class DocumentSplitter(ComponentIterator):
|
|
|
136
164
|
else:
|
|
137
165
|
self._doc_title = ''
|
|
138
166
|
self._limit = 0 if limit is None else limit
|
|
139
|
-
self._skip_tags =
|
|
167
|
+
self._skip_tags = skip_tags
|
|
140
168
|
self._overlap = 0 if overlap is None else overlap
|
|
141
169
|
self._tiktoken_encoding = tiktoken_encoding
|
|
142
170
|
self._tiktoken_target_model = tiktoken_target_model
|
|
171
|
+
self._image_dpi = image_dpi
|
|
172
|
+
self._image_format = image_format
|
|
143
173
|
|
|
144
174
|
# set up processing pipeline
|
|
145
175
|
if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
|
|
@@ -151,8 +181,11 @@ class DocumentSplitter(ComponentIterator):
|
|
|
151
181
|
elif self._doc_handle.format == DocumentType.DocumentFormat.PDF:
|
|
152
182
|
assert self._doc_handle.pdf_doc is not None
|
|
153
183
|
self._sections = self._pdf_sections()
|
|
184
|
+
elif self._doc_handle.format == DocumentType.DocumentFormat.TXT:
|
|
185
|
+
assert self._doc_handle.txt_doc is not None
|
|
186
|
+
self._sections = self._txt_sections()
|
|
154
187
|
else:
|
|
155
|
-
|
|
188
|
+
raise AssertionError(f'Unsupported document format: {self._doc_handle.format}')
|
|
156
189
|
|
|
157
190
|
if Separator.SENTENCE in self._separators:
|
|
158
191
|
self._sections = self._sentence_sections(self._sections)
|
|
@@ -166,19 +199,28 @@ class DocumentSplitter(ComponentIterator):
|
|
|
166
199
|
return {
|
|
167
200
|
'document': DocumentType(nullable=False),
|
|
168
201
|
'separators': StringType(nullable=False),
|
|
202
|
+
'elements': JsonType(nullable=False),
|
|
169
203
|
'metadata': StringType(nullable=False),
|
|
170
204
|
'limit': IntType(nullable=True),
|
|
171
205
|
'overlap': IntType(nullable=True),
|
|
172
206
|
'skip_tags': StringType(nullable=True),
|
|
173
207
|
'tiktoken_encoding': StringType(nullable=True),
|
|
174
208
|
'tiktoken_target_model': StringType(nullable=True),
|
|
209
|
+
'image_dpi': IntType(nullable=True),
|
|
210
|
+
'image_format': StringType(nullable=True),
|
|
175
211
|
}
|
|
176
212
|
|
|
177
213
|
@classmethod
|
|
178
214
|
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
|
|
179
|
-
schema: dict[str, ColumnType] = {
|
|
180
|
-
|
|
181
|
-
|
|
215
|
+
schema: dict[str, ColumnType] = {}
|
|
216
|
+
elements = _parse_elements(kwargs.get('elements', ['text']))
|
|
217
|
+
for element in elements:
|
|
218
|
+
if element == Element.TEXT:
|
|
219
|
+
schema['text'] = StringType(nullable=False)
|
|
220
|
+
elif element == Element.IMAGE:
|
|
221
|
+
schema['image'] = ImageType(nullable=False)
|
|
222
|
+
|
|
223
|
+
md_fields = _parse_metadata(kwargs.get('metadata', ''))
|
|
182
224
|
for md_field in md_fields:
|
|
183
225
|
schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
|
|
184
226
|
|
|
@@ -188,6 +230,8 @@ class DocumentSplitter(ComponentIterator):
|
|
|
188
230
|
limit = kwargs.get('limit')
|
|
189
231
|
overlap = kwargs.get('overlap')
|
|
190
232
|
|
|
233
|
+
if Element.IMAGE in elements and separators != [Separator.PAGE]:
|
|
234
|
+
raise Error('Image elements are only supported for the "page" separator on PDF documents')
|
|
191
235
|
if limit is not None or overlap is not None:
|
|
192
236
|
if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
|
|
193
237
|
raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
|
|
@@ -201,9 +245,8 @@ class DocumentSplitter(ComponentIterator):
|
|
|
201
245
|
if kwargs.get('limit') is None:
|
|
202
246
|
raise Error('limit is required with "token_limit"/"char_limit" separators')
|
|
203
247
|
|
|
204
|
-
# check dependencies at the end
|
|
205
248
|
if Separator.SENTENCE in separators:
|
|
206
|
-
Env.get().
|
|
249
|
+
_ = Env.get().spacy_nlp
|
|
207
250
|
if Separator.TOKEN_LIMIT in separators:
|
|
208
251
|
Env.get().require_package('tiktoken')
|
|
209
252
|
|
|
@@ -212,9 +255,15 @@ class DocumentSplitter(ComponentIterator):
|
|
|
212
255
|
def __next__(self) -> dict[str, Any]:
|
|
213
256
|
while True:
|
|
214
257
|
section = next(self._sections)
|
|
215
|
-
if section.text is None:
|
|
258
|
+
if section.text is None and section.image is None:
|
|
216
259
|
continue
|
|
217
|
-
result: dict[str, Any] = {
|
|
260
|
+
result: dict[str, Any] = {}
|
|
261
|
+
for element in self._elements:
|
|
262
|
+
if element == Element.TEXT:
|
|
263
|
+
result['text'] = section.text
|
|
264
|
+
elif element == Element.IMAGE:
|
|
265
|
+
result['image'] = section.image
|
|
266
|
+
|
|
218
267
|
for md_field in self._metadata_fields:
|
|
219
268
|
if md_field == ChunkMetadata.TITLE:
|
|
220
269
|
result[md_field.name.lower()] = self._doc_title
|
|
@@ -226,18 +275,20 @@ class DocumentSplitter(ComponentIterator):
|
|
|
226
275
|
result[md_field.name.lower()] = section.metadata.page
|
|
227
276
|
elif md_field == ChunkMetadata.BOUNDING_BOX:
|
|
228
277
|
result[md_field.name.lower()] = section.metadata.bounding_box
|
|
278
|
+
|
|
229
279
|
return result
|
|
230
280
|
|
|
231
281
|
def _html_sections(self) -> Iterator[DocumentSection]:
|
|
232
282
|
"""Create DocumentSections reflecting the html-specific separators"""
|
|
233
283
|
import bs4
|
|
284
|
+
|
|
234
285
|
emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
|
|
235
286
|
emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
|
|
236
287
|
# current state
|
|
237
288
|
accumulated_text: list[str] = [] # currently accumulated text
|
|
238
289
|
# accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
|
|
239
290
|
|
|
240
|
-
headings: dict[str, str] = {}
|
|
291
|
+
headings: dict[str, str] = {} # current state of observed headings (level -> text)
|
|
241
292
|
sourceline = 0 # most recently seen sourceline
|
|
242
293
|
|
|
243
294
|
def update_metadata(el: bs4.Tag) -> None:
|
|
@@ -246,9 +297,9 @@ class DocumentSplitter(ComponentIterator):
|
|
|
246
297
|
sourceline = el.sourceline
|
|
247
298
|
if el.name in _HTML_HEADINGS:
|
|
248
299
|
# remove the previously seen lower levels
|
|
249
|
-
lower_levels = [
|
|
250
|
-
for
|
|
251
|
-
del headings[
|
|
300
|
+
lower_levels = [lv for lv in headings if lv > el.name]
|
|
301
|
+
for lv in lower_levels:
|
|
302
|
+
del headings[lv]
|
|
252
303
|
headings[el.name] = el.get_text().strip()
|
|
253
304
|
|
|
254
305
|
def emit() -> Iterator[DocumentSection]:
|
|
@@ -260,7 +311,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
260
311
|
yield DocumentSection(text=full_text, metadata=md)
|
|
261
312
|
accumulated_text = []
|
|
262
313
|
|
|
263
|
-
def process_element(el:
|
|
314
|
+
def process_element(el: Tag | NavigableString) -> Iterator[DocumentSection]:
|
|
264
315
|
# process the element and emit sections as necessary
|
|
265
316
|
nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
|
|
266
317
|
|
|
@@ -297,7 +348,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
297
348
|
# current state
|
|
298
349
|
accumulated_text: list[str] = [] # currently accumulated text
|
|
299
350
|
# accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
|
|
300
|
-
headings: dict[str, str] = {}
|
|
351
|
+
headings: dict[str, str] = {} # current state of observed headings (level -> text)
|
|
301
352
|
|
|
302
353
|
def update_headings(heading: dict) -> None:
|
|
303
354
|
# update current state
|
|
@@ -307,9 +358,9 @@ class DocumentSplitter(ComponentIterator):
|
|
|
307
358
|
level = f'h{lint}'
|
|
308
359
|
text = heading['children'][0]['raw'].strip()
|
|
309
360
|
# remove the previously seen lower levels
|
|
310
|
-
lower_levels = [
|
|
311
|
-
for
|
|
312
|
-
del headings[
|
|
361
|
+
lower_levels = [lv for lv in headings if lv > level]
|
|
362
|
+
for lv in lower_levels:
|
|
363
|
+
del headings[lv]
|
|
313
364
|
headings[level] = text
|
|
314
365
|
|
|
315
366
|
def emit() -> Iterator[DocumentSection]:
|
|
@@ -348,47 +399,48 @@ class DocumentSplitter(ComponentIterator):
|
|
|
348
399
|
yield from emit()
|
|
349
400
|
|
|
350
401
|
def _pdf_sections(self) -> Iterator[DocumentSection]:
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
402
|
+
if Separator.PARAGRAPH in self._separators:
|
|
403
|
+
raise Error(
|
|
404
|
+
'Paragraph splitting is not currently supported for PDF documents. Please contact'
|
|
405
|
+
' us at https://github.com/pixeltable/pixeltable/issues if you need this feature.'
|
|
406
|
+
)
|
|
355
407
|
|
|
356
|
-
|
|
357
|
-
|
|
408
|
+
doc: PdfDocument = self._doc_handle.pdf_doc
|
|
409
|
+
assert isinstance(doc, PdfDocument)
|
|
358
410
|
|
|
359
|
-
|
|
411
|
+
emit_on_page = Separator.PAGE in self._separators
|
|
412
|
+
accumulated_text: list[str] = []
|
|
360
413
|
|
|
361
|
-
def
|
|
362
|
-
fixed = ftfy.fix_text(
|
|
414
|
+
def _add_cleaned(raw: str) -> None:
|
|
415
|
+
fixed = ftfy.fix_text(raw)
|
|
363
416
|
if fixed:
|
|
364
417
|
accumulated_text.append(fixed)
|
|
365
418
|
|
|
366
419
|
def _emit_text() -> str:
|
|
367
|
-
|
|
420
|
+
txt = ''.join(accumulated_text)
|
|
368
421
|
accumulated_text.clear()
|
|
369
|
-
return
|
|
370
|
-
|
|
371
|
-
for
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
_add_cleaned_text(text)
|
|
379
|
-
if accumulated_text and emit_on_paragraph:
|
|
380
|
-
bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
|
|
381
|
-
metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
|
|
382
|
-
yield DocumentSection(text=_emit_text(), metadata=metadata)
|
|
383
|
-
|
|
384
|
-
if accumulated_text and emit_on_page and not emit_on_paragraph:
|
|
385
|
-
yield DocumentSection(text=_emit_text(),
|
|
386
|
-
metadata=DocumentSectionMetadata(page=page_number))
|
|
387
|
-
accumulated_text = []
|
|
422
|
+
return txt
|
|
423
|
+
|
|
424
|
+
for page_idx, page in enumerate(doc):
|
|
425
|
+
img = page.render().to_pil() if Element.IMAGE in self._elements else None
|
|
426
|
+
text = page.get_textpage().get_text_bounded()
|
|
427
|
+
_add_cleaned(text)
|
|
428
|
+
if accumulated_text and emit_on_page:
|
|
429
|
+
md = DocumentSectionMetadata(page=page_idx)
|
|
430
|
+
yield DocumentSection(text=_emit_text(), image=img, metadata=md)
|
|
388
431
|
|
|
389
432
|
if accumulated_text and not emit_on_page:
|
|
390
433
|
yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
|
|
391
434
|
|
|
435
|
+
def _txt_sections(self) -> Iterator[DocumentSection]:
|
|
436
|
+
"""Create DocumentSections for text files.
|
|
437
|
+
|
|
438
|
+
Currently, it returns the entire text as a single section.
|
|
439
|
+
TODO: Add support for paragraphs.
|
|
440
|
+
"""
|
|
441
|
+
assert self._doc_handle.txt_doc is not None
|
|
442
|
+
yield DocumentSection(text=ftfy.fix_text(self._doc_handle.txt_doc), metadata=DocumentSectionMetadata())
|
|
443
|
+
|
|
392
444
|
def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
|
|
393
445
|
"""Split the input sections into sentences"""
|
|
394
446
|
for section in input_sections:
|
|
@@ -399,6 +451,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
399
451
|
|
|
400
452
|
def _token_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
|
|
401
453
|
import tiktoken
|
|
454
|
+
|
|
402
455
|
if self._tiktoken_target_model is not None:
|
|
403
456
|
encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
|
|
404
457
|
else:
|
|
@@ -442,5 +495,9 @@ class DocumentSplitter(ComponentIterator):
|
|
|
442
495
|
def close(self) -> None:
|
|
443
496
|
pass
|
|
444
497
|
|
|
445
|
-
|
|
446
|
-
|
|
498
|
+
@classmethod
|
|
499
|
+
@deprecated(
|
|
500
|
+
'create() is deprecated; use `pixeltable.functions.document.document_splitter` instead', version='0.5.6'
|
|
501
|
+
)
|
|
502
|
+
def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
|
|
503
|
+
return super()._create(**kwargs)
|
pixeltable/iterators/image.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from typing import Any, Sequence
|
|
2
2
|
|
|
3
3
|
import PIL.Image
|
|
4
|
+
from deprecated import deprecated
|
|
4
5
|
|
|
5
6
|
import pixeltable.exceptions as excs
|
|
6
7
|
import pixeltable.type_system as ts
|
|
@@ -8,18 +9,6 @@ from pixeltable.iterators.base import ComponentIterator
|
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
class TileIterator(ComponentIterator):
|
|
11
|
-
"""
|
|
12
|
-
Iterator over tiles of an image. Each image will be divided into tiles of size `tile_size`, and the tiles will be
|
|
13
|
-
iterated over in row-major order (left-to-right, then top-to-bottom). An optional `overlap` parameter may be
|
|
14
|
-
specified. If the tiles do not exactly cover the image, then the rightmost and bottommost tiles will be padded with
|
|
15
|
-
blackspace, so that the output images all have the exact size `tile_size`.
|
|
16
|
-
|
|
17
|
-
Args:
|
|
18
|
-
image: Image to split into tiles.
|
|
19
|
-
tile_size: Size of each tile, as a pair of integers `[width, height]`.
|
|
20
|
-
overlap: Amount of overlap between adjacent tiles, as a pair of integers `[width, height]`.
|
|
21
|
-
"""
|
|
22
|
-
|
|
23
12
|
__image: PIL.Image.Image
|
|
24
13
|
__tile_size: Sequence[int]
|
|
25
14
|
__overlap: Sequence[int]
|
|
@@ -30,15 +19,8 @@ class TileIterator(ComponentIterator):
|
|
|
30
19
|
__i: int
|
|
31
20
|
__j: int
|
|
32
21
|
|
|
33
|
-
def __init__(
|
|
34
|
-
|
|
35
|
-
image: PIL.Image.Image,
|
|
36
|
-
*,
|
|
37
|
-
tile_size: tuple[int, int],
|
|
38
|
-
overlap: tuple[int, int] = (0, 0),
|
|
39
|
-
):
|
|
40
|
-
if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
|
|
41
|
-
raise excs.Error(f"overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}")
|
|
22
|
+
def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
|
|
23
|
+
assert overlap[0] < tile_size[0] and overlap[1] < tile_size[1]
|
|
42
24
|
|
|
43
25
|
self.__image = image
|
|
44
26
|
self.__image.load()
|
|
@@ -64,11 +46,7 @@ class TileIterator(ComponentIterator):
|
|
|
64
46
|
x2 = x1 + self.__tile_size[0]
|
|
65
47
|
y2 = y1 + self.__tile_size[1]
|
|
66
48
|
tile = self.__image.crop((x1, y1, x2, y2))
|
|
67
|
-
result = {
|
|
68
|
-
'tile': tile,
|
|
69
|
-
'tile_coord': [self.__i, self.__j],
|
|
70
|
-
'tile_box': [x1, y1, x2, y2]
|
|
71
|
-
}
|
|
49
|
+
result = {'tile': tile, 'tile_coord': [self.__i, self.__j], 'tile_box': [x1, y1, x2, y2]}
|
|
72
50
|
|
|
73
51
|
self.__i += 1
|
|
74
52
|
if self.__i >= self.__xlen:
|
|
@@ -79,22 +57,23 @@ class TileIterator(ComponentIterator):
|
|
|
79
57
|
def close(self) -> None:
|
|
80
58
|
pass
|
|
81
59
|
|
|
82
|
-
def set_pos(self, pos: int) -> None:
|
|
60
|
+
def set_pos(self, pos: int, **kwargs: Any) -> None:
|
|
83
61
|
self.__j = pos // self.__xlen
|
|
84
62
|
self.__i = pos % self.__xlen
|
|
85
63
|
|
|
86
64
|
@classmethod
|
|
87
65
|
def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
|
|
88
|
-
return {
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
66
|
+
return {'image': ts.ImageType(), 'tile_size': ts.JsonType(), 'overlap': ts.JsonType()}
|
|
67
|
+
|
|
68
|
+
@classmethod
|
|
69
|
+
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
70
|
+
tile_size = kwargs.get('tile_size')
|
|
71
|
+
overlap = kwargs.get('overlap', (0, 0))
|
|
72
|
+
if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
|
|
73
|
+
raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
|
|
74
|
+
return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']
|
|
93
75
|
|
|
94
76
|
@classmethod
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
'tile_coord': ts.JsonType(),
|
|
99
|
-
'tile_box': ts.JsonType(),
|
|
100
|
-
}, ['tile']
|
|
77
|
+
@deprecated('create() is deprecated; use `pixeltable.functions.image.tile_iterator` instead', version='0.5.6')
|
|
78
|
+
def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
|
|
79
|
+
return super()._create(**kwargs)
|
pixeltable/iterators/string.py
CHANGED
|
@@ -1,13 +1,17 @@
|
|
|
1
|
-
from typing import
|
|
1
|
+
from typing import Any, Iterator
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
from deprecated import deprecated
|
|
4
|
+
|
|
5
|
+
from pixeltable import exceptions as excs, type_system as ts
|
|
5
6
|
from pixeltable.env import Env
|
|
6
7
|
from pixeltable.iterators.base import ComponentIterator
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
class StringSplitter(ComponentIterator):
|
|
10
|
-
|
|
11
|
+
_text: str
|
|
12
|
+
doc: Any # spacy doc
|
|
13
|
+
iter: Iterator[dict[str, Any]]
|
|
14
|
+
|
|
11
15
|
def __init__(self, text: str, *, separators: str):
|
|
12
16
|
if separators != 'sentence':
|
|
13
17
|
raise excs.Error('Only `sentence` separators are currently supported.')
|
|
@@ -25,16 +29,15 @@ class StringSplitter(ComponentIterator):
|
|
|
25
29
|
def close(self) -> None:
|
|
26
30
|
pass
|
|
27
31
|
|
|
28
|
-
def set_pos(self, pos: int) -> None:
|
|
29
|
-
pass
|
|
30
|
-
|
|
31
32
|
@classmethod
|
|
32
33
|
def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
|
|
33
|
-
return {
|
|
34
|
-
'text': ts.StringType(),
|
|
35
|
-
'separators': ts.StringType(),
|
|
36
|
-
}
|
|
34
|
+
return {'text': ts.StringType(), 'separators': ts.StringType()}
|
|
37
35
|
|
|
38
36
|
@classmethod
|
|
39
|
-
def output_schema(cls,
|
|
37
|
+
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
40
38
|
return {'text': ts.StringType()}, []
|
|
39
|
+
|
|
40
|
+
@classmethod
|
|
41
|
+
@deprecated('create() is deprecated; use `pixeltable.functions.string.string_splitter` instead', version='0.5.6')
|
|
42
|
+
def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
|
|
43
|
+
return super()._create(**kwargs)
|