pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +42 -8
- pixeltable/{dataframe.py → _query.py} +470 -206
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -4
- pixeltable/catalog/catalog.py +1785 -432
- pixeltable/catalog/column.py +190 -113
- pixeltable/catalog/dir.py +2 -4
- pixeltable/catalog/globals.py +19 -46
- pixeltable/catalog/insertable_table.py +191 -98
- pixeltable/catalog/path.py +63 -23
- pixeltable/catalog/schema_object.py +11 -15
- pixeltable/catalog/table.py +843 -436
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +978 -657
- pixeltable/catalog/table_version_handle.py +72 -16
- pixeltable/catalog/table_version_path.py +112 -43
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +134 -90
- pixeltable/config.py +134 -22
- pixeltable/env.py +471 -157
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +4 -1
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +11 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +106 -56
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +19 -19
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +351 -84
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +36 -23
- pixeltable/exprs/column_ref.py +213 -89
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +164 -54
- pixeltable/exprs/expr.py +70 -44
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +100 -40
- pixeltable/exprs/globals.py +2 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +18 -32
- pixeltable/exprs/is_null.py +7 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +27 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +167 -67
- pixeltable/exprs/rowid_ref.py +25 -10
- pixeltable/exprs/similarity_expr.py +58 -40
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +17 -11
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +29 -27
- pixeltable/func/signature.py +46 -19
- pixeltable/func/tools.py +31 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +16 -0
- pixeltable/functions/anthropic.py +123 -77
- pixeltable/functions/audio.py +147 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +7 -4
- pixeltable/functions/deepseek.py +35 -43
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +11 -20
- pixeltable/functions/gemini.py +195 -39
- pixeltable/functions/globals.py +142 -14
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1056 -24
- pixeltable/functions/image.py +115 -57
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +28 -13
- pixeltable/functions/math.py +67 -5
- pixeltable/functions/mistralai.py +18 -55
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +20 -13
- pixeltable/functions/openai.py +240 -226
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +4 -4
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +239 -69
- pixeltable/functions/timestamp.py +16 -16
- pixeltable/functions/together.py +24 -84
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1515 -107
- pixeltable/functions/vision.py +8 -8
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +16 -8
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +362 -115
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +28 -22
- pixeltable/index/embedding_index.py +100 -118
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +8 -7
- pixeltable/io/external_store.py +56 -105
- pixeltable/io/fiftyone.py +13 -13
- pixeltable/io/globals.py +31 -30
- pixeltable/io/hf_datasets.py +61 -16
- pixeltable/io/label_studio.py +74 -70
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +21 -12
- pixeltable/io/parquet.py +25 -105
- pixeltable/io/table_data_conduit.py +250 -123
- pixeltable/io/utils.py +4 -4
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +26 -25
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +112 -78
- pixeltable/iterators/image.py +12 -15
- pixeltable/iterators/string.py +11 -4
- pixeltable/iterators/video.py +523 -120
- pixeltable/metadata/__init__.py +14 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_30.py +34 -21
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +20 -31
- pixeltable/metadata/notes.py +9 -0
- pixeltable/metadata/schema.py +140 -53
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +382 -115
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +547 -83
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +257 -59
- pixeltable/store.py +311 -194
- pixeltable/type_system.py +373 -211
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +131 -17
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +6 -6
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +32 -6
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +7 -18
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +86 -48
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +26 -0
- pixeltable/utils/system.py +30 -0
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -40
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable-0.3.14.dist-info/METADATA +0 -434
- pixeltable-0.3.14.dist-info/RECORD +0 -186
- pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/iterators/audio.py
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
import uuid
|
|
3
2
|
from fractions import Fraction
|
|
4
3
|
from pathlib import Path
|
|
5
|
-
from typing import Any, ClassVar
|
|
4
|
+
from typing import Any, ClassVar
|
|
6
5
|
|
|
7
6
|
import av
|
|
7
|
+
from deprecated import deprecated
|
|
8
8
|
|
|
9
|
-
from pixeltable import
|
|
9
|
+
from pixeltable import exceptions as excs, type_system as ts
|
|
10
|
+
from pixeltable.utils.local_store import TempStore
|
|
10
11
|
|
|
11
12
|
from .base import ComponentIterator
|
|
12
13
|
|
|
@@ -14,18 +15,6 @@ _logger = logging.getLogger('pixeltable')
|
|
|
14
15
|
|
|
15
16
|
|
|
16
17
|
class AudioSplitter(ComponentIterator):
|
|
17
|
-
"""
|
|
18
|
-
Iterator over chunks of an audio file. The audio file is split into smaller chunks,
|
|
19
|
-
where the duration of each chunk is determined by chunk_duration_sec.
|
|
20
|
-
The iterator yields audio chunks as pxt.Audio, along with the start and end time of each chunk.
|
|
21
|
-
If the input contains no audio, no chunks are yielded.
|
|
22
|
-
|
|
23
|
-
Args:
|
|
24
|
-
chunk_duration_sec: Audio chunk duration in seconds
|
|
25
|
-
overlap_sec: Overlap between consecutive chunks in seconds.
|
|
26
|
-
min_chunk_duration_sec: Drop the last chunk if it is smaller than min_chunk_duration_sec
|
|
27
|
-
"""
|
|
28
|
-
|
|
29
18
|
# Input parameters
|
|
30
19
|
audio_path: Path
|
|
31
20
|
chunk_duration_sec: float
|
|
@@ -37,7 +26,7 @@ class AudioSplitter(ComponentIterator):
|
|
|
37
26
|
|
|
38
27
|
# List of chunks to extract
|
|
39
28
|
# Each chunk is defined by start and end presentation timestamps in audio file (int)
|
|
40
|
-
chunks_to_extract_in_pts:
|
|
29
|
+
chunks_to_extract_in_pts: list[tuple[int, int]] | None
|
|
41
30
|
# next chunk to extract
|
|
42
31
|
next_pos: int
|
|
43
32
|
|
|
@@ -55,12 +44,9 @@ class AudioSplitter(ComponentIterator):
|
|
|
55
44
|
def __init__(
|
|
56
45
|
self, audio: str, chunk_duration_sec: float, *, overlap_sec: float = 0.0, min_chunk_duration_sec: float = 0.0
|
|
57
46
|
):
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
|
|
62
|
-
if overlap_sec >= chunk_duration_sec:
|
|
63
|
-
raise excs.Error('overlap_sec must be less than chunk_duration_sec')
|
|
47
|
+
assert chunk_duration_sec > 0.0
|
|
48
|
+
assert chunk_duration_sec >= min_chunk_duration_sec
|
|
49
|
+
assert overlap_sec < chunk_duration_sec
|
|
64
50
|
audio_path = Path(audio)
|
|
65
51
|
assert audio_path.exists() and audio_path.is_file()
|
|
66
52
|
self.audio_path = audio_path
|
|
@@ -128,6 +114,19 @@ class AudioSplitter(ComponentIterator):
|
|
|
128
114
|
|
|
129
115
|
@classmethod
|
|
130
116
|
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
117
|
+
param_names = ['chunk_duration_sec', 'min_chunk_duration_sec', 'overlap_sec']
|
|
118
|
+
params = dict(zip(param_names, args))
|
|
119
|
+
params.update(kwargs)
|
|
120
|
+
|
|
121
|
+
chunk_duration_sec = params['chunk_duration_sec']
|
|
122
|
+
min_chunk_duration_sec = params.get('min_chunk_duration_sec', 0.0)
|
|
123
|
+
overlap_sec = params.get('overlap_sec', 0.0)
|
|
124
|
+
if chunk_duration_sec <= 0.0:
|
|
125
|
+
raise excs.Error('chunk_duration_sec must be a positive number')
|
|
126
|
+
if chunk_duration_sec < min_chunk_duration_sec:
|
|
127
|
+
raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
|
|
128
|
+
if overlap_sec >= chunk_duration_sec:
|
|
129
|
+
raise excs.Error('overlap_sec must be less than chunk_duration_sec')
|
|
131
130
|
return {
|
|
132
131
|
'start_time_sec': ts.FloatType(),
|
|
133
132
|
'end_time_sec': ts.FloatType(),
|
|
@@ -140,7 +139,7 @@ class AudioSplitter(ComponentIterator):
|
|
|
140
139
|
target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
|
|
141
140
|
chunk_start_pts = 0
|
|
142
141
|
chunk_end_pts = 0
|
|
143
|
-
chunk_file = str(
|
|
142
|
+
chunk_file = str(TempStore.create_path(extension=self.audio_path.suffix))
|
|
144
143
|
output_container = av.open(chunk_file, mode='w')
|
|
145
144
|
input_stream = self.container.streams.audio[0]
|
|
146
145
|
codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)
|
|
@@ -202,5 +201,7 @@ class AudioSplitter(ComponentIterator):
|
|
|
202
201
|
def close(self) -> None:
|
|
203
202
|
self.container.close()
|
|
204
203
|
|
|
205
|
-
|
|
206
|
-
|
|
204
|
+
@classmethod
|
|
205
|
+
@deprecated('create() is deprecated; use `pixeltable.functions.audio.audio_splitter` instead', version='0.5.6')
|
|
206
|
+
def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
|
|
207
|
+
return super()._create(**kwargs)
|
pixeltable/iterators/base.py
CHANGED
|
@@ -43,11 +43,17 @@ class ComponentIterator(ABC):
|
|
|
43
43
|
"""Close the iterator and release all resources"""
|
|
44
44
|
raise NotImplementedError
|
|
45
45
|
|
|
46
|
-
|
|
47
|
-
def set_pos(self, pos: int) -> None:
|
|
46
|
+
def set_pos(self, pos: int, **kwargs: Any) -> None:
|
|
48
47
|
"""Set the iterator position to pos"""
|
|
49
|
-
|
|
48
|
+
pass
|
|
50
49
|
|
|
51
50
|
@classmethod
|
|
52
51
|
def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
|
|
52
|
+
# TODO: This is still needed for compatibility with existing user-defined iterators; it will become deprecated
|
|
53
|
+
# when the new decorator pattern is introduced for iterators
|
|
54
|
+
return cls._create(**kwargs)
|
|
55
|
+
|
|
56
|
+
@classmethod
|
|
57
|
+
def _create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
|
|
58
|
+
# create() variant that can be called by subclasses without generating a deprecation warning.
|
|
53
59
|
return cls, kwargs
|
pixeltable/iterators/document.py
CHANGED
|
@@ -1,13 +1,17 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import enum
|
|
3
3
|
import logging
|
|
4
|
-
from typing import Any, ClassVar, Iterable, Iterator,
|
|
4
|
+
from typing import Any, ClassVar, Iterable, Iterator, Literal
|
|
5
5
|
|
|
6
6
|
import ftfy
|
|
7
|
+
import PIL.Image
|
|
8
|
+
from bs4.element import NavigableString, Tag
|
|
9
|
+
from deprecated import deprecated
|
|
10
|
+
from pypdfium2 import PdfDocument # type: ignore[import-untyped]
|
|
7
11
|
|
|
8
12
|
from pixeltable.env import Env
|
|
9
13
|
from pixeltable.exceptions import Error
|
|
10
|
-
from pixeltable.type_system import ColumnType, DocumentType, IntType, JsonType, StringType
|
|
14
|
+
from pixeltable.type_system import ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
|
|
11
15
|
from pixeltable.utils.documents import get_document_handle
|
|
12
16
|
|
|
13
17
|
from .base import ComponentIterator
|
|
@@ -15,6 +19,11 @@ from .base import ComponentIterator
|
|
|
15
19
|
_logger = logging.getLogger('pixeltable')
|
|
16
20
|
|
|
17
21
|
|
|
22
|
+
class Element(enum.Enum):
|
|
23
|
+
TEXT = 1
|
|
24
|
+
IMAGE = 2
|
|
25
|
+
|
|
26
|
+
|
|
18
27
|
class ChunkMetadata(enum.Enum):
|
|
19
28
|
TITLE = 1
|
|
20
29
|
HEADING = 2
|
|
@@ -37,27 +46,28 @@ class DocumentSectionMetadata:
|
|
|
37
46
|
"""Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
|
|
38
47
|
|
|
39
48
|
# html and markdown metadata
|
|
40
|
-
sourceline:
|
|
49
|
+
sourceline: int | None = None
|
|
41
50
|
# the stack of headings up to the most recently observed one;
|
|
42
51
|
# eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
|
|
43
|
-
heading:
|
|
52
|
+
heading: dict[str, str] | None = None
|
|
44
53
|
|
|
45
54
|
# pdf-specific metadata
|
|
46
|
-
page:
|
|
55
|
+
page: int | None = None
|
|
47
56
|
# bounding box as an {x1, y1, x2, y2} dictionary
|
|
48
|
-
bounding_box:
|
|
57
|
+
bounding_box: dict[str, float] | None = None
|
|
49
58
|
|
|
50
59
|
|
|
51
60
|
@dataclasses.dataclass
|
|
52
61
|
class DocumentSection:
|
|
53
62
|
"""A single document chunk, according to some of the splitting criteria"""
|
|
54
63
|
|
|
55
|
-
text:
|
|
56
|
-
|
|
64
|
+
text: str | None = None
|
|
65
|
+
image: PIL.Image.Image | None = None
|
|
66
|
+
metadata: DocumentSectionMetadata | None = None
|
|
57
67
|
|
|
58
68
|
|
|
59
69
|
def _parse_separators(separators: str) -> list[Separator]:
|
|
60
|
-
ret = []
|
|
70
|
+
ret: list[Separator] = []
|
|
61
71
|
for s in separators.split(','):
|
|
62
72
|
clean_s = s.strip().upper()
|
|
63
73
|
if not clean_s:
|
|
@@ -71,7 +81,7 @@ def _parse_separators(separators: str) -> list[Separator]:
|
|
|
71
81
|
|
|
72
82
|
|
|
73
83
|
def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
|
|
74
|
-
ret = []
|
|
84
|
+
ret: list[ChunkMetadata] = []
|
|
75
85
|
for m in metadata.split(','):
|
|
76
86
|
clean_m = m.strip().upper()
|
|
77
87
|
if not clean_m:
|
|
@@ -84,18 +94,22 @@ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
|
|
|
84
94
|
return ret
|
|
85
95
|
|
|
86
96
|
|
|
87
|
-
|
|
88
|
-
|
|
97
|
+
def _parse_elements(elements: list[Literal['text', 'image']]) -> list[Element]:
|
|
98
|
+
result: list[Element] = []
|
|
99
|
+
for e in elements:
|
|
100
|
+
clean_e = e.strip().upper()
|
|
101
|
+
if clean_e not in Element.__members__:
|
|
102
|
+
raise Error(f'Invalid element: `{e}`. Valid elements are: {", ".join(Element.__members__).lower()}')
|
|
103
|
+
result.append(Element[clean_e])
|
|
104
|
+
if len(result) == 0:
|
|
105
|
+
raise Error('elements cannot be empty')
|
|
106
|
+
return result
|
|
89
107
|
|
|
90
|
-
class DocumentSplitter(ComponentIterator):
|
|
91
|
-
"""Iterator over chunks of a document. The document is chunked according to the specified `separators`.
|
|
92
108
|
|
|
93
|
-
|
|
94
|
-
include additional metadata fields if specified in the `metadata` parameter, as explained below.
|
|
109
|
+
_HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
|
|
95
110
|
|
|
96
|
-
Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
|
|
97
|
-
"""
|
|
98
111
|
|
|
112
|
+
class DocumentSplitter(ComponentIterator):
|
|
99
113
|
METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
|
|
100
114
|
ChunkMetadata.TITLE: StringType(nullable=True),
|
|
101
115
|
ChunkMetadata.HEADING: JsonType(nullable=True),
|
|
@@ -104,36 +118,41 @@ class DocumentSplitter(ComponentIterator):
|
|
|
104
118
|
ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
|
|
105
119
|
}
|
|
106
120
|
|
|
121
|
+
_doc_handle: Any
|
|
122
|
+
_separators: list[Separator]
|
|
123
|
+
_elements: list[Element]
|
|
124
|
+
_metadata_fields: list[ChunkMetadata]
|
|
125
|
+
_doc_title: str
|
|
126
|
+
_limit: int
|
|
127
|
+
_skip_tags: list[str]
|
|
128
|
+
_overlap: int
|
|
129
|
+
_tiktoken_encoding: str | None
|
|
130
|
+
_tiktoken_target_model: str | None
|
|
131
|
+
_image_dpi: int
|
|
132
|
+
_image_format: str
|
|
133
|
+
|
|
134
|
+
_sections: Iterator[DocumentSection]
|
|
135
|
+
|
|
107
136
|
def __init__(
|
|
108
137
|
self,
|
|
109
138
|
document: str,
|
|
110
139
|
*,
|
|
111
140
|
separators: str,
|
|
112
|
-
|
|
113
|
-
|
|
141
|
+
elements: list[Literal['text', 'image']] | None = None,
|
|
142
|
+
limit: int | None = None,
|
|
143
|
+
overlap: int | None = None,
|
|
114
144
|
metadata: str = '',
|
|
115
|
-
|
|
116
|
-
tiktoken_encoding:
|
|
117
|
-
tiktoken_target_model:
|
|
145
|
+
skip_tags: list[str] | None = None,
|
|
146
|
+
tiktoken_encoding: str | None = 'cl100k_base',
|
|
147
|
+
tiktoken_target_model: str | None = None,
|
|
148
|
+
image_dpi: int = 300,
|
|
149
|
+
image_format: str = 'png',
|
|
118
150
|
):
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
Args:
|
|
122
|
-
separators: separators to use to chunk the document. Options are:
|
|
123
|
-
`'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
|
|
124
|
-
This may be a comma-separated string, e.g., `'heading,token_limit'`.
|
|
125
|
-
limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
|
|
126
|
-
or `'char_limit'` is specified.
|
|
127
|
-
metadata: additional metadata fields to include in the output. Options are:
|
|
128
|
-
`'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
|
|
129
|
-
(PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
|
|
130
|
-
"""
|
|
131
|
-
if html_skip_tags is None:
|
|
132
|
-
html_skip_tags = ['nav']
|
|
151
|
+
if skip_tags is None:
|
|
152
|
+
skip_tags = ['nav']
|
|
133
153
|
self._doc_handle = get_document_handle(document)
|
|
154
|
+
self._elements = _parse_elements(elements.copy()) if elements is not None else [Element.TEXT]
|
|
134
155
|
assert self._doc_handle is not None
|
|
135
|
-
# calling the output_schema method to validate the input arguments
|
|
136
|
-
self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
|
|
137
156
|
self._separators = _parse_separators(separators)
|
|
138
157
|
self._metadata_fields = _parse_metadata(metadata)
|
|
139
158
|
if self._doc_handle.bs_doc is not None:
|
|
@@ -145,10 +164,12 @@ class DocumentSplitter(ComponentIterator):
|
|
|
145
164
|
else:
|
|
146
165
|
self._doc_title = ''
|
|
147
166
|
self._limit = 0 if limit is None else limit
|
|
148
|
-
self._skip_tags =
|
|
167
|
+
self._skip_tags = skip_tags
|
|
149
168
|
self._overlap = 0 if overlap is None else overlap
|
|
150
169
|
self._tiktoken_encoding = tiktoken_encoding
|
|
151
170
|
self._tiktoken_target_model = tiktoken_target_model
|
|
171
|
+
self._image_dpi = image_dpi
|
|
172
|
+
self._image_format = image_format
|
|
152
173
|
|
|
153
174
|
# set up processing pipeline
|
|
154
175
|
if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
|
|
@@ -178,19 +199,28 @@ class DocumentSplitter(ComponentIterator):
|
|
|
178
199
|
return {
|
|
179
200
|
'document': DocumentType(nullable=False),
|
|
180
201
|
'separators': StringType(nullable=False),
|
|
202
|
+
'elements': JsonType(nullable=False),
|
|
181
203
|
'metadata': StringType(nullable=False),
|
|
182
204
|
'limit': IntType(nullable=True),
|
|
183
205
|
'overlap': IntType(nullable=True),
|
|
184
206
|
'skip_tags': StringType(nullable=True),
|
|
185
207
|
'tiktoken_encoding': StringType(nullable=True),
|
|
186
208
|
'tiktoken_target_model': StringType(nullable=True),
|
|
209
|
+
'image_dpi': IntType(nullable=True),
|
|
210
|
+
'image_format': StringType(nullable=True),
|
|
187
211
|
}
|
|
188
212
|
|
|
189
213
|
@classmethod
|
|
190
214
|
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
|
|
191
|
-
schema: dict[str, ColumnType] = {
|
|
192
|
-
|
|
193
|
-
|
|
215
|
+
schema: dict[str, ColumnType] = {}
|
|
216
|
+
elements = _parse_elements(kwargs.get('elements', ['text']))
|
|
217
|
+
for element in elements:
|
|
218
|
+
if element == Element.TEXT:
|
|
219
|
+
schema['text'] = StringType(nullable=False)
|
|
220
|
+
elif element == Element.IMAGE:
|
|
221
|
+
schema['image'] = ImageType(nullable=False)
|
|
222
|
+
|
|
223
|
+
md_fields = _parse_metadata(kwargs.get('metadata', ''))
|
|
194
224
|
for md_field in md_fields:
|
|
195
225
|
schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
|
|
196
226
|
|
|
@@ -200,6 +230,8 @@ class DocumentSplitter(ComponentIterator):
|
|
|
200
230
|
limit = kwargs.get('limit')
|
|
201
231
|
overlap = kwargs.get('overlap')
|
|
202
232
|
|
|
233
|
+
if Element.IMAGE in elements and separators != [Separator.PAGE]:
|
|
234
|
+
raise Error('Image elements are only supported for the "page" separator on PDF documents')
|
|
203
235
|
if limit is not None or overlap is not None:
|
|
204
236
|
if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
|
|
205
237
|
raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
|
|
@@ -213,7 +245,6 @@ class DocumentSplitter(ComponentIterator):
|
|
|
213
245
|
if kwargs.get('limit') is None:
|
|
214
246
|
raise Error('limit is required with "token_limit"/"char_limit" separators')
|
|
215
247
|
|
|
216
|
-
# check dependencies at the end
|
|
217
248
|
if Separator.SENTENCE in separators:
|
|
218
249
|
_ = Env.get().spacy_nlp
|
|
219
250
|
if Separator.TOKEN_LIMIT in separators:
|
|
@@ -224,9 +255,15 @@ class DocumentSplitter(ComponentIterator):
|
|
|
224
255
|
def __next__(self) -> dict[str, Any]:
|
|
225
256
|
while True:
|
|
226
257
|
section = next(self._sections)
|
|
227
|
-
if section.text is None:
|
|
258
|
+
if section.text is None and section.image is None:
|
|
228
259
|
continue
|
|
229
|
-
result: dict[str, Any] = {
|
|
260
|
+
result: dict[str, Any] = {}
|
|
261
|
+
for element in self._elements:
|
|
262
|
+
if element == Element.TEXT:
|
|
263
|
+
result['text'] = section.text
|
|
264
|
+
elif element == Element.IMAGE:
|
|
265
|
+
result['image'] = section.image
|
|
266
|
+
|
|
230
267
|
for md_field in self._metadata_fields:
|
|
231
268
|
if md_field == ChunkMetadata.TITLE:
|
|
232
269
|
result[md_field.name.lower()] = self._doc_title
|
|
@@ -238,6 +275,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
238
275
|
result[md_field.name.lower()] = section.metadata.page
|
|
239
276
|
elif md_field == ChunkMetadata.BOUNDING_BOX:
|
|
240
277
|
result[md_field.name.lower()] = section.metadata.bounding_box
|
|
278
|
+
|
|
241
279
|
return result
|
|
242
280
|
|
|
243
281
|
def _html_sections(self) -> Iterator[DocumentSection]:
|
|
@@ -273,7 +311,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
273
311
|
yield DocumentSection(text=full_text, metadata=md)
|
|
274
312
|
accumulated_text = []
|
|
275
313
|
|
|
276
|
-
def process_element(el:
|
|
314
|
+
def process_element(el: Tag | NavigableString) -> Iterator[DocumentSection]:
|
|
277
315
|
# process the element and emit sections as necessary
|
|
278
316
|
nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
|
|
279
317
|
|
|
@@ -361,43 +399,35 @@ class DocumentSplitter(ComponentIterator):
|
|
|
361
399
|
yield from emit()
|
|
362
400
|
|
|
363
401
|
def _pdf_sections(self) -> Iterator[DocumentSection]:
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
402
|
+
if Separator.PARAGRAPH in self._separators:
|
|
403
|
+
raise Error(
|
|
404
|
+
'Paragraph splitting is not currently supported for PDF documents. Please contact'
|
|
405
|
+
' us at https://github.com/pixeltable/pixeltable/issues if you need this feature.'
|
|
406
|
+
)
|
|
369
407
|
|
|
370
|
-
|
|
371
|
-
|
|
408
|
+
doc: PdfDocument = self._doc_handle.pdf_doc
|
|
409
|
+
assert isinstance(doc, PdfDocument)
|
|
372
410
|
|
|
373
|
-
|
|
411
|
+
emit_on_page = Separator.PAGE in self._separators
|
|
412
|
+
accumulated_text: list[str] = []
|
|
374
413
|
|
|
375
|
-
def
|
|
376
|
-
fixed = ftfy.fix_text(
|
|
414
|
+
def _add_cleaned(raw: str) -> None:
|
|
415
|
+
fixed = ftfy.fix_text(raw)
|
|
377
416
|
if fixed:
|
|
378
417
|
accumulated_text.append(fixed)
|
|
379
418
|
|
|
380
419
|
def _emit_text() -> str:
|
|
381
|
-
|
|
420
|
+
txt = ''.join(accumulated_text)
|
|
382
421
|
accumulated_text.clear()
|
|
383
|
-
return
|
|
384
|
-
|
|
385
|
-
for
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
_add_cleaned_text(text)
|
|
393
|
-
if accumulated_text and emit_on_paragraph:
|
|
394
|
-
bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
|
|
395
|
-
metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
|
|
396
|
-
yield DocumentSection(text=_emit_text(), metadata=metadata)
|
|
397
|
-
|
|
398
|
-
if accumulated_text and emit_on_page and not emit_on_paragraph:
|
|
399
|
-
yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(page=page_number))
|
|
400
|
-
accumulated_text = []
|
|
422
|
+
return txt
|
|
423
|
+
|
|
424
|
+
for page_idx, page in enumerate(doc):
|
|
425
|
+
img = page.render().to_pil() if Element.IMAGE in self._elements else None
|
|
426
|
+
text = page.get_textpage().get_text_bounded()
|
|
427
|
+
_add_cleaned(text)
|
|
428
|
+
if accumulated_text and emit_on_page:
|
|
429
|
+
md = DocumentSectionMetadata(page=page_idx)
|
|
430
|
+
yield DocumentSection(text=_emit_text(), image=img, metadata=md)
|
|
401
431
|
|
|
402
432
|
if accumulated_text and not emit_on_page:
|
|
403
433
|
yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
|
|
@@ -465,5 +495,9 @@ class DocumentSplitter(ComponentIterator):
|
|
|
465
495
|
def close(self) -> None:
|
|
466
496
|
pass
|
|
467
497
|
|
|
468
|
-
|
|
469
|
-
|
|
498
|
+
@classmethod
|
|
499
|
+
@deprecated(
|
|
500
|
+
'create() is deprecated; use `pixeltable.functions.document.document_splitter` instead', version='0.5.6'
|
|
501
|
+
)
|
|
502
|
+
def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
|
|
503
|
+
return super()._create(**kwargs)
|
pixeltable/iterators/image.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from typing import Any, Sequence
|
|
2
2
|
|
|
3
3
|
import PIL.Image
|
|
4
|
+
from deprecated import deprecated
|
|
4
5
|
|
|
5
6
|
import pixeltable.exceptions as excs
|
|
6
7
|
import pixeltable.type_system as ts
|
|
@@ -8,18 +9,6 @@ from pixeltable.iterators.base import ComponentIterator
|
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
class TileIterator(ComponentIterator):
|
|
11
|
-
"""
|
|
12
|
-
Iterator over tiles of an image. Each image will be divided into tiles of size `tile_size`, and the tiles will be
|
|
13
|
-
iterated over in row-major order (left-to-right, then top-to-bottom). An optional `overlap` parameter may be
|
|
14
|
-
specified. If the tiles do not exactly cover the image, then the rightmost and bottommost tiles will be padded with
|
|
15
|
-
blackspace, so that the output images all have the exact size `tile_size`.
|
|
16
|
-
|
|
17
|
-
Args:
|
|
18
|
-
image: Image to split into tiles.
|
|
19
|
-
tile_size: Size of each tile, as a pair of integers `[width, height]`.
|
|
20
|
-
overlap: Amount of overlap between adjacent tiles, as a pair of integers `[width, height]`.
|
|
21
|
-
"""
|
|
22
|
-
|
|
23
12
|
__image: PIL.Image.Image
|
|
24
13
|
__tile_size: Sequence[int]
|
|
25
14
|
__overlap: Sequence[int]
|
|
@@ -31,8 +20,7 @@ class TileIterator(ComponentIterator):
|
|
|
31
20
|
__j: int
|
|
32
21
|
|
|
33
22
|
def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
|
|
34
|
-
|
|
35
|
-
raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
|
|
23
|
+
assert overlap[0] < tile_size[0] and overlap[1] < tile_size[1]
|
|
36
24
|
|
|
37
25
|
self.__image = image
|
|
38
26
|
self.__image.load()
|
|
@@ -69,7 +57,7 @@ class TileIterator(ComponentIterator):
|
|
|
69
57
|
def close(self) -> None:
|
|
70
58
|
pass
|
|
71
59
|
|
|
72
|
-
def set_pos(self, pos: int) -> None:
|
|
60
|
+
def set_pos(self, pos: int, **kwargs: Any) -> None:
|
|
73
61
|
self.__j = pos // self.__xlen
|
|
74
62
|
self.__i = pos % self.__xlen
|
|
75
63
|
|
|
@@ -79,4 +67,13 @@ class TileIterator(ComponentIterator):
|
|
|
79
67
|
|
|
80
68
|
@classmethod
|
|
81
69
|
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
70
|
+
tile_size = kwargs.get('tile_size')
|
|
71
|
+
overlap = kwargs.get('overlap', (0, 0))
|
|
72
|
+
if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
|
|
73
|
+
raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
|
|
82
74
|
return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']
|
|
75
|
+
|
|
76
|
+
@classmethod
|
|
77
|
+
@deprecated('create() is deprecated; use `pixeltable.functions.image.tile_iterator` instead', version='0.5.6')
|
|
78
|
+
def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
|
|
79
|
+
return super()._create(**kwargs)
|
pixeltable/iterators/string.py
CHANGED
|
@@ -1,12 +1,17 @@
|
|
|
1
1
|
from typing import Any, Iterator
|
|
2
2
|
|
|
3
|
+
from deprecated import deprecated
|
|
4
|
+
|
|
3
5
|
from pixeltable import exceptions as excs, type_system as ts
|
|
4
6
|
from pixeltable.env import Env
|
|
5
7
|
from pixeltable.iterators.base import ComponentIterator
|
|
6
8
|
|
|
7
9
|
|
|
8
10
|
class StringSplitter(ComponentIterator):
|
|
9
|
-
|
|
11
|
+
_text: str
|
|
12
|
+
doc: Any # spacy doc
|
|
13
|
+
iter: Iterator[dict[str, Any]]
|
|
14
|
+
|
|
10
15
|
def __init__(self, text: str, *, separators: str):
|
|
11
16
|
if separators != 'sentence':
|
|
12
17
|
raise excs.Error('Only `sentence` separators are currently supported.')
|
|
@@ -24,9 +29,6 @@ class StringSplitter(ComponentIterator):
|
|
|
24
29
|
def close(self) -> None:
|
|
25
30
|
pass
|
|
26
31
|
|
|
27
|
-
def set_pos(self, pos: int) -> None:
|
|
28
|
-
pass
|
|
29
|
-
|
|
30
32
|
@classmethod
|
|
31
33
|
def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
|
|
32
34
|
return {'text': ts.StringType(), 'separators': ts.StringType()}
|
|
@@ -34,3 +36,8 @@ class StringSplitter(ComponentIterator):
|
|
|
34
36
|
@classmethod
|
|
35
37
|
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
36
38
|
return {'text': ts.StringType()}, []
|
|
39
|
+
|
|
40
|
+
@classmethod
|
|
41
|
+
@deprecated('create() is deprecated; use `pixeltable.functions.string.string_splitter` instead', version='0.5.6')
|
|
42
|
+
def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
|
|
43
|
+
return super()._create(**kwargs)
|