pixeltable 0.4.17__py3-none-any.whl → 0.4.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic.
- pixeltable/__init__.py +1 -1
- pixeltable/_version.py +1 -0
- pixeltable/catalog/catalog.py +144 -118
- pixeltable/catalog/column.py +104 -115
- pixeltable/catalog/globals.py +1 -2
- pixeltable/catalog/insertable_table.py +44 -49
- pixeltable/catalog/path.py +3 -4
- pixeltable/catalog/schema_object.py +4 -4
- pixeltable/catalog/table.py +139 -124
- pixeltable/catalog/table_metadata.py +6 -6
- pixeltable/catalog/table_version.py +315 -246
- pixeltable/catalog/table_version_handle.py +4 -4
- pixeltable/catalog/table_version_path.py +9 -10
- pixeltable/catalog/tbl_ops.py +9 -3
- pixeltable/catalog/view.py +34 -28
- pixeltable/config.py +14 -10
- pixeltable/dataframe.py +69 -78
- pixeltable/env.py +78 -64
- pixeltable/exec/aggregation_node.py +6 -6
- pixeltable/exec/cache_prefetch_node.py +10 -10
- pixeltable/exec/data_row_batch.py +3 -3
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +5 -5
- pixeltable/exec/expr_eval/evaluators.py +6 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +8 -7
- pixeltable/exec/expr_eval/globals.py +6 -6
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +11 -11
- pixeltable/exec/in_memory_data_node.py +2 -2
- pixeltable/exec/object_store_save_node.py +14 -17
- pixeltable/exec/sql_node.py +28 -27
- pixeltable/exprs/arithmetic_expr.py +4 -4
- pixeltable/exprs/array_slice.py +2 -2
- pixeltable/exprs/column_property_ref.py +3 -3
- pixeltable/exprs/column_ref.py +61 -74
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +3 -3
- pixeltable/exprs/data_row.py +12 -12
- pixeltable/exprs/expr.py +41 -31
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +3 -3
- pixeltable/exprs/function_call.py +14 -14
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +8 -8
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +6 -6
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +14 -14
- pixeltable/exprs/rowid_ref.py +8 -8
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +2 -2
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +3 -3
- pixeltable/func/function.py +15 -17
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +2 -2
- pixeltable/func/query_template_function.py +16 -16
- pixeltable/func/signature.py +14 -14
- pixeltable/func/tools.py +11 -11
- pixeltable/func/udf.py +16 -18
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +7 -7
- pixeltable/functions/audio.py +76 -0
- pixeltable/functions/bedrock.py +6 -6
- pixeltable/functions/deepseek.py +4 -4
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +6 -6
- pixeltable/functions/globals.py +12 -12
- pixeltable/functions/groq.py +4 -4
- pixeltable/functions/huggingface.py +1033 -6
- pixeltable/functions/image.py +7 -10
- pixeltable/functions/llama_cpp.py +7 -7
- pixeltable/functions/math.py +2 -3
- pixeltable/functions/mistralai.py +3 -3
- pixeltable/functions/ollama.py +9 -9
- pixeltable/functions/openai.py +21 -21
- pixeltable/functions/openrouter.py +7 -7
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +7 -8
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/video.py +36 -31
- pixeltable/functions/vision.py +6 -6
- pixeltable/functions/whisper.py +7 -7
- pixeltable/functions/whisperx.py +16 -16
- pixeltable/globals.py +75 -40
- pixeltable/index/base.py +12 -8
- pixeltable/index/btree.py +19 -22
- pixeltable/index/embedding_index.py +30 -39
- pixeltable/io/datarows.py +3 -3
- pixeltable/io/external_store.py +13 -16
- pixeltable/io/fiftyone.py +5 -5
- pixeltable/io/globals.py +5 -5
- pixeltable/io/hf_datasets.py +4 -4
- pixeltable/io/label_studio.py +12 -12
- pixeltable/io/pandas.py +6 -6
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +12 -12
- pixeltable/io/utils.py +2 -2
- pixeltable/iterators/audio.py +2 -2
- pixeltable/iterators/document.py +88 -57
- pixeltable/iterators/video.py +66 -37
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_38.py +2 -2
- pixeltable/metadata/converters/convert_39.py +1 -2
- pixeltable/metadata/converters/util.py +11 -13
- pixeltable/metadata/schema.py +22 -21
- pixeltable/metadata/utils.py +2 -6
- pixeltable/mypy/mypy_plugin.py +5 -5
- pixeltable/plan.py +32 -34
- pixeltable/share/packager.py +7 -7
- pixeltable/share/publish.py +3 -3
- pixeltable/store.py +126 -41
- pixeltable/type_system.py +43 -46
- pixeltable/utils/__init__.py +1 -2
- pixeltable/utils/arrow.py +4 -4
- pixeltable/utils/av.py +74 -38
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +1 -2
- pixeltable/utils/dbms.py +15 -19
- pixeltable/utils/description_helper.py +2 -3
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +2 -2
- pixeltable/utils/filecache.py +5 -5
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +9 -9
- pixeltable/utils/local_store.py +17 -17
- pixeltable/utils/object_stores.py +59 -43
- pixeltable/utils/s3_store.py +35 -30
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/METADATA +4 -4
- pixeltable-0.4.19.dist-info/RECORD +213 -0
- pixeltable/__version__.py +0 -3
- pixeltable-0.4.17.dist-info/RECORD +0 -211
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/licenses/LICENSE +0 -0
pixeltable/io/table_data_conduit.py CHANGED

@@ -8,7 +8,7 @@ import urllib.parse
 import urllib.request
 from dataclasses import dataclass, field, fields
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal,
+from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, cast

 import numpy as np
 import pandas as pd
@@ -50,15 +50,15 @@ class TableDataConduitFormat(str, enum.Enum):
 @dataclass
 class TableDataConduit:
     source: 'TableDataSource'
-    source_format:
-    source_column_map:
+    source_format: str | None = None
+    source_column_map: dict[str, str] | None = None
     if_row_exists: Literal['update', 'ignore', 'error'] = 'error'
-    pxt_schema:
-    src_schema_overrides:
-    src_schema:
-    pxt_pk:
-    src_pk:
-    valid_rows:
+    pxt_schema: dict[str, ts.ColumnType] | None = None
+    src_schema_overrides: dict[str, ts.ColumnType] | None = None
+    src_schema: dict[str, ts.ColumnType] | None = None
+    pxt_pk: list[str] | None = None
+    src_pk: list[str] | None = None
+    valid_rows: RowData | None = None
     extra_fields: dict[str, Any] = field(default_factory=dict)

     reqd_col_names: set[str] = field(default_factory=set)
@@ -151,7 +151,7 @@ class DFTableDataConduit(TableDataConduit):


 class RowDataTableDataConduit(TableDataConduit):
-    raw_rows:
+    raw_rows: RowData | None = None
     disable_mapping: bool = True
     batch_count: int = 0

@@ -332,7 +332,7 @@ class HFTableDataConduit(TableDataConduit):
     - use set_format('arrow') and convert ChunkedArrays to PIL.Image.Image instead of going through numpy, which is slow
     """

-    column_name_for_split:
+    column_name_for_split: str | None = None
     categorical_features: dict[str, dict[int, str]]
     dataset_dict: dict[str, datasets.Dataset] = None
     hf_schema_source: dict[str, Any] = None
@@ -478,7 +478,7 @@ class HFTableDataConduit(TableDataConduit):


 class ParquetTableDataConduit(TableDataConduit):
-    pq_ds:
+    pq_ds: ParquetDataset | None = None

     @classmethod
     def from_tds(cls, tds: TableDataConduit) -> 'ParquetTableDataConduit':
pixeltable/io/utils.py CHANGED

@@ -1,5 +1,5 @@
 from keyword import iskeyword as is_python_keyword
-from typing import Any
+from typing import Any

 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -40,7 +40,7 @@ def normalize_schema_names(
     primary_key: list[str],
     schema_overrides: dict[str, Any],
     require_valid_pxt_column_names: bool = False,
-) -> tuple[dict[str, Any], list[str],
+) -> tuple[dict[str, Any], list[str], dict[str, str] | None]:
     """
     Convert all names in the input schema from source names to valid Pixeltable identifiers
     - Ensure that all names are unique.
pixeltable/iterators/audio.py CHANGED

@@ -1,7 +1,7 @@
 import logging
 from fractions import Fraction
 from pathlib import Path
-from typing import Any, ClassVar
+from typing import Any, ClassVar

 import av

@@ -37,7 +37,7 @@ class AudioSplitter(ComponentIterator):

     # List of chunks to extract
     # Each chunk is defined by start and end presentation timestamps in audio file (int)
-    chunks_to_extract_in_pts:
+    chunks_to_extract_in_pts: list[tuple[int, int]] | None
     # next chunk to extract
     next_pos: int

pixeltable/iterators/document.py CHANGED

@@ -2,7 +2,7 @@ import dataclasses
 import enum
 import io
 import logging
-from typing import Any, ClassVar, Iterable, Iterator,
+from typing import Any, ClassVar, Iterable, Iterator, Literal

 import fitz  # type: ignore[import-untyped]
 import ftfy
@@ -11,7 +11,7 @@ from bs4.element import NavigableString, Tag

 from pixeltable.env import Env
 from pixeltable.exceptions import Error
-from pixeltable.type_system import
+from pixeltable.type_system import ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
 from pixeltable.utils.documents import get_document_handle

 from .base import ComponentIterator
@@ -19,6 +19,11 @@ from .base import ComponentIterator
 _logger = logging.getLogger('pixeltable')


+class Element(enum.Enum):
+    TEXT = 1
+    IMAGE = 2
+
+
 class ChunkMetadata(enum.Enum):
     TITLE = 1
     HEADING = 2
@@ -41,28 +46,28 @@ class DocumentSectionMetadata:
     """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""

     # html and markdown metadata
-    sourceline:
+    sourceline: int | None = None
     # the stack of headings up to the most recently observed one;
     # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-    heading:
+    heading: dict[str, str] | None = None

     # pdf-specific metadata
-    page:
+    page: int | None = None
     # bounding box as an {x1, y1, x2, y2} dictionary
-    bounding_box:
+    bounding_box: dict[str, float] | None = None


 @dataclasses.dataclass
 class DocumentSection:
     """A single document chunk, according to some of the splitting criteria"""

-    text:
-
-
+    text: str | None = None
+    image: PIL.Image.Image | None = None
+    metadata: DocumentSectionMetadata | None = None


 def _parse_separators(separators: str) -> list[Separator]:
-    ret = []
+    ret: list[Separator] = []
     for s in separators.split(','):
         clean_s = s.strip().upper()
         if not clean_s:
@@ -76,7 +81,7 @@ def _parse_separators(separators: str) -> list[Separator]:


 def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
-    ret = []
+    ret: list[ChunkMetadata] = []
     for m in metadata.split(','):
         clean_m = m.strip().upper()
         if not clean_m:
@@ -89,6 +94,18 @@ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
     return ret


+def _parse_elements(elements: list[Literal['text', 'image']]) -> list[Element]:
+    result: list[Element] = []
+    for e in elements:
+        clean_e = e.strip().upper()
+        if clean_e not in Element.__members__:
+            raise Error(f'Invalid element: `{e}`. Valid elements are: {", ".join(Element.__members__).lower()}')
+        result.append(Element[clean_e])
+    if len(result) == 0:
+        raise Error('elements cannot be empty')
+    return result
+
+
 _HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}


@@ -106,11 +123,16 @@ class DocumentSplitter(ComponentIterator):
        separators: separators to use to chunk the document. Options are:
            `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
            This may be a comma-separated string, e.g., `'heading,token_limit'`.
+       elements: list of elements to extract from the document. Options are:
+           `'text'`, `'image'`. Defaults to `['text']` if not specified. The `'image'` element is only supported
+           for the `'page'` separator on PDF documents.
        limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
            or `'char_limit'` is specified.
        metadata: additional metadata fields to include in the output. Options are:
            `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
            (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
+       image_dpi: DPI to use when extracting images from PDFs. Defaults to 300.
+       image_format: format to use when extracting images from PDFs. Defaults to 'png'.
     """

     METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
@@ -121,34 +143,41 @@ class DocumentSplitter(ComponentIterator):
         ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
     }

+    _doc_handle: Any
+    _separators: list[Separator]
+    _elements: list[Element]
+    _metadata_fields: list[ChunkMetadata]
+    _doc_title: str
+    _limit: int
+    _skip_tags: list[str]
+    _overlap: int
+    _tiktoken_encoding: str | None
+    _tiktoken_target_model: str | None
+    _image_dpi: int
+    _image_format: str
+
+    _sections: Iterator[DocumentSection]
+
     def __init__(
         self,
         document: str,
         *,
         separators: str,
-
-
+        elements: list[Literal['text', 'image']] | None = None,
+        limit: int | None = None,
+        overlap: int | None = None,
         metadata: str = '',
-        html_skip_tags:
-        tiktoken_encoding:
-        tiktoken_target_model:
-
-
-        page_image_dpi: int = 300,
-        page_image_format: str = 'png',
+        html_skip_tags: list[str] | None = None,
+        tiktoken_encoding: str | None = 'cl100k_base',
+        tiktoken_target_model: str | None = None,
+        image_dpi: int = 300,
+        image_format: str = 'png',
     ):
         if html_skip_tags is None:
             html_skip_tags = ['nav']
         self._doc_handle = get_document_handle(document)
+        self._elements = _parse_elements(elements.copy()) if elements is not None else [Element.TEXT]
         assert self._doc_handle is not None
-        # calling the output_schema method to validate the input arguments
-        self.output_schema(
-            separators=separators,
-            metadata=metadata,
-            limit=limit,
-            overlap=overlap,
-            include_page_image=include_page_image,
-        )
         self._separators = _parse_separators(separators)
         self._metadata_fields = _parse_metadata(metadata)
         if self._doc_handle.bs_doc is not None:
@@ -164,10 +193,8 @@ class DocumentSplitter(ComponentIterator):
         self._overlap = 0 if overlap is None else overlap
         self._tiktoken_encoding = tiktoken_encoding
         self._tiktoken_target_model = tiktoken_target_model
-
-        self.
-        self._page_image_dpi = page_image_dpi
-        self._page_image_format = page_image_format
+        self._image_dpi = image_dpi
+        self._image_format = image_format

         # set up processing pipeline
         if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
@@ -197,23 +224,28 @@ class DocumentSplitter(ComponentIterator):
         return {
             'document': DocumentType(nullable=False),
             'separators': StringType(nullable=False),
+            'elements': JsonType(nullable=False),
             'metadata': StringType(nullable=False),
             'limit': IntType(nullable=True),
             'overlap': IntType(nullable=True),
             'skip_tags': StringType(nullable=True),
             'tiktoken_encoding': StringType(nullable=True),
             'tiktoken_target_model': StringType(nullable=True),
-
-            '
-            'page_image_dpi': IntType(nullable=True),
-            'page_image_format': StringType(nullable=True),
+            'image_dpi': IntType(nullable=True),
+            'image_format': StringType(nullable=True),
         }

     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
-        schema: dict[str, ColumnType] = {
-
-
+        schema: dict[str, ColumnType] = {}
+        elements = _parse_elements(kwargs.get('elements', ['text']))
+        for element in elements:
+            if element == Element.TEXT:
+                schema['text'] = StringType(nullable=False)
+            elif element == Element.IMAGE:
+                schema['image'] = ImageType(nullable=False)
+
+        md_fields = _parse_metadata(kwargs.get('metadata', ''))
         for md_field in md_fields:
             schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]

@@ -223,6 +255,8 @@ class DocumentSplitter(ComponentIterator):
         limit = kwargs.get('limit')
         overlap = kwargs.get('overlap')

+        if Element.IMAGE in elements and separators != [Separator.PAGE]:
+            raise Error('Image elements are only supported for the "page" separator on PDF documents')
         if limit is not None or overlap is not None:
             if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
                 raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
@@ -236,23 +270,25 @@ class DocumentSplitter(ComponentIterator):
             if kwargs.get('limit') is None:
                 raise Error('limit is required with "token_limit"/"char_limit" separators')

-        # check dependencies at the end
         if Separator.SENTENCE in separators:
             _ = Env.get().spacy_nlp
         if Separator.TOKEN_LIMIT in separators:
             Env.get().require_package('tiktoken')

-        if kwargs.get('include_page_image'):
-            schema['image'] = ImageType(nullable=True)
-
         return schema, []

     def __next__(self) -> dict[str, Any]:
         while True:
             section = next(self._sections)
-            if section.text is None:
+            if section.text is None and section.image is None:
                 continue
-            result: dict[str, Any] = {
+            result: dict[str, Any] = {}
+            for element in self._elements:
+                if element == Element.TEXT:
+                    result['text'] = section.text
+                elif element == Element.IMAGE:
+                    result['image'] = section.image
+
             for md_field in self._metadata_fields:
                 if md_field == ChunkMetadata.TITLE:
                     result[md_field.name.lower()] = self._doc_title
@@ -265,10 +301,6 @@ class DocumentSplitter(ComponentIterator):
                 elif md_field == ChunkMetadata.BOUNDING_BOX:
                     result[md_field.name.lower()] = section.metadata.bounding_box

-            # FIX: only include image if schema supports it
-            if self._include_page_image:
-                result['image'] = section.image
-
             return result

     def _html_sections(self) -> Iterator[DocumentSection]:
@@ -411,11 +443,10 @@ class DocumentSplitter(ComponentIterator):
             return txt

         for page_idx, page in enumerate(doc.pages()):
-
-
-
-
-            page_image = PIL.Image.open(io.BytesIO(pix.tobytes(self._page_image_format)))
+            img: PIL.Image.Image | None = None
+            if Element.IMAGE in self._elements:
+                pix = page.get_pixmap(dpi=self._image_dpi)
+                img = PIL.Image.open(io.BytesIO(pix.tobytes(self._image_format)))

             for block in page.get_text('blocks'):
                 x1, y1, x2, y2, text, *_ = block
@@ -423,14 +454,14 @@
                 if accumulated_text and emit_on_paragraph:
                     bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
                     md = DocumentSectionMetadata(page=page_idx, bounding_box=bbox)
-                    yield DocumentSection(text=_emit_text(), metadata=md
+                    yield DocumentSection(text=_emit_text(), metadata=md)

             if accumulated_text and emit_on_page and not emit_on_paragraph:
                 md = DocumentSectionMetadata(page=page_idx)
-                yield DocumentSection(text=_emit_text(),
+                yield DocumentSection(text=_emit_text(), image=img, metadata=md)

         if accumulated_text and not emit_on_page:
-            yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata()
+            yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())

     def _txt_sections(self) -> Iterator[DocumentSection]:
         """Create DocumentSections for text files.
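Taken together, the document.py changes replace the old include_page_image/page_image_* arguments with an `elements` parameter: `DocumentSplitter` can now emit the page text, a rendered page image, or both, with images supported only for the `'page'` separator on PDFs. A minimal sketch of how this could be used through Pixeltable's usual view-plus-iterator pattern (table and column names here are hypothetical, not taken from the package):

    import pixeltable as pxt
    from pixeltable.iterators import DocumentSplitter

    # hypothetical base table holding PDF documents
    docs = pxt.create_table('docs', {'document': pxt.Document})

    # one output row per page, carrying both the extracted text and a rendered page image;
    # per the docstring above, 'image' requires the 'page' separator and a PDF input
    pages = pxt.create_view(
        'doc_pages',
        docs,
        iterator=DocumentSplitter.create(
            document=docs.document,
            separators='page',
            elements=['text', 'image'],
            image_dpi=150,       # default is 300
            image_format='png',  # default
        ),
    )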
pixeltable/iterators/video.py CHANGED

@@ -4,7 +4,7 @@ import math
 import subprocess
 from fractions import Fraction
 from pathlib import Path
-from typing import Any, Iterator, Literal
+from typing import Any, Iterator, Literal

 import av
 import pandas as pd
@@ -42,9 +42,9 @@ class FrameIterator(ComponentIterator):
     [Frame](https://pyav.org/docs/develop/api/frame.html)):

     * `index` (`int`)
-    * `pts` (`
-    * `dts` (`
-    * `time` (`
+    * `pts` (`int | None`)
+    * `dts` (`int | None`)
+    * `time` (`float | None`)
     * `is_corrupt` (`bool`)
     * `key_frame` (`bool`)
     * `pict_type` (`int`)
@@ -55,8 +55,8 @@ class FrameIterator(ComponentIterator):

     # Input parameters
     video_path: Path
-    fps:
-    num_frames:
+    fps: float | None
+    num_frames: int | None
     all_frame_attrs: bool

     # Video info
@@ -67,19 +67,14 @@
     video_start_time: int

     # List of frame indices to be extracted, or None to extract all frames
-    frames_to_extract:
+    frames_to_extract: list[int] | None

     # Next frame to extract, as an iterator `pos` index. If `frames_to_extract` is None, this is the same as the
     # frame index in the video. Otherwise, the corresponding video index is `frames_to_extract[next_pos]`.
     next_pos: int

     def __init__(
-        self,
-        video: str,
-        *,
-        fps: Optional[float] = None,
-        num_frames: Optional[int] = None,
-        all_frame_attrs: bool = False,
+        self, video: str, *, fps: float | None = None, num_frames: int | None = None, all_frame_attrs: bool = False
     ):
         if fps is not None and num_frames is not None:
             raise excs.Error('At most one of `fps` or `num_frames` may be specified')
@@ -251,7 +246,8 @@ class VideoSplitter(ComponentIterator):

     # Input parameters
     video_path: Path
-    segment_duration: float
+    segment_duration: float | None
+    segment_times: list[float] | None
     overlap: float
     min_segment_duration: float
     video_encoder: str | None
@@ -268,25 +264,31 @@
         self,
         video: str,
         *,
-        duration: float,
-        overlap: float =
-        min_segment_duration: float =
-
+        duration: float | None = None,
+        overlap: float | None = None,
+        min_segment_duration: float | None = None,
+        segment_times: list[float] | None = None,
+        mode: Literal['fast', 'accurate'] = 'accurate',
         video_encoder: str | None = None,
         video_encoder_args: dict[str, Any] | None = None,
     ):
         Env.get().require_binary('ffmpeg')
-        assert duration
-
-
+        assert (duration is not None) != (segment_times is not None)
+        if segment_times is not None:
+            assert len(segment_times) > 0
+        if duration is not None:
+            assert duration > 0.0
+            assert duration >= min_segment_duration
+            assert overlap is None or overlap < duration

         video_path = Path(video)
         assert video_path.exists() and video_path.is_file()

         self.video_path = video_path
         self.segment_duration = duration
-        self.overlap = overlap
-        self.min_segment_duration = min_segment_duration
+        self.overlap = overlap if overlap is not None else 0.0
+        self.min_segment_duration = min_segment_duration if min_segment_duration is not None else 0.0
+        self.segment_times = segment_times
         self.video_encoder = video_encoder
         self.video_encoder_args = video_encoder_args

@@ -304,6 +306,7 @@ class VideoSplitter(ComponentIterator):
             'duration': ts.FloatType(nullable=True),
             'overlap': ts.FloatType(nullable=True),
             'min_segment_duration': ts.FloatType(nullable=True),
+            'segment_times': ts.JsonType(nullable=True),
             'mode': ts.StringType(nullable=False),
             'video_encoder': ts.StringType(nullable=True),
             'video_encoder_args': ts.JsonType(nullable=True),
@@ -311,23 +314,34 @@

     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
-        param_names = ['duration', 'overlap', 'min_segment_duration']
+        param_names = ['duration', 'overlap', 'min_segment_duration', 'segment_times']
         params = dict(zip(param_names, args))
         params.update(kwargs)

-        segment_duration = params
-
-        overlap = params.get('overlap'
+        segment_duration = params.get('duration')
+        segment_times = params.get('segment_times')
+        overlap = params.get('overlap')
+        min_segment_duration = params.get('min_segment_duration')
         mode = params.get('mode', 'fast')

-        if segment_duration
-            raise excs.Error('
-        if segment_duration
-            raise excs.Error('duration
-        if
+        if segment_duration is None and segment_times is None:
+            raise excs.Error('Must specify either duration or segment_times')
+        if segment_duration is not None and segment_times is not None:
+            raise excs.Error('duration and segment_times cannot both be specified')
+        if segment_times is not None:
+            if len(segment_times) == 0:
+                raise excs.Error('segment_times cannot be empty')
+            if overlap is not None:
+                raise excs.Error('overlap cannot be specified with segment_times')
+        if segment_duration is not None:
+            if segment_duration <= 0.0:
+                raise excs.Error('duration must be a positive number')
+            if min_segment_duration is not None and segment_duration < min_segment_duration:
+                raise excs.Error('duration must be at least min_segment_duration')
+            if overlap is not None and overlap >= segment_duration:
+                raise excs.Error('overlap must be less than duration')
+        if mode == 'accurate' and overlap is not None:
             raise excs.Error("Cannot specify overlap for mode='accurate'")
-        if overlap >= segment_duration:
-            raise excs.Error('overlap must be less than duration')
         if mode == 'fast':
             if params.get('video_encoder') is not None:
                 raise excs.Error("Cannot specify video_encoder for mode='fast'")
@@ -343,13 +357,22 @@
         }, []

     def fast_iter(self) -> Iterator[dict[str, Any]]:
-        segment_path: str
+        segment_path: str = ''
         try:
             start_time = 0.0
             start_pts = 0
+            segment_idx = 0
             while True:
+                target_duration: float | None
+                if self.segment_duration is not None:
+                    target_duration = self.segment_duration
+                elif self.segment_times is not None and segment_idx < len(self.segment_times):
+                    target_duration = self.segment_times[segment_idx] - start_time
+                else:
+                    target_duration = None  # the rest of the video
+
                 segment_path = str(TempStore.create_path(extension='.mp4'))
-                cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time,
+                cmd = av_utils.ffmpeg_clip_cmd(str(self.video_path), segment_path, start_time, target_duration)
                 _ = subprocess.run(cmd, capture_output=True, text=True, check=True)

                 # use the actual duration
@@ -373,8 +396,13 @@
                 start_time = segment_end - self.overlap
                 start_pts = segment_end_pts - round(self.overlap / self.video_time_base)

+                segment_idx += 1
+                if self.segment_times is not None and segment_idx > len(self.segment_times):
+                    # We've created all segments including the final segment after the last segment_time
+                    break
+
         except subprocess.CalledProcessError as e:
-            if Path(segment_path).exists():
+            if segment_path and Path(segment_path).exists():
                 Path(segment_path).unlink()
             error_msg = f'ffmpeg failed with return code {e.returncode}'
             if e.stderr:
@@ -389,6 +417,7 @@
             str(self.video_path),
             output_pattern,
             segment_duration=self.segment_duration,
+            segment_times=self.segment_times,
             video_encoder=self.video_encoder,
             video_encoder_args=self.video_encoder_args,
         )
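The video.py changes add a `segment_times` alternative to `duration` in `VideoSplitter`: rather than cutting fixed-length segments, the video is split at an explicit list of timestamps, with a final segment covering the remainder; `segment_times` cannot be combined with `duration` or `overlap`. A minimal sketch under the same assumptions as above (hypothetical table and column names):

    import pixeltable as pxt
    from pixeltable.iterators import VideoSplitter

    videos = pxt.create_table('videos', {'video': pxt.Video})

    # cut each video at 10s, 30s and 60s; a fourth segment covers whatever remains
    clips = pxt.create_view(
        'video_clips',
        videos,
        iterator=VideoSplitter.create(
            video=videos.video,
            segment_times=[10.0, 30.0, 60.0],
            mode='fast',
        ),
    )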
pixeltable/metadata/converters/convert_18.py CHANGED

@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any

 import sqlalchemy as sql

@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
     convert_table_md(engine, substitution_fn=__substitute_md)


-def __substitute_md(k:
+def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
     # Migrate a few changed function names
     if k == 'path' and v == 'pixeltable.functions.string.str_format':
         return 'path', 'pixeltable.functions.string.format'

pixeltable/metadata/converters/convert_19.py CHANGED

@@ -1,5 +1,5 @@
 import datetime
-from typing import Any
+from typing import Any

 import sqlalchemy as sql

@@ -28,7 +28,7 @@ def _(engine: sql.engine.Engine) -> None:
             conn.execute(sql.text(f'ALTER TABLE {store_name} ALTER COLUMN col_{col_id} TYPE TIMESTAMPTZ'))


-def __update_timestamp_literals(k: Any, v: Any) ->
+def __update_timestamp_literals(k: Any, v: Any) -> tuple[Any, Any] | None:
     if isinstance(v, dict) and 'val_t' in v:
         # It's a literal with an explicit 'val_t' field. In version 19 this can only mean a
         # timestamp literal, which (in version 19) is stored in the DB as a naive datetime.

pixeltable/metadata/converters/convert_20.py CHANGED

@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any

 import sqlalchemy as sql

@@ -11,7 +11,7 @@ def _(engine: sql.engine.Engine) -> None:
     convert_table_md(engine, substitution_fn=__substitute_md)


-def __substitute_md(k:
+def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
     if isinstance(v, dict) and '_classname' in v:
         # The way InlineArray is represented changed in v20. Previously, literal values were stored
         # directly in the Inline expr; now we store them in Literal sub-exprs. This converter

pixeltable/metadata/converters/convert_21.py CHANGED

@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any

 import sqlalchemy as sql

@@ -24,7 +24,7 @@ def __update_schema_column(schema_column: dict) -> None:
     schema_column['media_validation'] = None


-def __substitute_md(k:
+def __substitute_md(k: str | None, v: Any) -> tuple[str | None, Any] | None:
     if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ColumnRef':
         if 'perform_validation' not in v:
             v['perform_validation'] = False