pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- pixeltable/__init__.py +20 -9
- pixeltable/__version__.py +3 -0
- pixeltable/catalog/column.py +23 -7
- pixeltable/catalog/insertable_table.py +32 -19
- pixeltable/catalog/table.py +210 -20
- pixeltable/catalog/table_version.py +272 -111
- pixeltable/catalog/table_version_path.py +6 -1
- pixeltable/dataframe.py +184 -110
- pixeltable/datatransfer/__init__.py +1 -0
- pixeltable/datatransfer/label_studio.py +526 -0
- pixeltable/datatransfer/remote.py +113 -0
- pixeltable/env.py +213 -79
- pixeltable/exec/__init__.py +2 -1
- pixeltable/exec/data_row_batch.py +6 -7
- pixeltable/exec/expr_eval_node.py +28 -28
- pixeltable/exec/sql_scan_node.py +7 -6
- pixeltable/exprs/__init__.py +4 -3
- pixeltable/exprs/column_ref.py +11 -2
- pixeltable/exprs/comparison.py +39 -1
- pixeltable/exprs/data_row.py +7 -0
- pixeltable/exprs/expr.py +26 -19
- pixeltable/exprs/function_call.py +17 -18
- pixeltable/exprs/globals.py +14 -2
- pixeltable/exprs/image_member_access.py +9 -28
- pixeltable/exprs/in_predicate.py +96 -0
- pixeltable/exprs/inline_array.py +13 -11
- pixeltable/exprs/inline_dict.py +15 -13
- pixeltable/exprs/row_builder.py +7 -1
- pixeltable/exprs/similarity_expr.py +67 -0
- pixeltable/ext/functions/whisperx.py +30 -0
- pixeltable/ext/functions/yolox.py +16 -0
- pixeltable/func/__init__.py +0 -2
- pixeltable/func/aggregate_function.py +5 -2
- pixeltable/func/callable_function.py +57 -13
- pixeltable/func/expr_template_function.py +14 -3
- pixeltable/func/function.py +35 -4
- pixeltable/func/signature.py +5 -15
- pixeltable/func/udf.py +8 -12
- pixeltable/functions/fireworks.py +9 -4
- pixeltable/functions/huggingface.py +48 -5
- pixeltable/functions/openai.py +49 -11
- pixeltable/functions/pil/image.py +61 -64
- pixeltable/functions/together.py +32 -6
- pixeltable/functions/util.py +0 -43
- pixeltable/functions/video.py +46 -8
- pixeltable/globals.py +443 -0
- pixeltable/index/__init__.py +1 -0
- pixeltable/index/base.py +9 -2
- pixeltable/index/btree.py +54 -0
- pixeltable/index/embedding_index.py +91 -15
- pixeltable/io/__init__.py +4 -0
- pixeltable/io/globals.py +59 -0
- pixeltable/{utils → io}/hf_datasets.py +48 -17
- pixeltable/io/pandas.py +148 -0
- pixeltable/{utils → io}/parquet.py +58 -33
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/iterators/base.py +8 -4
- pixeltable/iterators/document.py +225 -93
- pixeltable/iterators/video.py +16 -9
- pixeltable/metadata/__init__.py +8 -4
- pixeltable/metadata/converters/convert_12.py +3 -0
- pixeltable/metadata/converters/convert_13.py +41 -0
- pixeltable/metadata/converters/convert_14.py +13 -0
- pixeltable/metadata/converters/convert_15.py +29 -0
- pixeltable/metadata/converters/util.py +63 -0
- pixeltable/metadata/schema.py +12 -6
- pixeltable/plan.py +11 -24
- pixeltable/store.py +16 -23
- pixeltable/tool/create_test_db_dump.py +49 -14
- pixeltable/type_system.py +27 -58
- pixeltable/utils/coco.py +94 -0
- pixeltable/utils/documents.py +42 -12
- pixeltable/utils/http_server.py +70 -0
- pixeltable-0.2.7.dist-info/METADATA +137 -0
- pixeltable-0.2.7.dist-info/RECORD +126 -0
- {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
- pixeltable/client.py +0 -600
- pixeltable/exprs/image_similarity_predicate.py +0 -58
- pixeltable/func/batched_function.py +0 -53
- pixeltable/func/nos_function.py +0 -202
- pixeltable/tests/conftest.py +0 -171
- pixeltable/tests/ext/test_yolox.py +0 -21
- pixeltable/tests/functions/test_fireworks.py +0 -43
- pixeltable/tests/functions/test_functions.py +0 -60
- pixeltable/tests/functions/test_huggingface.py +0 -158
- pixeltable/tests/functions/test_openai.py +0 -162
- pixeltable/tests/functions/test_together.py +0 -112
- pixeltable/tests/test_audio.py +0 -65
- pixeltable/tests/test_catalog.py +0 -27
- pixeltable/tests/test_client.py +0 -21
- pixeltable/tests/test_component_view.py +0 -379
- pixeltable/tests/test_dataframe.py +0 -440
- pixeltable/tests/test_dirs.py +0 -107
- pixeltable/tests/test_document.py +0 -120
- pixeltable/tests/test_exprs.py +0 -802
- pixeltable/tests/test_function.py +0 -332
- pixeltable/tests/test_index.py +0 -138
- pixeltable/tests/test_migration.py +0 -44
- pixeltable/tests/test_nos.py +0 -54
- pixeltable/tests/test_snapshot.py +0 -231
- pixeltable/tests/test_table.py +0 -1343
- pixeltable/tests/test_transactional_directory.py +0 -42
- pixeltable/tests/test_types.py +0 -52
- pixeltable/tests/test_video.py +0 -159
- pixeltable/tests/test_view.py +0 -535
- pixeltable/tests/utils.py +0 -442
- pixeltable/utils/clip.py +0 -18
- pixeltable-0.2.5.dist-info/METADATA +0 -128
- pixeltable-0.2.5.dist-info/RECORD +0 -139
- {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
pixeltable/iterators/document.py
CHANGED

@@ -1,23 +1,25 @@
-from typing import Dict, Any, List, Tuple, Generator, Optional, Iterable
-import logging
 import dataclasses
 import enum
+import logging
+from typing import Dict, Any, List, Tuple, Optional, Iterable, Iterator
 
-
+import ftfy
 
-from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
-from pixeltable.exceptions import Error
 from pixeltable.env import Env
+from pixeltable.exceptions import Error
+from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
 from pixeltable.utils.documents import get_document_handle
-
+from .base import ComponentIterator
 
 _logger = logging.getLogger('pixeltable')
 
 
 class ChunkMetadata(enum.Enum):
     TITLE = 1
-
+    HEADING = 2
     SOURCELINE = 3
+    PAGE = 4
+    BOUNDING_BOX = 5
 
 
 class Separator(enum.Enum):
@@ -26,52 +28,115 @@ class Separator(enum.Enum):
     SENTENCE = 3
     TOKEN_LIMIT = 4
     CHAR_LIMIT = 5
+    PAGE = 6
 
 
 @dataclasses.dataclass
-class
+class DocumentSectionMetadata:
     """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
-
-
+    # html and markdown metadata
+    sourceline: Optional[int] = None
     # the stack of headings up to the most recently observed one;
     # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-
+    heading: Optional[Dict[int, str]] = None
+
+    # pdf-specific metadata
+    page: Optional[int] = None
+    # bounding box as an {x1, y1, x2, y2} dictionary
+    bounding_box: Optional[Dict[str, float]] = None
 
 
 @dataclasses.dataclass
 class DocumentSection:
     """A single document chunk, according to some of the splitting criteria"""
     text: Optional[str]
-
+    metadata: Optional[DocumentSectionMetadata]
+
+
+def _parse_separators(separators: str) -> List[Separator]:
+    ret = []
+    for s in separators.split(','):
+        clean_s = s.strip().upper()
+        if not clean_s:
+            continue
+        if clean_s not in Separator.__members__:
+            raise Error(
+                f'Invalid separator: `{s.strip()}`. Valid separators are: {", ".join(Separator.__members__).lower()}'
+            )
+        ret.append(Separator[clean_s])
+    return ret
+
+
+def _parse_metadata(metadata: str) -> List[ChunkMetadata]:
+    ret = []
+    for m in metadata.split(','):
+        clean_m = m.strip().upper()
+        if not clean_m:
+            continue
+        if clean_m not in ChunkMetadata.__members__:
+            raise Error(
+                f'Invalid metadata: `{m.strip()}`. Valid metadata are: {", ".join(ChunkMetadata.__members__).lower()}'
+            )
+        ret.append(ChunkMetadata[clean_m])
+    return ret
+
+
+_HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
 
 
 class DocumentSplitter(ComponentIterator):
-    """
-
-
-
-
+    """Iterator over chunks of a document. The document is chunked according to the specified `separators`.
+
+    The iterator yields a `text` field containing the text of the chunk, and it may also
+    include additional metadata fields if specified in the `metadata` parameter, as explained below.
+
+    Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
+    """
+    METADATA_COLUMN_TYPES = {
+        ChunkMetadata.TITLE: StringType(nullable=True),
+        ChunkMetadata.HEADING: JsonType(nullable=True),
+        ChunkMetadata.SOURCELINE: IntType(nullable=True),
+        ChunkMetadata.PAGE: IntType(nullable=True),
+        ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
     }
 
     def __init__(
-        self, document: str, *, separators: str, limit: int =
-
+        self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None,
+        metadata: str = '',
+        html_skip_tags: Optional[list[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
         tiktoken_target_model: Optional[str] = None
     ):
-
+        """Init method for `DocumentSplitter` class.
+
+        Args:
+            separators: separators to use to chunk the document. Options are:
+                `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
+                This may be a comma-separated string, e.g., `'heading,token_limit'`.
+            limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
+                or `'char_limit'` is specified.
+            metadata: additional metadata fields to include in the output. Options are:
+                `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
+                (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
+        """
         if html_skip_tags is None:
            html_skip_tags = ['nav']
-
-
-
-
-        self._separators =
-        self.
-        self.
-
-
+        self._doc_handle = get_document_handle(document)
+        assert self._doc_handle is not None
+        # calling the output_schema method to validate the input arguments
+        self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
+        self._separators = _parse_separators(separators)
+        self._metadata_fields = _parse_metadata(metadata)
+        if self._doc_handle.bs_doc is not None:
+            title = self._doc_handle.bs_doc.title
+            if title is None:
+                self._doc_title = ''
+            else:
+                self._doc_title = ftfy.fix_text(title.get_text().strip())
+        else:
+            self._doc_title = ''
+        self._limit = 0 if limit is None else limit
         self._skip_tags = html_skip_tags
-        self._overlap = overlap
+        self._overlap = 0 if overlap is None else overlap
         self._tiktoken_encoding = tiktoken_encoding
         self._tiktoken_target_model = tiktoken_target_model
 
@@ -79,9 +144,15 @@ class DocumentSplitter(ComponentIterator):
         if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
             assert self._doc_handle.bs_doc is not None
             self._sections = self._html_sections()
-
+        elif self._doc_handle.format == DocumentType.DocumentFormat.MD:
             assert self._doc_handle.md_ast is not None
             self._sections = self._markdown_sections()
+        elif self._doc_handle.format == DocumentType.DocumentFormat.PDF:
+            assert self._doc_handle.pdf_doc is not None
+            self._sections = self._pdf_sections()
+        else:
+            assert False, f'unknown document format: {self._doc_handle.format}'
+
         if Separator.SENTENCE in self._separators:
             self._sections = self._sentence_sections(self._sections)
         if Separator.TOKEN_LIMIT in self._separators:
@@ -105,38 +176,36 @@ class DocumentSplitter(ComponentIterator):
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
         schema = {'text': StringType()}
-        if 'metadata' in kwargs
-
-
-
-                    raise Error(f'Invalid metadata field {md_field}')
-                schema[md_field.lower()] = cls.MD_COLUMN_TYPES[ChunkMetadata[md_field.upper()]]
+        md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
+
+        for md_field in md_fields:
+            schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
 
         assert 'separators' in kwargs
-        separators = kwargs['separators']
-        for separator in separators:
-            if not hasattr(Separator, separator.upper()):
-                raise Error(f'Invalid separator {separator}')
+        separators = _parse_separators(kwargs['separators'])
 
-
-
-            Env.get().require_package('spacy')
-        if 'token_limit' in separators:
-            Env.get().require_package('tiktoken')
+        limit = kwargs.get('limit')
+        overlap = kwargs.get('overlap')
 
-        if
-        if
+        if limit is not None or overlap is not None:
+            if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
                 raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
-        if
+            if limit is not None and limit <= 0:
                 raise Error('"limit" must be an integer > 0')
-        if
+            if overlap is not None and overlap < 0:
                 raise Error('"overlap" must be an integer >= 0')
-        if
-        if
+        if Separator.TOKEN_LIMIT in separators or Separator.CHAR_LIMIT in separators:
+            if Separator.TOKEN_LIMIT in separators and Separator.CHAR_LIMIT in separators:
                 raise Error('Cannot specify both "token_limit" and "char_limit" separators')
-            if 'limit'
+            if kwargs.get('limit') is None:
                 raise Error('limit is required with "token_limit"/"char_limit" separators')
 
+        # check dependencies at the end
+        if Separator.SENTENCE in separators:
+            Env.get().require_package('spacy')
+        if Separator.TOKEN_LIMIT in separators:
+            Env.get().require_package('tiktoken')
+
         return schema, []
 
     def __next__(self) -> Dict[str, Any]:
@@ -145,47 +214,55 @@ class DocumentSplitter(ComponentIterator):
             if section.text is None:
                 continue
             result = {'text': section.text}
-            for md_field in self.
+            for md_field in self._metadata_fields:
                 if md_field == ChunkMetadata.TITLE:
                     result[md_field.name.lower()] = self._doc_title
-                elif md_field == ChunkMetadata.
-                    result[md_field.name.lower()] = section.
+                elif md_field == ChunkMetadata.HEADING:
+                    result[md_field.name.lower()] = section.metadata.heading
                 elif md_field == ChunkMetadata.SOURCELINE:
-                    result[md_field.name.lower()] = section.
+                    result[md_field.name.lower()] = section.metadata.sourceline
+                elif md_field == ChunkMetadata.PAGE:
+                    result[md_field.name.lower()] = section.metadata.page
+                elif md_field == ChunkMetadata.BOUNDING_BOX:
+                    result[md_field.name.lower()] = section.metadata.bounding_box
             return result
 
-    def _html_sections(self) ->
+    def _html_sections(self) -> Iterator[DocumentSection]:
         """Create DocumentSections reflecting the html-specific separators"""
         import bs4
         emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
         emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
         # current state
-
+        accumulated_text = []  # currently accumulated text
+        # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
+
         headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
         sourceline = 0  # most recently seen sourceline
 
-        def
+        def update_metadata(el: bs4.Tag) -> None:
             # update current state
             nonlocal headings, sourceline
             sourceline = el.sourceline
-            if el.name in
+            if el.name in _HTML_HEADINGS:
                 level = int(el.name[1])
                 # remove the previously seen lower levels
-                lower_levels = [l for l in headings
+                lower_levels = [l for l in headings if l > level]
                 for l in lower_levels:
                     del headings[l]
                 headings[level] = el.get_text().strip()
 
         def emit() -> None:
-            nonlocal
-            if len(
-                md =
-
-
-
-
+            nonlocal accumulated_text, headings, sourceline
+            if len(accumulated_text) > 0:
+                md = DocumentSectionMetadata(sourceline=sourceline, heading=headings.copy())
+                full_text = ' '.join(accumulated_text)
+                full_text = ftfy.fix_text(full_text)
+                yield DocumentSection(text=full_text, metadata=md)
+                accumulated_text = []
+
+        def process_element(el: bs4.PageElement) -> Iterator[DocumentSection]:
             # process the element and emit sections as necessary
-            nonlocal
+            nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
            if el.name in self._skip_tags:
                 return
 
@@ -193,30 +270,31 @@ class DocumentSplitter(ComponentIterator):
                 # accumulate text until we see a tag we care about
                 text = el.get_text().strip()
                 if len(text) > 0:
-
+                    accumulated_text.append(text)
                 return
 
-            if el.name in
+            if el.name in _HTML_HEADINGS:
                 if emit_on_heading:
                     yield from emit()
-
+                update_metadata(el)
             elif el.name == 'p':
                 if emit_on_paragraph:
                     yield from emit()
-
+                update_metadata(el)
             for child in el.children:
                 yield from process_element(child)
 
         yield from process_element(self._doc_handle.bs_doc)
         yield from emit()
 
-    def _markdown_sections(self) ->
+    def _markdown_sections(self) -> Iterator[DocumentSection]:
         """Create DocumentSections reflecting the html-specific separators"""
         assert self._doc_handle.md_ast is not None
         emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
         emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
         # current state
-
+        accumulated_text = []  # currently accumulated text
+        # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
         headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
 
         def update_headings(heading: Dict) -> None:
@@ -232,22 +310,22 @@ class DocumentSplitter(ComponentIterator):
                 headings[level] = text
 
         def emit() -> None:
-            nonlocal
-            if len(
-
-                yield DocumentSection(text=
-
+            nonlocal accumulated_text, headings
+            if len(accumulated_text) > 0:
+                metadata = DocumentSectionMetadata(sourceline=0, heading=headings.copy())
+                yield DocumentSection(text=ftfy.fix_text(' '.join(accumulated_text)), metadata=metadata)
+                accumulated_text = []
 
-        def process_element(el: Dict) ->
+        def process_element(el: Dict) -> Iterator[DocumentSection]:
             # process the element and emit sections as necessary
-            nonlocal
+            nonlocal accumulated_text, headings, emit_on_heading, emit_on_paragraph
             assert 'type' in el
 
             if el['type'] == 'text':
                 # accumulate text until we see a separator element
                 text = el['raw'].strip()
                 if len(text) > 0:
-
+                    accumulated_text.append(text)
                 return
 
             if el['type'] == 'heading':
@@ -266,15 +344,57 @@ class DocumentSplitter(ComponentIterator):
             yield from process_element(el)
         yield from emit()
 
-    def
+    def _pdf_sections(self) -> Iterator[DocumentSection]:
+        """Create DocumentSections reflecting the pdf-specific separators"""
+        import fitz
+        doc: fitz.Document = self._doc_handle.pdf_doc
+        assert doc is not None
+
+        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
+        emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph
+
+        accumulated_text = []  # invariant: all elements are ftfy clean and non-empty
+
+        def _add_cleaned_text(raw_text: str) -> None:
+            fixed = ftfy.fix_text(raw_text)
+            if fixed:
+                accumulated_text.append(fixed)
+
+        def _emit_text() -> str:
+            full_text = ''.join(accumulated_text)
+            accumulated_text.clear()
+            return full_text
+
+        for page_number, page in enumerate(doc.pages()):
+            for block in page.get_text('blocks'):
+                # there is no concept of paragraph in pdf, block is the closest thing
+                # we can get (eg a paragraph in text may cut across pages)
+                # see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
+                # other libraries like pdfminer also lack an explicit paragraph concept
+                x1, y1, x2, y2, text, _, _ = block
+                _add_cleaned_text(text)
+                if accumulated_text and emit_on_paragraph:
+                    bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
+                    metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
+                    yield DocumentSection(text=_emit_text(), metadata=metadata)
+
+            if accumulated_text and emit_on_page and not emit_on_paragraph:
+                yield DocumentSection(text=_emit_text(),
+                                      metadata=DocumentSectionMetadata(page=page_number))
+                accumulated_text = []
+
+        if accumulated_text and not emit_on_page:
+            yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
+
+    def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
         """Split the input sections into sentences"""
         for section in input_sections:
             if section.text is not None:
                 doc = Env.get().spacy_nlp(section.text)
                 for sent in doc.sents:
-                    yield DocumentSection(text=sent.text,
+                    yield DocumentSection(text=sent.text, metadata=section.metadata)
 
-    def _token_chunks(self, input: Iterable[DocumentSection]) ->
+    def _token_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
         import tiktoken
         if self._tiktoken_target_model is not None:
             encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
@@ -287,13 +407,25 @@ class DocumentSplitter(ComponentIterator):
                 continue
             tokens = encoding.encode(section.text)
             start_idx = 0
+            text = None
             while start_idx < len(tokens):
                 end_idx = min(start_idx + self._limit, len(tokens))
-
-
-
-
-
+                while end_idx > start_idx:
+                    # find a cutoff point that doesn't cut in the middle of utf8 multi-byte sequences
+                    try:
+                        # check that the truncated data can be properly decoded
+                        text = encoding.decode(tokens[start_idx:end_idx], errors='strict')
+                        break
+                    except UnicodeDecodeError:
+                        # we split the token array at a point where the utf8 encoding is broken
+                        end_idx -= 1
+
+                assert end_idx > start_idx
+                assert text
+                yield DocumentSection(text=text, metadata=section.metadata)
+                start_idx = max(start_idx + 1, end_idx - self._overlap)  # ensure we make progress
+
+    def _char_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
         for section in input:
             if section.text is None:
                 continue
@@ -301,7 +433,7 @@ class DocumentSplitter(ComponentIterator):
             while start_idx < len(section.text):
                 end_idx = min(start_idx + self._limit, len(section.text))
                 text = section.text[start_idx:end_idx]
-                yield DocumentSection(text=text,
+                yield DocumentSection(text=text, metadata=section.metadata)
                 start_idx += self._limit - self._overlap
 
     def close(self) -> None:
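For orientation, a rough usage sketch of the new chunking options (not part of the diff; it assumes the 0.2.x API, and the table and column names are invented):

import pixeltable as pxt
from pixeltable.iterators.document import DocumentSplitter

docs = pxt.create_table('docs', {'doc': pxt.DocumentType()})
docs.insert([{'doc': 'report.pdf'}])

# one output row per chunk; 'page' and 'bounding_box' are the PDF metadata fields added in this release
chunks = pxt.create_view(
    'doc_chunks', docs,
    iterator=DocumentSplitter.create(
        document=docs.doc,
        separators='token_limit', limit=300, overlap=20,
        metadata='title,page,bounding_box'))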
pixeltable/iterators/video.py
CHANGED

@@ -1,21 +1,28 @@
-from typing import Dict, Any, List, Tuple
-from pathlib import Path
-import math
 import logging
+import math
+from pathlib import Path
+from typing import Dict, Any, List, Tuple
 
-import cv2
 import PIL.Image
+import cv2
 
-from .base import ComponentIterator
-
-from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
 from pixeltable.exceptions import Error
-
+from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
+from .base import ComponentIterator
 
 _logger = logging.getLogger('pixeltable')
 
+
 class FrameIterator(ComponentIterator):
-
+    """Iterator over frames of a video.
+
+    Args:
+        video: URL or file of the video to use for frame extraction
+        fps: number of frames to extract per second of video. This may be a fractional value, such as 0.5.
+            If set to 0.0, then the native framerate of the video will be used (all frames will be extracted).
+            Default: 0.0
+    """
+    def __init__(self, video: str, *, fps: float = 0.0):
         video_path = Path(video)
         assert video_path.exists() and video_path.is_file()
         self.video_path = video_path
pixeltable/metadata/__init__.py
CHANGED

@@ -10,11 +10,11 @@ import sqlalchemy.orm as orm
 from .schema import SystemInfo, SystemInfoMd
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION =
+VERSION = 16
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
-    """Create the
+    """Create the system metadata record"""
     system_md = SystemInfoMd(schema_version=VERSION)
     record = SystemInfo(md=dataclasses.asdict(system_md))
     with orm.Session(engine, future=True) as session:
@@ -30,17 +30,21 @@ def register_converter(version: int, cb: Callable[[sql.engine.Engine], None]) ->
     global converter_cbs
     converter_cbs[version] = cb
 
+def noop_converter(engine: sql.engine.Engine) -> None:
+    # Converter to use when incrementing the schema version, but without any functional changes
+    pass
+
 # load all converter modules
 for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/converters']):
     importlib.import_module('pixeltable.metadata.converters.' + modname)
 
 def upgrade_md(engine: sql.engine.Engine) -> None:
     """Upgrade the metadata schema to the current version"""
-    with orm.Session(engine
+    with orm.Session(engine) as session:
         system_info = session.query(SystemInfo).one().md
         md_version = system_info['schema_version']
         if md_version == VERSION:
-
+            return
         while md_version < VERSION:
             if md_version not in converter_cbs:
                 raise RuntimeError(f'No metadata converter for version {md_version}')
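The converter registry is keyed by the version being upgraded from: upgrade_md looks up converter_cbs[md_version] and steps the version forward one release at a time until it reaches VERSION. With the new noop_converter, a hypothetical future version bump with no schema changes reduces to a one-liner (sketch, not from the diff):

from pixeltable.metadata import register_converter, noop_converter

# hypothetical: if VERSION were bumped from 16 to 17 with no functional changes,
# version-16 databases would still need a registered upgrade step
register_converter(16, noop_converter)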
pixeltable/metadata/converters/convert_13.py
ADDED

@@ -0,0 +1,41 @@
+import logging
+from typing import Any
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.schema import Table
+
+_logger = logging.getLogger('pixeltable')
+
+
+def convert_13(engine: sql.engine.Engine) -> None:
+    with engine.begin() as conn:
+        for row in conn.execute(sql.select(Table)):
+            id = row[0]
+            md = row[2]
+            updated_md = _update_md(md)
+            if updated_md != md:
+                _logger.info(f'Updating schema for table: {id}')
+                conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_md))
+
+
+# Traverse the schema dictionary and replace instances of `ExplicitBatchedFunction` with
+# `CallableFunction`. DB versions prior to 14 can't contain serialized batched functions,
+# so this is all we need to do.
+def _update_md(md: Any) -> Any:
+    if isinstance(md, dict):
+        updated_md = {}
+        for k, v in md.items():
+            if k == '_classpath' and v == 'pixeltable.func.batched_function.ExplicitBatchedFunction':
+                updated_md[k] = 'pixeltable.func.callable_function.CallableFunction'
+            else:
+                updated_md[k] = _update_md(v)
+        return updated_md
+    elif isinstance(md, list):
+        return [_update_md(v) for v in md]
+    else:
+        return md
+
+
+register_converter(13, convert_13)
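To make the recursive rewrite concrete, a toy payload (field names and values invented for illustration) before and after _update_md:

md = {'columns': [{'_classpath': 'pixeltable.func.batched_function.ExplicitBatchedFunction',
                   'batch_size': 16}]}
assert _update_md(md) == {
    'columns': [{'_classpath': 'pixeltable.func.callable_function.CallableFunction',
                 'batch_size': 16}]}

Only '_classpath' values matching the old batched-function classpath are rewritten; everything else passes through unchanged.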
pixeltable/metadata/converters/convert_14.py
ADDED

@@ -0,0 +1,13 @@
+import sqlalchemy as sql
+
+from pixeltable.metadata.schema import Table
+from pixeltable.metadata import register_converter
+
+
+def convert_14(engine: sql.engine.Engine) -> None:
+    default_remotes = {'remotes': []}
+    with engine.begin() as conn:
+        conn.execute(sql.update(Table).where(Table.md['remotes'] == None).values(md=Table.md.concat(default_remotes)))
+
+
+register_converter(14, convert_14)
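The update above uses SQLAlchemy's JSON concatenation to add a default 'remotes' key to every table whose metadata lacks one. A rough per-row equivalent in plain Python (a sketch, not how the converter actually runs):

def add_default_remotes(md: dict) -> dict:
    # mirrors the WHERE clause: only rows without a 'remotes' key are touched
    if 'remotes' not in md:
        md = {**md, 'remotes': []}
    return md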
pixeltable/metadata/converters/convert_15.py
ADDED

@@ -0,0 +1,29 @@
+import uuid
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+def convert_15(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, column_md_updater=update_column_md, remote_md_updater=update_remote_md)
+
+
+def update_column_md(column_md: dict) -> None:
+    column_md['proxy_base'] = None
+
+
+def update_remote_md(remote_md: dict) -> None:
+    remote_md['class'] = f'{remote_md["module"]}.{remote_md["class"]}'
+    del remote_md['module']
+    if remote_md['class'] == 'pixeltable.datatransfer.remote.MockRemote':
+        remote_md['remote_md']['name'] = f'remote_{uuid.uuid4()}'
+    elif remote_md['class'] == 'pixeltable.datatransfer.label_studio.LabelStudioProject':
+        # 'post' is the media_import_method for legacy LabelStudioProject remotes
+        remote_md['remote_md']['media_import_method'] = 'post'
+    else:
+        assert False, remote_md['class']
+
+
+register_converter(15, convert_15)
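A toy before/after for update_remote_md (field values invented) showing the module/class consolidation and the legacy default:

remote_md = {
    'module': 'pixeltable.datatransfer.label_studio',
    'class': 'LabelStudioProject',
    'remote_md': {'project_id': 4},
}
update_remote_md(remote_md)
assert remote_md == {
    'class': 'pixeltable.datatransfer.label_studio.LabelStudioProject',
    'remote_md': {'project_id': 4, 'media_import_method': 'post'},
}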