pixeltable 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +18 -9
- pixeltable/__version__.py +3 -0
- pixeltable/catalog/column.py +31 -50
- pixeltable/catalog/insertable_table.py +7 -6
- pixeltable/catalog/table.py +171 -57
- pixeltable/catalog/table_version.py +417 -140
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/dataframe.py +239 -121
- pixeltable/env.py +82 -16
- pixeltable/exec/__init__.py +2 -1
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/data_row_batch.py +6 -7
- pixeltable/exec/expr_eval_node.py +28 -28
- pixeltable/exec/in_memory_data_node.py +11 -7
- pixeltable/exec/sql_scan_node.py +7 -6
- pixeltable/exprs/__init__.py +4 -3
- pixeltable/exprs/column_ref.py +9 -0
- pixeltable/exprs/comparison.py +3 -3
- pixeltable/exprs/data_row.py +5 -1
- pixeltable/exprs/expr.py +15 -7
- pixeltable/exprs/function_call.py +17 -15
- pixeltable/exprs/image_member_access.py +9 -28
- pixeltable/exprs/in_predicate.py +96 -0
- pixeltable/exprs/inline_array.py +13 -11
- pixeltable/exprs/inline_dict.py +15 -13
- pixeltable/exprs/literal.py +16 -4
- pixeltable/exprs/row_builder.py +15 -41
- pixeltable/exprs/similarity_expr.py +65 -0
- pixeltable/ext/__init__.py +5 -0
- pixeltable/ext/functions/yolox.py +92 -0
- pixeltable/func/__init__.py +0 -2
- pixeltable/func/aggregate_function.py +18 -15
- pixeltable/func/callable_function.py +57 -13
- pixeltable/func/expr_template_function.py +20 -3
- pixeltable/func/function.py +35 -4
- pixeltable/func/globals.py +24 -14
- pixeltable/func/signature.py +23 -27
- pixeltable/func/udf.py +13 -12
- pixeltable/functions/__init__.py +8 -8
- pixeltable/functions/eval.py +7 -8
- pixeltable/functions/huggingface.py +64 -17
- pixeltable/functions/openai.py +36 -3
- pixeltable/functions/pil/image.py +61 -64
- pixeltable/functions/together.py +21 -0
- pixeltable/functions/util.py +11 -0
- pixeltable/globals.py +425 -0
- pixeltable/index/__init__.py +2 -0
- pixeltable/index/base.py +51 -0
- pixeltable/index/embedding_index.py +168 -0
- pixeltable/io/__init__.py +3 -0
- pixeltable/{utils → io}/hf_datasets.py +48 -17
- pixeltable/io/pandas.py +148 -0
- pixeltable/{utils → io}/parquet.py +58 -33
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/iterators/base.py +4 -0
- pixeltable/iterators/document.py +218 -97
- pixeltable/iterators/video.py +8 -9
- pixeltable/metadata/__init__.py +7 -3
- pixeltable/metadata/converters/convert_12.py +3 -0
- pixeltable/metadata/converters/convert_13.py +41 -0
- pixeltable/metadata/schema.py +45 -22
- pixeltable/plan.py +15 -51
- pixeltable/store.py +38 -41
- pixeltable/tool/create_test_db_dump.py +39 -4
- pixeltable/type_system.py +47 -96
- pixeltable/utils/documents.py +42 -12
- pixeltable/utils/http_server.py +70 -0
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/METADATA +14 -10
- pixeltable-0.2.6.dist-info/RECORD +119 -0
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/WHEEL +1 -1
- pixeltable/client.py +0 -604
- pixeltable/exprs/image_similarity_predicate.py +0 -58
- pixeltable/func/batched_function.py +0 -53
- pixeltable/tests/conftest.py +0 -177
- pixeltable/tests/functions/test_fireworks.py +0 -42
- pixeltable/tests/functions/test_functions.py +0 -60
- pixeltable/tests/functions/test_huggingface.py +0 -158
- pixeltable/tests/functions/test_openai.py +0 -152
- pixeltable/tests/functions/test_together.py +0 -111
- pixeltable/tests/test_audio.py +0 -65
- pixeltable/tests/test_catalog.py +0 -27
- pixeltable/tests/test_client.py +0 -21
- pixeltable/tests/test_component_view.py +0 -370
- pixeltable/tests/test_dataframe.py +0 -439
- pixeltable/tests/test_dirs.py +0 -107
- pixeltable/tests/test_document.py +0 -120
- pixeltable/tests/test_exprs.py +0 -805
- pixeltable/tests/test_function.py +0 -324
- pixeltable/tests/test_migration.py +0 -43
- pixeltable/tests/test_nos.py +0 -54
- pixeltable/tests/test_snapshot.py +0 -208
- pixeltable/tests/test_table.py +0 -1267
- pixeltable/tests/test_transactional_directory.py +0 -42
- pixeltable/tests/test_types.py +0 -22
- pixeltable/tests/test_video.py +0 -159
- pixeltable/tests/test_view.py +0 -530
- pixeltable/tests/utils.py +0 -408
- pixeltable-0.2.4.dist-info/RECORD +0 -132
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/LICENSE +0 -0
pixeltable/iterators/document.py
CHANGED
|
@@ -1,24 +1,24 @@
|
|
|
1
|
-
from typing import Dict, Any, List, Tuple, Generator, Optional, Iterable
|
|
2
|
-
import logging
|
|
3
1
|
import dataclasses
|
|
4
2
|
import enum
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Dict, Any, List, Tuple, Optional, Iterable, Iterator
|
|
5
5
|
|
|
6
|
-
|
|
6
|
+
import ftfy
|
|
7
7
|
|
|
8
|
-
from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
|
|
9
|
-
from pixeltable.exceptions import Error
|
|
10
8
|
from pixeltable.env import Env
|
|
9
|
+
from pixeltable.exceptions import Error
|
|
10
|
+
from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
|
|
11
11
|
from pixeltable.utils.documents import get_document_handle
|
|
12
|
-
|
|
12
|
+
from .base import ComponentIterator
|
|
13
13
|
|
|
14
14
|
_logger = logging.getLogger('pixeltable')
|
|
15
15
|
|
|
16
|
-
|
|
17
16
|
class ChunkMetadata(enum.Enum):
|
|
18
17
|
TITLE = 1
|
|
19
|
-
|
|
18
|
+
HEADING = 2
|
|
20
19
|
SOURCELINE = 3
|
|
21
|
-
|
|
20
|
+
PAGE = 4
|
|
21
|
+
BOUNDING_BOX = 5
|
|
22
22
|
|
|
23
23
|
class Separator(enum.Enum):
|
|
24
24
|
HEADING = 1
|
|
@@ -26,52 +26,106 @@ class Separator(enum.Enum):
|
|
|
26
26
|
SENTENCE = 3
|
|
27
27
|
TOKEN_LIMIT = 4
|
|
28
28
|
CHAR_LIMIT = 5
|
|
29
|
-
|
|
29
|
+
PAGE = 6
|
|
30
30
|
|
|
31
31
|
@dataclasses.dataclass
|
|
32
|
-
class
|
|
32
|
+
class DocumentSectionMetadata:
|
|
33
33
|
"""Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
|
|
34
|
-
|
|
35
|
-
|
|
34
|
+
# html and markdown metadata
|
|
35
|
+
sourceline: Optional[int] = None
|
|
36
36
|
# the stack of headings up to the most recently observed one;
|
|
37
37
|
# eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
|
|
38
|
-
|
|
38
|
+
heading: Optional[Dict[int, str]] = None
|
|
39
39
|
|
|
40
|
+
# pdf-specific metadata
|
|
41
|
+
page: Optional[int] = None
|
|
42
|
+
# bounding box as an {x1, y1, x2, y2} dictionary
|
|
43
|
+
bounding_box: Optional[Dict[str, float]] = None
|
|
40
44
|
|
|
41
45
|
@dataclasses.dataclass
|
|
42
46
|
class DocumentSection:
|
|
43
47
|
"""A single document chunk, according to some of the splitting criteria"""
|
|
44
48
|
text: Optional[str]
|
|
45
|
-
|
|
46
|
-
|
|
49
|
+
metadata: Optional[DocumentSectionMetadata]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _parse_separators(separators: str) -> List[Separator]:
|
|
53
|
+
ret = []
|
|
54
|
+
for s in separators.split(','):
|
|
55
|
+
clean_s = s.strip().upper()
|
|
56
|
+
if not clean_s:
|
|
57
|
+
continue
|
|
58
|
+
if clean_s not in Separator.__members__:
|
|
59
|
+
raise Error(
|
|
60
|
+
f'Invalid separator: `{s.strip()}`. Valid separators are: {", ".join(Separator.__members__).lower()}'
|
|
61
|
+
)
|
|
62
|
+
ret.append(Separator[clean_s])
|
|
63
|
+
return ret
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _parse_metadata(metadata: str) -> List[ChunkMetadata]:
|
|
67
|
+
ret = []
|
|
68
|
+
for m in metadata.split(','):
|
|
69
|
+
clean_m = m.strip().upper()
|
|
70
|
+
if not clean_m:
|
|
71
|
+
continue
|
|
72
|
+
if clean_m not in ChunkMetadata.__members__:
|
|
73
|
+
raise Error(
|
|
74
|
+
f'Invalid metadata: `{m.strip()}`. Valid metadata are: {", ".join(ChunkMetadata.__members__).lower()}'
|
|
75
|
+
)
|
|
76
|
+
ret.append(ChunkMetadata[clean_m])
|
|
77
|
+
return ret
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
_HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
|
|
47
81
|
|
|
48
82
|
class DocumentSplitter(ComponentIterator):
|
|
49
|
-
"""
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
83
|
+
"""Iterator over pieces of a document. The document is split into chunks based on the specified separators.
|
|
84
|
+
The iterator output tuples are of schema {'text': StringType()}, but can include additional metadata fields if specified
|
|
85
|
+
in the `metadata` argument as explained below.
|
|
86
|
+
All chunk text is passed through `ftfy.fix_text` to fix up common problems with unicode sequences.
|
|
87
|
+
|
|
88
|
+
Args:
|
|
89
|
+
`metadata`: which additional metadata fields to include in the output schema:
|
|
90
|
+
'title', 'heading' (HTML and Markdown), 'sourceline' (HTML), 'page' (PDF), 'bounding_box' (PDF).
|
|
91
|
+
The input can be a comma-separated string of these values eg. 'title,heading,sourceline'.
|
|
92
|
+
`separators`: which separators to use to split the document into rows. Options are:
|
|
93
|
+
'heading', 'paragraph', 'sentence', 'token_limit', 'char_limit', 'page'. As with metadata, this can be a
|
|
94
|
+
comma-separated string eg. 'heading, token_limit'.
|
|
95
|
+
`limit`: the maximum number of tokens or characters in each chunk if 'token_limit' or 'char_limit' is specified.
|
|
96
|
+
"""
|
|
97
|
+
METADATA_COLUMN_TYPES = {
|
|
98
|
+
ChunkMetadata.TITLE: StringType(nullable=True),
|
|
99
|
+
ChunkMetadata.HEADING: JsonType(nullable=True),
|
|
100
|
+
ChunkMetadata.SOURCELINE: IntType(nullable=True),
|
|
101
|
+
ChunkMetadata.PAGE: IntType(nullable=True),
|
|
102
|
+
ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
|
|
54
103
|
}
|
|
55
104
|
|
|
56
105
|
def __init__(
|
|
57
|
-
self, document: str, *, separators: str, limit: int =
|
|
58
|
-
html_skip_tags: List[str] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
|
|
106
|
+
self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None, metadata: str = '',
|
|
107
|
+
html_skip_tags: Optional[List[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
|
|
59
108
|
tiktoken_target_model: Optional[str] = None
|
|
60
109
|
):
|
|
61
|
-
import bs4
|
|
62
110
|
if html_skip_tags is None:
|
|
63
111
|
html_skip_tags = ['nav']
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
self._separators =
|
|
69
|
-
self.
|
|
70
|
-
self.
|
|
71
|
-
|
|
72
|
-
|
|
112
|
+
self._doc_handle = get_document_handle(document)
|
|
113
|
+
assert self._doc_handle is not None
|
|
114
|
+
# calling the output_schema method to validate the input arguments
|
|
115
|
+
self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
|
|
116
|
+
self._separators = _parse_separators(separators)
|
|
117
|
+
self._metadata_fields = _parse_metadata(metadata)
|
|
118
|
+
if self._doc_handle.bs_doc is not None:
|
|
119
|
+
title = self._doc_handle.bs_doc.title
|
|
120
|
+
if title is None:
|
|
121
|
+
self._doc_title = ''
|
|
122
|
+
else:
|
|
123
|
+
self._doc_title = ftfy.fix_text(title.get_text().strip())
|
|
124
|
+
else:
|
|
125
|
+
self._doc_title = ''
|
|
126
|
+
self._limit = 0 if limit is None else limit
|
|
73
127
|
self._skip_tags = html_skip_tags
|
|
74
|
-
self._overlap = overlap
|
|
128
|
+
self._overlap = 0 if overlap is None else overlap
|
|
75
129
|
self._tiktoken_encoding = tiktoken_encoding
|
|
76
130
|
self._tiktoken_target_model = tiktoken_target_model
|
|
77
131
|
|
|
@@ -79,9 +133,15 @@ class DocumentSplitter(ComponentIterator):
|
|
|
79
133
|
if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
|
|
80
134
|
assert self._doc_handle.bs_doc is not None
|
|
81
135
|
self._sections = self._html_sections()
|
|
82
|
-
|
|
136
|
+
elif self._doc_handle.format == DocumentType.DocumentFormat.MD:
|
|
83
137
|
assert self._doc_handle.md_ast is not None
|
|
84
138
|
self._sections = self._markdown_sections()
|
|
139
|
+
elif self._doc_handle.format == DocumentType.DocumentFormat.PDF:
|
|
140
|
+
assert self._doc_handle.pdf_doc is not None
|
|
141
|
+
self._sections = self._pdf_sections()
|
|
142
|
+
else:
|
|
143
|
+
assert False, f'unknown document format: {self._doc_handle.format}'
|
|
144
|
+
|
|
85
145
|
if Separator.SENTENCE in self._separators:
|
|
86
146
|
self._sections = self._sentence_sections(self._sections)
|
|
87
147
|
if Separator.TOKEN_LIMIT in self._separators:
|
|
@@ -105,38 +165,36 @@ class DocumentSplitter(ComponentIterator):
|
|
|
105
165
|
@classmethod
|
|
106
166
|
def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
|
|
107
167
|
schema = {'text': StringType()}
|
|
108
|
-
if 'metadata' in kwargs
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
raise Error(f'Invalid metadata field {md_field}')
|
|
113
|
-
schema[md_field.lower()] = cls.MD_COLUMN_TYPES[ChunkMetadata[md_field.upper()]]
|
|
168
|
+
md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
|
|
169
|
+
|
|
170
|
+
for md_field in md_fields:
|
|
171
|
+
schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
|
|
114
172
|
|
|
115
173
|
assert 'separators' in kwargs
|
|
116
|
-
separators = kwargs['separators']
|
|
117
|
-
for separator in separators:
|
|
118
|
-
if not hasattr(Separator, separator.upper()):
|
|
119
|
-
raise Error(f'Invalid separator {separator}')
|
|
174
|
+
separators = _parse_separators(kwargs['separators'])
|
|
120
175
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
Env.get().require_package('spacy')
|
|
124
|
-
if 'token_limit' in separators:
|
|
125
|
-
Env.get().require_package('tiktoken')
|
|
176
|
+
limit = kwargs.get('limit')
|
|
177
|
+
overlap = kwargs.get('overlap')
|
|
126
178
|
|
|
127
|
-
if
|
|
128
|
-
if
|
|
179
|
+
if limit is not None or overlap is not None:
|
|
180
|
+
if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
|
|
129
181
|
raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
|
|
130
|
-
if
|
|
182
|
+
if limit is not None and limit <= 0:
|
|
131
183
|
raise Error('"limit" must be an integer > 0')
|
|
132
|
-
if
|
|
184
|
+
if overlap is not None and overlap < 0:
|
|
133
185
|
raise Error('"overlap" must be an integer >= 0')
|
|
134
|
-
if
|
|
135
|
-
if
|
|
186
|
+
if Separator.TOKEN_LIMIT in separators or Separator.CHAR_LIMIT in separators:
|
|
187
|
+
if Separator.TOKEN_LIMIT in separators and Separator.CHAR_LIMIT in separators:
|
|
136
188
|
raise Error('Cannot specify both "token_limit" and "char_limit" separators')
|
|
137
|
-
if 'limit'
|
|
189
|
+
if kwargs.get('limit') is None:
|
|
138
190
|
raise Error('limit is required with "token_limit"/"char_limit" separators')
|
|
139
191
|
|
|
192
|
+
# check dependencies at the end
|
|
193
|
+
if Separator.SENTENCE in separators:
|
|
194
|
+
Env.get().require_package('spacy')
|
|
195
|
+
if Separator.TOKEN_LIMIT in separators:
|
|
196
|
+
Env.get().require_package('tiktoken')
|
|
197
|
+
|
|
140
198
|
return schema, []
|
|
141
199
|
|
|
142
200
|
def __next__(self) -> Dict[str, Any]:
|
|
@@ -145,47 +203,55 @@ class DocumentSplitter(ComponentIterator):
|
|
|
145
203
|
if section.text is None:
|
|
146
204
|
continue
|
|
147
205
|
result = {'text': section.text}
|
|
148
|
-
for md_field in self.
|
|
206
|
+
for md_field in self._metadata_fields:
|
|
149
207
|
if md_field == ChunkMetadata.TITLE:
|
|
150
208
|
result[md_field.name.lower()] = self._doc_title
|
|
151
|
-
elif md_field == ChunkMetadata.
|
|
152
|
-
result[md_field.name.lower()] = section.
|
|
209
|
+
elif md_field == ChunkMetadata.HEADING:
|
|
210
|
+
result[md_field.name.lower()] = section.metadata.heading
|
|
153
211
|
elif md_field == ChunkMetadata.SOURCELINE:
|
|
154
|
-
result[md_field.name.lower()] = section.
|
|
212
|
+
result[md_field.name.lower()] = section.metadata.sourceline
|
|
213
|
+
elif md_field == ChunkMetadata.PAGE:
|
|
214
|
+
result[md_field.name.lower()] = section.metadata.page
|
|
215
|
+
elif md_field == ChunkMetadata.BOUNDING_BOX:
|
|
216
|
+
result[md_field.name.lower()] = section.metadata.bounding_box
|
|
155
217
|
return result
|
|
156
218
|
|
|
157
|
-
def _html_sections(self) ->
|
|
219
|
+
def _html_sections(self) -> Iterator[DocumentSection]:
|
|
158
220
|
"""Create DocumentSections reflecting the html-specific separators"""
|
|
159
221
|
import bs4
|
|
160
222
|
emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
|
|
161
223
|
emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
|
|
162
224
|
# current state
|
|
163
|
-
|
|
225
|
+
accumulated_text = [] # currently accumulated text
|
|
226
|
+
# accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
|
|
227
|
+
|
|
164
228
|
headings: Dict[int, str] = {} # current state of observed headings (level -> text)
|
|
165
229
|
sourceline = 0 # most recently seen sourceline
|
|
166
230
|
|
|
167
|
-
def
|
|
231
|
+
def update_metadata(el: bs4.Tag) -> None:
|
|
168
232
|
# update current state
|
|
169
233
|
nonlocal headings, sourceline
|
|
170
234
|
sourceline = el.sourceline
|
|
171
|
-
if el.name in
|
|
235
|
+
if el.name in _HTML_HEADINGS:
|
|
172
236
|
level = int(el.name[1])
|
|
173
237
|
# remove the previously seen lower levels
|
|
174
|
-
lower_levels = [l for l in headings
|
|
238
|
+
lower_levels = [l for l in headings if l > level]
|
|
175
239
|
for l in lower_levels:
|
|
176
240
|
del headings[l]
|
|
177
241
|
headings[level] = el.get_text().strip()
|
|
178
242
|
|
|
179
243
|
def emit() -> None:
|
|
180
|
-
nonlocal
|
|
181
|
-
if len(
|
|
182
|
-
md =
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
244
|
+
nonlocal accumulated_text, headings, sourceline
|
|
245
|
+
if len(accumulated_text) > 0:
|
|
246
|
+
md = DocumentSectionMetadata(sourceline=sourceline, heading=headings.copy())
|
|
247
|
+
full_text = ' '.join(accumulated_text)
|
|
248
|
+
full_text = ftfy.fix_text(full_text)
|
|
249
|
+
yield DocumentSection(text=full_text, metadata=md)
|
|
250
|
+
accumulated_text = []
|
|
251
|
+
|
|
252
|
+
def process_element(el: bs4.PageElement) -> Iterator[DocumentSection]:
|
|
187
253
|
# process the element and emit sections as necessary
|
|
188
|
-
nonlocal
|
|
254
|
+
nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
|
|
189
255
|
if el.name in self._skip_tags:
|
|
190
256
|
return
|
|
191
257
|
|
|
@@ -193,30 +259,31 @@ class DocumentSplitter(ComponentIterator):
|
|
|
193
259
|
# accumulate text until we see a tag we care about
|
|
194
260
|
text = el.get_text().strip()
|
|
195
261
|
if len(text) > 0:
|
|
196
|
-
|
|
262
|
+
accumulated_text.append(text)
|
|
197
263
|
return
|
|
198
264
|
|
|
199
|
-
if el.name in
|
|
265
|
+
if el.name in _HTML_HEADINGS:
|
|
200
266
|
if emit_on_heading:
|
|
201
267
|
yield from emit()
|
|
202
|
-
|
|
268
|
+
update_metadata(el)
|
|
203
269
|
elif el.name == 'p':
|
|
204
270
|
if emit_on_paragraph:
|
|
205
271
|
yield from emit()
|
|
206
|
-
|
|
272
|
+
update_metadata(el)
|
|
207
273
|
for child in el.children:
|
|
208
274
|
yield from process_element(child)
|
|
209
275
|
|
|
210
276
|
yield from process_element(self._doc_handle.bs_doc)
|
|
211
277
|
yield from emit()
|
|
212
278
|
|
|
213
|
-
def _markdown_sections(self) ->
|
|
279
|
+
def _markdown_sections(self) -> Iterator[DocumentSection]:
|
|
214
280
|
"""Create DocumentSections reflecting the markdown-specific separators"""
|
|
215
281
|
assert self._doc_handle.md_ast is not None
|
|
216
282
|
emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
|
|
217
283
|
emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
|
|
218
284
|
# current state
|
|
219
|
-
|
|
285
|
+
accumulated_text = [] # currently accumulated text
|
|
286
|
+
# accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
|
|
220
287
|
headings: Dict[int, str] = {} # current state of observed headings (level -> text)
|
|
221
288
|
|
|
222
289
|
def update_headings(heading: Dict) -> None:
|
|
@@ -232,22 +299,22 @@ class DocumentSplitter(ComponentIterator):
|
|
|
232
299
|
headings[level] = text
|
|
233
300
|
|
|
234
301
|
def emit() -> None:
|
|
235
|
-
nonlocal
|
|
236
|
-
if len(
|
|
237
|
-
|
|
238
|
-
yield DocumentSection(text=
|
|
239
|
-
|
|
302
|
+
nonlocal accumulated_text, headings
|
|
303
|
+
if len(accumulated_text) > 0:
|
|
304
|
+
metadata = DocumentSectionMetadata(sourceline=0, heading=headings.copy())
|
|
305
|
+
yield DocumentSection(text=ftfy.fix_text(' '.join(accumulated_text)), metadata=metadata)
|
|
306
|
+
accumulated_text = []
|
|
240
307
|
|
|
241
|
-
def process_element(el: Dict) ->
|
|
308
|
+
def process_element(el: Dict) -> Iterator[DocumentSection]:
|
|
242
309
|
# process the element and emit sections as necessary
|
|
243
|
-
nonlocal
|
|
310
|
+
nonlocal accumulated_text, headings, emit_on_heading, emit_on_paragraph
|
|
244
311
|
assert 'type' in el
|
|
245
312
|
|
|
246
313
|
if el['type'] == 'text':
|
|
247
314
|
# accumulate text until we see a separator element
|
|
248
315
|
text = el['raw'].strip()
|
|
249
316
|
if len(text) > 0:
|
|
250
|
-
|
|
317
|
+
accumulated_text.append(text)
|
|
251
318
|
return
|
|
252
319
|
|
|
253
320
|
if el['type'] == 'heading':
|
|
@@ -266,15 +333,57 @@ class DocumentSplitter(ComponentIterator):
|
|
|
266
333
|
yield from process_element(el)
|
|
267
334
|
yield from emit()
|
|
268
335
|
|
|
269
|
-
def
|
|
336
|
+
def _pdf_sections(self) -> Iterator[DocumentSection]:
|
|
337
|
+
"""Create DocumentSections reflecting the pdf-specific separators"""
|
|
338
|
+
import fitz
|
|
339
|
+
doc: fitz.Document = self._doc_handle.pdf_doc
|
|
340
|
+
assert doc is not None
|
|
341
|
+
|
|
342
|
+
emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
|
|
343
|
+
emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph
|
|
344
|
+
|
|
345
|
+
accumulated_text = [] # invariant: all elements are ftfy clean and non-empty
|
|
346
|
+
|
|
347
|
+
def _add_cleaned_text(raw_text: str) -> None:
|
|
348
|
+
fixed = ftfy.fix_text(raw_text)
|
|
349
|
+
if fixed:
|
|
350
|
+
accumulated_text.append(fixed)
|
|
351
|
+
|
|
352
|
+
def _emit_text() -> str:
|
|
353
|
+
full_text = ''.join(accumulated_text)
|
|
354
|
+
accumulated_text.clear()
|
|
355
|
+
return full_text
|
|
356
|
+
|
|
357
|
+
for page_number, page in enumerate(doc.pages()):
|
|
358
|
+
for block in page.get_text('blocks'):
|
|
359
|
+
# there is no concept of paragraph in pdf, block is the closest thing
|
|
360
|
+
# we can get (eg a paragraph in text may cut across pages)
|
|
361
|
+
# see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
|
|
362
|
+
# other libraries like pdfminer also lack an explicit paragraph concept
|
|
363
|
+
x1, y1, x2, y2, text, _, _ = block
|
|
364
|
+
_add_cleaned_text(text)
|
|
365
|
+
if accumulated_text and emit_on_paragraph:
|
|
366
|
+
bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
|
|
367
|
+
metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
|
|
368
|
+
yield DocumentSection(text=_emit_text(), metadata=metadata)
|
|
369
|
+
|
|
370
|
+
if accumulated_text and emit_on_page and not emit_on_paragraph:
|
|
371
|
+
yield DocumentSection(text=_emit_text(),
|
|
372
|
+
metadata=DocumentSectionMetadata(page=page_number))
|
|
373
|
+
accumulated_text = []
|
|
374
|
+
|
|
375
|
+
if accumulated_text and not emit_on_page:
|
|
376
|
+
yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
|
|
377
|
+
|
|
378
|
+
def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
|
|
270
379
|
"""Split the input sections into sentences"""
|
|
271
380
|
for section in input_sections:
|
|
272
381
|
if section.text is not None:
|
|
273
382
|
doc = Env.get().spacy_nlp(section.text)
|
|
274
383
|
for sent in doc.sents:
|
|
275
|
-
yield DocumentSection(text=sent.text,
|
|
384
|
+
yield DocumentSection(text=sent.text, metadata=section.metadata)
|
|
276
385
|
|
|
277
|
-
def _token_chunks(self, input: Iterable[DocumentSection]) ->
|
|
386
|
+
def _token_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
|
|
278
387
|
import tiktoken
|
|
279
388
|
if self._tiktoken_target_model is not None:
|
|
280
389
|
encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
|
|
@@ -287,13 +396,25 @@ class DocumentSplitter(ComponentIterator):
|
|
|
287
396
|
continue
|
|
288
397
|
tokens = encoding.encode(section.text)
|
|
289
398
|
start_idx = 0
|
|
399
|
+
text = None
|
|
290
400
|
while start_idx < len(tokens):
|
|
291
401
|
end_idx = min(start_idx + self._limit, len(tokens))
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
402
|
+
while end_idx > start_idx:
|
|
403
|
+
# find a cutoff point that doesn't cut in the middle of utf8 multi-byte sequences
|
|
404
|
+
try:
|
|
405
|
+
# check that the truncated data can be properly decoded
|
|
406
|
+
text = encoding.decode(tokens[start_idx:end_idx], errors='strict')
|
|
407
|
+
break
|
|
408
|
+
except UnicodeDecodeError:
|
|
409
|
+
# we split the token array at a point where the utf8 encoding is broken
|
|
410
|
+
end_idx -= 1
|
|
411
|
+
|
|
412
|
+
assert end_idx > start_idx
|
|
413
|
+
assert text
|
|
414
|
+
yield DocumentSection(text=text, metadata=section.metadata)
|
|
415
|
+
start_idx = max(start_idx + 1, end_idx - self._overlap) # ensure we make progress
|
|
416
|
+
|
|
417
|
+
def _char_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
|
|
297
418
|
for section in input:
|
|
298
419
|
if section.text is None:
|
|
299
420
|
continue
|
|
@@ -301,7 +422,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
301
422
|
while start_idx < len(section.text):
|
|
302
423
|
end_idx = min(start_idx + self._limit, len(section.text))
|
|
303
424
|
text = section.text[start_idx:end_idx]
|
|
304
|
-
yield DocumentSection(text=text,
|
|
425
|
+
yield DocumentSection(text=text, metadata=section.metadata)
|
|
305
426
|
start_idx += self._limit - self._overlap
|
|
306
427
|
|
|
307
428
|
def close(self) -> None:
|
pixeltable/iterators/video.py
CHANGED
|
@@ -1,21 +1,20 @@
|
|
|
1
|
-
from typing import Dict, Any, List, Tuple
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
import math
|
|
4
1
|
import logging
|
|
2
|
+
import math
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Dict, Any, List, Tuple
|
|
5
5
|
|
|
6
|
-
import cv2
|
|
7
6
|
import PIL.Image
|
|
7
|
+
import cv2
|
|
8
8
|
|
|
9
|
-
from
|
|
10
|
-
|
|
11
|
-
from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
|
|
9
|
+
from pixeltable import exprs
|
|
12
10
|
from pixeltable.exceptions import Error
|
|
13
|
-
|
|
11
|
+
from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
|
|
12
|
+
from .base import ComponentIterator
|
|
14
13
|
|
|
15
14
|
_logger = logging.getLogger('pixeltable')
|
|
16
15
|
|
|
17
16
|
class FrameIterator(ComponentIterator):
|
|
18
|
-
def __init__(self, video: str, fps: float = 0.0):
|
|
17
|
+
def __init__(self, video: str, *, fps: float = 0.0):
|
|
19
18
|
video_path = Path(video)
|
|
20
19
|
assert video_path.exists() and video_path.is_file()
|
|
21
20
|
self.video_path = video_path
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
|
|
|
10
10
|
from .schema import SystemInfo, SystemInfoMd
|
|
11
11
|
|
|
12
12
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
13
|
-
VERSION =
|
|
13
|
+
VERSION = 14
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def create_system_info(engine: sql.engine.Engine) -> None:
|
|
@@ -30,17 +30,21 @@ def register_converter(version: int, cb: Callable[[sql.engine.Engine], None]) ->
|
|
|
30
30
|
global converter_cbs
|
|
31
31
|
converter_cbs[version] = cb
|
|
32
32
|
|
|
33
|
+
def noop_converter(engine: sql.engine.Engine) -> None:
|
|
34
|
+
# Converter to use when incrementing the schema version, but without any functional changes
|
|
35
|
+
pass
|
|
36
|
+
|
|
33
37
|
# load all converter modules
|
|
34
38
|
for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/converters']):
|
|
35
39
|
importlib.import_module('pixeltable.metadata.converters.' + modname)
|
|
36
40
|
|
|
37
41
|
def upgrade_md(engine: sql.engine.Engine) -> None:
|
|
38
42
|
"""Upgrade the metadata schema to the current version"""
|
|
39
|
-
with orm.Session(engine
|
|
43
|
+
with orm.Session(engine) as session:
|
|
40
44
|
system_info = session.query(SystemInfo).one().md
|
|
41
45
|
md_version = system_info['schema_version']
|
|
42
46
|
if md_version == VERSION:
|
|
43
|
-
|
|
47
|
+
return
|
|
44
48
|
while md_version < VERSION:
|
|
45
49
|
if md_version not in converter_cbs:
|
|
46
50
|
raise RuntimeError(f'No metadata converter for version {md_version}')
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
import sqlalchemy as sql
|
|
5
|
+
|
|
6
|
+
from pixeltable.metadata import register_converter
|
|
7
|
+
from pixeltable.metadata.schema import Table
|
|
8
|
+
|
|
9
|
+
_logger = logging.getLogger('pixeltable')
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def convert_13(engine: sql.engine.Engine) -> None:
|
|
13
|
+
with engine.begin() as conn:
|
|
14
|
+
for row in conn.execute(sql.select(Table)):
|
|
15
|
+
id = row[0]
|
|
16
|
+
md = row[2]
|
|
17
|
+
updated_md = _update_md(md)
|
|
18
|
+
if updated_md != md:
|
|
19
|
+
_logger.info(f'Updating schema for table: {id}')
|
|
20
|
+
conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_md))
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Traverse the schema dictionary and replace instances of `ExplicitBatchedFunction` with
|
|
24
|
+
# `CallableFunction`. DB versions prior to 14 can't contain serialized batched functions,
|
|
25
|
+
# so this is all we need to do.
|
|
26
|
+
def _update_md(md: Any) -> Any:
|
|
27
|
+
if isinstance(md, dict):
|
|
28
|
+
updated_md = {}
|
|
29
|
+
for k, v in md.items():
|
|
30
|
+
if k == '_classpath' and v == 'pixeltable.func.batched_function.ExplicitBatchedFunction':
|
|
31
|
+
updated_md[k] = 'pixeltable.func.callable_function.CallableFunction'
|
|
32
|
+
else:
|
|
33
|
+
updated_md[k] = _update_md(v)
|
|
34
|
+
return updated_md
|
|
35
|
+
elif isinstance(md, list):
|
|
36
|
+
return [_update_md(v) for v in md]
|
|
37
|
+
else:
|
|
38
|
+
return md
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
register_converter(13, convert_13)
|