pixeltable 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +21 -4
- pixeltable/catalog/__init__.py +13 -0
- pixeltable/catalog/catalog.py +159 -0
- pixeltable/catalog/column.py +200 -0
- pixeltable/catalog/dir.py +32 -0
- pixeltable/catalog/globals.py +33 -0
- pixeltable/catalog/insertable_table.py +191 -0
- pixeltable/catalog/named_function.py +36 -0
- pixeltable/catalog/path.py +58 -0
- pixeltable/catalog/path_dict.py +139 -0
- pixeltable/catalog/schema_object.py +39 -0
- pixeltable/catalog/table.py +581 -0
- pixeltable/catalog/table_version.py +749 -0
- pixeltable/catalog/table_version_path.py +133 -0
- pixeltable/catalog/view.py +203 -0
- pixeltable/client.py +520 -31
- pixeltable/dataframe.py +540 -349
- pixeltable/env.py +373 -48
- pixeltable/exceptions.py +12 -21
- pixeltable/exec/__init__.py +9 -0
- pixeltable/exec/aggregation_node.py +78 -0
- pixeltable/exec/cache_prefetch_node.py +113 -0
- pixeltable/exec/component_iteration_node.py +79 -0
- pixeltable/exec/data_row_batch.py +95 -0
- pixeltable/exec/exec_context.py +22 -0
- pixeltable/exec/exec_node.py +61 -0
- pixeltable/exec/expr_eval_node.py +217 -0
- pixeltable/exec/in_memory_data_node.py +69 -0
- pixeltable/exec/media_validation_node.py +43 -0
- pixeltable/exec/sql_scan_node.py +225 -0
- pixeltable/exprs/__init__.py +24 -0
- pixeltable/exprs/arithmetic_expr.py +102 -0
- pixeltable/exprs/array_slice.py +71 -0
- pixeltable/exprs/column_property_ref.py +77 -0
- pixeltable/exprs/column_ref.py +105 -0
- pixeltable/exprs/comparison.py +77 -0
- pixeltable/exprs/compound_predicate.py +98 -0
- pixeltable/exprs/data_row.py +187 -0
- pixeltable/exprs/expr.py +586 -0
- pixeltable/exprs/expr_set.py +39 -0
- pixeltable/exprs/function_call.py +380 -0
- pixeltable/exprs/globals.py +69 -0
- pixeltable/exprs/image_member_access.py +115 -0
- pixeltable/exprs/image_similarity_predicate.py +58 -0
- pixeltable/exprs/inline_array.py +107 -0
- pixeltable/exprs/inline_dict.py +101 -0
- pixeltable/exprs/is_null.py +38 -0
- pixeltable/exprs/json_mapper.py +121 -0
- pixeltable/exprs/json_path.py +159 -0
- pixeltable/exprs/literal.py +54 -0
- pixeltable/exprs/object_ref.py +41 -0
- pixeltable/exprs/predicate.py +44 -0
- pixeltable/exprs/row_builder.py +355 -0
- pixeltable/exprs/rowid_ref.py +94 -0
- pixeltable/exprs/type_cast.py +53 -0
- pixeltable/exprs/variable.py +45 -0
- pixeltable/func/__init__.py +9 -0
- pixeltable/func/aggregate_function.py +194 -0
- pixeltable/func/batched_function.py +53 -0
- pixeltable/func/callable_function.py +69 -0
- pixeltable/func/expr_template_function.py +82 -0
- pixeltable/func/function.py +110 -0
- pixeltable/func/function_registry.py +227 -0
- pixeltable/func/globals.py +36 -0
- pixeltable/func/nos_function.py +202 -0
- pixeltable/func/signature.py +166 -0
- pixeltable/func/udf.py +163 -0
- pixeltable/functions/__init__.py +52 -103
- pixeltable/functions/eval.py +216 -0
- pixeltable/functions/fireworks.py +61 -0
- pixeltable/functions/huggingface.py +120 -0
- pixeltable/functions/image.py +16 -0
- pixeltable/functions/openai.py +88 -0
- pixeltable/functions/pil/image.py +148 -7
- pixeltable/functions/string.py +13 -0
- pixeltable/functions/together.py +27 -0
- pixeltable/functions/util.py +41 -0
- pixeltable/functions/video.py +62 -0
- pixeltable/iterators/__init__.py +3 -0
- pixeltable/iterators/base.py +48 -0
- pixeltable/iterators/document.py +311 -0
- pixeltable/iterators/video.py +89 -0
- pixeltable/metadata/__init__.py +54 -0
- pixeltable/metadata/converters/convert_10.py +18 -0
- pixeltable/metadata/schema.py +211 -0
- pixeltable/plan.py +656 -0
- pixeltable/store.py +413 -182
- pixeltable/tests/conftest.py +143 -86
- pixeltable/tests/test_audio.py +65 -0
- pixeltable/tests/test_catalog.py +27 -0
- pixeltable/tests/test_client.py +14 -14
- pixeltable/tests/test_component_view.py +372 -0
- pixeltable/tests/test_dataframe.py +433 -0
- pixeltable/tests/test_dirs.py +78 -62
- pixeltable/tests/test_document.py +117 -0
- pixeltable/tests/test_exprs.py +591 -135
- pixeltable/tests/test_function.py +297 -67
- pixeltable/tests/test_functions.py +283 -1
- pixeltable/tests/test_migration.py +43 -0
- pixeltable/tests/test_nos.py +54 -0
- pixeltable/tests/test_snapshot.py +208 -0
- pixeltable/tests/test_table.py +1086 -258
- pixeltable/tests/test_transactional_directory.py +42 -0
- pixeltable/tests/test_types.py +5 -11
- pixeltable/tests/test_video.py +149 -34
- pixeltable/tests/test_view.py +530 -0
- pixeltable/tests/utils.py +186 -45
- pixeltable/tool/create_test_db_dump.py +149 -0
- pixeltable/type_system.py +490 -133
- pixeltable/utils/__init__.py +17 -46
- pixeltable/utils/clip.py +12 -15
- pixeltable/utils/coco.py +136 -0
- pixeltable/utils/documents.py +39 -0
- pixeltable/utils/filecache.py +195 -0
- pixeltable/utils/help.py +11 -0
- pixeltable/utils/media_store.py +76 -0
- pixeltable/utils/parquet.py +126 -0
- pixeltable/utils/pytorch.py +172 -0
- pixeltable/utils/s3.py +13 -0
- pixeltable/utils/sql.py +17 -0
- pixeltable/utils/transactional_directory.py +35 -0
- pixeltable-0.2.0.dist-info/LICENSE +18 -0
- pixeltable-0.2.0.dist-info/METADATA +117 -0
- pixeltable-0.2.0.dist-info/RECORD +125 -0
- {pixeltable-0.1.2.dist-info → pixeltable-0.2.0.dist-info}/WHEEL +1 -1
- pixeltable/catalog.py +0 -1421
- pixeltable/exprs.py +0 -1745
- pixeltable/function.py +0 -269
- pixeltable/functions/clip.py +0 -10
- pixeltable/functions/pil/__init__.py +0 -23
- pixeltable/functions/tf.py +0 -21
- pixeltable/index.py +0 -57
- pixeltable/tests/test_dict.py +0 -24
- pixeltable/tests/test_tf.py +0 -69
- pixeltable/tf.py +0 -33
- pixeltable/utils/tf.py +0 -33
- pixeltable/utils/video.py +0 -32
- pixeltable-0.1.2.dist-info/LICENSE +0 -201
- pixeltable-0.1.2.dist-info/METADATA +0 -89
- pixeltable-0.1.2.dist-info/RECORD +0 -37
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
from typing import Dict, Any, List, Tuple, Generator, Optional, Iterable
|
|
2
|
+
import logging
|
|
3
|
+
import dataclasses
|
|
4
|
+
import enum
|
|
5
|
+
|
|
6
|
+
from .base import ComponentIterator
|
|
7
|
+
|
|
8
|
+
from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
|
|
9
|
+
from pixeltable.exceptions import Error
|
|
10
|
+
from pixeltable.env import Env
|
|
11
|
+
from pixeltable.utils.documents import get_document_handle
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
_logger = logging.getLogger('pixeltable')
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ChunkMetadata(enum.Enum):
    """Names of the optional metadata fields that can be requested alongside each chunk's text."""

    TITLE = 1
    HEADINGS = 2
    SOURCELINE = 3
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class Separator(enum.Enum):
    """Names of the criteria by which a document can be split into chunks."""

    HEADING = 1
    PARAGRAPH = 2
    SENTENCE = 3
    TOKEN_LIMIT = 4
    CHAR_LIMIT = 5
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclasses.dataclass
class DocumentSectionMd:
    """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""

    # source line associated with the section
    source_line: int

    # the stack of headings (level -> text) up to the most recently observed one;
    # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
    headings: Dict[int, str]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclasses.dataclass
class DocumentSection:
    """A single document chunk, according to some of the splitting criteria"""

    # the chunk's text; None is permitted by the type, and such sections are skipped by consumers in this module
    text: Optional[str]

    # metadata of the structural element this chunk was taken from
    md: Optional[DocumentSectionMd]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class DocumentSplitter(ComponentIterator):
    """Iterator over pieces of a document.

    The document (HTML or markdown) is first cut into structural sections (headings/paragraphs), which are then
    optionally refined into sentences and/or fixed-size token or character chunks, depending on `separators`.
    Each call to `__next__()` returns a dict with the chunk's 'text' plus the requested metadata fields.
    """
    MD_COLUMN_TYPES = {
        ChunkMetadata.TITLE: StringType(),
        ChunkMetadata.HEADINGS: JsonType(),
        ChunkMetadata.SOURCELINE: IntType()
    }

    def __init__(
            self, document: str, *, separators: str, limit: int = 0, overlap: int = 0, metadata: str = '',
            html_skip_tags: Optional[List[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
            tiktoken_target_model: Optional[str] = None
    ):
        """
        Args:
            document: path of the document file
            separators: comma-separated list of Separator names (case-insensitive)
            limit: max chunk size (tokens or characters); only used with the token_limit/char_limit separators
            overlap: number of tokens/characters shared by consecutive chunks; must be smaller than `limit`
            metadata: comma-separated list of ChunkMetadata names (case-insensitive); '' for none
            html_skip_tags: html tags whose content is ignored; defaults to ['nav']
            tiktoken_encoding: name of the tiktoken encoding, used when `tiktoken_target_model` is None
            tiktoken_target_model: if set, the tiktoken encoding is looked up for this model

        Raises:
            Error: if a limit-based separator is used with an invalid limit/overlap combination
        """
        if html_skip_tags is None:
            html_skip_tags = ['nav']
        with open(document, 'r') as fh:
            s = fh.read()
        self._doc_handle = get_document_handle(s)
        assert self._doc_handle is not None
        self._separators = [Separator[s.upper()] for s in separators.split(',')]
        self._md_fields = [ChunkMetadata[m.upper()] for m in metadata.split(',')] if len(metadata) > 0 else []

        # an html document without a <title> element has bs_doc.title == None
        bs_doc = self._doc_handle.bs_doc
        title_el = bs_doc.title if bs_doc is not None else None
        self._doc_title = title_el.get_text().strip() if title_el is not None else ''

        self._limit = limit
        self._skip_tags = html_skip_tags
        self._overlap = overlap
        self._tiktoken_encoding = tiktoken_encoding
        self._tiktoken_target_model = tiktoken_target_model

        # fail early instead of on the first __next__() (or, worse, looping forever)
        if Separator.TOKEN_LIMIT in self._separators or Separator.CHAR_LIMIT in self._separators:
            self._check_chunk_params()

        # set up processing pipeline
        if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
            assert self._doc_handle.bs_doc is not None
            self._sections = self._html_sections()
        else:
            assert self._doc_handle.md_ast is not None
            self._sections = self._markdown_sections()
        if Separator.SENTENCE in self._separators:
            self._sections = self._sentence_sections(self._sections)
        if Separator.TOKEN_LIMIT in self._separators:
            self._sections = self._token_chunks(self._sections)
        if Separator.CHAR_LIMIT in self._separators:
            self._sections = self._char_chunks(self._sections)

    def _check_chunk_params(self) -> None:
        """Validate limit/overlap for the limit-based chunkers.

        The chunkers advance by (limit - overlap) per chunk, so that step needs to be positive in order to
        guarantee termination.
        """
        if self._limit <= 0:
            raise Error('"limit" must be an integer > 0')
        if self._overlap < 0:
            raise Error('"overlap" must be an integer >= 0')
        if self._overlap >= self._limit:
            raise Error('"overlap" must be smaller than "limit"')

    @staticmethod
    def _push_heading(headings: Dict[int, str], level: int, text: str) -> None:
        """Record a newly observed heading in 'headings', dropping all previously seen lower-level headings."""
        for lower_level in [l for l in headings if l > level]:
            del headings[lower_level]
        headings[level] = text

    @classmethod
    def input_schema(cls) -> Dict[str, ColumnType]:
        """Return the names and types of the iterator's input parameters."""
        # NOTE(review): 'skip_tags' doesn't match the constructor parameter 'html_skip_tags' (a list, not a
        # string); confirm against the code that maps these parameters to constructor arguments
        return {
            'document': DocumentType(nullable=False),
            'separators': StringType(nullable=False),
            'metadata': StringType(nullable=True),
            'limit': IntType(nullable=True),
            'overlap': IntType(nullable=True),
            'skip_tags': StringType(nullable=True),
            'tiktoken_encoding': StringType(nullable=True),
            'tiktoken_target_model': StringType(nullable=True),
        }

    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
        """Validate the iterator arguments and return the output columns.

        Returns:
            (column name -> type for 'text' and each requested metadata field, an empty list)

        Raises:
            Error: for unknown metadata fields/separators, missing packages and inconsistent limit/overlap
        """
        schema = {'text': StringType()}
        if 'metadata' in kwargs and len(kwargs['metadata']) > 0:
            md_fields = kwargs['metadata'].split(',')
            for md_field in md_fields:
                if md_field.upper() not in ChunkMetadata.__members__:
                    raise Error(f'Invalid metadata field {md_field}')
                schema[md_field.lower()] = cls.MD_COLUMN_TYPES[ChunkMetadata[md_field.upper()]]

        assert 'separators' in kwargs
        separators = kwargs['separators'].split(',')
        for separator in separators:
            if separator.upper() not in Separator.__members__:
                raise Error(f'Invalid separator {separator}')
        # separator names are matched case-insensitively above and in the constructor; normalize them so
        # that the checks below apply regardless of how they were spelled
        separator_names = {separator.lower() for separator in separators}

        # check dependencies
        if 'sentence' in separator_names:
            Env.get().require_package('spacy')
        if 'token_limit' in separator_names:
            Env.get().require_package('tiktoken')

        has_limit_separator = 'token_limit' in separator_names or 'char_limit' in separator_names
        if 'limit' in kwargs or 'overlap' in kwargs:
            if not has_limit_separator:
                raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
            if 'limit' in kwargs and int(kwargs['limit']) <= 0:
                raise Error('"limit" must be an integer > 0')
            if 'overlap' in kwargs and int(kwargs['overlap']) < 0:
                raise Error('"overlap" must be an integer >= 0')
            # the chunkers advance by (limit - overlap) and would never terminate otherwise
            if 'limit' in kwargs and 'overlap' in kwargs and int(kwargs['overlap']) >= int(kwargs['limit']):
                raise Error('"overlap" must be smaller than "limit"')
        if has_limit_separator:
            if 'token_limit' in separator_names and 'char_limit' in separator_names:
                raise Error('Cannot specify both "token_limit" and "char_limit" separators')
            if 'limit' not in kwargs:
                raise Error('limit is required with "token_limit"/"char_limit" separators')

        return schema, []

    def __next__(self) -> Dict[str, Any]:
        """Return the next chunk as a dict containing 'text' and the requested metadata fields."""
        while True:
            section = next(self._sections)
            if section.text is None:
                continue
            result = {'text': section.text}
            for md_field in self._md_fields:
                if md_field == ChunkMetadata.TITLE:
                    result[md_field.name.lower()] = self._doc_title
                elif md_field == ChunkMetadata.HEADINGS:
                    result[md_field.name.lower()] = section.md.headings
                elif md_field == ChunkMetadata.SOURCELINE:
                    result[md_field.name.lower()] = section.md.source_line
            return result

    def _html_sections(self) -> Generator[DocumentSection, None, None]:
        """Create DocumentSections reflecting the html-specific separators"""
        import bs4
        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
        emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
        # current state
        text_section = ''  # currently accumulated text
        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
        sourceline = 0  # most recently seen sourceline

        def update_md(el: bs4.Tag) -> None:
            # update current state
            nonlocal sourceline
            sourceline = el.sourceline
            if el.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                self._push_heading(headings, int(el.name[1]), el.get_text().strip())

        def emit() -> Generator[DocumentSection, None, None]:
            # emit the accumulated text (if any) with a snapshot of the current metadata
            nonlocal text_section
            if len(text_section) > 0:
                md = DocumentSectionMd(sourceline, headings.copy())
                yield DocumentSection(text=text_section, md=md)
                text_section = ''

        def process_element(el: bs4.PageElement) -> Generator[DocumentSection, None, None]:
            # process the element and emit sections as necessary
            nonlocal text_section
            if el.name in self._skip_tags:
                return

            if isinstance(el, bs4.NavigableString):
                # accumulate text until we see a tag we care about
                text = el.get_text().strip()
                if len(text) > 0:
                    text_section += ' ' + text
                return

            if el.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                if emit_on_heading:
                    yield from emit()
                update_md(el)
            elif el.name == 'p':
                if emit_on_paragraph:
                    yield from emit()
                update_md(el)
            for child in el.children:
                yield from process_element(child)

        yield from process_element(self._doc_handle.bs_doc)
        yield from emit()

    def _markdown_sections(self) -> Generator[DocumentSection, None, None]:
        """Create DocumentSections reflecting the markdown-specific separators"""
        assert self._doc_handle.md_ast is not None
        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
        emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
        # current state
        text_section = ''  # currently accumulated text
        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)

        def update_headings(heading: Dict) -> None:
            # update current state
            assert 'type' in heading and heading['type'] == 'heading'
            level = heading['attrs']['level']
            text = heading['children'][0]['raw'].strip()
            self._push_heading(headings, level, text)

        def emit() -> Generator[DocumentSection, None, None]:
            # emit the accumulated text (if any); the source line is always reported as 0 for markdown
            nonlocal text_section
            if len(text_section) > 0:
                md = DocumentSectionMd(0, headings.copy())
                yield DocumentSection(text=text_section, md=md)
                text_section = ''

        def process_element(el: Dict) -> Generator[DocumentSection, None, None]:
            # process the element and emit sections as necessary
            nonlocal text_section
            assert 'type' in el

            if el['type'] == 'text':
                # accumulate text until we see a separator element
                text = el['raw'].strip()
                if len(text) > 0:
                    text_section += ' ' + text
                return

            if el['type'] == 'heading':
                if emit_on_heading:
                    yield from emit()
                update_headings(el)
            elif el['type'] == 'paragraph':
                if emit_on_paragraph:
                    yield from emit()
            if 'children' not in el:
                return
            for child in el['children']:
                yield from process_element(child)

        for el in self._doc_handle.md_ast:
            yield from process_element(el)
        yield from emit()

    def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
        """Split the input sections into sentences"""
        for section in input_sections:
            if section.text is not None:
                doc = Env.get().spacy_nlp(section.text)
                for sent in doc.sents:
                    yield DocumentSection(text=sent.text, md=section.md)

    def _token_chunks(self, input: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
        """Split the input sections into chunks of at most 'limit' tokens that overlap by 'overlap' tokens"""
        import tiktoken
        if self._tiktoken_target_model is not None:
            encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
        else:
            encoding = tiktoken.get_encoding(self._tiktoken_encoding)
        self._check_chunk_params()
        step = self._limit - self._overlap  # > 0, guaranteed by _check_chunk_params()

        for section in input:
            if section.text is None:
                continue
            tokens = encoding.encode(section.text)
            start_idx = 0
            while start_idx < len(tokens):
                end_idx = min(start_idx + self._limit, len(tokens))
                text = encoding.decode(tokens[start_idx:end_idx])
                yield DocumentSection(text=text, md=section.md)
                start_idx += step

    def _char_chunks(self, input: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
        """Split the input sections into chunks of at most 'limit' characters that overlap by 'overlap' characters"""
        self._check_chunk_params()
        step = self._limit - self._overlap  # > 0, guaranteed by _check_chunk_params()

        for section in input:
            if section.text is None:
                continue
            start_idx = 0
            while start_idx < len(section.text):
                end_idx = min(start_idx + self._limit, len(section.text))
                text = section.text[start_idx:end_idx]
                yield DocumentSection(text=text, md=section.md)
                start_idx += step

    def close(self) -> None:
        """No-op: the document is read in full by the constructor."""
        pass

    def set_pos(self, pos: int) -> None:
        """No-op: this iterator does not reposition itself."""
        pass
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from typing import Dict, Any, List, Tuple
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import math
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
import cv2
|
|
7
|
+
import PIL.Image
|
|
8
|
+
|
|
9
|
+
from .base import ComponentIterator
|
|
10
|
+
|
|
11
|
+
from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
|
|
12
|
+
from pixeltable.exceptions import Error
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
_logger = logging.getLogger('pixeltable')
|
|
16
|
+
|
|
17
|
+
class FrameIterator(ComponentIterator):
    """Iterator over the frames of a video file.

    If `fps` is > 0, only every n-th frame is returned, with n = int(video fps / fps); otherwise every frame is
    returned.
    """

    def __init__(self, video: str, fps: float = 0.0):
        """
        Args:
            video: path of the video file
            fps: requested frame rate; 0.0 returns all frames

        Raises:
            Error: if the video can't be opened, `fps` exceeds the video's frame rate or the frame count is 0
        """
        video_path = Path(video)
        assert video_path.exists() and video_path.is_file()
        self.video_path = video_path
        self.fps = fps
        self.video_reader = cv2.VideoCapture(str(video_path))
        if not self.video_reader.isOpened():
            self.close()
            raise Error(f'Failed to open video: {video}')
        video_fps = int(self.video_reader.get(cv2.CAP_PROP_FPS))
        if fps > video_fps:
            # release the capture handle before bailing out; nobody else holds a reference to it
            self.close()
            raise Error(f'Video {video}: requested fps ({fps}) exceeds that of the video ({video_fps})')
        self.frame_freq = int(video_fps / fps) if fps > 0 else 1
        num_video_frames = int(self.video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
        if num_video_frames == 0:
            self.close()
            raise Error(f'Video {video}: failed to get number of frames')
        # ceil: round up to ensure we count frame 0
        self.num_frames = math.ceil(num_video_frames / self.frame_freq) if fps > 0 else num_video_frames
        _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps}')

        self.next_frame_idx = 0

    @classmethod
    def input_schema(cls) -> Dict[str, ColumnType]:
        """Return the names and types of the iterator's input parameters."""
        return {
            'video': VideoType(nullable=False),
            'fps': FloatType()
        }

    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
        """Return the output columns, plus a list that names the 'frame' column."""
        return {
            'frame_idx': IntType(),
            'pos_msec': FloatType(),
            'pos_frame': FloatType(),
            'frame': ImageType(),
        }, ['frame']

    def __next__(self) -> Dict[str, Any]:
        """Return the next selected frame as a dict with the keys 'frame_idx', 'pos_msec', 'pos_frame', 'frame'.

        Raises:
            StopIteration: when the video has no more frames; the reader is released at that point
        """
        if self.video_reader is None:
            # already exhausted or closed: keep signalling the end instead of failing on the released reader
            raise StopIteration
        while True:
            pos_msec = self.video_reader.get(cv2.CAP_PROP_POS_MSEC)
            pos_frame = self.video_reader.get(cv2.CAP_PROP_POS_FRAMES)
            status, img = self.video_reader.read()
            if not status:
                _logger.debug(f'releasing video reader for {self.video_path}')
                self.video_reader.release()
                self.video_reader = None
                raise StopIteration
            if pos_frame % self.frame_freq == 0:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                result = {
                    'frame_idx': self.next_frame_idx,
                    'pos_msec': pos_msec,
                    'pos_frame': pos_frame,
                    'frame': PIL.Image.fromarray(img),
                }
                self.next_frame_idx += 1
                # frame_freq > 1: jumping to the target frame here with video_reader.set() is far slower than just
                # skipping the unwanted frames
                return result

    def close(self) -> None:
        """Release the video reader, if it is still open."""
        if self.video_reader is not None:
            self.video_reader.release()
            self.video_reader = None

    def set_pos(self, pos: int) -> None:
        """Seek to frame idx"""
        if pos == self.next_frame_idx:
            return
        _logger.debug(f'seeking to frame {pos}')
        # NOTE(review): requires the reader to still be open, ie, this can't be called after close() or after
        # the iterator has been exhausted
        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, pos * self.frame_freq)
        self.next_frame_idx = pos
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
import importlib
|
|
3
|
+
import os
|
|
4
|
+
import pkgutil
|
|
5
|
+
from typing import Callable, Dict
|
|
6
|
+
|
|
7
|
+
import sqlalchemy as sql
|
|
8
|
+
import sqlalchemy.orm as orm
|
|
9
|
+
|
|
10
|
+
from .schema import SystemInfo, SystemInfoMd
|
|
11
|
+
|
|
12
|
+
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
13
|
+
VERSION = 12
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def create_system_info(engine: sql.engine.Engine) -> None:
    """Create the systemmetadata record

    Stores a SystemInfo row whose md payload carries the current schema VERSION.
    """
    md_payload = dataclasses.asdict(SystemInfoMd(schema_version=VERSION))
    with orm.Session(engine, future=True) as session:
        session.add(SystemInfo(md=md_payload))
        session.flush()
        session.commit()
|
|
24
|
+
|
|
25
|
+
# registry of the conversion functions that upgrade the metadata schema from one version to the following one
# key: old schema version
converter_cbs: Dict[int, Callable[[sql.engine.Engine], None]] = {}


def register_converter(version: int, cb: Callable[[sql.engine.Engine], None]) -> None:
    """Register 'cb' as the function that converts metadata of schema version 'version' to the following version.

    A previously registered converter for the same version is replaced.
    """
    # the dict is only mutated, never rebound, so no 'global' declaration is needed
    converter_cbs[version] = cb
|
|
32
|
+
|
|
33
|
+
# import every module found in the converters/ directory next to this file; a converter module registers itself
# via register_converter() when it is imported
_converters_dir = os.path.dirname(__file__) + '/converters'
for _module_info in pkgutil.iter_modules([_converters_dir]):
    importlib.import_module('pixeltable.metadata.converters.' + _module_info.name)
|
|
36
|
+
|
|
37
|
+
def upgrade_md(engine: sql.engine.Engine) -> None:
    """Upgrade the metadata schema to the current version

    Runs the registered converters one version step at a time, starting at the schema version recorded in the
    SystemInfo record, and then records VERSION as the new schema version.

    Raises:
        RuntimeError: if the stored schema version is newer than VERSION or if the converter for one of the
            steps is missing
    """
    with orm.Session(engine, future=True) as session:
        system_info = session.query(SystemInfo).one().md
        md_version = system_info['schema_version']
        if md_version == VERSION:
            return
        if md_version > VERSION:
            # there is nothing to convert in this direction; recording VERSION here would mislabel metadata
            # that was written with a newer schema
            raise RuntimeError(
                f'Metadata version {md_version} is newer than the version supported by this release ({VERSION})')
        while md_version < VERSION:
            if md_version not in converter_cbs:
                raise RuntimeError(f'No metadata converter for version {md_version}')
            print(f'Converting metadata from version {md_version} to {md_version + 1}')
            converter_cbs[md_version](engine)
            md_version += 1
        # update system info
        conn = session.connection()
        system_info_md = SystemInfoMd(schema_version=VERSION)
        conn.execute(SystemInfo.__table__.update().values(md=dataclasses.asdict(system_info_md)))
        session.commit()
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import sqlalchemy as sql
|
|
2
|
+
|
|
3
|
+
from pixeltable.metadata.schema import Table, TableSchemaVersion
|
|
4
|
+
from pixeltable.metadata import register_converter
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def convert_10(engine: sql.engine.Engine) -> None:
    """Metadata converter registered for schema version 10.

    Runs two updates in a single transaction: it removes the 'parameters' key from the md of every Table record
    and concatenates the default table attributes to the md of every TableSchemaVersion record.
    """
    default_table_attrs = {"comment": None, "num_retained_versions": 10}

    # Because `parameters` wasn't actually used for anything,
    # we can simply delete it without any data loss.
    drop_parameters = sql.update(Table).values(md=Table.md - 'parameters')
    # Add the keys of `default_table_attrs` to all instances of tableschemaversions.md.
    add_table_attrs = sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat(default_table_attrs))

    with engine.begin() as conn:
        conn.execute(drop_parameters)
        conn.execute(add_table_attrs)


register_converter(10, convert_10)
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
from typing import Optional, List, Dict, get_type_hints, Type, Any, TypeVar, Tuple, Union
|
|
2
|
+
import platform
|
|
3
|
+
import uuid
|
|
4
|
+
import dataclasses
|
|
5
|
+
|
|
6
|
+
import sqlalchemy as sql
|
|
7
|
+
from sqlalchemy import Integer, String, Boolean, BigInteger, LargeBinary
|
|
8
|
+
from sqlalchemy.dialects.postgresql import UUID, JSONB
|
|
9
|
+
from sqlalchemy import ForeignKey, UniqueConstraint, ForeignKeyConstraint
|
|
10
|
+
from sqlalchemy.orm import declarative_base
|
|
11
|
+
|
|
12
|
+
Base = declarative_base()
|
|
13
|
+
|
|
14
|
+
T = TypeVar('T')


def md_from_dict(data_class_type: Type[T], data: Any) -> T:
    """Re-instantiate a dataclass instance that contains nested dataclasses from a dict.

    Recursively converts 'data' (as produced by dataclasses.asdict(), possibly after a json round trip) into an
    instance of 'data_class_type'.

    Args:
        data_class_type: the target type: a dataclass, an Optional/List/Dict/Tuple of supported types, or any
            other type
        data: the value to convert

    Returns:
        The converted value. Values for which no conversion is defined (plain types, Unions with more than one
        non-None member, other generics) are returned unchanged.
    """
    if dataclasses.is_dataclass(data_class_type):
        fieldtypes = get_type_hints(data_class_type)
        return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})

    origin = getattr(data_class_type, '__origin__', None)
    if origin is None:
        # not a parameterized generic: nothing to convert
        return data
    # an unparameterized generic alias (eg, a bare typing.List) may not expose __args__
    args = getattr(data_class_type, '__args__', ())

    if origin is Union:
        if data is None:
            return None
        non_none_args = [arg for arg in args if arg is not type(None)]
        if len(non_none_args) == 1:
            # Optional[X]
            return md_from_dict(non_none_args[0], data)
        # the member type can't be determined from the value: keep the data instead of dropping it
        return data
    if origin is list:
        if len(args) == 0:
            return data
        return [md_from_dict(args[0], elem) for elem in data]
    if origin is dict:
        if len(args) < 2:
            return data
        key_type, val_type = args[0], args[1]
        # json object keys are always strings: cast them back to the declared key type
        return {key_type(key): md_from_dict(val_type, val) for key, val in data.items()}
    if origin is tuple:
        if len(args) == 2 and args[1] is Ellipsis:
            # Tuple[X, ...]: homogeneous and of arbitrary length
            return tuple(md_from_dict(args[0], elem) for elem in data)
        return tuple(md_from_dict(arg_type, elem) for arg_type, elem in zip(args, data))
    # any other generic: keep the data as is
    return data
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# structure of the stored metadata:
|
|
40
|
+
# - each schema entity that grows somehow proportionally to the data (# of output_rows, total insert operations,
|
|
41
|
+
# number of schema changes) gets its own table
|
|
42
|
+
# - each table has an 'md' column that basically contains the payload
|
|
43
|
+
# - exceptions to that are foreign keys without which lookups would be too slow (ex.: TableSchemaVersions.tbl_id)
|
|
44
|
+
# - the md column contains a dataclass serialized to json; this has the advantage of making changes to the metadata
|
|
45
|
+
# schema easier (the goal is not to have to rely on some schema migration framework; if that breaks for some user,
|
|
46
|
+
# it would be very difficult to patch up)
|
|
47
|
+
|
|
48
|
+
@dataclasses.dataclass
class SystemInfoMd:
    """Payload of the 'md' column of the SystemInfo record."""

    # version of the metadata schema
    schema_version: int
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class SystemInfo(Base):
    """A single-row table that contains system-wide metadata."""

    __tablename__ = 'systeminfo'

    # primary key column; defaults to 0
    dummy = sql.Column(Integer, primary_key=True, default=0, nullable=False)
    # SystemInfoMd
    md = sql.Column(JSONB, nullable=False)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclasses.dataclass
class DirMd:
    """Metadata payload for a directory."""

    # name of the directory
    name: str
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class Dir(Base):
    """ORM class for the 'dirs' table."""

    __tablename__ = 'dirs'

    id = sql.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False)
    # references another row of this table; nullable
    parent_id = sql.Column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
    md = sql.Column(JSONB, nullable=False)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@dataclasses.dataclass
class ColumnHistory:
    """
    Tracks the schema versions at which a column was added and dropped. This information is needed to GC
    unreachable storage columns: a column that was added after table snapshot n and dropped before table
    snapshot n+1 can be removed from the stored table.

    There is one record per column (across all schema versions).
    """

    col_id: int
    # schema version in which the column was added
    schema_version_add: int
    # schema version in which the column was dropped, if any
    schema_version_drop: Optional[int]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@dataclasses.dataclass
class ViewMd:
    """View-specific metadata; stored in TableMd.view_md."""

    is_snapshot: bool

    # list of (table id, version) pairs; for mutable views, all versions are None
    base_versions: List[Tuple[str, Optional[int]]]

    # view-only: filter predicate applied to the base table
    predicate: Optional[Dict[str, Any]]

    # only for component views: ComponentIterator subclass
    iterator_class_fqn: Optional[str]

    # only for component views: args to pass to the iterator class constructor
    iterator_args: Optional[Dict[str, Any]]
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@dataclasses.dataclass
class TableMd:
    """Per-table metadata: name, version counters, id counters, column history and view info."""

    # name of the table
    name: str

    # starts at 0 and increases monotonically within a Table, for data as well as schema changes
    current_version: int

    # every version maps to a schema version (current_version >= current_schema_version)
    current_schema_version: int

    # counter from which Column.id values are assigned
    next_col_id: int

    # counter from which the rowid column of the storage table is assigned;
    # each row receives a unique, immutable rowid when it is inserted
    next_row_id: int

    # maps col_id -> ColumnHistory
    column_history: Dict[int, ColumnHistory]

    # view metadata
    # NOTE(review): presumably None for tables that are not views -- confirm
    view_md: Optional[ViewMd]
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class Table(Base):
    """
    Record in the 'tables' table; used for tables and views alike.

    A view is essentially a specialization of a table, since it stores materialized columns as well.
    What sets a view apart:
    - it has a base, which is either a (live) table or a snapshot
    - it may carry a filter predicate
    """

    __tablename__ = 'tables'

    # largest signed 64-bit integer: 9223372036854775807
    MAX_VERSION = 2**63 - 1

    # UUID primary key; a fresh uuid4 is generated when none is supplied
    id = sql.Column(
        UUID(as_uuid=True),
        nullable=False,
        default=uuid.uuid4,
        primary_key=True,
    )

    # mandatory foreign key into dirs.id
    dir_id = sql.Column(
        UUID(as_uuid=True),
        ForeignKey('dirs.id'),
        nullable=False,
    )

    # JSON payload; holds a TableMd
    md = sql.Column(JSONB, nullable=False)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@dataclasses.dataclass
class TableVersionMd:
    """Metadata for one table version: creation time, version and schema version."""

    # creation timestamp, as returned by time.time()
    created_at: float

    # version number this record describes
    version: int

    # schema version associated with this version
    schema_version: int
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class TableVersion(Base):
    """Record in the 'tableversions' table, keyed by the pair (tbl_id, version)."""

    __tablename__ = 'tableversions'

    # first half of the primary key; foreign key into tables.id
    tbl_id = sql.Column(
        UUID(as_uuid=True),
        ForeignKey('tables.id'),
        nullable=False,
        primary_key=True,
    )

    # second half of the primary key
    version = sql.Column(BigInteger, nullable=False, primary_key=True)

    # JSON payload; holds a TableVersionMd
    md = sql.Column(JSONB, nullable=False)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
@dataclasses.dataclass
class SchemaColumn:
    """
    One column of a table's logical (user-visible) schema.

    Every new schema version stores its complete set of columns, so there is one record
    per (column x schema version).
    """

    # NOTE(review): presumably the column's position within the schema -- confirm
    pos: int

    # column name
    name: str

    # column type, in dict form
    col_type: dict

    # NOTE(review): presumably marks the column as part of the primary key -- confirm
    is_pk: bool

    # NOTE(review): presumably the serialized expression of a computed column -- confirm
    value_expr: Optional[dict]

    stored: Optional[bool]

    # when True, a vector index is created for this column
    is_indexed: bool
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
@dataclasses.dataclass
class TableSchemaVersionMd:
    """Metadata for one schema version of a table, including its full column set."""

    # schema version this record describes
    schema_version: int

    # schema version that came before this one
    # NOTE(review): presumably None for the first schema version -- confirm
    preceding_schema_version: Optional[int]

    # maps col_id -> SchemaColumn
    columns: Dict[int, SchemaColumn]

    num_retained_versions: int

    comment: str
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
# versioning: a new record is written for every change to a table's schema
class TableSchemaVersion(Base):
    """Record in the 'tableschemaversions' table, keyed by the pair (tbl_id, schema_version)."""

    __tablename__ = 'tableschemaversions'

    # first half of the primary key; foreign key into tables.id
    tbl_id = sql.Column(
        UUID(as_uuid=True),
        ForeignKey('tables.id'),
        nullable=False,
        primary_key=True,
    )

    # second half of the primary key
    schema_version = sql.Column(BigInteger, nullable=False, primary_key=True)

    # JSON payload; holds a TableSchemaVersionMd
    md = sql.Column(JSONB, nullable=False)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
@dataclasses.dataclass
class FunctionMd:
    """Metadata describing a stored function."""

    # function name
    name: str

    # Python version, as reported by platform.python_version
    py_version: str

    # name of the Function subclass
    class_name: str

    # part of what Function.to_store() returns
    md: dict
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
class Function(Base):
    """
    Record in the 'functions' table for user-defined functions that are not module functions
    (that is, functions that cannot be looked up at runtime as a symbol in a known module).

    A Function without a name is an anonymous function used in the definition of a computed column.
    A Function with a name is additionally assigned to a database and a directory.
    The Python version under which the Function was created (and its callable pickled) is stored
    so that version mismatches can be warned about.
    """

    __tablename__ = 'functions'

    # UUID primary key; a fresh uuid4 is generated when none is supplied
    id = sql.Column(
        UUID(as_uuid=True),
        nullable=False,
        default=uuid.uuid4,
        primary_key=True,
    )

    # optional foreign key into dirs.id
    dir_id = sql.Column(
        UUID(as_uuid=True),
        ForeignKey('dirs.id'),
        nullable=True,
    )

    # JSON payload; holds a FunctionMd
    md = sql.Column(JSONB, nullable=False)

    # optional binary payload
    # NOTE(review): presumably the pickled callable mentioned in the docstring -- confirm
    binary_obj = sql.Column(LargeBinary, nullable=True)
|