pixeltable-0.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pixeltable has been flagged as potentially problematic.
- pixeltable/__init__.py +53 -0
- pixeltable/__version__.py +3 -0
- pixeltable/catalog/__init__.py +13 -0
- pixeltable/catalog/catalog.py +159 -0
- pixeltable/catalog/column.py +181 -0
- pixeltable/catalog/dir.py +32 -0
- pixeltable/catalog/globals.py +33 -0
- pixeltable/catalog/insertable_table.py +192 -0
- pixeltable/catalog/named_function.py +36 -0
- pixeltable/catalog/path.py +58 -0
- pixeltable/catalog/path_dict.py +139 -0
- pixeltable/catalog/schema_object.py +39 -0
- pixeltable/catalog/table.py +695 -0
- pixeltable/catalog/table_version.py +1026 -0
- pixeltable/catalog/table_version_path.py +133 -0
- pixeltable/catalog/view.py +203 -0
- pixeltable/dataframe.py +749 -0
- pixeltable/env.py +466 -0
- pixeltable/exceptions.py +17 -0
- pixeltable/exec/__init__.py +10 -0
- pixeltable/exec/aggregation_node.py +78 -0
- pixeltable/exec/cache_prefetch_node.py +116 -0
- pixeltable/exec/component_iteration_node.py +79 -0
- pixeltable/exec/data_row_batch.py +94 -0
- pixeltable/exec/exec_context.py +22 -0
- pixeltable/exec/exec_node.py +61 -0
- pixeltable/exec/expr_eval_node.py +217 -0
- pixeltable/exec/in_memory_data_node.py +73 -0
- pixeltable/exec/media_validation_node.py +43 -0
- pixeltable/exec/sql_scan_node.py +226 -0
- pixeltable/exprs/__init__.py +25 -0
- pixeltable/exprs/arithmetic_expr.py +102 -0
- pixeltable/exprs/array_slice.py +71 -0
- pixeltable/exprs/column_property_ref.py +77 -0
- pixeltable/exprs/column_ref.py +114 -0
- pixeltable/exprs/comparison.py +77 -0
- pixeltable/exprs/compound_predicate.py +98 -0
- pixeltable/exprs/data_row.py +199 -0
- pixeltable/exprs/expr.py +594 -0
- pixeltable/exprs/expr_set.py +39 -0
- pixeltable/exprs/function_call.py +382 -0
- pixeltable/exprs/globals.py +69 -0
- pixeltable/exprs/image_member_access.py +96 -0
- pixeltable/exprs/in_predicate.py +96 -0
- pixeltable/exprs/inline_array.py +109 -0
- pixeltable/exprs/inline_dict.py +103 -0
- pixeltable/exprs/is_null.py +38 -0
- pixeltable/exprs/json_mapper.py +121 -0
- pixeltable/exprs/json_path.py +159 -0
- pixeltable/exprs/literal.py +66 -0
- pixeltable/exprs/object_ref.py +41 -0
- pixeltable/exprs/predicate.py +44 -0
- pixeltable/exprs/row_builder.py +329 -0
- pixeltable/exprs/rowid_ref.py +94 -0
- pixeltable/exprs/similarity_expr.py +65 -0
- pixeltable/exprs/type_cast.py +53 -0
- pixeltable/exprs/variable.py +45 -0
- pixeltable/ext/__init__.py +5 -0
- pixeltable/ext/functions/yolox.py +92 -0
- pixeltable/func/__init__.py +7 -0
- pixeltable/func/aggregate_function.py +197 -0
- pixeltable/func/callable_function.py +113 -0
- pixeltable/func/expr_template_function.py +99 -0
- pixeltable/func/function.py +141 -0
- pixeltable/func/function_registry.py +227 -0
- pixeltable/func/globals.py +46 -0
- pixeltable/func/nos_function.py +202 -0
- pixeltable/func/signature.py +162 -0
- pixeltable/func/udf.py +164 -0
- pixeltable/functions/__init__.py +95 -0
- pixeltable/functions/eval.py +215 -0
- pixeltable/functions/fireworks.py +34 -0
- pixeltable/functions/huggingface.py +167 -0
- pixeltable/functions/image.py +16 -0
- pixeltable/functions/openai.py +289 -0
- pixeltable/functions/pil/image.py +147 -0
- pixeltable/functions/string.py +13 -0
- pixeltable/functions/together.py +143 -0
- pixeltable/functions/util.py +52 -0
- pixeltable/functions/video.py +62 -0
- pixeltable/globals.py +425 -0
- pixeltable/index/__init__.py +2 -0
- pixeltable/index/base.py +51 -0
- pixeltable/index/embedding_index.py +168 -0
- pixeltable/io/__init__.py +3 -0
- pixeltable/io/hf_datasets.py +188 -0
- pixeltable/io/pandas.py +148 -0
- pixeltable/io/parquet.py +192 -0
- pixeltable/iterators/__init__.py +3 -0
- pixeltable/iterators/base.py +52 -0
- pixeltable/iterators/document.py +432 -0
- pixeltable/iterators/video.py +88 -0
- pixeltable/metadata/__init__.py +58 -0
- pixeltable/metadata/converters/convert_10.py +18 -0
- pixeltable/metadata/converters/convert_12.py +3 -0
- pixeltable/metadata/converters/convert_13.py +41 -0
- pixeltable/metadata/schema.py +234 -0
- pixeltable/plan.py +620 -0
- pixeltable/store.py +424 -0
- pixeltable/tool/create_test_db_dump.py +184 -0
- pixeltable/tool/create_test_video.py +81 -0
- pixeltable/type_system.py +846 -0
- pixeltable/utils/__init__.py +17 -0
- pixeltable/utils/arrow.py +98 -0
- pixeltable/utils/clip.py +18 -0
- pixeltable/utils/coco.py +136 -0
- pixeltable/utils/documents.py +69 -0
- pixeltable/utils/filecache.py +195 -0
- pixeltable/utils/help.py +11 -0
- pixeltable/utils/http_server.py +70 -0
- pixeltable/utils/media_store.py +76 -0
- pixeltable/utils/pytorch.py +91 -0
- pixeltable/utils/s3.py +13 -0
- pixeltable/utils/sql.py +17 -0
- pixeltable/utils/transactional_directory.py +35 -0
- pixeltable-0.0.0.dist-info/LICENSE +18 -0
- pixeltable-0.0.0.dist-info/METADATA +131 -0
- pixeltable-0.0.0.dist-info/RECORD +119 -0
- pixeltable-0.0.0.dist-info/WHEEL +4 -0
pixeltable/iterators/document.py
@@ -0,0 +1,432 @@
+import dataclasses
+import enum
+import logging
+from typing import Dict, Any, List, Tuple, Optional, Iterable, Iterator
+
+import ftfy
+
+from pixeltable.env import Env
+from pixeltable.exceptions import Error
+from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
+from pixeltable.utils.documents import get_document_handle
+from .base import ComponentIterator
+
+_logger = logging.getLogger('pixeltable')
+
+class ChunkMetadata(enum.Enum):
+    TITLE = 1
+    HEADING = 2
+    SOURCELINE = 3
+    PAGE = 4
+    BOUNDING_BOX = 5
+
+class Separator(enum.Enum):
+    HEADING = 1
+    PARAGRAPH = 2
+    SENTENCE = 3
+    TOKEN_LIMIT = 4
+    CHAR_LIMIT = 5
+    PAGE = 6
+
+@dataclasses.dataclass
+class DocumentSectionMetadata:
+    """Metadata for a subsection of a document (i.e., a structural element like a heading or paragraph)"""
+    # html and markdown metadata
+    sourceline: Optional[int] = None
+    # the stack of headings up to the most recently observed one;
+    # e.g., if the most recent one was an h2, 'heading' would contain keys 1 and 2, but nothing below that
+    heading: Optional[Dict[int, str]] = None
+
+    # pdf-specific metadata
+    page: Optional[int] = None
+    # bounding box as an {x1, y1, x2, y2} dictionary
+    bounding_box: Optional[Dict[str, float]] = None
+
+@dataclasses.dataclass
+class DocumentSection:
+    """A single document chunk, according to some of the splitting criteria"""
+    text: Optional[str]
+    metadata: Optional[DocumentSectionMetadata]
+
+
+def _parse_separators(separators: str) -> List[Separator]:
+    ret = []
+    for s in separators.split(','):
+        clean_s = s.strip().upper()
+        if not clean_s:
+            continue
+        if clean_s not in Separator.__members__:
+            raise Error(
+                f'Invalid separator: `{s.strip()}`. Valid separators are: {", ".join(Separator.__members__).lower()}'
+            )
+        ret.append(Separator[clean_s])
+    return ret
+
+
+def _parse_metadata(metadata: str) -> List[ChunkMetadata]:
+    ret = []
+    for m in metadata.split(','):
+        clean_m = m.strip().upper()
+        if not clean_m:
+            continue
+        if clean_m not in ChunkMetadata.__members__:
+            raise Error(
+                f'Invalid metadata: `{m.strip()}`. Valid metadata are: {", ".join(ChunkMetadata.__members__).lower()}'
+            )
+        ret.append(ChunkMetadata[clean_m])
+    return ret
+
+
+_HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
+
+class DocumentSplitter(ComponentIterator):
+    """Iterator over pieces of a document. The document is split into chunks based on the specified separators.
+
+    The iterator's output tuples have the schema {'text': StringType()}, and can include additional metadata
+    fields if specified in the `metadata` argument, as explained below.
+    All chunk text is passed through `ftfy.fix_text` to fix up common problems with unicode sequences.
+
+    Args:
+        `metadata`: which additional metadata fields to include in the output schema:
+            'title', 'heading' (HTML and Markdown), 'sourceline' (HTML), 'page' (PDF), 'bounding_box' (PDF).
+            The input can be a comma-separated string of these values, e.g. 'title,heading,sourceline'.
+        `separators`: which separators to use to split the document into rows. Options are:
+            'heading', 'paragraph', 'sentence', 'token_limit', 'char_limit', 'page'. As with `metadata`,
+            this can be a comma-separated string, e.g. 'heading, token_limit'.
+        `limit`: the maximum number of tokens or characters in each chunk, if 'token_limit' or 'char_limit'
+            is specified.
+    """
+    METADATA_COLUMN_TYPES = {
+        ChunkMetadata.TITLE: StringType(nullable=True),
+        ChunkMetadata.HEADING: JsonType(nullable=True),
+        ChunkMetadata.SOURCELINE: IntType(nullable=True),
+        ChunkMetadata.PAGE: IntType(nullable=True),
+        ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
+    }
+
+    def __init__(
+            self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None,
+            metadata: str = '', html_skip_tags: Optional[List[str]] = None,
+            tiktoken_encoding: Optional[str] = 'cl100k_base', tiktoken_target_model: Optional[str] = None
+    ):
+        if html_skip_tags is None:
+            html_skip_tags = ['nav']
+        self._doc_handle = get_document_handle(document)
+        assert self._doc_handle is not None
+        # call output_schema() to validate the input arguments
+        self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
+        self._separators = _parse_separators(separators)
+        self._metadata_fields = _parse_metadata(metadata)
+        if self._doc_handle.bs_doc is not None:
+            title = self._doc_handle.bs_doc.title
+            if title is None:
+                self._doc_title = ''
+            else:
+                self._doc_title = ftfy.fix_text(title.get_text().strip())
+        else:
+            self._doc_title = ''
+        self._limit = 0 if limit is None else limit
+        self._skip_tags = html_skip_tags
+        self._overlap = 0 if overlap is None else overlap
+        self._tiktoken_encoding = tiktoken_encoding
+        self._tiktoken_target_model = tiktoken_target_model
+
+        # set up processing pipeline
+        if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
+            assert self._doc_handle.bs_doc is not None
+            self._sections = self._html_sections()
+        elif self._doc_handle.format == DocumentType.DocumentFormat.MD:
+            assert self._doc_handle.md_ast is not None
+            self._sections = self._markdown_sections()
+        elif self._doc_handle.format == DocumentType.DocumentFormat.PDF:
+            assert self._doc_handle.pdf_doc is not None
+            self._sections = self._pdf_sections()
+        else:
+            assert False, f'unknown document format: {self._doc_handle.format}'
+
+        if Separator.SENTENCE in self._separators:
+            self._sections = self._sentence_sections(self._sections)
+        if Separator.TOKEN_LIMIT in self._separators:
+            self._sections = self._token_chunks(self._sections)
+        if Separator.CHAR_LIMIT in self._separators:
+            self._sections = self._char_chunks(self._sections)
+
+    @classmethod
+    def input_schema(cls) -> Dict[str, ColumnType]:
+        return {
+            'document': DocumentType(nullable=False),
+            'separators': StringType(nullable=False),
+            'metadata': StringType(nullable=True),
+            'limit': IntType(nullable=True),
+            'overlap': IntType(nullable=True),
+            'skip_tags': StringType(nullable=True),
+            'tiktoken_encoding': StringType(nullable=True),
+            'tiktoken_target_model': StringType(nullable=True),
+        }
+
+    @classmethod
+    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
+        schema = {'text': StringType()}
+        md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
+
+        for md_field in md_fields:
+            schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
+
+        assert 'separators' in kwargs
+        separators = _parse_separators(kwargs['separators'])
+
+        limit = kwargs.get('limit')
+        overlap = kwargs.get('overlap')
+
+        if limit is not None or overlap is not None:
+            if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
+                raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
+            if limit is not None and limit <= 0:
+                raise Error('"limit" must be an integer > 0')
+            if overlap is not None and overlap < 0:
+                raise Error('"overlap" must be an integer >= 0')
+        if Separator.TOKEN_LIMIT in separators or Separator.CHAR_LIMIT in separators:
+            if Separator.TOKEN_LIMIT in separators and Separator.CHAR_LIMIT in separators:
+                raise Error('Cannot specify both "token_limit" and "char_limit" separators')
+            if kwargs.get('limit') is None:
+                raise Error('limit is required with "token_limit"/"char_limit" separators')
+
+        # check dependencies at the end
+        if Separator.SENTENCE in separators:
+            Env.get().require_package('spacy')
+        if Separator.TOKEN_LIMIT in separators:
+            Env.get().require_package('tiktoken')
+
+        return schema, []
+
+    def __next__(self) -> Dict[str, Any]:
+        while True:
+            section = next(self._sections)
+            if section.text is None:
+                continue
+            result = {'text': section.text}
+            for md_field in self._metadata_fields:
+                if md_field == ChunkMetadata.TITLE:
+                    result[md_field.name.lower()] = self._doc_title
+                elif md_field == ChunkMetadata.HEADING:
+                    result[md_field.name.lower()] = section.metadata.heading
+                elif md_field == ChunkMetadata.SOURCELINE:
+                    result[md_field.name.lower()] = section.metadata.sourceline
+                elif md_field == ChunkMetadata.PAGE:
+                    result[md_field.name.lower()] = section.metadata.page
+                elif md_field == ChunkMetadata.BOUNDING_BOX:
+                    result[md_field.name.lower()] = section.metadata.bounding_box
+            return result
+
+    def _html_sections(self) -> Iterator[DocumentSection]:
+        """Create DocumentSections reflecting the html-specific separators"""
+        import bs4
+        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
+        emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
+        # current state
+        accumulated_text = []  # currently accumulated text
+        # accumulate pieces, then join before emitting, to avoid the quadratic complexity of string concatenation
+
+        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
+        sourceline = 0  # most recently seen sourceline
+
+        def update_metadata(el: bs4.Tag) -> None:
+            # update current state
+            nonlocal headings, sourceline
+            sourceline = el.sourceline
+            if el.name in _HTML_HEADINGS:
+                level = int(el.name[1])
+                # remove the previously seen lower levels
+                lower_levels = [l for l in headings if l > level]
+                for l in lower_levels:
+                    del headings[l]
+                headings[level] = el.get_text().strip()
+
+        def emit() -> Iterator[DocumentSection]:
+            nonlocal accumulated_text, headings, sourceline
+            if len(accumulated_text) > 0:
+                md = DocumentSectionMetadata(sourceline=sourceline, heading=headings.copy())
+                full_text = ' '.join(accumulated_text)
+                full_text = ftfy.fix_text(full_text)
+                yield DocumentSection(text=full_text, metadata=md)
+                accumulated_text = []
+
+        def process_element(el: bs4.PageElement) -> Iterator[DocumentSection]:
+            # process the element and emit sections as necessary
+            nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
+            if el.name in self._skip_tags:
+                return
+
+            if isinstance(el, bs4.NavigableString):
+                # accumulate text until we see a tag we care about
+                text = el.get_text().strip()
+                if len(text) > 0:
+                    accumulated_text.append(text)
+                return
+
+            if el.name in _HTML_HEADINGS:
+                if emit_on_heading:
+                    yield from emit()
+                update_metadata(el)
+            elif el.name == 'p':
+                if emit_on_paragraph:
+                    yield from emit()
+                update_metadata(el)
+            for child in el.children:
+                yield from process_element(child)
+
+        yield from process_element(self._doc_handle.bs_doc)
+        yield from emit()
+
+    def _markdown_sections(self) -> Iterator[DocumentSection]:
+        """Create DocumentSections reflecting the markdown-specific separators"""
+        assert self._doc_handle.md_ast is not None
+        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
+        emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
+        # current state
+        accumulated_text = []  # currently accumulated text
+        # accumulate pieces, then join before emitting, to avoid the quadratic complexity of string concatenation
+        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
+
+        def update_headings(heading: Dict) -> None:
+            # update current state
+            nonlocal headings
+            assert 'type' in heading and heading['type'] == 'heading'
+            level = heading['attrs']['level']
+            text = heading['children'][0]['raw'].strip()
+            # remove the previously seen lower levels
+            lower_levels = [l for l in headings.keys() if l > level]
+            for l in lower_levels:
+                del headings[l]
+            headings[level] = text
+
+        def emit() -> Iterator[DocumentSection]:
+            nonlocal accumulated_text, headings
+            if len(accumulated_text) > 0:
+                metadata = DocumentSectionMetadata(sourceline=0, heading=headings.copy())
+                yield DocumentSection(text=ftfy.fix_text(' '.join(accumulated_text)), metadata=metadata)
+                accumulated_text = []
+
+        def process_element(el: Dict) -> Iterator[DocumentSection]:
+            # process the element and emit sections as necessary
+            nonlocal accumulated_text, headings, emit_on_heading, emit_on_paragraph
+            assert 'type' in el
+
+            if el['type'] == 'text':
+                # accumulate text until we see a separator element
+                text = el['raw'].strip()
+                if len(text) > 0:
+                    accumulated_text.append(text)
+                return
+
+            if el['type'] == 'heading':
+                if emit_on_heading:
+                    yield from emit()
+                update_headings(el)
+            elif el['type'] == 'paragraph':
+                if emit_on_paragraph:
+                    yield from emit()
+            if 'children' not in el:
+                return
+            for child in el['children']:
+                yield from process_element(child)
+
+        for el in self._doc_handle.md_ast:
+            yield from process_element(el)
+        yield from emit()
+
+    def _pdf_sections(self) -> Iterator[DocumentSection]:
+        """Create DocumentSections reflecting the pdf-specific separators"""
+        import fitz
+        doc: fitz.Document = self._doc_handle.pdf_doc
+        assert doc is not None
+
+        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
+        emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph
+
+        accumulated_text = []  # invariant: all elements are ftfy-clean and non-empty
+
+        def _add_cleaned_text(raw_text: str) -> None:
+            fixed = ftfy.fix_text(raw_text)
+            if fixed:
+                accumulated_text.append(fixed)
+
+        def _emit_text() -> str:
+            full_text = ''.join(accumulated_text)
+            accumulated_text.clear()
+            return full_text
+
+        for page_number, page in enumerate(doc.pages()):
+            for block in page.get_text('blocks'):
+                # there is no concept of a paragraph in pdf; a block is the closest thing
+                # we can get (e.g., a paragraph of text may cut across pages);
+                # see the pymupdf docs: https://pymupdf.readthedocs.io/en/latest/app1.html
+                # other libraries like pdfminer also lack an explicit paragraph concept
+                x1, y1, x2, y2, text, _, _ = block
+                _add_cleaned_text(text)
+                if accumulated_text and emit_on_paragraph:
+                    bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
+                    metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
+                    yield DocumentSection(text=_emit_text(), metadata=metadata)
+
+            if accumulated_text and emit_on_page and not emit_on_paragraph:
+                yield DocumentSection(text=_emit_text(),
+                                      metadata=DocumentSectionMetadata(page=page_number))
+                accumulated_text = []
+
+        if accumulated_text and not emit_on_page:
+            yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
+
+    def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
+        """Split the input sections into sentences"""
+        for section in input_sections:
+            if section.text is not None:
+                doc = Env.get().spacy_nlp(section.text)
+                for sent in doc.sents:
+                    yield DocumentSection(text=sent.text, metadata=section.metadata)
+
+    def _token_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
+        import tiktoken
+        if self._tiktoken_target_model is not None:
+            encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
+        else:
+            encoding = tiktoken.get_encoding(self._tiktoken_encoding)
+        assert self._limit > 0 and self._overlap >= 0
+
+        for section in input:
+            if section.text is None:
+                continue
+            tokens = encoding.encode(section.text)
+            start_idx = 0
+            text = None
+            while start_idx < len(tokens):
+                end_idx = min(start_idx + self._limit, len(tokens))
+                while end_idx > start_idx:
+                    # find a cutoff point that doesn't cut in the middle of a utf8 multi-byte sequence
+                    try:
+                        # check that the truncated data can be properly decoded
+                        text = encoding.decode(tokens[start_idx:end_idx], errors='strict')
+                        break
+                    except UnicodeDecodeError:
+                        # we split the token array at a point where the utf8 encoding is broken
+                        end_idx -= 1
+
+                assert end_idx > start_idx
+                assert text
+                yield DocumentSection(text=text, metadata=section.metadata)
+                start_idx = max(start_idx + 1, end_idx - self._overlap)  # ensure we make progress
+
+    def _char_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
+        for section in input:
+            if section.text is None:
+                continue
+            start_idx = 0
+            while start_idx < len(section.text):
+                end_idx = min(start_idx + self._limit, len(section.text))
+                text = section.text[start_idx:end_idx]
+                yield DocumentSection(text=text, metadata=section.metadata)
+                start_idx += self._limit - self._overlap
+
+    def close(self) -> None:
+        pass

+    def set_pos(self, pos: int) -> None:
+        pass
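For orientation, a minimal sketch of driving DocumentSplitter by hand; the file name is hypothetical, a working Pixeltable environment and the relevant document dependencies (e.g. fitz for PDFs) are assumed, and in normal use the iterator would be attached to a view rather than called directly:

    from pixeltable.iterators.document import DocumentSplitter

    # 'report.pdf' is a hypothetical local file; HTML and Markdown inputs work the same way
    splitter = DocumentSplitter('report.pdf', separators='paragraph', metadata='page,bounding_box')
    chunk = next(splitter)  # e.g. {'text': '...', 'page': 0, 'bounding_box': {'x1': ..., 'y1': ..., ...}}
    splitter.close()

Each requested metadata field shows up as an extra key in every returned dict, matching the schema computed by output_schema().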
pixeltable/iterators/video.py
@@ -0,0 +1,88 @@
+import logging
+import math
+from pathlib import Path
+from typing import Dict, Any, List, Tuple
+
+import PIL.Image
+import cv2
+
+from pixeltable import exprs
+from pixeltable.exceptions import Error
+from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
+from .base import ComponentIterator
+
+_logger = logging.getLogger('pixeltable')
+
+class FrameIterator(ComponentIterator):
+    def __init__(self, video: str, *, fps: float = 0.0):
+        video_path = Path(video)
+        assert video_path.exists() and video_path.is_file()
+        self.video_path = video_path
+        self.fps = fps
+        self.video_reader = cv2.VideoCapture(str(video_path))
+        if not self.video_reader.isOpened():
+            raise Error(f'Failed to open video: {video}')
+        video_fps = int(self.video_reader.get(cv2.CAP_PROP_FPS))
+        if fps > video_fps:
+            raise Error(f'Video {video}: requested fps ({fps}) exceeds that of the video ({video_fps})')
+        self.frame_freq = int(video_fps / fps) if fps > 0 else 1
+        num_video_frames = int(self.video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
+        if num_video_frames == 0:
+            raise Error(f'Video {video}: failed to get number of frames')
+        # ceil: round up to ensure we count frame 0
+        self.num_frames = math.ceil(num_video_frames / self.frame_freq) if fps > 0 else num_video_frames
+        _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps}')
+
+        self.next_frame_idx = 0
+
+    @classmethod
+    def input_schema(cls) -> Dict[str, ColumnType]:
+        return {
+            'video': VideoType(nullable=False),
+            'fps': FloatType()
+        }
+
+    @classmethod
+    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
+        return {
+            'frame_idx': IntType(),
+            'pos_msec': FloatType(),
+            'pos_frame': FloatType(),
+            'frame': ImageType(),
+        }, ['frame']
+
+    def __next__(self) -> Dict[str, Any]:
+        while True:
+            pos_msec = self.video_reader.get(cv2.CAP_PROP_POS_MSEC)
+            pos_frame = self.video_reader.get(cv2.CAP_PROP_POS_FRAMES)
+            status, img = self.video_reader.read()
+            if not status:
+                _logger.debug(f'releasing video reader for {self.video_path}')
+                self.video_reader.release()
+                self.video_reader = None
+                raise StopIteration
+            if pos_frame % self.frame_freq == 0:
+                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+                result = {
+                    'frame_idx': self.next_frame_idx,
+                    'pos_msec': pos_msec,
+                    'pos_frame': pos_frame,
+                    'frame': PIL.Image.fromarray(img),
+                }
+                self.next_frame_idx += 1
+                # frame_freq > 1: jumping to the target frame here with video_reader.set() is far slower
+                # than just skipping the unwanted frames
+                return result
+
+    def close(self) -> None:
+        if self.video_reader is not None:
+            self.video_reader.release()
+            self.video_reader = None
+
+    def set_pos(self, pos: int) -> None:
+        """Seek to frame idx"""
+        if pos == self.next_frame_idx:
+            return
+        _logger.debug(f'seeking to frame {pos}')
+        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, pos * self.frame_freq)
+        self.next_frame_idx = pos
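Similarly, a hedged sketch of using FrameIterator directly (the file name is hypothetical). Note that set_pos() operates in units of sampled frames: it multiplies by frame_freq internally to find the source-frame position:

    from pixeltable.iterators.video import FrameIterator

    frames = FrameIterator('clip.mp4', fps=1.0)  # sample roughly one frame per second
    first = next(frames)   # {'frame_idx': 0, 'pos_msec': 0.0, 'pos_frame': 0.0, 'frame': <PIL.Image.Image>}
    frames.set_pos(10)     # seek to sampled frame 10, i.e. source frame 10 * frame_freq
    frames.close()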
pixeltable/metadata/__init__.py
@@ -0,0 +1,58 @@
+import dataclasses
+import importlib
+import os
+import pkgutil
+from typing import Callable, Dict
+
+import sqlalchemy as sql
+import sqlalchemy.orm as orm
+
+from .schema import SystemInfo, SystemInfoMd
+
+# current version of the metadata; this is incremented whenever the metadata schema changes
+VERSION = 14
+
+
+def create_system_info(engine: sql.engine.Engine) -> None:
+    """Create the system metadata record"""
+    system_md = SystemInfoMd(schema_version=VERSION)
+    record = SystemInfo(md=dataclasses.asdict(system_md))
+    with orm.Session(engine, future=True) as session:
+        session.add(record)
+        session.flush()
+        session.commit()
+
+# conversion functions for upgrading the metadata schema from one version to the next
+# key: old schema version
+converter_cbs: Dict[int, Callable[[sql.engine.Engine], None]] = {}
+
+def register_converter(version: int, cb: Callable[[sql.engine.Engine], None]) -> None:
+    global converter_cbs
+    converter_cbs[version] = cb
+
+def noop_converter(engine: sql.engine.Engine) -> None:
+    # converter to use when incrementing the schema version without any functional changes
+    pass
+
+# load all converter modules
+for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/converters']):
+    importlib.import_module('pixeltable.metadata.converters.' + modname)
+
+def upgrade_md(engine: sql.engine.Engine) -> None:
+    """Upgrade the metadata schema to the current version"""
+    with orm.Session(engine) as session:
+        system_info = session.query(SystemInfo).one().md
+        md_version = system_info['schema_version']
+        if md_version == VERSION:
+            return
+        while md_version < VERSION:
+            if md_version not in converter_cbs:
+                raise RuntimeError(f'No metadata converter for version {md_version}')
+            print(f'Converting metadata from version {md_version} to {md_version + 1}')
+            converter_cbs[md_version](engine)
+            md_version += 1
+        # update system info
+        conn = session.connection()
+        system_info_md = SystemInfoMd(schema_version=VERSION)
+        conn.execute(SystemInfo.__table__.update().values(md=dataclasses.asdict(system_info_md)))
+        session.commit()
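The converter registry is what drives upgrade_md()'s one-version-at-a-time loop: every intermediate version needs an entry, even when a schema bump carries no data changes. A minimal sketch of such a converter module (the version-14 bump shown here is hypothetical), which the pkgutil loop above would pick up automatically:

    # hypothetical pixeltable/metadata/converters/convert_14.py
    from pixeltable.metadata import register_converter, noop_converter

    # version 14 -> 15 bump with no data changes
    register_converter(14, noop_converter)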
pixeltable/metadata/converters/convert_10.py
@@ -0,0 +1,18 @@
+import sqlalchemy as sql
+
+from pixeltable.metadata.schema import Table, TableSchemaVersion
+from pixeltable.metadata import register_converter
+
+
+def convert_10(engine: sql.engine.Engine) -> None:
+    default_table_attrs = {"comment": None, "num_retained_versions": 10}
+    with engine.begin() as conn:
+        # Because `parameters` wasn't actually used for anything,
+        # we can simply delete it without any data loss.
+        conn.execute(sql.update(Table).values(md=Table.md - 'parameters'))
+        # Add `table_attrs` to all instances of tableschemaversions.md.
+        conn.execute(sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat(default_table_attrs)))
+    return
+
+
+register_converter(10, convert_10)
pixeltable/metadata/converters/convert_13.py
@@ -0,0 +1,41 @@
+import logging
+from typing import Any
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.schema import Table
+
+_logger = logging.getLogger('pixeltable')
+
+
+def convert_13(engine: sql.engine.Engine) -> None:
+    with engine.begin() as conn:
+        for row in conn.execute(sql.select(Table)):
+            id = row[0]
+            md = row[2]
+            updated_md = _update_md(md)
+            if updated_md != md:
+                _logger.info(f'Updating schema for table: {id}')
+                conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_md))
+
+
+# Traverse the schema dictionary and replace instances of `ExplicitBatchedFunction` with
+# `CallableFunction`. DB versions prior to 14 can't contain serialized batched functions,
+# so this is all we need to do.
+def _update_md(md: Any) -> Any:
+    if isinstance(md, dict):
+        updated_md = {}
+        for k, v in md.items():
+            if k == '_classpath' and v == 'pixeltable.func.batched_function.ExplicitBatchedFunction':
+                updated_md[k] = 'pixeltable.func.callable_function.CallableFunction'
+            else:
+                updated_md[k] = _update_md(v)
+        return updated_md
+    elif isinstance(md, list):
+        return [_update_md(v) for v in md]
+    else:
+        return md
+
+
+register_converter(13, convert_13)
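To illustrate the recursion in _update_md, here is a made-up metadata fragment and the result it produces; only dict values under the key '_classpath' that exactly match the old classpath are rewritten, everything else passes through unchanged:

    md = {'value': {'_classpath': 'pixeltable.func.batched_function.ExplicitBatchedFunction',
                    'args': [{'_classpath': 'pixeltable.exprs.literal.Literal'}]}}
    _update_md(md)
    # {'value': {'_classpath': 'pixeltable.func.callable_function.CallableFunction',
    #            'args': [{'_classpath': 'pixeltable.exprs.literal.Literal'}]}}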