pixeltable 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (140) hide show
  1. pixeltable/__init__.py +21 -4
  2. pixeltable/catalog/__init__.py +13 -0
  3. pixeltable/catalog/catalog.py +159 -0
  4. pixeltable/catalog/column.py +200 -0
  5. pixeltable/catalog/dir.py +32 -0
  6. pixeltable/catalog/globals.py +33 -0
  7. pixeltable/catalog/insertable_table.py +191 -0
  8. pixeltable/catalog/named_function.py +36 -0
  9. pixeltable/catalog/path.py +58 -0
  10. pixeltable/catalog/path_dict.py +139 -0
  11. pixeltable/catalog/schema_object.py +39 -0
  12. pixeltable/catalog/table.py +581 -0
  13. pixeltable/catalog/table_version.py +749 -0
  14. pixeltable/catalog/table_version_path.py +133 -0
  15. pixeltable/catalog/view.py +203 -0
  16. pixeltable/client.py +520 -31
  17. pixeltable/dataframe.py +540 -349
  18. pixeltable/env.py +373 -48
  19. pixeltable/exceptions.py +12 -21
  20. pixeltable/exec/__init__.py +9 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +113 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +95 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +69 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +225 -0
  31. pixeltable/exprs/__init__.py +24 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +105 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +187 -0
  39. pixeltable/exprs/expr.py +586 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +380 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +115 -0
  44. pixeltable/exprs/image_similarity_predicate.py +58 -0
  45. pixeltable/exprs/inline_array.py +107 -0
  46. pixeltable/exprs/inline_dict.py +101 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +54 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +355 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/type_cast.py +53 -0
  56. pixeltable/exprs/variable.py +45 -0
  57. pixeltable/func/__init__.py +9 -0
  58. pixeltable/func/aggregate_function.py +194 -0
  59. pixeltable/func/batched_function.py +53 -0
  60. pixeltable/func/callable_function.py +69 -0
  61. pixeltable/func/expr_template_function.py +82 -0
  62. pixeltable/func/function.py +110 -0
  63. pixeltable/func/function_registry.py +227 -0
  64. pixeltable/func/globals.py +36 -0
  65. pixeltable/func/nos_function.py +202 -0
  66. pixeltable/func/signature.py +166 -0
  67. pixeltable/func/udf.py +163 -0
  68. pixeltable/functions/__init__.py +52 -103
  69. pixeltable/functions/eval.py +216 -0
  70. pixeltable/functions/fireworks.py +61 -0
  71. pixeltable/functions/huggingface.py +120 -0
  72. pixeltable/functions/image.py +16 -0
  73. pixeltable/functions/openai.py +88 -0
  74. pixeltable/functions/pil/image.py +148 -7
  75. pixeltable/functions/string.py +13 -0
  76. pixeltable/functions/together.py +27 -0
  77. pixeltable/functions/util.py +41 -0
  78. pixeltable/functions/video.py +62 -0
  79. pixeltable/iterators/__init__.py +3 -0
  80. pixeltable/iterators/base.py +48 -0
  81. pixeltable/iterators/document.py +311 -0
  82. pixeltable/iterators/video.py +89 -0
  83. pixeltable/metadata/__init__.py +54 -0
  84. pixeltable/metadata/converters/convert_10.py +18 -0
  85. pixeltable/metadata/schema.py +211 -0
  86. pixeltable/plan.py +656 -0
  87. pixeltable/store.py +413 -182
  88. pixeltable/tests/conftest.py +143 -86
  89. pixeltable/tests/test_audio.py +65 -0
  90. pixeltable/tests/test_catalog.py +27 -0
  91. pixeltable/tests/test_client.py +14 -14
  92. pixeltable/tests/test_component_view.py +372 -0
  93. pixeltable/tests/test_dataframe.py +433 -0
  94. pixeltable/tests/test_dirs.py +78 -62
  95. pixeltable/tests/test_document.py +117 -0
  96. pixeltable/tests/test_exprs.py +591 -135
  97. pixeltable/tests/test_function.py +297 -67
  98. pixeltable/tests/test_functions.py +283 -1
  99. pixeltable/tests/test_migration.py +43 -0
  100. pixeltable/tests/test_nos.py +54 -0
  101. pixeltable/tests/test_snapshot.py +208 -0
  102. pixeltable/tests/test_table.py +1086 -258
  103. pixeltable/tests/test_transactional_directory.py +42 -0
  104. pixeltable/tests/test_types.py +5 -11
  105. pixeltable/tests/test_video.py +149 -34
  106. pixeltable/tests/test_view.py +530 -0
  107. pixeltable/tests/utils.py +186 -45
  108. pixeltable/tool/create_test_db_dump.py +149 -0
  109. pixeltable/type_system.py +490 -133
  110. pixeltable/utils/__init__.py +17 -46
  111. pixeltable/utils/clip.py +12 -15
  112. pixeltable/utils/coco.py +136 -0
  113. pixeltable/utils/documents.py +39 -0
  114. pixeltable/utils/filecache.py +195 -0
  115. pixeltable/utils/help.py +11 -0
  116. pixeltable/utils/media_store.py +76 -0
  117. pixeltable/utils/parquet.py +126 -0
  118. pixeltable/utils/pytorch.py +172 -0
  119. pixeltable/utils/s3.py +13 -0
  120. pixeltable/utils/sql.py +17 -0
  121. pixeltable/utils/transactional_directory.py +35 -0
  122. pixeltable-0.2.1.dist-info/LICENSE +18 -0
  123. pixeltable-0.2.1.dist-info/METADATA +119 -0
  124. pixeltable-0.2.1.dist-info/RECORD +125 -0
  125. {pixeltable-0.1.2.dist-info → pixeltable-0.2.1.dist-info}/WHEEL +1 -1
  126. pixeltable/catalog.py +0 -1421
  127. pixeltable/exprs.py +0 -1745
  128. pixeltable/function.py +0 -269
  129. pixeltable/functions/clip.py +0 -10
  130. pixeltable/functions/pil/__init__.py +0 -23
  131. pixeltable/functions/tf.py +0 -21
  132. pixeltable/index.py +0 -57
  133. pixeltable/tests/test_dict.py +0 -24
  134. pixeltable/tests/test_tf.py +0 -69
  135. pixeltable/tf.py +0 -33
  136. pixeltable/utils/tf.py +0 -33
  137. pixeltable/utils/video.py +0 -32
  138. pixeltable-0.1.2.dist-info/LICENSE +0 -201
  139. pixeltable-0.1.2.dist-info/METADATA +0 -89
  140. pixeltable-0.1.2.dist-info/RECORD +0 -37
@@ -0,0 +1,311 @@
1
+ from typing import Dict, Any, List, Tuple, Generator, Optional, Iterable
2
+ import logging
3
+ import dataclasses
4
+ import enum
5
+
6
+ from .base import ComponentIterator
7
+
8
+ from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
9
+ from pixeltable.exceptions import Error
10
+ from pixeltable.env import Env
11
+ from pixeltable.utils.documents import get_document_handle
12
+
13
+
14
+ _logger = logging.getLogger('pixeltable')
15
+
16
+
class ChunkMetadata(enum.Enum):
    """Metadata fields that can be requested for each chunk (selected by name, case-insensitively)."""
    TITLE = enum.auto()  # == 1
    HEADINGS = enum.auto()  # == 2
    SOURCELINE = enum.auto()  # == 3
21
+
22
+
class Separator(enum.Enum):
    """Splitting criteria for a document (selected by name, case-insensitively)."""
    HEADING = enum.auto()  # == 1
    PARAGRAPH = enum.auto()  # == 2
    SENTENCE = enum.auto()  # == 3
    TOKEN_LIMIT = enum.auto()  # == 4
    CHAR_LIMIT = enum.auto()  # == 5
29
+
30
+
@dataclasses.dataclass
class DocumentSectionMd:
    """Metadata describing one structural piece of a document (eg, a heading or a paragraph)"""

    # line number in the source document associated with this section
    source_line: int

    # heading stack (level -> heading text) ending at the most recently observed heading;
    # eg, right after an h2 this holds entries for levels 1 and 2 only, nothing deeper
    headings: Dict[int, str]
39
+
40
+
@dataclasses.dataclass
class DocumentSection:
    """One chunk of a document, produced by one of the splitting criteria"""

    # the chunk's text; sections whose text is None are skipped by the consumers in this module
    text: Optional[str]

    # metadata of the structural element this chunk originated from
    md: Optional[DocumentSectionMd]
46
+
47
+
class DocumentSplitter(ComponentIterator):
    """Iterator over pieces of a document.

    The document (html or markdown) is first split at structural boundaries (headings/paragraphs); the resulting
    sections are then optionally refined into sentences and/or fixed-size token or character chunks, depending on
    the requested separators.
    """
    MD_COLUMN_TYPES = {
        ChunkMetadata.TITLE: StringType(),
        ChunkMetadata.HEADINGS: JsonType(),
        ChunkMetadata.SOURCELINE: IntType()
    }

    def __init__(
            self, document: str, *, separators: str, limit: int = 0, overlap: int = 0, metadata: str = '',
            html_skip_tags: Optional[List[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
            tiktoken_target_model: Optional[str] = None
    ):
        """
        Args:
            document: path of the document file
            separators: comma-separated Separator names (case-insensitive)
            limit: chunk size (in tokens or characters) for the token_limit/char_limit separators
            overlap: number of tokens/characters shared by consecutive chunks
            metadata: comma-separated ChunkMetadata names (case-insensitive); '' requests no metadata
            html_skip_tags: html elements with these tag names are ignored, including their content;
                defaults to ['nav']
            tiktoken_encoding: name of the tiktoken encoding; only used when tiktoken_target_model is None
            tiktoken_target_model: if set, the tiktoken encoding is looked up for this model name
        """
        if html_skip_tags is None:
            html_skip_tags = ['nav']
        # decode explicitly instead of relying on the platform-dependent default encoding
        with open(document, 'r', encoding='utf8') as fh:
            contents = fh.read()
        self._doc_handle = get_document_handle(contents)
        assert self._doc_handle is not None
        self._separators = [Separator[name.upper()] for name in separators.split(',')]
        self._md_fields = [ChunkMetadata[name.upper()] for name in metadata.split(',')] if len(metadata) > 0 else []
        # the document might not be html, and an html document doesn't necessarily have a <title>
        bs_doc = self._doc_handle.bs_doc
        title_el = bs_doc.title if bs_doc is not None else None
        self._doc_title = title_el.get_text().strip() if title_el is not None else ''
        self._limit = limit
        self._skip_tags = html_skip_tags
        self._overlap = overlap
        self._tiktoken_encoding = tiktoken_encoding
        self._tiktoken_target_model = tiktoken_target_model

        # set up processing pipeline: structural sections first, then the optional refinement stages
        if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
            assert self._doc_handle.bs_doc is not None
            self._sections = self._html_sections()
        else:
            assert self._doc_handle.md_ast is not None
            self._sections = self._markdown_sections()
        if Separator.SENTENCE in self._separators:
            self._sections = self._sentence_sections(self._sections)
        if Separator.TOKEN_LIMIT in self._separators:
            self._sections = self._token_chunks(self._sections)
        if Separator.CHAR_LIMIT in self._separators:
            self._sections = self._char_chunks(self._sections)

    @classmethod
    def input_schema(cls) -> Dict[str, ColumnType]:
        """Return the names and types of the iterator's parameters."""
        # NOTE(review): 'skip_tags' does not match the constructor parameter name 'html_skip_tags'; left unchanged
        # because callers may depend on either name
        return {
            'document': DocumentType(nullable=False),
            'separators': StringType(nullable=False),
            'metadata': StringType(nullable=True),
            'limit': IntType(nullable=True),
            'overlap': IntType(nullable=True),
            'skip_tags': StringType(nullable=True),
            'tiktoken_encoding': StringType(nullable=True),
            'tiktoken_target_model': StringType(nullable=True),
        }

    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
        """Validate the iterator arguments and return the output columns.

        Returns:
            (column name -> type, []): always a 'text' column plus one column per requested metadata field

        Raises:
            Error: for unknown metadata fields or separators, missing optional packages, or an inconsistent
                combination of separators/limit/overlap
        """
        schema = {'text': StringType()}
        if 'metadata' in kwargs and len(kwargs['metadata']) > 0:
            for md_field in kwargs['metadata'].split(','):
                if md_field.upper() not in ChunkMetadata.__members__:
                    raise Error(f'Invalid metadata field {md_field}')
                schema[md_field.lower()] = cls.MD_COLUMN_TYPES[ChunkMetadata[md_field.upper()]]

        assert 'separators' in kwargs
        separators = kwargs['separators'].split(',')
        for separator in separators:
            if separator.upper() not in Separator.__members__:
                raise Error(f'Invalid separator {separator}')
        # separator names are accepted case-insensitively (see above and __init__), so the checks below
        # need to be made against normalized names as well
        separators = [separator.lower() for separator in separators]

        # check dependencies
        if 'sentence' in separators:
            Env.get().require_package('spacy')
        if 'token_limit' in separators:
            Env.get().require_package('tiktoken')

        if 'limit' in kwargs or 'overlap' in kwargs:
            if 'token_limit' not in separators and 'char_limit' not in separators:
                raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
            if 'limit' in kwargs and int(kwargs['limit']) <= 0:
                raise Error('"limit" must be an integer > 0')
            if 'overlap' in kwargs and int(kwargs['overlap']) < 0:
                raise Error('"overlap" must be an integer >= 0')
            # chunking advances by limit - overlap, which needs to be positive in order to terminate
            if 'limit' in kwargs and 'overlap' in kwargs and int(kwargs['overlap']) >= int(kwargs['limit']):
                raise Error('"overlap" must be smaller than "limit"')
        if 'token_limit' in separators or 'char_limit' in separators:
            if 'token_limit' in separators and 'char_limit' in separators:
                raise Error('Cannot specify both "token_limit" and "char_limit" separators')
            if 'limit' not in kwargs:
                raise Error('limit is required with "token_limit"/"char_limit" separators')

        return schema, []

    def __next__(self) -> Dict[str, Any]:
        """Return the next chunk as a dict with key 'text' plus one key per requested metadata field."""
        while True:
            section = next(self._sections)
            if section.text is None:
                continue
            result = {'text': section.text}
            for md_field in self._md_fields:
                if md_field == ChunkMetadata.TITLE:
                    result[md_field.name.lower()] = self._doc_title
                elif md_field == ChunkMetadata.HEADINGS:
                    result[md_field.name.lower()] = section.md.headings
                elif md_field == ChunkMetadata.SOURCELINE:
                    result[md_field.name.lower()] = section.md.source_line
            return result

    def _html_sections(self) -> Generator[DocumentSection, None, None]:
        """Create DocumentSections reflecting the html-specific separators"""
        import bs4
        heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
        emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
        # current state
        text_section = ''  # currently accumulated text
        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
        sourceline = 0  # most recently seen sourceline

        def update_md(el: bs4.Tag) -> None:
            # update current state
            nonlocal sourceline
            sourceline = el.sourceline
            if el.name in heading_tags:
                level = int(el.name[1])
                # remove the previously seen lower levels
                for lower_level in [l for l in headings if l > level]:
                    del headings[lower_level]
                headings[level] = el.get_text().strip()

        def emit() -> Generator[DocumentSection, None, None]:
            # flush the accumulated text (if any) as a section
            nonlocal text_section
            if len(text_section) > 0:
                md = DocumentSectionMd(sourceline, headings.copy())
                yield DocumentSection(text=text_section, md=md)
                text_section = ''

        def process_element(el: bs4.PageElement) -> Generator[DocumentSection, None, None]:
            # process the element and emit sections as necessary
            nonlocal text_section
            if el.name in self._skip_tags:
                return

            if isinstance(el, bs4.NavigableString):
                # accumulate text until we see a tag we care about
                text = el.get_text().strip()
                if len(text) > 0:
                    text_section += ' ' + text
                return

            if el.name in heading_tags:
                if emit_on_heading:
                    yield from emit()
                update_md(el)
            elif el.name == 'p':
                if emit_on_paragraph:
                    yield from emit()
                update_md(el)
            for child in el.children:
                yield from process_element(child)

        yield from process_element(self._doc_handle.bs_doc)
        yield from emit()

    def _markdown_sections(self) -> Generator[DocumentSection, None, None]:
        """Create DocumentSections reflecting the markdown-specific separators"""
        assert self._doc_handle.md_ast is not None
        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
        emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
        # current state
        text_section = ''  # currently accumulated text
        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)

        def update_headings(heading: Dict) -> None:
            # update current state
            assert 'type' in heading and heading['type'] == 'heading'
            level = heading['attrs']['level']
            text = heading['children'][0]['raw'].strip()
            # remove the previously seen lower levels
            for lower_level in [l for l in headings if l > level]:
                del headings[lower_level]
            headings[level] = text

        def emit() -> Generator[DocumentSection, None, None]:
            # flush the accumulated text (if any) as a section; the source line is always recorded as 0 here
            nonlocal text_section
            if len(text_section) > 0:
                md = DocumentSectionMd(0, headings.copy())
                yield DocumentSection(text=text_section, md=md)
                text_section = ''

        def process_element(el: Dict) -> Generator[DocumentSection, None, None]:
            # process the element and emit sections as necessary
            nonlocal text_section
            assert 'type' in el

            if el['type'] == 'text':
                # accumulate text until we see a separator element
                text = el['raw'].strip()
                if len(text) > 0:
                    text_section += ' ' + text
                return

            if el['type'] == 'heading':
                if emit_on_heading:
                    yield from emit()
                update_headings(el)
            elif el['type'] == 'paragraph':
                if emit_on_paragraph:
                    yield from emit()
            if 'children' not in el:
                return
            for child in el['children']:
                yield from process_element(child)

        for el in self._doc_handle.md_ast:
            yield from process_element(el)
        yield from emit()

    def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
        """Split the input sections into sentences"""
        for section in input_sections:
            if section.text is not None:
                doc = Env.get().spacy_nlp(section.text)
                for sent in doc.sents:
                    yield DocumentSection(text=sent.text, md=section.md)

    def _chunk_step(self) -> int:
        """Return the distance between the starts of consecutive chunks.

        Raises:
            Error: if limit/overlap would make the chunking loops run forever or produce empty chunks
        """
        if self._limit <= 0:
            raise Error('"limit" must be an integer > 0')
        if self._overlap < 0:
            raise Error('"overlap" must be an integer >= 0')
        if self._overlap >= self._limit:
            raise Error('"overlap" must be smaller than "limit"')
        return self._limit - self._overlap

    def _token_chunks(self, input: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
        """Split the input sections into chunks of at most 'limit' tokens; consecutive chunks share 'overlap' tokens"""
        import tiktoken
        if self._tiktoken_target_model is not None:
            encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
        else:
            encoding = tiktoken.get_encoding(self._tiktoken_encoding)
        step = self._chunk_step()

        for section in input:
            if section.text is None:
                continue
            tokens = encoding.encode(section.text)
            start_idx = 0
            while start_idx < len(tokens):
                end_idx = min(start_idx + self._limit, len(tokens))
                text = encoding.decode(tokens[start_idx:end_idx])
                yield DocumentSection(text=text, md=section.md)
                start_idx += step

    def _char_chunks(self, input: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
        """Split the input sections into chunks of at most 'limit' characters; consecutive chunks share 'overlap'
        characters"""
        step = self._chunk_step()
        for section in input:
            if section.text is None:
                continue
            start_idx = 0
            while start_idx < len(section.text):
                end_idx = min(start_idx + self._limit, len(section.text))
                text = section.text[start_idx:end_idx]
                yield DocumentSection(text=text, md=section.md)
                start_idx += step

    def close(self) -> None:
        """Nothing to release."""
        pass

    def set_pos(self, pos: int) -> None:
        """Repositioning is not implemented; this is a no-op."""
        pass
@@ -0,0 +1,89 @@
1
+ from typing import Dict, Any, List, Tuple
2
+ from pathlib import Path
3
+ import math
4
+ import logging
5
+
6
+ import cv2
7
+ import PIL.Image
8
+
9
+ from .base import ComponentIterator
10
+
11
+ from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
12
+ from pixeltable.exceptions import Error
13
+
14
+
15
+ _logger = logging.getLogger('pixeltable')
16
+
class FrameIterator(ComponentIterator):
    """Iterator over the frames of a video file, optionally subsampled to a requested frame rate."""

    def __init__(self, video: str, fps: float = 0.0):
        """
        Args:
            video: path of the video file
            fps: requested frame rate; every frame is returned if this is not > 0

        Raises:
            Error: if the file doesn't exist, can't be opened, reports 0 frames, or if fps exceeds the video's
                frame rate
        """
        video_path = Path(video)
        # explicit check instead of an assert, which would be stripped when running with -O
        if not video_path.is_file():
            raise Error(f'Video {video}: file does not exist or is not a regular file')
        self.video_path = video_path
        self.fps = fps
        self.next_frame_idx = 0
        self.video_reader = cv2.VideoCapture(str(video_path))
        try:
            if not self.video_reader.isOpened():
                raise Error(f'Failed to open video: {video}')
            # NOTE(review): int() truncates fractional frame rates (eg, 29.97 -> 29), which means that requesting
            # exactly the native rate of such a video is rejected below
            video_fps = int(self.video_reader.get(cv2.CAP_PROP_FPS))
            if fps > video_fps:
                raise Error(f'Video {video}: requested fps ({fps}) exceeds that of the video ({video_fps})')
            # we return every frame_freq-th frame
            self.frame_freq = int(video_fps / fps) if fps > 0 else 1
            num_video_frames = int(self.video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
            if num_video_frames == 0:
                raise Error(f'Video {video}: failed to get number of frames')
            # ceil: round up to ensure we count frame 0
            self.num_frames = math.ceil(num_video_frames / self.frame_freq) if fps > 0 else num_video_frames
        except Exception:
            # don't hold on to the capture when construction fails
            self.close()
            raise
        _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps}')

    @classmethod
    def input_schema(cls) -> Dict[str, ColumnType]:
        """Return the names and types of the iterator's parameters."""
        return {
            'video': VideoType(nullable=False),
            'fps': FloatType()
        }

    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
        """Return the output columns (name -> type) together with the list ['frame']."""
        return {
            'frame_idx': IntType(),
            'pos_msec': FloatType(),
            'pos_frame': FloatType(),
            'frame': ImageType(),
        }, ['frame']

    def __next__(self) -> Dict[str, Any]:
        """Return the next selected frame as a dict with keys 'frame_idx', 'pos_msec', 'pos_frame', 'frame'.

        Raises:
            StopIteration: when the video is exhausted, and on every call after that
        """
        if self.video_reader is None:
            # the reader was released (exhausted or closed): an exhausted iterator needs to keep signalling
            # StopIteration
            raise StopIteration
        while True:
            # the position needs to be queried before read() advances it
            pos_msec = self.video_reader.get(cv2.CAP_PROP_POS_MSEC)
            pos_frame = self.video_reader.get(cv2.CAP_PROP_POS_FRAMES)
            status, img = self.video_reader.read()
            if not status:
                _logger.debug(f'releasing video reader for {self.video_path}')
                self.close()
                raise StopIteration
            if pos_frame % self.frame_freq == 0:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                result = {
                    'frame_idx': self.next_frame_idx,
                    'pos_msec': pos_msec,
                    'pos_frame': pos_frame,
                    'frame': PIL.Image.fromarray(img),
                }
                self.next_frame_idx += 1
                # frame_freq > 1: jumping to the target frame here with video_reader.set() is far slower than just
                # skipping the unwanted frames
                return result

    def close(self) -> None:
        """Release the video reader; safe to call more than once."""
        if self.video_reader is not None:
            self.video_reader.release()
            self.video_reader = None

    def set_pos(self, pos: int) -> None:
        """Seek to frame idx

        Raises:
            Error: if a seek is required but the video reader has already been released
        """
        if pos == self.next_frame_idx:
            return
        if self.video_reader is None:
            raise Error(f'Video {self.video_path}: cannot seek to frame {pos}, the video reader has been released')
        _logger.debug(f'seeking to frame {pos}')
        # pos counts returned frames; the reader counts all frames of the video
        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, pos * self.frame_freq)
        self.next_frame_idx = pos
@@ -0,0 +1,54 @@
1
+ import dataclasses
2
+ import importlib
3
+ import os
4
+ import pkgutil
5
+ from typing import Callable, Dict
6
+
7
+ import sqlalchemy as sql
8
+ import sqlalchemy.orm as orm
9
+
10
+ from .schema import SystemInfo, SystemInfoMd
11
+
# current version of the metadata; this is incremented whenever the metadata schema changes
VERSION = 12


def create_system_info(engine: sql.engine.Engine) -> None:
    """Insert the systeminfo record, stamped with the current metadata VERSION"""
    md_dict = dataclasses.asdict(SystemInfoMd(schema_version=VERSION))
    with orm.Session(engine, future=True) as session:
        session.add(SystemInfo(md=md_dict))
        session.flush()
        session.commit()
24
+
# conversion functions for upgrading the metadata schema from one version to the following
# key: old schema version
converter_cbs: Dict[int, Callable[[sql.engine.Engine], None]] = {}


def register_converter(version: int, cb: Callable[[sql.engine.Engine], None]) -> None:
    """Register cb as the function that upgrades the metadata from schema version 'version' to the following one"""
    # item assignment on the module-level dict; no 'global' declaration needed
    converter_cbs[version] = cb


# load all converter modules
_converters_dir = os.path.dirname(__file__) + '/converters'
for _module_info in pkgutil.iter_modules([_converters_dir]):
    importlib.import_module('pixeltable.metadata.converters.' + _module_info.name)
36
+
def upgrade_md(engine: sql.engine.Engine) -> None:
    """Upgrade the metadata schema to the current version

    Runs the registered converters in sequence, starting with the stored schema version, and then records
    VERSION in the systeminfo table.

    Raises:
        RuntimeError: if the stored metadata is newer than VERSION or if a required converter is missing
    """
    with orm.Session(engine, future=True) as session:
        system_info = session.query(SystemInfo).one().md
        md_version = system_info['schema_version']
        if md_version == VERSION:
            return
        if md_version > VERSION:
            # the metadata was written by a newer version of the code: stamping it with our (older) VERSION
            # would mislabel it
            raise RuntimeError(
                f'Metadata version {md_version} is newer than the version supported by this code ({VERSION})')
        # make sure the entire upgrade path exists before running any converter, in order to avoid leaving the
        # metadata partially converted
        for version in range(md_version, VERSION):
            if version not in converter_cbs:
                raise RuntimeError(f'No metadata converter for version {version}')
        while md_version < VERSION:
            print(f'Converting metadata from version {md_version} to {md_version + 1}')
            converter_cbs[md_version](engine)
            md_version += 1
        # update system info
        conn = session.connection()
        system_info_md = SystemInfoMd(schema_version=VERSION)
        conn.execute(SystemInfo.__table__.update().values(md=dataclasses.asdict(system_info_md)))
        session.commit()
@@ -0,0 +1,18 @@
1
+ import sqlalchemy as sql
2
+
3
+ from pixeltable.metadata.schema import Table, TableSchemaVersion
4
+ from pixeltable.metadata import register_converter
5
+
6
+
def convert_10(engine: sql.engine.Engine) -> None:
    """Metadata converter for schema version 10: drops 'parameters' from tables.md and adds the table attributes
    to tableschemaversions.md"""
    table_attr_defaults = {"comment": None, "num_retained_versions": 10}
    # Because `parameters` wasn't actually used for anything,
    # we can simply delete it without any data loss.
    drop_parameters = sql.update(Table).values(md=Table.md - 'parameters')
    # Add `table_attrs` to all instances of tableschemaversions.md.
    add_table_attrs = sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat(table_attr_defaults))
    with engine.begin() as conn:
        conn.execute(drop_parameters)
        conn.execute(add_table_attrs)


register_converter(10, convert_10)
@@ -0,0 +1,211 @@
1
+ from typing import Optional, List, Dict, get_type_hints, Type, Any, TypeVar, Tuple, Union
2
+ import platform
3
+ import uuid
4
+ import dataclasses
5
+
6
+ import sqlalchemy as sql
7
+ from sqlalchemy import Integer, String, Boolean, BigInteger, LargeBinary
8
+ from sqlalchemy.dialects.postgresql import UUID, JSONB
9
+ from sqlalchemy import ForeignKey, UniqueConstraint, ForeignKeyConstraint
10
+ from sqlalchemy.orm import declarative_base
11
+
12
+ Base = declarative_base()
13
+
T = TypeVar('T')


def md_from_dict(data_class_type: Type[T], data: Any) -> T:
    """Re-instantiate a dataclass instance that contains nested dataclasses from a dict.

    Recurses through dataclasses and through Optional/List/Dict/Tuple type hints. Data whose declared type
    isn't one of those (or can't be resolved unambiguously, such as a Union of several types) is returned
    unchanged.

    Args:
        data_class_type: the declared type of 'data': a dataclass, a typing construct or a plain type
        data: the deserialized (json-compatible) value

    Returns:
        'data' converted to an instance of data_class_type
    """
    if dataclasses.is_dataclass(data_class_type):
        fieldtypes = get_type_hints(data_class_type)
        return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})

    origin = getattr(data_class_type, '__origin__', None)
    # unparameterized generics (eg, a bare typing.List) don't necessarily have __args__
    args = getattr(data_class_type, '__args__', ())

    if origin is Union:
        if data is None:
            return None
        non_none_args = [arg for arg in args if arg is not type(None)]
        if len(non_none_args) == 1:
            # Optional[X]
            return md_from_dict(non_none_args[0], data)
        # Union of several types: we can't tell which one to instantiate, so pass the data through
        return data
    if origin is list and len(args) == 1:
        return [md_from_dict(args[0], elem) for elem in data]
    if origin is dict and len(args) == 2:
        key_type, val_type = args
        # json object keys are always strings: convert them back to the declared key type where that's possible
        convert_key = key_type if isinstance(key_type, type) else (lambda key: key)
        return {convert_key(key): md_from_dict(val_type, val) for key, val in data.items()}
    if origin is tuple and len(args) > 0:
        if len(args) == 2 and args[1] is Ellipsis:
            # Tuple[X, ...]: homogeneous and of arbitrary length
            return tuple(md_from_dict(args[0], elem) for elem in data)
        return tuple(md_from_dict(arg_type, elem) for arg_type, elem in zip(args, data))
    return data
37
+
38
+
39
+ # structure of the stored metadata:
40
+ # - each schema entity that grows somehow proportionally to the data (# of output_rows, total insert operations,
41
+ # number of schema changes) gets its own table
42
+ # - each table has an 'md' column that basically contains the payload
43
+ # - exceptions to that are foreign keys without which lookups would be too slow (ex.: TableSchemaVersions.tbl_id)
44
+ # - the md column contains a dataclass serialized to json; this has the advantage of making changes to the metadata
45
+ # schema easier (the goal is not to have to rely on some schema migration framework; if that breaks for some user,
46
+ # it would be very difficult to patch up)
47
+
@dataclasses.dataclass
class SystemInfoMd:
    """Payload of the single systeminfo record."""
    # version of the metadata schema
    schema_version: int
51
+
52
+
class SystemInfo(Base):
    """Single-row table holding the system-wide metadata."""
    __tablename__ = 'systeminfo'

    # primary key column; defaults to 0
    dummy = sql.Column(Integer, primary_key=True, default=0, nullable=False)
    md = sql.Column(JSONB, nullable=False)  # SystemInfoMd
58
+
59
+
@dataclasses.dataclass
class DirMd:
    """Metadata of a directory."""
    name: str
63
+
64
+
class Dir(Base):
    """Directories; parent_id references another row of this table and can be null."""
    __tablename__ = 'dirs'

    id = sql.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False)
    parent_id = sql.Column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
    md = sql.Column(JSONB, nullable=False)  # presumably DirMd -- TODO confirm
71
+
72
+
@dataclasses.dataclass
class ColumnHistory:
    """
    Tracks the schema versions at which a column was added and dropped. This is what allows unreachable storage
    columns to be garbage-collected: a column added after table snapshot n and dropped before table snapshot n+1
    can be removed from the stored table.
    There is one record per column, across all schema versions.
    """
    col_id: int
    schema_version_add: int
    schema_version_drop: Optional[int]
84
+
85
+
@dataclasses.dataclass
class ViewMd:
    """View-specific metadata (see TableMd.view_md)."""
    is_snapshot: bool

    # one (table id, version) pair per base; the versions of a mutable view are all None
    base_versions: List[Tuple[str, Optional[int]]]

    # view-only: the filter predicate that is applied to the base table
    predicate: Optional[Dict[str, Any]]

    # component views only: the ComponentIterator subclass
    iterator_class_fqn: Optional[str]

    # component views only: the arguments for the iterator class constructor
    iterator_args: Optional[Dict[str, Any]]
101
+
102
+
@dataclasses.dataclass
class TableMd:
    """Metadata of a table or view (payload of Table.md)."""
    name: str

    # incremented for both data and schema changes; starts at 0 and only ever increases w/in a Table
    current_version: int
    # the schema version that corresponds to current_version (current_version >= current_schema_version)
    current_schema_version: int

    # the source of Column.id values
    next_col_id: int

    # - the source of the values for the rowid column in the storage table
    # - a row receives its rowid on insertion; rowids are unique and immutable
    next_row_id: int

    column_history: Dict[int, ColumnHistory]  # col_id -> ColumnHistory

    view_md: Optional[ViewMd]
122
+
123
+
class Table(Base):
    """
    Storage for both tables and views.

    A view is essentially a specialized table, given that it stores materialized columns as well. What sets views
    apart:
    - they have a base, which is either a (live) table or a snapshot
    - they can have a filter predicate
    """
    __tablename__ = 'tables'

    MAX_VERSION = 9223372036854775807  # 2^63 - 1

    id = sql.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False)
    dir_id = sql.Column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=False)
    md = sql.Column(JSONB, nullable=False)  # TableMd
139
+
140
+
@dataclasses.dataclass
class TableVersionMd:
    """Metadata of a single table version (payload of TableVersion.md)."""
    created_at: float  # time.time()
    version: int
    schema_version: int
146
+
147
+
class TableVersion(Base):
    """One record per (table, version); the two columns form the primary key."""
    __tablename__ = 'tableversions'

    tbl_id = sql.Column(UUID(as_uuid=True), ForeignKey('tables.id'), primary_key=True, nullable=False)
    version = sql.Column(BigInteger, primary_key=True, nullable=False)
    md = sql.Column(JSONB, nullable=False)  # TableVersionMd
153
+
154
+
@dataclasses.dataclass
class SchemaColumn:
    """
    A column of the logical (user-visible) schema of a table.
    Every new schema version records its full set of columns: one record per (column x schema version).
    """
    pos: int
    name: str
    col_type: dict
    is_pk: bool
    value_expr: Optional[dict]
    stored: Optional[bool]
    # if True, creates vector index for this column
    is_indexed: bool
169
+
170
+
@dataclasses.dataclass
class TableSchemaVersionMd:
    """Metadata of a single schema version of a table (payload of TableSchemaVersion.md)."""
    schema_version: int
    preceding_schema_version: Optional[int]
    columns: Dict[int, SchemaColumn]  # col_id -> SchemaColumn
    num_retained_versions: int
    # can be None: the metadata converter for version 10 stores "comment": None as the default
    comment: Optional[str]
178
+
179
+
# versioning: each table schema change results in a new record
class TableSchemaVersion(Base):
    """One record per (table, schema version); the two columns form the primary key."""
    __tablename__ = 'tableschemaversions'

    tbl_id = sql.Column(UUID(as_uuid=True), ForeignKey('tables.id'), primary_key=True, nullable=False)
    schema_version = sql.Column(BigInteger, primary_key=True, nullable=False)
    md = sql.Column(JSONB, nullable=False)  # TableSchemaVersionMd
187
+
188
+
@dataclasses.dataclass
class FunctionMd:
    """Metadata of a stored function (payload of Function.md)."""
    name: str
    py_version: str  # platform.python_version
    class_name: str  # name of the Function subclass
    md: dict  # part of the output of Function.to_store()
195
+
196
+
class Function(Base):
    """
    Storage for user-defined functions that aren't module functions (ie, that aren't available at runtime as a
    symbol in a known module).
    A function without a name is an anonymous function used in the definition of a computed column; a function
    with a name is also assigned to a database and directory.
    The Python version under which a Function was created (and the callable pickled) is stored as well, in order
    to warn against version mismatches.
    """
    __tablename__ = 'functions'

    id = sql.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False)
    dir_id = sql.Column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
    md = sql.Column(JSONB, nullable=False)  # FunctionMd
    binary_obj = sql.Column(LargeBinary, nullable=True)