pixeltable 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (119) hide show
  1. pixeltable/__init__.py +53 -0
  2. pixeltable/__version__.py +3 -0
  3. pixeltable/catalog/__init__.py +13 -0
  4. pixeltable/catalog/catalog.py +159 -0
  5. pixeltable/catalog/column.py +181 -0
  6. pixeltable/catalog/dir.py +32 -0
  7. pixeltable/catalog/globals.py +33 -0
  8. pixeltable/catalog/insertable_table.py +192 -0
  9. pixeltable/catalog/named_function.py +36 -0
  10. pixeltable/catalog/path.py +58 -0
  11. pixeltable/catalog/path_dict.py +139 -0
  12. pixeltable/catalog/schema_object.py +39 -0
  13. pixeltable/catalog/table.py +695 -0
  14. pixeltable/catalog/table_version.py +1026 -0
  15. pixeltable/catalog/table_version_path.py +133 -0
  16. pixeltable/catalog/view.py +203 -0
  17. pixeltable/dataframe.py +749 -0
  18. pixeltable/env.py +466 -0
  19. pixeltable/exceptions.py +17 -0
  20. pixeltable/exec/__init__.py +10 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +116 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +94 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +73 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +226 -0
  31. pixeltable/exprs/__init__.py +25 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +114 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +199 -0
  39. pixeltable/exprs/expr.py +594 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +382 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +96 -0
  44. pixeltable/exprs/in_predicate.py +96 -0
  45. pixeltable/exprs/inline_array.py +109 -0
  46. pixeltable/exprs/inline_dict.py +103 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +66 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +329 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/similarity_expr.py +65 -0
  56. pixeltable/exprs/type_cast.py +53 -0
  57. pixeltable/exprs/variable.py +45 -0
  58. pixeltable/ext/__init__.py +5 -0
  59. pixeltable/ext/functions/yolox.py +92 -0
  60. pixeltable/func/__init__.py +7 -0
  61. pixeltable/func/aggregate_function.py +197 -0
  62. pixeltable/func/callable_function.py +113 -0
  63. pixeltable/func/expr_template_function.py +99 -0
  64. pixeltable/func/function.py +141 -0
  65. pixeltable/func/function_registry.py +227 -0
  66. pixeltable/func/globals.py +46 -0
  67. pixeltable/func/nos_function.py +202 -0
  68. pixeltable/func/signature.py +162 -0
  69. pixeltable/func/udf.py +164 -0
  70. pixeltable/functions/__init__.py +95 -0
  71. pixeltable/functions/eval.py +215 -0
  72. pixeltable/functions/fireworks.py +34 -0
  73. pixeltable/functions/huggingface.py +167 -0
  74. pixeltable/functions/image.py +16 -0
  75. pixeltable/functions/openai.py +289 -0
  76. pixeltable/functions/pil/image.py +147 -0
  77. pixeltable/functions/string.py +13 -0
  78. pixeltable/functions/together.py +143 -0
  79. pixeltable/functions/util.py +52 -0
  80. pixeltable/functions/video.py +62 -0
  81. pixeltable/globals.py +425 -0
  82. pixeltable/index/__init__.py +2 -0
  83. pixeltable/index/base.py +51 -0
  84. pixeltable/index/embedding_index.py +168 -0
  85. pixeltable/io/__init__.py +3 -0
  86. pixeltable/io/hf_datasets.py +188 -0
  87. pixeltable/io/pandas.py +148 -0
  88. pixeltable/io/parquet.py +192 -0
  89. pixeltable/iterators/__init__.py +3 -0
  90. pixeltable/iterators/base.py +52 -0
  91. pixeltable/iterators/document.py +432 -0
  92. pixeltable/iterators/video.py +88 -0
  93. pixeltable/metadata/__init__.py +58 -0
  94. pixeltable/metadata/converters/convert_10.py +18 -0
  95. pixeltable/metadata/converters/convert_12.py +3 -0
  96. pixeltable/metadata/converters/convert_13.py +41 -0
  97. pixeltable/metadata/schema.py +234 -0
  98. pixeltable/plan.py +620 -0
  99. pixeltable/store.py +424 -0
  100. pixeltable/tool/create_test_db_dump.py +184 -0
  101. pixeltable/tool/create_test_video.py +81 -0
  102. pixeltable/type_system.py +846 -0
  103. pixeltable/utils/__init__.py +17 -0
  104. pixeltable/utils/arrow.py +98 -0
  105. pixeltable/utils/clip.py +18 -0
  106. pixeltable/utils/coco.py +136 -0
  107. pixeltable/utils/documents.py +69 -0
  108. pixeltable/utils/filecache.py +195 -0
  109. pixeltable/utils/help.py +11 -0
  110. pixeltable/utils/http_server.py +70 -0
  111. pixeltable/utils/media_store.py +76 -0
  112. pixeltable/utils/pytorch.py +91 -0
  113. pixeltable/utils/s3.py +13 -0
  114. pixeltable/utils/sql.py +17 -0
  115. pixeltable/utils/transactional_directory.py +35 -0
  116. pixeltable-0.0.0.dist-info/LICENSE +18 -0
  117. pixeltable-0.0.0.dist-info/METADATA +131 -0
  118. pixeltable-0.0.0.dist-info/RECORD +119 -0
  119. pixeltable-0.0.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,432 @@
1
+ import dataclasses
2
+ import enum
3
+ import logging
4
+ from typing import Dict, Any, List, Tuple, Optional, Iterable, Iterator
5
+
6
+ import ftfy
7
+
8
+ from pixeltable.env import Env
9
+ from pixeltable.exceptions import Error
10
+ from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
11
+ from pixeltable.utils.documents import get_document_handle
12
+ from .base import ComponentIterator
13
+
14
+ _logger = logging.getLogger('pixeltable')
15
+
16
class ChunkMetadata(enum.Enum):
    """Optional per-chunk metadata fields a DocumentSplitter can emit.

    Values are assigned sequentially starting at 1 (enum.auto()), matching the
    original explicit numbering.
    """

    TITLE = enum.auto()
    HEADING = enum.auto()
    SOURCELINE = enum.auto()
    PAGE = enum.auto()
    BOUNDING_BOX = enum.auto()
22
+
23
class Separator(enum.Enum):
    """Splitting criteria a DocumentSplitter can use to cut a document into chunks.

    Values are assigned sequentially starting at 1 (enum.auto()), matching the
    original explicit numbering.
    """

    HEADING = enum.auto()
    PARAGRAPH = enum.auto()
    SENTENCE = enum.auto()
    TOKEN_LIMIT = enum.auto()
    CHAR_LIMIT = enum.auto()
    PAGE = enum.auto()
30
+
31
@dataclasses.dataclass
class DocumentSectionMetadata:
    """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph).

    Which fields are populated depends on the source format: HTML/Markdown sections
    fill in sourceline/heading, PDF sections fill in page/bounding_box.
    """
    # html and markdown metadata
    # source line of the element that produced this section (0 for markdown, where no line info is tracked)
    sourceline: Optional[int] = None
    # the stack of headings up to the most recently observed one;
    # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
    heading: Optional[Dict[int, str]] = None

    # pdf-specific metadata
    # zero-based page number of the section
    page: Optional[int] = None
    # bounding box as an {x1, y1, x2, y2} dictionary
    bounding_box: Optional[Dict[str, float]] = None
44
+
45
@dataclasses.dataclass
class DocumentSection:
    """A single document chunk, according to some of the splitting criteria.

    Sections with text=None carry only structural information and are skipped by
    DocumentSplitter.__next__().
    """
    text: Optional[str]
    metadata: Optional[DocumentSectionMetadata]
50
+
51
+
52
def _parse_separators(separators: str) -> List[Separator]:
    """Parse a comma-separated separator spec (eg 'heading, token_limit') into Separator values.

    Empty entries are ignored; an unknown name raises Error.
    """
    parsed: List[Separator] = []
    for token in separators.split(','):
        name = token.strip().upper()
        if not name:
            # tolerate empty entries, eg trailing commas
            continue
        if name not in Separator.__members__:
            raise Error(
                f'Invalid separator: `{token.strip()}`. Valid separators are: {", ".join(Separator.__members__).lower()}'
            )
        parsed.append(Separator[name])
    return parsed
64
+
65
+
66
def _parse_metadata(metadata: str) -> List[ChunkMetadata]:
    """Parse a comma-separated metadata spec (eg 'title,heading') into ChunkMetadata values.

    Empty entries are ignored; an unknown name raises Error.
    """
    parsed: List[ChunkMetadata] = []
    for token in metadata.split(','):
        name = token.strip().upper()
        if not name:
            # tolerate empty entries, eg trailing commas or an empty spec
            continue
        if name not in ChunkMetadata.__members__:
            raise Error(
                f'Invalid metadata: `{token.strip()}`. Valid metadata are: {", ".join(ChunkMetadata.__members__).lower()}'
            )
        parsed.append(ChunkMetadata[name])
    return parsed
78
+
79
+
80
# HTML heading tag names; used to detect heading boundaries and to maintain the heading stack
_HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
81
+
82
class DocumentSplitter(ComponentIterator):
    """Iterator over pieces of a document. The document is split into chunks based on the specified separators.

    The iterator output tuples are of schema {'text': StringType()}, but can include additional metadata fields
    if specified in the `metadata` argument as explained below.
    All chunk text is passed through `ftfy.fix_text` to fix up common problems with unicode sequences.

    Args:
        `metadata`: which additional metadata fields to include in the output schema:
            'title', 'heading' (HTML and Markdown), 'sourceline' (HTML), 'page' (PDF), 'bounding_box' (PDF).
            The input can be a comma-separated string of these values eg. 'title,heading,sourceline'.
        `separators`: which separators to use to split the document into rows. Options are:
            'heading', 'paragraph', 'sentence', 'token_limit', 'char_limit', 'page'. As with metadata, this can be a
            comma-separated string eg. 'heading, token_limit'.
        `limit`: the maximum number of tokens or characters in each chunk if 'token_limit' or 'char_limit' is specified.
    """

    # output column type for each optional metadata field; all nullable, since not every
    # document format produces every field
    METADATA_COLUMN_TYPES = {
        ChunkMetadata.TITLE: StringType(nullable=True),
        ChunkMetadata.HEADING: JsonType(nullable=True),
        ChunkMetadata.SOURCELINE: IntType(nullable=True),
        ChunkMetadata.PAGE: IntType(nullable=True),
        ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
    }

    def __init__(
            self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None,
            metadata: str = '', html_skip_tags: Optional[List[str]] = None,
            tiktoken_encoding: Optional[str] = 'cl100k_base', tiktoken_target_model: Optional[str] = None
    ):
        if html_skip_tags is None:
            # None sentinel instead of a mutable default; skip navigation elements by default
            html_skip_tags = ['nav']
        self._doc_handle = get_document_handle(document)
        assert self._doc_handle is not None
        # calling the output_schema method to validate the input arguments
        self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
        self._separators = _parse_separators(separators)
        self._metadata_fields = _parse_metadata(metadata)
        # document title for the TITLE metadata field; only available for HTML (bs_doc) documents
        if self._doc_handle.bs_doc is not None:
            title = self._doc_handle.bs_doc.title
            if title is None:
                self._doc_title = ''
            else:
                self._doc_title = ftfy.fix_text(title.get_text().strip())
        else:
            self._doc_title = ''
        self._limit = 0 if limit is None else limit
        self._skip_tags = html_skip_tags
        self._overlap = 0 if overlap is None else overlap
        self._tiktoken_encoding = tiktoken_encoding
        self._tiktoken_target_model = tiktoken_target_model

        # set up processing pipeline: a format-specific base generator of DocumentSections ...
        if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
            assert self._doc_handle.bs_doc is not None
            self._sections = self._html_sections()
        elif self._doc_handle.format == DocumentType.DocumentFormat.MD:
            assert self._doc_handle.md_ast is not None
            self._sections = self._markdown_sections()
        elif self._doc_handle.format == DocumentType.DocumentFormat.PDF:
            assert self._doc_handle.pdf_doc is not None
            self._sections = self._pdf_sections()
        else:
            assert False, f'unknown document format: {self._doc_handle.format}'

        # ... optionally wrapped in re-chunking stages (each consumes the previous generator)
        if Separator.SENTENCE in self._separators:
            self._sections = self._sentence_sections(self._sections)
        if Separator.TOKEN_LIMIT in self._separators:
            self._sections = self._token_chunks(self._sections)
        if Separator.CHAR_LIMIT in self._separators:
            self._sections = self._char_chunks(self._sections)

    @classmethod
    def input_schema(cls) -> Dict[str, ColumnType]:
        """Declare the column types of the iterator's constructor arguments."""
        return {
            'document': DocumentType(nullable=False),
            'separators': StringType(nullable=False),
            'metadata': StringType(nullable=True),
            'limit': IntType(nullable=True),
            'overlap': IntType(nullable=True),
            'skip_tags': StringType(nullable=True),
            'tiktoken_encoding': StringType(nullable=True),
            'tiktoken_target_model': StringType(nullable=True),
        }

    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
        """Validate the constructor kwargs and compute the output schema.

        Returns:
            a tuple of (column name -> type dict, list of unstored column names; empty here)
        Raises:
            Error: on invalid separator/metadata specs or inconsistent limit/overlap arguments
        """
        schema = {'text': StringType()}
        md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []

        # one extra output column per requested metadata field, named after the field
        for md_field in md_fields:
            schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]

        assert 'separators' in kwargs
        separators = _parse_separators(kwargs['separators'])

        limit = kwargs.get('limit')
        overlap = kwargs.get('overlap')

        # limit/overlap only make sense together with a size-bounded separator
        if limit is not None or overlap is not None:
            if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
                raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
            if limit is not None and limit <= 0:
                raise Error('"limit" must be an integer > 0')
            if overlap is not None and overlap < 0:
                raise Error('"overlap" must be an integer >= 0')
        if Separator.TOKEN_LIMIT in separators or Separator.CHAR_LIMIT in separators:
            if Separator.TOKEN_LIMIT in separators and Separator.CHAR_LIMIT in separators:
                raise Error('Cannot specify both "token_limit" and "char_limit" separators')
            if kwargs.get('limit') is None:
                raise Error('limit is required with "token_limit"/"char_limit" separators')

        # check dependencies at the end
        if Separator.SENTENCE in separators:
            Env.get().require_package('spacy')
        if Separator.TOKEN_LIMIT in separators:
            Env.get().require_package('tiktoken')

        return schema, []

    def __next__(self) -> Dict[str, Any]:
        """Return the next chunk as a dict matching output_schema(); text-less sections are skipped."""
        while True:
            # StopIteration from the underlying pipeline propagates to end this iterator
            section = next(self._sections)
            if section.text is None:
                continue
            result = {'text': section.text}
            # NOTE(review): assumes section.metadata is non-None whenever metadata fields were
            # requested; all pipeline stages below do set it — confirm if new stages are added
            for md_field in self._metadata_fields:
                if md_field == ChunkMetadata.TITLE:
                    result[md_field.name.lower()] = self._doc_title
                elif md_field == ChunkMetadata.HEADING:
                    result[md_field.name.lower()] = section.metadata.heading
                elif md_field == ChunkMetadata.SOURCELINE:
                    result[md_field.name.lower()] = section.metadata.sourceline
                elif md_field == ChunkMetadata.PAGE:
                    result[md_field.name.lower()] = section.metadata.page
                elif md_field == ChunkMetadata.BOUNDING_BOX:
                    result[md_field.name.lower()] = section.metadata.bounding_box
            return result

    def _html_sections(self) -> Iterator[DocumentSection]:
        """Create DocumentSections reflecting the html-specific separators"""
        import bs4
        # SENTENCE splitting implies paragraph-level sections; PARAGRAPH implies heading-level sections
        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
        emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
        # current state
        accumulated_text = []  # currently accumulated text
        # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation

        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
        sourceline = 0  # most recently seen sourceline

        def update_metadata(el: bs4.Tag) -> None:
            # update current state
            nonlocal headings, sourceline
            sourceline = el.sourceline
            if el.name in _HTML_HEADINGS:
                level = int(el.name[1])
                # remove the previously seen lower levels
                lower_levels = [l for l in headings if l > level]
                for l in lower_levels:
                    del headings[l]
                headings[level] = el.get_text().strip()

        # generator: yields at most one section containing the accumulated text, then resets it
        def emit() -> Iterator[DocumentSection]:
            nonlocal accumulated_text, headings, sourceline
            if len(accumulated_text) > 0:
                md = DocumentSectionMetadata(sourceline=sourceline, heading=headings.copy())
                full_text = ' '.join(accumulated_text)
                full_text = ftfy.fix_text(full_text)
                yield DocumentSection(text=full_text, metadata=md)
                accumulated_text = []

        def process_element(el: bs4.PageElement) -> Iterator[DocumentSection]:
            # process the element and emit sections as necessary
            nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
            if el.name in self._skip_tags:
                return

            if isinstance(el, bs4.NavigableString):
                # accumulate text until we see a tag we care about
                text = el.get_text().strip()
                if len(text) > 0:
                    accumulated_text.append(text)
                return

            if el.name in _HTML_HEADINGS:
                # headings end the previous section (if we split on headings) and update the stack
                if emit_on_heading:
                    yield from emit()
                update_metadata(el)
            elif el.name == 'p':
                if emit_on_paragraph:
                    yield from emit()
                update_metadata(el)
            # recurse into children (depth-first, document order)
            for child in el.children:
                yield from process_element(child)

        yield from process_element(self._doc_handle.bs_doc)
        # flush whatever text remains after the last separator
        yield from emit()

    def _markdown_sections(self) -> Iterator[DocumentSection]:
        """Create DocumentSections reflecting the markdown-specific separators"""
        assert self._doc_handle.md_ast is not None
        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
        emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
        # current state
        accumulated_text = []  # currently accumulated text
        # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)

        def update_headings(heading: Dict) -> None:
            # update current state
            nonlocal headings
            assert 'type' in heading and heading['type'] == 'heading'
            level = heading['attrs']['level']
            # NOTE(review): assumes the heading node's first child is a raw-text node — confirm
            # against the markdown parser's AST shape
            text = heading['children'][0]['raw'].strip()
            # remove the previously seen lower levels
            lower_levels = [l for l in headings.keys() if l > level]
            for l in lower_levels:
                del headings[l]
            headings[level] = text

        # generator: yields at most one section containing the accumulated text, then resets it
        def emit() -> Iterator[DocumentSection]:
            nonlocal accumulated_text, headings
            if len(accumulated_text) > 0:
                # markdown AST carries no line numbers, hence sourceline=0
                metadata = DocumentSectionMetadata(sourceline=0, heading=headings.copy())
                yield DocumentSection(text=ftfy.fix_text(' '.join(accumulated_text)), metadata=metadata)
                accumulated_text = []

        def process_element(el: Dict) -> Iterator[DocumentSection]:
            # process the element and emit sections as necessary
            nonlocal accumulated_text, headings, emit_on_heading, emit_on_paragraph
            assert 'type' in el

            if el['type'] == 'text':
                # accumulate text until we see a separator element
                text = el['raw'].strip()
                if len(text) > 0:
                    accumulated_text.append(text)
                return

            if el['type'] == 'heading':
                if emit_on_heading:
                    yield from emit()
                update_headings(el)
            elif el['type'] == 'paragraph':
                if emit_on_paragraph:
                    yield from emit()
            if 'children' not in el:
                return
            # recurse into children (depth-first, document order)
            for child in el['children']:
                yield from process_element(child)

        for el in self._doc_handle.md_ast:
            yield from process_element(el)
        # flush whatever text remains after the last separator
        yield from emit()

    def _pdf_sections(self) -> Iterator[DocumentSection]:
        """Create DocumentSections reflecting the pdf-specific separators"""
        import fitz
        doc: fitz.Document = self._doc_handle.pdf_doc
        assert doc is not None

        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
        emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph

        accumulated_text = []  # invariant: all elements are ftfy clean and non-empty

        def _add_cleaned_text(raw_text: str) -> None:
            # normalize unicode before accumulating; drop empty results
            fixed = ftfy.fix_text(raw_text)
            if fixed:
                accumulated_text.append(fixed)

        def _emit_text() -> str:
            # join and reset the accumulator
            full_text = ''.join(accumulated_text)
            accumulated_text.clear()
            return full_text

        for page_number, page in enumerate(doc.pages()):
            for block in page.get_text('blocks'):
                # there is no concept of paragraph in pdf, block is the closest thing
                # we can get (eg a paragraph in text may cut across pages)
                # see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
                # other libraries like pdfminer also lack an explicit paragraph concept
                x1, y1, x2, y2, text, _, _ = block
                _add_cleaned_text(text)
                if accumulated_text and emit_on_paragraph:
                    bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
                    metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
                    yield DocumentSection(text=_emit_text(), metadata=metadata)

            # page-level emit only when not already emitting per-block
            if accumulated_text and emit_on_page and not emit_on_paragraph:
                yield DocumentSection(text=_emit_text(),
                                      metadata=DocumentSectionMetadata(page=page_number))
                accumulated_text = []

        # flush any remaining text when no page/paragraph splitting was requested
        if accumulated_text and not emit_on_page:
            yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())

    def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
        """Split the input sections into sentences"""
        for section in input_sections:
            if section.text is not None:
                # spaCy sentence segmentation; each sentence inherits the section's metadata
                doc = Env.get().spacy_nlp(section.text)
                for sent in doc.sents:
                    yield DocumentSection(text=sent.text, metadata=section.metadata)

    def _token_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
        """Re-chunk input sections into chunks of at most self._limit tokens, with self._overlap overlap."""
        import tiktoken
        # model-specific encoding takes precedence over the explicit encoding name
        if self._tiktoken_target_model is not None:
            encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
        else:
            encoding = tiktoken.get_encoding(self._tiktoken_encoding)
        assert self._limit > 0 and self._overlap >= 0

        for section in input:
            if section.text is None:
                continue
            tokens = encoding.encode(section.text)
            start_idx = 0
            text = None
            while start_idx < len(tokens):
                end_idx = min(start_idx + self._limit, len(tokens))
                while end_idx > start_idx:
                    # find a cutoff point that doesn't cut in the middle of utf8 multi-byte sequences
                    try:
                        # check that the truncated data can be properly decoded
                        text = encoding.decode(tokens[start_idx:end_idx], errors='strict')
                        break
                    except UnicodeDecodeError:
                        # we split the token array at a point where the utf8 encoding is broken
                        end_idx -= 1

                assert end_idx > start_idx
                assert text
                yield DocumentSection(text=text, metadata=section.metadata)
                start_idx = max(start_idx + 1, end_idx - self._overlap)  # ensure we make progress

    def _char_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
        """Re-chunk input sections into chunks of at most self._limit characters, with self._overlap overlap."""
        # NOTE(review): if overlap >= limit, start_idx never advances (infinite loop);
        # output_schema() only validates limit > 0 and overlap >= 0 — confirm overlap < limit is enforced upstream
        for section in input:
            if section.text is None:
                continue
            start_idx = 0
            while start_idx < len(section.text):
                end_idx = min(start_idx + self._limit, len(section.text))
                text = section.text[start_idx:end_idx]
                yield DocumentSection(text=text, metadata=section.metadata)
                start_idx += self._limit - self._overlap

    def close(self) -> None:
        # nothing to release: all resources are owned by the document handle
        pass

    def set_pos(self, pos: int) -> None:
        # random access into the chunk stream is not supported
        pass
@@ -0,0 +1,88 @@
1
+ import logging
2
+ import math
3
+ from pathlib import Path
4
+ from typing import Dict, Any, List, Tuple
5
+
6
+ import PIL.Image
7
+ import cv2
8
+
9
+ from pixeltable import exprs
10
+ from pixeltable.exceptions import Error
11
+ from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
12
+ from .base import ComponentIterator
13
+
14
+ _logger = logging.getLogger('pixeltable')
15
+
16
class FrameIterator(ComponentIterator):
    """Iterator over the frames of a video file, optionally subsampled to a requested frame rate.

    Frames are decoded with OpenCV, converted BGR -> RGB, and returned as PIL images.
    """
    def __init__(self, video: str, *, fps: float = 0.0):
        """
        Args:
            video: path to an existing local video file
            fps: number of frames to extract per second; 0.0 means every frame
        Raises:
            Error: if the video can't be opened, the frame count is unavailable,
                or fps exceeds the video's native frame rate
        """
        video_path = Path(video)
        assert video_path.exists() and video_path.is_file()
        self.video_path = video_path
        self.fps = fps
        self.video_reader = cv2.VideoCapture(str(video_path))
        if not self.video_reader.isOpened():
            raise Error(f'Failed to open video: {video}')
        # NOTE(review): int() truncates fractional native rates (eg 29.97 -> 29) — confirm intended;
        # also, a reported rate of 0 with fps > 0 would make frame_freq 0 and break the modulo below
        video_fps = int(self.video_reader.get(cv2.CAP_PROP_FPS))
        if fps > video_fps:
            raise Error(f'Video {video}: requested fps ({fps}) exceeds that of the video ({video_fps})')
        # emit every frame_freq-th frame; 1 == every frame
        self.frame_freq = int(video_fps / fps) if fps > 0 else 1
        num_video_frames = int(self.video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
        if num_video_frames == 0:
            raise Error(f'Video {video}: failed to get number of frames')
        # ceil: round up to ensure we count frame 0
        self.num_frames = math.ceil(num_video_frames / self.frame_freq) if fps > 0 else num_video_frames
        _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps}')

        # index of the next frame this iterator will emit
        self.next_frame_idx = 0

    @classmethod
    def input_schema(cls) -> Dict[str, ColumnType]:
        """Declare the column types of the iterator's constructor arguments."""
        return {
            'video': VideoType(nullable=False),
            'fps': FloatType()
        }

    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
        """Return the output columns; 'frame' is listed as unstored (recomputable via set_pos)."""
        return {
            'frame_idx': IntType(),
            'pos_msec': FloatType(),
            'pos_frame': FloatType(),
            'frame': ImageType(),
        }, ['frame']

    def __next__(self) -> Dict[str, Any]:
        """Decode frames until the next one matching the sampling frequency; raise StopIteration at EOF."""
        while True:
            # capture position *before* read(): read() advances the reader past the returned frame
            pos_msec = self.video_reader.get(cv2.CAP_PROP_POS_MSEC)
            pos_frame = self.video_reader.get(cv2.CAP_PROP_POS_FRAMES)
            status, img = self.video_reader.read()
            if not status:
                # end of video: release eagerly rather than waiting for close()
                _logger.debug(f'releasing video reader for {self.video_path}')
                self.video_reader.release()
                self.video_reader = None
                raise StopIteration
            if pos_frame % self.frame_freq == 0:
                # OpenCV decodes BGR; convert to RGB for PIL
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                result = {
                    'frame_idx': self.next_frame_idx,
                    'pos_msec': pos_msec,
                    'pos_frame': pos_frame,
                    'frame': PIL.Image.fromarray(img),
                }
                self.next_frame_idx += 1
                # frame_freq > 1: jumping to the target frame here with video_reader.set() is far slower than just
                # skipping the unwanted frames
                return result

    def close(self) -> None:
        """Release the underlying OpenCV reader; idempotent."""
        if self.video_reader is not None:
            self.video_reader.release()
            self.video_reader = None

    def set_pos(self, pos: int) -> None:
        """Seek to frame idx"""
        if pos == self.next_frame_idx:
            return
        # NOTE(review): assumes the reader is still open; calling this after exhaustion/close()
        # would fail on a None reader — confirm callers never do that
        _logger.debug(f'seeking to frame {pos}')
        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, pos * self.frame_freq)
        self.next_frame_idx = pos
@@ -0,0 +1,58 @@
1
+ import dataclasses
2
+ import importlib
3
+ import os
4
+ import pkgutil
5
+ from typing import Callable, Dict
6
+
7
+ import sqlalchemy as sql
8
+ import sqlalchemy.orm as orm
9
+
10
+ from .schema import SystemInfo, SystemInfoMd
11
+
12
# current version of the metadata; this is incremented whenever the metadata schema changes
# (each increment requires a registered converter from the previous version; see register_converter())
VERSION = 14
14
+
15
+
16
def create_system_info(engine: sql.engine.Engine) -> None:
    """Create the systemmetadata record"""
    md_dict = dataclasses.asdict(SystemInfoMd(schema_version=VERSION))
    with orm.Session(engine, future=True) as session:
        session.add(SystemInfo(md=md_dict))
        session.flush()
        session.commit()
24
+
25
# conversion functions for upgrading the metadata schema from one version to the following
# key: old schema version
converter_cbs: Dict[int, Callable[[sql.engine.Engine], None]] = {}


def register_converter(version: int, cb: Callable[[sql.engine.Engine], None]) -> None:
    """Record `cb` as the upgrade step from schema `version` to `version + 1`."""
    # note: mutating (not rebinding) the module-level dict, so no `global` needed
    converter_cbs[version] = cb


def noop_converter(engine: sql.engine.Engine) -> None:
    """Converter to use when incrementing the schema version, but without any functional changes."""
36
+
37
# load all converter modules; importing each one causes it to register itself via register_converter()
for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/converters']):
    importlib.import_module('pixeltable.metadata.converters.' + modname)
40
+
41
def upgrade_md(engine: sql.engine.Engine) -> None:
    """Upgrade the metadata schema to the current version.

    Applies the registered converters one version step at a time, then records
    VERSION in the SystemInfo record.

    Raises:
        RuntimeError: if no converter is registered for an intermediate version
    """
    with orm.Session(engine) as session:
        system_info = session.query(SystemInfo).one().md
        md_version = system_info['schema_version']
        if md_version == VERSION:
            # already current; nothing to do
            return
        # apply converters sequentially: each one upgrades exactly one version step
        while md_version < VERSION:
            if md_version not in converter_cbs:
                raise RuntimeError(f'No metadata converter for version {md_version}')
            print(f'Converting metadata from version {md_version} to {md_version + 1}')
            converter_cbs[md_version](engine)
            md_version += 1
        # update system info
        conn = session.connection()
        system_info_md = SystemInfoMd(schema_version=VERSION)
        conn.execute(SystemInfo.__table__.update().values(md=dataclasses.asdict(system_info_md)))
        session.commit()
@@ -0,0 +1,18 @@
1
+ import sqlalchemy as sql
2
+
3
+ from pixeltable.metadata.schema import Table, TableSchemaVersion
4
+ from pixeltable.metadata import register_converter
5
+
6
+
7
def convert_10(engine: sql.engine.Engine) -> None:
    """Upgrade metadata schema v10 -> v11: drop Table.md['parameters'], add default table attrs."""
    default_table_attrs = {"comment": None, "num_retained_versions": 10}
    with engine.begin() as conn:
        # Because `parameters` wasn't actually used for anything,
        # we can simply delete it without any data loss.
        conn.execute(sql.update(Table).values(md=Table.md - 'parameters'))
        # Add `table_attrs` to all instances of tableschemaversions.md.
        conn.execute(sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat(default_table_attrs)))
    return


register_converter(10, convert_10)
@@ -0,0 +1,3 @@
1
+ from pixeltable.metadata import register_converter, noop_converter
2
+
3
+ register_converter(12, noop_converter)
@@ -0,0 +1,41 @@
1
+ import logging
2
+ from typing import Any
3
+
4
+ import sqlalchemy as sql
5
+
6
+ from pixeltable.metadata import register_converter
7
+ from pixeltable.metadata.schema import Table
8
+
9
+ _logger = logging.getLogger('pixeltable')
10
+
11
+
12
def convert_13(engine: sql.engine.Engine) -> None:
    """Upgrade metadata schema v13 -> v14: rewrite serialized function classpaths (see _update_md)."""
    with engine.begin() as conn:
        for row in conn.execute(sql.select(Table)):
            # row layout: (id, ..., md); only rewritten rows are written back
            id = row[0]
            md = row[2]
            updated_md = _update_md(md)
            if updated_md != md:
                _logger.info(f'Updating schema for table: {id}')
                conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_md))
21
+
22
+
23
+ # Traverse the schema dictionary and replace instances of `ExplicitBatchedFunction` with
24
+ # `CallableFunction`. DB versions prior to 14 can't contain serialized batched functions,
25
+ # so this is all we need to do.
26
+ def _update_md(md: Any) -> Any:
27
+ if isinstance(md, dict):
28
+ updated_md = {}
29
+ for k, v in md.items():
30
+ if k == '_classpath' and v == 'pixeltable.func.batched_function.ExplicitBatchedFunction':
31
+ updated_md[k] = 'pixeltable.func.callable_function.CallableFunction'
32
+ else:
33
+ updated_md[k] = _update_md(v)
34
+ return updated_md
35
+ elif isinstance(md, list):
36
+ return [_update_md(v) for v in md]
37
+ else:
38
+ return md
39
+
40
+
41
+ register_converter(13, convert_13)