pixeltable 0.1.0__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (147) hide show
  1. pixeltable/__init__.py +34 -6
  2. pixeltable/catalog/__init__.py +13 -0
  3. pixeltable/catalog/catalog.py +159 -0
  4. pixeltable/catalog/column.py +200 -0
  5. pixeltable/catalog/dir.py +32 -0
  6. pixeltable/catalog/globals.py +33 -0
  7. pixeltable/catalog/insertable_table.py +191 -0
  8. pixeltable/catalog/named_function.py +36 -0
  9. pixeltable/catalog/path.py +58 -0
  10. pixeltable/catalog/path_dict.py +139 -0
  11. pixeltable/catalog/schema_object.py +39 -0
  12. pixeltable/catalog/table.py +581 -0
  13. pixeltable/catalog/table_version.py +749 -0
  14. pixeltable/catalog/table_version_path.py +133 -0
  15. pixeltable/catalog/view.py +203 -0
  16. pixeltable/client.py +590 -30
  17. pixeltable/dataframe.py +540 -349
  18. pixeltable/env.py +359 -45
  19. pixeltable/exceptions.py +12 -21
  20. pixeltable/exec/__init__.py +9 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +116 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +95 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +69 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +225 -0
  31. pixeltable/exprs/__init__.py +24 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +105 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +195 -0
  39. pixeltable/exprs/expr.py +586 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +380 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +115 -0
  44. pixeltable/exprs/image_similarity_predicate.py +58 -0
  45. pixeltable/exprs/inline_array.py +107 -0
  46. pixeltable/exprs/inline_dict.py +101 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +54 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +355 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/type_cast.py +53 -0
  56. pixeltable/exprs/variable.py +45 -0
  57. pixeltable/func/__init__.py +9 -0
  58. pixeltable/func/aggregate_function.py +194 -0
  59. pixeltable/func/batched_function.py +53 -0
  60. pixeltable/func/callable_function.py +69 -0
  61. pixeltable/func/expr_template_function.py +82 -0
  62. pixeltable/func/function.py +110 -0
  63. pixeltable/func/function_registry.py +227 -0
  64. pixeltable/func/globals.py +36 -0
  65. pixeltable/func/nos_function.py +202 -0
  66. pixeltable/func/signature.py +166 -0
  67. pixeltable/func/udf.py +163 -0
  68. pixeltable/functions/__init__.py +52 -103
  69. pixeltable/functions/eval.py +216 -0
  70. pixeltable/functions/fireworks.py +34 -0
  71. pixeltable/functions/huggingface.py +120 -0
  72. pixeltable/functions/image.py +16 -0
  73. pixeltable/functions/openai.py +256 -0
  74. pixeltable/functions/pil/image.py +148 -7
  75. pixeltable/functions/string.py +13 -0
  76. pixeltable/functions/together.py +122 -0
  77. pixeltable/functions/util.py +41 -0
  78. pixeltable/functions/video.py +62 -0
  79. pixeltable/iterators/__init__.py +3 -0
  80. pixeltable/iterators/base.py +48 -0
  81. pixeltable/iterators/document.py +311 -0
  82. pixeltable/iterators/video.py +89 -0
  83. pixeltable/metadata/__init__.py +54 -0
  84. pixeltable/metadata/converters/convert_10.py +18 -0
  85. pixeltable/metadata/schema.py +211 -0
  86. pixeltable/plan.py +656 -0
  87. pixeltable/store.py +418 -182
  88. pixeltable/tests/conftest.py +146 -88
  89. pixeltable/tests/functions/test_fireworks.py +42 -0
  90. pixeltable/tests/functions/test_functions.py +60 -0
  91. pixeltable/tests/functions/test_huggingface.py +158 -0
  92. pixeltable/tests/functions/test_openai.py +152 -0
  93. pixeltable/tests/functions/test_together.py +111 -0
  94. pixeltable/tests/test_audio.py +65 -0
  95. pixeltable/tests/test_catalog.py +27 -0
  96. pixeltable/tests/test_client.py +14 -14
  97. pixeltable/tests/test_component_view.py +370 -0
  98. pixeltable/tests/test_dataframe.py +439 -0
  99. pixeltable/tests/test_dirs.py +78 -62
  100. pixeltable/tests/test_document.py +120 -0
  101. pixeltable/tests/test_exprs.py +592 -135
  102. pixeltable/tests/test_function.py +297 -67
  103. pixeltable/tests/test_migration.py +43 -0
  104. pixeltable/tests/test_nos.py +54 -0
  105. pixeltable/tests/test_snapshot.py +208 -0
  106. pixeltable/tests/test_table.py +1195 -263
  107. pixeltable/tests/test_transactional_directory.py +42 -0
  108. pixeltable/tests/test_types.py +5 -11
  109. pixeltable/tests/test_video.py +151 -34
  110. pixeltable/tests/test_view.py +530 -0
  111. pixeltable/tests/utils.py +320 -45
  112. pixeltable/tool/create_test_db_dump.py +149 -0
  113. pixeltable/tool/create_test_video.py +81 -0
  114. pixeltable/type_system.py +445 -124
  115. pixeltable/utils/__init__.py +17 -46
  116. pixeltable/utils/arrow.py +98 -0
  117. pixeltable/utils/clip.py +12 -15
  118. pixeltable/utils/coco.py +136 -0
  119. pixeltable/utils/documents.py +39 -0
  120. pixeltable/utils/filecache.py +195 -0
  121. pixeltable/utils/help.py +11 -0
  122. pixeltable/utils/hf_datasets.py +157 -0
  123. pixeltable/utils/media_store.py +76 -0
  124. pixeltable/utils/parquet.py +167 -0
  125. pixeltable/utils/pytorch.py +91 -0
  126. pixeltable/utils/s3.py +13 -0
  127. pixeltable/utils/sql.py +17 -0
  128. pixeltable/utils/transactional_directory.py +35 -0
  129. pixeltable-0.2.4.dist-info/LICENSE +18 -0
  130. pixeltable-0.2.4.dist-info/METADATA +127 -0
  131. pixeltable-0.2.4.dist-info/RECORD +132 -0
  132. {pixeltable-0.1.0.dist-info → pixeltable-0.2.4.dist-info}/WHEEL +1 -1
  133. pixeltable/catalog.py +0 -1421
  134. pixeltable/exprs.py +0 -1745
  135. pixeltable/function.py +0 -269
  136. pixeltable/functions/clip.py +0 -10
  137. pixeltable/functions/pil/__init__.py +0 -23
  138. pixeltable/functions/tf.py +0 -21
  139. pixeltable/index.py +0 -57
  140. pixeltable/tests/test_dict.py +0 -24
  141. pixeltable/tests/test_functions.py +0 -11
  142. pixeltable/tests/test_tf.py +0 -69
  143. pixeltable/tf.py +0 -33
  144. pixeltable/utils/tf.py +0 -33
  145. pixeltable/utils/video.py +0 -32
  146. pixeltable-0.1.0.dist-info/METADATA +0 -34
  147. pixeltable-0.1.0.dist-info/RECORD +0 -36
@@ -0,0 +1,41 @@
1
+ from typing import Tuple, List, Optional
2
+ import types
3
+ import sys
4
+
5
+ import pixeltable.func as func
6
+ import pixeltable.type_system as ts
7
+ import pixeltable.env as env
8
+
9
+
10
def create_nos_modules() -> List[types.ModuleType]:
    """Build the synthetic module pixeltable.functions.nos and its per-task submodules.

    The model list and per-model info are obtained from the NOS client of the current Env.
    Models are ordered by task value; every change of task value starts a new submodule
    (named after the lower-cased task name), and each model is exposed in its task's
    submodule as a func.NOSFunction.

    Returns:
        the newly created per-task submodules, in creation order
    """
    root_name = 'pixeltable.functions.nos'

    # fetch the info for every model and order it by task so that models of the same task are adjacent
    infos = sorted(
        (env.Env.get().nos_client.GetModelInfo(model) for model in env.Env.get().nos_client.ListModels()),
        key=lambda model_info: model_info.task.value,
    )

    # register the parent module
    root_module = types.ModuleType(root_name)
    root_module.__package__ = 'pixeltable.functions'
    sys.modules[root_name] = root_module

    created: List[types.ModuleType] = []
    task_module: Optional[types.ModuleType] = None
    last_task = ''
    for model_info in infos:
        task_value = model_info.task.value
        if task_value != last_task:
            # first model of a new task: set up that task's submodule
            namespace = model_info.task.name.lower()
            submodule_name = f'{root_name}.{namespace}'
            task_module = types.ModuleType(submodule_name)
            task_module.__package__ = root_name
            sys.modules[submodule_name] = task_module
            setattr(root_module, namespace, task_module)
            created.append(task_module)
            last_task = task_value

        # expose this model as a Function; '/' and '-' are replaced to get a valid identifier
        model_id = model_info.name.replace("/", "_").replace("-", "_")
        setattr(task_module, model_id, func.NOSFunction(model_info, f'{submodule_name}.{model_id}'))

    return created
@@ -0,0 +1,62 @@
1
+ from typing import Optional
2
+ import uuid
3
+ import av
4
+ import sys
5
+
6
+ import pixeltable.env as env
7
+ import pixeltable.func as func
8
+ import pixeltable.type_system as ts
9
+
10
+
11
+ _format_defaults = { # format -> (codec, ext)
12
+ 'wav': ('pcm_s16le', 'wav'),
13
+ 'mp3': ('libmp3lame', 'mp3'),
14
+ 'flac': ('flac', 'flac'),
15
+ #'mp4': ('aac', 'm4a'),
16
+ }
17
+
18
+ # for mp4:
19
+ # - extract_audio() fails with "Application provided invalid, non monotonically increasing dts to muxer in stream 0: 1146 >= 290"
20
+ # - chatgpt suggests this can be fixed in the following manner
21
+ # for packet in container.demux(audio_stream):
22
+ # packet.pts = None # Reset the PTS and DTS to allow FFmpeg to set them automatically
23
+ # packet.dts = None
24
+ # for frame in packet.decode():
25
+ # frame.pts = None
26
+ # for packet in output_stream.encode(frame):
27
+ # output_container.mux(packet)
28
+ #
29
+ # # Flush remaining packets
30
+ # for packet in output_stream.encode():
31
+ # output_container.mux(packet)
32
+
33
+
34
+ _extract_audio_param_types = [
35
+ ts.VideoType(nullable=False),
36
+ ts.IntType(nullable=False),
37
+ ts.StringType(nullable=False),
38
+ ts.StringType(nullable=False)
39
+ ]
40
@func.udf(return_type=ts.AudioType(nullable=True), param_types=_extract_audio_param_types)
def extract_audio(
        video_path: str, stream_idx: int = 0, format: str = 'wav', codec: Optional[str] = None
) -> Optional[str]:
    """Extract an audio stream from a video file, save it as a media file and return its path

    Args:
        video_path: path of the video file to read
        stream_idx: index into the video's audio streams
        format: output container format; must be a key of _format_defaults
        codec: codec for the output stream; if None, the default codec for 'format' is used

    Returns:
        the path of the newly written audio file, or None if the video has no audio stream at stream_idx

    Raises:
        ValueError: if 'format' is not a supported audio format
    """
    if format not in _format_defaults:
        raise ValueError(f'extract_audio(): unsupported audio format: {format}')
    default_codec, ext = _format_defaults[format]

    with av.open(video_path) as container:
        if len(container.streams.audio) <= stream_idx:
            return None
        audio_stream = container.streams.audio[stream_idx]
        # create this in our tmp directory, so it'll get cleaned up if it's being generated as part of a query
        output_filename = str(env.Env.get().tmp_dir / f"{uuid.uuid4()}.{ext}")

        with av.open(output_filename, "w", format=format) as output_container:
            output_stream = output_container.add_stream(codec or default_codec)
            for packet in container.demux(audio_stream):
                for frame in packet.decode():
                    output_container.mux(output_stream.encode(frame))
            # flush the encoder: codecs that buffer frames internally only hand out their
            # remaining packets when encode() is called without a frame
            output_container.mux(output_stream.encode())

        return output_filename
@@ -0,0 +1,3 @@
1
+ from .base import ComponentIterator
2
+ from .video import FrameIterator
3
+
@@ -0,0 +1,48 @@
1
+ from __future__ import annotations
2
+ from typing import Dict, Any, Tuple, List
3
+ from abc import abstractmethod, ABC
4
+
5
+ from pixeltable.type_system import ColumnType
6
+
7
+
8
class ComponentIterator(ABC):
    """Abstract interface that every component iterator has to implement.

    Instances are their own iterators: __iter__() returns self and subclasses supply __next__().
    """

    @classmethod
    @abstractmethod
    def input_schema(cls) -> Dict[str, ColumnType]:
        """Describe the constructor parameters as Pixeltable types

        The dictionary is keyed by parameter name; the keys have to match the names of the
        __init__() parameters. This plays the same role as the param_types argument of the
        udf decorator.
        """
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
        """Describe the dictionaries produced by __next__()

        Returns:
            a tuple of
            - a dictionary (name -> type) which is turned into a list of columns in the output table
            - a list with the names of those columns that are unstored
        """
        raise NotImplementedError

    def __iter__(self) -> ComponentIterator:
        # the iterator object itself carries the iteration state
        return self

    @abstractmethod
    def __next__(self) -> Dict[str, Any]:
        """Produce the next element as a dictionary; raise StopIteration when there are no more elements"""
        raise NotImplementedError

    @abstractmethod
    def close(self) -> None:
        """Release all resources held by the iterator"""
        raise NotImplementedError

    @abstractmethod
    def set_pos(self, pos: int) -> None:
        """Move the iterator to position pos"""
        raise NotImplementedError
@@ -0,0 +1,311 @@
1
+ from typing import Dict, Any, List, Tuple, Generator, Optional, Iterable
2
+ import logging
3
+ import dataclasses
4
+ import enum
5
+
6
+ from .base import ComponentIterator
7
+
8
+ from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
9
+ from pixeltable.exceptions import Error
10
+ from pixeltable.env import Env
11
+ from pixeltable.utils.documents import get_document_handle
12
+
13
+
14
+ _logger = logging.getLogger('pixeltable')
15
+
16
+
17
class ChunkMetadata(enum.Enum):
    """Names of the metadata fields that can be requested for a document chunk."""
    TITLE = 1
    HEADINGS = 2
    SOURCELINE = 3
21
+
22
+
23
class Separator(enum.Enum):
    """Names of the criteria by which a document can be split into chunks."""
    HEADING = 1
    PARAGRAPH = 2
    SENTENCE = 3
    TOKEN_LIMIT = 4
    CHAR_LIMIT = 5
29
+
30
+
31
@dataclasses.dataclass
class DocumentSectionMd:
    """Metadata describing one structural element of a document (eg, a heading or a paragraph)"""
    # line of the source document that this section is associated with
    source_line: int

    # headings in effect for this section, keyed by heading level, up to the most recently seen heading;
    # eg, after an h2 was seen last, this holds entries for levels 1 and 2 and for no deeper level
    headings: Dict[int, str]
39
+
40
+
41
@dataclasses.dataclass
class DocumentSection:
    """One chunk of a document, as produced by one of the splitting criteria"""
    # the text of the chunk; may be None
    text: Optional[str]
    # metadata of the section this chunk was taken from; may be None
    md: Optional[DocumentSectionMd]
46
+
47
+
48
class DocumentSplitter(ComponentIterator):
    """Iterator over pieces of a document

    The document is first split into sections along its structure (headings/paragraphs, for html or
    markdown); those sections are then optionally split into sentences and finally into chunks that
    are limited by a number of tokens or characters.
    """
    MD_COLUMN_TYPES = {
        ChunkMetadata.TITLE: StringType(),
        ChunkMetadata.HEADINGS: JsonType(),
        ChunkMetadata.SOURCELINE: IntType()
    }

    def __init__(
            self, document: str, *, separators: str, limit: int = 0, overlap: int = 0, metadata: str = '',
            html_skip_tags: Optional[List[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
            tiktoken_target_model: Optional[str] = None
    ):
        """
        Args:
            document: path of the document file; it is read as utf8 text
            separators: comma-separated list of Separator names (case-insensitive)
            limit: maximum chunk size, in tokens or characters; used by the token_limit/char_limit separators
            overlap: number of tokens/characters shared by consecutive chunks
            metadata: comma-separated list of ChunkMetadata names (case-insensitive); may be empty
            html_skip_tags: names of html tags whose content is ignored; defaults to ['nav']
            tiktoken_encoding: name of the tiktoken encoding, used if tiktoken_target_model is None
            tiktoken_target_model: if not None, the tiktoken encoding is looked up for this model
        """
        import bs4
        if html_skip_tags is None:
            html_skip_tags = ['nav']
        with open(document, 'r', encoding='utf8') as fh:
            contents = fh.read()
        self._doc_handle = get_document_handle(contents)
        assert self._doc_handle is not None
        self._separators = [Separator[name.upper()] for name in separators.split(',')]
        self._md_fields = [ChunkMetadata[name.upper()] for name in metadata.split(',')] if len(metadata) > 0 else []
        # an html document is not required to have a <title>, in which case bs_doc.title is None
        bs_doc = self._doc_handle.bs_doc
        title_tag = bs_doc.title if bs_doc is not None else None
        self._doc_title = title_tag.get_text().strip() if title_tag is not None else ''
        self._limit = limit
        self._skip_tags = html_skip_tags
        self._overlap = overlap
        self._tiktoken_encoding = tiktoken_encoding
        self._tiktoken_target_model = tiktoken_target_model

        # set up processing pipeline: each stage is a generator that consumes the previous one
        if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
            assert self._doc_handle.bs_doc is not None
            self._sections = self._html_sections()
        else:
            assert self._doc_handle.md_ast is not None
            self._sections = self._markdown_sections()
        if Separator.SENTENCE in self._separators:
            self._sections = self._sentence_sections(self._sections)
        if Separator.TOKEN_LIMIT in self._separators:
            self._sections = self._token_chunks(self._sections)
        if Separator.CHAR_LIMIT in self._separators:
            self._sections = self._char_chunks(self._sections)

    @classmethod
    def input_schema(cls) -> Dict[str, ColumnType]:
        # NOTE(review): the key 'skip_tags' differs from the constructor parameter name
        # 'html_skip_tags' -- confirm which one is intended
        return {
            'document': DocumentType(nullable=False),
            'separators': StringType(nullable=False),
            'metadata': StringType(nullable=True),
            'limit': IntType(nullable=True),
            'overlap': IntType(nullable=True),
            'skip_tags': StringType(nullable=True),
            'tiktoken_encoding': StringType(nullable=True),
            'tiktoken_target_model': StringType(nullable=True),
        }

    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
        """Validate the arguments and derive the output columns

        The output always contains a 'text' column, plus one column per requested metadata field.

        Raises:
            Error: for an unknown metadata field or separator, or an invalid combination of
                separators, limit and overlap
        """
        schema = {'text': StringType()}
        if 'metadata' in kwargs and len(kwargs['metadata']) > 0:
            md_fields = kwargs['metadata'].split(',')
            for md_field in md_fields:
                if md_field.upper() not in ChunkMetadata.__members__:
                    raise Error(f'Invalid metadata field {md_field}')
                schema[md_field.lower()] = cls.MD_COLUMN_TYPES[ChunkMetadata[md_field.upper()]]

        assert 'separators' in kwargs
        raw_separators = kwargs['separators'].split(',')
        for separator in raw_separators:
            if separator.upper() not in Separator.__members__:
                raise Error(f'Invalid separator {separator}')
        # separator names are case-insensitive (__init__() upper-cases them), so normalize before
        # running the checks below; otherwise eg 'TOKEN_LIMIT' would bypass them
        separators = {separator.lower() for separator in raw_separators}

        # check dependencies
        if 'sentence' in separators:
            Env.get().require_package('spacy')
        if 'token_limit' in separators:
            Env.get().require_package('tiktoken')

        if 'limit' in kwargs or 'overlap' in kwargs:
            if 'token_limit' not in separators and 'char_limit' not in separators:
                raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
            if 'limit' in kwargs and int(kwargs['limit']) <= 0:
                raise Error('"limit" must be an integer > 0')
            if 'overlap' in kwargs and int(kwargs['overlap']) < 0:
                raise Error('"overlap" must be an integer >= 0')
            # consecutive chunks start limit - overlap apart, which has to be a positive distance
            if 'limit' in kwargs and 'overlap' in kwargs and int(kwargs['overlap']) >= int(kwargs['limit']):
                raise Error('"overlap" must be smaller than "limit"')
        if 'token_limit' in separators or 'char_limit' in separators:
            if 'token_limit' in separators and 'char_limit' in separators:
                raise Error('Cannot specify both "token_limit" and "char_limit" separators')
            if 'limit' not in kwargs:
                raise Error('limit is required with "token_limit"/"char_limit" separators')

        return schema, []

    def __next__(self) -> Dict[str, Any]:
        """Return the next chunk as a dictionary with its text and the requested metadata fields

        Sections without text are skipped. StopIteration propagates from the underlying generator.
        """
        while True:
            section = next(self._sections)
            if section.text is None:
                continue
            result = {'text': section.text}
            for md_field in self._md_fields:
                if md_field == ChunkMetadata.TITLE:
                    result[md_field.name.lower()] = self._doc_title
                elif md_field == ChunkMetadata.HEADINGS:
                    result[md_field.name.lower()] = section.md.headings
                elif md_field == ChunkMetadata.SOURCELINE:
                    result[md_field.name.lower()] = section.md.source_line
            return result

    def _html_sections(self) -> Generator[DocumentSection, None, None]:
        """Create DocumentSections reflecting the html-specific separators"""
        import bs4
        heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
        emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
        # current state
        text_section = ''  # currently accumulated text
        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
        sourceline = 0  # most recently seen sourceline

        def update_md(el: bs4.Tag) -> None:
            # update current state
            nonlocal sourceline
            sourceline = el.sourceline
            if el.name in heading_tags:
                level = int(el.name[1])
                # remove the previously seen lower levels
                lower_levels = [l for l in headings.keys() if l > level]
                for l in lower_levels:
                    del headings[l]
                headings[level] = el.get_text().strip()

        def emit() -> Generator[DocumentSection, None, None]:
            # yield the accumulated text (if any) as a section and start a new one
            nonlocal text_section
            if len(text_section) > 0:
                md = DocumentSectionMd(sourceline, headings.copy())
                yield DocumentSection(text=text_section, md=md)
                text_section = ''

        def process_element(el: bs4.PageElement) -> Generator[DocumentSection, None, None]:
            # process the element and emit sections as necessary
            nonlocal text_section
            if el.name in self._skip_tags:
                return

            if isinstance(el, bs4.NavigableString):
                # accumulate text until we see a tag we care about
                text = el.get_text().strip()
                if len(text) > 0:
                    text_section += ' ' + text
                return

            if el.name in heading_tags:
                if emit_on_heading:
                    yield from emit()
                update_md(el)
            elif el.name == 'p':
                if emit_on_paragraph:
                    yield from emit()
                update_md(el)
            for child in el.children:
                yield from process_element(child)

        yield from process_element(self._doc_handle.bs_doc)
        # whatever text is left over after the last separator
        yield from emit()

    def _markdown_sections(self) -> Generator[DocumentSection, None, None]:
        """Create DocumentSections reflecting the markdown-specific separators"""
        assert self._doc_handle.md_ast is not None
        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
        emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
        # current state
        text_section = ''  # currently accumulated text
        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)

        def update_headings(heading: Dict) -> None:
            # update current state
            assert 'type' in heading and heading['type'] == 'heading'
            level = heading['attrs']['level']
            # the heading text is taken from the first child element
            text = heading['children'][0]['raw'].strip()
            # remove the previously seen lower levels
            lower_levels = [l for l in headings.keys() if l > level]
            for l in lower_levels:
                del headings[l]
            headings[level] = text

        def emit() -> Generator[DocumentSection, None, None]:
            # yield the accumulated text (if any) as a section and start a new one;
            # no source line is recorded for markdown sections
            nonlocal text_section
            if len(text_section) > 0:
                md = DocumentSectionMd(0, headings.copy())
                yield DocumentSection(text=text_section, md=md)
                text_section = ''

        def process_element(el: Dict) -> Generator[DocumentSection, None, None]:
            # process the element and emit sections as necessary
            nonlocal text_section
            assert 'type' in el

            if el['type'] == 'text':
                # accumulate text until we see a separator element
                text = el['raw'].strip()
                if len(text) > 0:
                    text_section += ' ' + text
                return

            if el['type'] == 'heading':
                if emit_on_heading:
                    yield from emit()
                update_headings(el)
            elif el['type'] == 'paragraph':
                if emit_on_paragraph:
                    yield from emit()
            if 'children' not in el:
                return
            for child in el['children']:
                yield from process_element(child)

        for el in self._doc_handle.md_ast:
            yield from process_element(el)
        # whatever text is left over after the last separator
        yield from emit()

    def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
        """Split the input sections into sentences"""
        for section in input_sections:
            if section.text is not None:
                doc = Env.get().spacy_nlp(section.text)
                for sent in doc.sents:
                    # every sentence inherits the metadata of the section it came from
                    yield DocumentSection(text=sent.text, md=section.md)

    def _chunk_step(self) -> int:
        """Return the distance between the starts of consecutive chunks

        Raises:
            Error: if limit/overlap would not let the chunking loops make progress
        """
        if self._limit <= 0:
            raise Error('"limit" must be an integer > 0')
        if self._overlap < 0:
            raise Error('"overlap" must be an integer >= 0')
        if self._overlap >= self._limit:
            raise Error('"overlap" must be smaller than "limit"')
        return self._limit - self._overlap

    def _token_chunks(self, input: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
        """Split the input sections into chunks of at most 'limit' tokens"""
        import tiktoken
        if self._tiktoken_target_model is not None:
            encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
        else:
            encoding = tiktoken.get_encoding(self._tiktoken_encoding)
        # a non-positive step would make the loop below run forever
        step = self._chunk_step()

        for section in input:
            if section.text is None:
                continue
            tokens = encoding.encode(section.text)
            start_idx = 0
            while start_idx < len(tokens):
                end_idx = min(start_idx + self._limit, len(tokens))
                text = encoding.decode(tokens[start_idx:end_idx])
                yield DocumentSection(text=text, md=section.md)
                start_idx += step

    def _char_chunks(self, input: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
        """Split the input sections into chunks of at most 'limit' characters"""
        # a non-positive step would make the loop below run forever
        step = self._chunk_step()

        for section in input:
            if section.text is None:
                continue
            start_idx = 0
            while start_idx < len(section.text):
                end_idx = min(start_idx + self._limit, len(section.text))
                text = section.text[start_idx:end_idx]
                yield DocumentSection(text=text, md=section.md)
                start_idx += step

    def close(self) -> None:
        # nothing to do here
        pass

    def set_pos(self, pos: int) -> None:
        # positioning is not implemented: this is a no-op
        pass
@@ -0,0 +1,89 @@
1
+ from typing import Dict, Any, List, Tuple
2
+ from pathlib import Path
3
+ import math
4
+ import logging
5
+
6
+ import cv2
7
+ import PIL.Image
8
+
9
+ from .base import ComponentIterator
10
+
11
+ from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
12
+ from pixeltable.exceptions import Error
13
+
14
+
15
+ _logger = logging.getLogger('pixeltable')
16
+
17
class FrameIterator(ComponentIterator):
    """Iterator over the frames of a video file, optionally subsampled to a requested frame rate"""

    def __init__(self, video: str, fps: float = 0.0):
        """
        Args:
            video: path of the video file
            fps: number of frames to extract per second of video; 0.0 extracts every frame

        Raises:
            Error: if the video cannot be opened, if fps exceeds the frame rate of the video, or
                if the number of frames of the video cannot be determined
        """
        video_path = Path(video)
        assert video_path.exists() and video_path.is_file()
        self.video_path = video_path
        self.fps = fps
        self.video_reader = cv2.VideoCapture(str(video_path))
        if not self.video_reader.isOpened():
            raise Error(f'Failed to open video: {video}')
        video_fps = int(self.video_reader.get(cv2.CAP_PROP_FPS))
        if fps > video_fps:
            raise Error(f'Video {video}: requested fps ({fps}) exceeds that of the video ({video_fps})')
        # we return every frame_freq-th frame of the video
        self.frame_freq = int(video_fps / fps) if fps > 0 else 1
        num_video_frames = int(self.video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
        if num_video_frames == 0:
            raise Error(f'Video {video}: failed to get number of frames')
        # ceil: round up to ensure we count frame 0
        self.num_frames = math.ceil(num_video_frames / self.frame_freq) if fps > 0 else num_video_frames
        _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps}')

        # index of the next frame we return (counting only returned frames)
        self.next_frame_idx = 0

    @classmethod
    def input_schema(cls) -> Dict[str, ColumnType]:
        return {
            'video': VideoType(nullable=False),
            'fps': FloatType()
        }

    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
        # 'frame' is the only unstored column
        return {
            'frame_idx': IntType(),
            'pos_msec': FloatType(),
            'pos_frame': FloatType(),
            'frame': ImageType(),
        }, ['frame']

    def __next__(self) -> Dict[str, Any]:
        """Return the next selected frame together with its index and position in the video

        Raises:
            StopIteration: when the video has no more frames; the video reader is released at that
                point, and every subsequent call raises StopIteration again
        """
        if self.video_reader is None:
            # the reader was released (end of video or close()): stay exhausted instead of
            # failing with an AttributeError on None
            raise StopIteration
        while True:
            # the position properties need to be read before read() advances the reader
            pos_msec = self.video_reader.get(cv2.CAP_PROP_POS_MSEC)
            pos_frame = self.video_reader.get(cv2.CAP_PROP_POS_FRAMES)
            status, img = self.video_reader.read()
            if not status:
                _logger.debug(f'releasing video reader for {self.video_path}')
                self.video_reader.release()
                self.video_reader = None
                raise StopIteration
            if pos_frame % self.frame_freq == 0:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                result = {
                    'frame_idx': self.next_frame_idx,
                    'pos_msec': pos_msec,
                    'pos_frame': pos_frame,
                    'frame': PIL.Image.fromarray(img),
                }
                self.next_frame_idx += 1
                # frame_freq > 1: jumping to the target frame here with video_reader.set() is far slower than just
                # skipping the unwanted frames
                return result

    def close(self) -> None:
        """Release the video reader, if it hasn't been released yet"""
        if self.video_reader is not None:
            self.video_reader.release()
            self.video_reader = None

    def set_pos(self, pos: int) -> None:
        """Seek to frame idx"""
        if pos == self.next_frame_idx:
            return
        _logger.debug(f'seeking to frame {pos}')
        # pos counts returned frames; the corresponding video frame is pos * frame_freq
        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, pos * self.frame_freq)
        self.next_frame_idx = pos
@@ -0,0 +1,54 @@
1
+ import dataclasses
2
+ import importlib
3
+ import os
4
+ import pkgutil
5
+ from typing import Callable, Dict
6
+
7
+ import sqlalchemy as sql
8
+ import sqlalchemy.orm as orm
9
+
10
+ from .schema import SystemInfo, SystemInfoMd
11
+
12
+ # current version of the metadata; this is incremented whenever the metadata schema changes
13
+ VERSION = 12
14
+
15
+
16
def create_system_info(engine: sql.engine.Engine) -> None:
    """Insert the SystemInfo record, stamped with the current metadata schema version (VERSION)"""
    md = dataclasses.asdict(SystemInfoMd(schema_version=VERSION))
    system_info = SystemInfo(md=md)
    with orm.Session(engine, future=True) as session:
        session.add(system_info)
        session.flush()
        session.commit()
24
+
25
+ # conversion functions for upgrading the metadata schema from one version to the following
26
+ # key: old schema version
27
+ converter_cbs: Dict[int, Callable[[sql.engine.Engine], None]] = {}
28
+
29
def register_converter(version: int, cb: Callable[[sql.engine.Engine], None]) -> None:
    """Register cb as the conversion function for metadata schema version 'version'

    The entry is stored in the module-level converter_cbs dictionary, keyed by the old schema
    version; an existing entry for the same version is replaced.
    """
    # the dictionary is only mutated, never rebound, so no 'global' declaration is needed
    converter_cbs[version] = cb
32
+
33
+ # load all converter modules
34
+ for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/converters']):
35
+ importlib.import_module('pixeltable.metadata.converters.' + modname)
36
+
37
def upgrade_md(engine: sql.engine.Engine) -> None:
    """Bring the metadata schema up to the current version (VERSION)

    Reads the stored schema version from the SystemInfo record and runs the registered
    converters one version at a time until VERSION is reached, then writes VERSION back to
    the SystemInfo record. Does nothing if the stored version already equals VERSION.

    Raises:
        RuntimeError: if no converter is registered for one of the intermediate versions
    """
    with orm.Session(engine, future=True) as session:
        current_version = session.query(SystemInfo).one().md['schema_version']
        if current_version == VERSION:
            # already up to date
            return

        # each converter upgrades the schema by exactly one version
        while current_version < VERSION:
            if current_version not in converter_cbs:
                raise RuntimeError(f'No metadata converter for version {current_version}')
            print(f'Converting metadata from version {current_version} to {current_version + 1}')
            convert = converter_cbs[current_version]
            convert(engine)
            current_version += 1

        # update system info
        conn = session.connection()
        updated_md = dataclasses.asdict(SystemInfoMd(schema_version=VERSION))
        update_stmt = SystemInfo.__table__.update().values(md=updated_md)
        conn.execute(update_stmt)
        session.commit()
@@ -0,0 +1,18 @@
1
+ import sqlalchemy as sql
2
+
3
+ from pixeltable.metadata.schema import Table, TableSchemaVersion
4
+ from pixeltable.metadata import register_converter
5
+
6
+
7
def convert_10(engine: sql.engine.Engine) -> None:
    """Metadata converter for schema version 10

    Runs two updates in a single transaction: it removes the 'parameters' key from the md of
    every Table record and merges the default table attributes into the md of every
    TableSchemaVersion record.
    """
    table_attr_defaults = {"comment": None, "num_retained_versions": 10}

    # Because `parameters` wasn't actually used for anything,
    # we can simply delete it without any data loss.
    drop_parameters = sql.update(Table).values(md=Table.md - 'parameters')
    # Add the default table attributes to all instances of tableschemaversions.md.
    add_table_attrs = sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat(table_attr_defaults))

    with engine.begin() as conn:
        conn.execute(drop_parameters)
        conn.execute(add_table_attrs)
16
+
17
+
18
+ register_converter(10, convert_10)