pixeltable 0.1.0__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +34 -6
- pixeltable/catalog/__init__.py +13 -0
- pixeltable/catalog/catalog.py +159 -0
- pixeltable/catalog/column.py +200 -0
- pixeltable/catalog/dir.py +32 -0
- pixeltable/catalog/globals.py +33 -0
- pixeltable/catalog/insertable_table.py +191 -0
- pixeltable/catalog/named_function.py +36 -0
- pixeltable/catalog/path.py +58 -0
- pixeltable/catalog/path_dict.py +139 -0
- pixeltable/catalog/schema_object.py +39 -0
- pixeltable/catalog/table.py +581 -0
- pixeltable/catalog/table_version.py +749 -0
- pixeltable/catalog/table_version_path.py +133 -0
- pixeltable/catalog/view.py +203 -0
- pixeltable/client.py +590 -30
- pixeltable/dataframe.py +540 -349
- pixeltable/env.py +359 -45
- pixeltable/exceptions.py +12 -21
- pixeltable/exec/__init__.py +9 -0
- pixeltable/exec/aggregation_node.py +78 -0
- pixeltable/exec/cache_prefetch_node.py +116 -0
- pixeltable/exec/component_iteration_node.py +79 -0
- pixeltable/exec/data_row_batch.py +95 -0
- pixeltable/exec/exec_context.py +22 -0
- pixeltable/exec/exec_node.py +61 -0
- pixeltable/exec/expr_eval_node.py +217 -0
- pixeltable/exec/in_memory_data_node.py +69 -0
- pixeltable/exec/media_validation_node.py +43 -0
- pixeltable/exec/sql_scan_node.py +225 -0
- pixeltable/exprs/__init__.py +24 -0
- pixeltable/exprs/arithmetic_expr.py +102 -0
- pixeltable/exprs/array_slice.py +71 -0
- pixeltable/exprs/column_property_ref.py +77 -0
- pixeltable/exprs/column_ref.py +105 -0
- pixeltable/exprs/comparison.py +77 -0
- pixeltable/exprs/compound_predicate.py +98 -0
- pixeltable/exprs/data_row.py +195 -0
- pixeltable/exprs/expr.py +586 -0
- pixeltable/exprs/expr_set.py +39 -0
- pixeltable/exprs/function_call.py +380 -0
- pixeltable/exprs/globals.py +69 -0
- pixeltable/exprs/image_member_access.py +115 -0
- pixeltable/exprs/image_similarity_predicate.py +58 -0
- pixeltable/exprs/inline_array.py +107 -0
- pixeltable/exprs/inline_dict.py +101 -0
- pixeltable/exprs/is_null.py +38 -0
- pixeltable/exprs/json_mapper.py +121 -0
- pixeltable/exprs/json_path.py +159 -0
- pixeltable/exprs/literal.py +54 -0
- pixeltable/exprs/object_ref.py +41 -0
- pixeltable/exprs/predicate.py +44 -0
- pixeltable/exprs/row_builder.py +355 -0
- pixeltable/exprs/rowid_ref.py +94 -0
- pixeltable/exprs/type_cast.py +53 -0
- pixeltable/exprs/variable.py +45 -0
- pixeltable/func/__init__.py +9 -0
- pixeltable/func/aggregate_function.py +194 -0
- pixeltable/func/batched_function.py +53 -0
- pixeltable/func/callable_function.py +69 -0
- pixeltable/func/expr_template_function.py +82 -0
- pixeltable/func/function.py +110 -0
- pixeltable/func/function_registry.py +227 -0
- pixeltable/func/globals.py +36 -0
- pixeltable/func/nos_function.py +202 -0
- pixeltable/func/signature.py +166 -0
- pixeltable/func/udf.py +163 -0
- pixeltable/functions/__init__.py +52 -103
- pixeltable/functions/eval.py +216 -0
- pixeltable/functions/fireworks.py +34 -0
- pixeltable/functions/huggingface.py +120 -0
- pixeltable/functions/image.py +16 -0
- pixeltable/functions/openai.py +256 -0
- pixeltable/functions/pil/image.py +148 -7
- pixeltable/functions/string.py +13 -0
- pixeltable/functions/together.py +122 -0
- pixeltable/functions/util.py +41 -0
- pixeltable/functions/video.py +62 -0
- pixeltable/iterators/__init__.py +3 -0
- pixeltable/iterators/base.py +48 -0
- pixeltable/iterators/document.py +311 -0
- pixeltable/iterators/video.py +89 -0
- pixeltable/metadata/__init__.py +54 -0
- pixeltable/metadata/converters/convert_10.py +18 -0
- pixeltable/metadata/schema.py +211 -0
- pixeltable/plan.py +656 -0
- pixeltable/store.py +418 -182
- pixeltable/tests/conftest.py +146 -88
- pixeltable/tests/functions/test_fireworks.py +42 -0
- pixeltable/tests/functions/test_functions.py +60 -0
- pixeltable/tests/functions/test_huggingface.py +158 -0
- pixeltable/tests/functions/test_openai.py +152 -0
- pixeltable/tests/functions/test_together.py +111 -0
- pixeltable/tests/test_audio.py +65 -0
- pixeltable/tests/test_catalog.py +27 -0
- pixeltable/tests/test_client.py +14 -14
- pixeltable/tests/test_component_view.py +370 -0
- pixeltable/tests/test_dataframe.py +439 -0
- pixeltable/tests/test_dirs.py +78 -62
- pixeltable/tests/test_document.py +120 -0
- pixeltable/tests/test_exprs.py +592 -135
- pixeltable/tests/test_function.py +297 -67
- pixeltable/tests/test_migration.py +43 -0
- pixeltable/tests/test_nos.py +54 -0
- pixeltable/tests/test_snapshot.py +208 -0
- pixeltable/tests/test_table.py +1195 -263
- pixeltable/tests/test_transactional_directory.py +42 -0
- pixeltable/tests/test_types.py +5 -11
- pixeltable/tests/test_video.py +151 -34
- pixeltable/tests/test_view.py +530 -0
- pixeltable/tests/utils.py +320 -45
- pixeltable/tool/create_test_db_dump.py +149 -0
- pixeltable/tool/create_test_video.py +81 -0
- pixeltable/type_system.py +445 -124
- pixeltable/utils/__init__.py +17 -46
- pixeltable/utils/arrow.py +98 -0
- pixeltable/utils/clip.py +12 -15
- pixeltable/utils/coco.py +136 -0
- pixeltable/utils/documents.py +39 -0
- pixeltable/utils/filecache.py +195 -0
- pixeltable/utils/help.py +11 -0
- pixeltable/utils/hf_datasets.py +157 -0
- pixeltable/utils/media_store.py +76 -0
- pixeltable/utils/parquet.py +167 -0
- pixeltable/utils/pytorch.py +91 -0
- pixeltable/utils/s3.py +13 -0
- pixeltable/utils/sql.py +17 -0
- pixeltable/utils/transactional_directory.py +35 -0
- pixeltable-0.2.4.dist-info/LICENSE +18 -0
- pixeltable-0.2.4.dist-info/METADATA +127 -0
- pixeltable-0.2.4.dist-info/RECORD +132 -0
- {pixeltable-0.1.0.dist-info → pixeltable-0.2.4.dist-info}/WHEEL +1 -1
- pixeltable/catalog.py +0 -1421
- pixeltable/exprs.py +0 -1745
- pixeltable/function.py +0 -269
- pixeltable/functions/clip.py +0 -10
- pixeltable/functions/pil/__init__.py +0 -23
- pixeltable/functions/tf.py +0 -21
- pixeltable/index.py +0 -57
- pixeltable/tests/test_dict.py +0 -24
- pixeltable/tests/test_functions.py +0 -11
- pixeltable/tests/test_tf.py +0 -69
- pixeltable/tf.py +0 -33
- pixeltable/utils/tf.py +0 -33
- pixeltable/utils/video.py +0 -32
- pixeltable-0.1.0.dist-info/METADATA +0 -34
- pixeltable-0.1.0.dist-info/RECORD +0 -36
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from typing import Tuple, List, Optional
|
|
2
|
+
import types
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
import pixeltable.func as func
|
|
6
|
+
import pixeltable.type_system as ts
|
|
7
|
+
import pixeltable.env as env
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def create_nos_modules() -> List[types.ModuleType]:
    """Build the synthetic module pixeltable.functions.nos and populate it with one submodule per task

    The models known to the NOS client are listed, their info records are ordered by task, and every
    model is exposed as a func.NOSFunction attribute of the submodule for its task. The parent module and
    all submodules are registered in sys.modules.

    Returns:
        the newly created per-task submodules, in the order in which they were created
    """
    # fetch the info records first, so that nothing gets registered if the client calls fail
    infos = sorted(
        [env.Env.get().nos_client.GetModelInfo(m) for m in env.Env.get().nos_client.ListModels()],
        key=lambda model_info: model_info.task.value)

    module_name = 'pixeltable.functions.nos'
    nos_module = types.ModuleType(module_name)
    nos_module.__package__ = 'pixeltable.functions'
    sys.modules[module_name] = nos_module

    created: List[types.ModuleType] = []
    sub_module: Optional[types.ModuleType] = None
    last_task = ''
    for info in infos:
        task = info.task.value
        if task != last_task:
            # the records are ordered by task: a change of task starts a new submodule
            namespace = info.task.name.lower()
            submodule_name = f'{module_name}.{namespace}'
            sub_module = types.ModuleType(submodule_name)
            sub_module.__package__ = module_name
            setattr(nos_module, namespace, sub_module)
            created.append(sub_module)
            sys.modules[submodule_name] = sub_module
            last_task = task

        # expose this model as a Function; '/' and '-' in the model name are mapped to '_'
        model_id = info.name.replace("/", "_").replace("-", "_")
        setattr(sub_module, model_id, func.NOSFunction(info, f'{submodule_name}.{model_id}'))

    return created
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
import uuid
|
|
3
|
+
import av
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
import pixeltable.env as env
|
|
7
|
+
import pixeltable.func as func
|
|
8
|
+
import pixeltable.type_system as ts
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# supported output formats: format -> (default codec, file extension)
_format_defaults = dict(
    wav=('pcm_s16le', 'wav'),
    mp3=('libmp3lame', 'mp3'),
    flac=('flac', 'flac'),
    # mp4=('aac', 'm4a'),  # disabled for now, see the notes below
)

# notes on mp4:
# - extract_audio() fails with
#   "Application provided invalid, non monotonically increasing dts to muxer in stream 0: 1146 >= 290"
# - chatgpt suggests this can be fixed in the following manner (untested):
#   for packet in container.demux(audio_stream):
#       packet.pts = None  # Reset the PTS and DTS to allow FFmpeg to set them automatically
#       packet.dts = None
#       for frame in packet.decode():
#           frame.pts = None
#           for packet in output_stream.encode(frame):
#               output_container.mux(packet)
#
#   # Flush remaining packets
#   for packet in output_stream.encode():
#       output_container.mux(packet)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Pixeltable types of the extract_audio() parameters, in declaration order
# NOTE(review): 'codec' is declared non-nullable here, but extract_audio() defaults it to None -
# confirm how the udf decorator treats omitted parameters
_extract_audio_param_types = [
    ts.VideoType(nullable=False),  # video_path
    ts.IntType(nullable=False),  # stream_idx
    ts.StringType(nullable=False),  # format
    ts.StringType(nullable=False),  # codec
]
|
|
40
|
+
@func.udf(return_type=ts.AudioType(nullable=True), param_types=_extract_audio_param_types)
def extract_audio(
        video_path: str, stream_idx: int = 0, format: str = 'wav', codec: Optional[str] = None
) -> Optional[str]:
    """Extract an audio stream from a video file, save it as a media file and return its path

    Args:
        video_path: path of the video file to read
        stream_idx: index of the audio stream to extract
        format: output container format; must be a key of _format_defaults
        codec: codec used for the output stream; defaults to the codec registered for 'format'

    Returns:
        the path of the newly written audio file, or None if the video has no audio stream with index
        'stream_idx'

    Raises:
        ValueError: if 'format' is not supported
    """
    if format not in _format_defaults:
        raise ValueError(f'extract_audio(): unsupported audio format: {format}')
    default_codec, ext = _format_defaults[format]

    with av.open(video_path) as container:
        if len(container.streams.audio) <= stream_idx:
            return None
        audio_stream = container.streams.audio[stream_idx]
        # create this in our tmp directory, so it'll get cleaned up if it's being generated as part of a query
        output_filename = str(env.Env.get().tmp_dir / f"{uuid.uuid4()}.{ext}")

        with av.open(output_filename, "w", format=format) as output_container:
            output_stream = output_container.add_stream(codec or default_codec)
            for packet in container.demux(audio_stream):
                for frame in packet.decode():
                    output_container.mux(output_stream.encode(frame))
            # flush the encoder: codecs buffer frames internally, and without this final call the tail
            # end of the audio is missing from the output file
            output_container.mux(output_stream.encode(None))

        return output_filename
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Dict, Any, Tuple, List
|
|
3
|
+
from abc import abstractmethod, ABC
|
|
4
|
+
|
|
5
|
+
from pixeltable.type_system import ColumnType
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ComponentIterator(ABC):
    """Abstract interface of component iterators.

    A concrete iterator describes its constructor parameters (input_schema()) and the dictionaries it
    produces (output_schema()), and implements the iteration itself (__next__(), set_pos(), close()).
    """

    @classmethod
    @abstractmethod
    def input_schema(cls) -> Dict[str, ColumnType]:
        """Return the Pixeltable type of each init() parameter, keyed by parameter name

        The keys must be identical to the names of the init() parameters; this plays the same role as the
        param_types argument of the @udf decorator.
        """
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
        """Describe the dictionaries returned by next()

        Returns:
            a tuple of
            - a dictionary (name -> type), which is turned into a list of columns in the output table
            - the names of those columns that are unstored
        """
        raise NotImplementedError

    def __iter__(self) -> ComponentIterator:
        # an iterator is its own iterable
        return self

    @abstractmethod
    def __next__(self) -> Dict[str, Any]:
        """Produce the next element as a dictionary; raise StopIteration when there are none left"""
        raise NotImplementedError

    @abstractmethod
    def close(self) -> None:
        """Release all resources held by the iterator"""
        raise NotImplementedError

    @abstractmethod
    def set_pos(self, pos: int) -> None:
        """Move the iterator to position pos"""
        raise NotImplementedError
|
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
from typing import Dict, Any, List, Tuple, Generator, Optional, Iterable
|
|
2
|
+
import logging
|
|
3
|
+
import dataclasses
|
|
4
|
+
import enum
|
|
5
|
+
|
|
6
|
+
from .base import ComponentIterator
|
|
7
|
+
|
|
8
|
+
from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
|
|
9
|
+
from pixeltable.exceptions import Error
|
|
10
|
+
from pixeltable.env import Env
|
|
11
|
+
from pixeltable.utils.documents import get_document_handle
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
_logger = logging.getLogger('pixeltable')
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ChunkMetadata(enum.Enum):
    """Metadata fields that can be requested for each chunk produced by DocumentSplitter"""
    TITLE = 1  # title of the document
    HEADINGS = 2  # headings in effect for the chunk (level -> heading text)
    SOURCELINE = 3  # source line recorded for the chunk
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class Separator(enum.Enum):
    """Criteria by which DocumentSplitter breaks a document into chunks"""
    # structural separators
    HEADING = 1
    PARAGRAPH = 2
    SENTENCE = 3
    # size-based separators
    TOKEN_LIMIT = 4
    CHAR_LIMIT = 5
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclasses.dataclass
class DocumentSectionMd:
    """Metadata describing one structural element of a document (eg, a heading or a paragraph)"""

    # source line associated with the section
    source_line: int

    # headings in effect for the section, keyed by level; only levels up to the most recently observed
    # heading are present: after an h2, the keys are 1 and 2, with nothing below that
    headings: Dict[int, str]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclasses.dataclass
class DocumentSection:
    """One chunk of a document, produced by applying some of the splitting criteria"""

    # text of the chunk
    text: Optional[str]

    # metadata of the structural element the chunk was taken from
    md: Optional[DocumentSectionMd]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class DocumentSplitter(ComponentIterator):
    """Iterator over pieces of a document

    The document is first split into structural sections (along headings/paragraphs of the HTML or Markdown
    structure). Depending on the requested separators, these sections are then split into sentences and/or
    into chunks with a maximum number of tokens or characters. Each element returned by next() is a
    dictionary with the key 'text' plus one key per requested metadata field.
    """
    MD_COLUMN_TYPES = {
        ChunkMetadata.TITLE: StringType(),
        ChunkMetadata.HEADINGS: JsonType(),
        ChunkMetadata.SOURCELINE: IntType()
    }

    def __init__(
            self, document: str, *, separators: str, limit: int = 0, overlap: int = 0, metadata: str = '',
            html_skip_tags: List[str] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
            tiktoken_target_model: Optional[str] = None
    ):
        """
        Args:
            document: path of the document file; the file is read as UTF-8
            separators: comma-separated names of Separator members (case-insensitive)
            limit: maximum chunk size (in tokens or characters) for the token_limit/char_limit separators
            overlap: number of tokens/characters shared by consecutive chunks; must be smaller than limit
            metadata: comma-separated names of ChunkMetadata members (case-insensitive); may be empty
            html_skip_tags: names of HTML tags whose content is ignored; defaults to ['nav']
            tiktoken_encoding: name of the tiktoken encoding used for the token_limit separator
            tiktoken_target_model: if set, the tiktoken encoding is chosen for this model instead

        Raises:
            Error: if no document handle could be created for the file contents or if limit/overlap are
                invalid for the token_limit/char_limit separators
        """
        if html_skip_tags is None:
            html_skip_tags = ['nav']
        with open(document, 'r', encoding='utf8') as fh:
            s = fh.read()
        self._doc_handle = get_document_handle(s)
        if self._doc_handle is None:
            # depends on the contents of a user-supplied file, so this must not be an assert
            raise Error(f'Not a supported document: {document}')
        self._separators = [Separator[name.strip().upper()] for name in separators.split(',')]
        self._md_fields = [ChunkMetadata[name.strip().upper()] for name in metadata.split(',')] if metadata else []
        # an HTML document does not necessarily contain a <title> element
        bs_doc = self._doc_handle.bs_doc
        title_tag = bs_doc.title if bs_doc is not None else None
        self._doc_title = title_tag.get_text().strip() if title_tag is not None else ''
        self._limit = limit
        self._skip_tags = html_skip_tags
        self._overlap = overlap
        self._tiktoken_encoding = tiktoken_encoding
        self._tiktoken_target_model = tiktoken_target_model
        if Separator.TOKEN_LIMIT in self._separators or Separator.CHAR_LIMIT in self._separators:
            # fail early instead of when the first chunk is requested
            self._chunk_step()

        # set up processing pipeline
        if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
            assert self._doc_handle.bs_doc is not None
            self._sections = self._html_sections()
        else:
            assert self._doc_handle.md_ast is not None
            self._sections = self._markdown_sections()
        if Separator.SENTENCE in self._separators:
            self._sections = self._sentence_sections(self._sections)
        if Separator.TOKEN_LIMIT in self._separators:
            self._sections = self._token_chunks(self._sections)
        if Separator.CHAR_LIMIT in self._separators:
            self._sections = self._char_chunks(self._sections)

    @classmethod
    def input_schema(cls) -> Dict[str, ColumnType]:
        """Return the Pixeltable types of the init() parameters"""
        # NOTE(review): 'skip_tags' does not match the name (html_skip_tags) or the Python type (List[str])
        # of the corresponding init() parameter - confirm against the consumers of this schema
        return {
            'document': DocumentType(nullable=False),
            'separators': StringType(nullable=False),
            'metadata': StringType(nullable=True),
            'limit': IntType(nullable=True),
            'overlap': IntType(nullable=True),
            'skip_tags': StringType(nullable=True),
            'tiktoken_encoding': StringType(nullable=True),
            'tiktoken_target_model': StringType(nullable=True),
        }

    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
        """Validate the init() arguments and return the schema of the dictionaries returned by next()

        Returns:
            a dictionary containing 'text' plus one entry per requested metadata field
            an empty list (there are no unstored columns)

        Raises:
            Error: if a metadata field or separator is unknown, if a required package is missing or if
                the combination of separators, limit and overlap is invalid
        """
        schema = {'text': StringType()}
        metadata = kwargs.get('metadata')
        if metadata:
            for md_field in (name.strip() for name in metadata.split(',')):
                if md_field.upper() not in ChunkMetadata.__members__:
                    raise Error(f'Invalid metadata field {md_field}')
                schema[md_field.lower()] = cls.MD_COLUMN_TYPES[ChunkMetadata[md_field.upper()]]

        assert 'separators' in kwargs
        # names are matched case-insensitively; normalize them once, so that the checks below agree with
        # what init() accepts
        separators = [name.strip().lower() for name in kwargs['separators'].split(',')]
        for separator in separators:
            if separator.upper() not in Separator.__members__:
                raise Error(f'Invalid separator {separator}')

        # check dependencies
        if 'sentence' in separators:
            Env.get().require_package('spacy')
        if 'token_limit' in separators:
            Env.get().require_package('tiktoken')

        if 'limit' in kwargs or 'overlap' in kwargs:
            if 'token_limit' not in separators and 'char_limit' not in separators:
                raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
            if 'limit' in kwargs and int(kwargs['limit']) <= 0:
                raise Error('"limit" must be an integer > 0')
            if 'overlap' in kwargs and int(kwargs['overlap']) < 0:
                raise Error('"overlap" must be an integer >= 0')
            if 'limit' in kwargs and 'overlap' in kwargs and int(kwargs['overlap']) >= int(kwargs['limit']):
                # chunking advances by limit - overlap and would never terminate otherwise
                raise Error('"overlap" must be smaller than "limit"')
        if 'token_limit' in separators or 'char_limit' in separators:
            if 'token_limit' in separators and 'char_limit' in separators:
                raise Error('Cannot specify both "token_limit" and "char_limit" separators')
            if 'limit' not in kwargs:
                raise Error('limit is required with "token_limit"/"char_limit" separators')

        return schema, []

    def __next__(self) -> Dict[str, Any]:
        """Return the next chunk as a dictionary ('text' plus the requested metadata fields)"""
        while True:
            section = next(self._sections)
            if section.text is None:
                continue
            result = {'text': section.text}
            for md_field in self._md_fields:
                if md_field == ChunkMetadata.TITLE:
                    result[md_field.name.lower()] = self._doc_title
                elif md_field == ChunkMetadata.HEADINGS:
                    result[md_field.name.lower()] = section.md.headings
                elif md_field == ChunkMetadata.SOURCELINE:
                    result[md_field.name.lower()] = section.md.source_line
            return result

    def _chunk_step(self) -> int:
        """Validate limit/overlap and return the distance between the starts of consecutive chunks"""
        if self._limit <= 0:
            raise Error('"limit" must be an integer > 0')
        if self._overlap < 0:
            raise Error('"overlap" must be an integer >= 0')
        if self._overlap >= self._limit:
            # a step <= 0 would make the chunking loops spin forever
            raise Error('"overlap" must be smaller than "limit"')
        return self._limit - self._overlap

    def _html_sections(self) -> Generator[DocumentSection, None, None]:
        """Create DocumentSections reflecting the html-specific separators"""
        import bs4
        heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
        emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
        # current state
        text_section = ''  # currently accumulated text
        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
        sourceline = 0  # most recently seen sourceline

        def update_md(el: bs4.Tag) -> None:
            # update current state
            nonlocal sourceline
            sourceline = el.sourceline
            if el.name in heading_tags:
                level = int(el.name[1])
                # remove the previously seen lower levels
                for lower_level in [l for l in headings if l > level]:
                    del headings[lower_level]
                headings[level] = el.get_text().strip()

        def emit() -> Generator[DocumentSection, None, None]:
            # yield the accumulated text (if any) as a section and start a new one
            nonlocal text_section
            if len(text_section) > 0:
                md = DocumentSectionMd(sourceline, headings.copy())
                yield DocumentSection(text=text_section, md=md)
                text_section = ''

        def process_element(el: bs4.PageElement) -> Generator[DocumentSection, None, None]:
            # process the element and emit sections as necessary
            nonlocal text_section
            if el.name in self._skip_tags:
                return

            if isinstance(el, bs4.NavigableString):
                # accumulate text until we see a tag we care about
                text = el.get_text().strip()
                if len(text) > 0:
                    text_section += ' ' + text
                return

            if el.name in heading_tags:
                if emit_on_heading:
                    yield from emit()
                update_md(el)
            elif el.name == 'p':
                if emit_on_paragraph:
                    yield from emit()
                update_md(el)
            for child in el.children:
                yield from process_element(child)

        yield from process_element(self._doc_handle.bs_doc)
        yield from emit()

    def _markdown_sections(self) -> Generator[DocumentSection, None, None]:
        """Create DocumentSections reflecting the markdown-specific separators"""
        assert self._doc_handle.md_ast is not None
        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
        emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
        # current state
        text_section = ''  # currently accumulated text
        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)

        def raw_text(node: Dict) -> str:
            # concatenate the raw text of a node and its descendants
            if 'raw' in node:
                return node['raw']
            return ''.join(raw_text(child) for child in node.get('children', []))

        def update_headings(heading: Dict) -> None:
            # update current state
            assert 'type' in heading and heading['type'] == 'heading'
            level = heading['attrs']['level']
            # a heading can be empty or start with a non-text node (which has no 'raw' entry), so we
            # collect the text of all children instead of reading children[0]['raw']
            text = ''.join(raw_text(child) for child in heading.get('children', [])).strip()
            # remove the previously seen lower levels
            for lower_level in [l for l in headings if l > level]:
                del headings[lower_level]
            headings[level] = text

        def emit() -> Generator[DocumentSection, None, None]:
            # yield the accumulated text (if any) as a section and start a new one
            nonlocal text_section
            if len(text_section) > 0:
                md = DocumentSectionMd(0, headings.copy())
                yield DocumentSection(text=text_section, md=md)
                text_section = ''

        def process_element(el: Dict) -> Generator[DocumentSection, None, None]:
            # process the element and emit sections as necessary
            nonlocal text_section
            assert 'type' in el

            if el['type'] == 'text':
                # accumulate text until we see a separator element
                text = el['raw'].strip()
                if len(text) > 0:
                    text_section += ' ' + text
                return

            if el['type'] == 'heading':
                if emit_on_heading:
                    yield from emit()
                update_headings(el)
            elif el['type'] == 'paragraph':
                if emit_on_paragraph:
                    yield from emit()
            if 'children' not in el:
                return
            for child in el['children']:
                yield from process_element(child)

        for el in self._doc_handle.md_ast:
            yield from process_element(el)
        yield from emit()

    def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
        """Split the input sections into sentences"""
        for section in input_sections:
            if section.text is not None:
                doc = Env.get().spacy_nlp(section.text)
                for sent in doc.sents:
                    yield DocumentSection(text=sent.text, md=section.md)

    def _token_chunks(self, input: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
        """Split the input sections into chunks of at most 'limit' tokens, overlapping by 'overlap' tokens"""
        import tiktoken
        if self._tiktoken_target_model is not None:
            encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
        else:
            encoding = tiktoken.get_encoding(self._tiktoken_encoding)
        step = self._chunk_step()

        for section in input:
            if section.text is None:
                continue
            tokens = encoding.encode(section.text)
            start_idx = 0
            while start_idx < len(tokens):
                end_idx = min(start_idx + self._limit, len(tokens))
                text = encoding.decode(tokens[start_idx:end_idx])
                yield DocumentSection(text=text, md=section.md)
                start_idx += step

    def _char_chunks(self, input: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
        """Split the input sections into chunks of at most 'limit' characters, overlapping by 'overlap'"""
        step = self._chunk_step()
        for section in input:
            if section.text is None:
                continue
            start_idx = 0
            while start_idx < len(section.text):
                end_idx = min(start_idx + self._limit, len(section.text))
                text = section.text[start_idx:end_idx]
                yield DocumentSection(text=text, md=section.md)
                start_idx += step

    def close(self) -> None:
        """Nothing to release: the document is read in its entirety in init()"""
        pass

    def set_pos(self, pos: int) -> None:
        """Not supported; the call is ignored"""
        pass
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from typing import Dict, Any, List, Tuple
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import math
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
import cv2
|
|
7
|
+
import PIL.Image
|
|
8
|
+
|
|
9
|
+
from .base import ComponentIterator
|
|
10
|
+
|
|
11
|
+
from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
|
|
12
|
+
from pixeltable.exceptions import Error
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
_logger = logging.getLogger('pixeltable')
|
|
16
|
+
|
|
17
|
+
class FrameIterator(ComponentIterator):
    """Iterator over the frames of a video file

    If fps is greater than 0, only every n-th frame is returned, with n = int(video frame rate / fps);
    otherwise all frames are returned.
    """

    def __init__(self, video: str, fps: float = 0.0):
        """
        Args:
            video: path of the video file
            fps: requested frame rate; 0 selects all frames

        Raises:
            Error: if the file does not exist or cannot be opened, if fps exceeds the frame rate of the
                video or if the number of frames cannot be determined
        """
        video_path = Path(video)
        if not video_path.is_file():
            # depends on user input, so this must not be an assert
            raise Error(f'Video {video}: file does not exist or is not a regular file')
        self.video_path = video_path
        self.fps = fps
        self.video_reader = cv2.VideoCapture(str(video_path))
        if not self.video_reader.isOpened():
            raise Error(f'Failed to open video: {video}')
        try:
            # compare against the exact rate: truncating it first would reject a request for the native
            # rate of a video with a fractional frame rate (eg, 29.97)
            video_fps = self.video_reader.get(cv2.CAP_PROP_FPS)
            if fps > video_fps:
                raise Error(f'Video {video}: requested fps ({fps}) exceeds that of the video ({video_fps})')
            # the sampling interval is computed from the truncated rate and is at least 1
            self.frame_freq = max(1, int(int(video_fps) / fps)) if fps > 0 else 1
            num_video_frames = int(self.video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
            if num_video_frames == 0:
                raise Error(f'Video {video}: failed to get number of frames')
        except Error:
            # don't leak the capture handle when validation fails
            self.close()
            raise
        # ceil: round up to ensure we count frame 0
        self.num_frames = math.ceil(num_video_frames / self.frame_freq) if fps > 0 else num_video_frames
        _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps}')

        self.next_frame_idx = 0

    @classmethod
    def input_schema(cls) -> Dict[str, ColumnType]:
        """Return the Pixeltable types of the init() parameters"""
        return {
            'video': VideoType(nullable=False),
            'fps': FloatType()
        }

    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
        """Return the schema of the dictionaries returned by next(); 'frame' is the only unstored column"""
        return {
            'frame_idx': IntType(),
            'pos_msec': FloatType(),
            'pos_frame': FloatType(),
            'frame': ImageType(),
        }, ['frame']

    def __next__(self) -> Dict[str, Any]:
        """Return the next selected frame, together with its index and its position in the video

        Raises:
            StopIteration: when the video is exhausted; the video reader is released at that point
        """
        if self.video_reader is None:
            # the reader has already been released: keep signalling the end of the iteration
            raise StopIteration
        while True:
            pos_msec = self.video_reader.get(cv2.CAP_PROP_POS_MSEC)
            pos_frame = self.video_reader.get(cv2.CAP_PROP_POS_FRAMES)
            status, img = self.video_reader.read()
            if not status:
                _logger.debug(f'releasing video reader for {self.video_path}')
                self.video_reader.release()
                self.video_reader = None
                raise StopIteration
            if pos_frame % self.frame_freq == 0:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                result = {
                    'frame_idx': self.next_frame_idx,
                    'pos_msec': pos_msec,
                    'pos_frame': pos_frame,
                    'frame': PIL.Image.fromarray(img),
                }
                self.next_frame_idx += 1
                # frame_freq > 1: jumping to the target frame here with video_reader.set() is far slower than just
                # skipping the unwanted frames
                return result

    def close(self) -> None:
        """Release the video reader, if it is still open"""
        if self.video_reader is not None:
            self.video_reader.release()
            self.video_reader = None

    def set_pos(self, pos: int) -> None:
        """Seek to frame idx"""
        if pos == self.next_frame_idx:
            return
        _logger.debug(f'seeking to frame {pos}')
        if self.video_reader is None:
            # the reader is released when the video is exhausted; reopen it in order to seek
            self.video_reader = cv2.VideoCapture(str(self.video_path))
            if not self.video_reader.isOpened():
                self.video_reader = None
                raise Error(f'Failed to open video: {self.video_path}')
        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, pos * self.frame_freq)
        self.next_frame_idx = pos
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
import importlib
|
|
3
|
+
import os
|
|
4
|
+
import pkgutil
|
|
5
|
+
from typing import Callable, Dict
|
|
6
|
+
|
|
7
|
+
import sqlalchemy as sql
|
|
8
|
+
import sqlalchemy.orm as orm
|
|
9
|
+
|
|
10
|
+
from .schema import SystemInfo, SystemInfoMd
|
|
11
|
+
|
|
12
|
+
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
13
|
+
VERSION = 12
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def create_system_info(engine: sql.engine.Engine) -> None:
    """Store the SystemInfo record, with schema_version set to the current VERSION"""
    md_dict = dataclasses.asdict(SystemInfoMd(schema_version=VERSION))
    info_record = SystemInfo(md=md_dict)
    with orm.Session(engine, future=True) as db_session:
        db_session.add(info_record)
        db_session.flush()
        db_session.commit()
|
|
24
|
+
|
|
25
|
+
# conversion functions for upgrading the metadata schema from one version to the following
|
|
26
|
+
# key: old schema version
|
|
27
|
+
converter_cbs: Dict[int, Callable[[sql.engine.Engine], None]] = {}
|
|
28
|
+
|
|
29
|
+
def register_converter(version: int, cb: Callable[[sql.engine.Engine], None]) -> None:
    """Record cb as the function that upgrades the metadata schema from 'version' to the following one

    A callback registered earlier for the same version is replaced.
    """
    # converter_cbs is only mutated, never rebound, so no 'global' declaration is needed
    converter_cbs.update({version: cb})
|
|
32
|
+
|
|
33
|
+
# import every module found in the converters directory; as shown by convert_10, a converter module
# registers its callback via register_converter() when it is imported
_converters_dir = os.path.dirname(__file__) + '/converters'
for _, modname, _ in pkgutil.iter_modules([_converters_dir]):
    importlib.import_module(f'pixeltable.metadata.converters.{modname}')
|
|
36
|
+
|
|
37
|
+
def upgrade_md(engine: sql.engine.Engine) -> None:
    """Upgrade the metadata schema to the current version

    The registered converters are applied one version at a time, starting with the version recorded in
    the SystemInfo record, which is then updated to VERSION.

    Raises:
        RuntimeError: if the recorded version is newer than VERSION or if there is no converter for
            one of the intermediate versions
    """
    with orm.Session(engine, future=True) as session:
        system_info = session.query(SystemInfo).one().md
        md_version = system_info['schema_version']
        if md_version == VERSION:
            return
        if md_version > VERSION:
            # the metadata was written with a newer schema: there is nothing we can convert, and
            # stamping the record with our older VERSION would mislabel it
            raise RuntimeError(
                f'Metadata schema version {md_version} is newer than the version supported by this '
                f'release ({VERSION})')
        while md_version < VERSION:
            if md_version not in converter_cbs:
                raise RuntimeError(f'No metadata converter for version {md_version}')
            print(f'Converting metadata from version {md_version} to {md_version + 1}')
            converter_cbs[md_version](engine)
            md_version += 1
        # update system info
        conn = session.connection()
        system_info_md = SystemInfoMd(schema_version=VERSION)
        conn.execute(SystemInfo.__table__.update().values(md=dataclasses.asdict(system_info_md)))
        session.commit()
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import sqlalchemy as sql
|
|
2
|
+
|
|
3
|
+
from pixeltable.metadata.schema import Table, TableSchemaVersion
|
|
4
|
+
from pixeltable.metadata import register_converter
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def convert_10(engine: sql.engine.Engine) -> None:
    """Convert the metadata from schema version 10 to the following version

    Removes the 'parameters' entry from Table.md and merges the entries 'comment' and
    'num_retained_versions' (with their default values) into TableSchemaVersion.md.
    """
    default_table_attrs = {"comment": None, "num_retained_versions": 10}
    # `parameters` wasn't actually used for anything, so it can be deleted without any data loss
    drop_parameters = sql.update(Table).values(md=Table.md - 'parameters')
    # the defaults are concatenated directly into md, ie, they become entries of md itself
    add_table_attrs = sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat(default_table_attrs))
    with engine.begin() as conn:
        conn.execute(drop_parameters)
        conn.execute(add_table_attrs)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
register_converter(10, convert_10)
|