pixeltable 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +64 -11
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +50 -27
- pixeltable/catalog/column.py +27 -11
- pixeltable/catalog/dir.py +6 -4
- pixeltable/catalog/globals.py +8 -1
- pixeltable/catalog/insertable_table.py +22 -12
- pixeltable/catalog/named_function.py +10 -6
- pixeltable/catalog/path.py +3 -2
- pixeltable/catalog/path_dict.py +8 -6
- pixeltable/catalog/schema_object.py +2 -1
- pixeltable/catalog/table.py +121 -101
- pixeltable/catalog/table_version.py +291 -142
- pixeltable/catalog/table_version_path.py +8 -5
- pixeltable/catalog/view.py +67 -26
- pixeltable/dataframe.py +106 -81
- pixeltable/env.py +28 -24
- pixeltable/exec/__init__.py +2 -2
- pixeltable/exec/aggregation_node.py +10 -4
- pixeltable/exec/cache_prefetch_node.py +5 -3
- pixeltable/exec/component_iteration_node.py +9 -9
- pixeltable/exec/data_row_batch.py +21 -10
- pixeltable/exec/exec_context.py +10 -3
- pixeltable/exec/exec_node.py +23 -12
- pixeltable/exec/expr_eval/evaluators.py +13 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +24 -15
- pixeltable/exec/expr_eval/globals.py +30 -7
- pixeltable/exec/expr_eval/row_buffer.py +5 -6
- pixeltable/exec/expr_eval/schedulers.py +151 -31
- pixeltable/exec/in_memory_data_node.py +8 -7
- pixeltable/exec/row_update_node.py +15 -5
- pixeltable/exec/sql_node.py +56 -27
- pixeltable/exprs/__init__.py +2 -2
- pixeltable/exprs/arithmetic_expr.py +57 -26
- pixeltable/exprs/array_slice.py +1 -1
- pixeltable/exprs/column_property_ref.py +2 -1
- pixeltable/exprs/column_ref.py +20 -15
- pixeltable/exprs/comparison.py +6 -2
- pixeltable/exprs/compound_predicate.py +1 -3
- pixeltable/exprs/data_row.py +2 -2
- pixeltable/exprs/expr.py +108 -72
- pixeltable/exprs/expr_dict.py +2 -1
- pixeltable/exprs/expr_set.py +3 -1
- pixeltable/exprs/function_call.py +39 -41
- pixeltable/exprs/globals.py +1 -0
- pixeltable/exprs/in_predicate.py +2 -2
- pixeltable/exprs/inline_expr.py +20 -17
- pixeltable/exprs/json_mapper.py +4 -2
- pixeltable/exprs/json_path.py +12 -18
- pixeltable/exprs/literal.py +5 -9
- pixeltable/exprs/method_ref.py +1 -0
- pixeltable/exprs/object_ref.py +1 -1
- pixeltable/exprs/row_builder.py +32 -17
- pixeltable/exprs/rowid_ref.py +14 -5
- pixeltable/exprs/similarity_expr.py +11 -6
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/type_cast.py +24 -9
- pixeltable/ext/__init__.py +1 -0
- pixeltable/ext/functions/__init__.py +1 -0
- pixeltable/ext/functions/whisperx.py +2 -2
- pixeltable/ext/functions/yolox.py +11 -11
- pixeltable/func/aggregate_function.py +17 -13
- pixeltable/func/callable_function.py +6 -6
- pixeltable/func/expr_template_function.py +15 -14
- pixeltable/func/function.py +16 -16
- pixeltable/func/function_registry.py +11 -8
- pixeltable/func/globals.py +4 -2
- pixeltable/func/query_template_function.py +12 -13
- pixeltable/func/signature.py +18 -9
- pixeltable/func/tools.py +10 -17
- pixeltable/func/udf.py +106 -11
- pixeltable/functions/__init__.py +21 -2
- pixeltable/functions/anthropic.py +16 -12
- pixeltable/functions/fireworks.py +63 -5
- pixeltable/functions/gemini.py +13 -3
- pixeltable/functions/globals.py +18 -6
- pixeltable/functions/huggingface.py +20 -38
- pixeltable/functions/image.py +7 -3
- pixeltable/functions/json.py +1 -0
- pixeltable/functions/llama_cpp.py +1 -4
- pixeltable/functions/mistralai.py +31 -20
- pixeltable/functions/ollama.py +4 -18
- pixeltable/functions/openai.py +231 -113
- pixeltable/functions/replicate.py +11 -10
- pixeltable/functions/string.py +70 -7
- pixeltable/functions/timestamp.py +21 -8
- pixeltable/functions/together.py +66 -52
- pixeltable/functions/video.py +1 -0
- pixeltable/functions/vision.py +14 -11
- pixeltable/functions/whisper.py +2 -1
- pixeltable/globals.py +60 -26
- pixeltable/index/__init__.py +1 -1
- pixeltable/index/btree.py +5 -3
- pixeltable/index/embedding_index.py +15 -14
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +30 -25
- pixeltable/io/fiftyone.py +6 -14
- pixeltable/io/globals.py +33 -27
- pixeltable/io/hf_datasets.py +2 -1
- pixeltable/io/label_studio.py +77 -68
- pixeltable/io/pandas.py +36 -23
- pixeltable/io/parquet.py +9 -12
- pixeltable/iterators/__init__.py +1 -0
- pixeltable/iterators/audio.py +205 -0
- pixeltable/iterators/document.py +19 -8
- pixeltable/iterators/image.py +6 -24
- pixeltable/iterators/string.py +3 -6
- pixeltable/iterators/video.py +1 -7
- pixeltable/metadata/__init__.py +7 -1
- pixeltable/metadata/converters/convert_10.py +2 -2
- pixeltable/metadata/converters/convert_15.py +1 -5
- pixeltable/metadata/converters/convert_16.py +2 -4
- pixeltable/metadata/converters/convert_17.py +2 -4
- pixeltable/metadata/converters/convert_18.py +2 -4
- pixeltable/metadata/converters/convert_19.py +2 -5
- pixeltable/metadata/converters/convert_20.py +1 -4
- pixeltable/metadata/converters/convert_21.py +4 -6
- pixeltable/metadata/converters/convert_22.py +1 -0
- pixeltable/metadata/converters/convert_23.py +5 -5
- pixeltable/metadata/converters/convert_24.py +12 -13
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/util.py +3 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +13 -2
- pixeltable/plan.py +173 -98
- pixeltable/share/__init__.py +0 -0
- pixeltable/share/packager.py +218 -0
- pixeltable/store.py +42 -26
- pixeltable/type_system.py +102 -75
- pixeltable/utils/arrow.py +7 -8
- pixeltable/utils/coco.py +16 -17
- pixeltable/utils/code.py +1 -1
- pixeltable/utils/console_output.py +6 -3
- pixeltable/utils/description_helper.py +7 -7
- pixeltable/utils/documents.py +3 -1
- pixeltable/utils/filecache.py +12 -7
- pixeltable/utils/http_server.py +9 -8
- pixeltable/utils/iceberg.py +14 -0
- pixeltable/utils/media_store.py +3 -2
- pixeltable/utils/pytorch.py +11 -14
- pixeltable/utils/s3.py +1 -0
- pixeltable/utils/sql.py +1 -0
- pixeltable/utils/transactional_directory.py +2 -2
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/METADATA +9 -9
- pixeltable-0.3.4.dist-info/RECORD +166 -0
- pixeltable-0.3.2.dist-info/RECORD +0 -161
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/entry_points.txt +0 -0
pixeltable/io/pandas.py
CHANGED
|
@@ -9,10 +9,13 @@ import pixeltable.type_system as ts
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def import_pandas(
|
|
12
|
-
tbl_name: str,
|
|
12
|
+
tbl_name: str,
|
|
13
|
+
df: pd.DataFrame,
|
|
14
|
+
*,
|
|
15
|
+
schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
|
|
13
16
|
primary_key: Optional[Union[str, list[str]]] = None,
|
|
14
17
|
num_retained_versions: int = 10,
|
|
15
|
-
comment: str = ''
|
|
18
|
+
comment: str = '',
|
|
16
19
|
) -> pxt.Table:
|
|
17
20
|
"""Creates a new base table from a Pandas
|
|
18
21
|
[`DataFrame`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), with the
|
|
@@ -45,17 +48,21 @@ def import_pandas(
|
|
|
45
48
|
|
|
46
49
|
schema, pxt_pk = __df_to_pxt_schema(df, schema_overrides, primary_key)
|
|
47
50
|
tbl_rows = (dict(__df_row_to_pxt_row(row, schema)) for row in df.itertuples())
|
|
48
|
-
table = pxt.create_table(
|
|
51
|
+
table = pxt.create_table(
|
|
52
|
+
tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
|
|
53
|
+
)
|
|
49
54
|
table.insert(tbl_rows)
|
|
50
55
|
return table
|
|
51
56
|
|
|
52
57
|
|
|
53
58
|
def import_csv(
|
|
54
|
-
tbl_name: str,
|
|
59
|
+
tbl_name: str,
|
|
60
|
+
filepath_or_buffer,
|
|
61
|
+
schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
|
|
55
62
|
primary_key: Optional[Union[str, list[str]]] = None,
|
|
56
63
|
num_retained_versions: int = 10,
|
|
57
64
|
comment: str = '',
|
|
58
|
-
**kwargs
|
|
65
|
+
**kwargs,
|
|
59
66
|
) -> pxt.Table:
|
|
60
67
|
"""
|
|
61
68
|
Creates a new base table from a csv file. This is a convenience method and is equivalent
|
|
@@ -67,15 +74,25 @@ def import_csv(
|
|
|
67
74
|
A handle to the newly created [`Table`][pixeltable.Table].
|
|
68
75
|
"""
|
|
69
76
|
df = pd.read_csv(filepath_or_buffer, **kwargs)
|
|
70
|
-
return import_pandas(
|
|
77
|
+
return import_pandas(
|
|
78
|
+
tbl_name,
|
|
79
|
+
df,
|
|
80
|
+
schema_overrides=schema_overrides,
|
|
81
|
+
primary_key=primary_key,
|
|
82
|
+
num_retained_versions=num_retained_versions,
|
|
83
|
+
comment=comment,
|
|
84
|
+
)
|
|
71
85
|
|
|
72
86
|
|
|
73
87
|
def import_excel(
|
|
74
|
-
tbl_name: str,
|
|
88
|
+
tbl_name: str,
|
|
89
|
+
io,
|
|
90
|
+
*args,
|
|
91
|
+
schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
|
|
75
92
|
primary_key: Optional[Union[str, list[str]]] = None,
|
|
76
93
|
num_retained_versions: int = 10,
|
|
77
94
|
comment: str = '',
|
|
78
|
-
**kwargs
|
|
95
|
+
**kwargs,
|
|
79
96
|
) -> pxt.Table:
|
|
80
97
|
"""
|
|
81
98
|
Creates a new base table from an Excel (.xlsx) file. This is a convenience method and is
|
|
@@ -87,7 +104,14 @@ def import_excel(
|
|
|
87
104
|
A handle to the newly created [`Table`][pixeltable.Table].
|
|
88
105
|
"""
|
|
89
106
|
df = pd.read_excel(io, *args, **kwargs)
|
|
90
|
-
return import_pandas(
|
|
107
|
+
return import_pandas(
|
|
108
|
+
tbl_name,
|
|
109
|
+
df,
|
|
110
|
+
schema_overrides=schema_overrides,
|
|
111
|
+
primary_key=primary_key,
|
|
112
|
+
num_retained_versions=num_retained_versions,
|
|
113
|
+
comment=comment,
|
|
114
|
+
)
|
|
91
115
|
|
|
92
116
|
|
|
93
117
|
def __df_to_pxt_schema(
|
|
@@ -161,20 +185,9 @@ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bo
|
|
|
161
185
|
"""
|
|
162
186
|
Infers a Pixeltable type based on a Numpy dtype.
|
|
163
187
|
"""
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
if np.issubdtype(np_dtype, np.floating):
|
|
168
|
-
return pxt.FloatType(nullable=nullable)
|
|
169
|
-
|
|
170
|
-
if np.issubdtype(np_dtype, np.bool_):
|
|
171
|
-
return pxt.BoolType(nullable=nullable)
|
|
172
|
-
|
|
173
|
-
if np.issubdtype(np_dtype, np.character):
|
|
174
|
-
return pxt.StringType(nullable=nullable)
|
|
175
|
-
|
|
176
|
-
if np.issubdtype(np_dtype, np.datetime64):
|
|
177
|
-
return pxt.TimestampType(nullable=nullable)
|
|
188
|
+
pxttype = ts.ArrayType.from_np_dtype(np_dtype, nullable)
|
|
189
|
+
if pxttype is not None:
|
|
190
|
+
return pxttype
|
|
178
191
|
|
|
179
192
|
if np_dtype == np.object_:
|
|
180
193
|
# The `object_` dtype can mean all sorts of things; see if we can infer the Pixeltable type
|
pixeltable/io/parquet.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import datetime
|
|
3
4
|
import io
|
|
4
5
|
import json
|
|
5
6
|
import logging
|
|
@@ -11,16 +12,16 @@ from typing import Any, Optional, Union
|
|
|
11
12
|
|
|
12
13
|
import numpy as np
|
|
13
14
|
import PIL.Image
|
|
14
|
-
import datetime
|
|
15
15
|
|
|
16
16
|
import pixeltable as pxt
|
|
17
|
-
from pixeltable.env import Env
|
|
18
17
|
import pixeltable.exceptions as exc
|
|
19
18
|
import pixeltable.type_system as ts
|
|
19
|
+
from pixeltable.env import Env
|
|
20
20
|
from pixeltable.utils.transactional_directory import transactional_directory
|
|
21
21
|
|
|
22
22
|
if typing.TYPE_CHECKING:
|
|
23
23
|
import pyarrow as pa
|
|
24
|
+
|
|
24
25
|
import pixeltable as pxt
|
|
25
26
|
|
|
26
27
|
_logger = logging.getLogger('pixeltable')
|
|
@@ -43,11 +44,11 @@ def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path:
|
|
|
43
44
|
|
|
44
45
|
|
|
45
46
|
def export_parquet(
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
47
|
+
table_or_df: Union[pxt.Table, pxt.DataFrame],
|
|
48
|
+
parquet_path: Path,
|
|
49
|
+
partition_size_bytes: int = 100_000_000,
|
|
50
|
+
inline_images: bool = False,
|
|
51
|
+
) -> None:
|
|
51
52
|
"""
|
|
52
53
|
Exports a dataframe's data to one or more Parquet files. Requires pyarrow to be installed.
|
|
53
54
|
|
|
@@ -159,11 +160,7 @@ def parquet_schema_to_pixeltable_schema(parquet_path: str) -> dict[str, Optional
|
|
|
159
160
|
|
|
160
161
|
|
|
161
162
|
def import_parquet(
|
|
162
|
-
table: str,
|
|
163
|
-
*,
|
|
164
|
-
parquet_path: str,
|
|
165
|
-
schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
|
|
166
|
-
**kwargs: Any,
|
|
163
|
+
table: str, *, parquet_path: str, schema_overrides: Optional[dict[str, ts.ColumnType]] = None, **kwargs: Any
|
|
167
164
|
) -> pxt.Table:
|
|
168
165
|
"""Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
|
|
169
166
|
|
pixeltable/iterators/__init__.py (+1 -0; hunk not included in this extract)
pixeltable/iterators/audio.py
CHANGED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import math
|
|
3
|
+
import uuid
|
|
4
|
+
from fractions import Fraction
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Optional
|
|
7
|
+
|
|
8
|
+
import av # type: ignore[import-untyped]
|
|
9
|
+
|
|
10
|
+
import pixeltable.env as env
|
|
11
|
+
import pixeltable.exceptions as excs
|
|
12
|
+
import pixeltable.type_system as ts
|
|
13
|
+
|
|
14
|
+
from .base import ComponentIterator
|
|
15
|
+
|
|
16
|
+
_logger = logging.getLogger('pixeltable')
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class AudioSplitter(ComponentIterator):
|
|
20
|
+
"""
|
|
21
|
+
Iterator over chunks of an audio file. The audio file is split into smaller chunks, where the duration of each chunk is determined by chunk_duration_sec.
|
|
22
|
+
The iterator yields audio chunks as pxt.Audio, along with the start and end time of each chunk.
|
|
23
|
+
If the input contains no audio, no chunks are yielded.
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
chunk_duration_sec: Audio chunk duration in seconds
|
|
27
|
+
overlap_sec: Overlap between consecutive chunks in seconds.
|
|
28
|
+
min_chunk_duration_sec: Drop the last chunk if it is smaller than min_chunk_duration_sec
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
# Input parameters
|
|
32
|
+
audio_path: Path
|
|
33
|
+
chunk_duration_sec: float
|
|
34
|
+
overlap_sec: float
|
|
35
|
+
|
|
36
|
+
# audio stream details
|
|
37
|
+
container: av.container.input.InputContainer
|
|
38
|
+
audio_time_base: Fraction # seconds per presentation time
|
|
39
|
+
|
|
40
|
+
# List of chunks to extract
|
|
41
|
+
# Each chunk is defined by start and end presentation timestamps in audio file (int)
|
|
42
|
+
chunks_to_extract_in_pts: Optional[list[tuple[int, int]]] = []
|
|
43
|
+
# next chunk to extract
|
|
44
|
+
next_pos: int
|
|
45
|
+
|
|
46
|
+
__codec_map = {
|
|
47
|
+
'mp3': 'mp3', # MP3 decoder -> mp3/libmp3lame encoder
|
|
48
|
+
'mp3float': 'mp3', # MP3float decoder -> mp3 encoder
|
|
49
|
+
'aac': 'aac', # AAC decoder -> AAC encoder
|
|
50
|
+
'vorbis': 'libvorbis', # Vorbis decoder -> libvorbis encoder
|
|
51
|
+
'opus': 'libopus', # Opus decoder -> libopus encoder
|
|
52
|
+
'flac': 'flac', # FLAC decoder -> FLAC encoder
|
|
53
|
+
'wavpack': 'wavpack', # WavPack decoder -> WavPack encoder
|
|
54
|
+
'alac': 'alac', # ALAC decoder -> ALAC encoder
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
def __init__(
|
|
58
|
+
self, audio: str, chunk_duration_sec: float, *, overlap_sec: float = 0.0, min_chunk_duration_sec: float = 0.0
|
|
59
|
+
):
|
|
60
|
+
if chunk_duration_sec <= 0.0:
|
|
61
|
+
raise excs.Error('chunk_duration_sec must be a positive number')
|
|
62
|
+
if chunk_duration_sec < min_chunk_duration_sec:
|
|
63
|
+
raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
|
|
64
|
+
if overlap_sec >= chunk_duration_sec:
|
|
65
|
+
raise excs.Error('overlap_sec must be less than chunk_duration_sec')
|
|
66
|
+
audio_path = Path(audio)
|
|
67
|
+
assert audio_path.exists() and audio_path.is_file()
|
|
68
|
+
self.audio_path = audio_path
|
|
69
|
+
self.next_pos = 0
|
|
70
|
+
self.container = av.open(str(audio_path))
|
|
71
|
+
if len(self.container.streams.audio) == 0:
|
|
72
|
+
# No audio stream
|
|
73
|
+
return
|
|
74
|
+
self.chunk_duration_sec = chunk_duration_sec
|
|
75
|
+
self.overlap_sec = overlap_sec
|
|
76
|
+
self.min_chunk_duration_sec = min_chunk_duration_sec
|
|
77
|
+
self.audio_time_base = self.container.streams.audio[0].time_base
|
|
78
|
+
|
|
79
|
+
audio_start_time_pts = self.container.streams.audio[0].start_time or 0
|
|
80
|
+
audio_start_time_sec = float(audio_start_time_pts * self.audio_time_base)
|
|
81
|
+
total_audio_duration_pts = self.container.streams.audio[0].duration or 0
|
|
82
|
+
total_audio_duration_sec = float(total_audio_duration_pts * self.audio_time_base)
|
|
83
|
+
|
|
84
|
+
self.chunks_to_extract_in_pts = [
|
|
85
|
+
(round(start / self.audio_time_base), round(end / self.audio_time_base))
|
|
86
|
+
for (start, end) in self.build_chunks(
|
|
87
|
+
audio_start_time_sec, total_audio_duration_sec, chunk_duration_sec, overlap_sec, min_chunk_duration_sec
|
|
88
|
+
)
|
|
89
|
+
]
|
|
90
|
+
_logger.debug(
|
|
91
|
+
f'AudioIterator: path={self.audio_path} total_audio_duration_pts={total_audio_duration_pts} chunks_to_extract_in_pts={self.chunks_to_extract_in_pts}'
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
@classmethod
|
|
95
|
+
def build_chunks(
|
|
96
|
+
cls,
|
|
97
|
+
start_time_sec: float,
|
|
98
|
+
total_duration_sec: float,
|
|
99
|
+
chunk_duration_sec: float,
|
|
100
|
+
overlap_sec: float,
|
|
101
|
+
min_chunk_duration_sec: float,
|
|
102
|
+
) -> list[tuple[float, float]]:
|
|
103
|
+
chunks_to_extract_in_sec: list[tuple[float, float]] = []
|
|
104
|
+
current_pos = start_time_sec
|
|
105
|
+
end_time = start_time_sec + total_duration_sec
|
|
106
|
+
while current_pos < end_time:
|
|
107
|
+
chunk_start = current_pos
|
|
108
|
+
chunk_end = min(chunk_start + chunk_duration_sec, end_time)
|
|
109
|
+
chunks_to_extract_in_sec.append((chunk_start, chunk_end))
|
|
110
|
+
if chunk_end >= end_time:
|
|
111
|
+
break
|
|
112
|
+
current_pos = chunk_end - overlap_sec
|
|
113
|
+
# If the last chunk is smaller than min_chunk_duration_sec then drop the last chunk from the list
|
|
114
|
+
if (
|
|
115
|
+
len(chunks_to_extract_in_sec) > 0
|
|
116
|
+
and (chunks_to_extract_in_sec[-1][1] - chunks_to_extract_in_sec[-1][0]) < min_chunk_duration_sec
|
|
117
|
+
):
|
|
118
|
+
return chunks_to_extract_in_sec[:-1] # return all but the last chunk
|
|
119
|
+
return chunks_to_extract_in_sec
|
|
120
|
+
|
|
121
|
+
@classmethod
|
|
122
|
+
def input_schema(cls) -> dict[str, ts.ColumnType]:
|
|
123
|
+
return {
|
|
124
|
+
'audio': ts.AudioType(nullable=False),
|
|
125
|
+
'chunk_duration_sec': ts.FloatType(nullable=True),
|
|
126
|
+
'overlap_sec': ts.FloatType(nullable=True),
|
|
127
|
+
'min_chunk_duration_sec': ts.FloatType(nullable=True),
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
@classmethod
|
|
131
|
+
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
132
|
+
return {
|
|
133
|
+
'start_time_sec': ts.FloatType(),
|
|
134
|
+
'end_time_sec': ts.FloatType(),
|
|
135
|
+
'audio_chunk': ts.AudioType(nullable=True),
|
|
136
|
+
}, []
|
|
137
|
+
|
|
138
|
+
def __next__(self) -> dict[str, Any]:
|
|
139
|
+
if self.next_pos >= len(self.chunks_to_extract_in_pts):
|
|
140
|
+
raise StopIteration
|
|
141
|
+
target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
|
|
142
|
+
chunk_start_pts = 0
|
|
143
|
+
chunk_end_pts = 0
|
|
144
|
+
chunk_file = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}{self.audio_path.suffix}')
|
|
145
|
+
output_container = av.open(chunk_file, mode='w')
|
|
146
|
+
input_stream = self.container.streams.audio[0]
|
|
147
|
+
codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)
|
|
148
|
+
output_stream = output_container.add_stream(codec_name, rate=input_stream.codec_context.sample_rate)
|
|
149
|
+
frame_count = 0
|
|
150
|
+
# Since frames don't align with chunk boundaries, we may have read an extra frame in previous iteration
|
|
151
|
+
# Seek to the nearest frame in stream at current chunk start time
|
|
152
|
+
self.container.seek(target_chunk_start, backward=True, stream=self.container.streams.audio[0])
|
|
153
|
+
while True:
|
|
154
|
+
try:
|
|
155
|
+
frame = next(self.container.decode(audio=0))
|
|
156
|
+
except EOFError as e:
|
|
157
|
+
raise excs.Error(f'Failed to read audio file `{self.audio_path}`, error `{e}`')
|
|
158
|
+
except StopIteration:
|
|
159
|
+
# no more frames to scan
|
|
160
|
+
break
|
|
161
|
+
if frame.pts < target_chunk_start:
|
|
162
|
+
# Current frame is behind chunk's start time, always get frame next to chunk's start time
|
|
163
|
+
continue
|
|
164
|
+
if frame.pts >= target_chunk_end:
|
|
165
|
+
# Frame has crossed the chunk boundary, it should be picked up by next chunk, throw away the current frame
|
|
166
|
+
break
|
|
167
|
+
frame_end = frame.pts + frame.samples
|
|
168
|
+
if frame_count == 0:
|
|
169
|
+
# Record start of the first frame
|
|
170
|
+
chunk_start_pts = frame.pts
|
|
171
|
+
# Write frame to output container
|
|
172
|
+
frame_count += 1
|
|
173
|
+
# If encode returns packets, write them to output container. Some encoders will buffer the frames.
|
|
174
|
+
output_container.mux(output_stream.encode(frame))
|
|
175
|
+
# record this frame's end as chunks end
|
|
176
|
+
chunk_end_pts = frame_end
|
|
177
|
+
# Check if frame's end has crossed the chunk boundary
|
|
178
|
+
if frame_end >= target_chunk_end:
|
|
179
|
+
break
|
|
180
|
+
|
|
181
|
+
# record result
|
|
182
|
+
if frame_count > 0:
|
|
183
|
+
# flush encoder
|
|
184
|
+
output_container.mux(output_stream.encode(None))
|
|
185
|
+
output_container.close()
|
|
186
|
+
result = {
|
|
187
|
+
'start_time_sec': round(float(chunk_start_pts * self.audio_time_base), 4),
|
|
188
|
+
'end_time_sec': round(float(chunk_end_pts * self.audio_time_base), 4),
|
|
189
|
+
'audio_chunk': chunk_file if frame_count > 0 else None,
|
|
190
|
+
}
|
|
191
|
+
_logger.debug('audio chunk result: %s', result)
|
|
192
|
+
self.next_pos += 1
|
|
193
|
+
return result
|
|
194
|
+
else:
|
|
195
|
+
# It's possible that there are no frames in the range of the last chunk, stop the iterator in this case.
|
|
196
|
+
# Note that start_time points at the first frame so case applies only for the last chunk
|
|
197
|
+
assert self.next_pos == len(self.chunks_to_extract_in_pts) - 1
|
|
198
|
+
self.next_pos += 1
|
|
199
|
+
raise StopIteration
|
|
200
|
+
|
|
201
|
+
def close(self) -> None:
|
|
202
|
+
self.container.close()
|
|
203
|
+
|
|
204
|
+
def set_pos(self, pos: int) -> None:
|
|
205
|
+
pass
|
pixeltable/iterators/document.py
CHANGED
|
@@ -35,6 +35,7 @@ class Separator(enum.Enum):
|
|
|
35
35
|
@dataclasses.dataclass
|
|
36
36
|
class DocumentSectionMetadata:
|
|
37
37
|
"""Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
|
|
38
|
+
|
|
38
39
|
# html and markdown metadata
|
|
39
40
|
sourceline: Optional[int] = None
|
|
40
41
|
# the stack of headings up to the most recently observed one;
|
|
@@ -50,6 +51,7 @@ class DocumentSectionMetadata:
|
|
|
50
51
|
@dataclasses.dataclass
|
|
51
52
|
class DocumentSection:
|
|
52
53
|
"""A single document chunk, according to some of the splitting criteria"""
|
|
54
|
+
|
|
53
55
|
text: Optional[str]
|
|
54
56
|
metadata: Optional[DocumentSectionMetadata]
|
|
55
57
|
|
|
@@ -93,6 +95,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
93
95
|
|
|
94
96
|
Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
|
|
95
97
|
"""
|
|
98
|
+
|
|
96
99
|
METADATA_COLUMN_TYPES = {
|
|
97
100
|
ChunkMetadata.TITLE: StringType(nullable=True),
|
|
98
101
|
ChunkMetadata.HEADING: JsonType(nullable=True),
|
|
@@ -102,10 +105,16 @@ class DocumentSplitter(ComponentIterator):
|
|
|
102
105
|
}
|
|
103
106
|
|
|
104
107
|
def __init__(
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
108
|
+
self,
|
|
109
|
+
document: str,
|
|
110
|
+
*,
|
|
111
|
+
separators: str,
|
|
112
|
+
limit: Optional[int] = None,
|
|
113
|
+
overlap: Optional[int] = None,
|
|
114
|
+
metadata: str = '',
|
|
115
|
+
html_skip_tags: Optional[list[str]] = None,
|
|
116
|
+
tiktoken_encoding: Optional[str] = 'cl100k_base',
|
|
117
|
+
tiktoken_target_model: Optional[str] = None,
|
|
109
118
|
):
|
|
110
119
|
"""Init method for `DocumentSplitter` class.
|
|
111
120
|
|
|
@@ -234,13 +243,14 @@ class DocumentSplitter(ComponentIterator):
|
|
|
234
243
|
def _html_sections(self) -> Iterator[DocumentSection]:
|
|
235
244
|
"""Create DocumentSections reflecting the html-specific separators"""
|
|
236
245
|
import bs4
|
|
246
|
+
|
|
237
247
|
emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
|
|
238
248
|
emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
|
|
239
249
|
# current state
|
|
240
250
|
accumulated_text: list[str] = [] # currently accumulated text
|
|
241
251
|
# accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
|
|
242
252
|
|
|
243
|
-
headings: dict[str, str] = {}
|
|
253
|
+
headings: dict[str, str] = {} # current state of observed headings (level -> text)
|
|
244
254
|
sourceline = 0 # most recently seen sourceline
|
|
245
255
|
|
|
246
256
|
def update_metadata(el: bs4.Tag) -> None:
|
|
@@ -300,7 +310,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
300
310
|
# current state
|
|
301
311
|
accumulated_text: list[str] = [] # currently accumulated text
|
|
302
312
|
# accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
|
|
303
|
-
headings: dict[str, str] = {}
|
|
313
|
+
headings: dict[str, str] = {} # current state of observed headings (level -> text)
|
|
304
314
|
|
|
305
315
|
def update_headings(heading: dict) -> None:
|
|
306
316
|
# update current state
|
|
@@ -353,6 +363,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
353
363
|
def _pdf_sections(self) -> Iterator[DocumentSection]:
|
|
354
364
|
"""Create DocumentSections reflecting the pdf-specific separators"""
|
|
355
365
|
import fitz # type: ignore[import-untyped]
|
|
366
|
+
|
|
356
367
|
doc: fitz.Document = self._doc_handle.pdf_doc
|
|
357
368
|
assert doc is not None
|
|
358
369
|
|
|
@@ -385,8 +396,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
385
396
|
yield DocumentSection(text=_emit_text(), metadata=metadata)
|
|
386
397
|
|
|
387
398
|
if accumulated_text and emit_on_page and not emit_on_paragraph:
|
|
388
|
-
yield DocumentSection(text=_emit_text(),
|
|
389
|
-
metadata=DocumentSectionMetadata(page=page_number))
|
|
399
|
+
yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(page=page_number))
|
|
390
400
|
accumulated_text = []
|
|
391
401
|
|
|
392
402
|
if accumulated_text and not emit_on_page:
|
|
@@ -411,6 +421,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
411
421
|
|
|
412
422
|
def _token_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
|
|
413
423
|
import tiktoken
|
|
424
|
+
|
|
414
425
|
if self._tiktoken_target_model is not None:
|
|
415
426
|
encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
|
|
416
427
|
else:
|
pixeltable/iterators/image.py
CHANGED
|
@@ -30,15 +30,9 @@ class TileIterator(ComponentIterator):
|
|
|
30
30
|
__i: int
|
|
31
31
|
__j: int
|
|
32
32
|
|
|
33
|
-
def __init__(
|
|
34
|
-
self,
|
|
35
|
-
image: PIL.Image.Image,
|
|
36
|
-
*,
|
|
37
|
-
tile_size: tuple[int, int],
|
|
38
|
-
overlap: tuple[int, int] = (0, 0),
|
|
39
|
-
):
|
|
33
|
+
def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
|
|
40
34
|
if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
|
|
41
|
-
raise excs.Error(f
|
|
35
|
+
raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
|
|
42
36
|
|
|
43
37
|
self.__image = image
|
|
44
38
|
self.__image.load()
|
|
@@ -64,11 +58,7 @@ class TileIterator(ComponentIterator):
|
|
|
64
58
|
x2 = x1 + self.__tile_size[0]
|
|
65
59
|
y2 = y1 + self.__tile_size[1]
|
|
66
60
|
tile = self.__image.crop((x1, y1, x2, y2))
|
|
67
|
-
result = {
|
|
68
|
-
'tile': tile,
|
|
69
|
-
'tile_coord': [self.__i, self.__j],
|
|
70
|
-
'tile_box': [x1, y1, x2, y2]
|
|
71
|
-
}
|
|
61
|
+
result = {'tile': tile, 'tile_coord': [self.__i, self.__j], 'tile_box': [x1, y1, x2, y2]}
|
|
72
62
|
|
|
73
63
|
self.__i += 1
|
|
74
64
|
if self.__i >= self.__xlen:
|
|
@@ -85,16 +75,8 @@ class TileIterator(ComponentIterator):
|
|
|
85
75
|
|
|
86
76
|
@classmethod
|
|
87
77
|
def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
|
|
88
|
-
return {
|
|
89
|
-
'image': ts.ImageType(),
|
|
90
|
-
'tile_size': ts.JsonType(),
|
|
91
|
-
'overlap': ts.JsonType(),
|
|
92
|
-
}
|
|
78
|
+
return {'image': ts.ImageType(), 'tile_size': ts.JsonType(), 'overlap': ts.JsonType()}
|
|
93
79
|
|
|
94
80
|
@classmethod
|
|
95
|
-
def output_schema(cls,
|
|
96
|
-
return {
|
|
97
|
-
'tile': ts.ImageType(),
|
|
98
|
-
'tile_coord': ts.JsonType(),
|
|
99
|
-
'tile_box': ts.JsonType(),
|
|
100
|
-
}, ['tile']
|
|
81
|
+
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
82
|
+
return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']
|
pixeltable/iterators/string.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import
|
|
1
|
+
from typing import Any, Iterator
|
|
2
2
|
|
|
3
3
|
import pixeltable.exceptions as excs
|
|
4
4
|
import pixeltable.type_system as ts
|
|
@@ -30,11 +30,8 @@ class StringSplitter(ComponentIterator):
|
|
|
30
30
|
|
|
31
31
|
@classmethod
|
|
32
32
|
def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
|
|
33
|
-
return {
|
|
34
|
-
'text': ts.StringType(),
|
|
35
|
-
'separators': ts.StringType(),
|
|
36
|
-
}
|
|
33
|
+
return {'text': ts.StringType(), 'separators': ts.StringType()}
|
|
37
34
|
|
|
38
35
|
@classmethod
|
|
39
|
-
def output_schema(cls,
|
|
36
|
+
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
40
37
|
return {'text': ts.StringType()}, []
|
pixeltable/iterators/video.py
CHANGED
|
@@ -24,7 +24,6 @@ class FrameIterator(ComponentIterator):
|
|
|
24
24
|
frame of the video will always be extracted, and the remaining frames will be spaced as evenly as possible.
|
|
25
25
|
|
|
26
26
|
Args:
|
|
27
|
-
video: URL or path of the video to use for frame extraction.
|
|
28
27
|
fps: Number of frames to extract per second of video. This may be a fractional value, such as 0.5.
|
|
29
28
|
If omitted or set to 0.0, then the native framerate of the video will be used (all frames will be
|
|
30
29
|
extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
|
|
@@ -167,12 +166,7 @@ class FrameIterator(ComponentIterator):
|
|
|
167
166
|
img = frame.to_image()
|
|
168
167
|
assert isinstance(img, PIL.Image.Image)
|
|
169
168
|
pos_msec = float(pts * self.video_time_base * 1000)
|
|
170
|
-
result = {
|
|
171
|
-
'frame_idx': self.next_pos,
|
|
172
|
-
'pos_msec': pos_msec,
|
|
173
|
-
'pos_frame': video_idx,
|
|
174
|
-
'frame': img,
|
|
175
|
-
}
|
|
169
|
+
result = {'frame_idx': self.next_pos, 'pos_msec': pos_msec, 'pos_frame': video_idx, 'frame': img}
|
|
176
170
|
self.next_pos += 1
|
|
177
171
|
return result
|
|
178
172
|
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
|
|
|
10
10
|
from .schema import SystemInfo, SystemInfoMd
|
|
11
11
|
|
|
12
12
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
13
|
-
VERSION =
|
|
13
|
+
VERSION = 27
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def create_system_info(engine: sql.engine.Engine) -> None:
|
|
@@ -22,20 +22,25 @@ def create_system_info(engine: sql.engine.Engine) -> None:
|
|
|
22
22
|
session.flush()
|
|
23
23
|
session.commit()
|
|
24
24
|
|
|
25
|
+
|
|
25
26
|
# conversion functions for upgrading the metadata schema from one version to the following
|
|
26
27
|
# key: old schema version
|
|
27
28
|
converter_cbs: dict[int, Callable[[sql.engine.Engine], None]] = {}
|
|
28
29
|
|
|
30
|
+
|
|
29
31
|
def register_converter(version: int) -> Callable[[Callable[[sql.engine.Engine], None]], None]:
|
|
30
32
|
def decorator(fn: Callable[[sql.engine.Engine], None]) -> None:
|
|
31
33
|
global converter_cbs
|
|
32
34
|
converter_cbs[version] = fn
|
|
35
|
+
|
|
33
36
|
return decorator
|
|
34
37
|
|
|
38
|
+
|
|
35
39
|
# load all converter modules
|
|
36
40
|
for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/converters']):
|
|
37
41
|
importlib.import_module('pixeltable.metadata.converters.' + modname)
|
|
38
42
|
|
|
43
|
+
|
|
39
44
|
def upgrade_md(engine: sql.engine.Engine) -> None:
|
|
40
45
|
"""Upgrade the metadata schema to the current version"""
|
|
41
46
|
with orm.Session(engine) as session:
|
|
@@ -48,6 +53,7 @@ def upgrade_md(engine: sql.engine.Engine) -> None:
|
|
|
48
53
|
if md_version not in converter_cbs:
|
|
49
54
|
raise RuntimeError(f'No metadata converter for version {md_version}')
|
|
50
55
|
from pixeltable.env import Env
|
|
56
|
+
|
|
51
57
|
Env.get().console_logger.info(f'Converting metadata from version {md_version} to {md_version + 1}')
|
|
52
58
|
converter_cbs[md_version](engine)
|
|
53
59
|
md_version += 1
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import sqlalchemy as sql
|
|
2
2
|
|
|
3
|
-
from pixeltable.metadata.schema import Table, TableSchemaVersion
|
|
4
3
|
from pixeltable.metadata import register_converter
|
|
4
|
+
from pixeltable.metadata.schema import Table, TableSchemaVersion
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
@register_converter(version=10)
|
|
8
8
|
def _(engine: sql.engine.Engine) -> None:
|
|
9
|
-
default_table_attrs = {
|
|
9
|
+
default_table_attrs = {'comment': None, 'num_retained_versions': 10}
|
|
10
10
|
with engine.begin() as conn:
|
|
11
11
|
# Because `parameters` wasn't actually used for anything,
|
|
12
12
|
# we can simply delete it without any data loss.
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
|
|
2
1
|
import inspect
|
|
3
2
|
import logging
|
|
4
3
|
from typing import Any
|
|
@@ -37,8 +36,5 @@ def __update_md(orig_d: dict, binary_obj: bytes) -> Any:
|
|
|
37
36
|
params.append(func.Parameter(name=name, col_type=col_type, kind=kind, default=default, is_batched=is_batched))
|
|
38
37
|
is_batched = 'batch_size' in orig_d
|
|
39
38
|
sig = func.Signature(return_type, params, is_batched=is_batched)
|
|
40
|
-
d = {
|
|
41
|
-
'signature': sig.as_dict(),
|
|
42
|
-
'batch_size': orig_d['batch_size'] if is_batched else None,
|
|
43
|
-
}
|
|
39
|
+
d = {'signature': sig.as_dict(), 'batch_size': orig_d['batch_size'] if is_batched else None}
|
|
44
40
|
return d
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from uuid import UUID
|
|
2
|
+
|
|
2
3
|
import sqlalchemy as sql
|
|
3
4
|
|
|
4
5
|
from pixeltable.metadata import register_converter
|
|
@@ -7,10 +8,7 @@ from pixeltable.metadata.converters.util import convert_table_md
|
|
|
7
8
|
|
|
8
9
|
@register_converter(version=16)
|
|
9
10
|
def _(engine: sql.engine.Engine) -> None:
|
|
10
|
-
convert_table_md(
|
|
11
|
-
engine,
|
|
12
|
-
table_md_updater=__update_table_md
|
|
13
|
-
)
|
|
11
|
+
convert_table_md(engine, table_md_updater=__update_table_md)
|
|
14
12
|
|
|
15
13
|
|
|
16
14
|
def __update_table_md(table_md: dict, table_id: UUID) -> None:
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from uuid import UUID
|
|
2
|
+
|
|
2
3
|
import sqlalchemy as sql
|
|
3
4
|
|
|
4
5
|
from pixeltable.metadata import register_converter
|
|
@@ -7,10 +8,7 @@ from pixeltable.metadata.converters.util import convert_table_md
|
|
|
7
8
|
|
|
8
9
|
@register_converter(version=17)
|
|
9
10
|
def _(engine: sql.engine.Engine) -> None:
|
|
10
|
-
convert_table_md(
|
|
11
|
-
engine,
|
|
12
|
-
table_md_updater=__update_table_md
|
|
13
|
-
)
|
|
11
|
+
convert_table_md(engine, table_md_updater=__update_table_md)
|
|
14
12
|
|
|
15
13
|
|
|
16
14
|
def __update_table_md(table_md: dict, table_id: UUID) -> None:
|