pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +42 -8
- pixeltable/{dataframe.py → _query.py} +470 -206
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -4
- pixeltable/catalog/catalog.py +1785 -432
- pixeltable/catalog/column.py +190 -113
- pixeltable/catalog/dir.py +2 -4
- pixeltable/catalog/globals.py +19 -46
- pixeltable/catalog/insertable_table.py +191 -98
- pixeltable/catalog/path.py +63 -23
- pixeltable/catalog/schema_object.py +11 -15
- pixeltable/catalog/table.py +843 -436
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +978 -657
- pixeltable/catalog/table_version_handle.py +72 -16
- pixeltable/catalog/table_version_path.py +112 -43
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +134 -90
- pixeltable/config.py +134 -22
- pixeltable/env.py +471 -157
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +4 -1
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +11 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +106 -56
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +19 -19
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +351 -84
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +36 -23
- pixeltable/exprs/column_ref.py +213 -89
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +164 -54
- pixeltable/exprs/expr.py +70 -44
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +100 -40
- pixeltable/exprs/globals.py +2 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +18 -32
- pixeltable/exprs/is_null.py +7 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +27 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +167 -67
- pixeltable/exprs/rowid_ref.py +25 -10
- pixeltable/exprs/similarity_expr.py +58 -40
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +17 -11
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +29 -27
- pixeltable/func/signature.py +46 -19
- pixeltable/func/tools.py +31 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +16 -0
- pixeltable/functions/anthropic.py +123 -77
- pixeltable/functions/audio.py +147 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +7 -4
- pixeltable/functions/deepseek.py +35 -43
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +11 -20
- pixeltable/functions/gemini.py +195 -39
- pixeltable/functions/globals.py +142 -14
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1056 -24
- pixeltable/functions/image.py +115 -57
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +28 -13
- pixeltable/functions/math.py +67 -5
- pixeltable/functions/mistralai.py +18 -55
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +20 -13
- pixeltable/functions/openai.py +240 -226
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +4 -4
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +239 -69
- pixeltable/functions/timestamp.py +16 -16
- pixeltable/functions/together.py +24 -84
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1515 -107
- pixeltable/functions/vision.py +8 -8
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +16 -8
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +362 -115
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +28 -22
- pixeltable/index/embedding_index.py +100 -118
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +8 -7
- pixeltable/io/external_store.py +56 -105
- pixeltable/io/fiftyone.py +13 -13
- pixeltable/io/globals.py +31 -30
- pixeltable/io/hf_datasets.py +61 -16
- pixeltable/io/label_studio.py +74 -70
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +21 -12
- pixeltable/io/parquet.py +25 -105
- pixeltable/io/table_data_conduit.py +250 -123
- pixeltable/io/utils.py +4 -4
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +26 -25
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +112 -78
- pixeltable/iterators/image.py +12 -15
- pixeltable/iterators/string.py +11 -4
- pixeltable/iterators/video.py +523 -120
- pixeltable/metadata/__init__.py +14 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_30.py +34 -21
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +20 -31
- pixeltable/metadata/notes.py +9 -0
- pixeltable/metadata/schema.py +140 -53
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +382 -115
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +547 -83
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +257 -59
- pixeltable/store.py +311 -194
- pixeltable/type_system.py +373 -211
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +131 -17
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +6 -6
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +32 -6
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +7 -18
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +86 -48
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +26 -0
- pixeltable/utils/system.py +30 -0
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -40
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable-0.3.14.dist-info/METADATA +0 -434
- pixeltable-0.3.14.dist-info/RECORD +0 -186
- pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/utils/__init__.py
CHANGED
|
@@ -2,7 +2,6 @@ import hashlib
|
|
|
2
2
|
import urllib.parse
|
|
3
3
|
import urllib.request
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Optional, Union
|
|
6
5
|
|
|
7
6
|
|
|
8
7
|
def print_perf_counter_delta(delta: float) -> str:
|
|
@@ -24,7 +23,7 @@ def print_perf_counter_delta(delta: float) -> str:
|
|
|
24
23
|
return f'{delta:.2f} s'
|
|
25
24
|
|
|
26
25
|
|
|
27
|
-
def sha256sum(path: Union[Path, str]) -> str:
|
|
26
|
+
def sha256sum(path: Path | str) -> str:
|
|
28
27
|
"""
|
|
29
28
|
Compute the SHA256 hash of a file.
|
|
30
29
|
"""
|
|
@@ -39,7 +38,7 @@ def sha256sum(path: Union[Path, str]) -> str:
|
|
|
39
38
|
return h.hexdigest()
|
|
40
39
|
|
|
41
40
|
|
|
42
|
-
def parse_local_file_path(file_or_url: str) -> Optional[Path]:
|
|
41
|
+
def parse_local_file_path(file_or_url: str) -> Path | None:
|
|
43
42
|
"""
|
|
44
43
|
Parses a string that may be either a URL or a local file path.
|
|
45
44
|
|
pixeltable/utils/arrow.py
CHANGED
|
@@ -1,15 +1,23 @@
|
|
|
1
1
|
import datetime
|
|
2
|
-
|
|
2
|
+
import io
|
|
3
|
+
import json
|
|
4
|
+
import uuid
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Iterator, cast
|
|
3
6
|
|
|
4
7
|
import numpy as np
|
|
8
|
+
import PIL.Image
|
|
5
9
|
import pyarrow as pa
|
|
6
10
|
|
|
11
|
+
import pixeltable.exceptions as excs
|
|
7
12
|
import pixeltable.type_system as ts
|
|
8
13
|
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
import pixeltable as pxt
|
|
16
|
+
|
|
9
17
|
PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
|
|
10
18
|
pa.string(): ts.StringType(nullable=True),
|
|
11
19
|
pa.large_string(): ts.StringType(nullable=True),
|
|
12
|
-
pa.timestamp('us', tz=
|
|
20
|
+
pa.timestamp('us', tz='UTC'): ts.TimestampType(nullable=True),
|
|
13
21
|
pa.bool_(): ts.BoolType(nullable=True),
|
|
14
22
|
pa.int8(): ts.IntType(nullable=True),
|
|
15
23
|
pa.int16(): ts.IntType(nullable=True),
|
|
@@ -23,16 +31,19 @@ PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
|
|
|
23
31
|
pa.float64(): ts.FloatType(nullable=True),
|
|
24
32
|
pa.date32(): ts.DateType(nullable=True),
|
|
25
33
|
pa.date64(): ts.DateType(nullable=True),
|
|
26
|
-
pa.
|
|
34
|
+
pa.uuid(): ts.UUIDType(nullable=True),
|
|
35
|
+
pa.binary(): ts.BinaryType(nullable=True),
|
|
27
36
|
}
|
|
28
37
|
|
|
29
38
|
PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
|
|
30
39
|
ts.StringType: pa.string(),
|
|
31
|
-
ts.TimestampType: pa.timestamp('us', tz=
|
|
40
|
+
ts.TimestampType: pa.timestamp('us', tz='UTC'), # postgres timestamp is microseconds
|
|
32
41
|
ts.DateType: pa.date32(), # This could be date64
|
|
42
|
+
ts.UUIDType: pa.uuid(),
|
|
33
43
|
ts.BoolType: pa.bool_(),
|
|
34
44
|
ts.IntType: pa.int64(),
|
|
35
45
|
ts.FloatType: pa.float32(),
|
|
46
|
+
ts.BinaryType: pa.binary(),
|
|
36
47
|
ts.JsonType: pa.string(), # TODO(orm) pa.struct() is possible
|
|
37
48
|
ts.ImageType: pa.binary(), # inline image
|
|
38
49
|
ts.AudioType: pa.string(), # path
|
|
@@ -41,7 +52,7 @@ PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
|
|
|
41
52
|
}
|
|
42
53
|
|
|
43
54
|
|
|
44
|
-
def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.ColumnType]:
|
|
55
|
+
def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> ts.ColumnType | None:
|
|
45
56
|
"""Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
|
|
46
57
|
Returns None if no conversion is currently implemented.
|
|
47
58
|
"""
|
|
@@ -54,50 +65,144 @@ def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.C
|
|
|
54
65
|
dtype = to_pixeltable_type(arrow_type.value_type, nullable)
|
|
55
66
|
if dtype is None:
|
|
56
67
|
return None
|
|
57
|
-
return ts.ArrayType(shape=arrow_type.shape, dtype=dtype, nullable=nullable)
|
|
68
|
+
return ts.ArrayType(shape=tuple(arrow_type.shape), dtype=dtype, nullable=nullable)
|
|
58
69
|
else:
|
|
59
70
|
return None
|
|
60
71
|
|
|
61
72
|
|
|
62
|
-
def to_arrow_type(pixeltable_type: ts.ColumnType) -> pa.DataType | None:
    """Map a pixeltable column type to the matching pyarrow data type.

    Types registered in PXT_TO_PA_TYPES are looked up by class; array types become
    fixed-shape tensors. Returns None when no conversion is implemented.
    """
    type_cls = pixeltable_type.__class__
    if type_cls in PXT_TO_PA_TYPES:
        return PXT_TO_PA_TYPES[type_cls]
    if not isinstance(pixeltable_type, ts.ArrayType):
        return None
    # arrays: element type comes from the numpy dtype, tensor shape from the column type
    element_type = pa.from_numpy_dtype(pixeltable_type.dtype)
    return pa.fixed_shape_tensor(element_type, pixeltable_type.shape)
|
|
72
83
|
|
|
73
84
|
|
|
74
|
-
def to_pxt_schema(
    arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
) -> dict[str, ts.ColumnType]:
    """Translate a pyarrow Schema into a dict of pyarrow field name -> pixeltable type.

    A field named in `schema_overrides` takes its type from there; every other field is
    converted with to_pixeltable_type() and is nullable unless it is part of `primary_key`.
    """
    result: dict[str, ts.ColumnType] = {}
    for arrow_field in arrow_schema:
        col_name = arrow_field.name
        if col_name in schema_overrides:
            result[col_name] = schema_overrides[col_name]
        else:
            is_nullable = col_name not in primary_key
            result[col_name] = to_pixeltable_type(arrow_field.type, is_nullable)
    return result
|
|
85
96
|
|
|
86
97
|
|
|
87
98
|
def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
    """Build a pyarrow Schema with one field per entry of `pixeltable_schema`.

    Each column type is converted with to_arrow_type(); field order follows the dict order.
    """
    arrow_fields = ((col_name, to_arrow_type(col_type)) for col_name, col_type in pixeltable_schema.items())
    return pa.schema(arrow_fields)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa.RecordBatch:
    """Assemble one pyarrow RecordBatch from per-column Python values.

    Args:
        column_vals: column name -> list of values; looked up by the name of each field in `schema`.
        schema: arrow schema of the batch; arrays are created in schema field order.

    Returns:
        A RecordBatch containing one array per schema field.
    """
    # NOTE: pyarrow is already imported at module level (the signature depends on it), so no
    # function-local import is needed here.
    pa_arrays: list[pa.Array] = []
    for field in schema:
        values = column_vals[field.name]
        if isinstance(field.type, pa.FixedShapeTensorType):
            # tensor columns: stack the per-row arrays into a single ndarray with a leading row axis
            pa_arrays.append(pa.FixedShapeTensorArray.from_numpy_ndarray(np.stack(values)))
        else:
            # all other columns: pyarrow infers the array type from the Python values
            pa_arrays.append(cast(pa.Array, pa.array(values)))
    return pa.RecordBatch.from_arrays(pa_arrays, schema=schema)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def to_record_batches(query: 'pxt.Query', batch_size_bytes: int) -> Iterator[pa.RecordBatch]:
    """Execute `query` and yield its result rows as pyarrow RecordBatches.

    Rows are accumulated column-wise; a batch is emitted once the running size estimate of the
    accumulated values exceeds `batch_size_bytes`, and any remaining rows are emitted at the end.

    Args:
        query: the query to execute; its schema determines the arrow schema of the batches.
        batch_size_bytes: approximate size threshold for one batch. The estimate is coarse:
            `len()` of the converted value for sized values, fixed per-value constants for scalars,
            and nothing for None values.

    Yields:
        RecordBatches conforming to `to_arrow_schema(query.schema)`.

    Raises:
        excs.Error: for an image value that is neither file-backed nor a PIL image, or for a
            column type that has no conversion here.
    """
    arrow_schema = to_arrow_schema(query.schema)
    batch_columns: dict[str, list[Any]] = {k: [] for k in query.schema}
    current_byte_estimate = 0
    num_batch_rows = 0

    # TODO: in order to avoid having to deal with ExprEvalError here, ResultSet should be an iterator
    # over _exec()
    try:
        for data_row in query._exec():
            num_batch_rows += 1
            for (col_name, col_type), e in zip(query.schema.items(), query._select_list_exprs):
                val = data_row[e.slot_idx]
                val_size_bytes: int
                if val is None:
                    # nulls are stored as-is and do not count toward the size estimate
                    batch_columns[col_name].append(val)
                    continue

                assert val is not None
                # NOTE(review): the image check precedes the generic media check; presumably image
                # types also report is_media_type() -- confirm before reordering these branches
                if col_type.is_image_type():
                    # images get inlined into the parquet file
                    if data_row.file_paths[e.slot_idx] is not None:
                        # if there is a file, read directly to preserve information
                        with open(data_row.file_paths[e.slot_idx], 'rb') as f:
                            val = f.read()
                    elif isinstance(val, PIL.Image.Image):
                        # no file available: save as png
                        buf = io.BytesIO()
                        val.save(buf, format='png')
                        val = buf.getvalue()
                    else:
                        raise excs.Error(f'unknown image type {type(val)}')
                    val_size_bytes = len(val)
                elif col_type.is_string_type():
                    # len() of a str counts characters, so this is only an approximation of the byte size
                    val_size_bytes = len(val)
                elif col_type.is_uuid_type():
                    # pa.uuid() uses fixed_size_binary(16) as storage type
                    val = val.bytes  # Convert UUID to 16-byte binary for arrow
                    val_size_bytes = len(val)
                elif col_type.is_binary_type():
                    val_size_bytes = len(val)
                elif col_type.is_media_type():
                    # non-image media are exported as their file path, not their content
                    assert data_row.file_paths[e.slot_idx] is not None
                    val = data_row.file_paths[e.slot_idx]
                    val_size_bytes = len(val)
                elif col_type.is_json_type():
                    # json values are serialized to a string
                    val = json.dumps(val)
                    val_size_bytes = len(val)
                elif col_type.is_array_type():
                    val_size_bytes = val.nbytes
                elif col_type.is_int_type() or col_type.is_float_type():
                    val_size_bytes = 8
                elif col_type.is_bool_type():
                    val_size_bytes = 1
                elif col_type.is_date_type():
                    val_size_bytes = 4
                elif col_type.is_timestamp_type():
                    # normalize to UTC before handing the value to arrow
                    val = val.astimezone(datetime.timezone.utc)
                    val_size_bytes = 8
                else:
                    raise excs.Error(f'unknown type {col_type} for {col_name}')

                batch_columns[col_name].append(val)
                current_byte_estimate += val_size_bytes

            # checked once per completed row, so every column in a batch has the same length
            if current_byte_estimate > batch_size_bytes and num_batch_rows > 0:
                record_batch = _to_record_batch(batch_columns, arrow_schema)
                yield record_batch
                # start a fresh batch
                batch_columns = {k: [] for k in query.schema}
                current_byte_estimate = 0
                num_batch_rows = 0

    except excs.ExprEvalError as e:
        # delegate reporting of expression evaluation failures to the query
        query._raise_expr_eval_err(e)

    # emit whatever is left over after the last full batch
    if num_batch_rows > 0:
        record_batch = _to_record_batch(batch_columns, arrow_schema)
        yield record_batch
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
|
|
92
197
|
"""Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
|
|
93
198
|
this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
|
|
94
199
|
"""
|
|
95
|
-
out: dict[str,
|
|
200
|
+
out: dict[str, list | np.ndarray] = {}
|
|
96
201
|
for k, name in enumerate(batch.schema.names):
|
|
97
202
|
col = batch.column(k)
|
|
98
203
|
if isinstance(col.type, pa.FixedShapeTensorType):
|
|
99
204
|
# treat array columns as numpy arrays to easily preserve numpy type
|
|
100
|
-
out[name] = col.to_numpy(zero_copy_only=False)
|
|
205
|
+
out[name] = col.to_numpy(zero_copy_only=False)
|
|
101
206
|
else:
|
|
102
207
|
# for the rest, use pydict to preserve python types
|
|
103
208
|
out[name] = col.to_pylist()
|
|
@@ -105,7 +210,7 @@ def to_pydict(batch: Union[pa.Table, pa.RecordBatch]) -> dict[str, Union[list, n
|
|
|
105
210
|
return out
|
|
106
211
|
|
|
107
212
|
|
|
108
|
-
def iter_tuples(batch:
|
|
213
|
+
def iter_tuples(batch: pa.Table | pa.RecordBatch) -> Iterator[dict[str, Any]]:
|
|
109
214
|
"""Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
|
|
110
215
|
pydict = to_pydict(batch)
|
|
111
216
|
assert len(pydict) > 0, 'empty record batch'
|
|
@@ -129,6 +234,15 @@ def _ar_val_to_pxt_val(val: Any, pxt_type: ts.ColumnType) -> Any:
|
|
|
129
234
|
return bool(val)
|
|
130
235
|
elif pxt_type.is_string_type():
|
|
131
236
|
return str(val)
|
|
237
|
+
elif pxt_type.is_uuid_type():
|
|
238
|
+
if isinstance(val, uuid.UUID):
|
|
239
|
+
return val
|
|
240
|
+
if isinstance(val, bytes):
|
|
241
|
+
return uuid.UUID(bytes=val)
|
|
242
|
+
return uuid.UUID(val)
|
|
243
|
+
elif pxt_type.is_binary_type():
|
|
244
|
+
assert isinstance(val, bytes)
|
|
245
|
+
return val
|
|
132
246
|
elif pxt_type.is_date_type():
|
|
133
247
|
if isinstance(val, str):
|
|
134
248
|
return datetime.date.fromisoformat(val)
|
|
@@ -145,7 +259,7 @@ def _ar_val_to_pxt_val(val: Any, pxt_type: ts.ColumnType) -> Any:
|
|
|
145
259
|
|
|
146
260
|
|
|
147
261
|
def iter_tuples2(
|
|
148
|
-
batch:
|
|
262
|
+
batch: pa.Table | pa.RecordBatch, col_mapping: dict[str, str] | None, schema: dict[str, ts.ColumnType]
|
|
149
263
|
) -> Iterator[dict[str, Any]]:
|
|
150
264
|
"""Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
|
|
151
265
|
pydict = to_pydict(batch)
|
pixeltable/utils/av.py
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from fractions import Fraction
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from types import TracebackType
|
|
7
|
+
from typing import Any, Iterator
|
|
8
|
+
|
|
9
|
+
import av
|
|
10
|
+
import av.stream
|
|
11
|
+
import PIL.Image
|
|
12
|
+
from typing_extensions import Self
|
|
13
|
+
|
|
14
|
+
from pixeltable.env import Env
|
|
15
|
+
|
|
16
|
+
# Audio formats known to this module, keyed by format name.
# format -> (codec, extension)
# Note that the 'mp4' format is paired with the 'aac' codec and the 'm4a' file extension.
AUDIO_FORMATS: dict[str, tuple[str, str]] = {
    'wav': ('pcm_s16le', 'wav'),
    'mp3': ('libmp3lame', 'mp3'),
    'flac': ('flac', 'flac'),
    'mp4': ('aac', 'm4a'),
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_metadata(path: str) -> dict:
    """Open the media file at `path` with PyAV and return container- and stream-level metadata.

    The result has the keys 'bit_exact', 'bit_rate', 'size', 'metadata' and 'streams', where
    'streams' holds one dict per stream in the container.
    """
    with av.open(path) as container:
        assert isinstance(container, av.container.InputContainer)
        per_stream = []
        for stream in container.streams:
            per_stream.append(__get_stream_metadata(stream))
        # assemble the result while the container is still open
        return {
            'bit_exact': getattr(container, 'bit_exact', False),
            'bit_rate': container.bit_rate,
            'size': container.size,
            'metadata': container.metadata,
            'streams': per_stream,
        }
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def __get_stream_metadata(stream: av.stream.Stream) -> dict:
    """Describe a single stream as a plain dict.

    Streams other than audio and video are reported with their 'type' only. Audio and video
    streams get timing information, frame count, stream metadata and a nested 'codec_context'
    dict; audio adds 'channels' there, video adds 'pix_fmt' plus dimensions and frame rates.
    """
    stream_type = stream.type
    if stream_type not in ('audio', 'video'):
        return {'type': stream_type}  # Currently unsupported

    ctx = stream.codec_context
    ctx_info: dict[str, Any] = {}
    ctx_info['name'] = ctx.name
    ctx_info['codec_tag'] = ctx.codec_tag.encode('unicode-escape').decode('utf-8')
    ctx_info['profile'] = ctx.profile

    duration = stream.duration
    time_base = stream.time_base
    # duration is expressed in time_base units; both are needed to derive seconds
    if duration is not None and time_base is not None:
        duration_seconds = float(duration * time_base)
    else:
        duration_seconds = None

    info = {
        'type': stream_type,
        'duration': duration,
        'time_base': None if time_base is None else float(time_base),
        'duration_seconds': duration_seconds,
        'frames': stream.frames,
        'metadata': stream.metadata,
        'codec_context': ctx_info,
    }

    if stream_type == 'audio':
        # Additional metadata for audio
        n_channels = getattr(ctx, 'channels', None)
        ctx_info['channels'] = None if n_channels is None else int(n_channels)
        return info

    assert stream_type == 'video'
    assert isinstance(stream, av.video.stream.VideoStream)
    # Additional metadata for video
    ctx_info['pix_fmt'] = getattr(ctx, 'pix_fmt', None)
    info['width'] = stream.width
    info['height'] = stream.height
    info['frames'] = stream.frames
    for rate_name in ('average_rate', 'base_rate', 'guessed_rate'):
        rate = getattr(stream, rate_name)
        info[rate_name] = None if rate is None else float(rate)
    return info
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def get_video_duration(path: str) -> float | None:
    """Return video duration in seconds.

    Uses the duration of the first video stream when it is present in the stream header;
    otherwise falls back to the pts of the last demuxed packet that carries one.

    Args:
        path: path of the media file to open with PyAV.

    Returns:
        The duration in seconds, or None if the file has no video stream or no duration can be
        determined.
    """
    with av.open(path) as container:
        video_streams = container.streams.video
        # indexing [0] on an empty stream list raises IndexError, so check for emptiness explicitly
        if len(video_streams) == 0:
            return None
        video_stream = video_streams[0]
        if video_stream is None:
            return None
        if video_stream.duration is not None:
            # duration is expressed in units of the stream's time_base
            return float(video_stream.duration * video_stream.time_base)

        # if duration is not in the header, look for it in the last packet
        last_pts: int | None = None
        for packet in container.demux(video_stream):
            if packet.pts is not None:
                last_pts = packet.pts
        if last_pts is not None:
            return float(last_pts * video_stream.time_base)

    return None
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def has_audio_stream(path: str) -> bool:
    """Report whether the media file at `path` contains at least one audio stream (via PyAV)."""
    for stream_info in get_metadata(path)['streams']:
        if stream_info['type'] == 'audio':
            return True
    return False
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def ffmpeg_clip_cmd(
    input_path: str,
    output_path: str,
    start_time: float,
    duration: float | None = None,
    fast: bool = True,
    video_encoder: str | None = None,
    video_encoder_args: dict[str, Any] | None = None,
) -> list[str]:
    """Build the ffmpeg command line that cuts a clip out of `input_path` into `output_path`.

    Args:
        input_path: source media file.
        output_path: destination file; always the last argument of the command.
        start_time: clip start, passed to `-ss`.
        duration: optional clip length, passed to `-t`.
        fast: if True, seek before the input (`-ss` ahead of `-i`) and stream-copy everything;
            if False, seek after the input and re-encode the video stream.
        video_encoder: encoder for the re-encoding path; defaults to the Env's default video encoder.
        video_encoder_args: extra `-key value` options appended on the re-encoding path.

    Returns:
        The command as an argument list.
    """
    seek_pos = str(start_time)
    if fast:
        # fast: -ss before -i; all streams are mapped and copied without re-encoding
        cmd = ['ffmpeg', '-ss', seek_pos, '-i', input_path, '-map', '0', '-c', 'copy']
    else:
        encoder = Env.get().default_video_encoder if video_encoder is None else video_encoder
        # accurate: -ss after -i; audio and subtitles are copied, video is re-encoded
        cmd = ['ffmpeg', '-i', input_path, '-ss', seek_pos, '-map', '0']
        cmd += ['-c:a', 'copy', '-c:s', 'copy', '-c:v', encoder]
        if video_encoder_args is not None:
            for opt_name, opt_val in video_encoder_args.items():
                cmd += [f'-{opt_name}', str(opt_val)]

    if duration is not None:
        cmd += ['-t', str(duration)]
    cmd += ['-loglevel', 'error', output_path]
    return cmd
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def ffmpeg_segment_cmd(
    input_path: str,
    output_pattern: str,
    segment_duration: float | None = None,
    segment_times: list[float] | None = None,
    video_encoder: str | None = None,
    video_encoder_args: dict[str, Any] | None = None,
) -> list[str]:
    """Commandline for frame-accurate segmentation

    Exactly one of `segment_duration` and `segment_times` must be given. The video stream is
    re-encoded with keyframes forced at the segment boundaries; audio is copied.

    Args:
        input_path: source media file.
        output_pattern: output file name pattern for the segments; last argument of the command.
        segment_duration: fixed segment length, passed to `-segment_time`.
        segment_times: explicit split points, passed to `-segment_times` as a comma-separated list.
        video_encoder: video encoder to use; defaults to the Env's default video encoder.
        video_encoder_args: extra `-key value` encoder options.

    Returns:
        The command as an argument list.
    """
    assert (segment_duration is None) != (segment_times is None)
    if video_encoder is None:
        video_encoder = Env.get().default_video_encoder

    cmd = [
        'ffmpeg',
        '-i',
        input_path,
        '-map',
        '0',  # Copy all streams from input
        '-c:a',
        'copy',  # don't re-encode audio
        '-c:v',
        video_encoder,  # re-encode video
    ]
    if video_encoder_args is not None:
        for k, v in video_encoder_args.items():
            cmd.extend([f'-{k}', str(v)])

    # -force_key_frames needs to precede -f segment; '-f segment' is therefore emitted exactly
    # once, right after the keyframe option in each branch below
    if segment_duration is not None:
        cmd.extend(
            [
                '-force_key_frames',
                f'expr:gte(t,n_forced*{segment_duration})',  # Force keyframe at each segment boundary
                '-f',
                'segment',
                '-segment_time',
                str(segment_duration),
            ]
        )
    else:
        assert segment_times is not None
        times_str = ','.join(str(t) for t in segment_times)
        cmd.extend(['-force_key_frames', times_str, '-f', 'segment', '-segment_times', times_str])

    cmd.extend(
        [
            '-reset_timestamps',
            '1',  # Reset timestamps for each segment
            '-loglevel',
            'error',  # Only show errors
            output_pattern,
        ]
    )
    return cmd
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
class VideoFrames:
    """
    Context manager for iterating over video frames at a specified frame rate.

    The container is opened in `__enter__` and closed in `__exit__`; iterate only inside the
    `with` block.

    Args:
        path: Path to the video file
        fps: Number of frames to extract per second. If None or 0.0, extracts all frames.
    """

    path: Path
    fps: float  # 0.0 means "every frame"
    container: av.container.input.InputContainer | None  # None until __enter__ has run
    video_framerate: Fraction | None  # average_rate of the first video stream
    video_time_base: Fraction | None  # time_base of the first video stream
    video_start_time: int | None  # start_time of the first video stream, 0 if unset

    @dataclass
    class Item:
        # index of the frame in decode order, counting skipped frames as well
        frame_idx: int
        # the remaining fields are copied from the decoded frame
        pts: int
        dts: int
        time: float
        is_corrupt: bool
        key_frame: bool
        pict_type: int
        interlaced_frame: bool
        # the frame converted to a PIL image
        frame: PIL.Image.Image

    def __init__(self, path: Path, fps: float | None = None) -> None:
        self.path = path
        self.fps = 0.0 if fps is None else fps
        self.container = None
        self.video_framerate = None
        self.video_time_base = None
        self.video_start_time = None

    def __enter__(self) -> Self:
        """Open the video and record the basic properties of its first video stream."""
        self.container = av.open(self.path)
        stream = self.container.streams.video[0]
        self.video_framerate = stream.average_rate
        self.video_time_base = stream.time_base
        self.video_start_time = stream.start_time or 0
        return self

    def __exit__(
        self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None
    ) -> None:
        """Close the container, if one was opened."""
        # Clean up
        if self.container:
            self.container.close()

    def __iter__(self) -> Iterator[Item]:
        """Decode the video and yield the selected frames as Items.

        With fps == 0.0 every decoded frame is yielded; otherwise a frame is yielded only when
        fewer than `frame.time * fps` frames have been returned so far.

        Raises:
            RuntimeError: if the container has not been opened via the context manager.
        """
        if self.container is None:
            raise RuntimeError('VideoFrames must be used as a context manager before iterating')

        # Use ONE decode generator for the whole iteration: creating a fresh generator per frame
        # and taking only its first item discards any further frames that generator would have
        # produced (e.g. several frames emitted when the decoder is flushed at end of stream).
        decoder = self.container.decode(video=0)
        num_returned = 0
        frame_idx = -1
        while True:
            try:
                frame = next(decoder)
            except (StopIteration, EOFError):
                return

            frame_idx += 1
            if self.fps == 0.0 or (num_returned <= frame.time * self.fps):
                img = frame.to_image()
                assert isinstance(img, PIL.Image.Image)
                yield VideoFrames.Item(
                    frame_idx=frame_idx,
                    pts=frame.pts,
                    dts=frame.dts,
                    time=frame.time,
                    is_corrupt=frame.is_corrupt,
                    key_frame=frame.key_frame,
                    pict_type=frame.pict_type,
                    interlaced_frame=frame.interlaced_frame,
                    frame=img,
                )
                num_returned += 1
|