PyPI - pixeltable - Versions diffs - 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl - Mend

pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

pixeltable/__init__.py +42 -8
pixeltable/{dataframe.py → _query.py} +470 -206
pixeltable/_version.py +1 -0
pixeltable/catalog/__init__.py +5 -4
pixeltable/catalog/catalog.py +1785 -432
pixeltable/catalog/column.py +190 -113
pixeltable/catalog/dir.py +2 -4
pixeltable/catalog/globals.py +19 -46
pixeltable/catalog/insertable_table.py +191 -98
pixeltable/catalog/path.py +63 -23
pixeltable/catalog/schema_object.py +11 -15
pixeltable/catalog/table.py +843 -436
pixeltable/catalog/table_metadata.py +103 -0
pixeltable/catalog/table_version.py +978 -657
pixeltable/catalog/table_version_handle.py +72 -16
pixeltable/catalog/table_version_path.py +112 -43
pixeltable/catalog/tbl_ops.py +53 -0
pixeltable/catalog/update_status.py +191 -0
pixeltable/catalog/view.py +134 -90
pixeltable/config.py +134 -22
pixeltable/env.py +471 -157
pixeltable/exceptions.py +6 -0
pixeltable/exec/__init__.py +4 -1
pixeltable/exec/aggregation_node.py +7 -8
pixeltable/exec/cache_prefetch_node.py +83 -110
pixeltable/exec/cell_materialization_node.py +268 -0
pixeltable/exec/cell_reconstruction_node.py +168 -0
pixeltable/exec/component_iteration_node.py +4 -3
pixeltable/exec/data_row_batch.py +8 -65
pixeltable/exec/exec_context.py +16 -4
pixeltable/exec/exec_node.py +13 -36
pixeltable/exec/expr_eval/evaluators.py +11 -7
pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
pixeltable/exec/expr_eval/globals.py +8 -5
pixeltable/exec/expr_eval/row_buffer.py +1 -2
pixeltable/exec/expr_eval/schedulers.py +106 -56
pixeltable/exec/globals.py +35 -0
pixeltable/exec/in_memory_data_node.py +19 -19
pixeltable/exec/object_store_save_node.py +293 -0
pixeltable/exec/row_update_node.py +16 -9
pixeltable/exec/sql_node.py +351 -84
pixeltable/exprs/__init__.py +1 -1
pixeltable/exprs/arithmetic_expr.py +27 -22
pixeltable/exprs/array_slice.py +3 -3
pixeltable/exprs/column_property_ref.py +36 -23
pixeltable/exprs/column_ref.py +213 -89
pixeltable/exprs/comparison.py +5 -5
pixeltable/exprs/compound_predicate.py +5 -4
pixeltable/exprs/data_row.py +164 -54
pixeltable/exprs/expr.py +70 -44
pixeltable/exprs/expr_dict.py +3 -3
pixeltable/exprs/expr_set.py +17 -10
pixeltable/exprs/function_call.py +100 -40
pixeltable/exprs/globals.py +2 -2
pixeltable/exprs/in_predicate.py +4 -4
pixeltable/exprs/inline_expr.py +18 -32
pixeltable/exprs/is_null.py +7 -3
pixeltable/exprs/json_mapper.py +8 -8
pixeltable/exprs/json_path.py +56 -22
pixeltable/exprs/literal.py +27 -5
pixeltable/exprs/method_ref.py +2 -2
pixeltable/exprs/object_ref.py +2 -2
pixeltable/exprs/row_builder.py +167 -67
pixeltable/exprs/rowid_ref.py +25 -10
pixeltable/exprs/similarity_expr.py +58 -40
pixeltable/exprs/sql_element_cache.py +4 -4
pixeltable/exprs/string_op.py +5 -5
pixeltable/exprs/type_cast.py +3 -5
pixeltable/func/__init__.py +1 -0
pixeltable/func/aggregate_function.py +8 -8
pixeltable/func/callable_function.py +9 -9
pixeltable/func/expr_template_function.py +17 -11
pixeltable/func/function.py +18 -20
pixeltable/func/function_registry.py +6 -7
pixeltable/func/globals.py +2 -3
pixeltable/func/mcp.py +74 -0
pixeltable/func/query_template_function.py +29 -27
pixeltable/func/signature.py +46 -19
pixeltable/func/tools.py +31 -13
pixeltable/func/udf.py +18 -20
pixeltable/functions/__init__.py +16 -0
pixeltable/functions/anthropic.py +123 -77
pixeltable/functions/audio.py +147 -10
pixeltable/functions/bedrock.py +13 -6
pixeltable/functions/date.py +7 -4
pixeltable/functions/deepseek.py +35 -43
pixeltable/functions/document.py +81 -0
pixeltable/functions/fal.py +76 -0
pixeltable/functions/fireworks.py +11 -20
pixeltable/functions/gemini.py +195 -39
pixeltable/functions/globals.py +142 -14
pixeltable/functions/groq.py +108 -0
pixeltable/functions/huggingface.py +1056 -24
pixeltable/functions/image.py +115 -57
pixeltable/functions/json.py +1 -1
pixeltable/functions/llama_cpp.py +28 -13
pixeltable/functions/math.py +67 -5
pixeltable/functions/mistralai.py +18 -55
pixeltable/functions/net.py +70 -0
pixeltable/functions/ollama.py +20 -13
pixeltable/functions/openai.py +240 -226
pixeltable/functions/openrouter.py +143 -0
pixeltable/functions/replicate.py +4 -4
pixeltable/functions/reve.py +250 -0
pixeltable/functions/string.py +239 -69
pixeltable/functions/timestamp.py +16 -16
pixeltable/functions/together.py +24 -84
pixeltable/functions/twelvelabs.py +188 -0
pixeltable/functions/util.py +6 -1
pixeltable/functions/uuid.py +30 -0
pixeltable/functions/video.py +1515 -107
pixeltable/functions/vision.py +8 -8
pixeltable/functions/voyageai.py +289 -0
pixeltable/functions/whisper.py +16 -8
pixeltable/functions/whisperx.py +179 -0
pixeltable/{ext/functions → functions}/yolox.py +2 -4
pixeltable/globals.py +362 -115
pixeltable/index/base.py +17 -21
pixeltable/index/btree.py +28 -22
pixeltable/index/embedding_index.py +100 -118
pixeltable/io/__init__.py +4 -2
pixeltable/io/datarows.py +8 -7
pixeltable/io/external_store.py +56 -105
pixeltable/io/fiftyone.py +13 -13
pixeltable/io/globals.py +31 -30
pixeltable/io/hf_datasets.py +61 -16
pixeltable/io/label_studio.py +74 -70
pixeltable/io/lancedb.py +3 -0
pixeltable/io/pandas.py +21 -12
pixeltable/io/parquet.py +25 -105
pixeltable/io/table_data_conduit.py +250 -123
pixeltable/io/utils.py +4 -4
pixeltable/iterators/__init__.py +2 -1
pixeltable/iterators/audio.py +26 -25
pixeltable/iterators/base.py +9 -3
pixeltable/iterators/document.py +112 -78
pixeltable/iterators/image.py +12 -15
pixeltable/iterators/string.py +11 -4
pixeltable/iterators/video.py +523 -120
pixeltable/metadata/__init__.py +14 -3
pixeltable/metadata/converters/convert_13.py +2 -2
pixeltable/metadata/converters/convert_18.py +2 -2
pixeltable/metadata/converters/convert_19.py +2 -2
pixeltable/metadata/converters/convert_20.py +2 -2
pixeltable/metadata/converters/convert_21.py +2 -2
pixeltable/metadata/converters/convert_22.py +2 -2
pixeltable/metadata/converters/convert_24.py +2 -2
pixeltable/metadata/converters/convert_25.py +2 -2
pixeltable/metadata/converters/convert_26.py +2 -2
pixeltable/metadata/converters/convert_29.py +4 -4
pixeltable/metadata/converters/convert_30.py +34 -21
pixeltable/metadata/converters/convert_34.py +2 -2
pixeltable/metadata/converters/convert_35.py +9 -0
pixeltable/metadata/converters/convert_36.py +38 -0
pixeltable/metadata/converters/convert_37.py +15 -0
pixeltable/metadata/converters/convert_38.py +39 -0
pixeltable/metadata/converters/convert_39.py +124 -0
pixeltable/metadata/converters/convert_40.py +73 -0
pixeltable/metadata/converters/convert_41.py +12 -0
pixeltable/metadata/converters/convert_42.py +9 -0
pixeltable/metadata/converters/convert_43.py +44 -0
pixeltable/metadata/converters/util.py +20 -31
pixeltable/metadata/notes.py +9 -0
pixeltable/metadata/schema.py +140 -53
pixeltable/metadata/utils.py +74 -0
pixeltable/mypy/__init__.py +3 -0
pixeltable/mypy/mypy_plugin.py +123 -0
pixeltable/plan.py +382 -115
pixeltable/share/__init__.py +1 -1
pixeltable/share/packager.py +547 -83
pixeltable/share/protocol/__init__.py +33 -0
pixeltable/share/protocol/common.py +165 -0
pixeltable/share/protocol/operation_types.py +33 -0
pixeltable/share/protocol/replica.py +119 -0
pixeltable/share/publish.py +257 -59
pixeltable/store.py +311 -194
pixeltable/type_system.py +373 -211
pixeltable/utils/__init__.py +2 -3
pixeltable/utils/arrow.py +131 -17
pixeltable/utils/av.py +298 -0
pixeltable/utils/azure_store.py +346 -0
pixeltable/utils/coco.py +6 -6
pixeltable/utils/code.py +3 -3
pixeltable/utils/console_output.py +4 -1
pixeltable/utils/coroutine.py +6 -23
pixeltable/utils/dbms.py +32 -6
pixeltable/utils/description_helper.py +4 -5
pixeltable/utils/documents.py +7 -18
pixeltable/utils/exception_handler.py +7 -30
pixeltable/utils/filecache.py +6 -6
pixeltable/utils/formatter.py +86 -48
pixeltable/utils/gcs_store.py +295 -0
pixeltable/utils/http.py +133 -0
pixeltable/utils/http_server.py +2 -3
pixeltable/utils/iceberg.py +1 -2
pixeltable/utils/image.py +17 -0
pixeltable/utils/lancedb.py +90 -0
pixeltable/utils/local_store.py +322 -0
pixeltable/utils/misc.py +5 -0
pixeltable/utils/object_stores.py +573 -0
pixeltable/utils/pydantic.py +60 -0
pixeltable/utils/pytorch.py +5 -6
pixeltable/utils/s3_store.py +527 -0
pixeltable/utils/sql.py +26 -0
pixeltable/utils/system.py +30 -0
pixeltable-0.5.7.dist-info/METADATA +579 -0
pixeltable-0.5.7.dist-info/RECORD +227 -0
{pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
pixeltable/__version__.py +0 -3
pixeltable/catalog/named_function.py +0 -40
pixeltable/ext/__init__.py +0 -17
pixeltable/ext/functions/__init__.py +0 -11
pixeltable/ext/functions/whisperx.py +0 -77
pixeltable/utils/media_store.py +0 -77
pixeltable/utils/s3.py +0 -17
pixeltable-0.3.14.dist-info/METADATA +0 -434
pixeltable-0.3.14.dist-info/RECORD +0 -186
pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
{pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0

pixeltable/utils/__init__.py CHANGED Viewed

@@ -2,7 +2,6 @@ import hashlib
 import urllib.parse
 import urllib.request
 from pathlib import Path
-from typing import Optional, Union
 def print_perf_counter_delta(delta: float) -> str:
@@ -24,7 +23,7 @@ def print_perf_counter_delta(delta: float) -> str:
         return f'{delta:.2f} s'
-def sha256sum(path: Union[Path, str]) -> str:
+def sha256sum(path: Path | str) -> str:
     """
     Compute the SHA256 hash of a file.
     """
@@ -39,7 +38,7 @@ def sha256sum(path: Union[Path, str]) -> str:
     return h.hexdigest()
-def parse_local_file_path(file_or_url: str) -> Optional[Path]:
+def parse_local_file_path(file_or_url: str) -> Path | None:
     """
     Parses a string that may be either a URL or a local file path.

pixeltable/utils/arrow.py CHANGED Viewed

@@ -1,15 +1,23 @@
 import datetime
-from typing import Any, Iterator, Optional, Union
+import io
+import json
+import uuid
+from typing import TYPE_CHECKING, Any, Iterator, cast
 import numpy as np
+import PIL.Image
 import pyarrow as pa
+import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
+if TYPE_CHECKING:
+    import pixeltable as pxt
 PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
     pa.large_string(): ts.StringType(nullable=True),
-    pa.timestamp('us', tz=datetime.timezone.utc): ts.TimestampType(nullable=True),
+    pa.timestamp('us', tz='UTC'): ts.TimestampType(nullable=True),
     pa.bool_(): ts.BoolType(nullable=True),
     pa.int8(): ts.IntType(nullable=True),
     pa.int16(): ts.IntType(nullable=True),
@@ -23,16 +31,19 @@ PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.float64(): ts.FloatType(nullable=True),
     pa.date32(): ts.DateType(nullable=True),
     pa.date64(): ts.DateType(nullable=True),
-    pa.binary(): None,  # cannot import binary (inline image)
+    pa.uuid(): ts.UUIDType(nullable=True),
+    pa.binary(): ts.BinaryType(nullable=True),
 }
 PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
     ts.StringType: pa.string(),
-    ts.TimestampType: pa.timestamp('us', tz=datetime.timezone.utc),  # postgres timestamp is microseconds
+    ts.TimestampType: pa.timestamp('us', tz='UTC'),  # postgres timestamp is microseconds
     ts.DateType: pa.date32(),  # This could be date64
+    ts.UUIDType: pa.uuid(),
     ts.BoolType: pa.bool_(),
     ts.IntType: pa.int64(),
     ts.FloatType: pa.float32(),
+    ts.BinaryType: pa.binary(),
     ts.JsonType: pa.string(),  # TODO(orm) pa.struct() is possible
     ts.ImageType: pa.binary(),  # inline image
     ts.AudioType: pa.string(),  # path
@@ -41,7 +52,7 @@ PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
 }
-def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.ColumnType]:
+def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> ts.ColumnType | None:
     """Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
     Returns None if no conversion is currently implemented.
     """
@@ -54,50 +65,144 @@ def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.C
         dtype = to_pixeltable_type(arrow_type.value_type, nullable)
         if dtype is None:
             return None
-        return ts.ArrayType(shape=arrow_type.shape, dtype=dtype, nullable=nullable)
+        return ts.ArrayType(shape=tuple(arrow_type.shape), dtype=dtype, nullable=nullable)
     else:
         return None
-def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
+def to_arrow_type(pixeltable_type: ts.ColumnType) -> pa.DataType | None:
     """Convert a pixeltable DataType to a pyarrow datatype if one is defined.
     Returns None if no conversion is currently implemented.
     """
     if pixeltable_type.__class__ in PXT_TO_PA_TYPES:
         return PXT_TO_PA_TYPES[pixeltable_type.__class__]
     elif isinstance(pixeltable_type, ts.ArrayType):
-        return pa.fixed_shape_tensor(pa.from_numpy_dtype(pixeltable_type.numpy_dtype()), pixeltable_type.shape)
+        return pa.fixed_shape_tensor(pa.from_numpy_dtype(pixeltable_type.dtype), pixeltable_type.shape)
     else:
         return None
-def ar_infer_schema(
+def to_pxt_schema(
     arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
 ) -> dict[str, ts.ColumnType]:
     """Convert a pyarrow Schema to a schema using pyarrow names and pixeltable types."""
-    ar_schema = {
+    pxt_schema = {
         field.name: to_pixeltable_type(field.type, field.name not in primary_key)
         if field.name not in schema_overrides
         else schema_overrides[field.name]
         for field in arrow_schema
     }
-    return ar_schema
+    return pxt_schema
 def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
-    return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())  # type: ignore[misc]
+    return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())
+def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa.RecordBatch:
+    import pyarrow as pa
+    pa_arrays: list[pa.Array] = []
+    for field in schema:
+        if isinstance(field.type, pa.FixedShapeTensorType):
+            stacked_arr = np.stack(column_vals[field.name])
+            pa_arrays.append(pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr))
+        else:
+            pa_array = cast(pa.Array, pa.array(column_vals[field.name]))
+            pa_arrays.append(pa_array)
+    return pa.RecordBatch.from_arrays(pa_arrays, schema=schema)
+def to_record_batches(query: 'pxt.Query', batch_size_bytes: int) -> Iterator[pa.RecordBatch]:
+    arrow_schema = to_arrow_schema(query.schema)
+    batch_columns: dict[str, list[Any]] = {k: [] for k in query.schema}
+    current_byte_estimate = 0
+    num_batch_rows = 0
+    # TODO: in order to avoid having to deal with ExprEvalError here, ResultSet should be an iterator
+    # over _exec()
+    try:
+        for data_row in query._exec():
+            num_batch_rows += 1
+            for (col_name, col_type), e in zip(query.schema.items(), query._select_list_exprs):
+                val = data_row[e.slot_idx]
+                val_size_bytes: int
+                if val is None:
+                    batch_columns[col_name].append(val)
+                    continue
-def to_pydict(batch: Union[pa.Table, pa.RecordBatch]) -> dict[str, Union[list, np.ndarray]]:
+                assert val is not None
+                if col_type.is_image_type():
+                    # images get inlined into the parquet file
+                    if data_row.file_paths[e.slot_idx] is not None:
+                        # if there is a file, read directly to preserve information
+                        with open(data_row.file_paths[e.slot_idx], 'rb') as f:
+                            val = f.read()
+                    elif isinstance(val, PIL.Image.Image):
+                        # no file available: save as png
+                        buf = io.BytesIO()
+                        val.save(buf, format='png')
+                        val = buf.getvalue()
+                    else:
+                        raise excs.Error(f'unknown image type {type(val)}')
+                    val_size_bytes = len(val)
+                elif col_type.is_string_type():
+                    val_size_bytes = len(val)
+                elif col_type.is_uuid_type():
+                    # pa.uuid() uses fixed_size_binary(16) as storage type
+                    val = val.bytes  # Convert UUID to 16-byte binary for arrow
+                    val_size_bytes = len(val)
+                elif col_type.is_binary_type():
+                    val_size_bytes = len(val)
+                elif col_type.is_media_type():
+                    assert data_row.file_paths[e.slot_idx] is not None
+                    val = data_row.file_paths[e.slot_idx]
+                    val_size_bytes = len(val)
+                elif col_type.is_json_type():
+                    val = json.dumps(val)
+                    val_size_bytes = len(val)
+                elif col_type.is_array_type():
+                    val_size_bytes = val.nbytes
+                elif col_type.is_int_type() or col_type.is_float_type():
+                    val_size_bytes = 8
+                elif col_type.is_bool_type():
+                    val_size_bytes = 1
+                elif col_type.is_date_type():
+                    val_size_bytes = 4
+                elif col_type.is_timestamp_type():
+                    val = val.astimezone(datetime.timezone.utc)
+                    val_size_bytes = 8
+                else:
+                    raise excs.Error(f'unknown type {col_type} for {col_name}')
+                batch_columns[col_name].append(val)
+                current_byte_estimate += val_size_bytes
+            if current_byte_estimate > batch_size_bytes and num_batch_rows > 0:
+                record_batch = _to_record_batch(batch_columns, arrow_schema)
+                yield record_batch
+                batch_columns = {k: [] for k in query.schema}
+                current_byte_estimate = 0
+                num_batch_rows = 0
+    except excs.ExprEvalError as e:
+        query._raise_expr_eval_err(e)
+    if num_batch_rows > 0:
+        record_batch = _to_record_batch(batch_columns, arrow_schema)
+        yield record_batch
+def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
     """Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
     this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
     """
-    out: dict[str, Union[list, np.ndarray]] = {}
+    out: dict[str, list | np.ndarray] = {}
     for k, name in enumerate(batch.schema.names):
         col = batch.column(k)
         if isinstance(col.type, pa.FixedShapeTensorType):
             # treat array columns as numpy arrays to easily preserve numpy type
-            out[name] = col.to_numpy(zero_copy_only=False)  # type: ignore[call-arg]
+            out[name] = col.to_numpy(zero_copy_only=False)
         else:
             # for the rest, use pydict to preserve python types
             out[name] = col.to_pylist()
@@ -105,7 +210,7 @@ def to_pydict(batch: Union[pa.Table, pa.RecordBatch]) -> dict[str, Union[list, n
     return out
-def iter_tuples(batch: Union[pa.Table, pa.RecordBatch]) -> Iterator[dict[str, Any]]:
+def iter_tuples(batch: pa.Table | pa.RecordBatch) -> Iterator[dict[str, Any]]:
     """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
     pydict = to_pydict(batch)
     assert len(pydict) > 0, 'empty record batch'
@@ -129,6 +234,15 @@ def _ar_val_to_pxt_val(val: Any, pxt_type: ts.ColumnType) -> Any:
         return bool(val)
     elif pxt_type.is_string_type():
         return str(val)
+    elif pxt_type.is_uuid_type():
+        if isinstance(val, uuid.UUID):
+            return val
+        if isinstance(val, bytes):
+            return uuid.UUID(bytes=val)
+        return uuid.UUID(val)
+    elif pxt_type.is_binary_type():
+        assert isinstance(val, bytes)
+        return val
     elif pxt_type.is_date_type():
         if isinstance(val, str):
             return datetime.date.fromisoformat(val)
@@ -145,7 +259,7 @@ def _ar_val_to_pxt_val(val: Any, pxt_type: ts.ColumnType) -> Any:
 def iter_tuples2(
-    batch: Union[pa.Table, pa.RecordBatch], col_mapping: Optional[dict[str, str]], schema: dict[str, ts.ColumnType]
+    batch: pa.Table | pa.RecordBatch, col_mapping: dict[str, str] | None, schema: dict[str, ts.ColumnType]
 ) -> Iterator[dict[str, Any]]:
     """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
     pydict = to_pydict(batch)

pixeltable/utils/av.py ADDED Viewed

@@ -0,0 +1,298 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from fractions import Fraction
+from pathlib import Path
+from types import TracebackType
+from typing import Any, Iterator
+import av
+import av.stream
+import PIL.Image
+from typing_extensions import Self
+from pixeltable.env import Env
+# format -> (codec, extension)
+AUDIO_FORMATS: dict[str, tuple[str, str]] = {
+    'wav': ('pcm_s16le', 'wav'),
+    'mp3': ('libmp3lame', 'mp3'),
+    'flac': ('flac', 'flac'),
+    'mp4': ('aac', 'm4a'),
+}
+def get_metadata(path: str) -> dict:
+    with av.open(path) as container:
+        assert isinstance(container, av.container.InputContainer)
+        streams_info = [__get_stream_metadata(stream) for stream in container.streams]
+        result = {
+            'bit_exact': getattr(container, 'bit_exact', False),
+            'bit_rate': container.bit_rate,
+            'size': container.size,
+            'metadata': container.metadata,
+            'streams': streams_info,
+        }
+    return result
+def __get_stream_metadata(stream: av.stream.Stream) -> dict:
+    if stream.type not in ('audio', 'video'):
+        return {'type': stream.type}  # Currently unsupported
+    codec_context = stream.codec_context
+    codec_context_md: dict[str, Any] = {
+        'name': codec_context.name,
+        'codec_tag': codec_context.codec_tag.encode('unicode-escape').decode('utf-8'),
+        'profile': codec_context.profile,
+    }
+    metadata = {
+        'type': stream.type,
+        'duration': stream.duration,
+        'time_base': float(stream.time_base) if stream.time_base is not None else None,
+        'duration_seconds': float(stream.duration * stream.time_base)
+        if stream.duration is not None and stream.time_base is not None
+        else None,
+        'frames': stream.frames,
+        'metadata': stream.metadata,
+        'codec_context': codec_context_md,
+    }
+    if stream.type == 'audio':
+        # Additional metadata for audio
+        channels = getattr(stream.codec_context, 'channels', None)
+        codec_context_md['channels'] = int(channels) if channels is not None else None
+    else:
+        assert stream.type == 'video'
+        assert isinstance(stream, av.video.stream.VideoStream)
+        # Additional metadata for video
+        codec_context_md['pix_fmt'] = getattr(stream.codec_context, 'pix_fmt', None)
+        metadata.update(
+            **{
+                'width': stream.width,
+                'height': stream.height,
+                'frames': stream.frames,
+                'average_rate': float(stream.average_rate) if stream.average_rate is not None else None,
+                'base_rate': float(stream.base_rate) if stream.base_rate is not None else None,
+                'guessed_rate': float(stream.guessed_rate) if stream.guessed_rate is not None else None,
+            }
+        )
+    return metadata
+def get_video_duration(path: str) -> float | None:
+    """Return video duration in seconds."""
+    with av.open(path) as container:
+        video_stream = container.streams.video[0]
+        if video_stream is None:
+            return None
+        if video_stream.duration is not None:
+            return float(video_stream.duration * video_stream.time_base)
+        # if duration is not in the header, look for it in the last packet
+        last_pts: int | None = None
+        for packet in container.demux(video_stream):
+            if packet.pts is not None:
+                last_pts = packet.pts
+        if last_pts is not None:
+            return float(last_pts * video_stream.time_base)
+        return None
+def has_audio_stream(path: str) -> bool:
+    """Check if video has audio stream using PyAV."""
+    md = get_metadata(path)
+    return any(stream['type'] == 'audio' for stream in md['streams'])
+def ffmpeg_clip_cmd(
+    input_path: str,
+    output_path: str,
+    start_time: float,
+    duration: float | None = None,
+    fast: bool = True,
+    video_encoder: str | None = None,
+    video_encoder_args: dict[str, Any] | None = None,
+) -> list[str]:
+    cmd = ['ffmpeg']
+    if fast:
+        # fast: -ss before -i
+        cmd.extend(
+            [
+                '-ss',
+                str(start_time),
+                '-i',
+                input_path,
+                '-map',
+                '0',  # Copy all streams from input
+                '-c',
+                'copy',  # Stream copy (no re-encoding)
+            ]
+        )
+    else:
+        if video_encoder is None:
+            video_encoder = Env.get().default_video_encoder
+        # accurate: -ss after -i
+        cmd.extend(
+            [
+                '-i',
+                input_path,
+                '-ss',
+                str(start_time),
+                '-map',
+                '0',  # Copy all streams from input
+                '-c:a',
+                'copy',  # audio copy
+                '-c:s',
+                'copy',  # subtitle copy
+                '-c:v',
+                video_encoder,  # re-encode video
+            ]
+        )
+        if video_encoder_args is not None:
+            for k, v in video_encoder_args.items():
+                cmd.extend([f'-{k}', str(v)])
+    if duration is not None:
+        cmd.extend(['-t', str(duration)])
+    cmd.extend(['-loglevel', 'error', output_path])
+    return cmd
+def ffmpeg_segment_cmd(
+    input_path: str,
+    output_pattern: str,
+    segment_duration: float | None = None,
+    segment_times: list[float] | None = None,
+    video_encoder: str | None = None,
+    video_encoder_args: dict[str, Any] | None = None,
+) -> list[str]:
+    """Commandline for frame-accurate segmentation"""
+    assert (segment_duration is None) != (segment_times is None)
+    if video_encoder is None:
+        video_encoder = Env.get().default_video_encoder
+    cmd = [
+        'ffmpeg',
+        '-i',
+        input_path,
+        '-map',
+        '0',  # Copy all streams from input
+        '-c:a',
+        'copy',  # don't re-encode audio
+        '-c:v',
+        video_encoder,  # re-encode video
+    ]
+    if video_encoder_args is not None:
+        for k, v in video_encoder_args.items():
+            cmd.extend([f'-{k}', str(v)])
+    cmd.extend(['-f', 'segment'])
+    # -force_key_frames needs to precede -f segment
+    if segment_duration is not None:
+        cmd.extend(
+            [
+                '-force_key_frames',
+                f'expr:gte(t,n_forced*{segment_duration})',  # Force keyframe at each segment boundary
+                '-f',
+                'segment',
+                '-segment_time',
+                str(segment_duration),
+            ]
+        )
+    else:
+        assert segment_times is not None
+        times_str = ','.join([str(t) for t in segment_times])
+        cmd.extend(['-force_key_frames', times_str, '-f', 'segment', '-segment_times', times_str])
+    cmd.extend(
+        [
+            '-reset_timestamps',
+            '1',  # Reset timestamps for each segment
+            '-loglevel',
+            'error',  # Only show errors
+            output_pattern,
+        ]
+    )
+    return cmd
+class VideoFrames:
+    """
+    Context manager for iterating over video frames at a specified frame rate.
+    Args:
+        path: Path to the video file
+        fps: Number of frames to extract per second. If None or 0.0, extracts all frames.
+    """
+    path: Path
+    fps: float
+    container: av.container.input.InputContainer | None
+    video_framerate: Fraction | None
+    video_time_base: Fraction | None
+    video_start_time: int | None
+    @dataclass
+    class Item:
+        frame_idx: int
+        pts: int
+        dts: int
+        time: float
+        is_corrupt: bool
+        key_frame: bool
+        pict_type: int
+        interlaced_frame: bool
+        frame: PIL.Image.Image
+    def __init__(self, path: Path, fps: float | None = None) -> None:
+        self.path = path
+        self.fps = 0.0 if fps is None else fps
+        self.container = None
+        self.video_framerate = None
+        self.video_time_base = None
+        self.video_start_time = None
+    def __enter__(self) -> Self:
+        self.container = av.open(self.path)
+        stream = self.container.streams.video[0]
+        self.video_framerate = stream.average_rate
+        self.video_time_base = stream.time_base
+        self.video_start_time = stream.start_time or 0
+        return self
+    def __exit__(
+        self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None
+    ) -> None:
+        # Clean up
+        if self.container:
+            self.container.close()
+    def __iter__(self) -> Iterator[Item]:
+        num_returned = 0
+        frame_idx = -1
+        while True:
+            try:
+                frame = next(self.container.decode(video=0))
+            except (StopIteration, EOFError):
+                return
+            frame_idx += 1
+            if self.fps == 0.0 or (num_returned <= frame.time * self.fps):
+                img = frame.to_image()
+                assert isinstance(img, PIL.Image.Image)
+                yield VideoFrames.Item(
+                    frame_idx=frame_idx,
+                    pts=frame.pts,
+                    dts=frame.dts,
+                    time=frame.time,
+                    is_corrupt=frame.is_corrupt,
+                    key_frame=frame.key_frame,
+                    pict_type=frame.pict_type,
+                    interlaced_frame=frame.interlaced_frame,
+                    frame=img,
+                )
+                num_returned += 1

pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl