PyPI - pixeltable - Versions diffs - 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl - Mend

pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (202) hide show

pixeltable/__init__.py +23 -5
pixeltable/_version.py +1 -0
pixeltable/catalog/__init__.py +5 -3
pixeltable/catalog/catalog.py +1318 -404
pixeltable/catalog/column.py +186 -115
pixeltable/catalog/dir.py +1 -2
pixeltable/catalog/globals.py +11 -43
pixeltable/catalog/insertable_table.py +167 -79
pixeltable/catalog/path.py +61 -23
pixeltable/catalog/schema_object.py +9 -10
pixeltable/catalog/table.py +626 -308
pixeltable/catalog/table_metadata.py +101 -0
pixeltable/catalog/table_version.py +713 -569
pixeltable/catalog/table_version_handle.py +37 -6
pixeltable/catalog/table_version_path.py +42 -29
pixeltable/catalog/tbl_ops.py +50 -0
pixeltable/catalog/update_status.py +191 -0
pixeltable/catalog/view.py +108 -94
pixeltable/config.py +128 -22
pixeltable/dataframe.py +188 -100
pixeltable/env.py +407 -136
pixeltable/exceptions.py +6 -0
pixeltable/exec/__init__.py +3 -0
pixeltable/exec/aggregation_node.py +7 -8
pixeltable/exec/cache_prefetch_node.py +83 -110
pixeltable/exec/cell_materialization_node.py +231 -0
pixeltable/exec/cell_reconstruction_node.py +135 -0
pixeltable/exec/component_iteration_node.py +4 -3
pixeltable/exec/data_row_batch.py +8 -65
pixeltable/exec/exec_context.py +16 -4
pixeltable/exec/exec_node.py +13 -36
pixeltable/exec/expr_eval/evaluators.py +7 -6
pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
pixeltable/exec/expr_eval/globals.py +8 -5
pixeltable/exec/expr_eval/row_buffer.py +1 -2
pixeltable/exec/expr_eval/schedulers.py +190 -30
pixeltable/exec/globals.py +32 -0
pixeltable/exec/in_memory_data_node.py +18 -18
pixeltable/exec/object_store_save_node.py +293 -0
pixeltable/exec/row_update_node.py +16 -9
pixeltable/exec/sql_node.py +206 -101
pixeltable/exprs/__init__.py +1 -1
pixeltable/exprs/arithmetic_expr.py +27 -22
pixeltable/exprs/array_slice.py +3 -3
pixeltable/exprs/column_property_ref.py +34 -30
pixeltable/exprs/column_ref.py +92 -96
pixeltable/exprs/comparison.py +5 -5
pixeltable/exprs/compound_predicate.py +5 -4
pixeltable/exprs/data_row.py +152 -55
pixeltable/exprs/expr.py +62 -43
pixeltable/exprs/expr_dict.py +3 -3
pixeltable/exprs/expr_set.py +17 -10
pixeltable/exprs/function_call.py +75 -37
pixeltable/exprs/globals.py +1 -2
pixeltable/exprs/in_predicate.py +4 -4
pixeltable/exprs/inline_expr.py +10 -27
pixeltable/exprs/is_null.py +1 -3
pixeltable/exprs/json_mapper.py +8 -8
pixeltable/exprs/json_path.py +56 -22
pixeltable/exprs/literal.py +5 -5
pixeltable/exprs/method_ref.py +2 -2
pixeltable/exprs/object_ref.py +2 -2
pixeltable/exprs/row_builder.py +127 -53
pixeltable/exprs/rowid_ref.py +8 -12
pixeltable/exprs/similarity_expr.py +50 -25
pixeltable/exprs/sql_element_cache.py +4 -4
pixeltable/exprs/string_op.py +5 -5
pixeltable/exprs/type_cast.py +3 -5
pixeltable/func/__init__.py +1 -0
pixeltable/func/aggregate_function.py +8 -8
pixeltable/func/callable_function.py +9 -9
pixeltable/func/expr_template_function.py +10 -10
pixeltable/func/function.py +18 -20
pixeltable/func/function_registry.py +6 -7
pixeltable/func/globals.py +2 -3
pixeltable/func/mcp.py +74 -0
pixeltable/func/query_template_function.py +20 -18
pixeltable/func/signature.py +43 -16
pixeltable/func/tools.py +23 -13
pixeltable/func/udf.py +18 -20
pixeltable/functions/__init__.py +6 -0
pixeltable/functions/anthropic.py +93 -33
pixeltable/functions/audio.py +114 -10
pixeltable/functions/bedrock.py +13 -6
pixeltable/functions/date.py +1 -1
pixeltable/functions/deepseek.py +20 -9
pixeltable/functions/fireworks.py +2 -2
pixeltable/functions/gemini.py +28 -11
pixeltable/functions/globals.py +13 -13
pixeltable/functions/groq.py +108 -0
pixeltable/functions/huggingface.py +1046 -23
pixeltable/functions/image.py +9 -18
pixeltable/functions/llama_cpp.py +23 -8
pixeltable/functions/math.py +3 -4
pixeltable/functions/mistralai.py +4 -15
pixeltable/functions/ollama.py +16 -9
pixeltable/functions/openai.py +104 -82
pixeltable/functions/openrouter.py +143 -0
pixeltable/functions/replicate.py +2 -2
pixeltable/functions/reve.py +250 -0
pixeltable/functions/string.py +21 -28
pixeltable/functions/timestamp.py +13 -14
pixeltable/functions/together.py +4 -6
pixeltable/functions/twelvelabs.py +92 -0
pixeltable/functions/util.py +6 -1
pixeltable/functions/video.py +1388 -106
pixeltable/functions/vision.py +7 -7
pixeltable/functions/whisper.py +15 -7
pixeltable/functions/whisperx.py +179 -0
pixeltable/{ext/functions → functions}/yolox.py +2 -4
pixeltable/globals.py +332 -105
pixeltable/index/base.py +13 -22
pixeltable/index/btree.py +23 -22
pixeltable/index/embedding_index.py +32 -44
pixeltable/io/__init__.py +4 -2
pixeltable/io/datarows.py +7 -6
pixeltable/io/external_store.py +49 -77
pixeltable/io/fiftyone.py +11 -11
pixeltable/io/globals.py +29 -28
pixeltable/io/hf_datasets.py +17 -9
pixeltable/io/label_studio.py +70 -66
pixeltable/io/lancedb.py +3 -0
pixeltable/io/pandas.py +12 -11
pixeltable/io/parquet.py +13 -93
pixeltable/io/table_data_conduit.py +71 -47
pixeltable/io/utils.py +3 -3
pixeltable/iterators/__init__.py +2 -1
pixeltable/iterators/audio.py +21 -11
pixeltable/iterators/document.py +116 -55
pixeltable/iterators/image.py +5 -2
pixeltable/iterators/video.py +293 -13
pixeltable/metadata/__init__.py +4 -2
pixeltable/metadata/converters/convert_18.py +2 -2
pixeltable/metadata/converters/convert_19.py +2 -2
pixeltable/metadata/converters/convert_20.py +2 -2
pixeltable/metadata/converters/convert_21.py +2 -2
pixeltable/metadata/converters/convert_22.py +2 -2
pixeltable/metadata/converters/convert_24.py +2 -2
pixeltable/metadata/converters/convert_25.py +2 -2
pixeltable/metadata/converters/convert_26.py +2 -2
pixeltable/metadata/converters/convert_29.py +4 -4
pixeltable/metadata/converters/convert_34.py +2 -2
pixeltable/metadata/converters/convert_36.py +2 -2
pixeltable/metadata/converters/convert_37.py +15 -0
pixeltable/metadata/converters/convert_38.py +39 -0
pixeltable/metadata/converters/convert_39.py +124 -0
pixeltable/metadata/converters/convert_40.py +73 -0
pixeltable/metadata/converters/util.py +13 -12
pixeltable/metadata/notes.py +4 -0
pixeltable/metadata/schema.py +79 -42
pixeltable/metadata/utils.py +74 -0
pixeltable/mypy/__init__.py +3 -0
pixeltable/mypy/mypy_plugin.py +123 -0
pixeltable/plan.py +274 -223
pixeltable/share/__init__.py +1 -1
pixeltable/share/packager.py +259 -129
pixeltable/share/protocol/__init__.py +34 -0
pixeltable/share/protocol/common.py +170 -0
pixeltable/share/protocol/operation_types.py +33 -0
pixeltable/share/protocol/replica.py +109 -0
pixeltable/share/publish.py +213 -57
pixeltable/store.py +238 -175
pixeltable/type_system.py +104 -63
pixeltable/utils/__init__.py +2 -3
pixeltable/utils/arrow.py +108 -13
pixeltable/utils/av.py +298 -0
pixeltable/utils/azure_store.py +305 -0
pixeltable/utils/code.py +3 -3
pixeltable/utils/console_output.py +4 -1
pixeltable/utils/coroutine.py +6 -23
pixeltable/utils/dbms.py +31 -5
pixeltable/utils/description_helper.py +4 -5
pixeltable/utils/documents.py +5 -6
pixeltable/utils/exception_handler.py +7 -30
pixeltable/utils/filecache.py +6 -6
pixeltable/utils/formatter.py +4 -6
pixeltable/utils/gcs_store.py +283 -0
pixeltable/utils/http_server.py +2 -3
pixeltable/utils/iceberg.py +1 -2
pixeltable/utils/image.py +17 -0
pixeltable/utils/lancedb.py +88 -0
pixeltable/utils/local_store.py +316 -0
pixeltable/utils/misc.py +5 -0
pixeltable/utils/object_stores.py +528 -0
pixeltable/utils/pydantic.py +60 -0
pixeltable/utils/pytorch.py +5 -6
pixeltable/utils/s3_store.py +392 -0
pixeltable-0.4.20.dist-info/METADATA +587 -0
pixeltable-0.4.20.dist-info/RECORD +218 -0
{pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
pixeltable/__version__.py +0 -3
pixeltable/ext/__init__.py +0 -17
pixeltable/ext/functions/__init__.py +0 -11
pixeltable/ext/functions/whisperx.py +0 -77
pixeltable/utils/media_store.py +0 -77
pixeltable/utils/s3.py +0 -17
pixeltable/utils/sample.py +0 -25
pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
{pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0

pixeltable/io/table_data_conduit.py CHANGED Viewed

@@ -8,9 +8,11 @@ import urllib.parse
 import urllib.request
 from dataclasses import dataclass, field, fields
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, cast
+import numpy as np
 import pandas as pd
+import PIL
 from pyarrow.parquet import ParquetDataset
 import pixeltable as pxt
@@ -47,16 +49,16 @@ class TableDataConduitFormat(str, enum.Enum):
 @dataclass
 class TableDataConduit:
-    source: TableDataSource
-    source_format: Optional[str] = None
-    source_column_map: Optional[dict[str, str]] = None
+    source: 'TableDataSource'
+    source_format: str | None = None
+    source_column_map: dict[str, str] | None = None
     if_row_exists: Literal['update', 'ignore', 'error'] = 'error'
-    pxt_schema: Optional[dict[str, Any]] = None
-    src_schema_overrides: Optional[dict[str, Any]] = None
-    src_schema: Optional[dict[str, Any]] = None
-    pxt_pk: Optional[list[str]] = None
-    src_pk: Optional[list[str]] = None
-    valid_rows: Optional[RowData] = None
+    pxt_schema: dict[str, ts.ColumnType] | None = None
+    src_schema_overrides: dict[str, ts.ColumnType] | None = None
+    src_schema: dict[str, ts.ColumnType] | None = None
+    pxt_pk: list[str] | None = None
+    src_pk: list[str] | None = None
+    valid_rows: RowData | None = None
     extra_fields: dict[str, Any] = field(default_factory=dict)
     reqd_col_names: set[str] = field(default_factory=set)
@@ -87,7 +89,7 @@ class TableDataConduit:
         for name, coltype in self.pxt_schema.items():
             self.pxt_schema[name] = ts.ColumnType.normalize_type(coltype)
-    def infer_schema(self) -> dict[str, Any]:
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         raise NotImplementedError
     def valid_row_batch(self) -> Iterator[RowData]:
@@ -101,7 +103,7 @@ class TableDataConduit:
     def add_table_info(self, table: pxt.Table) -> None:
         """Add information about the table into which we are inserting data"""
         assert isinstance(table, pxt.Table)
-        self.pxt_schema = table._schema
+        self.pxt_schema = table._get_schema()
         self.pxt_pk = table._tbl_version.get().primary_key
         for col in table._tbl_version_path.columns():
             if col.is_required_for_insert:
@@ -137,7 +139,7 @@ class DFTableDataConduit(TableDataConduit):
         t.pxt_df = tds.source
         return t
-    def infer_schema(self) -> dict[str, Any]:
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         self.pxt_schema = self.pxt_df.schema
         self.pxt_pk = self.src_pk
         return self.pxt_schema
@@ -149,7 +151,7 @@ class DFTableDataConduit(TableDataConduit):
 class RowDataTableDataConduit(TableDataConduit):
-    raw_rows: Optional[RowData] = None
+    raw_rows: RowData | None = None
     disable_mapping: bool = True
     batch_count: int = 0
@@ -168,7 +170,7 @@ class RowDataTableDataConduit(TableDataConduit):
         t.batch_count = 0
         return t
-    def infer_schema(self) -> dict[str, Any]:
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         from .datarows import _infer_schema_from_rows
         if self.source_column_map is None:
@@ -239,7 +241,7 @@ class PandasTableDataConduit(TableDataConduit):
         t.batch_count = 0
         return t
-    def infer_schema_part1(self) -> tuple[dict[str, Any], list[str]]:
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
         """Return inferred schema, inferred primary key, and source column map"""
         if self.source_column_map is None:
             if self.src_schema_overrides is None:
@@ -252,7 +254,7 @@ class PandasTableDataConduit(TableDataConduit):
         else:
             raise NotImplementedError()
-    def infer_schema(self) -> dict[str, Any]:
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         self.pxt_schema, self.pxt_pk = self.infer_schema_part1()
         self.normalize_pxt_schema_types()
         _df_check_primary_key_values(self.pd_df, self.src_pk)
@@ -325,10 +327,13 @@ class JsonTableDataConduit(TableDataConduit):
 class HFTableDataConduit(TableDataConduit):
-    hf_ds: Optional[Union[datasets.Dataset, datasets.DatasetDict]] = None
-    column_name_for_split: Optional[str] = None
+    """
+    TODO:
+    - use set_format('arrow') and convert ChunkedArrays to PIL.Image.Image instead of going through numpy, which is slow
+    """
+    column_name_for_split: str | None = None
     categorical_features: dict[str, dict[int, str]]
-    hf_schema: dict[str, Any] = None
     dataset_dict: dict[str, datasets.Dataset] = None
     hf_schema_source: dict[str, Any] = None
@@ -340,9 +345,19 @@ class HFTableDataConduit(TableDataConduit):
         import datasets
         assert isinstance(tds.source, (datasets.Dataset, datasets.DatasetDict))
-        t.hf_ds = tds.source
         if 'column_name_for_split' in t.extra_fields:
             t.column_name_for_split = t.extra_fields['column_name_for_split']
+        # make sure we get numpy arrays for arrays, not Python lists
+        source = tds.source.with_format(type='numpy')
+        if isinstance(source, datasets.Dataset):
+            # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
+            raw_name = source.split._name
+            split_name = raw_name.split('[')[0] if raw_name is not None else None
+            t.dataset_dict = {split_name: source}
+        else:
+            assert isinstance(source, datasets.DatasetDict)
+            t.dataset_dict = source
         return t
     @classmethod
@@ -356,13 +371,13 @@ class HFTableDataConduit(TableDataConduit):
         except ImportError:
             return False
-    def infer_schema_part1(self) -> tuple[dict[str, Any], list[str]]:
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
         from pixeltable.io.hf_datasets import _get_hf_schema, huggingface_schema_to_pxt_schema
         if self.source_column_map is None:
             if self.src_schema_overrides is None:
                 self.src_schema_overrides = {}
-            self.hf_schema_source = _get_hf_schema(self.hf_ds)
+            self.hf_schema_source = _get_hf_schema(self.source)
             self.src_schema = huggingface_schema_to_pxt_schema(
                 self.hf_schema_source, self.src_schema_overrides, self.src_pk
             )
@@ -397,15 +412,6 @@ class HFTableDataConduit(TableDataConduit):
     def prepare_insert(self) -> None:
         import datasets
-        if isinstance(self.source, datasets.Dataset):
-            # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
-            raw_name = self.source.split._name
-            split_name = raw_name.split('[')[0] if raw_name is not None else None
-            self.dataset_dict = {split_name: self.source}
-        else:
-            assert isinstance(self.source, datasets.DatasetDict)
-            self.dataset_dict = self.source
         # extract all class labels from the dataset to translate category ints to strings
         self.categorical_features = {
             feature_name: feature_type.names
@@ -416,26 +422,44 @@ class HFTableDataConduit(TableDataConduit):
             self.source_column_map = {}
         self.check_source_columns_are_insertable(self.hf_schema_source.keys())
-    def _translate_row(self, row: dict[str, Any], split_name: str) -> dict[str, Any]:
+    def _translate_row(self, row: dict[str, Any], split_name: str, features: datasets.Features) -> dict[str, Any]:
         output_row: dict[str, Any] = {}
         for col_name, val in row.items():
             # translate category ints to strings
             new_val = self.categorical_features[col_name][val] if col_name in self.categorical_features else val
             mapped_col_name = self.source_column_map.get(col_name, col_name)
-            # Convert values to the appropriate type if needed
-            try:
-                checked_val = self.pxt_schema[mapped_col_name].create_literal(new_val)
-            except TypeError as e:
-                msg = str(e)
-                raise excs.Error(f'Error in column {col_name}: {msg[0].lower() + msg[1:]}\nRow: {row}') from e
-            output_row[mapped_col_name] = checked_val
+            new_val = self._translate_val(new_val, features[col_name])
+            output_row[mapped_col_name] = new_val
         # add split name to output row
         if self.column_name_for_split is not None:
             output_row[self.column_name_for_split] = split_name
         return output_row
+    def _translate_val(self, val: Any, feature: datasets.Feature) -> Any:
+        """Convert numpy scalars to Python types and images to PIL.Image.Image"""
+        import datasets
+        if isinstance(feature, datasets.Value):
+            if isinstance(val, (np.generic, np.ndarray)):
+                # a scalar, which we want as a standard Python type
+                assert np.ndim(val) == 0
+                return val.item()
+            else:
+                # a standard Python object
+                return val
+        elif isinstance(feature, datasets.Sequence):
+            assert np.ndim(val) > 0
+            return val
+        elif isinstance(feature, datasets.Image):
+            return PIL.Image.fromarray(val)
+        elif isinstance(feature, dict):
+            assert isinstance(val, dict)
+            return {k: self._translate_val(v, feature[k]) for k, v in val.items()}
+        else:
+            return val
     def valid_row_batch(self) -> Iterator[RowData]:
         for split_name, split_dataset in self.dataset_dict.items():
             num_batches = split_dataset.size_in_bytes / self._K_BATCH_SIZE_BYTES
@@ -444,7 +468,7 @@ class HFTableDataConduit(TableDataConduit):
             batch = []
             for row in split_dataset:
-                batch.append(self._translate_row(row, split_name))
+                batch.append(self._translate_row(row, split_name, split_dataset.features))
                 if len(batch) >= tuples_per_batch:
                     yield batch
                     batch = []
@@ -454,7 +478,7 @@ class HFTableDataConduit(TableDataConduit):
 class ParquetTableDataConduit(TableDataConduit):
-    pq_ds: Optional[ParquetDataset] = None
+    pq_ds: ParquetDataset | None = None
     @classmethod
     def from_tds(cls, tds: TableDataConduit) -> 'ParquetTableDataConduit':
@@ -469,13 +493,13 @@ class ParquetTableDataConduit(TableDataConduit):
         t.pq_ds = parquet.ParquetDataset(str(input_path))
         return t
-    def infer_schema_part1(self) -> tuple[dict[str, Any], list[str]]:
-        from pixeltable.utils.arrow import ar_infer_schema
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        from pixeltable.utils.arrow import to_pxt_schema
         if self.source_column_map is None:
             if self.src_schema_overrides is None:
                 self.src_schema_overrides = {}
-            self.src_schema = ar_infer_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
+            self.src_schema = to_pxt_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
             inferred_schema, inferred_pk, self.source_column_map = normalize_schema_names(
                 self.src_schema, self.src_pk, self.src_schema_overrides
             )
@@ -483,7 +507,7 @@ class ParquetTableDataConduit(TableDataConduit):
         else:
             raise NotImplementedError()
-    def infer_schema(self) -> dict[str, Any]:
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         self.pxt_schema, self.pxt_pk = self.infer_schema_part1()
         self.normalize_pxt_schema_types()
         self.prepare_insert()
@@ -504,7 +528,7 @@ class ParquetTableDataConduit(TableDataConduit):
         from pixeltable.utils.arrow import iter_tuples2
         try:
-            for fragment in self.pq_ds.fragments:  # type: ignore[attr-defined]
+            for fragment in self.pq_ds.fragments:
                 for batch in fragment.to_batches():
                     dict_batch = list(iter_tuples2(batch, self.source_column_map, self.pxt_schema))
                     self.total_rows += len(dict_batch)

pixeltable/io/utils.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from keyword import iskeyword as is_python_keyword
-from typing import Any, Optional, Union
+from typing import Any
 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -21,7 +21,7 @@ def normalize_pxt_col_name(name: str) -> str:
     return id
-def normalize_primary_key_parameter(primary_key: Optional[Union[str, list[str]]] = None) -> list[str]:
+def normalize_primary_key_parameter(primary_key: str | list[str] | None = None) -> list[str]:
     if primary_key is None:
         primary_key = []
     elif isinstance(primary_key, str):
@@ -40,7 +40,7 @@ def normalize_schema_names(
     primary_key: list[str],
     schema_overrides: dict[str, Any],
     require_valid_pxt_column_names: bool = False,
-) -> tuple[dict[str, Any], list[str], Optional[dict[str, str]]]:
+) -> tuple[dict[str, Any], list[str], dict[str, str] | None]:
     """
     Convert all names in the input schema from source names to valid Pixeltable identifiers
     - Ensure that all names are unique.

pixeltable/iterators/__init__.py CHANGED Viewed

@@ -1,3 +1,4 @@
+"""Iterators for splitting media and documents into components."""
 # ruff: noqa: F401
 from .audio import AudioSplitter
@@ -5,7 +6,7 @@ from .base import ComponentIterator
 from .document import DocumentSplitter
 from .image import TileIterator
 from .string import StringSplitter
-from .video import FrameIterator
+from .video import FrameIterator, VideoSplitter
 __default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
 __removed_symbols = {'base', 'document', 'video'}

pixeltable/iterators/audio.py CHANGED Viewed

@@ -1,12 +1,12 @@
 import logging
-import uuid
 from fractions import Fraction
 from pathlib import Path
-from typing import Any, ClassVar, Optional
+from typing import Any, ClassVar
 import av
-from pixeltable import env, exceptions as excs, type_system as ts
+from pixeltable import exceptions as excs, type_system as ts
+from pixeltable.utils.local_store import TempStore
 from .base import ComponentIterator
@@ -37,7 +37,7 @@ class AudioSplitter(ComponentIterator):
     # List of chunks to extract
     # Each chunk is defined by start and end presentation timestamps in audio file (int)
-    chunks_to_extract_in_pts: Optional[list[tuple[int, int]]]
+    chunks_to_extract_in_pts: list[tuple[int, int]] | None
     # next chunk to extract
     next_pos: int
@@ -55,12 +55,9 @@ class AudioSplitter(ComponentIterator):
     def __init__(
         self, audio: str, chunk_duration_sec: float, *, overlap_sec: float = 0.0, min_chunk_duration_sec: float = 0.0
     ):
-        if chunk_duration_sec <= 0.0:
-            raise excs.Error('chunk_duration_sec must be a positive number')
-        if chunk_duration_sec < min_chunk_duration_sec:
-            raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
-        if overlap_sec >= chunk_duration_sec:
-            raise excs.Error('overlap_sec must be less than chunk_duration_sec')
+        assert chunk_duration_sec > 0.0
+        assert chunk_duration_sec >= min_chunk_duration_sec
+        assert overlap_sec < chunk_duration_sec
         audio_path = Path(audio)
         assert audio_path.exists() and audio_path.is_file()
         self.audio_path = audio_path
@@ -128,6 +125,19 @@ class AudioSplitter(ComponentIterator):
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        param_names = ['chunk_duration_sec', 'min_chunk_duration_sec', 'overlap_sec']
+        params = dict(zip(param_names, args))
+        params.update(kwargs)
+        chunk_duration_sec = params['chunk_duration_sec']
+        min_chunk_duration_sec = params.get('min_chunk_duration_sec', 0.0)
+        overlap_sec = params.get('overlap_sec', 0.0)
+        if chunk_duration_sec <= 0.0:
+            raise excs.Error('chunk_duration_sec must be a positive number')
+        if chunk_duration_sec < min_chunk_duration_sec:
+            raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
+        if overlap_sec >= chunk_duration_sec:
+            raise excs.Error('overlap_sec must be less than chunk_duration_sec')
         return {
             'start_time_sec': ts.FloatType(),
             'end_time_sec': ts.FloatType(),
@@ -140,7 +150,7 @@ class AudioSplitter(ComponentIterator):
         target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
         chunk_start_pts = 0
         chunk_end_pts = 0
-        chunk_file = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}{self.audio_path.suffix}')
+        chunk_file = str(TempStore.create_path(extension=self.audio_path.suffix))
         output_container = av.open(chunk_file, mode='w')
         input_stream = self.container.streams.audio[0]
         codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)

pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

Potentially problematic release.

pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl