datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. datachain/__init__.py +4 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +5 -5
  4. datachain/catalog/__init__.py +0 -2
  5. datachain/catalog/catalog.py +276 -354
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +8 -3
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +10 -17
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +42 -27
  12. datachain/cli/commands/ls.py +15 -15
  13. datachain/cli/commands/show.py +2 -2
  14. datachain/cli/parser/__init__.py +3 -43
  15. datachain/cli/parser/job.py +1 -1
  16. datachain/cli/parser/utils.py +1 -2
  17. datachain/cli/utils.py +2 -15
  18. datachain/client/azure.py +2 -2
  19. datachain/client/fsspec.py +34 -23
  20. datachain/client/gcs.py +3 -3
  21. datachain/client/http.py +157 -0
  22. datachain/client/local.py +11 -7
  23. datachain/client/s3.py +3 -3
  24. datachain/config.py +4 -8
  25. datachain/data_storage/db_engine.py +12 -6
  26. datachain/data_storage/job.py +2 -0
  27. datachain/data_storage/metastore.py +716 -137
  28. datachain/data_storage/schema.py +20 -27
  29. datachain/data_storage/serializer.py +105 -15
  30. datachain/data_storage/sqlite.py +114 -114
  31. datachain/data_storage/warehouse.py +140 -48
  32. datachain/dataset.py +109 -89
  33. datachain/delta.py +117 -42
  34. datachain/diff/__init__.py +25 -33
  35. datachain/error.py +24 -0
  36. datachain/func/aggregate.py +9 -11
  37. datachain/func/array.py +12 -12
  38. datachain/func/base.py +7 -4
  39. datachain/func/conditional.py +9 -13
  40. datachain/func/func.py +63 -45
  41. datachain/func/numeric.py +5 -7
  42. datachain/func/string.py +2 -2
  43. datachain/hash_utils.py +123 -0
  44. datachain/job.py +11 -7
  45. datachain/json.py +138 -0
  46. datachain/lib/arrow.py +18 -15
  47. datachain/lib/audio.py +60 -59
  48. datachain/lib/clip.py +14 -13
  49. datachain/lib/convert/python_to_sql.py +6 -10
  50. datachain/lib/convert/values_to_tuples.py +151 -53
  51. datachain/lib/data_model.py +23 -19
  52. datachain/lib/dataset_info.py +7 -7
  53. datachain/lib/dc/__init__.py +2 -1
  54. datachain/lib/dc/csv.py +22 -26
  55. datachain/lib/dc/database.py +37 -34
  56. datachain/lib/dc/datachain.py +518 -324
  57. datachain/lib/dc/datasets.py +38 -30
  58. datachain/lib/dc/hf.py +16 -20
  59. datachain/lib/dc/json.py +17 -18
  60. datachain/lib/dc/listings.py +5 -8
  61. datachain/lib/dc/pandas.py +3 -6
  62. datachain/lib/dc/parquet.py +33 -21
  63. datachain/lib/dc/records.py +9 -13
  64. datachain/lib/dc/storage.py +103 -65
  65. datachain/lib/dc/storage_pattern.py +251 -0
  66. datachain/lib/dc/utils.py +17 -14
  67. datachain/lib/dc/values.py +3 -6
  68. datachain/lib/file.py +187 -50
  69. datachain/lib/hf.py +7 -5
  70. datachain/lib/image.py +13 -13
  71. datachain/lib/listing.py +5 -5
  72. datachain/lib/listing_info.py +1 -2
  73. datachain/lib/meta_formats.py +2 -3
  74. datachain/lib/model_store.py +20 -8
  75. datachain/lib/namespaces.py +59 -7
  76. datachain/lib/projects.py +51 -9
  77. datachain/lib/pytorch.py +31 -23
  78. datachain/lib/settings.py +188 -85
  79. datachain/lib/signal_schema.py +302 -64
  80. datachain/lib/text.py +8 -7
  81. datachain/lib/udf.py +103 -63
  82. datachain/lib/udf_signature.py +59 -34
  83. datachain/lib/utils.py +20 -0
  84. datachain/lib/video.py +3 -4
  85. datachain/lib/webdataset.py +31 -36
  86. datachain/lib/webdataset_laion.py +15 -16
  87. datachain/listing.py +12 -5
  88. datachain/model/bbox.py +3 -1
  89. datachain/namespace.py +22 -3
  90. datachain/node.py +6 -6
  91. datachain/nodes_thread_pool.py +0 -1
  92. datachain/plugins.py +24 -0
  93. datachain/project.py +4 -4
  94. datachain/query/batch.py +10 -12
  95. datachain/query/dataset.py +376 -194
  96. datachain/query/dispatch.py +112 -84
  97. datachain/query/metrics.py +3 -4
  98. datachain/query/params.py +2 -3
  99. datachain/query/queue.py +2 -1
  100. datachain/query/schema.py +7 -6
  101. datachain/query/session.py +190 -33
  102. datachain/query/udf.py +9 -6
  103. datachain/remote/studio.py +90 -53
  104. datachain/script_meta.py +12 -12
  105. datachain/sql/sqlite/base.py +37 -25
  106. datachain/sql/sqlite/types.py +1 -1
  107. datachain/sql/types.py +36 -5
  108. datachain/studio.py +49 -40
  109. datachain/toolkit/split.py +31 -10
  110. datachain/utils.py +39 -48
  111. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
  112. datachain-0.39.0.dist-info/RECORD +173 -0
  113. datachain/cli/commands/query.py +0 -54
  114. datachain/query/utils.py +0 -36
  115. datachain-0.30.5.dist-info/RECORD +0 -168
  116. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
  117. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  118. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  119. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/lib/udf.py CHANGED
@@ -1,9 +1,8 @@
-import sys
-import traceback
+import hashlib
 from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
 from contextlib import closing, nullcontext
 from functools import partial
-from typing import TYPE_CHECKING, Any, Optional, TypeVar
+from typing import TYPE_CHECKING, Any, TypeVar
 
 import attrs
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
@@ -12,9 +11,10 @@ from pydantic import BaseModel
 from datachain.asyn import AsyncMapper
 from datachain.cache import temporary_cache
 from datachain.dataset import RowDict
+from datachain.hash_utils import hash_callable
 from datachain.lib.convert.flatten import flatten
 from datachain.lib.file import DataModel, File
-from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
+from datachain.lib.utils import AbstractUDF, DataChainParamsError
 from datachain.query.batch import (
     Batch,
     BatchingStrategy,
@@ -40,8 +40,44 @@ T = TypeVar("T", bound=Sequence[Any])
 
 
 class UdfError(DataChainParamsError):
-    def __init__(self, msg):
-        super().__init__(f"UDF error: {msg}")
+    """Exception raised for UDF-related errors."""
+
+    def __init__(self, message: str) -> None:
+        self.message = message
+        super().__init__(message)
+
+    def __str__(self) -> str:
+        return f"{self.__class__.__name__!s}: {self.message!s}"
+
+    def __reduce__(self):
+        """Custom reduce method for pickling."""
+        return self.__class__, (self.message,)
+
+
+class UdfRunError(Exception):
+    """Exception raised when UDF execution fails."""
+
+    def __init__(
+        self,
+        error: Exception | str,
+        stacktrace: str | None = None,
+        udf_name: str | None = None,
+    ) -> None:
+        self.error = error
+        self.stacktrace = stacktrace
+        self.udf_name = udf_name
+        super().__init__(str(error))
+
+    def __str__(self) -> str:
+        if isinstance(self.error, UdfRunError):
+            return str(self.error)
+        if isinstance(self.error, Exception):
+            return f"{self.error.__class__.__name__!s}: {self.error!s}"
+        return f"{self.__class__.__name__!s}: {self.error!s}"
+
+    def __reduce__(self):
+        """Custom reduce method for pickling."""
+        return self.__class__, (self.error, self.stacktrace, self.udf_name)
 
 
 ColumnType = Any
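
A note on the `__reduce__` overrides above: DataChain runs UDFs in worker processes, and an exception whose `__init__` takes extra arguments does not round-trip through pickle by default (`BaseException` only preserves `args`). A minimal sketch with a simplified stand-in for the `UdfRunError` added here:

import pickle

class UdfRunError(Exception):
    """Simplified stand-in for the class in the diff above."""

    def __init__(self, error, stacktrace=None, udf_name=None):
        self.error = error
        self.stacktrace = stacktrace
        self.udf_name = udf_name
        super().__init__(str(error))

    def __reduce__(self):
        # Without this, unpickling would call __init__ with only str(error)
        # and drop stacktrace/udf_name on the way back from a worker process.
        return self.__class__, (self.error, self.stacktrace, self.udf_name)

err = UdfRunError("division by zero", stacktrace="...", udf_name="my_udf")
restored = pickle.loads(pickle.dumps(err))
assert restored.udf_name == "my_udf"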
@@ -54,25 +90,16 @@ UDFOutputSpec = Mapping[str, ColumnType]
 UDFResult = dict[str, Any]
 
 
-@attrs.define
-class UDFProperties:
-    udf: "UDFAdapter"
-
-    def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
-        return self.udf.get_batching(use_partitioning)
-
-    @property
-    def batch_rows(self):
-        return self.udf.batch_rows
-
-
 @attrs.define(slots=False)
 class UDFAdapter:
     inner: "UDFBase"
     output: UDFOutputSpec
-    batch_rows: Optional[int] = None
+    batch_size: int | None = None
     batch: int = 1
 
+    def hash(self) -> str:
+        return self.inner.hash()
+
     def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
         if use_partitioning:
             return Partition()
@@ -83,11 +110,6 @@ class UDFAdapter:
             return Batch(self.batch)
         raise ValueError(f"invalid batch size {self.batch}")
 
-    @property
-    def properties(self):
-        # For backwards compatibility.
-        return UDFProperties(self)
-
     def run(
         self,
         udf_fields: "Sequence[str]",
@@ -164,10 +186,31 @@ class UDFBase(AbstractUDF):
     prefetch: int = 0
 
     def __init__(self):
-        self.params: Optional[SignalSchema] = None
+        self.params: SignalSchema | None = None
         self.output = None
        self._func = None
 
+    def hash(self) -> str:
+        """
+        Creates SHA hash of this UDF function. It takes into account function,
+        inputs and outputs.
+
+        For function-based UDFs, hashes self._func.
+        For class-based UDFs, hashes the process method.
+        """
+        # Hash user code: either _func (function-based) or process method (class-based)
+        func_to_hash = self._func if self._func else self.process
+
+        parts = [
+            hash_callable(func_to_hash),
+            self.params.hash() if self.params else "",
+            self.output.hash(),
+        ]
+
+        return hashlib.sha256(
+            b"".join([bytes.fromhex(part) for part in parts])
+        ).hexdigest()
+
     def process(self, *args, **kwargs):
         """Processing function that needs to be defined by user"""
         if not self._func:
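
The `hash()` method above composes several hex digests by decoding them to raw bytes, concatenating, and hashing the result; `bytes.fromhex("")` yields `b""`, so a missing params hash simply contributes nothing. A minimal sketch of the same scheme, with placeholder digests standing in for `hash_callable()` and the schema `hash()` methods:

import hashlib

def combine_hashes(parts: list[str]) -> str:
    # Each part is a hex digest (or "" when absent); decode, concatenate, rehash.
    return hashlib.sha256(b"".join(bytes.fromhex(p) for p in parts)).hexdigest()

func_digest = hashlib.sha256(b"def process(file): ...").hexdigest()  # stand-in for hash_callable()
params_digest = hashlib.sha256(b"params: File").hexdigest()          # stand-in for SignalSchema.hash()
output_digest = hashlib.sha256(b"output: str").hexdigest()           # stand-in for output.hash()

print(combine_hashes([func_digest, params_digest, output_digest]))
print(combine_hashes([func_digest, "", output_digest]))  # "" is valid: fromhex("") == b""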
@@ -188,7 +231,7 @@
         self,
         sign: "UdfSignature",
         params: "SignalSchema",
-        func: Optional[Callable],
+        func: Callable | None,
     ):
         self.params = params
         self.output = sign.output_schema
@@ -237,13 +280,13 @@
 
     def to_udf_wrapper(
         self,
-        batch_rows: Optional[int] = None,
+        batch_size: int | None = None,
         batch: int = 1,
     ) -> UDFAdapter:
         return UDFAdapter(
             self,
             self.output.to_udf_spec(),
-            batch_rows,
+            batch_size,
             batch,
         )
 
@@ -295,28 +338,14 @@ class UDFBase(AbstractUDF):
             self._set_stream_recursive(field_value, catalog, cache, download_cb)
 
     def _prepare_row(self, row, udf_fields, catalog, cache, download_cb):
-        row_dict = RowDict(zip(udf_fields, row))
+        row_dict = RowDict(zip(udf_fields, row, strict=False))
         return self._parse_row(row_dict, catalog, cache, download_cb)
 
     def _prepare_row_and_id(self, row, udf_fields, catalog, cache, download_cb):
-        row_dict = RowDict(zip(udf_fields, row))
+        row_dict = RowDict(zip(udf_fields, row, strict=False))
         udf_input = self._parse_row(row_dict, catalog, cache, download_cb)
         return row_dict["sys__id"], *udf_input
 
-    def process_safe(self, obj_rows):
-        try:
-            result_objs = self.process(*obj_rows)
-        except Exception as e:  # noqa: BLE001
-            msg = f"============== Error in user code: '{self.name}' =============="
-            print(msg)
-            exc_type, exc_value, exc_traceback = sys.exc_info()
-            traceback.print_exception(exc_type, exc_value, exc_traceback.tb_next)
-            print("=" * len(msg))
-            raise DataChainError(
-                f"Error in user code in class '{self.name}': {e!s}"
-            ) from None
-        return result_objs
-
 
 def noop(*args, **kwargs):
     pass
324
353
 
325
354
  async def _prefetch_input(
326
355
  row: T,
327
- download_cb: Optional["Callback"] = None,
356
+ download_cb: Callback | None = None,
328
357
  after_prefetch: "Callable[[], None]" = noop,
329
358
  ) -> T:
330
359
  for obj in row:
@@ -347,8 +376,8 @@ def _remove_prefetched(row: T) -> None:
347
376
  def _prefetch_inputs(
348
377
  prepared_inputs: "Iterable[T]",
349
378
  prefetch: int = 0,
350
- download_cb: Optional["Callback"] = None,
351
- after_prefetch: Optional[Callable[[], None]] = None,
379
+ download_cb: Callback | None = None,
380
+ after_prefetch: Callable[[], None] | None = None,
352
381
  remove_prefetched: bool = False,
353
382
  ) -> "abc.Generator[T, None, None]":
354
383
  if not prefetch:
@@ -415,9 +444,12 @@ class Mapper(UDFBase):
 
         with closing(prepared_inputs):
             for id_, *udf_args in prepared_inputs:
-                result_objs = self.process_safe(udf_args)
+                result_objs = self.process(*udf_args)
                 udf_output = self._flatten_row(result_objs)
-                output = [{"sys__id": id_} | dict(zip(self.signal_names, udf_output))]
+                output = [
+                    {"sys__id": id_}
+                    | dict(zip(self.signal_names, udf_output, strict=False))
+                ]
                 processed_cb.relative_update(1)
                 yield output
 
@@ -465,17 +497,19 @@ class BatchMapper(UDFBase):
                         row, udf_fields, catalog, cache, download_cb
                     )
                    for row in batch
-                ]
+                ],
+                strict=False,
             )
-            result_objs = list(self.process_safe(udf_args))
+            result_objs = list(self.process(*udf_args))
             n_objs = len(result_objs)
             assert n_objs == n_rows, (
                 f"{self.name} returns {n_objs} rows, but {n_rows} were expected"
             )
             udf_outputs = (self._flatten_row(row) for row in result_objs)
             output = [
-                {"sys__id": row_id} | dict(zip(self.signal_names, signals))
-                for row_id, signals in zip(row_ids, udf_outputs)
+                {"sys__id": row_id}
+                | dict(zip(self.signal_names, signals, strict=False))
+                for row_id, signals in zip(row_ids, udf_outputs, strict=False)
             ]
             processed_cb.relative_update(n_rows)
             yield output
@@ -508,10 +542,10 @@ class Generator(UDFBase):
         )
 
         def _process_row(row):
-            with safe_closing(self.process_safe(row)) as result_objs:
+            with safe_closing(self.process(*row)) as result_objs:
                 for result_obj in result_objs:
                     udf_output = self._flatten_row(result_obj)
-                    yield dict(zip(self.signal_names, udf_output))
+                    yield dict(zip(self.signal_names, udf_output, strict=False))
 
         prepared_inputs = _prepare_rows(udf_inputs)
         prepared_inputs = _prefetch_inputs(
@@ -546,15 +580,21 @@ class Aggregator(UDFBase):
         self.setup()
 
         for batch in udf_inputs:
-            udf_args = zip(
-                *[
-                    self._prepare_row(row, udf_fields, catalog, cache, download_cb)
-                    for row in batch
-                ]
-            )
-            result_objs = self.process_safe(udf_args)
+            prepared_rows = [
+                self._prepare_row(row, udf_fields, catalog, cache, download_cb)
+                for row in batch
+            ]
+            batched_args = zip(*prepared_rows, strict=False)
+            # Convert aggregated column values to lists. This keeps behavior
+            # consistent with the type hints promoted in the public API.
+            udf_args = [
+                list(arg) if isinstance(arg, tuple) else arg for arg in batched_args
+            ]
+            result_objs = self.process(*udf_args)
             udf_outputs = (self._flatten_row(row) for row in result_objs)
-            output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
+            output = (
+                dict(zip(self.signal_names, row, strict=False)) for row in udf_outputs
+            )
             processed_cb.relative_update(len(batch))
             yield output
 
datachain/lib/udf_signature.py CHANGED
@@ -1,12 +1,12 @@
 import inspect
-from collections.abc import Generator, Iterator, Sequence
+from collections.abc import Callable, Generator, Iterator, Sequence
 from dataclasses import dataclass
-from typing import Any, Callable, Union, get_args, get_origin
+from typing import Any, get_args, get_origin
 
 from datachain.lib.data_model import DataType, DataTypeNames, is_chain_type
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import UDFBase
-from datachain.lib.utils import AbstractUDF, DataChainParamsError
+from datachain.lib.utils import AbstractUDF, DataChainParamsError, callable_name
 
 
 class UdfSignatureError(DataChainParamsError):
@@ -17,8 +17,8 @@ class UdfSignatureError(DataChainParamsError):
 
 @dataclass
 class UdfSignature:  # noqa: PLW1641
-    func: Union[Callable, UDFBase]
-    params: dict[str, Union[DataType, Any]]
+    func: Callable | UDFBase
+    params: dict[str, DataType | Any]
     output_schema: SignalSchema
 
     DEFAULT_RETURN_TYPE = str
@@ -28,24 +28,29 @@ class UdfSignature:  # noqa: PLW1641
         cls,
         chain: str,
         signal_map: dict[str, Callable],
-        func: Union[None, UDFBase, Callable] = None,
-        params: Union[None, str, Sequence[str]] = None,
-        output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
+        func: UDFBase | Callable | None = None,
+        params: str | Sequence[str] | None = None,
+        output: DataType | Sequence[str] | dict[str, DataType] | None = None,
         is_generator: bool = True,
     ) -> "UdfSignature":
         keys = ", ".join(signal_map.keys())
         if len(signal_map) > 1:
             raise UdfSignatureError(
                 chain,
-                f"multiple signals '{keys}' are not supported in processors."
-                " Chain multiple processors instead.",
+                (
+                    f"multiple signals '{keys}' are not supported in processors."
+                    " Chain multiple processors instead."
+                ),
             )
-        udf_func: Union[UDFBase, Callable]
+        udf_func: UDFBase | Callable
         if len(signal_map) == 1:
             if func is not None:
                 raise UdfSignatureError(
                     chain,
-                    f"processor can't have signal '{keys}' with function '{func}'",
+                    (
+                        "processor can't have signal "
+                        f"'{keys}' with function '{callable_name(func)}'"
+                    ),
                 )
             signal_name, udf_func = next(iter(signal_map.items()))
         else:
@@ -56,13 +61,16 @@ class UdfSignature:  # noqa: PLW1641
             signal_name = None
 
         if not isinstance(udf_func, UDFBase) and not callable(udf_func):
-            raise UdfSignatureError(chain, f"UDF '{udf_func}' is not callable")
+            raise UdfSignatureError(
+                chain,
+                f"UDF '{callable_name(udf_func)}' is not callable",
+            )
 
         func_params_map_sign, func_outs_sign, is_iterator = cls._func_signature(
             chain, udf_func
         )
 
-        udf_params: dict[str, Union[DataType, Any]] = {}
+        udf_params: dict[str, DataType | Any] = {}
         if params:
             udf_params = (
                 {params: Any} if isinstance(params, str) else dict.fromkeys(params, Any)
@@ -76,14 +84,15 @@ class UdfSignature:  # noqa: PLW1641
             }
 
         if output:
+            # Use the actual resolved function (udf_func) for clearer error messages
             udf_output_map = UdfSignature._validate_output(
-                chain, signal_name, func, func_outs_sign, output
+                chain, signal_name, udf_func, func_outs_sign, output
             )
         else:
             if not func_outs_sign:
                 raise UdfSignatureError(
                     chain,
-                    f"outputs are not defined in function '{udf_func}'"
+                    f"outputs are not defined in function '{callable_name(udf_func)}'"
                     " hints or 'output'",
                 )
 
@@ -97,9 +106,12 @@ class UdfSignature:  # noqa: PLW1641
         if is_generator and not is_iterator:
             raise UdfSignatureError(
                 chain,
-                f"function '{func}' cannot be used in generator/aggregator"
-                " because it returns a type that is not Iterator/Generator."
-                f" Instead, it returns '{func_outs_sign}'",
+                (
+                    f"function '{callable_name(udf_func)}' cannot be used in "
+                    "generator/aggregator because it returns a type that is "
+                    "not Iterator/Generator. "
+                    f"Instead, it returns '{func_outs_sign}'"
+                ),
             )
 
         if isinstance(func_outs_sign, tuple):
@@ -124,11 +136,14 @@ class UdfSignature:  # noqa: PLW1641
             if len(func_outs_sign) != len(output):
                 raise UdfSignatureError(
                     chain,
-                    f"length of outputs names ({len(output)}) and function '{func}'"
-                    f" return type length ({len(func_outs_sign)}) does not match",
+                    (
+                        f"length of outputs names ({len(output)}) and function "
+                        f"'{callable_name(func)}' return type length "
+                        f"({len(func_outs_sign)}) does not match"
+                    ),
                 )
 
-            udf_output_map = dict(zip(output, func_outs_sign))
+            udf_output_map = dict(zip(output, func_outs_sign, strict=False))
         elif isinstance(output, dict):
             for key, value in output.items():
                 if not isinstance(key, str):
@@ -164,7 +179,7 @@ class UdfSignature:  # noqa: PLW1641
 
     @staticmethod
     def _func_signature(
-        chain: str, udf_func: Union[Callable, UDFBase]
+        chain: str, udf_func: Callable | UDFBase
     ) -> tuple[dict[str, type], Sequence[type], bool]:
         if isinstance(udf_func, AbstractUDF):
             func = udf_func.process  # type: ignore[unreachable]
@@ -183,17 +198,27 @@ class UdfSignature:  # noqa: PLW1641
         orig = get_origin(anno)
         if inspect.isclass(orig) and issubclass(orig, Iterator):
             args = get_args(anno)
-            if len(args) > 1 and not (
-                issubclass(orig, Generator) and len(args) == 3
-            ):
-                raise UdfSignatureError(
-                    chain,
-                    f"function '{func}' should return iterator with a single"
-                    f" value while '{args}' are specified",
-                )
-            is_iterator = True
-            anno = args[0]
-            orig = get_origin(anno)
+            # For typing.Iterator without type args, default to DEFAULT_RETURN_TYPE
+            if len(args) == 0:
+                is_iterator = True
+                anno = UdfSignature.DEFAULT_RETURN_TYPE
+                orig = get_origin(anno)
+            else:
+                # typing.Generator[T, S, R] has 3 args; allow that shape
+                if len(args) > 1 and not (
+                    issubclass(orig, Generator) and len(args) == 3
+                ):
+                    raise UdfSignatureError(
+                        chain,
+                        (
+                            f"function '{callable_name(func)}' should return "
+                            "iterator with a single value while "
+                            f"'{args}' are specified"
+                        ),
+                    )
+                is_iterator = True
+                anno = args[0]
+                orig = get_origin(anno)
 
         if orig and orig is tuple:
             output_types = tuple(get_args(anno))  # type: ignore[assignment]
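
For reference, the annotation shapes `_func_signature` now distinguishes (a sketch; expected outputs as comments, on Python 3.9+ where bare `typing.Iterator` reports no type args):

import typing
from collections.abc import Generator, Iterator
from typing import get_args, get_origin

print(get_origin(Iterator[int]))             # <class 'collections.abc.Iterator'>
print(get_args(Iterator[int]))               # (<class 'int'>,) -- single yield type, accepted
print(get_args(Generator[int, None, None]))  # 3 args (yield/send/return) -- the allowed 3-arg shape
print(get_origin(typing.Iterator))           # <class 'collections.abc.Iterator'>
print(get_args(typing.Iterator))             # () -- bare Iterator, now falls back to DEFAULT_RETURN_TYPE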
datachain/lib/utils.py CHANGED
@@ -1,3 +1,4 @@
+import inspect
 import re
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
@@ -32,6 +33,25 @@ class DataChainColumnError(DataChainParamsError):
         super().__init__(f"Error for column {col_name}: {msg}")
 
 
+def callable_name(obj: object) -> str:
+    """Return a friendly name for a callable or UDF-like instance."""
+    # UDF classes in DataChain inherit from AbstractUDF; prefer class name
+    if isinstance(obj, AbstractUDF):
+        return obj.__class__.__name__
+
+    # Plain functions and bound/unbound methods
+    if inspect.ismethod(obj) or inspect.isfunction(obj):
+        # __name__ exists for functions/methods; includes "<lambda>" for lambdas
+        return obj.__name__  # type: ignore[attr-defined]
+
+    # Generic callable object
+    if callable(obj):
+        return obj.__class__.__name__
+
+    # Fallback for non-callables
+    return str(obj)
+
+
 def normalize_col_names(col_names: Sequence[str]) -> dict[str, str]:
     """Returns normalized_name -> original_name dict."""
     gen_col_counter = 0
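
A quick sketch of how `callable_name` behaves for the main shapes it handles (hypothetical names, assuming the definition above is in scope; the `AbstractUDF` branch is omitted since it needs a real UDF instance):

def my_mapper(file):
    return file

class Scorer:
    def __call__(self, x):
        return x

print(callable_name(my_mapper))       # 'my_mapper'
print(callable_name(lambda x: x))     # '<lambda>'
print(callable_name(Scorer()))        # 'Scorer' (generic callable object)
print(callable_name("not callable"))  # 'not callable' (str fallback)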
datachain/lib/video.py CHANGED
@@ -1,7 +1,6 @@
 import posixpath
 import shutil
 import tempfile
-from typing import Optional, Union
 
 from numpy import ndarray
 
@@ -18,7 +17,7 @@ except ImportError as exc:
     ) from exc
 
 
-def video_info(file: Union[File, VideoFile]) -> Video:
+def video_info(file: File | VideoFile) -> Video:
     """
     Returns video file information.
 
@@ -108,7 +107,7 @@ def video_frame_np(video: VideoFile, frame: int) -> ndarray:
 def validate_frame_range(
     video: VideoFile,
     start: int = 0,
-    end: Optional[int] = None,
+    end: int | None = None,
     step: int = 1,
 ) -> tuple[int, int, int]:
     """
@@ -186,7 +185,7 @@ def save_video_fragment(
     start: float,
     end: float,
     output: str,
-    format: Optional[str] = None,
+    format: str | None = None,
 ) -> VideoFile:
     """
     Saves video interval as a new video file. If output is a remote path,
datachain/lib/webdataset.py CHANGED
@@ -1,20 +1,13 @@
-import json
 import tarfile
+import types
 import warnings
-from collections.abc import Iterator, Sequence
+from collections.abc import Callable, Iterator, Sequence
 from pathlib import Path
-from typing import (
-    Any,
-    Callable,
-    ClassVar,
-    Optional,
-    Union,
-    get_args,
-    get_origin,
-)
+from typing import Any, ClassVar, Union, get_args, get_origin
 
 from pydantic import Field
 
+from datachain import json
 from datachain.lib.data_model import DataModel
 from datachain.lib.file import File
 from datachain.lib.tar import build_tar_member
64
57
 
65
58
 
66
59
  class WDSAllFile(WDSBasic):
67
- txt: Optional[str] = Field(default=None)
68
- text: Optional[str] = Field(default=None)
69
- cap: Optional[str] = Field(default=None)
70
- transcript: Optional[str] = Field(default=None)
71
- cls: Optional[int] = Field(default=None)
72
- cls2: Optional[int] = Field(default=None)
73
- index: Optional[int] = Field(default=None)
74
- inx: Optional[int] = Field(default=None)
75
- id: Optional[int] = Field(default=None)
76
- json: Optional[dict] = Field(default=None) # type: ignore[assignment]
77
- jsn: Optional[dict] = Field(default=None)
78
-
79
- pyd: Optional[bytes] = Field(default=None)
80
- pickle: Optional[bytes] = Field(default=None)
81
- pth: Optional[bytes] = Field(default=None)
82
- ten: Optional[bytes] = Field(default=None)
83
- tb: Optional[bytes] = Field(default=None)
84
- mp: Optional[bytes] = Field(default=None)
85
- msg: Optional[bytes] = Field(default=None)
86
- npy: Optional[bytes] = Field(default=None)
87
- npz: Optional[bytes] = Field(default=None)
88
- cbor: Optional[bytes] = Field(default=None)
60
+ txt: str | None = Field(default=None)
61
+ text: str | None = Field(default=None)
62
+ cap: str | None = Field(default=None)
63
+ transcript: str | None = Field(default=None)
64
+ cls: int | None = Field(default=None)
65
+ cls2: int | None = Field(default=None)
66
+ index: int | None = Field(default=None)
67
+ inx: int | None = Field(default=None)
68
+ id: int | None = Field(default=None)
69
+ json: dict | None = Field(default=None) # type: ignore[assignment]
70
+ jsn: dict | None = Field(default=None)
71
+
72
+ pyd: bytes | None = Field(default=None)
73
+ pickle: bytes | None = Field(default=None)
74
+ pth: bytes | None = Field(default=None)
75
+ ten: bytes | None = Field(default=None)
76
+ tb: bytes | None = Field(default=None)
77
+ mp: bytes | None = Field(default=None)
78
+ msg: bytes | None = Field(default=None)
79
+ npy: bytes | None = Field(default=None)
80
+ npz: bytes | None = Field(default=None)
81
+ cbor: bytes | None = Field(default=None)
89
82
 
90
83
 
91
84
  class WDSReadableSubclass(DataModel):
@@ -189,9 +182,11 @@ class Builder:
             return
 
         anno = field.annotation
-        if get_origin(anno) == Union:
-            args = get_args(anno)
-            anno = args[0]
+        anno_origin = get_origin(anno)
+        if anno_origin in (Union, types.UnionType):
+            anno_args = get_args(anno)
+            if len(anno_args) == 2 and type(None) in anno_args:
+                return anno_args[0] if anno_args[1] is type(None) else anno_args[1]
 
         return anno
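
The `Builder` change above is needed because `Optional[X]` spelled with PEP 604 syntax (`X | None`) has a different `get_origin` result than `typing.Optional`/`typing.Union`. A short demonstration of the two cases the new check covers (Python 3.10+):

import types
from typing import Optional, Union, get_args, get_origin

print(get_origin(Optional[str]) is Union)         # True  -- typing-style Optional
print(get_origin(str | None) is types.UnionType)  # True  -- PEP 604 union

# Both forms unwrap identically under the new check:
for anno in (Optional[str], str | None):
    args = get_args(anno)
    if get_origin(anno) in (Union, types.UnionType):
        if len(args) == 2 and type(None) in args:
            inner = args[0] if args[1] is type(None) else args[1]
            print(inner)  # <class 'str'> in both cases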