datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +4 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +276 -354
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +8 -3
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +10 -17
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +42 -27
- datachain/cli/commands/ls.py +15 -15
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/__init__.py +3 -43
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +34 -23
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +157 -0
- datachain/client/local.py +11 -7
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +2 -0
- datachain/data_storage/metastore.py +716 -137
- datachain/data_storage/schema.py +20 -27
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +114 -114
- datachain/data_storage/warehouse.py +140 -48
- datachain/dataset.py +109 -89
- datachain/delta.py +117 -42
- datachain/diff/__init__.py +25 -33
- datachain/error.py +24 -0
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +63 -45
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +18 -15
- datachain/lib/audio.py +60 -59
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/values_to_tuples.py +151 -53
- datachain/lib/data_model.py +23 -19
- datachain/lib/dataset_info.py +7 -7
- datachain/lib/dc/__init__.py +2 -1
- datachain/lib/dc/csv.py +22 -26
- datachain/lib/dc/database.py +37 -34
- datachain/lib/dc/datachain.py +518 -324
- datachain/lib/dc/datasets.py +38 -30
- datachain/lib/dc/hf.py +16 -20
- datachain/lib/dc/json.py +17 -18
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +33 -21
- datachain/lib/dc/records.py +9 -13
- datachain/lib/dc/storage.py +103 -65
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +17 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +187 -50
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +2 -3
- datachain/lib/model_store.py +20 -8
- datachain/lib/namespaces.py +59 -7
- datachain/lib/projects.py +51 -9
- datachain/lib/pytorch.py +31 -23
- datachain/lib/settings.py +188 -85
- datachain/lib/signal_schema.py +302 -64
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +103 -63
- datachain/lib/udf_signature.py +59 -34
- datachain/lib/utils.py +20 -0
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +31 -36
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +12 -5
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +22 -3
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +4 -4
- datachain/query/batch.py +10 -12
- datachain/query/dataset.py +376 -194
- datachain/query/dispatch.py +112 -84
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/queue.py +2 -1
- datachain/query/schema.py +7 -6
- datachain/query/session.py +190 -33
- datachain/query/udf.py +9 -6
- datachain/remote/studio.py +90 -53
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +37 -25
- datachain/sql/sqlite/types.py +1 -1
- datachain/sql/types.py +36 -5
- datachain/studio.py +49 -40
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +39 -48
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
- datachain-0.39.0.dist-info/RECORD +173 -0
- datachain/cli/commands/query.py +0 -54
- datachain/query/utils.py +0 -36
- datachain-0.30.5.dist-info/RECORD +0 -168
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/delta.py
CHANGED

```diff
@@ -1,17 +1,22 @@
 from collections.abc import Sequence
 from copy import copy
 from functools import wraps
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, TypeVar
 
 import datachain
-from datachain.dataset import DatasetDependency
-from datachain.error import DatasetNotFoundError
+from datachain.dataset import DatasetDependency, DatasetRecord
+from datachain.error import DatasetNotFoundError, SchemaDriftError
 from datachain.project import Project
+from datachain.query.dataset import UnionSchemaMismatchError
 
 if TYPE_CHECKING:
-    from
+    from collections.abc import Callable
+    from typing import Concatenate
+
+    from typing_extensions import ParamSpec
 
     from datachain.lib.dc import DataChain
+    from datachain.lib.signal_schema import SignalSchema
 
 P = ParamSpec("P")
 
@@ -30,9 +35,10 @@ def delta_disabled(
 
     @wraps(method)
     def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T:
-        if self.delta:
+        if self.delta and not self._delta_unsafe:
             raise NotImplementedError(
-                f"
+                f"Cannot use {method.__name__} with delta datasets - may cause"
+                " inconsistency. Use delta_unsafe flag to allow this operation."
             )
         return method(self, *args, **kwargs)
 
@@ -49,13 +55,55 @@ def _append_steps(dc: "DataChain", other: "DataChain"):
     return dc
 
 
+def _format_schema_drift_message(
+    context: str,
+    existing_schema: "SignalSchema",
+    updated_schema: "SignalSchema",
+) -> tuple[str, bool]:
+    missing_cols, new_cols = existing_schema.compare_signals(updated_schema)
+
+    if not new_cols and not missing_cols:
+        return "", False
+
+    parts: list[str] = []
+    if new_cols:
+        parts.append("new columns detected: " + ", ".join(sorted(new_cols)))
+    if missing_cols:
+        parts.append(
+            "columns missing in updated data: " + ", ".join(sorted(missing_cols))
+        )
+
+    details = "; ".join(parts)
+    message = f"Delta update failed: schema drift detected while {context}: {details}."
+
+    return message, True
+
+
+def _safe_union(
+    left: "DataChain",
+    right: "DataChain",
+    context: str,
+) -> "DataChain":
+    try:
+        return left.union(right)
+    except UnionSchemaMismatchError as exc:
+        message, has_drift = _format_schema_drift_message(
+            context,
+            left.signals_schema,
+            right.signals_schema,
+        )
+        if has_drift:
+            raise SchemaDriftError(message) from exc
+        raise
+
+
 def _get_delta_chain(
     source_ds_name: str,
     source_ds_project: Project,
     source_ds_version: str,
     source_ds_latest_version: str,
-    on: Union[str, Sequence[str]],
-    compare: Optional[Union[str, Sequence[str]]] = None,
+    on: str | Sequence[str],
+    compare: str | Sequence[str] | None = None,
 ) -> "DataChain":
     """Get delta chain for processing changes between versions."""
     source_dc = datachain.read_dataset(
@@ -83,11 +131,11 @@ def _get_retry_chain(
     source_ds_name: str,
     source_ds_project: Project,
     source_ds_version: str,
-    on: Union[str, Sequence[str]],
-    right_on: Optional[Union[str, Sequence[str]]],
-    delta_retry: Optional[Union[bool, str]],
+    on: str | Sequence[str],
+    right_on: str | Sequence[str] | None,
+    delta_retry: bool | str | None,
     diff_chain: "DataChain",
-) -> Optional["DataChain"]:
+) -> "DataChain | None":
     """Get retry chain for processing error records and missing records."""
     # Import here to avoid circular import
     from datachain.lib.dc import C
@@ -113,7 +161,9 @@ def _get_retry_chain(
         error_records = result_dataset.filter(C(delta_retry) != "")
         error_source_records = source_dc.merge(
             error_records, on=on, right_on=right_on, inner=True
-        ).select(
+        ).select(
+            *list(source_dc.signals_schema.clone_without_sys_signals().values.keys())
+        )
         retry_chain = error_source_records
 
     # Handle missing records if delta_retry is True
@@ -124,21 +174,30 @@ def _get_retry_chain(
     # Subtract also diff chain since some items might be picked
     # up by `delta=True` itself (e.g. records got modified AND are missing in the
     # result dataset atm)
-
+    on = [on] if isinstance(on, str) else on
+
+    return (
+        retry_chain.diff(
+            diff_chain, on=on, added=True, same=True, modified=False, deleted=False
+        ).distinct(*on)
+        if retry_chain
+        else None
+    )
 
 
 def _get_source_info(
+    source_ds: DatasetRecord,
     name: str,
     namespace_name: str,
     project_name: str,
     latest_version: str,
     catalog,
 ) -> tuple[
-    Optional[str],
-    Optional[Project],
-    Optional[str],
-    Optional[str],
-    Optional[list[DatasetDependency]],
+    str | None,
+    Project | None,
+    str | None,
+    str | None,
+    list[DatasetDependency] | None,
 ]:
     """Get source dataset information and dependencies.
 
@@ -154,25 +213,25 @@ def _get_source_info(
         indirect=False,
     )
 
-
-
+    source_ds_dep = next(
+        (d for d in dependencies if d and d.name == source_ds.name), None
+    )
+    if not source_ds_dep:
         # Starting dataset was removed, back off to normal dataset creation
         return None, None, None, None, None
 
-
-
-
-
-
-
-        project_name=source_ds_project.name,
-    ).latest_version
+    # Refresh starting dataset to have new versions if they are created
+    source_ds = catalog.get_dataset(
+        source_ds.name,
+        namespace_name=source_ds.project.namespace.name,
+        project_name=source_ds.project.name,
+    )
 
     return (
-
-
-
-
+        source_ds.name,
+        source_ds.project,
+        source_ds_dep.version,
+        source_ds.latest_version,
         dependencies,
     )
 
@@ -182,11 +241,11 @@ def delta_retry_update(
     namespace_name: str,
     project_name: str,
     name: str,
-    on: Union[str, Sequence[str]],
-    right_on: Optional[Union[str, Sequence[str]]] = None,
-    compare: Optional[Union[str, Sequence[str]]] = None,
-    delta_retry: Optional[Union[bool, str]] = None,
-) -> tuple[
+    on: str | Sequence[str],
+    right_on: str | Sequence[str] | None = None,
+    compare: str | Sequence[str] | None = None,
+    delta_retry: bool | str | None = None,
+) -> tuple["DataChain | None", list[DatasetDependency] | None, bool]:
     """
     Creates new chain that consists of the last version of current delta dataset
     plus diff from the source with all needed modifications.
@@ -244,7 +303,14 @@ def delta_retry_update(
         source_ds_version,
         source_ds_latest_version,
         dependencies,
-    ) = _get_source_info(
+    ) = _get_source_info(
+        dc._query.starting_step.dataset,  # type: ignore[union-attr]
+        name,
+        namespace_name,
+        project_name,
+        latest_version,
+        catalog,
+    )
 
     # If source_ds_name is None, starting dataset was removed
     if source_ds_name is None:
@@ -267,8 +333,9 @@ def delta_retry_update(
     if dependencies:
         dependencies = copy(dependencies)
         dependencies = [d for d in dependencies if d is not None]
+        source_ds_dep = next(d for d in dependencies if d.name == source_ds_name)
         # Update to latest version
-
+        source_ds_dep.version = source_ds_latest_version  # type: ignore[union-attr]
 
     # Handle retry functionality if enabled
     if delta_retry:
@@ -288,7 +355,11 @@ def delta_retry_update(
 
     # Combine delta and retry chains
     if retry_chain is not None:
-        processing_chain = diff_chain.union(retry_chain)
+        processing_chain = _safe_union(
+            diff_chain,
+            retry_chain,
+            context="combining retry records with delta changes",
+        )
     else:
         processing_chain = diff_chain
 
@@ -312,5 +383,9 @@ def delta_retry_update(
         modified=False,
         deleted=False,
     )
-    result_chain = compared_chain.union(processing_chain)
+    result_chain = _safe_union(
+        compared_chain,
+        processing_chain,
+        context="merging the delta output with the existing dataset version",
+    )
     return result_chain, dependencies, True
```
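
Taken together, these changes give delta processing an explicit failure mode: both union points now go through `_safe_union`, so schema drift surfaces as a dedicated `SchemaDriftError` instead of a generic union mismatch, and destructive chain operations are gated behind the new `delta_unsafe` flag. A minimal sketch of how calling code might react (the storage URI and dataset name are placeholders, and the `delta`/`delta_on` keywords are assumed from the public `read_storage` delta API rather than shown in this diff):

```python
import datachain as dc
from datachain.error import SchemaDriftError  # new in this release

try:
    # Hypothetical delta run: only files added or changed since the last
    # saved version of "processed" are reprocessed.
    dc.read_storage("s3://my-bucket/data/", delta=True, delta_on="file.path").save(
        "processed"
    )
except SchemaDriftError as exc:
    # Raised via _safe_union() when the delta output adds or drops columns
    # relative to the existing dataset version.
    print(f"Delta update aborted: {exc}")
```
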
datachain/diff/__init__.py
CHANGED

```diff
@@ -1,8 +1,6 @@
-import random
-import string
 from collections.abc import Sequence
 from enum import Enum
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 from datachain.func import case, ifelse, isnone, or_
 from datachain.lib.signal_schema import SignalSchema
@@ -11,16 +9,12 @@ from datachain.query.schema import Column
 if TYPE_CHECKING:
     from datachain.lib.dc import DataChain
 
-
 C = Column
 
 
-
-
-
-        random.choice(string.ascii_letters)  # noqa: S311
-        for _ in range(10)
-    )
+STATUS_COL_NAME = "diff_7aeed3aa17ba4d50b8d1c368c76e16a6"
+LEFT_DIFF_COL_NAME = "diff_95f95344064a4b819c8625cd1a5cfc2b"
+RIGHT_DIFF_COL_NAME = "diff_5808838a49b54849aa461d7387376d34"
 
 
 class CompareStatus(str, Enum):
@@ -30,25 +24,25 @@ class CompareStatus(str, Enum):
     SAME = "S"
 
 
-def _compare(  # noqa: C901, PLR0912
+def _compare(  # noqa: C901
     left: "DataChain",
     right: "DataChain",
-    on: Union[str, Sequence[str]],
-    right_on: Optional[Union[str, Sequence[str]]] = None,
-    compare: Optional[Union[str, Sequence[str]]] = None,
-    right_compare: Optional[Union[str, Sequence[str]]] = None,
+    on: str | Sequence[str],
+    right_on: str | Sequence[str] | None = None,
+    compare: str | Sequence[str] | None = None,
+    right_compare: str | Sequence[str] | None = None,
     added: bool = True,
     deleted: bool = True,
     modified: bool = True,
     same: bool = True,
-    status_col: Optional[str] = None,
+    status_col: str | None = None,
 ) -> "DataChain":
     """Comparing two chains by identifying rows that are added, deleted, modified
     or same"""
     rname = "right_"
     schema = left.signals_schema  # final chain must have schema from left chain
 
-    def _to_list(obj: Optional[Union[str, Sequence[str]]]) -> Optional[list[str]]:
+    def _to_list(obj: str | Sequence[str] | None) -> list[str] | None:
         if obj is None:
             return None
         return [obj] if isinstance(obj, str) else list(obj)
@@ -101,21 +95,23 @@ def _compare(  # noqa: C901, PLR0912
     compare = right_compare = [c for c in cols if c in right_cols and c not in on]  # type: ignore[misc]
 
     # get diff column names
-    diff_col = status_col or
-    ldiff_col =
-    rdiff_col =
+    diff_col = status_col or STATUS_COL_NAME
+    ldiff_col = LEFT_DIFF_COL_NAME
+    rdiff_col = RIGHT_DIFF_COL_NAME
 
     # adding helper diff columns, which will be removed after
     left = left.mutate(**{ldiff_col: 1})
     right = right.mutate(**{rdiff_col: 1})
 
-    if not compare:
+    if compare is None:
         modified_cond = True
+    elif len(compare) == 0:
+        modified_cond = False
     else:
         modified_cond = or_(  # type: ignore[assignment]
             *[
                 C(c) != (C(f"{rname}{rc}") if c == rc else C(rc))
-                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
+                for c, rc in zip(compare, right_compare, strict=False)  # type: ignore[arg-type]
             ]
         )
 
@@ -139,7 +135,7 @@ def _compare(  # noqa: C901, PLR0912
                     C(f"{rname + l_on if on == right_on else r_on}"),
                     C(l_on),
                 )
-                for l_on, r_on in zip(on, right_on)  # type: ignore[arg-type]
+                for l_on, r_on in zip(on, right_on, strict=False)  # type: ignore[arg-type]
             }
         )
         .select_except(ldiff_col, rdiff_col)
@@ -157,11 +153,7 @@ def _compare(  # noqa: C901, PLR0912
     if status_col:
         cols_select.append(diff_col)
 
-
-        # TODO workaround when sys signal is not available in diff
-        dc_diff = dc_diff.settings(sys=True).select(*cols_select).settings(sys=False)
-    else:
-        dc_diff = dc_diff.select(*cols_select)
+    dc_diff = dc_diff.select(*cols_select)
 
     # final schema is schema from the left chain with status column added if needed
     dc_diff.signals_schema = (
@@ -174,10 +166,10 @@ def _compare(  # noqa: C901, PLR0912
 def compare_and_split(
     left: "DataChain",
     right: "DataChain",
-    on: Union[str, Sequence[str]],
-    right_on: Optional[Union[str, Sequence[str]]] = None,
-    compare: Optional[Union[str, Sequence[str]]] = None,
-    right_compare: Optional[Union[str, Sequence[str]]] = None,
+    on: str | Sequence[str],
+    right_on: str | Sequence[str] | None = None,
+    compare: str | Sequence[str] | None = None,
+    right_compare: str | Sequence[str] | None = None,
     added: bool = True,
     deleted: bool = True,
     modified: bool = True,
@@ -227,7 +219,7 @@ def compare_and_split(
     )
     ```
     """
-    status_col =
+    status_col = STATUS_COL_NAME
 
     res = _compare(
         left,
```
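
Replacing per-run random suffixes with fixed, UUID-style helper column names makes the diff internals deterministic across runs and processes. A short sketch of the `DataChain.diff` entry point these helpers back (the input values are illustrative; when `status_col` is omitted, the fixed internal `STATUS_COL_NAME` is used and dropped from the final selection):

```python
import datachain as dc

left = dc.read_values(id=[1, 2, 3], value=["a", "b", "c"])
right = dc.read_values(id=[1, 2, 4], value=["a", "B", "d"])

# Keep the added/deleted/modified/same status in an explicit column.
changes = left.diff(right, on="id", status_col="status")
```
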
datachain/error.py
CHANGED

```diff
@@ -2,6 +2,10 @@ class DataChainError(RuntimeError):
     pass
 
 
+class SchemaDriftError(DataChainError):
+    pass
+
+
 class InvalidDatasetNameError(RuntimeError):
     pass
 
@@ -34,6 +38,14 @@ class ProjectCreateNotAllowedError(NotAllowedError):
     pass
 
 
+class ProjectDeleteNotAllowedError(NotAllowedError):
+    pass
+
+
+class NamespaceDeleteNotAllowedError(NotAllowedError):
+    pass
+
+
 class ProjectNotFoundError(NotFoundError):
     pass
 
@@ -89,3 +101,15 @@ class TableMissingError(DataChainError):
 
 class OutdatedDatabaseSchemaError(DataChainError):
     pass
+
+
+class CheckpointNotFoundError(NotFoundError):
+    pass
+
+
+class JobNotFoundError(NotFoundError):
+    pass
+
+
+class JobAncestryDepthExceededError(DataChainError):
+    pass
```
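
All of the new exceptions slot into the existing hierarchy, so broad handlers keep working and only code that wants to special-case the new failure modes needs changes. A quick illustration (assuming `NotFoundError` and `NotAllowedError` are the pre-existing bases this module already defines, as the diff context indicates):

```python
from datachain.error import (
    CheckpointNotFoundError,
    DataChainError,
    JobNotFoundError,
    NotFoundError,
    SchemaDriftError,
)

# Existing `except NotFoundError` blocks also catch the new lookup errors.
assert issubclass(CheckpointNotFoundError, NotFoundError)
assert issubclass(JobNotFoundError, NotFoundError)

# Schema drift is a DataChainError, which is itself a RuntimeError.
assert issubclass(SchemaDriftError, DataChainError)
assert issubclass(SchemaDriftError, RuntimeError)
```
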
datachain/func/aggregate.py
CHANGED

```diff
@@ -1,5 +1,3 @@
-from typing import Optional, Union
-
 from sqlalchemy import func as sa_func
 
 from datachain.query.schema import Column
@@ -8,7 +6,7 @@ from datachain.sql.functions import aggregate
 from .func import Func
 
 
-def count(col: Optional[Union[str, Column]] = None) -> Func:
+def count(col: str | Column | None = None) -> Func:
     """
     Returns a COUNT aggregate SQL function for the specified column.
 
@@ -44,7 +42,7 @@ def count(col: Optional[Union[str, Column]] = None) -> Func:
     )
 
 
-def sum(col: Union[str, Column]) -> Func:
+def sum(col: str | Column) -> Func:
     """
     Returns the SUM aggregate SQL function for the specified column.
 
@@ -74,7 +72,7 @@ def sum(col: Union[str, Column]) -> Func:
     return Func("sum", inner=sa_func.sum, cols=[col])
 
 
-def avg(col: Union[str, Column]) -> Func:
+def avg(col: str | Column) -> Func:
     """
     Returns the AVG aggregate SQL function for the specified column.
 
@@ -104,7 +102,7 @@ def avg(col: Union[str, Column]) -> Func:
     return Func("avg", inner=aggregate.avg, cols=[col], result_type=float)
 
 
-def min(col: Union[str, Column]) -> Func:
+def min(col: str | Column) -> Func:
     """
     Returns the MIN aggregate SQL function for the specified column.
 
@@ -134,7 +132,7 @@ def min(col: Union[str, Column]) -> Func:
     return Func("min", inner=sa_func.min, cols=[col])
 
 
-def max(col: Union[str, Column]) -> Func:
+def max(col: str | Column) -> Func:
     """
     Returns the MAX aggregate SQL function for the given column name.
 
@@ -164,7 +162,7 @@ def max(col: Union[str, Column]) -> Func:
    return Func("max", inner=sa_func.max, cols=[col])
 
 
-def any_value(col: Union[str, Column]) -> Func:
+def any_value(col: str | Column) -> Func:
     """
     Returns the ANY_VALUE aggregate SQL function for the given column name.
 
@@ -198,7 +196,7 @@ def any_value(col: Union[str, Column]) -> Func:
     return Func("any_value", inner=aggregate.any_value, cols=[col])
 
 
-def collect(col: Union[str, Column]) -> Func:
+def collect(col: str | Column) -> Func:
     """
     Returns the COLLECT aggregate SQL function for the given column name.
 
@@ -229,7 +227,7 @@ def collect(col: Union[str, Column]) -> Func:
     return Func("collect", inner=aggregate.collect, cols=[col], is_array=True)
 
 
-def concat(col: Union[str, Column], separator="") -> Func:
+def concat(col: str | Column, separator="") -> Func:
     """
     Returns the CONCAT aggregate SQL function for the given column name.
 
@@ -348,7 +346,7 @@ def dense_rank() -> Func:
     return Func("dense_rank", inner=sa_func.dense_rank, result_type=int, is_window=True)
 
 
-def first(col: Union[str, Column]) -> Func:
+def first(col: str | Column) -> Func:
     """
     Returns the FIRST_VALUE window function for SQL queries.
 
```
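
The aggregate changes are mechanical: `Optional[...]`/`Union[...]` annotations become PEP 604 unions, so the accepted argument types are unchanged and call sites need no updates. For reference, a typical use of these functions in a group-by (values are illustrative):

```python
import datachain as dc
from datachain import func

chain = dc.read_values(team=["a", "a", "b"], score=[1, 2, 5])
summary = chain.group_by(
    total=func.sum("score"),
    mean=func.avg("score"),
    rows=func.count(),
    partition_by="team",
)
```
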
datachain/func/array.py
CHANGED

```diff
@@ -1,5 +1,5 @@
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any
 
 from datachain.query.schema import Column
 from datachain.sql.functions import array
@@ -7,7 +7,7 @@ from datachain.sql.functions import array
 from .func import Func
 
 
-def cosine_distance(*args: Union[str, Column, Func, Sequence]) -> Func:
+def cosine_distance(*args: str | Column | Func | Sequence) -> Func:
     """
     Returns the cosine distance between two vectors.
 
@@ -62,7 +62,7 @@ def cosine_distance(*args: Union[str, Column, Func, Sequence]) -> Func:
     )
 
 
-def euclidean_distance(*args: Union[str, Column, Func, Sequence]) -> Func:
+def euclidean_distance(*args: str | Column | Func | Sequence) -> Func:
     """
     Returns the Euclidean distance between two vectors.
 
@@ -115,7 +115,7 @@ def euclidean_distance(*args: Union[str, Column, Func, Sequence]) -> Func:
     )
 
 
-def length(arg: Union[str, Column, Func, Sequence]) -> Func:
+def length(arg: str | Column | Func | Sequence) -> Func:
     """
     Returns the length of the array.
 
@@ -151,7 +151,7 @@ def length(arg: Union[str, Column, Func, Sequence]) -> Func:
     return Func("length", inner=array.length, cols=cols, args=args, result_type=int)
 
 
-def contains(arr: Union[str, Column, Func, Sequence], elem: Any) -> Func:
+def contains(arr: str | Column | Func | Sequence, elem: Any) -> Func:
     """
     Checks whether the array contains the specified element.
 
@@ -196,9 +196,9 @@ def contains(arr: Union[str, Column, Func, Sequence], elem: Any) -> Func:
 
 
 def slice(
-    arr: Union[str, Column, Func, Sequence],
+    arr: str | Column | Func | Sequence,
     offset: int,
-    length: Optional[int] = None,
+    length: int | None = None,
 ) -> Func:
     """
     Returns a slice of the array starting from the specified offset.
@@ -272,7 +272,7 @@ def slice(
 
 
 def join(
-    arr: Union[str, Column, Func, Sequence],
+    arr: str | Column | Func | Sequence,
     sep: str = "",
 ) -> Func:
     """
@@ -322,7 +322,7 @@ def join(
     )
 
 
-def get_element(arg: Union[str, Column, Func, Sequence], index: int) -> Func:
+def get_element(arg: str | Column | Func | Sequence, index: int) -> Func:
     """
     Returns the element at the given index from the array.
     If the index is out of bounds, it returns None or columns default value.
@@ -359,8 +359,8 @@ def get_element(arg: Union[str, Column, Func, Sequence], index: int) -> Func:
             return str  # if the array is empty, return str as default type
         return None
 
-    cols: Optional[Union[str, Column, Func, Sequence]]
-    args: Union[str, Column, Func, Sequence, int]
+    cols: str | Column | Func | Sequence | None
+    args: str | Column | Func | Sequence | int
 
     if isinstance(arg, (str, Column, Func)):
         cols = [arg]
@@ -379,7 +379,7 @@ def get_element(arg: Union[str, Column, Func, Sequence], index: int) -> Func:
     )
 
 
-def sip_hash_64(arg: Union[str, Column, Func, Sequence]) -> Func:
+def sip_hash_64(arg: str | Column | Func | Sequence) -> Func:
     """
     Returns the SipHash-64 hash of the array.
 
```
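
Same mechanical modernization here; the array helpers behave exactly as before. A small usage sketch (the column name and vectors are made up):

```python
import datachain as dc
from datachain.func import array

chain = dc.read_values(emb=[[0.1, 0.2], [0.9, 0.8]])
chain = chain.mutate(
    emb_len=array.length("emb"),
    first=array.get_element("emb", 0),
    dist=array.cosine_distance("emb", [0.1, 0.2]),
)
```
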
datachain/func/base.py
CHANGED

```diff
@@ -1,5 +1,6 @@
 from abc import ABCMeta, abstractmethod
-from typing import TYPE_CHECKING, Optional
+from collections.abc import Sequence
+from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     from sqlalchemy import TableClause
@@ -12,12 +13,14 @@ class Function:
     __metaclass__ = ABCMeta
 
     name: str
+    cols: Sequence
+    args: Sequence
 
     @abstractmethod
     def get_column(
         self,
-        signals_schema: Optional["SignalSchema"] = None,
-        label: Optional[str] = None,
-        table: Optional["TableClause"] = None,
+        signals_schema: "SignalSchema | None" = None,
+        label: str | None = None,
+        table: "TableClause | None" = None,
     ) -> "Column":
         pass
```