PyPI - datachain - Versions diffs - 0.8.13__py3-none-any.whl → 0.9.1__py3-none-any.whl - Mend

datachain 0.8.13__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (30) hide show

datachain/__init__.py +10 -0
datachain/catalog/catalog.py +32 -9
datachain/cli/__init__.py +2 -0
datachain/cli/commands/datasets.py +78 -12
datachain/cli/parser/__init__.py +62 -12
datachain/cli/parser/job.py +14 -4
datachain/cli/parser/studio.py +8 -0
datachain/cli/parser/utils.py +20 -1
datachain/dataset.py +7 -4
datachain/diff/__init__.py +78 -128
datachain/fs/reference.py +21 -0
datachain/func/__init__.py +3 -1
datachain/func/conditional.py +66 -2
datachain/job.py +1 -1
datachain/lib/arrow.py +1 -11
datachain/lib/dc.py +2 -0
datachain/lib/file.py +298 -8
datachain/lib/hf.py +1 -1
datachain/lib/video.py +223 -0
datachain/query/dataset.py +28 -3
datachain/remote/studio.py +13 -6
datachain/studio.py +34 -12
datachain/utils.py +12 -2
{datachain-0.8.13.dist-info → datachain-0.9.1.dist-info}/METADATA +13 -5
{datachain-0.8.13.dist-info → datachain-0.9.1.dist-info}/RECORD +30 -28
/datachain/{lib/vfile.py → fs/__init__.py} +0 -0
{datachain-0.8.13.dist-info → datachain-0.9.1.dist-info}/LICENSE +0 -0
{datachain-0.8.13.dist-info → datachain-0.9.1.dist-info}/WHEEL +0 -0
{datachain-0.8.13.dist-info → datachain-0.9.1.dist-info}/entry_points.txt +0 -0
{datachain-0.8.13.dist-info → datachain-0.9.1.dist-info}/top_level.txt +0 -0

datachain/diff/__init__.py CHANGED Viewed

@@ -4,11 +4,9 @@ from collections.abc import Sequence
 from enum import Enum
 from typing import TYPE_CHECKING, Optional, Union
-import sqlalchemy as sa
+from datachain.func import case, ifelse, isnone, or_
 from datachain.lib.signal_schema import SignalSchema
 from datachain.query.schema import Column
-from datachain.sql.types import String
 if TYPE_CHECKING:
     from datachain.lib.dc import DataChain
@@ -32,7 +30,7 @@ class CompareStatus(str, Enum):
     SAME = "S"
-def _compare(  # noqa: PLR0912, PLR0915, C901
+def _compare(  # noqa: C901
     left: "DataChain",
     right: "DataChain",
     on: Union[str, Sequence[str]],
@@ -47,63 +45,46 @@ def _compare(  # noqa: PLR0912, PLR0915, C901
 ) -> "DataChain":
     """Comparing two chains by identifying rows that are added, deleted, modified
     or same"""
-    dialect = left._query.dialect
     rname = "right_"
+    schema = left.signals_schema  # final chain must have schema from left chain
-    def _rprefix(c: str, rc: str) -> str:
-        """Returns prefix of right of two companion left - right columns
-        from merge. If companion columns have the same name then prefix will
-        be present in right column name, otherwise it won't.
-        """
-        return rname if c == rc else ""
-    def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:
+    def _to_list(obj: Optional[Union[str, Sequence[str]]]) -> Optional[list[str]]:
+        if obj is None:
+            return None
         return [obj] if isinstance(obj, str) else list(obj)
-    if on is None:
-        raise ValueError("'on' must be specified")
-    on = _to_list(on)
-    if right_on:
-        right_on = _to_list(right_on)
-        if len(on) != len(right_on):
-            raise ValueError("'on' and 'right_on' must be have the same length")
-    if compare:
-        compare = _to_list(compare)
-    if right_compare:
-        if not compare:
-            raise ValueError("'compare' must be defined if 'right_compare' is defined")
-        right_compare = _to_list(right_compare)
-        if len(compare) != len(right_compare):
-            raise ValueError(
-                "'compare' and 'right_compare' must be have the same length"
-            )
+    on = _to_list(on)  # type: ignore[assignment]
+    right_on = _to_list(right_on)
+    compare = _to_list(compare)
+    right_compare = _to_list(right_compare)
     if not any([added, deleted, modified, same]):
         raise ValueError(
             "At least one of added, deleted, modified, same flags must be set"
         )
-    need_status_col = bool(status_col)
-    # we still need status column for internal implementation even if not
-    # needed in the output
-    status_col = status_col or get_status_col_name()
-    # calculate on and compare column names
-    right_on = right_on or on
+    if on is None:
+        raise ValueError("'on' must be specified")
+    if right_on and len(on) != len(right_on):
+        raise ValueError("'on' and 'right_on' must be have the same length")
+    if right_compare and not compare:
+        raise ValueError("'compare' must be defined if 'right_compare' is defined")
+    if compare and right_compare and len(compare) != len(right_compare):
+        raise ValueError("'compare' and 'right_compare' must have the same length")
+    # all left and right columns
     cols = left.signals_schema.clone_without_sys_signals().db_signals()
     right_cols = right.signals_schema.clone_without_sys_signals().db_signals()
+    # getting correct on and right_on column names
     on = left.signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
-    right_on = right.signals_schema.resolve(*right_on).db_signals()  # type: ignore[assignment]
+    right_on = right.signals_schema.resolve(*(right_on or on)).db_signals()  # type: ignore[assignment]
+    # getting correct compare and right_compare column names if they are defined
     if compare:
-        right_compare = right_compare or compare
         compare = left.signals_schema.resolve(*compare).db_signals()  # type: ignore[assignment]
-        right_compare = right.signals_schema.resolve(*right_compare).db_signals()  # type: ignore[assignment]
+        right_compare = right.signals_schema.resolve(
+            *(right_compare or compare)
+        ).db_signals()  # type: ignore[assignment]
     elif not compare and len(cols) != len(right_cols):
         # here we will mark all rows that are not added or deleted as modified since
         # there was no explicit list of compare columns provided (meaning we need
@@ -113,103 +94,72 @@ def _compare(  # noqa: PLR0912, PLR0915, C901
         compare = None
         right_compare = None
     else:
-        compare = [c for c in cols if c in right_cols]  # type: ignore[misc, assignment]
-        right_compare = compare
+        # we are checking all columns as explicit compare is not defined
+        compare = right_compare = [c for c in cols if c in right_cols and c not in on]  # type: ignore[misc]
-    diff_cond = []
+    # get diff column names
+    diff_col = status_col or get_status_col_name()
+    ldiff_col = get_status_col_name()
+    rdiff_col = get_status_col_name()
-    if added:
-        added_cond = sa.and_(
-            *[
-                C(c) == None  # noqa: E711
-                for c in [f"{_rprefix(c, rc)}{rc}" for c, rc in zip(on, right_on)]
-            ]
-        )
-        diff_cond.append((added_cond, CompareStatus.ADDED))
-    if modified and compare:
-        modified_cond = sa.or_(
-            *[
-                C(c) != C(f"{_rprefix(c, rc)}{rc}")
-                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
-            ]
-        )
-        diff_cond.append((modified_cond, CompareStatus.MODIFIED))
-    if same and compare:
-        same_cond = sa.and_(
+    # adding helper diff columns, which will be removed after
+    left = left.mutate(**{ldiff_col: 1})
+    right = right.mutate(**{rdiff_col: 1})
+    if not compare:
+        modified_cond = True
+    else:
+        modified_cond = or_(  # type: ignore[assignment]
             *[
-                C(c) == C(f"{_rprefix(c, rc)}{rc}")
+                C(c) != (C(f"{rname}{rc}") if c == rc else C(rc))
                 for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
             ]
         )
-        diff_cond.append((same_cond, CompareStatus.SAME))
-    diff = sa.case(*diff_cond, else_=None if compare else CompareStatus.MODIFIED).label(
-        status_col
-    )
-    diff.type = String()
-    left_right_merge = left.merge(
-        right, on=on, right_on=right_on, inner=False, rname=rname
-    )
-    left_right_merge_select = left_right_merge._query.select(
-        *(
-            [C(c) for c in left_right_merge.signals_schema.db_signals("sys")]
-            + [C(c) for c in on]
-            + [C(c) for c in cols if c not in on]
-            + [diff]
-        )
-    )
-    diff_col = sa.literal(CompareStatus.DELETED).label(status_col)
-    diff_col.type = String()
-    right_left_merge = right.merge(
-        left, on=right_on, right_on=on, inner=False, rname=rname
-    ).filter(
-        sa.and_(
-            *[C(f"{_rprefix(c, rc)}{c}") == None for c, rc in zip(on, right_on)]  # noqa: E711
+    dc_diff = (
+        left.merge(right, on=on, right_on=right_on, rname=rname, full=True)
+        .mutate(
+            **{
+                diff_col: case(
+                    (isnone(ldiff_col), CompareStatus.DELETED),
+                    (isnone(rdiff_col), CompareStatus.ADDED),
+                    (modified_cond, CompareStatus.MODIFIED),
+                    else_=CompareStatus.SAME,
+                )
+            }
         )
-    )
-    def _default_val(chain: "DataChain", col: str):
-        col_type = chain._query.column_types[col]  # type: ignore[index]
-        val = sa.literal(col_type.default_value(dialect)).label(col)
-        val.type = col_type()
-        return val
-    right_left_merge_select = right_left_merge._query.select(
-        *(
-            [C(c) for c in right_left_merge.signals_schema.db_signals("sys")]
-            + [
-                C(c) if c == rc else _default_val(left, c)
-                for c, rc in zip(on, right_on)
-            ]
-            + [
-                C(c) if c in right_cols else _default_val(left, c)  # type: ignore[arg-type]
-                for c in cols
-                if c not in on
-            ]
-            + [diff_col]
+        # when the row is deleted, we need to take column values from the right chain
+        .mutate(
+            **{
+                f"{c}": ifelse(
+                    C(diff_col) == CompareStatus.DELETED, C(f"{rname}{c}"), C(c)
+                )
+                for c in [c for c in cols if c in right_cols]
+            }
         )
+        .select_except(ldiff_col, rdiff_col)
     )
+    if not added:
+        dc_diff = dc_diff.filter(C(diff_col) != CompareStatus.ADDED)
+    if not modified:
+        dc_diff = dc_diff.filter(C(diff_col) != CompareStatus.MODIFIED)
+    if not same:
+        dc_diff = dc_diff.filter(C(diff_col) != CompareStatus.SAME)
     if not deleted:
-        res = left_right_merge_select
-    elif deleted and not any([added, modified, same]):
-        res = right_left_merge_select
-    else:
-        res = left_right_merge_select.union(right_left_merge_select)
+        dc_diff = dc_diff.filter(C(diff_col) != CompareStatus.DELETED)
-    res = res.filter(C(status_col) != None)  # noqa: E711
+    if status_col:
+        cols.append(diff_col)  # type: ignore[arg-type]
-    schema = left.signals_schema
-    if need_status_col:
-        res = res.select()
-        schema = SignalSchema({status_col: str}) | schema
-    else:
-        res = res.select_except(C(status_col))
+    dc_diff = dc_diff.select(*cols)
+    # final schema is schema from the left chain with status column added if needed
+    dc_diff.signals_schema = (
+        schema if not status_col else SignalSchema({status_col: str}) | schema
+    )
-    return left._evolve(query=res, signal_schema=schema)
+    return dc_diff
 def compare_and_split(

datachain/fs/reference.py ADDED Viewed

@@ -0,0 +1,21 @@
+import fsspec
+from packaging.version import Version, parse
+# fsspec==2025.2.0 added support for a proper `open()` in `ReferenceFileSystem`.
+# Remove this module when `fsspec` minimum version requirement can be bumped.
+# Only fsspec versions older than 2025.2.0 get the patched subclass below;
+# otherwise the upstream class is re-exported unchanged (see the else branch).
+if parse(fsspec.__version__) < Version("2025.2.0"):
+    from fsspec.core import split_protocol
+    from fsspec.implementations import reference
+    class ReferenceFileSystem(reference.ReferenceFileSystem):
+        def _open(self, path, mode="rb", *args, **kwargs):
+            # overriding because `fsspec`'s `ReferenceFileSystem._open`
+            # reads the whole file in-memory.
+            # The reference entry for `path` is unpacked as exactly one item
+            # (the target URI); the unpacking fails for any other length.
+            (uri,) = self.references[path]
+            protocol, _ = split_protocol(uri)
+            # Delegate to the filesystem stored in `self.fss` under the URI's
+            # protocol, forwarding mode and any extra arguments untouched.
+            return self.fss[protocol].open(uri, mode, *args, **kwargs)
+else:
+    from fsspec.implementations.reference import ReferenceFileSystem  # type: ignore[no-redef]  # noqa: I001
+__all__ = ["ReferenceFileSystem"]

datachain/func/__init__.py CHANGED Viewed

@@ -16,13 +16,14 @@ from .aggregate import (
     sum,
 )
 from .array import contains, cosine_distance, euclidean_distance, length, sip_hash_64
-from .conditional import case, greatest, ifelse, isnone, least
+from .conditional import and_, case, greatest, ifelse, isnone, least, or_
 from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
 from .random import rand
 from .string import byte_hamming_distance
 from .window import window
 __all__ = [
+    "and_",
     "any_value",
     "array",
     "avg",
@@ -49,6 +50,7 @@ __all__ = [
     "literal",
     "max",
     "min",
+    "or_",
     "path",
     "rand",
     "random",

datachain/func/conditional.py CHANGED Viewed

@@ -1,7 +1,9 @@
 from typing import Optional, Union
 from sqlalchemy import ColumnElement
+from sqlalchemy import and_ as sql_and
 from sqlalchemy import case as sql_case
+from sqlalchemy import or_ as sql_or
 from datachain.lib.utils import DataChainParamsError
 from datachain.query.schema import Column
@@ -89,7 +91,7 @@ def least(*args: Union[ColT, float]) -> Func:
 def case(
-    *args: tuple[Union[ColumnElement, Func], CaseT], else_: Optional[CaseT] = None
+    *args: tuple[Union[ColumnElement, Func, bool], CaseT], else_: Optional[CaseT] = None
 ) -> Func:
     """
     Returns the case function that produces case expression which has a list of
@@ -99,7 +101,7 @@ def case(
     Result type is inferred from condition results.
     Args:
-        args tuple((ColumnElement | Func),(str | int | float | complex | bool, Func, ColumnElement)):
+        args tuple((ColumnElement | Func | bool),(str | int | float | complex | bool, Func, ColumnElement)):
             Tuple of condition and values pair.
         else_ (str | int | float | complex | bool, Func): optional else value in case
             expression. If omitted, and no case conditions are satisfied, the result
@@ -118,12 +120,16 @@ def case(
     supported_types = [int, float, complex, str, bool]
     def _get_type(val):
+        from enum import Enum
         if isinstance(val, Func):
             # nested functions
             return val.result_type
         if isinstance(val, Column):
             # at this point we cannot know what is the type of a column
             return None
+        if isinstance(val, Enum):
+            return type(val.value)
         return type(val)
     if not args:
@@ -204,3 +210,61 @@ def isnone(col: Union[str, Column]) -> Func:
         col = C(col)
     return case((col.is_(None) if col is not None else True, True), else_=False)
+def or_(*args: Union[ColumnElement, Func]) -> Func:
+    """
+    Returns the function that produces the disjunction of expressions joined by
+    the OR logical operator.
+    Args:
+        args (ColumnElement | Func): The expressions for the OR statement.
+            `str` values are also accepted by the implementation and are
+            handled the same way as `Func` arguments.
+    Returns:
+        Func: A Func object that represents the or function.
+    Example:
+        ```py
+        dc.mutate(
+            test=ifelse(or_(isnone("name"), C("name") == ''), "Empty", "Not Empty")
+        )
+        ```
+    """
+    # `str` and `Func` arguments are handed to `Func` as `cols`; every other
+    # expression is handed over as `args`.
+    cols, func_args = [], []
+    for arg in args:
+        if isinstance(arg, (str, Func)):
+            cols.append(arg)
+        else:
+            func_args.append(arg)
+    return Func("or", inner=sql_or, cols=cols, args=func_args, result_type=bool)
+def and_(*args: Union[ColumnElement, Func]) -> Func:
+    """
+    Returns the function that produces the conjunction of expressions joined by
+    the AND logical operator.
+    Args:
+        args (ColumnElement | Func): The expressions for the AND statement.
+            `str` values are also accepted by the implementation and are
+            handled the same way as `Func` arguments.
+    Returns:
+        Func: A Func object that represents the and function.
+    Example:
+        ```py
+        dc.mutate(
+            test=ifelse(and_(isnone("name"), isnone("surname")), "Empty", "Not Empty")
+        )
+        ```
+    """
+    # `str` and `Func` arguments are handed to `Func` as `cols`; every other
+    # expression is handed over as `args`.
+    cols, func_args = [], []
+    for arg in args:
+        if isinstance(arg, (str, Func)):
+            cols.append(arg)
+        else:
+            func_args.append(arg)
+    return Func("and", inner=sql_and, cols=cols, args=func_args, result_type=bool)

datachain/job.py CHANGED Viewed

@@ -25,7 +25,7 @@ class Job:
     @classmethod
     def parse(
-        cls: type[J],
+        cls,
         id: Union[str, uuid.UUID],
         name: str,
         status: int,

datachain/lib/arrow.py CHANGED Viewed

@@ -2,13 +2,12 @@ from collections.abc import Sequence
 from itertools import islice
 from typing import TYPE_CHECKING, Any, Optional
-import fsspec.implementations.reference
 import orjson
 import pyarrow as pa
-from fsspec.core import split_protocol
 from pyarrow.dataset import CsvFileFormat, dataset
 from tqdm.auto import tqdm
+from datachain.fs.reference import ReferenceFileSystem
 from datachain.lib.data_model import dict_to_data_model
 from datachain.lib.file import ArrowRow, File
 from datachain.lib.model_store import ModelStore
@@ -27,15 +26,6 @@ if TYPE_CHECKING:
 DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY = b"DataChain SignalSchema"
-class ReferenceFileSystem(fsspec.implementations.reference.ReferenceFileSystem):
-    def _open(self, path, mode="rb", *args, **kwargs):
-        # overriding because `fsspec`'s `ReferenceFileSystem._open`
-        # reads the whole file in-memory.
-        (uri,) = self.references[path]
-        protocol, _ = split_protocol(uri)
-        return self.fss[protocol].open(uri, mode, *args, **kwargs)
 class ArrowGenerator(Generator):
     DEFAULT_BATCH_SIZE = 2**17  # same as `pyarrow._dataset._DEFAULT_BATCH_SIZE`

datachain/lib/dc.py CHANGED Viewed

@@ -481,6 +481,7 @@ class DataChain:
         version: Optional[int] = None,
         session: Optional[Session] = None,
         settings: Optional[dict] = None,
+        fallback_to_remote: bool = True,
     ) -> "Self":
         """Get data from a saved Dataset. It returns the chain itself.
@@ -498,6 +499,7 @@ class DataChain:
             version=version,
             session=session,
             indexing_column_types=File._datachain_column_types,
+            fallback_to_remote=fallback_to_remote,
         )
         telemetry.send_event_once("class", "datachain_init", name=name, version=version)
         if settings:

datachain 0.8.13__py3-none-any.whl → 0.9.1__py3-none-any.whl

Potentially problematic release.

datachain 0.8.13__py3-none-any.whl → 0.9.1__py3-none-any.whl