datachain 0.8.10__py3-none-any.whl → 0.8.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/cache.py +4 -4
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +103 -158
- datachain/cli/__init__.py +7 -14
- datachain/cli/commands/__init__.py +0 -2
- datachain/cli/commands/datasets.py +0 -19
- datachain/cli/parser/__init__.py +27 -41
- datachain/cli/parser/studio.py +7 -6
- datachain/cli/parser/utils.py +18 -0
- datachain/client/fsspec.py +11 -8
- datachain/client/local.py +4 -4
- datachain/data_storage/schema.py +1 -1
- datachain/dataset.py +1 -7
- datachain/error.py +12 -0
- datachain/func/__init__.py +2 -1
- datachain/func/conditional.py +77 -26
- datachain/func/func.py +17 -6
- datachain/lib/dc.py +24 -4
- datachain/lib/file.py +16 -0
- datachain/lib/listing.py +30 -12
- datachain/lib/pytorch.py +1 -1
- datachain/lib/udf.py +1 -1
- datachain/listing.py +1 -13
- datachain/node.py +0 -15
- datachain/nodes_fetcher.py +2 -2
- datachain/remote/studio.py +2 -14
- datachain/studio.py +1 -1
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/METADATA +3 -7
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/RECORD +33 -33
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/LICENSE +0 -0
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/WHEEL +0 -0
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.10.dist-info → datachain-0.8.12.dist-info}/top_level.txt +0 -0
datachain/cli/parser/__init__.py
CHANGED
@@ -8,7 +8,14 @@ from datachain.cli.utils import BooleanOptionalAction, KeyValueArgs
 
 from .job import add_jobs_parser
 from .studio import add_auth_parser
-from .utils import FIND_COLUMNS, add_show_args, add_sources_arg, find_columns_type
+from .utils import (
+    FIND_COLUMNS,
+    add_anon_arg,
+    add_show_args,
+    add_sources_arg,
+    add_update_arg,
+    find_columns_type,
+)
 
 
 def get_parser() -> ArgumentParser:  # noqa: PLR0915
@@ -32,19 +39,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "-q", "--quiet", action="count", default=0, help="Be quiet"
     )
 
-    parent_parser.add_argument(
-        "--anon",
-        action="store_true",
-        help="Use anonymous access to storage",
-    )
-    parent_parser.add_argument(
-        "-u",
-        "--update",
-        action="count",
-        default=0,
-        help="Update cached list of files for the sources",
-    )
-
     parent_parser.add_argument(
         "--debug-sql",
         action="store_true",
@@ -92,6 +86,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         action="store_true",
         help="Do not expand globs (such as * or ?)",
     )
+    add_anon_arg(parse_cp)
+    add_update_arg(parse_cp)
 
     parse_clone = subp.add_parser(
         "clone", parents=[parent_parser], description="Copy data files from the cloud."
@@ -127,6 +123,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         action="store_true",
         help="Do not copy files, just create a dataset",
     )
+    add_anon_arg(parse_clone)
+    add_update_arg(parse_clone)
 
     add_auth_parser(subp, parent_parser)
     add_jobs_parser(subp, parent_parser)
@@ -137,6 +135,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         parents=[parent_parser],
         description="Commands for managing datasets.",
     )
+    add_anon_arg(datasets_parser)
    datasets_subparser = datasets_parser.add_subparsers(
         dest="datasets_cmd",
         help="Use `datachain dataset CMD --help` to display command-specific help",
@@ -308,34 +307,11 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="The team to delete a dataset. By default, it will use team from config",
     )
 
-    dataset_stats_parser = datasets_subparser.add_parser(
-        "stats", parents=[parent_parser], description="Show basic dataset statistics."
-    )
-    dataset_stats_parser.add_argument("name", type=str, help="Dataset name")
-    dataset_stats_parser.add_argument(
-        "--version",
-        action="store",
-        default=None,
-        type=int,
-        help="Dataset version",
-    )
-    dataset_stats_parser.add_argument(
-        "-b",
-        "--bytes",
-        default=False,
-        action="store_true",
-        help="Display size in bytes instead of human-readable size",
-    )
-    dataset_stats_parser.add_argument(
-        "--si",
-        default=False,
-        action="store_true",
-        help="Display size using powers of 1000 not 1024",
-    )
-
     parse_ls = subp.add_parser(
         "ls", parents=[parent_parser], description="List storage contents."
     )
+    add_anon_arg(parse_ls)
+    add_update_arg(parse_ls)
     add_sources_arg(parse_ls, nargs="*")
     parse_ls.add_argument(
         "-l",
@@ -375,6 +351,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "du", parents=[parent_parser], description="Display space usage."
     )
     add_sources_arg(parse_du)
+    add_anon_arg(parse_du)
+    add_update_arg(parse_du)
     parse_du.add_argument(
         "-b",
         "--bytes",
@@ -404,6 +382,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parse_find = subp.add_parser(
         "find", parents=[parent_parser], description="Search in a directory hierarchy."
     )
+    add_anon_arg(parse_find)
+    add_update_arg(parse_find)
     add_sources_arg(parse_find)
     parse_find.add_argument(
         "--name",
@@ -457,6 +437,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parse_index = subp.add_parser(
         "index", parents=[parent_parser], description="Index storage location."
     )
+    add_anon_arg(parse_index)
+    add_update_arg(parse_index)
     add_sources_arg(parse_index)
 
     show_parser = subp.add_parser(
@@ -480,6 +462,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         parents=[parent_parser],
         description="Create a new dataset with a query script.",
     )
+    add_anon_arg(query_parser)
     query_parser.add_argument(
         "script", metavar="<script.py>", type=str, help="Filepath for script"
     )
@@ -504,14 +487,17 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Query parameters",
     )
 
-    subp.add_parser(
+    parse_clear_cache = subp.add_parser(
         "clear-cache",
         parents=[parent_parser],
         description="Clear the local file cache.",
     )
-    subp.add_parser(
+    add_anon_arg(parse_clear_cache)
+
+    parse_gc = subp.add_parser(
         "gc", parents=[parent_parser], description="Garbage collect temporary tables."
     )
+    add_anon_arg(parse_gc)
 
     subp.add_parser("internal-run-udf", parents=[parent_parser])
     subp.add_parser("internal-run-udf-worker", parents=[parent_parser])
datachain/cli/parser/studio.py
CHANGED
@@ -1,9 +1,8 @@
 def add_auth_parser(subparsers, parent_parser) -> None:
+    from dvc_studio_client.auth import AVAILABLE_SCOPES
+
     auth_help = "Manage Studio authentication"
-    auth_description = (
-        "Manage authentication and settings for Studio. "
-        "Configure tokens for sharing datasets and using Studio features."
-    )
+    auth_description = "Manage authentication and settings for Studio. "
 
     auth_parser = subparsers.add_parser(
         "auth",
@@ -19,8 +18,10 @@ def add_auth_parser(subparsers, parent_parser) -> None:
     auth_login_help = "Authenticate with Studio"
     auth_login_description = (
         "Authenticate with Studio using default scopes. "
-        "A random name will be assigned
+        "A random name will be assigned if the token name is not specified."
     )
+
+    allowed_scopes = ", ".join(AVAILABLE_SCOPES)
     login_parser = auth_subparser.add_parser(
         "login",
         parents=[parent_parser],
@@ -40,7 +41,7 @@ def add_auth_parser(subparsers, parent_parser) -> None:
         "--scopes",
         action="store",
         default=None,
-        help="Authentication token scopes",
+        help=f"Authentication token scopes. Allowed scopes: {allowed_scopes}",
     )
 
     login_parser.add_argument(
datachain/cli/parser/utils.py
CHANGED
@@ -34,6 +34,24 @@ def add_sources_arg(parser: ArgumentParser, nargs: Union[str, int] = "+") -> Act
     )
 
 
+def add_anon_arg(parser: ArgumentParser) -> None:
+    parser.add_argument(
+        "--anon",
+        action="store_true",
+        help="Use anonymous access to storage",
+    )
+
+
+def add_update_arg(parser: ArgumentParser) -> None:
+    parser.add_argument(
+        "-u",
+        "--update",
+        action="count",
+        default=0,
+        help="Update cached list of files for the sources",
+    )
+
+
 def add_show_args(parser: ArgumentParser) -> None:
     parser.add_argument(
         "--limit",
datachain/client/fsspec.py
CHANGED
@@ -3,6 +3,7 @@ import functools
 import logging
 import multiprocessing
 import os
+import posixpath
 import re
 import sys
 from abc import ABC, abstractmethod
@@ -25,7 +26,7 @@ from fsspec.asyn import get_loop, sync
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from tqdm.auto import tqdm
 
-from datachain.cache import DataChainCache
+from datachain.cache import Cache
 from datachain.client.fileslice import FileWrapper
 from datachain.error import ClientError as DataChainClientError
 from datachain.nodes_fetcher import NodesFetcher
@@ -74,9 +75,7 @@ class Client(ABC):
     PREFIX: ClassVar[str]
     protocol: ClassVar[str]
 
-    def __init__(
-        self, name: str, fs_kwargs: dict[str, Any], cache: DataChainCache
-    ) -> None:
+    def __init__(self, name: str, fs_kwargs: dict[str, Any], cache: Cache) -> None:
         self.name = name
         self.fs_kwargs = fs_kwargs
         self._fs: Optional[AbstractFileSystem] = None
@@ -122,7 +121,7 @@ class Client(ABC):
         return cls.get_uri(storage_name), rel_path
 
     @staticmethod
-    def get_client(source: str, cache: DataChainCache, **kwargs) -> "Client":
+    def get_client(source: str, cache: Cache, **kwargs) -> "Client":
         cls = Client.get_implementation(source)
         storage_url, _ = cls.split_url(source)
         if os.name == "nt":
@@ -145,7 +144,7 @@ class Client(ABC):
     def from_name(
         cls,
         name: str,
-        cache: DataChainCache,
+        cache: Cache,
         kwargs: dict[str, Any],
     ) -> "Client":
         return cls(name, kwargs, cache)
@@ -154,7 +153,7 @@ class Client(ABC):
     def from_source(
         cls,
         uri: "StorageURI",
-        cache: DataChainCache,
+        cache: Cache,
         **kwargs,
     ) -> "Client":
         return cls(cls.FS_CLASS._strip_protocol(uri), kwargs, cache)
@@ -390,8 +389,12 @@ class Client(ABC):
             self.fs.open(self.get_full_path(file.path, file.version)), cb
         )  # type: ignore[return-value]
 
-    def upload(self,
+    def upload(self, data: bytes, path: str) -> "File":
         full_path = self.get_full_path(path)
+
+        parent = posixpath.dirname(full_path)
+        self.fs.makedirs(parent, exist_ok=True)
+
         self.fs.pipe_file(full_path, data)
         file_info = self.fs.info(full_path)
         return self.info_to_file(file_info, path)
datachain/client/local.py
CHANGED
@@ -12,7 +12,7 @@ from datachain.lib.file import File
 from .fsspec import Client
 
 if TYPE_CHECKING:
-    from datachain.cache import DataChainCache
+    from datachain.cache import Cache
     from datachain.dataset import StorageURI
 
 
@@ -25,7 +25,7 @@ class FileClient(Client):
         self,
         name: str,
         fs_kwargs: dict[str, Any],
-        cache: "DataChainCache",
+        cache: "Cache",
         use_symlinks: bool = False,
     ) -> None:
         super().__init__(name, fs_kwargs, cache)
@@ -82,7 +82,7 @@ class FileClient(Client):
         return bucket, path
 
     @classmethod
-    def from_name(cls, name: str, cache: "DataChainCache", kwargs) -> "FileClient":
+    def from_name(cls, name: str, cache: "Cache", kwargs) -> "FileClient":
         use_symlinks = kwargs.pop("use_symlinks", False)
         return cls(name, kwargs, cache, use_symlinks=use_symlinks)
 
@@ -90,7 +90,7 @@ class FileClient(Client):
     def from_source(
         cls,
         uri: str,
-        cache: "DataChainCache",
+        cache: "Cache",
         use_symlinks: bool = False,
         **kwargs,
     ) -> "FileClient":
datachain/data_storage/schema.py
CHANGED
@@ -200,7 +200,7 @@ class DataTable:
         columns: Sequence["sa.Column"] = (),
         metadata: Optional["sa.MetaData"] = None,
     ):
-        # copy columns, since
+        # copy columns, since reusing the same objects from another table
         # may raise an error
         columns = cls.sys_columns() + [cls.copy_column(c) for c in columns]
         columns = dedup_columns(columns)
datachain/dataset.py
CHANGED
@@ -91,7 +91,7 @@ class DatasetDependency:
         if self.type == DatasetDependencyType.DATASET:
             return self.name
 
-        list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"),
+        list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"), {})
         assert list_dataset_name
         return list_dataset_name
 
@@ -150,12 +150,6 @@ class DatasetDependency:
         return hash(f"{self.type}_{self.name}_{self.version}")
 
 
-@dataclass
-class DatasetStats:
-    num_objects: Optional[int]  # None if table is missing
-    size: Optional[int]  # in bytes None if table is missing or empty
-
-
 class DatasetStatus:
     CREATED = 1
     PENDING = 2
datachain/error.py
CHANGED
@@ -1,3 +1,15 @@
+import botocore.errorfactory
+import botocore.exceptions
+import gcsfs.retry
+
+REMOTE_ERRORS = (
+    gcsfs.retry.HttpError,  # GCS
+    OSError,  # GCS
+    botocore.exceptions.BotoCoreError,  # S3
+    ValueError,  # Azure
+)
+
+
 class DataChainError(RuntimeError):
     pass
 
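Since `REMOTE_ERRORS` is a plain tuple of exception classes, it can be passed directly to `except`. A sketch of how a caller might use it (`fetch` is a hypothetical callable):

```py
from datachain.error import REMOTE_ERRORS


def fetch_with_context(fetch, *args):
    """`fetch` is a hypothetical callable that talks to remote storage."""
    try:
        return fetch(*args)
    except REMOTE_ERRORS as exc:
        # Provider-specific failures (GCS, S3, Azure) are caught uniformly.
        raise RuntimeError(f"remote storage error: {exc}") from exc
```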
datachain/func/__init__.py
CHANGED
@@ -16,7 +16,7 @@ from .aggregate import (
     sum,
 )
 from .array import cosine_distance, euclidean_distance, length, sip_hash_64
-from .conditional import case, greatest, ifelse, least
+from .conditional import case, greatest, ifelse, isnone, least
 from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
 from .random import rand
 from .string import byte_hamming_distance
@@ -42,6 +42,7 @@ __all__ = [
     "greatest",
     "ifelse",
     "int_hash_64",
+    "isnone",
     "least",
     "length",
     "literal",
datachain/func/conditional.py
CHANGED
@@ -1,14 +1,15 @@
-from typing import Union
+from typing import Optional, Union
 
+from sqlalchemy import ColumnElement
 from sqlalchemy import case as sql_case
-from sqlalchemy.sql.elements import BinaryExpression
 
 from datachain.lib.utils import DataChainParamsError
+from datachain.query.schema import Column
 from datachain.sql.functions import conditional
 
 from .func import ColT, Func
 
-CaseT = Union[int, float, complex, bool, str]
+CaseT = Union[int, float, complex, bool, str, Func, ColumnElement]
 
 
 def greatest(*args: Union[ColT, float]) -> Func:
@@ -87,17 +88,22 @@ def least(*args: Union[ColT, float]) -> Func:
     )
 
 
-def case(*args: tuple[BinaryExpression, CaseT], else_=None) -> Func:
+def case(
+    *args: tuple[Union[ColumnElement, Func], CaseT], else_: Optional[CaseT] = None
+) -> Func:
     """
     Returns the case function that produces case expression which has a list of
-    conditions and corresponding results. Results can
-
+    conditions and corresponding results. Results can be python primitives like string,
+    numbers or booleans but can also be other nested functions (including case function)
+    or columns.
+    Result type is inferred from condition results.
 
     Args:
-        args
-
-
-
+        args tuple((ColumnElement | Func),(str | int | float | complex | bool, Func, ColumnElement)):
+            Tuple of condition and values pair.
+        else_ (str | int | float | complex | bool, Func): optional else value in case
+            expression. If omitted, and no case conditions are satisfied, the result
+            will be None (NULL in DB).
 
     Returns:
         Func: A Func object that represents the case function.
@@ -108,39 +114,59 @@ def case(*args: tuple[BinaryExpression, CaseT], else_=None) -> Func:
         res=func.case((C("num") > 0, "P"), (C("num") < 0, "N"), else_="Z"),
     )
     ```
-    """
+    """  # noqa: E501
     supported_types = [int, float, complex, str, bool]
 
-
+    def _get_type(val):
+        if isinstance(val, Func):
+            # nested functions
+            return val.result_type
+        if isinstance(val, Column):
+            # at this point we cannot know what is the type of a column
+            return None
+        return type(val)
 
     if not args:
         raise DataChainParamsError("Missing statements")
 
-
-        if type_ and not isinstance(arg[1], type_):
-            raise DataChainParamsError("Statement values must be of the same type")
-        type_ = type(arg[1])
+    type_ = _get_type(else_) if else_ is not None else None
 
-
+    for arg in args:
+        arg_type = _get_type(arg[1])
+        if arg_type is None:
+            # we couldn't figure out the type of case value
+            continue
+        if type_ and arg_type != type_:
+            raise DataChainParamsError(
+                f"Statement values must be of the same type, got {type_} and {arg_type}"
+            )
+        type_ = arg_type
+
+    if type_ is not None and type_ not in supported_types:
         raise DataChainParamsError(
             f"Only python literals ({supported_types}) are supported for values"
         )
 
     kwargs = {"else_": else_}
-    return Func("case", inner=sql_case, args=args, kwargs=kwargs, result_type=type_)
 
+    return Func("case", inner=sql_case, cols=args, kwargs=kwargs, result_type=type_)
 
-def ifelse(condition: BinaryExpression, if_val: CaseT, else_val: CaseT) -> Func:
+
+def ifelse(
+    condition: Union[ColumnElement, Func], if_val: CaseT, else_val: CaseT
+) -> Func:
     """
     Returns the ifelse function that produces if expression which has a condition
-    and values for true and false outcome. Results can
-    like string,
+    and values for true and false outcome. Results can be one of python primitives
+    like string, numbers or booleans, but can also be nested functions or columns.
+    Result type is inferred from the values.
 
     Args:
-        condition
-        if_val
-
-
+        condition (ColumnElement, Func): Condition which is evaluated.
+        if_val (str | int | float | complex | bool, Func, ColumnElement): Value for true
+            condition outcome.
+        else_val (str | int | float | complex | bool, Func, ColumnElement): Value for
+            false condition outcome.
 
     Returns:
         Func: A Func object that represents the ifelse function.
@@ -148,8 +174,33 @@ def ifelse(condition: BinaryExpression, if_val: CaseT, else_val: CaseT) -> Func:
     Example:
         ```py
        dc.mutate(
-            res=func.ifelse(
+            res=func.ifelse(isnone("col"), "EMPTY", "NOT_EMPTY")
         )
         ```
     """
     return case((condition, if_val), else_=else_val)
+
+
+def isnone(col: Union[str, Column]) -> Func:
+    """
+    Returns True if column value is None, otherwise False.
+
+    Args:
+        col (str | Column): Column to check if it's None or not.
+            If a string is provided, it is assumed to be the name of the column.
+
+    Returns:
+        Func: A Func object that represents the conditional to check if column is None.
+
+    Example:
+        ```py
+        dc.mutate(test=ifelse(isnone("col"), "EMPTY", "NOT_EMPTY"))
+        ```
+    """
+    from datachain import C
+
+    if isinstance(col, str):
+        # if string, it is assumed to be the name of the column
+        col = C(col)
+
+    return case((col.is_(None) if col is not None else True, True), else_=False)
datachain/func/func.py
CHANGED
@@ -23,7 +23,7 @@ if TYPE_CHECKING:
     from .window import Window
 
 
-ColT = Union[str, ColumnElement, "Func"]
+ColT = Union[str, ColumnElement, "Func", tuple]
 
 
 class Func(Function):
@@ -78,7 +78,7 @@ class Func(Function):
         return (
             [
                 col
-                if isinstance(col, (Func, BindParameter, Case, Comparator))
+                if isinstance(col, (Func, BindParameter, Case, Comparator, tuple))
                 else ColumnMeta.to_db_name(
                     col.name if isinstance(col, ColumnElement) else col
                 )
@@ -381,17 +381,24 @@ class Func(Function):
         col_type = self.get_result_type(signals_schema)
         sql_type = python_to_sql(col_type)
 
-        def get_col(col: ColT) -> ColT:
+        def get_col(col: ColT, string_as_literal=False) -> ColT:
+            # string_as_literal is used only for conditionals like `case()` where
+            # literals are nested inside ColT as we have tuples of condition - values
+            # and if user wants to set some case value as column, explicit `C("col")`
+            # syntax must be used to distinguish from literals
+            if isinstance(col, tuple):
+                return tuple(get_col(x, string_as_literal=True) for x in col)
             if isinstance(col, Func):
                 return col.get_column(signals_schema, table=table)
-            if isinstance(col, str):
+            if isinstance(col, str) and not string_as_literal:
                 column = Column(col, sql_type)
                 column.table = table
                 return column
             return col
 
         cols = [get_col(col) for col in self._db_cols]
-        func_col = self.inner(*cols, *self.args, **self.kwargs)
+        kwargs = {k: get_col(v, string_as_literal=True) for k, v in self.kwargs.items()}
+        func_col = self.inner(*cols, *self.args, **kwargs)
 
         if self.is_window:
             if not self.window:
@@ -416,6 +423,10 @@ class Func(Function):
 
 
 def get_db_col_type(signals_schema: "SignalSchema", col: ColT) -> "DataType":
+    if isinstance(col, tuple):
+        # we can only get tuple from case statement where the first tuple item
+        # is condition, and second one is value which type is important
+        col = col[1]
     if isinstance(col, Func):
         return col.get_result_type(signals_schema)
 
@@ -423,7 +434,7 @@ def get_db_col_type(signals_schema: "SignalSchema", col: ColT) -> "DataType":
         return sql_to_python(col)
 
     return signals_schema.get_column_type(
-        col.name if isinstance(col, ColumnElement) else col
+        col.name if isinstance(col, ColumnElement) else col  # type: ignore[arg-type]
     )
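The `string_as_literal` distinction means a bare string inside a `case()` tuple stays a literal value, while a column must be named explicitly with `C(...)`. A small sketch of the difference (column names are illustrative):

```py
from datachain import C
from datachain.func import case

# "unknown" is treated as a literal result value...
as_literal = case((C("score") > 0.5, "high"), else_="unknown")

# ...while C("fallback_label") makes the result come from another column.
as_column = case((C("score") > 0.5, "high"), else_=C("fallback_label"))
```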