datachain 0.8.2__py3-none-any.whl → 0.8.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

Files changed (44)
  1. datachain/cache.py +4 -2
  2. datachain/catalog/catalog.py +100 -54
  3. datachain/catalog/datasource.py +4 -6
  4. datachain/cli/__init__.py +311 -0
  5. datachain/cli/commands/__init__.py +29 -0
  6. datachain/cli/commands/datasets.py +129 -0
  7. datachain/cli/commands/du.py +14 -0
  8. datachain/cli/commands/index.py +12 -0
  9. datachain/cli/commands/ls.py +169 -0
  10. datachain/cli/commands/misc.py +28 -0
  11. datachain/cli/commands/query.py +53 -0
  12. datachain/cli/commands/show.py +38 -0
  13. datachain/cli/parser/__init__.py +547 -0
  14. datachain/cli/parser/job.py +120 -0
  15. datachain/cli/parser/studio.py +126 -0
  16. datachain/cli/parser/utils.py +63 -0
  17. datachain/{cli_utils.py → cli/utils.py} +27 -1
  18. datachain/client/azure.py +21 -1
  19. datachain/client/fsspec.py +45 -13
  20. datachain/client/gcs.py +10 -2
  21. datachain/client/local.py +4 -4
  22. datachain/client/s3.py +10 -0
  23. datachain/dataset.py +1 -0
  24. datachain/func/__init__.py +2 -2
  25. datachain/func/conditional.py +52 -0
  26. datachain/func/func.py +5 -1
  27. datachain/lib/arrow.py +4 -0
  28. datachain/lib/dc.py +18 -3
  29. datachain/lib/file.py +1 -1
  30. datachain/lib/listing.py +36 -3
  31. datachain/lib/signal_schema.py +89 -27
  32. datachain/listing.py +1 -5
  33. datachain/node.py +27 -1
  34. datachain/progress.py +2 -2
  35. datachain/query/session.py +1 -1
  36. datachain/studio.py +58 -38
  37. datachain/utils.py +1 -1
  38. {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/METADATA +6 -6
  39. {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/RECORD +43 -31
  40. {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/WHEEL +1 -1
  41. datachain/cli.py +0 -1475
  42. {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/LICENSE +0 -0
  43. {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/entry_points.txt +0 -0
  44. {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/top_level.txt +0 -0
datachain/cli/parser/studio.py ADDED
@@ -0,0 +1,126 @@
+def add_studio_parser(subparsers, parent_parser) -> None:
+    studio_help = "Commands to authenticate DataChain with Iterative Studio"
+    studio_description = (
+        "Authenticate DataChain with Studio and set the token. "
+        "Once this token has been properly configured,\n"
+        "DataChain will utilize it for seamlessly sharing datasets\n"
+        "and using Studio features from CLI"
+    )
+
+    studio_parser = subparsers.add_parser(
+        "studio",
+        parents=[parent_parser],
+        description=studio_description,
+        help=studio_help,
+    )
+    studio_subparser = studio_parser.add_subparsers(
+        dest="cmd",
+        help="Use `DataChain studio CMD --help` to display command-specific help.",
+        required=True,
+    )
+
+    studio_login_help = "Authenticate DataChain with Studio host"
+    studio_login_description = (
+        "By default, this command authenticates the DataChain with Studio\n"
+        "using default scopes and assigns a random name as the token name."
+    )
+    login_parser = studio_subparser.add_parser(
+        "login",
+        parents=[parent_parser],
+        description=studio_login_description,
+        help=studio_login_help,
+    )
+
+    login_parser.add_argument(
+        "-H",
+        "--hostname",
+        action="store",
+        default=None,
+        help="The hostname of the Studio instance to authenticate with.",
+    )
+    login_parser.add_argument(
+        "-s",
+        "--scopes",
+        action="store",
+        default=None,
+        help="The scopes for the authentication token. ",
+    )
+
+    login_parser.add_argument(
+        "-n",
+        "--name",
+        action="store",
+        default=None,
+        help="The name of the authentication token. It will be used to\n"
+        "identify token shown in Studio profile.",
+    )
+
+    login_parser.add_argument(
+        "--no-open",
+        action="store_true",
+        default=False,
+        help="Use authentication flow based on user code.\n"
+        "You will be presented with user code to enter in browser.\n"
+        "DataChain will also use this if it cannot launch browser on your behalf.",
+    )
+
+    studio_logout_help = "Logout user from Studio"
+    studio_logout_description = "This removes the studio token from your global config."
+
+    studio_subparser.add_parser(
+        "logout",
+        parents=[parent_parser],
+        description=studio_logout_description,
+        help=studio_logout_help,
+    )
+
+    studio_team_help = "Set the default team for DataChain"
+    studio_team_description = (
+        "Set the default team for DataChain to use when interacting with Studio."
+    )
+
+    team_parser = studio_subparser.add_parser(
+        "team",
+        parents=[parent_parser],
+        description=studio_team_description,
+        help=studio_team_help,
+    )
+    team_parser.add_argument(
+        "team_name",
+        action="store",
+        help="The name of the team to set as the default.",
+    )
+    team_parser.add_argument(
+        "--global",
+        action="store_true",
+        default=False,
+        help="Set the team globally for all DataChain projects.",
+    )
+
+    studio_token_help = "View the token datachain uses to contact Studio"  # noqa: S105 # nosec B105
+
+    studio_subparser.add_parser(
+        "token",
+        parents=[parent_parser],
+        description=studio_token_help,
+        help=studio_token_help,
+    )
+
+    studio_ls_dataset_help = "List the available datasets from Studio"
+    studio_ls_dataset_description = (
+        "This command lists all the datasets available in Studio.\n"
+        "It will show the dataset name and the number of versions available."
+    )
+
+    ls_dataset_parser = studio_subparser.add_parser(
+        "dataset",
+        parents=[parent_parser],
+        description=studio_ls_dataset_description,
+        help=studio_ls_dataset_help,
+    )
+    ls_dataset_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="The team to list datasets for. By default, it will use team from config.",
+    )
datachain/cli/parser/utils.py ADDED
@@ -0,0 +1,63 @@
+from argparse import Action, ArgumentParser, ArgumentTypeError
+from typing import Union
+
+from datachain.cli.utils import CommaSeparatedArgs
+
+FIND_COLUMNS = ["du", "name", "path", "size", "type"]
+
+
+def find_columns_type(
+    columns_str: str,
+    default_colums_str: str = "path",
+) -> list[str]:
+    if not columns_str:
+        columns_str = default_colums_str
+
+    return [parse_find_column(c) for c in columns_str.split(",")]
+
+
+def parse_find_column(column: str) -> str:
+    column_lower = column.strip().lower()
+    if column_lower in FIND_COLUMNS:
+        return column_lower
+    raise ArgumentTypeError(
+        f"Invalid column for find: '{column}' Options are: {','.join(FIND_COLUMNS)}"
+    )
+
+
+def add_sources_arg(parser: ArgumentParser, nargs: Union[str, int] = "+") -> Action:
+    return parser.add_argument(
+        "sources",
+        type=str,
+        nargs=nargs,
+        help="Data sources - paths to cloud storage dirs",
+    )
+
+
+def add_show_args(parser: ArgumentParser) -> None:
+    parser.add_argument(
+        "--limit",
+        action="store",
+        default=10,
+        type=int,
+        help="Number of rows to show",
+    )
+    parser.add_argument(
+        "--offset",
+        action="store",
+        default=0,
+        type=int,
+        help="Number of rows to offset",
+    )
+    parser.add_argument(
+        "--columns",
+        default=[],
+        action=CommaSeparatedArgs,
+        help="Columns to show",
+    )
+    parser.add_argument(
+        "--no-collapse",
+        action="store_true",
+        default=False,
+        help="Do not collapse the columns",
+    )
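For illustration, a few calls showing how `find_columns_type` normalizes and validates a comma-separated column list (outputs inferred from the code above):

```py
from argparse import ArgumentTypeError

find_columns_type("Name,SIZE")  # -> ["name", "size"] (trimmed and lowercased)
find_columns_type("")           # falls back to the default -> ["path"]

try:
    find_columns_type("name,owner")  # "owner" is not in FIND_COLUMNS
except ArgumentTypeError as exc:
    print(exc)  # Invalid column for find: 'owner' Options are: du,name,path,size,type
```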
datachain/{cli_utils.py → cli/utils.py} RENAMED
@@ -1,4 +1,8 @@
-from argparse import SUPPRESS, Action, ArgumentError, _AppendAction
+import logging
+from argparse import SUPPRESS, Action, ArgumentError, Namespace, _AppendAction
+from typing import Optional
+
+from datachain.error import DataChainError


 class BooleanOptionalAction(Action):
@@ -70,3 +74,25 @@ class KeyValueArgs(_AppendAction):  # pylint: disable=protected-access
         items[key.strip()] = value

         setattr(namespace, self.dest, items)
+
+
+def get_logging_level(args: Namespace) -> int:
+    if args.quiet:
+        return logging.CRITICAL
+    if args.verbose:
+        return logging.DEBUG
+    return logging.INFO
+
+
+def determine_flavors(studio: bool, local: bool, all: bool, token: Optional[str]):
+    if studio and not token:
+        raise DataChainError(
+            "Not logged in to Studio. Log in with 'datachain studio login'."
+        )
+
+    if local or studio:
+        all = False
+
+    all = all and not (local or studio)
+
+    return all, local, studio
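A quick sketch of what `determine_flavors` returns for a few flag combinations; the token value is a made-up placeholder:

```py
# (studio, local, all, token) -> (all, local, studio)
determine_flavors(False, False, True, None)        # -> (True, False, False)
determine_flavors(False, True, True, None)         # -> (False, True, False): local overrides all
determine_flavors(True, False, True, "token123")   # -> (False, False, True): studio overrides all
determine_flavors(True, False, False, None)        # raises DataChainError: not logged in
```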
datachain/client/azure.py CHANGED
@@ -1,4 +1,5 @@
-from typing import Any
+from typing import Any, Optional
+from urllib.parse import parse_qs, urlsplit, urlunsplit

 from adlfs import AzureBlobFileSystem
 from tqdm import tqdm
@@ -25,6 +26,16 @@ class AzureClient(Client):
             size=v.get("size", ""),
         )

+    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
+        """
+        Generate a signed URL for the given path.
+        """
+        version_id = kwargs.pop("version_id", None)
+        result = self.fs.sign(
+            self.get_full_path(path, version_id), expiration=expires, **kwargs
+        )
+        return result + (f"&versionid={version_id}" if version_id else "")
+
     async def _fetch_flat(self, start_prefix: str, result_queue: ResultQueue) -> None:
         prefix = start_prefix
         if prefix:
@@ -57,4 +68,13 @@ class AzureClient(Client):
         finally:
             result_queue.put_nowait(None)

+    @classmethod
+    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+        parts = list(urlsplit(path))
+        query = parse_qs(parts[3])
+        if "versionid" in query:
+            raise ValueError("path already includes a version query")
+        parts[3] = f"versionid={version_id}" if version_id else ""
+        return urlunsplit(parts)
+
     _fetch_default = _fetch_flat
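As a sketch, `AzureClient.version_path` carries the version in a `versionid` query parameter and refuses paths that are already versioned (container and version values below are made up):

```py
AzureClient.version_path("az://container/data/file.parquet", "abc123")
# -> "az://container/data/file.parquet?versionid=abc123"

AzureClient.version_path("az://container/data/file.parquet", None)
# -> "az://container/data/file.parquet" (unchanged)

AzureClient.version_path("az://container/x?versionid=abc123", "def456")
# raises ValueError: path already includes a version query
```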
datachain/client/fsspec.py CHANGED
@@ -137,6 +137,10 @@ class Client(ABC):
         fs.invalidate_cache()
         return fs

+    @classmethod
+    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+        return path
+
     @classmethod
     def from_name(
         cls,
@@ -198,17 +202,37 @@ class Client(ABC):
         return self._fs

     def url(self, path: str, expires: int = 3600, **kwargs) -> str:
-        return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
+        return self.fs.sign(
+            self.get_full_path(path, kwargs.pop("version_id", None)),
+            expiration=expires,
+            **kwargs,
+        )

     async def get_current_etag(self, file: "File") -> str:
-        info = await self.fs._info(self.get_full_path(file.path))
-        return self.info_to_file(info, "").etag
-
-    async def get_size(self, path: str) -> int:
-        return await self.fs._size(path)
-
-    async def get_file(self, lpath, rpath, callback):
-        return await self.fs._get_file(lpath, rpath, callback=callback)
+        kwargs = {}
+        if self.fs.version_aware:
+            kwargs["version_id"] = file.version
+        info = await self.fs._info(
+            self.get_full_path(file.path, file.version), **kwargs
+        )
+        return self.info_to_file(info, file.path).etag
+
+    def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
+        info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
+        return self.info_to_file(info, path)
+
+    async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
+        return await self.fs._size(
+            self.version_path(path, version_id), version_id=version_id
+        )
+
+    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
+        return await self.fs._get_file(
+            self.version_path(lpath, version_id),
+            rpath,
+            callback=callback,
+            version_id=version_id,
+        )

     async def scandir(
         self, start_prefix: str, method: str = "default"
@@ -315,11 +339,11 @@ class Client(ABC):
     def rel_path(self, path: str) -> str:
         return self.fs.split_path(path)[1]

-    def get_full_path(self, rel_path: str) -> str:
-        return f"{self.PREFIX}{self.name}/{rel_path}"
+    def get_full_path(self, rel_path: str, version_id: Optional[str] = None) -> str:
+        return self.version_path(f"{self.PREFIX}{self.name}/{rel_path}", version_id)

     @abstractmethod
-    def info_to_file(self, v: dict[str, Any], parent: str) -> "File": ...
+    def info_to_file(self, v: dict[str, Any], path: str) -> "File": ...

     def fetch_nodes(
         self,
@@ -362,7 +386,15 @@ class Client(ABC):
         if use_cache and (cache_path := self.cache.get_path(file)):
             return open(cache_path, mode="rb")
         assert not file.location
-        return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb)  # type: ignore[return-value]
+        return FileWrapper(
+            self.fs.open(self.get_full_path(file.path, file.version)), cb
+        )  # type: ignore[return-value]
+
+    def upload(self, path: str, data: bytes) -> "File":
+        full_path = self.get_full_path(path)
+        self.fs.pipe_file(full_path, data)
+        file_info = self.fs.info(full_path)
+        return self.info_to_file(file_info, path)

     def download(self, file: "File", *, callback: Callback = DEFAULT_CALLBACK) -> None:
         sync(get_loop(), functools.partial(self._download, file, callback=callback))
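Taken together, the base `Client` now threads `version_id` through path construction. A hedged sketch of the flow for a versioned S3-backed client (the `client` instance, bucket, paths, and version values are made up):

```py
# get_full_path() composes f"{PREFIX}{name}/{rel_path}" and then applies the
# subclass's version_path() encoding:
client.get_full_path("data/file.csv")        # -> "s3://my-bucket/data/file.csv"
client.get_full_path("data/file.csv", "v1")  # -> "s3://my-bucket/data/file.csv?versionId=v1"

# url(), get_size(), and get_file() accept or pop a version_id and pass the
# versioned path down to the underlying fsspec filesystem:
client.url("data/file.csv", expires=600, version_id="v1")  # signed URL for that version
```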
datachain/client/gcs.py CHANGED
@@ -38,9 +38,13 @@ class GCSClient(Client):
         If the client is anonymous, a public URL is returned instead
         (see https://cloud.google.com/storage/docs/access-public-data#api-link).
         """
+        version_id = kwargs.pop("version_id", None)
         if self.fs.storage_options.get("token") == "anon":
-            return f"https://storage.googleapis.com/{self.name}/{path}"
-        return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
+            query = f"?generation={version_id}" if version_id else ""
+            return f"https://storage.googleapis.com/{self.name}/{path}{query}"
+        return self.fs.sign(
+            self.get_full_path(path, version_id), expiration=expires, **kwargs
+        )

     @staticmethod
     def parse_timestamp(timestamp: str) -> datetime:
@@ -131,3 +135,7 @@ class GCSClient(Client):
             last_modified=self.parse_timestamp(v["updated"]),
             size=v.get("size", ""),
         )
+
+    @classmethod
+    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+        return f"{path}#{version_id}" if version_id else path
datachain/client/local.py CHANGED
@@ -2,7 +2,7 @@ import os
 import posixpath
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Optional
 from urllib.parse import urlparse

 from fsspec.implementations.local import LocalFileSystem
@@ -105,10 +105,10 @@ class FileClient(Client):
         info = self.fs.info(self.get_full_path(file.path))
         return self.info_to_file(info, "").etag

-    async def get_size(self, path: str) -> int:
+    async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
         return self.fs.size(path)

-    async def get_file(self, lpath, rpath, callback):
+    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
         return self.fs.get_file(lpath, rpath, callback=callback)

     async def ls_dir(self, path):
@@ -117,7 +117,7 @@ class FileClient(Client):
     def rel_path(self, path):
         return posixpath.relpath(path, self.name)

-    def get_full_path(self, rel_path):
+    def get_full_path(self, rel_path, version_id: Optional[str] = None):
         full_path = Path(self.name, rel_path).as_posix()
         if rel_path.endswith("/") or not rel_path:
             full_path += "/"
datachain/client/s3.py CHANGED
@@ -1,5 +1,6 @@
 import asyncio
 from typing import Any, Optional, cast
+from urllib.parse import parse_qs, urlsplit, urlunsplit

 from botocore.exceptions import NoCredentialsError
 from s3fs import S3FileSystem
@@ -121,6 +122,15 @@ class ClientS3(Client):
             size=v["Size"],
         )

+    @classmethod
+    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+        parts = list(urlsplit(path))
+        query = parse_qs(parts[3])
+        if "versionId" in query:
+            raise ValueError("path already includes a version query")
+        parts[3] = f"versionId={version_id}" if version_id else ""
+        return urlunsplit(parts)
+
     async def _fetch_dir(
         self,
         prefix,
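`ClientS3.version_path` mirrors the Azure variant but uses S3's `versionId` parameter name; expected outputs, with made-up values:

```py
ClientS3.version_path("s3://bucket/key.json", "3HL4kqtJ")
# -> "s3://bucket/key.json?versionId=3HL4kqtJ"

ClientS3.version_path("s3://bucket/key.json?versionId=3HL4kqtJ", "other")
# raises ValueError: path already includes a version query
```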
datachain/dataset.py CHANGED
@@ -92,6 +92,7 @@ class DatasetDependency:
             return self.name

         list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"), None, {})
+        assert list_dataset_name
         return list_dataset_name

     @classmethod
datachain/func/__init__.py CHANGED
@@ -1,4 +1,4 @@
-from sqlalchemy import case, literal
+from sqlalchemy import literal

 from . import array, path, random, string
 from .aggregate import (
@@ -16,7 +16,7 @@ from .aggregate import (
     sum,
 )
 from .array import cosine_distance, euclidean_distance, length, sip_hash_64
-from .conditional import greatest, least
+from .conditional import case, greatest, least
 from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
 from .random import rand
 from .string import byte_hamming_distance
datachain/func/conditional.py CHANGED
@@ -1,5 +1,9 @@
 from typing import Union

+from sqlalchemy import case as sql_case
+from sqlalchemy.sql.elements import BinaryExpression
+
+from datachain.lib.utils import DataChainParamsError
 from datachain.sql.functions import conditional

 from .func import ColT, Func
@@ -79,3 +83,51 @@ def least(*args: Union[ColT, float]) -> Func:
     return Func(
         "least", inner=conditional.least, cols=cols, args=func_args, result_type=int
     )
+
+
+def case(
+    *args: tuple[BinaryExpression, Union[int, float, complex, bool, str]], else_=None
+) -> Func:
+    """
+    Returns the case function that produces case expression which has a list of
+    conditions and corresponding results. Results can only be python primitives
+    like string, numbers or booleans. Result type is inferred from condition results.
+
+    Args:
+        args (tuple(BinaryExpression, value(str | int | float | complex | bool):
+            - Tuple of binary expression and values pair which corresponds to one
+              case condition - value
+        else_ (str | int | float | complex | bool): else value in case expression
+
+    Returns:
+        Func: A Func object that represents the case function.
+
+    Example:
+        ```py
+        dc.mutate(
+            res=func.case((C("num") > 0, "P"), (C("num") < 0, "N"), else_="Z"),
+        )
+        ```
+
+    Note:
+        - Result column will always be of the same type as the input columns.
+    """
+    supported_types = [int, float, complex, str, bool]
+
+    type_ = type(else_) if else_ else None
+
+    if not args:
+        raise DataChainParamsError("Missing case statements")
+
+    for arg in args:
+        if type_ and not isinstance(arg[1], type_):
+            raise DataChainParamsError("Case statement values must be of the same type")
+        type_ = type(arg[1])
+
+    if type_ not in supported_types:
+        raise DataChainParamsError(
+            f"Case supports only python literals ({supported_types}) for values"
+        )
+
+    kwargs = {"else_": else_}
+    return Func("case", inner=sql_case, args=args, kwargs=kwargs, result_type=type_)
datachain/func/func.py CHANGED
@@ -35,6 +35,7 @@ class Func(Function):
         inner: Callable,
         cols: Optional[Sequence[ColT]] = None,
         args: Optional[Sequence[Any]] = None,
+        kwargs: Optional[dict[str, Any]] = None,
         result_type: Optional["DataType"] = None,
         is_array: bool = False,
         is_window: bool = False,
@@ -45,6 +46,7 @@ class Func(Function):
         self.inner = inner
         self.cols = cols or []
         self.args = args or []
+        self.kwargs = kwargs or {}
         self.result_type = result_type
         self.is_array = is_array
         self.is_window = is_window
@@ -63,6 +65,7 @@ class Func(Function):
             self.inner,
             self.cols,
             self.args,
+            self.kwargs,
             self.result_type,
             self.is_array,
             self.is_window,
@@ -333,6 +336,7 @@ class Func(Function):
             self.inner,
             self.cols,
             self.args,
+            self.kwargs,
             self.result_type,
             self.is_array,
             self.is_window,
@@ -387,7 +391,7 @@ class Func(Function):
             return col

         cols = [get_col(col) for col in self._db_cols]
-        func_col = self.inner(*cols, *self.args)
+        func_col = self.inner(*cols, *self.args, **self.kwargs)

         if self.is_window:
             if not self.window:
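The new `kwargs` plumbing exists so wrappers like `case` can forward keyword arguments (here `else_`) to the underlying SQLAlchemy construct via `self.inner(*cols, *self.args, **self.kwargs)`. A standalone sketch of the expression that call produces:

```py
from sqlalchemy import case as sql_case, column

num = column("num")
# What Func("case", inner=sql_case, args=[...], kwargs={"else_": "Z"})
# ultimately evaluates to:
expr = sql_case((num > 0, "P"), (num < 0, "N"), else_="Z")
print(expr)  # CASE WHEN (num > :num_1) THEN :param_1 ... ELSE :param_3 END
```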
datachain/lib/arrow.py CHANGED
@@ -149,6 +149,10 @@ def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
     for file in chain.collect("file"):
         ds = dataset(file.get_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
         schemas.append(ds.schema)
+    if not schemas:
+        raise ValueError(
+            "Cannot infer schema (no files to process or can't access them)"
+        )
     return pa.unify_schemas(schemas)
datachain/lib/dc.py CHANGED
@@ -32,7 +32,7 @@ from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_dat
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import ArrowRow, File, FileType, get_file_type
 from datachain.lib.file import ExportPlacement as FileExportPlacement
-from datachain.lib.listing import get_listing, list_bucket, ls
+from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
 from datachain.lib.listing_info import ListingInfo
 from datachain.lib.meta_formats import read_meta
 from datachain.lib.model_store import ModelStore
@@ -438,6 +438,18 @@ class DataChain:
             uri, session, update=update
         )

+        # ds_name is None if object is a file, we don't want to use cache
+        # or do listing in that case - just read that single object
+        if not list_ds_name:
+            dc = cls.from_values(
+                session=session,
+                settings=settings,
+                in_memory=in_memory,
+                file=[get_file_info(list_uri, cache, client_config=client_config)],
+            )
+            dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+            return dc
+
         if update or not list_ds_exists:
             (
                 cls.from_records(
@@ -1634,7 +1646,7 @@ class DataChain:
         output: OutputType = None,
         object_name: str = "",
         **fr_map,
-    ) -> "DataChain":
+    ) -> "Self":
        """Generate chain from list of values.

         Example:
@@ -1647,7 +1659,7 @@ class DataChain:
         def _func_fr() -> Iterator[tuple_type]:  # type: ignore[valid-type]
             yield from tuples

-        chain = DataChain.from_records(
+        chain = cls.from_records(
             DataChain.DEFAULT_FILE_RECORD,
             session=session,
             settings=settings,
@@ -1870,6 +1882,9 @@ class DataChain:
                 "`nrows` only supported for csv and json formats.",
             )

+        if "file" not in self.schema or not self.count():
+            raise DatasetPrepareError(self.name, "no files to parse.")
+
         schema = None
         col_names = output if isinstance(output, Sequence) else None
         if col_names or not output:
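With this change, pointing `from_storage` at a single object skips listing and caching entirely and yields a one-row chain for just that file. A hedged usage sketch (bucket and paths are made up):

```py
from datachain import DataChain

# A directory URI is listed (and the listing dataset reused) as before:
images = DataChain.from_storage("s3://my-bucket/images/")

# A URI resolving to a single file now reads just that object, with no
# listing dataset created or consulted:
one_file = DataChain.from_storage("s3://my-bucket/images/cat.jpg")
```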
datachain/lib/file.py CHANGED
@@ -364,7 +364,7 @@ class File(DataModel):

         try:
             info = client.fs.info(client.get_full_path(self.path))
-            converted_info = client.info_to_file(info, self.source)
+            converted_info = client.info_to_file(info, self.path)
             return type(self)(
                 path=self.path,
                 source=self.source,