datachain 0.6.1__py3-none-any.whl → 0.6.3__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.

Potentially problematic release.



datachain/cli.py CHANGED
@@ -15,8 +15,8 @@ import shtab
 from datachain import Session, utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
 from datachain.lib.dc import DataChain
+from datachain.studio import process_studio_cli_args
 from datachain.telemetry import telemetry
-from datachain.utils import DataChainDir
 
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
@@ -98,6 +98,134 @@ def add_show_args(parser: ArgumentParser) -> None:
     )
 
 
+def add_studio_parser(subparsers, parent_parser) -> None:
+    studio_help = "Commands to authenticate DataChain with Iterative Studio"
+    studio_description = (
+        "Authenticate DataChain with Studio and set the token. "
+        "Once this token has been properly configured,\n"
+        "DataChain will utilize it for seamlessly sharing datasets\n"
+        "and using Studio features from CLI"
+    )
+
+    studio_parser = subparsers.add_parser(
+        "studio",
+        parents=[parent_parser],
+        description=studio_description,
+        help=studio_help,
+    )
+    studio_subparser = studio_parser.add_subparsers(
+        dest="cmd",
+        help="Use `DataChain studio CMD --help` to display command-specific help.",
+        required=True,
+    )
+
+    studio_login_help = "Authenticate DataChain with Studio host"
+    studio_login_description = (
+        "By default, this command authenticates the DataChain with Studio\n"
+        "using default scopes and assigns a random name as the token name."
+    )
+    login_parser = studio_subparser.add_parser(
+        "login",
+        parents=[parent_parser],
+        description=studio_login_description,
+        help=studio_login_help,
+    )
+
+    login_parser.add_argument(
+        "-H",
+        "--hostname",
+        action="store",
+        default=None,
+        help="The hostname of the Studio instance to authenticate with.",
+    )
+    login_parser.add_argument(
+        "-s",
+        "--scopes",
+        action="store",
+        default=None,
+        help="The scopes for the authentication token. ",
+    )
+
+    login_parser.add_argument(
+        "-n",
+        "--name",
+        action="store",
+        default=None,
+        help="The name of the authentication token. It will be used to\n"
+        "identify token shown in Studio profile.",
+    )
+
+    login_parser.add_argument(
+        "--no-open",
+        action="store_true",
+        default=False,
+        help="Use authentication flow based on user code.\n"
+        "You will be presented with user code to enter in browser.\n"
+        "DataChain will also use this if it cannot launch browser on your behalf.",
+    )
+
+    studio_logout_help = "Logout user from Studio"
+    studio_logout_description = "This removes the studio token from your global config."
+
+    studio_subparser.add_parser(
+        "logout",
+        parents=[parent_parser],
+        description=studio_logout_description,
+        help=studio_logout_help,
+    )
+
+    studio_team_help = "Set the default team for DataChain"
+    studio_team_description = (
+        "Set the default team for DataChain to use when interacting with Studio."
+    )
+
+    team_parser = studio_subparser.add_parser(
+        "team",
+        parents=[parent_parser],
+        description=studio_team_description,
+        help=studio_team_help,
+    )
+    team_parser.add_argument(
+        "team_name",
+        action="store",
+        help="The name of the team to set as the default.",
+    )
+    team_parser.add_argument(
+        "--global",
+        action="store_true",
+        default=False,
+        help="Set the team globally for all DataChain projects.",
+    )
+
+    studio_token_help = "View the token datachain uses to contact Studio"  # noqa: S105 # nosec B105
+
+    studio_subparser.add_parser(
+        "token",
+        parents=[parent_parser],
+        description=studio_token_help,
+        help=studio_token_help,
+    )
+
+    studio_ls_dataset_help = "List the available datasets from Studio"
+    studio_ls_dataset_description = (
+        "This command lists all the datasets available in Studio.\n"
+        "It will show the dataset name and the number of versions available."
+    )
+
+    ls_dataset_parser = studio_subparser.add_parser(
+        "datasets",
+        parents=[parent_parser],
+        description=studio_ls_dataset_description,
+        help=studio_ls_dataset_help,
+    )
+    ls_dataset_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="The team to list datasets for. By default, it will use team from config.",
+    )
+
+
 def get_parser() -> ArgumentParser:  # noqa: PLR0915
     try:
         __version__ = version("datachain")
@@ -121,12 +249,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         action="store_true",
         help="AWS anon (aka awscli's --no-sign-request)",
     )
-    parent_parser.add_argument(
-        "--ttl",
-        type=human_time_type,
-        default=TTL_HUMAN,
-        help="Time-to-live of data source cache. Negative equals forever.",
-    )
     parent_parser.add_argument(
         "-u", "--update", action="count", default=0, help="Update cache"
     )
@@ -226,6 +348,8 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Use a different filename for the resulting .edatachain file",
     )
 
+    add_studio_parser(subp, parent_parser)
+
     parse_pull = subp.add_parser(
         "pull",
         parents=[parent_parser],
@@ -638,16 +762,13 @@ def format_ls_entry(entry: str) -> str:
 
 
 def ls_remote(
-    url: str,
-    username: str,
-    token: str,
     paths: Iterable[str],
     long: bool = False,
 ):
     from datachain.node import long_line_str
     from datachain.remote.studio import StudioClient
 
-    client = StudioClient(url, username, token)
+    client = StudioClient()
     first = True
     for path, response in client.ls(paths):
         if not first:
@@ -679,17 +800,14 @@ def ls(
     **kwargs,
 ):
     if config is None:
-        from .config import get_remote_config, read_config
+        from .config import Config
 
-        config = get_remote_config(read_config(DataChainDir.find().root), remote=remote)
+        config = Config().get_remote_config(remote=remote)
     remote_type = config["type"]
     if remote_type == "local":
        ls_local(sources, long=long, **kwargs)
     else:
         ls_remote(
-            config["url"],
-            config["username"],
-            config["token"],
             sources,
             long=long,
         )
@@ -887,7 +1005,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
             edatachain_only=False,
             no_edatachain_file=True,
             no_glob=args.no_glob,
-            ttl=args.ttl,
         )
     elif args.command == "clone":
         catalog.clone(
@@ -897,7 +1014,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
             update=bool(args.update),
             recursive=bool(args.recursive),
             no_glob=args.no_glob,
-            ttl=args.ttl,
             no_cp=args.no_cp,
             edatachain=args.edatachain,
             edatachain_file=args.edatachain_file,
@@ -923,7 +1039,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
             args.sources,
             long=bool(args.long),
             remote=args.remote,
-            ttl=args.ttl,
             update=bool(args.update),
             client_config=client_config,
         )
@@ -957,7 +1072,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
             show_bytes=args.bytes,
             depth=args.depth,
             si=args.si,
-            ttl=args.ttl,
             update=bool(args.update),
             client_config=client_config,
         )
@@ -965,7 +1079,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
         results_found = False
         for result in catalog.find(
             args.sources,
-            ttl=args.ttl,
             update=bool(args.update),
             names=args.name,
             inames=args.iname,
@@ -983,7 +1096,6 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
         index(
             catalog,
             args.sources,
-            ttl=args.ttl,
             update=bool(args.update),
         )
     elif args.command == "completion":
  elif args.command == "completion":
@@ -1001,6 +1113,8 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
1001
1113
  clear_cache(catalog)
1002
1114
  elif args.command == "gc":
1003
1115
  garbage_collect(catalog)
1116
+ elif args.command == "studio":
1117
+ process_studio_cli_args(args)
1004
1118
  else:
1005
1119
  print(f"invalid command: {args.command}", file=sys.stderr)
1006
1120
  return 1
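
Note (not part of the diff): the new subcommand rides on the existing CLI entry point. get_parser() registers the studio parser shown above, and main() routes the parsed arguments to process_studio_cli_args(). A minimal sketch, assuming the package is installed; the team name is a placeholder:

    from datachain.cli import main

    # Equivalent to running `datachain studio team my-team --global` in a shell:
    # main() parses argv, sees args.command == "studio", and delegates to
    # process_studio_cli_args(args); the return value is the process exit code.
    exit_code = main(["studio", "team", "my-team", "--global"])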
@@ -124,6 +124,9 @@ class Client(ABC):
     def get_client(source: str, cache: DataChainCache, **kwargs) -> "Client":
         cls = Client.get_implementation(source)
         storage_url, _ = cls.split_url(source)
+        if os.name == "nt":
+            storage_url = storage_url.removeprefix("/")
+
         return cls.from_name(storage_url, cache, kwargs)
 
     @classmethod
@@ -171,6 +174,12 @@ class Client(ABC):
 
     @classmethod
     def split_url(cls, url: str) -> tuple[str, str]:
+        """
+        Splits the URL into two pieces:
+        1. bucket name without protocol (everything up until the first /)
+        2. path which is the rest of URL starting from bucket name
+        e.g s3://my-bucket/animals/dogs -> (my-bucket, animals/dogs)
+        """
         fill_path = url[len(cls.PREFIX) :]
         path_split = fill_path.split("/", 1)
         bucket = path_split[0]
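
To make the newly documented contract concrete, here is a standalone restatement of the split (illustrative only; the "s3://" prefix stands in for the client's PREFIX attribute, which is not shown in this diff):

    def split_url_example(url: str, prefix: str = "s3://") -> tuple[str, str]:
        # Drop the protocol prefix, then split on the first "/": everything
        # before it is the bucket, everything after it is the path.
        rest = url[len(prefix):]
        parts = rest.split("/", 1)
        return parts[0], parts[1] if len(parts) > 1 else ""

    assert split_url_example("s3://my-bucket/animals/dogs") == ("my-bucket", "animals/dogs")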
datachain/client/local.py CHANGED
@@ -29,25 +29,7 @@ class FileClient(Client):
 
     @classmethod
     def get_uri(cls, name) -> StorageURI:
-        """
-        This returns root of FS as uri, e.g
-        Linux & Mac : file:///
-        Windows: file:///C:/
-        """
-        return StorageURI(Path(name).as_uri())
-
-    @staticmethod
-    def root_dir() -> str:
-        """
-        Returns file system root path.
-        Linux & MacOS: /
-        Windows: C:/
-        """
-        return Path.cwd().anchor.replace(os.sep, posixpath.sep)
-
-    @staticmethod
-    def root_path() -> Path:
-        return Path(FileClient.root_dir())
+        return StorageURI(f'{cls.PREFIX}/{name.removeprefix("/")}')
 
     @classmethod
     def ls_buckets(cls, **kwargs):
@@ -75,23 +57,20 @@ class FileClient(Client):
 
     @classmethod
     def split_url(cls, url: str) -> tuple[str, str]:
-        """
-        Splits url into two components:
-        1. root of the FS which will later on become the name of the storage
-        2. path which will later on become partial path
-        Note that URL needs to be have file:/// protocol.
-        Examples:
-        file:///tmp/dir -> / + tmp/dir
-        file:///c:/windows/files -> c:/ + windows/files
-        """
         parsed = urlparse(url)
         if parsed.scheme == "file":
             scheme, rest = url.split(":", 1)
-            uri = f"{scheme.lower()}:{rest}"
+            url = f"{scheme.lower()}:{rest}"
         else:
-            uri = cls.path_to_uri(url)
-
-        return cls.root_dir(), uri.removeprefix(cls.root_path().as_uri())
+            url = cls.path_to_uri(url)
+
+        fill_path = url[len(cls.PREFIX) :]
+        path_split = fill_path.rsplit("/", 1)
+        bucket = path_split[0]
+        if os.name == "nt":
+            bucket = bucket.removeprefix("/")
+        path = path_split[1] if len(path_split) > 1 else ""
+        return bucket, path
 
     @classmethod
     def from_name(cls, name: str, cache, kwargs) -> "FileClient":
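
The practical effect of the FileClient change, as a standalone sketch (illustrative only; it assumes FileClient.PREFIX is "file://", which this diff does not show): local URLs are now split on the last "/", so the parent directory becomes the storage name and the final component becomes the path, rather than always anchoring at the filesystem root:

    def file_split_url_example(url: str) -> tuple[str, str]:
        # Mirrors the new FileClient.split_url logic for the non-Windows case.
        rest = url[len("file://"):]
        parts = rest.rsplit("/", 1)
        return parts[0], parts[1] if len(parts) > 1 else ""

    # Previously documented behavior: file:///tmp/dir -> ("/", "tmp/dir").
    assert file_split_url_example("file:///tmp/dir") == ("/tmp", "dir")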
datachain/config.py CHANGED
@@ -1,62 +1,137 @@
-import os
 from collections.abc import Mapping
-from typing import TYPE_CHECKING, Optional
+from contextlib import contextmanager
+from enum import Enum
+from typing import Optional, Union
 
-from tomlkit import load
+from tomlkit import TOMLDocument, dump, load
 
-if TYPE_CHECKING:
-    from tomlkit import TOMLDocument
+from datachain.utils import DataChainDir, global_config_dir, system_config_dir
 
 
-def read_config(datachain_root: str) -> Optional["TOMLDocument"]:
-    config_path = os.path.join(datachain_root, "config")
-    try:
-        with open(config_path, encoding="utf-8") as f:
-            return load(f)
-    except FileNotFoundError:
-        return None
+# Define an enum with value system, global and local
+class ConfigLevel(Enum):
+    SYSTEM = "system"
+    GLOBAL = "global"
+    LOCAL = "local"
 
 
-def get_remote_config(
-    config: Optional["TOMLDocument"], remote: str = ""
-) -> Mapping[str, str]:
-    if config is None:
-        return {"type": "local"}
-    if not remote:
+class Config:
+    SYSTEM_LEVELS = (ConfigLevel.SYSTEM, ConfigLevel.GLOBAL)
+    LOCAL_LEVELS = (ConfigLevel.LOCAL,)
+
+    # In the order of precedence
+    LEVELS = SYSTEM_LEVELS + LOCAL_LEVELS
+
+    def __init__(
+        self,
+        level: Optional[ConfigLevel] = None,
+    ):
+        self.level = level
+
+        self.init()
+
+    @classmethod
+    def get_dir(cls, level: Optional[ConfigLevel]) -> str:
+        if level == ConfigLevel.SYSTEM:
+            return system_config_dir()
+        if level == ConfigLevel.GLOBAL:
+            return global_config_dir()
+
+        return str(DataChainDir.find().root)
+
+    def init(self):
+        d = DataChainDir(self.get_dir(self.level))
+        d.init()
+
+    def load_one(self, level: Optional[ConfigLevel] = None) -> TOMLDocument:
+        config_path = DataChainDir(self.get_dir(level)).config
+
         try:
-            remote = config["core"]["default-remote"]  # type: ignore[index,assignment]
-        except KeyError:
+            with open(config_path, encoding="utf-8") as f:
+                return load(f)
+        except FileNotFoundError:
+            return TOMLDocument()
+
+    def load_config_to_level(self) -> TOMLDocument:
+        merged_conf = TOMLDocument()
+
+        for merge_level in self.LEVELS:
+            if merge_level == self.level:
+                break
+            config = self.load_one(merge_level)
+            if config:
+                merge(merged_conf, config)
+
+        return merged_conf
+
+    def read(self) -> TOMLDocument:
+        if self.level is None:
+            return self.load_config_to_level()
+        return self.load_one(self.level)
+
+    @contextmanager
+    def edit(self):
+        config = self.load_one(self.level)
+        yield config
+
+        self.write(config)
+
+    def config_file(self):
+        return DataChainDir(self.get_dir(self.level)).config
+
+    def write(self, config: TOMLDocument):
+        with open(self.config_file(), "w") as f:
+            dump(config, f)
+
+    def get_remote_config(self, remote: str = "") -> Mapping[str, str]:
+        config = self.read()
+
+        if not config:
             return {"type": "local"}
-    try:
-        remote_conf: Mapping[str, str] = config["remote"][remote]  # type: ignore[assignment,index]
-    except KeyError:
-        raise Exception(
-            f"missing config section for default remote: remote.{remote}"
-        ) from None
-    except Exception as exc:
-        raise Exception("invalid config") from exc
-
-    if not isinstance(remote_conf, Mapping):
-        raise TypeError(f"config section remote.{remote} must be a mapping")
-
-    remote_type = remote_conf.get("type")
-    if remote_type not in ("local", "http"):
-        raise Exception(
-            f'config section remote.{remote} must have "type" with one of: '
-            '"local", "http"'
-        )
-
-    if remote_type == "http":
-        for key in ["url", "username", "token"]:
+        if not remote:
             try:
-                remote_conf[key]
+                remote = config["core"]["default-remote"]  # type: ignore[index,assignment]
             except KeyError:
-                raise Exception(
-                    f"config section remote.{remote} of type {remote_type} "
-                    f"must contain key {key}"
-                ) from None
-    elif remote_type != "local":
-        raise Exception(
-            f"config section remote.{remote} has invalid remote type {remote_type}"
-        )
-    return remote_conf
+                return {"type": "local"}
+        try:
+            remote_conf: Mapping[str, str] = config["remote"][remote]  # type: ignore[assignment,index]
+        except KeyError:
+            raise Exception(
+                f"missing config section for default remote: remote.{remote}"
+            ) from None
+        except Exception as exc:
+            raise Exception("invalid config") from exc
+
+        if not isinstance(remote_conf, Mapping):
+            raise TypeError(f"config section remote.{remote} must be a mapping")
+
+        remote_type = remote_conf.get("type")
+        if remote_type not in ("local", "http"):
+            raise Exception(
+                f'config section remote.{remote} must have "type" with one of: '
+                '"local", "http"'
+            )
+
+        if remote_type == "http":
+            for key in ["url", "username", "token"]:
+                try:
+                    remote_conf[key]
+                except KeyError:
+                    raise Exception(
+                        f"config section remote.{remote} of type {remote_type} "
+                        f"must contain key {key}"
+                    ) from None
+        elif remote_type != "local":
+            raise Exception(
+                f"config section remote.{remote} has invalid remote type {remote_type}"
+            )
+        return remote_conf
+
+
+def merge(into: Union[TOMLDocument, dict], update: Union[TOMLDocument, dict]):
+    """Merges second dict into first recursively"""
+    for key, val in update.items():
+        if isinstance(into.get(key), dict) and isinstance(val, dict):
+            merge(into[key], val)  # type: ignore[arg-type]
+        else:
+            into[key] = val
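
A minimal usage sketch of the new Config API (not part of the diff; the "studio" section and "team" value are illustrative placeholders, not something this release guarantees):

    from datachain.config import Config, ConfigLevel

    # edit() loads the config file for the chosen level, yields a TOMLDocument,
    # and writes it back when the with-block exits.
    with Config(ConfigLevel.GLOBAL).edit() as conf:
        conf["studio"] = {"team": "my-team"}

    # With no level given, read() merges system -> global -> local, so more
    # local levels override broader ones; get_remote_config() falls back to
    # {"type": "local"} when nothing is configured.
    merged = Config().read()
    remote = Config().get_remote_config()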
@@ -26,6 +26,13 @@ if TYPE_CHECKING:
     from sqlalchemy.sql.elements import ColumnElement
 
 
+DEFAULT_DELIMITER = "__"
+
+
+def col_name(name: str, object_name: str = "file") -> str:
+    return f"{object_name}{DEFAULT_DELIMITER}{name}"
+
+
 def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
     """
     Removes duplicate columns from a list of columns.
@@ -76,64 +83,81 @@ convert_rows_custom_column_types(
 
 
 class DirExpansion:
-    @staticmethod
-    def base_select(q):
+    def __init__(self, object_name: str):
+        self.object_name = object_name
+
+    def col_name(self, name: str, object_name: Optional[str] = None) -> str:
+        object_name = object_name or self.object_name
+        return col_name(name, object_name)
+
+    def c(self, query, name: str, object_name: Optional[str] = None) -> str:
+        return getattr(query.c, self.col_name(name, object_name=object_name))
+
+    def base_select(self, q):
         return sa.select(
-            q.c.sys__id,
-            false().label("is_dir"),
-            q.c.source,
-            q.c.path,
-            q.c.version,
-            q.c.location,
+            self.c(q, "id", object_name="sys"),
+            false().label(self.col_name("is_dir")),
+            self.c(q, "source"),
+            self.c(q, "path"),
+            self.c(q, "version"),
+            self.c(q, "location"),
         )
 
-    @staticmethod
-    def apply_group_by(q):
+    def apply_group_by(self, q):
         return (
             sa.select(
                 f.min(q.c.sys__id).label("sys__id"),
-                q.c.is_dir,
-                q.c.source,
-                q.c.path,
-                q.c.version,
-                f.max(q.c.location).label("location"),
+                self.c(q, "is_dir"),
+                self.c(q, "source"),
+                self.c(q, "path"),
+                self.c(q, "version"),
+                f.max(self.c(q, "location")).label(self.col_name("location")),
             )
             .select_from(q)
-            .group_by(q.c.source, q.c.path, q.c.is_dir, q.c.version)
-            .order_by(q.c.source, q.c.path, q.c.is_dir, q.c.version)
+            .group_by(
+                self.c(q, "source"),
+                self.c(q, "path"),
+                self.c(q, "is_dir"),
+                self.c(q, "version"),
+            )
+            .order_by(
+                self.c(q, "source"),
+                self.c(q, "path"),
+                self.c(q, "is_dir"),
+                self.c(q, "version"),
+            )
         )
 
-    @classmethod
-    def query(cls, q):
-        q = cls.base_select(q).cte(recursive=True)
-        parent = path.parent(q.c.path)
+    def query(self, q):
+        q = self.base_select(q).cte(recursive=True)
+        parent = path.parent(self.c(q, "path"))
         q = q.union_all(
             sa.select(
                 sa.literal(-1).label("sys__id"),
-                true().label("is_dir"),
-                q.c.source,
-                parent.label("path"),
-                sa.literal("").label("version"),
-                null().label("location"),
+                true().label(self.col_name("is_dir")),
+                self.c(q, "source"),
+                parent.label(self.col_name("path")),
+                sa.literal("").label(self.col_name("version")),
+                null().label(self.col_name("location")),
            ).where(parent != "")
         )
-        return cls.apply_group_by(q)
+        return self.apply_group_by(q)
 
 
 class DataTable:
-    dataset_dir_expansion = staticmethod(DirExpansion.query)
-
     def __init__(
         self,
         name: str,
         engine: "Engine",
         metadata: Optional["sa.MetaData"] = None,
         column_types: Optional[dict[str, SQLType]] = None,
+        object_name: str = "file",
     ):
         self.name: str = name
         self.engine = engine
         self.metadata: sa.MetaData = metadata if metadata is not None else sa.MetaData()
         self.column_types: dict[str, SQLType] = column_types or {}
+        self.object_name = object_name
 
     @staticmethod
     def copy_column(
@@ -204,9 +228,18 @@ class DataTable:
     def columns(self) -> "ReadOnlyColumnCollection[str, sa.Column[Any]]":
         return self.table.columns
 
-    @property
-    def c(self):
-        return self.columns
+    def col_name(self, name: str, object_name: Optional[str] = None) -> str:
+        object_name = object_name or self.object_name
+        return col_name(name, object_name)
+
+    def without_object(
+        self, column_name: str, object_name: Optional[str] = None
+    ) -> str:
+        object_name = object_name or self.object_name
+        return column_name.removeprefix(f"{object_name}{DEFAULT_DELIMITER}")
+
+    def c(self, name: str, object_name: Optional[str] = None):
+        return getattr(self.columns, self.col_name(name, object_name=object_name))
 
     @property
     def table(self) -> "sa.Table":
@@ -246,7 +279,7 @@ class DataTable:
         ]
 
     def dir_expansion(self):
-        return self.dataset_dir_expansion(self)
+        return DirExpansion(self.object_name)
 
 
 PARTITION_COLUMN_ID = "partition_id"
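
The schema refactor above hangs on one flat naming convention: an object prefix joined to a field name with DEFAULT_DELIMITER. A small self-contained illustration mirroring col_name() and without_object() from this diff:

    DEFAULT_DELIMITER = "__"

    def col_name(name: str, object_name: str = "file") -> str:
        return f"{object_name}{DEFAULT_DELIMITER}{name}"

    # DataTable.c("path") resolves the column named "file__path";
    # DirExpansion uses object_name="sys" for the id column.
    assert col_name("path") == "file__path"
    assert col_name("id", object_name="sys") == "sys__id"

    # without_object() strips the prefix again:
    assert "file__path".removeprefix(f"file{DEFAULT_DELIMITER}") == "path"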