datachain-0.6.1-py3-none-any.whl → datachain-0.6.2-py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- datachain/catalog/catalog.py +61 -219
- datachain/cli.py +136 -22
- datachain/client/fsspec.py +9 -0
- datachain/client/local.py +11 -32
- datachain/config.py +126 -51
- datachain/data_storage/schema.py +66 -33
- datachain/data_storage/sqlite.py +4 -4
- datachain/data_storage/warehouse.py +101 -125
- datachain/lib/dc.py +211 -52
- datachain/lib/func/__init__.py +20 -2
- datachain/lib/func/aggregate.py +319 -8
- datachain/lib/func/func.py +97 -9
- datachain/lib/listing.py +6 -21
- datachain/lib/listing_info.py +4 -0
- datachain/lib/signal_schema.py +8 -5
- datachain/lib/udf.py +3 -3
- datachain/listing.py +22 -48
- datachain/query/dataset.py +11 -3
- datachain/remote/studio.py +63 -14
- datachain/studio.py +129 -0
- datachain/utils.py +58 -0
- {datachain-0.6.1.dist-info → datachain-0.6.2.dist-info}/METADATA +7 -6
- {datachain-0.6.1.dist-info → datachain-0.6.2.dist-info}/RECORD +27 -26
- {datachain-0.6.1.dist-info → datachain-0.6.2.dist-info}/WHEEL +1 -1
- {datachain-0.6.1.dist-info → datachain-0.6.2.dist-info}/LICENSE +0 -0
- {datachain-0.6.1.dist-info → datachain-0.6.2.dist-info}/entry_points.txt +0 -0
- {datachain-0.6.1.dist-info → datachain-0.6.2.dist-info}/top_level.txt +0 -0
datachain/listing.py
CHANGED

@@ -4,12 +4,10 @@ from collections.abc import Iterable, Iterator
 from itertools import zip_longest
 from typing import TYPE_CHECKING, Optional
 
-from fsspec.asyn import get_loop, sync
 from sqlalchemy import Column
 from sqlalchemy.sql import func
 from tqdm import tqdm
 
-from datachain.lib.file import File
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import suffix_to_number
@@ -17,33 +15,29 @@ from datachain.utils import suffix_to_number
 if TYPE_CHECKING:
     from datachain.catalog.datasource import DataSource
     from datachain.client import Client
-    from datachain.data_storage import AbstractMetastore, AbstractWarehouse
+    from datachain.data_storage import AbstractWarehouse
     from datachain.dataset import DatasetRecord
-    from datachain.storage import Storage
 
 
 class Listing:
     def __init__(
         self,
-        storage: Optional["Storage"],
-        metastore: "AbstractMetastore",
         warehouse: "AbstractWarehouse",
         client: "Client",
         dataset: Optional["DatasetRecord"],
+        object_name: str = "file",
     ):
-        self.storage = storage
-        self.metastore = metastore
         self.warehouse = warehouse
         self.client = client
         self.dataset = dataset  # dataset representing bucket listing
+        self.object_name = object_name
 
     def clone(self) -> "Listing":
         return self.__class__(
-            self.storage,
-            self.metastore.clone(),
             self.warehouse.clone(),
             self.client,
             self.dataset,
+            self.object_name,
         )
 
     def __enter__(self) -> "Listing":
@@ -53,46 +47,20 @@ class Listing:
         self.close()
 
     def close(self) -> None:
-        self.metastore.close()
         self.warehouse.close()
 
     @property
-    def
-
+    def uri(self):
+        from datachain.lib.listing import listing_uri_from_name
+
+        return listing_uri_from_name(self.dataset.name)
 
     @property
     def dataset_rows(self):
-        return self.warehouse.dataset_rows(
-
-    def fetch(self, start_prefix="", method: str = "default") -> None:
-        sync(get_loop(), self._fetch, start_prefix, method)
-
-    async def _fetch(self, start_prefix: str, method: str) -> None:
-        with self.clone() as fetch_listing:
-            if start_prefix:
-                start_prefix = start_prefix.rstrip("/")
-            try:
-                async for entries in fetch_listing.client.scandir(
-                    start_prefix, method=method
-                ):
-                    fetch_listing.insert_entries(entries)
-                    if len(entries) > 1:
-                        fetch_listing.metastore.update_last_inserted_at()
-            finally:
-                fetch_listing.insert_entries_done()
-
-    def insert_entry(self, entry: File) -> None:
-        self.insert_entries([entry])
-
-    def insert_entries(self, entries: Iterable[File]) -> None:
-        self.warehouse.insert_rows(
-            self.dataset_rows.get_table(),
-            self.warehouse.prepare_entries(entries),
+        return self.warehouse.dataset_rows(
+            self.dataset, self.dataset.latest_version, object_name=self.object_name
         )
 
-    def insert_entries_done(self) -> None:
-        self.warehouse.insert_rows_done(self.dataset_rows.get_table())
-
     def expand_path(self, path, use_glob=True) -> list[Node]:
         if use_glob and glob.has_magic(path):
             return self.warehouse.expand_path(self.dataset_rows, path)
@@ -200,25 +168,31 @@
         conds = []
         if names:
             for name in names:
-                conds.append(
+                conds.append(
+                    pathfunc.name(Column(dr.col_name("path"))).op("GLOB")(name)
+                )
         if inames:
             for iname in inames:
                 conds.append(
-                    func.lower(pathfunc.name(Column("path"))).op("GLOB")(
+                    func.lower(pathfunc.name(Column(dr.col_name("path")))).op("GLOB")(
+                        iname.lower()
+                    )
                 )
         if paths:
             for path in paths:
-                conds.append(Column("path").op("GLOB")(path))
+                conds.append(Column(dr.col_name("path")).op("GLOB")(path))
         if ipaths:
             for ipath in ipaths:
-                conds.append(
+                conds.append(
+                    func.lower(Column(dr.col_name("path"))).op("GLOB")(ipath.lower())
+                )
 
         if size is not None:
             size_limit = suffix_to_number(size)
             if size_limit >= 0:
-                conds.append(Column("size") >= size_limit)
+                conds.append(Column(dr.col_name("size")) >= size_limit)
             else:
-                conds.append(Column("size") <= -size_limit)
+                conds.append(Column(dr.col_name("size")) <= -size_limit)
 
         return self.warehouse.find(
             dr,
datachain/query/dataset.py
CHANGED

@@ -10,6 +10,7 @@ from abc import ABC, abstractmethod
 from collections.abc import Generator, Iterable, Iterator, Sequence
 from copy import copy
 from functools import wraps
+from secrets import token_hex
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -173,10 +174,10 @@ class QueryStep(StartingStep):
             return sqlalchemy.select(*columns)
 
         dataset = self.catalog.get_dataset(self.dataset_name)
-
+        dr = self.catalog.warehouse.dataset_rows(dataset, self.dataset_version)
 
         return step_result(
-            q,
+            q, dr.columns, dependencies=[(self.dataset_name, self.dataset_version)]
         )
 
 
@@ -720,10 +721,17 @@ class SQLMutate(SQLClause):
 
     def apply_sql_clause(self, query: Select) -> Select:
         original_subquery = query.subquery()
+        to_mutate = {c.name for c in self.args}
+
+        prefix = f"mutate{token_hex(8)}_"
+        cols = [
+            c.label(prefix + c.name) if c.name in to_mutate else c
+            for c in original_subquery.c
+        ]
         # this is needed for new column to be used in clauses
         # like ORDER BY, otherwise new column is not recognized
         subquery = (
-            sqlalchemy.select(*original_subquery.c, *self.args)
+            sqlalchemy.select(*cols, *self.args)
             .select_from(original_subquery)
             .subquery()
        )
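
The SQLMutate change above lets a mutation reuse an existing column name: every original column that is being mutated is relabeled with a random mutate<hex>_ prefix, so the new expression can take the name without a duplicate-column conflict. A self-contained SQLAlchemy sketch of the same trick (the table t and the doubled "size" value are made up for illustration):

from secrets import token_hex

import sqlalchemy

metadata = sqlalchemy.MetaData()
t = sqlalchemy.Table("t", metadata, sqlalchemy.Column("size", sqlalchemy.Integer))

original_subquery = sqlalchemy.select(t).subquery()
# the mutation reuses the existing name "size"
args = [(original_subquery.c.size * 2).label("size")]

to_mutate = {c.name for c in args}
prefix = f"mutate{token_hex(8)}_"  # unique prefix hides the old column
cols = [
    c.label(prefix + c.name) if c.name in to_mutate else c
    for c in original_subquery.c
]
subquery = (
    sqlalchemy.select(*cols, *args).select_from(original_subquery).subquery()
)
print(sqlalchemy.select(subquery.c.size))  # "size" now refers to the mutated value
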
datachain/remote/studio.py
CHANGED

@@ -1,5 +1,6 @@
 import json
 import logging
+import os
 from collections.abc import Iterable, Iterator
 from datetime import datetime, timedelta, timezone
 from struct import unpack
@@ -10,8 +11,10 @@ from typing import (
     TypeVar,
 )
 
+from datachain.config import Config
 from datachain.dataset import DatasetStats
-from datachain.
+from datachain.error import DataChainError
+from datachain.utils import STUDIO_URL, retry_with_backoff
 
 T = TypeVar("T")
 LsData = Optional[list[dict[str, Any]]]
@@ -54,14 +57,54 @@ class Response(Generic[T]):
 
 
 class StudioClient:
-    def __init__(
-        self, url: str, username: str, token: str, timeout: float = 3600.0
-    ) -> None:
+    def __init__(self, timeout: float = 3600.0, team: Optional[str] = None) -> None:
         self._check_dependencies()
-        self.url = url.rstrip("/")
-        self.username = username
-        self.token = token
         self.timeout = timeout
+        self._config = None
+        self._team = team
+
+    @property
+    def token(self) -> str:
+        token = os.environ.get("DVC_STUDIO_TOKEN") or self.config.get("token")
+
+        if not token:
+            raise DataChainError(
+                "Studio token is not set. Use `datachain studio login` "
+                "or environment variable `DVC_STUDIO_TOKEN` to set it."
+            )
+
+        return token
+
+    @property
+    def url(self) -> str:
+        return (
+            os.environ.get("DVC_STUDIO_URL") or self.config.get("url") or STUDIO_URL
+        ) + "/api"
+
+    @property
+    def config(self) -> dict:
+        if self._config is None:
+            self._config = Config().read().get("studio", {})
+        return self._config  # type: ignore [return-value]
+
+    @property
+    def team(self) -> str:
+        if self._team is None:
+            self._team = self._get_team()
+        return self._team
+
+    def _get_team(self) -> str:
+        team = os.environ.get("DVC_STUDIO_TEAM") or self.config.get("team")
+
+        if not team:
+            raise DataChainError(
+                "Studio team is not set. "
+                "Use `datachain studio team <team_name>` "
+                "or environment variable `DVC_STUDIO_TEAM` to set it."
+                "You can also set it in the config file as team under studio."
+            )
+
+        return team
 
     def _check_dependencies(self) -> None:
         try:
@@ -80,7 +123,7 @@ class StudioClient:
 
         response = requests.post(
             f"{self.url}/{route}",
-            json={**data, "team_name": self.
+            json={**data, "team_name": self.team},
             headers={
                 "Content-Type": "application/json",
                 "Authorization": f"token {self.token}",
@@ -108,7 +151,7 @@ class StudioClient:
 
         response = requests.post(
             f"{self.url}/{route}",
-            json={**data, "team_name": self.
+            json={**data, "team_name": self.team},
             headers={
                 "Content-Type": "application/json",
                 "Authorization": f"token {self.token}",
@@ -174,6 +217,9 @@ class StudioClient:
             response = self._send_request_msgpack("ls", {"source": path})
             yield path, response
 
+    def ls_datasets(self) -> Response[LsData]:
+        return self._send_request("datachain/ls-datasets", {})
+
     def dataset_info(self, name: str) -> Response[DatasetInfoData]:
         def _parse_dataset_info(dataset_info):
             _parse_dates(dataset_info, ["created_at", "finished_at"])
@@ -182,7 +228,7 @@ class StudioClient:
 
             return dataset_info
 
-        response = self._send_request("dataset-info", {"dataset_name": name})
+        response = self._send_request("datachain/dataset-info", {"dataset_name": name})
         if response.ok:
             response.data = _parse_dataset_info(response.data)
         return response
@@ -192,13 +238,14 @@ class StudioClient:
     ) -> Response[DatasetRowsData]:
         req_data = {"dataset_name": name, "dataset_version": version}
         return self._send_request_msgpack(
-            "dataset-rows",
+            "datachain/dataset-rows",
             {**req_data, "offset": offset, "limit": DATASET_ROWS_CHUNK_SIZE},
         )
 
     def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
         response = self._send_request(
-            "dataset-stats",
+            "datachain/dataset-stats",
+            {"dataset_name": name, "dataset_version": version},
         )
         if response.ok:
             response.data = DatasetStats(**response.data)
@@ -208,12 +255,14 @@ class StudioClient:
         self, name: str, version: int
     ) -> Response[DatasetExportSignedUrls]:
         return self._send_request(
-            "dataset-export",
+            "datachain/dataset-export",
+            {"dataset_name": name, "dataset_version": version},
        )
 
     def dataset_export_status(
         self, name: str, version: int
    ) -> Response[DatasetExportStatus]:
        return self._send_request(
-            "dataset-export-status",
+            "datachain/dataset-export-status",
+            {"dataset_name": name, "dataset_version": version},
        )
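
With the rewrite above, StudioClient no longer takes url, username, and token in its constructor; each setting is resolved lazily with the precedence environment variable, then the "studio" section of the config, then a default (URL only), raising DataChainError when the token or team cannot be resolved. A small sketch of that resolution order, with hypothetical config contents:

import os

STUDIO_URL = "https://studio.dvc.ai"  # default defined in datachain.utils

def resolve(env_var: str, config: dict, key: str, default: str = "") -> str:
    # precedence: environment variable, then config file, then default
    return os.environ.get(env_var) or config.get(key) or default

studio_conf = {"team": "my-team"}  # hypothetical Config().read()["studio"]
url = resolve("DVC_STUDIO_URL", studio_conf, "url", STUDIO_URL) + "/api"
team = resolve("DVC_STUDIO_TEAM", studio_conf, "team")  # empty would raise in the client
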
datachain/studio.py
ADDED

@@ -0,0 +1,129 @@
+import os
+from typing import TYPE_CHECKING
+
+from datachain.catalog.catalog import raise_remote_error
+from datachain.config import Config, ConfigLevel
+from datachain.error import DataChainError
+from datachain.remote.studio import StudioClient
+from datachain.utils import STUDIO_URL
+
+if TYPE_CHECKING:
+    from argparse import Namespace
+
+POST_LOGIN_MESSAGE = (
+    "Once you've logged in, return here "
+    "and you'll be ready to start using DataChain with Studio."
+)
+
+
+def process_studio_cli_args(args: "Namespace"):
+    if args.cmd == "login":
+        return login(args)
+    if args.cmd == "logout":
+        return logout()
+    if args.cmd == "token":
+        return token()
+    if args.cmd == "datasets":
+        return list_datasets(args)
+    if args.cmd == "team":
+        return set_team(args)
+    raise DataChainError(f"Unknown command '{args.cmd}'.")
+
+
+def set_team(args: "Namespace"):
+    level = ConfigLevel.GLOBAL if args.__dict__.get("global") else ConfigLevel.LOCAL
+    config = Config(level)
+    with config.edit() as conf:
+        studio_conf = conf.get("studio", {})
+        studio_conf["team"] = args.team_name
+        conf["studio"] = studio_conf
+
+    print(f"Set default team to '{args.team_name}' in {config.config_file()}")
+
+
+def login(args: "Namespace"):
+    from dvc_studio_client.auth import StudioAuthError, get_access_token
+
+    config = Config().read().get("studio", {})
+    name = args.name
+    hostname = (
+        args.hostname
+        or os.environ.get("DVC_STUDIO_URL")
+        or config.get("url")
+        or STUDIO_URL
+    )
+    scopes = args.scopes
+
+    if config.get("url", hostname) == hostname and "token" in config:
+        raise DataChainError(
+            "Token already exists. "
+            "To login with a different token, "
+            "logout using `datachain studio logout`."
+        )
+
+    open_browser = not args.no_open
+    try:
+        _, access_token = get_access_token(
+            token_name=name,
+            hostname=hostname,
+            scopes=scopes,
+            open_browser=open_browser,
+            client_name="DataChain",
+            post_login_message=POST_LOGIN_MESSAGE,
+        )
+    except StudioAuthError as exc:
+        raise DataChainError(f"Failed to authenticate with Studio: {exc}") from exc
+
+    config_path = save_config(hostname, access_token)
+    print(f"Authentication complete. Saved token to {config_path}.")
+    return 0
+
+
+def logout():
+    with Config(ConfigLevel.GLOBAL).edit() as conf:
+        token = conf.get("studio", {}).get("token")
+        if not token:
+            raise DataChainError(
+                "Not logged in to Studio. Log in with 'datachain studio login'."
+            )
+
+        del conf["studio"]["token"]
+
+    print("Logged out from Studio. (you can log back in with 'datachain studio login')")
+
+
+def token():
+    config = Config().read().get("studio", {})
+    token = config.get("token")
+    if not token:
+        raise DataChainError(
+            "Not logged in to Studio. Log in with 'datachain studio login'."
+        )
+
+    print(token)
+
+
+def list_datasets(args: "Namespace"):
+    client = StudioClient(team=args.team)
+    response = client.ls_datasets()
+    if not response.ok:
+        raise_remote_error(response.message)
+    if not response.data:
+        print("No datasets found.")
+        return
+    for d in response.data:
+        name = d.get("name")
+        for v in d.get("versions", []):
+            version = v.get("version")
+            print(f"{name} (v{version})")
+
+
+def save_config(hostname, token):
+    config = Config(ConfigLevel.GLOBAL)
+    with config.edit() as conf:
+        studio_conf = conf.get("studio", {})
+        studio_conf["url"] = hostname
+        studio_conf["token"] = token
+        conf["studio"] = studio_conf
+
+    return config.config_file()
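
A hypothetical way to drive the new module programmatically; in the release, datachain/cli.py builds this Namespace from the "datachain studio ..." argparse subcommands, so the attribute names below mirror set_team():

from argparse import Namespace

from datachain.studio import process_studio_cli_args

# equivalent of `datachain studio team my-team`; "global" must be passed via a
# dict because it is a Python keyword. Running this writes to the local config.
args = Namespace(cmd="team", team_name="my-team", **{"global": False})
process_studio_cli_args(args)
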
datachain/utils.py
CHANGED

@@ -15,6 +15,7 @@ from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
 from uuid import UUID
 
 import cloudpickle
+import platformdirs
 from dateutil import tz
 from dateutil.parser import isoparse
 from pydantic import BaseModel
@@ -25,6 +26,13 @@ if TYPE_CHECKING:
 NUL = b"\0"
 TIME_ZERO = datetime.fromtimestamp(0, tz=timezone.utc)
 
+APPNAME = "datachain"
+APPAUTHOR = "iterative"
+ENV_DATACHAIN_SYSTEM_CONFIG_DIR = "DATACHAIN_SYSTEM_CONFIG_DIR"
+ENV_DATACHAIN_GLOBAL_CONFIG_DIR = "DATACHAIN_GLOBAL_CONFIG_DIR"
+STUDIO_URL = "https://studio.dvc.ai"
+
+
 T = TypeVar("T", bound="DataChainDir")
 
 
@@ -33,6 +41,7 @@ class DataChainDir:
     CACHE = "cache"
     TMP = "tmp"
     DB = "db"
+    CONFIG = "config"
     ENV_VAR = "DATACHAIN_DIR"
     ENV_VAR_DATACHAIN_ROOT = "DATACHAIN_ROOT_DIR"
 
@@ -42,6 +51,7 @@ class DataChainDir:
         cache: Optional[str] = None,
         tmp: Optional[str] = None,
         db: Optional[str] = None,
+        config: Optional[str] = None,
     ) -> None:
         self.root = osp.abspath(root) if root is not None else self.default_root()
         self.cache = (
@@ -51,12 +61,24 @@ class DataChainDir:
             osp.abspath(tmp) if tmp is not None else osp.join(self.root, self.TMP)
         )
         self.db = osp.abspath(db) if db is not None else osp.join(self.root, self.DB)
+        self.config = (
+            osp.abspath(config)
+            if config is not None
+            else osp.join(self.root, self.CONFIG)
+        )
+        self.config = (
+            osp.abspath(config)
+            if config is not None
+            else osp.join(self.root, self.CONFIG)
+        )
 
     def init(self):
        os.makedirs(self.root, exist_ok=True)
        os.makedirs(self.cache, exist_ok=True)
        os.makedirs(self.tmp, exist_ok=True)
        os.makedirs(osp.split(self.db)[0], exist_ok=True)
+        os.makedirs(osp.split(self.config)[0], exist_ok=True)
+        os.makedirs(osp.split(self.config)[0], exist_ok=True)
 
     @classmethod
     def default_root(cls) -> str:
@@ -82,6 +104,18 @@ class DataChainDir:
         return instance
 
 
+def system_config_dir():
+    return os.getenv(ENV_DATACHAIN_SYSTEM_CONFIG_DIR) or platformdirs.site_config_dir(
+        APPNAME, APPAUTHOR
+    )
+
+
+def global_config_dir():
+    return os.getenv(ENV_DATACHAIN_GLOBAL_CONFIG_DIR) or platformdirs.user_config_dir(
+        APPNAME, APPAUTHOR
+    )
+
+
 def human_time_to_int(time: str) -> Optional[int]:
     if not time:
         return None
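
The new config-dir helpers above defer to platformdirs unless overridden by the DATACHAIN_*_CONFIG_DIR environment variables. For example (exact paths vary by OS):

import platformdirs

# what global_config_dir() falls back to when DATACHAIN_GLOBAL_CONFIG_DIR is
# unset; on Linux this is typically ~/.config/datachain
print(platformdirs.user_config_dir("datachain", "iterative"))
print(platformdirs.site_config_dir("datachain", "iterative"))  # system-wide level
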
@@ -421,3 +455,27 @@ def env2bool(var, undefined=False):
     if var is None:
         return undefined
     return bool(re.search("1|y|yes|true", var, flags=re.IGNORECASE))
+
+
+def nested_dict_path_set(
+    data: dict[str, Any], path: Sequence[str], value: Any
+) -> dict[str, Any]:
+    """Sets a value inside a nested dict based on the list of dict keys as a path,
+    and will create sub-dicts as needed to set the value."""
+    sub_data = data
+    for element in path[:-1]:
+        if element not in sub_data:
+            sub_data[element] = {}
+        sub_data = sub_data[element]
+    sub_data[path[len(path) - 1]] = value
+    return data
+
+
+def row_to_nested_dict(
+    headers: Iterable[Sequence[str]], row: Iterable[Any]
+) -> dict[str, Any]:
+    """Converts a row to a nested dict based on the provided headers."""
+    result: dict[str, Any] = {}
+    for h, v in zip(headers, row):
+        nested_dict_path_set(result, h, v)
+    return result
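
Expected behavior of the two new helpers above, per their docstrings (the sample values are illustrative):

from datachain.utils import nested_dict_path_set, row_to_nested_dict

d = nested_dict_path_set({}, ["studio", "team"], "my-team")
assert d == {"studio": {"team": "my-team"}}

# each header is the key path for the corresponding row value
headers = [("file", "path"), ("file", "size"), ("name",)]
row = ("images/cat.jpg", 2048, "cat.jpg")
assert row_to_nested_dict(headers, row) == {
    "file": {"path": "images/cat.jpg", "size": 2048},
    "name": "cat.jpg",
}
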
{datachain-0.6.1.dist-info → datachain-0.6.2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.6.1
+Version: 0.6.2
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -19,7 +19,7 @@ License-File: LICENSE
 Requires-Dist: pyyaml
 Requires-Dist: tomlkit
 Requires-Dist: tqdm
-Requires-Dist: numpy
+Requires-Dist: numpy <3,>=1
 Requires-Dist: pandas >=2.0.0
 Requires-Dist: pyarrow
 Requires-Dist: typing-extensions
@@ -38,15 +38,16 @@ Requires-Dist: orjson >=3.10.5
 Requires-Dist: pydantic <3,>=2
 Requires-Dist: jmespath >=1.0
 Requires-Dist: datamodel-code-generator >=0.25
-Requires-Dist: Pillow <
+Requires-Dist: Pillow <12,>=10.0.0
 Requires-Dist: msgpack <2,>=1.0.4
 Requires-Dist: psutil
 Requires-Dist: huggingface-hub
 Requires-Dist: iterative-telemetry >=0.0.9
-Requires-Dist:
+Requires-Dist: platformdirs
+Requires-Dist: dvc-studio-client <1,>=0.21
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests] ; extra == 'dev'
-Requires-Dist: mypy ==1.12.
+Requires-Dist: mypy ==1.12.1 ; extra == 'dev'
 Requires-Dist: types-python-dateutil ; extra == 'dev'
 Requires-Dist: types-pytz ; extra == 'dev'
 Requires-Dist: types-PyYAML ; extra == 'dev'
@@ -63,7 +64,7 @@ Requires-Dist: datachain[tests] ; extra == 'examples'
 Requires-Dist: numpy <2,>=1 ; extra == 'examples'
 Requires-Dist: defusedxml ; extra == 'examples'
 Requires-Dist: accelerate ; extra == 'examples'
-Requires-Dist: unstructured[embed-huggingface,pdf] ; extra == 'examples'
+Requires-Dist: unstructured[embed-huggingface,pdf] <0.16.0 ; extra == 'examples'
 Requires-Dist: pdfplumber ==0.11.4 ; extra == 'examples'
 Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
 Requires-Dist: onnx ==1.16.1 ; extra == 'examples'