datachain 0.8.2__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain has been flagged as potentially problematic.
- datachain/cache.py +4 -2
- datachain/catalog/catalog.py +100 -54
- datachain/catalog/datasource.py +4 -6
- datachain/cli/__init__.py +311 -0
- datachain/cli/commands/__init__.py +29 -0
- datachain/cli/commands/datasets.py +129 -0
- datachain/cli/commands/du.py +14 -0
- datachain/cli/commands/index.py +12 -0
- datachain/cli/commands/ls.py +169 -0
- datachain/cli/commands/misc.py +28 -0
- datachain/cli/commands/query.py +53 -0
- datachain/cli/commands/show.py +38 -0
- datachain/cli/parser/__init__.py +547 -0
- datachain/cli/parser/job.py +120 -0
- datachain/cli/parser/studio.py +126 -0
- datachain/cli/parser/utils.py +63 -0
- datachain/{cli_utils.py → cli/utils.py} +27 -1
- datachain/client/azure.py +21 -1
- datachain/client/fsspec.py +45 -13
- datachain/client/gcs.py +10 -2
- datachain/client/local.py +4 -4
- datachain/client/s3.py +10 -0
- datachain/dataset.py +1 -0
- datachain/func/__init__.py +2 -2
- datachain/func/conditional.py +52 -0
- datachain/func/func.py +5 -1
- datachain/lib/arrow.py +4 -0
- datachain/lib/dc.py +18 -3
- datachain/lib/file.py +1 -1
- datachain/lib/listing.py +36 -3
- datachain/lib/signal_schema.py +89 -27
- datachain/listing.py +1 -5
- datachain/node.py +27 -1
- datachain/progress.py +2 -2
- datachain/query/session.py +1 -1
- datachain/studio.py +58 -38
- datachain/utils.py +1 -1
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/METADATA +6 -6
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/RECORD +43 -31
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/WHEEL +1 -1
- datachain/cli.py +0 -1475
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/LICENSE +0 -0
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/top_level.txt +0 -0
datachain/lib/listing.py
CHANGED

```diff
@@ -39,6 +39,15 @@ def list_bucket(uri: str, cache, client_config=None) -> Callable:
     return list_func
 
 
+def get_file_info(uri: str, cache, client_config=None) -> File:
+    """
+    Wrapper to return File object by its URI
+    """
+    client = Client.get_client(uri, cache, **(client_config or {}))  # type: ignore[arg-type]
+    _, path = Client.parse_url(uri)
+    return client.get_file_info(path)
+
+
 def ls(
     dc: D,
     path: str,
@@ -76,7 +85,25 @@ def ls(
     return dc.filter(pathfunc.parent(_file_c("path")) == path.lstrip("/").rstrip("/*"))
 
 
-def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
+def _isfile(client: "Client", path: str) -> bool:
+    """
+    Returns True if uri points to a file
+    """
+    try:
+        info = client.fs.info(path)
+        name = info.get("name")
+        # case for special simulated directories on some clouds
+        # e.g. Google creates a zero byte file with the same name as the
+        # directory with a trailing slash at the end
+        if not name or name.endswith("/"):
+            return False
+
+        return info["type"] == "file"
+    except:  # noqa: E722
+        return False
+
+
+def parse_listing_uri(uri: str, cache, client_config) -> tuple[Optional[str], str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
     """
@@ -85,7 +112,9 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
     storage_uri, path = Client.parse_url(uri)
     telemetry.log_param("client", client.PREFIX)
 
-    if
+    if not uri.endswith("/") and _isfile(client, uri):
+        return None, f'{storage_uri}/{path.lstrip("/")}', path
+    if uses_glob(path):
         lst_uri_path = posixpath.dirname(path)
     else:
         storage_uri, path = Client.parse_url(f'{uri.rstrip("/")}/')
@@ -113,7 +142,7 @@ def listing_uri_from_name(dataset_name: str) -> str:
 
 def get_listing(
     uri: str, session: "Session", update: bool = False
-) -> tuple[str, str, str, bool]:
+) -> tuple[Optional[str], str, str, bool]:
     """Returns correct listing dataset name that must be used for saving listing
     operation. It takes into account existing listings and reusability of those.
     It also returns boolean saying if returned dataset name is reused / already
@@ -131,6 +160,10 @@ def get_listing(
     ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
     listing = None
 
+    # if we don't want to use cached dataset (e.g. for a single file listing)
+    if not ds_name:
+        return None, list_uri, list_path, False
+
     listings = [
         ls for ls in catalog.listings() if not ls.is_expired and ls.contains(ds_name)
     ]
```
datachain/lib/signal_schema.py
CHANGED

```diff
@@ -13,13 +13,14 @@ from typing import (  # noqa: UP035
     Final,
     List,
     Literal,
+    Mapping,
     Optional,
     Union,
     get_args,
     get_origin,
 )
 
-from pydantic import BaseModel, create_model
+from pydantic import BaseModel, Field, create_model
 from sqlalchemy import ColumnElement
 from typing_extensions import Literal as LiteralEx
 
@@ -85,8 +86,31 @@ class SignalResolvingTypeError(SignalResolvingError):
     )
 
 
+class CustomType(BaseModel):
+    schema_version: int = Field(ge=1, le=2, strict=True)
+    name: str
+    fields: dict[str, str]
+    bases: list[tuple[str, str, Optional[str]]]
+
+    @classmethod
+    def deserialize(cls, data: dict[str, Any], type_name: str) -> "CustomType":
+        version = data.get("schema_version", 1)
+
+        if version == 1:
+            data = {
+                "schema_version": 1,
+                "name": type_name,
+                "fields": data,
+                "bases": [],
+            }
+
+        return cls(**data)
+
+
 def create_feature_model(
-    name: str,
+    name: str,
+    fields: Mapping[str, Union[type, None, tuple[type, Any]]],
+    base: Optional[type] = None,
 ) -> type[BaseModel]:
     """
     This gets or returns a dynamic feature model for use in restoring a model
@@ -98,7 +122,7 @@ def create_feature_model(
     name = name.replace("@", "_")
     return create_model(
         name,
-        __base__=DataModel,  # type: ignore[call-overload]
+        __base__=base or DataModel,  # type: ignore[call-overload]
         # These are tuples for each field of: annotation, default (if any)
         **{
             field_name: anno if isinstance(anno, tuple) else (anno, None)
@@ -156,7 +180,7 @@ class SignalSchema:
         return SignalSchema(signals)
 
     @staticmethod
-    def
+    def _serialize_custom_model(
         version_name: str, fr: type[BaseModel], custom_types: dict[str, Any]
     ) -> str:
         """This serializes any custom type information to the provided custom_types
@@ -165,12 +189,23 @@ class SignalSchema:
             # This type is already stored in custom_types.
             return version_name
         fields = {}
+
         for field_name, info in fr.model_fields.items():
             field_type = info.annotation
             # All fields should be typed.
             assert field_type
             fields[field_name] = SignalSchema._serialize_type(field_type, custom_types)
-
+
+        bases: list[tuple[str, str, Optional[str]]] = []
+        for type_ in fr.__mro__:
+            model_store_name = (
+                ModelStore.get_name(type_) if issubclass(type_, DataModel) else None
+            )
+            bases.append((type_.__name__, type_.__module__, model_store_name))
+
+        ct = CustomType(schema_version=2, name=version_name, fields=fields, bases=bases)
+        custom_types[version_name] = ct.model_dump()
+
         return version_name
 
     @staticmethod
@@ -184,15 +219,12 @@ class SignalSchema:
             if st is None or not ModelStore.is_pydantic(st):
                 continue
             # Register and save feature types.
-            ModelStore.register(st)
             st_version_name = ModelStore.get_name(st)
             if st is fr:
                 # If the main type is Pydantic, then use the ModelStore version name.
                 type_name = st_version_name
             # Save this type to custom_types.
-            SignalSchema.
-                st_version_name, st, custom_types
-            )
+            SignalSchema._serialize_custom_model(st_version_name, st, custom_types)
         return type_name
 
     def serialize(self) -> dict[str, Any]:
@@ -215,7 +247,7 @@ class SignalSchema:
                 depth += 1
             elif c == "]":
                 if depth == 0:
-                    raise
+                    raise ValueError(
                         "Extra closing square bracket when parsing subtype list"
                     )
                 depth -= 1
@@ -223,16 +255,51 @@ class SignalSchema:
             subtypes.append(type_name[start:i].strip())
             start = i + 1
         if depth > 0:
-            raise
+            raise ValueError("Unclosed square bracket when parsing subtype list")
         subtypes.append(type_name[start:].strip())
         return subtypes
 
     @staticmethod
-    def
+    def _deserialize_custom_type(
+        type_name: str, custom_types: dict[str, Any]
+    ) -> Optional[type]:
+        """Given a type name like MyType@v1 gets a type from ModelStore or recreates
+        it based on the information from the custom types dict that includes fields and
+        bases."""
+        model_name, version = ModelStore.parse_name_version(type_name)
+        fr = ModelStore.get(model_name, version)
+        if fr:
+            return fr
+
+        if type_name in custom_types:
+            ct = CustomType.deserialize(custom_types[type_name], type_name)
+
+            fields = {
+                field_name: SignalSchema._resolve_type(field_type_str, custom_types)
+                for field_name, field_type_str in ct.fields.items()
+            }
+
+            base_model = None
+            for base in ct.bases:
+                _, _, model_store_name = base
+                if model_store_name:
+                    model_name, version = ModelStore.parse_name_version(
+                        model_store_name
+                    )
+                    base_model = ModelStore.get(model_name, version)
+                if base_model:
+                    break
+
+            return create_feature_model(type_name, fields, base=base_model)
+
+        return None
+
+    @staticmethod
+    def _resolve_type(type_name: str, custom_types: dict[str, Any]) -> Optional[type]:
         """Convert a string-based type back into a python type."""
         type_name = type_name.strip()
         if not type_name:
-            raise
+            raise ValueError("Type cannot be empty")
         if type_name == "NoneType":
             return None
 
@@ -240,14 +307,14 @@ class SignalSchema:
         subtypes: Optional[tuple[Optional[type], ...]] = None
         if bracket_idx > -1:
             if bracket_idx == 0:
-                raise
+                raise ValueError("Type cannot start with '['")
             close_bracket_idx = type_name.rfind("]")
             if close_bracket_idx == -1:
-                raise
+                raise ValueError("Unclosed square bracket when parsing type")
             if close_bracket_idx < bracket_idx:
-                raise
+                raise ValueError("Square brackets are out of order when parsing type")
             if close_bracket_idx == bracket_idx + 1:
-                raise
+                raise ValueError("Empty square brackets when parsing type")
             subtype_names = SignalSchema._split_subtypes(
                 type_name[bracket_idx + 1 : close_bracket_idx]
            )
@@ -267,18 +334,10 @@ class SignalSchema:
             return fr[subtypes]  # type: ignore[index]
         return fr  # type: ignore[return-value]
 
-
-        fr = ModelStore.get(model_name, version)
+        fr = SignalSchema._deserialize_custom_type(type_name, custom_types)
         if fr:
             return fr
 
-        if type_name in custom_types:
-            fields = custom_types[type_name]
-            fields = {
-                field_name: SignalSchema._resolve_type(field_type_str, custom_types)
-                for field_name, field_type_str in fields.items()
-            }
-            return create_feature_model(type_name, fields)
         # This can occur if a third-party or custom type is used, which is not available
         # when deserializing.
         warnings.warn(
@@ -317,7 +376,7 @@ class SignalSchema:
                     stacklevel=2,
                 )
                 continue
-            except
+            except ValueError as err:
                 raise SignalSchemaError(
                     f"cannot deserialize '{signal}': {err}"
                 ) from err
@@ -662,6 +721,9 @@ class SignalSchema:
                 stacklevel=2,
             )
             return "Any"
+        if ModelStore.is_pydantic(type_):
+            ModelStore.register(type_)
+            return ModelStore.get_name(type_)
         return type_.__name__
 
     @staticmethod
```
datachain/listing.py
CHANGED

```diff
@@ -157,11 +157,7 @@ class Listing:
 
         counter = 0
         for node in all_nodes:
-            dst = os.path.join(output, *node.path)
-            dst_dir = os.path.dirname(dst)
-            os.makedirs(dst_dir, exist_ok=True)
-            file = node.n.to_file(self.client.uri)
-            self.client.instantiate_object(file, dst, progress_bar, force)
+            node.instantiate(self.client, output, progress_bar, force=force)
             counter += 1
             if counter > 1000:
                 progress_bar.update(counter)
```
datachain/node.py
CHANGED

```diff
@@ -1,3 +1,4 @@
+import os
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Optional
 
@@ -10,6 +11,8 @@ from datachain.utils import TIME_ZERO, time_to_str
 if TYPE_CHECKING:
     from typing_extensions import Self
 
+    from datachain.client import Client
+
 
 class DirType:
     FILE = 0
@@ -114,7 +117,21 @@ class Node:
     )
 
     @classmethod
-    def
+    def from_file(cls, f: File) -> "Self":
+        return cls(
+            source=StorageURI(f.source),
+            path=f.path,
+            etag=f.etag,
+            is_latest=f.is_latest,
+            size=f.size,
+            last_modified=f.last_modified,
+            version=f.version,
+            location=str(f.location) if f.location else None,
+            dir_type=DirType.FILE,
+        )
+
+    @classmethod
+    def from_row(cls, d: dict[str, Any], file_prefix: str = "file") -> "Self":
         def _dval(field_name: str):
             return d.get(f"{file_prefix}__{field_name}")
 
@@ -174,6 +191,15 @@ class NodeWithPath:
             path += "/"
         return path
 
+    def instantiate(
+        self, client: "Client", output: str, progress_bar, *, force: bool = False
+    ):
+        dst = os.path.join(output, *self.path)
+        dst_dir = os.path.dirname(dst)
+        os.makedirs(dst_dir, exist_ok=True)
+        file = self.n.to_file(client.uri)
+        client.instantiate_object(file, dst, progress_bar, force)
+
 
 TIME_FMT = "%Y-%m-%d %H:%M"
```
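`NodeWithPath.instantiate` absorbs the path handling that the caller in datachain/listing.py previously did inline: join the node's path components under the output directory, create intermediate directories, then hand the file to the client. The path logic in isolation (made-up values):

```python
import os

output = "/tmp/out"
node_path = ["images", "cats", "1.jpg"]  # stands in for NodeWithPath.path
dst = os.path.join(output, *node_path)
os.makedirs(os.path.dirname(dst), exist_ok=True)  # creates /tmp/out/images/cats
print(dst)  # /tmp/out/images/cats/1.jpg
```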
datachain/progress.py
CHANGED

```diff
@@ -61,7 +61,7 @@ class Tqdm(tqdm):
         disable : If (default: None) or False,
             will be determined by logging level.
             May be overridden to `True` due to non-TTY status.
-            Skip override by specifying env var `
+            Skip override by specifying env var `DATACHAIN_IGNORE_ISATTY`.
         kwargs : anything accepted by `tqdm.tqdm()`
         """
         kwargs = kwargs.copy()
@@ -77,7 +77,7 @@ class Tqdm(tqdm):
         # auto-disable based on TTY
         if (
             not disable
-            and not env2bool("
+            and not env2bool("DATACHAIN_IGNORE_ISATTY")
             and hasattr(file, "isatty")
         ):
             disable = not file.isatty()
```
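Both the docstring and the TTY check now reference `DATACHAIN_IGNORE_ISATTY`. To keep progress bars when output is not a TTY (for example, piped CI logs), set the variable before the bar is created; a minimal sketch, assuming `env2bool` treats `"1"` as truthy:

```python
import os

# Opt out of the non-TTY auto-disable for datachain progress bars.
os.environ["DATACHAIN_IGNORE_ISATTY"] = "1"
```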
datachain/query/session.py
CHANGED

```diff
@@ -55,7 +55,7 @@ class Session:
         client_config: Optional[dict] = None,
         in_memory: bool = False,
     ):
-        if re.match(r"^[0-9a-zA-Z]
+        if re.match(r"^[0-9a-zA-Z]*$", name) is None:
             raise ValueError(
                 f"Session name can contain only letters or numbers - '{name}' given."
             )
```
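The new pattern is anchored and allows only alphanumerics; note that `*` (as opposed to `+`) also accepts an empty name. A quick standalone check of what it matches:

```python
import re

pattern = r"^[0-9a-zA-Z]*$"
for name in ["mySession1", "", "bad-name", "with space"]:
    print(repr(name), bool(re.match(pattern, name)))
# 'mySession1' and '' match; hyphens and spaces are rejected.
```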
datachain/studio.py
CHANGED

```diff
@@ -20,21 +20,7 @@ POST_LOGIN_MESSAGE = (
 )
 
 
-def process_studio_cli_args(args: "Namespace"):  # noqa: PLR0911
-    if args.cmd == "login":
-        return login(args)
-    if args.cmd == "logout":
-        return logout()
-    if args.cmd == "token":
-        return token()
-    if args.cmd == "datasets":
-        rows = [
-            {"Name": name, "Version": version}
-            for name, version in list_datasets(args.team)
-        ]
-        print(tabulate(rows, headers="keys"))
-        return 0
-
+def process_jobs_args(args: "Namespace"):
     if args.cmd == "run":
         return create_job(
             args.query_file,
@@ -50,6 +36,25 @@ def process_studio_cli_args(args: "Namespace"):  # noqa: PLR0911
 
     if args.cmd == "cancel":
         return cancel_job(args.job_id, args.team)
+    if args.cmd == "logs":
+        return show_job_logs(args.job_id, args.team)
+    raise DataChainError(f"Unknown command '{args.cmd}'.")
+
+
+def process_studio_cli_args(args: "Namespace"):
+    if args.cmd == "login":
+        return login(args)
+    if args.cmd == "logout":
+        return logout()
+    if args.cmd == "token":
+        return token()
+    if args.cmd == "dataset":
+        rows = [
+            {"Name": name, "Version": version}
+            for name, version in list_datasets(args.team)
+        ]
+        print(tabulate(rows, headers="keys"))
+        return 0
 
     if args.cmd == "team":
         return set_team(args)
@@ -187,6 +192,32 @@ def save_config(hostname, token):
     return config.config_file()
 
 
+def show_logs_from_client(client, job_id):
+    # Sync usage
+    async def _run():
+        async for message in client.tail_job_logs(job_id):
+            if "logs" in message:
+                for log in message["logs"]:
+                    print(log["message"], end="")
+            elif "job" in message:
+                print(f"\n>>>> Job is now in {message['job']['status']} status.")
+
+    asyncio.run(_run())
+
+    response = client.dataset_job_versions(job_id)
+    if not response.ok:
+        raise_remote_error(response.message)
+
+    response_data = response.data
+    if response_data:
+        dataset_versions = response_data.get("dataset_versions", [])
+        print("\n\n>>>> Dataset versions created during the job:")
+        for version in dataset_versions:
+            print(f"  - {version.get('dataset_name')}@v{version.get('version')}")
+    else:
+        print("No dataset versions created during the job.")
+
+
 def create_job(
     query_file: str,
     team_name: Optional[str],
@@ -236,29 +267,7 @@ def create_job(
     print("Open the job in Studio at", response.data.get("job", {}).get("url"))
     print("=" * 40)
 
-
-    async def _run():
-        async for message in client.tail_job_logs(job_id):
-            if "logs" in message:
-                for log in message["logs"]:
-                    print(log["message"], end="")
-            elif "job" in message:
-                print(f"\n>>>> Job is now in {message['job']['status']} status.")
-
-    asyncio.run(_run())
-
-    response = client.dataset_job_versions(job_id)
-    if not response.ok:
-        raise_remote_error(response.message)
-
-    response_data = response.data
-    if response_data:
-        dataset_versions = response_data.get("dataset_versions", [])
-        print("\n\n>>>> Dataset versions created during the job:")
-        for version in dataset_versions:
-            print(f"  - {version.get('dataset_name')}@v{version.get('version')}")
-    else:
-        print("No dataset versions created during the job.")
+    show_logs_from_client(client, job_id)
 
 
 def upload_files(client: StudioClient, files: list[str]) -> list[str]:
@@ -293,3 +302,14 @@ def cancel_job(job_id: str, team_name: Optional[str]):
         raise_remote_error(response.message)
 
     print(f"Job {job_id} canceled")
+
+
+def show_job_logs(job_id: str, team_name: Optional[str]):
+    token = Config().read().get("studio", {}).get("token")
+    if not token:
+        raise DataChainError(
+            "Not logged in to Studio. Log in with 'datachain studio login'."
+        )
+
+    client = StudioClient(team=team_name)
+    show_logs_from_client(client, job_id)
```
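The extracted `show_logs_from_client` drains an async generator of log messages with `asyncio.run`, then prints the dataset versions the job produced. A self-contained sketch of the tailing pattern, with a fake client standing in for `StudioClient` (which requires a Studio token):

```python
import asyncio

class FakeClient:
    """Stands in for StudioClient; tail_job_logs yields message dicts."""

    async def tail_job_logs(self, job_id):
        yield {"logs": [{"message": f"job {job_id}: step 1\n"}]}
        yield {"job": {"status": "COMPLETED"}}

async def _run(client, job_id):
    async for message in client.tail_job_logs(job_id):
        if "logs" in message:
            for log in message["logs"]:
                print(log["message"], end="")
        elif "job" in message:
            print(f">>>> Job is now in {message['job']['status']} status.")

asyncio.run(_run(FakeClient(), "job-123"))
```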
datachain/utils.py
CHANGED

```diff
@@ -30,7 +30,7 @@ APPNAME = "datachain"
 APPAUTHOR = "iterative"
 ENV_DATACHAIN_SYSTEM_CONFIG_DIR = "DATACHAIN_SYSTEM_CONFIG_DIR"
 ENV_DATACHAIN_GLOBAL_CONFIG_DIR = "DATACHAIN_GLOBAL_CONFIG_DIR"
-STUDIO_URL = "https://studio.
+STUDIO_URL = "https://studio.datachain.ai"
 
 
 T = TypeVar("T", bound="DataChainDir")
```
{datachain-0.8.2.dist-info → datachain-0.8.4.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.8.2
+Version: 0.8.4
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -50,7 +50,7 @@ Requires-Dist: websockets
 Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
-Requires-Dist: mkdocs-material
+Requires-Dist: mkdocs-material==9.5.22; extra == "docs"
 Requires-Dist: mkdocs-section-index>=0.3.6; extra == "docs"
 Requires-Dist: mkdocstrings-python>=1.6.3; extra == "docs"
 Requires-Dist: mkdocs-literate-nav>=0.6.1; extra == "docs"
@@ -72,7 +72,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
 Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
-Requires-Dist: pytest-servers[all]>=0.5.
+Requires-Dist: pytest-servers[all]>=0.5.9; extra == "tests"
 Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
 Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
 Requires-Dist: virtualenv; extra == "tests"
@@ -84,7 +84,7 @@ Requires-Dist: requests-mock; extra == "tests"
 Requires-Dist: scipy; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.14.
+Requires-Dist: mypy==1.14.1; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
@@ -95,11 +95,11 @@ Requires-Dist: datachain[tests]; extra == "examples"
 Requires-Dist: defusedxml; extra == "examples"
 Requires-Dist: accelerate; extra == "examples"
 Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
-Requires-Dist: unstructured[pdf]; extra == "examples"
+Requires-Dist: unstructured[pdf]<0.16.12; extra == "examples"
 Requires-Dist: pdfplumber==0.11.4; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: onnx==1.16.1; extra == "examples"
-Requires-Dist: ultralytics==8.3.
+Requires-Dist: ultralytics==8.3.55; extra == "examples"
 
 ================
 |logo| DataChain
```