datachain 0.8.13__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/__init__.py +10 -0
- datachain/catalog/catalog.py +32 -9
- datachain/cli/__init__.py +2 -0
- datachain/cli/commands/datasets.py +78 -12
- datachain/cli/parser/__init__.py +62 -12
- datachain/cli/parser/job.py +14 -4
- datachain/cli/parser/studio.py +8 -0
- datachain/cli/parser/utils.py +20 -1
- datachain/dataset.py +7 -4
- datachain/diff/__init__.py +78 -128
- datachain/fs/reference.py +21 -0
- datachain/func/__init__.py +3 -1
- datachain/func/conditional.py +66 -2
- datachain/job.py +1 -1
- datachain/lib/arrow.py +1 -11
- datachain/lib/dc.py +2 -0
- datachain/lib/file.py +292 -5
- datachain/lib/hf.py +1 -1
- datachain/lib/video.py +223 -0
- datachain/query/dataset.py +28 -3
- datachain/remote/studio.py +13 -6
- datachain/studio.py +34 -12
- datachain/utils.py +12 -2
- {datachain-0.8.13.dist-info → datachain-0.9.0.dist-info}/METADATA +13 -5
- {datachain-0.8.13.dist-info → datachain-0.9.0.dist-info}/RECORD +30 -28
- /datachain/{lib/vfile.py → fs/__init__.py} +0 -0
- {datachain-0.8.13.dist-info → datachain-0.9.0.dist-info}/LICENSE +0 -0
- {datachain-0.8.13.dist-info → datachain-0.9.0.dist-info}/WHEEL +0 -0
- {datachain-0.8.13.dist-info → datachain-0.9.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.13.dist-info → datachain-0.9.0.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED
@@ -4,9 +4,14 @@ from datachain.lib.file import (
     ArrowRow,
     File,
     FileError,
+    Image,
     ImageFile,
     TarVFile,
     TextFile,
+    Video,
+    VideoFile,
+    VideoFragment,
+    VideoFrame,
 )
 from datachain.lib.model_store import ModelStore
 from datachain.lib.udf import Aggregator, Generator, Mapper
@@ -27,6 +32,7 @@ __all__ = [
     "File",
     "FileError",
     "Generator",
+    "Image",
     "ImageFile",
     "Mapper",
     "ModelStore",
@@ -34,6 +40,10 @@ __all__ = [
     "Sys",
     "TarVFile",
     "TextFile",
+    "Video",
+    "VideoFile",
+    "VideoFragment",
+    "VideoFrame",
     "is_chain_type",
     "metrics",
     "param",
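With these new top-level exports, the video types can be imported directly from the package root. A minimal sketch (assuming the new classes follow the same pattern as the existing File/ImageFile models):

    # Hypothetical usage of the exports added in 0.9.0
    from datachain import Video, VideoFile, VideoFragment, VideoFrame

    # VideoFile is expected to behave like the other File subclasses,
    # e.g. as a column type in a chain's schema.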
datachain/catalog/catalog.py
CHANGED
@@ -89,10 +89,6 @@ PULL_DATASET_SLEEP_INTERVAL = 0.1  # sleep time while waiting for chunk to be av
 PULL_DATASET_CHECK_STATUS_INTERVAL = 20  # interval to check export status in Studio


-def raise_remote_error(error_message: str) -> NoReturn:
-    raise DataChainError(f"Error from server: {error_message}")
-
-
 def noop(_: str):
     pass

@@ -211,14 +207,14 @@ class DatasetRowsFetcher(NodesThreadPool):
             self.remote_ds_name, self.remote_ds_version
         )
         if not export_status_response.ok:
-
+            raise DataChainError(export_status_response.message)

         export_status = export_status_response.data["status"]  # type: ignore [index]

         if export_status == "failed":
-
+            raise DataChainError("Dataset export failed in Studio")
         if export_status == "removed":
-
+            raise DataChainError("Dataset export removed in Studio")

         self.last_status_check = time.time()

@@ -1101,6 +1097,31 @@ class Catalog:
     def get_dataset(self, name: str) -> DatasetRecord:
         return self.metastore.get_dataset(name)

+    def get_dataset_with_remote_fallback(
+        self, name: str, version: Optional[int] = None
+    ) -> DatasetRecord:
+        try:
+            ds = self.get_dataset(name)
+            if version and not ds.has_version(version):
+                raise DatasetVersionNotFoundError(
+                    f"Dataset {name} does not have version {version}"
+                )
+            return ds
+
+        except (DatasetNotFoundError, DatasetVersionNotFoundError):
+            print("Dataset not found in local catalog, trying to get from studio")
+
+            remote_ds_uri = f"{DATASET_PREFIX}{name}"
+            if version:
+                remote_ds_uri += f"@v{version}"
+
+            self.pull_dataset(
+                remote_ds_uri=remote_ds_uri,
+                local_ds_name=name,
+                local_ds_version=version,
+            )
+            return self.get_dataset(name)
+
     def get_dataset_with_version_uuid(self, uuid: str) -> DatasetRecord:
         """Returns dataset that contains version with specific uuid"""
         for dataset in self.ls_datasets():
@@ -1113,7 +1134,7 @@ class Catalog:

         info_response = studio_client.dataset_info(name)
         if not info_response.ok:
-
+            raise DataChainError(info_response.message)

         dataset_info = info_response.data
         assert isinstance(dataset_info, dict)
@@ -1209,6 +1230,8 @@ class Catalog:
         **kwargs,
     ) -> str:
         client_config = client_config or self.client_config
+        if client_config.get("anon"):
+            content_disposition = None
         client = Client.get_client(source, self.cache, **client_config)
         return client.url(
             path,
@@ -1407,7 +1430,7 @@ class Catalog:
             remote_ds_name, remote_ds_version.version
         )
         if not export_response.ok:
-
+            raise DataChainError(export_response.message)

         signed_urls = export_response.data
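The new get_dataset_with_remote_fallback method first resolves the dataset (and, optionally, a specific version) from the local catalog; on DatasetNotFoundError or DatasetVersionNotFoundError it pulls the dataset from Studio and retries the local lookup. A minimal sketch of how a caller might use it (obtaining the catalog via get_catalog is an assumption, not part of this diff):

    from datachain.catalog import get_catalog  # assumed helper for building a Catalog

    catalog = get_catalog()
    # Pulls "my-dataset" v2 from Studio if it is not already in the local catalog.
    ds = catalog.get_dataset_with_remote_fallback("my-dataset", version=2)
    print(ds.name, [v.version for v in ds.versions])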
datachain/cli/__init__.py
CHANGED
@@ -12,49 +12,115 @@ from datachain.error import DatasetNotFoundError
 from datachain.studio import list_datasets as list_datasets_studio


+def group_dataset_versions(datasets, latest_only=True):
+    grouped = {}
+    # Sort to ensure groupby works as expected
+    # (groupby expects consecutive items with the same key)
+    for name, version in sorted(datasets):
+        grouped.setdefault(name, []).append(version)
+
+    if latest_only:
+        # For each dataset name, pick the highest version.
+        return {name: max(versions) for name, versions in grouped.items()}
+    # For each dataset name, return a sorted list of unique versions.
+    return {name: sorted(set(versions)) for name, versions in grouped.items()}
+
+
 def list_datasets(
     catalog: "Catalog",
     studio: bool = False,
     local: bool = False,
     all: bool = True,
     team: Optional[str] = None,
+    latest_only: bool = True,
+    name: Optional[str] = None,
 ):
     token = Config().read().get("studio", {}).get("token")
     all, local, studio = determine_flavors(studio, local, all, token)
+    if name:
+        latest_only = False

-    local_datasets = set(list_datasets_local(catalog)) if all or local else set()
+    local_datasets = set(list_datasets_local(catalog, name)) if all or local else set()
     studio_datasets = (
-        set(list_datasets_studio(team=team
+        set(list_datasets_studio(team=team, name=name))
+        if (all or studio) and token
+        else set()
     )

+    # Group the datasets for both local and studio sources.
+    local_grouped = group_dataset_versions(local_datasets, latest_only)
+    studio_grouped = group_dataset_versions(studio_datasets, latest_only)
+
+    # Merge all dataset names from both sources.
+    all_dataset_names = sorted(set(local_grouped.keys()) | set(studio_grouped.keys()))
+
+    datasets = []
+    if latest_only:
+        # For each dataset name, get the latest version from each source (if available).
+        for n in all_dataset_names:
+            datasets.append((n, (local_grouped.get(n), studio_grouped.get(n))))
+    else:
+        # For each dataset name, merge all versions from both sources.
+        for n in all_dataset_names:
+            local_versions = local_grouped.get(n, [])
+            studio_versions = studio_grouped.get(n, [])
+
+            # If neither source has any versions, record it as (None, None).
+            if not local_versions and not studio_versions:
+                datasets.append((n, (None, None)))
+            else:
+                # For each unique version from either source, record its presence.
+                for version in sorted(set(local_versions) | set(studio_versions)):
+                    datasets.append(
+                        (
+                            n,
+                            (
+                                version if version in local_versions else None,
+                                version if version in studio_versions else None,
+                            ),
+                        )
+                    )
+
     rows = [
         _datasets_tabulate_row(
-            name=
-            version=version,
+            name=n,
             both=(all or (local and studio)) and token,
-
-
+            local_version=local_version,
+            studio_version=studio_version,
         )
-        for
+        for n, (local_version, studio_version) in datasets
     ]

     print(tabulate(rows, headers="keys"))


-def list_datasets_local(catalog: "Catalog"):
+def list_datasets_local(catalog: "Catalog", name: Optional[str] = None):
+    if name:
+        yield from list_datasets_local_versions(catalog, name)
+        return
+
     for d in catalog.ls_datasets():
         for v in d.versions:
             yield (d.name, v.version)


-def
+def list_datasets_local_versions(catalog: "Catalog", name: str):
+    ds = catalog.get_dataset(name)
+    for v in ds.versions:
+        yield (name, v.version)
+
+
+def _datasets_tabulate_row(name, both, local_version, studio_version):
     row = {
         "Name": name,
-        "Version": version,
     }
     if both:
-        row["Studio"] = "
-        row["Local"] = "
+        row["Studio"] = f"v{studio_version}" if studio_version else "\u2716"
+        row["Local"] = f"v{local_version}" if local_version else "\u2716"
+    else:
+        latest_version = local_version or studio_version
+        row["Latest Version"] = f"v{latest_version}" if latest_version else "\u2716"
+
     return row
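The new group_dataset_versions helper collapses (name, version) pairs into a mapping, keeping either only the highest version per dataset or a sorted list of unique versions. An illustrative example of the expected behaviour:

    pairs = [("cats", 1), ("cats", 3), ("dogs", 2), ("cats", 3)]

    group_dataset_versions(pairs)                      # {"cats": 3, "dogs": 2}
    group_dataset_versions(pairs, latest_only=False)   # {"cats": [1, 3], "dogs": [2]}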
datachain/cli/parser/__init__.py
CHANGED
@@ -1,5 +1,4 @@
 import argparse
-from argparse import ArgumentParser
 from importlib.metadata import PackageNotFoundError, version

 import shtab
@@ -10,12 +9,16 @@ from .job import add_jobs_parser
 from .studio import add_auth_parser
 from .utils import (
     FIND_COLUMNS,
+    CustomHelpFormatter,
     add_anon_arg,
     add_show_args,
     add_sources_arg,
     add_update_arg,
     find_columns_type,
 )
+from .utils import (
+    CustomArgumentParser as ArgumentParser,
+)


 def get_parser() -> ArgumentParser:  # noqa: PLR0915
@@ -28,10 +31,11 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parser = ArgumentParser(
         description="DataChain: Wrangle unstructured AI data at scale.",
         prog="datachain",
+        formatter_class=CustomHelpFormatter,
     )
     parser.add_argument("-V", "--version", action="version", version=__version__)

-    parent_parser = ArgumentParser(add_help=False)
+    parent_parser = ArgumentParser(add_help=False, formatter_class=CustomHelpFormatter)
     parent_parser.add_argument(
         "-v", "--verbose", action="count", default=0, help="Be verbose"
     )
@@ -59,7 +63,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help=f"Use `{parser.prog} command --help` for command-specific help",
     )
     parse_cp = subp.add_parser(
-        "cp",
+        "cp",
+        parents=[parent_parser],
+        description="Copy data files from the cloud.",
+        formatter_class=CustomHelpFormatter,
     )
     add_sources_arg(parse_cp).complete = shtab.DIR  # type: ignore[attr-defined]
     parse_cp.add_argument(
@@ -90,7 +97,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     add_update_arg(parse_cp)

     parse_clone = subp.add_parser(
-        "clone",
+        "clone",
+        parents=[parent_parser],
+        description="Copy data files from the cloud.",
+        formatter_class=CustomHelpFormatter,
     )
     add_sources_arg(parse_clone).complete = shtab.DIR  # type: ignore[attr-defined]
     parse_clone.add_argument(
@@ -134,6 +144,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         aliases=["ds"],
         parents=[parent_parser],
         description="Commands for managing datasets.",
+        formatter_class=CustomHelpFormatter,
     )
     add_anon_arg(datasets_parser)
     datasets_subparser = datasets_parser.add_subparsers(
@@ -145,6 +156,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "pull",
         parents=[parent_parser],
         description="Pull specific dataset version from Studio.",
+        formatter_class=CustomHelpFormatter,
     )
     parse_pull.add_argument(
         "dataset",
@@ -188,7 +200,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     )

     parse_edit_dataset = datasets_subparser.add_parser(
-        "edit",
+        "edit",
+        parents=[parent_parser],
+        description="Edit dataset metadata.",
+        formatter_class=CustomHelpFormatter,
     )
     parse_edit_dataset.add_argument("name", type=str, help="Dataset name")
     parse_edit_dataset.add_argument(
@@ -234,7 +249,19 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     )

     datasets_ls_parser = datasets_subparser.add_parser(
-        "ls",
+        "ls",
+        parents=[parent_parser],
+        description="List datasets.",
+        formatter_class=CustomHelpFormatter,
+    )
+    datasets_ls_parser.add_argument(
+        "name", action="store", help="Name of the dataset to list", nargs="?"
+    )
+    datasets_ls_parser.add_argument(
+        "--versions",
+        action="store_true",
+        default=False,
+        help="List all the versions of each dataset",
     )
     datasets_ls_parser.add_argument(
         "--studio",
@@ -264,7 +291,11 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     )

     rm_dataset_parser = datasets_subparser.add_parser(
-        "rm",
+        "rm",
+        parents=[parent_parser],
+        description="Remove dataset.",
+        aliases=["remove"],
+        formatter_class=CustomHelpFormatter,
     )
     rm_dataset_parser.add_argument("name", type=str, help="Dataset name")
     rm_dataset_parser.add_argument(
@@ -308,7 +339,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     )

     parse_ls = subp.add_parser(
-        "ls",
+        "ls",
+        parents=[parent_parser],
+        description="List storage contents.",
+        formatter_class=CustomHelpFormatter,
     )
     add_anon_arg(parse_ls)
     add_update_arg(parse_ls)
@@ -348,7 +382,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     )

     parse_du = subp.add_parser(
-        "du",
+        "du",
+        parents=[parent_parser],
+        description="Display space usage.",
+        formatter_class=CustomHelpFormatter,
     )
     add_sources_arg(parse_du)
     add_anon_arg(parse_du)
@@ -380,7 +417,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     )

     parse_find = subp.add_parser(
-        "find",
+        "find",
+        parents=[parent_parser],
+        description="Search in a directory hierarchy.",
+        formatter_class=CustomHelpFormatter,
     )
     add_anon_arg(parse_find)
     add_update_arg(parse_find)
@@ -435,7 +475,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     )

     parse_index = subp.add_parser(
-        "index",
+        "index",
+        parents=[parent_parser],
+        description="Index storage location.",
+        formatter_class=CustomHelpFormatter,
     )
     add_anon_arg(parse_index)
     add_update_arg(parse_index)
@@ -445,6 +488,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "show",
         parents=[parent_parser],
         description="Create a new dataset with a query script.",
+        formatter_class=CustomHelpFormatter,
     )
     show_parser.add_argument("name", type=str, help="Dataset name")
     show_parser.add_argument(
@@ -461,6 +505,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "query",
         parents=[parent_parser],
         description="Create a new dataset with a query script.",
+        formatter_class=CustomHelpFormatter,
     )
     add_anon_arg(query_parser)
     query_parser.add_argument(
@@ -491,11 +536,15 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         "clear-cache",
         parents=[parent_parser],
         description="Clear the local file cache.",
+        formatter_class=CustomHelpFormatter,
     )
     add_anon_arg(parse_clear_cache)

     parse_gc = subp.add_parser(
-        "gc",
+        "gc",
+        parents=[parent_parser],
+        description="Garbage collect temporary tables.",
+        formatter_class=CustomHelpFormatter,
     )
     add_anon_arg(parse_gc)

@@ -510,6 +559,7 @@ def add_completion_parser(subparsers, parents):
         "completion",
         parents=parents,
         description="Output shell completion script.",
+        formatter_class=CustomHelpFormatter,
     )
     parser.add_argument(
         "-s",
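The datasets "ls" subcommand gains an optional dataset name and a --versions flag. A rough sketch of how the new arguments might surface after parsing (the invocation below is illustrative; the dest names are taken from this diff):

    from datachain.cli.parser import get_parser

    args = get_parser().parse_args(["datasets", "ls", "cats", "--versions"])
    # Expected: args.name == "cats" and args.versions is True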
datachain/cli/parser/job.py
CHANGED
@@ -1,8 +1,15 @@
+from datachain.cli.parser.utils import CustomHelpFormatter
+
+
 def add_jobs_parser(subparsers, parent_parser) -> None:
     jobs_help = "Manage jobs in Studio"
     jobs_description = "Commands to manage job execution in Studio."
     jobs_parser = subparsers.add_parser(
-        "job",
+        "job",
+        parents=[parent_parser],
+        description=jobs_description,
+        help=jobs_help,
+        formatter_class=CustomHelpFormatter,
     )
     jobs_subparser = jobs_parser.add_subparsers(
         dest="cmd",
@@ -17,10 +24,11 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         parents=[parent_parser],
         description=studio_run_description,
         help=studio_run_help,
+        formatter_class=CustomHelpFormatter,
     )

     studio_run_parser.add_argument(
-        "
+        "file",
         action="store",
         help="Query file to run",
     )
@@ -78,10 +86,11 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         parents=[parent_parser],
         description=studio_cancel_description,
         help=studio_cancel_help,
+        formatter_class=CustomHelpFormatter,
     )

     studio_cancel_parser.add_argument(
-        "
+        "id",
         action="store",
         help="Job ID to cancel",
     )
@@ -100,10 +109,11 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         parents=[parent_parser],
         description=studio_log_description,
         help=studio_log_help,
+        formatter_class=CustomHelpFormatter,
     )

     studio_log_parser.add_argument(
-        "
+        "id",
         action="store",
         help="Job ID to show logs for",
     )
datachain/cli/parser/studio.py
CHANGED
@@ -1,3 +1,6 @@
+from datachain.cli.parser.utils import CustomHelpFormatter
+
+
 def add_auth_parser(subparsers, parent_parser) -> None:
     from dvc_studio_client.auth import AVAILABLE_SCOPES

@@ -9,6 +12,7 @@ def add_auth_parser(subparsers, parent_parser) -> None:
         parents=[parent_parser],
         description=auth_description,
         help=auth_help,
+        formatter_class=CustomHelpFormatter,
     )
     auth_subparser = auth_parser.add_subparsers(
         dest="cmd",
@@ -27,6 +31,7 @@ def add_auth_parser(subparsers, parent_parser) -> None:
         parents=[parent_parser],
         description=auth_login_description,
         help=auth_login_help,
+        formatter_class=CustomHelpFormatter,
     )

     login_parser.add_argument(
@@ -69,6 +74,7 @@ def add_auth_parser(subparsers, parent_parser) -> None:
         parents=[parent_parser],
         description=auth_logout_description,
         help=auth_logout_help,
+        formatter_class=CustomHelpFormatter,
     )

     auth_team_help = "Set default team for Studio operations"
@@ -79,6 +85,7 @@ def add_auth_parser(subparsers, parent_parser) -> None:
         parents=[parent_parser],
         description=auth_team_description,
         help=auth_team_help,
+        formatter_class=CustomHelpFormatter,
     )
     team_parser.add_argument(
         "team_name",
@@ -100,4 +107,5 @@ def add_auth_parser(subparsers, parent_parser) -> None:
         parents=[parent_parser],
         description=auth_token_description,
         help=auth_token_help,
+        formatter_class=CustomHelpFormatter,
     )
datachain/cli/parser/utils.py
CHANGED
@@ -1,4 +1,4 @@
-from argparse import Action, ArgumentParser, ArgumentTypeError
+from argparse import Action, ArgumentParser, ArgumentTypeError, HelpFormatter
 from typing import Union

 from datachain.cli.utils import CommaSeparatedArgs
@@ -6,6 +6,25 @@ from datachain.cli.utils import CommaSeparatedArgs
 FIND_COLUMNS = ["du", "name", "path", "size", "type"]


+class CustomHelpFormatter(HelpFormatter):
+    def add_arguments(self, actions):
+        # Sort arguments to move --help and --version to the end
+        normal_actions = [
+            a for a in actions if a.dest not in ("help", "verbose", "quiet")
+        ]
+        special_actions = [a for a in actions if a.dest in ("help", "verbose", "quiet")]
+        super().add_arguments(normal_actions + special_actions)
+
+
+class CustomArgumentParser(ArgumentParser):
+    def error(self, message):
+        internal_commands = ["internal-run-udf", "internal-run-udf-worker"]
+
+        hidden_portion = "".join(f"'{cmd}', " for cmd in internal_commands)
+        message = message.replace(hidden_portion, "")
+        super().error(message)
+
+
 def find_columns_type(
     columns_str: str,
     default_colums_str: str = "path",
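Both helpers plug into the standard argparse machinery: CustomHelpFormatter reorders help output so that the help/verbose/quiet actions are listed last, and CustomArgumentParser strips the hidden internal commands from error messages. A minimal sketch of wiring them together (the parser and arguments below are illustrative only):

    from datachain.cli.parser.utils import CustomArgumentParser, CustomHelpFormatter

    parser = CustomArgumentParser(prog="example", formatter_class=CustomHelpFormatter)
    parser.add_argument("path", help="Input path")
    parser.add_argument("--output", help="Output path")
    parser.add_argument("-v", "--verbose", action="count", default=0, help="Be verbose")
    parser.print_help()  # --output is listed before -h/--help and -v/--verbose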
datachain/dataset.py
CHANGED
@@ -181,7 +181,7 @@ class DatasetVersion:

     @classmethod
     def parse(  # noqa: PLR0913
-        cls
+        cls,
         id: int,
         uuid: str,
         dataset_id: int,
@@ -288,7 +288,7 @@ class DatasetListVersion:

     @classmethod
     def parse(
-        cls
+        cls,
         id: int,
         uuid: str,
         dataset_id: int,
@@ -352,7 +352,7 @@ class DatasetRecord:

     @classmethod
     def parse(  # noqa: PLR0913
-        cls
+        cls,
         id: int,
         name: str,
         description: Optional[str],
@@ -567,7 +567,7 @@ class DatasetListRecord:

     @classmethod
     def parse(  # noqa: PLR0913
-        cls
+        cls,
         id: int,
         name: str,
         description: Optional[str],
@@ -628,6 +628,9 @@ class DatasetListRecord:
         self.versions.sort(key=lambda v: v.version)
         return self

+    def latest_version(self) -> DatasetListVersion:
+        return max(self.versions, key=lambda v: v.version)
+
     @property
     def is_bucket_listing(self) -> bool:
         """