PyPI - datachain - Versions diffs - 0.7.2__tar.gz → 0.7.3__tar.gz - Mend

datachain 0.7.2__tar.gz → 0.7.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (280)

{datachain-0.7.2/src/datachain.egg-info → datachain-0.7.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.7.2
+Version: 0.7.3
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -139,7 +139,7 @@ Key Features
 ============
 📂 **Multimodal Dataset Versioning.**
-   - Version unstructured data without redundant data copies, by supporitng
+   - Version unstructured data without redundant data copies, by supporting
      references to S3, GCP, Azure, and local file systems.
    - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
    - Unite files and metadata together into persistent, versioned, columnar datasets.

{datachain-0.7.2 → datachain-0.7.3}/README.rst RENAMED Viewed

@@ -37,7 +37,7 @@ Key Features
 ============
 📂 **Multimodal Dataset Versioning.**
-   - Version unstructured data without redundant data copies, by supporitng
+   - Version unstructured data without redundant data copies, by supporting
      references to S3, GCP, Azure, and local file systems.
    - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
    - Unite files and metadata together into persistent, versioned, columnar datasets.

{datachain-0.7.2 → datachain-0.7.3}/src/datachain/catalog/catalog.py RENAMED Viewed

@@ -38,6 +38,7 @@ from datachain.dataset import (
     DATASET_PREFIX,
     QUERY_DATASET_PREFIX,
     DatasetDependency,
+    DatasetListRecord,
     DatasetRecord,
     DatasetStats,
     DatasetStatus,
@@ -72,7 +73,7 @@ if TYPE_CHECKING:
         AbstractMetastore,
         AbstractWarehouse,
     )
-    from datachain.dataset import DatasetVersion
+    from datachain.dataset import DatasetListVersion
     from datachain.job import Job
     from datachain.lib.file import File
     from datachain.listing import Listing
@@ -1135,7 +1136,7 @@ class Catalog:
         return direct_dependencies
-    def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetRecord]:
+    def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetListRecord]:
         datasets = self.metastore.list_datasets()
         for d in datasets:
             if not d.is_bucket_listing or include_listing:
@@ -1144,7 +1145,7 @@ class Catalog:
     def list_datasets_versions(
         self,
         include_listing: bool = False,
-    ) -> Iterator[tuple[DatasetRecord, "DatasetVersion", Optional["Job"]]]:
+    ) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
         """Iterate over all dataset versions with related jobs."""
         datasets = list(self.ls_datasets(include_listing=include_listing))

{datachain-0.7.2 → datachain-0.7.3}/src/datachain/cli.py RENAMED Viewed

@@ -18,7 +18,12 @@ from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyVa
 from datachain.config import Config
 from datachain.error import DataChainError
 from datachain.lib.dc import DataChain
-from datachain.studio import list_datasets, process_studio_cli_args
+from datachain.studio import (
+    edit_studio_dataset,
+    list_datasets,
+    process_studio_cli_args,
+    remove_studio_dataset,
+)
 from datachain.telemetry import telemetry
 if TYPE_CHECKING:
@@ -403,21 +408,44 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     parse_edit_dataset.add_argument(
         "--new-name",
         action="store",
-        default="",
         help="Dataset new name",
     )
     parse_edit_dataset.add_argument(
         "--description",
         action="store",
-        default="",
         help="Dataset description",
     )
     parse_edit_dataset.add_argument(
         "--labels",
-        default=[],
         nargs="+",
         help="Dataset labels",
     )
+    parse_edit_dataset.add_argument(
+        "--studio",
+        action="store_true",
+        default=False,
+        help="Edit dataset from Studio",
+    )
+    parse_edit_dataset.add_argument(
+        "-L",
+        "--local",
+        action="store_true",
+        default=False,
+        help="Edit local dataset only",
+    )
+    parse_edit_dataset.add_argument(
+        "-a",
+        "--all",
+        action="store_true",
+        default=True,
+        help="Edit both datasets from studio and local",
+    )
+    parse_edit_dataset.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="The team to edit a dataset. By default, it will use team from config.",
+    )
     datasets_parser = subp.add_parser(
         "datasets", parents=[parent_parser], description="List datasets"
@@ -466,6 +494,32 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         action=BooleanOptionalAction,
         help="Force delete registered dataset with all of it's versions",
     )
+    rm_dataset_parser.add_argument(
+        "--studio",
+        action="store_true",
+        default=False,
+        help="Remove dataset from Studio",
+    )
+    rm_dataset_parser.add_argument(
+        "-L",
+        "--local",
+        action="store_true",
+        default=False,
+        help="Remove local datasets only",
+    )
+    rm_dataset_parser.add_argument(
+        "-a",
+        "--all",
+        action="store_true",
+        default=True,
+        help="Remove both local and studio",
+    )
+    rm_dataset_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="The team to delete a dataset. By default, it will use team from config.",
+    )
     dataset_stats_parser = subp.add_parser(
         "dataset-stats",
@@ -909,8 +963,40 @@ def rm_dataset(
     name: str,
     version: Optional[int] = None,
     force: Optional[bool] = False,
+    studio: bool = False,
+    local: bool = False,
+    all: bool = True,
+    team: Optional[str] = None,
+):
+    token = Config().read().get("studio", {}).get("token")
+    all, local, studio = _determine_flavors(studio, local, all, token)
+    if all or local:
+        catalog.remove_dataset(name, version=version, force=force)
+    if (all or studio) and token:
+        remove_studio_dataset(team, name, version, force)
+def edit_dataset(
+    catalog: "Catalog",
+    name: str,
+    new_name: Optional[str] = None,
+    description: Optional[str] = None,
+    labels: Optional[list[str]] = None,
+    studio: bool = False,
+    local: bool = False,
+    all: bool = True,
+    team: Optional[str] = None,
 ):
-    catalog.remove_dataset(name, version=version, force=force)
+    token = Config().read().get("studio", {}).get("token")
+    all, local, studio = _determine_flavors(studio, local, all, token)
+    if all or local:
+        catalog.edit_dataset(name, new_name, description, labels)
+    if (all or studio) and token:
+        edit_studio_dataset(team, name, new_name, description, labels)
 def dataset_stats(
@@ -1127,11 +1213,16 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
                 edatachain_file=args.edatachain_file,
             )
         elif args.command == "edit-dataset":
-            catalog.edit_dataset(
+            edit_dataset(
+                catalog,
                 args.name,
-                description=args.description,
                 new_name=args.new_name,
+                description=args.description,
                 labels=args.labels,
+                studio=args.studio,
+                local=args.local,
+                all=args.all,
+                team=args.team,
             )
         elif args.command == "ls":
             ls(
@@ -1164,7 +1255,16 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
                 schema=args.schema,
             )
         elif args.command == "rm-dataset":
-            rm_dataset(catalog, args.name, version=args.version, force=args.force)
+            rm_dataset(
+                catalog,
+                args.name,
+                version=args.version,
+                force=args.force,
+                studio=args.studio,
+                local=args.local,
+                all=args.all,
+                team=args.team,
+            )
         elif args.command == "dataset-stats":
             dataset_stats(
                 catalog,

{datachain-0.7.2 → datachain-0.7.3}/src/datachain/data_storage/metastore.py RENAMED Viewed

@@ -27,6 +27,8 @@ from datachain.data_storage import JobQueryType, JobStatus
 from datachain.data_storage.serializer import Serializable
 from datachain.dataset import (
     DatasetDependency,
+    DatasetListRecord,
+    DatasetListVersion,
     DatasetRecord,
     DatasetStatus,
     DatasetVersion,
@@ -59,6 +61,8 @@ class AbstractMetastore(ABC, Serializable):
     schema: "schema.Schema"
     dataset_class: type[DatasetRecord] = DatasetRecord
+    dataset_list_class: type[DatasetListRecord] = DatasetListRecord
+    dataset_list_version_class: type[DatasetListVersion] = DatasetListVersion
     dependency_class: type[DatasetDependency] = DatasetDependency
     job_class: type[Job] = Job
@@ -166,11 +170,11 @@ class AbstractMetastore(ABC, Serializable):
         """
     @abstractmethod
-    def list_datasets(self) -> Iterator[DatasetRecord]:
+    def list_datasets(self) -> Iterator[DatasetListRecord]:
         """Lists all datasets."""
     @abstractmethod
-    def list_datasets_by_prefix(self, prefix: str) -> Iterator["DatasetRecord"]:
+    def list_datasets_by_prefix(self, prefix: str) -> Iterator["DatasetListRecord"]:
         """Lists all datasets which names start with prefix."""
     @abstractmethod
@@ -348,6 +352,14 @@ class AbstractDBMetastore(AbstractMetastore):
             if c.name  # type: ignore [attr-defined]
         ]
+    @cached_property
+    def _dataset_list_fields(self) -> list[str]:
+        return [
+            c.name  # type: ignore [attr-defined]
+            for c in self._datasets_columns()
+            if c.name in self.dataset_list_class.__dataclass_fields__  # type: ignore [attr-defined]
+        ]
     @classmethod
     def _datasets_versions_columns(cls) -> list["SchemaItem"]:
         """Datasets versions table columns."""
@@ -390,6 +402,15 @@ class AbstractDBMetastore(AbstractMetastore):
             if c.name  # type: ignore [attr-defined]
         ]
+    @cached_property
+    def _dataset_list_version_fields(self) -> list[str]:
+        return [
+            c.name  # type: ignore [attr-defined]
+            for c in self._datasets_versions_columns()
+            if c.name  # type: ignore [attr-defined]
+            in self.dataset_list_version_class.__dataclass_fields__
+        ]
     @classmethod
     def _datasets_dependencies_columns(cls) -> list["SchemaItem"]:
         """Datasets dependencies table columns."""
@@ -671,7 +692,25 @@ class AbstractDBMetastore(AbstractMetastore):
             if dataset:
                 yield dataset
-    def _base_dataset_query(self):
+    def _parse_list_dataset(self, rows) -> Optional[DatasetListRecord]:
+        versions = [self.dataset_list_class.parse(*r) for r in rows]
+        if not versions:
+            return None
+        return reduce(lambda ds, version: ds.merge_versions(version), versions)
+    def _parse_dataset_list(self, rows) -> Iterator["DatasetListRecord"]:
+        # grouping rows by dataset id
+        for _, g in groupby(rows, lambda r: r[0]):
+            dataset = self._parse_list_dataset(list(g))
+            if dataset:
+                yield dataset
+    def _get_dataset_query(
+        self,
+        dataset_fields: list[str],
+        dataset_version_fields: list[str],
+        isouter: bool = True,
+    ):
         if not (
             self.db.has_table(self._datasets.name)
             and self.db.has_table(self._datasets_versions.name)
@@ -680,23 +719,36 @@ class AbstractDBMetastore(AbstractMetastore):
         d = self._datasets
         dv = self._datasets_versions
         query = self._datasets_select(
-            *(getattr(d.c, f) for f in self._dataset_fields),
-            *(getattr(dv.c, f) for f in self._dataset_version_fields),
+            *(getattr(d.c, f) for f in dataset_fields),
+            *(getattr(dv.c, f) for f in dataset_version_fields),
         )
-        j = d.join(dv, d.c.id == dv.c.dataset_id, isouter=True)
+        j = d.join(dv, d.c.id == dv.c.dataset_id, isouter=isouter)
         return query.select_from(j)
-    def list_datasets(self) -> Iterator["DatasetRecord"]:
+    def _base_dataset_query(self):
+        return self._get_dataset_query(
+            self._dataset_fields, self._dataset_version_fields
+        )
+    def _base_list_datasets_query(self):
+        return self._get_dataset_query(
+            self._dataset_list_fields, self._dataset_list_version_fields, isouter=False
+        )
+    def list_datasets(self) -> Iterator["DatasetListRecord"]:
         """Lists all datasets."""
-        yield from self._parse_datasets(self.db.execute(self._base_dataset_query()))
+        yield from self._parse_dataset_list(
+            self.db.execute(self._base_list_datasets_query())
+        )
     def list_datasets_by_prefix(
         self, prefix: str, conn=None
-    ) -> Iterator["DatasetRecord"]:
-        query = self._base_dataset_query()
+    ) -> Iterator["DatasetListRecord"]:
+        query = self._base_list_datasets_query()
         query = query.where(self._datasets.c.name.startswith(prefix))
-        yield from self._parse_datasets(self.db.execute(query))
+        yield from self._parse_dataset_list(self.db.execute(query))
     def get_dataset(self, name: str, conn=None) -> DatasetRecord:
         """Gets a single dataset by name"""

{datachain-0.7.2 → datachain-0.7.3}/src/datachain/dataset.py RENAMED Viewed

@@ -15,7 +15,9 @@ from datachain.error import DatasetVersionNotFoundError
 from datachain.sql.types import NAME_TYPES_MAPPING, SQLType
 T = TypeVar("T", bound="DatasetRecord")
+LT = TypeVar("LT", bound="DatasetListRecord")
 V = TypeVar("V", bound="DatasetVersion")
+LV = TypeVar("LV", bound="DatasetListVersion")
 DD = TypeVar("DD", bound="DatasetDependency")
 DATASET_PREFIX = "ds://"
@@ -264,6 +266,59 @@ class DatasetVersion:
         return cls(**kwargs)
+@dataclass
+class DatasetListVersion:
+    id: int
+    uuid: str
+    dataset_id: int
+    version: int
+    status: int
+    created_at: datetime
+    finished_at: Optional[datetime]
+    error_message: str
+    error_stack: str
+    num_objects: Optional[int]
+    size: Optional[int]
+    query_script: str = ""
+    job_id: Optional[str] = None
+    @classmethod
+    def parse(
+        cls: type[LV],
+        id: int,
+        uuid: str,
+        dataset_id: int,
+        version: int,
+        status: int,
+        created_at: datetime,
+        finished_at: Optional[datetime],
+        error_message: str,
+        error_stack: str,
+        num_objects: Optional[int],
+        size: Optional[int],
+        query_script: str = "",
+        job_id: Optional[str] = None,
+    ):
+        return cls(
+            id,
+            uuid,
+            dataset_id,
+            version,
+            status,
+            created_at,
+            finished_at,
+            error_message,
+            error_stack,
+            num_objects,
+            size,
+            query_script,
+            job_id,
+        )
+    def __hash__(self):
+        return hash(f"{self.dataset_id}_{self.version}")
 @dataclass
 class DatasetRecord:
     id: int
@@ -447,20 +502,6 @@ class DatasetRecord:
         identifier = self.identifier(version)
         return f"{DATASET_PREFIX}{identifier}"
-    @property
-    def is_bucket_listing(self) -> bool:
-        """
-        For bucket listing we implicitly create underlying dataset to hold data. This
-        method is checking if this is one of those datasets.
-        """
-        from datachain.client import Client
-        # TODO refactor and maybe remove method in
-        # https://github.com/iterative/datachain/issues/318
-        return Client.is_data_source_uri(self.name) or self.name.startswith(
-            LISTING_PREFIX
-        )
     @property
     def versions_values(self) -> list[int]:
         """
@@ -499,5 +540,92 @@ class DatasetRecord:
         return cls(**kwargs, versions=versions)
+@dataclass
+class DatasetListRecord:
+    id: int
+    name: str
+    description: Optional[str]
+    labels: list[str]
+    versions: list[DatasetListVersion]
+    created_at: Optional[datetime] = None
+    @classmethod
+    def parse(  # noqa: PLR0913
+        cls: type[LT],
+        id: int,
+        name: str,
+        description: Optional[str],
+        labels: str,
+        created_at: datetime,
+        version_id: int,
+        version_uuid: str,
+        version_dataset_id: int,
+        version: int,
+        version_status: int,
+        version_created_at: datetime,
+        version_finished_at: Optional[datetime],
+        version_error_message: str,
+        version_error_stack: str,
+        version_num_objects: Optional[int],
+        version_size: Optional[int],
+        version_query_script: Optional[str],
+        version_job_id: Optional[str] = None,
+    ) -> "DatasetListRecord":
+        labels_lst: list[str] = json.loads(labels) if labels else []
+        dataset_version = DatasetListVersion.parse(
+            version_id,
+            version_uuid,
+            version_dataset_id,
+            version,
+            version_status,
+            version_created_at,
+            version_finished_at,
+            version_error_message,
+            version_error_stack,
+            version_num_objects,
+            version_size,
+            version_query_script,  # type: ignore[arg-type]
+            version_job_id,
+        )
+        return cls(
+            id,
+            name,
+            description,
+            labels_lst,
+            [dataset_version],
+            created_at,
+        )
+    def merge_versions(self, other: "DatasetListRecord") -> "DatasetListRecord":
+        """Merge versions from another dataset"""
+        if other.id != self.id:
+            raise RuntimeError("Cannot merge versions of datasets with different ids")
+        if not other.versions:
+            # nothing to merge
+            return self
+        if not self.versions:
+            self.versions = []
+        self.versions = list(set(self.versions + other.versions))
+        self.versions.sort(key=lambda v: v.version)
+        return self
+    @property
+    def is_bucket_listing(self) -> bool:
+        """
+        For bucket listing we implicitly create underlying dataset to hold data. This
+        method is checking if this is one of those datasets.
+        """
+        from datachain.client import Client
+        # TODO refactor and maybe remove method in
+        # https://github.com/iterative/datachain/issues/318
+        return Client.is_data_source_uri(self.name) or self.name.startswith(
+            LISTING_PREFIX
+        )
 class RowDict(dict):
     pass

{datachain-0.7.2 → datachain-0.7.3}/src/datachain/lib/dataset_info.py RENAMED Viewed

@@ -5,7 +5,11 @@ from uuid import uuid4
 from pydantic import Field, field_validator
-from datachain.dataset import DatasetRecord, DatasetStatus, DatasetVersion
+from datachain.dataset import (
+    DatasetListRecord,
+    DatasetListVersion,
+    DatasetStatus,
+)
 from datachain.job import Job
 from datachain.lib.data_model import DataModel
 from datachain.utils import TIME_ZERO
@@ -57,8 +61,8 @@ class DatasetInfo(DataModel):
     @classmethod
     def from_models(
         cls,
-        dataset: DatasetRecord,
-        version: DatasetVersion,
+        dataset: DatasetListRecord,
+        version: DatasetListVersion,
         job: Optional[Job],
     ) -> "Self":
         return cls(

{datachain-0.7.2 → datachain-0.7.3}/src/datachain/remote/studio.py RENAMED Viewed

@@ -178,17 +178,9 @@ class StudioClient:
             data = {}
         if not ok:
-            logger.error(
-                "Got bad response from Studio, content is %s",
-                response.content.decode("utf-8"),
-            )
             if response.status_code == 403:
                 message = f"Not authorized for the team {self.team}"
             else:
-                logger.error(
-                    "Got bad response from Studio, content is %s",
-                    response.content.decode("utf-8"),
-                )
                 message = data.get("message", "")
         else:
             message = ""
@@ -230,6 +222,46 @@ class StudioClient:
     def ls_datasets(self) -> Response[LsData]:
         return self._send_request("datachain/ls-datasets", {})
+    def edit_dataset(
+        self,
+        name: str,
+        new_name: Optional[str] = None,
+        description: Optional[str] = None,
+        labels: Optional[list[str]] = None,
+    ) -> Response[DatasetInfoData]:
+        body = {
+            "dataset_name": name,
+        }
+        if new_name is not None:
+            body["new_name"] = new_name
+        if description is not None:
+            body["description"] = description
+        if labels is not None:
+            body["labels"] = labels  # type: ignore[assignment]
+        return self._send_request(
+            "datachain/edit-dataset",
+            body,
+        )
+    def rm_dataset(
+        self,
+        name: str,
+        version: Optional[int] = None,
+        force: Optional[bool] = False,
+    ) -> Response[DatasetInfoData]:
+        return self._send_request(
+            "datachain/rm-dataset",
+            {
+                "dataset_name": name,
+                "version": version,
+                "force": force,
+            },
+        )
     def dataset_info(self, name: str) -> Response[DatasetInfoData]:
         def _parse_dataset_info(dataset_info):
             _parse_dates(dataset_info, ["created_at", "finished_at"])

{datachain-0.7.2 → datachain-0.7.3}/src/datachain/studio.py RENAMED Viewed

@@ -130,6 +130,35 @@ def list_datasets(team: Optional[str] = None):
             yield (name, version)
+def edit_studio_dataset(
+    team_name: Optional[str],
+    name: str,
+    new_name: Optional[str] = None,
+    description: Optional[str] = None,
+    labels: Optional[list[str]] = None,
+):
+    client = StudioClient(team=team_name)
+    response = client.edit_dataset(name, new_name, description, labels)
+    if not response.ok:
+        raise_remote_error(response.message)
+    print(f"Dataset {name} updated")
+def remove_studio_dataset(
+    team_name: Optional[str],
+    name: str,
+    version: Optional[int] = None,
+    force: Optional[bool] = False,
+):
+    client = StudioClient(team=team_name)
+    response = client.rm_dataset(name, version, force)
+    if not response.ok:
+        raise_remote_error(response.message)
+    print(f"Dataset {name} removed")
 def save_config(hostname, token):
     config = Config(ConfigLevel.GLOBAL)
     with config.edit() as conf:

{datachain-0.7.2 → datachain-0.7.3/src/datachain.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.7.2
+Version: 0.7.3
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -139,7 +139,7 @@ Key Features
 ============
 📂 **Multimodal Dataset Versioning.**
-   - Version unstructured data without redundant data copies, by supporitng
+   - Version unstructured data without redundant data copies, by supporting
      references to S3, GCP, Azure, and local file systems.
    - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
    - Unite files and metadata together into persistent, versioned, columnar datasets.

datachain 0.7.2__tar.gz → 0.7.3__tar.gz

Potentially problematic release.

datachain 0.7.2__tar.gz → 0.7.3__tar.gz