datachain 0.6.7__py3-none-any.whl → 0.6.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain was flagged as possibly problematic by the registry.
- datachain/__init__.py +2 -1
- datachain/catalog/catalog.py +5 -0
- datachain/cli.py +137 -23
- datachain/client/fsspec.py +1 -1
- datachain/data_storage/metastore.py +4 -0
- datachain/dataset.py +5 -0
- datachain/lib/dataset_info.py +3 -0
- datachain/lib/dc.py +26 -6
- datachain/lib/file.py +0 -3
- datachain/lib/meta_formats.py +1 -0
- datachain/lib/models/__init__.py +5 -0
- datachain/lib/models/bbox.py +45 -0
- datachain/lib/models/pose.py +37 -0
- datachain/lib/models/yolo.py +39 -0
- datachain/lib/signal_schema.py +1 -1
- datachain/remote/studio.py +12 -2
- datachain/studio.py +18 -6
- {datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/METADATA +43 -21
- {datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/RECORD +23 -19
- {datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/WHEEL +1 -1
- {datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/LICENSE +0 -0
- {datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/entry_points.txt +0 -0
- {datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/top_level.txt +0 -0

datachain/__init__.py CHANGED

@@ -1,4 +1,4 @@
-from datachain.lib import func
+from datachain.lib import func, models
 from datachain.lib.data_model import DataModel, DataType, is_chain_type
 from datachain.lib.dc import C, Column, DataChain, Sys
 from datachain.lib.file import (
@@ -38,5 +38,6 @@ __all__ = [
     "func",
     "is_chain_type",
     "metrics",
+    "models",
     "param",
 ]
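
The new `models` namespace is exported at the package root. A minimal import sketch (hedged: the re-exports of the new `datachain/lib/models/__init__.py` are not captured in this diff, so the submodule paths from the added files below are used directly):

    from datachain.lib.models.bbox import BBox
    from datachain.lib.models.pose import Pose, Pose3D

    # BBox stores two corners; width and height are derived.
    box = BBox(title="person", x1=0.0, y1=0.0, x2=64.0, y2=128.0)
    print(box.x2 - box.x1, box.y2 - box.y1)  # 64.0 128.0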

datachain/catalog/catalog.py CHANGED

@@ -769,6 +769,7 @@ class Catalog:
         create_rows: Optional[bool] = True,
         validate_version: Optional[bool] = True,
         listing: Optional[bool] = False,
+        uuid: Optional[str] = None,
     ) -> "DatasetRecord":
         """
         Creates new dataset of a specific version.
@@ -816,6 +817,7 @@ class Catalog:
             query_script=query_script,
             create_rows_table=create_rows,
             columns=columns,
+            uuid=uuid,
         )

     def create_new_dataset_version(
@@ -832,6 +834,7 @@ class Catalog:
         script_output="",
         create_rows_table=True,
         job_id: Optional[str] = None,
+        uuid: Optional[str] = None,
     ) -> DatasetRecord:
         """
         Creates dataset version if it doesn't exist.
@@ -855,6 +858,7 @@ class Catalog:
             schema=schema,
             job_id=job_id,
             ignore_if_exists=True,
+            uuid=uuid,
         )

         if create_rows_table:
@@ -1400,6 +1404,7 @@ class Catalog:
             columns=columns,
             feature_schema=remote_dataset_version.feature_schema,
             validate_version=False,
+            uuid=remote_dataset_version.uuid,
         )

         # asking remote to export dataset rows table to s3 and to return signed

datachain/cli.py CHANGED

@@ -4,18 +4,21 @@ import shlex
 import sys
 import traceback
 from argparse import Action, ArgumentParser, ArgumentTypeError, Namespace
-from collections.abc import Iterable, Iterator,
+from collections.abc import Iterable, Iterator, Sequence
 from importlib.metadata import PackageNotFoundError, version
 from itertools import chain
 from multiprocessing import freeze_support
 from typing import TYPE_CHECKING, Optional, Union

 import shtab
+from tabulate import tabulate

 from datachain import Session, utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
+from datachain.config import Config
+from datachain.error import DataChainError
 from datachain.lib.dc import DataChain
-from datachain.studio import process_studio_cli_args
+from datachain.studio import list_datasets, process_studio_cli_args
 from datachain.telemetry import telemetry

 if TYPE_CHECKING:
@@ -416,7 +419,36 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Dataset labels",
     )

-    subp.add_parser(
+    datasets_parser = subp.add_parser(
+        "datasets", parents=[parent_parser], description="List datasets"
+    )
+    datasets_parser.add_argument(
+        "--studio",
+        action="store_true",
+        default=False,
+        help="List the files in the Studio",
+    )
+    datasets_parser.add_argument(
+        "-L",
+        "--local",
+        action="store_true",
+        default=False,
+        help="List local files only",
+    )
+    datasets_parser.add_argument(
+        "-a",
+        "--all",
+        action="store_true",
+        default=True,
+        help="List all files including hidden files",
+    )
+    datasets_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="The team to list datasets for. By default, it will use team from config.",
+    )
+
     rm_dataset_parser = subp.add_parser(
         "rm-dataset", parents=[parent_parser], description="Removes dataset"
     )
@@ -474,10 +506,30 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="List files in the long format",
     )
     parse_ls.add_argument(
-        "--
+        "--studio",
+        action="store_true",
+        default=False,
+        help="List the files in the Studio",
+    )
+    parse_ls.add_argument(
+        "-L",
+        "--local",
+        action="store_true",
+        default=False,
+        help="List local files only",
+    )
+    parse_ls.add_argument(
+        "-a",
+        "--all",
+        action="store_true",
+        default=True,
+        help="List all files including hidden files",
+    )
+    parse_ls.add_argument(
+        "--team",
         action="store",
-        default=
-        help="
+        default=None,
+        help="The team to list datasets for. By default, it will use team from config.",
     )

     parse_du = subp.add_parser(
@@ -758,11 +810,12 @@ def format_ls_entry(entry: str) -> str:
 def ls_remote(
     paths: Iterable[str],
     long: bool = False,
+    team: Optional[str] = None,
 ):
     from datachain.node import long_line_str
     from datachain.remote.studio import StudioClient

-    client = StudioClient()
+    client = StudioClient(team=team)
     first = True
     for path, response in client.ls(paths):
         if not first:
@@ -789,28 +842,66 @@ def ls_remote(
 def ls(
     sources,
     long: bool = False,
-
-
+    studio: bool = False,
+    local: bool = False,
+    all: bool = True,
+    team: Optional[str] = None,
     **kwargs,
 ):
-
-
+    token = Config().read().get("studio", {}).get("token")
+    all, local, studio = _determine_flavors(studio, local, all, token)

-
-    remote_type = config["type"]
-    if remote_type == "local":
+    if all or local:
         ls_local(sources, long=long, **kwargs)
-
-
-
-
+
+    if (all or studio) and token:
+        ls_remote(sources, long=long, team=team)
+
+
+def datasets(
+    catalog: "Catalog",
+    studio: bool = False,
+    local: bool = False,
+    all: bool = True,
+    team: Optional[str] = None,
+):
+    token = Config().read().get("studio", {}).get("token")
+    all, local, studio = _determine_flavors(studio, local, all, token)
+
+    local_datasets = set(list_datasets_local(catalog)) if all or local else set()
+    studio_datasets = (
+        set(list_datasets(team=team)) if (all or studio) and token else set()
+    )
+
+    rows = [
+        _datasets_tabulate_row(
+            name=name,
+            version=version,
+            both=(all or (local and studio)) and token,
+            local=(name, version) in local_datasets,
+            studio=(name, version) in studio_datasets,
         )
+        for name, version in local_datasets.union(studio_datasets)
+    ]
+
+    print(tabulate(rows, headers="keys"))


-def
+def list_datasets_local(catalog: "Catalog"):
     for d in catalog.ls_datasets():
         for v in d.versions:
-
+            yield (d.name, v.version)
+
+
+def _datasets_tabulate_row(name, version, both, local, studio):
+    row = {
+        "Name": name,
+        "Version": version,
+    }
+    if both:
+        row["Studio"] = "\u2714" if studio else "\u2716"
+        row["Local"] = "\u2714" if local else "\u2716"
+    return row


 def rm_dataset(
@@ -953,6 +1044,20 @@ def completion(shell: str) -> str:
     )


+def _determine_flavors(studio: bool, local: bool, all: bool, token: Optional[str]):
+    if studio and not token:
+        raise DataChainError(
+            "Not logged in to Studio. Log in with 'datachain studio login'."
+        )
+
+    if local or studio:
+        all = False
+
+    all = all and not (local or studio)
+
+    return all, local, studio
+
+
 def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR0915
     # Required for Windows multiprocessing support
     freeze_support()
@@ -1032,12 +1137,21 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
         ls(
             args.sources,
             long=bool(args.long),
-
+            studio=args.studio,
+            local=args.local,
+            all=args.all,
+            team=args.team,
             update=bool(args.update),
             client_config=client_config,
         )
-    elif args.command == "
-
+    elif args.command == "datasets":
+        datasets(
+            catalog=catalog,
+            studio=args.studio,
+            local=args.local,
+            all=args.all,
+            team=args.team,
+        )
     elif args.command == "show":
         show(
             catalog,
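
The flag resolution in `_determine_flavors` is easiest to see as a truth table. A standalone sketch of the same logic (not the package's code; `DataChainError` is replaced with a plain exception, and `all_` avoids shadowing the builtin):

    def determine_flavors(studio, local, all_, token):
        # Mirrors _determine_flavors above: --studio requires a token,
        # and an explicit --local or --studio turns off the "all" default.
        if studio and not token:
            raise RuntimeError("Not logged in to Studio.")
        if local or studio:
            all_ = False
        return all_, local, studio

    print(determine_flavors(False, False, True, token=None))  # (True, False, False)
    print(determine_flavors(False, True, True, token=None))   # (False, True, False)
    print(determine_flavors(True, False, True, token="tok"))  # (False, False, True)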

datachain/client/fsspec.py CHANGED

@@ -358,7 +358,7 @@ class Client(ABC):
     ) -> BinaryIO:
         """Open a file, including files in tar archives."""
         if use_cache and (cache_path := self.cache.get_path(file)):
-            return open(cache_path, mode="rb")
+            return open(cache_path, mode="rb")
         assert not file.location
         return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb)  # type: ignore[return-value]

datachain/data_storage/metastore.py CHANGED

@@ -138,6 +138,7 @@ class AbstractMetastore(ABC, Serializable):
         size: Optional[int] = None,
         preview: Optional[list[dict]] = None,
         job_id: Optional[str] = None,
+        uuid: Optional[str] = None,
     ) -> DatasetRecord:
         """Creates new dataset version."""

@@ -352,6 +353,7 @@ class AbstractDBMetastore(AbstractMetastore):
         """Datasets versions table columns."""
         return [
             Column("id", Integer, primary_key=True),
+            Column("uuid", Text, nullable=False, default=uuid4()),
             Column(
                 "dataset_id",
                 Integer,
@@ -545,6 +547,7 @@ class AbstractDBMetastore(AbstractMetastore):
         size: Optional[int] = None,
         preview: Optional[list[dict]] = None,
         job_id: Optional[str] = None,
+        uuid: Optional[str] = None,
         conn=None,
     ) -> DatasetRecord:
         """Creates new dataset version."""
@@ -555,6 +558,7 @@ class AbstractDBMetastore(AbstractMetastore):

         query = self._datasets_versions_insert().values(
             dataset_id=dataset.id,
+            uuid=uuid or str(uuid4()),
             version=version,
             status=status,
             feature_schema=json.dumps(feature_schema or {}),
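
One thing worth flagging in the column definition above: `default=uuid4()` calls `uuid4` once, when the table schema is built, so the column-level default is a single fixed value. The insert path compensates by always passing `uuid or str(uuid4())` explicitly. The per-row idiom in SQLAlchemy passes a callable instead; a sketch, not the package's code:

    from uuid import uuid4

    from sqlalchemy import Column, Text

    # Evaluated once at definition time: every defaulted row shares one value.
    shared_default = Column("uuid", Text, nullable=False, default=uuid4())

    # Evaluated per insert: each defaulted row gets a fresh value.
    per_row_default = Column("uuid", Text, nullable=False,
                             default=lambda: str(uuid4()))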

datachain/dataset.py CHANGED

@@ -163,6 +163,7 @@ class DatasetStatus:
 @dataclass
 class DatasetVersion:
     id: int
+    uuid: str
     dataset_id: int
     version: int
     status: int
@@ -184,6 +185,7 @@ class DatasetVersion:
     def parse(  # noqa: PLR0913
         cls: type[V],
         id: int,
+        uuid: str,
         dataset_id: int,
         version: int,
         status: int,
@@ -203,6 +205,7 @@ class DatasetVersion:
     ):
         return cls(
             id,
+            uuid,
             dataset_id,
             version,
             status,
@@ -306,6 +309,7 @@ class DatasetRecord:
         query_script: str,
         schema: str,
         version_id: int,
+        version_uuid: str,
         version_dataset_id: int,
         version: int,
         version_status: int,
@@ -331,6 +335,7 @@ class DatasetRecord:

         dataset_version = DatasetVersion.parse(
             version_id,
+            version_uuid,
             version_dataset_id,
             version,
             version_status,

datachain/lib/dataset_info.py CHANGED

@@ -1,6 +1,7 @@
 import json
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Optional, Union
+from uuid import uuid4

 from pydantic import Field, field_validator

@@ -15,6 +16,7 @@ if TYPE_CHECKING:

 class DatasetInfo(DataModel):
     name: str
+    uuid: str = Field(default=str(uuid4()))
     version: int = Field(default=1)
     status: int = Field(default=DatasetStatus.CREATED)
     created_at: datetime = Field(default=TIME_ZERO)
@@ -60,6 +62,7 @@ class DatasetInfo(DataModel):
         job: Optional[Job],
     ) -> "Self":
         return cls(
+            uuid=version.uuid,
             name=dataset.name,
             version=version.version,
             status=version.status,
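
The same evaluate-once pattern appears in `uuid: str = Field(default=str(uuid4()))` above: the default is computed when the class body runs, so every `DatasetInfo` built without an explicit uuid shares it. That is mostly harmless here, since `from_dataset_version` always passes `uuid=version.uuid`, but the per-instance Pydantic idiom is `default_factory`; a sketch:

    from uuid import uuid4

    from pydantic import BaseModel, Field

    class Example(BaseModel):
        # default_factory runs once per instance, giving each object a new uuid.
        uuid: str = Field(default_factory=lambda: str(uuid4()))

    assert Example().uuid != Example().uuid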

datachain/lib/dc.py CHANGED

@@ -30,7 +30,7 @@ from datachain.client.local import FileClient
 from datachain.dataset import DatasetRecord
 from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.values_to_tuples import values_to_tuples
-from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
+from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import ArrowRow, File, get_file_type
 from datachain.lib.file import ExportPlacement as FileExportPlacement
@@ -895,7 +895,7 @@ class DataChain:
         2. Group-based UDF function input: Instead of individual rows, the function
            receives a list all rows within each group defined by `partition_by`.

-
+        Examples:
            ```py
            chain = chain.agg(
                total=lambda category, amount: [sum(amount)],
@@ -904,6 +904,26 @@ class DataChain:
            )
            chain.save("new_dataset")
            ```
+
+           An alternative syntax, when you need to specify a more complex function:
+
+           ```py
+           # It automatically resolves which columns to pass to the function
+           # by looking at the function signature.
+           def agg_sum(
+               file: list[File], amount: list[float]
+           ) -> Iterator[tuple[File, float]]:
+               yield file[0], sum(amount)
+
+           chain = chain.agg(
+               agg_sum,
+               output={"file": File, "total": float},
+               # Alternative syntax is to use `C` (short for Column) to specify
+               # a column name or a nested column, e.g. C("file.path").
+               partition_by=C("category"),
+           )
+           chain.save("new_dataset")
+           ```
         """
         udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
         return self._evolve(
@@ -1242,15 +1262,15 @@ class DataChain:
         return self.results(row_factory=to_dict)

     @overload
-    def collect(self) -> Iterator[tuple[
+    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...

     @overload
-    def collect(self, col: str) -> Iterator[
+    def collect(self, col: str) -> Iterator[DataValue]: ...

     @overload
-    def collect(self, *cols: str) -> Iterator[tuple[
+    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...

-    def collect(self, *cols: str) -> Iterator[Union[
+    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
         """Yields rows of values, optionally limited to the specified columns.

         Args:
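
The new overloads encode the call shapes of `collect`: no arguments yields full rows as tuples, one column yields bare values, and several columns yield tuples again. A hedged usage sketch (`from_values` as used in this release's README):

    from datachain import DataChain

    chain = DataChain.from_values(num=[1, 2, 3], label=["a", "b", "c"])

    list(chain.collect("num"))           # bare DataValue per row: [1, 2, 3]
    list(chain.collect("num", "label"))  # tuples: [(1, "a"), (2, "b"), (3, "c")]
    list(chain.collect())                # full rows as tuples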

datachain/lib/file.py CHANGED

@@ -20,9 +20,6 @@ from PIL import Image
 from pyarrow.dataset import dataset
 from pydantic import Field, field_validator

-if TYPE_CHECKING:
-    from typing_extensions import Self
-
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
 from datachain.lib.utils import DataChainError

datachain/lib/meta_formats.py CHANGED

@@ -114,6 +114,7 @@ def read_meta(  # noqa: C901
         )
     )
     (model_output,) = chain.collect("meta_schema")
+    assert isinstance(model_output, str)
     if print_schema:
         print(f"{model_output}")
     # Below 'spec' should be a dynamically converted DataModel from Pydantic
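
This assert follows from the `collect` typing change above: the value now arrives as `DataValue`, a broad union, and the assert narrows it to `str` for the type checker before the string is used. The pattern in isolation, with a simplified stand-in for the real `DataValue` union:

    from typing import Union

    DataValue = Union[str, int, float, bool, None]  # simplified stand-in

    def schema_text(value: DataValue) -> str:
        assert isinstance(value, str)  # narrows the union, as in read_meta
        return value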

datachain/lib/models/bbox.py ADDED

@@ -0,0 +1,45 @@
+from typing import Optional
+
+from pydantic import Field
+
+from datachain.lib.data_model import DataModel
+
+
+class BBox(DataModel):
+    """
+    A data model for representing bounding boxes.
+
+    Attributes:
+        title (str): The title of the bounding box.
+        x1 (float): The x-coordinate of the top-left corner of the bounding box.
+        y1 (float): The y-coordinate of the top-left corner of the bounding box.
+        x2 (float): The x-coordinate of the bottom-right corner of the bounding box.
+        y2 (float): The y-coordinate of the bottom-right corner of the bounding box.
+
+    The bounding box is defined by two points:
+        - (x1, y1): The top-left corner of the box.
+        - (x2, y2): The bottom-right corner of the box.
+    """
+
+    title: str = Field(default="")
+    x1: float = Field(default=0)
+    y1: float = Field(default=0)
+    x2: float = Field(default=0)
+    y2: float = Field(default=0)
+
+    @staticmethod
+    def from_xywh(bbox: list[float], title: Optional[str] = None) -> "BBox":
+        """
+        Converts a bounding box in (x, y, width, height) format
+        to a BBox data model instance.
+
+        Args:
+            bbox (list[float]): A bounding box, represented as a list
+                of four floats [x, y, width, height].
+
+        Returns:
+            BBox: An instance of the BBox data model.
+        """
+        assert len(bbox) == 4, f"Bounding box must have 4 elements, got {len(bbox)}"
+        x, y, w, h = bbox
+        return BBox(title=title or "", x1=x, y1=y, x2=x + w, y2=y + h)
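
A quick check of the conversion, using COCO-style `[x, y, width, height]` input:

    from datachain.lib.models.bbox import BBox

    box = BBox.from_xywh([10.0, 20.0, 30.0, 40.0], title="dog")
    # The corners come out as (x, y) and (x + w, y + h).
    assert (box.x1, box.y1, box.x2, box.y2) == (10.0, 20.0, 40.0, 60.0)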

datachain/lib/models/pose.py ADDED

@@ -0,0 +1,37 @@
+from pydantic import Field
+
+from datachain.lib.data_model import DataModel
+
+
+class Pose(DataModel):
+    """
+    A data model for representing pose keypoints.
+
+    Attributes:
+        x (list[float]): The x-coordinates of the keypoints.
+        y (list[float]): The y-coordinates of the keypoints.
+
+    The keypoints are represented as lists of x and y coordinates, where each index
+    corresponds to a specific body part.
+    """
+
+    x: list[float] = Field(default=None)
+    y: list[float] = Field(default=None)
+
+
+class Pose3D(DataModel):
+    """
+    A data model for representing 3D pose keypoints.
+
+    Attributes:
+        x (list[float]): The x-coordinates of the keypoints.
+        y (list[float]): The y-coordinates of the keypoints.
+        visible (list[float]): The visibility of the keypoints.
+
+    The keypoints are represented as lists of x, y, and visibility values,
+    where each index corresponds to a specific body part.
+    """
+
+    x: list[float] = Field(default=None)
+    y: list[float] = Field(default=None)
+    visible: list[float] = Field(default=None)

datachain/lib/models/yolo.py ADDED

@@ -0,0 +1,39 @@
+"""
+This module contains the YOLO models.
+
+YOLO stands for "You Only Look Once", a family of object detection models that
+are designed to be fast and accurate. The models are trained to detect objects
+in images by dividing the image into a grid and predicting the bounding boxes
+and class probabilities for each grid cell.
+
+More information about YOLO can be found here:
+- https://pjreddie.com/darknet/yolo/
+- https://docs.ultralytics.com/
+"""
+
+
+class PoseBodyPart:
+    """
+    An enumeration of body parts for YOLO pose keypoints.
+
+    More information about the body parts can be found here:
+    https://docs.ultralytics.com/tasks/pose/
+    """
+
+    nose = 0
+    left_eye = 1
+    right_eye = 2
+    left_ear = 3
+    right_ear = 4
+    left_shoulder = 5
+    right_shoulder = 6
+    left_elbow = 7
+    right_elbow = 8
+    left_wrist = 9
+    right_wrist = 10
+    left_hip = 11
+    right_hip = 12
+    left_knee = 13
+    right_knee = 14
+    left_ankle = 15
+    right_ankle = 16
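
The attributes are plain integer indexes into the keypoint lists of `Pose`/`Pose3D`, so a keypoint is looked up like this:

    from datachain.lib.models.pose import Pose
    from datachain.lib.models.yolo import PoseBodyPart

    # 17 keypoints, one slot per body part in the enumeration.
    pose = Pose(x=[float(i) for i in range(17)], y=[0.0] * 17)
    nose_x = pose.x[PoseBodyPart.nose]         # 0.0
    wrist_x = pose.x[PoseBodyPart.left_wrist]  # 9.0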

datachain/lib/signal_schema.py CHANGED

datachain/remote/studio.py CHANGED

@@ -131,6 +131,12 @@ class StudioClient:
             timeout=self.timeout,
         )
         ok = response.ok
+        if not ok:
+            if response.status_code == 403:
+                message = f"Not authorized for the team {self.team}"
+                raise DataChainError(message)
+            logger.error("Got bad response from Studio")
+
         content = msgpack.unpackb(response.content, ext_hook=self._unpacker_hook)
         response_data = content.get("data")
         if ok and response_data is None:
@@ -177,8 +183,12 @@ class StudioClient:
                 response.content.decode("utf-8"),
             )
             if response.status_code == 403:
-                message = "Not authorized"
+                message = f"Not authorized for the team {self.team}"
             else:
+                logger.error(
+                    "Got bad response from Studio, content is %s",
+                    response.content.decode("utf-8"),
+                )
                 message = data.get("message", "")
         else:
             message = ""
@@ -214,7 +224,7 @@ class StudioClient:
         # to handle cases where a path will be expanded (i.e. globs)
         response: Response[LsData]
         for path in paths:
-            response = self._send_request_msgpack("ls", {"source": path})
+            response = self._send_request_msgpack("datachain/ls", {"source": path})
             yield path, response

     def ls_datasets(self) -> Response[LsData]:

datachain/studio.py CHANGED

@@ -1,8 +1,11 @@
 import os
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
+
+from tabulate import tabulate

 from datachain.catalog.catalog import raise_remote_error
 from datachain.config import Config, ConfigLevel
+from datachain.dataset import QUERY_DATASET_PREFIX
 from datachain.error import DataChainError
 from datachain.remote.studio import StudioClient
 from datachain.utils import STUDIO_URL
@@ -24,7 +27,13 @@ def process_studio_cli_args(args: "Namespace"):
     if args.cmd == "token":
         return token()
     if args.cmd == "datasets":
-
+        rows = [
+            {"Name": name, "Version": version}
+            for name, version in list_datasets(args.team)
+        ]
+        print(tabulate(rows, headers="keys"))
+        return 0
+
     if args.cmd == "team":
         return set_team(args)
     raise DataChainError(f"Unknown command '{args.cmd}'.")
@@ -103,19 +112,22 @@ def token():
     print(token)


-def list_datasets(
-    client = StudioClient(team=
+def list_datasets(team: Optional[str] = None):
+    client = StudioClient(team=team)
     response = client.ls_datasets()
     if not response.ok:
         raise_remote_error(response.message)
     if not response.data:
-        print("No datasets found.")
         return
+
     for d in response.data:
         name = d.get("name")
+        if name and name.startswith(QUERY_DATASET_PREFIX):
+            continue
+
         for v in d.get("versions", []):
             version = v.get("version")
-
+            yield (name, version)


 def save_config(hostname, token):
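
Note that `list_datasets` is now a generator (it yields instead of printing), so nothing executes until the caller iterates: the Studio request and any `raise_remote_error` fire on the first value pulled, not at call time. A sketch of the consequence (`"my-team"` is a placeholder):

    from datachain.studio import list_datasets

    gen = list_datasets(team="my-team")  # no request has been sent yet
    rows = list(gen)  # the request, error handling, and filtering happen here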

{datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.6.7
+Version: 0.6.9
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -45,6 +45,7 @@ Requires-Dist: huggingface-hub
 Requires-Dist: iterative-telemetry >=0.0.9
 Requires-Dist: platformdirs
 Requires-Dist: dvc-studio-client <1,>=0.21
+Requires-Dist: tabulate
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests] ; extra == 'dev'
 Requires-Dist: mypy ==1.13.0 ; extra == 'dev'
@@ -52,6 +53,7 @@ Requires-Dist: types-python-dateutil ; extra == 'dev'
 Requires-Dist: types-pytz ; extra == 'dev'
 Requires-Dist: types-PyYAML ; extra == 'dev'
 Requires-Dist: types-requests ; extra == 'dev'
+Requires-Dist: types-tabulate ; extra == 'dev'
 Provides-Extra: docs
 Requires-Dist: mkdocs >=1.5.2 ; extra == 'docs'
 Requires-Dist: mkdocs-gen-files >=0.5.0 ; extra == 'docs'
@@ -118,33 +120,41 @@ Requires-Dist: usearch ; extra == 'vector'
    :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
    :alt: Tests

-DataChain is a
-
-
+DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
+data like images, audio, videos, text and PDFs. It integrates with external storage
+(e.g., S3) to process data efficiently without data duplication and manages metadata
+in an internal database for easy and efficient querying.
+
+
+Use Cases
+=========
+
+1. **Multimodal Dataset Preparation and Curation**: ideal for organizing and
+   refining data in pre-training, fine-tuning, or LLM evaluation stages.
+2. **GenAI Data Analytics**: enables advanced analytics for multimodal data and
+   ad-hoc analytics using LLMs.

 Key Features
 ============

-📂 **
--
-  file systems.
-- Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
+📂 **Multimodal Dataset Versioning.**
+- Version unstructured data without redundant data copies, by supporting
+  references to S3, GCP, Azure, and local file systems.
+- Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
 - Unite files and metadata together into persistent, versioned, columnar datasets.

-🐍 **Python-friendly
-- Operate on Python objects and object fields
-
+🐍 **Python-friendly.**
+- Operate on Python objects and object fields: float scores, strings, matrices,
+  LLM response objects.
+- Run Python code over high-scale, terabyte-sized datasets, with built-in
+  parallelization and memory-efficient computing; no SQL or Spark required.

 🧠 **Data Enrichment and Processing.**
 - Generate metadata using local AI models and LLM APIs.
-- Filter, join, and group by metadata. Search by vector embeddings.
+- Filter, join, and group datasets by metadata. Search by vector embeddings.
+- High-performance vectorized operations on Python objects: sum, count, avg, etc.
 - Pass datasets to Pytorch and Tensorflow, or export them back into storage.

-🚀 **Efficiency.**
-- Parallelization, out-of-memory workloads and data caching.
-- Vectorized operations on Python object fields: sum, count, avg, etc.
-- Optimized vector search.
-

 Quick Start
 -----------
@@ -194,7 +204,7 @@ Batch inference with a simple sentiment model using the `transformers` library:

    pip install transformers

-The code below downloads files the cloud, and applies a user-defined function
+The code below downloads files from the cloud, and applies a user-defined function
 to each one of them. All files with a positive sentiment
 detected are then copied to the local directory.
@@ -427,6 +437,19 @@ name suffix, the following code will do it:
    loader = DataLoader(chain, batch_size=1)


+DataChain Studio Platform
+-------------------------
+
+`DataChain Studio`_ is a proprietary solution for teams that offers:
+
+- **Centralized dataset registry** to manage data, code, and dependencies
+  in one place.
+- **Data Lineage** for data sources as well as derivative datasets.
+- **UI for Multimodal Data** like images, videos, and PDFs.
+- **Scalable Compute** to handle large datasets (100M+ files) and in-house
+  AI model inference.
+- **Access control** including SSO and team-based collaboration.
+
 Tutorials
 ---------
@@ -460,6 +483,5 @@ Community and Support
 .. _Pydantic: https://github.com/pydantic/pydantic
 .. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
 .. _SQLite: https://www.sqlite.org/
-.. _Getting Started: https://datachain.
-..
-    :alt: DataChain FlowChart
+.. _Getting Started: https://docs.datachain.ai/
+.. _DataChain Studio: https://studio.datachain.ai/

{datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/RECORD CHANGED

@@ -1,11 +1,11 @@
-datachain/__init__.py,sha256=
+datachain/__init__.py,sha256=nnTyB5MpCfBZ6D85JPz-5hUT7i-68Is-47Bxgew8lRw,930
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
 datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
-datachain/cli.py,sha256=
+datachain/cli.py,sha256=hdVt_HJumQVgtaBAtBVJm-uPyYVogMXNVLmRcZyWHgk,36677
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
-datachain/dataset.py,sha256=
+datachain/dataset.py,sha256=0IN-5y723y-bnFlieKtOFZLCjwX_yplFo3q0DV7LRPw,14821
 datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=AV23WZq-k6e2zeeNBhVQP1-2PrwNCYidO0HBDKzpVaA,7152
@@ -14,17 +14,17 @@ datachain/nodes_fetcher.py,sha256=F-73-h19HHNGtHFBGKk7p3mc0ALm4a9zGnzhtuUjnp4,11
 datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
 datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/studio.py,sha256=
+datachain/studio.py,sha256=6kxF7VxPAbh9D7_Bk8_SghS5OXrwUwSpDaw19eNCTP4,4083
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=Iwb562grttdGcrNVHCna_n7e884BqwGhQwAgYagBwyg,57347
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=
+datachain/client/fsspec.py,sha256=Ai5m7alkAnv-RWXuLbZ95SKEPaQ3Pyk5ujDy50JDX5w,12692
 datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
 datachain/client/hf.py,sha256=XeVJVbiNViZCpn3sfb90Fr8SYO3BdLmfE3hOWMoqInE,951
 datachain/client/local.py,sha256=vwbgCwZ7IqY2voj2l7tLJjgov7Dp--fEUvUwUBsMbls,4457
@@ -33,7 +33,7 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
 datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=
+datachain/data_storage/metastore.py,sha256=5b7o_CSHC2djottebYn-Hq5q0yaSLOKPIRCnaVRvjsU,36056
 datachain/data_storage/schema.py,sha256=scANMQqozita3HjEtq7eupMgh6yYkrZHoXtfuL2RoQg,9879
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=wb8xlMJYYyt59wft0psJj587d-AwpNThzIqspVcKnRI,27388
@@ -42,18 +42,18 @@ datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=-hu9tic79a01SY2UBqkA3U6wUr6tnE3T3q5q_BnO93A,9156
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=dau4AlZBhOFvF7pEKMeqCeRkcFFg5KFvTBWW_2CdH5g,2371
-datachain/lib/dataset_info.py,sha256=
-datachain/lib/dc.py,sha256=
-datachain/lib/file.py,sha256=
+datachain/lib/dataset_info.py,sha256=q0EW9tj5jXGSD9Lzct9zbH4P1lfIGd_cIWqhnMxv7Q0,2464
+datachain/lib/dc.py,sha256=RQ8p95rzCMRY4ygFecO_hhQ3IgQHmbLXNqhcaINvGcI,85841
+datachain/lib/file.py,sha256=lHxE1wOGR4QJBQ3AYjhPLwpX72dOi06vkcwA-WSAGlg,14817
 datachain/lib/hf.py,sha256=BW2NPpqxkpPwkSaGlppT8Rbs8zPpyYC-tR6htY08c-0,5817
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
 datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
-datachain/lib/meta_formats.py,sha256=
+datachain/lib/meta_formats.py,sha256=anK2bDVbaeCCh0yvKUBaW2MVos3zRgdaSV8uSduzPcU,6680
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
 datachain/lib/pytorch.py,sha256=W-ARi2xH1f1DUkVfRuerW-YWYgSaJASmNCxtz2lrJGI,6072
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=xwkE5bxJxUhZTjrA6jqN87XbSXPikCbL6eOPL9WyrKM,24556
 datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=4CqK51n3bntXCmkwoOQIrX34wMKOknkC23HtR4D_2vM,12705
@@ -71,6 +71,10 @@ datachain/lib/convert/values_to_tuples.py,sha256=varRCnSMT_pZmHznrd2Yi05qXLLz_v9
 datachain/lib/func/__init__.py,sha256=wlAKhGV0QDg9y7reSwoUF8Vicfqh_YOUNIXLzxICGz4,403
 datachain/lib/func/aggregate.py,sha256=H1ziFQdaK9zvnxvttfnEzkkyGvEEmMAvmgCsBV6nfm8,10917
 datachain/lib/func/func.py,sha256=HAJZ_tpiRG2R-et7pr0WnoyNZYtpbPn3_HBuL3RQpbU,4800
+datachain/lib/models/__init__.py,sha256=AGvjPbUokJiir3uelTa4XGtNSECkMFc5Xmi_N3AtxPQ,119
+datachain/lib/models/bbox.py,sha256=aiYNhvEcRK3dEN4MBcptmkPKc9kMP16ZQdu7xPk6hek,1555
+datachain/lib/models/pose.py,sha256=peuJPNSiGuTXfCfGIABwv8PGYistvTTBmtf-8X8E_eA,1077
+datachain/lib/models/yolo.py,sha256=eftoJDUa8iOpFTF1EkKVAd5Q-3HRd6X4eCIZ9h5p4nI,972
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
 datachain/query/dataset.py,sha256=MGArYxioeGvm8w7hQtQAjEI6wsZN_XAoh4-jO4d0U5Q,53926
@@ -81,7 +85,7 @@ datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
 datachain/query/session.py,sha256=50SOdLNCjqHHKI-L4xGXyzTVxzMWfANqKqjeYre-c2k,5959
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/remote/studio.py,sha256=
+datachain/remote/studio.py,sha256=g88kHdlRhmruiWwoIxq_JJoymZUrtMAL937NWQyWyXI,9209
 datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
 datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
 datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -100,9 +104,9 @@ datachain/sql/sqlite/base.py,sha256=aHSZVvh4XSVkvZ07h3jMoRlHI4sWD8y3SnmGs9xMG9Y,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.6.7.dist-info/LICENSE,sha256=
-datachain-0.6.7.dist-info/METADATA,sha256=
-datachain-0.6.7.dist-info/WHEEL,sha256=
-datachain-0.6.7.dist-info/entry_points.txt,sha256=
-datachain-0.6.7.dist-info/top_level.txt,sha256=
-datachain-0.6.7.dist-info/RECORD,,
+datachain-0.6.9.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.6.9.dist-info/METADATA,sha256=McKhuW43_7Q3iJKxueIYbk-rpYF6rbIKeFinzeeUzMo,18037
+datachain-0.6.9.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+datachain-0.6.9.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.6.9.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.6.9.dist-info/RECORD,,

{datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/LICENSE
File without changes

{datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/entry_points.txt
File without changes

{datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/top_level.txt
File without changes