PyPI - datachain - Versions diffs - 0.6.9__py3-none-any.whl → 0.6.10__py3-none-any.whl - Mend

datachain-0.6.9-py3-none-any.whl → datachain-0.6.10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (21)

datachain/catalog/catalog.py +15 -3
datachain/data_storage/sqlite.py +6 -2
datachain/lib/dc.py +53 -0
datachain/lib/models/__init__.py +4 -3
datachain/lib/models/bbox.py +96 -25
datachain/lib/models/pose.py +79 -8
datachain/lib/models/segment.py +53 -0
datachain/lib/models/ultralytics/__init__.py +14 -0
datachain/lib/models/ultralytics/bbox.py +189 -0
datachain/lib/models/ultralytics/pose.py +126 -0
datachain/lib/models/ultralytics/segment.py +121 -0
datachain/listing.py +24 -7
datachain/toolkit/__init__.py +3 -0
datachain/toolkit/split.py +67 -0
{datachain-0.6.9.dist-info → datachain-0.6.10.dist-info}/METADATA +2 -2
{datachain-0.6.9.dist-info → datachain-0.6.10.dist-info}/RECORD +20 -14
datachain/lib/models/yolo.py +0 -39
{datachain-0.6.9.dist-info → datachain-0.6.10.dist-info}/LICENSE +0 -0
{datachain-0.6.9.dist-info → datachain-0.6.10.dist-info}/WHEEL +0 -0
{datachain-0.6.9.dist-info → datachain-0.6.10.dist-info}/entry_points.txt +0 -0
{datachain-0.6.9.dist-info → datachain-0.6.10.dist-info}/top_level.txt +0 -0

datachain/catalog/catalog.py CHANGED Viewed

@@ -603,9 +603,10 @@ class Catalog:
         )
         lst = Listing(
+            self.metastore.clone(),
             self.warehouse.clone(),
             Client.get_client(list_uri, self.cache, **self.client_config),
-            self.get_dataset(list_ds_name),
+            dataset_name=list_ds_name,
             object_name=object_name,
         )
@@ -698,9 +699,13 @@ class Catalog:
                     client = self.get_client(source, **client_config)
                     uri = client.uri
-                    st = self.warehouse.clone()
                     dataset_name, _, _, _ = DataChain.parse_uri(uri, self.session)
-                    listing = Listing(st, client, self.get_dataset(dataset_name))
+                    listing = Listing(
+                        self.metastore.clone(),
+                        self.warehouse.clone(),
+                        client,
+                        dataset_name=dataset_name,
+                    )
                     rows = DatasetQuery(
                         name=dataset.name, version=ds_version, catalog=self
                     ).to_db_records()
@@ -1354,6 +1359,13 @@ class Catalog:
             # we will create new one if it doesn't exist
             pass
+        if dataset and version and dataset.has_version(version):
+            """No need to communicate with Studio at all"""
+            dataset_uri = create_dataset_uri(remote_dataset_name, version)
+            print(f"Local copy of dataset {dataset_uri} already present")
+            _instantiate_dataset()
+            return
         remote_dataset = self.get_remote_dataset(remote_dataset_name)
         # if version is not specified in uri, take the latest one
         if not version:

datachain/data_storage/sqlite.py CHANGED Viewed

@@ -747,8 +747,12 @@ class SQLiteWarehouse(AbstractWarehouse):
         ids = self.db.execute(select_ids).fetchall()
-        select_q = query.with_only_columns(
-            *[c for c in query.selected_columns if c.name != "sys__id"]
+        select_q = (
+            query.with_only_columns(
+                *[c for c in query.selected_columns if c.name != "sys__id"]
+            )
+            .offset(None)
+            .limit(None)
         )
         for batch in batched_it(ids, 10_000):

datachain/lib/dc.py CHANGED Viewed

@@ -642,6 +642,59 @@ class DataChain:
         }
         return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
+    def explode(
+        self,
+        col: str,
+        model_name: Optional[str] = None,
+        object_name: Optional[str] = None,
+    ) -> "DataChain":
+        """Explodes a column containing JSON objects (dict or str DataChain type) into
+           individual columns based on the schema of the JSON. Schema is inferred from
+           the first row of the column.
+        Args:
+            col: the name of the column containing JSON to be exploded.
+            model_name: optional generated model name.  By default generates the name
+                automatically.
+            object_name: optional generated object column name. By default generates the
+                name automatically.
+        Returns:
+            DataChain: A new DataChain instance with the new set of columns.
+        """
+        import json
+        import pyarrow as pa
+        from datachain.lib.arrow import schema_to_output
+        json_value = next(self.limit(1).collect(col))
+        json_dict = (
+            json.loads(json_value) if isinstance(json_value, str) else json_value
+        )
+        if not isinstance(json_dict, dict):
+            raise TypeError(f"Column {col} should be a string or dict type with JSON")
+        schema = pa.Table.from_pylist([json_dict]).schema
+        output = schema_to_output(schema, None)
+        if not model_name:
+            model_name = f"{col.title()}ExplodedModel"
+        model = dict_to_data_model(model_name, output)
+        def json_to_model(json_value: Union[str, dict]):
+            json_dict = (
+                json.loads(json_value) if isinstance(json_value, str) else json_value
+            )
+            return model.model_validate(json_dict)
+        if not object_name:
+            object_name = f"{col}_expl"
+        return self.map(json_to_model, params=col, output={object_name: model})
     @classmethod
     def datasets(
         cls,

datachain/lib/models/__init__.py CHANGED Viewed

@@ -1,5 +1,6 @@
-from . import yolo
-from .bbox import BBox
+from . import ultralytics
+from .bbox import BBox, OBBox
 from .pose import Pose, Pose3D
+from .segment import Segments
-__all__ = ["BBox", "Pose", "Pose3D", "yolo"]
+__all__ = ["BBox", "OBBox", "Pose", "Pose3D", "Segments", "ultralytics"]

datachain/lib/models/bbox.py CHANGED Viewed

@@ -1,5 +1,3 @@
-from typing import Optional
 from pydantic import Field
 from datachain.lib.data_model import DataModel
@@ -11,10 +9,7 @@ class BBox(DataModel):
     Attributes:
         title (str): The title of the bounding box.
-        x1 (float): The x-coordinate of the top-left corner of the bounding box.
-        y1 (float): The y-coordinate of the top-left corner of the bounding box.
-        x2 (float): The x-coordinate of the bottom-right corner of the bounding box.
-        y2 (float): The y-coordinate of the bottom-right corner of the bounding box.
+        coords (list[int]): The coordinates of the bounding box.
     The bounding box is defined by two points:
         - (x1, y1): The top-left corner of the box.
@@ -22,24 +17,100 @@ class BBox(DataModel):
     """
     title: str = Field(default="")
-    x1: float = Field(default=0)
-    y1: float = Field(default=0)
-    x2: float = Field(default=0)
-    y2: float = Field(default=0)
+    coords: list[int] = Field(default=None)
+    @staticmethod
+    def from_list(coords: list[float], title: str = "") -> "BBox":
+        assert len(coords) == 4, "Bounding box coordinates must be a list of 4 floats."
+        assert all(
+            isinstance(value, (int, float)) for value in coords
+        ), "Bounding box coordinates must be integers or floats."
+        return BBox(
+            title=title,
+            coords=[round(c) for c in coords],
+        )
+    @staticmethod
+    def from_dict(coords: dict[str, float], title: str = "") -> "BBox":
+        assert (
+            len(coords) == 4
+        ), "Bounding box coordinates must be a dictionary of 4 floats."
+        assert set(coords) == {
+            "x1",
+            "y1",
+            "x2",
+            "y2",
+        }, "Bounding box coordinates must contain keys with coordinates."
+        assert all(
+            isinstance(value, (int, float)) for value in coords.values()
+        ), "Bounding box coordinates must be integers or floats."
+        return BBox(
+            title=title,
+            coords=[
+                round(coords["x1"]),
+                round(coords["y1"]),
+                round(coords["x2"]),
+                round(coords["y2"]),
+            ],
+        )
+class OBBox(DataModel):
+    """
+    A data model for representing oriented bounding boxes.
+    Attributes:
+        title (str): The title of the oriented bounding box.
+        coords (list[int]): The coordinates of the oriented bounding box.
+    The oriented bounding box is defined by four points:
+        - (x1, y1): The first corner of the box.
+        - (x2, y2): The second corner of the box.
+        - (x3, y3): The third corner of the box.
+        - (x4, y4): The fourth corner of the box.
+    """
+    title: str = Field(default="")
+    coords: list[int] = Field(default=None)
+    @staticmethod
+    def from_list(coords: list[float], title: str = "") -> "OBBox":
+        assert (
+            len(coords) == 8
+        ), "Oriented bounding box coordinates must be a list of 8 floats."
+        assert all(
+            isinstance(value, (int, float)) for value in coords
+        ), "Oriented bounding box coordinates must be integers or floats."
+        return OBBox(
+            title=title,
+            coords=[round(c) for c in coords],
+        )
     @staticmethod
-    def from_xywh(bbox: list[float], title: Optional[str] = None) -> "BBox":
-        """
-        Converts a bounding box in (x, y, width, height) format
-        to a BBox data model instance.
-        Args:
-            bbox (list[float]): A bounding box, represented as a list
-                                of four floats [x, y, width, height].
-        Returns:
-            BBox2D: An instance of the BBox data model.
-        """
-        assert len(bbox) == 4, f"Bounding box must have 4 elements, got f{len(bbox)}"
-        x, y, w, h = bbox
-        return BBox(title=title or "", x1=x, y1=y, x2=x + w, y2=y + h)
+    def from_dict(coords: dict[str, float], title: str = "") -> "OBBox":
+        assert set(coords) == {
+            "x1",
+            "y1",
+            "x2",
+            "y2",
+            "x3",
+            "y3",
+            "x4",
+            "y4",
+        }, "Oriented bounding box coordinates must contain keys with coordinates."
+        assert all(
+            isinstance(value, (int, float)) for value in coords.values()
+        ), "Oriented bounding box coordinates must be integers or floats."
+        return OBBox(
+            title=title,
+            coords=[
+                round(coords["x1"]),
+                round(coords["y1"]),
+                round(coords["x2"]),
+                round(coords["y2"]),
+                round(coords["x3"]),
+                round(coords["y3"]),
+                round(coords["x4"]),
+                round(coords["y4"]),
+            ],
+        )

datachain/lib/models/pose.py CHANGED Viewed

@@ -8,15 +8,48 @@ class Pose(DataModel):
     A data model for representing pose keypoints.
     Attributes:
-        x (list[float]): The x-coordinates of the keypoints.
-        y (list[float]): The y-coordinates of the keypoints.
+        x (list[int]): The x-coordinates of the keypoints.
+        y (list[int]): The y-coordinates of the keypoints.
     The keypoints are represented as lists of x and y coordinates, where each index
     corresponds to a specific body part.
     """
-    x: list[float] = Field(default=None)
-    y: list[float] = Field(default=None)
+    x: list[int] = Field(default=None)
+    y: list[int] = Field(default=None)
+    @staticmethod
+    def from_list(points: list[list[float]]) -> "Pose":
+        assert len(points) == 2, "Pose coordinates must be a list of 2 lists."
+        points_x, points_y = points
+        assert (
+            len(points_x) == len(points_y) == 17
+        ), "Pose x and y coordinates must have the same length of 17."
+        assert all(
+            isinstance(value, (int, float)) for value in [*points_x, *points_y]
+        ), "Pose coordinates must be integers or floats."
+        return Pose(
+            x=[round(coord) for coord in points_x],
+            y=[round(coord) for coord in points_y],
+        )
+    @staticmethod
+    def from_dict(points: dict[str, list[float]]) -> "Pose":
+        assert set(points) == {
+            "x",
+            "y",
+        }, "Pose coordinates must contain keys 'x' and 'y'."
+        points_x, points_y = points["x"], points["y"]
+        assert (
+            len(points_x) == len(points_y) == 17
+        ), "Pose x and y coordinates must have the same length of 17."
+        assert all(
+            isinstance(value, (int, float)) for value in [*points_x, *points_y]
+        ), "Pose coordinates must be integers or floats."
+        return Pose(
+            x=[round(coord) for coord in points_x],
+            y=[round(coord) for coord in points_y],
+        )
 class Pose3D(DataModel):
@@ -24,14 +57,52 @@ class Pose3D(DataModel):
     A data model for representing 3D pose keypoints.
     Attributes:
-        x (list[float]): The x-coordinates of the keypoints.
-        y (list[float]): The y-coordinates of the keypoints.
+        x (list[int]): The x-coordinates of the keypoints.
+        y (list[int]): The y-coordinates of the keypoints.
         visible (list[float]): The visibility of the keypoints.
     The keypoints are represented as lists of x, y, and visibility values,
     where each index corresponds to a specific body part.
     """
-    x: list[float] = Field(default=None)
-    y: list[float] = Field(default=None)
+    x: list[int] = Field(default=None)
+    y: list[int] = Field(default=None)
     visible: list[float] = Field(default=None)
+    @staticmethod
+    def from_list(points: list[list[float]]) -> "Pose3D":
+        assert len(points) == 3, "Pose coordinates must be a list of 3 lists."
+        points_x, points_y, points_v = points
+        assert (
+            len(points_x) == len(points_y) == len(points_v) == 17
+        ), "Pose x, y, and visibility coordinates must have the same length of 17."
+        assert all(
+            isinstance(value, (int, float))
+            for value in [*points_x, *points_y, *points_v]
+        ), "Pose coordinates must be integers or floats."
+        return Pose3D(
+            x=[round(coord) for coord in points_x],
+            y=[round(coord) for coord in points_y],
+            visible=points_v,
+        )
+    @staticmethod
+    def from_dict(points: dict[str, list[float]]) -> "Pose3D":
+        assert set(points) == {
+            "x",
+            "y",
+            "visible",
+        }, "Pose coordinates must contain keys 'x', 'y', and 'visible'."
+        points_x, points_y, points_v = points["x"], points["y"], points["visible"]
+        assert (
+            len(points_x) == len(points_y) == len(points_v) == 17
+        ), "Pose x, y, and visibility coordinates must have the same length of 17."
+        assert all(
+            isinstance(value, (int, float))
+            for value in [*points_x, *points_y, *points_v]
+        ), "Pose coordinates must be integers or floats."
+        return Pose3D(
+            x=[round(coord) for coord in points_x],
+            y=[round(coord) for coord in points_y],
+            visible=points_v,
+        )

datachain/lib/models/segment.py ADDED Viewed

@@ -0,0 +1,53 @@
+from pydantic import Field
+from datachain.lib.data_model import DataModel
+class Segments(DataModel):
+    """
+    A data model for representing segments.
+    Attributes:
+        title (str): The title of the segments.
+        x (list[int]): The x-coordinates of the segments.
+        y (list[int]): The y-coordinates of the segments.
+    The segments are represented as lists of x and y coordinates, where each index
+    corresponds to a specific segment.
+    """
+    title: str = Field(default="")
+    x: list[int] = Field(default=None)
+    y: list[int] = Field(default=None)
+    @staticmethod
+    def from_list(points: list[list[float]], title: str = "") -> "Segments":
+        assert len(points) == 2, "Segments coordinates must be a list of 2 lists."
+        points_x, points_y = points
+        assert len(points_x) == len(
+            points_y
+        ), "Segments x and y coordinates must have the same length."
+        assert all(
+            isinstance(value, (int, float)) for value in [*points_x, *points_y]
+        ), "Segments coordinates must be integers or floats."
+        return Segments(
+            title=title,
+            x=[round(coord) for coord in points_x],
+            y=[round(coord) for coord in points_y],
+        )
+    @staticmethod
+    def from_dict(points: dict[str, list[float]], title: str = "") -> "Segments":
+        assert set(points) == {
+            "x",
+            "y",
+        }, "Segments coordinates must contain keys 'x' and 'y'."
+        points_x, points_y = points["x"], points["y"]
+        assert all(
+            isinstance(value, (int, float)) for value in [*points_x, *points_y]
+        ), "Segments coordinates must be integers or floats."
+        return Segments(
+            title=title,
+            x=[round(coord) for coord in points_x],
+            y=[round(coord) for coord in points_y],
+        )

datachain/lib/models/ultralytics/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+from .bbox import YoloBBox, YoloBBoxes, YoloOBBox, YoloOBBoxes
+from .pose import YoloPose, YoloPoses
+from .segment import YoloSegment, YoloSegments
+__all__ = [
+    "YoloBBox",
+    "YoloBBoxes",
+    "YoloOBBox",
+    "YoloOBBoxes",
+    "YoloPose",
+    "YoloPoses",
+    "YoloSegment",
+    "YoloSegments",
+]

datachain/lib/models/ultralytics/bbox.py ADDED Viewed

@@ -0,0 +1,189 @@
+"""
+This module contains the YOLO models.
+YOLO stands for "You Only Look Once", a family of object detection models that
+are designed to be fast and accurate. The models are trained to detect objects
+in images by dividing the image into a grid and predicting the bounding boxes
+and class probabilities for each grid cell.
+More information about YOLO can be found here:
+- https://pjreddie.com/darknet/yolo/
+- https://docs.ultralytics.com/
+"""
+from io import BytesIO
+from typing import TYPE_CHECKING
+from PIL import Image
+from pydantic import Field
+from datachain.lib.data_model import DataModel
+from datachain.lib.models.bbox import BBox, OBBox
+if TYPE_CHECKING:
+    from ultralytics.engine.results import Results
+    from ultralytics.models import YOLO
+    from datachain.lib.file import File
+class YoloBBox(DataModel):
+    """
+    A class representing a bounding box detected by a YOLO model.
+    Attributes:
+        cls: The class of the detected object.
+        name: The name of the detected object.
+        confidence: The confidence score of the detection.
+        box: The bounding box of the detected object
+    """
+    cls: int = Field(default=-1)
+    name: str = Field(default="")
+    confidence: float = Field(default=0)
+    box: BBox = Field(default=None)
+    @staticmethod
+    def from_file(yolo: "YOLO", file: "File") -> "YoloBBox":
+        results = yolo(Image.open(BytesIO(file.read())))
+        if len(results) == 0:
+            return YoloBBox()
+        return YoloBBox.from_result(results[0])
+    @staticmethod
+    def from_result(result: "Results") -> "YoloBBox":
+        summary = result.summary()
+        if not summary:
+            return YoloBBox()
+        name = summary[0].get("name", "")
+        box = (
+            BBox.from_dict(summary[0]["box"], title=name)
+            if "box" in summary[0]
+            else BBox()
+        )
+        return YoloBBox(
+            cls=summary[0]["class"],
+            name=name,
+            confidence=summary[0]["confidence"],
+            box=box,
+        )
+class YoloBBoxes(DataModel):
+    """
+    A class representing a list of bounding boxes detected by a YOLO model.
+    Attributes:
+        cls: A list of classes of the detected objects.
+        name: A list of names of the detected objects.
+        confidence: A list of confidence scores of the detections.
+        box: A list of bounding boxes of the detected objects
+    """
+    cls: list[int]
+    name: list[str]
+    confidence: list[float]
+    box: list[BBox]
+    @staticmethod
+    def from_file(yolo: "YOLO", file: "File") -> "YoloBBoxes":
+        results = yolo(Image.open(BytesIO(file.read())))
+        return YoloBBoxes.from_results(results)
+    @staticmethod
+    def from_results(results: list["Results"]) -> "YoloBBoxes":
+        cls, names, confidence, box = [], [], [], []
+        for r in results:
+            for s in r.summary():
+                name = s.get("name", "")
+                cls.append(s["class"])
+                names.append(name)
+                confidence.append(s["confidence"])
+                box.append(BBox.from_dict(s.get("box", {}), title=name))
+        return YoloBBoxes(
+            cls=cls,
+            name=names,
+            confidence=confidence,
+            box=box,
+        )
+class YoloOBBox(DataModel):
+    """
+    A class representing an oriented bounding box detected by a YOLO model.
+    Attributes:
+        cls: The class of the detected object.
+        name: The name of the detected object.
+        confidence: The confidence score of the detection.
+        box: The oriented bounding box of the detected object.
+    """
+    cls: int = Field(default=-1)
+    name: str = Field(default="")
+    confidence: float = Field(default=0)
+    box: OBBox = Field(default=None)
+    @staticmethod
+    def from_file(yolo: "YOLO", file: "File") -> "YoloOBBox":
+        results = yolo(Image.open(BytesIO(file.read())))
+        if len(results) == 0:
+            return YoloOBBox()
+        return YoloOBBox.from_result(results[0])
+    @staticmethod
+    def from_result(result: "Results") -> "YoloOBBox":
+        summary = result.summary()
+        if not summary:
+            return YoloOBBox()
+        name = summary[0].get("name", "")
+        box = (
+            OBBox.from_dict(summary[0]["box"], title=name)
+            if "box" in summary[0]
+            else OBBox()
+        )
+        return YoloOBBox(
+            cls=summary[0]["class"],
+            name=name,
+            confidence=summary[0]["confidence"],
+            box=box,
+        )
+class YoloOBBoxes(DataModel):
+    """
+    A class representing a list of oriented bounding boxes detected by a YOLO model.
+    Attributes:
+        cls: A list of classes of the detected objects.
+        name: A list of names of the detected objects.
+        confidence: A list of confidence scores of the detections.
+        box: A list of oriented bounding boxes of the detected objects.
+    """
+    cls: list[int]
+    name: list[str]
+    confidence: list[float]
+    box: list[OBBox]
+    @staticmethod
+    def from_file(yolo: "YOLO", file: "File") -> "YoloOBBoxes":
+        results = yolo(Image.open(BytesIO(file.read())))
+        return YoloOBBoxes.from_results(results)
+    @staticmethod
+    def from_results(results: list["Results"]) -> "YoloOBBoxes":
+        cls, names, confidence, box = [], [], [], []
+        for r in results:
+            for s in r.summary():
+                name = s.get("name", "")
+                cls.append(s["class"])
+                names.append(name)
+                confidence.append(s["confidence"])
+                box.append(OBBox.from_dict(s.get("box", {}), title=name))
+        return YoloOBBoxes(
+            cls=cls,
+            name=names,
+            confidence=confidence,
+            box=box,
+        )

datachain/lib/models/ultralytics/pose.py ADDED Viewed

@@ -0,0 +1,126 @@
+"""
+This module contains the YOLO models.
+YOLO stands for "You Only Look Once", a family of object detection models that
+are designed to be fast and accurate. The models are trained to detect objects
+in images by dividing the image into a grid and predicting the bounding boxes
+and class probabilities for each grid cell.
+More information about YOLO can be found here:
+- https://pjreddie.com/darknet/yolo/
+- https://docs.ultralytics.com/
+"""
+from typing import TYPE_CHECKING
+from pydantic import Field
+from datachain.lib.data_model import DataModel
+from datachain.lib.models.bbox import BBox
+from datachain.lib.models.pose import Pose3D
+if TYPE_CHECKING:
+    from ultralytics.engine.results import Results
+class YoloPoseBodyPart:
+    """An enumeration of body parts for YOLO pose keypoints."""
+    nose = 0
+    left_eye = 1
+    right_eye = 2
+    left_ear = 3
+    right_ear = 4
+    left_shoulder = 5
+    right_shoulder = 6
+    left_elbow = 7
+    right_elbow = 8
+    left_wrist = 9
+    right_wrist = 10
+    left_hip = 11
+    right_hip = 12
+    left_knee = 13
+    right_knee = 14
+    left_ankle = 15
+    right_ankle = 16
+class YoloPose(DataModel):
+    """
+    A data model for YOLO pose keypoints.
+    Attributes:
+        cls: The class of the pose.
+        name: The name of the pose.
+        confidence: The confidence score of the pose.
+        box: The bounding box of the pose.
+        keypoints: The 3D pose keypoints.
+    """
+    cls: int = Field(default=-1)
+    name: str = Field(default="")
+    confidence: float = Field(default=0)
+    box: BBox = Field(default=None)
+    keypoints: Pose3D = Field(default=None)
+    @staticmethod
+    def from_result(result: "Results") -> "YoloPose":
+        summary = result.summary()
+        if not summary:
+            return YoloPose()
+        name = summary[0].get("name", "")
+        box = (
+            BBox.from_dict(summary[0]["box"], title=name)
+            if "box" in summary[0]
+            else BBox()
+        )
+        keypoints = (
+            Pose3D.from_dict(summary[0]["keypoints"])
+            if "keypoints" in summary[0]
+            else Pose3D()
+        )
+        return YoloPose(
+            cls=summary[0]["class"],
+            name=name,
+            confidence=summary[0]["confidence"],
+            box=box,
+            keypoints=keypoints,
+        )
+class YoloPoses(DataModel):
+    """
+    A data model for a list of YOLO pose keypoints.
+    Attributes:
+        cls: The classes of the poses.
+        name: The names of the poses.
+        confidence: The confidence scores of the poses.
+        box: The bounding boxes of the poses.
+        keypoints: The 3D pose keypoints of the poses.
+    """
+    cls: list[int]
+    name: list[str]
+    confidence: list[float]
+    box: list[BBox]
+    keypoints: list[Pose3D]
+    @staticmethod
+    def from_results(results: list["Results"]) -> "YoloPoses":
+        cls, names, confidence, box, keypoints = [], [], [], [], []
+        for r in results:
+            for s in r.summary():
+                name = s.get("name", "")
+                cls.append(s["class"])
+                names.append(name)
+                confidence.append(s["confidence"])
+                box.append(BBox.from_dict(s.get("box", {}), title=name))
+                keypoints.append(Pose3D.from_dict(s.get("keypoints", {})))
+        return YoloPoses(
+            cls=cls,
+            name=names,
+            confidence=confidence,
+            box=box,
+            keypoints=keypoints,
+        )

datachain/lib/models/ultralytics/segment.py ADDED Viewed

@@ -0,0 +1,121 @@
+"""
+This module contains the YOLO models.
+YOLO stands for "You Only Look Once", a family of object detection models that
+are designed to be fast and accurate. The models are trained to detect objects
+in images by dividing the image into a grid and predicting the bounding boxes
+and class probabilities for each grid cell.
+More information about YOLO can be found here:
+- https://pjreddie.com/darknet/yolo/
+- https://docs.ultralytics.com/
+"""
+from io import BytesIO
+from typing import TYPE_CHECKING
+from PIL import Image
+from pydantic import Field
+from datachain.lib.data_model import DataModel
+from datachain.lib.models.bbox import BBox
+from datachain.lib.models.segment import Segments
+if TYPE_CHECKING:
+    from ultralytics.engine.results import Results
+    from ultralytics.models import YOLO
+    from datachain.lib.file import File
+class YoloSegment(DataModel):
+    """
+    A data model for a single YOLO segment.
+    Attributes:
+        cls (int): The class of the segment.
+        name (str): The name of the segment.
+        confidence (float): The confidence of the segment.
+        box (BBox): The bounding box of the segment.
+        segments (Segments): The segments of the segment.
+    """
+    cls: int = Field(default=-1)
+    name: str = Field(default="")
+    confidence: float = Field(default=0)
+    box: BBox = Field(default=None)
+    segments: Segments = Field(default=None)
+    @staticmethod
+    def from_file(yolo: "YOLO", file: "File") -> "YoloSegment":
+        results = yolo(Image.open(BytesIO(file.read())))
+        if len(results) == 0:
+            return YoloSegment()
+        return YoloSegment.from_result(results[0])
+    @staticmethod
+    def from_result(result: "Results") -> "YoloSegment":
+        summary = result.summary()
+        if not summary:
+            return YoloSegment()
+        name = summary[0].get("name", "")
+        box = (
+            BBox.from_dict(summary[0]["box"], title=name)
+            if "box" in summary[0]
+            else BBox()
+        )
+        segments = (
+            Segments.from_dict(summary[0]["segments"], title=name)
+            if "segments" in summary[0]
+            else Segments()
+        )
+        return YoloSegment(
+            cls=summary[0]["class"],
+            name=summary[0]["name"],
+            confidence=summary[0]["confidence"],
+            box=box,
+            segments=segments,
+        )
+class YoloSegments(DataModel):
+    """
+    A data model for a list of YOLO segments.
+    Attributes:
+        cls (list[int]): The classes of the segments.
+        name (list[str]): The names of the segments.
+        confidence (list[float]): The confidences of the segments.
+        box (list[BBox]): The bounding boxes of the segments.
+        segments (list[Segments]): The segments of the segments.
+    """
+    cls: list[int]
+    name: list[str]
+    confidence: list[float]
+    box: list[BBox]
+    segments: list[Segments]
+    @staticmethod
+    def from_file(yolo: "YOLO", file: "File") -> "YoloSegments":
+        results = yolo(Image.open(BytesIO(file.read())))
+        return YoloSegments.from_results(results)
+    @staticmethod
+    def from_results(results: list["Results"]) -> "YoloSegments":
+        cls, names, confidence, box, segments = [], [], [], [], []
+        for r in results:
+            for s in r.summary():
+                name = s.get("name", "")
+                cls.append(s["class"])
+                names.append(name)
+                confidence.append(s["confidence"])
+                box.append(BBox.from_dict(s.get("box", {}), title=name))
+                segments.append(Segments.from_dict(s.get("segments", {}), title=name))
+        return YoloSegments(
+            cls=cls,
+            name=names,
+            confidence=confidence,
+            box=box,
+            segments=segments,
+        )

datachain/listing.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import glob
 import os
 from collections.abc import Iterable, Iterator
+from functools import cached_property
 from itertools import zip_longest
 from typing import TYPE_CHECKING, Optional
@@ -15,28 +16,34 @@ from datachain.utils import suffix_to_number
 if TYPE_CHECKING:
     from datachain.catalog.datasource import DataSource
     from datachain.client import Client
-    from datachain.data_storage import AbstractWarehouse
+    from datachain.data_storage import AbstractMetastore, AbstractWarehouse
     from datachain.dataset import DatasetRecord
 class Listing:
     def __init__(
         self,
+        metastore: "AbstractMetastore",
         warehouse: "AbstractWarehouse",
         client: "Client",
-        dataset: Optional["DatasetRecord"],
+        dataset_name: Optional["str"] = None,
+        dataset_version: Optional[int] = None,
         object_name: str = "file",
     ):
+        self.metastore = metastore
         self.warehouse = warehouse
         self.client = client
-        self.dataset = dataset  # dataset representing bucket listing
+        self.dataset_name = dataset_name  # dataset representing bucket listing
+        self.dataset_version = dataset_version  # dataset representing bucket listing
         self.object_name = object_name
     def clone(self) -> "Listing":
         return self.__class__(
+            self.metastore.clone(),
             self.warehouse.clone(),
             self.client,
-            self.dataset,
+            self.dataset_name,
+            self.dataset_version,
             self.object_name,
         )
@@ -53,12 +60,22 @@ class Listing:
     def uri(self):
         from datachain.lib.listing import listing_uri_from_name
-        return listing_uri_from_name(self.dataset.name)
+        assert self.dataset_name
-    @property
+        return listing_uri_from_name(self.dataset_name)
+    @cached_property
+    def dataset(self) -> "DatasetRecord":
+        assert self.dataset_name
+        return self.metastore.get_dataset(self.dataset_name)
+    @cached_property
     def dataset_rows(self):
+        dataset = self.dataset
         return self.warehouse.dataset_rows(
-            self.dataset, self.dataset.latest_version, object_name=self.object_name
+            dataset,
+            self.dataset_version or dataset.latest_version,
+            object_name=self.object_name,
         )
     def expand_path(self, path, use_glob=True) -> list[Node]:

datachain/toolkit/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .split import train_test_split
+__all__ = ["train_test_split"]

datachain/toolkit/split.py ADDED Viewed

@@ -0,0 +1,67 @@
+from datachain import C, DataChain
+def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
+    """
+    Splits a DataChain into multiple subsets based on the provided weights.
+    This function partitions the rows or items of a DataChain into disjoint subsets,
+    ensuring that the relative sizes of the subsets correspond to the given weights.
+    It is particularly useful for creating training, validation, and test datasets.
+    Args:
+        dc (DataChain):
+            The DataChain instance to split.
+        weights (list[float]):
+            A list of weights indicating the relative proportions of the splits.
+            The weights do not need to sum to 1; they will be normalized internally.
+            For example:
+            - `[0.7, 0.3]` corresponds to a 70/30 split;
+            - `[2, 1, 1]` corresponds to a 50/25/25 split.
+    Returns:
+        list[DataChain]:
+            A list of DataChain instances, one for each weight in the weights list.
+    Raises:
+        ValueError:
+            If fewer than two weights are given, if any weight is negative,
+            or if all weights are zero.
+    Examples:
+        Train-test split:
+        ```python
+        from datachain import DataChain
+        from datachain.toolkit import train_test_split
+        # Load a DataChain from a storage source (e.g., S3 bucket)
+        dc = DataChain.from_storage("s3://bucket/dir/")
+        # Perform a 70/30 train-test split
+        train, test = train_test_split(dc, [0.7, 0.3])
+        # Save the resulting splits
+        train.save("dataset_train")
+        test.save("dataset_test")
+        ```
+        Train-test-validation split:
+        ```python
+        train, test, val = train_test_split(dc, [0.7, 0.2, 0.1])
+        train.save("dataset_train")
+        test.save("dataset_test")
+        val.save("dataset_val")
+        ```
+    Note:
+        The splits are random but deterministic, based on Dataset `sys__rand` field.
+    """
+    if len(weights) < 2:
+        raise ValueError("Weights should have at least two elements")
+    if any(weight < 0 for weight in weights):
+        raise ValueError("Weights should be non-negative")
+    total = sum(weights)
+    if total == 0:
+        # All-zero weights cannot be normalized; fail with the same error type
+        # as the other validations instead of a ZeroDivisionError below.
+        raise ValueError("Weights should not all be zero")
+    weights_normalized = [weight / total for weight in weights]
+    # Cumulative boundaries on the `sys__rand % 1000` scale. Each boundary is
+    # computed once and shared by the two neighbouring splits, so the half-open
+    # ranges [lower, upper) neither overlap nor leave gaps between them.
+    bounds = [
+        round(sum(weights_normalized[:index]) * 1000)
+        for index in range(len(weights_normalized) + 1)
+    ]
+    return [
+        dc.filter(
+            C("sys__rand") % 1000 >= lower,
+            C("sys__rand") % 1000 < upper,
+        )
+        for lower, upper in zip(bounds, bounds[1:])
+    ]

{datachain-0.6.9.dist-info → datachain-0.6.10.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.6.9
+Version: 0.6.10
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -82,7 +82,7 @@ Requires-Dist: pytest <9,>=8 ; extra == 'tests'
 Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
 Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
 Requires-Dist: pytest-mock >=3.12.0 ; extra == 'tests'
-Requires-Dist: pytest-servers[all] >=0.5.7 ; extra == 'tests'
+Requires-Dist: pytest-servers[all] >=0.5.8 ; extra == 'tests'
 Requires-Dist: pytest-benchmark[histogram] ; extra == 'tests'
 Requires-Dist: pytest-xdist >=3.3.1 ; extra == 'tests'
 Requires-Dist: virtualenv ; extra == 'tests'

{datachain-0.6.9.dist-info → datachain-0.6.10.dist-info}/RECORD RENAMED Viewed

@@ -8,7 +8,7 @@ datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
 datachain/dataset.py,sha256=0IN-5y723y-bnFlieKtOFZLCjwX_yplFo3q0DV7LRPw,14821
 datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
-datachain/listing.py,sha256=AV23WZq-k6e2zeeNBhVQP1-2PrwNCYidO0HBDKzpVaA,7152
+datachain/listing.py,sha256=TgKg25ZWAP5enzKgw2_2GUPJVdnQUh6uySHB5SJrUY4,7773
 datachain/node.py,sha256=i7_jC8VcW6W5VYkDszAOu0H-rNBuqXB4UnLEh4wFzjc,5195
 datachain/nodes_fetcher.py,sha256=F-73-h19HHNGtHFBGKk7p3mc0ALm4a9zGnzhtuUjnp4,1107
 datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
@@ -18,7 +18,7 @@ datachain/studio.py,sha256=6kxF7VxPAbh9D7_Bk8_SghS5OXrwUwSpDaw19eNCTP4,4083
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=Iwb562grttdGcrNVHCna_n7e884BqwGhQwAgYagBwyg,57347
+datachain/catalog/catalog.py,sha256=J1nUWLI4RYCvvR6fB4neQBtB7V-CTh4PM71irhNmJc4,57817
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
@@ -36,14 +36,14 @@ datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s
 datachain/data_storage/metastore.py,sha256=5b7o_CSHC2djottebYn-Hq5q0yaSLOKPIRCnaVRvjsU,36056
 datachain/data_storage/schema.py,sha256=scANMQqozita3HjEtq7eupMgh6yYkrZHoXtfuL2RoQg,9879
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=wb8xlMJYYyt59wft0psJj587d-AwpNThzIqspVcKnRI,27388
+datachain/data_storage/sqlite.py,sha256=CspRUlYsIcubgzvcQxTACnmcuKESSLZcqCl0dcrtRiA,27471
 datachain/data_storage/warehouse.py,sha256=xwMaR4jBpR13vjG3zrhphH4z2_CFLNj0KPF0LJCXCJ8,30727
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=-hu9tic79a01SY2UBqkA3U6wUr6tnE3T3q5q_BnO93A,9156
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=dau4AlZBhOFvF7pEKMeqCeRkcFFg5KFvTBWW_2CdH5g,2371
 datachain/lib/dataset_info.py,sha256=q0EW9tj5jXGSD9Lzct9zbH4P1lfIGd_cIWqhnMxv7Q0,2464
-datachain/lib/dc.py,sha256=RQ8p95rzCMRY4ygFecO_hhQ3IgQHmbLXNqhcaINvGcI,85841
+datachain/lib/dc.py,sha256=BmRgCt5fXvBqlFV07KN-nWszueRyCkC7td1x7T4BZ7k,87688
 datachain/lib/file.py,sha256=lHxE1wOGR4QJBQ3AYjhPLwpX72dOi06vkcwA-WSAGlg,14817
 datachain/lib/hf.py,sha256=BW2NPpqxkpPwkSaGlppT8Rbs8zPpyYC-tR6htY08c-0,5817
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
@@ -71,10 +71,14 @@ datachain/lib/convert/values_to_tuples.py,sha256=varRCnSMT_pZmHznrd2Yi05qXLLz_v9
 datachain/lib/func/__init__.py,sha256=wlAKhGV0QDg9y7reSwoUF8Vicfqh_YOUNIXLzxICGz4,403
 datachain/lib/func/aggregate.py,sha256=H1ziFQdaK9zvnxvttfnEzkkyGvEEmMAvmgCsBV6nfm8,10917
 datachain/lib/func/func.py,sha256=HAJZ_tpiRG2R-et7pr0WnoyNZYtpbPn3_HBuL3RQpbU,4800
-datachain/lib/models/__init__.py,sha256=AGvjPbUokJiir3uelTa4XGtNSECkMFc5Xmi_N3AtxPQ,119
-datachain/lib/models/bbox.py,sha256=aiYNhvEcRK3dEN4MBcptmkPKc9kMP16ZQdu7xPk6hek,1555
-datachain/lib/models/pose.py,sha256=peuJPNSiGuTXfCfGIABwv8PGYistvTTBmtf-8X8E_eA,1077
-datachain/lib/models/yolo.py,sha256=eftoJDUa8iOpFTF1EkKVAd5Q-3HRd6X4eCIZ9h5p4nI,972
+datachain/lib/models/__init__.py,sha256=6iwqXWcybyELKdLEe59yUPl8R8ZHDY4lA-xCHVYPdOA,191
+datachain/lib/models/bbox.py,sha256=UJ_64D8TQglX2B_ueseILPoT3cGIWr9McVg0mv2YdmE,3717
+datachain/lib/models/pose.py,sha256=KC-OpLC7-3v6qg4YN6pXlfAgtg88VLQoRc75JCEmbfY,3931
+datachain/lib/models/segment.py,sha256=ergCFnEzLDzaU75p1_KvWgal1LSv4VuFmkWLkRJeaVk,1862
+datachain/lib/models/ultralytics/__init__.py,sha256=g8mgII0k_RJiOG9kd4k_ECfCgDhT_iPh3vCC_5OiDD4,305
+datachain/lib/models/ultralytics/bbox.py,sha256=LAaezAnnugfBiczWZ63NTo65kX2BegR5WGXjQTOTE28,5784
+datachain/lib/models/ultralytics/pose.py,sha256=nMoEeeY_Zi7Iiu7vIo9ZTq8ARUdg_BcZMQIA_WgRNk4,3488
+datachain/lib/models/ultralytics/segment.py,sha256=IHnthsq6uQ6DSdHLK2akbdd0Eq8wW7oaAK6pUG8nxJc,3818
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
 datachain/query/dataset.py,sha256=MGArYxioeGvm8w7hQtQAjEI6wsZN_XAoh4-jO4d0U5Q,53926
@@ -103,10 +107,12 @@ datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7
 datachain/sql/sqlite/base.py,sha256=aHSZVvh4XSVkvZ07h3jMoRlHI4sWD8y3SnmGs9xMG9Y,14375
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
+datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
+datachain/toolkit/split.py,sha256=6FcEJgUsJsUcCqKW5aXuJy4DvbcQ7_dFbsfNPhn8EVg,2377
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.6.9.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.6.9.dist-info/METADATA,sha256=McKhuW43_7Q3iJKxueIYbk-rpYF6rbIKeFinzeeUzMo,18037
-datachain-0.6.9.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
-datachain-0.6.9.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.6.9.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.6.9.dist-info/RECORD,,
+datachain-0.6.10.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.6.10.dist-info/METADATA,sha256=AgQuuefAhZRIL1jDJWz-q4daqA5ZmnQN8dafqnt01XA,18038
+datachain-0.6.10.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+datachain-0.6.10.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.6.10.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.6.10.dist-info/RECORD,,

datachain/lib/models/yolo.py DELETED Viewed

@@ -1,39 +0,0 @@
-"""
-This module contains the YOLO models.
-YOLO stands for "You Only Look Once", a family of object detection models that
-are designed to be fast and accurate. The models are trained to detect objects
-in images by dividing the image into a grid and predicting the bounding boxes
-and class probabilities for each grid cell.
-More information about YOLO can be found here:
-- https://pjreddie.com/darknet/yolo/
-- https://docs.ultralytics.com/
-"""
-class PoseBodyPart:
-    """
-    An enumeration of body parts for YOLO pose keypoints.
-    More information about the body parts can be found here:
-    https://docs.ultralytics.com/tasks/pose/
-    """
-    nose = 0
-    left_eye = 1
-    right_eye = 2
-    left_ear = 3
-    right_ear = 4
-    left_shoulder = 5
-    right_shoulder = 6
-    left_elbow = 7
-    right_elbow = 8
-    left_wrist = 9
-    right_wrist = 10
-    left_hip = 11
-    right_hip = 12
-    left_knee = 13
-    right_knee = 14
-    left_ankle = 15
-    right_ankle = 16

{datachain-0.6.9.dist-info → datachain-0.6.10.dist-info}/LICENSE RENAMED Viewed

File without changes

{datachain-0.6.9.dist-info → datachain-0.6.10.dist-info}/WHEEL RENAMED Viewed

File without changes

{datachain-0.6.9.dist-info → datachain-0.6.10.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{datachain-0.6.9.dist-info → datachain-0.6.10.dist-info}/top_level.txt RENAMED Viewed

File without changes

datachain 0.6.9__py3-none-any.whl → 0.6.10__py3-none-any.whl

Potentially problematic release.

datachain 0.6.9__py3-none-any.whl → 0.6.10__py3-none-any.whl