datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
@@ -56,16 +56,16 @@ class YoloPose(DataModel):
         if not summary:
             return YoloPose(box=BBox(), pose=Pose3D())
         name = summary[0].get("name", "")
-        [10 lines removed; original content not captured in the source view]
+        if summary[0].get("box"):
+            assert isinstance(summary[0]["box"], dict)
+            box = BBox.from_dict(summary[0]["box"], title=name)
+        else:
+            box = BBox()
+        if summary[0].get("keypoints"):
+            assert isinstance(summary[0]["keypoints"], dict)
+            pose = Pose3D.from_dict(summary[0]["keypoints"])
+        else:
+            pose = Pose3D()
         return YoloPose(
             cls=summary[0]["class"],
             name=name,
@@ -102,8 +102,12 @@ class YoloPoses(DataModel):
             cls.append(s["class"])
             names.append(name)
             confidence.append(s["confidence"])
-            [2 lines removed; original content not captured in the source view]
+            if s.get("box"):
+                assert isinstance(s["box"], dict)
+                box.append(BBox.from_dict(s["box"], title=name))
+            if s.get("keypoints"):
+                assert isinstance(s["keypoints"], dict)
+                pose.append(Pose3D.from_dict(s["keypoints"]))
         return YoloPoses(
             cls=cls,
             name=names,

datachain/model/ultralytics/segment.py
CHANGED
@@ -34,16 +34,16 @@ class YoloSegment(DataModel):
         if not summary:
             return YoloSegment(box=BBox(), segment=Segment())
         name = summary[0].get("name", "")
-        [10 lines removed; original content not captured in the source view]
+        if summary[0].get("box"):
+            assert isinstance(summary[0]["box"], dict)
+            box = BBox.from_dict(summary[0]["box"], title=name)
+        else:
+            box = BBox()
+        if summary[0].get("segments"):
+            assert isinstance(summary[0]["segments"], dict)
+            segment = Segment.from_dict(summary[0]["segments"], title=name)
+        else:
+            segment = Segment()
         return YoloSegment(
             cls=summary[0]["class"],
             name=summary[0]["name"],
@@ -80,8 +80,12 @@ class YoloSegments(DataModel):
             cls.append(s["class"])
             names.append(name)
             confidence.append(s["confidence"])
-            [2 lines removed; original content not captured in the source view]
+            if s.get("box"):
+                assert isinstance(s["box"], dict)
+                box.append(BBox.from_dict(s["box"], title=name))
+            if s.get("segments"):
+                assert isinstance(s["segments"], dict)
+                segment.append(Segment.from_dict(s["segments"], title=name))
         return YoloSegments(
             cls=cls,
             name=names,
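
All four hunks apply the same fix: an Ultralytics result.summary() entry only carries a "box", "keypoints", or "segments" key when the model actually produced that output, so the new code guards each from_dict call and falls back to an empty model instead of assuming the key is present. A standalone sketch of the guard, using plain dicts in place of the DataChain BBox model (the helper name and sample data are illustrative, not package API):

# Mirrors the guard added above; `entry` mimics one item of result.summary().
def parse_box(entry: dict) -> dict:
    if entry.get("box"):  # key may be missing or empty
        assert isinstance(entry["box"], dict)
        return dict(entry["box"])  # stands in for BBox.from_dict(entry["box"], title=...)
    return {}  # stands in for the empty BBox() fallback

print(parse_box({"class": 0, "name": "person", "box": {"x1": 0.1, "y1": 0.2}}))
print(parse_box({"class": 0, "name": "person"}))  # no detection -> empty fallback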
datachain/namespace.py
ADDED
@@ -0,0 +1,84 @@
+import builtins
+from dataclasses import dataclass, fields
+from datetime import datetime
+from typing import Any, TypeVar
+
+from datachain.error import InvalidNamespaceNameError
+
+N = TypeVar("N", bound="Namespace")
+NAMESPACE_NAME_RESERVED_CHARS = [".", "@"]
+
+
+def parse_name(name: str) -> tuple[str, str | None]:
+    """
+    Parses namespace name into namespace and optional project name.
+    If both namespace and project are defined in name, they need to be split by dot
+    e.g dev.my-project
+    Valid inputs:
+    - dev.my-project
+    - dev
+    """
+    parts = name.split(".")
+    if len(parts) == 1:
+        return name, None
+    if len(parts) == 2:
+        return parts[0], parts[1]
+    raise InvalidNamespaceNameError(
+        f"Invalid namespace format: {name}. Expected 'namespace' or 'ns1.ns2'."
+    )
+
+
+@dataclass(frozen=True)
+class Namespace:
+    id: int
+    uuid: str
+    name: str
+    descr: str | None
+    created_at: datetime
+
+    @staticmethod
+    def validate_name(name: str) -> None:
+        """Throws exception if name is invalid, otherwise returns None"""
+        if not name:
+            raise InvalidNamespaceNameError("Namespace name cannot be empty")
+
+        for c in NAMESPACE_NAME_RESERVED_CHARS:
+            if c in name:
+                raise InvalidNamespaceNameError(
+                    f"Character {c} is reserved and not allowed in namespace name"
+                )
+
+        if name in [Namespace.default(), Namespace.system()]:
+            raise InvalidNamespaceNameError(
+                f"Namespace name {name} is reserved and cannot be used."
+            )
+
+    @staticmethod
+    def default() -> str:
+        """Name of default namespace"""
+        return "local"
+
+    @staticmethod
+    def system() -> str:
+        """Name of the system namespace"""
+        return "system"
+
+    @property
+    def is_system(self):
+        return self.name == Namespace.system()
+
+    @classmethod
+    def parse(
+        cls: builtins.type[N],
+        id: int,
+        uuid: str,
+        name: str,
+        descr: str | None,
+        created_at: datetime,
+    ) -> "Namespace":
+        return cls(id, uuid, name, descr, created_at)
+
+    @classmethod
+    def from_dict(cls, d: dict[str, Any]) -> "Namespace":
+        kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
+        return cls(**kwargs)
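
Taken together, parse_name and validate_name define the naming rules for the new namespace feature: at most one dot (separating namespace from project name), no reserved characters, and no reserved names. A quick usage sketch, assuming datachain 0.39.0 with the module exactly as shown above:

from datachain.error import InvalidNamespaceNameError
from datachain.namespace import Namespace, parse_name

assert parse_name("dev") == ("dev", None)  # namespace only
assert parse_name("dev.my-project") == ("dev", "my-project")  # namespace + project

# Each of these violates a rule: empty, reserved char, reserved name.
for bad in ("", "with.dot", "has@char", "local", "system"):
    try:
        Namespace.validate_name(bad)
    except InvalidNamespaceNameError as exc:
        print(f"{bad!r}: {exc}")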
datachain/node.py
CHANGED
@@ -1,6 +1,6 @@
 import os
 from datetime import datetime
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any

 import attrs

@@ -53,11 +53,11 @@ class Node:
     sys__rand: int = 0
     path: str = ""
     etag: str = ""
-    version: Optional[str] = None
+    version: str | None = None
     is_latest: bool = True
-    last_modified: Optional[datetime] = None
+    last_modified: datetime | None = None
     size: int = 0
-    location: Optional[str] = None
+    location: str | None = None
     source: StorageURI = StorageURI("")  # noqa: RUF009
     dir_type: int = DirType.FILE

@@ -90,7 +90,7 @@ class Node:
             return self.path + "/"
         return self.path

-    def to_file(self, source: Optional[StorageURI] = None) -> File:
+    def to_file(self, source: StorageURI | None = None) -> File:
         if source is None:
             source = self.source
         return File(
@@ -189,7 +189,7 @@ class NodeWithPath:
 TIME_FMT = "%Y-%m-%d %H:%M"


-def long_line_str(name: str, timestamp: Optional[datetime]) -> str:
+def long_line_str(name: str, timestamp: datetime | None) -> str:
     if timestamp is None:
         time = "-"
     else:
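
The node.py changes are a mechanical migration from typing.Optional to PEP 604 union syntax; nothing behavioral changes, since the two spellings compare equal at runtime on Python 3.10+:

from typing import Optional

# PEP 604 unions and typing.Optional are interchangeable on Python 3.10+.
assert (str | None) == Optional[str]
assert (datetime | None if (datetime := type("datetime", (), {})) else None)  # placeholder; any type works
assert (int | None) == Optional[int]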
datachain/nodes_thread_pool.py
CHANGED
datachain/plugins.py
ADDED
@@ -0,0 +1,24 @@
+"""Plugin loader for DataChain callables.
+
+Discovers and invokes entry points in the group "datachain.callables" once
+per process. This enables external packages (e.g., Studio) to register
+their callables with the serializer registry without explicit imports.
+"""
+
+from importlib import metadata as importlib_metadata
+
+_plugins_loaded = False
+
+
+def ensure_plugins_loaded() -> None:
+    global _plugins_loaded  # noqa: PLW0603
+    if _plugins_loaded:
+        return
+
+    # Compatible across importlib.metadata versions
+    eps_obj = importlib_metadata.entry_points()
+    for ep in eps_obj.select(group="datachain.callables"):
+        func = ep.load()
+        func()
+
+    _plugins_loaded = True
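
plugins.py is new infrastructure: any installed distribution that declares an entry point in the "datachain.callables" group gets its callable invoked once per process. A hypothetical plugin would hook in like this (the package name, module path, and function below are made up for illustration; only the group name comes from the code above):

# In the plugin's pyproject.toml:
#
#   [project.entry-points."datachain.callables"]
#   register = "my_plugin.hooks:register"

# my_plugin/hooks.py
def register() -> None:
    # Invoked by datachain.plugins.ensure_plugins_loaded() during discovery;
    # a real plugin would register its serializer callables here.
    print("my_plugin callables registered")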
datachain/project.py
ADDED
@@ -0,0 +1,78 @@
+import builtins
+from dataclasses import dataclass, fields
+from datetime import datetime
+from typing import Any, TypeVar
+
+from datachain.error import InvalidProjectNameError
+from datachain.namespace import Namespace
+
+P = TypeVar("P", bound="Project")
+PROJECT_NAME_RESERVED_CHARS = [".", "@"]
+
+
+@dataclass(frozen=True)
+class Project:
+    id: int
+    uuid: str
+    name: str
+    descr: str | None
+    created_at: datetime
+    namespace: Namespace
+
+    @staticmethod
+    def validate_name(name: str) -> None:
+        """Throws exception if name is invalid, otherwise returns None"""
+        if not name:
+            raise InvalidProjectNameError("Project name cannot be empty")
+
+        for c in PROJECT_NAME_RESERVED_CHARS:
+            if c in name:
+                raise InvalidProjectNameError(
+                    f"Character {c} is reserved and not allowed in project name."
+                )
+
+        if name in [Project.default(), Project.listing()]:
+            raise InvalidProjectNameError(
+                f"Project name {name} is reserved and cannot be used."
+            )
+
+    @staticmethod
+    def default() -> str:
+        """Name of default project"""
+        return "local"
+
+    @staticmethod
+    def listing() -> str:
+        """Name of listing project where all listing datasets will be saved"""
+        return "listing"
+
+    @classmethod
+    def parse(
+        cls: builtins.type[P],
+        namespace_id: int,
+        namespace_uuid: str,
+        namespace_name: str,
+        namespace_descr: str | None,
+        namespace_created_at: datetime,
+        project_id: int,
+        uuid: str,
+        name: str,
+        descr: str | None,
+        created_at: datetime,
+        project_namespace_id: int,
+    ) -> "Project":
+        namespace = Namespace.parse(
+            namespace_id,
+            namespace_uuid,
+            namespace_name,
+            namespace_descr,
+            namespace_created_at,
+        )
+
+        return cls(project_id, uuid, name, descr, created_at, namespace)
+
+    @classmethod
+    def from_dict(cls, d: dict[str, Any]) -> "Project":
+        namespace = Namespace.from_dict(d.pop("namespace"))
+        kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
+        return cls(**kwargs, namespace=namespace)
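
Project.from_dict expects the nested namespace as its own dict: it pops "namespace" out, builds the Namespace first, then matches the remaining keys against the dataclass fields. A round-trip sketch with made-up values (uuids and ids are placeholders):

from datetime import datetime, timezone

from datachain.project import Project

now = datetime.now(timezone.utc)
project = Project.from_dict(
    {
        "id": 1,
        "uuid": "aaaa-1111",  # placeholder
        "name": "my-project",
        "descr": None,
        "created_at": now,
        "namespace": {
            "id": 2,
            "uuid": "bbbb-2222",  # placeholder
            "name": "dev",
            "descr": None,
            "created_at": now,
        },
    }
)
assert project.name == "my-project"
assert project.namespace.name == "dev"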
datachain/query/batch.py
CHANGED
@@ -1,24 +1,14 @@
 import contextlib
 import math
 from abc import ABC, abstractmethod
-from collections.abc import Generator, Sequence
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Callable, Optional, Union
-
-from datachain.data_storage.schema import PARTITION_COLUMN_ID
-from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
-from datachain.query.utils import get_query_column, get_query_id_column
-
-if TYPE_CHECKING:
-    from sqlalchemy import Select
+from collections.abc import Callable, Generator, Sequence

+import sqlalchemy as sa

-@dataclass
-class RowsOutputBatch:
-    rows: Sequence[Sequence]
-
+from datachain.data_storage.schema import PARTITION_COLUMN_ID

-RowsOutput = Union[Sequence, RowsOutputBatch]
+RowsOutputBatch = Sequence[Sequence]
+RowsOutput = Sequence | RowsOutputBatch


 class BatchingStrategy(ABC):
@@ -30,8 +20,8 @@ class BatchingStrategy(ABC):
     def __call__(
         self,
         execute: Callable,
-        query: "Select",
-        ids_only: bool = False,
+        query: sa.Select,
+        id_col: sa.ColumnElement | None = None,
     ) -> Generator[RowsOutput, None, None]:
         """Apply the provided parameters to the UDF."""

@@ -47,12 +37,16 @@ class NoBatching(BatchingStrategy):
     def __call__(
         self,
         execute: Callable,
-        query: "Select",
-        ids_only: bool = False,
+        query: sa.Select,
+        id_col: sa.ColumnElement | None = None,
     ) -> Generator[Sequence, None, None]:
-        if ids_only:
-            query = query.with_only_columns(get_query_id_column(query))
-        yield from execute(query)
+        ids_only = False
+        if id_col is not None:
+            query = query.with_only_columns(id_col)
+            ids_only = True
+
+        rows = execute(query)
+        yield from (r[0] for r in rows) if ids_only else rows


 class Batch(BatchingStrategy):
@@ -69,27 +63,31 @@ class Batch(BatchingStrategy):
     def __call__(
         self,
         execute: Callable,
-        query: "Select",
-        ids_only: bool = False,
-    ) -> Generator[RowsOutputBatch, None, None]:
-        if ids_only:
-            query = query.with_only_columns(get_query_id_column(query))
+        query: sa.Select,
+        id_col: sa.ColumnElement | None = None,
+    ) -> Generator[RowsOutput, None, None]:
+        from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
+
+        ids_only = False
+        if id_col is not None:
+            query = query.with_only_columns(id_col)
+            ids_only = True

         # choose page size that is a multiple of the batch size
         page_size = math.ceil(SELECT_BATCH_SIZE / self.count) * self.count

         # select rows in batches
-        results: list[Sequence] = []
+        results = []

         with contextlib.closing(execute(query, page_size=page_size)) as rows:
             for row in rows:
                 results.append(row)
                 if len(results) >= self.count:
                     batch, results = results[: self.count], results[self.count :]
-                    yield RowsOutputBatch(batch)
+                    yield [r[0] for r in batch] if ids_only else batch

         if len(results) > 0:
-            yield RowsOutputBatch(results)
+            yield [r[0] for r in results] if ids_only else results


 class Partition(BatchingStrategy):
@@ -104,18 +102,19 @@ class Partition(BatchingStrategy):
     def __call__(
         self,
         execute: Callable,
-        query: "Select",
-        ids_only: bool = False,
-    ) -> Generator[RowsOutputBatch, None, None]:
-        id_col = get_query_id_column(query)
-        if (partition_col := get_query_column(query, PARTITION_COLUMN_ID)) is None:
+        query: sa.Select,
+        id_col: sa.ColumnElement | None = None,
+    ) -> Generator[RowsOutput, None, None]:
+        if (partition_col := query.selected_columns.get(PARTITION_COLUMN_ID)) is None:
             raise RuntimeError("partition column not found in query")

-        if ids_only:
+        ids_only = False
+        if id_col is not None:
             query = query.with_only_columns(id_col, partition_col)
+            ids_only = True

-        current_partition: Optional[int] = None
-        batch: list[Sequence] = []
+        current_partition: int | None = None
+        batch: list = []

         query_fields = [str(c.name) for c in query.selected_columns]
         id_column_idx = query_fields.index("sys__id")
@@ -132,9 +131,9 @@ class Partition(BatchingStrategy):
             if current_partition != partition:
                 current_partition = partition
                 if len(batch) > 0:
-                    yield RowsOutputBatch(batch)
+                    yield batch
                 batch = []
-            batch.append(row)
+            batch.append(row[id_column_idx] if ids_only else row)

         if len(batch) > 0:
-            yield RowsOutputBatch(batch)
+            yield batch
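
The reworked strategies also fold id extraction into batching itself: when id_col is passed, each strategy narrows the query to that column and yields bare ids instead of full rows, which is what replaces the old RowsOutputBatch wrapper and ids_only flag. The core regrouping loop of Batch reduces to the following standalone sketch (not the DataChain class itself; SELECT_BATCH_SIZE stands in for the warehouse constant):

import contextlib
import math
from collections.abc import Iterator, Sequence

SELECT_BATCH_SIZE = 10_000  # stand-in for datachain.data_storage.warehouse

def batched(execute, query, count: int) -> Iterator[Sequence]:
    # Page size is rounded up to a multiple of the batch size so a fetch
    # boundary never splits a batch.
    page_size = math.ceil(SELECT_BATCH_SIZE / count) * count
    results: list = []
    with contextlib.closing(execute(query, page_size=page_size)) as rows:
        for row in rows:
            results.append(row)
            if len(results) >= count:
                batch, results = results[:count], results[count:]
                yield batch
    if results:
        yield results

def fake_rows(query, page_size):
    # Stands in for a warehouse cursor; a generator, so it has .close().
    yield from ((i,) for i in range(7))

print(list(batched(fake_rows, None, 3)))
# [[(0,), (1,), (2,)], [(3,), (4,), (5,)], [(6,)]]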