datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
from datachain.error import ProjectCreateNotAllowedError, ProjectDeleteNotAllowedError
|
|
2
|
+
from datachain.project import Project
|
|
3
|
+
from datachain.query import Session
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def create(
    namespace: str,
    name: str,
    descr: str | None = None,
    session: Session | None = None,
) -> Project:
    """Create a new project inside the given namespace.

    Projects help organize datasets. A default project is always available,
    but users can create additional ones (only in Studio, not via CLI).

    Parameters:
        name: Name of the new project.
        namespace: Namespace to create the project in. Created if it doesn't exist.
        descr: Optional description of the project.
        session: Optional session to use for the operation.

    Example:
        ```py
        import datachain as dc
        project = dc.create_project("dev", "my-project", "My personal project")
        ```
    """
    session = Session.get(session)

    # NOTE(review): imported locally rather than at module top — presumably to
    # avoid an import cycle; confirm before moving it.
    from datachain.lib.dc.utils import is_studio

    if not is_studio():
        raise ProjectCreateNotAllowedError("Creating project is not allowed")

    Project.validate_name(name)
    metastore = session.catalog.metastore
    return metastore.create_project(namespace, name, descr)
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get(name: str, namespace: str, session: Session | None = None) -> Project:
    """
    Gets a project by name in some namespace.
    If the project is not found, a `ProjectNotFoundError` is raised.

    Parameters:
        name : The name of the project.
        namespace : The name of the namespace.
        session : Optional session to use for getting project.

    Example:
        ```py
        import datachain as dc
        from datachain.lib.projects import get as get_project
        project = get_project("my-project", "local")
        ```
    """
    # `session` now defaults to None, consistent with create/ls/delete.
    # Previously it was required, so the docstring example above (called
    # with only two arguments) raised a TypeError.
    return Session.get(session).catalog.metastore.get_project(name, namespace)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def ls(namespace: str | None = None, session: Session | None = None) -> list[Project]:
    """List projects, either within a specific namespace or across all namespaces.

    Parameters:
        namespace : An optional namespace name.
        session : Session to use for getting project.

    Example:
        ```py
        import datachain as dc
        from datachain.lib.projects import ls as ls_projects
        local_namespace_projects = ls_projects("local")
        all_projects = ls_projects()
        ```
    """
    metastore = Session.get(session).catalog.metastore
    # A falsy namespace (None or "") means "all namespaces".
    ns_id = metastore.get_namespace(namespace).id if namespace else None
    return metastore.list_projects(ns_id)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def delete(name: str, namespace: str, session: Session | None = None) -> None:
    """Remove the project called ``name`` from ``namespace``.

    Raises:
        ProjectNotFoundError: If the project does not exist.
        ProjectDeleteNotAllowedError: If the project is non-empty,
            is the default project, or is a listing project,
            as these cannot be removed.

    Parameters:
        name : The name of the project.
        namespace : The name of the namespace.
        session : Session to use for getting project.

    Example:
        ```py
        import datachain as dc
        dc.delete_project("my-project", "local")
        ```
    """
    metastore = Session.get(session).catalog.metastore

    # Raises ProjectNotFoundError if the project does not exist.
    project = metastore.get_project(name, namespace)

    if metastore.is_listing_project(name, namespace):
        raise ProjectDeleteNotAllowedError(
            f"Project {metastore.listing_project_name} cannot be removed"
        )

    if metastore.is_default_project(name, namespace):
        raise ProjectDeleteNotAllowedError(
            f"Project {metastore.default_project_name} cannot be removed"
        )

    dataset_count = metastore.count_datasets(project.id)
    if dataset_count > 0:
        raise ProjectDeleteNotAllowedError(
            f"Project cannot be removed. It contains {dataset_count} dataset(s). "
            "Please remove the dataset(s) first."
        )

    metastore.remove_project(project.id)
|
datachain/lib/pytorch.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import os
|
|
3
3
|
import weakref
|
|
4
|
-
from collections.abc import Generator, Iterable, Iterator
|
|
4
|
+
from collections.abc import Callable, Generator, Iterable, Iterator
|
|
5
5
|
from contextlib import closing
|
|
6
|
-
from typing import TYPE_CHECKING, Any
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
7
7
|
|
|
8
8
|
from PIL import Image
|
|
9
9
|
from torch import float32
|
|
@@ -43,13 +43,13 @@ class PytorchDataset(IterableDataset):
|
|
|
43
43
|
def __init__(
|
|
44
44
|
self,
|
|
45
45
|
name: str,
|
|
46
|
-
version:
|
|
47
|
-
catalog:
|
|
48
|
-
transform:
|
|
49
|
-
tokenizer:
|
|
50
|
-
tokenizer_kwargs:
|
|
46
|
+
version: str | None = None,
|
|
47
|
+
catalog: Catalog | None = None,
|
|
48
|
+
transform: "Transform | None" = None,
|
|
49
|
+
tokenizer: Callable | None = None,
|
|
50
|
+
tokenizer_kwargs: dict[str, Any] | None = None,
|
|
51
51
|
num_samples: int = 0,
|
|
52
|
-
dc_settings:
|
|
52
|
+
dc_settings: Settings | None = None,
|
|
53
53
|
remove_prefetched: bool = False,
|
|
54
54
|
):
|
|
55
55
|
"""
|
|
@@ -60,7 +60,7 @@ class PytorchDataset(IterableDataset):
|
|
|
60
60
|
|
|
61
61
|
Args:
|
|
62
62
|
name (str): Name of DataChain dataset to stream.
|
|
63
|
-
version (
|
|
63
|
+
version (str): Version of DataChain dataset to stream.
|
|
64
64
|
catalog (Catalog): DataChain catalog to which dataset belongs.
|
|
65
65
|
transform (Transform): Torchvision transforms to apply to the dataset.
|
|
66
66
|
tokenizer (Callable): Tokenizer to use to tokenize text values.
|
|
@@ -74,6 +74,7 @@ class PytorchDataset(IterableDataset):
|
|
|
74
74
|
self.tokenizer = tokenizer
|
|
75
75
|
self.tokenizer_kwargs = tokenizer_kwargs or {}
|
|
76
76
|
self.num_samples = num_samples
|
|
77
|
+
owns_catalog = catalog is None
|
|
77
78
|
if catalog is None:
|
|
78
79
|
catalog = get_catalog()
|
|
79
80
|
self._init_catalog(catalog)
|
|
@@ -84,7 +85,7 @@ class PytorchDataset(IterableDataset):
|
|
|
84
85
|
self.prefetch = prefetch
|
|
85
86
|
|
|
86
87
|
self._cache = catalog.cache
|
|
87
|
-
self._prefetch_cache:
|
|
88
|
+
self._prefetch_cache: Cache | None = None
|
|
88
89
|
self._remove_prefetched = remove_prefetched
|
|
89
90
|
if prefetch and not self.cache:
|
|
90
91
|
tmp_dir = catalog.cache.tmp_dir
|
|
@@ -93,6 +94,10 @@ class PytorchDataset(IterableDataset):
|
|
|
93
94
|
self._cache = self._prefetch_cache
|
|
94
95
|
weakref.finalize(self, self._prefetch_cache.destroy)
|
|
95
96
|
|
|
97
|
+
# Close the catalog if we created it - we only needed it for clone params
|
|
98
|
+
if owns_catalog:
|
|
99
|
+
catalog.close()
|
|
100
|
+
|
|
96
101
|
def close(self) -> None:
|
|
97
102
|
if self._prefetch_cache:
|
|
98
103
|
self._prefetch_cache.destroy()
|
|
@@ -104,7 +109,7 @@ class PytorchDataset(IterableDataset):
|
|
|
104
109
|
self._ms_params = catalog.metastore.clone_params()
|
|
105
110
|
self._wh_params = catalog.warehouse.clone_params()
|
|
106
111
|
self._catalog_params = catalog.get_init_params()
|
|
107
|
-
self.catalog:
|
|
112
|
+
self.catalog: Catalog | None = None
|
|
108
113
|
|
|
109
114
|
def _get_catalog(self) -> "Catalog":
|
|
110
115
|
ms_cls, ms_args, ms_kwargs = self._ms_params
|
|
@@ -121,16 +126,22 @@ class PytorchDataset(IterableDataset):
|
|
|
121
126
|
total_workers: int,
|
|
122
127
|
) -> Generator[tuple[Any, ...], None, None]:
|
|
123
128
|
catalog = self._get_catalog()
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
129
|
+
try:
|
|
130
|
+
session = Session("PyTorch", catalog=catalog)
|
|
131
|
+
ds = read_dataset(
|
|
132
|
+
name=self.name, version=self.version, session=session
|
|
133
|
+
).settings(cache=self.cache, prefetch=self.prefetch)
|
|
134
|
+
|
|
135
|
+
# remove file signals from dataset
|
|
136
|
+
schema = ds.signals_schema.clone_without_file_signals()
|
|
137
|
+
ds = ds.select(*schema.values.keys())
|
|
138
|
+
|
|
139
|
+
if self.num_samples > 0:
|
|
140
|
+
ds = ds.sample(self.num_samples)
|
|
141
|
+
ds = ds.chunk(total_rank, total_workers)
|
|
142
|
+
yield from ds.to_iter()
|
|
143
|
+
finally:
|
|
144
|
+
catalog.close()
|
|
134
145
|
|
|
135
146
|
def _iter_with_prefetch(self) -> Generator[tuple[Any], None, None]:
|
|
136
147
|
from datachain.lib.udf import _prefetch_inputs
|
datachain/lib/settings.py
CHANGED
|
@@ -1,78 +1,214 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
1
3
|
from datachain.lib.utils import DataChainParamsError
|
|
2
4
|
|
|
5
|
+
# Fallback values used by the corresponding ``Settings`` properties when the
# field was not explicitly set (``None``).
DEFAULT_CACHE = False
DEFAULT_PREFETCH = 2
DEFAULT_BATCH_SIZE = 2_000
|
|
8
|
+
|
|
3
9
|
|
|
4
10
|
class SettingsError(DataChainParamsError):
    """Raised when a value passed to ``Settings`` fails validation."""

    def __init__(self, msg: str) -> None:
        message = f"Dataset settings error: {msg}"
        super().__init__(message)
|
|
7
13
|
|
|
8
14
|
|
|
9
15
|
class Settings:
|
|
10
|
-
|
|
16
|
+
"""Settings for datachain."""
|
|
17
|
+
|
|
18
|
+
_cache: bool | None
|
|
19
|
+
_prefetch: int | None
|
|
20
|
+
_parallel: bool | int | None
|
|
21
|
+
_workers: int | None
|
|
22
|
+
_namespace: str | None
|
|
23
|
+
_project: str | None
|
|
24
|
+
_min_task_size: int | None
|
|
25
|
+
_batch_size: int | None
|
|
26
|
+
|
|
27
|
+
def __init__( # noqa: C901, PLR0912
|
|
11
28
|
self,
|
|
12
|
-
cache=None,
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
)
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
29
|
+
cache: bool | None = None,
|
|
30
|
+
prefetch: bool | int | None = None,
|
|
31
|
+
parallel: bool | int | None = None,
|
|
32
|
+
workers: int | None = None,
|
|
33
|
+
namespace: str | None = None,
|
|
34
|
+
project: str | None = None,
|
|
35
|
+
min_task_size: int | None = None,
|
|
36
|
+
batch_size: int | None = None,
|
|
37
|
+
) -> None:
|
|
38
|
+
if cache is None:
|
|
39
|
+
self._cache = None
|
|
40
|
+
else:
|
|
41
|
+
if not isinstance(cache, bool):
|
|
42
|
+
raise SettingsError(
|
|
43
|
+
"'cache' argument must be bool"
|
|
44
|
+
f" while {cache.__class__.__name__} was given"
|
|
45
|
+
)
|
|
46
|
+
self._cache = cache
|
|
47
|
+
|
|
48
|
+
if prefetch is None or prefetch is True:
|
|
49
|
+
self._prefetch = None
|
|
50
|
+
elif prefetch is False:
|
|
51
|
+
self._prefetch = 0 # disable prefetch (False == 0)
|
|
52
|
+
else:
|
|
53
|
+
if not isinstance(prefetch, int):
|
|
54
|
+
raise SettingsError(
|
|
55
|
+
"'prefetch' argument must be int or bool"
|
|
56
|
+
f" while {prefetch.__class__.__name__} was given"
|
|
57
|
+
)
|
|
58
|
+
if prefetch < 0:
|
|
59
|
+
raise SettingsError(
|
|
60
|
+
"'prefetch' argument must be non-negative integer"
|
|
61
|
+
f", {prefetch} was given"
|
|
62
|
+
)
|
|
63
|
+
self._prefetch = prefetch
|
|
64
|
+
|
|
65
|
+
if parallel is None or parallel is False:
|
|
66
|
+
self._parallel = None
|
|
67
|
+
elif parallel is True:
|
|
68
|
+
self._parallel = True
|
|
69
|
+
else:
|
|
70
|
+
if not isinstance(parallel, int):
|
|
71
|
+
raise SettingsError(
|
|
72
|
+
"'parallel' argument must be int or bool"
|
|
73
|
+
f" while {parallel.__class__.__name__} was given"
|
|
74
|
+
)
|
|
75
|
+
if parallel <= 0:
|
|
76
|
+
raise SettingsError(
|
|
77
|
+
"'parallel' argument must be positive integer"
|
|
78
|
+
f", {parallel} was given"
|
|
79
|
+
)
|
|
80
|
+
self._parallel = parallel
|
|
81
|
+
|
|
82
|
+
if workers is None:
|
|
83
|
+
self._workers = None
|
|
84
|
+
else:
|
|
85
|
+
if not isinstance(workers, int) or isinstance(workers, bool):
|
|
86
|
+
raise SettingsError(
|
|
87
|
+
"'workers' argument must be int"
|
|
88
|
+
f" while {workers.__class__.__name__} was given"
|
|
89
|
+
)
|
|
90
|
+
if workers <= 0:
|
|
91
|
+
raise SettingsError(
|
|
92
|
+
f"'workers' argument must be positive integer, {workers} was given"
|
|
93
|
+
)
|
|
94
|
+
self._workers = workers
|
|
95
|
+
|
|
96
|
+
if namespace is None:
|
|
97
|
+
self._namespace = None
|
|
98
|
+
else:
|
|
99
|
+
if not isinstance(namespace, str):
|
|
100
|
+
raise SettingsError(
|
|
101
|
+
"'namespace' argument must be str"
|
|
102
|
+
f", {namespace.__class__.__name__} was given"
|
|
103
|
+
)
|
|
104
|
+
self._namespace = namespace
|
|
105
|
+
|
|
106
|
+
if project is None:
|
|
107
|
+
self._project = None
|
|
108
|
+
else:
|
|
109
|
+
if not isinstance(project, str):
|
|
110
|
+
raise SettingsError(
|
|
111
|
+
"'project' argument must be str"
|
|
112
|
+
f", {project.__class__.__name__} was given"
|
|
113
|
+
)
|
|
114
|
+
self._project = project
|
|
115
|
+
|
|
116
|
+
if min_task_size is None:
|
|
117
|
+
self._min_task_size = None
|
|
118
|
+
else:
|
|
119
|
+
if not isinstance(min_task_size, int) or isinstance(min_task_size, bool):
|
|
120
|
+
raise SettingsError(
|
|
121
|
+
"'min_task_size' argument must be int"
|
|
122
|
+
f", {min_task_size.__class__.__name__} was given"
|
|
123
|
+
)
|
|
124
|
+
if min_task_size <= 0:
|
|
125
|
+
raise SettingsError(
|
|
126
|
+
"'min_task_size' argument must be positive integer"
|
|
127
|
+
f", {min_task_size} was given"
|
|
128
|
+
)
|
|
129
|
+
self._min_task_size = min_task_size
|
|
130
|
+
|
|
131
|
+
if batch_size is None:
|
|
132
|
+
self._batch_size = None
|
|
133
|
+
else:
|
|
134
|
+
if not isinstance(batch_size, int) or isinstance(batch_size, bool):
|
|
135
|
+
raise SettingsError(
|
|
136
|
+
"'batch_size' argument must be int"
|
|
137
|
+
f", {batch_size.__class__.__name__} was given"
|
|
138
|
+
)
|
|
139
|
+
if batch_size <= 0:
|
|
140
|
+
raise SettingsError(
|
|
141
|
+
"'batch_size' argument must be positive integer"
|
|
142
|
+
f", {batch_size} was given"
|
|
143
|
+
)
|
|
144
|
+
self._batch_size = batch_size
|
|
145
|
+
|
|
146
|
+
@property
|
|
147
|
+
def cache(self) -> bool:
|
|
148
|
+
return self._cache if self._cache is not None else DEFAULT_CACHE
|
|
149
|
+
|
|
150
|
+
@property
|
|
151
|
+
def prefetch(self) -> int | None:
|
|
152
|
+
return self._prefetch if self._prefetch is not None else DEFAULT_PREFETCH
|
|
153
|
+
|
|
154
|
+
@property
|
|
155
|
+
def parallel(self) -> bool | int | None:
|
|
156
|
+
return self._parallel if self._parallel is not None else None
|
|
157
|
+
|
|
158
|
+
@property
|
|
159
|
+
def workers(self) -> int | None:
|
|
160
|
+
return self._workers if self._workers is not None else None
|
|
161
|
+
|
|
162
|
+
@property
|
|
163
|
+
def namespace(self) -> str | None:
|
|
164
|
+
return self._namespace if self._namespace is not None else None
|
|
165
|
+
|
|
166
|
+
@property
|
|
167
|
+
def project(self) -> str | None:
|
|
168
|
+
return self._project if self._project is not None else None
|
|
51
169
|
|
|
52
170
|
@property
|
|
53
|
-
def
|
|
54
|
-
return self.
|
|
171
|
+
def min_task_size(self) -> int | None:
|
|
172
|
+
return self._min_task_size if self._min_task_size is not None else None
|
|
55
173
|
|
|
56
174
|
@property
|
|
57
|
-
def
|
|
58
|
-
return self.
|
|
175
|
+
def batch_size(self) -> int:
|
|
176
|
+
return self._batch_size if self._batch_size is not None else DEFAULT_BATCH_SIZE
|
|
59
177
|
|
|
60
|
-
def to_dict(self):
|
|
61
|
-
res = {}
|
|
178
|
+
def to_dict(self) -> dict[str, Any]:
|
|
179
|
+
res: dict[str, Any] = {}
|
|
62
180
|
if self._cache is not None:
|
|
63
181
|
res["cache"] = self.cache
|
|
64
|
-
if self.
|
|
182
|
+
if self._prefetch is not None:
|
|
183
|
+
res["prefetch"] = self.prefetch
|
|
184
|
+
if self._parallel is not None:
|
|
65
185
|
res["parallel"] = self.parallel
|
|
66
186
|
if self._workers is not None:
|
|
67
187
|
res["workers"] = self.workers
|
|
68
|
-
if self.
|
|
188
|
+
if self._min_task_size is not None:
|
|
69
189
|
res["min_task_size"] = self.min_task_size
|
|
190
|
+
if self._namespace is not None:
|
|
191
|
+
res["namespace"] = self.namespace
|
|
192
|
+
if self._project is not None:
|
|
193
|
+
res["project"] = self.project
|
|
194
|
+
if self._batch_size is not None:
|
|
195
|
+
res["batch_size"] = self.batch_size
|
|
70
196
|
return res
|
|
71
197
|
|
|
72
|
-
def add(self, settings: "Settings"):
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
if settings.
|
|
78
|
-
self.
|
|
198
|
+
def add(self, settings: "Settings") -> None:
|
|
199
|
+
if settings._cache is not None:
|
|
200
|
+
self._cache = settings._cache
|
|
201
|
+
if settings._prefetch is not None:
|
|
202
|
+
self._prefetch = settings._prefetch
|
|
203
|
+
if settings._parallel is not None:
|
|
204
|
+
self._parallel = settings._parallel
|
|
205
|
+
if settings._workers is not None:
|
|
206
|
+
self._workers = settings._workers
|
|
207
|
+
if settings._namespace is not None:
|
|
208
|
+
self._namespace = settings._namespace
|
|
209
|
+
if settings._project is not None:
|
|
210
|
+
self._project = settings._project
|
|
211
|
+
if settings._min_task_size is not None:
|
|
212
|
+
self._min_task_size = settings._min_task_size
|
|
213
|
+
if settings._batch_size is not None:
|
|
214
|
+
self._batch_size = settings._batch_size
|