datachain 0.21.1__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic; review the change details below.

Files changed (48):
  1. datachain/__init__.py +2 -0
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +180 -65
  4. datachain/cli/__init__.py +0 -7
  5. datachain/cli/commands/datasets.py +43 -28
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +1 -35
  8. datachain/client/fsspec.py +5 -3
  9. datachain/client/hf.py +10 -0
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +422 -37
  12. datachain/data_storage/sqlite.py +136 -7
  13. datachain/data_storage/warehouse.py +26 -7
  14. datachain/dataset.py +126 -12
  15. datachain/delta.py +11 -7
  16. datachain/error.py +36 -0
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +4 -0
  20. datachain/lib/dc/datachain.py +260 -92
  21. datachain/lib/dc/datasets.py +104 -50
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +1 -0
  24. datachain/lib/dc/storage.py +38 -40
  25. datachain/lib/file.py +77 -23
  26. datachain/lib/listing.py +3 -1
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/namespaces.py +71 -0
  29. datachain/lib/projects.py +86 -0
  30. datachain/lib/pytorch.py +1 -1
  31. datachain/lib/settings.py +10 -0
  32. datachain/lib/tar.py +1 -2
  33. datachain/lib/udf.py +1 -1
  34. datachain/lib/udf_signature.py +1 -1
  35. datachain/lib/webdataset.py +30 -20
  36. datachain/listing.py +3 -1
  37. datachain/namespace.py +65 -0
  38. datachain/project.py +78 -0
  39. datachain/query/dataset.py +71 -46
  40. datachain/query/session.py +1 -1
  41. datachain/remote/studio.py +61 -26
  42. datachain/studio.py +23 -6
  43. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/METADATA +2 -2
  44. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/RECORD +48 -44
  45. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/WHEEL +0 -0
  46. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/entry_points.txt +0 -0
  47. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/licenses/LICENSE +0 -0
  48. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,71 @@
1
+ from typing import Optional
2
+
3
+ from datachain.error import NamespaceCreateNotAllowedError
4
+ from datachain.namespace import Namespace
5
+ from datachain.query import Session
6
+
7
+
8
def create(
    name: str, descr: Optional[str] = None, session: Optional[Session] = None
) -> Namespace:
    """
    Create and return a new namespace.

    Namespaces group projects, which in turn group datasets. A default
    namespace always exists and is used when none is specified. Additional
    namespaces can be created in Studio only; the CLI exposes just the
    default one.

    Parameters:
        name: Name of the namespace to create.
        descr: Optional human-readable description of the namespace.
        session: Optional session to run the operation in.

    Example:
        ```py
        from datachain.lib.namespaces import create as create_namespace
        namespace = create_namespace("dev", "Dev namespace")
        ```
    """
    metastore = Session.get(session).catalog.metastore

    # The metastore decides whether namespace creation is permitted at all
    # (e.g. only allowed in Studio, not locally).
    if not metastore.namespace_allowed_to_create:
        raise NamespaceCreateNotAllowedError("Creating namespace is not allowed")

    Namespace.validate_name(name)
    return metastore.create_namespace(name, descr)
37
+
38
+
39
def get(name: str, session: Optional[Session] = None) -> Namespace:
    """
    Return the namespace with the given name.

    Raises `NamespaceNotFoundError` if no namespace with that name exists.

    Parameters:
        name: The name of the namespace to look up.
        session: Optional session to use for the lookup.

    Example:
        ```py
        import datachain as dc
        namespace = dc.get_namespace("local")
        ```
    """
    metastore = Session.get(session).catalog.metastore
    return metastore.get_namespace(name)
56
+
57
+
58
def ls(session: Optional[Session] = None) -> list[Namespace]:
    """
    Return a list of all namespaces.

    Parameters:
        session: Optional session to use for the lookup.

    Example:
        ```py
        from datachain.lib.namespaces import ls as ls_namespaces
        namespaces = ls_namespaces()
        ```
    """
    metastore = Session.get(session).catalog.metastore
    return metastore.list_namespaces()
@@ -0,0 +1,86 @@
1
+ from typing import Optional
2
+
3
+ from datachain.error import ProjectCreateNotAllowedError
4
+ from datachain.project import Project
5
+ from datachain.query import Session
6
+
7
+
8
def create(
    namespace: str,
    name: str,
    descr: Optional[str] = None,
    session: Optional[Session] = None,
) -> Project:
    """
    Create and return a new project inside a namespace.

    Projects organize datasets. A default project is always available;
    additional projects can be created in Studio only, not via the CLI.

    Parameters:
        namespace: Namespace to create the project in. Created if it
            doesn't exist.
        name: Name of the project to create.
        descr: Optional human-readable description of the project.
        session: Optional session to run the operation in.

    Example:
        ```py
        import datachain as dc
        project = dc.create_project("dev", "my-project", "My personal project")
        ```
    """
    metastore = Session.get(session).catalog.metastore

    # The metastore decides whether project creation is permitted at all
    # (e.g. only allowed in Studio, not locally).
    if not metastore.project_allowed_to_create:
        raise ProjectCreateNotAllowedError("Creating project is not allowed")

    Project.validate_name(name)
    return metastore.create_project(namespace, name, descr)
41
+
42
+
43
def get(name: str, namespace: str, session: Optional[Session] = None) -> Project:
    """
    Gets a project by name in some namespace.

    If the project is not found, a `ProjectNotFoundError` is raised.

    Parameters:
        name: The name of the project.
        namespace: The name of the namespace the project belongs to.
        session: Optional session to use for getting the project.

    Example:
        ```py
        from datachain.lib.projects import get as get_project
        project = get_project("my-project", "local")
        ```
    """
    # Default `session` to None (fix: it previously had no default), making the
    # signature consistent with create()/ls() so callers are not forced to pass
    # an explicit session just to look a project up.
    return Session.get(session).catalog.metastore.get_project(name, namespace)
61
+
62
+
63
def ls(
    namespace: Optional[str] = None, session: Optional[Session] = None
) -> list[Project]:
    """
    List projects, either within one namespace or across all of them.

    Parameters:
        namespace: Optional namespace name; when omitted, projects from
            every namespace are returned.
        session: Optional session to use for the lookup.

    Example:
        ```py
        from datachain.lib.projects import ls as ls_projects
        local_namespace_projects = ls_projects("local")
        all_projects = ls_projects()
        ```
    """
    metastore = Session.get(session).catalog.metastore
    if namespace:
        # Resolve the namespace name to its id; raises if it doesn't exist.
        return metastore.list_projects(metastore.get_namespace(namespace).id)
    return metastore.list_projects(None)
datachain/lib/pytorch.py CHANGED
@@ -130,7 +130,7 @@ class PytorchDataset(IterableDataset):
130
130
  if self.num_samples > 0:
131
131
  ds = ds.sample(self.num_samples)
132
132
  ds = ds.chunk(total_rank, total_workers)
133
- yield from ds.collect()
133
+ yield from ds.to_iter()
134
134
 
135
135
  def _iter_with_prefetch(self) -> Generator[tuple[Any], None, None]:
136
136
  from datachain.lib.udf import _prefetch_inputs
datachain/lib/settings.py CHANGED
@@ -14,12 +14,16 @@ class Settings:
14
14
  workers=None,
15
15
  min_task_size=None,
16
16
  prefetch=None,
17
+ namespace=None,
18
+ project=None,
17
19
  ):
18
20
  self._cache = cache
19
21
  self.parallel = parallel
20
22
  self._workers = workers
21
23
  self.min_task_size = min_task_size
22
24
  self.prefetch = prefetch
25
+ self.namespace = namespace
26
+ self.project = project
23
27
 
24
28
  if not isinstance(cache, bool) and cache is not None:
25
29
  raise SettingsError(
@@ -67,6 +71,10 @@ class Settings:
67
71
  res["workers"] = self.workers
68
72
  if self.min_task_size is not None:
69
73
  res["min_task_size"] = self.min_task_size
74
+ if self.namespace is not None:
75
+ res["namespace"] = self.namespace
76
+ if self.project is not None:
77
+ res["project"] = self.project
70
78
  return res
71
79
 
72
80
  def add(self, settings: "Settings"):
@@ -74,5 +82,7 @@ class Settings:
74
82
  self.parallel = settings.parallel or self.parallel
75
83
  self._workers = settings._workers or self._workers
76
84
  self.min_task_size = settings.min_task_size or self.min_task_size
85
+ self.namespace = settings.namespace or self.namespace
86
+ self.project = settings.project or self.project
77
87
  if settings.prefetch is not None:
78
88
  self.prefetch = settings.prefetch
datachain/lib/tar.py CHANGED
@@ -6,12 +6,11 @@ from datachain.lib.file import File, TarVFile
6
6
 
7
7
 
8
8
  def build_tar_member(parent: File, info: tarfile.TarInfo) -> File:
9
- new_parent = parent.get_full_name()
10
9
  etag_string = "-".join([parent.etag, info.name, str(info.mtime)])
11
10
  etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
12
11
  return File(
13
12
  source=parent.source,
14
- path=f"{new_parent}/{info.name}",
13
+ path=f"{parent.path}/{info.name}",
15
14
  version=parent.version,
16
15
  size=info.size,
17
16
  etag=etag,
datachain/lib/udf.py CHANGED
@@ -309,7 +309,7 @@ async def _prefetch_input(
309
309
  after_prefetch: "Callable[[], None]" = noop,
310
310
  ) -> T:
311
311
  for obj in row:
312
- if isinstance(obj, File) and await obj._prefetch(download_cb):
312
+ if isinstance(obj, File) and obj.path and await obj._prefetch(download_cb):
313
313
  after_prefetch()
314
314
  return row
315
315
 
@@ -16,7 +16,7 @@ class UdfSignatureError(DataChainParamsError):
16
16
 
17
17
 
18
18
  @dataclass
19
- class UdfSignature:
19
+ class UdfSignature: # noqa: PLW1641
20
20
  func: Union[Callable, UDFBase]
21
21
  params: dict[str, Union[DataType, Any]]
22
22
  output_schema: SignalSchema
@@ -34,29 +34,29 @@ warnings.filterwarnings(
34
34
 
35
35
 
36
36
  class WDSError(DataChainError):
37
- def __init__(self, tar_stream, message: str):
38
- super().__init__(f"WebDataset error '{tar_stream.get_full_name()}': {message}")
37
+ def __init__(self, tar_name: str, message: str):
38
+ super().__init__(f"WebDataset error '{tar_name}': {message}")
39
39
 
40
40
 
41
41
  class CoreFileDuplicationError(WDSError):
42
- def __init__(self, tar_stream, file1: str, file2: str):
42
+ def __init__(self, tar_name: str, file1: str, file2: str):
43
43
  super().__init__(
44
- tar_stream, f"duplication of files with core extensions: {file1}, {file2}"
44
+ tar_name, f"duplication of files with core extensions: {file1}, {file2}"
45
45
  )
46
46
 
47
47
 
48
48
  class CoreFileNotFoundError(WDSError):
49
- def __init__(self, tar_stream, extensions, stem):
49
+ def __init__(self, tar_name: str, extensions: Sequence[str], stem: str):
50
50
  super().__init__(
51
- tar_stream,
51
+ tar_name,
52
52
  f"no files with the extensions '{','.join(extensions)}'"
53
53
  f" were found for file stem {stem}",
54
54
  )
55
55
 
56
56
 
57
57
  class UnknownFileExtensionError(WDSError):
58
- def __init__(self, tar_stream, name, ext):
59
- super().__init__(tar_stream, f"unknown extension '{ext}' for file '{name}'")
58
+ def __init__(self, tar_name, name: str, ext: str):
59
+ super().__init__(tar_name, f"unknown extension '{ext}' for file '{name}'")
60
60
 
61
61
 
62
62
  class WDSBasic(DataModel):
@@ -113,10 +113,10 @@ class Builder:
113
113
  def __init__(
114
114
  self,
115
115
  tar_stream: File,
116
- core_extensions: list[str],
116
+ core_extensions: Sequence[str],
117
117
  wds_class: type[WDSBasic],
118
- tar,
119
- encoding="utf-8",
118
+ tar: tarfile.TarFile,
119
+ encoding: str = "utf-8",
120
120
  ):
121
121
  self._core_extensions = core_extensions
122
122
  self._tar_stream = tar_stream
@@ -145,18 +145,20 @@ class Builder:
145
145
  if ext in self._core_extensions:
146
146
  if self.state.core_file is not None:
147
147
  raise CoreFileDuplicationError(
148
- self._tar_stream, file.name, self.state.core_file.name
148
+ self._tar_stream.name, file.name, self.state.core_file.name
149
149
  )
150
150
  self.state.core_file = file
151
151
  elif ext in self.state.data:
152
152
  raise WDSError(
153
- self._tar_stream,
153
+ self._tar_stream.name,
154
154
  f"file with extension '.{ext}' already exists in the archive",
155
155
  )
156
156
  else:
157
157
  type_ = self._get_type(ext)
158
158
  if type_ is None:
159
- raise UnknownFileExtensionError(self._tar_stream, fstream.name, ext)
159
+ raise UnknownFileExtensionError(
160
+ self._tar_stream.name, fstream.name, ext
161
+ )
160
162
 
161
163
  if issubclass(type_, WDSReadableSubclass):
162
164
  reader = type_._reader
@@ -165,7 +167,7 @@ class Builder:
165
167
 
166
168
  if reader is None:
167
169
  raise WDSError(
168
- self._tar_stream,
170
+ self._tar_stream.name,
169
171
  f"unable to find a reader for type {type_}, extension .{ext}",
170
172
  )
171
173
  self.state.data[ext] = reader(self, file)
@@ -173,7 +175,7 @@ class Builder:
173
175
  def produce(self):
174
176
  if self.state.core_file is None:
175
177
  raise CoreFileNotFoundError(
176
- self._tar_stream, self._core_extensions, self.state.stem
178
+ self._tar_stream.name, self._core_extensions, self.state.stem
177
179
  )
178
180
 
179
181
  file = build_tar_member(self._tar_stream, self.state.core_file)
@@ -194,7 +196,13 @@ class Builder:
194
196
  return anno
195
197
 
196
198
 
197
- def get_tar_groups(stream, tar, core_extensions, spec, encoding="utf-8"):
199
+ def get_tar_groups(
200
+ stream: File,
201
+ tar: tarfile.TarFile,
202
+ core_extensions: Sequence[str],
203
+ spec: type[WDSBasic],
204
+ encoding: str = "utf-8",
205
+ ) -> Iterator[WDSBasic]:
198
206
  builder = Builder(stream, core_extensions, spec, tar, encoding)
199
207
 
200
208
  for item in sorted(tar.getmembers(), key=lambda m: Path(m.name).stem):
@@ -210,9 +218,11 @@ def get_tar_groups(stream, tar, core_extensions, spec, encoding="utf-8"):
210
218
 
211
219
 
212
220
  def process_webdataset(
213
- core_extensions: Sequence[str] = ("jpg", "png"), spec=WDSAllFile, encoding="utf-8"
214
- ) -> Callable:
215
- def wds_func(file: File) -> Iterator[spec]:
221
+ core_extensions: Sequence[str] = ("jpg", "png"),
222
+ spec: type[WDSBasic] = WDSAllFile,
223
+ encoding: str = "utf-8",
224
+ ) -> Callable[[File], Iterator]:
225
+ def wds_func(file: File) -> Iterator[spec]: # type: ignore[valid-type]
216
226
  with file.open() as fd:
217
227
  with tarfile.open(fileobj=fd) as tar:
218
228
  yield from get_tar_groups(file, tar, core_extensions, spec, encoding)
datachain/listing.py CHANGED
@@ -66,7 +66,9 @@ class Listing:
66
66
  @cached_property
67
67
  def dataset(self) -> "DatasetRecord":
68
68
  assert self.dataset_name
69
- return self.metastore.get_dataset(self.dataset_name)
69
+ return self.metastore.get_dataset(
70
+ self.dataset_name, self.metastore.listing_project.id
71
+ )
70
72
 
71
73
  @cached_property
72
74
  def dataset_rows(self):
datachain/namespace.py ADDED
@@ -0,0 +1,65 @@
1
+ import builtins
2
+ from dataclasses import dataclass, fields
3
+ from datetime import datetime
4
+ from typing import Any, Optional, TypeVar
5
+
6
+ from datachain.error import InvalidNamespaceNameError
7
+
8
N = TypeVar("N", bound="Namespace")
NAMESPACE_NAME_RESERVED_CHARS = ["."]


@dataclass(frozen=True)
class Namespace:
    """Immutable record of a namespace: a named container that groups projects."""

    id: int
    uuid: str
    name: str
    descr: Optional[str]
    created_at: datetime

    @staticmethod
    def validate_name(name: str) -> None:
        """Throws exception if name is invalid, otherwise returns None"""
        if not name:
            raise InvalidNamespaceNameError("Namespace name cannot be empty")

        # Reject the first reserved character found in the name.
        for char in NAMESPACE_NAME_RESERVED_CHARS:
            if char in name:
                raise InvalidNamespaceNameError(
                    f"Character {char} is reserved and not allowed in namespace name"
                )

        # The default and system namespace names cannot be taken by users.
        if name in (Namespace.default(), Namespace.system()):
            raise InvalidNamespaceNameError(
                f"Namespace name {name} is reserved and cannot be used."
            )

    @staticmethod
    def default() -> str:
        """Name of default namespace"""
        return "local"

    @staticmethod
    def system() -> str:
        """Name of the system namespace"""
        return "system"

    @property
    def is_system(self):
        # True when this is the reserved system namespace.
        return self.name == Namespace.system()

    @classmethod
    def parse(
        cls: builtins.type[N],
        id: int,
        uuid: str,
        name: str,
        descr: Optional[str],
        created_at: datetime,
    ) -> "Namespace":
        """Build a namespace from positional (e.g. DB row) values."""
        return cls(id, uuid, name, descr, created_at)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> "Namespace":
        """Build a namespace from a mapping, ignoring unknown keys."""
        known = {f.name for f in fields(cls)}
        return cls(**{k: v for k, v in d.items() if k in known})
datachain/project.py ADDED
@@ -0,0 +1,78 @@
1
+ import builtins
2
+ from dataclasses import dataclass, fields
3
+ from datetime import datetime
4
+ from typing import Any, Optional, TypeVar
5
+
6
+ from datachain.error import InvalidProjectNameError
7
+ from datachain.namespace import Namespace
8
+
9
P = TypeVar("P", bound="Project")
PROJECT_NAME_RESERVED_CHARS = ["."]


@dataclass(frozen=True)
class Project:
    """Immutable record of a project: a dataset container owned by a namespace."""

    id: int
    uuid: str
    name: str
    descr: Optional[str]
    created_at: datetime
    namespace: Namespace

    @staticmethod
    def validate_name(name: str) -> None:
        """Throws exception if name is invalid, otherwise returns None"""
        if not name:
            raise InvalidProjectNameError("Project name cannot be empty")

        # Reject the first reserved character found in the name.
        for char in PROJECT_NAME_RESERVED_CHARS:
            if char in name:
                raise InvalidProjectNameError(
                    f"Character {char} is reserved and not allowed in project name."
                )

        # The default and listing project names cannot be taken by users.
        if name in (Project.default(), Project.listing()):
            raise InvalidProjectNameError(
                f"Project name {name} is reserved and cannot be used."
            )

    @staticmethod
    def default() -> str:
        """Name of default project"""
        return "local"

    @staticmethod
    def listing() -> str:
        """Name of listing project where all listing datasets will be saved"""
        return "listing"

    @classmethod
    def parse(
        cls: builtins.type[P],
        namespace_id: int,
        namespace_uuid: str,
        namespace_name: str,
        namespace_descr: Optional[str],
        namespace_created_at: datetime,
        project_id: int,
        uuid: str,
        name: str,
        descr: Optional[str],
        created_at: datetime,
        project_namespace_id: int,
    ) -> "Project":
        """Build a project (with its owning namespace) from positional values."""
        owner = Namespace.parse(
            namespace_id,
            namespace_uuid,
            namespace_name,
            namespace_descr,
            namespace_created_at,
        )
        return cls(project_id, uuid, name, descr, created_at, owner)

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> "Project":
        """Build a project from a mapping; pops and parses the "namespace" entry."""
        owner = Namespace.from_dict(d.pop("namespace"))
        known = {f.name for f in fields(cls)}
        kwargs = {k: v for k, v in d.items() if k in known}
        return cls(**kwargs, namespace=owner)