PyPI - datachain - Versions diffs - 0.19.1__py3-none-any.whl → 0.20.0__py3-none-any.whl - Mend

datachain 0.19.1__py3-none-any.whl → 0.20.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (37) hide show

datachain/__init__.py +3 -0
datachain/catalog/catalog.py +180 -65
datachain/cli/__init__.py +0 -7
datachain/cli/commands/datasets.py +43 -28
datachain/cli/parser/__init__.py +1 -35
datachain/cli/parser/job.py +25 -0
datachain/cli/parser/studio.py +11 -4
datachain/data_storage/metastore.py +390 -37
datachain/data_storage/schema.py +23 -1
datachain/data_storage/sqlite.py +139 -7
datachain/data_storage/warehouse.py +26 -7
datachain/dataset.py +125 -12
datachain/delta.py +9 -5
datachain/error.py +36 -0
datachain/lib/dataset_info.py +4 -0
datachain/lib/dc/datachain.py +86 -7
datachain/lib/dc/datasets.py +62 -12
datachain/lib/dc/listings.py +111 -0
datachain/lib/dc/records.py +1 -0
datachain/lib/dc/storage.py +14 -2
datachain/lib/listing.py +3 -1
datachain/lib/namespaces.py +73 -0
datachain/lib/projects.py +86 -0
datachain/lib/settings.py +10 -0
datachain/listing.py +3 -1
datachain/namespace.py +65 -0
datachain/project.py +78 -0
datachain/query/dataset.py +71 -46
datachain/query/session.py +1 -1
datachain/remote/studio.py +67 -26
datachain/studio.py +68 -8
{datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/METADATA +2 -2
{datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/RECORD +37 -33
{datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/WHEEL +0 -0
{datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/entry_points.txt +0 -0
{datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/licenses/LICENSE +0 -0
{datachain-0.19.1.dist-info → datachain-0.20.0.dist-info}/top_level.txt +0 -0

datachain/lib/dataset_info.py CHANGED Viewed

@@ -22,6 +22,8 @@ if TYPE_CHECKING:
 class DatasetInfo(DataModel):
     name: str
+    namespace_name: str
+    project_name: str
     uuid: str = Field(default=str(uuid4()))
     version: str = Field(default=DEFAULT_DATASET_VERSION)
     status: int = Field(default=DatasetStatus.CREATED)
@@ -91,6 +93,8 @@ class DatasetInfo(DataModel):
         return cls(
             uuid=version.uuid,
             name=dataset.name,
+            namespace_name=dataset.project.namespace.name,
+            project_name=dataset.project.name,
             version=version.version,
             status=version.status,
             created_at=version.created_at,

datachain/lib/dc/datachain.py CHANGED Viewed

@@ -24,7 +24,7 @@ from pydantic import BaseModel
 from tqdm import tqdm
 from datachain import semver
-from datachain.dataset import DatasetRecord
+from datachain.dataset import DatasetRecord, parse_dataset_name
 from datachain.delta import delta_disabled
 from datachain.func import literal
 from datachain.func.base import Function
@@ -37,6 +37,7 @@ from datachain.lib.file import (
     FileExporter,
 )
 from datachain.lib.file import ExportPlacement as FileExportPlacement
+from datachain.lib.projects import get as get_project
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
@@ -261,7 +262,7 @@ class DataChain:
         """Underlying dataset, if there is one."""
         if not self.name:
             return None
-        return self.session.catalog.get_dataset(self.name)
+        return self.session.catalog.get_dataset(self.name, self._query.project)
     def __or__(self, other: "Self") -> "Self":
         """Return `self.union(other)`."""
@@ -312,6 +313,8 @@ class DataChain:
         min_task_size=None,
         prefetch: Optional[int] = None,
         sys: Optional[bool] = None,
+        namespace: Optional[str] = None,
+        project: Optional[str] = None,
     ) -> "Self":
         """Change settings for chain.
@@ -327,6 +330,8 @@ class DataChain:
             prefetch: number of workers to use for downloading files in advance.
                       This is enabled by default and uses 2 workers.
                       To disable prefetching, set it to 0.
+            namespace: namespace name.
+            project: project name.
         Example:
             ```py
@@ -340,7 +345,11 @@ class DataChain:
         if sys is None:
             sys = self._sys
         settings = copy.copy(self._settings)
-        settings.add(Settings(cache, parallel, workers, min_task_size, prefetch))
+        settings.add(
+            Settings(
+                cache, parallel, workers, min_task_size, prefetch, namespace, project
+            )
+        )
         return self._evolve(settings=settings, _sys=sys)
     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
@@ -490,6 +499,22 @@ class DataChain:
         )
         return listings(*args, **kwargs)
+    @property
+    def namespace_name(self) -> str:
+        """Current namespace name in which the chain is running"""
+        return (
+            self._settings.namespace
+            or self.session.catalog.metastore.default_namespace_name
+        )
+    @property
+    def project_name(self) -> str:
+        """Current project name in which the chain is running"""
+        return (
+            self._settings.project
+            or self.session.catalog.metastore.default_project_name
+        )
     def persist(self) -> "Self":
         """Saves temporary chain that will be removed after the process ends.
         Temporary datasets are useful for optimization, for example when we have
@@ -499,7 +524,12 @@ class DataChain:
         It returns the chain itself.
         """
         schema = self.signals_schema.clone_without_sys_signals().serialize()
-        return self._evolve(query=self._query.save(feature_schema=schema))
+        project = get_project(
+            self.project_name, self.namespace_name, session=self.session
+        )
+        return self._evolve(
+            query=self._query.save(project=project, feature_schema=schema)
+        )
     def save(  # type: ignore[override]
         self,
@@ -513,7 +543,10 @@ class DataChain:
         """Save to a Dataset. It returns the chain itself.
         Parameters:
-            name : dataset name.
+            name : dataset name. It can be full name consisting of namespace and
+                project, but it can also be just a regular dataset name in which
+                case we are taking namespace and project from settings, if they
+                are defined there, or default ones instead.
             version : version of a dataset. If version is not specified and dataset
                 already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
             description : description of a dataset.
@@ -535,6 +568,21 @@ class DataChain:
                 " patch"
             )
+        namespace_name, project_name, name = parse_dataset_name(name)
+        namespace_name = (
+            namespace_name
+            or self._settings.namespace
+            or self.session.catalog.metastore.default_namespace_name
+        )
+        project_name = (
+            project_name
+            or self._settings.project
+            or self.session.catalog.metastore.default_project_name
+        )
+        project = get_project(project_name, namespace_name, session=self.session)
         schema = self.signals_schema.clone_without_sys_signals().serialize()
         # Handle retry and delta functionality
@@ -558,6 +606,7 @@ class DataChain:
                     query=result_ds._query.save(
                         name=name,
                         version=version,
+                        project=project,
                         feature_schema=schema,
                         dependencies=dependencies,
                         **kwargs,
@@ -577,6 +626,7 @@ class DataChain:
             query=self._query.save(
                 name=name,
                 version=version,
+                project=project,
                 description=description,
                 attrs=attrs,
                 feature_schema=schema,
@@ -2239,16 +2289,45 @@ class DataChain:
             Combining filters with "or"
             ```py
-            dc.filter(C("file.path").glob("cat*") | C("file.path").glob("dog*))
+            dc.filter(
+                C("file.path").glob("cat*") |
+                C("file.path").glob("dog*")
+            )
+            ```
+            ```py
+            dc.filter(dc.func.or_(
+                C("file.path").glob("cat*"),
+                C("file.path").glob("dog*")
+            ))
             ```
             Combining filters with "and"
             ```py
             dc.filter(
-                C("file.path").glob("*.jpg) &
+                C("file.path").glob("*.jpg"),
+                string.length(C("file.path")) > 5
+            )
+            ```
+            ```py
+            dc.filter(
+                C("file.path").glob("*.jpg") &
                 (string.length(C("file.path")) > 5)
             )
             ```
+            ```py
+            dc.filter(dc.func.and_(
+                C("file.path").glob("*.jpg"),
+                string.length(C("file.path")) > 5
+            ))
+            ```
+            Combining filters with "not"
+            ```py
+            dc.filter(~(C("file.path").glob("*.jpg")))
+            ```
         """
         return self._evolve(query=self._query.filter(*args))

datachain/lib/dc/datasets.py CHANGED Viewed

@@ -1,11 +1,13 @@
 from collections.abc import Sequence
 from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints
+from datachain.dataset import parse_dataset_name
 from datachain.error import DatasetVersionNotFoundError
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import (
     File,
 )
+from datachain.lib.projects import get as get_project
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.query import Session
@@ -24,6 +26,8 @@ if TYPE_CHECKING:
 def read_dataset(
     name: str,
+    namespace: Optional[str] = None,
+    project: Optional[str] = None,
     version: Optional[Union[str, int]] = None,
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
@@ -38,7 +42,12 @@ def read_dataset(
     If dataset or version is not found locally, it will try to pull it from Studio.
     Parameters:
-        name : dataset name
+        name: The dataset name, which can be a fully qualified name including the
+            namespace and project. Alternatively, it can be a regular name, in which
+            case the explicitly defined namespace and project will be used if they are
+            set; otherwise, default values will be applied.
+        namespace : optional name of namespace in which dataset to read is created
+        project : optional name of project in which dataset to read is created
         version : dataset version
         session : Session to use for the chain.
         settings : Settings to use for the chain.
@@ -86,6 +95,11 @@ def read_dataset(
         chain = dc.read_dataset("my_cats")
         ```
+        ```py
+        import datachain as dc
+        chain = dc.read_dataset("dev.animals.my_cats")
+        ```
         ```py
         chain = dc.read_dataset("my_cats", fallback_to_studio=False)
         ```
@@ -116,6 +130,15 @@ def read_dataset(
     from .datachain import DataChain
+    session = Session.get(session)
+    catalog = session.catalog
+    namespace_name, project_name, name = parse_dataset_name(name)
+    namespace_name = (
+        namespace_name or namespace or catalog.metastore.default_namespace_name
+    )
+    project_name = project_name or project or catalog.metastore.default_project_name
     if version is not None:
         try:
             # for backward compatibility we still allow users to put version as integer
@@ -125,7 +148,9 @@ def read_dataset(
             # all 2.* dataset versions). If dataset doesn't have any versions where
             # major part is equal to that input, exception is thrown.
             major = int(version)
-            dataset = Session.get(session).catalog.get_dataset(name)
+            dataset = session.catalog.get_dataset(
+                name, get_project(project_name, namespace_name, session=session)
+            )
             latest_major = dataset.latest_major_version(major)
             if not latest_major:
                 raise DatasetVersionNotFoundError(
@@ -136,19 +161,22 @@ def read_dataset(
             # version is in new semver string format, continuing as normal
             pass
+    if settings:
+        _settings = Settings(**settings)
+    else:
+        _settings = Settings()
     query = DatasetQuery(
         name=name,
+        project_name=project_name,
+        namespace_name=namespace_name,
         version=version,  #  type: ignore[arg-type]
         session=session,
         indexing_column_types=File._datachain_column_types,
         fallback_to_studio=fallback_to_studio,
     )
-    telemetry.send_event_once("class", "datachain_init", name=name, version=version)
-    if settings:
-        _settings = Settings(**settings)
-    else:
-        _settings = Settings()
+    telemetry.send_event_once("class", "datachain_init", name=name, version=version)
     signals_schema = SignalSchema({"sys": Sys})
     if query.feature_schema:
         signals_schema |= SignalSchema.deserialize(query.feature_schema)
@@ -251,6 +279,8 @@ def datasets(
 def delete_dataset(
     name: str,
+    namespace: Optional[str] = None,
+    project: Optional[str] = None,
     version: Optional[str] = None,
     force: Optional[bool] = False,
     studio: Optional[bool] = False,
@@ -261,11 +291,16 @@ def delete_dataset(
     a force flag.
     Args:
-        name : Dataset name
+        name: The dataset name, which can be a fully qualified name including the
+            namespace and project. Alternatively, it can be a regular name, in which
+            case the explicitly defined namespace and project will be used if they are
+            set; otherwise, default values will be applied.
+        namespace : optional name of namespace in which dataset to delete is created
+        project : optional name of project in which dataset to delete is created
         version : Optional dataset version
         force: If true, all datasets versions will be removed. Defaults to False.
-        studio: If True, removes dataset from Studio only,
-            otherwise remove from local. Defaults to False.
+        studio: If True, removes dataset from Studio only, otherwise removes local
+            dataset. Defaults to False.
         session: Optional session instance. If not provided, uses default session.
         in_memory: If True, creates an in-memory session. Defaults to False.
@@ -282,11 +317,26 @@ def delete_dataset(
         dc.delete_dataset("cats", version="1.0.0")
         ```
     """
+    from datachain.studio import remove_studio_dataset
     session = Session.get(session, in_memory=in_memory)
     catalog = session.catalog
+    namespace_name, project_name, name = parse_dataset_name(name)
+    namespace_name = (
+        namespace_name or namespace or catalog.metastore.default_namespace_name
+    )
+    project_name = project_name or project or catalog.metastore.default_project_name
+    if not catalog.metastore.is_local_dataset(namespace_name) and studio:
+        return remove_studio_dataset(
+            None, name, namespace_name, project_name, version=version, force=force
+        )
+    ds_project = get_project(project_name, namespace_name, session=session)
     if not force:
-        version = version or catalog.get_dataset(name).latest_version
+        version = version or catalog.get_dataset(name, ds_project).latest_version
     else:
         version = None
-    catalog.remove_dataset(name, version=version, force=force, studio=studio)
+    catalog.remove_dataset(name, ds_project, version=version, force=force)

datachain/lib/dc/listings.py CHANGED Viewed

@@ -3,19 +3,58 @@ from typing import (
     Optional,
 )
+from datachain.lib.listing import LISTING_PREFIX, ls
 from datachain.lib.listing_info import ListingInfo
+from datachain.lib.settings import Settings
+from datachain.lib.signal_schema import SignalSchema
 from datachain.query import Session
+from datachain.query.dataset import DatasetQuery, QueryStep, step_result
 from .values import read_values
 if TYPE_CHECKING:
     from typing_extensions import ParamSpec
+    from datachain.dataset import DatasetVersion
+    from datachain.query.dataset import StepResult
     from .datachain import DataChain
     P = ParamSpec("P")
+class ReadOnlyQueryStep(QueryStep):
+    """
+    This step is used to read the dataset in read-only mode.
+    It is used to avoid the need to read the table metadata from the warehouse.
+    This is useful when we want to list the files in the dataset.
+    """
+    def apply(self) -> "StepResult":
+        import sqlalchemy as sa
+        def q(*columns):
+            return sa.select(*columns)
+        table_name = self.catalog.warehouse.dataset_table_name(
+            self.dataset, self.dataset_version
+        )
+        dataset_row_cls = self.catalog.warehouse.schema.dataset_row_cls
+        table = dataset_row_cls.new_table(
+            table_name,
+            columns=(
+                [
+                    *dataset_row_cls.sys_columns(),
+                    *dataset_row_cls.listing_columns(),
+                ]
+            ),
+        )
+        return step_result(
+            q, table.columns, dependencies=[(self.dataset, self.dataset_version)]
+        )
 def listings(
     session: Optional[Session] = None,
     in_memory: bool = False,
@@ -41,3 +80,75 @@ def listings(
         output={column: ListingInfo},
         **{column: catalog.listings()},  # type: ignore[arg-type]
     )
+def read_listing_dataset(
+    name: str,
+    version: Optional[str] = None,
+    path: str = "",
+    session: Optional["Session"] = None,
+    settings: Optional[dict] = None,
+) -> tuple["DataChain", "DatasetVersion"]:
+    """Read a listing dataset and return a DataChain and listing version.
+    Args:
+        name: Name of the dataset
+        version: Version of the dataset
+        path: Path within the listing to read. Path can have globs.
+        session: Optional Session object to use for reading
+        settings: Optional settings dictionary to use for reading
+    Returns:
+        tuple[DataChain, DatasetVersion]: A tuple containing:
+            - DataChain configured for listing files
+            - DatasetVersion object for the specified listing version
+    Example:
+        ```py
+        import datachain as dc
+        chain, listing_version = dc.read_listing_dataset(
+            "lst__s3://my-bucket/my-path", version="1.0.0", path="my-path"
+        )
+        chain.show()
+        ```
+    """
+    # Configure and return a DataChain for reading listing dataset files
+    # Uses ReadOnlyQueryStep to avoid warehouse metadata lookups
+    from datachain.lib.dc import Sys
+    from datachain.lib.file import File
+    from .datachain import DataChain
+    if not name.startswith(LISTING_PREFIX):
+        name = LISTING_PREFIX + name
+    session = Session.get(session)
+    dataset = session.catalog.get_dataset(name)
+    if version is None:
+        version = dataset.latest_version
+    query = DatasetQuery(
+        name=name,
+        session=session,
+        indexing_column_types=File._datachain_column_types,
+        fallback_to_studio=False,
+    )
+    if settings:
+        cfg = {**settings}
+        if "prefetch" not in cfg:
+            cfg["prefetch"] = 0
+        _settings = Settings(**cfg)
+    else:
+        _settings = Settings(prefetch=0)
+    signal_schema = SignalSchema({"sys": Sys, "file": File})
+    query.starting_step = ReadOnlyQueryStep(query.catalog, dataset, version)
+    query.version = version
+    # We already know that this is a listing dataset,
+    # so we can set the listing function to True
+    query.set_listing_fn(lambda: True)
+    chain = DataChain(query, _settings, signal_schema)
+    chain = ls(chain, path, recursive=True, column="file")
+    return chain, dataset.get_version(version)

datachain/lib/dc/records.py CHANGED Viewed

@@ -68,6 +68,7 @@ def read_records(
     dsr = catalog.create_dataset(
         name,
+        catalog.metastore.default_project,
         columns=columns,
         feature_schema=(
             signal_schema.clone_without_sys_signals().serialize()

datachain/lib/dc/storage.py CHANGED Viewed

@@ -144,6 +144,8 @@ def read_storage(
     catalog = session.catalog
     cache = catalog.cache
     client_config = session.catalog.client_config
+    listing_namespace_name = catalog.metastore.system_namespace_name
+    listing_project_name = catalog.metastore.listing_project_name
     uris = uri if isinstance(uri, (list, tuple)) else [uri]
@@ -167,7 +169,13 @@ def read_storage(
             )
             continue
-        dc = read_dataset(list_ds_name, session=session, settings=settings)
+        dc = read_dataset(
+            list_ds_name,
+            namespace=listing_namespace_name,
+            project=listing_project_name,
+            session=session,
+            settings=settings,
+        )
         dc._query.update = update
         dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})
@@ -182,7 +190,11 @@ def read_storage(
                         settings=settings,
                         in_memory=in_memory,
                     )
-                    .settings(prefetch=0)
+                    .settings(
+                        prefetch=0,
+                        namespace=listing_namespace_name,
+                        project=listing_project_name,
+                    )
                     .gen(
                         list_bucket(lst_uri, cache, client_config=client_config),
                         output={f"{column}": file_type},

datachain/lib/listing.py CHANGED Viewed

@@ -123,6 +123,9 @@ def parse_listing_uri(uri: str) -> tuple[str, str, str]:
         f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
     )
+    # we should remove dots from the name
+    ds_name = ds_name.replace(".", "_")
     return ds_name, lst_uri, path
@@ -195,5 +198,4 @@ def get_listing(
         list_path = f"{ds_name.strip('/').removeprefix(listing.name)}/{list_path}"
     ds_name = listing.name if listing else ds_name
     return ds_name, list_uri, list_path, bool(listing)

datachain/lib/namespaces.py ADDED Viewed

@@ -0,0 +1,73 @@
+from typing import Optional
+from datachain.error import NamespaceCreateNotAllowedError
+from datachain.namespace import Namespace
+from datachain.query import Session
+def create(
+    name: str, description: Optional[str] = None, session: Optional[Session] = None
+) -> Namespace:
+    """
+    Creates a new custom namespace.
+    A Namespace is an object used to organize datasets. It has name and a list of
+    Project objects underneath it. On the other hand, each Project can have multiple
+    datasets.
+    Note that creating namespaces is not allowed in the local environment, unlike
+    in Studio, where it is allowed.
+    In local environment all datasets are created under the default `local` namespace.
+    Parameters:
+        name : The name of the namespace.
+        description : A description of the namespace.
+        session : Session to use for creating namespace.
+    Example:
+        ```py
+        import datachain as dc
+        namespace = dc.namespaces.create("dev", "Dev namespace")
+        ```
+    """
+    session = Session.get(session)
+    if not session.catalog.metastore.namespace_allowed_to_create:
+        raise NamespaceCreateNotAllowedError("Creating custom namespace is not allowed")
+    Namespace.validate_name(name)
+    return session.catalog.metastore.create_namespace(name, description)
+def get(name: str, session: Optional[Session]) -> Namespace:
+    """
+    Gets a namespace by name.
+    If the namespace is not found, a `NamespaceNotFoundError` is raised.
+    Parameters:
+        name : The name of the namespace.
+        session : Session to use for getting namespace.
+    Example:
+        ```py
+        import datachain as dc
+        namespace = dc.get_namespace("local")
+        ```
+    """
+    session = Session.get(session)
+    return session.catalog.metastore.get_namespace(name)
+def ls(session: Optional[Session] = None) -> list[Namespace]:
+    """
+    Gets a list of all namespaces.
+    Parameters:
+        session : Session to use for getting namespaces.
+    Example:
+        ```py
+        import datachain as dc
+        namespaces = dc.namespaces.ls()
+        ```
+    """
+    return Session.get(session).catalog.metastore.list_namespaces()

datachain 0.19.1__py3-none-any.whl → 0.20.0__py3-none-any.whl

Potentially problematic release.

datachain 0.19.1__py3-none-any.whl → 0.20.0__py3-none-any.whl