datachain 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/catalog/catalog.py +57 -212
- datachain/cli.py +6 -38
- datachain/client/fsspec.py +3 -0
- datachain/client/hf.py +47 -0
- datachain/data_storage/metastore.py +2 -29
- datachain/data_storage/sqlite.py +3 -12
- datachain/data_storage/warehouse.py +20 -29
- datachain/dataset.py +44 -32
- datachain/job.py +4 -3
- datachain/lib/arrow.py +21 -5
- datachain/lib/dataset_info.py +4 -0
- datachain/lib/dc.py +183 -59
- datachain/lib/file.py +10 -33
- datachain/lib/hf.py +2 -1
- datachain/lib/listing.py +102 -94
- datachain/lib/listing_info.py +32 -0
- datachain/lib/meta_formats.py +39 -56
- datachain/lib/signal_schema.py +5 -2
- datachain/node.py +13 -0
- datachain/query/dataset.py +12 -105
- datachain/query/metrics.py +8 -0
- datachain/utils.py +5 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/METADATA +7 -3
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/RECORD +28 -27
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/WHEEL +1 -1
- datachain/catalog/subclass.py +0 -60
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/LICENSE +0 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.9.dist-info → datachain-0.3.11.dist-info}/top_level.txt +0 -0
datachain/lib/dc.py
CHANGED
@@ -27,7 +27,16 @@ from datachain.lib.convert.values_to_tuples import values_to_tuples
from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
from datachain.lib.dataset_info import DatasetInfo
from datachain.lib.file import ExportPlacement as FileExportPlacement
-from datachain.lib.file import File, IndexedFile,
+from datachain.lib.file import File, IndexedFile, get_file_type
+from datachain.lib.listing import (
+    is_listing_dataset,
+    is_listing_expired,
+    is_listing_subset,
+    list_bucket,
+    ls,
+    parse_listing_uri,
+)
+from datachain.lib.listing_info import ListingInfo
from datachain.lib.meta_formats import read_meta, read_schema
from datachain.lib.model_store import ModelStore
from datachain.lib.settings import Settings
@@ -47,7 +56,7 @@ from datachain.query.dataset import (
    PartitionByType,
    detach,
)
-from datachain.query.schema import Column, DatasetRow
+from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
from datachain.sql.functions import path as pathfunc
from datachain.utils import inside_notebook

@@ -103,11 +112,31 @@ class DatasetFromValuesError(DataChainParamsError):  # noqa: D101
        super().__init__(f"Dataset{name} from values error: {msg}")


+def _get_merge_error_str(col: Union[str, sqlalchemy.ColumnElement]) -> str:
+    if isinstance(col, str):
+        return col
+    if isinstance(col, sqlalchemy.Column):
+        return col.name.replace(DEFAULT_DELIMITER, ".")
+    if isinstance(col, sqlalchemy.ColumnElement) and hasattr(col, "name"):
+        return f"{col.name} expression"
+    return str(col)
+
+
class DatasetMergeError(DataChainParamsError):  # noqa: D101
-    def __init__(
-
+    def __init__(  # noqa: D107
+        self,
+        on: Sequence[Union[str, sqlalchemy.ColumnElement]],
+        right_on: Optional[Sequence[Union[str, sqlalchemy.ColumnElement]]],
+        msg: str,
+    ):
+        def _get_str(on: Sequence[Union[str, sqlalchemy.ColumnElement]]) -> str:
+            if not isinstance(on, Sequence):
+                return str(on)  # type: ignore[unreachable]
+            return ", ".join([_get_merge_error_str(col) for col in on])
+
+        on_str = _get_str(on)
        right_on_str = (
-            ", right_on='" +
+            ", right_on='" + _get_str(right_on) + "'"
            if right_on and isinstance(right_on, Sequence)
            else ""
        )
@@ -130,7 +159,7 @@ class Sys(DataModel):


class DataChain(DatasetQuery):
-    """
+    """DataChain - a data structure for batch data processing and evaluation.

    It represents a sequence of data manipulation steps such as reading data from
    storages, running AI or LLM models or calling external services API to validate or
@@ -243,13 +272,24 @@ class DataChain(DatasetQuery):
        """Returns Column instance with a type if name is found in current schema,
        otherwise raises an exception.
        """
-
+        if "." in name:
+            name_path = name.split(".")
+        elif DEFAULT_DELIMITER in name:
+            name_path = name.split(DEFAULT_DELIMITER)
+        else:
+            name_path = [name]
        for path, type_, _, _ in self.signals_schema.get_flat_tree():
            if path == name_path:
                return Column(name, python_to_sql(type_))

        raise ValueError(f"Column with name {name} not found in the schema")

+    def c(self, column: Union[str, Column]) -> Column:
+        """Returns Column instance attached to the current chain."""
+        c = self.column(column) if isinstance(column, str) else self.column(column.name)
+        c.table = self.table
+        return c
+
    def print_schema(self) -> None:
        """Print schema of the chain."""
        self._effective_signals_schema.print_tree()
@@ -311,7 +351,7 @@ class DataChain(DatasetQuery):
    @classmethod
    def from_storage(
        cls,
-
+        uri,
        *,
        type: Literal["binary", "text", "image"] = "binary",
        session: Optional[Session] = None,
@@ -320,41 +360,73 @@ class DataChain(DatasetQuery):
        recursive: Optional[bool] = True,
        object_name: str = "file",
        update: bool = False,
-
+        anon: bool = False,
    ) -> "Self":
        """Get data from a storage as a list of file with all file attributes.
        It returns the chain itself as usual.

        Parameters:
-
+            uri : storage URI with directory. URI must start with storage prefix such
                as `s3://`, `gs://`, `az://` or "file:///"
            type : read file as "binary", "text", or "image" data. Default is "binary".
            recursive : search recursively for the given path.
            object_name : Created object column name.
            update : force storage reindexing. Default is False.
+            anon : If True, we will treat cloud bucket as public one

        Example:
            ```py
            chain = DataChain.from_storage("s3://my-bucket/my-dir")
            ```
        """
-
-
-
-
-
-
-
-
-                in_memory=in_memory,
-                **kwargs,
-            )
-            .map(**{object_name: func})
-            .select(object_name)
+        file_type = get_file_type(type)
+
+        client_config = {"anon": True} if anon else None
+
+        session = Session.get(session, client_config=client_config, in_memory=in_memory)
+
+        list_dataset_name, list_uri, list_path = parse_listing_uri(
+            uri, session.catalog.cache, session.catalog.client_config
        )
+        need_listing = True
+
+        for ds in cls.listings(session=session, in_memory=in_memory).collect("listing"):
+            if (
+                not is_listing_expired(ds.created_at)  # type: ignore[union-attr]
+                and is_listing_subset(ds.name, list_dataset_name)  # type: ignore[union-attr]
+                and not update
+            ):
+                need_listing = False
+                list_dataset_name = ds.name  # type: ignore[union-attr]
+
+        if need_listing:
+            # caching new listing to special listing dataset
+            (
+                cls.from_records(
+                    DataChain.DEFAULT_FILE_RECORD,
+                    session=session,
+                    settings=settings,
+                    in_memory=in_memory,
+                )
+                .gen(
+                    list_bucket(list_uri, client_config=session.catalog.client_config),
+                    output={f"{object_name}": File},
+                )
+                .save(list_dataset_name, listing=True)
+            )
+
+        dc = cls.from_dataset(list_dataset_name, session=session)
+        dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+
+        return ls(dc, list_path, recursive=recursive, object_name=object_name)

    @classmethod
-    def from_dataset(
+    def from_dataset(
+        cls,
+        name: str,
+        version: Optional[int] = None,
+        session: Optional[Session] = None,
+    ) -> "DataChain":
        """Get data from a saved Dataset. It returns the chain itself.

        Parameters:
@@ -366,7 +438,7 @@ class DataChain(DatasetQuery):
            chain = DataChain.from_dataset("my_cats")
            ```
        """
-        return DataChain(name=name, version=version)
+        return DataChain(name=name, version=version, session=session)

    @classmethod
    def from_json(
@@ -419,7 +491,7 @@ class DataChain(DatasetQuery):
        object_name = jmespath_to_name(jmespath)
        if not object_name:
            object_name = meta_type
-        chain = DataChain.from_storage(
+        chain = DataChain.from_storage(uri=path, type=type, **kwargs)
        signal_dict = {
            object_name: read_meta(
                schema_from=schema_from,
@@ -479,7 +551,7 @@ class DataChain(DatasetQuery):
        object_name = jmespath_to_name(jmespath)
        if not object_name:
            object_name = meta_type
-        chain = DataChain.from_storage(
+        chain = DataChain.from_storage(uri=path, type=type, **kwargs)
        signal_dict = {
            object_name: read_meta(
                schema_from=schema_from,
@@ -500,6 +572,7 @@ class DataChain(DatasetQuery):
        settings: Optional[dict] = None,
        in_memory: bool = False,
        object_name: str = "dataset",
+        include_listing: bool = False,
    ) -> "DataChain":
        """Generate chain with list of registered datasets.

@@ -517,7 +590,9 @@ class DataChain(DatasetQuery):

        datasets = [
            DatasetInfo.from_models(d, v, j)
-            for d, v, j in catalog.list_datasets_versions(
+            for d, v, j in catalog.list_datasets_versions(
+                include_listing=include_listing
+            )
        ]

        return cls.from_values(
@@ -528,6 +603,42 @@ class DataChain(DatasetQuery):
            **{object_name: datasets},  # type: ignore[arg-type]
        )

+    @classmethod
+    def listings(
+        cls,
+        session: Optional[Session] = None,
+        in_memory: bool = False,
+        object_name: str = "listing",
+        **kwargs,
+    ) -> "DataChain":
+        """Generate chain with list of cached listings.
+        Listing is a special kind of dataset which has directory listing data of
+        some underlying storage (e.g S3 bucket).
+
+        Example:
+            ```py
+            from datachain import DataChain
+            DataChain.listings().show()
+            ```
+        """
+        session = Session.get(session, in_memory=in_memory)
+        catalog = kwargs.get("catalog") or session.catalog
+
+        listings = [
+            ListingInfo.from_models(d, v, j)
+            for d, v, j in catalog.list_datasets_versions(
+                include_listing=True, **kwargs
+            )
+            if is_listing_dataset(d.name)
+        ]
+
+        return cls.from_values(
+            session=session,
+            in_memory=in_memory,
+            output={object_name: ListingInfo},
+            **{object_name: listings},  # type: ignore[arg-type]
+        )
+
    def print_json_schema(  # type: ignore[override]
        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
    ) -> "Self":
@@ -570,7 +681,7 @@ class DataChain(DatasetQuery):
        )

    def save(  # type: ignore[override]
-        self, name: Optional[str] = None, version: Optional[int] = None
+        self, name: Optional[str] = None, version: Optional[int] = None, **kwargs
    ) -> "Self":
        """Save to a Dataset. It returns the chain itself.

@@ -580,7 +691,7 @@ class DataChain(DatasetQuery):
            version : version of a dataset. Default - the last version that exist.
        """
        schema = self.signals_schema.clone_without_sys_signals().serialize()
-        return super().save(name=name, version=version, feature_schema=schema)
+        return super().save(name=name, version=version, feature_schema=schema, **kwargs)

    def apply(self, func, *args, **kwargs):
        """Apply any function to the chain.
@@ -1060,8 +1171,17 @@ class DataChain(DatasetQuery):
    def merge(
        self,
        right_ds: "DataChain",
-        on: Union[
-
+        on: Union[
+            str,
+            sqlalchemy.ColumnElement,
+            Sequence[Union[str, sqlalchemy.ColumnElement]],
+        ],
+        right_on: Union[
+            str,
+            sqlalchemy.ColumnElement,
+            Sequence[Union[str, sqlalchemy.ColumnElement]],
+            None,
+        ] = None,
        inner=False,
        rname="right_",
    ) -> "Self":
@@ -1086,7 +1206,7 @@ class DataChain(DatasetQuery):
        if on is None:
            raise DatasetMergeError(["None"], None, "'on' must be specified")

-        if isinstance(on, str):
+        if isinstance(on, (str, sqlalchemy.ColumnElement)):
            on = [on]
        elif not isinstance(on, Sequence):
            raise DatasetMergeError(
@@ -1095,19 +1215,15 @@ class DataChain(DatasetQuery):
                f"'on' must be 'str' or 'Sequence' object but got type '{type(on)}'",
            )

-        signals_schema = self.signals_schema.clone_without_sys_signals()
-        on_columns: list[str] = signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
-
-        right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
        if right_on is not None:
-            if isinstance(right_on, str):
+            if isinstance(right_on, (str, sqlalchemy.ColumnElement)):
                right_on = [right_on]
            elif not isinstance(right_on, Sequence):
                raise DatasetMergeError(
                    on,
                    right_on,
                    "'right_on' must be 'str' or 'Sequence' object"
-                    f" but got type '{right_on}'",
+                    f" but got type '{type(right_on)}'",
                )

            if len(right_on) != len(on):
@@ -1115,34 +1231,39 @@ class DataChain(DatasetQuery):
                    on, right_on, "'on' and 'right_on' must have the same length'"
                )

-            right_on_columns: list[str] = right_signals_schema.resolve(
-                *right_on
-            ).db_signals()  # type: ignore[assignment]
-
-            if len(right_on_columns) != len(on_columns):
-                on_str = ", ".join(right_on_columns)
-                right_on_str = ", ".join(right_on_columns)
-                raise DatasetMergeError(
-                    on,
-                    right_on,
-                    "'on' and 'right_on' must have the same number of columns in db'."
-                    f" on -> {on_str}, right_on -> {right_on_str}",
-                )
-        else:
-            right_on = on
-            right_on_columns = on_columns
-
        if self == right_ds:
            right_ds = right_ds.clone(new_table=True)

+        errors = []
+
+        def _resolve(
+            ds: DataChain,
+            col: Union[str, sqlalchemy.ColumnElement],
+            side: Union[str, None],
+        ):
+            try:
+                return ds.c(col) if isinstance(col, (str, C)) else col
+            except ValueError:
+                if side:
+                    errors.append(f"{_get_merge_error_str(col)} in {side}")
+
        ops = [
-            self
-
+            _resolve(self, left, "left")
+            == _resolve(right_ds, right, "right" if right_on else None)
+            for left, right in zip(on, right_on or on)
        ]

+        if errors:
+            raise DatasetMergeError(
+                on, right_on, f"Could not resolve {', '.join(errors)}"
+            )
+
        ds = self.join(right_ds, sqlalchemy.and_(*ops), inner, rname + "{name}")

        ds.feature_schema = None
+
+        signals_schema = self.signals_schema.clone_without_sys_signals()
+        right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
        ds.signals_schema = SignalSchema({"sys": Sys}) | signals_schema.merge(
            right_signals_schema, rname
        )
@@ -1665,7 +1786,10 @@ class DataChain(DatasetQuery):

        if schema:
            signal_schema = SignalSchema(schema)
-            columns =
+            columns = [
+                sqlalchemy.Column(c.name, c.type)  # type: ignore[union-attr]
+                for c in signal_schema.db_signals(as_columns=True)  # type: ignore[assignment]
+            ]
        else:
            columns = [
                sqlalchemy.Column(name, typ)
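Taken together, the `from_storage` and `listings` changes above replace eager storage indexing with cached listing datasets. A minimal usage sketch based only on the signatures and docstrings shown in this diff (the bucket URI is a placeholder):

```py
from datachain import DataChain

# Anonymous read of a public bucket; the listing is saved as a special
# "lst__..." dataset and reused on later calls unless update=True is passed.
chain = DataChain.from_storage("s3://my-bucket/my-dir", type="text", anon=True)

# Cached listings can be inspected as a chain of ListingInfo rows.
DataChain.listings().show()
```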
datachain/lib/file.py
CHANGED
@@ -349,39 +349,6 @@ class ImageFile(File):
        self.read().save(destination)


-def get_file(type_: Literal["binary", "text", "image"] = "binary"):
-    file: type[File] = File
-    if type_ == "text":
-        file = TextFile
-    elif type_ == "image":
-        file = ImageFile  # type: ignore[assignment]
-
-    def get_file_type(
-        source: str,
-        path: str,
-        size: int,
-        version: str,
-        etag: str,
-        is_latest: bool,
-        last_modified: datetime,
-        location: Optional[Union[dict, list[dict]]],
-        vtype: str,
-    ) -> file:  # type: ignore[valid-type]
-        return file(
-            source=source,
-            path=path,
-            size=size,
-            version=version,
-            etag=etag,
-            is_latest=is_latest,
-            last_modified=last_modified,
-            location=location,
-            vtype=vtype,
-        )
-
-    return get_file_type
-
-
class IndexedFile(DataModel):
    """Metadata indexed from tabular files.

@@ -390,3 +357,13 @@ class IndexedFile(DataModel):

    file: File
    index: int
+
+
+def get_file_type(type_: Literal["binary", "text", "image"] = "binary") -> type[File]:
+    file: type[File] = File
+    if type_ == "text":
+        file = TextFile
+    elif type_ == "image":
+        file = ImageFile  # type: ignore[assignment]
+
+    return file
datachain/lib/hf.py
CHANGED
@@ -99,7 +99,8 @@ class HFGenerator(Generator):


def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
    if isinstance(ds, str):
-
+        kwargs["streaming"] = True
+        ds = load_dataset(ds, *args, **kwargs)
    if isinstance(ds, (DatasetDict, IterableDatasetDict)):
        return ds
    return {"": ds}
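This change forces `streaming=True` whenever a dataset is referenced by name, so `load_dataset` returns iterable splits instead of downloading the full dataset up front. A hedged sketch (the dataset name is illustrative):

```py
from datachain.lib.hf import stream_splits

# Equivalent to load_dataset("squad", streaming=True); returns a mapping of
# split name -> IterableDataset, or {"": ds} when there is a single split.
splits = stream_splits("squad")
```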
datachain/lib/listing.py
CHANGED
@@ -1,103 +1,26 @@
-import
-from collections.abc import
-from
+import posixpath
+from collections.abc import Iterator
+from datetime import datetime, timedelta, timezone
+from typing import TYPE_CHECKING, Callable, Optional

-from botocore.exceptions import ClientError
from fsspec.asyn import get_loop
+from sqlalchemy.sql.expression import true

from datachain.asyn import iter_over_async
from datachain.client import Client
-from datachain.error import ClientError as DataChainClientError
from datachain.lib.file import File
+from datachain.query.schema import Column
+from datachain.sql.functions import path as pathfunc
+from datachain.utils import uses_glob

-
-
-
-
-
-
-
-
-    infos = await client.ls_dir(path)
-    files = []
-    subdirs = set()
-    for info in infos:
-        full_path = info["name"]
-        subprefix = client.rel_path(full_path)
-        if prefix.strip(DELIMITER) == subprefix.strip(DELIMITER):
-            continue
-        if info["type"] == "directory":
-            subdirs.add(subprefix)
-        else:
-            files.append(client.info_to_file(info, subprefix))
-    if files:
-        await result_queue.put(files)
-    return subdirs
-
-
-async def _fetch(
-    client, start_prefix: str, result_queue: ResultQueue, fetch_workers
-) -> None:
-    loop = get_loop()
-
-    queue: asyncio.Queue[str] = asyncio.Queue()
-    queue.put_nowait(start_prefix)
-
-    async def process(queue) -> None:
-        while True:
-            prefix = await queue.get()
-            try:
-                subdirs = await _fetch_dir(client, prefix, result_queue)
-                for subdir in subdirs:
-                    queue.put_nowait(subdir)
-            except Exception:
-                while not queue.empty():
-                    queue.get_nowait()
-                    queue.task_done()
-                raise
-
-            finally:
-                queue.task_done()
-
-    try:
-        workers: list[asyncio.Task] = [
-            loop.create_task(process(queue)) for _ in range(fetch_workers)
-        ]
-
-        # Wait for all fetch tasks to complete
-        await queue.join()
-        # Stop the workers
-        excs = []
-        for worker in workers:
-            if worker.done() and (exc := worker.exception()):
-                excs.append(exc)
-            else:
-                worker.cancel()
-        if excs:
-            raise excs[0]
-    except ClientError as exc:
-        raise DataChainClientError(
-            exc.response.get("Error", {}).get("Message") or exc,
-            exc.response.get("Error", {}).get("Code"),
-        ) from exc
-    finally:
-        # This ensures the progress bar is closed before any exceptions are raised
-        result_queue.put_nowait(None)
-
-
-async def _scandir(client, prefix, fetch_workers) -> AsyncIterator:
-    """Recursively goes through dir tree and yields files"""
-    result_queue: ResultQueue = asyncio.Queue()
-    loop = get_loop()
-    main_task = loop.create_task(_fetch(client, prefix, result_queue, fetch_workers))
-    while (files := await result_queue.get()) is not None:
-        for f in files:
-            yield f
-
-    await main_task
-
-
-def list_bucket(uri: str, client_config=None, fetch_workers=FETCH_WORKERS) -> Callable:
+if TYPE_CHECKING:
+    from datachain.lib.dc import DataChain
+
+LISTING_TTL = 4 * 60 * 60  # cached listing lasts 4 hours
+LISTING_PREFIX = "lst__"  # listing datasets start with this name
+
+
+def list_bucket(uri: str, client_config=None) -> Callable:
    """
    Function that returns another generator function that yields File objects
    from bucket where each File represents one bucket entry.
@@ -106,6 +29,91 @@ def list_bucket(uri: str, client_config=None, fetch_workers=FETCH_WORKERS) -> Callable:
    def list_func() -> Iterator[File]:
        config = client_config or {}
        client, path = Client.parse_url(uri, None, **config)  # type: ignore[arg-type]
-
+        for entries in iter_over_async(client.scandir(path.rstrip("/")), get_loop()):
+            for entry in entries:
+                yield entry.to_file(client.uri)

    return list_func
+
+
+def ls(
+    dc: "DataChain",
+    path: str,
+    recursive: Optional[bool] = True,
+    object_name="file",
+):
+    """
+    Return files by some path from DataChain instance which contains bucket listing.
+    Path can have globs.
+    If recursive is set to False, only first level children will be returned by
+    specified path
+    """
+
+    def _file_c(name: str) -> Column:
+        return Column(f"{object_name}.{name}")
+
+    dc = dc.filter(_file_c("is_latest") == true())
+
+    if recursive:
+        if not path or path == "/":
+            # root of a bucket, returning all latest files from it
+            return dc
+
+        if not uses_glob(path):
+            # path is not glob, so it's pointing to some directory or a specific
+            # file and we are adding proper filter for it
+            return dc.filter(
+                (_file_c("path") == path)
+                | (_file_c("path").glob(path.rstrip("/") + "/*"))
+            )
+
+        # path has glob syntax so we are returning glob filter
+        return dc.filter(_file_c("path").glob(path))
+    # returning only first level children by path
+    return dc.filter(pathfunc.parent(_file_c("path")) == path.lstrip("/").rstrip("/*"))
+
+
+def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
+    """
+    Parsing uri and returns listing dataset name, listing uri and listing path
+    """
+    client, path = Client.parse_url(uri, cache, **client_config)
+
+    # clean path without globs
+    lst_uri_path = (
+        posixpath.dirname(path) if uses_glob(path) or client.fs.isfile(uri) else path
+    )
+
+    lst_uri = f"{client.uri}/{lst_uri_path.lstrip('/')}"
+    ds_name = (
+        f"{LISTING_PREFIX}{client.uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
+    )
+
+    return ds_name, lst_uri, path
+
+
+def is_listing_dataset(name: str) -> bool:
+    """Returns True if it's special listing dataset"""
+    return name.startswith(LISTING_PREFIX)
+
+
+def listing_uri_from_name(dataset_name: str) -> str:
+    """Returns clean storage URI from listing dataset name"""
+    if not is_listing_dataset(dataset_name):
+        raise ValueError(f"Dataset {dataset_name} is not a listing")
+    return dataset_name.removeprefix(LISTING_PREFIX)
+
+
+def is_listing_expired(created_at: datetime) -> bool:
+    """Checks if listing has expired based on it's creation date"""
+    return datetime.now(timezone.utc) > created_at + timedelta(seconds=LISTING_TTL)
+
+
+def is_listing_subset(ds1_name: str, ds2_name: str) -> bool:
+    """
+    Checks if one listing contains another one by comparing corresponding dataset names
+    """
+    assert ds1_name.endswith("/")
+    assert ds2_name.endswith("/")
+
+    return ds2_name.startswith(ds1_name)