datachain 0.8.2__py3-none-any.whl → 0.8.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

datachain/cache.py CHANGED
@@ -61,14 +61,16 @@ class DataChainCache:
         tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname())  # type: ignore[arg-type]
         size = file.size
         if size < 0:
-            size = await client.get_size(from_path)
+            size = await client.get_size(from_path, version_id=file.version)
         cb = callback or TqdmCallback(
             tqdm_kwargs={"desc": odb_fs.name(from_path), "bytes": True},
             tqdm_cls=Tqdm,
             size=size,
         )
         try:
-            await client.get_file(from_path, tmp_info, callback=cb)
+            await client.get_file(
+                from_path, tmp_info, callback=cb, version_id=file.version
+            )
         finally:
             if not callback:
                 cb.close()
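
The hunk above threads File.version through the cache download path so that version-aware filesystems fetch exactly the object version recorded in the dataset. A minimal sketch of the new call pattern, with a stub standing in for the real Client (which needs credentials); paths and the version id are hypothetical:

    import asyncio
    from typing import Optional

    class StubClient:
        """Stands in for datachain's Client to show the 0.8.3 signatures."""

        async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
            # A version-aware backend would stat `path` at `version_id` here.
            return 1024

        async def get_file(self, lpath, rpath, callback=None,
                           version_id: Optional[str] = None):
            # Would download lpath (pinned to version_id) into rpath.
            pass

    async def main() -> None:
        client = StubClient()
        # file.version from the dataset row pins the exact object version:
        await client.get_size("bucket/data.csv", version_id="abc123")
        await client.get_file("bucket/data.csv", "/tmp/data.csv", version_id="abc123")

    asyncio.run(main())
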
datachain/catalog/catalog.py CHANGED
@@ -240,7 +240,8 @@ class DatasetRowsFetcher(NodesThreadPool):
 class NodeGroup:
     """Class for a group of nodes from the same source"""
 
-    listing: "Listing"
+    listing: Optional["Listing"]
+    client: "Client"
     sources: list[DataSource]
 
     # The source path within the bucket
@@ -268,9 +269,7 @@ class NodeGroup:
         Download this node group to cache.
         """
         if self.sources:
-            self.listing.client.fetch_nodes(
-                self.iternodes(recursive), shared_progress_bar=pbar
-            )
+            self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)
 
 
 def check_output_dataset_file(
@@ -375,7 +374,7 @@ def collect_nodes_for_cp(
 
     # Collect all sources to process
     for node_group in node_groups:
-        listing: Listing = node_group.listing
+        listing: Optional[Listing] = node_group.listing
         valid_sources: list[DataSource] = []
         for dsrc in node_group.sources:
            if dsrc.is_single_object():
@@ -383,6 +382,7 @@ def collect_nodes_for_cp(
                 total_files += 1
                 valid_sources.append(dsrc)
             else:
+                assert listing
                 node = dsrc.node
                 if not recursive:
                     print(f"{node.full_path} is a directory (not copied).")
@@ -433,37 +433,51 @@ def instantiate_node_groups(
     )
 
     output_dir = output
+    output_file = None
     if copy_to_filename:
         output_dir = os.path.dirname(output)
         if not output_dir:
             output_dir = "."
+        output_file = os.path.basename(output)
 
     # Instantiate these nodes
     for node_group in node_groups:
         if not node_group.sources:
             continue
-        listing: Listing = node_group.listing
+        listing: Optional[Listing] = node_group.listing
         source_path: str = node_group.source_path
 
         copy_dir_contents = always_copy_dir_contents or source_path.endswith("/")
-        instantiated_nodes = listing.collect_nodes_to_instantiate(
-            node_group.sources,
-            copy_to_filename,
-            recursive,
-            copy_dir_contents,
-            source_path,
-            node_group.is_edatachain,
-            node_group.is_dataset,
-        )
-        if not virtual_only:
-            listing.instantiate_nodes(
-                instantiated_nodes,
-                output_dir,
-                total_files,
-                force=force,
-                shared_progress_bar=instantiate_progress_bar,
+        if not listing:
+            source = node_group.sources[0]
+            client = source.client
+            node = NodeWithPath(source.node, [output_file or source.node.path])
+            instantiated_nodes = [node]
+            if not virtual_only:
+                node.instantiate(
+                    client, output_dir, instantiate_progress_bar, force=force
+                )
+        else:
+            instantiated_nodes = listing.collect_nodes_to_instantiate(
+                node_group.sources,
+                copy_to_filename,
+                recursive,
+                copy_dir_contents,
+                source_path,
+                node_group.is_edatachain,
+                node_group.is_dataset,
             )
+            if not virtual_only:
+                listing.instantiate_nodes(
+                    instantiated_nodes,
+                    output_dir,
+                    total_files,
+                    force=force,
+                    shared_progress_bar=instantiate_progress_bar,
+                )
+
         node_group.instantiated_nodes = instantiated_nodes
+
     if instantiate_progress_bar:
         instantiate_progress_bar.close()
 
@@ -592,7 +606,7 @@ class Catalog:
         client_config=None,
         object_name="file",
         skip_indexing=False,
-    ) -> tuple["Listing", str]:
+    ) -> tuple[Optional["Listing"], "Client", str]:
         from datachain.lib.dc import DataChain
         from datachain.listing import Listing
 
@@ -603,16 +617,19 @@ class Catalog:
         list_ds_name, list_uri, list_path, _ = get_listing(
             source, self.session, update=update
         )
+        lst = None
+        client = Client.get_client(list_uri, self.cache, **self.client_config)
+
+        if list_ds_name:
+            lst = Listing(
+                self.metastore.clone(),
+                self.warehouse.clone(),
+                client,
+                dataset_name=list_ds_name,
+                object_name=object_name,
+            )
 
-        lst = Listing(
-            self.metastore.clone(),
-            self.warehouse.clone(),
-            Client.get_client(list_uri, self.cache, **self.client_config),
-            dataset_name=list_ds_name,
-            object_name=object_name,
-        )
-
-        return lst, list_path
+        return lst, client, list_path
 
     def _remove_dataset_rows_and_warehouse_info(
         self, dataset: DatasetRecord, version: int, **kwargs
@@ -635,13 +652,13 @@ class Catalog:
     ) -> Optional[list["DataSource"]]:
         enlisted_sources = []
         for src in sources:  # Opt: parallel
-            listing, file_path = self.enlist_source(
+            listing, client, file_path = self.enlist_source(
                 src,
                 update,
                 client_config=client_config or self.client_config,
                 skip_indexing=skip_indexing,
             )
-            enlisted_sources.append((listing, file_path))
+            enlisted_sources.append((listing, client, file_path))
 
         if only_index:
             # sometimes we don't really need listing result (e.g on indexing process)
@@ -649,10 +666,16 @@ class Catalog:
             return None
 
         dsrc_all: list[DataSource] = []
-        for listing, file_path in enlisted_sources:
-            nodes = listing.expand_path(file_path)
-            dir_only = file_path.endswith("/")
-            dsrc_all.extend(DataSource(listing, node, dir_only) for node in nodes)
+        for listing, client, file_path in enlisted_sources:
+            if not listing:
+                nodes = [Node.from_file(client.get_file_info(file_path))]
+                dir_only = False
+            else:
+                nodes = listing.expand_path(file_path)
+                dir_only = file_path.endswith("/")
+            dsrc_all.extend(
+                DataSource(listing, client, node, dir_only) for node in nodes
+            )
         return dsrc_all
 
     def enlist_sources_grouped(
@@ -667,7 +690,7 @@ class Catalog:
 
         def _row_to_node(d: dict[str, Any]) -> Node:
            del d["file__source"]
-            return Node.from_dict(d)
+            return Node.from_row(d)
 
         enlisted_sources: list[tuple[bool, bool, Any]] = []
         client_config = client_config or self.client_config
@@ -677,7 +700,7 @@ class Catalog:
                 edatachain_data = parse_edatachain_file(src)
                 indexed_sources = []
                 for ds in edatachain_data:
-                    listing, source_path = self.enlist_source(
+                    listing, _, source_path = self.enlist_source(
                         ds["data-source"]["uri"],
                         update,
                         client_config=client_config,
@@ -701,6 +724,7 @@ class Catalog:
                 client = self.get_client(source, **client_config)
                 uri = client.uri
                 dataset_name, _, _, _ = get_listing(uri, self.session)
+                assert dataset_name
                 listing = Listing(
                     self.metastore.clone(),
                     self.warehouse.clone(),
@@ -713,6 +737,7 @@ class Catalog:
                 indexed_sources.append(
                     (
                         listing,
+                        client,
                         source,
                         [_row_to_node(r) for r in rows],
                         ds_name,
@@ -722,25 +747,28 @@ class Catalog:
 
                 enlisted_sources.append((False, True, indexed_sources))
             else:
-                listing, source_path = self.enlist_source(
+                listing, client, source_path = self.enlist_source(
                     src, update, client_config=client_config
                 )
-                enlisted_sources.append((False, False, (listing, source_path)))
+                enlisted_sources.append((False, False, (listing, client, source_path)))
 
         node_groups = []
         for is_datachain, is_dataset, payload in enlisted_sources:  # Opt: parallel
             if is_dataset:
                 for (
                     listing,
+                    client,
                     source_path,
                     nodes,
                     dataset_name,
                     dataset_version,
                 ) in payload:
-                    dsrc = [DataSource(listing, node) for node in nodes]
+                    assert listing
+                    dsrc = [DataSource(listing, client, node) for node in nodes]
                     node_groups.append(
                         NodeGroup(
                             listing,
+                            client,
                             dsrc,
                             source_path,
                             dataset_name=dataset_name,
@@ -749,18 +777,30 @@ class Catalog:
                     )
             elif is_datachain:
                 for listing, source_path, paths in payload:
-                    dsrc = [DataSource(listing, listing.resolve_path(p)) for p in paths]
+                    assert listing
+                    dsrc = [
+                        DataSource(listing, listing.client, listing.resolve_path(p))
+                        for p in paths
+                    ]
                     node_groups.append(
-                        NodeGroup(listing, dsrc, source_path, is_edatachain=True)
+                        NodeGroup(
+                            listing,
+                            listing.client,
+                            dsrc,
+                            source_path,
+                            is_edatachain=True,
+                        )
                     )
             else:
-                listing, source_path = payload
-                as_container = source_path.endswith("/")
-                dsrc = [
-                    DataSource(listing, n, as_container)
-                    for n in listing.expand_path(source_path, use_glob=not no_glob)
-                ]
-                node_groups.append(NodeGroup(listing, dsrc, source_path))
+                listing, client, source_path = payload
+                if not listing:
+                    nodes = [Node.from_file(client.get_file_info(source_path))]
+                    as_container = False
+                else:
+                    as_container = source_path.endswith("/")
+                    nodes = listing.expand_path(source_path, use_glob=not no_glob)
+                dsrc = [DataSource(listing, client, n, as_container) for n in nodes]
+                node_groups.append(NodeGroup(listing, client, dsrc, source_path))
 
         return node_groups
 
@@ -1196,10 +1236,16 @@ class Catalog:
 
         return q.to_db_records()
 
-    def signed_url(self, source: str, path: str, client_config=None) -> str:
+    def signed_url(
+        self,
+        source: str,
+        path: str,
+        version_id: Optional[str] = None,
+        client_config=None,
+    ) -> str:
         client_config = client_config or self.client_config
         client = Client.get_client(source, self.cache, **client_config)
-        return client.url(path)
+        return client.url(path, version_id=version_id)
 
     def export_dataset_table(
         self,
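
With the Catalog.signed_url change above, presigned URLs can now be pinned to an object version. A hedged usage sketch, assuming `get_catalog` from datachain.catalog and valid cloud credentials; bucket, key, and version id are all hypothetical:

    from datachain.catalog import get_catalog  # assumed import location

    catalog = get_catalog()
    url = catalog.signed_url(
        "s3://my-bucket",
        "images/cat.jpg",
        version_id="3sL4kqtJlcpXr0f3vjVBH40Nr8X8gdRQ",  # omit to sign the latest version
    )
    print(url)
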
datachain/catalog/datasource.py CHANGED
@@ -4,21 +4,19 @@ from datachain.node import DirType, NodeWithPath
 
 
 class DataSource:
-    def __init__(self, listing, node, as_container=False):
+    def __init__(self, listing, client, node, as_container=False):
         self.listing = listing
+        self.client = client
         self.node = node
         self.as_container = (
             as_container  # Indicates whether a .tar file is handled as a container
         )
 
-    def get_full_path(self):
-        return self.get_node_full_path(self.node)
-
     def get_node_full_path(self, node):
-        return self.listing.client.get_full_path(node.full_path)
+        return self.client.get_full_path(node.full_path)
 
     def get_node_full_path_from_path(self, full_path):
-        return self.listing.client.get_full_path(full_path)
+        return self.client.get_full_path(full_path)
 
     def is_single_object(self):
         return self.node.dir_type == DirType.FILE or (
datachain/client/azure.py CHANGED
@@ -1,4 +1,5 @@
-from typing import Any
+from typing import Any, Optional
+from urllib.parse import parse_qs, urlsplit, urlunsplit
 
 from adlfs import AzureBlobFileSystem
 from tqdm import tqdm
@@ -25,6 +26,16 @@ class AzureClient(Client):
             size=v.get("size", ""),
         )
 
+    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
+        """
+        Generate a signed URL for the given path.
+        """
+        version_id = kwargs.pop("version_id", None)
+        result = self.fs.sign(
+            self.get_full_path(path, version_id), expiration=expires, **kwargs
+        )
+        return result + (f"&versionid={version_id}" if version_id else "")
+
     async def _fetch_flat(self, start_prefix: str, result_queue: ResultQueue) -> None:
         prefix = start_prefix
         if prefix:
@@ -57,4 +68,13 @@ class AzureClient(Client):
         finally:
             result_queue.put_nowait(None)
 
+    @classmethod
+    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+        parts = list(urlsplit(path))
+        query = parse_qs(parts[3])
+        if "versionid" in query:
+            raise ValueError("path already includes a version query")
+        parts[3] = f"versionid={version_id}" if version_id else ""
+        return urlunsplit(parts)
+
     _fetch_default = _fetch_flat
datachain/client/fsspec.py CHANGED
@@ -137,6 +137,10 @@ class Client(ABC):
         fs.invalidate_cache()
         return fs
 
+    @classmethod
+    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+        return path
+
     @classmethod
     def from_name(
         cls,
@@ -198,17 +202,37 @@ class Client(ABC):
         return self._fs
 
     def url(self, path: str, expires: int = 3600, **kwargs) -> str:
-        return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
+        return self.fs.sign(
+            self.get_full_path(path, kwargs.pop("version_id", None)),
+            expiration=expires,
+            **kwargs,
+        )
 
     async def get_current_etag(self, file: "File") -> str:
-        info = await self.fs._info(self.get_full_path(file.path))
+        kwargs = {}
+        if self.fs.version_aware:
+            kwargs["version_id"] = file.version
+        info = await self.fs._info(
+            self.get_full_path(file.path, file.version), **kwargs
+        )
         return self.info_to_file(info, "").etag
 
+    def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
+        info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
+        return self.info_to_file(info, path)
+
+    async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
+        return await self.fs._size(
+            self.version_path(path, version_id), version_id=version_id
+        )
 
-    async def get_size(self, path: str) -> int:
-        return await self.fs._size(path)
-
-    async def get_file(self, lpath, rpath, callback):
-        return await self.fs._get_file(lpath, rpath, callback=callback)
+    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
+        return await self.fs._get_file(
+            self.version_path(lpath, version_id),
+            rpath,
+            callback=callback,
+            version_id=version_id,
+        )
 
     async def scandir(
         self, start_prefix: str, method: str = "default"
@@ -315,8 +339,8 @@ class Client(ABC):
     def rel_path(self, path: str) -> str:
         return self.fs.split_path(path)[1]
 
-    def get_full_path(self, rel_path: str) -> str:
-        return f"{self.PREFIX}{self.name}/{rel_path}"
+    def get_full_path(self, rel_path: str, version_id: Optional[str] = None) -> str:
+        return self.version_path(f"{self.PREFIX}{self.name}/{rel_path}", version_id)
 
     @abstractmethod
     def info_to_file(self, v: dict[str, Any], parent: str) -> "File": ...
@@ -362,7 +386,9 @@ class Client(ABC):
         if use_cache and (cache_path := self.cache.get_path(file)):
             return open(cache_path, mode="rb")
         assert not file.location
-        return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb)  # type: ignore[return-value]
+        return FileWrapper(
+            self.fs.open(self.get_full_path(file.path, file.version)), cb
+        )  # type: ignore[return-value]
 
     def download(self, file: "File", *, callback: Callback = DEFAULT_CALLBACK) -> None:
         sync(get_loop(), functools.partial(self._download, file, callback=callback))
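
The base class wires get_full_path through version_path as a template method: the default version_path is a no-op, and version-aware subclasses (s3, azure, gcs) override it to encode the version into the path. A self-contained sketch of that relationship (MiniClient is a stand-in, not the real class):

    from typing import Optional

    class MiniClient:
        PREFIX = "s3://"

        def __init__(self, name: str) -> None:
            self.name = name

        @classmethod
        def version_path(cls, path: str, version_id: Optional[str]) -> str:
            return path  # base behavior: versions are ignored

        def get_full_path(self, rel_path: str, version_id: Optional[str] = None) -> str:
            return self.version_path(f"{self.PREFIX}{self.name}/{rel_path}", version_id)

    print(MiniClient("bucket").get_full_path("a/b.txt", "v1"))  # -> s3://bucket/a/b.txt
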
datachain/client/gcs.py CHANGED
@@ -38,9 +38,13 @@ class GCSClient(Client):
         If the client is anonymous, a public URL is returned instead
         (see https://cloud.google.com/storage/docs/access-public-data#api-link).
         """
+        version_id = kwargs.pop("version_id", None)
         if self.fs.storage_options.get("token") == "anon":
-            return f"https://storage.googleapis.com/{self.name}/{path}"
-        return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
+            query = f"?generation={version_id}" if version_id else ""
+            return f"https://storage.googleapis.com/{self.name}/{path}{query}"
+        return self.fs.sign(
+            self.get_full_path(path, version_id), expiration=expires, **kwargs
+        )
 
     @staticmethod
     def parse_timestamp(timestamp: str) -> datetime:
@@ -131,3 +135,7 @@ class GCSClient(Client):
             last_modified=self.parse_timestamp(v["updated"]),
             size=v.get("size", ""),
         )
+
+    @classmethod
+    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+        return f"{path}#{version_id}" if version_id else path
datachain/client/local.py CHANGED
@@ -2,7 +2,7 @@ import os
 import posixpath
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Optional
 from urllib.parse import urlparse
 
 from fsspec.implementations.local import LocalFileSystem
@@ -105,10 +105,10 @@ class FileClient(Client):
         info = self.fs.info(self.get_full_path(file.path))
         return self.info_to_file(info, "").etag
 
-    async def get_size(self, path: str) -> int:
+    async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
         return self.fs.size(path)
 
-    async def get_file(self, lpath, rpath, callback):
+    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
         return self.fs.get_file(lpath, rpath, callback=callback)
 
     async def ls_dir(self, path):
@@ -117,7 +117,7 @@ class FileClient(Client):
     def rel_path(self, path):
         return posixpath.relpath(path, self.name)
 
-    def get_full_path(self, rel_path):
+    def get_full_path(self, rel_path, version_id: Optional[str] = None):
         full_path = Path(self.name, rel_path).as_posix()
         if rel_path.endswith("/") or not rel_path:
             full_path += "/"
datachain/client/s3.py CHANGED
@@ -1,5 +1,6 @@
 import asyncio
 from typing import Any, Optional, cast
+from urllib.parse import parse_qs, urlsplit, urlunsplit
 
 from botocore.exceptions import NoCredentialsError
 from s3fs import S3FileSystem
@@ -121,6 +122,15 @@ class ClientS3(Client):
             size=v["Size"],
         )
 
+    @classmethod
+    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+        parts = list(urlsplit(path))
+        query = parse_qs(parts[3])
+        if "versionId" in query:
+            raise ValueError("path already includes a version query")
+        parts[3] = f"versionId={version_id}" if version_id else ""
+        return urlunsplit(parts)
+
     async def _fetch_dir(
         self,
         prefix,
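
For comparison, the three backends encode a version id differently. Because version_path is a pure classmethod on each client, the mapping can be checked without any credentials (the version id "v1" is hypothetical):

    from datachain.client.azure import AzureClient
    from datachain.client.gcs import GCSClient
    from datachain.client.s3 import ClientS3

    assert ClientS3.version_path("s3://bucket/key", "v1") == "s3://bucket/key?versionId=v1"
    assert AzureClient.version_path("az://cont/blob", "v1") == "az://cont/blob?versionid=v1"
    assert GCSClient.version_path("gs://bucket/key", "v1") == "gs://bucket/key#v1"
    # The base Client.version_path (fsspec.py above) leaves the path untouched.
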
datachain/dataset.py CHANGED
@@ -92,6 +92,7 @@ class DatasetDependency:
             return self.name
 
         list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"), None, {})
+        assert list_dataset_name
         return list_dataset_name
 
     @classmethod
datachain/lib/dc.py CHANGED
@@ -32,7 +32,7 @@ from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_dat
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import ArrowRow, File, FileType, get_file_type
 from datachain.lib.file import ExportPlacement as FileExportPlacement
-from datachain.lib.listing import get_listing, list_bucket, ls
+from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
 from datachain.lib.listing_info import ListingInfo
 from datachain.lib.meta_formats import read_meta
 from datachain.lib.model_store import ModelStore
@@ -438,6 +438,18 @@ class DataChain:
             uri, session, update=update
         )
 
+        # ds_name is None if object is a file, we don't want to use cache
+        # or do listing in that case - just read that single object
+        if not list_ds_name:
+            dc = cls.from_values(
+                session=session,
+                settings=settings,
+                in_memory=in_memory,
+                file=[get_file_info(list_uri, cache, client_config=client_config)],
+            )
+            dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+            return dc
+
         if update or not list_ds_exists:
             (
                 cls.from_records(
@@ -1634,7 +1646,7 @@ class DataChain:
         output: OutputType = None,
         object_name: str = "",
         **fr_map,
-    ) -> "DataChain":
+    ) -> "Self":
         """Generate chain from list of values.
 
         Example:
@@ -1647,7 +1659,7 @@ class DataChain:
         def _func_fr() -> Iterator[tuple_type]:  # type: ignore[valid-type]
             yield from tuples
 
-        chain = DataChain.from_records(
+        chain = cls.from_records(
             DataChain.DEFAULT_FILE_RECORD,
             session=session,
             settings=settings,
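
The from_storage change above means a URI that points at a single object no longer goes through a listing dataset at all. A hedged sketch (hypothetical URI; a real bucket needs credentials):

    from datachain import DataChain

    # With 0.8.3, this skips listing and yields exactly one File row:
    chain = DataChain.from_storage("s3://my-bucket/data/one-file.parquet")
    print(chain.count())  # -> 1
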
datachain/lib/listing.py CHANGED
@@ -39,6 +39,15 @@ def list_bucket(uri: str, cache, client_config=None) -> Callable:
     return list_func
 
 
+def get_file_info(uri: str, cache, client_config=None) -> File:
+    """
+    Wrapper to return File object by its URI
+    """
+    client = Client.get_client(uri, cache, **(client_config or {}))  # type: ignore[arg-type]
+    _, path = Client.parse_url(uri)
+    return client.get_file_info(path)
+
+
 def ls(
     dc: D,
     path: str,
@@ -76,7 +85,7 @@ def ls(
     return dc.filter(pathfunc.parent(_file_c("path")) == path.lstrip("/").rstrip("/*"))
 
 
-def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
+def parse_listing_uri(uri: str, cache, client_config) -> tuple[Optional[str], str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
     """
@@ -85,7 +94,9 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
     storage_uri, path = Client.parse_url(uri)
     telemetry.log_param("client", client.PREFIX)
 
-    if uses_glob(path) or client.fs.isfile(uri):
+    if not uri.endswith("/") and client.fs.isfile(uri):
+        return None, f'{storage_uri}/{path.lstrip("/")}', path
+    if uses_glob(path):
         lst_uri_path = posixpath.dirname(path)
     else:
         storage_uri, path = Client.parse_url(f'{uri.rstrip("/")}/')
@@ -113,7 +124,7 @@ def listing_uri_from_name(dataset_name: str) -> str:
 
 def get_listing(
     uri: str, session: "Session", update: bool = False
-) -> tuple[str, str, str, bool]:
+) -> tuple[Optional[str], str, str, bool]:
     """Returns correct listing dataset name that must be used for saving listing
     operation. It takes into account existing listings and reusability of those.
     It also returns boolean saying if returned dataset name is reused / already
@@ -131,6 +142,10 @@ def get_listing(
     ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
     listing = None
 
+    # if we don't want to use cached dataset (e.g. for a single file listing)
+    if not ds_name:
+        return None, list_uri, list_path, False
+
     listings = [
         ls for ls in catalog.listings() if not ls.is_expired and ls.contains(ds_name)
     ]
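
The early return above changes the tuple contract of parse_listing_uri: a single file now yields no listing dataset name. A sketch of the two shapes (hypothetical URIs; the dataset-name prefix is assumed to follow datachain's "lst__" convention and is approximate):

    # parse_listing_uri("s3://bucket/dir/", cache, client_config)
    #   -> ("lst__s3://bucket/dir/", "s3://bucket/dir/", "dir/")   # directory: listed via a dataset
    # parse_listing_uri("s3://bucket/file.csv", cache, client_config)
    #   -> (None, "s3://bucket/file.csv", "file.csv")              # single file: no listing dataset
    #
    # get_listing() propagates the None name, and callers such as
    # Catalog.enlist_source and DataChain.from_storage then take the
    # direct-file path instead of listing.
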
datachain/listing.py CHANGED
@@ -157,11 +157,7 @@ class Listing:
 
         counter = 0
         for node in all_nodes:
-            dst = os.path.join(output, *node.path)
-            dst_dir = os.path.dirname(dst)
-            os.makedirs(dst_dir, exist_ok=True)
-            file = node.n.to_file(self.client.uri)
-            self.client.instantiate_object(file, dst, progress_bar, force)
+            node.instantiate(self.client, output, progress_bar, force=force)
             counter += 1
             if counter > 1000:
                 progress_bar.update(counter)
datachain/node.py CHANGED
@@ -1,3 +1,4 @@
+import os
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Optional
 
@@ -10,6 +11,8 @@ from datachain.utils import TIME_ZERO, time_to_str
 if TYPE_CHECKING:
     from typing_extensions import Self
 
+    from datachain.client import Client
+
 
 class DirType:
     FILE = 0
@@ -114,7 +117,21 @@ class Node:
     )
 
     @classmethod
-    def from_dict(cls, d: dict[str, Any], file_prefix: str = "file") -> "Self":
+    def from_file(cls, f: File) -> "Self":
+        return cls(
+            source=StorageURI(f.source),
+            path=f.path,
+            etag=f.etag,
+            is_latest=f.is_latest,
+            size=f.size,
+            last_modified=f.last_modified,
+            version=f.version,
+            location=str(f.location) if f.location else None,
+            dir_type=DirType.FILE,
+        )
+
+    @classmethod
+    def from_row(cls, d: dict[str, Any], file_prefix: str = "file") -> "Self":
         def _dval(field_name: str):
             return d.get(f"{file_prefix}__{field_name}")
 
@@ -174,6 +191,15 @@ class NodeWithPath:
             path += "/"
         return path
 
+    def instantiate(
+        self, client: "Client", output: str, progress_bar, *, force: bool = False
+    ):
+        dst = os.path.join(output, *self.path)
+        dst_dir = os.path.dirname(dst)
+        os.makedirs(dst_dir, exist_ok=True)
+        file = self.n.to_file(client.uri)
+        client.instantiate_object(file, dst, progress_bar, force)
+
 
 TIME_FMT = "%Y-%m-%d %H:%M"
 
datachain/query/session.py CHANGED
@@ -55,7 +55,7 @@ class Session:
         client_config: Optional[dict] = None,
         in_memory: bool = False,
     ):
-        if re.match(r"^[0-9a-zA-Z]+$", name) is None:
+        if re.match(r"^[0-9a-zA-Z]*$", name) is None:
             raise ValueError(
                 f"Session name can contain only letters or numbers - '{name}' given."
             )
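
The quantifier change from + to * means an empty session name now passes validation (presumably so a generated default name can be used; that reading is an assumption based on the surrounding code). Non-alphanumeric names are still rejected:

    import re

    pattern = re.compile(r"^[0-9a-zA-Z]*$")   # was r"^[0-9a-zA-Z]+$"
    assert pattern.match("")                  # empty name is now accepted
    assert pattern.match("local123")
    assert not pattern.match("bad-name")      # separators still fail
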
datachain-0.8.2.dist-info/METADATA → datachain-0.8.3.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.8.2
+Version: 0.8.3
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -72,7 +72,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
 Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
-Requires-Dist: pytest-servers[all]>=0.5.8; extra == "tests"
+Requires-Dist: pytest-servers[all]>=0.5.9; extra == "tests"
 Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
 Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
 Requires-Dist: virtualenv; extra == "tests"
datachain-0.8.2.dist-info/RECORD → datachain-0.8.3.dist-info/RECORD RENAMED
@@ -1,15 +1,15 @@
 datachain/__init__.py,sha256=ofPJ6B-d-ybSDRrE7J6wqF_ZRAB2W9U8l-eeuBtqPLg,865
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=5aKrjnUxk0mtnZeFKNJd1DCE0MsnSoyJBZkr0y9H_a0,9313
-datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
+datachain/cache.py,sha256=4xI0Ct2gVXuLZPqKdbjmfb_KD2klou-9WnL1WNhIuCA,3077
 datachain/cli.py,sha256=gNXVoMfKINUhKjOpYN48tpyNBK13M0hkQWqra4jNSJQ,43137
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
-datachain/dataset.py,sha256=P-pDBgvPqJGDhq_I7fwCfb6hY8E8mIAO8Q0NT7SNlNE,19128
+datachain/dataset.py,sha256=5HtqZBRaaToa_C74g62bACjBaCRf2Y6BDgIACLhK1ZA,19161
 datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
-datachain/listing.py,sha256=TgKg25ZWAP5enzKgw2_2GUPJVdnQUh6uySHB5SJrUY4,7773
-datachain/node.py,sha256=o8Sqy92QkzzcLK6XmIFLyDSE6Rw6kUTmGRhEmfLFdhg,5211
+datachain/listing.py,sha256=WdiWMVa0xZ-LtR3SJ0gFLgYUI6VaLI0DSEE_KvfikXs,7582
+datachain/node.py,sha256=HSpjBUBQBWXUUpbUEq839dsSc5KR2O8ww1Udl4jQemY,6023
 datachain/nodes_fetcher.py,sha256=ILMzUW5o4_6lUOVrLDC9gJPCXfcgKnMG68plrc7dAOA,1113
 datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
 datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
@@ -18,17 +18,17 @@ datachain/studio.py,sha256=BegIXunW1n-sZtHSe3a30Mw2MXexVGRn_GU-OzjRRKM,8725
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=9iP8RGm3MHemj13qt1VxRGYAsA6v-627M22o0fr76_M,13906
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=CacK-tfgMM-ZpE0cW7Rfosx1aqXV0shyUy0TfHZnBOQ,58385
-datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
+datachain/catalog/catalog.py,sha256=ixXJKftUIG_ZBPdie1dJAPPHddWV6HZwb3GO-TRHtxY,60103
+datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
 datachain/catalog/loader.py,sha256=HA_mBC7q_My8j2WnSvIjUGuJpl6SIdg5vvy_lagxJlA,5733
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
-datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
+datachain/client/azure.py,sha256=D-mfLtpiq6O-DaSs-ofEEYhjIZBNfgRw1l9R7UgxEM4,3055
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=kf1blSGNcEXJ0tra3y5i35jc1aAy-67wMHXkqjlRMXg,12736
-datachain/client/gcs.py,sha256=tAm5CCO86UNuSwTCHVPOiPbj1fBhnEYDoEVLKvv9H5I,4632
+datachain/client/fsspec.py,sha256=rr6-M1iu30x8PAXpOD84U2Vh4CHU0-SdfJFdVZF3ouA,13650
+datachain/client/gcs.py,sha256=MI94GXpCRqAlaF56HNrzQbXA-yR7bn2FOBPzO-lG_SI,4947
 datachain/client/hf.py,sha256=XeVJVbiNViZCpn3sfb90Fr8SYO3BdLmfE3hOWMoqInE,951
-datachain/client/local.py,sha256=f2HBqWH8SQM5CyiJ0ljfePVROg2FszWaAn6E2c8RiLE,4596
-datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
+datachain/client/local.py,sha256=iHQKh-HhoNzqZ2yaiuIfZWGXtt_X9FMSA-TN_03zjPc,4708
+datachain/client/s3.py,sha256=67XISS6tW9bnhlbRtKJEAYd_JQvtLHqdPBxm8ySrJl8,6440
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
@@ -53,12 +53,12 @@ datachain/lib/arrow.py,sha256=33Od_XECCfWR9PUDBdevSooXS4mpMdPx_hoMLjpaELU,9734
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
 datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
-datachain/lib/dc.py,sha256=dLBvM_Fr26AQBD3t_-oLSymLyoJQxmkJQs3cC-mZCu8,90492
+datachain/lib/dc.py,sha256=UhyNLYVuCPJPz-EamMVPFjYwzJzbFrDzXvb07PscykI,91015
 datachain/lib/diff.py,sha256=Yurzyi7PzZzY80HOnVTpwtbWzSJ1LqN8NgZWwZOh_UU,6732
 datachain/lib/file.py,sha256=KeccxOulTQCLitdHZoTaq96xpE-5kmWZCrT9X9bRkD0,15049
 datachain/lib/hf.py,sha256=a-zFpDmZIR4r8dlNNTjfpAKSnuJ9xyRXlgcdENiXt3E,5864
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
-datachain/lib/listing.py,sha256=C-JmWMJcErbjNJ7ygwaXjP7Ak3nS-MjYTgSn7vvkICg,5536
+datachain/lib/listing.py,sha256=8OPAJZbjPIGQ7qJPyfJEI1s9j9tP0GkKfyHebjQxPx0,6092
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
 datachain/lib/meta_formats.py,sha256=hDPfEkcmiLZOjhBBXuareMdnq65Wj8vZvxjmum6cROM,6377
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
@@ -95,7 +95,7 @@ datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,93
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
-datachain/query/session.py,sha256=09FtSS3cvfRv2iBQZcbCBMIiRywv7Guhy5nmLEiltq4,5998
+datachain/query/session.py,sha256=fQAtl5zRESRDfRS2d5J9KgrWauunCtrd96vP4Ns1KlE,5998
 datachain/query/udf.py,sha256=GY8E9pnzPE7ZKl_jvetZpn9R2rlUtMlhoYj4UmrzFzw,594
 datachain/query/utils.py,sha256=u0A_BwG9PNs0DxoDcvSWgWLpj3ByTUv8CqH13CIuGag,1293
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -121,9 +121,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.8.2.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.8.2.dist-info/METADATA,sha256=MFVRJVJBLh_Cq3aV_1V4dHKkf15HTZHLUWWQNbRId3I,11066
-datachain-0.8.2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-datachain-0.8.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.8.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.8.2.dist-info/RECORD,,
+datachain-0.8.3.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.8.3.dist-info/METADATA,sha256=gAPCEMlRQirhIDHK61LPuF1NNNaZQxwMlTVG-8fZDnM,11066
+datachain-0.8.3.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+datachain-0.8.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.8.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.8.3.dist-info/RECORD,,