datachain 0.14.0__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic; see the release details for more information.

@@ -588,7 +588,7 @@ class Catalog:
588
588
 
589
589
  from_storage(
590
590
  source, session=self.session, update=update, object_name=object_name
591
- )
591
+ ).exec()
592
592
 
593
593
  list_ds_name, list_uri, list_path, _ = get_listing(
594
594
  source, self.session, update=update
@@ -89,9 +89,9 @@ class Client(ABC):
89
89
  from .local import FileClient
90
90
  from .s3 import ClientS3
91
91
 
92
- protocol = urlparse(str(url)).scheme
92
+ protocol = urlparse(os.fspath(url)).scheme
93
93
 
94
- if not protocol or _is_win_local_path(str(url)):
94
+ if not protocol or _is_win_local_path(os.fspath(url)):
95
95
  return FileClient
96
96
  if protocol == ClientS3.protocol:
97
97
  return ClientS3
@@ -122,7 +122,7 @@ class Client(ABC):
122
122
  source: Union[str, os.PathLike[str]], cache: Cache, **kwargs
123
123
  ) -> "Client":
124
124
  cls = Client.get_implementation(source)
125
- storage_url, _ = cls.split_url(str(source))
125
+ storage_url, _ = cls.split_url(os.fspath(source))
126
126
  if os.name == "nt":
127
127
  storage_url = storage_url.removeprefix("/")
128
128
 
datachain/lib/dc/json.py CHANGED
@@ -64,7 +64,7 @@ def from_json(
64
64
  from .storage import from_storage
65
65
 
66
66
  if schema_from == "auto":
67
- schema_from = str(path)
67
+ schema_from = os.fspath(path)
68
68
 
69
69
  def jmespath_to_name(s: str):
70
70
  name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s) # type: ignore[union-attr]
@@ -6,11 +6,15 @@ from typing import (
6
6
  )
7
7
 
8
8
  from datachain.lib.file import (
9
- File,
10
9
  FileType,
11
10
  get_file_type,
12
11
  )
13
- from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
12
+ from datachain.lib.listing import (
13
+ get_file_info,
14
+ get_listing,
15
+ list_bucket,
16
+ ls,
17
+ )
14
18
  from datachain.query import Session
15
19
 
16
20
  if TYPE_CHECKING:
@@ -18,7 +22,7 @@ if TYPE_CHECKING:
18
22
 
19
23
 
20
24
  def from_storage(
21
- uri: Union[str, os.PathLike[str]],
25
+ uri: Union[str, os.PathLike[str], list[str], list[os.PathLike[str]]],
22
26
  *,
23
27
  type: FileType = "binary",
24
28
  session: Optional[Session] = None,
@@ -30,11 +34,12 @@ def from_storage(
30
34
  anon: bool = False,
31
35
  client_config: Optional[dict] = None,
32
36
  ) -> "DataChain":
33
- """Get data from a storage as a list of file with all file attributes.
37
+ """Get data from storage(s) as a list of file with all file attributes.
34
38
  It returns the chain itself as usual.
35
39
 
36
40
  Parameters:
37
- uri : storage URI with directory. URI must start with storage prefix such
41
+ uri : storage URI with directory or list of URIs.
42
+ URIs must start with storage prefix such
38
43
  as `s3://`, `gs://`, `az://` or "file:///"
39
44
  type : read file as "binary", "text", or "image" data. Default is "binary".
40
45
  recursive : search recursively for the given path.
@@ -43,16 +48,26 @@ def from_storage(
43
48
  anon : If True, we will treat cloud bucket as public one
44
49
  client_config : Optional client configuration for the storage client.
45
50
 
46
- Example:
47
- Simple call from s3
48
- ```py
51
+ Returns:
52
+ DataChain: A DataChain object containing the file information.
53
+
54
+ Examples:
55
+ Simple call from s3:
56
+ ```python
49
57
  import datachain as dc
50
58
  chain = dc.from_storage("s3://my-bucket/my-dir")
51
59
  ```
52
60
 
53
- With AWS S3-compatible storage
54
- ```py
55
- import datachain as dc
61
+ Multiple URIs:
62
+ ```python
63
+ chain = dc.from_storage([
64
+ "s3://bucket1/dir1",
65
+ "s3://bucket2/dir2"
66
+ ])
67
+ ```
68
+
69
+ With AWS S3-compatible storage:
70
+ ```python
56
71
  chain = dc.from_storage(
57
72
  "s3://my-bucket/my-dir",
58
73
  client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
@@ -62,9 +77,15 @@ def from_storage(
62
77
  Pass existing session
63
78
  ```py
64
79
  session = Session.get()
65
- import datachain as dc
66
- chain = dc.from_storage("s3://my-bucket/my-dir", session=session)
80
+ chain = dc.from_storage([
81
+ "path/to/dir1",
82
+ "path/to/dir2"
83
+ ], session=session, recursive=True)
67
84
  ```
85
+
86
+ Note:
87
+ When using multiple URIs with `update=True`, the function optimizes by
88
+ avoiding redundant updates for URIs pointing to the same storage location.
68
89
  """
69
90
  from .datachain import DataChain
70
91
  from .datasets import from_dataset
@@ -79,40 +100,71 @@ def from_storage(
79
100
  cache = session.catalog.cache
80
101
  client_config = session.catalog.client_config
81
102
 
82
- list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
83
- uri, session, update=update
84
- )
103
+ uris = uri if isinstance(uri, (list, tuple)) else [uri]
104
+
105
+ if not uris:
106
+ raise ValueError("No URIs provided")
107
+
108
+ storage_chain = None
109
+ listed_ds_name = set()
110
+ file_values = []
111
+
112
+ for single_uri in uris:
113
+ list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
114
+ single_uri, session, update=update
115
+ )
116
+
117
+ # list_ds_name is None if object is a file, we don't want to use cache
118
+ # or do listing in that case - just read that single object
119
+ if not list_ds_name:
120
+ file_values.append(
121
+ get_file_info(list_uri, cache, client_config=client_config)
122
+ )
123
+ continue
124
+
125
+ dc = from_dataset(list_ds_name, session=session, settings=settings)
126
+ dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
85
127
 
86
- # ds_name is None if object is a file, we don't want to use cache
87
- # or do listing in that case - just read that single object
88
- if not list_ds_name:
89
- dc = from_values(
128
+ if update or not list_ds_exists:
129
+
130
+ def lst_fn(ds_name, lst_uri):
131
+ # disable prefetch for listing, as it pre-downloads all files
132
+ (
133
+ from_records(
134
+ DataChain.DEFAULT_FILE_RECORD,
135
+ session=session,
136
+ settings=settings,
137
+ in_memory=in_memory,
138
+ )
139
+ .settings(prefetch=0)
140
+ .gen(
141
+ list_bucket(lst_uri, cache, client_config=client_config),
142
+ output={f"{object_name}": file_type},
143
+ )
144
+ .save(ds_name, listing=True)
145
+ )
146
+
147
+ dc._query.add_before_steps(
148
+ lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
149
+ )
150
+
151
+ chain = ls(dc, list_path, recursive=recursive, object_name=object_name)
152
+
153
+ storage_chain = storage_chain.union(chain) if storage_chain else chain
154
+ listed_ds_name.add(list_ds_name)
155
+
156
+ if file_values:
157
+ file_chain = from_values(
90
158
  session=session,
91
159
  settings=settings,
92
160
  in_memory=in_memory,
93
- file=[get_file_info(list_uri, cache, client_config=client_config)],
161
+ file=file_values,
94
162
  )
95
- dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
96
- return dc
97
-
98
- if update or not list_ds_exists:
99
- # disable prefetch for listing, as it pre-downloads all files
100
- (
101
- from_records(
102
- DataChain.DEFAULT_FILE_RECORD,
103
- session=session,
104
- settings=settings,
105
- in_memory=in_memory,
106
- )
107
- .settings(prefetch=0)
108
- .gen(
109
- list_bucket(list_uri, cache, client_config=client_config),
110
- output={f"{object_name}": File},
111
- )
112
- .save(list_ds_name, listing=True)
163
+ file_chain.signals_schema = file_chain.signals_schema.mutate(
164
+ {f"{object_name}": file_type}
113
165
  )
166
+ storage_chain = storage_chain.union(file_chain) if storage_chain else file_chain
114
167
 
115
- dc = from_dataset(list_ds_name, session=session, settings=settings)
116
- dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
168
+ assert storage_chain is not None
117
169
 
118
- return ls(dc, list_path, recursive=recursive, object_name=object_name)
170
+ return storage_chain
@@ -47,6 +47,7 @@ from datachain.error import (
47
47
  QueryScriptCancelError,
48
48
  )
49
49
  from datachain.func.base import Function
50
+ from datachain.lib.listing import is_listing_dataset
50
51
  from datachain.lib.udf import UDFAdapter, _get_cache
51
52
  from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
52
53
  from datachain.query.schema import C, UDFParamSpec, normalize_param
@@ -151,13 +152,6 @@ def step_result(
151
152
  )
152
153
 
153
154
 
154
- class StartingStep(ABC):
155
- """An initial query processing step, referencing a data source."""
156
-
157
- @abstractmethod
158
- def apply(self) -> "StepResult": ...
159
-
160
-
161
155
  @frozen
162
156
  class Step(ABC):
163
157
  """A query processing step (filtering, mutation, etc.)"""
@@ -170,7 +164,7 @@ class Step(ABC):
170
164
 
171
165
 
172
166
  @frozen
173
- class QueryStep(StartingStep):
167
+ class QueryStep:
174
168
  catalog: "Catalog"
175
169
  dataset_name: str
176
170
  dataset_version: int
@@ -1097,26 +1091,42 @@ class DatasetQuery:
1097
1091
  self.temp_table_names: list[str] = []
1098
1092
  self.dependencies: set[DatasetDependencyType] = set()
1099
1093
  self.table = self.get_table()
1100
- self.starting_step: StartingStep
1094
+ self.starting_step: Optional[QueryStep] = None
1101
1095
  self.name: Optional[str] = None
1102
1096
  self.version: Optional[int] = None
1103
1097
  self.feature_schema: Optional[dict] = None
1104
1098
  self.column_types: Optional[dict[str, Any]] = None
1099
+ self.before_steps: list[Callable] = []
1105
1100
 
1106
- self.name = name
1101
+ self.list_ds_name: Optional[str] = None
1107
1102
 
1108
- if fallback_to_studio and is_token_set():
1109
- ds = self.catalog.get_dataset_with_remote_fallback(name, version)
1103
+ self.name = name
1104
+ self.dialect = self.catalog.warehouse.db.dialect
1105
+ if version:
1106
+ self.version = version
1107
+
1108
+ if is_listing_dataset(name):
1109
+ # not setting query step yet as listing dataset might not exist at
1110
+ # this point
1111
+ self.list_ds_name = name
1112
+ elif fallback_to_studio and is_token_set():
1113
+ self._set_starting_step(
1114
+ self.catalog.get_dataset_with_remote_fallback(name, version)
1115
+ )
1110
1116
  else:
1111
- ds = self.catalog.get_dataset(name)
1117
+ self._set_starting_step(self.catalog.get_dataset(name))
1118
+
1119
+ def _set_starting_step(self, ds: "DatasetRecord") -> None:
1120
+ if not self.version:
1121
+ self.version = ds.latest_version
1112
1122
 
1113
- self.version = version or ds.latest_version
1123
+ self.starting_step = QueryStep(self.catalog, ds.name, self.version)
1124
+
1125
+ # at this point we know our starting dataset so setting up schemas
1114
1126
  self.feature_schema = ds.get_version(self.version).feature_schema
1115
1127
  self.column_types = copy(ds.schema)
1116
1128
  if "sys__id" in self.column_types:
1117
1129
  self.column_types.pop("sys__id")
1118
- self.starting_step = QueryStep(self.catalog, name, self.version)
1119
- self.dialect = self.catalog.warehouse.db.dialect
1120
1130
 
1121
1131
  def __iter__(self):
1122
1132
  return iter(self.db_results())
@@ -1180,11 +1190,23 @@ class DatasetQuery:
1180
1190
  col.table = self.table
1181
1191
  return col
1182
1192
 
1193
+ def add_before_steps(self, fn: Callable) -> None:
1194
+ """
1195
+ Setting custom function to be run before applying steps
1196
+ """
1197
+ self.before_steps.append(fn)
1198
+
1183
1199
  def apply_steps(self) -> QueryGenerator:
1184
1200
  """
1185
1201
  Apply the steps in the query and return the resulting
1186
1202
  sqlalchemy.SelectBase.
1187
1203
  """
1204
+ for fn in self.before_steps:
1205
+ fn()
1206
+
1207
+ if self.list_ds_name:
1208
+ # at this point we know what is our starting listing dataset name
1209
+ self._set_starting_step(self.catalog.get_dataset(self.list_ds_name)) # type: ignore [arg-type]
1188
1210
  query = self.clone()
1189
1211
 
1190
1212
  index = os.getenv("DATACHAIN_QUERY_CHUNK_INDEX", self._chunk_index)
@@ -1203,6 +1225,7 @@ class DatasetQuery:
1203
1225
  query = query.filter(C.sys__rand % total == index)
1204
1226
  query.steps = query.steps[-1:] + query.steps[:-1]
1205
1227
 
1228
+ assert query.starting_step
1206
1229
  result = query.starting_step.apply()
1207
1230
  self.dependencies.update(result.dependencies)
1208
1231
 
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.14.0
3
+ Version: 0.14.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
- License: Apache-2.0
6
+ License-Expression: Apache-2.0
7
7
  Project-URL: Documentation, https://datachain.dvc.ai
8
8
  Project-URL: Issues, https://github.com/iterative/datachain/issues
9
9
  Project-URL: Source, https://github.com/iterative/datachain
@@ -17,7 +17,7 @@ datachain/studio.py,sha256=9MEpFPLKI3gG4isKklcfD5BMLeNsSXhtOUboOjW4Fdc,10017
17
17
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
18
18
  datachain/utils.py,sha256=CLAYkI7iPbLYw3Pjh5EkWuc2UOs8wEbuXQnqIs4UyV8,14173
19
19
  datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
20
- datachain/catalog/catalog.py,sha256=6dDTbSom8JzxLD_cbFboKtsiYtGR5WIOEOQTtCQ5mws,60722
20
+ datachain/catalog/catalog.py,sha256=FGW2cEOysgVMyokqIFAJ1PB-RYJrqDEFGfHP5qLYO-k,60729
21
21
  datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
22
22
  datachain/catalog/loader.py,sha256=AhSQR_-S-9lY3DcXn3PVZv9UtarHOMlDy2x75iDwUjo,6035
23
23
  datachain/cli/__init__.py,sha256=YPVkuQ7IezNhtzo5xrfca1hEIiZtFxOlJCOzAOEuxmA,8335
@@ -37,7 +37,7 @@ datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI
37
37
  datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
38
38
  datachain/client/azure.py,sha256=ma6fJcnveG8wpNy1PSrN5hgvmRdCj8Sf3RKjfd3qCyM,3221
39
39
  datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
40
- datachain/client/fsspec.py,sha256=VutCpF8MDisDwdnJvJpiTuDU9BRRAa0Km3ZkD0sKaI0,13834
40
+ datachain/client/fsspec.py,sha256=UJ7PDq1F11gf7OMjfXYqzrS1GHL3FZctOwXI0S_LU74,13852
41
41
  datachain/client/gcs.py,sha256=tepsstv-6WkkJ16SVXIPKPlWdNyFlTqrUlDwulWlWGQ,5116
42
42
  datachain/client/hf.py,sha256=posnI5WOKOMG1yY_ZiV9Orcd24QsUPKZlOXgJVLxxrM,1558
43
43
  datachain/client/local.py,sha256=cGoCYflribzexiOe-Y1qbaE2fJRh-_EgQrfCSa0yK_E,4568
@@ -99,12 +99,12 @@ datachain/lib/dc/csv.py,sha256=OaVHYnOZiYEfsUcispXuGcIYQKF03u4XrRf6Fgce6Kk,4401
99
99
  datachain/lib/dc/datachain.py,sha256=NdGCRNk3NZCGQHs-sq0jiKkvsXiowiqDQTY_X4AbL6o,76390
100
100
  datachain/lib/dc/datasets.py,sha256=0vdgNpA_xakFgnfm78I1yU98u2hvOawOXS872pg2F48,4329
101
101
  datachain/lib/dc/hf.py,sha256=F_ME1IpUlQfhqVGe__Uz7jLwd-fp-O7pu50OLhkaG0w,2170
102
- datachain/lib/dc/json.py,sha256=gVH69oP8b5FR1YX3c_4Z_G1nFsAQ_xFz6fBg0J-U9ak,2719
102
+ datachain/lib/dc/json.py,sha256=mlrqsmxLDYNP7dmde3IDYP01QlbUzP8Pj5UDqlqJcZ0,2725
103
103
  datachain/lib/dc/listings.py,sha256=c2ASPhwRhPDMbA5esYp3kMVw6sQ7vsWEflHWh9x7tkw,1044
104
104
  datachain/lib/dc/pandas.py,sha256=eteVB6DqRGAU2tDF_Bep7JRU4nny3uyVPbGKOZ6PVq0,1249
105
105
  datachain/lib/dc/parquet.py,sha256=tO0rDL3XZ24rqkUJYAYn_yAyZgIYV5N6r28MTlPE0Z0,1809
106
106
  datachain/lib/dc/records.py,sha256=zV4vPJvCEd5mBv-E_q-VfrSXNjcfu74QY884z3QuftM,2524
107
- datachain/lib/dc/storage.py,sha256=PIz6K2VOtrVV7XUNd3BESp3P5WovgaG1RgBYut0OBNA,3789
107
+ datachain/lib/dc/storage.py,sha256=mIAlNEYRJ8r3yHA2sJyt8duwuSfehbPro7WqMQvezIc,5295
108
108
  datachain/lib/dc/utils.py,sha256=Ct-0FqCaDhNWHx09gJFcCXJGPjMI-VZr4t-GJyqTi44,3984
109
109
  datachain/lib/dc/values.py,sha256=PLBZew0BYO3mv7W3n8OF5Ad-5tp5eWPqlbiVxG5pJ30,1409
110
110
  datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
@@ -118,7 +118,7 @@ datachain/model/ultralytics/pose.py,sha256=gXAWfAk4OWZl93hKcQPKZvqJa3nIrECB4RM8K
118
118
  datachain/model/ultralytics/segment.py,sha256=koq1HASo29isf0in6oSlzmU4IzsmOXe87F1ajQQVfh4,2911
119
119
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
120
120
  datachain/query/batch.py,sha256=6w8gzLTmLeylststu-gT5jIqEfi4-djS7_yTYyeo-fw,4190
121
- datachain/query/dataset.py,sha256=J3NgcrzSP2dFg8JVqDodyBh1QEia_B-alcyfI3xKlZE,57256
121
+ datachain/query/dataset.py,sha256=Em5vfKkZygzXCiWRYUBGLSh3eWlIamMBvh328YNnmww,58201
122
122
  datachain/query/dispatch.py,sha256=_1vjeQ1wjUoxlik55k0JkWqQCUfMjgVWmEOyWRkx0dU,12437
123
123
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
124
124
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -150,9 +150,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
150
150
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
151
151
  datachain/toolkit/split.py,sha256=VdcP_zVLqAxuSrze3BaR-dBzTmyKkCUAiAremw3OEPU,2914
152
152
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
153
- datachain-0.14.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
154
- datachain-0.14.0.dist-info/METADATA,sha256=lC1I5lSWJX7a9oNpsRnEOM_L1W3hfnY8Op7iGWaNNcM,11324
155
- datachain-0.14.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
156
- datachain-0.14.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
157
- datachain-0.14.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
158
- datachain-0.14.0.dist-info/RECORD,,
153
+ datachain-0.14.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
154
+ datachain-0.14.1.dist-info/METADATA,sha256=UPk0v7fsYz_eTsJf5YpexjD4jrjpWsKEyAVNSXN3KvE,11335
155
+ datachain-0.14.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
156
+ datachain-0.14.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
157
+ datachain-0.14.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
158
+ datachain-0.14.1.dist-info/RECORD,,