datachain 0.14.0__py3-none-any.whl → 0.14.2__py3-none-any.whl


@@ -15,7 +15,7 @@ if TYPE_CHECKING:
 P = ParamSpec("P")
 
 
-def from_parquet(
+def read_parquet(
     path,
     partitioning: Any = "hive",
     output: Optional[dict[str, DataType]] = None,
@@ -43,18 +43,18 @@ def from_parquet(
         Reading a single file:
         ```py
         import datachain as dc
-        dc.from_parquet("s3://mybucket/file.parquet")
+        dc.read_parquet("s3://mybucket/file.parquet")
         ```
 
         Reading a partitioned dataset from a directory:
         ```py
         import datachain as dc
-        dc.from_parquet("s3://mybucket/dir")
+        dc.read_parquet("s3://mybucket/dir")
         ```
     """
-    from .storage import from_storage
+    from .storage import read_storage
 
-    chain = from_storage(path, session=session, settings=settings, **kwargs)
+    chain = read_storage(path, session=session, settings=settings, **kwargs)
     return chain.parse_tabular(
         output=output,
         object_name=object_name,
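
For API consumers this first change is a pure rename: `dc.from_parquet` becomes `dc.read_parquet` with the same parameters, as the updated docstring examples show. A minimal migration sketch (the bucket path is illustrative):

```python
import datachain as dc

# 0.14.0:
# chain = dc.from_parquet("s3://mybucket/file.parquet", partitioning="hive")

# 0.14.2, same arguments, new name:
chain = dc.read_parquet("s3://mybucket/file.parquet", partitioning="hive")
```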
@@ -21,7 +21,7 @@ if TYPE_CHECKING:
 P = ParamSpec("P")
 
 
-def from_records(
+def read_records(
     to_insert: Optional[Union[dict, list[dict]]],
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
@@ -40,10 +40,10 @@ def from_records(
     Example:
         ```py
         import datachain as dc
-        single_record = dc.from_records(dc.DEFAULT_FILE_RECORD)
+        single_record = dc.read_records(dc.DEFAULT_FILE_RECORD)
         ```
     """
-    from .datasets import from_dataset
+    from .datasets import read_dataset
 
     session = Session.get(session, in_memory=in_memory)
     catalog = session.catalog
@@ -87,4 +87,4 @@ def from_records(
     insert_q = dr.get_table().insert()
     for record in to_insert:
         db.execute(insert_q.values(**record))
-    return from_dataset(name=dsr.name, session=session, settings=settings)
+    return read_dataset(name=dsr.name, session=session, settings=settings)
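
`from_records` follows the same pattern. A short sketch mirroring the docstring example above:

```python
import datachain as dc

# Build a chain from explicit record dicts (was dc.from_records in 0.14.0)
single_record = dc.read_records(dc.DEFAULT_FILE_RECORD)
```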
@@ -6,19 +6,23 @@ from typing import (
 )
 
 from datachain.lib.file import (
-    File,
     FileType,
     get_file_type,
 )
-from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
+from datachain.lib.listing import (
+    get_file_info,
+    get_listing,
+    list_bucket,
+    ls,
+)
 from datachain.query import Session
 
 if TYPE_CHECKING:
     from .datachain import DataChain
 
 
-def from_storage(
-    uri: Union[str, os.PathLike[str]],
+def read_storage(
+    uri: Union[str, os.PathLike[str], list[str], list[os.PathLike[str]]],
     *,
     type: FileType = "binary",
     session: Optional[Session] = None,
@@ -30,11 +34,12 @@ def from_storage(
     anon: bool = False,
     client_config: Optional[dict] = None,
 ) -> "DataChain":
-    """Get data from a storage as a list of file with all file attributes.
+    """Get data from storage(s) as a list of file with all file attributes.
     It returns the chain itself as usual.
 
     Parameters:
-        uri : storage URI with directory. URI must start with storage prefix such
+        uri : storage URI with directory or list of URIs.
+            URIs must start with storage prefix such
             as `s3://`, `gs://`, `az://` or "file:///"
         type : read file as "binary", "text", or "image" data. Default is "binary".
         recursive : search recursively for the given path.
@@ -43,17 +48,27 @@ def from_storage(
         anon : If True, we will treat cloud bucket as public one
         client_config : Optional client configuration for the storage client.
 
-    Example:
-        Simple call from s3
-        ```py
+    Returns:
+        DataChain: A DataChain object containing the file information.
+
+    Examples:
+        Simple call from s3:
+        ```python
         import datachain as dc
-        chain = dc.from_storage("s3://my-bucket/my-dir")
+        chain = dc.read_storage("s3://my-bucket/my-dir")
         ```
 
-        With AWS S3-compatible storage
-        ```py
-        import datachain as dc
-        chain = dc.from_storage(
+        Multiple URIs:
+        ```python
+        chain = dc.read_storage([
+            "s3://bucket1/dir1",
+            "s3://bucket2/dir2"
+        ])
+        ```
+
+        With AWS S3-compatible storage:
+        ```python
+        chain = dc.read_storage(
             "s3://my-bucket/my-dir",
             client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
         )
@@ -62,14 +77,20 @@ def from_storage(
         Pass existing session
         ```py
         session = Session.get()
-        import datachain as dc
-        chain = dc.from_storage("s3://my-bucket/my-dir", session=session)
+        chain = dc.read_storage([
+            "path/to/dir1",
+            "path/to/dir2"
+        ], session=session, recursive=True)
         ```
+
+    Note:
+        When using multiple URIs with `update=True`, the function optimizes by
+        avoiding redundant updates for URIs pointing to the same storage location.
     """
     from .datachain import DataChain
-    from .datasets import from_dataset
-    from .records import from_records
-    from .values import from_values
+    from .datasets import read_dataset
+    from .records import read_records
+    from .values import read_values
 
     file_type = get_file_type(type)
 
@@ -79,40 +100,72 @@ def from_storage(
     cache = session.catalog.cache
     client_config = session.catalog.client_config
 
-    list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
-        uri, session, update=update
-    )
+    uris = uri if isinstance(uri, (list, tuple)) else [uri]
+
+    if not uris:
+        raise ValueError("No URIs provided")
+
+    storage_chain = None
+    listed_ds_name = set()
+    file_values = []
+
+    for single_uri in uris:
+        list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
+            single_uri, session, update=update
+        )
+
+        # list_ds_name is None if object is a file, we don't want to use cache
+        # or do listing in that case - just read that single object
+        if not list_ds_name:
+            file_values.append(
+                get_file_info(list_uri, cache, client_config=client_config)
+            )
+            continue
+
+        dc = read_dataset(list_ds_name, session=session, settings=settings)
+        dc._query.update = update
+        dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
 
-    # ds_name is None if object is a file, we don't want to use cache
-    # or do listing in that case - just read that single object
-    if not list_ds_name:
-        dc = from_values(
+        if update or not list_ds_exists:
+
+            def lst_fn(ds_name, lst_uri):
+                # disable prefetch for listing, as it pre-downloads all files
+                (
+                    read_records(
+                        DataChain.DEFAULT_FILE_RECORD,
+                        session=session,
+                        settings=settings,
+                        in_memory=in_memory,
+                    )
+                    .settings(prefetch=0)
+                    .gen(
+                        list_bucket(lst_uri, cache, client_config=client_config),
+                        output={f"{object_name}": file_type},
+                    )
+                    .save(ds_name, listing=True)
+                )
+
+            dc._query.set_listing_fn(
+                lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
+            )
+
+        chain = ls(dc, list_path, recursive=recursive, object_name=object_name)
+
+        storage_chain = storage_chain.union(chain) if storage_chain else chain
+        listed_ds_name.add(list_ds_name)
+
+    if file_values:
+        file_chain = read_values(
             session=session,
             settings=settings,
             in_memory=in_memory,
-            file=[get_file_info(list_uri, cache, client_config=client_config)],
+            file=file_values,
         )
-        dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
-        return dc
-
-    if update or not list_ds_exists:
-        # disable prefetch for listing, as it pre-downloads all files
-        (
-            from_records(
-                DataChain.DEFAULT_FILE_RECORD,
-                session=session,
-                settings=settings,
-                in_memory=in_memory,
-            )
-            .settings(prefetch=0)
-            .gen(
-                list_bucket(list_uri, cache, client_config=client_config),
-                output={f"{object_name}": File},
-            )
-            .save(list_ds_name, listing=True)
+        file_chain.signals_schema = file_chain.signals_schema.mutate(
+            {f"{object_name}": file_type}
         )
+        storage_chain = storage_chain.union(file_chain) if storage_chain else file_chain
 
-    dc = from_dataset(list_ds_name, session=session, settings=settings)
-    dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+    assert storage_chain is not None
 
-    return ls(dc, list_path, recursive=recursive, object_name=object_name)
+    return storage_chain
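
Beyond the rename, `read_storage` now accepts either a single URI or a list of URIs: each listed location becomes its own listing chain, plain file URIs are collected into a `read_values` chain, and everything is combined with `union`. A usage sketch based on the new docstring examples (bucket names are illustrative):

```python
import datachain as dc

# Single URI, unchanged behavior (was dc.from_storage in 0.14.0)
images = dc.read_storage("s3://my-bucket/my-dir", type="image")

# New: several storage locations merged into one chain; listings are created
# lazily and reused unless update=True forces a refresh.
combined = dc.read_storage(
    [
        "s3://bucket1/dir1",
        "s3://bucket2/dir2",
    ],
    recursive=True,
)
```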
@@ -6,7 +6,7 @@ from typing import (
 )
 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import dict_to_data_model
-from datachain.lib.dc.records import from_records
+from datachain.lib.dc.records import read_records
 from datachain.lib.dc.utils import OutputType
 from datachain.query import Session
 
@@ -18,7 +18,7 @@ if TYPE_CHECKING:
 P = ParamSpec("P")
 
 
-def from_values(
+def read_values(
     ds_name: str = "",
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
@@ -32,7 +32,7 @@ def from_values(
     Example:
         ```py
         import datachain as dc
-        dc.from_values(fib=[1, 2, 3, 5, 8])
+        dc.read_values(fib=[1, 2, 3, 5, 8])
        ```
     """
     from .datachain import DataChain
@@ -42,7 +42,7 @@ def from_values(
     def _func_fr() -> Iterator[tuple_type]:  # type: ignore[valid-type]
         yield from tuples
 
-    chain = from_records(
+    chain = read_records(
         DataChain.DEFAULT_FILE_RECORD,
         session=session,
         settings=settings,
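
`from_values` gets the same treatment; per the docstring above:

```python
import datachain as dc

# Build a chain from in-memory values (was dc.from_values in 0.14.0)
fib_chain = dc.read_values(fib=[1, 2, 3, 5, 8])
```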
datachain/lib/listing.py CHANGED
@@ -4,6 +4,7 @@ import os
 import posixpath
 from collections.abc import Iterator
 from contextlib import contextmanager
+from datetime import datetime, timedelta, timezone
 from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
 
 from fsspec.asyn import get_loop
@@ -32,6 +33,16 @@ logging.getLogger("aiobotocore.credentials").setLevel(logging.CRITICAL)
 logging.getLogger("gcsfs").setLevel(logging.CRITICAL)
 
 
+def listing_dataset_expired(lst_ds) -> bool:
+    """Function that checks if listing dataset is expired or not"""
+    lst_version = lst_ds.versions[-1]
+    if not lst_version.finished_at:
+        return False
+
+    expires = lst_version.finished_at + timedelta(seconds=LISTING_TTL)
+    return datetime.now(timezone.utc) > expires
+
+
 def list_bucket(uri: str, cache, client_config=None) -> Callable:
     """
     Function that returns another generator function that yields File objects
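
The new `listing_dataset_expired` helper treats a listing as stale once its last version's `finished_at` timestamp plus the listing TTL is in the past, and as fresh when `finished_at` is unset. A standalone sketch of the same check with stand-in objects; the concrete `LISTING_TTL` value and the dataset/version shapes here are assumptions for illustration:

```python
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Optional

LISTING_TTL = 4 * 60 * 60  # assumed TTL in seconds; the real constant lives in datachain.lib.listing


@dataclass
class FakeVersion:
    finished_at: Optional[datetime]


@dataclass
class FakeListingDataset:
    versions: list


def listing_dataset_expired(lst_ds) -> bool:
    """Mirrors the helper above: expired once finished_at + TTL has passed."""
    lst_version = lst_ds.versions[-1]
    if not lst_version.finished_at:
        return False
    expires = lst_version.finished_at + timedelta(seconds=LISTING_TTL)
    return datetime.now(timezone.utc) > expires


stale = FakeListingDataset([FakeVersion(datetime.now(timezone.utc) - timedelta(days=1))])
fresh = FakeListingDataset([FakeVersion(datetime.now(timezone.utc))])
print(listing_dataset_expired(stale), listing_dataset_expired(fresh))  # True False
```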
@@ -103,10 +103,10 @@ def read_meta( # noqa: C901
     model_name=None,
     nrows=None,
 ) -> Callable:
-    from datachain import from_storage
+    from datachain import read_storage
 
     if schema_from:
-        file = next(from_storage(schema_from, type="text").limit(1).collect("file"))
+        file = next(read_storage(schema_from, type="text").limit(1).collect("file"))
         model_code = gen_datamodel_code(
             file, format=format, jmespath=jmespath, model_name=model_name
         )
datachain/lib/pytorch.py CHANGED
@@ -14,7 +14,7 @@ from torchvision.transforms import v2
 from datachain import Session
 from datachain.cache import get_temp_cache
 from datachain.catalog import Catalog, get_catalog
-from datachain.lib.dc.datasets import from_dataset
+from datachain.lib.dc.datasets import read_dataset
 from datachain.lib.settings import Settings
 from datachain.lib.text import convert_text
 from datachain.progress import CombinedDownloadCallback
@@ -122,7 +122,7 @@ class PytorchDataset(IterableDataset):
     ) -> Generator[tuple[Any, ...], None, None]:
         catalog = self._get_catalog()
         session = Session("PyTorch", catalog=catalog)
-        ds = from_dataset(
+        ds = read_dataset(
             name=self.name, version=self.version, session=session
         ).settings(cache=self.cache, prefetch=self.prefetch)
         ds = ds.remove_file_signals()
datachain/lib/udf.py CHANGED
@@ -145,7 +145,7 @@ class UDFBase(AbstractUDF):
             return emb[0].tolist()
 
         (
-            dc.from_storage(
+            dc.read_storage(
                 "gs://datachain-demo/fashion-product-images/images", type="image"
             )
             .limit(5)
@@ -47,6 +47,10 @@ from datachain.error import (
     QueryScriptCancelError,
 )
 from datachain.func.base import Function
+from datachain.lib.listing import (
+    is_listing_dataset,
+    listing_dataset_expired,
+)
 from datachain.lib.udf import UDFAdapter, _get_cache
 from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
 from datachain.query.schema import C, UDFParamSpec, normalize_param
@@ -151,13 +155,6 @@ def step_result(
     )
 
 
-class StartingStep(ABC):
-    """An initial query processing step, referencing a data source."""
-
-    @abstractmethod
-    def apply(self) -> "StepResult": ...
-
-
 @frozen
 class Step(ABC):
     """A query processing step (filtering, mutation, etc.)"""
@@ -170,7 +167,7 @@ class Step(ABC):
 
 
 @frozen
-class QueryStep(StartingStep):
+class QueryStep:
     catalog: "Catalog"
     dataset_name: str
     dataset_version: int
@@ -1086,6 +1083,7 @@ class DatasetQuery:
         indexing_column_types: Optional[dict[str, Any]] = None,
         in_memory: bool = False,
         fallback_to_studio: bool = True,
+        update: bool = False,
     ) -> None:
         from datachain.remote.studio import is_token_set
 
@@ -1097,26 +1095,44 @@ class DatasetQuery:
         self.temp_table_names: list[str] = []
         self.dependencies: set[DatasetDependencyType] = set()
         self.table = self.get_table()
-        self.starting_step: StartingStep
+        self.starting_step: Optional[QueryStep] = None
         self.name: Optional[str] = None
         self.version: Optional[int] = None
         self.feature_schema: Optional[dict] = None
         self.column_types: Optional[dict[str, Any]] = None
+        self.before_steps: list[Callable] = []
+        self.listing_fn: Optional[Callable] = None
+        self.update = update
 
-        self.name = name
+        self.list_ds_name: Optional[str] = None
 
-        if fallback_to_studio and is_token_set():
-            ds = self.catalog.get_dataset_with_remote_fallback(name, version)
+        self.name = name
+        self.dialect = self.catalog.warehouse.db.dialect
+        if version:
+            self.version = version
+
+        if is_listing_dataset(name):
+            # not setting query step yet as listing dataset might not exist at
+            # this point
+            self.list_ds_name = name
+        elif fallback_to_studio and is_token_set():
+            self._set_starting_step(
+                self.catalog.get_dataset_with_remote_fallback(name, version)
+            )
         else:
-            ds = self.catalog.get_dataset(name)
+            self._set_starting_step(self.catalog.get_dataset(name))
+
+    def _set_starting_step(self, ds: "DatasetRecord") -> None:
+        if not self.version:
+            self.version = ds.latest_version
+
+        self.starting_step = QueryStep(self.catalog, ds.name, self.version)
 
-        self.version = version or ds.latest_version
+        # at this point we know our starting dataset so setting up schemas
         self.feature_schema = ds.get_version(self.version).feature_schema
         self.column_types = copy(ds.schema)
         if "sys__id" in self.column_types:
             self.column_types.pop("sys__id")
-        self.starting_step = QueryStep(self.catalog, name, self.version)
-        self.dialect = self.catalog.warehouse.db.dialect
 
     def __iter__(self):
         return iter(self.db_results())
@@ -1180,11 +1196,30 @@ class DatasetQuery:
             col.table = self.table
         return col
 
+    def set_listing_fn(self, fn: Callable) -> None:
+        """Setting listing function to be run if needed"""
+        self.listing_fn = fn
+
     def apply_steps(self) -> QueryGenerator:
         """
         Apply the steps in the query and return the resulting
         sqlalchemy.SelectBase.
         """
+        if self.list_ds_name and not self.starting_step:
+            listing_ds = None
+            try:
+                listing_ds = self.catalog.get_dataset(self.list_ds_name)
+            except DatasetNotFoundError:
+                pass
+
+            if not listing_ds or self.update or listing_dataset_expired(listing_ds):
+                assert self.listing_fn
+                self.listing_fn()
+                listing_ds = self.catalog.get_dataset(self.list_ds_name)
+
+            # at this point we know what is our starting listing dataset name
+            self._set_starting_step(listing_ds)  # type: ignore [arg-type]
+
         query = self.clone()
 
         index = os.getenv("DATACHAIN_QUERY_CHUNK_INDEX", self._chunk_index)
@@ -1203,6 +1238,7 @@ class DatasetQuery:
             query = query.filter(C.sys__rand % total == index)
             query.steps = query.steps[-1:] + query.steps[:-1]
 
+        assert query.starting_step
         result = query.starting_step.apply()
         self.dependencies.update(result.dependencies)
 
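Taken together, these query changes make listing lazy: `DatasetQuery` records the listing dataset name and a deferred `listing_fn`, and `apply_steps()` only (re)runs the listing when the dataset is missing, `update` is set, or the TTL has lapsed. A condensed sketch of that decision using the names from the hunks above (the callables are stand-ins, not the real signatures):

```python
from typing import Callable, Optional


def resolve_listing_dataset(
    get_dataset: Callable[[str], Optional[object]],  # returns None when the listing dataset is missing
    list_ds_name: str,
    listing_fn: Callable[[], None],                  # the deferred listing job set via set_listing_fn()
    update: bool,
    expired: Callable[[object], bool],               # e.g. listing_dataset_expired
):
    """Re-list only when needed: missing dataset, explicit update, or expired TTL."""
    listing_ds = get_dataset(list_ds_name)
    if listing_ds is None or update or expired(listing_ds):
        listing_fn()
        listing_ds = get_dataset(list_ds_name)
    return listing_ds  # becomes the query's starting step
```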
@@ -41,7 +41,7 @@ def train_test_split(
     from datachain.toolkit import train_test_split
 
     # Load a DataChain from a storage source (e.g., S3 bucket)
-    dc = dc.from_storage("s3://bucket/dir/")
+    dc = dc.read_storage("s3://bucket/dir/")
 
     # Perform a 70/30 train-test split
     train, test = train_test_split(dc, [0.7, 0.3])
@@ -1,9 +1,9 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.14.0
+Version: 0.14.2
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
-License: Apache-2.0
+License-Expression: Apache-2.0
 Project-URL: Documentation, https://datachain.dvc.ai
 Project-URL: Issues, https://github.com/iterative/datachain/issues
 Project-URL: Source, https://github.com/iterative/datachain
@@ -38,7 +38,7 @@ Requires-Dist: sqlalchemy>=2
 Requires-Dist: multiprocess==0.70.16
 Requires-Dist: cloudpickle
 Requires-Dist: orjson>=3.10.5
-Requires-Dist: pydantic<3,>=2
+Requires-Dist: pydantic<2.11,>=2
 Requires-Dist: jmespath>=1.0
 Requires-Dist: datamodel-code-generator>=0.25
 Requires-Dist: Pillow<12,>=10.0.0
@@ -171,8 +171,8 @@ high confidence scores.
 
 import datachain as dc
 
-meta = dc.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
-images = dc.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
+meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
+images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
 
 images_id = images.map(id=lambda file: file.path.split('.')[-2])
 annotated = images_id.merge(meta, on="id", right_on="meta.id")
@@ -213,7 +213,7 @@ Python code:
     return result.lower().startswith("success")
 
 chain = (
-    dc.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
+    dc.read_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
     .settings(parallel=4, cache=True)
     .map(is_success=eval_dialogue)
     .save("mistral_files")
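
The README and docstring updates in the package metadata all reflect the same public-API rename. A summary of the mapping that actually appears in this diff, written as a plain lookup table:

```python
# 0.14.0 name -> 0.14.2 name, as seen in the hunks above
RENAMES = {
    "from_storage": "read_storage",
    "from_parquet": "read_parquet",
    "from_records": "read_records",
    "from_values": "read_values",
    "from_dataset": "read_dataset",
    "from_json": "read_json",
}
```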