datachain 0.13.1__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff shows the content of the publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.

Potentially problematic release: this version of datachain has been flagged as possibly problematic.

@@ -0,0 +1,170 @@
+ import os.path
+ from typing import (
+     TYPE_CHECKING,
+     Optional,
+     Union,
+ )
+
+ from datachain.lib.file import (
+     FileType,
+     get_file_type,
+ )
+ from datachain.lib.listing import (
+     get_file_info,
+     get_listing,
+     list_bucket,
+     ls,
+ )
+ from datachain.query import Session
+
+ if TYPE_CHECKING:
+     from .datachain import DataChain
+
+
+ def from_storage(
+     uri: Union[str, os.PathLike[str], list[str], list[os.PathLike[str]]],
+     *,
+     type: FileType = "binary",
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     in_memory: bool = False,
+     recursive: Optional[bool] = True,
+     object_name: str = "file",
+     update: bool = False,
+     anon: bool = False,
+     client_config: Optional[dict] = None,
+ ) -> "DataChain":
+     """Get data from storage(s) as a list of files with all file attributes.
+     It returns the chain itself as usual.
+
+     Parameters:
+         uri : storage URI with directory, or a list of URIs.
+             URIs must start with a storage prefix such
+             as `s3://`, `gs://`, `az://` or `file:///`.
+         type : read file as "binary", "text", or "image" data. Default is "binary".
+         recursive : search recursively for the given path.
+         object_name : created object column name.
+         update : force storage reindexing. Default is False.
+         anon : if True, the cloud bucket is treated as a public one.
+         client_config : optional client configuration for the storage client.
+
+     Returns:
+         DataChain: A DataChain object containing the file information.
+
+     Examples:
+         Simple call from s3:
+         ```python
+         import datachain as dc
+         chain = dc.from_storage("s3://my-bucket/my-dir")
+         ```
+
+         Multiple URIs:
+         ```python
+         chain = dc.from_storage([
+             "s3://bucket1/dir1",
+             "s3://bucket2/dir2"
+         ])
+         ```
+
+         With AWS S3-compatible storage:
+         ```python
+         chain = dc.from_storage(
+             "s3://my-bucket/my-dir",
+             client_config={"aws_endpoint_url": "<minio-endpoint-url>"}
+         )
+         ```
+
+         Pass an existing session:
+         ```py
+         session = Session.get()
+         chain = dc.from_storage([
+             "path/to/dir1",
+             "path/to/dir2"
+         ], session=session, recursive=True)
+         ```
+
+     Note:
+         When using multiple URIs with `update=True`, the function optimizes by
+         avoiding redundant updates for URIs pointing to the same storage location.
+     """
+     from .datachain import DataChain
+     from .datasets import from_dataset
+     from .records import from_records
+     from .values import from_values
+
+     file_type = get_file_type(type)
+
+     if anon:
+         client_config = (client_config or {}) | {"anon": True}
+     session = Session.get(session, client_config=client_config, in_memory=in_memory)
+     cache = session.catalog.cache
+     client_config = session.catalog.client_config
+
+     uris = uri if isinstance(uri, (list, tuple)) else [uri]
+
+     if not uris:
+         raise ValueError("No URIs provided")
+
+     storage_chain = None
+     listed_ds_name = set()
+     file_values = []
+
+     for single_uri in uris:
+         list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
+             single_uri, session, update=update
+         )
+
+         # list_ds_name is None if object is a file, we don't want to use cache
+         # or do listing in that case - just read that single object
+         if not list_ds_name:
+             file_values.append(
+                 get_file_info(list_uri, cache, client_config=client_config)
+             )
+             continue
+
+         dc = from_dataset(list_ds_name, session=session, settings=settings)
+         dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+
+         if update or not list_ds_exists:
+
+             def lst_fn(ds_name, lst_uri):
+                 # disable prefetch for listing, as it pre-downloads all files
+                 (
+                     from_records(
+                         DataChain.DEFAULT_FILE_RECORD,
+                         session=session,
+                         settings=settings,
+                         in_memory=in_memory,
+                     )
+                     .settings(prefetch=0)
+                     .gen(
+                         list_bucket(lst_uri, cache, client_config=client_config),
+                         output={f"{object_name}": file_type},
+                     )
+                     .save(ds_name, listing=True)
+                 )
+
+             dc._query.add_before_steps(
+                 lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
+             )
+
+         chain = ls(dc, list_path, recursive=recursive, object_name=object_name)
+
+         storage_chain = storage_chain.union(chain) if storage_chain else chain
+         listed_ds_name.add(list_ds_name)
+
+     if file_values:
+         file_chain = from_values(
+             session=session,
+             settings=settings,
+             in_memory=in_memory,
+             file=file_values,
+         )
+         file_chain.signals_schema = file_chain.signals_schema.mutate(
+             {f"{object_name}": file_type}
+         )
+         storage_chain = storage_chain.union(file_chain) if storage_chain else file_chain
+
+     assert storage_chain is not None
+
+     return storage_chain
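
For context, a minimal usage sketch of the new module-level `from_storage` entry point (not part of the diff; it assumes the function is re-exported from the top-level `datachain` package, as the README changes below suggest, and the bucket URIs are placeholders):

```python
import datachain as dc

# single public bucket, read files as images
images = dc.from_storage("gs://datachain-demo/dogs-and-cats/", type="image", anon=True)

# several URIs are listed separately and combined into one chain via union
combined = dc.from_storage(
    ["s3://bucket1/dir1", "s3://bucket2/dir2"],
    recursive=True,
    update=True,  # force re-listing of both locations
)
print(combined.count())
```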
@@ -0,0 +1,128 @@
+ from collections.abc import Sequence
+ from functools import wraps
+ from typing import (
+     TYPE_CHECKING,
+     Callable,
+     Optional,
+     TypeVar,
+     Union,
+ )
+
+ import sqlalchemy
+ from sqlalchemy.sql.functions import GenericFunction
+
+ from datachain.func.base import Function
+ from datachain.lib.data_model import DataModel, DataType
+ from datachain.lib.utils import DataChainParamsError
+ from datachain.query.schema import DEFAULT_DELIMITER
+
+ if TYPE_CHECKING:
+     from typing_extensions import Concatenate, ParamSpec
+
+     from .datachain import DataChain
+
+     P = ParamSpec("P")
+
+ D = TypeVar("D", bound="DataChain")
+
+
+ def resolve_columns(
+     method: "Callable[Concatenate[D, P], D]",
+ ) -> "Callable[Concatenate[D, P], D]":
+     """Decorator that resolves input column names to their actual DB names. This is
+     especially important for nested columns, since users refer to them with dot
+     notation (e.g. file.name) while they are defined with the default delimiter
+     in the DB (e.g. file__name).
+     Any SQL functions among the arguments are passed through to the method as is.
+     """
+
+     @wraps(method)
+     def _inner(self: D, *args: "P.args", **kwargs: "P.kwargs") -> D:
+         resolved_args = self.signals_schema.resolve(
+             *[arg for arg in args if not isinstance(arg, GenericFunction)]  # type: ignore[arg-type]
+         ).db_signals()
+
+         for idx, arg in enumerate(args):
+             if isinstance(arg, GenericFunction):
+                 resolved_args.insert(idx, arg)  # type: ignore[arg-type]
+
+         return method(self, *resolved_args, **kwargs)
+
+     return _inner
+
+
+ class DatasetPrepareError(DataChainParamsError):
+     def __init__(self, name, msg, output=None):
+         name = f" '{name}'" if name else ""
+         output = f" output '{output}'" if output else ""
+         super().__init__(f"Dataset{name}{output} processing prepare error: {msg}")
+
+
+ class DatasetFromValuesError(DataChainParamsError):
+     def __init__(self, name, msg):
+         name = f" '{name}'" if name else ""
+         super().__init__(f"Dataset{name} from values error: {msg}")
+
+
+ MergeColType = Union[str, Function, sqlalchemy.ColumnElement]
+
+
+ def _validate_merge_on(
+     on: Union[MergeColType, Sequence[MergeColType]],
+     ds: "DataChain",
+ ) -> Sequence[MergeColType]:
+     if isinstance(on, (str, sqlalchemy.ColumnElement)):
+         return [on]
+     if isinstance(on, Function):
+         return [on.get_column(table=ds._query.table)]
+     if isinstance(on, Sequence):
+         return [
+             c.get_column(table=ds._query.table) if isinstance(c, Function) else c
+             for c in on
+         ]
+
+
+ def _get_merge_error_str(col: MergeColType) -> str:
+     if isinstance(col, str):
+         return col
+     if isinstance(col, Function):
+         return f"{col.name}()"
+     if isinstance(col, sqlalchemy.Column):
+         return col.name.replace(DEFAULT_DELIMITER, ".")
+     if isinstance(col, sqlalchemy.ColumnElement) and hasattr(col, "name"):
+         return f"{col.name} expression"
+     return str(col)
+
+
+ class DatasetMergeError(DataChainParamsError):
+     def __init__(
+         self,
+         on: Union[MergeColType, Sequence[MergeColType]],
+         right_on: Optional[Union[MergeColType, Sequence[MergeColType]]],
+         msg: str,
+     ):
+         def _get_str(
+             on: Union[MergeColType, Sequence[MergeColType]],
+         ) -> str:
+             if not isinstance(on, Sequence):
+                 return str(on)  # type: ignore[unreachable]
+             return ", ".join([_get_merge_error_str(col) for col in on])
+
+         on_str = _get_str(on)
+         right_on_str = (
+             ", right_on='" + _get_str(right_on) + "'"
+             if right_on and isinstance(right_on, Sequence)
+             else ""
+         )
+         super().__init__(f"Merge error on='{on_str}'{right_on_str}: {msg}")
+
+
+ OutputType = Union[None, DataType, Sequence[str], dict[str, DataType]]
+
+
+ class Sys(DataModel):
+     """Model for internal DataChain signals `id` and `rand`."""
+
+     id: int
+     rand: int
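
To illustrate the name mapping that `resolve_columns` performs, here is a small self-contained sketch (not part of the package); it assumes `DEFAULT_DELIMITER` is the double underscore used for flattened nested columns:

```python
DEFAULT_DELIMITER = "__"  # assumed value of datachain.query.schema.DEFAULT_DELIMITER

def to_db_name(user_column: str) -> str:
    # user-facing dot notation -> flat DB column name, e.g. "file.name" -> "file__name"
    return user_column.replace(".", DEFAULT_DELIMITER)

def to_user_name(db_column: str) -> str:
    # flat DB column name -> user-facing dot notation, e.g. "file__size" -> "file.size"
    return db_column.replace(DEFAULT_DELIMITER, ".")

assert to_db_name("file.name") == "file__name"
assert to_user_name("file__size") == "file.size"
```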
@@ -0,0 +1,53 @@
+ from collections.abc import Iterator
+ from typing import (
+     TYPE_CHECKING,
+     Optional,
+ )
+
+ from datachain.lib.convert.values_to_tuples import values_to_tuples
+ from datachain.lib.data_model import dict_to_data_model
+ from datachain.lib.dc.records import from_records
+ from datachain.lib.dc.utils import OutputType
+ from datachain.query import Session
+
+ if TYPE_CHECKING:
+     from typing_extensions import ParamSpec
+
+     from .datachain import DataChain
+
+     P = ParamSpec("P")
+
+
+ def from_values(
+     ds_name: str = "",
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     in_memory: bool = False,
+     output: OutputType = None,
+     object_name: str = "",
+     **fr_map,
+ ) -> "DataChain":
+     """Generate chain from list of values.
+
+     Example:
+         ```py
+         import datachain as dc
+         dc.from_values(fib=[1, 2, 3, 5, 8])
+         ```
+     """
+     from .datachain import DataChain
+
+     tuple_type, output, tuples = values_to_tuples(ds_name, output, **fr_map)
+
+     def _func_fr() -> Iterator[tuple_type]:  # type: ignore[valid-type]
+         yield from tuples
+
+     chain = from_records(
+         DataChain.DEFAULT_FILE_RECORD,
+         session=session,
+         settings=settings,
+         in_memory=in_memory,
+     )
+     if object_name:
+         output = {object_name: dict_to_data_model(object_name, output)}  # type: ignore[arg-type]
+     return chain.gen(_func_fr, output=output)
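
A slightly fuller hedged sketch of calling `from_values` with several columns (column names are arbitrary; the value lists are assumed to be of equal length):

```python
import datachain as dc

# each keyword argument becomes a column; rows are built positionally
chain = dc.from_values(
    fib=[1, 1, 2, 3, 5, 8],
    idx=[0, 1, 2, 3, 4, 5],
)
print(chain.count())  # 6 rows
```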
@@ -103,12 +103,10 @@ def read_meta( # noqa: C901
      model_name=None,
      nrows=None,
  ) -> Callable:
-     from datachain.lib.dc import DataChain
+     from datachain import from_storage
 
      if schema_from:
-         file = next(
-             DataChain.from_storage(schema_from, type="text").limit(1).collect("file")
-         )
+         file = next(from_storage(schema_from, type="text").limit(1).collect("file"))
          model_code = gen_datamodel_code(
              file, format=format, jmespath=jmespath, model_name=model_name
          )
datachain/lib/pytorch.py CHANGED
@@ -14,7 +14,7 @@ from torchvision.transforms import v2
  from datachain import Session
  from datachain.cache import get_temp_cache
  from datachain.catalog import Catalog, get_catalog
- from datachain.lib.dc import DataChain
+ from datachain.lib.dc.datasets import from_dataset
  from datachain.lib.settings import Settings
  from datachain.lib.text import convert_text
  from datachain.progress import CombinedDownloadCallback
@@ -122,7 +122,7 @@ class PytorchDataset(IterableDataset):
      ) -> Generator[tuple[Any, ...], None, None]:
          catalog = self._get_catalog()
          session = Session("PyTorch", catalog=catalog)
-         ds = DataChain.from_dataset(
+         ds = from_dataset(
              name=self.name, version=self.version, session=session
          ).settings(cache=self.cache, prefetch=self.prefetch)
          ds = ds.remove_file_signals()
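
The user-facing path this internal change supports looks roughly like the sketch below (dataset name and loader settings are placeholders; it assumes `from_dataset` is re-exported at the package top level like `from_storage`, and that the chain exposes `to_pytorch` as in earlier releases):

```python
import datachain as dc
from torch.utils.data import DataLoader

# load a previously saved dataset by name and hand it to PyTorch
chain = dc.from_dataset("fashion-product-images").settings(cache=True, prefetch=2)
loader = DataLoader(chain.to_pytorch(), batch_size=16, num_workers=2)

for batch in loader:
    ...  # training loop goes here
```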
datachain/lib/udf.py CHANGED
@@ -123,10 +123,10 @@ class UDFBase(AbstractUDF):
 
      Example:
          ```py
-         from datachain import C, DataChain, Mapper
+         import datachain as dc
          import open_clip
 
-         class ImageEncoder(Mapper):
+         class ImageEncoder(dc.Mapper):
              def __init__(self, model_name: str, pretrained: str):
                  self.model_name = model_name
                  self.pretrained = pretrained
@@ -145,7 +145,7 @@ class UDFBase(AbstractUDF):
                  return emb[0].tolist()
 
          (
-             DataChain.from_storage(
+             dc.from_storage(
                  "gs://datachain-demo/fashion-product-images/images", type="image"
              )
              .limit(5)
@@ -47,6 +47,7 @@ from datachain.error import (
      QueryScriptCancelError,
  )
  from datachain.func.base import Function
+ from datachain.lib.listing import is_listing_dataset
  from datachain.lib.udf import UDFAdapter, _get_cache
  from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
  from datachain.query.schema import C, UDFParamSpec, normalize_param
@@ -151,13 +152,6 @@ def step_result(
  )
 
 
- class StartingStep(ABC):
-     """An initial query processing step, referencing a data source."""
-
-     @abstractmethod
-     def apply(self) -> "StepResult": ...
-
-
  @frozen
  class Step(ABC):
      """A query processing step (filtering, mutation, etc.)"""
@@ -170,7 +164,7 @@ class Step(ABC):
 
 
  @frozen
- class QueryStep(StartingStep):
+ class QueryStep:
      catalog: "Catalog"
      dataset_name: str
      dataset_version: int
@@ -1097,26 +1091,42 @@ class DatasetQuery:
          self.temp_table_names: list[str] = []
          self.dependencies: set[DatasetDependencyType] = set()
          self.table = self.get_table()
-         self.starting_step: StartingStep
+         self.starting_step: Optional[QueryStep] = None
          self.name: Optional[str] = None
          self.version: Optional[int] = None
          self.feature_schema: Optional[dict] = None
          self.column_types: Optional[dict[str, Any]] = None
+         self.before_steps: list[Callable] = []
 
-         self.name = name
+         self.list_ds_name: Optional[str] = None
 
-         if fallback_to_studio and is_token_set():
-             ds = self.catalog.get_dataset_with_remote_fallback(name, version)
+         self.name = name
+         self.dialect = self.catalog.warehouse.db.dialect
+         if version:
+             self.version = version
+
+         if is_listing_dataset(name):
+             # not setting query step yet as listing dataset might not exist at
+             # this point
+             self.list_ds_name = name
+         elif fallback_to_studio and is_token_set():
+             self._set_starting_step(
+                 self.catalog.get_dataset_with_remote_fallback(name, version)
+             )
          else:
-             ds = self.catalog.get_dataset(name)
+             self._set_starting_step(self.catalog.get_dataset(name))
+
+     def _set_starting_step(self, ds: "DatasetRecord") -> None:
+         if not self.version:
+             self.version = ds.latest_version
 
-         self.version = version or ds.latest_version
+         self.starting_step = QueryStep(self.catalog, ds.name, self.version)
+
+         # at this point we know our starting dataset so setting up schemas
          self.feature_schema = ds.get_version(self.version).feature_schema
          self.column_types = copy(ds.schema)
          if "sys__id" in self.column_types:
              self.column_types.pop("sys__id")
-         self.starting_step = QueryStep(self.catalog, name, self.version)
-         self.dialect = self.catalog.warehouse.db.dialect
 
      def __iter__(self):
          return iter(self.db_results())
@@ -1180,11 +1190,23 @@ class DatasetQuery:
          col.table = self.table
          return col
 
+     def add_before_steps(self, fn: Callable) -> None:
+         """
+         Set a custom function to be run before applying steps.
+         """
+         self.before_steps.append(fn)
+
 
      def apply_steps(self) -> QueryGenerator:
          """
          Apply the steps in the query and return the resulting
          sqlalchemy.SelectBase.
          """
+         for fn in self.before_steps:
+             fn()
+
+         if self.list_ds_name:
+             # at this point we know what is our starting listing dataset name
+             self._set_starting_step(self.catalog.get_dataset(self.list_ds_name))  # type: ignore [arg-type]
          query = self.clone()
 
          index = os.getenv("DATACHAIN_QUERY_CHUNK_INDEX", self._chunk_index)
@@ -1203,6 +1225,7 @@ class DatasetQuery:
              query = query.filter(C.sys__rand % total == index)
              query.steps = query.steps[-1:] + query.steps[:-1]
 
+         assert query.starting_step
          result = query.starting_step.apply()
          self.dependencies.update(result.dependencies)
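
An illustrative toy sketch (not the package's code) of the deferred-execution pattern introduced by `add_before_steps`: callables registered on the query run only when `apply_steps` is invoked, so a listing dataset can be created lazily right before it is needed:

```python
from typing import Callable

class ToyQuery:
    """Toy stand-in for DatasetQuery that demonstrates the before-steps hook."""

    def __init__(self) -> None:
        self.before_steps: list[Callable[[], None]] = []

    def add_before_steps(self, fn: Callable[[], None]) -> None:
        # queue deferred work, e.g. materializing a listing dataset
        self.before_steps.append(fn)

    def apply_steps(self) -> None:
        # hooks run first, so whatever they create exists before steps are applied
        for fn in self.before_steps:
            fn()
        print("applying steps")

q = ToyQuery()
q.add_before_steps(lambda: print("listing bucket lazily"))
q.apply_steps()  # -> "listing bucket lazily", then "applying steps"
```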
 
@@ -37,11 +37,11 @@ def train_test_split(
      Examples:
          Train-test split:
          ```python
-         from datachain import DataChain
+         import datachain as dc
          from datachain.toolkit import train_test_split
 
          # Load a DataChain from a storage source (e.g., S3 bucket)
-         dc = DataChain.from_storage("s3://bucket/dir/")
+         dc = dc.from_storage("s3://bucket/dir/")
 
          # Perform a 70/30 train-test split
          train, test = train_test_split(dc, [0.7, 0.3])
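
For completeness, a hedged sketch of a three-way split with the same helper (weights are illustrative and assumed to be treated as relative proportions, with any number of splits allowed):

```python
import datachain as dc
from datachain.toolkit import train_test_split

chain = dc.from_storage("s3://bucket/dir/")

# 70/20/10 train/validation/test split (assumes the helper accepts any number of weights)
train, val, test = train_test_split(chain, [0.7, 0.2, 0.1])

train.save("train")
val.save("val")
test.save("test")
```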
@@ -1,9 +1,9 @@
  Metadata-Version: 2.4
  Name: datachain
- Version: 0.13.1
+ Version: 0.14.1
  Summary: Wrangle unstructured AI data at scale
  Author-email: Dmitry Petrov <support@dvc.org>
- License: Apache-2.0
+ License-Expression: Apache-2.0
  Project-URL: Documentation, https://datachain.dvc.ai
  Project-URL: Issues, https://github.com/iterative/datachain/issues
  Project-URL: Source, https://github.com/iterative/datachain
@@ -169,16 +169,16 @@ high confidence scores.
 
  .. code:: py
 
-     from datachain import Column, DataChain
+     import datachain as dc
 
-     meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
-     images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
+     meta = dc.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
+     images = dc.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
 
      images_id = images.map(id=lambda file: file.path.split('.')[-2])
      annotated = images_id.merge(meta, on="id", right_on="meta.id")
 
-     likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
-                                    & (Column("meta.inference.class_") == "cat"))
+     likely_cats = annotated.filter((dc.Column("meta.inference.confidence") > 0.93) \
+                                    & (dc.Column("meta.inference.class_") == "cat"))
      likely_cats.to_storage("high-confidence-cats/", signal="file")
 
 
@@ -199,11 +199,11 @@ Python code:
 
      import os
      from mistralai import Mistral
-     from datachain import File, DataChain, Column
+     import datachain as dc
 
      PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
 
-     def eval_dialogue(file: File) -> bool:
+     def eval_dialogue(file: dc.File) -> bool:
          client = Mistral(api_key = os.environ["MISTRAL_API_KEY"])
          response = client.chat.complete(
              model="open-mixtral-8x22b",
@@ -213,13 +213,13 @@ Python code:
          return result.lower().startswith("success")
 
      chain = (
-         DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
+         dc.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
          .settings(parallel=4, cache=True)
          .map(is_success=eval_dialogue)
          .save("mistral_files")
      )
 
-     successful_chain = chain.filter(Column("is_success") == True)
+     successful_chain = chain.filter(dc.Column("is_success") == True)
      successful_chain.to_storage("./output_mistral")
 
      print(f"{successful_chain.count()} files were exported")