datachain 0.13.1__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -0,0 +1,149 @@
+ from typing import (
+     TYPE_CHECKING,
+     Optional,
+ )
+
+ from datachain.lib.dataset_info import DatasetInfo
+ from datachain.lib.file import (
+     File,
+ )
+ from datachain.lib.settings import Settings
+ from datachain.lib.signal_schema import SignalSchema
+ from datachain.query import Session
+ from datachain.query.dataset import DatasetQuery
+
+ from .utils import Sys
+ from .values import from_values
+
+ if TYPE_CHECKING:
+     from typing_extensions import ParamSpec
+
+     from .datachain import DataChain
+
+     P = ParamSpec("P")
+
+
+ def from_dataset(
+     name: str,
+     version: Optional[int] = None,
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     fallback_to_studio: bool = True,
+ ) -> "DataChain":
+     """Get data from a saved Dataset. It returns the chain itself.
+     If dataset or version is not found locally, it will try to pull it from Studio.
+
+     Parameters:
+         name : dataset name
+         version : dataset version
+         session : Session to use for the chain.
+         settings : Settings to use for the chain.
+         fallback_to_studio : Try to pull dataset from Studio if not found locally.
+             Default is True.
+
+     Example:
+         ```py
+         import datachain as dc
+         chain = dc.from_dataset("my_cats")
+         ```
+
+         ```py
+         chain = dc.from_dataset("my_cats", fallback_to_studio=False)
+         ```
+
+         ```py
+         chain = dc.from_dataset("my_cats", version=1)
+         ```
+
+         ```py
+         session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
+         settings = {
+             "cache": True,
+             "parallel": 4,
+             "workers": 4,
+             "min_task_size": 1000,
+             "prefetch": 10,
+         }
+         chain = dc.from_dataset(
+             name="my_cats",
+             version=1,
+             session=session,
+             settings=settings,
+             fallback_to_studio=True,
+         )
+         ```
+     """
+     from datachain.telemetry import telemetry
+
+     from .datachain import DataChain
+
+     query = DatasetQuery(
+         name=name,
+         version=version,
+         session=session,
+         indexing_column_types=File._datachain_column_types,
+         fallback_to_studio=fallback_to_studio,
+     )
+     telemetry.send_event_once("class", "datachain_init", name=name, version=version)
+     if settings:
+         _settings = Settings(**settings)
+     else:
+         _settings = Settings()
+
+     signals_schema = SignalSchema({"sys": Sys})
+     if query.feature_schema:
+         signals_schema |= SignalSchema.deserialize(query.feature_schema)
+     else:
+         signals_schema |= SignalSchema.from_column_types(query.column_types or {})
+     return DataChain(query, _settings, signals_schema)
+
+
+ def datasets(
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     in_memory: bool = False,
+     object_name: str = "dataset",
+     include_listing: bool = False,
+     studio: bool = False,
+ ) -> "DataChain":
+     """Generate chain with list of registered datasets.
+
+     Args:
+         session: Optional session instance. If not provided, uses default session.
+         settings: Optional dictionary of settings to configure the chain.
+         in_memory: If True, creates an in-memory session. Defaults to False.
+         object_name: Name of the output object in the chain. Defaults to "dataset".
+         include_listing: If True, includes listing datasets. Defaults to False.
+         studio: If True, returns datasets from Studio only,
+             otherwise returns all local datasets. Defaults to False.
+
+     Returns:
+         DataChain: A new DataChain instance containing dataset information.
+
+     Example:
+         ```py
+         import datachain as dc
+
+         chain = dc.datasets()
+         for ds in chain.collect("dataset"):
+             print(f"{ds.name}@v{ds.version}")
+         ```
+     """
+
+     session = Session.get(session, in_memory=in_memory)
+     catalog = session.catalog
+
+     datasets_values = [
+         DatasetInfo.from_models(d, v, j)
+         for d, v, j in catalog.list_datasets_versions(
+             include_listing=include_listing, studio=studio
+         )
+     ]
+
+     return from_values(
+         session=session,
+         settings=settings,
+         in_memory=in_memory,
+         output={object_name: DatasetInfo},
+         **{object_name: datasets_values},  # type: ignore[arg-type]
+     )
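For orientation, a minimal usage sketch of the two helpers added above; the dataset name `my_cats` is a placeholder, and the calls simply mirror the signatures and docstring examples in the diff:

```py
import datachain as dc

# List registered datasets (one DatasetInfo object per row), then load one
# of them by name; version and fallback_to_studio work as documented above.
for ds in dc.datasets().collect("dataset"):
    print(ds.name, ds.version)

chain = dc.from_dataset("my_cats", version=1, fallback_to_studio=False)
```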
datachain/lib/dc/hf.py ADDED
@@ -0,0 +1,73 @@
+ from typing import (
+     TYPE_CHECKING,
+     Optional,
+     Union,
+ )
+
+ from datachain.lib.data_model import dict_to_data_model
+ from datachain.query import Session
+
+ if TYPE_CHECKING:
+     from typing_extensions import ParamSpec
+
+     from datachain.lib.data_model import DataType
+     from datachain.lib.hf import HFDatasetType
+
+     from .datachain import DataChain
+
+     P = ParamSpec("P")
+
+
+ def from_hf(
+     dataset: Union[str, "HFDatasetType"],
+     *args,
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     object_name: str = "",
+     model_name: str = "",
+     **kwargs,
+ ) -> "DataChain":
+     """Generate chain from huggingface hub dataset.
+
+     Parameters:
+         dataset : Path or name of the dataset to read from Hugging Face Hub,
+             or an instance of `datasets.Dataset`-like object.
+         session : Session to use for the chain.
+         settings : Settings to use for the chain.
+         object_name : Generated object column name.
+         model_name : Generated model name.
+         kwargs : Parameters to pass to datasets.load_dataset.
+
+     Example:
+         Load from Hugging Face Hub:
+         ```py
+         import datachain as dc
+         chain = dc.from_hf("beans", split="train")
+         ```
+
+         Generate chain from loaded dataset:
+         ```py
+         from datasets import load_dataset
+         ds = load_dataset("beans", split="train")
+         import datachain as dc
+         chain = dc.from_hf(ds)
+         ```
+     """
+     from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits
+
+     from .values import from_values
+
+     output: dict[str, DataType] = {}
+     ds_dict = stream_splits(dataset, *args, **kwargs)
+     if len(ds_dict) > 1:
+         output = {"split": str}
+
+     model_name = model_name or object_name or ""
+     hf_features = next(iter(ds_dict.values())).features
+     output = output | get_output_schema(hf_features)
+     model = dict_to_data_model(model_name, output)
+     if object_name:
+         output = {object_name: model}
+
+     chain = from_values(split=list(ds_dict.keys()), session=session, settings=settings)
+     return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
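A small sketch of `from_hf` with the optional naming parameters; `"hf"` and `"HFRow"` are arbitrary names chosen here, and `"beans"` is the dataset used in the docstring above:

```py
import datachain as dc

# Wrap all generated columns into a single "hf" object backed by a data model
# named "HFRow"; extra keyword arguments are forwarded to datasets.load_dataset.
chain = dc.from_hf("beans", split="train", object_name="hf", model_name="HFRow")
```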
@@ -0,0 +1,91 @@
+ import os
+ import os.path
+ import re
+ from typing import (
+     TYPE_CHECKING,
+     Optional,
+     Union,
+ )
+
+ from datachain.lib.data_model import DataType
+ from datachain.lib.file import (
+     File,
+     FileType,
+ )
+ from datachain.lib.meta_formats import read_meta
+
+ if TYPE_CHECKING:
+     from typing_extensions import ParamSpec
+
+     from .datachain import DataChain
+
+     P = ParamSpec("P")
+
+
+ def from_json(
+     path: Union[str, os.PathLike[str]],
+     type: FileType = "text",
+     spec: Optional[DataType] = None,
+     schema_from: Optional[str] = "auto",
+     jmespath: Optional[str] = None,
+     object_name: Optional[str] = "",
+     model_name: Optional[str] = None,
+     format: Optional[str] = "json",
+     nrows=None,
+     **kwargs,
+ ) -> "DataChain":
+     """Get data from JSON. It returns the chain itself.
+
+     Parameters:
+         path : storage URI with directory. URI must start with storage prefix such
+             as `s3://`, `gs://`, `az://` or "file:///"
+         type : read file as "binary", "text", or "image" data. Default is "text".
+         spec : optional Data Model
+         schema_from : path to sample to infer spec (if schema not provided)
+         object_name : generated object column name
+         model_name : optional generated model name
+         format: "json", "jsonl"
+         jmespath : optional JMESPATH expression to reduce JSON
+         nrows : optional row limit for jsonl and JSON arrays
+
+     Example:
+         infer JSON schema from data, reduce using JMESPATH
+         ```py
+         import datachain as dc
+         chain = dc.from_json("gs://json", jmespath="key1.key2")
+         ```
+
+         infer JSON schema from a particular path
+         ```py
+         import datachain as dc
+         chain = dc.from_json("gs://json_ds", schema_from="gs://json/my.json")
+         ```
+     """
+     from .storage import from_storage
+
+     if schema_from == "auto":
+         schema_from = str(path)
+
+     def jmespath_to_name(s: str):
+         name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
+         return s[:name_end]
+
+     if (not object_name) and jmespath:
+         object_name = jmespath_to_name(jmespath)
+     if not object_name:
+         object_name = format
+     chain = from_storage(uri=path, type=type, **kwargs)
+     signal_dict = {
+         object_name: read_meta(
+             schema_from=schema_from,
+             format=format,
+             spec=spec,
+             model_name=model_name,
+             jmespath=jmespath,
+             nrows=nrows,
+         ),
+         "params": {"file": File},
+     }
+     # disable prefetch if nrows is set
+     settings = {"prefetch": 0} if nrows else {}
+     return chain.settings(**settings).gen(**signal_dict)  # type: ignore[misc, arg-type]
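A short sketch combining the `format` and `nrows` parameters documented above; the `gs://json_ds/logs` path is a placeholder:

```py
import datachain as dc

# Read newline-delimited JSON and cap the number of rows; when nrows is set,
# from_json disables prefetch internally (see the settings handling above).
chain = dc.from_json("gs://json_ds/logs", format="jsonl", nrows=100)
```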
@@ -0,0 +1,43 @@
+ from typing import (
+     TYPE_CHECKING,
+     Optional,
+ )
+
+ from datachain.lib.listing_info import ListingInfo
+ from datachain.query import Session
+
+ from .values import from_values
+
+ if TYPE_CHECKING:
+     from typing_extensions import ParamSpec
+
+     from .datachain import DataChain
+
+     P = ParamSpec("P")
+
+
+ def listings(
+     session: Optional[Session] = None,
+     in_memory: bool = False,
+     object_name: str = "listing",
+     **kwargs,
+ ) -> "DataChain":
+     """Generate chain with list of cached listings.
+     Listing is a special kind of dataset which has directory listing data of
+     some underlying storage (e.g. an S3 bucket).
+
+     Example:
+         ```py
+         import datachain as dc
+         dc.listings().show()
+         ```
+     """
+     session = Session.get(session, in_memory=in_memory)
+     catalog = kwargs.get("catalog") or session.catalog
+
+     return from_values(
+         session=session,
+         in_memory=in_memory,
+         output={object_name: ListingInfo},
+         **{object_name: catalog.listings()},  # type: ignore[arg-type]
+     )
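A one-line sketch of the new helper, using the `object_name` parameter shown above (the column name `"listing"` is simply the default spelled out):

```py
import datachain as dc

# Each row is a ListingInfo object; object_name controls the output column name.
dc.listings(object_name="listing").show()
```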
@@ -0,0 +1,56 @@
+ from typing import (
+     TYPE_CHECKING,
+     Optional,
+ )
+
+ from datachain.query import Session
+
+ from .values import from_values
+
+ if TYPE_CHECKING:
+     import pandas as pd
+     from typing_extensions import ParamSpec
+
+     from .datachain import DataChain
+
+     P = ParamSpec("P")
+
+
+ def from_pandas(  # type: ignore[override]
+     df: "pd.DataFrame",
+     name: str = "",
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     in_memory: bool = False,
+     object_name: str = "",
+ ) -> "DataChain":
+     """Generate chain from pandas data-frame.
+
+     Example:
+         ```py
+         import pandas as pd
+         import datachain as dc
+
+         df = pd.DataFrame({"fib": [1, 2, 3, 5, 8]})
+         dc.from_pandas(df)
+         ```
+     """
+     from .utils import DatasetPrepareError
+
+     fr_map = {col.lower(): df[col].tolist() for col in df.columns}
+
+     for column in fr_map:
+         if not column.isidentifier():
+             raise DatasetPrepareError(
+                 name,
+                 f"import from pandas error - '{column}' cannot be a column name",
+             )
+
+     return from_values(
+         name,
+         session,
+         settings=settings,
+         object_name=object_name,
+         in_memory=in_memory,
+         **fr_map,
+     )
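A minimal sketch of `from_pandas`; the DataFrame contents and the `"row"` object name are invented for illustration:

```py
import pandas as pd
import datachain as dc

# Column names are lowercased and must be valid Python identifiers,
# otherwise DatasetPrepareError is raised (see the check above).
df = pd.DataFrame({"name": ["a", "b"], "size": [1, 2]})
chain = dc.from_pandas(df, object_name="row")
```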
@@ -0,0 +1,65 @@
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Optional,
+ )
+
+ from datachain.lib.data_model import DataType
+ from datachain.query import Session
+
+ if TYPE_CHECKING:
+     from typing_extensions import ParamSpec
+
+     from .datachain import DataChain
+
+     P = ParamSpec("P")
+
+
+ def from_parquet(
+     path,
+     partitioning: Any = "hive",
+     output: Optional[dict[str, DataType]] = None,
+     object_name: str = "",
+     model_name: str = "",
+     source: bool = True,
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     **kwargs,
+ ) -> "DataChain":
+     """Generate chain from parquet files.
+
+     Parameters:
+         path : Storage URI with directory. URI must start with storage prefix such
+             as `s3://`, `gs://`, `az://` or "file:///".
+         partitioning : Any pyarrow partitioning schema.
+         output : Dictionary defining column names and their corresponding types.
+         object_name : Created object column name.
+         model_name : Generated model name.
+         source : Whether to include info about the source file.
+         session : Session to use for the chain.
+         settings : Settings to use for the chain.
+
+     Example:
+         Reading a single file:
+         ```py
+         import datachain as dc
+         dc.from_parquet("s3://mybucket/file.parquet")
+         ```
+
+         Reading a partitioned dataset from a directory:
+         ```py
+         import datachain as dc
+         dc.from_parquet("s3://mybucket/dir")
+         ```
+     """
+     from .storage import from_storage
+
+     chain = from_storage(path, session=session, settings=settings, **kwargs)
+     return chain.parse_tabular(
+         output=output,
+         object_name=object_name,
+         model_name=model_name,
+         source=source,
+         format="parquet",
+         partitioning=partitioning,
+     )
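A sketch of `from_parquet` with an explicit `output` schema and `source=False`; the bucket path and the `{"id": int}` column spec are assumptions for illustration, not taken from the diff:

```py
import datachain as dc

# Pin a column type instead of relying on inference, and skip the
# source-file info column; both parameters are documented above.
chain = dc.from_parquet("s3://mybucket/dir", output={"id": int}, source=False)
```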
@@ -0,0 +1,90 @@
+ from typing import (
+     TYPE_CHECKING,
+     Optional,
+     Union,
+ )
+
+ import sqlalchemy
+
+ from datachain.lib.data_model import DataType
+ from datachain.lib.file import (
+     File,
+ )
+ from datachain.lib.signal_schema import SignalSchema
+ from datachain.query import Session
+
+ if TYPE_CHECKING:
+     from typing_extensions import ParamSpec
+
+     from .datachain import DataChain
+
+     P = ParamSpec("P")
+
+
+ def from_records(
+     to_insert: Optional[Union[dict, list[dict]]],
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     in_memory: bool = False,
+     schema: Optional[dict[str, DataType]] = None,
+ ) -> "DataChain":
+     """Create a DataChain from the provided records. This method can be used for
+     programmatically generating a chain in contrast to reading data from storage
+     or other sources.
+
+     Parameters:
+         to_insert : records (or a single record) to insert. Each record is
+             a dictionary of signals and their values.
+         schema : describes chain signals and their corresponding types
+
+     Example:
+         ```py
+         import datachain as dc
+         single_record = dc.from_records(dc.DEFAULT_FILE_RECORD)
+         ```
+     """
+     from .datasets import from_dataset
+
+     session = Session.get(session, in_memory=in_memory)
+     catalog = session.catalog
+
+     name = session.generate_temp_dataset_name()
+     signal_schema = None
+     columns: list[sqlalchemy.Column] = []
+
+     if schema:
+         signal_schema = SignalSchema(schema)
+         columns = [
+             sqlalchemy.Column(c.name, c.type)  # type: ignore[union-attr]
+             for c in signal_schema.db_signals(as_columns=True)  # type: ignore[assignment]
+         ]
+     else:
+         columns = [
+             sqlalchemy.Column(name, typ)
+             for name, typ in File._datachain_column_types.items()
+         ]
+
+     dsr = catalog.create_dataset(
+         name,
+         columns=columns,
+         feature_schema=(
+             signal_schema.clone_without_sys_signals().serialize()
+             if signal_schema
+             else None
+         ),
+     )
+
+     session.add_dataset_version(dsr, dsr.latest_version)
+
+     if isinstance(to_insert, dict):
+         to_insert = [to_insert]
+     elif not to_insert:
+         to_insert = []
+
+     warehouse = catalog.warehouse
+     dr = warehouse.dataset_rows(dsr)
+     db = warehouse.db
+     insert_q = dr.get_table().insert()
+     for record in to_insert:
+         db.execute(insert_q.values(**record))
+     return from_dataset(name=dsr.name, session=session, settings=settings)
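A sketch of `from_records` with an explicit `schema`; the record contents are invented for illustration, and the type spec follows the `schema` parameter documented above:

```py
import datachain as dc

# Build a small chain from in-memory records with an explicit schema;
# a single dict is also accepted and is wrapped into a list (see above).
records = [{"name": "a", "size": 1}, {"name": "b", "size": 2}]
chain = dc.from_records(records, schema={"name": str, "size": int})
```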
@@ -0,0 +1,118 @@
+ import os.path
+ from typing import (
+     TYPE_CHECKING,
+     Optional,
+     Union,
+ )
+
+ from datachain.lib.file import (
+     File,
+     FileType,
+     get_file_type,
+ )
+ from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
+ from datachain.query import Session
+
+ if TYPE_CHECKING:
+     from .datachain import DataChain
+
+
+ def from_storage(
+     uri: Union[str, os.PathLike[str]],
+     *,
+     type: FileType = "binary",
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     in_memory: bool = False,
+     recursive: Optional[bool] = True,
+     object_name: str = "file",
+     update: bool = False,
+     anon: bool = False,
+     client_config: Optional[dict] = None,
+ ) -> "DataChain":
+     """Get data from storage as a list of files with all file attributes.
+     It returns the chain itself as usual.
+
+     Parameters:
+         uri : storage URI with directory. URI must start with storage prefix such
+             as `s3://`, `gs://`, `az://` or "file:///"
+         type : read file as "binary", "text", or "image" data. Default is "binary".
+         recursive : search recursively for the given path.
+         object_name : Created object column name.
+         update : force storage reindexing. Default is False.
+         anon : If True, the cloud bucket is treated as a public (anonymous) one.
+         client_config : Optional client configuration for the storage client.
+
+     Example:
+         Simple call from s3
+         ```py
+         import datachain as dc
+         chain = dc.from_storage("s3://my-bucket/my-dir")
+         ```
+
+         With AWS S3-compatible storage
+         ```py
+         import datachain as dc
+         chain = dc.from_storage(
+             "s3://my-bucket/my-dir",
+             client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
+         )
+         ```
+
+         Pass existing session
+         ```py
+         session = Session.get()
+         import datachain as dc
+         chain = dc.from_storage("s3://my-bucket/my-dir", session=session)
+         ```
+     """
+     from .datachain import DataChain
+     from .datasets import from_dataset
+     from .records import from_records
+     from .values import from_values
+
+     file_type = get_file_type(type)
+
+     if anon:
+         client_config = (client_config or {}) | {"anon": True}
+     session = Session.get(session, client_config=client_config, in_memory=in_memory)
+     cache = session.catalog.cache
+     client_config = session.catalog.client_config
+
+     list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
+         uri, session, update=update
+     )
+
+     # ds_name is None if object is a file, we don't want to use cache
+     # or do listing in that case - just read that single object
+     if not list_ds_name:
+         dc = from_values(
+             session=session,
+             settings=settings,
+             in_memory=in_memory,
+             file=[get_file_info(list_uri, cache, client_config=client_config)],
+         )
+         dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+         return dc
+
+     if update or not list_ds_exists:
+         # disable prefetch for listing, as it pre-downloads all files
+         (
+             from_records(
+                 DataChain.DEFAULT_FILE_RECORD,
+                 session=session,
+                 settings=settings,
+                 in_memory=in_memory,
+             )
+             .settings(prefetch=0)
+             .gen(
+                 list_bucket(list_uri, cache, client_config=client_config),
+                 output={f"{object_name}": File},
+             )
+             .save(list_ds_name, listing=True)
+         )
+
+     dc = from_dataset(list_ds_name, session=session, settings=settings)
+     dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+
+     return ls(dc, list_path, recursive=recursive, object_name=object_name)
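Finally, a sketch of `from_storage` with the `anon` and `update` flags described above; the bucket URI is a placeholder:

```py
import datachain as dc

# Force a re-listing of a public bucket and read files as text;
# anon merges {"anon": True} into client_config, as shown above.
chain = dc.from_storage("s3://my-public-bucket/data", type="text", anon=True, update=True)
```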