PyPI - datachain - Versions diffs - 0.8.0__tar.gz → 0.8.1__tar.gz - Mend

datachain 0.8.0__tar.gz → 0.8.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (290) hide show

{datachain-0.8.0 → datachain-0.8.1}/.github/workflows/benchmarks.yml RENAMED Viewed

@@ -25,7 +25,7 @@ jobs:
           python-version: '3.12'
       - name: Setup uv
-        uses: astral-sh/setup-uv@v4
+        uses: astral-sh/setup-uv@v5
         with:
           enable-cache: true
           cache-suffix: benchmarks

{datachain-0.8.0 → datachain-0.8.1}/.github/workflows/release.yml RENAMED Viewed

@@ -27,7 +27,7 @@ jobs:
           python-version: '3.12'
       - name: Setup uv
-        uses: astral-sh/setup-uv@v4
+        uses: astral-sh/setup-uv@v5
       - name: Install nox
         run: uv pip install nox --system

{datachain-0.8.0 → datachain-0.8.1}/.github/workflows/tests-studio.yml RENAMED Viewed

@@ -81,7 +81,7 @@ jobs:
           python-version: ${{ matrix.pyv }}
       - name: Setup uv
-        uses: astral-sh/setup-uv@v4
+        uses: astral-sh/setup-uv@v5
         with:
           enable-cache: true
           cache-suffix: studio

{datachain-0.8.0 → datachain-0.8.1}/.github/workflows/tests.yml RENAMED Viewed

@@ -37,7 +37,7 @@ jobs:
           python-version: '3.9'
       - name: Setup uv
-        uses: astral-sh/setup-uv@v4
+        uses: astral-sh/setup-uv@v5
         with:
           enable-cache: true
           cache-suffix: lint
@@ -94,7 +94,7 @@ jobs:
           python-version: ${{ matrix.pyv }}
       - name: Setup uv
-        uses: astral-sh/setup-uv@v4
+        uses: astral-sh/setup-uv@v5
         with:
           enable-cache: true
           cache-suffix: tests-${{ matrix.pyv }}
@@ -157,7 +157,7 @@ jobs:
           python-version: ${{ matrix.pyv }}
       - name: Setup uv
-        uses: astral-sh/setup-uv@v4
+        uses: astral-sh/setup-uv@v5
         with:
           enable-cache: true
           cache-suffix: examples-${{ matrix.pyv }}

{datachain-0.8.0 → datachain-0.8.1}/.pre-commit-config.yaml RENAMED Viewed

@@ -24,7 +24,7 @@ repos:
       - id: trailing-whitespace
         exclude: '^LICENSES/'
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: 'v0.8.3'
+    rev: 'v0.8.4'
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]

{datachain-0.8.0/src/datachain.egg-info → datachain-0.8.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.8.0
+Version: 0.8.1
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -84,7 +84,7 @@ Requires-Dist: requests-mock; extra == "tests"
 Requires-Dist: scipy; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.13.0; extra == "dev"
+Requires-Dist: mypy==1.14.0; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
 Requires-Dist: types-PyYAML; extra == "dev"
@@ -99,7 +99,7 @@ Requires-Dist: unstructured[pdf]; extra == "examples"
 Requires-Dist: pdfplumber==0.11.4; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: onnx==1.16.1; extra == "examples"
-Requires-Dist: ultralytics==8.3.50; extra == "examples"
+Requires-Dist: ultralytics==8.3.53; extra == "examples"
 ================
 |logo| DataChain

{datachain-0.8.0 → datachain-0.8.1}/docs/quick-start.md RENAMED Viewed

@@ -59,6 +59,8 @@ Batch inference with a simple sentiment model using the
 pip install transformers
 ```
+Note, `transformers` works only if `torch`, `tensorflow` >= 2.0, or `flax` are installed.
 The code below downloads files from the cloud, and applies a
 user-defined function to each one of them. All files with a positive
 sentiment detected are then copied to the local directory.
@@ -114,13 +116,14 @@ DataChain can parallelize API calls; the free Mistral tier supports up
 to 4 requests at the same time.
 ``` py
+import os
 from mistralai import Mistral
 from datachain import File, DataChain, Column
 PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
 def eval_dialogue(file: File) -> bool:
-     client = Mistral()
+     client = Mistral(api_key = os.environ["MISTRAL_API_KEY"])
      response = client.chat.complete(
          model="open-mixtral-8x22b",
          messages=[{"role": "system", "content": PROMPT},
@@ -130,7 +133,6 @@ def eval_dialogue(file: File) -> bool:
 chain = (
    DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
-   .settings(parallel=4, cache=True)
    .map(is_success=eval_dialogue)
    .save("mistral_files")
 )

{datachain-0.8.0 → datachain-0.8.1}/pyproject.toml RENAMED Viewed

@@ -96,7 +96,7 @@ tests = [
 ]
 dev = [
   "datachain[docs,tests]",
-  "mypy==1.13.0",
+  "mypy==1.14.0",
   "types-python-dateutil",
   "types-pytz",
   "types-PyYAML",
@@ -112,7 +112,7 @@ examples = [
   "pdfplumber==0.11.4",
   "huggingface_hub[hf_transfer]",
   "onnx==1.16.1",
-  "ultralytics==8.3.50"
+  "ultralytics==8.3.53"
 ]
 [project.urls]

{datachain-0.8.0 → datachain-0.8.1}/src/datachain/catalog/catalog.py RENAMED Viewed

@@ -52,6 +52,7 @@ from datachain.error import (
     QueryScriptCancelError,
     QueryScriptRunError,
 )
+from datachain.lib.listing import get_listing
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
@@ -599,7 +600,7 @@ class Catalog:
             source, session=self.session, update=update, object_name=object_name
         )
-        list_ds_name, list_uri, list_path, _ = DataChain.parse_uri(
+        list_ds_name, list_uri, list_path, _ = get_listing(
             source, self.session, update=update
         )
@@ -697,11 +698,9 @@ class Catalog:
                 )
                 indexed_sources = []
                 for source in dataset_sources:
-                    from datachain.lib.dc import DataChain
                     client = self.get_client(source, **client_config)
                     uri = client.uri
-                    dataset_name, _, _, _ = DataChain.parse_uri(uri, self.session)
+                    dataset_name, _, _, _ = get_listing(uri, self.session)
                     listing = Listing(
                         self.metastore.clone(),
                         self.warehouse.clone(),

{datachain-0.8.0 → datachain-0.8.1}/src/datachain/client/gcs.py RENAMED Viewed

@@ -32,6 +32,15 @@ class GCSClient(Client):
         return cast(GCSFileSystem, super().create_fs(**kwargs))
+    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
+        try:
+            return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
+        except AttributeError as exc:
+            is_anon = self.fs.storage_options.get("token") == "anon"
+            if is_anon and "you need a private key to sign credentials" in str(exc):
+                return f"https://storage.googleapis.com/{self.name}/{path}"
+            raise
     @staticmethod
     def parse_timestamp(timestamp: str) -> datetime:
         """

{datachain-0.8.0 → datachain-0.8.1}/src/datachain/data_storage/warehouse.py RENAMED Viewed

@@ -216,7 +216,6 @@ class AbstractWarehouse(ABC, Serializable):
         limit = query._limit
         paginated_query = query.limit(page_size)
-        results = None
         offset = 0
         num_yielded = 0

{datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/arrow.py RENAMED Viewed

@@ -1,9 +1,11 @@
 from collections.abc import Sequence
-from tempfile import NamedTemporaryFile
+from itertools import islice
 from typing import TYPE_CHECKING, Any, Optional
+import fsspec.implementations.reference
 import orjson
 import pyarrow as pa
+from fsspec.core import split_protocol
 from pyarrow.dataset import CsvFileFormat, dataset
 from tqdm import tqdm
@@ -25,7 +27,18 @@ if TYPE_CHECKING:
 DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY = b"DataChain SignalSchema"
+class ReferenceFileSystem(fsspec.implementations.reference.ReferenceFileSystem):
+    def _open(self, path, mode="rb", *args, **kwargs):
+        # overriding because `fsspec`'s `ReferenceFileSystem._open`
+        # reads the whole file in-memory.
+        (uri,) = self.references[path]
+        protocol, _ = split_protocol(uri)
+        return self.fss[protocol]._open(uri, mode, *args, **kwargs)
 class ArrowGenerator(Generator):
+    DEFAULT_BATCH_SIZE = 2**17  # same as `pyarrow._dataset._DEFAULT_BATCH_SIZE`
     def __init__(
         self,
         input_schema: Optional["pa.Schema"] = None,
@@ -55,57 +68,80 @@ class ArrowGenerator(Generator):
     def process(self, file: File):
         if file._caching_enabled:
             file.ensure_cached()
-            path = file.get_local_path()
-            ds = dataset(path, schema=self.input_schema, **self.kwargs)
-        elif self.nrows:
-            path = _nrows_file(file, self.nrows)
-            ds = dataset(path, schema=self.input_schema, **self.kwargs)
+            cache_path = file.get_local_path()
+            fs_path = file.path
+            fs = ReferenceFileSystem({fs_path: [cache_path]})
         else:
-            path = file.get_path()
-            ds = dataset(
-                path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
-            )
+            fs, fs_path = file.get_fs(), file.get_path()
+        ds = dataset(fs_path, schema=self.input_schema, filesystem=fs, **self.kwargs)
         hf_schema = _get_hf_schema(ds.schema)
         use_datachain_schema = (
             bool(ds.schema.metadata)
             and DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY in ds.schema.metadata
         )
-        index = 0
-        with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
-            for record_batch in ds.to_batches():
-                for record in record_batch.to_pylist():
-                    if use_datachain_schema and self.output_schema:
-                        vals = [_nested_model_instantiate(record, self.output_schema)]
-                    else:
-                        vals = list(record.values())
-                        if self.output_schema:
-                            fields = self.output_schema.model_fields
-                            vals_dict = {}
-                            for i, ((field, field_info), val) in enumerate(
-                                zip(fields.items(), vals)
-                            ):
-                                anno = field_info.annotation
-                                if hf_schema:
-                                    from datachain.lib.hf import convert_feature
-                                    feat = list(hf_schema[0].values())[i]
-                                    vals_dict[field] = convert_feature(val, feat, anno)
-                                elif ModelStore.is_pydantic(anno):
-                                    vals_dict[field] = anno(**val)  # type: ignore[misc]
-                                else:
-                                    vals_dict[field] = val
-                            vals = [self.output_schema(**vals_dict)]
-                    if self.source:
-                        kwargs: dict = self.kwargs
-                        # Can't serialize CsvFileFormat; may lose formatting options.
-                        if isinstance(kwargs.get("format"), CsvFileFormat):
-                            kwargs["format"] = "csv"
-                        arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
-                        yield [arrow_file, *vals]
-                    else:
-                        yield vals
-                    index += 1
-                pbar.update(len(record_batch))
+        kw = {}
+        if self.nrows:
+            kw = {"batch_size": min(self.DEFAULT_BATCH_SIZE, self.nrows)}
+        def iter_records():
+            for record_batch in ds.to_batches(**kw):
+                yield from record_batch.to_pylist()
+        it = islice(iter_records(), self.nrows)
+        with tqdm(it, desc="Parsed by pyarrow", unit="rows", total=self.nrows) as pbar:
+            for index, record in enumerate(pbar):
+                yield self._process_record(
+                    record, file, index, hf_schema, use_datachain_schema
+                )
+    def _process_record(
+        self,
+        record: dict[str, Any],
+        file: File,
+        index: int,
+        hf_schema: Optional[tuple["Features", dict[str, "DataType"]]],
+        use_datachain_schema: bool,
+    ):
+        if use_datachain_schema and self.output_schema:
+            vals = [_nested_model_instantiate(record, self.output_schema)]
+        else:
+            vals = self._process_non_datachain_record(record, hf_schema)
+        if self.source:
+            kwargs: dict = self.kwargs
+            # Can't serialize CsvFileFormat; may lose formatting options.
+            if isinstance(kwargs.get("format"), CsvFileFormat):
+                kwargs["format"] = "csv"
+            arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
+            return [arrow_file, *vals]
+        return vals
+    def _process_non_datachain_record(
+        self,
+        record: dict[str, Any],
+        hf_schema: Optional[tuple["Features", dict[str, "DataType"]]],
+    ):
+        vals = list(record.values())
+        if not self.output_schema:
+            return vals
+        fields = self.output_schema.model_fields
+        vals_dict = {}
+        for i, ((field, field_info), val) in enumerate(zip(fields.items(), vals)):
+            anno = field_info.annotation
+            if hf_schema:
+                from datachain.lib.hf import convert_feature
+                feat = list(hf_schema[0].values())[i]
+                vals_dict[field] = convert_feature(val, feat, anno)
+            elif ModelStore.is_pydantic(anno):
+                vals_dict[field] = anno(**val)  # type: ignore[misc]
+            else:
+                vals_dict[field] = val
+        return [self.output_schema(**vals_dict)]
 def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
@@ -190,18 +226,6 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa:
     raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")
-def _nrows_file(file: File, nrows: int) -> str:
-    tf = NamedTemporaryFile(delete=False)  # noqa: SIM115
-    with file.open(mode="r") as reader:
-        with open(tf.name, "a") as writer:
-            for row, line in enumerate(reader):
-                if row >= nrows:
-                    break
-                writer.write(line)
-                writer.write("\n")
-    return tf.name
 def _get_hf_schema(
     schema: "pa.Schema",
 ) -> Optional[tuple["Features", dict[str, "DataType"]]]:

{datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/dc.py RENAMED Viewed

@@ -11,7 +11,6 @@ from typing import (
     BinaryIO,
     Callable,
     ClassVar,
-    Literal,
     Optional,
     TypeVar,
     Union,
@@ -24,8 +23,6 @@ from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
 from sqlalchemy.sql.sqltypes import NullType
-from datachain.client import Client
-from datachain.client.local import FileClient
 from datachain.dataset import DatasetRecord
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -33,13 +30,9 @@ from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
-from datachain.lib.file import ArrowRow, File, get_file_type
+from datachain.lib.file import ArrowRow, File, FileType, get_file_type
 from datachain.lib.file import ExportPlacement as FileExportPlacement
-from datachain.lib.listing import (
-    list_bucket,
-    ls,
-    parse_listing_uri,
-)
+from datachain.lib.listing import get_listing, list_bucket, ls
 from datachain.lib.listing_info import ListingInfo
 from datachain.lib.meta_formats import read_meta
 from datachain.lib.model_store import ModelStore
@@ -403,53 +396,12 @@ class DataChain:
         self.signals_schema |= signals_schema
         return self
-    @classmethod
-    def parse_uri(
-        cls, uri: str, session: Session, update: bool = False
-    ) -> tuple[str, str, str, bool]:
-        """Returns correct listing dataset name that must be used for saving listing
-        operation. It takes into account existing listings and reusability of those.
-        It also returns boolean saying if returned dataset name is reused / already
-        exists or not, and it returns correct listing path that should be used to find
-        rows based on uri.
-        """
-        catalog = session.catalog
-        cache = catalog.cache
-        client_config = catalog.client_config
-        client = Client.get_client(uri, cache, **client_config)
-        ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
-        listing = None
-        listings = [
-            ls
-            for ls in catalog.listings()
-            if not ls.is_expired and ls.contains(ds_name)
-        ]
-        if listings:
-            if update:
-                # choosing the smallest possible one to minimize update time
-                listing = sorted(listings, key=lambda ls: len(ls.name))[0]
-            else:
-                # no need to update, choosing the most recent one
-                listing = sorted(listings, key=lambda ls: ls.created_at)[-1]
-        if isinstance(client, FileClient) and listing and listing.name != ds_name:
-            # For local file system we need to fix listing path / prefix
-            # if we are reusing existing listing
-            list_path = f'{ds_name.strip("/").removeprefix(listing.name)}/{list_path}'
-        ds_name = listing.name if listing else ds_name
-        return ds_name, list_uri, list_path, bool(listing)
     @classmethod
     def from_storage(
         cls,
         uri,
         *,
-        type: Literal["binary", "text", "image"] = "binary",
+        type: FileType = "binary",
         session: Optional[Session] = None,
         settings: Optional[dict] = None,
         in_memory: bool = False,
@@ -482,7 +434,7 @@ class DataChain:
         cache = session.catalog.cache
         client_config = session.catalog.client_config
-        list_ds_name, list_uri, list_path, list_ds_exists = cls.parse_uri(
+        list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
             uri, session, update=update
         )
@@ -548,7 +500,7 @@ class DataChain:
     def from_json(
         cls,
         path,
-        type: Literal["binary", "text", "image"] = "text",
+        type: FileType = "text",
         spec: Optional[DataType] = None,
         schema_from: Optional[str] = "auto",
         jmespath: Optional[str] = None,
@@ -605,7 +557,9 @@ class DataChain:
                 nrows=nrows,
             )
         }
-        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
+        # disable prefetch if nrows is set
+        settings = {"prefetch": 0} if nrows else {}
+        return chain.settings(**settings).gen(**signal_dict)  # type: ignore[misc, arg-type]
     def explode(
         self,
@@ -1942,7 +1896,10 @@ class DataChain:
         if source:
             output = {"source": ArrowRow} | output  # type: ignore[assignment,operator]
-        return self.gen(
+        # disable prefetch if nrows is set
+        settings = {"prefetch": 0} if nrows else {}
+        return self.settings(**settings).gen(  # type: ignore[arg-type]
             ArrowGenerator(schema, model, source, nrows, **kwargs), output=output
         )
@@ -2024,8 +1981,6 @@ class DataChain:
             else:
                 msg = f"error parsing csv - incompatible output type {type(output)}"
                 raise DatasetPrepareError(chain.name, msg)
-        elif nrows:
-            nrows += 1
         parse_options = ParseOptions(delimiter=delimiter)
         read_options = ReadOptions(column_names=column_names)

{datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/file.py RENAMED Viewed

@@ -39,6 +39,8 @@ logger = logging.getLogger("datachain")
 # how to create file path when exporting
 ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
+FileType = Literal["binary", "text", "image"]
 class VFileError(DataChainError):
     def __init__(self, file: "File", message: str, vtype: str = ""):
@@ -470,7 +472,7 @@ class ArrowRow(DataModel):
             return record_batch.to_pylist()[0]
-def get_file_type(type_: Literal["binary", "text", "image"] = "binary") -> type[File]:
+def get_file_type(type_: FileType = "binary") -> type[File]:
     file: type[File] = File
     if type_ == "text":
         file = TextFile

{datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/listing.py RENAMED Viewed

@@ -15,6 +15,7 @@ from datachain.utils import uses_glob
 if TYPE_CHECKING:
     from datachain.lib.dc import DataChain
+    from datachain.query.session import Session
 LISTING_TTL = 4 * 60 * 60  # cached listing lasts 4 hours
 LISTING_PREFIX = "lst__"  # listing datasets start with this name
@@ -108,3 +109,46 @@ def listing_uri_from_name(dataset_name: str) -> str:
     if not is_listing_dataset(dataset_name):
         raise ValueError(f"Dataset {dataset_name} is not a listing")
     return dataset_name.removeprefix(LISTING_PREFIX)
+def get_listing(
+    uri: str, session: "Session", update: bool = False
+) -> tuple[str, str, str, bool]:
+    """Returns correct listing dataset name that must be used for saving listing
+    operation. It takes into account existing listings and reusability of those.
+    It also returns boolean saying if returned dataset name is reused / already
+    exists or not (on update it always returns False - just because there was no
+    reason to complicate it so far). And it returns correct listing path that should
+    be used to find rows based on uri.
+    """
+    from datachain.client.local import FileClient
+    catalog = session.catalog
+    cache = catalog.cache
+    client_config = catalog.client_config
+    client = Client.get_client(uri, cache, **client_config)
+    ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
+    listing = None
+    listings = [
+        ls for ls in catalog.listings() if not ls.is_expired and ls.contains(ds_name)
+    ]
+    # if no need to update - choosing the most recent one;
+    # otherwise, we'll using the exact original `ds_name`` in this case:
+    # - if a "bigger" listing exists, we don't want to update it, it's better
+    #   to create a new "smaller" one on "update=True"
+    # - if an exact listing exists it will have the same name as `ds_name`
+    #   anyway below
+    if listings and not update:
+        listing = sorted(listings, key=lambda ls: ls.created_at)[-1]
+    # for local file system we need to fix listing path / prefix
+    # if we are reusing existing listing
+    if isinstance(client, FileClient) and listing and listing.name != ds_name:
+        list_path = f'{ds_name.strip("/").removeprefix(listing.name)}/{list_path}'
+    ds_name = listing.name if listing else ds_name
+    return ds_name, list_uri, list_path, bool(listing)

{datachain-0.8.0 → datachain-0.8.1}/src/datachain/lib/udf.py RENAMED Viewed

@@ -85,7 +85,6 @@ class UDFAdapter:
         udf_fields: "Sequence[str]",
         udf_inputs: "Iterable[RowsOutput]",
         catalog: "Catalog",
-        is_generator: bool,
         cache: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,

datachain 0.8.0__tar.gz → 0.8.1__tar.gz

Potentially problematic release.

datachain 0.8.0__tar.gz → 0.8.1__tar.gz