datachain 0.8.0__py3-none-any.whl → 0.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

@@ -52,6 +52,7 @@ from datachain.error import (
     QueryScriptCancelError,
     QueryScriptRunError,
 )
+from datachain.lib.listing import get_listing
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
@@ -599,7 +600,7 @@ class Catalog:
             source, session=self.session, update=update, object_name=object_name
         )

-        list_ds_name, list_uri, list_path, _ = DataChain.parse_uri(
+        list_ds_name, list_uri, list_path, _ = get_listing(
             source, self.session, update=update
         )

@@ -697,11 +698,9 @@ class Catalog:
         )
         indexed_sources = []
         for source in dataset_sources:
-            from datachain.lib.dc import DataChain
-
             client = self.get_client(source, **client_config)
             uri = client.uri
-            dataset_name, _, _, _ = DataChain.parse_uri(uri, self.session)
+            dataset_name, _, _, _ = get_listing(uri, self.session)
             listing = Listing(
                 self.metastore.clone(),
                 self.warehouse.clone(),
datachain/client/gcs.py CHANGED
@@ -32,6 +32,16 @@ class GCSClient(Client):

         return cast(GCSFileSystem, super().create_fs(**kwargs))

+    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
+        """
+        Generate a signed URL for the given path.
+        If the client is anonymous, a public URL is returned instead
+        (see https://cloud.google.com/storage/docs/access-public-data#api-link).
+        """
+        if self.fs.storage_options.get("token") == "anon":
+            return f"https://storage.googleapis.com/{self.name}/{path}"
+        return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
+
     @staticmethod
     def parse_timestamp(timestamp: str) -> datetime:
         """
@@ -216,7 +216,6 @@ class AbstractWarehouse(ABC, Serializable):
         limit = query._limit
         paginated_query = query.limit(page_size)

-        results = None
         offset = 0
         num_yielded = 0

datachain/lib/arrow.py CHANGED
@@ -1,9 +1,11 @@
 from collections.abc import Sequence
-from tempfile import NamedTemporaryFile
+from itertools import islice
 from typing import TYPE_CHECKING, Any, Optional

+import fsspec.implementations.reference
 import orjson
 import pyarrow as pa
+from fsspec.core import split_protocol
 from pyarrow.dataset import CsvFileFormat, dataset
 from tqdm import tqdm

@@ -25,7 +27,18 @@ if TYPE_CHECKING:
 DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY = b"DataChain SignalSchema"


+class ReferenceFileSystem(fsspec.implementations.reference.ReferenceFileSystem):
+    def _open(self, path, mode="rb", *args, **kwargs):
+        # overriding because `fsspec`'s `ReferenceFileSystem._open`
+        # reads the whole file in-memory.
+        (uri,) = self.references[path]
+        protocol, _ = split_protocol(uri)
+        return self.fss[protocol]._open(uri, mode, *args, **kwargs)
+
+
 class ArrowGenerator(Generator):
+    DEFAULT_BATCH_SIZE = 2**17  # same as `pyarrow._dataset._DEFAULT_BATCH_SIZE`
+
     def __init__(
         self,
         input_schema: Optional["pa.Schema"] = None,
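
To illustrate how the override routes a read (the paths below are made up; this only restates the three lines of `_open` above): each logical path maps to exactly one backing URI, `split_protocol` decides which underlying filesystem serves it, and the file object is opened lazily instead of being read fully into memory as the stock `ReferenceFileSystem._open` does.

    from fsspec.core import split_protocol

    references = {"data/part-0.parquet": ["/tmp/datachain-cache/ab/cdef123"]}
    (uri,) = references["data/part-0.parquet"]
    protocol, _ = split_protocol(uri)  # None -> handled by the local filesystem
    print(protocol, uri)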
@@ -55,57 +68,80 @@ class ArrowGenerator(Generator):
     def process(self, file: File):
         if file._caching_enabled:
             file.ensure_cached()
-            path = file.get_local_path()
-            ds = dataset(path, schema=self.input_schema, **self.kwargs)
-        elif self.nrows:
-            path = _nrows_file(file, self.nrows)
-            ds = dataset(path, schema=self.input_schema, **self.kwargs)
+            cache_path = file.get_local_path()
+            fs_path = file.path
+            fs = ReferenceFileSystem({fs_path: [cache_path]})
         else:
-            path = file.get_path()
-            ds = dataset(
-                path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
-            )
+            fs, fs_path = file.get_fs(), file.get_path()
+
+        ds = dataset(fs_path, schema=self.input_schema, filesystem=fs, **self.kwargs)
+
         hf_schema = _get_hf_schema(ds.schema)
         use_datachain_schema = (
             bool(ds.schema.metadata)
             and DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY in ds.schema.metadata
         )
-        index = 0
-        with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
-            for record_batch in ds.to_batches():
-                for record in record_batch.to_pylist():
-                    if use_datachain_schema and self.output_schema:
-                        vals = [_nested_model_instantiate(record, self.output_schema)]
-                    else:
-                        vals = list(record.values())
-                        if self.output_schema:
-                            fields = self.output_schema.model_fields
-                            vals_dict = {}
-                            for i, ((field, field_info), val) in enumerate(
-                                zip(fields.items(), vals)
-                            ):
-                                anno = field_info.annotation
-                                if hf_schema:
-                                    from datachain.lib.hf import convert_feature
-
-                                    feat = list(hf_schema[0].values())[i]
-                                    vals_dict[field] = convert_feature(val, feat, anno)
-                                elif ModelStore.is_pydantic(anno):
-                                    vals_dict[field] = anno(**val)  # type: ignore[misc]
-                                else:
-                                    vals_dict[field] = val
-                            vals = [self.output_schema(**vals_dict)]
-                    if self.source:
-                        kwargs: dict = self.kwargs
-                        # Can't serialize CsvFileFormat; may lose formatting options.
-                        if isinstance(kwargs.get("format"), CsvFileFormat):
-                            kwargs["format"] = "csv"
-                        arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
-                        yield [arrow_file, *vals]
-                    else:
-                        yield vals
-                    index += 1
-                pbar.update(len(record_batch))
+
+        kw = {}
+        if self.nrows:
+            kw = {"batch_size": min(self.DEFAULT_BATCH_SIZE, self.nrows)}
+
+        def iter_records():
+            for record_batch in ds.to_batches(**kw):
+                yield from record_batch.to_pylist()
+
+        it = islice(iter_records(), self.nrows)
+        with tqdm(it, desc="Parsed by pyarrow", unit="rows", total=self.nrows) as pbar:
+            for index, record in enumerate(pbar):
+                yield self._process_record(
+                    record, file, index, hf_schema, use_datachain_schema
+                )
+
+    def _process_record(
+        self,
+        record: dict[str, Any],
+        file: File,
+        index: int,
+        hf_schema: Optional[tuple["Features", dict[str, "DataType"]]],
+        use_datachain_schema: bool,
+    ):
+        if use_datachain_schema and self.output_schema:
+            vals = [_nested_model_instantiate(record, self.output_schema)]
+        else:
+            vals = self._process_non_datachain_record(record, hf_schema)
+
+        if self.source:
+            kwargs: dict = self.kwargs
+            # Can't serialize CsvFileFormat; may lose formatting options.
+            if isinstance(kwargs.get("format"), CsvFileFormat):
+                kwargs["format"] = "csv"
+            arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
+            return [arrow_file, *vals]
+        return vals
+
+    def _process_non_datachain_record(
+        self,
+        record: dict[str, Any],
+        hf_schema: Optional[tuple["Features", dict[str, "DataType"]]],
+    ):
+        vals = list(record.values())
+        if not self.output_schema:
+            return vals
+
+        fields = self.output_schema.model_fields
+        vals_dict = {}
+        for i, ((field, field_info), val) in enumerate(zip(fields.items(), vals)):
+            anno = field_info.annotation
+            if hf_schema:
+                from datachain.lib.hf import convert_feature
+
+                feat = list(hf_schema[0].values())[i]
+                vals_dict[field] = convert_feature(val, feat, anno)
+            elif ModelStore.is_pydantic(anno):
+                vals_dict[field] = anno(**val)  # type: ignore[misc]
+            else:
+                vals_dict[field] = val
+        return [self.output_schema(**vals_dict)]


 def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
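
The `_nrows_file` helper (removed in the hunk below) used to write a truncated temporary copy of the input just to honor `nrows`; the rewrite caps the row count in-stream instead. A small self-contained sketch of that idea (the numbers are arbitrary):

    from itertools import islice

    nrows = 1_000
    batch_size = min(2**17, nrows)  # ask pyarrow for batches no larger than nrows
    records = iter(range(10_000))   # stand-in for rows coming out of ds.to_batches()
    capped = list(islice(records, nrows))
    assert len(capped) == nrows     # iteration stops after exactly nrows records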
@@ -190,18 +226,6 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa:
     raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")


-def _nrows_file(file: File, nrows: int) -> str:
-    tf = NamedTemporaryFile(delete=False)  # noqa: SIM115
-    with file.open(mode="r") as reader:
-        with open(tf.name, "a") as writer:
-            for row, line in enumerate(reader):
-                if row >= nrows:
-                    break
-                writer.write(line)
-                writer.write("\n")
-    return tf.name
-
-
 def _get_hf_schema(
     schema: "pa.Schema",
 ) -> Optional[tuple["Features", dict[str, "DataType"]]]:
datachain/lib/dc.py CHANGED
@@ -11,7 +11,6 @@ from typing import (
     BinaryIO,
     Callable,
     ClassVar,
-    Literal,
     Optional,
     TypeVar,
     Union,
@@ -24,8 +23,6 @@ from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
 from sqlalchemy.sql.sqltypes import NullType

-from datachain.client import Client
-from datachain.client.local import FileClient
 from datachain.dataset import DatasetRecord
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -33,13 +30,9 @@ from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
-from datachain.lib.file import ArrowRow, File, get_file_type
+from datachain.lib.file import ArrowRow, File, FileType, get_file_type
 from datachain.lib.file import ExportPlacement as FileExportPlacement
-from datachain.lib.listing import (
-    list_bucket,
-    ls,
-    parse_listing_uri,
-)
+from datachain.lib.listing import get_listing, list_bucket, ls
 from datachain.lib.listing_info import ListingInfo
 from datachain.lib.meta_formats import read_meta
 from datachain.lib.model_store import ModelStore
@@ -403,53 +396,12 @@ class DataChain:
         self.signals_schema |= signals_schema
         return self

-    @classmethod
-    def parse_uri(
-        cls, uri: str, session: Session, update: bool = False
-    ) -> tuple[str, str, str, bool]:
-        """Returns correct listing dataset name that must be used for saving listing
-        operation. It takes into account existing listings and reusability of those.
-        It also returns boolean saying if returned dataset name is reused / already
-        exists or not, and it returns correct listing path that should be used to find
-        rows based on uri.
-        """
-        catalog = session.catalog
-        cache = catalog.cache
-        client_config = catalog.client_config
-
-        client = Client.get_client(uri, cache, **client_config)
-        ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
-        listing = None
-
-        listings = [
-            ls
-            for ls in catalog.listings()
-            if not ls.is_expired and ls.contains(ds_name)
-        ]
-
-        if listings:
-            if update:
-                # choosing the smallest possible one to minimize update time
-                listing = sorted(listings, key=lambda ls: len(ls.name))[0]
-            else:
-                # no need to update, choosing the most recent one
-                listing = sorted(listings, key=lambda ls: ls.created_at)[-1]
-
-        if isinstance(client, FileClient) and listing and listing.name != ds_name:
-            # For local file system we need to fix listing path / prefix
-            # if we are reusing existing listing
-            list_path = f'{ds_name.strip("/").removeprefix(listing.name)}/{list_path}'
-
-        ds_name = listing.name if listing else ds_name
-
-        return ds_name, list_uri, list_path, bool(listing)
-
     @classmethod
     def from_storage(
         cls,
         uri,
         *,
-        type: Literal["binary", "text", "image"] = "binary",
+        type: FileType = "binary",
         session: Optional[Session] = None,
         settings: Optional[dict] = None,
         in_memory: bool = False,
@@ -482,7 +434,7 @@ class DataChain:
         cache = session.catalog.cache
         client_config = session.catalog.client_config

-        list_ds_name, list_uri, list_path, list_ds_exists = cls.parse_uri(
+        list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
            uri, session, update=update
        )

@@ -548,7 +500,7 @@ class DataChain:
     def from_json(
         cls,
         path,
-        type: Literal["binary", "text", "image"] = "text",
+        type: FileType = "text",
         spec: Optional[DataType] = None,
         schema_from: Optional[str] = "auto",
         jmespath: Optional[str] = None,
@@ -605,7 +557,9 @@ class DataChain:
                 nrows=nrows,
             )
         }
-        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
+        # disable prefetch if nrows is set
+        settings = {"prefetch": 0} if nrows else {}
+        return chain.settings(**settings).gen(**signal_dict)  # type: ignore[misc, arg-type]

     def explode(
         self,
@@ -1942,7 +1896,10 @@ class DataChain:

         if source:
             output = {"source": ArrowRow} | output  # type: ignore[assignment,operator]
-        return self.gen(
+
+        # disable prefetch if nrows is set
+        settings = {"prefetch": 0} if nrows else {}
+        return self.settings(**settings).gen(  # type: ignore[arg-type]
             ArrowGenerator(schema, model, source, nrows, **kwargs), output=output
         )

@@ -2024,8 +1981,6 @@ class DataChain:
                 else:
                     msg = f"error parsing csv - incompatible output type {type(output)}"
                     raise DatasetPrepareError(chain.name, msg)
-            elif nrows:
-                nrows += 1

         parse_options = ParseOptions(delimiter=delimiter)
         read_options = ReadOptions(column_names=column_names)
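
The prefetch guard used in `from_json` and `parse_tabular` above, shown in isolation (the helper name `_prefetch_settings` is only for this sketch and does not exist in the codebase): prefetching whole objects is wasted work when only the first `nrows` records will be parsed, so prefetch is switched off in that case.

    def _prefetch_settings(nrows):
        # mirrors the inline guard: disable prefetch whenever nrows is set
        return {"prefetch": 0} if nrows else {}

    assert _prefetch_settings(100) == {"prefetch": 0}
    assert _prefetch_settings(None) == {}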
datachain/lib/file.py CHANGED
@@ -39,6 +39,8 @@ logger = logging.getLogger("datachain")
 # how to create file path when exporting
 ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]

+FileType = Literal["binary", "text", "image"]
+

 class VFileError(DataChainError):
     def __init__(self, file: "File", message: str, vtype: str = ""):
@@ -470,7 +472,7 @@ class ArrowRow(DataModel):
         return record_batch.to_pylist()[0]


-def get_file_type(type_: Literal["binary", "text", "image"] = "binary") -> type[File]:
+def get_file_type(type_: FileType = "binary") -> type[File]:
     file: type[File] = File
     if type_ == "text":
         file = TextFile
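
The alias is what `from_storage` and `from_json` now accept for their `type` parameter, and `get_file_type` resolves it to the corresponding `File` model. A quick sketch based only on the code shown above:

    from datachain.lib.file import File, FileType, TextFile, get_file_type

    kind: FileType = "text"
    assert get_file_type(kind) is TextFile
    assert get_file_type() is File  # "binary" is the default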
datachain/lib/listing.py CHANGED
@@ -15,6 +15,7 @@ from datachain.utils import uses_glob

 if TYPE_CHECKING:
     from datachain.lib.dc import DataChain
+    from datachain.query.session import Session

 LISTING_TTL = 4 * 60 * 60  # cached listing lasts 4 hours
 LISTING_PREFIX = "lst__"  # listing datasets start with this name
@@ -108,3 +109,46 @@ def listing_uri_from_name(dataset_name: str) -> str:
     if not is_listing_dataset(dataset_name):
         raise ValueError(f"Dataset {dataset_name} is not a listing")
     return dataset_name.removeprefix(LISTING_PREFIX)
+
+
+def get_listing(
+    uri: str, session: "Session", update: bool = False
+) -> tuple[str, str, str, bool]:
+    """Returns the correct listing dataset name that must be used for saving a
+    listing operation, taking into account existing listings and their reusability.
+    It also returns a boolean saying whether the returned dataset name is reused /
+    already exists (on update it always returns False - simply because there was no
+    reason to complicate it so far), and the correct listing path that should
+    be used to find rows based on the uri.
+    """
+    from datachain.client.local import FileClient
+
+    catalog = session.catalog
+    cache = catalog.cache
+    client_config = catalog.client_config
+
+    client = Client.get_client(uri, cache, **client_config)
+    ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
+    listing = None
+
+    listings = [
+        ls for ls in catalog.listings() if not ls.is_expired and ls.contains(ds_name)
+    ]
+
+    # if no update is needed, choose the most recent listing;
+    # otherwise stick with the exact original `ds_name`, because:
+    #   - if a "bigger" listing exists, we don't want to update it; it's better
+    #     to create a new "smaller" one on "update=True"
+    #   - if an exact listing exists, it will have the same name as `ds_name`
+    #     anyway below
+    if listings and not update:
+        listing = sorted(listings, key=lambda ls: ls.created_at)[-1]
+
+    # for the local file system we need to fix the listing path / prefix
+    # if we are reusing an existing listing
+    if isinstance(client, FileClient) and listing and listing.name != ds_name:
+        list_path = f'{ds_name.strip("/").removeprefix(listing.name)}/{list_path}'
+
+    ds_name = listing.name if listing else ds_name
+
+    return ds_name, list_uri, list_path, bool(listing)
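
A hedged usage sketch of the new helper (the URI is made up, and `Session.get()` is assumed here to be the usual way to obtain an active session):

    from datachain.lib.listing import get_listing
    from datachain.query.session import Session

    session = Session.get(in_memory=True)
    ds_name, list_uri, list_path, exists = get_listing(
        "s3://my-bucket/images/", session, update=False
    )
    # ds_name   -> listing dataset name (possibly a reused, broader listing)
    # list_path -> path prefix used to select rows from that listing
    # exists    -> True only when a non-expired existing listing is reused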
datachain/lib/udf.py CHANGED
@@ -85,7 +85,6 @@ class UDFAdapter:
         udf_fields: "Sequence[str]",
         udf_inputs: "Iterable[RowsOutput]",
         catalog: "Catalog",
-        is_generator: bool,
         cache: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
datachain/query/batch.py CHANGED
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Callable, Optional, Union

 from datachain.data_storage.schema import PARTITION_COLUMN_ID
 from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
+from datachain.query.utils import get_query_column, get_query_id_column

 if TYPE_CHECKING:
     from sqlalchemy import Select
@@ -23,11 +24,14 @@ RowsOutput = Union[Sequence, RowsOutputBatch]
 class BatchingStrategy(ABC):
     """BatchingStrategy provides means of batching UDF executions."""

+    is_batching: bool
+
     @abstractmethod
     def __call__(
         self,
-        execute: Callable[..., Generator[Sequence, None, None]],
+        execute: Callable,
         query: "Select",
+        ids_only: bool = False,
     ) -> Generator[RowsOutput, None, None]:
         """Apply the provided parameters to the UDF."""

@@ -38,11 +42,16 @@ class NoBatching(BatchingStrategy):
     batch UDF calls.
     """

+    is_batching = False
+
     def __call__(
         self,
-        execute: Callable[..., Generator[Sequence, None, None]],
+        execute: Callable,
         query: "Select",
+        ids_only: bool = False,
     ) -> Generator[Sequence, None, None]:
+        if ids_only:
+            query = query.with_only_columns(get_query_id_column(query))
         return execute(query)


@@ -52,14 +61,20 @@ class Batch(BatchingStrategy):
     is passed a sequence of multiple parameter sets.
     """

+    is_batching = True
+
     def __init__(self, count: int):
         self.count = count

     def __call__(
         self,
-        execute: Callable[..., Generator[Sequence, None, None]],
+        execute: Callable,
         query: "Select",
+        ids_only: bool = False,
     ) -> Generator[RowsOutputBatch, None, None]:
+        if ids_only:
+            query = query.with_only_columns(get_query_id_column(query))
+
         # choose page size that is a multiple of the batch size
         page_size = math.ceil(SELECT_BATCH_SIZE / self.count) * self.count

@@ -84,19 +99,30 @@ class Partition(BatchingStrategy):
     Dataset rows need to be sorted by the grouping column.
     """

+    is_batching = True
+
     def __call__(
         self,
-        execute: Callable[..., Generator[Sequence, None, None]],
+        execute: Callable,
         query: "Select",
+        ids_only: bool = False,
     ) -> Generator[RowsOutputBatch, None, None]:
+        id_col = get_query_id_column(query)
+        if (partition_col := get_query_column(query, PARTITION_COLUMN_ID)) is None:
+            raise RuntimeError("partition column not found in query")
+
+        if ids_only:
+            query = query.with_only_columns(id_col, partition_col)
+
         current_partition: Optional[int] = None
         batch: list[Sequence] = []

         query_fields = [str(c.name) for c in query.selected_columns]
+        id_column_idx = query_fields.index("sys__id")
         partition_column_idx = query_fields.index(PARTITION_COLUMN_ID)

         ordered_query = query.order_by(None).order_by(
-            PARTITION_COLUMN_ID,
+            partition_col,
             *query._order_by_clauses,
         )

@@ -108,7 +134,7 @@ class Partition(BatchingStrategy):
                 if len(batch) > 0:
                     yield RowsOutputBatch(batch)
                     batch = []
-            batch.append(row)
+            batch.append([row[id_column_idx]] if ids_only else row)

         if len(batch) > 0:
             yield RowsOutputBatch(batch)
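
To show what `ids_only` changes in isolation (the table and column names below are stand-ins, and `partition_id` is not claimed to be the real value of `PARTITION_COLUMN_ID`): each strategy narrows the select to the id column, plus the partition column for `Partition`, and leaves fetching full rows to a later step.

    import sqlalchemy as sa

    tbl = sa.table(
        "rows",
        sa.column("sys__id"),
        sa.column("partition_id"),  # stand-in for PARTITION_COLUMN_ID
        sa.column("payload"),
    )
    query = sa.select(tbl.c.sys__id, tbl.c.partition_id, tbl.c.payload)

    ids_query = query.with_only_columns(tbl.c.sys__id)
    print([c.name for c in ids_query.selected_columns])  # ['sys__id']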