datachain 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff shows the changes between publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Potentially problematic release: this version of datachain might be problematic.
- datachain/catalog/catalog.py +1 -1
- datachain/client/fsspec.py +1 -4
- datachain/client/local.py +2 -7
- datachain/data_storage/warehouse.py +8 -14
- datachain/lib/dc.py +1 -1
- datachain/lib/udf.py +21 -14
- datachain/query/batch.py +45 -41
- datachain/query/dataset.py +13 -6
- datachain/query/dispatch.py +53 -68
- datachain/query/queue.py +120 -0
- datachain/query/udf.py +23 -8
- datachain/utils.py +17 -2
- {datachain-0.3.0.dist-info → datachain-0.3.1.dist-info}/METADATA +1 -1
- {datachain-0.3.0.dist-info → datachain-0.3.1.dist-info}/RECORD +18 -17
- {datachain-0.3.0.dist-info → datachain-0.3.1.dist-info}/LICENSE +0 -0
- {datachain-0.3.0.dist-info → datachain-0.3.1.dist-info}/WHEEL +0 -0
- {datachain-0.3.0.dist-info → datachain-0.3.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.0.dist-info → datachain-0.3.1.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -676,7 +676,7 @@ class Catalog:
 
     def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
         config = config or self.client_config
-        return Client.parse_url(uri, self.
+        return Client.parse_url(uri, self.cache, **config)
 
     def get_client(self, uri: StorageURI, **config: Any) -> Client:
         """
datachain/client/fsspec.py
CHANGED
@@ -37,7 +37,6 @@ from datachain.storage import StorageURI
 if TYPE_CHECKING:
     from fsspec.spec import AbstractFileSystem
 
-    from datachain.data_storage import AbstractMetastore
 
 logger = logging.getLogger("datachain")
 
@@ -116,13 +115,12 @@ class Client(ABC):
     @staticmethod
     def parse_url(
         source: str,
-        metastore: "AbstractMetastore",
         cache: DataChainCache,
         **kwargs,
     ) -> tuple["Client", str]:
         cls = Client.get_implementation(source)
         storage_url, rel_path = cls.split_url(source)
-        client = cls.from_name(storage_url,
+        client = cls.from_name(storage_url, cache, kwargs)
         return client, rel_path
 
     @classmethod
@@ -136,7 +134,6 @@ class Client(ABC):
     def from_name(
         cls,
         name: str,
-        metastore: "AbstractMetastore",
        cache: DataChainCache,
        kwargs: dict[str, Any],
    ) -> "Client":
datachain/client/local.py
CHANGED
@@ -2,7 +2,7 @@ import os
 import posixpath
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import
+from typing import Any
 from urllib.parse import urlparse
 
 from fsspec.implementations.local import LocalFileSystem
@@ -12,9 +12,6 @@ from datachain.storage import StorageURI
 
 from .fsspec import Client
 
-if TYPE_CHECKING:
-    from datachain.data_storage import AbstractMetastore
-
 
 class FileClient(Client):
     FS_CLASS = LocalFileSystem
@@ -97,9 +94,7 @@ class FileClient(Client):
         return cls.root_dir(), uri.removeprefix(cls.root_path().as_uri())
 
     @classmethod
-    def from_name(
-        cls, name: str, metastore: "AbstractMetastore", cache, kwargs
-    ) -> "FileClient":
+    def from_name(cls, name: str, cache, kwargs) -> "FileClient":
         use_symlinks = kwargs.pop("use_symlinks", False)
         return cls(name, kwargs, cache, use_symlinks=use_symlinks)
 
datachain/data_storage/warehouse.py
CHANGED
@@ -17,7 +17,7 @@ from sqlalchemy.sql.expression import true
 
 from datachain.client import Client
 from datachain.data_storage.serializer import Serializable
-from datachain.dataset import DatasetRecord
+from datachain.dataset import DatasetRecord
 from datachain.node import DirType, DirTypeGroup, Entry, Node, NodeWithPath, get_path
 from datachain.sql.functions import path as pathfunc
 from datachain.sql.types import Int, SQLType
@@ -201,23 +201,17 @@ class AbstractWarehouse(ABC, Serializable):
     def dataset_select_paginated(
         self,
         query,
-        limit: Optional[int] = None,
-        order_by: tuple["ColumnElement[Any]", ...] = (),
         page_size: int = SELECT_BATCH_SIZE,
-    ) -> Generator[
+    ) -> Generator[Sequence, None, None]:
         """
         This is equivalent to `db.execute`, but for selecting rows in batches
         """
-
-
+        limit = query._limit
+        paginated_query = query.limit(page_size)
 
-        if not
-
-
-        ordering = order_by  # type: ignore[assignment]
-
-        # reset query order by and apply new order by id
-        paginated_query = query.order_by(None).order_by(*ordering).limit(page_size)
+        if not paginated_query._order_by_clauses:
+            # default order by is order by `sys__id`
+            paginated_query = paginated_query.order_by(query.selected_columns.sys__id)
 
         results = None
         offset = 0
@@ -236,7 +230,7 @@ class AbstractWarehouse(ABC, Serializable):
             processed = False
             for row in results:
                 processed = True
-                yield
+                yield row
                 num_yielded += 1
 
             if not processed:
datachain/lib/dc.py
CHANGED
datachain/lib/udf.py
CHANGED
@@ -1,6 +1,5 @@
 import sys
 import traceback
-from collections.abc import Iterable, Iterator
 from typing import TYPE_CHECKING, Callable, Optional
 
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
@@ -14,16 +13,19 @@ from datachain.lib.model_store import ModelStore
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf_signature import UdfSignature
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
-from datachain.query.batch import
+from datachain.query.batch import UDFInputBatch
 from datachain.query.schema import ColumnParameter
 from datachain.query.udf import UDFBase as _UDFBase
-from datachain.query.udf import UDFProperties
+from datachain.query.udf import UDFProperties
 
 if TYPE_CHECKING:
+    from collections.abc import Iterable, Iterator, Sequence
+
     from typing_extensions import Self
 
     from datachain.catalog import Catalog
-    from datachain.query.batch import
+    from datachain.query.batch import RowsOutput, UDFInput
+    from datachain.query.udf import UDFResult
 
 
 class UdfError(DataChainParamsError):
@@ -42,22 +44,27 @@ class UDFAdapter(_UDFBase):
 
     def run(
         self,
-
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[RowsOutput]",
         catalog: "Catalog",
         is_generator: bool,
         cache: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
-    ) -> Iterator[Iterable[
+    ) -> "Iterator[Iterable[UDFResult]]":
         self.inner._catalog = catalog
         if hasattr(self.inner, "setup") and callable(self.inner.setup):
             self.inner.setup()
 
-
-
-
-
-
+        yield from super().run(
+            udf_fields,
+            udf_inputs,
+            catalog,
+            is_generator,
+            cache,
+            download_cb,
+            processed_cb,
+        )
 
         if hasattr(self.inner, "teardown") and callable(self.inner.teardown):
             self.inner.teardown()
@@ -65,12 +72,12 @@ class UDFAdapter(_UDFBase):
     def run_once(
         self,
         catalog: "Catalog",
-        arg: "
+        arg: "UDFInput",
         is_generator: bool = False,
         cache: bool = False,
         cb: Callback = DEFAULT_CALLBACK,
-    ) -> Iterable[UDFResult]:
-        if isinstance(arg,
+    ) -> "Iterable[UDFResult]":
+        if isinstance(arg, UDFInputBatch):
             udf_inputs = [
                 self.bind_parameters(catalog, row, cache=cache, cb=cb)
                 for row in arg.rows
datachain/query/batch.py
CHANGED
@@ -5,21 +5,29 @@ from collections.abc import Generator, Sequence
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Callable, Optional, Union
 
-import sqlalchemy as sa
-
 from datachain.data_storage.schema import PARTITION_COLUMN_ID
 from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
 
 if TYPE_CHECKING:
+    from sqlalchemy import Select
+
     from datachain.dataset import RowDict
 
 
 @dataclass
-class
+class RowsOutputBatch:
+    rows: Sequence[Sequence]
+
+
+RowsOutput = Union[Sequence, RowsOutputBatch]
+
+
+@dataclass
+class UDFInputBatch:
     rows: Sequence["RowDict"]
 
 
-
+UDFInput = Union["RowDict", UDFInputBatch]
 
 
 class BatchingStrategy(ABC):
@@ -28,9 +36,9 @@ class BatchingStrategy(ABC):
     @abstractmethod
     def __call__(
         self,
-        execute: Callable,
-        query:
-    ) -> Generator[
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutput, None, None]:
         """Apply the provided parameters to the UDF."""
 
 
@@ -42,10 +50,10 @@ class NoBatching(BatchingStrategy):
 
     def __call__(
         self,
-        execute: Callable,
-        query:
-    ) -> Generator[
-        return execute(query
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[Sequence, None, None]:
+        return execute(query)
 
 
 class Batch(BatchingStrategy):
@@ -59,31 +67,24 @@ class Batch(BatchingStrategy):
 
     def __call__(
         self,
-        execute: Callable,
-        query:
-    ) -> Generator[
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutputBatch, None, None]:
         # choose page size that is a multiple of the batch size
         page_size = math.ceil(SELECT_BATCH_SIZE / self.count) * self.count
 
         # select rows in batches
-        results: list[
-
-        with contextlib.closing(
-            execute(
-                query,
-                page_size=page_size,
-                limit=query._limit,
-                order_by=query._order_by_clauses,
-            )
-        ) as rows:
+        results: list[Sequence] = []
+
+        with contextlib.closing(execute(query, page_size=page_size)) as rows:
             for row in rows:
                 results.append(row)
                 if len(results) >= self.count:
                     batch, results = results[: self.count], results[self.count :]
-                    yield
+                    yield RowsOutputBatch(batch)
 
         if len(results) > 0:
-            yield
+            yield RowsOutputBatch(results)
 
 
 class Partition(BatchingStrategy):
@@ -95,27 +96,30 @@ class Partition(BatchingStrategy):
 
     def __call__(
        self,
-        execute: Callable,
-        query:
-    ) -> Generator[
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutputBatch, None, None]:
        current_partition: Optional[int] = None
-        batch: list[
-
-
-
-
-
-
-
+        batch: list[Sequence] = []
+
+        query_fields = [str(c.name) for c in query.selected_columns]
+        partition_column_idx = query_fields.index(PARTITION_COLUMN_ID)
+
+        ordered_query = query.order_by(None).order_by(
+            PARTITION_COLUMN_ID,
+            "sys__id",
+            *query._order_by_clauses,
+        )
+
+        with contextlib.closing(execute(ordered_query)) as rows:
             for row in rows:
-                partition = row[
+                partition = row[partition_column_idx]
                 if current_partition != partition:
                     current_partition = partition
                     if len(batch) > 0:
-                        yield
+                        yield RowsOutputBatch(batch)
                     batch = []
                 batch.append(row)
 
         if len(batch) > 0:
-            yield
+            yield RowsOutputBatch(batch)
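A note on the new types introduced above: RowsOutputBatch wraps raw warehouse rows, UDFInputBatch wraps rows that have already been converted to RowDicts, and RowsOutput/UDFInput are the corresponding single-or-batch unions. A minimal sketch (illustrative only, not code from the release) of how a consumer can tell the two output shapes apart:

    from datachain.query.batch import RowsOutput, RowsOutputBatch

    def describe(output: RowsOutput) -> str:
        # Batch and Partition strategies yield RowsOutputBatch wrappers,
        # while NoBatching yields bare row sequences.
        if isinstance(output, RowsOutputBatch):
            return f"batch of {len(output.rows)} rows"
        return "single row"

    print(describe(RowsOutputBatch([(1, "a"), (2, "b")])))  # batch of 2 rows
    print(describe((3, "c")))                               # single row
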
datachain/query/dataset.py
CHANGED
@@ -461,6 +461,8 @@ class UDFStep(Step, ABC):
 
         processes = determine_processes(self.parallel)
 
+        udf_fields = [str(c.name) for c in query.selected_columns]
+
         try:
             if workers:
                 from datachain.catalog.loader import get_distributed_class
@@ -473,6 +475,7 @@
                     query,
                     workers,
                     processes,
+                    udf_fields=udf_fields,
                     is_generator=self.is_generator,
                     use_partitioning=use_partitioning,
                     cache=self.cache,
@@ -489,6 +492,7 @@
                     "warehouse_clone_params": self.catalog.warehouse.clone_params(),
                     "table": udf_table,
                     "query": query,
+                    "udf_fields": udf_fields,
                     "batching": batching,
                     "processes": processes,
                     "is_generator": self.is_generator,
@@ -528,6 +532,7 @@
             generated_cb = get_generated_callback(self.is_generator)
             try:
                 udf_results = udf.run(
+                    udf_fields,
                     udf_inputs,
                     self.catalog,
                     self.is_generator,
@@ -1244,21 +1249,23 @@ class DatasetQuery:
         actual_params = [normalize_param(p) for p in params]
         try:
             query = self.apply_steps().select()
+            query_fields = [str(c.name) for c in query.selected_columns]
 
-            def row_iter() -> Generator[
+            def row_iter() -> Generator[Sequence, None, None]:
                 # warehouse isn't threadsafe, we need to clone() it
                 # in the thread that uses the results
                 with self.catalog.warehouse.clone() as warehouse:
-                    gen = warehouse.dataset_select_paginated(
-                        query, limit=query._limit, order_by=query._order_by_clauses
-                    )
+                    gen = warehouse.dataset_select_paginated(query)
                     with contextlib.closing(gen) as rows:
                         yield from rows
 
-            async def get_params(row:
+            async def get_params(row: Sequence) -> tuple:
+                row_dict = RowDict(zip(query_fields, row))
                 return tuple(
                     [
-                        await p.get_value_async(
+                        await p.get_value_async(
+                            self.catalog, row_dict, mapper, **kwargs
+                        )
                         for p in actual_params
                     ]
                 )
datachain/query/dispatch.py
CHANGED
@@ -2,11 +2,8 @@ import contextlib
 from collections.abc import Iterator, Sequence
 from itertools import chain
 from multiprocessing import cpu_count
-from queue import Empty, Full, Queue
 from sys import stdin
-from
-from types import GeneratorType
-from typing import Any, Optional
+from typing import Optional
 
 import attrs
 import multiprocess
@@ -22,7 +19,16 @@ from datachain.query.dataset import (
     get_processed_callback,
     process_udf_outputs,
 )
+from datachain.query.queue import (
+    get_from_queue,
+    marshal,
+    msgpack_pack,
+    msgpack_unpack,
+    put_into_queue,
+    unmarshal,
+)
 from datachain.query.udf import UDFBase, UDFFactory, UDFResult
+from datachain.utils import batched_it
 
 DEFAULT_BATCH_SIZE = 10000
 STOP_SIGNAL = "STOP"
@@ -44,44 +50,6 @@ def get_n_workers_from_arg(n_workers: Optional[int] = None) -> int:
     return n_workers
 
 
-# For more context on the get_from_queue and put_into_queue functions, see the
-# discussion here:
-# https://github.com/iterative/dvcx/pull/1297#issuecomment-2026308773
-# This problem is not exactly described by, but is also related to these Python issues:
-# https://github.com/python/cpython/issues/66587
-# https://github.com/python/cpython/issues/88628
-# https://github.com/python/cpython/issues/108645
-
-
-def get_from_queue(queue: Queue) -> Any:
-    """
-    Gets an item from a queue.
-    This is required to handle signals, such as KeyboardInterrupt exceptions
-    while waiting for items to be available, although only on certain installations.
-    (See the above comment for more context.)
-    """
-    while True:
-        try:
-            return queue.get_nowait()
-        except Empty:
-            sleep(0.01)
-
-
-def put_into_queue(queue: Queue, item: Any) -> None:
-    """
-    Puts an item into a queue.
-    This is required to handle signals, such as KeyboardInterrupt exceptions
-    while waiting for items to be queued, although only on certain installations.
-    (See the above comment for more context.)
-    """
-    while True:
-        try:
-            queue.put_nowait(item)
-            return
-        except Full:
-            sleep(0.01)
-
-
 def udf_entrypoint() -> int:
     # Load UDF info from stdin
     udf_info = load(stdin.buffer)
@@ -100,8 +68,9 @@ def udf_entrypoint() -> int:
         udf_info["id_generator_clone_params"],
         udf_info["metastore_clone_params"],
         udf_info["warehouse_clone_params"],
-
+        udf_fields=udf_info["udf_fields"],
         cache=udf_info["cache"],
+        is_generator=udf_info.get("is_generator", False),
     )
 
     query = udf_info["query"]
@@ -121,7 +90,7 @@ def udf_entrypoint() -> int:
         generated_cb = get_generated_callback(dispatch.is_generator)
         try:
             udf_results = dispatch.run_udf_parallel(
-                udf_inputs,
+                marshal(udf_inputs),
                 n_workers=n_workers,
                 processed_cb=processed_cb,
                 download_cb=download_cb,
@@ -142,6 +111,9 @@ def udf_worker_entrypoint() -> int:
 
 
 class UDFDispatcher:
+    catalog: Optional[Catalog] = None
+    task_queue: Optional[multiprocess.Queue] = None
+    done_queue: Optional[multiprocess.Queue] = None
     _batch_size: Optional[int] = None
 
     def __init__(
@@ -151,9 +123,10 @@ class UDFDispatcher:
         id_generator_clone_params,
         metastore_clone_params,
         warehouse_clone_params,
-
-
-
+        udf_fields: "Sequence[str]",
+        cache: bool,
+        is_generator: bool = False,
+        buffer_size: int = DEFAULT_BATCH_SIZE,
     ):
         self.udf_data = udf_data
         self.catalog_init_params = catalog_init_params
@@ -172,12 +145,13 @@ class UDFDispatcher:
             self.warehouse_args,
             self.warehouse_kwargs,
         ) = warehouse_clone_params
-        self.
+        self.udf_fields = udf_fields
         self.cache = cache
+        self.is_generator = is_generator
+        self.buffer_size = buffer_size
         self.catalog = None
         self.task_queue = None
         self.done_queue = None
-        self.buffer_size = buffer_size
         self.ctx = get_context("spawn")
 
     @property
@@ -226,6 +200,7 @@ class UDFDispatcher:
             self.done_queue,
             self.is_generator,
             self.cache,
+            self.udf_fields,
         )
 
     def _run_worker(self) -> None:
@@ -233,7 +208,11 @@ class UDFDispatcher:
             worker = self._create_worker()
             worker.run()
         except (Exception, KeyboardInterrupt) as e:
-
+            if self.done_queue:
+                put_into_queue(
+                    self.done_queue,
+                    {"status": FAILED_STATUS, "exception": e},
+                )
             raise
 
     @staticmethod
@@ -249,7 +228,6 @@
         self,
         input_rows,
         n_workers: Optional[int] = None,
-        cache: bool = False,
        input_queue=None,
        processed_cb: Callback = DEFAULT_CALLBACK,
        download_cb: Callback = DEFAULT_CALLBACK,
@@ -299,21 +277,24 @@
             result = get_from_queue(self.done_queue)
             status = result["status"]
             if status == NOTIFY_STATUS:
-
+                if downloaded := result.get("downloaded"):
+                    download_cb.relative_update(downloaded)
+                if processed := result.get("processed"):
+                    processed_cb.relative_update(processed)
             elif status == FINISHED_STATUS:
                 # Worker finished
                 n_workers -= 1
             elif status == OK_STATUS:
-
-
+                if processed := result.get("processed"):
+                    processed_cb.relative_update(processed)
+                yield msgpack_unpack(result["result"])
             else:  # Failed / error
                 n_workers -= 1
-                exc
-                if exc:
+                if exc := result.get("exception"):
                     raise exc
                 raise RuntimeError("Internal error: Parallel UDF execution failed")
 
-            if not streaming_mode and not input_finished:
+            if status == OK_STATUS and not streaming_mode and not input_finished:
                 try:
                     put_into_queue(self.task_queue, next(input_data))
                 except StopIteration:
@@ -348,7 +329,7 @@
 
 
 class WorkerCallback(Callback):
-    def __init__(self, queue: multiprocess.Queue):
+    def __init__(self, queue: "multiprocess.Queue"):
         self.queue = queue
         super().__init__()
 
@@ -369,10 +350,11 @@ class ProcessedCallback(Callback):
 class UDFWorker:
     catalog: Catalog
     udf: UDFBase
-    task_queue: multiprocess.Queue
-    done_queue: multiprocess.Queue
+    task_queue: "multiprocess.Queue"
+    done_queue: "multiprocess.Queue"
     is_generator: bool
     cache: bool
+    udf_fields: Sequence[str]
     cb: Callback = attrs.field()
 
     @cb.default
@@ -382,7 +364,8 @@
     def run(self) -> None:
         processed_cb = ProcessedCallback()
         udf_results = self.udf.run(
-            self.
+            self.udf_fields,
+            unmarshal(self.get_inputs()),
             self.catalog,
             self.is_generator,
             self.cache,
@@ -390,15 +373,17 @@
             processed_cb=processed_cb,
         )
         for udf_output in udf_results:
-
-
+            for batch in batched_it(udf_output, DEFAULT_BATCH_SIZE):
+                put_into_queue(
+                    self.done_queue,
+                    {
+                        "status": OK_STATUS,
+                        "result": msgpack_pack(list(batch)),
+                    },
+                )
         put_into_queue(
             self.done_queue,
-            {
-                "status": OK_STATUS,
-                "result": udf_output,
-                "processed": processed_cb.processed_rows,
-            },
+            {"status": NOTIFY_STATUS, "processed": processed_cb.processed_rows},
         )
         put_into_queue(self.done_queue, {"status": FINISHED_STATUS})
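Taken together, the dispatcher changes above move the worker protocol to msgpack-packed result batches plus a separate progress notification. A rough sketch of the message shapes a UDFWorker now puts on done_queue, as implied by this diff (not a public API):

    from datachain.query.queue import msgpack_pack

    ok_msg = {"status": "OK", "result": msgpack_pack([{"sys__id": 1}])}   # one packed result batch
    notify_msg = {"status": "NOTIFY", "processed": 1000}                  # progress-only update
    finished_msg = {"status": "FINISHED"}                                 # worker is done
    failed_msg = {"status": "FAILED", "exception": RuntimeError("boom")}  # re-raised by the parent

On the parent side, run_udf_parallel unpacks OK results with msgpack_unpack and, per the new "status == OK_STATUS" guard, only refills the task queue after an OK message.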
datachain/query/queue.py
ADDED
@@ -0,0 +1,120 @@
+import datetime
+from collections.abc import Iterable, Iterator
+from queue import Empty, Full, Queue
+from struct import pack, unpack
+from time import sleep
+from typing import Any
+
+import msgpack
+
+from datachain.query.batch import RowsOutput, RowsOutputBatch
+
+DEFAULT_BATCH_SIZE = 10000
+STOP_SIGNAL = "STOP"
+OK_STATUS = "OK"
+FINISHED_STATUS = "FINISHED"
+FAILED_STATUS = "FAILED"
+NOTIFY_STATUS = "NOTIFY"
+
+
+# For more context on the get_from_queue and put_into_queue functions, see the
+# discussion here:
+# https://github.com/iterative/dvcx/pull/1297#issuecomment-2026308773
+# This problem is not exactly described by, but is also related to these Python issues:
+# https://github.com/python/cpython/issues/66587
+# https://github.com/python/cpython/issues/88628
+# https://github.com/python/cpython/issues/108645
+
+
+def get_from_queue(queue: Queue) -> Any:
+    """
+    Gets an item from a queue.
+    This is required to handle signals, such as KeyboardInterrupt exceptions
+    while waiting for items to be available, although only on certain installations.
+    (See the above comment for more context.)
+    """
+    while True:
+        try:
+            return queue.get_nowait()
+        except Empty:
+            sleep(0.01)
+
+
+def put_into_queue(queue: Queue, item: Any) -> None:
+    """
+    Puts an item into a queue.
+    This is required to handle signals, such as KeyboardInterrupt exceptions
+    while waiting for items to be queued, although only on certain installations.
+    (See the above comment for more context.)
+    """
+    while True:
+        try:
+            queue.put_nowait(item)
+            return
+        except Full:
+            sleep(0.01)
+
+
+MSGPACK_EXT_TYPE_DATETIME = 42
+MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH = 43
+
+
+def _msgpack_pack_extended_types(obj: Any) -> msgpack.ExtType:
+    if isinstance(obj, datetime.datetime):
+        # packing date object as 1 or 2 variables, depending if timezone info is present
+        # - timestamp
+        # - [OPTIONAL] timezone offset from utc in seconds if timezone info exists
+        if obj.tzinfo:
+            data = (obj.timestamp(), int(obj.utcoffset().total_seconds()))  # type: ignore # noqa: PGH003
+            return msgpack.ExtType(MSGPACK_EXT_TYPE_DATETIME, pack("!dl", *data))
+        data = (obj.timestamp(),)  # type: ignore # noqa: PGH003
+        return msgpack.ExtType(MSGPACK_EXT_TYPE_DATETIME, pack("!d", *data))
+
+    if isinstance(obj, RowsOutputBatch):
+        return msgpack.ExtType(
+            MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH,
+            msgpack_pack(obj.rows),
+        )
+
+    raise TypeError(f"Unknown type: {obj}")
+
+
+def msgpack_pack(obj: Any) -> bytes:
+    return msgpack.packb(obj, default=_msgpack_pack_extended_types)
+
+
+def _msgpack_unpack_extended_types(code: int, data: bytes) -> Any:
+    if code == MSGPACK_EXT_TYPE_DATETIME:
+        has_timezone = False
+        if len(data) == 8:
+            # we send only timestamp without timezone if data is 8 bytes
+            values = unpack("!d", data)
+        else:
+            has_timezone = True
+            values = unpack("!dl", data)
+
+        timestamp = values[0]
+        tz_info = None
+        if has_timezone:
+            timezone_offset = values[1]
+            tz_info = datetime.timezone(datetime.timedelta(seconds=timezone_offset))
+        return datetime.datetime.fromtimestamp(timestamp, tz=tz_info)
+
+    if code == MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH:
+        return RowsOutputBatch(msgpack_unpack(data))
+
+    return msgpack.ExtType(code, data)
+
+
+def msgpack_unpack(data: bytes) -> Any:
+    return msgpack.unpackb(data, ext_hook=_msgpack_unpack_extended_types)
+
+
+def marshal(obj: Iterator[RowsOutput]) -> Iterable[bytes]:
+    for row in obj:
+        yield msgpack_pack(row)
+
+
+def unmarshal(obj: Iterator[bytes]) -> Iterable[RowsOutput]:
+    for row in obj:
+        yield msgpack_unpack(row)
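Because msgpack has no native aware-datetime type, the new module encodes datetimes as ExtType 42 (a timestamp plus an optional UTC offset in seconds) and row batches as ExtType 43. A small round-trip sketch using the helpers defined above (illustrative usage, not code from the package):

    from datetime import datetime, timedelta, timezone

    from datachain.query.queue import msgpack_pack, msgpack_unpack

    dt = datetime(2024, 7, 1, 12, 30, tzinfo=timezone(timedelta(hours=2)))
    packed = msgpack_pack({"created": dt})  # the datetime travels as ExtType 42
    restored = msgpack_unpack(packed)
    assert restored["created"] == dt        # the timezone offset survives the round trip
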
datachain/query/udf.py
CHANGED
@@ -15,7 +15,14 @@ from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 
 from datachain.dataset import RowDict
 
-from .batch import
+from .batch import (
+    Batch,
+    BatchingStrategy,
+    NoBatching,
+    Partition,
+    RowsOutputBatch,
+    UDFInputBatch,
+)
 from .schema import (
     UDFParameter,
     UDFParamSpec,
@@ -25,7 +32,7 @@ from .schema import (
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
 
-    from .batch import
+    from .batch import RowsOutput, UDFInput
 
 ColumnType = Any
 
@@ -107,7 +114,8 @@ class UDFBase:
 
     def run(
         self,
-
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[RowsOutput]",
         catalog: "Catalog",
         is_generator: bool,
         cache: bool,
@@ -115,15 +123,22 @@
         processed_cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterator[Iterable["UDFResult"]]:
         for batch in udf_inputs:
-
-
+            if isinstance(batch, RowsOutputBatch):
+                n_rows = len(batch.rows)
+                inputs: UDFInput = UDFInputBatch(
+                    [RowDict(zip(udf_fields, row)) for row in batch.rows]
+                )
+            else:
+                n_rows = 1
+                inputs = RowDict(zip(udf_fields, batch))
+            output = self.run_once(catalog, inputs, is_generator, cache, cb=download_cb)
             processed_cb.relative_update(n_rows)
             yield output
 
     def run_once(
         self,
         catalog: "Catalog",
-        arg: "
+        arg: "UDFInput",
         is_generator: bool = False,
         cache: bool = False,
         cb: Callback = DEFAULT_CALLBACK,
@@ -199,12 +214,12 @@ class UDFWrapper(UDFBase):
     def run_once(
         self,
         catalog: "Catalog",
-        arg: "
+        arg: "UDFInput",
         is_generator: bool = False,
         cache: bool = False,
         cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterable[UDFResult]:
-        if isinstance(arg,
+        if isinstance(arg, UDFInputBatch):
             udf_inputs = [
                 self.bind_parameters(catalog, row, cache=cache, cb=cb)
                 for row in arg.rows
datachain/utils.py
CHANGED
@@ -10,7 +10,7 @@ import sys
 import time
 from collections.abc import Iterable, Iterator, Sequence
 from datetime import date, datetime, timezone
-from itertools import islice
+from itertools import chain, islice
 from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
 from uuid import UUID
 
@@ -241,7 +241,7 @@ _T_co = TypeVar("_T_co", covariant=True)
 
 
 def batched(iterable: Iterable[_T_co], n: int) -> Iterator[tuple[_T_co, ...]]:
-    "Batch data into tuples of length n. The last batch may be shorter."
+    """Batch data into tuples of length n. The last batch may be shorter."""
     # Based on: https://docs.python.org/3/library/itertools.html#itertools-recipes
     # batched('ABCDEFG', 3) --> ABC DEF G
     if n < 1:
@@ -251,6 +251,21 @@ def batched(iterable: Iterable[_T_co], n: int) -> Iterator[tuple[_T_co, ...]]:
         yield batch
 
 
+def batched_it(iterable: Iterable[_T_co], n: int) -> Iterator[Iterator[_T_co]]:
+    """Batch data into iterators of length n. The last batch may be shorter."""
+    # batched('ABCDEFG', 3) --> ABC DEF G
+    if n < 1:
+        raise ValueError("Batch size must be at least one")
+    it = iter(iterable)
+    while True:
+        chunk_it = islice(it, n)
+        try:
+            first_el = next(chunk_it)
+        except StopIteration:
+            return
+        yield chain((first_el,), chunk_it)
+
+
 def flatten(items):
     for item in items:
         if isinstance(item, list):
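Unlike the existing batched(), the new batched_it() yields each batch as a lazy iterator rather than a materialized tuple, which is what lets UDFWorker.run stream large UDF outputs to the result queue chunk by chunk. A short usage sketch (assumes datachain 0.3.1 is installed):

    from datachain.utils import batched_it

    for batch in batched_it(range(7), 3):
        print(list(batch))  # [0, 1, 2], then [3, 4, 5], then [6]
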
{datachain-0.3.0.dist-info → datachain-0.3.1.dist-info}/RECORD
CHANGED
@@ -15,18 +15,18 @@ datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A
 datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
-datachain/utils.py,sha256=
+datachain/utils.py,sha256=ROVCLwb37VmFRzgTlSGUDw4eJNgYGiQ4yMX581HfUX8,12988
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=9-7SnMjh5ruH9sdKDo8P5EklX9oC2EHH6bnku6ZqLko,80275
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=3RfDTAI_TszDy9WazHQd3bI3sS2wDFrNXfNqCDewZgE,2214
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=
+datachain/client/fsspec.py,sha256=G4QTm3KPhlaV74T3gLXJ86345_ak8CH38ezn2ET-oLc,13230
 datachain/client/gcs.py,sha256=Mt77W_l8_fK61gLm4mmxNmENuOM0ETwxdiFp4S8d-_w,4105
-datachain/client/local.py,sha256=
+datachain/client/local.py,sha256=SyGnqcrbtSvDK6IJsQa6NxxHwbWaWIP1GLZsQBXg_IA,4939
 datachain/client/s3.py,sha256=GfRZZzNPQPRsYjoef8bbsLbanJPUlCbyGTTK8ojzp8A,6136
 datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
@@ -36,13 +36,13 @@ datachain/data_storage/metastore.py,sha256=nxcY6nwyEmQWMAo33sNGO-FgUFQs2amBGGnZz
 datachain/data_storage/schema.py,sha256=Idi-29fckvZozzvkyz3nTR2FOIajPlSuPdIEO7SMvXM,7863
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=0r6L_a2hdGRoR_gl06v1qWhEFOS_Q31aldHyk07Yx-M,26857
-datachain/data_storage/warehouse.py,sha256=
+datachain/data_storage/warehouse.py,sha256=MXYkUG69UK2wbIFsZFvT7rKzXlnSitDMp3Vzj_IIsnA,33089
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=R8wDUDEa-5hYjI3HW9cqvOYYJpeeah5lbhFIL3gkmcE,4915
 datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
 datachain/lib/data_model.py,sha256=qfTtQNncS5pt9SvXdMEa5kClniaT6XBGBfO7onEz2TI,1632
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
-datachain/lib/dc.py,sha256=
+datachain/lib/dc.py,sha256=e24ecfIcypVkmVBqvr-p06zpwrw7GD20gy1gBJQPT-I,58012
 datachain/lib/file.py,sha256=ZHpdilDPYCob8uqtwUPtBvBNxVvQRq4AC_0IGg5m-G4,12003
 datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
 datachain/lib/meta_formats.py,sha256=jlSYWRUeDMjun_YCsQ2JxyaDJpEpokzHDPmKUAoCXnU,7034
@@ -51,7 +51,7 @@ datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
 datachain/lib/signal_schema.py,sha256=VL9TR0CJ3eRzjIDr-8e-e7cZKuMBbPUZtY2lGAsucc0,15734
 datachain/lib/text.py,sha256=dVe2Ilc_gW2EV0kun0UwegiCkapWcd20cef7CgINWHU,1083
-datachain/lib/udf.py,sha256=
+datachain/lib/udf.py,sha256=n3x6No-7l5LAciPJPWwZbA8WtTnGUU7d0wRL6CyfZh8,11847
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -64,15 +64,16 @@ datachain/lib/convert/sql_to_python.py,sha256=lGnKzSF_tz9Y_5SSKkrIU95QEjpcDzvOxI
 datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
 datachain/lib/convert/values_to_tuples.py,sha256=aVoHWMOUGLAiS6_BBwKJqVIne91VffOW6-dWyNE7oHg,3715
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
-datachain/query/batch.py,sha256
+datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
 datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
-datachain/query/dataset.py,sha256=
-datachain/query/dispatch.py,sha256=
+datachain/query/dataset.py,sha256=sRKY2it_znlzTNOt_OCRe008rHu0TXMnFwvGsnthSO0,60209
+datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
+datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=O3mTM5DRjvRAJCI7O9mR8wOdFJbgI1jIjvtfl5YvjI4,7755
 datachain/query/session.py,sha256=qTzkXgwMJdJhal3rVt3hdv3x1EXT1IHuXcwkC-Ex0As,4111
-datachain/query/udf.py,sha256=
+datachain/query/udf.py,sha256=j3NhmKK5rYG5TclcM2Sr0LhS1tmYLMjzMugx9G9iFLM,8100
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
 datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
@@ -92,9 +93,9 @@ datachain/sql/sqlite/base.py,sha256=LBYmXqXsVF30fbcnR55evCZHbPDCzMdGk_ogPLps63s,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
+datachain-0.3.1.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.1.dist-info/METADATA,sha256=qR3OMpGUkx0cKelnl51d9uksn5H-Wn4LvTJbUnTMDuQ,17268
+datachain-0.3.1.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+datachain-0.3.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|