datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as published.
Files changed (137)
  1. datachain/__init__.py +20 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +7 -7
  4. datachain/catalog/__init__.py +2 -2
  5. datachain/catalog/catalog.py +621 -507
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +28 -18
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +24 -33
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +83 -52
  12. datachain/cli/commands/ls.py +17 -17
  13. datachain/cli/commands/show.py +4 -4
  14. datachain/cli/parser/__init__.py +8 -74
  15. datachain/cli/parser/job.py +95 -3
  16. datachain/cli/parser/studio.py +11 -4
  17. datachain/cli/parser/utils.py +1 -2
  18. datachain/cli/utils.py +2 -15
  19. datachain/client/azure.py +4 -4
  20. datachain/client/fsspec.py +45 -28
  21. datachain/client/gcs.py +6 -6
  22. datachain/client/hf.py +29 -2
  23. datachain/client/http.py +157 -0
  24. datachain/client/local.py +15 -11
  25. datachain/client/s3.py +17 -9
  26. datachain/config.py +4 -8
  27. datachain/data_storage/db_engine.py +12 -6
  28. datachain/data_storage/job.py +5 -1
  29. datachain/data_storage/metastore.py +1252 -186
  30. datachain/data_storage/schema.py +58 -45
  31. datachain/data_storage/serializer.py +105 -15
  32. datachain/data_storage/sqlite.py +286 -127
  33. datachain/data_storage/warehouse.py +250 -113
  34. datachain/dataset.py +353 -148
  35. datachain/delta.py +391 -0
  36. datachain/diff/__init__.py +27 -29
  37. datachain/error.py +60 -0
  38. datachain/func/__init__.py +2 -1
  39. datachain/func/aggregate.py +66 -42
  40. datachain/func/array.py +242 -38
  41. datachain/func/base.py +7 -4
  42. datachain/func/conditional.py +110 -60
  43. datachain/func/func.py +96 -45
  44. datachain/func/numeric.py +55 -38
  45. datachain/func/path.py +32 -20
  46. datachain/func/random.py +2 -2
  47. datachain/func/string.py +67 -37
  48. datachain/func/window.py +7 -8
  49. datachain/hash_utils.py +123 -0
  50. datachain/job.py +11 -7
  51. datachain/json.py +138 -0
  52. datachain/lib/arrow.py +58 -22
  53. datachain/lib/audio.py +245 -0
  54. datachain/lib/clip.py +14 -13
  55. datachain/lib/convert/flatten.py +5 -3
  56. datachain/lib/convert/python_to_sql.py +6 -10
  57. datachain/lib/convert/sql_to_python.py +8 -0
  58. datachain/lib/convert/values_to_tuples.py +156 -51
  59. datachain/lib/data_model.py +42 -20
  60. datachain/lib/dataset_info.py +36 -8
  61. datachain/lib/dc/__init__.py +8 -2
  62. datachain/lib/dc/csv.py +25 -28
  63. datachain/lib/dc/database.py +398 -0
  64. datachain/lib/dc/datachain.py +1289 -425
  65. datachain/lib/dc/datasets.py +320 -38
  66. datachain/lib/dc/hf.py +38 -24
  67. datachain/lib/dc/json.py +29 -32
  68. datachain/lib/dc/listings.py +112 -8
  69. datachain/lib/dc/pandas.py +16 -12
  70. datachain/lib/dc/parquet.py +35 -23
  71. datachain/lib/dc/records.py +31 -23
  72. datachain/lib/dc/storage.py +154 -64
  73. datachain/lib/dc/storage_pattern.py +251 -0
  74. datachain/lib/dc/utils.py +24 -16
  75. datachain/lib/dc/values.py +8 -9
  76. datachain/lib/file.py +622 -89
  77. datachain/lib/hf.py +69 -39
  78. datachain/lib/image.py +14 -14
  79. datachain/lib/listing.py +14 -11
  80. datachain/lib/listing_info.py +1 -2
  81. datachain/lib/meta_formats.py +3 -4
  82. datachain/lib/model_store.py +39 -7
  83. datachain/lib/namespaces.py +125 -0
  84. datachain/lib/projects.py +130 -0
  85. datachain/lib/pytorch.py +32 -21
  86. datachain/lib/settings.py +192 -56
  87. datachain/lib/signal_schema.py +427 -104
  88. datachain/lib/tar.py +1 -2
  89. datachain/lib/text.py +8 -7
  90. datachain/lib/udf.py +164 -76
  91. datachain/lib/udf_signature.py +60 -35
  92. datachain/lib/utils.py +118 -4
  93. datachain/lib/video.py +17 -9
  94. datachain/lib/webdataset.py +61 -56
  95. datachain/lib/webdataset_laion.py +15 -16
  96. datachain/listing.py +22 -10
  97. datachain/model/bbox.py +3 -1
  98. datachain/model/ultralytics/bbox.py +16 -12
  99. datachain/model/ultralytics/pose.py +16 -12
  100. datachain/model/ultralytics/segment.py +16 -12
  101. datachain/namespace.py +84 -0
  102. datachain/node.py +6 -6
  103. datachain/nodes_thread_pool.py +0 -1
  104. datachain/plugins.py +24 -0
  105. datachain/project.py +78 -0
  106. datachain/query/batch.py +40 -41
  107. datachain/query/dataset.py +604 -322
  108. datachain/query/dispatch.py +261 -154
  109. datachain/query/metrics.py +4 -6
  110. datachain/query/params.py +2 -3
  111. datachain/query/queue.py +3 -12
  112. datachain/query/schema.py +11 -6
  113. datachain/query/session.py +200 -33
  114. datachain/query/udf.py +34 -2
  115. datachain/remote/studio.py +171 -69
  116. datachain/script_meta.py +12 -12
  117. datachain/semver.py +68 -0
  118. datachain/sql/__init__.py +2 -0
  119. datachain/sql/functions/array.py +33 -1
  120. datachain/sql/postgresql_dialect.py +9 -0
  121. datachain/sql/postgresql_types.py +21 -0
  122. datachain/sql/sqlite/__init__.py +5 -1
  123. datachain/sql/sqlite/base.py +102 -29
  124. datachain/sql/sqlite/types.py +8 -13
  125. datachain/sql/types.py +70 -15
  126. datachain/studio.py +223 -46
  127. datachain/toolkit/split.py +31 -10
  128. datachain/utils.py +101 -59
  129. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
  130. datachain-0.39.0.dist-info/RECORD +173 -0
  131. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
  132. datachain/cli/commands/query.py +0 -53
  133. datachain/query/utils.py +0 -42
  134. datachain-0.14.2.dist-info/RECORD +0 -158
  135. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  136. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  137. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/query/dispatch.py CHANGED
@@ -1,23 +1,24 @@
 import contextlib
+import traceback
 from collections.abc import Iterable, Sequence
 from itertools import chain
 from multiprocessing import cpu_count
+from queue import Empty
 from sys import stdin
-from threading import Timer
-from typing import TYPE_CHECKING, Optional
+from time import monotonic, sleep
+from typing import TYPE_CHECKING, Literal

-import attrs
 import multiprocess
 from cloudpickle import load, loads
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
-from multiprocess import get_context
-from sqlalchemy.sql import func
+from multiprocess.context import Process
+from multiprocess.queues import Queue as MultiprocessQueue

 from datachain.catalog import Catalog
 from datachain.catalog.catalog import clone_catalog_with_cache
-from datachain.catalog.loader import get_distributed_class
-from datachain.lib.udf import _get_cache
-from datachain.query.batch import RowsOutput, RowsOutputBatch
+from datachain.catalog.loader import DISTRIBUTED_IMPORT_PATH, get_udf_distributor_class
+from datachain.lib.model_store import ModelStore
+from datachain.lib.udf import UdfRunError, _get_cache
 from datachain.query.dataset import (
     get_download_callback,
     get_generated_callback,
@@ -26,7 +27,6 @@ from datachain.query.dataset import (
 )
 from datachain.query.queue import get_from_queue, put_into_queue
 from datachain.query.udf import UdfInfo
-from datachain.query.utils import get_query_id_column
 from datachain.utils import batched, flatten, safe_closing

 if TYPE_CHECKING:
@@ -34,6 +34,7 @@ if TYPE_CHECKING:

     from datachain.data_storage import AbstractMetastore, AbstractWarehouse
     from datachain.lib.udf import UDFAdapter
+    from datachain.query.batch import RowsOutput

 DEFAULT_BATCH_SIZE = 10000
 STOP_SIGNAL = "STOP"
@@ -43,7 +44,7 @@ FAILED_STATUS = "FAILED"
 NOTIFY_STATUS = "NOTIFY"


-def get_n_workers_from_arg(n_workers: Optional[int] = None) -> int:
+def get_n_workers_from_arg(n_workers: int | None = None) -> int:
     if not n_workers:
         return cpu_count()
     if n_workers < 1:
@@ -52,55 +53,60 @@ def get_n_workers_from_arg(n_workers: Optional[int] = None) -> int:


 def udf_entrypoint() -> int:
+    """Parallel processing (faster for more CPU-heavy UDFs)."""
     # Load UDF info from stdin
     udf_info: UdfInfo = load(stdin.buffer)

-    # Parallel processing (faster for more CPU-heavy UDFs)
-    dispatch = UDFDispatcher(udf_info)
-
     query = udf_info["query"]
+    if "sys__id" not in query.selected_columns:
+        raise RuntimeError("sys__id column is required in UDF query")
+
     batching = udf_info["batching"]
-    n_workers = udf_info["processes"]
-    if n_workers is True:
-        n_workers = None  # Use default number of CPUs (cores)
+    is_generator = udf_info["is_generator"]
+
+    download_cb = get_download_callback()
+    processed_cb = get_processed_callback()
+    generated_cb = get_generated_callback(is_generator)

     wh_cls, wh_args, wh_kwargs = udf_info["warehouse_clone_params"]
     warehouse: AbstractWarehouse = wh_cls(*wh_args, **wh_kwargs)

-    total_rows = next(
-        warehouse.db.execute(
-            query.with_only_columns(func.count(query.c.sys__id)).order_by(None)
-        )
-    )[0]
-
     with contextlib.closing(
-        batching(warehouse.dataset_select_paginated, query, ids_only=True)
+        batching(
+            warehouse.dataset_select_paginated,
+            query,
+            id_col=query.selected_columns.sys__id,
+        )
     ) as udf_inputs:
-        download_cb = get_download_callback()
-        processed_cb = get_processed_callback()
         try:
-            dispatch.run_udf_parallel(
+            UDFDispatcher(udf_info).run_udf(
                 udf_inputs,
-                total_rows=total_rows,
-                n_workers=n_workers,
-                processed_cb=processed_cb,
                 download_cb=download_cb,
+                processed_cb=processed_cb,
+                generated_cb=generated_cb,
             )
         finally:
             download_cb.close()
             processed_cb.close()
+            generated_cb.close()

     return 0


 def udf_worker_entrypoint() -> int:
-    return get_distributed_class().run_worker()
+    if not (udf_distributor_class := get_udf_distributor_class()):
+        raise RuntimeError(
+            f"{DISTRIBUTED_IMPORT_PATH} import path is required "
+            "for distributed UDF processing."
+        )
+
+    return udf_distributor_class.run_udf()


 class UDFDispatcher:
-    catalog: Optional[Catalog] = None
-    task_queue: Optional[multiprocess.Queue] = None
-    done_queue: Optional[multiprocess.Queue] = None
+    _catalog: Catalog | None = None
+    task_queue: MultiprocessQueue | None = None
+    done_queue: MultiprocessQueue | None = None

     def __init__(self, udf_info: UdfInfo, buffer_size: int = DEFAULT_BATCH_SIZE):
         self.udf_data = udf_info["udf_data"]
@@ -113,30 +119,38 @@ class UDFDispatcher:
         self.cache = udf_info["cache"]
         self.is_generator = udf_info["is_generator"]
         self.is_batching = udf_info["batching"].is_batching
+        self.processes = udf_info["processes"]
+        self.rows_total = udf_info["rows_total"]
+        self.batch_size = udf_info["batch_size"]
         self.buffer_size = buffer_size
-        self.catalog = None
         self.task_queue = None
         self.done_queue = None
-        self.ctx = get_context("spawn")
+        self.ctx = multiprocess.get_context("spawn")

-    def _create_worker(self) -> "UDFWorker":
-        if not self.catalog:
+    @property
+    def catalog(self) -> "Catalog":
+        if not self._catalog:
             ms_cls, ms_args, ms_kwargs = self.metastore_clone_params
             metastore: AbstractMetastore = ms_cls(*ms_args, **ms_kwargs)
             ws_cls, ws_args, ws_kwargs = self.warehouse_clone_params
             warehouse: AbstractWarehouse = ws_cls(*ws_args, **ws_kwargs)
-            self.catalog = Catalog(metastore, warehouse, **self.catalog_init_params)
-        self.udf = loads(self.udf_data)
+            self._catalog = Catalog(metastore, warehouse, **self.catalog_init_params)
+        return self._catalog
+
+    def _create_worker(self) -> "UDFWorker":
+        udf: UDFAdapter = loads(self.udf_data)
+        # Ensure all registered DataModels have rebuilt schemas in worker processes.
+        ModelStore.rebuild_all()
         return UDFWorker(
             self.catalog,
-            self.udf,
+            udf,
             self.task_queue,
             self.done_queue,
             self.query,
             self.table,
-            self.is_generator,
-            self.is_batching,
             self.cache,
+            self.is_batching,
+            self.batch_size,
             self.udf_fields,
         )

@@ -146,45 +160,109 @@ class UDFDispatcher:
             worker.run()
         except (Exception, KeyboardInterrupt) as e:
             if self.done_queue:
+                # We put the exception into the done queue so the main process
+                # can handle it appropriately. We include the stacktrace to propagate
+                # it to the main process and show it to the user.
                 put_into_queue(
                     self.done_queue,
-                    {"status": FAILED_STATUS, "exception": e},
+                    {
+                        "status": FAILED_STATUS,
+                        "exception": e,
+                        "stacktrace": traceback.format_exc(),
+                    },
                 )
+            if isinstance(e, KeyboardInterrupt):
+                return
             raise

-    @staticmethod
-    def send_stop_signal_to_workers(task_queue, n_workers: Optional[int] = None):
+    def run_udf(
+        self,
+        input_rows: Iterable["RowsOutput"],
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+        generated_cb: Callback = DEFAULT_CALLBACK,
+    ) -> None:
+        n_workers = self.processes
+        if n_workers is True:
+            n_workers = None  # Use default number of CPUs (cores)
+        elif not n_workers or n_workers < 1:
+            n_workers = 1  # Single-threaded (on this worker)
         n_workers = get_n_workers_from_arg(n_workers)
-        for _ in range(n_workers):
-            put_into_queue(task_queue, STOP_SIGNAL)

-    def create_input_queue(self):
-        return self.ctx.Queue()
+        if n_workers == 1:
+            # no need to spawn worker processes if we are running in a single process
+            self.run_udf_single(input_rows, download_cb, processed_cb, generated_cb)
+        else:
+            if self.buffer_size < n_workers:
+                raise RuntimeError(
+                    "Parallel run error: buffer size is smaller than "
+                    f"number of workers: {self.buffer_size} < {n_workers}"
+                )

-    def run_udf_parallel(  # noqa: C901, PLR0912
+            self.run_udf_parallel(
+                n_workers, input_rows, download_cb, processed_cb, generated_cb
+            )
+
+    def run_udf_single(
         self,
-        input_rows: Iterable[RowsOutput],
-        total_rows: int,
-        n_workers: Optional[int] = None,
-        processed_cb: Callback = DEFAULT_CALLBACK,
+        input_rows: Iterable["RowsOutput"],
         download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+        generated_cb: Callback = DEFAULT_CALLBACK,
     ) -> None:
-        n_workers = get_n_workers_from_arg(n_workers)
+        udf: UDFAdapter = loads(self.udf_data)
+        # Rebuild schemas in single process too for consistency (cheap, idempotent).
+        ModelStore.rebuild_all()
+
+        if not self.is_batching:
+            input_rows = flatten(input_rows)
+
+        def get_inputs() -> Iterable["RowsOutput"]:
+            warehouse = self.catalog.warehouse.clone()
+            for ids in batched(input_rows, DEFAULT_BATCH_SIZE):
+                yield from warehouse.dataset_rows_select_from_ids(
+                    self.query, ids, self.is_batching
+                )

-        input_batch_size = total_rows // n_workers
+        prefetch = udf.prefetch
+        with _get_cache(self.catalog.cache, prefetch, use_cache=self.cache) as _cache:
+            udf_results = udf.run(
+                self.udf_fields,
+                get_inputs(),
+                self.catalog,
+                self.cache,
+                download_cb=download_cb,
+                processed_cb=processed_cb,
+            )
+            with safe_closing(udf_results):
+                process_udf_outputs(
+                    self.catalog.warehouse.clone(),
+                    self.table,
+                    udf_results,
+                    udf,
+                    cb=generated_cb,
+                    batch_size=self.batch_size,
+                )
+
+    def input_batch_size(self, n_workers: int) -> int:
+        input_batch_size = self.rows_total // n_workers
         if input_batch_size == 0:
             input_batch_size = 1
         elif input_batch_size > DEFAULT_BATCH_SIZE:
             input_batch_size = DEFAULT_BATCH_SIZE
+        return input_batch_size

-        if self.buffer_size < n_workers:
-            raise RuntimeError(
-                "Parallel run error: buffer size is smaller than "
-                f"number of workers: {self.buffer_size} < {n_workers}"
-            )
-
+    def run_udf_parallel(  # noqa: C901, PLR0912
+        self,
+        n_workers: int,
+        input_rows: Iterable["RowsOutput"],
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+        generated_cb: Callback = DEFAULT_CALLBACK,
+    ) -> None:
         self.task_queue = self.ctx.Queue()
         self.done_queue = self.ctx.Queue()
+
         pool = [
             self.ctx.Process(name=f"Worker-UDF-{i}", target=self._run_worker)
             for i in range(n_workers)
@@ -192,14 +270,14 @@ class UDFDispatcher:
         for p in pool:
             p.start()

-        # Will be set to True if all tasks complete normally
-        normal_completion = False
         try:
             # Will be set to True when the input is exhausted
             input_finished = False

-            if not self.is_batching:
-                input_rows = batched(flatten(input_rows), input_batch_size)
+            input_rows = batched(
+                input_rows if self.is_batching else flatten(input_rows),
+                self.input_batch_size(n_workers),
+            )

             # Stop all workers after the input rows have finished processing
             input_data = chain(input_rows, [STOP_SIGNAL] * n_workers)
@@ -214,12 +292,29 @@

             # Process all tasks
             while n_workers > 0:
-                result = get_from_queue(self.done_queue)
-
+                while True:
+                    try:
+                        result = self.done_queue.get_nowait()
+                        break
+                    except Empty:
+                        for p in pool:
+                            exitcode = p.exitcode
+                            if exitcode not in (None, 0):
+                                message = (
+                                    f"Worker {p.name} exited unexpectedly with "
+                                    f"code {exitcode}"
+                                )
+                                raise RuntimeError(message) from None
+                        sleep(0.01)
+
+                if bytes_downloaded := result.get("bytes_downloaded"):
+                    download_cb.relative_update(bytes_downloaded)
                 if downloaded := result.get("downloaded"):
-                    download_cb.relative_update(downloaded)
+                    download_cb.increment_file_count(downloaded)
                 if processed := result.get("processed"):
                     processed_cb.relative_update(processed)
+                if generated := result.get("generated"):
+                    generated_cb.relative_update(generated)

                 status = result["status"]
                 if status in (OK_STATUS, NOTIFY_STATUS):
@@ -229,7 +324,9 @@
                 else:  # Failed / error
                     n_workers -= 1
                     if exc := result.get("exception"):
-                        raise exc
+                        if isinstance(exc, KeyboardInterrupt):
+                            raise exc
+                        raise UdfRunError(exc, stacktrace=result.get("stacktrace"))
                     raise RuntimeError("Internal error: Parallel UDF execution failed")

                 if status == OK_STATUS and not input_finished:
@@ -237,75 +334,104 @@
                         put_into_queue(self.task_queue, next(input_data))
                     except StopIteration:
                         input_finished = True
-
-            # Finished with all tasks normally
-            normal_completion = True
         finally:
-            if not normal_completion:
-                # Stop all workers if there is an unexpected exception
-                for _ in pool:
-                    put_into_queue(self.task_queue, STOP_SIGNAL)
-                self.task_queue.close()
-
-                # This allows workers (and this process) to exit without
-                # consuming any remaining data in the queues.
-                # (If they exit due to an exception.)
-                self.task_queue.cancel_join_thread()
-                self.done_queue.cancel_join_thread()
-
-                # Flush all items from the done queue.
-                # This is needed if any workers are still running.
-                while n_workers > 0:
-                    result = get_from_queue(self.done_queue)
-                    status = result["status"]
-                    if status != OK_STATUS:
-                        n_workers -= 1
-
-            # Wait for workers to stop
-            for p in pool:
-                p.join()
-
-
-class WorkerCallback(Callback):
-    def __init__(self, queue: "multiprocess.Queue"):
+            self._shutdown_workers(pool)
+
+    def _shutdown_workers(self, pool: list[Process]) -> None:
+        self._terminate_pool(pool)
+        self._drain_queue(self.done_queue)
+        self._drain_queue(self.task_queue)
+        self._close_queue(self.done_queue)
+        self._close_queue(self.task_queue)
+
+    def _terminate_pool(self, pool: list[Process]) -> None:
+        for proc in pool:
+            if proc.is_alive():
+                proc.terminate()
+
+        deadline = monotonic() + 1.0
+        for proc in pool:
+            if not proc.is_alive():
+                continue
+            remaining = deadline - monotonic()
+            if remaining > 0:
+                proc.join(remaining)
+            if proc.is_alive():
+                proc.kill()
+                proc.join(timeout=0.2)
+
+    def _drain_queue(self, queue: MultiprocessQueue) -> None:
+        while True:
+            try:
+                queue.get_nowait()
+            except Empty:
+                return
+            except (OSError, ValueError):
+                return
+
+    def _close_queue(self, queue: MultiprocessQueue) -> None:
+        with contextlib.suppress(OSError, ValueError):
+            queue.close()
+        with contextlib.suppress(RuntimeError, AssertionError, ValueError):
+            queue.join_thread()
+
+
+class DownloadCallback(Callback):
+    def __init__(self, queue: MultiprocessQueue) -> None:
         self.queue = queue
         super().__init__()

     def relative_update(self, inc: int = 1) -> None:
+        put_into_queue(self.queue, {"status": NOTIFY_STATUS, "bytes_downloaded": inc})
+
+    def increment_file_count(self, inc: int = 1) -> None:
         put_into_queue(self.queue, {"status": NOTIFY_STATUS, "downloaded": inc})


 class ProcessedCallback(Callback):
-    def __init__(self):
-        self.processed_rows: Optional[int] = None
+    def __init__(
+        self,
+        name: Literal["processed", "generated"],
+        queue: MultiprocessQueue,
+    ) -> None:
+        self.name = name
+        self.queue = queue
         super().__init__()

     def relative_update(self, inc: int = 1) -> None:
-        self.processed_rows = inc
+        put_into_queue(self.queue, {"status": NOTIFY_STATUS, self.name: inc})


-@attrs.define
 class UDFWorker:
-    catalog: "Catalog"
-    udf: "UDFAdapter"
-    task_queue: "multiprocess.Queue"
-    done_queue: "multiprocess.Queue"
-    query: "Select"
-    table: "Table"
-    is_generator: bool
-    is_batching: bool
-    cache: bool
-    udf_fields: Sequence[str]
-    cb: Callback = attrs.field()
-
-    @cb.default
-    def _default_callback(self) -> WorkerCallback:
-        return WorkerCallback(self.done_queue)
+    def __init__(
+        self,
+        catalog: "Catalog",
+        udf: "UDFAdapter",
+        task_queue: MultiprocessQueue,
+        done_queue: MultiprocessQueue,
+        query: "Select",
+        table: "Table",
+        cache: bool,
+        is_batching: bool,
+        batch_size: int,
+        udf_fields: Sequence[str],
+    ) -> None:
+        self.catalog = catalog
+        self.udf = udf
+        self.task_queue = task_queue
+        self.done_queue = done_queue
+        self.query = query
+        self.table = table
+        self.cache = cache
+        self.is_batching = is_batching
+        self.batch_size = batch_size
+        self.udf_fields = udf_fields
+
+        self.download_cb = DownloadCallback(self.done_queue)
+        self.processed_cb = ProcessedCallback("processed", self.done_queue)
+        self.generated_cb = ProcessedCallback("generated", self.done_queue)

     def run(self) -> None:
-        processed_cb = ProcessedCallback()
-        generated_cb = get_generated_callback(self.is_generator)
-
         prefetch = self.udf.prefetch
         with _get_cache(self.catalog.cache, prefetch, use_cache=self.cache) as _cache:
             catalog = clone_catalog_with_cache(self.catalog, _cache)
@@ -314,48 +440,29 @@ class UDFWorker:
                 self.get_inputs(),
                 catalog,
                 self.cache,
-                download_cb=self.cb,
-                processed_cb=processed_cb,
+                download_cb=self.download_cb,
+                processed_cb=self.processed_cb,
             )
             with safe_closing(udf_results):
                 process_udf_outputs(
                     catalog.warehouse,
                     self.table,
-                    self.notify_and_process(udf_results, processed_cb),
+                    self.notify_and_process(udf_results),
                     self.udf,
-                    cb=generated_cb,
+                    cb=self.generated_cb,
+                    batch_size=self.batch_size,
                 )
+        put_into_queue(self.done_queue, {"status": FINISHED_STATUS})

-        put_into_queue(
-            self.done_queue,
-            {"status": FINISHED_STATUS, "processed": processed_cb.processed_rows},
-        )
-
-    def notify_and_process(self, udf_results, processed_cb):
+    def notify_and_process(self, udf_results):
         for row in udf_results:
-            put_into_queue(
-                self.done_queue,
-                {"status": OK_STATUS, "processed": processed_cb.processed_rows},
-            )
+            put_into_queue(self.done_queue, {"status": OK_STATUS})
             yield row

-    def get_inputs(self):
+    def get_inputs(self) -> Iterable["RowsOutput"]:
         warehouse = self.catalog.warehouse.clone()
-        col_id = get_query_id_column(self.query)
-
-        if self.is_batching:
-            while (batch := get_from_queue(self.task_queue)) != STOP_SIGNAL:
-                ids = [row[0] for row in batch.rows]
-                rows = warehouse.dataset_rows_select(self.query.where(col_id.in_(ids)))
-                yield RowsOutputBatch(list(rows))
-        else:
-            while (batch := get_from_queue(self.task_queue)) != STOP_SIGNAL:
-                yield from warehouse.dataset_rows_select(
-                    self.query.where(col_id.in_(batch))
+        while (batch := get_from_queue(self.task_queue)) != STOP_SIGNAL:
+            for ids in batched(batch, DEFAULT_BATCH_SIZE):
+                yield from warehouse.dataset_rows_select_from_ids(
+                    self.query, ids, self.is_batching
                 )
-
-
-class RepeatTimer(Timer):
-    def run(self):
-        while not self.finished.wait(self.interval):
-            self.function(*self.args, **self.kwargs)
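Note on the new dispatch loop above: run_udf_parallel no longer blocks indefinitely on done_queue.get(); it polls the queue with get_nowait() and checks worker exit codes between attempts, so a worker that dies without reporting back surfaces as an error instead of a hang. A minimal, self-contained sketch of that pattern (illustrative only, not datachain code; it uses the stdlib multiprocessing in place of the multiprocess package, and wait_for_result is a hypothetical helper):

from multiprocessing import Process
from queue import Empty
from time import sleep


def wait_for_result(done_queue, pool: list[Process]) -> dict:
    # Poll the results queue; if any worker died with a non-zero exit code,
    # raise instead of waiting forever for a message that will never arrive.
    while True:
        try:
            return done_queue.get_nowait()
        except Empty:
            for p in pool:
                if p.exitcode not in (None, 0):
                    raise RuntimeError(
                        f"Worker {p.name} exited unexpectedly with code {p.exitcode}"
                    )
            sleep(0.01)  # brief pause to avoid a busy loop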
datachain/query/metrics.py CHANGED
@@ -1,10 +1,9 @@
 import os
-from typing import Optional, Union

-metrics: dict[str, Union[str, int, float, bool, None]] = {}
+metrics: dict[str, str | int | float | bool | None] = {}


-def set(key: str, value: Union[str, int, float, bool, None]) -> None:  # noqa: PYI041
+def set(key: str, value: str | int | float | bool | None) -> None:  # noqa: PYI041
     """Set a metric value."""
     if not isinstance(key, str):
         raise TypeError("Key must be a string")
@@ -15,13 +14,12 @@ def set(key: str, value: Union[str, int, float, bool, None]) -> None:  # noqa: P
     metrics[key] = value

     if job_id := os.getenv("DATACHAIN_JOB_ID"):
-        from datachain.data_storage.job import JobStatus
         from datachain.query.session import Session

         metastore = Session.get().catalog.metastore
-        metastore.set_job_status(job_id, JobStatus.RUNNING, metrics=metrics)
+        metastore.update_job(job_id, metrics=metrics)


-def get(key: str) -> Optional[Union[str, int, float, bool]]:
+def get(key: str) -> str | int | float | bool | None:
     """Get a metric value."""
     return metrics[key]
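The metrics helpers keep the same public surface (set()/get()); only the typing style and the metastore call change. A small usage sketch (an assumption for illustration: the module is imported from the path shown above, and the metric name is invented):

from datachain.query import metrics

# Stored in the in-memory dict; when DATACHAIN_JOB_ID is set, the set() call
# above also persists the metrics to the job record via metastore.update_job.
metrics.set("rows_processed", 1000)
assert metrics.get("rows_processed") == 1000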
datachain/query/params.py CHANGED
@@ -1,11 +1,10 @@
 import json
 import os
-from typing import Optional

-params_cache: Optional[dict[str, str]] = None
+params_cache: dict[str, str] | None = None


-def param(key: str, default: Optional[str] = None) -> Optional[str]:
+def param(key: str, default: str | None = None) -> str | None:
     """Get query parameter."""
     if not isinstance(key, str):
         raise TypeError("Param key must be a string")
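Likewise, param() keeps its behavior and only modernizes the annotations. An illustrative sketch (the import path follows the file location above; the parameter name "split" is invented for the example):

from datachain.query.params import param

# Returns the value supplied to the job for "split", or the default when unset.
split = param("split", default="train")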