datachain 0.16.3__py3-none-any.whl → 0.16.5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in that registry.

Potentially problematic release: this version of datachain might be problematic.

@@ -3,7 +3,6 @@ from collections.abc import Iterable, Sequence
 from itertools import chain
 from multiprocessing import cpu_count
 from sys import stdin
-from threading import Timer
 from typing import TYPE_CHECKING, Literal, Optional

 import multiprocess
@@ -15,7 +14,6 @@ from datachain.catalog import Catalog
 from datachain.catalog.catalog import clone_catalog_with_cache
 from datachain.catalog.loader import DISTRIBUTED_IMPORT_PATH, get_udf_distributor_class
 from datachain.lib.udf import _get_cache
-from datachain.query.batch import RowsOutput, RowsOutputBatch
 from datachain.query.dataset import (
     get_download_callback,
     get_generated_callback,
@@ -32,6 +30,7 @@ if TYPE_CHECKING:

     from datachain.data_storage import AbstractMetastore, AbstractWarehouse
     from datachain.lib.udf import UDFAdapter
+    from datachain.query.batch import RowsOutput

 DEFAULT_BATCH_SIZE = 10000
 STOP_SIGNAL = "STOP"
@@ -50,34 +49,30 @@ def get_n_workers_from_arg(n_workers: Optional[int] = None) -> int:


 def udf_entrypoint() -> int:
+    """Parallel processing (faster for more CPU-heavy UDFs)."""
     # Load UDF info from stdin
     udf_info: UdfInfo = load(stdin.buffer)

-    # Parallel processing (faster for more CPU-heavy UDFs)
-    dispatch = UDFDispatcher(udf_info)
-
     query = udf_info["query"]
-    rows_total = udf_info["rows_total"]
     batching = udf_info["batching"]
     is_generator = udf_info["is_generator"]
-    n_workers = udf_info["processes"]
-    if n_workers is True:
-        n_workers = None  # Use default number of CPUs (cores)
+
+    download_cb = get_download_callback()
+    processed_cb = get_processed_callback()
+    generated_cb = get_generated_callback(is_generator)

     wh_cls, wh_args, wh_kwargs = udf_info["warehouse_clone_params"]
     warehouse: AbstractWarehouse = wh_cls(*wh_args, **wh_kwargs)

+    id_col = get_query_id_column(query)
+
     with contextlib.closing(
-        batching(warehouse.dataset_select_paginated, query, ids_only=True)
+        batching(warehouse.dataset_select_paginated, query, id_col=id_col)
     ) as udf_inputs:
-        download_cb = get_download_callback()
-        processed_cb = get_processed_callback()
-        generated_cb = get_generated_callback(is_generator)
         try:
-            dispatch.run_udf_parallel(
+            UDFDispatcher(udf_info).run_udf(
                 udf_inputs,
-                rows_total=rows_total,
-                n_workers=n_workers,
+                ids_only=id_col is not None,
                 download_cb=download_cb,
                 processed_cb=processed_cb,
                 generated_cb=generated_cb,
@@ -90,17 +85,18 @@ def udf_entrypoint() -> int:
     return 0


-def udf_worker_entrypoint() -> int:
+def udf_worker_entrypoint(fd: Optional[int] = None) -> int:
     if not (udf_distributor_class := get_udf_distributor_class()):
         raise RuntimeError(
             f"{DISTRIBUTED_IMPORT_PATH} import path is required "
             "for distributed UDF processing."
         )
-    return udf_distributor_class.run_worker()
+
+    return udf_distributor_class.run_udf(fd)


 class UDFDispatcher:
-    catalog: Optional[Catalog] = None
+    _catalog: Optional[Catalog] = None
     task_queue: Optional[multiprocess.Queue] = None
     done_queue: Optional[multiprocess.Queue] = None

@@ -115,77 +111,147 @@ class UDFDispatcher:
         self.cache = udf_info["cache"]
         self.is_generator = udf_info["is_generator"]
         self.is_batching = udf_info["batching"].is_batching
+        self.processes = udf_info["processes"]
+        self.rows_total = udf_info["rows_total"]
         self.buffer_size = buffer_size
-        self.catalog = None
         self.task_queue = None
         self.done_queue = None
         self.ctx = get_context("spawn")

-    def _create_worker(self) -> "UDFWorker":
-        if not self.catalog:
+    @property
+    def catalog(self) -> "Catalog":
+        if not self._catalog:
             ms_cls, ms_args, ms_kwargs = self.metastore_clone_params
             metastore: AbstractMetastore = ms_cls(*ms_args, **ms_kwargs)
             ws_cls, ws_args, ws_kwargs = self.warehouse_clone_params
             warehouse: AbstractWarehouse = ws_cls(*ws_args, **ws_kwargs)
-            self.catalog = Catalog(metastore, warehouse, **self.catalog_init_params)
-        self.udf = loads(self.udf_data)
+            self._catalog = Catalog(metastore, warehouse, **self.catalog_init_params)
+        return self._catalog
+
+    def _create_worker(self) -> "UDFWorker":
+        udf: UDFAdapter = loads(self.udf_data)
         return UDFWorker(
             self.catalog,
-            self.udf,
+            udf,
             self.task_queue,
             self.done_queue,
             self.query,
             self.table,
-            self.is_batching,
             self.cache,
+            self.is_batching,
             self.udf_fields,
         )

-    def _run_worker(self) -> None:
+    def _run_worker(self, ids_only: bool) -> None:
         try:
             worker = self._create_worker()
-            worker.run()
+            worker.run(ids_only)
         except (Exception, KeyboardInterrupt) as e:
             if self.done_queue:
                 put_into_queue(
                     self.done_queue,
                     {"status": FAILED_STATUS, "exception": e},
                 )
+            if isinstance(e, KeyboardInterrupt):
+                return
             raise

-    @staticmethod
-    def send_stop_signal_to_workers(task_queue, n_workers: Optional[int] = None):
+    def run_udf(
+        self,
+        input_rows: Iterable["RowsOutput"],
+        ids_only: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+        generated_cb: Callback = DEFAULT_CALLBACK,
+    ) -> None:
+        n_workers = self.processes
+        if n_workers is True:
+            n_workers = None  # Use default number of CPUs (cores)
+        elif not n_workers or n_workers < 1:
+            n_workers = 1  # Single-threaded (on this worker)
         n_workers = get_n_workers_from_arg(n_workers)
-        for _ in range(n_workers):
-            put_into_queue(task_queue, STOP_SIGNAL)

-    def run_udf_parallel(  # noqa: C901, PLR0912
+        if n_workers == 1:
+            # no need to spawn worker processes if we are running in a single process
+            self.run_udf_single(
+                input_rows, ids_only, download_cb, processed_cb, generated_cb
+            )
+        else:
+            if self.buffer_size < n_workers:
+                raise RuntimeError(
+                    "Parallel run error: buffer size is smaller than "
+                    f"number of workers: {self.buffer_size} < {n_workers}"
+                )
+
+            self.run_udf_parallel(
+                n_workers, input_rows, ids_only, download_cb, processed_cb, generated_cb
+            )
+
+    def run_udf_single(
         self,
-        input_rows: Iterable[RowsOutput],
-        rows_total: int,
-        n_workers: Optional[int] = None,
+        input_rows: Iterable["RowsOutput"],
+        ids_only: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
         generated_cb: Callback = DEFAULT_CALLBACK,
     ) -> None:
-        n_workers = get_n_workers_from_arg(n_workers)
+        udf: UDFAdapter = loads(self.udf_data)
+
+        if ids_only and not self.is_batching:
+            input_rows = flatten(input_rows)
+
+        def get_inputs() -> Iterable["RowsOutput"]:
+            warehouse = self.catalog.warehouse.clone()
+            if ids_only:
+                yield from warehouse.dataset_rows_select_from_ids(
+                    self.query, input_rows, self.is_batching
+                )
+            else:
+                yield from input_rows
+
+        prefetch = udf.prefetch
+        with _get_cache(self.catalog.cache, prefetch, use_cache=self.cache) as _cache:
+            udf_results = udf.run(
+                self.udf_fields,
+                get_inputs(),
+                self.catalog,
+                self.cache,
+                download_cb=download_cb,
+                processed_cb=processed_cb,
+            )
+            with safe_closing(udf_results):
+                process_udf_outputs(
+                    self.catalog.warehouse.clone(),
+                    self.table,
+                    udf_results,
+                    udf,
+                    cb=generated_cb,
+                )

-        input_batch_size = rows_total // n_workers
+    def input_batch_size(self, n_workers: int) -> int:
+        input_batch_size = self.rows_total // n_workers
         if input_batch_size == 0:
             input_batch_size = 1
         elif input_batch_size > DEFAULT_BATCH_SIZE:
             input_batch_size = DEFAULT_BATCH_SIZE
+        return input_batch_size

-        if self.buffer_size < n_workers:
-            raise RuntimeError(
-                "Parallel run error: buffer size is smaller than "
-                f"number of workers: {self.buffer_size} < {n_workers}"
-            )
-
+    def run_udf_parallel(  # noqa: C901, PLR0912
+        self,
+        n_workers: int,
+        input_rows: Iterable["RowsOutput"],
+        ids_only: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+        generated_cb: Callback = DEFAULT_CALLBACK,
+    ) -> None:
         self.task_queue = self.ctx.Queue()
         self.done_queue = self.ctx.Queue()
+
         pool = [
-            self.ctx.Process(name=f"Worker-UDF-{i}", target=self._run_worker)
+            self.ctx.Process(
+                name=f"Worker-UDF-{i}", target=self._run_worker, args=[ids_only]
+            )
             for i in range(n_workers)
         ]
         for p in pool:
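To make the batch-size clamping above concrete, here is a small worked sketch of the `input_batch_size` logic, rewritten as a free function purely for illustration (the real method reads `self.rows_total`); the numbers are made up, and `DEFAULT_BATCH_SIZE = 10000` comes from the constants earlier in this file.

```python
# Illustrative only: mirrors the clamping in UDFDispatcher.input_batch_size above.
DEFAULT_BATCH_SIZE = 10000


def input_batch_size(rows_total: int, n_workers: int) -> int:
    size = rows_total // n_workers
    if size == 0:
        size = 1  # never hand a worker an empty batch
    elif size > DEFAULT_BATCH_SIZE:
        size = DEFAULT_BATCH_SIZE  # cap very large splits
    return size


assert input_batch_size(25_000, 4) == 6_250    # even split across workers
assert input_batch_size(3, 8) == 1             # clamped up to one row
assert input_batch_size(200_000, 4) == 10_000  # capped at DEFAULT_BATCH_SIZE
```

Note that the new `run_udf` falls back to `run_udf_single` whenever the resolved worker count is 1, so the buffer-size check only applies to the parallel path.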
@@ -198,7 +264,8 @@ class UDFDispatcher:
         input_finished = False

         if not self.is_batching:
-            input_rows = batched(flatten(input_rows), input_batch_size)
+            batch_size = self.input_batch_size(n_workers)
+            input_rows = batched(flatten(input_rows), batch_size)

         # Stop all workers after the input rows have finished processing
         input_data = chain(input_rows, [STOP_SIGNAL] * n_workers)
@@ -213,10 +280,15 @@ class UDFDispatcher:

         # Process all tasks
         while n_workers > 0:
-            result = get_from_queue(self.done_queue)
+            try:
+                result = get_from_queue(self.done_queue)
+            except KeyboardInterrupt:
+                break

+            if bytes_downloaded := result.get("bytes_downloaded"):
+                download_cb.relative_update(bytes_downloaded)
             if downloaded := result.get("downloaded"):
-                download_cb.relative_update(downloaded)
+                download_cb.increment_file_count(downloaded)
             if processed := result.get("processed"):
                 processed_cb.relative_update(processed)
             if generated := result.get("generated"):
@@ -246,13 +318,12 @@ class UDFDispatcher:
             # Stop all workers if there is an unexpected exception
             for _ in pool:
                 put_into_queue(self.task_queue, STOP_SIGNAL)
-            self.task_queue.close()

             # This allows workers (and this process) to exit without
             # consuming any remaining data in the queues.
             # (If they exit due to an exception.)
-            self.task_queue.cancel_join_thread()
-            self.done_queue.cancel_join_thread()
+            self.task_queue.close()
+            self.task_queue.join_thread()

             # Flush all items from the done queue.
             # This is needed if any workers are still running.
@@ -262,6 +333,9 @@ class UDFDispatcher:
                 if status != OK_STATUS:
                     n_workers -= 1

+            self.done_queue.close()
+            self.done_queue.join_thread()
+
             # Wait for workers to stop
             for p in pool:
                 p.join()
@@ -273,8 +347,7 @@ class DownloadCallback(Callback):
         super().__init__()

     def relative_update(self, inc: int = 1) -> None:
-        # This callback is used to notify the size of the downloaded files
-        pass
+        put_into_queue(self.queue, {"status": NOTIFY_STATUS, "bytes_downloaded": inc})

     def increment_file_count(self, inc: int = 1) -> None:
         put_into_queue(self.queue, {"status": NOTIFY_STATUS, "downloaded": inc})
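Together with the `run_udf_parallel` loop above, this gives the worker-to-dispatcher progress protocol two distinct keys: `bytes_downloaded` for byte totals and `downloaded` for file counts. A minimal, self-contained sketch of that round trip, using a plain `queue.Queue` and a stand-in string for `NOTIFY_STATUS` instead of the library's multiprocess queue and constants:

```python
# Hypothetical sketch of the worker -> dispatcher progress messages in this diff.
from queue import Queue

NOTIFY_STATUS = "notify"  # stand-in; the real constant lives in datachain

q: Queue = Queue()

# Worker side (DownloadCallback): bytes and file counts are reported separately.
q.put({"status": NOTIFY_STATUS, "bytes_downloaded": 2048})
q.put({"status": NOTIFY_STATUS, "downloaded": 1})

# Dispatcher side (run_udf_parallel): route each key to the matching update.
totals = {"bytes": 0, "files": 0}
while not q.empty():
    result = q.get()
    if n := result.get("bytes_downloaded"):
        totals["bytes"] += n   # -> download_cb.relative_update(n)
    if n := result.get("downloaded"):
        totals["files"] += n   # -> download_cb.increment_file_count(n)

assert totals == {"bytes": 2048, "files": 1}
```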
@@ -303,8 +376,8 @@ class UDFWorker:
         done_queue: "multiprocess.Queue",
         query: "Select",
         table: "Table",
-        is_batching: bool,
         cache: bool,
+        is_batching: bool,
         udf_fields: Sequence[str],
     ) -> None:
         self.catalog = catalog
@@ -313,21 +386,21 @@ class UDFWorker:
         self.done_queue = done_queue
         self.query = query
         self.table = table
-        self.is_batching = is_batching
         self.cache = cache
+        self.is_batching = is_batching
         self.udf_fields = udf_fields

         self.download_cb = DownloadCallback(self.done_queue)
         self.processed_cb = ProcessedCallback("processed", self.done_queue)
         self.generated_cb = ProcessedCallback("generated", self.done_queue)

-    def run(self) -> None:
+    def run(self, ids_only: bool) -> None:
         prefetch = self.udf.prefetch
         with _get_cache(self.catalog.cache, prefetch, use_cache=self.cache) as _cache:
             catalog = clone_catalog_with_cache(self.catalog, _cache)
             udf_results = self.udf.run(
                 self.udf_fields,
-                self.get_inputs(),
+                self.get_inputs(ids_only),
                 catalog,
                 self.cache,
                 download_cb=self.download_cb,
@@ -348,23 +421,12 @@ class UDFWorker:
             put_into_queue(self.done_queue, {"status": OK_STATUS})
             yield row

-    def get_inputs(self):
+    def get_inputs(self, ids_only: bool) -> Iterable["RowsOutput"]:
         warehouse = self.catalog.warehouse.clone()
-        col_id = get_query_id_column(self.query)
-
-        if self.is_batching:
-            while (batch := get_from_queue(self.task_queue)) != STOP_SIGNAL:
-                ids = [row[0] for row in batch.rows]
-                rows = warehouse.dataset_rows_select(self.query.where(col_id.in_(ids)))
-                yield RowsOutputBatch(list(rows))
-        else:
-            while (batch := get_from_queue(self.task_queue)) != STOP_SIGNAL:
-                yield from warehouse.dataset_rows_select(
-                    self.query.where(col_id.in_(batch))
+        while (batch := get_from_queue(self.task_queue)) != STOP_SIGNAL:
+            if ids_only:
+                yield from warehouse.dataset_rows_select_from_ids(
+                    self.query, batch, self.is_batching
                 )
-
-
-class RepeatTimer(Timer):
-    def run(self):
-        while not self.finished.wait(self.interval):
-            self.function(*self.args, **self.kwargs)
+            else:
+                yield from batch
@@ -15,11 +15,10 @@ def set(key: str, value: Union[str, int, float, bool, None]) -> None:  # noqa: P
     metrics[key] = value

     if job_id := os.getenv("DATACHAIN_JOB_ID"):
-        from datachain.data_storage.job import JobStatus
         from datachain.query.session import Session

         metastore = Session.get().catalog.metastore
-        metastore.set_job_status(job_id, JobStatus.RUNNING, metrics=metrics)
+        metastore.update_job(job_id, metrics=metrics)


 def get(key: str) -> Optional[Union[str, int, float, bool]]:
datachain/query/queue.py CHANGED
@@ -7,7 +7,7 @@ from typing import Any

 import msgpack

-from datachain.query.batch import RowsOutput, RowsOutputBatch
+from datachain.query.batch import RowsOutput

 DEFAULT_BATCH_SIZE = 10000
 STOP_SIGNAL = "STOP"
@@ -56,7 +56,6 @@ def put_into_queue(queue: Queue, item: Any) -> None:


 MSGPACK_EXT_TYPE_DATETIME = 42
-MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH = 43


 def _msgpack_pack_extended_types(obj: Any) -> msgpack.ExtType:
@@ -70,12 +69,6 @@ def _msgpack_pack_extended_types(obj: Any) -> msgpack.ExtType:
         data = (obj.timestamp(),)  # type: ignore  # noqa: PGH003
         return msgpack.ExtType(MSGPACK_EXT_TYPE_DATETIME, pack("!d", *data))

-    if isinstance(obj, RowsOutputBatch):
-        return msgpack.ExtType(
-            MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH,
-            msgpack_pack(obj.rows),
-        )
-
     raise TypeError(f"Unknown type: {obj}")

@@ -100,9 +93,6 @@ def _msgpack_unpack_extended_types(code: int, data: bytes) -> Any:
         tz_info = datetime.timezone(datetime.timedelta(seconds=timezone_offset))
         return datetime.datetime.fromtimestamp(timestamp, tz=tz_info)

-    if code == MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH:
-        return RowsOutputBatch(msgpack_unpack(data))
-
     return msgpack.ExtType(code, data)

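With the `RowsOutputBatch` extension gone, the datetime extension (code 42) is the only custom msgpack type left in this module. A simplified, self-contained sketch of that pattern follows; it stores only a UTC timestamp, whereas the module's real codec also carries a timezone offset, as the unpack hunk above shows.

```python
# Simplified sketch of the msgpack ExtType pattern kept in queue.py (code 42).
import datetime
from struct import pack, unpack

import msgpack

MSGPACK_EXT_TYPE_DATETIME = 42


def _default(obj):
    if isinstance(obj, datetime.datetime):
        # Encode the POSIX timestamp as a big-endian double.
        return msgpack.ExtType(MSGPACK_EXT_TYPE_DATETIME, pack("!d", obj.timestamp()))
    raise TypeError(f"Unknown type: {obj}")


def _ext_hook(code, data):
    if code == MSGPACK_EXT_TYPE_DATETIME:
        (ts,) = unpack("!d", data)
        return datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc)
    return msgpack.ExtType(code, data)


now = datetime.datetime.now(datetime.timezone.utc)
payload = msgpack.packb({"created": now}, default=_default)
restored = msgpack.unpackb(payload, ext_hook=_ext_hook, raw=False)
assert abs((restored["created"] - now).total_seconds()) < 1e-3
```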
datachain/query/udf.py CHANGED
@@ -46,4 +46,4 @@ class AbstractUDFDistributor(ABC):

     @staticmethod
     @abstractmethod
-    def run_worker() -> int: ...
+    def run_udf(fd: Optional[int] = None) -> int: ...
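For context, `udf_worker_entrypoint` above now forwards an optional file descriptor to this renamed static method. A minimal hypothetical subclass satisfying just this part of the interface; the ABC may define further abstract members that are not part of this diff.

```python
# Hypothetical distributor; only the run_udf staticmethod shown in this diff
# is implemented here.
from typing import Optional

from datachain.query.udf import AbstractUDFDistributor


class EchoUDFDistributor(AbstractUDFDistributor):
    @staticmethod
    def run_udf(fd: Optional[int] = None) -> int:
        # A real implementation would read work over the given descriptor and
        # return a process exit code; here we just report what we received.
        print(f"worker started, fd={fd}")
        return 0


# Mirrors the call site in udf_worker_entrypoint: the method is invoked on the class.
assert EchoUDFDistributor.run_udf(None) == 0
```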
datachain/query/utils.py CHANGED
@@ -1,33 +1,27 @@
-from typing import TYPE_CHECKING, Optional, Union
+from typing import Optional, Union

-from sqlalchemy import Column
+import sqlalchemy as sa

-if TYPE_CHECKING:
-    from sqlalchemy import ColumnElement, Select, TextClause
-
-
-ColT = Union[Column, "ColumnElement", "TextClause"]
+ColT = Union[sa.Column, sa.ColumnElement, sa.TextClause]


 def column_name(col: ColT) -> str:
     """Returns column name from column element."""
-    return col.name if isinstance(col, Column) else str(col)
+    return col.name if isinstance(col, sa.Column) else str(col)


-def get_query_column(query: "Select", name: str) -> Optional[ColT]:
+def get_query_column(query: sa.Select, name: str) -> Optional[ColT]:
     """Returns column element from query by name or None if column not found."""
     return next((col for col in query.inner_columns if column_name(col) == name), None)


-def get_query_id_column(query: "Select") -> ColT:
+def get_query_id_column(query: sa.Select) -> Optional[sa.ColumnElement]:
     """Returns ID column element from query or None if column not found."""
     col = get_query_column(query, "sys__id")
-    if col is None:
-        raise RuntimeError("sys__id column not found in query")
-    return col
+    return col if col is not None and isinstance(col, sa.ColumnElement) else None


-def select_only_columns(query: "Select", *names: str) -> "Select":
+def select_only_columns(query: sa.Select, *names: str) -> sa.Select:
     """Returns query selecting defined columns only."""
     if not names:
         return query
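The behavioural change here is that a missing `sys__id` column is no longer an error. A small sketch of the new contract, built with plain SQLAlchemy tables (the table and the columns other than `sys__id` are made up for the example):

```python
import sqlalchemy as sa

from datachain.query.utils import get_query_id_column  # path per the file header above

metadata = sa.MetaData()
rows = sa.Table(
    "rows",  # example table, not part of datachain
    metadata,
    sa.Column("sys__id", sa.Integer),
    sa.Column("name", sa.String),
)

with_id = sa.select(rows.c.sys__id, rows.c.name)
without_id = sa.select(rows.c.name)

assert get_query_id_column(with_id) is not None  # the sys__id column element
assert get_query_id_column(without_id) is None   # previously raised RuntimeError
```

`udf_entrypoint` above relies on exactly this: it passes `id_col` into the batching callable and derives `ids_only=id_col is not None` from it.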
@@ -387,6 +387,7 @@ class StudioClient:
         files: Optional[list[str]] = None,
         python_version: Optional[str] = None,
         requirements: Optional[str] = None,
+        repository: Optional[str] = None,
     ) -> Response[JobData]:
         data = {
             "query": query,
@@ -397,6 +398,7 @@
             "files": files,
             "python_version": python_version,
             "requirements": requirements,
+            "repository": repository,
         }
         return self._send_request("datachain/job", data)

datachain/studio.py CHANGED
@@ -35,6 +35,7 @@ def process_jobs_args(args: "Namespace"):
         args.workers,
         args.files,
         args.python_version,
+        args.repository,
         args.req,
         args.req_file,
     )
@@ -256,6 +257,7 @@ def create_job(
     workers: Optional[int] = None,
     files: Optional[list[str]] = None,
     python_version: Optional[str] = None,
+    repository: Optional[str] = None,
     req: Optional[list[str]] = None,
     req_file: Optional[str] = None,
 ):
@@ -284,6 +286,7 @@
         query_name=os.path.basename(query_file),
         files=file_ids,
         python_version=python_version,
+        repository=repository,
         requirements=requirements,
     )
     if not response.ok:
datachain/utils.py CHANGED
@@ -323,6 +323,9 @@ def determine_processes(
         return True
     if parallel < 0:
         return True
+    if parallel == 1:
+        # Disable parallel processing if only one process is requested.
+        return False
     return parallel


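A hypothetical mirror of just the branches visible in this hunk (the full `determine_processes` has earlier cases the diff does not show), to make the new `parallel == 1` behaviour concrete; presumably this value ends up in the `processes` field consumed by `UDFDispatcher.run_udf` above, where `False` resolves to a single in-process run.

```python
# Hypothetical: reproduces only the branches visible in the hunk above.
from typing import Union


def visible_branches(parallel: int) -> Union[bool, int]:
    if parallel < 0:
        return True   # negative count: use the default number of workers
    if parallel == 1:
        return False  # new in 0.16.5: one process means no worker pool at all
    return parallel   # any other positive count passes through unchanged


assert visible_branches(-1) is True
assert visible_branches(1) is False
assert visible_branches(4) == 4
```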
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.16.3
+Version: 0.16.5
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
  License-Expression: Apache-2.0