datachain 0.16.4__py3-none-any.whl → 0.16.5__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry. The information is provided for informational purposes only.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/catalog/catalog.py +5 -1
- datachain/cli/__init__.py +11 -9
- datachain/cli/commands/query.py +1 -0
- datachain/cli/parser/__init__.py +9 -1
- datachain/data_storage/job.py +1 -0
- datachain/data_storage/metastore.py +82 -71
- datachain/data_storage/warehouse.py +46 -34
- datachain/lib/arrow.py +23 -1
- datachain/lib/dc/csv.py +1 -0
- datachain/lib/dc/datachain.py +30 -13
- datachain/lib/listing.py +2 -0
- datachain/lib/udf.py +17 -5
- datachain/query/batch.py +40 -39
- datachain/query/dataset.py +33 -32
- datachain/query/dispatch.py +137 -75
- datachain/query/metrics.py +1 -2
- datachain/query/queue.py +1 -11
- datachain/query/udf.py +1 -1
- datachain/query/utils.py +8 -14
- datachain/utils.py +3 -0
- {datachain-0.16.4.dist-info → datachain-0.16.5.dist-info}/METADATA +1 -1
- {datachain-0.16.4.dist-info → datachain-0.16.5.dist-info}/RECORD +26 -26
- {datachain-0.16.4.dist-info → datachain-0.16.5.dist-info}/WHEEL +1 -1
- {datachain-0.16.4.dist-info → datachain-0.16.5.dist-info}/entry_points.txt +0 -0
- {datachain-0.16.4.dist-info → datachain-0.16.5.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.16.4.dist-info → datachain-0.16.5.dist-info}/top_level.txt +0 -0
datachain/lib/dc/datachain.py
CHANGED
@@ -1636,18 +1636,27 @@ class DataChain:
         """
         from pyarrow.dataset import CsvFileFormat, JsonFileFormat
 
-        from datachain.lib.arrow import ...
+        from datachain.lib.arrow import (
+            ArrowGenerator,
+            fix_pyarrow_format,
+            infer_schema,
+            schema_to_output,
+        )
 
-        ...
+        parse_options = kwargs.pop("parse_options", None)
+        if format := kwargs.get("format"):
+            kwargs["format"] = fix_pyarrow_format(format, parse_options)
+
+        if (
+            nrows
+            and format not in ["csv", "json"]
+            and not isinstance(format, (CsvFileFormat, JsonFileFormat))
+        ):
+            raise DatasetPrepareError(
+                self.name,
+                "error in `parse_tabular` - "
+                "`nrows` only supported for csv and json formats.",
+            )
 
         if "file" not in self.schema or not self.count():
             raise DatasetPrepareError(self.name, "no files to parse.")

@@ -1656,7 +1665,7 @@ class DataChain:
         col_names = output if isinstance(output, Sequence) else None
         if col_names or not output:
             try:
-                schema = infer_schema(self, **kwargs)
+                schema = infer_schema(self, **kwargs, parse_options=parse_options)
                 output, _ = schema_to_output(schema, col_names)
             except ValueError as e:
                 raise DatasetPrepareError(self.name, e) from e

@@ -1682,7 +1691,15 @@ class DataChain:
         # disable prefetch if nrows is set
         settings = {"prefetch": 0} if nrows else {}
         return self.settings(**settings).gen(  # type: ignore[arg-type]
-            ArrowGenerator(...
+            ArrowGenerator(
+                schema,
+                model,
+                source,
+                nrows,
+                parse_options=parse_options,
+                **kwargs,
+            ),
+            output=output,
         )
 
     @classmethod
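The user-visible effect of this change is that `parse_tabular` now accepts and forwards a `parse_options` argument to pyarrow. A minimal usage sketch; the `read_storage` call and the bucket path are illustrative, not taken from the diff:

```python
# Sketch: pass pyarrow CSV parse options through parse_tabular so that
# tab-separated files are split on the right delimiter.
import pyarrow.csv as pacsv

import datachain as dc

chain = dc.read_storage("s3://example-bucket/reports/")  # placeholder source
parsed = chain.parse_tabular(
    format="csv",
    parse_options=pacsv.ParseOptions(delimiter="\t"),
)
```

From the hunk above, `fix_pyarrow_format` appears to fold the parse options into the pyarrow format object before the dataset is built, and the same options are reused when inferring the output schema.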
datachain/lib/listing.py
CHANGED
datachain/lib/udf.py
CHANGED
@@ -218,6 +218,18 @@ class UDFBase(AbstractUDF):
     def name(self):
         return self.__class__.__name__
 
+    @property
+    def verbose_name(self):
+        """Returns the name of the function or class that implements the UDF."""
+        if self._func and callable(self._func):
+            if hasattr(self._func, "__name__"):
+                return self._func.__name__
+            if hasattr(self._func, "__class__") and hasattr(
+                self._func.__class__, "__name__"
+            ):
+                return self._func.__class__.__name__
+        return "<unknown>"
+
     @property
     def signal_names(self) -> Iterable[str]:
         return self.output.to_udf_spec().keys()

@@ -411,13 +423,13 @@ class BatchMapper(UDFBase):
         self.setup()
 
         for batch in udf_inputs:
-            n_rows = len(batch...
+            n_rows = len(batch)
             row_ids, *udf_args = zip(
                 *[
                     self._prepare_row_and_id(
                         row, udf_fields, catalog, cache, download_cb
                     )
-                    for row in batch...
+                    for row in batch
                 ]
             )
             result_objs = list(self.process_safe(udf_args))

@@ -489,7 +501,7 @@ class Aggregator(UDFBase):
 
     def run(
         self,
-        udf_fields: ...
+        udf_fields: Sequence[str],
         udf_inputs: Iterable[RowsOutputBatch],
         catalog: "Catalog",
         cache: bool,

@@ -502,13 +514,13 @@ class Aggregator(UDFBase):
             udf_args = zip(
                 *[
                     self._prepare_row(row, udf_fields, catalog, cache, download_cb)
-                    for row in batch...
+                    for row in batch
                 ]
             )
             result_objs = self.process_safe(udf_args)
             udf_outputs = (self._flatten_row(row) for row in result_objs)
             output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
-            processed_cb.relative_update(len(batch...
+            processed_cb.relative_update(len(batch))
             yield output
 
         self.teardown()
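The new `verbose_name` property only inspects the callable stored on the UDF. A standalone sketch of the same lookup; the helper function and sample callables are illustrative, not part of the diff:

```python
# Sketch of the verbose_name logic added above, applied to a bare callable.
def verbose_name(func) -> str:
    if func and callable(func):
        if hasattr(func, "__name__"):
            return func.__name__            # plain function or lambda
        if hasattr(func.__class__, "__name__"):
            return func.__class__.__name__  # callable class instance
    return "<unknown>"


def embed(file): ...


class Embedder:
    def __call__(self, file): ...


assert verbose_name(embed) == "embed"
assert verbose_name(Embedder()) == "Embedder"
```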
datachain/query/batch.py
CHANGED
@@ -2,22 +2,14 @@ import contextlib
 import math
 from abc import ABC, abstractmethod
 from collections.abc import Generator, Sequence
-from ...
-from typing import TYPE_CHECKING, Callable, Optional, Union
-
-from datachain.data_storage.schema import PARTITION_COLUMN_ID
-from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
-from datachain.query.utils import get_query_column, get_query_id_column
-
-if TYPE_CHECKING:
-    from sqlalchemy import Select
+from typing import Callable, Optional, Union
 
+import sqlalchemy as sa
 
-...
-    rows: Sequence[Sequence]
-
+from datachain.data_storage.schema import PARTITION_COLUMN_ID
+from datachain.query.utils import get_query_column
 
+RowsOutputBatch = Sequence[Sequence]
 RowsOutput = Union[Sequence, RowsOutputBatch]
 
 

@@ -30,8 +22,8 @@ class BatchingStrategy(ABC):
     def __call__(
         self,
         execute: Callable,
-        query: ...
-        ...
+        query: sa.Select,
+        id_col: Optional[sa.ColumnElement] = None,
     ) -> Generator[RowsOutput, None, None]:
         """Apply the provided parameters to the UDF."""

@@ -47,12 +39,16 @@ class NoBatching(BatchingStrategy):
     def __call__(
         self,
         execute: Callable,
-        query: ...
-        ...
+        query: sa.Select,
+        id_col: Optional[sa.ColumnElement] = None,
     ) -> Generator[Sequence, None, None]:
-        ...
+        ids_only = False
+        if id_col is not None:
+            query = query.with_only_columns(id_col)
+            ids_only = True
+
+        rows = execute(query)
+        yield from (r[0] for r in rows) if ids_only else rows
 
 
 class Batch(BatchingStrategy):

@@ -69,27 +65,31 @@ class Batch(BatchingStrategy):
     def __call__(
         self,
         execute: Callable,
-        query: ...
-        ...
-    ) -> Generator[...
-        ...
+        query: sa.Select,
+        id_col: Optional[sa.ColumnElement] = None,
+    ) -> Generator[RowsOutput, None, None]:
+        from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
+
+        ids_only = False
+        if id_col is not None:
+            query = query.with_only_columns(id_col)
+            ids_only = True
 
         # choose page size that is a multiple of the batch size
         page_size = math.ceil(SELECT_BATCH_SIZE / self.count) * self.count
 
         # select rows in batches
-        results...
+        results = []
 
-        with contextlib.closing(execute(query, page_size=page_size)) as ...
-            for row in ...
+        with contextlib.closing(execute(query, page_size=page_size)) as batch_rows:
+            for row in batch_rows:
                 results.append(row)
                 if len(results) >= self.count:
                     batch, results = results[: self.count], results[self.count :]
-                    yield ...
+                    yield [r[0] for r in batch] if ids_only else batch
 
         if len(results) > 0:
-            yield ...
+            yield [r[0] for r in results] if ids_only else results
 
 
 class Partition(BatchingStrategy):

@@ -104,18 +104,19 @@ class Partition(BatchingStrategy):
     def __call__(
         self,
         execute: Callable,
-        query: ...
-        ...
-    ) -> Generator[...
-        id_col = get_query_id_column(query)
+        query: sa.Select,
+        id_col: Optional[sa.ColumnElement] = None,
+    ) -> Generator[RowsOutput, None, None]:
         if (partition_col := get_query_column(query, PARTITION_COLUMN_ID)) is None:
             raise RuntimeError("partition column not found in query")
 
-        ...
+        ids_only = False
+        if id_col is not None:
             query = query.with_only_columns(id_col, partition_col)
+            ids_only = True
 
         current_partition: Optional[int] = None
-        batch: list...
+        batch: list = []
 
         query_fields = [str(c.name) for c in query.selected_columns]
         id_column_idx = query_fields.index("sys__id")

@@ -132,9 +133,9 @@ class Partition(BatchingStrategy):
                 if current_partition != partition:
                     current_partition = partition
                     if len(batch) > 0:
-                        yield ...
+                        yield batch
                         batch = []
-                batch.append(...
+                batch.append(row[id_column_idx] if ids_only else row)
 
         if len(batch) > 0:
-            yield ...
+            yield batch
datachain/query/dataset.py
CHANGED
@@ -42,15 +42,9 @@ from datachain.data_storage.schema import (
     partition_columns,
 )
 from datachain.dataset import DATASET_PREFIX, DatasetStatus, RowDict
-from datachain.error import (
-    DatasetNotFoundError,
-    QueryScriptCancelError,
-)
+from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.func.base import Function
-from datachain.lib.listing import (
-    is_listing_dataset,
-    listing_dataset_expired,
-)
+from datachain.lib.listing import is_listing_dataset, listing_dataset_expired
 from datachain.lib.udf import UDFAdapter, _get_cache
 from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
 from datachain.query.schema import C, UDFParamSpec, normalize_param

@@ -420,41 +414,30 @@ class UDFStep(Step, ABC):
         """
 
     def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
-        ...
-        rows_total = self.catalog.warehouse.query_count(query)
-        if rows_total == 0:
+        if (rows_total := self.catalog.warehouse.query_count(query)) == 0:
             return
 
+        from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE
+        from datachain.catalog.loader import (
+            DISTRIBUTED_IMPORT_PATH,
+            get_udf_distributor_class,
+        )
+
         workers = determine_workers(self.workers, rows_total=rows_total)
         processes = determine_processes(self.parallel, rows_total=rows_total)
 
         use_partitioning = self.partition_by is not None
         batching = self.udf.get_batching(use_partitioning)
         udf_fields = [str(c.name) for c in query.selected_columns]
+        udf_distributor_class = get_udf_distributor_class()
 
         prefetch = self.udf.prefetch
         with _get_cache(self.catalog.cache, prefetch, use_cache=self.cache) as _cache:
             catalog = clone_catalog_with_cache(self.catalog, _cache)
-            try:
-                if workers:
-                    if catalog.in_memory:
-                        raise RuntimeError(
-                            "In-memory databases cannot be used with "
-                            "distributed processing."
-                        )
-
-                    from datachain.catalog.loader import (
-                        DISTRIBUTED_IMPORT_PATH,
-                        get_udf_distributor_class,
-                    )
-
-                    if not (udf_distributor_class := get_udf_distributor_class()):
-                        raise RuntimeError(
-                            f"{DISTRIBUTED_IMPORT_PATH} import path is required "
-                            "for distributed UDF processing."
-                        )
 
+            try:
+                if udf_distributor_class and not catalog.in_memory:
+                    # Use the UDF distributor if available (running in SaaS)
                     udf_distributor = udf_distributor_class(
                         catalog=catalog,
                         table=udf_table,

@@ -470,7 +453,20 @@ class UDFStep(Step, ABC):
                         min_task_size=self.min_task_size,
                     )
                     udf_distributor()
-                    ...
+                    return
+
+                if workers:
+                    if catalog.in_memory:
+                        raise RuntimeError(
+                            "In-memory databases cannot be used with "
+                            "distributed processing."
+                        )
+
+                    raise RuntimeError(
+                        f"{DISTRIBUTED_IMPORT_PATH} import path is required "
+                        "for distributed UDF processing."
+                    )
+
                 if processes:
                     # Parallel processing (faster for more CPU-heavy UDFs)
                     if catalog.in_memory:
                         raise RuntimeError(

@@ -504,7 +500,12 @@ class UDFStep(Step, ABC):
                 with subprocess.Popen(  # noqa: S603
                     cmd, env=envs, stdin=subprocess.PIPE
                 ) as process:
-                    ...
+                    try:
+                        process.communicate(process_data)
+                    except KeyboardInterrupt:
+                        raise QueryScriptCancelError(
+                            "UDF execution was canceled by the user."
+                        ) from None
                     if retval := process.poll():
                         raise RuntimeError(
                             f"UDF Execution Failed! Exit code: {retval}"