PyPI - datachain - Versions diffs - 0.5.1__tar.gz → 0.6.0__tar.gz - Mend

datachain 0.5.1__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datachain might be problematic. Click here for more details.

Files changed (251) hide show

{datachain-0.5.1 → datachain-0.6.0}/.pre-commit-config.yaml RENAMED Viewed

@@ -4,7 +4,7 @@ ci:
   skip: [mypy]
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
     hooks:
       - id: check-added-large-files
         exclude: '^tests/examples/data/'
@@ -24,7 +24,7 @@ repos:
       - id: trailing-whitespace
         exclude: '^LICENSES/'
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: 'v0.6.8'
+    rev: 'v0.6.9'
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]

{datachain-0.5.1/src/datachain.egg-info → datachain-0.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.5.1
+Version: 0.6.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0

{datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/convert/values_to_tuples.py RENAMED Viewed

@@ -4,7 +4,7 @@ from typing import Any, Union
 from datachain.lib.data_model import (
     DataType,
     DataTypeNames,
-    DataValuesType,
+    DataValue,
     is_chain_type,
 )
 from datachain.lib.utils import DataChainParamsError
@@ -20,7 +20,7 @@ class ValuesToTupleError(DataChainParamsError):
 def values_to_tuples(  # noqa: C901, PLR0912
     ds_name: str = "",
     output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
-    **fr_map: Sequence[DataValuesType],
+    **fr_map: Sequence[DataValue],
 ) -> tuple[Any, Any, Any]:
     if output:
         if not isinstance(output, (Sequence, str, dict)):

{datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/data_model.py RENAMED Viewed

@@ -18,7 +18,7 @@ StandardType = Union[
 ]
 DataType = Union[type[BaseModel], StandardType]
 DataTypeNames = "BaseModel, int, str, float, bool, list, dict, bytes, datetime"
-DataValuesType = Union[BaseModel, int, str, float, bool, list, dict, bytes, datetime]
+DataValue = Union[BaseModel, int, str, float, bool, list, dict, bytes, datetime]
 class DataModel(BaseModel):

{datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/dc.py RENAMED Viewed

@@ -62,6 +62,7 @@ from datachain.telemetry import telemetry
 from datachain.utils import batched_it, inside_notebook
 if TYPE_CHECKING:
+    from pyarrow import DataType as ArrowDataType
     from typing_extensions import Concatenate, ParamSpec, Self
     from datachain.lib.hf import HFDatasetType
@@ -1024,7 +1025,7 @@ class DataChain:
         The supported functions:
            Numerical:   +, -, *, /, rand(), avg(), count(), func(),
                         greatest(), least(), max(), min(), sum()
-           String:      length(), split()
+           String:      length(), split(), replace(), regexp_replace()
            Filename:    name(), parent(), file_stem(), file_ext()
            Array:       length(), sip_hash_64(), euclidean_distance(),
                         cosine_distance()
@@ -1709,6 +1710,7 @@ class DataChain:
         nrows=None,
         session: Optional[Session] = None,
         settings: Optional[dict] = None,
+        column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
         **kwargs,
     ) -> "DataChain":
         """Generate chain from csv files.
@@ -1727,6 +1729,9 @@ class DataChain:
             nrows : Optional row limit.
             session : Session to use for the chain.
             settings : Settings to use for the chain.
+            column_types : Dictionary of column names and their corresponding types.
+                It is passed to CSV reader and for each column specified type auto
+                inference is disabled.
         Example:
             Reading a csv file:
@@ -1742,6 +1747,15 @@ class DataChain:
         from pandas.io.parsers.readers import STR_NA_VALUES
         from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
         from pyarrow.dataset import CsvFileFormat
+        from pyarrow.lib import type_for_alias
+        if column_types:
+            column_types = {
+                name: type_for_alias(typ) if isinstance(typ, str) else typ
+                for name, typ in column_types.items()
+            }
+        else:
+            column_types = {}
         chain = DataChain.from_storage(
             path, session=session, settings=settings, **kwargs
@@ -1767,7 +1781,9 @@ class DataChain:
         parse_options = ParseOptions(delimiter=delimiter)
         read_options = ReadOptions(column_names=column_names)
         convert_options = ConvertOptions(
-            strings_can_be_null=True, null_values=STR_NA_VALUES
+            strings_can_be_null=True,
+            null_values=STR_NA_VALUES,
+            column_types=column_types,
         )
         format = CsvFileFormat(
             parse_options=parse_options,

{datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/signal_schema.py RENAMED Viewed

@@ -25,7 +25,7 @@ from typing_extensions import Literal as LiteralEx
 from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.sql_to_python import sql_to_python
 from datachain.lib.convert.unflatten import unflatten_to_json_pos
-from datachain.lib.data_model import DataModel, DataType
+from datachain.lib.data_model import DataModel, DataType, DataValue
 from datachain.lib.file import File
 from datachain.lib.model_store import ModelStore
 from datachain.lib.utils import DataChainParamsError
@@ -110,7 +110,7 @@ class SignalSchema:
     values: dict[str, DataType]
     tree: dict[str, Any]
     setup_func: dict[str, Callable]
-    setup_values: Optional[dict[str, Callable]]
+    setup_values: Optional[dict[str, Any]]
     def __init__(
         self,
@@ -333,21 +333,21 @@ class SignalSchema:
                 res[db_name] = python_to_sql(type_)
         return res
-    def row_to_objs(self, row: Sequence[Any]) -> list[DataType]:
+    def row_to_objs(self, row: Sequence[Any]) -> list[DataValue]:
         self._init_setup_values()
-        objs = []
+        objs: list[DataValue] = []
         pos = 0
         for name, fr_type in self.values.items():
             if self.setup_values and (val := self.setup_values.get(name, None)):
                 objs.append(val)
             elif (fr := ModelStore.to_pydantic(fr_type)) is not None:
                 j, pos = unflatten_to_json_pos(fr, row, pos)
-                objs.append(fr(**j))  # type: ignore[arg-type]
+                objs.append(fr(**j))
             else:
                 objs.append(row[pos])
                 pos += 1
-        return objs  # type: ignore[return-value]
+        return objs
     def contains_file(self) -> bool:
         for type_ in self.values.values():

{datachain-0.5.1 → datachain-0.6.0}/src/datachain/lib/udf.py RENAMED Viewed

@@ -1,14 +1,15 @@
 import sys
 import traceback
 from collections.abc import Iterable, Iterator, Mapping, Sequence
-from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Callable, Optional
+import attrs
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from pydantic import BaseModel
 from datachain.dataset import RowDict
 from datachain.lib.convert.flatten import flatten
+from datachain.lib.data_model import DataValue
 from datachain.lib.file import File
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
@@ -18,16 +19,14 @@ from datachain.query.batch import (
     NoBatching,
     Partition,
     RowsOutputBatch,
-    UDFInputBatch,
 )
-from datachain.query.schema import ColumnParameter, UDFParameter
 if TYPE_CHECKING:
     from typing_extensions import Self
     from datachain.catalog import Catalog
     from datachain.lib.udf_signature import UdfSignature
-    from datachain.query.batch import RowsOutput, UDFInput
+    from datachain.query.batch import RowsOutput
 class UdfError(DataChainParamsError):
@@ -45,11 +44,21 @@ UDFOutputSpec = Mapping[str, ColumnType]
 UDFResult = dict[str, Any]
-@dataclass
+@attrs.define
 class UDFProperties:
-    """Container for basic UDF properties."""
+    udf: "UDFAdapter"
-    params: list[UDFParameter]
+    def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
+        return self.udf.get_batching(use_partitioning)
+    @property
+    def batch(self):
+        return self.udf.batch
+@attrs.define(slots=False)
+class UDFAdapter:
+    inner: "UDFBase"
     output: UDFOutputSpec
     batch: int = 1
@@ -62,20 +71,10 @@ class UDFProperties:
             return Batch(self.batch)
         raise ValueError(f"invalid batch size {self.batch}")
-    def signal_names(self) -> Iterable[str]:
-        return self.output.keys()
-class UDFAdapter:
-    def __init__(
-        self,
-        inner: "UDFBase",
-        properties: UDFProperties,
-    ):
-        self.inner = inner
-        self.properties = properties
-        self.signal_names = properties.signal_names()
-        self.output = properties.output
+    @property
+    def properties(self):
+        # For backwards compatibility.
+        return UDFProperties(self)
     def run(
         self,
@@ -87,72 +86,14 @@ class UDFAdapter:
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterator[Iterable[UDFResult]]:
-        self.inner.catalog = catalog
-        if hasattr(self.inner, "setup") and callable(self.inner.setup):
-            self.inner.setup()
-        for batch in udf_inputs:
-            if isinstance(batch, RowsOutputBatch):
-                n_rows = len(batch.rows)
-                inputs: UDFInput = UDFInputBatch(
-                    [RowDict(zip(udf_fields, row)) for row in batch.rows]
-                )
-            else:
-                n_rows = 1
-                inputs = RowDict(zip(udf_fields, batch))
-            output = self.run_once(catalog, inputs, is_generator, cache, cb=download_cb)
-            processed_cb.relative_update(n_rows)
-            yield output
-        if hasattr(self.inner, "teardown") and callable(self.inner.teardown):
-            self.inner.teardown()
-    def run_once(
-        self,
-        catalog: "Catalog",
-        arg: "UDFInput",
-        is_generator: bool = False,
-        cache: bool = False,
-        cb: Callback = DEFAULT_CALLBACK,
-    ) -> Iterable[UDFResult]:
-        if isinstance(arg, UDFInputBatch):
-            udf_inputs = [
-                self.bind_parameters(catalog, row, cache=cache, cb=cb)
-                for row in arg.rows
-            ]
-            udf_outputs = self.inner.run_once(udf_inputs, cache=cache, download_cb=cb)
-            return self._process_results(arg.rows, udf_outputs, is_generator)
-        if isinstance(arg, RowDict):
-            udf_inputs = self.bind_parameters(catalog, arg, cache=cache, cb=cb)
-            udf_outputs = self.inner.run_once(udf_inputs, cache=cache, download_cb=cb)
-            if not is_generator:
-                # udf_outputs is generator already if is_generator=True
-                udf_outputs = [udf_outputs]
-            return self._process_results([arg], udf_outputs, is_generator)
-        raise ValueError(f"Unexpected UDF argument: {arg}")
-    def bind_parameters(self, catalog: "Catalog", row: "RowDict", **kwargs) -> list:
-        return [p.get_value(catalog, row, **kwargs) for p in self.properties.params]
-    def _process_results(
-        self,
-        rows: Sequence["RowDict"],
-        results: Sequence[Sequence[Any]],
-        is_generator=False,
-    ) -> Iterable[UDFResult]:
-        """Create a list of dictionaries representing UDF results."""
-        # outputting rows
-        if is_generator:
-            # each row in results is a tuple of column values
-            return (dict(zip(self.signal_names, row)) for row in results)
-        # outputting signals
-        row_ids = [row["sys__id"] for row in rows]
-        return [
-            {"sys__id": row_id} | dict(zip(self.signal_names, signals))
-            for row_id, signals in zip(row_ids, results)
-        ]
+        yield from self.inner.run(
+            udf_fields,
+            udf_inputs,
+            catalog,
+            cache,
+            download_cb,
+            processed_cb,
+        )
 class UDFBase(AbstractUDF):
@@ -203,17 +144,12 @@ class UDFBase(AbstractUDF):
         ```
     """
-    is_input_batched = False
     is_output_batched = False
-    is_input_grouped = False
-    params_spec: Optional[list[str]]
     catalog: "Optional[Catalog]"
     def __init__(self):
-        self.params = None
+        self.params: Optional[SignalSchema] = None
         self.output = None
-        self.params_spec = None
-        self.output_spec = None
         self.catalog = None
         self._func = None
@@ -241,11 +177,6 @@ class UDFBase(AbstractUDF):
     ):
         self.params = params
         self.output = sign.output_schema
-        params_spec = self.params.to_udf_spec()
-        self.params_spec = list(params_spec.keys())
-        self.output_spec = self.output.to_udf_spec()
         self._func = func
     @classmethod
@@ -273,48 +204,27 @@ class UDFBase(AbstractUDF):
     def name(self):
         return self.__class__.__name__
+    @property
+    def signal_names(self) -> Iterable[str]:
+        return self.output.to_udf_spec().keys()
     def to_udf_wrapper(self, batch: int = 1) -> UDFAdapter:
-        assert self.params_spec is not None
-        properties = UDFProperties(
-            [ColumnParameter(p) for p in self.params_spec], self.output_spec, batch
+        return UDFAdapter(
+            self,
+            self.output.to_udf_spec(),
+            batch,
         )
-        return UDFAdapter(self, properties)
-    def validate_results(self, results, *args, **kwargs):
-        return results
-    def run_once(self, rows, cache, download_cb):
-        if self.is_input_batched:
-            objs = zip(*self._parse_rows(rows, cache, download_cb))
-        else:
-            objs = self._parse_rows([rows], cache, download_cb)[0]
-        result_objs = self.process_safe(objs)
-        if not self.is_output_batched:
-            result_objs = [result_objs]
-        # Generator expression is required, otherwise the value will be materialized
-        res = (self._flatten_row(row) for row in result_objs)
-        if not self.is_output_batched:
-            res = list(res)
-            assert (
-                len(res) == 1
-            ), f"{self.name} returns {len(res)} rows while it's not batched"
-            if isinstance(res[0], tuple):
-                res = res[0]
-        elif (
-            self.is_input_batched
-            and self.is_output_batched
-            and not self.is_input_grouped
-        ):
-            res = list(res)
-            assert len(res) == len(
-                rows
-            ), f"{self.name} returns {len(res)} rows while {len(rows)} expected"
-        return res
+    def run(
+        self,
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[Any]",
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        raise NotImplementedError
     def _flatten_row(self, row):
         if len(self.output.values) > 1 and not isinstance(row, BaseModel):
@@ -328,17 +238,28 @@ class UDFBase(AbstractUDF):
     def _obj_to_list(obj):
         return flatten(obj) if isinstance(obj, BaseModel) else [obj]
-    def _parse_rows(self, rows, cache, download_cb):
-        objs = []
-        for row in rows:
-            obj_row = self.params.row_to_objs(row)
-            for obj in obj_row:
-                if isinstance(obj, File):
-                    obj._set_stream(
-                        self.catalog, caching_enabled=cache, download_cb=download_cb
-                    )
-            objs.append(obj_row)
-        return objs
+    def _parse_row(
+        self, row_dict: RowDict, cache: bool, download_cb: Callback
+    ) -> list[DataValue]:
+        assert self.params
+        row = [row_dict[p] for p in self.params.to_udf_spec()]
+        obj_row = self.params.row_to_objs(row)
+        for obj in obj_row:
+            if isinstance(obj, File):
+                assert self.catalog is not None
+                obj._set_stream(
+                    self.catalog, caching_enabled=cache, download_cb=download_cb
+                )
+        return obj_row
+    def _prepare_row(self, row, udf_fields, cache, download_cb):
+        row_dict = RowDict(zip(udf_fields, row))
+        return self._parse_row(row_dict, cache, download_cb)
+    def _prepare_row_and_id(self, row, udf_fields, cache, download_cb):
+        row_dict = RowDict(zip(udf_fields, row))
+        udf_input = self._parse_row(row_dict, cache, download_cb)
+        return row_dict["sys__id"], *udf_input
     def process_safe(self, obj_rows):
         try:
@@ -358,23 +279,128 @@ class UDFBase(AbstractUDF):
 class Mapper(UDFBase):
     """Inherit from this class to pass to `DataChain.map()`."""
+    def run(
+        self,
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[Sequence[Any]]",
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.catalog = catalog
+        self.setup()
+        for row in udf_inputs:
+            id_, *udf_args = self._prepare_row_and_id(
+                row, udf_fields, cache, download_cb
+            )
+            result_objs = self.process_safe(udf_args)
+            udf_output = self._flatten_row(result_objs)
+            output = [{"sys__id": id_} | dict(zip(self.signal_names, udf_output))]
+            processed_cb.relative_update(1)
+            yield output
+        self.teardown()
 class BatchMapper(UDFBase):
     """Inherit from this class to pass to `DataChain.batch_map()`."""
-    is_input_batched = True
     is_output_batched = True
+    def run(
+        self,
+        udf_fields: Sequence[str],
+        udf_inputs: Iterable[RowsOutputBatch],
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.catalog = catalog
+        self.setup()
+        for batch in udf_inputs:
+            n_rows = len(batch.rows)
+            row_ids, *udf_args = zip(
+                *[
+                    self._prepare_row_and_id(row, udf_fields, cache, download_cb)
+                    for row in batch.rows
+                ]
+            )
+            result_objs = list(self.process_safe(udf_args))
+            n_objs = len(result_objs)
+            assert (
+                n_objs == n_rows
+            ), f"{self.name} returns {n_objs} rows, but {n_rows} were expected"
+            udf_outputs = (self._flatten_row(row) for row in result_objs)
+            output = [
+                {"sys__id": row_id} | dict(zip(self.signal_names, signals))
+                for row_id, signals in zip(row_ids, udf_outputs)
+            ]
+            processed_cb.relative_update(n_rows)
+            yield output
+        self.teardown()
 class Generator(UDFBase):
     """Inherit from this class to pass to `DataChain.gen()`."""
     is_output_batched = True
+    def run(
+        self,
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[Sequence[Any]]",
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.catalog = catalog
+        self.setup()
+        for row in udf_inputs:
+            udf_args = self._prepare_row(row, udf_fields, cache, download_cb)
+            result_objs = self.process_safe(udf_args)
+            udf_outputs = (self._flatten_row(row) for row in result_objs)
+            output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
+            processed_cb.relative_update(1)
+            yield output
+        self.teardown()
 class Aggregator(UDFBase):
     """Inherit from this class to pass to `DataChain.agg()`."""
-    is_input_batched = True
     is_output_batched = True
-    is_input_grouped = True
+    def run(
+        self,
+        udf_fields: "Sequence[str]",
+        udf_inputs: Iterable[RowsOutputBatch],
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.catalog = catalog
+        self.setup()
+        for batch in udf_inputs:
+            udf_args = zip(
+                *[
+                    self._prepare_row(row, udf_fields, cache, download_cb)
+                    for row in batch.rows
+                ]
+            )
+            result_objs = self.process_safe(udf_args)
+            udf_outputs = (self._flatten_row(row) for row in result_objs)
+            output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
+            processed_cb.relative_update(len(batch.rows))
+            yield output
+        self.teardown()

{datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/batch.py RENAMED Viewed

@@ -11,8 +11,6 @@ from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
 if TYPE_CHECKING:
     from sqlalchemy import Select
-    from datachain.dataset import RowDict
 @dataclass
 class RowsOutputBatch:
@@ -22,14 +20,6 @@ class RowsOutputBatch:
 RowsOutput = Union[Sequence, RowsOutputBatch]
-@dataclass
-class UDFInputBatch:
-    rows: Sequence["RowDict"]
-UDFInput = Union["RowDict", UDFInputBatch]
 class BatchingStrategy(ABC):
     """BatchingStrategy provides means of batching UDF executions."""

{datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/dataset.py RENAMED Viewed

@@ -392,7 +392,7 @@ class UDFStep(Step, ABC):
     def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
         use_partitioning = self.partition_by is not None
-        batching = self.udf.properties.get_batching(use_partitioning)
+        batching = self.udf.get_batching(use_partitioning)
         workers = self.workers
         if (
             not workers

{datachain-0.5.1 → datachain-0.6.0}/src/datachain/query/dispatch.py RENAMED Viewed

@@ -114,7 +114,6 @@ class UDFDispatcher:
     catalog: Optional[Catalog] = None
     task_queue: Optional[multiprocess.Queue] = None
     done_queue: Optional[multiprocess.Queue] = None
-    _batch_size: Optional[int] = None
     def __init__(
         self,
@@ -154,17 +153,6 @@ class UDFDispatcher:
         self.done_queue = None
         self.ctx = get_context("spawn")
-    @property
-    def batch_size(self):
-        if self._batch_size is None:
-            if hasattr(self.udf, "properties") and hasattr(
-                self.udf.properties, "batch"
-            ):
-                self._batch_size = self.udf.properties.batch
-            else:
-                self._batch_size = 1
-        return self._batch_size
     def _create_worker(self) -> "UDFWorker":
         if not self.catalog:
             id_generator = self.id_generator_class(

{datachain-0.5.1 → datachain-0.6.0}/src/datachain/sql/functions/string.py RENAMED Viewed

@@ -37,6 +37,18 @@ class regexp_replace(GenericFunction):  # noqa: N801
     inherit_cache = True
+class replace(GenericFunction):  # noqa: N801
+    """
+    Replaces substring with another string.
+    """
+    type = String()
+    package = "string"
+    name = "replace"
+    inherit_cache = True
 compiler_not_implemented(length)
 compiler_not_implemented(split)
 compiler_not_implemented(regexp_replace)
+compiler_not_implemented(replace)

datachain 0.5.1__tar.gz → 0.6.0__tar.gz

Potentially problematic release.

datachain 0.5.1__tar.gz → 0.6.0__tar.gz