datachain 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- datachain/__init__.py +2 -0
- datachain/catalog/catalog.py +1 -9
- datachain/data_storage/sqlite.py +8 -0
- datachain/data_storage/warehouse.py +0 -4
- datachain/lib/convert/sql_to_python.py +8 -12
- datachain/lib/convert/values_to_tuples.py +2 -2
- datachain/lib/data_model.py +1 -1
- datachain/lib/dc.py +82 -30
- datachain/lib/func/__init__.py +14 -0
- datachain/lib/func/aggregate.py +42 -0
- datachain/lib/func/func.py +64 -0
- datachain/lib/signal_schema.py +15 -9
- datachain/lib/udf.py +177 -151
- datachain/lib/utils.py +5 -0
- datachain/query/__init__.py +1 -2
- datachain/query/batch.py +0 -11
- datachain/query/dataset.py +23 -44
- datachain/query/dispatch.py +0 -12
- datachain/query/schema.py +1 -61
- datachain/query/session.py +33 -25
- datachain/sql/functions/__init__.py +1 -1
- datachain/sql/functions/aggregate.py +47 -0
- datachain/sql/functions/array.py +0 -8
- datachain/sql/functions/string.py +12 -0
- datachain/sql/sqlite/base.py +30 -7
- {datachain-0.5.1.dist-info → datachain-0.6.1.dist-info}/METADATA +2 -2
- {datachain-0.5.1.dist-info → datachain-0.6.1.dist-info}/RECORD +31 -27
- {datachain-0.5.1.dist-info → datachain-0.6.1.dist-info}/LICENSE +0 -0
- {datachain-0.5.1.dist-info → datachain-0.6.1.dist-info}/WHEEL +0 -0
- {datachain-0.5.1.dist-info → datachain-0.6.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.5.1.dist-info → datachain-0.6.1.dist-info}/top_level.txt +0 -0
datachain/lib/udf.py
CHANGED
@@ -1,14 +1,15 @@
 import sys
 import traceback
 from collections.abc import Iterable, Iterator, Mapping, Sequence
-from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Callable, Optional
 
+import attrs
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from pydantic import BaseModel
 
 from datachain.dataset import RowDict
 from datachain.lib.convert.flatten import flatten
+from datachain.lib.data_model import DataValue
 from datachain.lib.file import File
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
@@ -18,16 +19,14 @@ from datachain.query.batch import (
     NoBatching,
     Partition,
     RowsOutputBatch,
-    UDFInputBatch,
 )
-from datachain.query.schema import ColumnParameter, UDFParameter
 
 if TYPE_CHECKING:
     from typing_extensions import Self
 
     from datachain.catalog import Catalog
     from datachain.lib.udf_signature import UdfSignature
-    from datachain.query.batch import RowsOutput
+    from datachain.query.batch import RowsOutput
 
 
 class UdfError(DataChainParamsError):
@@ -45,11 +44,21 @@ UDFOutputSpec = Mapping[str, ColumnType]
 UDFResult = dict[str, Any]
 
 
-@dataclass
+@attrs.define
 class UDFProperties:
-
+    udf: "UDFAdapter"
 
-
+    def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
+        return self.udf.get_batching(use_partitioning)
+
+    @property
+    def batch(self):
+        return self.udf.batch
+
+
+@attrs.define(slots=False)
+class UDFAdapter:
+    inner: "UDFBase"
     output: UDFOutputSpec
     batch: int = 1
 
@@ -62,20 +71,10 @@ class UDFProperties:
             return Batch(self.batch)
         raise ValueError(f"invalid batch size {self.batch}")
 
-
-
-
-
-class UDFAdapter:
-    def __init__(
-        self,
-        inner: "UDFBase",
-        properties: UDFProperties,
-    ):
-        self.inner = inner
-        self.properties = properties
-        self.signal_names = properties.signal_names()
-        self.output = properties.output
+    @property
+    def properties(self):
+        # For backwards compatibility.
+        return UDFProperties(self)
 
     def run(
         self,
@@ -87,72 +86,14 @@ class UDFAdapter:
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterator[Iterable[UDFResult]]:
-        self.inner.
-
-
-
-
-
-
-
-                    [RowDict(zip(udf_fields, row)) for row in batch.rows]
-                )
-            else:
-                n_rows = 1
-                inputs = RowDict(zip(udf_fields, batch))
-            output = self.run_once(catalog, inputs, is_generator, cache, cb=download_cb)
-            processed_cb.relative_update(n_rows)
-            yield output
-
-        if hasattr(self.inner, "teardown") and callable(self.inner.teardown):
-            self.inner.teardown()
-
-    def run_once(
-        self,
-        catalog: "Catalog",
-        arg: "UDFInput",
-        is_generator: bool = False,
-        cache: bool = False,
-        cb: Callback = DEFAULT_CALLBACK,
-    ) -> Iterable[UDFResult]:
-        if isinstance(arg, UDFInputBatch):
-            udf_inputs = [
-                self.bind_parameters(catalog, row, cache=cache, cb=cb)
-                for row in arg.rows
-            ]
-            udf_outputs = self.inner.run_once(udf_inputs, cache=cache, download_cb=cb)
-            return self._process_results(arg.rows, udf_outputs, is_generator)
-        if isinstance(arg, RowDict):
-            udf_inputs = self.bind_parameters(catalog, arg, cache=cache, cb=cb)
-            udf_outputs = self.inner.run_once(udf_inputs, cache=cache, download_cb=cb)
-            if not is_generator:
-                # udf_outputs is generator already if is_generator=True
-                udf_outputs = [udf_outputs]
-            return self._process_results([arg], udf_outputs, is_generator)
-        raise ValueError(f"Unexpected UDF argument: {arg}")
-
-    def bind_parameters(self, catalog: "Catalog", row: "RowDict", **kwargs) -> list:
-        return [p.get_value(catalog, row, **kwargs) for p in self.properties.params]
-
-    def _process_results(
-        self,
-        rows: Sequence["RowDict"],
-        results: Sequence[Sequence[Any]],
-        is_generator=False,
-    ) -> Iterable[UDFResult]:
-        """Create a list of dictionaries representing UDF results."""
-
-        # outputting rows
-        if is_generator:
-            # each row in results is a tuple of column values
-            return (dict(zip(self.signal_names, row)) for row in results)
-
-        # outputting signals
-        row_ids = [row["sys__id"] for row in rows]
-        return [
-            {"sys__id": row_id} | dict(zip(self.signal_names, signals))
-            for row_id, signals in zip(row_ids, results)
-        ]
+        yield from self.inner.run(
+            udf_fields,
+            udf_inputs,
+            catalog,
+            cache,
+            download_cb,
+            processed_cb,
+        )
 
 
 class UDFBase(AbstractUDF):
@@ -203,17 +144,12 @@ class UDFBase(AbstractUDF):
     ```
     """
 
-    is_input_batched = False
     is_output_batched = False
-    is_input_grouped = False
-    params_spec: Optional[list[str]]
     catalog: "Optional[Catalog]"
 
     def __init__(self):
-        self.params = None
+        self.params: Optional[SignalSchema] = None
        self.output = None
-        self.params_spec = None
-        self.output_spec = None
        self.catalog = None
        self._func = None
 
@@ -241,11 +177,6 @@ class UDFBase(AbstractUDF):
     ):
         self.params = params
         self.output = sign.output_schema
-
-        params_spec = self.params.to_udf_spec()
-        self.params_spec = list(params_spec.keys())
-        self.output_spec = self.output.to_udf_spec()
-
         self._func = func
 
     @classmethod
@@ -273,48 +204,27 @@ class UDFBase(AbstractUDF):
     def name(self):
         return self.__class__.__name__
 
+    @property
+    def signal_names(self) -> Iterable[str]:
+        return self.output.to_udf_spec().keys()
+
     def to_udf_wrapper(self, batch: int = 1) -> UDFAdapter:
-
-
-
+        return UDFAdapter(
+            self,
+            self.output.to_udf_spec(),
+            batch,
         )
-        return UDFAdapter(self, properties)
-
-    def validate_results(self, results, *args, **kwargs):
-        return results
 
-    def
-
-
-
-
-
-
-
-
-
-
-        # Generator expression is required, otherwise the value will be materialized
-        res = (self._flatten_row(row) for row in result_objs)
-
-        if not self.is_output_batched:
-            res = list(res)
-            assert (
-                len(res) == 1
-            ), f"{self.name} returns {len(res)} rows while it's not batched"
-            if isinstance(res[0], tuple):
-                res = res[0]
-        elif (
-            self.is_input_batched
-            and self.is_output_batched
-            and not self.is_input_grouped
-        ):
-            res = list(res)
-            assert len(res) == len(
-                rows
-            ), f"{self.name} returns {len(res)} rows while {len(rows)} expected"
-
-        return res
+    def run(
+        self,
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[Any]",
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        raise NotImplementedError
 
     def _flatten_row(self, row):
         if len(self.output.values) > 1 and not isinstance(row, BaseModel):
@@ -328,17 +238,28 @@ class UDFBase(AbstractUDF):
     def _obj_to_list(obj):
         return flatten(obj) if isinstance(obj, BaseModel) else [obj]
 
-    def
-
-
-
-
-
-
-
-
-
-
+    def _parse_row(
+        self, row_dict: RowDict, cache: bool, download_cb: Callback
+    ) -> list[DataValue]:
+        assert self.params
+        row = [row_dict[p] for p in self.params.to_udf_spec()]
+        obj_row = self.params.row_to_objs(row)
+        for obj in obj_row:
+            if isinstance(obj, File):
+                assert self.catalog is not None
+                obj._set_stream(
+                    self.catalog, caching_enabled=cache, download_cb=download_cb
+                )
+        return obj_row
+
+    def _prepare_row(self, row, udf_fields, cache, download_cb):
+        row_dict = RowDict(zip(udf_fields, row))
+        return self._parse_row(row_dict, cache, download_cb)
+
+    def _prepare_row_and_id(self, row, udf_fields, cache, download_cb):
+        row_dict = RowDict(zip(udf_fields, row))
+        udf_input = self._parse_row(row_dict, cache, download_cb)
+        return row_dict["sys__id"], *udf_input
 
     def process_safe(self, obj_rows):
         try:
@@ -358,23 +279,128 @@ class UDFBase(AbstractUDF):
 class Mapper(UDFBase):
     """Inherit from this class to pass to `DataChain.map()`."""
 
+    def run(
+        self,
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[Sequence[Any]]",
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.catalog = catalog
+        self.setup()
+
+        for row in udf_inputs:
+            id_, *udf_args = self._prepare_row_and_id(
+                row, udf_fields, cache, download_cb
+            )
+            result_objs = self.process_safe(udf_args)
+            udf_output = self._flatten_row(result_objs)
+            output = [{"sys__id": id_} | dict(zip(self.signal_names, udf_output))]
+            processed_cb.relative_update(1)
+            yield output
+
+        self.teardown()
+
 
 
 class BatchMapper(UDFBase):
     """Inherit from this class to pass to `DataChain.batch_map()`."""
-    is_input_batched = True
     is_output_batched = True
 
+    def run(
+        self,
+        udf_fields: Sequence[str],
+        udf_inputs: Iterable[RowsOutputBatch],
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.catalog = catalog
+        self.setup()
+
+        for batch in udf_inputs:
+            n_rows = len(batch.rows)
+            row_ids, *udf_args = zip(
+                *[
+                    self._prepare_row_and_id(row, udf_fields, cache, download_cb)
+                    for row in batch.rows
+                ]
+            )
+            result_objs = list(self.process_safe(udf_args))
+            n_objs = len(result_objs)
+            assert (
+                n_objs == n_rows
+            ), f"{self.name} returns {n_objs} rows, but {n_rows} were expected"
+            udf_outputs = (self._flatten_row(row) for row in result_objs)
+            output = [
+                {"sys__id": row_id} | dict(zip(self.signal_names, signals))
+                for row_id, signals in zip(row_ids, udf_outputs)
+            ]
+            processed_cb.relative_update(n_rows)
+            yield output
+
+        self.teardown()
+
 
 class Generator(UDFBase):
     """Inherit from this class to pass to `DataChain.gen()`."""
 
     is_output_batched = True
 
+    def run(
+        self,
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[Sequence[Any]]",
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.catalog = catalog
+        self.setup()
+
+        for row in udf_inputs:
+            udf_args = self._prepare_row(row, udf_fields, cache, download_cb)
+            result_objs = self.process_safe(udf_args)
+            udf_outputs = (self._flatten_row(row) for row in result_objs)
+            output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
+            processed_cb.relative_update(1)
+            yield output
+
+        self.teardown()
+
 
 class Aggregator(UDFBase):
     """Inherit from this class to pass to `DataChain.agg()`."""
 
-    is_input_batched = True
     is_output_batched = True
-
+
+    def run(
+        self,
+        udf_fields: "Sequence[str]",
+        udf_inputs: Iterable[RowsOutputBatch],
+        catalog: "Catalog",
+        cache: bool,
+        download_cb: Callback = DEFAULT_CALLBACK,
+        processed_cb: Callback = DEFAULT_CALLBACK,
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.catalog = catalog
+        self.setup()
+
+        for batch in udf_inputs:
+            udf_args = zip(
+                *[
+                    self._prepare_row(row, udf_fields, cache, download_cb)
+                    for row in batch.rows
+                ]
+            )
+            result_objs = self.process_safe(udf_args)
+            udf_outputs = (self._flatten_row(row) for row in result_objs)
+            output = (dict(zip(self.signal_names, row)) for row in udf_outputs)
+            processed_cb.relative_update(len(batch.rows))
+            yield output
+
+        self.teardown()
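The practical upshot of the udf.py rewrite is that UDFAdapter no longer performs batching, parameter binding, or result assembly itself: its run() is a thin shim that forwards to the wrapped UDF's own run() generator, and each flavor (Mapper, BatchMapper, Generator, Aggregator) now owns its loop, setup/teardown, and output shaping. A standalone sketch of that delegation pattern, using made-up minimal classes rather than datachain's real ones:

```python
# Simplified, self-contained sketch (hypothetical names, not datachain's API)
# of the adapter-delegates-to-UDF pattern introduced above.
from collections.abc import Iterable, Iterator


class MiniMapper:
    signal_names = ("value",)

    def process(self, x: int) -> int:
        # user-defined per-row logic, analogous to a Mapper subclass
        return x * 2

    def run(self, rows: Iterable[tuple[int, int]]) -> Iterator[list[dict]]:
        # the UDF owns its own loop and output shaping
        for row_id, x in rows:
            yield [{"sys__id": row_id, "value": self.process(x)}]


class MiniAdapter:
    def __init__(self, inner: MiniMapper):
        self.inner = inner

    def run(self, rows: Iterable[tuple[int, int]]) -> Iterator[list[dict]]:
        # pure delegation, mirroring the new UDFAdapter.run above
        yield from self.inner.run(rows)


if __name__ == "__main__":
    for out in MiniAdapter(MiniMapper()).run([(1, 10), (2, 20)]):
        print(out)  # [{'sys__id': 1, 'value': 20}] then [{'sys__id': 2, 'value': 40}]
```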
datachain/lib/utils.py
CHANGED
@@ -23,3 +23,8 @@ class DataChainError(Exception):
 class DataChainParamsError(DataChainError):
     def __init__(self, message):
         super().__init__(message)
+
+
+class DataChainColumnError(DataChainParamsError):
+    def __init__(self, col_name, msg):
+        super().__init__(f"Error for column {col_name}: {msg}")
datachain/query/__init__.py
CHANGED
@@ -1,12 +1,11 @@
 from .dataset import DatasetQuery
 from .params import param
-from .schema import C,
+from .schema import C, LocalFilename, Object, Stream
 from .session import Session
 
 __all__ = [
     "C",
     "DatasetQuery",
-    "DatasetRow",
     "LocalFilename",
     "Object",
     "Session",
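For callers, the visible effect is that DatasetRow is no longer re-exported from datachain.query; the other names shown in the hunk keep importing as before:

```python
# Still valid after this release (DatasetRow has been dropped from __all__ here).
from datachain.query import C, DatasetQuery, LocalFilename, Object, Session
```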
datachain/query/batch.py
CHANGED
@@ -11,8 +11,6 @@ from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
 if TYPE_CHECKING:
     from sqlalchemy import Select
 
-    from datachain.dataset import RowDict
-
 
 @dataclass
 class RowsOutputBatch:
@@ -22,14 +20,6 @@ class RowsOutputBatch:
 RowsOutput = Union[Sequence, RowsOutputBatch]
 
 
-@dataclass
-class UDFInputBatch:
-    rows: Sequence["RowDict"]
-
-
-UDFInput = Union["RowDict", UDFInputBatch]
-
-
 class BatchingStrategy(ABC):
     """BatchingStrategy provides means of batching UDF executions."""
 
@@ -107,7 +97,6 @@ class Partition(BatchingStrategy):
 
         ordered_query = query.order_by(None).order_by(
            PARTITION_COLUMN_ID,
-            "sys__id",
            *query._order_by_clauses,
        )
 
datachain/query/dataset.py
CHANGED
@@ -392,7 +392,7 @@ class UDFStep(Step, ABC):
 
     def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
         use_partitioning = self.partition_by is not None
-        batching = self.udf.
+        batching = self.udf.get_batching(use_partitioning)
         workers = self.workers
         if (
             not workers
@@ -591,10 +591,6 @@ class UDFSignal(UDFStep):
             return query, []
         table = self.catalog.warehouse.create_pre_udf_table(query)
         q: Select = sqlalchemy.select(*table.c)
-        if query._order_by_clauses:
-            # we are adding ordering only if it's explicitly added by user in
-            # query part before adding signals
-            q = q.order_by(table.c.sys__id)
         return q, [table]
 
     def create_result_query(
@@ -630,11 +626,6 @@
         else:
             res = sqlalchemy.select(*cols1).select_from(subq)
 
-        if query._order_by_clauses:
-            # if ordering is used in query part before adding signals, we
-            # will have it as order by id from select from pre-created udf table
-            res = res.order_by(subq.c.sys__id)
-
         if self.partition_by is not None:
             subquery = res.subquery()
             res = sqlalchemy.select(*subquery.c).select_from(subquery)
@@ -666,13 +657,6 @@ class RowGenerator(UDFStep):
     def create_result_query(
         self, udf_table, query: Select
     ) -> tuple[QueryGeneratorFunc, list["sqlalchemy.Column"]]:
-        if not query._order_by_clauses:
-            # if we are not selecting all rows in UDF, we need to ensure that
-            # we get the same rows as we got as inputs of UDF since selecting
-            # without ordering can be non deterministic in some databases
-            c = query.selected_columns
-            query = query.order_by(c.sys__id)
-
         udf_table_query = udf_table.select().subquery()
         udf_table_cols: list[sqlalchemy.Label[Any]] = [
             label(c.name, c) for c in udf_table_query.columns
@@ -957,24 +941,24 @@ class SQLJoin(Step):
 
 
 @frozen
-class
-
-
-    cols: PartitionByType
+class SQLGroupBy(SQLClause):
+    cols: Sequence[Union[str, ColumnElement]]
+    group_by: Sequence[Union[str, ColumnElement]]
 
-    def
-
+    def apply_sql_clause(self, query) -> Select:
+        if not self.cols:
+            raise ValueError("No columns to select")
+        if not self.group_by:
+            raise ValueError("No columns to group by")
 
-
-        self, query_generator: QueryGenerator, temp_tables: list[str]
-    ) -> StepResult:
-        query = query_generator.select()
-        grouped_query = query.group_by(*self.cols)
+        subquery = query.subquery()
 
-
-
+        cols = [
+            subquery.c[str(c)] if isinstance(c, (str, C)) else c
+            for c in [*self.group_by, *self.cols]
+        ]
 
-        return
+        return sqlalchemy.select(*cols).select_from(subquery).group_by(*self.group_by)
 
 
 def _validate_columns(
@@ -1130,25 +1114,14 @@ class DatasetQuery:
         query.steps = query.steps[-1:] + query.steps[:-1]
 
         result = query.starting_step.apply()
-        group_by = None
         self.dependencies.update(result.dependencies)
 
         for step in query.steps:
-            if isinstance(step, GroupBy):
-                if group_by is not None:
-                    raise TypeError("only one group_by allowed")
-                group_by = step
-                continue
-
             result = step.apply(
                 result.query_generator, self.temp_table_names
             )  # a chain of steps linked by results
             self.dependencies.update(result.dependencies)
 
-        if group_by:
-            result = group_by.apply(result.query_generator, self.temp_table_names)
-            self.dependencies.update(result.dependencies)
-
         return result.query_generator
 
     @staticmethod
@@ -1410,9 +1383,13 @@ class DatasetQuery:
         return query.as_scalar()
 
     @detach
-    def group_by(
+    def group_by(
+        self,
+        cols: Sequence[ColumnElement],
+        group_by: Sequence[ColumnElement],
+    ) -> "Self":
         query = self.clone()
-        query.steps.append(
+        query.steps.append(SQLGroupBy(cols, group_by))
         return query
 
     @detach
@@ -1591,6 +1568,8 @@ class DatasetQuery:
         )
         version = version or dataset.latest_version
 
+        self.session.add_dataset_version(dataset=dataset, version=version)
+
         dr = self.catalog.warehouse.dataset_rows(dataset)
 
         self.catalog.warehouse.copy_table(dr.get_table(), query.select())
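For reference, the SQL that the new SQLGroupBy.apply_sql_clause builds is a plain "wrap the current query in a subquery, then select the grouping columns plus aggregates from it with a GROUP BY". A standalone SQLAlchemy sketch of the same shape, with a made-up table so it runs on its own:

```python
import sqlalchemy as sa

metadata = sa.MetaData()
files = sa.Table(
    "files",
    metadata,
    sa.Column("dir", sa.String),
    sa.Column("size", sa.Integer),
)

query = sa.select(files.c.dir, files.c.size)  # the query built before group_by()
subquery = query.subquery()                   # mirrors apply_sql_clause()
grouped = (
    sa.select(subquery.c.dir, sa.func.sum(subquery.c.size).label("total"))
    .select_from(subquery)
    .group_by(subquery.c.dir)
)
print(grouped)  # SELECT ... FROM (SELECT ...) AS anon_1 GROUP BY anon_1.dir
```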
datachain/query/dispatch.py
CHANGED
@@ -114,7 +114,6 @@ class UDFDispatcher:
     catalog: Optional[Catalog] = None
     task_queue: Optional[multiprocess.Queue] = None
     done_queue: Optional[multiprocess.Queue] = None
-    _batch_size: Optional[int] = None
 
     def __init__(
         self,
@@ -154,17 +153,6 @@ class UDFDispatcher:
         self.done_queue = None
         self.ctx = get_context("spawn")
 
-    @property
-    def batch_size(self):
-        if self._batch_size is None:
-            if hasattr(self.udf, "properties") and hasattr(
-                self.udf.properties, "batch"
-            ):
-                self._batch_size = self.udf.properties.batch
-            else:
-                self._batch_size = 1
-        return self._batch_size
-
     def _create_worker(self) -> "UDFWorker":
         if not self.catalog:
             id_generator = self.id_generator_class(
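The dispatcher drops its cached batch_size because the batch size now lives directly on the adapter, while the old udf.properties.batch path keeps resolving through the backwards-compatibility wrapper added in udf.py. A minimal attrs sketch of that shim, with hypothetical class names:

```python
import attrs


@attrs.define(slots=False)
class Adapter:
    batch: int = 1

    @property
    def properties(self) -> "Properties":
        # backwards-compatibility shim, as in the new UDFAdapter
        return Properties(self)


@attrs.define
class Properties:
    udf: "Adapter"

    @property
    def batch(self) -> int:
        return self.udf.batch


adapter = Adapter(batch=10)
assert adapter.batch == adapter.properties.batch == 10
```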