datachain 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/catalog/catalog.py +8 -0
- datachain/cli.py +3 -2
- datachain/data_storage/metastore.py +28 -9
- datachain/data_storage/sqlite.py +24 -32
- datachain/data_storage/warehouse.py +1 -3
- datachain/dataset.py +0 -3
- datachain/lib/arrow.py +64 -19
- datachain/lib/dc.py +310 -123
- datachain/lib/listing.py +5 -3
- datachain/lib/pytorch.py +5 -1
- datachain/lib/udf.py +100 -78
- datachain/lib/udf_signature.py +8 -6
- datachain/query/dataset.py +7 -7
- datachain/query/dispatch.py +2 -2
- datachain/query/session.py +42 -0
- {datachain-0.4.0.dist-info → datachain-0.5.1.dist-info}/METADATA +1 -1
- {datachain-0.4.0.dist-info → datachain-0.5.1.dist-info}/RECORD +21 -22
- datachain/query/udf.py +0 -126
- {datachain-0.4.0.dist-info → datachain-0.5.1.dist-info}/LICENSE +0 -0
- {datachain-0.4.0.dist-info → datachain-0.5.1.dist-info}/WHEEL +0 -0
- {datachain-0.4.0.dist-info → datachain-0.5.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.4.0.dist-info → datachain-0.5.1.dist-info}/top_level.txt +0 -0
datachain/lib/listing.py
CHANGED
@@ -1,7 +1,7 @@
 import posixpath
 from collections.abc import Iterator
 from datetime import datetime, timedelta, timezone
-from typing import TYPE_CHECKING, Callable, Optional
+from typing import TYPE_CHECKING, Callable, Optional, TypeVar

 from fsspec.asyn import get_loop
 from sqlalchemy.sql.expression import true
@@ -20,6 +20,8 @@ if TYPE_CHECKING:
 LISTING_TTL = 4 * 60 * 60  # cached listing lasts 4 hours
 LISTING_PREFIX = "lst__"  # listing datasets start with this name

+D = TypeVar("D", bound="DataChain")
+

 def list_bucket(uri: str, cache, client_config=None) -> Callable:
     """
@@ -38,11 +40,11 @@ def list_bucket(uri: str, cache, client_config=None) -> Callable:


 def ls(
-    dc: "DataChain",
+    dc: D,
     path: str,
     recursive: Optional[bool] = True,
     object_name="file",
-):
+) -> D:
     """
     Return files by some path from DataChain instance which contains bucket listing.
     Path can have globs.
datachain/lib/pytorch.py
CHANGED
@@ -9,6 +9,7 @@ from torch.utils.data import IterableDataset, get_worker_info
 from torchvision.transforms import v2
 from tqdm import tqdm

+from datachain import Session
 from datachain.catalog import Catalog, get_catalog
 from datachain.lib.dc import DataChain
 from datachain.lib.text import convert_text
@@ -87,8 +88,11 @@ class PytorchDataset(IterableDataset):
     def __iter__(self) -> Iterator[Any]:
         if self.catalog is None:
             self.catalog = self._get_catalog()
+        session = Session.get(catalog=self.catalog)
         total_rank, total_workers = self.get_rank_and_workers()
-        ds = DataChain(…
+        ds = DataChain.from_dataset(
+            name=self.name, version=self.version, session=session
+        )
         ds = ds.remove_file_signals()

         if self.num_samples > 0:
datachain/lib/udf.py
CHANGED
@@ -1,31 +1,33 @@
 import sys
 import traceback
-from …
+from collections.abc import Iterable, Iterator, Mapping, Sequence
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Callable, Optional

 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from pydantic import BaseModel

 from datachain.dataset import RowDict
 from datachain.lib.convert.flatten import flatten
-from datachain.lib.convert.unflatten import unflatten_to_json
 from datachain.lib.file import File
-from datachain.lib.model_store import ModelStore
 from datachain.lib.signal_schema import SignalSchema
-from datachain.lib.udf_signature import UdfSignature
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
-from datachain.query.batch import …
+from datachain.query.batch import (
+    Batch,
+    BatchingStrategy,
+    NoBatching,
+    Partition,
+    RowsOutputBatch,
+    UDFInputBatch,
+)
+from datachain.query.schema import ColumnParameter, UDFParameter

 if TYPE_CHECKING:
-    from collections.abc import Iterable, Iterator, Sequence
-
     from typing_extensions import Self

     from datachain.catalog import Catalog
+    from datachain.lib.udf_signature import UdfSignature
     from datachain.query.batch import RowsOutput, UDFInput
-    from datachain.query.udf import UDFResult


 class UdfError(DataChainParamsError):
@@ -33,14 +35,47 @@ class UdfError(DataChainParamsError):
         super().__init__(f"UDF error: {msg}")


-class UDFAdapter(_UDFBase):
+ColumnType = Any
+
+# Specification for the output of a UDF
+UDFOutputSpec = Mapping[str, ColumnType]
+
+# Result type when calling the UDF wrapper around the actual
+# Python function / class implementing it.
+UDFResult = dict[str, Any]
+
+
+@dataclass
+class UDFProperties:
+    """Container for basic UDF properties."""
+
+    params: list[UDFParameter]
+    output: UDFOutputSpec
+    batch: int = 1
+
+    def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
+        if use_partitioning:
+            return Partition()
+        if self.batch == 1:
+            return NoBatching()
+        if self.batch > 1:
+            return Batch(self.batch)
+        raise ValueError(f"invalid batch size {self.batch}")
+
+    def signal_names(self) -> Iterable[str]:
+        return self.output.keys()
+
+
+class UDFAdapter:
     def __init__(
         self,
         inner: "UDFBase",
         properties: UDFProperties,
     ):
         self.inner = inner
-        …
+        self.properties = properties
+        self.signal_names = properties.signal_names()
+        self.output = properties.output

     def run(
         self,
@@ -51,20 +86,23 @@ class UDFAdapter(_UDFBase):
         cache: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
-    ) -> …
-        self.inner.…
+    ) -> Iterator[Iterable[UDFResult]]:
+        self.inner.catalog = catalog
         if hasattr(self.inner, "setup") and callable(self.inner.setup):
             self.inner.setup()

-        …
+        for batch in udf_inputs:
+            if isinstance(batch, RowsOutputBatch):
+                n_rows = len(batch.rows)
+                inputs: UDFInput = UDFInputBatch(
+                    [RowDict(zip(udf_fields, row)) for row in batch.rows]
+                )
+            else:
+                n_rows = 1
+                inputs = RowDict(zip(udf_fields, batch))
+            output = self.run_once(catalog, inputs, is_generator, cache, cb=download_cb)
+            processed_cb.relative_update(n_rows)
+            yield output

         if hasattr(self.inner, "teardown") and callable(self.inner.teardown):
             self.inner.teardown()
@@ -76,23 +114,46 @@ class UDFAdapter(_UDFBase):
         is_generator: bool = False,
         cache: bool = False,
         cb: Callback = DEFAULT_CALLBACK,
-    ) -> …
+    ) -> Iterable[UDFResult]:
         if isinstance(arg, UDFInputBatch):
             udf_inputs = [
                 self.bind_parameters(catalog, row, cache=cache, cb=cb)
                 for row in arg.rows
             ]
-            udf_outputs = self.inner(udf_inputs, cache=cache, download_cb=cb)
+            udf_outputs = self.inner.run_once(udf_inputs, cache=cache, download_cb=cb)
             return self._process_results(arg.rows, udf_outputs, is_generator)
         if isinstance(arg, RowDict):
             udf_inputs = self.bind_parameters(catalog, arg, cache=cache, cb=cb)
-            udf_outputs = self.inner(…
+            udf_outputs = self.inner.run_once(udf_inputs, cache=cache, download_cb=cb)
             if not is_generator:
                 # udf_outputs is generator already if is_generator=True
                 udf_outputs = [udf_outputs]
             return self._process_results([arg], udf_outputs, is_generator)
         raise ValueError(f"Unexpected UDF argument: {arg}")

+    def bind_parameters(self, catalog: "Catalog", row: "RowDict", **kwargs) -> list:
+        return [p.get_value(catalog, row, **kwargs) for p in self.properties.params]
+
+    def _process_results(
+        self,
+        rows: Sequence["RowDict"],
+        results: Sequence[Sequence[Any]],
+        is_generator=False,
+    ) -> Iterable[UDFResult]:
+        """Create a list of dictionaries representing UDF results."""
+
+        # outputting rows
+        if is_generator:
+            # each row in results is a tuple of column values
+            return (dict(zip(self.signal_names, row)) for row in results)
+
+        # outputting signals
+        row_ids = [row["sys__id"] for row in rows]
+        return [
+            {"sys__id": row_id} | dict(zip(self.signal_names, signals))
+            for row_id, signals in zip(row_ids, results)
+        ]
+

 class UDFBase(AbstractUDF):
     """Base class for stateful user-defined functions.
@@ -146,14 +207,14 @@ class UDFBase(AbstractUDF):
     is_output_batched = False
     is_input_grouped = False
     params_spec: Optional[list[str]]
+    catalog: "Optional[Catalog]"

     def __init__(self):
         self.params = None
         self.output = None
         self.params_spec = None
         self.output_spec = None
-        self.…
-        self._catalog = None
+        self.catalog = None
         self._func = None

     def process(self, *args, **kwargs):
@@ -174,9 +235,9 @@ class UDFBase(AbstractUDF):

     def _init(
         self,
-        sign: UdfSignature,
+        sign: "UdfSignature",
         params: SignalSchema,
-        func: Callable,
+        func: Optional[Callable],
     ):
         self.params = params
         self.output = sign.output_schema
@@ -190,13 +251,13 @@ class UDFBase(AbstractUDF):
     @classmethod
     def _create(
         cls,
-        sign: UdfSignature,
+        sign: "UdfSignature",
         params: SignalSchema,
     ) -> "Self":
         if isinstance(sign.func, AbstractUDF):
             if not isinstance(sign.func, cls):  # type: ignore[unreachable]
                 raise UdfError(
-                    f"cannot create UDF: provided UDF '{sign.func.__name__}'"
+                    f"cannot create UDF: provided UDF '{type(sign.func).__name__}'"
                     f" must be a child of target class '{cls.__name__}'",
                 )
             result = sign.func
@@ -212,13 +273,6 @@ class UDFBase(AbstractUDF):
     def name(self):
         return self.__class__.__name__

-    def set_catalog(self, catalog):
-        self._catalog = catalog.copy(db=False)
-
-    @property
-    def catalog(self):
-        return self._catalog
-
     def to_udf_wrapper(self, batch: int = 1) -> UDFAdapter:
         assert self.params_spec is not None
         properties = UDFProperties(
@@ -229,11 +283,9 @@ class UDFBase(AbstractUDF):
     def validate_results(self, results, *args, **kwargs):
         return results

-    def …
-        if self.…
-            objs = self.…
-        elif self.is_input_batched:
-            objs = zip(*self._parse_rows(rows[0], cache, download_cb))
+    def run_once(self, rows, cache, download_cb):
+        if self.is_input_batched:
+            objs = zip(*self._parse_rows(rows, cache, download_cb))
         else:
             objs = self._parse_rows([rows], cache, download_cb)[0]

@@ -259,8 +311,8 @@ class UDFBase(AbstractUDF):
         ):
             res = list(res)
             assert len(res) == len(
-                rows
-            ), f"{self.name} returns {len(res)} rows while len(rows…
+                rows
+            ), f"{self.name} returns {len(res)} rows while {len(rows)} expected"

         return res

@@ -283,41 +335,11 @@ class UDFBase(AbstractUDF):
             for obj in obj_row:
                 if isinstance(obj, File):
                     obj._set_stream(
-                        self.…
+                        self.catalog, caching_enabled=cache, download_cb=download_cb
                     )
             objs.append(obj_row)
         return objs

-    def _parse_grouped_rows(self, group, cache, download_cb):
-        spec_map = {}
-        output_map = {}
-        for name, (anno, subtree) in self.params.tree.items():
-            if ModelStore.is_pydantic(anno):
-                length = sum(1 for _ in self.params._get_flat_tree(subtree, [], 0))
-            else:
-                length = 1
-            spec_map[name] = anno, length
-            output_map[name] = []
-
-        for flat_obj in group:
-            position = 0
-            for signal, (cls, length) in spec_map.items():
-                slice = flat_obj[position : position + length]
-                position += length
-
-                if ModelStore.is_pydantic(cls):
-                    obj = cls(**unflatten_to_json(cls, slice))
-                else:
-                    obj = slice[0]
-
-                if isinstance(obj, File):
-                    obj._set_stream(
-                        self._catalog, caching_enabled=cache, download_cb=download_cb
-                    )
-                output_map[signal].append(obj)
-
-        return list(output_map.values())
-
     def process_safe(self, obj_rows):
         try:
             result_objs = self.process(*obj_rows)
datachain/lib/udf_signature.py
CHANGED
@@ -1,10 +1,11 @@
 import inspect
 from collections.abc import Generator, Iterator, Sequence
 from dataclasses import dataclass
-from typing import Callable, …
+from typing import Callable, Union, get_args, get_origin

 from datachain.lib.data_model import DataType, DataTypeNames, is_chain_type
 from datachain.lib.signal_schema import SignalSchema
+from datachain.lib.udf import UDFBase
 from datachain.lib.utils import AbstractUDF, DataChainParamsError


@@ -16,7 +17,7 @@ class UdfSignatureError(DataChainParamsError):

 @dataclass
 class UdfSignature:
-    func: Callable
+    func: Union[Callable, UDFBase]
     params: Sequence[str]
     output_schema: SignalSchema

@@ -27,7 +28,7 @@ class UdfSignature:
         cls,
         chain: str,
         signal_map: dict[str, Callable],
-        func: …
+        func: Union[None, UDFBase, Callable] = None,
         params: Union[None, str, Sequence[str]] = None,
         output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
         is_generator: bool = True,
@@ -39,6 +40,7 @@ class UdfSignature:
                 f"multiple signals '{keys}' are not supported in processors."
                 " Chain multiple processors instead.",
             )
+        udf_func: Union[UDFBase, Callable]
         if len(signal_map) == 1:
             if func is not None:
                 raise UdfSignatureError(
@@ -53,7 +55,7 @@ class UdfSignature:
             udf_func = func
             signal_name = None

-        if not callable(udf_func):
+        if not isinstance(udf_func, UDFBase) and not callable(udf_func):
             raise UdfSignatureError(chain, f"UDF '{udf_func}' is not callable")

         func_params_map_sign, func_outs_sign, is_iterator = (
@@ -73,7 +75,7 @@ class UdfSignature:
         if not func_outs_sign:
             raise UdfSignatureError(
                 chain,
-                f"outputs are not defined in function '{udf_func…
+                f"outputs are not defined in function '{udf_func}'"
                 " hints or 'output'",
             )

@@ -154,7 +156,7 @@ class UdfSignature:

     @staticmethod
     def _func_signature(
-        chain: str, udf_func: Callable
+        chain: str, udf_func: Union[Callable, UDFBase]
     ) -> tuple[dict[str, type], Sequence[type], bool]:
         if isinstance(udf_func, AbstractUDF):
             func = udf_func.process  # type: ignore[unreachable]
datachain/query/dataset.py
CHANGED
@@ -42,6 +42,7 @@ from datachain.data_storage.schema import (
 )
 from datachain.dataset import DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
+from datachain.lib.udf import UDFAdapter
 from datachain.progress import CombinedDownloadCallback
 from datachain.sql.functions import rand
 from datachain.utils import (
@@ -53,7 +54,6 @@ from datachain.utils import (

 from .schema import C, UDFParamSpec, normalize_param
 from .session import Session
-from .udf import UDFBase

 if TYPE_CHECKING:
     from sqlalchemy.sql.elements import ClauseElement
@@ -299,7 +299,7 @@ def adjust_outputs(
     return row


-def get_udf_col_types(warehouse: "AbstractWarehouse", udf: …) -> list[tuple]:
+def get_udf_col_types(warehouse: "AbstractWarehouse", udf: UDFAdapter) -> list[tuple]:
     """Optimization: Precompute UDF column types so these don't have to be computed
     in the convert_type function for each row in a loop."""
     dialect = warehouse.db.dialect
@@ -320,7 +320,7 @@ def process_udf_outputs(
     warehouse: "AbstractWarehouse",
     udf_table: "Table",
     udf_results: Iterator[Iterable["UDFResult"]],
-    udf: …
+    udf: UDFAdapter,
     batch_size: int = INSERT_BATCH_SIZE,
     cb: Callback = DEFAULT_CALLBACK,
 ) -> None:
@@ -364,7 +364,7 @@ def get_generated_callback(is_generator: bool = False) -> Callback:

 @frozen
 class UDFStep(Step, ABC):
-    udf: …
+    udf: UDFAdapter
     catalog: "Catalog"
     partition_by: Optional[PartitionByType] = None
     parallel: Optional[int] = None
@@ -1037,7 +1037,7 @@ class DatasetQuery:
         session: Optional[Session] = None,
         indexing_column_types: Optional[dict[str, Any]] = None,
         in_memory: bool = False,
-    ):
+    ) -> None:
         self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
         self.catalog = catalog or self.session.catalog
         self.steps: list[Step] = []
@@ -1465,7 +1465,7 @@ class DatasetQuery:
     @detach
     def add_signals(
         self,
-        udf: …
+        udf: UDFAdapter,
         parallel: Optional[int] = None,
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
@@ -1509,7 +1509,7 @@ class DatasetQuery:
     @detach
     def generate(
         self,
-        udf: …
+        udf: UDFAdapter,
         parallel: Optional[int] = None,
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
datachain/query/dispatch.py
CHANGED
@@ -13,6 +13,7 @@ from multiprocess import get_context

 from datachain.catalog import Catalog
 from datachain.catalog.loader import get_distributed_class
+from datachain.lib.udf import UDFAdapter, UDFResult
 from datachain.query.dataset import (
     get_download_callback,
     get_generated_callback,
@@ -27,7 +28,6 @@ from datachain.query.queue import (
     put_into_queue,
     unmarshal,
 )
-from datachain.query.udf import UDFBase, UDFResult
 from datachain.utils import batched_it

 DEFAULT_BATCH_SIZE = 10000
@@ -336,7 +336,7 @@ class ProcessedCallback(Callback):
 @attrs.define
 class UDFWorker:
     catalog: Catalog
-    udf: …
+    udf: UDFAdapter
     task_queue: "multiprocess.Queue"
     done_queue: "multiprocess.Queue"
     is_generator: bool
datachain/query/session.py
CHANGED
@@ -1,5 +1,8 @@
 import atexit
+import logging
+import os
 import re
+import sys
 from typing import TYPE_CHECKING, Optional
 from uuid import uuid4

@@ -9,6 +12,8 @@ from datachain.error import TableMissingError
 if TYPE_CHECKING:
     from datachain.catalog import Catalog

+logger = logging.getLogger("datachain")
+

 class Session:
     """
@@ -35,6 +40,7 @@ class Session:

     GLOBAL_SESSION_CTX: Optional["Session"] = None
     GLOBAL_SESSION: Optional["Session"] = None
+    ORIGINAL_EXCEPT_HOOK = None

     DATASET_PREFIX = "session_"
     GLOBAL_SESSION_NAME = "global"
@@ -58,6 +64,7 @@ class Session:

         session_uuid = uuid4().hex[: self.SESSION_UUID_LEN]
         self.name = f"{name}_{session_uuid}"
+        self.job_id = os.getenv("DATACHAIN_JOB_ID") or str(uuid4())
         self.is_new_catalog = not catalog
         self.catalog = catalog or get_catalog(
             client_config=client_config, in_memory=in_memory
@@ -67,6 +74,9 @@ class Session:
         return self

     def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type:
+            self._cleanup_created_versions(self.name)
+
         self._cleanup_temp_datasets()
         if self.is_new_catalog:
             self.catalog.metastore.close_on_exit()
@@ -88,6 +98,21 @@ class Session:
         except TableMissingError:
             pass

+    def _cleanup_created_versions(self, job_id: str) -> None:
+        versions = self.catalog.metastore.get_job_dataset_versions(job_id)
+        if not versions:
+            return
+
+        datasets = {}
+        for dataset_name, version in versions:
+            if dataset_name not in datasets:
+                datasets[dataset_name] = self.catalog.get_dataset(dataset_name)
+            dataset = datasets[dataset_name]
+            logger.info(
+                "Removing dataset version %s@%s due to exception", dataset_name, version
+            )
+            self.catalog.remove_dataset_version(dataset, version)
+
     @classmethod
     def get(
         cls,
@@ -114,9 +139,23 @@ class Session:
                 in_memory=in_memory,
             )
             cls.GLOBAL_SESSION = cls.GLOBAL_SESSION_CTX.__enter__()
+
             atexit.register(cls._global_cleanup)
+            cls.ORIGINAL_EXCEPT_HOOK = sys.excepthook
+            sys.excepthook = cls.except_hook
+
         return cls.GLOBAL_SESSION

+    @staticmethod
+    def except_hook(exc_type, exc_value, exc_traceback):
+        Session._global_cleanup()
+        if Session.GLOBAL_SESSION_CTX is not None:
+            job_id = Session.GLOBAL_SESSION_CTX.job_id
+            Session.GLOBAL_SESSION_CTX._cleanup_created_versions(job_id)
+
+        if Session.ORIGINAL_EXCEPT_HOOK:
+            Session.ORIGINAL_EXCEPT_HOOK(exc_type, exc_value, exc_traceback)
+
     @classmethod
     def cleanup_for_tests(cls):
         if cls.GLOBAL_SESSION_CTX is not None:
@@ -125,6 +164,9 @@ class Session:
             cls.GLOBAL_SESSION_CTX = None
             atexit.unregister(cls._global_cleanup)

+        if cls.ORIGINAL_EXCEPT_HOOK:
+            sys.excepthook = cls.ORIGINAL_EXCEPT_HOOK
+
     @staticmethod
     def _global_cleanup():
         if Session.GLOBAL_SESSION_CTX is not None: