datachain 0.30.5__py3-none-any.whl → 0.30.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/cli/commands/datasets.py +32 -17
- datachain/data_storage/sqlite.py +18 -15
- datachain/data_storage/warehouse.py +7 -1
- datachain/delta.py +36 -20
- datachain/lib/dc/database.py +2 -2
- datachain/lib/dc/datachain.py +36 -28
- datachain/lib/dc/datasets.py +4 -0
- datachain/lib/dc/records.py +2 -4
- datachain/lib/dc/storage.py +5 -0
- datachain/lib/settings.py +188 -85
- datachain/lib/udf.py +3 -20
- datachain/query/batch.py +2 -2
- datachain/query/dataset.py +44 -17
- datachain/query/dispatch.py +6 -0
- datachain/query/udf.py +2 -0
- datachain/utils.py +9 -10
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/METADATA +1 -1
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/RECORD +22 -22
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/WHEEL +0 -0
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.5.dist-info → datachain-0.30.7.dist-info}/top_level.txt +0 -0
datachain/lib/settings.py
CHANGED
@@ -1,111 +1,214 @@
+from typing import Any, Optional, Union
+
 from datachain.lib.utils import DataChainParamsError
-
+
+DEFAULT_CACHE = False
+DEFAULT_PREFETCH = 2
+DEFAULT_BATCH_SIZE = 2_000


 class SettingsError(DataChainParamsError):
-    def __init__(self, msg):
+    def __init__(self, msg: str) -> None:
         super().__init__(f"Dataset settings error: {msg}")


 class Settings:
-
+    """Settings for datachain."""
+
+    _cache: Optional[bool]
+    _prefetch: Optional[int]
+    _parallel: Optional[Union[bool, int]]
+    _workers: Optional[int]
+    _namespace: Optional[str]
+    _project: Optional[str]
+    _min_task_size: Optional[int]
+    _batch_size: Optional[int]
+
+    def __init__(  # noqa: C901, PLR0912
         self,
-        cache=None,
-
-
-
-
-
-
-
-    ):
-
-
-
-
-
-
-
-
-
-
-
-
-
-        )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        )
+        cache: Optional[bool] = None,
+        prefetch: Optional[Union[bool, int]] = None,
+        parallel: Optional[Union[bool, int]] = None,
+        workers: Optional[int] = None,
+        namespace: Optional[str] = None,
+        project: Optional[str] = None,
+        min_task_size: Optional[int] = None,
+        batch_size: Optional[int] = None,
+    ) -> None:
+        if cache is None:
+            self._cache = None
+        else:
+            if not isinstance(cache, bool):
+                raise SettingsError(
+                    "'cache' argument must be bool"
+                    f" while {cache.__class__.__name__} was given"
+                )
+            self._cache = cache
+
+        if prefetch is None or prefetch is True:
+            self._prefetch = None
+        elif prefetch is False:
+            self._prefetch = 0  # disable prefetch (False == 0)
+        else:
+            if not isinstance(prefetch, int):
+                raise SettingsError(
+                    "'prefetch' argument must be int or bool"
+                    f" while {prefetch.__class__.__name__} was given"
+                )
+            if prefetch < 0:
+                raise SettingsError(
+                    "'prefetch' argument must be non-negative integer"
+                    f", {prefetch} was given"
+                )
+            self._prefetch = prefetch
+
+        if parallel is None or parallel is False:
+            self._parallel = None
+        elif parallel is True:
+            self._parallel = True
+        else:
+            if not isinstance(parallel, int):
+                raise SettingsError(
+                    "'parallel' argument must be int or bool"
+                    f" while {parallel.__class__.__name__} was given"
+                )
+            if parallel <= 0:
+                raise SettingsError(
+                    "'parallel' argument must be positive integer"
+                    f", {parallel} was given"
+                )
+            self._parallel = parallel
+
+        if workers is None:
+            self._workers = None
+        else:
+            if not isinstance(workers, int) or isinstance(workers, bool):
+                raise SettingsError(
+                    "'workers' argument must be int"
+                    f" while {workers.__class__.__name__} was given"
+                )
+            if workers <= 0:
+                raise SettingsError(
+                    f"'workers' argument must be positive integer, {workers} was given"
+                )
+            self._workers = workers
+
+        if namespace is None:
+            self._namespace = None
+        else:
+            if not isinstance(namespace, str):
+                raise SettingsError(
+                    "'namespace' argument must be str"
+                    f", {namespace.__class__.__name__} was given"
+                )
+            self._namespace = namespace
+
+        if project is None:
+            self._project = None
+        else:
+            if not isinstance(project, str):
+                raise SettingsError(
+                    "'project' argument must be str"
+                    f", {project.__class__.__name__} was given"
+                )
+            self._project = project
+
+        if min_task_size is None:
+            self._min_task_size = None
+        else:
+            if not isinstance(min_task_size, int) or isinstance(min_task_size, bool):
+                raise SettingsError(
+                    "'min_task_size' argument must be int"
+                    f", {min_task_size.__class__.__name__} was given"
+                )
+            if min_task_size <= 0:
+                raise SettingsError(
+                    "'min_task_size' argument must be positive integer"
+                    f", {min_task_size} was given"
+                )
+            self._min_task_size = min_task_size
+
+        if batch_size is None:
+            self._batch_size = None
+        else:
+            if not isinstance(batch_size, int) or isinstance(batch_size, bool):
+                raise SettingsError(
+                    "'batch_size' argument must be int"
+                    f", {batch_size.__class__.__name__} was given"
+                )
+            if batch_size <= 0:
+                raise SettingsError(
+                    "'batch_size' argument must be positive integer"
+                    f", {batch_size} was given"
+                )
+            self._batch_size = batch_size
+
+    @property
+    def cache(self) -> bool:
+        return self._cache if self._cache is not None else DEFAULT_CACHE
+
+    @property
+    def prefetch(self) -> Optional[int]:
+        return self._prefetch if self._prefetch is not None else DEFAULT_PREFETCH
+
+    @property
+    def parallel(self) -> Optional[Union[bool, int]]:
+        return self._parallel if self._parallel is not None else None
+
+    @property
+    def workers(self) -> Optional[int]:
+        return self._workers if self._workers is not None else None
+
+    @property
+    def namespace(self) -> Optional[str]:
+        return self._namespace if self._namespace is not None else None

     @property
-    def
-        return self.
+    def project(self) -> Optional[str]:
+        return self._project if self._project is not None else None

     @property
-    def
-        return self.
+    def min_task_size(self) -> Optional[int]:
+        return self._min_task_size if self._min_task_size is not None else None

     @property
-    def
-        return self.
+    def batch_size(self) -> int:
+        return self._batch_size if self._batch_size is not None else DEFAULT_BATCH_SIZE

-    def to_dict(self):
-        res = {}
+    def to_dict(self) -> dict[str, Any]:
+        res: dict[str, Any] = {}
         if self._cache is not None:
             res["cache"] = self.cache
-        if self.
+        if self._prefetch is not None:
+            res["prefetch"] = self.prefetch
+        if self._parallel is not None:
             res["parallel"] = self.parallel
         if self._workers is not None:
             res["workers"] = self.workers
-        if self.
+        if self._min_task_size is not None:
             res["min_task_size"] = self.min_task_size
-        if self.
+        if self._namespace is not None:
             res["namespace"] = self.namespace
-        if self.
+        if self._project is not None:
             res["project"] = self.project
-        if self.
-            res["
+        if self._batch_size is not None:
+            res["batch_size"] = self.batch_size
         return res

-    def add(self, settings: "Settings"):
-
-
-
-
-
-
-        if settings.
-            self.
-        if settings.
-            self.
+    def add(self, settings: "Settings") -> None:
+        if settings._cache is not None:
+            self._cache = settings._cache
+        if settings._prefetch is not None:
+            self._prefetch = settings._prefetch
+        if settings._parallel is not None:
+            self._parallel = settings._parallel
+        if settings._workers is not None:
+            self._workers = settings._workers
+        if settings._namespace is not None:
+            self._namespace = settings._namespace
+        if settings._project is not None:
+            self._project = settings._project
+        if settings._min_task_size is not None:
+            self._min_task_size = settings._min_task_size
+        if settings._batch_size is not None:
+            self._batch_size = settings._batch_size
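
For context, a minimal sketch of how the reworked Settings class behaves after this change, based only on the code shown in the diff above (the concrete values are illustrative):

from datachain.lib.settings import Settings, SettingsError

settings = Settings(cache=True, prefetch=4, batch_size=500)
print(settings.batch_size)   # 500
print(settings.to_dict())    # {'cache': True, 'prefetch': 4, 'batch_size': 500}

# Unset values fall back to the module-level defaults added in this release.
print(Settings().batch_size)  # 2000 (DEFAULT_BATCH_SIZE)

# Invalid values raise SettingsError with the messages defined above.
try:
    Settings(batch_size=0)
except SettingsError as exc:
    print(exc)  # Dataset settings error: 'batch_size' argument must be positive integer, 0 was given

# add() copies only the values that were explicitly set on the other Settings object.
defaults = Settings(cache=False, batch_size=1000)
defaults.add(Settings(batch_size=250))
print(defaults.to_dict())    # {'cache': False, 'batch_size': 250}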
datachain/lib/udf.py
CHANGED
@@ -54,23 +54,11 @@ UDFOutputSpec = Mapping[str, ColumnType]
 UDFResult = dict[str, Any]


-@attrs.define
-class UDFProperties:
-    udf: "UDFAdapter"
-
-    def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
-        return self.udf.get_batching(use_partitioning)
-
-    @property
-    def batch_rows(self):
-        return self.udf.batch_rows
-
-
 @attrs.define(slots=False)
 class UDFAdapter:
     inner: "UDFBase"
     output: UDFOutputSpec
-
+    batch_size: Optional[int] = None
     batch: int = 1

     def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
@@ -83,11 +71,6 @@ class UDFAdapter:
             return Batch(self.batch)
         raise ValueError(f"invalid batch size {self.batch}")

-    @property
-    def properties(self):
-        # For backwards compatibility.
-        return UDFProperties(self)
-
     def run(
         self,
         udf_fields: "Sequence[str]",
@@ -237,13 +220,13 @@ class UDFBase(AbstractUDF):

     def to_udf_wrapper(
         self,
-
+        batch_size: Optional[int] = None,
         batch: int = 1,
     ) -> UDFAdapter:
         return UDFAdapter(
             self,
             self.output.to_udf_spec(),
-
+            batch_size,
             batch,
         )

datachain/query/batch.py
CHANGED
@@ -81,8 +81,8 @@ class Batch(BatchingStrategy):
         # select rows in batches
         results = []

-        with contextlib.closing(execute(query, page_size=page_size)) as
-            for row in
+        with contextlib.closing(execute(query, page_size=page_size)) as rows:
+            for row in rows:
                 results.append(row)
                 if len(results) >= self.count:
                     batch, results = results[: self.count], results[self.count :]
datachain/query/dataset.py
CHANGED
@@ -55,7 +55,6 @@ from datachain.query.udf import UdfInfo
 from datachain.sql.functions.random import rand
 from datachain.sql.types import SQLType
 from datachain.utils import (
-    batched,
     determine_processes,
     determine_workers,
     filtered_cloudpickle_dumps,
@@ -334,10 +333,10 @@ def process_udf_outputs(
     udf_results: Iterator[Iterable["UDFResult"]],
     udf: "UDFAdapter",
     cb: Callback = DEFAULT_CALLBACK,
+    batch_size: int = INSERT_BATCH_SIZE,
 ) -> None:
     # Optimization: Compute row types once, rather than for every row.
     udf_col_types = get_col_types(warehouse, udf.output)
-    batch_rows = udf.batch_rows or INSERT_BATCH_SIZE

     def _insert_rows():
         for udf_output in udf_results:
@@ -349,9 +348,7 @@ def process_udf_outputs(
             cb.relative_update()
             yield adjust_outputs(warehouse, row, udf_col_types)

-
-        warehouse.insert_rows(udf_table, row_chunk)
-
+    warehouse.insert_rows(udf_table, _insert_rows(), batch_size=batch_size)
     warehouse.insert_rows_done(udf_table)


@@ -388,12 +385,13 @@ class UDFStep(Step, ABC):
     udf: "UDFAdapter"
     catalog: "Catalog"
     partition_by: Optional[PartitionByType] = None
+    is_generator = False
+    # Parameters from Settings
+    cache: bool = False
     parallel: Optional[int] = None
     workers: Union[bool, int] = False
     min_task_size: Optional[int] = None
-
-    cache: bool = False
-    batch_rows: Optional[int] = None
+    batch_size: Optional[int] = None

     @abstractmethod
     def create_udf_table(self, query: Select) -> "Table":
@@ -450,6 +448,7 @@ class UDFStep(Step, ABC):
                 use_cache=self.cache,
                 is_generator=self.is_generator,
                 min_task_size=self.min_task_size,
+                batch_size=self.batch_size,
             )
             udf_distributor()
             return
@@ -486,6 +485,7 @@ class UDFStep(Step, ABC):
                 is_generator=self.is_generator,
                 cache=self.cache,
                 rows_total=rows_total,
+                batch_size=self.batch_size or INSERT_BATCH_SIZE,
             )

             # Run the UDFDispatcher in another process to avoid needing
@@ -534,6 +534,7 @@ class UDFStep(Step, ABC):
                     udf_results,
                     self.udf,
                     cb=generated_cb,
+                    batch_size=self.batch_size or INSERT_BATCH_SIZE,
                 )
             finally:
                 download_cb.close()
@@ -595,7 +596,7 @@ class UDFStep(Step, ABC):
             parallel=self.parallel,
             workers=self.workers,
             min_task_size=self.min_task_size,
-
+            batch_size=self.batch_size,
         )
         return self.__class__(self.udf, self.catalog)

@@ -641,7 +642,16 @@

 @frozen
 class UDFSignal(UDFStep):
+    udf: "UDFAdapter"
+    catalog: "Catalog"
+    partition_by: Optional[PartitionByType] = None
     is_generator = False
+    # Parameters from Settings
+    cache: bool = False
+    parallel: Optional[int] = None
+    workers: Union[bool, int] = False
+    min_task_size: Optional[int] = None
+    batch_size: Optional[int] = None

     def create_udf_table(self, query: Select) -> "Table":
         udf_output_columns: list[sqlalchemy.Column[Any]] = [
@@ -711,7 +721,16 @@ class UDFSignal(UDFStep):
 class RowGenerator(UDFStep):
     """Extend dataset with new rows."""

+    udf: "UDFAdapter"
+    catalog: "Catalog"
+    partition_by: Optional[PartitionByType] = None
     is_generator = True
+    # Parameters from Settings
+    cache: bool = False
+    parallel: Optional[int] = None
+    workers: Union[bool, int] = False
+    min_task_size: Optional[int] = None
+    batch_size: Optional[int] = None

     def create_udf_table(self, query: Select) -> "Table":
         warehouse = self.catalog.warehouse
@@ -1626,12 +1645,17 @@ class DatasetQuery:
     def add_signals(
         self,
         udf: "UDFAdapter",
+        partition_by: Optional[PartitionByType] = None,
+        # Parameters from Settings
+        cache: bool = False,
         parallel: Optional[int] = None,
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
-
-
-
+        batch_size: Optional[int] = None,
+        # Parameters are unused, kept only to match the signature of Settings.to_dict
+        prefetch: Optional[int] = None,
+        namespace: Optional[str] = None,
+        project: Optional[str] = None,
     ) -> "Self":
         """
         Adds one or more signals based on the results from the provided UDF.
@@ -1657,7 +1681,7 @@ class DatasetQuery:
                 workers=workers,
                 min_task_size=min_task_size,
                 cache=cache,
-
+                batch_size=batch_size,
             )
         )
         return query
@@ -1672,14 +1696,17 @@ class DatasetQuery:
     def generate(
         self,
         udf: "UDFAdapter",
+        partition_by: Optional[PartitionByType] = None,
+        # Parameters from Settings
+        cache: bool = False,
         parallel: Optional[int] = None,
         workers: Union[bool, int] = False,
         min_task_size: Optional[int] = None,
-
+        batch_size: Optional[int] = None,
+        # Parameters are unused, kept only to match the signature of Settings.to_dict:
+        prefetch: Optional[int] = None,
         namespace: Optional[str] = None,
         project: Optional[str] = None,
-        cache: bool = False,
-        batch_rows: Optional[int] = None,
     ) -> "Self":
         query = self.clone()
         steps = query.steps
@@ -1692,7 +1719,7 @@ class DatasetQuery:
                 workers=workers,
                 min_task_size=min_task_size,
                 cache=cache,
-
+                batch_size=batch_size,
             )
         )
         return query
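
Taken together with the settings.py changes, the net effect here is that a per-step batch_size (falling back to INSERT_BATCH_SIZE when unset) now controls how many UDF output rows are buffered per warehouse insert, replacing the old batch_rows field. A hypothetical end-to-end sketch, assuming the new setting is exposed through DataChain.settings() alongside the existing cache/parallel/workers options (the bucket path, column name, and dataset name are illustrative, not taken from this diff):

import datachain as dc

chain = (
    dc.read_storage("s3://example-bucket/images/")  # illustrative source
    # assumption: batch_size is accepted here like the other settings
    .settings(parallel=4, batch_size=500)
    .map(size=lambda file: file.size, output=int)
)
chain.save("image_sizes")  # UDF output rows would then be inserted in chunks of 500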
datachain/query/dispatch.py
CHANGED
@@ -114,6 +114,7 @@ class UDFDispatcher:
         self.is_batching = udf_info["batching"].is_batching
         self.processes = udf_info["processes"]
         self.rows_total = udf_info["rows_total"]
+        self.batch_size = udf_info["batch_size"]
         self.buffer_size = buffer_size
         self.task_queue = None
         self.done_queue = None
@@ -142,6 +143,7 @@ class UDFDispatcher:
             self.table,
             self.cache,
             self.is_batching,
+            self.batch_size,
             self.udf_fields,
         )

@@ -232,6 +234,7 @@ class UDFDispatcher:
                 udf_results,
                 udf,
                 cb=generated_cb,
+                batch_size=self.batch_size,
             )

     def input_batch_size(self, n_workers: int) -> int:
@@ -385,6 +388,7 @@ class UDFWorker:
         table: "Table",
         cache: bool,
         is_batching: bool,
+        batch_size: int,
         udf_fields: Sequence[str],
     ) -> None:
         self.catalog = catalog
@@ -395,6 +399,7 @@ class UDFWorker:
         self.table = table
         self.cache = cache
         self.is_batching = is_batching
+        self.batch_size = batch_size
         self.udf_fields = udf_fields

         self.download_cb = DownloadCallback(self.done_queue)
@@ -420,6 +425,7 @@ class UDFWorker:
                 self.notify_and_process(udf_results),
                 self.udf,
                 cb=self.generated_cb,
+                batch_size=self.batch_size,
             )
             put_into_queue(self.done_queue, {"status": FINISHED_STATUS})

datachain/query/udf.py
CHANGED
@@ -21,6 +21,7 @@ class UdfInfo(TypedDict):
     is_generator: bool
     cache: bool
     rows_total: int
+    batch_size: int


 class AbstractUDFDistributor(ABC):
@@ -39,6 +40,7 @@ class AbstractUDFDistributor(ABC):
         use_cache: bool,
         is_generator: bool = False,
         min_task_size: Optional[Union[str, int]] = None,
+        batch_size: Optional[int] = None,
     ) -> None: ...

     @abstractmethod
datachain/utils.py
CHANGED
@@ -25,7 +25,7 @@ if TYPE_CHECKING:
     from typing_extensions import Self


-
+DEFAULT_BATCH_SIZE = 2000

 logger = logging.getLogger("datachain")

@@ -228,7 +228,7 @@ _T_co = TypeVar("_T_co", covariant=True)

 def _dynamic_batched_core(
     iterable: Iterable[_T_co],
-
+    batch_size: int,
 ) -> Iterator[list[_T_co]]:
     """Core batching logic that yields lists."""

@@ -236,7 +236,7 @@ def _dynamic_batched_core(

     for item in iterable:
         # Check if adding this item would exceed limits
-        if len(batch) >=
+        if len(batch) >= batch_size and batch:  # Yield current batch if we have one
             yield batch
             batch = []

@@ -247,23 +247,22 @@
         yield batch


-def batched(iterable: Iterable[_T_co],
+def batched(iterable: Iterable[_T_co], batch_size: int) -> Iterator[tuple[_T_co, ...]]:
     """
-    Batch data into tuples of length
+    Batch data into tuples of length batch_size.
     The last batch may be shorter.
     """
-    yield from (tuple(batch) for batch in _dynamic_batched_core(iterable,
+    yield from (tuple(batch) for batch in _dynamic_batched_core(iterable, batch_size))


 def batched_it(
     iterable: Iterable[_T_co],
-
+    batch_size: int = DEFAULT_BATCH_SIZE,
 ) -> Iterator[Iterator[_T_co]]:
     """
-    Batch data into iterators with dynamic sizing
-    based on row count and memory usage.
+    Batch data into iterators with dynamic sizing based on row count and memory usage.
     """
-    yield from (iter(batch) for batch in _dynamic_batched_core(iterable,
+    yield from (iter(batch) for batch in _dynamic_batched_core(iterable, batch_size))


 def flatten(items):
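
The renamed batch_size parameter keeps the documented behaviour of these helpers. A small usage sketch based on the signatures and docstrings shown above (the printed results assume the memory-based cutoff hinted at in the docstring does not trigger for such tiny inputs):

from datachain.utils import batched, batched_it

# batched() yields tuples of at most batch_size items; the last one may be shorter.
print(list(batched(range(5), batch_size=2)))
# [(0, 1), (2, 3), (4,)]

# batched_it() yields iterators instead of tuples and defaults to DEFAULT_BATCH_SIZE (2000).
print([list(chunk) for chunk in batched_it(range(5), batch_size=2)])
# [[0, 1], [2, 3], [4]]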
|