datachain 0.2.9__py3-none-any.whl → 0.2.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain has been flagged as possibly problematic.

Files changed (51)
  1. datachain/__init__.py +17 -8
  2. datachain/catalog/catalog.py +5 -5
  3. datachain/cli.py +0 -2
  4. datachain/data_storage/schema.py +5 -5
  5. datachain/data_storage/sqlite.py +1 -1
  6. datachain/data_storage/warehouse.py +7 -7
  7. datachain/lib/arrow.py +25 -8
  8. datachain/lib/clip.py +6 -11
  9. datachain/lib/convert/__init__.py +0 -0
  10. datachain/lib/convert/flatten.py +67 -0
  11. datachain/lib/convert/type_converter.py +96 -0
  12. datachain/lib/convert/unflatten.py +69 -0
  13. datachain/lib/convert/values_to_tuples.py +85 -0
  14. datachain/lib/data_model.py +74 -0
  15. datachain/lib/dc.py +225 -168
  16. datachain/lib/file.py +41 -41
  17. datachain/lib/gpt4_vision.py +1 -9
  18. datachain/lib/hf_image_to_text.py +9 -17
  19. datachain/lib/hf_pipeline.py +4 -12
  20. datachain/lib/image.py +2 -18
  21. datachain/lib/image_transform.py +0 -1
  22. datachain/lib/iptc_exif_xmp.py +8 -15
  23. datachain/lib/meta_formats.py +1 -5
  24. datachain/lib/model_store.py +77 -0
  25. datachain/lib/pytorch.py +9 -21
  26. datachain/lib/signal_schema.py +139 -60
  27. datachain/lib/text.py +5 -16
  28. datachain/lib/udf.py +114 -30
  29. datachain/lib/udf_signature.py +5 -5
  30. datachain/lib/webdataset.py +3 -3
  31. datachain/lib/webdataset_laion.py +2 -3
  32. datachain/node.py +4 -4
  33. datachain/query/batch.py +1 -1
  34. datachain/query/dataset.py +51 -178
  35. datachain/query/dispatch.py +43 -30
  36. datachain/query/udf.py +46 -26
  37. datachain/remote/studio.py +1 -9
  38. datachain/torch/__init__.py +21 -0
  39. datachain/utils.py +39 -0
  40. {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/METADATA +14 -12
  41. {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/RECORD +45 -43
  42. {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/WHEEL +1 -1
  43. datachain/image/__init__.py +0 -3
  44. datachain/lib/cached_stream.py +0 -38
  45. datachain/lib/claude.py +0 -69
  46. datachain/lib/feature.py +0 -412
  47. datachain/lib/feature_registry.py +0 -51
  48. datachain/lib/feature_utils.py +0 -154
  49. {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/LICENSE +0 -0
  50. {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/entry_points.txt +0 -0
  51. {datachain-0.2.9.dist-info → datachain-0.2.11.dist-info}/top_level.txt +0 -0
datachain/lib/webdataset_laion.py CHANGED
@@ -2,9 +2,8 @@ from collections.abc import Iterator
  from typing import Optional
 
  import numpy as np
- from pydantic import Field
+ from pydantic import BaseModel, Field
 
- from datachain.lib.feature import Feature
  from datachain.lib.file import File
  from datachain.lib.webdataset import WDSBasic, WDSReadableSubclass
 
@@ -34,7 +33,7 @@ class WDSLaion(WDSBasic):
      json: Laion  # type: ignore[assignment]
 
 
- class LaionMeta(Feature):
+ class LaionMeta(BaseModel):
      file: File
      index: Optional[int] = Field(default=None)
      b32_img: list[float] = Field(default=None)
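
The Feature base class is removed in 0.2.11; custom signal schemas now derive from pydantic.BaseModel directly (see the new datachain/lib/data_model.py and model_store.py in the file list). A minimal sketch of a user-defined schema under the new convention; MyMeta and its extra fields are hypothetical:

from typing import Optional

from pydantic import BaseModel, Field

from datachain.lib.file import File


class MyMeta(BaseModel):
    # In 0.2.11 plain pydantic models replace datachain.lib.feature.Feature
    # as the base for custom signal schemas.
    file: File
    index: Optional[int] = Field(default=None)
    score: float = Field(default=0.0)
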
datachain/node.py CHANGED
@@ -46,8 +46,8 @@ class DirTypeGroup:
 
  @attrs.define
  class Node:
-     id: int = 0
-     random: int = -1
+     sys__id: int = 0
+     sys__rand: int = -1
      vtype: str = ""
      dir_type: Optional[int] = None
      parent: str = ""
@@ -127,11 +127,11 @@ class Node:
 
      @classmethod
      def from_dir(cls, parent, name, **kwargs) -> "Node":
-         return cls(id=-1, dir_type=DirType.DIR, parent=parent, name=name, **kwargs)
+         return cls(sys__id=-1, dir_type=DirType.DIR, parent=parent, name=name, **kwargs)
 
      @classmethod
      def root(cls) -> "Node":
-         return cls(-1, dir_type=DirType.DIR)
+         return cls(sys__id=-1, dir_type=DirType.DIR)
 
 
  @attrs.define
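
The rename threads through every place that used the old id/random attributes. A minimal sketch of the new field names on Node, assuming DirType is importable from datachain.node as used in the hunk above; the asserted defaults come from the class definition shown:

from datachain.node import DirType, Node

# Internal bookkeeping columns now carry a "sys__" prefix, and the attrs
# fields on Node follow suit: id -> sys__id, random -> sys__rand.
node = Node.from_dir("animals", "dogs")
assert node.sys__id == -1    # placeholder id until the warehouse assigns one
assert node.sys__rand == -1  # default from the class definition
assert node.dir_type == DirType.DIR
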
datachain/query/batch.py CHANGED
@@ -104,7 +104,7 @@ class Partition(BatchingStrategy):
          with contextlib.closing(
              execute(
                  query,
-                 order_by=(PARTITION_COLUMN_ID, "id", *query._order_by_clauses),
+                 order_by=(PARTITION_COLUMN_ID, "sys__id", *query._order_by_clauses),
                  limit=query._limit,
              )
          ) as rows:
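
Partitioned batching keeps its deterministic ordering; it simply orders by the renamed column. A standalone SQLAlchemy sketch of the same ordering pattern; the rows table and its columns are hypothetical:

import sqlalchemy as sa

# Hypothetical table standing in for a dataset rows table.
rows = sa.table("rows", sa.column("sys__id"), sa.column("partition_id"))

# Batches are emitted partition by partition; ordering by the renamed
# sys__id column keeps rows within each partition deterministic.
query = sa.select(rows).order_by(rows.c.partition_id, rows.c.sys__id)
print(query)
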
datachain/query/dataset.py CHANGED
@@ -1,4 +1,3 @@
- import ast
  import contextlib
  import datetime
  import inspect
@@ -10,7 +9,6 @@ import re
  import string
  import subprocess
  import sys
- import types
  from abc import ABC, abstractmethod
  from collections.abc import Generator, Iterable, Iterator, Sequence
  from copy import copy
@@ -26,10 +24,8 @@ from typing import (
  )
 
  import attrs
- import pandas as pd
  import sqlalchemy
  from attrs import frozen
- from dill import dumps, source
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback, TqdmCallback
  from sqlalchemy import Column
  from sqlalchemy.sql import func as f
@@ -52,12 +48,14 @@ from datachain.data_storage.schema import (
  from datachain.dataset import DatasetStatus, RowDict
  from datachain.error import DatasetNotFoundError, QueryScriptCancelError
  from datachain.progress import CombinedDownloadCallback
- from datachain.query.schema import DEFAULT_DELIMITER
  from datachain.sql.functions import rand
  from datachain.storage import Storage, StorageURI
- from datachain.utils import batched, determine_processes, inside_notebook
+ from datachain.utils import (
+     batched,
+     determine_processes,
+     filtered_cloudpickle_dumps,
+ )
 
- from .batch import RowBatch
  from .metrics import metrics
  from .schema import C, UDFParamSpec, normalize_param
  from .session import Session
@@ -257,7 +255,7 @@ class DatasetDiffOperation(Step):
      """
 
      def apply(self, query_generator, temp_tables: list[str]):
-         source_query = query_generator.exclude(("id",))
+         source_query = query_generator.exclude(("sys__id",))
          target_query = self.dq.apply_steps().select()
          temp_tables.extend(self.dq.temp_table_names)
 
@@ -427,22 +425,6 @@ def get_generated_callback(is_generator: bool = False) -> Callback:
      return DEFAULT_CALLBACK
 
 
- def run_udf(
-     udf,
-     udf_inputs,
-     catalog,
-     is_generator,
-     cache,
-     download_cb: Callback = DEFAULT_CALLBACK,
-     processed_cb: Callback = DEFAULT_CALLBACK,
- ) -> Iterator[Iterable["UDFResult"]]:
-     for batch in udf_inputs:
-         n_rows = len(batch.rows) if isinstance(batch, RowBatch) else 1
-         output = udf(catalog, batch, is_generator, cache, cb=download_cb)
-         processed_cb.relative_update(n_rows)
-         yield output
-
-
  @frozen
  class UDF(Step, ABC):
      udf: UDFType
@@ -508,7 +490,7 @@ class UDF(Step, ABC):
          elif processes:
              # Parallel processing (faster for more CPU-heavy UDFs)
              udf_info = {
-                 "udf": self.udf,
+                 "udf_data": filtered_cloudpickle_dumps(self.udf),
                  "catalog_init": self.catalog.get_init_params(),
                  "id_generator_clone_params": (
                      self.catalog.id_generator.clone_params()
@@ -529,16 +511,15 @@ class UDF(Step, ABC):
 
              envs = dict(os.environ)
              envs.update({"PYTHONPATH": os.getcwd()})
-             with self.process_feature_module():
-                 process_data = dumps(udf_info, recurse=True)
-                 result = subprocess.run(  # noqa: S603
-                     [datachain_exec_path, "--internal-run-udf"],
-                     input=process_data,
-                     check=False,
-                     env=envs,
-                 )
-                 if result.returncode != 0:
-                     raise RuntimeError("UDF Execution Failed!")
+             process_data = filtered_cloudpickle_dumps(udf_info)
+             result = subprocess.run(  # noqa: S603
+                 [datachain_exec_path, "--internal-run-udf"],
+                 input=process_data,
+                 check=False,
+                 env=envs,
+             )
+             if result.returncode != 0:
+                 raise RuntimeError("UDF Execution Failed!")
 
          else:
              # Otherwise process single-threaded (faster for smaller UDFs)
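
The dill-based process_feature_module() workaround (removed in this and the following hunks) existed to ship Feature subclasses defined in __main__ to the worker subprocess. With cloudpickle that is unnecessary, since locally defined classes and functions are serialized by value. A minimal sketch of the idea using plain cloudpickle; the real code goes through datachain's filtered_cloudpickle_dumps helper:

import cloudpickle


def make_udf(suffix: str):
    # A closure like this cannot be imported by a subprocess, but cloudpickle
    # serializes it by value, so it can be shipped as bytes ("udf_data").
    def udf(name: str) -> str:
        return name + suffix

    return udf


payload = cloudpickle.dumps(make_udf("_signal"))
restored = cloudpickle.loads(payload)
assert restored("dog") == "dog_signal"
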
@@ -548,9 +529,6 @@ class UDF(Step, ABC):
              else:
                  udf = self.udf
 
-             if hasattr(udf.func, "setup") and callable(udf.func.setup):
-                 udf.func.setup()
-
              warehouse = self.catalog.warehouse
 
              with contextlib.closing(
@@ -560,8 +538,7 @@ class UDF(Step, ABC):
                  processed_cb = get_processed_callback()
                  generated_cb = get_generated_callback(self.is_generator)
                  try:
-                     udf_results = run_udf(
-                         udf,
+                     udf_results = udf.run(
                          udf_inputs,
                          self.catalog,
                          self.is_generator,
@@ -583,9 +560,6 @@ class UDF(Step, ABC):
 
                      warehouse.insert_rows_done(udf_table)
 
-                     if hasattr(udf.func, "teardown") and callable(udf.func.teardown):
-                         udf.func.teardown()
-
                  except QueryScriptCancelError:
                      self.catalog.warehouse.close()
                      sys.exit(QUERY_SCRIPT_CANCELED_EXIT_CODE)
@@ -594,57 +568,6 @@ class UDF(Step, ABC):
                      self.catalog.warehouse.close()
                      raise
 
-     @contextlib.contextmanager
-     def process_feature_module(self):
-         # Generate a random name for the feature module
-         feature_module_name = "tmp" + _random_string(10)
-         # Create a dynamic module with the generated name
-         dynamic_module = types.ModuleType(feature_module_name)
-         # Get the import lines for the necessary objects from the main module
-         main_module = sys.modules["__main__"]
-         if getattr(main_module, "__file__", None):
-             import_lines = list(get_imports(main_module))
-         else:
-             import_lines = [
-                 source.getimport(obj, alias=name)
-                 for name, obj in main_module.__dict__.items()
-                 if _imports(obj) and not (name.startswith("__") and name.endswith("__"))
-             ]
-
-         # Get the feature classes from the main module
-         feature_classes = {
-             name: obj
-             for name, obj in main_module.__dict__.items()
-             if _feature_predicate(obj)
-         }
-         if not feature_classes:
-             yield None
-             return
-
-         # Get the source code of the feature classes
-         feature_sources = [source.getsource(cls) for _, cls in feature_classes.items()]
-         # Set the module name for the feature classes to the generated name
-         for name, cls in feature_classes.items():
-             cls.__module__ = feature_module_name
-             setattr(dynamic_module, name, cls)
-         # Add the dynamic module to the sys.modules dictionary
-         sys.modules[feature_module_name] = dynamic_module
-         # Combine the import lines and feature sources
-         feature_file = "\n".join(import_lines) + "\n" + "\n".join(feature_sources)
-
-         # Write the module content to a .py file
-         with open(f"{feature_module_name}.py", "w") as module_file:
-             module_file.write(feature_file)
-
-         try:
-             yield feature_module_name
-         finally:
-             for cls in feature_classes.values():
-                 cls.__module__ = main_module.__name__
-             os.unlink(f"{feature_module_name}.py")
-             # Remove the dynamic module from sys.modules
-             del sys.modules[feature_module_name]
-
      def create_partitions_table(self, query: Select) -> "Table":
          """
          Create temporary table with group by partitions.
@@ -663,7 +586,7 @@ class UDF(Step, ABC):
 
          # fill table with partitions
          cols = [
-             query.selected_columns.id,
+             query.selected_columns.sys__id,
              f.dense_rank().over(order_by=list_partition_by).label(PARTITION_COLUMN_ID),
          ]
          self.catalog.warehouse.db.execute(
@@ -697,7 +620,7 @@ class UDF(Step, ABC):
              subq = query.subquery()
              query = (
                  sqlalchemy.select(*subq.c)
-                 .outerjoin(partition_tbl, partition_tbl.c.id == subq.c.id)
+                 .outerjoin(partition_tbl, partition_tbl.c.sys__id == subq.c.sys__id)
                  .add_columns(*partition_columns())
              )
 
@@ -729,18 +652,18 @@ class UDFSignal(UDF):
          columns = [
              sqlalchemy.Column(c.name, c.type)
              for c in query.selected_columns
-             if c.name != "id"
+             if c.name != "sys__id"
          ]
          table = self.catalog.warehouse.create_udf_table(self.udf_table_name(), columns)
          select_q = query.with_only_columns(
-             *[c for c in query.selected_columns if c.name != "id"]
+             *[c for c in query.selected_columns if c.name != "sys__id"]
          )
 
          # if there is order by clause we need row_number to preserve order
          # if there is no order by clause we still need row_number to generate
          # unique ids as uniqueness is important for this table
          select_q = select_q.add_columns(
-             f.row_number().over(order_by=select_q._order_by_clauses).label("id")
+             f.row_number().over(order_by=select_q._order_by_clauses).label("sys__id")
          )
 
          self.catalog.warehouse.db.execute(
@@ -756,7 +679,7 @@ class UDFSignal(UDF):
          if query._order_by_clauses:
              # we are adding ordering only if it's explicitly added by user in
              # query part before adding signals
-             q = q.order_by(table.c.id)
+             q = q.order_by(table.c.sys__id)
          return q, [table]
 
      def create_result_query(
@@ -766,7 +689,7 @@ class UDFSignal(UDF):
          original_cols = [c for c in subq.c if c.name not in partition_col_names]
 
          # new signal columns that are added to udf_table
-         signal_cols = [c for c in udf_table.c if c.name != "id"]
+         signal_cols = [c for c in udf_table.c if c.name != "sys__id"]
          signal_name_cols = {c.name: c for c in signal_cols}
          cols = signal_cols
 
@@ -786,7 +709,7 @@ class UDFSignal(UDF):
              res = (
                  sqlalchemy.select(*cols1)
                  .select_from(subq)
-                 .outerjoin(udf_table, udf_table.c.id == subq.c.id)
+                 .outerjoin(udf_table, udf_table.c.sys__id == subq.c.sys__id)
                  .add_columns(*cols2)
              )
          else:
@@ -795,7 +718,7 @@ class UDFSignal(UDF):
          if query._order_by_clauses:
              # if ordering is used in query part before adding signals, we
              # will have it as order by id from select from pre-created udf table
-             res = res.order_by(subq.c.id)
+             res = res.order_by(subq.c.sys__id)
 
          if self.partition_by is not None:
              subquery = res.subquery()
@@ -833,7 +756,7 @@ class RowGenerator(UDF):
          # we get the same rows as we got as inputs of UDF since selecting
          # without ordering can be non deterministic in some databases
          c = query.selected_columns
-         query = query.order_by(c.id)
+         query = query.order_by(c.sys__id)
 
          udf_table_query = udf_table.select().subquery()
          udf_table_cols: list[sqlalchemy.Label[Any]] = [
@@ -1025,7 +948,7 @@ class SQLJoin(Step):
          q1_column_names = {c.name for c in q1_columns}
          q2_columns = [
              c
-             if c.name not in q1_column_names and c.name != "id"
+             if c.name not in q1_column_names and c.name != "sys__id"
              else c.label(self.rname.format(name=c.name))
              for c in q2.c
          ]
@@ -1165,8 +1088,8 @@ class DatasetQuery:
              self.version = version or ds.latest_version
              self.feature_schema = ds.get_version(self.version).feature_schema
              self.column_types = copy(ds.schema)
-             if "id" in self.column_types:
-                 self.column_types.pop("id")
+             if "sys__id" in self.column_types:
+                 self.column_types.pop("sys__id")
              self.starting_step = QueryStep(self.catalog, name, self.version)
              # attaching to specific dataset
              self.name = name
@@ -1239,7 +1162,7 @@ class DatasetQuery:
          query.steps = self._chunk_limit(query.steps, index, total)
 
          # Prepend the chunk filter to the step chain.
-         query = query.filter(C.random % total == index)
+         query = query.filter(C.sys__rand % total == index)
          query.steps = query.steps[-1:] + query.steps[:-1]
 
          result = query.starting_step.apply()
@@ -1366,20 +1289,12 @@ class DatasetQuery:
          finally:
              self.cleanup()
 
-     def to_records(self) -> list[dict]:
-         with self.as_iterable() as result:
-             cols = result.columns
-             return [dict(zip(cols, row)) for row in result]
-
-     def to_pandas(self) -> "pd.DataFrame":
-         records = self.to_records()
-         df = pd.DataFrame.from_records(records)
-         df.columns = [c.replace(DEFAULT_DELIMITER, ".") for c in df.columns]
-         return df
+     def to_records(self) -> list[dict[str, Any]]:
+         return self.results(lambda cols, row: dict(zip(cols, row)))
 
      def shuffle(self) -> "Self":
          # ToDo: implement shaffle based on seed and/or generating random column
-         return self.order_by(C.random)
+         return self.order_by(C.sys__rand)
 
      def sample(self, n) -> "Self":
          """
@@ -1395,22 +1310,6 @@ class DatasetQuery:
 
          return sampled.limit(n)
 
-     def show(self, limit=20) -> None:
-         df = self.limit(limit).to_pandas()
-
-         options = ["display.max_colwidth", 50, "display.show_dimensions", False]
-         with pd.option_context(*options):
-             if inside_notebook():
-                 from IPython.display import display
-
-                 display(df)
-
-             else:
-                 print(df.to_string())
-
-         if len(df) == limit:
-             print(f"[limited by {limit} objects]")
-
      def clone(self, new_table=True) -> "Self":
          obj = copy(self)
          obj.steps = obj.steps.copy()
@@ -1508,30 +1407,35 @@ class DatasetQuery:
          query.steps.append(SQLOffset(offset))
          return query
 
+     def as_scalar(self) -> Any:
+         with self.as_iterable() as rows:
+             row = next(iter(rows))
+             return row[0]
+
      def count(self) -> int:
          query = self.clone()
          query.steps.append(SQLCount())
-         return query.results()[0][0]
+         return query.as_scalar()
 
-     def sum(self, col: ColumnElement):
+     def sum(self, col: ColumnElement) -> int:
          query = self.clone()
          query.steps.append(SQLSelect((f.sum(col),)))
-         return query.results()[0][0]
+         return query.as_scalar()
 
-     def avg(self, col: ColumnElement):
+     def avg(self, col: ColumnElement) -> int:
          query = self.clone()
          query.steps.append(SQLSelect((f.avg(col),)))
-         return query.results()[0][0]
+         return query.as_scalar()
 
-     def min(self, col: ColumnElement):
+     def min(self, col: ColumnElement) -> int:
          query = self.clone()
          query.steps.append(SQLSelect((f.min(col),)))
-         return query.results()[0][0]
+         return query.as_scalar()
 
-     def max(self, col: ColumnElement):
+     def max(self, col: ColumnElement) -> int:
          query = self.clone()
          query.steps.append(SQLSelect((f.max(col),)))
-         return query.results()[0][0]
+         return query.as_scalar()
 
      @detach
      def group_by(self, *cols: ColumnElement) -> "Self":
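
Aggregations now share a single as_scalar() helper instead of materializing the full result list via results()[0][0]. A hedged usage sketch; the dataset name, version, and column are hypothetical and require an existing dataset in the local catalog:

from datachain.query import C, DatasetQuery

# Hypothetical saved dataset; any existing dataset name works here.
dq = DatasetQuery(name="animals", version=1)

total = dq.count()        # SQLCount step, fetched through as_scalar()
biggest = dq.max(C.size)  # SQLSelect step, same single-row path
print(total, biggest)
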
@@ -1723,7 +1627,7 @@ class DatasetQuery:
              c if isinstance(c, Column) else Column(c.name, c.type)
              for c in query.columns
          ]
-         if not [c for c in columns if c.name != "id"]:
+         if not [c for c in columns if c.name != "sys__id"]:
              raise RuntimeError(
                  "No columns to save in the query. "
                  "Ensure at least one column (other than 'id') is selected."
@@ -1742,11 +1646,11 @@ class DatasetQuery:
 
          # Exclude the id column and let the db create it to avoid unique
          # constraint violations.
-         q = query.exclude(("id",))
+         q = query.exclude(("sys__id",))
          if q._order_by_clauses:
              # ensuring we have id sorted by order by clause if it exists in a query
              q = q.add_columns(
-                 f.row_number().over(order_by=q._order_by_clauses).label("id")
+                 f.row_number().over(order_by=q._order_by_clauses).label("sys__id")
              )
 
          cols = tuple(c.name for c in q.columns)
@@ -1873,34 +1777,3 @@ def _random_string(length: int) -> str:
          random.choice(string.ascii_letters + string.digits)  # noqa: S311
          for i in range(length)
      )
-
-
- def _feature_predicate(obj):
-     from datachain.lib.feature import Feature
-
-     return inspect.isclass(obj) and source.isfrommain(obj) and issubclass(obj, Feature)
-
-
- def _imports(obj):
-     return not source.isfrommain(obj)
-
-
- def get_imports(m):
-     root = ast.parse(inspect.getsource(m))
-
-     for node in ast.iter_child_nodes(root):
-         if isinstance(node, ast.Import):
-             module = None
-         elif isinstance(node, ast.ImportFrom):
-             module = node.module
-         else:
-             continue
-
-         for n in node.names:
-             import_script = ""
-             if module:
-                 import_script += f"from {module} "
-             import_script += f"import {n.name}"
-             if n.asname:
-                 import_script += f" as {n.asname}"
-             yield import_script
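
to_pandas() and show() (together with the pandas dependency) left DatasetQuery; the higher-level API in datachain/lib/dc.py is the intended home for display now. A small sketch of framing to_records() output by hand; the records and the "__" delimiter handling mirror the removed method but are illustrative only:

import pandas as pd

# Hypothetical records shaped like DatasetQuery.to_records() output,
# with "__"-delimited (DEFAULT_DELIMITER) flattened column names.
records = [
    {"sys__id": 1, "file__path": "dogs/1.jpg", "file__size": 43},
    {"sys__id": 2, "file__path": "cats/1.jpg", "file__size": 61},
]

df = pd.DataFrame.from_records(records)
df.columns = [c.replace("__", ".") for c in df.columns]  # what to_pandas() did
print(df)
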
datachain/query/dispatch.py CHANGED
@@ -10,13 +10,12 @@ from typing import Any, Optional
 
  import attrs
  import multiprocess
- from dill import load
+ from cloudpickle import load, loads
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
  from multiprocess import get_context
 
  from datachain.catalog import Catalog
  from datachain.catalog.loader import get_distributed_class
- from datachain.query.batch import RowBatch
  from datachain.query.dataset import (
      get_download_callback,
      get_generated_callback,
@@ -85,7 +84,7 @@ def put_into_queue(queue: Queue, item: Any) -> None:
 
  def udf_entrypoint() -> int:
      # Load UDF info from stdin
-     udf_info = load(stdin.buffer)  # noqa: S301
+     udf_info = load(stdin.buffer)
 
      (
          warehouse_class,
@@ -96,7 +95,7 @@ def udf_entrypoint() -> int:
 
      # Parallel processing (faster for more CPU-heavy UDFs)
      dispatch = UDFDispatcher(
-         udf_info["udf"],
+         udf_info["udf_data"],
          udf_info["catalog_init"],
          udf_info["id_generator_clone_params"],
          udf_info["metastore_clone_params"],
@@ -109,7 +108,7 @@ def udf_entrypoint() -> int:
      batching = udf_info["batching"]
      table = udf_info["table"]
      n_workers = udf_info["processes"]
-     udf = udf_info["udf"]
+     udf = loads(udf_info["udf_data"])
      if n_workers is True:
          # Use default number of CPUs (cores)
          n_workers = None
@@ -147,7 +146,7 @@ class UDFDispatcher:
 
      def __init__(
          self,
-         udf,
+         udf_data,
          catalog_init_params,
          id_generator_clone_params,
          metastore_clone_params,
@@ -156,14 +155,7 @@ class UDFDispatcher:
          is_generator=False,
          buffer_size=DEFAULT_BATCH_SIZE,
      ):
-         # isinstance cannot be used here, as dill packages the entire class definition,
-         # and so these two types are not considered exactly equal,
-         # even if they have the same import path.
-         if full_module_type_path(type(udf)) != full_module_type_path(UDFFactory):
-             self.udf = udf
-         else:
-             self.udf = None
-             self.udf_factory = udf
+         self.udf_data = udf_data
          self.catalog_init_params = catalog_init_params
          (
              self.id_generator_class,
@@ -215,6 +207,15 @@ class UDFDispatcher:
          self.catalog = Catalog(
              id_generator, metastore, warehouse, **self.catalog_init_params
          )
+         udf = loads(self.udf_data)
+         # isinstance cannot be used here, as cloudpickle packages the entire class
+         # definition, and so these two types are not considered exactly equal,
+         # even if they have the same import path.
+         if full_module_type_path(type(udf)) != full_module_type_path(UDFFactory):
+             self.udf = udf
+         else:
+             self.udf = None
+             self.udf_factory = udf
          if not self.udf:
              self.udf = self.udf_factory()
 
@@ -355,6 +356,15 @@ class WorkerCallback(Callback):
          put_into_queue(self.queue, {"status": NOTIFY_STATUS, "downloaded": inc})
 
 
+ class ProcessedCallback(Callback):
+     def __init__(self):
+         self.processed_rows: Optional[int] = None
+         super().__init__()
+
+     def relative_update(self, inc: int = 1) -> None:
+         self.processed_rows = inc
+
+
  @attrs.define
  class UDFWorker:
      catalog: Catalog
@@ -370,25 +380,28 @@ class UDFWorker:
          return WorkerCallback(self.done_queue)
 
      def run(self) -> None:
-         if hasattr(self.udf.func, "setup") and callable(self.udf.func.setup):
-             self.udf.func.setup()
-         while (batch := get_from_queue(self.task_queue)) != STOP_SIGNAL:
-             n_rows = len(batch.rows) if isinstance(batch, RowBatch) else 1
-             udf_output = self.udf(
-                 self.catalog,
-                 batch,
-                 is_generator=self.is_generator,
-                 cache=self.cache,
-                 cb=self.cb,
-             )
+         processed_cb = ProcessedCallback()
+         udf_results = self.udf.run(
+             self.get_inputs(),
+             self.catalog,
+             self.is_generator,
+             self.cache,
+             download_cb=self.cb,
+             processed_cb=processed_cb,
+         )
+         for udf_output in udf_results:
              if isinstance(udf_output, GeneratorType):
                  udf_output = list(udf_output)  # can not pickle generator
              put_into_queue(
                  self.done_queue,
-                 {"status": OK_STATUS, "result": udf_output, "processed": n_rows},
+                 {
+                     "status": OK_STATUS,
+                     "result": udf_output,
+                     "processed": processed_cb.processed_rows,
+                 },
              )
-
-         if hasattr(self.udf.func, "teardown") and callable(self.udf.func.teardown):
-             self.udf.func.teardown()
-
          put_into_queue(self.done_queue, {"status": FINISHED_STATUS})
+
+     def get_inputs(self):
+         while (batch := get_from_queue(self.task_queue)) != STOP_SIGNAL:
+             yield batch
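
The worker no longer loops over batches itself; it hands UDF.run() a generator that drains the task queue until the stop sentinel arrives. A self-contained sketch of that queue-to-generator pattern; the sentinel value and queue type are stand-ins for the module's own:

import queue

STOP_SIGNAL = "STOP"  # stand-in for the dispatcher's real sentinel constant


def get_inputs(task_queue: "queue.Queue"):
    # Same shape as UDFWorker.get_inputs(): stream batches lazily so that
    # UDF.run() can consume them as an iterator and report progress per batch.
    while (batch := task_queue.get()) != STOP_SIGNAL:
        yield batch


q: "queue.Queue" = queue.Queue()
for item in (["row-1"], ["row-2"], STOP_SIGNAL):
    q.put(item)

assert list(get_inputs(q)) == [["row-1"], ["row-2"]]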