datachain 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

@@ -2,11 +2,8 @@ import contextlib
 from collections.abc import Iterator, Sequence
 from itertools import chain
 from multiprocessing import cpu_count
-from queue import Empty, Full, Queue
 from sys import stdin
-from time import sleep
-from types import GeneratorType
-from typing import Any, Optional
+from typing import Optional
 
 import attrs
 import multiprocess
@@ -22,7 +19,16 @@ from datachain.query.dataset import (
     get_processed_callback,
     process_udf_outputs,
 )
+from datachain.query.queue import (
+    get_from_queue,
+    marshal,
+    msgpack_pack,
+    msgpack_unpack,
+    put_into_queue,
+    unmarshal,
+)
 from datachain.query.udf import UDFBase, UDFFactory, UDFResult
+from datachain.utils import batched_it
 
 DEFAULT_BATCH_SIZE = 10000
 STOP_SIGNAL = "STOP"
@@ -44,44 +50,6 @@ def get_n_workers_from_arg(n_workers: Optional[int] = None) -> int:
     return n_workers
 
 
-# For more context on the get_from_queue and put_into_queue functions, see the
-# discussion here:
-# https://github.com/iterative/dvcx/pull/1297#issuecomment-2026308773
-# This problem is not exactly described by, but is also related to these Python issues:
-# https://github.com/python/cpython/issues/66587
-# https://github.com/python/cpython/issues/88628
-# https://github.com/python/cpython/issues/108645
-
-
-def get_from_queue(queue: Queue) -> Any:
-    """
-    Gets an item from a queue.
-    This is required to handle signals, such as KeyboardInterrupt exceptions
-    while waiting for items to be available, although only on certain installations.
-    (See the above comment for more context.)
-    """
-    while True:
-        try:
-            return queue.get_nowait()
-        except Empty:
-            sleep(0.01)
-
-
-def put_into_queue(queue: Queue, item: Any) -> None:
-    """
-    Puts an item into a queue.
-    This is required to handle signals, such as KeyboardInterrupt exceptions
-    while waiting for items to be queued, although only on certain installations.
-    (See the above comment for more context.)
-    """
-    while True:
-        try:
-            queue.put_nowait(item)
-            return
-        except Full:
-            sleep(0.01)
-
-
 def udf_entrypoint() -> int:
     # Load UDF info from stdin
     udf_info = load(stdin.buffer)
@@ -100,8 +68,9 @@ def udf_entrypoint() -> int:
         udf_info["id_generator_clone_params"],
         udf_info["metastore_clone_params"],
         udf_info["warehouse_clone_params"],
-        is_generator=udf_info.get("is_generator", False),
+        udf_fields=udf_info["udf_fields"],
         cache=udf_info["cache"],
+        is_generator=udf_info.get("is_generator", False),
     )
 
     query = udf_info["query"]
@@ -121,7 +90,7 @@ def udf_entrypoint() -> int:
     generated_cb = get_generated_callback(dispatch.is_generator)
     try:
         udf_results = dispatch.run_udf_parallel(
-            udf_inputs,
+            marshal(udf_inputs),
             n_workers=n_workers,
             processed_cb=processed_cb,
             download_cb=download_cb,
@@ -142,6 +111,9 @@ def udf_worker_entrypoint() -> int:
 
 
 class UDFDispatcher:
+    catalog: Optional[Catalog] = None
+    task_queue: Optional[multiprocess.Queue] = None
+    done_queue: Optional[multiprocess.Queue] = None
     _batch_size: Optional[int] = None
 
     def __init__(
@@ -151,9 +123,10 @@ class UDFDispatcher:
         id_generator_clone_params,
         metastore_clone_params,
         warehouse_clone_params,
-        cache,
-        is_generator=False,
-        buffer_size=DEFAULT_BATCH_SIZE,
+        udf_fields: "Sequence[str]",
+        cache: bool,
+        is_generator: bool = False,
+        buffer_size: int = DEFAULT_BATCH_SIZE,
     ):
         self.udf_data = udf_data
         self.catalog_init_params = catalog_init_params
@@ -172,12 +145,13 @@ class UDFDispatcher:
             self.warehouse_args,
             self.warehouse_kwargs,
         ) = warehouse_clone_params
-        self.is_generator = is_generator
+        self.udf_fields = udf_fields
         self.cache = cache
+        self.is_generator = is_generator
+        self.buffer_size = buffer_size
         self.catalog = None
         self.task_queue = None
         self.done_queue = None
-        self.buffer_size = buffer_size
         self.ctx = get_context("spawn")
 
     @property
@@ -226,6 +200,7 @@ class UDFDispatcher:
             self.done_queue,
             self.is_generator,
             self.cache,
+            self.udf_fields,
         )
 
     def _run_worker(self) -> None:
@@ -233,7 +208,11 @@ class UDFDispatcher:
             worker = self._create_worker()
             worker.run()
         except (Exception, KeyboardInterrupt) as e:
-            put_into_queue(self.done_queue, {"status": FAILED_STATUS, "exception": e})
+            if self.done_queue:
+                put_into_queue(
+                    self.done_queue,
+                    {"status": FAILED_STATUS, "exception": e},
+                )
             raise
 
     @staticmethod
@@ -249,7 +228,6 @@ class UDFDispatcher:
         self,
         input_rows,
         n_workers: Optional[int] = None,
-        cache: bool = False,
        input_queue=None,
         processed_cb: Callback = DEFAULT_CALLBACK,
         download_cb: Callback = DEFAULT_CALLBACK,
@@ -299,21 +277,24 @@ class UDFDispatcher:
             result = get_from_queue(self.done_queue)
             status = result["status"]
             if status == NOTIFY_STATUS:
-                download_cb.relative_update(result["downloaded"])
+                if downloaded := result.get("downloaded"):
+                    download_cb.relative_update(downloaded)
+                if processed := result.get("processed"):
+                    processed_cb.relative_update(processed)
             elif status == FINISHED_STATUS:
                 # Worker finished
                 n_workers -= 1
             elif status == OK_STATUS:
-                processed_cb.relative_update(result["processed"])
-                yield result["result"]
+                if processed := result.get("processed"):
+                    processed_cb.relative_update(processed)
+                yield msgpack_unpack(result["result"])
             else:  # Failed / error
                 n_workers -= 1
-                exc = result.get("exception")
-                if exc:
+                if exc := result.get("exception"):
                     raise exc
                 raise RuntimeError("Internal error: Parallel UDF execution failed")
 
-            if not streaming_mode and not input_finished:
+            if status == OK_STATUS and not streaming_mode and not input_finished:
                 try:
                     put_into_queue(self.task_queue, next(input_data))
                 except StopIteration:
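
The status loop above drains done_queue and branches on the "status" field of each message; the worker side (UDFWorker.run, further down) produces those messages. A minimal sketch of the message shapes the loop handles, assuming the new module shown later in this diff is importable as datachain.query.queue (as the import block above suggests); the payload values are illustrative only:

    from datachain.query.queue import msgpack_pack

    # Status strings correspond to OK_STATUS / NOTIFY_STATUS / FINISHED_STATUS / FAILED_STATUS.
    ok_msg = {"status": "OK", "result": msgpack_pack([[1, "a.txt"], [2, "b.txt"]])}
    notify_msg = {"status": "NOTIFY", "processed": 2, "downloaded": 1024}
    finished_msg = {"status": "FINISHED"}
    failed_msg = {"status": "FAILED", "exception": RuntimeError("worker error")}

OK results arrive msgpack-packed and are unpacked before being yielded; NOTIFY carries optional progress counters, which is why the loop now uses result.get(...) instead of indexing; and after this change only an OK message triggers queueing of the next input.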
@@ -348,7 +329,7 @@ class UDFDispatcher:
 
 
 class WorkerCallback(Callback):
-    def __init__(self, queue: multiprocess.Queue):
+    def __init__(self, queue: "multiprocess.Queue"):
         self.queue = queue
         super().__init__()
 
@@ -369,10 +350,11 @@ class ProcessedCallback(Callback):
 class UDFWorker:
     catalog: Catalog
     udf: UDFBase
-    task_queue: multiprocess.Queue
-    done_queue: multiprocess.Queue
+    task_queue: "multiprocess.Queue"
+    done_queue: "multiprocess.Queue"
     is_generator: bool
     cache: bool
+    udf_fields: Sequence[str]
     cb: Callback = attrs.field()
 
     @cb.default
@@ -382,7 +364,8 @@ class UDFWorker:
     def run(self) -> None:
         processed_cb = ProcessedCallback()
         udf_results = self.udf.run(
-            self.get_inputs(),
+            self.udf_fields,
+            unmarshal(self.get_inputs()),
             self.catalog,
             self.is_generator,
             self.cache,
@@ -390,15 +373,17 @@ class UDFWorker:
             processed_cb=processed_cb,
         )
         for udf_output in udf_results:
-            if isinstance(udf_output, GeneratorType):
-                udf_output = list(udf_output)  # can not pickle generator
+            for batch in batched_it(udf_output, DEFAULT_BATCH_SIZE):
+                put_into_queue(
+                    self.done_queue,
+                    {
+                        "status": OK_STATUS,
+                        "result": msgpack_pack(list(batch)),
+                    },
+                )
             put_into_queue(
                 self.done_queue,
-                {
-                    "status": OK_STATUS,
-                    "result": udf_output,
-                    "processed": processed_cb.processed_rows,
-                },
+                {"status": NOTIFY_STATUS, "processed": processed_cb.processed_rows},
             )
         put_into_queue(self.done_queue, {"status": FINISHED_STATUS})
 
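batched_it is imported from datachain.utils but its definition is not part of this diff; the worker uses it to split each UDF output into chunks of DEFAULT_BATCH_SIZE and msgpack-pack each chunk, instead of materializing and pickling a whole generator output at once. A rough sketch of what such a helper typically looks like, based on itertools.islice (an assumption about its behavior, not the actual datachain.utils implementation):

    from collections.abc import Iterable, Iterator
    from itertools import islice
    from typing import TypeVar

    T = TypeVar("T")

    def batched_it(iterable: Iterable[T], size: int) -> Iterator[Iterator[T]]:
        # Yield successive chunks of at most `size` items without materializing
        # the whole input; each chunk is itself an iterator.
        it = iter(iterable)
        while True:
            chunk = list(islice(it, size))
            if not chunk:
                return
            yield iter(chunk)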
 
@@ -0,0 +1,120 @@
+import datetime
+from collections.abc import Iterable, Iterator
+from queue import Empty, Full, Queue
+from struct import pack, unpack
+from time import sleep
+from typing import Any
+
+import msgpack
+
+from datachain.query.batch import RowsOutput, RowsOutputBatch
+
+DEFAULT_BATCH_SIZE = 10000
+STOP_SIGNAL = "STOP"
+OK_STATUS = "OK"
+FINISHED_STATUS = "FINISHED"
+FAILED_STATUS = "FAILED"
+NOTIFY_STATUS = "NOTIFY"
+
+
+# For more context on the get_from_queue and put_into_queue functions, see the
+# discussion here:
+# https://github.com/iterative/dvcx/pull/1297#issuecomment-2026308773
+# This problem is not exactly described by, but is also related to these Python issues:
+# https://github.com/python/cpython/issues/66587
+# https://github.com/python/cpython/issues/88628
+# https://github.com/python/cpython/issues/108645
+
+
+def get_from_queue(queue: Queue) -> Any:
+    """
+    Gets an item from a queue.
+    This is required to handle signals, such as KeyboardInterrupt exceptions
+    while waiting for items to be available, although only on certain installations.
+    (See the above comment for more context.)
+    """
+    while True:
+        try:
+            return queue.get_nowait()
+        except Empty:
+            sleep(0.01)
+
+
+def put_into_queue(queue: Queue, item: Any) -> None:
+    """
+    Puts an item into a queue.
+    This is required to handle signals, such as KeyboardInterrupt exceptions
+    while waiting for items to be queued, although only on certain installations.
+    (See the above comment for more context.)
+    """
+    while True:
+        try:
+            queue.put_nowait(item)
+            return
+        except Full:
+            sleep(0.01)
+
+
+MSGPACK_EXT_TYPE_DATETIME = 42
+MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH = 43
+
+
+def _msgpack_pack_extended_types(obj: Any) -> msgpack.ExtType:
+    if isinstance(obj, datetime.datetime):
+        # packing date object as 1 or 2 variables, depending if timezone info is present
+        # - timestamp
+        # - [OPTIONAL] timezone offset from utc in seconds if timezone info exists
+        if obj.tzinfo:
+            data = (obj.timestamp(), int(obj.utcoffset().total_seconds()))  # type: ignore  # noqa: PGH003
+            return msgpack.ExtType(MSGPACK_EXT_TYPE_DATETIME, pack("!dl", *data))
+        data = (obj.timestamp(),)  # type: ignore  # noqa: PGH003
+        return msgpack.ExtType(MSGPACK_EXT_TYPE_DATETIME, pack("!d", *data))
+
+    if isinstance(obj, RowsOutputBatch):
+        return msgpack.ExtType(
+            MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH,
+            msgpack_pack(obj.rows),
+        )
+
+    raise TypeError(f"Unknown type: {obj}")
+
+
+def msgpack_pack(obj: Any) -> bytes:
+    return msgpack.packb(obj, default=_msgpack_pack_extended_types)
+
+
+def _msgpack_unpack_extended_types(code: int, data: bytes) -> Any:
+    if code == MSGPACK_EXT_TYPE_DATETIME:
+        has_timezone = False
+        if len(data) == 8:
+            # we send only timestamp without timezone if data is 8 bytes
+            values = unpack("!d", data)
+        else:
+            has_timezone = True
+            values = unpack("!dl", data)
+
+        timestamp = values[0]
+        tz_info = None
+        if has_timezone:
+            timezone_offset = values[1]
+            tz_info = datetime.timezone(datetime.timedelta(seconds=timezone_offset))
+        return datetime.datetime.fromtimestamp(timestamp, tz=tz_info)
+
+    if code == MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH:
+        return RowsOutputBatch(msgpack_unpack(data))
+
+    return msgpack.ExtType(code, data)
+
+
+def msgpack_unpack(data: bytes) -> Any:
+    return msgpack.unpackb(data, ext_hook=_msgpack_unpack_extended_types)
+
+
+def marshal(obj: Iterator[RowsOutput]) -> Iterable[bytes]:
+    for row in obj:
+        yield msgpack_pack(row)
+
+
+def unmarshal(obj: Iterator[bytes]) -> Iterable[RowsOutput]:
+    for row in obj:
+        yield msgpack_unpack(row)
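
A short round-trip example for the serialization helpers above (assuming the new module is importable as datachain.query.queue, which is how the dispatch code imports it); the datetime and row values are illustrative:

    import datetime

    from datachain.query.queue import marshal, msgpack_pack, msgpack_unpack, unmarshal

    # Timezone-aware datetimes survive the custom ExtType round trip
    # (packed as a timestamp plus the UTC offset in seconds).
    tz = datetime.timezone(datetime.timedelta(hours=2))
    dt = datetime.datetime(2024, 7, 1, 12, 0, tzinfo=tz)
    assert msgpack_unpack(msgpack_pack(dt)) == dt

    # marshal/unmarshal wrap whole iterators of rows for the task and done queues.
    rows = [[1, "a.txt"], [2, "b.txt"]]
    packed = list(marshal(iter(rows)))        # each item is bytes
    restored = list(unmarshal(iter(packed)))  # [[1, "a.txt"], [2, "b.txt"]]
    assert restored == rows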
datachain/query/schema.py CHANGED
@@ -45,6 +45,10 @@ class Column(sa.ColumnClause, metaclass=ColumnMeta):
         """Search for matches using glob pattern matching."""
         return self.op("GLOB")(glob_str)
 
+    def regexp(self, regexp_str):
+        """Search for matches using regexp pattern matching."""
+        return self.op("REGEXP")(regexp_str)
+
 
 class UDFParameter(ABC):
     @abstractmethod
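
The new regexp method mirrors the glob helper directly above it, emitting a SQL REGEXP operator via SQLAlchemy's op(). A small usage sketch (the column name is hypothetical, and whether REGEXP is actually supported depends on the database backend):

    from datachain.query.schema import Column

    path = Column("path")              # hypothetical column name
    clause = path.regexp(r"\.jpe?g$")  # SQL expression: path REGEXP <pattern>
    # analogous to path.glob("*.jpg"), but with a regular expression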
datachain/query/udf.py CHANGED
@@ -15,7 +15,14 @@ from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 
 from datachain.dataset import RowDict
 
-from .batch import Batch, BatchingStrategy, NoBatching, Partition, RowBatch
+from .batch import (
+    Batch,
+    BatchingStrategy,
+    NoBatching,
+    Partition,
+    RowsOutputBatch,
+    UDFInputBatch,
+)
 from .schema import (
     UDFParameter,
     UDFParamSpec,
@@ -25,7 +32,7 @@ from .schema import (
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
 
-    from .batch import BatchingResult
+    from .batch import RowsOutput, UDFInput
 
 ColumnType = Any
 
@@ -107,7 +114,8 @@ class UDFBase:
 
     def run(
         self,
-        udf_inputs: "Iterable[BatchingResult]",
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[RowsOutput]",
         catalog: "Catalog",
         is_generator: bool,
         cache: bool,
@@ -115,15 +123,22 @@ class UDFBase:
         processed_cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterator[Iterable["UDFResult"]]:
         for batch in udf_inputs:
-            n_rows = len(batch.rows) if isinstance(batch, RowBatch) else 1
-            output = self.run_once(catalog, batch, is_generator, cache, cb=download_cb)
+            if isinstance(batch, RowsOutputBatch):
+                n_rows = len(batch.rows)
+                inputs: UDFInput = UDFInputBatch(
+                    [RowDict(zip(udf_fields, row)) for row in batch.rows]
+                )
+            else:
+                n_rows = 1
+                inputs = RowDict(zip(udf_fields, batch))
+            output = self.run_once(catalog, inputs, is_generator, cache, cb=download_cb)
             processed_cb.relative_update(n_rows)
             yield output
 
     def run_once(
         self,
         catalog: "Catalog",
-        arg: "BatchingResult",
+        arg: "UDFInput",
         is_generator: bool = False,
         cache: bool = False,
         cb: Callback = DEFAULT_CALLBACK,
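
With this change, UDFBase.run receives the column names (udf_fields) separately from the packed row values and rebuilds each per-row mapping by zipping them together; a RowsOutputBatch becomes a UDFInputBatch of RowDict objects built the same way. A minimal sketch of that mapping using a plain dict and made-up field names:

    udf_fields = ["sys__id", "file__path", "file__size"]  # illustrative field names
    row = (42, "data/cats/1.jpg", 2048)                   # one row of packed values

    # Equivalent to RowDict(zip(udf_fields, row)), modulo the RowDict type itself:
    assert dict(zip(udf_fields, row)) == {
        "sys__id": 42,
        "file__path": "data/cats/1.jpg",
        "file__size": 2048,
    }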
@@ -199,12 +214,12 @@ class UDFWrapper(UDFBase):
     def run_once(
         self,
         catalog: "Catalog",
-        arg: "BatchingResult",
+        arg: "UDFInput",
         is_generator: bool = False,
         cache: bool = False,
         cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterable[UDFResult]:
-        if isinstance(arg, RowBatch):
+        if isinstance(arg, UDFInputBatch):
             udf_inputs = [
                 self.bind_parameters(catalog, row, cache=cache, cb=cb)
                 for row in arg.rows
@@ -1,8 +1,10 @@
 from datachain.sql.types import (
+    DBDefaults,
     TypeConverter,
     TypeDefaults,
     TypeReadConverter,
     register_backend_types,
+    register_db_defaults,
     register_type_defaults,
     register_type_read_converters,
 )
@@ -18,5 +20,6 @@ def setup() -> None:
     register_backend_types("default", TypeConverter())
     register_type_read_converters("default", TypeReadConverter())
     register_type_defaults("default", TypeDefaults())
+    register_db_defaults("default", DBDefaults())
 
     setup_is_complete = True
@@ -22,8 +22,10 @@ from datachain.sql.sqlite.types import (
     register_type_converters,
 )
 from datachain.sql.types import (
+    DBDefaults,
     TypeDefaults,
     register_backend_types,
+    register_db_defaults,
     register_type_defaults,
     register_type_read_converters,
 )
@@ -66,6 +68,7 @@ def setup():
     register_backend_types("sqlite", SQLiteTypeConverter())
     register_type_read_converters("sqlite", SQLiteTypeReadConverter())
     register_type_defaults("sqlite", TypeDefaults())
+    register_db_defaults("sqlite", DBDefaults())
 
     compiles(sql_path.parent, "sqlite")(compile_path_parent)
     compiles(sql_path.name, "sqlite")(compile_path_name)