zoopipe-2026.1.20-cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. zoopipe/__init__.py +72 -0
  2. zoopipe/engines/__init__.py +4 -0
  3. zoopipe/engines/base.py +45 -0
  4. zoopipe/engines/dask.py +225 -0
  5. zoopipe/engines/local.py +215 -0
  6. zoopipe/engines/ray.py +252 -0
  7. zoopipe/hooks/__init__.py +4 -0
  8. zoopipe/hooks/base.py +70 -0
  9. zoopipe/hooks/sql.py +94 -0
  10. zoopipe/input_adapter/__init__.py +24 -0
  11. zoopipe/input_adapter/arrow.py +38 -0
  12. zoopipe/input_adapter/base.py +48 -0
  13. zoopipe/input_adapter/csv.py +144 -0
  14. zoopipe/input_adapter/duckdb.py +54 -0
  15. zoopipe/input_adapter/excel.py +51 -0
  16. zoopipe/input_adapter/json.py +73 -0
  17. zoopipe/input_adapter/kafka.py +39 -0
  18. zoopipe/input_adapter/parquet.py +85 -0
  19. zoopipe/input_adapter/pygen.py +37 -0
  20. zoopipe/input_adapter/sql.py +103 -0
  21. zoopipe/manager.py +211 -0
  22. zoopipe/output_adapter/__init__.py +23 -0
  23. zoopipe/output_adapter/arrow.py +50 -0
  24. zoopipe/output_adapter/base.py +41 -0
  25. zoopipe/output_adapter/csv.py +71 -0
  26. zoopipe/output_adapter/duckdb.py +46 -0
  27. zoopipe/output_adapter/excel.py +42 -0
  28. zoopipe/output_adapter/json.py +66 -0
  29. zoopipe/output_adapter/kafka.py +39 -0
  30. zoopipe/output_adapter/parquet.py +49 -0
  31. zoopipe/output_adapter/pygen.py +29 -0
  32. zoopipe/output_adapter/sql.py +43 -0
  33. zoopipe/pipe.py +263 -0
  34. zoopipe/protocols.py +37 -0
  35. zoopipe/py.typed +0 -0
  36. zoopipe/report.py +173 -0
  37. zoopipe/utils/__init__.py +0 -0
  38. zoopipe/utils/dependency.py +78 -0
  39. zoopipe/zoopipe_rust_core.abi3.so +0 -0
  40. zoopipe-2026.1.20.dist-info/METADATA +231 -0
  41. zoopipe-2026.1.20.dist-info/RECORD +43 -0
  42. zoopipe-2026.1.20.dist-info/WHEEL +4 -0
  43. zoopipe-2026.1.20.dist-info/licenses/LICENSE +21 -0
zoopipe/__init__.py ADDED
@@ -0,0 +1,72 @@
+ from zoopipe.engines import BaseEngine, MultiProcessEngine
+ from zoopipe.hooks.base import BaseHook, HookStore
+ from zoopipe.hooks.sql import SQLExpansionHook
+ from zoopipe.input_adapter.arrow import ArrowInputAdapter
+ from zoopipe.input_adapter.csv import CSVInputAdapter
+ from zoopipe.input_adapter.duckdb import DuckDBInputAdapter
+ from zoopipe.input_adapter.excel import ExcelInputAdapter
+ from zoopipe.input_adapter.json import JSONInputAdapter
+ from zoopipe.input_adapter.kafka import KafkaInputAdapter
+ from zoopipe.input_adapter.parquet import ParquetInputAdapter
+ from zoopipe.input_adapter.pygen import PyGeneratorInputAdapter
+ from zoopipe.input_adapter.sql import SQLInputAdapter, SQLPaginationInputAdapter
+ from zoopipe.manager import PipeManager
+ from zoopipe.output_adapter.arrow import ArrowOutputAdapter
+ from zoopipe.output_adapter.csv import CSVOutputAdapter
+ from zoopipe.output_adapter.duckdb import DuckDBOutputAdapter
+ from zoopipe.output_adapter.excel import ExcelOutputAdapter
+ from zoopipe.output_adapter.json import JSONOutputAdapter
+ from zoopipe.output_adapter.kafka import KafkaOutputAdapter
+ from zoopipe.output_adapter.parquet import ParquetOutputAdapter
+ from zoopipe.output_adapter.pygen import PyGeneratorOutputAdapter
+ from zoopipe.output_adapter.sql import SQLOutputAdapter
+ from zoopipe.pipe import Pipe
+ from zoopipe.protocols import InputAdapterProtocol, OutputAdapterProtocol
+ from zoopipe.report import (
+     EntryStatus,
+     EntryTypedDict,
+     FlowReport,
+     FlowStatus,
+     get_logger,
+ )
+ from zoopipe.zoopipe_rust_core import MultiThreadExecutor, SingleThreadExecutor
+
+ __all__ = [
+     "Pipe",
+     "PipeManager",
+     "BaseEngine",
+     "MultiProcessEngine",
+     "FlowReport",
+     "FlowStatus",
+     "BaseHook",
+     "HookStore",
+     "EntryStatus",
+     "EntryTypedDict",
+     "get_logger",
+     "SingleThreadExecutor",
+     "MultiThreadExecutor",
+     "SQLExpansionHook",
+     "InputAdapterProtocol",
+     "OutputAdapterProtocol",
+     # Input Adapters
+     "ArrowInputAdapter",
+     "CSVInputAdapter",
+     "DuckDBInputAdapter",
+     "ExcelInputAdapter",
+     "JSONInputAdapter",
+     "PyGeneratorInputAdapter",
+     "SQLInputAdapter",
+     "SQLPaginationInputAdapter",
+     "ParquetInputAdapter",
+     "KafkaInputAdapter",
+     # Output Adapters
+     "ArrowOutputAdapter",
+     "CSVOutputAdapter",
+     "DuckDBOutputAdapter",
+     "ExcelOutputAdapter",
+     "JSONOutputAdapter",
+     "PyGeneratorOutputAdapter",
+     "SQLOutputAdapter",
+     "ParquetOutputAdapter",
+     "KafkaOutputAdapter",
+ ]
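
The top-level __init__.py above flattens the public API, so downstream code imports everything from zoopipe directly rather than from the submodules. A minimal sketch of what that enables; only the imported names are confirmed by this diff, and the Pipe constructor arguments are hypothetical:

    # Hypothetical usage sketch: the import path is confirmed by
    # __init__.py above, but Pipe's keyword arguments are assumed.
    from zoopipe import CSVInputAdapter, JSONOutputAdapter, Pipe

    pipe = Pipe(
        input_adapter=CSVInputAdapter("users.csv"),      # assumed signature
        output_adapter=JSONOutputAdapter("users.json"),  # assumed signature
    )
    pipe.start(wait=True)  # Pipe.start(wait=...) is how the engines below invoke it
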
zoopipe/engines/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from zoopipe.engines.base import BaseEngine
+ from zoopipe.engines.local import MultiProcessEngine
+
+ __all__ = ["BaseEngine", "MultiProcessEngine"]
zoopipe/engines/base.py ADDED
@@ -0,0 +1,45 @@
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from zoopipe.pipe import Pipe
+     from zoopipe.report import FlowReport
+
+
+ class BaseEngine(ABC):
+     """
+     Abstract base class for ZooPipe execution engines.
+
+     Engines are responsible for the "Orchestration" layer of the pipeline,
+     deciding WHERE and HOW different pipe shards are executed
+     (locally, distributed, etc.).
+     """
+
+     @abstractmethod
+     def start(self, pipes: list[Pipe]) -> None:
+         """Execute the given list of pipes."""
+         pass
+
+     @abstractmethod
+     def wait(self, timeout: float | None = None) -> bool:
+         """Wait for execution to finish."""
+         pass
+
+     @abstractmethod
+     def shutdown(self, timeout: float = 5.0) -> None:
+         """Forcibly stop execution."""
+         pass
+
+     @property
+     @abstractmethod
+     def is_running(self) -> bool:
+         """Check if the engine is currently running."""
+         pass
+
+     @property
+     @abstractmethod
+     def report(self) -> FlowReport:
+         """Get an aggregated report of the current execution."""
+         pass
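
The contract above is five abstract members, so a third-party engine only has to implement start, wait, and shutdown plus the two read-only properties. A minimal sketch of a custom engine, assuming sequential in-process execution; the signatures come from engines/base.py, everything else is illustrative:

    # Sketch of a BaseEngine implementation that runs pipes inline.
    # Only the method signatures are taken from engines/base.py; the
    # execution strategy is not part of the package.
    from zoopipe import BaseEngine, FlowReport, Pipe

    class InlineEngine(BaseEngine):
        """Runs every pipe sequentially in the calling process."""

        def __init__(self) -> None:
            self._running = False
            self._report = FlowReport()  # constructed the same way by the bundled engines

        def start(self, pipes: list[Pipe]) -> None:
            self._running = True
            try:
                for pipe in pipes:
                    pipe.start(wait=True)  # blocking run, as DaskPipeWorker does below
            finally:
                self._running = False

        def wait(self, timeout: float | None = None) -> bool:
            return not self._running  # start() already blocked until completion

        def shutdown(self, timeout: float = 5.0) -> None:
            self._running = False

        @property
        def is_running(self) -> bool:
            return self._running

        @property
        def report(self) -> FlowReport:
            return self._report
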
zoopipe/engines/dask.py ADDED
@@ -0,0 +1,225 @@
+ from __future__ import annotations
+
+ import os
+ import re
+ from datetime import datetime
+ from importlib import metadata
+ from typing import TYPE_CHECKING, Any
+
+ from dask.distributed import Client, get_client
+
+ from zoopipe.engines.base import BaseEngine
+ from zoopipe.engines.local import PipeReport
+ from zoopipe.report import FlowReport, FlowStatus
+ from zoopipe.utils.dependency import install_dependencies as _install_dependencies
+
+ if TYPE_CHECKING:
+     from zoopipe.pipe import Pipe
+
+
+ class DaskPipeWorker:
+     """
+     Dask Worker that wraps a single Pipe execution.
+     Can be used as a Dask Actor for stateful reporting.
+     """
+
+     def __init__(self, pipe: Pipe, index: int):
+         self.pipe = pipe
+         self.index = index
+         self.is_finished = False
+         self.has_error = False
+
+     def run(self) -> None:
+         """Execute the pipe."""
+         try:
+             self.pipe.start(wait=True)
+         except Exception:
+             self.has_error = True
+         finally:
+             self.is_finished = True
+
+     def get_report(self) -> PipeReport:
+         """Get the current progress snapshot from the pipe."""
+         report = self.pipe.report
+         return PipeReport(
+             pipe_index=self.index,
+             total_processed=report.total_processed,
+             success_count=report.success_count,
+             error_count=report.error_count,
+             ram_bytes=report.ram_bytes,
+             is_finished=self.is_finished or report.is_finished,
+             has_error=self.has_error,
+             is_alive=not self.is_finished,
+         )
+
+
+ class DaskEngine(BaseEngine):
+     """
+     Distributed execution engine using Dask.
+     """
+
+     def __init__(self, address: str | None = None, **kwargs: Any):
+         try:
+             self.client = get_client(address) if address else get_client()
+         except (ValueError, RuntimeError):
+             # No client running, create one
+             self.client = Client(address=address, **kwargs)
+
+         # Prepare environment
+         self._prepare_runtime_env()
+
+         self._workers: list[Any] = []
+         self._futures: list[Any] = []
+         self._start_time: datetime | None = None
+         self._cached_report: FlowReport | None = None
+
+     def _prepare_runtime_env(self) -> None:
+         """
+         Configure the Dask workers based on whether we are in
+         development mode or being used as a library.
+         """
+         # 1. Detect environment and versions
+         is_dev_mode = False
+         try:
+             # heuristic: if we are in the zoopipe repo and have the ABI, it's dev mode
+             if (
+                 os.path.exists("src/zoopipe")
+                 and os.path.exists("pyproject.toml")
+                 and any(f.endswith(".so") for f in os.listdir("src/zoopipe"))
+             ):
+                 is_dev_mode = True
+         except Exception:
+             pass
+
+         # 2. Setup dependencies
+         deps = []
+         if is_dev_mode:
+             # Dev mode: Extract dependencies from pyproject.toml
+             try:
+                 with open("pyproject.toml", "r") as f:
+                     toml_content = f.read()
+                 match = re.search(
+                     r"dependencies\s*=\s*\[(.*?)\]", toml_content, re.DOTALL
+                 )
+                 if match:
+                     dep_block = match.group(1)
+                     deps = re.findall(r'["\'](.*?)["\']', dep_block)
+             except Exception:
+                 pass
+         else:
+             # User mode: install current zoopipe version
+             try:
+                 version = metadata.version("zoopipe")
+                 deps.append(f"zoopipe=={version}")
+             except metadata.PackageNotFoundError:
+                 deps = ["pydantic>=2.0"]
+
+         # Install dependencies on all workers
+         if deps:
+             try:
+                 unique_deps = list(set(deps))
+                 # _install_dependencies is defined at module level to be picklable
+                 self.client.run(_install_dependencies, unique_deps)
+             except Exception:
+                 pass
+
+         # 3. Handle local code path for dev mode
+         if is_dev_mode:
+             src_path = os.path.abspath("src")
+
+             def append_path(path: str):
+                 import sys
+
+                 if path not in sys.path:
+                     sys.path.append(path)
+
+             self.client.run(append_path, src_path)
+
+     def start(self, pipes: list[Pipe]) -> None:
+         if self.is_running:
+             raise RuntimeError("DaskEngine is already running")
+
+         self._start_time = datetime.now()
+
+         # 1. Submit Workers as Actors
+         # It is CRITICAL to use actor=True so they maintain state (live Pipe instance)
+         actor_futures = [
+             self.client.submit(DaskPipeWorker, pipe, i, actor=True)
+             for i, pipe in enumerate(pipes)
+         ]
+         self._workers = [f.result() for f in actor_futures]
+
+         # 2. Launch execution WITHOUT BLOCKING
+         self._futures = [worker.run() for worker in self._workers]
+
+         self._cached_report = None
+
+     def wait(self, timeout: float | None = None) -> bool:
+         if not self._futures:
+             return True
+
+         start = datetime.now()
+         while self.is_running:
+             if timeout and (datetime.now() - start).total_seconds() > timeout:
+                 return False
+             import time
+
+             time.sleep(0.1)
+         return True
+
+     def shutdown(self, timeout: float = 5.0) -> None:
+         # Dask actors don't have a direct 'kill'; they stay alive as long
+         # as the client/cluster is up or until they are garbage collected,
+         # but we can try to signal them if needed.
+         self._workers = []
+         self._futures = []
+         self._cached_report = None
+
+     @property
+     def is_running(self) -> bool:
+         if not self._futures:
+             return False
+
+         # In Dask, an actor future is running if it is not 'done'
+         return any(not f.done() for f in self._futures)
+
+     @property
+     def report(self) -> FlowReport:
+         if self._cached_report and self._cached_report.is_finished:
+             return self._cached_report
+
+         report = FlowReport()
+         report.start_time = self._start_time
+
+         p_reports = self.pipe_reports
+         for pr in p_reports:
+             report.total_processed += pr.total_processed
+             report.success_count += pr.success_count
+             report.error_count += pr.error_count
+             report.ram_bytes += pr.ram_bytes
+
+         all_finished = not self.is_running
+         any_error = any(pr.has_error for pr in p_reports)
+
+         if all_finished:
+             report.status = FlowStatus.FAILED if any_error else FlowStatus.COMPLETED
+             report.end_time = datetime.now()
+             report._finished_event.set()
+             self._cached_report = report
+         else:
+             report.status = FlowStatus.RUNNING
+
+         return report
+
+     @property
+     def pipe_reports(self) -> list[PipeReport]:
+         if not self._workers:
+             return []
+
+         # Get reports from actors
+         return [w.get_report().result() for w in self._workers]
+
+     def get_pipe_report(self, index: int) -> PipeReport:
+         if not self._workers:
+             raise RuntimeError("Engine has not been started")
+         return self._workers[index].get_report().result()
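
Note that DaskEngine is defined here but not re-exported from the package root: the top-level __all__ only lists BaseEngine and MultiProcessEngine, so it has to be imported from zoopipe.engines.dask. A usage sketch, assuming pipes is a pre-built list of Pipe objects and a reachable scheduler (the address is a placeholder):

    # Usage sketch; `pipes` and the scheduler address are assumed.
    from zoopipe.engines.dask import DaskEngine

    engine = DaskEngine(address="tcp://scheduler:8786")
    engine.start(pipes)               # one DaskPipeWorker actor per pipe
    while not engine.wait(timeout=5.0):
        snapshot = engine.report      # FlowReport aggregated across actors
        print(snapshot.total_processed, snapshot.error_count)
    engine.shutdown()
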
zoopipe/engines/local.py ADDED
@@ -0,0 +1,215 @@
+ from __future__ import annotations
+
+ import multiprocessing
+ from ctypes import c_int, c_longlong
+ from dataclasses import dataclass
+ from datetime import datetime
+ from multiprocessing.sharedctypes import Synchronized
+ from typing import TYPE_CHECKING
+
+ from zoopipe.engines.base import BaseEngine
+ from zoopipe.report import FlowReport, FlowStatus
+
+ if TYPE_CHECKING:
+     from zoopipe.pipe import Pipe
+
+
+ @dataclass
+ class PipeProcess:
+     """
+     Internal handle for a Pipe running in an isolated worker process.
+     """
+
+     process: multiprocessing.Process
+     total_processed: Synchronized[c_longlong]
+     success_count: Synchronized[c_longlong]
+     error_count: Synchronized[c_longlong]
+     ram_bytes: Synchronized[c_longlong]
+     is_finished: Synchronized[c_int]
+     has_error: Synchronized[c_int]
+     pipe_index: int = 0
+
+
+ @dataclass
+ class PipeReport:
+     """
+     Snapshot of the current status of a single managed pipe.
+     """
+
+     pipe_index: int
+     total_processed: int = 0
+     success_count: int = 0
+     error_count: int = 0
+     ram_bytes: int = 0
+     is_finished: bool = False
+     has_error: bool = False
+     is_alive: bool = True
+
+
+ def _run_pipe(
+     pipe: Pipe,
+     total_processed: Synchronized[c_longlong],
+     success_count: Synchronized[c_longlong],
+     error_count: Synchronized[c_longlong],
+     ram_bytes: Synchronized[c_longlong],
+     is_finished: Synchronized[c_int],
+     has_error: Synchronized[c_int],
+ ) -> None:
+     try:
+         pipe.start(wait=False)
+
+         while not pipe.report.is_finished:
+             total_processed.value = pipe.report.total_processed
+             success_count.value = pipe.report.success_count
+             error_count.value = pipe.report.error_count
+             ram_bytes.value = pipe.report.ram_bytes
+             pipe.report.wait(timeout=1)
+
+         total_processed.value = pipe.report.total_processed
+         success_count.value = pipe.report.success_count
+         error_count.value = pipe.report.error_count
+         ram_bytes.value = pipe.report.ram_bytes
+     except Exception:
+         has_error.value = 1
+     finally:
+         is_finished.value = 1
+
+
+ class MultiProcessEngine(BaseEngine):
+     """
+     Engine that executes pipes in multiple local processes.
+     """
+
+     def __init__(self):
+         self._pipe_processes: list[PipeProcess] = []
+         self._start_time: datetime | None = None
+         self._cached_report: FlowReport | None = None
+
+     def start(self, pipes: list[Pipe]) -> None:
+         if self.is_running:
+             raise RuntimeError("Engine is already running")
+
+         self._start_time = datetime.now()
+         self._pipe_processes.clear()
+         self._cached_report = None
+
+         for i, pipe in enumerate(pipes):
+             total_processed: Synchronized[c_longlong] = multiprocessing.Value(
+                 "q", 0, lock=False
+             )
+             success_count: Synchronized[c_longlong] = multiprocessing.Value(
+                 "q", 0, lock=False
+             )
+             error_count: Synchronized[c_longlong] = multiprocessing.Value(
+                 "q", 0, lock=False
+             )
+             ram_bytes: Synchronized[c_longlong] = multiprocessing.Value(
+                 "q", 0, lock=False
+             )
+             is_finished: Synchronized[c_int] = multiprocessing.Value("i", 0, lock=False)
+             has_error: Synchronized[c_int] = multiprocessing.Value("i", 0, lock=False)
+
+             process = multiprocessing.Process(
+                 target=_run_pipe,
+                 args=(
+                     pipe,
+                     total_processed,
+                     success_count,
+                     error_count,
+                     ram_bytes,
+                     is_finished,
+                     has_error,
+                 ),
+             )
+             process.start()
+
+             self._pipe_processes.append(
+                 PipeProcess(
+                     process=process,
+                     total_processed=total_processed,
+                     success_count=success_count,
+                     error_count=error_count,
+                     ram_bytes=ram_bytes,
+                     is_finished=is_finished,
+                     has_error=has_error,
+                     pipe_index=i,
+                 )
+             )
+
+     def wait(self, timeout: float | None = None) -> bool:
+         for pp in self._pipe_processes:
+             pp.process.join(timeout=timeout)
+         return all(not pp.process.is_alive() for pp in self._pipe_processes)
+
+     def shutdown(self, timeout: float = 5.0) -> None:
+         for pp in self._pipe_processes:
+             if pp.process.is_alive():
+                 pp.process.terminate()
+         for pp in self._pipe_processes:
+             pp.process.join(timeout=timeout)
+             if pp.process.is_alive():
+                 pp.process.kill()
+         self._pipe_processes.clear()
+
+     @property
+     def is_running(self) -> bool:
+         return bool(self._pipe_processes) and any(
+             pp.process.is_alive() for pp in self._pipe_processes
+         )
+
+     @property
+     def report(self) -> FlowReport:
+         if self._cached_report and self._cached_report.is_finished:
+             return self._cached_report
+
+         report = FlowReport()
+         report.start_time = self._start_time
+
+         for pp in self._pipe_processes:
+             report.total_processed += pp.total_processed.value
+             report.success_count += pp.success_count.value
+             report.error_count += pp.error_count.value
+             report.ram_bytes += pp.ram_bytes.value
+
+         all_finished = all(pp.is_finished.value == 1 for pp in self._pipe_processes)
+         any_error = any(pp.has_error.value == 1 for pp in self._pipe_processes)
+
+         if all_finished:
+             report.status = FlowStatus.FAILED if any_error else FlowStatus.COMPLETED
+             report.end_time = datetime.now()
+             report._finished_event.set()
+             self._cached_report = report
+         else:
+             report.status = FlowStatus.RUNNING
+
+         return report
+
+     @property
+     def pipe_reports(self) -> list[PipeReport]:
+         """Get reports for all managed pipes."""
+         return [self.get_pipe_report(i) for i in range(len(self._pipe_processes))]
+
+     def get_pipe_report(self, index: int) -> PipeReport:
+         if not self._pipe_processes:
+             raise RuntimeError("Engine has not been started")
+         pp = self._pipe_processes[index]
+         return PipeReport(
+             pipe_index=index,
+             total_processed=pp.total_processed.value,
+             success_count=pp.success_count.value,
+             error_count=pp.error_count.value,
+             ram_bytes=pp.ram_bytes.value,
+             is_finished=pp.is_finished.value == 1,
+             has_error=pp.has_error.value == 1,
+             is_alive=pp.process.is_alive(),
+         )
+
+
+ def _init_multiprocessing() -> None:
+     try:
+         multiprocessing.set_start_method("fork", force=True)
+     except RuntimeError:
+         pass
+
+
+ _init_multiprocessing()
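
Because every counter is allocated with lock=False, the parent process polls raw shared ctypes values rather than taking locks, which keeps per-pipe snapshots cheap. A usage sketch mirroring the Dask example, assuming pipes is a pre-built list of Pipe objects:

    # Usage sketch; `pipes` is assumed to be a pre-built list[Pipe].
    from zoopipe import MultiProcessEngine

    engine = MultiProcessEngine()
    engine.start(pipes)                    # one worker process per pipe
    while engine.is_running:
        for pr in engine.pipe_reports:     # lock-free PipeReport snapshots
            print(pr.pipe_index, pr.total_processed, pr.is_alive)
        engine.wait(timeout=1.0)           # short join per process, used as a crude poll
    final = engine.report                  # FlowStatus.COMPLETED or FAILED
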