zoopipe-2026.1.20-cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. zoopipe/__init__.py +72 -0
  2. zoopipe/engines/__init__.py +4 -0
  3. zoopipe/engines/base.py +45 -0
  4. zoopipe/engines/dask.py +225 -0
  5. zoopipe/engines/local.py +215 -0
  6. zoopipe/engines/ray.py +252 -0
  7. zoopipe/hooks/__init__.py +4 -0
  8. zoopipe/hooks/base.py +70 -0
  9. zoopipe/hooks/sql.py +94 -0
  10. zoopipe/input_adapter/__init__.py +24 -0
  11. zoopipe/input_adapter/arrow.py +38 -0
  12. zoopipe/input_adapter/base.py +48 -0
  13. zoopipe/input_adapter/csv.py +144 -0
  14. zoopipe/input_adapter/duckdb.py +54 -0
  15. zoopipe/input_adapter/excel.py +51 -0
  16. zoopipe/input_adapter/json.py +73 -0
  17. zoopipe/input_adapter/kafka.py +39 -0
  18. zoopipe/input_adapter/parquet.py +85 -0
  19. zoopipe/input_adapter/pygen.py +37 -0
  20. zoopipe/input_adapter/sql.py +103 -0
  21. zoopipe/manager.py +211 -0
  22. zoopipe/output_adapter/__init__.py +23 -0
  23. zoopipe/output_adapter/arrow.py +50 -0
  24. zoopipe/output_adapter/base.py +41 -0
  25. zoopipe/output_adapter/csv.py +71 -0
  26. zoopipe/output_adapter/duckdb.py +46 -0
  27. zoopipe/output_adapter/excel.py +42 -0
  28. zoopipe/output_adapter/json.py +66 -0
  29. zoopipe/output_adapter/kafka.py +39 -0
  30. zoopipe/output_adapter/parquet.py +49 -0
  31. zoopipe/output_adapter/pygen.py +29 -0
  32. zoopipe/output_adapter/sql.py +43 -0
  33. zoopipe/pipe.py +263 -0
  34. zoopipe/protocols.py +37 -0
  35. zoopipe/py.typed +0 -0
  36. zoopipe/report.py +173 -0
  37. zoopipe/utils/__init__.py +0 -0
  38. zoopipe/utils/dependency.py +78 -0
  39. zoopipe/zoopipe_rust_core.abi3.so +0 -0
  40. zoopipe-2026.1.20.dist-info/METADATA +231 -0
  41. zoopipe-2026.1.20.dist-info/RECORD +43 -0
  42. zoopipe-2026.1.20.dist-info/WHEEL +4 -0
  43. zoopipe-2026.1.20.dist-info/licenses/LICENSE +21 -0
zoopipe/pipe.py ADDED
@@ -0,0 +1,263 @@
+ import logging
+ import threading
+
+ from pydantic import TypeAdapter, ValidationError
+
+ from zoopipe.hooks.base import BaseHook, HookStore
+ from zoopipe.protocols import InputAdapterProtocol, OutputAdapterProtocol
+ from zoopipe.report import EntryStatus, FlowReport, get_logger
+ from zoopipe.zoopipe_rust_core import (
+     MultiThreadExecutor,
+     NativePipe,
+     SingleThreadExecutor,
+ )
+
+
+ class Pipe:
+     """
+     The main execution unit for data processing pipelines.
+
+     A Pipe connects an input adapter to one or more output adapters,
+     handles validation via Pydantic models, and executes pre- and post-validation hooks.
+
+     By default, a Pipe executes sequentially. For parallel execution across
+     multiple cores or processes, it is recommended to use `PipeManager`.
+     """
+
+     def __init__(
+         self,
+         input_adapter: InputAdapterProtocol | None = None,
+         output_adapter: OutputAdapterProtocol | None = None,
+         error_output_adapter: OutputAdapterProtocol | None = None,
+         schema_model: type | None = None,
+         pre_validation_hooks: list[BaseHook] | None = None,
+         post_validation_hooks: list[BaseHook] | None = None,
+         logger: logging.Logger | None = None,
+         report_update_interval: int = 1,
+         executor: SingleThreadExecutor | MultiThreadExecutor | None = None,
+     ) -> None:
+         """
+         Initialize a new Pipe.
+
+         Args:
+             input_adapter: Source of data.
+             output_adapter: Destination for successfully validated data.
+             error_output_adapter: Optional destination for data that failed validation.
+             schema_model: Optional Pydantic model class for validation.
+             pre_validation_hooks: Hooks to run before validation.
+             post_validation_hooks: Hooks to run after validation.
+             logger: Optional custom logger.
+             report_update_interval: How often (in batches) to update the
+                 progress report.
+             executor: Strategy for batch processing. Defaults to SingleThreadExecutor.
+                 For advanced parallel execution, use `PipeManager`.
+         """
+         self.input_adapter = input_adapter
+         self.output_adapter = output_adapter
+         self.error_output_adapter = error_output_adapter
+         self.schema_model = schema_model
+
+         bundled_pre_hooks = []
+         if self.input_adapter and hasattr(self.input_adapter, "get_hooks"):
+             bundled_pre_hooks.extend(self.input_adapter.get_hooks())
+
+         bundled_post_hooks = []
+         if self.output_adapter and hasattr(self.output_adapter, "get_hooks"):
+             bundled_post_hooks.extend(self.output_adapter.get_hooks())
+         if self.error_output_adapter and hasattr(
+             self.error_output_adapter, "get_hooks"
+         ):
+             bundled_post_hooks.extend(self.error_output_adapter.get_hooks())
+
+         self.pre_validation_hooks = bundled_pre_hooks + (pre_validation_hooks or [])
+         self.post_validation_hooks = bundled_post_hooks + (post_validation_hooks or [])
+
+         self.logger = logger or get_logger()
+
+         self.report_update_interval = report_update_interval
+         self.executor = executor or SingleThreadExecutor()
+
+         self._report = FlowReport()
+         self._thread: threading.Thread | None = None
+         self._store: HookStore = {}
+         self._validator = TypeAdapter(self.schema_model) if self.schema_model else None
+         self._batch_validator = (
+             TypeAdapter(list[self.schema_model]) if self.schema_model else None
+         )
+         self._status_validated = EntryStatus.VALIDATED
+         self._status_failed = EntryStatus.FAILED
+
+     def _process_batch(self, entries: list[dict]) -> list[dict]:
+         local_store: HookStore = {}
+
+         for hook in self.pre_validation_hooks:
+             entries = hook.execute(entries, local_store)
+
+         if self._validator:
+             self._validate_batch(entries)
+
+         for hook in self.post_validation_hooks:
+             entries = hook.execute(entries, local_store)
+
+         return entries
+
+     def _validate_batch(self, entries: list[dict]) -> None:
+         try:
+             raw_data_list = [e["raw_data"] for e in entries]
+             validated_list = self._batch_validator.validate_python(raw_data_list)
+             for entry, processed in zip(entries, validated_list):
+                 entry["validated_data"] = processed.model_dump()
+                 entry["status"] = self._status_validated
+         except ValidationError as e:
+             for error in e.errors():
+                 entry_index = error["loc"][0]
+                 entry = entries[entry_index]
+                 entry["status"] = self._status_failed
+                 entry["errors"].append({"msg": str(error), "type": "validation_error"})
+
+     @property
+     def report(self) -> FlowReport:
+         """Get the current progress report of the pipeline."""
+         return self._report
+
+     def start(self, wait: bool = False) -> None:
+         """
+         Start the pipeline execution in a separate thread.
+
+         Args:
+             wait: If True, blocks until the pipeline finishes.
+         """
+         if self._thread and self._thread.is_alive():
+             raise RuntimeError("Pipe is already running")
+
+         reader = self.input_adapter.get_native_reader()
+         writer = self.output_adapter.get_native_writer()
+         error_writer = None
+         if self.error_output_adapter:
+             error_writer = self.error_output_adapter.get_native_writer()
+
+         native_pipe = NativePipe(
+             reader=reader,
+             writer=writer,
+             error_writer=error_writer,
+             batch_processor=self._process_batch,
+             report=self._report,
+             report_update_interval=self.report_update_interval,
+             executor=self.executor,
+         )
+
+         self._thread = threading.Thread(
+             target=self._run_native,
+             args=(native_pipe,),
+             daemon=False,
+         )
+         self._thread.start()
+
+         if wait:
+             self.wait()
+
+     def _run_native(self, native_pipe: NativePipe) -> None:
+         try:
+             for hook in self.pre_validation_hooks:
+                 hook.setup(self._store)
+             for hook in self.post_validation_hooks:
+                 hook.setup(self._store)
+
+             native_pipe.run()
+         except Exception as e:
+             self.logger.error(f"Pipeline execution failed: {e}")
+             self._report._mark_failed(e)
+             raise
+         finally:
+             for hook in self.pre_validation_hooks:
+                 hook.teardown(self._store)
+             for hook in self.post_validation_hooks:
+                 hook.teardown(self._store)
+
+     def shutdown(self, timeout: float = 5.0) -> None:
+         """
+         Request the pipeline to stop and wait for it to finish.
+
+         Args:
+             timeout: Maximum time to wait for the thread to join.
+         """
+         self._report.abort()
+         if self._thread and self._thread.is_alive():
+             self._thread.join(timeout=timeout)
+             if self._thread.is_alive():
+                 self.logger.warning(
+                     "Pipeline thread did not finish cleanly within timeout"
+                 )
+
+     def wait(self, timeout: float | None = None) -> bool:
+         """
+         Wait for the pipeline to finish.
+
+         Args:
+             timeout: Optional timeout in seconds.
+
+         Returns:
+             True if the pipeline finished, False if it timed out.
+         """
+         return self._report.wait(timeout)
+
+     def __enter__(self) -> "Pipe":
+         self.start()
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+         if not self._report.is_finished:
+             self.shutdown()
+
+         if self._thread and self._thread.is_alive():
+             self._thread.join(timeout=10.0)
+             if self._thread.is_alive():
+                 self.logger.warning("Pipeline thread still running after context exit")
+
+     def __repr__(self) -> str:
+         return f"<Pipe input={self.input_adapter} output={self.output_adapter}>"
+
+     def __getstate__(self) -> dict:
+         """Serialize the pipe state, handling non-picklable Rust objects."""
+         state = self.__dict__.copy()
+
+         # Handle executor serialization
+         executor = state["executor"]
+         exec_config = {
+             "class_name": executor.__class__.__name__,
+             "batch_size": executor.get_batch_size(),
+         }
+         # MultiThreadExecutor-specific settings are not exposed via properties,
+         # so __setstate__ reconstructs the executor from its class name and
+         # batch size and relies on the constructor's defaults for the rest.
+         state["executor_config"] = exec_config
+         del state["executor"]
+
+         # Internal non-serializable objects
+         state["_thread"] = None
+         state["_validator"] = None
+         state["_batch_validator"] = None
+
+         return state
+
+     def __setstate__(self, state: dict) -> None:
+         """Restore the pipe state and reconstruct non-picklable objects."""
+         exec_config = state.pop("executor_config")
+
+         class_name = exec_config["class_name"]
+         batch_size = exec_config["batch_size"]
+
+         if class_name == "MultiThreadExecutor":
+             state["executor"] = MultiThreadExecutor(batch_size=batch_size)
+         else:
+             state["executor"] = SingleThreadExecutor(batch_size=batch_size)
+
+         self.__dict__.update(state)
+
+         # Reconstruct validators
+         self._validator = TypeAdapter(self.schema_model) if self.schema_model else None
+         self._batch_validator = (
+             TypeAdapter(list[self.schema_model]) if self.schema_model else None
+         )
+
+
+ __all__ = ["Pipe", "SingleThreadExecutor", "MultiThreadExecutor"]
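
A minimal usage sketch of the Pipe class above, assuming hypothetical adapter class names and file paths (the wheel ships csv and json adapter modules, but their exact class names are not visible in this diff):

    from pydantic import BaseModel

    from zoopipe.pipe import Pipe
    from zoopipe.input_adapter.csv import CSVInputAdapter      # assumed name
    from zoopipe.output_adapter.json import JSONOutputAdapter  # assumed name

    class User(BaseModel):
        name: str
        age: int

    # Validate CSV rows against User and write the survivors as JSON.
    pipe = Pipe(
        input_adapter=CSVInputAdapter("users.csv"),      # hypothetical path
        output_adapter=JSONOutputAdapter("users.json"),  # hypothetical path
        schema_model=User,
    )
    pipe.start(wait=True)  # runs in a background thread; wait=True blocks
    print(pipe.report)     # FlowReport: counts, throughput, duration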
zoopipe/protocols.py ADDED
@@ -0,0 +1,37 @@
+ from typing import Any, Protocol, runtime_checkable
+
+
+ @runtime_checkable
+ class InputAdapterProtocol(Protocol):
+     """
+     Protocol defining the minimal interface for a pipeline source.
+
+     Any object implementing this protocol can be used as the input
+     source for a Pipe.
+     """
+
+     def get_native_reader(self) -> Any:
+         """Returns the Rust-level reader."""
+         ...
+
+     def get_hooks(self) -> list[Any]:
+         """Returns optional hooks for data expansion or pre-processing."""
+         ...
+
+
+ @runtime_checkable
+ class OutputAdapterProtocol(Protocol):
+     """
+     Protocol defining the minimal interface for a pipeline destination.
+
+     Any object implementing this protocol can be used as the output
+     target for a Pipe.
+     """
+
+     def get_native_writer(self) -> Any:
+         """Returns the Rust-level writer."""
+         ...
+
+     def get_hooks(self) -> list[Any]:
+         """Returns optional hooks for cleanup or post-processing."""
+         ...
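
Because both protocols are runtime_checkable and purely structural, any object with matching method names satisfies them without inheriting from a base class. A sketch of the contract (the stub reader is an assumption; a real adapter would return an object from zoopipe_rust_core):

    from typing import Any

    from zoopipe.protocols import InputAdapterProtocol

    class MyInputAdapter:
        # Structural match: no base class required.
        def get_native_reader(self) -> Any:
            raise NotImplementedError  # a real adapter returns a Rust-level reader

        def get_hooks(self) -> list[Any]:
            return []  # no pre-processing hooks

    # runtime_checkable protocols support isinstance(), which checks method presence.
    assert isinstance(MyInputAdapter(), InputAdapterProtocol)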
zoopipe/py.typed ADDED
File without changes
zoopipe/report.py ADDED
@@ -0,0 +1,173 @@
+ import enum
+ import logging
+ import sys
+ import threading
+ import typing
+ from datetime import datetime
+
+
+ def get_logger(name: str = "zoopipe") -> logging.Logger:
+     """
+     Get a configured logger for zoopipe.
+
+     Args:
+         name: Name of the logger to retrieve.
+     """
+     logger = logging.getLogger(name)
+     if not logger.handlers:
+         handler = logging.StreamHandler(sys.stdout)
+         handler.setFormatter(
+             logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+         )
+         logger.addHandler(handler)
+         logger.setLevel(logging.INFO)
+     return logger
+
+
+ class EntryStatus(enum.Enum):
+     """
+     Status of an individual data entry in the pipeline lifecycle.
+
+     - PENDING: Initial state after ingestion.
+     - VALIDATED: Successfully passed schema validation.
+     - FAILED: Encountered validation errors or processing issues.
+     """
+
+     PENDING = "pending"
+     VALIDATED = "validated"
+     FAILED = "failed"
+
+
+ class EntryTypedDict(typing.TypedDict):
+     """
+     Structure of the record envelope as it flows through the pipeline.
+
+     The envelope contains not only the actual business data but also
+     operational metadata, unique identification, and error tracking.
+     """
+
+     id: typing.Any
+     position: int | None
+     status: EntryStatus
+     raw_data: dict[str, typing.Any]
+     validated_data: dict[str, typing.Any] | None
+     errors: list[dict[str, typing.Any]]
+     metadata: dict[str, typing.Any]
+
+
+ class FlowStatus(enum.Enum):
+     """
+     Lifecycle status of a Pipe or PipeManager execution.
+
+     - PENDING: Execution hasn't started yet.
+     - RUNNING: Actively processing batches.
+     - COMPLETED: Finished successfully (all source data consumed).
+     - FAILED: Partially finished due to an unhandled exception.
+     - ABORTED: Stopped manually by the user.
+     """
+
+     PENDING = "pending"
+     RUNNING = "running"
+     COMPLETED = "completed"
+     FAILED = "failed"
+     ABORTED = "aborted"
+
+
+ class FlowReport:
+     """
+     Live progress tracker and final summary for a pipeline execution.
+
+     FlowReport provides real-time access to processing metrics,
+     memory usage, and execution status. It is automatically updated
+     by the Rust core during execution.
+     """
+
+     def __init__(self) -> None:
+         """Initialize an empty FlowReport."""
+         self.status = FlowStatus.PENDING
+         self.total_processed = 0
+         self.success_count = 0
+         self.error_count = 0
+         self.ram_bytes = 0
+         self.exception: Exception | None = None
+         self.start_time: datetime | None = None
+         self.end_time: datetime | None = None
+         self._finished_event = threading.Event()
+
+     @property
+     def duration(self) -> float:
+         """Total execution time in seconds."""
+         start = self.start_time
+         if not start:
+             return 0.0
+         end = self.end_time or datetime.now()
+         return (end - start).total_seconds()
+
+     @property
+     def items_per_second(self) -> float:
+         """Processing speed (items per second)."""
+         duration = self.duration
+         if duration == 0:
+             return 0.0
+         return self.total_processed / duration
+
+     @property
+     def is_finished(self) -> bool:
+         """Check if the pipeline has finished."""
+         return self._finished_event.is_set()
+
+     def wait(self, timeout: float | None = None) -> bool:
+         """
+         Wait for the pipeline to finish.
+
+         Args:
+             timeout: Optional timeout in seconds.
+
+         Returns:
+             True if the pipeline finished, False if it timed out.
+         """
+         return self._finished_event.wait(timeout)
+
+     def _mark_running(self) -> None:
+         self.status = FlowStatus.RUNNING
+         self.start_time = datetime.now()
+
+     def _mark_completed(self) -> None:
+         self.status = FlowStatus.COMPLETED
+         self.end_time = datetime.now()
+         self._finished_event.set()
+
+     def abort(self) -> None:
+         """Abort the pipeline execution."""
+         self.status = FlowStatus.ABORTED
+         self.end_time = datetime.now()
+         self._finished_event.set()
+
+     def _mark_failed(self, exception: Exception) -> None:
+         self.status = FlowStatus.FAILED
+         self.exception = exception
+         self.end_time = datetime.now()
+         self._finished_event.set()
+
+     def __repr__(self) -> str:
+         return (
+             f"<FlowReport status={self.status.value} "
+             f"processed={self.total_processed} "
+             f"success={self.success_count} "
+             f"error={self.error_count} "
+             f"ram={self.ram_bytes / 1024 / 1024:.2f}MB "
+             f"fps={self.items_per_second:.2f} "
+             f"duration={self.duration:.2f}s>"
+         )
+
+     def __getstate__(self) -> dict:
+         """Serialize the report state, excluding non-picklable lock objects."""
+         state = self.__dict__.copy()
+         del state["_finished_event"]
+         return state
+
+     def __setstate__(self, state: dict) -> None:
+         """Restore the report state and reconstruct the event lock."""
+         self.__dict__.update(state)
+         self._finished_event = threading.Event()
+         if self.status in (FlowStatus.COMPLETED, FlowStatus.FAILED, FlowStatus.ABORTED):
+             self._finished_event.set()
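
Since FlowReport.wait returns False on timeout, a caller can poll the live report from its own thread while the Rust core updates it. A sketch, assuming a pipe built as in the pipe.py example above:

    pipe.start()
    while not pipe.report.wait(timeout=1.0):  # False means still running
        r = pipe.report
        print(
            f"{r.status.value}: {r.total_processed} processed, "
            f"{r.error_count} errors, {r.items_per_second:.0f}/s"
        )
    print(pipe.report)  # final summary via __repr__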
zoopipe/utils/__init__.py ADDED
File without changes
zoopipe/utils/dependency.py ADDED
@@ -0,0 +1,78 @@
+ import importlib.util
+ import shutil
+ import subprocess
+ import sys
+
+
+ def _try_env_install_with_pip(packages: list[str]) -> bool:
+     """
+     Try to install packages using the standard pip module.
+     """
+     if importlib.util.find_spec("pip") is None:
+         return False
+
+     try:
+         subprocess.check_call(
+             [sys.executable, "-m", "pip", "install", *packages],
+             stdout=subprocess.DEVNULL,
+             stderr=subprocess.DEVNULL,
+         )
+         return True
+     except (subprocess.CalledProcessError, OSError):
+         return False
+
+
+ def _try_env_install_with_uv(packages: list[str]) -> bool:
+     """
+     Try to install packages using 'uv pip install'.
+     """
+     uv_path = shutil.which("uv")
+     if not uv_path:
+         return False
+
+     try:
+         subprocess.check_call(
+             [uv_path, "pip", "install", *packages],
+             stdout=subprocess.DEVNULL,
+             stderr=subprocess.DEVNULL,
+         )
+         return True
+     except (subprocess.CalledProcessError, OSError):
+         return False
+
+
+ def _try_env_install_with_poetry(packages: list[str]) -> bool:
+     """
+     Try to install packages using 'poetry run pip install'.
+     """
+     poetry_path = shutil.which("poetry")
+     if not poetry_path:
+         return False
+
+     try:
+         # Assumes the current process is running inside a poetry environment
+         subprocess.check_call(
+             [poetry_path, "run", "pip", "install", *packages],
+             stdout=subprocess.DEVNULL,
+             stderr=subprocess.DEVNULL,
+         )
+         return True
+     except (subprocess.CalledProcessError, OSError):
+         return False
+
+
+ def install_dependencies(packages: list[str]) -> None:
+     """
+     Install dependencies using whichever package manager is available,
+     trying strategies in order: pip -> uv -> poetry.
+     If all fail, it does nothing (assuming manual provisioning).
+     """
+     if not packages:
+         return
+
+     if _try_env_install_with_pip(packages):
+         return
+     if _try_env_install_with_uv(packages):
+         return
+     if _try_env_install_with_poetry(packages):
+         return
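
install_dependencies is deliberately best-effort: every strategy swallows its errors and returns a bool, and the final fall-through does nothing, so a subsequent import is the only reliable check. A sketch of guarding an optional import with it (openpyxl as the example package is an assumption, not taken from this diff):

    from zoopipe.utils.dependency import install_dependencies

    # Best-effort provisioning of an optional extra before importing it.
    install_dependencies(["openpyxl"])  # assumed optional dependency
    try:
        import openpyxl  # noqa: F401
    except ImportError as exc:
        # install_dependencies never raises, so the import is the real check.
        raise RuntimeError("openpyxl is required; install it manually") from exc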
zoopipe/zoopipe_rust_core.abi3.so ADDED
Binary file