zoopipe 2026.1.20__cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. zoopipe/__init__.py +72 -0
  2. zoopipe/engines/__init__.py +4 -0
  3. zoopipe/engines/base.py +45 -0
  4. zoopipe/engines/dask.py +225 -0
  5. zoopipe/engines/local.py +215 -0
  6. zoopipe/engines/ray.py +252 -0
  7. zoopipe/hooks/__init__.py +4 -0
  8. zoopipe/hooks/base.py +70 -0
  9. zoopipe/hooks/sql.py +94 -0
  10. zoopipe/input_adapter/__init__.py +24 -0
  11. zoopipe/input_adapter/arrow.py +38 -0
  12. zoopipe/input_adapter/base.py +48 -0
  13. zoopipe/input_adapter/csv.py +144 -0
  14. zoopipe/input_adapter/duckdb.py +54 -0
  15. zoopipe/input_adapter/excel.py +51 -0
  16. zoopipe/input_adapter/json.py +73 -0
  17. zoopipe/input_adapter/kafka.py +39 -0
  18. zoopipe/input_adapter/parquet.py +85 -0
  19. zoopipe/input_adapter/pygen.py +37 -0
  20. zoopipe/input_adapter/sql.py +103 -0
  21. zoopipe/manager.py +211 -0
  22. zoopipe/output_adapter/__init__.py +23 -0
  23. zoopipe/output_adapter/arrow.py +50 -0
  24. zoopipe/output_adapter/base.py +41 -0
  25. zoopipe/output_adapter/csv.py +71 -0
  26. zoopipe/output_adapter/duckdb.py +46 -0
  27. zoopipe/output_adapter/excel.py +42 -0
  28. zoopipe/output_adapter/json.py +66 -0
  29. zoopipe/output_adapter/kafka.py +39 -0
  30. zoopipe/output_adapter/parquet.py +49 -0
  31. zoopipe/output_adapter/pygen.py +29 -0
  32. zoopipe/output_adapter/sql.py +43 -0
  33. zoopipe/pipe.py +263 -0
  34. zoopipe/protocols.py +37 -0
  35. zoopipe/py.typed +0 -0
  36. zoopipe/report.py +173 -0
  37. zoopipe/utils/__init__.py +0 -0
  38. zoopipe/utils/dependency.py +78 -0
  39. zoopipe/zoopipe_rust_core.abi3.so +0 -0
  40. zoopipe-2026.1.20.dist-info/METADATA +231 -0
  41. zoopipe-2026.1.20.dist-info/RECORD +43 -0
  42. zoopipe-2026.1.20.dist-info/WHEEL +4 -0
  43. zoopipe-2026.1.20.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,50 @@
1
import pathlib
import typing

from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import ArrowWriter


class ArrowOutputAdapter(BaseOutputAdapter):
    """
    Writes data to disk in Apache Arrow IPC (feather) format.

    This adapter automatically handles parent directory creation and uses
    optimized Rust code for fast serialization.
    """

    def __init__(
        self,
        output: typing.Union[str, pathlib.Path],
    ):
        """
        Initialize the ArrowOutputAdapter.

        Args:
            output: destination file path (string or Path).
        """
        self.output_path = str(output)

    def split(self, workers: int) -> typing.List["ArrowOutputAdapter"]:
        """
        Split the output adapter into `workers` partitions.
        Generates filenames like `filename_part_1.arrow`.
        """
        base = pathlib.Path(self.output_path)
        return [
            self.__class__(
                output=str(base.parent / f"{base.stem}_part_{n}{base.suffix}")
            )
            for n in range(1, workers + 1)
        ]

    def get_native_writer(self) -> ArrowWriter:
        # Make sure the destination directory exists before the Rust
        # writer attempts to open the file.
        pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
        return ArrowWriter(self.output_path)


__all__ = ["ArrowOutputAdapter"]
@@ -0,0 +1,41 @@
1
import abc
import typing


class BaseOutputAdapter(abc.ABC):
    """
    Abstract base class for all output adapters.

    Output adapters bridge the pipeline results to external destinations.
    They provide the native Rust writer used by the execution core.
    """

    @property
    def can_split(self) -> bool:
        """Return True if this adapter supports parallel splitting."""
        # An adapter supports splitting exactly when its class overrides
        # the default split() implementation defined below.
        return type(self).split is not BaseOutputAdapter.split

    @abc.abstractmethod
    def get_native_writer(self) -> typing.Any:
        """
        Return the underlying Rust writer instance.

        The writer is responsible for serializing and persisting entries
        passed from the internal pipe buffer.
        """
        raise NotImplementedError

    def split(self, workers: int) -> typing.List["BaseOutputAdapter"]:
        """
        Split the output adapter into `workers` partitions for parallel writing.

        The base implementation does not partition: it returns the adapter
        itself as a single-element list.
        """
        return [self]

    def get_hooks(self) -> list[typing.Any]:
        """
        Return a list of hooks to be executed by the pipeline.

        Can be used for post-processing or cleaning up resources
        after the data has been written.
        """
        return []
@@ -0,0 +1,71 @@
1
import pathlib
import typing

from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import CSVWriter


class CSVOutputAdapter(BaseOutputAdapter):
    """
    Writes pipeline results to CSV files.

    Handles directory creation and uses a buffered writer in Rust to ensure
    high-throughput performance.
    """

    def __init__(
        self,
        output: typing.Union[str, pathlib.Path],
        delimiter: str = ",",
        quotechar: str = '"',
        fieldnames: list[str] | None = None,
    ):
        """
        Initialize the CSVOutputAdapter.

        Args:
            output: Path where the CSV file will be created.
            delimiter: Column separator.
            quotechar: Character used for quoting fields.
            fieldnames: Optional list of column names for the header.
        """
        self.output_path = str(output)
        self.delimiter = delimiter
        self.quotechar = quotechar
        self.fieldnames = fieldnames

    def split(self, workers: int) -> typing.List["CSVOutputAdapter"]:
        """
        Split the output adapter into `workers` partitions.
        Generates filenames like `filename_part_1.csv`.
        """
        base = pathlib.Path(self.output_path)
        return [
            self.__class__(
                output=str(base.parent / f"{base.stem}_part_{n}{base.suffix}"),
                delimiter=self.delimiter,
                quotechar=self.quotechar,
                fieldnames=self.fieldnames,
            )
            for n in range(1, workers + 1)
        ]

    def get_native_writer(self) -> CSVWriter:
        pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
        # The Rust writer takes single-byte codes for delimiter and quote,
        # hence the ord() conversions.
        return CSVWriter(
            self.output_path,
            delimiter=ord(self.delimiter),
            quote=ord(self.quotechar),
            fieldnames=self.fieldnames,
        )


__all__ = ["CSVOutputAdapter"]
@@ -0,0 +1,46 @@
1
import pathlib
import typing

from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import DuckDBWriter


class DuckDBOutputAdapter(BaseOutputAdapter):
    """
    Persists data into DuckDB database files.

    Supports replacing or appending to existing tables, leveraging DuckDB's
    transactional integrity and high-speed storage.
    """

    def __init__(
        self,
        output: typing.Union[str, pathlib.Path],
        table_name: str,
        mode: str = "replace",
    ):
        """
        Initialize the DuckDBOutputAdapter.

        Args:
            output: Path to the DuckDB database file.
            table_name: Name of the table to write to.
            mode: Write mode ('replace', 'append', or 'fail').

        Raises:
            ValueError: If `mode` is not one of the supported values.
        """
        self.output_path = str(output)
        self.table_name = table_name
        self.mode = mode

        if mode not in ("replace", "append", "fail"):
            raise ValueError("mode must be 'replace', 'append', or 'fail'")

    def get_native_writer(self) -> DuckDBWriter:
        # Create the database's directory up front so the Rust writer
        # can open (or create) the file without failing.
        db_file = pathlib.Path(self.output_path)
        db_file.parent.mkdir(parents=True, exist_ok=True)
        return DuckDBWriter(
            self.output_path,
            self.table_name,
            mode=self.mode,
        )


__all__ = ["DuckDBOutputAdapter"]
@@ -0,0 +1,42 @@
1
import pathlib
import typing

from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import ExcelWriter


class ExcelOutputAdapter(BaseOutputAdapter):
    """
    Creates Excel files (.xlsx) from pipeline entries.

    Provides a simple way to export processed data to spreadsheets, with
    support for custom worksheet names and column headers.
    """

    def __init__(
        self,
        path: typing.Union[str, pathlib.Path],
        sheet_name: typing.Optional[str] = None,
        fieldnames: typing.Optional[typing.List[str]] = None,
    ):
        """
        Initialize the ExcelOutputAdapter.

        Args:
            path: Path where the Excel file will be created.
            sheet_name: Optional name for the worksheet.
            fieldnames: Optional list of column names for the header.
        """
        self.path = str(path)
        self.sheet_name = sheet_name
        self.fieldnames = fieldnames

    def get_native_writer(self) -> ExcelWriter:
        # Consistency fix: the other file-based adapters (Arrow, CSV, JSON,
        # DuckDB) create the parent directory before handing the path to the
        # Rust writer; without this, writing to a new directory would fail.
        pathlib.Path(self.path).parent.mkdir(parents=True, exist_ok=True)
        return ExcelWriter(
            self.path,
            sheet_name=self.sheet_name,
            fieldnames=self.fieldnames,
        )


__all__ = ["ExcelOutputAdapter"]
@@ -0,0 +1,66 @@
1
import pathlib
import typing

from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import JSONWriter


class JSONOutputAdapter(BaseOutputAdapter):
    """
    Serializes data to JSON format, supporting both array and
    line-delimited (JSONL) outputs.

    Equipped with a fast Rust-powered serializer that can indent results or
    output them in a compact single-line per record format.
    """

    def __init__(
        self,
        output: typing.Union[str, pathlib.Path],
        format: str = "array",
        indent: int | None = None,
    ):
        """
        Initialize the JSONOutputAdapter.

        Args:
            output: Path where the JSON file will be created.
            format: JSON format ('array' for a single JSON array, or
                'lines' for JSONLines).
            indent: Optional indentation for pretty-printing.
        """
        self.output_path = str(output)
        self.format = format
        self.indent = indent

    def split(self, workers: int) -> typing.List["JSONOutputAdapter"]:
        """
        Split the output adapter into `workers` partitions.

        Partition numbering is 1-based and the original suffix is kept,
        e.g. `filename_part_1.json`.  (The previous docstring incorrectly
        claimed 0-based `filename_part_0.jsonl` names.)
        """
        path = pathlib.Path(self.output_path)
        stem = path.stem
        suffix = path.suffix
        parent = path.parent

        shards = []
        for i in range(workers):
            part_name = f"{stem}_part_{i + 1}{suffix}"
            part_path = parent / part_name
            shards.append(
                self.__class__(
                    output=str(part_path), format=self.format, indent=self.indent
                )
            )
        return shards

    def get_native_writer(self) -> JSONWriter:
        # Ensure the destination directory exists before opening the file.
        pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
        return JSONWriter(
            self.output_path,
            format=self.format,
            indent=self.indent,
        )


__all__ = ["JSONOutputAdapter"]
@@ -0,0 +1,39 @@
1
from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import KafkaWriter


class KafkaOutputAdapter(BaseOutputAdapter):
    """
    Produces messages to an Apache Kafka topic.

    Streams pipeline results into Kafka topics, with configurable
    acknowledgment settings ensuring reliable message delivery.
    """

    def __init__(
        self,
        uri: str,
        acks: int = 1,
        timeout: int = 30,
    ):
        """
        Kafka Output Adapter.

        Args:
            uri: Kafka URI (e.g., 'kafka://localhost:9092/topic')
            acks: Required acknowledgments as an integer: 0 (none),
                1 (leader only), or -1 ('all' replicas). Defaults to 1.
            timeout: Ack timeout in seconds.
        """
        self.uri = uri
        self.acks = acks
        self.timeout = timeout

    def get_native_writer(self) -> KafkaWriter:
        # The Rust producer handles connection setup from the URI.
        writer = KafkaWriter(
            self.uri,
            acks=self.acks,
            timeout=self.timeout,
        )
        return writer


__all__ = ["KafkaOutputAdapter"]
@@ -0,0 +1,49 @@
1
import pathlib
import typing

from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import ParquetWriter


class ParquetOutputAdapter(BaseOutputAdapter):
    """
    Writes data to Apache Parquet files.

    Provides highly efficient columnar storage using the Arrow ecosystem,
    making it ideal for large-scale analytical processing.
    """

    def __init__(
        self,
        path: typing.Union[str, pathlib.Path],
    ):
        """
        Initialize the ParquetOutputAdapter.

        Args:
            path: Path where the Parquet file will be created.
        """
        self.path = str(path)

    def split(self, workers: int) -> typing.List["ParquetOutputAdapter"]:
        """
        Split the output adapter into `workers` partitions.
        Generates filenames like `filename_part_1.parquet`.
        """
        path = pathlib.Path(self.path)
        stem = path.stem
        suffix = path.suffix
        parent = path.parent

        shards = []
        for i in range(workers):
            part_name = f"{stem}_part_{i + 1}{suffix}"
            part_path = parent / part_name
            shards.append(self.__class__(path=str(part_path)))
        return shards

    def get_native_writer(self) -> ParquetWriter:
        # Consistency fix: the other file-based adapters (Arrow, CSV, JSON,
        # DuckDB) create the parent directory first; without this, writing
        # into a not-yet-existing directory would fail.
        pathlib.Path(self.path).parent.mkdir(parents=True, exist_ok=True)
        return ParquetWriter(self.path)


__all__ = ["ParquetOutputAdapter"]
@@ -0,0 +1,29 @@
1
from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import PyGeneratorWriter


class PyGeneratorOutputAdapter(BaseOutputAdapter):
    """
    Exposes pipeline results as a Python generator.

    This adapter provides a bridge back to Python code, allowing you to
    iterate over the processed results as they become available.
    """

    def __init__(self, queue_size: int = 1000):
        """
        Initialize the PyGeneratorOutputAdapter.

        Args:
            queue_size: Buffer size for the internal queue.
        """
        # Created eagerly so iteration and the pipeline share one queue.
        self._writer = PyGeneratorWriter(queue_size=queue_size)

    def __iter__(self):
        # The Rust writer is itself the iterator over produced entries.
        return self._writer

    def get_native_writer(self) -> PyGeneratorWriter:
        return self._writer


__all__ = ["PyGeneratorOutputAdapter"]
@@ -0,0 +1,43 @@
1
from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import SQLWriter


class SQLOutputAdapter(BaseOutputAdapter):
    """
    Writes data into SQL databases via bulk inserts.

    Manages database transactions and performs batch insertions using
    optimized SQL writers in the Rust core.
    """

    def __init__(
        self,
        uri: str,
        table_name: str,
        mode: str = "replace",
        batch_size: int = 500,
    ):
        """
        Initialize the SQLOutputAdapter.

        Args:
            uri: Database URI.
            table_name: Name of the table to write to.
            mode: Write mode ('replace', 'append', or 'fail').
            batch_size: Number of records to insert per transaction.

        Raises:
            ValueError: If `mode` is not one of the supported values.
        """
        self.uri = uri
        self.table_name = table_name
        self.mode = mode
        self.batch_size = batch_size

        # Consistency fix: DuckDBOutputAdapter validates `mode` eagerly and
        # the docstring documents the same closed set here, but invalid
        # values previously slipped through to the Rust writer.
        if mode not in ("replace", "append", "fail"):
            raise ValueError("mode must be 'replace', 'append', or 'fail'")

    def get_native_writer(self) -> SQLWriter:
        return SQLWriter(
            self.uri,
            self.table_name,
            mode=self.mode,
            batch_size=self.batch_size,
        )


__all__ = ["SQLOutputAdapter"]