zoopipe 2026.1.20__cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. zoopipe/__init__.py +72 -0
  2. zoopipe/engines/__init__.py +4 -0
  3. zoopipe/engines/base.py +45 -0
  4. zoopipe/engines/dask.py +225 -0
  5. zoopipe/engines/local.py +215 -0
  6. zoopipe/engines/ray.py +252 -0
  7. zoopipe/hooks/__init__.py +4 -0
  8. zoopipe/hooks/base.py +70 -0
  9. zoopipe/hooks/sql.py +94 -0
  10. zoopipe/input_adapter/__init__.py +24 -0
  11. zoopipe/input_adapter/arrow.py +38 -0
  12. zoopipe/input_adapter/base.py +48 -0
  13. zoopipe/input_adapter/csv.py +144 -0
  14. zoopipe/input_adapter/duckdb.py +54 -0
  15. zoopipe/input_adapter/excel.py +51 -0
  16. zoopipe/input_adapter/json.py +73 -0
  17. zoopipe/input_adapter/kafka.py +39 -0
  18. zoopipe/input_adapter/parquet.py +85 -0
  19. zoopipe/input_adapter/pygen.py +37 -0
  20. zoopipe/input_adapter/sql.py +103 -0
  21. zoopipe/manager.py +211 -0
  22. zoopipe/output_adapter/__init__.py +23 -0
  23. zoopipe/output_adapter/arrow.py +50 -0
  24. zoopipe/output_adapter/base.py +41 -0
  25. zoopipe/output_adapter/csv.py +71 -0
  26. zoopipe/output_adapter/duckdb.py +46 -0
  27. zoopipe/output_adapter/excel.py +42 -0
  28. zoopipe/output_adapter/json.py +66 -0
  29. zoopipe/output_adapter/kafka.py +39 -0
  30. zoopipe/output_adapter/parquet.py +49 -0
  31. zoopipe/output_adapter/pygen.py +29 -0
  32. zoopipe/output_adapter/sql.py +43 -0
  33. zoopipe/pipe.py +263 -0
  34. zoopipe/protocols.py +37 -0
  35. zoopipe/py.typed +0 -0
  36. zoopipe/report.py +173 -0
  37. zoopipe/utils/__init__.py +0 -0
  38. zoopipe/utils/dependency.py +78 -0
  39. zoopipe/zoopipe_rust_core.abi3.so +0 -0
  40. zoopipe-2026.1.20.dist-info/METADATA +231 -0
  41. zoopipe-2026.1.20.dist-info/RECORD +43 -0
  42. zoopipe-2026.1.20.dist-info/WHEEL +4 -0
  43. zoopipe-2026.1.20.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,50 @@
1
import pathlib
import typing

from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import ArrowWriter


class ArrowOutputAdapter(BaseOutputAdapter):
    """
    Writes data to disk in Apache Arrow IPC (feather) format.

    This adapter automatically handles parent directory creation and uses
    optimized Rust code for fast serialization.
    """

    def __init__(
        self,
        output: typing.Union[str, pathlib.Path],
    ):
        """
        Initialize the ArrowOutputAdapter.

        Args:
            output: destination file path (string or Path).
        """
        self.output_path = str(output)

    def split(self, workers: int) -> typing.List["ArrowOutputAdapter"]:
        """
        Split the output adapter into `workers` partitions.
        Generates filenames like `filename_part_1.arrow`.
        """
        base = pathlib.Path(self.output_path)
        return [
            self.__class__(
                output=str(base.parent / f"{base.stem}_part_{n}{base.suffix}")
            )
            for n in range(1, workers + 1)
        ]

    def get_native_writer(self) -> ArrowWriter:
        # Make sure the destination directory exists before the Rust
        # writer attempts to open the file.
        pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
        return ArrowWriter(self.output_path)


__all__ = ["ArrowOutputAdapter"]
@@ -0,0 +1,41 @@
1
import abc
import typing


class BaseOutputAdapter(abc.ABC):
    """
    Abstract base class for all output adapters.

    Output adapters bridge the pipeline results to external destinations.
    They provide the native Rust writer used by the execution core.
    """

    @property
    def can_split(self) -> bool:
        """Return True if this adapter supports parallel splitting."""
        # An adapter supports splitting exactly when its class overrides
        # the default split() implementation defined below.
        return type(self).split is not BaseOutputAdapter.split

    @abc.abstractmethod
    def get_native_writer(self) -> typing.Any:
        """
        Return the underlying Rust writer instance.

        The writer is responsible for serializing and persisting entries
        passed from the internal pipe buffer.
        """
        raise NotImplementedError

    def split(self, workers: int) -> typing.List["BaseOutputAdapter"]:
        """
        Split the output adapter into `workers` partitions for parallel writing.

        The base implementation does not partition: it returns the adapter
        itself as a single-element list.
        """
        return [self]

    def get_hooks(self) -> list[typing.Any]:
        """
        Return a list of hooks to be executed by the pipeline.

        Can be used for post-processing or cleaning up resources
        after the data has been written.
        """
        return []
@@ -0,0 +1,71 @@
1
import pathlib
import typing

from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import CSVWriter


class CSVOutputAdapter(BaseOutputAdapter):
    """
    Writes pipeline results to CSV files.

    Handles directory creation and uses a buffered writer in Rust to ensure
    high-throughput performance.
    """

    def __init__(
        self,
        output: typing.Union[str, pathlib.Path],
        delimiter: str = ",",
        quotechar: str = '"',
        fieldnames: list[str] | None = None,
    ):
        """
        Initialize the CSVOutputAdapter.

        Args:
            output: Path where the CSV file will be created.
            delimiter: Column separator.
            quotechar: Character used for quoting fields.
            fieldnames: Optional list of column names for the header.
        """
        self.output_path = str(output)
        self.delimiter = delimiter
        self.quotechar = quotechar
        self.fieldnames = fieldnames

    def split(self, workers: int) -> typing.List["CSVOutputAdapter"]:
        """
        Split the output adapter into `workers` partitions.
        Generates filenames like `filename_part_1.csv`.
        """
        base = pathlib.Path(self.output_path)
        return [
            self.__class__(
                output=str(base.parent / f"{base.stem}_part_{n}{base.suffix}"),
                delimiter=self.delimiter,
                quotechar=self.quotechar,
                fieldnames=self.fieldnames,
            )
            for n in range(1, workers + 1)
        ]

    def get_native_writer(self) -> CSVWriter:
        pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
        # The Rust writer takes single-byte codes for delimiter and quote,
        # hence the ord() conversions.
        return CSVWriter(
            self.output_path,
            delimiter=ord(self.delimiter),
            quote=ord(self.quotechar),
            fieldnames=self.fieldnames,
        )


__all__ = ["CSVOutputAdapter"]
@@ -0,0 +1,46 @@
1
import pathlib
import typing

from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import DuckDBWriter


class DuckDBOutputAdapter(BaseOutputAdapter):
    """
    Persists data into DuckDB database files.

    Supports replacing or appending to existing tables, leveraging DuckDB's
    transactional integrity and high-speed storage.
    """

    def __init__(
        self,
        output: typing.Union[str, pathlib.Path],
        table_name: str,
        mode: str = "replace",
    ):
        """
        Initialize the DuckDBOutputAdapter.

        Args:
            output: Path to the DuckDB database file.
            table_name: Name of the table to write to.
            mode: Write mode ('replace', 'append', or 'fail').

        Raises:
            ValueError: If `mode` is not one of the supported values.
        """
        self.output_path = str(output)
        self.table_name = table_name
        self.mode = mode

        if mode not in ("replace", "append", "fail"):
            raise ValueError("mode must be 'replace', 'append', or 'fail'")

    def get_native_writer(self) -> DuckDBWriter:
        # Create the database's directory up front so the Rust writer
        # can open (or create) the file without failing.
        db_file = pathlib.Path(self.output_path)
        db_file.parent.mkdir(parents=True, exist_ok=True)
        return DuckDBWriter(
            self.output_path,
            self.table_name,
            mode=self.mode,
        )


__all__ = ["DuckDBOutputAdapter"]
@@ -0,0 +1,42 @@
1
import pathlib
import typing

from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import ExcelWriter


class ExcelOutputAdapter(BaseOutputAdapter):
    """
    Creates Excel files (.xlsx) from pipeline entries.

    Provides a simple way to export processed data to spreadsheets, with
    support for custom worksheet names and column headers.
    """

    def __init__(
        self,
        path: typing.Union[str, pathlib.Path],
        sheet_name: typing.Optional[str] = None,
        fieldnames: typing.Optional[typing.List[str]] = None,
    ):
        """
        Initialize the ExcelOutputAdapter.

        Args:
            path: Path where the Excel file will be created.
            sheet_name: Optional name for the worksheet.
            fieldnames: Optional list of column names for the header.
        """
        self.path = str(path)
        self.sheet_name = sheet_name
        self.fieldnames = fieldnames

    def get_native_writer(self) -> ExcelWriter:
        # Consistency fix: the other file-based adapters (Arrow, CSV, JSON,
        # DuckDB) create the parent directory before handing the path to the
        # Rust writer; without this, writing to a new directory would fail.
        pathlib.Path(self.path).parent.mkdir(parents=True, exist_ok=True)
        return ExcelWriter(
            self.path,
            sheet_name=self.sheet_name,
            fieldnames=self.fieldnames,
        )


__all__ = ["ExcelOutputAdapter"]
@@ -0,0 +1,66 @@
1
import pathlib
import typing

from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import JSONWriter


class JSONOutputAdapter(BaseOutputAdapter):
    """
    Serializes data to JSON format, supporting both array and
    line-delimited (JSONL) outputs.

    Equipped with a fast Rust-powered serializer that can indent results or
    output them in a compact single-line per record format.
    """

    def __init__(
        self,
        output: typing.Union[str, pathlib.Path],
        format: str = "array",
        indent: int | None = None,
    ):
        """
        Initialize the JSONOutputAdapter.

        Args:
            output: Path where the JSON file will be created.
            format: JSON format ('array' for a single JSON array, or
                'lines' for JSONLines).
            indent: Optional indentation for pretty-printing.
        """
        self.output_path = str(output)
        self.format = format
        self.indent = indent

    def split(self, workers: int) -> typing.List["JSONOutputAdapter"]:
        """
        Split the output adapter into `workers` partitions.

        Partition numbering is 1-based and the original suffix is kept,
        e.g. `filename_part_1.json`.  (The previous docstring incorrectly
        claimed 0-based `filename_part_0.jsonl` names.)
        """
        path = pathlib.Path(self.output_path)
        stem = path.stem
        suffix = path.suffix
        parent = path.parent

        shards = []
        for i in range(workers):
            part_name = f"{stem}_part_{i + 1}{suffix}"
            part_path = parent / part_name
            shards.append(
                self.__class__(
                    output=str(part_path), format=self.format, indent=self.indent
                )
            )
        return shards

    def get_native_writer(self) -> JSONWriter:
        # Ensure the destination directory exists before opening the file.
        pathlib.Path(self.output_path).parent.mkdir(parents=True, exist_ok=True)
        return JSONWriter(
            self.output_path,
            format=self.format,
            indent=self.indent,
        )


__all__ = ["JSONOutputAdapter"]
@@ -0,0 +1,39 @@
1
from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import KafkaWriter


class KafkaOutputAdapter(BaseOutputAdapter):
    """
    Produces messages to an Apache Kafka topic.

    Streams pipeline results into Kafka topics, with configurable
    acknowledgment settings ensuring reliable message delivery.
    """

    def __init__(
        self,
        uri: str,
        acks: int = 1,
        timeout: int = 30,
    ):
        """
        Kafka Output Adapter.

        Args:
            uri: Kafka URI (e.g., 'kafka://localhost:9092/topic')
            acks: Required acknowledgments as an integer: 0 (none),
                1 (leader only), or -1 ('all' replicas). Defaults to 1.
            timeout: Ack timeout in seconds.
        """
        self.uri = uri
        self.acks = acks
        self.timeout = timeout

    def get_native_writer(self) -> KafkaWriter:
        # The Rust producer handles connection setup from the URI.
        writer = KafkaWriter(
            self.uri,
            acks=self.acks,
            timeout=self.timeout,
        )
        return writer


__all__ = ["KafkaOutputAdapter"]
@@ -0,0 +1,49 @@
1
import pathlib
import typing

from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import ParquetWriter


class ParquetOutputAdapter(BaseOutputAdapter):
    """
    Writes data to Apache Parquet files.

    Provides highly efficient columnar storage using the Arrow ecosystem,
    making it ideal for large-scale analytical processing.
    """

    def __init__(
        self,
        path: typing.Union[str, pathlib.Path],
    ):
        """
        Initialize the ParquetOutputAdapter.

        Args:
            path: Path where the Parquet file will be created.
        """
        self.path = str(path)

    def split(self, workers: int) -> typing.List["ParquetOutputAdapter"]:
        """
        Split the output adapter into `workers` partitions.
        Generates filenames like `filename_part_1.parquet`.
        """
        path = pathlib.Path(self.path)
        stem = path.stem
        suffix = path.suffix
        parent = path.parent

        shards = []
        for i in range(workers):
            part_name = f"{stem}_part_{i + 1}{suffix}"
            part_path = parent / part_name
            shards.append(self.__class__(path=str(part_path)))
        return shards

    def get_native_writer(self) -> ParquetWriter:
        # Consistency fix: the other file-based adapters (Arrow, CSV, JSON,
        # DuckDB) create the parent directory first; without this, writing
        # into a not-yet-existing directory would fail.
        pathlib.Path(self.path).parent.mkdir(parents=True, exist_ok=True)
        return ParquetWriter(self.path)


__all__ = ["ParquetOutputAdapter"]
@@ -0,0 +1,29 @@
1
from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import PyGeneratorWriter


class PyGeneratorOutputAdapter(BaseOutputAdapter):
    """
    Exposes pipeline results as a Python generator.

    This adapter provides a bridge back to Python code, allowing you to
    iterate over the processed results as they become available.
    """

    def __init__(self, queue_size: int = 1000):
        """
        Initialize the PyGeneratorOutputAdapter.

        Args:
            queue_size: Buffer size for the internal queue.
        """
        # Created eagerly so iteration and the pipeline share one queue.
        self._writer = PyGeneratorWriter(queue_size=queue_size)

    def __iter__(self):
        # The Rust writer is itself the iterator over produced entries.
        return self._writer

    def get_native_writer(self) -> PyGeneratorWriter:
        return self._writer


__all__ = ["PyGeneratorOutputAdapter"]
@@ -0,0 +1,43 @@
1
from zoopipe.output_adapter.base import BaseOutputAdapter
from zoopipe.zoopipe_rust_core import SQLWriter


class SQLOutputAdapter(BaseOutputAdapter):
    """
    Writes data into SQL databases via bulk inserts.

    Manages database transactions and performs batch insertions using
    optimized SQL writers in the Rust core.
    """

    def __init__(
        self,
        uri: str,
        table_name: str,
        mode: str = "replace",
        batch_size: int = 500,
    ):
        """
        Initialize the SQLOutputAdapter.

        Args:
            uri: Database URI.
            table_name: Name of the table to write to.
            mode: Write mode ('replace', 'append', or 'fail').
            batch_size: Number of records to insert per transaction.

        Raises:
            ValueError: If `mode` is not one of the supported values.
        """
        self.uri = uri
        self.table_name = table_name
        self.mode = mode
        self.batch_size = batch_size

        # Consistency fix: DuckDBOutputAdapter validates `mode` eagerly and
        # the docstring documents the same closed set here, but invalid
        # values previously slipped through to the Rust writer.
        if mode not in ("replace", "append", "fail"):
            raise ValueError("mode must be 'replace', 'append', or 'fail'")

    def get_native_writer(self) -> SQLWriter:
        return SQLWriter(
            self.uri,
            self.table_name,
            mode=self.mode,
            batch_size=self.batch_size,
        )


__all__ = ["SQLOutputAdapter"]