zoopipe-2026.1.20-cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. zoopipe/__init__.py +72 -0
  2. zoopipe/engines/__init__.py +4 -0
  3. zoopipe/engines/base.py +45 -0
  4. zoopipe/engines/dask.py +225 -0
  5. zoopipe/engines/local.py +215 -0
  6. zoopipe/engines/ray.py +252 -0
  7. zoopipe/hooks/__init__.py +4 -0
  8. zoopipe/hooks/base.py +70 -0
  9. zoopipe/hooks/sql.py +94 -0
  10. zoopipe/input_adapter/__init__.py +24 -0
  11. zoopipe/input_adapter/arrow.py +38 -0
  12. zoopipe/input_adapter/base.py +48 -0
  13. zoopipe/input_adapter/csv.py +144 -0
  14. zoopipe/input_adapter/duckdb.py +54 -0
  15. zoopipe/input_adapter/excel.py +51 -0
  16. zoopipe/input_adapter/json.py +73 -0
  17. zoopipe/input_adapter/kafka.py +39 -0
  18. zoopipe/input_adapter/parquet.py +85 -0
  19. zoopipe/input_adapter/pygen.py +37 -0
  20. zoopipe/input_adapter/sql.py +103 -0
  21. zoopipe/manager.py +211 -0
  22. zoopipe/output_adapter/__init__.py +23 -0
  23. zoopipe/output_adapter/arrow.py +50 -0
  24. zoopipe/output_adapter/base.py +41 -0
  25. zoopipe/output_adapter/csv.py +71 -0
  26. zoopipe/output_adapter/duckdb.py +46 -0
  27. zoopipe/output_adapter/excel.py +42 -0
  28. zoopipe/output_adapter/json.py +66 -0
  29. zoopipe/output_adapter/kafka.py +39 -0
  30. zoopipe/output_adapter/parquet.py +49 -0
  31. zoopipe/output_adapter/pygen.py +29 -0
  32. zoopipe/output_adapter/sql.py +43 -0
  33. zoopipe/pipe.py +263 -0
  34. zoopipe/protocols.py +37 -0
  35. zoopipe/py.typed +0 -0
  36. zoopipe/report.py +173 -0
  37. zoopipe/utils/__init__.py +0 -0
  38. zoopipe/utils/dependency.py +78 -0
  39. zoopipe/zoopipe_rust_core.abi3.so +0 -0
  40. zoopipe-2026.1.20.dist-info/METADATA +231 -0
  41. zoopipe-2026.1.20.dist-info/RECORD +43 -0
  42. zoopipe-2026.1.20.dist-info/WHEEL +4 -0
  43. zoopipe-2026.1.20.dist-info/licenses/LICENSE +21 -0
zoopipe/__init__.py ADDED
@@ -0,0 +1,72 @@
+ from zoopipe.engines import BaseEngine, MultiProcessEngine
+ from zoopipe.hooks.base import BaseHook, HookStore
+ from zoopipe.hooks.sql import SQLExpansionHook
+ from zoopipe.input_adapter.arrow import ArrowInputAdapter
+ from zoopipe.input_adapter.csv import CSVInputAdapter
+ from zoopipe.input_adapter.duckdb import DuckDBInputAdapter
+ from zoopipe.input_adapter.excel import ExcelInputAdapter
+ from zoopipe.input_adapter.json import JSONInputAdapter
+ from zoopipe.input_adapter.kafka import KafkaInputAdapter
+ from zoopipe.input_adapter.parquet import ParquetInputAdapter
+ from zoopipe.input_adapter.pygen import PyGeneratorInputAdapter
+ from zoopipe.input_adapter.sql import SQLInputAdapter, SQLPaginationInputAdapter
+ from zoopipe.manager import PipeManager
+ from zoopipe.output_adapter.arrow import ArrowOutputAdapter
+ from zoopipe.output_adapter.csv import CSVOutputAdapter
+ from zoopipe.output_adapter.duckdb import DuckDBOutputAdapter
+ from zoopipe.output_adapter.excel import ExcelOutputAdapter
+ from zoopipe.output_adapter.json import JSONOutputAdapter
+ from zoopipe.output_adapter.kafka import KafkaOutputAdapter
+ from zoopipe.output_adapter.parquet import ParquetOutputAdapter
+ from zoopipe.output_adapter.pygen import PyGeneratorOutputAdapter
+ from zoopipe.output_adapter.sql import SQLOutputAdapter
+ from zoopipe.pipe import Pipe
+ from zoopipe.protocols import InputAdapterProtocol, OutputAdapterProtocol
+ from zoopipe.report import (
+     EntryStatus,
+     EntryTypedDict,
+     FlowReport,
+     FlowStatus,
+     get_logger,
+ )
+ from zoopipe.zoopipe_rust_core import MultiThreadExecutor, SingleThreadExecutor
+
+ __all__ = [
+     "Pipe",
+     "PipeManager",
+     "BaseEngine",
+     "MultiProcessEngine",
+     "FlowReport",
+     "FlowStatus",
+     "BaseHook",
+     "HookStore",
+     "EntryStatus",
+     "EntryTypedDict",
+     "get_logger",
+     "SingleThreadExecutor",
+     "MultiThreadExecutor",
+     "SQLExpansionHook",
+     "InputAdapterProtocol",
+     "OutputAdapterProtocol",
+     # Input Adapters
+     "ArrowInputAdapter",
+     "CSVInputAdapter",
+     "DuckDBInputAdapter",
+     "ExcelInputAdapter",
+     "JSONInputAdapter",
+     "PyGeneratorInputAdapter",
+     "SQLInputAdapter",
+     "SQLPaginationInputAdapter",
+     "ParquetInputAdapter",
+     "KafkaInputAdapter",
+     # Output Adapters
+     "ArrowOutputAdapter",
+     "CSVOutputAdapter",
+     "DuckDBOutputAdapter",
+     "ExcelOutputAdapter",
+     "JSONOutputAdapter",
+     "PyGeneratorOutputAdapter",
+     "SQLOutputAdapter",
+     "ParquetOutputAdapter",
+     "KafkaOutputAdapter",
+ ]
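
The top-level __init__.py above flattens the public API, so downstream code imports everything from zoopipe directly rather than from the submodules. A minimal sketch of what that enables; only the imported names are confirmed by this diff, and the Pipe constructor arguments are hypothetical:

    # Hypothetical usage sketch: the import path is confirmed by
    # __init__.py above, but Pipe's keyword arguments are assumed.
    from zoopipe import CSVInputAdapter, JSONOutputAdapter, Pipe

    pipe = Pipe(
        input_adapter=CSVInputAdapter("users.csv"),      # assumed signature
        output_adapter=JSONOutputAdapter("users.json"),  # assumed signature
    )
    pipe.start(wait=True)  # Pipe.start(wait=...) is how the engines below invoke it
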
zoopipe/engines/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from zoopipe.engines.base import BaseEngine
+ from zoopipe.engines.local import MultiProcessEngine
+
+ __all__ = ["BaseEngine", "MultiProcessEngine"]
zoopipe/engines/base.py ADDED
@@ -0,0 +1,45 @@
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from zoopipe.pipe import Pipe
+     from zoopipe.report import FlowReport
+
+
+ class BaseEngine(ABC):
+     """
+     Abstract base class for ZooPipe execution engines.
+
+     Engines are responsible for the "Orchestration" layer of the pipeline,
+     deciding WHERE and HOW different pipe shards are executed
+     (locally, distributed, etc.).
+     """
+
+     @abstractmethod
+     def start(self, pipes: list[Pipe]) -> None:
+         """Execute the given list of pipes."""
+         pass
+
+     @abstractmethod
+     def wait(self, timeout: float | None = None) -> bool:
+         """Wait for execution to finish."""
+         pass
+
+     @abstractmethod
+     def shutdown(self, timeout: float = 5.0) -> None:
+         """Forcibly stop execution."""
+         pass
+
+     @property
+     @abstractmethod
+     def is_running(self) -> bool:
+         """Check if the engine is currently running."""
+         pass
+
+     @property
+     @abstractmethod
+     def report(self) -> FlowReport:
+         """Get an aggregated report of the current execution."""
+         pass
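
The contract above is five abstract members, so a third-party engine only has to implement start, wait, and shutdown plus the two read-only properties. A minimal sketch of a custom engine, assuming sequential in-process execution; the signatures come from engines/base.py, everything else is illustrative:

    # Sketch of a BaseEngine implementation that runs pipes inline.
    # Only the method signatures are taken from engines/base.py; the
    # execution strategy is not part of the package.
    from zoopipe import BaseEngine, FlowReport, Pipe

    class InlineEngine(BaseEngine):
        """Runs every pipe sequentially in the calling process."""

        def __init__(self) -> None:
            self._running = False
            self._report = FlowReport()  # constructed the same way by the bundled engines

        def start(self, pipes: list[Pipe]) -> None:
            self._running = True
            try:
                for pipe in pipes:
                    pipe.start(wait=True)  # blocking run, as DaskPipeWorker does below
            finally:
                self._running = False

        def wait(self, timeout: float | None = None) -> bool:
            return not self._running  # start() already blocked until completion

        def shutdown(self, timeout: float = 5.0) -> None:
            self._running = False

        @property
        def is_running(self) -> bool:
            return self._running

        @property
        def report(self) -> FlowReport:
            return self._report
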
zoopipe/engines/dask.py ADDED
@@ -0,0 +1,225 @@
+ from __future__ import annotations
+
+ import os
+ import re
+ from datetime import datetime
+ from importlib import metadata
+ from typing import TYPE_CHECKING, Any
+
+ from dask.distributed import Client, get_client
+
+ from zoopipe.engines.base import BaseEngine
+ from zoopipe.engines.local import PipeReport
+ from zoopipe.report import FlowReport, FlowStatus
+ from zoopipe.utils.dependency import install_dependencies as _install_dependencies
+
+ if TYPE_CHECKING:
+     from zoopipe.pipe import Pipe
+
+
+ class DaskPipeWorker:
+     """
+     Dask Worker that wraps a single Pipe execution.
+     Can be used as a Dask Actor for stateful reporting.
+     """
+
+     def __init__(self, pipe: Pipe, index: int):
+         self.pipe = pipe
+         self.index = index
+         self.is_finished = False
+         self.has_error = False
+
+     def run(self) -> None:
+         """Execute the pipe."""
+         try:
+             self.pipe.start(wait=True)
+         except Exception:
+             self.has_error = True
+         finally:
+             self.is_finished = True
+
+     def get_report(self) -> PipeReport:
+         """Get the current progress snapshot from the pipe."""
+         report = self.pipe.report
+         return PipeReport(
+             pipe_index=self.index,
+             total_processed=report.total_processed,
+             success_count=report.success_count,
+             error_count=report.error_count,
+             ram_bytes=report.ram_bytes,
+             is_finished=self.is_finished or report.is_finished,
+             has_error=self.has_error,
+             is_alive=not self.is_finished,
+         )
+
+
+ class DaskEngine(BaseEngine):
+     """
+     Distributed execution engine using Dask.
+     """
+
+     def __init__(self, address: str | None = None, **kwargs: Any):
+         try:
+             self.client = get_client(address) if address else get_client()
+         except (ValueError, RuntimeError):
+             # No client running, create one
+             self.client = Client(address=address, **kwargs)
+
+         # Prepare environment
+         self._prepare_runtime_env()
+
+         self._workers: list[Any] = []
+         self._futures: list[Any] = []
+         self._start_time: datetime | None = None
+         self._cached_report: FlowReport | None = None
+
+     def _prepare_runtime_env(self) -> None:
+         """
+         Configure the Dask workers based on whether we are in
+         development mode or being used as a library.
+         """
+         # 1. Detect environment and versions
+         is_dev_mode = False
+         try:
+             # heuristic: if we are in the zoopipe repo and have the ABI, it's dev mode
+             if (
+                 os.path.exists("src/zoopipe")
+                 and os.path.exists("pyproject.toml")
+                 and any(f.endswith(".so") for f in os.listdir("src/zoopipe"))
+             ):
+                 is_dev_mode = True
+         except Exception:
+             pass
+
+         # 2. Setup dependencies
+         deps = []
+         if is_dev_mode:
+             # Dev mode: Extract dependencies from pyproject.toml
+             try:
+                 with open("pyproject.toml", "r") as f:
+                     toml_content = f.read()
+                 match = re.search(
+                     r"dependencies\s*=\s*\[(.*?)\]", toml_content, re.DOTALL
+                 )
+                 if match:
+                     dep_block = match.group(1)
+                     deps = re.findall(r'["\'](.*?)["\']', dep_block)
+             except Exception:
+                 pass
+         else:
+             # User mode: install current zoopipe version
+             try:
+                 version = metadata.version("zoopipe")
+                 deps.append(f"zoopipe=={version}")
+             except metadata.PackageNotFoundError:
+                 deps = ["pydantic>=2.0"]
+
+         # Install dependencies on all workers
+         if deps:
+             try:
+                 unique_deps = list(set(deps))
+                 # _install_dependencies is defined at module level to be picklable
+                 self.client.run(_install_dependencies, unique_deps)
+             except Exception:
+                 pass
+
+         # 3. Handle local code path for dev mode
+         if is_dev_mode:
+             src_path = os.path.abspath("src")
+
+             def append_path(path: str):
+                 import sys
+
+                 if path not in sys.path:
+                     sys.path.append(path)
+
+             self.client.run(append_path, src_path)
+
+     def start(self, pipes: list[Pipe]) -> None:
+         if self.is_running:
+             raise RuntimeError("DaskEngine is already running")
+
+         self._start_time = datetime.now()
+
+         # 1. Submit Workers as Actors
+         # It is CRITICAL to use actor=True so they maintain state (live Pipe instance)
+         actor_futures = [
+             self.client.submit(DaskPipeWorker, pipe, i, actor=True)
+             for i, pipe in enumerate(pipes)
+         ]
+         self._workers = [f.result() for f in actor_futures]
+
+         # 2. Launch execution WITHOUT BLOCKING
+         self._futures = [worker.run() for worker in self._workers]
+
+         self._cached_report = None
+
+     def wait(self, timeout: float | None = None) -> bool:
+         if not self._futures:
+             return True
+
+         start = datetime.now()
+         while self.is_running:
+             if timeout and (datetime.now() - start).total_seconds() > timeout:
+                 return False
+             import time
+
+             time.sleep(0.1)
+         return True
+
+     def shutdown(self, timeout: float = 5.0) -> None:
+         # Dask actors don't have a direct 'kill'; they stay alive as long
+         # as the client/cluster is up or until they are garbage collected,
+         # but we can try to signal them if needed.
+         self._workers = []
+         self._futures = []
+         self._cached_report = None
+
+     @property
+     def is_running(self) -> bool:
+         if not self._futures:
+             return False
+
+         # In Dask, an actor future is running if it is not 'done'
+         return any(not f.done() for f in self._futures)
+
+     @property
+     def report(self) -> FlowReport:
+         if self._cached_report and self._cached_report.is_finished:
+             return self._cached_report
+
+         report = FlowReport()
+         report.start_time = self._start_time
+
+         p_reports = self.pipe_reports
+         for pr in p_reports:
+             report.total_processed += pr.total_processed
+             report.success_count += pr.success_count
+             report.error_count += pr.error_count
+             report.ram_bytes += pr.ram_bytes
+
+         all_finished = not self.is_running
+         any_error = any(pr.has_error for pr in p_reports)
+
+         if all_finished:
+             report.status = FlowStatus.FAILED if any_error else FlowStatus.COMPLETED
+             report.end_time = datetime.now()
+             report._finished_event.set()
+             self._cached_report = report
+         else:
+             report.status = FlowStatus.RUNNING
+
+         return report
+
+     @property
+     def pipe_reports(self) -> list[PipeReport]:
+         if not self._workers:
+             return []
+
+         # Get reports from actors
+         return [w.get_report().result() for w in self._workers]
+
+     def get_pipe_report(self, index: int) -> PipeReport:
+         if not self._workers:
+             raise RuntimeError("Engine has not been started")
+         return self._workers[index].get_report().result()
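
Note that DaskEngine is defined here but not re-exported from the package root: the top-level __all__ only lists BaseEngine and MultiProcessEngine, so it has to be imported from zoopipe.engines.dask. A usage sketch, assuming pipes is a pre-built list of Pipe objects and a reachable scheduler (the address is a placeholder):

    # Usage sketch; `pipes` and the scheduler address are assumed.
    from zoopipe.engines.dask import DaskEngine

    engine = DaskEngine(address="tcp://scheduler:8786")
    engine.start(pipes)               # one DaskPipeWorker actor per pipe
    while not engine.wait(timeout=5.0):
        snapshot = engine.report      # FlowReport aggregated across actors
        print(snapshot.total_processed, snapshot.error_count)
    engine.shutdown()
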
zoopipe/engines/local.py ADDED
@@ -0,0 +1,215 @@
+ from __future__ import annotations
+
+ import multiprocessing
+ from ctypes import c_int, c_longlong
+ from dataclasses import dataclass
+ from datetime import datetime
+ from multiprocessing.sharedctypes import Synchronized
+ from typing import TYPE_CHECKING
+
+ from zoopipe.engines.base import BaseEngine
+ from zoopipe.report import FlowReport, FlowStatus
+
+ if TYPE_CHECKING:
+     from zoopipe.pipe import Pipe
+
+
+ @dataclass
+ class PipeProcess:
+     """
+     Internal handle for a Pipe running in an isolated worker process.
+     """
+
+     process: multiprocessing.Process
+     total_processed: Synchronized[c_longlong]
+     success_count: Synchronized[c_longlong]
+     error_count: Synchronized[c_longlong]
+     ram_bytes: Synchronized[c_longlong]
+     is_finished: Synchronized[c_int]
+     has_error: Synchronized[c_int]
+     pipe_index: int = 0
+
+
+ @dataclass
+ class PipeReport:
+     """
+     Snapshot of the current status of a single managed pipe.
+     """
+
+     pipe_index: int
+     total_processed: int = 0
+     success_count: int = 0
+     error_count: int = 0
+     ram_bytes: int = 0
+     is_finished: bool = False
+     has_error: bool = False
+     is_alive: bool = True
+
+
+ def _run_pipe(
+     pipe: Pipe,
+     total_processed: Synchronized[c_longlong],
+     success_count: Synchronized[c_longlong],
+     error_count: Synchronized[c_longlong],
+     ram_bytes: Synchronized[c_longlong],
+     is_finished: Synchronized[c_int],
+     has_error: Synchronized[c_int],
+ ) -> None:
+     try:
+         pipe.start(wait=False)
+
+         while not pipe.report.is_finished:
+             total_processed.value = pipe.report.total_processed
+             success_count.value = pipe.report.success_count
+             error_count.value = pipe.report.error_count
+             ram_bytes.value = pipe.report.ram_bytes
+             pipe.report.wait(timeout=1)
+
+         total_processed.value = pipe.report.total_processed
+         success_count.value = pipe.report.success_count
+         error_count.value = pipe.report.error_count
+         ram_bytes.value = pipe.report.ram_bytes
+     except Exception:
+         has_error.value = 1
+     finally:
+         is_finished.value = 1
+
+
+ class MultiProcessEngine(BaseEngine):
+     """
+     Engine that executes pipes in multiple local processes.
+     """
+
+     def __init__(self):
+         self._pipe_processes: list[PipeProcess] = []
+         self._start_time: datetime | None = None
+         self._cached_report: FlowReport | None = None
+
+     def start(self, pipes: list[Pipe]) -> None:
+         if self.is_running:
+             raise RuntimeError("Engine is already running")
+
+         self._start_time = datetime.now()
+         self._pipe_processes.clear()
+         self._cached_report = None
+
+         for i, pipe in enumerate(pipes):
+             total_processed: Synchronized[c_longlong] = multiprocessing.Value(
+                 "q", 0, lock=False
+             )
+             success_count: Synchronized[c_longlong] = multiprocessing.Value(
+                 "q", 0, lock=False
+             )
+             error_count: Synchronized[c_longlong] = multiprocessing.Value(
+                 "q", 0, lock=False
+             )
+             ram_bytes: Synchronized[c_longlong] = multiprocessing.Value(
+                 "q", 0, lock=False
+             )
+             is_finished: Synchronized[c_int] = multiprocessing.Value("i", 0, lock=False)
+             has_error: Synchronized[c_int] = multiprocessing.Value("i", 0, lock=False)
+
+             process = multiprocessing.Process(
+                 target=_run_pipe,
+                 args=(
+                     pipe,
+                     total_processed,
+                     success_count,
+                     error_count,
+                     ram_bytes,
+                     is_finished,
+                     has_error,
+                 ),
+             )
+             process.start()
+
+             self._pipe_processes.append(
+                 PipeProcess(
+                     process=process,
+                     total_processed=total_processed,
+                     success_count=success_count,
+                     error_count=error_count,
+                     ram_bytes=ram_bytes,
+                     is_finished=is_finished,
+                     has_error=has_error,
+                     pipe_index=i,
+                 )
+             )
+
+     def wait(self, timeout: float | None = None) -> bool:
+         for pp in self._pipe_processes:
+             pp.process.join(timeout=timeout)
+         return all(not pp.process.is_alive() for pp in self._pipe_processes)
+
+     def shutdown(self, timeout: float = 5.0) -> None:
+         for pp in self._pipe_processes:
+             if pp.process.is_alive():
+                 pp.process.terminate()
+         for pp in self._pipe_processes:
+             pp.process.join(timeout=timeout)
+             if pp.process.is_alive():
+                 pp.process.kill()
+         self._pipe_processes.clear()
+
+     @property
+     def is_running(self) -> bool:
+         return bool(self._pipe_processes) and any(
+             pp.process.is_alive() for pp in self._pipe_processes
+         )
+
+     @property
+     def report(self) -> FlowReport:
+         if self._cached_report and self._cached_report.is_finished:
+             return self._cached_report
+
+         report = FlowReport()
+         report.start_time = self._start_time
+
+         for pp in self._pipe_processes:
+             report.total_processed += pp.total_processed.value
+             report.success_count += pp.success_count.value
+             report.error_count += pp.error_count.value
+             report.ram_bytes += pp.ram_bytes.value
+
+         all_finished = all(pp.is_finished.value == 1 for pp in self._pipe_processes)
+         any_error = any(pp.has_error.value == 1 for pp in self._pipe_processes)
+
+         if all_finished:
+             report.status = FlowStatus.FAILED if any_error else FlowStatus.COMPLETED
+             report.end_time = datetime.now()
+             report._finished_event.set()
+             self._cached_report = report
+         else:
+             report.status = FlowStatus.RUNNING
+
+         return report
+
+     @property
+     def pipe_reports(self) -> list[PipeReport]:
+         """Get reports for all managed pipes."""
+         return [self.get_pipe_report(i) for i in range(len(self._pipe_processes))]
+
+     def get_pipe_report(self, index: int) -> PipeReport:
+         if not self._pipe_processes:
+             raise RuntimeError("Engine has not been started")
+         pp = self._pipe_processes[index]
+         return PipeReport(
+             pipe_index=index,
+             total_processed=pp.total_processed.value,
+             success_count=pp.success_count.value,
+             error_count=pp.error_count.value,
+             ram_bytes=pp.ram_bytes.value,
+             is_finished=pp.is_finished.value == 1,
+             has_error=pp.has_error.value == 1,
+             is_alive=pp.process.is_alive(),
+         )
+
+
+ def _init_multiprocessing() -> None:
+     try:
+         multiprocessing.set_start_method("fork", force=True)
+     except RuntimeError:
+         pass
+
+
+ _init_multiprocessing()
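
Because every counter is allocated with lock=False, the parent process polls raw shared ctypes values rather than taking locks, which keeps per-pipe snapshots cheap. A usage sketch mirroring the Dask example, assuming pipes is a pre-built list of Pipe objects:

    # Usage sketch; `pipes` is assumed to be a pre-built list[Pipe].
    from zoopipe import MultiProcessEngine

    engine = MultiProcessEngine()
    engine.start(pipes)                    # one worker process per pipe
    while engine.is_running:
        for pr in engine.pipe_reports:     # lock-free PipeReport snapshots
            print(pr.pipe_index, pr.total_processed, pr.is_alive)
        engine.wait(timeout=1.0)           # short join per process, used as a crude poll
    final = engine.report                  # FlowStatus.COMPLETED or FAILED
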