zoopipe-2026.1.20-cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. zoopipe/__init__.py +72 -0
  2. zoopipe/engines/__init__.py +4 -0
  3. zoopipe/engines/base.py +45 -0
  4. zoopipe/engines/dask.py +225 -0
  5. zoopipe/engines/local.py +215 -0
  6. zoopipe/engines/ray.py +252 -0
  7. zoopipe/hooks/__init__.py +4 -0
  8. zoopipe/hooks/base.py +70 -0
  9. zoopipe/hooks/sql.py +94 -0
  10. zoopipe/input_adapter/__init__.py +24 -0
  11. zoopipe/input_adapter/arrow.py +38 -0
  12. zoopipe/input_adapter/base.py +48 -0
  13. zoopipe/input_adapter/csv.py +144 -0
  14. zoopipe/input_adapter/duckdb.py +54 -0
  15. zoopipe/input_adapter/excel.py +51 -0
  16. zoopipe/input_adapter/json.py +73 -0
  17. zoopipe/input_adapter/kafka.py +39 -0
  18. zoopipe/input_adapter/parquet.py +85 -0
  19. zoopipe/input_adapter/pygen.py +37 -0
  20. zoopipe/input_adapter/sql.py +103 -0
  21. zoopipe/manager.py +211 -0
  22. zoopipe/output_adapter/__init__.py +23 -0
  23. zoopipe/output_adapter/arrow.py +50 -0
  24. zoopipe/output_adapter/base.py +41 -0
  25. zoopipe/output_adapter/csv.py +71 -0
  26. zoopipe/output_adapter/duckdb.py +46 -0
  27. zoopipe/output_adapter/excel.py +42 -0
  28. zoopipe/output_adapter/json.py +66 -0
  29. zoopipe/output_adapter/kafka.py +39 -0
  30. zoopipe/output_adapter/parquet.py +49 -0
  31. zoopipe/output_adapter/pygen.py +29 -0
  32. zoopipe/output_adapter/sql.py +43 -0
  33. zoopipe/pipe.py +263 -0
  34. zoopipe/protocols.py +37 -0
  35. zoopipe/py.typed +0 -0
  36. zoopipe/report.py +173 -0
  37. zoopipe/utils/__init__.py +0 -0
  38. zoopipe/utils/dependency.py +78 -0
  39. zoopipe/zoopipe_rust_core.abi3.so +0 -0
  40. zoopipe-2026.1.20.dist-info/METADATA +231 -0
  41. zoopipe-2026.1.20.dist-info/RECORD +43 -0
  42. zoopipe-2026.1.20.dist-info/WHEEL +4 -0
  43. zoopipe-2026.1.20.dist-info/licenses/LICENSE +21 -0
zoopipe/pipe.py ADDED
@@ -0,0 +1,263 @@
+ import logging
+ import threading
+
+ from pydantic import TypeAdapter, ValidationError
+
+ from zoopipe.hooks.base import BaseHook, HookStore
+ from zoopipe.protocols import InputAdapterProtocol, OutputAdapterProtocol
+ from zoopipe.report import EntryStatus, FlowReport, get_logger
+ from zoopipe.zoopipe_rust_core import (
+     MultiThreadExecutor,
+     NativePipe,
+     SingleThreadExecutor,
+ )
+
+
+ class Pipe:
+     """
+     The main execution unit for data processing pipelines.
+
+     A Pipe connects an input adapter to one or more output adapters,
+     handles validation via Pydantic models, and executes pre- and post-validation hooks.
+
+     By default, a Pipe executes sequentially. For parallel execution across
+     multiple cores or processes, it is recommended to use `PipeManager`.
+     """
+
+     def __init__(
+         self,
+         input_adapter: InputAdapterProtocol | None = None,
+         output_adapter: OutputAdapterProtocol | None = None,
+         error_output_adapter: OutputAdapterProtocol | None = None,
+         schema_model: type | None = None,
+         pre_validation_hooks: list[BaseHook] | None = None,
+         post_validation_hooks: list[BaseHook] | None = None,
+         logger: logging.Logger | None = None,
+         report_update_interval: int = 1,
+         executor: SingleThreadExecutor | MultiThreadExecutor | None = None,
+     ) -> None:
+         """
+         Initialize a new Pipe.
+
+         Args:
+             input_adapter: Source of data.
+             output_adapter: Destination for successfully validated data.
+             error_output_adapter: Optional destination for data that failed validation.
+             schema_model: Optional Pydantic model class for validation.
+             pre_validation_hooks: Hooks to run before validation.
+             post_validation_hooks: Hooks to run after validation.
+             logger: Optional custom logger.
+             report_update_interval: How often (in batches) to update the
+                 progress report.
+             executor: Strategy for batch processing. Defaults to SingleThreadExecutor.
+                 For advanced parallel execution, use `PipeManager`.
+         """
+         self.input_adapter = input_adapter
+         self.output_adapter = output_adapter
+         self.error_output_adapter = error_output_adapter
+         self.schema_model = schema_model
+
+         bundled_pre_hooks = []
+         if self.input_adapter and hasattr(self.input_adapter, "get_hooks"):
+             bundled_pre_hooks.extend(self.input_adapter.get_hooks())
+
+         bundled_post_hooks = []
+         if self.output_adapter and hasattr(self.output_adapter, "get_hooks"):
+             bundled_post_hooks.extend(self.output_adapter.get_hooks())
+         if self.error_output_adapter and hasattr(
+             self.error_output_adapter, "get_hooks"
+         ):
+             bundled_post_hooks.extend(self.error_output_adapter.get_hooks())
+
+         self.pre_validation_hooks = bundled_pre_hooks + (pre_validation_hooks or [])
+         self.post_validation_hooks = bundled_post_hooks + (post_validation_hooks or [])
+
+         self.logger = logger or get_logger()
+
+         self.report_update_interval = report_update_interval
+         self.executor = executor or SingleThreadExecutor()
+
+         self._report = FlowReport()
+         self._thread: threading.Thread | None = None
+         self._store: HookStore = {}
+         self._validator = TypeAdapter(self.schema_model) if self.schema_model else None
+         self._batch_validator = (
+             TypeAdapter(list[self.schema_model]) if self.schema_model else None
+         )
+         self._status_validated = EntryStatus.VALIDATED
+         self._status_failed = EntryStatus.FAILED
+
+     def _process_batch(self, entries: list[dict]) -> list[dict]:
+         local_store: HookStore = {}
+
+         for hook in self.pre_validation_hooks:
+             entries = hook.execute(entries, local_store)
+
+         if self._validator:
+             self._validate_batch(entries)
+
+         for hook in self.post_validation_hooks:
+             entries = hook.execute(entries, local_store)
+
+         return entries
+
+     def _validate_batch(self, entries: list[dict]) -> None:
+         try:
+             raw_data_list = [e["raw_data"] for e in entries]
+             validated_list = self._batch_validator.validate_python(raw_data_list)
+             for entry, processed in zip(entries, validated_list):
+                 entry["validated_data"] = processed.model_dump()
+                 entry["status"] = self._status_validated
+         except ValidationError as e:
+             for error in e.errors():
+                 entry_index = error["loc"][0]
+                 entry = entries[entry_index]
+                 entry["status"] = self._status_failed
+                 entry["errors"].append({"msg": str(error), "type": "validation_error"})
+
+     @property
+     def report(self) -> FlowReport:
+         """Get the current progress report of the pipeline."""
+         return self._report
+
+     def start(self, wait: bool = False) -> None:
+         """
+         Start the pipeline execution in a separate thread.
+
+         Args:
+             wait: If True, blocks until the pipeline finishes.
+         """
+         if self._thread and self._thread.is_alive():
+             raise RuntimeError("Pipe is already running")
+
+         reader = self.input_adapter.get_native_reader()
+         writer = self.output_adapter.get_native_writer()
+         error_writer = None
+         if self.error_output_adapter:
+             error_writer = self.error_output_adapter.get_native_writer()
+
+         native_pipe = NativePipe(
+             reader=reader,
+             writer=writer,
+             error_writer=error_writer,
+             batch_processor=self._process_batch,
+             report=self._report,
+             report_update_interval=self.report_update_interval,
+             executor=self.executor,
+         )
+
+         self._thread = threading.Thread(
+             target=self._run_native,
+             args=(native_pipe,),
+             daemon=False,
+         )
+         self._thread.start()
+
+         if wait:
+             self.wait()
+
+     def _run_native(self, native_pipe: NativePipe) -> None:
+         try:
+             for hook in self.pre_validation_hooks:
+                 hook.setup(self._store)
+             for hook in self.post_validation_hooks:
+                 hook.setup(self._store)
+
+             native_pipe.run()
+         except Exception as e:
+             self.logger.error(f"Pipeline execution failed: {e}")
+             self._report._mark_failed(e)
+             raise
+         finally:
+             for hook in self.pre_validation_hooks:
+                 hook.teardown(self._store)
+             for hook in self.post_validation_hooks:
+                 hook.teardown(self._store)
+
+     def shutdown(self, timeout: float = 5.0) -> None:
+         """
+         Request the pipeline to stop and wait for it to finish.
+
+         Args:
+             timeout: Maximum time to wait for the thread to join.
+         """
+         self._report.abort()
+         if self._thread and self._thread.is_alive():
+             self._thread.join(timeout=timeout)
+             if self._thread.is_alive():
+                 self.logger.warning(
+                     "Pipeline thread did not finish cleanly within timeout"
+                 )
+
+     def wait(self, timeout: float | None = None) -> bool:
+         """
+         Wait for the pipeline to finish.
+
+         Args:
+             timeout: Optional timeout in seconds.
+
+         Returns:
+             True if the pipeline finished, False if it timed out.
+         """
+         return self._report.wait(timeout)
+
+     def __enter__(self) -> "Pipe":
+         self.start()
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+         if not self._report.is_finished:
+             self.shutdown()
+
+         if self._thread and self._thread.is_alive():
+             self._thread.join(timeout=10.0)
+             if self._thread.is_alive():
+                 self.logger.warning("Pipeline thread still running after context exit")
+
+     def __repr__(self) -> str:
+         return f"<Pipe input={self.input_adapter} output={self.output_adapter}>"
+
+     def __getstate__(self) -> dict:
+         """Serialize the pipe state, handling non-picklable Rust objects."""
+         state = self.__dict__.copy()
+
+         # Handle executor serialization
+         executor = state["executor"]
+         exec_config = {
+             "class_name": executor.__class__.__name__,
+             "batch_size": executor.get_batch_size(),
+         }
+         # MultiThreadExecutor-specific settings are not exposed via properties,
+         # so __setstate__ reconstructs the executor from its class name and
+         # batch size and relies on the constructor's defaults for the rest.
+         state["executor_config"] = exec_config
+         del state["executor"]
+
+         # Internal non-serializable objects
+         state["_thread"] = None
+         state["_validator"] = None
+         state["_batch_validator"] = None
+
+         return state
+
+     def __setstate__(self, state: dict) -> None:
+         """Restore the pipe state and reconstruct non-picklable objects."""
+         exec_config = state.pop("executor_config")
+
+         class_name = exec_config["class_name"]
+         batch_size = exec_config["batch_size"]
+
+         if class_name == "MultiThreadExecutor":
+             state["executor"] = MultiThreadExecutor(batch_size=batch_size)
+         else:
+             state["executor"] = SingleThreadExecutor(batch_size=batch_size)
+
+         self.__dict__.update(state)
+
+         # Reconstruct validators
+         self._validator = TypeAdapter(self.schema_model) if self.schema_model else None
+         self._batch_validator = (
+             TypeAdapter(list[self.schema_model]) if self.schema_model else None
+         )
+
+
+ __all__ = ["Pipe", "SingleThreadExecutor", "MultiThreadExecutor"]
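
A minimal usage sketch of the Pipe class above, assuming hypothetical adapter class names and file paths (the wheel ships csv and json adapter modules, but their exact class names are not visible in this diff):

    from pydantic import BaseModel

    from zoopipe.pipe import Pipe
    from zoopipe.input_adapter.csv import CSVInputAdapter      # assumed name
    from zoopipe.output_adapter.json import JSONOutputAdapter  # assumed name

    class User(BaseModel):
        name: str
        age: int

    # Validate CSV rows against User and write the survivors as JSON.
    pipe = Pipe(
        input_adapter=CSVInputAdapter("users.csv"),      # hypothetical path
        output_adapter=JSONOutputAdapter("users.json"),  # hypothetical path
        schema_model=User,
    )
    pipe.start(wait=True)  # runs in a background thread; wait=True blocks
    print(pipe.report)     # FlowReport: counts, throughput, duration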
zoopipe/protocols.py ADDED
@@ -0,0 +1,37 @@
+ from typing import Any, Protocol, runtime_checkable
+
+
+ @runtime_checkable
+ class InputAdapterProtocol(Protocol):
+     """
+     Protocol defining the minimal interface for a pipeline source.
+
+     Any object implementing this protocol can be used as the input
+     source for a Pipe.
+     """
+
+     def get_native_reader(self) -> Any:
+         """Returns the Rust-level reader."""
+         ...
+
+     def get_hooks(self) -> list[Any]:
+         """Returns optional hooks for data expansion or pre-processing."""
+         ...
+
+
+ @runtime_checkable
+ class OutputAdapterProtocol(Protocol):
+     """
+     Protocol defining the minimal interface for a pipeline destination.
+
+     Any object implementing this protocol can be used as the output
+     target for a Pipe.
+     """
+
+     def get_native_writer(self) -> Any:
+         """Returns the Rust-level writer."""
+         ...
+
+     def get_hooks(self) -> list[Any]:
+         """Returns optional hooks for cleanup or post-processing."""
+         ...
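
Because both protocols are runtime_checkable and purely structural, any object with matching method names satisfies them without inheriting from a base class. A sketch of the contract (the stub reader is an assumption; a real adapter would return an object from zoopipe_rust_core):

    from typing import Any

    from zoopipe.protocols import InputAdapterProtocol

    class MyInputAdapter:
        # Structural match: no base class required.
        def get_native_reader(self) -> Any:
            raise NotImplementedError  # a real adapter returns a Rust-level reader

        def get_hooks(self) -> list[Any]:
            return []  # no pre-processing hooks

    # runtime_checkable protocols support isinstance(), which checks method presence.
    assert isinstance(MyInputAdapter(), InputAdapterProtocol)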
zoopipe/py.typed ADDED
File without changes
zoopipe/report.py ADDED
@@ -0,0 +1,173 @@
+ import enum
+ import logging
+ import sys
+ import threading
+ import typing
+ from datetime import datetime
+
+
+ def get_logger(name: str = "zoopipe") -> logging.Logger:
+     """
+     Get a configured logger for zoopipe.
+
+     Args:
+         name: Name of the logger to retrieve.
+     """
+     logger = logging.getLogger(name)
+     if not logger.handlers:
+         handler = logging.StreamHandler(sys.stdout)
+         handler.setFormatter(
+             logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+         )
+         logger.addHandler(handler)
+         logger.setLevel(logging.INFO)
+     return logger
+
+
+ class EntryStatus(enum.Enum):
+     """
+     Status of an individual data entry in the pipeline lifecycle.
+
+     - PENDING: Initial state after ingestion.
+     - VALIDATED: Successfully passed schema validation.
+     - FAILED: Encountered validation errors or processing issues.
+     """
+
+     PENDING = "pending"
+     VALIDATED = "validated"
+     FAILED = "failed"
+
+
+ class EntryTypedDict(typing.TypedDict):
+     """
+     Structure of the record envelope as it flows through the pipeline.
+
+     The envelope contains not only the actual business data but also
+     operational metadata, unique identification, and error tracking.
+     """
+
+     id: typing.Any
+     position: int | None
+     status: EntryStatus
+     raw_data: dict[str, typing.Any]
+     validated_data: dict[str, typing.Any] | None
+     errors: list[dict[str, typing.Any]]
+     metadata: dict[str, typing.Any]
+
+
+ class FlowStatus(enum.Enum):
+     """
+     Lifecycle status of a Pipe or PipeManager execution.
+
+     - PENDING: Execution hasn't started yet.
+     - RUNNING: Actively processing batches.
+     - COMPLETED: Finished successfully (all source data consumed).
+     - FAILED: Partially finished due to an unhandled exception.
+     - ABORTED: Stopped manually by the user.
+     """
+
+     PENDING = "pending"
+     RUNNING = "running"
+     COMPLETED = "completed"
+     FAILED = "failed"
+     ABORTED = "aborted"
+
+
+ class FlowReport:
+     """
+     Live progress tracker and final summary for a pipeline execution.
+
+     FlowReport provides real-time access to processing metrics,
+     memory usage, and execution status. It is automatically updated
+     by the Rust core during execution.
+     """
+
+     def __init__(self) -> None:
+         """Initialize an empty FlowReport."""
+         self.status = FlowStatus.PENDING
+         self.total_processed = 0
+         self.success_count = 0
+         self.error_count = 0
+         self.ram_bytes = 0
+         self.exception: Exception | None = None
+         self.start_time: datetime | None = None
+         self.end_time: datetime | None = None
+         self._finished_event = threading.Event()
+
+     @property
+     def duration(self) -> float:
+         """Total execution time in seconds."""
+         start = self.start_time
+         if not start:
+             return 0.0
+         end = self.end_time or datetime.now()
+         return (end - start).total_seconds()
+
+     @property
+     def items_per_second(self) -> float:
+         """Processing speed (items per second)."""
+         duration = self.duration
+         if duration == 0:
+             return 0.0
+         return self.total_processed / duration
+
+     @property
+     def is_finished(self) -> bool:
+         """Check if the pipeline has finished."""
+         return self._finished_event.is_set()
+
+     def wait(self, timeout: float | None = None) -> bool:
+         """
+         Wait for the pipeline to finish.
+
+         Args:
+             timeout: Optional timeout in seconds.
+
+         Returns:
+             True if the pipeline finished, False if it timed out.
+         """
+         return self._finished_event.wait(timeout)
+
+     def _mark_running(self) -> None:
+         self.status = FlowStatus.RUNNING
+         self.start_time = datetime.now()
+
+     def _mark_completed(self) -> None:
+         self.status = FlowStatus.COMPLETED
+         self.end_time = datetime.now()
+         self._finished_event.set()
+
+     def abort(self) -> None:
+         """Abort the pipeline execution."""
+         self.status = FlowStatus.ABORTED
+         self.end_time = datetime.now()
+         self._finished_event.set()
+
+     def _mark_failed(self, exception: Exception) -> None:
+         self.status = FlowStatus.FAILED
+         self.exception = exception
+         self.end_time = datetime.now()
+         self._finished_event.set()
+
+     def __repr__(self) -> str:
+         return (
+             f"<FlowReport status={self.status.value} "
+             f"processed={self.total_processed} "
+             f"success={self.success_count} "
+             f"error={self.error_count} "
+             f"ram={self.ram_bytes / 1024 / 1024:.2f}MB "
+             f"fps={self.items_per_second:.2f} "
+             f"duration={self.duration:.2f}s>"
+         )
+
+     def __getstate__(self) -> dict:
+         """Serialize the report state, excluding non-picklable lock objects."""
+         state = self.__dict__.copy()
+         del state["_finished_event"]
+         return state
+
+     def __setstate__(self, state: dict) -> None:
+         """Restore the report state and reconstruct the event lock."""
+         self.__dict__.update(state)
+         self._finished_event = threading.Event()
+         if self.status in (FlowStatus.COMPLETED, FlowStatus.FAILED, FlowStatus.ABORTED):
+             self._finished_event.set()
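
Since FlowReport.wait returns False on timeout, a caller can poll the live report from its own thread while the Rust core updates it. A sketch, assuming a pipe built as in the pipe.py example above:

    pipe.start()
    while not pipe.report.wait(timeout=1.0):  # False means still running
        r = pipe.report
        print(
            f"{r.status.value}: {r.total_processed} processed, "
            f"{r.error_count} errors, {r.items_per_second:.0f}/s"
        )
    print(pipe.report)  # final summary via __repr__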
zoopipe/utils/__init__.py ADDED
File without changes
zoopipe/utils/dependency.py ADDED
@@ -0,0 +1,78 @@
+ import importlib.util
+ import shutil
+ import subprocess
+ import sys
+
+
+ def _try_env_install_with_pip(packages: list[str]) -> bool:
+     """
+     Try to install packages using the standard pip module.
+     """
+     if importlib.util.find_spec("pip") is None:
+         return False
+
+     try:
+         subprocess.check_call(
+             [sys.executable, "-m", "pip", "install", *packages],
+             stdout=subprocess.DEVNULL,
+             stderr=subprocess.DEVNULL,
+         )
+         return True
+     except (subprocess.CalledProcessError, OSError):
+         return False
+
+
+ def _try_env_install_with_uv(packages: list[str]) -> bool:
+     """
+     Try to install packages using 'uv pip install'.
+     """
+     uv_path = shutil.which("uv")
+     if not uv_path:
+         return False
+
+     try:
+         subprocess.check_call(
+             [uv_path, "pip", "install", *packages],
+             stdout=subprocess.DEVNULL,
+             stderr=subprocess.DEVNULL,
+         )
+         return True
+     except (subprocess.CalledProcessError, OSError):
+         return False
+
+
+ def _try_env_install_with_poetry(packages: list[str]) -> bool:
+     """
+     Try to install packages using 'poetry run pip install'.
+     """
+     poetry_path = shutil.which("poetry")
+     if not poetry_path:
+         return False
+
+     try:
+         # Assumes the current process is running inside a poetry environment
+         subprocess.check_call(
+             [poetry_path, "run", "pip", "install", *packages],
+             stdout=subprocess.DEVNULL,
+             stderr=subprocess.DEVNULL,
+         )
+         return True
+     except (subprocess.CalledProcessError, OSError):
+         return False
+
+
+ def install_dependencies(packages: list[str]) -> None:
+     """
+     Install dependencies using whichever package manager is available,
+     trying strategies in order: pip -> uv -> poetry.
+     If all fail, it does nothing (assuming manual provisioning).
+     """
+     if not packages:
+         return
+
+     if _try_env_install_with_pip(packages):
+         return
+     if _try_env_install_with_uv(packages):
+         return
+     if _try_env_install_with_poetry(packages):
+         return
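
install_dependencies is deliberately best-effort: every strategy swallows its errors and returns a bool, and the final fall-through does nothing, so a subsequent import is the only reliable check. A sketch of guarding an optional import with it (openpyxl as the example package is an assumption, not taken from this diff):

    from zoopipe.utils.dependency import install_dependencies

    # Best-effort provisioning of an optional extra before importing it.
    install_dependencies(["openpyxl"])  # assumed optional dependency
    try:
        import openpyxl  # noqa: F401
    except ImportError as exc:
        # install_dependencies never raises, so the import is the real check.
        raise RuntimeError("openpyxl is required; install it manually") from exc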
zoopipe/zoopipe_rust_core.abi3.so ADDED
Binary file