zoopipe 2026.1.20__cp310-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zoopipe/__init__.py +72 -0
- zoopipe/engines/__init__.py +4 -0
- zoopipe/engines/base.py +45 -0
- zoopipe/engines/dask.py +225 -0
- zoopipe/engines/local.py +215 -0
- zoopipe/engines/ray.py +252 -0
- zoopipe/hooks/__init__.py +4 -0
- zoopipe/hooks/base.py +70 -0
- zoopipe/hooks/sql.py +94 -0
- zoopipe/input_adapter/__init__.py +24 -0
- zoopipe/input_adapter/arrow.py +38 -0
- zoopipe/input_adapter/base.py +48 -0
- zoopipe/input_adapter/csv.py +144 -0
- zoopipe/input_adapter/duckdb.py +54 -0
- zoopipe/input_adapter/excel.py +51 -0
- zoopipe/input_adapter/json.py +73 -0
- zoopipe/input_adapter/kafka.py +39 -0
- zoopipe/input_adapter/parquet.py +85 -0
- zoopipe/input_adapter/pygen.py +37 -0
- zoopipe/input_adapter/sql.py +103 -0
- zoopipe/manager.py +211 -0
- zoopipe/output_adapter/__init__.py +23 -0
- zoopipe/output_adapter/arrow.py +50 -0
- zoopipe/output_adapter/base.py +41 -0
- zoopipe/output_adapter/csv.py +71 -0
- zoopipe/output_adapter/duckdb.py +46 -0
- zoopipe/output_adapter/excel.py +42 -0
- zoopipe/output_adapter/json.py +66 -0
- zoopipe/output_adapter/kafka.py +39 -0
- zoopipe/output_adapter/parquet.py +49 -0
- zoopipe/output_adapter/pygen.py +29 -0
- zoopipe/output_adapter/sql.py +43 -0
- zoopipe/pipe.py +263 -0
- zoopipe/protocols.py +37 -0
- zoopipe/py.typed +0 -0
- zoopipe/report.py +173 -0
- zoopipe/utils/__init__.py +0 -0
- zoopipe/utils/dependency.py +78 -0
- zoopipe/zoopipe_rust_core.abi3.so +0 -0
- zoopipe-2026.1.20.dist-info/METADATA +231 -0
- zoopipe-2026.1.20.dist-info/RECORD +43 -0
- zoopipe-2026.1.20.dist-info/WHEEL +4 -0
- zoopipe-2026.1.20.dist-info/licenses/LICENSE +21 -0
zoopipe/engines/ray.py
ADDED
@@ -0,0 +1,252 @@
from __future__ import annotations

import os
import re
from datetime import datetime
from importlib import metadata
from typing import TYPE_CHECKING, Any

import ray

from zoopipe.engines.base import BaseEngine
from zoopipe.engines.local import PipeReport
from zoopipe.report import FlowReport, FlowStatus
from zoopipe.utils.dependency import install_dependencies

if TYPE_CHECKING:
    from zoopipe.pipe import Pipe


@ray.remote(memory=512 * 1024 * 1024)  # Limit actor memory to 512MB
class RayPipeWorker:
    """
    Ray Actor that wraps a single Pipe execution.
    """

    def __init__(self, pipe: Pipe, index: int):
        self.pipe = pipe
        self.index = index
        self.is_finished = False
        self.has_error = False

    def run(self) -> None:
        try:
            self.pipe.start(wait=True)
        except Exception:
            self.has_error = True
        finally:
            self.is_finished = True

    def get_report(self) -> PipeReport:
        report = self.pipe.report
        return PipeReport(
            pipe_index=self.index,
            total_processed=report.total_processed,
            success_count=report.success_count,
            error_count=report.error_count,
            ram_bytes=report.ram_bytes,
            is_finished=self.is_finished or report.is_finished,
            has_error=self.has_error,
            is_alive=not self.is_finished,
        )


@ray.remote
def _install_dependencies(packages: list[str]) -> None:
    install_dependencies(packages)


class RayEngine(BaseEngine):
    """
    Distributed execution engine using Ray.
    """

    def __init__(self, address: str | None = None, **kwargs: Any):
        if not ray.is_initialized():
            # Silence the accelerator visible devices warning for future Ray versions
            os.environ.setdefault("RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO", "0")

            # Prepare default runtime_env and get dependencies
            runtime_env, deps = self._prepare_runtime_env(kwargs.pop("runtime_env", {}))

            # Default to lean initialization
            ray_args = {
                "address": address,
                "num_cpus": kwargs.pop("num_cpus", None),
                "include_dashboard": kwargs.pop("include_dashboard", False),
                "logging_level": kwargs.pop("logging_level", "error"),
                "runtime_env": runtime_env,
                **kwargs,
            }
            ray.init(**ray_args)

            # Manually install dependencies on all nodes using our agnostic strategy
            if deps:
                self._install_deps_on_all_nodes(deps)

        self._workers: list[Any] = []
        self._futures: list[Any] = []
        self._start_time: datetime | None = None
        self._cached_report: FlowReport | None = None

    def _install_deps_on_all_nodes(self, deps: list[str]) -> None:
        """
        Run the agnostic dependency installer on all connected Ray nodes.
        This provides support for pip, uv, and poetry environments.
        """
        nodes = ray.nodes()
        alive_nodes = [n for n in nodes if n.get("Alive")]

        refs = []
        for node in alive_nodes:
            # We use resources placement group strategy to force execution
            # on a specific node. The 'node:<ip>' resource is automatically
            # present on each node.
            node_ip = node.get("NodeManagerAddress")
            if node_ip:
                refs.append(
                    _install_dependencies.options(
                        resources={f"node:{node_ip}": 0.001}
                    ).remote(deps)
                )

        if refs:
            ray.get(refs)

    def _prepare_runtime_env(
        self, runtime_env: dict[str, Any]
    ) -> tuple[dict[str, Any], list[str]]:
        """
        Configure the Ray runtime environment based on whether we are in
        development mode or being used as a library.
        Returns the modified runtime_env and a list of dependencies to install manually.
        """
        # 1. Detect environment and versions
        is_dev_mode = False
        try:
            # Heuristic: if we are in the zoopipe repo and have the ABI, it's dev mode
            if (
                os.path.exists("src/zoopipe")
                and os.path.exists("pyproject.toml")
                and any(f.endswith(".so") for f in os.listdir("src/zoopipe"))
            ):
                is_dev_mode = True
        except Exception:
            pass

        # 2. Setup pip dependencies
        deps = []
        if "pip" not in runtime_env:
            if is_dev_mode:
                # Dev mode: Extract dependencies from pyproject.toml (Source of Truth)
                try:
                    with open("pyproject.toml", "r") as f:
                        toml_content = f.read()
                    # Find dependencies = [ ... ] block
                    match = re.search(
                        r"dependencies\s*=\s*\[(.*?)\]", toml_content, re.DOTALL
                    )
                    if match:
                        dep_block = match.group(1)
                        deps = re.findall(r'["\'](.*?)["\']', dep_block)
                except Exception:
                    pass
            else:
                # User mode: zoopipe package will pull its own dependencies
                try:
                    version = metadata.version("zoopipe")
                    deps.append(f"zoopipe=={version}")
                except metadata.PackageNotFoundError:
                    # Fallback to hardcoded core if everything fails
                    deps = ["pydantic>=2.0"]

            # NOTE: We DO NOT set 'pip' in runtime_env because we want to use our
            # agnostic installer.
            # runtime_env["pip"] = deps <-- REMOVED

        # 3. Ship code and binaries
        if "working_dir" not in runtime_env:
            runtime_env["working_dir"] = "."

        # In dev mode, we need src/ in PYTHONPATH to find the local zoopipe
        if is_dev_mode:
            env_vars = runtime_env.get("env_vars", {})
            if "PYTHONPATH" not in env_vars:
                # Ray adds working_dir to sys.path,
                # but we need src/ for 'import zoopipe'
                env_vars["PYTHONPATH"] = "./src"
            runtime_env["env_vars"] = env_vars

        return runtime_env, deps

    def start(self, pipes: list[Pipe]) -> None:
        if self.is_running:
            raise RuntimeError("RayEngine is already running")

        self._start_time = datetime.now()
        self._workers = [RayPipeWorker.remote(pipe, i) for i, pipe in enumerate(pipes)]
        self._futures = [w.run.remote() for w in self._workers]
        self._cached_report = None

    def wait(self, timeout: float | None = None) -> bool:
        if not self._futures:
            return True

        ready, _ = ray.wait(
            self._futures, num_returns=len(self._futures), timeout=timeout
        )
        return len(ready) == len(self._futures)

    def shutdown(self, timeout: float = 5.0) -> None:
        for worker in self._workers:
            ray.kill(worker)
        self._workers = []
        self._futures = []
        self._cached_report = None

    @property
    def is_running(self) -> bool:
        if not self._futures:
            return False
        ready, _ = ray.wait(self._futures, num_returns=len(self._futures), timeout=0)
        return len(ready) < len(self._futures)

    @property
    def report(self) -> FlowReport:
        if self._cached_report and self._cached_report.is_finished:
            return self._cached_report

        report = FlowReport()
        report.start_time = self._start_time

        p_reports = self.pipe_reports
        for pr in p_reports:
            report.total_processed += pr.total_processed
            report.success_count += pr.success_count
            report.error_count += pr.error_count
            report.ram_bytes += pr.ram_bytes

        all_finished = all(pr.is_finished for pr in p_reports)
        any_error = any(pr.has_error for pr in p_reports)

        if all_finished:
            report.status = FlowStatus.FAILED if any_error else FlowStatus.COMPLETED
            report.end_time = datetime.now()
            report._finished_event.set()
            self._cached_report = report
        else:
            report.status = FlowStatus.RUNNING

        return report

    @property
    def pipe_reports(self) -> list[PipeReport]:
        if not self._workers:
            return []
        # Centralized collection from all actors in one pass
        return ray.get([w.get_report.remote() for w in self._workers])

    def get_pipe_report(self, index: int) -> PipeReport:
        if not self._workers:
            raise RuntimeError("Engine has not been started")
        return ray.get(self._workers[index].get_report.remote())
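For orientation, a minimal usage sketch of the engine defined above. RayEngine and its start/wait/report/shutdown API come from this file; how the Pipe objects themselves are built lives in zoopipe/pipe.py (not shown here), so that part is assumed.

# Hedged sketch: engine API taken from this file; Pipe construction assumed.
from zoopipe.engines.ray import RayEngine


def run_distributed(pipes):  # pipes: list[Pipe], built elsewhere
    engine = RayEngine(address=None, num_cpus=4)  # starts or joins a Ray cluster
    engine.start(pipes)      # one RayPipeWorker actor per pipe
    engine.wait()            # block until every actor's run() returns
    report = engine.report   # aggregated FlowReport across all pipes
    print(report.status, report.total_processed, report.error_count)
    engine.shutdown()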
zoopipe/hooks/base.py
ADDED
@@ -0,0 +1,70 @@
import typing

from zoopipe.report import EntryTypedDict

#: Type alias for the shared state between hooks.
HookStore = dict[str, typing.Any]


class HookPriority:
    """
    Standard priority levels for hooks.

    Lower values correspond to higher priority (run earlier).
    """

    VERY_HIGH = 0
    HIGH = 25
    NORMAL = 50
    LOW = 75
    VERY_LOW = 100


class BaseHook:
    """
    Abstract base class for pipeline lifecycle hooks.

    Hooks allow executing custom Python logic at different stages of
    the pipeline (setup, batch execution, and teardown). They can maintain
    state between batches using the shared HookStore.
    """

    def __init__(self, priority: int = HookPriority.NORMAL):
        """
        Initialize the hook with a specific priority.

        Args:
            priority: Execution order (lower values run first).
        """
        self.priority = priority

    def setup(self, store: HookStore) -> None:
        """
        Called once before the pipeline starts processing data.

        Use this to initialize connections, resources, or shared state.
        """
        pass

    def execute(
        self, entries: list[EntryTypedDict], store: HookStore
    ) -> list[EntryTypedDict]:
        """
        Process a batch of entries.

        This method is where transformations or decorations happen.
        It can modify the entries in-place or return a new list.

        Args:
            entries: List of dictionaries representing pipeline items.
            store: Shared state between different hooks and different batches.
        """
        return entries

    def teardown(self, store: HookStore) -> None:
        """
        Called once after the pipeline finishes or if an error occurs.

        Use this to release resources, close connections, or log final stats.
        """
        pass
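To make the lifecycle concrete, an illustrative subclass follows. The setup/execute/teardown contract, HookStore, and HookPriority come from this file; the specific 'name'-uppercasing transformation is invented for the example.

from zoopipe.hooks.base import BaseHook, HookPriority, HookStore


class UppercaseNameHook(BaseHook):
    """Illustrative hook: upper-cases a 'name' field and counts processed entries."""

    def __init__(self) -> None:
        super().__init__(priority=HookPriority.HIGH)  # run before NORMAL-priority hooks

    def setup(self, store: HookStore) -> None:
        store["seen"] = 0  # shared state persists across batches

    def execute(self, entries, store: HookStore):
        for entry in entries:
            raw = entry["raw_data"]
            if isinstance(raw.get("name"), str):
                raw["name"] = raw["name"].upper()
        store["seen"] += len(entries)
        return entries

    def teardown(self, store: HookStore) -> None:
        print(f"processed {store['seen']} entries")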
zoopipe/hooks/sql.py
ADDED
@@ -0,0 +1,94 @@
import typing

from zoopipe.hooks.base import BaseHook, HookStore
from zoopipe.report import EntryStatus, get_logger

if typing.TYPE_CHECKING:
    from zoopipe.report import EntryTypedDict


class SQLExpansionHook(BaseHook):
    """
    Expands anchor records (e.g., ID ranges) into full records by querying a SQL table.

    This hook is designed to work with chunked data ingestion. It takes minimal
    identifying information (anchors) and performs a bulk fetch from the database
    to retrieve the complete rows.
    """

    def __init__(
        self, connection_factory: typing.Callable[[], typing.Any], table_name: str
    ):
        """
        Initialize the SQLExpansionHook.

        Args:
            connection_factory: Callable that returns a database connection.
            table_name: Name of the SQL table to fetch data from.
        """
        super().__init__()
        self.connection_factory = connection_factory
        self.table_name = table_name
        self.logger = get_logger()

    def setup(self, store: HookStore) -> None:
        pass

    def execute(
        self, entries: list["EntryTypedDict"], store: HookStore
    ) -> list["EntryTypedDict"]:
        expanded = []
        conn = self.connection_factory()

        try:
            cursor = conn.cursor()
            self.logger.debug(
                f"SQLExpansionHook: Expanding batch of {len(entries)} anchor(s)"
            )

            for anchor in entries:
                raw = anchor["raw_data"]
                min_id = raw.get("min_id")
                max_id = raw.get("max_id")

                if min_id is None or max_id is None:
                    continue

                cursor.execute(
                    f"SELECT * FROM {self.table_name} WHERE id BETWEEN ? AND ?",
                    (min_id, max_id),
                )

                columns = (
                    [column[0] for column in cursor.description]
                    if cursor.description
                    else []
                )

                rows = cursor.fetchall()

                for row in rows:
                    if columns:
                        data = dict(zip(columns, row))
                    else:
                        data = dict(row)

                    expanded.append(
                        {
                            "id": None,
                            "position": None,
                            "status": EntryStatus.PENDING,
                            "raw_data": data,
                            "validated_data": None,
                            "metadata": anchor["metadata"],
                            "errors": [],
                        }
                    )
            cursor.close()
        finally:
            conn.close()

        return expanded

    def teardown(self, store: HookStore) -> None:
        pass
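A hedged example of wiring the hook above to SQLite. The constructor arguments and the expected {"min_id", "max_id"} anchor shape come from this file; because the query uses "?" placeholders, the factory must return a DB-API connection with qmark parameter style (sqlite3 qualifies). How the hook is attached to a pipe is not shown in this file.

import sqlite3

from zoopipe.hooks.sql import SQLExpansionHook


def make_connection() -> sqlite3.Connection:
    # A fresh connection per batch; the hook closes it in its finally block.
    return sqlite3.connect("users.db")


expansion = SQLExpansionHook(connection_factory=make_connection, table_name="users")

# Each anchor entry is expected to carry {"min_id": ..., "max_id": ...} in raw_data;
# execute() replaces every anchor with one PENDING entry per matching 'users' row.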
zoopipe/input_adapter/__init__.py
ADDED
@@ -0,0 +1,24 @@
from zoopipe.input_adapter.arrow import ArrowInputAdapter
from zoopipe.input_adapter.base import BaseInputAdapter
from zoopipe.input_adapter.csv import CSVInputAdapter
from zoopipe.input_adapter.duckdb import DuckDBInputAdapter
from zoopipe.input_adapter.excel import ExcelInputAdapter
from zoopipe.input_adapter.json import JSONInputAdapter
from zoopipe.input_adapter.kafka import KafkaInputAdapter
from zoopipe.input_adapter.parquet import ParquetInputAdapter
from zoopipe.input_adapter.pygen import PyGeneratorInputAdapter
from zoopipe.input_adapter.sql import SQLInputAdapter, SQLPaginationInputAdapter

__all__ = [
    "BaseInputAdapter",
    "CSVInputAdapter",
    "JSONInputAdapter",
    "DuckDBInputAdapter",
    "ArrowInputAdapter",
    "ExcelInputAdapter",
    "SQLInputAdapter",
    "SQLPaginationInputAdapter",
    "ParquetInputAdapter",
    "PyGeneratorInputAdapter",
    "KafkaInputAdapter",
]
zoopipe/input_adapter/arrow.py
ADDED
@@ -0,0 +1,38 @@
import pathlib
import typing

from zoopipe.input_adapter.base import BaseInputAdapter
from zoopipe.zoopipe_rust_core import ArrowReader


class ArrowInputAdapter(BaseInputAdapter):
    """
    Reads records from Apache Arrow IPC (feather) files.

    Provides high-speed sequential access to Arrow data with minimal
    serialization overhead.
    """

    def __init__(
        self,
        source: typing.Union[str, pathlib.Path],
        generate_ids: bool = True,
    ):
        """
        Initialize the ArrowInputAdapter.

        Args:
            source: Path to the Arrow file.
            generate_ids: Whether to generate unique IDs for each record.
        """
        self.source_path = str(source)
        self.generate_ids = generate_ids

    def get_native_reader(self) -> ArrowReader:
        return ArrowReader(
            self.source_path,
            generate_ids=self.generate_ids,
        )


__all__ = ["ArrowInputAdapter"]
zoopipe/input_adapter/base.py
ADDED
@@ -0,0 +1,48 @@
import abc
import typing


class BaseInputAdapter(abc.ABC):
    """
    Abstract base class for all input adapters.

    Input adapters are responsible for providing a native Rust reader
    and optional hooks that are specific to the data source.
    """

    @abc.abstractmethod
    def get_native_reader(self) -> typing.Any:
        """
        Return the underlying Rust reader instance.

        This reader must implement the common reader interface in Rust
        to be compatible with the NativePipe.
        """
        raise NotImplementedError

    def get_hooks(self) -> list[typing.Any]:
        """
        Return a list of hooks to be executed by the pipeline.

        Typically used for pre-fetching data or expanding anchor records
        before they reach the main processing stage.
        """
        return []

    @property
    def can_split(self) -> bool:
        """Return True if this adapter supports parallel splitting."""
        return type(self).split != BaseInputAdapter.split

    def split(self, workers: int) -> typing.List["BaseInputAdapter"]:
        """
        Split the input adapter into `workers` shards for parallel processing.

        Args:
            workers: Number of partitions to create.

        Returns:
            A list of input adapters, each responsible for a subset of the data.
            Default implementation returns [self] (no splitting).
        """
        return [self]
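As a rough illustration of the contract (not a pipeline-ready adapter, since a real get_native_reader() must return one of the Rust readers such as CSVReader or ArrowReader), a subclass only has to implement get_native_reader(); can_split becomes True only when split() is overridden, as CSVInputAdapter does below.

import typing

from zoopipe.input_adapter.base import BaseInputAdapter


class ListInputAdapter(BaseInputAdapter):
    """Toy adapter over an in-memory list -- illustrative only."""

    def __init__(self, rows: list[dict]):
        self.rows = rows

    def get_native_reader(self) -> typing.Any:
        # Real adapters return a reader from zoopipe.zoopipe_rust_core here;
        # returning the raw rows is only a stand-in for the example.
        return self.rows


adapter = ListInputAdapter([{"a": 1}, {"a": 2}])
print(adapter.can_split)  # False: split() is inherited, not overridden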
zoopipe/input_adapter/csv.py
ADDED
@@ -0,0 +1,144 @@
import csv
import pathlib
import typing

from zoopipe.input_adapter.base import BaseInputAdapter
from zoopipe.zoopipe_rust_core import CSVReader, get_file_size


class CSVInputAdapter(BaseInputAdapter):
    """
    A high-performance CSV reader supporting both local and S3 sources.

    Uses a multi-threaded parser in the Rust core to ensure fast data ingestion
    without blocking the Python GIL.
    """

    def __init__(
        self,
        source: typing.Union[str, pathlib.Path],
        delimiter: str = ",",
        quotechar: str = '"',
        skip_rows: int = 0,
        fieldnames: list[str] | None = None,
        generate_ids: bool = True,
        limit: int | None = None,
        start_byte: int = 0,
        end_byte: int | None = None,
    ):
        """
        Initialize the CSVInputAdapter.

        Args:
            source: Path to the CSV file or S3 URI.
            delimiter: Column separator.
            quotechar: Character used for quoting fields.
            skip_rows: Number of rows to skip at the beginning.
            fieldnames: Optional list of column names.
            generate_ids: Whether to generate unique IDs for each record.
            limit: Maximum number of rows to read (optional).
            start_byte: Byte offset to start reading from.
            end_byte: Byte offset to stop reading at.
        """
        self.source_path = str(source)
        self.delimiter = delimiter
        self.quotechar = quotechar
        self.skip_rows = skip_rows
        self.fieldnames = fieldnames
        self.generate_ids = generate_ids
        self.limit = limit
        self.start_byte = start_byte
        self.end_byte = end_byte

    def split(self, workers: int) -> typing.List["CSVInputAdapter"]:
        """
        Split the CSV input into `workers` byte-range shards.
        """

        file_size = get_file_size(self.source_path)

        chunk_size = file_size // workers

        # Ensure we have fieldnames if not explicitly provided.
        # This is CRITICAL for partial reads (start_byte > 0).
        final_fieldnames = self.fieldnames
        if final_fieldnames is None:
            if self.source_path.startswith("s3://"):
                # Use Rust reader to discover headers from S3
                final_fieldnames = self.get_native_reader().headers
            else:
                with open(self.source_path, "r") as f:
                    reader = csv.reader(
                        f, delimiter=self.delimiter, quotechar=self.quotechar
                    )
                    try:
                        final_fieldnames = next(reader)
                    except StopIteration:
                        final_fieldnames = []

        shards = []
        for i in range(workers):
            start = i * chunk_size
            # Last worker takes rest of file
            end = (i + 1) * chunk_size if i < workers - 1 else None

            shards.append(
                self.__class__(
                    source=self.source_path,
                    delimiter=self.delimiter,
                    quotechar=self.quotechar,
                    skip_rows=self.skip_rows,
                    fieldnames=final_fieldnames,
                    generate_ids=self.generate_ids,
                    limit=self.limit,
                    start_byte=start,
                    end_byte=end,
                )
            )
        return shards

    def get_native_reader(self) -> CSVReader:
        # Pass start_byte and end_byte
        return CSVReader(
            self.source_path,
            delimiter=ord(self.delimiter),
            quote=ord(self.quotechar),
            skip_rows=self.skip_rows,
            fieldnames=self.fieldnames,
            generate_ids=self.generate_ids,
            limit=self.limit,
            start_byte=self.start_byte,
            end_byte=self.end_byte,
        )

    @staticmethod
    def count_rows(
        source: str | pathlib.Path,
        delimiter: str = ",",
        quotechar: str = '"',
        has_header: bool = True,
    ) -> int:
        """
        Efficiently count the number of rows in a CSV file using the Rust core.

        Args:
            source: Path to the CSV file.
            delimiter: Column separator. (Default: ',')
            quotechar: Character used for quoting. (Default: '"')
            has_header: Whether the first row is a header rather than a data
                record; passed through to the Rust core so it can decide
                whether to exclude that row from the count.

        Returns:
            Number of rows (records).
        """
        return CSVReader.count_rows(
            str(source),
            ord(delimiter),
            ord(quotechar),
            has_header,
        )


__all__ = ["CSVInputAdapter"]
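A short usage sketch of the adapter above; the file path is a placeholder, and feeding the resulting shards to parallel pipes is handled by the engines, not by this file.

from zoopipe.input_adapter.csv import CSVInputAdapter

adapter = CSVInputAdapter("data/events.csv", delimiter=";")

# Record count computed in the Rust core without materialising rows in Python.
total = CSVInputAdapter.count_rows("data/events.csv", delimiter=";")

# Four byte-range shards; the header row is discovered once and copied into every
# shard so that workers starting at start_byte > 0 still know the column names.
shards = adapter.split(workers=4)
for shard in shards:
    print(shard.start_byte, shard.end_byte)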