zoopipe-2026.1.20-cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. zoopipe/__init__.py +72 -0
  2. zoopipe/engines/__init__.py +4 -0
  3. zoopipe/engines/base.py +45 -0
  4. zoopipe/engines/dask.py +225 -0
  5. zoopipe/engines/local.py +215 -0
  6. zoopipe/engines/ray.py +252 -0
  7. zoopipe/hooks/__init__.py +4 -0
  8. zoopipe/hooks/base.py +70 -0
  9. zoopipe/hooks/sql.py +94 -0
  10. zoopipe/input_adapter/__init__.py +24 -0
  11. zoopipe/input_adapter/arrow.py +38 -0
  12. zoopipe/input_adapter/base.py +48 -0
  13. zoopipe/input_adapter/csv.py +144 -0
  14. zoopipe/input_adapter/duckdb.py +54 -0
  15. zoopipe/input_adapter/excel.py +51 -0
  16. zoopipe/input_adapter/json.py +73 -0
  17. zoopipe/input_adapter/kafka.py +39 -0
  18. zoopipe/input_adapter/parquet.py +85 -0
  19. zoopipe/input_adapter/pygen.py +37 -0
  20. zoopipe/input_adapter/sql.py +103 -0
  21. zoopipe/manager.py +211 -0
  22. zoopipe/output_adapter/__init__.py +23 -0
  23. zoopipe/output_adapter/arrow.py +50 -0
  24. zoopipe/output_adapter/base.py +41 -0
  25. zoopipe/output_adapter/csv.py +71 -0
  26. zoopipe/output_adapter/duckdb.py +46 -0
  27. zoopipe/output_adapter/excel.py +42 -0
  28. zoopipe/output_adapter/json.py +66 -0
  29. zoopipe/output_adapter/kafka.py +39 -0
  30. zoopipe/output_adapter/parquet.py +49 -0
  31. zoopipe/output_adapter/pygen.py +29 -0
  32. zoopipe/output_adapter/sql.py +43 -0
  33. zoopipe/pipe.py +263 -0
  34. zoopipe/protocols.py +37 -0
  35. zoopipe/py.typed +0 -0
  36. zoopipe/report.py +173 -0
  37. zoopipe/utils/__init__.py +0 -0
  38. zoopipe/utils/dependency.py +78 -0
  39. zoopipe/zoopipe_rust_core.abi3.so +0 -0
  40. zoopipe-2026.1.20.dist-info/METADATA +231 -0
  41. zoopipe-2026.1.20.dist-info/RECORD +43 -0
  42. zoopipe-2026.1.20.dist-info/WHEEL +4 -0
  43. zoopipe-2026.1.20.dist-info/licenses/LICENSE +21 -0
zoopipe/input_adapter/duckdb.py ADDED
@@ -0,0 +1,54 @@
+ import pathlib
+ import typing
+
+ from zoopipe.input_adapter.base import BaseInputAdapter
+ from zoopipe.zoopipe_rust_core import DuckDBReader
+
+
+ class DuckDBInputAdapter(BaseInputAdapter):
+     """
+     Executes SQL queries against DuckDB database files.
+
+     Directly interfaces with DuckDB to stream query results, enabling
+     efficient processing of large datasets stored in analytical databases.
+     """
+
+     def __init__(
+         self,
+         source: typing.Union[str, pathlib.Path],
+         query: str | None = None,
+         table_name: str | None = None,
+         generate_ids: bool = True,
+     ):
+         """
+         Initialize the DuckDBInputAdapter.
+
+         Args:
+             source: Path to the DuckDB database file.
+             query: SQL query to execute.
+             table_name: Alternatively, the name of a table to read (equivalent to SELECT * FROM table_name).
+             generate_ids: Whether to generate unique IDs for each record.
+         """
+         self.source_path = str(source)
+         self.generate_ids = generate_ids
+
+         if query is None and table_name is None:
+             raise ValueError("Either query or table_name must be provided")
+
+         if query is not None and table_name is not None:
+             raise ValueError("Only one of query or table_name should be provided")
+
+         if query is not None:
+             self.query = query
+         else:
+             self.query = f"SELECT * FROM {table_name}"
+
+     def get_native_reader(self) -> DuckDBReader:
+         return DuckDBReader(
+             self.source_path,
+             self.query,
+             generate_ids=self.generate_ids,
+         )
+
+
+ __all__ = ["DuckDBInputAdapter"]
zoopipe/input_adapter/excel.py ADDED
@@ -0,0 +1,51 @@
+ import pathlib
+ import typing
+
+ from zoopipe.input_adapter.base import BaseInputAdapter
+ from zoopipe.zoopipe_rust_core import ExcelReader
+
+
+ class ExcelInputAdapter(BaseInputAdapter):
+     """
+     Reads Excel files (.xlsx, .xls, .ods, .xlsb) using the Calamine engine.
+
+     Provides high-performance, memory-efficient parsing of various spreadsheet
+     formats directly from Rust. Supports sheet selection by name or index,
+     skipping header rows, and custom field mapping.
+     """
+
+     def __init__(
+         self,
+         source: typing.Union[str, pathlib.Path],
+         sheet: typing.Union[str, int, None] = None,
+         skip_rows: int = 0,
+         fieldnames: typing.Optional[typing.List[str]] = None,
+         generate_ids: bool = True,
+     ):
+         """
+         Initialize the ExcelInputAdapter.
+
+         Args:
+             source: Path to the Excel file.
+             sheet: Sheet name (str) or index (int) to read. Defaults to the first sheet.
+             skip_rows: Number of rows to skip at the beginning.
+             fieldnames: Optional list of column names.
+             generate_ids: Whether to generate unique IDs for each record.
+         """
+         self.source_path = str(source)
+         self.sheet = sheet
+         self.skip_rows = skip_rows
+         self.fieldnames = fieldnames
+         self.generate_ids = generate_ids
+
+     def get_native_reader(self) -> ExcelReader:
+         return ExcelReader(
+             self.source_path,
+             sheet=self.sheet,
+             skip_rows=self.skip_rows,
+             fieldnames=self.fieldnames,
+             generate_ids=self.generate_ids,
+         )
+
+
+ __all__ = ["ExcelInputAdapter"]
zoopipe/input_adapter/json.py ADDED
@@ -0,0 +1,73 @@
+ import pathlib
+ import typing
+
+ from zoopipe.input_adapter.base import BaseInputAdapter
+ from zoopipe.zoopipe_rust_core import JSONReader
+
+
+ class JSONInputAdapter(BaseInputAdapter):
+     """
+     Reads data from JSON or JSONLines (.jsonl) files.
+
+     It supports both standard JSON arrays and line-delimited records.
+     The adapter uses a fast Rust-based parser that streams data efficiently,
+     making it suitable for very large datasets.
+     """
+
+     def __init__(
+         self,
+         source: typing.Union[str, pathlib.Path],
+         start_byte: int = 0,
+         end_byte: int | None = None,
+     ):
+         """
+         Initialize the JSONInputAdapter.
+
+         Args:
+             source: Path to the JSONLines file.
+             start_byte: Byte offset to start reading from.
+             end_byte: Byte offset to stop reading at.
+         """
+         self.source_path = str(source)
+         self.start_byte = start_byte
+         self.end_byte = end_byte
+
+     @property
+     def can_split(self) -> bool:
+         """Only allow splitting for JSONLines/NDJSON formats."""
+         path = self.source_path.lower()
+         return path.endswith(".jsonl") or path.endswith(".ndjson")
+
+     def split(self, workers: int) -> typing.List["JSONInputAdapter"]:
+         """
+         Split the JSON input into `workers` byte-range shards.
+         """
+         from zoopipe.zoopipe_rust_core import get_file_size
+
+         file_size = get_file_size(self.source_path)
+
+         chunk_size = file_size // workers
+         shards = []
+         for i in range(workers):
+             start = i * chunk_size
+             # Last worker takes rest of file
+             end = (i + 1) * chunk_size if i < workers - 1 else None
+
+             shards.append(
+                 self.__class__(
+                     source=self.source_path,
+                     start_byte=start,
+                     end_byte=end,
+                 )
+             )
+         return shards
+
+     def get_native_reader(self) -> JSONReader:
+         return JSONReader(
+             self.source_path,
+             start_byte=self.start_byte,
+             end_byte=self.end_byte,
+         )
+
+
+ __all__ = ["JSONInputAdapter"]
zoopipe/input_adapter/kafka.py ADDED
@@ -0,0 +1,39 @@
+ from zoopipe.input_adapter.base import BaseInputAdapter
+ from zoopipe.zoopipe_rust_core import KafkaReader
+
+
+ class KafkaInputAdapter(BaseInputAdapter):
+     """
+     Consumes messages from Apache Kafka topics.
+
+     Acts as a Kafka consumer, streaming messages into the pipeline with
+     support for consumer groups and offset management.
+     """
+
+     def __init__(
+         self,
+         uri: str,
+         group_id: str | None = None,
+         generate_ids: bool = True,
+     ):
+         """
+         Kafka Input Adapter.
+
+         Args:
+             uri: Kafka URI (e.g., 'kafka://localhost:9092/topic')
+             group_id: Optional consumer group ID.
+             generate_ids: Whether to generate unique IDs for each message.
+         """
+         self.uri = uri
+         self.group_id = group_id
+         self.generate_ids = generate_ids
+
+     def get_native_reader(self) -> KafkaReader:
+         return KafkaReader(
+             self.uri,
+             group_id=self.group_id,
+             generate_ids=self.generate_ids,
+         )
+
+
+ __all__ = ["KafkaInputAdapter"]
zoopipe/input_adapter/parquet.py ADDED
@@ -0,0 +1,85 @@
+ import pathlib
+ import typing
+
+ from zoopipe.input_adapter.base import BaseInputAdapter
+ from zoopipe.zoopipe_rust_core import ParquetReader
+
+
+ class ParquetInputAdapter(BaseInputAdapter):
+     """
+     Reads records from Apache Parquet files.
+
+     Utilizes the Arrow ecosystem for efficient columnar data reading and
+     multi-threaded loading.
+     """
+
+     def __init__(
+         self,
+         source: typing.Union[str, pathlib.Path],
+         generate_ids: bool = True,
+         batch_size: int = 1024,
+         limit: int | None = None,
+         offset: int = 0,
+         row_groups: typing.List[int] | None = None,
+     ):
+         """
+         Initialize the ParquetInputAdapter.
+
+         Args:
+             source: Path to the Parquet file.
+             generate_ids: Whether to generate unique IDs for each record.
+             batch_size: Number of records to read at once from the file.
+             limit: Maximum number of rows to read.
+             offset: Number of rows to skip.
+         """
+         self.source_path = str(source)
+         self.generate_ids = generate_ids
+         self.batch_size = batch_size
+         self.limit = limit
+         self.offset = offset
+         self.row_groups = row_groups
+
+     def split(self, workers: int) -> typing.List["ParquetInputAdapter"]:
+         """
+         Split the Parquet input into `workers` shards based on Row Groups.
+         """
+         row_group_rows = ParquetReader.get_row_groups_info(self.source_path)
+         num_groups = len(row_group_rows)
+
+         if num_groups < workers:
+             workers = num_groups
+
+         if workers <= 1:
+             return [self]
+
+         # Distribute row groups among workers
+         groups_per_worker = num_groups // workers
+         shards = []
+         for i in range(workers):
+             start_idx = i * groups_per_worker
+             end_idx = (i + 1) * groups_per_worker if i < workers - 1 else num_groups
+
+             assigned_groups = list(range(start_idx, end_idx))
+
+             shards.append(
+                 self.__class__(
+                     source=self.source_path,
+                     generate_ids=self.generate_ids,
+                     batch_size=self.batch_size,
+                     row_groups=assigned_groups,
+                 )
+             )
+         return shards
+
+     def get_native_reader(self) -> ParquetReader:
+         return ParquetReader(
+             self.source_path,
+             generate_ids=self.generate_ids,
+             batch_size=self.batch_size,
+             limit=self.limit,
+             offset=self.offset,
+             row_groups=self.row_groups,
+         )
+
+
+ __all__ = ["ParquetInputAdapter"]
zoopipe/input_adapter/pygen.py ADDED
@@ -0,0 +1,37 @@
+ import typing
+
+ from zoopipe.input_adapter.base import BaseInputAdapter
+ from zoopipe.zoopipe_rust_core import PyGeneratorReader
+
+
+ class PyGeneratorInputAdapter(BaseInputAdapter):
+     """
+     Bridges Python iterables and generators into the pipeline.
+
+     Allows using any custom Python logic or in-memory data as a source
+     for the pipeline.
+     """
+
+     def __init__(
+         self,
+         iterable: typing.Iterable[typing.Any],
+         generate_ids: bool = True,
+     ):
+         """
+         Initialize the PyGeneratorInputAdapter.
+
+         Args:
+             iterable: Any Python iterable or generator yielding dictionaries.
+             generate_ids: Whether to generate unique IDs for each record.
+         """
+         self.iterable = iterable
+         self.generate_ids = generate_ids
+
+     def get_native_reader(self) -> PyGeneratorReader:
+         return PyGeneratorReader(
+             self.iterable,
+             generate_ids=self.generate_ids,
+         )
+
+
+ __all__ = ["PyGeneratorInputAdapter"]
zoopipe/input_adapter/sql.py ADDED
@@ -0,0 +1,103 @@
+ import typing
+
+ from zoopipe.input_adapter.base import BaseInputAdapter
+ from zoopipe.zoopipe_rust_core import SQLReader
+
+
+ class SQLInputAdapter(BaseInputAdapter):
+     """
+     Streams records from SQL databases using standard queries.
+
+     Supports any database compatible with SQLAlchemy URIs. It executes a
+     provided query or fetches a whole table using a native Rust executor
+     for optimal performance.
+     """
+
+     def __init__(
+         self,
+         uri: str,
+         query: str | None = None,
+         table_name: str | None = None,
+         generate_ids: bool = True,
+     ):
+         """
+         Initialize the SQLInputAdapter.
+
+         Args:
+             uri: Database URI (e.g., 'sqlite:///data.db').
+             query: SQL query to execute.
+             table_name: Alternatively, the name of a table to read (equivalent to SELECT * FROM table_name).
+             generate_ids: Whether to generate unique IDs for each record.
+         """
+         self.uri = uri
+         self.generate_ids = generate_ids
+
+         if query is None and table_name is None:
+             raise ValueError("Either query or table_name must be provided")
+
+         if query is not None and table_name is not None:
+             raise ValueError("Only one of query or table_name should be provided")
+
+         if query is not None:
+             self.query = query
+         else:
+             self.query = f"SELECT * FROM {table_name}"
+
+     def get_native_reader(self) -> SQLReader:
+         return SQLReader(
+             self.uri,
+             self.query,
+             generate_ids=self.generate_ids,
+         )
+
+
+ class SQLPaginationInputAdapter(SQLInputAdapter):
+     """
+     Input adapter for SQL databases using anchor-based pagination.
+
+     This adapter generates ID ranges (anchors) and utilizes SQLExpansionHook
+     to fetch full records in chunks, which is more efficient for very large tables.
+     """
+
+     def __init__(
+         self,
+         uri: str,
+         table_name: str,
+         id_column: str,
+         chunk_size: int,
+         connection_factory: typing.Callable[[], typing.Any],
+     ):
+         """
+         Initialize the SQLPaginationInputAdapter.
+
+         Args:
+             uri: Database URI.
+             table_name: Name of the table to read.
+             id_column: Primary key or indexed column used for pagination.
+             chunk_size: Number of records to fetch per chunk.
+             connection_factory: Callable that returns a database connection
+                 for the hook.
+         """
+         self.table_name = table_name
+         self.id_column = id_column
+         self.chunk_size = chunk_size
+         self.connection_factory = connection_factory
+
+         query = f"""
+         WITH RECURSIVE ranges(n) AS (
+             SELECT MIN({id_column}) FROM {table_name}
+             UNION ALL
+             SELECT n + {chunk_size} FROM ranges
+             WHERE n + {chunk_size} <= (SELECT MAX({id_column}) FROM {table_name})
+         )
+         SELECT n as min_id, n + {chunk_size} - 1 as max_id FROM ranges
+         """
+         super().__init__(uri, query=query)
+
+     def get_hooks(self):
+         from zoopipe.hooks.sql import SQLExpansionHook
+
+         return [SQLExpansionHook(self.connection_factory, self.table_name)]
+
+
+ __all__ = ["SQLInputAdapter", "SQLPaginationInputAdapter"]
zoopipe/manager.py ADDED
@@ -0,0 +1,211 @@
+ from __future__ import annotations
+
+ import os
+ import shutil
+ from typing import TYPE_CHECKING, Any
+
+ from zoopipe.engines import MultiProcessEngine
+ from zoopipe.engines.local import PipeReport
+ from zoopipe.zoopipe_rust_core import MultiThreadExecutor, SingleThreadExecutor
+
+ if TYPE_CHECKING:
+     from zoopipe.engines.base import BaseEngine
+     from zoopipe.pipe import Pipe
+     from zoopipe.report import FlowReport
+
+
+ class PipeManager:
+     """
+     Manages one or more Pipes using an execution Engine.
+
+     PipeManager acts as the high-level orchestrator. It handles the sharding
+     of data sources across multiple workers and coordinates their execution
+     through a pluggable Engine (e.g., Local Multiprocessing, Ray, Dask).
+     """
+
+     def __init__(self, pipes: list[Pipe], engine: BaseEngine | None = None):
+         """
+         Initialize PipeManager with a list of Pipe instances.
+
+         Args:
+             pipes: List of Pipe objects to manage.
+             engine: Optional execution engine. Defaults to MultiProcessEngine.
+         """
+         self.pipes = pipes
+         self.engine = engine or MultiProcessEngine()
+         self._merge_info: dict[str, Any] = {}
+         self.should_merge = False
+
+     @property
+     def is_running(self) -> bool:
+         """Check if the execution is currently running."""
+         return self.engine.is_running
+
+     @property
+     def pipe_count(self) -> int:
+         """Get the number of pipes being managed."""
+         return len(self.pipes)
+
+     def start(self) -> None:
+         """
+         Start all managed pipes using the configured engine.
+         """
+         self.engine.start(self.pipes)
+
+     def wait(self, timeout: float | None = None) -> bool:
+         """
+         Wait for execution to finish.
+
+         Args:
+             timeout: Optional maximum time to wait.
+         Returns:
+             True if execution finished.
+         """
+         return self.engine.wait(timeout)
+
+     def shutdown(self, timeout: float = 5.0) -> None:
+         """
+         Forcibly stop all running pipes.
+
+         Args:
+             timeout: Maximum time to wait for termination.
+         """
+         self.engine.shutdown(timeout)
+
+     @property
+     def report(self) -> FlowReport:
+         """Get an aggregated report of all running pipes."""
+         return self.engine.report
+
+     def get_pipe_report(self, index: int) -> PipeReport:
+         """
+         Get the current report for a specific pipe.
+
+         Args:
+             index: The index of the pipe in the original list.
+         """
+         if hasattr(self.engine, "get_pipe_report"):
+             return self.engine.get_pipe_report(index)
+         raise AttributeError(
+             f"Engine {self.engine.__class__.__name__} does not support per-pipe reports"
+         )
+
+     @property
+     def pipe_reports(self) -> list[PipeReport]:
+         """Get reports for all managed pipes."""
+         if hasattr(self.engine, "pipe_reports"):
+             return self.engine.pipe_reports
+         # Fallback if the engine doesn't have the property but has the method
+         if hasattr(self.engine, "get_pipe_report"):
+             return [self.engine.get_pipe_report(i) for i in range(self.pipe_count)]
+         raise AttributeError(
+             f"Engine {self.engine.__class__.__name__} does not support per-pipe reports"
+         )
+
+     def __enter__(self) -> PipeManager:
+         self.start()
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+         if self.is_running:
+             self.shutdown()
+
+     @classmethod
+     def parallelize_pipe(
+         cls,
+         pipe: Pipe,
+         workers: int,
+         should_merge: bool = False,
+         executor: SingleThreadExecutor | MultiThreadExecutor | None = None,
+         engine: BaseEngine | None = None,
+     ) -> PipeManager:
+         """
+         Create a PipeManager that runs the given pipe in parallel across
+         `workers` shards.
+
+         Automatically splits the input and output adapters to ensure safe
+         parallel execution.
+
+         Args:
+             pipe: The source pipe to parallelize.
+             workers: Number of shards to use.
+             should_merge: Whether to merge the output shards automatically.
+             executor: Internal batch executor for each shard.
+             engine: Optional execution engine.
+
+         Returns:
+             A configured PipeManager instance.
+         """
+         if not pipe.input_adapter.can_split or not pipe.output_adapter.can_split:
+             workers = 1
+
+         input_shards = pipe.input_adapter.split(workers)
+         output_shards = pipe.output_adapter.split(workers)
+
+         if len(input_shards) != workers or len(output_shards) != workers:
+             raise ValueError(
+                 f"Adapters failed to split into {workers} shards. "
+                 f"Got {len(input_shards)} inputs and {len(output_shards)} outputs."
+             )
+
+         exec_strategy = executor or pipe.executor
+
+         pipes = []
+         for i in range(workers):
+             sharded_pipe = type(pipe)(
+                 input_adapter=input_shards[i],
+                 output_adapter=output_shards[i],
+                 schema_model=pipe.schema_model,
+                 pre_validation_hooks=pipe.pre_validation_hooks,
+                 post_validation_hooks=pipe.post_validation_hooks,
+                 report_update_interval=pipe.report_update_interval,
+                 executor=exec_strategy,
+             )
+             pipes.append(sharded_pipe)
+
+         manager = cls(pipes, engine=engine)
+         manager.should_merge = should_merge
+         manager._merge_info = {
+             "target": getattr(pipe.output_adapter, "output_path", None),
+             "sources": [getattr(shard, "output_path", None) for shard in output_shards],
+         }
+         return manager
+
+     def merge(self) -> None:
+         """
+         Merge the output files from all pipes into the final destination.
+         """
+         if not self._should_merge():
+             return
+
+         target = self._merge_info["target"]
+         sources = [s for s in self._merge_info["sources"] if s and os.path.exists(s)]
+
+         with open(target, "wb") as dest:
+             for src_path in sources:
+                 with open(src_path, "rb") as src:
+                     self._append_file(dest, src)
+
+     def _should_merge(self) -> bool:
+         if not self.should_merge or not self._merge_info.get("target"):
+             return False
+         sources = [s for s in self._merge_info.get("sources", []) if s]
+         return len(sources) > 1
+
+     def _append_file(self, dest, src) -> None:
+         """Append file content using zero-copy where available."""
+         try:
+             offset, size = 0, os.fstat(src.fileno()).st_size
+             while offset < size:
+                 sent = os.sendfile(dest.fileno(), src.fileno(), offset, size - offset)
+                 if sent == 0:
+                     break
+                 offset += sent
+         except (OSError, AttributeError):
+             src.seek(0)
+             shutil.copyfileobj(src, dest)
+
+     def __repr__(self) -> str:
+         status = "running" if self.is_running else "stopped"
+         return f"<PipeManager pipes={self.pipe_count} status={status} " \
+             f"engine={self.engine.__class__.__name__}>"
zoopipe/output_adapter/__init__.py ADDED
@@ -0,0 +1,23 @@
+ from zoopipe.output_adapter.arrow import ArrowOutputAdapter
+ from zoopipe.output_adapter.base import BaseOutputAdapter
+ from zoopipe.output_adapter.csv import CSVOutputAdapter
+ from zoopipe.output_adapter.duckdb import DuckDBOutputAdapter
+ from zoopipe.output_adapter.excel import ExcelOutputAdapter
+ from zoopipe.output_adapter.json import JSONOutputAdapter
+ from zoopipe.output_adapter.kafka import KafkaOutputAdapter
+ from zoopipe.output_adapter.parquet import ParquetOutputAdapter
+ from zoopipe.output_adapter.pygen import PyGeneratorOutputAdapter
+ from zoopipe.output_adapter.sql import SQLOutputAdapter
+
+ __all__ = [
+     "BaseOutputAdapter",
+     "CSVOutputAdapter",
+     "JSONOutputAdapter",
+     "DuckDBOutputAdapter",
+     "ArrowOutputAdapter",
+     "ExcelOutputAdapter",
+     "SQLOutputAdapter",
+     "ParquetOutputAdapter",
+     "PyGeneratorOutputAdapter",
+     "KafkaOutputAdapter",
+ ]