zoopipe-2026.1.20-cp310-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zoopipe/__init__.py +72 -0
- zoopipe/engines/__init__.py +4 -0
- zoopipe/engines/base.py +45 -0
- zoopipe/engines/dask.py +225 -0
- zoopipe/engines/local.py +215 -0
- zoopipe/engines/ray.py +252 -0
- zoopipe/hooks/__init__.py +4 -0
- zoopipe/hooks/base.py +70 -0
- zoopipe/hooks/sql.py +94 -0
- zoopipe/input_adapter/__init__.py +24 -0
- zoopipe/input_adapter/arrow.py +38 -0
- zoopipe/input_adapter/base.py +48 -0
- zoopipe/input_adapter/csv.py +144 -0
- zoopipe/input_adapter/duckdb.py +54 -0
- zoopipe/input_adapter/excel.py +51 -0
- zoopipe/input_adapter/json.py +73 -0
- zoopipe/input_adapter/kafka.py +39 -0
- zoopipe/input_adapter/parquet.py +85 -0
- zoopipe/input_adapter/pygen.py +37 -0
- zoopipe/input_adapter/sql.py +103 -0
- zoopipe/manager.py +211 -0
- zoopipe/output_adapter/__init__.py +23 -0
- zoopipe/output_adapter/arrow.py +50 -0
- zoopipe/output_adapter/base.py +41 -0
- zoopipe/output_adapter/csv.py +71 -0
- zoopipe/output_adapter/duckdb.py +46 -0
- zoopipe/output_adapter/excel.py +42 -0
- zoopipe/output_adapter/json.py +66 -0
- zoopipe/output_adapter/kafka.py +39 -0
- zoopipe/output_adapter/parquet.py +49 -0
- zoopipe/output_adapter/pygen.py +29 -0
- zoopipe/output_adapter/sql.py +43 -0
- zoopipe/pipe.py +263 -0
- zoopipe/protocols.py +37 -0
- zoopipe/py.typed +0 -0
- zoopipe/report.py +173 -0
- zoopipe/utils/__init__.py +0 -0
- zoopipe/utils/dependency.py +78 -0
- zoopipe/zoopipe_rust_core.abi3.so +0 -0
- zoopipe-2026.1.20.dist-info/METADATA +231 -0
- zoopipe-2026.1.20.dist-info/RECORD +43 -0
- zoopipe-2026.1.20.dist-info/WHEEL +4 -0
- zoopipe-2026.1.20.dist-info/licenses/LICENSE +21 -0

zoopipe/input_adapter/duckdb.py
ADDED

@@ -0,0 +1,54 @@
+import pathlib
+import typing
+
+from zoopipe.input_adapter.base import BaseInputAdapter
+from zoopipe.zoopipe_rust_core import DuckDBReader
+
+
+class DuckDBInputAdapter(BaseInputAdapter):
+    """
+    Executes SQL queries against DuckDB database files.
+
+    Directly interfaces with DuckDB to stream query results, enabling
+    efficient processing of large datasets stored in analytical databases.
+    """
+
+    def __init__(
+        self,
+        source: typing.Union[str, pathlib.Path],
+        query: str | None = None,
+        table_name: str | None = None,
+        generate_ids: bool = True,
+    ):
+        """
+        Initialize the DuckDBInputAdapter.
+
+        Args:
+            source: Path to the DuckDB database file.
+            query: SQL query to execute.
+            table_name: Or name of the table to read (equiv to SELECT * FROM table).
+            generate_ids: Whether to generate unique IDs for each record.
+        """
+        self.source_path = str(source)
+        self.generate_ids = generate_ids
+
+        if query is None and table_name is None:
+            raise ValueError("Either query or table_name must be provided")
+
+        if query is not None and table_name is not None:
+            raise ValueError("Only one of query or table_name should be provided")
+
+        if query is not None:
+            self.query = query
+        else:
+            self.query = f"SELECT * FROM {table_name}"
+
+    def get_native_reader(self) -> DuckDBReader:
+        return DuckDBReader(
+            self.source_path,
+            self.query,
+            generate_ids=self.generate_ids,
+        )
+
+
+__all__ = ["DuckDBInputAdapter"]
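
For reference, a minimal usage sketch of the adapter above (not part of the package diff; the database path and table name are placeholders):

from zoopipe.input_adapter.duckdb import DuckDBInputAdapter

# Pass either a full SQL query or a table name (but not both).
adapter = DuckDBInputAdapter("analytics.duckdb", table_name="events")
reader = adapter.get_native_reader()  # native DuckDBReader consumed by the pipeline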

zoopipe/input_adapter/excel.py
ADDED

@@ -0,0 +1,51 @@
+import pathlib
+import typing
+
+from zoopipe.input_adapter.base import BaseInputAdapter
+from zoopipe.zoopipe_rust_core import ExcelReader
+
+
+class ExcelInputAdapter(BaseInputAdapter):
+    """
+    Reads Excel files (.xlsx, .xls, .ods, .xlsb) using the Calamine engine.
+
+    Provides high-performance, memory-efficient parsing of various spreadsheet
+    formats directly from Rust. Supports sheet selection by name or index,
+    skipping header rows, and custom field mapping.
+    """
+
+    def __init__(
+        self,
+        source: typing.Union[str, pathlib.Path],
+        sheet: typing.Union[str, int, None] = None,
+        skip_rows: int = 0,
+        fieldnames: typing.Optional[typing.List[str]] = None,
+        generate_ids: bool = True,
+    ):
+        """
+        Initialize the ExcelInputAdapter.
+
+        Args:
+            source: Path to the Excel file.
+            sheet: Sheet name (str) or index (int) to read. Defaults to the first sheet.
+            skip_rows: Number of rows to skip at the beginning.
+            fieldnames: Optional list of column names.
+            generate_ids: Whether to generate unique IDs for each record.
+        """
+        self.source_path = str(source)
+        self.sheet = sheet
+        self.skip_rows = skip_rows
+        self.fieldnames = fieldnames
+        self.generate_ids = generate_ids
+
+    def get_native_reader(self) -> ExcelReader:
+        return ExcelReader(
+            self.source_path,
+            sheet=self.sheet,
+            skip_rows=self.skip_rows,
+            fieldnames=self.fieldnames,
+            generate_ids=self.generate_ids,
+        )
+
+
+__all__ = ["ExcelInputAdapter"]
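
A minimal usage sketch (not part of the diff; the file name, sheet index, and column names are placeholders):

from zoopipe.input_adapter.excel import ExcelInputAdapter

# Read the second sheet, skip one header row, and name the columns explicitly.
adapter = ExcelInputAdapter("report.xlsx", sheet=1, skip_rows=1, fieldnames=["date", "amount"])
reader = adapter.get_native_reader()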

zoopipe/input_adapter/json.py
ADDED

@@ -0,0 +1,73 @@
+import pathlib
+import typing
+
+from zoopipe.input_adapter.base import BaseInputAdapter
+from zoopipe.zoopipe_rust_core import JSONReader
+
+
+class JSONInputAdapter(BaseInputAdapter):
+    """
+    Reads data from JSON or JSONLines (.jsonl) files.
+
+    It supports both standard JSON arrays and line-delimited records.
+    The adapter uses a fast Rust-based parser that streams data efficiently,
+    making it suitable for very large datasets.
+    """
+
+    def __init__(
+        self,
+        source: typing.Union[str, pathlib.Path],
+        start_byte: int = 0,
+        end_byte: int | None = None,
+    ):
+        """
+        Initialize the JSONInputAdapter.
+
+        Args:
+            source: Path to the JSONLines file.
+            start_byte: Byte offset to start reading from.
+            end_byte: Byte offset to stop reading at.
+        """
+        self.source_path = str(source)
+        self.start_byte = start_byte
+        self.end_byte = end_byte
+
+    @property
+    def can_split(self) -> bool:
+        """Only allow splitting for JSONLines/NDJSON formats."""
+        path = self.source_path.lower()
+        return path.endswith(".jsonl") or path.endswith(".ndjson")
+
+    def split(self, workers: int) -> typing.List["JSONInputAdapter"]:
+        """
+        Split the JSON input into `workers` byte-range shards.
+        """
+        from zoopipe.zoopipe_rust_core import get_file_size
+
+        file_size = get_file_size(self.source_path)
+
+        chunk_size = file_size // workers
+        shards = []
+        for i in range(workers):
+            start = i * chunk_size
+            # Last worker takes rest of file
+            end = (i + 1) * chunk_size if i < workers - 1 else None
+
+            shards.append(
+                self.__class__(
+                    source=self.source_path,
+                    start_byte=start,
+                    end_byte=end,
+                )
+            )
+        return shards
+
+    def get_native_reader(self) -> JSONReader:
+        return JSONReader(
+            self.source_path,
+            start_byte=self.start_byte,
+            end_byte=self.end_byte,
+        )
+
+
+__all__ = ["JSONInputAdapter"]
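
A minimal usage sketch showing byte-range sharding (not part of the diff; the file path is a placeholder):

from zoopipe.input_adapter.json import JSONInputAdapter

adapter = JSONInputAdapter("events.ndjson")
if adapter.can_split:                  # True only for .jsonl / .ndjson files
    shards = adapter.split(workers=4)  # four byte-range shards over the same file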

zoopipe/input_adapter/kafka.py
ADDED

@@ -0,0 +1,39 @@
+from zoopipe.input_adapter.base import BaseInputAdapter
+from zoopipe.zoopipe_rust_core import KafkaReader
+
+
+class KafkaInputAdapter(BaseInputAdapter):
+    """
+    Consumes messages from Apache Kafka topics.
+
+    Acts as a Kafka consumer, streaming messages into the pipeline with
+    support for consumer groups and offset management.
+    """
+
+    def __init__(
+        self,
+        uri: str,
+        group_id: str | None = None,
+        generate_ids: bool = True,
+    ):
+        """
+        Kafka Input Adapter.
+
+        Args:
+            uri: Kafka URI (e.g., 'kafka://localhost:9092/topic')
+            group_id: Optional consumer group ID.
+            generate_ids: Whether to generate unique IDs for each message.
+        """
+        self.uri = uri
+        self.group_id = group_id
+        self.generate_ids = generate_ids
+
+    def get_native_reader(self) -> KafkaReader:
+        return KafkaReader(
+            self.uri,
+            group_id=self.group_id,
+            generate_ids=self.generate_ids,
+        )
+
+
+__all__ = ["KafkaInputAdapter"]
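
A minimal usage sketch (not part of the diff; the broker address, topic, and group ID are placeholders, following the URI format from the docstring above):

from zoopipe.input_adapter.kafka import KafkaInputAdapter

adapter = KafkaInputAdapter("kafka://localhost:9092/clickstream", group_id="zoopipe-etl")
reader = adapter.get_native_reader()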

zoopipe/input_adapter/parquet.py
ADDED

@@ -0,0 +1,85 @@
+import pathlib
+import typing
+
+from zoopipe.input_adapter.base import BaseInputAdapter
+from zoopipe.zoopipe_rust_core import ParquetReader
+
+
+class ParquetInputAdapter(BaseInputAdapter):
+    """
+    Reads records from Apache Parquet files.
+
+    Utilizes the Arrow ecosystem for efficient columnar data reading and
+    multi-threaded loading.
+    """
+
+    def __init__(
+        self,
+        source: typing.Union[str, pathlib.Path],
+        generate_ids: bool = True,
+        batch_size: int = 1024,
+        limit: int | None = None,
+        offset: int = 0,
+        row_groups: typing.List[int] | None = None,
+    ):
+        """
+        Initialize the ParquetInputAdapter.
+
+        Args:
+            source: Path to the Parquet file.
+            generate_ids: Whether to generate unique IDs for each record.
+            batch_size: Number of records to read at once from the file.
+            limit: Maximum number of rows to read.
+            offset: Number of rows to skip.
+        """
+        self.source_path = str(source)
+        self.generate_ids = generate_ids
+        self.batch_size = batch_size
+        self.limit = limit
+        self.offset = offset
+        self.row_groups = row_groups
+
+    def split(self, workers: int) -> typing.List["ParquetInputAdapter"]:
+        """
+        Split the Parquet input into `workers` shards based on Row Groups.
+        """
+        row_group_rows = ParquetReader.get_row_groups_info(self.source_path)
+        num_groups = len(row_group_rows)
+
+        if num_groups < workers:
+            workers = num_groups
+
+        if workers <= 1:
+            return [self]
+
+        # Distribute row groups among workers
+        groups_per_worker = num_groups // workers
+        shards = []
+        for i in range(workers):
+            start_idx = i * groups_per_worker
+            end_idx = (i + 1) * groups_per_worker if i < workers - 1 else num_groups
+
+            assigned_groups = list(range(start_idx, end_idx))
+
+            shards.append(
+                self.__class__(
+                    source=self.source_path,
+                    generate_ids=self.generate_ids,
+                    batch_size=self.batch_size,
+                    row_groups=assigned_groups,
+                )
+            )
+        return shards
+
+    def get_native_reader(self) -> ParquetReader:
+        return ParquetReader(
+            self.source_path,
+            generate_ids=self.generate_ids,
+            batch_size=self.batch_size,
+            limit=self.limit,
+            offset=self.offset,
+            row_groups=self.row_groups,
+        )
+
+
+__all__ = ["ParquetInputAdapter"]
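
A minimal usage sketch showing row-group sharding (not part of the diff; the file path is a placeholder):

from zoopipe.input_adapter.parquet import ParquetInputAdapter

adapter = ParquetInputAdapter("data.parquet", batch_size=4096)
shards = adapter.split(workers=8)  # capped at one shard per row group
reader = shards[0].get_native_reader()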

zoopipe/input_adapter/pygen.py
ADDED

@@ -0,0 +1,37 @@
+import typing
+
+from zoopipe.input_adapter.base import BaseInputAdapter
+from zoopipe.zoopipe_rust_core import PyGeneratorReader
+
+
+class PyGeneratorInputAdapter(BaseInputAdapter):
+    """
+    Bridges Python iterables and generators into the pipeline.
+
+    Allows using any custom Python logic or in-memory data as a source
+    for the pipeline.
+    """
+
+    def __init__(
+        self,
+        iterable: typing.Iterable[typing.Any],
+        generate_ids: bool = True,
+    ):
+        """
+        Initialize the PyGeneratorInputAdapter.
+
+        Args:
+            iterable: Any Python iterable or generator yielding dictionaries.
+            generate_ids: Whether to generate unique IDs for each record.
+        """
+        self.iterable = iterable
+        self.generate_ids = generate_ids
+
+    def get_native_reader(self) -> PyGeneratorReader:
+        return PyGeneratorReader(
+            self.iterable,
+            generate_ids=self.generate_ids,
+        )
+
+
+__all__ = ["PyGeneratorInputAdapter"]
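
A minimal usage sketch (not part of the diff; the generator below is a placeholder):

from zoopipe.input_adapter.pygen import PyGeneratorInputAdapter

def rows():
    for i in range(1000):
        yield {"id": i, "value": i * i}

adapter = PyGeneratorInputAdapter(rows())
reader = adapter.get_native_reader()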

zoopipe/input_adapter/sql.py
ADDED

@@ -0,0 +1,103 @@
+import typing
+
+from zoopipe.input_adapter.base import BaseInputAdapter
+from zoopipe.zoopipe_rust_core import SQLReader
+
+
+class SQLInputAdapter(BaseInputAdapter):
+    """
+    Streams records from SQL databases using standard queries.
+
+    Supports any database compatible with SQLAlchemy URIs. It executes a
+    provided query or fetches a whole table using a native Rust executor
+    for optimal performance.
+    """
+
+    def __init__(
+        self,
+        uri: str,
+        query: str | None = None,
+        table_name: str | None = None,
+        generate_ids: bool = True,
+    ):
+        """
+        Initialize the SQLInputAdapter.
+
+        Args:
+            uri: Database URI (e.g., 'sqlite:///data.db').
+            query: SQL query to execute.
+            table_name: Or name of the table to read (equiv to SELECT * FROM table).
+            generate_ids: Whether to generate unique IDs for each record.
+        """
+        self.uri = uri
+        self.generate_ids = generate_ids
+
+        if query is None and table_name is None:
+            raise ValueError("Either query or table_name must be provided")
+
+        if query is not None and table_name is not None:
+            raise ValueError("Only one of query or table_name should be provided")
+
+        if query is not None:
+            self.query = query
+        else:
+            self.query = f"SELECT * FROM {table_name}"
+
+    def get_native_reader(self) -> SQLReader:
+        return SQLReader(
+            self.uri,
+            self.query,
+            generate_ids=self.generate_ids,
+        )
+
+
+class SQLPaginationInputAdapter(SQLInputAdapter):
+    """
+    Input adapter for SQL databases using anchor-based pagination.
+
+    This adapter generates ID ranges (anchors) and utilizes SQLExpansionHook
+    to fetch full records in chunks, which is more efficient for very large tables.
+    """
+
+    def __init__(
+        self,
+        uri: str,
+        table_name: str,
+        id_column: str,
+        chunk_size: int,
+        connection_factory: typing.Callable[[], typing.Any],
+    ):
+        """
+        Initialize the SQLPaginationInputAdapter.
+
+        Args:
+            uri: Database URI.
+            table_name: Name of the table to read.
+            id_column: Primary key or indexed column used for pagination.
+            chunk_size: Number of records to fetch per chunk.
+            connection_factory: Callable that returns a database connection
+                for the hook.
+        """
+        self.table_name = table_name
+        self.id_column = id_column
+        self.chunk_size = chunk_size
+        self.connection_factory = connection_factory
+
+        query = f"""
+        WITH RECURSIVE ranges(n) AS (
+            SELECT MIN({id_column}) FROM {table_name}
+            UNION ALL
+            SELECT n + {chunk_size} FROM ranges
+            WHERE n + {chunk_size} <= (SELECT MAX({id_column}) FROM {table_name})
+        )
+        SELECT n as min_id, n + {chunk_size} - 1 as max_id FROM ranges
+        """
+        super().__init__(uri, query=query)
+
+    def get_hooks(self):
+        from zoopipe.hooks.sql import SQLExpansionHook
+
+        return [SQLExpansionHook(self.connection_factory, self.table_name)]
+
+
+__all__ = ["SQLInputAdapter", "SQLPaginationInputAdapter"]
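
A minimal usage sketch of both adapters above (not part of the diff; the URIs, table and column names, and the sqlite3 connection factory are placeholders):

import sqlite3

from zoopipe.input_adapter.sql import SQLInputAdapter, SQLPaginationInputAdapter

# Plain streaming read of a whole table.
orders = SQLInputAdapter("sqlite:///data.db", table_name="orders")

# Anchor-based pagination: the adapter streams (min_id, max_id) ranges, and the
# SQLExpansionHook returned by get_hooks() expands each range into full rows.
paginated = SQLPaginationInputAdapter(
    "sqlite:///data.db",
    table_name="orders",
    id_column="id",
    chunk_size=10_000,
    connection_factory=lambda: sqlite3.connect("data.db"),
)
hooks = paginated.get_hooks()
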
zoopipe/manager.py
ADDED

@@ -0,0 +1,211 @@
+from __future__ import annotations
+
+import os
+import shutil
+from typing import TYPE_CHECKING, Any
+
+from zoopipe.engines import MultiProcessEngine
+from zoopipe.engines.local import PipeReport
+from zoopipe.zoopipe_rust_core import MultiThreadExecutor, SingleThreadExecutor
+
+if TYPE_CHECKING:
+    from zoopipe.engines.base import BaseEngine
+    from zoopipe.pipe import Pipe
+    from zoopipe.report import FlowReport
+
+
+class PipeManager:
+    """
+    Manages one or more Pipes using an execution Engine.
+
+    PipeManager acts as the high-level orchestrator. It handles the sharding
+    of data sources across multiple workers and coordinates their execution
+    through a pluggable Engine (e.g., Local Multiprocessing, Ray, Dask).
+    """
+
+    def __init__(self, pipes: list[Pipe], engine: BaseEngine | None = None):
+        """
+        Initialize PipeManager with a list of Pipe instances.
+
+        Args:
+            pipes: List of Pipe objects to manage.
+            engine: Optional execution engine. Defaults to MultiProcessEngine.
+        """
+        self.pipes = pipes
+        self.engine = engine or MultiProcessEngine()
+        self._merge_info: dict[str, Any] = {}
+        self.should_merge = False
+
+    @property
+    def is_running(self) -> bool:
+        """Check if the execution is currently running."""
+        return self.engine.is_running
+
+    @property
+    def pipe_count(self) -> int:
+        """Get the number of pipes being managed."""
+        return len(self.pipes)
+
+    def start(self) -> None:
+        """
+        Start all managed pipes using the configured engine.
+        """
+        self.engine.start(self.pipes)
+
+    def wait(self, timeout: float | None = None) -> bool:
+        """
+        Wait for execution to finish.
+
+        Args:
+            timeout: Optional maximum time to wait.
+        Returns:
+            True if execution finished.
+        """
+        return self.engine.wait(timeout)
+
+    def shutdown(self, timeout: float = 5.0) -> None:
+        """
+        Forcibly stop all running pipes.
+
+        Args:
+            timeout: Maximum time to wait for termination.
+        """
+        self.engine.shutdown(timeout)
+
+    @property
+    def report(self) -> FlowReport:
+        """Get an aggregated report of all running pipes."""
+        return self.engine.report
+
+    def get_pipe_report(self, index: int) -> PipeReport:
+        """
+        Get the current report for a specific pipe.
+
+        Args:
+            index: The index of the pipe in the original list.
+        """
+        if hasattr(self.engine, "get_pipe_report"):
+            return self.engine.get_pipe_report(index)
+        raise AttributeError(
+            f"Engine {self.engine.__class__.__name__} does not support per-pipe reports"
+        )
+
+    @property
+    def pipe_reports(self) -> list[PipeReport]:
+        """Get reports for all managed pipes."""
+        if hasattr(self.engine, "pipe_reports"):
+            return self.engine.pipe_reports
+        # Fallback if the engine doesn't have the property but has the method
+        if hasattr(self.engine, "get_pipe_report"):
+            return [self.engine.get_pipe_report(i) for i in range(self.pipe_count)]
+        raise AttributeError(
+            f"Engine {self.engine.__class__.__name__} does not support per-pipe reports"
+        )
+
+    def __enter__(self) -> PipeManager:
+        self.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        if self.is_running:
+            self.shutdown()
+
+    @classmethod
+    def parallelize_pipe(
+        cls,
+        pipe: Pipe,
+        workers: int,
+        should_merge: bool = False,
+        executor: SingleThreadExecutor | MultiThreadExecutor | None = None,
+        engine: BaseEngine | None = None,
+    ) -> PipeManager:
+        """
+        Create a PipeManager that runs the given pipe in parallel across
+        `workers` shards.
+
+        Automatically splits the input and output adapters to ensure safe
+        parallel execution.
+
+        Args:
+            pipe: The source pipe to parallelize.
+            workers: Number of shards to use.
+            should_merge: Whether to merge the output shards automatically.
+            executor: Internal batch executor for each shard.
+            engine: Optional execution engine.
+
+        Returns:
+            A configured PipeManager instance.
+        """
+        if not pipe.input_adapter.can_split or not pipe.output_adapter.can_split:
+            workers = 1
+
+        input_shards = pipe.input_adapter.split(workers)
+        output_shards = pipe.output_adapter.split(workers)
+
+        if len(input_shards) != workers or len(output_shards) != workers:
+            raise ValueError(
+                f"Adapters failed to split into {workers} shards. "
+                f"Got {len(input_shards)} inputs and {len(output_shards)} outputs."
+            )
+
+        exec_strategy = executor or pipe.executor
+
+        pipes = []
+        for i in range(workers):
+            sharded_pipe = type(pipe)(
+                input_adapter=input_shards[i],
+                output_adapter=output_shards[i],
+                schema_model=pipe.schema_model,
+                pre_validation_hooks=pipe.pre_validation_hooks,
+                post_validation_hooks=pipe.post_validation_hooks,
+                report_update_interval=pipe.report_update_interval,
+                executor=exec_strategy,
+            )
+            pipes.append(sharded_pipe)
+
+        manager = cls(pipes, engine=engine)
+        manager.should_merge = should_merge
+        manager._merge_info = {
+            "target": getattr(pipe.output_adapter, "output_path", None),
+            "sources": [getattr(shard, "output_path", None) for shard in output_shards],
+        }
+        return manager
+
+    def merge(self) -> None:
+        """
+        Merge the output files from all pipes into the final destination.
+        """
+        if not self._should_merge():
+            return
+
+        target = self._merge_info["target"]
+        sources = [s for s in self._merge_info["sources"] if s and os.path.exists(s)]
+
+        with open(target, "wb") as dest:
+            for src_path in sources:
+                with open(src_path, "rb") as src:
+                    self._append_file(dest, src)
+
+    def _should_merge(self) -> bool:
+        if not self.should_merge or not self._merge_info.get("target"):
+            return False
+        sources = [s for s in self._merge_info.get("sources", []) if s]
+        return len(sources) > 1
+
+    def _append_file(self, dest, src) -> None:
+        """Append file content using zero-copy where available."""
+        try:
+            offset, size = 0, os.fstat(src.fileno()).st_size
+            while offset < size:
+                sent = os.sendfile(dest.fileno(), src.fileno(), offset, size - offset)
+                if sent == 0:
+                    break
+                offset += sent
+        except (OSError, AttributeError):
+            src.seek(0)
+            shutil.copyfileobj(src, dest)
+
+    def __repr__(self) -> str:
+        status = "running" if self.is_running else "stopped"
+        return f"<PipeManager pipes={self.pipe_count} status={status} " \
+            f"engine={self.engine.__class__.__name__}>"
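
A minimal usage sketch of the orchestration API above (not part of the diff; `my_pipe` stands for an already-configured zoopipe.pipe.Pipe whose input and output adapters support splitting, otherwise workers falls back to 1):

from zoopipe.manager import PipeManager

manager = PipeManager.parallelize_pipe(my_pipe, workers=4, should_merge=True)
with manager:          # __enter__ starts every shard
    manager.wait()     # block until all shards finish
manager.merge()        # concatenate shard outputs into the original target file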

zoopipe/output_adapter/__init__.py
ADDED

@@ -0,0 +1,23 @@
+from zoopipe.output_adapter.arrow import ArrowOutputAdapter
+from zoopipe.output_adapter.base import BaseOutputAdapter
+from zoopipe.output_adapter.csv import CSVOutputAdapter
+from zoopipe.output_adapter.duckdb import DuckDBOutputAdapter
+from zoopipe.output_adapter.excel import ExcelOutputAdapter
+from zoopipe.output_adapter.json import JSONOutputAdapter
+from zoopipe.output_adapter.kafka import KafkaOutputAdapter
+from zoopipe.output_adapter.parquet import ParquetOutputAdapter
+from zoopipe.output_adapter.pygen import PyGeneratorOutputAdapter
+from zoopipe.output_adapter.sql import SQLOutputAdapter
+
+__all__ = [
+    "BaseOutputAdapter",
+    "CSVOutputAdapter",
+    "JSONOutputAdapter",
+    "DuckDBOutputAdapter",
+    "ArrowOutputAdapter",
+    "ExcelOutputAdapter",
+    "SQLOutputAdapter",
+    "ParquetOutputAdapter",
+    "PyGeneratorOutputAdapter",
+    "KafkaOutputAdapter",
+]
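
Since this module re-exports every output adapter, they can be imported from the subpackage namespace directly (a sketch, not part of the diff):

from zoopipe.output_adapter import CSVOutputAdapter, ParquetOutputAdapter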