zoopipe 2026.1.14__cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of zoopipe might be problematic. Click here for more details.

zoopipe/__init__.py ADDED
@@ -0,0 +1,53 @@
1
+ from zoopipe.core import Pipe
2
+ from zoopipe.hooks.base import BaseHook, HookStore
3
+ from zoopipe.input_adapter.arrow import ArrowInputAdapter
4
+ from zoopipe.input_adapter.csv import CSVInputAdapter
5
+ from zoopipe.input_adapter.duckdb import DuckDBInputAdapter
6
+ from zoopipe.input_adapter.json import JSONInputAdapter
7
+ from zoopipe.input_adapter.parquet import ParquetInputAdapter
8
+ from zoopipe.input_adapter.pygen import PyGeneratorInputAdapter
9
+ from zoopipe.input_adapter.sql import SQLInputAdapter
10
+ from zoopipe.output_adapter.arrow import ArrowOutputAdapter
11
+ from zoopipe.output_adapter.csv import CSVOutputAdapter
12
+ from zoopipe.output_adapter.duckdb import DuckDBOutputAdapter
13
+ from zoopipe.output_adapter.json import JSONOutputAdapter
14
+ from zoopipe.output_adapter.parquet import ParquetOutputAdapter
15
+ from zoopipe.output_adapter.pygen import PyGeneratorOutputAdapter
16
+ from zoopipe.output_adapter.sql import SQLOutputAdapter
17
+ from zoopipe.report import (
18
+ EntryStatus,
19
+ EntryTypedDict,
20
+ FlowReport,
21
+ FlowStatus,
22
+ get_logger,
23
+ )
24
+ from zoopipe.zoopipe_rust_core import MultiThreadExecutor, SingleThreadExecutor
25
+
26
+ __all__ = [
27
+ "Pipe",
28
+ "FlowReport",
29
+ "FlowStatus",
30
+ "BaseHook",
31
+ "HookStore",
32
+ "EntryStatus",
33
+ "EntryTypedDict",
34
+ "get_logger",
35
+ "SingleThreadExecutor",
36
+ "MultiThreadExecutor",
37
+ # Input Adapters
38
+ "ArrowInputAdapter",
39
+ "CSVInputAdapter",
40
+ "DuckDBInputAdapter",
41
+ "JSONInputAdapter",
42
+ "PyGeneratorInputAdapter",
43
+ "SQLInputAdapter",
44
+ "ParquetInputAdapter",
45
+ # Output Adapters
46
+ "ArrowOutputAdapter",
47
+ "CSVOutputAdapter",
48
+ "DuckDBOutputAdapter",
49
+ "JSONOutputAdapter",
50
+ "PyGeneratorOutputAdapter",
51
+ "SQLOutputAdapter",
52
+ "ParquetOutputAdapter",
53
+ ]
zoopipe/core.py ADDED
@@ -0,0 +1,137 @@
1
+ import logging
2
+ import threading
3
+ import typing
4
+
5
+ from pydantic import TypeAdapter
6
+
7
+ from zoopipe.report import EntryStatus, FlowReport, get_logger
8
+ from zoopipe.zoopipe_rust_core import (
9
+ MultiThreadExecutor,
10
+ NativePipe,
11
+ SingleThreadExecutor,
12
+ )
13
+
14
+
15
class Pipe:
    """Orchestrates a data flow between adapters.

    A native (Rust) reader feeds batches of entries through Python hooks
    and optional Pydantic validation, then into a native writer. The
    processing loop itself runs in the Rust core (``NativePipe``) on a
    background thread; this class only wires adapters, hooks, reporting,
    and lifecycle together.
    """

    def __init__(
        self,
        input_adapter: typing.Any,
        output_adapter: typing.Any,
        error_output_adapter: typing.Any = None,
        schema_model: typing.Any = None,
        pre_validation_hooks: list[typing.Any] | None = None,
        post_validation_hooks: list[typing.Any] | None = None,
        logger: logging.Logger | None = None,
        report_update_interval: int = 1,
        executor: typing.Any = None,
    ) -> None:
        self.input_adapter = input_adapter
        self.output_adapter = output_adapter
        self.error_output_adapter = error_output_adapter
        self.schema_model = schema_model

        self.pre_validation_hooks = pre_validation_hooks or []
        self.post_validation_hooks = post_validation_hooks or []

        self.logger = logger or get_logger()

        self.report_update_interval = report_update_interval
        # Fix: SingleThreadExecutor is already imported at module level;
        # the previous function-local re-import was redundant shadowing.
        self.executor = executor or SingleThreadExecutor()
        self._report = FlowReport()
        self._thread: threading.Thread | None = None
        # Mutable shared state passed to every hook's setup/execute/teardown.
        self._store: dict[str, typing.Any] = {}
        # TypeAdapter accepts both BaseModel subclasses and plain types.
        self._validator = TypeAdapter(self.schema_model) if self.schema_model else None

    def _process_batch(self, entries: list[dict]) -> list[dict]:
        """Run one batch through pre-hooks, validation, and post-hooks.

        Called from the native pipe for every chunk. Entries that fail
        validation are marked FAILED and keep an error record; they are
        not dropped here (routing happens in the Rust core).
        """
        for hook in self.pre_validation_hooks:
            entries = hook.execute(entries, self._store)

        if self._validator:
            for entry in entries:
                try:
                    processed = self._validator.validate_python(entry["raw_data"])
                    # BaseModel results are dumped to plain dicts; other
                    # validated types are stored as-is.
                    entry["validated_data"] = (
                        processed.model_dump()
                        if hasattr(processed, "model_dump")
                        else processed
                    )
                    entry["status"] = EntryStatus.VALIDATED
                except Exception as e:
                    # Broad catch is deliberate: any validation failure is
                    # recorded on the entry instead of aborting the batch.
                    entry["status"] = EntryStatus.FAILED
                    entry["errors"].append({"msg": str(e), "type": "validation_error"})

        for hook in self.post_validation_hooks:
            entries = hook.execute(entries, self._store)

        return entries

    @property
    def report(self) -> FlowReport:
        """Live progress/status report for this pipe run."""
        return self._report

    def start(self) -> None:
        """Build the native pipe and launch it on a daemon thread.

        Raises:
            RuntimeError: if the pipe is already running.
        """
        if self._thread and self._thread.is_alive():
            raise RuntimeError("Pipe is already running")

        reader = self.input_adapter.get_native_reader()
        writer = self.output_adapter.get_native_writer()
        error_writer = None
        if self.error_output_adapter:
            error_writer = self.error_output_adapter.get_native_writer()

        native_pipe = NativePipe(
            reader=reader,
            writer=writer,
            error_writer=error_writer,
            batch_processor=self._process_batch,
            report=self._report,
            report_update_interval=self.report_update_interval,
            executor=self.executor,
        )

        # Daemon thread: an exiting interpreter is not blocked by a
        # still-running pipe.
        self._thread = threading.Thread(
            target=self._run_native,
            args=(native_pipe,),
            daemon=True,
        )
        self._thread.start()

    def _run_native(self, native_pipe: NativePipe) -> None:
        """Thread target: hook setup, native run, guaranteed teardown."""
        try:
            for hook in self.pre_validation_hooks:
                hook.setup(self._store)
            for hook in self.post_validation_hooks:
                hook.setup(self._store)

            native_pipe.run()
        except Exception as e:
            self.logger.error(f"Pipeline execution failed: {e}")
            self._report._mark_failed(e)
            raise
        finally:
            # Teardown always runs, even after a failed setup/run.
            for hook in self.pre_validation_hooks:
                hook.teardown(self._store)
            for hook in self.post_validation_hooks:
                hook.teardown(self._store)

    def shutdown(self) -> None:
        """Request the flow to stop by marking the report aborted."""
        self._report.abort()

    def wait(self, timeout: float | None = None) -> bool:
        """Block until the flow finishes; returns False on timeout."""
        return self._report.wait(timeout)

    def __enter__(self) -> "Pipe":
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        # Only abort if the run has not already reached a terminal state.
        if not self._report.is_finished:
            self.shutdown()

    def __repr__(self) -> str:
        return f"<Pipe input={self.input_adapter} output={self.output_adapter}>"
135
+
136
+
137
+ __all__ = ["Pipe", "SingleThreadExecutor", "MultiThreadExecutor"]
@@ -0,0 +1,3 @@
1
+ from zoopipe.hooks.base import BaseHook, HookPriority, HookStore
2
+
3
+ __all__ = ["BaseHook", "HookStore", "HookPriority"]
zoopipe/hooks/base.py ADDED
@@ -0,0 +1,29 @@
1
+ import typing
2
+
3
+ from zoopipe.report import EntryTypedDict
4
+
5
+ HookStore = dict[str, typing.Any]
6
+
7
+
8
class HookPriority:
    """Symbolic priority constants for hooks.

    Values are spaced by 25 so custom priorities can slot in between.
    NOTE(review): ordering semantics (presumably lower value = earlier
    execution) are not enforced in the visible code — confirm against
    the executor before relying on it.
    """

    VERY_HIGH = 0
    HIGH = 25
    NORMAL = 50
    LOW = 75
    VERY_LOW = 100
14
+
15
+
16
class BaseHook:
    """Base class for pipeline hooks.

    The pipe calls ``setup`` once before the run, ``execute`` for each
    batch of entries, and ``teardown`` once after the run (even on
    failure). Hooks share mutable state through the ``store`` dict.
    """

    def __init__(self, priority: int = HookPriority.NORMAL):
        # Relative ordering hint; see HookPriority.
        self.priority = priority

    def setup(self, store: HookStore) -> None:
        """One-time initialization before processing; default is a no-op."""
        pass

    def execute(
        self, entries: list[EntryTypedDict], store: HookStore
    ) -> list[EntryTypedDict]:
        """Transform and return a batch; default passes entries through unchanged."""
        return entries

    def teardown(self, store: HookStore) -> None:
        """One-time cleanup after processing; default is a no-op."""
        pass
@@ -0,0 +1,19 @@
1
+ from zoopipe.input_adapter.arrow import ArrowInputAdapter
2
+ from zoopipe.input_adapter.base import BaseInputAdapter
3
+ from zoopipe.input_adapter.csv import CSVInputAdapter
4
+ from zoopipe.input_adapter.duckdb import DuckDBInputAdapter
5
+ from zoopipe.input_adapter.json import JSONInputAdapter
6
+ from zoopipe.input_adapter.parquet import ParquetInputAdapter
7
+ from zoopipe.input_adapter.pygen import PyGeneratorInputAdapter
8
+ from zoopipe.input_adapter.sql import SQLInputAdapter
9
+
10
+ __all__ = [
11
+ "BaseInputAdapter",
12
+ "CSVInputAdapter",
13
+ "JSONInputAdapter",
14
+ "DuckDBInputAdapter",
15
+ "ArrowInputAdapter",
16
+ "SQLInputAdapter",
17
+ "ParquetInputAdapter",
18
+ "PyGeneratorInputAdapter",
19
+ ]
@@ -0,0 +1,24 @@
1
+ import pathlib
2
+ import typing
3
+
4
+ from zoopipe.input_adapter.base import BaseInputAdapter
5
+ from zoopipe.zoopipe_rust_core import ArrowReader
6
+
7
+
8
class ArrowInputAdapter(BaseInputAdapter):
    """Reads entries from an Arrow IPC file via the native ArrowReader."""

    def __init__(
        self,
        source: typing.Union[str, pathlib.Path],
        generate_ids: bool = True,
    ):
        # Normalized to str: the Rust reader takes a plain path string.
        self.source_path = str(source)
        self.generate_ids = generate_ids

    def get_native_reader(self) -> ArrowReader:
        """Return the native reader for this source."""
        return ArrowReader(
            self.source_path,
            generate_ids=self.generate_ids,
        )
22
+
23
+
24
+ __all__ = ["ArrowInputAdapter"]
@@ -0,0 +1,8 @@
1
+ import abc
2
+ import typing
3
+
4
+
5
class BaseInputAdapter(abc.ABC):
    """Abstract base for input adapters.

    Concrete subclasses wrap one of the native (Rust) reader types and
    return it from ``get_native_reader``.
    """

    @abc.abstractmethod
    def get_native_reader(self) -> typing.Any:
        """Build and return the native reader backing this adapter."""
        raise NotImplementedError
@@ -0,0 +1,36 @@
1
+ import pathlib
2
+ import typing
3
+
4
+ from zoopipe.input_adapter.base import BaseInputAdapter
5
+ from zoopipe.zoopipe_rust_core import CSVReader
6
+
7
+
8
class CSVInputAdapter(BaseInputAdapter):
    """Reads CSV rows through the native Rust CSVReader.

    ``delimiter`` and ``quotechar`` must each be a single character; they
    are converted to byte codes for the native reader.
    """

    def __init__(
        self,
        source: typing.Union[str, pathlib.Path],
        delimiter: str = ",",
        quotechar: str = '"',
        skip_rows: int = 0,
        fieldnames: list[str] | None = None,
        generate_ids: bool = True,
    ):
        self.source_path = str(source)
        self.delimiter = delimiter
        self.quotechar = quotechar
        self.skip_rows = skip_rows
        self.fieldnames = fieldnames
        self.generate_ids = generate_ids

    def get_native_reader(self) -> CSVReader:
        """Build the native reader for this CSV source."""
        # The Rust core expects single-byte character codes, not strings.
        delimiter_code = ord(self.delimiter)
        quote_code = ord(self.quotechar)
        return CSVReader(
            self.source_path,
            delimiter=delimiter_code,
            quote=quote_code,
            skip_rows=self.skip_rows,
            fieldnames=self.fieldnames,
            generate_ids=self.generate_ids,
        )
34
+
35
+
36
+ __all__ = ["CSVInputAdapter"]
@@ -0,0 +1,38 @@
1
+ import pathlib
2
+ import typing
3
+
4
+ from zoopipe.input_adapter.base import BaseInputAdapter
5
+ from zoopipe.zoopipe_rust_core import DuckDBReader
6
+
7
+
8
class DuckDBInputAdapter(BaseInputAdapter):
    """Reads rows from a DuckDB database via the native DuckDBReader.

    Exactly one of ``query`` or ``table_name`` must be supplied; a bare
    table name is expanded to a ``SELECT *`` query.
    """

    def __init__(
        self,
        source: typing.Union[str, pathlib.Path],
        query: str | None = None,
        table_name: str | None = None,
        generate_ids: bool = True,
    ):
        self.source_path = str(source)
        self.generate_ids = generate_ids

        if query is None:
            if table_name is None:
                raise ValueError("Either query or table_name must be provided")
            # NOTE(review): table_name is interpolated unescaped into SQL;
            # do not pass untrusted input here.
            self.query = f"SELECT * FROM {table_name}"
        else:
            if table_name is not None:
                raise ValueError("Only one of query or table_name should be provided")
            self.query = query

    def get_native_reader(self) -> DuckDBReader:
        """Return the native reader executing the configured query."""
        return DuckDBReader(
            self.source_path,
            self.query,
            generate_ids=self.generate_ids,
        )
36
+
37
+
38
+ __all__ = ["DuckDBInputAdapter"]
@@ -0,0 +1,19 @@
1
+ import pathlib
2
+ import typing
3
+
4
+ from zoopipe.input_adapter.base import BaseInputAdapter
5
+ from zoopipe.zoopipe_rust_core import JSONReader
6
+
7
+
8
class JSONInputAdapter(BaseInputAdapter):
    """Reads entries from a JSON source via the native JSONReader."""

    def __init__(
        self,
        source: typing.Union[str, pathlib.Path],
    ):
        # Normalized to str: the Rust reader takes a plain path string.
        self.source_path = str(source)

    def get_native_reader(self) -> JSONReader:
        """Return the native reader for this source."""
        return JSONReader(self.source_path)
17
+
18
+
19
+ __all__ = ["JSONInputAdapter"]
@@ -0,0 +1,24 @@
1
+ import pathlib
2
+ import typing
3
+
4
+ from zoopipe.input_adapter.base import BaseInputAdapter
5
+ from zoopipe.zoopipe_rust_core import ParquetReader
6
+
7
+
8
class ParquetInputAdapter(BaseInputAdapter):
    """Reads entries from a Parquet file via the native ParquetReader."""

    def __init__(
        self,
        source: typing.Union[str, pathlib.Path],
        generate_ids: bool = True,
    ):
        # Normalized to str: the Rust reader takes a plain path string.
        self.source_path = str(source)
        self.generate_ids = generate_ids

    def get_native_reader(self) -> ParquetReader:
        """Return the native reader for this source."""
        return ParquetReader(
            self.source_path,
            generate_ids=self.generate_ids,
        )
22
+
23
+
24
+ __all__ = ["ParquetInputAdapter"]
@@ -0,0 +1,23 @@
1
+ import typing
2
+
3
+ from zoopipe.input_adapter.base import BaseInputAdapter
4
+ from zoopipe.zoopipe_rust_core import PyGeneratorReader
5
+
6
+
7
class PyGeneratorInputAdapter(BaseInputAdapter):
    """Feeds an in-memory Python iterable into the pipe.

    Useful for tests and in-process streaming where no file or database
    source exists.
    """

    def __init__(
        self,
        iterable: typing.Iterable[typing.Any],
        generate_ids: bool = True,
    ):
        self.iterable = iterable
        self.generate_ids = generate_ids

    def get_native_reader(self) -> PyGeneratorReader:
        """Wrap the iterable in the native reader."""
        return PyGeneratorReader(
            self.iterable,
            generate_ids=self.generate_ids,
        )
21
+
22
+
23
+ __all__ = ["PyGeneratorInputAdapter"]
@@ -0,0 +1,35 @@
1
+ from zoopipe.input_adapter.base import BaseInputAdapter
2
+ from zoopipe.zoopipe_rust_core import SQLReader
3
+
4
+
5
class SQLInputAdapter(BaseInputAdapter):
    """Reads rows from a SQL database (via the native SQLReader).

    Exactly one of ``query`` or ``table_name`` must be supplied; a bare
    table name is expanded to a ``SELECT *`` query.
    """

    def __init__(
        self,
        uri: str,
        query: str | None = None,
        table_name: str | None = None,
        generate_ids: bool = True,
    ):
        self.uri = uri
        self.generate_ids = generate_ids

        if query is None:
            if table_name is None:
                raise ValueError("Either query or table_name must be provided")
            # NOTE(review): table_name is interpolated unescaped into SQL;
            # do not pass untrusted input here.
            self.query = f"SELECT * FROM {table_name}"
        else:
            if table_name is not None:
                raise ValueError("Only one of query or table_name should be provided")
            self.query = query

    def get_native_reader(self) -> SQLReader:
        """Return the native reader executing the configured query."""
        return SQLReader(
            self.uri,
            self.query,
            generate_ids=self.generate_ids,
        )
33
+
34
+
35
+ __all__ = ["SQLInputAdapter"]
@@ -0,0 +1,19 @@
1
+ from zoopipe.output_adapter.arrow import ArrowOutputAdapter
2
+ from zoopipe.output_adapter.base import BaseOutputAdapter
3
+ from zoopipe.output_adapter.csv import CSVOutputAdapter
4
+ from zoopipe.output_adapter.duckdb import DuckDBOutputAdapter
5
+ from zoopipe.output_adapter.json import JSONOutputAdapter
6
+ from zoopipe.output_adapter.parquet import ParquetOutputAdapter
7
+ from zoopipe.output_adapter.pygen import PyGeneratorOutputAdapter
8
+ from zoopipe.output_adapter.sql import SQLOutputAdapter
9
+
10
+ __all__ = [
11
+ "BaseOutputAdapter",
12
+ "CSVOutputAdapter",
13
+ "JSONOutputAdapter",
14
+ "DuckDBOutputAdapter",
15
+ "ArrowOutputAdapter",
16
+ "SQLOutputAdapter",
17
+ "ParquetOutputAdapter",
18
+ "PyGeneratorOutputAdapter",
19
+ ]
@@ -0,0 +1,20 @@
1
+ import pathlib
2
+ import typing
3
+
4
+ from zoopipe.output_adapter.base import BaseOutputAdapter
5
+ from zoopipe.zoopipe_rust_core import ArrowWriter
6
+
7
+
8
class ArrowOutputAdapter(BaseOutputAdapter):
    """Writes entries to an Arrow IPC file via the native ArrowWriter."""

    def __init__(
        self,
        output: typing.Union[str, pathlib.Path],
    ):
        self.output_path = str(output)

    def get_native_writer(self) -> ArrowWriter:
        """Create missing parent directories, then open the native writer."""
        target = pathlib.Path(self.output_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        return ArrowWriter(self.output_path)
18
+
19
+
20
+ __all__ = ["ArrowOutputAdapter"]
@@ -0,0 +1,8 @@
1
+ import abc
2
+ import typing
3
+
4
+
5
class BaseOutputAdapter(abc.ABC):
    """Abstract base for output adapters.

    Concrete subclasses wrap one of the native (Rust) writer types and
    return it from ``get_native_writer``.
    """

    @abc.abstractmethod
    def get_native_writer(self) -> typing.Any:
        """Build and return the native writer backing this adapter."""
        raise NotImplementedError
@@ -0,0 +1,31 @@
1
+ import pathlib
2
+ import typing
3
+
4
+ from zoopipe.output_adapter.base import BaseOutputAdapter
5
+ from zoopipe.zoopipe_rust_core import CSVWriter
6
+
7
+
8
class CSVOutputAdapter(BaseOutputAdapter):
    """Writes entries to a CSV file via the native CSVWriter.

    ``delimiter`` and ``quotechar`` must each be a single character; they
    are converted to byte codes for the native writer.
    """

    def __init__(
        self,
        output: typing.Union[str, pathlib.Path],
        delimiter: str = ",",
        quotechar: str = '"',
        fieldnames: list[str] | None = None,
    ):
        self.output_path = str(output)
        self.delimiter = delimiter
        self.quotechar = quotechar
        self.fieldnames = fieldnames

    def get_native_writer(self) -> CSVWriter:
        """Create missing parent directories, then open the native writer."""
        target = pathlib.Path(self.output_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        # The Rust core expects single-byte character codes, not strings.
        return CSVWriter(
            self.output_path,
            delimiter=ord(self.delimiter),
            quote=ord(self.quotechar),
            fieldnames=self.fieldnames,
        )
29
+
30
+
31
+ __all__ = ["CSVOutputAdapter"]
@@ -0,0 +1,31 @@
1
+ import pathlib
2
+ import typing
3
+
4
+ from zoopipe.output_adapter.base import BaseOutputAdapter
5
+ from zoopipe.zoopipe_rust_core import DuckDBWriter
6
+
7
+
8
class DuckDBOutputAdapter(BaseOutputAdapter):
    """Writes entries into a DuckDB table via the native DuckDBWriter."""

    def __init__(
        self,
        output: typing.Union[str, pathlib.Path],
        table_name: str,
        mode: str = "replace",
    ):
        # Validate up front so a bad mode never produces a half-built adapter.
        if mode not in ("replace", "append", "fail"):
            raise ValueError("mode must be 'replace', 'append', or 'fail'")

        self.output_path = str(output)
        self.table_name = table_name
        self.mode = mode

    def get_native_writer(self) -> DuckDBWriter:
        """Create missing parent directories, then open the native writer."""
        target = pathlib.Path(self.output_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        return DuckDBWriter(
            self.output_path,
            self.table_name,
            mode=self.mode,
        )
29
+
30
+
31
+ __all__ = ["DuckDBOutputAdapter"]
@@ -0,0 +1,28 @@
1
+ import pathlib
2
+ import typing
3
+
4
+ from zoopipe.output_adapter.base import BaseOutputAdapter
5
+ from zoopipe.zoopipe_rust_core import JSONWriter
6
+
7
+
8
class JSONOutputAdapter(BaseOutputAdapter):
    """Writes entries as JSON via the native JSONWriter."""

    def __init__(
        self,
        output: typing.Union[str, pathlib.Path],
        format: str = "array",  # shadows the builtin, kept for API compatibility
        indent: int | None = None,
    ):
        self.output_path = str(output)
        self.format = format
        self.indent = indent

    def get_native_writer(self) -> JSONWriter:
        """Create missing parent directories, then open the native writer."""
        destination = pathlib.Path(self.output_path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        return JSONWriter(
            self.output_path,
            format=self.format,
            indent=self.indent,
        )
26
+
27
+
28
+ __all__ = ["JSONOutputAdapter"]
@@ -0,0 +1,19 @@
1
+ import pathlib
2
+ import typing
3
+
4
+ from zoopipe.output_adapter.base import BaseOutputAdapter
5
+ from zoopipe.zoopipe_rust_core import ParquetWriter
6
+
7
+
8
class ParquetOutputAdapter(BaseOutputAdapter):
    """Writes entries to a Parquet file via the native ParquetWriter."""

    def __init__(
        self,
        path: typing.Union[str, pathlib.Path],
    ):
        # Normalized to str: the Rust writer takes a plain path string.
        self.path = str(path)

    def get_native_writer(self) -> ParquetWriter:
        """Open the native writer, creating parent directories first.

        Fix: every other file-based output adapter (CSV, JSON, Arrow,
        DuckDB) creates missing parent directories before opening; this
        adapter previously did not, so writing to a fresh directory failed.
        """
        pathlib.Path(self.path).parent.mkdir(parents=True, exist_ok=True)
        return ParquetWriter(self.path)
17
+
18
+
19
+ __all__ = ["ParquetOutputAdapter"]
@@ -0,0 +1,16 @@
1
+ from zoopipe.output_adapter.base import BaseOutputAdapter
2
+ from zoopipe.zoopipe_rust_core import PyGeneratorWriter
3
+
4
+
5
class PyGeneratorOutputAdapter(BaseOutputAdapter):
    """Exposes pipe output as an in-process Python iterable.

    The native writer is created eagerly and bounded by ``queue_size``;
    iterating the adapter yields entries as the pipe produces them.
    NOTE(review): ``__iter__`` returns the native writer directly, which
    presumes PyGeneratorWriter implements the iterator protocol — confirm
    in the Rust core.
    """

    def __init__(self, queue_size: int = 1000):
        self._writer = PyGeneratorWriter(queue_size=queue_size)

    def get_native_writer(self) -> PyGeneratorWriter:
        """Return the (shared) native writer instance."""
        return self._writer

    def __iter__(self):
        return self._writer
14
+
15
+
16
+ __all__ = ["PyGeneratorOutputAdapter"]
@@ -0,0 +1,27 @@
1
+ from zoopipe.output_adapter.base import BaseOutputAdapter
2
+ from zoopipe.zoopipe_rust_core import SQLWriter
3
+
4
+
5
class SQLOutputAdapter(BaseOutputAdapter):
    """Writes entries into a SQL database table via the native SQLWriter.

    NOTE(review): unlike DuckDBOutputAdapter, ``mode`` is not validated
    here — invalid values are passed straight to the Rust core; confirm
    its accepted values.
    """

    def __init__(
        self,
        uri: str,
        table_name: str,
        mode: str = "replace",
        batch_size: int = 500,
    ):
        self.uri = uri
        self.table_name = table_name
        self.mode = mode
        # Rows per INSERT batch sent to the native writer.
        self.batch_size = batch_size

    def get_native_writer(self) -> SQLWriter:
        """Return the native writer for the configured connection/table."""
        return SQLWriter(
            self.uri,
            self.table_name,
            mode=self.mode,
            batch_size=self.batch_size,
        )
25
+
26
+
27
+ __all__ = ["SQLOutputAdapter"]
zoopipe/report.py ADDED
@@ -0,0 +1,108 @@
1
+ import enum
2
+ import logging
3
+ import sys
4
+ import threading
5
+ import typing
6
+ from datetime import datetime
7
+
8
+
9
def get_logger(name: str = "zoopipe") -> logging.Logger:
    """Return the named logger, attaching a stdout handler only once.

    Subsequent calls for the same name see existing handlers and return
    the logger unchanged, so handlers are never duplicated.
    """
    logger = logging.getLogger(name)
    if logger.handlers:
        return logger

    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setFormatter(
        logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    )
    logger.addHandler(stream_handler)
    logger.setLevel(logging.INFO)
    return logger
19
+
20
+
21
class EntryStatus(enum.Enum):
    """Validation state of a single entry as set by Pipe._process_batch."""

    PENDING = "pending"
    VALIDATED = "validated"
    FAILED = "failed"
25
+
26
+
27
class EntryTypedDict(typing.TypedDict):
    """Shape of one record as it flows through the pipe."""

    # Entry identifier; presumably produced when a reader's generate_ids
    # option is set — confirm against the Rust core.
    id: typing.Any
    # Source position/row index, when known.
    position: int | None
    # Current lifecycle state; updated during validation.
    status: EntryStatus
    # Raw record as read from the input adapter.
    raw_data: dict[str, typing.Any]
    # Schema-validated form; None until (and unless) validation succeeds.
    validated_data: dict[str, typing.Any] | None
    # Accumulated error records ({"msg": ..., "type": ...}).
    errors: list[dict[str, typing.Any]]
    # Free-form per-entry metadata.
    metadata: dict[str, typing.Any]
35
+
36
+
37
class FlowStatus(enum.Enum):
    """Overall lifecycle state of a pipe run (see FlowReport)."""

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    ABORTED = "aborted"
43
+
44
+
45
+ class FlowReport:
46
+ def __init__(self) -> None:
47
+ self.status = FlowStatus.PENDING
48
+ self.total_processed = 0
49
+ self.success_count = 0
50
+ self.error_count = 0
51
+ self.ram_bytes = 0
52
+ self.exception: Exception | None = None
53
+ self.start_time: datetime | None = None
54
+ self.end_time: datetime | None = None
55
+ self._finished_event = threading.Event()
56
+
57
+ @property
58
+ def duration(self) -> float:
59
+ start = self.start_time
60
+ if not start:
61
+ return 0.0
62
+ end = self.end_time or datetime.now()
63
+ return (end - start).total_seconds()
64
+
65
+ @property
66
+ def items_per_second(self) -> float:
67
+ duration = self.duration
68
+ if duration == 0:
69
+ return 0.0
70
+ return self.total_processed / duration
71
+
72
+ @property
73
+ def is_finished(self) -> bool:
74
+ return self._finished_event.is_set()
75
+
76
+ def wait(self, timeout: float | None = None) -> bool:
77
+ return self._finished_event.wait(timeout)
78
+
79
+ def _mark_running(self) -> None:
80
+ self.status = FlowStatus.RUNNING
81
+ self.start_time = datetime.now()
82
+
83
+ def _mark_completed(self) -> None:
84
+ self.status = FlowStatus.COMPLETED
85
+ self.end_time = datetime.now()
86
+ self._finished_event.set()
87
+
88
+ def abort(self) -> None:
89
+ self.status = FlowStatus.ABORTED
90
+ self.end_time = datetime.now()
91
+ self._finished_event.set()
92
+
93
+ def _mark_failed(self, exception: Exception) -> None:
94
+ self.status = FlowStatus.FAILED
95
+ self.exception = exception
96
+ self.end_time = datetime.now()
97
+ self._finished_event.set()
98
+
99
+ def __repr__(self) -> str:
100
+ return (
101
+ f"<FlowReport status={self.status.value} "
102
+ f"processed={self.total_processed} "
103
+ f"success={self.success_count} "
104
+ f"error={self.error_count} "
105
+ f"ram={self.ram_bytes / 1024 / 1024:.2f}MB "
106
+ f"fps={self.items_per_second:.2f} "
107
+ f"duration={self.duration:.2f}s>"
108
+ )
Binary file
@@ -0,0 +1,110 @@
1
+ Metadata-Version: 2.4
2
+ Name: zoopipe
3
+ Version: 2026.1.14
4
+ Requires-Dist: pydantic>=2.12.5
5
+ License-File: LICENSE
6
+ Summary: ZooPipe is a data processing framework that allows you to process data in a declarative way.
7
+ Author-email: Alberto Daniel Badia <alberto_badia@enlacepatagonia.com>
8
+ Requires-Python: >=3.10, <3.14
9
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
10
+ Project-URL: Homepage, https://github.com/albertobadia/zoopipe
11
+
12
+ # ZooPipe
13
+
14
+ **ZooPipe** is a lean, ultra-high-performance data processing engine for Python. It leverages a **100% Rust core** to handle I/O and orchestration, while keeping the flexibility of Python for schema validation (via Pydantic) and custom data enrichment (via Hooks).
15
+
16
+ ---
17
+
18
+ ## ✨ Key Features
19
+
20
+ - 🚀 **100% Native Rust Engine**: The core execution loop, including CSV and JSON parsing/writing, is implemented in Rust for maximum throughput.
21
+ - 🔍 **Declarative Validation**: Use [Pydantic](https://docs.pydantic.dev/) models to define and validate your data structures naturally.
22
+ - 🪝 **Python Hooks**: Transform and enrich data at any stage using standard Python functions or classes.
23
+ - ⚡ **Zero-Copy Intent**: Minimal overhead between the Rust processing engine and Python validation/hooks.
24
+ - 🚨 **Automated Error Routing**: Native support for routing failed records to a dedicated error output.
25
+ - 📊 **Multiple Format Support**: Optimized readers/writers for CSV, JSONL, and SQL databases (via SQLx with batch inserts).
26
+ - 🔧 **Pluggable Executors**: Choose between single-threaded or multi-threaded execution strategies.
27
+
28
+ ---
29
+
30
+ ## 🚀 Quick Start
31
+
32
+ ### Installation
33
+
34
+ ```bash
35
+ uv build
36
+ uv run maturin develop --release
37
+ ```
38
+
39
+ ### Simple Example
40
+
41
+ ```python
42
+ from pydantic import BaseModel, ConfigDict
43
+ from zoopipe import CSVInputAdapter, CSVOutputAdapter, Pipe
44
+
45
+
46
+ class UserSchema(BaseModel):
47
+ model_config = ConfigDict(extra="ignore")
48
+ user_id: str
49
+ username: str
50
+ age: int
51
+
52
+
53
+ pipe = Pipe(
54
+ input_adapter=CSVInputAdapter("users.csv"),
55
+ output_adapter=CSVOutputAdapter("processed_users.csv"),
56
+ error_output_adapter=CSVOutputAdapter("errors.csv"),
57
+ schema_model=UserSchema,
58
+ )
59
+
60
+ pipe.start()
61
+ pipe.wait()
62
+
63
+ print(f"Finished! Processed {pipe.report.total_processed} items.")
64
+ ```
65
+
66
+ ---
67
+
68
+ ## 📚 Documentation
69
+
70
+ ### Core Concepts
71
+
72
+ - [**Executors Guide**](docs/executors.md) - Choose and configure execution strategies
73
+
74
+ ### Input/Output Adapters
75
+
76
+ #### File Formats
77
+
78
+ - [**CSV Adapters**](docs/csv.md) - High-performance CSV reading and writing
79
+ - [**JSON Adapters**](docs/json.md) - JSONL and JSON array format support
80
+ - [**Parquet Adapters**](docs/parquet.md) - Columnar storage for analytics and data lakes
81
+ - [**Arrow Adapters**](docs/arrow.md) - Apache Arrow IPC format for zero-copy interoperability
82
+
83
+ #### Databases
84
+
85
+ - [**SQL Adapters**](docs/sql.md) - Read from and write to SQL databases with batch optimization
86
+ - [**DuckDB Adapters**](docs/duckdb.md) - Analytical database for OLAP workloads
87
+
88
+ #### Advanced
89
+
90
+ - [**Python Generator Adapters**](docs/pygen.md) - In-memory streaming and testing
91
+ - [**Cloud Storage (S3)**](docs/cloud-storage.md) - Read and write data from Amazon S3 and compatible services
92
+
93
+ ---
94
+
95
+ ## 🛠 Architecture
96
+
97
+ ZooPipe is designed as a thin Python wrapper around a powerful Rust core:
98
+
99
+ 1. **Python Layer**: Configuration, Pydantic models, and custom Hooks.
100
+ 2. **Rust Core**:
101
+ - **Adapters**: High-speed CSV/JSON/SQL Readers and Writers with optimized batch operations.
102
+ - **NativePipe**: Orchestrates the loop, fetching chunks, calling a consolidated Python batch processor, and routing result batches.
103
+ - **Executors**: Single-threaded or multi-threaded batch processing strategies.
104
+
105
+ ---
106
+
107
+ ## 📄 License
108
+
109
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
110
+
@@ -0,0 +1,28 @@
1
+ zoopipe/__init__.py,sha256=4XOTaRdZHxI92k_pvt9gGQRIFJXJutzRDWBBIBVneJw,1725
2
+ zoopipe/core.py,sha256=BBRu0U6SaTAFVAamOG7USsihuOZODSJMG__TkUyldOw,4594
3
+ zoopipe/hooks/__init__.py,sha256=yuEH4UryYk3bpphFonmJtc_9AOt4WeU5m_IDMsULHCs,118
4
+ zoopipe/hooks/base.py,sha256=_s1IP2rvucQOf_pRPOTmWAAJzSyAE9LHZ9fXRb6Nh84,565
5
+ zoopipe/input_adapter/__init__.py,sha256=rzwJcMvgIfTJDv38Xx20x0iYOoosm80DYTJ4VLELQoU,682
6
+ zoopipe/input_adapter/arrow.py,sha256=R8HgZrplciO_youK6EaL-OsYL7zOFDrwBBAuw8cfGYQ,579
7
+ zoopipe/input_adapter/base.py,sha256=NNzG1wqgHhT1E0tk-rkNJaqQfkNc9F_f-Gq0SPRuZO0,165
8
+ zoopipe/input_adapter/csv.py,sha256=rsGi4A495oqNlDs7KJIt82kG-Mo1MDWO_kWYRcrL4IY,1004
9
+ zoopipe/input_adapter/duckdb.py,sha256=C3d-Y0y5WDGXdgyrLJki5udHaIWF_BfMkuAqHBlnvcU,1079
10
+ zoopipe/input_adapter/json.py,sha256=69_0Mh_QMjRi2y2TuXAj6JLwdKWByeYagYcSvHkqnbM,431
11
+ zoopipe/input_adapter/parquet.py,sha256=iwMFcgP6COy8XMLAHynZJv2kihRvET-fYihr91bWIRQ,589
12
+ zoopipe/input_adapter/pygen.py,sha256=WNqx48pUuBFzimBqX7sZ0zf9bnlhuklVeMA4FiO7T84,583
13
+ zoopipe/input_adapter/sql.py,sha256=kdKOr3mMwZRukDKiNTVOOEnI1y-3t5wxfZEfGne5Zgg,979
14
+ zoopipe/output_adapter/__init__.py,sha256=ThLxko-_4OyIcHqW8PFyE0ac_-K0BpJDeeMfg-Dxl9U,706
15
+ zoopipe/output_adapter/arrow.py,sha256=oNcuGUihyPhHEp8CvbLa0UAoK_WL5IyVf8jiGzfmTQ0,522
16
+ zoopipe/output_adapter/base.py,sha256=4zGlyb_l3ma-eTN45NkhhFcoQh43ndpdFjpBnazO2qs,166
17
+ zoopipe/output_adapter/csv.py,sha256=LZDSJTnICgzJpk_0w2Gpo7Jqx5ztwmmictN-t-RoqQM,869
18
+ zoopipe/output_adapter/duckdb.py,sha256=FzYS5OdH1Y65AadCFr1AZfqu7tPzAlxrI0HWpjuZXdU,856
19
+ zoopipe/output_adapter/json.py,sha256=oGvp_CCqKCgdnJDNKr-VCfdcZXUqewZ1TvMyG8pqLcc,728
20
+ zoopipe/output_adapter/parquet.py,sha256=VJGVM4Mek6tEmgbJlpEyd-Zh9X_PIAQUDoJ5TwjCmdk,433
21
+ zoopipe/output_adapter/pygen.py,sha256=e_0hwftDggt3krF8vrd3I8YYDE7gc6YWCBAZzzQHbiQ,456
22
+ zoopipe/output_adapter/sql.py,sha256=w6WbwYTcuUSo417v9YaE7kPm2ixH41KpYaIHoHGR8GM,652
23
+ zoopipe/report.py,sha256=sL_M9Ce2HTVAsPX8qgStrIrI0lI1kbg3il_ntcYZkIg,3061
24
+ zoopipe/zoopipe_rust_core.abi3.so,sha256=xY7KoWVZWf-2F2jGW1GWj8uYcvpd0bxO6rNyYsEWI7M,59637840
25
+ zoopipe-2026.1.14.dist-info/METADATA,sha256=qj_U-ALKTMRKwnw5GKrARihBEojTl2JuV4HScSzoQBM,3806
26
+ zoopipe-2026.1.14.dist-info/WHEEL,sha256=vZ12AMAE5CVtd8oYbYGrz3omfHuIZCNO_3P50V00s00,104
27
+ zoopipe-2026.1.14.dist-info/licenses/LICENSE,sha256=4WRhonN0HErkcdxwCRoaBxdR4suhdVxZj_14XXMVgtw,1077
28
+ zoopipe-2026.1.14.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: maturin (1.11.5)
3
+ Root-Is-Purelib: false
4
+ Tag: cp310-abi3-macosx_11_0_arm64
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Alberto Daniel Badia
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.