pycairn 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycairn-0.0.1/PKG-INFO +53 -0
- pycairn-0.0.1/README.md +43 -0
- pycairn-0.0.1/pyproject.toml +26 -0
- pycairn-0.0.1/src/pycairn/__init__.py +10 -0
- pycairn-0.0.1/src/pycairn/_models.py +174 -0
- pycairn-0.0.1/src/pycairn/cairn.py +53 -0
- pycairn-0.0.1/src/pycairn/utils.py +31 -0
pycairn-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: pycairn
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A simple and lightweight pipeline manifest writer tool for Python.
|
|
5
|
+
Author: Shekhrozx
|
|
6
|
+
Author-email: Shekhrozx <shekhrozx@gmail.com>
|
|
7
|
+
Requires-Dist: pydantic>=2.13.4
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# PyCairn (A simple and lightweight pipeline manifest writer tool for Python)
|
|
12
|
+
|
|
13
|
+
This is a simple and lightweight pipeline manifest writer tool for Python. It allows you to easily track the steps of your data processing pipeline, including inputs, outputs, parameters, and metrics.
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
You can install PyCairn using pip:
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install pycairn
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Usage
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
from pycairn import Artifact, Cairn
|
|
26
|
+
|
|
27
|
+
cairn = Cairn(pipeline="etl", run_id="2026-06-23T01", path="2026-06-23T01.json")
|
|
28
|
+
|
|
29
|
+
# Step 1: extract
|
|
30
|
+
with cairn.step("extract", params={"source": "api/v2"}) as s:
|
|
31
|
+
out = run_extract() # writes data/raw.parquet
|
|
32
|
+
art = Artifact.from_path("/path/to/file", type="file-type", rows=len(out))
|
|
33
|
+
s.outputs.append(art)
|
|
34
|
+
s.metrics["rows"] = len(out)
|
|
35
|
+
|
|
36
|
+
# Step 2: transform
|
|
37
|
+
prev = cairn.output_of("extract")[0].path
|
|
38
|
+
with cairn.step("transform", inputs=[prev]) as s:
|
|
39
|
+
df = run_transform(prev) # writes data/clean.parquet
|
|
40
|
+
s.outputs.append(Artifact.from_path("data/clean.parquet", type="parquet", rows=len(df)))
|
|
41
|
+
s.metrics["null_rate"] = 0.02
|
|
42
|
+
|
|
43
|
+
...
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## License
|
|
47
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
48
|
+
|
|
49
|
+
## Contributing
|
|
50
|
+
Contributions are welcome! Please feel free to submit issues or pull requests.
|
|
51
|
+
|
|
52
|
+
## Acknowledgements
|
|
53
|
+
- Inspired by the need for a simple and effective way to track data processing pipelines in Python.
|
pycairn-0.0.1/README.md
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# PyCairn (A simple and lightweight pipeline manifest writer tool for Python)
|
|
2
|
+
|
|
3
|
+
This is a simple and lightweight pipeline manifest writer tool for Python. It allows you to easily track the steps of your data processing pipeline, including inputs, outputs, parameters, and metrics.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
You can install PyCairn using pip:
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
pip install pycairn
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## Usage
|
|
13
|
+
|
|
14
|
+
```python
|
|
15
|
+
from pycairn import Artifact, Cairn
|
|
16
|
+
|
|
17
|
+
cairn = Cairn(pipeline="etl", run_id="2026-06-23T01", path="2026-06-23T01.json")
|
|
18
|
+
|
|
19
|
+
# Step 1: extract
|
|
20
|
+
with cairn.step("extract", params={"source": "api/v2"}) as s:
|
|
21
|
+
out = run_extract() # writes data/raw.parquet
|
|
22
|
+
art = Artifact.from_path("/path/to/file", type="file-type", rows=len(out))
|
|
23
|
+
s.outputs.append(art)
|
|
24
|
+
s.metrics["rows"] = len(out)
|
|
25
|
+
|
|
26
|
+
# Step 2: transform
|
|
27
|
+
prev = cairn.output_of("extract")[0].path
|
|
28
|
+
with cairn.step("transform", inputs=[prev]) as s:
|
|
29
|
+
df = run_transform(prev) # writes data/clean.parquet
|
|
30
|
+
s.outputs.append(Artifact.from_path("data/clean.parquet", type="parquet", rows=len(df)))
|
|
31
|
+
s.metrics["null_rate"] = 0.02
|
|
32
|
+
|
|
33
|
+
...
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## License
|
|
37
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
38
|
+
|
|
39
|
+
## Contributing
|
|
40
|
+
Contributions are welcome! Please feel free to submit issues or pull requests.
|
|
41
|
+
|
|
42
|
+
## Acknowledgements
|
|
43
|
+
- Inspired by the need for a simple and effective way to track data processing pipelines in Python.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pycairn"
|
|
3
|
+
version = "0.0.1"
|
|
4
|
+
description = "A simple and lightweight pipeline manifest writer tool for Python."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Shekhrozx", email = "shekhrozx@gmail.com" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.10"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"pydantic>=2.13.4",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[[tool.mypy.overrides]]
|
|
15
|
+
module = "pycairn"
|
|
16
|
+
ignore_missing_imports = true
|
|
17
|
+
|
|
18
|
+
[build-system]
|
|
19
|
+
requires = ["uv_build>=0.11.6,<0.12.0"]
|
|
20
|
+
build-backend = "uv_build"
|
|
21
|
+
|
|
22
|
+
[dependency-groups]
|
|
23
|
+
test = [
|
|
24
|
+
"pytest>=9.1.1",
|
|
25
|
+
]
|
|
26
|
+
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Any
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
from pycairn import utils
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Artifact(BaseModel):
    """Description of a file produced by a pipeline step.

    ``bytes`` and ``sha256`` are only populated by :meth:`from_path` when the
    path points at an existing regular file; otherwise they stay ``None``.
    """

    path: str
    type: str | None = None  # "parquet", "model", "csv"...
    bytes: int | None = None
    sha256: str | None = None
    meta: dict[str, Any] = Field(default_factory=dict)  # rows, schema, etc.

    @classmethod
    def from_path(cls, path: str | Path, type: str | None = None, **meta: Any) -> "Artifact":
        """
        Build an Artifact describing the file at ``path``.

        Args:
            path (str | Path): Location of the artifact; stored as a string.
            type (str | None): Free-form artifact kind (e.g., "parquet", "model", "csv").
            **meta (Any): Extra key/value pairs stored verbatim in ``meta``.

        Returns:
            Artifact: The new instance. Size and SHA-256 digest are filled in
            only when ``path`` exists and is a regular file.
        """
        file_path = Path(path)
        fields: dict[str, Any] = {"path": str(file_path)}

        is_regular_file = file_path.exists() and file_path.is_file()
        if is_regular_file:
            # Size first, then digest -- same order as the values are stored.
            fields["bytes"] = file_path.stat().st_size
            fields["sha256"] = utils.sha256(file_path)

        return cls(type=type, meta=meta, **fields)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class Step(BaseModel):
    """One named stage of a pipeline run, as recorded in the manifest.

    Timestamps are ISO-8601 strings produced by ``utils``; ``duration_s`` is
    the elapsed time in seconds computed by :meth:`end`.
    """

    name: str
    status: utils.Status = utils.Status.pending  # pending|running|success|failed|skipped
    started_at: str | None = None
    ended_at: str | None = None
    duration_s: float | None = None
    inputs: list[str] = Field(default_factory=list)  # input paths
    outputs: list[Artifact] = Field(default_factory=list)
    metrics: dict[str, Any] = Field(default_factory=dict)
    params: dict[str, Any] = Field(default_factory=dict)
    error: str | None = None  # error text recorded by failed()

    def add_artifact(self, artifact: Artifact) -> None:
        """
        Add an artifact to the step's outputs.

        Args:
            artifact (Artifact): The artifact to add.

        Raises:
            TypeError: If ``artifact`` is not an ``Artifact`` instance.
            ValueError: If the artifact's path is already one of the step's inputs.
        """
        if not isinstance(artifact, Artifact):
            raise TypeError(f"Expected Artifact, got {type(artifact).__name__}")

        # ``inputs`` holds path strings, so the comparison must be made on the
        # artifact's path; testing the Artifact object itself never matches.
        if artifact.path in self.inputs:
            raise ValueError(f"Artifact with path '{artifact.path}' is already listed as an input.")

        self.outputs.append(artifact)

    def running(self, inputs: list[str] | None = None, params: dict | None = None) -> None:
        """
        Mark the step as running and record the start time.

        Results of a previous attempt (``error``, ``ended_at``, ``duration_s``)
        are cleared so that a re-run step does not carry stale failure data.
        ``outputs`` and ``metrics`` are left untouched.

        Args:
            inputs (list[str] | None): The input paths for the step.
            params (dict | None): The parameters for the step.
        """
        self.status = utils.Status.running
        self.started_at = utils.now_iso()
        self.ended_at = None
        self.duration_s = None
        self.error = None
        self.inputs = inputs or []
        self.params = params or {}

    def failed(self, error: str) -> None:
        """
        Mark the step as failed and record the error message.

        Args:
            error (str): The error message.
        """
        self.status = utils.Status.failed
        self.error = error

    def success(self) -> None:
        """Mark the step as successful."""
        self.status = utils.Status.success

    def skipped(self) -> None:
        """Mark the step as skipped."""
        self.status = utils.Status.skipped

    def end(self, start: datetime) -> None:
        """
        Record the end time and calculate the duration of the step.

        Args:
            start (datetime): The start time of the step; must be
                timezone-aware so it can be subtracted from ``utils.now()``.
        """
        # Read the clock once so ``ended_at`` and ``duration_s`` describe the
        # same instant.
        finished = utils.now()
        self.ended_at = finished.isoformat()
        self.duration_s = (finished - start).total_seconds()
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class Manifest(BaseModel):
    """Top-level record of a pipeline run: identity, status and its steps."""

    run_id: str
    pipeline: str
    created_at: str = Field(default_factory=utils.now_iso)
    status: utils.Status = utils.Status.running  # running|success|failed
    steps: list[Step] = Field(default_factory=list)

    def find_step(self, name: str) -> Step | None:
        """
        Look up a step by its name.

        Args:
            name (str): The step name to search for.

        Returns:
            Step | None: The first step with that name, or None when absent.
        """
        for candidate in self.steps:
            if candidate.name == name:
                return candidate
        return None

    def get_or_create_step(self, name: str) -> Step:
        """
        Return the step called ``name``, appending a new one if it is missing.

        Args:
            name (str): The step name.

        Returns:
            Step: The existing step, or the freshly created and registered one.
        """
        existing = self.find_step(name)
        if existing is not None:
            return existing

        created = Step(name=name)
        self.steps.append(created)
        return created

    def output_of(self, step_name: str) -> list[Artifact]:
        """
        Return the artifacts recorded as outputs of a step.

        Args:
            step_name (str): The step name.

        Returns:
            list[Artifact]: The step's outputs, or an empty list when no step
            with that name exists.
        """
        found = self.find_step(step_name)
        if found is None:
            return []
        return found.outputs

    def failed(self) -> None:
        """Set the manifest status to failed."""
        self.status = utils.Status.failed

    def success(self) -> None:
        """Set the manifest status to success."""
        self.status = utils.Status.success

    def is_all_done(self) -> bool:
        """Return True when every step's status is success or skipped."""
        for recorded in self.steps:
            if recorded.status not in utils.SUCCESS_STATUSES:
                return False
        return True
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import traceback
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from contextlib import contextmanager
|
|
7
|
+
|
|
8
|
+
from pycairn import Artifact, Manifest, utils
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Cairn:
    """Writer that keeps a pipeline run's manifest in sync with a JSON file.

    Args:
        pipeline (str): Pipeline name stored in a newly created manifest.
        run_id (str): Run identifier stored in a newly created manifest.
        path (str | Path): Manifest file location. Missing parent directories
            are created. When the file already exists it is loaded and
            ``pipeline`` / ``run_id`` are not applied to it.
    """

    def __init__(self, pipeline: str, run_id: str, path: str | Path):
        self.path = Path(path)
        self.path.parent.mkdir(parents=True, exist_ok=True)
        if self.path.exists():
            # Read as UTF-8 explicitly so the result does not depend on the
            # platform's locale encoding.
            self.manifest = Manifest.model_validate_json(self.path.read_text(encoding="utf-8"))
        else:
            self.manifest = Manifest(run_id=run_id, pipeline=pipeline)
            self._save()

    def _save(self) -> None:
        """Persist the manifest atomically: write tmp -> fsync -> rename."""
        tmp = self.path.with_suffix(self.path.suffix + ".tmp")
        payload = self.manifest.model_dump_json(indent=2)
        with open(tmp, "w", encoding="utf-8") as fh:
            fh.write(payload)
            # Push the data to disk before the rename so the file that
            # replaces the old manifest is fully written.
            fh.flush()
            os.fsync(fh.fileno())
        os.replace(tmp, self.path)

    @contextmanager
    def step(self, name: str, inputs: list[str] | None = None, params: dict | None = None):
        """
        Context manager that records the execution of one pipeline step.

        The step is marked running and saved before the body executes. If the
        body completes, the step is marked successful; if it raises an
        ``Exception``, the traceback is stored on the step, the manifest is
        marked failed and the exception is re-raised. In every case the end
        time and duration are recorded and the manifest is saved.

        Args:
            name (str): Step name; an existing step with this name is reused.
            inputs (list[str] | None): Input paths for the step.
            params (dict | None): Parameters for the step.

        Yields:
            The step object, so the caller can fill in outputs and metrics.
        """
        step = self.manifest.get_or_create_step(name)

        step.running(inputs=inputs, params=params)
        self._save()

        start = utils.now()
        try:
            yield step  # caller fills outputs/metrics
            step.success()
        except Exception:
            step.failed(traceback.format_exc())
            self.manifest.failed()
            # No explicit save here: the ``finally`` clause below always
            # persists the manifest before the exception propagates.
            raise
        finally:
            step.end(start)
            self._save()

        # mark whole run done if last step succeeded and nothing failed
        if self.manifest.is_all_done():
            self.manifest.success()
            self._save()

    def output_of(self, step_name: str) -> list[Artifact]:
        """Return the output artifacts recorded for ``step_name`` (empty list if unknown)."""
        return self.manifest.output_of(step_name)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from enum import StrEnum
|
|
4
|
+
import hashlib
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def now() -> datetime:
    """Return the current time as a timezone-aware UTC ``datetime``."""
    utc = timezone.utc
    return datetime.now(tz=utc)
|
|
11
|
+
|
|
12
|
+
def now_iso() -> str:
    """Return the current UTC time formatted as an ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def sha256(path: Path, chunk: int = 1 << 20) -> str:
    """Return the hex SHA-256 digest of the file at ``path``.

    The file is read in binary mode, ``chunk`` bytes at a time, so it is never
    loaded into memory as a whole.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            data = stream.read(chunk)
            if data == b"":
                # An empty read marks the end of the file.
                break
            digest.update(data)
    return digest.hexdigest()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# NOTE(review): ``StrEnum`` was added to the standard library in Python 3.11,
# but the package metadata declares ``requires-python = ">=3.10"``; the
# ``from enum import StrEnum`` import would fail on 3.10 -- confirm the
# intended minimum version.
class Status(StrEnum):
    """Lifecycle states used for steps and manifests; each member's value is its own name."""

    pending = "pending"
    running = "running"
    success = "success"
    failed = "failed"
    skipped = "skipped"

# Statuses that count as "done" when checking whether every step has finished.
SUCCESS_STATUSES = (Status.success, Status.skipped)
|