pycairn 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pycairn-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,53 @@
1
+ Metadata-Version: 2.3
2
+ Name: pycairn
3
+ Version: 0.0.1
4
+ Summary: A simple and lightweight pipeline manifest writer tool for Python.
5
+ Author: Shekhrozx
6
+ Author-email: Shekhrozx <shekhrozx@gmail.com>
7
+ Requires-Dist: pydantic>=2.13.4
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+
11
+ # PyCairn (A simple and lightweight pipeline manifest writer tool for Python)
12
+
13
+ This is a simple and lightweight pipeline manifest writer tool for Python. It allows you to easily track the steps of your data processing pipeline, including inputs, outputs, parameters, and metrics.
14
+
15
+ ## Installation
16
+ You can install PyCairn using pip:
17
+
18
+ ```bash
19
+ pip install pycairn
20
+ ```
21
+
22
+ ## Usage
23
+
24
+ ```python
25
+ from pycairn import Artifact, Cairn
26
+
27
+ cairn = Cairn(pipeline="etl", run_id="2026-06-23T01", path="2026-06-23T01.json")
28
+
29
+ # Step 1: extract
30
+ with cairn.step("extract", params={"source": "api/v2"}) as s:
31
+ out = run_extract() # writes data/raw.parquet
32
+ art = Artifact.from_path("/path/to/file", type="file-type", rows=len(out))
33
+ s.outputs.append(art)
34
+ s.metrics["rows"] = len(out)
35
+
36
+ # Step 2: transform
37
+ prev = cairn.output_of("extract")[0].path
38
+ with cairn.step("transform", inputs=[prev]) as s:
39
+ df = run_transform(prev) # writes data/clean.parquet
40
+ s.outputs.append(Artifact.from_path("data/clean.parquet", type="parquet", rows=len(df)))
41
+ s.metrics["null_rate"] = 0.02
42
+
43
+ ...
44
+ ```
45
+
46
+ ## License
47
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
48
+
49
+ ## Contributing
50
+ Contributions are welcome! Please feel free to submit issues or pull requests.
51
+
52
+ ## Acknowledgements
53
+ - Inspired by the need for a simple and effective way to track data processing pipelines in Python.
@@ -0,0 +1,43 @@
1
+ # PyCairn (A simple and lightweight pipeline manifest writer tool for Python)
2
+
3
+ This is a simple and lightweight pipeline manifest writer tool for Python. It allows you to easily track the steps of your data processing pipeline, including inputs, outputs, parameters, and metrics.
4
+
5
+ ## Installation
6
+ You can install PyCairn using pip:
7
+
8
+ ```bash
9
+ pip install pycairn
10
+ ```
11
+
12
+ ## Usage
13
+
14
+ ```python
15
+ from pycairn import Artifact, Cairn
16
+
17
+ cairn = Cairn(pipeline="etl", run_id="2026-06-23T01", path="2026-06-23T01.json")
18
+
19
+ # Step 1: extract
20
+ with cairn.step("extract", params={"source": "api/v2"}) as s:
21
+ out = run_extract() # writes data/raw.parquet
22
+ art = Artifact.from_path("/path/to/file", type="file-type", rows=len(out))
23
+ s.outputs.append(art)
24
+ s.metrics["rows"] = len(out)
25
+
26
+ # Step 2: transform
27
+ prev = cairn.output_of("extract")[0].path
28
+ with cairn.step("transform", inputs=[prev]) as s:
29
+ df = run_transform(prev) # writes data/clean.parquet
30
+ s.outputs.append(Artifact.from_path("data/clean.parquet", type="parquet", rows=len(df)))
31
+ s.metrics["null_rate"] = 0.02
32
+
33
+ ...
34
+ ```
35
+
36
+ ## License
37
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
38
+
39
+ ## Contributing
40
+ Contributions are welcome! Please feel free to submit issues or pull requests.
41
+
42
+ ## Acknowledgements
43
+ - Inspired by the need for a simple and effective way to track data processing pipelines in Python.
@@ -0,0 +1,26 @@
1
[project]
name = "pycairn"
version = "0.0.1"
description = "A simple and lightweight pipeline manifest writer tool for Python."
readme = "README.md"
authors = [
    { name = "Shekhrozx", email = "shekhrozx@gmail.com" }
]
# pycairn/utils.py imports enum.StrEnum, which was added in Python 3.11,
# so the package cannot be imported on 3.10.
requires-python = ">=3.11"
dependencies = [
    "pydantic>=2.13.4",
]

[[tool.mypy.overrides]]
module = "pycairn"
ignore_missing_imports = true

[build-system]
requires = ["uv_build>=0.11.6,<0.12.0"]
build-backend = "uv_build"

[dependency-groups]
test = [
    "pytest>=9.1.1",
]
26
+
@@ -0,0 +1,10 @@
1
+ from ._models import Manifest, Step, Artifact
2
+ from .cairn import Cairn
3
+
4
+
5
# Names re-exported as the package's public API.
__all__ = ["Artifact", "Manifest", "Step", "Cairn"]
@@ -0,0 +1,174 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+ from typing import Any
5
+ from pathlib import Path
6
+ from pydantic import BaseModel, Field
7
+
8
+ from pycairn import utils
9
+
10
+
11
class Artifact(BaseModel):
    """Description of a file that a pipeline step produced.

    ``bytes`` and ``sha256`` remain ``None`` unless ``from_path`` found a
    regular file at ``path`` when the artifact was created.
    """

    path: str
    type: str | None = None  # free-form label: "parquet", "model", "csv"...
    bytes: int | None = None  # file size as reported by ``Path.stat().st_size``
    sha256: str | None = None  # hex digest computed by ``utils.sha256``
    meta: dict[str, Any] = Field(default_factory=dict)  # rows, schema, etc.

    @classmethod
    def from_path(cls, path: str | Path, type: str | None = None, **meta: Any) -> "Artifact":
        """
        Build an Artifact describing the file at ``path``.

        When ``path`` points at an existing regular file, its size and
        SHA-256 digest are recorded; otherwise both are left unset.

        Args:
            path (str | Path): Location of the artifact.
            type (str | None): Label for the artifact kind (e.g., "parquet", "model", "csv").
            **meta (Any): Extra key/value pairs stored under ``meta``.

        Returns:
            Artifact: The newly created instance.
        """
        target = Path(path)
        payload: dict[str, Any] = {"path": str(target), "type": type, "meta": meta}

        # ``is_file()`` is already False for a missing path, so no separate
        # existence check is needed.
        if target.is_file():
            payload["bytes"] = target.stat().st_size
            payload["sha256"] = utils.sha256(target)

        return cls(**payload)
37
+
38
+
39
class Step(BaseModel):
    """One named unit of work in a pipeline run: its state, timing, I/O and outcome."""

    name: str
    status: utils.Status = utils.Status.pending  # pending|running|success|failed|skipped
    started_at: str | None = None  # set by ``running()`` via ``utils.now_iso()``
    ended_at: str | None = None  # set by ``end()``
    duration_s: float | None = None  # seconds between ``start`` and ``end()``
    inputs: list[str] = Field(default_factory=list)  # input paths
    outputs: list[Artifact] = Field(default_factory=list)
    metrics: dict[str, Any] = Field(default_factory=dict)
    params: dict[str, Any] = Field(default_factory=dict)
    error: str | None = None  # set by ``failed()``

    def add_artifact(self, artifact: Artifact) -> None:
        """
        Add an artifact to the step's outputs.

        Args:
            artifact (Artifact): The artifact to add.

        Raises:
            TypeError: If ``artifact`` is not an ``Artifact``.
            ValueError: If the artifact's path is already one of the step's inputs.
        """
        if not isinstance(artifact, Artifact):
            raise TypeError(f"Expected Artifact, got {type(artifact).__name__}")

        # ``inputs`` holds path strings, so the comparison has to be made on
        # the artifact's path; an Artifact object never equals a string.
        if artifact.path in self.inputs:
            raise ValueError(f"Artifact with path '{artifact.path}' is already listed as an input.")

        self.outputs.append(artifact)

    def running(self, inputs: list[str] | None = None, params: dict | None = None) -> None:
        """
        Mark the step as running and record the start time.

        Outcome fields left over from an earlier run of the same step
        (``error``, ``ended_at``, ``duration_s``) are cleared so that they
        cannot contradict the new status. ``outputs`` and ``metrics`` are
        left untouched.

        Args:
            inputs (list[str] | None): The input paths for the step.
            params (dict | None): The parameters for the step.
        """
        self.status = utils.Status.running
        self.started_at = utils.now_iso()
        # Copy so later mutation of the caller's containers does not change
        # what the step recorded.
        self.inputs = list(inputs) if inputs else []
        self.params = dict(params) if params else {}
        self.ended_at = None
        self.duration_s = None
        self.error = None

    def failed(self, error: str) -> None:
        """
        Mark the step as failed and record the error message.

        Args:
            error (str): The error message.
        """
        self.status = utils.Status.failed
        self.error = error

    def success(self) -> None:
        """Mark the step as successful."""
        self.status = utils.Status.success

    def skipped(self) -> None:
        """Mark the step as skipped."""
        self.status = utils.Status.skipped

    def end(self, start: datetime) -> None:
        """
        Record the end time and calculate the duration of the step.

        Args:
            start (datetime): The start time of the step. It is subtracted
                from ``utils.now()``, which is timezone-aware, so it must be
                timezone-aware as well.
        """
        # Read the clock once so ``ended_at`` and ``duration_s`` describe the
        # same instant.
        finished = utils.now()
        self.ended_at = finished.isoformat()
        self.duration_s = (finished - start).total_seconds()
112
+
113
+
114
class Manifest(BaseModel):
    """Record of one pipeline run: identity, overall status and its steps."""

    run_id: str
    pipeline: str
    created_at: str = Field(default_factory=utils.now_iso)
    status: utils.Status = utils.Status.running  # running|success|failed
    steps: list[Step] = Field(default_factory=list)

    def find_step(self, name: str) -> Step | None:
        """
        Look up the first step whose name equals ``name``.

        Args:
            name (str): Step name to search for.

        Returns:
            Step | None: The matching step, or None when there is none.
        """
        for candidate in self.steps:
            if candidate.name == name:
                return candidate
        return None

    def get_or_create_step(self, name: str) -> Step:
        """
        Return the step called ``name``, appending a new one if it is missing.

        Args:
            name (str): Step name.

        Returns:
            Step: The existing step, or the one just created.
        """
        existing = self.find_step(name)
        if existing is not None:
            return existing

        created = Step(name=name)
        self.steps.append(created)
        return created

    def output_of(self, step_name: str) -> list[Artifact]:
        """
        Return the artifacts recorded as outputs of a step.

        Args:
            step_name (str): Step name.

        Returns:
            list[Artifact]: The step's outputs, or an empty list when no
            such step exists.
        """
        found = self.find_step(step_name)
        if not found:
            return []
        return found.outputs

    def failed(self) -> None:
        """Set the run status to failed."""
        self.status = utils.Status.failed

    def success(self) -> None:
        """Set the run status to success."""
        self.status = utils.Status.success

    def is_all_done(self) -> bool:
        """Tell whether every step is finished (success or skipped).

        A manifest without steps also yields True.
        """
        for item in self.steps:
            if item.status not in utils.SUCCESS_STATUSES:
                return False
        return True
@@ -0,0 +1,53 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import traceback
5
+ from pathlib import Path
6
+ from contextlib import contextmanager
7
+
8
+ from pycairn import Artifact, Manifest, utils
9
+
10
+
11
class Cairn:
    """Keeps a pipeline run ``Manifest`` in a JSON file and updates it as steps run."""

    def __init__(self, pipeline: str, run_id: str, path: str | Path):
        """
        Open the manifest at ``path``, creating it when it does not exist.

        The parent directory is created if needed. When the file already
        exists the manifest is loaded from it and ``pipeline`` / ``run_id``
        are not applied; otherwise a new manifest is created and written.

        Args:
            pipeline (str): Pipeline name for a newly created manifest.
            run_id (str): Run identifier for a newly created manifest.
            path (str | Path): Location of the manifest JSON file.
        """
        self.path = Path(path)
        self.path.parent.mkdir(parents=True, exist_ok=True)
        if self.path.exists():
            # Read with the same explicit encoding that ``_save`` writes.
            self.manifest = Manifest.model_validate_json(self.path.read_text(encoding="utf-8"))
        else:
            self.manifest = Manifest(run_id=run_id, pipeline=pipeline)
            self._save()

    def _save(self) -> None:
        """Write the manifest to ``self.path`` atomically: tmp -> fsync -> rename."""
        tmp = self.path.with_suffix(self.path.suffix + ".tmp")
        payload = self.manifest.model_dump_json(indent=2)
        # Explicit UTF-8 keeps the file independent of the platform's default
        # encoding; flush + fsync push the bytes to disk before the rename
        # makes them visible under the final name.
        with open(tmp, "w", encoding="utf-8") as fh:
            fh.write(payload)
            fh.flush()
            os.fsync(fh.fileno())
        os.replace(tmp, self.path)

    @contextmanager
    def step(self, name: str, inputs: list[str] | None = None, params: dict | None = None):
        """
        Context manager that records one pipeline step in the manifest.

        The step is marked running and saved before the body executes. If the
        body raises an ``Exception`` the step and the run are marked failed,
        the traceback is stored on the step and the exception is re-raised.
        Otherwise the step is marked successful, and the run is marked
        successful once every step is success or skipped. The end time is
        recorded and the manifest saved in every case.

        Args:
            name (str): Step name; an existing step with this name is reused.
            inputs (list[str] | None): Input paths for the step.
            params (dict | None): Parameters for the step.

        Yields:
            The step object, so the caller can fill outputs/metrics.
        """
        step = self.manifest.get_or_create_step(name)

        step.running(inputs=inputs, params=params)
        self._save()  # persist the "running" state before user code executes

        start = utils.now()
        try:
            yield step  # caller fills outputs/metrics
        except Exception:
            step.failed(traceback.format_exc())
            self.manifest.failed()
            raise
        else:
            step.success()
            # mark whole run done if this step succeeded and nothing is
            # pending, running or failed
            if self.manifest.is_all_done():
                self.manifest.success()
        finally:
            # Single save per outcome: the state set above is written together
            # with the end time.
            step.end(start)
            self._save()

    def output_of(self, step_name: str) -> list[Artifact]:
        """Return the output artifacts recorded for ``step_name`` (empty if unknown)."""
        return self.manifest.output_of(step_name)
@@ -0,0 +1,31 @@
1
+ from __future__ import annotations
2
+
3
+ from enum import StrEnum
4
+ import hashlib
5
+ from pathlib import Path
6
+ from datetime import datetime, timezone
7
+
8
+
9
def now() -> datetime:
    """Return the current time as a timezone-aware UTC ``datetime``."""
    utc = timezone.utc
    return datetime.now(tz=utc)
11
+
12
def now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    stamp = datetime.now(tz=timezone.utc)
    return stamp.isoformat()
14
+
15
+
16
def sha256(path: Path, chunk: int = 1 << 20) -> str:
    """Return the hex SHA-256 digest of the file at ``path``.

    The file is read in pieces of ``chunk`` bytes so it never has to be
    held in memory as a whole.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # An empty read marks end of file.
        while piece := stream.read(chunk):
            digest.update(piece)
    return digest.hexdigest()
22
+
23
+
24
+ class Status(StrEnum):
25
+ pending = "pending"
26
+ running = "running"
27
+ success = "success"
28
+ failed = "failed"
29
+ skipped = "skipped"
30
+
31
+ SUCCESS_STATUSES = (Status.success, Status.skipped)