mareforma 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. mareforma-0.1.0/LICENSE +21 -0
  2. mareforma-0.1.0/PKG-INFO +194 -0
  3. mareforma-0.1.0/README.md +159 -0
  4. mareforma-0.1.0/mareforma/__init__.py +10 -0
  5. mareforma-0.1.0/mareforma/_toml_writer.py +54 -0
  6. mareforma-0.1.0/mareforma/cli.py +1306 -0
  7. mareforma-0.1.0/mareforma/db.py +1094 -0
  8. mareforma-0.1.0/mareforma/distance.py +227 -0
  9. mareforma-0.1.0/mareforma/exporters/__init__.py +3 -0
  10. mareforma-0.1.0/mareforma/exporters/jsonld.py +263 -0
  11. mareforma-0.1.0/mareforma/git.py +214 -0
  12. mareforma-0.1.0/mareforma/health.py +163 -0
  13. mareforma-0.1.0/mareforma/initializer.py +62 -0
  14. mareforma-0.1.0/mareforma/inspector.py +413 -0
  15. mareforma-0.1.0/mareforma/pipeline/__init__.py +25 -0
  16. mareforma-0.1.0/mareforma/pipeline/context.py +445 -0
  17. mareforma-0.1.0/mareforma/pipeline/dag.py +155 -0
  18. mareforma-0.1.0/mareforma/pipeline/discovery.py +116 -0
  19. mareforma-0.1.0/mareforma/pipeline/lock.py +180 -0
  20. mareforma-0.1.0/mareforma/pipeline/runner.py +337 -0
  21. mareforma-0.1.0/mareforma/registry.py +277 -0
  22. mareforma-0.1.0/mareforma/scaffold.py +161 -0
  23. mareforma-0.1.0/mareforma/support.py +291 -0
  24. mareforma-0.1.0/mareforma/transforms.py +195 -0
  25. mareforma-0.1.0/mareforma.egg-info/PKG-INFO +194 -0
  26. mareforma-0.1.0/mareforma.egg-info/SOURCES.txt +44 -0
  27. mareforma-0.1.0/mareforma.egg-info/dependency_links.txt +1 -0
  28. mareforma-0.1.0/mareforma.egg-info/entry_points.txt +2 -0
  29. mareforma-0.1.0/mareforma.egg-info/requires.txt +19 -0
  30. mareforma-0.1.0/mareforma.egg-info/top_level.txt +1 -0
  31. mareforma-0.1.0/pyproject.toml +62 -0
  32. mareforma-0.1.0/setup.cfg +4 -0
  33. mareforma-0.1.0/tests/test_build_cli.py +280 -0
  34. mareforma-0.1.0/tests/test_cli.py +314 -0
  35. mareforma-0.1.0/tests/test_cross_diff.py +180 -0
  36. mareforma-0.1.0/tests/test_dag.py +144 -0
  37. mareforma-0.1.0/tests/test_db.py +725 -0
  38. mareforma-0.1.0/tests/test_discovery.py +229 -0
  39. mareforma-0.1.0/tests/test_distance.py +112 -0
  40. mareforma-0.1.0/tests/test_health.py +220 -0
  41. mareforma-0.1.0/tests/test_inspector.py +305 -0
  42. mareforma-0.1.0/tests/test_jsonld.py +291 -0
  43. mareforma-0.1.0/tests/test_registry.py +146 -0
  44. mareforma-0.1.0/tests/test_runner.py +285 -0
  45. mareforma-0.1.0/tests/test_support.py +209 -0
  46. mareforma-0.1.0/tests/test_transforms.py +230 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Felipe Yáñez
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,194 @@
1
+ Metadata-Version: 2.4
2
+ Name: mareforma
3
+ Version: 0.1.0
4
+ Summary: Automatic provenance for AI-driven research pipelines
5
+ License-Expression: MIT
6
+ Project-URL: Homepage, https://github.com/mareforma/mareforma
7
+ Project-URL: Repository, https://github.com/mareforma/mareforma
8
+ Project-URL: Issues, https://github.com/mareforma/mareforma/issues
9
+ Keywords: provenance,reproducibility,ai-agents,research,data-pipeline
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: click>=8.1
21
+ Requires-Dist: tomli-w>=1.0
22
+ Requires-Dist: rich>=13.0
23
+ Requires-Dist: tomli>=2.0; python_version < "3.11"
24
+ Provides-Extra: git
25
+ Requires-Dist: gitpython>=3.1; extra == "git"
26
+ Provides-Extra: paper
27
+ Requires-Dist: httpx>=0.27; extra == "paper"
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=7.4; extra == "dev"
30
+ Requires-Dist: pytest-cov; extra == "dev"
31
+ Requires-Dist: pytest-httpx>=0.30; extra == "dev"
32
+ Requires-Dist: gitpython>=3.1; extra == "dev"
33
+ Requires-Dist: httpx>=0.27; extra == "dev"
34
+ Dynamic: license-file
35
+
36
+ # mareforma
37
+
38
+ [![Python](https://img.shields.io/pypi/pyversions/mareforma)](https://pypi.org/project/mareforma/)
39
+ [![Tests](https://github.com/mareforma/mareforma/actions/workflows/tests.yml/badge.svg)](https://github.com/mareforma/mareforma/actions/workflows/tests.yml)
40
+ [![PyPI](https://img.shields.io/pypi/v/mareforma)](https://pypi.org/project/mareforma/)
41
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
42
+
43
+ Automatic epistemic provenance for life sciences pipelines. Write transforms, run `build`, and mareforma figures out what kind of result you produced and how well-supported it is — no manual annotation required.
44
+
45
+ ---
46
+
47
+ ## Install
48
+
49
+ ```bash
50
+ pip install mareforma
51
+ ```
52
+
53
+ Requires Python ≥ 3.10.
54
+
55
+ ---
56
+
57
+ ## How it works
58
+
59
+ Write normal Python pipeline functions. mareforma auto-classifies each result.
60
+
61
+ ```python
62
+ from mareforma import transform, BuildContext
63
+ import pandas as pd
64
+
65
+ @transform("morphology.load")
66
+ def load(ctx: BuildContext) -> None:
67
+     files = list(ctx.source_path("morphology").glob("*.swc"))
68
+     ctx.save("skeletons", files, fmt="pickle")
69
+
70
+ @transform("morphology.features", depends_on=["morphology.load"])
71
+ def compute_features(ctx: BuildContext) -> None:
72
+     skeletons = ctx.load("morphology.load.skeletons")
73
+     df = pd.DataFrame([_extract_features(s) for s in skeletons])
74
+     ctx.save("features", df, fmt="csv")
75
+ ```
76
+
77
+ ```bash
78
+ mareforma build
79
+ # ✓ morphology.load done (1.2s)
80
+ # ✓ morphology.features done (3.8s)
81
+
82
+ mareforma trace morphology.features
83
+ # morphology
84
+ # └── morphology.load ──────── RAW ── SINGLE
85
+ # └── morphology.features ANALYSED ── REPLICATED ◇
86
+ ```
87
+
88
+ That's it. No annotations. mareforma reads your artifacts, classifies each transform, and tracks support level automatically.
89
+
90
+ ---
91
+
92
+ ## What gets classified automatically
93
+
94
+ **Transform class** — inferred from artifact content:
95
+
96
+ | Class | Meaning |
97
+ |---|---|
98
+ | `RAW` | Root node — no upstream dependencies |
99
+ | `PROCESSED` | Output values ⊆ input values, row count ≤ input count |
100
+ | `ANALYSED` | New values computed, within input value range |
101
+ | `INFERRED` | Output values outside all input ranges |
102
+
103
+ **Support level** — inferred from run history:
104
+
105
+ | Level | Meaning |
106
+ |---|---|
107
+ | `SINGLE` | One run |
108
+ | `REPLICATED ◇` | Same output hash across ≥2 runs |
109
+ | `CONVERGED ●` | Same step name across ≥2 independent sources |
110
+ | `CONSISTENT ◆` | A run has a DOI-linked claim in `supports` |
111
+ | `ESTABLISHED ●●` | CONVERGED + CONSISTENT |
112
+
113
+ SINGLE through CONVERGED require no annotation. CONSISTENT and ESTABLISHED require one DOI string in a claim.
114
+
115
+ ---
116
+
117
+ ## Quickstart
118
+
119
+ ```bash
120
+ # 1. Init
121
+ cd my_project/
122
+ mareforma init
123
+
124
+ # 2. Register a data source
125
+ mareforma add-source morphology --path data/morphology/raw/ \
126
+ --description "Neuron skeleton reconstructions"
127
+
128
+ # 3. Build — classification is automatic
129
+ mareforma build
130
+
131
+ # 4. Inspect the epistemic graph
132
+ mareforma trace morphology.features
133
+
134
+ # 5. Check overall health
135
+ mareforma status
136
+
137
+ # 6. Optional: link a result to literature (unlocks CONSISTENT)
138
+ mareforma claim add "Neuron size increases with cortical depth" \
139
+ --source morphology --supports 10.64898/2026.03.05.709819
140
+
141
+ # 7. Export provenance graph
142
+ mareforma export
143
+ ```
144
+
145
+ ---
146
+
147
+ ## BuildContext API
148
+
149
+ | Method | Description |
150
+ |---|---|
151
+ | `ctx.source_path("name")` | Raw data path for a registered source |
152
+ | `ctx.save("name", data, fmt=...)` | Persist artifact (`pickle`, `parquet`, `csv`, `numpy`) |
153
+ | `ctx.load("transform.artifact")` | Load upstream artifact |
154
+ | `ctx.claim("text", supports=[DOI])` | Optional: link this run to literature |
155
+ | `ctx.log("message")` | Write to console |
156
+
157
+ ---
158
+
159
+ ## CLI reference
160
+
161
+ | Command | Description |
162
+ |---|---|
163
+ | `mareforma init` | Initialise project |
164
+ | `mareforma add-source <name>` | Register a data source |
165
+ | `mareforma check` | Validate paths and required fields |
166
+ | `mareforma build [source]` | Run the pipeline DAG (`--dry-run`, `--force`) |
167
+ | `mareforma trace <transform>` | Ancestry tree with class and support level (`--json`) |
168
+ | `mareforma status` | Epistemic health dashboard (`--json`) |
169
+ | `mareforma diff <transform>` | Compare the two most recent runs (`--json`) |
170
+ | `mareforma log` | Last build status (`--json`) |
171
+ | `mareforma explain [source]` | Dump project ontology (`--json`) |
172
+ | `mareforma export` | Write `ontology.jsonld` |
173
+ | `mareforma claim add TEXT` | Link a result to literature (`--supports DOI`) |
174
+ | `mareforma claim list` | List claims (`--status`, `--source`, `--json`) |
175
+ | `mareforma claim show ID` | Full claim detail |
176
+ | `mareforma claim update ID` | Update confidence, status, or supports |
177
+
178
+ ---
179
+
180
+ ## Project structure
181
+
182
+ ```
183
+ my_project/
184
+ ├── .mareforma/
185
+ │ └── graph.db ← provenance graph (commit this)
186
+ ├── mareforma.project.toml ← project ontology (commit this)
187
+ ├── claims.toml ← claims backup, auto-generated (commit this)
188
+ ├── ontology.jsonld ← JSON-LD export (commit this)
189
+ └── data/
190
+     └── source_name/
191
+         ├── raw/ ← your data
192
+         └── preprocessing/
193
+             └── build_transform.py
194
+ ```
@@ -0,0 +1,159 @@
1
+ # mareforma
2
+
3
+ [![Python](https://img.shields.io/pypi/pyversions/mareforma)](https://pypi.org/project/mareforma/)
4
+ [![Tests](https://github.com/mareforma/mareforma/actions/workflows/tests.yml/badge.svg)](https://github.com/mareforma/mareforma/actions/workflows/tests.yml)
5
+ [![PyPI](https://img.shields.io/pypi/v/mareforma)](https://pypi.org/project/mareforma/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
7
+
8
+ Automatic epistemic provenance for life sciences pipelines. Write transforms, run `build`, and mareforma figures out what kind of result you produced and how well-supported it is — no manual annotation required.
9
+
10
+ ---
11
+
12
+ ## Install
13
+
14
+ ```bash
15
+ pip install mareforma
16
+ ```
17
+
18
+ Requires Python ≥ 3.10.
19
+
20
+ ---
21
+
22
+ ## How it works
23
+
24
+ Write normal Python pipeline functions. mareforma auto-classifies each result.
25
+
26
+ ```python
27
+ from mareforma import transform, BuildContext
28
+ import pandas as pd
29
+
30
+ @transform("morphology.load")
31
+ def load(ctx: BuildContext) -> None:
32
+     files = list(ctx.source_path("morphology").glob("*.swc"))
33
+     ctx.save("skeletons", files, fmt="pickle")
34
+
35
+ @transform("morphology.features", depends_on=["morphology.load"])
36
+ def compute_features(ctx: BuildContext) -> None:
37
+     skeletons = ctx.load("morphology.load.skeletons")
38
+     df = pd.DataFrame([_extract_features(s) for s in skeletons])
39
+     ctx.save("features", df, fmt="csv")
40
+ ```
41
+
42
+ ```bash
43
+ mareforma build
44
+ # ✓ morphology.load done (1.2s)
45
+ # ✓ morphology.features done (3.8s)
46
+
47
+ mareforma trace morphology.features
48
+ # morphology
49
+ # └── morphology.load ──────── RAW ── SINGLE
50
+ # └── morphology.features ANALYSED ── REPLICATED ◇
51
+ ```
52
+
53
+ That's it. No annotations. mareforma reads your artifacts, classifies each transform, and tracks support level automatically.
54
+
55
+ ---
56
+
57
+ ## What gets classified automatically
58
+
59
+ **Transform class** — inferred from artifact content:
60
+
61
+ | Class | Meaning |
62
+ |---|---|
63
+ | `RAW` | Root node — no upstream dependencies |
64
+ | `PROCESSED` | Output values ⊆ input values, row count ≤ input count |
65
+ | `ANALYSED` | New values computed, within input value range |
66
+ | `INFERRED` | Output values outside all input ranges |
67
+
68
+ **Support level** — inferred from run history:
69
+
70
+ | Level | Meaning |
71
+ |---|---|
72
+ | `SINGLE` | One run |
73
+ | `REPLICATED ◇` | Same output hash across ≥2 runs |
74
+ | `CONVERGED ●` | Same step name across ≥2 independent sources |
75
+ | `CONSISTENT ◆` | A run has a DOI-linked claim in `supports` |
76
+ | `ESTABLISHED ●●` | CONVERGED + CONSISTENT |
77
+
78
+ SINGLE through CONVERGED require no annotation. CONSISTENT and ESTABLISHED require one DOI string in a claim.
79
+
80
+ ---
81
+
82
+ ## Quickstart
83
+
84
+ ```bash
85
+ # 1. Init
86
+ cd my_project/
87
+ mareforma init
88
+
89
+ # 2. Register a data source
90
+ mareforma add-source morphology --path data/morphology/raw/ \
91
+ --description "Neuron skeleton reconstructions"
92
+
93
+ # 3. Build — classification is automatic
94
+ mareforma build
95
+
96
+ # 4. Inspect the epistemic graph
97
+ mareforma trace morphology.features
98
+
99
+ # 5. Check overall health
100
+ mareforma status
101
+
102
+ # 6. Optional: link a result to literature (unlocks CONSISTENT)
103
+ mareforma claim add "Neuron size increases with cortical depth" \
104
+ --source morphology --supports 10.64898/2026.03.05.709819
105
+
106
+ # 7. Export provenance graph
107
+ mareforma export
108
+ ```
109
+
110
+ ---
111
+
112
+ ## BuildContext API
113
+
114
+ | Method | Description |
115
+ |---|---|
116
+ | `ctx.source_path("name")` | Raw data path for a registered source |
117
+ | `ctx.save("name", data, fmt=...)` | Persist artifact (`pickle`, `parquet`, `csv`, `numpy`) |
118
+ | `ctx.load("transform.artifact")` | Load upstream artifact |
119
+ | `ctx.claim("text", supports=[DOI])` | Optional: link this run to literature |
120
+ | `ctx.log("message")` | Write to console |
121
+
122
+ ---
123
+
124
+ ## CLI reference
125
+
126
+ | Command | Description |
127
+ |---|---|
128
+ | `mareforma init` | Initialise project |
129
+ | `mareforma add-source <name>` | Register a data source |
130
+ | `mareforma check` | Validate paths and required fields |
131
+ | `mareforma build [source]` | Run the pipeline DAG (`--dry-run`, `--force`) |
132
+ | `mareforma trace <transform>` | Ancestry tree with class and support level (`--json`) |
133
+ | `mareforma status` | Epistemic health dashboard (`--json`) |
134
+ | `mareforma diff <transform>` | Compare the two most recent runs (`--json`) |
135
+ | `mareforma log` | Last build status (`--json`) |
136
+ | `mareforma explain [source]` | Dump project ontology (`--json`) |
137
+ | `mareforma export` | Write `ontology.jsonld` |
138
+ | `mareforma claim add TEXT` | Link a result to literature (`--supports DOI`) |
139
+ | `mareforma claim list` | List claims (`--status`, `--source`, `--json`) |
140
+ | `mareforma claim show ID` | Full claim detail |
141
+ | `mareforma claim update ID` | Update confidence, status, or supports |
142
+
143
+ ---
144
+
145
+ ## Project structure
146
+
147
+ ```
148
+ my_project/
149
+ ├── .mareforma/
150
+ │ └── graph.db ← provenance graph (commit this)
151
+ ├── mareforma.project.toml ← project ontology (commit this)
152
+ ├── claims.toml ← claims backup, auto-generated (commit this)
153
+ ├── ontology.jsonld ← JSON-LD export (commit this)
154
+ └── data/
155
+     └── source_name/
156
+         ├── raw/ ← your data
157
+         └── preprocessing/
158
+             └── build_transform.py
159
+ ```
@@ -0,0 +1,10 @@
1
+ """Mareforma — The provenance layer for AI-driven research pipelines."""
2
+
3
+ __description__ = "Mareforma — The provenance layer for AI-driven research pipelines."
4
+ __version__ = "0.1.0"
5
+
6
+ from mareforma.transforms import transform, registry
7
+ from mareforma.initializer import initialize
8
+ from mareforma.pipeline.context import BuildContext
9
+
10
+ __all__ = ["transform", "registry", "initialize", "BuildContext", "__version__"]
@@ -0,0 +1,54 @@
1
+ """
2
+ _toml_writer.py — Minimal TOML serialiser for the subset mareforma uses.
3
+
4
+ Supports: str, int, float, bool, list of str/int/float, nested dicts.
5
+ This covers everything needed for mareforma.project.toml without requiring
6
+ the external tomli-w package (though tomli-w is preferred when available).
7
+
8
+ Not a general-purpose TOML writer. Do not use outside mareforma.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from typing import Any
14
+
15
+
16
def dumps(data: dict[str, Any]) -> str:
    """Return *data* rendered as TOML text, always terminated by a newline.

    The mapping is handed to ``_write_table`` with an empty prefix (i.e. as
    the root table); the lines it collects are joined with ``"\\n"``.
    """
    rendered: list[str] = []
    # _write_table fills the list in place rather than returning a value.
    _write_table(rendered, data, prefix="")
    body = "\n".join(rendered)
    return f"{body}\n"
21
+
22
+
23
+ def _write_table(lines: list[str], table: dict[str, Any], prefix: str) -> None:
24
+ # Write scalar/list values first, then nested tables.
25
+ deferred: list[tuple[str, dict]] = []
26
+
27
+ for key, value in table.items():
28
+ full_key = f"{prefix}.{key}" if prefix else key
29
+
30
+ if isinstance(value, dict):
31
+ deferred.append((full_key, value))
32
+ elif isinstance(value, list):
33
+ items = ", ".join(_scalar(v) for v in value)
34
+ lines.append(f"{key} = [{items}]")
35
+ else:
36
+ lines.append(f"{key} = {_scalar(value)}")
37
+
38
+ for full_key, sub in deferred:
39
+ lines.append("")
40
+ lines.append(f"[{full_key}]")
41
+ _write_table(lines, sub, prefix=full_key)
42
+
43
+
44
+ def _scalar(value: Any) -> str:
45
+ if isinstance(value, bool):
46
+ return "true" if value else "false"
47
+ if isinstance(value, int):
48
+ return str(value)
49
+ if isinstance(value, float):
50
+ return repr(value)
51
+ if isinstance(value, str):
52
+ escaped = value.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
53
+ return f'"{escaped}"'
54
+ raise TypeError(f"Unsupported TOML value type: {type(value)}")