vahtian 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+ name: publish
2
+ on:
3
+ push:
4
+ tags: ["v*"]
5
+ permissions:
6
+ id-token: write # OIDC trusted publishing — no API token stored
7
+ jobs:
8
+ build-and-publish:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+ - uses: actions/setup-python@v5
13
+ with:
14
+ python-version: "3.x"
15
+ - run: pipx run build
16
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,5 @@
1
+ dist/
2
+ build/
3
+ *.egg-info/
4
+ __pycache__/
5
+ .pytest_cache/
vahtian-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,40 @@
1
+ Metadata-Version: 2.4
2
+ Name: vahtian
3
+ Version: 0.1.0
4
+ Summary: Reproducible, provenance-first evidence tooling — freeze a corpus, verify it, keep a hash-chained audit trail. Human-first, auditable, local-first.
5
+ Project-URL: Homepage, https://vahtian.com/
6
+ Project-URL: Documentation, https://vahtian.com/agents/
7
+ Project-URL: Source, https://github.com/heidihelena/vahtian
8
+ Author: Heidi Helena Andersén
9
+ License: Apache-2.0
10
+ Keywords: audit trail,evidence synthesis,provenance,reproducibility,research integrity,systematic review
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: Apache Software License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
16
+ Requires-Python: >=3.9
17
+ Description-Content-Type: text/markdown
18
+
19
+ # vahtian (Python)
20
+
21
+ Reproducible, provenance-first evidence tooling. **Freeze** a record set into a
22
+ content-hashed, provenance-stamped, date-locked corpus; **verify** reproducibility;
23
+ keep a **hash-chained audit trail**. Stdlib-only.
24
+
25
+ The same core and on-disk format exist in the R package **`vahtian`**, so a corpus
26
+ frozen in Python verifies in R and vice versa.
27
+
28
+ ```python
29
+ import vahtian
30
+ corpus = vahtian.freeze(records, search_date="2026-06-23")
31
+ corpus.save("frozen-corpus") # frozen-corpus.jsonl + .manifest.json
32
+ assert vahtian.verify(corpus) # tamper-evident
33
+
34
+ L = vahtian.Ledger()
35
+ L.append("human:hha", "rate", {"record_id": "pmid:12345", "value": "supported"})
36
+ L.append("ai:opus/pv1", "advise", {"record_id": "pmid:12345", "value": "supported"})
37
+ assert L.verify() # retro-edits break the chain
38
+ ```
39
+
40
+ `vahti` (Finnish) = sentinel / guard. Human-first. AI-second. Auditable. Apache-2.0.
@@ -0,0 +1,22 @@
1
+ # vahtian (Python)
2
+
3
+ Reproducible, provenance-first evidence tooling. **Freeze** a record set into a
4
+ content-hashed, provenance-stamped, date-locked corpus; **verify** reproducibility;
5
+ keep a **hash-chained audit trail**. Stdlib-only.
6
+
7
+ The same core and on-disk format exist in the R package **`vahtian`**, so a corpus
8
+ frozen in Python verifies in R and vice versa.
9
+
10
+ ```python
11
+ import vahtian
12
+ corpus = vahtian.freeze(records, search_date="2026-06-23")
13
+ corpus.save("frozen-corpus") # frozen-corpus.jsonl + .manifest.json
14
+ assert vahtian.verify(corpus) # tamper-evident
15
+
16
+ L = vahtian.Ledger()
17
+ L.append("human:hha", "rate", {"record_id": "pmid:12345", "value": "supported"})
18
+ L.append("ai:opus/pv1", "advise", {"record_id": "pmid:12345", "value": "supported"})
19
+ assert L.verify() # retro-edits break the chain
20
+ ```
21
+
22
+ `vahti` (Finnish) = sentinel / guard. Human-first. AI-second. Auditable. Apache-2.0.
@@ -0,0 +1,32 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "vahtian"
7
+ version = "0.1.0"
8
+ description = "Reproducible, provenance-first evidence tooling — freeze a corpus, verify it, keep a hash-chained audit trail. Human-first, auditable, local-first."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "Apache-2.0" }
12
+ authors = [{ name = "Heidi Helena Andersén" }]
13
+ keywords = ["research integrity", "systematic review", "provenance", "reproducibility", "evidence synthesis", "audit trail"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Science/Research",
17
+ "License :: OSI Approved :: Apache Software License",
18
+ "Programming Language :: Python :: 3",
19
+ "Topic :: Scientific/Engineering :: Information Analysis",
20
+ ]
21
+ dependencies = [] # stdlib only — re-runnable anywhere
22
+
23
+ [project.urls]
24
+ Homepage = "https://vahtian.com/"
25
+ Documentation = "https://vahtian.com/agents/"
26
+ Source = "https://github.com/heidihelena/vahtian"
27
+
28
+ [project.scripts]
29
+ vahtian = "vahtian.cli:main"
30
+
31
+ [tool.hatch.build.targets.wheel]
32
+ packages = ["src/vahtian"]
@@ -0,0 +1,19 @@
1
+ """vahtian — reproducible, provenance-first evidence tooling.
2
+
3
+ Freeze a record set into a content-hashed, provenance-stamped, date-locked
4
+ corpus; verify reproducibility; keep a hash-chained audit trail. The same core
5
+ and on-disk format exist in the R package `vahtian`, so artifacts interoperate.
6
+
7
+ Human-first. AI-second. Auditable. Apache-2.0.
8
+ """
9
+ from .provenance import Corpus, freeze, load, content_hash, record_id, SPEC_VERSION
10
+ from .audit import Ledger
11
+
12
+ __version__ = "0.1.0"
13
+ __all__ = ["Corpus", "freeze", "load", "verify", "content_hash", "record_id",
14
+ "Ledger", "SPEC_VERSION", "__version__"]
15
+
16
+
17
def verify(corpus: "Corpus") -> bool:
    """Report whether *corpus* still matches its stored content hash.

    Package-level shorthand: the check itself is carried out by the corpus
    object, so ``vahtian.verify(corpus)`` gives the same answer as
    ``corpus.verify()``.
    """
    outcome = corpus.verify()
    return outcome
@@ -0,0 +1,48 @@
1
+ """Hash-chained, tamper-evident audit ledger — who did what, in order.
2
+
3
+ Each entry hashes the previous entry's hash, so any retro-edit or deletion breaks
4
+ the chain and verify() catches it. Shared format with the R package.
5
+ """
6
+ from __future__ import annotations
7
+ import json, hashlib
8
+ from datetime import datetime, timezone
9
+
10
+ GENESIS = "sha256:" + "0" * 64
11
+
12
+
13
def _canonical(obj) -> str:
    """Serialise *obj* to the deterministic JSON text that gets hashed.

    Keys are sorted, separators carry no padding, and non-ASCII characters
    are written as-is instead of ``\\u`` escapes.
    """
    options = {
        "sort_keys": True,
        "separators": (",", ":"),
        "ensure_ascii": False,
    }
    return json.dumps(obj, **options)
15
+
16
+
17
def _entry_hash(prev_hash: str, body: dict) -> str:
    """Hash one ledger entry: SHA-256 over ``prev_hash`` followed by the body.

    Both parts are UTF-8 encoded; the body is serialised with ``_canonical``.
    The result is the hex digest prefixed with ``"sha256:"``.
    """
    material = prev_hash.encode("utf-8") + _canonical(body).encode("utf-8")
    digest = hashlib.sha256(material).hexdigest()
    return f"sha256:{digest}"
22
+
23
+
24
class Ledger:
    """Append-only, hash-chained record of who did what, in order.

    Each entry stores the previous entry's hash (``prev_hash``) and its own
    ``entry_hash``, computed over ``prev_hash`` plus the canonical JSON of the
    entry body, so editing or removing an earlier entry breaks the chain.
    """

    # Fields that form the hashed body of an entry (everything except
    # ``entry_hash`` itself).
    _BODY_FIELDS = ("seq", "ts", "actor", "action", "payload", "prev_hash")

    def __init__(self):
        # Entries in append order; each one is a plain dict built by append().
        self.entries: list[dict] = []

    def append(self, actor: str, action: str, payload: dict | None = None, *, ts: str | None = None) -> dict:
        """Add an entry to the end of the chain and return it.

        Args:
            actor: Who performed the action.
            action: What was done.
            payload: Extra data stored with the entry; a falsy value is
                stored as ``{}``.
            ts: Timestamp string to record; when falsy, the current UTC time
                in ISO format is used.

        Returns:
            The stored entry dict, including ``seq``, ``prev_hash`` and
            ``entry_hash``.
        """
        prev = self.entries[-1]["entry_hash"] if self.entries else GENESIS
        body = {
            "seq": len(self.entries),
            "ts": ts or datetime.now(timezone.utc).isoformat(),
            "actor": actor,
            "action": action,
            "payload": payload or {},
            "prev_hash": prev,
        }
        entry = {**body, "entry_hash": _entry_hash(prev, body)}
        self.entries.append(entry)
        return entry

    def verify(self) -> bool:
        """Return True iff every entry links to its predecessor and re-hashes.

        An entry that is not a mapping, lacks a required field, or holds a
        value that cannot be serialised counts as a broken chain (``False``)
        instead of raising. Entries removed from the end of the ledger
        cannot be detected from the chain alone.
        """
        prev = GENESIS
        for e in self.entries:
            try:
                body = {k: e[k] for k in self._BODY_FIELDS}
                intact = (
                    e["prev_hash"] == prev
                    and _entry_hash(prev, body) == e["entry_hash"]
                )
            except (KeyError, TypeError, ValueError):
                # Malformed entry: the chain cannot be confirmed.
                return False
            if not intact:
                return False
            prev = e["entry_hash"]
        return True

    def save(self, path: str) -> None:
        """Write the ledger to *path* as UTF-8 text, one canonical-JSON entry per line."""
        with open(path, "w", encoding="utf-8") as f:
            for e in self.entries:
                f.write(_canonical(e) + "\n")
@@ -0,0 +1,16 @@
1
+ """`vahtian` CLI — verify a frozen corpus (audit-ledger verification is not wired in yet)."""
2
+ import sys
3
+ from . import load, __version__
4
+ from .audit import Ledger
5
+
6
def main(argv=None):
    """Run the ``vahtian`` command line and return its exit status.

    Args:
        argv: Arguments without the program name; ``sys.argv[1:]`` is used
            when this is ``None``.

    Returns:
        0 when help was printed or the corpus verified, 1 when the corpus
        failed verification, 2 for a usage error or a corpus that could not
        be loaded.
    """
    argv = argv if argv is not None else sys.argv[1:]
    if not argv or argv[0] in ("-h", "--help"):
        print("vahtian", __version__, "\n vahtian verify <corpus-prefix> # check a frozen corpus is untampered")
        return 0

    command = argv[0]
    if command != "verify":
        print("unknown command:", command, file=sys.stderr)
        return 2
    if len(argv) < 2:
        # `verify` without a prefix is a usage error, not an unknown command.
        print("usage: vahtian verify <corpus-prefix>", file=sys.stderr)
        return 2

    prefix = argv[1]
    try:
        c = load(prefix)
    except (OSError, ValueError, KeyError) as exc:
        # Missing files, invalid JSON (a ValueError subclass) or a manifest
        # without the required keys: report it instead of a traceback.
        print(f"cannot load corpus {prefix!r}: {exc}", file=sys.stderr)
        return 2

    ok = c.verify()
    print(("OK " if ok else "FAIL ") + f"{len(c.records)} records · {c.search_date} · {c.content_hash}")
    return 0 if ok else 1
@@ -0,0 +1,100 @@
1
+ """Frozen, provenance-stamped evidence corpus — the Vahtian reproducibility core.
2
+
3
+ A corpus is a deduped set of records, each carrying per-source provenance and a
4
+ locked search date, summarised by a content hash. Re-running the search and
5
+ re-freezing must reproduce the same hash; verify() proves it (tamper-evident).
6
+
7
+ The on-disk format (frozen-corpus.jsonl + .manifest.json) is shared byte-for-byte
8
+ with the R package, so a corpus frozen in Python verifies in R and vice versa.
9
+ """
10
+ from __future__ import annotations
11
+ import json, hashlib
12
+ from dataclasses import dataclass, field, asdict
13
+ from datetime import date, datetime, timezone
14
+
15
+ SPEC_VERSION = "vahtian-corpus/1"
16
+
17
+
18
def record_id(rec: dict) -> str:
    """Derive a record's stable identifier, preferring PMID, then DOI, then title.

    * ``pmid`` present and truthy -> ``"pmid:"`` + the stripped value.
    * otherwise ``doi`` present and truthy -> ``"doi:"`` + the stripped,
      lower-cased value.
    * otherwise -> ``"title:"`` + the first 16 hex characters of the SHA-256
      of the stripped, lower-cased title (a missing title counts as empty).

    Matches the R implementation.
    """
    pmid = rec.get("pmid")
    if pmid:
        return "pmid:" + str(pmid).strip()

    doi = rec.get("doi")
    if doi:
        return "doi:" + str(doi).strip().lower()

    normalised_title = (rec.get("title") or "").strip().lower()
    digest = hashlib.sha256(normalised_title.encode("utf-8")).hexdigest()
    return f"title:{digest[:16]}"
26
+
27
+
28
def _canonical(obj) -> str:
    """Return the deterministic JSON text for *obj*.

    Sorted keys, no whitespace between tokens, and non-ASCII characters kept
    as-is instead of being escaped.
    """
    encoder = json.JSONEncoder(
        sort_keys=True,
        separators=(",", ":"),
        ensure_ascii=False,
    )
    return encoder.encode(obj)
31
+
32
+
33
def content_hash(records: list[dict]) -> str:
    """Hash a record set: SHA-256 over the records in ``record_id`` order.

    Each record contributes its canonical JSON followed by a newline, UTF-8
    encoded. Sorting by ``record_id`` makes the digest independent of input
    order; records that share an id keep their relative input order, since
    the sort is stable. The result is the hex digest prefixed with
    ``"sha256:"``.
    """
    digest = hashlib.sha256()
    for rec in sorted(records, key=record_id):
        line = _canonical(rec) + "\n"
        digest.update(line.encode("utf-8"))
    return f"sha256:{digest.hexdigest()}"
41
+
42
+
43
@dataclass
class Corpus:
    """A set of records together with the metadata that pins it down."""

    # Record dicts that make up the corpus.
    records: list[dict]
    # Search date locked into the corpus, stored as a string.
    search_date: str
    # Hash of ``records`` as recorded when the corpus was built or loaded.
    content_hash: str
    # Creation stamp; empty when not supplied.
    created: str = field(default="")
    # Format identifier written into the manifest.
    spec: str = SPEC_VERSION

    def verify(self) -> bool:
        """Recompute the hash of ``records`` and compare it with the stored one.

        Returns True only when they are equal, i.e. the records are unchanged.
        """
        recomputed = content_hash(self.records)
        return recomputed == self.content_hash

    def manifest(self) -> dict:
        """Build the manifest dict: spec, record count, search date, hash, created."""
        summary = {}
        summary["spec"] = self.spec
        summary["n_records"] = len(self.records)
        summary["search_date"] = self.search_date
        summary["content_hash"] = self.content_hash
        summary["created"] = self.created
        return summary

    def save(self, path_prefix: str) -> None:
        """Write ``<path_prefix>.jsonl`` and ``<path_prefix>.manifest.json``.

        The JSONL file holds one canonical-JSON record per line, ordered by
        ``record_id``; the manifest is written as JSON indented by two spaces.
        """
        with open(path_prefix + ".jsonl", "w", encoding="utf-8") as jsonl:
            jsonl.writelines(
                _canonical(rec) + "\n"
                for rec in sorted(self.records, key=record_id)
            )
        with open(path_prefix + ".manifest.json", "w", encoding="utf-8") as out:
            manifest_text = json.dumps(self.manifest(), indent=2)
            out.write(manifest_text)
66
+
67
+
68
def freeze(records: list[dict], search_date: str | None = None, *, now: str | None = None) -> Corpus:
    """Dedupe by record_id, lock the search date, compute the content hash.

    Args:
        records: Raw record dicts. Each is shallow-copied before use, so the
            input dicts themselves are left untouched.
        search_date: Date string to lock into the corpus; when falsy, today's
            local date in ISO format is used.
        now: Value for the corpus ``created`` stamp; when falsy, the current
            UTC time in ISO format is used.

    Returns:
        A ``Corpus`` holding one record per ``record_id``, each carrying a
        ``record_id`` key, with list-valued ``provenance`` sorted canonically.
    """
    by_id: dict[str, dict] = {}
    for rec in records:
        rid = record_id(rec)
        merged = dict(rec)
        merged["record_id"] = rid
        earlier = by_id.get(rid)
        if earlier is not None:
            # Same record seen from another source: accumulate provenance.
            # `or []` treats a missing or explicit-None provenance as empty
            # instead of failing on `None + list`.
            prov = (earlier.get("provenance") or []) + (merged.get("provenance") or [])
            # NOTE(review): for any other field the later record wins, so
            # duplicates that disagree make the hash depend on arrival order.
            merged = {**earlier, **merged, "provenance": prov}
        by_id[rid] = merged

    frozen = list(by_id.values())
    # Canonicalise provenance order so the hash is independent of the order
    # in which sources arrived.
    for r in frozen:
        prov = r.get("provenance")
        if isinstance(prov, list):
            r["provenance"] = sorted(prov, key=_canonical)

    sd = search_date or date.today().isoformat()
    return Corpus(
        records=frozen,
        search_date=sd,
        content_hash=content_hash(frozen),
        created=now or datetime.now(timezone.utc).isoformat(),
    )
87
+
88
+
89
def load(path_prefix: str) -> Corpus:
    """Read a corpus back from ``<path_prefix>.jsonl`` and its manifest.

    Blank lines in the JSONL file are skipped. ``search_date`` and
    ``content_hash`` must be present in the manifest; ``created`` falls back
    to ``""`` and ``spec`` to ``SPEC_VERSION`` when absent. The stored hash is
    taken from the manifest as-is, not recomputed.
    """
    with open(path_prefix + ".jsonl", encoding="utf-8") as jsonl:
        stripped = (raw.strip() for raw in jsonl)
        parsed = [json.loads(text) for text in stripped if text]

    with open(path_prefix + ".manifest.json", encoding="utf-8") as fh:
        manifest = json.load(fh)

    return Corpus(
        records=parsed,
        search_date=manifest["search_date"],
        content_hash=manifest["content_hash"],
        created=manifest.get("created", ""),
        spec=manifest.get("spec", SPEC_VERSION),
    )
@@ -0,0 +1,44 @@
1
+ import vahtian
2
+
3
+ # The cross-language parity gate: the R package `vahtian` asserts this SAME literal.
4
+ # If either canonical serialiser drifts, one of the two CIs goes red.
5
+ GOLDEN = "sha256:50ca741a72e7058870d0ca7594b0c37faa7183472fcb1752b7a6c5abe23cafd1"
6
+
7
def test_golden_cross_language_hash():
    """Freezing the fixed three-record fixture must yield the GOLDEN hash."""
    retrieved = "2026-06-23"
    pd_l1_title = "PD-L1 AI scoring agrees with pathologists"
    fixture = [
        {
            "pmid": "12345",
            "title": pd_l1_title,
            "provenance": [{"source": "pubmed", "retrieved": retrieved}],
        },
        {
            "doi": "10.1/x",
            "title": "A second study",
            "provenance": [{"source": "openalex", "retrieved": retrieved}],
        },
        {
            "pmid": "12345",
            "title": pd_l1_title,
            "provenance": [{"source": "europepmc", "retrieved": retrieved}],
        },
    ]
    frozen = vahtian.freeze(fixture, search_date=retrieved)
    # The two PMID 12345 rows collapse into one record.
    assert len(frozen.records) == 2
    assert frozen.content_hash == GOLDEN
19
+
20
def test_dedupe_and_reproducible():
    """Duplicates merge, and the hash ignores input order and the ``now`` stamp."""
    sources = [
        {"pmid": "1", "title": "A", "provenance": [{"source": "pubmed"}]},
        {"pmid": "1", "title": "A", "provenance": [{"source": "europepmc"}]},
        {"doi": "10.1/x", "title": "B", "provenance": [{"source": "openalex"}]},
    ]
    forward = vahtian.freeze(sources, search_date="2026-06-23", now="t")
    backward = vahtian.freeze(sources[::-1], search_date="2026-06-23", now="t2")
    assert len(forward.records) == 2
    # order-independent
    assert forward.content_hash == backward.content_hash
    assert forward.verify()
31
+
32
def test_tamper_detection():
    """Editing a record after freezing must make verify() fail."""
    corpus = vahtian.freeze([{"pmid": "1", "title": "A"}], search_date="2026-06-23")
    assert corpus.verify()
    only_record = corpus.records[0]
    only_record["title"] = "tampered"
    assert not corpus.verify()
37
+
38
def test_audit_chain():
    """A two-entry ledger verifies, and editing an earlier payload breaks it."""
    ledger = vahtian.Ledger()
    steps = (("human", "rate", "t1"), ("ai", "advise", "t2"))
    for actor, action, stamp in steps:
        ledger.append(actor, action, {"v": "supported"}, ts=stamp)
    assert ledger.verify()
    first_entry = ledger.entries[0]
    first_entry["payload"]["v"] = "x"
    assert not ledger.verify()