vahtian 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vahtian-0.1.0/.github/workflows/publish.yml +16 -0
- vahtian-0.1.0/.gitignore +5 -0
- vahtian-0.1.0/PKG-INFO +40 -0
- vahtian-0.1.0/README.md +22 -0
- vahtian-0.1.0/pyproject.toml +32 -0
- vahtian-0.1.0/src/vahtian/__init__.py +19 -0
- vahtian-0.1.0/src/vahtian/audit.py +48 -0
- vahtian-0.1.0/src/vahtian/cli.py +16 -0
- vahtian-0.1.0/src/vahtian/provenance.py +100 -0
- vahtian-0.1.0/tests/test_core.py +44 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
name: publish
|
|
2
|
+
on:
|
|
3
|
+
push:
|
|
4
|
+
tags: ["v*"]
|
|
5
|
+
permissions:
|
|
6
|
+
id-token: write # OIDC trusted publishing — no API token stored
|
|
7
|
+
jobs:
|
|
8
|
+
build-and-publish:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
steps:
|
|
11
|
+
- uses: actions/checkout@v4
|
|
12
|
+
- uses: actions/setup-python@v5
|
|
13
|
+
with:
|
|
14
|
+
python-version: "3.x"
|
|
15
|
+
- run: pipx run build
|
|
16
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
vahtian-0.1.0/.gitignore
ADDED
vahtian-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vahtian
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Reproducible, provenance-first evidence tooling — freeze a corpus, verify it, keep a hash-chained audit trail. Human-first, auditable, local-first.
|
|
5
|
+
Project-URL: Homepage, https://vahtian.com/
|
|
6
|
+
Project-URL: Documentation, https://vahtian.com/agents/
|
|
7
|
+
Project-URL: Source, https://github.com/heidihelena/vahtian
|
|
8
|
+
Author: Heidi Helena Andersén
|
|
9
|
+
License: Apache-2.0
|
|
10
|
+
Keywords: audit trail,evidence synthesis,provenance,reproducibility,research integrity,systematic review
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
16
|
+
Requires-Python: >=3.9
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# vahtian (Python)
|
|
20
|
+
|
|
21
|
+
Reproducible, provenance-first evidence tooling. **Freeze** a record set into a
|
|
22
|
+
content-hashed, provenance-stamped, date-locked corpus; **verify** reproducibility;
|
|
23
|
+
keep a **hash-chained audit trail**. Stdlib-only.
|
|
24
|
+
|
|
25
|
+
The same core and on-disk format exist in the R package **`vahtian`**, so a corpus
|
|
26
|
+
frozen in Python verifies in R and vice versa.
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import vahtian
|
|
30
|
+
corpus = vahtian.freeze(records, search_date="2026-06-23")
|
|
31
|
+
corpus.save("frozen-corpus") # frozen-corpus.jsonl + .manifest.json
|
|
32
|
+
assert vahtian.verify(corpus) # tamper-evident
|
|
33
|
+
|
|
34
|
+
L = vahtian.Ledger()
|
|
35
|
+
L.append("human:hha", "rate", {"record_id": "pmid:12345", "value": "supported"})
|
|
36
|
+
L.append("ai:opus/pv1", "advise", {"record_id": "pmid:12345", "value": "supported"})
|
|
37
|
+
assert L.verify() # retro-edits break the chain
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
`vahti` (Finnish) = sentinel / guard. Human-first. AI-second. Auditable. Apache-2.0.
|
vahtian-0.1.0/README.md
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# vahtian (Python)
|
|
2
|
+
|
|
3
|
+
Reproducible, provenance-first evidence tooling. **Freeze** a record set into a
|
|
4
|
+
content-hashed, provenance-stamped, date-locked corpus; **verify** reproducibility;
|
|
5
|
+
keep a **hash-chained audit trail**. Stdlib-only.
|
|
6
|
+
|
|
7
|
+
The same core and on-disk format exist in the R package **`vahtian`**, so a corpus
|
|
8
|
+
frozen in Python verifies in R and vice versa.
|
|
9
|
+
|
|
10
|
+
```python
|
|
11
|
+
import vahtian
|
|
12
|
+
corpus = vahtian.freeze(records, search_date="2026-06-23")
|
|
13
|
+
corpus.save("frozen-corpus") # frozen-corpus.jsonl + .manifest.json
|
|
14
|
+
assert vahtian.verify(corpus) # tamper-evident
|
|
15
|
+
|
|
16
|
+
L = vahtian.Ledger()
|
|
17
|
+
L.append("human:hha", "rate", {"record_id": "pmid:12345", "value": "supported"})
|
|
18
|
+
L.append("ai:opus/pv1", "advise", {"record_id": "pmid:12345", "value": "supported"})
|
|
19
|
+
assert L.verify() # retro-edits break the chain
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
`vahti` (Finnish) = sentinel / guard. Human-first. AI-second. Auditable. Apache-2.0.
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vahtian"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Reproducible, provenance-first evidence tooling — freeze a corpus, verify it, keep a hash-chained audit trail. Human-first, auditable, local-first."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "Apache-2.0" }
|
|
12
|
+
authors = [{ name = "Heidi Helena Andersén" }]
|
|
13
|
+
keywords = ["research integrity", "systematic review", "provenance", "reproducibility", "evidence synthesis", "audit trail"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Science/Research",
|
|
17
|
+
"License :: OSI Approved :: Apache Software License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
20
|
+
]
|
|
21
|
+
dependencies = [] # stdlib only — re-runnable anywhere
|
|
22
|
+
|
|
23
|
+
[project.urls]
|
|
24
|
+
Homepage = "https://vahtian.com/"
|
|
25
|
+
Documentation = "https://vahtian.com/agents/"
|
|
26
|
+
Source = "https://github.com/heidihelena/vahtian"
|
|
27
|
+
|
|
28
|
+
[project.scripts]
|
|
29
|
+
vahtian = "vahtian.cli:main"
|
|
30
|
+
|
|
31
|
+
[tool.hatch.build.targets.wheel]
|
|
32
|
+
packages = ["src/vahtian"]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""vahtian — reproducible, provenance-first evidence tooling.
|
|
2
|
+
|
|
3
|
+
Freeze a record set into a content-hashed, provenance-stamped, date-locked
|
|
4
|
+
corpus; verify reproducibility; keep a hash-chained audit trail. The same core
|
|
5
|
+
and on-disk format exist in the R package `vahtian`, so artifacts interoperate.
|
|
6
|
+
|
|
7
|
+
Human-first. AI-second. Auditable. Apache-2.0.
|
|
8
|
+
"""
|
|
9
|
+
from .provenance import Corpus, freeze, load, content_hash, record_id, SPEC_VERSION
|
|
10
|
+
from .audit import Ledger
|
|
11
|
+
|
|
12
|
+
__version__ = "0.1.0"
|
|
13
|
+
__all__ = ["Corpus", "freeze", "load", "verify", "content_hash", "record_id",
|
|
14
|
+
"Ledger", "SPEC_VERSION", "__version__"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def verify(corpus: "Corpus") -> bool:
    """Return the result of ``corpus.verify()``.

    Module-level shorthand so callers can write ``vahtian.verify(corpus)``
    instead of calling the method on the corpus object themselves.
    """
    outcome = corpus.verify()
    return outcome
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Hash-chained, tamper-evident audit ledger — who did what, in order.
|
|
2
|
+
|
|
3
|
+
Each entry hashes the previous entry's hash, so any retro-edit or deletion breaks
|
|
4
|
+
the chain and verify() catches it. Shared format with the R package.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
import json, hashlib
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
|
|
10
|
+
GENESIS = "sha256:" + "0" * 64
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _canonical(obj) -> str:
|
|
14
|
+
return json.dumps(obj, sort_keys=True, separators=(",", ":"), ensure_ascii=False)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _entry_hash(prev_hash: str, body: dict) -> str:
    """Hash one ledger entry: sha256 of ``prev_hash`` followed by the canonical body.

    Both parts are UTF-8 encoded and fed to the digest back to back; the
    result is the hex digest prefixed with ``"sha256:"``.
    """
    preimage = prev_hash.encode("utf-8") + _canonical(body).encode("utf-8")
    return "sha256:" + hashlib.sha256(preimage).hexdigest()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Ledger:
    """In-memory, append-only audit ledger whose entries form a hash chain.

    Every entry stores the hash of the entry before it (``prev_hash``) and its
    own ``entry_hash``, computed over ``prev_hash`` plus the canonical JSON of
    the entry body. ``verify()`` replays the chain from ``GENESIS``.
    """

    # The fields that make up an entry body, i.e. everything covered by
    # ``entry_hash``. A valid entry holds exactly these plus ``entry_hash``.
    _BODY_KEYS = ("seq", "ts", "actor", "action", "payload", "prev_hash")

    def __init__(self):
        self.entries: list[dict] = []

    def append(self, actor: str, action: str, payload: dict | None = None, *, ts: str | None = None) -> dict:
        """Add an entry to the end of the chain and return it.

        Args:
            actor: Who performed the action.
            action: What was done.
            payload: Extra data for the entry; a falsy value is stored as ``{}``.
            ts: Timestamp string to record; when falsy, the current UTC time
                in ISO format is used.

        Returns:
            The stored entry dict (the same object kept in ``self.entries``).
        """
        prev = self.entries[-1]["entry_hash"] if self.entries else GENESIS
        body = {"seq": len(self.entries), "ts": ts or datetime.now(timezone.utc).isoformat(),
                "actor": actor, "action": action, "payload": payload or {}, "prev_hash": prev}
        entry = {**body, "entry_hash": _entry_hash(prev, body)}
        self.entries.append(entry)
        return entry

    def verify(self) -> bool:
        """Return True iff every entry links to its predecessor and hashes correctly.

        A structurally malformed entry (not a dict, a missing field, or a field
        outside the hashed body) makes the ledger invalid rather than raising:
        a missing field is a retro-edit, and an extra field would otherwise sit
        in the ledger without being covered by any hash.
        """
        expected_keys = set(self._BODY_KEYS) | {"entry_hash"}
        prev = GENESIS
        for e in self.entries:
            if not isinstance(e, dict) or set(e) != expected_keys:
                return False
            body = {k: e[k] for k in self._BODY_KEYS}
            if e["prev_hash"] != prev or _entry_hash(prev, body) != e["entry_hash"]:
                return False
            prev = e["entry_hash"]
        return True

    def save(self, path: str) -> None:
        """Write the ledger to *path* as one canonical JSON object per line.

        ``newline="\\n"`` disables platform newline translation so the file
        bytes are the same on every operating system.
        """
        with open(path, "w", encoding="utf-8", newline="\n") as f:
            for e in self.entries:
                f.write(_canonical(e) + "\n")
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""`vahtian` CLI — verify a frozen corpus (ledger verification is not implemented here yet)."""
|
|
2
|
+
import sys
|
|
3
|
+
from . import load, __version__
|
|
4
|
+
from .audit import Ledger
|
|
5
|
+
|
|
6
|
+
def main(argv=None):
    """Run the ``vahtian`` command line and return a process exit status.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]`` when ``None``.

    Returns:
        0 for help output or a corpus that verifies, 1 for a corpus that fails
        verification or cannot be loaded, 2 for a usage error (unknown command
        or a missing ``<corpus-prefix>``).
    """
    argv = argv if argv is not None else sys.argv[1:]
    if not argv or argv[0] in ("-h", "--help"):
        print("vahtian", __version__, "\n vahtian verify <corpus-prefix> # check a frozen corpus is untampered")
        return 0
    if argv[0] == "verify":
        if len(argv) < 2:
            # Previously this fell through and was reported as an unknown command.
            print("usage: vahtian verify <corpus-prefix>", file=sys.stderr)
            return 2
        try:
            c = load(argv[1])
        except (OSError, ValueError, KeyError) as exc:
            # Missing/unreadable files, invalid JSON (JSONDecodeError is a
            # ValueError) or a manifest lacking a required key: report it as a
            # failed verification instead of a traceback.
            print(f"FAIL cannot load corpus {argv[1]!r}: {exc}", file=sys.stderr)
            return 1
        ok = c.verify()
        print(("OK " if ok else "FAIL ") + f"{len(c.records)} records · {c.search_date} · {c.content_hash}")
        return 0 if ok else 1
    print("unknown command:", argv[0], file=sys.stderr)
    return 2
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Frozen, provenance-stamped evidence corpus — the Vahtian reproducibility core.
|
|
2
|
+
|
|
3
|
+
A corpus is a deduped set of records, each carrying per-source provenance and a
|
|
4
|
+
locked search date, summarised by a content hash. Re-running the search and
|
|
5
|
+
re-freezing must reproduce the same hash; verify() proves it (tamper-evident).
|
|
6
|
+
|
|
7
|
+
The on-disk format (frozen-corpus.jsonl + .manifest.json) is shared byte-for-byte
|
|
8
|
+
with the R package, so a corpus frozen in Python verifies in R and vice versa.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
import json, hashlib
|
|
12
|
+
from dataclasses import dataclass, field, asdict
|
|
13
|
+
from datetime import date, datetime, timezone
|
|
14
|
+
|
|
15
|
+
SPEC_VERSION = "vahtian-corpus/1"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def record_id(rec: dict) -> str:
    """Derive a record's stable identifier, preferring PMID, then DOI, then title.

    A truthy ``pmid`` yields ``"pmid:<stripped value>"``; otherwise a truthy
    ``doi`` yields ``"doi:<stripped, lower-cased value>"``; otherwise the
    stripped, lower-cased title (empty when absent) is hashed with sha256 and
    the first 16 hex characters are used as ``"title:<prefix>"``.
    Matches the R implementation.
    """
    pmid = rec.get("pmid")
    if pmid:
        return "pmid:" + str(pmid).strip()

    doi = rec.get("doi")
    if doi:
        return f"doi:{str(doi).strip().lower()}"

    raw_title = rec.get("title") or ""
    normalised = raw_title.strip().lower()
    digest = hashlib.sha256(normalised.encode("utf-8")).hexdigest()
    return "title:" + digest[:16]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _canonical(obj) -> str:
|
|
29
|
+
# Deterministic serialisation: sorted keys, no whitespace, UTF-8.
|
|
30
|
+
return json.dumps(obj, sort_keys=True, separators=(",", ":"), ensure_ascii=False)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def content_hash(records: list[dict]) -> str:
    """Hash a record set independently of its input order.

    Records are sorted by ``record_id``; each one contributes its canonical
    JSON followed by a newline to a single sha256 digest. The result is the
    hex digest prefixed with ``"sha256:"``.
    """
    digest = hashlib.sha256()
    for rec in sorted(records, key=record_id):
        line = _canonical(rec) + "\n"
        digest.update(line.encode("utf-8"))
    return f"sha256:{digest.hexdigest()}"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class Corpus:
    """A frozen record set together with the metadata needed to re-verify it."""

    records: list[dict]        # the frozen records
    search_date: str           # locked search date
    content_hash: str          # "sha256:…" digest stored for the records
    created: str = field(default="")   # creation timestamp string ("" when unknown)
    spec: str = SPEC_VERSION   # on-disk format identifier

    def verify(self) -> bool:
        """True iff the stored content_hash still matches the records (untampered)."""
        return content_hash(self.records) == self.content_hash

    def manifest(self) -> dict:
        """Return the manifest dict: spec, record count, search date, hash, created."""
        return {"spec": self.spec, "n_records": len(self.records),
                "search_date": self.search_date, "content_hash": self.content_hash,
                "created": self.created}

    def save(self, path_prefix: str) -> None:
        """Write ``<path_prefix>.jsonl`` and ``<path_prefix>.manifest.json``.

        The JSONL file holds one canonical JSON record per line, sorted by
        ``record_id``; the manifest is pretty-printed JSON. Both files are
        opened with ``newline="\\n"`` so text mode never rewrites line endings
        and the bytes on disk are identical on every platform.
        """
        with open(path_prefix + ".jsonl", "w", encoding="utf-8", newline="\n") as f:
            for r in sorted(self.records, key=record_id):
                f.write(_canonical(r) + "\n")
        with open(path_prefix + ".manifest.json", "w", encoding="utf-8", newline="\n") as f:
            f.write(json.dumps(self.manifest(), indent=2))
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def freeze(records: list[dict], search_date: str | None = None, *, now: str | None = None) -> Corpus:
    """Build a ``Corpus`` from raw records.

    Records sharing a ``record_id`` are collapsed into one, their provenance
    lists concatenated; each kept record gains a ``"record_id"`` field. List
    provenance is then sorted by canonical JSON so the hash does not depend on
    the order in which sources arrived.

    Args:
        records: Input records; the input dicts themselves are copied, not modified.
        search_date: Date string to lock in; today's ISO date when falsy.
        now: Creation timestamp string; the current UTC ISO time when falsy.

    Returns:
        A ``Corpus`` carrying the deduplicated records and their content hash.
    """
    deduped: dict[str, dict] = {}
    for raw in records:
        key = record_id(raw)
        current = dict(raw, record_id=key)
        earlier = deduped.get(key)
        if earlier is not None:
            # Same record seen from another source: pool the provenance.
            # NOTE: for every other field the later duplicate wins, so fields
            # that differ between duplicates follow arrival order.
            pooled = earlier.get("provenance", []) + current.get("provenance", [])
            current = {**earlier, **current, "provenance": pooled}
        deduped[key] = current

    frozen = list(deduped.values())
    for item in frozen:
        sources = item.get("provenance")
        if isinstance(sources, list):
            item["provenance"] = sorted(sources, key=_canonical)

    if not search_date:
        search_date = date.today().isoformat()
    digest = content_hash(frozen)
    stamp = now or datetime.now(timezone.utc).isoformat()
    return Corpus(records=frozen, search_date=search_date,
                  content_hash=digest, created=stamp)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def load(path_prefix: str) -> Corpus:
    """Read ``<path_prefix>.jsonl`` and ``<path_prefix>.manifest.json`` into a ``Corpus``.

    Blank lines in the JSONL file are skipped. ``search_date`` and
    ``content_hash`` must be present in the manifest; ``created`` falls back
    to ``""`` and ``spec`` to ``SPEC_VERSION`` when absent.
    """
    with open(path_prefix + ".jsonl", encoding="utf-8") as jsonl:
        stripped = (raw.strip() for raw in jsonl)
        records = [json.loads(text) for text in stripped if text]

    with open(path_prefix + ".manifest.json", encoding="utf-8") as manifest_file:
        meta = json.load(manifest_file)

    return Corpus(
        records=records,
        search_date=meta["search_date"],
        content_hash=meta["content_hash"],
        created=meta.get("created", ""),
        spec=meta.get("spec", SPEC_VERSION),
    )
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import vahtian
|
|
2
|
+
|
|
3
|
+
# The cross-language parity gate: the R package `vahtian` asserts this SAME literal.
|
|
4
|
+
# If either canonical serialiser drifts, one of the two CIs goes red.
|
|
5
|
+
GOLDEN = "sha256:50ca741a72e7058870d0ca7594b0c37faa7183472fcb1752b7a6c5abe23cafd1"
|
|
6
|
+
|
|
7
|
+
def test_golden_cross_language_hash():
    """Freezing the fixed three-record fixture must yield the GOLDEN hash."""
    pdl1_title = "PD-L1 AI scoring agrees with pathologists"

    def seen_in(source):
        return [{"source": source, "retrieved": "2026-06-23"}]

    recs = [
        {"pmid": "12345", "title": pdl1_title, "provenance": seen_in("pubmed")},
        {"doi": "10.1/x", "title": "A second study", "provenance": seen_in("openalex")},
        {"pmid": "12345", "title": pdl1_title, "provenance": seen_in("europepmc")},
    ]
    frozen = vahtian.freeze(recs, search_date="2026-06-23")
    # The two pmid:12345 rows collapse into one record.
    assert len(frozen.records) == 2
    assert frozen.content_hash == GOLDEN
|
|
19
|
+
|
|
20
|
+
def test_dedupe_and_reproducible():
    """Duplicates collapse and the hash ignores input order and timestamps."""
    recs = [
        {"pmid": "1", "title": "A", "provenance": [{"source": "pubmed"}]},
        {"pmid": "1", "title": "A", "provenance": [{"source": "europepmc"}]},
        {"doi": "10.1/x", "title": "B", "provenance": [{"source": "openalex"}]},
    ]
    forward = vahtian.freeze(recs, search_date="2026-06-23", now="t")
    backward = vahtian.freeze(recs[::-1], search_date="2026-06-23", now="t2")
    assert len(forward.records) == 2
    # Same records in reverse order must hash identically.
    assert forward.content_hash == backward.content_hash
    assert forward.verify()
|
|
31
|
+
|
|
32
|
+
def test_tamper_detection():
    """Editing a record after freezing must make verify() fail."""
    corpus = vahtian.freeze([{"pmid": "1", "title": "A"}], search_date="2026-06-23")
    assert corpus.verify()
    only_record = corpus.records[0]
    only_record["title"] = "tampered"
    assert not corpus.verify()
|
|
37
|
+
|
|
38
|
+
def test_audit_chain():
    """A fresh two-entry ledger verifies; a retro-edited payload does not."""
    ledger = vahtian.Ledger()
    for actor, action, stamp in (("human", "rate", "t1"), ("ai", "advise", "t2")):
        ledger.append(actor, action, {"v": "supported"}, ts=stamp)
    assert ledger.verify()
    first_entry = ledger.entries[0]
    first_entry["payload"]["v"] = "x"
    assert not ledger.verify()
|