contig 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- contig/__init__.py +0 -0
- contig/benchmark.py +204 -0
- contig/bundle.py +91 -0
- contig/cli.py +1270 -0
- contig/corpus.py +320 -0
- contig/cost.py +61 -0
- contig/data/detector_corpus.jsonl +15 -0
- contig/data/eval_history.jsonl +2 -0
- contig/data/reference_runs.jsonl +0 -0
- contig/datashape.py +62 -0
- contig/detect.py +528 -0
- contig/estimate.py +165 -0
- contig/eval_history.py +66 -0
- contig/events.py +147 -0
- contig/lifecycle.py +156 -0
- contig/methods.py +95 -0
- contig/models.py +359 -0
- contig/nfconfig.py +149 -0
- contig/notify.py +131 -0
- contig/planner.py +63 -0
- contig/progress.py +163 -0
- contig/provenance.py +81 -0
- contig/reference.py +41 -0
- contig/registry.py +161 -0
- contig/repair.py +159 -0
- contig/report.py +323 -0
- contig/runner.py +267 -0
- contig/samplesheet.py +78 -0
- contig/self_heal.py +559 -0
- contig/signing.py +129 -0
- contig/snakemake.py +80 -0
- contig/verification/__init__.py +0 -0
- contig/verification/cross_sample.py +118 -0
- contig/verification/qc_ingest.py +30 -0
- contig/verification/rule_pack.py +301 -0
- contig/verification/run_qc.py +70 -0
- contig/verification/structural.py +347 -0
- contig/workspace.py +51 -0
- contig-0.1.0.dist-info/METADATA +8 -0
- contig-0.1.0.dist-info/RECORD +42 -0
- contig-0.1.0.dist-info/WHEEL +4 -0
- contig-0.1.0.dist-info/entry_points.txt +2 -0
contig/__init__.py
ADDED
|
File without changes
|
contig/benchmark.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""Cross-run benchmark: compare a run against a designated reference (PRD contract A).
|
|
2
|
+
|
|
3
|
+
A run is judged against a designated REFERENCE run for its (pipeline, assay), not
|
|
4
|
+
bit-for-bit. We compare each shared numeric QC metric within a RELATIVE tolerance
|
|
5
|
+
and add a structural-shape check (the same set of QC check names present), so the
|
|
6
|
+
benchmark is robust to the run-to-run non-determinism a real pipeline produces
|
|
7
|
+
while still catching a genuine drift in a metric or in the shape of the output.
|
|
8
|
+
|
|
9
|
+
The reference registry is a committed JSONL, one entry per (pipeline, assay),
|
|
10
|
+
carrying the reference run's numeric QC values. It is the accumulated baseline a
|
|
11
|
+
researcher trusts: "this run still matches the result we validated".
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from os import PathLike
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
from pydantic import BaseModel
|
|
20
|
+
|
|
21
|
+
from contig.models import RunRecord
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ReferenceEntry(BaseModel):
    """A single designated baseline, identified by its (pipeline, assay) pair.

    Fields:
        pipeline: Pipeline the baseline belongs to.
        assay: Assay the baseline belongs to.
        reference_run_id: Identifier of the run designated as the reference.
        metrics: The reference run's numeric QC values, keyed by check name.
        recorded_at: When the baseline was set; kept for provenance.
    """

    pipeline: str
    assay: str
    reference_run_id: str
    # Defaults to no metrics at all.
    metrics: dict[str, float] = {}
    recorded_at: str
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ReferenceRegistry(BaseModel):
    """Container for every designated reference: one entry per (pipeline, assay)."""

    # Defaults to an empty registry.
    entries: list[ReferenceEntry] = []
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def default_reference_path() -> Path:
    """Locate the reference registry shipped in this package's ``data`` directory."""
    package_dir = Path(__file__).parent
    return package_dir.joinpath("data", "reference_runs.jsonl")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def load_reference_registry(path: str | PathLike[str]) -> ReferenceRegistry:
    """Read the JSONL registry at ``path`` into a ReferenceRegistry.

    Every non-blank line is parsed as one ReferenceEntry. A missing file is not
    an error: it yields an empty registry.

    The file is always decoded as UTF-8 rather than the platform's locale
    encoding, so the same registry reads identically on every machine.

    Raises:
        Whatever ``ReferenceEntry.model_validate_json`` raises for a line that
        is not a valid entry.
    """
    p = Path(path)
    if not p.exists():
        return ReferenceRegistry(entries=[])
    text = p.read_text(encoding="utf-8")
    # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029 and
    # U+0085, which may appear unescaped inside a JSON string and would cut one
    # entry into two invalid lines. A trailing "\r" from CRLF files is plain
    # whitespace around the JSON document, which the blank-line test already
    # tolerates.
    entries = [
        ReferenceEntry.model_validate_json(line)
        for line in text.split("\n")
        if line.strip()
    ]
    return ReferenceRegistry(entries=entries)


def save_reference_registry(registry: ReferenceRegistry, path: str | PathLike[str]) -> None:
    """Write the registry to ``path`` as JSONL, one ReferenceEntry per line.

    Missing parent directories are created. The file is always encoded as UTF-8
    (matching ``load_reference_registry``), never the platform's locale
    encoding, so entries containing non-ASCII text round-trip on any machine.
    """
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    p.write_text(
        "".join(entry.model_dump_json() + "\n" for entry in registry.entries),
        encoding="utf-8",
    )
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def reference_for(
    registry: ReferenceRegistry, pipeline: str, assay: str
) -> ReferenceEntry | None:
    """Look up the baseline recorded for a (pipeline, assay) pair.

    Entries are scanned in registry order and the first one whose pipeline and
    assay both match is returned; ``None`` means no reference has been set.
    """
    matching = (
        candidate
        for candidate in registry.entries
        if candidate.pipeline == pipeline and candidate.assay == assay
    )
    return next(matching, None)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def record_reference(
    registry: ReferenceRegistry,
    *,
    pipeline: str,
    assay: str,
    reference_run_id: str,
    metrics: dict[str, float],
    recorded_at: str,
) -> ReferenceRegistry:
    """Build a new registry in which (pipeline, assay) points at the given reference.

    Any entry already recorded for the same (pipeline, assay) is dropped and the
    new one is appended last, so a pair never has more than one baseline. All
    other entries keep their relative order. Neither the input registry nor the
    caller's ``metrics`` mapping is modified: the new entry stores its own copy.
    """
    replacement = ReferenceEntry(
        pipeline=pipeline,
        assay=assay,
        reference_run_id=reference_run_id,
        metrics=dict(metrics),
        recorded_at=recorded_at,
    )
    # A fresh list, so appending below never touches registry.entries.
    entries = []
    for existing in registry.entries:
        if existing.pipeline == pipeline and existing.assay == assay:
            continue
        entries.append(existing)
    entries.append(replacement)
    return ReferenceRegistry(entries=entries)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def metrics_from_run(record: RunRecord) -> dict[str, float]:
    """Collect the run's numeric QC values, keyed by check name.

    These are the inputs to the benchmark. A QC result whose value is ``None``
    (for example a structural check) has no magnitude to compare, so it is left
    out. If a check name occurs more than once, the last value wins.
    """
    numeric: dict[str, float] = {}
    for qc in record.qc_results:
        if qc.value is None:
            continue
        numeric[qc.check] = float(qc.value)
    return numeric
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def benchmark_run(
    record: RunRecord,
    registry: ReferenceRegistry,
    *,
    assay: str,
    tolerance: float,
) -> dict:
    """Benchmark a run against the reference designated for its (pipeline, assay).

    The reference is looked up by ``record.pipeline`` and ``assay``. Every metric
    present in both the run and the reference is compared by relative difference
    and counts as matched when that difference is at most ``tolerance`` (a
    relative, not absolute, bound). The set of check names must also be the same
    on both sides: a shape mismatch is drift even when all shared values agree.

    Returns:
        The dashboard contract
        ``{reference_run_id, tolerance, matched, drifted, checks, status}``.
        ``checks`` holds one ``{name, run_value, reference_value,
        within_tolerance, delta}`` dict per shared metric, sorted by name.
        ``status`` is ``"match"``, ``"drift"`` or ``"no_reference"``. A missing
        reference is not an error: the result then has no checks and carries an
        extra ``message`` key.
    """
    baseline = reference_for(registry, record.pipeline, assay)
    if baseline is None:
        # Nothing to compare against: report it rather than fail.
        message = f"no reference set for pipeline {record.pipeline!r} / assay {assay!r}"
        return {
            "reference_run_id": None,
            "tolerance": tolerance,
            "matched": 0,
            "drifted": 0,
            "checks": [],
            "status": "no_reference",
            "message": message,
        }

    observed = metrics_from_run(record)
    run_names = set(observed)
    reference_names = set(baseline.metrics)

    checks: list[dict] = []
    for name in sorted(run_names & reference_names):
        run_value = observed[name]
        reference_value = baseline.metrics[name]
        delta = _relative_delta(run_value, reference_value)
        checks.append(
            {
                "name": name,
                "run_value": run_value,
                "reference_value": reference_value,
                "within_tolerance": delta <= tolerance,
                "delta": delta,
            }
        )

    matched = sum(1 for check in checks if check["within_tolerance"])
    drifted = len(checks) - matched

    # Either a value out of tolerance or a differing set of check names is
    # drift; a match needs both clean values and an identical shape.
    if drifted or run_names != reference_names:
        status = "drift"
    else:
        status = "match"

    return {
        "reference_run_id": baseline.reference_run_id,
        "tolerance": tolerance,
        "matched": matched,
        "drifted": drifted,
        "checks": checks,
        "status": status,
    }
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _relative_delta(run_value: float, reference_value: float) -> float:
    """Return how far the run is from the reference, as ``|run - ref| / |ref|``.

    With a reference of exactly zero there is no scale to divide by, so the
    absolute difference is returned instead: zero against zero gives 0, and any
    other run value gives its own magnitude.
    """
    scale = abs(reference_value)
    if scale == 0:
        return abs(run_value)
    return abs(run_value - reference_value) / scale
|
contig/bundle.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""The portable provenance bundle (ARCHITECTURE §7).
|
|
2
|
+
|
|
3
|
+
A bundle is the artifact that makes a run "re-runnable by a stranger": the full
|
|
4
|
+
RunRecord serialized to disk, plus the helper that derives the input checksums
|
|
5
|
+
that anchor it.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from contig.models import RunRecord, sha256_file
|
|
15
|
+
|
|
16
|
+
# The env var that, when set to a hex or base64 Ed25519 private key, makes
# write_bundle emit a detached signature sidecar next to the record. Absent or
# empty means no sidecar (signing is opt-in and never logs the key).
SIGNING_KEY_ENV = "CONTIG_SIGNING_KEY"


def write_bundle(record: RunRecord, dest_dir: str | Path) -> Path:
    """Serialize ``record`` to ``dest_dir/run_record.json`` and return that path.

    ``dest_dir`` is created if needed, including missing parents. The file is
    always encoded as UTF-8 rather than the platform's locale encoding, so a
    bundle written on one machine reads back identically on another.

    When ``CONTIG_SIGNING_KEY`` is set (and signing is available), a detached
    signature sidecar is also written at ``dest_dir/signature.json`` over the
    record's canonical content. The signature signs the record content, never
    the sidecar.
    """
    dest = Path(dest_dir)
    dest.mkdir(parents=True, exist_ok=True)
    json_path = dest / "run_record.json"
    json_path.write_text(record.model_dump_json(indent=2), encoding="utf-8")
    _maybe_write_signature(record, dest)
    return json_path


def _maybe_write_signature(record: RunRecord, dest: Path) -> None:
    """Write ``dest/signature.json`` when a signing key is configured; otherwise do nothing."""
    private_key = os.environ.get(SIGNING_KEY_ENV)
    if not private_key:
        # Unset and empty are treated alike: signing is opt-in.
        return
    # Imported lazily so the bundle module loads even where cryptography is absent;
    # a configured key with signing unavailable raises, surfacing the misconfig.
    from contig.signing import canonical_sha256, public_key_for, sign_record

    sidecar = {
        "algo": "ed25519",
        "public_key": public_key_for(private_key),
        "signature": sign_record(record, private_key),
        "signed_sha256": canonical_sha256(record),
    }
    (dest / "signature.json").write_text(json.dumps(sidecar, indent=2), encoding="utf-8")


def load_bundle(dest_dir: str | Path) -> RunRecord:
    """Reconstruct the RunRecord from ``dest_dir/run_record.json``.

    The file is decoded as UTF-8, matching how ``write_bundle`` encodes it.
    """
    json_path = Path(dest_dir) / "run_record.json"
    return RunRecord.model_validate_json(json_path.read_text(encoding="utf-8"))
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def compute_input_checksums(paths: list[str | Path]) -> dict[str, str]:
    """Hash every input file and key the SHA-256 by the file's basename.

    The result feeds ``RunRecord.input_checksums``. Basenames keep the record
    portable, but two inputs with the same basename would overwrite one another
    and corrupt it, so that case is rejected outright.

    Raises:
        ValueError: If two paths share a basename. This is checked before the
            second file is hashed.
    """
    by_name: dict[str, str] = {}
    for source in paths:
        basename = Path(source).name
        if basename not in by_name:
            by_name[basename] = sha256_file(source)
            continue
        raise ValueError(f"duplicate input basename {basename!r}; inputs must have unique names")
    return by_name
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def compute_output_checksums(results_dir: str | Path) -> dict[str, str]:
    """Hash every file beneath ``results_dir`` (PRD contract B).

    Each key is the file's path relative to ``results_dir``, written with POSIX
    separators so the key is the same on any platform; each value is the file's
    SHA-256. Files are visited in sorted path order. These checksums anchor the
    produced outputs in the RunRecord so ``contig verify`` can detect drift.

    If ``results_dir`` is not an existing directory the result is an empty dict:
    a run that produced no outputs has nothing to anchor.
    """
    base = Path(results_dir)
    if not base.is_dir():
        return {}
    files = [entry for entry in base.rglob("*") if entry.is_file()]
    files.sort()
    return {found.relative_to(base).as_posix(): sha256_file(found) for found in files}
|