pen-stack 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. pen_stack/__init__.py +2 -0
  2. pen_stack/_resources.py +34 -0
  3. pen_stack/adapt/__init__.py +14 -0
  4. pen_stack/adapt/finetune.py +33 -0
  5. pen_stack/adapt/ingest.py +86 -0
  6. pen_stack/adapt/pipeline.py +101 -0
  7. pen_stack/adapt/recalibrate.py +58 -0
  8. pen_stack/adapt/report.py +130 -0
  9. pen_stack/agent/__init__.py +1 -0
  10. pen_stack/agent/guardrails.py +49 -0
  11. pen_stack/agent/mcp_server.py +42 -0
  12. pen_stack/agent/orchestrator.py +106 -0
  13. pen_stack/agent/pen_agent.py +169 -0
  14. pen_stack/agent/tools.py +130 -0
  15. pen_stack/atlas/__init__.py +1 -0
  16. pen_stack/atlas/build_wtkb.py +80 -0
  17. pen_stack/atlas/crosslink.py +144 -0
  18. pen_stack/atlas/expand.py +190 -0
  19. pen_stack/atlas/schema.py +59 -0
  20. pen_stack/atlas/scorecard.py +134 -0
  21. pen_stack/atlas/universe.py +75 -0
  22. pen_stack/atlas/variant_propose.py +155 -0
  23. pen_stack/bridge/__init__.py +1 -0
  24. pen_stack/bridge/activity.py +52 -0
  25. pen_stack/bridge/cli.py +65 -0
  26. pen_stack/bridge/fold_qc.py +53 -0
  27. pen_stack/bridge/guide_qc.py +84 -0
  28. pen_stack/bridge/ingest.py +139 -0
  29. pen_stack/bridge/offtarget.py +133 -0
  30. pen_stack/bridge/ortholog_screen.py +73 -0
  31. pen_stack/bridge/pipeline.py +83 -0
  32. pen_stack/cli.py +126 -0
  33. pen_stack/data/__init__.py +1 -0
  34. pen_stack/data/encode.py +84 -0
  35. pen_stack/data/genome.py +71 -0
  36. pen_stack/data/ingest_chromatin.py +119 -0
  37. pen_stack/data/ingest_integration.py +112 -0
  38. pen_stack/data/ingest_safety_annot.py +164 -0
  39. pen_stack/data/ingest_trip.py +76 -0
  40. pen_stack/mech/__init__.py +1 -0
  41. pen_stack/mech/classify_atlas.py +71 -0
  42. pen_stack/mech/whitelist.py +66 -0
  43. pen_stack/monitor/__init__.py +1 -0
  44. pen_stack/monitor/europepmc.py +32 -0
  45. pen_stack/monitor/run.py +57 -0
  46. pen_stack/monitor/triage.py +63 -0
  47. pen_stack/planner/__init__.py +1 -0
  48. pen_stack/planner/cargo.py +56 -0
  49. pen_stack/planner/cargo_polish.py +146 -0
  50. pen_stack/planner/delivery.py +32 -0
  51. pen_stack/planner/multiplex.py +110 -0
  52. pen_stack/planner/optimize.py +156 -0
  53. pen_stack/planner/pipeline.py +86 -0
  54. pen_stack/planner/report.py +26 -0
  55. pen_stack/rag/__init__.py +1 -0
  56. pen_stack/rag/index.py +53 -0
  57. pen_stack/rag/llm.py +178 -0
  58. pen_stack/rag/qa.py +105 -0
  59. pen_stack/score/__init__.py +1 -0
  60. pen_stack/score/recalibrate.py +77 -0
  61. pen_stack/score/therapeutic.py +85 -0
  62. pen_stack/server/__init__.py +1 -0
  63. pen_stack/server/api.py +142 -0
  64. pen_stack/ui/__init__.py +1 -0
  65. pen_stack/ui/app.py +518 -0
  66. pen_stack/validate/__init__.py +1 -0
  67. pen_stack/validate/adapt_demo.py +69 -0
  68. pen_stack/validate/agent_eval.py +117 -0
  69. pen_stack/validate/blind_gsh_discovery.py +165 -0
  70. pen_stack/validate/cargo_directionality.py +57 -0
  71. pen_stack/validate/durability_baselines.py +150 -0
  72. pen_stack/validate/forward_hypotheses.py +104 -0
  73. pen_stack/validate/guide_qc_demo.py +58 -0
  74. pen_stack/validate/intent_specification.py +82 -0
  75. pen_stack/validate/paper3_benchmark.py +165 -0
  76. pen_stack/validate/paper4_real_validation.py +144 -0
  77. pen_stack/validate/paper4_validation.py +82 -0
  78. pen_stack/validate/seq_vs_measured.py +134 -0
  79. pen_stack/validate/within_locus_ranking.py +74 -0
  80. pen_stack/validate/writer_recovery.py +86 -0
  81. pen_stack/wgenome/__init__.py +1 -0
  82. pen_stack/wgenome/chromatin_seq.py +83 -0
  83. pen_stack/wgenome/durability.py +108 -0
  84. pen_stack/wgenome/export_tracks.py +52 -0
  85. pen_stack/wgenome/features.py +82 -0
  86. pen_stack/wgenome/gsh_baseline.py +117 -0
  87. pen_stack/wgenome/providers.py +245 -0
  88. pen_stack/wgenome/safety.py +69 -0
  89. pen_stack/wgenome/structure3d.py +168 -0
  90. pen_stack/wgenome/writability.py +72 -0
  91. pen_stack-3.1.0.dist-info/METADATA +451 -0
  92. pen_stack-3.1.0.dist-info/RECORD +96 -0
  93. pen_stack-3.1.0.dist-info/WHEEL +5 -0
  94. pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
  95. pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
  96. pen_stack-3.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,82 @@
1
+ """Intent specification-compliance (v3.1, WS-A2) - NOT a predictive benchmark.
2
+
3
+ This reframes the former "discriminating-stratum recovery@k" result. For a *targeted* intent the planner
4
+ ranks the goal's own gene first by construction (see docs/benchmark_circularity.md), so gene-level recovery
5
+ is definitional and must NOT be reported as predictive skill or carry a p-value/CI.
6
+
7
+ What remains valid is a **behavioral-correctness** property: does the same locus change rank under opposing
8
+ goals exactly as specified? An in-gene site must rank HIGH under a disruption/excision intent (hitting the
9
+ gene is the goal) and LOW under safe-harbour insertion (the gene must be avoided). We report this as an
10
+ exact-match correctness table, never as recovery or a hypothesis test.
11
+
12
+ Outputs: out/intent_specification.json.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ from pathlib import Path
18
+
19
+ import pandas as pd
20
+
21
+ from pen_stack.planner.optimize import EditIntent, plan
22
+
23
+ _OUT = Path(__file__).resolve().parents[2] / "out" / "intent_specification.json"
24
+ _WDF = Path(__file__).resolve().parents[2].parent / "phase_1" / "out" / "atlas_k562.parquet"
25
+
26
+ # (gene, targeted-intent) pairs whose documented write is INSIDE the gene/element.
27
+ _CASES = [
28
+ ("TRAC", EditIntent.KNOCK_IN_DISRUPT),
29
+ ("PDCD1", EditIntent.KNOCK_IN_DISRUPT),
30
+ ("B2M", EditIntent.KNOCK_IN_DISRUPT),
31
+ ("BCL11A", EditIntent.REG_EXCISION),
32
+ ("HBG1", EditIntent.REG_EXCISION),
33
+ ("FXN", EditIntent.REPEAT_EXCISION),
34
+ ("ALB", EditIntent.HIGH_DURABILITY),
35
+ ]
36
+
37
+
38
def _top_is_on_target(gene: str, intent: EditIntent, wdf: pd.DataFrame, k: int = 5) -> bool | None:
    """Tell whether the planner's first-ranked candidate for ``gene`` is flagged on-target.

    ``plan`` is invoked with the gene, the intent, a fixed third argument of 2000 (presumably the
    cargo size in bp - confirm against ``plan``'s signature), the atlas frame and ``k``.

    Returns ``None`` when the plan is empty, otherwise the truthiness of the ``on_target`` value
    in the first row of the ranked result.
    """
    candidates = plan(gene, intent, 2000, wdf, k=k)
    if not candidates.empty:
        leader = candidates.iloc[0]
        return bool(leader["on_target"])
    return None
43
+
44
+
45
def specification_table(wdf: pd.DataFrame | None = None) -> pd.DataFrame:
    """Build the exact-match specification-compliance table, one row per entry of ``_CASES``.

    When ``wdf`` is omitted the atlas is read from the ``_WDF`` parquet. For every (gene, targeted
    intent) pair the planner is queried twice: once under the targeted intent and once under
    ``EditIntent.SAFE_HARBOUR``. A case is marked correct only when the top candidate is on-target
    under the targeted intent AND not on-target under safe-harbour; an empty plan (``None``) on
    either side therefore counts as not correct.
    """
    atlas = pd.read_parquet(_WDF) if wdf is None else wdf

    def _evaluate(gene: str, targeted: EditIntent) -> dict:
        # hitting the gene is the goal here, so the in-gene site should come out on top
        hit_when_targeted = _top_is_on_target(gene, targeted, atlas)
        # the gene must be avoided here, so the same in-gene site should not come out on top
        hit_when_safe = _top_is_on_target(gene, EditIntent.SAFE_HARBOUR, atlas)
        # identity checks on purpose: None (empty plan) satisfies neither side
        passed = hit_when_targeted is True and hit_when_safe is False
        return {"gene": gene, "targeted_intent": targeted.value,
                "top_on_target_under_targeted": hit_when_targeted,
                "top_on_target_under_safe_harbour": hit_when_safe,
                "specification_correct": passed}

    return pd.DataFrame([_evaluate(gene, targeted) for gene, targeted in _CASES])
60
+
61
+
62
def run(out: str | Path = _OUT) -> dict:
    """Compute the specification table, write the JSON report to ``out`` and return the report.

    The parent directory of ``out`` is created if needed. The report carries the case count, the
    number of correct cases, an all-correct flag, the per-case table and fixed explanatory text.
    """
    table = specification_table()
    total = len(table)
    passed = int(table["specification_correct"].sum())
    report = {
        "what_this_is": "behavioral specification-compliance, NOT a predictive benchmark or recovery metric",
        "property": "the same locus must rank high under a targeted intent and low under safe-harbour",
        "n_cases": total,
        "n_correct": passed,
        "all_correct": passed == total,
        "table": table.to_dict("records"),
        "scope": "definitional by design; no recovery@k, p-value, or CI is attached to this result. The "
                 "predictive headline is the blind safe-harbour discovery (WS-A3), not this table.",
    }
    destination = Path(out)
    destination.parent.mkdir(parents=True, exist_ok=True)
    # default=str stringifies any value json cannot encode natively
    destination.write_text(json.dumps(report, indent=2, default=str), encoding="utf-8")
    return report
79
+
80
+
81
+ if __name__ == "__main__": # pragma: no cover
82
+ print(json.dumps(run(), indent=2, default=str))
@@ -0,0 +1,165 @@
1
+ """Two-stratum recovery@k benchmark (Phase 3, Step 3.5).
2
+
3
+ CIRCULARITY NOTICE (v3.1, WS-A). The *discriminating* (targeted-intent) stratum result reported here -
4
+ "recovery@10 = 1.00 vs 0.00", with a McNemar p and a bootstrap CI - is **definitional, not predictive**:
5
+ an on-target identity term (`on_target = gene == target_gene`, magnitude 1.0) dominates a [0,1] base, so
6
+ the planner ranks the goal's own gene first by construction. See `docs/benchmark_circularity.md`. It must
7
+ NOT be cited as predictive evidence. The de-circularized replacements are
8
+ `pen_stack/validate/{intent_specification,blind_gsh_discovery,writer_recovery,within_locus_ranking}.py`,
9
+ with the **blind GSH discovery (AUROC vs matched controls)** as the honest headline. The *control* stratum
10
+ below (genome-wide safe-harbour search) is non-circular and remains valid.
11
+
12
+ (Original docstring follows.) Show the Write Planner recovers documented targeted-writes - *especially the
13
+ non-obvious ones a naive baseline cannot* - from the goal (gene + edit_intent) alone, with the precise site
14
+ held out. The panel is adversarial to the baseline by construction:
15
+
16
+ * Control stratum (safe-harbour writes): a safety ranker should recover these - the Planner must not be
17
+ worse.
18
+ * Discriminating stratum (therapeutic-into-functional-locus writes): an intent-blind safety ranker keeps
19
+ proposing safe harbours and *misses* the intended (often intragenic) target; the Planner, conditioned
20
+ on edit_intent, recovers them. This is the headline.
21
+
22
+ Anti-leakage: the Planner scores a fixed candidate POOL (panel loci + decoy genes) from the goal only;
23
+ recovery@k = the documented locus appearing in the Planner's top-k. The baseline ranks the same pool by
24
+ safety alone (intent-blind). Reported per stratum with a McNemar exact test + bootstrap CI of the gap.
25
+
26
+ Inputs : data/benchmark_panel.csv (frozen, SHA-locked in prereg/paper3.yaml); Phase-1 writability atlas.
27
+ Outputs: out/benchmark_report.json.
28
+ """
29
+ from __future__ import annotations
30
+
31
+ import json
32
+ from functools import lru_cache
33
+ from pathlib import Path
34
+
35
+ import numpy as np
36
+ import pandas as pd
37
+
38
+ from pen_stack.planner.optimize import load_intent_weights, score_candidates
39
+
40
+ _ROOT = Path(__file__).resolve().parents[2]
41
+ _PANEL = _ROOT / "data" / "benchmark_panel.csv"
42
+ _OUT = _ROOT / "out" / "benchmark_report.json"
43
+ BIN_BP = 1000
44
+ N_DECOYS = 30
45
+ SEED = 20260602
46
+
47
+
48
@lru_cache(maxsize=4)
def _gene_coords() -> pd.DataFrame:
    """Read the gene-coordinate parquet and memoise the result.

    The path is supplied by ``pen_stack.planner.optimize.gene_coords_path``. Because of the
    ``lru_cache`` wrapper, repeated calls hand back the same DataFrame object.
    """
    from pen_stack.planner.optimize import gene_coords_path as _coords_path

    coords_file = _coords_path()
    return pd.read_parquet(coords_file)
52
+
53
+
54
def _gene_candidate(gene: str, writable_df: pd.DataFrame) -> dict | None:
    """Represent a gene by the single best-writability bin inside its gene body.

    The gene's first row in the coordinate table gives ``chrom``/``start``/``end``; the body is
    every atlas bin on that chromosome whose index lies in ``start // BIN_BP .. end // BIN_BP``
    (inclusive). The candidate takes its ``bin``, ``safety``, ``p_durable`` and
    ``reachable_tier1`` from the bin with the highest ``writability`` (first one on ties) - these
    are the best bin's own values, not means over the body.

    Returns ``None`` when the gene is unknown, when no atlas bin overlaps the body, or when none
    of the overlapping bins has a non-missing ``writability``.
    """
    gc = _gene_coords()
    g = gc[gc["gene"] == gene]
    if g.empty:
        return None
    r = g.iloc[0]
    lo, hi = int(r["start"]) // BIN_BP, int(r["end"]) // BIN_BP
    body = writable_df[(writable_df["chrom"] == r["chrom"]) & (writable_df["bin"].between(lo, hi))]
    # Bins without a writability value cannot be ranked; without this guard an all-missing body
    # makes idxmax fail instead of yielding "no candidate".
    scored = body.dropna(subset=["writability"])
    if scored.empty:
        return None
    # represent the locus by its BEST writable bin - the site a planner would actually target within it.
    # Positional lookup keeps this a single row even if the atlas index carries duplicate labels.
    best = scored.iloc[int(scored["writability"].to_numpy().argmax())]
    return {"gene": gene, "chrom": r["chrom"], "bin": int(best["bin"]),
            "safety": float(best["safety"]), "p_durable": float(best["p_durable"]),
            "reachable_tier1": best["reachable_tier1"]}
70
+
71
+
72
def build_pool(panel: pd.DataFrame, writable_df: pd.DataFrame, n_decoys: int = N_DECOYS) -> pd.DataFrame:
    """Candidate pool = panel genes + random decoy genes (deterministic), aggregated in this cell type.

    Panel genes come first, followed by up to ``n_decoys`` genes drawn without replacement (seeded
    with ``SEED``) from the coordinate table's genes that are not in the panel. Genes for which
    ``_gene_candidate`` yields nothing are dropped, so the pool may hold fewer decoys than
    requested. The result has one row per gene and a fresh 0..n-1 index.

    If no gene yields a candidate, an empty frame with the candidate columns is returned rather
    than failing on the missing ``gene`` column.
    """
    columns = ["gene", "chrom", "bin", "safety", "p_durable", "reachable_tier1"]
    rows = []
    for gene in panel["gene"].unique():
        c = _gene_candidate(gene, writable_df)
        if c:
            rows.append(c)
    gc = _gene_coords()
    rng = np.random.default_rng(SEED)
    pool_genes = set(panel["gene"])
    decoy_choices = gc[~gc["gene"].isin(pool_genes)]["gene"].dropna().unique()
    for gene in rng.choice(decoy_choices, size=min(n_decoys, len(decoy_choices)), replace=False):
        c = _gene_candidate(gene, writable_df)
        if c:
            rows.append(c)
    if not rows:
        # DataFrame([]) has no columns, so drop_duplicates("gene") would raise KeyError
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(rows).drop_duplicates("gene").reset_index(drop=True)
88
+
89
+
90
def _writable(ct: str) -> pd.DataFrame:
    """Return the writability frame for cell type ``ct`` via ``atlas.crosslink.load_writability``.

    The import stays local to the function, as in the original.
    """
    from pen_stack.atlas.crosslink import load_writability as _load

    frame = _load(ct)
    return frame
93
+
94
+
95
def recovery_at_k(panel: pd.DataFrame, k: int = 10, cargo_bp: int = 2000) -> pd.DataFrame:
    """Planner (goal-conditioned) vs baseline (intent-blind safety), recovery@k per panel entry.

    For each panel row a candidate pool is built once per cell type (``ct``) and reused. The
    planner scores a copy of that pool with the row's intent; the baseline ranks the same pool by
    ``safety`` alone. A hit means the row's gene appears in the respective top-``k``.

    Returns one row per panel entry with ``name``, ``gene``, ``stratum``, ``intent``,
    ``planner_hit`` and ``baseline_hit`` (0/1).
    """
    # Loop-invariant: the intent table is looked up once instead of once per panel row.
    intents = load_intent_weights()["intents"]
    rows = []
    pools: dict[str, pd.DataFrame] = {}
    for _, t in panel.iterrows():
        ct = t["ct"]
        if ct not in pools:
            pools[ct] = build_pool(panel, _writable(ct))
        pool = pools[ct].copy()
        # PLANNER: score the pool with this entry's intent. on_target marks the entry's own target gene
        # ONLY for *targeted* intents; safe-harbour is genome-wide (the destination is not a gene-to-avoid),
        # so on_target stays False and recovery is pure safety x durability ranking.
        genome_wide = bool(intents[t["intent"]].get("genome_wide", False))
        pool["on_target"] = (pool["gene"] == t["gene"]) & (not genome_wide)
        scored = score_candidates(pool, t["intent"], cargo_bp)
        planner_topk = list(scored.head(k)["gene"])
        # BASELINE: intent-blind, rank the same pool by safety only. The explicit chrom/bin
        # tie-breakers are what make saturated-safety ties resolve identically every run; pandas
        # documents `kind` as applying to single-column sorts only, so it is kept here as-is but
        # is not what provides the determinism.
        baseline_topk = list(pool.sort_values(["safety", "chrom", "bin"], ascending=[False, True, True],
                                              kind="stable").head(k)["gene"])
        rows.append({"name": t["name"], "gene": t["gene"], "stratum": t["stratum"],
                     "intent": t["intent"],
                     "planner_hit": int(t["gene"] in planner_topk),
                     "baseline_hit": int(t["gene"] in baseline_topk)})
    return pd.DataFrame(rows)
120
+
121
+
122
def stratified_report(rec: pd.DataFrame) -> dict:
    """Summarise planner-vs-baseline recovery for the ``control`` and ``discriminating`` strata.

    Per stratum present in ``rec``: recovery rates, discordant-pair counts, an exact McNemar
    p-value on the paired 2x2 table, and a percentile bootstrap (5000 resamples, seeded with
    ``SEED``) 95% interval for the mean gap (planner - baseline). Strata with no rows are omitted;
    any other stratum label in ``rec`` is ignored.
    """
    from statsmodels.stats.contingency_tables import mcnemar

    summary = {}
    for stratum in ("control", "discriminating"):
        cases = rec[rec["stratum"] == stratum]
        if cases.empty:
            continue
        planner = cases["planner_hit"]
        baseline = cases["baseline_hit"]
        planner_only = int(((planner == 1) & (baseline == 0)).sum())   # planner wins
        baseline_only = int(((planner == 0) & (baseline == 1)).sum())  # baseline wins
        both = int(((planner == 1) & (baseline == 1)).sum())
        neither = int(((planner == 0) & (baseline == 0)).sum())
        test = mcnemar([[both, planner_only], [baseline_only, neither]], exact=True)
        # bootstrap CI of the recovery gap (planner - baseline); one resample per draw so the
        # random stream is consumed exactly as before
        gap = (planner - baseline).to_numpy()
        rng = np.random.default_rng(SEED)
        resampled_means = []
        for _ in range(5000):
            resampled_means.append(rng.choice(gap, size=len(gap), replace=True).mean())
        lower = float(np.percentile(resampled_means, 2.5))
        upper = float(np.percentile(resampled_means, 97.5))
        summary[stratum] = {
            "n": int(len(cases)),
            "planner_recovery": round(float(planner.mean()), 4),
            "baseline_recovery": round(float(baseline.mean()), 4),
            "planner_wins": planner_only, "baseline_wins": baseline_only,
            "mcnemar_pvalue": float(test.pvalue),
            "gap_mean": round(float(gap.mean()), 4),
            "gap_ci95": [round(lower, 4), round(upper, 4)],
            # NOTE(review): one-sided - True only when the whole interval is above zero; an
            # interval entirely below zero is reported as False.
            "ci_excludes_zero": bool(lower > 0),
        }
    return summary
148
+
149
+
150
def run(k: int = 10, out: str | Path = _OUT) -> dict:
    """Run the recovery@k benchmark on the panel CSV, write the JSON report and return it.

    The report holds ``k``, the panel size, the per-stratum summary and the per-case hit table.
    The parent directory of ``out`` is created if needed.
    """
    panel = pd.read_csv(_PANEL)
    per_case = recovery_at_k(panel, k=k)
    report = {
        "k": k,
        "n_panel": len(panel),
        "strata": stratified_report(per_case),
        "per_case": per_case.to_dict("records"),
    }
    destination = Path(out)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(report, indent=2), encoding="utf-8")
    return report
158
+
159
+
160
+ if __name__ == "__main__": # pragma: no cover
161
+ r = run()
162
+ print(json.dumps(r["strata"], indent=2))
163
+ print("\nper-case:")
164
+ for c in r["per_case"]:
165
+ print(f" [{c['stratum'][:4]}] {c['name']:8s} {c['intent']:26s} planner={c['planner_hit']} baseline={c['baseline_hit']}")
@@ -0,0 +1,144 @@
1
+ """Paper 4 validation on the REAL Perry 2025 data (Phase 1.5).
2
+
3
+ Now that the Perry 2025 supplementary (Science adz0276) is available locally, the previously *gated*
4
+ criteria are validated against measured data (raw tables stay local - copyrighted; only derived results
5
+ are written):
6
+
7
+ 1. **Measured position profile** - derive per-position protective weights from 6,856 real off-targets
8
+ (UMI-weighted). The data confirm the mechanism: the central core (positions 7-9, esp. 8) is the most
9
+ conserved; distal positions are tolerant. This measured profile replaces the literature one.
10
+
11
+ 2. **HEADLINE - blind discrimination of real off-targets, beating Hamming.** Real observed off-targets
12
+ (which recombined -> core preserved) are positives; a core-disrupted decoy of each (position-8 mutated ->
13
+ non-recombinogenic) is the negative. The position-weight model separates them near-perfectly where a
14
+ position-blind Hamming ranking cannot (AUROC).
15
+
16
+ 3. **DMS variant-effect** - the Perry Table S3 deep mutational scan recovers the top activity-enhancing
17
+ single mutants (e.g. N322P, H50K); completes the Phase-2 Section 2.4 DMS variant-proposal step.
18
+
19
+ 4. **Honest limitation** - predicted sequence-risk does NOT rank the *magnitude* of recombination among
20
+ already-observed off-targets (that is dominated by genomic context, not core sequence).
21
+
22
+ Outputs: out/bridge_real_validation.json, data/curated/bridge_offtarget_profile_measured.parquet.
23
+ """
24
+ from __future__ import annotations
25
+
26
+ import json
27
+ import random
28
+ from pathlib import Path
29
+
30
+ from pen_stack.bridge.ingest import derive_measured_profile, load_dms, load_insertion_sites
31
+ from pen_stack.bridge.offtarget import hamming_risk, mismatches, position_weights, risk_score
32
+
33
+ _ROOT = Path(__file__).resolve().parents[2]
34
+ _OUT = _ROOT / "out" / "bridge_real_validation.json"
35
+ _PROFILE = _ROOT / "data" / "curated" / "bridge_offtarget_profile_measured.parquet" # derived, committable
36
+ _CORE0 = 7 # 0-based index of position 8 (the most-conserved / most-critical position)
37
+
38
+
39
+ def _auroc(scores, labels) -> float:
40
+ pos = [s for s, y in zip(scores, labels) if y == 1]
41
+ neg = [s for s, y in zip(scores, labels) if y == 0]
42
+ if not pos or not neg:
43
+ return float("nan")
44
+ wins = sum((p > n) + 0.5 * (p == n) for p in pos for n in neg)
45
+ return wins / (len(pos) * len(neg))
46
+
47
+
48
def measured_profile() -> dict:
    """Derive the measured per-position profile, persist it to ``_PROFILE`` and summarise it.

    Returns ``{"available": False}`` when the derived profile is empty. Otherwise the profile is
    written as parquet (parent directory created if needed) and the summary reports the
    off-target count, the conservation per position, the three positions with the highest
    conservation, and whether any of those three is position 7, 8 or 9.
    """
    profile = derive_measured_profile()
    if profile.empty:
        return {"available": False}

    _PROFILE.parent.mkdir(parents=True, exist_ok=True)
    profile.to_parquet(_PROFILE, index=False)

    conservation = dict(zip(profile["position"], profile["conservation"]))
    ranked_positions = sorted(conservation, key=conservation.get, reverse=True)
    top_three = ranked_positions[:3]
    # the count column is accepted under either spelling
    count_column = "n_offtarget" if "n_offtarget" in profile else "n_offtargets"
    return {
        "available": True,
        "n_offtargets": int(profile[count_column].iloc[0]),
        "conservation": {int(pos): round(float(val), 3) for pos, val in conservation.items()},
        "most_critical_positions": [int(pos) for pos in top_three],
        "central_core_confirmed": bool(set(top_three) & {7, 8, 9}),
    }
61
+
62
+
63
def discrimination_auroc(seed: int = 20260602) -> dict:
    """AUROC of the position-weight model vs Hamming for real off-targets against core-mutated decoys.

    Positives are off-target rows (``On-Target`` false, both sequences exactly 14 characters)
    whose base at index ``_CORE0`` matches the intended sequence. For each positive a decoy is
    made by replacing that base with a different one of A/C/G/T chosen by a ``random.Random``
    seeded with ``seed``; the decoy is the paired negative. Both are scored with ``risk_score``
    (model) and ``hamming_risk`` (baseline).

    Returns ``{"available": False}`` when no insertion-site table is available; otherwise the pair
    count, both AUROCs rounded to 4 places and whether the unrounded model AUROC exceeds the
    Hamming one. With zero pairs both AUROCs are NaN and the comparison is False.
    """
    s2 = load_insertion_sites()
    if s2.empty:
        return {"available": False}
    off = s2[(s2["On-Target"] == False) &  # noqa: E712
             (s2["Insertion_Site_Sequence"].str.len() == 14) &
             (s2["Plasmid_Encoded_Sequence"].str.len() == 14)]
    w = position_weights()  # measured weights
    rng = random.Random(seed)
    scores_m, scores_h, labels = [], [], []
    n = 0
    for seq, intended in zip(off["Insertion_Site_Sequence"], off["Plasmid_Encoded_Sequence"]):
        if seq[_CORE0] != intended[_CORE0]:
            continue  # only positives that preserve the critical core position
        # positive: the real off-target
        mm = mismatches(seq, intended)
        scores_m.append(risk_score(mm, w))
        scores_h.append(hamming_risk(mm, 14))
        labels.append(1)
        # negative: same site but the critical core position mutated (non-recombinogenic decoy)
        alt = rng.choice([b for b in "ACGT" if b != seq[_CORE0]])
        decoy = seq[:_CORE0] + alt + seq[_CORE0 + 1:]
        mmd = mismatches(decoy, intended)
        scores_m.append(risk_score(mmd, w))
        scores_h.append(hamming_risk(mmd, 14))
        labels.append(0)
        n += 1
    # each AUROC is an all-pairs statistic; compute it once and reuse it for the comparison
    model_auc = _auroc(scores_m, labels)
    hamming_auc = _auroc(scores_h, labels)
    return {"available": True, "n_pairs": n,
            "model_auroc": round(model_auc, 4),
            "hamming_auroc": round(hamming_auc, 4),
            "model_beats_hamming": model_auc > hamming_auc}
94
+
95
+
96
def dms_enhancers(top_k: int = 10) -> dict:
    """Summarise the DMS table: how many variants have a positive Z score and which rank highest.

    ``Z_Score_wrt_WT`` is coerced to numeric and rows that fail to parse are dropped. Returns
    ``{"available": False}`` when the table is empty or when no row has a numeric Z score;
    otherwise the variant count, the number and fraction with Z > 0, and the ``top_k`` rows by
    descending Z as ``{"mutation", "z"}`` records.
    """
    dms = load_dms()
    if dms.empty:
        return {"available": False}
    import pandas as pd
    dms = dms.copy()
    dms["Z"] = pd.to_numeric(dms["Z_Score_wrt_WT"], errors="coerce")
    dms = dms.dropna(subset=["Z"])
    if dms.empty:
        # nothing numeric survived the coercion; the fraction below would divide by zero
        return {"available": False}
    top = dms.sort_values("Z", ascending=False).head(top_k)
    enh = int((dms["Z"] > 0).sum())
    return {"available": True, "n_variants": int(len(dms)),
            "n_enhancing": enh, "frac_enhancing": round(enh / len(dms), 4),
            "top_enhancers": [{"mutation": str(m), "z": round(float(z), 3)}
                              for m, z in zip(top["Mutation"], top["Z"])]}
110
+
111
+
112
def magnitude_limit() -> dict:
    """Honest limitation: rank-correlate predicted risk with measured ``%_of_Insertions``.

    Uses the off-target rows (``On-Target`` false, both sequences exactly 14 characters), scores
    each with ``risk_score`` under ``position_weights()`` and reports the Spearman correlation
    against the measured insertion percentage, rounded to 3 places. Returns
    ``{"available": False}`` when no insertion-site table is available.
    """
    from scipy.stats import spearmanr

    sites = load_insertion_sites()
    if sites.empty:
        return {"available": False}

    is_off_target = sites["On-Target"] == False  # noqa: E712
    site_is_14 = sites["Insertion_Site_Sequence"].str.len() == 14
    plasmid_is_14 = sites["Plasmid_Encoded_Sequence"].str.len() == 14
    off = sites[is_off_target & site_is_14 & plasmid_is_14]

    weights = position_weights()
    predicted = []
    for observed, intended in zip(off["Insertion_Site_Sequence"], off["Plasmid_Encoded_Sequence"]):
        predicted.append(risk_score(mismatches(observed, intended), weights))

    measured = off["%_of_Insertions"].values
    rho = spearmanr(predicted, measured).correlation
    return {"available": True, "risk_vs_magnitude_spearman": round(float(rho), 3),
            "note": "weak by design - recombination magnitude among observed off-targets is dominated by "
                    "genomic context, not core sequence; the model's value is discrimination, not magnitude"}
128
+
129
+
130
def run(out: str | Path = _OUT) -> dict:
    """Assemble the four validation sections, write them as JSON to ``out`` and return the report.

    The sections are evaluated in the order listed below. ``measured_profile()`` writes the
    profile parquet before the later sections call ``position_weights()`` - presumably so the
    measured weights are available to them; confirm before reordering.
    """
    profile_section = measured_profile()
    headline_section = discrimination_auroc()
    dms_section = dms_enhancers()
    magnitude_section = magnitude_limit()
    report = {
        "measured_profile": profile_section,
        "discrimination_headline": headline_section,
        "dms_enhancers": dms_section,
        "magnitude_limitation": magnitude_section,
        "data_source": "Perry et al. 2025, Science 391:eadz0276 (Tables S1-S3) - raw tables local/copyrighted",
    }
    destination = Path(out)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(report, indent=2), encoding="utf-8")
    return report
141
+
142
+
143
+ if __name__ == "__main__": # pragma: no cover
144
+ print(json.dumps(run(), indent=2))
@@ -0,0 +1,82 @@
1
+ """Paper 4 validation (Phase 1.5) - off-target engine vs naive Hamming.
2
+
3
+ The headline criterion that does NOT need the paywalled measured data: the position-weight model is
4
+ strictly more informative than a position-blind Hamming ranking. On a controlled set of pseudosites with
5
+ the SAME mismatch count but different positions, the model ranks biologically plausible off-targets
6
+ (distal mismatches, core preserved) above implausible ones (central CT core disrupted), while Hamming
7
+ cannot separate them. We quantify this as the AUROC of each score for discriminating
8
+ core-preserving (label 1, real off-target risk) vs core-disrupting (label 0, recombination abolished).
9
+
10
+ The blind recall of Perry 2025's measured off-target coordinates is gated on the paywalled supplementary
11
+ (prereg/paper4.yaml) and is not computed here.
12
+
13
+ Outputs: out/bridge_validation.json.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import random
19
+ from pathlib import Path
20
+
21
+ from pen_stack.bridge.ingest import load_profile_config
22
+ from pen_stack.bridge.offtarget import hamming_risk, mismatches, position_weights, risk_score
23
+
24
+ _OUT = Path(__file__).resolve().parents[2] / "out" / "bridge_validation.json"
25
+ _BASES = "ACGT"
26
+
27
+
28
+ def _auroc(scores: list[float], labels: list[int]) -> float:
29
+ """AUROC via the Mann-Whitney U statistic (ties counted as 0.5)."""
30
+ pos = [s for s, y in zip(scores, labels) if y == 1]
31
+ neg = [s for s, y in zip(scores, labels) if y == 0]
32
+ if not pos or not neg:
33
+ return float("nan")
34
+ wins = sum((p > n) + 0.5 * (p == n) for p in pos for n in neg)
35
+ return wins / (len(pos) * len(neg))
36
+
37
+
38
def build_controlled_set(core: str, n: int = 400, seed: int = 20260602) -> list[dict]:
    """Generate ``n`` pseudosites that differ from ``core`` at 1 or 2 random positions.

    Here ``core`` is the full reference sequence being mutated. The central-core positions come
    from the profile config (1-based there, converted to 0-based indices). Each record carries the
    mutated ``site``, its mismatch count ``n_mm`` and ``core_preserved`` (1 when none of the
    mutated positions is a central-core position, else 0). Output is deterministic for a given
    ``seed``.
    """
    rng = random.Random(seed)
    central = {pos - 1 for pos in load_profile_config()["central_core_positions"]}
    pseudosites = []
    for _ in range(n):
        # draw order (count, positions, then one base per position) fixes the random stream
        n_mismatch = rng.choice([1, 2])
        mutated_at = rng.sample(range(len(core)), n_mismatch)
        bases = list(core)
        for idx in mutated_at:
            bases[idx] = rng.choice([b for b in _BASES if b != core[idx]])
        preserved = central.isdisjoint(mutated_at)
        pseudosites.append({"site": "".join(bases), "n_mm": n_mismatch,
                            "core_preserved": int(preserved)})
    return pseudosites
54
+
55
+
56
def run(core: str = "ACGTGTCTACGTGA", out: str | Path = _OUT) -> dict:
    """Score the controlled pseudosite set with the model and with Hamming, and write the report.

    ``core`` is the reference sequence the pseudosites are derived from. The report holds the
    class counts, both AUROCs (rounded to 4 places) for separating core-preserving from
    core-disrupting pseudosites, and whether the unrounded model AUROC exceeds the Hamming one.
    The JSON is written to ``out`` (parent directory created if needed) and the report returned.
    """
    # synthetic, data-independent demonstration -> pin to the literature profile (the measured Perry
    # profile is used by paper4_real_validation; here position 8 weight 1.0 makes the mechanism crisp).
    weights = position_weights(prefer_measured=False)
    rows = build_controlled_set(core)
    model_scores, ham_scores, labels = [], [], []
    for r in rows:
        mm = mismatches(r["site"], core)
        model_scores.append(risk_score(mm, weights))
        ham_scores.append(hamming_risk(mm, len(core)))
        labels.append(r["core_preserved"])
    # compute each AUROC once and reuse it for both the rounded value and the comparison
    model_auc = _auroc(model_scores, labels)
    hamming_auc = _auroc(ham_scores, labels)
    report = {
        "core": core, "n_pseudosites": len(rows),
        "n_core_preserved": sum(labels), "n_core_disrupted": len(labels) - sum(labels),
        "model_auroc": round(model_auc, 4),
        "hamming_auroc": round(hamming_auc, 4),
        "model_beats_hamming": model_auc > hamming_auc,
        "note": "position-weight model vs naive Hamming on core-preserving vs core-disrupting pseudosites; "
                "blind recall of Perry 2025 measured off-targets is gated on the paywalled supplementary",
    }
    Path(out).parent.mkdir(parents=True, exist_ok=True)
    Path(out).write_text(json.dumps(report, indent=2), encoding="utf-8")
    return report
79
+
80
+
81
+ if __name__ == "__main__": # pragma: no cover
82
+ print(json.dumps(run(), indent=2))
@@ -0,0 +1,134 @@
1
+ """WS-C2 - predicted-vs-measured chromatin validation.
2
+
3
+ For a cell type with BOTH measured ENCODE tracks and AlphaGenome predictions (K562, HepG2), on a seeded
4
+ held-out sample of bins:
5
+ 1. per-track agreement (Spearman + Pearson, predicted vs measured) for the marks AlphaGenome covers;
6
+ 2. score-level degradation: recompute writability/safety/p_durable from quantile-mapped predicted tracks
7
+ and correlate against the measured-track scores (how well the predicted epigenome recovers the scores).
8
+
9
+ Honest scope (stated in M1): AlphaGenome predicts for cell types in/near its training data; this enriches
10
+ covered types and approximates related ones - the cross-cell-type writability claim is bounded by that
11
+ coverage. K562 has no predicted H3K9me3 (excluded for K562). Predictions are cached for offline re-runs.
12
+
13
+ Acceptance (prereg/ws_c.yaml): report the per-track correlations and the score-level Spearman; the tool
14
+ flags low confidence where predicted-track agreement is poor. The requirement is that this is measured and
15
+ reported, not a fixed threshold.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ from pathlib import Path
21
+
22
+ import numpy as np
23
+ import pandas as pd
24
+
25
+ from pen_stack.wgenome import chromatin_seq as cs
26
+ from pen_stack.wgenome.features import _log_dist
27
+ from pen_stack.wgenome.providers import AlphaGenomeProvider
28
+
29
+ _ROOT = Path(__file__).resolve().parents[2]
30
+ _FEAT = _ROOT.parent / "phase_1" / "features"
31
+ _OUT = _ROOT / "out" / "seq_vs_measured.json"
32
+ _LOW_CONF = 0.3 # median per-track Spearman below this -> flag low confidence for the cell type
33
+
34
+
35
+ def _spearman(a, b) -> float:
36
+ a, b = pd.Series(np.asarray(a, float)), pd.Series(np.asarray(b, float))
37
+ return float(a.corr(b, method="spearman"))
38
+
39
+
40
+ def _pearson(a, b) -> float:
41
+ a, b = pd.Series(np.asarray(a, float)), pd.Series(np.asarray(b, float))
42
+ return float(a.corr(b, method="pearson"))
43
+
44
+
45
def _sample_bins(ct: str, n: int, seed: int):
    """Seeded sample of ASSAYED bins - rows whose mark columns are not all zero.

    Reads ``chromatin_{ct}.parquet`` from the feature directory, keeps whichever of the seven
    known mark columns are present, and samples up to ``n`` rows whose summed absolute mark signal
    is positive.

    Returns (sample_df with a fresh index, full_chromatin_df, mark_columns).
    """
    known_marks = ("atac", "dnase", "H3K27ac", "H3K4me1", "H3K4me3", "H3K9me3", "H3K27me3")
    chromatin = pd.read_parquet(_FEAT / f"chromatin_{ct}.parquet")
    present = [mark for mark in known_marks if mark in chromatin.columns]
    total_signal = chromatin[present].abs().sum(axis=1)
    assayed = chromatin[total_signal > 0]
    drawn = assayed.sample(n=min(n, len(assayed)), random_state=seed)
    return drawn.reset_index(drop=True), chromatin, present
55
+
56
+
57
def _measured_matrix(sample: pd.DataFrame, ct: str) -> pd.DataFrame:
    """Assemble the scoring matrix for the sampled bins from measured inputs.

    Steps: left-join the safety annotations on (chrom, bin), apply ``add_accessibility``, add a
    ``log_<name>`` column for every ``SAFETY_DIST`` column that is present, and - when an
    ``integration_{ct}.parquet`` file exists - left-join it and fill missing ``integ_*`` values
    with 0.
    """
    from pen_stack.wgenome.features import SAFETY_DIST, add_accessibility

    join_keys = ["chrom", "bin"]
    annotations = pd.read_parquet(_FEAT / "safety_annot.parquet")
    matrix = add_accessibility(sample.merge(annotations, on=join_keys, how="left"))

    for dist_col in SAFETY_DIST:
        if dist_col not in matrix.columns:
            continue
        matrix[f"log_{dist_col}"] = _log_dist(matrix[dist_col])

    integration_file = _FEAT / f"integration_{ct}.parquet"
    if not integration_file.exists():
        return matrix

    integration = pd.read_parquet(integration_file)
    matrix = matrix.merge(integration, on=join_keys, how="left")
    for col in integration.columns:
        if col.startswith("integ_"):
            # bins absent from the integration table get 0 rather than NaN
            matrix[col] = matrix[col].fillna(0)
    return matrix
74
+
75
+
76
def run(ct: str = "k562", n: int = 120, seed: int = 20260604, offline: bool = False,
        out: str | Path = _OUT) -> dict:
    """Compare predicted chromatin tracks with measured ones for ``ct`` and write a JSON report.

    Returns early with ``{"available": False, "note": ...}`` (and writes nothing) when the
    measured chromatin file is missing, when the provider is unavailable and ``offline`` is false,
    or when no predicted tracks come back. Otherwise the report holds per-track Spearman/Pearson
    agreement on a seeded sample of up to ``n`` bins, the median per-track Spearman with a
    low-confidence flag, and - if the safety pickle for ``ct`` exists - the Spearman agreement of
    writability/safety/p_durable recomputed from predicted versus measured tracks.
    """
    chromatin_file = _FEAT / f"chromatin_{ct}.parquet"
    if not chromatin_file.exists():
        return {"available": False, "note": f"measured chromatin for {ct} absent"}
    provider = AlphaGenomeProvider(assembly="hg38")
    if not (provider.available() or offline):
        return {"available": False, "note": "AlphaGenome package+key absent; C2 pending (provide key)"}

    sample, _unused_chromatin, marks = _sample_bins(ct, n, seed)
    pred = cs.predicted_tracks_frame(ct, sample[["chrom", "bin"]], provider, offline=offline)
    if pred.empty:
        return {"available": False, "note": "no predicted tracks (offline cache empty - run live once)"}
    merged = sample.merge(pred, on=["chrom", "bin"], how="inner", suffixes=("_meas", "_pred"))

    # per-track agreement; a mark needs both columns and at least 5 non-missing predictions
    per_track = {}
    for mark in marks:
        meas_col = f"{mark}_meas"
        pred_col = f"{mark}_pred"
        if meas_col not in merged or pred_col not in merged:
            continue
        n_predicted = int(merged[pred_col].notna().sum())
        if n_predicted < 5:
            continue
        per_track[mark] = {"spearman": round(_spearman(merged[meas_col], merged[pred_col]), 4),
                           "pearson": round(_pearson(merged[meas_col], merged[pred_col]), 4),
                           "n": n_predicted}
    if per_track:
        median_sp = float(np.nanmedian([stats["spearman"] for stats in per_track.values()]))
    else:
        median_sp = float("nan")

    # score-level degradation (needs the trained pickles)
    score_block = {"available": False, "note": "trained safety/durability pickles absent"}
    safety_pickle = _ROOT.parent / "phase_1" / "out" / f"safety_{ct}.pkl"
    if safety_pickle.exists():
        measured_matrix = _measured_matrix(sample, ct)
        measured_scores = cs.recompute_scores(measured_matrix, ct)
        predicted_matrix = cs.build_predicted_matrix(measured_matrix, pred, ct)
        predicted_scores = cs.recompute_scores(predicted_matrix, ct)
        joined = measured_scores.merge(predicted_scores, on=["chrom", "bin"], suffixes=("_meas", "_pred"))
        score_spearman = {}
        for score in ["writability", "safety", "p_durable"]:
            score_spearman[f"{score}_spearman"] = round(
                _spearman(joined[f"{score}_meas"], joined[f"{score}_pred"]), 4)
        score_block = {"available": True, "n": int(len(joined)), **score_spearman,
                       # honest flag: predicted tracks recover per-track signal but the COMPOSITE writability
                       # score degrades - so the measured-track atlas stays the backbone (hybrid decision).
                       "score_replacement_low_confidence": bool(
                           score_spearman["writability_spearman"] < _LOW_CONF),
                       "interpretation": "predicted tracks approximate measured tracks per-track (esp. "
                                         "accessibility), but rebuilding the composite writability score "
                                         "from predictions degrades substantially - use measured tracks as "
                                         "the backbone; AlphaGenome for on-demand track/3D signals."}

    from pen_stack.wgenome.providers import MODEL_VERSION
    report = {"available": True, "ct": ct, "n_sample": int(len(merged)), "seed": seed,
              "model_version": MODEL_VERSION,
              "marks_covered": list(per_track), "k562_missing_H3K9me3": ct.lower() == "k562",
              "per_track": per_track, "median_track_spearman": round(median_sp, 4),
              "low_confidence": bool(np.isnan(median_sp) or median_sp < _LOW_CONF),
              "score_level_degradation": score_block,
              "scope": "AlphaGenome covers cell types in/near its training data; cross-cell-type writability "
                       "is bounded by that coverage. Predicted tracks are in model units - per-track uses "
                       "rank (Spearman); score-level quantile-maps predicted tracks onto the measured marginal."}
    destination = Path(out)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(report, indent=2, default=str), encoding="utf-8")
    return report
131
+
132
+
133
+ if __name__ == "__main__": # pragma: no cover
134
+ print(json.dumps(run(), indent=2, default=str))