PyPI - pen-stack - Versions diffs - 3.1.0__py3-none-any.whl - Mend

pen-stack 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

pen_stack/__init__.py +2 -0
pen_stack/_resources.py +34 -0
pen_stack/adapt/__init__.py +14 -0
pen_stack/adapt/finetune.py +33 -0
pen_stack/adapt/ingest.py +86 -0
pen_stack/adapt/pipeline.py +101 -0
pen_stack/adapt/recalibrate.py +58 -0
pen_stack/adapt/report.py +130 -0
pen_stack/agent/__init__.py +1 -0
pen_stack/agent/guardrails.py +49 -0
pen_stack/agent/mcp_server.py +42 -0
pen_stack/agent/orchestrator.py +106 -0
pen_stack/agent/pen_agent.py +169 -0
pen_stack/agent/tools.py +130 -0
pen_stack/atlas/__init__.py +1 -0
pen_stack/atlas/build_wtkb.py +80 -0
pen_stack/atlas/crosslink.py +144 -0
pen_stack/atlas/expand.py +190 -0
pen_stack/atlas/schema.py +59 -0
pen_stack/atlas/scorecard.py +134 -0
pen_stack/atlas/universe.py +75 -0
pen_stack/atlas/variant_propose.py +155 -0
pen_stack/bridge/__init__.py +1 -0
pen_stack/bridge/activity.py +52 -0
pen_stack/bridge/cli.py +65 -0
pen_stack/bridge/fold_qc.py +53 -0
pen_stack/bridge/guide_qc.py +84 -0
pen_stack/bridge/ingest.py +139 -0
pen_stack/bridge/offtarget.py +133 -0
pen_stack/bridge/ortholog_screen.py +73 -0
pen_stack/bridge/pipeline.py +83 -0
pen_stack/cli.py +126 -0
pen_stack/data/__init__.py +1 -0
pen_stack/data/encode.py +84 -0
pen_stack/data/genome.py +71 -0
pen_stack/data/ingest_chromatin.py +119 -0
pen_stack/data/ingest_integration.py +112 -0
pen_stack/data/ingest_safety_annot.py +164 -0
pen_stack/data/ingest_trip.py +76 -0
pen_stack/mech/__init__.py +1 -0
pen_stack/mech/classify_atlas.py +71 -0
pen_stack/mech/whitelist.py +66 -0
pen_stack/monitor/__init__.py +1 -0
pen_stack/monitor/europepmc.py +32 -0
pen_stack/monitor/run.py +57 -0
pen_stack/monitor/triage.py +63 -0
pen_stack/planner/__init__.py +1 -0
pen_stack/planner/cargo.py +56 -0
pen_stack/planner/cargo_polish.py +146 -0
pen_stack/planner/delivery.py +32 -0
pen_stack/planner/multiplex.py +110 -0
pen_stack/planner/optimize.py +156 -0
pen_stack/planner/pipeline.py +86 -0
pen_stack/planner/report.py +26 -0
pen_stack/rag/__init__.py +1 -0
pen_stack/rag/index.py +53 -0
pen_stack/rag/llm.py +178 -0
pen_stack/rag/qa.py +105 -0
pen_stack/score/__init__.py +1 -0
pen_stack/score/recalibrate.py +77 -0
pen_stack/score/therapeutic.py +85 -0
pen_stack/server/__init__.py +1 -0
pen_stack/server/api.py +142 -0
pen_stack/ui/__init__.py +1 -0
pen_stack/ui/app.py +518 -0
pen_stack/validate/__init__.py +1 -0
pen_stack/validate/adapt_demo.py +69 -0
pen_stack/validate/agent_eval.py +117 -0
pen_stack/validate/blind_gsh_discovery.py +165 -0
pen_stack/validate/cargo_directionality.py +57 -0
pen_stack/validate/durability_baselines.py +150 -0
pen_stack/validate/forward_hypotheses.py +104 -0
pen_stack/validate/guide_qc_demo.py +58 -0
pen_stack/validate/intent_specification.py +82 -0
pen_stack/validate/paper3_benchmark.py +165 -0
pen_stack/validate/paper4_real_validation.py +144 -0
pen_stack/validate/paper4_validation.py +82 -0
pen_stack/validate/seq_vs_measured.py +134 -0
pen_stack/validate/within_locus_ranking.py +74 -0
pen_stack/validate/writer_recovery.py +86 -0
pen_stack/wgenome/__init__.py +1 -0
pen_stack/wgenome/chromatin_seq.py +83 -0
pen_stack/wgenome/durability.py +108 -0
pen_stack/wgenome/export_tracks.py +52 -0
pen_stack/wgenome/features.py +82 -0
pen_stack/wgenome/gsh_baseline.py +117 -0
pen_stack/wgenome/providers.py +245 -0
pen_stack/wgenome/safety.py +69 -0
pen_stack/wgenome/structure3d.py +168 -0
pen_stack/wgenome/writability.py +72 -0
pen_stack-3.1.0.dist-info/METADATA +451 -0
pen_stack-3.1.0.dist-info/RECORD +96 -0
pen_stack-3.1.0.dist-info/WHEEL +5 -0
pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
pen_stack-3.1.0.dist-info/top_level.txt +1 -0

pen_stack/validate/intent_specification.py ADDED Viewed

@@ -0,0 +1,82 @@
+"""Intent specification-compliance (v3.1, WS-A2) - NOT a predictive benchmark.
+This reframes the former "discriminating-stratum recovery@k" result. For a *targeted* intent the planner
+ranks the goal's own gene first by construction (see docs/benchmark_circularity.md), so gene-level recovery
+is definitional and must NOT be reported as predictive skill or carry a p-value/CI.
+What remains valid is a **behavioral-correctness** property: does the same locus change rank under opposing
+goals exactly as specified? An in-gene site must rank HIGH under a disruption/excision intent (hitting the
+gene is the goal) and LOW under safe-harbour insertion (the gene must be avoided). We report this as an
+exact-match correctness table, never as recovery or a hypothesis test.
+Outputs: out/intent_specification.json.
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+import pandas as pd
+from pen_stack.planner.optimize import EditIntent, plan
+# Default report destination: <parents[2] of this file>/out/intent_specification.json.
+_OUT = Path(__file__).resolve().parents[2] / "out" / "intent_specification.json"
+# Atlas parquet read when specification_table() is called without a frame; it lives in a
+# sibling "phase_1/out" directory next to parents[2] (presumably the K562 atlas, per the file name).
+_WDF = Path(__file__).resolve().parents[2].parent / "phase_1" / "out" / "atlas_k562.parquet"
+# (gene, targeted-intent) pairs whose documented write is INSIDE the gene/element.
+# Each pair is evaluated twice by specification_table(): once under the listed intent and once
+# under EditIntent.SAFE_HARBOUR.
+_CASES = [
+    ("TRAC", EditIntent.KNOCK_IN_DISRUPT),
+    ("PDCD1", EditIntent.KNOCK_IN_DISRUPT),
+    ("B2M", EditIntent.KNOCK_IN_DISRUPT),
+    ("BCL11A", EditIntent.REG_EXCISION),
+    ("HBG1", EditIntent.REG_EXCISION),
+    ("FXN", EditIntent.REPEAT_EXCISION),
+    ("ALB", EditIntent.HIGH_DURABILITY),
+]
+def _top_is_on_target(gene: str, intent: EditIntent, wdf: pd.DataFrame, k: int = 5) -> bool | None:
+    """Tell whether the planner's first-ranked candidate carries a truthy ``on_target`` value.
+
+    ``plan`` is invoked with a fixed third argument of 2000. Returns ``None`` when the ranking is
+    empty, otherwise row 0's ``on_target`` coerced to ``bool``.
+    """
+    candidates = plan(gene, intent, 2000, wdf, k=k)
+    return None if candidates.empty else bool(candidates.iloc[0]["on_target"])
+def specification_table(wdf: pd.DataFrame | None = None) -> pd.DataFrame:
+    """Build the per-gene specification-compliance table.
+
+    For every ``(gene, intent)`` pair in ``_CASES`` the planner is queried twice: under the
+    targeted intent and under ``EditIntent.SAFE_HARBOUR``. A row is marked correct only when the
+    top candidate is on-target under the targeted intent (exactly ``True``) and not on-target
+    under safe-harbour (exactly ``False``); an empty ranking (``None``) never counts as correct.
+
+    When ``wdf`` is omitted the atlas is read from ``_WDF``.
+    """
+    atlas = pd.read_parquet(_WDF) if wdf is None else wdf
+
+    def _row(gene, targeted):
+        # targeted intent first, then safe-harbour, for the same gene
+        hit_targeted = _top_is_on_target(gene, targeted, atlas)
+        hit_safe = _top_is_on_target(gene, EditIntent.SAFE_HARBOUR, atlas)
+        return {
+            "gene": gene,
+            "targeted_intent": targeted.value,
+            "top_on_target_under_targeted": hit_targeted,
+            "top_on_target_under_safe_harbour": hit_safe,
+            "specification_correct": (hit_targeted is True) and (hit_safe is False),
+        }
+
+    return pd.DataFrame([_row(gene, targeted) for gene, targeted in _CASES])
+def run(out: str | Path = _OUT) -> dict:
+    """Compute the specification table, write the JSON report to ``out`` and return it.
+
+    The parent directory of ``out`` is created if missing. The report carries the case count, the
+    number of correct cases, the full table as records, and fixed explanatory strings.
+    """
+    table = specification_table()
+    total = len(table)
+    passed = int(table["specification_correct"].sum())
+    report = {
+        "what_this_is": "behavioral specification-compliance, NOT a predictive benchmark or recovery metric",
+        "property": "the same locus must rank high under a targeted intent and low under safe-harbour",
+        "n_cases": total,
+        "n_correct": passed,
+        "all_correct": passed == total,
+        "table": table.to_dict("records"),
+        "scope": "definitional by design; no recovery@k, p-value, or CI is attached to this result. The "
+                 "predictive headline is the blind safe-harbour discovery (WS-A3), not this table.",
+    }
+    destination = Path(out)
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    destination.write_text(json.dumps(report, indent=2, default=str), encoding="utf-8")
+    return report
+# Script entry point: run with the default output path and echo the report as JSON.
+if __name__ == "__main__":  # pragma: no cover
+    print(json.dumps(run(), indent=2, default=str))

pen_stack/validate/paper3_benchmark.py ADDED Viewed

@@ -0,0 +1,165 @@
+"""Two-stratum recovery@k benchmark (Phase 3, Step 3.5).
+CIRCULARITY NOTICE (v3.1, WS-A). The *discriminating* (targeted-intent) stratum result reported here -
+"recovery@10 = 1.00 vs 0.00", with a McNemar p and a bootstrap CI - is **definitional, not predictive**:
+an on-target identity term (`on_target = gene == target_gene`, magnitude 1.0) dominates a [0,1] base, so
+the planner ranks the goal's own gene first by construction. See `docs/benchmark_circularity.md`. It must
+NOT be cited as predictive evidence. The de-circularized replacements are
+`pen_stack/validate/{intent_specification,blind_gsh_discovery,writer_recovery,within_locus_ranking}.py`,
+with the **blind GSH discovery (AUROC vs matched controls)** as the honest headline. The *control* stratum
+below (genome-wide safe-harbour search) is non-circular and remains valid.
+(Original docstring follows.) Show the Write Planner recovers documented targeted-writes - *especially the
+non-obvious ones a naive baseline cannot* - from the goal (gene + edit_intent) alone, with the precise site
+held out. The panel is adversarial to the baseline by construction:
+  * Control stratum (safe-harbour writes): a safety ranker should recover these - the Planner must not be
+    worse.
+  * Discriminating stratum (therapeutic-into-functional-locus writes): an intent-blind safety ranker keeps
+    proposing safe harbours and *misses* the intended (often intragenic) target; the Planner, conditioned
+    on edit_intent, recovers them. This is the headline.
+Anti-leakage: the Planner scores a fixed candidate POOL (panel loci + decoy genes) from the goal only;
+recovery@k = the documented locus appearing in the Planner's top-k. The baseline ranks the same pool by
+safety alone (intent-blind). Reported per stratum with a McNemar exact test + bootstrap CI of the gap.
+Inputs : data/benchmark_panel.csv (frozen, SHA-locked in prereg/paper3.yaml); Phase-1 writability atlas.
+Outputs: out/benchmark_report.json.
+"""
+from __future__ import annotations
+import json
+from functools import lru_cache
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from pen_stack.planner.optimize import load_intent_weights, score_candidates
+# Base directory for inputs/outputs: parents[2] of this file's resolved path.
+_ROOT = Path(__file__).resolve().parents[2]
+# Benchmark panel CSV read by run().
+_PANEL = _ROOT / "data" / "benchmark_panel.csv"
+# Default destination of the JSON report written by run().
+_OUT = _ROOT / "out" / "benchmark_report.json"
+# Divisor that turns gene start/end coordinates into bin indices (see _gene_candidate).
+BIN_BP = 1000
+# Default number of decoy genes drawn by build_pool().
+N_DECOYS = 30
+# Seed shared by the decoy draw and the bootstrap resampling so runs are repeatable.
+SEED = 20260602
+@lru_cache(maxsize=4)
+def _gene_coords() -> pd.DataFrame:
+    """Read the gene-coordinate parquet; the result is memoised by ``lru_cache``."""
+    from pen_stack.planner.optimize import gene_coords_path
+    coords_file = gene_coords_path()
+    return pd.read_parquet(coords_file)
+def _gene_candidate(gene: str, writable_df: pd.DataFrame) -> dict | None:
+    """Represent a gene by the single best-writability bin overlapping its body.
+
+    The gene's first row in the coordinate table gives ``chrom``/``start``/``end``; the body is
+    every row of ``writable_df`` on that chromosome whose ``bin`` lies in
+    ``[start // BIN_BP, end // BIN_BP]`` (inclusive). The candidate's ``safety``, ``p_durable`` and
+    ``reachable_tier1`` are those of the bin with the highest ``writability`` - NOT an average over
+    the body.
+
+    Returns ``None`` when the gene is unknown, when no bin overlaps it, or when no overlapping bin
+    has a non-missing ``writability`` (so there is nothing to pick a maximum from).
+    """
+    gc = _gene_coords()
+    g = gc[gc["gene"] == gene]
+    if g.empty:
+        return None
+    r = g.iloc[0]
+    lo, hi = int(r["start"]) // BIN_BP, int(r["end"]) // BIN_BP
+    body = writable_df[(writable_df["chrom"] == r["chrom"]) & (writable_df["bin"].between(lo, hi))]
+    # an all-missing writability column has no maximum; treat it like an uncovered gene
+    if body.empty or body["writability"].isna().all():
+        return None
+    # represent the locus by its BEST writable bin - the site a planner would actually target within it
+    best = body.loc[body["writability"].idxmax()]
+    return {"gene": gene, "chrom": r["chrom"], "bin": int(best["bin"]),
+            "safety": float(best["safety"]), "p_durable": float(best["p_durable"]),
+            "reachable_tier1": best["reachable_tier1"]}
+def build_pool(panel: pd.DataFrame, writable_df: pd.DataFrame, n_decoys: int = N_DECOYS) -> pd.DataFrame:
+    """Candidate pool = panel genes + random decoy genes (deterministic, seeded by ``SEED``).
+
+    Every unique panel gene and up to ``n_decoys`` genes drawn without replacement from the
+    remaining coordinate-table genes are converted with ``_gene_candidate``; genes that yield no
+    candidate are dropped. The result has one row per gene with a fresh 0..n-1 index.
+
+    If no gene yields a candidate, an empty frame with the pool columns is returned instead of
+    failing on the missing ``gene`` column.
+    """
+    rows = []
+    for gene in panel["gene"].unique():
+        c = _gene_candidate(gene, writable_df)
+        if c is not None:
+            rows.append(c)
+    gc = _gene_coords()
+    rng = np.random.default_rng(SEED)
+    pool_genes = set(panel["gene"])
+    decoy_choices = gc[~gc["gene"].isin(pool_genes)]["gene"].dropna().unique()
+    for gene in rng.choice(decoy_choices, size=min(n_decoys, len(decoy_choices)), replace=False):
+        c = _gene_candidate(gene, writable_df)
+        if c is not None:
+            rows.append(c)
+    if not rows:
+        # DataFrame([]) has no "gene" column, so drop_duplicates("gene") would raise KeyError
+        return pd.DataFrame(columns=["gene", "chrom", "bin", "safety", "p_durable", "reachable_tier1"])
+    return pd.DataFrame(rows).drop_duplicates("gene").reset_index(drop=True)
+def _writable(ct: str) -> pd.DataFrame:
+    """Return whatever ``load_writability`` yields for cell type ``ct`` (imported lazily)."""
+    from pen_stack.atlas.crosslink import load_writability
+    atlas = load_writability(ct)
+    return atlas
+def recovery_at_k(panel: pd.DataFrame, k: int = 10, cargo_bp: int = 2000) -> pd.DataFrame:
+    """Planner (goal-conditioned) vs baseline (intent-blind safety), recovery@k per panel entry.
+
+    For each panel row the candidate pool of its cell type (built once per ``ct`` and reused) is
+    ranked twice: by ``score_candidates`` under the row's intent, and by ``safety`` alone. A hit
+    means the row's gene is among the first ``k`` genes of the respective ranking.
+
+    Returns one row per panel entry with ``name``, ``gene``, ``stratum``, ``intent``,
+    ``planner_hit`` and ``baseline_hit`` (0/1).
+    """
+    rows = []
+    pools: dict[str, pd.DataFrame] = {}
+    # the intent table does not depend on the panel row: load it once, on first use
+    intents = None
+    for _, t in panel.iterrows():
+        ct = t["ct"]
+        if ct not in pools:
+            pools[ct] = build_pool(panel, _writable(ct))
+        pool = pools[ct].copy()
+        if intents is None:
+            intents = load_intent_weights()["intents"]
+        # PLANNER: score the pool with this entry's intent. on_target marks the entry's own target gene
+        # ONLY for *targeted* intents; safe-harbour is genome-wide (the destination is not a gene-to-avoid),
+        # so on_target stays False and recovery is pure safety x durability ranking.
+        genome_wide = bool(intents[t["intent"]].get("genome_wide", False))
+        pool["on_target"] = (pool["gene"] == t["gene"]) & (not genome_wide)
+        scored = score_candidates(pool, t["intent"], cargo_bp)
+        planner_topk = list(scored.head(k)["gene"])
+        # BASELINE: intent-blind, rank the same pool by safety only. Stable sort + tie-breakers so the
+        # saturated-safety ties resolve identically every run (default quicksort is not stable).
+        baseline_topk = list(pool.sort_values(["safety", "chrom", "bin"], ascending=[False, True, True],
+                                              kind="stable").head(k)["gene"])
+        rows.append({"name": t["name"], "gene": t["gene"], "stratum": t["stratum"],
+                     "intent": t["intent"],
+                     "planner_hit": int(t["gene"] in planner_topk),
+                     "baseline_hit": int(t["gene"] in baseline_topk)})
+    return pd.DataFrame(rows)
+def stratified_report(rec: pd.DataFrame) -> dict:
+    """Summarise planner-vs-baseline recovery per stratum ("control", "discriminating").
+
+    For each stratum present in ``rec`` the 2x2 paired table of hits is tested with an exact
+    McNemar test, and the mean gap (planner_hit - baseline_hit) gets a 95% percentile bootstrap
+    interval from 5000 resamples seeded with ``SEED``. Strata with no rows are omitted.
+
+    ``ci_excludes_zero`` is true when the interval lies entirely above OR entirely below zero;
+    the sign of ``gap_mean`` tells which ranker is ahead.
+    """
+    from statsmodels.stats.contingency_tables import mcnemar
+    out = {}
+    for s in ["control", "discriminating"]:
+        sub = rec[rec["stratum"] == s]
+        if sub.empty:
+            continue
+        b = int(((sub.planner_hit == 1) & (sub.baseline_hit == 0)).sum())   # planner wins
+        c = int(((sub.planner_hit == 0) & (sub.baseline_hit == 1)).sum())   # baseline wins
+        a = int(((sub.planner_hit == 1) & (sub.baseline_hit == 1)).sum())   # both hit
+        d = int(((sub.planner_hit == 0) & (sub.baseline_hit == 0)).sum())   # both miss
+        res = mcnemar([[a, b], [c, d]], exact=True)
+        # bootstrap CI of the recovery gap (planner - baseline)
+        diff = (sub.planner_hit - sub.baseline_hit).to_numpy()
+        rng = np.random.default_rng(SEED)
+        boot = [rng.choice(diff, size=len(diff), replace=True).mean() for _ in range(5000)]
+        ci = (float(np.percentile(boot, 2.5)), float(np.percentile(boot, 97.5)))
+        out[s] = {"n": int(len(sub)),
+                  "planner_recovery": round(float(sub.planner_hit.mean()), 4),
+                  "baseline_recovery": round(float(sub.baseline_hit.mean()), 4),
+                  "planner_wins": b, "baseline_wins": c,
+                  "mcnemar_pvalue": float(res.pvalue),
+                  "gap_mean": round(float(diff.mean()), 4),
+                  "gap_ci95": [round(ci[0], 4), round(ci[1], 4)],
+                  # an interval wholly below zero excludes zero just as one wholly above it does
+                  "ci_excludes_zero": bool(ci[0] > 0 or ci[1] < 0)}
+    return out
+def run(k: int = 10, out: str | Path = _OUT) -> dict:
+    """Run the recovery@k benchmark on the panel CSV, write the JSON report to ``out``, return it.
+
+    The parent directory of ``out`` is created if it does not exist.
+    """
+    panel = pd.read_csv(_PANEL)
+    per_case = recovery_at_k(panel, k=k)
+    report = {
+        "k": k,
+        "n_panel": len(panel),
+        "strata": stratified_report(per_case),
+        "per_case": per_case.to_dict("records"),
+    }
+    destination = Path(out)
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    destination.write_text(json.dumps(report, indent=2), encoding="utf-8")
+    return report
+# Script entry point: print the per-stratum summary, then one line per panel case.
+if __name__ == "__main__":  # pragma: no cover
+    r = run()
+    print(json.dumps(r["strata"], indent=2))
+    print("\nper-case:")
+    for c in r["per_case"]:
+        # stratum is cut to its first 4 characters; name and intent are padded to fixed widths
+        print(f"  [{c['stratum'][:4]}] {c['name']:8s} {c['intent']:26s} planner={c['planner_hit']} baseline={c['baseline_hit']}")

pen_stack/validate/paper4_real_validation.py ADDED Viewed

@@ -0,0 +1,144 @@
+"""Paper 4 validation on the REAL Perry 2025 data (Phase 1.5).
+Now that the Perry 2025 supplementary (Science adz0276) is available locally, the previously *gated*
+criteria are validated against measured data (raw tables stay local - copyrighted; only derived results
+are written):
+1. **Measured position profile** - derive per-position protective weights from 6,856 real off-targets
+   (UMI-weighted). The data confirm the mechanism: the central core (positions 7-9, esp. 8) is the most
+   conserved; distal positions are tolerant. This measured profile replaces the literature one.
+2. **HEADLINE - blind discrimination of real off-targets, beating Hamming.** Real observed off-targets
+   (which recombined -> core preserved) are positives; a core-disrupted decoy of each (position-8 mutated ->
+   non-recombinogenic) is the negative. The position-weight model separates them near-perfectly where a
+   position-blind Hamming ranking cannot (AUROC).
+3. **DMS variant-effect** - the Perry Table S3 deep mutational scan recovers the top activity-enhancing
+   single mutants (e.g. N322P, H50K); completes the Phase-2 Section 2.4 DMS variant-proposal step.
+4. **Honest limitation** - predicted sequence-risk does NOT rank the *magnitude* of recombination among
+   already-observed off-targets (that is dominated by genomic context, not core sequence).
+Outputs: out/bridge_real_validation.json, data/curated/bridge_offtarget_profile_measured.parquet.
+"""
+from __future__ import annotations
+import json
+import random
+from pathlib import Path
+from pen_stack.bridge.ingest import derive_measured_profile, load_dms, load_insertion_sites
+from pen_stack.bridge.offtarget import hamming_risk, mismatches, position_weights, risk_score
+# Base directory for outputs: parents[2] of this file's resolved path.
+_ROOT = Path(__file__).resolve().parents[2]
+# Default destination of the JSON report written by run().
+_OUT = _ROOT / "out" / "bridge_real_validation.json"
+# Where measured_profile() writes the derived per-position profile.
+_PROFILE = _ROOT / "data" / "curated" / "bridge_offtarget_profile_measured.parquet"  # derived, committable
+# Index used to compare and mutate the core base in discrimination_auroc().
+_CORE0 = 7   # 0-based index of position 8 (the most-conserved / most-critical position)
+def _auroc(scores, labels) -> float:
+    """AUROC as the fraction of (positive, negative) pairs the positive wins; ties count 0.5.
+
+    ``labels`` equal to 1 mark positives and 0 mark negatives; any other label is ignored.
+    Returns NaN when either class is empty. A NaN score never wins or ties a pair but still
+    counts in the denominator.
+
+    Negatives are sorted once and each positive is located by binary search, so the cost is
+    O(n log n) rather than one comparison per pair.
+    """
+    from bisect import bisect_left, bisect_right
+
+    pos, neg = [], []
+    for s, y in zip(scores, labels):
+        if y == 1:
+            pos.append(s)
+        elif y == 0:
+            neg.append(s)
+    if not pos or not neg:
+        return float("nan")
+    # NaN compares false with everything, so it can neither be beaten nor tied: leave it out of
+    # the sorted list (sorting with NaN is ill-defined) but keep it in len(neg) below
+    ordered = sorted(n for n in neg if n == n)
+    wins = 0.0
+    for p in pos:
+        if p != p:
+            continue  # a NaN positive wins no pair
+        below = bisect_left(ordered, p)            # negatives strictly smaller than p
+        tied = bisect_right(ordered, p) - below    # negatives equal to p
+        wins += below + 0.5 * tied
+    return wins / (len(pos) * len(neg))
+def measured_profile() -> dict:
+    """Derive the measured per-position profile, persist it to ``_PROFILE`` and summarise it.
+
+    Returns ``{"available": False}`` when the derived profile is empty. Otherwise the summary
+    holds the off-target count (taken from column ``n_offtarget`` if present, else
+    ``n_offtargets``), the conservation per position, the three positions with the highest
+    conservation, and whether any of those three is 7, 8 or 9.
+    """
+    profile = derive_measured_profile()
+    if profile.empty:
+        return {"available": False}
+    _PROFILE.parent.mkdir(parents=True, exist_ok=True)
+    profile.to_parquet(_PROFILE, index=False)
+    conservation = dict(zip(profile["position"], profile["conservation"]))
+    ranked = sorted(conservation, key=conservation.get, reverse=True)
+    top3 = ranked[:3]
+    count_col = "n_offtarget" if "n_offtarget" in profile else "n_offtargets"
+    summary = {"available": True}
+    summary["n_offtargets"] = int(profile[count_col].iloc[0])
+    summary["conservation"] = {int(pos): round(float(val), 3) for pos, val in conservation.items()}
+    summary["most_critical_positions"] = [int(pos) for pos in top3]
+    summary["central_core_confirmed"] = not set(top3).isdisjoint({7, 8, 9})
+    return summary
+def discrimination_auroc(seed: int = 20260602) -> dict:
+    """AUROC of the position-weight model vs Hamming on real off-targets and core-mutated decoys.
+
+    Only off-target rows whose site and intended sequences are both 14 long are used, and only
+    those whose base at ``_CORE0`` matches the intended base. Each such site is a positive; its
+    decoy - the same site with the ``_CORE0`` base replaced by a seeded random different base -
+    is the negative. Both are scored with ``risk_score`` (model) and ``hamming_risk`` (baseline).
+
+    Returns ``{"available": False}`` when no insertion-site table is available; otherwise the
+    pair count, both AUROCs rounded to 4 places, and whether the model's unrounded AUROC is
+    strictly higher.
+    """
+    s2 = load_insertion_sites()
+    if s2.empty:
+        return {"available": False}
+    off = s2[(s2["On-Target"] == False) &  # noqa: E712
+             (s2["Insertion_Site_Sequence"].str.len() == 14) &
+             (s2["Plasmid_Encoded_Sequence"].str.len() == 14)]
+    w = position_weights()           # measured weights
+    rng = random.Random(seed)
+    scores_m, scores_h, labels = [], [], []
+    n = 0
+    for seq, intended in zip(off["Insertion_Site_Sequence"], off["Plasmid_Encoded_Sequence"]):
+        if seq[_CORE0] != intended[_CORE0]:
+            continue                 # only positives that preserve the critical core position
+        # positive: the real off-target
+        mm = mismatches(seq, intended)
+        scores_m.append(risk_score(mm, w))
+        scores_h.append(hamming_risk(mm, 14))
+        labels.append(1)
+        # negative: same site but the critical core position mutated (non-recombinogenic decoy)
+        alt = rng.choice([b for b in "ACGT" if b != seq[_CORE0]])
+        decoy = seq[:_CORE0] + alt + seq[_CORE0 + 1:]
+        mmd = mismatches(decoy, intended)
+        scores_m.append(risk_score(mmd, w))
+        scores_h.append(hamming_risk(mmd, 14))
+        labels.append(0)
+        n += 1
+    # each AUROC is a full pass over the scores: compute it once and reuse it for the comparison
+    auc_model = _auroc(scores_m, labels)
+    auc_hamming = _auroc(scores_h, labels)
+    return {"available": True, "n_pairs": n,
+            "model_auroc": round(auc_model, 4),
+            "hamming_auroc": round(auc_hamming, 4),
+            "model_beats_hamming": auc_model > auc_hamming}
+def dms_enhancers(top_k: int = 10) -> dict:
+    """Summarise the deep-mutational-scan table: how many variants enhance activity, and the top ones.
+
+    ``Z_Score_wrt_WT`` is coerced to numeric and rows without a usable value are dropped. A
+    variant counts as enhancing when its Z is above 0; the ``top_k`` highest-Z variants are listed.
+
+    Returns ``{"available": False}`` when the table is empty or holds no numeric Z-score.
+    """
+    dms = load_dms()
+    if dms.empty:
+        return {"available": False}
+    import pandas as pd
+    dms = dms.copy()
+    dms["Z"] = pd.to_numeric(dms["Z_Score_wrt_WT"], errors="coerce")
+    dms = dms.dropna(subset=["Z"])
+    if dms.empty:
+        # nothing numeric survived the coercion: there is nothing to rank, and the fraction
+        # below would divide by zero
+        return {"available": False}
+    top = dms.sort_values("Z", ascending=False).head(top_k)
+    enh = int((dms["Z"] > 0).sum())
+    return {"available": True, "n_variants": int(len(dms)),
+            "n_enhancing": enh, "frac_enhancing": round(enh / len(dms), 4),
+            "top_enhancers": [{"mutation": str(m), "z": round(float(z), 3)}
+                              for m, z in zip(top["Mutation"], top["Z"])]}
+def magnitude_limit() -> dict:
+    """Spearman correlation of predicted risk with ``%_of_Insertions`` over observed off-targets.
+
+    Uses the off-target rows whose site and intended sequences are both 14 long. Returns
+    ``{"available": False}`` when no insertion-site table is available. The accompanying note
+    states that a weak correlation is expected.
+    """
+    from scipy.stats import spearmanr
+    sites = load_insertion_sites()
+    if sites.empty:
+        return {"available": False}
+    is_offtarget = sites["On-Target"] == False  # noqa: E712
+    site_is_14 = sites["Insertion_Site_Sequence"].str.len() == 14
+    intended_is_14 = sites["Plasmid_Encoded_Sequence"].str.len() == 14
+    observed = sites[is_offtarget & site_is_14 & intended_is_14]
+    weights = position_weights()
+    predicted = []
+    for site_seq, intended_seq in zip(observed["Insertion_Site_Sequence"],
+                                      observed["Plasmid_Encoded_Sequence"]):
+        predicted.append(risk_score(mismatches(site_seq, intended_seq), weights))
+    result = spearmanr(predicted, observed["%_of_Insertions"].values)
+    summary = {"available": True}
+    summary["risk_vs_magnitude_spearman"] = round(float(result.correlation), 3)
+    summary["note"] = ("weak by design - recombination magnitude among observed off-targets is dominated by "
+                       "genomic context, not core sequence; the model's value is discrimination, not magnitude")
+    return summary
+def run(out: str | Path = _OUT) -> dict:
+    """Run the four validation sections in order, write the combined JSON to ``out``, return it.
+
+    The parent directory of ``out`` is created if it does not exist.
+    """
+    sections = (
+        ("measured_profile", measured_profile),
+        ("discrimination_headline", discrimination_auroc),
+        ("dms_enhancers", dms_enhancers),
+        ("magnitude_limitation", magnitude_limit),
+    )
+    report = {key: compute() for key, compute in sections}
+    report["data_source"] = "Perry et al. 2025, Science 391:eadz0276 (Tables S1-S3) - raw tables local/copyrighted"
+    destination = Path(out)
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    destination.write_text(json.dumps(report, indent=2), encoding="utf-8")
+    return report
+# Script entry point: run all sections with the default output path and echo the report.
+if __name__ == "__main__":  # pragma: no cover
+    print(json.dumps(run(), indent=2))

pen_stack/validate/paper4_validation.py ADDED Viewed

@@ -0,0 +1,82 @@
+"""Paper 4 validation (Phase 1.5) - off-target engine vs naive Hamming.
+The headline criterion that does NOT need the paywalled measured data: the position-weight model is
+strictly more informative than a position-blind Hamming ranking. On a controlled set of pseudosites with
+the SAME mismatch count but different positions, the model ranks biologically plausible off-targets
+(distal mismatches, core preserved) above implausible ones (central CT core disrupted), while Hamming
+cannot separate them. We quantify this as the AUROC of each score for discriminating
+core-preserving (label 1, real off-target risk) vs core-disrupting (label 0, recombination abolished).
+The blind recall of Perry 2025's measured off-target coordinates is gated on the paywalled supplementary
+(prereg/paper4.yaml) and is not computed here.
+Outputs: out/bridge_validation.json.
+"""
+from __future__ import annotations
+import json
+import random
+from pathlib import Path
+from pen_stack.bridge.ingest import load_profile_config
+from pen_stack.bridge.offtarget import hamming_risk, mismatches, position_weights, risk_score
+# Default report destination: <parents[2] of this file>/out/bridge_validation.json.
+_OUT = Path(__file__).resolve().parents[2] / "out" / "bridge_validation.json"
+# Alphabet from which build_controlled_set() picks a substitute base at each mutated position.
+_BASES = "ACGT"
+def _auroc(scores: list[float], labels: list[int]) -> float:
+    """Probability that a label-1 score outranks a label-0 score, a tie counting as half a win.
+
+    Equivalent to the normalised Mann-Whitney U statistic. Labels other than 0 and 1 are
+    ignored; NaN is returned when either class has no members.
+    """
+    positives, negatives = [], []
+    for score, label in zip(scores, labels):
+        if label == 1:
+            positives.append(score)
+        elif label == 0:
+            negatives.append(score)
+    if not (positives and negatives):
+        return float("nan")
+    credit = 0
+    for p in positives:
+        for q in negatives:
+            credit += (p > q) + 0.5 * (p == q)
+    return credit / (len(positives) * len(negatives))
+def build_controlled_set(core: str, n: int = 400, seed: int = 20260602) -> list[dict]:
+    """Create ``n`` seeded pseudosites, each ``core`` with one or two substituted bases.
+
+    Every record holds the mutated ``site``, its mismatch count ``n_mm`` and ``core_preserved``:
+    0 when any mutated index is one of the configured central-core positions (given 1-based in
+    the profile config), 1 otherwise.
+    """
+    rng = random.Random(seed)
+    core_idx = {pos - 1 for pos in load_profile_config()["central_core_positions"]}
+    pseudosites = []
+    for _ in range(n):
+        n_mm = rng.choice([1, 2])
+        mutated = rng.sample(range(len(core)), n_mm)
+        bases = list(core)
+        for idx in mutated:
+            alternatives = [b for b in _BASES if b != core[idx]]
+            bases[idx] = rng.choice(alternatives)
+        hits_core = not core_idx.isdisjoint(mutated)
+        pseudosites.append({"site": "".join(bases), "n_mm": n_mm,
+                            "core_preserved": int(not hits_core)})
+    return pseudosites
+def run(core: str = "ACGTGTCTACGTGA", out: str | Path = _OUT) -> dict:
+    """Score the controlled pseudosite set with the model and with Hamming; write and return the report.
+
+    Each pseudosite from ``build_controlled_set(core)`` is scored by ``risk_score`` with the
+    non-measured weights and by ``hamming_risk``; the AUROC of each score against the
+    ``core_preserved`` label is reported, rounded to 4 places. ``model_beats_hamming`` compares
+    the unrounded values. The parent directory of ``out`` is created if missing.
+    """
+    # synthetic, data-independent demonstration -> pin to the literature profile (the measured Perry
+    # profile is used by paper4_real_validation; here position 8 weight 1.0 makes the mechanism crisp).
+    weights = position_weights(prefer_measured=False)
+    rows = build_controlled_set(core)
+    model_scores, ham_scores, labels = [], [], []
+    for r in rows:
+        mm = mismatches(r["site"], core)
+        model_scores.append(risk_score(mm, weights))
+        ham_scores.append(hamming_risk(mm, len(core)))
+        labels.append(r["core_preserved"])
+    # compute each AUROC once and reuse it for both the rounded value and the comparison
+    model_auc = _auroc(model_scores, labels)
+    ham_auc = _auroc(ham_scores, labels)
+    report = {
+        "core": core, "n_pseudosites": len(rows),
+        "n_core_preserved": sum(labels), "n_core_disrupted": len(labels) - sum(labels),
+        "model_auroc": round(model_auc, 4),
+        "hamming_auroc": round(ham_auc, 4),
+        "model_beats_hamming": model_auc > ham_auc,
+        "note": "position-weight model vs naive Hamming on core-preserving vs core-disrupting pseudosites; "
+                "blind recall of Perry 2025 measured off-targets is gated on the paywalled supplementary",
+    }
+    Path(out).parent.mkdir(parents=True, exist_ok=True)
+    Path(out).write_text(json.dumps(report, indent=2), encoding="utf-8")
+    return report
+# Script entry point: run with the default core and output path, then echo the report.
+if __name__ == "__main__":  # pragma: no cover
+    print(json.dumps(run(), indent=2))

pen_stack/validate/seq_vs_measured.py ADDED Viewed

@@ -0,0 +1,134 @@
+"""WS-C2 - predicted-vs-measured chromatin validation.
+For a cell type with BOTH measured ENCODE tracks and AlphaGenome predictions (K562, HepG2), on a seeded
+held-out sample of bins:
+  1. per-track agreement (Spearman + Pearson, predicted vs measured) for the marks AlphaGenome covers;
+  2. score-level degradation: recompute writability/safety/p_durable from quantile-mapped predicted tracks
+     and correlate against the measured-track scores (how well the predicted epigenome recovers the scores).
+Honest scope (stated in M1): AlphaGenome predicts for cell types in/near its training data; this enriches
+covered types and approximates related ones - the cross-cell-type writability claim is bounded by that
+coverage. K562 has no predicted H3K9me3 (excluded for K562). Predictions are cached for offline re-runs.
+Acceptance (prereg/ws_c.yaml): report the per-track correlations and the score-level Spearman; the tool
+flags low confidence where predicted-track agreement is poor. The requirement is that this is measured and
+reported, not a fixed threshold.
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from pen_stack.wgenome import chromatin_seq as cs
+from pen_stack.wgenome.features import _log_dist
+from pen_stack.wgenome.providers import AlphaGenomeProvider
+# Base directory for outputs: parents[2] of this file's resolved path.
+_ROOT = Path(__file__).resolve().parents[2]
+# Feature parquets (chromatin_<ct>, safety_annot, integration_<ct>) are read from a sibling
+# "phase_1/features" directory next to _ROOT.
+_FEAT = _ROOT.parent / "phase_1" / "features"
+# Default destination of the JSON report written by run().
+_OUT = _ROOT / "out" / "seq_vs_measured.json"
+# run() also applies this same threshold to the score-level writability Spearman.
+_LOW_CONF = 0.3   # median per-track Spearman below this -> flag low confidence for the cell type
+def _spearman(a, b) -> float:
+    """Spearman rank correlation of two equal-length sequences, computed via pandas."""
+    left = pd.Series(np.asarray(a, float))
+    right = pd.Series(np.asarray(b, float))
+    rho = left.corr(right, method="spearman")
+    return float(rho)
+def _pearson(a, b) -> float:
+    """Pearson correlation of two equal-length sequences, computed via pandas."""
+    left = pd.Series(np.asarray(a, float))
+    right = pd.Series(np.asarray(b, float))
+    r = left.corr(right, method="pearson")
+    return float(r)
+def _sample_bins(ct: str, n: int, seed: int):
+    """Draw a seeded sample of bins that carry signal in at least one measured mark.
+
+    A bin qualifies when the sum of absolute values over the mark columns present in the
+    cell type's chromatin parquet is above zero. At most ``n`` bins are drawn.
+    Returns (sample_df, full_chromatin_df, mark_columns).
+    """
+    candidate_marks = ("atac", "dnase", "H3K27ac", "H3K4me1", "H3K4me3", "H3K9me3", "H3K27me3")
+    chromatin = pd.read_parquet(_FEAT / f"chromatin_{ct}.parquet")
+    present = [mark for mark in candidate_marks if mark in chromatin.columns]
+    has_signal = chromatin[present].abs().sum(axis=1) > 0
+    assayed = chromatin[has_signal]
+    draw = assayed.sample(n=min(n, len(assayed)), random_state=seed)
+    return draw.reset_index(drop=True), chromatin, present
+def _measured_matrix(sample: pd.DataFrame, ct: str) -> pd.DataFrame:
+    """Assemble the scoring matrix for the sampled bins from measured inputs.
+
+    The sample is left-joined with the safety annotations, passed through
+    ``add_accessibility``, and given a ``log_<d>`` column for every ``SAFETY_DIST`` column that is
+    present. If an integration parquet exists for ``ct`` it is left-joined as well and missing
+    values in its ``integ_*`` columns are filled with 0.
+    """
+    from pen_stack.wgenome.features import SAFETY_DIST, add_accessibility
+    annotations = pd.read_parquet(_FEAT / "safety_annot.parquet")
+    matrix = add_accessibility(sample.merge(annotations, on=["chrom", "bin"], how="left"))
+    for dist_col in SAFETY_DIST:
+        if dist_col not in matrix.columns:
+            continue
+        matrix[f"log_{dist_col}"] = _log_dist(matrix[dist_col])
+    integ_file = _FEAT / f"integration_{ct}.parquet"
+    if not integ_file.exists():
+        return matrix
+    integration = pd.read_parquet(integ_file)
+    matrix = matrix.merge(integration, on=["chrom", "bin"], how="left")
+    for col in integration.columns:
+        if col.startswith("integ_"):
+            matrix[col] = matrix[col].fillna(0)
+    return matrix
+def run(ct: str = "k562", n: int = 120, seed: int = 20260604, offline: bool = False,
+        out: str | Path = _OUT) -> dict:
+    """Compare predicted with measured chromatin tracks for ``ct`` and write a JSON report.
+
+    Steps: sample up to ``n`` assayed bins (seeded), fetch predicted tracks for them, correlate
+    each mark's measured and predicted values (Spearman + Pearson; a mark needs at least 5
+    non-missing predictions), then - if the trained safety pickle for ``ct`` exists - recompute
+    the scores from measured and from predicted inputs and correlate those.
+
+    Returns early with ``{"available": False, "note": ...}`` and writes nothing when the measured
+    chromatin file is absent, the provider is unavailable (and ``offline`` is false), or no
+    predicted tracks come back. Otherwise the report is written to ``out`` (parent directory
+    created if needed) and returned.
+
+    Both low-confidence flags treat an undefined (NaN) correlation as low confidence.
+    """
+    if not (_FEAT / f"chromatin_{ct}.parquet").exists():
+        return {"available": False, "note": f"measured chromatin for {ct} absent"}
+    provider = AlphaGenomeProvider(assembly="hg38")
+    if not provider.available() and not offline:
+        return {"available": False, "note": "AlphaGenome package+key absent; C2 pending (provide key)"}
+    sample, _chrom, marks = _sample_bins(ct, n, seed)
+    pred = cs.predicted_tracks_frame(ct, sample[["chrom", "bin"]], provider, offline=offline)
+    if pred.empty:
+        return {"available": False, "note": "no predicted tracks (offline cache empty - run live once)"}
+    # overlapping mark columns get the _meas/_pred suffixes; marks missing from pred keep their bare name
+    merged = sample.merge(pred, on=["chrom", "bin"], how="inner", suffixes=("_meas", "_pred"))
+    per_track = {}
+    for t in marks:
+        mc, pc = f"{t}_meas", f"{t}_pred"
+        if mc in merged and pc in merged and merged[pc].notna().sum() >= 5:
+            per_track[t] = {"spearman": round(_spearman(merged[mc], merged[pc]), 4),
+                            "pearson": round(_pearson(merged[mc], merged[pc]), 4),
+                            "n": int(merged[pc].notna().sum())}
+    median_sp = float(np.nanmedian([v["spearman"] for v in per_track.values()])) if per_track else float("nan")
+    # score-level degradation (needs the trained pickles)
+    score_block = {"available": False, "note": "trained safety/durability pickles absent"}
+    if (_ROOT.parent / "phase_1" / "out" / f"safety_{ct}.pkl").exists():
+        meas_m = _measured_matrix(sample, ct)
+        meas_scores = cs.recompute_scores(meas_m, ct)
+        pred_m = cs.build_predicted_matrix(meas_m, pred, ct)
+        pred_scores = cs.recompute_scores(pred_m, ct)
+        j = meas_scores.merge(pred_scores, on=["chrom", "bin"], suffixes=("_meas", "_pred"))
+        sl = {f"{s}_spearman": round(_spearman(j[f"{s}_meas"], j[f"{s}_pred"]), 4)
+              for s in ["writability", "safety", "p_durable"]}
+        # NaN < threshold is False, so an undefined correlation must be flagged explicitly -
+        # the same rule the top-level low_confidence flag applies to the median
+        wr_sp = sl["writability_spearman"]
+        score_block = {"available": True, "n": int(len(j)), **sl,
+                       # honest flag: predicted tracks recover per-track signal but the COMPOSITE writability
+                       # score degrades - so the measured-track atlas stays the backbone (hybrid decision).
+                       "score_replacement_low_confidence": bool(np.isnan(wr_sp) or wr_sp < _LOW_CONF),
+                       "interpretation": "predicted tracks approximate measured tracks per-track (esp. "
+                                         "accessibility), but rebuilding the composite writability score "
+                                         "from predictions degrades substantially - use measured tracks as "
+                                         "the backbone; AlphaGenome for on-demand track/3D signals."}
+    from pen_stack.wgenome.providers import MODEL_VERSION
+    report = {"available": True, "ct": ct, "n_sample": int(len(merged)), "seed": seed,
+              "model_version": MODEL_VERSION,
+              "marks_covered": list(per_track), "k562_missing_H3K9me3": ct.lower() == "k562",
+              "per_track": per_track, "median_track_spearman": round(median_sp, 4),
+              "low_confidence": bool(np.isnan(median_sp) or median_sp < _LOW_CONF),
+              "score_level_degradation": score_block,
+              "scope": "AlphaGenome covers cell types in/near its training data; cross-cell-type writability "
+                       "is bounded by that coverage. Predicted tracks are in model units - per-track uses "
+                       "rank (Spearman); score-level quantile-maps predicted tracks onto the measured marginal."}
+    Path(out).parent.mkdir(parents=True, exist_ok=True)
+    Path(out).write_text(json.dumps(report, indent=2, default=str), encoding="utf-8")
+    return report
+# Script entry point: run with the defaults (k562, live provider) and echo the report.
+if __name__ == "__main__":  # pragma: no cover
+    print(json.dumps(run(), indent=2, default=str))