pen-stack 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. pen_stack/__init__.py +2 -0
  2. pen_stack/_resources.py +34 -0
  3. pen_stack/adapt/__init__.py +14 -0
  4. pen_stack/adapt/finetune.py +33 -0
  5. pen_stack/adapt/ingest.py +86 -0
  6. pen_stack/adapt/pipeline.py +101 -0
  7. pen_stack/adapt/recalibrate.py +58 -0
  8. pen_stack/adapt/report.py +130 -0
  9. pen_stack/agent/__init__.py +1 -0
  10. pen_stack/agent/guardrails.py +49 -0
  11. pen_stack/agent/mcp_server.py +42 -0
  12. pen_stack/agent/orchestrator.py +106 -0
  13. pen_stack/agent/pen_agent.py +169 -0
  14. pen_stack/agent/tools.py +130 -0
  15. pen_stack/atlas/__init__.py +1 -0
  16. pen_stack/atlas/build_wtkb.py +80 -0
  17. pen_stack/atlas/crosslink.py +144 -0
  18. pen_stack/atlas/expand.py +190 -0
  19. pen_stack/atlas/schema.py +59 -0
  20. pen_stack/atlas/scorecard.py +134 -0
  21. pen_stack/atlas/universe.py +75 -0
  22. pen_stack/atlas/variant_propose.py +155 -0
  23. pen_stack/bridge/__init__.py +1 -0
  24. pen_stack/bridge/activity.py +52 -0
  25. pen_stack/bridge/cli.py +65 -0
  26. pen_stack/bridge/fold_qc.py +53 -0
  27. pen_stack/bridge/guide_qc.py +84 -0
  28. pen_stack/bridge/ingest.py +139 -0
  29. pen_stack/bridge/offtarget.py +133 -0
  30. pen_stack/bridge/ortholog_screen.py +73 -0
  31. pen_stack/bridge/pipeline.py +83 -0
  32. pen_stack/cli.py +126 -0
  33. pen_stack/data/__init__.py +1 -0
  34. pen_stack/data/encode.py +84 -0
  35. pen_stack/data/genome.py +71 -0
  36. pen_stack/data/ingest_chromatin.py +119 -0
  37. pen_stack/data/ingest_integration.py +112 -0
  38. pen_stack/data/ingest_safety_annot.py +164 -0
  39. pen_stack/data/ingest_trip.py +76 -0
  40. pen_stack/mech/__init__.py +1 -0
  41. pen_stack/mech/classify_atlas.py +71 -0
  42. pen_stack/mech/whitelist.py +66 -0
  43. pen_stack/monitor/__init__.py +1 -0
  44. pen_stack/monitor/europepmc.py +32 -0
  45. pen_stack/monitor/run.py +57 -0
  46. pen_stack/monitor/triage.py +63 -0
  47. pen_stack/planner/__init__.py +1 -0
  48. pen_stack/planner/cargo.py +56 -0
  49. pen_stack/planner/cargo_polish.py +146 -0
  50. pen_stack/planner/delivery.py +32 -0
  51. pen_stack/planner/multiplex.py +110 -0
  52. pen_stack/planner/optimize.py +156 -0
  53. pen_stack/planner/pipeline.py +86 -0
  54. pen_stack/planner/report.py +26 -0
  55. pen_stack/rag/__init__.py +1 -0
  56. pen_stack/rag/index.py +53 -0
  57. pen_stack/rag/llm.py +178 -0
  58. pen_stack/rag/qa.py +105 -0
  59. pen_stack/score/__init__.py +1 -0
  60. pen_stack/score/recalibrate.py +77 -0
  61. pen_stack/score/therapeutic.py +85 -0
  62. pen_stack/server/__init__.py +1 -0
  63. pen_stack/server/api.py +142 -0
  64. pen_stack/ui/__init__.py +1 -0
  65. pen_stack/ui/app.py +518 -0
  66. pen_stack/validate/__init__.py +1 -0
  67. pen_stack/validate/adapt_demo.py +69 -0
  68. pen_stack/validate/agent_eval.py +117 -0
  69. pen_stack/validate/blind_gsh_discovery.py +165 -0
  70. pen_stack/validate/cargo_directionality.py +57 -0
  71. pen_stack/validate/durability_baselines.py +150 -0
  72. pen_stack/validate/forward_hypotheses.py +104 -0
  73. pen_stack/validate/guide_qc_demo.py +58 -0
  74. pen_stack/validate/intent_specification.py +82 -0
  75. pen_stack/validate/paper3_benchmark.py +165 -0
  76. pen_stack/validate/paper4_real_validation.py +144 -0
  77. pen_stack/validate/paper4_validation.py +82 -0
  78. pen_stack/validate/seq_vs_measured.py +134 -0
  79. pen_stack/validate/within_locus_ranking.py +74 -0
  80. pen_stack/validate/writer_recovery.py +86 -0
  81. pen_stack/wgenome/__init__.py +1 -0
  82. pen_stack/wgenome/chromatin_seq.py +83 -0
  83. pen_stack/wgenome/durability.py +108 -0
  84. pen_stack/wgenome/export_tracks.py +52 -0
  85. pen_stack/wgenome/features.py +82 -0
  86. pen_stack/wgenome/gsh_baseline.py +117 -0
  87. pen_stack/wgenome/providers.py +245 -0
  88. pen_stack/wgenome/safety.py +69 -0
  89. pen_stack/wgenome/structure3d.py +168 -0
  90. pen_stack/wgenome/writability.py +72 -0
  91. pen_stack-3.1.0.dist-info/METADATA +451 -0
  92. pen_stack-3.1.0.dist-info/RECORD +96 -0
  93. pen_stack-3.1.0.dist-info/WHEEL +5 -0
  94. pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
  95. pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
  96. pen_stack-3.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,74 @@
1
+ """Within-locus site ranking (v3.1, WS-A5) - descriptive.
2
+
3
+ For a large validated safe-harbour gene, does the planner rank the documented intronic safe bin above the
4
+ other bins in that locus? We rank every 1 kb bin in the gene body by writability and report the documented
5
+ bin's within-locus percentile. Descriptive (few qualifying loci); not a hypothesis test.
6
+
7
+ Documented safe sub-region coordinates (hg38, widely cited):
8
+ - AAVS1 = PPP1R12C intron 1, chr19:55,115,768 (DeKelver 2010, 10.1101/gr.106773.110)
9
+ - CLYBL = CLYBL intron 2, chr13:99,816,475 (Cerbini 2015, 10.1371/journal.pone.0116032)
10
+
11
+ Acceptance (prereg/ws_a.yaml): the documented bin lands in the top quartile (>= 75th percentile of
12
+ writability within the locus) for a pre-registered fraction of loci; reported per locus.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ from pathlib import Path
18
+
19
+ import pandas as pd
20
+
21
+ _ROOT = Path(__file__).resolve().parents[2]
22
+ _OUT = _ROOT / "out" / "within_locus_ranking.json"
23
+ _WDF = _ROOT.parent / "phase_1" / "out" / "atlas_k562.parquet"
24
+
25
+ # documented safe bins (gene, chrom, documented_bp)
26
+ _LOCI = [
27
+ {"name": "AAVS1", "gene": "PPP1R12C", "chrom": "chr19", "doc_bp": 55115768,
28
+ "doi": "10.1101/gr.106773.110"},
29
+ {"name": "CLYBL", "gene": "CLYBL", "chrom": "chr13", "doc_bp": 99816475,
30
+ "doi": "10.1371/journal.pone.0116032"},
31
+ ]
32
+
33
+
34
def run(out: str | Path = _OUT) -> dict:
    """Rank each documented safe bin against the other bins of its gene body.

    For every entry of `_LOCI` the gene's start/end (from the planner's gene table) are converted to
    1 kb bin indices, the atlas rows on that chromosome and in that bin range with a non-null
    writability are collected, and the documented bin's percentile is the fraction of those rows whose
    writability is strictly lower. Loci whose gene is unknown or whose body has no scored bin are
    skipped. The report is written to `out` as JSON and also returned.
    """
    from pen_stack.planner.optimize import _gene_coords

    atlas = pd.read_parquet(_WDF)
    gene_table = _gene_coords()
    per_locus = []
    for locus in _LOCI:
        gene_rows = gene_table[gene_table["gene"] == locus["gene"]]
        if gene_rows.empty:
            continue
        gene = gene_rows.iloc[0]
        first_bin = int(gene["start"]) // 1000
        last_bin = int(gene["end"]) // 1000
        on_chrom = atlas["chrom"] == locus["chrom"]
        in_gene = atlas["bin"].between(first_bin, last_bin)
        body = atlas[on_chrom & in_gene].dropna(subset=["writability"])
        if body.empty:
            continue

        doc_bin = locus["doc_bp"] // 1000
        hit = body[body["bin"] == doc_bin]
        if hit.empty:
            # The documented bin itself is not scored: fall back to the closest scored bin in the body.
            # The report still lists the documented bin index, not the fallback one.
            closest = (body["bin"] - doc_bin).abs().argsort()[:1]
            hit = body.iloc[closest]
        doc_w = float(hit.iloc[0]["writability"])
        # Strict "<": bins tied with the documented bin do not count as ranked below it.
        percentile = float((body["writability"] < doc_w).mean())
        per_locus.append({
            "name": locus["name"],
            "gene": locus["gene"],
            "n_bins": int(len(body)),
            "documented_bin": int(doc_bin),
            "documented_writability": round(doc_w, 4),
            "within_locus_percentile": round(percentile, 3),
            "top_quartile": bool(percentile >= 0.75),
            "doi": locus["doi"],
        })

    n = len(per_locus)
    n_top = sum(1 for row in per_locus if row["top_quartile"])
    report = {
        "what_this_is": "within-locus ranking of the documented safe bin (descriptive, not a hypothesis test)",
        "n_loci": n,
        "n_top_quartile": n_top,
        "fraction_top_quartile": round(n_top / n, 3) if n else None,
        "per_locus": per_locus,
        "scope": "few qualifying loci; descriptive; the documented sub-region is a 1 kb bin approximation.",
    }
    destination = Path(out)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(report, indent=2, default=str), encoding="utf-8")
    return report
71
+
72
+
73
+ if __name__ == "__main__": # pragma: no cover
74
+ print(json.dumps(run(), indent=2, default=str))
@@ -0,0 +1,86 @@
1
+ """Diversified writer-family recovery (v3.1, WS-A4).
2
+
3
+ The Phase-3 panel was bridge-dominated, so writer choice barely varied. Here we add DSB-free, large-cargo
4
+ documented writes (CAST, PASTE/PE-integrase, large serine-integrase landing pads) so the correct family
5
+ genuinely changes with cargo size. The writer is held out; we recover the family used from the goal +
6
+ intent + cargo size + cell type alone.
7
+
8
+ Selection rule (documented, not tuned): recommend the **smallest-capacity DSB-free writer family that fits
9
+ the cargo** (do not deploy a 50 kb integrase for a 2 kb insert when a programmable bridge suffices); ties
10
+ broken by measured human-cell activity. This makes cargo size load-bearing for the writer choice.
11
+
12
+ Acceptance (prereg/ws_a.yaml): writer-family recovery@1 exceeds the prevalence baseline by a pre-registered
13
+ margin on >= 8 entries spanning >= 3 families; reported per family.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ from collections import Counter
19
+ from pathlib import Path
20
+
21
+ import pandas as pd
22
+
23
+ _ROOT = Path(__file__).resolve().parents[2]
24
+ _PANEL = _ROOT / "data" / "writer_panel.csv"
25
+ _ATLAS = _ROOT / "pen_stack" / "atlas" / "atlas.parquet"
26
+ _OUT = _ROOT / "out" / "writer_recovery.json"
27
+
28
+
29
def _family_caps() -> pd.DataFrame:
    """Per-family capability table read from the Writer Atlas parquet.

    Returns one row per `family` with columns `family`, `cargo_capacity_bp` (float or missing),
    `dsb_free` (bool) and `activity` (float, taken from `S_HumanCell`, 0.4 when missing). When the
    atlas has an `entry_kind` column only `curated_core` rows are used; the first row of each family
    supplies the values. The columns are always present, even when no family is found.
    """
    atlas = pd.read_parquet(_ATLAS)
    core = atlas[atlas["entry_kind"] == "curated_core"] if "entry_kind" in atlas else atlas
    rows = []
    for fam, sub in core.groupby("family"):
        first = sub.iloc[0]
        cap = first.get("cargo_capacity_bp")
        act = first.get("S_HumanCell")
        dsb = first.get("dsb_free", False)
        rows.append({
            "family": fam,
            "cargo_capacity_bp": float(cap) if pd.notna(cap) else None,
            # bool(float("nan")) is True, so a missing flag must be mapped to False explicitly;
            # otherwise a family with unknown DSB status would be reported as DSB-free.
            "dsb_free": bool(dsb) if pd.notna(dsb) else False,
            "activity": float(act) if pd.notna(act) else 0.4,
        })
    return pd.DataFrame(rows, columns=["family", "cargo_capacity_bp", "dsb_free", "activity"])
43
+
44
+
45
def recover_writer_family(cargo_bp: int, dsb_free_required: bool = True,
                          caps: pd.DataFrame | None = None) -> str | None:
    """Smallest-capacity family that fits the cargo; ties broken by higher activity.

    cargo_bp: cargo size in bp; a family qualifies when its `cargo_capacity_bp` is known and >= this.
    dsb_free_required: when true, only families whose `dsb_free` flag is set are considered.
    caps: optional capability table with columns `family`, `cargo_capacity_bp`, `dsb_free`,
        `activity`. When omitted it is loaded with `_family_caps()` (which reads the atlas parquet),
        so callers scoring many cargos can load it once and pass it in.

    Returns the family name, or None when no family qualifies.
    """
    if caps is None:
        caps = _family_caps()
    if caps.empty:
        return None
    capacity = caps["cargo_capacity_bp"]
    cand = caps[capacity.notna() & (capacity >= cargo_bp)]
    if dsb_free_required:
        cand = cand[cand["dsb_free"]]
    if cand.empty:
        return None
    cand = cand.sort_values(["cargo_capacity_bp", "activity"], ascending=[True, False])
    return cand.iloc[0]["family"]
55
+
56
+
57
def run(out: str | Path = _OUT) -> dict:
    """Score writer-family recovery@1 on the documented-writes panel and write a JSON report.

    Reads the panel CSV at `_PANEL`, predicts a family for each row from its `cargo_bp` and
    `dsb_free_required` columns via `recover_writer_family`, and compares with the row's `family`.
    The prevalence baseline is the share of the most common family (always guessing it). An empty
    panel yields None for the rate fields instead of raising. The report is written to `out` and
    returned.
    """
    panel = pd.read_csv(_PANEL)
    panel["predicted_family"] = [recover_writer_family(int(r.cargo_bp), bool(r.dsb_free_required))
                                 for r in panel.itertuples()]
    panel["hit"] = panel["predicted_family"] == panel["family"]
    n = len(panel)
    n_hit = int(panel["hit"].sum())
    prev = Counter(panel["family"])
    # Guard the empty panel: max() of an empty sequence and n_hit / 0 would both raise.
    recovery_at1 = n_hit / n if n else None
    prevalence_at1 = max(prev.values()) / n if n else None

    per_family = {}
    for fam in sorted(prev):
        hits = panel.loc[panel["family"] == fam, "hit"]
        per_family[fam] = {"n": int(len(hits)), "recall@1": round(float(hits.mean()), 3)}

    report = {
        "what_this_is": "writer-family recovery@1 from goal+intent+cargo+ct, writer held out (non-circular)",
        "n_entries": n,
        "n_families": len(prev),
        "recovery_at_1": round(recovery_at1, 4) if n else None,
        "prevalence_baseline_at_1": round(prevalence_at1, 4) if n else None,
        "beats_prevalence": bool(recovery_at1 > prevalence_at1) if n else None,
        "per_family": per_family,
        "selection_rule": "smallest-capacity DSB-free family that fits the cargo; ties by human-cell activity",
        "cases": panel[["name", "family", "cargo_bp", "predicted_family", "hit", "doi"]].to_dict("records"),
        "scope": "small N; documented writes are survivorship-biased; cargo size is the dominant signal.",
    }
    Path(out).parent.mkdir(parents=True, exist_ok=True)
    Path(out).write_text(json.dumps(report, indent=2, default=str), encoding="utf-8")
    return report
83
+
84
+
85
+ if __name__ == "__main__": # pragma: no cover
86
+ print(json.dumps({k: v for k, v in run().items() if k != "cases"}, indent=2, default=str))
@@ -0,0 +1 @@
1
+ """pen_stack.wgenome - see PEN-STACK v3.0 program doc."""
@@ -0,0 +1,83 @@
1
+ """Sequence-derived chromatin tracks (WS-C2): map AlphaGenome predictions onto the measured-atlas schema
2
+ and recompute writability/safety/durability from predicted tracks.
3
+
4
+ Two honest details:
5
+ * Unit handling. AlphaGenome track outputs are in the model's own units, not the measured ENCODE scale the
6
+ safety/durability models were trained on. Per-track agreement is therefore reported with Spearman (rank,
7
+ unit-free) alongside Pearson. For the *score-level* recompute we quantile-map each predicted track onto
8
+ the measured track's marginal (a standard rank-preserving calibration), so the recomputed scores test
9
+ whether AlphaGenome's RANKING of the epigenome recovers the measured-track scores - not a unit accident.
10
+ * Coverage. AlphaGenome predicts H3K9me3 for HepG2 but NOT K562; missing marks come back NaN and are
11
+ excluded from per-track correlation and passed as NaN to the (NaN-native) durability model.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ from pathlib import Path
16
+
17
+ import numpy as np
18
+ import pandas as pd
19
+
20
+ from pen_stack.wgenome.providers import AlphaGenomeProvider, _HISTONES, TRACK_NAMES
21
+
22
+ _ROOT = Path(__file__).resolve().parents[2]
23
+ _P1_FEAT = _ROOT.parent / "phase_1" / "features"
24
+ _P1_OUT = _ROOT.parent / "phase_1" / "out"
25
+
26
+
27
def predicted_tracks_frame(ct: str, bins: pd.DataFrame, provider: AlphaGenomeProvider | None = None,
                           offline: bool = False) -> pd.DataFrame:
    """Predicted track values for the given (chrom, bin) rows.

    ct: cell type passed through to the provider.
    bins: frame with `chrom` and `bin` columns; one provider query per row.
    provider: track provider; a default `AlphaGenomeProvider(assembly="hg38")` is built when omitted.
    offline: forwarded to `provider.tracks`.

    Returns one row per bin whose record is flagged `available`, with columns `chrom`, `bin` and one
    column per entry of `TRACK_NAMES` (NaN where the record lacks that track). The columns are present
    even when no bin is available, so a downstream merge on (chrom, bin) does not fail on a
    column-less frame.
    """
    provider = provider or AlphaGenomeProvider(assembly="hg38")
    rows = []
    for r in bins.itertuples():
        rec = provider.tracks(r.chrom, int(r.bin), ct, offline=offline)
        if not rec.get("available"):
            continue
        row = {"chrom": r.chrom, "bin": int(r.bin)}
        row.update({t: rec.get(t, np.nan) for t in TRACK_NAMES})
        rows.append(row)
    return pd.DataFrame(rows, columns=["chrom", "bin", *TRACK_NAMES])
38
+
39
+
40
def quantile_map(pred: pd.Series, measured: pd.Series) -> pd.Series:
    """Map `pred` onto `measured`'s marginal by matching ranks (rank-preserving calibration).

    Each non-missing value of `pred` is replaced by the percentile of `measured` (NaNs ignored) at
    that value's percentile rank within `pred`. Missing values of `pred` stay NaN. If either series
    has fewer than two non-missing values, `pred` (cast to float) is returned unchanged.

    The result is indexed like `pred`.
    """
    pred = pred.astype(float)
    if pred.notna().sum() < 2 or measured.notna().sum() < 2:
        return pred
    ranks = pred.rank(pct=True, na_option="keep")
    valid = ranks.notna().to_numpy()
    mapped = np.full(len(pred), np.nan, dtype=float)
    # Only the non-missing ranks are handed to nanpercentile: a NaN percentile is rejected with
    # ValueError, which would make a partly-missing predicted track unusable.
    pcts = np.clip(ranks.to_numpy()[valid] * 100, 0, 100)
    mapped[valid] = np.nanpercentile(measured.to_numpy(dtype=float), pcts)
    return pd.Series(mapped, index=pred.index)
48
+
49
+
50
def _load_models(ct: str):
    """Load the (safety, durability) model pair from the phase-1 output directory.

    The safety pickle is looked up per cell type (`safety_{ct}.pkl`); the durability pickle
    (`durability.pkl`) does not depend on `ct`. Both are read with `load_pickle`.
    """
    from pen_stack.wgenome.writability import load_pickle

    safety_path = _P1_OUT / f"safety_{ct}.pkl"
    durability_path = _P1_OUT / "durability.pkl"
    safety_model = load_pickle(str(safety_path))
    durability_model = load_pickle(str(durability_path))
    return safety_model, durability_model
55
+
56
+
57
def recompute_scores(matrix: pd.DataFrame, ct: str) -> pd.DataFrame:
    """Run `build_writability` on `matrix` with the models loaded for cell type `ct`."""
    from pen_stack.wgenome.writability import build_writability

    safety_model, durability_model = _load_models(ct)
    return build_writability(matrix, safety_model, durability_model)
62
+
63
+
64
def build_predicted_matrix(measured_matrix: pd.DataFrame, predicted: pd.DataFrame, ct: str) -> pd.DataFrame:
    """Swap the measured chromatin tracks for quantile-mapped predicted ones.

    The two frames are inner-joined on (chrom, bin), so only bins present in both survive. For each
    name in `TRACK_NAMES` that exists in both frames, the predicted column is quantile-mapped onto
    the measured column of the joined rows and stored under the measured name. All other columns of
    the measured matrix are left as they are. The `accessibility` column is dropped and rebuilt by
    `add_accessibility`. `ct` is accepted but not used by this function.
    """
    from pen_stack.wgenome.features import add_accessibility

    merged = measured_matrix.merge(predicted, on=["chrom", "bin"], how="inner", suffixes=("", "_pred"))
    for track in TRACK_NAMES:
        pred_col = f"{track}_pred"
        if track not in merged.columns or pred_col not in merged.columns:
            continue
        # map predicted onto this sample's measured marginal
        merged[track] = quantile_map(merged[pred_col], merged[track])
    leftover_pred = [name for name in merged.columns if name.endswith("_pred")]
    merged = merged.drop(columns=leftover_pred)
    merged = merged.drop(columns=["accessibility"], errors="ignore")
    return add_accessibility(merged)
79
+
80
+
81
def histone_marks_for(ct: str) -> list[str]:
    """Histone marks to use for `ct`: all of `_HISTONES`, minus H3K9me3 when `ct` is K562 (any case)."""
    marks = []
    for mark in _HISTONES:
        if ct.lower() == "k562" and mark == "H3K9me3":
            continue
        marks.append(mark)
    return marks
@@ -0,0 +1,108 @@
1
+ """Durability layer (Phase 1, Step 1.7) - the conditional chromatin-context model.
2
+
3
+ Learns ONE function: `local chromatin features -> (expression level, silenced/stable)` on TRIP
4
+ integrations. The model never sees a coordinate, so it is cell-type-agnostic in function: to score a
5
+ new cell type you supply its chromatin tracks. This is the layer no safe-harbour resource provides,
6
+ and TRIP supervises exactly the writing-relevant quantity (position effect on an integrated cassette).
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ from pathlib import Path
12
+
13
+ import lightgbm as lgb
14
+ import numpy as np
15
+ import pandas as pd
16
+ import pyBigWig
17
+ from scipy.stats import spearmanr
18
+ from sklearn.metrics import roc_auc_score
19
+ from sklearn.model_selection import GroupKFold
20
+
21
+ # canonical chromatin feature names (must match across mouse training + human application)
22
+ CHROMATIN = ["atac", "dnase", "H3K27ac", "H3K4me1", "H3K4me3", "H3K9me3", "H3K27me3"]
23
+
24
+
25
def liftover_positions(df: pd.DataFrame, chain_file: str) -> pd.DataFrame:
    """Lift each row's (chrom, pos) through a UCSC chain file (e.g. mm9->mm10).

    Rows whose position yields no conversion result are dropped. For the others `chrom` and `pos`
    are overwritten with the first conversion hit; every other column is carried over unchanged.
    """
    from pyliftover import LiftOver

    converter = LiftOver(chain_file)
    lifted = []
    for _, record in df.iterrows():
        hits = converter.convert_coordinate(record["chrom"], int(record["pos"]))
        if not hits:
            continue
        best = hits[0]
        lifted.append({**record.to_dict(), "chrom": best[0], "pos": best[1]})
    return pd.DataFrame(lifted)
37
+
38
+
39
def extract_chromatin_at(df: pd.DataFrame, panel: dict, raw_dir: str, download_fn,
                         window: int = 2500) -> pd.DataFrame:
    """Point-query each bigWig's mean signal in +/-window around each integration position.

    df: frame with `chrom` and `pos` columns; it is copied, not modified.
    panel: mapping of track name -> record with `href` and `accession` keys.
    raw_dir: directory in which the downloaded bigWig path is built.
    download_fn: callable `(href, destination_path) -> local_path`.
    window: half-width in bp of the queried interval (start is floored at 0).

    Returns the copy of `df` with one added column per panel entry. A site scores 0.0 when its
    chromosome is absent from the bigWig (with and without the "chr" prefix), when the query raises
    RuntimeError/IndexError, or when the mean is None. Each bigWig file is deleted after it has been
    processed successfully; the handle is closed even if processing fails.
    """
    out = df.copy()
    for name, rec in panel.items():
        path = download_fn(rec["href"], os.path.join(raw_dir, f"mES_{name}_{rec['accession']}.bigWig"))
        bw = pyBigWig.open(path)
        try:
            chroms = set(bw.chroms().keys())
            vals = []
            for chrom, pos in zip(out["chrom"], out["pos"]):
                key = chrom if chrom in chroms else chrom.replace("chr", "")
                if key not in chroms:
                    vals.append(0.0)
                    continue
                try:
                    v = bw.stats(key, max(0, pos - window), pos + window, type="mean")[0]
                except (RuntimeError, IndexError):
                    v = None
                vals.append(0.0 if v is None else float(v))
        finally:
            # Release the file handle on every path, so an unexpected error does not leak it.
            bw.close()
        out[name] = vals
        os.remove(path)
        print(f"  extracted {name} at {len(out)} sites", flush=True)
    return out
64
+
65
+
66
def train_durability(trip_df: pd.DataFrame, seed: int = 42) -> dict:
    """Fit the expression regressor and silencing classifier on chromatin features.

    Features are the `CHROMATIN` names present in `trip_df`; rows missing any feature or
    `expression` are dropped. Out-of-fold predictions come from a GroupKFold split grouped by
    chromosome (at most 5 folds). The fold models use `subsample=0.8`; the final models, fitted on
    all rows, do not set it.

    Returns a dict with the row count, the feature list, the out-of-fold Spearman (expression) and
    AUROC (silenced), two single-feature baselines (ATAC for expression, H3K9me3 for silencing), the
    final regressor's feature importances sorted descending, and the final models under `reg` and
    `clf`.
    """
    feats = [name for name in CHROMATIN if name in trip_df.columns]
    df = trip_df.dropna(subset=feats + ["expression"]).copy()
    X = df[feats].astype("float32").fillna(0.0)
    y_expr = df["expression"].to_numpy()
    y_sil = df["silenced"].astype(int).to_numpy()
    groups = df["chrom"].astype("category").cat.codes.to_numpy()

    final_params = {"n_estimators": 500, "learning_rate": 0.03, "num_leaves": 31,
                    "random_state": seed, "n_jobs": -1, "verbosity": -1}
    cv_params = {**final_params, "subsample": 0.8}

    splitter = GroupKFold(n_splits=min(5, len(np.unique(groups))))
    oof_expr = np.zeros(len(df))
    oof_sil = np.zeros(len(df))
    for train_idx, test_idx in splitter.split(X, y_expr, groups):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        fold_reg = lgb.LGBMRegressor(**cv_params).fit(X_train, y_expr[train_idx])
        oof_expr[test_idx] = fold_reg.predict(X_test)
        fold_clf = lgb.LGBMClassifier(**cv_params).fit(X_train, y_sil[train_idx])
        oof_sil[test_idx] = fold_clf.predict_proba(X_test)[:, 1]

    rho = float(spearmanr(oof_expr, y_expr).statistic)
    auroc = float(roc_auc_score(y_sil, oof_sil))
    # baseline: H3K9me3 (heterochromatin) alone as a silencing predictor, and LAD-like (low ATAC) for expression
    if "H3K9me3" in df:
        base_sil = roc_auc_score(y_sil, df["H3K9me3"].fillna(0))
    else:
        base_sil = float("nan")
    atac = df.get("atac", pd.Series(0, index=df.index))
    base_expr = spearmanr(atac.fillna(0), y_expr).statistic

    final_reg = lgb.LGBMRegressor(**final_params).fit(X, y_expr)
    ranked = sorted(zip(feats, final_reg.feature_importances_.tolist()),
                    key=lambda kv: kv[1], reverse=True)
    final_clf = lgb.LGBMClassifier(**final_params).fit(X, y_sil)
    return {
        "n": int(len(df)),
        "features": feats,
        "expr_spearman": rho,
        "expr_baseline_atac_spearman": float(base_expr),
        "silenced_auroc": auroc,
        "silenced_baseline_h3k9me3_auroc": float(base_sil),
        "feature_importance": dict(ranked),
        "reg": final_reg,
        "clf": final_clf,
    }
102
+
103
+
104
def save_models(res: dict, out_dir: str, tag: str = "durability") -> None:
    """Pickle the `reg`, `clf` and `features` entries of `res` to `{out_dir}/{tag}.pkl`.

    `out_dir` is created (with parents) if it does not exist. Other keys of `res` are not saved.
    """
    import pickle

    Path(out_dir).mkdir(parents=True, exist_ok=True)
    target = f"{out_dir}/{tag}.pkl"
    with open(target, "wb") as fh:
        payload = {key: res[key] for key in ("reg", "clf", "features")}
        pickle.dump(payload, fh)
@@ -0,0 +1,52 @@
1
+ """Export the Writable Genome atlas as genome-browser tracks (Phase 1, Step 1.11).
2
+
3
+ Writes per-cell-type BigWig tracks (writability, safety, p_durable) loadable in IGV/UCSC, plus a BED
4
+ of the top-writable loci. The atlas parquet stays the queryable source; these are the shareable tracks.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from pathlib import Path
9
+
10
+ import pandas as pd
11
+ import pyBigWig
12
+
13
+ from pen_stack.data.genome import MAIN_CHROMS, load_chrom_sizes
14
+
15
+ BIN_BP = 1000
16
+ TRACKS = ["writability", "safety", "p_durable"]
17
+
18
+
19
def write_bigwig(df: pd.DataFrame, col: str, chrom_sizes: dict[str, int], out_bw: str) -> None:
    """Write column `col` of `df` as a BigWig track with one `BIN_BP`-wide interval per bin.

    df: frame with `chrom`, `bin` and `col` columns; bin index * `BIN_BP` is the interval start.
    chrom_sizes: chromosome -> length; only chromosomes in `MAIN_CHROMS` are written, in that order.
    out_bw: output path.

    Missing values in `col` are written as 0.0. The writer is closed even when adding entries fails.
    """
    # header must be sorted; keep canonical chrom order with sizes
    chroms = [(c, chrom_sizes[c]) for c in MAIN_CHROMS if c in chrom_sizes]
    bw = pyBigWig.open(out_bw, "w")
    try:
        bw.addHeader(chroms)
        for chrom, _ in chroms:
            g = df[df["chrom"] == chrom].sort_values("bin")
            if g.empty:
                continue
            # .tolist() yields plain Python ints/floats rather than numpy scalars wrapped in a list.
            # NOTE(review): numpy scalars are presumably only accepted when pyBigWig was built with
            # numpy support - confirm against the pyBigWig documentation.
            starts = (g["bin"].to_numpy() * BIN_BP).astype("int64").tolist()
            vals = g[col].astype("float64").fillna(0.0).tolist()
            bw.addEntries(chrom, starts, values=vals, span=BIN_BP, step=BIN_BP)
    finally:
        bw.close()
32
+
33
+
34
def export_atlas(atlas_parquet: str, chrom_sizes_tsv: str, out_dir: str, ct: str,
                 top_n: int = 5000) -> dict:
    """Export the atlas parquet as BigWig tracks plus a BED of the `top_n` most writable bins.

    One BigWig is written per name in `TRACKS` that exists as a column; the BED export always uses
    the `writability` column. Files are named `atlas_{ct}_{track}.bw` and `atlas_{ct}_top{top_n}.bed`
    inside `out_dir` (created if needed). Returns a mapping of track name (and `top_bed`) -> path.
    """
    df = pd.read_parquet(atlas_parquet)
    sizes = load_chrom_sizes(chrom_sizes_tsv)
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    written = {}
    for col in (track for track in TRACKS if track in df.columns):
        out_bw = f"{out_dir}/atlas_{ct}_{col}.bw"
        write_bigwig(df, col, sizes, out_bw)
        written[col] = out_bw

    # top-writable loci BED
    top = df.nlargest(top_n, "writability")[["chrom", "bin", "writability"]].copy()
    bin_start = top["bin"] * BIN_BP
    top["start"] = bin_start
    top["end"] = bin_start + BIN_BP
    bed = f"{out_dir}/atlas_{ct}_top{top_n}.bed"
    bed_columns = ["chrom", "start", "end", "writability"]
    top[bed_columns].to_csv(bed, sep="\t", header=False, index=False)
    written["top_bed"] = bed
    return written
@@ -0,0 +1,82 @@
1
+ """Assemble the per-cell-type training/scoring matrix (Phase 1, bridge between 1A and 1B).
2
+
3
+ Joins the cell-type chromatin feature store + the (cell-type-agnostic) safety-annotation store
4
+ (+ integration-outcome store when available) on (chrom, bin) into one matrix the safety and
5
+ durability layers consume. Keeps feature provenance explicit.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+
11
+ import numpy as np
12
+ import pandas as pd
13
+
14
+ # Unified chromatin feature set: ONE accessibility feature (ATAC where present, else DNase) +
15
+ # the 5 core histone marks. This makes every cell type share an IDENTICAL schema, so a cell type
16
+ # that lacks a specific accessibility assay (e.g. CD34+ HSPC has DNase but no ATAC) is fully
17
+ # specified rather than "partial" - ATAC and DNase are interchangeable open-chromatin assays.
18
+ CHROMATIN_TRACKS = ["accessibility", "H3K27ac", "H3K4me1", "H3K4me3", "H3K9me3", "H3K27me3"]
19
+ ACCESS_SOURCES = ["atac", "dnase"]
20
+ SAFETY_DIST = ["dist_oncogene", "dist_tsg", "dist_essential", "dist_tss"]
21
+
22
+
23
def add_accessibility(m: pd.DataFrame) -> pd.DataFrame:
    """Add the unified `accessibility` column to `m` in place and return `m`.

    An existing `accessibility` column is left untouched. Otherwise the column is ATAC with its
    missing values filled from DNase when both exist, ATAC alone, or DNase alone. With neither
    source present no column is added.
    """
    if "accessibility" in m.columns:
        return m
    has_atac = "atac" in m.columns
    has_dnase = "dnase" in m.columns
    if has_atac:
        unified = m["atac"]
        if has_dnase:
            # fill any ATAC gaps with DNase
            unified = unified.fillna(m["dnase"])
        m["accessibility"] = unified
    elif has_dnase:
        m["accessibility"] = m["dnase"]
    return m
33
+
34
+
35
def _log_dist(s: pd.Series) -> pd.Series:
    """log1p of a distance column after filling gaps and clamping negatives to 0.

    Missing values take the column maximum (1e8 when the whole column is missing).
    """
    fill_value = s.max() if s.notna().any() else 1e8
    non_negative = s.fillna(fill_value).clip(lower=0)
    return np.log1p(non_negative)
39
+
40
+
41
def assemble_matrix(chromatin_parquet: str, safety_parquet: str,
                    integration_parquet: str | None = None,
                    out_parquet: str | None = None) -> pd.DataFrame:
    """Join the chromatin and safety stores on (chrom, bin) into one feature matrix.

    The join is inner, so only bins present in both stores are kept. The unified `accessibility`
    column is added, and each `SAFETY_DIST` column that is present gets a `log_` companion (the raw
    distance is kept). When `integration_parquet` is given and exists it is left-joined and its
    value columns have missing entries set to 0. The matrix is written to `out_parquet` when given,
    and returned.
    """
    chromatin = pd.read_parquet(chromatin_parquet)
    safety = pd.read_parquet(safety_parquet)
    matrix = add_accessibility(chromatin.merge(safety, on=["chrom", "bin"], how="inner"))

    # log-scaled distance features (raw kept too, for transparency)
    for dist_col in (d for d in SAFETY_DIST if d in matrix.columns):
        matrix[f"log_{dist_col}"] = _log_dist(matrix[dist_col])

    if integration_parquet and Path(integration_parquet).exists():
        integ = pd.read_parquet(integration_parquet)
        matrix = matrix.merge(integ, on=["chrom", "bin"], how="left")
        value_cols = [name for name in integ.columns if name not in ("chrom", "bin")]
        for name in value_cols:
            # bins without an integration record get 0 rather than NaN
            matrix[name] = matrix[name].fillna(0)

    if out_parquet:
        Path(out_parquet).parent.mkdir(parents=True, exist_ok=True)
        matrix.to_parquet(out_parquet, index=False)
    return matrix
64
+
65
+
66
def resolve_integration(feat_dir: str, ct: str) -> str | None:
    """Path of the integration-feature parquet to use for cell type `ct`, or None.

    The cell-type-specific file `integration_{ct}.parquet` wins when it exists in `feat_dir`;
    otherwise the cell-type-agnostic `integration_density.parquet` is used, so a cell type without
    its own integration assay still gets an integration feature. None when neither file exists.
    """
    base = Path(feat_dir)
    for filename in (f"integration_{ct}.parquet", "integration_density.parquet"):
        candidate = base / filename
        if candidate.exists():
            return str(candidate)
    return None
75
+
76
+
77
def feature_columns(df: pd.DataFrame) -> list[str]:
    """Model feature names found in `df`, in a fixed group order.

    First the `CHROMATIN_TRACKS` entries that are present (in that list's order), then every column
    starting with `log_dist_`, then every column starting with `integ_` (both in frame order).
    """
    chromatin = [name for name in CHROMATIN_TRACKS if name in df.columns]
    log_distances = [name for name in df.columns if name.startswith("log_dist_")]
    integration = [name for name in df.columns if name.startswith("integ_")]
    return chromatin + log_distances + integration
@@ -0,0 +1,117 @@
1
+ """Genomic safe-harbour (GSH) rule-set baseline (v3.1, WS-B3).
2
+
3
+ A published multi-criterion GSH rule (Papapetrou/Sadelain/Pellenz style) implemented from the existing
4
+ per-bin annotations: outside a gene, and minimum distances to the nearest TSS, cancer/oncogene, and
5
+ essential gene. We compute it as a graded safety score and compare its **safe-harbour discrimination**
6
+ (held-out validated GSH vs matched controls, reusing WS-A3) against the learned writability model.
7
+
8
+ The headline safety claim is **discrimination** (validated GSH vs matched controls), NOT the
9
+ `genotoxic_cis` AUROC - which is circular (its label is proximity to five oncogenes, i.e. the distance
10
+ baseline's own definition) and is demoted to a clearly-labeled diagnostic.
11
+
12
+ Acceptance (prereg/ws_b.yaml): the learned model beats the GSH rule-set on discrimination AUROC; report
13
+ the delta. If it does not, say so - the rule is a strong, interpretable baseline.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ from pathlib import Path
19
+
20
+ import numpy as np
21
+ import pandas as pd
22
+
23
+ _ROOT = Path(__file__).resolve().parents[2]
24
+ _OUT = _ROOT / "out" / "gsh_baseline.json"
25
+
26
+ # published-style minimum distances (bp). Graded: a bin scores higher the further it clears each minimum.
27
+ _MIN_DIST = {"dist_tss": 5000, "dist_oncogene": 50000, "dist_essential": 50000}
28
+
29
+
30
def gsh_rule_score(df: pd.DataFrame, min_dist: dict[str, float] | None = None) -> pd.Series:
    """Graded GSH-rule safety score in [0,1]: mean over criteria of min(dist / threshold, 1).

    df: frame holding some or all of the distance columns named in the thresholds.
    min_dist: optional mapping of distance column -> minimum distance in bp; defaults to the
        module-level `_MIN_DIST`. Criteria whose column is absent from `df` are ignored.

    Negative distances count as 0. A missing distance in a row is skipped by the row mean. When no
    criterion column is present every row scores 0.0. The result is indexed like `df`.
    """
    thresholds = _MIN_DIST if min_dist is None else min_dist
    parts = [
        (df[col].clip(lower=0) / thr).clip(upper=1.0)
        for col, thr in thresholds.items()
        if col in df.columns
    ]
    if not parts:
        return pd.Series(0.0, index=df.index)
    return pd.concat(parts, axis=1).mean(axis=1)
39
+
40
+
41
def _auroc(scores, labels) -> float:
    """AUROC as the share of (positive, negative) pairs ranked correctly; ties count one half.

    Labels equal to 1 are positives and labels equal to 0 are negatives; any other label is ignored.
    Returns NaN when either class is empty.
    """
    pos, neg = [], []
    for score, label in zip(scores, labels):
        if label == 1:
            pos.append(score)
        elif label == 0:
            neg.append(score)
    if not pos or not neg:
        return float("nan")
    wins = 0
    ties = 0
    for p in pos:
        for n in neg:
            if p > n:
                wins += 1
            elif p == n:
                ties += 1
    return (wins + 0.5 * ties) / (len(pos) * len(neg))
47
+
48
+
49
def run(ct: str = "k562", out: str | Path = _OUT) -> dict:
    """Discrimination AUROC: GSH rule-set vs the learned writability model, on the WS-A3 GSH/controls.

    Loads the feature frame for `ct`, replaces its distance columns with those of the safety
    annotation store, scores every bin with `gsh_rule_score`, and compares rule score and
    `writability` on the validated GSH positives against the matched controls. Positives/controls
    whose (chrom, bin) is absent from the feature frame are skipped. A stratified bootstrap (2000
    resamples, fixed seed) gives 95% intervals for the learned AUROC and for learned-minus-rule.

    AUROC-derived fields are None when the AUROC is undefined (no positives or no controls), so the
    JSON report never contains a bare NaN. The report is written to `out` and returned.
    """
    from pen_stack.validate.blind_gsh_discovery import _load_features, gsh_positives
    import yaml

    cfg = yaml.safe_load((_ROOT / "configs" / "gsh_validated_heldout.yaml").read_text(encoding="utf-8"))
    df = _load_features(ct)
    dist_cols = ["dist_tss", "dist_oncogene", "dist_essential"]
    safe = pd.read_parquet(_ROOT.parent / "phase_1" / "features" / "safety_annot.parquet")[
        ["chrom", "bin", *dist_cols]]
    df = df.drop(columns=[c for c in dist_cols if c in df.columns]).merge(
        safe, on=["chrom", "bin"], how="left")
    df["gsh_rule"] = gsh_rule_score(df)

    positives = gsh_positives(df, cfg)
    controls = pd.read_parquet(_ROOT / "data" / "gsh_matched_controls.parquet")
    idx = df.set_index(["chrom", "bin"])

    def vals(frame, col):
        # Values of `col` for the frame's bins that exist in the feature frame.
        return [idx.loc[(r.chrom, r.bin), col] for r in frame.itertuples() if (r.chrom, r.bin) in idx.index]

    pr, cr = vals(positives, "gsh_rule"), vals(controls, "gsh_rule")
    pw, cw = vals(positives, "writability"), vals(controls, "writability")
    auroc_rule = _auroc(pr + cr, [1] * len(pr) + [0] * len(cr))
    auroc_learned = _auroc(pw + cw, [1] * len(pw) + [0] * len(cw))

    # Bootstrap 95% CI for the learned AUROC and the learned-minus-rule delta (prereg/ws_b.yaml: report delta
    # AND CI). Resample positives and controls independently (stratified). With only ~5 GSH positives the CI
    # is WIDE by construction - reported honestly rather than hidden.
    rng = np.random.default_rng(20260604)
    npos, nctrl = len(pw), len(cw)
    boot_learned, boot_delta = [], []
    if npos and nctrl:
        pw_a, cw_a = np.array(pw, float), np.array(cw, float)
        pr_a, cr_a = np.array(pr, float), np.array(cr, float)
        lab = [1] * npos + [0] * nctrl
        for _ in range(2000):
            # The same resampled indices are used for both scores so the delta is paired.
            pi = rng.integers(0, npos, npos)
            ci = rng.integers(0, nctrl, nctrl)
            al = _auroc(list(pw_a[pi]) + list(cw_a[ci]), lab)
            ar = _auroc(list(pr_a[pi]) + list(cr_a[ci]), lab)
            if not (np.isnan(al) or np.isnan(ar)):
                boot_learned.append(al)
                boot_delta.append(al - ar)

    def _ci(b):
        return [round(float(np.percentile(b, 2.5)), 4), round(float(np.percentile(b, 97.5)), 4)] if b else None

    learned_ok = not np.isnan(auroc_learned)
    rule_ok = not np.isnan(auroc_rule)
    comparable = learned_ok and rule_ok
    delta_ci = _ci(boot_delta)

    report = {
        "primary_safety_metric": "safe-harbour discrimination (validated GSH vs matched controls)",
        "n_positives": npos, "n_controls": nctrl,
        "auroc_learned_writability": round(auroc_learned, 4) if learned_ok else None,
        "auroc_learned_ci95": _ci(boot_learned),
        "auroc_gsh_ruleset_baseline": round(auroc_rule, 4) if rule_ok else None,
        "learned_beats_ruleset": bool(auroc_learned > auroc_rule) if comparable else None,
        "delta": round(auroc_learned - auroc_rule, 4) if comparable else None,
        "delta_ci95": delta_ci,
        # True only when the interval's lower bound is above zero (learned better than the rule).
        "delta_ci_excludes_zero": (bool(delta_ci[0] > 0) if delta_ci else None),
        "ci_note": f"bootstrap 2000x over {npos} positives + {nctrl} controls (seed 20260604); CI is wide "
                   "because only ~5 validated GSH anchor the positives - reported honestly.",
        "genotoxic_cis_auroc": "DEMOTED to a diagnostic - circular (label = proximity to 5 oncogenes = the "
                               "distance baseline's own definition); not a safety headline",
        "rule_thresholds_bp": _MIN_DIST,
    }
    Path(out).parent.mkdir(parents=True, exist_ok=True)
    Path(out).write_text(json.dumps(report, indent=2, default=str), encoding="utf-8")
    return report
114
+
115
+
116
+ if __name__ == "__main__": # pragma: no cover
117
+ print(json.dumps(run(), indent=2, default=str))