PyPI - pen-stack - Versions diffs - 3.1.0__py3-none-any.whl - Mend

pen-stack 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

pen_stack/__init__.py +2 -0
pen_stack/_resources.py +34 -0
pen_stack/adapt/__init__.py +14 -0
pen_stack/adapt/finetune.py +33 -0
pen_stack/adapt/ingest.py +86 -0
pen_stack/adapt/pipeline.py +101 -0
pen_stack/adapt/recalibrate.py +58 -0
pen_stack/adapt/report.py +130 -0
pen_stack/agent/__init__.py +1 -0
pen_stack/agent/guardrails.py +49 -0
pen_stack/agent/mcp_server.py +42 -0
pen_stack/agent/orchestrator.py +106 -0
pen_stack/agent/pen_agent.py +169 -0
pen_stack/agent/tools.py +130 -0
pen_stack/atlas/__init__.py +1 -0
pen_stack/atlas/build_wtkb.py +80 -0
pen_stack/atlas/crosslink.py +144 -0
pen_stack/atlas/expand.py +190 -0
pen_stack/atlas/schema.py +59 -0
pen_stack/atlas/scorecard.py +134 -0
pen_stack/atlas/universe.py +75 -0
pen_stack/atlas/variant_propose.py +155 -0
pen_stack/bridge/__init__.py +1 -0
pen_stack/bridge/activity.py +52 -0
pen_stack/bridge/cli.py +65 -0
pen_stack/bridge/fold_qc.py +53 -0
pen_stack/bridge/guide_qc.py +84 -0
pen_stack/bridge/ingest.py +139 -0
pen_stack/bridge/offtarget.py +133 -0
pen_stack/bridge/ortholog_screen.py +73 -0
pen_stack/bridge/pipeline.py +83 -0
pen_stack/cli.py +126 -0
pen_stack/data/__init__.py +1 -0
pen_stack/data/encode.py +84 -0
pen_stack/data/genome.py +71 -0
pen_stack/data/ingest_chromatin.py +119 -0
pen_stack/data/ingest_integration.py +112 -0
pen_stack/data/ingest_safety_annot.py +164 -0
pen_stack/data/ingest_trip.py +76 -0
pen_stack/mech/__init__.py +1 -0
pen_stack/mech/classify_atlas.py +71 -0
pen_stack/mech/whitelist.py +66 -0
pen_stack/monitor/__init__.py +1 -0
pen_stack/monitor/europepmc.py +32 -0
pen_stack/monitor/run.py +57 -0
pen_stack/monitor/triage.py +63 -0
pen_stack/planner/__init__.py +1 -0
pen_stack/planner/cargo.py +56 -0
pen_stack/planner/cargo_polish.py +146 -0
pen_stack/planner/delivery.py +32 -0
pen_stack/planner/multiplex.py +110 -0
pen_stack/planner/optimize.py +156 -0
pen_stack/planner/pipeline.py +86 -0
pen_stack/planner/report.py +26 -0
pen_stack/rag/__init__.py +1 -0
pen_stack/rag/index.py +53 -0
pen_stack/rag/llm.py +178 -0
pen_stack/rag/qa.py +105 -0
pen_stack/score/__init__.py +1 -0
pen_stack/score/recalibrate.py +77 -0
pen_stack/score/therapeutic.py +85 -0
pen_stack/server/__init__.py +1 -0
pen_stack/server/api.py +142 -0
pen_stack/ui/__init__.py +1 -0
pen_stack/ui/app.py +518 -0
pen_stack/validate/__init__.py +1 -0
pen_stack/validate/adapt_demo.py +69 -0
pen_stack/validate/agent_eval.py +117 -0
pen_stack/validate/blind_gsh_discovery.py +165 -0
pen_stack/validate/cargo_directionality.py +57 -0
pen_stack/validate/durability_baselines.py +150 -0
pen_stack/validate/forward_hypotheses.py +104 -0
pen_stack/validate/guide_qc_demo.py +58 -0
pen_stack/validate/intent_specification.py +82 -0
pen_stack/validate/paper3_benchmark.py +165 -0
pen_stack/validate/paper4_real_validation.py +144 -0
pen_stack/validate/paper4_validation.py +82 -0
pen_stack/validate/seq_vs_measured.py +134 -0
pen_stack/validate/within_locus_ranking.py +74 -0
pen_stack/validate/writer_recovery.py +86 -0
pen_stack/wgenome/__init__.py +1 -0
pen_stack/wgenome/chromatin_seq.py +83 -0
pen_stack/wgenome/durability.py +108 -0
pen_stack/wgenome/export_tracks.py +52 -0
pen_stack/wgenome/features.py +82 -0
pen_stack/wgenome/gsh_baseline.py +117 -0
pen_stack/wgenome/providers.py +245 -0
pen_stack/wgenome/safety.py +69 -0
pen_stack/wgenome/structure3d.py +168 -0
pen_stack/wgenome/writability.py +72 -0
pen_stack-3.1.0.dist-info/METADATA +451 -0
pen_stack-3.1.0.dist-info/RECORD +96 -0
pen_stack-3.1.0.dist-info/WHEEL +5 -0
pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
pen_stack-3.1.0.dist-info/top_level.txt +1 -0

pen_stack/validate/within_locus_ranking.py ADDED Viewed

@@ -0,0 +1,74 @@
+"""Within-locus site ranking (v3.1, WS-A5) - descriptive.
+For a large validated safe-harbour gene, does the planner rank the documented intronic safe bin above the
+other bins in that locus? We rank every 1 kb bin in the gene body by writability and report the documented
+bin's within-locus percentile. Descriptive (few qualifying loci); not a hypothesis test.
+Documented safe sub-region coordinates (hg38, widely cited):
+  - AAVS1  = PPP1R12C intron 1, chr19:55,115,768 (DeKelver 2010, 10.1101/gr.106773.110)
+  - CLYBL  = CLYBL intron 2, chr13:99,816,475 (Cerbini 2015, 10.1371/journal.pone.0116032)
+Acceptance (prereg/ws_a.yaml): the documented bin lands in the top quartile (>= 75th percentile of
+writability within the locus) for a pre-registered fraction of loci; reported per locus.
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+import pandas as pd
+# Directory that contains the `pen_stack` package (two levels above this module's folder).
+_ROOT = Path(__file__).resolve().parents[2]
+# Default destination of the JSON report written by `run`.
+_OUT = _ROOT / "out" / "within_locus_ranking.json"
+# Atlas parquet read by `run`; it lives in a sibling `phase_1` tree next to `_ROOT`.
+_WDF = _ROOT.parent / "phase_1" / "out" / "atlas_k562.parquet"
+# documented safe bins (gene, chrom, documented_bp)
+# `doc_bp` is the documented coordinate in base pairs; `run` converts it to a 1 kb bin with `// 1000`.
+_LOCI = [
+    {"name": "AAVS1", "gene": "PPP1R12C", "chrom": "chr19", "doc_bp": 55115768,
+     "doi": "10.1101/gr.106773.110"},
+    {"name": "CLYBL", "gene": "CLYBL", "chrom": "chr13", "doc_bp": 99816475,
+     "doi": "10.1371/journal.pone.0116032"},
+]
+def run(out: str | Path = _OUT) -> dict:
+    from pen_stack.planner.optimize import _gene_coords
+    wdf = pd.read_parquet(_WDF)
+    gc = _gene_coords()
+    rows = []
+    for loc in _LOCI:
+        g = gc[gc["gene"] == loc["gene"]]
+        if g.empty:
+            continue
+        r = g.iloc[0]
+        lo, hi = int(r["start"]) // 1000, int(r["end"]) // 1000
+        body = wdf[(wdf["chrom"] == loc["chrom"]) & (wdf["bin"].between(lo, hi))].dropna(subset=["writability"])
+        if body.empty:
+            continue
+        doc_bin = loc["doc_bp"] // 1000
+        doc_row = body[body["bin"] == doc_bin]
+        if doc_row.empty:                       # nearest available bin in the body
+            doc_row = body.iloc[(body["bin"] - doc_bin).abs().argsort()[:1]]
+        doc_w = float(doc_row.iloc[0]["writability"])
+        pct = float((body["writability"] < doc_w).mean())   # within-locus percentile of the documented bin
+        rows.append({"name": loc["name"], "gene": loc["gene"], "n_bins": int(len(body)),
+                     "documented_bin": int(doc_bin), "documented_writability": round(doc_w, 4),
+                     "within_locus_percentile": round(pct, 3), "top_quartile": bool(pct >= 0.75),
+                     "doi": loc["doi"]})
+    tab = pd.DataFrame(rows)
+    n = len(tab)
+    n_top = int(tab["top_quartile"].sum()) if n else 0
+    report = {
+        "what_this_is": "within-locus ranking of the documented safe bin (descriptive, not a hypothesis test)",
+        "n_loci": n, "n_top_quartile": n_top,
+        "fraction_top_quartile": round(n_top / n, 3) if n else None,
+        "per_locus": rows,
+        "scope": "few qualifying loci; descriptive; the documented sub-region is a 1 kb bin approximation.",
+    }
+    Path(out).parent.mkdir(parents=True, exist_ok=True)
+    Path(out).write_text(json.dumps(report, indent=2, default=str), encoding="utf-8")
+    return report
+if __name__ == "__main__":  # pragma: no cover
+    print(json.dumps(run(), indent=2, default=str))

pen_stack/validate/writer_recovery.py ADDED Viewed

@@ -0,0 +1,86 @@
+"""Diversified writer-family recovery (v3.1, WS-A4).
+The Phase-3 panel was bridge-dominated, so writer choice barely varied. Here we add DSB-free, large-cargo
+documented writes (CAST, PASTE/PE-integrase, large serine-integrase landing pads) so the correct family
+genuinely changes with cargo size. The writer is held out; we recover the family used from the goal +
+intent + cargo size + cell type alone.
+Selection rule (documented, not tuned): recommend the **smallest-capacity DSB-free writer family that fits
+the cargo** (do not deploy a 50 kb integrase for a 2 kb insert when a programmable bridge suffices); ties
+broken by measured human-cell activity. This makes cargo size load-bearing for the writer choice.
+Acceptance (prereg/ws_a.yaml): writer-family recovery@1 exceeds the prevalence baseline by a pre-registered
+margin on >= 8 entries spanning >= 3 families; reported per family.
+"""
+from __future__ import annotations
+import json
+from collections import Counter
+from pathlib import Path
+import pandas as pd
+# Directory that contains the `pen_stack` package (two levels above this module's folder).
+_ROOT = Path(__file__).resolve().parents[2]
+# CSV panel read by `run`; it uses the columns name, family, cargo_bp, dsb_free_required and doi.
+_PANEL = _ROOT / "data" / "writer_panel.csv"
+# Writer Atlas parquet read by `_family_caps`.
+_ATLAS = _ROOT / "pen_stack" / "atlas" / "atlas.parquet"
+# Default destination of the JSON report written by `run`.
+_OUT = _ROOT / "out" / "writer_recovery.json"
+def _family_caps() -> pd.DataFrame:
+    """family -> (cargo_capacity_bp, dsb_free, human_cell_activity proxy) from the Writer Atlas cores."""
+    atlas = pd.read_parquet(_ATLAS)
+    core = atlas[atlas["entry_kind"] == "curated_core"] if "entry_kind" in atlas else atlas
+    rows = []
+    for fam, sub in core.groupby("family"):
+        r = sub.iloc[0]
+        cap = r.get("cargo_capacity_bp")
+        act = r.get("S_HumanCell")
+        rows.append({"family": fam,
+                     "cargo_capacity_bp": (float(cap) if pd.notna(cap) else None),
+                     "dsb_free": bool(r.get("dsb_free", False)),
+                     "activity": (float(act) if pd.notna(act) else 0.4)})
+    return pd.DataFrame(rows)
+def recover_writer_family(cargo_bp: int, dsb_free_required: bool = True) -> str | None:
+    """Smallest-capacity DSB-free family that fits the cargo; ties by activity."""
+    caps = _family_caps()
+    cand = caps[caps["cargo_capacity_bp"].notna() & (caps["cargo_capacity_bp"] >= cargo_bp)]
+    if dsb_free_required:
+        cand = cand[cand["dsb_free"]]
+    if cand.empty:
+        return None
+    cand = cand.sort_values(["cargo_capacity_bp", "activity"], ascending=[True, False])
+    return cand.iloc[0]["family"]
+def run(out: str | Path = _OUT) -> dict:
+    panel = pd.read_csv(_PANEL)
+    panel["predicted_family"] = [recover_writer_family(int(r.cargo_bp), bool(r.dsb_free_required))
+                                 for r in panel.itertuples()]
+    panel["hit"] = panel["predicted_family"] == panel["family"]
+    n, n_hit = len(panel), int(panel["hit"].sum())
+    # prevalence baseline: always guess the most common family -> expected accuracy = max class share
+    prev = Counter(panel["family"])
+    prevalence_at1 = max(prev.values()) / n
+    per_family = {fam: {"n": int((panel["family"] == fam).sum()),
+                        "recall@1": round(float(panel[panel["family"] == fam]["hit"].mean()), 3)}
+                  for fam in sorted(prev)}
+    report = {
+        "what_this_is": "writer-family recovery@1 from goal+intent+cargo+ct, writer held out (non-circular)",
+        "n_entries": n, "n_families": len(prev),
+        "recovery_at_1": round(n_hit / n, 4),
+        "prevalence_baseline_at_1": round(prevalence_at1, 4),
+        "beats_prevalence": bool(n_hit / n > prevalence_at1),
+        "per_family": per_family,
+        "selection_rule": "smallest-capacity DSB-free family that fits the cargo; ties by human-cell activity",
+        "cases": panel[["name", "family", "cargo_bp", "predicted_family", "hit", "doi"]].to_dict("records"),
+        "scope": "small N; documented writes are survivorship-biased; cargo size is the dominant signal.",
+    }
+    Path(out).parent.mkdir(parents=True, exist_ok=True)
+    Path(out).write_text(json.dumps(report, indent=2, default=str), encoding="utf-8")
+    return report
+if __name__ == "__main__":  # pragma: no cover
+    print(json.dumps({k: v for k, v in run().items() if k != "cases"}, indent=2, default=str))

pen_stack/wgenome/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+"""pen_stack.wgenome - see PEN-STACK v3.0 program doc."""

pen_stack/wgenome/chromatin_seq.py ADDED Viewed

@@ -0,0 +1,83 @@
+"""Sequence-derived chromatin tracks (WS-C2): map AlphaGenome predictions onto the measured-atlas schema
+and recompute writability/safety/durability from predicted tracks.
+Two honest details:
+  * Unit handling. AlphaGenome track outputs are in the model's own units, not the measured ENCODE scale the
+    safety/durability models were trained on. Per-track agreement is therefore reported with Spearman (rank,
+    unit-free) alongside Pearson. For the *score-level* recompute we quantile-map each predicted track onto
+    the measured track's marginal (a standard rank-preserving calibration), so the recomputed scores test
+    whether AlphaGenome's RANKING of the epigenome recovers the measured-track scores - not a unit accident.
+  * Coverage. AlphaGenome predicts H3K9me3 for HepG2 but NOT K562; missing marks come back NaN and are
+    excluded from per-track correlation and passed as NaN to the (NaN-native) durability model.
+"""
+from __future__ import annotations
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from pen_stack.wgenome.providers import AlphaGenomeProvider, _HISTONES, TRACK_NAMES
+# Directory that contains the `pen_stack` package (two levels above this module's folder).
+_ROOT = Path(__file__).resolve().parents[2]
+# Feature directory of the sibling `phase_1` tree.
+_P1_FEAT = _ROOT.parent / "phase_1" / "features"
+# Output directory of the sibling `phase_1` tree; `_load_models` reads the pickled models from here.
+_P1_OUT = _ROOT.parent / "phase_1" / "out"
+def predicted_tracks_frame(ct: str, bins: pd.DataFrame, provider: AlphaGenomeProvider | None = None,
+                           offline: bool = False) -> pd.DataFrame:
+    """Predicted 7-track values for the given (chrom, bin) rows. Cached per bin in the provider."""
+    provider = provider or AlphaGenomeProvider(assembly="hg38")
+    rows = []
+    for r in bins.itertuples():
+        rec = provider.tracks(r.chrom, int(r.bin), ct, offline=offline)
+        if rec.get("available"):
+            rows.append({"chrom": r.chrom, "bin": int(r.bin),
+                         **{t: rec.get(t, np.nan) for t in TRACK_NAMES}})
+    return pd.DataFrame(rows)
+def quantile_map(pred: pd.Series, measured: pd.Series) -> pd.Series:
+    """Map `pred` onto `measured`'s marginal by matching ranks (rank-preserving calibration)."""
+    pred = pred.astype(float)
+    if pred.notna().sum() < 2 or measured.notna().sum() < 2:
+        return pred
+    ranks = pred.rank(pct=True, na_option="keep")
+    q = np.nanpercentile(measured.to_numpy(dtype=float), np.clip(ranks.to_numpy() * 100, 0, 100))
+    return pd.Series(q, index=pred.index)
+def _load_models(ct: str):
+    from pen_stack.wgenome.writability import load_pickle
+    safety = load_pickle(str(_P1_OUT / f"safety_{ct}.pkl"))
+    dur = load_pickle(str(_P1_OUT / "durability.pkl"))
+    return safety, dur
+def recompute_scores(matrix: pd.DataFrame, ct: str) -> pd.DataFrame:
+    """Apply the trained safety + durability models to a feature matrix; return writability components."""
+    from pen_stack.wgenome.writability import build_writability
+    safety, dur = _load_models(ct)
+    return build_writability(matrix, safety, dur)
+def build_predicted_matrix(measured_matrix: pd.DataFrame, predicted: pd.DataFrame, ct: str) -> pd.DataFrame:
+    """Substitute quantile-mapped predicted tracks into a copy of the measured feature matrix.
+    Distance/integration features are genomic (not predicted) and are kept as-is; only the chromatin tracks
+    (atac/dnase/5 histones -> accessibility + marks) are replaced, then `accessibility` is rederived.
+    """
+    from pen_stack.wgenome.features import add_accessibility
+    m = measured_matrix.merge(predicted, on=["chrom", "bin"], how="inner", suffixes=("", "_pred"))
+    for t in TRACK_NAMES:
+        pc = f"{t}_pred"
+        if pc in m.columns and t in m.columns:
+            m[t] = quantile_map(m[pc], m[t])     # map predicted onto this sample's measured marginal
+    m = m.drop(columns=[c for c in m.columns if c.endswith("_pred")])
+    m = m.drop(columns=["accessibility"], errors="ignore")
+    return add_accessibility(m)
+def histone_marks_for(ct: str) -> list[str]:
+    """Marks AlphaGenome actually predicts for this cell type (K562 lacks H3K9me3)."""
+    return [m for m in _HISTONES if not (ct.lower() == "k562" and m == "H3K9me3")]

pen_stack/wgenome/durability.py ADDED Viewed

@@ -0,0 +1,108 @@
+"""Durability layer (Phase 1, Step 1.7) - the conditional chromatin-context model.
+Learns ONE function: `local chromatin features -> (expression level, silenced/stable)` on TRIP
+integrations. The model never sees a coordinate, so it is cell-type-agnostic in function: to score a
+new cell type you supply its chromatin tracks. This is the layer no safe-harbour resource provides,
+and TRIP supervises exactly the writing-relevant quantity (position effect on an integrated cassette).
+"""
+from __future__ import annotations
+import os
+from pathlib import Path
+import lightgbm as lgb
+import numpy as np
+import pandas as pd
+import pyBigWig
+from scipy.stats import spearmanr
+from sklearn.metrics import roc_auc_score
+from sklearn.model_selection import GroupKFold
+# canonical chromatin feature names (must match across mouse training + human application)
+# `train_durability` uses whichever of these columns are present in its input frame.
+CHROMATIN = ["atac", "dnase", "H3K27ac", "H3K4me1", "H3K4me3", "H3K9me3", "H3K27me3"]
+def liftover_positions(df: pd.DataFrame, chain_file: str) -> pd.DataFrame:
+    """Lift (chrom,pos) with a UCSC chain (e.g. mm9->mm10). Drops positions that fail to lift."""
+    from pyliftover import LiftOver
+    lo = LiftOver(chain_file)
+    out = []
+    for _, r in df.iterrows():
+        c = lo.convert_coordinate(r["chrom"], int(r["pos"]))
+        if c:
+            row = r.to_dict()
+            row["chrom"], row["pos"] = c[0][0], c[0][1]
+            out.append(row)
+    return pd.DataFrame(out)
+def extract_chromatin_at(df: pd.DataFrame, panel: dict, raw_dir: str, download_fn,
+                         window: int = 2500) -> pd.DataFrame:
+    """Point-query each bigWig's mean signal in +/-window around each integration position.
+    Only the integration sites are queried (no genome-wide binning needed)."""
+    out = df.copy()
+    for name, rec in panel.items():
+        path = download_fn(rec["href"], os.path.join(raw_dir, f"mES_{name}_{rec['accession']}.bigWig"))
+        bw = pyBigWig.open(path)
+        chroms = set(bw.chroms().keys())
+        vals = []
+        for chrom, pos in zip(out["chrom"], out["pos"]):
+            key = chrom if chrom in chroms else chrom.replace("chr", "")
+            if key not in chroms:
+                vals.append(0.0)
+                continue
+            try:
+                v = bw.stats(key, max(0, pos - window), pos + window, type="mean")[0]
+            except (RuntimeError, IndexError):
+                v = None
+            vals.append(0.0 if v is None else float(v))
+        out[name] = vals
+        bw.close()
+        os.remove(path)
+        print(f"  extracted {name} at {len(out)} sites", flush=True)
+    return out
+def train_durability(trip_df: pd.DataFrame, seed: int = 42) -> dict:
+    """Fit the expression regressor and silencing classifier, with chromosome-grouped out-of-fold metrics.
+    Uses whichever `CHROMATIN` columns are present in `trip_df`, which must also provide `expression`,
+    `silenced` and `chrom`. Rows missing any used feature or `expression` are dropped first.
+    Returns a dict with the retained row count, the feature list, out-of-fold Spearman (expression) and
+    AUROC (silenced), single-feature baselines, the final regressor's feature importances (sorted
+    descending), and the final `reg` / `clf` models refit on all retained rows.
+    """
+    feats = [c for c in CHROMATIN if c in trip_df.columns]
+    df = trip_df.dropna(subset=feats + ["expression"]).copy()
+    X = df[feats].astype("float32").fillna(0.0)  # fillna is a no-op here: NaN feature rows were dropped above
+    y_expr = df["expression"].to_numpy()
+    y_sil = df["silenced"].astype(int).to_numpy()
+    # one group per chromosome, so a chromosome never appears in both the train and test side of a fold
+    groups = df["chrom"].astype("category").cat.codes.to_numpy()
+    gkf = GroupKFold(n_splits=min(5, len(np.unique(groups))))
+    oof_expr = np.zeros(len(df))
+    oof_sil = np.zeros(len(df))
+    for tr, te in gkf.split(X, y_expr, groups):
+        reg = lgb.LGBMRegressor(n_estimators=500, learning_rate=0.03, num_leaves=31,
+                                subsample=0.8, random_state=seed, n_jobs=-1, verbosity=-1).fit(X.iloc[tr], y_expr[tr])
+        oof_expr[te] = reg.predict(X.iloc[te])
+        clf = lgb.LGBMClassifier(n_estimators=500, learning_rate=0.03, num_leaves=31,
+                                 subsample=0.8, random_state=seed, n_jobs=-1, verbosity=-1).fit(X.iloc[tr], y_sil[tr])
+        oof_sil[te] = clf.predict_proba(X.iloc[te])[:, 1]
+    # headline metrics are computed on the pooled out-of-fold predictions
+    rho = float(spearmanr(oof_expr, y_expr).statistic)
+    auroc = float(roc_auc_score(y_sil, oof_sil))
+    # baseline: H3K9me3 (heterochromatin) alone as a silencing predictor, and LAD-like (low ATAC) for expression
+    base_sil = roc_auc_score(y_sil, df["H3K9me3"].fillna(0)) if "H3K9me3" in df else float("nan")
+    base_expr = spearmanr(df.get("atac", pd.Series(0, index=df.index)).fillna(0), y_expr).statistic
+    # final models are refit on every retained row (note: no `subsample` here, unlike the fold models)
+    final_reg = lgb.LGBMRegressor(n_estimators=500, learning_rate=0.03, num_leaves=31,
+                                  random_state=seed, n_jobs=-1, verbosity=-1).fit(X, y_expr)
+    imp = dict(sorted(zip(feats, final_reg.feature_importances_.tolist()), key=lambda kv: kv[1], reverse=True))
+    return {
+        "n": int(len(df)), "features": feats,
+        "expr_spearman": rho, "expr_baseline_atac_spearman": float(base_expr),
+        "silenced_auroc": auroc, "silenced_baseline_h3k9me3_auroc": float(base_sil),
+        "feature_importance": imp, "reg": final_reg,
+        "clf": lgb.LGBMClassifier(n_estimators=500, learning_rate=0.03, num_leaves=31,
+                                  random_state=seed, n_jobs=-1, verbosity=-1).fit(X, y_sil),
+    }
+def save_models(res: dict, out_dir: str, tag: str = "durability") -> None:
+    import pickle
+    Path(out_dir).mkdir(parents=True, exist_ok=True)
+    with open(f"{out_dir}/{tag}.pkl", "wb") as fh:
+        pickle.dump({"reg": res["reg"], "clf": res["clf"], "features": res["features"]}, fh)

pen_stack/wgenome/export_tracks.py ADDED Viewed

@@ -0,0 +1,52 @@
+"""Export the Writable Genome atlas as genome-browser tracks (Phase 1, Step 1.11).
+Writes per-cell-type BigWig tracks (writability, safety, p_durable) loadable in IGV/UCSC, plus a BED
+of the top-writable loci. The atlas parquet stays the queryable source; these are the shareable tracks.
+"""
+from __future__ import annotations
+from pathlib import Path
+import pandas as pd
+import pyBigWig
+from pen_stack.data.genome import MAIN_CHROMS, load_chrom_sizes
+# Width of one atlas bin in bp; bin index * BIN_BP is the bin's start coordinate.
+BIN_BP = 1000
+# Atlas columns exported as one BigWig each (columns missing from the atlas are skipped).
+TRACKS = ["writability", "safety", "p_durable"]
+def write_bigwig(df: pd.DataFrame, col: str, chrom_sizes: dict[str, int], out_bw: str) -> None:
+    bw = pyBigWig.open(out_bw, "w")
+    # header must be sorted; keep canonical chrom order with sizes
+    chroms = [(c, chrom_sizes[c]) for c in MAIN_CHROMS if c in chrom_sizes]
+    bw.addHeader(chroms)
+    for chrom, _ in chroms:
+        g = df[df["chrom"] == chrom].sort_values("bin")
+        if g.empty:
+            continue
+        starts = (g["bin"].to_numpy() * BIN_BP).astype("int64")
+        vals = g[col].astype("float64").fillna(0.0).to_numpy()
+        bw.addEntries(chrom, list(starts), values=list(vals), span=BIN_BP, step=BIN_BP)
+    bw.close()
+def export_atlas(atlas_parquet: str, chrom_sizes_tsv: str, out_dir: str, ct: str,
+                 top_n: int = 5000) -> dict:
+    df = pd.read_parquet(atlas_parquet)
+    sizes = load_chrom_sizes(chrom_sizes_tsv)
+    Path(out_dir).mkdir(parents=True, exist_ok=True)
+    written = {}
+    for col in TRACKS:
+        if col in df.columns:
+            out_bw = f"{out_dir}/atlas_{ct}_{col}.bw"
+            write_bigwig(df, col, sizes, out_bw)
+            written[col] = out_bw
+    # top-writable loci BED
+    top = df.nlargest(top_n, "writability")[["chrom", "bin", "writability"]].copy()
+    top["start"] = top["bin"] * BIN_BP
+    top["end"] = top["start"] + BIN_BP
+    bed = f"{out_dir}/atlas_{ct}_top{top_n}.bed"
+    top[["chrom", "start", "end", "writability"]].to_csv(bed, sep="\t", header=False, index=False)
+    written["top_bed"] = bed
+    return written

pen_stack/wgenome/features.py ADDED Viewed

@@ -0,0 +1,82 @@
+"""Assemble the per-cell-type training/scoring matrix (Phase 1, bridge between 1A and 1B).
+Joins the cell-type chromatin feature store + the (cell-type-agnostic) safety-annotation store
+(+ integration-outcome store when available) on (chrom, bin) into one matrix the safety and
+durability layers consume. Keeps feature provenance explicit.
+"""
+from __future__ import annotations
+from pathlib import Path
+import numpy as np
+import pandas as pd
+# Unified chromatin feature set: ONE accessibility feature (ATAC where present, else DNase) +
+# the 5 core histone marks. This makes every cell type share an IDENTICAL schema, so a cell type
+# that lacks a specific accessibility assay (e.g. CD34+ HSPC has DNase but no ATAC) is fully
+# specified rather than "partial" - ATAC and DNase are interchangeable open-chromatin assays.
+CHROMATIN_TRACKS = ["accessibility", "H3K27ac", "H3K4me1", "H3K4me3", "H3K9me3", "H3K27me3"]
+# Names of the two raw open-chromatin columns (the ones `add_accessibility` reads).
+ACCESS_SOURCES = ["atac", "dnase"]
+# Raw distance columns; `assemble_matrix` adds a `log_<name>` column for each one that is present.
+SAFETY_DIST = ["dist_oncogene", "dist_tsg", "dist_essential", "dist_tss"]
+def add_accessibility(m: pd.DataFrame) -> pd.DataFrame:
+    """Derive the unified `accessibility` column: prefer ATAC, fall back to DNase."""
+    if "accessibility" not in m.columns:
+        if "atac" in m.columns:
+            m["accessibility"] = m["atac"]
+            if "dnase" in m.columns:                      # fill any ATAC gaps with DNase
+                m["accessibility"] = m["accessibility"].fillna(m["dnase"])
+        elif "dnase" in m.columns:
+            m["accessibility"] = m["dnase"]
+    return m
+def _log_dist(s: pd.Series) -> pd.Series:
+    # large/Inf for "no feature on chromosome" -> log1p of a capped distance; NaN -> max
+    v = s.fillna(s.max() if s.notna().any() else 1e8).clip(lower=0)
+    return np.log1p(v)
+def assemble_matrix(chromatin_parquet: str, safety_parquet: str,
+                    integration_parquet: str | None = None,
+                    out_parquet: str | None = None) -> pd.DataFrame:
+    chrom = pd.read_parquet(chromatin_parquet)
+    safe = pd.read_parquet(safety_parquet)
+    m = chrom.merge(safe, on=["chrom", "bin"], how="inner")
+    m = add_accessibility(m)                               # unify ATAC/DNase -> accessibility
+    # log-scaled distance features (raw kept too, for transparency)
+    for d in SAFETY_DIST:
+        if d in m.columns:
+            m[f"log_{d}"] = _log_dist(m[d])
+    if integration_parquet and Path(integration_parquet).exists():
+        integ = pd.read_parquet(integration_parquet)
+        m = m.merge(integ, on=["chrom", "bin"], how="left")
+        for c in [c for c in integ.columns if c not in ("chrom", "bin")]:
+            m[c] = m[c].fillna(0)
+    if out_parquet:
+        Path(out_parquet).parent.mkdir(parents=True, exist_ok=True)
+        m.to_parquet(out_parquet, index=False)
+    return m
+def resolve_integration(feat_dir: str, ct: str) -> str | None:
+    """Integration-feature parquet for a cell type: prefer the cell-type-specific MLV set
+    (LaFave K562/HepG2); fall back to the cell-type-agnostic VISDB retroviral-propensity track so a
+    cell type without its own integration assay (e.g. CD34+ HSPC) still gets an integration feature."""
+    ct_specific = Path(feat_dir) / f"integration_{ct}.parquet"
+    if ct_specific.exists():
+        return str(ct_specific)
+    fallback = Path(feat_dir) / "integration_density.parquet"
+    return str(fallback) if fallback.exists() else None
+def feature_columns(df: pd.DataFrame) -> list[str]:
+    """The model feature set: chromatin marks + log-distances + any integration features."""
+    feats = [c for c in CHROMATIN_TRACKS if c in df.columns]
+    feats += [c for c in df.columns if c.startswith("log_dist_")]
+    feats += [c for c in df.columns if c.startswith("integ_")]
+    return feats

pen_stack/wgenome/gsh_baseline.py ADDED Viewed

@@ -0,0 +1,117 @@
+"""Genomic safe-harbour (GSH) rule-set baseline (v3.1, WS-B3).
+A published multi-criterion GSH rule (Papapetrou/Sadelain/Pellenz style) implemented from the existing
+per-bin annotations: outside a gene, and minimum distances to the nearest TSS, cancer/oncogene, and
+essential gene. We compute it as a graded safety score and compare its **safe-harbour discrimination**
+(held-out validated GSH vs matched controls, reusing WS-A3) against the learned writability model.
+The headline safety claim is **discrimination** (validated GSH vs matched controls), NOT the
+`genotoxic_cis` AUROC - which is circular (its label is proximity to five oncogenes, i.e. the distance
+baseline's own definition) and is demoted to a clearly-labeled diagnostic.
+Acceptance (prereg/ws_b.yaml): the learned model beats the GSH rule-set on discrimination AUROC; report
+the delta. If it does not, say so - the rule is a strong, interpretable baseline.
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+import numpy as np
+import pandas as pd
+# Directory that contains the `pen_stack` package (two levels above this module's folder).
+_ROOT = Path(__file__).resolve().parents[2]
+# Default destination of the JSON report written by `run`.
+_OUT = _ROOT / "out" / "gsh_baseline.json"
+# published-style minimum distances (bp). Graded: a bin scores higher the further it clears each minimum.
+# Keys are the per-bin distance columns read by `gsh_rule_score`; values are the thresholds in bp.
+_MIN_DIST = {"dist_tss": 5000, "dist_oncogene": 50000, "dist_essential": 50000}
+def gsh_rule_score(df: pd.DataFrame) -> pd.Series:
+    """Graded GSH-rule safety score in [0,1]: mean over criteria of min(dist / threshold, 1)."""
+    parts = []
+    for col, thr in _MIN_DIST.items():
+        if col in df.columns:
+            parts.append((df[col].clip(lower=0) / thr).clip(upper=1.0))
+    if not parts:
+        return pd.Series(0.0, index=df.index)
+    return pd.concat(parts, axis=1).mean(axis=1)
+def _auroc(scores, labels) -> float:
+    pos = [s for s, y in zip(scores, labels) if y == 1]
+    neg = [s for s, y in zip(scores, labels) if y == 0]
+    if not pos or not neg:
+        return float("nan")
+    return sum((p > n) + 0.5 * (p == n) for p in pos for n in neg) / (len(pos) * len(neg))
+def run(ct: str = "k562", out: str | Path = _OUT) -> dict:
+    """Discrimination AUROC: GSH rule-set vs the learned writability model, on the WS-A3 GSH/controls."""
+    from pen_stack.validate.blind_gsh_discovery import _load_features, gsh_positives
+    import yaml
+    cfg = yaml.safe_load((_ROOT / "configs" / "gsh_validated_heldout.yaml").read_text(encoding="utf-8"))
+    df = _load_features(ct)
+    safe = pd.read_parquet(_ROOT.parent / "phase_1" / "features" / "safety_annot.parquet")[
+        ["chrom", "bin", "dist_tss", "dist_oncogene", "dist_essential"]]
+    df = df.drop(columns=[c for c in ["dist_tss", "dist_oncogene", "dist_essential"] if c in df.columns]).merge(
+        safe, on=["chrom", "bin"], how="left")
+    df["gsh_rule"] = gsh_rule_score(df)
+    positives = gsh_positives(df, cfg)
+    controls = pd.read_parquet(_ROOT / "data" / "gsh_matched_controls.parquet")
+    idx = df.set_index(["chrom", "bin"])
+    def vals(frame, col):
+        return [idx.loc[(r.chrom, r.bin), col] for r in frame.itertuples() if (r.chrom, r.bin) in idx.index]
+    pr, cr = vals(positives, "gsh_rule"), vals(controls, "gsh_rule")
+    pw, cw = vals(positives, "writability"), vals(controls, "writability")
+    labels_r = [1] * len(pr) + [0] * len(cr)
+    labels_w = [1] * len(pw) + [0] * len(cw)
+    auroc_rule = _auroc(pr + cr, labels_r)
+    auroc_learned = _auroc(pw + cw, labels_w)
+    # Bootstrap 95% CI for the learned AUROC and the learned-minus-rule delta (prereg/ws_b.yaml: report delta
+    # AND CI). Resample positives and controls independently (stratified). With only ~5 GSH positives the CI
+    # is WIDE by construction - reported honestly rather than hidden.
+    rng = np.random.default_rng(20260604)
+    npos, nctrl = len(pw), len(cw)
+    boot_learned, boot_delta = [], []
+    if npos and nctrl:
+        pw_a, cw_a = np.array(pw, float), np.array(cw, float)
+        pr_a, cr_a = np.array(pr, float), np.array(cr, float)
+        for _ in range(2000):
+            pi = rng.integers(0, npos, npos)
+            ci = rng.integers(0, nctrl, nctrl)
+            lab = [1] * npos + [0] * nctrl
+            al = _auroc(list(pw_a[pi]) + list(cw_a[ci]), lab)
+            ar = _auroc(list(pr_a[pi]) + list(cr_a[ci]), lab)
+            if not (np.isnan(al) or np.isnan(ar)):
+                boot_learned.append(al)
+                boot_delta.append(al - ar)
+    def _ci(b):
+        return [round(float(np.percentile(b, 2.5)), 4), round(float(np.percentile(b, 97.5)), 4)] if b else None
+    report = {
+        "primary_safety_metric": "safe-harbour discrimination (validated GSH vs matched controls)",
+        "n_positives": npos, "n_controls": nctrl,
+        "auroc_learned_writability": round(auroc_learned, 4),
+        "auroc_learned_ci95": _ci(boot_learned),
+        "auroc_gsh_ruleset_baseline": round(auroc_rule, 4) if not np.isnan(auroc_rule) else None,
+        "learned_beats_ruleset": bool(auroc_learned > auroc_rule) if not np.isnan(auroc_rule) else None,
+        "delta": round(auroc_learned - auroc_rule, 4) if not np.isnan(auroc_rule) else None,
+        "delta_ci95": _ci(boot_delta),
+        "delta_ci_excludes_zero": (bool(_ci(boot_delta)[0] > 0) if boot_delta else None),
+        "ci_note": f"bootstrap 2000x over {npos} positives + {nctrl} controls (seed 20260604); CI is wide "
+                   "because only ~5 validated GSH anchor the positives - reported honestly.",
+        "genotoxic_cis_auroc": "DEMOTED to a diagnostic - circular (label = proximity to 5 oncogenes = the "
+                               "distance baseline's own definition); not a safety headline",
+        "rule_thresholds_bp": _MIN_DIST,
+    }
+    Path(out).parent.mkdir(parents=True, exist_ok=True)
+    Path(out).write_text(json.dumps(report, indent=2, default=str), encoding="utf-8")
+    return report
+if __name__ == "__main__":  # pragma: no cover
+    print(json.dumps(run(), indent=2, default=str))