PyPI - pen-stack - Versions diffs - 3.1.0__py3-none-any.whl - Mend

pen-stack 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

pen_stack/__init__.py +2 -0
pen_stack/_resources.py +34 -0
pen_stack/adapt/__init__.py +14 -0
pen_stack/adapt/finetune.py +33 -0
pen_stack/adapt/ingest.py +86 -0
pen_stack/adapt/pipeline.py +101 -0
pen_stack/adapt/recalibrate.py +58 -0
pen_stack/adapt/report.py +130 -0
pen_stack/agent/__init__.py +1 -0
pen_stack/agent/guardrails.py +49 -0
pen_stack/agent/mcp_server.py +42 -0
pen_stack/agent/orchestrator.py +106 -0
pen_stack/agent/pen_agent.py +169 -0
pen_stack/agent/tools.py +130 -0
pen_stack/atlas/__init__.py +1 -0
pen_stack/atlas/build_wtkb.py +80 -0
pen_stack/atlas/crosslink.py +144 -0
pen_stack/atlas/expand.py +190 -0
pen_stack/atlas/schema.py +59 -0
pen_stack/atlas/scorecard.py +134 -0
pen_stack/atlas/universe.py +75 -0
pen_stack/atlas/variant_propose.py +155 -0
pen_stack/bridge/__init__.py +1 -0
pen_stack/bridge/activity.py +52 -0
pen_stack/bridge/cli.py +65 -0
pen_stack/bridge/fold_qc.py +53 -0
pen_stack/bridge/guide_qc.py +84 -0
pen_stack/bridge/ingest.py +139 -0
pen_stack/bridge/offtarget.py +133 -0
pen_stack/bridge/ortholog_screen.py +73 -0
pen_stack/bridge/pipeline.py +83 -0
pen_stack/cli.py +126 -0
pen_stack/data/__init__.py +1 -0
pen_stack/data/encode.py +84 -0
pen_stack/data/genome.py +71 -0
pen_stack/data/ingest_chromatin.py +119 -0
pen_stack/data/ingest_integration.py +112 -0
pen_stack/data/ingest_safety_annot.py +164 -0
pen_stack/data/ingest_trip.py +76 -0
pen_stack/mech/__init__.py +1 -0
pen_stack/mech/classify_atlas.py +71 -0
pen_stack/mech/whitelist.py +66 -0
pen_stack/monitor/__init__.py +1 -0
pen_stack/monitor/europepmc.py +32 -0
pen_stack/monitor/run.py +57 -0
pen_stack/monitor/triage.py +63 -0
pen_stack/planner/__init__.py +1 -0
pen_stack/planner/cargo.py +56 -0
pen_stack/planner/cargo_polish.py +146 -0
pen_stack/planner/delivery.py +32 -0
pen_stack/planner/multiplex.py +110 -0
pen_stack/planner/optimize.py +156 -0
pen_stack/planner/pipeline.py +86 -0
pen_stack/planner/report.py +26 -0
pen_stack/rag/__init__.py +1 -0
pen_stack/rag/index.py +53 -0
pen_stack/rag/llm.py +178 -0
pen_stack/rag/qa.py +105 -0
pen_stack/score/__init__.py +1 -0
pen_stack/score/recalibrate.py +77 -0
pen_stack/score/therapeutic.py +85 -0
pen_stack/server/__init__.py +1 -0
pen_stack/server/api.py +142 -0
pen_stack/ui/__init__.py +1 -0
pen_stack/ui/app.py +518 -0
pen_stack/validate/__init__.py +1 -0
pen_stack/validate/adapt_demo.py +69 -0
pen_stack/validate/agent_eval.py +117 -0
pen_stack/validate/blind_gsh_discovery.py +165 -0
pen_stack/validate/cargo_directionality.py +57 -0
pen_stack/validate/durability_baselines.py +150 -0
pen_stack/validate/forward_hypotheses.py +104 -0
pen_stack/validate/guide_qc_demo.py +58 -0
pen_stack/validate/intent_specification.py +82 -0
pen_stack/validate/paper3_benchmark.py +165 -0
pen_stack/validate/paper4_real_validation.py +144 -0
pen_stack/validate/paper4_validation.py +82 -0
pen_stack/validate/seq_vs_measured.py +134 -0
pen_stack/validate/within_locus_ranking.py +74 -0
pen_stack/validate/writer_recovery.py +86 -0
pen_stack/wgenome/__init__.py +1 -0
pen_stack/wgenome/chromatin_seq.py +83 -0
pen_stack/wgenome/durability.py +108 -0
pen_stack/wgenome/export_tracks.py +52 -0
pen_stack/wgenome/features.py +82 -0
pen_stack/wgenome/gsh_baseline.py +117 -0
pen_stack/wgenome/providers.py +245 -0
pen_stack/wgenome/safety.py +69 -0
pen_stack/wgenome/structure3d.py +168 -0
pen_stack/wgenome/writability.py +72 -0
pen_stack-3.1.0.dist-info/METADATA +451 -0
pen_stack-3.1.0.dist-info/RECORD +96 -0
pen_stack-3.1.0.dist-info/WHEEL +5 -0
pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
pen_stack-3.1.0.dist-info/top_level.txt +1 -0

pen_stack/cli.py ADDED Viewed

@@ -0,0 +1,126 @@
+"""PEN-STACK unified CLI (subcommands wired per-phase: atlas, score, writable, crosslink, monitor).
+One entry point - ``pen-stack`` - over the whole stack. Heavy data (the Phase-1 writability atlas) is
+loaded lazily and degrades gracefully when absent, so ``info`` / ``atlas`` work from a clean install.
+"""
+from __future__ import annotations
+import click
+from pen_stack import __version__
# Root command group: each subcommand in this module attaches itself with @main.command().
@click.group()
@click.version_option(__version__, prog_name="pen-stack")
def main() -> None:
    """PEN-STACK - open infrastructure for genome writing."""
@main.command()
def info():
    """Show stack status and module map."""
    # Static banner: version first, then one line per pillar / service group.
    banner = (
        f"PEN-STACK v{__version__}",
        "Pillar B (flagship): wgenome  - Writable Genome (safety x durability x reachability)",
        "Pillar A (companion): atlas, mech, score - Writer Atlas + WT-KB",
        "Engine: planner - Write Planner (inverse design)",
        "Beachhead: bridge - bridge-recombinase off-target engine",
        "Services: monitor, rag, agent, ui, server",
    )
    for text in banner:
        click.echo(text)
@main.command()
@click.option("--family", default=None, help="Filter to one writer family.")
@click.option("--coverage", is_flag=True, help="Show per-family coverage + confidence breakdown.")
@click.option("--limit", default=10, help="Max rows to print.")
def atlas(family, coverage, limit):
    """Query the Writer Atlas."""
    # Heavy imports stay local so other subcommands do not pay for them.
    import pandas as pd
    from pen_stack.atlas.crosslink import _ATLAS

    # Report a missing atlas file as a clean CLI error, the same way the
    # writable / crosslink / plan subcommands do, instead of a raw traceback.
    try:
        df = pd.read_parquet(_ATLAS)
    except FileNotFoundError as e:
        raise click.ClickException(f"Writer Atlas not available: {e}") from e

    if coverage:
        # One row per family: system count, number of "measured" entries, and the
        # reachability tier of the family's first row.
        cov = (df.groupby("family")
                 .agg(n=("representative_system", "size"),
                      measured=("confidence", lambda s: (s == "measured").sum()),
                      tier=("reachability_tier", "first"))
                 .reset_index())
        click.echo(cov.to_string(index=False))
        click.echo(f"\nTOTAL systems: {len(df):,} across {df['family'].nunique()} families")
        return

    if family:
        df = df[df["family"] == family]
    # Only print the display columns that actually exist in this atlas build.
    cols = [c for c in ["representative_system", "family", "confidence", "deliv_class",
                        "readiness", "reachability_tier"] if c in df.columns]
    click.echo(df[cols].head(limit).to_string(index=False))
@main.command()
@click.option("--gene", required=True, help="Target gene symbol.")
@click.option("--ct", default="k562", help="Cell type (k562/hepg2/hspc).")
@click.option("--top", default=10, help="Top writable bins to show.")
def writable(gene, ct, top):
    """Rank writable loci overlapping a gene."""
    from pen_stack.atlas.crosslink import loci_for_gene

    try:
        hits = loci_for_gene(gene, ct)
    except FileNotFoundError as e:
        # The Phase-1 atlas is optional data; turn its absence into a CLI error.
        raise click.ClickException(f"Phase-1 writability atlas not available: {e}") from e

    if not hits.empty:
        shown = ["chrom", "bin", "safety", "p_durable", "writability"]
        click.echo(hits[shown].head(top).to_string(index=False))
        return
    click.echo(f"No writable bins found for {gene} in {ct}.")
@main.command()
@click.option("--family", help="Writer family -> ranked reachable loci.")
@click.option("--chrom", help="Locus chrom (with --bin) -> reachable writers.")
@click.option("--bin", "bin_idx", type=int, help="Locus 1kb bin index.")
@click.option("--ct", default="k562")
@click.option("--top", default=10)
def crosslink(family, chrom, bin_idx, ct, top):
    """Writer<->locus cross-link queries."""
    from pen_stack.atlas import crosslink as cl

    # A locus query needs both a chromosome and a bin index (bin 0 is valid).
    by_locus = bool(chrom) and bin_idx is not None
    try:
        if family:
            table = cl.loci_for_writer(family, ct, top=top)
        elif by_locus:
            table = cl.writers_for_locus(chrom, bin_idx, ct).head(top)
        else:
            raise click.UsageError("provide --family OR (--chrom and --bin)")
        click.echo(table.to_string(index=False))
    except FileNotFoundError as e:
        raise click.ClickException(f"Phase-1 writability atlas not available: {e}") from e
@main.command()
@click.option("--gene", required=True, help="Target gene symbol.")
@click.option("--intent", required=True,
              type=click.Choice(["safe_harbour_insertion", "knock_in_with_disruption",
                                 "high_durability_insertion", "regulatory_excision", "repeat_excision"]))
@click.option("--cargo-bp", default=2000, help="Payload size (bp).")
@click.option("--ct", default="k562", help="Cell type (k562/hepg2/hspc).")
@click.option("--k", default=3, help="Number of ranked plans.")
def plan(gene, intent, cargo_bp, ct, k):
    """Write Planner: goal + edit_intent -> ranked, traceable plans."""
    from pen_stack.planner.optimize import EditIntent
    from pen_stack.planner.pipeline import plan_write
    from pen_stack.planner.report import render_plans

    try:
        ranked = plan_write(gene, EditIntent(intent), cargo_bp, ct, k=k)
    except FileNotFoundError as e:
        raise click.ClickException(f"Phase-1 writability atlas not available: {e}") from e
    else:
        # Rendering happens only once planning succeeded.
        click.echo(render_plans(ranked))
@main.command()
@click.option("--since", default="2026-01-01", help="Earliest publication date (YYYY-MM-DD).")
@click.option("--back-test", is_flag=True, help="Run the ISPpu10 back-test window.")
def monitor(since, back_test):
    """Run PEN-MONITOR (Europe PMC living-database scan -> curation queue)."""
    from pen_stack.monitor.run import run_monitor

    summary = run_monitor(since=since, back_test=back_test)
    click.echo(f"PEN-MONITOR: {summary['n_hits']} hits, {summary['n_candidates']} candidates -> {summary['queue']}")

    # The back-test verdict is printed only when the run reported one.
    found = summary.get("isppu10_found")
    if found is None:
        return
    click.echo(f"ISPpu10 back-test: {'FOUND' if found else 'not found'}")
# Run the click group when this module is executed directly as a script.
if __name__ == "__main__":
    main()

pen_stack/data/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """pen_stack.data - see PEN-STACK v3.0 program doc."""

pen_stack/data/encode.py ADDED Viewed

@@ -0,0 +1,84 @@
+"""ENCODE REST resolver (Phase 1, Step 1.1).
+Resolves released hg38 bigWig SIGNAL files for a (biosample, assay/target) pair via the ENCODE
+Portal REST API - so we never hard-code possibly-wrong file accessions. Returns accession + href.
+"""
+from __future__ import annotations
+import requests
# Base URL of the ENCODE Portal; the search path and file hrefs are appended to it.
ENCODE = "https://www.encodeproject.org"
# Request header asking the portal for JSON responses.
HEADERS = {"accept": "application/json"}
# preferred processed signal output per assay (fold-change over control where available);
# list order is the ranking find_bigwig applies: an earlier entry beats a later one
_PREF_OUTPUT = [
    "fold change over control",
    "signal p-value",
    "read-depth normalized signal",
    "signal",
]
def _search(params: dict) -> list[dict]:
    """Query the ENCODE ``/search/`` endpoint and return its ``@graph`` list.

    A 404 response is reported as an empty list; any other HTTP error raises.
    """
    resp = requests.get(f"{ENCODE}/search/", params=params, headers=HEADERS, timeout=60)
    # ENCODE returns 404 for zero-result searches with some param combos
    if resp.status_code != 404:
        resp.raise_for_status()
        return resp.json().get("@graph", [])
    return []
def find_bigwig(biosample: str, assay_title: str, target: str | None = None,
                assembly: str = "GRCh38") -> dict | None:
    """Find one released bigWig signal file for a biosample + assay (+ histone target).

    biosample e.g. 'K562'; assay_title e.g. 'Histone ChIP-seq' / 'ATAC-seq' / 'DNase-seq';
    target e.g. 'H3K27ac' (None for ATAC/DNase).

    Returns a record with accession, absolute href, output_type and the query
    fields, or None when the search yields nothing.
    """
    query = {
        "type": "File",
        "file_format": "bigWig",
        "output_type": _PREF_OUTPUT,
        "assembly": assembly,
        "status": "released",
        "biosample_ontology.term_name": biosample,
        "assay_title": assay_title,
        "format": "json",
        "limit": "50",
    }
    if target:
        query["target.label"] = target

    hits = _search(query)
    if not hits:
        return None

    # Score each hit by the position of its output_type in _PREF_OUTPUT; unknown
    # types sort after every known one. The first best-scoring hit wins.
    worst = len(_PREF_OUTPUT)

    def preference(rec):
        kind = rec.get("output_type", "")
        return _PREF_OUTPUT.index(kind) if kind in _PREF_OUTPUT else worst

    best = min(hits, key=preference)
    return {
        "accession": best["accession"],
        "href": ENCODE + best["href"],
        "output_type": best.get("output_type"),
        "assembly": assembly,
        "biosample": biosample,
        "assay": assay_title,
        "target": target,
    }
# default track panel per the prereg (durability features)
# Each entry is (assay_title, histone target); target is None for the
# accessibility assays, which have no histone mark.
DEFAULT_PANEL = [
    ("ATAC-seq", None),
    ("DNase-seq", None),
    ("Histone ChIP-seq", "H3K27ac"),
    ("Histone ChIP-seq", "H3K4me1"),
    ("Histone ChIP-seq", "H3K4me3"),
    ("Histone ChIP-seq", "H3K9me3"),
    ("Histone ChIP-seq", "H3K27me3"),
]
def resolve_panel(biosample: str, panel=DEFAULT_PANEL, assembly: str = "GRCh38") -> dict[str, dict]:
    """Resolve every (assay, target) of ``panel`` to a bigWig record for ``biosample``.

    The result maps track name -> file record. Entries for which no released
    bigWig is found are left out, so a partial panel is returned as-is.
    """
    resolved = {}
    for assay, target in panel:
        record = find_bigwig(biosample, assay, target, assembly=assembly)
        # Track key: the histone mark when given, else the assay prefix
        # (H3K27ac / atac / dnase).
        track = target or assay.split("-")[0].lower()
        if not record:
            continue
        resolved[track] = record
    return resolved

pen_stack/data/genome.py ADDED Viewed

@@ -0,0 +1,71 @@
+"""hg38 genome scaffolding (Phase 1, Step 1.1 foundation).
+Fetches hg38 chromosome sizes and builds the canonical 1 kb bin grid (autosomes + X) that every
+feature store is keyed on. Pure-CPU, small; runs in any container.
+"""
+from __future__ import annotations
+from pathlib import Path
+import pandas as pd
+import requests
# UCSC-hosted two-column (name, length) table of hg38 sequence sizes.
UCSC_CHROM_SIZES = "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes"
# Chromosomes kept in the grid: chr1..chr22 plus chrX, in this canonical order.
MAIN_CHROMS = [f"chr{i}" for i in range(1, 23)] + ["chrX"]
# Width of one grid bin in base pairs (1 kb).
BIN_BP = 1000
def fetch_chrom_sizes(out_tsv: str | Path, url: str = UCSC_CHROM_SIZES,
                      chroms: list[str] = MAIN_CHROMS) -> dict[str, int]:
    """Download a chrom.sizes table, keep ``chroms``, write them to ``out_tsv``.

    Args:
        out_tsv: destination TSV (``name<TAB>size`` per line); parent dirs are created.
        url: location of the tab-separated chrom.sizes file.
        chroms: chromosome names to keep; also fixes the output order.

    Returns:
        ``{chrom: size}`` in the order of ``chroms`` (names absent upstream are skipped).

    Raises:
        requests.HTTPError: the server answered with an error status. Failing here
            keeps an HTML error page from being parsed as data and written out.
    """
    resp = requests.get(url, timeout=60)
    resp.raise_for_status()

    wanted = set(chroms)  # O(1) membership while scanning the full table
    found = {}
    for line in resp.text.splitlines():
        if not line.strip():
            continue
        c, n = line.split("\t")[:2]
        if c in wanted:
            found[c] = int(n)
    sizes = {c: found[c] for c in chroms if c in found}   # canonical order

    out_path = Path(out_tsv)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text("".join(f"{c}\t{n}\n" for c, n in sizes.items()))
    return sizes
def build_bin_grid(chrom_sizes: dict[str, int], out_parquet: str | Path | None = None,
                   bin_bp: int = BIN_BP) -> pd.DataFrame:
    """Build the fixed-width bin grid for the given chromosomes.

    Only whole bins are emitted: the trailing partial bin of each chromosome
    (``size % bin_bp`` bases) is dropped.

    Args:
        chrom_sizes: ``{chrom: length_bp}``; dict order is the row order.
        out_parquet: when given, the grid is also written there (parent dirs created).
        bin_bp: bin width in bp; must be positive.

    Returns:
        DataFrame with columns ``chrom, start, end, bin`` where ``bin`` is the
        per-chromosome index ``start // bin_bp``. An empty ``chrom_sizes``
        yields an empty frame with the same columns.

    Raises:
        ValueError: ``bin_bp`` is not positive.
    """
    if bin_bp <= 0:
        raise ValueError(f"bin_bp must be positive, got {bin_bp}")

    rows = []
    for c, n in chrom_sizes.items():
        nbins = n // bin_bp
        starts = range(0, nbins * bin_bp, bin_bp)
        rows.append(pd.DataFrame({"chrom": c, "start": starts}))

    if rows:
        grid = pd.concat(rows, ignore_index=True)
    else:
        # pd.concat refuses an empty list; return a typed, empty grid instead.
        grid = pd.DataFrame({"chrom": pd.Series(dtype="object"),
                             "start": pd.Series(dtype="int64")})
    grid["end"] = grid["start"] + bin_bp
    grid["bin"] = grid["start"] // bin_bp

    if out_parquet:
        Path(out_parquet).parent.mkdir(parents=True, exist_ok=True)
        grid.to_parquet(out_parquet, index=False)
    return grid
def load_chrom_sizes(tsv: str | Path) -> dict[str, int]:
    """Read a ``name<TAB>size`` file into ``{name: int(size)}``.

    Blank lines are ignored and columns past the second are discarded.
    """
    rows = (
        text.split("\t")[:2]
        for text in Path(tsv).read_text().splitlines()
        if text.strip()
    )
    return {chrom: int(size) for chrom, size in rows}
def main() -> None:
    """CLI entry: fetch hg38 chromosome sizes, build the bin grid, print a summary."""
    import argparse

    parser = argparse.ArgumentParser()
    for flag, default in (
        ("--sizes-out", "/data/raw/hg38.chrom.sizes"),
        ("--grid-out", "/data/features/bin_grid_1kb.parquet"),
    ):
        parser.add_argument(flag, default=default)
    opts = parser.parse_args()

    chrom_sizes = fetch_chrom_sizes(opts.sizes_out)
    bin_grid = build_bin_grid(chrom_sizes, opts.grid_out)
    print(f"chroms={len(chrom_sizes)} total_bins={len(bin_grid)} -> {opts.grid_out}")


if __name__ == "__main__":
    main()

pen_stack/data/ingest_chromatin.py ADDED Viewed

@@ -0,0 +1,119 @@
+"""Chromatin feature store (Phase 1, Step 1.1).
+Resolves the ENCODE bigWig panel for a cell type, downloads tracks IN PARALLEL, bins each to the
+canonical 1 kb grid (mean signal per bin) IN PARALLEL across cores, merges into one feature-store
+parquet, and deletes the raw bigWigs (500 GB discipline). Run in Docker on the VM.
+"""
+from __future__ import annotations
+import argparse
+import os
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import pyBigWig
+import requests
+from pen_stack.data.encode import resolve_panel
+from pen_stack.data.genome import MAIN_CHROMS, load_chrom_sizes
# Bin width in bp used when averaging bigWig signal; same value as
# pen_stack.data.genome.BIN_BP, which defines the grid - keep the two in sync.
BIN_BP = 1000
def download(href: str, dest: str) -> str:
    """Stream ``href`` to ``dest`` unless a non-empty file is already there.

    The body is written to ``<dest>.part`` and moved into place only after the
    transfer finished, so an interrupted download never leaves a truncated file
    that a later call would mistake for a complete one.

    Args:
        href: URL to fetch.
        dest: target path (str or path-like); parent directories are created.

    Returns:
        ``dest`` as a string.

    Raises:
        requests.HTTPError: the server answered with an error status.
    """
    dest = str(dest)
    if os.path.exists(dest) and os.path.getsize(dest) > 0:
        return dest
    Path(dest).parent.mkdir(parents=True, exist_ok=True)

    partial = f"{dest}.part"
    try:
        with requests.get(href, stream=True, timeout=900) as r:
            r.raise_for_status()
            with open(partial, "wb") as fh:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    fh.write(chunk)
        os.replace(partial, dest)
    finally:
        # After a successful move the partial file is gone; after a failure
        # this removes the incomplete leftover.
        if os.path.exists(partial):
            os.remove(partial)
    return dest
def bin_one(args) -> tuple[str, pd.DataFrame]:
    """Bin one bigWig to 1 kb mean per bin (module-level for ProcessPool picklability).

    Args:
        args: ``(name, path, sizes)`` - track/column name, bigWig path, and the
            ``{chrom: length_bp}`` mapping that defines how many bins each
            chromosome gets.

    Returns:
        ``(name, frame)`` where ``frame`` has columns ``chrom, bin, <name>``
        covering every chromosome of MAIN_CHROMS that is present in ``sizes``.
    """
    name, path, sizes = args
    bw = pyBigWig.open(path)
    try:
        bw_chroms = set(bw.chroms().keys())
        frames = []
        for c in MAIN_CHROMS:
            if c not in sizes:
                continue
            n = sizes[c] // BIN_BP
            # Accept files that name chromosomes without the "chr" prefix.
            key = c if c in bw_chroms else c.replace("chr", "")
            if key not in bw_chroms:
                # Chromosome missing from this track: fill its bins with zeros.
                frames.append(pd.DataFrame({"chrom": c, "bin": range(n), name: np.zeros(n, "float32")}))
                continue
            vals = bw.stats(key, 0, n * BIN_BP, nBins=n, type="mean")
            # Bins for which stats() yields None are stored as 0.0.
            v = np.array([0.0 if x is None else float(x) for x in vals], dtype="float32")
            frames.append(pd.DataFrame({"chrom": c, "bin": range(n), name: v}))
    finally:
        # Release the file handle even when a stats() call raises.
        bw.close()
    return name, pd.concat(frames, ignore_index=True)
def build_feature_store(biosample: str, chrom_sizes_tsv: str, raw_dir: str, out_parquet: str,
                        max_dl: int = 7, max_bin: int = 7) -> pd.DataFrame:
    """Build the per-bin chromatin feature store for one biosample.

    Steps: resolve the ENCODE panel, download the bigWigs with a thread pool,
    bin each track with a process pool, merge the tracks on ``(chrom, bin)``,
    delete the raw bigWigs, then write the parquet plus a ``*_manifest.csv``
    next to it listing track, accession and output_type.

    Args:
        biosample: ENCODE biosample name used for resolution and file naming.
        chrom_sizes_tsv: chrom.sizes TSV read with ``load_chrom_sizes``.
        raw_dir: directory receiving the temporary bigWig downloads.
        out_parquet: output parquet path; the manifest is ``<stem>_manifest.csv``
            in the same directory.
        max_dl: download worker threads.
        max_bin: binning worker processes.

    Returns:
        The merged feature frame (one column per track plus ``biosample``).

    Raises:
        SystemExit: no track of the panel could be resolved for ``biosample``.
    """
    sizes = load_chrom_sizes(chrom_sizes_tsv)
    panel = resolve_panel(biosample)
    print(f"[{biosample}] resolved tracks: {list(panel.keys())}", flush=True)
    if not panel:
        raise SystemExit(f"no ENCODE bigWig tracks resolved for {biosample}")
    # 1) parallel download
    paths = {}
    with ThreadPoolExecutor(max_workers=max_dl) as ex:
        futs = {ex.submit(download, rec["href"],
                          os.path.join(raw_dir, f"{biosample}_{name}_{rec['accession']}.bigWig")): name
                for name, rec in panel.items()}
        for fut, name in futs.items():
            paths[name] = fut.result()
            print(f"  downloaded {name}", flush=True)
    # 2) parallel bin
    binned = {}
    with ProcessPoolExecutor(max_workers=max_bin) as ex:
        for name, df in ex.map(bin_one, [(n, paths[n], sizes) for n in panel]):
            binned[name] = df
            print(f"  binned {name}", flush=True)
    base = None
    for name in panel:
        base = binned[name] if base is None else base.merge(binned[name], on=["chrom", "bin"])
    base["biosample"] = biosample
    # 3) clean raws (best effort: a file that cannot be removed is left behind)
    for p in paths.values():
        try:
            os.remove(p)
        except OSError:
            pass
    out_path = Path(out_parquet)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    base.to_parquet(out_parquet, index=False)
    # Derive the manifest name from the file stem. A plain str.replace on
    # ".parquet" would return the parquet path itself when the suffix differs,
    # and the manifest would then overwrite the feature store.
    manifest_path = out_path.with_name(f"{out_path.stem}_manifest.csv")
    pd.DataFrame([{"track": n, "accession": panel[n]["accession"],
                   "output_type": panel[n]["output_type"]} for n in panel]
                 ).to_csv(manifest_path, index=False)
    return base
def main() -> None:
    """CLI entry: build the chromatin feature store for ``--biosample`` and print a summary."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--biosample", required=True)
    parser.add_argument("--sizes", default="/data/raw/hg38.chrom.sizes")
    parser.add_argument("--raw-dir", default="/data/raw/encode")
    parser.add_argument("--out", default=None)
    opts = parser.parse_args()

    # Without --out the file name is derived from the lower-cased biosample.
    out_path = opts.out if opts.out else f"/data/features/chromatin_{opts.biosample.lower()}.parquet"
    store = build_feature_store(opts.biosample, opts.sizes, opts.raw_dir, out_path)

    key_columns = ("chrom", "bin", "biosample")
    tracks = [col for col in store.columns if col not in key_columns]
    print(f"feature store {out_path}: bins={len(store)} tracks={tracks}", flush=True)


if __name__ == "__main__":
    main()

pen_stack/data/ingest_integration.py ADDED Viewed

@@ -0,0 +1,112 @@
+"""Integration-propensity features (Phase 1, Step 1.2).
+Builds per-1 kb-bin retroviral integration density from VISDB integration tables (HIV, HTLV, MLV;
+coordinates already lifted to hg38 in VISDB). Integration propensity reflects accessible/active
+chromatin and is a feature for both the safety layer and "where insertions land".
+NOTE (honest scope): VISDB's MLV set is tiny (~32 sites); the large >3.7M MLV-in-K562/HepG2 sets
+referenced in the plan live in specific papers'/GEO supplements and are sourced separately. The
+GENOTOXIC labels (clonal-outcome CIS) come from the clinical gene list (Step 1.4) - this module
+supplies the integration-DENSITY feature, not the danger label.
+"""
+from __future__ import annotations
+import argparse
+import glob
+import os
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from pen_stack.data.genome import MAIN_CHROMS
# Bin width in bp used to map integration positions to bins; same value as
# pen_stack.data.genome.BIN_BP, which defines the grid - keep the two in sync.
BIN_BP = 1000
def load_visdb(csv_dir: str) -> pd.DataFrame:
    """Load every ``*.csv`` in ``csv_dir`` into one table of integration sites.

    The file stem becomes the ``virus`` label. A file is skipped unless it has
    both a "human chromosome" and an "hg38_start" column (matched
    case-insensitively after stripping). Rows without a numeric position and
    rows outside MAIN_CHROMS are dropped.

    Returns:
        DataFrame with columns ``chrom, pos, virus`` (``pos`` as int).
    """
    parts = []
    for csv_path in sorted(glob.glob(os.path.join(csv_dir, "*.csv"))):
        table = pd.read_csv(csv_path, dtype=str)
        by_lower = {col.lower().strip(): col for col in table.columns}
        chrom_col = by_lower.get("human chromosome")
        start_col = by_lower.get("hg38_start")
        if not (chrom_col and start_col):
            continue
        # Normalise chromosome names to the "chr"-prefixed form.
        chrom = table[chrom_col].astype(str).map(lambda c: c if c.startswith("chr") else f"chr{c}")
        pos = pd.to_numeric(table[start_col], errors="coerce")
        sites = pd.DataFrame({"chrom": chrom, "pos": pos, "virus": Path(csv_path).stem})
        parts.append(sites.dropna(subset=["pos"]))

    if parts:
        merged = pd.concat(parts, ignore_index=True)
    else:
        merged = pd.DataFrame(columns=["chrom", "pos", "virus"])
    merged = merged[merged["chrom"].isin(MAIN_CHROMS)].copy()
    merged["pos"] = merged["pos"].astype(int)
    return merged
def density_per_bin(integ: pd.DataFrame, bin_grid: str, out_parquet: str) -> pd.DataFrame:
    """Count integration sites per grid bin and write the result to parquet.

    Args:
        integ: sites with at least ``chrom`` and ``pos`` columns. It is not
            modified (the bin index is computed on a derived frame).
        bin_grid: parquet holding the grid; its ``chrom`` and ``bin`` columns are used.
        out_parquet: output path; parent directories are created.

    Returns:
        One row per grid bin with ``integ_density`` (int32 count, 0 where no
        site falls in the bin) and ``integ_log_density`` (float32 ``log1p`` of it).
    """
    grid = pd.read_parquet(bin_grid)[["chrom", "bin"]]
    # assign() leaves the caller's frame untouched, whereas writing a "bin"
    # column into ``integ`` would leak into the caller's data.
    binned = integ.assign(bin=integ["pos"] // BIN_BP)
    dens = binned.groupby(["chrom", "bin"]).size().rename("integ_density").reset_index()
    out = grid.merge(dens, on=["chrom", "bin"], how="left")
    out["integ_density"] = out["integ_density"].fillna(0).astype("int32")
    out["integ_log_density"] = np.log1p(out["integ_density"]).astype("float32")
    Path(out_parquet).parent.mkdir(parents=True, exist_ok=True)
    out.to_parquet(out_parquet, index=False)
    return out
def lafave_density(bed_gz: str, chain_file: str, bin_grid: str, out_parquet: str) -> pd.DataFrame:
    """Cell-type-specific MLV integration density from a LaFave et al. 2014 BED (hg19 -> hg38 lift).

    The LaFave K562/HepG2 MLV integration BEDs are on hg19; lift each site to hg38 with the UCSC
    chain, then bin to 1 kb. This is the plan's >3.7M MLV-in-K562/HepG2 supervision (Bushman/NHGRI).

    Args:
        bed_gz: gzip-compressed BED of integration sites.
        chain_file: liftover chain handed to ``pyliftover.LiftOver``.
        bin_grid: parquet with the bin grid (see ``density_per_bin``).
        out_parquet: output parquet path.

    Returns:
        The per-bin density frame with columns renamed to
        ``integ_mlv_density`` / ``integ_mlv_log_density``.
    """
    import gzip

    from pyliftover import LiftOver

    lo = LiftOver(chain_file)
    sites = []
    n_total = 0  # data lines read, whether or not they lift over
    with gzip.open(bed_gz, "rt") as fh:
        for line in fh:
            # Skip blank lines and BED header/comment lines, which carry no coordinates.
            if not line.strip() or line.startswith(("track", "browser", "#")):
                continue
            f = line.split("\t")
            chrom, start = f[0], int(f[1])
            n_total += 1
            conv = lo.convert_coordinate(chrom, start)
            if conv:
                # Use the first liftover hit; keep it only on the main chromosomes.
                nc, npos = conv[0][0], conv[0][1]
                if nc in MAIN_CHROMS:
                    sites.append((nc, npos))
    integ = pd.DataFrame(sites, columns=["chrom", "pos"])
    print(f"lifted {len(integ)} / {n_total} sites to hg38")
    out = density_per_bin(integ, bin_grid, out_parquet)
    out = out.rename(columns={"integ_density": "integ_mlv_density",
                              "integ_log_density": "integ_mlv_log_density"})
    # density_per_bin already wrote the file with generic column names;
    # overwrite it so the stored columns carry the MLV-specific names.
    out.to_parquet(out_parquet, index=False)
    return out
def main() -> None:
    """CLI entry: build the integration-density parquet in ``visdb`` or ``lafave`` mode.

    ``visdb`` mode reads every CSV under ``--visdb-dir``; ``lafave`` mode lifts
    the BED given by ``--lafave-bed`` with ``--chain``. Both print a short summary.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--mode", choices=["visdb", "lafave"], default="visdb")
    ap.add_argument("--visdb-dir", default="/data/external/visdb")
    ap.add_argument("--lafave-bed", default=None)
    ap.add_argument("--chain", default="/data/external/hg19ToHg38.over.chain.gz")
    ap.add_argument("--bin-grid", default="/data/features/bin_grid_1kb.parquet")
    ap.add_argument("--out", default="/data/features/integration_density.parquet")
    a = ap.parse_args()
    if a.mode == "lafave":
        if not a.lafave_bed:
            # Exit with a usage message instead of failing later inside gzip.open(None).
            ap.error("--lafave-bed is required with --mode lafave")
        out = lafave_density(a.lafave_bed, a.chain, a.bin_grid, a.out)
        nz = int((out["integ_mlv_density"] > 0).sum())
        print(f"MLV density: bins={len(out)} nonzero={nz} max={int(out['integ_mlv_density'].max())} -> {a.out}")
        return
    integ = load_visdb(a.visdb_dir)
    print(f"loaded {len(integ)} integration sites; by virus: {integ['virus'].value_counts().to_dict()}")
    out = density_per_bin(integ, a.bin_grid, a.out)
    nz = int((out["integ_density"] > 0).sum())
    print(f"integration density: bins={len(out)} nonzero={nz} max={int(out['integ_density'].max())}")


if __name__ == "__main__":
    main()

pen_stack/data/ingest_safety_annot.py ADDED Viewed

@@ -0,0 +1,164 @@
+"""Safety annotations per 1 kb bin (Phase 1, Step 1.4).
+Builds per-bin safety features from COSMIC Cancer Gene Census (oncogene/TSG loci),
+DepMap CRISPRGeneEffect (essential genes), and GENCODE (gene/TSS distances):
+  - dist_oncogene, dist_tsg, dist_essential, dist_tss  (bp to nearest, via bedtools closest)
+  - genotoxic_cis flag (bins within CIS_WINDOW = 50 kb of LMO2/MECOM/EVI1/CCND2/PRDM16/HMGA2)
+Inputs are staged on the VM under /data/external (COSMIC tsv, DepMap csv); GENCODE is downloaded.
+Runs CPU-only in the penstack:phase1 image (bedtools in-image). Output keyed on (chrom, bin).
+"""
+from __future__ import annotations
+import argparse
+import gzip
+import os
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import pybedtools
+import requests
+from pen_stack.data.genome import MAIN_CHROMS
# GENCODE release 46 "basic" annotation GTF (gzip); the default source for download_gencode().
GENCODE_GTF = ("https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/"
               "release_46/gencode.v46.basic.annotation.gtf.gz")
# Gene symbols whose loci, widened by CIS_WINDOW on each side, set the genotoxic_cis flag.
# NOTE(review): EVI1 is listed next to MECOM - presumably an alias symbol; confirm.
GENOTOXIC = ["LMO2", "MECOM", "EVI1", "CCND2", "PRDM16", "HMGA2"]
# Bin width in bp; same value as pen_stack.data.genome.BIN_BP - keep the two in sync.
BIN_BP = 1000
CIS_WINDOW = 50000  # bp window around a genotoxic gene to flag
+def _chr(c: str) -> str:
+    c = str(c)
+    return c if c.startswith("chr") else f"chr{c}"
def load_cosmic(tsv: str) -> pd.DataFrame:
    """Load Cancer Gene Census loci from a tab-separated COSMIC export.

    Requires the columns CHROMOSOME, GENOME_START, GENOME_STOP and GENE_SYMBOL.
    ROLE_IN_CANCER is optional: when it is missing every gene gets an empty role.
    Rows without usable coordinates or outside MAIN_CHROMS are dropped.

    Returns:
        DataFrame with columns ``chrom, start, end, GENE_SYMBOL, role``
        (``start``/``end`` as int, ``chrom`` with a ``chr`` prefix).
    """
    df = pd.read_csv(tsv, sep="\t", dtype=str)
    # .copy() after each filter so the column assignments below act on an
    # independent frame rather than on a slice of the previous one.
    df = df.dropna(subset=["CHROMOSOME", "GENOME_START", "GENOME_STOP"]).copy()
    df["chrom"] = df["CHROMOSOME"].map(_chr)
    df["start"] = pd.to_numeric(df["GENOME_START"], errors="coerce")
    df["end"] = pd.to_numeric(df["GENOME_STOP"], errors="coerce")
    df = df.dropna(subset=["start", "end"]).copy()
    # df.get(..., "") would hand back a plain str (no .fillna) when the column
    # is absent, so test for the column explicitly.
    if "ROLE_IN_CANCER" in df.columns:
        df["role"] = df["ROLE_IN_CANCER"].fillna("")
    else:
        df["role"] = ""
    df = df[df["chrom"].isin(MAIN_CHROMS)].copy()
    df["start"] = df["start"].astype(int)
    df["end"] = df["end"].astype(int)
    return df[["chrom", "start", "end", "GENE_SYMBOL", "role"]]
def load_depmap_essential(csv: str, thresh: float = -0.5) -> set[str]:
    """Common-essential genes: mean Chronos effect across cell lines < thresh.

    The first CSV column is the row index and each remaining column is one
    gene. The returned symbol is the column label up to the first `` (``.
    """
    effects = pd.read_csv(csv, index_col=0)
    gene_means = effects.mean(axis=0)
    below = gene_means[gene_means < thresh]
    return {label.split(" (")[0] for label in below.index}
def download_gencode(dest: str, url: str = GENCODE_GTF) -> str:
    """Fetch the GENCODE GTF to ``dest`` unless a non-empty file is already there.

    The body is streamed to ``<dest>.part`` and moved into place only once the
    transfer completed, so an interrupted download cannot leave a truncated
    file that later passes the "exists and non-empty" check.

    Args:
        dest: target path; parent directories are created.
        url: source URL of the gzip-compressed GTF.

    Returns:
        ``dest``.

    Raises:
        requests.HTTPError: the server answered with an error status.
    """
    if os.path.exists(dest) and os.path.getsize(dest) > 0:
        return dest
    Path(dest).parent.mkdir(parents=True, exist_ok=True)

    partial = f"{dest}.part"
    try:
        with requests.get(url, stream=True, timeout=600) as r:
            r.raise_for_status()
            with open(partial, "wb") as fh:
                for ch in r.iter_content(1 << 20):
                    fh.write(ch)
        os.replace(partial, dest)
    finally:
        # Nothing to remove after a successful move; cleans up after a failure.
        if os.path.exists(partial):
            os.remove(partial)
    return dest
def parse_gencode_genes(gtf_gz: str) -> pd.DataFrame:
    """Extract the ``gene`` records on MAIN_CHROMS from a gzip-compressed GTF.

    Returns:
        DataFrame with columns ``chrom, start, end, strand, gene_name, tss``.
        Coordinates are taken verbatim from GTF columns 4 and 5; ``tss`` is
        ``start`` on the ``+`` strand and ``end`` otherwise. ``gene_name`` is
        "" when the attribute column has no ``gene_name`` entry.
    """
    columns = ["chrom", "start", "end", "strand", "gene_name", "tss"]
    records = []
    with gzip.open(gtf_gz, "rt") as fh:
        for raw in fh:
            if raw.startswith("#"):
                continue
            fields = raw.rstrip("\n").split("\t")
            if fields[2] != "gene" or fields[0] not in MAIN_CHROMS:
                continue
            chrom, strand = fields[0], fields[6]
            start, end = int(fields[3]), int(fields[4])
            # First attribute starting with gene_name; its value is the text
            # between the first pair of double quotes.
            stripped = (attr.strip() for attr in fields[8].split(";"))
            gene_name = next(
                (attr.split('"')[1] for attr in stripped if attr.startswith("gene_name")),
                "",
            )
            tss = end if strand != "+" else start
            records.append((chrom, start, end, strand, gene_name, tss))
    return pd.DataFrame(records, columns=columns)
def _bed(df: pd.DataFrame, cols=("chrom", "start", "end")) -> pybedtools.BedTool:
    """Turn three columns of ``df`` into a BedTool, rows ordered by (chrom, start).

    ``cols`` names the source columns for chromosome, start and end, in that order.
    """
    intervals = df.loc[:, list(cols)].copy()
    intervals.columns = ["chrom", "start", "end"]
    ordered = intervals.sort_values(["chrom", "start"])
    return pybedtools.BedTool.from_dataframe(ordered)
def nearest_dist(bins_bed: pybedtools.BedTool, feat_df: pd.DataFrame, name: str) -> pd.DataFrame:
    """Distance (bp) from every bin to its nearest feature, via ``bedtools closest -d``.

    Args:
        bins_bed: sorted BedTool of the grid bins.
        feat_df: features with ``chrom, start, end`` columns.
        name: name of the distance column in the result.

    Returns:
        DataFrame ``chrom, start, <name>`` with one row per bin start. The
        distance is NaN where bedtools reports no feature on the chromosome.
        With an empty ``feat_df`` the result is an empty frame whose columns
        carry the same dtypes as the regular result (int64 ``start``, float64
        distance), so a left-merge onto the grid yields a numeric all-NaN column.
    """
    if feat_df.empty:
        return pd.DataFrame({
            "chrom": pd.Series(dtype="object"),
            "start": pd.Series(dtype="int64"),
            name: pd.Series(dtype="float64"),
        })
    fb = _bed(feat_df).sort()
    closest = bins_bed.closest(fb, d=True)
    # Keep the bin's chrom/start and the last field, which holds the distance.
    out = closest.to_dataframe(header=None, usecols=[0, 1, closest.field_count() - 1],
                               names=["chrom", "start", name])
    # bedtools closest -d returns -1 when there is NO feature on that chromosome; that means
    # "no nearby feature" (effectively infinite distance), NOT distance 0. Map the sentinel to NaN.
    out[name] = out[name].where(out[name] >= 0, other=np.nan)
    # Ties produce several rows per bin; collapse them to the minimum distance.
    return out.groupby(["chrom", "start"], as_index=False)[name].min()
def build(bin_grid: str, cosmic_tsv: str, depmap_csv: str, gencode_dest: str,
          sizes_tsv: str, out_parquet: str) -> pd.DataFrame:
    """Assemble the per-bin safety annotation table and write it to parquet.

    Adds to the grid (``chrom, start, bin``) the columns ``dist_oncogene``,
    ``dist_tsg``, ``dist_essential``, ``dist_tss``, ``dist_gtox`` and the
    boolean ``genotoxic_cis``.

    Args:
        bin_grid: parquet with the bin grid.
        cosmic_tsv: COSMIC Cancer Gene Census TSV.
        depmap_csv: DepMap gene-effect CSV.
        gencode_dest: local path of the GENCODE GTF (downloaded when missing).
        sizes_tsv: accepted for interface compatibility; not read by this function.
        out_parquet: output path; parent directories are created.
    """
    grid = pd.read_parquet(bin_grid)[["chrom", "start", "bin"]]
    bins_bed = _bed(grid.assign(end=grid["start"] + BIN_BP)).sort()

    cosmic = load_cosmic(cosmic_tsv)
    role = cosmic["role"].str
    oncogenes = cosmic[role.contains("oncogene", case=False, na=False)]
    suppressors = cosmic[role.contains("TSG", case=False, na=False)]

    genes = parse_gencode_genes(download_gencode(gencode_dest))
    essential = genes[genes["gene_name"].isin(load_depmap_essential(depmap_csv))]
    # Each TSS becomes a 1 bp interval starting at the tss coordinate.
    tss_points = genes.assign(end=genes["tss"] + 1, start=genes["tss"])

    feature_sets = {
        "dist_oncogene": oncogenes,
        "dist_tsg": suppressors,
        "dist_essential": essential,
        "dist_tss": tss_points,
    }
    out = grid.copy()
    for column, features in feature_sets.items():
        distances = nearest_dist(bins_bed, features, column)
        out = out.merge(distances, on=["chrom", "start"], how="left")

    # genotoxic CIS flag: widen each listed gene by CIS_WINDOW on both sides and
    # flag the bins at distance 0 from the widened interval.
    widened = genes[genes["gene_name"].isin(GENOTOXIC)].copy()
    widened["start"] = (widened["start"] - CIS_WINDOW).clip(lower=0)
    widened["end"] = widened["end"] + CIS_WINDOW
    out = out.merge(nearest_dist(bins_bed, widened, "dist_gtox"),
                    on=["chrom", "start"], how="left")
    out["genotoxic_cis"] = out["dist_gtox"].fillna(1e9) == 0

    Path(out_parquet).parent.mkdir(parents=True, exist_ok=True)
    out.to_parquet(out_parquet, index=False)
    return out
def main() -> None:
    """CLI entry: build the safety annotation parquet and print two summary lines."""
    parser = argparse.ArgumentParser()
    defaults = {
        "--bin-grid": "/data/features/bin_grid_1kb.parquet",
        "--cosmic": "/data/external/Cosmic_CancerGeneCensus_v104_GRCh38.tsv",
        "--depmap": "/data/external/CRISPRGeneEffect.csv",
        "--gencode": "/data/raw/gencode.v46.basic.gtf.gz",
        "--sizes": "/data/raw/hg38.chrom.sizes",
        "--out": "/data/features/safety_annot.parquet",
    }
    for flag, default in defaults.items():
        parser.add_argument(flag, default=default)
    opts = parser.parse_args()

    annot = build(opts.bin_grid, opts.cosmic, opts.depmap, opts.gencode, opts.sizes, opts.out)
    # A bin lies inside an oncogene when its distance to the nearest one is 0.
    in_oncogene = (annot["dist_oncogene"] == 0).sum()
    reported = [col for col in annot.columns if col.startswith("dist") or col == "genotoxic_cis"]
    print(f"safety_annot bins={len(annot)} cols={reported}")
    n_cis = int(annot["genotoxic_cis"].sum())
    print(f"bins in an oncogene={in_oncogene} genotoxic_cis bins={n_cis}")


if __name__ == "__main__":
    main()