pen-stack 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. pen_stack/__init__.py +2 -0
  2. pen_stack/_resources.py +34 -0
  3. pen_stack/adapt/__init__.py +14 -0
  4. pen_stack/adapt/finetune.py +33 -0
  5. pen_stack/adapt/ingest.py +86 -0
  6. pen_stack/adapt/pipeline.py +101 -0
  7. pen_stack/adapt/recalibrate.py +58 -0
  8. pen_stack/adapt/report.py +130 -0
  9. pen_stack/agent/__init__.py +1 -0
  10. pen_stack/agent/guardrails.py +49 -0
  11. pen_stack/agent/mcp_server.py +42 -0
  12. pen_stack/agent/orchestrator.py +106 -0
  13. pen_stack/agent/pen_agent.py +169 -0
  14. pen_stack/agent/tools.py +130 -0
  15. pen_stack/atlas/__init__.py +1 -0
  16. pen_stack/atlas/build_wtkb.py +80 -0
  17. pen_stack/atlas/crosslink.py +144 -0
  18. pen_stack/atlas/expand.py +190 -0
  19. pen_stack/atlas/schema.py +59 -0
  20. pen_stack/atlas/scorecard.py +134 -0
  21. pen_stack/atlas/universe.py +75 -0
  22. pen_stack/atlas/variant_propose.py +155 -0
  23. pen_stack/bridge/__init__.py +1 -0
  24. pen_stack/bridge/activity.py +52 -0
  25. pen_stack/bridge/cli.py +65 -0
  26. pen_stack/bridge/fold_qc.py +53 -0
  27. pen_stack/bridge/guide_qc.py +84 -0
  28. pen_stack/bridge/ingest.py +139 -0
  29. pen_stack/bridge/offtarget.py +133 -0
  30. pen_stack/bridge/ortholog_screen.py +73 -0
  31. pen_stack/bridge/pipeline.py +83 -0
  32. pen_stack/cli.py +126 -0
  33. pen_stack/data/__init__.py +1 -0
  34. pen_stack/data/encode.py +84 -0
  35. pen_stack/data/genome.py +71 -0
  36. pen_stack/data/ingest_chromatin.py +119 -0
  37. pen_stack/data/ingest_integration.py +112 -0
  38. pen_stack/data/ingest_safety_annot.py +164 -0
  39. pen_stack/data/ingest_trip.py +76 -0
  40. pen_stack/mech/__init__.py +1 -0
  41. pen_stack/mech/classify_atlas.py +71 -0
  42. pen_stack/mech/whitelist.py +66 -0
  43. pen_stack/monitor/__init__.py +1 -0
  44. pen_stack/monitor/europepmc.py +32 -0
  45. pen_stack/monitor/run.py +57 -0
  46. pen_stack/monitor/triage.py +63 -0
  47. pen_stack/planner/__init__.py +1 -0
  48. pen_stack/planner/cargo.py +56 -0
  49. pen_stack/planner/cargo_polish.py +146 -0
  50. pen_stack/planner/delivery.py +32 -0
  51. pen_stack/planner/multiplex.py +110 -0
  52. pen_stack/planner/optimize.py +156 -0
  53. pen_stack/planner/pipeline.py +86 -0
  54. pen_stack/planner/report.py +26 -0
  55. pen_stack/rag/__init__.py +1 -0
  56. pen_stack/rag/index.py +53 -0
  57. pen_stack/rag/llm.py +178 -0
  58. pen_stack/rag/qa.py +105 -0
  59. pen_stack/score/__init__.py +1 -0
  60. pen_stack/score/recalibrate.py +77 -0
  61. pen_stack/score/therapeutic.py +85 -0
  62. pen_stack/server/__init__.py +1 -0
  63. pen_stack/server/api.py +142 -0
  64. pen_stack/ui/__init__.py +1 -0
  65. pen_stack/ui/app.py +518 -0
  66. pen_stack/validate/__init__.py +1 -0
  67. pen_stack/validate/adapt_demo.py +69 -0
  68. pen_stack/validate/agent_eval.py +117 -0
  69. pen_stack/validate/blind_gsh_discovery.py +165 -0
  70. pen_stack/validate/cargo_directionality.py +57 -0
  71. pen_stack/validate/durability_baselines.py +150 -0
  72. pen_stack/validate/forward_hypotheses.py +104 -0
  73. pen_stack/validate/guide_qc_demo.py +58 -0
  74. pen_stack/validate/intent_specification.py +82 -0
  75. pen_stack/validate/paper3_benchmark.py +165 -0
  76. pen_stack/validate/paper4_real_validation.py +144 -0
  77. pen_stack/validate/paper4_validation.py +82 -0
  78. pen_stack/validate/seq_vs_measured.py +134 -0
  79. pen_stack/validate/within_locus_ranking.py +74 -0
  80. pen_stack/validate/writer_recovery.py +86 -0
  81. pen_stack/wgenome/__init__.py +1 -0
  82. pen_stack/wgenome/chromatin_seq.py +83 -0
  83. pen_stack/wgenome/durability.py +108 -0
  84. pen_stack/wgenome/export_tracks.py +52 -0
  85. pen_stack/wgenome/features.py +82 -0
  86. pen_stack/wgenome/gsh_baseline.py +117 -0
  87. pen_stack/wgenome/providers.py +245 -0
  88. pen_stack/wgenome/safety.py +69 -0
  89. pen_stack/wgenome/structure3d.py +168 -0
  90. pen_stack/wgenome/writability.py +72 -0
  91. pen_stack-3.1.0.dist-info/METADATA +451 -0
  92. pen_stack-3.1.0.dist-info/RECORD +96 -0
  93. pen_stack-3.1.0.dist-info/WHEEL +5 -0
  94. pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
  95. pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
  96. pen_stack-3.1.0.dist-info/top_level.txt +1 -0
pen_stack/cli.py ADDED
@@ -0,0 +1,126 @@
1
+ """PEN-STACK unified CLI (subcommands wired per-phase: atlas, score, writable, crosslink, monitor).
2
+
3
+ One entry point - ``pen-stack`` - over the whole stack. Heavy data (the Phase-1 writability atlas) is
4
+ loaded lazily and degrades gracefully when absent, so ``info`` / ``atlas`` work from a clean install.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import click
9
+
10
+ from pen_stack import __version__
11
+
12
+
13
# Root click group for the ``pen-stack`` entry point; every subcommand in this module
# attaches to it through ``@main.command()``.
@click.group()
@click.version_option(__version__, prog_name="pen-stack")
def main():
    """PEN-STACK - open infrastructure for genome writing."""
17
+
18
+
19
@main.command()
def info():
    """Show stack status and module map."""
    # Version banner first, then one static line per pillar / engine / service group.
    status_lines = (
        f"PEN-STACK v{__version__}",
        "Pillar B (flagship): wgenome - Writable Genome (safety x durability x reachability)",
        "Pillar A (companion): atlas, mech, score - Writer Atlas + WT-KB",
        "Engine: planner - Write Planner (inverse design)",
        "Beachhead: bridge - bridge-recombinase off-target engine",
        "Services: monitor, rag, agent, ui, server",
    )
    for text in status_lines:
        click.echo(text)
28
+
29
+
30
@main.command()
@click.option("--family", default=None, help="Filter to one writer family.")
@click.option("--coverage", is_flag=True, help="Show per-family coverage + confidence breakdown.")
@click.option("--limit", default=10, help="Max rows to print.")
def atlas(family, coverage, limit):
    """Query the Writer Atlas."""
    # Heavy imports stay local so ``pen-stack info`` / ``--help`` remain cheap.
    import pandas as pd

    from pen_stack.atlas.crosslink import _ATLAS

    # Same policy as writable/crosslink/plan: a missing data file is reported as a clean
    # CLI error rather than an uncaught traceback.
    try:
        df = pd.read_parquet(_ATLAS)
    except FileNotFoundError as e:
        raise click.ClickException(f"Writer Atlas not available: {e}") from e

    if coverage:
        # One row per family: system count, number of "measured" entries, first tier seen.
        cov = (df.groupby("family")
               .agg(n=("representative_system", "size"),
                    measured=("confidence", lambda s: (s == "measured").sum()),
                    tier=("reachability_tier", "first"))
               .reset_index())
        click.echo(cov.to_string(index=False))
        click.echo(f"\nTOTAL systems: {len(df):,} across {df['family'].nunique()} families")
        return

    if family:
        df = df[df["family"] == family]
    # Print only the summary columns that this atlas build actually contains.
    cols = [c for c in ["representative_system", "family", "confidence", "deliv_class",
                        "readiness", "reachability_tier"] if c in df.columns]
    click.echo(df[cols].head(limit).to_string(index=False))
54
+
55
+
56
@main.command()
@click.option("--gene", required=True, help="Target gene symbol.")
@click.option("--ct", default="k562", help="Cell type (k562/hepg2/hspc).")
@click.option("--top", default=10, help="Top writable bins to show.")
def writable(gene, ct, top):
    """Rank writable loci overlapping a gene."""
    from pen_stack.atlas.crosslink import loci_for_gene

    # A missing atlas file surfaces as a CLI error message instead of a traceback.
    try:
        hits = loci_for_gene(gene, ct)
    except FileNotFoundError as e:
        raise click.ClickException(f"Phase-1 writability atlas not available: {e}") from e

    if not hits.empty:
        shown = hits[["chrom", "bin", "safety", "p_durable", "writability"]].head(top)
        click.echo(shown.to_string(index=False))
        return
    click.echo(f"No writable bins found for {gene} in {ct}.")
71
+
72
+
73
@main.command()
@click.option("--family", help="Writer family -> ranked reachable loci.")
@click.option("--chrom", help="Locus chrom (with --bin) -> reachable writers.")
@click.option("--bin", "bin_idx", type=int, help="Locus 1kb bin index.")
@click.option("--ct", default="k562")
@click.option("--top", default=10)
def crosslink(family, chrom, bin_idx, ct, top):
    """Writer<->locus cross-link queries."""
    from pen_stack.atlas import crosslink as cl

    # Two query directions: by writer family, or by locus (chrom + bin). --family wins if both given.
    locus_given = bool(chrom) and bin_idx is not None
    if not family and not locus_given:
        raise click.UsageError("provide --family OR (--chrom and --bin)")

    try:
        if family:
            table = cl.loci_for_writer(family, ct, top=top)
        else:
            table = cl.writers_for_locus(chrom, bin_idx, ct).head(top)
        click.echo(table.to_string(index=False))
    except FileNotFoundError as e:
        raise click.ClickException(f"Phase-1 writability atlas not available: {e}") from e
91
+
92
+
93
@main.command()
@click.option("--gene", required=True, help="Target gene symbol.")
@click.option("--intent", required=True,
              type=click.Choice(["safe_harbour_insertion", "knock_in_with_disruption",
                                 "high_durability_insertion", "regulatory_excision", "repeat_excision"]))
@click.option("--cargo-bp", default=2000, help="Payload size (bp).")
@click.option("--ct", default="k562", help="Cell type (k562/hepg2/hspc).")
@click.option("--k", default=3, help="Number of ranked plans.")
def plan(gene, intent, cargo_bp, ct, k):
    """Write Planner: goal + edit_intent -> ranked, traceable plans."""
    from pen_stack.planner.optimize import EditIntent
    from pen_stack.planner.pipeline import plan_write
    from pen_stack.planner.report import render_plans

    # The CLI choice string is converted to the planner's EditIntent before planning.
    edit_intent = EditIntent(intent)
    try:
        ranked = plan_write(gene, edit_intent, cargo_bp, ct, k=k)
    except FileNotFoundError as e:
        raise click.ClickException(f"Phase-1 writability atlas not available: {e}") from e
    report = render_plans(ranked)
    click.echo(report)
111
+
112
+
113
@main.command()
@click.option("--since", default="2026-01-01", help="Earliest publication date (YYYY-MM-DD).")
@click.option("--back-test", is_flag=True, help="Run the ISPpu10 back-test window.")
def monitor(since, back_test):
    """Run PEN-MONITOR (Europe PMC living-database scan -> curation queue)."""
    from pen_stack.monitor.run import run_monitor

    summary = run_monitor(since=since, back_test=back_test)
    n_hits, n_candidates, queue = summary["n_hits"], summary["n_candidates"], summary["queue"]
    click.echo(f"PEN-MONITOR: {n_hits} hits, {n_candidates} candidates -> {queue}")

    # The back-test verdict is printed only when the run reported one.
    found = summary.get("isppu10_found")
    if found is None:
        return
    verdict = "FOUND" if found else "not found"
    click.echo(f"ISPpu10 back-test: {verdict}")
123
+
124
+
125
+ if __name__ == "__main__":
126
+ main()
@@ -0,0 +1 @@
1
+ """pen_stack.data - see PEN-STACK v3.0 program doc."""
@@ -0,0 +1,84 @@
1
+ """ENCODE REST resolver (Phase 1, Step 1.1).
2
+
3
+ Resolves released hg38 bigWig SIGNAL files for a (biosample, assay/target) pair via the ENCODE
4
+ Portal REST API - so we never hard-code possibly-wrong file accessions. Returns accession + href.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import requests
9
+
10
+ ENCODE = "https://www.encodeproject.org"
11
+ HEADERS = {"accept": "application/json"}
12
+
13
+ # preferred processed signal output per assay (fold-change over control where available)
14
+ _PREF_OUTPUT = [
15
+ "fold change over control",
16
+ "signal p-value",
17
+ "read-depth normalized signal",
18
+ "signal",
19
+ ]
20
+
21
+
22
def _search(params: dict) -> list[dict]:
    """Run one ENCODE ``/search/`` query and return its ``@graph`` list (``[]`` when absent).

    A 404 response is treated as "no results"; any other HTTP error status raises.
    """
    resp = requests.get(ENCODE + "/search/", params=params, headers=HEADERS, timeout=60)
    if resp.status_code != 404:
        resp.raise_for_status()
        return resp.json().get("@graph", [])
    # ENCODE returns 404 for zero-result searches with some param combos
    return []
28
+
29
+
30
def find_bigwig(biosample: str, assay_title: str, target: str | None = None,
                assembly: str = "GRCh38") -> dict | None:
    """Pick one released bigWig signal file for a biosample + assay (+ optional histone target).

    biosample e.g. 'K562'; assay_title e.g. 'Histone ChIP-seq' / 'ATAC-seq' / 'DNase-seq';
    target e.g. 'H3K27ac' (None for ATAC/DNase).

    Returns a record with accession, absolute href, output_type and the query fields,
    or None when the search yields nothing.
    """
    query = {
        "type": "File",
        "file_format": "bigWig",
        "output_type": _PREF_OUTPUT,
        "assembly": assembly,
        "status": "released",
        "biosample_ontology.term_name": biosample,
        "assay_title": assay_title,
        "format": "json",
        "limit": "50",
    }
    if target:
        query["target.label"] = target

    hits = _search(query)
    if not hits:
        return None

    def _priority(rec):
        # Position in _PREF_OUTPUT; unknown output types sort after every known one.
        kind = rec.get("output_type", "")
        try:
            return _PREF_OUTPUT.index(kind)
        except ValueError:
            return len(_PREF_OUTPUT)

    # Ties keep the search order: the first hit with the best priority wins.
    best = min(hits, key=_priority)
    return {
        "accession": best["accession"],
        "href": ENCODE + best["href"],
        "output_type": best.get("output_type"),
        "assembly": assembly,
        "biosample": biosample,
        "assay": assay_title,
        "target": target,
    }
61
+
62
+
63
+ # default track panel per the prereg (durability features)
64
+ DEFAULT_PANEL = [
65
+ ("ATAC-seq", None),
66
+ ("DNase-seq", None),
67
+ ("Histone ChIP-seq", "H3K27ac"),
68
+ ("Histone ChIP-seq", "H3K4me1"),
69
+ ("Histone ChIP-seq", "H3K4me3"),
70
+ ("Histone ChIP-seq", "H3K9me3"),
71
+ ("Histone ChIP-seq", "H3K27me3"),
72
+ ]
73
+
74
+
75
def resolve_panel(biosample: str, panel=DEFAULT_PANEL, assembly: str = "GRCh38") -> dict[str, dict]:
    """Resolve every (assay, target) pair of the panel to a bigWig record.

    The result maps track name -> file record. Pairs with no released bigWig are left out,
    so a partial panel (e.g. a cell type lacking some histone marks) is returned as-is.
    """
    tracks = {}
    for assay, target in panel:
        record = find_bigwig(biosample, assay, target, assembly=assembly)
        # Track name: the histone mark if given, else the assay prefix (H3K27ac / atac / dnase).
        label = target if target else assay.split("-")[0].lower()
        if not record:
            continue
        tracks[label] = record
    return tracks
@@ -0,0 +1,71 @@
1
+ """hg38 genome scaffolding (Phase 1, Step 1.1 foundation).
2
+
3
+ Fetches hg38 chromosome sizes and builds the canonical 1 kb bin grid (autosomes + X) that every
4
+ feature store is keyed on. Pure-CPU, small; runs in any container.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from pathlib import Path
9
+
10
+ import pandas as pd
11
+ import requests
12
+
13
+ UCSC_CHROM_SIZES = "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes"
14
+ MAIN_CHROMS = [f"chr{i}" for i in range(1, 23)] + ["chrX"]
15
+ BIN_BP = 1000
16
+
17
+
18
def fetch_chrom_sizes(out_tsv: str | Path, url: str = UCSC_CHROM_SIZES,
                      chroms: list[str] = MAIN_CHROMS) -> dict[str, int]:
    """Download a chrom.sizes table, keep only ``chroms``, write it to ``out_tsv`` and return it.

    The returned dict (and the written TSV) follow the order of ``chroms``, not the order of
    the downloaded file. Chromosomes missing from the download are simply absent.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an error page is
            never parsed and persisted as an (empty) sizes file.
    """
    resp = requests.get(url, timeout=60)
    resp.raise_for_status()

    wanted = set(chroms)
    found = {}
    for line in resp.text.splitlines():
        fields = line.split("\t")
        # Skip blank or malformed lines as well as chromosomes outside the requested set.
        if len(fields) < 2 or fields[0] not in wanted:
            continue
        found[fields[0]] = int(fields[1])

    sizes = {c: found[c] for c in chroms if c in found}  # canonical order
    dest = Path(out_tsv)
    dest.parent.mkdir(parents=True, exist_ok=True)
    dest.write_text("".join(f"{c}\t{n}\n" for c, n in sizes.items()))
    return sizes
32
+
33
+
34
def build_bin_grid(chrom_sizes: dict[str, int], out_parquet: str | Path | None = None,
                   bin_bp: int = BIN_BP) -> pd.DataFrame:
    """Build the fixed-width bin grid (columns: chrom, start, end, bin) for the given chromosomes.

    Only whole bins are emitted: a trailing remainder shorter than ``bin_bp`` is dropped.
    When ``out_parquet`` is given the grid is also written there (parent dirs are created).
    """
    per_chrom = [
        pd.DataFrame({"chrom": name, "start": range(0, (length // bin_bp) * bin_bp, bin_bp)})
        for name, length in chrom_sizes.items()
    ]
    grid = pd.concat(per_chrom, ignore_index=True)
    grid = grid.assign(end=grid["start"] + bin_bp, bin=grid["start"] // bin_bp)

    if out_parquet:
        target = Path(out_parquet)
        target.parent.mkdir(parents=True, exist_ok=True)
        grid.to_parquet(out_parquet, index=False)
    return grid
48
+
49
+
50
def load_chrom_sizes(tsv: str | Path) -> dict[str, int]:
    """Read a tab-separated ``chrom<TAB>size`` file into an ordered {chrom: size} dict.

    Blank lines are ignored; only the first two columns of each line are used.
    """
    lines = Path(tsv).read_text().splitlines()
    pairs = (ln.split("\t")[:2] for ln in lines if ln.strip())
    return {chrom: int(size) for chrom, size in pairs}
57
+
58
+
59
def main() -> None:
    """CLI entry: fetch the chrom sizes, build the bin grid, print a one-line summary."""
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--sizes-out", default="/data/raw/hg38.chrom.sizes")
    parser.add_argument("--grid-out", default="/data/features/bin_grid_1kb.parquet")
    opts = parser.parse_args()

    chrom_sizes = fetch_chrom_sizes(opts.sizes_out)
    n_bins = len(build_bin_grid(chrom_sizes, opts.grid_out))
    print(f"chroms={len(chrom_sizes)} total_bins={n_bins} -> {opts.grid_out}")
68
+
69
+
70
+ if __name__ == "__main__":
71
+ main()
@@ -0,0 +1,119 @@
1
+ """Chromatin feature store (Phase 1, Step 1.1).
2
+
3
+ Resolves the ENCODE bigWig panel for a cell type, downloads tracks IN PARALLEL, bins each to the
4
+ canonical 1 kb grid (mean signal per bin) IN PARALLEL across cores, merges into one feature-store
5
+ parquet, and deletes the raw bigWigs (500 GB discipline). Run in Docker on the VM.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import argparse
10
+ import os
11
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
12
+ from pathlib import Path
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import pyBigWig
17
+ import requests
18
+
19
+ from pen_stack.data.encode import resolve_panel
20
+ from pen_stack.data.genome import MAIN_CHROMS, load_chrom_sizes
21
+
22
+ BIN_BP = 1000
23
+
24
+
25
def download(href: str, dest: str) -> str:
    """Stream ``href`` to ``dest`` and return ``dest``; a non-empty existing file is reused.

    The payload is written to ``<dest>.part`` and renamed into place only after the transfer
    finished. Because the reuse check is just "exists and non-empty", writing straight to
    ``dest`` would let an interrupted download be mistaken for a complete file on the next run.

    Raises:
        requests.HTTPError: on an error status from the server.
    """
    dest = str(dest)
    if os.path.exists(dest) and os.path.getsize(dest) > 0:
        return dest
    Path(dest).parent.mkdir(parents=True, exist_ok=True)

    partial = dest + ".part"
    try:
        with requests.get(href, stream=True, timeout=900) as r:
            r.raise_for_status()
            with open(partial, "wb") as fh:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    fh.write(chunk)
        os.replace(partial, dest)  # atomic publish of the finished file
    except BaseException:
        # Best-effort cleanup of the partial file (also on Ctrl-C), then propagate.
        try:
            os.remove(partial)
        except OSError:
            pass
        raise
    return dest
36
+
37
+
38
def bin_one(args) -> tuple[str, pd.DataFrame]:
    """Bin one bigWig to 1 kb mean per bin (module-level for ProcessPool picklability).

    ``args`` is a ``(name, path, sizes)`` tuple: the output column name, the bigWig path and
    the {chrom: length} dict. Returns ``(name, frame)`` where the frame has columns
    chrom, bin and ``name``. Bins without data, and chromosomes missing from the bigWig,
    are filled with 0.
    """
    name, path, sizes = args
    bw = pyBigWig.open(path)
    try:
        bw_chroms = set(bw.chroms().keys())
        frames = []
        for c in MAIN_CHROMS:
            if c not in sizes:
                continue
            n = sizes[c] // BIN_BP
            # Accept bigWigs that name chromosomes without the "chr" prefix.
            key = c if c in bw_chroms else c.replace("chr", "")
            if key not in bw_chroms:
                frames.append(pd.DataFrame({"chrom": c, "bin": range(n), name: np.zeros(n, "float32")}))
                continue
            # NOTE(review): pyBigWig stats() may use zoom-level summaries unless exact=True -
            # confirm the approximation is acceptable for the feature store.
            vals = bw.stats(key, 0, n * BIN_BP, nBins=n, type="mean")
            v = np.array([0.0 if x is None else float(x) for x in vals], dtype="float32")
            frames.append(pd.DataFrame({"chrom": c, "bin": range(n), name: v}))
    finally:
        # Release the file handle even when stats() raises inside a worker process.
        bw.close()
    return name, pd.concat(frames, ignore_index=True)
57
+
58
+
59
def build_feature_store(biosample: str, chrom_sizes_tsv: str, raw_dir: str, out_parquet: str,
                        max_dl: int = 7, max_bin: int = 7) -> pd.DataFrame:
    """Resolve, download, bin and merge the ENCODE panel for one biosample.

    Steps: resolve the panel, download the bigWigs with up to ``max_dl`` threads, bin each
    with up to ``max_bin`` processes, inner-merge all tracks on (chrom, bin), delete the raw
    bigWigs, then write the parquet plus a ``<stem>_manifest.csv`` next to it.

    Raises:
        SystemExit: when no track of the panel could be resolved for ``biosample``.
    """
    sizes = load_chrom_sizes(chrom_sizes_tsv)
    panel = resolve_panel(biosample)
    print(f"[{biosample}] resolved tracks: {list(panel.keys())}", flush=True)
    if not panel:
        raise SystemExit(f"no ENCODE bigWig tracks resolved for {biosample}")

    # 1) parallel download
    paths = {}
    with ThreadPoolExecutor(max_workers=max_dl) as ex:
        futs = {ex.submit(download, rec["href"],
                          os.path.join(raw_dir, f"{biosample}_{name}_{rec['accession']}.bigWig")): name
                for name, rec in panel.items()}
        for fut, name in futs.items():
            paths[name] = fut.result()
            print(f" downloaded {name}", flush=True)

    # 2) parallel bin
    binned = {}
    with ProcessPoolExecutor(max_workers=max_bin) as ex:
        for name, df in ex.map(bin_one, [(n, paths[n], sizes) for n in panel]):
            binned[name] = df
            print(f" binned {name}", flush=True)

    base = None
    for name in panel:
        base = binned[name] if base is None else base.merge(binned[name], on=["chrom", "bin"])
    base["biosample"] = biosample

    # 3) clean raws (best effort: a file that is already gone is not an error)
    for p in paths.values():
        try:
            os.remove(p)
        except OSError:
            pass

    out_path = Path(out_parquet)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    base.to_parquet(out_parquet, index=False)
    # Derive the manifest name from the file stem. A plain str.replace(".parquet", ...) returns
    # the parquet path unchanged when the suffix differs, and the CSV would then overwrite it.
    manifest_path = out_path.with_name(out_path.stem + "_manifest.csv")
    pd.DataFrame([{"track": n, "accession": panel[n]["accession"],
                   "output_type": panel[n]["output_type"]} for n in panel]
                 ).to_csv(manifest_path, index=False)
    return base
103
+
104
+
105
def main() -> None:
    """CLI entry: build the chromatin feature store for one biosample and print a summary."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--biosample", required=True)
    parser.add_argument("--sizes", default="/data/raw/hg38.chrom.sizes")
    parser.add_argument("--raw-dir", default="/data/raw/encode")
    parser.add_argument("--out", default=None)
    opts = parser.parse_args()

    # Without --out the file name is derived from the lower-cased biosample.
    out = opts.out if opts.out else f"/data/features/chromatin_{opts.biosample.lower()}.parquet"
    store = build_feature_store(opts.biosample, opts.sizes, opts.raw_dir, out)

    key_columns = ("chrom", "bin", "biosample")
    cols = [col for col in store.columns if col not in key_columns]
    print(f"feature store {out}: bins={len(store)} tracks={cols}", flush=True)
116
+
117
+
118
+ if __name__ == "__main__":
119
+ main()
@@ -0,0 +1,112 @@
1
+ """Integration-propensity features (Phase 1, Step 1.2).
2
+
3
+ Builds per-1 kb-bin retroviral integration density from VISDB integration tables (HIV, HTLV, MLV;
4
+ coordinates already lifted to hg38 in VISDB). Integration propensity reflects accessible/active
5
+ chromatin and is a feature for both the safety layer and "where insertions land".
6
+
7
+ NOTE (honest scope): VISDB's MLV set is tiny (~32 sites); the large >3.7M MLV-in-K562/HepG2 sets
8
+ referenced in the plan live in specific papers'/GEO supplements and are sourced separately. The
9
+ GENOTOXIC labels (clonal-outcome CIS) come from the clinical gene list (Step 1.4) - this module
10
+ supplies the integration-DENSITY feature, not the danger label.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import glob
16
+ import os
17
+ from pathlib import Path
18
+
19
+ import numpy as np
20
+ import pandas as pd
21
+
22
+ from pen_stack.data.genome import MAIN_CHROMS
23
+
24
+ BIN_BP = 1000
25
+
26
+
27
def load_visdb(csv_dir: str) -> pd.DataFrame:
    """Load every ``*.csv`` in ``csv_dir`` into one (chrom, pos, virus) table.

    The virus label is the file stem. Files lacking a "human chromosome" or "hg38_start"
    column (matched case-insensitively, whitespace-trimmed) are skipped. Rows without a
    numeric position or outside MAIN_CHROMS are dropped; ``pos`` is returned as int.
    """
    def _with_chr(label):
        return label if label.startswith("chr") else f"chr{label}"

    per_virus = []
    for csv_path in sorted(glob.glob(os.path.join(csv_dir, "*.csv"))):
        table = pd.read_csv(csv_path, dtype=str)
        by_norm = {col.lower().strip(): col for col in table.columns}
        chrom_col = by_norm.get("human chromosome")
        start_col = by_norm.get("hg38_start")
        if not (chrom_col and start_col):
            continue
        sites = pd.DataFrame({
            "chrom": table[chrom_col].astype(str).map(_with_chr),
            "pos": pd.to_numeric(table[start_col], errors="coerce"),
            "virus": Path(csv_path).stem,
        })
        per_virus.append(sites.dropna(subset=["pos"]))

    if per_virus:
        merged = pd.concat(per_virus, ignore_index=True)
    else:
        merged = pd.DataFrame(columns=["chrom", "pos", "virus"])
    kept = merged[merged["chrom"].isin(MAIN_CHROMS)].copy()
    kept["pos"] = kept["pos"].astype(int)
    return kept
47
+
48
+
49
def density_per_bin(integ: pd.DataFrame, bin_grid: str, out_parquet: str) -> pd.DataFrame:
    """Count integration sites per grid bin and write the result to ``out_parquet``.

    ``integ`` needs ``chrom`` and ``pos`` columns and is left unmodified. The output has one
    row per bin of the grid parquet, with ``integ_density`` (int32 count, 0 where no site
    falls) and ``integ_log_density`` (float32 log1p of the count).
    """
    grid = pd.read_parquet(bin_grid)[["chrom", "bin"]]
    # assign() works on a copy, so the caller's frame does not silently gain a "bin" column.
    binned = integ.assign(bin=integ["pos"] // BIN_BP)
    dens = binned.groupby(["chrom", "bin"]).size().rename("integ_density").reset_index()
    out = grid.merge(dens, on=["chrom", "bin"], how="left")
    out["integ_density"] = out["integ_density"].fillna(0).astype("int32")
    out["integ_log_density"] = np.log1p(out["integ_density"]).astype("float32")
    Path(out_parquet).parent.mkdir(parents=True, exist_ok=True)
    out.to_parquet(out_parquet, index=False)
    return out
59
+
60
+
61
def lafave_density(bed_gz: str, chain_file: str, bin_grid: str, out_parquet: str) -> pd.DataFrame:
    """Cell-type-specific MLV integration density from a LaFave et al. 2014 BED (hg19 -> hg38 lift).

    The LaFave K562/HepG2 MLV integration BEDs are on hg19; lift each site to hg38 with the UCSC
    chain, then bin to 1 kb. This is the plan's >3.7M MLV-in-K562/HepG2 supervision (Bushman/NHGRI).

    Only the first two BED columns (chrom, start) are used. Sites that fail to lift, or that
    land outside MAIN_CHROMS, are dropped. The returned/written columns are
    ``integ_mlv_density`` and ``integ_mlv_log_density``.
    """
    import gzip

    from pyliftover import LiftOver

    lo = LiftOver(chain_file)
    sites = []
    n_total = 0
    with gzip.open(bed_gz, "rt") as fh:
        for line in fh:
            # Skip blank lines and BED header/comment lines, which have no integer start field.
            if not line.strip() or line.startswith(("track", "browser", "#")):
                continue
            f = line.split("\t")
            chrom, start = f[0], int(f[1])
            n_total += 1
            conv = lo.convert_coordinate(chrom, start)
            if conv:
                # Use the first lifted hit: (chrom, pos, ...).
                nc, npos = conv[0][0], conv[0][1]
                if nc in MAIN_CHROMS:
                    sites.append((nc, npos))
    integ = pd.DataFrame(sites, columns=["chrom", "pos"])
    print(f"lifted {len(integ)} / {n_total} sites to hg38")
    # density_per_bin already writes out_parquet with the generic column names; the file is
    # rewritten below once the columns carry their MLV-specific names.
    out = density_per_bin(integ, bin_grid, out_parquet)
    out = out.rename(columns={"integ_density": "integ_mlv_density",
                              "integ_log_density": "integ_mlv_log_density"})
    out.to_parquet(out_parquet, index=False)
    return out
88
+
89
+
90
def main() -> None:
    """CLI entry: build the VISDB (default) or LaFave MLV integration-density parquet."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--mode", choices=["visdb", "lafave"], default="visdb")
    ap.add_argument("--visdb-dir", default="/data/external/visdb")
    ap.add_argument("--lafave-bed", default=None)
    ap.add_argument("--chain", default="/data/external/hg19ToHg38.over.chain.gz")
    ap.add_argument("--bin-grid", default="/data/features/bin_grid_1kb.parquet")
    ap.add_argument("--out", default="/data/features/integration_density.parquet")
    a = ap.parse_args()

    if a.mode == "lafave":
        # Fail with a usage message instead of an obscure error from opening a None path.
        if not a.lafave_bed:
            ap.error("--lafave-bed is required with --mode lafave")
        out = lafave_density(a.lafave_bed, a.chain, a.bin_grid, a.out)
        nz = int((out["integ_mlv_density"] > 0).sum())
        print(f"MLV density: bins={len(out)} nonzero={nz} max={int(out['integ_mlv_density'].max())} -> {a.out}")
        return

    integ = load_visdb(a.visdb_dir)
    print(f"loaded {len(integ)} integration sites; by virus: {integ['virus'].value_counts().to_dict()}")
    out = density_per_bin(integ, a.bin_grid, a.out)
    nz = int((out["integ_density"] > 0).sum())
    print(f"integration density: bins={len(out)} nonzero={nz} max={int(out['integ_density'].max())}")
109
+
110
+
111
+ if __name__ == "__main__":
112
+ main()
@@ -0,0 +1,164 @@
1
+ """Safety annotations per 1 kb bin (Phase 1, Step 1.4).
2
+
3
+ Builds per-bin safety features from COSMIC Cancer Gene Census (oncogene/TSG loci),
4
+ DepMap CRISPRGeneEffect (essential genes), and GENCODE (gene/TSS distances):
5
+ - dist_oncogene, dist_tsg, dist_essential, dist_tss (bp to nearest, via bedtools closest)
6
+ - genotoxic_cis flag (bins within a window of LMO2/MECOM/CCND2/PRDM16/HMGA2)
7
+
8
+ Inputs are staged on the VM under /data/external (COSMIC tsv, DepMap csv); GENCODE is downloaded.
9
+ Runs CPU-only in the penstack:phase1 image (bedtools in-image). Output keyed on (chrom, bin).
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import gzip
15
+ import os
16
+ from pathlib import Path
17
+
18
+ import numpy as np
19
+ import pandas as pd
20
+ import pybedtools
21
+ import requests
22
+
23
+ from pen_stack.data.genome import MAIN_CHROMS
24
+
25
+ GENCODE_GTF = ("https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/"
26
+ "release_46/gencode.v46.basic.annotation.gtf.gz")
27
+ GENOTOXIC = ["LMO2", "MECOM", "EVI1", "CCND2", "PRDM16", "HMGA2"]
28
+ BIN_BP = 1000
29
+ CIS_WINDOW = 50000 # bp window around a genotoxic gene to flag
30
+
31
+
32
+ def _chr(c: str) -> str:
33
+ c = str(c)
34
+ return c if c.startswith("chr") else f"chr{c}"
35
+
36
+
37
def load_cosmic(tsv: str) -> pd.DataFrame:
    """Load gene loci from a COSMIC Cancer Gene Census TSV.

    Returns columns chrom, start, end, GENE_SYMBOL, role. Rows without chromosome or numeric
    coordinates, or outside MAIN_CHROMS, are dropped. ``role`` is ROLE_IN_CANCER with missing
    values as "", and "" for every row when the file has no such column.
    """
    raw = pd.read_csv(tsv, sep="\t", dtype=str)
    raw = raw.dropna(subset=["CHROMOSOME", "GENOME_START", "GENOME_STOP"])

    # DataFrame.get's default is returned as-is, so a "" default has no .fillna(); branch instead.
    role = raw["ROLE_IN_CANCER"].fillna("") if "ROLE_IN_CANCER" in raw.columns else ""

    # Build a fresh frame instead of assigning columns onto filtered views (SettingWithCopy).
    loci = pd.DataFrame({
        "chrom": raw["CHROMOSOME"].map(_chr),
        "start": pd.to_numeric(raw["GENOME_START"], errors="coerce"),
        "end": pd.to_numeric(raw["GENOME_STOP"], errors="coerce"),
        "GENE_SYMBOL": raw["GENE_SYMBOL"],
        "role": role,
    })
    loci = loci.dropna(subset=["start", "end"])
    loci = loci[loci["chrom"].isin(MAIN_CHROMS)]
    return loci.astype({"start": int, "end": int})
49
+
50
+
51
def load_depmap_essential(csv: str, thresh: float = -0.5) -> set[str]:
    """Common-essential genes: mean Chronos effect across cell lines < thresh.

    The first CSV column is the row index; every other column is one gene. The returned
    symbol is the column label up to (not including) the first " (".
    """
    effects = pd.read_csv(csv, index_col=0)
    mean_effect = effects.mean(axis=0)
    below = mean_effect[mean_effect < thresh]
    return {label.split(" (")[0] for label in below.index}
57
+
58
+
59
def download_gencode(dest: str, url: str = GENCODE_GTF) -> str:
    """Download the GENCODE GTF to ``dest`` unless a non-empty file is already there.

    The payload is streamed to ``<dest>.part`` and renamed into place only when complete.
    Because the reuse check is just "exists and non-empty", writing straight to ``dest``
    would let an interrupted download be reused as a truncated GTF on the next run.

    Raises:
        requests.HTTPError: on an error status from the server.
    """
    if os.path.exists(dest) and os.path.getsize(dest) > 0:
        return dest
    Path(dest).parent.mkdir(parents=True, exist_ok=True)

    partial = f"{dest}.part"
    try:
        with requests.get(url, stream=True, timeout=600) as r:
            r.raise_for_status()
            with open(partial, "wb") as fh:
                for ch in r.iter_content(1 << 20):
                    fh.write(ch)
        os.replace(partial, dest)  # atomic publish of the finished file
    except BaseException:
        # Best-effort cleanup of the partial file (also on Ctrl-C), then propagate.
        try:
            os.remove(partial)
        except OSError:
            pass
        raise
    return dest
68
+
69
+
70
def parse_gencode_genes(gtf_gz: str) -> pd.DataFrame:
    """Extract the ``gene`` records on MAIN_CHROMS from a gzipped GTF.

    Returns columns chrom, start, end, strand, gene_name, tss. Coordinates are taken from the
    GTF unchanged; ``tss`` is ``start`` on the "+" strand and ``end`` otherwise. ``gene_name``
    is "" when the attribute is missing.
    """
    columns = ["chrom", "start", "end", "strand", "gene_name", "tss"]
    records = []
    with gzip.open(gtf_gz, "rt") as handle:
        for raw in handle:
            if raw.startswith("#"):
                continue
            fields = raw.rstrip("\n").split("\t")
            if fields[2] != "gene" or fields[0] not in MAIN_CHROMS:
                continue
            chrom, strand = fields[0], fields[6]
            start, end = int(fields[3]), int(fields[4])
            # First attribute starting with gene_name; its value is the first quoted token.
            gene_name = next(
                (attr.strip().split('"')[1]
                 for attr in fields[8].split(";")
                 if attr.strip().startswith("gene_name")),
                "",
            )
            tss = end if strand != "+" else start
            records.append((chrom, start, end, strand, gene_name, tss))
    return pd.DataFrame(records, columns=columns)
93
+
94
+
95
def _bed(df: pd.DataFrame, cols=("chrom", "start", "end")) -> pybedtools.BedTool:
    """Turn three columns of ``df`` into a BedTool, rows ordered by (chrom, start).

    ``cols`` names the source columns for chrom / start / end, in that order.
    """
    intervals = df[list(cols)].copy()
    intervals.columns = ["chrom", "start", "end"]
    ordered = intervals.sort_values(["chrom", "start"])
    return pybedtools.BedTool.from_dataframe(ordered)
100
+
101
+
102
def nearest_dist(bins_bed: pybedtools.BedTool, feat_df: pd.DataFrame, name: str) -> pd.DataFrame:
    """Distance from every bin to its closest feature, as a (chrom, start, ``name``) table.

    An empty ``feat_df`` yields an empty table with those three columns. Otherwise there is
    one row per bin (chrom, start), holding the minimum reported distance.
    """
    out_cols = ["chrom", "start", name]
    if feat_df.empty:
        return pd.DataFrame(columns=out_cols)

    features = _bed(feat_df).sort()
    hits = bins_bed.closest(features, d=True)
    dist_field = hits.field_count() - 1  # the distance is the last field of each record
    table = hits.to_dataframe(header=None, usecols=[0, 1, dist_field], names=out_cols)

    # bedtools closest -d returns -1 when there is NO feature on that chromosome; that means
    # "no nearby feature" (effectively infinite distance), NOT distance 0. Map the sentinel to NaN.
    dist = table[name]
    table[name] = dist.where(dist >= 0, other=np.nan)

    # Ties produce several rows per bin; collapse them to a single minimum.
    per_bin = table.groupby(["chrom", "start"], as_index=False)
    return per_bin[name].min()
113
+
114
+
115
def build(bin_grid: str, cosmic_tsv: str, depmap_csv: str, gencode_dest: str,
          sizes_tsv: str, out_parquet: str) -> pd.DataFrame:
    """Assemble the per-bin safety table and write it to ``out_parquet``.

    Output: the grid's chrom/start/bin plus dist_oncogene, dist_tsg, dist_essential, dist_tss,
    dist_gtox and the boolean genotoxic_cis (bin overlaps a GENOTOXIC gene +/- CIS_WINDOW).

    ``sizes_tsv`` is accepted but not used by this function.
    """
    key = ["chrom", "start"]
    grid = pd.read_parquet(bin_grid)[["chrom", "start", "bin"]]
    bins_bed = _bed(grid.assign(end=grid["start"] + BIN_BP)).sort()

    # COSMIC roles are matched as case-insensitive substrings.
    census = load_cosmic(cosmic_tsv)
    role = census["role"]
    oncogenes = census[role.str.contains("oncogene", case=False, na=False)]
    suppressors = census[role.str.contains("TSG", case=False, na=False)]

    genes = parse_gencode_genes(download_gencode(gencode_dest))
    essential = genes[genes["gene_name"].isin(load_depmap_essential(depmap_csv))]
    # TSS as a 1 bp interval so the same nearest-feature machinery applies.
    tss_points = genes.assign(end=genes["tss"] + 1, start=genes["tss"])

    feature_sets = {
        "dist_oncogene": oncogenes,
        "dist_tsg": suppressors,
        "dist_essential": essential,
        "dist_tss": tss_points,
    }
    annotated = grid.copy()
    for column, features in feature_sets.items():
        distances = nearest_dist(bins_bed, features, column)
        annotated = annotated.merge(distances, on=key, how="left")

    # genotoxic CIS flag: widen each listed gene by CIS_WINDOW; distance 0 means overlap.
    windows = genes[genes["gene_name"].isin(GENOTOXIC)].copy()
    windows["start"] = (windows["start"] - CIS_WINDOW).clip(lower=0)
    windows["end"] = windows["end"] + CIS_WINDOW
    annotated = annotated.merge(nearest_dist(bins_bed, windows, "dist_gtox"), on=key, how="left")
    annotated["genotoxic_cis"] = annotated["dist_gtox"].fillna(1e9) == 0

    Path(out_parquet).parent.mkdir(parents=True, exist_ok=True)
    annotated.to_parquet(out_parquet, index=False)
    return annotated
146
+
147
+
148
def main() -> None:
    """CLI entry: build the safety-annotation parquet and print two summary lines."""
    parser = argparse.ArgumentParser()
    for flag, default in (
        ("--bin-grid", "/data/features/bin_grid_1kb.parquet"),
        ("--cosmic", "/data/external/Cosmic_CancerGeneCensus_v104_GRCh38.tsv"),
        ("--depmap", "/data/external/CRISPRGeneEffect.csv"),
        ("--gencode", "/data/raw/gencode.v46.basic.gtf.gz"),
        ("--sizes", "/data/raw/hg38.chrom.sizes"),
        ("--out", "/data/features/safety_annot.parquet"),
    ):
        parser.add_argument(flag, default=default)
    opts = parser.parse_args()

    table = build(opts.bin_grid, opts.cosmic, opts.depmap, opts.gencode, opts.sizes, opts.out)
    # Distance 0 means the bin overlaps an oncogene locus.
    n_onco = (table["dist_oncogene"] == 0).sum()
    print(f"safety_annot bins={len(table)} cols={[c for c in table.columns if c.startswith('dist') or c=='genotoxic_cis']}")
    print(f"bins in an oncogene={n_onco} genotoxic_cis bins={int(table['genotoxic_cis'].sum())}")
161
+
162
+
163
+ if __name__ == "__main__":
164
+ main()