pen-stack 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pen_stack/__init__.py +2 -0
- pen_stack/_resources.py +34 -0
- pen_stack/adapt/__init__.py +14 -0
- pen_stack/adapt/finetune.py +33 -0
- pen_stack/adapt/ingest.py +86 -0
- pen_stack/adapt/pipeline.py +101 -0
- pen_stack/adapt/recalibrate.py +58 -0
- pen_stack/adapt/report.py +130 -0
- pen_stack/agent/__init__.py +1 -0
- pen_stack/agent/guardrails.py +49 -0
- pen_stack/agent/mcp_server.py +42 -0
- pen_stack/agent/orchestrator.py +106 -0
- pen_stack/agent/pen_agent.py +169 -0
- pen_stack/agent/tools.py +130 -0
- pen_stack/atlas/__init__.py +1 -0
- pen_stack/atlas/build_wtkb.py +80 -0
- pen_stack/atlas/crosslink.py +144 -0
- pen_stack/atlas/expand.py +190 -0
- pen_stack/atlas/schema.py +59 -0
- pen_stack/atlas/scorecard.py +134 -0
- pen_stack/atlas/universe.py +75 -0
- pen_stack/atlas/variant_propose.py +155 -0
- pen_stack/bridge/__init__.py +1 -0
- pen_stack/bridge/activity.py +52 -0
- pen_stack/bridge/cli.py +65 -0
- pen_stack/bridge/fold_qc.py +53 -0
- pen_stack/bridge/guide_qc.py +84 -0
- pen_stack/bridge/ingest.py +139 -0
- pen_stack/bridge/offtarget.py +133 -0
- pen_stack/bridge/ortholog_screen.py +73 -0
- pen_stack/bridge/pipeline.py +83 -0
- pen_stack/cli.py +126 -0
- pen_stack/data/__init__.py +1 -0
- pen_stack/data/encode.py +84 -0
- pen_stack/data/genome.py +71 -0
- pen_stack/data/ingest_chromatin.py +119 -0
- pen_stack/data/ingest_integration.py +112 -0
- pen_stack/data/ingest_safety_annot.py +164 -0
- pen_stack/data/ingest_trip.py +76 -0
- pen_stack/mech/__init__.py +1 -0
- pen_stack/mech/classify_atlas.py +71 -0
- pen_stack/mech/whitelist.py +66 -0
- pen_stack/monitor/__init__.py +1 -0
- pen_stack/monitor/europepmc.py +32 -0
- pen_stack/monitor/run.py +57 -0
- pen_stack/monitor/triage.py +63 -0
- pen_stack/planner/__init__.py +1 -0
- pen_stack/planner/cargo.py +56 -0
- pen_stack/planner/cargo_polish.py +146 -0
- pen_stack/planner/delivery.py +32 -0
- pen_stack/planner/multiplex.py +110 -0
- pen_stack/planner/optimize.py +156 -0
- pen_stack/planner/pipeline.py +86 -0
- pen_stack/planner/report.py +26 -0
- pen_stack/rag/__init__.py +1 -0
- pen_stack/rag/index.py +53 -0
- pen_stack/rag/llm.py +178 -0
- pen_stack/rag/qa.py +105 -0
- pen_stack/score/__init__.py +1 -0
- pen_stack/score/recalibrate.py +77 -0
- pen_stack/score/therapeutic.py +85 -0
- pen_stack/server/__init__.py +1 -0
- pen_stack/server/api.py +142 -0
- pen_stack/ui/__init__.py +1 -0
- pen_stack/ui/app.py +518 -0
- pen_stack/validate/__init__.py +1 -0
- pen_stack/validate/adapt_demo.py +69 -0
- pen_stack/validate/agent_eval.py +117 -0
- pen_stack/validate/blind_gsh_discovery.py +165 -0
- pen_stack/validate/cargo_directionality.py +57 -0
- pen_stack/validate/durability_baselines.py +150 -0
- pen_stack/validate/forward_hypotheses.py +104 -0
- pen_stack/validate/guide_qc_demo.py +58 -0
- pen_stack/validate/intent_specification.py +82 -0
- pen_stack/validate/paper3_benchmark.py +165 -0
- pen_stack/validate/paper4_real_validation.py +144 -0
- pen_stack/validate/paper4_validation.py +82 -0
- pen_stack/validate/seq_vs_measured.py +134 -0
- pen_stack/validate/within_locus_ranking.py +74 -0
- pen_stack/validate/writer_recovery.py +86 -0
- pen_stack/wgenome/__init__.py +1 -0
- pen_stack/wgenome/chromatin_seq.py +83 -0
- pen_stack/wgenome/durability.py +108 -0
- pen_stack/wgenome/export_tracks.py +52 -0
- pen_stack/wgenome/features.py +82 -0
- pen_stack/wgenome/gsh_baseline.py +117 -0
- pen_stack/wgenome/providers.py +245 -0
- pen_stack/wgenome/safety.py +69 -0
- pen_stack/wgenome/structure3d.py +168 -0
- pen_stack/wgenome/writability.py +72 -0
- pen_stack-3.1.0.dist-info/METADATA +451 -0
- pen_stack-3.1.0.dist-info/RECORD +96 -0
- pen_stack-3.1.0.dist-info/WHEEL +5 -0
- pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
- pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
- pen_stack-3.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""External sequence-to-function providers (v3.1, WS-C).
|
|
2
|
+
|
|
3
|
+
AlphaGenomeProvider wraps Google DeepMind's AlphaGenome (free, non-commercial) behind a small, cached,
|
|
4
|
+
provider-agnostic interface so the rest of PEN-STACK never imports `alphagenome` directly. It supplies:
|
|
5
|
+
|
|
6
|
+
* tracks(chrom, bin, ct) -> central-window mean of each of the seven atlas tracks (ATAC, DNASE, five CHIP_HISTONE marks)
|
|
7
|
+
* expression(chrom, start, end, ontology) -> scalar endogenous expression proxy (mean RNA_SEQ over the central window)
|
|
8
|
+
* contact_map_summary(chrom, start, end, ontology) -> mean/variance summary of the predicted Hi-C contact map (3D structural-risk feature, WS-C2)
|
|
9
|
+
|
|
10
|
+
Design rules (match the rest of the stack):
|
|
11
|
+
* The LLM and the provider are NON-load-bearing for reproducibility - every cached value is keyed by an
|
|
12
|
+
explicit (assembly, interval, output, ontology) tuple and written to disk, so a run is reproducible
|
|
13
|
+
offline from the cache without re-querying the API.
|
|
14
|
+
* The API key is read from env (ALPHAGENOME_API_KEY) or a gitignored file; NEVER committed.
|
|
15
|
+
* `alphagenome` is an optional dependency. If the package or key is absent, `available()` is False and the
|
|
16
|
+
dependent baselines (WS-B1, WS-C) report `pending` rather than crashing - the core stack is unaffected.
|
|
17
|
+
|
|
18
|
+
Caching: predictions are large (up to ~1M rows). We cache the *reduced* features we actually consume
|
|
19
|
+
(scalar expression, mean track signal, contact-map summary statistics) as small JSON/parquet, not the raw
|
|
20
|
+
1 Mb tensors, keyed by a content hash of the request.
|
|
21
|
+
"""
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import hashlib
|
|
25
|
+
import json
|
|
26
|
+
import os
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
_ROOT = Path(__file__).resolve().parents[2]
|
|
30
|
+
_CACHE = _ROOT / "data" / "alphagenome_cache"
|
|
31
|
+
_KEY_FILE = _ROOT / "configs" / "alphagenome_api_key.txt"
|
|
32
|
+
_KEY_ENV = "ALPHAGENOME_API_KEY"
|
|
33
|
+
|
|
34
|
+
# 1 Mb is AlphaGenome's max; expression/structural features use it for full regulatory context.
|
|
35
|
+
SEQ_LEN_1MB = 1_048_576
|
|
36
|
+
|
|
37
|
+
# Model version recorded in track-cache keys + artifacts (C1 reproducibility). Bump when the served model
|
|
38
|
+
# changes so stale predictions are not silently reused.
|
|
39
|
+
MODEL_VERSION = "alphagenome-2025-06"
|
|
40
|
+
|
|
41
|
+
# The seven measured-atlas tracks and their AlphaGenome sources. The five histone marks come from the single
|
|
42
|
+
# CHIP_HISTONE output selected by its `histone_mark` metadata column.
|
|
43
|
+
_HISTONES = ["H3K27ac", "H3K4me1", "H3K4me3", "H3K9me3", "H3K27me3"]
|
|
44
|
+
TRACK_NAMES = ["atac", "dnase", *_HISTONES]
|
|
45
|
+
|
|
46
|
+
# K562 / HepG2 cell-type ontologies (verified against AlphaGenome human output_metadata).
|
|
47
|
+
CT_ONTOLOGY = {"k562": "EFO:0002067", "hepg2": "EFO:0001187"}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _resolve_key() -> str | None:
    """Return the AlphaGenome API key, or None when no usable key is configured.

    Lookup order:
      1. the environment variable named by `_KEY_ENV` (surrounding whitespace stripped);
      2. the first plausible line of the gitignored `_KEY_FILE`.

    A blank or whitespace-only environment value counts as "not set" and falls through to the key
    file, so an empty string is never returned as a key.
    """
    env_key = os.environ.get(_KEY_ENV, "").strip()
    if env_key:
        return env_key
    if _KEY_FILE.exists():
        for line in _KEY_FILE.read_text(encoding="utf-8").splitlines():
            # tolerate keys pasted with trailing quote / comma / semicolon residue
            s = line.strip().rstrip('",; ')
            # skip blanks, lines that start with "alphagenome" (case-insensitive) and anything of
            # 20 characters or fewer
            if s and not s.lower().startswith("alphagenome") and len(s) > 20:
                return s
    return None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def package_available() -> bool:
    """Report whether the optional `alphagenome` package can be imported in this environment."""
    try:
        import alphagenome  # noqa: F401
    except Exception:  # noqa: BLE001
        # any import-time failure (missing or broken install) counts as "not available"
        return False
    else:
        return True
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _cache_key(*parts) -> str:
    """Derive a 24-hex-character cache key from the request parts.

    Each part is stringified, the pieces are joined with "|", and the first 24 hex digits of the
    SHA-256 digest of that string are returned.
    """
    joined = "|".join(map(str, parts))
    digest = hashlib.sha256(joined.encode())
    return digest.hexdigest()[:24]
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class AlphaGenomeProvider:
    """Cached wrapper around AlphaGenome's dna_client. Construct with `AlphaGenomeProvider()`.

    Every feature method first consults a JSON cache under `cache_dir`. Only on a cache miss, and
    only when `offline` is False and `available()` is True, does it import `alphagenome` and issue
    a live prediction, whose reduced result is then written back to the cache. When no result can
    be produced the method returns `{"available": False, "reason": ..., "key": ...}`.
    """

    def __init__(self, api_key: str | None = None, assembly: str = "hg38", cache_dir: Path = _CACHE):
        """Create the provider and make sure the cache directory exists.

        Args:
            api_key: explicit key; when falsy the key is looked up with `_resolve_key()`.
            assembly: assembly label recorded in every cache key.
            cache_dir: directory holding the `<key>.json` cache entries.
        """
        self.assembly = assembly
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self._key = api_key or _resolve_key()
        self._model = None  # lazily created on first live call

    # -- availability ------------------------------------------------------
    def available(self) -> bool:
        """True when both the package and a non-empty key are present (a live call is possible)."""
        # bool() rather than `is not None`: an empty-string key cannot authenticate either
        return package_available() and bool(self._key)

    def _client(self):
        """Return the AlphaGenome client, creating it on first use."""
        if self._model is None:
            from alphagenome.models import dna_client
            self._model = dna_client.create(self._key)
        return self._model

    # -- cache helpers -----------------------------------------------------
    def _load(self, key: str):
        """Return the cached record for `key`, or None on a miss.

        A file that cannot be decoded or parsed (e.g. truncated by an interrupted write) is
        treated as a miss so it gets recomputed and overwritten instead of failing every
        later lookup.
        """
        f = self.cache_dir / f"{key}.json"
        try:
            return json.loads(f.read_text(encoding="utf-8"))
        except FileNotFoundError:
            return None
        except ValueError:  # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors
            return None

    def _store(self, key: str, value: dict) -> None:
        """Write `value` to the cache atomically (temp file + rename)."""
        target = self.cache_dir / f"{key}.json"
        # serialise first so a serialisation error leaves nothing on disk
        payload = json.dumps(value, default=str)
        tmp = target.with_name(f"{target.name}.{os.getpid()}.tmp")
        try:
            tmp.write_text(payload, encoding="utf-8")
            os.replace(tmp, target)  # readers see either the old entry or the complete new one
        finally:
            tmp.unlink(missing_ok=True)  # no-op after a successful replace

    @staticmethod
    def _central_window(values, center_bp: int):
        """Slice the central `center_bp` rows (at least 2) along axis 0 of `values`."""
        import numpy as np
        arr = np.asarray(values)
        mid = arr.shape[0] // 2
        half = max(1, center_bp // 2)
        return arr[max(0, mid - half):mid + half]

    # -- features ----------------------------------------------------------
    def expression(self, chrom: str, start: int, end: int, ontology: str, organism: str = "human",
                   center_bp: int = 20_000, offline: bool = False) -> dict:
        """Scalar endogenous-expression proxy: mean predicted RNA_SEQ in a central window (cached).

        The 1 Mb model context is needed for regulatory reach, but the proxy averages only the central
        `center_bp` (host-locus expression at the integration site) rather than the whole 1 Mb, which would
        wash out the local signal.

        Args:
            chrom, start, end: interval, resized to `SEQ_LEN_1MB` before prediction.
            ontology: ontology term passed to the model.
            organism: "mouse" selects MUS_MUSCULUS; any other value selects HOMO_SAPIENS.
            center_bp: width of the central window that is averaged.
            offline: when True, never issue a live call; report a cache miss instead.

        Returns:
            The cached or freshly computed record (`rna_seq_mean`, `rna_seq_max`, ...), or an
            `{"available": False, ...}` record when no result can be produced.
        """
        key = _cache_key("expr", self.assembly, organism, chrom, start, end, ontology, center_bp)
        hit = self._load(key)
        if hit is not None:
            return hit
        if offline:
            return {"available": False, "reason": "offline: not in cache", "key": key}
        if not self.available():
            return {"available": False, "reason": "alphagenome package or key absent", "key": key}
        from alphagenome.data import genome
        from alphagenome.models import dna_client
        org = (dna_client.Organism.MUS_MUSCULUS if organism == "mouse"
               else dna_client.Organism.HOMO_SAPIENS)
        interval = genome.Interval(chromosome=chrom, start=start, end=end).resize(SEQ_LEN_1MB)
        out = self._client().predict_interval(
            interval=interval, organism=org,
            requested_outputs=[dna_client.OutputType.RNA_SEQ], ontology_terms=[ontology])
        central = self._central_window(out.rna_seq.values, center_bp)
        rec = {"available": True, "rna_seq_mean": float(central.mean()),
               "rna_seq_max": float(central.max()), "center_bp": center_bp,
               "chrom": chrom, "start": start, "end": end,
               "ontology": ontology, "organism": organism, "key": key}
        self._store(key, rec)
        return rec

    def tracks(self, chrom: str, bin: int, ct: str, bin_size: int = 1000, center_bp: int = 1000,
               offline: bool = False) -> dict:
        """Predicted values of the seven measured-atlas tracks at a 1 kb bin (central-window mean, cached).

        `ct` is "k562" or "hepg2" (any other value is passed through as the ontology term); the bin
        centre is `bin*bin_size + bin_size//2`, predicted in 1 Mb context.
        Returns {atac, dnase, H3K27ac, H3K4me1, H3K4me3, H3K9me3, H3K27me3, model_version, ...}.
        A histone mark with no matching `histone_mark` metadata row is reported as NaN.
        """
        ontology = CT_ONTOLOGY.get(ct.lower(), ct)
        key = _cache_key("tracks", self.assembly, MODEL_VERSION, chrom, bin, ontology, bin_size, center_bp)
        hit = self._load(key)
        if hit is not None:
            return hit
        if offline:
            return {"available": False, "reason": "offline: not in cache", "key": key}
        if not self.available():
            return {"available": False, "reason": "alphagenome package or key absent", "key": key}
        from alphagenome.data import genome
        from alphagenome.models import dna_client
        # int(): `bin` may arrive as a numpy integer from a DataFrame; pass a plain int downstream
        pos = int(bin) * bin_size + bin_size // 2
        interval = genome.Interval(chromosome=chrom, start=pos, end=pos).resize(SEQ_LEN_1MB)
        out = self._client().predict_interval(
            interval=interval,
            requested_outputs=[dna_client.OutputType.ATAC, dna_client.OutputType.DNASE,
                               dna_client.OutputType.CHIP_HISTONE],
            ontology_terms=[ontology])

        rec = {"available": True, "chrom": chrom, "bin": int(bin), "ct": ct, "ontology": ontology,
               "model_version": MODEL_VERSION, "center_bp": center_bp, "key": key,
               "atac": float(self._central_window(out.atac.values, center_bp).mean()),
               "dnase": float(self._central_window(out.dnase.values, center_bp).mean())}
        ch = out.chip_histone
        # positional metadata index -> column index into the CHIP_HISTONE value matrix
        md, vals = ch.metadata.reset_index(drop=True), self._central_window(ch.values, center_bp)
        for mark in _HISTONES:
            cols = md.index[md["histone_mark"] == mark].to_numpy()
            rec[mark] = float(vals[:, cols].mean()) if len(cols) else float("nan")
        self._store(key, rec)
        return rec

    def contact_map_summary(self, chrom: str, start: int, end: int, ontology: str,
                            offline: bool = False) -> dict:
        """3D structural-risk summary (WS-C2): variance + mean of the predicted contact map (cached).

        `offline=True` mirrors `expression()` / `tracks()`: a cache miss is reported instead of
        issuing a live call.
        """
        key = _cache_key("contact", self.assembly, chrom, start, end, ontology)
        hit = self._load(key)
        if hit is not None:
            return hit
        if offline:
            return {"available": False, "reason": "offline: not in cache", "key": key}
        if not self.available():
            return {"available": False, "reason": "alphagenome package or key absent", "key": key}
        import numpy as np
        from alphagenome.data import genome
        from alphagenome.models import dna_client
        interval = genome.Interval(chromosome=chrom, start=start, end=end).resize(SEQ_LEN_1MB)
        out = self._client().predict_interval(
            interval=interval, requested_outputs=[dna_client.OutputType.CONTACT_MAPS],
            ontology_terms=[ontology])
        m = np.asarray(out.contact_maps.values)
        rec = {"available": True, "contact_mean": float(m.mean()), "contact_var": float(m.var()),
               "chrom": chrom, "start": start, "end": end, "ontology": ontology, "key": key}
        self._store(key, rec)
        return rec
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
class MeasuredTrackProvider:
    """Measured-ENCODE backbone: serves rows of `phase_1/features/chromatin_{ct}.parquet` per bin.

    Exposes the same `tracks()` call shape as AlphaGenomeProvider, so predicted and measured values
    can be compared on identical bins and downstream code can swap providers without branching.
    """

    # directory holding the per-cell-type chromatin parquet files
    _P1 = _ROOT.parent / "phase_1" / "features"

    def __init__(self, ct: str):
        """Load the chromatin table for cell type `ct` (lower-cased) indexed by (chrom, bin)."""
        import pandas as pd
        self.ct = ct.lower()
        parquet_path = self._P1 / f"chromatin_{self.ct}.parquet"
        frame = pd.read_parquet(parquet_path)
        self._df = frame.set_index(["chrom", "bin"])

    def available(self) -> bool:
        """Always True once the table has been loaded."""
        return True

    def tracks(self, chrom: str, bin: int, ct: str | None = None, **_: object) -> dict:
        """Return the measured track values at (`chrom`, `bin`).

        `ct` and any extra keyword arguments are accepted for signature compatibility and ignored.
        A bin missing from the table yields an `{"available": False, ...}` record.
        """
        bin_idx = int(bin)
        try:
            row = self._df.loc[(chrom, bin_idx)]
        except KeyError:
            return {"available": False, "reason": "bin not in measured grid"}
        rec = {"available": True, "chrom": chrom, "bin": bin_idx, "ct": self.ct}
        # only tracks actually present in the table are reported
        for track in TRACK_NAMES:
            if track in row:
                rec[track] = float(row[track])
        return rec
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def smoke() -> dict:
    """Readiness probe for tests/CLI: reports package/key availability without a live call."""
    provider = AlphaGenomeProvider()
    report = {}
    report["package_available"] = package_available()
    report["key_present"] = _resolve_key() is not None
    report["available"] = provider.available()
    report["model_version"] = MODEL_VERSION
    report["track_names"] = TRACK_NAMES
    report["cache_dir"] = str(_CACHE)
    return report
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
if __name__ == "__main__": # pragma: no cover
|
|
245
|
+
print(json.dumps(smoke(), indent=2))
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Safety layer (Phase 1, Step 1.6) - calibrated genotoxicity-risk model.
|
|
2
|
+
|
|
3
|
+
Position features -> P(genotoxic) with isotonic calibration and CHROMOSOME-BLOCK cross-validation
|
|
4
|
+
(so adjacent 1 kb bins never leak between train/test). Always reported against the honest baseline:
|
|
5
|
+
distance-to-nearest-oncogene. Output is a calibrated risk per bin.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import lightgbm as lgb
|
|
10
|
+
import numpy as np
|
|
11
|
+
import pandas as pd
|
|
12
|
+
from sklearn.isotonic import IsotonicRegression
|
|
13
|
+
from sklearn.metrics import average_precision_score, roc_auc_score
|
|
14
|
+
from sklearn.model_selection import GroupKFold
|
|
15
|
+
|
|
16
|
+
from pen_stack.wgenome.features import feature_columns
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _blocks(chrom: pd.Series) -> np.ndarray:
    """Integer group id per row, one id per distinct chromosome label, for grouped CV."""
    as_category = chrom.astype("category")
    group_codes = as_category.cat.codes
    return group_codes.to_numpy()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _new_safety_classifier(scale_pos_weight: float, seed: int):
    """Build the LightGBM classifier shared by the CV folds and the final all-data fit."""
    return lgb.LGBMClassifier(n_estimators=400, learning_rate=0.03, num_leaves=63,
                              subsample=0.8, colsample_bytree=0.8,
                              scale_pos_weight=scale_pos_weight,
                              random_state=seed, n_jobs=-1, verbosity=-1)


def _imbalance_weight(y: np.ndarray) -> float:
    """Negative/positive ratio used as `scale_pos_weight` (positives floored at 1, ratio at 1.0)."""
    pos = max(1, int(y.sum()))
    return max(1.0, (len(y) - pos) / pos)


def train_safety(df: pd.DataFrame, label: str = "genotoxic_cis", n_splits: int = 5,
                 seed: int = 42) -> dict:
    """Train the genotoxicity-risk model and score it with chromosome-grouped cross-validation.

    Args:
        df: feature matrix; must contain the columns selected by `feature_columns`, the `label`
            column, `chrom` (CV grouping) and `dist_oncogene` (baseline).
        label: name of the binary target column.
        n_splits: requested number of CV folds; capped at the number of distinct chromosomes.
        seed: random seed passed to every classifier.

    Returns:
        dict with sample counts, the feature list, AUROC/AUPRC of the calibrated out-of-fold
        predictions and of the distance-to-oncogene baseline, their AUROC delta, feature
        importances sorted descending, the final all-data model (`model`) and the out-of-fold
        calibrated risks (`oof`).

    Raises:
        ValueError: if fewer than two folds are possible (fewer than two chromosomes, or
            `n_splits` < 2).
    """
    feats = feature_columns(df)
    X = df[feats].astype("float32").fillna(0.0)
    y = df[label].astype(int).to_numpy()
    groups = _blocks(df["chrom"])

    n_folds = min(n_splits, len(np.unique(groups)))
    if n_folds < 2:
        raise ValueError(
            "train_safety needs at least 2 chromosome groups and n_splits >= 2 for grouped CV "
            f"(got {len(np.unique(groups))} group(s), n_splits={n_splits})")
    gkf = GroupKFold(n_splits=n_folds)
    oof = np.zeros(len(df), dtype="float64")
    for tr, te in gkf.split(X, y, groups):
        clf = _new_safety_classifier(_imbalance_weight(y[tr]), seed)
        clf.fit(X.iloc[tr], y[tr])
        raw = clf.predict_proba(X.iloc[te])[:, 1]
        # NOTE(review): the isotonic map is fit on the classifier's own training-fold predictions
        # (in-sample), not on held-out scores - confirm this is the intended calibration scheme.
        iso = IsotonicRegression(out_of_bounds="clip")
        raw_tr = clf.predict_proba(X.iloc[tr])[:, 1]
        iso.fit(raw_tr, y[tr])
        oof[te] = iso.transform(raw)

    auroc = roc_auc_score(y, oof)
    auprc = average_precision_score(y, oof)

    # honest baseline: closer to oncogene => riskier (missing distances count as the farthest)
    base = -df["dist_oncogene"].fillna(df["dist_oncogene"].max()).to_numpy()
    auroc_base = roc_auc_score(y, base)
    auprc_base = average_precision_score(y, base)

    # final model on all data (for scoring), + feature importance
    final = _new_safety_classifier(_imbalance_weight(y), seed).fit(X, y)
    imp = dict(sorted(zip(feats, final.feature_importances_.tolist()),
                      key=lambda kv: kv[1], reverse=True))
    return {
        "n": int(len(df)), "n_pos": int(y.sum()), "features": feats,
        "auroc_model": float(auroc), "auprc_model": float(auprc),
        "auroc_baseline": float(auroc_base), "auprc_baseline": float(auprc_base),
        "auroc_delta": float(auroc - auroc_base),
        "feature_importance": imp, "model": final, "oof": oof,
    }
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""WS-C3 - 3D structural-risk via AlphaGenome contact-map deltas.
|
|
2
|
+
|
|
3
|
+
A cassette insertion can rewire 3D contacts and bring a distal enhancer into contact with an oncogene
|
|
4
|
+
promoter (enhancer hijacking). We simulate the insertion with AlphaGenome's `predict_variant` (the cassette
|
|
5
|
+
is the alternate allele - an insertion - so the model applies it to its own reference and handles the
|
|
6
|
+
coordinate shift server-side; no local FASTA needed). We predict the reference and edited 1 Mb contact maps
|
|
7
|
+
and compute:
|
|
8
|
+
|
|
9
|
+
* insulation change at the insertion site (diamond insulation score, ref vs edited);
|
|
10
|
+
* aberrant contact gain between the insertion site and a target oncogene promoter bin.
|
|
11
|
+
|
|
12
|
+
To isolate the *regulatory* effect from the pure coordinate-shift artifact, every metric is reported for a
|
|
13
|
+
strong-enhancer insert AND a length-matched neutral insert; the strong-minus-neutral difference is the
|
|
14
|
+
signal. Output is a `structural_risk` score + flag with a confidence field.
|
|
15
|
+
|
|
16
|
+
GATE G-C: this ships as a FLAG WITH CONFIDENCE, never a hard pass/fail. No ground-truth dataset of
|
|
17
|
+
insertion-induced hijacking exists, so this is NOT validated as a predictor - only sanity-checked on known
|
|
18
|
+
enhancer-hijacking loci (TAL1, LMO2, GFI1B, MYC) where a strong-enhancer insert should raise aberrant
|
|
19
|
+
contacts above a matched neutral insert. Contacts are cell-type-specific (default GM12878, EFO:0002784 -
|
|
20
|
+
K562 has no AlphaGenome Hi-C track); insertion changes coordinates in ways the model was not trained on.
|
|
21
|
+
"""
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import hashlib
|
|
25
|
+
import json
|
|
26
|
+
import urllib.request
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
import numpy as np
|
|
30
|
+
|
|
31
|
+
from pen_stack.wgenome.providers import SEQ_LEN_1MB, AlphaGenomeProvider
|
|
32
|
+
|
|
33
|
+
_ROOT = Path(__file__).resolve().parents[2]
|
|
34
|
+
_CACHE = _ROOT / "data" / "alphagenome_cache"
|
|
35
|
+
HIC_ONTOLOGY = "EFO:0002784" # GM12878 - canonical deep Hi-C; K562 has no AlphaGenome contact track
|
|
36
|
+
CONTACT_BINS = 512 # 1 Mb / 512 ~ 2048 bp per contact bin
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _ucsc_ref(chrom: str, pos: int, length: int = 1, timeout: float = 30.0) -> str:
    """Reference bases [pos, pos+length) on hg38 via the UCSC REST API (cached on disk).

    Args:
        chrom: chromosome name as used by UCSC (e.g. "chr1").
        pos: start coordinate sent as the API's `start` parameter.
        length: number of bases to fetch.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        The upper-cased sequence, from the on-disk cache when present.

    Raises:
        KeyError: if the API response carries no "dna" field.
        OSError: on network failure or timeout (raised by urllib).
    """
    key = f"ucsc_{chrom}_{pos}_{length}"
    f = _CACHE / f"{key}.json"
    if f.exists():
        return json.loads(f.read_text(encoding="utf-8"))["dna"].upper()
    u = (f"https://api.genome.ucsc.edu/getData/sequence?genome=hg38;chrom={chrom};"
         f"start={pos};end={pos + length}")
    # context manager closes the HTTP response; the timeout keeps a stalled request from hanging
    with urllib.request.urlopen(u, timeout=timeout) as resp:  # noqa: S310
        d = json.load(resp)
    dna = d["dna"]  # fail before anything is written to the cache
    _CACHE.mkdir(parents=True, exist_ok=True)
    f.write_text(json.dumps({"dna": dna}), encoding="utf-8")
    return dna.upper()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def strong_enhancer_insert(n: int = 1600) -> str:
    """Simulated strong enhancer of length `n`: a 32-bp TF-motif cluster tiled and truncated."""
    # GGAA(ETS) / GATA / TGACTCA(AP-1) / TGTGGT(RUNX-rc)
    motif = "GGAAGTGATAAGTGACTCAGGAAGTGACCACA"
    copies = n // len(motif) + 1  # one extra copy so the slice below always has enough bases
    tiled = motif * copies
    return tiled[:n]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def neutral_insert(n: int = 1600) -> str:
    """Length-matched neutral insert of length `n`: a 16-bp AT-only unit tiled and truncated."""
    unit = "ATATATTAATTATAAT"
    copies = n // 16 + 1  # one extra copy so the slice below always has enough bases
    tiled = unit * copies
    return tiled[:n]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _contact_matrices(provider: AlphaGenomeProvider, chrom: str, pos: int, insert: str,
                      ontology: str = HIC_ONTOLOGY):
    """Reference + edited (insertion) 1 Mb contact matrices via predict_variant.

    The insertion is encoded VCF-style: the reference allele is the single anchor base at `pos`
    and the alternate allele is that anchor followed by `insert`.

    Args:
        provider: provider whose AlphaGenome client issues the prediction.
        chrom: chromosome of the insertion site.
        pos: variant position, interpreted by `genome.Variant` as 1-based.
        insert: bases inserted after the anchor.
        ontology: ontology term for the contact-map track.

    Returns:
        `(ref, alt)` contact matrices; when the model returns a 3-D array the first entry of the
        last axis is taken.
    """
    from alphagenome.data import genome
    from alphagenome.models import dna_client
    # `genome.Variant.position` is 1-based while `_ucsc_ref` sends its argument as the UCSC
    # 0-based `start`, so the base at 1-based `pos` is the one fetched at 0-based `pos - 1`.
    # Fetching at `pos` would return the next base and turn the anchor into a substitution.
    anchor = _ucsc_ref(chrom, pos - 1, 1)
    var = genome.Variant(chromosome=chrom, position=pos, reference_bases=anchor,
                         alternate_bases=anchor + insert)
    interval = var.reference_interval.resize(SEQ_LEN_1MB)
    out = provider._client().predict_variant(  # noqa: SLF001
        interval=interval, variant=var,
        requested_outputs=[dna_client.OutputType.CONTACT_MAPS], ontology_terms=[ontology])
    ref = np.asarray(out.reference.contact_maps.values)
    alt = np.asarray(out.alternate.contact_maps.values)
    return ref[..., 0] if ref.ndim == 3 else ref, alt[..., 0] if alt.ndim == 3 else alt
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def insulation_score(mat: np.ndarray, idx: int, w: int = 10) -> float:
    """Diamond insulation at `idx`: mean of `mat[idx-w:idx, idx:idx+w]`, clipped to the matrix.

    Returns NaN when either side of the window is empty (e.g. `idx` at or beyond an edge).
    """
    size = mat.shape[0]
    lo = max(0, idx - w)
    hi = min(size, idx + w)
    has_upstream = lo < idx
    has_downstream = hi > idx
    if not (has_upstream and has_downstream):
        return float("nan")
    window = mat[lo:idx, idx:hi]
    return float(window.mean())
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _bin_of(offset_bp: int) -> int:
    """Contact-map bin index for a genomic offset (bp) from the 1 Mb interval centre.

    The centre is the first base of bin `CONTACT_BINS // 2`, so the bin containing an offset is
    the floor of its window coordinate divided by the bin width. Flooring (not rounding) keeps
    offsets in the upper half of a bin in that bin, and maps every in-window offset to
    `0 .. CONTACT_BINS - 1`. Offsets outside the window give indices outside that range.
    """
    bin_width = SEQ_LEN_1MB // CONTACT_BINS
    return int((SEQ_LEN_1MB // 2 + offset_bp) // bin_width)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def structural_risk(chrom: str, site_pos: int, oncogene_pos: int, ontology: str = HIC_ONTOLOGY,
                    provider: AlphaGenomeProvider | None = None, offline: bool = False) -> dict:
    """Strong-enhancer vs neutral insertion at `site_pos`; aberrant contact gain toward `oncogene_pos`.

    For each of the two inserts the reference and edited contact maps are predicted, and the
    change in insulation at the site and in site-to-oncogene contact is recorded. The reported
    signal is the strong-minus-neutral difference in contact gain. Results are cached on disk
    under a key derived from the coordinates, ontology and insert sequences.

    When `oncogene_pos` lies outside the 1 Mb window centred on `site_pos` (on either side) the
    contact values are NaN, which yields `structural_risk` 0.0 and `flag` False rather than a
    reading taken from an unrelated edge bin.

    Returns:
        The cached or freshly computed result dict, or `{"available": False, ...}` when offline
        or when the provider cannot make a live call.
    """
    provider = provider or AlphaGenomeProvider(assembly="hg38")
    ins_strong, ins_neutral = strong_enhancer_insert(), neutral_insert()
    # the insert sequences are part of the key so changing them invalidates old results
    insert_tag = hashlib.sha256((ins_strong + ins_neutral).encode()).hexdigest()[:8]
    key_src = f"struct3d|{chrom}|{site_pos}|{oncogene_pos}|{ontology}|{insert_tag}"
    key = hashlib.sha256(key_src.encode()).hexdigest()[:24]
    cf = _CACHE / f"{key}.json"
    if cf.exists():
        return json.loads(cf.read_text(encoding="utf-8"))
    if offline or not provider.available():
        return {"available": False, "reason": "offline or AlphaGenome key absent", "key": key}

    site_idx = CONTACT_BINS // 2
    offset = oncogene_pos - site_pos
    tgt_idx = _bin_of(offset)
    half_window = SEQ_LEN_1MB // 2
    in_window = -half_window <= offset < half_window
    # clamp only absorbs a boundary index produced for an in-window offset
    t = min(max(tgt_idx, 0), CONTACT_BINS - 1)
    res = {}
    for label, insert in (("strong_enhancer", ins_strong), ("neutral", ins_neutral)):
        ref, alt = _contact_matrices(provider, chrom, site_pos, insert, ontology)
        ins_ref, ins_alt = insulation_score(ref, site_idx), insulation_score(alt, site_idx)
        contact_ref = float(ref[site_idx, t]) if in_window else float("nan")
        contact_alt = float(alt[site_idx, t]) if in_window else float("nan")
        res[label] = {"insulation_change": round(ins_alt - ins_ref, 5),
                      "oncogene_contact_gain": round(contact_alt - contact_ref, 5)}
    gain_strong = res["strong_enhancer"]["oncogene_contact_gain"]
    gain_neutral = res["neutral"]["oncogene_contact_gain"]
    aberrant = gain_strong - gain_neutral
    out = {"available": True, "chrom": chrom, "site_pos": site_pos, "oncogene_pos": oncogene_pos,
           "ontology": ontology, "target_bin_offset": tgt_idx - site_idx,
           "per_insert": res, "aberrant_contact_gain_strong_minus_neutral": round(aberrant, 5),
           "structural_risk": round(float(max(0.0, aberrant)), 5),
           "flag": bool(aberrant > 0),
           "confidence": "heuristic; not a calibrated probability (Gate G-C); sanity-check only",
           "key": key}
    _CACHE.mkdir(parents=True, exist_ok=True)
    cf.write_text(json.dumps(out, default=str), encoding="utf-8")
    return out
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# Known enhancer-hijacking loci (hg38) for the qualitative sanity check. Insertion site placed ~120 kb from
|
|
135
|
+
# the oncogene promoter (within a 1 Mb window / typical TAD reach).
|
|
136
|
+
HIJACK_LOCI = {
|
|
137
|
+
"TAL1": {"chrom": "chr1", "oncogene_pos": 47_209_257, "site_pos": 47_209_257 - 120_000},
|
|
138
|
+
"LMO2": {"chrom": "chr11", "oncogene_pos": 33_859_520, "site_pos": 33_859_520 - 120_000},
|
|
139
|
+
"GFI1B": {"chrom": "chr9", "oncogene_pos": 132_990_996, "site_pos": 132_990_996 - 120_000},
|
|
140
|
+
"MYC": {"chrom": "chr8", "oncogene_pos": 127_735_434, "site_pos": 127_735_434 - 120_000},
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def sanity(ontology: str = HIC_ONTOLOGY, offline: bool = False, out: str | Path | None = None) -> dict:
    """C3 sanity check over `HIJACK_LOCI` (qualitative, not a validated predictor).

    Runs `structural_risk` at every locus and counts how many scored loci show a positive
    strong-minus-neutral contact gain; `sanity_pass` is True when that holds for more than half
    of them. When `out` is given, the per-locus results and the summary are also written there
    as JSON.
    """
    gain_key = "aberrant_contact_gain_strong_minus_neutral"
    provider = AlphaGenomeProvider(assembly="hg38")
    rows = {}
    for name, locus in HIJACK_LOCI.items():
        rows[name] = structural_risk(locus["chrom"], locus["site_pos"], locus["oncogene_pos"],
                                     ontology, provider, offline)

    scored = [row[gain_key] for row in rows.values() if row.get("available")]
    n_positive = sum(1 for gain in scored if gain > 0)

    # per locus: the gain when scored, otherwise the reason it was unavailable
    per_locus = {}
    for name, row in rows.items():
        per_locus[name] = row.get(gain_key) if row.get("available") else row.get("reason")

    report = {"available": bool(scored), "ontology": ontology, "n_loci": len(rows),
              "n_strong_gt_neutral": int(n_positive),
              "per_locus": per_locus,
              "sanity_pass": bool(scored and n_positive > len(scored) / 2),
              "scope": "qualitative sanity check only; ships as a flag with confidence (Gate G-C), never "
                       "a hard pass/fail; contacts are cell-type-specific (GM12878)."}
    if out:
        target = Path(out)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(json.dumps({"loci": rows, "summary": report}, indent=2, default=str),
                          encoding="utf-8")
    return report
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
if __name__ == "__main__": # pragma: no cover
|
|
168
|
+
print(json.dumps(sanity(out=_ROOT / "out" / "structure3d_sanity.json"), indent=2, default=str))
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Writability integration (Phase 1, Step 1.9).
|
|
2
|
+
|
|
3
|
+
Combines the three layers into a transparent, DECOMPOSABLE per-locus writability profile (components
|
|
4
|
+
kept visible; never collapsed into one opaque number):
|
|
5
|
+
|
|
6
|
+
writability = f(safety, durability, reachability)
|
|
7
|
+
|
|
8
|
+
- safety: 1 - P(genotoxic) from the safety model (calibrated risk; safe-harbour discriminating).
|
|
9
|
+
- durability: P(durable | epigenome) = 1 - P(silenced), the mouse-trained conditional function APPLIED
|
|
10
|
+
to the human epigenome's histone marks (the cell-type-transfer the design hinges on).
|
|
11
|
+
- reachability: WT-KB writer set + tier (Tier-1 reprogrammable writers are broadly available at 1 kb;
|
|
12
|
+
fine-grained site choice is a design-time concern handled by the Planner).
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import pickle
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
import pandas as pd
|
|
21
|
+
|
|
22
|
+
from pen_stack.wgenome.features import feature_columns
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def load_pickle(path: str):
    """Load and return the object pickled at `path`.

    NOTE: unpickling can execute arbitrary code - only use on trusted model artifacts.
    """
    with Path(path).open("rb") as handle:
        obj = pickle.load(handle)
    return obj
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def apply_safety(matrix: pd.DataFrame, safety_model) -> np.ndarray:
    """Score every row with the safety model and return safety = 1 - P(genotoxic).

    The feature columns are chosen by `feature_columns`, cast to float32 and missing values are
    filled with 0.0 before prediction.
    """
    cols = feature_columns(matrix)
    X = matrix[cols].astype("float32").fillna(0.0)
    risk = safety_model.predict_proba(X)[:, 1]
    return 1.0 - risk
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def apply_durability(matrix: pd.DataFrame, dur_models: dict) -> tuple[np.ndarray, np.ndarray]:
    """Run the durability regressor and classifier on `matrix`.

    The model input holds exactly `dur_models["features"]`, in that order: a feature present in
    `matrix` is cast to float32, a feature absent from it becomes an all-NaN column (partial
    chromatin panels are therefore accepted instead of raising).

    Returns:
        `(expr, p_durable)`: the output of `dur_models["reg"].predict` and 1 minus the
        second-column probability of `dur_models["clf"].predict_proba`.
    """
    columns = {
        name: (matrix[name].astype("float32") if name in matrix.columns else np.nan)
        for name in dur_models["features"]  # exact training feature set + order
    }
    X = pd.DataFrame(columns, index=matrix.index)
    predicted_expr = dur_models["reg"].predict(X)
    p_silenced = dur_models["clf"].predict_proba(X)[:, 1]
    p_durable = 1.0 - p_silenced
    return predicted_expr, p_durable
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def build_writability(matrix: pd.DataFrame, safety_model, dur_models: dict,
                      w_safety: float = 0.5, w_durability: float = 0.5,
                      out_parquet: str | None = None) -> pd.DataFrame:
    """Assemble the per-bin writability profile, keeping every component as its own column.

    Columns: `chrom`, `bin`, `safety`, `pred_expression`, `p_durable`, `reachable_tier1` and
    `writability = w_safety * safety + w_durability * p_durable`. When `out_parquet` is given the
    table is also written there (parent directories are created).
    """
    profile = matrix[["chrom", "bin"]].copy()
    profile["safety"] = apply_safety(matrix, safety_model)

    predicted_expr, durable_prob = apply_durability(matrix, dur_models)
    profile["pred_expression"] = predicted_expr
    profile["p_durable"] = durable_prob

    # reachability: the same Tier-1 writer set is annotated on every bin
    profile["reachable_tier1"] = "bridge_IS110;Cas9;Cas12a"

    # weighted composite; the components above stay visible alongside it
    weighted_safety = w_safety * profile["safety"]
    weighted_durability = w_durability * profile["p_durable"]
    profile["writability"] = weighted_safety + weighted_durability

    if out_parquet:
        destination = Path(out_parquet)
        destination.parent.mkdir(parents=True, exist_ok=True)
        profile.to_parquet(out_parquet, index=False)
    return profile
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def rank_loci_near(writ_df: pd.DataFrame, chrom: str, start: int, end: int, k: int = 10,
                   bin_size: int = 1000) -> pd.DataFrame:
    """Inverse query seed (Phase-3 Planner): rank writable bins in a window.

    Args:
        writ_df: writability table with `chrom`, `bin` and `writability`.
        chrom: chromosome to search.
        start, end: window in bp; a bin is selected when its start coordinate
            (`bin * bin_size`) lies in `[start, end]`, both ends inclusive.
        k: maximum number of bins to return.
        bin_size: width of one bin in bp (default 1000, the grid used by the stack).

    Returns:
        Up to `k` rows of `writ_df`, sorted by `writability` descending.
    """
    window = writ_df.query(
        "chrom == @chrom and bin*@bin_size >= @start and bin*@bin_size <= @end")
    return window.sort_values("writability", ascending=False).head(k)
|