pen-stack 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. pen_stack/__init__.py +2 -0
  2. pen_stack/_resources.py +34 -0
  3. pen_stack/adapt/__init__.py +14 -0
  4. pen_stack/adapt/finetune.py +33 -0
  5. pen_stack/adapt/ingest.py +86 -0
  6. pen_stack/adapt/pipeline.py +101 -0
  7. pen_stack/adapt/recalibrate.py +58 -0
  8. pen_stack/adapt/report.py +130 -0
  9. pen_stack/agent/__init__.py +1 -0
  10. pen_stack/agent/guardrails.py +49 -0
  11. pen_stack/agent/mcp_server.py +42 -0
  12. pen_stack/agent/orchestrator.py +106 -0
  13. pen_stack/agent/pen_agent.py +169 -0
  14. pen_stack/agent/tools.py +130 -0
  15. pen_stack/atlas/__init__.py +1 -0
  16. pen_stack/atlas/build_wtkb.py +80 -0
  17. pen_stack/atlas/crosslink.py +144 -0
  18. pen_stack/atlas/expand.py +190 -0
  19. pen_stack/atlas/schema.py +59 -0
  20. pen_stack/atlas/scorecard.py +134 -0
  21. pen_stack/atlas/universe.py +75 -0
  22. pen_stack/atlas/variant_propose.py +155 -0
  23. pen_stack/bridge/__init__.py +1 -0
  24. pen_stack/bridge/activity.py +52 -0
  25. pen_stack/bridge/cli.py +65 -0
  26. pen_stack/bridge/fold_qc.py +53 -0
  27. pen_stack/bridge/guide_qc.py +84 -0
  28. pen_stack/bridge/ingest.py +139 -0
  29. pen_stack/bridge/offtarget.py +133 -0
  30. pen_stack/bridge/ortholog_screen.py +73 -0
  31. pen_stack/bridge/pipeline.py +83 -0
  32. pen_stack/cli.py +126 -0
  33. pen_stack/data/__init__.py +1 -0
  34. pen_stack/data/encode.py +84 -0
  35. pen_stack/data/genome.py +71 -0
  36. pen_stack/data/ingest_chromatin.py +119 -0
  37. pen_stack/data/ingest_integration.py +112 -0
  38. pen_stack/data/ingest_safety_annot.py +164 -0
  39. pen_stack/data/ingest_trip.py +76 -0
  40. pen_stack/mech/__init__.py +1 -0
  41. pen_stack/mech/classify_atlas.py +71 -0
  42. pen_stack/mech/whitelist.py +66 -0
  43. pen_stack/monitor/__init__.py +1 -0
  44. pen_stack/monitor/europepmc.py +32 -0
  45. pen_stack/monitor/run.py +57 -0
  46. pen_stack/monitor/triage.py +63 -0
  47. pen_stack/planner/__init__.py +1 -0
  48. pen_stack/planner/cargo.py +56 -0
  49. pen_stack/planner/cargo_polish.py +146 -0
  50. pen_stack/planner/delivery.py +32 -0
  51. pen_stack/planner/multiplex.py +110 -0
  52. pen_stack/planner/optimize.py +156 -0
  53. pen_stack/planner/pipeline.py +86 -0
  54. pen_stack/planner/report.py +26 -0
  55. pen_stack/rag/__init__.py +1 -0
  56. pen_stack/rag/index.py +53 -0
  57. pen_stack/rag/llm.py +178 -0
  58. pen_stack/rag/qa.py +105 -0
  59. pen_stack/score/__init__.py +1 -0
  60. pen_stack/score/recalibrate.py +77 -0
  61. pen_stack/score/therapeutic.py +85 -0
  62. pen_stack/server/__init__.py +1 -0
  63. pen_stack/server/api.py +142 -0
  64. pen_stack/ui/__init__.py +1 -0
  65. pen_stack/ui/app.py +518 -0
  66. pen_stack/validate/__init__.py +1 -0
  67. pen_stack/validate/adapt_demo.py +69 -0
  68. pen_stack/validate/agent_eval.py +117 -0
  69. pen_stack/validate/blind_gsh_discovery.py +165 -0
  70. pen_stack/validate/cargo_directionality.py +57 -0
  71. pen_stack/validate/durability_baselines.py +150 -0
  72. pen_stack/validate/forward_hypotheses.py +104 -0
  73. pen_stack/validate/guide_qc_demo.py +58 -0
  74. pen_stack/validate/intent_specification.py +82 -0
  75. pen_stack/validate/paper3_benchmark.py +165 -0
  76. pen_stack/validate/paper4_real_validation.py +144 -0
  77. pen_stack/validate/paper4_validation.py +82 -0
  78. pen_stack/validate/seq_vs_measured.py +134 -0
  79. pen_stack/validate/within_locus_ranking.py +74 -0
  80. pen_stack/validate/writer_recovery.py +86 -0
  81. pen_stack/wgenome/__init__.py +1 -0
  82. pen_stack/wgenome/chromatin_seq.py +83 -0
  83. pen_stack/wgenome/durability.py +108 -0
  84. pen_stack/wgenome/export_tracks.py +52 -0
  85. pen_stack/wgenome/features.py +82 -0
  86. pen_stack/wgenome/gsh_baseline.py +117 -0
  87. pen_stack/wgenome/providers.py +245 -0
  88. pen_stack/wgenome/safety.py +69 -0
  89. pen_stack/wgenome/structure3d.py +168 -0
  90. pen_stack/wgenome/writability.py +72 -0
  91. pen_stack-3.1.0.dist-info/METADATA +451 -0
  92. pen_stack-3.1.0.dist-info/RECORD +96 -0
  93. pen_stack-3.1.0.dist-info/WHEEL +5 -0
  94. pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
  95. pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
  96. pen_stack-3.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,245 @@
1
+ """External sequence-to-function providers (v3.1, WS-C).
2
+
3
+ AlphaGenomeProvider wraps Google DeepMind's AlphaGenome (free, non-commercial) behind a small, cached,
4
+ provider-agnostic interface so the rest of PEN-STACK never imports `alphagenome` directly. It supplies:
5
+
6
+ * tracks(interval, outputs, ontology) -> per-base predictions (ATAC/DNASE/RNA_SEQ/CHIP_HISTONE/...)
7
+ * expression(interval, ontology) -> scalar endogenous expression proxy (mean RNA_SEQ over interval)
8
+ * contact_map(interval) -> predicted Hi-C contact matrix (3D structural-risk feature, WS-C2)
9
+
10
+ Design rules (match the rest of the stack):
11
+ * The LLM and the provider are NON-load-bearing for reproducibility - every cached value is keyed by an
12
+ explicit (assembly, interval, output, ontology) tuple and written to disk, so a run is reproducible
13
+ offline from the cache without re-querying the API.
14
+ * The API key is read from env (ALPHAGENOME_API_KEY) or a gitignored file; NEVER committed.
15
+ * `alphagenome` is an optional dependency. If the package or key is absent, `available()` is False and the
16
+ dependent baselines (WS-B1, WS-C) report `pending` rather than crashing - the core stack is unaffected.
17
+
18
+ Caching: predictions are large (up to ~1M rows). We cache the *reduced* features we actually consume
19
+ (scalar expression, mean track signal, contact-map summary statistics) as small JSON/parquet, not the raw
20
+ 1 Mb tensors, keyed by a content hash of the request.
21
+ """
22
+ from __future__ import annotations
23
+
24
+ import hashlib
25
+ import json
26
+ import os
27
+ from pathlib import Path
28
+
29
+ _ROOT = Path(__file__).resolve().parents[2]
30
+ _CACHE = _ROOT / "data" / "alphagenome_cache"
31
+ _KEY_FILE = _ROOT / "configs" / "alphagenome_api_key.txt"
32
+ _KEY_ENV = "ALPHAGENOME_API_KEY"
33
+
34
+ # 1 Mb is AlphaGenome's max; expression/structural features use it for full regulatory context.
35
+ SEQ_LEN_1MB = 1_048_576
36
+
37
+ # Model version recorded in track-cache keys + artifacts (C1 reproducibility). Bump when the served model
38
+ # changes so stale predictions are not silently reused.
39
+ MODEL_VERSION = "alphagenome-2025-06"
40
+
41
+ # The seven measured-atlas tracks and their AlphaGenome sources. The five histone marks come from the single
42
+ # CHIP_HISTONE output selected by its `histone_mark` metadata column.
43
+ _HISTONES = ["H3K27ac", "H3K4me1", "H3K4me3", "H3K9me3", "H3K27me3"]
44
+ TRACK_NAMES = ["atac", "dnase", *_HISTONES]
45
+
46
+ # K562 / HepG2 cell-type ontologies (verified against AlphaGenome human output_metadata).
47
+ CT_ONTOLOGY = {"k562": "EFO:0002067", "hepg2": "EFO:0001187"}
48
+
49
+
50
def _resolve_key() -> str | None:
    """Return the AlphaGenome API key, or None when no usable key is configured.

    Lookup order: the environment variable named by `_KEY_ENV`, then the first plausible line of the
    gitignored key file `_KEY_FILE`. A value that is empty after stripping whitespace counts as absent,
    so a blank environment variable falls through to the file instead of yielding an empty key.
    """
    env_value = (os.environ.get(_KEY_ENV) or "").strip()
    if env_value:
        return env_value
    if not _KEY_FILE.exists():
        return None
    for line in _KEY_FILE.read_text(encoding="utf-8").splitlines():
        # Tolerate a key pasted with surrounding quotes / trailing punctuation on either side.
        candidate = line.strip().strip('"\',; ')
        # Skip label lines (those starting with "alphagenome") and anything too short to be a key.
        if candidate and not candidate.lower().startswith("alphagenome") and len(candidate) > 20:
            return candidate
    return None
61
+
62
+
63
def package_available() -> bool:
    """Report whether the optional `alphagenome` package can be imported in this environment."""
    try:
        import alphagenome  # noqa: F401
    except Exception:  # noqa: BLE001
        # Any import-time failure means the package is unusable here.
        return False
    return True
69
+
70
+
71
+ def _cache_key(*parts) -> str:
72
+ return hashlib.sha256("|".join(str(p) for p in parts).encode()).hexdigest()[:24]
73
+
74
+
75
class AlphaGenomeProvider:
    """Cached wrapper around AlphaGenome's dna_client. Construct with `AlphaGenomeProvider()`.

    Every feature method first looks for a JSON record in `cache_dir`, keyed by a hash of the request,
    and only issues a live `predict_interval` call on a cache miss. When a live call is not possible
    (package or key absent) or not wanted (`offline=True`), the method returns
    ``{"available": False, "reason": ..., "key": ...}`` instead of raising.
    """

    def __init__(self, api_key: str | None = None, assembly: str = "hg38", cache_dir: Path = _CACHE):
        """Create the cache directory and resolve the API key (an explicit `api_key` wins)."""
        self.assembly = assembly
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self._key = api_key or _resolve_key()
        self._model = None  # lazily created on first live call

    # -- availability ------------------------------------------------------
    def available(self) -> bool:
        """True when both the package and a key are present (a live call is possible)."""
        return package_available() and self._key is not None

    def _client(self):
        """Return the dna_client model, creating it with the stored key on first use."""
        if self._model is None:
            from alphagenome.models import dna_client
            self._model = dna_client.create(self._key)
        return self._model

    # -- cache helpers -----------------------------------------------------
    def _load(self, key: str):
        """Return the cached record for `key`, or None on a miss.

        A file that cannot be decoded (e.g. truncated by an interrupted write) is treated as a miss so
        one bad entry does not make every later lookup of that key raise.
        """
        f = self.cache_dir / f"{key}.json"
        try:
            return json.loads(f.read_text(encoding="utf-8"))
        except FileNotFoundError:
            return None
        except ValueError:  # json.JSONDecodeError / UnicodeDecodeError
            return None

    def _store(self, key: str, value: dict) -> None:
        """Write `value` as `<key>.json`, via a temp file + rename so readers never see a partial file."""
        target = self.cache_dir / f"{key}.json"
        tmp = target.with_name(f"{target.name}.{os.getpid()}.tmp")
        tmp.write_text(json.dumps(value, default=str), encoding="utf-8")
        os.replace(tmp, target)

    @staticmethod
    def _central_window(values, center_bp: int):
        """Slice the central `center_bp` rows (at least 2) out of the first axis of `values`."""
        import numpy as np
        arr = np.asarray(values)
        mid = arr.shape[0] // 2
        half = max(1, center_bp // 2)
        return arr[max(0, mid - half):mid + half]

    # -- features ----------------------------------------------------------
    def expression(self, chrom: str, start: int, end: int, ontology: str, organism: str = "human",
                   center_bp: int = 20_000, offline: bool = False) -> dict:
        """Scalar endogenous-expression proxy: mean predicted RNA_SEQ in a central window (cached).

        The 1 Mb model context is needed for regulatory reach, but the proxy averages only the central
        `center_bp` (host-locus expression at the integration site) rather than the whole 1 Mb, which would
        wash out the local signal.

        `organism == "mouse"` selects MUS_MUSCULUS; any other value selects HOMO_SAPIENS. Returns the
        cached or freshly computed record, or an ``available: False`` record when offline / unavailable.
        """
        key = _cache_key("expr", self.assembly, organism, chrom, start, end, ontology, center_bp)
        hit = self._load(key)
        if hit is not None:
            return hit
        if offline:
            return {"available": False, "reason": "offline: not in cache", "key": key}
        if not self.available():
            return {"available": False, "reason": "alphagenome package or key absent", "key": key}
        from alphagenome.data import genome
        from alphagenome.models import dna_client
        org = (dna_client.Organism.MUS_MUSCULUS if organism == "mouse"
               else dna_client.Organism.HOMO_SAPIENS)
        interval = genome.Interval(chromosome=chrom, start=start, end=end).resize(SEQ_LEN_1MB)
        out = self._client().predict_interval(
            interval=interval, organism=org,
            requested_outputs=[dna_client.OutputType.RNA_SEQ], ontology_terms=[ontology])
        central = self._central_window(out.rna_seq.values, center_bp)  # (1_048_576, n_tracks) input
        rec = {"available": True, "rna_seq_mean": float(central.mean()),
               "rna_seq_max": float(central.max()), "center_bp": center_bp,
               "chrom": chrom, "start": start, "end": end,
               "ontology": ontology, "organism": organism, "key": key}
        self._store(key, rec)
        return rec

    def tracks(self, chrom: str, bin: int, ct: str, bin_size: int = 1000, center_bp: int = 1000,
               offline: bool = False) -> dict:
        """Predicted values of the seven measured-atlas tracks at a 1 kb bin (central-window mean, cached).

        `ct` is "k562" or "hepg2" (any other value is passed through as the ontology term); the bin centre
        is `bin*bin_size + bin_size//2`, predicted in 1 Mb context.
        Returns {atac, dnase, H3K27ac, H3K4me1, H3K4me3, H3K9me3, H3K27me3, model_version, ...}; a histone
        mark with no matching metadata row is reported as NaN.
        """
        ontology = CT_ONTOLOGY.get(ct.lower(), ct)
        key = _cache_key("tracks", self.assembly, MODEL_VERSION, chrom, bin, ontology, bin_size, center_bp)
        hit = self._load(key)
        if hit is not None:
            return hit
        if offline:
            return {"available": False, "reason": "offline: not in cache", "key": key}
        if not self.available():
            return {"available": False, "reason": "alphagenome package or key absent", "key": key}
        from alphagenome.data import genome
        from alphagenome.models import dna_client
        pos = bin * bin_size + bin_size // 2
        interval = genome.Interval(chromosome=chrom, start=pos, end=pos).resize(SEQ_LEN_1MB)
        out = self._client().predict_interval(
            interval=interval,
            requested_outputs=[dna_client.OutputType.ATAC, dna_client.OutputType.DNASE,
                               dna_client.OutputType.CHIP_HISTONE],
            ontology_terms=[ontology])

        rec = {"available": True, "chrom": chrom, "bin": int(bin), "ct": ct, "ontology": ontology,
               "model_version": MODEL_VERSION, "center_bp": center_bp, "key": key,
               "atac": float(self._central_window(out.atac.values, center_bp).mean()),
               "dnase": float(self._central_window(out.dnase.values, center_bp).mean())}
        ch = out.chip_histone
        # Reset the metadata index so its positions line up with the value columns.
        md, vals = ch.metadata.reset_index(drop=True), self._central_window(ch.values, center_bp)
        for mark in _HISTONES:
            cols = md.index[md["histone_mark"] == mark].to_numpy()
            rec[mark] = float(vals[:, cols].mean()) if len(cols) else float("nan")
        self._store(key, rec)
        return rec

    def contact_map_summary(self, chrom: str, start: int, end: int, ontology: str,
                            offline: bool = False) -> dict:
        """3D structural-risk summary (WS-C2): variance + mean of the predicted contact map (cached).

        `offline=True` mirrors `expression` / `tracks`: on a cache miss it returns an
        ``available: False`` record instead of issuing a live call.
        """
        key = _cache_key("contact", self.assembly, chrom, start, end, ontology)
        hit = self._load(key)
        if hit is not None:
            return hit
        if offline:
            return {"available": False, "reason": "offline: not in cache", "key": key}
        if not self.available():
            return {"available": False, "reason": "alphagenome package or key absent", "key": key}
        from alphagenome.data import genome
        from alphagenome.models import dna_client
        interval = genome.Interval(chromosome=chrom, start=start, end=end).resize(SEQ_LEN_1MB)
        out = self._client().predict_interval(
            interval=interval, requested_outputs=[dna_client.OutputType.CONTACT_MAPS],
            ontology_terms=[ontology])
        import numpy as np
        m = np.asarray(out.contact_maps.values)
        rec = {"available": True, "contact_mean": float(m.mean()), "contact_var": float(m.var()),
               "chrom": chrom, "start": start, "end": end, "ontology": ontology, "key": key}
        self._store(key, rec)
        return rec
208
+
209
+
210
class MeasuredTrackProvider:
    """Measured-ENCODE backbone: serves per-bin values from `phase_1/features/chromatin_{ct}.parquet`.

    Exposes the same `tracks()` call shape as AlphaGenomeProvider so predicted and measured values can be
    compared on identical bins and callers can swap providers without branching.
    """

    _P1 = _ROOT.parent / "phase_1" / "features"

    def __init__(self, ct: str):
        """Load the parquet for cell type `ct` (lower-cased) and index it by (chrom, bin)."""
        import pandas as pd
        self.ct = ct.lower()
        parquet_path = self._P1 / f"chromatin_{self.ct}.parquet"
        table = pd.read_parquet(parquet_path)
        self._df = table.set_index(["chrom", "bin"])

    def available(self) -> bool:
        """Always True: construction already loaded the measured table."""
        return True

    def tracks(self, chrom: str, bin: int, ct: str | None = None, **_: object) -> dict:
        """Return the measured track values for one bin, or an ``available: False`` record.

        `ct` and any extra keyword arguments are accepted for signature compatibility and ignored; the
        cell type fixed at construction is always used.
        """
        bin_idx = int(bin)
        try:
            row = self._df.loc[(chrom, bin_idx)]
        except KeyError:
            return {"available": False, "reason": "bin not in measured grid"}
        rec = {"available": True, "chrom": chrom, "bin": bin_idx, "ct": self.ct}
        # Only the track columns actually present in the table are reported.
        for track in TRACK_NAMES:
            if track in row:
                rec[track] = float(row[track])
        return rec
234
+
235
+
236
def smoke() -> dict:
    """Lightweight readiness probe used by tests/CLI - reports availability without a live call.

    Note that constructing the provider creates the default cache directory if it does not exist.
    """
    provider = AlphaGenomeProvider()
    report = {
        "package_available": package_available(),
        "key_present": _resolve_key() is not None,
        "available": provider.available(),
        "model_version": MODEL_VERSION,
        "track_names": TRACK_NAMES,
        "cache_dir": str(_CACHE),
    }
    return report
242
+
243
+
244
+ if __name__ == "__main__": # pragma: no cover
245
+ print(json.dumps(smoke(), indent=2))
@@ -0,0 +1,69 @@
1
+ """Safety layer (Phase 1, Step 1.6) - calibrated genotoxicity-risk model.
2
+
3
+ Position features -> P(genotoxic) with isotonic calibration and CHROMOSOME-BLOCK cross-validation
4
+ (so adjacent 1 kb bins never leak between train/test). Always reported against the honest baseline:
5
+ distance-to-nearest-oncogene. Output is a calibrated risk per bin.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import lightgbm as lgb
10
+ import numpy as np
11
+ import pandas as pd
12
+ from sklearn.isotonic import IsotonicRegression
13
+ from sklearn.metrics import average_precision_score, roc_auc_score
14
+ from sklearn.model_selection import GroupKFold
15
+
16
+ from pen_stack.wgenome.features import feature_columns
17
+
18
+
19
+ def _blocks(chrom: pd.Series) -> np.ndarray:
20
+ """Chromosome-block groups for leakage-free CV."""
21
+ return chrom.astype("category").cat.codes.to_numpy()
22
+
23
+
24
def _scale_pos_weight(labels: np.ndarray) -> float:
    """Negative/positive ratio for class-imbalance weighting, floored at 1.0 (positives floored at 1)."""
    n_pos = max(1, int(labels.sum()))
    return max(1.0, (len(labels) - n_pos) / n_pos)


def _safety_classifier(spw: float, seed: int):
    """Build the LightGBM classifier used for both the CV folds and the final all-data fit."""
    return lgb.LGBMClassifier(n_estimators=400, learning_rate=0.03, num_leaves=63,
                              subsample=0.8, colsample_bytree=0.8, scale_pos_weight=spw,
                              random_state=seed, n_jobs=-1, verbosity=-1)


def train_safety(df: pd.DataFrame, label: str = "genotoxic_cis", n_splits: int = 5,
                 seed: int = 42) -> dict:
    """Train and evaluate the genotoxicity-risk model with chromosome-grouped cross-validation.

    Out-of-fold scores come from a per-fold LightGBM classifier whose raw probabilities are passed
    through an isotonic calibrator. The calibrator is fit on the classifier's predictions for its own
    training rows (in-sample), not on held-out scores. AUROC/AUPRC of those out-of-fold scores are
    reported next to the distance-to-oncogene baseline.

    Returns a dict with sample counts, the feature list, model and baseline metrics, feature importances
    sorted in descending order, the final classifier fitted on all rows ("model"; this classifier is
    not wrapped in a calibrator), and the out-of-fold scores ("oof").
    """
    feats = feature_columns(df)
    X = df[feats].astype("float32").fillna(0.0)
    y = df[label].astype(int).to_numpy()
    groups = _blocks(df["chrom"])

    # Never ask for more folds than there are chromosome groups.
    n_folds = min(n_splits, len(np.unique(groups)))
    splitter = GroupKFold(n_splits=n_folds)
    oof = np.zeros(len(df), dtype="float64")
    for train_idx, test_idx in splitter.split(X, y, groups):
        fold_model = _safety_classifier(_scale_pos_weight(y[train_idx]), seed)
        fold_model.fit(X.iloc[train_idx], y[train_idx])
        raw_test = fold_model.predict_proba(X.iloc[test_idx])[:, 1]
        # isotonic calibration fit on the training fold's own raw scores
        raw_train = fold_model.predict_proba(X.iloc[train_idx])[:, 1]
        calibrator = IsotonicRegression(out_of_bounds="clip")
        calibrator.fit(raw_train, y[train_idx])
        oof[test_idx] = calibrator.transform(raw_test)

    auroc = roc_auc_score(y, oof)
    auprc = average_precision_score(y, oof)

    # honest baseline: closer to oncogene => riskier (missing distances count as the farthest seen)
    dist = df["dist_oncogene"]
    base = -dist.fillna(dist.max()).to_numpy()
    auroc_base = roc_auc_score(y, base)
    auprc_base = average_precision_score(y, base)

    # final model on all data (for scoring), + feature importance
    final = _safety_classifier(_scale_pos_weight(y), seed).fit(X, y)
    ranked = sorted(zip(feats, final.feature_importances_.tolist()),
                    key=lambda kv: kv[1], reverse=True)
    imp = dict(ranked)
    return {
        "n": int(len(df)), "n_pos": int(y.sum()), "features": feats,
        "auroc_model": float(auroc), "auprc_model": float(auprc),
        "auroc_baseline": float(auroc_base), "auprc_baseline": float(auprc_base),
        "auroc_delta": float(auroc - auroc_base),
        "feature_importance": imp, "model": final, "oof": oof,
    }
@@ -0,0 +1,168 @@
1
+ """WS-C3 - 3D structural-risk via AlphaGenome contact-map deltas.
2
+
3
+ A cassette insertion can rewire 3D contacts and bring a distal enhancer into contact with an oncogene
4
+ promoter (enhancer hijacking). We simulate the insertion with AlphaGenome's `predict_variant` (the cassette
5
+ is the alternate allele - an insertion - so the model applies it to its own reference and handles the
6
+ coordinate shift server-side; no local FASTA needed). We predict the reference and edited 1 Mb contact maps
7
+ and compute:
8
+
9
+ * insulation change at the insertion site (diamond insulation score, ref vs edited);
10
+ * aberrant contact gain between the insertion site and a target oncogene promoter bin.
11
+
12
+ To isolate the *regulatory* effect from the pure coordinate-shift artifact, every metric is reported for a
13
+ strong-enhancer insert AND a length-matched neutral insert; the strong-minus-neutral difference is the
14
+ signal. Output is a `structural_risk` score + flag with a confidence field.
15
+
16
+ GATE G-C: this ships as a FLAG WITH CONFIDENCE, never a hard pass/fail. No ground-truth dataset of
17
+ insertion-induced hijacking exists, so this is NOT validated as a predictor - only sanity-checked on known
18
+ enhancer-hijacking loci (TAL1, LMO2, GFI1B, MYC) where a strong-enhancer insert should raise aberrant
19
+ contacts above a matched neutral insert. Contacts are cell-type-specific (default GM12878, EFO:0002784 -
20
+ K562 has no AlphaGenome Hi-C track); insertion changes coordinates in ways the model was not trained on.
21
+ """
22
+ from __future__ import annotations
23
+
24
+ import hashlib
25
+ import json
26
+ import urllib.request
27
+ from pathlib import Path
28
+
29
+ import numpy as np
30
+
31
+ from pen_stack.wgenome.providers import SEQ_LEN_1MB, AlphaGenomeProvider
32
+
33
+ _ROOT = Path(__file__).resolve().parents[2]
34
+ _CACHE = _ROOT / "data" / "alphagenome_cache"
35
+ HIC_ONTOLOGY = "EFO:0002784" # GM12878 - canonical deep Hi-C; K562 has no AlphaGenome contact track
36
+ CONTACT_BINS = 512 # 1 Mb / 512 ~ 2048 bp per contact bin
37
+
38
+
39
def _ucsc_ref(chrom: str, pos: int, length: int = 1) -> str:
    """Reference bases [pos, pos+length) on hg38 via the UCSC REST API (cached on disk).

    `pos` is sent unchanged as the API's `start` parameter. The upper-cased sequence is returned; a
    cache hit is served from `<_CACHE>/ucsc_<chrom>_<pos>_<length>.json` without any network access.

    Raises:
        KeyError: if the API response carries no "dna" field.
        urllib.error.URLError / TimeoutError: on network failure or timeout.
    """
    key = f"ucsc_{chrom}_{pos}_{length}"
    f = _CACHE / f"{key}.json"
    if f.exists():
        return json.loads(f.read_text(encoding="utf-8"))["dna"].upper()
    u = (f"https://api.genome.ucsc.edu/getData/sequence?genome=hg38;chrom={chrom};"
         f"start={pos};end={pos + length}")
    # Close the HTTP response deterministically and bound the wait so a stalled server cannot hang a run.
    with urllib.request.urlopen(u, timeout=60) as resp:  # noqa: S310
        d = json.load(resp)
    dna = d["dna"]
    _CACHE.mkdir(parents=True, exist_ok=True)
    f.write_text(json.dumps({"dna": dna}), encoding="utf-8")
    return dna.upper()
51
+
52
+
53
def strong_enhancer_insert(n: int = 1600) -> str:
    """Simulated strong enhancer of length `n`: a TF-motif cluster tiled end to end and truncated.

    The repeated unit strings together active-enhancer motif cores (ETS/GATA/AP-1/RUNX).
    """
    # GGAA(ETS) / GATA / TGACTCA(AP-1) / TGTGGT(RUNX-rc)
    motif = "GGAAGTGATAAGTGACTCAGGAAGTGACCACA"
    copies = n // len(motif) + 1  # one extra copy so the tiling always covers n bases
    tiled = motif * copies
    return tiled[:n]
57
+
58
+
59
def neutral_insert(n: int = 1600) -> str:
    """Length-matched neutral insert of length `n`: a 16-base AT-only unit tiled and truncated."""
    unit = "ATATATTAATTATAAT"
    copies = n // 16 + 1  # one extra copy so the tiling always covers n bases
    tiled = unit * copies
    return tiled[:n]
62
+
63
+
64
def _contact_matrices(provider: AlphaGenomeProvider, chrom: str, pos: int, insert: str,
                      ontology: str = HIC_ONTOLOGY):
    """Reference + edited (insertion) 1 Mb contact matrices via predict_variant.

    The insertion is encoded VCF-style: the reference allele is the single anchor base at `pos` and the
    alternate allele is that anchor followed by `insert`. Returns `(ref, alt)`; when the model returns a
    3-D array only the first slice along the last axis is kept.
    """
    from alphagenome.data import genome
    from alphagenome.models import dna_client
    # `genome.Variant.position` is 1-based (per the AlphaGenome API docs) while `_ucsc_ref` takes a
    # 0-based start, so the base AT the variant position is the one starting at `pos - 1`. Fetching at
    # `pos` would hand the model the neighbouring base as its reference allele.
    anchor = _ucsc_ref(chrom, pos - 1, 1)
    var = genome.Variant(chromosome=chrom, position=pos, reference_bases=anchor,
                         alternate_bases=anchor + insert)
    interval = var.reference_interval.resize(SEQ_LEN_1MB)
    out = provider._client().predict_variant(  # noqa: SLF001
        interval=interval, variant=var,
        requested_outputs=[dna_client.OutputType.CONTACT_MAPS], ontology_terms=[ontology])

    def _first_track(mat: np.ndarray) -> np.ndarray:
        # Drop the trailing track axis when present so callers always index a 2-D matrix.
        return mat[..., 0] if mat.ndim == 3 else mat

    ref = np.asarray(out.reference.contact_maps.values)
    alt = np.asarray(out.alternate.contact_maps.values)
    return _first_track(ref), _first_track(alt)
79
+
80
+
81
def insulation_score(mat: np.ndarray, idx: int, w: int = 10) -> float:
    """Diamond insulation at `idx`: mean contact between the `w` bins before it and the `w` bins from it.

    The window is rows `[idx - w, idx)` against columns `[idx, idx + w)`, clipped to the matrix. Returns
    NaN when either side of the window is empty (e.g. `idx` on or beyond a matrix edge, or `w <= 0`).
    """
    size = mat.shape[0]
    lo = max(0, idx - w)
    hi = min(size, idx + w)
    if lo < idx < hi:
        return float(mat[lo:idx, idx:hi].mean())
    return float("nan")
88
+
89
+
90
def _bin_of(offset_bp: int) -> int:
    """Contact-map bin index for a genomic offset (bp) from the 1 Mb interval centre.

    The centre maps to `CONTACT_BINS / 2`; the result is rounded to the nearest bin and is NOT clipped
    to the matrix, so callers must range-check it.
    """
    bp_per_bin = SEQ_LEN_1MB / CONTACT_BINS
    centre_bin = CONTACT_BINS / 2
    return int(round(centre_bin + offset_bp / bp_per_bin))
93
+
94
+
95
def structural_risk(chrom: str, site_pos: int, oncogene_pos: int, ontology: str = HIC_ONTOLOGY,
                    provider: AlphaGenomeProvider | None = None, offline: bool = False) -> dict:
    """Strong-enhancer vs neutral insertion at `site_pos`; aberrant contact gain toward `oncogene_pos`.

    For each of the two inserts the reference and edited contact matrices are predicted, and the change
    in insulation at the insertion bin plus the change in contact between the insertion bin and the
    oncogene bin are recorded. The headline signal is the strong-minus-neutral contact gain.

    Results are cached as JSON under `_CACHE`, keyed by locus, ontology and a hash of the two insert
    sequences. On a cache miss with `offline=True` or an unavailable provider, an
    ``available: False`` record is returned.

    When `oncogene_pos` lies outside the predicted window in EITHER direction the contact values are
    NaN and `target_in_window` is False; `structural_risk` is then 0.0 and `flag` False, which means
    "not computable", not "safe".
    """
    if provider is None:
        provider = AlphaGenomeProvider(assembly="hg38")
    ins_strong, ins_neutral = strong_enhancer_insert(), neutral_insert()
    insert_tag = hashlib.sha256((ins_strong + ins_neutral).encode()).hexdigest()[:8]
    key_src = f"struct3d|{chrom}|{site_pos}|{oncogene_pos}|{ontology}|{insert_tag}"
    key = hashlib.sha256(key_src.encode()).hexdigest()[:24]
    cf = _CACHE / f"{key}.json"
    if cf.exists():
        return json.loads(cf.read_text(encoding="utf-8"))
    if offline or not provider.available():
        return {"available": False, "reason": "offline or AlphaGenome key absent", "key": key}

    site_idx = CONTACT_BINS // 2
    tgt_idx = _bin_of(oncogene_pos - site_pos)
    # `tgt_idx == CONTACT_BINS` is rounding at the window's right edge and is clamped to the last bin.
    # Anything further out is outside the matrix on that side and must yield NaN, exactly like a
    # negative index does - never the value of an unrelated edge bin.
    target_in_window = 0 <= tgt_idx <= CONTACT_BINS
    t = min(tgt_idx, CONTACT_BINS - 1)
    nan = float("nan")
    res = {}
    for label, insert in (("strong_enhancer", ins_strong), ("neutral", ins_neutral)):
        ref, alt = _contact_matrices(provider, chrom, site_pos, insert, ontology)
        ins_ref, ins_alt = insulation_score(ref, site_idx), insulation_score(alt, site_idx)
        contact_ref = float(ref[site_idx, t]) if target_in_window else nan
        contact_alt = float(alt[site_idx, t]) if target_in_window else nan
        res[label] = {"insulation_change": round(ins_alt - ins_ref, 5),
                      "oncogene_contact_gain": round(contact_alt - contact_ref, 5)}
    gain_strong = res["strong_enhancer"]["oncogene_contact_gain"]
    gain_neutral = res["neutral"]["oncogene_contact_gain"]
    aberrant = gain_strong - gain_neutral
    out = {"available": True, "chrom": chrom, "site_pos": site_pos, "oncogene_pos": oncogene_pos,
           "ontology": ontology, "target_bin_offset": tgt_idx - site_idx,
           "target_in_window": bool(target_in_window),
           "per_insert": res, "aberrant_contact_gain_strong_minus_neutral": round(aberrant, 5),
           "structural_risk": round(float(max(0.0, aberrant)), 5),
           "flag": bool(aberrant > 0),
           "confidence": "heuristic; not a calibrated probability (Gate G-C); sanity-check only",
           "key": key}
    _CACHE.mkdir(parents=True, exist_ok=True)
    cf.write_text(json.dumps(out, default=str), encoding="utf-8")
    return out
132
+
133
+
134
+ # Known enhancer-hijacking loci (hg38) for the qualitative sanity check. Insertion site placed ~120 kb from
135
+ # the oncogene promoter (within a 1 Mb window / typical TAD reach).
136
+ HIJACK_LOCI = {
137
+ "TAL1": {"chrom": "chr1", "oncogene_pos": 47_209_257, "site_pos": 47_209_257 - 120_000},
138
+ "LMO2": {"chrom": "chr11", "oncogene_pos": 33_859_520, "site_pos": 33_859_520 - 120_000},
139
+ "GFI1B": {"chrom": "chr9", "oncogene_pos": 132_990_996, "site_pos": 132_990_996 - 120_000},
140
+ "MYC": {"chrom": "chr8", "oncogene_pos": 127_735_434, "site_pos": 127_735_434 - 120_000},
141
+ }
142
+
143
+
144
def sanity(ontology: str = HIC_ONTOLOGY, offline: bool = False, out: str | Path | None = None) -> dict:
    """C3 sanity check across the known hijacking loci in `HIJACK_LOCI`.

    Runs `structural_risk` at every locus with one shared provider and summarises at how many scored
    loci the strong-enhancer insert raised contacts above the matched neutral insert. `sanity_pass` is
    True when that holds at more than half of the scored loci (qualitative, not a validated predictor).
    When `out` is given, the per-locus records and the summary are also written there as JSON.
    """
    provider = AlphaGenomeProvider(assembly="hg38")
    rows = {
        name: structural_risk(c["chrom"], c["site_pos"], c["oncogene_pos"], ontology, provider, offline)
        for name, c in HIJACK_LOCI.items()
    }
    gain_key = "aberrant_contact_gain_strong_minus_neutral"
    scored = [v[gain_key] for v in rows.values() if v.get("available")]
    n_positive = sum(1 for s in scored if s > 0)
    # Per locus: the gain when it could be scored, otherwise the reason it could not.
    per_locus = {k: (v.get(gain_key) if v.get("available") else v.get("reason"))
                 for k, v in rows.items()}
    report = {"available": bool(scored), "ontology": ontology, "n_loci": len(rows),
              "n_strong_gt_neutral": int(n_positive),
              "per_locus": per_locus,
              "sanity_pass": bool(scored and n_positive > len(scored) / 2),
              "scope": "qualitative sanity check only; ships as a flag with confidence (Gate G-C), never "
                       "a hard pass/fail; contacts are cell-type-specific (GM12878)."}
    if out:
        destination = Path(out)
        destination.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps({"loci": rows, "summary": report}, indent=2, default=str)
        destination.write_text(payload, encoding="utf-8")
    return report
165
+
166
+
167
+ if __name__ == "__main__": # pragma: no cover
168
+ print(json.dumps(sanity(out=_ROOT / "out" / "structure3d_sanity.json"), indent=2, default=str))
@@ -0,0 +1,72 @@
1
+ """Writability integration (Phase 1, Step 1.9).
2
+
3
+ Combines the three layers into a transparent, DECOMPOSABLE per-locus writability profile (components
4
+ kept visible; never collapsed into one opaque number):
5
+
6
+ writability = f(safety, durability, reachability)
7
+
8
+ - safety: 1 - P(genotoxic) from the safety model (calibrated risk; safe-harbour discriminating).
9
+ - durability: P(durable | epigenome) = 1 - P(silenced), the mouse-trained conditional function APPLIED
10
+ to the human epigenome's histone marks (the cell-type-transfer the design hinges on).
11
+ - reachability: WT-KB writer set + tier (Tier-1 reprogrammable writers are broadly available at 1 kb;
12
+ fine-grained site choice is a design-time concern handled by the Planner).
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import pickle
17
+ from pathlib import Path
18
+
19
+ import numpy as np
20
+ import pandas as pd
21
+
22
+ from pen_stack.wgenome.features import feature_columns
23
+
24
+
25
def load_pickle(path: str):
    """Load and return the object pickled at `path`.

    NOTE(review): unpickling executes arbitrary code from the file - only use on model artifacts
    produced by this project, never on untrusted input.
    """
    with Path(path).open("rb") as stream:
        return pickle.load(stream)
28
+
29
+
30
def apply_safety(matrix: pd.DataFrame, safety_model) -> np.ndarray:
    """Score every row of `matrix` with `safety_model` and return safety = 1 - P(genotoxic).

    The feature columns come from `feature_columns(matrix)`; they are cast to float32 with missing
    values filled as 0.0 before the model's positive-class probability is taken.
    """
    columns = feature_columns(matrix)
    design = matrix[columns].astype("float32").fillna(0.0)
    risk = safety_model.predict_proba(design)[:, 1]
    return 1.0 - risk  # safety = 1 - risk
34
+
35
+
36
def apply_durability(matrix: pd.DataFrame, dur_models: dict) -> tuple[np.ndarray, np.ndarray]:
    """Apply the mouse-trained conditional function to the human epigenome's histone marks.

    Builds the design matrix with exactly the columns listed in `dur_models["features"]`, in that
    order. Features present in `matrix` are cast to float32; features absent from it (partial
    chromatin panels, e.g. a cell type lacking some tracks) are filled with NaN, which the models are
    expected to handle natively - graceful degradation under partial annotation, by design.

    Returns `(pred_expression, p_durable)` where `p_durable = 1 - P(silenced)`.
    """
    available = matrix.columns
    columns = {
        name: (matrix[name].astype("float32").to_numpy() if name in available else np.nan)
        for name in dur_models["features"]  # exact training feature set + order
    }
    X = pd.DataFrame(columns, index=matrix.index)
    expr = dur_models["reg"].predict(X)
    p_silenced = dur_models["clf"].predict_proba(X)[:, 1]
    p_durable = 1.0 - p_silenced
    return expr, p_durable  # predicted expression, P(durable)
49
+
50
+
51
def build_writability(matrix: pd.DataFrame, safety_model, dur_models: dict,
                      w_safety: float = 0.5, w_durability: float = 0.5,
                      out_parquet: str | None = None) -> pd.DataFrame:
    """Assemble the per-bin writability profile, keeping every component column visible.

    Output columns, in order: chrom, bin, safety, pred_expression, p_durable, reachable_tier1,
    writability. `writability` is the weighted sum `w_safety * safety + w_durability * p_durable`.
    When `out_parquet` is given the frame is also written there (parent directories are created).
    """
    profile = matrix[["chrom", "bin"]].copy()
    profile["safety"] = apply_safety(matrix, safety_model)
    pred_expr, p_durable = apply_durability(matrix, dur_models)
    profile["pred_expression"] = pred_expr
    profile["p_durable"] = p_durable
    # reachability: Tier-1 reprogrammable writers broadly available at locus level (honest annotation)
    profile["reachable_tier1"] = "bridge_IS110;Cas9;Cas12a"
    # decomposable composite (documented weights; components above stay visible)
    weighted_safety = w_safety * profile["safety"]
    weighted_durability = w_durability * profile["p_durable"]
    profile["writability"] = weighted_safety + weighted_durability
    if out_parquet:
        destination = Path(out_parquet)
        destination.parent.mkdir(parents=True, exist_ok=True)
        profile.to_parquet(out_parquet, index=False)
    return profile
67
+
68
+
69
def rank_loci_near(writ_df: pd.DataFrame, chrom: str, start: int, end: int, k: int = 10,
                   bin_size: int = 1000) -> pd.DataFrame:
    """Inverse query seed (Phase-3 Planner): rank writable bins in a window.

    Selects rows on `chrom` whose bin START coordinate (`bin * bin_size`) lies in `[start, end]`
    (inclusive on both ends) and returns the `k` rows with the highest `writability`, best first.
    A bin that merely overlaps `start` but begins before it is not selected.

    Args:
        writ_df: frame with at least `chrom`, `bin` and `writability` columns.
        chrom, start, end: query window in bp.
        k: maximum number of rows returned.
        bin_size: width of one bin in bp; the default 1000 matches the 1 kb grid.

    Raises:
        ValueError: if `bin_size` is not positive.
    """
    if bin_size <= 0:
        raise ValueError("bin_size must be a positive number of base pairs")
    bin_start = writ_df["bin"] * bin_size
    in_window = (writ_df["chrom"] == chrom) & (bin_start >= start) & (bin_start <= end)
    return writ_df[in_window].sort_values("writability", ascending=False).head(k)