pen-stack 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pen_stack/__init__.py +2 -0
- pen_stack/_resources.py +34 -0
- pen_stack/adapt/__init__.py +14 -0
- pen_stack/adapt/finetune.py +33 -0
- pen_stack/adapt/ingest.py +86 -0
- pen_stack/adapt/pipeline.py +101 -0
- pen_stack/adapt/recalibrate.py +58 -0
- pen_stack/adapt/report.py +130 -0
- pen_stack/agent/__init__.py +1 -0
- pen_stack/agent/guardrails.py +49 -0
- pen_stack/agent/mcp_server.py +42 -0
- pen_stack/agent/orchestrator.py +106 -0
- pen_stack/agent/pen_agent.py +169 -0
- pen_stack/agent/tools.py +130 -0
- pen_stack/atlas/__init__.py +1 -0
- pen_stack/atlas/build_wtkb.py +80 -0
- pen_stack/atlas/crosslink.py +144 -0
- pen_stack/atlas/expand.py +190 -0
- pen_stack/atlas/schema.py +59 -0
- pen_stack/atlas/scorecard.py +134 -0
- pen_stack/atlas/universe.py +75 -0
- pen_stack/atlas/variant_propose.py +155 -0
- pen_stack/bridge/__init__.py +1 -0
- pen_stack/bridge/activity.py +52 -0
- pen_stack/bridge/cli.py +65 -0
- pen_stack/bridge/fold_qc.py +53 -0
- pen_stack/bridge/guide_qc.py +84 -0
- pen_stack/bridge/ingest.py +139 -0
- pen_stack/bridge/offtarget.py +133 -0
- pen_stack/bridge/ortholog_screen.py +73 -0
- pen_stack/bridge/pipeline.py +83 -0
- pen_stack/cli.py +126 -0
- pen_stack/data/__init__.py +1 -0
- pen_stack/data/encode.py +84 -0
- pen_stack/data/genome.py +71 -0
- pen_stack/data/ingest_chromatin.py +119 -0
- pen_stack/data/ingest_integration.py +112 -0
- pen_stack/data/ingest_safety_annot.py +164 -0
- pen_stack/data/ingest_trip.py +76 -0
- pen_stack/mech/__init__.py +1 -0
- pen_stack/mech/classify_atlas.py +71 -0
- pen_stack/mech/whitelist.py +66 -0
- pen_stack/monitor/__init__.py +1 -0
- pen_stack/monitor/europepmc.py +32 -0
- pen_stack/monitor/run.py +57 -0
- pen_stack/monitor/triage.py +63 -0
- pen_stack/planner/__init__.py +1 -0
- pen_stack/planner/cargo.py +56 -0
- pen_stack/planner/cargo_polish.py +146 -0
- pen_stack/planner/delivery.py +32 -0
- pen_stack/planner/multiplex.py +110 -0
- pen_stack/planner/optimize.py +156 -0
- pen_stack/planner/pipeline.py +86 -0
- pen_stack/planner/report.py +26 -0
- pen_stack/rag/__init__.py +1 -0
- pen_stack/rag/index.py +53 -0
- pen_stack/rag/llm.py +178 -0
- pen_stack/rag/qa.py +105 -0
- pen_stack/score/__init__.py +1 -0
- pen_stack/score/recalibrate.py +77 -0
- pen_stack/score/therapeutic.py +85 -0
- pen_stack/server/__init__.py +1 -0
- pen_stack/server/api.py +142 -0
- pen_stack/ui/__init__.py +1 -0
- pen_stack/ui/app.py +518 -0
- pen_stack/validate/__init__.py +1 -0
- pen_stack/validate/adapt_demo.py +69 -0
- pen_stack/validate/agent_eval.py +117 -0
- pen_stack/validate/blind_gsh_discovery.py +165 -0
- pen_stack/validate/cargo_directionality.py +57 -0
- pen_stack/validate/durability_baselines.py +150 -0
- pen_stack/validate/forward_hypotheses.py +104 -0
- pen_stack/validate/guide_qc_demo.py +58 -0
- pen_stack/validate/intent_specification.py +82 -0
- pen_stack/validate/paper3_benchmark.py +165 -0
- pen_stack/validate/paper4_real_validation.py +144 -0
- pen_stack/validate/paper4_validation.py +82 -0
- pen_stack/validate/seq_vs_measured.py +134 -0
- pen_stack/validate/within_locus_ranking.py +74 -0
- pen_stack/validate/writer_recovery.py +86 -0
- pen_stack/wgenome/__init__.py +1 -0
- pen_stack/wgenome/chromatin_seq.py +83 -0
- pen_stack/wgenome/durability.py +108 -0
- pen_stack/wgenome/export_tracks.py +52 -0
- pen_stack/wgenome/features.py +82 -0
- pen_stack/wgenome/gsh_baseline.py +117 -0
- pen_stack/wgenome/providers.py +245 -0
- pen_stack/wgenome/safety.py +69 -0
- pen_stack/wgenome/structure3d.py +168 -0
- pen_stack/wgenome/writability.py +72 -0
- pen_stack-3.1.0.dist-info/METADATA +451 -0
- pen_stack-3.1.0.dist-info/RECORD +96 -0
- pen_stack-3.1.0.dist-info/WHEEL +5 -0
- pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
- pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
- pen_stack-3.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Expand the Writer Atlas across families (Phase 2, Step 2.1).
|
|
2
|
+
|
|
3
|
+
Grow the Phase-0 curated 8-family core into a comprehensive cross-family catalogue: ingest ortholog
|
|
4
|
+
sets at scale (the IS110/IS1111 superfamily, CAST, large serine integrases, Cas12a, TnpB/Fanzor) from
|
|
5
|
+
UniProt, place every entry on the WT-KB targeting axes by *inheriting* family-level metadata from the
|
|
6
|
+
Phase-0 ``wtkb.parquet`` (single source of truth - the classifier/scorer must not re-derive it), and
|
|
7
|
+
tag each row with an explicit ``confidence`` (measured / inferred / predicted) and provenance.
|
|
8
|
+
|
|
9
|
+
Heavy per-ortholog featurisation (ESM embeddings for mechanism classification at scale, Step 2.2) runs
|
|
10
|
+
in Docker on the GPU; this module only assembles the *catalogue* metadata (lightweight, network-bound).
|
|
11
|
+
|
|
12
|
+
Inputs : configs/atlas_families.yaml, pen_stack/atlas/wtkb.parquet, UniProt REST.
|
|
13
|
+
Outputs: pen_stack/atlas/atlas.parquet (one row per system), cached TSVs under data/external/atlas/.
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import time
|
|
18
|
+
import urllib.parse
|
|
19
|
+
import urllib.request
|
|
20
|
+
from io import StringIO
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
import pandas as pd
|
|
24
|
+
import yaml
|
|
25
|
+
|
|
26
|
+
_ROOT = Path(__file__).resolve().parents[2]
|
|
27
|
+
_CFG = _ROOT / "configs" / "atlas_families.yaml"
|
|
28
|
+
_WTKB = _ROOT / "pen_stack" / "atlas" / "wtkb.parquet"
|
|
29
|
+
_CACHE = _ROOT / "data" / "external" / "atlas"
|
|
30
|
+
_OUT = _ROOT / "pen_stack" / "atlas" / "atlas.parquet"
|
|
31
|
+
|
|
32
|
+
_UNIPROT_STREAM = "https://rest.uniprot.org/uniprotkb/stream"
|
|
33
|
+
|
|
34
|
+
# WT-KB family-level fields every atlas row inherits (so targeting metadata has ONE source).
|
|
35
|
+
_INHERIT = [
|
|
36
|
+
"mechanism_bucket", "targeting_modality", "target_site_spec", "guide_architecture",
|
|
37
|
+
"cargo_mechanism", "cargo_capacity_bp", "dsb_free", "reachability_tier",
|
|
38
|
+
"reachability_constraints",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def load_config(path: str | Path = _CFG) -> dict:
    """Read the atlas-families YAML at ``path`` (UTF-8) and return the parsed document."""
    cfg_text = Path(path).read_text(encoding="utf-8")
    return yaml.safe_load(cfg_text)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def fetch_uniprot(query: str, fields: str, cache: Path, timeout: int = 120,
                  retries: int = 3) -> pd.DataFrame:
    """Stream a UniProt query to TSV (cached). Returns the raw per-accession metadata frame.

    If ``cache`` already exists it is read and returned without touching the network.
    Otherwise the stream endpoint is requested up to ``retries`` times with a linearly
    growing pause between attempts; the first response that parses as TSV is written to
    ``cache`` and returned. Every column is read as ``str``.

    Raises ``RuntimeError`` (chained to the last underlying error) when no attempt succeeds.
    """
    cache.parent.mkdir(parents=True, exist_ok=True)
    if cache.exists():
        return pd.read_csv(cache, sep="\t", dtype=str)
    params = {"query": query, "format": "tsv", "fields": fields}
    url = _UNIPROT_STREAM + "?" + urllib.parse.urlencode(params)
    last: Exception | None = None
    for attempt in range(retries):
        try:
            with urllib.request.urlopen(url, timeout=timeout) as r:
                text = r.read().decode("utf-8")
            # parse BEFORE caching so an unparseable payload is never persisted
            df = pd.read_csv(StringIO(text), sep="\t", dtype=str)
            # write to a sibling temp file and rename, so an interrupted run cannot leave a
            # truncated cache that every later call would silently reuse
            tmp = cache.with_name(cache.name + ".tmp")
            tmp.write_text(text, encoding="utf-8")
            tmp.replace(cache)
            return df
        except Exception as e:  # noqa: BLE001 - network is best-effort; surfaced after retries
            last = e
            if attempt + 1 < retries:  # no point sleeping once we are out of attempts
                time.sleep(2 * (attempt + 1))
    raise RuntimeError(f"UniProt fetch failed for query {query!r}: {last}") from last
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _orthologs_for_family(fam_key: str, fam: dict, fields: str, wtkb_row: pd.Series,
                          cache_dir: Path) -> pd.DataFrame:
    """Build the ortholog rows for one configured family.

    Fetches (or reads from cache) the family's UniProt hits, keeps at most ``fam["cap"]`` of
    them, and stamps each row with the family constants from ``fam`` plus the ``_INHERIT``
    targeting fields copied from ``wtkb_row``.
    """
    hits = fetch_uniprot(fam["query"], fields, cache_dir / f"{fam_key}.tsv")
    limit = int(fam.get("cap", len(hits)))
    hits = hits.head(limit).copy()

    # UniProt TSV column names (from the requested fields)
    accession = hits.get("Entry")
    table = pd.DataFrame({
        "representative_system": accession,
        "uniprot": accession,
        "organism": hits.get("Organism"),
        "length_aa": pd.to_numeric(hits.get("Length"), errors="coerce"),
    })
    n_rows = len(table)
    table["family"] = fam["wtkb_family"]
    table["pfam_signature"] = [list(fam["pfam_signature"])] * n_rows
    table["confidence"] = fam.get("default_confidence", "predicted")
    table["human_cell_activity"] = "not measured (sequence homolog)"
    table["key_dois"] = [[fam["discovery_doi"]]] * n_rows
    table["entry_kind"] = "ortholog"
    # targeting metadata comes from the WT-KB family row only (single source of truth)
    for field in _INHERIT:
        table[field] = wtkb_row.get(field)
    return table
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _curated_rows(cfg: dict, wtkb: pd.DataFrame) -> pd.DataFrame:
    """Return the named, characterised systems as atlas rows.

    Two groups are emitted, in this order: one ``curated_core`` row per WT-KB row, then one
    ``curated_rep`` row per entry of ``cfg["curated_representatives"]``, each inheriting the
    ``_INHERIT`` fields from the WT-KB row of its family (``None`` when the family is unknown).
    """
    records = []

    # (a) the WT-KB curated core - measured/inferred, full targeting spec
    for _, core in wtkb.iterrows():
        core_sig = core["pfam_signature"] if core.get("pfam_signature") is not None else None
        core_dois = core["key_dois"] if core.get("key_dois") is not None else None
        record = {
            "representative_system": core["representative_system"],
            "uniprot": core.get("uniprot"),
            "organism": None,
            "length_aa": core.get("length_aa"),
            "family": core["family"],
            "pfam_signature": list(core_sig) if core_sig is not None else [],
            "confidence": core.get("confidence", "measured"),
            "human_cell_activity": core.get("human_cell_activity"),
            "key_dois": list(core_dois) if core_dois is not None else [],
            "entry_kind": "curated_core",
        }
        record.update({field: core.get(field) for field in _INHERIT})
        records.append(record)

    # (b) extra curated representatives (named systems w/o a clean single-Pfam query)
    by_family = wtkb.set_index("family")
    for rep in cfg.get("curated_representatives", []):
        fam = rep["family"]
        if fam in by_family.index:
            parent = by_family.loc[fam]
        else:
            parent = pd.Series(dtype=object)
        if isinstance(parent, pd.DataFrame):
            # several WT-KB rows share this family: take the first one
            parent = parent.iloc[0]
        known = bool(len(parent))
        parent_sig = parent.get("pfam_signature") if known else None
        record = {
            "representative_system": rep["representative_system"],
            "uniprot": rep.get("uniprot"),
            "organism": None,
            "length_aa": rep.get("length_aa") or (parent.get("length_aa") if known else None),
            "family": fam,
            "pfam_signature": list(parent_sig) if parent_sig is not None else [],
            "confidence": rep.get("confidence", "inferred"),
            "human_cell_activity": rep.get("human_cell_activity"),
            "key_dois": list(rep.get("key_dois", [])),
            "entry_kind": "curated_rep",
        }
        record.update({field: (parent.get(field) if known else None) for field in _INHERIT})
        records.append(record)

    return pd.DataFrame(records)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def confidence_tag(row: pd.Series) -> str:
    """measured (human-cell data) > inferred (characterised, non-human) > predicted (homolog only).

    An explicit, recognised ``confidence`` on the row is returned unchanged (as a plain string).
    Otherwise the tag falls back to ``"measured"`` when ``human_cell_activity`` mentions
    "human cell" without "not measured", and to ``"predicted"`` in every other case - including
    a missing, ``None`` or NaN ``human_cell_activity``.
    """
    level = row.get("confidence")
    # unwrap Enum members: a set lookup would miss them because Enum hashes by member name
    level = getattr(level, "value", level)
    if isinstance(level, str) and level in ("measured", "inferred", "predicted"):
        return level
    activity = row.get("human_cell_activity")
    # NaN is truthy, so `(value or "")` is not enough to guard the .lower() call
    hca = activity.lower() if isinstance(activity, str) else ""
    if "human cell" in hca and "not measured" not in hca:
        return "measured"
    return "predicted"
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def build_atlas(cfg_path: str | Path = _CFG, wtkb_path: str | Path = _WTKB,
                out: str | Path = _OUT, cache_dir: str | Path = _CACHE,
                offline_ok: bool = False) -> pd.DataFrame:
    """Assemble the atlas (curated rows + per-family orthologs), write it to ``out``, return it.

    With ``offline_ok=True`` a family whose ortholog table cannot be built is reported on
    stdout and skipped; otherwise the error propagates. Rows sharing a UniProt accession are
    collapsed to the highest-priority one (curated core > curated rep > ortholog); among rows
    of equal priority the one that appeared first (curated order, then config family order)
    is kept.
    """
    cfg = load_config(cfg_path)
    fields = cfg["defaults"]["uniprot_fields"]
    wtkb = pd.read_parquet(wtkb_path)
    wt_by_fam = wtkb.drop_duplicates("family").set_index("family")
    cache_dir = Path(cache_dir)

    tables: list[pd.DataFrame] = [_curated_rows(cfg, wtkb)]
    for fam_key, fam in cfg["families"].items():
        wrow = wt_by_fam.loc[fam["wtkb_family"]] if fam["wtkb_family"] in wt_by_fam.index else pd.Series(dtype=object)
        try:
            tables.append(_orthologs_for_family(fam_key, fam, fields, wrow, cache_dir))
        except Exception as e:  # noqa: BLE001
            if offline_ok:
                print(f"[expand] skip {fam_key} (offline_ok): {e}")
                continue
            raise

    atlas = pd.concat(tables, ignore_index=True)
    # named curated rows win over a homolog row for the same accession. Dedup ONLY among rows that
    # carry a UniProt id - rows without one (seekRNA, PASTE, ShCAST, Bxb1, ISPpu10) are all distinct
    # systems and must never collapse together (pandas treats every NaN as equal under drop_duplicates).
    atlas["_pri"] = atlas["entry_kind"].map({"curated_core": 0, "curated_rep": 1, "ortholog": 2})
    has_acc = atlas["uniprot"].notna() & (atlas["uniprot"].astype(str).str.strip() != "")
    # stable sort: the default quicksort does not preserve the order of equal-priority rows, which
    # would make the survivor of drop_duplicates(keep="first") arbitrary for same-priority duplicates
    with_acc = (atlas[has_acc].sort_values("_pri", kind="mergesort")
                .drop_duplicates(subset=["uniprot"], keep="first"))
    atlas = pd.concat([with_acc, atlas[~has_acc]], ignore_index=True).drop(columns="_pri")
    atlas["confidence"] = atlas.apply(confidence_tag, axis=1)
    atlas = atlas.reset_index(drop=True)

    Path(out).parent.mkdir(parents=True, exist_ok=True)
    atlas.to_parquet(out, index=False)
    return atlas
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
if __name__ == "__main__":  # pragma: no cover
    # Script entry: build with the default paths, then print a size summary.
    built = build_atlas()
    print(f"atlas rows: {len(built):,}")
    per_group = built.groupby(["family", "confidence"]).size()
    print(per_group)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""WT-KB schema - the Writer-Targeting Knowledge Base row model (Phase 0, Step 0.2).
|
|
2
|
+
|
|
3
|
+
One row per writer family/representative system: its targeting requirements and a reachability tier.
|
|
4
|
+
This is the spine of the Writer Atlas and the reachability layer of the Writable Genome. Every
|
|
5
|
+
targeting field must carry at least one DOI in ``key_dois`` - nothing is asserted without a citation.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from enum import Enum
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class MechBucket(str, Enum):
    """Mechanism bucket of a writer family; members are also plain ``str`` values."""

    DSB_NUCLEASE = "DSB_NUCLEASE"
    # NOTE: the stored value is longer than the member name - compare against the value.
    DSB_FREE_RECOMBINASE = "DSB_FREE_TRANSEST_RECOMBINASE"
    TRANSPOSASE = "TRANSPOSASE"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Tier(str, Enum):
    """Reachability tier of a writer family; members are also plain ``str`` values."""

    # bridge/seek cores, PE-installable att
    T1 = "Tier1_scannable"
    # CAST, native pseudo-att integrases (candidate - requires validation)
    T2 = "Tier2_context_candidate"
    # retroelement preferences
    T3 = "Tier3_not_predictable"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class Confidence(str, Enum):
    """Evidence level attached to a row; members are also plain ``str`` values."""

    MEASURED = "measured"
    INFERRED = "inferred"
    PREDICTED = "predicted"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class WriterEntry(BaseModel):
    """One WT-KB row: a writer family/representative system and its targeting requirements.

    Enum-typed fields are stored as their plain string values (``use_enum_values``).
    ``validate_default`` is enabled so the same conversion applies to defaulted fields:
    without it the default ``confidence`` would stay a ``Confidence`` member while an
    explicitly supplied one becomes a ``str``.
    """

    model_config = ConfigDict(use_enum_values=True, validate_default=True)

    family: str
    representative_system: str
    uniprot: str | None = None
    mechanism_bucket: MechBucket
    pfam_signature: list[str]
    targeting_modality: str  # RNA-guided | fixed-att | DDE-spacing | PE-installable
    target_site_spec: str  # e.g. "bipartite ~14 nt, central CT dinucleotide core"
    guide_architecture: str  # e.g. "bridge RNA: TBL(LTG/RTG)+DBL(LDG/RDG)"
    cargo_mechanism: str  # intrinsic | fixed-donor | templated
    cargo_capacity_bp: int | None = None
    dsb_free: bool
    length_aa: int | None = None
    human_cell_activity: str | None = None  # measured value + source, or "not measured"
    deliverability: str  # AAV | split-AAV | mRNA-RNP
    reachability_tier: Tier
    reachability_constraints: str  # rules a genome scan must apply
    confidence: Confidence = Confidence.MEASURED
    key_dois: list[str] = Field(min_length=1)

    @field_validator("key_dois")
    @classmethod
    def _nonempty_dois(cls, v: list[str]) -> list[str]:
        """Reject an empty DOI list and any DOI that is blank after stripping whitespace."""
        if not v or not all(d.strip() for d in v):
            raise ValueError("every WT-KB row must carry >=1 non-empty DOI (sourcing rule)")
        return v
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Descriptive writer scorecard (Phase 0, Step 0.5).
|
|
2
|
+
|
|
3
|
+
Reframes the prior 5-gate "TRUE_WRITER certification" (circular - it pre-registered ISCro4 *by name*
|
|
4
|
+
and depended on hand-set scores) into a transparent, DESCRIPTIVE scorecard computed from the
|
|
5
|
+
re-grounded axes. No enzyme is named in any pre-registered prediction. We additionally report a
|
|
6
|
+
*blind concordance* outcome: does the ranking place ISCro4 at the top of the bridge family using only
|
|
7
|
+
generic measured axes (cell-based evidence, DSB-freeness, programmability, cargo) - without any
|
|
8
|
+
ISCro4-specific value being asserted? This is reported, never asserted as an input.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
import pandas as pd
|
|
16
|
+
import yaml
|
|
17
|
+
|
|
18
|
+
_AXES_CFG = Path(__file__).resolve().parents[2] / "configs" / "score_axes.yaml"
|
|
19
|
+
|
|
20
|
+
_EVIDENCE_COLS = ["has_biochemical", "has_structural", "has_computational", "has_cell_based"]
|
|
21
|
+
|
|
22
|
+
# descriptive tier labels (NOT a certification; no enzyme is pre-named)
|
|
23
|
+
T_DSB_DEPENDENT = "DSB_dependent" # fails the necessary DSB-free gate (a "scissor")
|
|
24
|
+
T_EMERGING = "emerging_writer"
|
|
25
|
+
T_PROBABLE = "probable_writer"
|
|
26
|
+
T_ESTABLISHED = "established_writer" # DSB-free + fully programmable + native cargo + human-cell evidence
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _thresholds(path: Path = _AXES_CFG) -> dict:
|
|
30
|
+
"""Read v3.0 gate thresholds (defined on the RE-GROUNDED axis scales) from score_axes.yaml."""
|
|
31
|
+
if Path(path).exists():
|
|
32
|
+
g = yaml.safe_load(Path(path).read_text(encoding="utf-8")).get("gates_v3_0", {})
|
|
33
|
+
return {
|
|
34
|
+
"g1": g.get("g1_dsb_min", 0.95),
|
|
35
|
+
"g2": g.get("g2_prog_min", 0.95),
|
|
36
|
+
"g3": g.get("g3_cargo_min", 0.65),
|
|
37
|
+
"g4": g.get("g4_max_length_aa", 900),
|
|
38
|
+
"g5": g.get("g5_min_evidence", 2),
|
|
39
|
+
}
|
|
40
|
+
return {"g1": 0.95, "g2": 0.95, "g3": 0.65, "g4": 900, "g5": 2}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _evidence_count(row) -> int:
|
|
44
|
+
return int(sum(bool(row.get(c, False)) for c in _EVIDENCE_COLS))
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _gates(row, th) -> dict:
|
|
48
|
+
g1 = float(row.get("s_dsb", 0) or 0) >= th["g1"]
|
|
49
|
+
g2 = float(row.get("S_Prog", 0) or 0) >= th["g2"]
|
|
50
|
+
g3 = (float(row.get("S_Cargo", 0) or 0) >= th["g3"]) and bool(row.get("intrinsic_cargo_mechanism", False))
|
|
51
|
+
length = row.get("length_aa")
|
|
52
|
+
g4 = (length is not None and not pd.isna(length) and float(length) <= th["g4"])
|
|
53
|
+
g5 = _evidence_count(row) >= th["g5"]
|
|
54
|
+
return {"g1": g1, "g2": g2, "g3": g3, "g4": g4, "g5": g5}
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _descriptive_tier(row, th) -> str:
|
|
58
|
+
g = _gates(row, th)
|
|
59
|
+
if not g["g1"]:
|
|
60
|
+
return T_DSB_DEPENDENT # necessary gate (DSB-free) failed
|
|
61
|
+
qualifying = sum([g["g2"], g["g3"], g["g4"], g["g5"]])
|
|
62
|
+
has_cell = bool(row.get("has_cell_based", False))
|
|
63
|
+
if qualifying == 4 and has_cell:
|
|
64
|
+
return T_ESTABLISHED
|
|
65
|
+
if qualifying == 4 or (qualifying == 3 and has_cell):
|
|
66
|
+
return T_PROBABLE
|
|
67
|
+
if qualifying >= 1:
|
|
68
|
+
return T_EMERGING
|
|
69
|
+
return T_DSB_DEPENDENT
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def composite(row) -> float:
|
|
73
|
+
"""Transparent composite; components stay visible on the scorecard.
|
|
74
|
+
|
|
75
|
+
Includes human-cell evidence (``has_cell_based``) as a generic readiness axis - this is the
|
|
76
|
+
signal that distinguishes the standout human-cell bridge recombinase. It is a generic column
|
|
77
|
+
present for every editor, NOT an ISCro4-specific asserted value (so the concordance stays blind).
|
|
78
|
+
"""
|
|
79
|
+
parts = [
|
|
80
|
+
float(row.get("s_dsb", 0) or 0),
|
|
81
|
+
float(row.get("S_Prog", 0) or 0),
|
|
82
|
+
float(row.get("S_Cargo", 0) or 0),
|
|
83
|
+
_evidence_count(row) / 4.0,
|
|
84
|
+
float(bool(row.get("has_cell_based", False))),
|
|
85
|
+
]
|
|
86
|
+
return float(np.mean(parts))
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def scorecard(universe_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``universe_df`` with ``evidence_count``, ``S_composite`` and ``tier`` added.

    Rows are ordered by ``S_composite`` descending with a fresh 0..n-1 index; rows with equal
    composites keep their input order.
    """
    th = _thresholds()
    df = universe_df.copy()
    df["evidence_count"] = df.apply(_evidence_count, axis=1)
    df["S_composite"] = df.apply(composite, axis=1)
    df["tier"] = df.apply(lambda r: _descriptive_tier(r, th), axis=1)
    # stable sort so tied composites are not reordered arbitrarily from run to run
    return df.sort_values("S_composite", ascending=False, kind="mergesort").reset_index(drop=True)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def blind_concordance(scorecard_df: pd.DataFrame, family: str = "bridge_IS110",
                      expected_top: str = "ISCro4") -> dict:
    """Report (do NOT assert) whether the ranking places `expected_top` first within `family`,
    using only generic measured axes. Returns the observed top + whether it matches.
    Restricted to NATURAL editors (the concordance question is about natural systems).

    The result always has the keys ``family``, ``top``, ``matches``, ``n`` and ``ranking``;
    for an empty selection ``top`` is ``None`` and ``ranking`` is an empty list. Entries with
    equal ``S_composite`` keep their input order.
    """
    sub = scorecard_df.query("family == @family")
    if "source" in sub.columns:
        sub = sub[sub["source"] == "natural"]
    # stable sort: which of several tied entries is reported as "top" must be reproducible
    sub = sub.sort_values("S_composite", ascending=False, kind="mergesort")
    if sub.empty:
        return {"family": family, "top": None, "matches": False, "n": 0, "ranking": []}
    top = sub.iloc[0]["entity_id"]
    return {"family": family, "top": top, "matches": bool(top == expected_top), "n": int(len(sub)),
            "ranking": sub[["entity_id", "S_composite", "evidence_count"]].to_dict("records")}
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def ranking_stability(universe_df: pd.DataFrame, family: str = "bridge_IS110",
                      expected_top: str = "ISCro4", n: int = 200, seed: int = 42) -> float:
    """Fraction of randomly re-weighted composites under which `expected_top` stays family-top
    (a lightweight sensitivity check, mirroring the prior ranking-stability analysis).

    Each of the ``n`` draws samples one weight vector from a flat Dirichlet over the five
    composite parts (``s_dsb``, ``S_Prog``, ``S_Cargo``, evidence fraction, ``has_cell_based``).
    Only rows of ``family`` are considered, restricted to ``source == "natural"`` when that
    column exists. Returns 0.0 when no rows remain or when ``n`` is not positive.
    """
    if n <= 0:
        # nothing to sample; also avoids dividing by zero below
        return 0.0
    rng = np.random.default_rng(seed)
    sub = universe_df.query("family == @family").copy()
    if "source" in sub.columns:
        sub = sub[sub["source"] == "natural"]
    if sub.empty:
        return 0.0
    cols = ["s_dsb", "S_Prog", "S_Cargo"]
    ev = sub.apply(_evidence_count, axis=1).to_numpy() / 4.0
    cell = sub.get("has_cell_based", pd.Series([False] * len(sub))).fillna(False).astype(float).to_numpy()
    # one row per editor, one column per composite part
    X = np.column_stack([sub[c].fillna(0).astype(float).to_numpy() for c in cols] + [ev, cell])
    ids = sub["entity_id"].to_numpy()
    wins = 0
    for _ in range(n):
        w = rng.dirichlet(np.ones(X.shape[1]))
        scores = X @ w
        if ids[int(scores.argmax())] == expected_top:
            wins += 1
    return wins / n
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Canonical universe assembly (Phase 0, Step 0.4).
|
|
2
|
+
|
|
3
|
+
THE single entry point that joins the upstream editor universe + the WT-KB + the crosswalk and
|
|
4
|
+
applies the re-grounded axes. The classifier, the scorer, and the scorecard must all consume the
|
|
5
|
+
output of ``assemble()`` - never re-derive metadata independently (the prior PEN-DISCOVER vs
|
|
6
|
+
PEN-COMPARE gate inconsistency must not recur). Cross-module consistency is asserted by
|
|
7
|
+
``tests/unit/test_universe_consistency.py``.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import pandas as pd
|
|
14
|
+
import yaml
|
|
15
|
+
|
|
16
|
+
from pen_stack.score.recalibrate import load_axes_config, recalibrate_all
|
|
17
|
+
|
|
18
|
+
_ROOT = Path(__file__).resolve().parents[2]
|
|
19
|
+
_UNIVERSE = _ROOT / "data" / "curated" / "unified_editor_universe.parquet"
|
|
20
|
+
_WTKB = _ROOT / "pen_stack" / "atlas" / "wtkb.parquet"
|
|
21
|
+
_CROSSWALK = _ROOT / "configs" / "universe_crosswalk.yaml"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _load_crosswalk(path: Path = _CROSSWALK) -> pd.DataFrame:
    """Load the ``entity_to_family`` mapping from the crosswalk YAML as a three-column frame.

    Returns one row per entity with columns ``entity_id``, ``family`` and
    ``targeting_modality``. An empty or null mapping yields an empty frame that still has
    those columns, so a later merge on ``entity_id`` keeps working.
    """
    cw = yaml.safe_load(path.read_text(encoding="utf-8"))["entity_to_family"] or {}
    return pd.DataFrame(
        [{"entity_id": k, "family": v["family"], "targeting_modality": v["targeting_modality"]}
         for k, v in cw.items()],
        columns=["entity_id", "family", "targeting_modality"],
    )
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def assemble(
    universe_parquet: str | Path = _UNIVERSE,
    wtkb_parquet: str | Path = _WTKB,
    crosswalk_path: str | Path = _CROSSWALK,
    out_parquet: str | Path | None = None,
) -> pd.DataFrame:
    """Join the editor universe, the crosswalk and the WT-KB, then apply the re-grounded axes.

    Returns the assembled frame; when ``out_parquet`` is given the frame is also written there
    (parent directories are created as needed).
    """
    universe = pd.read_parquet(universe_parquet)
    wtkb = pd.read_parquet(wtkb_parquet)
    crosswalk = _load_crosswalk(Path(crosswalk_path))

    # 1) attach family + modality to natural editors via the crosswalk
    universe = universe.merge(crosswalk, on="entity_id", how="left")

    # 2) designs inherit their parent_editor's family + modality
    if "parent_editor" in universe.columns:
        by_entity = crosswalk.set_index("entity_id")[["family", "targeting_modality"]]
        # rows the crosswalk left unassigned that do name a parent editor
        orphan = universe["family"].isna() & universe["parent_editor"].notna()
        parents = universe.loc[orphan, "parent_editor"]
        for inherited in ("family", "targeting_modality"):
            universe.loc[orphan, inherited] = parents.map(by_entity[inherited])

    # 3) bring WT-KB measured fields (cargo bp, reachability tier, dsb_free) in by family - single source
    wanted = ["family", "cargo_capacity_bp", "reachability_tier", "dsb_free"]
    per_family = wtkb[wanted].drop_duplicates("family")
    universe = universe.merge(per_family, on="family", how="left")

    # 4) apply the re-grounded axes (length backfill + cargo + prog); NO per-enzyme overrides
    universe = recalibrate_all(universe, load_axes_config())

    if out_parquet:
        target = Path(out_parquet)
        target.parent.mkdir(parents=True, exist_ok=True)
        universe.to_parquet(out_parquet, index=False)
    return universe
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Columns every downstream module must take from the canonical universe rather than re-derive.
# NOTE(review): this slice names ``cell_based_evidence`` while the scorecard module reads
# ``has_cell_based`` - confirm the two column names are meant to differ.
CANONICAL_INPUTS = [
    "entity_id",
    "source",
    "mechanism_class",
    "s_dsb",
    "S_Prog",
    "S_Cargo",
    "length_aa",
    "intrinsic_cargo_mechanism",
    "cell_based_evidence",
    "family",
    "targeting_modality",
    "reachability_tier",
]


def canonical_inputs(df: pd.DataFrame) -> pd.DataFrame:
    """Copy of ``df`` limited to the ``CANONICAL_INPUTS`` columns it has, in ``CANONICAL_INPUTS`` order."""
    present = [name for name in CANONICAL_INPUTS if name in df.columns]
    return df[present].copy()
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""DMS-grounded variant proposal (Phase 2, Step 2.4) - replaces the failed de-novo chimera generation.
|
|
2
|
+
|
|
3
|
+
Instead of speculative chimeras (PEN-ASSEMBLE produced 0 TRUE_WRITERs and was HPC-hungry/unvalidatable),
|
|
4
|
+
propose *single/double point mutations* with a measured activity effect. **No chimeras are ever produced**
|
|
5
|
+
- only point substitutions.
|
|
6
|
+
|
|
7
|
+
The activity predictor is a pluggable ``VariantEffectModel``. The real model is ``DMSVariantEffectModel``,
|
|
8
|
+
backed by the Perry 2025 deep mutational scan of ISCro4 (Table S3, delivered in Phase 1.5): it scores each
|
|
9
|
+
substitution by its MEASURED activity Z-score. Fed that model, the framework's top proposals ARE the
|
|
10
|
+
experimentally enhancing mutations - N322P (rank 1), H50K (rank 2), R278M - so it RECOVERS the known
|
|
11
|
+
enhancers. Stated honestly per the program's framing: this is a useful catalogue feature that recovers
|
|
12
|
+
KNOWN enhancers from the DMS; it is NOT a novel variant-design method and it is NOT a blind sequence-only
|
|
13
|
+
prediction. For GENERATING new variants the established engine is EVOLVEpro - wrap it, do not rebuild.
|
|
14
|
+
|
|
15
|
+
When the DMS is absent, a transparent physico-chemical *baseline* keeps the framework runnable (it makes
|
|
16
|
+
no activity claim and must never be presented as the DMS model).
|
|
17
|
+
|
|
18
|
+
Inputs : enzyme sequence; a VariantEffectModel (DMSVariantEffectModel, or the labelled baseline).
|
|
19
|
+
Outputs: out/variant_proposals_<enzyme>.csv (ranked point mutations + measured effect).
|
|
20
|
+
"""
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from typing import Protocol, runtime_checkable
|
|
25
|
+
|
|
26
|
+
import pandas as pd
|
|
27
|
+
|
|
28
|
+
_AA = "ACDEFGHIKLMNPQRSTVWY"
|
|
29
|
+
_OUT = Path(__file__).resolve().parents[2] / "out"
|
|
30
|
+
|
|
31
|
+
# Kyte-Doolittle hydropathy + a coarse charge/volume signal, for the labelled baseline ONLY.
|
|
32
|
+
_HYDRO = {"A": 1.8, "R": -4.5, "N": -3.5, "D": -3.5, "C": 2.5, "Q": -3.5, "E": -3.5, "G": -0.4,
|
|
33
|
+
"H": -3.2, "I": 4.5, "L": 3.8, "K": -3.9, "M": 1.9, "F": 2.8, "P": -1.6, "S": -0.8,
|
|
34
|
+
"T": -0.7, "W": -0.9, "Y": -1.3, "V": 4.2}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@runtime_checkable
class VariantEffectModel(Protocol):
    """Structural interface of an activity predictor.

    An implementation exposes a ``name`` and a ``predict`` method that takes the sequence and a
    list of ``(i, wt, mut)`` variants and returns one predicted effect per variant
    (higher = better).
    """

    name: str

    def predict(self, seq: str, variants: list[tuple[int, str, str]]) -> list[float]:
        ...
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class BaselinePhysicoChemical:
    """Labelled NON-DMS placeholder predictor (the real DMS model arrives with Phase 1.5).

    A substitution is scored by the negated absolute change in ``_HYDRO`` hydropathy, so
    conservative swaps rank highest. This is a deliberately weak, documented heuristic that
    only keeps the proposal/validation framework exercisable; it makes no activity claim and
    must never be presented as the DMS predictor.
    """

    name = "baseline_physicochemical_placeholder"

    def predict(self, seq: str, variants: list[tuple[int, str, str]]) -> list[float]:
        """Return one score per variant; residues absent from ``_HYDRO`` count as 0.0."""
        scores = []
        for _pos, wt, mut in variants:
            delta = _HYDRO.get(mut, 0.0) - _HYDRO.get(wt, 0.0)
            scores.append(-abs(delta))
        return scores
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class DMSVariantEffectModel:
    """The REAL model: scores a substitution by its MEASURED activity Z-score from the Perry 2025 ISCro4
    deep mutational scan (Table S3, Phase 1.5). Substitutions not present in the scan - or present
    without a numeric Z-score - get a strongly negative score (treated as unmeasured/non-enhancing).
    This recovers known enhancers; it is not a blind sequence predictor (see module docstring).
    Requires the Perry tables locally (PEN_PERRY_DIR)."""

    name = "perry2025_dms_iscro4"
    # score returned for any substitution without a usable measurement
    _UNMEASURED = -9.9

    def __init__(self) -> None:
        """Load the DMS table and index its numeric Z-scores by mutation string.

        Raises ``FileNotFoundError`` when the loaded table is empty.
        """
        from pen_stack.bridge.ingest import load_dms
        dms = load_dms()
        if dms.empty:
            raise FileNotFoundError("Perry 2025 DMS (Table S3) not available; set PEN_PERRY_DIR")
        z = pd.to_numeric(dms["Z_Score_wrt_WT"], errors="coerce")
        # drop rows whose Z-score did not parse: a stored NaN would be returned by predict()
        # as-is instead of falling back to the unmeasured score
        measured = z.notna()
        self._z = dict(zip(dms.loc[measured, "Mutation"].astype(str), z[measured]))

    def predict(self, seq: str, variants: list[tuple[int, str, str]]) -> list[float]:
        """Return the measured Z-score per variant, or the unmeasured score when absent."""
        # variant key is wt + 1-based position + mut, e.g. "N322P"
        return [self._z.get(f"{wt}{i + 1}{mut}", self._UNMEASURED) for (i, wt, mut) in variants]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def iscro4_sequence() -> str | None:
    """Fetch the ISCro4 recombinase sequence (326 aa) from Perry 2025 Table S1.

    Returns None when the screen table is empty or has no row named "ISCro4".
    """
    from pen_stack.bridge.ingest import load_screen

    screen = load_screen()
    if screen.empty:
        # Nothing loaded: skip the column lookup, which may not exist on an empty frame.
        matches = screen
    else:
        matches = screen[screen["Name"].astype(str) == "ISCro4"]
    if len(matches) == 0:
        return None
    return matches.iloc[0]["Recombinase_Sequence"]
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def propose_variants(seq: str, model: VariantEffectModel, top: int = 20,
                     positions: list[int] | None = None) -> pd.DataFrame:
    """Score every single-residue substitution and return the ``top`` best by predicted gain.

    Substitutions only - no chimeras. ``positions`` (0-based indices into ``seq``) restricts the
    scan; when it is None every position of ``seq`` is scanned. The result has the columns
    ``pos``, ``wt``, ``mut``, ``variant`` (1-based label), ``pred_gain`` and ``model``.
    """
    scan = range(len(seq)) if positions is None else positions

    candidates = []
    for idx in scan:
        for residue in _AA:
            if residue != seq[idx]:
                candidates.append((idx, seq[idx], residue))

    scores = model.predict(seq, candidates)
    table = pd.DataFrame({
        "pos": [idx for idx, _wt, _mut in candidates],
        "wt": [wt for _idx, wt, _mut in candidates],
        "mut": [mut for _idx, _wt, mut in candidates],
        "variant": [f"{wt}{idx + 1}{mut}" for idx, wt, mut in candidates],  # 1-based, e.g. A123V
        "pred_gain": scores,
        "model": model.name,
    })
    ranked = table.sort_values("pred_gain", ascending=False)
    return ranked.head(top).reset_index(drop=True)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def retrospective_recovery(proposals: pd.DataFrame, known_variants: list[str], k: int = 20) -> dict:
    """Check whether the first ``k`` proposals contain published enhanced variants.

    ``known_variants`` are 1-based labels such as "A123V". The report holds a per-variant recovery
    flag plus an overall ``any_recovered`` flag. With the Phase-1.5 DMS model this is the headline
    retrospective criterion; with the baseline it only shows that the harness runs (the
    placeholder is not expected to recover anything).
    """
    shortlist = set(proposals.head(k)["variant"])

    recovered = {}
    for variant in known_variants:
        recovered[variant] = variant in shortlist

    # An empty proposal table has no model label to report.
    model_label = None if len(proposals) == 0 else proposals["model"].iloc[0]

    return {
        "k": k,
        "model": model_label,
        "known": known_variants,
        "recovered": recovered,
        "any_recovered": any(recovered.values()),
    }
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def run(enzyme: str, seq: str, model: VariantEffectModel | None = None, top: int = 20,
        out_dir: str | Path = _OUT) -> pd.DataFrame:
    """Propose variants for ``seq`` and save them as ``variant_proposals_<enzyme>.csv``.

    When no model is given, the labelled ``BaselinePhysicoChemical`` placeholder is used. The
    output directory is created if needed. Returns the proposal table that was written.
    """
    if not model:
        model = BaselinePhysicoChemical()
    proposals = propose_variants(seq, model, top=top)

    target = Path(out_dir) / f"variant_proposals_{enzyme}.csv"
    target.parent.mkdir(parents=True, exist_ok=True)
    proposals.to_csv(target, index=False)
    return proposals
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# Single substitutions published as activity-enhancing by the Perry 2025 ISCro4 DMS; these are the
# "known enhancers" the retrospective recovery check looks for.
KNOWN_ISCRO4_ENHANCERS = [
    "N322P",
    "H50K",
    "R278M",
]
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def iscro4_dms_recovery(top: int = 20, out_dir: str | Path = _OUT) -> dict:
    """Step 2.4 completion: feed the REAL Perry DMS model to the proposal framework and confirm it
    recovers the known enhancing ISCro4 mutations in its top proposals.

    Honest framing: recovers KNOWN enhancers (a catalogue feature), not a blind prediction.

    Args:
        top: number of proposals kept, and the ``k`` used for the recovery check.
        out_dir: directory that receives the proposals CSV.

    Returns:
        The recovery report with ``available=True``, the five best proposals and the framing note.
        When either Perry table is absent (Table S1 for the sequence, Table S3 for the DMS) the
        result is ``{"available": False, "note": ...}`` and no CSV is written.
    """
    seq = iscro4_sequence()
    if seq is None:
        return {"available": False, "note": "Perry 2025 Table S1 (ISCro4 sequence) not present"}

    # Only the constructor is guarded: it raises FileNotFoundError when the DMS table is missing.
    # Table S1 can be present without Table S3, and that case must degrade the same way as a
    # missing sequence instead of crashing the caller.
    try:
        model = DMSVariantEffectModel()
    except FileNotFoundError as err:
        return {"available": False, "note": str(err)}

    props = run("ISCro4", seq, model=model, top=top, out_dir=out_dir)
    rec = retrospective_recovery(props, KNOWN_ISCRO4_ENHANCERS, k=top)
    rec["available"] = True
    rec["top_proposals"] = props.head(5)[["variant", "pred_gain"]].to_dict("records")
    rec["framing"] = (
        "recovers KNOWN enhancers from the measured DMS (catalogue feature); not a blind "
        "sequence predictor and not a generative method (EVOLVEpro is the engine to wrap)."
    )
    return rec
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
if __name__ == "__main__":  # pragma: no cover
    # The real ISCro4 protein is 326 aa; it is not bundled here, so the harness is demonstrated
    # on a short repeated stub sequence instead.
    stub_seq = "MSEQNKI" * 5
    demo_props = run("DEMO_stub", stub_seq, top=10)
    print(demo_props.to_string(index=False))
    print("\nNOTE: uses the labelled placeholder model; the DMS-trained predictor is a Phase-1.5 deliverable.")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""pen_stack.bridge - see PEN-STACK v3.0 program doc."""
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Bridge-recombinase variant-effect, from the deep mutational scan (Phase 1.5, Step 1.5.4) - SECONDARY.
|
|
2
|
+
|
|
3
|
+
A pluggable trainer over the Perry 2025 DMS (Table S3). Used retrospectively it RECOVERS KNOWN
|
|
4
|
+
activity-enhancing mutants (N322P, H50K, R278M; see pen_stack/validate/paper4_real_validation.py),
|
|
5
|
+
completing the Phase-2 Step-2.4 DMS variant-proposal feature.
|
|
6
|
+
|
|
7
|
+
Scope, stated plainly: this is a useful catalogue feature that recovers KNOWN enhancers; it is NOT a novel
|
|
8
|
+
variant-design method. For GENERATING new variants the established engine is EVOLVEpro - when PEN-STACK
|
|
9
|
+
reaches generative variant design it should wrap EVOLVEpro rather than rebuild it. The 72-system ortholog
|
|
10
|
+
screen (Table S1) carries no per-system activity label, so it supports only the descriptive characterisation
|
|
11
|
+
in ortholog_screen.py (N ~72, exploratory). The headline of the phase is the off-target screening engine.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def have_training_data(dms: pd.DataFrame, screen: pd.DataFrame) -> bool:
    """Return True when at least one of the two tables is present (not None) and non-empty."""
    for table in (dms, screen):
        if table is not None and not table.empty:
            return True
    return False
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def train_variant_effect(dms_df: pd.DataFrame):
    """Fit a per-residue mutation -> activity regressor on the DMS table.

    The ``aa_position``, ``wt`` and ``mut`` columns are cast to strings and one-hot encoded; the
    target is the ``activity`` column. Returns the fitted LightGBM regressor, or None when no DMS
    table is available (None or empty).
    """
    no_data = dms_df is None or dms_df.empty
    if no_data:
        return None

    # Imported lazily so the module loads without lightgbm when nothing is trained.
    import lightgbm as lgb

    categorical = dms_df[["aa_position", "wt", "mut"]].astype(str)
    features = pd.get_dummies(categorical)
    regressor = lgb.LGBMRegressor(n_estimators=400, learning_rate=0.03)
    return regressor.fit(features, dms_df["activity"])
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def train_ortholog_activity(screen_df: pd.DataFrame, embed_fn=None):
    """Fit an ortholog -> human-cell activity regressor on the 72-system screen.

    Features come from ``embed_fn(screen_df["sequence"])`` when an embedding function is supplied;
    otherwise the ``target_core`` column is one-hot encoded. The target is the
    ``human_cell_activity`` column. Returns the fitted LightGBM regressor, or None when the screen
    table is absent (None or empty).

    Reporting the N caveat is the caller's responsibility - the screen is ~72 systems.
    """
    no_data = screen_df is None or screen_df.empty
    if no_data:
        return None

    # Imported lazily so the module loads without lightgbm when nothing is trained.
    import lightgbm as lgb

    if embed_fn is None:
        core = screen_df.get("target_core", pd.Series(dtype=str))
        X = pd.get_dummies(core.astype(str))
    else:
        X = embed_fn(screen_df["sequence"])

    regressor = lgb.LGBMRegressor(n_estimators=300, learning_rate=0.03)
    return regressor.fit(X, screen_df["human_cell_activity"])
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def status() -> dict:
    """Summarise whether the activity model can be trained.

    Loads the Perry 2025 DMS and screen tables and reports their row counts, whether either one
    provides training data, and a fixed explanatory note.
    """
    from pen_stack.bridge.ingest import load_dms, load_screen

    dms = load_dms()
    screen = load_screen()

    report = {"dms_rows": len(dms), "screen_rows": len(screen)}
    report["trainable"] = have_training_data(dms, screen)
    report["note"] = "exploratory; DMS+screen are Perry 2025 supplementary (paywalled) - model trains when supplied"
    return report
|