pen-stack 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. pen_stack/__init__.py +2 -0
  2. pen_stack/_resources.py +34 -0
  3. pen_stack/adapt/__init__.py +14 -0
  4. pen_stack/adapt/finetune.py +33 -0
  5. pen_stack/adapt/ingest.py +86 -0
  6. pen_stack/adapt/pipeline.py +101 -0
  7. pen_stack/adapt/recalibrate.py +58 -0
  8. pen_stack/adapt/report.py +130 -0
  9. pen_stack/agent/__init__.py +1 -0
  10. pen_stack/agent/guardrails.py +49 -0
  11. pen_stack/agent/mcp_server.py +42 -0
  12. pen_stack/agent/orchestrator.py +106 -0
  13. pen_stack/agent/pen_agent.py +169 -0
  14. pen_stack/agent/tools.py +130 -0
  15. pen_stack/atlas/__init__.py +1 -0
  16. pen_stack/atlas/build_wtkb.py +80 -0
  17. pen_stack/atlas/crosslink.py +144 -0
  18. pen_stack/atlas/expand.py +190 -0
  19. pen_stack/atlas/schema.py +59 -0
  20. pen_stack/atlas/scorecard.py +134 -0
  21. pen_stack/atlas/universe.py +75 -0
  22. pen_stack/atlas/variant_propose.py +155 -0
  23. pen_stack/bridge/__init__.py +1 -0
  24. pen_stack/bridge/activity.py +52 -0
  25. pen_stack/bridge/cli.py +65 -0
  26. pen_stack/bridge/fold_qc.py +53 -0
  27. pen_stack/bridge/guide_qc.py +84 -0
  28. pen_stack/bridge/ingest.py +139 -0
  29. pen_stack/bridge/offtarget.py +133 -0
  30. pen_stack/bridge/ortholog_screen.py +73 -0
  31. pen_stack/bridge/pipeline.py +83 -0
  32. pen_stack/cli.py +126 -0
  33. pen_stack/data/__init__.py +1 -0
  34. pen_stack/data/encode.py +84 -0
  35. pen_stack/data/genome.py +71 -0
  36. pen_stack/data/ingest_chromatin.py +119 -0
  37. pen_stack/data/ingest_integration.py +112 -0
  38. pen_stack/data/ingest_safety_annot.py +164 -0
  39. pen_stack/data/ingest_trip.py +76 -0
  40. pen_stack/mech/__init__.py +1 -0
  41. pen_stack/mech/classify_atlas.py +71 -0
  42. pen_stack/mech/whitelist.py +66 -0
  43. pen_stack/monitor/__init__.py +1 -0
  44. pen_stack/monitor/europepmc.py +32 -0
  45. pen_stack/monitor/run.py +57 -0
  46. pen_stack/monitor/triage.py +63 -0
  47. pen_stack/planner/__init__.py +1 -0
  48. pen_stack/planner/cargo.py +56 -0
  49. pen_stack/planner/cargo_polish.py +146 -0
  50. pen_stack/planner/delivery.py +32 -0
  51. pen_stack/planner/multiplex.py +110 -0
  52. pen_stack/planner/optimize.py +156 -0
  53. pen_stack/planner/pipeline.py +86 -0
  54. pen_stack/planner/report.py +26 -0
  55. pen_stack/rag/__init__.py +1 -0
  56. pen_stack/rag/index.py +53 -0
  57. pen_stack/rag/llm.py +178 -0
  58. pen_stack/rag/qa.py +105 -0
  59. pen_stack/score/__init__.py +1 -0
  60. pen_stack/score/recalibrate.py +77 -0
  61. pen_stack/score/therapeutic.py +85 -0
  62. pen_stack/server/__init__.py +1 -0
  63. pen_stack/server/api.py +142 -0
  64. pen_stack/ui/__init__.py +1 -0
  65. pen_stack/ui/app.py +518 -0
  66. pen_stack/validate/__init__.py +1 -0
  67. pen_stack/validate/adapt_demo.py +69 -0
  68. pen_stack/validate/agent_eval.py +117 -0
  69. pen_stack/validate/blind_gsh_discovery.py +165 -0
  70. pen_stack/validate/cargo_directionality.py +57 -0
  71. pen_stack/validate/durability_baselines.py +150 -0
  72. pen_stack/validate/forward_hypotheses.py +104 -0
  73. pen_stack/validate/guide_qc_demo.py +58 -0
  74. pen_stack/validate/intent_specification.py +82 -0
  75. pen_stack/validate/paper3_benchmark.py +165 -0
  76. pen_stack/validate/paper4_real_validation.py +144 -0
  77. pen_stack/validate/paper4_validation.py +82 -0
  78. pen_stack/validate/seq_vs_measured.py +134 -0
  79. pen_stack/validate/within_locus_ranking.py +74 -0
  80. pen_stack/validate/writer_recovery.py +86 -0
  81. pen_stack/wgenome/__init__.py +1 -0
  82. pen_stack/wgenome/chromatin_seq.py +83 -0
  83. pen_stack/wgenome/durability.py +108 -0
  84. pen_stack/wgenome/export_tracks.py +52 -0
  85. pen_stack/wgenome/features.py +82 -0
  86. pen_stack/wgenome/gsh_baseline.py +117 -0
  87. pen_stack/wgenome/providers.py +245 -0
  88. pen_stack/wgenome/safety.py +69 -0
  89. pen_stack/wgenome/structure3d.py +168 -0
  90. pen_stack/wgenome/writability.py +72 -0
  91. pen_stack-3.1.0.dist-info/METADATA +451 -0
  92. pen_stack-3.1.0.dist-info/RECORD +96 -0
  93. pen_stack-3.1.0.dist-info/WHEEL +5 -0
  94. pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
  95. pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
  96. pen_stack-3.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,65 @@
1
+ """pen-bridge CLI (Phase 1.5, Step 1.5.5) - the first public instrument of PEN-STACK.
2
+
3
+ pen-bridge design --target <14nt> --donor <14nt> [--scaffold ISCro4_enhanced] [--ct k562]
4
+
5
+ Designs the bridge RNA (wrapped Arc designer) and reports off-target + fold/cross-loop QC.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+
11
+ import click
12
+
13
+
14
+ @click.group()  # root command group; the `design` and `profile` subcommands attach via @main.command()
15
+ def main():
16
+ """pen-bridge - bridge-recombinase design + off-target/QC (PEN-STACK)."""
17
+
18
+
19
@main.command()
@click.option("--target", "-t", required=True, help="14 nt target core (DNA).")
@click.option("--donor", "-d", required=True, help="14 nt donor core (DNA).")
@click.option("--scaffold", "-s", default="ISCro4_enhanced",
              type=click.Choice(["IS621", "ISCro4_WT", "ISCro4_enhanced"]))
@click.option("--ct", default=None, help="Overlay Phase-1 safety for this cell type (k562/hepg2/hspc).")
@click.option("--no-scan", is_flag=True, help="Skip the genome-wide off-target scan (QC only).")
@click.option("--chroms", default=None, help="Comma-separated chroms to scan (default chr1..22,X).")
def design(target, donor, scaffold, ct, no_scan, chroms):
    """Design a bridge RNA and assess off-target + fold/cross-loop QC."""
    # Function-scope import: the pipeline is only loaded when this command actually runs.
    from pen_stack.bridge.pipeline import design_and_assess

    # Tolerate "chr1, chr2" and stray commas. If nothing usable remains, pass None so the
    # pipeline applies its own default chromosome list.
    chrom_list = [c.strip() for c in chroms.split(",") if c.strip()] if chroms else None
    res = design_and_assess(target, donor, scaffold, chroms=chrom_list or None, ct=ct,
                            scan=not no_scan)
    brna, off, qc = res["brna"], res["offtargets"], res["qc"]

    click.echo(f"Bridge RNA ({scaffold}): target={brna['target']} donor={brna['donor']}")
    if brna.get("available"):
        bridge_seq = brna["bridge_sequence"]
        # Only show the ellipsis when the preview really is truncated.
        preview = bridge_seq[:80] + ("..." if len(bridge_seq) > 80 else "")
        click.echo(f" bridge_sequence: {preview} ({len(bridge_seq)} nt)")
    else:
        click.echo(f" (designer: {brna['note']})")

    click.echo(f"QC: cross-loop {qc['cross_loop']} pass={qc['pass']}")
    if "fold" in qc and qc["fold"].get("available"):
        click.echo(f" fold MFE: {qc['fold']['mfe']}")

    if off.get("scanned"):
        click.echo(f"Off-target: {off['n_candidates']} candidate pseudosites "
                   f"({off['n_exact']} exact); top by risk:")
        t = off["table"]
        # "safety" is only present when the cell-type overlay succeeded, so filter by what exists.
        cols = [c for c in ["chrom", "pos", "site", "n_mm", "risk", "safety"] if c in t.columns]
        click.echo(t.head(10)[cols].to_string(index=False))
    else:
        click.echo(f"Off-target: {off.get('note', 'not scanned')}")
    click.echo(res["disclaimer"])
50
+
51
+
52
@main.command()
def profile():
    """Show the position-weight off-target profile (and its provenance)."""
    from pen_stack.bridge.ingest import load_profile_config

    config = load_profile_config()
    # Only this fixed subset of the profile config is printed, in this order.
    shown_keys = ("core_length", "central_core_positions", "max_mismatches",
                  "protective_weight", "provenance")
    payload = {key: config[key] for key in shown_keys}
    click.echo(json.dumps(payload, indent=2))
62
+
63
+
64
+ if __name__ == "__main__":
65
+ main()
@@ -0,0 +1,53 @@
1
+ """Bridge-RNA fold / cross-loop QC (Phase 1.5, Step 1.5.3).
2
+
3
+ Predict whether a designed bridge RNA folds correctly (ViennaRNA, in the VM image) and flag DBL-DBL /
4
+ TBL-TBL self/cross-recombination risk from guide complementarity - an experimentally observed failure
5
+ mode where the target- and donor-binding loops recombine with each other instead of the genome.
6
+
7
+ ``cross_loop_risk`` is pure-Python (no dependency); ``fold`` uses ViennaRNA and degrades gracefully when
8
+ the package is absent (returns ``{"available": False, ...}``) so the rest of the QC still runs.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ _PAIR = {"A": "U", "U": "A", "G": "C", "C": "G", "T": "A"}
13
+
14
+
15
def fold(scaffold_seq: str) -> dict:
    """Fold the bridge-RNA scaffold with ViennaRNA and report its minimum free energy.

    Returns ``{"available": True, "structure", "mfe", "length"}`` on success, or
    ``{"available": False, "note": ...}`` when the ``RNA`` module cannot be imported.
    """
    try:
        import RNA
    except Exception:  # noqa: BLE001 - ViennaRNA only in the VM image
        return {"available": False, "note": "ViennaRNA not installed (runs in the VM image)"}

    # The sequence is folded as RNA: upper-case with T rewritten to U.
    rna_seq = scaffold_seq.upper().replace("T", "U")
    structure, energy = RNA.fold_compound(rna_seq).mfe()

    result = {"available": True, "structure": structure}
    result["mfe"] = round(float(energy), 2)
    result["length"] = len(scaffold_seq)
    return result
25
+
26
+
27
+ def _complementarity(a: str, b: str) -> float:
28
+ """Fraction of positions where a pairs with the reverse-complement of b (crude antiparallel match)."""
29
+ a = a.upper()
30
+ b_rc = "".join(_PAIR.get(x, "N") for x in reversed(b.upper()))
31
+ n = min(len(a), len(b_rc))
32
+ if n == 0:
33
+ return 0.0
34
+ return sum(1 for x, y in zip(a[:n], b_rc[:n]) if x == y) / n
35
+
36
+
37
def cross_loop_risk(target_guide: str, donor_guide: str) -> dict:
    """Score self- and cross-complementarity of the target (TBL) and donor (DBL) guides.

    Returns a dict with keys ``tbl_self``, ``dbl_self`` and ``tbl_dbl``, each the
    complementarity fraction rounded to 3 decimals.
    """
    pairings = {
        "tbl_self": (target_guide, target_guide),
        "dbl_self": (donor_guide, donor_guide),
        "tbl_dbl": (target_guide, donor_guide),
    }
    return {label: round(_complementarity(first, second), 3)
            for label, (first, second) in pairings.items()}
42
+
43
+
44
def qc_verdict(target_guide: str, donor_guide: str, scaffold_seq: str | None = None,
               cross_loop_threshold: float = 0.6) -> dict:
    """Build the combined cross-loop (and optional fold) verdict for one design.

    ``pass`` is True only when no cross-loop score reaches ``cross_loop_threshold``.
    A ``fold`` entry is added only when a non-empty ``scaffold_seq`` is supplied.
    """
    risk = cross_loop_risk(target_guide, donor_guide)
    tripped = [name for name, value in risk.items() if value >= cross_loop_threshold]
    verdict = {"cross_loop": risk, "cross_loop_flags": tripped, "pass": not tripped}
    if scaffold_seq:
        verdict["fold"] = fold(scaffold_seq)
    return verdict
@@ -0,0 +1,84 @@
1
+ """Bridge-RNA guide ranking / QC layer (v3.1, WS-G2).
2
+
3
+ Wraps a bridge-RNA design: when a default guide design trips a QC flag - self-complementarity, cross-loop
4
+ (TBL-DBL) recombination, poor scaffold fold (MFE), or off-target - this takes caller-supplied candidate variants and
5
+ RANKS them by the existing fold-QC (`bridge/fold_qc.py`) plus off-target risk (`bridge/offtarget.py`).
6
+
7
+ This is a RANKING layer, not validated design: it retrospectively down-ranks known-bad guides; it makes NO
8
+ claim of generating superior novel guides. It reuses the validated QC primitives so the score is grounded.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ from pen_stack.bridge import fold_qc
13
+
14
+ _PAIR = {"A": "T", "T": "A", "G": "C", "C": "G", "U": "A"}
15
+
16
+
17
def _revcomp(s: str) -> str:
    """Return the reverse complement of ``s`` per ``_PAIR``; unmapped bases become ``N``."""
    complemented = [_PAIR.get(base, "N") for base in s.upper()[::-1]]
    return "".join(complemented)
19
+
20
+
21
def qc_flags(target_guide: str, donor_guide: str, scaffold_seq: str | None = None,
             offtarget_count: int | None = None, cross_loop_threshold: float = 0.6,
             mfe_per_nt_warn: float = -0.5) -> dict:
    """Collect the QC flags tripped by one design.

    Returns a dict with ``cross_loop``, ``fold``, ``offtarget_count``, ``flags`` and
    ``pass`` (True when ``flags`` is empty). The fold is only computed when
    ``scaffold_seq`` is given; otherwise ``fold`` is ``{"available": False}``.
    """
    risk = fold_qc.cross_loop_risk(target_guide, donor_guide)
    tripped = []

    if max(risk["tbl_self"], risk["dbl_self"]) >= cross_loop_threshold:
        tripped.append("self_complementarity")
    if risk["tbl_dbl"] >= cross_loop_threshold:
        tripped.append("cross_loop_recombination")

    if scaffold_seq:
        fold_result = fold_qc.fold(scaffold_seq)
    else:
        fold_result = {"available": False}
    if fold_result.get("available") and fold_result["length"]:
        # NOTE(review): this fires when MFE per nt is MORE negative than the threshold
        # (a very stable fold) - confirm that is the intended meaning of "poor fold".
        if fold_result["mfe"] / fold_result["length"] < mfe_per_nt_warn:
            tripped.append("poor_fold_mfe")

    if offtarget_count is not None and offtarget_count > 0:
        tripped.append("off_target")

    return {"cross_loop": risk, "fold": fold_result, "offtarget_count": offtarget_count,
            "flags": tripped, "pass": not tripped}
38
+
39
+
40
def qc_score(target_guide: str, donor_guide: str, scaffold_seq: str | None = None,
             offtarget_count: int | None = None) -> float:
    """Combined QC quality in [0,1] (HIGHER = safer), used only to RANK candidate guides.

    Starts from 1 minus the worst cross-loop score, then subtracts a fold penalty
    (at most 0.3) and an off-target penalty (0.1 per off-target, at most 0.4).
    The result is clamped to [0, 1] and rounded to 4 decimals.
    """
    risk = fold_qc.cross_loop_risk(target_guide, donor_guide)
    worst_cross_loop = max(risk["tbl_self"], risk["dbl_self"], risk["tbl_dbl"])
    score = 1.0 - worst_cross_loop  # cross-loop is the dominant penalty

    fold_result = fold_qc.fold(scaffold_seq) if scaffold_seq else None
    if fold_result and fold_result.get("available") and fold_result["length"]:
        mfe_per_nt = fold_result["mfe"] / fold_result["length"]
        # The penalty grows as MFE per nt drops below -0.35 kcal/mol (more stable).
        # NOTE(review): the original comment said "penalize too-weak structure", which
        # is the opposite direction - confirm which one is intended.
        overshoot = -0.35 - mfe_per_nt
        score -= min(0.3, max(0.0, overshoot))

    if offtarget_count:
        score -= min(0.4, 0.1 * offtarget_count)

    clamped = max(0.0, min(1.0, score))
    return round(clamped, 4)
54
+
55
+
56
def rank_variants(variants: list[dict]) -> list[dict]:
    """Score each guide variant and return them best-first.

    Each variant is a dict with ``target_guide`` and ``donor_guide`` plus optional
    ``name``, ``scaffold_seq`` and ``offtarget_count``. Each output row carries
    ``name`` (when present), ``qc_score`` and ``flags``.
    """
    rows = []
    for variant in variants:
        target = variant["target_guide"]
        donor = variant["donor_guide"]
        scaffold = variant.get("scaffold_seq")
        n_off = variant.get("offtarget_count")

        row = {}
        if "name" in variant:
            row["name"] = variant["name"]
        row["qc_score"] = qc_score(target, donor, scaffold, n_off)
        row["flags"] = qc_flags(target, donor, scaffold, n_off)["flags"]
        rows.append(row)

    # Stable sort: ties keep their input order.
    rows.sort(key=lambda r: r["qc_score"], reverse=True)
    return rows
66
+
67
+
68
def screen_and_rank(default: dict, variants: list[dict] | None = None) -> dict:
    """Check the default design and, if it trips a flag, rank the supplied variants.

    `variants` come from the caller; nothing is generated here. When the default
    passes, or no variants are given, ``ranked`` is empty and ``recommended`` is
    either ``"default (no flags)"`` (default passed) or ``None``.
    """
    verdict = qc_flags(default["target_guide"], default["donor_guide"],
                       default.get("scaffold_seq"), default.get("offtarget_count"))
    passed = verdict["pass"]
    result = {"default_flags": verdict["flags"], "default_pass": passed}

    if passed:
        result["ranked"] = []
        result["recommended"] = "default (no flags)"
        return result
    if not variants:
        result["ranked"] = []
        result["recommended"] = None
        return result

    ranked = rank_variants(variants)
    result["ranked"] = ranked
    result["recommended"] = ranked[0] if ranked else None
    return result
@@ -0,0 +1,139 @@
1
+ """Acquire / load the bridge-recombinase training data (Phase 1.5, Step 1.5.1).
2
+
3
+ Three tables supervise the engine: the measured **off-target profile** (per-position mismatch tolerance),
4
+ the **DMS** (variant->activity), and the **72-system human-cell activity screen**. The Perry 2025
5
+ supplementary (Science adz0276) is paywalled and not bulk-downloadable from the build environment; the
6
+ loaders below read the real tables when supplied, and otherwise fall back to the literature-grounded
7
+ position-weight profile (`configs/bridge_offtarget_profile.yaml`) so the engine runs end-to-end.
8
+
9
+ Outputs (when real tables are present): features/bridge_offtarget_profile.parquet, bridge_dms.parquet,
10
+ bridge_screen.parquet.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import os
15
+ from functools import lru_cache
16
+ from pathlib import Path
17
+
18
+ import pandas as pd
19
+ import yaml
20
+
21
+ _ROOT = Path(__file__).resolve().parents[2]
22
+ _CFG = _ROOT / "configs" / "bridge_offtarget_profile.yaml"
23
+
24
+ # Perry 2025 supplementary (Science adz0276) - copyrighted; kept LOCAL, never committed/redistributed.
25
+ # Default location: Final_Part_v3.0/Perry_et_al/ (override with PEN_PERRY_DIR).
26
+ _PERRY_FILES = {
27
+ "orthologs": "science.adz0276_table_s1.xlsx", # S1: 72 bridge recombinase orthologs
28
+ "offtargets": "science.adz0276_table_s2.xlsx", # S2: genome-wide insertion sites (off-targets)
29
+ "dms": "science.adz0276_table_s3.xlsx", # S3: deep mutational scan
30
+ }
31
+
32
+
33
def perry_dir() -> Path | None:
    """Locate the directory holding the Perry 2025 supplementary tables.

    Checks ``$PEN_PERRY_DIR`` first (when set), then ``Perry_et_al`` next to the
    project root. Returns the first path that exists, else None.
    """
    candidates = []
    override = os.environ.get("PEN_PERRY_DIR")
    if override:
        candidates.append(Path(override))
    candidates.append(_ROOT.parent / "Perry_et_al")
    return next((path for path in candidates if path.exists()), None)
39
+
40
+
41
def _perry(name: str) -> Path | None:
    """Resolve the Perry table registered under ``name`` in ``_PERRY_FILES``.

    Returns None when the Perry directory or the file itself is missing.
    """
    base = perry_dir()
    if base is not None:
        candidate = base / _PERRY_FILES[name]
        if candidate.exists():
            return candidate
    return None
47
+
48
+
49
@lru_cache(maxsize=1)
def load_profile_config(path: str | Path = _CFG) -> dict:
    """Parse the off-target profile YAML at ``path``.

    The most recent result is cached, so callers share the same dict object.
    """
    config_path = Path(path)
    raw_text = config_path.read_text(encoding="utf-8")
    return yaml.safe_load(raw_text)
52
+
53
+
54
def protective_weights() -> dict[int, float]:
    """Protective weight per position, read from the profile config.

    1 = a mismatch abolishes recombination; 0 = fully tolerated. Keys are coerced
    to int and values to float.
    """
    raw = load_profile_config()["protective_weight"]
    weights = {}
    for position, weight in raw.items():
        weights[int(position)] = float(weight)
    return weights
58
+
59
+
60
def load_insertion_sites() -> pd.DataFrame:
    """Load Perry 2025 Table S2 - measured genome-wide insertion sites (on- + off-target).

    Returns an empty frame when the table is absent. Rows missing either
    Insertion_Site_Sequence or Plasmid_Encoded_Sequence are dropped. Other columns
    include Intended_Site_Name, Insertion_Site, UMI_Count, %_of_Insertions, On-Target.
    """
    workbook = _perry("offtargets")
    if workbook is None:
        return pd.DataFrame()
    required = ["Insertion_Site_Sequence", "Plasmid_Encoded_Sequence"]
    sites = pd.read_excel(workbook, sheet_name="Genome Wide Insertion Sites")
    return sites.dropna(subset=required)
71
+
72
+
73
+ _MEASURED_PARQUET = _ROOT / "data" / "curated" / "bridge_offtarget_profile_measured.parquet"
74
+
75
+
76
def load_measured_profile() -> pd.DataFrame:
    """Return the MEASURED per-position profile.

    Reads the derived parquet at ``_MEASURED_PARQUET`` when it exists; otherwise
    falls back to re-deriving the profile from the raw Perry tables, which is empty
    when those are absent.
    """
    if not _MEASURED_PARQUET.exists():
        return derive_measured_profile()
    return pd.read_parquet(_MEASURED_PARQUET)
82
+
83
+
84
def derive_measured_profile() -> pd.DataFrame:
    """Per-position protective weight derived from the MEASURED off-targets.

    Among real off-targets (which recombined despite mismatches), positions that stay
    matched are the specificity determinants (high protective weight); frequently
    mismatched positions are tolerant. Conservation is UMI-weighted.

    Returns:
        Columns position (1-based), conservation, protective_weight, source and
        n_offtargets. Empty when the Perry data is absent, or when no 14-nt
        off-target row with a usable UMI count remains (avoids dividing by zero).
    """
    s2 = load_insertion_sites()
    if s2.empty:
        return pd.DataFrame()

    core_len = 14
    keep = ((s2["On-Target"] == False) &  # noqa: E712
            (s2["Insertion_Site_Sequence"].str.len() == core_len) &
            (s2["Plasmid_Encoded_Sequence"].str.len() == core_len))
    off = s2[keep]

    matched = [0.0] * core_len
    total = 0.0
    for seq, intended, umi in zip(off["Insertion_Site_Sequence"], off["Plasmid_Encoded_Sequence"],
                                  off["UMI_Count"]):
        # A missing UMI count would turn every accumulated weight into NaN.
        if pd.isna(umi):
            continue
        weight = float(umi)
        total += weight
        for j in range(core_len):
            if seq[j] == intended[j]:
                matched[j] += weight

    # Nothing to normalise by: report "no measured profile", do not raise.
    if total <= 0:
        return pd.DataFrame()

    cons = [m / total for m in matched]
    return pd.DataFrame({"position": list(range(1, core_len + 1)), "conservation": cons,
                         "protective_weight": cons, "source": "perry2025_table_s2_measured",
                         "n_offtargets": len(off)})
111
+
112
+
113
def load_offtarget_profile(use_measured: bool = True) -> pd.DataFrame:
    """Measured profile if available and requested, else the literature position weights.

    The measured profile comes from ``load_measured_profile`` (committed parquet
    first, raw Perry tables second), the same source the off-target engine's
    ``position_weights`` uses. Previously the parquet was ignored here.

    Args:
        use_measured: when False, always return the literature weights.

    Returns:
        A frame with ``position`` and ``rel_recombination`` (1 - protective weight),
        plus ``source`` and any other columns of the underlying profile.
    """
    if use_measured:
        measured = load_measured_profile()
        if not measured.empty:
            return (measured
                    .assign(rel_recombination=1 - measured["protective_weight"])
                    .drop(columns="protective_weight"))
    weights = protective_weights()
    return pd.DataFrame({"position": list(weights),
                         "rel_recombination": [1 - v for v in weights.values()],
                         "source": "literature_position_weights"})
123
+
124
+
125
def load_dms() -> pd.DataFrame:
    """Load Perry 2025 Table S3 - deep mutational scan (Position, Mutation, Z_Score_wrt_WT).

    Returns an empty frame with those three columns when the table is absent.
    """
    workbook = _perry("dms")
    if workbook is None:
        return pd.DataFrame(columns=["Position", "Mutation", "Z_Score_wrt_WT"])
    scan = pd.read_excel(workbook, sheet_name="L2FC_Relative_Z-Scores")
    # Rows whose Position is the literal "All" are excluded (presumably aggregate rows - TODO confirm).
    per_position = scan["Position"] != "All"
    return scan[per_position].copy()
132
+
133
+
134
def load_screen() -> pd.DataFrame:
    """Load Perry 2025 Table S1 - 72 bridge recombinase orthologs.

    Returns an empty frame with the expected columns when the table is absent.
    """
    workbook = _perry("orthologs")
    if workbook is not None:
        return pd.read_excel(workbook, sheet_name="Sheet1")
    expected = ["Name", "Recombinase_Sequence", "bRNA_Sequence", "Donor", "Target"]
    return pd.DataFrame(columns=expected)
@@ -0,0 +1,133 @@
1
+ """Genome-wide bridge-recombinase off-target engine (Phase 1.5, Step 1.5.2) - HEADLINE.
2
+
3
+ Given a bridge-RNA design's target core (bipartite ~14 nt with a central CT dinucleotide), scan hg38 for
4
+ pseudosites tolerating up to ~2 mismatches and score each by a position-weight model (some positions
5
+ tolerate substitutions, the central core does not). This is the clinical gatekeeper: it tells a designer
6
+ where else in the genome the recombinase might write.
7
+
8
+ Efficiency: the central core (CT) must match for recombination, so we **seed on the core dinucleotide**
9
+ and verify the surrounding 14-mer - bounding the scan without loading the genome into RAM (per-chromosome
10
+ via pysam). Scoring beats a naive Hamming ranking *because mismatch position matters*.
11
+
12
+ Also exposes ``predict_offtargets(writer_family, site, ...)`` - the summary entry the Phase-3 Planner
13
+ cargo step calls (so its off-target annotation is no longer "pending Phase 1.5").
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import re
18
+ from pathlib import Path
19
+
20
+ import pandas as pd
21
+
22
+ from pen_stack.bridge.ingest import load_measured_profile, load_profile_config, protective_weights
23
+
24
+ _COMP = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
25
+
26
+
27
def position_weights(prefer_measured: bool = True) -> dict[int, float]:
    """Protective weight per 0-based core position (1 = mismatch abolishes recombination).

    Uses the measured profile when ``prefer_measured`` is set and it is non-empty;
    otherwise the config weights. Both sources are 1-based, so positions shift by one.
    """
    if prefer_measured:
        measured = load_measured_profile()
        if not measured.empty:
            pairs = zip(measured["position"], measured["protective_weight"])
            return {int(pos) - 1: float(weight) for pos, weight in pairs}

    shifted = {}
    for pos, weight in protective_weights().items():
        shifted[pos - 1] = weight
    return shifted
38
+
39
+
40
def mismatches(window: str, core: str) -> list[tuple[int, str]]:
    """List ``(index, observed_base)`` for every position where ``window`` differs from ``core``.

    ``window`` must be at least as long as ``core``.
    """
    diffs = []
    for idx, expected in enumerate(core):
        observed = window[idx]
        if observed != expected:
            diffs.append((idx, observed))
    return diffs
42
+
43
+
44
def risk_score(mm: list[tuple[int, str]], weights: dict[int, float]) -> float:
    """Position-weighted off-target risk; a perfect match scores 1.0.

    Each mismatch multiplies the risk by ``1 - weight`` of its position. Positions
    missing from ``weights`` use 0.5.
    """
    risk = 1.0
    for position, _base in mm:
        risk *= 1 - weights.get(position, 0.5)
    return float(risk)
52
+
53
+
54
def hamming_risk(mm: list[tuple[int, str]], core_len: int) -> float:
    """Position-blind baseline: the fraction of core positions that match."""
    matched = core_len - len(mm)
    return float(matched / core_len)
57
+
58
+
59
def scan_sequence(seq: str, core: str, max_mm: int, weights: dict[int, float],
                  core_positions: list[int]) -> list[dict]:
    """Seed on the central core motif, then verify the full core with <= max_mm mismatches.

    Args:
        seq: sequence to scan; compared case-insensitively.
        core: target core; compared case-insensitively.
        max_mm: maximum number of mismatches for a window to be reported.
        weights: 0-based position -> protective weight, passed to ``risk_score``.
        core_positions: 0-based indices of the central motif within ``core``,
            treated as one contiguous run starting at ``core_positions[0]``.

    Returns:
        One dict per hit with ``pos`` (0-based window start), ``site``, ``n_mm``,
        ``risk`` and ``hamming``. Windows containing ``N`` are skipped. Only the
        given strand of ``seq`` is scanned.
    """
    seq = seq.upper()
    # `seq` is upper-cased, so a lower-case core would otherwise never seed a match.
    core = core.upper()
    core_len = len(core)
    seed_offset = core_positions[0]  # 0-based index of the core's first central base
    motif = core[seed_offset:seed_offset + len(core_positions)]  # e.g. 'CT'
    # Zero-width lookahead finds overlapping seeds; escape so the motif is matched literally.
    seed = re.compile(f"(?={re.escape(motif)})")

    hits = []
    for match in seed.finditer(seq):
        start = match.start() - seed_offset  # align so the motif sits at the core position
        if start < 0 or start + core_len > len(seq):
            continue
        window = seq[start:start + core_len]
        if "N" in window:
            continue
        mm = mismatches(window, core)
        if len(mm) <= max_mm:
            hits.append({"pos": start, "site": window, "n_mm": len(mm),
                         "risk": risk_score(mm, weights), "hamming": hamming_risk(mm, core_len)})
    return hits
79
+
80
+
81
def scan_offtargets(fasta: str | Path, target_core: str, chroms: list[str],
                    max_mm: int | None = None) -> pd.DataFrame:
    """Genome-wide off-target scan for a target core, one chromosome at a time.

    Args:
        fasta: path to an indexed FASTA readable by ``pysam.FastaFile``.
        target_core: core sequence to search for (upper-cased before scanning).
        chroms: contig names to fetch and scan.
        max_mm: mismatch limit; defaults to the profile config's ``max_mismatches``.

    Returns:
        One row per hit (``chrom`` plus the ``scan_sequence`` fields), sorted by
        ``risk`` descending. An empty frame when there are no hits.
    """
    from pysam import FastaFile

    cfg = load_profile_config()
    if max_mm is None:
        max_mm = cfg["max_mismatches"]
    core_pos = [p - 1 for p in cfg["central_core_positions"]]  # config is 1-based
    weights = position_weights()
    core = target_core.upper()

    rows = []
    fa = FastaFile(str(fasta))
    try:
        for chrom in chroms:
            hits = scan_sequence(fa.fetch(chrom), core, max_mm, weights, core_pos)
            rows.extend({"chrom": chrom, **hit} for hit in hits)
    finally:
        # Release the handle even when a fetch fails (e.g. an unknown contig name).
        fa.close()

    df = pd.DataFrame(rows)
    if df.empty:
        return df
    return df.sort_values("risk", ascending=False).reset_index(drop=True)
97
+
98
+
99
+ # ---------------------------------------------------------------- Phase-3 Planner hook + design API
100
+
101
def predict_offtargets(writer_family: str, site: tuple | None = None, target_core: str | None = None,
                       fasta: str | Path | None = None, chroms: list[str] | None = None,
                       top: int = 20) -> dict:
    """Off-target summary for a writer at a site - the entry the Phase-3 cargo step calls.

    Only the ``bridge_IS110`` and ``seek_IS1111`` families are scanned. With both a
    target core and a FASTA it returns a scan summary; otherwise it reports that the
    engine is ready and how to run the scan (it never fabricates off-target sites).

    Args:
        writer_family: writer family identifier.
        site: echoed back in the ``engine_ready`` response; not used for scanning.
        target_core: core sequence to scan for.
        fasta: path to the genome FASTA.
        chroms: contigs to scan. None or empty means chr1..22 plus chrX, so a
            "scanned" status never describes a scan over zero chromosomes.
        top: number of highest-risk hits included under ``top``.

    Returns:
        A dict with ``family`` and ``applicable``, plus either a ``note`` or the
        scan summary (``n_candidates``, ``n_exact_matches``, ``top``).
    """
    if writer_family not in {"bridge_IS110", "seek_IS1111"}:
        return {"family": writer_family, "applicable": False,
                "note": "off-target pseudosite scan applies to RNA-guided bridge/seek recombinases only"}
    if not (target_core and fasta):
        return {"family": writer_family, "applicable": True, "status": "engine_ready", "site": site,
                "note": "provide target_core + hg38 fasta (pen-bridge design) for a genome-wide scan"}

    # Same default chromosome set as the pen-bridge design pipeline.
    scan_chroms = chroms or [f"chr{i}" for i in range(1, 23)] + ["chrX"]
    df = scan_offtargets(fasta, target_core, scan_chroms)
    n_exact = int((df["n_mm"] == 0).sum()) if not df.empty else 0
    return {"family": writer_family, "applicable": True, "status": "scanned",
            "target_core": target_core, "n_candidates": int(len(df)),
            "n_exact_matches": n_exact,
            "top": df.head(top).to_dict("records") if not df.empty else []}
122
+
123
+
124
if __name__ == "__main__":  # pragma: no cover
    # Tiny self-test on a synthetic sequence: one exact site plus one whose central CT is replaced by GG.
    profile_cfg = load_profile_config()
    central = [p - 1 for p in profile_cfg["central_core_positions"]]
    pos_weights = position_weights()
    core = "AAACGTCTACGTTT"  # 14 nt, CT at positions 7-8 (0-based 6-7)
    disrupted = core[:6] + "GG" + core[8:]
    seq = "GGGG" + core + "TTTT" + disrupted + "AA"
    for hit in scan_sequence(seq, core, profile_cfg["max_mismatches"], pos_weights, central):
        print(hit)
@@ -0,0 +1,73 @@
1
+ """72-system bridge-recombinase ortholog characterisation (Phase 1.5, Step 1.5.4 secondary).
2
+
3
+ EXPLORATORY, descriptive only. The Perry 2025 Table S1 lists 72 bridge-recombinase orthologs with their
4
+ recombinase sequence, bRNA, donor and target, but it does NOT include a per-system human-cell activity
5
+ value, so a supervised ortholog-activity *predictor* cannot be trained from the public tables. Instead we
6
+ provide an honest, descriptive characterisation: sequence-feature summaries and a similarity ranking to the
7
+ one experimentally-validated standout (ISCro4). This is a *feature* (a way to organise the 72 systems),
8
+ not a method, and must not be read as an activity prediction.
9
+
10
+ N = 72 (small). Do not lean on this; it is a secondary, exploratory result with an explicit caveat.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ from collections import Counter
15
+
16
+ import pandas as pd
17
+
18
+ _AA = "ACDEFGHIKLMNPQRSTVWY"
19
+
20
+
21
def _kmer_vec(seq: str, k: int = 2) -> Counter:
    """Count overlapping k-mers in ``seq`` after dropping characters not in ``_AA``."""
    residues = [ch for ch in str(seq).upper() if ch in _AA]
    cleaned = "".join(residues)
    windows = (cleaned[start:start + k] for start in range(len(cleaned) - k + 1))
    return Counter(windows)
24
+
25
+
26
+ def _cosine(a: Counter, b: Counter) -> float:
27
+ keys = set(a) | set(b)
28
+ dot = sum(a[k] * b[k] for k in keys)
29
+ na = sum(v * v for v in a.values()) ** 0.5
30
+ nb = sum(v * v for v in b.values()) ** 0.5
31
+ return float(dot / (na * nb)) if na and nb else 0.0
32
+
33
+
34
def characterise(reference: str = "ISCro4") -> pd.DataFrame:
    """Describe the orthologs in Table S1: length plus 2-mer cosine similarity to ``reference``.

    Returns an empty frame when S1 is absent. When no row is named ``reference``,
    returns Name and length_aa with ``similarity_to_ref`` set to NaN. Otherwise the
    rows are sorted by similarity, highest first.
    """
    from pen_stack.bridge.ingest import load_screen

    table = load_screen()
    if table.empty:
        return pd.DataFrame()

    table = table.dropna(subset=["Recombinase_Sequence"]).copy()
    table["length_aa"] = table["Recombinase_Sequence"].str.len()

    is_ref = table["Name"].astype(str) == reference
    if not is_ref.any():
        return table[["Name", "length_aa"]].assign(similarity_to_ref=float("nan"),
                                                   reference=reference)

    # The first row matching the reference name supplies the comparison vector.
    ref_vec = _kmer_vec(table.loc[is_ref, "Recombinase_Sequence"].iloc[0])
    table["similarity_to_ref"] = [_cosine(_kmer_vec(sequence), ref_vec)
                                  for sequence in table["Recombinase_Sequence"]]
    table["reference"] = reference

    report = table[["Name", "length_aa", "similarity_to_ref", "reference"]]
    return report.sort_values("similarity_to_ref", ascending=False).reset_index(drop=True)
50
+
51
+
52
def summary(reference: str = "ISCro4") -> dict:
    """Summarise ``characterise(reference)`` as a JSON-friendly dict.

    Returns ``{"available": False, ...}`` when the characterisation is empty;
    otherwise system count, length range and median, the five entries most similar
    to the reference (excluding the reference itself), and a caveat.
    """
    table = characterise(reference)
    if table.empty:
        return {"available": False, "note": "Perry 2025 Table S1 not present"}

    lengths = table["length_aa"]
    others = table[table["Name"].astype(str) != reference]
    nearest = others.head(5)[["Name", "similarity_to_ref"]].round(3).to_dict("records")

    return {
        "available": True,
        "exploratory": True,
        "n_systems": int(len(table)),
        "reference": reference,
        "length_range_aa": [int(lengths.min()), int(lengths.max())],
        "median_length_aa": int(lengths.median()),
        "most_similar_to_ref": nearest,
        "caveat": "DESCRIPTIVE ONLY. Table S1 has no per-system activity label, so this is NOT an activity "
                  "predictor; it is a sequence-similarity organisation of the 72 systems relative to the one "
                  "validated standout (ISCro4). N=72 (small). Do not interpret similarity as predicted activity.",
    }
69
+
70
+
71
+ if __name__ == "__main__": # pragma: no cover
72
+ import json
73
+ print(json.dumps(summary(), indent=2, default=str))
@@ -0,0 +1,83 @@
1
+ """pen-bridge: design + assess a bridge-RNA (Phase 1.5, Step 1.5.5).
2
+
3
+ WRAPS the authoritative Arc BridgeRNADesigner (``bridgernadesigner``) - does not reimplement it - and adds
4
+ the PEN-STACK layer on top: genome-wide off-target prediction (1.5.2), fold + cross-loop QC (1.5.3), and
5
+ optional overlay with the Phase-1 safety layer (is an off-target in a dangerous locus?).
6
+
7
+ Graceful: if ``bridgernadesigner`` is absent, off-target + cross-loop still run on the user-supplied
8
+ target/donor cores; only the full scaffold sequence (for ViennaRNA folding) needs the designer.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ from pathlib import Path
13
+
14
+ from pen_stack.bridge.fold_qc import qc_verdict
15
+ from pen_stack.bridge.offtarget import scan_offtargets
16
+
17
+ # default hg38 locations (VM); overridable
18
+ _HG38_CANDIDATES = [
19
+ Path.home() / "cast-bench" / "data" / "external" / "genomes" / "hg38.fa",
20
+ Path("/work/data/external/genomes/hg38.fa"),
21
+ Path("data/external/genomes/hg38.fa"),
22
+ ]
23
+
24
+
25
def _hg38() -> Path | None:
    """Find an hg38 FASTA: ``$PEN_HG38`` if it exists, else the first existing default candidate."""
    import os

    override = os.environ.get("PEN_HG38")
    if override:
        override_path = Path(override)
        if override_path.exists():
            return override_path
    for candidate in _HG38_CANDIDATES:
        if candidate.exists():
            return candidate
    return None
31
+
32
+
33
def design_brna(target: str, donor: str, scaffold: str = "ISCro4_enhanced") -> dict:
    """Run the wrapped ``bridgernadesigner`` designer, or explain why it could not run.

    When the designer imports, returns the designed target/donor cores and bridge
    sequence with ``available=True``. Otherwise returns ``available=False`` with the
    upper-cased inputs and a note carrying the import error.
    """
    try:
        from bridgernadesigner.run import design_bridge_rna
    except Exception as e:  # noqa: BLE001
        unavailable = {"available": False, "target": target.upper(), "donor": donor.upper(),
                       "scaffold": scaffold}
        unavailable["note"] = f"bridgernadesigner not installed ({e}); pip install bridgernadesigner"
        return unavailable

    designed = design_bridge_rna(target, donor, scaffold)
    return {
        "available": True,
        "scaffold": scaffold,
        "target": designed.target,
        "donor": designed.donor,
        "bridge_sequence": designed.bridge_sequence,
    }
43
+
44
+
45
def design_and_assess(target: str, donor: str, scaffold: str = "ISCro4_enhanced",
                      chroms: list[str] | None = None, fasta: str | Path | None = None,
                      ct: str | None = None, scan: bool = True) -> dict:
    """Design a bridge RNA, run QC, and optionally scan the genome for off-targets.

    Returns a dict with ``brna``, ``offtargets``, ``qc`` and ``disclaimer``.
    ``offtargets["scanned"]`` is False when ``scan`` is off or no FASTA is found.
    When ``ct`` is given, the scan table is passed through the safety overlay.
    """
    brna = design_brna(target, donor, scaffold)
    tcore, dcore = brna["target"], brna["donor"]
    qc = qc_verdict(tcore, dcore, brna.get("bridge_sequence"))

    def _package(off: dict) -> dict:
        return {"brna": brna, "offtargets": off, "qc": qc,
                "disclaimer": "Decision-support only; predicted off-targets require experimental validation."}

    if not scan:
        return _package({"scanned": False})

    fa = Path(fasta) if fasta else _hg38()
    if not (fa and fa.exists()):
        return _package({"scanned": False,
                         "note": "hg38 fasta not found; set PEN_HG38 or pass fasta="})

    chroms = chroms or [f"chr{i}" for i in range(1, 23)] + ["chrX"]
    df = scan_offtargets(fa, tcore, chroms)
    if ct is not None:
        df = annotate_with_safety(df, ct)
    n_exact = int((df["n_mm"] == 0).sum()) if not df.empty else 0
    return _package({"scanned": True, "n_candidates": int(len(df)),
                     "n_exact": n_exact, "table": df})
70
+
71
+
72
def annotate_with_safety(off_df, ct: str):
    """Left-join each off-target with the ``safety`` column of the writability table for ``ct``.

    Rows are keyed on ``chrom`` and ``bin`` (``pos // 1000``). The overlay is
    best-effort: if anything fails, the input frame is returned unchanged.
    """
    if off_df.empty:
        return off_df
    try:
        from pen_stack.atlas.crosslink import load_writability

        safety_by_bin = load_writability(ct)[["chrom", "bin", "safety"]]
        binned = off_df.assign(bin=(off_df["pos"] // 1000).astype(int))
        return binned.merge(safety_by_bin, on=["chrom", "bin"], how="left")
    except Exception:  # noqa: BLE001 - safety overlay is optional
        return off_df