PyPI - pen-stack - Versions diffs - 3.1.0__py3-none-any.whl - Mend

pen-stack 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

pen_stack/__init__.py +2 -0
pen_stack/_resources.py +34 -0
pen_stack/adapt/__init__.py +14 -0
pen_stack/adapt/finetune.py +33 -0
pen_stack/adapt/ingest.py +86 -0
pen_stack/adapt/pipeline.py +101 -0
pen_stack/adapt/recalibrate.py +58 -0
pen_stack/adapt/report.py +130 -0
pen_stack/agent/__init__.py +1 -0
pen_stack/agent/guardrails.py +49 -0
pen_stack/agent/mcp_server.py +42 -0
pen_stack/agent/orchestrator.py +106 -0
pen_stack/agent/pen_agent.py +169 -0
pen_stack/agent/tools.py +130 -0
pen_stack/atlas/__init__.py +1 -0
pen_stack/atlas/build_wtkb.py +80 -0
pen_stack/atlas/crosslink.py +144 -0
pen_stack/atlas/expand.py +190 -0
pen_stack/atlas/schema.py +59 -0
pen_stack/atlas/scorecard.py +134 -0
pen_stack/atlas/universe.py +75 -0
pen_stack/atlas/variant_propose.py +155 -0
pen_stack/bridge/__init__.py +1 -0
pen_stack/bridge/activity.py +52 -0
pen_stack/bridge/cli.py +65 -0
pen_stack/bridge/fold_qc.py +53 -0
pen_stack/bridge/guide_qc.py +84 -0
pen_stack/bridge/ingest.py +139 -0
pen_stack/bridge/offtarget.py +133 -0
pen_stack/bridge/ortholog_screen.py +73 -0
pen_stack/bridge/pipeline.py +83 -0
pen_stack/cli.py +126 -0
pen_stack/data/__init__.py +1 -0
pen_stack/data/encode.py +84 -0
pen_stack/data/genome.py +71 -0
pen_stack/data/ingest_chromatin.py +119 -0
pen_stack/data/ingest_integration.py +112 -0
pen_stack/data/ingest_safety_annot.py +164 -0
pen_stack/data/ingest_trip.py +76 -0
pen_stack/mech/__init__.py +1 -0
pen_stack/mech/classify_atlas.py +71 -0
pen_stack/mech/whitelist.py +66 -0
pen_stack/monitor/__init__.py +1 -0
pen_stack/monitor/europepmc.py +32 -0
pen_stack/monitor/run.py +57 -0
pen_stack/monitor/triage.py +63 -0
pen_stack/planner/__init__.py +1 -0
pen_stack/planner/cargo.py +56 -0
pen_stack/planner/cargo_polish.py +146 -0
pen_stack/planner/delivery.py +32 -0
pen_stack/planner/multiplex.py +110 -0
pen_stack/planner/optimize.py +156 -0
pen_stack/planner/pipeline.py +86 -0
pen_stack/planner/report.py +26 -0
pen_stack/rag/__init__.py +1 -0
pen_stack/rag/index.py +53 -0
pen_stack/rag/llm.py +178 -0
pen_stack/rag/qa.py +105 -0
pen_stack/score/__init__.py +1 -0
pen_stack/score/recalibrate.py +77 -0
pen_stack/score/therapeutic.py +85 -0
pen_stack/server/__init__.py +1 -0
pen_stack/server/api.py +142 -0
pen_stack/ui/__init__.py +1 -0
pen_stack/ui/app.py +518 -0
pen_stack/validate/__init__.py +1 -0
pen_stack/validate/adapt_demo.py +69 -0
pen_stack/validate/agent_eval.py +117 -0
pen_stack/validate/blind_gsh_discovery.py +165 -0
pen_stack/validate/cargo_directionality.py +57 -0
pen_stack/validate/durability_baselines.py +150 -0
pen_stack/validate/forward_hypotheses.py +104 -0
pen_stack/validate/guide_qc_demo.py +58 -0
pen_stack/validate/intent_specification.py +82 -0
pen_stack/validate/paper3_benchmark.py +165 -0
pen_stack/validate/paper4_real_validation.py +144 -0
pen_stack/validate/paper4_validation.py +82 -0
pen_stack/validate/seq_vs_measured.py +134 -0
pen_stack/validate/within_locus_ranking.py +74 -0
pen_stack/validate/writer_recovery.py +86 -0
pen_stack/wgenome/__init__.py +1 -0
pen_stack/wgenome/chromatin_seq.py +83 -0
pen_stack/wgenome/durability.py +108 -0
pen_stack/wgenome/export_tracks.py +52 -0
pen_stack/wgenome/features.py +82 -0
pen_stack/wgenome/gsh_baseline.py +117 -0
pen_stack/wgenome/providers.py +245 -0
pen_stack/wgenome/safety.py +69 -0
pen_stack/wgenome/structure3d.py +168 -0
pen_stack/wgenome/writability.py +72 -0
pen_stack-3.1.0.dist-info/METADATA +451 -0
pen_stack-3.1.0.dist-info/RECORD +96 -0
pen_stack-3.1.0.dist-info/WHEEL +5 -0
pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
pen_stack-3.1.0.dist-info/top_level.txt +1 -0

pen_stack/bridge/cli.py ADDED Viewed

@@ -0,0 +1,65 @@
+"""pen-bridge CLI (Phase 1.5, Step 1.5.5) - the first public instrument of PEN-STACK.
+    pen-bridge design --target <14nt> --donor <14nt> [--scaffold ISCro4_enhanced] [--ct k562]
+Designs the bridge RNA (wrapped Arc designer) and reports off-target + fold/cross-loop QC.
+"""
+from __future__ import annotations
+import json
+import click
# Root command group: the subcommands below register themselves with @main.command().
@click.group()
def main():
    """pen-bridge - bridge-recombinase design + off-target/QC (PEN-STACK)."""
@main.command()
@click.option("--target", "-t", required=True, help="14 nt target core (DNA).")
@click.option("--donor", "-d", required=True, help="14 nt donor core (DNA).")
@click.option("--scaffold", "-s", default="ISCro4_enhanced",
              type=click.Choice(["IS621", "ISCro4_WT", "ISCro4_enhanced"]))
@click.option("--ct", default=None, help="Overlay Phase-1 safety for this cell type (k562/hepg2/hspc).")
@click.option("--no-scan", is_flag=True, help="Skip the genome-wide off-target scan (QC only).")
@click.option("--chroms", default=None, help="Comma-separated chroms to scan (default chr1..22,X).")
def design(target, donor, scaffold, ct, no_scan, chroms):
    """Design a bridge RNA and assess off-target + fold/cross-loop QC."""
    # Deferred import: the pipeline module is only loaded when this command actually runs.
    from pen_stack.bridge.pipeline import design_and_assess

    # Trim each entry and drop blanks so "chr1, chr2" or a trailing comma does not produce
    # contig names such as " chr2" or "". An empty result falls back to the pipeline default.
    chrom_list = [c.strip() for c in chroms.split(",") if c.strip()] if chroms else None
    res = design_and_assess(target, donor, scaffold, chroms=chrom_list or None, ct=ct,
                            scan=not no_scan)
    brna, off, qc = res["brna"], res["offtargets"], res["qc"]

    # --- design summary -------------------------------------------------------------------
    click.echo(f"Bridge RNA ({scaffold}): target={brna['target']} donor={brna['donor']}")
    if brna.get("available"):
        click.echo(f"  bridge_sequence: {brna['bridge_sequence'][:80]}... ({len(brna['bridge_sequence'])} nt)")
    else:
        # The designer was not importable; its note says why.
        click.echo(f"  (designer: {brna['note']})")

    # --- QC -------------------------------------------------------------------------------
    click.echo(f"QC: cross-loop {qc['cross_loop']}  pass={qc['pass']}")
    if "fold" in qc and qc["fold"].get("available"):
        click.echo(f"    fold MFE: {qc['fold']['mfe']}")

    # --- off-target table -----------------------------------------------------------------
    if off.get("scanned"):
        click.echo(f"Off-target: {off['n_candidates']} candidate pseudosites "
                   f"({off['n_exact']} exact); top by risk:")
        t = off["table"]
        # Only print columns that exist: "safety" is present only when the overlay succeeded,
        # and an empty scan yields a frame without columns.
        cols = [c for c in ["chrom", "pos", "site", "n_mm", "risk", "safety"] if c in t.columns]
        click.echo(t.head(10)[cols].to_string(index=False))
    else:
        click.echo(f"Off-target: {off.get('note', 'not scanned')}")
    click.echo(res["disclaimer"])
@main.command()
def profile():
    """Show the position-weight off-target profile (and its provenance)."""
    from pen_stack.bridge.ingest import load_profile_config

    cfg = load_profile_config()
    # Only this fixed subset of the config is printed, in this order.
    shown_keys = ("core_length", "central_core_positions", "max_mismatches",
                  "protective_weight", "provenance")
    payload = {key: cfg[key] for key in shown_keys}
    click.echo(json.dumps(payload, indent=2))
+if __name__ == "__main__":
+    main()

pen_stack/bridge/fold_qc.py ADDED Viewed

@@ -0,0 +1,53 @@
+"""Bridge-RNA fold / cross-loop QC (Phase 1.5, Step 1.5.3).
+Predict whether a designed bridge RNA folds correctly (ViennaRNA, in the VM image) and flag DBL-DBL /
+TBL-TBL self/cross-recombination risk from guide complementarity - an experimentally observed failure
+mode where the target- and donor-binding loops recombine with each other instead of the genome.
+``cross_loop_risk`` is pure-Python (no dependency); ``fold`` uses ViennaRNA and degrades gracefully when
+the package is absent (returns None) so the rest of the QC still runs.
+"""
+from __future__ import annotations
+_PAIR = {"A": "U", "U": "A", "G": "C", "C": "G", "T": "A"}
+def fold(scaffold_seq: str) -> dict:
+    """MFE fold of the bridge-RNA scaffold. Returns {structure, mfe} or {available: False}."""
+    try:
+        import RNA
+    except Exception:  # noqa: BLE001 - ViennaRNA only in the VM image
+        return {"available": False, "note": "ViennaRNA not installed (runs in the VM image)"}
+    fc = RNA.fold_compound(scaffold_seq.upper().replace("T", "U"))
+    struct, mfe = fc.mfe()
+    return {"available": True, "structure": struct, "mfe": round(float(mfe), 2),
+            "length": len(scaffold_seq)}
+def _complementarity(a: str, b: str) -> float:
+    """Fraction of positions where a pairs with the reverse-complement of b (crude antiparallel match)."""
+    a = a.upper()
+    b_rc = "".join(_PAIR.get(x, "N") for x in reversed(b.upper()))
+    n = min(len(a), len(b_rc))
+    if n == 0:
+        return 0.0
+    return sum(1 for x, y in zip(a[:n], b_rc[:n]) if x == y) / n
def cross_loop_risk(target_guide: str, donor_guide: str) -> dict:
    """Self/cross complementarity of the binding loops. High values predict unintended recombination.

    Returns ``{"tbl_self", "dbl_self", "tbl_dbl"}``, each rounded to 3 decimals: target vs itself,
    donor vs itself, and target vs donor.
    """
    pairings = {
        "tbl_self": (target_guide, target_guide),
        "dbl_self": (donor_guide, donor_guide),
        "tbl_dbl": (target_guide, donor_guide),
    }
    return {label: round(_complementarity(left, right), 3)
            for label, (left, right) in pairings.items()}
def qc_verdict(target_guide: str, donor_guide: str, scaffold_seq: str | None = None,
               cross_loop_threshold: float = 0.6) -> dict:
    """Combined fold + cross-loop verdict for a design.

    ``cross_loop_flags`` lists every cross-loop metric at or above ``cross_loop_threshold``, and
    ``pass`` is True only when that list is empty. A ``fold`` entry is added only when a non-empty
    ``scaffold_seq`` is supplied.
    """
    risk = cross_loop_risk(target_guide, donor_guide)
    flagged = [metric for metric, value in risk.items() if value >= cross_loop_threshold]
    verdict = {"cross_loop": risk, "cross_loop_flags": flagged, "pass": not flagged}
    if scaffold_seq:
        verdict["fold"] = fold(scaffold_seq)
    return verdict

pen_stack/bridge/guide_qc.py ADDED Viewed

@@ -0,0 +1,84 @@
+"""Bridge-RNA guide ranking / QC layer (v3.1, WS-G2).
+Wraps a bridge-RNA design: when a default guide design trips a QC flag - self-complementarity, cross-loop
+(TBL-DBL) recombination, poor scaffold fold (MFE), or off-target - this enumerates candidate variants and
+RANKS them by the existing fold-QC (`bridge/fold_qc.py`) plus off-target risk (`bridge/offtarget.py`).
+This is a RANKING layer, not validated design: it retrospectively down-ranks known-bad guides; it makes NO
+claim of generating superior novel guides. It reuses the validated QC primitives so the score is grounded.
+"""
+from __future__ import annotations
+from pen_stack.bridge import fold_qc
+_PAIR = {"A": "T", "T": "A", "G": "C", "C": "G", "U": "A"}
+def _revcomp(s: str) -> str:
+    return "".join(_PAIR.get(b, "N") for b in reversed(s.upper()))
def qc_flags(target_guide: str, donor_guide: str, scaffold_seq: str | None = None,
             offtarget_count: int | None = None, cross_loop_threshold: float = 0.6,
             mfe_per_nt_warn: float = -0.5) -> dict:
    """Tripped QC flags for one design. Pure-python except the optional ViennaRNA fold (degrades).

    Returns ``{cross_loop, fold, offtarget_count, flags, pass}``; ``pass`` is True when no flag
    tripped. Flags are appended in a fixed order: ``self_complementarity``,
    ``cross_loop_recombination``, ``poor_fold_mfe``, ``off_target``.
    """
    risk = fold_qc.cross_loop_risk(target_guide, donor_guide)
    tripped = []

    # Either loop pairing with itself trips the same flag.
    if any(risk[key] >= cross_loop_threshold for key in ("tbl_self", "dbl_self")):
        tripped.append("self_complementarity")
    if risk["tbl_dbl"] >= cross_loop_threshold:
        tripped.append("cross_loop_recombination")

    # Without a scaffold there is nothing to fold; use the same "unavailable" shape fold() returns.
    if scaffold_seq:
        folded = fold_qc.fold(scaffold_seq)
    else:
        folded = {"available": False}
    # NOTE(review): the flag fires when MFE per nt is *below* ``mfe_per_nt_warn`` (more negative),
    # while its name suggests a weak fold - confirm the intended direction.
    if folded.get("available") and folded["length"] and (folded["mfe"] / folded["length"]) < mfe_per_nt_warn:
        tripped.append("poor_fold_mfe")

    if offtarget_count is not None and offtarget_count > 0:
        tripped.append("off_target")

    return {"cross_loop": risk, "fold": folded, "offtarget_count": offtarget_count,
            "flags": tripped, "pass": not tripped}
def qc_score(target_guide: str, donor_guide: str, scaffold_seq: str | None = None,
             offtarget_count: int | None = None) -> float:
    """Combined QC quality in [0,1] (HIGHER = safer): penalize cross-loop complementarity, weak scaffold
    fold, and off-targets. Used only to RANK candidate guides, not to certify them.

    The score starts at ``1 - max(cross-loop metrics)``, loses up to 0.3 for the fold term and up
    to 0.4 for off-targets (0.1 each), and is then clamped to [0, 1] and rounded to 4 decimals.
    """
    risk = fold_qc.cross_loop_risk(target_guide, donor_guide)
    worst_pairing = max(risk["tbl_self"], risk["dbl_self"], risk["tbl_dbl"])
    score = 1.0 - worst_pairing     # cross-loop is the dominant penalty

    if scaffold_seq:
        folded = fold_qc.fold(scaffold_seq)
        if folded.get("available") and folded["length"]:
            # NOTE(review): the original comment says this penalizes "too-weak structure", but the
            # term is positive only when MFE per nt is below -0.35 (more negative) - confirm intent.
            score -= min(0.3, max(0.0, -0.35 - folded["mfe"] / folded["length"]))

    if offtarget_count:
        score -= min(0.4, 0.1 * offtarget_count)

    clamped = max(0.0, min(1.0, score))
    return round(clamped, 4)
def rank_variants(variants: list[dict]) -> list[dict]:
    """Rank guide variants by QC score (best first). Each variant: {name, target_guide, donor_guide,
    optional scaffold_seq, optional offtarget_count}.

    Each returned row holds ``name`` (only if the variant had one), ``qc_score`` and ``flags``.
    The sort is stable, so ties keep their input order.
    """
    def _assess(variant: dict) -> dict:
        # The same four arguments feed both the score and the flag list.
        args = (variant["target_guide"], variant["donor_guide"],
                variant.get("scaffold_seq"), variant.get("offtarget_count"))
        row = {"name": variant["name"]} if "name" in variant else {}
        row["qc_score"] = qc_score(*args)
        row["flags"] = qc_flags(*args)["flags"]
        return row

    rows = [_assess(variant) for variant in variants]
    rows.sort(key=lambda row: row["qc_score"], reverse=True)
    return rows
def screen_and_rank(default: dict, variants: list[dict] | None = None) -> dict:
    """If the default design trips a flag, rank the provided variants by QC and recommend the best.
    `variants` are caller-supplied (e.g. from bridgernadesigner enumeration); if absent, only the default's
    QC verdict is returned. No novel-guide generation is claimed.
    """
    verdict = qc_flags(default["target_guide"], default["donor_guide"],
                       default.get("scaffold_seq"), default.get("offtarget_count"))
    passed = verdict["pass"]
    out = {"default_flags": verdict["flags"], "default_pass": passed}

    # A clean default needs no alternatives.
    if passed:
        out["ranked"] = []
        out["recommended"] = "default (no flags)"
        return out

    # The default failed but the caller supplied nothing to rank.
    if not variants:
        out["ranked"] = []
        out["recommended"] = None
        return out

    ranked = rank_variants(variants)
    out["ranked"] = ranked
    out["recommended"] = ranked[0] if ranked else None
    return out

pen_stack/bridge/ingest.py ADDED Viewed

@@ -0,0 +1,139 @@
+"""Acquire / load the bridge-recombinase training data (Phase 1.5, Step 1.5.1).
+Three tables supervise the engine: the measured **off-target profile** (per-position mismatch tolerance),
+the **DMS** (variant->activity), and the **72-system human-cell activity screen**. The Perry 2025
+supplementary (Science adz0276) is paywalled and not bulk-downloadable from the build environment; the
+loaders below read the real tables when supplied, and otherwise fall back to the literature-grounded
+position-weight profile (`configs/bridge_offtarget_profile.yaml`) so the engine runs end-to-end.
+Outputs (when real tables are present): features/bridge_offtarget_profile.parquet, bridge_dms.parquet,
+bridge_screen.parquet.
+"""
+from __future__ import annotations
+import os
+from functools import lru_cache
+from pathlib import Path
+import pandas as pd
+import yaml
# Directory that holds the ``pen_stack`` package (two levels above this module's directory).
# NOTE(review): in an installed wheel this is the install directory (e.g. site-packages) -
# confirm that ``configs/`` and ``data/`` are actually shipped there.
_ROOT = Path(__file__).resolve().parents[2]
_CFG = _ROOT.joinpath("configs", "bridge_offtarget_profile.yaml")
# Perry 2025 supplementary (Science adz0276) - copyrighted; kept LOCAL, never committed/redistributed.
# Default location: Final_Part_v3.0/Perry_et_al/ (override with PEN_PERRY_DIR).
_PERRY_FILES = {
    "orthologs": "science.adz0276_table_s1.xlsx",          # S1: 72 bridge recombinase orthologs
    "offtargets": "science.adz0276_table_s2.xlsx",         # S2: genome-wide insertion sites (off-targets)
    "dms": "science.adz0276_table_s3.xlsx",                # S3: deep mutational scan
}


def perry_dir() -> Path | None:
    """Return the first existing location of the Perry tables, or None.

    ``PEN_PERRY_DIR`` (when set and non-empty) is tried first, then ``Perry_et_al`` next to
    ``_ROOT``. Only existence is checked.
    """
    candidates = []
    override = os.environ.get("PEN_PERRY_DIR")
    if override:
        candidates.append(Path(override))
    candidates.append(_ROOT.parent / "Perry_et_al")
    return next((cand for cand in candidates if cand.exists()), None)
def _perry(name: str) -> Path | None:
    """Path of the Perry table registered under ``name`` in ``_PERRY_FILES``.

    Returns None when no Perry directory is found or the file is not in it.
    """
    base = perry_dir()
    if base is not None:
        candidate = base / _PERRY_FILES[name]
        if candidate.exists():
            return candidate
    return None
@lru_cache(maxsize=1)
def load_profile_config(path: str | Path = _CFG) -> dict:
    """Parse the YAML profile at ``path`` (read as UTF-8) with ``yaml.safe_load``.

    The result for the most recently used ``path`` is memoised, so repeated calls return the same
    object.
    """
    text = Path(path).read_text(encoding="utf-8")
    return yaml.safe_load(text)
def protective_weights() -> dict[int, float]:
    """Per-position protective weight (1 = mismatch abolishes recombination; 0 = fully tolerated).

    Read from the ``protective_weight`` mapping of the profile config; keys are coerced to int and
    values to float.
    """
    raw = load_profile_config()["protective_weight"]
    weights: dict[int, float] = {}
    for position, weight in raw.items():
        weights[int(position)] = float(weight)
    return weights
def load_insertion_sites() -> pd.DataFrame:
    """Perry 2025 Table S2 - measured genome-wide insertion sites (on- + off-target). Empty if absent.
    Columns include Intended_Site_Name, Plasmid_Encoded_Sequence (the intended 14-nt target),
    Insertion_Site, Insertion_Site_Sequence (measured 14-nt), UMI_Count, %_of_Insertions, On-Target.
    """
    workbook = _perry("offtargets")
    if workbook is None:
        return pd.DataFrame()
    sites = pd.read_excel(workbook, sheet_name="Genome Wide Insertion Sites")
    # Rows missing either sequence cannot be compared position by position, so they are dropped.
    required = ["Insertion_Site_Sequence", "Plasmid_Encoded_Sequence"]
    return sites.dropna(subset=required)
_MEASURED_PARQUET = _ROOT.joinpath("data", "curated", "bridge_offtarget_profile_measured.parquet")


def load_measured_profile() -> pd.DataFrame:
    """The MEASURED per-position profile. Prefers the committed derived parquet (available everywhere via
    git); otherwise re-derives from the raw Perry tables (local only). Empty if neither is present."""
    if not _MEASURED_PARQUET.exists():
        # No parquet on disk: fall back to recomputing from the raw tables.
        return derive_measured_profile()
    return pd.read_parquet(_MEASURED_PARQUET)
def derive_measured_profile() -> pd.DataFrame:
    """Per-position protective weight derived from the MEASURED off-targets (UMI-weighted conservation).
    Among real off-targets (which recombined despite mismatches), positions that stay matched are the
    specificity determinants (high protective weight); frequently-mismatched positions are tolerant.
    Returns cols: position(1-based), conservation, protective_weight, source, n_offtargets.
    Empty if the Perry data is absent or contains no usable off-target rows.
    """
    s2 = load_insertion_sites()
    if s2.empty:
        return pd.DataFrame()
    core_len = 14
    # Keep off-target rows whose measured and intended sequences are both full 14-mers.
    off = s2[(s2["On-Target"] == False) &  # noqa: E712
             (s2["Insertion_Site_Sequence"].str.len() == core_len) &
             (s2["Plasmid_Encoded_Sequence"].str.len() == core_len)]
    match = [0.0] * core_len
    tot = 0.0
    for seq, intended, umi in zip(off["Insertion_Site_Sequence"], off["Plasmid_Encoded_Sequence"],
                                  off["UMI_Count"]):
        if pd.isna(umi):
            # A missing UMI count carries no weight; adding NaN would poison every position.
            continue
        w = float(umi)
        for j, (observed, expected) in enumerate(zip(seq, intended)):
            if observed == expected:
                match[j] += w
        tot += w
    if not tot > 0:
        # No weighted off-targets survived the filters: return the same empty sentinel callers
        # already test with ``.empty`` instead of dividing by zero.
        return pd.DataFrame()
    cons = [m / tot for m in match]
    return pd.DataFrame({"position": list(range(1, core_len + 1)), "conservation": cons,
                         "protective_weight": cons, "source": "perry2025_table_s2_measured",
                         "n_offtargets": len(off)})
def load_offtarget_profile(use_measured: bool = True) -> pd.DataFrame:
    """Measured profile (Perry S2) if available and requested, else the literature position weights.

    The measured profile is obtained through ``load_measured_profile()``, which prefers the
    derived parquet and only re-derives from the raw tables when it is missing. The result
    replaces ``protective_weight`` with ``rel_recombination = 1 - protective_weight``.
    """
    if use_measured:
        measured = load_measured_profile()
        if not measured.empty:
            rel = 1 - measured["protective_weight"]
            return measured.assign(rel_recombination=rel).drop(columns="protective_weight")
    # Literature fallback: positions are the config's own keys.
    weights = protective_weights()
    return pd.DataFrame({"position": list(weights),
                         "rel_recombination": [1 - v for v in weights.values()],
                         "source": "literature_position_weights"})
def load_dms() -> pd.DataFrame:
    """Perry 2025 Table S3 - deep mutational scan (Position, Mutation, Z_Score_wrt_WT). Empty if absent."""
    workbook = _perry("dms")
    if workbook is None:
        # Keep the expected column names even when there is no data.
        return pd.DataFrame(columns=["Position", "Mutation", "Z_Score_wrt_WT"])
    scan = pd.read_excel(workbook, sheet_name="L2FC_Relative_Z-Scores")
    # Rows whose Position is the literal "All" are excluded.
    per_position = scan["Position"] != "All"
    return scan[per_position].copy()
def load_screen() -> pd.DataFrame:
    """Perry 2025 Table S1 - 72 bridge recombinase orthologs (Name, sequences, Target, Donor). Empty if absent."""
    workbook = _perry("orthologs")
    if workbook is not None:
        return pd.read_excel(workbook, sheet_name="Sheet1")
    # No table on disk: return an empty frame that still carries the expected columns.
    expected = ["Name", "Recombinase_Sequence", "bRNA_Sequence", "Donor", "Target"]
    return pd.DataFrame(columns=expected)

pen_stack/bridge/offtarget.py ADDED Viewed

@@ -0,0 +1,133 @@
+"""Genome-wide bridge-recombinase off-target engine (Phase 1.5, Step 1.5.2) - HEADLINE.
+Given a bridge-RNA design's target core (bipartite ~14 nt with a central CT dinucleotide), scan hg38 for
+pseudosites tolerating up to ~2 mismatches and score each by a position-weight model (some positions
+tolerate substitutions, the central core does not). This is the clinical gatekeeper: it tells a designer
+where else in the genome the recombinase might write.
+Efficiency: the central core (CT) must match for recombination, so we **seed on the core dinucleotide**
+and verify the surrounding 14-mer - bounding the scan without loading the genome into RAM (per-chromosome
+via pysam). Scoring beats a naive Hamming ranking *because mismatch position matters*.
+Also exposes ``predict_offtargets(writer_family, site, ...)`` - the summary entry the Phase-3 Planner
+cargo step calls (so its off-target annotation is no longer "pending Phase 1.5").
+"""
+from __future__ import annotations
+import re
+from pathlib import Path
+import pandas as pd
+from pen_stack.bridge.ingest import load_measured_profile, load_profile_config, protective_weights
+_COMP = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
def position_weights(prefer_measured: bool = True) -> dict[int, float]:
    """0-based protective weight per core position (1 = mismatch abolishes recombination).
    Prefers the MEASURED Perry-2025 profile (committed parquet, available everywhere) when present;
    otherwise the literature-grounded config weights.
    """
    if prefer_measured:
        measured = load_measured_profile()
        if not measured.empty:
            weights: dict[int, float] = {}
            # The profile stores 1-based positions; shift them to 0-based indices.
            for position, weight in zip(measured["position"], measured["protective_weight"]):
                weights[int(position) - 1] = float(weight)
            return weights
    # Config weights are keyed 1-based as well.
    return {position - 1: weight for position, weight in protective_weights().items()}
def mismatches(window: str, core: str) -> list[tuple[int, str]]:
    """List ``(index, window_base)`` for every position of ``core`` where ``window`` differs.

    ``window`` must be at least as long as ``core``; a shorter window raises IndexError.
    """
    found = []
    for j, expected in enumerate(core):
        observed = window[j]
        if observed != expected:
            found.append((j, observed))
    return found
def risk_score(mm: list[tuple[int, str]], weights: dict[int, float]) -> float:
    """Fewer / weaker-position mismatches -> higher off-target risk. Perfect match -> 1.0.

    The score is the product of ``1 - weight`` over the mismatched positions; a position with no
    entry in ``weights`` uses 0.5.
    """
    risk = 1.0
    for position, _base in mm:
        risk *= 1 - weights.get(position, 0.5)
    return float(risk)
def hamming_risk(mm: list[tuple[int, str]], core_len: int) -> float:
    """Naive baseline: position-blind - risk decreases uniformly with mismatch count."""
    matched = core_len - len(mm)
    return float(matched / core_len)
def scan_sequence(seq: str, core: str, max_mm: int, weights: dict[int, float],
                  core_positions: list[int]) -> list[dict]:
    """Seed on the central core dinucleotide, verify the full core with <= max_mm mismatches.

    ``core_positions`` are 0-based indices into ``core``; the seed is the slice of ``core`` that
    starts at the first of them and is as long as the list. Both ``seq`` and ``core`` are
    upper-cased, so lower-case or soft-masked input is compared like with like. The seed is
    matched as literal text, not as a regex. Only the given strand of ``seq`` is scanned.

    Returns one dict per hit: ``pos`` (0-based start in ``seq``), ``site``, ``n_mm``, ``risk``,
    ``hamming``. Windows that contain "N" or run off either end of ``seq`` are skipped.
    """
    seq = seq.upper()
    core = core.upper()
    core_len = len(core)
    c0 = core_positions[0]                      # 0-based index of the core's first central base
    motif = core[c0:c0 + len(core_positions)]   # e.g. 'CT'
    hits = []
    at = seq.find(motif)
    while at != -1:
        start = at - c0                         # align so the motif sits at the core position
        if start >= 0 and start + core_len <= len(seq):
            window = seq[start:start + core_len]
            if "N" not in window:
                mm = mismatches(window, core)
                if len(mm) <= max_mm:
                    hits.append({"pos": start, "site": window, "n_mm": len(mm),
                                 "risk": risk_score(mm, weights),
                                 "hamming": hamming_risk(mm, core_len)})
        # Resume one base later so overlapping seed matches are still visited.
        at = seq.find(motif, at + 1)
    return hits
def scan_offtargets(fasta: str | Path, target_core: str, chroms: list[str],
                    max_mm: int | None = None) -> pd.DataFrame:
    """Genome-wide off-target scan for a target core. Per-chromosome (memory-bounded).

    ``max_mm`` defaults to the profile config's ``max_mismatches``. Returns one row per hit
    (``chrom`` plus the fields produced by ``scan_sequence``), sorted by ``risk`` descending, or an
    empty frame when nothing matched. The FASTA handle is closed even if a fetch or scan raises.
    """
    from pysam import FastaFile

    cfg = load_profile_config()
    max_mm = cfg["max_mismatches"] if max_mm is None else max_mm
    # The config stores 1-based core positions; scan_sequence expects 0-based indices.
    core_pos = [p - 1 for p in cfg["central_core_positions"]]
    weights = position_weights()
    core = target_core.upper()
    rows = []
    fa = FastaFile(str(fasta))
    try:
        for c in chroms:
            rows.extend({"chrom": c, **h}
                        for h in scan_sequence(fa.fetch(c), core, max_mm, weights, core_pos))
    finally:
        fa.close()
    df = pd.DataFrame(rows)
    if df.empty:
        return df
    return df.sort_values("risk", ascending=False).reset_index(drop=True)
+# ---------------------------------------------------------------- Phase-3 Planner hook + design API
def predict_offtargets(writer_family: str, site: tuple | None = None, target_core: str | None = None,
                       fasta: str | Path | None = None, chroms: list[str] | None = None,
                       top: int = 20) -> dict:
    """Off-target summary for a writer at a site - the entry the Phase-3 cargo step calls.
    Only bridge/seek families are RNA-guided pseudosite-scannable. If a genome + target core are
    available it returns a real genome-wide scan summary; otherwise it reports the engine is ready and
    how to run the full scan (never fabricates off-target sites).

    When ``chroms`` is omitted or empty the scan covers chr1..chr22 and chrX. Scanning an empty
    list would otherwise report "scanned" with zero candidates without reading any sequence.
    """
    if writer_family not in {"bridge_IS110", "seek_IS1111"}:
        return {"family": writer_family, "applicable": False,
                "note": "off-target pseudosite scan applies to RNA-guided bridge/seek recombinases only"}
    if not (target_core and fasta):
        return {"family": writer_family, "applicable": True, "status": "engine_ready", "site": site,
                "note": "provide target_core + hg38 fasta (pen-bridge design) for a genome-wide scan"}
    scan_chroms = chroms or [f"chr{i}" for i in range(1, 23)] + ["chrX"]
    df = scan_offtargets(fasta, target_core, scan_chroms)
    # An empty frame has no "n_mm" column, so guard before indexing it.
    n_exact = int((df["n_mm"] == 0).sum()) if not df.empty else 0
    return {"family": writer_family, "applicable": True, "status": "scanned",
            "target_core": target_core, "n_candidates": int(len(df)),
            "n_exact_matches": n_exact,
            "top": df.head(top).to_dict("records") if not df.empty else []}
if __name__ == "__main__":  # pragma: no cover
    # tiny self-test on a synthetic sequence
    cfg = load_profile_config()
    central = [p - 1 for p in cfg["central_core_positions"]]
    weights = position_weights()
    demo_core = "AAACGTCTACGTTT"   # 14 nt, CT at positions 7-8 (0-based 6-7)
    # Second copy has its central dinucleotide replaced by "GG".
    disrupted = demo_core[:6] + "GG" + demo_core[8:]
    demo_seq = "".join(["GGGG", demo_core, "TTTT", disrupted, "AA"])  # one exact + one core-disrupted
    for hit in scan_sequence(demo_seq, demo_core, cfg["max_mismatches"], weights, central):
        print(hit)

pen_stack/bridge/ortholog_screen.py ADDED Viewed

@@ -0,0 +1,73 @@
+"""72-system bridge-recombinase ortholog characterisation (Phase 1.5, Step 1.5.4 secondary).
+EXPLORATORY, descriptive only. The Perry 2025 Table S1 lists 72 bridge-recombinase orthologs with their
+recombinase sequence, bRNA, donor and target, but it does NOT include a per-system human-cell activity
+value, so a supervised ortholog-activity *predictor* cannot be trained from the public tables. Instead we
+provide an honest, descriptive characterisation: sequence-feature summaries and a similarity ranking to the
+one experimentally-validated standout (ISCro4). This is a *feature* (a way to organise the 72 systems),
+not a method, and must not be read as an activity prediction.
+N = 72 (small). Do not lean on this; it is a secondary, exploratory result with an explicit caveat.
+"""
+from __future__ import annotations
+from collections import Counter
+import pandas as pd
+_AA = "ACDEFGHIKLMNPQRSTVWY"
+def _kmer_vec(seq: str, k: int = 2) -> Counter:
+    seq = "".join(c for c in str(seq).upper() if c in _AA)
+    return Counter(seq[i:i + k] for i in range(len(seq) - k + 1))
+def _cosine(a: Counter, b: Counter) -> float:
+    keys = set(a) | set(b)
+    dot = sum(a[k] * b[k] for k in keys)
+    na = sum(v * v for v in a.values()) ** 0.5
+    nb = sum(v * v for v in b.values()) ** 0.5
+    return float(dot / (na * nb)) if na and nb else 0.0
def characterise(reference: str = "ISCro4") -> pd.DataFrame:
    """Describe the 72 orthologs: length + 2-mer cosine similarity to the reference (ISCro4). Empty if S1 absent."""
    from pen_stack.bridge.ingest import load_screen

    table = load_screen()
    if table.empty:
        return pd.DataFrame()
    table = table.dropna(subset=["Recombinase_Sequence"]).copy()
    table["length_aa"] = table["Recombinase_Sequence"].str.len()

    is_reference = table["Name"].astype(str) == reference
    if not is_reference.any():
        # The reference is not in the table: report lengths only, with NaN similarity.
        return table[["Name", "length_aa"]].assign(similarity_to_ref=float("nan"), reference=reference)

    # If several rows carry the reference name, the first one is used.
    ref_vec = _kmer_vec(table[is_reference].iloc[0]["Recombinase_Sequence"])
    table["similarity_to_ref"] = table["Recombinase_Sequence"].apply(
        lambda sequence: _cosine(_kmer_vec(sequence), ref_vec))
    table["reference"] = reference
    columns = ["Name", "length_aa", "similarity_to_ref", "reference"]
    ranked = table[columns].sort_values("similarity_to_ref", ascending=False)
    return ranked.reset_index(drop=True)
def summary(reference: str = "ISCro4") -> dict:
    """JSON-friendly digest of ``characterise(reference)``.

    Returns ``{"available": False, "note"}`` when the characterisation is empty. Otherwise it
    reports the system count, the length range and median, and the five rows with the highest
    similarity to the reference (excluding the reference itself), plus a fixed caveat.
    """
    df = characterise(reference)
    if df.empty:
        return {"available": False, "note": "Perry 2025 Table S1 not present"}

    lengths = df["length_aa"]
    # characterise() already sorts by similarity, so head(5) is the top five.
    others = df[df["Name"].astype(str) != reference]
    nearest = others.head(5)[["Name", "similarity_to_ref"]].round(3).to_dict("records")
    return {
        "available": True,
        "exploratory": True,
        "n_systems": int(len(df)),
        "reference": reference,
        "length_range_aa": [int(lengths.min()), int(lengths.max())],
        "median_length_aa": int(lengths.median()),
        "most_similar_to_ref": nearest,
        "caveat": "DESCRIPTIVE ONLY. Table S1 has no per-system activity label, so this is NOT an activity "
                  "predictor; it is a sequence-similarity organisation of the 72 systems relative to the one "
                  "validated standout (ISCro4). N=72 (small). Do not interpret similarity as predicted activity.",
    }
if __name__ == "__main__":  # pragma: no cover
    import json

    # default=str stringifies any value json cannot encode natively.
    report = summary()
    print(json.dumps(report, indent=2, default=str))

pen_stack/bridge/pipeline.py ADDED Viewed

@@ -0,0 +1,83 @@
+"""pen-bridge: design + assess a bridge-RNA (Phase 1.5, Step 1.5.5).
+WRAPS the authoritative Arc BridgeRNADesigner (``bridgernadesigner``) - does not reimplement it - and adds
+the PEN-STACK layer on top: genome-wide off-target prediction (1.5.2), fold + cross-loop QC (1.5.3), and
+optional overlay with the Phase-1 safety layer (is an off-target in a dangerous locus?).
+Graceful: if ``bridgernadesigner`` is absent, off-target + cross-loop still run on the user-supplied
+target/donor cores; only the full scaffold sequence (for ViennaRNA folding) needs the designer.
+"""
+from __future__ import annotations
+from pathlib import Path
+from pen_stack.bridge.fold_qc import qc_verdict
+from pen_stack.bridge.offtarget import scan_offtargets
+# default hg38 locations (VM); overridable
+_HG38_CANDIDATES = [
+    Path.home() / "cast-bench" / "data" / "external" / "genomes" / "hg38.fa",
+    Path("/work/data/external/genomes/hg38.fa"),
+    Path("data/external/genomes/hg38.fa"),
+]
+def _hg38() -> Path | None:
+    import os
+    env = os.environ.get("PEN_HG38")
+    if env and Path(env).exists():
+        return Path(env)
+    return next((p for p in _HG38_CANDIDATES if p.exists()), None)
def design_brna(target: str, donor: str, scaffold: str = "ISCro4_enhanced") -> dict:
    """Call the wrapped Arc designer. Returns the bridge sequence + cores, or a graceful note.

    When ``bridgernadesigner`` cannot be imported the result has ``available: False`` and echoes
    the upper-cased ``target``/``donor`` so downstream QC can still use them.
    """
    try:
        from bridgernadesigner.run import design_bridge_rna
    except Exception as e:  # noqa: BLE001
        note = f"bridgernadesigner not installed ({e}); pip install bridgernadesigner"
        return {"available": False, "target": target.upper(), "donor": donor.upper(),
                "scaffold": scaffold, "note": note}

    designed = design_bridge_rna(target, donor, scaffold)
    return {
        "available": True,
        "scaffold": scaffold,
        "target": designed.target,
        "donor": designed.donor,
        "bridge_sequence": designed.bridge_sequence,
    }
def design_and_assess(target: str, donor: str, scaffold: str = "ISCro4_enhanced",
                      chroms: list[str] | None = None, fasta: str | Path | None = None,
                      ct: str | None = None, scan: bool = True) -> dict:
    """End-to-end: design (wrapped) -> off-target + fold/cross-loop QC -> optional safety overlay.

    Returns ``{brna, offtargets, qc, disclaimer}``. ``offtargets["scanned"]`` is False when
    ``scan`` is off or no FASTA is found; in the latter case a ``note`` explains how to supply one.
    """
    brna = design_brna(target, donor, scaffold)
    target_core = brna["target"]
    donor_core = brna["donor"]
    # bridge_sequence is absent when the designer is unavailable; qc_verdict then skips the fold.
    qc = qc_verdict(target_core, donor_core, brna.get("bridge_sequence"))

    offtargets = {"scanned": False}
    if scan:
        genome = Path(fasta) if fasta else _hg38()
        if not (genome and genome.exists()):
            offtargets = {"scanned": False,
                          "note": "hg38 fasta not found; set PEN_HG38 or pass fasta="}
        else:
            chroms = chroms or [f"chr{i}" for i in range(1, 23)] + ["chrX"]
            table = scan_offtargets(genome, target_core, chroms)
            if ct is not None:
                table = annotate_with_safety(table, ct)
            # An empty scan has no "n_mm" column, hence the guard.
            exact = int((table["n_mm"] == 0).sum()) if not table.empty else 0
            offtargets = {"scanned": True, "n_candidates": int(len(table)),
                          "n_exact": exact, "table": table}

    return {"brna": brna, "offtargets": offtargets, "qc": qc,
            "disclaimer": "Decision-support only; predicted off-targets require experimental validation."}
def annotate_with_safety(off_df, ct: str):
    """Overlay each off-target with the Phase-1 safety score (is the off-target in a dangerous locus?).

    Left-joins ``load_writability(ct)`` on ``(chrom, bin)`` with ``bin = pos // 1000`` and returns
    the joined frame. The overlay stays best-effort: on any failure the input frame is returned
    unchanged, but a ``RuntimeWarning`` now states why, so a missing ``safety`` column is not
    silent. An empty input is returned as is.
    """
    if off_df.empty:
        return off_df
    try:
        from pen_stack.atlas.crosslink import load_writability
        wdf = load_writability(ct)[["chrom", "bin", "safety"]]
        out = off_df.copy()
        # presumably the writability table uses 1 kb bins - TODO confirm against load_writability
        out["bin"] = (out["pos"] // 1000).astype(int)
        return out.merge(wdf, on=["chrom", "bin"], how="left")
    except Exception as exc:  # noqa: BLE001 - safety overlay is optional
        import warnings

        warnings.warn(f"safety overlay for ct={ct!r} skipped: {exc}", RuntimeWarning, stacklevel=2)
        return off_df