PyPI - pen-stack - Versions diffs - 3.1.0__py3-none-any.whl - Mend

pen-stack 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

pen_stack/__init__.py +2 -0
pen_stack/_resources.py +34 -0
pen_stack/adapt/__init__.py +14 -0
pen_stack/adapt/finetune.py +33 -0
pen_stack/adapt/ingest.py +86 -0
pen_stack/adapt/pipeline.py +101 -0
pen_stack/adapt/recalibrate.py +58 -0
pen_stack/adapt/report.py +130 -0
pen_stack/agent/__init__.py +1 -0
pen_stack/agent/guardrails.py +49 -0
pen_stack/agent/mcp_server.py +42 -0
pen_stack/agent/orchestrator.py +106 -0
pen_stack/agent/pen_agent.py +169 -0
pen_stack/agent/tools.py +130 -0
pen_stack/atlas/__init__.py +1 -0
pen_stack/atlas/build_wtkb.py +80 -0
pen_stack/atlas/crosslink.py +144 -0
pen_stack/atlas/expand.py +190 -0
pen_stack/atlas/schema.py +59 -0
pen_stack/atlas/scorecard.py +134 -0
pen_stack/atlas/universe.py +75 -0
pen_stack/atlas/variant_propose.py +155 -0
pen_stack/bridge/__init__.py +1 -0
pen_stack/bridge/activity.py +52 -0
pen_stack/bridge/cli.py +65 -0
pen_stack/bridge/fold_qc.py +53 -0
pen_stack/bridge/guide_qc.py +84 -0
pen_stack/bridge/ingest.py +139 -0
pen_stack/bridge/offtarget.py +133 -0
pen_stack/bridge/ortholog_screen.py +73 -0
pen_stack/bridge/pipeline.py +83 -0
pen_stack/cli.py +126 -0
pen_stack/data/__init__.py +1 -0
pen_stack/data/encode.py +84 -0
pen_stack/data/genome.py +71 -0
pen_stack/data/ingest_chromatin.py +119 -0
pen_stack/data/ingest_integration.py +112 -0
pen_stack/data/ingest_safety_annot.py +164 -0
pen_stack/data/ingest_trip.py +76 -0
pen_stack/mech/__init__.py +1 -0
pen_stack/mech/classify_atlas.py +71 -0
pen_stack/mech/whitelist.py +66 -0
pen_stack/monitor/__init__.py +1 -0
pen_stack/monitor/europepmc.py +32 -0
pen_stack/monitor/run.py +57 -0
pen_stack/monitor/triage.py +63 -0
pen_stack/planner/__init__.py +1 -0
pen_stack/planner/cargo.py +56 -0
pen_stack/planner/cargo_polish.py +146 -0
pen_stack/planner/delivery.py +32 -0
pen_stack/planner/multiplex.py +110 -0
pen_stack/planner/optimize.py +156 -0
pen_stack/planner/pipeline.py +86 -0
pen_stack/planner/report.py +26 -0
pen_stack/rag/__init__.py +1 -0
pen_stack/rag/index.py +53 -0
pen_stack/rag/llm.py +178 -0
pen_stack/rag/qa.py +105 -0
pen_stack/score/__init__.py +1 -0
pen_stack/score/recalibrate.py +77 -0
pen_stack/score/therapeutic.py +85 -0
pen_stack/server/__init__.py +1 -0
pen_stack/server/api.py +142 -0
pen_stack/ui/__init__.py +1 -0
pen_stack/ui/app.py +518 -0
pen_stack/validate/__init__.py +1 -0
pen_stack/validate/adapt_demo.py +69 -0
pen_stack/validate/agent_eval.py +117 -0
pen_stack/validate/blind_gsh_discovery.py +165 -0
pen_stack/validate/cargo_directionality.py +57 -0
pen_stack/validate/durability_baselines.py +150 -0
pen_stack/validate/forward_hypotheses.py +104 -0
pen_stack/validate/guide_qc_demo.py +58 -0
pen_stack/validate/intent_specification.py +82 -0
pen_stack/validate/paper3_benchmark.py +165 -0
pen_stack/validate/paper4_real_validation.py +144 -0
pen_stack/validate/paper4_validation.py +82 -0
pen_stack/validate/seq_vs_measured.py +134 -0
pen_stack/validate/within_locus_ranking.py +74 -0
pen_stack/validate/writer_recovery.py +86 -0
pen_stack/wgenome/__init__.py +1 -0
pen_stack/wgenome/chromatin_seq.py +83 -0
pen_stack/wgenome/durability.py +108 -0
pen_stack/wgenome/export_tracks.py +52 -0
pen_stack/wgenome/features.py +82 -0
pen_stack/wgenome/gsh_baseline.py +117 -0
pen_stack/wgenome/providers.py +245 -0
pen_stack/wgenome/safety.py +69 -0
pen_stack/wgenome/structure3d.py +168 -0
pen_stack/wgenome/writability.py +72 -0
pen_stack-3.1.0.dist-info/METADATA +451 -0
pen_stack-3.1.0.dist-info/RECORD +96 -0
pen_stack-3.1.0.dist-info/WHEEL +5 -0
pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
pen_stack-3.1.0.dist-info/top_level.txt +1 -0

pen_stack/planner/multiplex.py ADDED Viewed

@@ -0,0 +1,110 @@
+"""Multiplex translocation-risk flag (v3.1, WS-G1).
+For a multi-edit plan (2-5 edits), two simultaneous double-strand breaks (DSBs) at different loci can
+mis-join into a TRANSLOCATION. This is a classical, interpretable SCREEN - not a calibrated translocation
+predictor. We gather every edit's DSB sites (on-target + predicted off-targets, each with a cut probability),
+enumerate all site PAIRS exactly (cheap for 2-5 edits), and combine pairwise DSB-join probabilities into a
+`translocation_risk` in [0,1].
+Key honest property: **DSB-free writers (bridge / seek recombinases) contribute NO cut sites**, so a plan
+built from them carries ~zero translocation risk - which is the whole point of programmable recombinases.
+The flag is monotonic (more sites / higher cut prob / closer pairs -> higher risk) and reports its top pairs
+so a user can see WHY. A QUBO formulation is provided as a documented OPTIONAL baseline, off by default.
+"""
+from __future__ import annotations
+import math
+from itertools import combinations
+# Writer families that cut DNA (DSB) vs DSB-free programmable recombinases / writers.
+# is_dsb_free() lower-cases the family name before testing membership, so entries here are lower-case.
+_DSB_FREE = {"bridge_is110", "bridge_iscro4", "seek_is1111", "bridge", "seek", "pe_integrase",
+             "prime_editor", "recombinase"}
+_DEFAULT_ON_TARGET_CUT = 0.8        # nominal on-target cut efficiency for a DSB nuclease (documented prior)
+_INTRA_CHROM_LENGTH = 1.0e7         # bp decay length for intra-chromosomal join propensity (10 Mb)
+def is_dsb_free(family: str | None) -> bool:
+    return str(family or "").lower() in _DSB_FREE
+def cut_sites(edit: dict) -> list[dict]:
+    """DSB sites for one edit. DSB-free writers -> []. Otherwise on-target (+ off-targets if provided).
+    `edit` keys: family, chrom, pos (on-target); optional on_target_cut; optional offtargets=[{chrom,pos,
+    p_cut|risk}]. Off-target risk in [0,1] is used directly as a cut probability.
+    """
+    if is_dsb_free(edit.get("family")):
+        return []
+    sites = []
+    if edit.get("chrom") is not None and edit.get("pos") is not None:
+        sites.append({"chrom": edit["chrom"], "pos": int(edit["pos"]),
+                      "p_cut": float(edit.get("on_target_cut", _DEFAULT_ON_TARGET_CUT)),
+                      "kind": "on_target", "edit": edit.get("name")})
+    for ot in edit.get("offtargets", []) or []:
+        p = float(ot.get("p_cut", ot.get("risk", 0.0)))
+        if p > 0 and ot.get("chrom") is not None and ot.get("pos") is not None:
+            sites.append({"chrom": ot["chrom"], "pos": int(ot["pos"]), "p_cut": min(1.0, p),
+                          "kind": "off_target", "edit": edit.get("name")})
+    return sites
+def _join_factor(a: dict, b: dict) -> float:
+    """Propensity that two DSBs mis-join: 1.0 inter-chromosomal; distance-decayed intra-chromosomal."""
+    if a["chrom"] != b["chrom"]:
+        return 1.0
+    d = abs(a["pos"] - b["pos"])
+    return math.exp(-d / _INTRA_CHROM_LENGTH)
+def pairwise_risks(sites: list[dict]) -> list[dict]:
+    """Exact pairwise DSB-join probabilities for every unordered site pair (across and within edits)."""
+    out = []
+    for i, j in combinations(range(len(sites)), 2):
+        a, b = sites[i], sites[j]
+        jp = a["p_cut"] * b["p_cut"] * _join_factor(a, b)
+        out.append({"a": f"{a['edit']}:{a['kind']}@{a['chrom']}:{a['pos']}",
+                    "b": f"{b['edit']}:{b['kind']}@{b['chrom']}:{b['pos']}",
+                    "inter_chromosomal": a["chrom"] != b["chrom"], "join_prob": round(jp, 5)})
+    return sorted(out, key=lambda r: r["join_prob"], reverse=True)
+def translocation_risk(edits: list[dict], low: float = 0.05, moderate: float = 0.2,
+                       top_k: int = 5) -> dict:
+    """Aggregate translocation-risk flag for a multi-edit plan. risk = 1 - prod(1 - pairwise_join_prob).
+    Monotonic in every pairwise probability; interpretable via the top contributing pairs. A SCREEN, not a
+    calibrated predictor.
+    """
+    if not 2 <= len(edits) <= 5:
+        # still computes, but the flag is meant for multiplex (2-5 simultaneous edits)
+        note = "translocation risk is defined for multiplex plans (2-5 simultaneous edits)"
+    else:
+        note = None
+    sites = [s for e in edits for s in cut_sites(e)]
+    pairs = pairwise_risks(sites)
+    prod = 1.0
+    for p in pairs:
+        prod *= (1.0 - p["join_prob"])
+    risk = round(1.0 - prod, 5)
+    band = "low" if risk < low else ("moderate" if risk < moderate else "high")
+    n_dsb_free = sum(1 for e in edits if is_dsb_free(e.get("family")))
+    return {"translocation_risk": risk, "band": band, "n_edits": len(edits),
+            "n_cut_sites": len(sites), "n_pairs": len(pairs),
+            "n_dsb_free_edits": n_dsb_free,
+            "all_dsb_free": n_dsb_free == len(edits),
+            "top_pairs": pairs[:top_k],
+            "note": note,
+            "scope": "classical pairwise DSB-join SCREEN, not a calibrated translocation predictor; "
+                     "DSB-free recombinase plans carry ~zero risk by construction"}
+def qubo_baseline(edits: list[dict], variants_per_edit: dict[str, list[dict]] | None = None) -> dict:
+    """OPTIONAL, OFF BY DEFAULT - a documented QUBO baseline for selecting per-edit guide variants that
+    minimize total pairwise translocation risk. Returns the QUBO Q-matrix terms only; no solver is invoked
+    and this is NOT the recommended path (the exact pairwise screen above is exact for 2-5 edits). Provided
+    for completeness / external comparison, clearly labeled optional.
+    """
+    return {"enabled": False, "kind": "QUBO (optional baseline)",
+            "note": "exact pairwise enumeration is tractable and exact for 2-5 edits; the QUBO path is an "
+                    "optional baseline for large multiplex selection problems and is off by default.",
+            "n_variant_sets": len(variants_per_edit or {})}

pen_stack/planner/optimize.py ADDED Viewed

@@ -0,0 +1,156 @@
+"""Inverse-design optimiser with edit_intent (Phase 3, Step 3.1).
+Given a goal (gene/locus, edit_intent, cargo, cell type), search destination x writer for the joint
+optimum of safety x durability x reachability x writer-activity, conditioned on an explicit
+``edit_intent``. The intent is *load-bearing*: its ``target_gene_sign`` decides whether hitting the
+named target gene/element is penalised (safe-harbour: avoid) or rewarded (knock-in / excision: intended)
+- so the same locus ranks high or low depending only on the stated goal.
+Components are retained on every candidate row; the score is a transparent linear combination read from
+``configs/intent_weights.yaml``. Reachability is a hard filter (Tier-1 high-confidence; Tier-2 candidate
+flagged). Writer activity comes from the Phase-2 Writer Atlas (measured human-cell axis per family).
+Inputs : Phase-1 writability atlas (safety/p_durable/reachable_tier1) + Phase-2 atlas.parquet.
+Outputs: ranked (writer, site) candidates with full component provenance.
+"""
+from __future__ import annotations
+from enum import Enum
+from functools import lru_cache
+from pathlib import Path
+import pandas as pd
+import yaml
+# Third ancestor directory of this file; the config and atlas paths below are resolved against it.
+_ROOT = Path(__file__).resolve().parents[2]
+_CFG = _ROOT / "configs" / "intent_weights.yaml"        # default for load_intent_weights()
+_ATLAS = _ROOT / "pen_stack" / "atlas" / "atlas.parquet"  # Writer Atlas read by the scoring functions
+BIN_BP = 1000  # divisor plan() uses to turn bp coordinates into values comparable with the `bin` column
+class EditIntent(str, Enum):
+    """Closed set of edit goals accepted by the planner.
+    Each member's value is the key that ``score_candidates`` looks up under ``intents`` in the
+    intent-weights config; plain strings are coerced with ``EditIntent(value)``.
+    """
+    SAFE_HARBOUR = "safe_harbour_insertion"
+    KNOCK_IN_DISRUPT = "knock_in_with_disruption"
+    HIGH_DURABILITY = "high_durability_insertion"
+    REG_EXCISION = "regulatory_excision"
+    REPEAT_EXCISION = "repeat_excision"
+@lru_cache(maxsize=1)
+def load_intent_weights(path: str | Path = _CFG) -> dict:
+    return yaml.safe_load(Path(path).read_text(encoding="utf-8"))
+@lru_cache(maxsize=1)
+def writer_activity_by_family(atlas_path: str | Path = _ATLAS) -> dict:
+    """Per-family writer-activity proxy from the Writer Atlas curated cores (measured human-cell axis).
+    Falls back to readiness when S_HumanCell is missing. Used so the optimiser prefers writers that
+    actually work in human cells (e.g. bridge ISCro4) over weakly-active families.
+    """
+    atlas = pd.read_parquet(atlas_path)
+    core = atlas[atlas["entry_kind"] == "curated_core"] if "entry_kind" in atlas else atlas
+    act = {}
+    for fam, sub in core.groupby("family"):
+        r = sub.iloc[0]
+        a = r.get("S_HumanCell")
+        if a is None or pd.isna(a):
+            a = r.get("readiness", 0.5)
+        act[fam] = float(a) if pd.notna(a) else 0.5
+    return act
+def _best_writer(reachable_tier1: str, cargo_bp: int, atlas_caps: dict, activity: dict) -> tuple[str, float, bool]:
+    """Pick the best reachable writer that fits the cargo: (family, activity, cargo_ok)."""
+    fams = [f for f in str(reachable_tier1).split(";") if f]
+    best, best_act, best_ok = None, -1.0, False
+    for f in fams:
+        cap = atlas_caps.get(f)
+        ok = (cap is None) or (cargo_bp <= cap)
+        a = activity.get(f, 0.4)
+        # prefer cargo-fitting writers; among those, highest activity
+        rank = (1 if ok else 0, a)
+        if rank > (1 if best_ok else 0, best_act):
+            best, best_act, best_ok = f, a, ok
+    return best or (fams[0] if fams else "unknown"), best_act if best else 0.4, best_ok
+def score_candidates(cands: pd.DataFrame, intent: EditIntent | str, cargo_bp: int) -> pd.DataFrame:
+    """Score and rank a candidate DataFrame for one edit intent.
+    Requires the columns safety, p_durable and reachable_tier1; on_target is optional and is treated as
+    all-False when absent. Returns a sorted copy (the input is not modified) with added columns writer
+    (family), writer_activity, cargo_ok, score and intent; the component columns are retained.
+    Raises ValueError if `intent` is not a valid EditIntent value, and KeyError if the weights config has
+    no entry for it.
+    """
+    intent = EditIntent(intent) if not isinstance(intent, EditIntent) else intent
+    cfg = load_intent_weights()
+    w = cfg["intents"][intent.value]
+    mag = float(cfg.get("on_target_magnitude", 1.0))
+    # NOTE(review): the atlas parquet is re-read on every call (writer_activity_by_family is cached).
+    atlas = pd.read_parquet(_ATLAS)
+    # per-family cargo limit = largest non-null capacity recorded for that family
+    caps = (atlas.dropna(subset=["cargo_capacity_bp"]).groupby("family")["cargo_capacity_bp"].max().to_dict())
+    activity = writer_activity_by_family()
+    out = cands.copy()
+    # one (family, activity, cargo_ok) tuple per candidate row
+    picks = out["reachable_tier1"].apply(lambda rt: _best_writer(rt, cargo_bp, caps, activity))
+    out["writer"] = [p[0] for p in picks]
+    out["writer_activity"] = [p[1] for p in picks]
+    out["cargo_ok"] = [p[2] for p in picks]
+    on_target = out.get("on_target", pd.Series(False, index=out.index)).astype(float)
+    base = (w["safety"] * out["safety"].astype(float)
+            + w["durability"] * out["p_durable"].astype(float)
+            + w["activity"] * out["writer_activity"].astype(float))
+    # target_gene_sign: +1 -> penalise on-target (avoid the gene); -1 -> reward on-target (hit the gene)
+    out["score"] = base - w["target_gene_sign"] * mag * on_target
+    # cargo that cannot be delivered by any reachable writer is penalised
+    out.loc[~out["cargo_ok"], "score"] -= 0.5
+    out["intent"] = intent.value
+    # Deterministic ranking: a stable sort with explicit tie-breakers, so tied scores (common when safety
+    # saturates) always resolve identically across runs - the default quicksort is NOT stable.
+    keys = ["score"] + [c for c in ("chrom", "bin", "gene") if c in out.columns]
+    # score descending, tie-breakers ascending
+    asc = [False] + [True] * (len(keys) - 1)
+    return out.sort_values(keys, ascending=asc, kind="stable").reset_index(drop=True)
+def gene_coords_path() -> Path:
+    """Locate gene_coords.parquet: packaged copy first (works in any container), then phase_1."""
+    for p in (_ROOT / "data" / "curated" / "gene_coords.parquet",
+              _ROOT.parent / "phase_1" / "app_data" / "gene_coords.parquet"):
+        if p.exists():
+            return p
+    return _ROOT / "data" / "curated" / "gene_coords.parquet"
+@lru_cache(maxsize=8)
+def _gene_coords(path: str | None = None) -> pd.DataFrame:
+    return pd.read_parquet(Path(path) if path else gene_coords_path())
+def gene_region(gene: str, flank_kb: int = 50) -> tuple[str, int, int] | None:
+    gc = _gene_coords()
+    g = gc[gc["gene"] == gene]
+    if g.empty:
+        return None
+    r = g.iloc[0]
+    return r["chrom"], max(0, int(r["start"]) - flank_kb * 1000), int(r["end"]) + flank_kb * 1000
+def plan(gene: str, intent: EditIntent | str, cargo_bp: int, writable_df: pd.DataFrame,
+         k: int = 10, flank_kb: int = 50) -> pd.DataFrame:
+    """Rank (writer, site) candidates near a gene for the given intent. Components retained."""
+    intent = EditIntent(intent) if not isinstance(intent, EditIntent) else intent
+    reg = gene_region(gene, flank_kb)
+    if reg is None:
+        return pd.DataFrame()
+    chrom, lo, hi = reg
+    sub = writable_df[(writable_df["chrom"] == chrom)
+                      & (writable_df["bin"].between(lo // BIN_BP, hi // BIN_BP))].copy()
+    if sub.empty:
+        return pd.DataFrame()
+    # on_target = bin overlaps the gene body (not just the flank)
+    g = _gene_coords()
+    gr = g[g["gene"] == gene].iloc[0]
+    sub["on_target"] = sub["bin"].between(int(gr["start"]) // BIN_BP, int(gr["end"]) // BIN_BP)
+    scored = score_candidates(sub, intent, cargo_bp)
+    cols = ["chrom", "bin", "writer", "safety", "p_durable", "writer_activity",
+            "on_target", "cargo_ok", "reachable_tier1", "score", "intent"]
+    return scored[[c for c in cols if c in scored.columns]].head(k)

pen_stack/planner/pipeline.py ADDED Viewed

@@ -0,0 +1,86 @@
+"""End-to-end Write Planner (Phase 3, Step 3.4).
+One call - ``plan_write(gene, intent, payload_bp, ct)`` - composes the inverse-design optimiser (3.1),
+cargo/donor design (3.2), and delivery recommendation (3.3) into ranked, fully traceable plans. Every
+numeric field is tagged with the module/dataset that produced it (provenance), so nothing is asserted
+without a source. Heavy data (the Phase-1 writability atlas) is loaded lazily via the cross-link.
+"""
+from __future__ import annotations
+from functools import lru_cache
+from pathlib import Path
+import pandas as pd
+from pen_stack.planner.cargo import design_cargo
+from pen_stack.planner.delivery import recommend_delivery
+from pen_stack.planner.optimize import EditIntent, plan
+# Writer Atlas parquet, resolved relative to this file; read by _writer_meta().
+_ATLAS = Path(__file__).resolve().parents[1] / "atlas" / "atlas.parquet"
+BIN_BP = 1000  # plan_write() multiplies a row's `bin` by this to get the site position
+@lru_cache(maxsize=1)
+def _writer_meta() -> dict:
+    """family -> {length_aa, cargo_capacity_bp, deliv_class, reachability_tier} from the Writer Atlas."""
+    atlas = pd.read_parquet(_ATLAS)
+    core = atlas[atlas["entry_kind"] == "curated_core"] if "entry_kind" in atlas else atlas
+    meta = {}
+    for fam, sub in core.groupby("family"):
+        r = sub.iloc[0]
+        meta[fam] = {
+            "length_aa": (int(r["length_aa"]) if pd.notna(r.get("length_aa")) else None),
+            "cargo_capacity_bp": (int(r["cargo_capacity_bp"]) if pd.notna(r.get("cargo_capacity_bp")) else None),
+            "deliv_class": r.get("deliv_class"),
+            "reachability_tier": r.get("reachability_tier"),
+        }
+    return meta
+def plan_write(gene: str, intent: EditIntent | str, payload_bp: int, ct: str = "k562",
+               k: int = 5, writable_df: pd.DataFrame | None = None) -> list[dict]:
+    """Return ranked, traceable write plans for a goal. Each plan = site + writer + cargo + delivery."""
+    if writable_df is None:
+        from pen_stack.atlas.crosslink import load_writability
+        writable_df = load_writability(ct)
+    cands = plan(gene, intent, payload_bp, writable_df, k=k)
+    meta = _writer_meta()
+    plans = []
+    for _, row in cands.iterrows():
+        fam = row["writer"]
+        wm = meta.get(fam, {})
+        writer_row = {"family": fam, "cargo_capacity_bp": wm.get("cargo_capacity_bp"),
+                      "deliv_class": wm.get("deliv_class")}
+        site = (row["chrom"], int(row["bin"]) * BIN_BP)
+        cargo = design_cargo(payload_bp, writer_row, site, ct)
+        eff_bp = (wm.get("length_aa") or 0) * 3
+        deliv = recommend_delivery(eff_bp, payload_bp, ct)
+        plans.append({
+            "gene": gene, "intent": EditIntent(intent).value if not isinstance(intent, EditIntent) else intent.value,
+            "site": {"chrom": row["chrom"], "bin": int(row["bin"]), "pos": site[1]},
+            "writer": fam,
+            "reachability_tier": wm.get("reachability_tier"),
+            "safety": round(float(row["safety"]), 4),
+            "durability": round(float(row["p_durable"]), 4),
+            "writer_activity": round(float(row["writer_activity"]), 4),
+            "on_target": bool(row["on_target"]),
+            "score": round(float(row["score"]), 4),
+            "cargo": cargo,
+            "delivery": deliv,
+            "provenance": {
+                "safety": "wgenome.safety (LightGBM, COSMIC/DepMap/MLV)",
+                "durability": "wgenome.durability (TRIP conditional chromatin model)",
+                "writer_activity": "atlas.score.therapeutic (measured human-cell axis)",
+                "reachability": "atlas.crosslink (Phase-1 reachable_tier1 + WT-KB tier)",
+                "delivery": "planner.delivery (configs/delivery_rules.yaml)",
+                "offtargets": "planner.cargo (bridge engine = Phase 1.5)",
+            },
+            "disclaimer": "Decision-support only; not a clinical directive. Tier-2/3 reachability is candidate.",
+        })
+    return plans
+if __name__ == "__main__":  # pragma: no cover
+    import json
+    ps = plan_write("TRAC", EditIntent.KNOCK_IN_DISRUPT, 2000, "k562", k=3)
+    print(json.dumps(ps[0], indent=2, default=str)[:1200])

pen_stack/planner/report.py ADDED Viewed

@@ -0,0 +1,26 @@
+"""Human-readable Write Planner report (Phase 3, Step 3.4)."""
+from __future__ import annotations
+def render_plan(p: dict) -> str:
+    s = p["site"]
+    lines = [
+        f"Write plan for {p['gene']}  (intent: {p['intent']})",
+        f"  Site        : {s['chrom']}:{s['pos']:,}  (bin {s['bin']}, on_target={p['on_target']})",
+        f"  Writer      : {p['writer']}  [reachability {p['reachability_tier']}]",
+        f"  Scores      : safety {p['safety']} | durability {p['durability']} | "
+        f"writer-activity {p['writer_activity']} | score {p['score']}",
+        f"  Cargo       : payload {p['cargo']['payload_bp']} bp -> assembled {p['cargo']['assembled_bp']} bp "
+        f"(size_ok={p['cargo']['size_ok']}, codon-optimised, insulated)",
+        f"  Delivery    : {p['delivery']['delivery']}  ({p['delivery']['rationale']})",
+    ]
+    if "offtargets" in p["cargo"]:
+        lines.append(f"  Off-target  : {p['cargo']['offtargets'].get('status', p['cargo']['offtargets'])}")
+    lines.append(f"  Note        : {p['disclaimer']}")
+    return "\n".join(lines)
+def render_plans(plans: list[dict]) -> str:
+    if not plans:
+        return "No plan found (gene not in the atlas, or no reachable site)."
+    return f"\n{'='*72}\n".join(f"[rank {i+1}]\n{render_plan(p)}" for i, p in enumerate(plans))

pen_stack/rag/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+"""pen_stack.rag - see PEN-STACK v3.0 program doc."""

pen_stack/rag/index.py ADDED Viewed

@@ -0,0 +1,53 @@
+"""Grounded document index for the PEN-STACK RAG (Phase 2, Step 2.8).
+Builds a cited corpus of fact cards from the curated atlas + WT-KB (each card carries its source DOIs),
+so retrieval-grounded answers always have a citation. If PaperQA + an LLM are available they can index a
+literature corpus on top; the keyword retriever here is the dependency-light default that guarantees the
+"every factual claim is cited" contract without any model.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from pathlib import Path
+import pandas as pd
+# Default Writer Atlas parquet for build_cards(), resolved relative to this file.
+_ATLAS = Path(__file__).resolve().parents[1] / "atlas" / "atlas.parquet"
+@dataclass
+class Card:
+    """A retrievable fact card: an identifying key, its text, and the citations backing that text."""
+    key: str  # card identifier; build_cards() uses the writer family
+    text: str  # sentence(s) that retrieve() tokenises and matches against the question
+    citations: list[str] = field(default_factory=list)  # source DOIs; build_cards() stores them sorted, de-duplicated
+def build_cards(atlas_parquet: str | Path = _ATLAS) -> list[Card]:
+    """One fact card per writer family, summarising its measured targeting + readiness, with DOIs."""
+    df = pd.read_parquet(atlas_parquet)
+    cards: list[Card] = []
+    for fam, sub in df.groupby("family"):
+        core = sub[sub["entry_kind"].isin(["curated_core", "curated_rep"])]
+        rep = core.iloc[0] if len(core) else sub.iloc[0]
+        dois: list[str] = []
+        for d in core["key_dois"] if len(core) else sub["key_dois"]:
+            dois.extend(str(x) for x in list(d) if str(x).strip())
+        text = (f"Writer family {fam}: representative {rep['representative_system']}; "
+                f"mechanism {rep.get('mechanism_bucket')}; targeting {rep.get('targeting_modality')}; "
+                f"reachability {rep.get('reachability_tier')}; deliverability {rep.get('deliv_class')}; "
+                f"cargo {rep.get('cargo_capacity_bp')} bp; human-cell activity: "
+                f"{rep.get('human_cell_activity')}. {len(sub):,} systems catalogued.")
+        cards.append(Card(key=fam, text=text, citations=sorted(set(dois))))
+    return cards
+def retrieve(question: str, cards: list[Card], k: int = 3) -> list[Card]:
+    """Keyword overlap retriever (lower-cased token Jaccard). Deterministic, no model needed."""
+    q = set(_tok(question))
+    scored = [(len(q & set(_tok(c.text + " " + c.key))), c) for c in cards]
+    scored.sort(key=lambda x: x[0], reverse=True)
+    return [c for n, c in scored if n > 0][:k]
+def _tok(s: str) -> list[str]:
+    return [w for w in "".join(ch.lower() if ch.isalnum() else " " for ch in s).split() if len(w) > 2]

pen_stack/rag/llm.py ADDED Viewed

@@ -0,0 +1,178 @@
+"""Provider-agnostic LLM layer for PEN-STACK services (RAG, agent, PEN-MONITOR).
+Hybrid backend: a strong hosted model for reasoning/agent/Q&A (default NVIDIA Nemotron, OpenAI-compatible)
+with automatic fallback to a local, free, private model (Ollama). The single switch is `configs/llm.yaml`.
+This is strictly an orchestration/phrasing layer. Every quantitative claim and every citation still comes
+from the deterministic validated-tool path; the LLM never introduces a number, gene, or citation. The
+choice of model therefore does not affect scientific reproducibility - only the quality of orchestration
+and prose. If no provider is reachable, the callers fall back to the deterministic answer (LLM optional).
+Secrets: the API key is read from the env var named in `api_key_env`, then from the gitignored
+`api_key_file`. Keys are NEVER committed.
+"""
+from __future__ import annotations
+import json
+import os
+import urllib.request
+from pathlib import Path
+import yaml
+# Default config file for load_llm_config(): configs/llm.yaml under the third ancestor directory of this file.
+_CFG = Path(__file__).resolve().parents[2] / "configs" / "llm.yaml"
+# Base directory against which _resolve_key() resolves a relative `api_key_file`.
+_ROOT = Path(__file__).resolve().parents[2]
+# System prompt sent by phrase(): restricts the model to rewording the supplied facts.
+_SYSTEM = ("You rephrase already-verified genome-writing facts into one clear paragraph for a wet-lab "
+           "scientist. Use ONLY the facts provided. Do NOT invent or alter any number, gene, or citation. "
+           "Do not give clinical advice. Keep it under 90 words.")
+def load_llm_config(path: str | Path = _CFG) -> dict:
+    return yaml.safe_load(Path(path).read_text(encoding="utf-8"))
+def _provider_cfg(cfg: dict, name: str) -> dict | None:
+    return (cfg.get("providers") or {}).get(name)
+def _resolve_key(pcfg: dict) -> str | None:
+    env = pcfg.get("api_key_env")
+    if env and os.environ.get(env):
+        return os.environ[env].strip()
+    f = pcfg.get("api_key_file")
+    if f:
+        p = Path(f)
+        if not p.is_absolute():
+            p = _ROOT / f
+        if p.exists():
+            return p.read_text(encoding="utf-8").strip()
+    return pcfg.get("api_key")
+def _norm_tool_calls(raw: list | None) -> list:
+    out = []
+    for c in raw or []:
+        fn = c.get("function", {})
+        args = fn.get("arguments", {})
+        if isinstance(args, str):
+            try:
+                args = json.loads(args or "{}")
+            except json.JSONDecodeError:
+                args = {}
+        out.append({"function": {"name": fn.get("name"), "arguments": args}})
+    return out
+def _chat_openai(pcfg: dict, messages: list, tools: list | None, temperature: float,
+                 timeout: int) -> dict | None:
+    """OpenAI-compatible /v1/chat/completions (NVIDIA NIM, OpenAI, vLLM, Ollama /v1)."""
+    base = pcfg["api_base"].rstrip("/")
+    key = _resolve_key(pcfg)
+    payload = {"model": pcfg["model"], "messages": messages, "temperature": temperature,
+               "max_tokens": int(pcfg.get("max_tokens", 1024))}
+    if tools:
+        payload["tools"] = tools
+        payload["tool_choice"] = "auto"
+    headers = {"Content-Type": "application/json"}
+    if key:
+        headers["Authorization"] = f"Bearer {key}"
+    req = urllib.request.Request(f"{base}/chat/completions", data=json.dumps(payload).encode(), headers=headers)
+    with urllib.request.urlopen(req, timeout=timeout) as r:
+        d = json.load(r)
+    msg = d["choices"][0]["message"]
+    return {"content": (msg.get("content") or "").strip(), "tool_calls": _norm_tool_calls(msg.get("tool_calls")),
+            "raw": msg, "style": "openai"}
+def _chat_ollama(pcfg: dict, messages: list, tools: list | None, temperature: float,
+                 timeout: int) -> dict | None:
+    """Ollama native /api/chat."""
+    base = pcfg["api_base"].rstrip("/")
+    payload = {"model": str(pcfg["model"]).split("/")[-1], "messages": messages, "stream": False,
+               "options": {"temperature": temperature}}
+    if tools:
+        payload["tools"] = tools
+    req = urllib.request.Request(f"{base}/api/chat", data=json.dumps(payload).encode(),
+                                 headers={"Content-Type": "application/json"})
+    with urllib.request.urlopen(req, timeout=timeout) as r:
+        d = json.load(r)
+    msg = d.get("message", {})
+    return {"content": (msg.get("content") or "").strip(), "tool_calls": _norm_tool_calls(msg.get("tool_calls")),
+            "raw": msg, "style": "ollama"}
+def _call_provider(name: str, cfg: dict, messages: list, tools: list | None, timeout: int) -> dict | None:
+    pcfg = _provider_cfg(cfg, name)
+    if not pcfg:
+        return None
+    temp = float(cfg.get("temperature", 0.1))
+    style = pcfg.get("style", "openai")
+    try:
+        if style == "ollama":
+            return _chat_ollama(pcfg, messages, tools, temp, timeout)
+        return _chat_openai(pcfg, messages, tools, temp, timeout)
+    except Exception:  # noqa: BLE001 - any provider failure -> let the caller try the fallback
+        return None
+# Cooldown cache: once a provider fails (e.g. Ollama not installed on the laptop tier), skip it for
+# `health_ttl` seconds instead of re-attempting it on every call. This is what prevents the multi-minute
+# stalls when a configured provider is absent/slow - we pay one failed attempt, then bypass it.
+# Maps provider name -> time.time() value until which chat() skips that provider.
+_COOLDOWN: dict[str, float] = {}
+def chat(messages: list, tools: list | None = None, cfg: dict | None = None,
+         timeout: int | None = None) -> dict | None:
+    """Provider-agnostic chat. Tries the active provider, then the configured fallback, skipping any
+    provider in cooldown (recently unreachable). Returns {content, tool_calls, provider} or None if every
+    provider fails (callers then degrade deterministically - the LLM is non-load-bearing)."""
+    import time
+    cfg = cfg or load_llm_config()
+    timeout = timeout if timeout is not None else int(cfg.get("call_timeout", 60))
+    ttl = float(cfg.get("health_ttl", 120))
+    order = [cfg.get("provider", "nvidia")]
+    fb = cfg.get("fallback")
+    if fb and fb not in order:
+        order.append(fb)
+    now = time.time()
+    tried_any = False
+    for name in order:
+        if _COOLDOWN.get(name, 0) > now:        # provider recently failed -> skip without waiting
+            continue
+        tried_any = True
+        res = _call_provider(name, cfg, messages, tools, timeout)
+        if res is not None:
+            res["provider"] = name
+            _COOLDOWN.pop(name, None)
+            return res
+        _COOLDOWN[name] = now + ttl              # mark unreachable; don't retry for ttl seconds
+    if not tried_any:                            # every provider in cooldown -> one cheap retry of the first
+        name = order[0]
+        res = _call_provider(name, cfg, messages, tools, min(timeout, int(cfg.get("health_timeout", 8))))
+        if res is not None:
+            res["provider"] = name
+            _COOLDOWN.pop(name, None)
+            return res
+    return None
+def active_provider(cfg: dict | None = None, timeout: int | None = None) -> str | None:
+    """Name of the first reachable provider (active, then fallback), or None. Uses the config `health_timeout`
+    by default so an absent provider is detected quickly (and then cooled down by chat())."""
+    cfg = cfg or load_llm_config()
+    timeout = timeout if timeout is not None else int(cfg.get("health_timeout", 8))
+    r = chat([{"role": "user", "content": "ok"}], cfg=cfg, timeout=timeout)
+    return r.get("provider") if r else None
+def available(cfg: dict | None = None, timeout: int = 30) -> bool:
+    return active_provider(cfg, timeout) is not None
+def phrase(facts: str, cfg: dict | None = None, timeout: int = 120) -> str | None:
+    """Rephrase grounded facts. Returns None on any failure (caller keeps the deterministic answer)."""
+    msgs = [{"role": "system", "content": _SYSTEM},
+            {"role": "user", "content": f"Facts:\n{facts}\n\nRephrase as one paragraph."}]
+    r = chat(msgs, cfg=cfg, timeout=timeout)
+    return (r.get("content") or None) if r else None