PyPI - pen-stack - Versions diffs - 3.1.0__py3-none-any.whl - Mend

pen-stack 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

pen_stack/__init__.py +2 -0
pen_stack/_resources.py +34 -0
pen_stack/adapt/__init__.py +14 -0
pen_stack/adapt/finetune.py +33 -0
pen_stack/adapt/ingest.py +86 -0
pen_stack/adapt/pipeline.py +101 -0
pen_stack/adapt/recalibrate.py +58 -0
pen_stack/adapt/report.py +130 -0
pen_stack/agent/__init__.py +1 -0
pen_stack/agent/guardrails.py +49 -0
pen_stack/agent/mcp_server.py +42 -0
pen_stack/agent/orchestrator.py +106 -0
pen_stack/agent/pen_agent.py +169 -0
pen_stack/agent/tools.py +130 -0
pen_stack/atlas/__init__.py +1 -0
pen_stack/atlas/build_wtkb.py +80 -0
pen_stack/atlas/crosslink.py +144 -0
pen_stack/atlas/expand.py +190 -0
pen_stack/atlas/schema.py +59 -0
pen_stack/atlas/scorecard.py +134 -0
pen_stack/atlas/universe.py +75 -0
pen_stack/atlas/variant_propose.py +155 -0
pen_stack/bridge/__init__.py +1 -0
pen_stack/bridge/activity.py +52 -0
pen_stack/bridge/cli.py +65 -0
pen_stack/bridge/fold_qc.py +53 -0
pen_stack/bridge/guide_qc.py +84 -0
pen_stack/bridge/ingest.py +139 -0
pen_stack/bridge/offtarget.py +133 -0
pen_stack/bridge/ortholog_screen.py +73 -0
pen_stack/bridge/pipeline.py +83 -0
pen_stack/cli.py +126 -0
pen_stack/data/__init__.py +1 -0
pen_stack/data/encode.py +84 -0
pen_stack/data/genome.py +71 -0
pen_stack/data/ingest_chromatin.py +119 -0
pen_stack/data/ingest_integration.py +112 -0
pen_stack/data/ingest_safety_annot.py +164 -0
pen_stack/data/ingest_trip.py +76 -0
pen_stack/mech/__init__.py +1 -0
pen_stack/mech/classify_atlas.py +71 -0
pen_stack/mech/whitelist.py +66 -0
pen_stack/monitor/__init__.py +1 -0
pen_stack/monitor/europepmc.py +32 -0
pen_stack/monitor/run.py +57 -0
pen_stack/monitor/triage.py +63 -0
pen_stack/planner/__init__.py +1 -0
pen_stack/planner/cargo.py +56 -0
pen_stack/planner/cargo_polish.py +146 -0
pen_stack/planner/delivery.py +32 -0
pen_stack/planner/multiplex.py +110 -0
pen_stack/planner/optimize.py +156 -0
pen_stack/planner/pipeline.py +86 -0
pen_stack/planner/report.py +26 -0
pen_stack/rag/__init__.py +1 -0
pen_stack/rag/index.py +53 -0
pen_stack/rag/llm.py +178 -0
pen_stack/rag/qa.py +105 -0
pen_stack/score/__init__.py +1 -0
pen_stack/score/recalibrate.py +77 -0
pen_stack/score/therapeutic.py +85 -0
pen_stack/server/__init__.py +1 -0
pen_stack/server/api.py +142 -0
pen_stack/ui/__init__.py +1 -0
pen_stack/ui/app.py +518 -0
pen_stack/validate/__init__.py +1 -0
pen_stack/validate/adapt_demo.py +69 -0
pen_stack/validate/agent_eval.py +117 -0
pen_stack/validate/blind_gsh_discovery.py +165 -0
pen_stack/validate/cargo_directionality.py +57 -0
pen_stack/validate/durability_baselines.py +150 -0
pen_stack/validate/forward_hypotheses.py +104 -0
pen_stack/validate/guide_qc_demo.py +58 -0
pen_stack/validate/intent_specification.py +82 -0
pen_stack/validate/paper3_benchmark.py +165 -0
pen_stack/validate/paper4_real_validation.py +144 -0
pen_stack/validate/paper4_validation.py +82 -0
pen_stack/validate/seq_vs_measured.py +134 -0
pen_stack/validate/within_locus_ranking.py +74 -0
pen_stack/validate/writer_recovery.py +86 -0
pen_stack/wgenome/__init__.py +1 -0
pen_stack/wgenome/chromatin_seq.py +83 -0
pen_stack/wgenome/durability.py +108 -0
pen_stack/wgenome/export_tracks.py +52 -0
pen_stack/wgenome/features.py +82 -0
pen_stack/wgenome/gsh_baseline.py +117 -0
pen_stack/wgenome/providers.py +245 -0
pen_stack/wgenome/safety.py +69 -0
pen_stack/wgenome/structure3d.py +168 -0
pen_stack/wgenome/writability.py +72 -0
pen_stack-3.1.0.dist-info/METADATA +451 -0
pen_stack-3.1.0.dist-info/RECORD +96 -0
pen_stack-3.1.0.dist-info/WHEEL +5 -0
pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
pen_stack-3.1.0.dist-info/top_level.txt +1 -0

pen_stack/agent/orchestrator.py ADDED Viewed

@@ -0,0 +1,106 @@
+"""The PEN-STACK agent (Phase 3, Step 3.9) - tool-use orchestration.
+Given a natural-language goal ("durably express factor IX in hepatocytes"), the agent plans the whole
+write by calling validated tools (writability -> reachable writers -> writer axes -> plan_write -> cited
+literature) in a tool-calling loop driven by the configured LLM (hybrid: NVIDIA Nemotron with Ollama
+fallback, via ``pen_stack.rag.llm.chat``). Guardrails: it obtains numbers ONLY from tool calls (no
+free-text predictions), refuses clinical-directive prompts, and logs an auditable trace.
+Graceful: if no LLM provider is reachable, ``run_agent`` returns a refusal-free deterministic fallback
+that calls plan_write directly - so the platform degrades to the validated pipeline rather than failing.
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+from pen_stack.agent.guardrails import DISCLAIMER, out_of_scope
+from pen_stack.agent.tools import SCHEMAS, dispatch
+from pen_stack.rag.llm import chat as llm_chat
+# Directory save_trace() writes into: "out/agent_traces" under parents[2] of this file's resolved path.
+_TRACES = Path(__file__).resolve().parents[2] / "out" / "agent_traces"
+# System prompt placed as the first message of every run_agent() conversation.
+_SYSTEM = (
+    "You are the PEN-STACK genome-writing planning agent. You MUST obtain every fact and number by "
+    "calling the provided tools - never invent a number, gene, score, or citation. Plan a write by "
+    "calling: writability, reachable_writers, writer_axes, plan_write, ask_literature. When you have "
+    "enough tool results, write a short plan that cites which tool produced each number. Decision-support "
+    "only - never give clinical directives.")
+def _tool_response(style: str, call_id: str | None, content: str) -> dict:
+    """Format a tool-result message for the provider's API style."""
+    m = {"role": "tool", "content": content}
+    if style == "openai" and call_id is not None:
+        m["tool_call_id"] = call_id
+    return m
+def run_agent(goal: str, max_steps: int = 12, cfg: dict | None = None) -> dict:
+    """Turn a goal into a cited, auditable plan. Numbers come only from tool calls."""
+    refusal = out_of_scope(goal)
+    if refusal:
+        return {"refused": True, "plan": refusal, "trace": [], "disclaimer": DISCLAIMER}
+    from pen_stack.rag.llm import load_llm_config
+    step_timeout = int((cfg or load_llm_config()).get("agent_call_timeout", 60))
+    msgs = [{"role": "system", "content": _SYSTEM}, {"role": "user", "content": goal}]
+    trace: list[dict] = []
+    seen: set = set()
+    for _ in range(max_steps):
+        resp = llm_chat(msgs, tools=SCHEMAS, cfg=cfg, timeout=step_timeout)
+        if resp is None:
+            return _fallback(goal, trace)
+        provider, style = resp.get("provider"), resp.get("style", "openai")
+        calls = resp.get("tool_calls") or []
+        if not calls:
+            return {"refused": False, "plan": resp.get("content", "").strip(),
+                    "trace": trace, "disclaimer": DISCLAIMER, "llm": True, "provider": provider}
+        msgs.append(resp["raw"])                          # append the assistant turn verbatim
+        raw_calls = resp["raw"].get("tool_calls") or []
+        for i, c in enumerate(calls):
+            name = c["function"]["name"]
+            args = c["function"]["arguments"]
+            call_id = (raw_calls[i].get("id") if i < len(raw_calls) else None)
+            key = f"{name}:{json.dumps(args, sort_keys=True, default=str)}"
+            if key in seen:
+                msgs.append(_tool_response(style, call_id, json.dumps(
+                    {"note": "already called with these args; use prior result and finalise the plan"})))
+                continue
+            seen.add(key)
+            try:
+                result = dispatch(name, args)            # VALIDATED tool only
+            except Exception as e:  # noqa: BLE001
+                result = {"error": str(e)}
+            trace.append({"tool": name, "args": args, "result": result})
+            msgs.append(_tool_response(style, call_id, json.dumps(result, default=str)))
+    return {"refused": False, "plan": "(max steps reached)", "trace": trace,
+            "disclaimer": DISCLAIMER, "llm": True}
+def _fallback(goal: str, trace: list[dict]) -> dict:
+    """Deterministic fallback when no LLM is reachable: call plan_write on a best-effort parse."""
+    from pen_stack.planner.optimize import EditIntent
+    gene = next((w for w in goal.replace(",", " ").split() if w.isupper() and len(w) >= 2), None)
+    intent = EditIntent.SAFE_HARBOUR.value
+    for kw, it in [("disrupt", "knock_in_with_disruption"), ("knock", "knock_in_with_disruption"),
+                   ("durab", "high_durability_insertion"), ("enhancer", "regulatory_excision"),
+                   ("repeat", "repeat_excision")]:
+        if kw in goal.lower():
+            intent = it
+            break
+    if not gene:
+        return {"refused": False, "plan": "No target gene detected; LLM unavailable.",
+                "trace": trace, "disclaimer": DISCLAIMER, "llm": False}
+    res = dispatch("plan_write", {"gene": gene, "intent": intent})
+    trace.append({"tool": "plan_write", "args": {"gene": gene, "intent": intent}, "result": res})
+    return {"refused": False, "plan": f"[deterministic fallback] plan for {gene} ({intent})",
+            "trace": trace, "disclaimer": DISCLAIMER, "llm": False}
+def save_trace(result: dict, name: str) -> Path:
+    _TRACES.mkdir(parents=True, exist_ok=True)
+    p = _TRACES / f"{name}.json"
+    p.write_text(json.dumps(result, indent=2, default=str), encoding="utf-8")
+    return p

pen_stack/agent/pen_agent.py ADDED Viewed

@@ -0,0 +1,169 @@
+"""PEN-Agent - grounded write-planning state machine (v3.1, WS-E2).
+A deterministic task-state machine over the VALIDATED tools. It sequences a genome-write plan:
+    goal intake -> site selection (writability) -> writer selection (reachability) ->
+    cargo design (+ Cargo Polish) -> off-target -> 3D structural risk -> report
+Core property (the contribution): NO FABRICATION. Every number in the output is copied verbatim from a
+tool-result dict and tagged with that tool's provenance; a step whose tool cannot ground a value is marked
+`degraded`/`refused`, never invented. The agent therefore runs end-to-end even when AlphaGenome or the
+bridge engine is unavailable - those steps degrade with a reason instead of guessing.
+The LLM (agent/orchestrator.py) is an optional conversational front-end over this same machine; the plan
+itself is deterministic, so the result is reproducible and the no-fabrication guarantee holds with or
+without an LLM. Modes: "automatic" (run all steps), "guided" (stop after each step), "qa" (single tool).
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from pen_stack.agent import tools as T
+@dataclass
+class Step:
+    """One stage of the write-planning state machine together with its outcome."""
+    name: str                    # stage identifier, e.g. "site_selection"
+    tool: str | None             # tag of the tool backing this stage; None when no tool applies
+    status: str                  # ok | degraded | refused
+    provenance: str | None = None  # tool tag the result values were copied from (set on 'ok' steps)
+    result: dict = field(default_factory=dict)  # values taken from the tool's result
+    reason: str | None = None    # explanation attached to degraded/refused steps
+def _site_selection(gene: str, ct: str) -> Step:
+    try:
+        r = T.writability(gene, ct)
+    except Exception as e:  # noqa: BLE001 - missing Phase-1 atlas -> refuse, never fabricate
+        return Step("site_selection", "wgenome.writability", "refused", reason=f"{type(e).__name__}: {e}")
+    if not r.get("found"):
+        return Step("site_selection", r.get("tool"), "refused", reason=f"no writable locus for {gene}")
+    return Step("site_selection", r["tool"], "ok", provenance=r["tool"],
+                result={"max_writability": r["max_writability"], "safety": r["safety"],
+                        "p_durable": r["p_durable"], "n_bins": r["n_bins"]})
+def _writer_selection(gene: str, intent: str, cargo_bp: int, ct: str) -> tuple[Step, dict]:
+    try:
+        plan = T.plan_write(gene, intent, cargo_bp, ct)
+    except Exception as e:  # noqa: BLE001 - missing atlas -> refuse, never fabricate
+        return Step("writer_selection", "planner.pipeline", "refused",
+                    reason=f"{type(e).__name__}: {e}"), {}
+    if not plan or plan.get("found") is False:
+        return Step("writer_selection", "planner.pipeline", "refused",
+                    reason="planner returned no plan"), {}
+    fam = plan.get("writer") or plan.get("writer_family") or plan.get("family")
+    return (Step("writer_selection", "planner.pipeline", "ok", provenance="planner.pipeline",
+                 result={"writer_family": fam, "score": plan.get("score"),
+                         "site": {k: plan.get(k) for k in ("chrom", "bin") if k in plan}}),
+            plan)
+def _cargo_design(plan: dict, cargo_bp: int, ct: str, payload_seq: str | None) -> Step:
+    from pen_stack.planner.cargo import design_cargo
+    fam = plan.get("writer") or plan.get("writer_family") or plan.get("family")
+    wr = {"family": fam, "cargo_capacity_bp": plan.get("cargo_capacity_bp"),
+          "deliv_class": plan.get("deliv_class")}
+    site = (plan.get("chrom"), plan.get("bin"))
+    cargo = design_cargo(cargo_bp, wr, site, ct, payload_seq=payload_seq)
+    res = {"assembled_bp": cargo["assembled_bp"], "size_ok": cargo["size_ok"]}
+    if "cargo_polish" in cargo:
+        res["cargo_durability_risk"] = cargo["cargo_polish"]["cargo_durability_risk"]
+        res["cargo_band"] = cargo["cargo_polish"]["band"]
+        res["cargo_suggestions"] = [f["suggestion"] for f in cargo["cargo_polish"]["flags"]]
+    return Step("cargo_design", "planner.cargo", "ok", provenance="planner.cargo+cargo_polish", result=res)
+def _offtarget(plan: dict) -> Step:
+    fam = plan.get("writer") or plan.get("writer_family") or plan.get("family") or ""
+    if "bridge" not in str(fam).lower() and "seek" not in str(fam).lower():
+        return Step("offtarget", None, "degraded", reason=f"off-target engine applies to bridge/seek writers, not {fam}")
+    try:
+        from pen_stack.bridge.offtarget import predict_offtargets
+        r = predict_offtargets(fam, (plan.get("chrom"), plan.get("bin")))
+    except Exception as e:  # noqa: BLE001
+        return Step("offtarget", "bridge.offtarget", "degraded", reason=f"{type(e).__name__}: {e}")
+    if isinstance(r, dict) and r.get("status", "").startswith("pending"):
+        return Step("offtarget", "bridge.offtarget", "degraded", reason=r.get("note"))
+    return Step("offtarget", "bridge.offtarget", "ok", provenance="bridge.offtarget",
+                result=r if isinstance(r, dict) else {"offtargets": r})
+def _structural_risk(plan: dict) -> Step:
+    chrom, b = plan.get("chrom"), plan.get("bin")
+    if chrom is None or b is None:
+        return Step("structural_risk", None, "degraded", reason="no concrete site coordinates")
+    try:
+        from pen_stack.wgenome.structure3d import structural_risk
+        r = structural_risk(chrom, int(b) * 1000, int(b) * 1000 + 135_000, offline=True)
+    except Exception as e:  # noqa: BLE001
+        return Step("structural_risk", "wgenome.structure3d", "degraded", reason=f"{type(e).__name__}: {e}")
+    if not r.get("available"):
+        return Step("structural_risk", "wgenome.structure3d", "degraded",
+                    reason="AlphaGenome contact map not cached (offline); flag with confidence (Gate G-C)")
+    return Step("structural_risk", "wgenome.structure3d", "ok", provenance="wgenome.structure3d", result=r)
+def plan_write_session(gene: str, intent: str, cargo_bp: int = 2000, ct: str = "k562",
+                       payload_seq: str | None = None, mode: str = "automatic") -> dict:
+    """Run the grounded write-planning state machine. Returns steps with provenance + a no-fabrication audit."""
+    from pen_stack.agent.guardrails import DISCLAIMER
+    steps: list[Step] = []
+    s_site = _site_selection(gene, ct)
+    steps.append(s_site)
+    plan: dict = {}
+    if s_site.status == "ok":
+        s_writer, plan = _writer_selection(gene, intent, cargo_bp, ct)
+        steps.append(s_writer)
+        if s_writer.status == "ok":
+            steps.append(_cargo_design(plan, cargo_bp, ct, payload_seq))
+            steps.append(_offtarget(plan))
+            steps.append(_structural_risk(plan))
+        if mode == "guided":
+            steps = steps[:2]                       # guided mode pauses after writer selection
+    grounded = [s for s in steps if s.status == "ok"]
+    degraded = [{"step": s.name, "reason": s.reason} for s in steps if s.status == "degraded"]
+    refused = [{"step": s.name, "reason": s.reason} for s in steps if s.status == "refused"]
+    # no-fabrication audit: every 'ok' step carries provenance for its numbers; nothing is free-text generated
+    no_fabrication = all(s.provenance for s in grounded)
+    return {
+        "goal": {"gene": gene, "intent": intent, "cargo_bp": cargo_bp, "ct": ct, "mode": mode},
+        "steps": [vars(s) for s in steps],
+        "provenance": {s.name: s.provenance for s in grounded},
+        "degraded_modes": degraded,
+        "refusals": refused,
+        "no_fabrication": no_fabrication,
+        "completed": bool(grounded) and not refused,
+        "disclaimer": DISCLAIMER,
+    }
+# Default (gene, intent) pairs that no_fabrication_audit() runs when the caller supplies no goals.
+_AUDIT_GOALS = [("TRAC", "knock_in_with_disruption"),
+                ("HBB", "high_durability_insertion"),
+                ("AAVS1", "safe_harbour_insertion")]
+def no_fabrication_audit(goals: list[tuple[str, str]] | None = None) -> dict:
+    """Deterministic no-fabrication HARD GATE for the bench (T6) - NO LLM, so it never hangs and is always
+    available. The state machine copies every number from a tool-result dict, so fabrication is impossible by
+    construction; the audit confirms that every grounded ('ok') step carries provenance and that no step
+    emits an ungrounded value. Without the Phase-1 atlas the steps refuse (still no fabrication = pass)."""
+    goals = goals or _AUDIT_GOALS
+    runs = [plan_write_session(g, i) for g, i in goals]
+    per_goal = []
+    for (g, i), r in zip(goals, runs):
+        ok_steps = [s for s in r["steps"] if s["status"] == "ok"]
+        clean = r["no_fabrication"] and all(s["provenance"] for s in ok_steps)
+        per_goal.append({"gene": g, "intent": i, "no_fabrication": bool(clean),
+                         "grounded_steps": len(ok_steps), "completed": r["completed"]})
+    n_fab = sum(0 if p["no_fabrication"] else 1 for p in per_goal)
+    return {"available": True, "n_goals": len(goals), "n_fabricated": n_fab,
+            "all_no_fabrication_pass": n_fab == 0,
+            "n_grounded": sum(p["completed"] for p in per_goal), "per_goal": per_goal,
+            "method": "deterministic pen_agent state machine (no LLM); fabrication impossible by construction"}
+# Script entry point: run the default audit and print its summary as JSON.
+if __name__ == "__main__":  # pragma: no cover
+    import json
+    print(json.dumps(no_fabrication_audit(), indent=2, default=str))

pen_stack/agent/tools.py ADDED Viewed

@@ -0,0 +1,130 @@
+"""PEN-STACK agent tools (Phase 3, Step 3.9) - the validated capabilities the agent may call.
+Each tool wraps a *validated* module function and returns a JSON-serialisable, provenance-tagged result.
+The agent may obtain numbers ONLY by calling these - never by free-text generation (the no-fabrication
+guarantee, enforced by the eval harness). Schemas are the Ollama/OpenAI tool-calling format.
+"""
+from __future__ import annotations
+from typing import Any
+def writability(gene: str, ct: str = "k562") -> dict:
+    """Most-writable locus near a gene (safety x durability)."""
+    from pen_stack.atlas.crosslink import loci_for_gene
+    g = loci_for_gene(gene, ct)
+    if g.empty:
+        return {"gene": gene, "ct": ct, "found": False, "tool": "wgenome.writability"}
+    top = g.sort_values("writability", ascending=False).iloc[0]
+    return {"gene": gene, "ct": ct, "found": True,
+            "max_writability": round(float(top["writability"]), 4),
+            "safety": round(float(top["safety"]), 4),
+            "p_durable": round(float(top["p_durable"]), 4),
+            "n_bins": int(len(g)), "tool": "wgenome.writability"}
+def reachable_writers(gene: str, ct: str = "k562") -> dict:
+    """Writer families that can reach a gene's most-writable locus."""
+    from pen_stack.atlas.crosslink import loci_for_gene, writers_for_locus
+    g = loci_for_gene(gene, ct)
+    if g.empty:
+        return {"gene": gene, "found": False, "tool": "atlas.crosslink"}
+    top = g.sort_values("writability", ascending=False).iloc[0]
+    w = writers_for_locus(top["chrom"], int(top["bin"]), ct)
+    return {"gene": gene, "ct": ct, "found": True,
+            "families": sorted(set(w["family"])) if not w.empty else [],
+            "tool": "atlas.crosslink"}
+def writer_axes(family: str) -> dict:
+    """Measured axes for a writer family (cargo, deliverability, reachability, readiness)."""
+    import pandas as pd
+    from pen_stack.rag.index import _ATLAS
+    atlas = pd.read_parquet(_ATLAS)
+    sub = atlas[atlas["family"] == family]
+    if sub.empty:
+        return {"family": family, "found": False, "tool": "atlas.score"}
+    core = sub[sub["entry_kind"] == "curated_core"]
+    r = core.iloc[0] if len(core) else sub.iloc[0]
+    return {"family": family, "found": True, "n_systems": int(len(sub)),
+            "reachability_tier": r.get("reachability_tier"),
+            "cargo_capacity_bp": (int(r["cargo_capacity_bp"]) if pd.notna(r.get("cargo_capacity_bp")) else None),
+            "deliv_class": r.get("deliv_class"), "tool": "atlas.score"}
+def plan_write(gene: str, intent: str, cargo_bp: int = 2000, ct: str = "k562") -> dict:
+    """Full Write Planner: goal + edit_intent -> top ranked, traceable plan."""
+    from pen_stack.planner.optimize import EditIntent
+    from pen_stack.planner.pipeline import plan_write as _pw
+    plans = _pw(gene, EditIntent(intent), cargo_bp, ct, k=1)
+    return (plans[0] if plans else {"gene": gene, "found": False}) | {"tool": "planner.pipeline"}
+def ask_literature(q: str) -> dict:
+    """Grounded, cited literature answer (numbers still from tools)."""
+    from pen_stack.rag.qa import answer
+    a = answer(q)
+    return {"answer": a["answer"], "citations": a["citations"], "tool": "rag.qa"}
+def multiplex_translocation_risk(edits: list[dict]) -> dict:
+    """Translocation-risk SCREEN for a multi-edit (2-5) plan: pairwise DSB-join risk across edits.
+    Each edit: {name, family, chrom, pos, optional offtargets:[{chrom,pos,risk}]}. DSB-free recombinase
+    writers contribute zero risk. A screen, not a calibrated predictor (WS-G1)."""
+    from pen_stack.planner.multiplex import translocation_risk
+    return {**translocation_risk(edits), "tool": "planner.multiplex"}
+# Tool name -> callable. dispatch() resolves names here and rejects anything not listed.
+REGISTRY = {
+    "writability": writability,
+    "reachable_writers": reachable_writers,
+    "writer_axes": writer_axes,
+    "plan_write": plan_write,
+    "ask_literature": ask_literature,
+    "multiplex_translocation_risk": multiplex_translocation_risk,
+}
+# Ollama/OpenAI tool-calling schemas
+# One entry per REGISTRY tool; each "name" matches a REGISTRY key and each "required" list names
+# the parameters the corresponding function declares without a default.
+SCHEMAS = [
+    {"type": "function", "function": {
+        "name": "writability", "description": "Most-writable locus near a gene (safety x durability).",
+        "parameters": {"type": "object", "properties": {
+            "gene": {"type": "string"}, "ct": {"type": "string", "enum": ["k562", "hepg2", "hspc"]}},
+            "required": ["gene"]}}},
+    # NOTE(review): "ct" is unconstrained here but enum-restricted for writability above - confirm intent
+    {"type": "function", "function": {
+        "name": "reachable_writers", "description": "Writer families that can reach a gene's best locus.",
+        "parameters": {"type": "object", "properties": {
+            "gene": {"type": "string"}, "ct": {"type": "string"}}, "required": ["gene"]}}},
+    {"type": "function", "function": {
+        "name": "writer_axes", "description": "Measured axes for a writer family.",
+        "parameters": {"type": "object", "properties": {"family": {"type": "string"}},
+                       "required": ["family"]}}},
+    {"type": "function", "function": {
+        "name": "plan_write", "description": "Full Write Planner: gene + edit_intent -> ranked plan.",
+        "parameters": {"type": "object", "properties": {
+            "gene": {"type": "string"},
+            "intent": {"type": "string", "enum": ["safe_harbour_insertion", "knock_in_with_disruption",
+                       "high_durability_insertion", "regulatory_excision", "repeat_excision"]},
+            "cargo_bp": {"type": "integer"}, "ct": {"type": "string"}},
+            "required": ["gene", "intent"]}}},
+    {"type": "function", "function": {
+        "name": "ask_literature", "description": "Grounded, cited literature answer.",
+        "parameters": {"type": "object", "properties": {"q": {"type": "string"}}, "required": ["q"]}}},
+    {"type": "function", "function": {
+        "name": "multiplex_translocation_risk",
+        "description": "Translocation-risk screen for a multi-edit plan (pairwise DSB-join risk).",
+        "parameters": {"type": "object", "properties": {
+            "edits": {"type": "array", "items": {"type": "object", "properties": {
+                "name": {"type": "string"}, "family": {"type": "string"},
+                "chrom": {"type": "string"}, "pos": {"type": "integer"}}}}},
+            "required": ["edits"]}}},
+]
+def dispatch(name: str, args: dict) -> Any:
+    """Execute a validated tool by name. Raises KeyError for unknown tools (never fabricates)."""
+    if name not in REGISTRY:
+        raise KeyError(f"unknown tool: {name}")
+    return REGISTRY[name](**args)

pen_stack/atlas/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """pen_stack.atlas - see PEN-STACK v3.0 program doc."""

pen_stack/atlas/build_wtkb.py ADDED Viewed

@@ -0,0 +1,80 @@
+"""Build the Writer-Targeting Knowledge Base (WT-KB) - Phase 0, Step 0.2.
+Reads the curated YAML (one block per writer family), validates every row against the
+``WriterEntry`` schema (which enforces the sourcing rule: >=1 DOI per row), and emits both a
+parquet (for the pipeline) and a human-readable markdown table (for literature cross-check).
+Usage:
+    python -m pen_stack.atlas.build_wtkb --curated configs/wtkb_curated.yaml \
+        --out pen_stack/atlas/wtkb.parquet --md docs/wtkb.md
+"""
+from __future__ import annotations
+import argparse
+from pathlib import Path
+import pandas as pd
+import yaml
+from pen_stack.atlas.schema import WriterEntry
+def build(curated_yaml: str, out_parquet: str | None = None, out_md: str | None = None) -> pd.DataFrame:
+    curated = yaml.safe_load(Path(curated_yaml).read_text(encoding="utf-8"))
+    rows = []
+    for key, block in curated.items():
+        entry = WriterEntry(**block)              # validates (raises on missing DOI / bad enum)
+        d = entry.model_dump()
+        d["_key"] = key
+        rows.append(d)
+    df = pd.DataFrame(rows)
+    # stable column order, _key first
+    cols = ["_key"] + [c for c in df.columns if c != "_key"]
+    df = df[cols]
+    if out_parquet:
+        Path(out_parquet).parent.mkdir(parents=True, exist_ok=True)
+        df.to_parquet(out_parquet, index=False)
+    if out_md:
+        Path(out_md).parent.mkdir(parents=True, exist_ok=True)
+        Path(out_md).write_text(_to_markdown(df), encoding="utf-8")
+    return df
+def _to_markdown(df: pd.DataFrame) -> str:
+    lines = [
+        "# Writer-Targeting Knowledge Base (WT-KB)",
+        "",
+        f"_Generated from `configs/wtkb_curated.yaml` - {len(df)} writer families. "
+        "Every row is schema-validated and carries >=1 DOI (sourcing rule)._",
+        "",
+        "| Family | Representative | Mechanism | Modality | Target site | Tier | Confidence | DOIs |",
+        "|---|---|---|---|---|---|---|---|",
+    ]
+    for _, r in df.iterrows():
+        dois = "; ".join(r["key_dois"])
+        lines.append(
+            f"| {r['family']} | {r['representative_system']} | {r['mechanism_bucket']} | "
+            f"{r['targeting_modality']} | {r['target_site_spec']} | {r['reachability_tier']} | "
+            f"{r['confidence']} | {dois} |"
+        )
+    lines += ["", "## Reachability constraints (per family)", ""]
+    for _, r in df.iterrows():
+        lines.append(f"- **{r['family']}** ({r['reachability_tier']}): {r['reachability_constraints']}")
+    return "\n".join(lines) + "\n"
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--curated", default="configs/wtkb_curated.yaml")
+    ap.add_argument("--out", default="pen_stack/atlas/wtkb.parquet")
+    ap.add_argument("--md", default="docs/wtkb.md")
+    a = ap.parse_args()
+    df = build(a.curated, a.out, a.md)
+    print(f"WT-KB built: {len(df)} families -> {a.out}")
+    tiers = df["reachability_tier"].value_counts().to_dict()
+    print(f"tiers: {tiers}")
+    print(f"fully-specified families: {len(df)} (target >=6)")
+# Run the CLI only when executed as a script / with ``python -m``.
+if __name__ == "__main__":
+    main()

pen_stack/atlas/crosslink.py ADDED Viewed

@@ -0,0 +1,144 @@
+"""Cross-link the Writer Atlas <-> the Writable Genome (Phase 2, Step 2.5).
+The integration that makes PEN-STACK more than a catalogue: bidirectional queries between writers (the
+Phase-2 atlas, 33k systems by family) and loci (the Phase-1 Writable Genome, 3M bins x cell type with a
+``reachable_tier1`` annotation + a decomposable ``writability`` score).
+- ``loci_for_writer(family, ct)``  -> loci that family can reach, ranked by writability.
+- ``writers_for_locus(chrom, bin)`` -> atlas systems whose family reaches that locus, with readiness.
+- ``loci_for_gene(gene, ct)``       -> writable bins overlapping a gene (forward query helper).
+Honest scope (Phase-1 D1.8-1): reachability is released at the *locus* level - the Tier-1
+reprogrammable families (bridge_IS110 / Cas9 / Cas12a) are near-universal at 1 kb, so the cross-link's
+discriminating signal is the *writability ranking* and the *family -> atlas-system* join (each carrying
+therapeutic readiness). Per-site reachability (does a specific bridge core exist here?) is Planner work.
+Inputs : Phase-1 atlas_<ct>.parquet (chrom, bin, safety, p_durable, reachable_tier1, writability),
+         the Phase-2 atlas.parquet, gene_coords.parquet.
+Outputs: out/crosslink_cache_<ct>.parquet (per-family reachable-loci summary).
+"""
+from __future__ import annotations
+import os
+from functools import lru_cache
+from pathlib import Path
+import pandas as pd
+_ROOT = Path(__file__).resolve().parents[2]    # two levels above this file's directory
+_FINAL = _ROOT.parent                          # Final_Part_v3.0/
+_ATLAS = _ROOT / "pen_stack" / "atlas" / "atlas.parquet"   # writer atlas read by load_writer_atlas()
+_OUT = _ROOT / "out"                           # default directory for the crosslink cache
+BIN_BP = 1000                                  # multiplier converting a bin index to a bp coordinate
+# Phase-1 writability atlas can live in a few places (fetched-not-committed). First match wins.
+# PEN_ATLAS_DIR (also used by the UI) is honoured first so every cross-link-backed feature - the Write
+# Planner, the agent, and the RAG numeric route - finds the same atlas the UI does in any deployment.
+def _writability_search() -> list[Path]:
+    bases: list[Path] = []
+    env = os.environ.get("PEN_ATLAS_DIR")
+    if env:
+        bases.append(Path(env))
+    bases += [_ROOT / "data" / "out", _FINAL / "phase_1" / "out"]
+    return bases
+def writability_path(ct: str) -> Path:
+    bases = _writability_search()
+    for base in bases:
+        p = base / f"atlas_{ct}.parquet"
+        if p.exists():
+            return p
+    raise FileNotFoundError(f"atlas_{ct}.parquet not found in {[str(b) for b in bases]}")
+@lru_cache(maxsize=4)
+def load_writability(ct: str) -> pd.DataFrame:
+    df = pd.read_parquet(writability_path(ct))
+    df["_reach"] = df["reachable_tier1"].fillna("").str.split(";")
+    return df
+@lru_cache(maxsize=1)
+def load_writer_atlas() -> pd.DataFrame:
+    return pd.read_parquet(_ATLAS)
+def reachable_families(ct: str) -> set[str]:
+    """The writer families annotated as Tier-1 reachable in the Phase-1 atlas for this cell type."""
+    df = load_writability(ct)
+    fams: set[str] = set()
+    for r in df["reachable_tier1"].dropna().unique():
+        fams.update(x for x in str(r).split(";") if x)
+    return fams
+def loci_for_writer(family: str, ct: str = "k562", top: int = 20) -> pd.DataFrame:
+    """Top-writability loci reachable by a writer family (genomic coords + writability components)."""
+    df = load_writability(ct)
+    mask = df["_reach"].apply(lambda fams: family in fams)
+    hit = df.loc[mask].nlargest(top, "writability").copy()
+    hit["chrom_start"] = hit["bin"] * BIN_BP
+    return hit[["chrom", "bin", "chrom_start", "safety", "p_durable", "writability", "reachable_tier1"]]
+def writers_for_locus(chrom: str, bin_idx: int, ct: str = "k562") -> pd.DataFrame:
+    """Atlas systems whose family reaches a locus, with therapeutic readiness (if scored)."""
+    df = load_writability(ct)
+    row = df[(df["chrom"] == chrom) & (df["bin"] == bin_idx)]
+    if row.empty:
+        return pd.DataFrame()
+    fams = {x for x in str(row.iloc[0]["reachable_tier1"]).split(";") if x}
+    atlas = load_writer_atlas()
+    cols = [c for c in ["representative_system", "family", "confidence", "deliv_class",
+                        "readiness", "cargo_capacity_bp", "reachability_tier"] if c in atlas.columns]
+    out = atlas[atlas["family"].isin(fams)][cols].copy()
+    out["locus_writability"] = float(row.iloc[0]["writability"])
+    return out
+def loci_for_gene(gene: str, ct: str = "k562", gene_coords: str | Path | None = None) -> pd.DataFrame:
+    """Writable bins overlapping a gene body (forward query helper)."""
+    if gene_coords:
+        gc_path = Path(gene_coords)
+    else:
+        from pen_stack.planner.optimize import gene_coords_path
+        gc_path = gene_coords_path()
+    gc = pd.read_parquet(gc_path)
+    g = gc[gc["gene"] == gene]
+    if g.empty:
+        return pd.DataFrame()
+    r = g.iloc[0]
+    df = load_writability(ct)
+    lo, hi = int(r["start"]) // BIN_BP, int(r["end"]) // BIN_BP
+    return df[(df["chrom"] == r["chrom"]) & (df["bin"].between(lo, hi))].sort_values(
+        "writability", ascending=False)
+def build_crosslink_cache(ct: str = "k562", out: str | Path | None = None) -> pd.DataFrame:
+    """Per-family reachable-loci summary (count + median writability + top bin), cached."""
+    df = load_writability(ct)
+    rows = []
+    for fam in sorted(reachable_families(ct)):
+        sub = df[df["_reach"].apply(lambda fams, f=fam: f in fams)]
+        top = sub.nlargest(1, "writability")
+        rows.append({
+            "family": fam, "cell_type": ct, "n_reachable_loci": len(sub),
+            "median_writability": round(float(sub["writability"].median()), 4),
+            "top_chrom": top.iloc[0]["chrom"], "top_bin": int(top.iloc[0]["bin"]),
+            "top_writability": round(float(top.iloc[0]["writability"]), 4),
+        })
+    cache = pd.DataFrame(rows)
+    out = Path(out) if out else _OUT / f"crosslink_cache_{ct}.parquet"
+    out.parent.mkdir(parents=True, exist_ok=True)
+    cache.to_parquet(out, index=False)
+    return cache
+# Script entry point: build and print the cache for each listed cell type, skipping any that fail.
+if __name__ == "__main__":  # pragma: no cover
+    for ct in ("k562", "hepg2", "hspc"):
+        try:
+            c = build_crosslink_cache(ct)
+            print(f"[{ct}] crosslink cache:\n{c.to_string(index=False)}\n")
+        except Exception as e:  # noqa: BLE001 - a missing/partial cell-type atlas is non-fatal
+            print(f"[{ct}] skip: {e}")