pen-stack 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. pen_stack/__init__.py +2 -0
  2. pen_stack/_resources.py +34 -0
  3. pen_stack/adapt/__init__.py +14 -0
  4. pen_stack/adapt/finetune.py +33 -0
  5. pen_stack/adapt/ingest.py +86 -0
  6. pen_stack/adapt/pipeline.py +101 -0
  7. pen_stack/adapt/recalibrate.py +58 -0
  8. pen_stack/adapt/report.py +130 -0
  9. pen_stack/agent/__init__.py +1 -0
  10. pen_stack/agent/guardrails.py +49 -0
  11. pen_stack/agent/mcp_server.py +42 -0
  12. pen_stack/agent/orchestrator.py +106 -0
  13. pen_stack/agent/pen_agent.py +169 -0
  14. pen_stack/agent/tools.py +130 -0
  15. pen_stack/atlas/__init__.py +1 -0
  16. pen_stack/atlas/build_wtkb.py +80 -0
  17. pen_stack/atlas/crosslink.py +144 -0
  18. pen_stack/atlas/expand.py +190 -0
  19. pen_stack/atlas/schema.py +59 -0
  20. pen_stack/atlas/scorecard.py +134 -0
  21. pen_stack/atlas/universe.py +75 -0
  22. pen_stack/atlas/variant_propose.py +155 -0
  23. pen_stack/bridge/__init__.py +1 -0
  24. pen_stack/bridge/activity.py +52 -0
  25. pen_stack/bridge/cli.py +65 -0
  26. pen_stack/bridge/fold_qc.py +53 -0
  27. pen_stack/bridge/guide_qc.py +84 -0
  28. pen_stack/bridge/ingest.py +139 -0
  29. pen_stack/bridge/offtarget.py +133 -0
  30. pen_stack/bridge/ortholog_screen.py +73 -0
  31. pen_stack/bridge/pipeline.py +83 -0
  32. pen_stack/cli.py +126 -0
  33. pen_stack/data/__init__.py +1 -0
  34. pen_stack/data/encode.py +84 -0
  35. pen_stack/data/genome.py +71 -0
  36. pen_stack/data/ingest_chromatin.py +119 -0
  37. pen_stack/data/ingest_integration.py +112 -0
  38. pen_stack/data/ingest_safety_annot.py +164 -0
  39. pen_stack/data/ingest_trip.py +76 -0
  40. pen_stack/mech/__init__.py +1 -0
  41. pen_stack/mech/classify_atlas.py +71 -0
  42. pen_stack/mech/whitelist.py +66 -0
  43. pen_stack/monitor/__init__.py +1 -0
  44. pen_stack/monitor/europepmc.py +32 -0
  45. pen_stack/monitor/run.py +57 -0
  46. pen_stack/monitor/triage.py +63 -0
  47. pen_stack/planner/__init__.py +1 -0
  48. pen_stack/planner/cargo.py +56 -0
  49. pen_stack/planner/cargo_polish.py +146 -0
  50. pen_stack/planner/delivery.py +32 -0
  51. pen_stack/planner/multiplex.py +110 -0
  52. pen_stack/planner/optimize.py +156 -0
  53. pen_stack/planner/pipeline.py +86 -0
  54. pen_stack/planner/report.py +26 -0
  55. pen_stack/rag/__init__.py +1 -0
  56. pen_stack/rag/index.py +53 -0
  57. pen_stack/rag/llm.py +178 -0
  58. pen_stack/rag/qa.py +105 -0
  59. pen_stack/score/__init__.py +1 -0
  60. pen_stack/score/recalibrate.py +77 -0
  61. pen_stack/score/therapeutic.py +85 -0
  62. pen_stack/server/__init__.py +1 -0
  63. pen_stack/server/api.py +142 -0
  64. pen_stack/ui/__init__.py +1 -0
  65. pen_stack/ui/app.py +518 -0
  66. pen_stack/validate/__init__.py +1 -0
  67. pen_stack/validate/adapt_demo.py +69 -0
  68. pen_stack/validate/agent_eval.py +117 -0
  69. pen_stack/validate/blind_gsh_discovery.py +165 -0
  70. pen_stack/validate/cargo_directionality.py +57 -0
  71. pen_stack/validate/durability_baselines.py +150 -0
  72. pen_stack/validate/forward_hypotheses.py +104 -0
  73. pen_stack/validate/guide_qc_demo.py +58 -0
  74. pen_stack/validate/intent_specification.py +82 -0
  75. pen_stack/validate/paper3_benchmark.py +165 -0
  76. pen_stack/validate/paper4_real_validation.py +144 -0
  77. pen_stack/validate/paper4_validation.py +82 -0
  78. pen_stack/validate/seq_vs_measured.py +134 -0
  79. pen_stack/validate/within_locus_ranking.py +74 -0
  80. pen_stack/validate/writer_recovery.py +86 -0
  81. pen_stack/wgenome/__init__.py +1 -0
  82. pen_stack/wgenome/chromatin_seq.py +83 -0
  83. pen_stack/wgenome/durability.py +108 -0
  84. pen_stack/wgenome/export_tracks.py +52 -0
  85. pen_stack/wgenome/features.py +82 -0
  86. pen_stack/wgenome/gsh_baseline.py +117 -0
  87. pen_stack/wgenome/providers.py +245 -0
  88. pen_stack/wgenome/safety.py +69 -0
  89. pen_stack/wgenome/structure3d.py +168 -0
  90. pen_stack/wgenome/writability.py +72 -0
  91. pen_stack-3.1.0.dist-info/METADATA +451 -0
  92. pen_stack-3.1.0.dist-info/RECORD +96 -0
  93. pen_stack-3.1.0.dist-info/WHEEL +5 -0
  94. pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
  95. pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
  96. pen_stack-3.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,106 @@
1
+ """The PEN-STACK agent (Phase 3, Step 3.9) - tool-use orchestration.
2
+
3
+ Given a natural-language goal ("durably express factor IX in hepatocytes"), the agent plans the whole
4
+ write by calling validated tools (writability -> reachable writers -> writer axes -> plan_write -> cited
5
+ literature) in a tool-calling loop driven by the configured LLM (hybrid: NVIDIA Nemotron with Ollama
6
+ fallback, via ``pen_stack.rag.llm.chat``). Guardrails: it obtains numbers ONLY from tool calls (no
7
+ free-text predictions), refuses clinical-directive prompts, and logs an auditable trace.
8
+
9
+ Graceful: if no LLM provider is reachable, ``run_agent`` returns a refusal-free deterministic fallback
10
+ that calls plan_write directly - so the platform degrades to the validated pipeline rather than failing.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ from pathlib import Path
16
+
17
+ from pen_stack.agent.guardrails import DISCLAIMER, out_of_scope
18
+ from pen_stack.agent.tools import SCHEMAS, dispatch
19
+ from pen_stack.rag.llm import chat as llm_chat
20
+
21
# Directory that ``save_trace`` writes into: ``out/agent_traces`` under the third ancestor
# directory of this file's resolved path.
_TRACES = Path(__file__).resolve().parents[2].joinpath("out", "agent_traces")

# System prompt placed first in every agent conversation. It restricts the model to tool-derived
# facts and to decision support. The sentences are joined with single spaces.
_SYSTEM = " ".join((
    "You are the PEN-STACK genome-writing planning agent.",
    "You MUST obtain every fact and number by calling the provided tools - never invent a number, "
    "gene, score, or citation.",
    "Plan a write by calling: writability, reachable_writers, writer_axes, plan_write, ask_literature.",
    "When you have enough tool results, write a short plan that cites which tool produced each number.",
    "Decision-support only - never give clinical directives.",
))
29
+
30
+
31
def _tool_response(style: str, call_id: str | None, content: str) -> dict:
    """Build the ``role="tool"`` message that carries a tool result back to the model.

    ``tool_call_id`` is attached only for the ``"openai"`` style and only when a call id is known;
    every other combination yields just ``role`` and ``content``.
    """
    if style != "openai" or call_id is None:
        return {"role": "tool", "content": content}
    return {"role": "tool", "content": content, "tool_call_id": call_id}
37
+
38
+
39
def run_agent(goal: str, max_steps: int = 12, cfg: dict | None = None) -> dict:
    """Turn a goal into a cited, auditable plan. Numbers come only from tool calls.

    Args:
        goal: Natural-language request. It is screened with ``out_of_scope`` first; a truthy
            result is returned as the refusal text and no LLM call is made.
        max_steps: Maximum number of LLM round-trips before giving up.
        cfg: Optional LLM configuration. When falsy, ``load_llm_config()`` supplies the
            ``agent_call_timeout`` (default 60); ``cfg`` itself is passed to ``llm_chat`` unchanged.

    Returns:
        A dict with ``refused``, ``plan``, ``trace`` and ``disclaimer``. LLM-driven results also
        carry ``llm`` and ``provider``. When ``llm_chat`` returns ``None`` the result of
        ``_fallback`` is returned instead.
    """
    refusal = out_of_scope(goal)
    if refusal:
        return {"refused": True, "plan": refusal, "trace": [], "disclaimer": DISCLAIMER}

    from pen_stack.rag.llm import load_llm_config
    step_timeout = int((cfg or load_llm_config()).get("agent_call_timeout", 60))
    msgs = [{"role": "system", "content": _SYSTEM}, {"role": "user", "content": goal}]
    trace: list[dict] = []
    seen: set = set()
    provider = None  # stays None if the loop never runs (max_steps <= 0)

    for _ in range(max_steps):
        resp = llm_chat(msgs, tools=SCHEMAS, cfg=cfg, timeout=step_timeout)
        if resp is None:
            return _fallback(goal, trace)
        provider, style = resp.get("provider"), resp.get("style", "openai")
        calls = resp.get("tool_calls") or []
        if not calls:
            # ``content`` may be present but None; ``.get("content", "")`` would then crash on strip().
            return {"refused": False, "plan": (resp.get("content") or "").strip(),
                    "trace": trace, "disclaimer": DISCLAIMER, "llm": True, "provider": provider}
        raw = resp["raw"]
        msgs.append(raw)  # append the assistant turn verbatim
        raw_calls = raw.get("tool_calls") or []
        for i, c in enumerate(calls):
            name = c["function"]["name"]
            args = c["function"]["arguments"]
            call_id = raw_calls[i].get("id") if i < len(raw_calls) else None
            # Identical (tool, args) pairs are answered with a note instead of being re-executed.
            key = f"{name}:{json.dumps(args, sort_keys=True, default=str)}"
            if key in seen:
                msgs.append(_tool_response(style, call_id, json.dumps(
                    {"note": "already called with these args; use prior result and finalise the plan"})))
                continue
            seen.add(key)
            try:
                result = dispatch(name, args)  # VALIDATED tool only
            except Exception as e:  # noqa: BLE001 - report the failure to the model, keep looping
                result = {"error": str(e)}
            trace.append({"tool": name, "args": args, "result": result})
            msgs.append(_tool_response(style, call_id, json.dumps(result, default=str)))
    # Same keys as the other LLM-driven result, so callers can always read ``provider``.
    return {"refused": False, "plan": "(max steps reached)", "trace": trace,
            "disclaimer": DISCLAIMER, "llm": True, "provider": provider}
80
+
81
+
82
def _fallback(goal: str, trace: list[dict]) -> dict:
    """Deterministic fallback when no LLM is reachable: call plan_write on a best-effort parse.

    The target gene is the first all-uppercase token of length >= 2 in ``goal``, after commas are
    treated as separators and surrounding punctuation is stripped. The intent is chosen by the
    first matching keyword, defaulting to ``EditIntent.SAFE_HARBOUR``. A failing ``plan_write``
    is recorded in the trace as ``{"error": ...}`` rather than raised, mirroring how ``run_agent``
    handles tool errors.

    Args:
        goal: The user's natural-language request.
        trace: Trace list accumulated so far; the plan_write call is appended to it.

    Returns:
        A result dict with ``llm`` set to ``False``.
    """
    from pen_stack.planner.optimize import EditIntent

    # Strip sentence punctuation so "TRAC." or "(HBB)" yields the bare gene symbol.
    tokens = (w.strip(".;:!?()[]{}\"'") for w in goal.replace(",", " ").split())
    gene = next((w for w in tokens if w.isupper() and len(w) >= 2), None)

    lowered = goal.lower()
    intent = EditIntent.SAFE_HARBOUR.value
    for kw, it in [("disrupt", "knock_in_with_disruption"), ("knock", "knock_in_with_disruption"),
                   ("durab", "high_durability_insertion"), ("enhancer", "regulatory_excision"),
                   ("repeat", "repeat_excision")]:
        if kw in lowered:
            intent = it
            break
    if not gene:
        return {"refused": False, "plan": "No target gene detected; LLM unavailable.",
                "trace": trace, "disclaimer": DISCLAIMER, "llm": False}

    args = {"gene": gene, "intent": intent}
    plan = f"[deterministic fallback] plan for {gene} ({intent})"
    try:
        res = dispatch("plan_write", args)
    except Exception as e:  # noqa: BLE001 - the fallback must degrade, not crash
        res = {"error": str(e)}
        plan = f"[deterministic fallback] plan_write failed for {gene} ({intent}): {e}"
    trace.append({"tool": "plan_write", "args": args, "result": res})
    return {"refused": False, "plan": plan, "trace": trace, "disclaimer": DISCLAIMER, "llm": False}
100
+
101
+
102
def save_trace(result: dict, name: str) -> Path:
    """Write ``result`` as indented JSON to ``<_TRACES>/<name>.json`` and return that path.

    The trace directory is created if missing. Values that JSON cannot encode are stringified.
    """
    _TRACES.mkdir(parents=True, exist_ok=True)
    target = _TRACES / f"{name}.json"
    payload = json.dumps(result, indent=2, default=str)
    target.write_text(payload, encoding="utf-8")
    return target
@@ -0,0 +1,169 @@
1
+ """PEN-Agent - grounded write-planning state machine (v3.1, WS-E2).
2
+
3
+ A deterministic task-state machine over the VALIDATED tools. It sequences a genome-write plan:
4
+
5
+ goal intake -> site selection (writability) -> writer selection (reachability) ->
6
+ cargo design (+ Cargo Polish) -> off-target -> 3D structural risk -> report
7
+
8
+ Core property (the contribution): NO FABRICATION. Every number in the output is copied verbatim from a
9
+ tool-result dict and tagged with that tool's provenance; a step whose tool cannot ground a value is marked
10
+ `degraded`/`refused`, never invented. The agent therefore runs end-to-end even when AlphaGenome or the
11
+ bridge engine is unavailable - those steps degrade with a reason instead of guessing.
12
+
13
+ The LLM (agent/orchestrator.py) is an optional conversational front-end over this same machine; the plan
14
+ itself is deterministic, so the result is reproducible and the no-fabrication guarantee holds with or
15
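+ without an LLM. Modes: "automatic" (run all steps) and "guided" (pause after writer selection); "qa" (single tool) is not handled by ``plan_write_session`` in this module, which treats any other mode value as "automatic".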
16
+ """
17
+ from __future__ import annotations
18
+
19
+ from dataclasses import dataclass, field
20
+
21
+ from pen_stack.agent import tools as T
22
+
23
+
24
@dataclass
class Step:
    """One stage of the write-planning state machine together with its grounding."""

    # Stage identifier, e.g. "site_selection".
    name: str
    # Label of the tool behind this stage; None when no tool was run.
    tool: str | None
    # One of: ok | degraded | refused
    status: str
    # Label of the tool that grounds the values in ``result``.
    provenance: str | None = None
    # Values copied from the tool result; a fresh dict per instance.
    result: dict = field(default_factory=dict)
    # Explanation attached to degraded/refused steps.
    reason: str | None = None
32
+
33
+
34
def _site_selection(gene: str, ct: str) -> Step:
    """Run the writability tool for ``gene`` in cell type ``ct`` and wrap the outcome as a Step.

    The step is ``refused`` when the tool raises or reports no locus. Otherwise it is ``ok`` and
    carries the four figures copied from the tool result.
    """
    try:
        found = T.writability(gene, ct)
    except Exception as exc:  # noqa: BLE001 - missing Phase-1 atlas -> refuse, never fabricate
        why = f"{type(exc).__name__}: {exc}"
        return Step("site_selection", "wgenome.writability", "refused", reason=why)

    if not found.get("found"):
        return Step("site_selection", found.get("tool"), "refused", reason=f"no writable locus for {gene}")

    tool = found["tool"]
    copied = {key: found[key] for key in ("max_writability", "safety", "p_durable", "n_bins")}
    return Step("site_selection", tool, "ok", provenance=tool, result=copied)
44
+
45
+
46
def _writer_selection(gene: str, intent: str, cargo_bp: int, ct: str) -> tuple[Step, dict]:
    """Ask the planner tool for a plan and return ``(step, plan)``.

    On a tool exception or an empty/not-found plan the step is ``refused`` and the plan is ``{}``.
    """
    stage, tool = "writer_selection", "planner.pipeline"

    def _refuse(why: str) -> tuple[Step, dict]:
        return Step(stage, tool, "refused", reason=why), {}

    try:
        plan = T.plan_write(gene, intent, cargo_bp, ct)
    except Exception as exc:  # noqa: BLE001 - missing atlas -> refuse, never fabricate
        return _refuse(f"{type(exc).__name__}: {exc}")
    if not plan or plan.get("found") is False:
        return _refuse("planner returned no plan")

    # The writer family may be reported under any of three keys; take the first truthy one.
    family = plan.get("writer") or plan.get("writer_family") or plan.get("family")
    site = {key: plan.get(key) for key in ("chrom", "bin") if key in plan}
    grounded = {"writer_family": family, "score": plan.get("score"), "site": site}
    return Step(stage, tool, "ok", provenance=tool, result=grounded), plan
60
+
61
+
62
def _cargo_design(plan: dict, cargo_bp: int, ct: str, payload_seq: str | None) -> Step:
    """Design the cargo for the selected writer/site and wrap the outcome as a Step.

    Args:
        plan: Planner result; read for the writer family, ``cargo_capacity_bp``, ``deliv_class``,
            ``chrom`` and ``bin``.
        cargo_bp: Requested cargo size in bp, passed through to ``design_cargo``.
        ct: Cell type, passed through to ``design_cargo``.
        payload_seq: Optional payload sequence, passed through to ``design_cargo``.

    Returns:
        An ``ok`` step with the assembled size and size check, plus the Cargo Polish figures when
        the tool returned them. If ``design_cargo`` cannot be imported or raises, the step is
        ``refused`` with the exception as the reason. This keeps the session running end-to-end,
        as the other steps in this module do.
    """
    fam = plan.get("writer") or plan.get("writer_family") or plan.get("family")
    wr = {"family": fam, "cargo_capacity_bp": plan.get("cargo_capacity_bp"),
          "deliv_class": plan.get("deliv_class")}
    site = (plan.get("chrom"), plan.get("bin"))
    try:
        from pen_stack.planner.cargo import design_cargo
        cargo = design_cargo(cargo_bp, wr, site, ct, payload_seq=payload_seq)
    except Exception as e:  # noqa: BLE001 - cannot ground the cargo -> refuse, never fabricate
        return Step("cargo_design", "planner.cargo", "refused", reason=f"{type(e).__name__}: {e}")

    res = {"assembled_bp": cargo["assembled_bp"], "size_ok": cargo["size_ok"]}
    if "cargo_polish" in cargo:
        polish = cargo["cargo_polish"]
        res["cargo_durability_risk"] = polish["cargo_durability_risk"]
        res["cargo_band"] = polish["band"]
        res["cargo_suggestions"] = [f["suggestion"] for f in polish["flags"]]
    return Step("cargo_design", "planner.cargo", "ok", provenance="planner.cargo+cargo_polish", result=res)
75
+
76
+
77
def _offtarget(plan: dict) -> Step:
    """Run the off-target engine for bridge/seek writers and wrap the outcome as a Step.

    The step is ``degraded`` when the writer family is not a bridge/seek writer, when the engine
    cannot be imported or raises, or when it reports a status starting with ``"pending"``.
    Otherwise it is ``ok`` and carries the engine's result.
    """
    fam = plan.get("writer") or plan.get("writer_family") or plan.get("family") or ""
    fam_lower = str(fam).lower()
    if "bridge" not in fam_lower and "seek" not in fam_lower:
        return Step("offtarget", None, "degraded",
                    reason=f"off-target engine applies to bridge/seek writers, not {fam}")
    try:
        from pen_stack.bridge.offtarget import predict_offtargets
        r = predict_offtargets(fam, (plan.get("chrom"), plan.get("bin")))
    except Exception as e:  # noqa: BLE001
        return Step("offtarget", "bridge.offtarget", "degraded", reason=f"{type(e).__name__}: {e}")
    # ``status`` may be absent, None or a non-string; coerce it so the check cannot raise here.
    if isinstance(r, dict) and str(r.get("status") or "").startswith("pending"):
        return Step("offtarget", "bridge.offtarget", "degraded", reason=r.get("note"))
    return Step("offtarget", "bridge.offtarget", "ok", provenance="bridge.offtarget",
                result=r if isinstance(r, dict) else {"offtargets": r})
90
+
91
+
92
def _structural_risk(plan: dict) -> Step:
    """Query the 3D structural-risk tool for the planned site and wrap the outcome as a Step.

    The step is ``degraded`` when the plan has no ``chrom``/``bin``, when the tool cannot be
    imported or raises, or when it reports the result as not available. The queried window
    starts at ``bin * 1000`` and spans 135,000 bp, in offline mode.
    """
    stage, tool = "structural_risk", "wgenome.structure3d"
    chrom = plan.get("chrom")
    bin_idx = plan.get("bin")
    if chrom is None or bin_idx is None:
        return Step(stage, None, "degraded", reason="no concrete site coordinates")

    try:
        from pen_stack.wgenome.structure3d import structural_risk
        start = int(bin_idx) * 1000
        risk = structural_risk(chrom, start, start + 135_000, offline=True)
    except Exception as exc:  # noqa: BLE001
        return Step(stage, tool, "degraded", reason=f"{type(exc).__name__}: {exc}")

    if risk.get("available"):
        return Step(stage, tool, "ok", provenance=tool, result=risk)
    return Step(stage, tool, "degraded",
                reason="AlphaGenome contact map not cached (offline); flag with confidence (Gate G-C)")
105
+
106
+
107
def plan_write_session(gene: str, intent: str, cargo_bp: int = 2000, ct: str = "k562",
                       payload_seq: str | None = None, mode: str = "automatic") -> dict:
    """Run the grounded write-planning state machine. Returns steps with provenance + a no-fabrication audit.

    Steps run in order: site selection, writer selection, then cargo design, off-target and
    structural risk. A later stage only runs if site and writer selection both ended ``ok``.

    Args:
        gene: Target gene symbol.
        intent: Edit intent string, passed to the planner tool.
        cargo_bp: Requested cargo size in bp.
        ct: Cell type key.
        payload_seq: Optional payload sequence for cargo design.
        mode: ``"guided"`` pauses after writer selection, so at most the first two steps are
            reported. Any other value runs every step.

    Returns:
        A dict with the goal, the steps (as plain dicts), provenance per grounded step, the
        degraded and refused steps with reasons, the ``no_fabrication`` flag, ``completed``
        and the disclaimer.
    """
    from pen_stack.agent.guardrails import DISCLAIMER

    # In guided mode the downstream steps used to be computed and then sliced away. They are
    # skipped instead: the report is the same, without the wasted work or a crash on it.
    run_downstream = mode != "guided"

    steps: list[Step] = []
    s_site = _site_selection(gene, ct)
    steps.append(s_site)
    if s_site.status == "ok":
        s_writer, plan = _writer_selection(gene, intent, cargo_bp, ct)
        steps.append(s_writer)
        if s_writer.status == "ok" and run_downstream:
            steps.append(_cargo_design(plan, cargo_bp, ct, payload_seq))
            steps.append(_offtarget(plan))
            steps.append(_structural_risk(plan))

    grounded = [s for s in steps if s.status == "ok"]
    degraded = [{"step": s.name, "reason": s.reason} for s in steps if s.status == "degraded"]
    refused = [{"step": s.name, "reason": s.reason} for s in steps if s.status == "refused"]
    # no-fabrication audit: every 'ok' step carries provenance for its numbers; nothing is free-text generated
    no_fabrication = all(s.provenance for s in grounded)
    return {
        "goal": {"gene": gene, "intent": intent, "cargo_bp": cargo_bp, "ct": ct, "mode": mode},
        "steps": [vars(s) for s in steps],
        "provenance": {s.name: s.provenance for s in grounded},
        "degraded_modes": degraded,
        "refusals": refused,
        "no_fabrication": no_fabrication,
        "completed": bool(grounded) and not refused,
        "disclaimer": DISCLAIMER,
    }
140
+
141
+
142
# Default (gene, intent) pairs exercised by ``no_fabrication_audit``.
_AUDIT_GOALS = [
    ("TRAC", "knock_in_with_disruption"),
    ("HBB", "high_durability_insertion"),
    ("AAVS1", "safe_harbour_insertion"),
]


def no_fabrication_audit(goals: list[tuple[str, str]] | None = None) -> dict:
    """Deterministic no-fabrication gate: run the state machine per goal and check provenance.

    No LLM is involved. For each ``(gene, intent)`` pair (defaulting to ``_AUDIT_GOALS`` when
    ``goals`` is falsy) a ``plan_write_session`` is run. The goal passes when the session's own
    ``no_fabrication`` flag holds and every ``ok`` step carries a provenance tag. A session whose
    steps all refuse has no ``ok`` steps and therefore also passes.

    Returns:
        A summary dict: number of goals, number that failed, overall pass flag, number of
        completed sessions (``n_grounded``), the per-goal rows and a method description.
    """
    goals = goals or _AUDIT_GOALS
    sessions = [plan_write_session(gene, intent) for gene, intent in goals]

    def _row(goal: tuple[str, str], session: dict) -> dict:
        gene, intent = goal
        ok_steps = [step for step in session["steps"] if step["status"] == "ok"]
        clean = session["no_fabrication"] and all(step["provenance"] for step in ok_steps)
        return {"gene": gene, "intent": intent, "no_fabrication": bool(clean),
                "grounded_steps": len(ok_steps), "completed": session["completed"]}

    per_goal = [_row(goal, session) for goal, session in zip(goals, sessions)]
    n_fab = sum(1 for row in per_goal if not row["no_fabrication"])
    n_completed = sum(row["completed"] for row in per_goal)
    return {"available": True, "n_goals": len(goals), "n_fabricated": n_fab,
            "all_no_fabrication_pass": n_fab == 0,
            "n_grounded": n_completed, "per_goal": per_goal,
            "method": "deterministic pen_agent state machine (no LLM); fabrication impossible by construction"}
165
+
166
+
167
if __name__ == "__main__":  # pragma: no cover
    # Run the default audit and print its summary as indented JSON.
    import json

    audit = no_fabrication_audit()
    print(json.dumps(audit, indent=2, default=str))
@@ -0,0 +1,130 @@
1
+ """PEN-STACK agent tools (Phase 3, Step 3.9) - the validated capabilities the agent may call.
2
+
3
+ Each tool wraps a *validated* module function and returns a JSON-serialisable, provenance-tagged result.
4
+ The agent may obtain numbers ONLY by calling these - never by free-text generation (the no-fabrication
5
+ guarantee, enforced by the eval harness). Schemas are the Ollama/OpenAI tool-calling format.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from typing import Any
10
+
11
+
12
def writability(gene: str, ct: str = "k562") -> dict:
    """Report the highest-writability bin among the bins returned for ``gene`` in ``ct``.

    Returns a not-found dict when there are no bins. Otherwise returns the top bin's writability,
    safety and p_durable (each rounded to 4 decimals) plus the number of bins considered.
    """
    from pen_stack.atlas.crosslink import loci_for_gene

    tool = "wgenome.writability"
    loci = loci_for_gene(gene, ct)
    if loci.empty:
        return {"gene": gene, "ct": ct, "found": False, "tool": tool}

    best = loci.sort_values("writability", ascending=False).iloc[0]
    report = {"gene": gene, "ct": ct, "found": True}
    for out_key, column in (("max_writability", "writability"), ("safety", "safety"),
                            ("p_durable", "p_durable")):
        report[out_key] = round(float(best[column]), 4)
    report["n_bins"] = int(len(loci))
    report["tool"] = tool
    return report
24
+
25
+
26
def reachable_writers(gene: str, ct: str = "k562") -> dict:
    """Writer families that can reach a gene's most-writable locus.

    Args:
        gene: Gene symbol to look up.
        ct: Cell type key.

    Returns:
        A dict with ``gene``, ``ct``, ``found`` and ``tool``. When found it also carries
        ``families``, the sorted unique writer families for the top-writability bin (an empty
        list if no writer reaches it). The not-found result now includes ``ct`` too, so both
        branches and ``writability`` report the same identifying keys.
    """
    from pen_stack.atlas.crosslink import loci_for_gene, writers_for_locus
    g = loci_for_gene(gene, ct)
    if g.empty:
        return {"gene": gene, "ct": ct, "found": False, "tool": "atlas.crosslink"}
    top = g.sort_values("writability", ascending=False).iloc[0]
    w = writers_for_locus(top["chrom"], int(top["bin"]), ct)
    return {"gene": gene, "ct": ct, "found": True,
            "families": sorted(set(w["family"])) if not w.empty else [],
            "tool": "atlas.crosslink"}
37
+
38
+
39
def writer_axes(family: str) -> dict:
    """Measured axes for a writer family (cargo, deliverability, reachability, readiness).

    The atlas parquet is read on every call. The reported row is the first ``curated_core``
    entry for the family, or the family's first row when it has none.

    Args:
        family: Writer family name, matched exactly against the atlas ``family`` column.

    Returns:
        A not-found dict when the family has no rows. Otherwise the number of systems plus
        ``reachability_tier``, ``cargo_capacity_bp`` and ``deliv_class``. Missing cells are
        reported as ``None`` so the result stays valid JSON (a NaN would serialise as ``NaN``).
    """
    import pandas as pd

    from pen_stack.rag.index import _ATLAS

    def _none_if_missing(value):
        """Return None for a missing scalar cell (None/NaN/NA); pass anything else through."""
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return None
        return value

    atlas = pd.read_parquet(_ATLAS)
    sub = atlas[atlas["family"] == family]
    if sub.empty:
        return {"family": family, "found": False, "tool": "atlas.score"}
    core = sub[sub["entry_kind"] == "curated_core"]
    r = core.iloc[0] if len(core) else sub.iloc[0]
    return {"family": family, "found": True, "n_systems": int(len(sub)),
            "reachability_tier": _none_if_missing(r.get("reachability_tier")),
            "cargo_capacity_bp": (int(r["cargo_capacity_bp"]) if pd.notna(r.get("cargo_capacity_bp")) else None),
            "deliv_class": _none_if_missing(r.get("deliv_class")), "tool": "atlas.score"}
54
+
55
+
56
def plan_write(gene: str, intent: str, cargo_bp: int = 2000, ct: str = "k562") -> dict:
    """Run the Write Planner for ``gene``/``intent`` and return its top plan, tagged with the tool.

    ``intent`` is converted with ``EditIntent(intent)``. When the planner returns nothing, a
    not-found dict is returned instead.
    """
    from pen_stack.planner.optimize import EditIntent
    from pen_stack.planner.pipeline import plan_write as _pw

    ranked = _pw(gene, EditIntent(intent), cargo_bp, ct, k=1)
    if ranked:
        best = ranked[0]
    else:
        best = {"gene": gene, "found": False}
    return best | {"tool": "planner.pipeline"}
62
+
63
+
64
def ask_literature(q: str) -> dict:
    """Answer ``q`` via the RAG QA module and return its answer and citations, tagged with the tool."""
    from pen_stack.rag.qa import answer

    reply = answer(q)
    out = {key: reply[key] for key in ("answer", "citations")}
    out["tool"] = "rag.qa"
    return out
69
+
70
+
71
def multiplex_translocation_risk(edits: list[dict]) -> dict:
    """Translocation-risk SCREEN for a multi-edit (2-5) plan: pairwise DSB-join risk across edits.

    Each edit: {name, family, chrom, pos, optional offtargets:[{chrom,pos,risk}]}. DSB-free recombinase
    writers contribute zero risk. A screen, not a calibrated predictor (WS-G1).

    The planner's result is returned with a ``tool`` provenance tag added.
    """
    from pen_stack.planner.multiplex import translocation_risk

    screened = dict(translocation_risk(edits))
    screened["tool"] = "planner.multiplex"
    return screened
78
+
79
+
80
# Tool name -> callable. Each tool is registered under its own function name, which is also the
# name the tool-calling schemas advertise.
REGISTRY = {
    tool.__name__: tool
    for tool in (
        writability,
        reachable_writers,
        writer_axes,
        plan_write,
        ask_literature,
        multiplex_translocation_risk,
    )
}
88
+
89
def _fn_schema(name: str, description: str, properties: dict, required: list) -> dict:
    """Wrap one tool description in the function-calling envelope used by every SCHEMAS entry."""
    return {"type": "function", "function": {
        "name": name, "description": description,
        "parameters": {"type": "object", "properties": properties, "required": required}}}


# Ollama/OpenAI tool-calling schemas
SCHEMAS = [
    _fn_schema(
        "writability", "Most-writable locus near a gene (safety x durability).",
        {"gene": {"type": "string"},
         "ct": {"type": "string", "enum": ["k562", "hepg2", "hspc"]}},
        ["gene"]),
    _fn_schema(
        "reachable_writers", "Writer families that can reach a gene's best locus.",
        {"gene": {"type": "string"}, "ct": {"type": "string"}},
        ["gene"]),
    _fn_schema(
        "writer_axes", "Measured axes for a writer family.",
        {"family": {"type": "string"}},
        ["family"]),
    _fn_schema(
        "plan_write", "Full Write Planner: gene + edit_intent -> ranked plan.",
        {"gene": {"type": "string"},
         "intent": {"type": "string", "enum": ["safe_harbour_insertion", "knock_in_with_disruption",
                                               "high_durability_insertion", "regulatory_excision",
                                               "repeat_excision"]},
         "cargo_bp": {"type": "integer"}, "ct": {"type": "string"}},
        ["gene", "intent"]),
    _fn_schema(
        "ask_literature", "Grounded, cited literature answer.",
        {"q": {"type": "string"}},
        ["q"]),
    _fn_schema(
        "multiplex_translocation_risk",
        "Translocation-risk screen for a multi-edit plan (pairwise DSB-join risk).",
        {"edits": {"type": "array", "items": {"type": "object", "properties": {
            "name": {"type": "string"}, "family": {"type": "string"},
            "chrom": {"type": "string"}, "pos": {"type": "integer"}}}}},
        ["edits"]),
]
124
+
125
+
126
def dispatch(name: str, args: dict) -> Any:
    """Look ``name`` up in REGISTRY and call it with ``args`` as keyword arguments.

    Raises:
        KeyError: ``name`` is not a registered tool.
    """
    try:
        tool = REGISTRY[name]
    except KeyError:
        raise KeyError(f"unknown tool: {name}") from None
    return tool(**args)
@@ -0,0 +1 @@
1
+ """pen_stack.atlas - see PEN-STACK v3.0 program doc."""
@@ -0,0 +1,80 @@
1
+ """Build the Writer-Targeting Knowledge Base (WT-KB) - Phase 0, Step 0.2.
2
+
3
+ Reads the curated YAML (one block per writer family), validates every row against the
4
+ ``WriterEntry`` schema (which enforces the sourcing rule: >=1 DOI per row), and emits both a
5
+ parquet (for the pipeline) and a human-readable markdown table (for literature cross-check).
6
+
7
+ Usage:
8
+ python -m pen_stack.atlas.build_wtkb --curated configs/wtkb_curated.yaml \
9
+ --out pen_stack/atlas/wtkb.parquet --md docs/wtkb.md
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ from pathlib import Path
15
+
16
+ import pandas as pd
17
+ import yaml
18
+
19
+ from pen_stack.atlas.schema import WriterEntry
20
+
21
+
22
def build(curated_yaml: str, out_parquet: str | None = None, out_md: str | None = None) -> pd.DataFrame:
    """Validate the curated YAML and build the WT-KB table, optionally writing parquet and markdown.

    Args:
        curated_yaml: Path to the curated YAML file, a mapping of key -> writer block.
        out_parquet: If given, the table is written here as parquet (parent dirs are created).
        out_md: If given, the markdown rendering is written here (parent dirs are created).

    Returns:
        One row per YAML block, holding the validated ``WriterEntry`` fields with the block key
        in a leading ``_key`` column.

    Raises:
        ValueError: The YAML is empty or its top level is not a mapping. Previously this
            surfaced as an opaque AttributeError/KeyError.
        Exception: Whatever ``WriterEntry(**block)`` raises for an invalid block.
    """
    curated = yaml.safe_load(Path(curated_yaml).read_text(encoding="utf-8"))
    # safe_load returns None for an empty document; an empty mapping has nothing to build either.
    if not isinstance(curated, dict) or not curated:
        raise ValueError(f"{curated_yaml}: expected a non-empty mapping of writer families")

    rows = []
    for key, block in curated.items():
        entry = WriterEntry(**block)  # validates (raises on missing DOI / bad enum)
        d = entry.model_dump()
        d["_key"] = key
        rows.append(d)
    df = pd.DataFrame(rows)
    # stable column order, _key first
    cols = ["_key"] + [c for c in df.columns if c != "_key"]
    df = df[cols]
    if out_parquet:
        Path(out_parquet).parent.mkdir(parents=True, exist_ok=True)
        df.to_parquet(out_parquet, index=False)
    if out_md:
        Path(out_md).parent.mkdir(parents=True, exist_ok=True)
        Path(out_md).write_text(_to_markdown(df), encoding="utf-8")
    return df
41
+
42
+
43
def _to_markdown(df: pd.DataFrame) -> str:
    """Render the WT-KB table as markdown: a summary table, then per-family reachability notes.

    The output always ends with a newline. ``key_dois`` cells are joined with ``"; "``.
    """
    header = [
        "# Writer-Targeting Knowledge Base (WT-KB)",
        "",
        f"_Generated from `configs/wtkb_curated.yaml` - {len(df)} writer families. "
        "Every row is schema-validated and carries >=1 DOI (sourcing rule)._",
        "",
        "| Family | Representative | Mechanism | Modality | Target site | Tier | Confidence | DOIs |",
        "|---|---|---|---|---|---|---|---|",
    ]
    table_columns = ("family", "representative_system", "mechanism_bucket", "targeting_modality",
                     "target_site_spec", "reachability_tier", "confidence")

    table_rows: list[str] = []
    constraint_items: list[str] = []
    for _, rec in df.iterrows():
        cells = [f"{rec[col]}" for col in table_columns]
        cells.append("; ".join(rec["key_dois"]))
        table_rows.append("| " + " | ".join(cells) + " |")
        constraint_items.append(
            f"- **{rec['family']}** ({rec['reachability_tier']}): {rec['reachability_constraints']}")

    section = ["", "## Reachability constraints (per family)", ""]
    return "\n".join(header + table_rows + section + constraint_items) + "\n"
64
+
65
+
66
def main() -> None:
    """CLI entry point: build the WT-KB from the given paths and print a short summary."""
    parser = argparse.ArgumentParser()
    for flag, default in (("--curated", "configs/wtkb_curated.yaml"),
                          ("--out", "pen_stack/atlas/wtkb.parquet"),
                          ("--md", "docs/wtkb.md")):
        parser.add_argument(flag, default=default)
    opts = parser.parse_args()

    table = build(opts.curated, opts.out, opts.md)
    n_families = len(table)
    print(f"WT-KB built: {n_families} families -> {opts.out}")
    tiers = table["reachability_tier"].value_counts().to_dict()
    print(f"tiers: {tiers}")
    print(f"fully-specified families: {n_families} (target >=6)")


if __name__ == "__main__":
    main()
@@ -0,0 +1,144 @@
1
+ """Cross-link the Writer Atlas <-> the Writable Genome (Phase 2, Step 2.5).
2
+
3
+ The integration that makes PEN-STACK more than a catalogue: bidirectional queries between writers (the
4
+ Phase-2 atlas, 33k systems by family) and loci (the Phase-1 Writable Genome, 3M bins x cell type with a
5
+ ``reachable_tier1`` annotation + a decomposable ``writability`` score).
6
+
7
+ - ``loci_for_writer(family, ct)`` -> loci that family can reach, ranked by writability.
8
+ - ``writers_for_locus(chrom, bin)`` -> atlas systems whose family reaches that locus, with readiness.
9
+ - ``loci_for_gene(gene, ct)`` -> writable bins overlapping a gene (forward query helper).
10
+
11
+ Honest scope (Phase-1 D1.8-1): reachability is released at the *locus* level - the Tier-1
12
+ reprogrammable families (bridge_IS110 / Cas9 / Cas12a) are near-universal at 1 kb, so the cross-link's
13
+ discriminating signal is the *writability ranking* and the *family -> atlas-system* join (each carrying
14
+ therapeutic readiness). Per-site reachability (does a specific bridge core exist here?) is Planner work.
15
+
16
+ Inputs : Phase-1 atlas_<ct>.parquet (chrom, bin, safety, p_durable, reachable_tier1, writability),
17
+ the Phase-2 atlas.parquet, gene_coords.parquet.
18
+ Outputs: out/crosslink_cache_<ct>.parquet (per-family reachable-loci summary).
19
+ """
20
+ from __future__ import annotations
21
+
22
+ import os
23
+ from functools import lru_cache
24
+ from pathlib import Path
25
+
26
+ import pandas as pd
27
+
28
+ _ROOT = Path(__file__).resolve().parents[2]
29
+ _FINAL = _ROOT.parent # Final_Part_v3.0/
30
+ _ATLAS = _ROOT / "pen_stack" / "atlas" / "atlas.parquet"
31
+ _OUT = _ROOT / "out"
32
+ BIN_BP = 1000
33
+
34
# The Phase-1 writability atlas is fetched rather than committed, so it may sit in one of several
# directories; the first one containing the file wins. PEN_ATLAS_DIR (also used by the UI) is
# tried first, so the Write Planner, the agent and the RAG numeric route all resolve the same
# atlas as the UI in any deployment.
def _writability_search() -> list[Path]:
    """Return the candidate atlas directories in priority order."""
    defaults = [_ROOT / "data" / "out", _FINAL / "phase_1" / "out"]
    override = os.environ.get("PEN_ATLAS_DIR")
    if not override:
        return defaults
    return [Path(override), *defaults]
44
+
45
+
46
def writability_path(ct: str) -> Path:
    """Return the first existing ``atlas_<ct>.parquet`` among the search directories.

    Raises:
        FileNotFoundError: No search directory contains the file.
    """
    candidates = _writability_search()
    filename = f"atlas_{ct}.parquet"
    for directory in candidates:
        candidate = directory / filename
        if candidate.exists():
            return candidate
    searched = [str(directory) for directory in candidates]
    raise FileNotFoundError(f"{filename} not found in {searched}")
53
+
54
+
55
@lru_cache(maxsize=4)
def load_writability(ct: str) -> pd.DataFrame:
    """Load the Phase-1 atlas for ``ct`` and add a ``_reach`` column of family lists.

    ``_reach`` is ``reachable_tier1`` split on ``";"`` (missing values are treated as ``""``).
    Results are cached per cell type, so callers share the returned frame.
    """
    frame = pd.read_parquet(writability_path(ct))
    tier1 = frame["reachable_tier1"].fillna("")
    frame["_reach"] = tier1.str.split(";")
    return frame
60
+
61
+
62
@lru_cache(maxsize=1)
def load_writer_atlas() -> pd.DataFrame:
    """Read the writer atlas parquet at ``_ATLAS``; the frame is cached after the first call."""
    atlas = pd.read_parquet(_ATLAS)
    return atlas
65
+
66
+
67
def reachable_families(ct: str) -> set[str]:
    """Collect every family name that appears in ``reachable_tier1`` for this cell type.

    Missing annotations are skipped, and so are empty fragments produced by the ``";"`` split.
    """
    annotations = load_writability(ct)["reachable_tier1"].dropna().unique()
    return {fam for entry in annotations for fam in str(entry).split(";") if fam}
74
+
75
+
76
def loci_for_writer(family: str, ct: str = "k562", top: int = 20) -> pd.DataFrame:
    """Return the ``top`` highest-writability bins whose ``_reach`` list contains ``family``.

    A ``chrom_start`` column (``bin * BIN_BP``) is added to the selection.
    """
    frame = load_writability(ct)

    def _reaches(fams) -> bool:
        return family in fams

    selected = frame.loc[frame["_reach"].apply(_reaches)]
    best = selected.nlargest(top, "writability").copy()
    best["chrom_start"] = best["bin"] * BIN_BP
    columns = ["chrom", "bin", "chrom_start", "safety", "p_durable", "writability", "reachable_tier1"]
    return best[columns]
83
+
84
+
85
def writers_for_locus(chrom: str, bin_idx: int, ct: str = "k562") -> pd.DataFrame:
    """Atlas systems whose family reaches a locus, with therapeutic readiness (if scored).

    Args:
        chrom: Chromosome name as stored in the writability atlas.
        bin_idx: Bin index on that chromosome.
        ct: Cell type key.

    Returns:
        An empty, column-less frame when the locus is not in the atlas. Otherwise the atlas rows
        whose ``family`` is listed in the locus's ``reachable_tier1``, restricted to the
        descriptive columns present in the atlas, plus a ``locus_writability`` column. A locus
        with no ``reachable_tier1`` annotation yields zero rows.
    """
    df = load_writability(ct)
    row = df[(df["chrom"] == chrom) & (df["bin"] == bin_idx)]
    if row.empty:
        return pd.DataFrame()
    locus = row.iloc[0]
    tier1 = locus["reachable_tier1"]
    # A missing annotation must not be stringified into a bogus family named "nan"/"None".
    fams = {x for x in str(tier1).split(";") if x} if pd.notna(tier1) else set()
    atlas = load_writer_atlas()
    cols = [c for c in ["representative_system", "family", "confidence", "deliv_class",
                        "readiness", "cargo_capacity_bp", "reachability_tier"] if c in atlas.columns]
    out = atlas[atlas["family"].isin(fams)][cols].copy()
    out["locus_writability"] = float(locus["writability"])
    return out
98
+
99
+
100
def loci_for_gene(gene: str, ct: str = "k562", gene_coords: str | Path | None = None) -> pd.DataFrame:
    """Return the atlas bins spanning a gene's ``start``..``end``, highest writability first.

    Gene coordinates come from ``gene_coords`` when given, otherwise from the planner's default
    path. The first coordinate row matching ``gene`` is used. An empty, column-less frame is
    returned when the gene is unknown.
    """
    if gene_coords:
        coords_file = Path(gene_coords)
    else:
        from pen_stack.planner.optimize import gene_coords_path
        coords_file = gene_coords_path()

    coords = pd.read_parquet(coords_file)
    matches = coords[coords["gene"] == gene]
    if matches.empty:
        return pd.DataFrame()

    record = matches.iloc[0]
    frame = load_writability(ct)
    first_bin = int(record["start"]) // BIN_BP
    last_bin = int(record["end"]) // BIN_BP
    on_chrom = frame["chrom"] == record["chrom"]
    in_span = frame["bin"].between(first_bin, last_bin)
    overlapping = frame[on_chrom & in_span]
    return overlapping.sort_values("writability", ascending=False)
116
+
117
+
118
def build_crosslink_cache(ct: str = "k562", out: str | Path | None = None) -> pd.DataFrame:
    """Summarise, per reachable family, its loci count, median writability and top bin.

    The summary is written as parquet to ``out`` (default ``<_OUT>/crosslink_cache_<ct>.parquet``,
    parent directories created) and also returned.
    """
    frame = load_writability(ct)

    def _summarise(fam: str) -> dict:
        reached = frame[frame["_reach"].apply(lambda fams: fam in fams)]
        best = reached.nlargest(1, "writability")
        return {
            "family": fam, "cell_type": ct, "n_reachable_loci": len(reached),
            "median_writability": round(float(reached["writability"].median()), 4),
            "top_chrom": best.iloc[0]["chrom"], "top_bin": int(best.iloc[0]["bin"]),
            "top_writability": round(float(best.iloc[0]["writability"]), 4),
        }

    cache = pd.DataFrame([_summarise(fam) for fam in sorted(reachable_families(ct))])
    out = Path(out) if out else _OUT / f"crosslink_cache_{ct}.parquet"
    out.parent.mkdir(parents=True, exist_ok=True)
    cache.to_parquet(out, index=False)
    return cache
136
+
137
+
138
if __name__ == "__main__":  # pragma: no cover
    # Build the cache for each known cell type; one failing cell type does not stop the others.
    for ct in ("k562", "hepg2", "hspc"):
        try:
            c = build_crosslink_cache(ct)
        except Exception as e:  # noqa: BLE001 - a missing/partial cell-type atlas is non-fatal
            print(f"[{ct}] skip: {e}")
        else:
            print(f"[{ct}] crosslink cache:\n{c.to_string(index=False)}\n")