pen-stack 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. pen_stack/__init__.py +2 -0
  2. pen_stack/_resources.py +34 -0
  3. pen_stack/adapt/__init__.py +14 -0
  4. pen_stack/adapt/finetune.py +33 -0
  5. pen_stack/adapt/ingest.py +86 -0
  6. pen_stack/adapt/pipeline.py +101 -0
  7. pen_stack/adapt/recalibrate.py +58 -0
  8. pen_stack/adapt/report.py +130 -0
  9. pen_stack/agent/__init__.py +1 -0
  10. pen_stack/agent/guardrails.py +49 -0
  11. pen_stack/agent/mcp_server.py +42 -0
  12. pen_stack/agent/orchestrator.py +106 -0
  13. pen_stack/agent/pen_agent.py +169 -0
  14. pen_stack/agent/tools.py +130 -0
  15. pen_stack/atlas/__init__.py +1 -0
  16. pen_stack/atlas/build_wtkb.py +80 -0
  17. pen_stack/atlas/crosslink.py +144 -0
  18. pen_stack/atlas/expand.py +190 -0
  19. pen_stack/atlas/schema.py +59 -0
  20. pen_stack/atlas/scorecard.py +134 -0
  21. pen_stack/atlas/universe.py +75 -0
  22. pen_stack/atlas/variant_propose.py +155 -0
  23. pen_stack/bridge/__init__.py +1 -0
  24. pen_stack/bridge/activity.py +52 -0
  25. pen_stack/bridge/cli.py +65 -0
  26. pen_stack/bridge/fold_qc.py +53 -0
  27. pen_stack/bridge/guide_qc.py +84 -0
  28. pen_stack/bridge/ingest.py +139 -0
  29. pen_stack/bridge/offtarget.py +133 -0
  30. pen_stack/bridge/ortholog_screen.py +73 -0
  31. pen_stack/bridge/pipeline.py +83 -0
  32. pen_stack/cli.py +126 -0
  33. pen_stack/data/__init__.py +1 -0
  34. pen_stack/data/encode.py +84 -0
  35. pen_stack/data/genome.py +71 -0
  36. pen_stack/data/ingest_chromatin.py +119 -0
  37. pen_stack/data/ingest_integration.py +112 -0
  38. pen_stack/data/ingest_safety_annot.py +164 -0
  39. pen_stack/data/ingest_trip.py +76 -0
  40. pen_stack/mech/__init__.py +1 -0
  41. pen_stack/mech/classify_atlas.py +71 -0
  42. pen_stack/mech/whitelist.py +66 -0
  43. pen_stack/monitor/__init__.py +1 -0
  44. pen_stack/monitor/europepmc.py +32 -0
  45. pen_stack/monitor/run.py +57 -0
  46. pen_stack/monitor/triage.py +63 -0
  47. pen_stack/planner/__init__.py +1 -0
  48. pen_stack/planner/cargo.py +56 -0
  49. pen_stack/planner/cargo_polish.py +146 -0
  50. pen_stack/planner/delivery.py +32 -0
  51. pen_stack/planner/multiplex.py +110 -0
  52. pen_stack/planner/optimize.py +156 -0
  53. pen_stack/planner/pipeline.py +86 -0
  54. pen_stack/planner/report.py +26 -0
  55. pen_stack/rag/__init__.py +1 -0
  56. pen_stack/rag/index.py +53 -0
  57. pen_stack/rag/llm.py +178 -0
  58. pen_stack/rag/qa.py +105 -0
  59. pen_stack/score/__init__.py +1 -0
  60. pen_stack/score/recalibrate.py +77 -0
  61. pen_stack/score/therapeutic.py +85 -0
  62. pen_stack/server/__init__.py +1 -0
  63. pen_stack/server/api.py +142 -0
  64. pen_stack/ui/__init__.py +1 -0
  65. pen_stack/ui/app.py +518 -0
  66. pen_stack/validate/__init__.py +1 -0
  67. pen_stack/validate/adapt_demo.py +69 -0
  68. pen_stack/validate/agent_eval.py +117 -0
  69. pen_stack/validate/blind_gsh_discovery.py +165 -0
  70. pen_stack/validate/cargo_directionality.py +57 -0
  71. pen_stack/validate/durability_baselines.py +150 -0
  72. pen_stack/validate/forward_hypotheses.py +104 -0
  73. pen_stack/validate/guide_qc_demo.py +58 -0
  74. pen_stack/validate/intent_specification.py +82 -0
  75. pen_stack/validate/paper3_benchmark.py +165 -0
  76. pen_stack/validate/paper4_real_validation.py +144 -0
  77. pen_stack/validate/paper4_validation.py +82 -0
  78. pen_stack/validate/seq_vs_measured.py +134 -0
  79. pen_stack/validate/within_locus_ranking.py +74 -0
  80. pen_stack/validate/writer_recovery.py +86 -0
  81. pen_stack/wgenome/__init__.py +1 -0
  82. pen_stack/wgenome/chromatin_seq.py +83 -0
  83. pen_stack/wgenome/durability.py +108 -0
  84. pen_stack/wgenome/export_tracks.py +52 -0
  85. pen_stack/wgenome/features.py +82 -0
  86. pen_stack/wgenome/gsh_baseline.py +117 -0
  87. pen_stack/wgenome/providers.py +245 -0
  88. pen_stack/wgenome/safety.py +69 -0
  89. pen_stack/wgenome/structure3d.py +168 -0
  90. pen_stack/wgenome/writability.py +72 -0
  91. pen_stack-3.1.0.dist-info/METADATA +451 -0
  92. pen_stack-3.1.0.dist-info/RECORD +96 -0
  93. pen_stack-3.1.0.dist-info/WHEEL +5 -0
  94. pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
  95. pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
  96. pen_stack-3.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,110 @@
1
+ """Multiplex translocation-risk flag (v3.1, WS-G1).
2
+
3
+ For a multi-edit plan (2-5 edits), two simultaneous double-strand breaks (DSBs) at different loci can
4
+ mis-join into a TRANSLOCATION. This is a classical, interpretable SCREEN - not a calibrated translocation
5
+ predictor. We gather every edit's DSB sites (on-target + predicted off-targets, each with a cut probability),
6
+ enumerate all site PAIRS exactly (cheap for 2-5 edits), and combine pairwise DSB-join probabilities into a
7
+ `translocation_risk` in [0,1].
8
+
9
+ Key honest property: **DSB-free writers (bridge / seek recombinases) contribute NO cut sites**, so a plan
10
+ built from them carries ~zero translocation risk - which is the whole point of programmable recombinases.
11
+ The flag is monotonic (more sites / higher cut prob / closer pairs -> higher risk) and reports its top pairs
12
+ so a user can see WHY. A QUBO formulation is provided as a documented OPTIONAL baseline, off by default.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import math
17
+ from itertools import combinations
18
+
19
+ # writer families that cut DNA (DSB) vs DSB-free programmable recombinases / writers.
20
+ _DSB_FREE = {"bridge_is110", "bridge_iscro4", "seek_is1111", "bridge", "seek", "pe_integrase",
21
+ "prime_editor", "recombinase"}
22
+ _DEFAULT_ON_TARGET_CUT = 0.8 # nominal on-target cut efficiency for a DSB nuclease (documented prior)
23
+ _INTRA_CHROM_LENGTH = 1.0e7 # bp decay length for intra-chromosomal join propensity (10 Mb)
24
+
25
+
26
def is_dsb_free(family: str | None) -> bool:
    """Return True when `family` names a writer listed in the DSB-free set.

    The lookup is case-insensitive; a falsy `family` (e.g. ``None``) is treated as the empty name.
    """
    normalized = str(family or "").lower()
    return normalized in _DSB_FREE
28
+
29
+
30
def cut_sites(edit: dict) -> list[dict]:
    """Collect the DSB sites contributed by one edit.

    DSB-free writers yield ``[]``. Otherwise the on-target site is emitted (when both ``chrom`` and
    ``pos`` are present), followed by every off-target that has a positive cut probability and a
    location.

    `edit` keys: family, chrom, pos (on-target); optional on_target_cut; optional name; optional
    offtargets=[{chrom, pos, p_cut|risk}]. An off-target ``risk`` is used directly as a cut probability
    when ``p_cut`` is absent or ``None``.

    Every returned ``p_cut`` is clamped to [0, 1] so that downstream products of probabilities stay
    inside [0, 1].
    """
    if is_dsb_free(edit.get("family")):
        return []

    label = edit.get("name")
    sites: list[dict] = []

    if edit.get("chrom") is not None and edit.get("pos") is not None:
        on_cut = edit.get("on_target_cut")
        if on_cut is None:
            # Key absent OR explicitly None -> documented nominal prior.
            on_cut = _DEFAULT_ON_TARGET_CUT
        sites.append({"chrom": edit["chrom"], "pos": int(edit["pos"]),
                      # clamp like the off-target branch, so a mis-specified value cannot push risk outside [0,1]
                      "p_cut": min(1.0, max(0.0, float(on_cut))),
                      "kind": "on_target", "edit": label})

    for ot in edit.get("offtargets") or []:
        raw = ot.get("p_cut")
        if raw is None:
            raw = ot.get("risk")
        p = float(raw) if raw is not None else 0.0
        if p > 0 and ot.get("chrom") is not None and ot.get("pos") is not None:
            sites.append({"chrom": ot["chrom"], "pos": int(ot["pos"]), "p_cut": min(1.0, p),
                          "kind": "off_target", "edit": label})
    return sites
49
+
50
+
51
def _join_factor(a: dict, b: dict) -> float:
    """Mis-join propensity for two DSB sites.

    Sites on different chromosomes get 1.0; sites on the same chromosome get an exponential decay in
    their absolute distance, with `_INTRA_CHROM_LENGTH` as the decay length.
    """
    return (
        1.0
        if a["chrom"] != b["chrom"]
        else math.exp(-abs(a["pos"] - b["pos"]) / _INTRA_CHROM_LENGTH)
    )
57
+
58
+
59
def pairwise_risks(sites: list[dict]) -> list[dict]:
    """Score every unordered pair of sites and return the rows sorted by descending join probability.

    Each row holds a readable label for both sites, whether the pair spans two chromosomes, and
    ``join_prob`` = p_cut(a) * p_cut(b) * join factor, rounded to 5 decimals.
    """

    def _label(site: dict) -> str:
        return f"{site['edit']}:{site['kind']}@{site['chrom']}:{site['pos']}"

    rows = [
        {"a": _label(first),
         "b": _label(second),
         "inter_chromosomal": first["chrom"] != second["chrom"],
         "join_prob": round(first["p_cut"] * second["p_cut"] * _join_factor(first, second), 5)}
        for first, second in combinations(sites, 2)
    ]
    # list.sort is stable, so tied pairs keep their enumeration order.
    rows.sort(key=lambda row: row["join_prob"], reverse=True)
    return rows
69
+
70
+
71
def translocation_risk(edits: list[dict], low: float = 0.05, moderate: float = 0.2,
                       top_k: int = 5) -> dict:
    """Aggregate translocation-risk flag for a multi-edit plan.

    risk = 1 - prod(1 - pairwise_join_prob) over all site pairs, rounded to 5 decimals, then banded as
    "low" (< `low`), "moderate" (< `moderate`) or "high". The `top_k` highest-probability pairs are
    returned so the flag can be inspected. A SCREEN, not a calibrated predictor.
    """
    # The screen still runs outside 2-5 edits; the note only records that the plan is out of scope.
    note = None
    if not 2 <= len(edits) <= 5:
        note = "translocation risk is defined for multiplex plans (2-5 simultaneous edits)"

    sites = []
    for edit in edits:
        sites.extend(cut_sites(edit))
    pairs = pairwise_risks(sites)

    # Probability that no pair mis-joins, treating the pairs as independent events.
    no_join = 1.0
    for pair in pairs:
        no_join *= (1.0 - pair["join_prob"])
    risk = round(1.0 - no_join, 5)

    if risk < low:
        band = "low"
    elif risk < moderate:
        band = "moderate"
    else:
        band = "high"

    dsb_free_count = sum(1 for edit in edits if is_dsb_free(edit.get("family")))
    return {
        "translocation_risk": risk,
        "band": band,
        "n_edits": len(edits),
        "n_cut_sites": len(sites),
        "n_pairs": len(pairs),
        "n_dsb_free_edits": dsb_free_count,
        "all_dsb_free": dsb_free_count == len(edits),
        "top_pairs": pairs[:top_k],
        "note": note,
        "scope": "classical pairwise DSB-join SCREEN, not a calibrated translocation predictor; "
                 "DSB-free recombinase plans carry ~zero risk by construction",
    }
99
+
100
+
101
def qubo_baseline(edits: list[dict], variants_per_edit: dict[str, list[dict]] | None = None) -> dict:
    """OPTIONAL, OFF BY DEFAULT - placeholder descriptor for a QUBO baseline.

    No Q-matrix is built and no solver is invoked here: the function only reports that the QUBO path
    is disabled, together with the number of variant sets it was handed. `edits` is accepted for
    signature compatibility and is not read.
    """
    variant_sets = variants_per_edit or {}
    summary: dict = {"enabled": False, "kind": "QUBO (optional baseline)"}
    summary["note"] = ("exact pairwise enumeration is tractable and exact for 2-5 edits; the QUBO path is an "
                       "optional baseline for large multiplex selection problems and is off by default.")
    summary["n_variant_sets"] = len(variant_sets)
    return summary
@@ -0,0 +1,156 @@
1
+ """Inverse-design optimiser with edit_intent (Phase 3, Step 3.1).
2
+
3
+ Given a goal (gene/locus, edit_intent, cargo, cell type), search destination x writer for the joint
4
+ optimum of safety x durability x reachability x writer-activity, conditioned on an explicit
5
+ ``edit_intent``. The intent is *load-bearing*: its ``target_gene_sign`` decides whether hitting the
6
+ named target gene/element is penalised (safe-harbour: avoid) or rewarded (knock-in / excision: intended)
7
+ - so the same locus ranks high or low depending only on the stated goal.
8
+
9
+ Components are retained on every candidate row; the score is a transparent linear combination read from
10
+ ``configs/intent_weights.yaml``. Reachability is a hard filter (Tier-1 high-confidence; Tier-2 candidate
11
+ flagged). Writer activity comes from the Phase-2 Writer Atlas (measured human-cell axis per family).
12
+
13
+ Inputs : Phase-1 writability atlas (safety/p_durable/reachable_tier1) + Phase-2 atlas.parquet.
14
+ Outputs: ranked (writer, site) candidates with full component provenance.
15
+ """
16
+ from __future__ import annotations
17
+
18
+ from enum import Enum
19
+ from functools import lru_cache
20
+ from pathlib import Path
21
+
22
+ import pandas as pd
23
+ import yaml
24
+
25
+ _ROOT = Path(__file__).resolve().parents[2]
26
+ _CFG = _ROOT / "configs" / "intent_weights.yaml"
27
+ _ATLAS = _ROOT / "pen_stack" / "atlas" / "atlas.parquet"
28
+ BIN_BP = 1000
29
+
30
+
31
class EditIntent(str, Enum):
    """Closed set of edit goals accepted by the optimiser.

    Members are also plain strings; each value is the key looked up under ``intents`` in the
    intent-weights config.
    """

    # insertion intents
    SAFE_HARBOUR = "safe_harbour_insertion"
    KNOCK_IN_DISRUPT = "knock_in_with_disruption"
    HIGH_DURABILITY = "high_durability_insertion"
    # excision intents
    REG_EXCISION = "regulatory_excision"
    REPEAT_EXCISION = "repeat_excision"
37
+
38
+
39
@lru_cache(maxsize=1)
def load_intent_weights(path: str | Path = _CFG) -> dict:
    """Parse the intent-weights YAML at `path` (UTF-8) and return it; the latest result is memoised."""
    cfg_text = Path(path).read_text(encoding="utf-8")
    return yaml.safe_load(cfg_text)
42
+
43
+
44
@lru_cache(maxsize=1)
def writer_activity_by_family(atlas_path: str | Path = _ATLAS) -> dict:
    """Map each writer family to an activity proxy read from the Writer Atlas.

    Only ``curated_core`` rows are considered when the atlas has an ``entry_kind`` column. For each
    family the first row's ``S_HumanCell`` is used; if that is missing, ``readiness`` is used; if that
    is missing too, the value defaults to 0.5.
    """
    atlas = pd.read_parquet(atlas_path)
    if "entry_kind" in atlas:
        core = atlas[atlas["entry_kind"] == "curated_core"]
    else:
        core = atlas

    activity = {}
    for family, rows in core.groupby("family"):
        first = rows.iloc[0]
        value = first.get("S_HumanCell")
        if value is None or pd.isna(value):
            value = first.get("readiness", 0.5)
        activity[family] = float(value) if pd.notna(value) else 0.5
    return activity
61
+
62
+
63
def _best_writer(reachable_tier1: str, cargo_bp: int, atlas_caps: dict, activity: dict) -> tuple[str, float, bool]:
    """Pick the best reachable writer that fits the cargo: (family, activity, cargo_ok).

    `reachable_tier1` is a ';'-separated list of family names. Families whose capacity in `atlas_caps`
    is unknown are treated as fitting; unknown activity defaults to 0.4. Cargo-fitting writers are
    preferred, and among equals the highest activity wins (the earliest family wins ties).

    A missing `reachable_tier1` (None / NaN / NA) means "no reachable writer" and yields
    ("unknown", 0.4, False), rather than being stringified into a fake family such as "nan".
    """
    if reachable_tier1 is None or (not isinstance(reachable_tier1, str) and pd.isna(reachable_tier1)):
        fams = []
    else:
        fams = [f for f in str(reachable_tier1).split(";") if f]

    best, best_act, best_ok = None, -1.0, False
    for f in fams:
        cap = atlas_caps.get(f)
        ok = (cap is None) or (cargo_bp <= cap)
        a = activity.get(f, 0.4)
        # Lexicographic rank: cargo fit first, then activity.
        if (1 if ok else 0, a) > (1 if best_ok else 0, best_act):
            best, best_act, best_ok = f, a, ok

    if best is None:
        # Nothing selected: fall back to the first listed family, or "unknown" when none is listed.
        return (fams[0] if fams else "unknown"), 0.4, best_ok
    return best, best_act, best_ok
76
+
77
+
78
def score_candidates(cands: pd.DataFrame, intent: EditIntent | str, cargo_bp: int) -> pd.DataFrame:
    """Score and rank a candidate DataFrame (needs: safety, p_durable, reachable_tier1; optional on_target).

    Returns a sorted copy with added columns: writer (family), writer_activity, cargo_ok, score and
    intent. The input frame is not modified.
    """
    if not isinstance(intent, EditIntent):
        intent = EditIntent(intent)
    cfg = load_intent_weights()
    weights = cfg["intents"][intent.value]
    magnitude = float(cfg.get("on_target_magnitude", 1.0))

    # Per-family cargo capacity: the largest capacity recorded for the family in the atlas.
    atlas = pd.read_parquet(_ATLAS)
    with_capacity = atlas.dropna(subset=["cargo_capacity_bp"])
    caps = with_capacity.groupby("family")["cargo_capacity_bp"].max().to_dict()
    activity = writer_activity_by_family()

    out = cands.copy()
    picks = out["reachable_tier1"].apply(lambda rt: _best_writer(rt, cargo_bp, caps, activity))
    for position, column in enumerate(("writer", "writer_activity", "cargo_ok")):
        out[column] = [pick[position] for pick in picks]

    on_target = out.get("on_target", pd.Series(False, index=out.index)).astype(float)
    safety_term = weights["safety"] * out["safety"].astype(float)
    durability_term = weights["durability"] * out["p_durable"].astype(float)
    activity_term = weights["activity"] * out["writer_activity"].astype(float)
    base = safety_term + durability_term + activity_term
    # target_gene_sign: +1 -> penalise on-target (avoid the gene); -1 -> reward on-target (hit the gene)
    out["score"] = base - weights["target_gene_sign"] * magnitude * on_target
    # cargo that cannot be delivered by any reachable writer is penalised
    out.loc[~out["cargo_ok"], "score"] -= 0.5
    out["intent"] = intent.value

    # Deterministic ranking: stable sort, score descending, then positional tie-breakers ascending.
    tie_breakers = [c for c in ("chrom", "bin", "gene") if c in out.columns]
    sort_keys = ["score"] + tie_breakers
    directions = [False] + [True] * len(tie_breakers)
    ranked = out.sort_values(sort_keys, ascending=directions, kind="stable")
    return ranked.reset_index(drop=True)
112
+
113
+
114
def gene_coords_path() -> Path:
    """Return the first existing gene_coords.parquet: packaged copy, then the phase_1 copy.

    When neither exists, the packaged path is returned anyway (it is not checked for existence).
    """
    packaged = _ROOT / "data" / "curated" / "gene_coords.parquet"
    phase_1 = _ROOT.parent / "phase_1" / "app_data" / "gene_coords.parquet"
    return next((candidate for candidate in (packaged, phase_1) if candidate.exists()), packaged)
121
+
122
+
123
@lru_cache(maxsize=8)
def _gene_coords(path: str | None = None) -> pd.DataFrame:
    """Read the gene-coordinate table from `path`, or from `gene_coords_path()` when `path` is falsy."""
    source = Path(path) if path else gene_coords_path()
    return pd.read_parquet(source)
126
+
127
+
128
def gene_region(gene: str, flank_kb: int = 50) -> tuple[str, int, int] | None:
    """Return (chrom, start - flank, end + flank) for `gene`, or None when the gene is not listed.

    The first matching row is used; the lower bound is floored at 0.
    """
    coords = _gene_coords()
    hits = coords[coords["gene"] == gene]
    if hits.empty:
        return None
    first = hits.iloc[0]
    flank_bp = flank_kb * 1000
    lower = max(0, int(first["start"]) - flank_bp)
    upper = int(first["end"]) + flank_bp
    return first["chrom"], lower, upper
135
+
136
+
137
def plan(gene: str, intent: EditIntent | str, cargo_bp: int, writable_df: pd.DataFrame,
         k: int = 10, flank_kb: int = 50) -> pd.DataFrame:
    """Rank (writer, site) candidates near `gene` for the given intent and return the top `k` rows.

    Returns an empty DataFrame when the gene is unknown or no bin of `writable_df` falls inside the
    flanked gene window.
    """
    if not isinstance(intent, EditIntent):
        intent = EditIntent(intent)

    region = gene_region(gene, flank_kb)
    if region is None:
        return pd.DataFrame()
    chrom, lo, hi = region

    on_chrom = writable_df["chrom"] == chrom
    in_window = writable_df["bin"].between(lo // BIN_BP, hi // BIN_BP)
    sub = writable_df[on_chrom & in_window].copy()
    if sub.empty:
        return pd.DataFrame()

    # on_target = bin overlaps the gene body (not just the flank)
    coords = _gene_coords()
    gene_row = coords[coords["gene"] == gene].iloc[0]
    body_lo = int(gene_row["start"]) // BIN_BP
    body_hi = int(gene_row["end"]) // BIN_BP
    sub["on_target"] = sub["bin"].between(body_lo, body_hi)

    scored = score_candidates(sub, intent, cargo_bp)
    wanted = ("chrom", "bin", "writer", "safety", "p_durable", "writer_activity",
              "on_target", "cargo_ok", "reachable_tier1", "score", "intent")
    present = [col for col in wanted if col in scored.columns]
    return scored[present].head(k)
@@ -0,0 +1,86 @@
1
+ """End-to-end Write Planner (Phase 3, Step 3.4).
2
+
3
+ One call - ``plan_write(gene, intent, payload_bp, ct)`` - composes the inverse-design optimiser (3.1),
4
+ cargo/donor design (3.2), and delivery recommendation (3.3) into ranked, fully traceable plans. Every
5
+ numeric field is tagged with the module/dataset that produced it (provenance), so nothing is asserted
6
+ without a source. Heavy data (the Phase-1 writability atlas) is loaded lazily via the cross-link.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from functools import lru_cache
11
+ from pathlib import Path
12
+
13
+ import pandas as pd
14
+
15
+ from pen_stack.planner.cargo import design_cargo
16
+ from pen_stack.planner.delivery import recommend_delivery
17
+ from pen_stack.planner.optimize import EditIntent, plan
18
+
19
+ _ATLAS = Path(__file__).resolve().parents[1] / "atlas" / "atlas.parquet"
20
+ BIN_BP = 1000
21
+
22
+
23
@lru_cache(maxsize=1)
def _writer_meta() -> dict:
    """family -> {length_aa, cargo_capacity_bp, deliv_class, reachability_tier} from the Writer Atlas.

    Uses the first row per family, restricted to ``curated_core`` rows when the atlas has an
    ``entry_kind`` column. Missing numeric fields become None.
    """

    def _int_or_none(row, column):
        return int(row[column]) if pd.notna(row.get(column)) else None

    atlas = pd.read_parquet(_ATLAS)
    if "entry_kind" in atlas:
        core = atlas[atlas["entry_kind"] == "curated_core"]
    else:
        core = atlas

    meta = {}
    for family, rows in core.groupby("family"):
        first = rows.iloc[0]
        meta[family] = {
            "length_aa": _int_or_none(first, "length_aa"),
            "cargo_capacity_bp": _int_or_none(first, "cargo_capacity_bp"),
            "deliv_class": first.get("deliv_class"),
            "reachability_tier": first.get("reachability_tier"),
        }
    return meta
38
+
39
+
40
def plan_write(gene: str, intent: EditIntent | str, payload_bp: int, ct: str = "k562",
               k: int = 5, writable_df: pd.DataFrame | None = None) -> list[dict]:
    """Return ranked, traceable write plans for a goal. Each plan = site + writer + cargo + delivery.

    When `writable_df` is not supplied it is loaded for cell type `ct` through the atlas cross-link.
    An empty list is returned when the optimiser yields no candidate.
    """
    if writable_df is None:
        from pen_stack.atlas.crosslink import load_writability
        writable_df = load_writability(ct)

    cands = plan(gene, intent, payload_bp, writable_df, k=k)
    # plan() has already validated the intent, so resolving its value once here cannot raise anew.
    intent_value = intent.value if isinstance(intent, EditIntent) else EditIntent(intent).value
    meta = _writer_meta()
    provenance = {
        "safety": "wgenome.safety (LightGBM, COSMIC/DepMap/MLV)",
        "durability": "wgenome.durability (TRIP conditional chromatin model)",
        "writer_activity": "atlas.score.therapeutic (measured human-cell axis)",
        "reachability": "atlas.crosslink (Phase-1 reachable_tier1 + WT-KB tier)",
        "delivery": "planner.delivery (configs/delivery_rules.yaml)",
        "offtargets": "planner.cargo (bridge engine = Phase 1.5)",
    }

    plans = []
    for _, row in cands.iterrows():
        family = row["writer"]
        writer_info = meta.get(family, {})
        bin_index = int(row["bin"])
        site = (row["chrom"], bin_index * BIN_BP)
        writer_row = {"family": family,
                      "cargo_capacity_bp": writer_info.get("cargo_capacity_bp"),
                      "deliv_class": writer_info.get("deliv_class")}
        cargo = design_cargo(payload_bp, writer_row, site, ct)
        # Effector coding length in bp: 3 nt per amino acid; 0 when the length is unknown.
        effector_bp = (writer_info.get("length_aa") or 0) * 3
        delivery = recommend_delivery(effector_bp, payload_bp, ct)
        plans.append({
            "gene": gene,
            "intent": intent_value,
            "site": {"chrom": row["chrom"], "bin": bin_index, "pos": site[1]},
            "writer": family,
            "reachability_tier": writer_info.get("reachability_tier"),
            "safety": round(float(row["safety"]), 4),
            "durability": round(float(row["p_durable"]), 4),
            "writer_activity": round(float(row["writer_activity"]), 4),
            "on_target": bool(row["on_target"]),
            "score": round(float(row["score"]), 4),
            "cargo": cargo,
            "delivery": delivery,
            "provenance": dict(provenance),  # fresh copy per plan, as each plan owns its own dict
            "disclaimer": "Decision-support only; not a clinical directive. Tier-2/3 reachability is candidate.",
        })
    return plans
81
+
82
+
83
if __name__ == "__main__":  # pragma: no cover
    import json

    # Smoke run: print the top plan (truncated) for a fixed example goal.
    ps = plan_write("TRAC", EditIntent.KNOCK_IN_DISRUPT, 2000, "k562", k=3)
    if ps:
        print(json.dumps(ps[0], indent=2, default=str)[:1200])
    else:
        # plan_write returns [] when no candidate is found; avoid an IndexError on ps[0].
        print("No plan found for TRAC.")
@@ -0,0 +1,26 @@
1
+ """Human-readable Write Planner report (Phase 3, Step 3.4)."""
2
+ from __future__ import annotations
3
+
4
+
5
def render_plan(p: dict) -> str:
    """Render one write plan (as produced by the planner pipeline) as a multi-line text block.

    Reads the keys gene, intent, site{chrom,pos,bin}, on_target, writer, reachability_tier, safety,
    durability, writer_activity, score, cargo{payload_bp,assembled_bp,size_ok[,offtargets]},
    delivery{delivery,rationale} and disclaimer; a missing key raises KeyError.

    The off-target line is emitted only when the cargo carries an ``offtargets`` entry. A dict entry
    is summarised by its ``status`` (or shown whole when it has none); any other value is shown as-is
    instead of raising AttributeError.
    """
    s = p["site"]
    cargo = p["cargo"]
    lines = [
        f"Write plan for {p['gene']} (intent: {p['intent']})",
        f" Site : {s['chrom']}:{s['pos']:,} (bin {s['bin']}, on_target={p['on_target']})",
        f" Writer : {p['writer']} [reachability {p['reachability_tier']}]",
        f" Scores : safety {p['safety']} | durability {p['durability']} | "
        f"writer-activity {p['writer_activity']} | score {p['score']}",
        f" Cargo : payload {cargo['payload_bp']} bp -> assembled {cargo['assembled_bp']} bp "
        f"(size_ok={cargo['size_ok']}, codon-optimised, insulated)",
        f" Delivery : {p['delivery']['delivery']} ({p['delivery']['rationale']})",
    ]
    if "offtargets" in cargo:
        offtargets = cargo["offtargets"]
        status = offtargets.get("status", offtargets) if isinstance(offtargets, dict) else offtargets
        lines.append(f" Off-target : {status}")
    lines.append(f" Note : {p['disclaimer']}")
    return "\n".join(lines)
21
+
22
+
23
def render_plans(plans: list[dict]) -> str:
    """Render a ranked list of plans, separated by a 72-character rule; a fixed message when empty."""
    if not plans:
        return "No plan found (gene not in the atlas, or no reachable site)."
    divider = f"\n{'='*72}\n"
    sections = [f"[rank {rank}]\n{render_plan(entry)}" for rank, entry in enumerate(plans, start=1)]
    return divider.join(sections)
@@ -0,0 +1 @@
1
+ """pen_stack.rag - see PEN-STACK v3.0 program doc."""
pen_stack/rag/index.py ADDED
@@ -0,0 +1,53 @@
1
+ """Grounded document index for the PEN-STACK RAG (Phase 2, Step 2.8).
2
+
3
+ Builds a cited corpus of fact cards from the curated atlas + WT-KB (each card carries its source DOIs),
4
+ so retrieval-grounded answers always have a citation. If PaperQA + an LLM are available they can index a
5
+ literature corpus on top; the keyword retriever here is the dependency-light default that guarantees the
6
+ "every factual claim is cited" contract without any model.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ from pathlib import Path
12
+
13
+ import pandas as pd
14
+
15
+ _ATLAS = Path(__file__).resolve().parents[1] / "atlas" / "atlas.parquet"
16
+
17
+
18
@dataclass
class Card:
    """A single retrievable fact card: an identifier, its text, and the sources backing it."""

    key: str   # card identifier (build_cards uses the writer family name)
    text: str  # sentence(s) searched by the retriever
    citations: list[str] = field(default_factory=list)  # source identifiers; a fresh list per card
23
+
24
+
25
def build_cards(atlas_parquet: str | Path = _ATLAS) -> list[Card]:
    """Build one fact card per writer family from the atlas parquet.

    For each family the representative row is the first curated row (``entry_kind`` in
    {curated_core, curated_rep}), or the family's first row when it has no curated row or the atlas
    has no ``entry_kind`` column. Citations are the sorted, de-duplicated ``key_dois`` of the curated
    rows (or of all family rows when none is curated).

    ``key_dois`` cells may be a sequence of DOIs, a single DOI string, or missing (None / NaN / NA);
    missing cells contribute nothing and a bare string counts as one DOI.
    """
    df = pd.read_parquet(atlas_parquet)
    has_kind = "entry_kind" in df.columns
    cards: list[Card] = []
    for fam, sub in df.groupby("family"):
        if has_kind:
            core = sub[sub["entry_kind"].isin(["curated_core", "curated_rep"])]
        else:
            core = sub.iloc[0:0]
        pool = core if len(core) else sub
        rep = pool.iloc[0]

        dois: set[str] = set()
        for cell in pool["key_dois"]:
            if cell is None or cell is pd.NA or isinstance(cell, float):
                continue  # missing cell (a float here is NaN; DOIs are never floats)
            # a bare string is ONE DOI, not an iterable of characters
            items = [cell] if isinstance(cell, str) else list(cell)
            dois.update(str(x) for x in items if str(x).strip())

        text = (f"Writer family {fam}: representative {rep['representative_system']}; "
                f"mechanism {rep.get('mechanism_bucket')}; targeting {rep.get('targeting_modality')}; "
                f"reachability {rep.get('reachability_tier')}; deliverability {rep.get('deliv_class')}; "
                f"cargo {rep.get('cargo_capacity_bp')} bp; human-cell activity: "
                f"{rep.get('human_cell_activity')}. {len(sub):,} systems catalogued.")
        cards.append(Card(key=fam, text=text, citations=sorted(dois)))
    return cards
42
+
43
+
44
def retrieve(question: str, cards: list[Card], k: int = 3) -> list[Card]:
    """Keyword-overlap retriever. Deterministic, no model needed.

    Each card is scored by the NUMBER of distinct lower-cased tokens it shares with the question
    (a raw overlap count, not a normalised Jaccard ratio). Cards with no shared token are dropped;
    the rest are returned best-first, ties keeping their input order, truncated to `k` cards.
    A non-positive `k` yields an empty list.
    """
    q = set(_tok(question))
    scored = [(len(q & set(_tok(c.text + " " + c.key))), c) for c in cards]
    # sort on the score only: stable, and Card objects themselves are never compared
    scored.sort(key=lambda x: x[0], reverse=True)
    return [c for n, c in scored if n > 0][:max(k, 0)]
50
+
51
+
52
def _tok(s: str) -> list[str]:
    """Split `s` into lower-cased alphanumeric tokens, keeping only tokens longer than two characters."""
    # Every non-alphanumeric character becomes a separator.
    normalized = "".join(ch.lower() if ch.isalnum() else " " for ch in s)
    return [word for word in normalized.split() if len(word) > 2]
pen_stack/rag/llm.py ADDED
@@ -0,0 +1,178 @@
1
+ """Provider-agnostic LLM layer for PEN-STACK services (RAG, agent, PEN-MONITOR).
2
+
3
+ Hybrid backend: a strong hosted model for reasoning/agent/Q&A (default NVIDIA Nemotron, OpenAI-compatible)
4
+ with automatic fallback to a local, free, private model (Ollama). The single switch is `configs/llm.yaml`.
5
+
6
+ This is strictly an orchestration/phrasing layer. Every quantitative claim and every citation still comes
7
+ from the deterministic validated-tool path; the LLM never introduces a number, gene, or citation. The
8
+ choice of model therefore does not affect scientific reproducibility - only the quality of orchestration
9
+ and prose. If no provider is reachable, the callers fall back to the deterministic answer (LLM optional).
10
+
11
+ Secrets: the API key is read from the env var named in `api_key_env`, then from the gitignored
12
+ `api_key_file`. Keys are NEVER committed.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import os
18
+ import urllib.request
19
+ from pathlib import Path
20
+
21
+ import yaml
22
+
23
+ _CFG = Path(__file__).resolve().parents[2] / "configs" / "llm.yaml"
24
+ _ROOT = Path(__file__).resolve().parents[2]
25
+
26
+ _SYSTEM = ("You rephrase already-verified genome-writing facts into one clear paragraph for a wet-lab "
27
+ "scientist. Use ONLY the facts provided. Do NOT invent or alter any number, gene, or citation. "
28
+ "Do not give clinical advice. Keep it under 90 words.")
29
+
30
+
31
def load_llm_config(path: str | Path = _CFG) -> dict:
    """Read the LLM configuration YAML at `path` (UTF-8) and return the parsed document."""
    raw_text = Path(path).read_text(encoding="utf-8")
    return yaml.safe_load(raw_text)
33
+
34
+
35
def _provider_cfg(cfg: dict, name: str) -> dict | None:
    """Return the entry for `name` under ``cfg["providers"]``, or None when it is not configured."""
    providers = cfg.get("providers") or {}
    return providers.get(name)
37
+
38
+
39
def _resolve_key(pcfg: dict) -> str | None:
    """Resolve a provider API key, trying each source in order and skipping blank ones.

    Order: (1) the environment variable named by ``api_key_env``; (2) the file named by
    ``api_key_file`` (relative paths are resolved against `_ROOT`); (3) the inline ``api_key`` entry.
    Values from the environment and from the file are stripped; a value that is empty after stripping
    does not stop the search. Returns None when no source provides a key.
    """
    env_name = pcfg.get("api_key_env")
    if env_name:
        from_env = (os.environ.get(env_name) or "").strip()
        if from_env:
            return from_env

    key_file = pcfg.get("api_key_file")
    if key_file:
        path = Path(key_file)
        if not path.is_absolute():
            path = _ROOT / key_file
        # is_file(): a directory at this path must not be handed to read_text()
        if path.is_file():
            from_file = path.read_text(encoding="utf-8").strip()
            if from_file:
                return from_file

    return pcfg.get("api_key")
51
+
52
+
53
def _norm_tool_calls(raw: list | None) -> list:
    """Normalise provider tool calls to ``[{"function": {"name": ..., "arguments": {...}}}]``.

    ``arguments`` always comes back as a dict: a JSON string is decoded, and anything that is not (or
    does not decode to) a dict becomes ``{}``. A call whose ``function`` entry is missing or not a
    dict is kept with ``name`` None; entries of `raw` that are not dicts are skipped.
    """
    calls = []
    for call in raw or []:
        if not isinstance(call, dict):
            continue
        fn = call.get("function")
        if not isinstance(fn, dict):
            fn = {}
        args = fn.get("arguments", {})
        if isinstance(args, str):
            try:
                args = json.loads(args or "{}")
            except json.JSONDecodeError:
                args = {}
        if not isinstance(args, dict):
            # e.g. None, or JSON that decoded to a list / scalar: not usable as keyword arguments
            args = {}
        calls.append({"function": {"name": fn.get("name"), "arguments": args}})
    return calls
65
+
66
+
67
def _chat_openai(pcfg: dict, messages: list, tools: list | None, temperature: float,
                 timeout: int) -> dict | None:
    """POST one chat request to an OpenAI-compatible ``{api_base}/chat/completions`` endpoint.

    Returns the first choice's message as {content, tool_calls, raw, style}. Network, HTTP and
    response-shape errors propagate to the caller.
    """
    base = pcfg["api_base"].rstrip("/")
    key = _resolve_key(pcfg)

    payload = {
        "model": pcfg["model"],
        "messages": messages,
        "temperature": temperature,
        "max_tokens": int(pcfg.get("max_tokens", 1024)),
    }
    if tools:
        payload.update({"tools": tools, "tool_choice": "auto"})

    headers = {"Content-Type": "application/json"}
    if key:
        headers["Authorization"] = f"Bearer {key}"

    body = json.dumps(payload).encode()
    request = urllib.request.Request(f"{base}/chat/completions", data=body, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        decoded = json.load(response)

    message = decoded["choices"][0]["message"]
    return {
        "content": (message.get("content") or "").strip(),
        "tool_calls": _norm_tool_calls(message.get("tool_calls")),
        "raw": message,
        "style": "openai",
    }
86
+
87
+
88
def _chat_ollama(pcfg: dict, messages: list, tools: list | None, temperature: float,
                 timeout: int) -> dict | None:
    """POST one non-streaming chat request to Ollama's native ``{api_base}/api/chat`` endpoint.

    Only the part of ``pcfg["model"]`` after the last "/" is sent as the model name. Returns
    {content, tool_calls, raw, style}; network and HTTP errors propagate to the caller.
    """
    base = pcfg["api_base"].rstrip("/")
    model_name = str(pcfg["model"]).split("/")[-1]

    payload = {
        "model": model_name,
        "messages": messages,
        "stream": False,
        "options": {"temperature": temperature},
    }
    if tools:
        payload["tools"] = tools

    request = urllib.request.Request(
        f"{base}/api/chat",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        decoded = json.load(response)

    message = decoded.get("message", {})
    return {
        "content": (message.get("content") or "").strip(),
        "tool_calls": _norm_tool_calls(message.get("tool_calls")),
        "raw": message,
        "style": "ollama",
    }
103
+
104
+
105
def _call_provider(name: str, cfg: dict, messages: list, tools: list | None, timeout: int) -> dict | None:
    """Send one chat request to the provider `name`; return its reply, or None on any failure.

    None is also returned when the provider is not configured. The backend is chosen by the
    provider's ``style`` ("ollama" -> native Ollama API, anything else -> OpenAI-compatible API).
    """
    pcfg = _provider_cfg(cfg, name)
    if not pcfg:
        return None
    temperature = float(cfg.get("temperature", 0.1))
    use_ollama = pcfg.get("style", "openai") == "ollama"
    try:
        if use_ollama:
            return _chat_ollama(pcfg, messages, tools, temperature, timeout)
        return _chat_openai(pcfg, messages, tools, temperature, timeout)
    except Exception:  # noqa: BLE001 - any provider failure -> let the caller try the fallback
        return None
117
+
118
+
119
+ # Cooldown cache: once a provider fails (e.g. Ollama not installed on the laptop tier), skip it for
120
+ # `health_ttl` seconds instead of re-attempting it on every call. This is what prevents the multi-minute
121
+ # stalls when a configured provider is absent/slow - we pay one failed attempt, then bypass it.
122
+ _COOLDOWN: dict[str, float] = {}
123
+
124
+
125
def chat(messages: list, tools: list | None = None, cfg: dict | None = None,
         timeout: int | None = None) -> dict | None:
    """Provider-agnostic chat with fallback and cooldown.

    Tries the active provider, then the configured fallback, skipping any provider whose cooldown has
    not expired. A provider that fails is put in cooldown for ``health_ttl`` seconds. When every
    provider was skipped, the first one gets a single retry with a short timeout. Returns the reply
    dict with a ``provider`` key added, or None when nothing answered.
    """
    import time

    cfg = cfg or load_llm_config()
    if timeout is None:
        timeout = int(cfg.get("call_timeout", 60))
    ttl = float(cfg.get("health_ttl", 120))

    candidates = [cfg.get("provider", "nvidia")]
    fallback = cfg.get("fallback")
    if fallback and fallback not in candidates:
        candidates.append(fallback)

    def _attempt(provider: str, limit: int) -> dict | None:
        # One call; on success tag the reply and clear the provider's cooldown.
        reply = _call_provider(provider, cfg, messages, tools, limit)
        if reply is not None:
            reply["provider"] = provider
            _COOLDOWN.pop(provider, None)
        return reply

    started = time.time()
    attempted = False
    for provider in candidates:
        if _COOLDOWN.get(provider, 0) > started:  # provider recently failed -> skip without waiting
            continue
        attempted = True
        reply = _attempt(provider, timeout)
        if reply is not None:
            return reply
        _COOLDOWN[provider] = started + ttl  # mark unreachable; don't retry for ttl seconds

    if not attempted:  # every provider in cooldown -> one cheap retry of the first
        short_timeout = min(timeout, int(cfg.get("health_timeout", 8)))
        reply = _attempt(candidates[0], short_timeout)
        if reply is not None:
            return reply
    return None
158
+
159
+
160
def active_provider(cfg: dict | None = None, timeout: int | None = None) -> str | None:
    """Return the name of the provider that answers a minimal probe message, or None.

    The probe goes through `chat`, so the active-then-fallback order and the cooldown both apply.
    When `timeout` is None the config's ``health_timeout`` (default 8) is used.
    """
    cfg = cfg or load_llm_config()
    if timeout is None:
        timeout = int(cfg.get("health_timeout", 8))
    probe = [{"role": "user", "content": "ok"}]
    reply = chat(probe, cfg=cfg, timeout=timeout)
    if not reply:
        return None
    return reply.get("provider")
167
+
168
+
169
def available(cfg: dict | None = None, timeout: int = 30) -> bool:
    """Return True when `active_provider` finds a provider within `timeout` seconds per call."""
    provider = active_provider(cfg, timeout)
    return provider is not None
171
+
172
+
173
def phrase(facts: str, cfg: dict | None = None, timeout: int = 120) -> str | None:
    """Ask the LLM to rephrase `facts` as one paragraph under the fixed system prompt.

    Returns the reply text, or None when no provider answered or the reply was empty (the caller
    then keeps its deterministic answer).
    """
    system_msg = {"role": "system", "content": _SYSTEM}
    user_msg = {"role": "user", "content": f"Facts:\n{facts}\n\nRephrase as one paragraph."}
    reply = chat([system_msg, user_msg], cfg=cfg, timeout=timeout)
    if not reply:
        return None
    return reply.get("content") or None