pen-stack 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. pen_stack/__init__.py +2 -0
  2. pen_stack/_resources.py +34 -0
  3. pen_stack/adapt/__init__.py +14 -0
  4. pen_stack/adapt/finetune.py +33 -0
  5. pen_stack/adapt/ingest.py +86 -0
  6. pen_stack/adapt/pipeline.py +101 -0
  7. pen_stack/adapt/recalibrate.py +58 -0
  8. pen_stack/adapt/report.py +130 -0
  9. pen_stack/agent/__init__.py +1 -0
  10. pen_stack/agent/guardrails.py +49 -0
  11. pen_stack/agent/mcp_server.py +42 -0
  12. pen_stack/agent/orchestrator.py +106 -0
  13. pen_stack/agent/pen_agent.py +169 -0
  14. pen_stack/agent/tools.py +130 -0
  15. pen_stack/atlas/__init__.py +1 -0
  16. pen_stack/atlas/build_wtkb.py +80 -0
  17. pen_stack/atlas/crosslink.py +144 -0
  18. pen_stack/atlas/expand.py +190 -0
  19. pen_stack/atlas/schema.py +59 -0
  20. pen_stack/atlas/scorecard.py +134 -0
  21. pen_stack/atlas/universe.py +75 -0
  22. pen_stack/atlas/variant_propose.py +155 -0
  23. pen_stack/bridge/__init__.py +1 -0
  24. pen_stack/bridge/activity.py +52 -0
  25. pen_stack/bridge/cli.py +65 -0
  26. pen_stack/bridge/fold_qc.py +53 -0
  27. pen_stack/bridge/guide_qc.py +84 -0
  28. pen_stack/bridge/ingest.py +139 -0
  29. pen_stack/bridge/offtarget.py +133 -0
  30. pen_stack/bridge/ortholog_screen.py +73 -0
  31. pen_stack/bridge/pipeline.py +83 -0
  32. pen_stack/cli.py +126 -0
  33. pen_stack/data/__init__.py +1 -0
  34. pen_stack/data/encode.py +84 -0
  35. pen_stack/data/genome.py +71 -0
  36. pen_stack/data/ingest_chromatin.py +119 -0
  37. pen_stack/data/ingest_integration.py +112 -0
  38. pen_stack/data/ingest_safety_annot.py +164 -0
  39. pen_stack/data/ingest_trip.py +76 -0
  40. pen_stack/mech/__init__.py +1 -0
  41. pen_stack/mech/classify_atlas.py +71 -0
  42. pen_stack/mech/whitelist.py +66 -0
  43. pen_stack/monitor/__init__.py +1 -0
  44. pen_stack/monitor/europepmc.py +32 -0
  45. pen_stack/monitor/run.py +57 -0
  46. pen_stack/monitor/triage.py +63 -0
  47. pen_stack/planner/__init__.py +1 -0
  48. pen_stack/planner/cargo.py +56 -0
  49. pen_stack/planner/cargo_polish.py +146 -0
  50. pen_stack/planner/delivery.py +32 -0
  51. pen_stack/planner/multiplex.py +110 -0
  52. pen_stack/planner/optimize.py +156 -0
  53. pen_stack/planner/pipeline.py +86 -0
  54. pen_stack/planner/report.py +26 -0
  55. pen_stack/rag/__init__.py +1 -0
  56. pen_stack/rag/index.py +53 -0
  57. pen_stack/rag/llm.py +178 -0
  58. pen_stack/rag/qa.py +105 -0
  59. pen_stack/score/__init__.py +1 -0
  60. pen_stack/score/recalibrate.py +77 -0
  61. pen_stack/score/therapeutic.py +85 -0
  62. pen_stack/server/__init__.py +1 -0
  63. pen_stack/server/api.py +142 -0
  64. pen_stack/ui/__init__.py +1 -0
  65. pen_stack/ui/app.py +518 -0
  66. pen_stack/validate/__init__.py +1 -0
  67. pen_stack/validate/adapt_demo.py +69 -0
  68. pen_stack/validate/agent_eval.py +117 -0
  69. pen_stack/validate/blind_gsh_discovery.py +165 -0
  70. pen_stack/validate/cargo_directionality.py +57 -0
  71. pen_stack/validate/durability_baselines.py +150 -0
  72. pen_stack/validate/forward_hypotheses.py +104 -0
  73. pen_stack/validate/guide_qc_demo.py +58 -0
  74. pen_stack/validate/intent_specification.py +82 -0
  75. pen_stack/validate/paper3_benchmark.py +165 -0
  76. pen_stack/validate/paper4_real_validation.py +144 -0
  77. pen_stack/validate/paper4_validation.py +82 -0
  78. pen_stack/validate/seq_vs_measured.py +134 -0
  79. pen_stack/validate/within_locus_ranking.py +74 -0
  80. pen_stack/validate/writer_recovery.py +86 -0
  81. pen_stack/wgenome/__init__.py +1 -0
  82. pen_stack/wgenome/chromatin_seq.py +83 -0
  83. pen_stack/wgenome/durability.py +108 -0
  84. pen_stack/wgenome/export_tracks.py +52 -0
  85. pen_stack/wgenome/features.py +82 -0
  86. pen_stack/wgenome/gsh_baseline.py +117 -0
  87. pen_stack/wgenome/providers.py +245 -0
  88. pen_stack/wgenome/safety.py +69 -0
  89. pen_stack/wgenome/structure3d.py +168 -0
  90. pen_stack/wgenome/writability.py +72 -0
  91. pen_stack-3.1.0.dist-info/METADATA +451 -0
  92. pen_stack-3.1.0.dist-info/RECORD +96 -0
  93. pen_stack-3.1.0.dist-info/WHEEL +5 -0
  94. pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
  95. pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
  96. pen_stack-3.1.0.dist-info/top_level.txt +1 -0
pen_stack/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ """PEN-STACK v3.0 - open infrastructure for genome writing."""
2
+ __version__ = "3.1.0"
@@ -0,0 +1,34 @@
1
+ """Resolve repo-relative resource files (configs, prereg, curated data) in both layouts.
2
+
3
+ PEN-STACK is a research pipeline: the pip wheel ships the importable library + CLI + the pure-logic tools,
4
+ while the full data pipeline (3 M-row atlases, curated configs, BigWig tracks) lives in the cloned repo and
5
+ on Zenodo, per the data policy in the README. This helper finds resource files when running from a source
6
+ checkout/sdist, and gives installed users a single escape hatch (`PEN_STACK_HOME`) to point at a checkout.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ from pathlib import Path
12
+
13
+ _PKG = Path(__file__).resolve().parent # .../pen_stack
14
+ _ENV = "PEN_STACK_HOME"
15
+
16
+
17
def project_root() -> Path:
    """Return the directory expected to hold configs/, prereg/ and data/.

    A non-empty `PEN_STACK_HOME` environment variable takes precedence (with `~` expanded); otherwise the
    parent of the package directory is returned, which is the repo root in a source checkout / sdist.
    """
    override = os.environ.get(_ENV)
    # An unset or empty variable falls through to the package-relative default.
    return Path(override).expanduser() if override else _PKG.parent
23
+
24
+
25
def resource(rel: str) -> Path:
    """Resolve a repo-relative resource (e.g. 'configs/cargo_polish.yaml') under `project_root()`.

    Returns:
        The path `project_root() / rel` when it exists.

    Raises:
        FileNotFoundError: when the path does not exist; the message explains how to point
            `PEN_STACK_HOME` at a checkout.
    """
    candidate = project_root() / rel
    if candidate.exists():
        return candidate
    raise FileNotFoundError(
        f"resource {rel!r} not found at {candidate}. The pip wheel ships the library, not the full data/config "
        f"tree. Clone the repo for the full pipeline, or set {_ENV} to a checkout: "
        f"export {_ENV}=/path/to/pen-stack")
@@ -0,0 +1,14 @@
1
+ """Local recalibration / private-data adaptation (v3.1, WS-F).
2
+
3
+ Released PEN-STACK models can be recalibrated (or lightly fine-tuned) on a user's own assays - inside
4
+ Docker, on private data that never leaves the machine - behind a VALIDATION GATE so quality cannot silently
5
+ regress. The adapted artifact activates only if it beats the released model on the user's held-out split;
6
+ the released model is never overwritten (separate versioning under models/local_<id>/).
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from pen_stack.adapt.pipeline import adapt
11
+ from pen_stack.adapt.recalibrate import IsotonicCalibrator, recalibrate
12
+ from pen_stack.adapt.report import evaluate, gate, model_card
13
+
14
+ __all__ = ["adapt", "IsotonicCalibrator", "recalibrate", "evaluate", "gate", "model_card"]
@@ -0,0 +1,33 @@
1
+ """WS-F2(b) - OPTIONAL light fine-tuning: a LightGBM head on the user's features.
2
+
3
+ This is the heavier, opt-in path (the default WS-F adaptation is isotonic recalibration, which is far more
4
+ robust on small private datasets). It trains a small LightGBM classifier on the user's features+labels - or
5
+ continues training from a released booster via `init_model` - and is subject to the SAME validation gate:
6
+ it activates only if it beats the released model on the held-out split. Small-data overfitting is mitigated
7
+ (not eliminated) by the gate, shallow trees, and strong regularization.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import numpy as np
12
+
13
+
14
def finetune_head(X, y, init_model=None, seed: int = 0, n_estimators: int = 100):
    """Fit a small, regularized LightGBM classifier on `X`/`y` and return the fitted model.

    Args:
        X: feature matrix; a 1-D input is reshaped to a single feature column.
        y: labels; they are cast with `astype(int)` before fitting, which truncates fractional values
            toward zero.
        init_model: forwarded to `LGBMClassifier.fit(init_model=...)` to continue training.
        seed: passed as `random_state`.
        n_estimators: number of boosting rounds.
    """
    import lightgbm as lgb

    features = np.asarray(X, float)
    targets = np.asarray(y, float)
    if features.ndim == 1:
        features = features.reshape(-1, 1)

    hyperparams = dict(
        n_estimators=n_estimators, num_leaves=15, max_depth=4, learning_rate=0.05,
        min_child_samples=20, reg_lambda=1.0, subsample=0.8, colsample_bytree=0.8,
        random_state=seed, verbose=-1)
    head = lgb.LGBMClassifier(**hyperparams)
    head.fit(features, targets.astype(int), init_model=init_model)
    return head
26
+
27
+
28
def predict_proba(model, X):
    """Return column 1 of `model.predict_proba(X)`; a 1-D `X` is treated as a single feature column."""
    features = np.asarray(X, float)
    features = features.reshape(-1, 1) if features.ndim == 1 else features
    return model.predict_proba(features)[:, 1]
@@ -0,0 +1,86 @@
1
+ """WS-F1 - ingest a user's private assay into per-site labels matching the model's feature schema.
2
+
3
+ The runnable in-code path is TABULAR: a CSV/TSV/Parquet of sites + an outcome label (and, optionally, the
4
+ released-model score column or the per-bin features to attach). The upstream FASTQ/BAM -> per-site label
5
+ derivation (integration-site sequencing, GUIDE-seq, expression-stability profiling) is documented in
6
+ docs/private_data_formats.md and runs in the Docker image with the usual aligners; it produces exactly the
7
+ tabular schema this module validates, so the two halves compose.
8
+
9
+ Schema (standardized output): chrom, bin, ct, label, [score], [features...]
10
+ * label: 0/1 (discrimination) or a real value in [0,1] (calibration target).
11
+ * score: the released model's output for that site (safety / p_durable / writability) - the thing we
12
+ recalibrate. If absent, attach_features() joins it from the writability atlas.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ from pathlib import Path
17
+
18
+ import numpy as np
19
+ import pandas as pd
20
+
21
+ BIN_BP = 1000
22
+ REQUIRED = ("chrom", "label") # plus one of {bin, pos}
23
+
24
+
25
def load_user_labels(path: str | Path) -> pd.DataFrame:
    """Load + validate a user label table (.csv/.tsv/.parquet). Returns a frame with chrom, bin, ct, label.

    The reader is chosen from the file extension, compared case-insensitively so that e.g. `SITES.TSV` or
    `sites.PARQUET` are not silently parsed as comma-separated text:
      * `.parquet` / `.pq` -> `pd.read_parquet`
      * `.tsv` / `.txt`    -> tab-separated `pd.read_csv`
      * anything else      -> comma-separated `pd.read_csv`

    The loaded frame is passed through `normalize()`, which performs the column validation.
    """
    p = Path(path)
    suffix = p.suffix.lower()
    if suffix in (".parquet", ".pq"):
        df = pd.read_parquet(p)
    else:
        df = pd.read_csv(p, sep="\t" if suffix in (".tsv", ".txt") else ",")
    return normalize(df)
33
+
34
+
35
def normalize(df: pd.DataFrame) -> pd.DataFrame:
    """Validate a user table and return a standardized copy (the input frame is not modified).

    Steps: require the `REQUIRED` columns; derive `bin` as `pos // BIN_BP` when `bin` is absent; default
    `ct` to "user"; coerce `label` to float and require every value to lie in [0, 1]. The result has the
    columns chrom, bin, ct, label first, followed by every other column except `pos`.

    Raises:
        ValueError: on missing required columns, no `bin`/`pos` column, non-numeric or missing labels,
            or labels outside [0, 1].
    """
    table = df.copy()

    absent = [name for name in REQUIRED if name not in table.columns]
    if absent:
        raise ValueError(f"user table missing required columns: {absent} (have {list(table.columns)})")

    if "bin" not in table.columns:
        if "pos" not in table.columns:
            raise ValueError("user table needs a 'bin' or a 'pos' column to locate each site")
        table["bin"] = (table["pos"].astype(int) // BIN_BP).astype(int)

    if "ct" not in table.columns:
        table["ct"] = "user"

    numeric = pd.to_numeric(table["label"], errors="coerce")
    if numeric.isna().any():
        raise ValueError("label column has non-numeric / missing values")
    strictly_binary = ((numeric == 0) | (numeric == 1)).all()
    within_unit = ((numeric >= 0) & (numeric <= 1)).all()
    if not (strictly_binary or within_unit):
        raise ValueError("label must be binary {0,1} or a probability in [0,1]")
    table["label"] = numeric.astype(float)

    core = ["chrom", "bin", "ct", "label"]
    # `pos` is dropped from the output; every other extra column is carried through in its original order.
    extras = [c for c in table.columns if c not in ("chrom", "bin", "ct", "label", "pos")]
    return table[core + extras].reset_index(drop=True)
56
+
57
+
58
def attach_features(df: pd.DataFrame, target: str = "safety", ct: str = "k562") -> pd.DataFrame:
    """Make sure the frame carries a `score` column holding the released model's `target` score.

    Resolution order:
      1. the target column (safety | p_durable | writability; "durability" is an alias of p_durable) is
         present with at least one non-null value -> it is renamed to `score`;
      2. a `score` column is already present -> the frame is returned unchanged;
      3. otherwise the score is left-joined on (chrom, bin) from `atlas_<ct>.parquet`, and rows without a
         match are dropped.

    Raises:
        FileNotFoundError: when neither column exists and the atlas file is absent.
    """
    aliases = {"safety": "safety", "durability": "p_durable", "p_durable": "p_durable",
               "writability": "writability"}
    col = aliases.get(target, target)

    if col in df.columns and df[col].notna().any():
        if col == "score":
            return df
        return df.rename(columns={col: "score"})
    if "score" in df.columns:
        return df

    atlas = Path(__file__).resolve().parents[2].parent / "phase_1" / "out" / f"atlas_{ct}.parquet"
    if not atlas.exists():
        raise FileNotFoundError(
            f"no '{col}'/'score' column and Phase-1 atlas absent ({atlas}); supply the released-model score "
            "column in the user table, or run inside the image where the atlas is mounted.")

    atlas_scores = pd.read_parquet(atlas, columns=["chrom", "bin", col])
    merged = df.merge(atlas_scores, on=["chrom", "bin"], how="left").rename(columns={col: "score"})
    if merged["score"].isna().any():
        # sites with no atlas entry cannot be recalibrated - drop them
        merged = merged.dropna(subset=["score"]).reset_index(drop=True)
    return merged
80
+
81
+
82
def schema_summary(df: pd.DataFrame) -> dict:
    """Summarize a standardized label table: site/chromosome counts, label kind, mean label, score presence.

    `label_kind` is "binary" when every distinct label is 0.0 or 1.0, else "continuous"; `positive_rate`
    is the mean label rounded to 4 decimals.
    """
    summary = {"n_sites": int(len(df)), "n_chroms": int(df["chrom"].nunique())}
    distinct_labels = set(np.unique(df["label"]))
    summary["label_kind"] = "binary" if distinct_labels <= {0.0, 1.0} else "continuous"
    summary["positive_rate"] = round(float(df["label"].mean()), 4)
    summary["has_score"] = "score" in df.columns
    return summary
@@ -0,0 +1,101 @@
1
+ """WS-F2 - the adaptation pipeline (ingest -> split -> recalibrate/finetune -> held-out gate -> version).
2
+
3
+ `adapt()` is the one entry point. It splits the user's sites into train/held-out (chromosome-grouped when
4
+ possible), fits the adaptation on train, scores the released vs adapted model on the SAME held-out split,
5
+ and applies the validation gate. The adapted artifact is written under models/local_<id>/ ONLY - the
6
+ released model is never overwritten, and its fingerprint is checked before/after to prove it (acceptance).
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from pathlib import Path
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+
15
+ from pen_stack.adapt import report as R
16
+ from pen_stack.adapt.recalibrate import recalibrate
17
+
18
+ _ROOT = Path(__file__).resolve().parents[2]
19
+ _MODELS = _ROOT / "models"
20
+ # released score-producing modules - the "released model" we must prove is unchanged by adaptation.
21
+ _RELEASED = [_ROOT / "pen_stack" / "wgenome" / m for m in ("safety.py", "durability.py", "writability.py")]
22
+
23
+
24
def _split(df: pd.DataFrame, seed: int, holdout_frac: float = 0.3):
    """Split `df` into (train, holdout), both re-indexed from 0.

    With two or more distinct chromosomes, whole chromosomes are held out (at least one, about
    `holdout_frac` of them) so no chromosome appears on both sides. With a single chromosome each row is
    held out independently with probability `holdout_frac`. Both paths draw from a generator seeded with
    `seed`.
    """
    rng = np.random.default_rng(seed)
    chroms = df["chrom"].unique()
    if len(chroms) < 2:
        # single chromosome: fall back to a seeded per-row random split
        held_out = pd.Series(rng.random(len(df)) < holdout_frac, index=df.index)
    else:
        n_ho = max(1, int(round(len(chroms) * holdout_frac)))
        ho_chroms = set(rng.choice(chroms, size=n_ho, replace=False))
        held_out = df["chrom"].isin(ho_chroms)
    train_part = df[~held_out].reset_index(drop=True)
    holdout_part = df[held_out].reset_index(drop=True)
    return train_part, holdout_part
35
+
36
+
37
def adapt(df: pd.DataFrame, target: str = "safety", method: str = "isotonic", local_id: str = "local",
          seed: int = 20260604, primary: str = "brier", margin: float = 0.0,
          feature_cols: list[str] | None = None, models_dir: str | Path = _MODELS) -> dict:
    """Recalibrate (or fine-tune) the released `target` score on the user frame (needs 'score' + 'label').

    Returns the held-out before/after report + the gate decision + the artifact paths. The adapted model is
    activated (written + flagged) only if it beats the released model on the held-out split.

    Args:
        df: standardized user frame; must contain 'score' and 'label' (and 'chrom', used by the split).
        target: name of the released score being adapted; here it is only recorded in the report/card.
        method: 'isotonic' (calibrator fitted on the train scores) or 'finetune' (LightGBM head).
        local_id: suffix of the output directory `<models_dir>/local_<local_id>`.
        seed: seeds the train/held-out split and, for 'finetune', the head.
        primary: metric the gate compares ('brier', 'ece' or 'auroc').
        margin: minimum improvement the gate requires.
        feature_cols: columns fed to the fine-tuned head; defaults to ['score']. Unused for 'isotonic'.
        models_dir: root directory under which the local adaptation is written.

    Returns:
        The report dict (split sizes, held-out metrics before/after/no-skill, gate result, released-model
        fingerprint, `activated`) with a `paths` entry added after the files are written.

    Raises:
        ValueError: if 'score'/'label' are missing, the split leaves fewer than 5 train or 3 held-out
            rows, or `method` is unknown.
    """
    if "score" not in df.columns or "label" not in df.columns:
        raise ValueError("adapt() needs standardized columns 'score' and 'label' (see adapt.ingest)")
    # hash the released modules up front so we can show afterwards that adaptation did not touch them
    fp_before = R.released_fingerprint(*_RELEASED)

    train, holdout = _split(df, seed)
    if len(train) < 5 or len(holdout) < 3:
        raise ValueError(f"not enough data after split (train={len(train)}, holdout={len(holdout)})")

    base_holdout = np.clip(holdout["score"].to_numpy(float), 0, 1)  # released score as a probability
    if method == "isotonic":
        cal = recalibrate(train["score"], train["label"])
        adapted_holdout = cal.transform(holdout["score"])
        artifact = "calibrator.json"
    elif method == "finetune":
        # imported lazily so the finetune module is only loaded on this opt-in path
        from pen_stack.adapt.finetune import finetune_head, predict_proba
        cols = feature_cols or ["score"]
        model = finetune_head(train[cols].to_numpy(float), train["label"], seed=seed)
        adapted_holdout = predict_proba(model, holdout[cols].to_numpy(float))
        cal, artifact = None, "head.txt"
    else:
        raise ValueError(f"unknown method: {method!r} (use 'isotonic' or 'finetune')")

    # no-skill constant predictor: the TRAIN base rate applied to every held-out site (no leakage). The
    # adapted model must beat this too, else its 'improvement' is just regression to climatology.
    base_rate = float(np.clip(train["label"].mean(), 1e-6, 1 - 1e-6))
    no_skill = R.evaluate(np.full(len(holdout), base_rate), holdout["label"])

    # released and adapted predictions are scored against the SAME held-out labels
    base = R.evaluate(base_holdout, holdout["label"])
    adapted = R.evaluate(adapted_holdout, holdout["label"])
    gate = R.gate(base, adapted, primary=primary, margin=margin, no_skill=no_skill)

    out_dir = Path(models_dir) / f"local_{local_id}"
    fp_after = R.released_fingerprint(*_RELEASED)
    released_unchanged = fp_before == fp_after
    report = {"local_id": local_id, "target": target, "method": method,
              "n_train": int(len(train)), "n_holdout": int(len(holdout)),
              "held_out_before": base, "held_out_after": adapted, "held_out_no_skill": no_skill, "gate": gate,
              "released_model_unchanged": released_unchanged,
              "released_fingerprint": fp_after, "activated": gate["activate"]}
    card = R.model_card(f"local_{local_id}", target, method, base, adapted, gate,
                        len(train), len(holdout), fp_after)
    # the report and card are written whether or not the gate passes; `paths` is attached to the returned
    # dict only after this call, so the report.json on disk does not contain it
    paths = R.write_report(out_dir, report, card)
    # persist the adapted artifact ONLY when the gate passes; otherwise remove any stale artifact so a
    # previously-activated adaptation that now fails the gate is not left active (released model stays in force).
    artifact_path = out_dir / artifact
    if gate["activate"]:
        if method == "isotonic":
            cal.save(artifact_path)
        else:
            model.booster_.save_model(str(artifact_path))
        paths["artifact"] = str(artifact_path)
        # NOTE(review): on this branch an artifact left by the OTHER method in the same directory is not
        # removed - confirm whether that is intended.
    else:
        for stale in (out_dir / "calibrator.json", out_dir / "head.txt"):
            if stale.exists():
                stale.unlink()
    report["paths"] = paths
    return report
@@ -0,0 +1,58 @@
1
+ """WS-F2(a) - isotonic recalibration of a released score on user labels (private, in-container).
2
+
3
+ Isotonic regression learns a monotonic map released_score -> calibrated probability. Being monotonic it
4
+ NEVER changes the ranking (AUROC is preserved); it only fixes calibration (Brier / ECE). Small and robust on
5
+ the small datasets users typically have. The calibrator is saved as plain JSON under models/local_<id>/ -
6
+ the released model is untouched.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ from pathlib import Path
12
+
13
+ import numpy as np
14
+
15
+
16
class IsotonicCalibrator:
    """Thin, serializable wrapper around sklearn's IsotonicRegression (saved as JSON, no pickle).

    The fitted map is persisted as its threshold knots (`x` -> `y`); `load()` rebuilds the regressor by
    re-fitting on those knots. Every method that needs a fitted model raises
    `RuntimeError("calibrator not fitted")` when called too early.
    """

    def __init__(self):
        self._iso = None  # the sklearn IsotonicRegression once fitted / loaded
        self.fitted = False

    def _require_fitted(self) -> None:
        """Raise RuntimeError unless a fitted regressor is available."""
        if not self.fitted or self._iso is None:
            raise RuntimeError("calibrator not fitted")

    def fit(self, scores, labels) -> "IsotonicCalibrator":
        """Fit the monotonic map scores -> labels (outputs bounded to [0, 1]). Returns self."""
        from sklearn.isotonic import IsotonicRegression
        s, y = np.asarray(scores, float), np.asarray(labels, float)
        self._iso = IsotonicRegression(out_of_bounds="clip", y_min=0.0, y_max=1.0)
        self._iso.fit(s, y)
        self.fitted = True
        return self

    def transform(self, scores):
        """Map `scores` through the fitted calibrator. Raises RuntimeError if not fitted."""
        self._require_fitted()
        return self._iso.predict(np.asarray(scores, float))

    def to_dict(self) -> dict:
        """Return the JSON-serializable knots of the fitted map. Raises RuntimeError if not fitted."""
        # Previously this failed with an opaque AttributeError on None when called before fit().
        self._require_fitted()
        return {"kind": "isotonic", "x": list(map(float, self._iso.X_thresholds_)),
                "y": list(map(float, self._iso.y_thresholds_))}

    def save(self, path: str | Path) -> Path:
        """Write the calibrator as JSON to `path` (parent directories are created). Returns the path.

        Raises RuntimeError if not fitted; in that case nothing is created on disk.
        """
        target = Path(path)
        # serialize first so an unfitted calibrator fails before any directory is created
        payload = json.dumps(self.to_dict(), indent=2)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(payload, encoding="utf-8")
        return target

    @classmethod
    def load(cls, path: str | Path) -> "IsotonicCalibrator":
        """Rebuild a calibrator from a JSON file written by `save()`."""
        d = json.loads(Path(path).read_text(encoding="utf-8"))
        obj = cls()
        from sklearn.isotonic import IsotonicRegression
        iso = IsotonicRegression(out_of_bounds="clip", y_min=0.0, y_max=1.0)
        iso.fit(np.asarray(d["x"], float), np.asarray(d["y"], float))  # re-fit on the stored knots
        obj._iso, obj.fitted = iso, True
        return obj
54
+
55
+
56
def recalibrate(scores, labels) -> IsotonicCalibrator:
    """Build a fresh `IsotonicCalibrator`, fit it on (`scores`, `labels`) and return the fitted result."""
    calibrator = IsotonicCalibrator()
    return calibrator.fit(scores, labels)
@@ -0,0 +1,130 @@
1
+ """WS-F2(c,d) - held-out evaluation, the validation GATE, and the model card.
2
+
3
+ The adapted artifact ACTIVATES only if it beats the released model on the user's held-out split (the gate).
4
+ Calibration is judged by Brier score + expected calibration error (ECE, lower is better); discrimination by
5
+ AUROC (higher is better). The released model is provably unchanged (its artifact hash is recorded and
6
+ re-checked); a before/after report and a model card are always written.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ from datetime import datetime, timezone
12
+ from pathlib import Path
13
+
14
+ import numpy as np
15
+
16
+
17
def _auroc(scores, labels) -> float:
    """Pairwise AUROC: the fraction of (positive, negative) pairs ranked correctly, ties counting half.

    Only labels equal to 1 (positive) or 0 (negative) take part. Returns NaN when either class is empty.
    """
    positives = [s for s, y in zip(scores, labels) if y == 1]
    negatives = [s for s, y in zip(scores, labels) if y == 0]
    if not (positives and negatives):
        return float("nan")
    wins = 0
    for p in positives:
        for n in negatives:
            wins += (p > n) + 0.5 * (p == n)
    return wins / (len(positives) * len(negatives))
23
+
24
+
25
def _ece(probs, labels, n_bins: int = 10) -> float:
    """Expected calibration error over `n_bins` equal-width probability bins on [0, 1].

    Each non-empty bin contributes (bin share of samples) * |mean probability - mean label|. Bins are
    half-open [lo, hi) except the last, which also includes its upper edge.
    """
    p = np.asarray(probs, float)
    y = np.asarray(labels, float)
    edges = np.linspace(0, 1, n_bins + 1)
    total = len(p)
    final_bin = n_bins - 1
    ece = 0.0
    for idx, (lo, hi) in enumerate(zip(edges[:-1], edges[1:])):
        below_top = (p <= hi) if idx == final_bin else (p < hi)
        in_bin = (p >= lo) & below_top
        count = in_bin.sum()
        if count:
            ece += (count / total) * abs(p[in_bin].mean() - y[in_bin].mean())
    return float(ece)
34
+
35
+
36
def evaluate(probs, labels) -> dict:
    """Score probabilities against labels: sample count, Brier score, ECE and AUROC.

    Brier uses the labels as given. ECE and AUROC use binary labels: the labels themselves when they are
    all 0/1, otherwise the labels thresholded at 0.5.
    """
    p = np.asarray(probs, float)
    y = np.asarray(labels, float)
    brier = float(np.mean((p - y) ** 2))
    if set(np.unique(y)) <= {0.0, 1.0}:
        biny = y
    else:
        biny = (y >= 0.5).astype(float)
    return {
        "n": int(len(p)),
        "brier": round(brier, 5),
        "ece": round(_ece(p, biny), 5),
        "auroc": round(_auroc(list(p), list(biny)), 4),
    }
43
+
44
+
45
def gate(base: dict, adapted: dict, primary: str = "brier", margin: float = 0.0,
         no_skill: dict | None = None) -> dict:
    """Decide whether the adapted model may activate, judged on the held-out `primary` metric.

    The adapted model must improve on the released model by more than `margin` and, when `no_skill` is
    given, also improve on the constant base-rate predictor by more than `margin`. For
    primary='brier'|'ece' lower is better; for any other metric (e.g. 'auroc') higher is better. The
    no-skill check matters because recalibration can lower Brier simply by regressing to the base rate.

    Returns a dict with the compared values, the rounded improvement vs released, both comparison flags,
    `activate`, and a human-readable `decision`.
    """
    lower_better = primary in ("brier", "ece")

    def _gain(candidate, reference):
        # positive when `candidate` is better than `reference` in the metric's direction
        if lower_better:
            return reference - candidate
        return candidate - reference

    released_value = base[primary]
    adapted_value = adapted[primary]
    imp_released = _gain(adapted_value, released_value)
    beats_released = imp_released > margin

    if no_skill is None:
        ns, beats_no_skill = None, True
    else:
        ns = no_skill[primary]
        beats_no_skill = _gain(adapted_value, ns) > margin

    activate = bool(beats_released and beats_no_skill)
    if activate:
        decision = "ADAPTED ACTIVATED (beats released AND the no-skill constant on held-out)"
    elif beats_no_skill:
        decision = "ADAPTED REJECTED (does not beat released; released model kept)"
    else:
        decision = "ADAPTED REJECTED (improvement is no skill - does not beat the constant base rate)"

    return {
        "primary_metric": primary,
        "lower_is_better": lower_better,
        "released": released_value,
        "adapted": adapted_value,
        "no_skill_constant": ns,
        "improvement_vs_released": round(imp_released, 5),
        "margin": margin,
        "beats_released": bool(beats_released),
        "beats_no_skill": bool(beats_no_skill),
        "activate": activate,
        "decision": decision,
    }
80
+
81
+
82
def released_fingerprint(*paths: str | Path) -> dict:
    """Map each existing path's file name to the first 16 hex chars of its SHA-256 digest.

    Paths that do not exist are skipped. Comparing two such dicts taken before and after adaptation shows
    whether the designated released-model files changed.
    """
    import hashlib

    candidates = (Path(raw) for raw in paths)
    return {
        str(artifact.name): hashlib.sha256(artifact.read_bytes()).hexdigest()[:16]
        for artifact in candidates
        if artifact.exists()
    }
91
+
92
+
93
def model_card(local_id: str, target: str, method: str, base: dict, adapted: dict, gate_res: dict,
               n_train: int, n_holdout: int, released_fp: dict) -> str:
    """Render the Markdown model card for one local adaptation and return it as a single string.

    The card carries today's UTC date, the target/method, the split sizes, the released-model
    fingerprint, a released-vs-adapted metric table (Brier, ECE, AUROC), the gate decision, and a fixed
    scope statement.
    """
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    header = [
        f"# PEN-STACK local adaptation - {local_id}",
        "",
        f"- **Date:** {ts}",
        f"- **Target score:** {target} **Method:** {method}",
        f"- **Data:** {n_train} train / {n_holdout} held-out sites (private, in-container)",
        f"- **Released-model fingerprint (unchanged):** {released_fp}",
        "",
    ]

    metric_rows = (("Brier (lower better)", "brier"),
                   ("ECE (lower better)", "ece"),
                   ("AUROC (higher better)", "auroc"))
    table = ["## Held-out before/after", "| metric | released | adapted |", "|---|---|---|"]
    for title, key in metric_rows:
        table.append(f"| {title} | {base[key]} | {adapted[key]} |")
    table.append("")

    gate_detail = (
        f"- primary metric `{gate_res['primary_metric']}`: released {gate_res['released']} -> adapted "
        f"{gate_res['adapted']} (improvement vs released {gate_res['improvement_vs_released']}, "
        f"no-skill constant {gate_res.get('no_skill_constant')}, margin {gate_res['margin']}; "
        f"beats released={gate_res['beats_released']}, beats no-skill={gate_res['beats_no_skill']}).")
    gate_section = [f"## Gate: **{gate_res['decision']}**", gate_detail, ""]

    scope_text = (
        "Recalibration / light fine-tuning on a small private dataset; overfitting is mitigated (not "
        "eliminated) by the held-out gate. Not unsupervised learning from raw reads. The released model is "
        "never overwritten - this artifact lives under `models/local_<id>/` and activates only if the gate "
        "passed.")
    scope_section = ["## Scope", scope_text]

    return "\n".join(header + table + gate_section + scope_section)
123
+
124
+
125
def write_report(out_dir: str | Path, report: dict, card: str) -> dict:
    """Write `report.json` and `model_card.md` into `out_dir` (created if needed); return their paths.

    The report is dumped as indented JSON; values JSON cannot encode are stringified via `default=str`.
    """
    folder = Path(out_dir)
    folder.mkdir(parents=True, exist_ok=True)

    report_file = folder / "report.json"
    card_file = folder / "model_card.md"
    report_file.write_text(json.dumps(report, indent=2, default=str), encoding="utf-8")
    card_file.write_text(card, encoding="utf-8")
    return {"report": str(report_file), "model_card": str(card_file)}
@@ -0,0 +1 @@
1
+ """pen_stack.agent - see PEN-STACK v3.0 program doc."""
@@ -0,0 +1,49 @@
1
+ """LLM guardrails for PEN-STACK platform services (Phase 2, Section 2B).
2
+
3
+ The contract every service obeys: **grounded** (answers from the curated atlas + indexed literature),
4
+ **cited** (every factual claim carries a source), **defer-to-models** (any quantitative claim is produced
5
+ by a validated tool call, never guessed by the LLM), **decision-support** (never a clinical directive),
6
+ **budget-aware**, **auditable** (a provenance block accompanies every answer).
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import re
11
+
12
+ DISCLAIMER = ("Decision-support only - PEN-STACK returns calibrated risk/durability/reachability "
13
+ "estimates, not clinical directives. Tier-2/3 reachability is candidate and requires "
14
+ "experimental validation. Verify all designs experimentally.")
15
+
16
+ # Questions PEN-STACK must refuse: clinical directives, diagnosis, dosing, treatment decisions for a
17
+ # specific patient. (Scientific questions about loci/writers/safety are in scope.)
18
+ _REFUSE_PATTERNS = [
19
+ r"\bshould i (treat|inject|dose|administer|give)\b",
20
+ r"\b(diagnos|prescrib|dosage|dosing)\w*\b",
21
+ r"\b(my|this|the) patient\b",
22
+ r"\bdose\b.{0,40}\b(child|patient|human|person|kid|baby|infant)\b", # dosing for a person = clinical
23
+ r"\b(what|which) dose\b", # dosing questions are clinical
24
+ r"\bis it safe (to|for) (a |the |my )?(patient|human|person|child)\b",
25
+ r"\bclinical (decision|recommendation|advice) for\b",
26
+ ]
27
+
28
+
29
def out_of_scope(question: str) -> str | None:
    """Return the refusal message when the lower-cased question matches any refuse pattern, else None."""
    lowered = question.lower()
    if not any(re.search(pattern, lowered) for pattern in _REFUSE_PATTERNS):
        return None
    return ("This is a clinical-directive question. PEN-STACK is decision-support "
            "infrastructure for genome-writing design and does not give clinical advice.")
37
+
38
+
39
def enforce_grounded(answer: dict) -> dict:
    """Apply the auditable contract to a finished answer; the dict is modified in place and returned.

    Missing `disclaimer`, `provenance` and `citations` entries are filled with defaults. If the answer
    text contains any digit while `provenance` is empty, the text is replaced by a suppression notice and
    a `warning` entry is added.
    """
    defaults = (("disclaimer", DISCLAIMER), ("provenance", []), ("citations", []))
    for key, fallback in defaults:
        answer.setdefault(key, fallback)

    # if the answer reports numbers, there must be a tool-call provenance entry backing them
    text = str(answer.get("answer", ""))
    reports_number = re.search(r"\d", text) is not None
    if reports_number and not answer["provenance"]:
        answer["warning"] = "numeric claim without tool provenance - suppressed"
        answer["answer"] = "(suppressed: a number was produced without a backing tool call)"
    return answer
@@ -0,0 +1,42 @@
1
+ """PEN-STACK MCP server (Phase 3, Step 3.10; v3.1 WS-E2) - expose the validated capabilities to any agent.
2
+
3
+ Wraps the validated tools as a Model Context Protocol server (fastmcp) so any MCP client (Claude, etc.)
4
+ can call ``writability``, ``reachable_writers``, ``writer_axes``, ``plan_write``, ``ask_literature`` and the
5
+ grounded ``plan_write_session`` (the full PEN-Agent state machine) and receive correct, provenance-tagged
6
+ results - turning PEN-STACK into shared agentic infrastructure.
7
+
8
+ Run: ``python -m pen_stack.agent.mcp_server`` (needs the ``services`` extra: ``pip install fastmcp``).
9
+ """
10
+ from __future__ import annotations
11
+
12
+ from pen_stack.agent import pen_agent, tools
13
+
14
+ try:
15
+ from fastmcp import FastMCP
16
+ except ImportError as e: # pragma: no cover - services extra optional
17
+ raise ImportError("fastmcp not installed: pip install 'pen-stack[services]'") from e
18
+
19
+ mcp = FastMCP("pen-stack")
20
+
21
+ # register each validated tool (the same functions the in-process agent and the eval harness use)
22
+ mcp.tool()(tools.writability)
23
+ mcp.tool()(tools.reachable_writers)
24
+ mcp.tool()(tools.writer_axes)
25
+ mcp.tool()(tools.plan_write)
26
+ mcp.tool()(tools.ask_literature)
27
+ mcp.tool()(tools.multiplex_translocation_risk) # WS-G1: multiplex translocation-risk screen
28
+
29
+
30
# Registered on the module-level FastMCP instance as an MCP tool.
# NOTE(review): the docstring below is presumably surfaced to MCP clients as the tool description -
# confirm before rewording it.
@mcp.tool()
def plan_write_session(gene: str, intent: str, cargo_bp: int = 2000, ct: str = "k562",
                       payload_seq: str | None = None, mode: str = "automatic") -> dict:
    """PEN-Agent: grounded write-planning state machine (site -> writer -> cargo+polish -> off-target -> 3D).

    Every number is copied from a tool result with provenance; ungrounded steps degrade/refuse, never
    fabricate. Modes: automatic | guided | qa."""
    # Pure pass-through: every argument is forwarded unchanged to pen_agent.plan_write_session and its
    # result is returned as-is.
    return pen_agent.plan_write_session(gene, intent, cargo_bp=cargo_bp, ct=ct,
                                        payload_seq=payload_seq, mode=mode)
39
+
40
+
41
+ if __name__ == "__main__": # pragma: no cover
42
+ mcp.run()