pen-stack 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pen_stack/__init__.py +2 -0
- pen_stack/_resources.py +34 -0
- pen_stack/adapt/__init__.py +14 -0
- pen_stack/adapt/finetune.py +33 -0
- pen_stack/adapt/ingest.py +86 -0
- pen_stack/adapt/pipeline.py +101 -0
- pen_stack/adapt/recalibrate.py +58 -0
- pen_stack/adapt/report.py +130 -0
- pen_stack/agent/__init__.py +1 -0
- pen_stack/agent/guardrails.py +49 -0
- pen_stack/agent/mcp_server.py +42 -0
- pen_stack/agent/orchestrator.py +106 -0
- pen_stack/agent/pen_agent.py +169 -0
- pen_stack/agent/tools.py +130 -0
- pen_stack/atlas/__init__.py +1 -0
- pen_stack/atlas/build_wtkb.py +80 -0
- pen_stack/atlas/crosslink.py +144 -0
- pen_stack/atlas/expand.py +190 -0
- pen_stack/atlas/schema.py +59 -0
- pen_stack/atlas/scorecard.py +134 -0
- pen_stack/atlas/universe.py +75 -0
- pen_stack/atlas/variant_propose.py +155 -0
- pen_stack/bridge/__init__.py +1 -0
- pen_stack/bridge/activity.py +52 -0
- pen_stack/bridge/cli.py +65 -0
- pen_stack/bridge/fold_qc.py +53 -0
- pen_stack/bridge/guide_qc.py +84 -0
- pen_stack/bridge/ingest.py +139 -0
- pen_stack/bridge/offtarget.py +133 -0
- pen_stack/bridge/ortholog_screen.py +73 -0
- pen_stack/bridge/pipeline.py +83 -0
- pen_stack/cli.py +126 -0
- pen_stack/data/__init__.py +1 -0
- pen_stack/data/encode.py +84 -0
- pen_stack/data/genome.py +71 -0
- pen_stack/data/ingest_chromatin.py +119 -0
- pen_stack/data/ingest_integration.py +112 -0
- pen_stack/data/ingest_safety_annot.py +164 -0
- pen_stack/data/ingest_trip.py +76 -0
- pen_stack/mech/__init__.py +1 -0
- pen_stack/mech/classify_atlas.py +71 -0
- pen_stack/mech/whitelist.py +66 -0
- pen_stack/monitor/__init__.py +1 -0
- pen_stack/monitor/europepmc.py +32 -0
- pen_stack/monitor/run.py +57 -0
- pen_stack/monitor/triage.py +63 -0
- pen_stack/planner/__init__.py +1 -0
- pen_stack/planner/cargo.py +56 -0
- pen_stack/planner/cargo_polish.py +146 -0
- pen_stack/planner/delivery.py +32 -0
- pen_stack/planner/multiplex.py +110 -0
- pen_stack/planner/optimize.py +156 -0
- pen_stack/planner/pipeline.py +86 -0
- pen_stack/planner/report.py +26 -0
- pen_stack/rag/__init__.py +1 -0
- pen_stack/rag/index.py +53 -0
- pen_stack/rag/llm.py +178 -0
- pen_stack/rag/qa.py +105 -0
- pen_stack/score/__init__.py +1 -0
- pen_stack/score/recalibrate.py +77 -0
- pen_stack/score/therapeutic.py +85 -0
- pen_stack/server/__init__.py +1 -0
- pen_stack/server/api.py +142 -0
- pen_stack/ui/__init__.py +1 -0
- pen_stack/ui/app.py +518 -0
- pen_stack/validate/__init__.py +1 -0
- pen_stack/validate/adapt_demo.py +69 -0
- pen_stack/validate/agent_eval.py +117 -0
- pen_stack/validate/blind_gsh_discovery.py +165 -0
- pen_stack/validate/cargo_directionality.py +57 -0
- pen_stack/validate/durability_baselines.py +150 -0
- pen_stack/validate/forward_hypotheses.py +104 -0
- pen_stack/validate/guide_qc_demo.py +58 -0
- pen_stack/validate/intent_specification.py +82 -0
- pen_stack/validate/paper3_benchmark.py +165 -0
- pen_stack/validate/paper4_real_validation.py +144 -0
- pen_stack/validate/paper4_validation.py +82 -0
- pen_stack/validate/seq_vs_measured.py +134 -0
- pen_stack/validate/within_locus_ranking.py +74 -0
- pen_stack/validate/writer_recovery.py +86 -0
- pen_stack/wgenome/__init__.py +1 -0
- pen_stack/wgenome/chromatin_seq.py +83 -0
- pen_stack/wgenome/durability.py +108 -0
- pen_stack/wgenome/export_tracks.py +52 -0
- pen_stack/wgenome/features.py +82 -0
- pen_stack/wgenome/gsh_baseline.py +117 -0
- pen_stack/wgenome/providers.py +245 -0
- pen_stack/wgenome/safety.py +69 -0
- pen_stack/wgenome/structure3d.py +168 -0
- pen_stack/wgenome/writability.py +72 -0
- pen_stack-3.1.0.dist-info/METADATA +451 -0
- pen_stack-3.1.0.dist-info/RECORD +96 -0
- pen_stack-3.1.0.dist-info/WHEEL +5 -0
- pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
- pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
- pen_stack-3.1.0.dist-info/top_level.txt +1 -0
pen_stack/__init__.py
ADDED
pen_stack/_resources.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Resolve repo-relative resource files (configs, prereg, curated data) in both layouts.
|
|
2
|
+
|
|
3
|
+
PEN-STACK is a research pipeline: the pip wheel ships the importable library + CLI + the pure-logic tools,
|
|
4
|
+
while the full data pipeline (3 M-row atlases, curated configs, BigWig tracks) lives in the cloned repo and
|
|
5
|
+
on Zenodo, per the data policy in the README. This helper finds resource files when running from a source
|
|
6
|
+
checkout/sdist, and gives installed users a single escape hatch (`PEN_STACK_HOME`) to point at a checkout.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
_PKG = Path(__file__).resolve().parent # .../pen_stack
|
|
14
|
+
_ENV = "PEN_STACK_HOME"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def project_root() -> Path:
    """Return the directory expected to hold configs/, prereg/ and data/.

    A non-empty `PEN_STACK_HOME` environment variable wins (with `~` expanded); otherwise the parent of the
    package directory is used, which is the repo root in a source checkout / sdist.
    """
    override = os.environ.get(_ENV)
    return Path(override).expanduser() if override else _PKG.parent
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def resource(rel: str) -> Path:
    """Resolve a repo-relative resource (e.g. 'configs/cargo_polish.yaml') against `project_root()`.

    Returns the joined path when it exists. Otherwise raises FileNotFoundError whose message explains the
    two ways out: clone the repo, or point `PEN_STACK_HOME` at a checkout.
    """
    candidate = project_root() / rel
    if candidate.exists():
        return candidate
    raise FileNotFoundError(
        f"resource {rel!r} not found at {candidate}. The pip wheel ships the library, not the full data/config "
        f"tree. Clone the repo for the full pipeline, or set {_ENV} to a checkout: "
        f"export {_ENV}=/path/to/pen-stack")
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Local recalibration / private-data adaptation (v3.1, WS-F).
|
|
2
|
+
|
|
3
|
+
Released PEN-STACK models can be recalibrated (or lightly fine-tuned) on a user's own assays - inside
|
|
4
|
+
Docker, on private data that never leaves the machine - behind a VALIDATION GATE so quality cannot silently
|
|
5
|
+
regress. The adapted artifact activates only if it beats the released model on the user's held-out split;
|
|
6
|
+
the released model is never overwritten (separate versioning under models/local_<id>/).
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from pen_stack.adapt.pipeline import adapt
|
|
11
|
+
from pen_stack.adapt.recalibrate import IsotonicCalibrator, recalibrate
|
|
12
|
+
from pen_stack.adapt.report import evaluate, gate, model_card
|
|
13
|
+
|
|
14
|
+
__all__ = ["adapt", "IsotonicCalibrator", "recalibrate", "evaluate", "gate", "model_card"]
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""WS-F2(b) - OPTIONAL light fine-tuning: a LightGBM head on the user's features.
|
|
2
|
+
|
|
3
|
+
This is the heavier, opt-in path (the default WS-F adaptation is isotonic recalibration, which is far more
|
|
4
|
+
robust on small private datasets). It trains a small LightGBM classifier on the user's features+labels - or
|
|
5
|
+
continues training from a released booster via `init_model` - and is subject to the SAME validation gate:
|
|
6
|
+
it activates only if it beats the released model on the held-out split. Small-data overfitting is mitigated
|
|
7
|
+
(not eliminated) by the gate, shallow trees, and strong regularization.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def finetune_head(X, y, init_model=None, seed: int = 0, n_estimators: int = 100):
    """Train (or continue-train) a small, regularized LightGBM head and return the fitted model.

    X: feature matrix; a 1-D input is treated as a single feature column.
    y: labels, either 0/1 or real values in [0, 1]. Real-valued labels are binarized at 0.5 before
       fitting the classifier.
    init_model: optional model forwarded to `LGBMClassifier.fit(init_model=...)` to continue training.
    seed: forwarded as `random_state`.
    n_estimators: number of boosting rounds.
    """
    import lightgbm as lgb
    X, y = np.asarray(X, float), np.asarray(y, float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)
    # A plain int cast truncates, turning e.g. a 0.9 label into class 0. Threshold at 0.5 instead;
    # for labels that are already exactly 0/1 this yields the same classes as the cast.
    y_cls = (y >= 0.5).astype(int)
    model = lgb.LGBMClassifier(
        n_estimators=n_estimators, num_leaves=15, max_depth=4, learning_rate=0.05,
        min_child_samples=20, reg_lambda=1.0, subsample=0.8, colsample_bytree=0.8,
        random_state=seed, verbose=-1)
    model.fit(X, y_cls, init_model=init_model)
    return model
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def predict_proba(model, X):
    """Return the second column of `model.predict_proba(X)`; 1-D `X` is treated as one feature column."""
    features = np.asarray(X, float)
    features = features.reshape(-1, 1) if features.ndim == 1 else features
    return model.predict_proba(features)[:, 1]
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""WS-F1 - ingest a user's private assay into per-site labels matching the model's feature schema.
|
|
2
|
+
|
|
3
|
+
The runnable in-code path is TABULAR: a CSV/TSV/Parquet of sites + an outcome label (and, optionally, the
|
|
4
|
+
released-model score column or the per-bin features to attach). The upstream FASTQ/BAM -> per-site label
|
|
5
|
+
derivation (integration-site sequencing, GUIDE-seq, expression-stability profiling) is documented in
|
|
6
|
+
docs/private_data_formats.md and runs in the Docker image with the usual aligners; it produces exactly the
|
|
7
|
+
tabular schema this module validates, so the two halves compose.
|
|
8
|
+
|
|
9
|
+
Schema (standardized output): chrom, bin, ct, label, [score], [features...]
|
|
10
|
+
* label: 0/1 (discrimination) or a real value in [0,1] (calibration target).
|
|
11
|
+
* score: the released model's output for that site (safety / p_durable / writability) - the thing we
|
|
12
|
+
recalibrate. If absent, attach_features() joins it from the writability atlas.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
import pandas as pd
|
|
20
|
+
|
|
21
|
+
BIN_BP = 1000
|
|
22
|
+
REQUIRED = ("chrom", "label") # plus one of {bin, pos}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def load_user_labels(path: str | Path) -> pd.DataFrame:
    """Load + validate a user label table (.csv/.tsv/.parquet). Returns a frame with chrom, bin, ct, label.

    The reader is chosen from the file suffix, compared case-insensitively: `.parquet`/`.pq` use
    `pd.read_parquet`; `.tsv`/`.txt` are read tab-separated; anything else is read comma-separated.
    The loaded frame is passed through `normalize`, which raises ValueError on an invalid schema.
    """
    p = Path(path)
    # Lower-case once so that e.g. "SITES.TSV" is not silently parsed as comma-separated.
    suffix = p.suffix.lower()
    if suffix in (".parquet", ".pq"):
        df = pd.read_parquet(p)
    else:
        df = pd.read_csv(p, sep="\t" if suffix in (".tsv", ".txt") else ",")
    return normalize(df)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def normalize(df: pd.DataFrame) -> pd.DataFrame:
    """Check the user table and return a standardized copy (the input frame is not modified).

    Steps: require the REQUIRED columns; derive `bin` as `pos // BIN_BP` when `bin` is absent; default
    `ct` to "user" when absent; coerce `label` to float after checking it is numeric and within [0, 1].
    Output columns are chrom, bin, ct, label followed by every other input column except `pos`.
    Raises ValueError for any schema or label violation.
    """
    table = df.copy()
    absent = [name for name in REQUIRED if name not in table.columns]
    if absent:
        raise ValueError(f"user table missing required columns: {absent} (have {list(table.columns)})")

    has_bin = "bin" in table.columns
    if not has_bin and "pos" not in table.columns:
        raise ValueError("user table needs a 'bin' or a 'pos' column to locate each site")
    if not has_bin:
        table["bin"] = (table["pos"].astype(int) // BIN_BP).astype(int)

    if "ct" not in table.columns:
        table["ct"] = "user"

    labels = pd.to_numeric(table["label"], errors="coerce")
    if labels.isna().any():
        raise ValueError("label column has non-numeric / missing values")
    strictly_binary = ((labels == 0) | (labels == 1)).all()
    within_unit_interval = ((labels >= 0) & (labels <= 1)).all()
    if not (strictly_binary or within_unit_interval):
        raise ValueError("label must be binary {0,1} or a probability in [0,1]")
    table["label"] = labels.astype(float)

    core = ["chrom", "bin", "ct", "label"]
    extras = [name for name in table.columns if name not in ("chrom", "bin", "ct", "label", "pos")]
    return table[core + extras].reset_index(drop=True)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def attach_features(df: pd.DataFrame, target: str = "safety", ct: str = "k562") -> pd.DataFrame:
    """Join the released model's score for `target` (safety|p_durable|writability) from the Phase-1 atlas.

    Resolution order:
      1. If the target's column is present with at least one non-null value, it is renamed to `score`.
      2. Else, if a `score` column is present, the frame is returned unchanged.
      3. Else the column is read from `atlas_<ct>.parquet` and left-joined on (chrom, bin); sites without
         an atlas score are dropped.

    Raises FileNotFoundError if step 3 is needed and the atlas file is absent - the caller then supplies
    scores directly.
    """
    col = {"safety": "safety", "durability": "p_durable", "p_durable": "p_durable",
           "writability": "writability"}.get(target, target)
    if col in df.columns and df[col].notna().any():
        return df.rename(columns={col: "score"}) if col != "score" else df
    if "score" in df.columns:
        return df
    atlas = Path(__file__).resolve().parents[2].parent / "phase_1" / "out" / f"atlas_{ct}.parquet"
    if not atlas.exists():
        raise FileNotFoundError(
            f"no '{col}'/'score' column and Phase-1 atlas absent ({atlas}); supply the released-model score "
            "column in the user table, or run inside the image where the atlas is mounted.")
    a = pd.read_parquet(atlas, columns=["chrom", "bin", col])
    # Reaching this point with `col` still in the user frame means it is entirely null. Drop that
    # placeholder first: otherwise the merge emits `<col>_x` / `<col>_y`, the rename below matches
    # nothing, and the `score` lookup fails with a KeyError.
    left = df.drop(columns=[col]) if col in df.columns else df
    out = left.merge(a, on=["chrom", "bin"], how="left").rename(columns={col: "score"})
    if out["score"].isna().any():
        out = out.dropna(subset=["score"]).reset_index(drop=True)
    return out
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def schema_summary(df: pd.DataFrame) -> dict:
    """Summarize a standardized label table: site/chromosome counts, label kind, mean label, score presence."""
    labels = df["label"]
    only_zero_one = set(np.unique(labels)).issubset({0.0, 1.0})
    summary = {
        "n_sites": int(len(df)),
        "n_chroms": int(df["chrom"].nunique()),
        "label_kind": "binary" if only_zero_one else "continuous",
        "positive_rate": round(float(labels.mean()), 4),
        "has_score": "score" in df.columns,
    }
    return summary
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""WS-F2 - the adaptation pipeline (ingest -> split -> recalibrate/finetune -> held-out gate -> version).
|
|
2
|
+
|
|
3
|
+
`adapt()` is the one entry point. It splits the user's sites into train/held-out (chromosome-grouped when
|
|
4
|
+
possible), fits the adaptation on train, scores the released vs adapted model on the SAME held-out split,
|
|
5
|
+
and applies the validation gate. The adapted artifact is written under models/local_<id>/ ONLY - the
|
|
6
|
+
released model is never overwritten, and its fingerprint is checked before/after to prove it (acceptance).
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pandas as pd
|
|
14
|
+
|
|
15
|
+
from pen_stack.adapt import report as R
|
|
16
|
+
from pen_stack.adapt.recalibrate import recalibrate
|
|
17
|
+
|
|
18
|
+
_ROOT = Path(__file__).resolve().parents[2]
|
|
19
|
+
_MODELS = _ROOT / "models"
|
|
20
|
+
# released score-producing modules - the "released model" we must prove is unchanged by adaptation.
|
|
21
|
+
_RELEASED = [_ROOT / "pen_stack" / "wgenome" / m for m in ("safety.py", "durability.py", "writability.py")]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _split(df: pd.DataFrame, seed: int, holdout_frac: float = 0.3):
|
|
25
|
+
"""Chromosome-grouped holdout when >=2 chromosomes (no leakage); else a seeded random split."""
|
|
26
|
+
rng = np.random.default_rng(seed)
|
|
27
|
+
chroms = df["chrom"].unique()
|
|
28
|
+
if len(chroms) >= 2:
|
|
29
|
+
n_ho = max(1, int(round(len(chroms) * holdout_frac)))
|
|
30
|
+
ho_chroms = set(rng.choice(chroms, size=n_ho, replace=False))
|
|
31
|
+
mask = df["chrom"].isin(ho_chroms)
|
|
32
|
+
else:
|
|
33
|
+
mask = pd.Series(rng.random(len(df)) < holdout_frac, index=df.index)
|
|
34
|
+
return df[~mask].reset_index(drop=True), df[mask].reset_index(drop=True)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def adapt(df: pd.DataFrame, target: str = "safety", method: str = "isotonic", local_id: str = "local",
          seed: int = 20260604, primary: str = "brier", margin: float = 0.0,
          feature_cols: list[str] | None = None, models_dir: str | Path = _MODELS) -> dict:
    """Recalibrate (or fine-tune) the released `target` score on the user frame (needs 'score' + 'label').

    Flow: split into train/held-out, fit the adaptation on train (`method` = 'isotonic' or 'finetune'),
    evaluate the released score, the adapted output and a constant train-base-rate predictor on the same
    held-out rows, then apply the gate on `primary` with `margin`.

    Returns the report dict (held-out metrics before/after, no-skill metrics, gate decision, released
    fingerprint, and `paths`). report.json and model_card.md are always written under
    `<models_dir>/local_<local_id>/`; the adapted artifact is written only when the gate activates.

    Raises ValueError if 'score'/'label' are missing, if the split leaves fewer than 5 train or 3
    held-out rows, or if `method` is unknown.
    """
    if "score" not in df.columns or "label" not in df.columns:
        raise ValueError("adapt() needs standardized columns 'score' and 'label' (see adapt.ingest)")
    fp_before = R.released_fingerprint(*_RELEASED)

    train, holdout = _split(df, seed)
    if len(train) < 5 or len(holdout) < 3:
        raise ValueError(f"not enough data after split (train={len(train)}, holdout={len(holdout)})")

    base_holdout = np.clip(holdout["score"].to_numpy(float), 0, 1)  # released score as a probability
    if method == "isotonic":
        cal = recalibrate(train["score"], train["label"])
        adapted_holdout = cal.transform(holdout["score"])
        artifact = "calibrator.json"
    elif method == "finetune":
        from pen_stack.adapt.finetune import finetune_head, predict_proba
        cols = feature_cols or ["score"]
        model = finetune_head(train[cols].to_numpy(float), train["label"], seed=seed)
        adapted_holdout = predict_proba(model, holdout[cols].to_numpy(float))
        cal, artifact = None, "head.txt"
    else:
        raise ValueError(f"unknown method: {method!r} (use 'isotonic' or 'finetune')")

    # no-skill constant predictor: the TRAIN base rate applied to every held-out site (no leakage). The
    # adapted model must beat this too, else its 'improvement' is just regression to climatology.
    base_rate = float(np.clip(train["label"].mean(), 1e-6, 1 - 1e-6))
    no_skill = R.evaluate(np.full(len(holdout), base_rate), holdout["label"])

    base = R.evaluate(base_holdout, holdout["label"])
    adapted = R.evaluate(adapted_holdout, holdout["label"])
    gate = R.gate(base, adapted, primary=primary, margin=margin, no_skill=no_skill)

    out_dir = Path(models_dir) / f"local_{local_id}"
    fp_after = R.released_fingerprint(*_RELEASED)
    released_unchanged = fp_before == fp_after
    report = {"local_id": local_id, "target": target, "method": method,
              "n_train": int(len(train)), "n_holdout": int(len(holdout)),
              "held_out_before": base, "held_out_after": adapted, "held_out_no_skill": no_skill, "gate": gate,
              "released_model_unchanged": released_unchanged,
              "released_fingerprint": fp_after, "activated": gate["activate"]}
    card = R.model_card(f"local_{local_id}", target, method, base, adapted, gate,
                        len(train), len(holdout), fp_after)
    paths = R.write_report(out_dir, report, card)
    # persist the adapted artifact ONLY when the gate passes; otherwise remove any stale artifact so a
    # previously-activated adaptation that now fails the gate is not left active (released model stays in force).
    artifact_path = out_dir / artifact
    known_artifacts = (out_dir / "calibrator.json", out_dir / "head.txt")
    if gate["activate"]:
        if method == "isotonic":
            cal.save(artifact_path)
        else:
            model.booster_.save_model(str(artifact_path))
        paths["artifact"] = str(artifact_path)
        # An earlier run under the same local_id may have activated the OTHER method; its artifact would
        # otherwise sit next to the one just written, contradicting this run's report and model card.
        stale_candidates = [p for p in known_artifacts if p != artifact_path]
    else:
        stale_candidates = list(known_artifacts)
    for stale in stale_candidates:
        if stale.exists():
            stale.unlink()
    report["paths"] = paths
    return report
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""WS-F2(a) - isotonic recalibration of a released score on user labels (private, in-container).
|
|
2
|
+
|
|
3
|
+
Isotonic regression learns a monotonic map released_score -> calibrated probability. Being monotonic it
|
|
4
|
+
NEVER changes the ranking (AUROC is preserved); it only fixes calibration (Brier / ECE). Small and robust on
|
|
5
|
+
the small datasets users typically have. The calibrator is saved as plain JSON under models/local_<id>/ -
|
|
6
|
+
the released model is untouched.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class IsotonicCalibrator:
    """Thin, serializable wrapper around sklearn's IsotonicRegression (saved as JSON, no pickle).

    The fitted map is persisted as its threshold knots (`X_thresholds_` / `y_thresholds_`) and restored
    by re-fitting an IsotonicRegression on those knots.
    """

    def __init__(self):
        self._iso = None      # underlying sklearn IsotonicRegression once fitted/loaded
        self.fitted = False   # guards transform / to_dict / save

    def _require_fitted(self) -> None:
        """Raise RuntimeError unless fit() or load() has populated the calibrator."""
        if not self.fitted:
            raise RuntimeError("calibrator not fitted")

    def fit(self, scores, labels) -> "IsotonicCalibrator":
        """Fit the isotonic map scores -> labels (outputs clipped to [0, 1]); returns self."""
        from sklearn.isotonic import IsotonicRegression
        s, y = np.asarray(scores, float), np.asarray(labels, float)
        self._iso = IsotonicRegression(out_of_bounds="clip", y_min=0.0, y_max=1.0)
        self._iso.fit(s, y)
        self.fitted = True
        return self

    def transform(self, scores):
        """Map scores through the fitted calibrator. Raises RuntimeError if not fitted."""
        self._require_fitted()
        return self._iso.predict(np.asarray(scores, float))

    def to_dict(self) -> dict:
        """Return the JSON-serializable knots. Raises RuntimeError if not fitted."""
        # Same guard as transform(): without it an unfitted instance fails with an opaque
        # AttributeError on None.
        self._require_fitted()
        return {"kind": "isotonic", "x": list(map(float, self._iso.X_thresholds_)),
                "y": list(map(float, self._iso.y_thresholds_))}

    def save(self, path: str | Path) -> Path:
        """Write the knots as JSON to `path`, creating parent directories. Raises RuntimeError if not fitted."""
        # Serialize first so an unfitted calibrator fails before any directory is created.
        payload = json.dumps(self.to_dict(), indent=2)
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        Path(path).write_text(payload, encoding="utf-8")
        return Path(path)

    @classmethod
    def load(cls, path: str | Path) -> "IsotonicCalibrator":
        """Rebuild a calibrator from a JSON file written by save()."""
        d = json.loads(Path(path).read_text(encoding="utf-8"))
        obj = cls()
        from sklearn.isotonic import IsotonicRegression
        iso = IsotonicRegression(out_of_bounds="clip", y_min=0.0, y_max=1.0)
        iso.fit(np.asarray(d["x"], float), np.asarray(d["y"], float))  # re-fit on the stored knots
        obj._iso, obj.fitted = iso, True
        return obj
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def recalibrate(scores, labels) -> IsotonicCalibrator:
    """Build a fresh IsotonicCalibrator, fit it on (scores, labels) and return it."""
    calibrator = IsotonicCalibrator()
    return calibrator.fit(scores, labels)
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""WS-F2(c,d) - held-out evaluation, the validation GATE, and the model card.
|
|
2
|
+
|
|
3
|
+
The adapted artifact ACTIVATES only if it beats the released model on the user's held-out split (the gate).
|
|
4
|
+
Calibration is judged by Brier score + expected calibration error (ECE, lower is better); discrimination by
|
|
5
|
+
AUROC (higher is better). The released model is provably unchanged (its artifact hash is recorded and
|
|
6
|
+
re-checked); a before/after report and a model card are always written.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _auroc(scores, labels) -> float:
|
|
18
|
+
pos = [s for s, y in zip(scores, labels) if y == 1]
|
|
19
|
+
neg = [s for s, y in zip(scores, labels) if y == 0]
|
|
20
|
+
if not pos or not neg:
|
|
21
|
+
return float("nan")
|
|
22
|
+
return sum((p > n) + 0.5 * (p == n) for p in pos for n in neg) / (len(pos) * len(neg))
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _ece(probs, labels, n_bins: int = 10) -> float:
|
|
26
|
+
probs, labels = np.asarray(probs, float), np.asarray(labels, float)
|
|
27
|
+
edges = np.linspace(0, 1, n_bins + 1)
|
|
28
|
+
ece, n = 0.0, len(probs)
|
|
29
|
+
for i in range(n_bins):
|
|
30
|
+
m = (probs >= edges[i]) & (probs < edges[i + 1] if i < n_bins - 1 else probs <= edges[i + 1])
|
|
31
|
+
if m.sum():
|
|
32
|
+
ece += (m.sum() / n) * abs(probs[m].mean() - labels[m].mean())
|
|
33
|
+
return float(ece)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def evaluate(probs, labels) -> dict:
    """Score probabilities against labels; returns {"n", "brier", "ece", "auroc"} (rounded).

    Brier uses the labels as given. ECE and AUROC use hard labels: the labels themselves when they are
    all 0/1, otherwise thresholded at 0.5.
    """
    p = np.asarray(probs, float)
    y = np.asarray(labels, float)
    already_binary = set(np.unique(y)) <= {0.0, 1.0}
    hard = y if already_binary else (y >= 0.5).astype(float)
    mean_sq_err = float(np.mean((p - y) ** 2))
    metrics = {"n": int(len(p))}
    metrics["brier"] = round(mean_sq_err, 5)
    metrics["ece"] = round(_ece(p, hard), 5)
    metrics["auroc"] = round(_auroc(list(p), list(hard)), 4)
    return metrics
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def gate(base: dict, adapted: dict, primary: str = "brier", margin: float = 0.0,
         no_skill: dict | None = None) -> dict:
    """Decide whether the adapted model may activate, judged on the held-out `primary` metric.

    The adapted value must improve on the released value by more than `margin`, and - when `no_skill`
    metrics are given - on the constant base-rate predictor by more than `margin` as well. 'brier' and
    'ece' are lower-is-better; any other `primary` is treated as higher-is-better. The second check
    matters because recalibration can lower Brier simply by regressing to the base rate.

    Returns a dict with the compared values, the rounded improvement vs released, both comparison
    flags, `activate`, and a human-readable `decision`.
    """
    lower_better = primary in ("brier", "ece")

    def improvement(value, reference):
        # positive means `value` is better than `reference` in the metric's own direction
        if lower_better:
            return reference - value
        return value - reference

    released_val = base[primary]
    adapted_val = adapted[primary]
    gain_vs_released = improvement(adapted_val, released_val)
    beats_released = gain_vs_released > margin

    if no_skill is None:
        no_skill_val = None
        beats_no_skill = True
    else:
        no_skill_val = no_skill[primary]
        beats_no_skill = improvement(adapted_val, no_skill_val) > margin

    activate = bool(beats_released and beats_no_skill)
    if activate:
        decision = "ADAPTED ACTIVATED (beats released AND the no-skill constant on held-out)"
    elif beats_no_skill:
        decision = "ADAPTED REJECTED (does not beat released; released model kept)"
    else:
        decision = "ADAPTED REJECTED (improvement is no skill - does not beat the constant base rate)"

    return {"primary_metric": primary, "lower_is_better": lower_better,
            "released": released_val, "adapted": adapted_val, "no_skill_constant": no_skill_val,
            "improvement_vs_released": round(gain_vs_released, 5), "margin": margin,
            "beats_released": bool(beats_released), "beats_no_skill": bool(beats_no_skill),
            "activate": activate, "decision": decision}
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def released_fingerprint(*paths: str | Path) -> dict:
    """Map each existing file's basename to the first 16 hex chars of its SHA-256.

    Paths that do not exist are skipped; when two paths share a basename the later one wins.
    """
    import hashlib
    existing = (candidate for candidate in map(Path, paths) if candidate.exists())
    return {str(found.name): hashlib.sha256(found.read_bytes()).hexdigest()[:16] for found in existing}
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def model_card(local_id: str, target: str, method: str, base: dict, adapted: dict, gate_res: dict,
               n_train: int, n_holdout: int, released_fp: dict) -> str:
    """Render the Markdown model card: header, held-out before/after table, gate outcome, scope note.

    `base` / `adapted` must carry 'brier', 'ece' and 'auroc'; `gate_res` is the dict produced by `gate`.
    The date stamped on the card is today's UTC date.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    header = [
        f"# PEN-STACK local adaptation - {local_id}",
        "",
        f"- **Date:** {today}",
        f"- **Target score:** {target} **Method:** {method}",
        f"- **Data:** {n_train} train / {n_holdout} held-out sites (private, in-container)",
        f"- **Released-model fingerprint (unchanged):** {released_fp}",
        "",
    ]

    metric_rows = (("Brier (lower better)", "brier"),
                   ("ECE (lower better)", "ece"),
                   ("AUROC (higher better)", "auroc"))
    table = ["## Held-out before/after", "| metric | released | adapted |", "|---|---|---|"]
    table.extend(f"| {title} | {base[key]} | {adapted[key]} |" for title, key in metric_rows)
    table.append("")

    gate_detail = (
        f"- primary metric `{gate_res['primary_metric']}`: released {gate_res['released']} -> adapted "
        f"{gate_res['adapted']} (improvement vs released {gate_res['improvement_vs_released']}, "
        f"no-skill constant {gate_res.get('no_skill_constant')}, margin {gate_res['margin']}; "
        f"beats released={gate_res['beats_released']}, beats no-skill={gate_res['beats_no_skill']}).")
    gate_section = [f"## Gate: **{gate_res['decision']}**", gate_detail, ""]

    scope_text = (
        "Recalibration / light fine-tuning on a small private dataset; overfitting is mitigated (not "
        "eliminated) by the held-out gate. Not unsupervised learning from raw reads. The released model is "
        "never overwritten - this artifact lives under `models/local_<id>/` and activates only if the gate "
        "passed.")
    scope_section = ["## Scope", scope_text]

    return "\n".join(header + table + gate_section + scope_section)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def write_report(out_dir: str | Path, report: dict, card: str) -> dict:
    """Write `report.json` and `model_card.md` into `out_dir` (created if needed); return both paths as str.

    Values in `report` that JSON cannot encode natively are stringified (`default=str`).
    """
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    report_path = target_dir / "report.json"
    card_path = target_dir / "model_card.md"
    report_path.write_text(json.dumps(report, indent=2, default=str), encoding="utf-8")
    card_path.write_text(card, encoding="utf-8")
    return {"report": str(report_path), "model_card": str(card_path)}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""pen_stack.agent - see PEN-STACK v3.0 program doc."""
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""LLM guardrails for PEN-STACK platform services (Phase 2, Section 2B).
|
|
2
|
+
|
|
3
|
+
The contract every service obeys: **grounded** (answers from the curated atlas + indexed literature),
|
|
4
|
+
**cited** (every factual claim carries a source), **defer-to-models** (any quantitative claim is produced
|
|
5
|
+
by a validated tool call, never guessed by the LLM), **decision-support** (never a clinical directive),
|
|
6
|
+
**budget-aware**, **auditable** (a provenance block accompanies every answer).
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
DISCLAIMER = ("Decision-support only - PEN-STACK returns calibrated risk/durability/reachability "
|
|
13
|
+
"estimates, not clinical directives. Tier-2/3 reachability is candidate and requires "
|
|
14
|
+
"experimental validation. Verify all designs experimentally.")
|
|
15
|
+
|
|
16
|
+
# Questions PEN-STACK must refuse: clinical directives, diagnosis, dosing, treatment decisions for a
|
|
17
|
+
# specific patient. (Scientific questions about loci/writers/safety are in scope.)
|
|
18
|
+
_REFUSE_PATTERNS = [
|
|
19
|
+
r"\bshould i (treat|inject|dose|administer|give)\b",
|
|
20
|
+
r"\b(diagnos|prescrib|dosage|dosing)\w*\b",
|
|
21
|
+
r"\b(my|this|the) patient\b",
|
|
22
|
+
r"\bdose\b.{0,40}\b(child|patient|human|person|kid|baby|infant)\b", # dosing for a person = clinical
|
|
23
|
+
r"\b(what|which) dose\b", # dosing questions are clinical
|
|
24
|
+
r"\bis it safe (to|for) (a |the |my )?(patient|human|person|child)\b",
|
|
25
|
+
r"\bclinical (decision|recommendation|advice) for\b",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def out_of_scope(question: str) -> str | None:
    """Return the refusal message when the lower-cased question matches any refuse pattern, else None."""
    lowered = question.lower()
    if not any(re.search(pattern, lowered) for pattern in _REFUSE_PATTERNS):
        return None
    return ("This is a clinical-directive question. PEN-STACK is decision-support "
            "infrastructure for genome-writing design and does not give clinical advice.")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def enforce_grounded(answer: dict) -> dict:
    """Apply the auditable contract to a finished answer, in place, and return the same dict.

    Fills in `disclaimer`, `provenance` and `citations` when absent. If the answer text contains any
    digit while `provenance` is empty, the text is replaced by a suppression notice and a `warning` is set.
    """
    for key, fallback in (("disclaimer", DISCLAIMER), ("provenance", []), ("citations", [])):
        answer.setdefault(key, fallback)

    # a digit anywhere in the answer text counts as a numeric claim that needs a backing tool call
    text = str(answer.get("answer", ""))
    mentions_number = re.search(r"\d", text) is not None
    if not mentions_number or answer["provenance"]:
        return answer

    answer["warning"] = "numeric claim without tool provenance - suppressed"
    answer["answer"] = "(suppressed: a number was produced without a backing tool call)"
    return answer
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""PEN-STACK MCP server (Phase 3, Step 3.10; v3.1 WS-E2) - expose the validated capabilities to any agent.
|
|
2
|
+
|
|
3
|
+
Wraps the validated tools as a Model Context Protocol server (fastmcp) so any MCP client (Claude, etc.)
|
|
4
|
+
can call ``writability``, ``reachable_writers``, ``writer_axes``, ``plan_write``, ``ask_literature`` and the
|
|
5
|
+
grounded ``plan_write_session`` (the full PEN-Agent state machine) and receive correct, provenance-tagged
|
|
6
|
+
results - turning PEN-STACK into shared agentic infrastructure.
|
|
7
|
+
|
|
8
|
+
Run: ``python -m pen_stack.agent.mcp_server`` (needs the ``services`` extra: ``pip install fastmcp``).
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from pen_stack.agent import pen_agent, tools
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
from fastmcp import FastMCP
|
|
16
|
+
except ImportError as e: # pragma: no cover - services extra optional
|
|
17
|
+
raise ImportError("fastmcp not installed: pip install 'pen-stack[services]'") from e
|
|
18
|
+
|
|
19
|
+
mcp = FastMCP("pen-stack")
|
|
20
|
+
|
|
21
|
+
# register each validated tool (the same functions the in-process agent and the eval harness use)
|
|
22
|
+
mcp.tool()(tools.writability)
|
|
23
|
+
mcp.tool()(tools.reachable_writers)
|
|
24
|
+
mcp.tool()(tools.writer_axes)
|
|
25
|
+
mcp.tool()(tools.plan_write)
|
|
26
|
+
mcp.tool()(tools.ask_literature)
|
|
27
|
+
mcp.tool()(tools.multiplex_translocation_risk) # WS-G1: multiplex translocation-risk screen
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@mcp.tool()
def plan_write_session(gene: str, intent: str, cargo_bp: int = 2000, ct: str = "k562",
                       payload_seq: str | None = None, mode: str = "automatic") -> dict:
    """PEN-Agent: grounded write-planning state machine (site -> writer -> cargo+polish -> off-target -> 3D).

    Every number is copied from a tool result with provenance; ungrounded steps degrade/refuse, never
    fabricate. Modes: automatic | guided | qa."""
    # Thin pass-through registered on the `mcp` server: all arguments are forwarded unchanged to
    # pen_agent.plan_write_session and its result is returned as-is.
    # NOTE(review): the docstring above is presumably published as the MCP tool description - confirm
    # before rewording it.
    return pen_agent.plan_write_session(gene, intent, cargo_bp=cargo_bp, ct=ct,
                                        payload_seq=payload_seq, mode=mode)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
if __name__ == "__main__": # pragma: no cover
|
|
42
|
+
mcp.run()
|