PyPI - pen-stack - Versions diffs - 3.1.0__py3-none-any.whl - Mend

pen-stack 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

pen_stack/__init__.py +2 -0
pen_stack/_resources.py +34 -0
pen_stack/adapt/__init__.py +14 -0
pen_stack/adapt/finetune.py +33 -0
pen_stack/adapt/ingest.py +86 -0
pen_stack/adapt/pipeline.py +101 -0
pen_stack/adapt/recalibrate.py +58 -0
pen_stack/adapt/report.py +130 -0
pen_stack/agent/__init__.py +1 -0
pen_stack/agent/guardrails.py +49 -0
pen_stack/agent/mcp_server.py +42 -0
pen_stack/agent/orchestrator.py +106 -0
pen_stack/agent/pen_agent.py +169 -0
pen_stack/agent/tools.py +130 -0
pen_stack/atlas/__init__.py +1 -0
pen_stack/atlas/build_wtkb.py +80 -0
pen_stack/atlas/crosslink.py +144 -0
pen_stack/atlas/expand.py +190 -0
pen_stack/atlas/schema.py +59 -0
pen_stack/atlas/scorecard.py +134 -0
pen_stack/atlas/universe.py +75 -0
pen_stack/atlas/variant_propose.py +155 -0
pen_stack/bridge/__init__.py +1 -0
pen_stack/bridge/activity.py +52 -0
pen_stack/bridge/cli.py +65 -0
pen_stack/bridge/fold_qc.py +53 -0
pen_stack/bridge/guide_qc.py +84 -0
pen_stack/bridge/ingest.py +139 -0
pen_stack/bridge/offtarget.py +133 -0
pen_stack/bridge/ortholog_screen.py +73 -0
pen_stack/bridge/pipeline.py +83 -0
pen_stack/cli.py +126 -0
pen_stack/data/__init__.py +1 -0
pen_stack/data/encode.py +84 -0
pen_stack/data/genome.py +71 -0
pen_stack/data/ingest_chromatin.py +119 -0
pen_stack/data/ingest_integration.py +112 -0
pen_stack/data/ingest_safety_annot.py +164 -0
pen_stack/data/ingest_trip.py +76 -0
pen_stack/mech/__init__.py +1 -0
pen_stack/mech/classify_atlas.py +71 -0
pen_stack/mech/whitelist.py +66 -0
pen_stack/monitor/__init__.py +1 -0
pen_stack/monitor/europepmc.py +32 -0
pen_stack/monitor/run.py +57 -0
pen_stack/monitor/triage.py +63 -0
pen_stack/planner/__init__.py +1 -0
pen_stack/planner/cargo.py +56 -0
pen_stack/planner/cargo_polish.py +146 -0
pen_stack/planner/delivery.py +32 -0
pen_stack/planner/multiplex.py +110 -0
pen_stack/planner/optimize.py +156 -0
pen_stack/planner/pipeline.py +86 -0
pen_stack/planner/report.py +26 -0
pen_stack/rag/__init__.py +1 -0
pen_stack/rag/index.py +53 -0
pen_stack/rag/llm.py +178 -0
pen_stack/rag/qa.py +105 -0
pen_stack/score/__init__.py +1 -0
pen_stack/score/recalibrate.py +77 -0
pen_stack/score/therapeutic.py +85 -0
pen_stack/server/__init__.py +1 -0
pen_stack/server/api.py +142 -0
pen_stack/ui/__init__.py +1 -0
pen_stack/ui/app.py +518 -0
pen_stack/validate/__init__.py +1 -0
pen_stack/validate/adapt_demo.py +69 -0
pen_stack/validate/agent_eval.py +117 -0
pen_stack/validate/blind_gsh_discovery.py +165 -0
pen_stack/validate/cargo_directionality.py +57 -0
pen_stack/validate/durability_baselines.py +150 -0
pen_stack/validate/forward_hypotheses.py +104 -0
pen_stack/validate/guide_qc_demo.py +58 -0
pen_stack/validate/intent_specification.py +82 -0
pen_stack/validate/paper3_benchmark.py +165 -0
pen_stack/validate/paper4_real_validation.py +144 -0
pen_stack/validate/paper4_validation.py +82 -0
pen_stack/validate/seq_vs_measured.py +134 -0
pen_stack/validate/within_locus_ranking.py +74 -0
pen_stack/validate/writer_recovery.py +86 -0
pen_stack/wgenome/__init__.py +1 -0
pen_stack/wgenome/chromatin_seq.py +83 -0
pen_stack/wgenome/durability.py +108 -0
pen_stack/wgenome/export_tracks.py +52 -0
pen_stack/wgenome/features.py +82 -0
pen_stack/wgenome/gsh_baseline.py +117 -0
pen_stack/wgenome/providers.py +245 -0
pen_stack/wgenome/safety.py +69 -0
pen_stack/wgenome/structure3d.py +168 -0
pen_stack/wgenome/writability.py +72 -0
pen_stack-3.1.0.dist-info/METADATA +451 -0
pen_stack-3.1.0.dist-info/RECORD +96 -0
pen_stack-3.1.0.dist-info/WHEEL +5 -0
pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
pen_stack-3.1.0.dist-info/top_level.txt +1 -0

pen_stack/data/ingest_trip.py ADDED Viewed

@@ -0,0 +1,76 @@
+"""TRIP durability supervision (Phase 1, Step 1.3).
+Ingests the Akhtar et al. 2013 TRIP data (GEO GSE49806 tet-O + GSE49807 mPGK; mouse mESC): each row is
+one integrated reporter at a genomic position with expression. Produces (position, expression level,
+silenced/expressed label) - the supervision for the conditional chromatin-context durability model.
+The model learns `local chromatin features -> expression`; it never sees the coordinate. So TRIP being
+mouse is fine: attach mouse (mES) chromatin features at these positions, train the function, then apply
+it to a human epigenome (the headline function-transfer test).
+"""
+from __future__ import annotations
+import argparse
+import gzip
+from pathlib import Path
+import numpy as np
+import pandas as pd
def load_trip(txt_gz: str, promoter: str) -> pd.DataFrame:
    """Load one gzipped, tab-separated TRIP table into a tidy per-integration frame.

    Robust to both TRIP schemas: GSE49807 (plain) and GSE49806 (leading '#' comment + multi-Dox
    columns; we use the 100 ng full-induction normalization/expression pair). Column names are
    matched case-insensitively after stripping surrounding whitespace.

    Args:
        txt_gz: Path to the gzip-compressed TSV.
        promoter: Label written to the ``promoter`` column of every returned row.

    Returns:
        DataFrame with columns ``chrom`` (str), ``pos`` (int), ``norm_counts``, ``expr_counts``
        (numeric) and ``promoter``. A row is dropped when its chromosome is missing or when its
        position / counts are missing or non-numeric.

    Raises:
        ValueError: If any of the four required columns cannot be located.
    """
    with gzip.open(txt_gz, "rt") as fh:
        raw = pd.read_csv(fh, sep="\t", comment="#", dtype=str)
    cols = {c.lower().strip(): c for c in raw.columns}
    chrom_c = cols.get("chromosome")
    pos_c = cols.get("position")
    # prefer the 100 ng columns (GSE49806); fall back to the plain names (GSE49807)
    norm_c = cols.get("normalization_counts_100ng_1") or cols.get("normalization_counts")
    expr_c = cols.get("expression_counts_100ng_1") or cols.get("expression_counts")
    if not all([chrom_c, pos_c, norm_c, expr_c]):
        raise ValueError(f"{txt_gz}: missing expected columns; have {list(raw.columns)[:8]}")
    df = pd.DataFrame({
        # keep missing chromosomes as NaN here: casting to str first would turn them into the
        # literal "nan", which dropna() below cannot remove
        "chrom": raw[chrom_c],
        "pos": pd.to_numeric(raw[pos_c], errors="coerce"),
        "norm_counts": pd.to_numeric(raw[norm_c], errors="coerce"),
        "expr_counts": pd.to_numeric(raw[expr_c], errors="coerce"),
    }).dropna()
    df["chrom"] = df["chrom"].astype(str)
    df["pos"] = df["pos"].astype(int)
    df["promoter"] = promoter
    return df
def assemble(files: dict[str, str], out_parquet: str, silenced_quantile: float = 0.25) -> pd.DataFrame:
    """Combine per-promoter TRIP tables, derive expression + silencing labels, and write parquet.

    Args:
        files: Mapping ``promoter label -> path`` of gzipped TRIP tables (each read by ``load_trip``).
        out_parquet: Destination parquet path; its parent directory is created if needed.
        silenced_quantile: Per-promoter expression quantile at or below which an integration is
            labelled silenced.

    Returns:
        The concatenated frame with added columns ``expression``
        (``log2((expr_counts + 1) / (norm_counts + 1))``), ``silenced`` and ``stable``
        (the negation of ``silenced``).

    Raises:
        ValueError: If ``files`` is empty.
    """
    if not files:
        # pd.concat([]) would raise an opaque "No objects to concatenate"
        raise ValueError("no TRIP input files to assemble (none given or none exist)")
    parts = [load_trip(path, prom) for prom, path in files.items()]
    df = pd.concat(parts, ignore_index=True)
    # normalized expression (expression per normalization read), log scale
    df["expression"] = np.log2((df["expr_counts"] + 1) / (df["norm_counts"] + 1))
    # silenced/expressed: low tail of expression flagged silenced (per promoter, to control for
    # promoter strength). One threshold per promoter, broadcast back onto the rows.
    thr_by_promoter = df.groupby("promoter")["expression"].quantile(silenced_quantile)
    df["silenced"] = df["expression"] <= df["promoter"].map(thr_by_promoter)
    df["stable"] = ~df["silenced"]
    Path(out_parquet).parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(out_parquet, index=False)
    return df
def main() -> None:
    """CLI entry point: assemble the available TRIP tables and print a short summary.

    Input paths that do not exist are skipped with a warning on stderr. If none of the inputs
    exists the parser reports an error and exits instead of failing later inside ``assemble``.
    """
    import sys

    ap = argparse.ArgumentParser()
    ap.add_argument("--teto", default="/data/external/trip/GSE49806_S2.txt.gz")
    ap.add_argument("--mpgk", default="/data/external/trip/GSE49807_S3.txt.gz")
    ap.add_argument("--out", default="/data/features/trip_mesc.parquet")
    a = ap.parse_args()
    candidates = {"tetO": a.teto, "mPGK": a.mpgk}
    files = {k: v for k, v in candidates.items() if Path(v).exists()}
    for prom, path in candidates.items():
        if prom not in files:
            print(f"warning: {prom} input not found, skipping: {path}", file=sys.stderr)
    if not files:
        ap.error("none of the TRIP input files exist; pass --teto and/or --mpgk")
    df = assemble(files, a.out)
    print(f"TRIP integrations: {len(df)}  promoters={df['promoter'].value_counts().to_dict()}")
    print(f"expression range (log2): [{df['expression'].min():.2f}, {df['expression'].max():.2f}]  "
          f"~{2**(df['expression'].max()-df['expression'].min()):.0f}-fold")
    print(f"silenced={int(df['silenced'].sum())} stable={int(df['stable'].sum())}")
    print(f"chroms: {sorted(df['chrom'].unique())[:6]}...  (mouse build)")
if __name__ == "__main__":
    main()

pen_stack/mech/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """pen_stack.mech - see PEN-STACK v3.0 program doc."""

pen_stack/mech/classify_atlas.py ADDED Viewed

@@ -0,0 +1,71 @@
+"""Mechanism-aware classification at scale (Phase 2, Step 2.2).
+Run the audited Pfam-whitelist classifier over the expanded Writer Atlas. For every system, derive a
+``mech_pred`` bucket + ``mech_conf`` *independently* from its Pfam domain architecture (homology), then
+compare against the inherited/audited ``mechanism_bucket`` - keeping homology and mechanism distinct, as
+the program requires. Low-confidence / conflicting / disagreeing calls are written to a review queue and
+flagged, never hidden.
+Inputs : pen_stack/atlas/atlas.parquet, the 18-family whitelist.
+Outputs: atlas.parquet updated with mech_pred / mech_conf / mech_basis / mech_agrees,
+         out/mech_review_queue.csv.
+"""
+from __future__ import annotations
+from pathlib import Path
+import pandas as pd
+from pen_stack.mech.whitelist import PfamWhitelist
+_ROOT = Path(__file__).resolve().parents[2]
+_ATLAS = _ROOT / "pen_stack" / "atlas" / "atlas.parquet"
+_QUEUE = _ROOT / "out" / "mech_review_queue.csv"
def classify_atlas(atlas_parquet: str | Path = _ATLAS, out: str | Path = _ATLAS,
                   queue: str | Path = _QUEUE) -> pd.DataFrame:
    """Classify every atlas row from its Pfam signature and write the atlas + review queue.

    Adds ``mech_pred`` / ``mech_conf`` / ``mech_basis`` (from ``PfamWhitelist.classify``),
    ``mech_agrees`` (comparison with ``mechanism_bucket``) and ``mech_class_version``.

    Args:
        atlas_parquet: Atlas parquet to read.
        out: Parquet path the updated atlas is written to (defaults to overwriting the input).
        queue: CSV path for the review queue.

    Returns:
        The updated atlas DataFrame.
    """
    atlas = pd.read_parquet(atlas_parquet)
    wl = PfamWhitelist()

    def _domains(sig) -> list:
        # None or a scalar NaN means "no domain annotation"; list(nan) would raise TypeError
        if sig is None or isinstance(sig, float):
            return []
        return list(sig)

    calls = [wl.classify(_domains(s)) for s in atlas["pfam_signature"]]
    atlas["mech_pred"] = [c.bucket for c in calls]
    atlas["mech_conf"] = [c.confidence for c in calls]
    atlas["mech_basis"] = [c.basis for c in calls]
    # agreement with the inherited/audited mechanism label (None where one side is missing)
    atlas["mech_agrees"] = [
        (mp == mb) if (mp is not None and pd.notna(mb)) else None
        for mp, mb in zip(atlas["mech_pred"], atlas["mechanism_bucket"])
    ]
    atlas["mech_class_version"] = wl.version
    # review queue: no domain evidence, conflicting evidence, or disagreement with the audited label
    flag = (
        atlas["mech_conf"].isin(["none", "conflicting"])
        | atlas["mech_agrees"].eq(False)   # explicit False (disagreement), not NaN
    )
    q = atlas.loc[flag, ["representative_system", "family", "pfam_signature",
                         "mechanism_bucket", "mech_pred", "mech_conf", "mech_basis",
                         "mech_agrees", "confidence"]]
    Path(queue).parent.mkdir(parents=True, exist_ok=True)
    q.to_csv(queue, index=False)
    # create the output directory too, so a custom `out` location behaves like `queue`
    Path(out).parent.mkdir(parents=True, exist_ok=True)
    atlas.to_parquet(out, index=False)
    return atlas
def core_agreement(atlas: pd.DataFrame) -> dict:
    """Summarise how often ``mech_pred`` equals ``mechanism_bucket`` on curated-core rows.

    Only rows with ``entry_kind == "curated_core"`` are considered, and only those with a
    non-null ``mech_pred`` are scored. Returns the counts ``n_core``, ``n_scored``, ``n_agree``
    and ``agreement`` (the agreeing fraction rounded to 4 places, or None when nothing is scored).
    """
    is_core = atlas["entry_kind"] == "curated_core"
    core_rows = atlas[is_core]
    with_call = core_rows[core_rows["mech_pred"].notna()]
    n_scored = len(with_call)
    n_agree = int(with_call["mech_pred"].eq(with_call["mechanism_bucket"]).sum())
    if n_scored:
        rate = round(n_agree / n_scored, 4)
    else:
        rate = None
    return {"n_core": len(core_rows), "n_scored": n_scored, "n_agree": n_agree,
            "agreement": rate}
if __name__ == "__main__":  # pragma: no cover
    # Script mode: classify with the default paths, then print a short report.
    result = classify_atlas()
    conf_counts = result["mech_conf"].value_counts()
    print("mech_conf distribution:\n", conf_counts)
    print("\ncore agreement:", core_agreement(result))
    # same criterion as the review queue: weak/conflicting evidence or explicit disagreement
    weak_evidence = result["mech_conf"].isin(["none", "conflicting"])
    disagrees = result["mech_agrees"].eq(False)
    print("\nreview queue rows:", int((weak_evidence | disagrees).sum()))

pen_stack/mech/whitelist.py ADDED Viewed

@@ -0,0 +1,66 @@
+"""The InterPro-audited 18-family Pfam whitelist (imported from genome-atlas v1.2.1).
+Tier-A of MECH-CLASS: maps a Pfam domain architecture to a mechanism bucket
+(``DSB_NUCLEASE`` / ``DSB_FREE_TRANSEST_RECOMBINASE`` / ``TRANSPOSASE``) using domain presence plus
+*composite co-occurrence rules* (e.g. Cas9 requires >=2 of its 3 signature domains; IS110 requires
+both PF01548 and PF02371). This is the audited backbone the program carries forward (Section 9); the retired
+ESM-2 "PEN-DISCOVER" head is not used - domain evidence is the load-bearing mechanism signal.
+The source YAML ``pfam_whitelist.yaml`` is the genome-atlas asset, accessions verified against InterPro
+on 2026-04-22 (v1.2.1 corrected three v1.2.0 accession errors).
+"""
+from __future__ import annotations
+from collections import Counter
+from dataclasses import dataclass
+from pathlib import Path
+import yaml
_WL_PATH = Path(__file__).resolve().parent / "pfam_whitelist.yaml"
# Composite architectures: a call is only "composite-grade" (highest confidence) when the required
# co-occurring domains are all present. Derived from the whitelist co_occurs_with fields.
# Each rule fires when at least `min` accessions out of `of` are present in the signature.
_COMPOSITES = {
    "Cas9": {
        "min": 2,
        "of": {"PF13395", "PF18541", "PF16595"},
        "bucket": "DSB_NUCLEASE",
    },
    "IS110_bridge": {
        "min": 2,
        "of": {"PF01548", "PF02371"},
        "bucket": "DSB_FREE_TRANSEST_RECOMBINASE",
    },
}
@dataclass(frozen=True)
class MechCall:
    """Immutable result of one whitelist classification."""

    # mechanism bucket, or None if no whitelisted domain present
    bucket: str | None
    # one of: composite | single | conflicting | none
    confidence: str
    # human-readable evidence trail
    basis: str
    # whitelisted accessions that fired
    matched: tuple[str, ...]
class PfamWhitelist:
    """Accession -> mechanism-bucket lookup loaded from the whitelist YAML, plus the classifier."""

    def __init__(self, path: str | Path = _WL_PATH):
        """Read the ``domains`` list from the YAML at ``path`` and index it by accession."""
        text = Path(path).read_text(encoding="utf-8")
        domains = yaml.safe_load(text)["domains"]
        self.bucket_of: dict[str, str] = {dom["accession"]: dom["mechanism_bucket"] for dom in domains}
        self.name_of: dict[str, str] = {dom["accession"]: dom.get("name", "") for dom in domains}
        self.version = "1.2.1"
    def classify(self, pfam_signature) -> MechCall:
        """Mechanism from a Pfam domain set - independent of any inherited/family label.

        Accessions are stringified and stripped; blanks are ignored. Returns a ``none`` call when
        no accession is whitelisted, a ``composite`` call for the first composite rule that is
        satisfied, otherwise ``single`` or ``conflicting`` depending on how many buckets the
        whitelisted hits map to.
        """
        stripped = (str(acc).strip() for acc in (pfam_signature or []))
        sig = {acc for acc in stripped if acc}
        hits = sorted(acc for acc in sig if acc in self.bucket_of)
        matched = tuple(hits)
        if not matched:
            return MechCall(None, "none", "no whitelisted Pfam domain present", ())
        # composite rule wins (most specific, highest confidence)
        for name, rule in _COMPOSITES.items():
            present = sig.intersection(rule["of"])
            if len(present) < rule["min"]:
                continue
            listing = ",".join(sorted(present))
            basis = f"{name}: {len(present)}/{len(rule['of'])} signature domains ({listing})"
            return MechCall(rule["bucket"], "composite", basis, matched)
        # no composite fired: tally buckets over the whitelisted hits (in sorted-accession order)
        buckets = Counter(self.bucket_of[acc] for acc in hits)
        top = buckets.most_common(1)[0][0]
        if len(buckets) == 1:
            return MechCall(top, "single",
                            f"single-bucket domain evidence: {','.join(hits)}", matched)
        return MechCall(top, "conflicting",
                        f"mixed domain evidence: {dict(buckets)}", matched)

pen_stack/monitor/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """pen_stack.monitor - see PEN-STACK v3.0 program doc."""

pen_stack/monitor/europepmc.py ADDED Viewed

@@ -0,0 +1,32 @@
+"""Europe PMC client for PEN-MONITOR (Phase 2, Step 2.7).
+Europe PMC is the right primary source: open REST API, full-text + preprints, no licence friction.
+This module only *fetches* - triage + queueing live in triage.py / run.py.
+"""
+from __future__ import annotations
+import time
+import urllib.parse
+import urllib.request
+from io import BytesIO
+import json
+EPMC = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
def search(query: str, since_date: str | None = None, page_size: int = 100,
           timeout: int = 30, retries: int = 3) -> list[dict]:
    """Search Europe PMC and return the ``resultList.result`` entries of the first page.

    Args:
        query: Europe PMC query string.
        since_date: Optional ``YYYY-MM-DD``; when given, ``FIRST_PDATE:[since_date TO *]`` is
            AND-ed onto the query.
        page_size: Value sent as ``pageSize``.
        timeout: Per-request timeout in seconds.
        retries: Maximum number of attempts. The wait before attempt ``k`` (0-based, ``k >= 1``)
            is ``2 * k`` seconds; there is no wait after the final failure.

    Returns:
        The list of result dicts (empty when the response carries none).

    Raises:
        RuntimeError: If every attempt fails; the last underlying exception is chained as the cause.
    """
    q = query if not since_date else f"{query} AND FIRST_PDATE:[{since_date} TO *]"
    params = {"query": q, "format": "json", "pageSize": page_size, "resultType": "core"}
    url = EPMC + "?" + urllib.parse.urlencode(params)
    last: Exception | None = None
    for attempt in range(retries):
        if attempt:
            # back off only between attempts, never after the last one
            time.sleep(2 * attempt)
        try:
            with urllib.request.urlopen(url, timeout=timeout) as r:
                data = json.loads(r.read())
            # tolerate a missing or null resultList/result
            return (data.get("resultList") or {}).get("result") or []
        except Exception as e:  # noqa: BLE001 - network best-effort
            last = e
    raise RuntimeError(f"Europe PMC search failed for {query!r}: {last}") from last

pen_stack/monitor/run.py ADDED Viewed

@@ -0,0 +1,57 @@
+"""PEN-MONITOR orchestrator (Phase 2, Step 2.7) - the Europe PMC living-database engine.
+Poll Europe PMC for every writer-family query, triage each hit into a candidate row (always cited),
+de-duplicate, and write a human-reviewed curation queue. The atlas is **never** auto-edited; accepted
+entries flow into the WT-KB/atlas with confidence=inferred only after a human accepts them.
+Back-test: with ``back_test=True`` and a date window covering March 2026, the engine must surface the
+known recent writer ISPpu10 (Europe PMC PPR1218813) into the queue - the pre-registered success check.
+"""
+from __future__ import annotations
+from pathlib import Path
+import pandas as pd
+from pen_stack.monitor.europepmc import search
+from pen_stack.monitor.triage import _load_cues, triage_hit
+_OUT = Path(__file__).resolve().parents[2] / "out" / "monitor_queue.csv"
def run_monitor(since: str = "2026-01-01", page_size: int = 50, back_test: bool = False,
                out: str | Path = _OUT, cfg_path: str | Path | None = None) -> dict:
    """Poll Europe PMC for every configured query and write the de-duplicated curation queue.

    Args:
        since: Lower bound on first publication date, passed to ``search`` as ``since_date``.
        page_size: Page size passed to ``search``.
        back_test: When true, also report whether ISPpu10 / PPR1218813 appears in the queue.
        out: CSV path for the queue; the parent directory is created if needed.
        cfg_path: Optional cue/query config path; the default config is used when falsy.

    Returns:
        Dict with ``since``, ``n_hits``, ``n_candidates``, ``queue`` (the output path),
        ``failed_queries`` (terms whose search raised ``RuntimeError``) and, with ``back_test``,
        ``isppu10_found``.
    """
    cfg = _load_cues(cfg_path) if cfg_path else _load_cues()
    rows, n_hits, failed = [], 0, []
    for q in cfg["queries"]:
        try:
            hits = search(q["terms"], since_date=since, page_size=page_size)
        except RuntimeError:
            # best-effort: keep polling the other queries, but record the failure for the caller
            failed.append(q["terms"])
            continue
        n_hits += len(hits)
        rows.extend(triage_hit(h, default_family=q.get("family"), cfg=cfg) for h in hits)
    queue = pd.DataFrame(rows)
    if not queue.empty:
        # every queued candidate must carry a citation (source_id or doi). De-duplicate on that
        # citation key: de-duplicating on source_id alone would treat all DOI-only rows (null
        # source_id) as duplicates of each other and keep just one of them.
        cite_key = queue["source_id"].where(queue["source_id"].notna(), queue["doi"])
        keep = cite_key.notna() & ~cite_key.duplicated()
        queue = queue[keep].reset_index(drop=True)
    Path(out).parent.mkdir(parents=True, exist_ok=True)
    queue.to_csv(out, index=False)
    res = {"since": since, "n_hits": n_hits, "n_candidates": int(len(queue)), "queue": str(out),
           "failed_queries": failed}
    if back_test:
        found = False
        if not queue.empty:
            blob = (queue["title"].fillna("") + " " + queue["source_id"].fillna("")).str.lower()
            found = bool(blob.str.contains("isppu10").any() or
                         (queue["source_id"] == "PPR1218813").any())
        res["isppu10_found"] = found
    return res
if __name__ == "__main__":  # pragma: no cover
    r = run_monitor(since="2026-01-01", back_test=True)
    print(r)

pen_stack/monitor/triage.py ADDED Viewed

@@ -0,0 +1,63 @@
+"""Triage Europe PMC hits into candidate writer-system rows (Phase 2, Step 2.7).
+Grounded extraction: pull candidate fields (family, organism cue, human-cell evidence) from a hit's
+title/abstract using documented keyword cues, **always** carrying the source citation (Europe PMC id +
+DOI). An optional LLM pass (Ollama/Qwen via litellm) can enrich the abstract extraction, but it never
+invents a citation and never auto-edits the atlas - its output is just another candidate for the queue.
+The rule-based path is the reliable default (works offline, fully reproducible, satisfies the back-test).
+"""
+from __future__ import annotations
+import re
+from pathlib import Path
+import yaml
+_CFG = Path(__file__).resolve().parents[2] / "configs" / "monitor_queries.yaml"
def _load_cues(path: str | Path = _CFG) -> dict:
    """Read the UTF-8 YAML file at ``path`` and return the parsed document.

    NOTE(review): the default resolves relative to this source file (two levels up, then
    ``configs/``) - confirm that location exists in an installed package.
    """
    cfg_text = Path(path).read_text(encoding="utf-8")
    return yaml.safe_load(cfg_text)
def classify_family(text: str, cfg: dict) -> tuple[str | None, list[str]]:
    """Best-guess writer family from keyword cues; returns (family, matched_cues).

    Matching is a case-insensitive substring test of each cue in ``cfg["family_cues"][family]``
    against ``text``. The family with the most matching cues wins; on a tie the family listed
    first is kept. Matched cues are returned exactly as written in the config. Empty cues are
    ignored. Returns ``(None, [])`` when nothing matches.
    """
    t = text.lower()
    best, best_hits = None, []
    for fam, cues in (cfg.get("family_cues") or {}).items():
        # the text is lower-cased, so the cue must be too, or cues such as "IS110" never match
        hits = [c for c in (cues or []) if c and c.lower() in t]
        if len(hits) > len(best_hits):
            best, best_hits = fam, hits
    return best, best_hits
def has_human_cell_evidence(text: str, cfg: dict) -> bool:
    """Return True when any cue in ``cfg["human_cell_cues"]`` occurs in ``text``.

    The test is a case-insensitive substring match; empty cues are ignored and a missing or
    empty cue list yields False.
    """
    t = text.lower()
    # lower-case the cue as well: the text is lower-cased, so "HEK293T" could otherwise never match
    return any(cue.lower() in t for cue in (cfg.get("human_cell_cues") or []) if cue)
# coarse "Genus species" cue: a capitalised word followed by a lower-case word of 3+ letters
_ORG_RE = re.compile(r"\b([A-Z][a-z]+ [a-z]{3,})\b")
def triage_hit(hit: dict, default_family: str | None = None, cfg: dict | None = None) -> dict:
    """Turn one Europe PMC hit into a candidate row for the curation queue.

    The family and human-cell flags are derived from ``"<title>. <abstract>"``; the organism cue
    is the first ``_ORG_RE`` match in the abstract only. The row always carries the hit's
    ``id`` / ``source`` / ``doi`` and is marked ``inferred`` / ``pending_review``. When ``cfg``
    is falsy the default cue config is loaded.
    """
    if not cfg:
        cfg = _load_cues()
    title = hit.get("title") or ""
    abstract = hit.get("abstractText") or ""
    combined = f"{title}. {abstract}"
    family, matched = classify_family(combined, cfg)
    organism = _ORG_RE.search(abstract)

    row = {}
    row["candidate_family"] = family or default_family
    row["matched_cues"] = ";".join(matched)
    row["organism_cue"] = organism.group(1) if organism else None
    row["human_cell_evidence"] = has_human_cell_evidence(combined, cfg)
    row["title"] = title[:300]
    row["source_id"] = hit.get("id")
    row["source_db"] = hit.get("source")
    row["doi"] = hit.get("doi")
    row["pub_date"] = hit.get("firstPublicationDate")
    # candidate - stays inferred until a human reviews/measures
    row["confidence"] = "inferred"
    # NEVER auto-accepted into the atlas
    row["status"] = "pending_review"
    return row

pen_stack/planner/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """pen_stack.planner - see PEN-STACK v3.0 program doc."""

pen_stack/planner/cargo.py ADDED Viewed

@@ -0,0 +1,56 @@
+"""Cargo / donor design (Phase 3, Step 3.2).
+Assemble a donor construct spec for a chosen writer + site: insulators (protect durability), promoter +
+polyA, codon optimisation flag for the host cell type, and a size check against the writer's
+deliverability/cargo class. For bridge/seek writers, attach the Phase-1.5 off-target prediction *if the
+bridge engine is available* - otherwise the field is marked pending (Phase 1.5), so the Planner runs
+end-to-end now and the off-target annotation drops in once Phase 1.5 lands.
+We design at the level of construct *elements + sizes* (the payload sequence is the user's CDS/regulatory
+cassette); element lengths are nominal, documented constants.
+"""
+from __future__ import annotations
# nominal element sizes (bp) for the assembled donor; documented, not hidden
_ELEMENTS = {"insulator_5": 250, "promoter": 600, "polyA": 250, "insulator_3": 250}
def _bridge_offtarget(writer_family: str, site: tuple) -> dict:
    """Optional Phase-1.5 hook. Returns the off-target prediction if the bridge engine exists, else pending.

    If ``pen_stack.bridge.offtarget.predict_offtargets`` cannot be imported, a
    ``{"status": "pending_phase_1_5", ...}`` placeholder is returned instead.
    """
    try:
        from pen_stack.bridge.offtarget import predict_offtargets  # Phase 1.5 deliverable
    except Exception:  # noqa: BLE001 - engine not built yet
        return {"status": "pending_phase_1_5", "note": "bridge off-target engine ships in Phase 1.5"}
    return predict_offtargets(writer_family, site)
def design_cargo(payload_bp: int, writer_row: dict, site: tuple, ct: str,
                 payload_seq: str | None = None) -> dict:
    """Assemble a donor construct spec. writer_row needs: family, cargo_capacity_bp, deliv_class.

    The assembled size is ``payload_bp`` plus the nominal ``_ELEMENTS`` sizes. ``size_ok`` is
    True when the capacity is unknown or the assembled size does not exceed it. A capacity of
    ``None`` or float NaN (e.g. a missing value from a DataFrame row) counts as unknown and is
    reported as ``None``.

    For ``bridge_IS110`` / ``seek_IS1111`` writers the off-target hook result is attached under
    ``offtargets``. If `payload_seq` is given, attach the WS-D Cargo Polish sequence-risk scan
    (cargo_durability_risk + actionable suggestions) - the locus model scores the site, Cargo
    Polish scores the insert.

    Args:
        payload_bp: Length of the user payload in bp.
        writer_row: Mapping describing the writer (see above).
        site: Target site, forwarded unchanged to the off-target hook.
        ct: Host cell type label, reported under ``host``.
        payload_seq: Optional payload sequence for the Cargo Polish scan.

    Returns:
        The construct spec dict.
    """
    import math

    fam = writer_row.get("family")
    cap = writer_row.get("cargo_capacity_bp")
    if isinstance(cap, float) and math.isnan(cap):
        # NaN compares False against everything, which would wrongly report size_ok=False
        cap = None
    elements = dict(_ELEMENTS)
    assembled_bp = int(payload_bp) + sum(elements.values())
    size_ok = (cap is None) or bool(assembled_bp <= cap)
    out = {
        "host": ct,
        "payload_bp": int(payload_bp),
        "elements": elements,                       # insulators + promoter + polyA
        "assembled_bp": assembled_bp,
        "codon_optimised": True,
        "writer_family": fam,
        "cargo_capacity_bp": cap,
        "size_ok": size_ok,
        "deliverability": writer_row.get("deliv_class"),
    }
    if fam in {"bridge_IS110", "seek_IS1111"}:
        out["offtargets"] = _bridge_offtarget(fam, site)
    if payload_seq:
        from pen_stack.planner.cargo_polish import scan_cargo
        out["cargo_polish"] = scan_cargo(payload_seq)
    return out

pen_stack/planner/cargo_polish.py ADDED Viewed

@@ -0,0 +1,146 @@
+"""Cargo Polish - cargo-sequence durability-risk scan (v3.1, WS-D).
+The locus model scores WHERE to write; this scores WHAT is written. It scans the insert (the user's
+cassette sequence) for known sequence triggers of transgene silencing/instability and emits a
+`cargo_durability_risk` score in [0,1] with a band and, for every flag, a concrete remedy.
+This is a HEURISTIC flag, not a supervised silencing predictor: it catches documented sequence triggers
+(CpG-island density -> de novo methylation; GC extremes; cryptic splice consensus; strong mRNA secondary
+structure; known silencer motifs), not all silencing causes. Thresholds are documented constants
+(configs/cargo_polish.yaml) from the silencing literature. ViennaRNA (MFE) is optional and degrades
+gracefully (the structure term is skipped, noted) so the scan runs anywhere; the other terms are pure-Python.
+Acceptance (prereg/ws_d.yaml): reproduces established directionality - high-CpG bacterial-style cassettes
+score above CpG-depleted / insulator-flanked constructs on a small curated set - and every flag carries a
+concrete suggestion.
+"""
+from __future__ import annotations
+import re
+from functools import lru_cache
+import yaml
@lru_cache(maxsize=1)
def _cfg() -> dict:
    """Load and parse ``configs/cargo_polish.yaml`` via the package resource helper (cached)."""
    from pen_stack._resources import resource

    cfg_text = resource("configs/cargo_polish.yaml").read_text(encoding="utf-8")
    return yaml.safe_load(cfg_text)
+def _clean(seq: str) -> str:
+    return re.sub(r"[^ACGT]", "", (seq or "").upper())
def gc_fraction(seq: str) -> float:
    """Fraction of G + C among the cleaned (A/C/G/T only) bases; 0.0 for an empty sequence."""
    bases = _clean(seq)
    if not bases:
        return 0.0
    n_gc = bases.count("G") + bases.count("C")
    return n_gc / len(bases)
def cpg_islands(seq: str) -> list[dict]:
    """Sliding-window CpG-island scan (Gardiner-Garden & Frommer style criteria).

    A window of ``window_bp`` bases (stepped by a quarter window, at least 1) is flagged when its
    observed/expected CpG ratio exceeds ``obs_exp_min`` AND its GC fraction exceeds ``gc_min``.
    Flagged windows that overlap or abut the previous flagged window count as the same island.

    Returns ``[]`` when no window is flagged, otherwise a one-element list
    ``[{"n_islands": <count>, "windows": [{"start", "obs_exp", "gc"}, ...]}]``.
    """
    params = _cfg()["cpg_island"]
    bases = _clean(seq)
    width = params["window_bp"]
    stride = max(1, width // 4)

    flagged = []
    for start in range(0, max(1, len(bases) - width + 1), stride):
        window = bases[start:start + width]
        if len(window) < width:
            # only reachable when the whole sequence is shorter than one window
            break
        n_c = window.count("C")
        n_g = window.count("G")
        gc = (n_c + n_g) / width
        expected = (n_c * n_g) / width if n_c and n_g else 0.0
        ratio = (window.count("CG") / expected) if expected else 0.0
        if ratio > params["obs_exp_min"] and gc > params["gc_min"]:
            flagged.append({"start": start, "obs_exp": round(ratio, 3), "gc": round(gc, 3)})

    # merge overlapping windows into island count: a window opens a new island only when it
    # starts strictly after the end of the previous flagged window
    starts = [win["start"] for win in flagged]
    previous_ends = [-1] + [st + width for st in starts[:-1]]
    n_islands = sum(1 for prev_end, st in zip(previous_ends, starts) if st > prev_end)
    if not n_islands:
        return []
    return [{"n_islands": n_islands, "windows": flagged}]
def cryptic_splice_sites(seq: str) -> dict:
    """Count non-overlapping matches of the configured donor and acceptor splice motifs.

    Returns ``{"donor": n, "acceptor": m, "total": n + m}`` for the cleaned sequence.
    """
    motifs = _cfg()["cryptic_splice"]
    bases = _clean(seq)
    n_donor = sum(1 for _ in re.finditer(motifs["donor_motif"], bases))
    n_acceptor = sum(1 for _ in re.finditer(motifs["acceptor_motif"], bases))
    return {"donor": n_donor, "acceptor": n_acceptor, "total": n_donor + n_acceptor}
def silencer_motifs(seq: str) -> list[dict]:
    """List the configured silencer motifs found in the cleaned sequence.

    One ``{"name", "count", "note"}`` entry per motif with at least one non-overlapping match,
    in config order.
    """
    bases = _clean(seq)
    found = []
    for motif in _cfg()["silencer_motifs"]["motifs"]:
        count = sum(1 for _ in re.finditer(motif["pattern"], bases))
        if count == 0:
            continue
        found.append({"name": motif["name"], "count": count, "note": motif["note"]})
    return found
def mfe_per_nt(seq: str) -> dict:
    """Minimum free energy (ViennaRNA) of the insert, total and per nucleotide.

    Returns ``{"available": False, "note": ...}`` when the cleaned sequence is shorter than 10
    bases or when the ``RNA`` module cannot be imported; otherwise
    ``{"available": True, "mfe": ..., "mfe_per_nt": ...}`` computed on the T->U converted sequence.
    """
    bases = _clean(seq)
    n_bases = len(bases)
    if n_bases < 10:
        return {"available": False, "note": "sequence too short"}
    try:
        import RNA
    except Exception:  # noqa: BLE001 - ViennaRNA only in the bio extra / VM image
        return {"available": False, "note": "ViennaRNA not installed (bio extra / VM image)"}
    compound = RNA.fold_compound(bases.replace("T", "U"))
    _structure, energy = compound.mfe()
    energy = float(energy)
    return {"available": True, "mfe": round(energy, 2), "mfe_per_nt": round(energy / n_bases, 4)}
def scan_cargo(seq: str) -> dict:
    """Aggregate the cargo durability-risk scan: score in [0,1], band, and per-flag concrete suggestions.

    Risk contributions (each adds one entry to ``flags`` with the configured suggestion):
      * CpG islands: ``n * risk_per_island``, capped at 0.5.
      * GC extremes: fixed ``risk`` when GC is below ``gc_low`` or above ``gc_high``. This applies
        to any non-empty cleaned sequence, including a pure A/T insert whose GC is exactly 0.
      * Cryptic splice sites: per-site increment (config ``risk_per_site``, default 0.05), capped
        at ``risk_per_site_capped``.
      * Silencer motifs: per-hit increment (config ``risk_per_motif``, default 0.07), capped at
        ``risk_per_motif_capped``.
      * Secondary structure: fixed ``risk`` when MFE/nt is available and below ``mfe_per_nt_warn``.

    The total is clipped to 1.0, rounded to 4 places, and banded with ``cfg["bands"]``.

    Returns:
        Dict with ``cargo_durability_risk``, ``band``, ``length_bp``, ``gc``, ``n_flags``,
        ``flags``, ``components`` and ``scope``.
    """
    cfg = _cfg()
    s = _clean(seq)
    flags, risk = [], 0.0
    sug = cfg["suggestions"]

    isl = cpg_islands(s)
    if isl:
        n = isl[0]["n_islands"]
        risk += min(0.5, n * cfg["cpg_island"]["risk_per_island"])
        flags.append({"category": "cpg_island", "detail": f"{n} CpG island(s)", "suggestion": sug["cpg_island"]})

    gc = gc_fraction(s)
    gc_cfg = cfg["gc_extremes"]
    # gate on "sequence is non-empty", not on gc being truthy: gc == 0.0 is a real (extreme) value
    if s and (gc < gc_cfg["gc_low"] or gc > gc_cfg["gc_high"]):
        risk += gc_cfg["risk"]
        flags.append({"category": "gc_extremes", "detail": f"GC={gc:.2f}", "suggestion": sug["gc_extremes"]})

    cs = cryptic_splice_sites(s)
    if cs["total"]:
        splice_cfg = cfg["cryptic_splice"]
        per_site = splice_cfg.get("risk_per_site", 0.05)
        risk += min(splice_cfg["risk_per_site_capped"], per_site * cs["total"])
        flags.append({"category": "cryptic_splice", "detail": f"{cs['total']} splice consensus site(s)",
                      "suggestion": sug["cryptic_splice"]})

    sm = silencer_motifs(s)
    if sm:
        motif_cfg = cfg["silencer_motifs"]
        per_motif = motif_cfg.get("risk_per_motif", 0.07)
        risk += min(motif_cfg["risk_per_motif_capped"], per_motif * sum(h["count"] for h in sm))
        flags.append({"category": "silencer_motifs", "detail": ", ".join(h["name"] for h in sm),
                      "suggestion": sug["silencer_motifs"]})

    mfe = mfe_per_nt(s)
    if mfe.get("available") and mfe["mfe_per_nt"] < cfg["secondary_structure"]["mfe_per_nt_warn"]:
        risk += cfg["secondary_structure"]["risk"]
        flags.append({"category": "secondary_structure", "detail": f"MFE/nt={mfe['mfe_per_nt']}",
                      "suggestion": sug["secondary_structure"]})

    risk = round(min(1.0, risk), 4)
    b = cfg["bands"]
    band = "low" if risk < b["low"] else ("moderate" if risk < b["moderate"] else "high")
    return {"cargo_durability_risk": risk, "band": band, "length_bp": len(s),
            "gc": round(gc, 4), "n_flags": len(flags), "flags": flags,
            "components": {"cpg_islands": isl, "cryptic_splice": cs, "silencer_motifs": sm,
                           "secondary_structure": mfe},
            "scope": "heuristic sequence-trigger scan, not a supervised silencing predictor"}
if __name__ == "__main__":  # pragma: no cover
    import json
    demo = "CGCGCGCGGCGGCGCGCGGCGGCGCGCGGCGGCGCG" * 8
    print(json.dumps(scan_cargo(demo), indent=2, default=str))

pen_stack/planner/delivery.py ADDED Viewed

@@ -0,0 +1,32 @@
+"""Delivery recommendation (Phase 3, Step 3.3).
+Recommend a delivery modality from the total payload (writer effector + cargo) and the target cell type,
+using the documented rule table in configs/delivery_rules.yaml (no hidden constants).
+"""
+from __future__ import annotations
+from functools import lru_cache
+from pathlib import Path
+import yaml
+_CFG = Path(__file__).resolve().parents[2] / "configs" / "delivery_rules.yaml"
@lru_cache(maxsize=1)
def _rules(path: str | Path = _CFG) -> dict:
    """Read and parse the delivery-rule YAML at ``path``; the most recent result is cached.

    NOTE(review): the default resolves relative to this source file (two levels up, then
    ``configs/``) - confirm that location exists in an installed package.
    """
    rules_text = Path(path).read_text(encoding="utf-8")
    return yaml.safe_load(rules_text)
def recommend_delivery(effector_bp: int, cargo_bp: int, ct: str) -> dict:
    """Return {delivery, total_bp, rationale}. effector_bp ~= writer length_aa * 3.

    The first rule in ``cfg["rules"]`` (config order) whose ``max_total_bp`` is at least the total
    payload is chosen. If none fits, the ex-vivo fallback is used when ``ct`` is listed in
    ``ex_vivo_cell_types``, otherwise the in-vivo fallback.
    """
    cfg = _rules()
    total = int(effector_bp) + int(cargo_bp)

    fitting = next((rule for rule in cfg["rules"] if total <= rule["max_total_bp"]), None)
    if fitting is not None:
        limit = fitting["max_total_bp"]
        return {"delivery": fitting["delivery"], "total_bp": total,
                "rationale": f"total payload {total} bp <= {limit} bp"}

    if ct in cfg.get("ex_vivo_cell_types", []):
        fallback = cfg["ex_vivo_fallback"]
    else:
        fallback = cfg["in_vivo_fallback"]
    return {"delivery": fallback, "total_bp": total,
            "rationale": f"total payload {total} bp exceeds dual-AAV; cell type {ct}"}