pen-stack 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. pen_stack/__init__.py +2 -0
  2. pen_stack/_resources.py +34 -0
  3. pen_stack/adapt/__init__.py +14 -0
  4. pen_stack/adapt/finetune.py +33 -0
  5. pen_stack/adapt/ingest.py +86 -0
  6. pen_stack/adapt/pipeline.py +101 -0
  7. pen_stack/adapt/recalibrate.py +58 -0
  8. pen_stack/adapt/report.py +130 -0
  9. pen_stack/agent/__init__.py +1 -0
  10. pen_stack/agent/guardrails.py +49 -0
  11. pen_stack/agent/mcp_server.py +42 -0
  12. pen_stack/agent/orchestrator.py +106 -0
  13. pen_stack/agent/pen_agent.py +169 -0
  14. pen_stack/agent/tools.py +130 -0
  15. pen_stack/atlas/__init__.py +1 -0
  16. pen_stack/atlas/build_wtkb.py +80 -0
  17. pen_stack/atlas/crosslink.py +144 -0
  18. pen_stack/atlas/expand.py +190 -0
  19. pen_stack/atlas/schema.py +59 -0
  20. pen_stack/atlas/scorecard.py +134 -0
  21. pen_stack/atlas/universe.py +75 -0
  22. pen_stack/atlas/variant_propose.py +155 -0
  23. pen_stack/bridge/__init__.py +1 -0
  24. pen_stack/bridge/activity.py +52 -0
  25. pen_stack/bridge/cli.py +65 -0
  26. pen_stack/bridge/fold_qc.py +53 -0
  27. pen_stack/bridge/guide_qc.py +84 -0
  28. pen_stack/bridge/ingest.py +139 -0
  29. pen_stack/bridge/offtarget.py +133 -0
  30. pen_stack/bridge/ortholog_screen.py +73 -0
  31. pen_stack/bridge/pipeline.py +83 -0
  32. pen_stack/cli.py +126 -0
  33. pen_stack/data/__init__.py +1 -0
  34. pen_stack/data/encode.py +84 -0
  35. pen_stack/data/genome.py +71 -0
  36. pen_stack/data/ingest_chromatin.py +119 -0
  37. pen_stack/data/ingest_integration.py +112 -0
  38. pen_stack/data/ingest_safety_annot.py +164 -0
  39. pen_stack/data/ingest_trip.py +76 -0
  40. pen_stack/mech/__init__.py +1 -0
  41. pen_stack/mech/classify_atlas.py +71 -0
  42. pen_stack/mech/whitelist.py +66 -0
  43. pen_stack/monitor/__init__.py +1 -0
  44. pen_stack/monitor/europepmc.py +32 -0
  45. pen_stack/monitor/run.py +57 -0
  46. pen_stack/monitor/triage.py +63 -0
  47. pen_stack/planner/__init__.py +1 -0
  48. pen_stack/planner/cargo.py +56 -0
  49. pen_stack/planner/cargo_polish.py +146 -0
  50. pen_stack/planner/delivery.py +32 -0
  51. pen_stack/planner/multiplex.py +110 -0
  52. pen_stack/planner/optimize.py +156 -0
  53. pen_stack/planner/pipeline.py +86 -0
  54. pen_stack/planner/report.py +26 -0
  55. pen_stack/rag/__init__.py +1 -0
  56. pen_stack/rag/index.py +53 -0
  57. pen_stack/rag/llm.py +178 -0
  58. pen_stack/rag/qa.py +105 -0
  59. pen_stack/score/__init__.py +1 -0
  60. pen_stack/score/recalibrate.py +77 -0
  61. pen_stack/score/therapeutic.py +85 -0
  62. pen_stack/server/__init__.py +1 -0
  63. pen_stack/server/api.py +142 -0
  64. pen_stack/ui/__init__.py +1 -0
  65. pen_stack/ui/app.py +518 -0
  66. pen_stack/validate/__init__.py +1 -0
  67. pen_stack/validate/adapt_demo.py +69 -0
  68. pen_stack/validate/agent_eval.py +117 -0
  69. pen_stack/validate/blind_gsh_discovery.py +165 -0
  70. pen_stack/validate/cargo_directionality.py +57 -0
  71. pen_stack/validate/durability_baselines.py +150 -0
  72. pen_stack/validate/forward_hypotheses.py +104 -0
  73. pen_stack/validate/guide_qc_demo.py +58 -0
  74. pen_stack/validate/intent_specification.py +82 -0
  75. pen_stack/validate/paper3_benchmark.py +165 -0
  76. pen_stack/validate/paper4_real_validation.py +144 -0
  77. pen_stack/validate/paper4_validation.py +82 -0
  78. pen_stack/validate/seq_vs_measured.py +134 -0
  79. pen_stack/validate/within_locus_ranking.py +74 -0
  80. pen_stack/validate/writer_recovery.py +86 -0
  81. pen_stack/wgenome/__init__.py +1 -0
  82. pen_stack/wgenome/chromatin_seq.py +83 -0
  83. pen_stack/wgenome/durability.py +108 -0
  84. pen_stack/wgenome/export_tracks.py +52 -0
  85. pen_stack/wgenome/features.py +82 -0
  86. pen_stack/wgenome/gsh_baseline.py +117 -0
  87. pen_stack/wgenome/providers.py +245 -0
  88. pen_stack/wgenome/safety.py +69 -0
  89. pen_stack/wgenome/structure3d.py +168 -0
  90. pen_stack/wgenome/writability.py +72 -0
  91. pen_stack-3.1.0.dist-info/METADATA +451 -0
  92. pen_stack-3.1.0.dist-info/RECORD +96 -0
  93. pen_stack-3.1.0.dist-info/WHEEL +5 -0
  94. pen_stack-3.1.0.dist-info/entry_points.txt +3 -0
  95. pen_stack-3.1.0.dist-info/licenses/LICENSE +21 -0
  96. pen_stack-3.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,76 @@
1
+ """TRIP durability supervision (Phase 1, Step 1.3).
2
+
3
+ Ingests the Akhtar et al. 2013 TRIP data (GEO GSE49806 tet-O + GSE49807 mPGK; mouse mESC): each row is
4
+ one integrated reporter at a genomic position with expression. Produces (position, expression level,
5
+ silenced/expressed label) - the supervision for the conditional chromatin-context durability model.
6
+
7
+ The model learns `local chromatin features -> expression`; it never sees the coordinate. So TRIP being
8
+ mouse is fine: attach mouse (mES) chromatin features at these positions, train the function, then apply
9
+ it to a human epigenome (the headline function-transfer test).
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import gzip
15
+ from pathlib import Path
16
+
17
+ import numpy as np
18
+ import pandas as pd
19
+
20
+
21
def load_trip(txt_gz: str, promoter: str) -> pd.DataFrame:
    """Load one TRIP integration table into a tidy frame.

    Robust to both TRIP schemas: GSE49807 (plain) and GSE49806 (leading '#' comment + multi-Dox
    columns; we use the 100 ng full-induction normalization/expression pair).

    Args:
        txt_gz: path to the gzip-compressed, tab-separated TRIP table.
        promoter: label written to the ``promoter`` column of every returned row.

    Returns:
        DataFrame with columns ``chrom``, ``pos`` (int), ``norm_counts``, ``expr_counts`` and
        ``promoter``. Rows with a missing chromosome or a non-numeric position/count are dropped.

    Raises:
        ValueError: if the chromosome/position/count columns cannot all be located.
    """
    with gzip.open(txt_gz, "rt") as fh:
        raw = pd.read_csv(fh, sep="\t", comment="#", dtype=str)
    # case- and whitespace-insensitive header lookup -> original column name
    cols = {c.lower().strip(): c for c in raw.columns}
    chrom_c = cols.get("chromosome")
    pos_c = cols.get("position")
    norm_c = cols.get("normalization_counts_100ng_1") or cols.get("normalization_counts")
    expr_c = cols.get("expression_counts_100ng_1") or cols.get("expression_counts")
    if not all([chrom_c, pos_c, norm_c, expr_c]):
        raise ValueError(f"{txt_gz}: missing expected columns; have {list(raw.columns)[:8]}")
    df = pd.DataFrame({
        # Keep the raw column here: casting with astype(str) *before* dropna() would turn a
        # missing chromosome into the literal string "nan" and let the row slip through.
        "chrom": raw[chrom_c],
        "pos": pd.to_numeric(raw[pos_c], errors="coerce"),
        "norm_counts": pd.to_numeric(raw[norm_c], errors="coerce"),
        "expr_counts": pd.to_numeric(raw[expr_c], errors="coerce"),
    }).dropna()
    df["chrom"] = df["chrom"].astype(str)
    df["pos"] = df["pos"].astype(int)
    df["promoter"] = promoter
    return df
42
+
43
+
44
def assemble(files: dict[str, str], out_parquet: str, silenced_quantile: float = 0.25) -> pd.DataFrame:
    """Combine per-promoter TRIP tables, derive expression + silenced/stable labels, write parquet.

    Args:
        files: mapping ``promoter label -> path`` of TRIP tables (each read with ``load_trip``).
        out_parquet: destination parquet path; parent directories are created if needed.
        silenced_quantile: per-promoter expression quantile at or below which an integration is
            labelled silenced.

    Returns:
        The combined frame with added ``expression`` (log2), ``silenced`` and ``stable`` columns.

    Raises:
        ValueError: if ``files`` is empty.
    """
    if not files:
        # pd.concat([]) would otherwise fail with an opaque "No objects to concatenate"
        raise ValueError("no TRIP input files given; expected a non-empty {promoter: path} mapping")
    parts = [load_trip(path, prom) for prom, path in files.items()]
    df = pd.concat(parts, ignore_index=True)
    # normalized expression (expression per normalization read), log scale; +1 pseudocount on both
    df["expression"] = np.log2((df["expr_counts"] + 1) / (df["norm_counts"] + 1))
    # silenced/expressed: low tail of expression flagged silenced (per promoter, to control for
    # promoter strength)
    df["silenced"] = False
    for _prom, g in df.groupby("promoter"):
        thr = g["expression"].quantile(silenced_quantile)
        df.loc[g.index, "silenced"] = g["expression"] <= thr
    df["stable"] = ~df["silenced"]
    Path(out_parquet).parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(out_parquet, index=False)
    return df
58
+
59
+
60
def main() -> None:
    """CLI entry point: assemble the TRIP supervision table and print a short summary.

    Inputs that do not exist on disk are left out of the run. If neither input exists the command
    exits with an argparse usage error instead of failing inside pandas.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--teto", default="/data/external/trip/GSE49806_S2.txt.gz")
    ap.add_argument("--mpgk", default="/data/external/trip/GSE49807_S3.txt.gz")
    ap.add_argument("--out", default="/data/features/trip_mesc.parquet")
    a = ap.parse_args()
    candidates = {"tetO": a.teto, "mPGK": a.mpgk}
    files = {k: v for k, v in candidates.items() if Path(v).exists()}
    if not files:
        ap.error(f"no TRIP input file found (checked {', '.join(candidates.values())}); "
                 "pass --teto and/or --mpgk")
    df = assemble(files, a.out)
    print(f"TRIP integrations: {len(df)} promoters={df['promoter'].value_counts().to_dict()}")
    print(f"expression range (log2): [{df['expression'].min():.2f}, {df['expression'].max():.2f}] "
          f"~{2**(df['expression'].max()-df['expression'].min()):.0f}-fold")
    print(f"silenced={int(df['silenced'].sum())} stable={int(df['stable'].sum())}")
    print(f"chroms: {sorted(df['chrom'].unique())[:6]}... (mouse build)")


if __name__ == "__main__":
    main()
@@ -0,0 +1 @@
1
+ """pen_stack.mech - see PEN-STACK v3.0 program doc."""
@@ -0,0 +1,71 @@
1
+ """Mechanism-aware classification at scale (Phase 2, Step 2.2).
2
+
3
+ Run the audited Pfam-whitelist classifier over the expanded Writer Atlas. For every system, derive a
4
+ ``mech_pred`` bucket + ``mech_conf`` *independently* from its Pfam domain architecture (homology), then
5
+ compare against the inherited/audited ``mechanism_bucket`` - keeping homology and mechanism distinct, as
6
+ the program requires. Low-confidence / conflicting / disagreeing calls are written to a review queue and
7
+ flagged, never hidden.
8
+
9
+ Inputs : pen_stack/atlas/atlas.parquet, the 18-family whitelist.
10
+ Outputs: atlas.parquet updated with mech_pred / mech_conf / mech_basis / mech_agrees,
11
+ out/mech_review_queue.csv.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ from pathlib import Path
16
+
17
+ import pandas as pd
18
+
19
+ from pen_stack.mech.whitelist import PfamWhitelist
20
+
21
+ _ROOT = Path(__file__).resolve().parents[2]
22
+ _ATLAS = _ROOT / "pen_stack" / "atlas" / "atlas.parquet"
23
+ _QUEUE = _ROOT / "out" / "mech_review_queue.csv"
24
+
25
+
26
def classify_atlas(atlas_parquet: str | Path = _ATLAS, out: str | Path = _ATLAS,
                   queue: str | Path = _QUEUE) -> pd.DataFrame:
    """Classify every atlas row from its Pfam signature and write the atlas plus a review queue.

    Args:
        atlas_parquet: atlas parquet to read.
        out: parquet path the annotated atlas is written to (defaults to overwriting the input).
        queue: CSV path for the review queue; parent directories are created if needed.

    Returns:
        The atlas with ``mech_pred``, ``mech_conf``, ``mech_basis``, ``mech_agrees`` and
        ``mech_class_version`` columns added.
    """
    atlas = pd.read_parquet(atlas_parquet)
    whitelist = PfamWhitelist()

    def _call(signature):
        # a missing signature is classified as an empty domain set
        domains = [] if signature is None else list(signature)
        return whitelist.classify(domains)

    calls = atlas["pfam_signature"].apply(_call)
    atlas["mech_pred"] = [call.bucket for call in calls]
    atlas["mech_conf"] = [call.confidence for call in calls]
    atlas["mech_basis"] = [call.basis for call in calls]

    # agreement with the inherited/audited mechanism label (None where one side is missing)
    agreement = []
    for predicted, audited in zip(atlas["mech_pred"], atlas["mechanism_bucket"]):
        if predicted is None or not pd.notna(audited):
            agreement.append(None)
        else:
            agreement.append(predicted == audited)
    atlas["mech_agrees"] = agreement
    atlas["mech_class_version"] = whitelist.version

    # review queue: no domain evidence, conflicting evidence, or disagreement with the audited label
    weak_evidence = atlas["mech_conf"].isin(["none", "conflicting"])
    disagrees = atlas["mech_agrees"].eq(False)  # explicit False (disagreement), not NaN
    review_columns = ["representative_system", "family", "pfam_signature",
                      "mechanism_bucket", "mech_pred", "mech_conf", "mech_basis",
                      "mech_agrees", "confidence"]
    review = atlas.loc[weak_evidence | disagrees, review_columns]
    Path(queue).parent.mkdir(parents=True, exist_ok=True)
    review.to_csv(queue, index=False)

    atlas.to_parquet(out, index=False)
    return atlas
55
+
56
+
57
def core_agreement(atlas: pd.DataFrame) -> dict:
    """Agreement on the curated 8-family core against the audited 18-family labels.

    Only rows with ``entry_kind == "curated_core"`` are considered, and only those with a
    non-null ``mech_pred`` are scored against ``mechanism_bucket``.

    Returns:
        ``{"n_core", "n_scored", "n_agree", "agreement"}``; ``agreement`` is the agreeing fraction
        rounded to 4 decimals, or ``None`` when nothing could be scored.
    """
    is_core = atlas["entry_kind"] == "curated_core"
    core = atlas[is_core]
    scored = core[core["mech_pred"].notna()]
    n_scored = len(scored)
    matches = scored["mech_pred"] == scored["mechanism_bucket"]
    n_agree = int(matches.sum())
    summary = {"n_core": len(core), "n_scored": n_scored, "n_agree": n_agree}
    summary["agreement"] = round(n_agree / n_scored, 4) if n_scored else None
    return summary
64
+
65
+
66
if __name__ == "__main__":  # pragma: no cover
    # Script mode: classify the default atlas and print a summary of the outcome.
    classified = classify_atlas()
    print("mech_conf distribution:\n", classified["mech_conf"].value_counts())
    print("\ncore agreement:", core_agreement(classified))
    # same criteria as the review queue written by classify_atlas()
    weak_evidence = classified["mech_conf"].isin(["none", "conflicting"])
    disagrees = classified["mech_agrees"].eq(False)
    n_flag = int((weak_evidence | disagrees).sum())
    print("\nreview queue rows:", n_flag)
@@ -0,0 +1,66 @@
1
+ """The InterPro-audited 18-family Pfam whitelist (imported from genome-atlas v1.2.1).
2
+
3
+ Tier-A of MECH-CLASS: maps a Pfam domain architecture to a mechanism bucket
4
+ (``DSB_NUCLEASE`` / ``DSB_FREE_TRANSEST_RECOMBINASE`` / ``TRANSPOSASE``) using domain presence plus
5
+ *composite co-occurrence rules* (e.g. Cas9 requires >=2 of its 3 signature domains; IS110 requires
6
+ both PF01548 and PF02371). This is the audited backbone the program carries forward (Section 9); the retired
7
+ ESM-2 "PEN-DISCOVER" head is not used - domain evidence is the load-bearing mechanism signal.
8
+
9
+ The source YAML ``pfam_whitelist.yaml`` is the genome-atlas asset, accessions verified against InterPro
10
+ on 2026-04-22 (v1.2.1 corrected three v1.2.0 accession errors).
11
+ """
12
+ from __future__ import annotations
13
+
14
+ from collections import Counter
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+
18
+ import yaml
19
+
20
+ _WL_PATH = Path(__file__).resolve().parent / "pfam_whitelist.yaml"
21
+
22
+ # Composite architectures: a call is only "composite-grade" (highest confidence) when the required
23
+ # co-occurring domains are all present. Derived from the whitelist co_occurs_with fields.
24
+ _COMPOSITES = {
25
+ "Cas9": {"min": 2, "of": {"PF13395", "PF18541", "PF16595"}, "bucket": "DSB_NUCLEASE"},
26
+ "IS110_bridge": {"min": 2, "of": {"PF01548", "PF02371"}, "bucket": "DSB_FREE_TRANSEST_RECOMBINASE"},
27
+ }
28
+
29
+
30
@dataclass(frozen=True)
class MechCall:
    """Immutable result of one whitelist classification.

    Fields:
        bucket: mechanism bucket, or None if no whitelisted domain present.
        confidence: one of composite | single | conflicting | none.
        basis: human-readable evidence trail.
        matched: whitelisted accessions that fired.
    """

    bucket: str | None
    confidence: str
    basis: str
    matched: tuple[str, ...]
36
+
37
+
38
class PfamWhitelist:
    """Tier-A mechanism classifier backed by the audited Pfam whitelist YAML.

    Attributes:
        bucket_of: accession -> mechanism bucket for every whitelisted domain.
        name_of: accession -> domain name ("" when the YAML entry has no name).
        version: whitelist version string.
    """

    def __init__(self, path: str | Path = _WL_PATH):
        """Load the whitelist; ``path`` is a YAML file with a top-level ``domains`` list."""
        doms = yaml.safe_load(Path(path).read_text(encoding="utf-8"))["domains"]
        self.bucket_of: dict[str, str] = {d["accession"]: d["mechanism_bucket"] for d in doms}
        self.name_of: dict[str, str] = {d["accession"]: d.get("name", "") for d in doms}
        self.version = "1.2.1"

    def classify(self, pfam_signature) -> MechCall:
        """Mechanism from a Pfam domain set - independent of any inherited/family label.

        Args:
            pfam_signature: iterable of Pfam accessions (list, tuple, set, NumPy array, ...),
                ``None`` for "no domains", or a single accession given as a bare string.

        Returns:
            A ``MechCall``: ``composite`` when a composite rule is satisfied, ``single`` when all
            whitelisted hits share one bucket, ``conflicting`` when they do not (the most common
            bucket is reported), ``none`` when no whitelisted domain is present.
        """
        # Test for None explicitly: `pfam_signature or []` raises on a multi-element NumPy array
        # (ambiguous truth value), which is what a parquet list column yields.
        if pfam_signature is None:
            accessions = ()
        elif isinstance(pfam_signature, str):
            # a bare string is one accession, not an iterable of characters
            accessions = (pfam_signature,)
        else:
            accessions = pfam_signature
        sig = {str(a).strip() for a in accessions if str(a).strip()}
        hits = sorted(sig & set(self.bucket_of))
        if not hits:
            return MechCall(None, "none", "no whitelisted Pfam domain present", ())

        # composite rule wins (most specific, highest confidence)
        for name, rule in _COMPOSITES.items():
            present = sig & rule["of"]
            if len(present) >= rule["min"]:
                return MechCall(rule["bucket"], "composite",
                                f"{name}: {len(present)}/{len(rule['of'])} signature domains "
                                f"({','.join(sorted(present))})", tuple(hits))

        buckets = Counter(self.bucket_of[a] for a in hits)
        top = buckets.most_common(1)[0][0]
        if len(buckets) > 1:
            return MechCall(top, "conflicting",
                            f"mixed domain evidence: {dict(buckets)}", tuple(hits))
        return MechCall(top, "single",
                        f"single-bucket domain evidence: {','.join(hits)}", tuple(hits))
@@ -0,0 +1 @@
1
+ """pen_stack.monitor - see PEN-STACK v3.0 program doc."""
@@ -0,0 +1,32 @@
1
+ """Europe PMC client for PEN-MONITOR (Phase 2, Step 2.7).
2
+
3
+ Europe PMC is the right primary source: open REST API, full-text + preprints, no licence friction.
4
+ This module only *fetches* - triage + queueing live in triage.py / run.py.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import time
9
+ import urllib.parse
10
+ import urllib.request
11
+ from io import BytesIO
12
+ import json
13
+
14
+ EPMC = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
15
+
16
+
17
def search(query: str, since_date: str | None = None, page_size: int = 100,
           timeout: int = 30, retries: int = 3) -> list[dict]:
    """Search Europe PMC. ``since_date`` (YYYY-MM-DD) filters on first publication date.

    A single request is made per attempt, so at most ``page_size`` hits are returned.

    Args:
        query: Europe PMC query string.
        since_date: optional lower bound appended as ``FIRST_PDATE:[since_date TO *]``.
        page_size: value sent as ``pageSize``.
        timeout: per-request timeout in seconds.
        retries: number of attempts before giving up.

    Returns:
        The ``resultList.result`` list of the JSON response (empty list when absent).

    Raises:
        RuntimeError: when every attempt fails; the last error is chained as the cause.
    """
    q = query if not since_date else f"{query} AND FIRST_PDATE:[{since_date} TO *]"
    params = {"query": q, "format": "json", "pageSize": page_size, "resultType": "core"}
    url = EPMC + "?" + urllib.parse.urlencode(params)
    last = None
    for attempt in range(retries):
        try:
            with urllib.request.urlopen(url, timeout=timeout) as r:
                data = json.loads(r.read())
            return data.get("resultList", {}).get("result", [])
        except Exception as e:  # noqa: BLE001 - network best-effort
            last = e
            # linear back-off between attempts; no point sleeping once we are about to give up
            if attempt + 1 < retries:
                time.sleep(2 * (attempt + 1))
    raise RuntimeError(f"Europe PMC search failed for {query!r}: {last}") from last
@@ -0,0 +1,57 @@
1
+ """PEN-MONITOR orchestrator (Phase 2, Step 2.7) - the Europe PMC living-database engine.
2
+
3
+ Poll Europe PMC for every writer-family query, triage each hit into a candidate row (always cited),
4
+ de-duplicate, and write a human-reviewed curation queue. The atlas is **never** auto-edited; accepted
5
+ entries flow into the WT-KB/atlas with confidence=inferred only after a human accepts them.
6
+
7
+ Back-test: with ``back_test=True`` and a date window covering March 2026, the engine must surface the
8
+ known recent writer ISPpu10 (Europe PMC PPR1218813) into the queue - the pre-registered success check.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ from pathlib import Path
13
+
14
+ import pandas as pd
15
+
16
+ from pen_stack.monitor.europepmc import search
17
+ from pen_stack.monitor.triage import _load_cues, triage_hit
18
+
19
+ _OUT = Path(__file__).resolve().parents[2] / "out" / "monitor_queue.csv"
20
+
21
+
22
def run_monitor(since: str = "2026-01-01", page_size: int = 50, back_test: bool = False,
                out: str | Path = _OUT, cfg_path: str | Path | None = None) -> dict:
    """Poll Europe PMC for every configured query and write the de-duplicated curation queue.

    Args:
        since: lower bound on first publication date (YYYY-MM-DD) passed to every search.
        page_size: page size passed to every search.
        back_test: when true, also report whether ISPpu10 / PPR1218813 is in the queue.
        out: CSV path of the queue; parent directories are created if needed.
        cfg_path: optional override for the query/cue configuration file.

    Returns:
        ``{"since", "n_hits", "n_candidates", "queue", "n_failed_queries", "failed_queries"}``
        plus ``"isppu10_found"`` when ``back_test`` is set.
    """
    cfg = _load_cues(cfg_path) if cfg_path else _load_cues()
    rows, n_hits, failed = [], 0, []
    for q in cfg["queries"]:
        try:
            hits = search(q["terms"], since_date=since, page_size=page_size)
        except RuntimeError:
            # best-effort polling: keep going, but record the failure so it shows up in the result
            failed.append(q["terms"])
            continue
        n_hits += len(hits)
        rows.extend(triage_hit(h, default_family=q.get("family"), cfg=cfg) for h in hits)

    queue = pd.DataFrame(rows)
    if not queue.empty:
        # every queued candidate must carry a citation (source_id or doi)
        queue = queue[queue["source_id"].notna() | queue["doi"].notna()]
        # De-duplicate on source_id, falling back to the DOI when the id is missing. Dropping
        # duplicates on source_id alone treats all missing ids as equal and would collapse every
        # DOI-only candidate into a single row.
        key = queue["source_id"].where(queue["source_id"].notna(), queue["doi"])
        queue = queue[~key.duplicated()].reset_index(drop=True)

    Path(out).parent.mkdir(parents=True, exist_ok=True)
    queue.to_csv(out, index=False)

    res = {"since": since, "n_hits": n_hits, "n_candidates": int(len(queue)), "queue": str(out),
           "n_failed_queries": len(failed), "failed_queries": failed}
    if back_test:
        found = False
        if not queue.empty:
            blob = (queue["title"].fillna("") + " " + queue["source_id"].fillna("")).str.lower()
            found = bool(blob.str.contains("isppu10").any() or
                         (queue["source_id"] == "PPR1218813").any())
        res["isppu10_found"] = found
    return res


if __name__ == "__main__":  # pragma: no cover
    r = run_monitor(since="2026-01-01", back_test=True)
    print(r)
@@ -0,0 +1,63 @@
1
+ """Triage Europe PMC hits into candidate writer-system rows (Phase 2, Step 2.7).
2
+
3
+ Grounded extraction: pull candidate fields (family, organism cue, human-cell evidence) from a hit's
4
+ title/abstract using documented keyword cues, **always** carrying the source citation (Europe PMC id +
5
+ DOI). An optional LLM pass (Ollama/Qwen via litellm) can enrich the abstract extraction, but it never
6
+ invents a citation and never auto-edits the atlas - its output is just another candidate for the queue.
7
+
8
+ The rule-based path is the reliable default (works offline, fully reproducible, satisfies the back-test).
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import re
13
+ from pathlib import Path
14
+
15
+ import yaml
16
+
17
+ _CFG = Path(__file__).resolve().parents[2] / "configs" / "monitor_queries.yaml"
18
+
19
+
20
def _load_cues(path: str | Path = _CFG) -> dict:
    """Read the YAML file at ``path`` (UTF-8) and return its parsed content."""
    cfg_text = Path(path).read_text(encoding="utf-8")
    return yaml.safe_load(cfg_text)
22
+
23
+
24
def classify_family(text: str, cfg: dict) -> tuple[str | None, list[str]]:
    """Best-guess writer family from keyword cues; returns (family, matched_cues).

    Matching is case-insensitive. The family with the most matching cues wins; on a tie the
    family listed first in ``cfg["family_cues"]`` is kept. Matched cues are returned as written
    in the configuration. Returns ``(None, [])`` when nothing matches.
    """
    t = text.lower()
    best, best_hits = None, []
    for fam, cues in (cfg.get("family_cues") or {}).items():
        # the text is lower-cased, so the cue must be too - otherwise any cue containing a
        # capital letter (e.g. "IS110") can never match
        hits = [c for c in (cues or []) if c.lower() in t]
        if len(hits) > len(best_hits):
            best, best_hits = fam, hits
    return best, best_hits
33
+
34
+
35
def has_human_cell_evidence(text: str, cfg: dict) -> bool:
    """Return True if any ``cfg["human_cell_cues"]`` entry occurs in ``text`` (case-insensitive)."""
    t = text.lower()
    # lower-case the cue as well: the text is lower-cased, so a cue such as "HEK293" would
    # otherwise never match
    return any(cue.lower() in t for cue in (cfg.get("human_cell_cues") or []))
38
+
39
+
40
+ _ORG_RE = re.compile(r"\b([A-Z][a-z]+ [a-z]{3,})\b") # coarse "Genus species" cue
41
+
42
+
43
def triage_hit(hit: dict, default_family: str | None = None, cfg: dict | None = None) -> dict:
    """Return a candidate row for the curation queue. Always carries a citation; never auto-edits.

    Args:
        hit: one Europe PMC result record (read keys: title, abstractText, id, source, doi,
            firstPublicationDate).
        default_family: family used when no keyword cue matches.
        cfg: cue configuration; loaded from the default file when not supplied.
    """
    if not cfg:
        cfg = _load_cues()
    title = hit.get("title", "") or ""
    abstract = hit.get("abstractText", "") or ""
    text = f"{title}. {abstract}"

    family, matched = classify_family(text, cfg)
    organism = _ORG_RE.search(abstract)  # coarse cue, looked up in the abstract only
    organism_cue = organism.group(1) if organism else None

    return {
        "candidate_family": family or default_family,
        "matched_cues": ";".join(matched),
        "organism_cue": organism_cue,
        "human_cell_evidence": has_human_cell_evidence(text, cfg),
        "title": title[:300],
        "source_id": hit.get("id"),
        "source_db": hit.get("source"),
        "doi": hit.get("doi"),
        "pub_date": hit.get("firstPublicationDate"),
        "confidence": "inferred",  # candidate - stays inferred until a human reviews/measures
        "status": "pending_review",  # NEVER auto-accepted into the atlas
    }
@@ -0,0 +1 @@
1
+ """pen_stack.planner - see PEN-STACK v3.0 program doc."""
@@ -0,0 +1,56 @@
1
+ """Cargo / donor design (Phase 3, Step 3.2).
2
+
3
+ Assemble a donor construct spec for a chosen writer + site: insulators (protect durability), promoter +
4
+ polyA, codon optimisation flag for the host cell type, and a size check against the writer's
5
+ deliverability/cargo class. For bridge/seek writers, attach the Phase-1.5 off-target prediction *if the
6
+ bridge engine is available* - otherwise the field is marked pending (Phase 1.5), so the Planner runs
7
+ end-to-end now and the off-target annotation drops in once Phase 1.5 lands.
8
+
9
+ We design at the level of construct *elements + sizes* (the payload sequence is the user's CDS/regulatory
10
+ cassette); element lengths are nominal, documented constants.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ # nominal element sizes (bp) for the assembled donor; documented, not hidden
15
+ _ELEMENTS = {"insulator_5": 250, "promoter": 600, "polyA": 250, "insulator_3": 250}
16
+
17
+
18
+ def _bridge_offtarget(writer_family: str, site: tuple) -> dict:
19
+ """Optional Phase-1.5 hook. Returns the off-target prediction if the bridge engine exists, else pending."""
20
+ try:
21
+ from pen_stack.bridge.offtarget import predict_offtargets # Phase 1.5 deliverable
22
+ except Exception: # noqa: BLE001 - engine not built yet
23
+ return {"status": "pending_phase_1_5", "note": "bridge off-target engine ships in Phase 1.5"}
24
+ return predict_offtargets(writer_family, site)
25
+
26
+
27
def design_cargo(payload_bp: int, writer_row: dict, site: tuple, ct: str,
                 payload_seq: str | None = None) -> dict:
    """Assemble a donor construct spec. writer_row needs: family, cargo_capacity_bp, deliv_class.

    If `payload_seq` is given, attach the WS-D Cargo Polish sequence-risk scan (cargo_durability_risk +
    actionable suggestions) - the locus model scores the site, Cargo Polish scores the insert.

    Args:
        payload_bp: payload length in bp (the nominal element sizes are added on top).
        writer_row: writer record; a missing or NaN ``cargo_capacity_bp`` means "capacity unknown".
        site: target site, forwarded to the off-target hook for bridge/seek writers.
        ct: host cell type, reported as ``host``.
        payload_seq: optional payload sequence for the Cargo Polish scan.

    Returns:
        The construct spec; ``size_ok`` is True when the capacity is unknown or the assembled
        construct fits within it.
    """
    import math  # function-scope import: this module has no top-level import block to extend

    fam = writer_row.get("family")
    cap = writer_row.get("cargo_capacity_bp")
    # A row taken from a DataFrame carries NaN for a missing capacity. NaN compares False to
    # everything, which would wrongly report size_ok=False, so treat it as unknown.
    if isinstance(cap, float) and math.isnan(cap):
        cap = None
    elements = dict(_ELEMENTS)
    assembled_bp = int(payload_bp) + sum(elements.values())
    size_ok = (cap is None) or (assembled_bp <= cap)

    out = {
        "host": ct,
        "payload_bp": int(payload_bp),
        "elements": elements,  # insulators + promoter + polyA
        "assembled_bp": assembled_bp,
        "codon_optimised": True,
        "writer_family": fam,
        "cargo_capacity_bp": cap,
        "size_ok": size_ok,
        "deliverability": writer_row.get("deliv_class"),
    }
    if fam in {"bridge_IS110", "seek_IS1111"}:
        out["offtargets"] = _bridge_offtarget(fam, site)
    if payload_seq:
        from pen_stack.planner.cargo_polish import scan_cargo
        out["cargo_polish"] = scan_cargo(payload_seq)
    return out
@@ -0,0 +1,146 @@
1
+ """Cargo Polish - cargo-sequence durability-risk scan (v3.1, WS-D).
2
+
3
+ The locus model scores WHERE to write; this scores WHAT is written. It scans the insert (the user's
4
+ cassette sequence) for known sequence triggers of transgene silencing/instability and emits a
5
+ `cargo_durability_risk` score in [0,1] with a band and, for every flag, a concrete remedy.
6
+
7
+ This is a HEURISTIC flag, not a supervised silencing predictor: it catches documented sequence triggers
8
+ (CpG-island density -> de novo methylation; GC extremes; cryptic splice consensus; strong mRNA secondary
9
+ structure; known silencer motifs), not all silencing causes. Thresholds are documented constants
10
+ (configs/cargo_polish.yaml) from the silencing literature. ViennaRNA (MFE) is optional and degrades
11
+ gracefully (the structure term is skipped, noted) so the scan runs anywhere; the other terms are pure-Python.
12
+
13
+ Acceptance (prereg/ws_d.yaml): reproduces established directionality - high-CpG bacterial-style cassettes
14
+ score above CpG-depleted / insulator-flanked constructs on a small curated set - and every flag carries a
15
+ concrete suggestion.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import re
20
+ from functools import lru_cache
21
+
22
+ import yaml
23
+
24
+
25
@lru_cache(maxsize=1)
def _cfg() -> dict:
    """Parse ``configs/cargo_polish.yaml`` (located via ``pen_stack._resources``); result is cached."""
    from pen_stack._resources import resource

    cfg_file = resource("configs/cargo_polish.yaml")
    return yaml.safe_load(cfg_file.read_text(encoding="utf-8"))
29
+
30
+
31
+ def _clean(seq: str) -> str:
32
+ return re.sub(r"[^ACGT]", "", (seq or "").upper())
33
+
34
+
35
def gc_fraction(seq: str) -> float:
    """Fraction of G+C among the A/C/G/T bases of ``seq``; 0.0 when no such base remains."""
    bases = _clean(seq)
    if not bases:
        return 0.0
    return (bases.count("G") + bases.count("C")) / len(bases)
38
+
39
+
40
def cpg_islands(seq: str) -> list[dict]:
    """Gardiner-Garden & Frommer sliding window: obs/exp CpG > threshold AND GC > threshold over the window.

    Windows of ``window_bp`` are scanned with a stride of a quarter window; only full-length
    windows are evaluated, so a sequence shorter than one window yields no island.

    Returns:
        ``[]`` when no window qualifies, otherwise a one-element list
        ``[{"n_islands": ..., "windows": [...]}]`` where qualifying windows that overlap or abut
        the previous qualifying window are counted as the same island.
    """
    params = _cfg()["cpg_island"]
    cleaned = _clean(seq)
    window = params["window_bp"]
    stride = max(1, window // 4)

    windows = []
    for start in range(0, max(1, len(cleaned) - window + 1), stride):
        chunk = cleaned[start:start + window]
        if len(chunk) < window:
            break
        n_c = chunk.count("C")
        n_g = chunk.count("G")
        gc = (n_c + n_g) / window
        # expected CpG count for the window's C and G content; 0 when either base is absent
        expected = (n_c * n_g) / window if n_c and n_g else 0.0
        ratio = (chunk.count("CG") / expected) if expected else 0.0
        if ratio > params["obs_exp_min"] and gc > params["gc_min"]:
            windows.append({"start": start, "obs_exp": round(ratio, 3), "gc": round(gc, 3)})

    # merge overlapping windows into island count
    n_islands = 0
    covered_to = -1
    for hit in windows:
        if hit["start"] > covered_to:
            n_islands += 1
        covered_to = hit["start"] + window
    if not n_islands:
        return []
    return [{"n_islands": n_islands, "windows": windows}]
63
+
64
+
65
def cryptic_splice_sites(seq: str) -> dict:
    """Count regex matches of the configured donor and acceptor motifs in the cleaned sequence.

    Returns:
        ``{"donor": n, "acceptor": m, "total": n + m}``.
    """
    motifs = _cfg()["cryptic_splice"]
    cleaned = _clean(seq)
    n_donor = len(re.findall(motifs["donor_motif"], cleaned))
    n_acceptor = len(re.findall(motifs["acceptor_motif"], cleaned))
    counts = {"donor": n_donor, "acceptor": n_acceptor}
    counts["total"] = n_donor + n_acceptor
    return counts
71
+
72
+
73
def silencer_motifs(seq: str) -> list[dict]:
    """List the configured silencer motifs that occur in the cleaned sequence.

    Returns:
        One ``{"name", "count", "note"}`` dict per motif with at least one regex match, in
        configuration order.
    """
    cleaned = _clean(seq)
    counted = (
        (motif, len(re.findall(motif["pattern"], cleaned)))
        for motif in _cfg()["silencer_motifs"]["motifs"]
    )
    return [
        {"name": motif["name"], "count": n_found, "note": motif["note"]}
        for motif, n_found in counted
        if n_found
    ]
81
+
82
+
83
def mfe_per_nt(seq: str) -> dict:
    """ViennaRNA minimum-free-energy per nucleotide of the transcribed insert; graceful if RNA is absent.

    Returns:
        ``{"available": False, "note": ...}`` when the cleaned sequence has fewer than 10 bases or
        the ``RNA`` module cannot be imported; otherwise ``{"available": True, "mfe", "mfe_per_nt"}``.
    """
    cleaned = _clean(seq)
    n_nt = len(cleaned)
    if n_nt < 10:
        return {"available": False, "note": "sequence too short"}
    try:
        import RNA
    except Exception:  # noqa: BLE001 - ViennaRNA only in the bio extra / VM image
        return {"available": False, "note": "ViennaRNA not installed (bio extra / VM image)"}
    compound = RNA.fold_compound(cleaned.replace("T", "U"))
    _structure, energy = compound.mfe()
    energy = float(energy)
    return {"available": True, "mfe": round(energy, 2), "mfe_per_nt": round(energy / n_nt, 4)}
95
+
96
+
97
def scan_cargo(seq: str) -> dict:
    """Aggregate the cargo durability-risk scan: score in [0,1], band, and per-flag concrete suggestions.

    Each triggered term (CpG islands, GC extremes, cryptic splice sites, silencer motifs, strong
    secondary structure) adds its configured risk and one flag carrying a suggestion. The summed
    risk is clipped to 1.0 and mapped to a band via ``cfg["bands"]``.

    Optional config keys (defaults reproduce the previously hard-coded weights):
    ``cpg_island.risk_cap`` (0.5), ``cryptic_splice.risk_per_site`` (0.05),
    ``silencer_motifs.risk_per_motif`` (0.07).

    Returns:
        Dict with ``cargo_durability_risk``, ``band``, ``length_bp``, ``gc``, ``n_flags``,
        ``flags``, ``components`` and ``scope``.
    """
    cfg = _cfg()
    s = _clean(seq)
    flags, risk = [], 0.0
    sug = cfg["suggestions"]

    isl = cpg_islands(s)
    if isl:
        n = isl[0]["n_islands"]
        cpg_cfg = cfg["cpg_island"]
        risk += min(cpg_cfg.get("risk_cap", 0.5), n * cpg_cfg["risk_per_island"])
        flags.append({"category": "cpg_island", "detail": f"{n} CpG island(s)", "suggestion": sug["cpg_island"]})

    gc = gc_fraction(s)
    # Guard on the sequence, not on gc: a pure A/T insert has gc == 0.0, which is falsy, so
    # `if gc and ...` would skip the most extreme low-GC case. Only an empty sequence is exempt.
    if s and (gc < cfg["gc_extremes"]["gc_low"] or gc > cfg["gc_extremes"]["gc_high"]):
        risk += cfg["gc_extremes"]["risk"]
        flags.append({"category": "gc_extremes", "detail": f"GC={gc:.2f}", "suggestion": sug["gc_extremes"]})

    cs = cryptic_splice_sites(s)
    if cs["total"]:
        splice_cfg = cfg["cryptic_splice"]
        per_site = splice_cfg.get("risk_per_site", 0.05)
        risk += min(splice_cfg["risk_per_site_capped"], per_site * cs["total"])
        flags.append({"category": "cryptic_splice", "detail": f"{cs['total']} splice consensus site(s)",
                      "suggestion": sug["cryptic_splice"]})

    sm = silencer_motifs(s)
    if sm:
        motif_cfg = cfg["silencer_motifs"]
        per_motif = motif_cfg.get("risk_per_motif", 0.07)
        risk += min(motif_cfg["risk_per_motif_capped"], per_motif * sum(h["count"] for h in sm))
        flags.append({"category": "silencer_motifs", "detail": ", ".join(h["name"] for h in sm),
                      "suggestion": sug["silencer_motifs"]})

    mfe = mfe_per_nt(s)
    # structure term is skipped when ViennaRNA is unavailable or the sequence is too short
    if mfe.get("available") and mfe["mfe_per_nt"] < cfg["secondary_structure"]["mfe_per_nt_warn"]:
        risk += cfg["secondary_structure"]["risk"]
        flags.append({"category": "secondary_structure", "detail": f"MFE/nt={mfe['mfe_per_nt']}",
                      "suggestion": sug["secondary_structure"]})

    risk = round(min(1.0, risk), 4)
    b = cfg["bands"]
    band = "low" if risk < b["low"] else ("moderate" if risk < b["moderate"] else "high")
    return {"cargo_durability_risk": risk, "band": band, "length_bp": len(s),
            "gc": round(gc, 4), "n_flags": len(flags), "flags": flags,
            "components": {"cpg_islands": isl, "cryptic_splice": cs, "silencer_motifs": sm,
                           "secondary_structure": mfe},
            "scope": "heuristic sequence-trigger scan, not a supervised silencing predictor"}
141
+
142
+
143
if __name__ == "__main__":  # pragma: no cover
    # Script mode: scan a synthetic CpG-dense repeat and pretty-print the report.
    import json

    cpg_rich_unit = "CGCGCGCGGCGGCGCGCGGCGGCGCGCGGCGGCGCG"
    demo = cpg_rich_unit * 8
    report = scan_cargo(demo)
    print(json.dumps(report, indent=2, default=str))
@@ -0,0 +1,32 @@
1
+ """Delivery recommendation (Phase 3, Step 3.3).
2
+
3
+ Recommend a delivery modality from the total payload (writer effector + cargo) and the target cell type,
4
+ using the documented rule table in configs/delivery_rules.yaml (no hidden constants).
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from functools import lru_cache
9
+ from pathlib import Path
10
+
11
+ import yaml
12
+
13
+ _CFG = Path(__file__).resolve().parents[2] / "configs" / "delivery_rules.yaml"
14
+
15
+
16
@lru_cache(maxsize=1)
def _rules(path: str | Path = _CFG) -> dict:
    """Read and parse the YAML rule table at ``path``; the most recent result is cached."""
    rules_text = Path(path).read_text(encoding="utf-8")
    return yaml.safe_load(rules_text)
19
+
20
+
21
def recommend_delivery(effector_bp: int, cargo_bp: int, ct: str) -> dict:
    """Return {delivery, total_bp, rationale}. effector_bp ~= writer length_aa * 3.

    The first rule in ``cfg["rules"]`` whose ``max_total_bp`` is not exceeded by the total payload
    is used. When no rule fits, the ex-vivo fallback is chosen if ``ct`` is listed in
    ``ex_vivo_cell_types``, otherwise the in-vivo fallback.
    """
    cfg = _rules()
    total_bp = int(effector_bp) + int(cargo_bp)

    fitting = next((rule for rule in cfg["rules"] if total_bp <= rule["max_total_bp"]), None)
    if fitting is not None:
        return {"delivery": fitting["delivery"], "total_bp": total_bp,
                "rationale": f"total payload {total_bp} bp <= {fitting['max_total_bp']} bp"}

    if ct in cfg.get("ex_vivo_cell_types", []):
        fallback = cfg["ex_vivo_fallback"]
    else:
        fallback = cfg["in_vivo_fallback"]
    return {"delivery": fallback, "total_bp": total_bp,
            "rationale": f"total payload {total_bp} bp exceeds dual-AAV; cell type {ct}"}