@platforma-open/milaboratories.3d-structure-prediction.software 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,150 @@
1
+ """Sequence sanitization pipeline for antibody structure prediction.
2
+
3
+ Implements R9-R14 from the block spec. Order matters — stop-codon detection
4
+ must run before non-standard-AA stripping, otherwise `*` gets removed and the
5
+ check never fires. R15 (VHH hallmark check) runs post-prediction on the
6
+ IMGT-numbered antibody — see `numbering.vhh_hallmarks_present`.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from dataclasses import dataclass, field
13
+
14
# One-letter codes of the 20 standard amino acids; anything outside this set
# is rejected by the final alphabet check in `sanitize_chain`.
STANDARD_AA = {*"ACDEFGHIKLMNPQRSTVWY"}
# Codes that `strip_non_standard` removes instead of rejecting.
NON_STANDARD_STRIPPED = {*"BJXZ"}
# Alignment gap symbols dropped by `normalize`.
GAP_CHARS = {"-", "."}

# Inclusive (shortest, longest) residue counts checked by `in_length_range`.
VH_LENGTH_RANGE = (108, 135)
VL_LENGTH_RANGE = (102, 120)
20
+
21
# Pyroglutamate normalization: leading pE, pyroGlu, <pE> → E.
# Patterns are tried in order and only the first match is rewritten.
# The bracketed and spelled-out markers are unambiguous, so they are matched
# case-insensitively. The bare "pE" marker is matched case-sensitively: with
# IGNORECASE it would also match an ordinary sequence that starts with
# Pro-Glu ("PE..." or "pe..."), and the rewrite to "E" would silently drop
# the leading proline.
PYROGLU_PATTERNS = [
    re.compile(r"^<\s*pE\s*>", re.IGNORECASE),
    re.compile(r"^pyroGlu", re.IGNORECASE),
    re.compile(r"^pE"),
]
27
+
28
# Signal peptide heuristic (R14). M/signal ↦ mature VH start.
# Shape: a leading M, then 3-15 residues drawn from LVIAFWCM, then one of the
# listed start motifs (captured as group 1). Matching is case-sensitive and
# anchored at the start of the sequence.
SIGNAL_PEPTIDE_RE = re.compile(r"^M[LVIAFWCM]{3,15}(E[VI]QL|Q[VI]QL|DIQM|EIVLT|QVQLV|QLVQS|DVQL)")
32
+
33
+
34
@dataclass
class SanitizationResult:
    """Outcome of sanitizing a (VH, VL) pair.

    A result counts as successful for as long as ``failure_reason`` is empty.
    """

    # Sanitized heavy-chain sequence; empty until one has been accepted.
    vh: str = ""
    # Sanitized light-chain sequence; empty until one has been accepted.
    vl: str = ""
    # Failure code; the empty string means nothing has failed.
    failure_reason: str = ""
    # Non-fatal notices; every instance gets its own fresh list.
    warnings: list[str] = field(default_factory=list)

    @property
    def success(self) -> bool:
        """Report whether no failure reason has been recorded."""
        if self.failure_reason:
            return False
        return True
44
+
45
+
46
def normalize(seq: str) -> str:
    """Return *seq* cleaned up for the downstream sanitization checks.

    The input is trimmed, a leading pyroglutamate marker (the first entry of
    ``PYROGLU_PATTERNS`` that matches) is rewritten to ``E``, the text is
    upper-cased, and every gap character and whitespace character is dropped.
    ``None`` is treated as an empty sequence.
    """
    if seq is None:
        return ""

    # Trim first: the pyroGlu patterns are anchored at the start of the text.
    text = seq.strip()

    # Patterns are tried in list order; only the first match is rewritten.
    marker = next((pat for pat in PYROGLU_PATTERNS if pat.match(text)), None)
    if marker is not None:
        text = marker.sub("E", text, count=1)

    return "".join(
        ch for ch in text.upper() if not (ch in GAP_CHARS or ch.isspace())
    )
59
+
60
+
61
def has_stop_codon(seq: str) -> bool:
    """Report whether *seq* contains the stop symbol ``*`` at any position."""
    stop_symbol = "*"
    return stop_symbol in seq
63
+
64
+
65
def strip_non_standard(seq: str) -> str:
    """Return *seq* without the characters in ``NON_STANDARD_STRIPPED``.

    All other characters are kept in their original order.
    """
    kept = [residue for residue in seq if residue not in NON_STANDARD_STRIPPED]
    return "".join(kept)
67
+
68
+
69
def in_length_range(seq: str, kind: str) -> bool:
    """Report whether ``len(seq)`` lies within the inclusive bounds for *kind*.

    ``"H"`` selects ``VH_LENGTH_RANGE``; any other value falls back to
    ``VL_LENGTH_RANGE``.
    """
    if kind == "H":
        bounds = VH_LENGTH_RANGE
    else:
        bounds = VL_LENGTH_RANGE
    shortest, longest = bounds
    return shortest <= len(seq) <= longest
72
+
73
+
74
def detect_signal_peptide(seq: str) -> bool:
    """Report whether the start of *seq* matches ``SIGNAL_PEPTIDE_RE``."""
    return SIGNAL_PEPTIDE_RE.match(seq) is not None
76
+
77
+
78
def sanitize_chain(raw: str, kind: str) -> tuple[str, str, list[str]]:
    """Sanitize a single chain and report the outcome.

    Checks run in a fixed order and the first one that fails decides the
    result: empty input, stop codon (R10), non-standard stripping (R11),
    length bounds (R13), and a final standard-alphabet check. The
    signal-peptide check (R14) only ever adds a warning.

    Args:
        raw: Sequence text as supplied by the caller.
        kind: Chain type. It is forwarded to `in_length_range`, and ``"H"``
            additionally enables the signal-peptide warning.

    Returns:
        ``(clean_seq, failure_reason, warnings)``. On success
        ``failure_reason`` is empty and ``clean_seq`` holds the sanitized
        sequence; on failure ``clean_seq`` is empty. Warnings never cause a
        failure and are returned in both cases.
    """
    notes: list[str] = []

    def rejected(reason: str) -> tuple[str, str, list[str]]:
        # Failure shape: no sequence, the reason, and any notes so far.
        return "", reason, notes

    cleaned = normalize(raw)
    if not cleaned:
        return rejected("empty_sequence")

    # R10 — stop codons are looked for before anything is stripped.
    if has_stop_codon(cleaned):
        return rejected("stop_codon_mid_sequence")

    # R11 — drop the strippable non-standard codes; fewer than 20 remaining
    # residues (including none at all) is a failure.
    residues = strip_non_standard(cleaned)
    if len(residues) < 20:
        return rejected("non_standard_aa_only_after_strip")

    # R13 — length bounds for this chain type.
    if not in_length_range(residues, kind):
        return rejected("length_out_of_range")

    # R14 — signal peptide is a warning, and only for the heavy chain.
    if kind == "H" and detect_signal_peptide(residues):
        notes.append("probable_signal_peptide")

    # Whatever is left must consist solely of the standard 20-aa alphabet.
    if not set(residues) <= STANDARD_AA:
        return rejected("non_standard_aa_residue")

    return residues, "", notes
113
+
114
+
115
def sanitize_pair(
    vh_raw: str,
    vl_raw: str | None,
    mode: str,
) -> SanitizationResult:
    """Sanitize a (VH, VL) pair according to *mode*.

    The heavy chain is always sanitized first. What happens next depends on
    *mode*: ``"ABodyBuilder2"`` (paired) also requires and sanitizes the
    light chain, ``"NanoBodyBuilder2"`` (VH-only) ignores it, and any other
    value is reported as ``unknown_mode:<mode>``.

    Args:
        vh_raw: Raw heavy-chain sequence.
        vl_raw: Raw light-chain sequence; may be ``None``.
        mode: ``"ABodyBuilder2"`` or ``"NanoBodyBuilder2"``.

    Returns:
        A `SanitizationResult`. Processing stops at the first failure, whose
        reason is stored in ``failure_reason``. When the failure happens
        after the heavy chain was accepted, ``vh`` remains populated.
        Warnings from every chain that was processed are accumulated.
    """
    outcome = SanitizationResult()

    heavy, heavy_failure, heavy_notes = sanitize_chain(vh_raw, "H")
    outcome.warnings.extend(heavy_notes)
    if heavy_failure:
        outcome.failure_reason = heavy_failure
        return outcome
    outcome.vh = heavy

    if mode == "NanoBodyBuilder2":
        # R15 hallmark check runs post-prediction in run_immunebuilder.py
        # against IMGT-numbered residues (see numbering.vhh_hallmarks_present).
        return outcome

    if mode != "ABodyBuilder2":
        outcome.failure_reason = f"unknown_mode:{mode}"
        return outcome

    # Paired mode: a light chain that is absent or blank is a failure.
    light_supplied = vl_raw is not None and bool(vl_raw.strip())
    if not light_supplied:
        outcome.failure_reason = "light_chain_missing_in_paired_mode"
        return outcome

    light, light_failure, light_notes = sanitize_chain(vl_raw, "L")
    outcome.warnings.extend(light_notes)
    if light_failure:
        outcome.failure_reason = light_failure
    else:
        outcome.vl = light
    return outcome