@platforma-open/milaboratories.3d-structure-prediction.software 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +265 -0
- package/dist/artifacts/py-archive/archive.json +1 -0
- package/dist/artifacts/py-archive/docker_x64.json +1 -0
- package/dist/docker/Dockerfile-py-archive +22 -0
- package/dist/tengo/software/immunebuilder-predict.sw.json +1 -0
- package/package.json +41 -0
- package/pkg-platforma-open-milaboratories.3d-structure-prediction.software-py-archive-1.0.0.tgz +0 -0
- package/src_python/numbering.py +103 -0
- package/src_python/pdb_writer.py +137 -0
- package/src_python/requirements.txt +4 -0
- package/src_python/run_immunebuilder.py +647 -0
- package/src_python/sanitize.py +150 -0
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""Sequence sanitization pipeline for antibody structure prediction.
|
|
2
|
+
|
|
3
|
+
Implements R9-R14 from the block spec. Order matters — stop-codon detection
|
|
4
|
+
must run before non-standard-AA stripping, otherwise `*` gets removed and the
|
|
5
|
+
check never fires. R15 (VHH hallmark check) runs post-prediction on the
|
|
6
|
+
IMGT-numbered antibody — see `numbering.vhh_hallmarks_present`.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
|
|
14
|
+
# One-letter codes of the 20 standard amino acids. Any character outside this
# set that survives stripping makes sanitize_chain reject the sequence.
STANDARD_AA = set("ACDEFGHIKLMNPQRSTVWY")
# Ambiguity / unknown-residue codes that strip_non_standard silently removes.
NON_STANDARD_STRIPPED = set("BJXZ")
# Alignment gap characters removed by normalize.
GAP_CHARS = set("-.")

# Inclusive (min, max) residue counts accepted by in_length_range.
VH_LENGTH_RANGE = (108, 135)
VL_LENGTH_RANGE = (102, 120)

# Pyroglutamate normalization: leading pE, pyroGlu, <pE> → E.
# The bracketed and spelled-out forms are unambiguous, so they are matched
# case-insensitively. The bare "pE" form must stay case-sensitive: matched
# case-insensitively it would also hit a real sequence that starts with
# Pro-Glu ("PE...") and silently drop the leading proline.
PYROGLU_PATTERNS = [
    re.compile(r"^<\s*pE\s*>", re.IGNORECASE),
    re.compile(r"^pyroGlu", re.IGNORECASE),
    re.compile(r"^pE"),
]

# Signal peptide heuristic (R14). M/signal ↦ mature VH start.
# Matches a leading Met, a run of 3-15 hydrophobic residues, then one of the
# listed framework-1 start motifs.
SIGNAL_PEPTIDE_RE = re.compile(
    r"^M[LVIAFWCM]{3,15}(E[VI]QL|Q[VI]QL|DIQM|EIVLT|QVQLV|QLVQS|DVQL)"
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class SanitizationResult:
    """Outcome of sanitizing a (VH, VL) pair.

    An empty ``failure_reason`` signals success; ``warnings`` carries
    non-fatal notes collected along the way.
    """

    # Sanitized heavy-chain sequence ("" until one has been accepted).
    vh: str = ""
    # Sanitized light-chain sequence ("" until one has been accepted).
    vl: str = ""
    # Machine-readable failure code; "" means nothing went wrong.
    failure_reason: str = ""
    # Non-fatal warning codes; each instance gets its own list.
    warnings: list[str] = field(default_factory=list)

    @property
    def success(self) -> bool:
        """Return True when no failure reason has been recorded."""
        if self.failure_reason:
            return False
        return True
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def normalize(seq: str) -> str:
    """Return *seq* uppercased with whitespace, gaps and pyroGlu prefix normalized.

    ``None`` is treated as an empty sequence. A leading pyroglutamate
    notation (first matching entry of ``PYROGLU_PATTERNS``) is replaced by a
    single ``E`` before uppercasing.
    """
    if seq is None:
        return ""

    # Outer whitespace has to go first, otherwise the anchored pyroGlu
    # patterns cannot see the leading text.
    text = seq.strip()

    # Only the first pattern that matches is applied.
    hit = next((pat for pat in PYROGLU_PATTERNS if pat.match(text)), None)
    if hit is not None:
        text = hit.sub("E", text, count=1)

    kept = [
        ch
        for ch in text.upper()
        if not (ch in GAP_CHARS or ch.isspace())
    ]
    return "".join(kept)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def has_stop_codon(seq: str) -> bool:
    """Return True if *seq* contains a ``*`` (stop codon marker) anywhere."""
    return any(symbol == "*" for symbol in seq)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def strip_non_standard(seq: str) -> str:
    """Return *seq* without any character listed in ``NON_STANDARD_STRIPPED``."""
    kept = [
        residue for residue in seq if residue not in NON_STANDARD_STRIPPED
    ]
    return "".join(kept)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def in_length_range(seq: str, kind: str) -> bool:
    """Return True if ``len(seq)`` lies within the inclusive bounds for *kind*.

    ``kind == "H"`` selects ``VH_LENGTH_RANGE``; every other value selects
    ``VL_LENGTH_RANGE``.
    """
    if kind == "H":
        shortest, longest = VH_LENGTH_RANGE
    else:
        shortest, longest = VL_LENGTH_RANGE
    size = len(seq)
    return not (size < shortest or size > longest)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def detect_signal_peptide(seq: str) -> bool:
    """Return True if *seq* starts with text matching ``SIGNAL_PEPTIDE_RE``."""
    return SIGNAL_PEPTIDE_RE.match(seq) is not None
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def sanitize_chain(raw: str, kind: str) -> tuple[str, str, list[str]]:
    """Sanitize one chain and report the first check that fails, if any.

    Returns ``(clean_seq, failure_reason, warnings)``. On failure
    ``clean_seq`` is ``""`` and ``failure_reason`` names the failed check; on
    success ``failure_reason`` is ``""``. Warnings never cause a failure and
    are returned in both cases.
    """
    notes: list[str] = []

    def reject(reason: str) -> tuple[str, str, list[str]]:
        # Every failure path shares the same shape: no sequence, a reason,
        # and whatever warnings were collected so far.
        return "", reason, notes

    cleaned = normalize(raw)
    if not cleaned:
        return reject("empty_sequence")

    # R10 — must run before stripping so that `*` is still present.
    if has_stop_codon(cleaned):
        return reject("stop_codon_mid_sequence")

    # R11 — drop ambiguity codes; fewer than 20 residues left is a failure.
    residues = strip_non_standard(cleaned)
    if len(residues) < 20:
        return reject("non_standard_aa_only_after_strip")

    # R13 — chain-specific length bounds.
    if not in_length_range(residues, kind):
        return reject("length_out_of_range")

    # R14 — signal peptide is only a warning, and only checked for VH.
    if kind == "H" and detect_signal_peptide(residues):
        notes.append("probable_signal_peptide")

    # Anything still outside the standard 20-aa alphabet is fatal.
    if not STANDARD_AA.issuperset(residues):
        return reject("non_standard_aa_residue")

    return residues, "", notes
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def sanitize_pair(
    vh_raw: str,
    vl_raw: str | None,
    mode: str,
) -> SanitizationResult:
    """Sanitize a (VH, VL) pair according to the mode.

    `mode` is "ABodyBuilder2" (paired) or "NanoBodyBuilder2" (VH-only). The
    heavy chain is always sanitized first; its failure is reported before
    the mode is examined. Any other mode yields an ``unknown_mode:<mode>``
    failure.
    """
    outcome = SanitizationResult()

    heavy, heavy_failure, heavy_notes = sanitize_chain(vh_raw, "H")
    outcome.warnings.extend(heavy_notes)
    if heavy_failure:
        outcome.failure_reason = heavy_failure
        return outcome
    outcome.vh = heavy

    if mode == "NanoBodyBuilder2":
        # R15 hallmark check runs post-prediction in run_immunebuilder.py
        # against IMGT-numbered residues (see numbering.vhh_hallmarks_present).
        return outcome

    if mode != "ABodyBuilder2":
        outcome.failure_reason = f"unknown_mode:{mode}"
        return outcome

    # Paired mode: a light chain is mandatory.
    light_present = vl_raw is not None and bool(vl_raw.strip())
    if not light_present:
        outcome.failure_reason = "light_chain_missing_in_paired_mode"
        return outcome

    light, light_failure, light_notes = sanitize_chain(vl_raw, "L")
    outcome.warnings.extend(light_notes)
    if light_failure:
        outcome.failure_reason = light_failure
    else:
        outcome.vl = light
    return outcome
|