protonate-utils 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- protonate_utils-0.1.0.dist-info/METADATA +294 -0
- protonate_utils-0.1.0.dist-info/RECORD +7 -0
- protonate_utils-0.1.0.dist-info/WHEEL +5 -0
- protonate_utils-0.1.0.dist-info/entry_points.txt +2 -0
- protonate_utils-0.1.0.dist-info/licenses/LICENSE +21 -0
- protonate_utils-0.1.0.dist-info/top_level.txt +1 -0
- protonate_utils.py +827 -0
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: protonate-utils
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Add hydrogens to ligands and proteins at a target pH.
|
|
5
|
+
Author-email: Patrick Walters <wpwalters@gmail.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Patrick Walters
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/PatWalters/protonate_utils
|
|
29
|
+
Project-URL: Repository, https://github.com/PatWalters/protonate_utils
|
|
30
|
+
Project-URL: Issues, https://github.com/PatWalters/protonate_utils/issues
|
|
31
|
+
Keywords: cheminformatics,protonation,hydrogens,rdkit,pdb
|
|
32
|
+
Classifier: Programming Language :: Python :: 3
|
|
33
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
34
|
+
Classifier: Operating System :: OS Independent
|
|
35
|
+
Classifier: Topic :: Scientific/Engineering :: Chemistry
|
|
36
|
+
Requires-Python: >=3.9
|
|
37
|
+
Description-Content-Type: text/markdown
|
|
38
|
+
License-File: LICENSE
|
|
39
|
+
Requires-Dist: rdkit
|
|
40
|
+
Requires-Dist: dimorphite-dl
|
|
41
|
+
Requires-Dist: biotite
|
|
42
|
+
Requires-Dist: hydride
|
|
43
|
+
Requires-Dist: numpy
|
|
44
|
+
Provides-Extra: test
|
|
45
|
+
Requires-Dist: pytest; extra == "test"
|
|
46
|
+
Dynamic: license-file
|
|
47
|
+
|
|
48
|
+
# protonate_utils
|
|
49
|
+
|
|
50
|
+
A single utility for adding hydrogens to **ligands** and **proteins** at a
|
|
51
|
+
target pH, for use in molecular modeling and structure-based drug design.
|
|
52
|
+
|
|
53
|
+
## Why this exists
|
|
54
|
+
|
|
55
|
+
Most structures you download — a ligand from a database, a protein from the
|
|
56
|
+
PDB — are missing hydrogens, or carry hydrogens that don't reflect the
|
|
57
|
+
protonation state at physiological pH. Getting these right matters: a
|
|
58
|
+
carboxylic acid is deprotonated (`-COO⁻`) at pH 7.4, a basic amine is
|
|
59
|
+
protonated (`-NH₃⁺`), and a histidine side chain can go either way. Downstream
|
|
60
|
+
tasks — docking, free-energy calculations, MD simulations, electrostatics —
|
|
61
|
+
all depend on the correct charge and hydrogen placement.
|
|
62
|
+
|
|
63
|
+
Ligands and proteins need different tools for this. Small molecules are best
|
|
64
|
+
handled with cheminformatics pKa models; proteins need residue-aware logic and
|
|
65
|
+
geometry-based hydrogen placement. `protonate_utils.py` wraps the appropriate
|
|
66
|
+
specialist tool for each case behind one consistent interface, so you don't
|
|
67
|
+
have to remember two separate workflows:
|
|
68
|
+
|
|
69
|
+
- **Ligands** use [Dimorphite-DL](https://github.com/durrantlab/dimorphite_dl)
|
|
70
|
+
for pH-aware protonation states and [the RDKit](https://www.rdkit.org/) for
|
|
71
|
+
structure handling. When the input has 3D coordinates, the heavy-atom
|
|
72
|
+
geometry is preserved exactly — only the newly added hydrogens are given
|
|
73
|
+
computed positions.
|
|
74
|
+
- **Proteins** use [Hydride](https://hydride.biotite-python.org/) for
|
|
75
|
+
geometry-based hydrogen addition and
|
|
76
|
+
[Biotite](https://www.biotite-python.org/) for PDB handling, with formal
|
|
77
|
+
charges estimated per amino acid at the requested pH.
|
|
78
|
+
|
|
79
|
+
Everything is exposed both as a **command-line tool** and as an importable
|
|
80
|
+
**Python API**.
|
|
81
|
+
|
|
82
|
+
## Installation
|
|
83
|
+
|
|
84
|
+
Clone the repo and install it with `pip`:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
git clone https://github.com/PatWalters/protonate_utils
|
|
88
|
+
cd protonate_utils
|
|
89
|
+
pip install -e .
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
This installs the dependencies for both modes (RDKit + Dimorphite-DL for
|
|
93
|
+
ligands, Biotite + Hydride + NumPy for proteins), puts a `protonate-utils`
|
|
94
|
+
command on your `PATH`, and makes `import protonate_utils` available.
|
|
95
|
+
|
|
96
|
+
## Command-line usage
|
|
97
|
+
|
|
98
|
+
Once installed, use the `protonate-utils` command. The first argument selects
|
|
99
|
+
the mode: `ligand` or `protein`. (You can also run it without installing via
|
|
100
|
+
`python protonate_utils.py …` from a checkout.)
|
|
101
|
+
|
|
102
|
+
### Ligands
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
# SDF in, SDF out (3D coordinates preserved, hydrogens placed from geometry)
|
|
106
|
+
protonate-utils ligand input.sdf output.sdf
|
|
107
|
+
|
|
108
|
+
# SMILES in, SMILES out, at a custom pH
|
|
109
|
+
protonate-utils ligand input.smi output.smi --ph 7.4
|
|
110
|
+
|
|
111
|
+
# Mixed: read SDF, write SMILES
|
|
112
|
+
protonate-utils ligand input.sdf output.smi
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Input and output formats are inferred from the file extension:
|
|
116
|
+
`.smi`/`.smiles` are treated as SMILES, anything else as SDF. SMILES files are
|
|
117
|
+
read one molecule per line as `SMILES [optional name]`.
|
|
118
|
+
|
|
119
|
+
| Option | Default | Description |
|
|
120
|
+
|----------|---------|--------------------------------------|
|
|
121
|
+
| `--ph` | `7.4` | Target pH for protonation. |
|
|
122
|
+
|
|
123
|
+
Molecules that fail to parse or protonate are skipped with a warning on
|
|
124
|
+
stderr; the run reports how many were read, written, and skipped.
|
|
125
|
+
|
|
126
|
+
### Proteins
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
# Remove a bound ligand by residue name, then add hydrogens
|
|
130
|
+
protonate-utils protein input.pdb AP5 output.pdb
|
|
131
|
+
|
|
132
|
+
# Keep everything (no ligand removal)
|
|
133
|
+
protonate-utils protein input.pdb none output.pdb --ph 7.0
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
The second positional argument is the residue name (3-letter CCD code) of a
|
|
137
|
+
ligand to remove before protonation — pass `none` to keep all atoms. Output
|
|
138
|
+
hydrogens are reordered so each one immediately follows the heavy atom it is
|
|
139
|
+
bonded to.
|
|
140
|
+
|
|
141
|
+
| Option | Default | Description |
|
|
142
|
+
|--------------|---------|-----------------------------------------------------|
|
|
143
|
+
| `--ph` | `7.0` | pH used to estimate amino-acid formal charges. |
|
|
144
|
+
| `--no-relax` | off | Skip dihedral relaxation of the added hydrogens. |
|
|
145
|
+
|
|
146
|
+
## Python API
|
|
147
|
+
|
|
148
|
+
Import the functions directly from `protonate_utils`. There are symmetric
|
|
149
|
+
in-memory and file-to-file entry points for both ligands and proteins.
|
|
150
|
+
|
|
151
|
+
| | Ligands | Proteins |
|
|
152
|
+
|------------------|------------------------------------------|-----------------------------------|
|
|
153
|
+
| In-memory core | `protonate_molecule(mol, ph)` | `protonate_structure(structure, …)` |
|
|
154
|
+
| Convenience | `protonate_smiles_string(smiles, ph)` | — |
|
|
155
|
+
| File → file | `protonate_ligands(in, out, ph)` | `prepare_structure(in, res, out, …)` |
|
|
156
|
+
| I/O helpers | `read_molecules(path)`, `make_writer(path)` | (Biotite `PDBFile`) |
|
|
157
|
+
|
|
158
|
+
### Ligands
|
|
159
|
+
|
|
160
|
+
Protonate a single SMILES string and get a SMILES string back:
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
from protonate_utils import protonate_smiles_string
|
|
164
|
+
|
|
165
|
+
protonate_smiles_string("CC(=O)O") # 'CC(=O)[O-]'
|
|
166
|
+
protonate_smiles_string("OP(=O)(O)O", ph=7.4) # 'O=P([O-])([O-])O'
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
`protonate_smiles_string` raises `ValueError` on an unparseable SMILES; other
|
|
170
|
+
failures (e.g. Dimorphite-DL cannot handle the molecule) propagate as
|
|
171
|
+
exceptions.
|
|
172
|
+
|
|
173
|
+
Protonate an RDKit `Mol` while preserving its 3D coordinates:
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
from rdkit import Chem
|
|
177
|
+
from protonate_utils import protonate_molecule, read_molecules
|
|
178
|
+
|
|
179
|
+
mol = next(read_molecules("ligand.sdf"))
|
|
180
|
+
protonated = protonate_molecule(mol, ph=7.4) # Mol with explicit Hs + coords
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
Pass `add_coord_hs=False` to keep protonation implicit (no explicit hydrogen
|
|
184
|
+
atoms added) — appropriate when you intend to serialize to SMILES.
|
|
185
|
+
|
|
186
|
+
Batch-convert a whole file (the CLI ligand path):
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
from protonate_utils import protonate_ligands
|
|
190
|
+
|
|
191
|
+
protonate_ligands("input.sdf", "output.sdf", ph=7.4)
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
### Proteins
|
|
195
|
+
|
|
196
|
+
Protonate an in-memory Biotite `AtomArray` and get a hydrogenated one back:
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
import biotite.structure.io.pdb as pdb
|
|
200
|
+
from protonate_utils import protonate_structure
|
|
201
|
+
|
|
202
|
+
structure = pdb.PDBFile.read("input.pdb").get_structure(model=1)
|
|
203
|
+
hydrogenated = protonate_structure(
|
|
204
|
+
structure,
|
|
205
|
+
ligand_res_name="AP5", # or None / "none" to keep all atoms
|
|
206
|
+
ph=7.0,
|
|
207
|
+
relax=True,
|
|
208
|
+
)
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
`protonate_structure` raises `ValueError` if `ligand_res_name` is given but no
|
|
212
|
+
atoms with that residue name exist. The returned `AtomArray` has hydrogens
|
|
213
|
+
added and reordered to follow their bonded heavy atoms.
|
|
214
|
+
|
|
215
|
+
Read a PDB, protonate, and write a PDB in one call (the CLI protein path):
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
from protonate_utils import prepare_structure
|
|
219
|
+
|
|
220
|
+
prepare_structure("input.pdb", "AP5", "output.pdb", ph=7.0, relax=True)
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
## How it works
|
|
224
|
+
|
|
225
|
+
### Ligand protonation
|
|
226
|
+
|
|
227
|
+
1. Pre-existing hydrogens are stripped; any 3D conformer on the heavy atoms is
|
|
228
|
+
kept.
|
|
229
|
+
2. Dimorphite-DL enumerates candidate microstate(s) within a ±0.5 pH window.
|
|
230
|
+
One is chosen deterministically by a **site-by-site plausibility** check
|
|
231
|
+
rather than by net charge — see
|
|
232
|
+
[Correcting Dimorphite-DL microstates](#correcting-dimorphite-dl-microstates)
|
|
233
|
+
below — and any residual implausible ionization is repaired against the
|
|
234
|
+
input. The SMILES string is a final tiebreak, so re-runs are stable.
|
|
235
|
+
3. The chosen template's formal charges **and** total hydrogen counts are
|
|
236
|
+
mapped back onto the original atoms via a charge-insensitive substructure
|
|
237
|
+
match (so `-COOH` still matches `-COO⁻`). Carrying the H count — not just
|
|
238
|
+
the charge — keeps the RDKit's kekulization correct on aromatic heterocycles.
|
|
239
|
+
4. With 3D input, `Chem.AddHs(addCoords=True)` adds hydrogens positioned from
|
|
240
|
+
the existing geometry; heavy-atom coordinates are never moved. Without
|
|
241
|
+
coordinates (SMILES), protonation stays implicit.
|
|
242
|
+
|
|
243
|
+
### Correcting Dimorphite-DL microstates
|
|
244
|
+
|
|
245
|
+
Dimorphite-DL enumerates *every* microstate whose modeled pKa falls anywhere
|
|
246
|
+
near the pH window, including many that are negligibly populated at pH 7.4. Left
|
|
247
|
+
to a "most ionized" or "closest net charge" rule, the selector picks chemically
|
|
248
|
+
wrong states: it deprotonates amides and phenols and protonates anilines. We add
|
|
249
|
+
a per-atom legitimacy check (`_charge_change_is_legitimate`) that compares each
|
|
250
|
+
candidate to the input atom-by-atom and accepts a formal-charge change only when
|
|
251
|
+
that group genuinely ionizes near physiological pH:
|
|
252
|
+
|
|
253
|
+
| Group | Typical pKa | At pH 7.4 | Dimorphite enumerates | We |
|
|
254
|
+
|-------|-------------|-----------|-----------------------|----|
|
|
255
|
+
| Aliphatic amine | pKaH ~10 | cation | both | **protonate** |
|
|
256
|
+
| Amidine / guanidine | pKaH ~12–13 | cation | both | **protonate** |
|
|
257
|
+
| Carboxylic acid | ~4 | anion | anion | **deprotonate** |
|
|
258
|
+
| Sulfonic / sulfinic / phosphate / phosphonate | <2–7 | anion | anion | **deprotonate** |
|
|
259
|
+
| Sulfonamide / acylsulfonamide / tetrazole | ~3–10 | anion | both | **deprotonate** |
|
|
260
|
+
| Carboxamide N–H | ~17–22 | neutral | both → `[N⁻]` *or* `[NH⁺]` | **keep neutral** |
|
|
261
|
+
| Aniline / amino-heteroarene | pKaH ~3–5 | neutral | both → `[NH⁺]` | **keep neutral** |
|
|
262
|
+
| Cyanamide (N–C≡N) | pKaH ~0 | neutral | both → `[NH⁺]` | **keep neutral** |
|
|
263
|
+
| Imidazole / pyrazole / indazole / indole / triazole N–H | ~10–17 | neutral | both → `[n⁻]` | **keep neutral** |
|
|
264
|
+
| Phenol / alcohol | ~10–16 | neutral | both → `[O⁻]` | **keep neutral** |
|
|
265
|
+
| Plain thiol / thione | ~7–10 | neutral | both → `[S⁻]` | **keep neutral** |
|
|
266
|
+
|
|
267
|
+
Two further safeguards:
|
|
268
|
+
|
|
269
|
+
- **Repair fallback.** When Dimorphite offers *only* an implausibly-ionized
|
|
270
|
+
microstate (e.g. it returns just the `[N⁻]` form of an O-alkyl hydroxamate or
|
|
271
|
+
imide, with no neutral alternative to select), the offending site is reverted
|
|
272
|
+
to the input's protonation rather than emitted as-is.
|
|
273
|
+
- **Input charges preserved.** A change is only judged relative to the input, so
|
|
274
|
+
charges already present in the SMILES — quaternary ammonium salts, *N*-oxides,
|
|
275
|
+
mesoionic zwitterions — are never altered.
|
|
276
|
+
|
|
277
|
+
Borderline acids/bases whose pKa sits right at 7.4 (e.g. *p*-nitrophenol ~7.15,
|
|
278
|
+
mercaptoazoles ~7) are deliberately defaulted to neutral; they are ~50/50 at
|
|
279
|
+
physiological pH, so this is at least as defensible as ionizing them and avoids
|
|
280
|
+
mis-ionizing the far more common ordinary phenols and amides. Validated across
|
|
281
|
+
the 2,173-molecule Biogen logS set: no skips, no heavy-atom changes, and the
|
|
282
|
+
selection is deterministic.
|
|
283
|
+
|
|
284
|
+
### Protein protonation
|
|
285
|
+
|
|
286
|
+
1. Optionally remove a ligand by residue name, then strip any existing
|
|
287
|
+
hydrogens.
|
|
288
|
+
2. Assign covalent bonds from CCD residue templates
|
|
289
|
+
(`connect_via_residue_names`).
|
|
290
|
+
3. Estimate per-residue formal charges for canonical amino acids at the
|
|
291
|
+
requested pH (`hydride.estimate_amino_acid_charges`).
|
|
292
|
+
4. Add hydrogens with Hydride and, by default, relax their geometry.
|
|
293
|
+
5. Reorder atoms so each hydrogen immediately follows the heavy atom it is
|
|
294
|
+
bonded to.
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
protonate_utils.py,sha256=eBk8YVXQa4_6D32sewXyBJJ-hVfwh3DqdHoho4PQuos,32105
|
|
2
|
+
protonate_utils-0.1.0.dist-info/licenses/LICENSE,sha256=Qavx6RZFwj7qFjJ2V7LUMTeuYsOV9p5yXL1E8xf7WMg,1072
|
|
3
|
+
protonate_utils-0.1.0.dist-info/METADATA,sha256=TKGgIP0wlAb1l1mmGUVMai2DrV4XDc-61EENk29wGAY,12948
|
|
4
|
+
protonate_utils-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
5
|
+
protonate_utils-0.1.0.dist-info/entry_points.txt,sha256=je5fmMyHIYDcSYTnlcmFQyiORt-4C37WEPQAdHlzwfk,57
|
|
6
|
+
protonate_utils-0.1.0.dist-info/top_level.txt,sha256=5AXq8us0pha4ORS_L-5DdMpziL307cpPjLt-8AqiSZk,16
|
|
7
|
+
protonate_utils-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Patrick Walters
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
protonate_utils
|
protonate_utils.py
ADDED
|
@@ -0,0 +1,827 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
"""
|
|
3
|
+
Protonation utilities for both ligands and proteins, selected by the
|
|
4
|
+
first command-line argument:
|
|
5
|
+
|
|
6
|
+
protonate_utils.py ligand ... # small molecules (SDF/SMILES)
|
|
7
|
+
protonate_utils.py protein ... # protein structures (PDB)
|
|
8
|
+
|
|
9
|
+
Ligand mode
|
|
10
|
+
-----------
|
|
11
|
+
Add hydrogens to small molecules at a target pH (default 7.4). Input and
|
|
12
|
+
output may be either SDF or SMILES; the format is inferred from the file
|
|
13
|
+
extension (.smi/.smiles for SMILES, otherwise SDF).
|
|
14
|
+
|
|
15
|
+
pH-aware protonation states are predicted with Dimorphite-DL, the
|
|
16
|
+
resulting formal charges *and* H counts are mapped back onto the
|
|
17
|
+
original molecule via substructure matching, and explicit hydrogens are
|
|
18
|
+
added with Chem.AddHs(addCoords=True). When the input has 3D coordinates
|
|
19
|
+
(SDF), heavy-atom positions are not disturbed; only the newly added
|
|
20
|
+
hydrogens get computed positions based on the existing geometry. SMILES
|
|
21
|
+
input has no coordinates, so protonation is applied without any geometry.
|
|
22
|
+
|
|
23
|
+
Protein mode
|
|
24
|
+
------------
|
|
25
|
+
Read a local PDB file, optionally remove a ligand by residue name, add
|
|
26
|
+
hydrogens with Hydride at a target pH (default 7.0), and write the
|
|
27
|
+
result to a PDB file. Hydrogens are reordered so that each hydrogen
|
|
28
|
+
immediately follows the heavy atom to which it is bonded.
|
|
29
|
+
|
|
30
|
+
Install:
|
|
31
|
+
pip install rdkit dimorphite-dl # ligand mode
|
|
32
|
+
pip install biotite hydride numpy # protein mode
|
|
33
|
+
|
|
34
|
+
Usage:
|
|
35
|
+
protonate_utils.py ligand input.sdf output.sdf
|
|
36
|
+
protonate_utils.py ligand input.smi output.smi --ph 7.4
|
|
37
|
+
protonate_utils.py protein input.pdb AP5 output.pdb
|
|
38
|
+
protonate_utils.py protein input.pdb none output.pdb --ph 7.0
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
import argparse
|
|
42
|
+
import sys
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
# Ligand mode (RDKit + Dimorphite-DL)
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
def _skeleton_copy(mol):
    """
    Build a protonation-agnostic copy of `mol` to serve as a
    substructure-match template.

    The copy lets an input molecule be aligned against a Dimorphite-DL
    microstate of itself even though the two differ in formal charges and
    hydrogen counts. On every atom the formal charge, explicit-H count and
    radical-electron count are zeroed, and implicit Hs are disabled so that a
    neutralized cation cannot overflow its valence.

    Bond orders and aromaticity are left exactly as they are on `mol`: they
    are what tells an amidine's ``=N`` from its ``-N``, and flattening them
    would let the charge/H mapping land on the wrong nitrogen.

    No full sanitization is run. Re-kekulizing a neutralized aromatic cation
    (e.g. a pyridinium ``[nH+]``) is what made indazole/imidazole molecules
    fail, so only a non-strict property-cache update and a fast ring
    perception are performed -- which is all substructure matching needs.

    Returns a new ``Chem.RWMol``; the edits are applied to the copy, not to
    `mol`.
    """
    from rdkit import Chem

    skeleton = Chem.RWMol(mol)
    for atom in skeleton.GetAtoms():
        # Erase everything that encodes a protonation state on this atom.
        atom.SetFormalCharge(0)
        atom.SetNumExplicitHs(0)
        atom.SetNoImplicit(True)
        atom.SetNumRadicalElectrons(0)

    # Light refresh only (no sanitize): valence cache, then ring membership.
    skeleton.UpdatePropertyCache(strict=False)
    Chem.FastFindRings(skeleton)
    return skeleton
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _is_amide_nitrogen(n_atom):
    """
    Report whether `n_atom` is a plain (thio)carboxamide nitrogen.

    The answer is True when at least one carbon neighbour carries a double
    bond to O or S *and* no sulfur neighbour is a sulfonyl (a sulfur with two
    or more double-bonded oxygens). A plain carboxamide N-H has pKa ~17-22
    and stays neutral at physiological pH, whereas an (acyl)sulfonamide N-H
    is genuinely acidic; that case is excluded here so the caller can treat
    its deprotonation as legitimate.

    The element of `n_atom` itself is not checked; only its neighbours are
    inspected.
    """
    from rdkit import Chem

    double = Chem.BondType.DOUBLE
    carbonyl_found = False
    sulfonyl_found = False

    for neighbour in n_atom.GetNeighbors():
        atomic_num = neighbour.GetAtomicNum()
        if atomic_num == 6:
            # Carbon bearing C=O or C=S -> (thio)carbonyl.
            if any(
                bond.GetBondType() == double
                and bond.GetOtherAtom(neighbour).GetAtomicNum() in (8, 16)
                for bond in neighbour.GetBonds()
            ):
                carbonyl_found = True
        elif atomic_num == 16:
            # Sulfonyl S(=O)(=O) neighbour -> acidic (acyl)sulfonamide.
            oxo_bonds = [
                bond for bond in neighbour.GetBonds()
                if bond.GetBondType() == double
                and bond.GetOtherAtom(neighbour).GetAtomicNum() == 8
            ]
            if len(oxo_bonds) >= 2:
                sulfonyl_found = True

    if sulfonyl_found:
        return False
    return carbonyl_found
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _nitrogen_is_acylated_or_sulfonylated(n_atom):
    """
    Report whether `n_atom` sits next to a carbonyl/thiocarbonyl carbon or a
    sulfonyl sulfur.

    A nitrogen in that position (amide, imide, sulfonamide,
    N-acylsulfonamide) has its lone pair tied up by the adjacent
    electron-withdrawing group and is not basic, so it must never be
    *protonated* at physiological pH. This deliberately also covers the
    acidic acylsulfonamide/imide cases that `_is_amide_nitrogen` excludes in
    order to keep their *deprotonation* allowed.

    Returns True on the first qualifying neighbour, otherwise False.
    """
    from rdkit import Chem

    double = Chem.BondType.DOUBLE

    def double_bond_partners(centre):
        # Atomic numbers of every atom double-bonded to `centre`.
        return [
            bond.GetOtherAtom(centre).GetAtomicNum()
            for bond in centre.GetBonds()
            if bond.GetBondType() == double
        ]

    for neighbour in n_atom.GetNeighbors():
        atomic_num = neighbour.GetAtomicNum()
        if atomic_num == 6:
            # C=O or C=S on the adjacent carbon.
            if any(z in (8, 16) for z in double_bond_partners(neighbour)):
                return True
        elif atomic_num == 16:
            # Sulfonyl: at least two S=O bonds on the adjacent sulfur.
            if double_bond_partners(neighbour).count(8) >= 2:
                return True
    return False
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _is_aromatic_amine_nitrogen(n_atom):
    """
    Report whether `n_atom` is an aniline-type (aromatic-amine) nitrogen.

    That means a nitrogen that is not itself aromatic, is not a
    (thio)carboxamide nitrogen according to `_is_amide_nitrogen`, and has at
    least one aromatic neighbour. Its lone pair is delocalised into the ring,
    making it a weak base (aniline pKaH ~4.6;
    amino-pyridines/-pyrimidines/-azines pKaH ~3-5) that stays essentially
    neutral at pH 7.4 -- yet Dimorphite-DL still enumerates a protonated
    microstate for it.

    Not matched, and therefore still protonatable: an *aliphatic* amine (no
    aromatic neighbour) and the C=N nitrogens of an
    amidine/guanidine/benzamidine, whose neighbouring carbon is not itself
    aromatic. Strongly-basic amino-heteroarenes (e.g. 2-aminoimidazole,
    4-aminopyridine) protonate on their *ring* nitrogen, which is a different
    atom, so they are unaffected.
    """
    is_non_aromatic_nitrogen = (
        n_atom.GetAtomicNum() == 7 and not n_atom.GetIsAromatic()
    )
    if not is_non_aromatic_nitrogen or _is_amide_nitrogen(n_atom):
        return False

    for neighbour in n_atom.GetNeighbors():
        if neighbour.GetIsAromatic():
            return True
    return False
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _is_cyanamide_nitrogen(n_atom):
    """
    Report whether `n_atom` is the amino nitrogen of a cyanamide (N-C#N).

    True when `n_atom` is a nitrogen with a carbon neighbour that is
    triple-bonded to a *different* nitrogen. The nitrile is strongly
    electron-withdrawing and ties up the lone pair, so a dialkylcyanamide has
    pKaH ~0 (cyanamide itself is faintly *acidic*, pKa ~10) and is non-basic
    at pH 7.4. Dimorphite-DL nonetheless enumerates a protonated microstate
    for it, which must be rejected.
    """
    from rdkit import Chem

    if n_atom.GetAtomicNum() != 7:
        return False

    own_index = n_atom.GetIdx()
    carbon_neighbours = [
        neighbour for neighbour in n_atom.GetNeighbors()
        if neighbour.GetAtomicNum() == 6
    ]
    for carbon in carbon_neighbours:
        for bond in carbon.GetBonds():
            if bond.GetBondType() != Chem.BondType.TRIPLE:
                continue
            partner = bond.GetOtherAtom(carbon)
            # The nitrile N must be another atom, not `n_atom` itself.
            if partner.GetAtomicNum() == 7 and partner.GetIdx() != own_index:
                return True
    return False
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _bonded_to_acidifying_centre(atom):
    """
    True if `atom` is bonded to an electron-withdrawing centre that makes an
    O-H/S-H on it a strong acid (pKa < ~7): a carbonyl/thiocarbonyl carbon
    (carboxyl/thioacid), a phosphorus oxyacid, or a sulfur oxyacid. Used to
    tell a genuine acid (carboxyl pKa ~4, sulfonic <2, phosphate ~1-7) apart
    from a weak one whose conjugate base is essentially absent at pH 7.4.

    For a carbon neighbour, the double bond that qualifies it as a
    (thio)carbonyl must lead to an atom *other than* `atom`; otherwise a
    carbonyl oxygen (or thiocarbonyl sulfur) would count its own C=O/C=S bond
    and be reported as acidic.

    Returns True on the first qualifying neighbour, otherwise False.
    """
    from rdkit import Chem

    atom_idx = atom.GetIdx()
    for nbr in atom.GetNeighbors():
        z = nbr.GetAtomicNum()
        if z in (15, 16):
            # Phosphorus oxyacid, or sulfonic/sulfinic acid: the neighbouring
            # P/S bears at least one double-bonded oxygen.
            if any(
                b.GetBondType() == Chem.BondType.DOUBLE
                and b.GetOtherAtom(nbr).GetAtomicNum() == 8
                for b in nbr.GetBonds()
            ):
                return True
        elif z == 6:
            # Carbonyl/thiocarbonyl carbon -> carboxyl / thioacid.
            for b in nbr.GetBonds():
                other = b.GetOtherAtom(nbr)
                # Exclude `atom` itself by index, not by ``is``: the RDKit
                # accessors return a fresh Python wrapper on each call, so an
                # identity test never recognises the same atom and the
                # self-exclusion would silently do nothing.
                if (b.GetBondType() == Chem.BondType.DOUBLE
                        and other.GetIdx() != atom_idx
                        and other.GetAtomicNum() in (8, 16)):
                    return True
    return False
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _is_acidic_oxygen(o_atom):
    """
    Report whether removing the proton from this oxygen's O-H yields an anion
    that is really present at pH 7.4.

    That holds for the oxygen of a carboxyl, sulfonic/sulfinic, or phosphorus
    oxyacid (pKa < ~7), detected via `_bonded_to_acidifying_centre`. A phenol
    (O on an aromatic carbon, pKa ~10), an alcohol (O on sp3 carbon, pKa
    ~16), or a hydroxy-heteroarene (really a neutral lactam tautomer) is >90%
    neutral at physiological pH; Dimorphite-DL still enumerates its ``[O-]``
    microstate, so those are rejected -- the acid-side counterpart of the
    weak amide/azole N-H.
    """
    if o_atom.GetAtomicNum() != 8:
        return False
    return _bonded_to_acidifying_centre(o_atom)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _is_acidic_sulfur(s_atom):
    """
    Report whether removing the proton from this sulfur's S-H yields a
    thiolate that is really present at pH 7.4.

    That holds for a thioacid S adjacent to a carbonyl (pKa ~3) or for a
    sulfur on a sulfur/phosphorus oxyacid, detected via
    `_bonded_to_acidifying_centre`. A plain alkyl thiol (pKa ~10.5) or an
    aromatic thiol/thione (pKa ~7, e.g. a mercaptoazole) is predominantly
    neutral, so its Dimorphite-enumerated ``[S-]`` microstate is rejected.
    """
    if s_atom.GetAtomicNum() != 16:
        return False
    return _bonded_to_acidifying_centre(s_atom)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _is_acidic_aromatic_nitrogen(n_atom):
    """
    Report whether `n_atom` belongs to a ring N-H system acidic enough to
    deprotonate near physiological pH.

    The test is purely a ring-nitrogen count: True when any ring containing
    `n_atom` holds four or more nitrogens. In practice that singles out
    tetrazole-grade azoles (N-H pKa ~4.9). The common aromatic N-H
    heterocycles -- pyrrole/indole (1 ring N, pKa ~17), imidazole/pyrazole
    (2 N, pKa ~14), triazole (3 N, pKa ~10) -- are >99% neutral at pH 7.4, so
    their Dimorphite-enumerated ``[n-]`` microstates must be rejected.

    Neither the element nor the aromaticity of `n_atom` is checked here.
    """
    owner = n_atom.GetOwningMol()
    rings = owner.GetRingInfo()
    atom_index = n_atom.GetIdx()

    for ring_atoms in rings.AtomRings():
        if atom_index not in ring_atoms:
            continue
        nitrogen_total = len([
            member for member in ring_atoms
            if owner.GetAtomWithIdx(member).GetAtomicNum() == 7
        ])
        # One sufficiently nitrogen-rich ring is enough.
        if nitrogen_total >= 4:
            return True
    return False
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def _charge_change_is_legitimate(atom, delta_q):
    """
    Judge whether shifting `atom`'s formal charge by `delta_q` (candidate
    charge minus input charge) is a believable ionization near
    physiological pH.

    Positive `delta_q` (protonation to a cation) is accepted only on a
    nitrogen that is not acylated/sulfonylated, not an aromatic-amine
    nitrogen and not a cyanamide nitrogen -- i.e. a genuine nitrogen base
    (amine, amidine, guanidine, aromatic N).

    Otherwise the change is treated as deprotonation to an anion. That is
    accepted on a strong oxygen or sulfur acid (carboxyl, sulfonic/sulfinic,
    phosphate, thioacid) and on a genuinely acidic nitrogen (sulfonamide,
    tetrazole, ...). It is rejected on the weakly acidic groups for which
    Dimorphite-DL still enumerates an anion: a plain carboxamide (pKa
    ~17-22), an aromatic N-H heterocycle (imidazole/pyrazole/indazole/
    indole, pKa ~13-17), a phenol (pKa ~10), an alcohol (pKa ~16) or a plain
    thiol/thione (pKa ~7-10), all >90% neutral at pH 7.4. Any element other
    than N, O or S is rejected.
    """
    element = atom.GetAtomicNum()

    if delta_q > 0:
        # Cation formation: an amide nitrogen is *not* basic (conjugate acid
        # pKa ~0), so its Dimorphite-enumerated [NH+] microstate is refused,
        # as are aromatic-amine and cyanamide nitrogens.
        return element == 7 and not (
            _nitrogen_is_acylated_or_sulfonylated(atom)
            or _is_aromatic_amine_nitrogen(atom)
            or _is_cyanamide_nitrogen(atom)
        )

    # Anion formation from here on.
    if element == 8:
        # Carboxyl/sulfonate/phosphate oxygen ionizes; phenol/alcohol
        # (pKa ~10-16) does not at pH 7.4.
        return _is_acidic_oxygen(atom)
    if element == 16:
        # Thioacid sulfur ionizes; a plain thiol/thione does not.
        return _is_acidic_sulfur(atom)
    if element != 7:
        return False
    if _is_amide_nitrogen(atom):
        return False
    # A non-aromatic nitrogen passes; an aromatic one must be acidic enough.
    return not atom.GetIsAromatic() or _is_acidic_aromatic_nitrogen(atom)
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def _count_illegitimate_ionizations(input_mol, cand_mol):
    """
    Count the formal-charge changes in `cand_mol`, relative to `input_mol`,
    that `_charge_change_is_legitimate` refuses.

    The two molecules share a heavy-atom skeleton and differ only in
    protonation, so they are paired atom-by-atom through a skeleton
    substructure match. Because the comparison is against the input rather
    than against a neutral form, a charge the input already carries is never
    counted; only newly introduced, implausible ionizations are.

    If the skeletons cannot be matched, the sentinel 1_000_000 is returned
    so the candidate sorts last instead of aborting the selection.
    """
    input_skeleton = _skeleton_copy(input_mol)
    cand_skeleton = _skeleton_copy(cand_mol)
    # Position k of the mapping is the input-atom index paired with
    # candidate atom k.
    mapping = input_skeleton.GetSubstructMatch(cand_skeleton)
    if not mapping or len(mapping) != cand_mol.GetNumAtoms():
        return 1_000_000

    def charge_shifts():
        for cand_idx, input_idx in enumerate(mapping):
            cand_atom = cand_mol.GetAtomWithIdx(cand_idx)
            input_atom = input_mol.GetAtomWithIdx(input_idx)
            yield cand_atom, (
                cand_atom.GetFormalCharge() - input_atom.GetFormalCharge()
            )

    return sum(
        1
        for cand_atom, shift in charge_shifts()
        if shift != 0 and not _charge_change_is_legitimate(cand_atom, shift)
    )
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _repair_illegitimate_ionizations(input_mol, cand_smiles):
    """
    Undo every remaining illegitimate ionization in `cand_smiles` by
    restoring the input's protonation at the affected atoms.

    `_pick_state` can only choose among the microstates Dimorphite-DL
    offers. For an activated-but-not-acidic nitrogen -- an O-alkyl
    hydroxamate, an acylhydrazide, a plain imide -- Dimorphite may return
    *only* the deprotonated ``[N-]`` form, leaving no neutral alternative.
    This function pairs the candidate with the input atom-by-atom and, for
    each formal-charge change that `_charge_change_is_legitimate` refuses,
    copies the input atom's formal charge and total hydrogen count onto the
    candidate atom. Legitimate changes (carboxyl, sulfonamide, tetrazole,
    acylsulfonamide) are left alone.

    Returns a canonical SMILES of the repaired molecule. `cand_smiles` is
    returned unchanged when `input_mol` is None, the candidate cannot be
    parsed, the two cannot be aligned, nothing needed repair, or the
    repaired structure fails to sanitize.
    """
    from rdkit import Chem

    cand_mol = None if input_mol is None else Chem.MolFromSmiles(cand_smiles)
    if cand_mol is None:
        return cand_smiles

    # Position k of the mapping is the input-atom index paired with
    # candidate atom k.
    mapping = _skeleton_copy(input_mol).GetSubstructMatch(
        _skeleton_copy(cand_mol)
    )
    if not (mapping and len(mapping) == cand_mol.GetNumAtoms()):
        return cand_smiles

    editable = Chem.RWMol(cand_mol)
    reverted = 0
    for cand_idx, input_idx in enumerate(mapping):
        cand_atom = editable.GetAtomWithIdx(cand_idx)
        input_atom = input_mol.GetAtomWithIdx(input_idx)
        shift = cand_atom.GetFormalCharge() - input_atom.GetFormalCharge()
        if shift == 0 or _charge_change_is_legitimate(cand_atom, shift):
            continue
        # Pin both charge and H count so the atom state is fully specified.
        cand_atom.SetFormalCharge(input_atom.GetFormalCharge())
        cand_atom.SetNumExplicitHs(input_atom.GetTotalNumHs())
        cand_atom.SetNoImplicit(True)
        reverted += 1

    if reverted == 0:
        return cand_smiles

    try:
        repaired = editable.GetMol()
        Chem.SanitizeMol(repaired)
    except Exception:
        # Best effort: an unsanitizable repair falls back to the candidate.
        return cand_smiles
    return Chem.MolToSmiles(repaired)
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def _pick_state(input_smiles, states):
    """
    Deterministically select one microstate from Dimorphite-DL's output.

    Dimorphite returns every microstate whose pKa falls inside the requested
    pH window, in an order that is not stable across Python runs. Taking
    `states[0]` would therefore make the pipeline non-deterministic (a
    secondary alkyl amide can come back as either NH or N-). Candidates are
    instead ranked by a three-part key, lowest first:

    1. **Site-by-site plausibility.** The number of formal-charge changes,
       relative to the input, that are not legitimate ionizations (see
       `_count_illegitimate_ionizations`). This keeps, for example, a
       neutral amide over its bogus ``C(=O)[N-]`` anion (N-H pKa ~17-22).
       The tier is 0 for every candidate when the input SMILES itself cannot
       be parsed.

    2. **Most ionized.** The negated sum of |formal charge| over all atoms,
       so that among equally plausible candidates the most ionized wins:
       ``CCC[NH3+]`` over ``CCCN``, and a zwitterion over its neutral form.
       Unlike "match the input charge", this does not collapse back to a
       neutral input drawn without explicit charges.

    3. **SMILES string**, as a final deterministic tiebreak.

    A candidate that cannot be parsed gets the key ``(1_000_000, 0, smi)``
    and so sorts last. The winner is finally passed through
    `_repair_illegitimate_ionizations`, because the best state on offer may
    still carry an illegitimate ionization when Dimorphite gave no cleaner
    alternative.
    """
    from rdkit import Chem

    reference = Chem.MolFromSmiles(input_smiles)

    def ranking_key(candidate):
        parsed = Chem.MolFromSmiles(candidate)
        if parsed is None:
            # Unparseable candidate: rank strictly last.
            return (1_000_000, 0, candidate)

        if reference is None:
            penalty = 0
        else:
            penalty = _count_illegitimate_ionizations(reference, parsed)

        total_charge = 0
        for atom in parsed.GetAtoms():
            total_charge += abs(atom.GetFormalCharge())

        # Negating the ionic character makes "most ionized" the minimum.
        return (penalty, -total_charge, candidate)

    winner = min(states, key=ranking_key)
    return _repair_illegitimate_ionizations(reference, winner)
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
def _target_atom_states(mol_heavy, ph):
    """
    Ask Dimorphite-DL for the protonation state at `ph` and express it as a
    dict ``{atom_idx: (formal_charge, total_num_hs)}`` keyed by the atom
    indices of `mol_heavy`.

    Dimorphite is queried over the window ``ph - 0.5`` to ``ph + 0.5`` and
    one of the returned microstates is selected with `_pick_state`.

    The total hydrogen count is returned alongside the charge on purpose:
    for aromatic heterocycles the charge alone underspecifies the atom and
    RDKit fails to kekulize after the change, whereas the template's H count
    fully constrains the bonding state.

    Raises RuntimeError if Dimorphite-DL returns no states, if RDKit cannot
    parse the chosen state, or if the chosen state cannot be aligned with
    `mol_heavy`.
    """
    from rdkit import Chem
    from dimorphite_dl import protonate_smiles

    smiles = Chem.MolToSmiles(mol_heavy)
    states = protonate_smiles(smiles, ph_min=ph - 0.5, ph_max=ph + 0.5)
    if not states:
        raise RuntimeError(f"Dimorphite-DL returned no states for {smiles!r}")

    chosen = _pick_state(smiles, states)
    template = Chem.MolFromSmiles(chosen)
    if template is None:
        raise RuntimeError(
            f"RDKit could not parse Dimorphite-DL output {chosen!r}"
        )

    # Align through charge/H/bond-order-agnostic skeletons, so that -COOH
    # still pairs with -COO- and a protonated aromatic heterocycle still
    # pairs with its neutral form.
    original_skeleton = _skeleton_copy(mol_heavy)
    template_skeleton = _skeleton_copy(template)
    mapping = original_skeleton.GetSubstructMatch(template_skeleton)
    if not mapping:
        raise RuntimeError(
            "Could not align protonation template with input molecule"
        )

    def state_of(template_idx):
        atom = template.GetAtomWithIdx(template_idx)
        return (atom.GetFormalCharge(), atom.GetTotalNumHs())

    # Position k of the mapping is the original-atom index paired with
    # template atom k.
    return {
        orig_idx: state_of(template_idx)
        for template_idx, orig_idx in enumerate(mapping)
    }
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
def protonate_molecule(mol, ph, add_coord_hs=True):
    """
    Return a Mol with pH-appropriate protonation.

    Pre-existing hydrogens are stripped, the per-atom formal charge and
    hydrogen count chosen by `_target_atom_states` for `ph` are written onto
    the heavy atoms, and the result is sanitized.

    When the input carries a conformer and `add_coord_hs` is set, explicit
    hydrogens are added and positioned from the existing geometry while the
    heavy-atom coordinates are preserved (this is the SDF-output path).
    Otherwise protonation is left implicit, which is what a SMILES writer
    wants and avoids hydrogens at bogus positions.

    The molecule name (``_Name``) and the public properties (SDF tags) of
    the input are copied onto the result as their original strings.

    Errors raised by `_target_atom_states` (RuntimeError) and by RDKit
    sanitization propagate to the caller.
    """
    from rdkit import Chem

    name = mol.GetProp("_Name") if mol.HasProp("_Name") else ""
    # Capture tags as raw strings. GetPropsAsDict() auto-converts
    # numeric-looking values (e.g. "007" -> 7, "1.50" -> 1.5), so restoring
    # str(value) from it would silently rewrite the original tag text.
    tags = {key: mol.GetProp(key) for key in mol.GetPropNames()}

    # Strip any pre-existing Hs; any conformer on heavy atoms is preserved.
    mol_heavy = Chem.RemoveHs(mol)
    has_coords = mol_heavy.GetNumConformers() > 0

    # Apply Dimorphite-DL's pH-appropriate charges and H counts to the
    # molecule. Setting NoImplicit=True with an explicit H count makes
    # the atom state fully determined, which keeps kekulization happy on
    # aromatic heterocycles.
    new_states = _target_atom_states(mol_heavy, ph)
    mol_heavy = Chem.RWMol(mol_heavy)
    for idx, (charge, n_hs) in new_states.items():
        atom = mol_heavy.GetAtomWithIdx(idx)
        atom.SetFormalCharge(charge)
        atom.SetNumExplicitHs(n_hs)
        atom.SetNoImplicit(True)
    Chem.SanitizeMol(mol_heavy)

    # With 3D coordinates, add explicit hydrogens positioned from the
    # existing heavy-atom geometry (heavy-atom coordinates are not
    # modified). Without coordinates, or when the caller doesn't want
    # them, keep the protonation implicit so a SMILES writer renders it
    # cleanly without bogus zeroed positions.
    if has_coords and add_coord_hs:
        protonated = Chem.AddHs(mol_heavy, addCoords=True)
    else:
        protonated = mol_heavy

    # Restore name and SDF tags verbatim.
    if name:
        protonated.SetProp("_Name", name)
    for key, value in tags.items():
        protonated.SetProp(key, value)
    return protonated
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
def protonate_smiles_string(smiles, ph=7.4):
    """
    Protonate one SMILES string at `ph` and return the resulting SMILES.

    String-in/string-out convenience wrapper over `protonate_molecule`;
    no coordinates are involved, so hydrogens are kept implicit.

    Raises ValueError if `smiles` cannot be parsed. Any other failure
    (e.g. Dimorphite-DL could not handle the molecule) propagates from
    `protonate_molecule`.
    """
    from rdkit import Chem

    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        return Chem.MolToSmiles(
            protonate_molecule(parsed, ph, add_coord_hs=False)
        )
    raise ValueError(f"Could not parse SMILES {smiles!r}")
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
def _is_smiles_path(path):
|
|
569
|
+
return path.lower().endswith((".smi", ".smiles"))
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def read_molecules(path):
    """
    Yield molecules read from `path`.

    A path recognised by `_is_smiles_path` is read as text, one molecule per
    non-blank line in the form "SMILES [optional name]"; the remainder of
    the line after the first whitespace run becomes the molecule's
    ``_Name``. Any other path is handed to RDKit's SDF supplier with
    hydrogens kept and sanitization on.

    Entries that fail to parse are yielded as None so callers can count and
    report them.
    """
    from rdkit import Chem

    if not _is_smiles_path(path):
        yield from Chem.SDMolSupplier(path, removeHs=False, sanitize=True)
        return

    with open(path) as handle:
        for raw in handle:
            entry = raw.strip()
            if entry:
                # First token is the SMILES; anything after it is the name.
                smiles, *label = entry.split(None, 1)
                mol = Chem.MolFromSmiles(smiles)
                if label and mol is not None:
                    mol.SetProp("_Name", label[0].strip())
                yield mol
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
def make_writer(path):
    """
    Build the RDKit writer matching the output extension: a headerless
    SMILES writer for paths accepted by `_is_smiles_path`, an SDF writer
    for everything else.
    """
    from rdkit import Chem

    if not _is_smiles_path(path):
        return Chem.SDWriter(path)
    return Chem.SmilesWriter(path, includeHeader=False)
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
def protonate_ligands(input_path, output_path, ph):
    """
    Batch-protonate a ligand file (SDF/SMILES) into another.

    Every molecule yielded by `read_molecules(input_path)` is passed through
    `protonate_molecule` at `ph` and written to `output_path` with the
    writer chosen by `make_writer`. Molecules that RDKit could not parse, or
    whose protonation raises, are skipped with a ``[warn]`` line on stderr.
    A one-line summary of read/written/skipped counts is printed to stdout
    on completion.

    The writer is closed even when reading or writing raises, so the output
    handle is never leaked; such an error still propagates to the caller.
    """
    writer = make_writer(output_path)
    # SMILES output never needs coordinate-bearing explicit hydrogens.
    add_coord_hs = not _is_smiles_path(output_path)

    n_in = n_out = n_fail = 0
    try:
        for mol in read_molecules(input_path):
            n_in += 1
            if mol is None:
                n_fail += 1
                print(
                    f"[warn] skipping molecule {n_in}: RDKit failed to parse",
                    file=sys.stderr,
                )
                continue
            try:
                protonated = protonate_molecule(
                    mol, ph, add_coord_hs=add_coord_hs
                )
            except Exception as exc:
                # Deliberate best effort: one bad molecule must not abort
                # the whole batch.
                n_fail += 1
                label = (
                    mol.GetProp("_Name") if mol.HasProp("_Name") else f"#{n_in}"
                )
                print(f"[warn] skipping {label}: {exc}", file=sys.stderr)
                continue
            writer.write(protonated)
            n_out += 1
    finally:
        writer.close()

    print(f"Read {n_in} molecules, wrote {n_out}, skipped {n_fail}.")
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
# ---------------------------------------------------------------------------
|
|
637
|
+
# Protein mode (Biotite + Hydride)
|
|
638
|
+
# ---------------------------------------------------------------------------
|
|
639
|
+
|
|
640
|
+
def reorder_hydrogens_after_heavy_atoms(atoms):
    """
    Build a new AtomArray in which every hydrogen directly follows the
    heavy atom it is bonded to.

    Heavy atoms keep their original relative order. The hydrogens attached
    to one heavy atom are emitted in ascending original index, i.e. in the
    order Hydride originally placed them. Atoms still unplaced afterwards
    (e.g. hydrogens with no heavy-atom bond) are appended at the end as a
    fallback.

    Raises ValueError if `atoms` has no BondList.
    """
    import numpy as np

    if atoms.bonds is None:
        raise ValueError(
            "AtomArray must have an associated BondList; "
            "call connect_via_residue_names() before reordering."
        )

    # One row of bonded-atom indices per atom; -1 entries are padding.
    bonded, _ = atoms.bonds.get_all_bonds()
    hydrogen_mask = atoms.element == "H"
    total = len(atoms)

    emitted = np.zeros(total, dtype=bool)
    ordering = []

    def emit(index):
        ordering.append(index)
        emitted[index] = True

    for heavy in range(total):
        if hydrogen_mask[heavy] or emitted[heavy]:
            continue
        emit(heavy)
        # Follow the heavy atom with its not-yet-placed hydrogens.
        attached = sorted(
            int(partner) for partner in bonded[heavy]
            if partner >= 0 and hydrogen_mask[partner] and not emitted[partner]
        )
        for hydrogen in attached:
            emit(hydrogen)

    # Whatever is left (e.g. unbonded hydrogens) goes last.
    for leftover in range(total):
        if not emitted[leftover]:
            emit(leftover)

    return atoms[ordering]
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
def protonate_structure(structure, ligand_res_name=None, ph=7.0, relax=True):
    """
    Produce a hydrogenated copy of a protein `AtomArray`.

    In-memory counterpart of `protonate_molecule` for proteins. It accepts
    an AtomArray (e.g. from
    ``pdb.PDBFile.read(path).get_structure(model=1)``) and returns a new one
    with hydrogens added by Hydride, using amino-acid formal charges
    estimated at `ph`, and with each hydrogen reordered to follow its bonded
    heavy atom.

    If `ligand_res_name` is given and is not "none" (any case), atoms whose
    residue name matches it case-insensitively are removed first; a
    ValueError is raised when no atom matches. Pre-existing hydrogens are
    stripped before Hydride adds its own. With `relax` set, hydrogen
    positions are additionally relaxed by Hydride.
    """
    import numpy as np
    import biotite.structure as struc
    import hydride

    # Optional ligand removal by residue name (3-letter CCD code).
    wants_removal = (
        ligand_res_name is not None and ligand_res_name.lower() != "none"
    )
    if wants_removal:
        target = ligand_res_name.upper()
        residue_names = np.char.upper(structure.res_name.astype(str))
        is_ligand = residue_names == target
        if not is_ligand.any():
            raise ValueError(
                f"No atoms with res_name '{target}' found in structure."
            )
        structure = structure[~is_ligand]

    # Drop existing hydrogens; Hydride supplies fresh ones below.
    heavy_only = structure[structure.element != "H"]

    # Covalent bonds come from the CCD residue templates.
    heavy_only.bonds = struc.connect_via_residue_names(heavy_only)

    # Formal charges for canonical amino acids at the requested pH.
    heavy_only.set_annotation(
        "charge", hydride.estimate_amino_acid_charges(heavy_only, ph=ph)
    )

    hydrogenated, _ = hydride.add_hydrogen(heavy_only)
    if relax:
        hydrogenated.coord = hydride.relax_hydrogen(hydrogenated)

    return reorder_hydrogens_after_heavy_atoms(hydrogenated)
|
|
737
|
+
|
|
738
|
+
|
|
739
|
+
def prepare_structure(input_path, ligand_res_name, output_path,
                      ph=7.0, relax=True, quiet=False):
    """
    File-to-file protein driver: load model 1 of the PDB file at
    `input_path`, hydrogenate it with `protonate_structure`, and save the
    result as a PDB file at `output_path`.

    Mirrors `protonate_ligands` on the ligand side. Unless `quiet` is set,
    a confirmation line naming the output path is printed.
    """
    import biotite.structure.io.pdb as pdb

    source_file = pdb.PDBFile.read(input_path)
    hydrogenated = protonate_structure(
        source_file.get_structure(model=1),
        ligand_res_name=ligand_res_name,
        ph=ph,
        relax=relax,
    )

    target_file = pdb.PDBFile()
    target_file.set_structure(hydrogenated)
    target_file.write(output_path)

    if quiet:
        return
    print(f"Wrote hydrogenated structure to {output_path}")
|
|
758
|
+
|
|
759
|
+
|
|
760
|
+
# ---------------------------------------------------------------------------
|
|
761
|
+
# Command-line interface
|
|
762
|
+
# ---------------------------------------------------------------------------
|
|
763
|
+
|
|
764
|
+
def parse_args(argv=None):
    """
    Parse the command line into an `argparse.Namespace`.

    `argv` is the list of argument strings to parse. The default, None,
    makes argparse read ``sys.argv[1:]``, so existing ``parse_args()`` calls
    behave as before; passing a list lets the parser be driven
    programmatically.

    Two sub-commands are defined, selected through the required ``mode``
    attribute:

    * ``ligand``: positional ``input`` and ``output`` paths, plus ``--ph``
      (float, default 7.4).
    * ``protein``: positional ``input``, ``ligand_res_name`` and
      ``output``, plus ``--ph`` (float, default 7.0) and the ``--no-relax``
      flag (stored as ``no_relax``).

    As usual with argparse, invalid arguments end the process through
    SystemExit after printing a usage message.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    sub = parser.add_subparsers(dest="mode", required=True,
                                metavar="{ligand,protein}")

    lig = sub.add_parser(
        "ligand",
        help="Protonate small molecules (SDF/SMILES).",
        description="Protonate small molecules with Dimorphite-DL.",
    )
    lig.add_argument(
        "input", help="Path to the input file (SDF, or .smi/.smiles for SMILES)"
    )
    lig.add_argument(
        "output", help="Path to the output file (SDF, or .smi/.smiles for SMILES)"
    )
    lig.add_argument(
        "--ph", type=float, default=7.4,
        help="Target pH for protonation (default: 7.4)",
    )

    prot = sub.add_parser(
        "protein",
        help="Protonate a protein structure (PDB).",
        description="Add hydrogens to a protein with Hydride.",
    )
    prot.add_argument("input", help="Path to the input PDB file")
    prot.add_argument(
        "ligand_res_name",
        help="Residue name of the ligand to remove (e.g. ATP, HEM, AP5). "
             "Pass 'none' to skip ligand removal.",
    )
    prot.add_argument("output", help="Path to the output PDB file")
    prot.add_argument(
        "--ph", type=float, default=7.0,
        help="pH used to estimate amino-acid formal charges (default: 7.0)",
    )
    prot.add_argument(
        "--no-relax", action="store_true",
        help="Skip dihedral relaxation of hydrogens.",
    )

    return parser.parse_args(argv)
|
|
810
|
+
|
|
811
|
+
|
|
812
|
+
def main():
    """
    Command-line entry point: parse the arguments and dispatch to the
    ligand or protein driver according to the selected sub-command.
    """
    options = parse_args()
    mode = options.mode

    if mode == "protein":
        prepare_structure(
            input_path=options.input,
            ligand_res_name=options.ligand_res_name,
            output_path=options.output,
            ph=options.ph,
            # The CLI flag is negative ("--no-relax"); the API is positive.
            relax=not options.no_relax,
        )
        return

    if mode == "ligand":
        protonate_ligands(options.input, options.output, options.ph)
|
|
824
|
+
|
|
825
|
+
|
|
826
|
+
# Run the command-line interface only when this file is executed as a
# script; importing the module must not trigger argument parsing.
if __name__ == "__main__":
    main()
|