prism-pruner 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- prism_pruner/__init__.py +1 -0
- prism_pruner/algebra.py +163 -0
- prism_pruner/conformer_ensemble.py +57 -0
- prism_pruner/graph_manipulations.py +195 -0
- prism_pruner/pruner.py +623 -0
- prism_pruner/rmsd.py +38 -0
- prism_pruner/torsion_module.py +472 -0
- prism_pruner/typing.py +15 -0
- prism_pruner/utils.py +153 -0
- prism_pruner-0.0.3.dist-info/METADATA +34 -0
- prism_pruner-0.0.3.dist-info/RECORD +14 -0
- prism_pruner-0.0.3.dist-info/WHEEL +5 -0
- prism_pruner-0.0.3.dist-info/licenses/LICENSE +21 -0
- prism_pruner-0.0.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,472 @@
|
|
|
1
|
+
"""PRISM - PRuning Interface for Similar Molecules."""
|
|
2
|
+
|
|
3
|
+
from copy import deepcopy
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import Callable, Iterable, Sequence
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
from networkx import (
|
|
9
|
+
Graph,
|
|
10
|
+
connected_components,
|
|
11
|
+
has_path,
|
|
12
|
+
is_isomorphic,
|
|
13
|
+
minimum_spanning_tree,
|
|
14
|
+
shortest_path,
|
|
15
|
+
subgraph,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
from prism_pruner.algebra import vec_angle
|
|
19
|
+
from prism_pruner.graph_manipulations import (
|
|
20
|
+
get_phenyl_ids,
|
|
21
|
+
get_sp_n,
|
|
22
|
+
is_amide_n,
|
|
23
|
+
is_ester_o,
|
|
24
|
+
)
|
|
25
|
+
from prism_pruner.rmsd import rmsd_and_max
|
|
26
|
+
from prism_pruner.typing import Array1D_bool, Array1D_str, Array2D_float, Array2D_int
|
|
27
|
+
from prism_pruner.utils import rotate_dihedral
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
|
|
31
|
+
class Torsion:
|
|
32
|
+
"""Torsion class."""
|
|
33
|
+
|
|
34
|
+
i1: int
|
|
35
|
+
i2: int
|
|
36
|
+
i3: int
|
|
37
|
+
i4: int
|
|
38
|
+
mode: str | None = None
|
|
39
|
+
|
|
40
|
+
@property
|
|
41
|
+
def torsion(self) -> tuple[int, int, int, int]:
|
|
42
|
+
"""Return tuple of indices defining the torsion."""
|
|
43
|
+
return (self.i1, self.i2, self.i3, self.i4)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def in_cycle(torsion: Torsion, graph: Graph) -> bool:
    """Return True if the torsion is part of a cycle.

    The central i2-i3 edge is temporarily removed from ``graph``; the torsion
    is considered cyclic when i1 and i4 are still connected by another path.

    The graph is restored before returning, even when the path search raises,
    and any attributes stored on the i2-i3 edge are put back.

    Raises whatever ``Graph.remove_edge`` raises if i2-i3 is not an edge.
    """
    # Copy the edge attributes first: a bare add_edge() would recreate the
    # edge with an empty attribute dict.
    edge_attrs = dict(graph.get_edge_data(torsion.i2, torsion.i3, default={}))

    graph.remove_edge(torsion.i2, torsion.i3)
    try:
        cyclical: bool = has_path(graph, torsion.i1, torsion.i4)
    finally:
        # Always restore the caller's graph, whatever has_path did.
        graph.add_edge(torsion.i2, torsion.i3)
        graph.edges[torsion.i2, torsion.i3].update(edge_attrs)

    return cyclical
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def is_rotable(
    torsion: Torsion,
    graph: Graph,
    hydrogen_bonds: list[list[int]],
    keepdummy: bool = False,
) -> bool:
    """Return True if the Torsion object is rotatable.

    hydrogen_bonds: list of sorted ``[a, b]`` index pairs; a torsion whose
    central (i2, i3) pair appears in it is never considered rotatable.

    keepdummy: when True, skip the ``is_nondummy`` checks and accept any
    torsion that passed the previous filters.
    """
    central_pair = sorted((torsion.i2, torsion.i3))
    if central_pair in hydrogen_bonds:
        return False

    # At least one of the two central atoms has to be "free".
    if not (_is_free(torsion.i2, graph) or _is_free(torsion.i3, graph)):
        return False

    if keepdummy:
        return True

    # Both ends of the central bond must give a non-dummy rotation.
    if is_nondummy(torsion.i2, torsion.i3, graph) and is_nondummy(
        torsion.i3, torsion.i2, graph
    ):
        return True

    return False
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def get_n_fold(torsion: Torsion, graph: Graph) -> int:
    """Return the n-fold of the rotation around the central i2-i3 bond.

    The decision uses the element symbols stored under the ``"atoms"`` node
    attribute, ``is_amide_n(..., mode=2)``, ``get_sp_n`` and ``torsion.mode``.
    Anything that matches no specific rule is treated as 4-fold.
    """
    central = (torsion.i2, torsion.i3)
    symbols = tuple(graph.nodes[index]["atoms"] for index in central)

    # H-N, H-O hydrogen bonds
    if "H" in symbols:
        return 6

    # tertiary amides rotations are 2-fold
    if any(is_amide_n(index, graph, mode=2) for index in central):
        return 2

    if any(symbol in symbols for symbol in ("C", "N", "S")):
        sp_n_i2 = get_sp_n(torsion.i2, graph)
        sp_n_i3 = get_sp_n(torsion.i3, graph)
        hybridizations = (sp_n_i2, sp_n_i3)

        if sp_n_i2 == 3 and sp_n_i3 == 3:
            return 3

        # Csp3-X, Nsp3-X, Ssulfone-X: the answer depends on the torsion mode;
        # any other mode falls through to the checks below.
        if 3 in hybridizations:
            if torsion.mode == "csearch":
                return 3
            if torsion.mode == "symmetry":
                return sp_n_i3 or 2

        if 2 in hybridizations:
            return 2

    # O-O, S-S, Ar-Ar, Ar-CO, and everything else
    return 4
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def get_angles(torsion: Torsion, graph: Graph) -> tuple[int, ...]:
    """Return the evenly spaced angles (degrees, starting at 0) for the torsion.

    The number of angles is the n-fold returned by ``get_n_fold``; only 2-, 3-,
    4- and 6-fold values are tabulated, any other value raises ``KeyError``.
    """
    n_fold = get_n_fold(torsion, graph)

    # n -> (0, 360/n, 2*360/n, ...) for the supported folds only
    angles_by_fold = {n: tuple(range(0, 360, 360 // n)) for n in (2, 3, 4, 6)}

    return angles_by_fold[n_fold]
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _is_free(index: int, graph: Graph) -> bool:
    """Return whether the atom at ``index`` allows free rotation.

    Return True only if the atom passes every one of these filters:
    - it is not an sp2 carbon bonded to at least one oxygen
    - ``is_amide_n(index, graph, mode=1)`` is false for it
    - ``is_ester_o(index, graph)`` is false for it
    """
    # All three sub-checks are evaluated up front, in this order.
    is_carbon = graph.nodes[index]["atoms"] == "C"
    is_sp2 = get_sp_n(index, graph) == 2
    has_oxygen_neighbor = "O" in (graph.nodes[n]["atoms"] for n in graph.neighbors(index))

    if is_carbon and is_sp2 and has_oxygen_neighbor:
        return False

    if is_amide_n(index, graph, mode=1) or is_ester_o(index, graph):
        return False

    return True
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def is_nondummy(i: int, root: int, graph: Graph) -> bool:
    """Return whether the torsion is not dummy.

    Checks that a molecular rotation along the dihedral
    angle (*, root, i, *) is non-dummy, that is the atom
    at index i, in the direction opposite to the one leading
    to root, has different substituents. i.e. methyl, CF3 and tBu
    rotations should return False.

    The input graph is not modified: all edge removals are performed on a
    deep copy. ``root`` must be a neighbor of ``i`` in the graph, otherwise
    ``list.remove`` raises ``ValueError``.
    """
    if graph.nodes[i]["atoms"] not in ("C", "N"):
        return True
    # for now, we only discard rotations around carbon
    # and nitrogen atoms, like methyl/tert-butyl/triphenyl
    # and flat symmetrical rings like phenyl, N-pyrrolyl...

    # work on a private copy, since edges are removed below
    G = deepcopy(graph)
    # substituents of i, excluding the atom the rotation axis leads to
    nb = list(G.neighbors(i))
    nb.remove(root)

    if len(nb) == 1:
        if len(list(G.neighbors(nb[0]))) == 2:
            return False
    # if node i has two bonds only (one with root and one with a)
    # and the other atom (a) has two bonds only (one with i)
    # the rotation is considered dummy: some other rotation
    # will account for its freedom (i.e. alkynes, hydrogen bonds)

    # check if it is a phenyl-like rotation
    if len(nb) == 2:
        # get the 6 indices of the aromatic atoms (i1-i6)
        # (get_phenyl_ids returns None when no such ring is found)
        phenyl_indices = get_phenyl_ids(i, G)

        # compare the two halves of the 6-membered ring (indices i2-i3 region with i5-i6 region)
        if phenyl_indices is not None:
            i1, i2, i3, i4, i5, i6 = phenyl_indices
            # cut the ring at i1 and i4 so that the i2-i3 and i5-i6 sides
            # end up in separate connected components
            G.remove_edge(i3, i4)
            G.remove_edge(i4, i5)
            G.remove_edge(i1, i2)
            G.remove_edge(i1, i6)

            subgraphs = [
                subgraph(G, _set) for _set in connected_components(G) if i2 in _set or i6 in _set
            ]

            if len(subgraphs) == 2:
                # isomorphic halves (matching element symbols) -> dummy rotation
                return not is_isomorphic(
                    subgraphs[0],
                    subgraphs[1],
                    node_match=lambda n1, n2: n1["atoms"] == n2["atoms"],
                )

            # We should not end up here, but if we do, rotation should not be dummy
            return True

    # if not, compare immediate neighbors of i
    for n in nb:
        G.remove_edge(i, n)

    # make a set of each fragment around the chopped n-i bonds,
    # but only for fragments that are not root nor contain other random,
    # disconnected parts of the graph
    subgraphs_nodes = [
        _set for _set in connected_components(G) if root not in _set and any(n in _set for n in nb)
    ]

    if len(subgraphs_nodes) == 1:
        return True
        # if not, the torsion is likely to be rotable
        # (tetramethylguanidyl alanine C(β)-N bond)

    # any substituent fragment that differs from the first one makes the
    # rotation non-dummy
    subgraphs = [subgraph(G, s) for s in subgraphs_nodes]
    for sub in subgraphs[1:]:
        if not is_isomorphic(
            subgraphs[0], sub, node_match=lambda n1, n2: n1["atoms"] == n2["atoms"]
        ):
            return True
    # Care should be taken because chiral centers are not taken into account: a rotation
    # involving an index where substituents only differ by stereochemistry, and where a
    # rotation is not an element of symmetry of the subsystem, the rotation is considered
    # dummy even if it would be more correct not to. For rotationally corrected RMSD this
    # should only cause small inefficiencies and not lead to discarding any good conformer.

    return False
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def get_hydrogen_bonds(
    coords: Array2D_float,
    atoms: Array1D_str,
    graph: Graph,
    d_min: float = 2.5,
    d_max: float = 3.3,
    max_angle: int = 45,
    elements: Sequence[Sequence[str]] | None = None,
    fragments: Sequence[Sequence[int]] | None = None,
) -> list[list[int]]:
    """Return a list of sorted index pairs of hydrogen bonding partners.

    An HB is a pair of atoms:
    - with one H and one X (heteroatom) atom
    - with an Y-X distance strictly between d_min and d_max (i.e. N-O, Angstroms)
    - with an Y-H-X angle below max_angle (i.e. N-H-O, degrees)

    Only hydrogens bonded (in ``graph``) to the first heteroatom of each pair
    are considered, and at most one pair is recorded per heteroatom pair.

    elements: iterable of two iterables with donor atomic symbols in the first
    element and acceptors in the second. default: (("N", "O"), ("N", "O", "F"))

    If fragments is specified (iterable of iterable of indices for each fragment)
    the function only returns inter-fragment hydrogen bonds.
    """
    hbs = []
    # initializing output list

    if elements is None:
        elements = (("N", "O"), ("N", "O", "F"))

    het_idx_from = np.array([i for i, a in enumerate(atoms) if a in elements[0]], dtype=int)
    het_idx_to = np.array([i for i, a in enumerate(atoms) if a in elements[1]], dtype=int)
    # indices where N or O (or user-specified elements) atoms are present.

    for i1 in het_idx_from:
        for i2 in het_idx_to:
            # if inter-fragment HBs are requested, skip intra-HBs
            if fragments is not None:
                if any(((i1 in f and i2 in f) for f in fragments)):
                    continue

            # keep close pairs
            # (i1 == i2 gives a distance of zero, which a positive d_min rejects)
            if d_min < np.linalg.norm(coords[i1] - coords[i2]) < d_max:
                # getting the indices of all H atoms attached to the first heteroatom
                Hs = [i for i in graph.neighbors(i1) if graph.nodes[i]["atoms"] == "H"]

                # versor connecting the two heteroatoms
                versor = coords[i2] - coords[i1]
                versor = versor / np.linalg.norm(versor)

                for iH in Hs:
                    # vectors connecting heteroatoms to H
                    v1 = coords[iH] - coords[i1]
                    v2 = coords[iH] - coords[i2]

                    # lengths of these vectors
                    d1 = np.linalg.norm(v1)
                    d2 = np.linalg.norm(v2)

                    # scalar projection in the heteroatom direction
                    l1 = v1 @ versor
                    l2 = v2 @ -versor

                    # largest planar angle between Het-H and Het-Het, in degrees (0 to 90°)
                    # NOTE(review): assumes vec_angle returns degrees — confirm in algebra.py
                    alfa = vec_angle(v1, versor) if l1 < l2 else vec_angle(v2, -versor)

                    # if the three atoms are not too far from being in line
                    if alfa < max_angle:
                        # adding the correct pair of atoms to results:
                        # H is paired with the heteroatom it is farther from
                        if d1 < d2:
                            hbs.append(sorted((iH, i2)))
                        else:
                            hbs.append(sorted((iH, i1)))

                        # one hydrogen bond per heteroatom pair is enough
                        break

    return hbs
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def _get_rotation_mask(graph: Graph, torsion: Iterable[int]) -> Array1D_bool:
    """Return the rotation mask to be applied to coordinates before rotation.

    Get mask for the atoms that will rotate in a torsion:
    all the ones in the graph reachable from the last index
    of the torsion but not going through the central two
    atoms in the torsion quadruplet.

    The i2-i3 edge is removed only temporarily: it is restored, together with
    any attributes it carried, before returning and also when the reachability
    search raises.

    The mask has one entry per node, in ``graph.nodes`` iteration order.
    """
    _, i2, i3, i4 = torsion

    # Copy the edge attributes first: a bare add_edge() would recreate the
    # edge with an empty attribute dict.
    edge_attrs = dict(graph.get_edge_data(i2, i3, default={}))

    graph.remove_edge(i2, i3)
    try:
        # get all indices reachable from i4 not going through i2-i3
        reachable_indices = set(shortest_path(graph, i4).keys())
    finally:
        # restore modified graph
        graph.add_edge(i2, i3)
        graph.edges[i2, i3].update(edge_attrs)

    # generate boolean mask
    mask = np.array([i in reachable_indices for i in graph.nodes], dtype=bool)

    # do not rotate i3: it would not move, since it lies on the rotation axis
    # NOTE(review): indexing the mask with i3 assumes node labels coincide
    # with positions in graph.nodes (0..n-1, in order) — confirm with callers.
    mask[i3] = False

    return mask
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def _get_quadruplets(graph: Graph) -> Array2D_int:
    """Return an array of quadruplets that indicate potential torsions.

    One candidate ``(k, i, j, m)`` is produced for each edge (i, j) of a
    minimum spanning tree of ``graph``, provided both i and j have at least
    one other neighbor in the full graph; k and m are the first such neighbors.
    """
    quadruplets = []

    for i, j in minimum_spanning_tree(graph).edges():
        bond = (i, j)

        # neighbors are looked up in the original graph, not in the tree
        around_i = [n for n in graph.neighbors(i) if n not in bond]
        around_j = [n for n in graph.neighbors(j) if n not in bond]

        # a terminal atom on either side cannot define a 4-point dihedral
        if not around_i or not around_j:
            continue

        quadruplets.append((around_i[0], i, j, around_j[0]))

    return np.array(quadruplets)
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def get_torsions(
    graph: Graph,
    hydrogen_bonds: list[list[int]],
    double_bonds: list[tuple[int, int]],
    keepdummy: bool = False,
    mode: str = "csearch",
) -> list[Torsion]:
    """Return list of Torsion objects for the rotatable bonds of the graph.

    Candidates come from ``_get_quadruplets``. A candidate is kept when its
    central bond is not listed in ``double_bonds`` (as a sorted tuple), it is
    not part of a cycle, and ``is_rotable`` accepts it. Every returned Torsion
    carries the given ``mode``.
    """
    accepted = []

    for quadruplet in _get_quadruplets(graph):
        _, center_a, center_b, _ = quadruplet
        central_bond = tuple(sorted((center_a, center_b)))

        if central_bond in double_bonds:
            continue

        candidate = Torsion(*quadruplet, mode=mode)

        if in_cycle(candidate, graph):
            continue

        if is_rotable(candidate, graph, hydrogen_bonds, keepdummy=keepdummy):
            accepted.append(candidate)

    return accepted
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def rotationally_corrected_rmsd_and_max(
    ref: Array2D_float,
    coord: Array2D_float,
    atoms: Array1D_str,
    torsions: Array2D_int,
    graph: Graph,
    angles: Sequence[Sequence[int]],
    heavy_atoms_only: bool = True,
    debugfunction: Callable[..., object] | None = None,
    return_type: str = "rmsd",
) -> tuple[float, float] | Array2D_float:
    """Return RMSD and max deviation, corrected for degenerate torsions.

    For each torsion, every angle in the matching ``angles`` entry is tried on
    the last torsion atom only; the angle giving the lowest RMSD over the four
    torsion atoms is then applied to the whole rotating fragment of ``coord``.

    With ``return_type="rmsd"`` return a tuple with the RMSD between ref and
    the corrected coord and the maximum deviation of their positions, computed
    on heavy atoms only when ``heavy_atoms_only`` is set. With
    ``return_type="coords"`` return ``coord`` instead.

    ``debugfunction``, when given, is called once per torsion with a
    formatted progress string.
    """
    # NOTE(review): assert is stripped under -O; return_type is then unchecked
    assert return_type in ("rmsd", "coords")

    # best angle found for each torsion (0 means "leave as is")
    torsion_corrections = [0 for _ in torsions]

    # atoms entering the global RMSD
    mask = (
        np.array([a != "H" for a in atoms]) if heavy_atoms_only else np.ones(len(atoms), dtype=bool)
    )

    # Now rotate every dummy torsion by the appropriate increment until we minimize local RMSD
    for i, torsion in enumerate(torsions):
        best_rmsd = 1e10

        # Look for the rotational angle set that minimizes the torsion RMSD and save it for later
        for angle in angles[i]:
            # trial rotation: only the fourth atom of the torsion is moved
            coord = rotate_dihedral(coord, torsion, angle, indices_to_be_moved=[torsion[3]])

            locally_corrected_rmsd, _ = rmsd_and_max(ref[torsion], coord[torsion])

            if locally_corrected_rmsd < best_rmsd:
                best_rmsd = locally_corrected_rmsd
                torsion_corrections[i] = angle

            # it is faster to undo the rotation rather than working with a copy of coords
            coord = rotate_dihedral(coord, torsion, -angle, indices_to_be_moved=[torsion[3]])

        # now rotate that angle to the desired orientation before going to the next angle
        if torsion_corrections[i] != 0:
            coord = rotate_dihedral(
                coord, torsion, torsion_corrections[i], mask=_get_rotation_mask(graph, torsion)
            )

        if debugfunction is not None:
            global_rmsd = rmsd_and_max(ref[mask], coord[mask])[0]
            debugfunction(
                f" Torsion {i + 1} - {torsion}: best θ = {torsion_corrections[i]}°, "
                + f"4-atom RMSD: {best_rmsd:.3f} Å, global RMSD: {global_rmsd:.3f} Å"
            )

    # we should have the optimal orientation on all torsions now:
    # calculate the RMSD
    rmsd, maxdev = rmsd_and_max(ref[mask], coord[mask])

    # since we could have segmented graphs, and therefore potentially only rotate
    # subsets of the graph where the torsion last two indices are,
    # we have to undo the final rotation too (would not be needed for connected graphs)
    for torsion, optimal_angle in zip(
        reversed(torsions), reversed(torsion_corrections), strict=False
    ):
        coord = rotate_dihedral(
            coord, torsion, -optimal_angle, mask=_get_rotation_mask(graph, torsion)
        )

    if return_type == "rmsd":
        return rmsd, maxdev

    # NOTE(review): at this point the optimal rotations have been undone, so
    # the returned coordinates are not the rotationally corrected ones —
    # confirm this is what "coords" callers expect.
    return coord
|
prism_pruner/typing.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""PRISM - PRuning Interface for Similar Molecules."""
|
|
2
|
+
|
|
3
|
+
from typing import Annotated, Any, Union
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from numpy.typing import NDArray
|
|
7
|
+
|
|
8
|
+
# Array type aliases used across the package. The string attached through
# Annotated only documents the intended shape of each array.

# stack of conformers
Array3D_float = Annotated[NDArray[np.float64], "shape: (nconfs, natoms, 3)"]
# coordinates of a single structure
Array2D_float = Annotated[NDArray[np.float64], "shape: (natoms, 3)"]
# generic 2D integer array
Array2D_int = Annotated[NDArray[np.int32], "shape: (a, b)"]
# one float per item (the shape string mentions energies)
Array1D_float = Annotated[NDArray[np.float64], "shape: (energy,)"]
# one integer per atom
Array1D_int = Annotated[NDArray[np.int32], "shape: (natoms,)"]
# one string per atom
Array1D_str = Annotated[NDArray[np.str_], "shape: (natoms,)"]
# generic boolean mask
Array1D_bool = Annotated[NDArray[np.bool_], "shape: (n,)"]
# either a tuple of floats or a floating-point array
FloatIterable = Union[tuple[float, ...], NDArray[np.floating[Any]]]
|
prism_pruner/utils.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""PRISM - PRuning Interface for Similar Molecules."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Sequence
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from numpy.linalg import LinAlgError
|
|
7
|
+
from numpy.typing import ArrayLike
|
|
8
|
+
|
|
9
|
+
from prism_pruner.algebra import get_alignment_matrix, rot_mat_from_pointer
|
|
10
|
+
from prism_pruner.typing import Array1D_bool, Array1D_int, Array1D_str, Array2D_float, Array3D_float
|
|
11
|
+
|
|
12
|
+
# Energy unit conversion factors.
# NOTE(review): names suggest Hartree (EH), electronvolt (EV) and kcal/mol
# (KCAL), each constant converting the first unit into the second — confirm.
EH_TO_EV = 27.211399
EH_TO_KCAL = 627.5096080305927
EV_TO_KCAL = 23.060541945329334
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def align_structures(
    structures: Array3D_float, indices: Array1D_int | None = None
) -> Array3D_float:
    """Align structures.

    Aligns molecules of a structure array (shape is (n_structures, n_atoms, 3))
    to the first one, based on the indices. If not provided, all atoms are used
    to get the best alignment. Return is the aligned array.

    Every structure is first translated so that the centroid of the selected
    atoms sits at the origin, then rotated with the matrix returned by
    ``get_alignment_matrix``. If that call raises ``LinAlgError`` the structure
    is left centered but unrotated.

    The input array is not modified; the result is always a new float array.
    """
    # Work on a float copy: the in-place centering below must not leak into
    # the caller's array, and would fail outright on integer input.
    centered = np.array(structures, dtype=float)

    if indices is None:
        indices = np.arange(centered.shape[1])
    else:
        indices = np.asarray(indices)

    # Move the centroid of the selected atoms of every structure to the origin.
    for structure in centered:
        structure -= np.mean(structure[indices], axis=0)

    reference = centered[0]

    output = np.zeros(centered.shape)
    output[0] = reference

    for t, target in enumerate(centered[1:], start=1):
        try:
            matrix = get_alignment_matrix(reference[indices], target[indices])

        except LinAlgError:
            # it is actually possible for the kabsch alg not to converge
            matrix = np.eye(3)

        # apply the rotation to every atom, not only to the selected ones
        output[t] = (matrix @ target.T).T

    return output
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def time_to_string(total_time: float, verbose: bool = False, digits: int = 1) -> str:
    """Convert total_time (seconds) to a string with days, hours, minutes and seconds.

    Units whose count would be zero at the moment they are evaluated are
    omitted; seconds are always printed, rounded to ``digits`` decimals and
    right-aligned in a field of ``2 + digits`` characters.

    verbose: use full unit names ("days", "hours", ...) instead of "d", "h", ...
    """
    names = ("days", "hours", "minutes", "seconds") if verbose else ("d", "h", "m", "s")
    unit_lengths = (24 * 3600, 3600, 60)

    parts = []
    for unit_length, name in zip(unit_lengths, names):
        # ">=" rather than ">": an exact multiple (e.g. 3600 s) must roll over
        # into the larger unit instead of being reported as "60 m" or "60.0 s".
        if total_time >= unit_length:
            count, total_time = divmod(total_time, unit_length)
            parts.append(f"{int(count)} {name} ")

    parts.append(f"{round(total_time, digits):{2 + digits}} {names[3]}")

    return "".join(parts)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Largest interatomic distance at which a pair of elements is flagged as a
# double bond; keys are the two element symbols sorted and joined.
# NOTE(review): values are presumably in Angstrom — confirm.
double_bonds_thresholds_dict = {
    "CC": 1.4,
    "CN": 1.3,
}


def get_double_bonds_indices(coords: Array2D_float, atoms: Array1D_str) -> list[tuple[int, int]]:
    """Return a list of 2-element index tuples for atom pairs flagged as double bonds.

    Hydrogens are ignored. A pair is reported, with the indices it has in the
    full ``coords`` array, when its element combination has an entry in
    ``double_bonds_thresholds_dict`` and its distance is below that entry.
    """
    heavy = atoms != "H"
    original_ids = np.arange(len(coords))[heavy]
    heavy_coords = coords[heavy]
    heavy_atoms = atoms[heavy]

    pairs = []
    n_heavy = len(heavy_coords)

    for a in range(n_heavy):
        for b in range(a + 1, n_heavy):
            pair_tag = "".join(sorted([heavy_atoms[a], heavy_atoms[b]]))
            max_length = double_bonds_thresholds_dict.get(pair_tag)

            # element combination without a tabulated threshold
            if max_length is None:
                continue

            if np.linalg.norm(heavy_coords[a] - heavy_coords[b]) < max_length:
                pairs.append((original_ids[a], original_ids[b]))

    return pairs
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def rotate_dihedral(
    coords: Array2D_float,
    dihedral: list[int] | tuple[int, ...],
    angle: float,
    mask: Array1D_bool | None = None,
    indices_to_be_moved: ArrayLike | None = None,
) -> Array2D_float:
    """Rotate part of a molecule around the bond between dihedral atoms 2 and 3.

    The atoms that move are selected, in order of precedence, by
    ``indices_to_be_moved``, then by ``mask``; when both are None only the
    first index of the dihedral is moved.

    ``coords`` is modified in place and also returned.

    angle: angle, in degrees
    """
    i1, i2, i3, *_ = dihedral
    n_atoms = len(coords)

    if indices_to_be_moved is not None:
        # explicit indices win over any mask that was passed
        moving = np.isin(np.arange(n_atoms), indices_to_be_moved)
    elif mask is not None:
        moving = mask
    else:
        moving = np.zeros(n_atoms, dtype=bool)
        moving[i1] = True

    # rotation about the i3 -> i2 direction, with i3 as the fixed point
    pivot = coords[i3]
    rotation = rot_mat_from_pointer(coords[i2] - pivot, angle)

    coords[moving] = (coords[moving] - pivot) @ rotation.T + pivot

    return coords
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def flatten(array: Sequence[Any], typefunc: type = float) -> list[Any]:
    """Return the unraveled sequence, with items coerced into the typefunc type.

    Lists, tuples and numpy arrays (including their subclasses) are descended
    into, at any nesting depth; every other item is passed through
    ``typefunc`` and appended to the flat output in traversal order.
    """
    out: list[Any] = []

    def rec(_l: Any) -> None:
        """Recursive unraveling function."""
        for e in _l:
            # 0-d arrays cannot be iterated: treat them as scalars
            if isinstance(e, np.ndarray) and e.ndim == 0:
                out.append(typefunc(e))
            # isinstance (not an exact type match) so that subclasses such
            # as namedtuples are unraveled too
            elif isinstance(e, (list, tuple, np.ndarray)):
                rec(e)
            else:
                out.append(typefunc(e))

    rec(array)
    return out
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: prism_pruner
|
|
3
|
+
Version: 0.0.3
|
|
4
|
+
Summary: Prism Pruner
|
|
5
|
+
Author-email: Nicolò Tampellini <nicolo.tampellini@yale.edu>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Requires-Python: >=3.12
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: networkx>=3.0
|
|
11
|
+
Requires-Dist: numpy>=2.0
|
|
12
|
+
Requires-Dist: periodictable
|
|
13
|
+
Requires-Dist: scipy>=1.10
|
|
14
|
+
Requires-Dist: tqdm>=4
|
|
15
|
+
Dynamic: license-file
|
|
16
|
+
|
|
17
|
+
# Prism Pruner
|
|
18
|
+
|
|
19
|
+
[](https://github.com/ntampellini/prism_pruner/blob/master/LICENSE)
|
|
20
|
+
[](https://pixi.sh)
|
|
21
|
+
[](https://github.com/astral-sh/ruff)
|
|
22
|
+
[](https://github.com/ntampellini/prism_pruner/actions/)
|
|
23
|
+
[](https://codecov.io/gh/ntampellini/prism_pruner)
|
|
24
|
+
[](https://pypi.org/project/prism-pruner/)
|
|
25
|
+
|
|
26
|
+
PRISM (PRuning Interface for Similar Molecules) is the modular similarity pruning code from [FIRECODE](https://github.com/ntampellini/FIRECODE/tree/main), in a standalone package. It filters out duplicate structures from conformational ensembles, leaving behind non-redundant states.
|
|
27
|
+
|
|
28
|
+
The code implements a cached, iterative, divide-and-conquer approach on increasingly large subsets of the ensemble and removes duplicates as assessed by one of three metrics:
|
|
29
|
+
- Heavy-atom RMSD and maximum deviation
|
|
30
|
+
- Rotamer-corrected heavy-atom RMSD and maximum deviation
|
|
31
|
+
- Relative deviation of the moments of inertia on the principal axes
|
|
32
|
+
|
|
33
|
+
## Credits
|
|
34
|
+
This package was created with [Cookiecutter](https://github.com/audreyr/cookiecutter) and the [jevandezande/pixi-cookiecutter](https://github.com/jevandezande/pixi-cookiecutter) project template.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
prism_pruner/__init__.py,sha256=M3KYy269Z7PmWOgRhMcBZySZSmC3pjG7lRIG17eN-FQ,55
|
|
2
|
+
prism_pruner/algebra.py,sha256=4oKViTtGiTzpZs3sQQaiHX3fpQeAmlpdHFfTo2FPrGU,4951
|
|
3
|
+
prism_pruner/conformer_ensemble.py,sha256=9VYpRb0k-IhicAl1pYsOjX26WkrCqiqhH588_3ALB-k,1837
|
|
4
|
+
prism_pruner/graph_manipulations.py,sha256=PNAa2zEcPezFg-83dvtSU-UYCOYVcvR3AYBpdEzWMNY,6321
|
|
5
|
+
prism_pruner/pruner.py,sha256=JIoT2L8w3hqlAPsfNDGdUn2nNBG_AKEE-jH7u7L1T0Y,20493
|
|
6
|
+
prism_pruner/rmsd.py,sha256=09CHQy2-z3mWA6cQhoNRSuA_E5JZ7NEtCj1Al_Wjl6M,877
|
|
7
|
+
prism_pruner/torsion_module.py,sha256=LoFnvmb3OBMzfKxaHK54YtlpgSO6QMYiDbSb60jXwlc,16023
|
|
8
|
+
prism_pruner/typing.py,sha256=fBHZgLf38MlvIoOHaMZOP4thI-9OvhHK3AnjuqFPbfU,676
|
|
9
|
+
prism_pruner/utils.py,sha256=OdV9qX6XiocKzPMLL9UmLKj8poKnipJmaf8KdsGlNTs,4594
|
|
10
|
+
prism_pruner-0.0.3.dist-info/licenses/LICENSE,sha256=Im9pMXp0ignxYTY5QMacrME_3l6QVtQXO6QvO3bVriY,1075
|
|
11
|
+
prism_pruner-0.0.3.dist-info/METADATA,sha256=M4FgFBJfl_GbK3nSGpfrVuJpjsDMsn0byj4ChQ0BCsg,2045
|
|
12
|
+
prism_pruner-0.0.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
13
|
+
prism_pruner-0.0.3.dist-info/top_level.txt,sha256=GdtwtPlTsKhTsjMoj4bo6wJVoyzFX371HKQU32l6Q84,13
|
|
14
|
+
prism_pruner-0.0.3.dist-info/RECORD,,
|