prism-pruner 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of prism-pruner might be problematic. Click here for more details.
- prism_pruner/__init__.py +1 -0
- prism_pruner/algebra.py +190 -0
- prism_pruner/graph_manipulations.py +194 -0
- prism_pruner/pruner.py +571 -0
- prism_pruner/pt.py +12 -0
- prism_pruner/rmsd.py +39 -0
- prism_pruner/torsion_module.py +468 -0
- prism_pruner/typing.py +15 -0
- prism_pruner/utils.py +246 -0
- prism_pruner-0.0.1.dist-info/METADATA +28 -0
- prism_pruner-0.0.1.dist-info/RECORD +14 -0
- prism_pruner-0.0.1.dist-info/WHEEL +5 -0
- prism_pruner-0.0.1.dist-info/licenses/LICENSE +21 -0
- prism_pruner-0.0.1.dist-info/top_level.txt +1 -0
prism_pruner/utils.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
"""PRISM - PRuning Interface for Similar Molecules."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, Sequence, TextIO
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from numpy.linalg import LinAlgError
|
|
8
|
+
from numpy.typing import ArrayLike
|
|
9
|
+
|
|
10
|
+
from prism_pruner.algebra import get_alignment_matrix, norm_of, rot_mat_from_pointer
|
|
11
|
+
from prism_pruner.pt import pt
|
|
12
|
+
from prism_pruner.typing import Array1D_bool, Array1D_int, Array2D_float, Array3D_float
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def align_structures(
|
|
16
|
+
structures: Array3D_float, indices: Array1D_int | None = None
|
|
17
|
+
) -> Array3D_float:
|
|
18
|
+
"""Align structures.
|
|
19
|
+
|
|
20
|
+
Aligns molecules of a structure array (shape is (n_structures, n_atoms, 3))
|
|
21
|
+
to the first one, based on the indices. If not provided, all atoms are used
|
|
22
|
+
to get the best alignment. Return is the aligned array.
|
|
23
|
+
"""
|
|
24
|
+
reference = structures[0]
|
|
25
|
+
targets = structures[1:]
|
|
26
|
+
if isinstance(indices, (list, tuple)):
|
|
27
|
+
indices = np.array(indices)
|
|
28
|
+
|
|
29
|
+
indices = indices if indices is not None else np.array([i for i, _ in enumerate(structures[0])])
|
|
30
|
+
|
|
31
|
+
reference -= np.mean(reference[indices], axis=0)
|
|
32
|
+
for t, _ in enumerate(targets):
|
|
33
|
+
targets[t] -= np.mean(targets[t, indices], axis=0)
|
|
34
|
+
|
|
35
|
+
output = np.zeros(structures.shape)
|
|
36
|
+
output[0] = reference
|
|
37
|
+
|
|
38
|
+
for t, target in enumerate(targets):
|
|
39
|
+
try:
|
|
40
|
+
matrix = get_alignment_matrix(reference[indices], target[indices])
|
|
41
|
+
|
|
42
|
+
except LinAlgError:
|
|
43
|
+
# it is actually possible for the kabsch alg not to converge
|
|
44
|
+
matrix = np.eye(3)
|
|
45
|
+
|
|
46
|
+
# output[t+1] = np.array([matrix @ vector for vector in target])
|
|
47
|
+
output[t + 1] = (matrix @ target.T).T
|
|
48
|
+
|
|
49
|
+
return output
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def write_xyz(
    coords: Array2D_float, atomnos: Array1D_int, output: TextIO, title: str = "temp"
) -> None:
    """Write one structure in .xyz format to an open text stream.

    The text is made of the atom count, the title line and one line per atom
    (element symbol followed by the three coordinates with six decimals). It
    is handed to ``output.write`` in a single call.

    Args:
        coords: Atomic coordinates, shape (n_atoms, 3).
        atomnos: Atomic numbers, one per row of ``coords``.
        output: Writable text stream receiving the data.
        title: Content of the comment (second) line.
    """
    assert atomnos.shape[0] == coords.shape[0]
    assert coords.shape[1] == 3

    header = f"{len(coords)}\n{title}\n"
    atom_lines = (
        "%s % .6f % .6f % .6f\n" % (pt[atomnos[row]].symbol, xyz[0], xyz[1], xyz[2])
        for row, xyz in enumerate(coords)
    )
    output.write(header + "".join(atom_lines))
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class XYZParser:
    """cclib-like parser for .xyz multimolecular files.

    After construction, ``atomcoords`` holds the stacked coordinates of every
    structure that was read and ``atomnos`` the atomic numbers of the first one.
    """

    def __init__(self, filename: str, pt: Any):
        """Initialize XYZParser and parse the file.

        Args:
            filename: Path to the .xyz file.
            pt: Object exposing one attribute per element symbol, each of which
                has a ``number`` attribute (e.g. a periodictable table instance).

        Raises:
            FileNotFoundError: If the specified file does not exist.
            IndexError: If no structure could be read from the file.
        """
        self.filename = filename
        self.pt = pt
        # One coordinate array and one atomic-number array per parsed structure
        self.atomcoords_list: list[Array2D_float] = []
        self.atomnos_list: list[Array1D_int] = []

        self._parse_file()

        if not self.atomnos_list:
            # Same exception type as the former implicit ``atomnos_list[0]``
            # failure, but with a message that points at the actual problem.
            raise IndexError(f"No structures could be read from '{self.filename}'")

        self.atomcoords: Array3D_float = np.asarray(self.atomcoords_list)

        self.atomnos: Array1D_int = np.asarray(self.atomnos_list[0])

    def _parse_file(self) -> None:
        """Parse the .xyz file and populate atomcoords_list and atomnos_list.

        The file is scanned for lines holding a single non-negative integer
        (the atom count). The following line is skipped as a comment and the
        next ``natoms`` lines are read as ``symbol x y z`` records. Atom lines
        with fewer than four fields are ignored, and structures without any
        valid atom line are not stored.

        Raises:
            FileNotFoundError: If the specified file does not exist.
        """
        filepath = Path(self.filename)

        if not filepath.exists():
            raise FileNotFoundError(f"File '{self.filename}' not found")

        with open(filepath, "r") as f:
            lines = f.readlines()

        i = 0
        while i < len(lines):
            # Blank lines and any other non-integer line cannot start a
            # structure: int() raises ValueError for both, so skip them.
            try:
                natoms = int(lines[i].strip())
            except ValueError:
                i += 1
                continue

            # A negative count is not a valid header either. It must be skipped
            # explicitly: advancing the cursor by a negative ``natoms`` below
            # would move it backwards and could loop forever.
            if natoms < 0:
                i += 1
                continue

            # Move past the atom-count line and the comment line
            i += 2

            coords = []
            atomnos = []

            # Slicing stops at the end of the file for truncated structures
            for line in lines[i : i + natoms]:
                parts = line.split()
                if len(parts) >= 4:
                    symbol = parts[0]
                    x, y, z = map(float, parts[1:4])

                    # Get atomic number from periodictable
                    atomic_no = getattr(self.pt, symbol).number

                    coords.append([x, y, z])
                    atomnos.append(atomic_no)

            if coords:
                self.atomcoords_list.append(np.array(coords))
                self.atomnos_list.append(np.array(atomnos))

            i += natoms
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def read_xyz(filename: str) -> XYZParser:
    """Parse a .xyz file and return it as a cclib-like XYZParser object.

    The module-level ``pt`` table is handed to the parser for the lookup of
    atomic numbers.
    """
    return XYZParser(filename, pt)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def time_to_string(total_time: float, verbose: bool = False, digits: int = 1) -> str:
    """Convert a duration in seconds to a string with days, hours, minutes and seconds.

    Units larger than seconds are only printed when the duration reaches them.

    Args:
        total_time: Duration, in seconds.
        verbose: If True use full unit names ("days", "hours", ...) instead of
            the one-letter abbreviations.
        digits: Number of decimals kept for the seconds field.

    Returns:
        The formatted duration, e.g. "1 h 2 m 5.5 s".
    """
    names = ("days", "hours", "minutes", "seconds") if verbose else ("d", "h", "m", "s")

    # Round before splitting into units, so that a remainder such as 59.96 s
    # carries over into the next unit instead of being printed as "60.0 s".
    remaining = round(total_time, digits)

    pieces = []
    for unit_seconds, name in zip((24 * 3600, 3600, 60), names):
        # ">=" so that exact multiples (e.g. 3600 s) are reported in the larger unit
        if remaining >= unit_seconds:
            count, remaining = divmod(remaining, unit_seconds)
            pieces.append(f"{int(count)} {name} ")

    # Rounding again removes the floating point noise left by divmod
    pieces.append(f"{round(remaining, digits):{2 + digits}} {names[3]}")

    return "".join(pieces)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
# Distance cutoffs used by get_double_bonds_indices. Keys are the two element
# symbols of an atom pair, sorted alphabetically and joined; a pair is reported
# as a double bond when its distance is below the value. Pairs whose key is
# missing are never reported.
# NOTE(review): values are presumably in Angstrom - confirm against the units
# of the coordinates passed by the callers.
double_bonds_thresholds_dict = {
    "CC": 1.4,
    "CN": 1.3,
}
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def get_double_bonds_indices(coords: Array2D_float, atomnos: Array1D_int) -> list[tuple[int, int]]:
    """Find the pairs of atoms that sit at double-bond distance.

    Hydrogen atoms (atomic number 1) are discarded first. Every remaining pair
    is then compared with the cutoff stored in ``double_bonds_thresholds_dict``
    for its pair of elements; pairs of elements without a cutoff are skipped.

    Returns:
        A list of 2-tuples with the indices of the matching atoms, numbered as
        in the original (hydrogen-containing) arrays.
    """
    heavy = atomnos != 1
    original_ids = np.arange(len(coords))[heavy]
    heavy_coords = coords[heavy]
    heavy_atomnos = atomnos[heavy]

    pairs = []
    n_heavy = len(heavy_coords)

    for first in range(n_heavy):
        for second in range(first + 1, n_heavy):
            distance = norm_of(heavy_coords[first] - heavy_coords[second])

            # Sorting makes the lookup key independent of the atom order
            symbols = sorted([pt[heavy_atomnos[first]].symbol, pt[heavy_atomnos[second]].symbol])
            cutoff = double_bonds_thresholds_dict.get("".join(symbols))

            if cutoff is not None and distance < cutoff:
                pairs.append((original_ids[first], original_ids[second]))

    return pairs
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def rotate_dihedral(
    coords: Array2D_float,
    dihedral: list[int] | tuple[int, ...],
    angle: float,
    mask: Array1D_bool | None = None,
    indices_to_be_moved: ArrayLike | None = None,
) -> Array2D_float:
    """Rotate part of a molecule around the central bond of a dihedral.

    The rotation axis is the vector going from the third to the second atom of
    the dihedral, and the rotation is centred on the third atom. The rotation
    matrix is built by ``rot_mat_from_pointer(axis, angle)``.

    Atoms that will move are the ones specified by mask or
    indices_to_be_moved (the latter takes precedence when both are given).
    If both are None, only the first index of the dihedral iterable is moved.

    Note that ``coords`` is modified in place; the same array is returned.

    Args:
        coords: Atomic coordinates, shape (n_atoms, 3).
        dihedral: The four atom indices defining the dihedral.
        angle: Rotation angle, in degrees.
        mask: Boolean mask selecting the atoms to be moved.
        indices_to_be_moved: Indices of the atoms to be moved.

    Returns:
        The ``coords`` array, with the selected atoms rotated.
    """
    i1, i2, i3, _ = dihedral

    if indices_to_be_moved is not None:
        mask = np.isin(np.arange(len(coords)), indices_to_be_moved)

    if mask is None:
        # One-dimensional index, so that coords[mask] has shape (1, 3). A
        # nested [[i1]] index yields shape (1, 1, 3), which cannot be
        # multiplied by the 3x3 rotation matrix below.
        mask = np.array([i1])

    axis = coords[i2] - coords[i3]
    mat = rot_mat_from_pointer(axis, angle)
    center = coords[i3]

    # Move to the rotation centre, rotate, then move back
    coords[mask] = (mat @ (coords[mask] - center).T).T + center

    return coords
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def flatten(array: Sequence[Any], typefunc: type = float) -> list[Any]:
    """Unravel a nested sequence into a flat list of ``typefunc`` items.

    Only elements whose type is exactly list, tuple or np.ndarray are descended
    into; everything else is converted with ``typefunc`` and kept in order.
    """
    nested_types = (list, tuple, np.ndarray)

    def walk(items: Any) -> Any:
        """Yield the converted leaves of ``items``, depth first."""
        for item in items:
            if type(item) in nested_types:
                yield from walk(item)
            else:
                yield typefunc(item)

    return list(walk(array))
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: prism_pruner
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Prism Pruner
|
|
5
|
+
Author-email: Nicolò Tampellini <nicolo.tampellini@yale.edu>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Requires-Python: >=3.12
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Dynamic: license-file
|
|
11
|
+
|
|
12
|
+
# Prism Pruner
|
|
13
|
+
|
|
14
|
+
[License](https://github.com/ntampellini/prism_pruner/blob/master/LICENSE)
|
|
15
|
+
[Pixi](https://pixi.sh)
|
|
16
|
+
[Ruff](https://github.com/astral-sh/ruff)
|
|
17
|
+
[GitHub Actions](https://github.com/ntampellini/prism_pruner/actions/)
|
|
18
|
+
[Codecov](https://codecov.io/gh/ntampellini/prism_pruner)
|
|
19
|
+
|
|
20
|
+
PRISM (PRuning Interface for Similar Molecules) is the modular similarity pruning code from [FIRECODE](https://github.com/ntampellini/FIRECODE/tree/main), in a standalone package. It filters out duplicate structures from conformational ensembles, leaving behind non-redundant states.
|
|
21
|
+
|
|
22
|
+
The code implements a cached, iterative, divide-and-conquer approach on increasingly large subsets of the ensemble and removes duplicates as assessed by one of three metrics:
|
|
23
|
+
- Heavy-atom RMSD and maximum deviation
|
|
24
|
+
- Rotamer-corrected heavy-atom RMSD and maximum deviation
|
|
25
|
+
- Relative deviation of the moments of inertia on the principal axes
|
|
26
|
+
|
|
27
|
+
## Credits
|
|
28
|
+
This package was created with [Cookiecutter](https://github.com/audreyr/cookiecutter) and the [jevandezande/pixi-cookiecutter](https://github.com/jevandezande/pixi-cookiecutter) project template.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
prism_pruner/__init__.py,sha256=M3KYy269Z7PmWOgRhMcBZySZSmC3pjG7lRIG17eN-FQ,55
|
|
2
|
+
prism_pruner/algebra.py,sha256=cGGMFur0F4NM9OxBucfzpBvXBa4dScSk8VDaU6x_254,6010
|
|
3
|
+
prism_pruner/graph_manipulations.py,sha256=-8uEOCWezayx2k4eVQVU61Y5-Nv5hBVjXdKSsoYEwoI,6342
|
|
4
|
+
prism_pruner/pruner.py,sha256=JA5Vn_PrmNdKZPOC8YE5Wteq17FApa2UF0C1v-EpU_c,18744
|
|
5
|
+
prism_pruner/pt.py,sha256=XoF7uksmZgJlqZb8t_efEaLGS2RqA67rENR4Hx_YBHg,302
|
|
6
|
+
prism_pruner/rmsd.py,sha256=v11h_xTWF7Ea-MDnQXHGa9XwkvEEMJvDZgCC8bOmdKA,931
|
|
7
|
+
prism_pruner/torsion_module.py,sha256=YNjhVqS0wdlqdYXHYonU8munCEfRijcTz-s5Dn_SKwc,15905
|
|
8
|
+
prism_pruner/typing.py,sha256=WyBF38NsM34bT-uT8EvzEXfDCf2jdATRSX8WtQeYCes,679
|
|
9
|
+
prism_pruner/utils.py,sha256=6T5rNpawscDw4xfkH5Ua11IZUKJXjoJ3ANZV-vyO1vI,7395
|
|
10
|
+
prism_pruner-0.0.1.dist-info/licenses/LICENSE,sha256=Im9pMXp0ignxYTY5QMacrME_3l6QVtQXO6QvO3bVriY,1075
|
|
11
|
+
prism_pruner-0.0.1.dist-info/METADATA,sha256=QgcgRkYkY7WBqMnSQDChM5B4nR-x7QtH2oOVZiV7mGg,1807
|
|
12
|
+
prism_pruner-0.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
13
|
+
prism_pruner-0.0.1.dist-info/top_level.txt,sha256=GdtwtPlTsKhTsjMoj4bo6wJVoyzFX371HKQU32l6Q84,13
|
|
14
|
+
prism_pruner-0.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Nicolò Tampellini
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
prism_pruner
|