prism-pruner 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Nicolò Tampellini
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,28 @@
1
+ Metadata-Version: 2.4
2
+ Name: prism_pruner
3
+ Version: 0.0.1
4
+ Summary: Prism Pruner
5
+ Author-email: Nicolò Tampellini <nicolo.tampellini@yale.edu>
6
+ License-Expression: MIT
7
+ Requires-Python: >=3.12
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Dynamic: license-file
11
+
12
+ # Prism Pruner
13
+
14
+ [![License](https://img.shields.io/github/license/ntampellini/prism_pruner)](https://github.com/ntampellini/prism_pruner/blob/master/LICENSE)
15
+ [![Powered by: Pixi](https://img.shields.io/badge/Powered_by-Pixi-facc15)](https://pixi.sh)
16
+ [![Code style: ruff](https://img.shields.io/badge/code%20style-ruff-000000.svg)](https://github.com/astral-sh/ruff)
17
+ [![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/ntampellini/prism_pruner/test.yml?branch=master&logo=github-actions)](https://github.com/ntampellini/prism_pruner/actions/)
18
+ [![Codecov](https://img.shields.io/codecov/c/github/ntampellini/prism_pruner)](https://codecov.io/gh/ntampellini/prism_pruner)
19
+
20
+ PRISM (PRuning Interface for Similar Molecules) is the modular similarity pruning code from [FIRECODE](https://github.com/ntampellini/FIRECODE/tree/main), in a standalone package. It filters out duplicate structures from conformational ensembles, leaving behind non-redundant states.
21
+
22
+ The code implements a cached, iterative, divide-and-conquer approach on increasingly large subsets of the ensemble and removes duplicates as assessed by one of three metrics:
23
+ - Heavy-atom RMSD and maximum deviation
24
+ - Rotamer-corrected heavy-atom RMSD and maximum deviation
25
+ - Relative deviation of the moments of inertia on the principal axes
26
+
27
+ ## Credits
28
+ This package was created with [Cookiecutter](https://github.com/audreyr/cookiecutter) and the [jevandezande/pixi-cookiecutter](https://github.com/jevandezande/pixi-cookiecutter) project template.
@@ -0,0 +1,17 @@
1
+ # Prism Pruner
2
+
3
+ [![License](https://img.shields.io/github/license/ntampellini/prism_pruner)](https://github.com/ntampellini/prism_pruner/blob/master/LICENSE)
4
+ [![Powered by: Pixi](https://img.shields.io/badge/Powered_by-Pixi-facc15)](https://pixi.sh)
5
+ [![Code style: ruff](https://img.shields.io/badge/code%20style-ruff-000000.svg)](https://github.com/astral-sh/ruff)
6
+ [![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/ntampellini/prism_pruner/test.yml?branch=master&logo=github-actions)](https://github.com/ntampellini/prism_pruner/actions/)
7
+ [![Codecov](https://img.shields.io/codecov/c/github/ntampellini/prism_pruner)](https://codecov.io/gh/ntampellini/prism_pruner)
8
+
9
+ PRISM (PRuning Interface for Similar Molecules) is the modular similarity pruning code from [FIRECODE](https://github.com/ntampellini/FIRECODE/tree/main), in a standalone package. It filters out duplicate structures from conformational ensembles, leaving behind non-redundant states.
10
+
11
+ The code implements a cached, iterative, divide-and-conquer approach on increasingly large subsets of the ensemble and removes duplicates as assessed by one of three metrics:
12
+ - Heavy-atom RMSD and maximum deviation
13
+ - Rotamer-corrected heavy-atom RMSD and maximum deviation
14
+ - Relative deviation of the moments of inertia on the principal axes
15
+
16
+ ## Credits
17
+ This package was created with [Cookiecutter](https://github.com/audreyr/cookiecutter) and the [jevandezande/pixi-cookiecutter](https://github.com/jevandezande/pixi-cookiecutter) project template.
@@ -0,0 +1 @@
1
+ """PRISM - PRuning Interface for Similar Molecules."""
@@ -0,0 +1,190 @@
1
+ """Algebra utilities."""
2
+
3
+ from typing import Sequence
4
+
5
+ import numpy as np
6
+
7
+ from prism_pruner.typing import Array1D_float, Array1D_int, Array2D_float, Array3D_float
8
+
9
+
10
def norm(vec: Array1D_int) -> Array1D_int:
    """
    Scale a 3D vector to unit length.

    Only the first three components enter the length computation.

    :param vec: vector to normalize
    :return: ``vec`` divided by its Euclidean length
    """
    x, y, z = vec[0], vec[1], vec[2]
    length = np.sqrt(x * x + y * y + z * z)
    return vec / length  # type: ignore[no-any-return]
13
+
14
+
15
def norm_of(vec: Array1D_int) -> float:
    """
    Return the Euclidean length of a 3D vector.

    Only the first three components are read.

    :param vec: vector to measure
    :return: length of ``vec`` as a plain Python float
    """
    x, y, z = vec[0], vec[1], vec[2]
    squared_length = x * x + y * y + z * z
    return float(np.sqrt(squared_length))
18
+
19
+
20
def vec_angle(v1: Array1D_int, v2: Array1D_int) -> float:
    """
    Return the angle, in degrees, between two 3D vectors.

    :param v1: first vector
    :param v2: second vector
    :return: angle between the two vectors in degrees
    """
    cosine = np.dot(norm(v1), norm(v2))
    # rounding can push the cosine marginally outside [-1, 1], where arccos is undefined
    angle_rad = np.arccos(np.clip(cosine, -1.0, 1.0))
    return float(np.degrees(angle_rad))
29
+
30
+
31
def dihedral(p: Array2D_float) -> float:
    """
    Compute the dihedral angle, in degrees, defined by four 3D points.

    Uses the "Praxeolitic" formulation, which needs a single square root and a
    single cross product.

    :param p: the four points, in order
    :return: signed dihedral angle in degrees, as returned by ``arctan2``
    """
    first, second, third, fourth = p

    back = -1.0 * (second - first)
    axis = third - second
    forward = fourth - third

    # unit-length axis, so the rejections below are not scaled by its magnitude
    axis /= norm_of(axis)

    # reject the outer bond vectors from the axis: what is left of each one
    # lies in the plane perpendicular to the central bond
    back_perp = back - np.dot(back, axis) * axis
    forward_perp = forward - np.dot(forward, axis) * axis

    # the torsion is the in-plane angle between the two rejections; they need
    # not be normalized because arctan2 only depends on the ratio
    cos_part = np.dot(back_perp, forward_perp)
    sin_part = np.dot(np.cross(axis, back_perp), forward_perp)

    return float(np.degrees(np.arctan2(sin_part, cos_part)))
61
+
62
+
63
def rot_mat_from_pointer(pointer: Array1D_int, angle: float) -> Array2D_float:
    """
    Build the rotation matrix for a rotation of ``angle`` around ``pointer``.

    The axis is normalized, turned into a scalar-last quaternion and handed to
    ``quaternion_to_rotation_matrix``.

    :param pointer: 3D vector representing the rotation pivot
    :param angle: rotation angle in degrees
    :return: 3x3 rotation matrix about the pointer direction
    """
    assert pointer.shape[0] == 3

    half_angle = np.radians(angle) / 2
    sin_half = np.sin(half_angle)
    axis = norm(pointer)

    # vector part first, scalar part last
    quaternion = [
        sin_half * axis[0],
        sin_half * axis[1],
        sin_half * axis[2],
        np.cos(half_angle),
    ]
    return quaternion_to_rotation_matrix(quaternion)
84
+
85
+
86
+ def quaternion_to_rotation_matrix(quat: Array1D_float | Sequence[float]) -> Array2D_float:
87
+ """
88
+ Convert a quaternion into a full three-dimensional rotation matrix.
89
+
90
+ This rotation matrix converts a point in the local reference frame to a
91
+ point in the global reference frame.
92
+
93
+ :param quat: 4-element array representing the quaternion (q0, q1, q2, q3)
94
+ :return: 3x3 element array representing the full 3D rotation matrix
95
+ """
96
+ # Extract the values from Q (adjusting for scalar last in input)
97
+ q1, q2, q3, q0 = quat
98
+
99
+ # First row of the rotation matrix
100
+ r00 = 2 * (q0 * q0 + q1 * q1) - 1
101
+ r01 = 2 * (q1 * q2 - q0 * q3)
102
+ r02 = 2 * (q1 * q3 + q0 * q2)
103
+
104
+ # Second row of the rotation matrix
105
+ r10 = 2 * (q1 * q2 + q0 * q3)
106
+ r11 = 2 * (q0 * q0 + q2 * q2) - 1
107
+ r12 = 2 * (q2 * q3 - q0 * q1)
108
+
109
+ # Third row of the rotation matrix
110
+ r20 = 2 * (q1 * q3 - q0 * q2)
111
+ r21 = 2 * (q2 * q3 + q0 * q1)
112
+ r22 = 2 * (q0 * q0 + q3 * q3) - 1
113
+
114
+ # 3x3 rotation matrix
115
+ return np.array([[r00, r01, r02], [r10, r11, r12], [r20, r21, r22]])
116
+
117
+
118
def kronecker_delta(i: int, j: int) -> int:
    """Return 1 when the two indices are equal, 0 otherwise."""
    return 1 if i == j else 0
121
+
122
+
123
def get_inertia_moments(coords: Array3D_float, masses: Array1D_float) -> Array1D_float:
    """
    Find the moments of inertia about the three principal axes.

    The coordinates are shifted to the center of mass on a private copy, so
    the caller's array is never modified.

    :param coords: atomic coordinates, one 3D position per atom
    :param masses: atomic masses; the first ``len(coords)`` entries are used
    :return: shape (3,) array with the principal moments of inertia, sorted
        by increasing magnitude (largest last)
    """
    # np.array copies: centering must not leak back into the caller's data
    positions = np.array(coords, dtype=float)
    weights = np.asarray(masses, dtype=float)[: len(positions)]

    # move the origin to the center of mass
    positions -= weights @ positions / weights.sum()

    # I = sum_n m_n * (|r_n|^2 * identity - outer(r_n, r_n))
    squared_radii = np.einsum("ij,ij->i", positions, positions)
    weighted_outer = (positions * weights[:, None]).T @ positions
    inertia_tensor = np.eye(3) * (weights @ squared_radii) - weighted_outer

    # the tensor is real and symmetric, so eigvalsh yields its (real)
    # eigenvalues directly, with no eigenvector-matrix inversion involved
    moments = np.linalg.eigvalsh(inertia_tensor)
    return moments[np.abs(moments).argsort()]
147
+
148
+
149
def diagonalize(a: Array2D_float) -> Array2D_float:
    """
    Express a square matrix in the basis of its eigenvectors.

    The eigenvectors are ordered by increasing absolute eigenvalue before the
    change of basis is applied.

    :param a: square matrix to diagonalize
    :return: ``inv(b) @ a @ b``, where the columns of ``b`` are the sorted eigenvectors
    """
    eigenvalues, eigenvectors = np.linalg.eig(a)
    order = np.argsort(np.abs(eigenvalues))
    basis = eigenvectors[:, order]
    return np.linalg.inv(basis) @ (a @ basis)  # type: ignore[no-any-return]
154
+
155
+
156
def center_of_mass(coords: Array3D_float, masses: Array1D_float) -> Array1D_float:
    """
    Find the center of mass for the atomic system.

    :param coords: atomic coordinates, one 3D position per atom
    :param masses: atomic masses; entry ``i`` belongs to ``coords[i]``
    :return: mass-weighted mean position
    """
    weighted_sum = np.array([0.0, 0.0, 0.0])
    total_mass = 0
    for i, position in enumerate(coords):
        mass = masses[i]
        weighted_sum += position * mass
        total_mass += mass
    return weighted_sum / total_mass  # type: ignore[no-any-return]
163
+
164
+
165
def get_moi_deviation_vec(
    coords1: Array2D_float, coords2: Array2D_float, masses: Array1D_float
) -> Array1D_float:
    """
    Determine the relative difference of the three principal moments of inertia.

    The deviation is taken relative to the moments of the first structure.

    :param coords1: coordinates of the reference structure
    :param coords2: coordinates of the structure compared against it
    :param masses: atomic masses, shared by both structures
    :return: element-wise ``|I_1 - I_2| / I_1`` for the three principal moments
    """
    reference, other = (get_inertia_moments(coords, masses) for coords in (coords1, coords2))
    absolute_deviation = np.abs(reference - other)
    return absolute_deviation / reference
173
+
174
+
175
def get_alignment_matrix(p: Array1D_float, q: Array1D_float) -> Array2D_float:
    """
    Build the rotation matrix relating point sets q and p (Kabsch algorithm).

    Assumes centered vector sets (i.e. their mean is the origin).

    :param p: first set of points, one per row
    :param q: second set of points, one per row, in the same order as ``p``
    :return: proper rotation matrix obtained from the SVD of ``p.T @ q``
    """
    covariance = np.ascontiguousarray(p.T) @ q

    left, _, right_t = np.linalg.svd(covariance)

    # a negative determinant product would describe a reflection: flip the
    # last left singular vector so the result is a proper rotation
    if np.linalg.det(left) * np.linalg.det(right_t) < 0.0:
        left[:, -1] *= -1

    return left @ right_t  # type: ignore[no-any-return]
@@ -0,0 +1,194 @@
1
+ """Graph manipulation utilities for molecular structures."""
2
+
3
+ from functools import lru_cache
4
+
5
+ import numpy as np
6
+ from networkx import Graph, all_simple_paths, from_numpy_array, set_node_attributes
7
+ from scipy.spatial.distance import cdist
8
+
9
+ from prism_pruner.algebra import dihedral, norm_of
10
+ from prism_pruner.pt import pt
11
+ from prism_pruner.typing import Array1D_bool, Array1D_int, Array2D_float
12
+
13
+
14
@lru_cache(maxsize=128)
def d_min_bond(a1: int, a2: int, factor: float = 1.2) -> float:
    """
    Return the distance threshold used to decide whether two atoms are bonded.

    :param a1: key of the first atom in the periodic table lookup ``pt``
    :param a2: key of the second atom in the periodic table lookup ``pt``
    :param factor: multiplier applied to the sum of the covalent radii
    :return: ``factor`` times the sum of the two covalent radii
    """
    radii_sum = pt[a1].covalent_radius + pt[a2].covalent_radius
    return factor * radii_sum  # type: ignore [no-any-return]
18
+
19
+
20
def graphize(
    atoms: Array1D_int,
    coords: Array2D_float,
    mask: Array1D_bool | None = None,
) -> Graph:
    """
    Return a NetworkX undirected graph of molecular connectivity.

    Two atoms are connected when their distance is below ``d_min_bond`` for
    that pair. Each node stores its entry of ``atoms`` under the ``"atomnos"``
    attribute.

    :param atoms: atomic numbers
    :param coords: atomic coordinates as 3D vectors
    :param mask: bool array, with False for atoms to be excluded in the bond evaluation
    :return: connectivity graph
    """
    if mask is None:
        mask = np.ones(len(atoms), dtype=bool)
    assert len(coords) == len(atoms)
    assert len(coords) == len(mask)

    n_atoms = len(coords)
    adjacency = np.zeros((n_atoms, n_atoms))

    # only atoms left unmasked take part in the bond search
    active = [idx for idx, keep in enumerate(mask) if keep]

    # visit every unordered pair once: only the upper triangle gets filled
    for position, i in enumerate(active):
        for j in active[position + 1 :]:
            if norm_of(coords[i] - coords[j]) < d_min_bond(atoms[i], atoms[j]):
                adjacency[i][j] = 1

    graph = from_numpy_array(adjacency)
    set_node_attributes(graph, dict(enumerate(atoms)), "atomnos")

    return graph
53
+
54
+
55
def get_sp_n(index: int, graph: Graph) -> int | None:
    """
    Get hybridization of selected atom.

    Return n, that is the exponent of the sp^n hybridization for C, N, O, P
    and S atoms, guessed from the number of neighbors in the graph. This is
    just an assimilation to the carbon geometry in relation to sp^n:
    - sp¹ is linear
    - sp² is planar
    - sp³ is tetrahedral
    This is mainly used to understand if a torsion is to be rotated or not.

    :param index: node index of the atom
    :param graph: connectivity graph with ``"atomnos"`` node attributes
    :return: n of sp^n, or None for other elements and for neighbor counts
        that are not tabulated or are ambiguous
    """
    # element -> {number of neighbors -> n of sp^n}
    sp_n_by_neighbor_count: dict[int, dict[int, int | None]] = {
        6: {2: 1, 3: 2, 4: 3},  # C - 2 neighbors means sp, 3 nb means sp2, 4 nb sp3
        7: {2: 2, 3: None, 4: 3},  # N - 3 neighbors could mean sp3 or sp2: undecided
        8: {1: 2, 2: 3, 3: 3, 4: 3},  # O
        15: {2: 2, 3: 3, 4: 3},  # P
        16: {2: 2, 3: 3, 4: 3},  # S
    }

    element_table = sp_n_by_neighbor_count.get(graph.nodes[index]["atomnos"])
    if element_table is None:
        return None

    neighbor_count = len(set(graph.neighbors(index)))
    return element_table.get(neighbor_count)
79
+
80
+
81
def is_amide_n(index: int, graph: Graph, mode: int = -1) -> bool:
    """
    Assess if the atom is an amide-like nitrogen.

    Note: carbamates and ureas are considered amides.

    mode:
    -1 - any amide
    0 - primary amide (CONH2)
    1 - secondary amide (CONHR)
    2 - tertiary amide (CONR2)

    :param index: node index of the atom
    :param graph: connectivity graph with ``"atomnos"`` node attributes
    :param mode: amide class to match, see above
    :return: True if the atom is a nitrogen matching the requested amide class
    """
    # Must be a nitrogen atom
    if graph.nodes[index]["atomnos"] != 7:
        return False

    neighbors = set(graph.neighbors(index))

    if mode != -1:
        # primary amides carry two hydrogens, secondary one, tertiary none
        hydrogen_count = [graph.nodes[j]["atomnos"] for j in neighbors].count(1)
        if hydrogen_count != (2, 1, 0)[mode]:
            return False

    for neighbor in neighbors:
        # only carbon neighbors can be the carbonyl-like carbon
        if graph.nodes[neighbor]["atomnos"] != 6:
            continue

        # that carbon must be bonded to exactly three atoms, one of them an oxygen
        carbon_neighbors = set(graph.neighbors(neighbor))
        if len(carbon_neighbors) == 3 and any(
            graph.nodes[i]["atomnos"] == 8 for i in carbon_neighbors
        ):
            return True

    return False
113
+
114
+
115
def is_ester_o(index: int, graph: Graph) -> bool:
    """
    Assess if the index is an ester-like oxygen.

    Note: carbamates and carbonates return True, carboxylic acids return False.

    :param index: node index of the atom
    :param graph: connectivity graph with ``"atomnos"`` node attributes
    :return: True if the atom is an oxygen with no hydrogen attached, bonded to
        a carbon that has exactly three neighbors, at least two of them oxygens
    """
    if graph.nodes[index]["atomnos"] != 8:
        return False

    neighbors = set(graph.neighbors(index))

    # An O-H oxygen (e.g. carboxylic acid) is not ester-like. The check must
    # look at the neighbors' atomic numbers: the neighbor set holds node
    # indices, so testing `1 in neighbors` would ask about atom #1 instead.
    if any(graph.nodes[n]["atomnos"] == 1 for n in neighbors):
        return False

    for neighbor in neighbors:
        if graph.nodes[neighbor]["atomnos"] != 6:
            continue

        carbon_neighbors = set(graph.neighbors(neighbor))
        if len(carbon_neighbors) != 3:
            continue

        # the carbon must carry at least two oxygens (this one and another)
        oxygen_count = [graph.nodes[i]["atomnos"] for i in carbon_neighbors].count(8)
        if oxygen_count > 1:
            return True

    return False
133
+
134
+
135
def is_phenyl(coords: Array2D_float) -> bool:
    """
    Assess if the six atomic coords refer to a phenyl-like ring.

    Two checks are made: no pair of points may be more than 3 A apart, and
    the dihedral of the first four points must be within 10 degrees of
    planarity (0 or 180 degrees).

    Note: quinones evaluate to True

    :param coords: six coordinates of C/N atoms
    :return: bool indicating if the six atoms look like part of a
        phenyl/naphthyl/pyridine system
    """
    # if any atomic couple is more than 3 A away from each other, this is not a Ph
    largest_distance = np.max(cdist(coords, coords))
    if largest_distance > 3:
        return False

    # deviation from planarity measured as 1 - |cos(dihedral)|, tolerance of 10 degrees
    torsion = dihedral(coords[[0, 1, 2, 3]])
    flat_delta: float = 1 - np.abs(np.cos(torsion * np.pi / 180))
    threshold_delta: float = 1 - np.cos(10 * np.pi / 180)

    return flat_delta < threshold_delta
153
+
154
+
155
def get_phenyl_ids(index: int, graph: Graph) -> list[int] | None:
    """
    If index is part of a phenyl, return the six heavy atoms ids associated with the ring.

    A ring is accepted when a six-node simple path joins ``index`` to one of
    its neighbors, contains no hydrogen, and every node on it has exactly
    three neighbors.

    :param index: node index of the atom
    :param graph: connectivity graph with ``"atomnos"`` node attributes
    :return: node indices of the first matching ring, or None if there is none
    """
    for neighbor in graph.neighbors(index):
        for ring in all_simple_paths(graph, source=index, target=neighbor, cutoff=6):
            if len(ring) != 6:
                continue

            if any(graph.nodes[atom]["atomnos"] == 1 for atom in ring):
                continue

            if all(len(set(graph.neighbors(atom))) == 3 for atom in ring):
                return ring  # type: ignore [no-any-return]

    return None
165
+
166
+
167
def find_paths(
    graph: Graph,
    u: int,
    n: int,
    exclude_set: set[int] | None = None,
) -> list[list[int]]:
    """
    Find paths in graph.

    Recursively find all paths of a NetworkX graph that start from node u and
    take n steps, never visiting a node twice.

    :param graph: NetworkX graph
    :param u: starting node
    :param n: path length (number of steps away from u)
    :param exclude_set: set of nodes to exclude from the paths
    :return: list of paths (each path is a list of node indices)
    """
    # build a new set so the caller's exclude_set is left unchanged
    visited = (exclude_set if exclude_set else set()) | {u}

    if n == 0:
        return [[u]]

    paths: list[list[int]] = []
    for neighbor in graph.neighbors(u):
        if neighbor in visited:
            continue
        for tail in find_paths(graph, neighbor, n - 1, visited):
            paths.append([u, *tail])

    return paths