rdworks 0.25.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdworks/__init__.py +35 -0
- rdworks/autograph/__init__.py +4 -0
- rdworks/autograph/autograph.py +184 -0
- rdworks/autograph/centroid.py +90 -0
- rdworks/autograph/dynamictreecut.py +135 -0
- rdworks/autograph/nmrclust.py +123 -0
- rdworks/autograph/rckmeans.py +74 -0
- rdworks/bitqt/__init__.py +1 -0
- rdworks/bitqt/bitqt.py +355 -0
- rdworks/conf.py +374 -0
- rdworks/descriptor.py +36 -0
- rdworks/display.py +206 -0
- rdworks/ionized.py +170 -0
- rdworks/matchedseries.py +260 -0
- rdworks/mol.py +1522 -0
- rdworks/mollibr.py +887 -0
- rdworks/pka.py +38 -0
- rdworks/predefined/Asinex_fragment.xml +20 -0
- rdworks/predefined/Astex_RO3.xml +16 -0
- rdworks/predefined/Baell2010_PAINS/Baell2010A.xml +52 -0
- rdworks/predefined/Baell2010_PAINS/Baell2010B.xml +169 -0
- rdworks/predefined/Baell2010_PAINS/Baell2010C.xml +1231 -0
- rdworks/predefined/Baell2010_PAINS/PAINS-less-than-015-hits.xml +2048 -0
- rdworks/predefined/Baell2010_PAINS/PAINS-less-than-150-hits.xml +278 -0
- rdworks/predefined/Baell2010_PAINS/PAINS-more-than-150-hits.xml +83 -0
- rdworks/predefined/Baell2010_PAINS/makexml.py +70 -0
- rdworks/predefined/Brenk2008_Dundee/makexml.py +21 -0
- rdworks/predefined/CNS.xml +18 -0
- rdworks/predefined/ChEMBL_Walters/BMS.xml +543 -0
- rdworks/predefined/ChEMBL_Walters/Dundee.xml +318 -0
- rdworks/predefined/ChEMBL_Walters/Glaxo.xml +168 -0
- rdworks/predefined/ChEMBL_Walters/Inpharmatica.xml +276 -0
- rdworks/predefined/ChEMBL_Walters/LINT.xml +174 -0
- rdworks/predefined/ChEMBL_Walters/MLSMR.xml +351 -0
- rdworks/predefined/ChEMBL_Walters/PAINS.xml +1446 -0
- rdworks/predefined/ChEMBL_Walters/SureChEMBL.xml +501 -0
- rdworks/predefined/ChEMBL_Walters/makexml.py +40 -0
- rdworks/predefined/Hann1999_Glaxo/Hann1999.xml +168 -0
- rdworks/predefined/Hann1999_Glaxo/Hann1999Acid.xml +102 -0
- rdworks/predefined/Hann1999_Glaxo/Hann1999Base.xml +6 -0
- rdworks/predefined/Hann1999_Glaxo/Hann1999ElPh.xml +6 -0
- rdworks/predefined/Hann1999_Glaxo/Hann1999NuPh.xml +6 -0
- rdworks/predefined/Hann1999_Glaxo/makexml.py +83 -0
- rdworks/predefined/Kazius2005/Kazius2005.xml +114 -0
- rdworks/predefined/Kazius2005/makexml.py +66 -0
- rdworks/predefined/ZINC_druglike.xml +24 -0
- rdworks/predefined/ZINC_fragment.xml +14 -0
- rdworks/predefined/ZINC_leadlike.xml +15 -0
- rdworks/predefined/fragment.xml +7 -0
- rdworks/predefined/ionized/simple_smarts_pattern.csv +57 -0
- rdworks/predefined/ionized/smarts_pattern.csv +107 -0
- rdworks/predefined/misc/makexml.py +119 -0
- rdworks/predefined/misc/reactive-part-2.xml +104 -0
- rdworks/predefined/misc/reactive-part-3.xml +74 -0
- rdworks/predefined/misc/reactive.xml +321 -0
- rdworks/readin.py +312 -0
- rdworks/rgroup.py +2173 -0
- rdworks/scaffold.py +520 -0
- rdworks/std.py +143 -0
- rdworks/stereoisomers.py +127 -0
- rdworks/tautomers.py +20 -0
- rdworks/units.py +63 -0
- rdworks/utils.py +495 -0
- rdworks/xml.py +260 -0
- rdworks-0.25.7.dist-info/METADATA +37 -0
- rdworks-0.25.7.dist-info/RECORD +69 -0
- rdworks-0.25.7.dist-info/WHEEL +5 -0
- rdworks-0.25.7.dist-info/licenses/LICENSE +21 -0
- rdworks-0.25.7.dist-info/top_level.txt +1 -0
rdworks/bitqt/bitqt.py
ADDED
@@ -0,0 +1,355 @@
|
|
1
|
+
"""
|
2
|
+
@author: Roy Gonzalez-Aleman [roy_gonzalez@fq.uh.cu]
|
3
|
+
@author: Daniel Platero Rochart [daniel.platero@gmail.com]
|
4
|
+
"""
|
5
|
+
from collections import deque, OrderedDict
|
6
|
+
from typing import Tuple, Optional
|
7
|
+
|
8
|
+
import numpy as np
|
9
|
+
import pandas as pd
|
10
|
+
|
11
|
+
from bitarray import util as bu
|
12
|
+
from bitarray import bitarray as ba
|
13
|
+
|
14
|
+
from ..autograph.centroid import centroid_medoid
|
15
|
+
|
16
|
+
|
17
|
+
|
18
|
+
def convert_to_bitarr_matrix(rmsdMatrix: np.ndarray, cutoff: float) -> OrderedDict:
    """
    Binary-encode a pairwise RMSD matrix as one bitarray per row.

    Bit ``j`` of row ``i`` is turned on when ``rmsdMatrix[i, j] <= cutoff``, so
    pairwise similarity is kept in RAM as bits (dict of bitarrays), not floats.

    Parameters
    ----------
    rmsdMatrix : numpy.ndarray
        2-D array of pairwise RMSD values; row ``i`` is compared element-wise
        against ``cutoff``.
    cutoff : float
        Threshold; values less than or equal to it count as neighbors.

    Returns
    -------
    matrix : collections.OrderedDict
        dict of bitarrays keyed by row index (0 .. number of rows - 1). Every
        bitarray is zero-padded to a whole number of bytes, so it can be up to
        7 bits longer than the row it encodes.
    """
    matrix = OrderedDict()
    for i in range(rmsdMatrix.shape[0]):
        # Compare against the scalar cutoff directly. Narrowing it to float32
        # first (as an array of cutoffs) can round it *down* (e.g. 0.7), which
        # would wrongly reject distances exactly equal to the cutoff, and a
        # fixed-length cutoff vector only broadcasts for square input.
        within_cutoff = np.less_equal(rmsdMatrix[i, :], cutoff)
        bitarr = ba()
        # The boolean row is one byte per element; pack() maps every non-zero
        # byte to a single 1 bit.
        bitarr.pack(within_cutoff.tobytes())
        # Pad with zeros up to a multiple of 8 bits.
        bitarr.fill()
        matrix[i] = bitarr
    return matrix
|
42
|
+
|
43
|
+
|
44
|
+
|
45
|
+
def calc_matrix_degrees(unclustered_bit, matrix):
    """
    Calculate number of neighbors (degree) of unclustered nodes in matrix.

    Parameters
    ----------
    unclustered_bit : bitarray.bitarray
        bitarray with indices of unclustered nodes turned on.
    matrix : collections.OrderedDict
        dict of bitarrays.

    Returns
    -------
    degrees : numpy.ndarray
        int32 array of length ``len(unclustered_bit)`` containing each node
        degree (number of bits set in its row). Nodes that are turned off in
        ``unclustered_bit``, or that have no row in ``matrix``, keep degree 0.
    """
    one = ba('1')
    degrees = np.zeros(len(unclustered_bit), dtype=np.int32)
    # ``itersearch`` was removed in bitarray 3.0, where ``search`` itself
    # returns the iterator; prefer the old name when it is still available.
    find_set_bits = getattr(unclustered_bit, 'itersearch',
                            unclustered_bit.search)
    for node in find_set_bits(one):
        try:
            degrees[node] = matrix[node].count()
        except KeyError:
            # Position without a row in ``matrix`` (presumably a padding bit
            # beyond the last real node): leave its degree at 0.
            pass
    return degrees
|
70
|
+
|
71
|
+
|
72
|
+
def colour_matrix(degrees, matrix):
    """
    Greedy coloring of bit-encoded RMSD matrix.

    Nodes are visited from highest to lowest ``degrees`` value. Each visited,
    still uncolored node opens a new color class, which is then extended with
    uncolored nodes that are not neighbors of it nor of any node already
    added to the class.

    Parameters
    ----------
    degrees : numpy.ndarray
        array containing each node degree. Clustered nodes have degree = 0.
        Its size fixes the length of the returned array; only the first
        ``len(matrix)`` entries are used for the visiting order.
    matrix : collections.OrderedDict
        dict of bitarrays.

    Returns
    -------
    colors : numpy.ndarray
        int32 array of colors (1, 2, ...) assigned to each node of the matrix.
        A 0 means the node was never colored.
    """
    # Constants ---------------------------------------------------------------
    N = degrees.size
    m = len(matrix)
    one = ba('1')
    xcolor = 0
    # Initialize containers ---------------------------------------------------
    colors = np.zeros(N, dtype=np.int32)
    colored = ba(N)
    colored.setall(0)
    seen = set()
    # A ``for`` loop (instead of ``while True`` + ``next``) ends cleanly if the
    # ordering is exhausted rather than leaking a bare StopIteration.
    for max_node in (-degrees[:m]).argsort():
        if max_node in seen:
            continue
        seen.add(max_node)
        xcolor += 1
        # Nodes passing conditions (not-neighb of max_node, not-colored) ------
        candidates = ~matrix[max_node] & ~colored
        passed = [max_node]
        # ``itersearch`` was removed in bitarray 3.0, where ``search`` itself
        # returns the iterator. The search must stay lazy: ``candidates`` is
        # narrowed in place while it is being scanned.
        find_set_bits = getattr(candidates, 'itersearch', candidates.search)
        for candidate in find_set_bits(one):
            passed.append(candidate)
            try:
                # Drop the neighbors of the accepted candidate so the class
                # only ever contains mutually non-adjacent nodes.
                candidates &= ~matrix[candidate]
            except KeyError:
                # No row for this position in ``matrix``: nothing to exclude.
                continue
            if not candidates.any():
                break
        seen.update(passed)
        # Deliver a color class to passed nodes -------------------------------
        colors[passed] = xcolor
        colored = ba()
        colored.pack(colors.astype(np.bool_).tobytes())
        if colored.count(0) == 0:
            break
    return colors
|
127
|
+
|
128
|
+
|
129
|
+
def bitarray_to_np(bitarr):
    """
    Expand a bitarray into a numpy boolean array, one element per bit.

    Parameters
    ----------
    bitarr : bitarray.bitarray
        a bitarray (anything ``numpy.unpackbits`` accepts as uint8 data).

    Returns
    -------
    numpy.ndarray
        boolean array holding the unpacked bits of the input. NOTE(review):
        ``numpy.unpackbits`` expands whole bytes, so the result is presumably
        padded to a multiple of 8 entries -- confirm against callers.
    """
    unpacked = np.unpackbits(bitarr)
    return unpacked.astype(np.bool_)
|
144
|
+
|
145
|
+
|
146
|
+
def do_bit_cascade(big_node, degrees, colors, matrix, max_):
    """
    Search for a clique around ``big_node`` by successively AND-ing its
    bitarray with the bitarrays of selected neighbors.

    Parameters
    ----------
    big_node : int
        node whose bitarray will start the operations.
    degrees : numpy.ndarray
        array containing each node degree. Clustered nodes have degree = 0.
    colors : numpy.ndarray
        array of colors assigned to each node of the matrix.
    matrix : collections.OrderedDict
        dict of bitarrays.
    max_ : int
        Size to beat. The search is abandoned as soon as the neighbors span
        ``max_`` colors or fewer, or the running bitarray has ``max_`` bits
        or fewer turned on.

    Returns
    -------
    None
        when the search was abandoned (see ``max_``).
    (clique_bits, members) : tuple
        otherwise: the bitarray left after the AND operations, and a
        numpy.ndarray with the indices of the bits still turned on in it.
    """
    clique_bits = matrix[big_node]
    # .... recovering neighbors and their information .........................
    neighbors = bitarray_to_np(clique_bits).nonzero()[0]
    neighbor_colors = colors[neighbors]
    # At most one node per color is merged below, so the number of distinct
    # colors must exceed max_ for the search to be worthwhile.
    if np.unique(neighbor_colors).size <= max_:
        return None
    color_counts = np.bincount(neighbor_colors)
    # .... ordering neighbors by g ---> colors ---> degrees ...................
    # lexsort uses the LAST key as primary: color frequency among the
    # neighbors, then color, then descending degree.
    order = np.lexsort([-degrees[neighbors],
                        neighbor_colors,
                        color_counts[neighbor_colors]])

    # .... BitCascade considering divergence ..................................
    merged = 0
    used_colors = set()
    for candidate, color in zip(neighbors[order], neighbor_colors[order]):
        # Skip colors already used and nodes already AND-ed out.
        if color in used_colors or not clique_bits[candidate]:
            continue
        used_colors.add(color)
        clique_bits = matrix[candidate] & clique_bits
        merged += 1
        remaining = clique_bits.count()
        if remaining <= max_:
            return None
        # As many rows merged as bits left: stop cascading.
        if merged >= remaining:
            break
    members = bitarray_to_np(clique_bits).nonzero()[0]
    return clique_bits, members
|
203
|
+
|
204
|
+
|
205
|
+
def set_to_bitarray(set_, N):
    """
    Build a bitarray of length N whose on-bits are the indices in a set.

    Parameters
    ----------
    set_ : set
        a python set of integer indices.
    N : int
        length of the desired bitarray. It must be greater than the maximum
        value of indices present in set.

    Returns
    -------
    bitarr : bitarray.bitarray
        bitarray of length N with indices present in set turned on.
    """
    # Boolean mask (one byte per position) flagged at the requested indices.
    mask = np.zeros(N, dtype=np.bool_)
    mask[list(set_)] = 1
    # pack() turns every non-zero byte of the mask into a single 1 bit.
    packed = ba()
    packed.pack(mask.tobytes())
    return packed
|
227
|
+
|
228
|
+
|
229
|
+
def get_cluster_stats(clusters):
    """
    Build the cluster statistics table: cluster id, cluster size and the
    percentage of the counted elements that fall in each cluster.

    Parameters
    ----------
    clusters : numpy.ndarray
        array of integer cluster IDs, one per element. IDs below 0 are
        ignored.

    Returns
    -------
    clusters_df : pandas.DataFrame
        dataframe with columns ``cluster_id`` (every id from 0 up to the
        largest one present, including ids with no members), ``size`` and
        ``percent`` (rounded to 4 decimals). The frame has no rows when
        ``clusters`` is empty or holds no non-negative id.
    """
    columns = ['cluster_id', 'size', 'percent']
    clusters = np.asarray(clusters)
    # max() of an empty array raises, so treat "nothing to count" explicitly.
    top_id = int(clusters.max()) if clusters.size else -1
    if top_id < 0:
        return pd.DataFrame(columns=columns)

    # Count every id in a single pass instead of one full scan per id.
    # 'same_kind' keeps integer inputs working while still rejecting floats.
    valid = clusters[clusters >= 0].astype(np.intp, casting='same_kind',
                                           copy=False)
    sizes = np.bincount(valid, minlength=top_id + 1)
    percents = np.round(sizes / sizes.sum() * 100, 4)

    return pd.DataFrame(
        {
            'cluster_id': list(range(top_id + 1)),
            'size': sizes,
            'percent': percents,
        },
        columns=columns,
    )
|
258
|
+
|
259
|
+
|
260
|
+
def _promising_nodes(degrees, colors, big_clique, excluded):
    """Queue the nodes that could still belong to a clique bigger than
    ``big_clique``: degree above its size, not in ``excluded``, and colored
    with a color that does not occur inside ``big_clique``."""
    biggers = degrees > big_clique.size
    biggers[excluded] = False
    promising_colors = np.setdiff1d(colors[biggers], colors[big_clique])
    nodes = deque()
    for color in promising_colors:
        nodes.extend(((colors == color) & biggers).nonzero()[0])
    return nodes


def BitQT(rmsdMatrix: np.ndarray, cutoff: float, min_clust_size: int = 2, nclust: Optional[int] = None) -> Tuple:
    """BitQT clustering

    Repeatedly extracts the biggest clique found in the bit-encoded
    neighbor matrix (``rmsdMatrix <= cutoff``), labels its members with the
    next cluster number and erases them from the matrix.

    Parameters
    ----------
    rmsdMatrix : numpy.ndarray
        square array of pairwise RMSD values.
    cutoff : float
        two elements are neighbors when their RMSD is <= cutoff.
    min_clust_size : int
        the search stops (without saving) at the first clique that has
        fewer members than this.
    nclust : int, optional
        maximum number of clusters to extract. None (or 0) means no limit.

    Returns:
        (cluster_assignment, centroid_indices)
        ``cluster_assignment`` is a list with one label per row of
        ``rmsdMatrix``: 1, 2, ... in extraction order, 0 for elements left
        unclustered. ``centroid_indices`` is whatever ``centroid_medoid``
        returns for that assignment and ``rmsdMatrix``.
    """
    matrix = convert_to_bitarr_matrix(rmsdMatrix, cutoff)
    # ++++ Tracking clust/uNCLUSTERed bits to avoid re-computations +++++++++++
    N = len(matrix[0])   # bitarray length: >= m because rows are byte-padded
    m = len(matrix)      # number of real nodes
    one = ba('1')
    unclust_bit = ba(N)
    unclust_bit.setall(1)
    # ++++ Save clusters in an array (1 .. N) +++++++++++++++++++++++++++++++++
    clusters_array = np.zeros(N, dtype=np.int32)
    NCLUSTER = 0
    clustered = set()
    # ++++ Coloring ordered vertices (1 .. N) +++++++++++++++++++++++++++++++++
    degrees = calc_matrix_degrees(unclust_bit, matrix)
    # colour_matrix() expects the degrees themselves and derives the
    # largest-first visiting order internally; handing it an argsort
    # permutation made it order the nodes by meaningless values.
    colors = colour_matrix(degrees, matrix)

    # =========================================================================
    # 2. Main algorithm: BitQT !
    # =========================================================================
    while degrees.any():
        NCLUSTER += 1
        # ++++ Find a big clique early ++++++++++++++++++++++++++++++++++++++++
        big_node = degrees.argmax()
        _, big_clique = do_bit_cascade(big_node, degrees, colors, matrix, 0)
        big_clique_size = big_clique.size
        # ++++ Find promising nodes +++++++++++++++++++++++++++++++++++++++++++
        promising_nodes = _promising_nodes(degrees, colors, big_clique,
                                           big_clique)
        # ++++ Explore all promising nodes ++++++++++++++++++++++++++++++++++++
        cum_found = big_clique
        while promising_nodes:
            node = promising_nodes.popleft()
            found = do_bit_cascade(node, degrees, colors, matrix,
                                   big_clique_size)
            # None means "no clique bigger than the current best from here".
            # Checked explicitly so real TypeErrors are not swallowed.
            if found is None:
                continue
            clique = found[1]
            # ++++ Cumulative update only if biggers candidates are found +++++
            if clique.size > big_clique_size:
                big_clique = clique
                big_clique_size = big_clique.size
                # ++++ Repeat previous condition ++++++++++++++++++++++++++++++
                cum_found = np.concatenate((cum_found, big_clique))
                promising_nodes = _promising_nodes(degrees, colors,
                                                   big_clique, cum_found)

        # Too-small cliques are not saved and end the search.
        if big_clique_size < min_clust_size:
            break

        # ++++ Save new cluster & update NCLUSTER +++++++++++++++++++++++++++++
        clusters_array[big_clique] = NCLUSTER
        # Test the cluster limit only AFTER saving: testing it before made
        # nclust=k return k-1 clusters (and nclust=1 none at all).
        if nclust and NCLUSTER >= nclust:
            break
        # ++++ Update (un)clustered_bit +++++++++++++++++++++++++++++++++++++++
        clustered.update(big_clique)
        clustered_bit = set_to_bitarray(clustered, N)
        unclust_bit = ~clustered_bit
        # ++++ Hard erasing of clustered frames from matrix +++++++++++++++++++
        degrees = np.zeros(N, dtype=np.int32)
        remaining = unclust_bit[:m]
        # ``itersearch`` was removed in bitarray 3.0, where ``search`` itself
        # returns the iterator.
        find_set_bits = getattr(remaining, 'itersearch', remaining.search)
        for x in find_set_bits(one):
            # NOTE(review): the degree is read before the row is pruned, so it
            # still counts neighbors clustered in this round (an upper bound
            # of the real degree). Kept as is -- confirm whether intended.
            degrees[x] = matrix[x].count()
            if bu.count_and(matrix[x], clustered_bit):
                matrix[x] &= (matrix[x] ^ clustered_bit)

    # =========================================================================
    # 3. Output
    # =========================================================================
    cluster_assignment = list(clusters_array[:m])
    centroid_indices = centroid_medoid(cluster_assignment, rmsdMatrix)

    return cluster_assignment, centroid_indices
|