rdworks 0.25.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdworks/__init__.py +35 -0
- rdworks/autograph/__init__.py +4 -0
- rdworks/autograph/autograph.py +184 -0
- rdworks/autograph/centroid.py +90 -0
- rdworks/autograph/dynamictreecut.py +135 -0
- rdworks/autograph/nmrclust.py +123 -0
- rdworks/autograph/rckmeans.py +74 -0
- rdworks/bitqt/__init__.py +1 -0
- rdworks/bitqt/bitqt.py +355 -0
- rdworks/conf.py +374 -0
- rdworks/descriptor.py +36 -0
- rdworks/display.py +206 -0
- rdworks/ionized.py +170 -0
- rdworks/matchedseries.py +260 -0
- rdworks/mol.py +1522 -0
- rdworks/mollibr.py +887 -0
- rdworks/pka.py +38 -0
- rdworks/predefined/Asinex_fragment.xml +20 -0
- rdworks/predefined/Astex_RO3.xml +16 -0
- rdworks/predefined/Baell2010_PAINS/Baell2010A.xml +52 -0
- rdworks/predefined/Baell2010_PAINS/Baell2010B.xml +169 -0
- rdworks/predefined/Baell2010_PAINS/Baell2010C.xml +1231 -0
- rdworks/predefined/Baell2010_PAINS/PAINS-less-than-015-hits.xml +2048 -0
- rdworks/predefined/Baell2010_PAINS/PAINS-less-than-150-hits.xml +278 -0
- rdworks/predefined/Baell2010_PAINS/PAINS-more-than-150-hits.xml +83 -0
- rdworks/predefined/Baell2010_PAINS/makexml.py +70 -0
- rdworks/predefined/Brenk2008_Dundee/makexml.py +21 -0
- rdworks/predefined/CNS.xml +18 -0
- rdworks/predefined/ChEMBL_Walters/BMS.xml +543 -0
- rdworks/predefined/ChEMBL_Walters/Dundee.xml +318 -0
- rdworks/predefined/ChEMBL_Walters/Glaxo.xml +168 -0
- rdworks/predefined/ChEMBL_Walters/Inpharmatica.xml +276 -0
- rdworks/predefined/ChEMBL_Walters/LINT.xml +174 -0
- rdworks/predefined/ChEMBL_Walters/MLSMR.xml +351 -0
- rdworks/predefined/ChEMBL_Walters/PAINS.xml +1446 -0
- rdworks/predefined/ChEMBL_Walters/SureChEMBL.xml +501 -0
- rdworks/predefined/ChEMBL_Walters/makexml.py +40 -0
- rdworks/predefined/Hann1999_Glaxo/Hann1999.xml +168 -0
- rdworks/predefined/Hann1999_Glaxo/Hann1999Acid.xml +102 -0
- rdworks/predefined/Hann1999_Glaxo/Hann1999Base.xml +6 -0
- rdworks/predefined/Hann1999_Glaxo/Hann1999ElPh.xml +6 -0
- rdworks/predefined/Hann1999_Glaxo/Hann1999NuPh.xml +6 -0
- rdworks/predefined/Hann1999_Glaxo/makexml.py +83 -0
- rdworks/predefined/Kazius2005/Kazius2005.xml +114 -0
- rdworks/predefined/Kazius2005/makexml.py +66 -0
- rdworks/predefined/ZINC_druglike.xml +24 -0
- rdworks/predefined/ZINC_fragment.xml +14 -0
- rdworks/predefined/ZINC_leadlike.xml +15 -0
- rdworks/predefined/fragment.xml +7 -0
- rdworks/predefined/ionized/simple_smarts_pattern.csv +57 -0
- rdworks/predefined/ionized/smarts_pattern.csv +107 -0
- rdworks/predefined/misc/makexml.py +119 -0
- rdworks/predefined/misc/reactive-part-2.xml +104 -0
- rdworks/predefined/misc/reactive-part-3.xml +74 -0
- rdworks/predefined/misc/reactive.xml +321 -0
- rdworks/readin.py +312 -0
- rdworks/rgroup.py +2173 -0
- rdworks/scaffold.py +520 -0
- rdworks/std.py +143 -0
- rdworks/stereoisomers.py +127 -0
- rdworks/tautomers.py +20 -0
- rdworks/units.py +63 -0
- rdworks/utils.py +495 -0
- rdworks/xml.py +260 -0
- rdworks-0.25.7.dist-info/METADATA +37 -0
- rdworks-0.25.7.dist-info/RECORD +69 -0
- rdworks-0.25.7.dist-info/WHEEL +5 -0
- rdworks-0.25.7.dist-info/licenses/LICENSE +21 -0
- rdworks-0.25.7.dist-info/top_level.txt +1 -0
rdworks/__init__.py
ADDED
@@ -0,0 +1,35 @@
+__version__ = '0.25.7'
+
+from rdworks.xml import list_predefined_xml, get_predefined_xml, parse_xml
+from rdworks.units import ev2kcalpermol, hartree2ev, hartree2kcalpermol, periodictable
+from rdworks.readin import read_csv, merge_csv, read_dataframe, read_smi, read_sdf, read_mae
+from rdworks.std import desalt_smiles, standardize_smiles, standardize
+from rdworks.tautomers import complete_tautomers
+from rdworks.stereoisomers import complete_stereoisomers
+from rdworks.ionized import IonizedStates
+from rdworks.rgroup import expand_rgroup, most_common, most_common_in_NP
+from rdworks.scaffold import scaffold_network, scaffold_tree, BRICS_fragmented, BRICS_fragment_indices
+from rdworks.matchedseries import MatchedSeries
+from rdworks.descriptor import rd_descriptor, rd_descriptor_f
+from rdworks.utils import fix_decimal_places_in_list, fix_decimal_places_in_dict, mae_to_dict, mae_rd_index
+from rdworks.display import svg
+from rdworks.conf import Conf
+from rdworks.mol import Mol
+from rdworks.mollibr import MolLibr
+
+from rdkit import rdBase, RDLogger
+rdkit_logger = RDLogger.logger().setLevel(RDLogger.CRITICAL)
+
+import logging
+
+main_logger = logging.getLogger()
+main_logger.setLevel(logging.INFO)  # level: DEBUG < INFO < WARNING < ERROR < CRITICAL
+logger_formatter = logging.Formatter(
+    fmt='%(asctime)s %(levelname)s %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S')
+logger_ch = logging.StreamHandler()
+logger_ch.setFormatter(logger_formatter)
+main_logger.addHandler(logger_ch)
+
+
+__rdkit_version__ = rdBase.rdkitVersion
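Since __init__.py above re-exports the package API and records both the package and RDKit versions, the quickest smoke test after installation is an import. A minimal sketch, assuming rdworks and RDKit are installed; the second printed value depends on the local RDKit build:

import rdworks

print(rdworks.__version__)        # '0.25.7'
print(rdworks.__rdkit_version__)  # RDKit version string reported by rdBase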
rdworks/autograph/autograph.py
ADDED
@@ -0,0 +1,184 @@
+# 2020-04-17
+
+# Kiyoto Aramis Tanemura
+
+# Code for the Louvain algorithm was getting lengthy, so I dedicate a module to it.
+
+import numpy as np
+
+from .centroid import centroid_autograph
+
+def getModularity(affinityMatrix, communityAssignments, resolution = 1.0):
+    # Provided an affinity matrix (np.array) and nodes specifying their assigned communities (list),
+    # return the modularity of the whole graph
+    arrayDim = affinityMatrix.shape[0]
+    communities = list(set(communityAssignments))
+    Q = 0
+    affinityMatrix[range(arrayDim), range(arrayDim)] += affinityMatrix[range(arrayDim), range(arrayDim)]
+    two_m = np.sum(affinityMatrix)
+    for C in communities:
+        communityIndices = [i for i in range(len(communityAssignments)) if communityAssignments[i] == C]
+        sigma_in = np.sum(affinityMatrix[communityIndices, :][:, communityIndices])
+        sigma_tot = np.sum(affinityMatrix[communityIndices, :])
+        Q += sigma_in / two_m - resolution * (sigma_tot / two_m) ** 2
+    return Q
+
+def LouvainPhase1(affinityMatrix, communityAssignment, Q_threshold, max_iter, resolution):
+    # inputs: affinityMatrix as a numpy array. Clear the diagonal prior to entering and filter out edges with weights below threshold
+    # communityAssignment: list with one entry per node; values are community IDs
+    # Q_threshold: minimum gain in global modularity required to continue phase 1
+    # max_iter: maximum number of iterations before termination of phase 1 is forced
+    # output: updated community assignment list
+
+    num_nodes = affinityMatrix.shape[0]
+
+    affinityMatrix[range(num_nodes), range(num_nodes)] += affinityMatrix[range(num_nodes), range(num_nodes)]
+
+    two_m = np.sum(affinityMatrix)
+
+    modularity_current = getModularity(affinityMatrix, communityAssignment, resolution)
+    changeModularity = 1
+
+    adjacencyMatrix = affinityMatrix > 0
+    iterations = 0
+
+    while changeModularity > Q_threshold and iterations < max_iter:
+        modularity_prev = modularity_current
+        for i in range(num_nodes): # for each node
+            neighborIndices = [j for j in range(num_nodes) if adjacencyMatrix[i, j] == 1]
+            communities = set([communityAssignment[k] for k in neighborIndices])
+            communities.discard(communityAssignment[i])
+            communities = list(communities)
+            modularityList = []
+            # ki: sum of weights of edges incident to node i
+            ki = np.sum(affinityMatrix[i, :])
+            for C in communities:
+                C_members = [x for x in range(num_nodes) if communityAssignment[x] == C]
+                # sigma_tot: sum of weights of all edges incident to nodes in C
+                sigma_tot = np.sum(affinityMatrix[C_members, :])
+                # ki_in: sum of weights of edges between node i and nodes in C
+                ki_in = np.sum(affinityMatrix[i, :][C_members])
+                deltaQ = ki_in / two_m - resolution * sigma_tot * ki / (two_m ** 2 / 2) # simplified formula. Derivation from (https://hal.archives-ouvertes.fr/hal-01231784/document)
+                modularityList.append(deltaQ)
+            modularityList.append(0) # append zero to avoid an error on an empty list
+            maxQgain = np.max(modularityList)
+            if maxQgain > 0:
+                communityToJoin = communities[modularityList.index(maxQgain)]
+                communityAssignment[i] = communityToJoin
+        modularity_current = getModularity(affinityMatrix, communityAssignment, resolution)
+        changeModularity = modularity_current - modularity_prev
+        iterations += 1
+
+    return communityAssignment
+
+def LouvainPhase2(affinityMatrix, communityAssignment):
+    # Merge each community into a supernode
+    # input: affinityMatrix or condensed graph
+    # output: condensed graph and list of communities corresponding to the axes of the graph
+    num_nodes = affinityMatrix.shape[0]
+    communities = list(set(communityAssignment))
+    num_communities = len(communities)
+    phase2graph = np.zeros([num_communities, num_communities])
+    for i in range(num_communities):
+        i_members = [x for x in range(num_nodes) if communityAssignment[x] == communities[i]]
+        for j in range(num_communities):
+            j_members = [x for x in range(num_nodes) if communityAssignment[x] == communities[j]]
+            phase2graph[i, j] = np.sum(affinityMatrix[i_members, :][:, j_members]) * (1/2) ** (i == j)
+            # within a community the edges are double counted, so if i == j, divide by two
+
+    return phase2graph, communities
+
+
+def Louvain(affinityMatrix, Q_threshold = 0.001, max_iter = 50, resolution = 1.0):
+    # perform the two phases of Louvain community detection iteratively
+    graph = affinityMatrix
+    comm = list(range(affinityMatrix.shape[0]))
+    # communityAssignmentRecord and commRefList are lists of lists, with the same lengths.
+    # communityAssignmentRecord stores assignments after phase 1
+    # commRefList keeps the individual communities before assignment
+    communityAssignmentRecord = []
+    commRefList = []
+    changeModularity = 1
+    iterations = 0
+    while changeModularity > Q_threshold and iterations < max_iter:
+        graph, comm = LouvainPhase2(graph, comm)
+        commRefList.append(list(comm))
+        modularity_past = getModularity(graph, comm, resolution) # Note: we want to compare Q before and after reassignments in phase 1
+        comm = LouvainPhase1(graph, comm, Q_threshold, max_iter, resolution)
+        communityAssignmentRecord.append(list(comm))
+        modularity_curr = getModularity(graph, comm, resolution)
+        changeModularity = modularity_curr - modularity_past
+        iterations += 1
+        # print('changeModularity', changeModularity)
+
+    for i in range(len(communityAssignmentRecord) - 2, 0, -1):
+        changeDict = {}
+        for j in range(len(communityAssignmentRecord[i])):
+            newComm = communityAssignmentRecord[i][j]
+            oldComm = commRefList[i][j]
+            if newComm != oldComm:
+                changeDict[oldComm] = newComm # Change key to value
+
+        for key in changeDict:
+            oldIndices = [x for x in range(len(communityAssignmentRecord[i - 1])) if communityAssignmentRecord[i - 1][x] == key]
+            for j in oldIndices:
+                communityAssignmentRecord[i - 1][j] = changeDict[key]
+    return communityAssignmentRecord[0]
+
+
+def rbfKernel(r, epsilon = 1.0):
+    return np.exp(-(epsilon * r) ** 2)
+
+
+def findThreshold(affinityMatrix):
+    numFiles = affinityMatrix.shape[0]
+    # Get all values in affinityMatrix. Sort in descending order
+    vals = np.array([])
+    for i in range(numFiles - 1):
+        vals = np.append(vals, affinityMatrix[i, i + 1:])
+
+    vals = np.sort(vals, axis = None)[::-1]
+
+    # Initialize index values
+    upperIndex = 0
+    lowerIndex = len(vals) - 1
+
+    while upperIndex != lowerIndex - 1: # Until the indices are consecutive, iterate
+        midIndex = int(np.mean([lowerIndex, upperIndex]))
+        adjacencyMatrix = affinityMatrix > vals[midIndex]
+        # Tally nodes visited from the first node in the graph by BFS. If not all nodes were visited, there is more than one component.
+        nodesVisited = [0] + [i for i in range(numFiles) if adjacencyMatrix[0, i] == 1]
+        # nodesVisited = [i for i in range(numFiles) if adjacencyMatrix[0, i] == 1]
+        for i in nodesVisited:
+            newNodes = [j for j in range(numFiles) if adjacencyMatrix[i, j] == 1 and j not in nodesVisited]
+            nodesVisited += newNodes
+            while len(newNodes) > 0:
+                newNodes1list = []
+                for i in newNodes:
+                    newNodes1 = [j for j in range(numFiles) if adjacencyMatrix[i, j] == 1 and j not in nodesVisited]
+                    newNodes1list += newNodes1
+                    nodesVisited += newNodes1
+
+                newNodes = newNodes1list
+
+        # If every node was visited, then we have exactly one component. Otherwise, the graph is disconnected.
+        if len(nodesVisited) < numFiles:
+            upperIndex = midIndex
+        else:
+            lowerIndex = midIndex
+
+    threshold = vals[lowerIndex]
+
+    return threshold
+
+
+def AutoGraph(rmsdMatrix):
+    N = rmsdMatrix.shape[0]
+    affinityMatrix = rbfKernel(rmsdMatrix)
+    affinityMatrix[range(N), range(N)] = 0 # set the diagonal to zero
+    threshold = findThreshold(affinityMatrix)
+    adjacencyMatrix = affinityMatrix > threshold
+    filteredAffinityMatrix = affinityMatrix * adjacencyMatrix
+    communityAssignment = Louvain(filteredAffinityMatrix, Q_threshold=0.0, max_iter=50, resolution=1.0)
+    centroid_indices = centroid_autograph(N, communityAssignment, rmsdMatrix, threshold)
+    return communityAssignment, centroid_indices
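For orientation, a minimal usage sketch of the module above (not part of the package): AutoGraph expects a symmetric pairwise-RMSD matrix with a zero diagonal, and the random geometry below is a hypothetical stand-in for real conformer RMSDs.

import numpy as np
from rdworks.autograph.autograph import AutoGraph

# hypothetical input: pairwise distances among 20 random 3-D points
rng = np.random.default_rng(0)
xyz = rng.random((20, 3))
rmsd = np.linalg.norm(xyz[:, None, :] - xyz[None, :, :], axis=-1)

labels, centroids = AutoGraph(rmsd)
print(labels)     # community ID per conformer
print(centroids)  # representative conformer indices, largest cluster first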
rdworks/autograph/centroid.py
ADDED
@@ -0,0 +1,90 @@
+import pandas as pd
+import numpy as np
+
+
+def centroid_medoid(communityAssignment, rmsdMatrix) -> list:
+    """returns a list of centroids based on medoids
+
+    Medoids are representative objects of a data set or a cluster within a data set
+    whose sum of dissimilarities to all the objects in the cluster is minimal.
+
+    Args:
+        communityAssignment: (list) community assignment corresponding to each conformer index
+        rmsdMatrix: (numpy array) matrix containing pairwise atomic RMSD between all conformers
+
+    Returns:
+        a list of centroids
+    """
+    N = rmsdMatrix.shape[0]
+    community_indices = list(set(communityAssignment))
+    centroids = []
+    for C in community_indices:
+        C_members = [x for x in range(N) if communityAssignment[x] == C]
+        community_submatrix = rmsdMatrix[C_members, :][:, C_members]
+        dist_sum = np.sum(community_submatrix, axis = 1)
+        centroids.append(C_members[np.argmin(dist_sum)])
+    return centroids
+
+
+def diekstra(filtered_rmsd_matrix_community, i):
+    '''Use Dijkstra's algorithm to find the shortest path from index i to all other nodes'''
+    # initialize lists
+    visited = []
+    unvisited = [x for x in range(filtered_rmsd_matrix_community.shape[0])]
+    record = [np.inf for x in range(filtered_rmsd_matrix_community.shape[0])]
+    record[i] = 0
+    lastNode = [-1 for x in record]
+    # repeat until all nodes have been visited
+    while len(unvisited) > 0:
+        visit_index = unvisited[np.argmin([record[x] for x in unvisited])]
+        unvisited_neighbors = [x for x in unvisited if filtered_rmsd_matrix_community[visit_index, x] > 0]
+        # Calculate the distance to each unvisited neighbor. If the value is shorter than recorded, update the distance.
+        updateDist = filtered_rmsd_matrix_community[visit_index, :] + record[visit_index]
+        for j in unvisited_neighbors:
+            if updateDist[j] < record[j]:
+                record[j] = updateDist[j]
+                lastNode[j] = visit_index
+        # update visited/unvisited node lists
+        unvisited.remove(visit_index)
+        visited.append(visit_index)
+    return record, lastNode
+
+
+def centroid_betweenness(num, communityAssignment, filtered_rmsd_matrix):
+    # Provided a list of conformers assigned to communities, choose as the representative centroid the conformer of maximum in-community betweenness
+    # inputs
+    # num: (int) number of conformers
+    # communityAssignment: (list) community assignment corresponding to each conformer index
+    # filtered_rmsd_matrix: (np.array) RMSD matrix between conformers, with distances above the threshold set to zero
+    communityList = list(set(communityAssignment))
+    centralNodes = []
+    comm_size = []
+    for C in communityList:
+        C_members = [x for x in range(num) if communityAssignment[x] == C]
+        C_member_files = [x for x in C_members]
+        comm_size.append(len(C_members))
+        community_subgraph = filtered_rmsd_matrix[C_members, :][:, C_members]
+        community_betweenness = np.zeros(len(C_members))
+        for i in range(len(C_member_files)):
+            record, lastnode = diekstra(community_subgraph, i)
+            for j in range(len(C_member_files)):
+                previous_node = lastnode[j]
+                while previous_node != -1:
+                    community_betweenness[previous_node] += 1
+                    previous_node = lastnode[previous_node]
+        max_betweenness_index = np.argmax(community_betweenness)
+        centralNodes.append(C_member_files[max_betweenness_index])
+
+    # Sort centers by cluster size in descending order
+    centralDf = pd.DataFrame({'size': comm_size}, index = centralNodes)
+    centralDf.sort_values(by = 'size', ascending = False, inplace = True)
+
+    return list(centralDf.index)
+
+
+def centroid_autograph(N, communityAssignment, rmsdMatrix, threshold, centroid_selection='betweenness', filteredAffinityMatrix=None):
+    '''Return indices of conformers designated as centroids. If energy is provided, find the lowest energy conformer in each cluster. Otherwise choose by maximum in-cluster betweenness.'''
+    if centroid_selection == 'betweenness':
+        return centroid_betweenness(N, communityAssignment, rmsdMatrix * (rmsdMatrix < np.sqrt(-np.log(threshold)))) # filtered rmsd matrix: distances whose affinity falls below the threshold are zeroed
+    else:
+        print('centroid criterion not recognized. Use keywords "degree", "eccentricity", or "betweenness" for centroid_selection or provide an energy output to base the selection on')
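A small sanity check for centroid_medoid (a sketch, not part of the package): within a cluster, the medoid is the member whose summed RMSD to the other members is smallest.

import numpy as np
from rdworks.autograph.centroid import centroid_medoid

# hand-made symmetric distance matrix for 4 conformers in two clusters
rmsd = np.array([[0.0, 0.2, 0.3, 2.0],
                 [0.2, 0.0, 0.1, 2.1],
                 [0.3, 0.1, 0.0, 2.2],
                 [2.0, 2.1, 2.2, 0.0]])
assignment = [0, 0, 0, 1]

print(centroid_medoid(assignment, rmsd))  # [1, 3]: within cluster 0, index 1 has the smallest row sum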
rdworks/autograph/dynamictreecut.py
ADDED
@@ -0,0 +1,135 @@
+# 2020-05-22
+
+# Kiyoto Aramis Tanemura
+
+# The Ward algorithm for conformational clustering is used in building Markov State Models (DOI: 10.1021/acs.jctc.6b01238). To circumvent threshold selection, we apply the dynamic tree cut method in conjunction with the Ward dendrogram for automated conformational clustering (DOI: 10.1186/s13321-017-0208-0).
+
+import numpy as np
+
+from scipy.cluster.hierarchy import linkage, to_tree
+from scipy.spatial.distance import squareform
+
+from .centroid import centroid_medoid
+
+
+def get_ward_dendrogram(rmsd_mat):
+    '''Use Scipy functions to obtain a dendrogram using the Ward method.
+    Returns the dendrogram in the Scipy linkage output format'''
+    dend = linkage(squareform(rmsd_mat), method = 'ward', optimal_ordering = True)
+    return dend
+
+
+def goLeftmost(node, path, path_record):
+    '''provided a node, travel left until a leaf is reached'''
+    curr_node = node
+    if path in path_record:
+        return path
+    while not curr_node.is_leaf():
+        curr_node = curr_node.get_left()
+        path += 'l'
+    return path
+
+
+def travelDown(ref_node, rel_path):
+    '''reach a node below, provided a starting node and a path'''
+    curr_node = ref_node
+    for i in rel_path:
+        if i == 'l':
+            curr_node = curr_node.get_left()
+        elif i == 'r':
+            curr_node = curr_node.get_right()
+    return curr_node
+
+
+def getHeights(root):
+    '''return heights of nonleaf nodes and their corresponding paths'''
+    heights = []
+    path_recorded = []
+    path = '.'
+    curr_node = root
+    if root.is_leaf():
+        return [0], [path]
+    while 'l' in list(path) or not curr_node.is_leaf():
+        path = goLeftmost(curr_node, path, path_recorded)
+        path = path[:-1]
+        curr_node = travelDown(root, path)
+        if path not in path_recorded:
+            heights.append(curr_node.dist)
+            path_recorded.append(path)
+        path += 'r'
+        curr_node = travelDown(root, path)
+    return heights, path_recorded
+
+
+def treeCutCore(H, I, tau = 5):
+    '''Determine significant clusters provided one calibration value'''
+    H_hat = np.asarray(H) - I
+    trans_indices = [x for x in range(len(H) - 1) if H_hat[x] > 0 and H_hat[x + 1] < 0]
+    breakpoints = []
+    for i in trans_indices:
+        back_index = 1
+        while H_hat[i - back_index] > 0 and back_index <= i:
+            back_index += 1
+        breakpoints.append(i - back_index + 1)
+    # Find significant breakpoints
+    significant_breakpoints = [breakpoints[x] for x in range(len(breakpoints)) if trans_indices[x] - breakpoints[x] > tau]
+    return significant_breakpoints
+
+
+def adaptiveTreecutCore(H, tau = 5):
+    '''Perform treeCutCore at the mean height. If no significant breakpoints are detected,
+    continue the operation below and above the mean.'''
+    if len(H) == 0:
+        return []
+    lm = np.mean(H)
+    lu = np.mean([lm, np.max(H)])
+    ld = np.mean([lm, np.min(H)])
+    bps = treeCutCore(H, lm, tau)
+    if len(bps) == 0:
+        bps = treeCutCore(H, ld, tau)
+    if len(bps) == 0:
+        bps = treeCutCore(H, lu, tau)
+    return bps
+
+
+def getClusterNodeIndices(comprehensive_path_list, cluster_substring):
+    # Given a root path string, return the indices of all paths that start with it
+    return [x for x in range(len(comprehensive_path_list)) if comprehensive_path_list[x].startswith(cluster_substring)]
+
+
+# After looking at the Java implementation, I suspect the breakpoints, corresponding to the indices of heights (distance values of nonleaf nodes),
+# also correspond to the indices of leaves. Try clustering only by collecting all breakpoints, then using the indices to subset leaves.
+def dynamicTreeCut(tree, n, tau = 5):
+    allHeights, allPaths = getHeights(tree)
+    allHeights = [0] + allHeights + [0] # Sandwich the heights with zeros so that the ends can be included in or left out of major clusters.
+    breakpoints = [0, -1]
+    updateList = [-1]
+    while len(updateList) > 0:
+        updateList = []
+        for i in range(len(breakpoints) - 1):
+            Hi = allHeights[breakpoints[i]:breakpoints[i + 1]]
+            cutpoints = adaptiveTreecutCore(Hi, tau)
+            updateList += [x for x in cutpoints if x not in breakpoints]
+        breakpoints += updateList
+    return breakpoints
+
+
+def report_assingments(breakpoints, tree):
+    '''To standardize outputs with the other clustering algorithms, this function takes the breakpoints produced in dynamicTreeCut and returns a list of ints specifying the cluster assignments'''
+    leaves = tree.pre_order(lambda x: x.id)
+    n = len(leaves)
+    comm_assing = np.zeros(n, int)
+    for i in range(len(breakpoints) - 1):
+        members = leaves[breakpoints[i]:breakpoints[i + 1]]
+        comm_assing[members] = i
+    return comm_assing.tolist()
+
+
+def DynamicTreeCut(rmsdMatrix, tau = 5): # Ward clustering
+    N = rmsdMatrix.shape[0]
+    dend = get_ward_dendrogram(rmsdMatrix)
+    tree = to_tree(dend)
+    breakpoints = dynamicTreeCut(tree, N, tau)
+    communityAssignment = report_assingments(breakpoints, tree)
+    centroid_indices = centroid_medoid(communityAssignment, rmsdMatrix)
+    return communityAssignment, centroid_indices
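A usage sketch (an assumption, not package documentation): DynamicTreeCut passes the matrix through Scipy's squareform, so the input must be exactly symmetric with a zero diagonal.

import numpy as np
from rdworks.autograph.dynamictreecut import DynamicTreeCut

rng = np.random.default_rng(1)
xyz = rng.random((15, 3))
rmsd = np.linalg.norm(xyz[:, None, :] - xyz[None, :, :], axis=-1)  # symmetric, zero diagonal

labels, medoids = DynamicTreeCut(rmsd, tau=5)  # tau: minimum span between a breakpoint and its transition (see treeCutCore)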
rdworks/autograph/nmrclust.py
ADDED
@@ -0,0 +1,123 @@
+# 2020-04-29
+
+# Kiyoto Aramis Tanemura
+
+# NMRCLUST algorithm implemented in python using numpy. Original algorithm DOI: 10.1093/protein/9.11.1063
+# To use in your code, first compute rmsdMatrix
+# Then 'communityAssignment, centroid_indices = NMRCLUST(rmsdMatrix)'
+# The file list corresponding to the axes of the rmsdMatrix will be assigned to the clusters specified in communityAssignment
+
+import numpy as np
+
+from .centroid import centroid_medoid
+
+
+def averageLinkage(rmsdMatrix, clusterXmembers, clusterYmembers):
+    subgraph = rmsdMatrix[clusterXmembers, :][:, clusterYmembers]
+    return np.mean(subgraph)
+
+
+def spread(rmsdMatrix, members):
+    subgraph = rmsdMatrix[members, :][:, members]
+    N = len(members)
+    offDiagonalSum = np.sum(subgraph) / 2 # note: diagonals are zero
+    return offDiagonalSum / (N * (N - 1) / 2)
+
+
+def memberIndices(communityID, communityAssignment):
+    indices = [x for x in range(len(communityAssignment)) if communityAssignment[x] == communityID]
+    return indices
+
+
+def averageSpread(rmsdMatrix, communityAssignment):
+    communities = list(set(communityAssignment))
+    spreads = [spread(rmsdMatrix, memberIndices(C, communityAssignment)) for C in communities]
+    return np.mean(spreads)
+
+
+def mergeClusters(rmsdMatrix, communityAssignment, aveLinkArray = None):
+    communities = list(set(communityAssignment))
+    numCommunities = len(communities)
+    if aveLinkArray is None:
+        aveLinkArray = np.zeros([numCommunities, numCommunities])
+        for i in range(numCommunities - 1):
+            for j in range(i + 1, numCommunities):
+                i_members = memberIndices(communities[i], communityAssignment)
+                j_members = memberIndices(communities[j], communityAssignment)
+                aveLinkVal = averageLinkage(rmsdMatrix, i_members, j_members)
+                aveLinkArray[i, j] = aveLinkVal
+                aveLinkArray[j, i] = aveLinkVal
+
+    # Populate the diagonal (self similarity) with the max average linkage value to remove it from consideration.
+    aveLinkArray[range(numCommunities), range(numCommunities)] = np.max(aveLinkArray) + 0.01
+    minVal = np.min(aveLinkArray)
+    # Find the position of the minimum average linkage. Keep only the first index
+    i, j = np.where(aveLinkArray == minVal)
+    i = i[0]
+    j = j[0]
+    # Merge communities as recorded on communityAssignment
+    C = communities[i]
+    G = communities[j]
+    i_members = memberIndices(communities[i], communityAssignment)
+    j_members = memberIndices(communities[j], communityAssignment)
+    for mem in j_members:
+        communityAssignment[mem] = C
+
+    for k in range(numCommunities):
+        if communities[k] in [C, G]:
+            continue
+        k_members = memberIndices(communities[k], communityAssignment)
+        aveLinkVal = averageLinkage(rmsdMatrix, i_members + j_members, k_members)
+        aveLinkArray[i, k] = aveLinkVal
+        aveLinkArray[k, i] = aveLinkVal
+
+    aveLinkArray = np.delete(aveLinkArray, j, 0)
+    aveLinkArray = np.delete(aveLinkArray, j, 1)
+
+    return communityAssignment, aveLinkArray
+
+
+def normalizeAvSpVal(AvSpVal, AvSpMax, AvSpMin, N):
+    return (N - 1) / (AvSpMax - AvSpMin) * (AvSpVal - AvSpMin) + 1
+
+
+def normalizeAvSp(AvSpList, N):
+    AvSpMax = np.max(AvSpList)
+    AvSpMin = np.min(AvSpList)
+    if AvSpMax == AvSpMin:
+        return [1 for x in AvSpList]
+    return [normalizeAvSpVal(x, AvSpMax, AvSpMin, N) for x in AvSpList]
+
+
+def NMRCLUST(rmsdMatrix):
+    N = rmsdMatrix.shape[0]
+    AvSpList = []
+    assignList = []
+    aveLinkArray = None
+    communityAssignment = list(range(N))
+    singletonPresent = True # spread cannot be calculated if the size of a cluster is 1. Avoid spread calculation until each cluster has at least 2 members
+    while singletonPresent:
+        communityAssignment, aveLinkArray = mergeClusters(rmsdMatrix, communityAssignment, aveLinkArray)
+        communities = list(set(communityAssignment))
+        commcount = [len([x for x in communityAssignment if x == C]) for C in communities]
+        if 1 not in commcount:
+            singletonPresent = False
+        elif len(communities) == 2:
+            singletonPresent = False
+
+    # Begin recording the average spread and community assignments once singletons are absent
+    # Continue recording until all are merged into one cluster
+    while len(set(communityAssignment)) > 1:
+        communityAssignment, aveLinkArray = mergeClusters(rmsdMatrix, communityAssignment, aveLinkArray)
+        AvSpList.append(float(averageSpread(rmsdMatrix, communityAssignment)))
+        assignList.append(list(communityAssignment))
+
+    nClustList = [len(set(x)) for x in assignList] # number of clusters at each recorded step, decreasing by one at each merge down to a single cluster
+    AvSpNormList = normalizeAvSp(AvSpList, N)
+    penaltyVals = [AvSpNormList[x] + nClustList[x] for x in range(len(AvSpNormList))]
+    minPenalty = np.min(penaltyVals)
+    minPenaltyIndex = penaltyVals.index(minPenalty)
+    communityAssignment = assignList[minPenaltyIndex]
+    centroid_indices = centroid_medoid(communityAssignment, rmsdMatrix)
+
+    return communityAssignment, centroid_indices
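Following the usage note in the module header, a minimal sketch with synthetic distances standing in for a real conformer RMSD matrix:

import numpy as np
from rdworks.autograph.nmrclust import NMRCLUST

rng = np.random.default_rng(2)
xyz = rng.random((12, 3))
rmsd = np.linalg.norm(xyz[:, None, :] - xyz[None, :, :], axis=-1)

communityAssignment, centroid_indices = NMRCLUST(rmsd)  # penalty-selected clustering plus per-cluster medoids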
rdworks/autograph/rckmeans.py
ADDED
@@ -0,0 +1,74 @@
+# 2020-05-20
+
+# Kiyoto Aramis Tanemura
+
+# We consider the use of representative conformation K-means to benchmark against AutoGraph as a method which does not require specification of the number of clusters or a threshold.
+# Original algorithm found at DOI: 10.1186/s13321-017-0208-0
+
+import numpy as np
+from random import sample
+from math import factorial
+
+def kmedoid(rmsdMatrix, k = 2):
+    # Generic k-medoid function with an rmsd matrix as input
+    n = rmsdMatrix.shape[0]
+    medoids = sample(range(n), k)
+    prev_medoids = []
+    classification = np.zeros(n, dtype = int).tolist()
+    while medoids != prev_medoids:
+        prev_medoids = list(medoids) # copy, so that updating medoids below does not also update prev_medoids
+        for i in range(n):
+            min_index = np.argmin(rmsdMatrix[i, :][medoids])
+            classification[i] = min_index
+        for j in range(k):
+            members = [x for x in range(n) if classification[x] == j]
+            sub_rmsd = rmsdMatrix[members, :][:, members]
+            center_index = np.argmin(np.sum(sub_rmsd, axis = 0))
+            medoids[j] = members[center_index] # map the position within the cluster back to the conformer index
+
+    return classification, medoids
+
+
+def comb(n, r):
+    if n < r:
+        return 1
+    return factorial(n) / (factorial(r) * factorial(n - r))
+
+
+def MSQb(rmsdMatrix, medoids):
+    sub_rmsd = rmsdMatrix[medoids, :][:, medoids]
+    return np.sum(sub_rmsd) / (2 * comb(len(medoids), 2))
+
+
+def MSQw(rmsdMatrix, classification):
+    tally = 0
+    n = len(classification)
+    for i in set(classification):
+        members = [x for x in range(n) if classification[x] == i]
+        sub_rmsd = rmsdMatrix[members, :][:, members]
+        tally += np.sum(sub_rmsd) / (2 * comb(len(members), 2))
+    return tally / len(set(classification))
+
+
+def SMA(MSQb_list, W = 10):
+    if len(MSQb_list) >= W:
+        return np.mean(MSQb_list[-W:])
+    return -1
+
+
+def RCKmeans(rmsdMatrix):
+    m = rmsdMatrix.shape[0]
+    K_MSQb = [0, 0]
+    prevSMA = -1
+    for k in range(2, m):
+        MSQw_list = np.zeros(100, dtype = int).tolist()
+        MSQb_list = np.zeros(100, dtype = int).tolist()
+        for i in range(100):
+            classification, medoids = kmedoid(rmsdMatrix, k)
+            MSQw_list[i] = MSQw(rmsdMatrix, classification)
+            MSQb_list[i] = MSQb(rmsdMatrix, medoids)
+        K_MSQb.append(MSQb_list[np.argmin(MSQw_list)])
+        currSMA = SMA(K_MSQb, 10)
+        if currSMA < prevSMA:
+            return kmedoid(rmsdMatrix, np.argmax(K_MSQb))
+        prevSMA = currSMA
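A usage sketch under the same assumptions as the examples above. Note that RCKmeans falls through without a return value if the moving average of MSQb never drops, so the result should be checked:

import numpy as np
from rdworks.autograph.rckmeans import RCKmeans

rng = np.random.default_rng(3)
xyz = rng.random((20, 3))
rmsd = np.linalg.norm(xyz[:, None, :] - xyz[None, :, :], axis=-1)

result = RCKmeans(rmsd)
if result is not None:
    classification, medoids = result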
rdworks/bitqt/__init__.py
ADDED
@@ -0,0 +1 @@
+from .bitqt import BitQT, get_cluster_stats