rdworks-0.25.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. rdworks/__init__.py +35 -0
  2. rdworks/autograph/__init__.py +4 -0
  3. rdworks/autograph/autograph.py +184 -0
  4. rdworks/autograph/centroid.py +90 -0
  5. rdworks/autograph/dynamictreecut.py +135 -0
  6. rdworks/autograph/nmrclust.py +123 -0
  7. rdworks/autograph/rckmeans.py +74 -0
  8. rdworks/bitqt/__init__.py +1 -0
  9. rdworks/bitqt/bitqt.py +355 -0
  10. rdworks/conf.py +374 -0
  11. rdworks/descriptor.py +36 -0
  12. rdworks/display.py +206 -0
  13. rdworks/ionized.py +170 -0
  14. rdworks/matchedseries.py +260 -0
  15. rdworks/mol.py +1522 -0
  16. rdworks/mollibr.py +887 -0
  17. rdworks/pka.py +38 -0
  18. rdworks/predefined/Asinex_fragment.xml +20 -0
  19. rdworks/predefined/Astex_RO3.xml +16 -0
  20. rdworks/predefined/Baell2010_PAINS/Baell2010A.xml +52 -0
  21. rdworks/predefined/Baell2010_PAINS/Baell2010B.xml +169 -0
  22. rdworks/predefined/Baell2010_PAINS/Baell2010C.xml +1231 -0
  23. rdworks/predefined/Baell2010_PAINS/PAINS-less-than-015-hits.xml +2048 -0
  24. rdworks/predefined/Baell2010_PAINS/PAINS-less-than-150-hits.xml +278 -0
  25. rdworks/predefined/Baell2010_PAINS/PAINS-more-than-150-hits.xml +83 -0
  26. rdworks/predefined/Baell2010_PAINS/makexml.py +70 -0
  27. rdworks/predefined/Brenk2008_Dundee/makexml.py +21 -0
  28. rdworks/predefined/CNS.xml +18 -0
  29. rdworks/predefined/ChEMBL_Walters/BMS.xml +543 -0
  30. rdworks/predefined/ChEMBL_Walters/Dundee.xml +318 -0
  31. rdworks/predefined/ChEMBL_Walters/Glaxo.xml +168 -0
  32. rdworks/predefined/ChEMBL_Walters/Inpharmatica.xml +276 -0
  33. rdworks/predefined/ChEMBL_Walters/LINT.xml +174 -0
  34. rdworks/predefined/ChEMBL_Walters/MLSMR.xml +351 -0
  35. rdworks/predefined/ChEMBL_Walters/PAINS.xml +1446 -0
  36. rdworks/predefined/ChEMBL_Walters/SureChEMBL.xml +501 -0
  37. rdworks/predefined/ChEMBL_Walters/makexml.py +40 -0
  38. rdworks/predefined/Hann1999_Glaxo/Hann1999.xml +168 -0
  39. rdworks/predefined/Hann1999_Glaxo/Hann1999Acid.xml +102 -0
  40. rdworks/predefined/Hann1999_Glaxo/Hann1999Base.xml +6 -0
  41. rdworks/predefined/Hann1999_Glaxo/Hann1999ElPh.xml +6 -0
  42. rdworks/predefined/Hann1999_Glaxo/Hann1999NuPh.xml +6 -0
  43. rdworks/predefined/Hann1999_Glaxo/makexml.py +83 -0
  44. rdworks/predefined/Kazius2005/Kazius2005.xml +114 -0
  45. rdworks/predefined/Kazius2005/makexml.py +66 -0
  46. rdworks/predefined/ZINC_druglike.xml +24 -0
  47. rdworks/predefined/ZINC_fragment.xml +14 -0
  48. rdworks/predefined/ZINC_leadlike.xml +15 -0
  49. rdworks/predefined/fragment.xml +7 -0
  50. rdworks/predefined/ionized/simple_smarts_pattern.csv +57 -0
  51. rdworks/predefined/ionized/smarts_pattern.csv +107 -0
  52. rdworks/predefined/misc/makexml.py +119 -0
  53. rdworks/predefined/misc/reactive-part-2.xml +104 -0
  54. rdworks/predefined/misc/reactive-part-3.xml +74 -0
  55. rdworks/predefined/misc/reactive.xml +321 -0
  56. rdworks/readin.py +312 -0
  57. rdworks/rgroup.py +2173 -0
  58. rdworks/scaffold.py +520 -0
  59. rdworks/std.py +143 -0
  60. rdworks/stereoisomers.py +127 -0
  61. rdworks/tautomers.py +20 -0
  62. rdworks/units.py +63 -0
  63. rdworks/utils.py +495 -0
  64. rdworks/xml.py +260 -0
  65. rdworks-0.25.7.dist-info/METADATA +37 -0
  66. rdworks-0.25.7.dist-info/RECORD +69 -0
  67. rdworks-0.25.7.dist-info/WHEEL +5 -0
  68. rdworks-0.25.7.dist-info/licenses/LICENSE +21 -0
  69. rdworks-0.25.7.dist-info/top_level.txt +1 -0
rdworks/__init__.py ADDED
@@ -0,0 +1,35 @@
+ __version__ = '0.25.7'
+
+ from rdworks.xml import list_predefined_xml, get_predefined_xml, parse_xml
+ from rdworks.units import ev2kcalpermol, hartree2ev, hartree2kcalpermol, periodictable
+ from rdworks.readin import read_csv, merge_csv, read_dataframe, read_smi, read_sdf, read_mae
+ from rdworks.std import desalt_smiles, standardize_smiles, standardize
+ from rdworks.tautomers import complete_tautomers
+ from rdworks.stereoisomers import complete_stereoisomers
+ from rdworks.ionized import IonizedStates
+ from rdworks.rgroup import expand_rgroup, most_common, most_common_in_NP
+ from rdworks.scaffold import scaffold_network, scaffold_tree, BRICS_fragmented, BRICS_fragment_indices
+ from rdworks.matchedseries import MatchedSeries
+ from rdworks.descriptor import rd_descriptor, rd_descriptor_f
+ from rdworks.utils import fix_decimal_places_in_list, fix_decimal_places_in_dict, mae_to_dict, mae_rd_index
+ from rdworks.display import svg
+ from rdworks.conf import Conf
+ from rdworks.mol import Mol
+ from rdworks.mollibr import MolLibr
+
+ from rdkit import rdBase, RDLogger
+ RDLogger.logger().setLevel(RDLogger.CRITICAL) # setLevel() returns None, so its result is not bound to a name
+
+ import logging
+
+ main_logger = logging.getLogger()
+ main_logger.setLevel(logging.INFO) # level: DEBUG < INFO < WARNING < ERROR < CRITICAL
+ logger_formatter = logging.Formatter(
+     fmt='%(asctime)s %(levelname)s %(message)s',
+     datefmt='%Y-%m-%d %H:%M:%S')
+ logger_ch = logging.StreamHandler()
+ logger_ch.setFormatter(logger_formatter)
+ main_logger.addHandler(logger_ch)
+
+
+ __rdkit_version__ = rdBase.rdkitVersion
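Because rdworks/__init__.py configures the root logger at import time, any script that imports the package inherits the timestamped log format above. A minimal sketch of the effect (the message and timestamp are illustrative, not from the diff):

    import logging
    import rdworks  # importing attaches the handler configured above to the root logger

    logging.getLogger(__name__).info('loaded rdworks %s', rdworks.__version__)
    # prints something like: 2025-01-01 12:00:00 INFO loaded rdworks 0.25.7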
rdworks/autograph/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .nmrclust import NMRCLUST
+ from .rckmeans import RCKmeans
+ from .dynamictreecut import DynamicTreeCut
+ from .autograph import AutoGraph
rdworks/autograph/autograph.py ADDED
@@ -0,0 +1,184 @@
+ # 2020-04-17
+
+ # Kiyoto Aramis Tanemura
+
+ # The code for the Louvain algorithm was getting lengthy, so I dedicate a module to it.
+
+ import numpy as np
+
+ from .centroid import centroid_autograph
+
+ def getModularity(affinityMatrix, communityAssignments, resolution = 1.0):
+     # Provided an affinity matrix (np.array) and a list assigning each node to a community,
+     # return the modularity of the whole graph
+     arrayDim = affinityMatrix.shape[0]
+     communities = list(set(communityAssignments))
+     Q = 0
+     affinityMatrix[range(arrayDim), range(arrayDim)] += affinityMatrix[range(arrayDim), range(arrayDim)] # double the diagonal: self-loops count twice
+     two_m = np.sum(affinityMatrix)
+     for C in communities:
+         communityIndices = [i for i in range(len(communityAssignments)) if communityAssignments[i] == C]
+         sigma_in = np.sum(affinityMatrix[communityIndices, :][:, communityIndices])
+         sigma_tot = np.sum(affinityMatrix[communityIndices, :])
+         Q += sigma_in / two_m - resolution * (sigma_tot / two_m) ** 2
+     return Q
+
+ def LouvainPhase1(affinityMatrix, communityAssignment, Q_threshold, max_iter, resolution):
+     # inputs: affinityMatrix as a numpy array; clear the diagonal and filter edges with weights below threshold before entry
+     # communityAssignment: list of length num_nodes; values are community IDs
+     # Q_threshold: minimum modularity gain to continue phase 1
+     # max_iter: number of iterations after which to force termination of phase 1
+     # output: updated communityAssignment list
+
+     num_nodes = affinityMatrix.shape[0]
+
+     affinityMatrix[range(num_nodes), range(num_nodes)] += affinityMatrix[range(num_nodes), range(num_nodes)]
+
+     two_m = np.sum(affinityMatrix)
+
+     modularity_current = getModularity(affinityMatrix, communityAssignment, resolution)
+     changeModularity = 1
+
+     adjacencyMatrix = affinityMatrix > 0
+     iterations = 0
+
+     while changeModularity > Q_threshold and iterations < max_iter:
+         modularity_prev = modularity_current
+         for i in range(num_nodes): # for each node
+             neighborIndices = [j for j in range(num_nodes) if adjacencyMatrix[i, j] == 1]
+             communities = set([communityAssignment[k] for k in neighborIndices])
+             communities.discard(communityAssignment[i])
+             communities = list(communities)
+             modularityList = []
+             # ki: sum of weights of edges incident to node i
+             ki = np.sum(affinityMatrix[i, :])
+             for C in communities:
+                 C_members = [x for x in range(num_nodes) if communityAssignment[x] == C]
+                 # sigma_tot: sum of weights of all edges incident to nodes in C
+                 sigma_tot = np.sum(affinityMatrix[C_members, :])
+                 # ki_in: sum of weights of edges between node i and nodes in C
+                 ki_in = np.sum(affinityMatrix[i, :][C_members])
+                 deltaQ = ki_in / two_m - resolution * sigma_tot * ki / (two_m ** 2 / 2) # simplified formula; derivation: https://hal.archives-ouvertes.fr/hal-01231784/document
+                 modularityList.append(deltaQ)
+             modularityList.append(0) # append zero to avoid an error on an empty list
+             maxQgain = np.max(modularityList)
+             if maxQgain > 0:
+                 communityToJoin = communities[modularityList.index(maxQgain)]
+                 communityAssignment[i] = communityToJoin
+         modularity_current = getModularity(affinityMatrix, communityAssignment, resolution)
+         changeModularity = modularity_current - modularity_prev
+         iterations += 1
+
+     return communityAssignment
+
+ def LouvainPhase2(affinityMatrix, communityAssignment):
+     # Merge each community into a supernode
+     # input: affinityMatrix or an already condensed graph
+     # output: condensed graph and the list of communities corresponding to its axes
+     num_nodes = affinityMatrix.shape[0]
+     communities = list(set(communityAssignment))
+     num_communities = len(communities)
+     phase2graph = np.zeros([num_communities, num_communities])
+     for i in range(num_communities):
+         i_members = [x for x in range(num_nodes) if communityAssignment[x] == communities[i]]
+         for j in range(num_communities):
+             j_members = [x for x in range(num_nodes) if communityAssignment[x] == communities[j]]
+             phase2graph[i, j] = np.sum(affinityMatrix[i_members, :][:, j_members]) * (1/2) ** (i == j)
+             # within a community, edges are double counted, so if i == j, divide by two
+
+     return phase2graph, communities
+
+
+ def Louvain(affinityMatrix, Q_threshold = 0.001, max_iter = 50, resolution = 1.0):
+     # perform the two phases of Louvain community detection iteratively
+     graph = affinityMatrix
+     comm = list(range(affinityMatrix.shape[0]))
+     # communityAssignmentRecord and commRefList are lists of lists of the same length:
+     # communityAssignmentRecord stores the assignments after phase 1;
+     # commRefList keeps the individual communities before reassignment
+     communityAssignmentRecord = []
+     commRefList = []
+     changeModularity = 1
+     iterations = 0
+     while changeModularity > Q_threshold and iterations < max_iter:
+         graph, comm = LouvainPhase2(graph, comm)
+         commRefList.append(list(comm))
+         modularity_past = getModularity(graph, comm, resolution) # note: we want to compare Q before and after the reassignments in phase 1
+         comm = LouvainPhase1(graph, comm, Q_threshold, max_iter, resolution)
+         communityAssignmentRecord.append(list(comm))
+         modularity_curr = getModularity(graph, comm, resolution)
+         changeModularity = modularity_curr - modularity_past
+         iterations += 1
+         # print('changeModularity', changeModularity)
+
+     # propagate community reassignments from coarser levels back down to the original nodes
+     for i in range(len(communityAssignmentRecord) - 2, 0, -1):
+         changeDict = {}
+         for j in range(len(communityAssignmentRecord[i])):
+             newComm = communityAssignmentRecord[i][j]
+             oldComm = commRefList[i][j]
+             if newComm != oldComm:
+                 changeDict[oldComm] = newComm # map the old community ID to the new one
+
+         for key in changeDict:
+             oldIndices = [x for x in range(len(communityAssignmentRecord[i - 1])) if communityAssignmentRecord[i - 1][x] == key]
+             for j in oldIndices:
+                 communityAssignmentRecord[i - 1][j] = changeDict[key]
+     return communityAssignmentRecord[0]
+
+
+ def rbfKernel(r, epsilon = 1.0):
+     return np.exp(-(epsilon * r) ** 2)
+
+
+ def findThreshold(affinityMatrix):
+     numFiles = affinityMatrix.shape[0]
+     # Collect all upper-triangle values of affinityMatrix and sort them in descending order
+     vals = np.array([])
+     for i in range(numFiles - 1):
+         vals = np.append(vals, affinityMatrix[i, i + 1:])
+
+     vals = np.sort(vals, axis = None)[::-1]
+
+     # Initialize index values for the bisection search
+     upperIndex = 0
+     lowerIndex = len(vals) - 1
+
+     while upperIndex != lowerIndex - 1: # iterate until the indices are consecutive
+         midIndex = int(np.mean([lowerIndex, upperIndex]))
+         adjacencyMatrix = affinityMatrix > vals[midIndex]
+         # Tally nodes visited from the first node by BFS. If not all nodes were visited, there is more than one component.
+         nodesVisited = [0] + [i for i in range(numFiles) if adjacencyMatrix[0, i] == 1]
+         # nodesVisited = [i for i in range(numFiles) if adjacencyMatrix[0, i] == 1]
+         for i in nodesVisited:
+             newNodes = [j for j in range(numFiles) if adjacencyMatrix[i, j] == 1 and j not in nodesVisited]
+             nodesVisited += newNodes
+             while len(newNodes) > 0:
+                 newNodes1list = []
+                 for i in newNodes:
+                     newNodes1 = [j for j in range(numFiles) if adjacencyMatrix[i, j] == 1 and j not in nodesVisited]
+                     newNodes1list += newNodes1
+                     nodesVisited += newNodes1
+
+                 newNodes = newNodes1list
+
+         # If every node was visited, there is exactly one component. Otherwise the graph is disconnected.
+         if len(nodesVisited) < numFiles:
+             upperIndex = midIndex
+         else:
+             lowerIndex = midIndex
+
+     threshold = vals[lowerIndex]
+
+     return threshold
+
+
+ def AutoGraph(rmsdMatrix):
+     N = rmsdMatrix.shape[0]
+     affinityMatrix = rbfKernel(rmsdMatrix)
+     affinityMatrix[range(N), range(N)] = 0 # set the diagonal to zero
+     threshold = findThreshold(affinityMatrix)
+     adjacencyMatrix = affinityMatrix > threshold
+     filteredAffinityMatrix = affinityMatrix * adjacencyMatrix
+     communityAssignment = Louvain(filteredAffinityMatrix, Q_threshold=0.0, max_iter=50, resolution=1.0)
+     centroid_indices = centroid_autograph(N, communityAssignment, rmsdMatrix, threshold)
+     return communityAssignment, centroid_indices
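A minimal usage sketch for AutoGraph (not part of the diff): it assumes only a symmetric pairwise RMSD matrix with a zero diagonal, mocked here from random 3D points:

    import numpy as np
    from rdworks.autograph import AutoGraph

    rng = np.random.default_rng(7)
    pts = rng.random((20, 3))  # 20 mock conformers as points in 3D
    rmsd = np.linalg.norm(pts[:, None, :] - pts[None, :, :], axis=-1)  # symmetric, zero diagonal
    labels, centroids = AutoGraph(rmsd)
    print(len(set(labels)), 'clusters; centroid indices:', centroids)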
rdworks/autograph/centroid.py ADDED
@@ -0,0 +1,90 @@
+ import pandas as pd
+ import numpy as np
+
+
+ def centroid_medoid(communityAssignment, rmsdMatrix) -> list:
+     """Returns a list of centroids based on medoids.
+
+     Medoids are representative objects of a data set or a cluster within a data set
+     whose sum of dissimilarities to all the objects in the cluster is minimal.
+
+     Args:
+         communityAssignment: (list) community assignments corresponding to the indices of the RMSD matrix
+         rmsdMatrix: (numpy array) matrix of pairwise atomic RMSD between all conformers
+
+     Returns:
+         a list of centroid indices
+     """
+     N = rmsdMatrix.shape[0]
+     community_indices = list(set(communityAssignment))
+     centroids = []
+     for C in community_indices:
+         C_members = [x for x in range(N) if communityAssignment[x] == C]
+         community_submatrix = rmsdMatrix[C_members, :][:, C_members]
+         dist_sum = np.sum(community_submatrix, axis = 1)
+         centroids.append(C_members[np.argmin(dist_sum)])
+     return centroids
+
+
+ def diekstra(filtered_rmsd_matrix_community, i):
+     '''Use Dijkstra's algorithm to find the shortest path from index i to all other nodes'''
+     # initialize lists
+     visited = []
+     unvisited = [x for x in range(filtered_rmsd_matrix_community.shape[0])]
+     record = [np.inf for x in range(filtered_rmsd_matrix_community.shape[0])]
+     record[i] = 0
+     lastNode = [-1 for x in record]
+     # repeat until all nodes have been visited
+     while len(unvisited) > 0:
+         visit_index = unvisited[np.argmin([record[x] for x in unvisited])]
+         unvisited_neighbors = [x for x in unvisited if filtered_rmsd_matrix_community[visit_index, x] > 0]
+         # Calculate the distance to each unvisited neighbor. If it is shorter than the recorded distance, update the record.
+         updateDist = filtered_rmsd_matrix_community[visit_index, :] + record[visit_index]
+         for j in unvisited_neighbors:
+             if updateDist[j] < record[j]: # test before updating, so the predecessor is recorded along with the distance
+                 record[j] = updateDist[j]
+                 lastNode[j] = visit_index
+         # update the visited/unvisited node lists
+         unvisited.remove(visit_index)
+         visited.append(visit_index)
+     return record, lastNode
+
+
+ def centroid_betweenness(num, communityAssignment, filtered_rmsd_matrix):
+     # Provided a list of conformers assigned to communities, choose as representative centroid the conformer with maximum in-community betweenness
+     # inputs
+     # num: (int) number of conformers
+     # communityAssignment: (list) community assignments corresponding to the conformer indices
+     # filtered_rmsd_matrix: (np.array) RMSD matrix between conformers, with distances above the threshold set to zero
+     communityList = list(set(communityAssignment))
+     centralNodes = []
+     comm_size = []
+     for C in communityList:
+         C_members = [x for x in range(num) if communityAssignment[x] == C]
+         C_member_files = [x for x in C_members]
+         comm_size.append(len(C_members))
+         community_subgraph = filtered_rmsd_matrix[C_members, :][:, C_members]
+         community_betweenness = np.zeros(len(C_members))
+         for i in range(len(C_member_files)):
+             record, lastnode = diekstra(community_subgraph, i)
+             for j in range(len(C_member_files)):
+                 previous_node = lastnode[j]
+                 while previous_node != -1:
+                     community_betweenness[previous_node] += 1
+                     previous_node = lastnode[previous_node]
+         max_betweenness_index = np.argmax(community_betweenness)
+         centralNodes.append(C_member_files[max_betweenness_index])
+
+     # Sort the centers by cluster size in descending order
+     centralDf = pd.DataFrame({'size': comm_size}, index = centralNodes)
+     centralDf.sort_values(by = 'size', ascending = False, inplace = True)
+
+     return list(centralDf.index)
+
+
+ def centroid_autograph(N, communityAssignment, rmsdMatrix, threshold, centroid_selection='betweenness', filteredAffinityMatrix=None):
+     '''Return indices of conformers designated as centroids, chosen by maximum in-cluster betweenness'''
+     if centroid_selection == 'betweenness':
+         # affinity = exp(-r^2) > threshold is equivalent to r < sqrt(-log(threshold)); the comparison builds a mask that zeroes distances above the threshold
+         return centroid_betweenness(N, communityAssignment, rmsdMatrix * (rmsdMatrix < np.sqrt(-np.log(threshold))))
+     else:
+         print('centroid criterion not recognized. Use the keyword "betweenness" for centroid_selection.')
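For reference, the medoid of a cluster is simply the member minimizing the summed distance to the other members. A hand-checkable sketch (values invented, not from the diff):

    import numpy as np
    from rdworks.autograph.centroid import centroid_medoid

    rmsd = np.array([[0.0, 0.2, 2.0],
                     [0.2, 0.0, 2.1],
                     [2.0, 2.1, 0.0]])
    # conformers 0 and 1 form one cluster, conformer 2 its own
    print(centroid_medoid([0, 0, 1], rmsd))  # -> [0, 2]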
rdworks/autograph/dynamictreecut.py ADDED
@@ -0,0 +1,135 @@
+ # 2020-05-22
+
+ # Kiyoto Aramis Tanemura
+
+ # The Ward algorithm for conformational clustering is used in building Markov state models (DOI: 10.1021/acs.jctc.6b01238). To circumvent threshold selection, we apply the dynamic tree cut method in conjunction with the Ward dendrogram for automated conformational clustering (DOI: 10.1186/s13321-017-0208-0).
+
+ import numpy as np
+
+ from scipy.cluster.hierarchy import linkage, to_tree
+ from scipy.spatial.distance import squareform
+
+ from .centroid import centroid_medoid
+
+
+ def get_ward_dendrogram(rmsd_mat):
+     '''Use SciPy functions to obtain a dendrogram using the Ward method.
+     Returns the dendrogram in the SciPy linkage output format.'''
+     dend = linkage(squareform(rmsd_mat), method = 'ward', optimal_ordering = True)
+     return dend
+
+
+ def goLeftmost(node, path, path_record):
+     '''Provided a node, travel left until a leaf is reached'''
+     curr_node = node
+     if path in path_record:
+         return path
+     while not curr_node.is_leaf():
+         curr_node = curr_node.get_left()
+         path += 'l'
+     return path
+
+
+ def travelDown(ref_node, rel_path):
+     '''Reach a node below, provided a starting node and a path'''
+     curr_node = ref_node
+     for i in rel_path:
+         if i == 'l':
+             curr_node = curr_node.get_left()
+         elif i == 'r':
+             curr_node = curr_node.get_right()
+     return curr_node
+
+
+ def getHeights(root):
+     '''Return the heights of non-leaf nodes and their corresponding paths'''
+     heights = []
+     path_recorded = []
+     path = '.'
+     curr_node = root
+     if root.is_leaf():
+         return [0], [path]
+     while 'l' in list(path) or not curr_node.is_leaf():
+         path = goLeftmost(curr_node, path, path_recorded)
+         path = path[:-1]
+         curr_node = travelDown(root, path)
+         if path not in path_recorded:
+             heights.append(curr_node.dist)
+             path_recorded.append(path)
+         path += 'r'
+         curr_node = travelDown(root, path)
+     return heights, path_recorded
+
+
+ def treeCutCore(H, I, tau = 5):
+     '''Determine significant clusters provided one calibration height I'''
+     H_hat = np.array(H) - I # cast to an array so the subtraction broadcasts over a plain list
+     trans_indices = [x for x in range(len(H) - 1) if H_hat[x] > 0 and H_hat[x + 1] < 0]
+     breakpoints = []
+     for i in trans_indices:
+         back_index = 1
+         while H_hat[i - back_index] > 0 and back_index <= i:
+             back_index += 1
+         breakpoints.append(i - back_index + 1)
+     # Keep only the significant breakpoints
+     significant_breakpoints = [breakpoints[x] for x in range(len(breakpoints)) if trans_indices[x] - breakpoints[x] > tau]
+     return significant_breakpoints
+
+
+ def adaptiveTreecutCore(H, tau = 5):
+     '''Perform treeCutCore at the mean height. If no significant breakpoints are detected,
+     repeat the operation below and above the mean.'''
+     if len(H) == 0:
+         return []
+     lm = np.mean(H)
+     lu = np.mean([lm, np.max(H)])
+     ld = np.mean([lm, np.min(H)])
+     bps = treeCutCore(H, lm, tau)
+     if len(bps) == 0:
+         bps = treeCutCore(H, ld, tau)
+     if len(bps) == 0:
+         bps = treeCutCore(H, lu, tau)
+     return bps
+
+
+ def getClusterNodeIndices(comprehensive_path_list, cluster_substring):
+     # Given a root path string, return the indices of all paths that extend it
+     return [x for x in range(len(comprehensive_path_list)) if comprehensive_path_list[x].startswith(cluster_substring)]
+
+
+ # After looking at the Java implementation, I suspect the breakpoints, which correspond to the indices of heights (distance values of non-leaf nodes),
+ # also correspond to the indices of leaves. Cluster by collecting all breakpoints, then use the indices to subset the leaves.
+ def dynamicTreeCut(tree, n, tau = 5):
+     allHeights, allPaths = getHeights(tree)
+     allHeights = [0] + allHeights + [0] # Sandwich the heights with zeros so that the ends can be included in or left out of major clusters.
+     breakpoints = [0, -1]
+     updateList = [-1]
+     while len(updateList) > 0:
+         updateList = []
+         for i in range(len(breakpoints) - 1):
+             Hi = allHeights[breakpoints[i]:breakpoints[i + 1]]
+             cutpoints = adaptiveTreecutCore(Hi, tau)
+             updateList += [x for x in cutpoints if x not in breakpoints]
+         breakpoints += updateList
+     return breakpoints
+
+
+ def report_assingments(breakpoints, tree):
+     '''To standardize outputs with the other clustering algorithms, take the breakpoints produced by dynamicTreeCut and return a list of ints specifying the cluster assignment'''
+     leaves = tree.pre_order(lambda x: x.id)
+     n = len(leaves)
+     comm_assing = np.zeros(n, int)
+     for i in range(len(breakpoints) - 1):
+         members = leaves[breakpoints[i]:breakpoints[i + 1]]
+         comm_assing[members] = i
+     return comm_assing.tolist()
+
+
+ def DynamicTreeCut(rmsdMatrix, tau = 5): # Ward clustering
+     N = rmsdMatrix.shape[0]
+     dend = get_ward_dendrogram(rmsdMatrix)
+     tree = to_tree(dend)
+     breakpoints = dynamicTreeCut(tree, N, tau)
+     communityAssignment = report_assingments(breakpoints, tree)
+     centroid_indices = centroid_medoid(communityAssignment, rmsdMatrix)
+     return communityAssignment, centroid_indices
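A usage sketch for DynamicTreeCut under the same assumptions (symmetric RMSD matrix, zero diagonal, mocked from random points); on unstructured random data the cut may collapse everything into a single cluster:

    import numpy as np
    from rdworks.autograph import DynamicTreeCut

    rng = np.random.default_rng(1)
    pts = rng.random((30, 3))
    rmsd = np.linalg.norm(pts[:, None, :] - pts[None, :, :], axis=-1)
    labels, centroids = DynamicTreeCut(rmsd, tau=5)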
rdworks/autograph/nmrclust.py ADDED
@@ -0,0 +1,123 @@
+ # 2020-04-29
+
+ # Kiyoto Aramis Tanemura
+
+ # NMRCLUST algorithm implemented in Python using numpy. Original algorithm DOI: 10.1093/protein/9.11.1063
+ # To use in your code, first compute an rmsdMatrix,
+ # then call 'communityAssignment, centroid_indices = NMRCLUST(rmsdMatrix)'.
+ # The conformers corresponding to the axes of the rmsdMatrix are assigned to the clusters specified in communityAssignment.
+
+ import numpy as np
+
+ from .centroid import centroid_medoid
+
+
+ def averageLinkage(rmsdMatrix, clusterXmembers, clusterYmembers):
+     subgraph = rmsdMatrix[clusterXmembers, :][:, clusterYmembers]
+     return np.mean(subgraph)
+
+
+ def spread(rmsdMatrix, members):
+     subgraph = rmsdMatrix[members, :][:, members]
+     N = len(members)
+     offDiagonalSum = np.sum(subgraph) / 2 # note: the diagonal is zero
+     return offDiagonalSum / (N * (N - 1) / 2)
+
+
+ def memberIndices(communityID, communityAssignment):
+     indices = [x for x in range(len(communityAssignment)) if communityAssignment[x] == communityID]
+     return indices
+
+
+ def averageSpread(rmsdMatrix, communityAssignment):
+     communities = list(set(communityAssignment))
+     spreads = [spread(rmsdMatrix, memberIndices(C, communityAssignment)) for C in communities]
+     return np.mean(spreads)
+
+
+ def mergeClusters(rmsdMatrix, communityAssignment, aveLinkArray = None):
+     communities = list(set(communityAssignment))
+     numCommunities = len(communities)
+     if aveLinkArray is None: # build the full average-linkage matrix only on the first call
+         aveLinkArray = np.zeros([numCommunities, numCommunities])
+         for i in range(numCommunities - 1):
+             for j in range(i + 1, numCommunities):
+                 i_members = memberIndices(communities[i], communityAssignment)
+                 j_members = memberIndices(communities[j], communityAssignment)
+                 aveLinkVal = averageLinkage(rmsdMatrix, i_members, j_members)
+                 aveLinkArray[i, j] = aveLinkVal
+                 aveLinkArray[j, i] = aveLinkVal
+
+     # Populate the diagonal (self similarity) with a value above the max average linkage to remove it from consideration.
+     aveLinkArray[range(numCommunities), range(numCommunities)] = np.max(aveLinkArray) + 0.01
+     minVal = np.min(aveLinkArray)
+     # Find the indices of the minimum average linkage. Keep only the first pair.
+     i, j = np.where(aveLinkArray == minVal)
+     i = i[0]
+     j = j[0]
+     # Merge the two communities, as recorded in communityAssignment
+     C = communities[i]
+     G = communities[j]
+     i_members = memberIndices(communities[i], communityAssignment)
+     j_members = memberIndices(communities[j], communityAssignment)
+     for mem in j_members:
+         communityAssignment[mem] = C
+
+     for k in range(numCommunities):
+         if communities[k] in [C, G]:
+             continue
+         k_members = memberIndices(communities[k], communityAssignment)
+         aveLinkVal = averageLinkage(rmsdMatrix, i_members + j_members, k_members)
+         aveLinkArray[i, k] = aveLinkVal
+         aveLinkArray[k, i] = aveLinkVal
+
+     aveLinkArray = np.delete(aveLinkArray, j, 0)
+     aveLinkArray = np.delete(aveLinkArray, j, 1)
+
+     return communityAssignment, aveLinkArray
+
+
+ def normalizeAvSpVal(AvSpVal, AvSpMax, AvSpMin, N):
+     return (N - 1) / (AvSpMax - AvSpMin) * (AvSpVal - AvSpMin) + 1
+
+
+ def normalizeAvSp(AvSpList, N):
+     AvSpMax = np.max(AvSpList)
+     AvSpMin = np.min(AvSpList)
+     if AvSpMax == AvSpMin:
+         return [1 for x in AvSpList]
+     return [normalizeAvSpVal(x, AvSpMax, AvSpMin, N) for x in AvSpList]
+
+
+ def NMRCLUST(rmsdMatrix):
+     N = rmsdMatrix.shape[0]
+     AvSpList = []
+     assignList = []
+     aveLinkArray = None
+     communityAssignment = list(range(N))
+     singletonPresent = True # spread cannot be calculated for a cluster of size 1; avoid spread calculations until every cluster has at least 2 members
+     while singletonPresent:
+         communityAssignment, aveLinkArray = mergeClusters(rmsdMatrix, communityAssignment, aveLinkArray)
+         communities = list(set(communityAssignment))
+         commcount = [len([x for x in communityAssignment if x == C]) for C in communities]
+         if 1 not in commcount:
+             singletonPresent = False
+         elif len(communities) == 2:
+             singletonPresent = False
+
+     # Begin recording average spread and community assignments once singletons are absent.
+     # Continue recording until all conformers are merged into one cluster.
+     while len(set(communityAssignment)) > 1:
+         communityAssignment, aveLinkArray = mergeClusters(rmsdMatrix, communityAssignment, aveLinkArray)
+         AvSpList.append(float(averageSpread(rmsdMatrix, communityAssignment)))
+         assignList.append(list(communityAssignment))
+
+     nClustList = [len(set(x)) for x in assignList] # number of clusters at each recorded step, decreasing by one per merge
+     AvSpNormList = normalizeAvSp(AvSpList, N)
+     penaltyVals = [AvSpNormList[x] + nClustList[x] for x in range(len(AvSpNormList))]
+     minPenalty = np.min(penaltyVals)
+     minPenaltyIndex = penaltyVals.index(minPenalty)
+     communityAssignment = assignList[minPenaltyIndex]
+     centroid_indices = centroid_medoid(communityAssignment, rmsdMatrix)
+
+     return communityAssignment, centroid_indices
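NMRCLUST needs no user-set threshold: every recorded merge step is scored by penalty = normalized average spread + number of clusters, and the assignment minimizing this penalty is returned. A usage sketch (same mock-RMSD convention as above, not from the diff):

    import numpy as np
    from rdworks.autograph import NMRCLUST

    rng = np.random.default_rng(3)
    pts = rng.random((25, 3))
    rmsd = np.linalg.norm(pts[:, None, :] - pts[None, :, :], axis=-1)
    labels, centroids = NMRCLUST(rmsd)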
rdworks/autograph/rckmeans.py ADDED
@@ -0,0 +1,74 @@
+ # 2020-05-20
+
+ # Kiyoto Aramis Tanemura
+
+ # We consider representative conformation K-means (RCKmeans) as a benchmark against AutoGraph: a method that does not require specifying the number of clusters or a threshold.
+ # Original algorithm found at DOI: 10.1186/s13321-017-0208-0
+
+ import numpy as np
+ from random import sample
+ from math import factorial
+
+ def kmedoid(rmsdMatrix, k = 2):
+     # Generic k-medoid function with an RMSD matrix as input
+     n = rmsdMatrix.shape[0]
+     medoids = sample(range(n), k)
+     prev_medoids = []
+     classification = np.zeros(n, dtype = int).tolist()
+     while medoids != prev_medoids:
+         prev_medoids = list(medoids) # copy, so that in-place updates of medoids are detected
+         for i in range(n):
+             min_index = int(np.argmin(rmsdMatrix[i, :][medoids]))
+             classification[i] = min_index
+         for j in range(k):
+             members = [x for x in range(n) if classification[x] == j]
+             sub_rmsd = rmsdMatrix[members, :][:, members]
+             center_index = int(np.argmin(np.sum(sub_rmsd, axis = 0)))
+             medoids[j] = members[center_index] # map the within-cluster index back to a global conformer index
+
+     return classification, medoids
+
+
+ def comb(n, r):
+     if n < r:
+         return 1
+     return factorial(n) / (factorial(r) * factorial(n - r))
+
+
+ def MSQb(rmsdMatrix, medoids):
+     # mean squared distance between medoids
+     sub_rmsd = rmsdMatrix[medoids, :][:, medoids]
+     return np.sum(sub_rmsd) / (2 * comb(len(medoids), 2))
+
+
+ def MSQw(rmsdMatrix, classification):
+     # mean squared distance within clusters
+     tally = 0
+     n = len(classification)
+     for i in set(classification):
+         members = [x for x in range(n) if classification[x] == i]
+         sub_rmsd = rmsdMatrix[members, :][:, members]
+         tally += np.sum(sub_rmsd) / (2 * comb(len(members), 2))
+     return tally / len(set(classification))
+
+
+ def SMA(MSQb_list, W = 10):
+     # simple moving average over the last W entries
+     if len(MSQb_list) >= W:
+         return np.mean(MSQb_list[-W:])
+     return -1
+
+
+ def RCKmeans(rmsdMatrix):
+     m = rmsdMatrix.shape[0]
+     K_MSQb = [0, 0] # pad indices 0 and 1 so that the list index corresponds to k
+     prevSMA = -1
+     for k in range(2, m):
+         MSQw_list = np.zeros(100, dtype = int).tolist()
+         MSQb_list = np.zeros(100, dtype = int).tolist()
+         for i in range(100):
+             classification, medoids = kmedoid(rmsdMatrix, k)
+             MSQw_list[i] = MSQw(rmsdMatrix, classification)
+             MSQb_list[i] = MSQb(rmsdMatrix, medoids)
+         K_MSQb.append(MSQb_list[np.argmin(MSQw_list)])
+         currSMA = SMA(K_MSQb, 10)
+         if currSMA < prevSMA:
+             return kmedoid(rmsdMatrix, int(np.argmax(K_MSQb)))
+         prevSMA = currSMA
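RCKmeans runs the k-medoid step 100 times per candidate k and stops when a moving average of MSQb drops, so it is stochastic and markedly slower than the other methods. A sketch under the same assumptions; note that the function returns None if the moving average never decreases before k reaches the number of conformers:

    import numpy as np
    from rdworks.autograph import RCKmeans

    rng = np.random.default_rng(5)
    pts = rng.random((15, 3))
    rmsd = np.linalg.norm(pts[:, None, :] - pts[None, :, :], axis=-1)
    result = RCKmeans(rmsd)
    if result is not None:
        labels, medoids = result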
rdworks/bitqt/__init__.py ADDED
@@ -0,0 +1 @@
+ from .bitqt import BitQT, get_cluster_stats