rdworks-0.25.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. rdworks-0.25.7/LICENSE +21 -0
  2. rdworks-0.25.7/PKG-INFO +37 -0
  3. rdworks-0.25.7/README.md +2 -0
  4. rdworks-0.25.7/pyproject.toml +62 -0
  5. rdworks-0.25.7/setup.cfg +4 -0
  6. rdworks-0.25.7/src/rdworks/__init__.py +35 -0
  7. rdworks-0.25.7/src/rdworks/autograph/__init__.py +4 -0
  8. rdworks-0.25.7/src/rdworks/autograph/autograph.py +184 -0
  9. rdworks-0.25.7/src/rdworks/autograph/centroid.py +90 -0
  10. rdworks-0.25.7/src/rdworks/autograph/dynamictreecut.py +135 -0
  11. rdworks-0.25.7/src/rdworks/autograph/nmrclust.py +123 -0
  12. rdworks-0.25.7/src/rdworks/autograph/rckmeans.py +74 -0
  13. rdworks-0.25.7/src/rdworks/bitqt/__init__.py +1 -0
  14. rdworks-0.25.7/src/rdworks/bitqt/bitqt.py +355 -0
  15. rdworks-0.25.7/src/rdworks/conf.py +374 -0
  16. rdworks-0.25.7/src/rdworks/descriptor.py +36 -0
  17. rdworks-0.25.7/src/rdworks/display.py +206 -0
  18. rdworks-0.25.7/src/rdworks/ionized.py +170 -0
  19. rdworks-0.25.7/src/rdworks/matchedseries.py +260 -0
  20. rdworks-0.25.7/src/rdworks/mol.py +1522 -0
  21. rdworks-0.25.7/src/rdworks/mollibr.py +887 -0
  22. rdworks-0.25.7/src/rdworks/pka.py +38 -0
  23. rdworks-0.25.7/src/rdworks/predefined/Asinex_fragment.xml +20 -0
  24. rdworks-0.25.7/src/rdworks/predefined/Astex_RO3.xml +16 -0
  25. rdworks-0.25.7/src/rdworks/predefined/Baell2010_PAINS/Baell2010A.xml +52 -0
  26. rdworks-0.25.7/src/rdworks/predefined/Baell2010_PAINS/Baell2010B.xml +169 -0
  27. rdworks-0.25.7/src/rdworks/predefined/Baell2010_PAINS/Baell2010C.xml +1231 -0
  28. rdworks-0.25.7/src/rdworks/predefined/Baell2010_PAINS/PAINS-less-than-015-hits.xml +2048 -0
  29. rdworks-0.25.7/src/rdworks/predefined/Baell2010_PAINS/PAINS-less-than-150-hits.xml +278 -0
  30. rdworks-0.25.7/src/rdworks/predefined/Baell2010_PAINS/PAINS-more-than-150-hits.xml +83 -0
  31. rdworks-0.25.7/src/rdworks/predefined/Baell2010_PAINS/makexml.py +70 -0
  32. rdworks-0.25.7/src/rdworks/predefined/Brenk2008_Dundee/makexml.py +21 -0
  33. rdworks-0.25.7/src/rdworks/predefined/CNS.xml +18 -0
  34. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/BMS.xml +543 -0
  35. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/Dundee.xml +318 -0
  36. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/Glaxo.xml +168 -0
  37. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/Inpharmatica.xml +276 -0
  38. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/LINT.xml +174 -0
  39. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/MLSMR.xml +351 -0
  40. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/PAINS.xml +1446 -0
  41. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/SureChEMBL.xml +501 -0
  42. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/makexml.py +40 -0
  43. rdworks-0.25.7/src/rdworks/predefined/Hann1999_Glaxo/Hann1999.xml +168 -0
  44. rdworks-0.25.7/src/rdworks/predefined/Hann1999_Glaxo/Hann1999Acid.xml +102 -0
  45. rdworks-0.25.7/src/rdworks/predefined/Hann1999_Glaxo/Hann1999Base.xml +6 -0
  46. rdworks-0.25.7/src/rdworks/predefined/Hann1999_Glaxo/Hann1999ElPh.xml +6 -0
  47. rdworks-0.25.7/src/rdworks/predefined/Hann1999_Glaxo/Hann1999NuPh.xml +6 -0
  48. rdworks-0.25.7/src/rdworks/predefined/Hann1999_Glaxo/makexml.py +83 -0
  49. rdworks-0.25.7/src/rdworks/predefined/Kazius2005/Kazius2005.xml +114 -0
  50. rdworks-0.25.7/src/rdworks/predefined/Kazius2005/makexml.py +66 -0
  51. rdworks-0.25.7/src/rdworks/predefined/ZINC_druglike.xml +24 -0
  52. rdworks-0.25.7/src/rdworks/predefined/ZINC_fragment.xml +14 -0
  53. rdworks-0.25.7/src/rdworks/predefined/ZINC_leadlike.xml +15 -0
  54. rdworks-0.25.7/src/rdworks/predefined/fragment.xml +7 -0
  55. rdworks-0.25.7/src/rdworks/predefined/ionized/simple_smarts_pattern.csv +57 -0
  56. rdworks-0.25.7/src/rdworks/predefined/ionized/smarts_pattern.csv +107 -0
  57. rdworks-0.25.7/src/rdworks/predefined/misc/makexml.py +119 -0
  58. rdworks-0.25.7/src/rdworks/predefined/misc/reactive-part-2.xml +104 -0
  59. rdworks-0.25.7/src/rdworks/predefined/misc/reactive-part-3.xml +74 -0
  60. rdworks-0.25.7/src/rdworks/predefined/misc/reactive.xml +321 -0
  61. rdworks-0.25.7/src/rdworks/readin.py +312 -0
  62. rdworks-0.25.7/src/rdworks/rgroup.py +2173 -0
  63. rdworks-0.25.7/src/rdworks/scaffold.py +520 -0
  64. rdworks-0.25.7/src/rdworks/std.py +143 -0
  65. rdworks-0.25.7/src/rdworks/stereoisomers.py +127 -0
  66. rdworks-0.25.7/src/rdworks/tautomers.py +20 -0
  67. rdworks-0.25.7/src/rdworks/units.py +63 -0
  68. rdworks-0.25.7/src/rdworks/utils.py +495 -0
  69. rdworks-0.25.7/src/rdworks/xml.py +260 -0
  70. rdworks-0.25.7/src/rdworks.egg-info/PKG-INFO +37 -0
  71. rdworks-0.25.7/src/rdworks.egg-info/SOURCES.txt +76 -0
  72. rdworks-0.25.7/src/rdworks.egg-info/dependency_links.txt +1 -0
  73. rdworks-0.25.7/src/rdworks.egg-info/requires.txt +13 -0
  74. rdworks-0.25.7/src/rdworks.egg-info/top_level.txt +1 -0
  75. rdworks-0.25.7/tests/test_basics.py +506 -0
  76. rdworks-0.25.7/tests/test_nn_xtb.py +91 -0
  77. rdworks-0.25.7/tests/test_states.py +17 -0
  78. rdworks-0.25.7/tests/test_web.py +380 -0
rdworks-0.25.7/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024-2025 Sung-Hun Bae
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
rdworks-0.25.7/PKG-INFO ADDED
@@ -0,0 +1,37 @@
+ Metadata-Version: 2.4
+ Name: rdworks
+ Version: 0.25.7
+ Summary: Cheminformatics and AI/ML Work Built on RDKit
+ Author-email: Sung-Hun Bae <sunghun.bae@gmail.com>
+ Maintainer-email: Sung-Hun Bae <sunghun.bae@gmail.com>
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/sunghunbae/rdworks
+ Project-URL: Repository, https://github.com/sunghunbae/rdworks.git
+ Project-URL: Issues, https://github.com/sunghunbae/rdworks/issues
+ Project-URL: Changelog, https://github.com/sunghunbae/rdworks/blob/master/CHANGELOG.md
+ Project-URL: Documentation, https://sunghunbae.github.io/rdworks/
+ Keywords: neural-network-potential,cheminformatics,rdkit
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: numpy
+ Requires-Dist: scipy
+ Requires-Dist: scikit-learn
+ Requires-Dist: pandas
+ Requires-Dist: seaborn
+ Requires-Dist: networkx
+ Requires-Dist: tqdm
+ Requires-Dist: psutil
+ Requires-Dist: ase
+ Requires-Dist: rdkit>=2023
+ Requires-Dist: bitarray
+ Requires-Dist: cdpkit
+ Requires-Dist: pytest
+ Dynamic: license-file
+
+ # rdworks
+ Higher level wrapper using RDKit
rdworks-0.25.7/README.md ADDED
@@ -0,0 +1,2 @@
+ # rdworks
+ Higher level wrapper using RDKit
rdworks-0.25.7/pyproject.toml ADDED
@@ -0,0 +1,62 @@
+ [build-system]
+ requires = ["setuptools>=68"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "rdworks"
+ dynamic = ["version"]
+ requires-python = ">=3.11"
+ dependencies = [
+     "numpy",
+     "scipy",
+     "scikit-learn",
+     "pandas",
+     "seaborn",
+     "networkx",
+     "tqdm",
+     "psutil",
+     "ase",
+     "rdkit>=2023",
+     "bitarray",
+     "cdpkit",
+     "pytest",
+ ]
+ authors = [{name = "Sung-Hun Bae", email = "sunghun.bae@gmail.com"}]
+ maintainers = [{name = "Sung-Hun Bae", email = "sunghun.bae@gmail.com"}]
+ description = "Cheminformatics and AI/ML Work Built on RDKit"
+ readme = "README.md"
+ license = "MIT"
+ keywords = [
+     "neural-network-potential",
+     "cheminformatics",
+     "rdkit",
+ ]
+ classifiers = [
+     "Development Status :: 3 - Alpha",
+     "Intended Audience :: Developers",
+     "Programming Language :: Python :: 3",
+     "Operating System :: OS Independent",
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/sunghunbae/rdworks"
+ Repository = "https://github.com/sunghunbae/rdworks.git"
+ Issues = "https://github.com/sunghunbae/rdworks/issues"
+ Changelog = "https://github.com/sunghunbae/rdworks/blob/master/CHANGELOG.md"
+ Documentation = "https://sunghunbae.github.io/rdworks/"
+
+ [tool.setuptools.dynamic]
+ version = {attr = "rdworks.__version__"}
+
+ [tool.setuptools.packages.find]
+ where = ["src"]
+
+ [tool.setuptools.package-data]
+ "rdworks.auto3d.models" = ["*"]
+ "rdworks.predefined" = ["*.xml"]
+ "rdworks.predefined.Baell2010_PAINS" = ["*.xml"]
+ "rdworks.predefined.ChEMBL_Walters" = ["*.xml"]
+ "rdworks.predefined.Hann1999_Glaxo" = ["*.xml"]
+ "rdworks.predefined.Kazius2005" = ["*.xml"]
+ "rdworks.predefined.misc" = ["*.xml"]
+ "rdworks.predefined.ionized" = ["*.csv"]
rdworks-0.25.7/setup.cfg ADDED
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
rdworks-0.25.7/src/rdworks/__init__.py ADDED
@@ -0,0 +1,35 @@
+ __version__ = '0.25.7'
+
+ from rdworks.xml import list_predefined_xml, get_predefined_xml, parse_xml
+ from rdworks.units import ev2kcalpermol, hartree2ev, hartree2kcalpermol, periodictable
+ from rdworks.readin import read_csv, merge_csv, read_dataframe, read_smi, read_sdf, read_mae
+ from rdworks.std import desalt_smiles, standardize_smiles, standardize
+ from rdworks.tautomers import complete_tautomers
+ from rdworks.stereoisomers import complete_stereoisomers
+ from rdworks.ionized import IonizedStates
+ from rdworks.rgroup import expand_rgroup, most_common, most_common_in_NP
+ from rdworks.scaffold import scaffold_network, scaffold_tree, BRICS_fragmented, BRICS_fragment_indices
+ from rdworks.matchedseries import MatchedSeries
+ from rdworks.descriptor import rd_descriptor, rd_descriptor_f
+ from rdworks.utils import fix_decimal_places_in_list, fix_decimal_places_in_dict, mae_to_dict, mae_rd_index
+ from rdworks.display import svg
+ from rdworks.conf import Conf
+ from rdworks.mol import Mol
+ from rdworks.mollibr import MolLibr
+
+ from rdkit import rdBase, RDLogger
+ RDLogger.logger().setLevel(RDLogger.CRITICAL)  # suppress RDKit log messages below CRITICAL
+
+ import logging
+
+ main_logger = logging.getLogger()
+ main_logger.setLevel(logging.INFO)  # level: DEBUG < INFO < WARNING < ERROR < CRITICAL
+ logger_formatter = logging.Formatter(
+     fmt='%(asctime)s %(levelname)s %(message)s',
+     datefmt='%Y-%m-%d %H:%M:%S')
+ logger_ch = logging.StreamHandler()
+ logger_ch.setFormatter(logger_formatter)
+ main_logger.addHandler(logger_ch)
+
+
+ __rdkit_version__ = rdBase.rdkitVersion
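
The imports above define the package's public surface: callers can pull everything from rdworks directly. As a usage sketch only — the Mol and MolLibr constructor signatures below are assumptions, since src/rdworks/mol.py and src/rdworks/mollibr.py are not reproduced in this section:

from rdworks import Mol, MolLibr

# Hypothetical usage: assumes Mol can be built from a SMILES string with an
# optional name, and that MolLibr wraps a list of Mol objects. Check
# src/rdworks/mol.py and src/rdworks/mollibr.py for the actual signatures.
mol = Mol('CC(=O)Oc1ccccc1C(=O)O', name='aspirin')
libr = MolLibr([mol])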
rdworks-0.25.7/src/rdworks/autograph/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .nmrclust import NMRCLUST
+ from .rckmeans import RCKmeans
+ from .dynamictreecut import DynamicTreeCut
+ from .autograph import AutoGraph
rdworks-0.25.7/src/rdworks/autograph/autograph.py ADDED
@@ -0,0 +1,184 @@
+ # 2020-04-17
+ # Kiyoto Aramis Tanemura
+ # The code for the Louvain algorithm was getting lengthy, so it gets its own module.
+
+ import numpy as np
+
+ from .centroid import centroid_autograph
+
+ def getModularity(affinityMatrix, communityAssignments, resolution = 1.0):
+     # Provided an affinity matrix (np.array) and a list assigning each node to a community,
+     # return the modularity of the whole graph.
+     arrayDim = affinityMatrix.shape[0]
+     communities = list(set(communityAssignments))
+     Q = 0
+     # Self-loop weights (diagonal) are counted twice, following the Louvain convention.
+     affinityMatrix[range(arrayDim), range(arrayDim)] += affinityMatrix[range(arrayDim), range(arrayDim)]
+     two_m = np.sum(affinityMatrix)
+     for C in communities:
+         communityIndices = [i for i in range(len(communityAssignments)) if communityAssignments[i] == C]
+         sigma_in = np.sum(affinityMatrix[communityIndices, :][:, communityIndices])
+         sigma_tot = np.sum(affinityMatrix[communityIndices, :])
+         Q += sigma_in / two_m - resolution * (sigma_tot / two_m) ** 2
+     return Q
+
+ def LouvainPhase1(affinityMatrix, communityAssignment, Q_threshold, max_iter, resolution):
+     # inputs: affinityMatrix as numpy array; clear the diagonal and filter edges with weights below threshold before entering
+     #     communityAssignment: list of length num_nodes; values are community IDs
+     #     Q_threshold: minimum gain in global modularity required to continue phase 1
+     #     max_iter: maximum number of iterations before phase 1 is forced to terminate
+     # output: updated communityAssignment list
+
+     num_nodes = affinityMatrix.shape[0]
+     affinityMatrix[range(num_nodes), range(num_nodes)] += affinityMatrix[range(num_nodes), range(num_nodes)]
+     two_m = np.sum(affinityMatrix)
+     modularity_current = getModularity(affinityMatrix, communityAssignment, resolution)
+     changeModularity = 1
+     adjacencyMatrix = affinityMatrix > 0
+     iterations = 0
+
+     while changeModularity > Q_threshold and iterations < max_iter:
+         modularity_prev = modularity_current
+         for i in range(num_nodes): # for each node
+             neighborIndices = [j for j in range(num_nodes) if adjacencyMatrix[i, j] == 1]
+             communities = set([communityAssignment[k] for k in neighborIndices])
+             communities.discard(communityAssignment[i])
+             communities = list(communities)
+             modularityList = []
+             # ki: sum of weights of edges incident to node i
+             ki = np.sum(affinityMatrix[i, :])
+             for C in communities:
+                 C_members = [x for x in range(num_nodes) if communityAssignment[x] == C]
+                 # sigma_tot: sum of weights of all edges incident to nodes in C
+                 sigma_tot = np.sum(affinityMatrix[C_members, :])
+                 # ki_in: sum of weights of edges connecting node i to nodes in C
+                 ki_in = np.sum(affinityMatrix[i, :][C_members])
+                 # simplified formula; derivation in https://hal.archives-ouvertes.fr/hal-01231784/document
+                 deltaQ = ki_in / two_m - resolution * sigma_tot * ki / (two_m ** 2 / 2)
+                 modularityList.append(deltaQ)
+             modularityList.append(0) # append zero to avoid an error on an empty list
+             maxQgain = np.max(modularityList)
+             if maxQgain > 0:
+                 communityToJoin = communities[modularityList.index(maxQgain)]
+                 communityAssignment[i] = communityToJoin
+         modularity_current = getModularity(affinityMatrix, communityAssignment, resolution)
+         changeModularity = modularity_current - modularity_prev
+         iterations += 1
+
+     return communityAssignment
+
+ def LouvainPhase2(affinityMatrix, communityAssignment):
+     # Merge the communities into supernodes.
+     # input: affinityMatrix or condensed graph
+     # output: condensed graph and a list of communities corresponding to the axes of the graph
+     num_nodes = affinityMatrix.shape[0]
+     communities = list(set(communityAssignment))
+     num_communities = len(communities)
+     phase2graph = np.zeros([num_communities, num_communities])
+     for i in range(num_communities):
+         i_members = [x for x in range(num_nodes) if communityAssignment[x] == communities[i]]
+         for j in range(num_communities):
+             j_members = [x for x in range(num_nodes) if communityAssignment[x] == communities[j]]
+             # if i == j, intra-community edges are double counted, so divide by two
+             phase2graph[i, j] = np.sum(affinityMatrix[i_members, :][:, j_members]) * (1/2) ** (i == j)
+     return phase2graph, communities
+
+
+ def Louvain(affinityMatrix, Q_threshold = 0.001, max_iter = 50, resolution = 1.0):
+     # perform the two phases of Louvain community detection iteratively
+     graph = affinityMatrix
+     comm = list(range(affinityMatrix.shape[0]))
+     # communityAssignmentRecord and commRefList are lists of lists of the same length:
+     # communityAssignmentRecord stores the assignment after phase 1;
+     # commRefList keeps the individual communities before reassignment.
+     communityAssignmentRecord = []
+     commRefList = []
+     changeModularity = 1
+     iterations = 0
+     while changeModularity > Q_threshold and iterations < max_iter:
+         graph, comm = LouvainPhase2(graph, comm)
+         commRefList.append(list(comm))
+         modularity_past = getModularity(graph, comm, resolution) # compare Q before and after the phase 1 reassignments
+         comm = LouvainPhase1(graph, comm, Q_threshold, max_iter, resolution)
+         communityAssignmentRecord.append(list(comm))
+         modularity_curr = getModularity(graph, comm, resolution)
+         changeModularity = modularity_curr - modularity_past
+         iterations += 1
+
+     # Propagate the final community assignments back down to the original nodes.
+     for i in range(len(communityAssignmentRecord) - 2, 0, -1):
+         changeDict = {}
+         for j in range(len(communityAssignmentRecord[i])):
+             newComm = communityAssignmentRecord[i][j]
+             oldComm = commRefList[i][j]
+             if newComm != oldComm:
+                 changeDict[oldComm] = newComm # map old community ID to new community ID
+         for key in changeDict:
+             oldIndices = [x for x in range(len(communityAssignmentRecord[i - 1])) if communityAssignmentRecord[i - 1][x] == key]
+             for j in oldIndices:
+                 communityAssignmentRecord[i - 1][j] = changeDict[key]
+     return communityAssignmentRecord[0]
+
+
+ def rbfKernel(r, epsilon = 1.0):
+     return np.exp(-(epsilon * r) ** 2)
+
+
+ def findThreshold(affinityMatrix):
+     numFiles = affinityMatrix.shape[0]
+     # Collect the upper-triangle values of affinityMatrix and sort in descending order.
+     vals = np.array([])
+     for i in range(numFiles - 1):
+         vals = np.append(vals, affinityMatrix[i, i + 1:])
+     vals = np.sort(vals, axis = None)[::-1]
+
+     # Initialize index values for a binary search over candidate thresholds.
+     upperIndex = 0
+     lowerIndex = len(vals) - 1
+
+     while upperIndex != lowerIndex - 1: # iterate until the indices are consecutive
+         midIndex = int(np.mean([lowerIndex, upperIndex]))
+         adjacencyMatrix = affinityMatrix > vals[midIndex]
+         # Tally the nodes reachable from the first node by BFS. If not every node is
+         # visited, the graph has more than one component.
+         nodesVisited = [0] + [i for i in range(numFiles) if adjacencyMatrix[0, i] == 1]
+         for i in nodesVisited:
+             newNodes = [j for j in range(numFiles) if adjacencyMatrix[i, j] == 1 and j not in nodesVisited]
+             nodesVisited += newNodes
+             while len(newNodes) > 0:
+                 newNodes1list = []
+                 for i in newNodes:
+                     newNodes1 = [j for j in range(numFiles) if adjacencyMatrix[i, j] == 1 and j not in nodesVisited]
+                     newNodes1list += newNodes1
+                     nodesVisited += newNodes1
+                 newNodes = newNodes1list
+
+         # If every node was visited, there is exactly one component; otherwise the graph is disconnected.
+         if len(nodesVisited) < numFiles:
+             upperIndex = midIndex
+         else:
+             lowerIndex = midIndex
+
+     threshold = vals[lowerIndex]
+     return threshold
+
+
+ def AutoGraph(rmsdMatrix):
+     N = rmsdMatrix.shape[0]
+     affinityMatrix = rbfKernel(rmsdMatrix)
+     affinityMatrix[range(N), range(N)] = 0 # zero the diagonal
+     threshold = findThreshold(affinityMatrix)
+     adjacencyMatrix = affinityMatrix > threshold
+     filteredAffinityMatrix = affinityMatrix * adjacencyMatrix
+     communityAssignment = Louvain(filteredAffinityMatrix, Q_threshold=0.0, max_iter=50, resolution=1.0)
+     centroid_indices = centroid_autograph(N, communityAssignment, rmsdMatrix, threshold)
+     return communityAssignment, centroid_indices
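
AutoGraph needs only a symmetric pairwise RMSD matrix: it converts distances to an RBF affinity graph (a_ij = exp(-r_ij^2)), picks the sparsest edge threshold that keeps the graph connected, and clusters with Louvain. A minimal sketch with a synthetic distance matrix (the random data is illustrative only, not from the package):

import numpy as np
from rdworks.autograph import AutoGraph

# Build a symmetric, zero-diagonal "RMSD" matrix for 20 fake conformers.
rng = np.random.default_rng(0)
X = rng.random((20, 3))  # stand-in conformer features
rmsd = np.linalg.norm(X[:, None] - X[None, :], axis=-1)

assignments, centroids = AutoGraph(rmsd)
print(assignments)  # community label per conformer
print(centroids)    # one representative conformer index per community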
rdworks-0.25.7/src/rdworks/autograph/centroid.py ADDED
@@ -0,0 +1,90 @@
+ import pandas as pd
+ import numpy as np
+
+
+ def centroid_medoid(communityAssignment, rmsdMatrix) -> list:
+     """returns a list of centroids based on medoids
+
+     Medoids are representative objects of a data set or a cluster within a data set
+     whose sum of dissimilarities to all the objects in the cluster is minimal.
+
+     Args:
+         communityAssignment: (list) community assignment corresponding to each conformer index
+         rmsdMatrix: (numpy array) matrix containing pairwise atomic RMSD between all conformers
+
+     Returns:
+         a list of centroids
+     """
+     N = rmsdMatrix.shape[0]
+     community_indices = list(set(communityAssignment))
+     centroids = []
+     for C in community_indices:
+         C_members = [x for x in range(N) if communityAssignment[x] == C]
+         community_submatrix = rmsdMatrix[C_members, :][:, C_members]
+         dist_sum = np.sum(community_submatrix, axis = 1)
+         centroids.append(C_members[np.argmin(dist_sum)])
+     return centroids
+
+
+ def dijkstra(filtered_rmsd_matrix_community, i):
+     '''Use Dijkstra's algorithm to find the shortest path from index i to all other nodes.'''
+     # initialize lists
+     visited = []
+     unvisited = [x for x in range(filtered_rmsd_matrix_community.shape[0])]
+     record = [np.inf for x in range(filtered_rmsd_matrix_community.shape[0])]
+     record[i] = 0
+     lastNode = [-1 for x in record]
+     # repeat until all nodes have been visited
+     while len(unvisited) > 0:
+         visit_index = unvisited[np.argmin([record[x] for x in unvisited])]
+         unvisited_neighbors = [x for x in unvisited if filtered_rmsd_matrix_community[visit_index, x] > 0]
+         # Calculate the distance to each unvisited neighbor. If it is shorter than the
+         # recorded distance, update the record and the predecessor.
+         updateDist = filtered_rmsd_matrix_community[visit_index, :] + record[visit_index]
+         for j in unvisited_neighbors:
+             if updateDist[j] < record[j]:
+                 record[j] = updateDist[j]
+                 lastNode[j] = visit_index
+         # update visited/unvisited node lists
+         unvisited.remove(visit_index)
+         visited.append(visit_index)
+     return record, lastNode
+
+
+ def centroid_betweenness(num, communityAssignment, filtered_rmsd_matrix):
+     # Provided a list of conformers assigned to communities, choose as representative
+     # centroid the conformer of maximum in-community betweenness.
+     # inputs:
+     #     num: (int) number of conformers
+     #     communityAssignment: (list) community assignment corresponding to each conformer index
+     #     filtered_rmsd_matrix: (np.array) RMSD matrix between conformers, with distances above the threshold set to zero
+     communityList = list(set(communityAssignment))
+     centralNodes = []
+     comm_size = []
+     for C in communityList:
+         C_members = [x for x in range(num) if communityAssignment[x] == C]
+         comm_size.append(len(C_members))
+         community_subgraph = filtered_rmsd_matrix[C_members, :][:, C_members]
+         community_betweenness = np.zeros(len(C_members))
+         for i in range(len(C_members)):
+             record, lastnode = dijkstra(community_subgraph, i)
+             for j in range(len(C_members)):
+                 previous_node = lastnode[j]
+                 while previous_node != -1:
+                     community_betweenness[previous_node] += 1
+                     previous_node = lastnode[previous_node]
+         max_betweenness_index = np.argmax(community_betweenness)
+         centralNodes.append(C_members[max_betweenness_index])
+
+     # Sort centers by cluster size in descending order.
+     centralDf = pd.DataFrame({'size': comm_size}, index = centralNodes)
+     centralDf.sort_values(by = 'size', ascending = False, inplace = True)
+     return list(centralDf.index)
+
+
+ def centroid_autograph(N, communityAssignment, rmsdMatrix, threshold, centroid_selection='betweenness', filteredAffinityMatrix=None):
+     '''Return indices of conformers designated as centroids, chosen by maximum in-cluster betweenness.'''
+     if centroid_selection == 'betweenness':
+         # affinity = exp(-r^2) > threshold is equivalent to r < sqrt(-ln(threshold)),
+         # so zero out RMSD edges at or above that distance before computing betweenness
+         filtered_rmsd_matrix = rmsdMatrix * (rmsdMatrix < np.sqrt(-np.log(threshold)))
+         return centroid_betweenness(N, communityAssignment, filtered_rmsd_matrix)
+     else:
+         print('Centroid criterion not recognized. Only "betweenness" is implemented in this module.')
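
centroid_medoid picks, per cluster, the member whose summed RMSD to its cluster mates is smallest. A small worked example with synthetic 4x4 distances (illustrative only):

import numpy as np
from rdworks.autograph.centroid import centroid_medoid

# Two clusters: members {0, 1, 2} and {3}. Within the first cluster,
# conformer 1 has the smallest row sum (0.2 + 0.3 = 0.5), so it is the medoid.
rmsd = np.array([
    [0.0, 0.2, 0.5, 2.0],
    [0.2, 0.0, 0.3, 2.1],
    [0.5, 0.3, 0.0, 1.9],
    [2.0, 2.1, 1.9, 0.0],
])
print(centroid_medoid([0, 0, 0, 1], rmsd))  # [1, 3]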
rdworks-0.25.7/src/rdworks/autograph/dynamictreecut.py ADDED
@@ -0,0 +1,135 @@
+ # 2020-05-22
+ # Kiyoto Aramis Tanemura
+
+ # The Ward algorithm for conformational clustering is used in building Markov state models
+ # (DOI: 10.1021/acs.jctc.6b01238). To circumvent threshold selection, we apply the dynamic
+ # tree cut method in conjunction with the Ward dendrogram for automated conformational
+ # clustering (DOI: 10.1186/s13321-017-0208-0).
+
+ import numpy as np
+
+ from scipy.cluster.hierarchy import linkage, to_tree
+ from scipy.spatial.distance import squareform
+
+ from .centroid import centroid_medoid
+
+
+ def get_ward_dendrogram(rmsd_mat):
+     '''Use SciPy to obtain a dendrogram using the Ward method.
+     Returns the linkage matrix (refer to the SciPy linkage output format).'''
+     dend = linkage(squareform(rmsd_mat), method = 'ward', optimal_ordering = True)
+     return dend
+
+
+ def goLeftmost(node, path, path_record):
+     '''Provided a node, travel left until a leaf is reached.'''
+     curr_node = node
+     if path in path_record:
+         return path
+     while not curr_node.is_leaf():
+         curr_node = curr_node.get_left()
+         path += 'l'
+     return path
+
+
+ def travelDown(ref_node, rel_path):
+     '''Reach a node below, provided a starting node and a path.'''
+     curr_node = ref_node
+     for i in rel_path:
+         if i == 'l':
+             curr_node = curr_node.get_left()
+         elif i == 'r':
+             curr_node = curr_node.get_right()
+     return curr_node
+
+
+ def getHeights(root):
+     '''Return the heights of nonleaf nodes and their corresponding paths.'''
+     heights = []
+     path_recorded = []
+     path = '.'
+     curr_node = root
+     if root.is_leaf():
+         return [0], [path]
+     while 'l' in list(path) or not curr_node.is_leaf():
+         path = goLeftmost(curr_node, path, path_recorded)
+         path = path[:-1]
+         curr_node = travelDown(root, path)
+         if path not in path_recorded:
+             heights.append(curr_node.dist)
+             path_recorded.append(path)
+         path += 'r'
+         curr_node = travelDown(root, path)
+     return heights, path_recorded
+
+
+ def treeCutCore(H, I, tau = 5):
+     '''Determine significant clusters provided one calibration value.'''
+     H_hat = np.asarray(H) - I
+     trans_indices = [x for x in range(len(H) - 1) if H_hat[x] > 0 and H_hat[x + 1] < 0]
+     breakpoints = []
+     for i in trans_indices:
+         back_index = 1
+         while back_index <= i and H_hat[i - back_index] > 0:
+             back_index += 1
+         breakpoints.append(i - back_index + 1)
+     # Keep only the significant breakpoints.
+     significant_breakpoints = [breakpoints[x] for x in range(len(breakpoints)) if trans_indices[x] - breakpoints[x] > tau]
+     return significant_breakpoints
+
+
+ def adaptiveTreecutCore(H, tau = 5):
+     '''Perform treeCutCore at the mean height. If no significant breakpoints are detected,
+     repeat the operation below and then above the mean.'''
+     if len(H) == 0:
+         return []
+     lm = np.mean(H)
+     lu = np.mean([lm, np.max(H)])
+     ld = np.mean([lm, np.min(H)])
+     bps = treeCutCore(H, lm, tau)
+     if len(bps) == 0:
+         bps = treeCutCore(H, ld, tau)
+     if len(bps) == 0:
+         bps = treeCutCore(H, lu, tau)
+     return bps
+
+
+ def getClusterNodeIndices(comprehensive_path_list, cluster_substring):
+     # Given a root string, return the indices of all paths that start with it.
+     return [x for x in range(len(comprehensive_path_list)) if comprehensive_path_list[x].startswith(cluster_substring)]
+
+
+ # After looking at the Java implementation, I suspect the breakpoints, which correspond to the
+ # indices of heights (distance values of nonleaf nodes), also correspond to the indices of leaves.
+ # Try clustering by collecting all breakpoints, then using the indices to subset the leaves.
+ def dynamicTreeCut(tree, n, tau = 5):
+     allHeights, allPaths = getHeights(tree)
+     allHeights = [0] + allHeights + [0] # sandwich the heights with zeros so the ends can be included in or left out of major clusters
+     breakpoints = [0, -1]
+     updateList = [-1]
+     while len(updateList) > 0:
+         updateList = []
+         for i in range(len(breakpoints) - 1):
+             Hi = allHeights[breakpoints[i]:breakpoints[i + 1]]
+             cutpoints = adaptiveTreecutCore(Hi, tau)
+             updateList += [x for x in cutpoints if x not in breakpoints]
+         breakpoints += updateList
+     return breakpoints
+
+
+ def report_assignments(breakpoints, tree):
+     '''To standardize outputs with the other clustering algorithms, take the breakpoints produced
+     by dynamicTreeCut and return a list of ints specifying the cluster assignment.'''
+     leaves = tree.pre_order(lambda x: x.id)
+     n = len(leaves)
+     comm_assign = np.zeros(n, int)
+     for i in range(len(breakpoints) - 1):
+         members = leaves[breakpoints[i]:breakpoints[i + 1]]
+         comm_assign[members] = i
+     return comm_assign.tolist()
+
+
+ def DynamicTreeCut(rmsdMatrix, tau = 5): # Ward clustering + dynamic tree cut
+     N = rmsdMatrix.shape[0]
+     dend = get_ward_dendrogram(rmsdMatrix)
+     tree = to_tree(dend)
+     breakpoints = dynamicTreeCut(tree, N, tau)
+     communityAssignment = report_assignments(breakpoints, tree)
+     centroid_indices = centroid_medoid(communityAssignment, rmsdMatrix)
+     return communityAssignment, centroid_indices
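
Like AutoGraph, DynamicTreeCut takes only a symmetric pairwise RMSD matrix; it builds a Ward dendrogram and cuts it adaptively rather than at a fixed height, then reports medoid centroids. A usage sketch with synthetic distances (illustrative only, not from the package):

import numpy as np
from rdworks.autograph import DynamicTreeCut

# Synthetic symmetric, zero-diagonal distance matrix for 30 fake conformers.
rng = np.random.default_rng(1)
X = rng.random((30, 3))
rmsd = np.linalg.norm(X[:, None] - X[None, :], axis=-1)

assignments, centroids = DynamicTreeCut(rmsd, tau=5)
print(assignments)  # cluster label per conformer
print(centroids)    # medoid conformer index per cluster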