rdworks-0.25.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. rdworks-0.25.7/LICENSE +21 -0
  2. rdworks-0.25.7/PKG-INFO +37 -0
  3. rdworks-0.25.7/README.md +2 -0
  4. rdworks-0.25.7/pyproject.toml +62 -0
  5. rdworks-0.25.7/setup.cfg +4 -0
  6. rdworks-0.25.7/src/rdworks/__init__.py +35 -0
  7. rdworks-0.25.7/src/rdworks/autograph/__init__.py +4 -0
  8. rdworks-0.25.7/src/rdworks/autograph/autograph.py +184 -0
  9. rdworks-0.25.7/src/rdworks/autograph/centroid.py +90 -0
  10. rdworks-0.25.7/src/rdworks/autograph/dynamictreecut.py +135 -0
  11. rdworks-0.25.7/src/rdworks/autograph/nmrclust.py +123 -0
  12. rdworks-0.25.7/src/rdworks/autograph/rckmeans.py +74 -0
  13. rdworks-0.25.7/src/rdworks/bitqt/__init__.py +1 -0
  14. rdworks-0.25.7/src/rdworks/bitqt/bitqt.py +355 -0
  15. rdworks-0.25.7/src/rdworks/conf.py +374 -0
  16. rdworks-0.25.7/src/rdworks/descriptor.py +36 -0
  17. rdworks-0.25.7/src/rdworks/display.py +206 -0
  18. rdworks-0.25.7/src/rdworks/ionized.py +170 -0
  19. rdworks-0.25.7/src/rdworks/matchedseries.py +260 -0
  20. rdworks-0.25.7/src/rdworks/mol.py +1522 -0
  21. rdworks-0.25.7/src/rdworks/mollibr.py +887 -0
  22. rdworks-0.25.7/src/rdworks/pka.py +38 -0
  23. rdworks-0.25.7/src/rdworks/predefined/Asinex_fragment.xml +20 -0
  24. rdworks-0.25.7/src/rdworks/predefined/Astex_RO3.xml +16 -0
  25. rdworks-0.25.7/src/rdworks/predefined/Baell2010_PAINS/Baell2010A.xml +52 -0
  26. rdworks-0.25.7/src/rdworks/predefined/Baell2010_PAINS/Baell2010B.xml +169 -0
  27. rdworks-0.25.7/src/rdworks/predefined/Baell2010_PAINS/Baell2010C.xml +1231 -0
  28. rdworks-0.25.7/src/rdworks/predefined/Baell2010_PAINS/PAINS-less-than-015-hits.xml +2048 -0
  29. rdworks-0.25.7/src/rdworks/predefined/Baell2010_PAINS/PAINS-less-than-150-hits.xml +278 -0
  30. rdworks-0.25.7/src/rdworks/predefined/Baell2010_PAINS/PAINS-more-than-150-hits.xml +83 -0
  31. rdworks-0.25.7/src/rdworks/predefined/Baell2010_PAINS/makexml.py +70 -0
  32. rdworks-0.25.7/src/rdworks/predefined/Brenk2008_Dundee/makexml.py +21 -0
  33. rdworks-0.25.7/src/rdworks/predefined/CNS.xml +18 -0
  34. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/BMS.xml +543 -0
  35. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/Dundee.xml +318 -0
  36. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/Glaxo.xml +168 -0
  37. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/Inpharmatica.xml +276 -0
  38. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/LINT.xml +174 -0
  39. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/MLSMR.xml +351 -0
  40. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/PAINS.xml +1446 -0
  41. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/SureChEMBL.xml +501 -0
  42. rdworks-0.25.7/src/rdworks/predefined/ChEMBL_Walters/makexml.py +40 -0
  43. rdworks-0.25.7/src/rdworks/predefined/Hann1999_Glaxo/Hann1999.xml +168 -0
  44. rdworks-0.25.7/src/rdworks/predefined/Hann1999_Glaxo/Hann1999Acid.xml +102 -0
  45. rdworks-0.25.7/src/rdworks/predefined/Hann1999_Glaxo/Hann1999Base.xml +6 -0
  46. rdworks-0.25.7/src/rdworks/predefined/Hann1999_Glaxo/Hann1999ElPh.xml +6 -0
  47. rdworks-0.25.7/src/rdworks/predefined/Hann1999_Glaxo/Hann1999NuPh.xml +6 -0
  48. rdworks-0.25.7/src/rdworks/predefined/Hann1999_Glaxo/makexml.py +83 -0
  49. rdworks-0.25.7/src/rdworks/predefined/Kazius2005/Kazius2005.xml +114 -0
  50. rdworks-0.25.7/src/rdworks/predefined/Kazius2005/makexml.py +66 -0
  51. rdworks-0.25.7/src/rdworks/predefined/ZINC_druglike.xml +24 -0
  52. rdworks-0.25.7/src/rdworks/predefined/ZINC_fragment.xml +14 -0
  53. rdworks-0.25.7/src/rdworks/predefined/ZINC_leadlike.xml +15 -0
  54. rdworks-0.25.7/src/rdworks/predefined/fragment.xml +7 -0
  55. rdworks-0.25.7/src/rdworks/predefined/ionized/simple_smarts_pattern.csv +57 -0
  56. rdworks-0.25.7/src/rdworks/predefined/ionized/smarts_pattern.csv +107 -0
  57. rdworks-0.25.7/src/rdworks/predefined/misc/makexml.py +119 -0
  58. rdworks-0.25.7/src/rdworks/predefined/misc/reactive-part-2.xml +104 -0
  59. rdworks-0.25.7/src/rdworks/predefined/misc/reactive-part-3.xml +74 -0
  60. rdworks-0.25.7/src/rdworks/predefined/misc/reactive.xml +321 -0
  61. rdworks-0.25.7/src/rdworks/readin.py +312 -0
  62. rdworks-0.25.7/src/rdworks/rgroup.py +2173 -0
  63. rdworks-0.25.7/src/rdworks/scaffold.py +520 -0
  64. rdworks-0.25.7/src/rdworks/std.py +143 -0
  65. rdworks-0.25.7/src/rdworks/stereoisomers.py +127 -0
  66. rdworks-0.25.7/src/rdworks/tautomers.py +20 -0
  67. rdworks-0.25.7/src/rdworks/units.py +63 -0
  68. rdworks-0.25.7/src/rdworks/utils.py +495 -0
  69. rdworks-0.25.7/src/rdworks/xml.py +260 -0
  70. rdworks-0.25.7/src/rdworks.egg-info/PKG-INFO +37 -0
  71. rdworks-0.25.7/src/rdworks.egg-info/SOURCES.txt +76 -0
  72. rdworks-0.25.7/src/rdworks.egg-info/dependency_links.txt +1 -0
  73. rdworks-0.25.7/src/rdworks.egg-info/requires.txt +13 -0
  74. rdworks-0.25.7/src/rdworks.egg-info/top_level.txt +1 -0
  75. rdworks-0.25.7/tests/test_basics.py +506 -0
  76. rdworks-0.25.7/tests/test_nn_xtb.py +91 -0
  77. rdworks-0.25.7/tests/test_states.py +17 -0
  78. rdworks-0.25.7/tests/test_web.py +380 -0
rdworks-0.25.7/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024-2025 Sung-Hun Bae
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
rdworks-0.25.7/PKG-INFO ADDED
@@ -0,0 +1,37 @@
+ Metadata-Version: 2.4
+ Name: rdworks
+ Version: 0.25.7
+ Summary: Cheminformatics and AI/ML Work Built on RDKit
+ Author-email: Sung-Hun Bae <sunghun.bae@gmail.com>
+ Maintainer-email: Sung-Hun Bae <sunghun.bae@gmail.com>
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/sunghunbae/rdworks
+ Project-URL: Repository, https://github.com/sunghunbae/rdworks.git
+ Project-URL: Issues, https://github.com/sunghunbae/rdworks/issues
+ Project-URL: Changelog, https://github.com/sunghunbae/rdworks/blob/master/CHANGELOG.md
+ Project-URL: Documentation, https://sunghunbae.github.io/rdworks/
+ Keywords: neural-network-potential,cheminformatics,rdkit
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: numpy
+ Requires-Dist: scipy
+ Requires-Dist: scikit-learn
+ Requires-Dist: pandas
+ Requires-Dist: seaborn
+ Requires-Dist: networkx
+ Requires-Dist: tqdm
+ Requires-Dist: psutil
+ Requires-Dist: ase
+ Requires-Dist: rdkit>=2023
+ Requires-Dist: bitarray
+ Requires-Dist: cdpkit
+ Requires-Dist: pytest
+ Dynamic: license-file
+
+ # rdworks
+ Higher level wrapper using RDKit
rdworks-0.25.7/README.md ADDED
@@ -0,0 +1,2 @@
+ # rdworks
+ Higher level wrapper using RDKit
rdworks-0.25.7/pyproject.toml ADDED
@@ -0,0 +1,62 @@
+ [build-system]
+ requires = ["setuptools>=68"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "rdworks"
+ dynamic = ["version"]
+ requires-python = ">=3.11"
+ dependencies = [
+     "numpy",
+     "scipy",
+     "scikit-learn",
+     "pandas",
+     "seaborn",
+     "networkx",
+     "tqdm",
+     "psutil",
+     "ase",
+     "rdkit>=2023",
+     "bitarray",
+     "cdpkit",
+     "pytest",
+ ]
+ authors = [{name = "Sung-Hun Bae", email = "sunghun.bae@gmail.com"}]
+ maintainers = [{name = "Sung-Hun Bae", email = "sunghun.bae@gmail.com"}]
+ description = "Cheminformatics and AI/ML Work Built on RDKit"
+ readme = "README.md"
+ license = "MIT"
+ keywords = [
+     "neural-network-potential",
+     "cheminformatics",
+     "rdkit",
+ ]
+ classifiers = [
+     "Development Status :: 3 - Alpha",
+     "Intended Audience :: Developers",
+     "Programming Language :: Python :: 3",
+     "Operating System :: OS Independent",
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/sunghunbae/rdworks"
+ Repository = "https://github.com/sunghunbae/rdworks.git"
+ Issues = "https://github.com/sunghunbae/rdworks/issues"
+ Changelog = "https://github.com/sunghunbae/rdworks/blob/master/CHANGELOG.md"
+ Documentation = "https://sunghunbae.github.io/rdworks/"
+
+ [tool.setuptools.dynamic]
+ version = {attr = "rdworks.__version__"}
+
+ [tool.setuptools.packages.find]
+ where = ["src"]
+
+ [tool.setuptools.package-data]
+ "rdworks.auto3d.models" = ["*"]
+ "rdworks.predefined" = ["*.xml"]
+ "rdworks.predefined.Baell2010_PAINS" = ["*.xml"]
+ "rdworks.predefined.ChEMBL_Walters" = ["*.xml"]
+ "rdworks.predefined.Hann1999_Glaxo" = ["*.xml"]
+ "rdworks.predefined.Kazius2005" = ["*.xml"]
+ "rdworks.predefined.misc" = ["*.xml"]
+ "rdworks.predefined.ionized" = ["*.csv"]
rdworks-0.25.7/setup.cfg ADDED
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
rdworks-0.25.7/src/rdworks/__init__.py ADDED
@@ -0,0 +1,35 @@
+ __version__ = '0.25.7'
+
+ from rdworks.xml import list_predefined_xml, get_predefined_xml, parse_xml
+ from rdworks.units import ev2kcalpermol, hartree2ev, hartree2kcalpermol, periodictable
+ from rdworks.readin import read_csv, merge_csv, read_dataframe, read_smi, read_sdf, read_mae
+ from rdworks.std import desalt_smiles, standardize_smiles, standardize
+ from rdworks.tautomers import complete_tautomers
+ from rdworks.stereoisomers import complete_stereoisomers
+ from rdworks.ionized import IonizedStates
+ from rdworks.rgroup import expand_rgroup, most_common, most_common_in_NP
+ from rdworks.scaffold import scaffold_network, scaffold_tree, BRICS_fragmented, BRICS_fragment_indices
+ from rdworks.matchedseries import MatchedSeries
+ from rdworks.descriptor import rd_descriptor, rd_descriptor_f
+ from rdworks.utils import fix_decimal_places_in_list, fix_decimal_places_in_dict, mae_to_dict, mae_rd_index
+ from rdworks.display import svg
+ from rdworks.conf import Conf
+ from rdworks.mol import Mol
+ from rdworks.mollibr import MolLibr
+
+ from rdkit import rdBase, RDLogger
+ RDLogger.logger().setLevel(RDLogger.CRITICAL)  # suppress RDKit log messages below CRITICAL
+
+ import logging
+
+ main_logger = logging.getLogger()
+ main_logger.setLevel(logging.INFO)  # level: DEBUG < INFO < WARNING < ERROR < CRITICAL
+ logger_formatter = logging.Formatter(
+     fmt='%(asctime)s %(levelname)s %(message)s',
+     datefmt='%Y-%m-%d %H:%M:%S')
+ logger_ch = logging.StreamHandler()
+ logger_ch.setFormatter(logger_formatter)
+ main_logger.addHandler(logger_ch)
+
+
+ __rdkit_version__ = rdBase.rdkitVersion
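
The imports above define the package's public surface: callers can pull everything from rdworks directly. As a usage sketch only — the Mol and MolLibr constructor signatures below are assumptions, since src/rdworks/mol.py and src/rdworks/mollibr.py are not reproduced in this section:

from rdworks import Mol, MolLibr

# Hypothetical usage: assumes Mol can be built from a SMILES string with an
# optional name, and that MolLibr wraps a list of Mol objects. Check
# src/rdworks/mol.py and src/rdworks/mollibr.py for the actual signatures.
mol = Mol('CC(=O)Oc1ccccc1C(=O)O', name='aspirin')
libr = MolLibr([mol])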
rdworks-0.25.7/src/rdworks/autograph/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .nmrclust import NMRCLUST
+ from .rckmeans import RCKmeans
+ from .dynamictreecut import DynamicTreeCut
+ from .autograph import AutoGraph
rdworks-0.25.7/src/rdworks/autograph/autograph.py ADDED
@@ -0,0 +1,184 @@
+ # 2020-04-17
+ # Kiyoto Aramis Tanemura
+ # The code for the Louvain algorithm was getting lengthy, so it gets its own module.
+
+ import numpy as np
+
+ from .centroid import centroid_autograph
+
+ def getModularity(affinityMatrix, communityAssignments, resolution = 1.0):
+     # Provided an affinity matrix (np.array) and a list assigning each node to a community,
+     # return the modularity of the whole graph.
+     arrayDim = affinityMatrix.shape[0]
+     communities = list(set(communityAssignments))
+     Q = 0
+     # Self-loop weights (diagonal) are counted twice, following the Louvain convention.
+     affinityMatrix[range(arrayDim), range(arrayDim)] += affinityMatrix[range(arrayDim), range(arrayDim)]
+     two_m = np.sum(affinityMatrix)
+     for C in communities:
+         communityIndices = [i for i in range(len(communityAssignments)) if communityAssignments[i] == C]
+         sigma_in = np.sum(affinityMatrix[communityIndices, :][:, communityIndices])
+         sigma_tot = np.sum(affinityMatrix[communityIndices, :])
+         Q += sigma_in / two_m - resolution * (sigma_tot / two_m) ** 2
+     return Q
+
+ def LouvainPhase1(affinityMatrix, communityAssignment, Q_threshold, max_iter, resolution):
+     # inputs: affinityMatrix as numpy array; clear the diagonal and filter edges with weights below threshold before entering
+     #     communityAssignment: list of length num_nodes; values are community IDs
+     #     Q_threshold: minimum gain in global modularity required to continue phase 1
+     #     max_iter: maximum number of iterations before phase 1 is forced to terminate
+     # output: updated communityAssignment list
+
+     num_nodes = affinityMatrix.shape[0]
+     affinityMatrix[range(num_nodes), range(num_nodes)] += affinityMatrix[range(num_nodes), range(num_nodes)]
+     two_m = np.sum(affinityMatrix)
+     modularity_current = getModularity(affinityMatrix, communityAssignment, resolution)
+     changeModularity = 1
+     adjacencyMatrix = affinityMatrix > 0
+     iterations = 0
+
+     while changeModularity > Q_threshold and iterations < max_iter:
+         modularity_prev = modularity_current
+         for i in range(num_nodes): # for each node
+             neighborIndices = [j for j in range(num_nodes) if adjacencyMatrix[i, j] == 1]
+             communities = set([communityAssignment[k] for k in neighborIndices])
+             communities.discard(communityAssignment[i])
+             communities = list(communities)
+             modularityList = []
+             # ki: sum of weights of edges incident to node i
+             ki = np.sum(affinityMatrix[i, :])
+             for C in communities:
+                 C_members = [x for x in range(num_nodes) if communityAssignment[x] == C]
+                 # sigma_tot: sum of weights of all edges incident to nodes in C
+                 sigma_tot = np.sum(affinityMatrix[C_members, :])
+                 # ki_in: sum of weights of edges connecting node i to nodes in C
+                 ki_in = np.sum(affinityMatrix[i, :][C_members])
+                 # simplified formula; derivation in https://hal.archives-ouvertes.fr/hal-01231784/document
+                 deltaQ = ki_in / two_m - resolution * sigma_tot * ki / (two_m ** 2 / 2)
+                 modularityList.append(deltaQ)
+             modularityList.append(0) # append zero to avoid an error on an empty list
+             maxQgain = np.max(modularityList)
+             if maxQgain > 0:
+                 communityToJoin = communities[modularityList.index(maxQgain)]
+                 communityAssignment[i] = communityToJoin
+         modularity_current = getModularity(affinityMatrix, communityAssignment, resolution)
+         changeModularity = modularity_current - modularity_prev
+         iterations += 1
+
+     return communityAssignment
+
+ def LouvainPhase2(affinityMatrix, communityAssignment):
+     # Merge the communities into supernodes.
+     # input: affinityMatrix or condensed graph
+     # output: condensed graph and a list of communities corresponding to the axes of the graph
+     num_nodes = affinityMatrix.shape[0]
+     communities = list(set(communityAssignment))
+     num_communities = len(communities)
+     phase2graph = np.zeros([num_communities, num_communities])
+     for i in range(num_communities):
+         i_members = [x for x in range(num_nodes) if communityAssignment[x] == communities[i]]
+         for j in range(num_communities):
+             j_members = [x for x in range(num_nodes) if communityAssignment[x] == communities[j]]
+             # if i == j, intra-community edges are double counted, so divide by two
+             phase2graph[i, j] = np.sum(affinityMatrix[i_members, :][:, j_members]) * (1/2) ** (i == j)
+     return phase2graph, communities
+
+
+ def Louvain(affinityMatrix, Q_threshold = 0.001, max_iter = 50, resolution = 1.0):
+     # perform the two phases of Louvain community detection iteratively
+     graph = affinityMatrix
+     comm = list(range(affinityMatrix.shape[0]))
+     # communityAssignmentRecord and commRefList are lists of lists of the same length:
+     # communityAssignmentRecord stores the assignment after phase 1;
+     # commRefList keeps the individual communities before reassignment.
+     communityAssignmentRecord = []
+     commRefList = []
+     changeModularity = 1
+     iterations = 0
+     while changeModularity > Q_threshold and iterations < max_iter:
+         graph, comm = LouvainPhase2(graph, comm)
+         commRefList.append(list(comm))
+         modularity_past = getModularity(graph, comm, resolution) # compare Q before and after the phase 1 reassignments
+         comm = LouvainPhase1(graph, comm, Q_threshold, max_iter, resolution)
+         communityAssignmentRecord.append(list(comm))
+         modularity_curr = getModularity(graph, comm, resolution)
+         changeModularity = modularity_curr - modularity_past
+         iterations += 1
+
+     # Propagate the final community assignments back down to the original nodes.
+     for i in range(len(communityAssignmentRecord) - 2, 0, -1):
+         changeDict = {}
+         for j in range(len(communityAssignmentRecord[i])):
+             newComm = communityAssignmentRecord[i][j]
+             oldComm = commRefList[i][j]
+             if newComm != oldComm:
+                 changeDict[oldComm] = newComm # map old community ID to new community ID
+         for key in changeDict:
+             oldIndices = [x for x in range(len(communityAssignmentRecord[i - 1])) if communityAssignmentRecord[i - 1][x] == key]
+             for j in oldIndices:
+                 communityAssignmentRecord[i - 1][j] = changeDict[key]
+     return communityAssignmentRecord[0]
+
+
+ def rbfKernel(r, epsilon = 1.0):
+     return np.exp(-(epsilon * r) ** 2)
+
+
+ def findThreshold(affinityMatrix):
+     numFiles = affinityMatrix.shape[0]
+     # Collect the upper-triangle values of affinityMatrix and sort in descending order.
+     vals = np.array([])
+     for i in range(numFiles - 1):
+         vals = np.append(vals, affinityMatrix[i, i + 1:])
+     vals = np.sort(vals, axis = None)[::-1]
+
+     # Initialize index values for a binary search over candidate thresholds.
+     upperIndex = 0
+     lowerIndex = len(vals) - 1
+
+     while upperIndex != lowerIndex - 1: # iterate until the indices are consecutive
+         midIndex = int(np.mean([lowerIndex, upperIndex]))
+         adjacencyMatrix = affinityMatrix > vals[midIndex]
+         # Tally the nodes reachable from the first node by BFS. If not every node is
+         # visited, the graph has more than one component.
+         nodesVisited = [0] + [i for i in range(numFiles) if adjacencyMatrix[0, i] == 1]
+         for i in nodesVisited:
+             newNodes = [j for j in range(numFiles) if adjacencyMatrix[i, j] == 1 and j not in nodesVisited]
+             nodesVisited += newNodes
+             while len(newNodes) > 0:
+                 newNodes1list = []
+                 for i in newNodes:
+                     newNodes1 = [j for j in range(numFiles) if adjacencyMatrix[i, j] == 1 and j not in nodesVisited]
+                     newNodes1list += newNodes1
+                     nodesVisited += newNodes1
+                 newNodes = newNodes1list
+
+         # If every node was visited, there is exactly one component; otherwise the graph is disconnected.
+         if len(nodesVisited) < numFiles:
+             upperIndex = midIndex
+         else:
+             lowerIndex = midIndex
+
+     threshold = vals[lowerIndex]
+     return threshold
+
+
+ def AutoGraph(rmsdMatrix):
+     N = rmsdMatrix.shape[0]
+     affinityMatrix = rbfKernel(rmsdMatrix)
+     affinityMatrix[range(N), range(N)] = 0 # zero the diagonal
+     threshold = findThreshold(affinityMatrix)
+     adjacencyMatrix = affinityMatrix > threshold
+     filteredAffinityMatrix = affinityMatrix * adjacencyMatrix
+     communityAssignment = Louvain(filteredAffinityMatrix, Q_threshold=0.0, max_iter=50, resolution=1.0)
+     centroid_indices = centroid_autograph(N, communityAssignment, rmsdMatrix, threshold)
+     return communityAssignment, centroid_indices
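
AutoGraph needs only a symmetric pairwise RMSD matrix: it converts distances to an RBF affinity graph (a_ij = exp(-r_ij^2)), picks the sparsest edge threshold that keeps the graph connected, and clusters with Louvain. A minimal sketch with a synthetic distance matrix (the random data is illustrative only, not from the package):

import numpy as np
from rdworks.autograph import AutoGraph

# Build a symmetric, zero-diagonal "RMSD" matrix for 20 fake conformers.
rng = np.random.default_rng(0)
X = rng.random((20, 3))  # stand-in conformer features
rmsd = np.linalg.norm(X[:, None] - X[None, :], axis=-1)

assignments, centroids = AutoGraph(rmsd)
print(assignments)  # community label per conformer
print(centroids)    # one representative conformer index per community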
rdworks-0.25.7/src/rdworks/autograph/centroid.py ADDED
@@ -0,0 +1,90 @@
+ import pandas as pd
+ import numpy as np
+
+
+ def centroid_medoid(communityAssignment, rmsdMatrix) -> list:
+     """returns a list of centroids based on medoids
+
+     Medoids are representative objects of a data set or a cluster within a data set
+     whose sum of dissimilarities to all the objects in the cluster is minimal.
+
+     Args:
+         communityAssignment: (list) community assignment corresponding to each conformer index
+         rmsdMatrix: (numpy array) matrix containing pairwise atomic RMSD between all conformers
+
+     Returns:
+         a list of centroids
+     """
+     N = rmsdMatrix.shape[0]
+     community_indices = list(set(communityAssignment))
+     centroids = []
+     for C in community_indices:
+         C_members = [x for x in range(N) if communityAssignment[x] == C]
+         community_submatrix = rmsdMatrix[C_members, :][:, C_members]
+         dist_sum = np.sum(community_submatrix, axis = 1)
+         centroids.append(C_members[np.argmin(dist_sum)])
+     return centroids
+
+
+ def dijkstra(filtered_rmsd_matrix_community, i):
+     '''Use Dijkstra's algorithm to find the shortest path from index i to all other nodes.'''
+     # initialize lists
+     visited = []
+     unvisited = [x for x in range(filtered_rmsd_matrix_community.shape[0])]
+     record = [np.inf for x in range(filtered_rmsd_matrix_community.shape[0])]
+     record[i] = 0
+     lastNode = [-1 for x in record]
+     # repeat until all nodes have been visited
+     while len(unvisited) > 0:
+         visit_index = unvisited[np.argmin([record[x] for x in unvisited])]
+         unvisited_neighbors = [x for x in unvisited if filtered_rmsd_matrix_community[visit_index, x] > 0]
+         # Calculate the distance to each unvisited neighbor. If it is shorter than the
+         # recorded distance, update the record and the predecessor.
+         updateDist = filtered_rmsd_matrix_community[visit_index, :] + record[visit_index]
+         for j in unvisited_neighbors:
+             if updateDist[j] < record[j]:
+                 record[j] = updateDist[j]
+                 lastNode[j] = visit_index
+         # update visited/unvisited node lists
+         unvisited.remove(visit_index)
+         visited.append(visit_index)
+     return record, lastNode
+
+
+ def centroid_betweenness(num, communityAssignment, filtered_rmsd_matrix):
+     # Provided a list of conformers assigned to communities, choose as representative
+     # centroid the conformer of maximum in-community betweenness.
+     # inputs:
+     #     num: (int) number of conformers
+     #     communityAssignment: (list) community assignment corresponding to each conformer index
+     #     filtered_rmsd_matrix: (np.array) RMSD matrix between conformers, with distances above the threshold set to zero
+     communityList = list(set(communityAssignment))
+     centralNodes = []
+     comm_size = []
+     for C in communityList:
+         C_members = [x for x in range(num) if communityAssignment[x] == C]
+         comm_size.append(len(C_members))
+         community_subgraph = filtered_rmsd_matrix[C_members, :][:, C_members]
+         community_betweenness = np.zeros(len(C_members))
+         for i in range(len(C_members)):
+             record, lastnode = dijkstra(community_subgraph, i)
+             for j in range(len(C_members)):
+                 previous_node = lastnode[j]
+                 while previous_node != -1:
+                     community_betweenness[previous_node] += 1
+                     previous_node = lastnode[previous_node]
+         max_betweenness_index = np.argmax(community_betweenness)
+         centralNodes.append(C_members[max_betweenness_index])
+
+     # Sort centers by cluster size in descending order.
+     centralDf = pd.DataFrame({'size': comm_size}, index = centralNodes)
+     centralDf.sort_values(by = 'size', ascending = False, inplace = True)
+     return list(centralDf.index)
+
+
+ def centroid_autograph(N, communityAssignment, rmsdMatrix, threshold, centroid_selection='betweenness', filteredAffinityMatrix=None):
+     '''Return indices of conformers designated as centroids, chosen by maximum in-cluster betweenness.'''
+     if centroid_selection == 'betweenness':
+         # affinity = exp(-r^2) > threshold is equivalent to r < sqrt(-ln(threshold)),
+         # so zero out RMSD edges at or above that distance before computing betweenness
+         filtered_rmsd_matrix = rmsdMatrix * (rmsdMatrix < np.sqrt(-np.log(threshold)))
+         return centroid_betweenness(N, communityAssignment, filtered_rmsd_matrix)
+     else:
+         print('Centroid criterion not recognized. Only "betweenness" is implemented in this module.')
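
centroid_medoid picks, per cluster, the member whose summed RMSD to its cluster mates is smallest. A small worked example with synthetic 4x4 distances (illustrative only):

import numpy as np
from rdworks.autograph.centroid import centroid_medoid

# Two clusters: members {0, 1, 2} and {3}. Within the first cluster,
# conformer 1 has the smallest row sum (0.2 + 0.3 = 0.5), so it is the medoid.
rmsd = np.array([
    [0.0, 0.2, 0.5, 2.0],
    [0.2, 0.0, 0.3, 2.1],
    [0.5, 0.3, 0.0, 1.9],
    [2.0, 2.1, 1.9, 0.0],
])
print(centroid_medoid([0, 0, 0, 1], rmsd))  # [1, 3]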
rdworks-0.25.7/src/rdworks/autograph/dynamictreecut.py ADDED
@@ -0,0 +1,135 @@
+ # 2020-05-22
+ # Kiyoto Aramis Tanemura
+
+ # The Ward algorithm for conformational clustering is used in building Markov state models
+ # (DOI: 10.1021/acs.jctc.6b01238). To circumvent threshold selection, we apply the dynamic
+ # tree cut method in conjunction with the Ward dendrogram for automated conformational
+ # clustering (DOI: 10.1186/s13321-017-0208-0).
+
+ import numpy as np
+
+ from scipy.cluster.hierarchy import linkage, to_tree
+ from scipy.spatial.distance import squareform
+
+ from .centroid import centroid_medoid
+
+
+ def get_ward_dendrogram(rmsd_mat):
+     '''Use SciPy to obtain a dendrogram using the Ward method.
+     Returns the linkage matrix (refer to the SciPy linkage output format).'''
+     dend = linkage(squareform(rmsd_mat), method = 'ward', optimal_ordering = True)
+     return dend
+
+
+ def goLeftmost(node, path, path_record):
+     '''Provided a node, travel left until a leaf is reached.'''
+     curr_node = node
+     if path in path_record:
+         return path
+     while not curr_node.is_leaf():
+         curr_node = curr_node.get_left()
+         path += 'l'
+     return path
+
+
+ def travelDown(ref_node, rel_path):
+     '''Reach a node below, provided a starting node and a path.'''
+     curr_node = ref_node
+     for i in rel_path:
+         if i == 'l':
+             curr_node = curr_node.get_left()
+         elif i == 'r':
+             curr_node = curr_node.get_right()
+     return curr_node
+
+
+ def getHeights(root):
+     '''Return the heights of nonleaf nodes and their corresponding paths.'''
+     heights = []
+     path_recorded = []
+     path = '.'
+     curr_node = root
+     if root.is_leaf():
+         return [0], [path]
+     while 'l' in list(path) or not curr_node.is_leaf():
+         path = goLeftmost(curr_node, path, path_recorded)
+         path = path[:-1]
+         curr_node = travelDown(root, path)
+         if path not in path_recorded:
+             heights.append(curr_node.dist)
+             path_recorded.append(path)
+         path += 'r'
+         curr_node = travelDown(root, path)
+     return heights, path_recorded
+
+
+ def treeCutCore(H, I, tau = 5):
+     '''Determine significant clusters provided one calibration value.'''
+     H_hat = np.asarray(H) - I
+     trans_indices = [x for x in range(len(H) - 1) if H_hat[x] > 0 and H_hat[x + 1] < 0]
+     breakpoints = []
+     for i in trans_indices:
+         back_index = 1
+         while back_index <= i and H_hat[i - back_index] > 0:
+             back_index += 1
+         breakpoints.append(i - back_index + 1)
+     # Keep only the significant breakpoints.
+     significant_breakpoints = [breakpoints[x] for x in range(len(breakpoints)) if trans_indices[x] - breakpoints[x] > tau]
+     return significant_breakpoints
+
+
+ def adaptiveTreecutCore(H, tau = 5):
+     '''Perform treeCutCore at the mean height. If no significant breakpoints are detected,
+     repeat the operation below and then above the mean.'''
+     if len(H) == 0:
+         return []
+     lm = np.mean(H)
+     lu = np.mean([lm, np.max(H)])
+     ld = np.mean([lm, np.min(H)])
+     bps = treeCutCore(H, lm, tau)
+     if len(bps) == 0:
+         bps = treeCutCore(H, ld, tau)
+     if len(bps) == 0:
+         bps = treeCutCore(H, lu, tau)
+     return bps
+
+
+ def getClusterNodeIndices(comprehensive_path_list, cluster_substring):
+     # Given a root string, return the indices of all paths that start with it.
+     return [x for x in range(len(comprehensive_path_list)) if comprehensive_path_list[x].startswith(cluster_substring)]
+
+
+ # After looking at the Java implementation, I suspect the breakpoints, which correspond to the
+ # indices of heights (distance values of nonleaf nodes), also correspond to the indices of leaves.
+ # Try clustering by collecting all breakpoints, then using the indices to subset the leaves.
+ def dynamicTreeCut(tree, n, tau = 5):
+     allHeights, allPaths = getHeights(tree)
+     allHeights = [0] + allHeights + [0] # sandwich the heights with zeros so the ends can be included in or left out of major clusters
+     breakpoints = [0, -1]
+     updateList = [-1]
+     while len(updateList) > 0:
+         updateList = []
+         for i in range(len(breakpoints) - 1):
+             Hi = allHeights[breakpoints[i]:breakpoints[i + 1]]
+             cutpoints = adaptiveTreecutCore(Hi, tau)
+             updateList += [x for x in cutpoints if x not in breakpoints]
+         breakpoints += updateList
+     return breakpoints
+
+
+ def report_assignments(breakpoints, tree):
+     '''To standardize outputs with the other clustering algorithms, take the breakpoints produced
+     by dynamicTreeCut and return a list of ints specifying the cluster assignment.'''
+     leaves = tree.pre_order(lambda x: x.id)
+     n = len(leaves)
+     comm_assign = np.zeros(n, int)
+     for i in range(len(breakpoints) - 1):
+         members = leaves[breakpoints[i]:breakpoints[i + 1]]
+         comm_assign[members] = i
+     return comm_assign.tolist()
+
+
+ def DynamicTreeCut(rmsdMatrix, tau = 5): # Ward clustering + dynamic tree cut
+     N = rmsdMatrix.shape[0]
+     dend = get_ward_dendrogram(rmsdMatrix)
+     tree = to_tree(dend)
+     breakpoints = dynamicTreeCut(tree, N, tau)
+     communityAssignment = report_assignments(breakpoints, tree)
+     centroid_indices = centroid_medoid(communityAssignment, rmsdMatrix)
+     return communityAssignment, centroid_indices
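
Like AutoGraph, DynamicTreeCut takes only a symmetric pairwise RMSD matrix; it builds a Ward dendrogram and cuts it adaptively rather than at a fixed height, then reports medoid centroids. A usage sketch with synthetic distances (illustrative only, not from the package):

import numpy as np
from rdworks.autograph import DynamicTreeCut

# Synthetic symmetric, zero-diagonal distance matrix for 30 fake conformers.
rng = np.random.default_rng(1)
X = rng.random((30, 3))
rmsd = np.linalg.norm(X[:, None] - X[None, :], axis=-1)

assignments, centroids = DynamicTreeCut(rmsd, tau=5)
print(assignments)  # cluster label per conformer
print(centroids)    # medoid conformer index per cluster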