synkit 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synkit/Chem/Fingerprint/__init__.py +0 -0
- synkit/Chem/Fingerprint/fp_calculator.py +122 -0
- synkit/Chem/Fingerprint/smiles_featurizer.py +185 -0
- synkit/Chem/Fingerprint/transformation_fp.py +79 -0
- synkit/Chem/Molecule/__init__.py +0 -0
- synkit/Chem/Molecule/standardize.py +137 -0
- synkit/Chem/Reaction/__init__.py +0 -0
- synkit/Chem/Reaction/balance_check.py +162 -0
- synkit/Chem/Reaction/cleanning.py +59 -0
- synkit/Chem/Reaction/deionize.py +289 -0
- synkit/Chem/Reaction/neutralize.py +256 -0
- synkit/Chem/Reaction/reagent.py +102 -0
- synkit/Chem/Reaction/standardize.py +157 -0
- synkit/Chem/Reaction/tautomerize.py +168 -0
- synkit/Graph/Cluster/__init__.py +0 -0
- synkit/Graph/Cluster/morphism.py +83 -0
- synkit/Graph/Feature/__init__.py +0 -0
- synkit/Graph/Feature/graph_descriptors.py +325 -0
- synkit/Graph/Feature/graph_fps.py +97 -0
- synkit/Graph/Feature/graph_signature.py +236 -0
- synkit/Graph/Feature/hash_fps.py +130 -0
- synkit/Graph/Feature/morgan_fps.py +87 -0
- synkit/Graph/Feature/path_fps.py +82 -0
- synkit/Graph/__init.py +0 -0
- synkit/IO/__init__.py +0 -0
- synkit/IO/chem_converter.py +231 -0
- synkit/IO/data_io.py +277 -0
- synkit/IO/data_process.py +49 -0
- synkit/IO/debug.py +78 -0
- synkit/IO/dg_to_gml.py +124 -0
- synkit/IO/gml_to_nx.py +119 -0
- synkit/IO/graph_to_mol.py +110 -0
- synkit/IO/mol_to_graph.py +282 -0
- synkit/IO/nx_to_gml.py +200 -0
- synkit/IO/parse_rule.py +172 -0
- synkit/IO/smiles_to_id.py +119 -0
- synkit/ITS/_misc.py +280 -0
- synkit/ITS/aam_validator.py +254 -0
- synkit/ITS/its_builder.py +94 -0
- synkit/ITS/its_construction.py +213 -0
- synkit/ITS/normalize_aam.py +183 -0
- synkit/ITS/partial_expand.py +170 -0
- synkit/Reactor/__init__.py +0 -0
- synkit/Reactor/core_engine.py +164 -0
- synkit/Reactor/inference.py +73 -0
- synkit/Reactor/multi_step.py +227 -0
- synkit/Reactor/multi_step_aam.py +82 -0
- synkit/Reactor/reagent.py +95 -0
- synkit/Reactor/rule_apply.py +81 -0
- synkit/Vis/__init__.py +0 -0
- synkit/Vis/chemical_graph_visualizer.py +378 -0
- synkit/Vis/chemical_reaction_visualizer.py +133 -0
- synkit/Vis/chemical_space.py +83 -0
- synkit/Vis/embedding.py +92 -0
- synkit/Vis/graph_visualizer.py +286 -0
- synkit/Vis/pdf_writer.py +143 -0
- synkit/Vis/rsmi_to_fig.py +169 -0
- synkit/__init__.py +0 -0
- synkit/_misc.py +181 -0
- synkit-0.0.1.dist-info/METADATA +148 -0
- synkit-0.0.1.dist-info/RECORD +63 -0
- synkit-0.0.1.dist-info/WHEEL +4 -0
- synkit-0.0.1.dist-info/licenses/LICENSE +21 -0
synkit/IO/parse_rule.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
# Regex patterns for nodes and edges
|
|
4
|
+
# Matches a single-line GML node entry; group 1 is the numeric node id and
# group 2 is the label, which is restricted to word characters (\w+).
NODE_REGEX = re.compile(r'node \[ id (\d+) label "(\w+)" \]')
|
|
5
|
+
# Matches a single-line GML edge entry; groups are (source id, target id,
# label). The label is matched non-greedily up to the closing quote.
EDGE_REGEX = re.compile(r'edge \[ source (\d+) target (\d+) label "(.+?)" \]')
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def find_block(lines, keyword):
    """
    Locate the first bracketed block whose opening line starts with `keyword`.

    Parameters:
    - lines (list of str): The GML text split into individual lines.
    - keyword (str): Prefix of the (stripped) opening line, e.g. "left [".

    Returns:
    - tuple: (start_idx, end_idx) of the opening line and the matching lone "]"
    line, or (None, None) when the block is missing or never closed.
    """
    opening = None
    nesting = 0
    for position, raw in enumerate(lines):
        text = raw.strip()

        # Still searching for the opening line of the block.
        if opening is None:
            if text.startswith(keyword):
                opening, nesting = position, 1
            continue

        # Inside the block: a trailing "[" opens a nested section and a lone
        # "]" closes one. Single-line entries such as `node [ ... ]` match
        # neither test and leave the nesting level untouched.
        if text.endswith("["):
            nesting += 1
        elif text == "]":
            nesting -= 1

        if nesting == 0:
            return opening, position

    return None, None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def get_nodes_from_edges(block_lines):
    """
    Collect the ids of all nodes that appear as an endpoint of an edge line.

    Parameters:
    - block_lines (list of str): Lines of one GML block.

    Returns:
    - set: Node ids (as strings, exactly as captured by EDGE_REGEX) that occur
    as the source or target of at least one edge line.
    """
    endpoints = set()
    hits = (EDGE_REGEX.search(raw.strip()) for raw in block_lines)
    for hit in hits:
        if hit is None:
            continue
        # Groups 1 and 2 are the source and target ids; the label is ignored.
        endpoints.add(hit.group(1))
        endpoints.add(hit.group(2))
    return endpoints
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def parse_context(context_lines, node_regex=None, edge_regex=None):
    """
    Parse the context lines to identify nodes and edges.

    Parameters:
    - context_lines (list of str): Lines of the 'context' block.
    - node_regex (compiled pattern, optional): Pattern used to recognise node
    lines; it must expose `search` and capture (id, label). Defaults to the
    module-level NODE_REGEX when None.
    - edge_regex (compiled pattern, optional): Pattern used to recognise edge
    lines; it must expose `search` and capture (source, target, label).
    Defaults to the module-level EDGE_REGEX when None.

    Returns:
    - context_nodes: {node_id: label}
    - context_edges: list of (source, target, label)
    """
    # Honour caller-supplied patterns; previously these arguments were
    # accepted but ignored and the module-level patterns were always used.
    node_pattern = NODE_REGEX if node_regex is None else node_regex
    edge_pattern = EDGE_REGEX if edge_regex is None else edge_regex

    context_nodes = {}
    context_edges = []
    for line in context_lines:
        stripped = line.strip()

        node_match = node_pattern.search(stripped)
        if node_match:
            nid, lbl = node_match.groups()
            context_nodes[nid] = lbl
            continue

        # A line is only tried as an edge when it is not a node line.
        edge_match = edge_pattern.search(stripped)
        if edge_match:
            source, target, label = edge_match.groups()
            context_edges.append((source, target, label))

    return context_nodes, context_edges
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def filter_context(context_lines, relevant_nodes):
    """
    Drop irrelevant hydrogen nodes, and every edge touching them, from a context block.

    Parameters:
    - context_lines (list of str): Lines of the 'context' block.
    - relevant_nodes (set): Node ids that must be kept even if labelled "H".

    Returns:
    - list of str: The original lines, in order, minus the removed node and
    edge lines. Lines that are neither nodes nor edges are always kept.
    """
    nodes_by_id, _ = parse_context(context_lines)

    # Hydrogens that are not protected by `relevant_nodes` are discarded.
    doomed = set()
    for node_id, symbol in nodes_by_id.items():
        if symbol == "H" and node_id not in relevant_nodes:
            doomed.add(node_id)

    kept = []
    for raw in context_lines:
        text = raw.strip()

        node_hit = NODE_REGEX.search(text)
        if node_hit:
            if node_hit.group(1) not in doomed:
                kept.append(raw)
            continue

        edge_hit = EDGE_REGEX.search(text)
        if edge_hit:
            src, dst = edge_hit.group(1), edge_hit.group(2)
            # An edge survives only if neither endpoint was removed.
            if not (src in doomed or dst in doomed):
                kept.append(raw)
            continue

        # Structural lines such as "context [" or "]" pass through untouched.
        kept.append(raw)

    return kept
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def strip_context(gml_text: str, remove_all: bool = True) -> str:
    """
    Rewrite the 'context' section of a GML rule and return the resulting text.

    Hydrogen nodes of the context that are not endpoints of edges in both the
    'left' and the 'right' section are removed together with their edges. When
    remove_all is True, every remaining edge line of the context is dropped as
    well, so only node and structural lines stay. Everything outside the
    context block is left untouched.

    Parameters:
    - gml_text (str): GML-like content describing a chemical reaction rule.
    - remove_all (bool): If True, additionally remove all edges from 'context'.

    Returns:
    - str: The modified GML content. The input is returned unchanged when any
    of the 'rule', 'left', 'context' or 'right' blocks cannot be located.
    """
    lines = gml_text.split("\n")

    # Locate the four sections in a fixed order: rule, left, context, right.
    keywords = ("rule [", "left [", "context [", "right [")
    bounds = [find_block(lines, keyword) for keyword in keywords]
    if any(edge is None for pair in bounds for edge in pair):
        return gml_text
    _, (left_lo, left_hi), (ctx_lo, ctx_hi), (right_lo, right_hi) = bounds

    # Nodes taking part in edges on both sides of the rule are protected.
    left_side = get_nodes_from_edges(lines[left_lo:left_hi + 1])
    right_side = get_nodes_from_edges(lines[right_lo:right_hi + 1])
    protected = left_side.intersection(right_side)

    context_body = filter_context(lines[ctx_lo:ctx_hi + 1], protected)

    if remove_all:
        # Keep node lines and structural lines, discard every edge line.
        context_body = [
            row for row in context_body if not EDGE_REGEX.search(row.strip())
        ]

    # Splice the rewritten context back between the untouched surroundings.
    rebuilt = lines[:ctx_lo] + context_body + lines[ctx_hi + 1:]
    return "\n".join(rebuilt)
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import requests
|
|
3
|
+
import urllib.parse
|
|
4
|
+
from typing import List
|
|
5
|
+
from joblib import Parallel, delayed
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def smiles_to_iupac(smiles_string: str, timeout: int = 1):
    """
    Look up the IUPAC name(s) of a SMILES string through the PubChem PUG REST API.

    The request is attempted up to three times, sleeping two seconds between
    attempts. Progress and failures are reported with print().

    Parameters:
    - smiles_string (str): The SMILES string of the compound (e.g., "C=O").
    - timeout (int, optional): Timeout in seconds passed to requests.get. Default is 1.

    Returns:
    - list: IUPAC names found in the response. An empty list is returned when
    the response holds no properties or names, or when every attempt failed.
    """
    max_attempts = 3  # total number of tries
    pause_seconds = 2  # wait between two tries

    # NOTE(review): quote() leaves "/" unescaped by default, and "/" occurs in
    # SMILES with stereo bonds - confirm how PubChem expects such input.
    encoded_smiles = urllib.parse.quote(smiles_string)
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{encoded_smiles}/property/IUPACName/JSON"

    for attempt in range(max_attempts):
        try:
            reply = requests.get(url, timeout=timeout)
            reply.raise_for_status()  # HTTP error statuses become exceptions

            payload = reply.json()
            records = payload.get("PropertyTable", {}).get("Properties", [])
            if not records:
                print(f"No properties found for SMILES: {smiles_string}")
                return []

            names = [
                record.get("IUPACName")
                for record in records
                if record.get("IUPACName")
            ]
            if not names:
                print(f"No IUPAC name found for SMILES: {smiles_string}")
                return []
            return names

        except (requests.exceptions.RequestException, ValueError, KeyError) as e:
            print(
                f"Attempt {attempt + 1} failed for SMILES: {smiles_string}, Error: {e}"
            )
            # Give up after the last attempt, otherwise back off and retry.
            if attempt >= max_attempts - 1:
                print(f"Final failure for SMILES: {smiles_string}")
                return []
            time.sleep(pause_seconds)

    return []
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def batch_process_smiles(smiles_batch: List[str], timeout=1):
    """
    Resolve the IUPAC names for every SMILES string of one batch, sequentially.

    Parameters:
    - smiles_batch (list): A list of SMILES strings to process.
    - timeout (int): Timeout for each request (in seconds).

    Returns:
    - list: One smiles_to_iupac result per input SMILES, in input order.
    """
    resolved = []
    for entry in smiles_batch:
        resolved.append(smiles_to_iupac(entry, timeout))
    return resolved
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def get_iupac_for_smiles_list(
    smiles_list: List[str], batch_size=10, n_jobs=4, timeout=1
):
    """
    Map SMILES strings to IUPAC names, processing fixed-size batches in parallel.

    Parameters:
    smiles_list (list): A list of SMILES strings to be converted to IUPAC names.
    batch_size (int): Number of SMILES strings to process in each batch.
    n_jobs (int): Number of parallel joblib jobs used for batch processing.
    timeout (int): Timeout for requests (in seconds).

    Returns:
    dict: A dictionary with SMILES as keys and lists of IUPAC names as values.
    """
    # Cut the input into consecutive slices of at most `batch_size` items.
    chunk_starts = range(0, len(smiles_list), batch_size)
    chunks = [smiles_list[lo:lo + batch_size] for lo in chunk_starts]

    # One joblib task per chunk.
    task = delayed(batch_process_smiles)
    per_chunk = Parallel(n_jobs=n_jobs)(task(chunk, timeout) for chunk in chunks)

    # Concatenate the per-chunk results and pair them with the input SMILES.
    names_in_order = []
    for chunk_names in per_chunk:
        names_in_order.extend(chunk_names)

    return {smiles: names for smiles, names in zip(smiles_list, names_in_order)}
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# Example of usage
# Guarded so that merely importing this module does not trigger PubChem
# requests and console output; the example only runs when the file is
# executed as a script.
if __name__ == "__main__":
    # NOTE(review): "C2H6O", "C4H10" and "C5H12" look like molecular formulas
    # rather than SMILES - confirm whether they are intentional.
    smiles_list = ["CCO", "C=O", "CC(=O)O", "C1=CC=CC=C1", "C2H6O", "C4H10", "C5H12"]
    iupac_results = get_iupac_for_smiles_list(smiles_list, batch_size=3, n_jobs=2)

    for smiles, iupac_names in iupac_results.items():
        print(f"SMILES: {smiles} => IUPAC Names: {iupac_names}")
|
synkit/ITS/_misc.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import networkx as nx
|
|
3
|
+
from rdkit import Chem
|
|
4
|
+
from rdkit.Chem.MolStandardize import rdMolStandardize
|
|
5
|
+
|
|
6
|
+
from typing import Optional, List
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_rc(
    ITS: nx.Graph,
    element_key: list = ["element", "charge", "typesGH", "atom_map"],
    bond_key: str = "order",
    standard_key: str = "standard_order",
) -> nx.Graph:
    """
    Build the reaction center (RC) graph of an ITS graph.

    An edge belongs to the reaction center when the first two entries of its
    `bond_key` attribute differ. Edges without that attribute are skipped.

    Parameters:
    - ITS (nx.Graph): The ITS graph to extract the RC from.
    - element_key (list): Node attribute keys copied to the RC when present on
    the node. Defaults to ['element', 'charge', 'typesGH', 'atom_map'].
    - bond_key (str): Edge attribute key for bond order. Defaults to 'order'.
    - standard_key (str): Edge attribute key for standard order information;
    it is read directly from every selected edge. Defaults to 'standard_order'.

    Returns:
    - nx.Graph: A new graph representing the reaction center of the ITS.
    """

    def _selected_attributes(node):
        # Copy only the requested attributes that the node actually carries.
        present = ITS.nodes[node]
        return {key: present[key] for key in element_key if key in present}

    rc = nx.Graph()
    for first, second, attrs in ITS.edges(data=True):
        orders = attrs.get(bond_key, [None, None])
        if orders[0] != orders[1]:
            rc.add_node(first, **_selected_attributes(first))
            rc.add_node(second, **_selected_attributes(second))
            edge_attrs = {
                bond_key: attrs[bond_key],
                standard_key: attrs[standard_key],
            }
            rc.add_edge(first, second, **edge_attrs)
    return rc
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def its_decompose(its_graph: nx.Graph, nodes_share="typesGH", edges_share="order"):
    """
    Split an ITS graph into its two constituent graphs G and H.

    Parameters:
    - its_graph (nx.Graph): The integrated transition state (ITS) graph.
    - nodes_share (str): Node attribute key holding a pair of attribute
    sequences, one for G and one for H. Each sequence is read positionally as
    (element, aromatic, hcount, charge, neighbors).
    - edges_share (str): Edge attribute key holding a pair of bond orders, one
    for G and one for H.

    Returns:
    - Tuple[nx.Graph, nx.Graph]: A tuple containing the two graphs G and H.
    """

    def _attach(target, node, packed):
        # Unpack the positional attributes; the node id doubles as atom_map.
        target.add_node(
            node,
            element=packed[0],
            aromatic=packed[1],
            hcount=packed[2],
            charge=packed[3],
            neighbors=packed[4],
            atom_map=node,
        )

    G = nx.Graph()
    H = nx.Graph()

    # Nodes lacking the shared attribute are left out of both graphs.
    for node, attrs in its_graph.nodes(data=True):
        if nodes_share not in attrs:
            continue
        packed_g, packed_h = attrs[nodes_share]
        _attach(G, node, packed_g)
        _attach(H, node, packed_h)

    # An edge is added to a side only when its order there is positive.
    for u, v, attrs in its_graph.edges(data=True):
        if edges_share not in attrs:
            continue
        order_g, order_h = attrs[edges_share]
        for target, order in ((G, order_g), (H, order_h)):
            if order > 0:
                target.add_edge(u, v, order=order)

    return G, H
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def compare_graphs(
    graph1: nx.Graph,
    graph2: nx.Graph,
    node_attrs: list = ["element", "aromatic", "hcount", "charge", "neighbors"],
    edge_attrs: list = ["order"],
) -> bool:
    """
    Check whether two graphs agree on nodes, edges and the selected attributes.

    Nodes are matched by identifier (no isomorphism search is performed), and
    attributes missing on a node or edge are compared as None.

    Parameters:
    - graph1 (nx.Graph): The first graph to compare.
    - graph2 (nx.Graph): The second graph to compare.
    - node_attrs (list): Node attribute names to include in the comparison.
    - edge_attrs (list): Edge attribute names to include in the comparison.

    Returns:
    - bool: True if both graphs are identical with respect to the specified
    attributes, otherwise False.
    """

    def _project(attributes, keys):
        # Restrict an attribute mapping to `keys`, filling gaps with None.
        return {key: attributes.get(key, None) for key in keys}

    # Same node identifiers on both sides?
    if set(graph1.nodes()) != set(graph2.nodes()):
        return False

    # Same selected attributes on every node?
    for node in graph1.nodes():
        if node not in graph2:
            return False
        mine = _project(graph1.nodes[node], node_attrs)
        theirs = _project(graph2.nodes[node], node_attrs)
        if mine != theirs:
            return False

    # Same edges, ignoring endpoint order?
    edges_first = {tuple(sorted(pair)) for pair in graph1.edges()}
    edges_second = {tuple(sorted(pair)) for pair in graph2.edges()}
    if edges_first != edges_second:
        return False

    # Same selected attributes on every edge?
    for pair in graph1.edges():
        canonical = tuple(sorted(pair))
        if canonical not in graph2.edges():
            return False
        mine = _project(graph1.edges[pair], edge_attrs)
        theirs = _project(graph2.edges[canonical], edge_attrs)
        if mine != theirs:
            return False

    return True
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def enumerate_tautomers(reaction_smiles: str) -> Optional[List[str]]:
    """
    List reaction SMILES built from the tautomers of the reactant side.

    The input is split on '>>' into reactants and products. Tautomers of the
    reactant molecule are enumerated with RDKit and each one is combined with
    the product SMILES written back by RDKit. The original reaction SMILES is
    always the first element of the result.

    Parameters:
    - reaction_smiles (str): A SMILES string of the reaction formatted as
    'reactants>>products'.

    Returns:
    - List[str]: The original reaction SMILES followed by one reaction SMILES
    per enumerated reactant tautomer. If anything goes wrong (malformed input,
    unparsable SMILES, ...) the error is printed and a list holding only the
    original reaction SMILES is returned; no exception reaches the caller.
    """
    try:
        lhs_smiles, rhs_smiles = reaction_smiles.split(">>")

        lhs_mol = Chem.MolFromSmiles(lhs_smiles)
        rhs_mol = Chem.MolFromSmiles(rhs_smiles)
        if lhs_mol is None or rhs_mol is None:
            # Handled by the outer except like any other failure.
            raise ValueError(
                "Invalid SMILES string provided for reactants or products."
            )

        enumerator = rdMolStandardize.TautomerEnumerator()

        # If enumeration fails, continue with the reactant molecule as is.
        try:
            lhs_variants = enumerator.Enumerate(lhs_mol)
        except Exception as e:
            print(f"An error occurred: {e}")
            lhs_variants = [lhs_mol]

        lhs_texts = [Chem.MolToSmiles(variant) for variant in lhs_variants]
        rhs_text = Chem.MolToSmiles(rhs_mol)

        combined = [lhs_text + ">>" + rhs_text for lhs_text in lhs_texts]
        # The untouched input always leads the list, even with no tautomers.
        return [reaction_smiles] + combined

    except Exception as e:
        print(f"An error occurred: {e}")
        return [reaction_smiles]
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def mapping_success_rate(list_mapping_data):
    """
    Compute the percentage of entries that contain at least one atom-map label.

    An entry counts as mapped when it contains a colon followed by digits
    (e.g. ":12").

    Parameters:
    - list_mapping_data (list of str): Strings to be searched for atom mappings.

    Returns:
    - float: Share of mapped entries as a percentage, rounded to two decimals.

    Raises:
    - ValueError: If the input list is empty.
    """
    if not list_mapping_data:
        raise ValueError("The input list is empty, cannot calculate success rate.")

    finder = re.compile(r":\d+")
    hits = 0
    for entry in list_mapping_data:
        if finder.search(entry):
            hits += 1

    return round(100 * (hits / len(list_mapping_data)), 2)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def expand_hydrogens(graph: nx.Graph) -> nx.Graph:
    """
    Return a copy of the graph in which implicit hydrogens are explicit nodes.

    For every node whose 'hcount' attribute is greater than zero, that many
    hydrogen nodes are added and bonded to it. New atom-map numbers continue
    from the largest 'atom_map' value found in the input graph.

    Parameters
    - graph (nx.Graph): A molecular graph whose nodes all carry an 'atom_map'
    attribute and may carry an 'hcount' attribute.

    Returns:
    - nx.Graph: A new graph with hydrogen atoms expanded; the input graph is
    not modified.
    """
    expanded = graph.copy()

    # Highest atom-map number in use; numbering starts at 0 for an empty graph.
    if graph.nodes:
        last_map = max(attrs["atom_map"] for _, attrs in graph.nodes(data=True))
    else:
        last_map = 0

    for parent, attrs in graph.nodes(data=True):
        hydrogens = attrs.get("hcount", 0)
        if not hydrogens > 0:
            continue
        for _ in range(hydrogens):
            last_map += 1
            # The new atom-map number is also used as the node identifier.
            # NOTE(review): assumes node ids coincide with atom_map values;
            # otherwise an existing node could be overwritten - confirm.
            expanded.add_node(last_map, element="H", charge=0, atom_map=last_map)
            expanded.add_edge(
                parent, last_map, order=(1.0, 1.0), standard_order=0.0
            )

    return expanded
|