synkit 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63):
  1. synkit/Chem/Fingerprint/__init__.py +0 -0
  2. synkit/Chem/Fingerprint/fp_calculator.py +122 -0
  3. synkit/Chem/Fingerprint/smiles_featurizer.py +185 -0
  4. synkit/Chem/Fingerprint/transformation_fp.py +79 -0
  5. synkit/Chem/Molecule/__init__.py +0 -0
  6. synkit/Chem/Molecule/standardize.py +137 -0
  7. synkit/Chem/Reaction/__init__.py +0 -0
  8. synkit/Chem/Reaction/balance_check.py +162 -0
  9. synkit/Chem/Reaction/cleanning.py +59 -0
  10. synkit/Chem/Reaction/deionize.py +289 -0
  11. synkit/Chem/Reaction/neutralize.py +256 -0
  12. synkit/Chem/Reaction/reagent.py +102 -0
  13. synkit/Chem/Reaction/standardize.py +157 -0
  14. synkit/Chem/Reaction/tautomerize.py +168 -0
  15. synkit/Graph/Cluster/__init__.py +0 -0
  16. synkit/Graph/Cluster/morphism.py +83 -0
  17. synkit/Graph/Feature/__init__.py +0 -0
  18. synkit/Graph/Feature/graph_descriptors.py +325 -0
  19. synkit/Graph/Feature/graph_fps.py +97 -0
  20. synkit/Graph/Feature/graph_signature.py +236 -0
  21. synkit/Graph/Feature/hash_fps.py +130 -0
  22. synkit/Graph/Feature/morgan_fps.py +87 -0
  23. synkit/Graph/Feature/path_fps.py +82 -0
  24. synkit/Graph/__init.py +0 -0
  25. synkit/IO/__init__.py +0 -0
  26. synkit/IO/chem_converter.py +231 -0
  27. synkit/IO/data_io.py +277 -0
  28. synkit/IO/data_process.py +49 -0
  29. synkit/IO/debug.py +78 -0
  30. synkit/IO/dg_to_gml.py +124 -0
  31. synkit/IO/gml_to_nx.py +119 -0
  32. synkit/IO/graph_to_mol.py +110 -0
  33. synkit/IO/mol_to_graph.py +282 -0
  34. synkit/IO/nx_to_gml.py +200 -0
  35. synkit/IO/parse_rule.py +172 -0
  36. synkit/IO/smiles_to_id.py +119 -0
  37. synkit/ITS/_misc.py +280 -0
  38. synkit/ITS/aam_validator.py +254 -0
  39. synkit/ITS/its_builder.py +94 -0
  40. synkit/ITS/its_construction.py +213 -0
  41. synkit/ITS/normalize_aam.py +183 -0
  42. synkit/ITS/partial_expand.py +170 -0
  43. synkit/Reactor/__init__.py +0 -0
  44. synkit/Reactor/core_engine.py +164 -0
  45. synkit/Reactor/inference.py +73 -0
  46. synkit/Reactor/multi_step.py +227 -0
  47. synkit/Reactor/multi_step_aam.py +82 -0
  48. synkit/Reactor/reagent.py +95 -0
  49. synkit/Reactor/rule_apply.py +81 -0
  50. synkit/Vis/__init__.py +0 -0
  51. synkit/Vis/chemical_graph_visualizer.py +378 -0
  52. synkit/Vis/chemical_reaction_visualizer.py +133 -0
  53. synkit/Vis/chemical_space.py +83 -0
  54. synkit/Vis/embedding.py +92 -0
  55. synkit/Vis/graph_visualizer.py +286 -0
  56. synkit/Vis/pdf_writer.py +143 -0
  57. synkit/Vis/rsmi_to_fig.py +169 -0
  58. synkit/__init__.py +0 -0
  59. synkit/_misc.py +181 -0
  60. synkit-0.0.1.dist-info/METADATA +148 -0
  61. synkit-0.0.1.dist-info/RECORD +63 -0
  62. synkit-0.0.1.dist-info/WHEEL +4 -0
  63. synkit-0.0.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,172 @@
1
+ import re
2
+
3
# Regex patterns for single-line GML node and edge entries.
# NODE_REGEX captures (id, label); the label must consist of word characters
# only, so a label containing other characters (e.g. "O-") does not match.
NODE_REGEX = re.compile(r'node \[ id (\d+) label "(\w+)" \]')
# EDGE_REGEX captures (source, target, label); the label is matched
# non-greedily up to the closing quote that precedes " ]".
EDGE_REGEX = re.compile(r'edge \[ source (\d+) target (\d+) label "(.+?)" \]')
6
+
7
+
8
def find_block(lines, keyword):
    """
    Locate a bracketed block (e.g. "left [", "context [") in GML lines.

    Parameters:
    - lines (iterable of str): The GML content, one entry per line.
    - keyword (str): Prefix that the stripped opening line must start with.

    Returns:
    - tuple: (start_idx, end_idx) of the opening line and of the line holding
      the matching "]", or (None, None) when no complete block is found.
    """
    numbered = enumerate(lines)

    # Phase 1: advance to the first line that opens the requested block.
    opening = None
    for idx, raw in numbered:
        if raw.strip().startswith(keyword):
            opening = idx
            break
    if opening is None:
        return None, None

    # Phase 2: keep consuming the same iterator, tracking nesting depth.
    # Only lines ending in "[" open a level and only a bare "]" closes one,
    # so single-line entries such as 'node [ ... ]' leave the depth untouched.
    nesting = 1
    for idx, raw in numbered:
        text = raw.strip()
        if text.endswith("["):
            nesting += 1
        elif text == "]":
            nesting -= 1
            if nesting == 0:
                return opening, idx

    # The block was opened but never closed.
    return None, None
29
+
30
+
31
def get_nodes_from_edges(block_lines):
    """
    Collect the node IDs referenced by edge entries in a block.

    Parameters:
    - block_lines (iterable of str): Lines of one GML block.

    Returns:
    - set: The source and target IDs (as strings) of every line that matches
      EDGE_REGEX.
    """
    endpoints = set()
    for raw in block_lines:
        hit = EDGE_REGEX.search(raw.strip())
        if hit:
            # Groups 1 and 2 are the source and target IDs; the label is unused.
            endpoints.add(hit.group(1))
            endpoints.add(hit.group(2))
    return endpoints
43
+
44
+
45
def parse_context(context_lines, node_regex=None, edge_regex=None):
    """
    Parse the context lines to identify nodes and edges.

    Parameters:
    - context_lines (iterable of str): Lines of the 'context' block.
    - node_regex (compiled pattern, optional): Pattern whose groups are
      (id, label). Defaults to the module-level NODE_REGEX.
    - edge_regex (compiled pattern, optional): Pattern whose groups are
      (source, target, label). Defaults to the module-level EDGE_REGEX.

    Returns:
    - tuple: (context_nodes, context_edges) where context_nodes maps
      node_id -> label and context_edges is a list of (source, target, label).
    """
    # Honour caller-supplied patterns; previously these arguments were ignored.
    node_regex = NODE_REGEX if node_regex is None else node_regex
    edge_regex = EDGE_REGEX if edge_regex is None else edge_regex

    context_nodes = {}
    context_edges = []
    for line in context_lines:
        stripped = line.strip()
        nm = node_regex.search(stripped)
        if nm:
            nid, lbl = nm.groups()
            context_nodes[nid] = lbl
            continue
        # Only lines that are not node entries are tried as edges.
        em = edge_regex.search(stripped)
        if em:
            source, target, label = em.groups()
            context_edges.append((source, target, label))
    return context_nodes, context_edges
67
+
68
+
69
def filter_context(context_lines, relevant_nodes):
    """
    Drop hydrogen nodes that are not relevant, together with their edges.

    Parameters:
    - context_lines (list of str): Lines of the 'context' block.
    - relevant_nodes (set): Node IDs that must be kept even if labelled "H".

    Returns:
    - list: The context lines that survive, in their original order. Lines
      that are neither node nor edge entries (e.g. "context [" or "]") are
      always kept.
    """
    labelled_nodes, _ = parse_context(context_lines)

    # Hydrogens outside the relevant set are the only nodes ever removed.
    doomed = {
        node_id
        for node_id, label in labelled_nodes.items()
        if label == "H" and node_id not in relevant_nodes
    }

    def _keep(raw):
        text = raw.strip()
        node_hit = NODE_REGEX.search(text)
        if node_hit:
            return node_hit.group(1) not in doomed
        edge_hit = EDGE_REGEX.search(text)
        if edge_hit:
            # An edge is dropped as soon as either endpoint is removed.
            return not (edge_hit.group(1) in doomed or edge_hit.group(2) in doomed)
        return True

    return [raw for raw in context_lines if _keep(raw)]
105
+
106
+
107
def strip_context(gml_text: str, remove_all: bool = True) -> str:
    """
    Filter the 'context' section of a GML-like reaction rule.

    Hydrogen nodes in 'context' whose IDs do not occur in edges of both the
    'left' and the 'right' section are removed together with their edges.
    When remove_all is True, every remaining edge entry in 'context' is
    removed as well. All lines outside the 'context' block are left untouched.

    Parameters:
    - gml_text (str): GML-like content describing a chemical reaction rule.
    - remove_all (bool): If True, additionally remove all edges from 'context'.

    Returns:
    - str: The GML content with the filtered 'context' section, or the input
      unchanged when any of the rule/left/context/right blocks is not found.
    """
    lines = gml_text.split("\n")

    # Locate the four sections in a fixed order: rule, left, context, right.
    bounds = [
        find_block(lines, keyword)
        for keyword in ("rule [", "left [", "context [", "right [")
    ]
    if any(idx is None for pair in bounds for idx in pair):
        return gml_text
    _, (left_lo, left_hi), (ctx_lo, ctx_hi), (right_lo, right_hi) = bounds

    # Nodes that take part in edges on both sides of the rule are "relevant".
    # fmt: off
    shared_nodes = (
        get_nodes_from_edges(lines[left_lo: left_hi + 1])
        & get_nodes_from_edges(lines[right_lo: right_hi + 1])
    )
    kept_context = filter_context(lines[ctx_lo: ctx_hi + 1], shared_nodes)
    # fmt: on

    if remove_all:
        # Keep node entries and structural lines; discard every edge entry.
        kept_context = [
            raw for raw in kept_context if not EDGE_REGEX.search(raw.strip())
        ]

    # Splice the filtered context back in place of the original block.
    # fmt: off
    rebuilt = lines[:ctx_lo] + kept_context + lines[ctx_hi + 1:]
    # fmt: on
    return "\n".join(rebuilt)
@@ -0,0 +1,119 @@
1
+ import time
2
+ import requests
3
+ import urllib.parse
4
+ from typing import List
5
+ from joblib import Parallel, delayed
6
+
7
+
8
def smiles_to_iupac(smiles_string: str, timeout: int = 1):
    """
    Converts a SMILES string to its corresponding IUPAC name(s) using the PubChem PUG REST API.

    Parameters:
    - smiles_string (str): The SMILES string of the compound (e.g., "C=O" for formaldehyde).
    - timeout (int, optional): The timeout in seconds for the request. Default is 1 second.

    Returns:
    - list: A list of IUPAC names associated with the SMILES string. Returns an empty list
      if none are found or if the request ultimately fails.
    """
    # The SMILES is sent as a URL argument instead of a path segment: SMILES may
    # contain "/" (stereo bonds), which urllib.parse.quote leaves unescaped by
    # default and which would otherwise be read as a path separator.
    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/property/IUPACName/JSON"
    query = {"smiles": smiles_string}

    retries = 3  # Number of attempts in case of failure
    delay = 2  # Delay between attempts (in seconds)

    for attempt in range(retries):
        try:
            response = requests.get(url, params=query, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses

            data = response.json()

            # Extract the IUPAC name(s) from the response
            properties = data.get("PropertyTable", {}).get("Properties", [])

            if not properties:
                print(f"No properties found for SMILES: {smiles_string}")
                return []

            iupac_names = [
                prop.get("IUPACName") for prop in properties if prop.get("IUPACName")
            ]

            if iupac_names:
                return iupac_names
            else:
                print(f"No IUPAC name found for SMILES: {smiles_string}")
                return []

        except (requests.exceptions.RequestException, ValueError, KeyError) as e:
            print(
                f"Attempt {attempt + 1} failed for SMILES: {smiles_string}, Error: {e}"
            )
            # A 4xx answer (other than 429 "too many requests") will not change
            # on retry, so do not sleep and retry for it.
            status = getattr(getattr(e, "response", None), "status_code", None)
            permanent = status is not None and 400 <= status < 500 and status != 429
            if attempt < retries - 1 and not permanent:
                time.sleep(delay)  # Wait before retrying
            else:
                print(f"Final failure for SMILES: {smiles_string}")
                return []

    return []
64
+
65
+
66
def batch_process_smiles(smiles_batch: List[str], timeout=1):
    """
    Look up IUPAC names for every SMILES string of a batch, one after another.

    Parameters:
    - smiles_batch (list): A list of SMILES strings to process.
    - timeout (int): Timeout for each request (in seconds).

    Returns:
    - list: One smiles_to_iupac result per input SMILES, in input order.
    """
    names_per_smiles = []
    for smi in smiles_batch:
        names_per_smiles.append(smiles_to_iupac(smi, timeout))
    return names_per_smiles
78
+
79
+
80
def get_iupac_for_smiles_list(
    smiles_list: List[str], batch_size=10, n_jobs=4, timeout=1
):
    """
    Map SMILES strings to IUPAC names via PubChem, processing batches in parallel.

    Parameters:
    smiles_list (list): A list of SMILES strings to be converted to IUPAC names.
    batch_size (int): Number of SMILES strings to process in each batch.
    n_jobs (int): Number of parallel jobs to run for batch processing.
    timeout (int): Timeout for requests (in seconds).

    Returns:
    dict: A dictionary with SMILES as keys and lists of IUPAC names as values.
    """
    # Cut the input into consecutive chunks of at most batch_size entries.
    # fmt: off
    chunks = [
        smiles_list[start: start + batch_size]
        for start in range(0, len(smiles_list), batch_size)
    ]
    # fmt: on

    # One joblib task per chunk; results come back in chunk order.
    tasks = (delayed(batch_process_smiles)(chunk, timeout) for chunk in chunks)
    per_chunk = Parallel(n_jobs=n_jobs)(tasks)

    # Flatten chunk results and pair them with the original SMILES order.
    iupac_dict = {}
    flat_names = (names for chunk_result in per_chunk for names in chunk_result)
    for smi, names in zip(smiles_list, flat_names):
        iupac_dict[smi] = names
    return iupac_dict
112
+
113
+
114
# Example of usage. Guarded so that importing this module does not fire
# network requests or spawn joblib workers as a side effect.
if __name__ == "__main__":
    smiles_list = ["CCO", "C=O", "CC(=O)O", "C1=CC=CC=C1", "C2H6O", "C4H10", "C5H12"]
    iupac_results = get_iupac_for_smiles_list(smiles_list, batch_size=3, n_jobs=2)

    for smiles, iupac_names in iupac_results.items():
        print(f"SMILES: {smiles} => IUPAC Names: {iupac_names}")
synkit/ITS/_misc.py ADDED
@@ -0,0 +1,280 @@
1
+ import re
2
+ import networkx as nx
3
+ from rdkit import Chem
4
+ from rdkit.Chem.MolStandardize import rdMolStandardize
5
+
6
+ from typing import Optional, List
7
+
8
+
9
def get_rc(
    ITS: nx.Graph,
    element_key: list = ["element", "charge", "typesGH", "atom_map"],
    bond_key: str = "order",
    standard_key: str = "standard_order",
) -> nx.Graph:
    """
    Build the reaction center (RC) graph of an ITS graph, i.e. the subgraph
    formed by the edges whose two bond-order entries differ.

    Parameters:
    - ITS (nx.Graph): The ITS graph to extract the RC from.
    - element_key (list): Node attribute keys copied onto RC nodes when present.
    Defaults to ['element', 'charge', 'typesGH', 'atom_map'].
    - bond_key (str): Edge attribute key for bond order. Defaults to 'order'.
    - standard_key (str): Edge attribute key for standard order information.
    Defaults to 'standard_order'.

    Returns:
    - nx.Graph: A new graph representing the reaction center of the ITS.
    """

    def _node_payload(atom):
        # Copy only the requested attributes that the node actually carries.
        attrs = ITS.nodes[atom]
        return {key: attrs[key] for key in element_key if key in attrs}

    reaction_center = nx.Graph()
    for u, v, attrs in ITS.edges(data=True):
        # Edges without a bond-order entry compare None to None and are skipped.
        orders = attrs.get(bond_key, [None, None])
        if orders[0] != orders[1]:
            for atom in (u, v):
                reaction_center.add_node(atom, **_node_payload(atom))
            edge_payload = {
                bond_key: attrs[bond_key],
                standard_key: attrs[standard_key],
            }
            reaction_center.add_edge(u, v, **edge_payload)
    return reaction_center
43
+
44
+
45
def its_decompose(its_graph: nx.Graph, nodes_share="typesGH", edges_share="order"):
    """
    Split an ITS graph into the two graphs G and H it was built from.

    Parameters:
    - its_graph (nx.Graph): The integrated transition state (ITS) graph.
    - nodes_share (str): Node attribute key that stores a pair of attribute
    sequences, one for G and one for H.
    - edges_share (str): Edge attribute key that stores a pair of bond orders,
    one for G and one for H.

    Returns:
    - Tuple[nx.Graph, nx.Graph]: A tuple containing the two graphs G and H.
    """
    G, H = nx.Graph(), nx.Graph()

    # Positional layout of each per-graph node attribute sequence.
    fields = ("element", "aromatic", "hcount", "charge", "neighbors")

    # Nodes: those lacking the shared attribute are left out of both graphs.
    for node, data in its_graph.nodes(data=True):
        if nodes_share not in data:
            continue
        side_g, side_h = data[nodes_share]
        for target, side in ((G, side_g), (H, side_h)):
            unpacked = {name: side[pos] for pos, name in enumerate(fields)}
            target.add_node(node, **unpacked, atom_map=node)

    # Edges: an order that is not greater than 0 means the bond is absent there.
    for u, v, data in its_graph.edges(data=True):
        if edges_share not in data:
            continue
        order_g, order_h = data[edges_share]
        if order_g > 0:
            G.add_edge(u, v, order=order_g)
        if order_h > 0:
            H.add_edge(u, v, order=order_h)

    return G, H
98
+
99
+
100
def compare_graphs(
    graph1: nx.Graph,
    graph2: nx.Graph,
    node_attrs: list = ["element", "aromatic", "hcount", "charge", "neighbors"],
    edge_attrs: list = ["order"],
) -> bool:
    """
    Check whether two graphs agree on nodes, edges and selected attributes.

    Parameters:
    - graph1 (nx.Graph): The first graph to compare.
    - graph2 (nx.Graph): The second graph to compare.
    - node_attrs (list): Node attribute names included in the comparison.
    - edge_attrs (list): Edge attribute names included in the comparison.

    Returns:
    - bool: True if both graphs have the same node IDs, the same edges and equal
    values for the listed attributes (a missing attribute counts as None),
    otherwise False.
    """

    def _project(attrs, keys):
        # Restrict an attribute mapping to the compared keys.
        return {key: attrs.get(key, None) for key in keys}

    # Node IDs must coincide exactly.
    if set(graph1.nodes()) != set(graph2.nodes()):
        return False

    # Per-node attribute comparison.
    for node in graph1.nodes():
        if node not in graph2:
            return False
        first = _project(graph1.nodes[node], node_attrs)
        second = _project(graph2.nodes[node], node_attrs)
        if first != second:
            return False

    # Edge sets are compared with endpoints sorted, so orientation is ignored.
    canonical1 = {tuple(sorted(pair)) for pair in graph1.edges()}
    canonical2 = {tuple(sorted(pair)) for pair in graph2.edges()}
    if canonical1 != canonical2:
        return False

    # Per-edge attribute comparison.
    for pair in graph1.edges():
        canonical = tuple(sorted(pair))
        if canonical not in graph2.edges():
            return False
        first = _project(graph1.edges[pair], edge_attrs)
        second = _project(graph2.edges[canonical], edge_attrs)
        if first != second:
            return False

    return True
152
+
153
+
154
def enumerate_tautomers(reaction_smiles: str) -> Optional[List[str]]:
    """
    Enumerates tautomers of the reactant side of a reaction SMILES and pairs each
    of them with the product side written back through RDKit.

    Parameters:
    - reaction_smiles (str): A SMILES string of the reaction formatted as
    'reactants>>products'.

    Returns:
    - List[str]: The original reaction SMILES followed by one reaction SMILES per
    enumerated reactant tautomer. If the input cannot be split or parsed, or any
    other error occurs, the error is printed and a list holding only the original
    reaction SMILES is returned.
    """
    try:
        # Exactly one ">>" separator is required; anything else fails unpacking.
        reactant_part, product_part = reaction_smiles.split(">>")

        reactant_mol = Chem.MolFromSmiles(reactant_part)
        product_mol = Chem.MolFromSmiles(product_part)
        if reactant_mol is None or product_mol is None:
            raise ValueError(
                "Invalid SMILES string provided for reactants or products."
            )

        enumerator = rdMolStandardize.TautomerEnumerator()

        # Fall back to the untouched reactant when enumeration itself fails.
        try:
            tautomers = enumerator.Enumerate(reactant_mol)
        except Exception as e:
            print(f"An error occurred: {e}")
            tautomers = [reactant_mol]

        # Reactant strings are generated before the product string.
        tautomer_smiles = [Chem.MolToSmiles(mol) for mol in tautomers]
        product_smiles = Chem.MolToSmiles(product_mol)

        variants = [smi + ">>" + product_smiles for smi in tautomer_smiles]
        if not variants:
            return [reaction_smiles]
        # The input reaction always comes first in the result.
        return [reaction_smiles, *variants]

    except Exception as e:
        print(f"An error occurred: {e}")
        return [reaction_smiles]
217
+
218
+
219
def mapping_success_rate(list_mapping_data):
    """
    Compute the percentage of entries that contain an atom-map annotation,
    i.e. a colon directly followed by at least one digit.

    Parameters:
    - list_mapping_data (list of str): Strings to be searched for atom mappings.

    Returns:
    - float: Share of entries with an atom mapping, in percent, rounded to two
    decimal places.

    Raises:
    - ValueError: If the input list is empty.
    """
    if not list_mapping_data:
        raise ValueError("The input list is empty, cannot calculate success rate.")

    atom_map_pattern = re.compile(r":\d+")
    hits = 0
    for entry in list_mapping_data:
        if atom_map_pattern.search(entry):
            hits += 1

    return round(100 * (hits / len(list_mapping_data)), 2)
244
+
245
+
246
def expand_hydrogens(graph: nx.Graph) -> nx.Graph:
    """
    Return a copy of the graph in which every node with a positive 'hcount'
    gets that many explicit hydrogen nodes attached.

    Parameters
    - graph (nx.Graph): A molecular graph; every node must carry 'atom_map',
    and 'hcount' is read when present.

    Returns:
    - nx.Graph: A new graph with hydrogen atoms expanded. The input graph is
    not modified.
    """
    expanded = graph.copy()

    # New hydrogens are numbered upward from the largest existing atom map.
    # NOTE(review): the new node ID equals the new atom-map number, which
    # presumes node IDs and atom maps do not clash - confirm against callers.
    next_map = 0
    if graph.nodes:
        next_map = max(attrs["atom_map"] for _, attrs in graph.nodes(data=True))

    # Iterate the original graph so freshly added hydrogens are not revisited.
    for parent, attrs in graph.nodes(data=True):
        hcount = attrs.get("hcount", 0)
        if hcount > 0:
            for _ in range(hcount):
                next_map += 1
                expanded.add_node(
                    next_map, element="H", charge=0, atom_map=next_map
                )
                expanded.add_edge(
                    parent, next_map, order=(1.0, 1.0), standard_order=0.0
                )

    return expanded