chemrecon 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. chemrecon/__init__.py +73 -0
  2. chemrecon/chem/__init__.py +0 -0
  3. chemrecon/chem/chemreaction.py +223 -0
  4. chemrecon/chem/constant_compounds.py +3 -0
  5. chemrecon/chem/create_mol.py +91 -0
  6. chemrecon/chem/elements.py +141 -0
  7. chemrecon/chem/gml/__init__.py +0 -0
  8. chemrecon/chem/gml/gml.py +324 -0
  9. chemrecon/chem/gml/gml_reactant_matching.py +130 -0
  10. chemrecon/chem/gml/gml_to_rdk.py +217 -0
  11. chemrecon/chem/mol.py +483 -0
  12. chemrecon/chem/sumformula.py +120 -0
  13. chemrecon/connection.py +97 -0
  14. chemrecon/core/__init__.py +0 -0
  15. chemrecon/core/id_types.py +687 -0
  16. chemrecon/core/ontology.py +209 -0
  17. chemrecon/core/populate_query_handler.py +336 -0
  18. chemrecon/core/query_handler.py +587 -0
  19. chemrecon/database/__init__.py +1 -0
  20. chemrecon/database/connect.py +63 -0
  21. chemrecon/database/connection_params/chemrecon_pub.dbinfo +5 -0
  22. chemrecon/database/connection_params/local_docker_dev.dbinfo +5 -0
  23. chemrecon/database/connection_params/local_docker_init.dbinfo +5 -0
  24. chemrecon/database/connection_params/local_docker_pub.dbinfo +5 -0
  25. chemrecon/database/params.py +88 -0
  26. chemrecon/entrygraph/draw.py +119 -0
  27. chemrecon/entrygraph/entrygraph.py +301 -0
  28. chemrecon/entrygraph/explorationprotocol.py +199 -0
  29. chemrecon/entrygraph/explore.py +421 -0
  30. chemrecon/entrygraph/explore_procedure.py +183 -0
  31. chemrecon/entrygraph/filter.py +88 -0
  32. chemrecon/entrygraph/scoring.py +141 -0
  33. chemrecon/query/__init__.py +26 -0
  34. chemrecon/query/create_entry.py +86 -0
  35. chemrecon/query/default_protocols.py +57 -0
  36. chemrecon/query/find_entry.py +84 -0
  37. chemrecon/query/get_relations.py +143 -0
  38. chemrecon/query/get_structures_from_compound.py +65 -0
  39. chemrecon/schema/__init__.py +86 -0
  40. chemrecon/schema/db_object.py +363 -0
  41. chemrecon/schema/direction.py +10 -0
  42. chemrecon/schema/entry_types/__init__.py +0 -0
  43. chemrecon/schema/entry_types/aam.py +34 -0
  44. chemrecon/schema/entry_types/aam_repr.py +37 -0
  45. chemrecon/schema/entry_types/compound.py +52 -0
  46. chemrecon/schema/entry_types/enzyme.py +49 -0
  47. chemrecon/schema/entry_types/molstructure.py +64 -0
  48. chemrecon/schema/entry_types/molstructure_repr.py +41 -0
  49. chemrecon/schema/entry_types/reaction.py +57 -0
  50. chemrecon/schema/enums.py +154 -0
  51. chemrecon/schema/procedural_relation_entrygraph.py +66 -0
  52. chemrecon/schema/relation_types_composed/__init__.py +0 -0
  53. chemrecon/schema/relation_types_composed/compound_has_molstructure_relation.py +59 -0
  54. chemrecon/schema/relation_types_composed/reaction_has_aam_relation.py +50 -0
  55. chemrecon/schema/relation_types_procedural/__init__.py +0 -0
  56. chemrecon/schema/relation_types_procedural/aam_convert_relation.py +69 -0
  57. chemrecon/schema/relation_types_procedural/compound_select_structure_proceduralrelation.py +36 -0
  58. chemrecon/schema/relation_types_procedural/compound_similarlity_proceduralrelation.py +1 -0
  59. chemrecon/schema/relation_types_procedural/molstructure_convert_relation.py +49 -0
  60. chemrecon/schema/relation_types_procedural/reaction_select_aam_proceduralrelation.py +38 -0
  61. chemrecon/schema/relation_types_procedural/reaction_similarity_proceduralrelation.py +1 -0
  62. chemrecon/schema/relation_types_source/__init__.py +0 -0
  63. chemrecon/schema/relation_types_source/aam_involves_molstructure_relation.py +77 -0
  64. chemrecon/schema/relation_types_source/aam_repr_involves_molstructure_repr_relation.py +79 -0
  65. chemrecon/schema/relation_types_source/compound_has_structure_representation_relation.py +33 -0
  66. chemrecon/schema/relation_types_source/compound_reference_relation.py +34 -0
  67. chemrecon/schema/relation_types_source/molstructure_standardisation_relation.py +71 -0
  68. chemrecon/schema/relation_types_source/ontology/__init__.py +0 -0
  69. chemrecon/schema/relation_types_source/ontology/compound_ontology.py +369 -0
  70. chemrecon/schema/relation_types_source/ontology/enzyme_ontology.py +142 -0
  71. chemrecon/schema/relation_types_source/ontology/reaction_ontology.py +140 -0
  72. chemrecon/schema/relation_types_source/reaction_has_aam_representation_relation.py +34 -0
  73. chemrecon/schema/relation_types_source/reaction_has_enzyme_relation.py +71 -0
  74. chemrecon/schema/relation_types_source/reaction_involves_compound_relation.py +69 -0
  75. chemrecon/schema/relation_types_source/reaction_reference_relation.py +33 -0
  76. chemrecon/scripts/initialize_database.py +494 -0
  77. chemrecon/utils/copy_signature.py +10 -0
  78. chemrecon/utils/encodeable_list.py +11 -0
  79. chemrecon/utils/get_id_type.py +70 -0
  80. chemrecon/utils/hungarian.py +31 -0
  81. chemrecon/utils/reactant_matching.py +168 -0
  82. chemrecon/utils/rxnutils.py +44 -0
  83. chemrecon/utils/set_cwd.py +12 -0
  84. chemrecon-0.1.1.dist-info/METADATA +143 -0
  85. chemrecon-0.1.1.dist-info/RECORD +86 -0
  86. chemrecon-0.1.1.dist-info/WHEEL +4 -0
@@ -0,0 +1,324 @@
1
+ """ Code to handle S_GML from Juri's M-CSA tool
2
+ """
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Optional
7
+
8
+ import matplotlib.pyplot as plt
9
+ import networkx as nx
10
+ from rdkit import Chem as rdk
11
+
12
+ from chemrecon.chem.mol import Mol
13
+
14
# Regex patterns

# Matches one complete rule held on a single line (GMLStep strips newlines and tabs before matching).
# The `ruleID "..."`, `left [...]` and `right [...]` sections are optional; `context [...]` is required.
# Capture groups: 1 = contents of `left`, 2 = contents of `context`, 3 = contents of `right`
# (groups 1 and 3 are None when the corresponding section is absent).
rule_pattern = re.compile(
    r'^rule \[(?:ruleID \".*\")?(?:left \[(.*?)\])?context \[(.*?)\](?:right \[(.*?)\])?\]$'
)
# Matches an `id N label "..."` pair. Group 1 is the numeric id, group 2 is the label, which must be one of:
# an element-like symbol with an optional single digit and optional '+' / '-', an `Amino(...)` label,
# an `Alias(.,.)` label, or a bare `*`.
label_sub = re.compile(
    r'id (\d+) label \"([A-Z][a-z]?\d?\+?\-?|Amino\(..?, ..., \d+, [\w|*]\)|Alias\(.,.\)|\*)\"'
)
# Replacement template that rewrites the label as "<id>: <label>".
# NOTE(review): presumably applied via label_sub.sub(label_rep, ...); no use is visible in this file - confirm.
label_rep = r'id \g<1> label "\g<1>: \g<2>"'
22
+
23
+
24
def sanitise_gml(gml: str) -> str:
    """ Return ``gml`` as a string with every newline and tab character removed.

    The input is first coerced to ``str`` via string formatting, so non-string inputs are accepted.
    """
    # str.replace returns a new string (strings are immutable), so the result has to be returned;
    # calling it for its side effect leaves the text untouched.
    return f'{gml}'.replace('\n', '').replace('\t', '')
28
+
29
+
30
class GMLException(Exception):
    """ Generic error type for S_GML handling. """
32
+
33
+
34
class GMLEmptyException(Exception):
    """ Signals that a graph ended up with no nodes (raised by remove_residues). """
36
+
37
+
38
class GMLEntry:
    """ One entry: a list of proposals (each a list of steps) plus an atom-atom map for the first proposal.
    """
    entry_index: int
    proposals: list[list[GMLStep]]
    map: dict[int, int]  # For proposal 0

    def __init__(self, index: int):
        self.proposals = []
        self.entry_index = index

    def add_step(self, proposal_index: int, step: GMLStep):
        """ Append ``step`` to the proposal numbered ``proposal_index`` (1-based), creating empty
            proposals as needed so that the slot exists.
        """
        # Grow the proposals list until it has at least `proposal_index` entries
        for _ in range(len(self.proposals), proposal_index):
            self.proposals.append([])

        # Proposal numbers are 1-based, list positions 0-based
        target = self.proposals[proposal_index - 1]
        target.append(step)

    def set_map(self, aam: dict[int, int]):
        """ Store the atom-atom map, then check it against the left and right graphs.
        """
        self.map = aam
        left, right = self.left_graph(), self.right_graph()
        sanity_check(left, right, self.map)

    # Getters always get proposal 0 (best star rating in M-CSA)
    def left_graph(self) -> GMLGraph:
        """ Left graph of the first step of the first proposal. """
        first_step = self.proposals[0][0]
        return first_step.left_graph()

    def right_graph(self) -> GMLGraph:
        """ Right graph of the last step of the first proposal. """
        last_step = self.proposals[0][-1]
        return last_step.right_graph()

    def atom_atom_map(self) -> dict[int, int]:
        """ The map stored by set_map. """
        return self.map
68
+
69
class GMLStep:
    """ A single step given as one S_GML rule, split into its left-hand and right-hand graphs.

    The rule text is stripped of newlines and tabs and matched against ``rule_pattern``. The left graph
    is built from the ``left`` and ``context`` sections, the right graph from ``context`` and ``right``.
    """
    step_index: int
    gml: str
    lhs: GMLGraph
    rhs: GMLGraph

    # Pure gml (left_gml / right_gml are None when the rule has no such section)
    left_gml: Optional[str]
    context_gml: str
    right_gml: Optional[str]

    def __init__(self, step_index: int, gml: str):
        """
        :param step_index: Index of this step.
        :param gml: The rule text; newlines and tabs are removed before parsing.
        :raises GMLException: If the text does not match ``rule_pattern``.
        """
        self.step_index = step_index
        self.gml = str(gml).replace('\n', '').replace('\t', '')

        # Make S_GML for educt and product sides
        match = rule_pattern.match(self.gml)
        if match is None:
            # Without this guard a malformed rule surfaces as an AttributeError on None.
            raise GMLException(f'Step {step_index}: S_GML does not match the expected rule format.')
        self.left_gml, self.context_gml, self.right_gml = match.groups()

        # A missing (or empty) side section leaves only the context on that side
        lhs = (f'graph [\n {self.left_gml} {self.context_gml} \n]'
               if self.left_gml else f'graph [\n{self.context_gml}\n]')
        rhs = (f'graph [\n {self.context_gml} {self.right_gml} \n]'
               if self.right_gml else f'graph [\n{self.context_gml}\n]')

        self.lhs = GMLGraph(gml = lhs)
        self.rhs = GMLGraph(gml = rhs)

    def left_graph(self) -> GMLGraph:
        """ Graph built from the left and context sections. """
        return self.lhs

    def right_graph(self) -> GMLGraph:
        """ Graph built from the context and right sections. """
        return self.rhs
107
+
108
+
109
class GMLGraph:
    """ Wrapper around a networkx graph together with its GML text and cached connected components.
    """
    graph: nx.Graph
    gml: str
    component_subsets: list[frozenset[int]]    # cache filled by get_component_subsets
    component_subgraphs: list[nx.Graph]        # cache filled by get_components_subgraphs

    def __init__(self, gml: str, graph: Optional[nx.Graph] = None):
        """
        :param gml: GML text. It is parsed (with node ids as node keys) unless a non-empty ``graph`` is given.
        :param graph: Optional pre-built graph to wrap instead of parsing ``gml``.
        """
        self.gml = sanitise_gml(gml)

        # TODO raise exception if called with empty graph

        # A graph without nodes is falsy, so (as before) an empty graph falls through to parsing `gml`.
        if graph is not None and len(graph) > 0:
            self.graph = graph
        else:
            # Make graph from gml (the text as passed in, not the sanitised copy)
            self.graph = nx.parse_gml(
                gml,
                label = 'id'
            )
        self.component_subgraphs = []
        self.component_subsets = []

    def get_component_subsets(self) -> list[frozenset[int]]:
        """ Node sets of the connected components; computed on first use and cached. """
        if not self.component_subsets:
            self.component_subsets = [
                frozenset(s) for s in nx.connected_components(self.graph)
            ]
        return self.component_subsets

    def get_components_subgraphs(self) -> list[nx.Graph]:
        """ Subgraphs of the connected components, in the same order as get_component_subsets. """
        if not self.component_subgraphs:
            self.component_subgraphs = [
                nx.subgraph(self.graph, compset) for compset in self.get_component_subsets()
            ]
        return self.component_subgraphs

    def remove_hydrogens(self):
        """ Replace the graph by a copy without the nodes labelled 'H' that have at least one edge.
        """
        g_ = self.graph.copy()
        to_remove = [
            n for n in g_.nodes()
            if g_.nodes[n]['label'] == 'H' and g_.degree[n] > 0
        ]
        g_.remove_nodes_from(to_remove)
        self.graph = g_

        # The cached components describe the previous graph; drop them so they are recomputed.
        self.component_subsets = []
        self.component_subgraphs = []

    def get_subgraph(self, indices: set[int]) -> nx.Graph:
        """ Subgraph induced by the given node indices. """
        return self.graph.subgraph(indices)

    def draw(self):
        """ Show the graph with matplotlib, labelling nodes as "<id>: <label>" and edges by their label.
        """
        # Draw with networkx and matplotlib - need to display atom numbers on label!
        plt.figure(1, figsize = (8, 8), dpi = 240)
        pos = nx.spring_layout(self.graph)

        # Graph structure and node labels
        nx.draw(
            self.graph, pos,
            with_labels = True,
            node_size = 50,
            font_size = 6,
            labels = {
                k: f"{k}: {n['label']}" for k, n in self.graph.nodes.items()
            }
        )

        # Edge labels
        edict = {(u, v): d for (u, v, d) in self.graph.edges.data('label')}

        nx.draw_networkx_edge_labels(
            self.graph, pos, edict, font_color = 'blue', font_size = 6
        )

        # Display graph
        plt.show()
        # TODO
192
+
193
+
194
+ # ChemGraph generation
195
+ def chemgraph_from_mol(mol: Mol) -> GMLGraph:
196
+ # Based on an RDK mol, create a ChemGraph object.
197
+ struct_graph = nx.Graph()
198
+ for atom in mol.mol.GetAtoms():
199
+ # Make the label based on element and charge
200
+ fcharge = atom.GetFormalCharge()
201
+ symbol = atom.GetSymbol()
202
+
203
+ # Do not include hydrogen
204
+ if symbol == 'H' and len(atom.GetNeighbors()) > 0:
205
+ continue
206
+
207
+ # Make label
208
+ label: str = 'ERR'
209
+ match fcharge:
210
+ case 0:
211
+ label = symbol
212
+ case -1:
213
+ label = f'{symbol}-'
214
+ case 1:
215
+ label = f'{symbol}+'
216
+ case _ if fcharge < 0:
217
+ label = f'{symbol}{fcharge}-'
218
+ case _ if fcharge >= 0:
219
+ label = f'{symbol}{fcharge}+'
220
+
221
+ struct_graph.add_node(
222
+ atom.GetIdx(),
223
+ label = label
224
+ )
225
+
226
+ for bond in mol.mol.GetBonds():
227
+ label: str = 'ERR'
228
+ match bond.GetBondType():
229
+ case rdk.BondType.SINGLE:
230
+ label = '-'
231
+ case rdk.BondType.DOUBLE:
232
+ label = '='
233
+ case rdk.BondType.TRIPLE:
234
+ label = '#'
235
+ case rdk.BondType.AROMATIC:
236
+ label = ':'
237
+ case _:
238
+ raise RuntimeError(f'Unimplemented bond-type {bond.GetBondType()}')
239
+
240
+ struct_graph.add_edge(
241
+ bond.GetBeginAtomIdx(),
242
+ bond.GetEndAtomIdx(),
243
+ label = label
244
+ )
245
+
246
+ # Finalize and add the graph to the list of graphs to match
247
+ cgraph = GMLGraph(
248
+ gml = '',
249
+ graph = struct_graph
250
+ )
251
+ return cgraph
252
+
253
+
254
+ # Entry serialisation
255
+ # ----------------------------------------------------------------------------------------------------------------------
256
def graph_to_gml(graph: nx.Graph, relabel_map: Optional[dict[int, int]] = None) -> str:
    """ Serialise the nodes and edges of ``graph`` as the bracketed 'inner' part of a GML graph
        (no leading ``graph`` keyword).

    :param graph: Graph whose nodes and edges all carry a 'label' attribute.
    :param relabel_map: Optional mapping from node key to the id to write; identity when omitted.
    """
    ids = relabel_map
    if ids is None:
        ids = {node: node for node in graph.nodes}

    chunks = ['[\n']
    # s = 'graph [\n'
    # Nodes
    for node in graph.nodes:
        text = graph.nodes[node]['label']
        chunks.append(f'\tnode [ id {ids[node]} label "{text}"]\n')

    # Edges
    for source, target in graph.edges:
        text = graph.edges[source, target]['label']
        chunks.append(f'\tedge [ source {ids[source]} target {ids[target]} label "{text}" ]\n')

    chunks.append('\n]')
    return ''.join(chunks)
275
+
276
+ # Reaction graph sanitation and handling -------------------------------------------------------------------------------
277
def remove_residues(
        chemgraph: GMLGraph
) -> GMLGraph:
    """ Return a new GMLGraph without the connected components that contain an amino-acid node,
        i.e. a node whose label starts with 'Amino'. The input graph is left untouched.

    :raises GMLEmptyException: If no nodes remain after the removal.
    """
    source: nx.Graph = chemgraph.graph
    doomed: set = set()

    for members in nx.connected_components(source):
        # One amino-acid node is enough to discard the whole component
        if any(source.nodes[v]['label'].startswith('Amino') for v in members):
            doomed |= members

    # Work on an independent copy so the caller's graph keeps all nodes
    stripped = source.copy(as_view = False)
    stripped.remove_nodes_from(doomed)
    if stripped.number_of_nodes() == 0:
        raise GMLEmptyException
    return GMLGraph(gml = '', graph = stripped)
301
+
302
+
303
+ # Misc
304
+ # --------------------------------------------------------------------------------------------------------------
305
+ # Matching functions for graph similarity
306
def nmatch(n1, n2):
    """ Node matcher: compare the 'label' entries of two node attribute dicts.
        Returns 1 (treated as a match) when either side has no 'label'.
    """
    try:
        first, second = n1['label'], n2['label']
    except KeyError:
        return 1
    return first == second
311
+
312
+
313
def ematch(e1, e2):
    """ Edge matcher: compare the 'label' entries of two edge attribute dicts.
        Returns 1 (treated as a match) when either side has no 'label'.
    """
    try:
        first, second = e1['label'], e2['label']
    except KeyError:
        return 1
    return first == second
318
+
319
def _label_without_charge(label: str) -> str:
    """ Strip a trailing charge suffix (sign characters and magnitude digits) from a node label. """
    return label.rstrip('+-0123456789')


def sanity_check(l_graph: GMLGraph, r_graph: GMLGraph, map: dict[int, int]):
    """ Check that every mapped pair of nodes carries the same label once the charge suffix is ignored.

    :param l_graph: Graph whose node indices are the keys of ``map``.
    :param r_graph: Graph whose node indices are the values of ``map``.
    :param map: Mapping from left node index to right node index.
    :raises AssertionError: On the first pair whose labels differ by more than their charge.
    """
    for l_index, r_index in map.items():
        l_label: str = l_graph.graph.nodes[l_index]['label']
        r_label: str = r_graph.graph.nodes[r_index]['label']
        # The charge magnitude digit is stripped too, so e.g. 'Fe2+' and 'Fe3+' count as the same atom.
        if _label_without_charge(l_label) != _label_without_charge(r_label):
            # Raised explicitly: an `assert` statement is removed when Python runs with -O.
            raise AssertionError('Wrong match')
@@ -0,0 +1,130 @@
1
+ from __future__ import annotations
2
+
3
+ import networkx as nx
4
+
5
+ from chem.elements import ChemGraph, chemgraph_from_mol, ematch, nmatch
6
+ from chem.elements import mol_from_structurerepresentation
7
+ from chemrecon.core.query_handler import QueryHandler
8
+ from chemrecon.schema.entry_types.compound import Compound
9
+ from chemrecon.schema.entry_types.reaction import Reaction
10
+ from chemrecon.schema.relation_types_source.compound_has_structure_representation_relation import \
11
+ CompoundHasStructureRepresentation
12
+ from chemrecon.schema.relation_types_source.reaction_involves_compound_relation import ReactionInvolvesCompound
13
+ from utils import hungarian as hungarian
14
+
15
+
16
+ # TODO change to use the general reactant_matching with ChemReactions
17
+
18
def match_reactants(
        handler: QueryHandler,
        procedure: Procedure,
        chemgraph: ChemGraph,
        reaction: Reaction,
        side: int  # -1 or 1
) -> dict[frozenset[int], tuple[Compound, int]]:
    """ Match the connected components of one side of a reaction's S_GML graph (enzymes already
        removed) to the reaction's compounds, using graph edit distance as the cost.

    Hydrogens are removed from ``chemgraph`` in place before matching. Compounds whose stoichiometry
    has the opposite sign of ``side``, or for which no structure representation is found, are left out.

    :return: Mapping from the node set of a component to the (compound, instance index) matched to it.
    """

    # TODO eventually replace the structure getting alg. with entry graph-based approach

    # Participant compounds, one (compound, stoichiometry) pair per relation
    involved = handler.get_relations_with_entries_by_recon_ids_of_t1(
        entry_type = Reaction,
        recon_ids = [reaction.recon_id],
        relation_type = ReactionInvolvesCompound,
    )[0]
    reaction_all_compounds: list[tuple[Compound, int]] = [
        (compound_entry, relation.n)
        for compound_entry, relations in involved.items()
        for relation in relations
    ]

    # One structure graph per compound on the requested side
    compound_graphs: dict[Compound, ChemGraph] = {}
    for compound_entry, stoich in reaction_all_compounds:
        # Only take compounds on the correct side
        if stoich * side < 0:
            continue

        # Get structures
        struct_reprs = handler.get_relations_with_entries_by_recon_ids_of_t1(
            entry_type = Compound,
            recon_ids = [compound_entry.recon_id],
            relation_type = CompoundHasStructureRepresentation
        )[0].keys()

        # For now, select an arbitrary structure (skip the compound when there is none)
        candidates = list(struct_reprs)
        if not candidates:
            continue

        # Convert the selected structure to a S_GML ChemGraph
        compound_graphs[compound_entry] = chemgraph_from_mol(
            mol_from_structurerepresentation(candidates[0])
        )

    # Components of the given chemgraph, without hydrogens
    chemgraph.remove_hydrogens()
    component_subsets: list[frozenset[int]] = chemgraph.get_component_subsets()
    component_subgraphs: list[nx.Graph] = chemgraph.get_components_subgraphs()

    # Log a notice if component numbers don't match; matching is still attempted
    n_components, n_compounds = len(component_subgraphs), len(compound_graphs)
    if n_components != n_compounds:
        # TODO Try to fix this by balancing cofactors etc
        # TODO May be extra hydrons, water etc
        # TODO - entry #6 has an extra water on both sides for instance
        procedure.log_notice(f'Could not get structures for all components.'
                             f' Reaction {reaction} (side {side}).'
                             f' Number of S_GML graph components ({len(component_subgraphs)})'
                             f' does not match number of reactants ({len(compound_graphs)}).')

    # Pairwise distance: for each compound, a list of (component index, distance)
    graph_distance_pairwise: dict[Compound, list[tuple[int, float]]] = {}
    for compound_entry, compound_chemgraph in compound_graphs.items():
        # nx.optimize_graph_edit_distance is a generator which generates successively more precise
        # values; only the first one is used.
        graph_distance_pairwise[compound_entry] = [
            (
                component_index,
                next(nx.optimize_graph_edit_distance(
                    compound_chemgraph.graph,
                    component,
                    node_match = nmatch,
                    edge_match = ematch
                ))
            )
            for component_index, component in enumerate(component_subgraphs)
        ]

    # We make duplicate instances of each compound to match if stoich != 1
    compound_instances: dict[tuple[Compound, int], ChemGraph] = {}
    for compound, stoich in reaction_all_compounds:
        if stoich * side < 0:
            continue
        for copy_index in range(abs(stoich)):
            # Compounds without a structure graph simply get no instances
            if compound in compound_graphs:
                compound_instances[(compound, copy_index)] = compound_graphs[compound]

    # Run the hungarian algorithm to get the best match with the instanced compounds
    edges: dict[tuple[tuple[Compound, int], int], float] = {
        (instance, component_index): distance
        for instance in compound_instances
        for component_index, distance in graph_distance_pairwise[instance[0]]
    }
    matching = hungarian.max_weight_matching(
        edges = edges,
        min_weight = True
    )
    # TODO extra logic to iterate the edit distance algorithm if the matching is 'close'?

    matching_dict: dict[frozenset[int], tuple[Compound, int]] = {
        component_subsets[component_index]: (compound, compound_index)
        for (compound, compound_index), component_index in matching.items()
    }

    return matching_dict
@@ -0,0 +1,217 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ import networkx as nx
6
+ from rdkit import Chem as rdk
7
+ from rdkit.Chem.rdChemReactions import ChemicalReaction
8
+
9
+ from chemrecon.chem.gml.gml import remove_residues, GMLGraph
10
+
11
+
12
# Splits an atom label into: group 1 = a capital letter followed by any lowercase letters (read as the
# element symbol), group 2 = an optional single digit (read as the charge magnitude), group 3 = an optional
# '+' or '-' sign. Used with .match(), so only the start of the label has to fit.
atom_regex = re.compile(r'([A-Z][a-z]*)([0-9])?([+\-])?')
13
+
14
+
15
+ # Molecule
16
def gml_to_rdkit_mol(gml: str) -> rdk.Mol:
    """ Convert a GML graph into an RDKit molecule.

    Each node becomes an atom whose element and formal charge are read from the node label and whose
    atom map number is the node id. Labels '*', 'R' and 'Alias...' become wildcard atoms.
    Each edge becomes a bond according to its label ('-', '=', '#', ':').

    :raises ValueError: If an atom label cannot be converted or an edge label is not a known bond type.
    """
    bond_types = {
        '-': rdk.BondType.SINGLE,
        '=': rdk.BondType.DOUBLE,
        '#': rdk.BondType.TRIPLE,
        ':': rdk.BondType.AROMATIC,
    }

    gmlgraph = GMLGraph(gml)
    rdk_mol = rdk.EditableMol(rdk.Mol())
    node_index: dict[int, int] = {}

    # Atoms
    for nx_index, nx_properties in gmlgraph.graph.nodes.items():
        # Add atom with element (charge?) given by the label, and atom map number given by the index
        label: str = nx_properties['label']
        charge = 0
        if label == '*':
            atom = rdk.AtomFromSmiles('[*]')
        else:
            parsed = atom_regex.match(label)
            if parsed is None:
                raise ValueError(f'Cannot convert S_GML: Atom label: {label}')
            symbol, chargeabs, plusminus = parsed.groups()

            # Sign first, then scale by the magnitude digit when there is one
            if plusminus == '+':
                charge = 1
            elif plusminus == '-':
                charge = -1
            if chargeabs:
                charge *= int(chargeabs)

            if symbol == 'R':
                # Radical
                # TODO
                atom = rdk.AtomFromSmiles('[*]')
            elif symbol.startswith('Alias'):
                # 'Alias in GML?'
                # TODO what does this mean exactly
                atom = rdk.AtomFromSmiles('[*]')
            else:
                atom = rdk.AtomFromSmiles(f'[{symbol}]')

        # Check for an unparsable symbol before touching the atom
        if atom is None:
            raise ValueError(f'Cannot convert S_GML: Atom label: {label}')

        # Set charge based on match
        atom.SetFormalCharge(charge)
        atom.SetAtomMapNum(nx_index)
        node_index[nx_index] = rdk_mol.AddAtom(atom)

    # Edges
    for a, b in gmlgraph.graph.edges():
        label = gmlgraph.graph.get_edge_data(a, b)['label']
        if label not in bond_types:
            raise ValueError(f'Unknown bond type: {label}.')
        rdk_mol.AddBond(node_index[a], node_index[b], bond_types[label])

    # Mol is done, change to uneditable
    return rdk_mol.GetMol()
95
+
96
+ # Reaction
97
def gml_to_rdkit_reaction(gml: str) -> ChemicalReaction:
    """ Relies on S_GML in the format outputted by the M-CSA script.

    The text must contain exactly two section header lines (lines starting with 'graph', 'left' or
    'right'); the first section is the left side, the second the right side. Residue components are
    removed from both sides, and every remaining connected component is added to the reaction as a
    reactant template (left) or product template (right). Atom map numbers are the node ids.

    :raises AssertionError: If the number of section headers is not two.
    :raises ValueError: If an atom label cannot be converted or an edge label is not a known bond type.
    :raises GMLEmptyException: From remove_residues, if a side has no nodes left.
    """
    bond_types = {
        '-': rdk.BondType.SINGLE,
        '=': rdk.BondType.DOUBLE,
        '#': rdk.BondType.TRIPLE,
        ':': rdk.BondType.AROMATIC,
    }

    # TODO change to re-use above code?

    # Load two S_GML strings
    l_str = ''
    step: int = 0
    currentstr: str = ''
    for line in gml.splitlines():
        if line.startswith(('graph', 'left', 'right')):
            if step == 1:
                l_str = currentstr
            # Start new string
            step += 1
            currentstr = ''
        # Add to current string (lines are joined without a separator)
        currentstr += line

    # Raised explicitly: an `assert` statement is removed when Python runs with -O.
    if step != 2:
        raise AssertionError(f'Expected 2 S_GML sections, found {step}')
    r_str = currentstr

    # Get the components and turn into MolInstances
    # TODO should not be necessary
    # NOTE(review): str.replace rewrites every occurrence of 'left' / 'right', not only the header.
    l_chemgraph = remove_residues(GMLGraph(l_str.replace('left', 'graph')))
    r_chemgraph = remove_residues(GMLGraph(r_str.replace('right', 'graph')))

    # For each component, add it
    rdk_reaction = ChemicalReaction()
    sides = (
        (l_chemgraph, rdk_reaction.AddReactantTemplate),
        (r_chemgraph, rdk_reaction.AddProductTemplate),
    )

    for chemgraph, add_template in sides:
        for component in chemgraph.get_components_subgraphs():
            component: nx.Graph
            rdk_mol = rdk.EditableMol(rdk.Mol())
            node_index: dict[int, int] = {}

            # Atoms
            for nx_index, nx_properties in component.nodes.items():
                # Add atom with element (charge?) given by the label, and atom map number given by the index
                label: str = nx_properties['label']
                # Reset per atom so a wildcard never inherits the charge of the previous atom
                charge = 0
                if label == '*':
                    atom = rdk.AtomFromSmiles('[*]')
                else:
                    parsed = atom_regex.match(label)
                    if parsed is None:
                        raise ValueError(f'Cannot convert S_GML: Atom label: {label}')
                    symbol, chargeabs, plusminus = parsed.groups()

                    # Sign first, then scale by the magnitude digit when there is one
                    if plusminus == '+':
                        charge = 1
                    elif plusminus == '-':
                        charge = -1
                    if chargeabs:
                        charge *= int(chargeabs)

                    if symbol == 'R':
                        # Radical
                        # TODO
                        atom = rdk.AtomFromSmiles('[*]')
                    elif symbol.startswith('Alias'):
                        # 'Alias in GML?'
                        # TODO what does this mean exactly
                        atom = rdk.AtomFromSmiles('[*]')
                    else:
                        atom = rdk.AtomFromSmiles(f'[{symbol}]')

                if atom is None:
                    raise ValueError(f'Cannot convert S_GML: Atom label: {label}')
                atom.SetFormalCharge(charge)
                atom.SetAtomMapNum(nx_index)
                node_index[nx_index] = rdk_mol.AddAtom(atom)

            # Edges
            for a, b in component.edges():
                label = component.get_edge_data(a, b)['label']
                if label not in bond_types:
                    raise ValueError(f'Unknown bond type: {label}.')
                rdk_mol.AddBond(node_index[a], node_index[b], bond_types[label])

            # Mol is done, change to uneditable and add it to its side of the reaction
            add_template(rdk_mol.GetMol())

    # Reaction now completed
    return rdk_reaction