chemrecon 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chemrecon/__init__.py +73 -0
- chemrecon/chem/__init__.py +0 -0
- chemrecon/chem/chemreaction.py +223 -0
- chemrecon/chem/constant_compounds.py +3 -0
- chemrecon/chem/create_mol.py +91 -0
- chemrecon/chem/elements.py +141 -0
- chemrecon/chem/gml/__init__.py +0 -0
- chemrecon/chem/gml/gml.py +324 -0
- chemrecon/chem/gml/gml_reactant_matching.py +130 -0
- chemrecon/chem/gml/gml_to_rdk.py +217 -0
- chemrecon/chem/mol.py +483 -0
- chemrecon/chem/sumformula.py +120 -0
- chemrecon/connection.py +97 -0
- chemrecon/core/__init__.py +0 -0
- chemrecon/core/id_types.py +687 -0
- chemrecon/core/ontology.py +209 -0
- chemrecon/core/populate_query_handler.py +336 -0
- chemrecon/core/query_handler.py +587 -0
- chemrecon/database/__init__.py +1 -0
- chemrecon/database/connect.py +63 -0
- chemrecon/database/connection_params/chemrecon_pub.dbinfo +5 -0
- chemrecon/database/connection_params/local_docker_dev.dbinfo +5 -0
- chemrecon/database/connection_params/local_docker_init.dbinfo +5 -0
- chemrecon/database/connection_params/local_docker_pub.dbinfo +5 -0
- chemrecon/database/params.py +88 -0
- chemrecon/entrygraph/draw.py +119 -0
- chemrecon/entrygraph/entrygraph.py +301 -0
- chemrecon/entrygraph/explorationprotocol.py +199 -0
- chemrecon/entrygraph/explore.py +421 -0
- chemrecon/entrygraph/explore_procedure.py +183 -0
- chemrecon/entrygraph/filter.py +88 -0
- chemrecon/entrygraph/scoring.py +141 -0
- chemrecon/query/__init__.py +26 -0
- chemrecon/query/create_entry.py +86 -0
- chemrecon/query/default_protocols.py +57 -0
- chemrecon/query/find_entry.py +84 -0
- chemrecon/query/get_relations.py +143 -0
- chemrecon/query/get_structures_from_compound.py +65 -0
- chemrecon/schema/__init__.py +86 -0
- chemrecon/schema/db_object.py +363 -0
- chemrecon/schema/direction.py +10 -0
- chemrecon/schema/entry_types/__init__.py +0 -0
- chemrecon/schema/entry_types/aam.py +34 -0
- chemrecon/schema/entry_types/aam_repr.py +37 -0
- chemrecon/schema/entry_types/compound.py +52 -0
- chemrecon/schema/entry_types/enzyme.py +49 -0
- chemrecon/schema/entry_types/molstructure.py +64 -0
- chemrecon/schema/entry_types/molstructure_repr.py +41 -0
- chemrecon/schema/entry_types/reaction.py +57 -0
- chemrecon/schema/enums.py +154 -0
- chemrecon/schema/procedural_relation_entrygraph.py +66 -0
- chemrecon/schema/relation_types_composed/__init__.py +0 -0
- chemrecon/schema/relation_types_composed/compound_has_molstructure_relation.py +59 -0
- chemrecon/schema/relation_types_composed/reaction_has_aam_relation.py +50 -0
- chemrecon/schema/relation_types_procedural/__init__.py +0 -0
- chemrecon/schema/relation_types_procedural/aam_convert_relation.py +69 -0
- chemrecon/schema/relation_types_procedural/compound_select_structure_proceduralrelation.py +36 -0
- chemrecon/schema/relation_types_procedural/compound_similarlity_proceduralrelation.py +1 -0
- chemrecon/schema/relation_types_procedural/molstructure_convert_relation.py +49 -0
- chemrecon/schema/relation_types_procedural/reaction_select_aam_proceduralrelation.py +38 -0
- chemrecon/schema/relation_types_procedural/reaction_similarity_proceduralrelation.py +1 -0
- chemrecon/schema/relation_types_source/__init__.py +0 -0
- chemrecon/schema/relation_types_source/aam_involves_molstructure_relation.py +77 -0
- chemrecon/schema/relation_types_source/aam_repr_involves_molstructure_repr_relation.py +79 -0
- chemrecon/schema/relation_types_source/compound_has_structure_representation_relation.py +33 -0
- chemrecon/schema/relation_types_source/compound_reference_relation.py +34 -0
- chemrecon/schema/relation_types_source/molstructure_standardisation_relation.py +71 -0
- chemrecon/schema/relation_types_source/ontology/__init__.py +0 -0
- chemrecon/schema/relation_types_source/ontology/compound_ontology.py +369 -0
- chemrecon/schema/relation_types_source/ontology/enzyme_ontology.py +142 -0
- chemrecon/schema/relation_types_source/ontology/reaction_ontology.py +140 -0
- chemrecon/schema/relation_types_source/reaction_has_aam_representation_relation.py +34 -0
- chemrecon/schema/relation_types_source/reaction_has_enzyme_relation.py +71 -0
- chemrecon/schema/relation_types_source/reaction_involves_compound_relation.py +69 -0
- chemrecon/schema/relation_types_source/reaction_reference_relation.py +33 -0
- chemrecon/scripts/initialize_database.py +494 -0
- chemrecon/utils/copy_signature.py +10 -0
- chemrecon/utils/encodeable_list.py +11 -0
- chemrecon/utils/get_id_type.py +70 -0
- chemrecon/utils/hungarian.py +31 -0
- chemrecon/utils/reactant_matching.py +168 -0
- chemrecon/utils/rxnutils.py +44 -0
- chemrecon/utils/set_cwd.py +12 -0
- chemrecon-0.1.1.dist-info/METADATA +143 -0
- chemrecon-0.1.1.dist-info/RECORD +86 -0
- chemrecon-0.1.1.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
""" Code to handle S_GML from Juri's M-CSA tool
|
|
2
|
+
"""
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
import matplotlib.pyplot as plt
|
|
9
|
+
import networkx as nx
|
|
10
|
+
from rdkit import Chem as rdk
|
|
11
|
+
|
|
12
|
+
from chemrecon.chem.mol import Mol
|
|
13
|
+
|
|
14
|
+
# Regex patterns
|
|
15
|
+
# Matches one complete rule on a single line (no newlines or tabs):
#   rule [ruleID "..." left [...] context [...] right [...]]
# The ruleID, left and right sections are optional; context is required.
# Capture groups: 1 = contents of left, 2 = contents of context, 3 = contents of right.
rule_pattern = re.compile(
    r'^rule \[(?:ruleID \".*\")?(?:left \[(.*?)\])?context \[(.*?)\](?:right \[(.*?)\])?\]$'
)
# Matches an 'id <n> label "<label>"' pair.
# Capture groups: 1 = the numeric id, 2 = the label text. Accepted labels are an element-like symbol with an
# optional digit and optional '+' / '-', an 'Amino(...)' label, an 'Alias(.,.)' label, or '*'.
label_sub = re.compile(
    r'id (\d+) label \"([A-Z][a-z]?\d?\+?\-?|Amino\(..?, ..., \d+, [\w|*]\)|Alias\(.,.\)|\*)\"'
)
# Replacement template for label_sub: rewrites the label as "<id>: <label>".
label_rep = r'id \g<1> label "\g<1>: \g<2>"'
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def sanitise_gml(gml: str) -> str:
    """ Return the given GML text with every newline and tab character removed.

    Strings are immutable, so ``str.replace`` returns a new string instead of modifying its receiver. The
    replaced text therefore has to be returned; discarding it would hand the input back unchanged.

    :param gml: The GML text (formatted into a string if it is not one already).
    :return: A copy of the text without any '\\n' or '\\t' characters.
    """
    return f'{gml}'.replace('\n', '').replace('\t', '')
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class GMLException(Exception):
    """ Exception type declared for GML-related errors. """
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class GMLEmptyException(Exception):
    """ Signals a graph that has no nodes left (raised by remove_residues).

    Derives directly from Exception.
    """
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class GMLEntry:
    """ An indexed entry holding a list of proposals, each proposal being an ordered list of GMLStep objects,
    plus an atom-atom map.

    add_step takes 1-based proposal numbers: proposal number k is stored at ``proposals[k - 1]``.
    The graph getters only ever look at ``proposals[0]``.
    """
    entry_index: int
    proposals: list[list[GMLStep]]
    map: dict[int, int]  # For proposal 0

    def __init__(self, index: int):
        self.proposals = []
        self.entry_index = index

    def add_step(self, proposal_index: int, step: GMLStep):
        """ Append a step to proposal number ``proposal_index``, creating empty proposals up to it if needed.
        """
        # Pad the proposals list until slot (proposal_index - 1) exists
        n_missing = proposal_index - len(self.proposals)
        if n_missing > 0:
            self.proposals.extend([] for _ in range(n_missing))

        target = self.proposals[proposal_index - 1]
        target.append(step)

    def set_map(self, aam: dict[int, int]):
        """ Store the atom-atom map, then run sanity_check on it against the left and right graphs.
        """
        self.map = aam
        left, right = self.left_graph(), self.right_graph()
        sanity_check(left, right, self.map)

    # Getters always get proposal 0 (best star rating in M-CSA)
    def left_graph(self) -> GMLGraph:
        """ Left graph of the first step of proposal 0. """
        first_step = self.proposals[0][0]
        return first_step.left_graph()

    def right_graph(self) -> GMLGraph:
        """ Right graph of the last step of proposal 0. """
        last_step = self.proposals[0][-1]
        return last_step.right_graph()

    def atom_atom_map(self) -> dict[int, int]:
        """ The map stored by set_map. """
        return self.map
|
|
68
|
+
|
|
69
|
+
class GMLStep:
    """ One rule step, split into a left-hand-side and a right-hand-side graph.

    The rule text is matched against ``rule_pattern``. The left graph is built from the rule's left and
    context sections, the right graph from its context and right sections. A side whose own section is
    absent or empty consists of the context alone.
    """
    step_index: int
    gml: str
    lhs: GMLGraph
    rhs: GMLGraph

    # Pure gml
    left_gml: Optional[str]   # None when the rule has no left section
    context_gml: str
    right_gml: Optional[str]  # None when the rule has no right section

    def __init__(self, step_index: int, gml: str):
        """
        :param step_index: Index of this step.
        :param gml: The rule text; newlines and tabs are removed before matching.
        :raises GMLException: If the text does not have the form expected by ``rule_pattern``.
        """
        self.step_index = step_index
        # rule_pattern is anchored with ^...$ and written for single-line text, so flatten first
        self.gml = str(gml).replace('\n', '').replace('\t', '')

        # Make S_GML for educt and product sides
        match = rule_pattern.match(self.gml)
        if match is None:
            # Without this guard the failure would surface as an AttributeError on None.group
            raise GMLException(
                f'S_GML of step {step_index} does not match the expected rule format: {self.gml!r}'
            )
        self.left_gml, self.context_gml, self.right_gml = match.groups()

        lhs = (f'graph [\n {self.left_gml} {self.context_gml} \n]'
               if self.left_gml else f'graph [\n{self.context_gml}\n]')
        rhs = (f'graph [\n {self.context_gml} {self.right_gml} \n]'
               if self.right_gml else f'graph [\n{self.context_gml}\n]')

        self.lhs = GMLGraph(gml = lhs)
        self.rhs = GMLGraph(gml = rhs)

    def left_graph(self) -> GMLGraph:
        """ Graph built from the left and context sections. """
        return self.lhs

    def right_graph(self) -> GMLGraph:
        """ Graph built from the context and right sections. """
        return self.rhs
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class GMLGraph:
    """ A networkx graph with 'label' attributes on nodes and edges, the GML text passed alongside it, and
    lazily computed caches of its connected components.
    """
    graph: nx.Graph
    gml: str
    component_subsets: list[frozenset[int]]
    component_subgraphs: list[nx.Graph]

    def __init__(self, gml: str, graph: Optional[nx.Graph] = None):
        """
        :param gml: GML text. It is parsed (with node ids as node keys) unless a usable graph is given.
        :param graph: An already built graph to use instead of parsing ``gml``.
        """
        self.gml = sanitise_gml(gml)

        # TODO raise exception if called with empty graph

        # NOTE: this is a truthiness test, so a graph object without nodes is treated like "no graph given"
        # and the gml text is parsed instead.
        if graph:
            self.graph = graph
        else:
            # Make graph from gml
            self.graph = nx.parse_gml(
                gml,
                label = 'id'
            )
        self.component_subgraphs = []
        self.component_subsets = []

    def get_component_subsets(self) -> list[frozenset[int]]:
        """ Return the node sets of the connected components, computing them on first use. """
        if not self.component_subsets:
            self.component_subsets = [
                frozenset(nodes) for nodes in nx.connected_components(self.graph)
            ]
        return self.component_subsets

    def get_components_subgraphs(self) -> list[nx.Graph]:
        """ Return one subgraph per connected component, in the order of get_component_subsets(). """
        if not self.component_subgraphs:
            self.component_subgraphs = [
                nx.subgraph(self.graph, compset) for compset in self.get_component_subsets()
            ]
        return self.component_subgraphs

    def remove_hydrogens(self):
        """ Replace the graph by a copy without the nodes labelled 'H' that have at least one edge.

        Nodes labelled 'H' without any edge are kept. The component caches are cleared, because they were
        computed for the previous graph and would otherwise still contain the removed nodes.
        """
        g_ = self.graph.copy()
        to_remove: list = [
            n for n in g_.nodes()
            if g_.nodes[n]['label'] == 'H' and g_.degree[n] > 0
        ]
        g_.remove_nodes_from(to_remove)
        self.graph = g_

        # Invalidate caches derived from the old graph
        self.component_subsets = []
        self.component_subgraphs = []

    def get_subgraph(self, indices: set[int]) -> nx.Graph:
        """ Return the subgraph induced by the given node indices. """
        return self.graph.subgraph(indices)

    def draw(self):
        """ Show the graph in a matplotlib window, with '<index>: <label>' on nodes and labels on edges. """
        # Draw with networkx and matplotlib - need to display atom numbers on label!
        plt.figure(1, figsize = (8, 8), dpi = 240)
        pos = nx.spring_layout(self.graph)

        # Graph structure and node labels
        nx.draw(
            self.graph, pos,
            with_labels = True,
            node_size = 50,
            font_size = 6,
            labels = {
                k: f"{k}: {n['label']}" for k, n in self.graph.nodes.items()
            }
        )

        # Edge labels
        edict = {(u, v): d for (u, v, d) in self.graph.edges.data('label')}
        nx.draw_networkx_edge_labels(
            self.graph, pos, edict, font_color = 'blue', font_size = 6
        )

        # Display graph
        plt.show()
        # TODO
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
# ChemGraph generation
|
|
195
|
+
def chemgraph_from_mol(mol: Mol) -> GMLGraph:
|
|
196
|
+
# Based on an RDK mol, create a ChemGraph object.
|
|
197
|
+
struct_graph = nx.Graph()
|
|
198
|
+
for atom in mol.mol.GetAtoms():
|
|
199
|
+
# Make the label based on element and charge
|
|
200
|
+
fcharge = atom.GetFormalCharge()
|
|
201
|
+
symbol = atom.GetSymbol()
|
|
202
|
+
|
|
203
|
+
# Do not include hydrogen
|
|
204
|
+
if symbol == 'H' and len(atom.GetNeighbors()) > 0:
|
|
205
|
+
continue
|
|
206
|
+
|
|
207
|
+
# Make label
|
|
208
|
+
label: str = 'ERR'
|
|
209
|
+
match fcharge:
|
|
210
|
+
case 0:
|
|
211
|
+
label = symbol
|
|
212
|
+
case -1:
|
|
213
|
+
label = f'{symbol}-'
|
|
214
|
+
case 1:
|
|
215
|
+
label = f'{symbol}+'
|
|
216
|
+
case _ if fcharge < 0:
|
|
217
|
+
label = f'{symbol}{fcharge}-'
|
|
218
|
+
case _ if fcharge >= 0:
|
|
219
|
+
label = f'{symbol}{fcharge}+'
|
|
220
|
+
|
|
221
|
+
struct_graph.add_node(
|
|
222
|
+
atom.GetIdx(),
|
|
223
|
+
label = label
|
|
224
|
+
)
|
|
225
|
+
|
|
226
|
+
for bond in mol.mol.GetBonds():
|
|
227
|
+
label: str = 'ERR'
|
|
228
|
+
match bond.GetBondType():
|
|
229
|
+
case rdk.BondType.SINGLE:
|
|
230
|
+
label = '-'
|
|
231
|
+
case rdk.BondType.DOUBLE:
|
|
232
|
+
label = '='
|
|
233
|
+
case rdk.BondType.TRIPLE:
|
|
234
|
+
label = '#'
|
|
235
|
+
case rdk.BondType.AROMATIC:
|
|
236
|
+
label = ':'
|
|
237
|
+
case _:
|
|
238
|
+
raise RuntimeError(f'Unimplemented bond-type {bond.GetBondType()}')
|
|
239
|
+
|
|
240
|
+
struct_graph.add_edge(
|
|
241
|
+
bond.GetBeginAtomIdx(),
|
|
242
|
+
bond.GetEndAtomIdx(),
|
|
243
|
+
label = label
|
|
244
|
+
)
|
|
245
|
+
|
|
246
|
+
# Finalize and add the graph to the list of graphs to match
|
|
247
|
+
cgraph = GMLGraph(
|
|
248
|
+
gml = '',
|
|
249
|
+
graph = struct_graph
|
|
250
|
+
)
|
|
251
|
+
return cgraph
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
# Entry serialisation
|
|
255
|
+
# ----------------------------------------------------------------------------------------------------------------------
|
|
256
|
+
def graph_to_gml(graph: nx.Graph, relabel_map: Optional[dict[int, int]] = None) -> str:
    """ Serialise the nodes and edges of a labelled graph to 'inner' gml, i.e. a bracketed list without the
    leading 'graph' keyword.

    :param graph: Graph whose nodes and edges all carry a 'label' attribute.
    :param relabel_map: Maps each node of the graph to the id to write out. Defaults to the identity.
    :return: The gml text: one line per node, then one line per edge.
    """
    if relabel_map is None:
        relabel_map = {node: node for node in graph.nodes}

    # s = 'graph [\n'
    pieces = ['[\n']

    # Nodes
    for node in graph.nodes:
        text = graph.nodes[node]['label']
        pieces.append(f'\tnode [ id {relabel_map[node]} label "{text}"]\n')

    # Edges
    for source, target in graph.edges:
        text = graph.edges[(source, target)]['label']
        pieces.append(
            f'\tedge [ source {relabel_map[source]} target {relabel_map[target]} label "{text}" ]\n'
        )

    pieces.append('\n]')
    return ''.join(pieces)
|
|
275
|
+
|
|
276
|
+
# Reaction graph sanitation and handling -------------------------------------------------------------------------------
|
|
277
|
+
def remove_residues(
        chemgraph: GMLGraph
) -> GMLGraph:
    """ Return a new GMLGraph without the connected components that contain a residue node.

    A component counts as a residue as soon as one of its node labels starts with 'Amino'; the whole
    component is dropped in that case. The graph of the given chemgraph is not modified.

    :raises GMLEmptyException: If no node remains after the removal.
    """
    g: nx.Graph = chemgraph.graph

    # Collect the nodes of every component holding at least one amino acid node
    residue_nodes: set = set()
    for component in list(nx.connected_components(g)):
        if any(g.nodes[v]['label'].startswith('Amino') for v in component):
            residue_nodes = residue_nodes | set(component)

    # Work on an independent copy so the input graph stays intact
    stripped = g.copy(as_view = False)
    stripped.remove_nodes_from(residue_nodes)
    if not stripped.number_of_nodes():
        raise GMLEmptyException
    return GMLGraph(gml = '', graph = stripped)
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
# Misc
|
|
304
|
+
# --------------------------------------------------------------------------------------------------------------
|
|
305
|
+
# Matching functions for graph similarity
|
|
306
|
+
def nmatch(n1, n2) -> bool:
    """ Node-match predicate for graph comparison: do two node attribute dicts carry the same 'label'?

    If either dict has no 'label' key, the nodes are treated as matching.

    :return: Always a bool (the fallback used to be the int 1).
    """
    try:
        return n1['label'] == n2['label']
    except KeyError:
        return True
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def ematch(e1, e2) -> bool:
    """ Edge-match predicate for graph comparison: do two edge attribute dicts carry the same 'label'?

    If either dict has no 'label' key, the edges are treated as matching.

    :return: Always a bool (the fallback used to be the int 1).
    """
    try:
        return e1['label'] == e2['label']
    except KeyError:
        return True
|
|
318
|
+
|
|
319
|
+
def sanity_check(l_graph: GMLGraph, r_graph: GMLGraph, map: dict[int, int]):
    """ Check that every mapped pair of nodes has the same label, ignoring one trailing '-' and one
    trailing '+' on each label.

    :param l_graph: Graph holding the nodes used as keys of the map.
    :param r_graph: Graph holding the nodes used as values of the map.
    :param map: Mapping from node indices of l_graph to node indices of r_graph.
    :raises AssertionError: On the first pair whose labels differ. Raised explicitly rather than through an
        ``assert`` statement, so the check is still performed when Python runs with -O.
    """
    for l_index, r_index in map.items():
        l_label: str = l_graph.graph.nodes[l_index]['label']
        r_label: str = r_graph.graph.nodes[r_index]['label']
        if l_label.removesuffix('-').removesuffix('+') != r_label.removesuffix('-').removesuffix('+'):
            raise AssertionError('Wrong match')
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import networkx as nx
|
|
4
|
+
|
|
5
|
+
from chem.elements import ChemGraph, chemgraph_from_mol, ematch, nmatch
|
|
6
|
+
from chem.elements import mol_from_structurerepresentation
|
|
7
|
+
from chemrecon.core.query_handler import QueryHandler
|
|
8
|
+
from chemrecon.schema.entry_types.compound import Compound
|
|
9
|
+
from chemrecon.schema.entry_types.reaction import Reaction
|
|
10
|
+
from chemrecon.schema.relation_types_source.compound_has_structure_representation_relation import \
|
|
11
|
+
CompoundHasStructureRepresentation
|
|
12
|
+
from chemrecon.schema.relation_types_source.reaction_involves_compound_relation import ReactionInvolvesCompound
|
|
13
|
+
from utils import hungarian as hungarian
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# TODO change to use the general reactant_matching with ChemReactions
|
|
17
|
+
|
|
18
|
+
def match_reactants(
        handler: QueryHandler,
        procedure: Procedure,
        chemgraph: ChemGraph,
        reaction: Reaction,
        side: int  # -1 or 1
) -> dict[frozenset[int], tuple[Compound, int]]:
    """ Assign the connected components of one side of a reaction graph to the compounds of the reaction.

    The compounds of the reaction are looked up through the handler, one structure per compound is turned
    into a graph, and each compound graph is compared with each component of ``chemgraph`` using the first
    value produced by the graph edit distance generator. A minimum-weight matching over these distances
    decides which compound instance belongs to which component.

    Note that ``chemgraph.remove_hydrogens()`` is called, so the given chemgraph is modified.

    :param handler: Used to query the compounds of the reaction and the structures of each compound.
    :param procedure: Receives a notice when the number of components and of compounds with a structure
        differ.
    :param chemgraph: Left or right S_GML graph of the reaction (already sanitised by removing enzymes).
    :param reaction: The reaction whose compounds are matched.
    :param side: -1 or 1. Compounds whose stoichiometry multiplied by ``side`` is negative are ignored.
    :return: Mapping from the node set of a component to the (compound, multiplicity index) matched to it.
    """

    # TODO eventually replace the structure getting alg. with entry graph-based approach

    # Participant compounds, one (compound, stoichiometry) pair per relation
    involved = handler.get_relations_with_entries_by_recon_ids_of_t1(
        entry_type = Reaction,
        recon_ids = [reaction.recon_id],
        relation_type = ReactionInvolvesCompound,
    )[0]
    reaction_all_compounds: list[tuple[Compound, int]] = [
        (compound_entry, relation.n)
        for compound_entry, relations in involved.items()
        for relation in relations
    ]

    # One structure graph per compound on the requested side
    compound_graphs: dict[Compound, ChemGraph] = {}
    for compound_entry, stoich in reaction_all_compounds:
        # Only take compounds on the correct side
        if stoich * side < 0:
            continue

        struct_repr_relations = handler.get_relations_with_entries_by_recon_ids_of_t1(
            entry_type = Compound,
            recon_ids = [compound_entry.recon_id],
            relation_type = CompoundHasStructureRepresentation
        )
        struct_reprs = struct_repr_relations[0].keys()

        # For now, select an arbitrary structure (the first one); skip compounds without any
        try:
            struct_repr = next(iter(struct_reprs))
        except StopIteration:
            continue

        # Convert the selected structure to a S_GML ChemGraph
        compound_graphs[compound_entry] = chemgraph_from_mol(mol_from_structurerepresentation(struct_repr))

    # Components of the given chemgraph, without hydrogens
    chemgraph.remove_hydrogens()
    component_subsets: list[frozenset[int]] = chemgraph.get_component_subsets()
    component_subgraphs: list[nx.Graph] = chemgraph.get_components_subgraphs()

    # Log an error if component numbers don't match
    if len(component_subgraphs) != len(compound_graphs):
        # TODO Try to fix this by balancing cofactors etc
        # TODO May be extra hydrons, water etc
        # TODO - entry #6 has an extra water on both sides for instance
        procedure.log_notice(f'Could not get structures for all components.'
                             f' Reaction {reaction} (side {side}).'
                             f' Number of S_GML graph components ({len(component_subgraphs)})'
                             f' does not match number of reactants ({len(compound_graphs)}).')

    # Pairwise distance: for each compound, a list of (component index, distance).
    # nx.optimize_graph_edit_distance is a generator which generates successively more precise values;
    # only its first value is used.
    graph_distance_pairwise: dict[Compound, list[tuple[int, float]]] = {
        compound_entry: [
            (
                component_index,
                next(nx.optimize_graph_edit_distance(
                    compound_chemgraph.graph,
                    component,
                    node_match = nmatch,
                    edge_match = ematch
                ))
            )
            for component_index, component in enumerate(component_subgraphs)
        ]
        for compound_entry, compound_chemgraph in compound_graphs.items()
    }

    # We make duplicate instances of each compound to match if stoich != 1
    compound_instances: dict[tuple[Compound, int], ChemGraph] = {}
    for compound, stoich in reaction_all_compounds:
        if stoich * side < 0:
            continue
        for copy_index in range(abs(stoich)):
            # Compounds for which no structure was found have no graph and get no instance
            if compound in compound_graphs:
                compound_instances[(compound, copy_index)] = compound_graphs[compound]

    # Run the hungarian algorithm to get the best match with the instanced compounds
    edges: dict[tuple[tuple[Compound, int], int], float] = {
        ((compound, copy_index), component_index): distance
        for compound, copy_index in compound_instances
        for component_index, distance in graph_distance_pairwise[compound]
    }
    matching = hungarian.max_weight_matching(
        edges = edges,
        min_weight = True
    )
    # TODO extra logic to iterate the edit distance algorithm if the matching is 'close'?

    matching_dict: dict[frozenset[int], tuple[Compound, int]] = {
        component_subsets[component_index]: (compound, copy_index)
        for (compound, copy_index), component_index in matching.items()
    }

    return matching_dict
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
import networkx as nx
|
|
6
|
+
from rdkit import Chem as rdk
|
|
7
|
+
from rdkit.Chem.rdChemReactions import ChemicalReaction
|
|
8
|
+
|
|
9
|
+
from chemrecon.chem.gml.gml import remove_residues, GMLGraph
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Splits an atom label, matched from the start of the string.
# Capture groups: 1 = one uppercase letter followed by any number of lowercase letters,
# 2 = an optional single digit, 3 = an optional '+' or '-'.
atom_regex = re.compile(r'([A-Z][a-z]*)([0-9])?([+\-])?')
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Molecule
|
|
16
|
+
def gml_to_rdkit_mol(gml: str) -> rdk.Mol:
|
|
17
|
+
gmlgraph = GMLGraph(gml)
|
|
18
|
+
rdk_mol = rdk.EditableMol(rdk.Mol())
|
|
19
|
+
node_index: dict[int, int] = {}
|
|
20
|
+
|
|
21
|
+
# Atoms
|
|
22
|
+
for nx_index, nx_properties in gmlgraph.graph.nodes.items():
|
|
23
|
+
# Add atom with element (charge?) given by the label, and atom map number given by the index
|
|
24
|
+
atom = None
|
|
25
|
+
label: str = nx_properties['label']
|
|
26
|
+
if label == '*':
|
|
27
|
+
atom = atom = rdk.AtomFromSmiles(f'[*]')
|
|
28
|
+
else:
|
|
29
|
+
match = atom_regex.match(label)
|
|
30
|
+
charge = 0
|
|
31
|
+
try:
|
|
32
|
+
symbol = match.group(1)
|
|
33
|
+
plusminus = match.group(3)
|
|
34
|
+
if plusminus is None:
|
|
35
|
+
charge = 0
|
|
36
|
+
else:
|
|
37
|
+
if plusminus == '+':
|
|
38
|
+
charge = 1
|
|
39
|
+
elif plusminus == '-':
|
|
40
|
+
charge = -1
|
|
41
|
+
else:
|
|
42
|
+
raise Exception()
|
|
43
|
+
|
|
44
|
+
chargeabs = match.group(2)
|
|
45
|
+
if chargeabs:
|
|
46
|
+
charge * int(chargeabs)
|
|
47
|
+
|
|
48
|
+
except Exception:
|
|
49
|
+
raise ValueError(f'Cannot convert S_GML: Atom label: {label}')
|
|
50
|
+
# error.
|
|
51
|
+
|
|
52
|
+
if symbol == 'R':
|
|
53
|
+
# Radical
|
|
54
|
+
# TODO
|
|
55
|
+
atom = rdk.AtomFromSmiles(f'[*]')
|
|
56
|
+
elif symbol.startswith('Alias'):
|
|
57
|
+
# 'Alias in GML?'
|
|
58
|
+
# TODO what does this mean exactly
|
|
59
|
+
atom = rdk.AtomFromSmiles(f'[*]')
|
|
60
|
+
else:
|
|
61
|
+
atom = rdk.AtomFromSmiles(f'[{symbol}]')
|
|
62
|
+
|
|
63
|
+
# Set charge based on match
|
|
64
|
+
atom.SetFormalCharge(charge)
|
|
65
|
+
|
|
66
|
+
# Atom is set
|
|
67
|
+
if atom is None:
|
|
68
|
+
raise ValueError(f'Cannot convert S_GML: Atom label: {label}')
|
|
69
|
+
|
|
70
|
+
atom.SetAtomMapNum(nx_index)
|
|
71
|
+
atom_index = rdk_mol.AddAtom(atom)
|
|
72
|
+
node_index[nx_index] = atom_index
|
|
73
|
+
|
|
74
|
+
# Edges
|
|
75
|
+
for a, b in gmlgraph.graph.edges():
|
|
76
|
+
index_a = node_index[a]
|
|
77
|
+
index_b = node_index[b]
|
|
78
|
+
label: str = gmlgraph.graph.get_edge_data(a, b)['label']
|
|
79
|
+
match label:
|
|
80
|
+
case '-':
|
|
81
|
+
bondtype = rdk.BondType.SINGLE
|
|
82
|
+
case '=':
|
|
83
|
+
bondtype = rdk.BondType.DOUBLE
|
|
84
|
+
case '#':
|
|
85
|
+
bondtype = rdk.BondType.TRIPLE
|
|
86
|
+
case ':':
|
|
87
|
+
bondtype = rdk.BondType.AROMATIC
|
|
88
|
+
case _:
|
|
89
|
+
raise ValueError(f'Unknown bond type: {label}.')
|
|
90
|
+
rdk_mol.AddBond(index_a, index_b, bondtype)
|
|
91
|
+
|
|
92
|
+
# Mol is done, change to uneditable
|
|
93
|
+
rdk_mol_final = rdk_mol.GetMol()
|
|
94
|
+
return rdk_mol_final
|
|
95
|
+
|
|
96
|
+
# Reaction
|
|
97
|
+
def gml_to_rdkit_reaction(gml: str) -> ChemicalReaction:
|
|
98
|
+
""" Relies on S_GML in the format outputted by the M-CSA script.
|
|
99
|
+
"""
|
|
100
|
+
|
|
101
|
+
# TODO change to re-use above code?
|
|
102
|
+
|
|
103
|
+
# Load two S_GML strings
|
|
104
|
+
l_str = ''
|
|
105
|
+
r_str = ''
|
|
106
|
+
step: int = 0
|
|
107
|
+
currentstr: str = ''
|
|
108
|
+
for l in gml.splitlines():
|
|
109
|
+
if l.startswith('graph') or l.startswith('left') or l.startswith('right'):
|
|
110
|
+
if step == 1:
|
|
111
|
+
l_str = currentstr
|
|
112
|
+
elif step == 2:
|
|
113
|
+
r_str = currentstr
|
|
114
|
+
# Start new string
|
|
115
|
+
step += 1
|
|
116
|
+
currentstr = ''
|
|
117
|
+
# Add to current string
|
|
118
|
+
currentstr += l
|
|
119
|
+
|
|
120
|
+
assert step == 2
|
|
121
|
+
r_str = currentstr
|
|
122
|
+
|
|
123
|
+
# Get the components and turn into MolInstances
|
|
124
|
+
# TODO should not be necessary
|
|
125
|
+
l_chemgraph = remove_residues(GMLGraph(l_str.replace('left', 'graph')))
|
|
126
|
+
r_chemgraph = remove_residues(GMLGraph(r_str.replace('right', 'graph')))
|
|
127
|
+
|
|
128
|
+
# For each component, add it
|
|
129
|
+
rdk_r = ChemicalReaction()
|
|
130
|
+
rdk_reaction = ChemicalReaction()
|
|
131
|
+
|
|
132
|
+
for chemgraph, side in [[l_chemgraph, -1], [r_chemgraph, 1]]:
|
|
133
|
+
for component in chemgraph.get_components_subgraphs():
|
|
134
|
+
component: nx.Graph
|
|
135
|
+
rdk_mol = rdk.EditableMol(rdk.Mol())
|
|
136
|
+
node_index: dict[int, int] = {}
|
|
137
|
+
|
|
138
|
+
# Atoms
|
|
139
|
+
for nx_index, nx_properties in component.nodes.items():
|
|
140
|
+
# Add atom with element (charge?) given by the label, and atom map number given by the index
|
|
141
|
+
atom = None
|
|
142
|
+
label: str = nx_properties['label']
|
|
143
|
+
if label == '*':
|
|
144
|
+
atom = rdk.AtomFromSmiles(f'[*]')
|
|
145
|
+
else:
|
|
146
|
+
match = atom_regex.match(label)
|
|
147
|
+
charge = 0
|
|
148
|
+
try:
|
|
149
|
+
symbol = match.group(1)
|
|
150
|
+
plusminus = match.group(3)
|
|
151
|
+
if plusminus is None:
|
|
152
|
+
charge = 0
|
|
153
|
+
else:
|
|
154
|
+
if plusminus == '+':
|
|
155
|
+
charge = 1
|
|
156
|
+
elif plusminus == '-':
|
|
157
|
+
charge = -1
|
|
158
|
+
else:
|
|
159
|
+
raise Exception()
|
|
160
|
+
|
|
161
|
+
chargeabs = match.group(2)
|
|
162
|
+
if chargeabs:
|
|
163
|
+
charge * int(chargeabs)
|
|
164
|
+
|
|
165
|
+
except Exception:
|
|
166
|
+
raise ValueError(f'Cannot convert S_GML: Atom label: {label}')
|
|
167
|
+
# error.
|
|
168
|
+
if symbol == 'R':
|
|
169
|
+
# Radical
|
|
170
|
+
# TODO
|
|
171
|
+
atom = rdk.AtomFromSmiles(f'[*]')
|
|
172
|
+
elif symbol.startswith('Alias'):
|
|
173
|
+
# 'Alias in GML?'
|
|
174
|
+
# TODO what does this mean exactly
|
|
175
|
+
atom = rdk.AtomFromSmiles(f'[*]')
|
|
176
|
+
else:
|
|
177
|
+
atom = rdk.AtomFromSmiles(f'[{symbol}]')
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
if atom is None:
|
|
181
|
+
raise ValueError(f'Cannot convert S_GML: Atom label: {label}')
|
|
182
|
+
atom.SetFormalCharge(charge)
|
|
183
|
+
atom.SetAtomMapNum(nx_index)
|
|
184
|
+
atom_index = rdk_mol.AddAtom(atom)
|
|
185
|
+
node_index[nx_index] = atom_index
|
|
186
|
+
|
|
187
|
+
# Edges
|
|
188
|
+
for a, b in component.edges():
|
|
189
|
+
index_a = node_index[a]
|
|
190
|
+
index_b = node_index[b]
|
|
191
|
+
label: str = component.get_edge_data(a, b)['label']
|
|
192
|
+
match label:
|
|
193
|
+
case '-':
|
|
194
|
+
bondtype = rdk.BondType.SINGLE
|
|
195
|
+
case '=':
|
|
196
|
+
bondtype = rdk.BondType.DOUBLE
|
|
197
|
+
case '#':
|
|
198
|
+
bondtype = rdk.BondType.TRIPLE
|
|
199
|
+
case ':':
|
|
200
|
+
bondtype = rdk.BondType.AROMATIC
|
|
201
|
+
case _:
|
|
202
|
+
raise ValueError(f'Unknown bond type: {label}.')
|
|
203
|
+
rdk_mol.AddBond(index_a, index_b, bondtype)
|
|
204
|
+
|
|
205
|
+
pass
|
|
206
|
+
|
|
207
|
+
# Mol is done, change to uneditable
|
|
208
|
+
rdk_mol_final = rdk_mol.GetMol()
|
|
209
|
+
# TODO ADD mol to reaction
|
|
210
|
+
match side:
|
|
211
|
+
case -1:
|
|
212
|
+
rdk_reaction.AddReactantTemplate(rdk_mol_final)
|
|
213
|
+
case 1:
|
|
214
|
+
rdk_reaction.AddProductTemplate(rdk_mol_final)
|
|
215
|
+
|
|
216
|
+
# Reaction now completed
|
|
217
|
+
return rdk_reaction
|