chemrecon 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chemrecon/__init__.py +73 -0
- chemrecon/chem/__init__.py +0 -0
- chemrecon/chem/chemreaction.py +223 -0
- chemrecon/chem/constant_compounds.py +3 -0
- chemrecon/chem/create_mol.py +91 -0
- chemrecon/chem/elements.py +141 -0
- chemrecon/chem/gml/__init__.py +0 -0
- chemrecon/chem/gml/gml.py +324 -0
- chemrecon/chem/gml/gml_reactant_matching.py +130 -0
- chemrecon/chem/gml/gml_to_rdk.py +217 -0
- chemrecon/chem/mol.py +483 -0
- chemrecon/chem/sumformula.py +120 -0
- chemrecon/connection.py +97 -0
- chemrecon/core/__init__.py +0 -0
- chemrecon/core/id_types.py +687 -0
- chemrecon/core/ontology.py +209 -0
- chemrecon/core/populate_query_handler.py +336 -0
- chemrecon/core/query_handler.py +587 -0
- chemrecon/database/__init__.py +1 -0
- chemrecon/database/connect.py +63 -0
- chemrecon/database/connection_params/chemrecon_pub.dbinfo +5 -0
- chemrecon/database/connection_params/local_docker_dev.dbinfo +5 -0
- chemrecon/database/connection_params/local_docker_init.dbinfo +5 -0
- chemrecon/database/connection_params/local_docker_pub.dbinfo +5 -0
- chemrecon/database/params.py +88 -0
- chemrecon/entrygraph/draw.py +119 -0
- chemrecon/entrygraph/entrygraph.py +301 -0
- chemrecon/entrygraph/explorationprotocol.py +199 -0
- chemrecon/entrygraph/explore.py +421 -0
- chemrecon/entrygraph/explore_procedure.py +183 -0
- chemrecon/entrygraph/filter.py +88 -0
- chemrecon/entrygraph/scoring.py +141 -0
- chemrecon/query/__init__.py +26 -0
- chemrecon/query/create_entry.py +86 -0
- chemrecon/query/default_protocols.py +57 -0
- chemrecon/query/find_entry.py +84 -0
- chemrecon/query/get_relations.py +143 -0
- chemrecon/query/get_structures_from_compound.py +65 -0
- chemrecon/schema/__init__.py +86 -0
- chemrecon/schema/db_object.py +363 -0
- chemrecon/schema/direction.py +10 -0
- chemrecon/schema/entry_types/__init__.py +0 -0
- chemrecon/schema/entry_types/aam.py +34 -0
- chemrecon/schema/entry_types/aam_repr.py +37 -0
- chemrecon/schema/entry_types/compound.py +52 -0
- chemrecon/schema/entry_types/enzyme.py +49 -0
- chemrecon/schema/entry_types/molstructure.py +64 -0
- chemrecon/schema/entry_types/molstructure_repr.py +41 -0
- chemrecon/schema/entry_types/reaction.py +57 -0
- chemrecon/schema/enums.py +154 -0
- chemrecon/schema/procedural_relation_entrygraph.py +66 -0
- chemrecon/schema/relation_types_composed/__init__.py +0 -0
- chemrecon/schema/relation_types_composed/compound_has_molstructure_relation.py +59 -0
- chemrecon/schema/relation_types_composed/reaction_has_aam_relation.py +50 -0
- chemrecon/schema/relation_types_procedural/__init__.py +0 -0
- chemrecon/schema/relation_types_procedural/aam_convert_relation.py +69 -0
- chemrecon/schema/relation_types_procedural/compound_select_structure_proceduralrelation.py +36 -0
- chemrecon/schema/relation_types_procedural/compound_similarlity_proceduralrelation.py +1 -0
- chemrecon/schema/relation_types_procedural/molstructure_convert_relation.py +49 -0
- chemrecon/schema/relation_types_procedural/reaction_select_aam_proceduralrelation.py +38 -0
- chemrecon/schema/relation_types_procedural/reaction_similarity_proceduralrelation.py +1 -0
- chemrecon/schema/relation_types_source/__init__.py +0 -0
- chemrecon/schema/relation_types_source/aam_involves_molstructure_relation.py +77 -0
- chemrecon/schema/relation_types_source/aam_repr_involves_molstructure_repr_relation.py +79 -0
- chemrecon/schema/relation_types_source/compound_has_structure_representation_relation.py +33 -0
- chemrecon/schema/relation_types_source/compound_reference_relation.py +34 -0
- chemrecon/schema/relation_types_source/molstructure_standardisation_relation.py +71 -0
- chemrecon/schema/relation_types_source/ontology/__init__.py +0 -0
- chemrecon/schema/relation_types_source/ontology/compound_ontology.py +369 -0
- chemrecon/schema/relation_types_source/ontology/enzyme_ontology.py +142 -0
- chemrecon/schema/relation_types_source/ontology/reaction_ontology.py +140 -0
- chemrecon/schema/relation_types_source/reaction_has_aam_representation_relation.py +34 -0
- chemrecon/schema/relation_types_source/reaction_has_enzyme_relation.py +71 -0
- chemrecon/schema/relation_types_source/reaction_involves_compound_relation.py +69 -0
- chemrecon/schema/relation_types_source/reaction_reference_relation.py +33 -0
- chemrecon/scripts/initialize_database.py +494 -0
- chemrecon/utils/copy_signature.py +10 -0
- chemrecon/utils/encodeable_list.py +11 -0
- chemrecon/utils/get_id_type.py +70 -0
- chemrecon/utils/hungarian.py +31 -0
- chemrecon/utils/reactant_matching.py +168 -0
- chemrecon/utils/rxnutils.py +44 -0
- chemrecon/utils/set_cwd.py +12 -0
- chemrecon-0.1.1.dist-info/METADATA +143 -0
- chemrecon-0.1.1.dist-info/RECORD +86 -0
- chemrecon-0.1.1.dist-info/WHEEL +4 -0
chemrecon/chem/mol.py
ADDED
|
@@ -0,0 +1,483 @@
|
|
|
1
|
+
""" Defines a wrapper class for RDKit molecules.
|
|
2
|
+
A Mol may either be a MolTemplate or a MolInstance.
|
|
3
|
+
- MolInstance: The molecule as it appears in a reaction concretely.
|
|
4
|
+
- MolTemplate: More abstract representation of the molecule in general.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import ast
|
|
9
|
+
from copy import copy, deepcopy
|
|
10
|
+
from typing import Type, Optional, Any
|
|
11
|
+
|
|
12
|
+
import rdkit.Chem as rdk
|
|
13
|
+
from rdkit.Chem import rdFingerprintGenerator as rdk_fp
|
|
14
|
+
import rdkit.Chem.MolStandardize.rdMolStandardize as rdk_std
|
|
15
|
+
from rdkit import DataStructs as rdk_ds
|
|
16
|
+
|
|
17
|
+
from chemrecon.chem.elements import atomicnum_element
|
|
18
|
+
from chemrecon.chem.sumformula import SumFormula
|
|
19
|
+
from chemrecon.schema import MolStructure
|
|
20
|
+
from chemrecon.schema.enums import FeatureEnum
|
|
21
|
+
|
|
22
|
+
# RDKit fingerprint generator (module-level, used by Mol.generate_fingerprint)
|
|
23
|
+
fpgen = rdk_fp.GetRDKitFPGenerator()
|
|
24
|
+
|
|
25
|
+
# Main class
|
|
26
|
+
# --------------------------------------------------------------------------------------------------------------
|
|
27
|
+
class Mol:
    """ Wrapper for an RDKit Mol.

    On construction the RDKit molecule is normalised, its SMILES is generated, and derived fields
    (atom count, formal charge, sum formula, property dict) are populated.
    """
    # TODO rename/refactor to 'MolStructure'?
    mol: rdk.Mol  # Normalised RDKit molecule

    smiles: Optional[str]  # SMILES generated from the normalised molecule
    features: set[Type[Feature]]  # Features for which the rdk_mol is standardized
    n_atoms: int  # Atom count as reported by GetNumAtoms(onlyExplicit = False)
    mass: float  # TODO declared but not populated by __init__
    charge: int  # Formal charge of the molecule
    molformula: SumFormula
    provenance: Optional[str]
    propdict: dict[str, str]  # Properties parsed from the '_Name' property, if any

    # Optional fingerprint, computed lazily by generate_fingerprint()
    fp: Optional[Any]

    def __init__(
            self,
            rdk_mol: rdk.Mol,
            set_features: set[Type[Feature]] = None,
            provenance: Optional[str] = None
    ):
        """ Create a Mol from an RDKit molecule.

        :param rdk_mol: The RDKit molecule to wrap. Must not be None.
        :param set_features: If given, these features are recorded as standardised without being checked.
            Any iterable of Feature classes is accepted; it is copied into a new set.
            If None, every feature in `feats` is checked against the molecule.
        :param provenance: Optional free-text description of where the molecule came from.
        :raises ValueError: If rdk_mol is None, cannot be normalised, or no SMILES can be generated.
        """
        if rdk_mol is None:
            raise ValueError('Cannot create Mol from None.')

        # Normalize the molecule
        try:
            self.mol = rdk_std.Normalize(rdk_mol)  # TODO what exactly does normalisation do?
        except ValueError as e:
            # Could not create molecule, or molecule is None?
            print(e)
            self.mol = None
            self.smiles = None
            raise ValueError(f'Could not create molecule: {e}') from e

        # Set smiles for printing. The None check must come before the string fix-up below,
        # otherwise a missing SMILES surfaces as an AttributeError instead of a ValueError.
        smiles = rdk.MolToSmiles(self.mol)
        if smiles is None:
            raise ValueError('Could not generate SMILES for molecule.')

        # Fix SMILES bug (non-C lowercase atom symbols)
        # https://github.com/rdkit/rdkit/issues/3697
        self.smiles = smiles.replace('[o', '[O')

        # Compute features
        if set_features is not None:
            # Features set in constructor. Copy into a fresh set so that (a) a list passed by the caller
            # still yields a set, keeping __eq__ meaningful, and (b) Mols do not share one mutable set.
            self.features = set(set_features)
        else:
            self.features = set()
            for f in feats:
                # Check standardization and set
                if f.is_standardized(mol = self):
                    self.features.add(f)

        # Get properties from name if initialized through an RXN file
        self.propdict = dict()
        try:
            molprops_rxn = rdk_mol.GetProp('_Name')
        except KeyError:
            # No '_Name' property: not applicable, keep the empty dict
            pass
        else:
            try:
                rxn_propdict = ast.literal_eval(molprops_rxn)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # The name is not a Python literal (e.g. a plain molecule name); ignore it
                pass
            else:
                # A name such as "42" is a valid literal but not a property dict
                if isinstance(rxn_propdict, dict):
                    self.propdict = rxn_propdict

        # Populate misc fields
        self.n_atoms = self.mol.GetNumAtoms(onlyExplicit = False)
        self.provenance = provenance

        # Get MolFormula
        self.molformula = self.get_molformula()

        # TODO n_atoms_tracked
        # TODO mass (simple)

        self.charge = rdk.GetFormalCharge(self.mol)

        # Fingerprint is computed on demand
        self.fp = None

        # TODO handling of a missing provenance

    # Converters
    # ----------------------------------------------------------------------------------------------------------
    def to_smiles(self) -> str:
        """ Return the SMILES string generated at construction. """
        # TODO return more details
        return self.smiles

    # Database interfacing
    # ----------------------------------------------------------------------------------------------------------
    def to_database_struct(self) -> MolStructure:
        """ Create a database row to be inserted.

        :raises ValueError: If this Mol has no SMILES.
        """
        if self.smiles is None:
            raise ValueError('Cannot convert None entry')

        # Fix SMILES bug (non-C lowercase atom symbols)
        # https://github.com/rdkit/rdkit/issues/3697
        # str.replace returns a new string, so the result has to be used, not discarded.
        smiles = self.to_smiles().replace('[o', '[O')

        return MolStructure(
            smiles = smiles,
            std_feats = [
                f.feature_enum for f in self.features
            ]
        )

    # TODO insert calculated properties

    # Calculate properties
    # ----------------------------------------------------------------------------------------------------------
    # TODO

    # Standardisation
    def get_standardised(self, feat_list: Optional[list[Type[Feature]]] = None) -> Mol:
        """ Get a new Mol standardised according to the given features. If None given, standardise according
            to all features.

        The features are applied in the order given. The result is always a plain Mol carrying this
        Mol's provenance.
        """
        feat_list = feat_list or [F, I, C, T, S]
        rdk_s: rdk.Mol = self.mol
        for f in feat_list:
            rdk_s = f.standardise_rdk(rdk_s)
        # Pass a set: `features` is compared as a set in __eq__.
        return Mol(rdk_s, set_features = set(feat_list), provenance = self.provenance)

    # Identity and similarity
    # ------------------------------------------------------------------------------------------------------------------
    def is_identical_up_to_map(self, other: Mol) -> bool:
        """ Checks whether two Mols are identical except for extra details such as atom mapping numbers. """
        # Work on copies so the atom maps of the wrapped molecules are left untouched
        m: rdk.Mol = deepcopy(self.mol)
        m_: rdk.Mol = deepcopy(other.mol)

        for a in m.GetAtoms():
            a.SetAtomMapNum(0)
        for a in m_.GetAtoms():
            a.SetAtomMapNum(0)

        return rdk.MolToSmiles(m) == rdk.MolToSmiles(m_)

    def is_identical_up_to(self, other: Mol, features: set[Feature]) -> bool:
        """ Not implemented yet. """
        # TODO
        raise NotImplementedError

    def generate_fingerprint(self):
        """ Compute and cache the fingerprint in `self.fp` if it has not been computed yet. """
        if self.fp is None:
            self.fp = fpgen.GetFingerprint(self.mol)

    def get_similarity(self, other: Mol) -> float:
        """ Get a similarity metric between two Mols. Should be 1 if the mols are identical,
            and approach zero as the difference increases.

        Implemented as the Tanimoto similarity of the two fingerprints, which are generated on demand.
        """
        self.generate_fingerprint()
        other.generate_fingerprint()
        return rdk_ds.TanimotoSimilarity(
            self.fp, other.fp
        )

    # Misc
    # ----------------------------------------------------------------------------------------------------------
    def feature_string(self) -> str:
        """ Return one character per feature in `feats`: its symbol if standardised, otherwise '-'. """
        return ''.join(
            f.symbol if f in self.features else '-'
            for f in feats
        )

    def __repr__(self):
        return f'{self.smiles} [{self.feature_string()}]'

    def __hash__(self):
        return hash(f'{self.smiles}:::{self.feature_string()}')

    def __eq__(self, other: Mol):
        """ Two Mols are equal if both their SMILES and their feature sets are equal. """
        if isinstance(other, Mol):
            return self.smiles == other.smiles and self.features == other.features
        else:
            return False

    # Serialise
    def serialize(self) -> dict:
        """ Return a dict with the SMILES, the feature string and a placeholder molformula entry. """
        return {
            'smiles':     self.to_smiles(),
            'features':   self.feature_string(),
            'molformula': 'TODO'  # TODO
        }

    # Get molecular formula
    def get_molformula(self) -> SumFormula:
        """ Returns the molecular formula, counted over the atoms returned by GetAtoms(),
            together with the formal charge.
        """
        # TODO requires rdkit documentation
        # NOTE(review): only atoms present in the molecular graph are counted here, whereas n_atoms is
        #   computed with onlyExplicit = False. Confirm whether implicit hydrogens should be included.
        formula_num: dict[int, int] = dict()
        for a in self.mol.GetAtoms():
            n: int = a.GetAtomicNum()
            formula_num[n] = formula_num.get(n, 0) + 1

        charge = rdk.GetFormalCharge(self.mol)

        # Translate to formula of elements and return
        return SumFormula(
            formula = {
                atomicnum_element[n]: i for n, i in formula_num.items()
            },
            charge = charge
        )
|
|
245
|
+
|
|
246
|
+
# Mol Instance and Template
|
|
247
|
+
# ----------------------------------------------------------------------------------------------------------------------
|
|
248
|
+
class MolInstance(Mol):
    """ A molecule as it concretely takes part in a reaction, including its atom-to-atom map numbers.
    """

    n_atoms_tracked: int
    _atom_map: Optional[list[int]]  # Cached map number of each atom, in RDKit atom order
    _atom_map_smiles_order: Optional[list[int]]  # Cached map numbers, in SMILES atom order

    def __init__(self, rdk_mol: rdk.Mol, set_features: set[Type[Feature]] = None, provenance: Optional[str] = None):
        super().__init__(rdk_mol, set_features = set_features, provenance = provenance)
        # Both atom-map caches are filled lazily by the getters below
        self._atom_map_smiles_order = None
        self._atom_map = None

    def to_mol_template(self) -> MolTemplate:
        """ Build the generic MolTemplate of this instance, dropping reaction-specific information
            (the atom-to-atom map is removed by the MolTemplate constructor).
        """
        template = MolTemplate(rdk_mol = self.mol, set_features = self.features, provenance = self.provenance)
        # The property dict is carried over (same dict object, not a copy)
        template.propdict = self.propdict
        return template

    def get_atom_map_in_native_order(self) -> list[int]:
        """ Return the atom map number of each atom, in the atom order of the wrapped RDKit molecule. """
        if self._atom_map:
            return self._atom_map

        map_numbers = []
        for atom in self.mol.GetAtoms():
            map_numbers.append(atom.GetAtomMapNum())
        self._atom_map = map_numbers
        return self._atom_map

    def get_atom_map_in_smiles_order(self, safe: bool = False) -> list[int]:
        """ Return the atom map numbers in the atom order of a molecule re-parsed from the SMILES string.
            Needed to populate the DB with atom-to-atom maps.

        :param safe: If True, the SMILES is parsed without sanitization.
        """
        if self._atom_map_smiles_order:
            return self._atom_map_smiles_order

        reparsed = rdk.MolFromSmiles(self.smiles, sanitize = not safe)
        map_numbers = []
        for atom in reparsed.GetAtoms():
            map_numbers.append(atom.GetAtomMapNum())
        self._atom_map_smiles_order = map_numbers
        return self._atom_map_smiles_order

    def serialize(self) -> dict:
        """ Extend the base serialisation with the SMILES of the corresponding template. """
        serialized = super().serialize()
        serialized['template'] = self.to_mol_template().to_smiles()
        return serialized
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
class MolTemplate(Mol):
    """ A molecule type on its own, independent of any particular reaction it takes part in
        and of any atom mapping.
    """

    def __init__(
            self,
            rdk_mol: rdk.Mol,
            set_features: set[Type[Feature]] = None,
            provenance: Optional[str] = None
    ):
        """ Copy the given RDKit molecule, clear all atom map numbers on the copy and wrap it.

        :raises ValueError: If the copied molecule is falsy (e.g. rdk_mol is None).
        """
        unmapped = deepcopy(rdk_mol)
        if not unmapped:
            raise ValueError('Could not create molecule')

        # Remove atom-to-atom maps (on the copy only; the caller's molecule is not modified)
        for atom in unmapped.GetAtoms():
            atom.SetAtomMapNum(0)

        super().__init__(rdk_mol = unmapped, set_features = set_features, provenance = provenance)

    def instantiate(self, mapping: list[int]) -> MolInstance:
        """ Produce a MolInstance of this template carrying the given atom map numbers.

        :param mapping: Map numbers, paired with the atoms of the molecule in order via zip,
            so pairing stops at the shorter of the two.
        """
        mapped = deepcopy(self.mol)
        for atom, map_number in zip(mapped.GetAtoms(), mapping):
            atom.SetAtomMapNum(map_number)
        return MolInstance(
            rdk_mol = mapped,
            set_features = self.features,
            provenance = 'from_template'
        )
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
# Features and standardization
|
|
338
|
+
# ----------------------------------------------------------------------------------------------------------------------
|
|
339
|
+
# ABC
|
|
340
|
+
class Feature:
    """ Base class for a standardisation feature.

    Subclasses set the three class attributes and override `is_standardized` and `standardise_rdk`.
    """
    symbol: str  # Single-character symbol of the feature
    name: str  # Human-readable name
    feature_enum: FeatureEnum  # Corresponding enum value

    @classmethod
    def is_standardized(cls, mol: Mol) -> bool:
        """ Whether the given Mol is already standardised for this feature. Must be overridden. """
        raise NotImplementedError

    @classmethod
    def standardise(cls, mol: Mol) -> rdk.Mol:
        """ Standardise the RDKit molecule wrapped by `mol` and return the resulting RDKit molecule.

        :raises ValueError: If the Mol wraps no RDKit molecule.
        """
        wrapped = mol.mol
        if wrapped is None:
            raise ValueError
        return cls.standardise_rdk(wrapped)

    @classmethod
    def standardise_rdk(cls, rdk_mol: rdk.Mol) -> rdk.Mol:
        """ Standardise an RDKit molecule for this feature. Must be overridden. """
        raise NotImplementedError
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
# FICTS features
|
|
361
|
+
# --------------------------------------------------------------------------------------------------------------
|
|
362
|
+
|
|
363
|
+
# [F] - Fragments
|
|
364
|
+
class F(Feature):
    """ [F] - Fragments: standardised when the molecule has at most one fragment. """
    symbol = 'F'
    name = 'Fragments'
    feature_enum = FeatureEnum.F

    @classmethod
    def is_standardized(cls, mol: Mol) -> bool:
        """ True if RDKit reports no more than one fragment for the molecule. """
        fragments = rdk.GetMolFrags(mol.mol)
        return len(fragments) < 2

    @classmethod
    def standardise_rdk(cls, rdk_mol: rdk.Mol) -> rdk.Mol:
        """ Return the fragment parent of the molecule, with RDKit's own standardisation skipped. """
        return rdk_std.FragmentParent(rdk_mol, skipStandardize = True)
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
# [I] - Isotope
|
|
381
|
+
class I(Feature):
    """ [I] - Isotope: standardised when no atom carries an isotope label. """
    symbol = 'I'
    name = 'Isotope'
    feature_enum = FeatureEnum.I

    @classmethod
    def is_standardized(cls, mol: Mol) -> bool:
        """ True if GetIsotope() is falsy for every atom of the molecule. """
        return not any(atom.GetIsotope() for atom in mol.mol.GetAtoms())

    @classmethod
    def standardise_rdk(cls, rdk_mol: rdk.Mol) -> rdk.Mol:
        """ Return a copy of the molecule in which every isotope label has been reset to 0. """
        unlabelled = copy(rdk_mol)
        labelled_atoms = (atom for atom in unlabelled.GetAtoms() if atom.GetIsotope())
        for atom in labelled_atoms:
            atom.SetIsotope(0)
        return unlabelled
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
# [C] - Charge
|
|
405
|
+
class C(Feature):
    """ [C] - Charge: standardised when uncharging the molecule leaves it unchanged. """
    symbol = 'C'
    name = 'Charge'
    feature_enum = FeatureEnum.C

    # TODO use other uncharging approach?
    # One Uncharger instance, created at class definition and shared by all calls
    uncharger = rdk_std.Uncharger()

    @classmethod
    def is_standardized(cls, mol: Mol) -> bool:
        """ True if the uncharged form is identical (per mol_identical) to the molecule itself. """
        uncharged = cls.uncharger.uncharge(mol.mol)
        return mol_identical(mol.mol, uncharged)

    @classmethod
    def standardise_rdk(cls, rdk_mol: rdk.Mol) -> rdk.Mol:
        """ Return the result of running the shared Uncharger on the molecule. """
        return cls.uncharger.uncharge(rdk_mol)
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
# [T] - Tautomer
|
|
425
|
+
class T(Feature):
    """ [T] - Tautomer: standardised when the molecule equals its canonical tautomer. """
    symbol = 'T'
    name = 'Tautomer'
    feature_enum = FeatureEnum.T

    # One TautomerEnumerator instance, created at class definition and shared by all calls
    tautomer_enumerator = rdk_std.TautomerEnumerator()

    @classmethod
    def is_standardized(cls, mol: Mol) -> bool:
        """ True if the canonical tautomer is identical (per mol_identical) to the molecule.
            A molecule for which this raises AtomKekulizeException counts as not standardised.
        """
        try:
            canonical = cls.tautomer_enumerator.Canonicalize(mol.mol)
            verdict = mol_identical(mol.mol, canonical)
        except rdk.rdchem.AtomKekulizeException:
            verdict = False
        return verdict

    @classmethod
    def standardise_rdk(cls, rdk_mol: rdk.Mol) -> rdk.Mol:
        """ Return the canonical tautomer of the molecule. """
        return cls.tautomer_enumerator.Canonicalize(rdk_mol)
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
# [S] - Stereo
|
|
448
|
+
class S(Feature):
    """ [S] - Stereo: standardised when the SMILES carries no stereo markers. """
    symbol = 'S'
    name = 'Stereo'
    feature_enum = FeatureEnum.S

    @classmethod
    def is_standardized(cls, mol: Mol) -> bool:
        """ True if none of the characters '/', '\\' and '@' occurs in the Mol's SMILES string. """
        # TODO redo this check
        return all(marker not in mol.smiles for marker in ('/', '\\', '@'))

    @classmethod
    def standardise_rdk(cls, rdk_mol: rdk.Mol) -> rdk.Mol:
        """ Return a copy of the molecule with stereochemistry removed. """
        flattened = copy(rdk_mol)
        rdk.RemoveStereochemistry(flattened)  # modifies the copy in place
        return flattened
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
# List of features, in FICTS order
feats: list[Type[Feature]] = [F, I, C, T, S]

# Lookup from each FeatureEnum value to the Feature class that declares it
feat_enum_map: dict[FeatureEnum, Type[Feature]] = {
    feature.feature_enum: feature
    for feature in feats
}
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
# Utils
|
|
478
|
+
# --------------------------------------------------------------------------------------------------------------
|
|
479
|
+
def mol_identical(a: rdk.Mol, b: rdk.Mol) -> bool:
    """ Two molecules count as identical when each has a substructure match of the other.

    The reverse match is only evaluated if the forward match succeeds.
    """
    a_contains_b = a.HasSubstructMatch(b)
    return a_contains_b and b.HasSubstructMatch(a)
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from chemrecon.chem.elements import Element
|
|
6
|
+
from chemrecon.chem import elements as chem
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SumFormula:
    """ Encapsulates a molecular formula: a count per element plus an optional total charge.

    A charge of None means the charge is unknown. Counts may be negative when the formula represents
    a difference between two formulae. Instances define __eq__ without __hash__ and are therefore
    unhashable.
    """
    formula: dict[Element, int]
    charge: Optional[int]

    def __init__(self, formula: dict[Element, int], charge: Optional[int] = 0):
        """ Assumes 0 (neutral charge) rather than None (unknown). If charge is unknown, specify explicitly. """
        self.formula = formula
        self.charge = charge

    def __getitem__(self, element: Element):
        """ Count of the given element. Raises KeyError if the element is not in the formula. """
        return self.formula[element]

    def __iter__(self):
        """ Iterate over the elements of the formula. """
        yield from self.formula.__iter__()

    def __or__(self, other: SumFormula):
        """ Dict-merge of the two element-count dicts. Returns a plain dict, not a SumFormula;
            for elements in both, the count of `other` wins.
        """
        return self.formula | other.formula

    def get(self, element: Element, default: int):
        """ Count of the given element, or `default` if the element is not in the formula. """
        return self.formula.get(element, default)

    def elements(self) -> set[Element]:
        """ The set of elements that have an entry in the formula (including zero counts). """
        return set(self.formula.keys())

    def is_zero(self) -> bool:
        """ Returns true if all elements are 0, and charge is 0 or unknown.
        """
        if any(count != 0 for count in self.formula.values()):
            return False
        if self.charge is not None and self.charge != 0:
            return False
        return True

    def has_negative(self) -> bool:
        """ Returns true if the formula has any negative atom counts (e.g. when it represents a difference).
        """
        return any(v < 0 for v in self.formula.values())

    # Arithmetic operations
    def __add__(self, other: SumFormula):
        """ Add the elements and charges. If at least one has undefined charge, the result has undefined charge.
        """
        sumformula = {
            e: self.get(e, 0) + other.get(e, 0)
            for e in self.elements() | other.elements()
        }
        if self.charge is not None and other.charge is not None:
            charge: Optional[int] = self.charge + other.charge
        else:
            charge: Optional[int] = None

        return SumFormula(sumformula, charge)

    def __sub__(self, other: SumFormula):
        """ Computes the difference between sum formulae: element counts and charges are both subtracted.
            If at least one has undefined charge, the result has undefined charge.
        """
        sumformula = {
            e: self.get(e, 0) - other.get(e, 0)
            for e in self.elements() | other.elements()
        }
        if self.charge is not None and other.charge is not None:
            # The charge is part of the difference too, so it is subtracted like the element counts.
            charge: Optional[int] = self.charge - other.charge
        else:
            charge: Optional[int] = None

        return SumFormula(sumformula, charge)

    def __eq__(self, other: SumFormula):
        """ Equal if both the element-count dicts and the charges are equal.
            Comparison with any other type is deferred to that type (evaluates to False by default).
        """
        # TODO how to handle equality when one has charge 0 and other has unknown charge?
        if not isinstance(other, SumFormula):
            return NotImplemented
        return self.formula == other.formula and self.charge == other.charge

    # Misc
    # ------------------------------------------------------------------------------------------------------------------
    def __str__(self):
        """ Prints output in Hill order: if carbon is present, C first, then H, then the remaining
            elements alphabetically by symbol; otherwise all elements alphabetically.
            A non-zero known charge is appended as e.g. ' 2+' or ' 1-'.
        """
        # https://en.wikipedia.org/wiki/Chemical_formula#Condensed_formula
        alphabetical: list[tuple[Element, int]] = sorted(
            self.formula.items(), key = lambda pair: pair[0].symbol
        )
        elist: list[tuple[Element, int]] = list()
        if chem.C in self.elements():
            # C, H first order
            elist.append((chem.C, self[chem.C]))
            if chem.H in self.elements():
                elist.append((chem.H, self[chem.H]))
            elist.extend(
                (e, i) for e, i in alphabetical
                if e not in {chem.C, chem.H}
            )
        else:
            # Alphabetical order
            elist = alphabetical

        chargestr: str = ''
        if self.charge is not None:
            if self.charge > 0:
                chargestr = f' {self.charge}+'
            elif self.charge < 0:
                chargestr = f' {abs(self.charge)}-'

        # Print as string
        return ''.join(f'{e.symbol}{i}' for e, i in elist) + f'{chargestr}'
|
|
115
|
+
|
|
116
|
+
def molformula_from_str(s: str) -> SumFormula:
    """ Parse a molecular formula string into a SumFormula.

    Not implemented yet: always raises NotImplementedError carrying the input string.
    """
    # TODO
    not_supported = NotImplementedError(s)
    raise not_supported
|
chemrecon/connection.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
""" Handles the global connection state of ChemRecon.
|
|
2
|
+
"""
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os.path
|
|
6
|
+
|
|
7
|
+
import chemrecon.core.query_handler
|
|
8
|
+
import chemrecon.core.populate_query_handler
|
|
9
|
+
from chemrecon.database.connect import postgres_connect
|
|
10
|
+
from chemrecon.database.params import Params, local_docker_pub, local_docker_dev, chemrecon_pub, chemrecon_dev
|
|
11
|
+
|
|
12
|
+
# Handler, default to public production database
|
|
13
|
+
# ----------------------------------------------------------------------------------------------------------------------
|
|
14
|
+
handler: chemrecon.core.query_handler.QueryHandler # QueryHandler or PopulateQueryHandler
|
|
15
|
+
|
|
16
|
+
def get_query_handler() -> chemrecon.core.query_handler.QueryHandler:
    """ Return the module-level query handler.

    The handler is only assigned by connect() and reset to None by disconnect(); this module does
    not assign it at import time.
    """
    current_handler = handler
    return current_handler
|
|
19
|
+
|
|
20
|
+
def connect(params: Params, can_write: bool):
    """ Open a database connection with the given parameters and install it as the handler
        used by the ChemRecon library.

    :param params: Connection parameters passed to postgres_connect.
    :param can_write: If True a PopulateQueryHandler is installed, otherwise a QueryHandler.
        This defines whether the library is allowed to write/cache results in the database.
    """
    global handler

    connection = postgres_connect(params)
    handler_type = (
        chemrecon.core.populate_query_handler.PopulateQueryHandler
        if can_write
        else chemrecon.core.query_handler.QueryHandler
    )
    handler = handler_type(connection)

    # Done
    # NOTE(review): the connection string is printed; confirm it never contains credentials.
    print(f'Handler set: {params.connection_string()}')
|
|
34
|
+
|
|
35
|
+
def disconnect():
    """ Drop the current query handler by resetting the module-level `handler` to None.

    NOTE(review): nothing is closed explicitly here; confirm whether the handler or its connection
    needs an explicit close.
    """
    global handler
    handler = None
|
|
38
|
+
|
|
39
|
+
def connect_public():
    """ Connect the ChemRecon library, without write access, to the public ChemRecon database
        maintained by the developer.
    """
    connect(params = chemrecon_pub, can_write = False)
|
|
44
|
+
|
|
45
|
+
def connect_public_dev():
    """ For developer use: connect with write access using the `chemrecon_dev` parameters.
    """
    connect(params = chemrecon_dev, can_write = True)
|
|
49
|
+
|
|
50
|
+
def connect_local_docker():
    """ Connect the ChemRecon library, without write access, to a database running in a local Docker
        container. Refer to the documentation for details on how to run and administrate this database.
    """
    connect(params = local_docker_pub, can_write = False)
|
|
55
|
+
|
|
56
|
+
def connect_local_docker_dev():
    """ Connect the ChemRecon library, with write access, to a database running in a local Docker
        container, using the `local_docker_dev` parameters. Refer to the documentation for details on
        how to run and administrate this database.
    """
    connect(params = local_docker_dev, can_write = True)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# Local cache files location
|
|
64
|
+
# ----------------------------------------------------------------------------------------------------------------------
|
|
65
|
+
cache_dir: str = 'cache/'

def init_cache():
    """ Create the cache directory (including missing parent directories) if nothing exists
        at `cache_dir` yet.
    """
    if not os.path.exists(cache_dir):
        # exist_ok guards against another process creating the directory between the check above
        # and this call, which would otherwise raise FileExistsError.
        os.makedirs(cache_dir, exist_ok = True)
|
|
72
|
+
|
|
73
|
+
def flush_cache():
    """ Remove all cached results.

    Not implemented yet: always raises NotImplementedError.
    """
    not_supported = NotImplementedError()
    raise not_supported
|
|
77
|
+
|
|
78
|
+
def set_cache_dir(new_cache_dir: str):
    """ Point the module-level `cache_dir` at a new location.

    Only the variable is updated; the directory itself is not created by this function.
    """
    global cache_dir
    cache_dir = new_cache_dir
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# Initialise the cache
|
|
84
|
+
init_cache()
|
|
85
|
+
|
|
86
|
+
# External tools / dependencies
|
|
87
|
+
# ----------------------------------------------------------------------------------------------------------------------
|
|
88
|
+
# Reaction Decoder Tool, RDT
|
|
89
|
+
# rdt_jar_location: str = './chemrecon/redistribute/rdt/'
|
|
90
|
+
# rdt_jar_filename: str = 'rdt-2.4.1-jar-with-dependencies.jar'
|
|
91
|
+
# rdt_jar_path: str = './chemrecon/redistribute/rdt/rdt-2.4.1-jar-with-dependencies.jar'
|
|
92
|
+
# TODO by default in chemrecon/dependencies/rdt/...
|
|
93
|
+
|
|
94
|
+
# def set_rdt_jar_path(path: str):
|
|
95
|
+
# # TODO
|
|
96
|
+
# raise NotImplementedError()
|
|
97
|
+
# pass
|
|
File without changes
|