rdworks 0.25.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. rdworks/__init__.py +35 -0
  2. rdworks/autograph/__init__.py +4 -0
  3. rdworks/autograph/autograph.py +184 -0
  4. rdworks/autograph/centroid.py +90 -0
  5. rdworks/autograph/dynamictreecut.py +135 -0
  6. rdworks/autograph/nmrclust.py +123 -0
  7. rdworks/autograph/rckmeans.py +74 -0
  8. rdworks/bitqt/__init__.py +1 -0
  9. rdworks/bitqt/bitqt.py +355 -0
  10. rdworks/conf.py +374 -0
  11. rdworks/descriptor.py +36 -0
  12. rdworks/display.py +206 -0
  13. rdworks/ionized.py +170 -0
  14. rdworks/matchedseries.py +260 -0
  15. rdworks/mol.py +1522 -0
  16. rdworks/mollibr.py +887 -0
  17. rdworks/pka.py +38 -0
  18. rdworks/predefined/Asinex_fragment.xml +20 -0
  19. rdworks/predefined/Astex_RO3.xml +16 -0
  20. rdworks/predefined/Baell2010_PAINS/Baell2010A.xml +52 -0
  21. rdworks/predefined/Baell2010_PAINS/Baell2010B.xml +169 -0
  22. rdworks/predefined/Baell2010_PAINS/Baell2010C.xml +1231 -0
  23. rdworks/predefined/Baell2010_PAINS/PAINS-less-than-015-hits.xml +2048 -0
  24. rdworks/predefined/Baell2010_PAINS/PAINS-less-than-150-hits.xml +278 -0
  25. rdworks/predefined/Baell2010_PAINS/PAINS-more-than-150-hits.xml +83 -0
  26. rdworks/predefined/Baell2010_PAINS/makexml.py +70 -0
  27. rdworks/predefined/Brenk2008_Dundee/makexml.py +21 -0
  28. rdworks/predefined/CNS.xml +18 -0
  29. rdworks/predefined/ChEMBL_Walters/BMS.xml +543 -0
  30. rdworks/predefined/ChEMBL_Walters/Dundee.xml +318 -0
  31. rdworks/predefined/ChEMBL_Walters/Glaxo.xml +168 -0
  32. rdworks/predefined/ChEMBL_Walters/Inpharmatica.xml +276 -0
  33. rdworks/predefined/ChEMBL_Walters/LINT.xml +174 -0
  34. rdworks/predefined/ChEMBL_Walters/MLSMR.xml +351 -0
  35. rdworks/predefined/ChEMBL_Walters/PAINS.xml +1446 -0
  36. rdworks/predefined/ChEMBL_Walters/SureChEMBL.xml +501 -0
  37. rdworks/predefined/ChEMBL_Walters/makexml.py +40 -0
  38. rdworks/predefined/Hann1999_Glaxo/Hann1999.xml +168 -0
  39. rdworks/predefined/Hann1999_Glaxo/Hann1999Acid.xml +102 -0
  40. rdworks/predefined/Hann1999_Glaxo/Hann1999Base.xml +6 -0
  41. rdworks/predefined/Hann1999_Glaxo/Hann1999ElPh.xml +6 -0
  42. rdworks/predefined/Hann1999_Glaxo/Hann1999NuPh.xml +6 -0
  43. rdworks/predefined/Hann1999_Glaxo/makexml.py +83 -0
  44. rdworks/predefined/Kazius2005/Kazius2005.xml +114 -0
  45. rdworks/predefined/Kazius2005/makexml.py +66 -0
  46. rdworks/predefined/ZINC_druglike.xml +24 -0
  47. rdworks/predefined/ZINC_fragment.xml +14 -0
  48. rdworks/predefined/ZINC_leadlike.xml +15 -0
  49. rdworks/predefined/fragment.xml +7 -0
  50. rdworks/predefined/ionized/simple_smarts_pattern.csv +57 -0
  51. rdworks/predefined/ionized/smarts_pattern.csv +107 -0
  52. rdworks/predefined/misc/makexml.py +119 -0
  53. rdworks/predefined/misc/reactive-part-2.xml +104 -0
  54. rdworks/predefined/misc/reactive-part-3.xml +74 -0
  55. rdworks/predefined/misc/reactive.xml +321 -0
  56. rdworks/readin.py +312 -0
  57. rdworks/rgroup.py +2173 -0
  58. rdworks/scaffold.py +520 -0
  59. rdworks/std.py +143 -0
  60. rdworks/stereoisomers.py +127 -0
  61. rdworks/tautomers.py +20 -0
  62. rdworks/units.py +63 -0
  63. rdworks/utils.py +495 -0
  64. rdworks/xml.py +260 -0
  65. rdworks-0.25.7.dist-info/METADATA +37 -0
  66. rdworks-0.25.7.dist-info/RECORD +69 -0
  67. rdworks-0.25.7.dist-info/WHEEL +5 -0
  68. rdworks-0.25.7.dist-info/licenses/LICENSE +21 -0
  69. rdworks-0.25.7.dist-info/top_level.txt +1 -0
rdworks/scaffold.py ADDED
@@ -0,0 +1,520 @@
1
+
2
+ """
3
+ This module provides functions to break a molecule into scaffolds.
4
+ """
5
+
6
+ import collections
7
+ import operator
8
+ import itertools
9
+
10
+ from typing import Any, Optional, List, Tuple
11
+
12
+ from rdkit import Chem
13
+ from rdkit.Chem import rdMolDescriptors
14
+ from rdkit.Chem.Scaffolds import MurckoScaffold
15
+ from rdkit.Chem import BRICS, AllChem
16
+
17
+ from .std import desalt_smiles
18
+
19
+
20
def remove_exocyclic(rdmol:Chem.Mol) -> Chem.Mol:
    """Strips ring-free terminal side chains and returns the ring-bearing frame.

    Every bond that joins a non-ring atom to a ring atom is first cut on its
    own as a trial. The bond is marked for removal when at least one of the
    resulting pieces has no ring. All marked bonds are then cut together and
    the piece with the highest ring count is returned.

    The original documentation describes the result as equivalent to
    `MurckoScaffold.GetScaffoldForMol(mol)`.

    Args:
        rdmol (Chem.Mol): input molecule.

    Returns:
        Chem.Mol: the piece with the most rings, re-parsed from its SMILES.
    """

    def _pieces_and_ring_counts(bond_indices):
        # cut without dummy atoms, then re-parse every dot-separated piece
        cut = Chem.FragmentOnBonds(rdmol, bond_indices, addDummies=False)
        pieces = [Chem.MolFromSmiles(s) for s in Chem.MolToSmiles(cut).split(".")]
        return pieces, [rdMolDescriptors.CalcNumRings(p) for p in pieces]

    # atom pairs where a non-ring atom meets a ring atom
    chain_ring_pairs = rdmol.GetSubstructMatches(Chem.MolFromSmarts('[!R][R]'))

    bonds_to_cut = []
    for pair in chain_ring_pairs:
        bond_idx = rdmol.GetBondBetweenAtoms(pair[0], pair[1]).GetIdx()
        _, ring_counts = _pieces_and_ring_counts([bond_idx])
        # one side of the cut carries no ring: it is a terminal side chain
        if 0 in ring_counts:
            bonds_to_cut.append(bond_idx)

    pieces, ring_counts = _pieces_and_ring_counts(bonds_to_cut)
    ranked = list(zip(pieces, ring_counts))
    # stable sort: among equal ring counts the earlier piece stays first
    ranked.sort(key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
53
+
54
+
55
def get_attached_linkers(mol:Chem.Mol) -> Any:
    """Get linkers (connected non-ring atoms) between rings.

    Each linker is a tuple of atom indices laid out as
    `(ring atom, non-ring atom, ..., non-ring atom, ring atom)`, i.e. the
    chain of non-ring atoms framed by the ring atom it is attached to at
    either end.

    Args:
        mol (Chem.Mol): input molecule.

    Returns:
        Any: list of linkers (tuples of atom indices).
    """
    # a set keeps the per-path membership test below O(1) per atom
    non_ring_atoms = {t[0] for t in mol.GetSubstructMatches(Chem.MolFromSmarts('[!R]'))}
    # (non-ring atom, ring atom) pairs: the spots where a chain meets a ring
    non_ring_atoms_attached = mol.GetSubstructMatches(Chem.MolFromSmarts('[!R][R]'))

    attached_linkers = []
    for (aj, ai), (ak, aii) in itertools.combinations(non_ring_atoms_attached, 2):
        try:
            jk = Chem.GetShortestPath(mol, aj, ak)  # tuple of atom indices
        except Exception:
            # NOTE(review): presumably raised when aj == ak (one non-ring atom
            # bridging two rings); such single-atom linkers are skipped here
            # -- confirm that this is intended.
            continue
        # all atoms along the path should be non ring atoms
        if all(i in non_ring_atoms for i in jk):
            attached_linkers.append((ai,) + jk + (aii,))
    return attached_linkers
78
+
79
+
80
def breakup(parents:Any, maxChildren:Optional[int]=None, verbose:bool=False) -> List:
    """Breaks up parents recursively and returns a list of scaffolds.

    On the initial call `parents` is a molecule. It is reduced to its Murcko
    scaffold, rewritten as a non-isomeric canonical SMILES and re-parsed, and
    wrapped into a one-element list of entries. On recursive calls `parents`
    is already a list of entries. For every parent, each ring is removed in
    turn to produce candidate children; candidates are ranked by a priority
    tuple (the numbered rules in the comments below) and the best
    `maxChildren` of them are broken up again.

    Examples:
        Shape of one returned entry:
        (rdmol, 'O=C(CCCc1ccccc1)N1CCn2cnnc2C1', 3,
         ((6, 7, 8, 9, 10, 5), (12, 13, 14, 18, 19, 11), (15, 14, 18, 17, 16)), ())

    Args:
        parents (Any): Chem.Mol object at first but changes during recursive calls
        maxChildren (int, optional): max number of children
            maxChildren = None --> scaffold network methods
            maxChildren = 1 --> scaffold tree methods
        verbose: print out children info

    Returns:
        [(rdmol, smiles, nr, rings_indices, other_info), ... ]
        An empty list is returned when the initial molecule has no ring or
        cannot be processed.
    """

    if not isinstance(parents, list):  # at initial call
        if isinstance(parents, Chem.Mol):
            parent = Chem.Mol(parents)
            try:
                # remove exocyclic group(s)
                parent = MurckoScaffold.GetScaffoldForMol(parent)
                # isomericSmiles = False
                # (1) enables robust canonicalization in RDKit
                # (2) removes stereochemistry to make offsprings non-chiral
                #     because preserving correct stereochemistry during breaking up
                #     is difficult and appears to have no/little meaning
                smiles = Chem.MolToSmiles(parent, canonical=True, isomericSmiles=False)

                # parent molecule reflects the SMILES
                # all children will be affected by this
                parent = Chem.MolFromSmiles(smiles)

                rings = parent.GetRingInfo().AtomRings()
                nr = len(rings)
                priority = ()

                # return empty list if molecule has no ring
                if nr == 0:
                    return []

                if verbose:
                    print((nr,smiles,))

                parents = [(parent,smiles,nr,rings,priority)]

            except Exception:
                # best effort: a molecule that cannot be processed yields nothing
                return []

    children = []
    for parent,smiles,nr,rings,priority in parents:
        # terminate recursion if parents have only one ring or more than 10 rings
        if nr == 1 or nr > 10:
            return parents
        # flatten atom index in all rings
        atomsInRings = [ai for ring in rings for ai in ring]
        # avoid removing atoms shared between two or more rings
        atomsShared = [ai for ai, count in collections.Counter(atomsInRings).items() if count > 1]
        fused_rings = sum(1 for ring in rings if len(set(ring).intersection(atomsShared)) > 0)
        # terminate if parents have only one big fused ring system such that
        # every ring has at least one shared atom
        remove_linker_enforced = False
        if nr > 5:
            if nr == fused_rings:  # all rings are fused
                return parents
            else:
                remove_linker_enforced = True
        # number of aromatic rings
        nar = sum(1 for ring in rings if parent.GetAtomWithIdx(ring[0]).GetIsAromatic())
        # linkers that are attached to rings
        attached_linkers = get_attached_linkers(parent)

        for ring in rings:

            removed_ring_size = len(ring)
            removed_ring_3 = 1 if removed_ring_size == 3 else 0
            removed_ring_356 = 1 if removed_ring_size in [3,5,6] else 0
            removed_macrocycle = 1 if removed_ring_size >= 12 else 0

            atomsToRemain = [ai for ai in ring if ai in atomsShared]
            atomsToRemove = [ai for ai in ring if ai not in atomsToRemain]

            # there is nothing to do when there is no atoms to remove
            # no child will be added to children
            # retain bridged rings, spiro rings, and nonlinear ring fusion patterns
            if not atomsToRemove:
                continue

            # Rule 3 - choose the parent scaffold having the smallest number of acyclic linker bonds
            # if isolated ring is to be removed
            if len(atomsToRemove) == removed_ring_size:
                # linker has two ring atoms at both ends
                removed_linker_size_list = [
                    len(l)-2 for l in attached_linkers if l[0] in ring or l[-1] in ring
                    ]
                removed_linkers = len(removed_linker_size_list)
                if removed_linkers == 1:
                    removed_linker_size = removed_linker_size_list[0]
                elif removed_linkers > 1:
                    continue  # it will break the molecule
                else:
                    removed_linker_size = 0
            else:
                removed_linker_size = -1
                # for large, not fully fused parents only linker-attached
                # (isolated) rings are removed
                if remove_linker_enforced:
                    continue

            # heteroatom count
            removed_ring_hac = sum(
                1 for ai in ring if parent.GetAtomWithIdx(ai).GetSymbol() not in ["C","H"])

            # get exocyclic double bonded atom index
            exo = []
            for ai in atomsToRemove:
                for b in parent.GetAtomWithIdx(ai).GetBonds():
                    if b.GetBondType() == Chem.BondType.DOUBLE:
                        # one of two indexes should be ai (ring atom)
                        # and should be removed
                        # remove exocyclic double bonded atoms together
                        # unless these atoms belong to another ring
                        if ai == b.GetBeginAtomIdx():
                            if b.GetEndAtomIdx() not in atomsInRings:
                                exo += [ b.GetEndAtomIdx() ]
                        else:
                            if b.GetBeginAtomIdx() not in atomsInRings:
                                exo += [ b.GetBeginAtomIdx() ]
            # remove exocyclic double bonded atoms as well
            atomsToRemove += exo
            # make sure to remove an atom with bigger index number first
            # RDKit will renumber after every RemoveAtom() so
            # remove from highest to lowest atom index.
            atomsToRemove.sort(reverse=True)

            # use Chem.RWMol to preserve the original parent
            rwmol = Chem.RWMol(parent)

            # remaining ring atoms that lose a neighbour get one explicit hydrogen
            explictHs = []
            for ai in atomsToRemain:
                for b in parent.GetAtomWithIdx(ai).GetBonds():
                    j,k = b.GetBeginAtomIdx(), b.GetEndAtomIdx()
                    if j in atomsToRemove or k in atomsToRemove:
                        explictHs.append(rwmol.GetAtomWithIdx(ai))

            # Rule 1 - remove heterocycles of size 3 first
            #   the fusion bond connecting the three-membered ring with other rings is
            #   converted into a double bond
            # Rule 2 - do not remove rings with >= 12 atoms if there are still smaller rings to remove
            # Rule 6 - remove rings of sizes 3, 5, and 6 first
            # Rule 10 - smaller rings are removed first
            if removed_ring_size == 3 and len(atomsToRemove) == 1 and \
                parent.GetAtomWithIdx(atomsToRemove[0]).GetSymbol() in ["O","N"]:
                # fused three-membered ring, epoxides and aziridines
                # The fusion bond is the one between the two atoms of THIS ring
                # that stay behind (atomsToRemain). Using the first two entries
                # of atomsShared would pick atoms of an unrelated fused system
                # whenever the molecule has more than one fusion.
                fusion_a, fusion_b = atomsToRemain[0], atomsToRemain[1]
                # removing an atom changes atom indexes
                # so it should be done at the end
                rwmol.RemoveBond(fusion_a, fusion_b)
                rwmol.AddBond(fusion_a, fusion_b, order=Chem.BondType.DOUBLE)
                rwmol.RemoveAtom(atomsToRemove[0])
            else:
                for ai in atomsToRemove:
                    rwmol.RemoveAtom(ai)
            for a in explictHs:
                a.SetNumExplicitHs(1)

            try:
                # get the modified molecule
                child = rwmol.GetMol()
                child_smiles = Chem.MolToSmiles(child, canonical=True, isomericSmiles=False)
            except Exception:
                continue
            # ring removal should not break a molecule into pieces
            # (explicit test instead of assert so it survives `python -O`)
            if "." in child_smiles:
                continue

            try:
                Chem.SanitizeMol(child)
            except Exception:
                continue

            try:
                # discard all the exocyclic groups of the child
                child = MurckoScaffold.GetScaffoldForMol(child)
                child_smiles = Chem.MolToSmiles(child, canonical=True, isomericSmiles=False)
            except Exception:
                continue
            # keep only non-empty, non-redundant child
            if not child_smiles or any(c[1] == child_smiles for c in children):
                continue

            child_getRings = child.GetRingInfo()
            child_rings = child_getRings.AtomRings()
            child_nr = len(child_rings)
            child_atomsInRings = [ai for child_ring in child_rings for ai in child_ring]
            child_atomsShared = [
                ai for ai, count in collections.Counter(child_atomsInRings).items() if count > 1]
            child_fused_rings = sum(
                1 for child_ring in child_rings
                if len(set(child_ring).intersection(child_atomsShared)) > 0)

            # bridged compounds have two or more rings (a ring system) that
            #   contains a bridge - a single atom or an unbranched chain of atoms
            # fused ring compounds have two rings linked by two adjacent atoms
            # spiro compounds have two rings linked by a single atom
            if nr == fused_rings and child_nr == child_fused_rings:
                # Rule 4 - retain bridged rings, spiro rings,
                #   and nonlinear ring fusion patterns with preference
                # Rule 5 - Bridged ring systems are retained with preference
                #   over spiro ring systems
                child_ring_bonds = child_getRings.BondRings()
                # flatten bond index in all rings
                child_bondsInRings = [bi for child_bonds in child_ring_bonds for bi in child_bonds]
                # bond shared between two or more rings
                # the more bridges or nonlinear ring fusions there are, the higher the nrrb
                # nrrb decreases if there are spiro connected ring systems
                child_bondsShared = [
                    bi for bi, count in collections.Counter(child_bondsInRings).items() if count > 1]
                child_nrrb = len(child_bondsShared)
                child_delta = child_nrrb - (child_nr-1)
                child_delta_abs = abs(child_delta)
            else:
                child_delta = 0
                child_delta_abs = 0

            # Rule 7 - a fully aromatic ring system must not be dissected
            #   in a way that the resulting system is not aromatic any more
            # Rule 11 - for mixed aromatic/non-aromatic ring systems,
            #   retain non-aromatic rings with priority

            # number of aromatic rings
            child_nar = sum(
                1 for child_ring in child_rings
                if child.GetAtomWithIdx(child_ring[0]).GetIsAromatic())

            if nr == nar:
                removed_aromaticity = 0 if child_nr == child_nar else 1
            else:
                removed_aromaticity = 0

            # Rule 12 - remove rings first where the linker is attached to
            #   a ring heteroatom at either end of the linker
            # Rule 8 - remove rings with the least number of heteroatoms first
            # Rule 9 - if the number of heteroatoms is equal,
            #   the priority of heteroatoms to retain is N > O > S
            # ord("X") is the fallback when the child rings hold no N, O or S
            child_ring_hetatom = max(
                (ord(child.GetAtomWithIdx(ai).GetSymbol())
                 for child_ring in child_rings for ai in child_ring
                 if child.GetAtomWithIdx(ai).GetSymbol() in ["N","O","S"]),
                default=ord("X"))

            children.append((child,             #0
                             child_smiles,      #1
                             child_nr,          #2
                             child_rings,       #3
                             (                  #4
                                removed_ring_3,         #rule 1
                                -removed_macrocycle,    #rule 2
                                removed_linker_size,    #rule 3
                                child_delta_abs,        #rule 4
                                child_delta,            #rule 5
                                removed_ring_356,       #rule 6
                                -removed_aromaticity,   #rule 7
                                -removed_ring_hac,      #rule 8
                                -child_ring_hetatom,    #rule 9
                                -child_nar,             #rule 11
                                child_smiles,           #rule 12 - tie breaker
                             ),
                            ))
    if children:
        # highest priority tuple first
        children = sorted(children, key=operator.itemgetter(4), reverse=True)
        if verbose:
            for d in children:
                print(d[2],d[1],d[-1])
            print("-"*40)
        # limit the number of children if needed
        # maxChildren = None --> scaffold network methods
        # maxChildren = 1 --> scaffold tree methods
        children = children[:maxChildren]
        # do this recursively until one ring remains
        return parents + breakup(children, maxChildren, verbose)
    else:
        # terminate when there is nothing to break up
        return parents
368
+
369
+
370
def scaffold_tree(rdmol:Chem.Mol) -> List[Chem.Mol]:
    """Builds the scaffold tree of a molecule.

    Calls `breakup` with `maxChildren=1`, so a single child is kept at every
    level of the break-up.

    Args:
        rdmol (Chem.Mol): input molecule.

    Returns:
        List[Chem.Mol]: the input molecule itself, followed by the molecule
            of every entry returned by `breakup`.
    """
    entries = breakup(rdmol, maxChildren=1)
    # each entry is (molecule, smiles, ring count, ring atom indices, priority)
    scaffolds = [mol for (mol, _smiles, _nr, _ring_indices, _other) in entries]
    return [rdmol] + scaffolds
384
+
385
+
386
def scaffold_network(rdmol:Chem.Mol) -> List[Chem.Mol]:
    """Builds the scaffold network of a molecule.

    Calls `breakup` with `maxChildren=None`, so every child is kept at every
    level of the break-up.

    Args:
        rdmol (Chem.Mol): input molecule.

    Returns:
        List[Chem.Mol]: the input molecule itself, followed by the molecule
            of every entry returned by `breakup`.
    """
    entries = breakup(rdmol, maxChildren=None)
    # each entry is (molecule, smiles, ring count, ring atom indices, priority)
    scaffolds = [mol for (mol, _smiles, _nr, _ring_indices, _other) in entries]
    return [rdmol] + scaffolds
400
+
401
+
402
def BRICS_fragmented(rdmol:Chem.Mol,
                     min_atoms:Optional[int]=None,
                     max_atoms:Optional[int]=None) -> List[Chem.Mol]:
    """Perform BRICS decomposition and returns fragmented molecules.

    The attachment-point dummy atoms of every fragment are replaced with
    hydrogens, which are then removed, so the returned fragments carry no
    dummy atoms.

    Args:
        rdmol (Chem.Mol): input molecule.
        min_atoms (Optional[int], optional): min number of atoms for a fragment. Defaults to None.
        max_atoms (Optional[int], optional): max number of atoms for a fragment. Defaults to None.

    Returns:
        List[Chem.Mol]: a list of fragmented molecules, ordered by the
            fragment SMILES produced by the decomposition.
    """
    dummy = Chem.MolFromSmiles('*')
    hydro = Chem.MolFromSmiles('[H]')
    frag_smiles_set = BRICS.BRICSDecompose(Chem.Mol(rdmol))
    # ex. ['[14*]c1ccccn1', '[16*]c1cccc([16*])c1', '[3*]O[3*]', '[4*]CCC', '[4*]C[8*]']

    lfrag_rdmol = []
    # presumably an unordered collection (the name says "set"); sorting makes
    # the order of the returned fragments reproducible between runs
    for frag_smi in sorted(frag_smiles_set):
        (_, frag_rdmol) = desalt_smiles(frag_smi)
        if frag_rdmol is None:
            # the fragment SMILES could not be parsed; nothing to convert
            continue
        # replace dummy atom(s) with [H]
        frag_rdmol_H = AllChem.ReplaceSubstructs(frag_rdmol, dummy, hydro, True)[0]
        frag_rdmol = Chem.RemoveHs(frag_rdmol_H)
        # filter out molecules which are too small or too big
        na = frag_rdmol.GetNumAtoms()
        if (min_atoms and na < min_atoms) or (max_atoms and na > max_atoms):
            continue
        lfrag_rdmol.append(frag_rdmol)
    return lfrag_rdmol
433
+
434
+
435
def depth_first_search(rdatom:Chem.Atom, origin_atom:Chem.Atom,
                       end_idx:int, group:List[int], BRICS_bonds:List[List[int]]) -> None:
    """Does recursive depth-first search, collecting atom indices into `group`.

    Starting from `rdatom`, every neighbor is visited unless it is the atom
    `end_idx`, is already in `group`, or is reached across one of the bonds
    in `BRICS_bonds`. Visited neighbor indices are appended to `group`.

    Args:
        rdatom (Chem.Atom): input atom.
        origin_atom (Chem.Atom): origin atom (the atom the search came from).
        end_idx (int): index of the atom that must not be entered.
        group (List[int]): group to be appended by the function (mutated in place).
        BRICS_bonds (List[List[int]]): bonds that must not be crossed. Each bond
            must be a sorted two-element list of atom indices, because membership
            is tested with `sorted([i, j])`, which yields a list.

    Returns:
        None: the result is delivered through `group`.
    """
    bonded_atoms = rdatom.GetNeighbors()
    # dead end: the only neighbor is the atom we came from
    # NOTE(review): this relies on `==` between two Chem.Atom objects; confirm
    # it compares the underlying atom and not the Python wrapper identity.
    if (len(bonded_atoms) == 1) and (bonded_atoms[0] == origin_atom):
        return
    current_idx = rdatom.GetIdx()
    for atom in bonded_atoms:
        idx = atom.GetIdx()
        if (idx == end_idx) or (idx in group) or (sorted([current_idx, idx]) in BRICS_bonds):
            continue
        group.append(idx)
        depth_first_search(atom, rdatom, end_idx, group, BRICS_bonds)
458
+
459
+
460
def BRICS_fragment_indices(rdmol:Chem.Mol) -> List[List[int]]:
    """Collects the atom indices of the fragments/scaffolds delimited by BRICS bonds.

    For each BRICS bond and for each of its two ends, a depth-first search is
    started from the neighbors of that end (excluding the atom across the
    bond) and the atoms reached are recorded as one group. Duplicate groups
    are dropped. When the molecule has no BRICS bond, the single group of all
    atoms is returned.

    Args:
        rdmol (Chem.Mol): input molecule.

    Returns:
        List[List[int]]: sorted atom-index groups, largest group first.
    """
    # each found item carries the bonded atom pair first; keep it as a sorted list
    cut_bonds = [sorted(found[0]) for found in BRICS.FindBRICSBonds(rdmol)]

    if not cut_bonds:
        # nothing to cut: one group holding every atom
        groups = [[atom.GetIdx() for atom in rdmol.GetAtoms()]]
    else:
        groups = []
        for first_idx, second_idx in cut_bonds:
            # walk away from the bond on both of its sides
            for near_idx, far_idx in ((first_idx, second_idx), (second_idx, first_idx)):
                members = []
                near_atom = rdmol.GetAtomWithIdx(near_idx)
                # NOTE(review): the search is seeded from the neighbors, not from
                # near_atom itself; verify the result for an atom whose bonds are
                # all BRICS bonds.
                for neighbor in near_atom.GetNeighbors():
                    if neighbor.GetIdx() != far_idx:
                        depth_first_search(neighbor, near_atom, far_idx, members, cut_bonds)
                members.sort()
                if members not in groups:
                    groups.append(members)

    groups.sort(key=len, reverse=True)
    return groups
486
+
487
+
488
def rigid_fragment_indices(rdmol:Chem.Mol) -> List[List[int]]:
    """Breaks a molecule at each rotatable bond and returns atom indices of fragments.

    Rotatable bonds are found with a SMARTS pattern. For each such bond and
    for each of its two ends, a depth-first search is started from the
    neighbors of that end (excluding the atom across the bond) and the atoms
    reached are recorded as one group. Duplicate groups are dropped. When
    there is no rotatable bond, the single group of all atoms is used.
    Hydrogen atoms are removed from every group before returning.

    Args:
        rdmol (Chem.Mol): input molecule.

    Returns:
        List[List[int]]: atom-index groups without hydrogens, largest group first.
    """
    rotor_query = Chem.MolFromSmarts('[!$(*#*)&!D1]-&!@[!$(*#*)&!D1]')
    rotors = [sorted(match) for match in rdmol.GetSubstructMatches(rotor_query)]

    if not rotors:
        # nothing to cut: one group holding every atom
        groups = [[atom.GetIdx() for atom in rdmol.GetAtoms()]]
    else:
        groups = []
        for first_idx, second_idx in rotors:
            # walk away from the bond on both of its sides
            for near_idx, far_idx in ((first_idx, second_idx), (second_idx, first_idx)):
                members = []
                near_atom = rdmol.GetAtomWithIdx(near_idx)
                # NOTE(review): the search is seeded from the neighbors, not from
                # near_atom itself; verify the result for an atom whose bonds are
                # all rotatable bonds.
                for neighbor in near_atom.GetNeighbors():
                    if neighbor.GetIdx() != far_idx:
                        depth_first_search(neighbor, near_atom, far_idx, members, rotors)
                members.sort()
                if members not in groups:
                    groups.append(members)

    # ignore H (atomic number 1)
    heavy_groups = [
        [i for i in members if rdmol.GetAtomWithIdx(i).GetAtomicNum() != 1]
        for members in groups
    ]
    heavy_groups.sort(key=len, reverse=True)
    return heavy_groups
519
+
520
+
rdworks/std.py ADDED
@@ -0,0 +1,143 @@
1
+ import operator
2
+ from typing import Tuple, Union
3
+
4
+ from rdkit import Chem
5
+ from rdkit.Chem.MolStandardize import rdMolStandardize
6
+
7
+
8
def desalt_smiles(smiles:str) -> Tuple[Union[str, None], Union[Chem.Mol, None]]:
    """Returns (desalted SMILES string, rdkit.Chem.Mol).

    The input is split on "." and the component with the most atoms is kept.
    Components that cannot be parsed are ignored. When several components
    share the highest atom count, the first one in the input wins.

    Args:
        smiles (str): input SMILES string.

    Returns:
        Tuple[Union[str, None], Union[Chem.Mol, None]]: (desalted SMILES, desalted rdkit.Chem.Mol),
            or (None, None) when no component could be parsed.
    """
    mols = []
    for smi in smiles.split("."):
        rdmol = Chem.MolFromSmiles(smi)
        if rdmol is None:
            # unparsable component: skip it explicitly instead of relying on
            # a swallowed AttributeError
            continue
        mols.append((rdmol.GetNumAtoms(), smi, rdmol))

    if not mols:
        return (None, None)

    # Compare on the atom count only. Letting tuples compare in full would
    # fall through to the Chem.Mol objects, which do not support ordering.
    (_, desalted_smiles, desalted_rdmol) = max(mols, key=operator.itemgetter(0))
    return (desalted_smiles, desalted_rdmol)
34
+
35
+
36
def standardize_smiles(smiles:str) -> str:
    """Standardizes a SMILES string with `rdMolStandardize.StandardizeSmiles()`.

    According to the original notes of this module, that function performs
    the following steps in order:

    1. mol = Chem.MolFromSmiles(sm)
    2. Chem.SanitizeMol(mol)
    3. mol = Chem.RemoveHs(mol)
    4. mol = rdMolStandardize.MetalDisconnector().Disconnect(mol)
    5. mol = rdMolStandardize.Normalize(mol)
    6. mol = rdMolStandardize.Reionize(mol)
    7. Chem.AssignStereochemistry(mol, force=True, cleanIt=True)
    8. Chem.MolToSmiles(mol)

    See [rdkit notebook](https://github.com/rdkit/rdkit/blob/master/Docs/Notebooks/MolStandardize.ipynb) and
    [greg's notebook](https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/Standardization%20and%20Validation%20with%20the%20RDKit.ipynb),
    and [youtube video](https://www.youtube.com/watch?v=eWTApNX8dJQ).

    Args:
        smiles (str): input SMILES string.

    Returns:
        str: standardized SMILES string.
    """
    standardized = rdMolStandardize.StandardizeSmiles(smiles)
    return standardized
63
+
64
+
65
def standardize(smiles:str) -> Chem.Mol:
    """Builds a standardized rdkit.Chem.Mol object from a SMILES string.

    The molecule is parsed, cleaned up, reduced to its parent fragment,
    uncharged, and finally converted to its canonical tautomer. The steps
    follow
    https://github.com/greglandrum/RSC_OpenScience_Standardization_202104/blob/main/MolStandardize%20pieces.ipynb

    Args:
        smiles (str): input SMILES string.

    Returns:
        Chem.Mol: standardized rdkit.Chem.Mol object.
    """
    parsed = Chem.MolFromSmiles(smiles)

    # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
    cleaned = rdMolStandardize.Cleanup(parsed)

    # if many fragments, get the "parent" (the actual mol we are interested in)
    parent = rdMolStandardize.FragmentParent(cleaned)

    # try to neutralize the molecule; no convenience function exists, so an
    # Uncharger instance is needed
    neutral = rdMolStandardize.Uncharger().uncharge(parent)

    # Note that no attempt is made at reionization at this step, nor at
    # ionization at some pH (rdkit has no pKa calculator). The main aim is to
    # represent all molecules from different sources in a (single) standard
    # way, for use in ML, catalogue, etc.

    # same here: the canonical tautomer needs a TautomerEnumerator instance
    canonical_tautomer = rdMolStandardize.TautomerEnumerator().Canonicalize(neutral)
    return canonical_tautomer
98
+
99
+
100
def neutralize_atoms(rdmol:Chem.Mol) -> Chem.Mol:
    """Neutralizes charged atoms one by one on a copy of the molecule.

    Adapted from Noel O'Boyle's nocharge code:
    [rdkit cookbook](https://www.rdkit.org/docs/Cookbook.html),
    [no charge](https://baoilleach.blogspot.com/2019/12/no-charge-simple-approach-to.html).
    Atoms with a +1 or -1 charge are neutralized by removing or adding a
    hydrogen where possible. The SMARTS pattern requires a hydrogen on +1
    atoms, and skips +1 atoms with a negatively charged neighbor as well as
    -1 atoms with a positively charged neighbor, so that charge-separated
    groups (e.g., nitro groups) are left untouched.

    This differs from rdMolStandardize.Uncharger. See the
    [MolVS documentation for Uncharger](https://molvs.readthedocs.io/en/latest/api.html#molvs-charge):

    > This class uncharges molecules by adding and/or removing hydrogens.
      In cases where there is a positive charge that is not neutralizable,
      any corresponding negative charge is also preserved.

    For example, rdMolStandardize.Uncharger will not change the charges on
    C[N+](C)(C)CCC([O-])=O, because the positive charge is not neutralizable.
    neutralize_atoms() neutralizes every atom it can (here giving
    C[N+](C)(C)CCC(=O)O), even if that leaves an overall formal charge on
    the molecule.

    Args:
        rdmol (rdkit.Chem.Mol) : input molecule.

    Returns:
        Chem.Mol: a copy of neutralized rdkit.Chem.Mol object.
    """
    neutral = Chem.Mol(rdmol)
    charged_query = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")
    # the query has a single atom, so each match holds one atom index
    for match in neutral.GetSubstructMatches(charged_query):
        atom = neutral.GetAtomWithIdx(match[0])
        charge = atom.GetFormalCharge()
        n_hydrogens = atom.GetTotalNumHs()
        atom.SetFormalCharge(0)
        # +1 atoms give up one hydrogen, -1 atoms take one
        atom.SetNumExplicitHs(n_hydrogens - charge)
        atom.UpdatePropertyCache()
    return neutral