rdworks 0.25.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. rdworks/__init__.py +35 -0
  2. rdworks/autograph/__init__.py +4 -0
  3. rdworks/autograph/autograph.py +184 -0
  4. rdworks/autograph/centroid.py +90 -0
  5. rdworks/autograph/dynamictreecut.py +135 -0
  6. rdworks/autograph/nmrclust.py +123 -0
  7. rdworks/autograph/rckmeans.py +74 -0
  8. rdworks/bitqt/__init__.py +1 -0
  9. rdworks/bitqt/bitqt.py +355 -0
  10. rdworks/conf.py +374 -0
  11. rdworks/descriptor.py +36 -0
  12. rdworks/display.py +206 -0
  13. rdworks/ionized.py +170 -0
  14. rdworks/matchedseries.py +260 -0
  15. rdworks/mol.py +1522 -0
  16. rdworks/mollibr.py +887 -0
  17. rdworks/pka.py +38 -0
  18. rdworks/predefined/Asinex_fragment.xml +20 -0
  19. rdworks/predefined/Astex_RO3.xml +16 -0
  20. rdworks/predefined/Baell2010_PAINS/Baell2010A.xml +52 -0
  21. rdworks/predefined/Baell2010_PAINS/Baell2010B.xml +169 -0
  22. rdworks/predefined/Baell2010_PAINS/Baell2010C.xml +1231 -0
  23. rdworks/predefined/Baell2010_PAINS/PAINS-less-than-015-hits.xml +2048 -0
  24. rdworks/predefined/Baell2010_PAINS/PAINS-less-than-150-hits.xml +278 -0
  25. rdworks/predefined/Baell2010_PAINS/PAINS-more-than-150-hits.xml +83 -0
  26. rdworks/predefined/Baell2010_PAINS/makexml.py +70 -0
  27. rdworks/predefined/Brenk2008_Dundee/makexml.py +21 -0
  28. rdworks/predefined/CNS.xml +18 -0
  29. rdworks/predefined/ChEMBL_Walters/BMS.xml +543 -0
  30. rdworks/predefined/ChEMBL_Walters/Dundee.xml +318 -0
  31. rdworks/predefined/ChEMBL_Walters/Glaxo.xml +168 -0
  32. rdworks/predefined/ChEMBL_Walters/Inpharmatica.xml +276 -0
  33. rdworks/predefined/ChEMBL_Walters/LINT.xml +174 -0
  34. rdworks/predefined/ChEMBL_Walters/MLSMR.xml +351 -0
  35. rdworks/predefined/ChEMBL_Walters/PAINS.xml +1446 -0
  36. rdworks/predefined/ChEMBL_Walters/SureChEMBL.xml +501 -0
  37. rdworks/predefined/ChEMBL_Walters/makexml.py +40 -0
  38. rdworks/predefined/Hann1999_Glaxo/Hann1999.xml +168 -0
  39. rdworks/predefined/Hann1999_Glaxo/Hann1999Acid.xml +102 -0
  40. rdworks/predefined/Hann1999_Glaxo/Hann1999Base.xml +6 -0
  41. rdworks/predefined/Hann1999_Glaxo/Hann1999ElPh.xml +6 -0
  42. rdworks/predefined/Hann1999_Glaxo/Hann1999NuPh.xml +6 -0
  43. rdworks/predefined/Hann1999_Glaxo/makexml.py +83 -0
  44. rdworks/predefined/Kazius2005/Kazius2005.xml +114 -0
  45. rdworks/predefined/Kazius2005/makexml.py +66 -0
  46. rdworks/predefined/ZINC_druglike.xml +24 -0
  47. rdworks/predefined/ZINC_fragment.xml +14 -0
  48. rdworks/predefined/ZINC_leadlike.xml +15 -0
  49. rdworks/predefined/fragment.xml +7 -0
  50. rdworks/predefined/ionized/simple_smarts_pattern.csv +57 -0
  51. rdworks/predefined/ionized/smarts_pattern.csv +107 -0
  52. rdworks/predefined/misc/makexml.py +119 -0
  53. rdworks/predefined/misc/reactive-part-2.xml +104 -0
  54. rdworks/predefined/misc/reactive-part-3.xml +74 -0
  55. rdworks/predefined/misc/reactive.xml +321 -0
  56. rdworks/readin.py +312 -0
  57. rdworks/rgroup.py +2173 -0
  58. rdworks/scaffold.py +520 -0
  59. rdworks/std.py +143 -0
  60. rdworks/stereoisomers.py +127 -0
  61. rdworks/tautomers.py +20 -0
  62. rdworks/units.py +63 -0
  63. rdworks/utils.py +495 -0
  64. rdworks/xml.py +260 -0
  65. rdworks-0.25.7.dist-info/METADATA +37 -0
  66. rdworks-0.25.7.dist-info/RECORD +69 -0
  67. rdworks-0.25.7.dist-info/WHEEL +5 -0
  68. rdworks-0.25.7.dist-info/licenses/LICENSE +21 -0
  69. rdworks-0.25.7.dist-info/top_level.txt +1 -0
rdworks/ionized.py ADDED
@@ -0,0 +1,170 @@
1
+ import importlib.resources
2
+ import pandas as pd
3
+
4
+ from rdkit import Chem
5
+
6
+ # adapted from https://github.com/dptech-corp/Uni-pKa/enumerator
7
+
8
+ class IonizedStates:
9
+ # Unreasonable chemical structures
10
+ unreasonable_patterns = [
11
+ Chem.MolFromSmarts(s) for s in [
12
+ "[#6X5]",
13
+ "[#7X5]",
14
+ "[#8X4]",
15
+ "[*r]=[*r]=[*r]",
16
+ "[#1]-[*+1]~[*-1]",
17
+ "[#1]-[*+1]=,:[*]-,:[*-1]",
18
+ "[#1]-[*+1]-,:[*]=,:[*-1]",
19
+ "[*+2]",
20
+ "[*-2]",
21
+ "[#1]-[#8+1].[#8-1,#7-1,#6-1]",
22
+ "[#1]-[#7+1,#8+1].[#7-1,#6-1]",
23
+ "[#1]-[#8+1].[#8-1,#6-1]",
24
+ "[#1]-[#7+1].[#8-1]-[C](-[C,#1])(-[C,#1])",
25
+ # "[#6;!$([#6]-,:[*]=,:[*]);!$([#6]-,:[#7,#8,#16])]=[C](-[O,N,S]-[#1])",
26
+ # "[#6]-,=[C](-[O,N,S])(-[O,N,S]-[#1])",
27
+ "[OX1]=[C]-[OH2+1]",
28
+ "[NX1,NX2H1,NX3H2]=[C]-[O]-[H]",
29
+ "[#6-1]=[*]-[*]",
30
+ "[cX2-1]",
31
+ "[N+1](=O)-[O]-[H]",
32
+ ]]
33
+
34
+ smarts_path = importlib.resources.files('rdworks.predefined.ionized')
35
+ protonation_patterns = pd.read_csv(smarts_path / 'simple_smarts_pattern.csv')
36
+
37
+ def __init__(self, smiles:str):
38
+ self.smiles = Chem.CanonSmiles(smiles)
39
+ self.rdmol = Chem.MolFromSmiles(self.smiles)
40
+ self.rdmol_H = Chem.AddHs(self.rdmol)
41
+ self.charge = Chem.GetFormalCharge(self.rdmol_H)
42
+ self.charge_max = 2
43
+ self.charge_min = -2
44
+ # initial states
45
+ self.states = {self.smiles : (self.rdmol_H, self.charge)}
46
+ # initial protonation sites
47
+ self.protonation_sites = {self.smiles : self.set_protonation_sites(self.smiles)}
48
+ # generate initial states
49
+ self.protonate(self.smiles)
50
+
51
+
52
+ def get_protonation_sites(self) -> dict:
53
+ return self.protonation_sites
54
+
55
+
56
+ def get_states_by_charge(self) -> dict:
57
+ self.ensemble()
58
+ data = {}
59
+ for smiles, (romol, charge) in self.states.items():
60
+ if charge in data:
61
+ data[charge].append(smiles)
62
+ else:
63
+ data[charge] = [smiles]
64
+
65
+ return data
66
+
67
+ def get_states(self) -> list:
68
+ return [smiles for smiles in self.states]
69
+
70
+
71
+ def get_states_mol(self) -> list[Chem.Mol]:
72
+ return [romol for smiles, (romol, charge) in self.states.items()]
73
+
74
+
75
+ def get_num_states(self) -> int:
76
+ return len(self.states)
77
+
78
+
79
+ @staticmethod
80
+ def clean_smiles(rdmol:Chem.Mol) -> str:
81
+ Chem.SanitizeMol(rdmol)
82
+ rdmol = Chem.MolFromSmiles(Chem.MolToSmiles(rdmol))
83
+ rdmol_H = Chem.AddHs(rdmol)
84
+ rdmol = Chem.RemoveHs(rdmol_H)
85
+ return Chem.CanonSmiles(Chem.MolToSmiles(rdmol))
86
+
87
+
88
+ @staticmethod
89
+ def set_protonation_sites(smiles:str) -> tuple:
90
+ subject = Chem.MolFromSmiles(smiles)
91
+ subject = Chem.AddHs(subject)
92
+ charge = Chem.GetFormalCharge(subject)
93
+ indices = [] # atom indices of protonation/deprotonation site(s)
94
+ for i, name, smarts, smarts_index, acid_or_base in IonizedStates.protonation_patterns.itertuples():
95
+ pattern = Chem.MolFromSmarts(smarts)
96
+ matches = subject.GetSubstructMatches(pattern)
97
+ # returns a list of tuples, where each tuple contains the indices
98
+ # of the atoms in the molecule that match the substructure query
99
+ # ex. ((1,), (2,), (3,))
100
+ if len(matches) > 0:
101
+ smarts_index = int(smarts_index)
102
+ indices += [(match[smarts_index], acid_or_base) for match in matches]
103
+ return (list(set(indices)), subject, charge)
104
+
105
+
106
+ @staticmethod
107
+ def reasonable(romol:Chem.Mol) -> bool:
108
+ return all([len(romol.GetSubstructMatches(p)) == 0 for p in IonizedStates.unreasonable_patterns])
109
+
110
+
111
+ def protonate(self, smiles:str) -> int:
112
+ num_added_states = 0
113
+
114
+ if smiles not in self.protonation_sites:
115
+ self.protonation_sites[smiles] = self.set_protonation_sites(smiles)
116
+
117
+ (indices, subject, charge) = self.protonation_sites[smiles]
118
+
119
+ if (charge >= self.charge_max) or (charge <= self.charge_min):
120
+ # formal charge will be increased or decreased by protonation/deprotonation
121
+ # so, if the charge of current state is already max or min
122
+ # there is nothing to do
123
+ return num_added_states
124
+
125
+ for (i, acid_or_base) in indices:
126
+ edmol = Chem.RWMol(subject) # edmol preserves Hs
127
+ if acid_or_base == 'A': # de-protonate
128
+ A = edmol.GetAtomWithIdx(i)
129
+ if A.GetAtomicNum() == 1:
130
+ X = A.GetNeighbors()[0] # there must be only one neighbor
131
+ charge = X.GetFormalCharge() - 1
132
+ X.SetFormalCharge(charge)
133
+ edmol.RemoveAtom(i)
134
+ else:
135
+ bonded_H_indices = [ H.GetIdx() for H in A.GetNeighbors() if H.GetAtomicNum() == 1 ]
136
+ nH = len(bonded_H_indices)
137
+ assert nH > 0, f"Cannot deprotonate an atom (idx={i}; no H)"
138
+ charge = A.GetFormalCharge() - 1
139
+ A.SetFormalCharge(charge)
140
+ edmol.RemoveAtom(bonded_H_indices[0])
141
+
142
+ elif acid_or_base == 'B': # protonate
143
+ B = edmol.GetAtomWithIdx(i)
144
+ assert B.GetAtomicNum() > 1, f"Cannot protonate an atom (idx={i}; {B.GetAtomicNum()})"
145
+ charge = B.GetFormalCharge() + 1
146
+ B.SetFormalCharge(charge)
147
+ nH = B.GetNumExplicitHs()
148
+ B.SetNumExplicitHs(nH+1)
149
+ edmol = Chem.AddHs(edmol)
150
+
151
+ # Clean up and save SMILES
152
+ state_smiles = IonizedStates.clean_smiles(edmol)
153
+ state_mol = Chem.MolFromSmiles(state_smiles)
154
+ state_mol = Chem.AddHs(state_mol)
155
+ state_charge = Chem.GetFormalCharge(state_mol)
156
+ if self.reasonable(state_mol):
157
+ if state_smiles in self.states:
158
+ continue
159
+ self.states[state_smiles] = (state_mol, state_charge)
160
+ num_added_states += 1
161
+
162
+ return num_added_states
163
+
164
+
165
+ def ensemble(self) -> None:
166
+ num_added_states = None
167
+ while num_added_states is None or num_added_states > 0:
168
+ states = self.states.copy()
169
+ for smiles in states:
170
+ num_added_states = self.protonate(smiles)
rdworks/matchedseries.py ADDED
@@ -0,0 +1,260 @@
1
+ import os
2
+ import pathlib
3
+ import copy
4
+ import operator
5
+ from collections import defaultdict
6
+ from typing import List, Tuple, Union, Iterator
7
+
8
+ from rdkit import Chem, Geometry
9
+ from rdkit.Chem import Draw, AllChem, rdMMPA
10
+
11
+ from .mol import Mol, rd_descriptor, rd_descriptor_f
12
+ from .mollibr import MolLibr
13
+
14
+
15
class MatchedSeries:
    """Matched molecular series built from single-cut fragmentation of a library."""

    def __init__(self,
                 mollibr:MolLibr,
                 sort_props:Union[List,str,None]=None,
                 core_min:int=5, core_max:int=30, size_min:int=3) -> None :
        """Initialize.

        Documented here: [MMS with rdkit](https://iwatobipen.wordpress.com/2016/02/01/create-matched-molecular-series-with-rdkit/),
        [Mishima-syk](https://github.com/Mishima-syk/py4chemoinformatics/blob/master/ch07_graph.asciidoc),
        and [rdkit docs](http://rdkit.org/docs/source/rdkit.Chem.rdMMPA.html).

        Examples:
            >>> import rdworks
            >>> libr = rdworks.read_smi('test.smi')
            >>> series = rdworks.MatchedSeries(libr)

        Args:
            mollibr (MolLibr): a library of molecules.
            sort_props (Union[List,str,None], optional): how to sort molecules within a series.
                Defaults to None, in which case 'HAC' is used.
            core_min (int, optional): min number of atoms for a core. Defaults to 5.
            core_max (int, optional): max number of atoms for a core. Defaults to 30.
            size_min (int, optional): min number of molecules for a series. Defaults to 3.

        Raises:
            TypeError: if `mollibr` is not rdworks.MolLibr object.
        """
        if isinstance(mollibr, MolLibr):
            self.mollibr = copy.deepcopy(mollibr) # a copy of MolLibr
        else:
            raise TypeError('MatchedSeries() expects rdworks.MolLibr object')
        if isinstance(sort_props, list):
            self.sort_props = sort_props
        elif isinstance(sort_props, str):
            self.sort_props = [ sort_props ]
        else:
            self.sort_props = [ 'HAC' ]
        self.core_min = core_min
        self.core_max = core_max
        self.size_min = size_min # minimum number of R-groups in a series
        # for consistent drawing
        self.template_pattern = None
        # template() and depict() use `template_coords2D`
        self.template_coords2D = None
        self.template_coord2D = None # legacy (misspelled) name kept for compatibility
        self._series_iter = None # created lazily by __next__
        self.series = self.libr_to_series()


    def __str__(self) -> str:
        """Returns a string representation of object.

        Returns:
            str: string representation.
        """
        return f"<rdworks.MatchedSeries({self.count()})>"


    def __iter__(self) -> Iterator:
        """Returns a fresh iterator over the series.

        Returns:
            Iterator: iterator of (scaffold_SMILES, [r-group tuples]).
        """
        return iter(self.series)


    def __next__(self) -> Tuple:
        """Next series.

        The cursor is independent of iterators returned by `__iter__`.

        Raises:
            StopIteration: when all series have been returned.

        Returns:
            Tuple: (scaffold_SMILES, [(r-group_SMILES, rdworks.Mol, *sort_props_values)
        """
        # self.series is a list and a list is not an iterator,
        # so next() has to go through an iterator created on first use.
        if self._series_iter is None:
            self._series_iter = iter(self.series)
        return next(self._series_iter)


    def __getitem__(self, index:Union[int,slice]) -> Tuple:
        """Operator `[]`.

        Args:
            index (Union[int,slice]): index or indexes.

        Raises:
            ValueError: if series is empty or index is out of range.

        Returns:
            Tuple: (scaffold_SMILES, [(r-group_SMILES, rdworks.Mol, *sort_props_values)
        """
        if self.count() == 0:
            raise ValueError(f"MatchedSeries is empty")
        try:
            return self.series[index]
        except (IndexError, TypeError) as err:
            raise ValueError(f"index should be 0..{self.count()-1}") from err


    def count(self) -> int:
        """Returns the count of series.

        Returns:
            int: count of series.
        """
        return len(self.series)


    def libr_to_series(self) -> List[Tuple]:
        """Returns a list of molecular series.

        Each molecule is cut once. The larger fragment is taken as the
        scaffold if its heavy atom count is within [core_min, core_max],
        and the smaller fragment as the R-group. Scaffolds shared by fewer
        than `size_min` entries are dropped. Series are ordered from the
        largest to the smallest and entries within a series are ordered by
        `sort_props` (entries without a value are placed last).

        Raises:
            RuntimeError: if a molecular cut cannot be defined.

        Returns:
            List[Tuple]:
                [
                (scaffold_SMILES, [(r-group_SMILES, rdworks.Mol, *sort_props_values), ...,]),
                ...,
                ]
        """
        series = defaultdict(list)
        for mol in self.mollibr:
            # make a single cut
            list_of_frag = rdMMPA.FragmentMol(mol.rdmol, maxCuts=1, resultsAsMols=False)
            # note: default parameters: maxCuts=3, maxCutBonds=20, resultsAsMols=True
            for _, cut in list_of_frag:
                try:
                    frag_smiles_1, frag_smiles_2 = cut.split('.')
                except (ValueError, AttributeError) as err:
                    raise RuntimeError(f'{mol.name} fragment_tuple= {cut}') from err
                n1 = Chem.MolFromSmiles(frag_smiles_1).GetNumHeavyAtoms()
                n2 = Chem.MolFromSmiles(frag_smiles_2).GetNumHeavyAtoms()
                # split scaffold core and rgroup symmetrically
                if self.core_min <= n1 <= self.core_max and n1 > n2:
                    # frag_1 is the scaffold and frag_2 is the rgroup
                    series[frag_smiles_1].append((frag_smiles_2, mol))
                if self.core_min <= n2 <= self.core_max and n2 > n1:
                    # frag_2 is the scaffold and frag_1 is the rgroup
                    series[frag_smiles_2].append((frag_smiles_1, mol))
        # convert dict to list and remove size < self.size_min
        series = [(k,v) for k,v in series.items() if len(v) >= self.size_min]
        # sort by size (from the largest to the smallest)
        series = sorted(series, key=lambda x: len(x[1]), reverse=True)

        num_keys = len(self.sort_props)

        def sort_key(entry:Tuple) -> Tuple:
            # (is_missing, value) pairs: identical order to the plain values
            # when none is missing, and None never gets compared with a number
            return tuple((v is None, v) for v in entry[2:2+num_keys])

        # sort by self.sort_props
        series_r_group_sorted = []
        for (scaffold_smi, r_group_) in series:
            r_group = []
            for (r_smi, mol) in r_group_:
                values = []
                for p in self.sort_props:
                    try:
                        v = mol.props[p]
                    except KeyError:
                        if p in rd_descriptor_f:
                            v = rd_descriptor_f[p](mol.rdmol) # calc. on the fly
                            mol.props.update({p:v})
                        else:
                            v = None
                    values.append(v)
                r_group.append((r_smi, mol, *values)) # unpack values i.e. a=[2,3] b=(1,*a) == (1,2,3)
            r_group = sorted(r_group, key=sort_key)
            series_r_group_sorted.append((scaffold_smi, r_group))
        return series_r_group_sorted


    def template(self, SMARTS:str, rdmol:Chem.Mol) -> None:
        """Sets drawing layout template.

        The x, y coordinates of the atoms of `rdmol` matching `SMARTS` are
        stored and later used by `depict()`. `rdmol` is expected to have a
        conformer.

        Args:
            SMARTS (str): SMARTS for template pattern.
            rdmol (Chem.Mol): template molecule.
        """

        self.template_pattern = Chem.MolFromSmarts(SMARTS)
        matched = rdmol.GetSubstructMatch(self.template_pattern)
        conformer = rdmol.GetConformer()
        coords = [conformer.GetAtomPosition(x) for x in matched]
        self.template_coords2D = [Geometry.Point2D(pt.x, pt.y) for pt in coords]


    def depict(self, smiles:str) -> Chem.Mol:
        """Draws a molecule according to self.template in a consistent way.

        This is best effort: if no template has been set, if the molecule
        does not match the template, or if the coordinate generation fails,
        the molecule is returned as parsed.

        Args:
            smiles (str): input molecule.

        Returns:
            Chem.Mol: 2D coordinated Chem.Mol for depiction
                (None if `smiles` cannot be parsed).
        """
        rdmol_2d = Chem.MolFromSmiles(smiles)
        if rdmol_2d is None or self.template_pattern is None or self.template_coords2D is None:
            return rdmol_2d
        try:
            matched = rdmol_2d.GetSubstructMatch(self.template_pattern)
            if len(matched) < len(self.template_coords2D):
                return rdmol_2d # template does not match this molecule
            coordDict = dict(zip(matched, self.template_coords2D))
            AllChem.Compute2DCoords(rdmol_2d, coordMap=coordDict)
        except Exception:
            # depiction must not fail because of a layout problem
            pass
        return rdmol_2d


    @staticmethod
    def _save_grid_image(img, path:pathlib.Path) -> None:
        """Writes what Draw.MolsToGridImage returned (SVG text or an image object)."""
        if isinstance(img, str):
            path.write_text(img)
        elif hasattr(img, "save"):
            img.save(path) # image object with its own writer, e.g. PIL
        else:
            # presumably a notebook display object carrying `.data` -- verify
            data = getattr(img, "data", img)
            if isinstance(data, str):
                path.write_text(data)
            else:
                path.write_bytes(data)


    def report(self,
               workdir:os.PathLike=pathlib.Path("."),
               prefix:str="mmseries",
               mols_per_row:int=8,
               width:int=200,
               height:int=200,
               max_mols:int=200,
               use_svg:bool=True) -> None:
        """Plots individual series and an overview of series.

        Writes `{prefix}-{idx:03d}-count-{num:03d}.svg` for each series and
        `{prefix}-overview.svg` into `workdir` (`.png` if `use_svg` is False).

        Args:
            workdir (os.PathLike, optional): working directory. Defaults to pathlib.Path(".").
            prefix (str, optional): prefix of output files. Defaults to "mmseries".
            mols_per_row (int, optional): number of molecules per row. Defaults to 8.
            width (int, optional): width. Defaults to 200.
            height (int, optional): height. Defaults to 200.
            max_mols (int, optional): max number of molecules drawn in one image. Defaults to 200.
            use_svg (bool, optional): whether to use SVG format. Defaults to True.
        """
        workdir = pathlib.Path(workdir) # also accept str
        ext = "svg" if use_svg else "png"

        def draw(mols:list, legends:list, path:pathlib.Path) -> None:
            # max_mols is applied by slicing so it does not depend on
            # keyword support of the drawing backend
            img = Draw.MolsToGridImage(mols[:max_mols],
                                       molsPerRow=mols_per_row,
                                       subImgSize=(width, height),
                                       legends=legends[:max_mols],
                                       useSVG=use_svg)
            self._save_grid_image(img, path)

        scaffold_mols = []
        scaffold_legends = []
        for idx, (scaffold_smiles, list_tuples_r_groups) in enumerate(self.series, start=1):
            num = len(list_tuples_r_groups)
            scaffold_mols.append(Chem.MolFromSmiles(scaffold_smiles))
            scaffold_legends.append(f'Series #{idx} (n={num})')
            r_group_mols = []
            r_group_legends = []
            for (r_group_smiles, m, *values) in list_tuples_r_groups:
                # (r-group_SMILES, rdworks.Mol, *sort_props_values)
                values = list(map(str, values))
                r_group_mols.append(Chem.MolFromSmiles(r_group_smiles))
                r_group_legends.append(f'{m.name}\n{",".join(values)}')

            # plot individual series (scaffold first, then its R-groups)
            draw(scaffold_mols[-1:] + r_group_mols,
                 scaffold_legends[-1:] + r_group_legends,
                 workdir / f"{prefix}-{idx:03d}-count-{num:03d}.{ext}")

        # plot overview
        draw(scaffold_mols, scaffold_legends, workdir / f"{prefix}-overview.{ext}")