rdworks 0.25.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdworks/__init__.py +35 -0
- rdworks/autograph/__init__.py +4 -0
- rdworks/autograph/autograph.py +184 -0
- rdworks/autograph/centroid.py +90 -0
- rdworks/autograph/dynamictreecut.py +135 -0
- rdworks/autograph/nmrclust.py +123 -0
- rdworks/autograph/rckmeans.py +74 -0
- rdworks/bitqt/__init__.py +1 -0
- rdworks/bitqt/bitqt.py +355 -0
- rdworks/conf.py +374 -0
- rdworks/descriptor.py +36 -0
- rdworks/display.py +206 -0
- rdworks/ionized.py +170 -0
- rdworks/matchedseries.py +260 -0
- rdworks/mol.py +1522 -0
- rdworks/mollibr.py +887 -0
- rdworks/pka.py +38 -0
- rdworks/predefined/Asinex_fragment.xml +20 -0
- rdworks/predefined/Astex_RO3.xml +16 -0
- rdworks/predefined/Baell2010_PAINS/Baell2010A.xml +52 -0
- rdworks/predefined/Baell2010_PAINS/Baell2010B.xml +169 -0
- rdworks/predefined/Baell2010_PAINS/Baell2010C.xml +1231 -0
- rdworks/predefined/Baell2010_PAINS/PAINS-less-than-015-hits.xml +2048 -0
- rdworks/predefined/Baell2010_PAINS/PAINS-less-than-150-hits.xml +278 -0
- rdworks/predefined/Baell2010_PAINS/PAINS-more-than-150-hits.xml +83 -0
- rdworks/predefined/Baell2010_PAINS/makexml.py +70 -0
- rdworks/predefined/Brenk2008_Dundee/makexml.py +21 -0
- rdworks/predefined/CNS.xml +18 -0
- rdworks/predefined/ChEMBL_Walters/BMS.xml +543 -0
- rdworks/predefined/ChEMBL_Walters/Dundee.xml +318 -0
- rdworks/predefined/ChEMBL_Walters/Glaxo.xml +168 -0
- rdworks/predefined/ChEMBL_Walters/Inpharmatica.xml +276 -0
- rdworks/predefined/ChEMBL_Walters/LINT.xml +174 -0
- rdworks/predefined/ChEMBL_Walters/MLSMR.xml +351 -0
- rdworks/predefined/ChEMBL_Walters/PAINS.xml +1446 -0
- rdworks/predefined/ChEMBL_Walters/SureChEMBL.xml +501 -0
- rdworks/predefined/ChEMBL_Walters/makexml.py +40 -0
- rdworks/predefined/Hann1999_Glaxo/Hann1999.xml +168 -0
- rdworks/predefined/Hann1999_Glaxo/Hann1999Acid.xml +102 -0
- rdworks/predefined/Hann1999_Glaxo/Hann1999Base.xml +6 -0
- rdworks/predefined/Hann1999_Glaxo/Hann1999ElPh.xml +6 -0
- rdworks/predefined/Hann1999_Glaxo/Hann1999NuPh.xml +6 -0
- rdworks/predefined/Hann1999_Glaxo/makexml.py +83 -0
- rdworks/predefined/Kazius2005/Kazius2005.xml +114 -0
- rdworks/predefined/Kazius2005/makexml.py +66 -0
- rdworks/predefined/ZINC_druglike.xml +24 -0
- rdworks/predefined/ZINC_fragment.xml +14 -0
- rdworks/predefined/ZINC_leadlike.xml +15 -0
- rdworks/predefined/fragment.xml +7 -0
- rdworks/predefined/ionized/simple_smarts_pattern.csv +57 -0
- rdworks/predefined/ionized/smarts_pattern.csv +107 -0
- rdworks/predefined/misc/makexml.py +119 -0
- rdworks/predefined/misc/reactive-part-2.xml +104 -0
- rdworks/predefined/misc/reactive-part-3.xml +74 -0
- rdworks/predefined/misc/reactive.xml +321 -0
- rdworks/readin.py +312 -0
- rdworks/rgroup.py +2173 -0
- rdworks/scaffold.py +520 -0
- rdworks/std.py +143 -0
- rdworks/stereoisomers.py +127 -0
- rdworks/tautomers.py +20 -0
- rdworks/units.py +63 -0
- rdworks/utils.py +495 -0
- rdworks/xml.py +260 -0
- rdworks-0.25.7.dist-info/METADATA +37 -0
- rdworks-0.25.7.dist-info/RECORD +69 -0
- rdworks-0.25.7.dist-info/WHEEL +5 -0
- rdworks-0.25.7.dist-info/licenses/LICENSE +21 -0
- rdworks-0.25.7.dist-info/top_level.txt +1 -0
rdworks/mollibr.py
ADDED
@@ -0,0 +1,887 @@
|
|
1
|
+
import copy
|
2
|
+
import itertools
|
3
|
+
import pandas as pd
|
4
|
+
import gzip
|
5
|
+
|
6
|
+
from pathlib import Path
|
7
|
+
from typing import Optional, Union, Self, Iterator
|
8
|
+
from collections import defaultdict
|
9
|
+
from concurrent.futures import ProcessPoolExecutor
|
10
|
+
from tqdm import tqdm
|
11
|
+
|
12
|
+
from rdkit import Chem, DataStructs
|
13
|
+
from rdkit.Chem import Draw
|
14
|
+
from rdkit.ML.Cluster import Butina
|
15
|
+
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
|
16
|
+
|
17
|
+
from rdworks.conf import Conf
|
18
|
+
from rdworks.mol import Mol
|
19
|
+
|
20
|
+
from rdworks.xml import list_predefined_xml
|
21
|
+
from rdworks.utils import precheck_path, guess_mol_id
|
22
|
+
|
23
|
+
|
24
|
+
class MolLibr:
|
25
|
+
def __init__(self,
|
26
|
+
molecules: list | tuple | set | None = None,
|
27
|
+
names: list | tuple | set | None = None,
|
28
|
+
std:bool=False,
|
29
|
+
max_workers:int=4,
|
30
|
+
chunksize:int=100,
|
31
|
+
progress:bool=False) -> None:
|
32
|
+
"""Create a rdworks.MolLibr object.
|
33
|
+
|
34
|
+
Args:
|
35
|
+
molecules (Optional[Union[list,tuple,set]], optional): a list/tuple/set of molecules
|
36
|
+
(rdworks.Mol | SMILES | rdkit.Chem.Mol). Defaults to None.
|
37
|
+
names (Optional[Union[list,tuple,set]], optional): a list/tuple/set of names.
|
38
|
+
Defaults to None.
|
39
|
+
std (bool, optional): whether to standardize molecules. Defaults to False.
|
40
|
+
max_workers (int, optional): max workers for parallel calculation. Defaults to 4.
|
41
|
+
chunksize (int, optional): chunksize for parallel calculation. Defaults to 100.
|
42
|
+
progress (bool, optional): whether to show progress bar. Defaults to False.
|
43
|
+
|
44
|
+
Raises:
|
45
|
+
ValueError: if counts of molecules and names differ.
|
46
|
+
TypeError: if molecule is not rdworks.Mol | SMILES | rdkit.Chem.Mol )
|
47
|
+
"""
|
48
|
+
self.libr = []
|
49
|
+
self.max_workers = max_workers
|
50
|
+
self.chunksize = chunksize
|
51
|
+
self.progress = progress
|
52
|
+
self.query = None
|
53
|
+
self.threshold = None
|
54
|
+
self.clusters = None
|
55
|
+
|
56
|
+
if molecules and isinstance(molecules, (list, tuple, set)):
|
57
|
+
if names and isinstance(names, (list, tuple, set)):
|
58
|
+
if len(names) != len(molecules):
|
59
|
+
raise ValueError('MolLibr() counts of molecules and names are different')
|
60
|
+
if isinstance(molecules[0], Mol):
|
61
|
+
self.libr = molecules
|
62
|
+
elif isinstance(molecules[0], Conf):
|
63
|
+
self.libr = [Mol(conf.rdmol, name=conf.name).props.update(conf.props) for conf in molecules]
|
64
|
+
elif isinstance(molecules[0], str): # SMILES string
|
65
|
+
if names:
|
66
|
+
self.libr = [Mol(smi, name=name, std=std) for (smi, name) in zip(molecules, names)]
|
67
|
+
else:
|
68
|
+
self.libr = [Mol(smi, std=std) for smi in molecules]
|
69
|
+
self.rename(prefix='entry') # default name
|
70
|
+
elif isinstance(molecules[0], Chem.Mol):
|
71
|
+
if names:
|
72
|
+
self.libr = [Mol(rdmol, name=name, std=std) for (rdmol, name) in zip(molecules, names)]
|
73
|
+
else:
|
74
|
+
self.libr = [Mol(rdmol, std=std) for rdmol in molecules]
|
75
|
+
self.rename(prefix='entry') # default name
|
76
|
+
else:
|
77
|
+
raise TypeError('MolLibr() takes a list|tuple|set of Mol|SMILES|Chem.Mol')
|
78
|
+
|
79
|
+
def copy(self) -> Self:
|
80
|
+
"""Returns a copy of self.
|
81
|
+
|
82
|
+
Returns:
|
83
|
+
Self: rdworks.MolLibr object.
|
84
|
+
"""
|
85
|
+
return copy.deepcopy(self)
|
86
|
+
|
87
|
+
|
88
|
+
def __str__(self) -> str:
|
89
|
+
"""Returns string representation.
|
90
|
+
|
91
|
+
Returns:
|
92
|
+
str: string representation.
|
93
|
+
"""
|
94
|
+
|
95
|
+
return f"<MolLibr({self.count()})>"
|
96
|
+
|
97
|
+
|
98
|
+
def __iter__(self) -> Iterator:
|
99
|
+
"""Yields an iterator of molecules.
|
100
|
+
|
101
|
+
Yields:
|
102
|
+
Iterator: iterator of molecules.
|
103
|
+
"""
|
104
|
+
return iter(self.libr)
|
105
|
+
|
106
|
+
|
107
|
+
def __next__(self) -> Mol:
|
108
|
+
"""Next molecule.
|
109
|
+
|
110
|
+
Returns:
|
111
|
+
Mol: next molecule (rdworks.Mol) object.
|
112
|
+
"""
|
113
|
+
return next(self.libr)
|
114
|
+
|
115
|
+
|
116
|
+
def __eq__(self, other:Self) -> bool:
|
117
|
+
"""Operator `==`.
|
118
|
+
|
119
|
+
Args:
|
120
|
+
other (rdworks.MolLibr): other rdworks.MolLibr object.
|
121
|
+
|
122
|
+
Returns:
|
123
|
+
bool: True if other rdworks.MolLibr object is identical with self.
|
124
|
+
"""
|
125
|
+
if isinstance(other, MolLibr):
|
126
|
+
return len(frozenset(self.libr) - frozenset(other.libr)) == 0
|
127
|
+
else:
|
128
|
+
return False
|
129
|
+
|
130
|
+
|
131
|
+
def __getitem__(self, index: int | slice) -> Mol:
|
132
|
+
"""Operator `[]`.
|
133
|
+
|
134
|
+
Args:
|
135
|
+
index (Union[int, slice]): index or slice of indexes.
|
136
|
+
|
137
|
+
Raises:
|
138
|
+
ValueError: if library is empty or index is out of range.
|
139
|
+
|
140
|
+
Returns:
|
141
|
+
Mol: rdworks.Mol object
|
142
|
+
"""
|
143
|
+
if self.count() == 0:
|
144
|
+
raise ValueError(f"library is empty")
|
145
|
+
try:
|
146
|
+
return self.libr[index]
|
147
|
+
except:
|
148
|
+
raise ValueError(f"index should be 0..{self.count()-1}")
|
149
|
+
|
150
|
+
|
151
|
+
def __add__(self, other:object) -> Self:
    """Operator `+`. Builds an extended library without touching self.

    The result is a deep copy of self to which `other` (a single molecule)
    or the molecules of `other` (a library) are appended as they are.

    Args:
        other (object): other rdworks.Mol or rdworks.MolLibr object.

    Raises:
        TypeError: if `other` is not rdworks.Mol or rdworks.MolLibr.

    Returns:
        Self: rdworks.MolLibr object.
    """
    if not isinstance(other, (Mol, MolLibr)):
        raise TypeError("'+' operator expects rdworks.Mol or rdworks.MolLibr object")
    extended = copy.deepcopy(self)
    if isinstance(other, Mol):
        extended.libr.append(other)
    else:
        extended.libr.extend(other.libr)
    return extended
|
173
|
+
|
174
|
+
|
175
|
+
def __iadd__(self, other: Mol | Self) -> Self:
    """Operator `+=`. Appends a molecule or another library to self in place.

    Args:
        other (object): other rdworks.Mol or rdworks.MolLibr object.

    Raises:
        TypeError: if `other` is not rdworks.Mol or rdworks.MolLibr.

    Returns:
        Self: rdworks.MolLibr object.
    """
    if not isinstance(other, (Mol, MolLibr)):
        raise TypeError("'+=' operator expects Mol or MolLibr object")
    if isinstance(other, Mol):
        self.libr.append(other)
    else:
        self.libr.extend(other.libr)
    return self
|
194
|
+
|
195
|
+
|
196
|
+
def __sub__(self, other: Mol | Self) -> Self:
    """Operator `-`. Returns a copy of subtractive subset.

    The remaining molecules are de-duplicated and keep the order in which
    they first appear in self (a plain set difference would return them in
    arbitrary order). The remaining molecules are the objects held by self,
    not copies of them.

    Args:
        other (Union[Mol,Self]): other rdworks.Mol or rdworks.MolLibr object.

    Raises:
        TypeError: if `other` is not rdworks.Mol or rdworks.MolLibr.

    Returns:
        Self: a copy of subtractive subset.
    """
    if isinstance(other, Mol):
        excluded = frozenset([other])
    elif isinstance(other, MolLibr):
        excluded = frozenset(other.libr)
    else:
        raise TypeError("'-' operator expects rdworks.Mol or rdworks.MolLibr object")
    obj = copy.deepcopy(self)
    # dict.fromkeys removes duplicates while keeping first-seen order
    obj.libr = list(dict.fromkeys(mol for mol in self.libr if mol not in excluded))
    return obj
|
217
|
+
|
218
|
+
|
219
|
+
def __isub__(self, other: Mol | Self) -> Self:
    """Operator `-=`. Updates self by subtracting other molecule or library.

    The remaining molecules are de-duplicated and keep the order in which
    they first appear in self (a plain set difference would return them in
    arbitrary order).

    Args:
        other (Union[Mol,Self]): other molecule or library.

    Raises:
        TypeError: if `other` is not rdworks.Mol or rdworks.MolLibr.

    Returns:
        Self: rdworks.MolLibr object.
    """
    if isinstance(other, Mol):
        excluded = frozenset([other])
    elif isinstance(other, MolLibr):
        excluded = frozenset(other.libr)
    else:
        raise TypeError("'-=' operator expects rdworks.Mol or rdworks.MolLibr object")
    # dict.fromkeys removes duplicates while keeping first-seen order
    self.libr = list(dict.fromkeys(mol for mol in self.libr if mol not in excluded))
    return self
|
239
|
+
|
240
|
+
|
241
|
+
def __and__(self, other: Mol | Self) -> Self:
    """Operator `&`. Returns a copy of common subset.

    The common molecules are de-duplicated and keep the order in which
    they first appear in self (a plain set intersection would return them
    in arbitrary order). The common molecules are the objects held by self,
    not copies of them.

    Args:
        other (Union[Mol,Self]): other molecule or library.

    Raises:
        TypeError: if `other` is not rdworks.Mol or rdworks.MolLibr.

    Returns:
        Self: a copy of rdworks.MolLibr object.
    """
    if isinstance(other, Mol):
        wanted = frozenset([other])
    elif isinstance(other, MolLibr):
        wanted = frozenset(other.libr)
    else:
        raise TypeError("'&' operator or overlap() expects rdworks.Mol or rdworks.MolLibr object")
    obj = copy.deepcopy(self)
    # dict.fromkeys removes duplicates while keeping first-seen order
    obj.libr = list(dict.fromkeys(mol for mol in self.libr if mol in wanted))
    return obj
|
262
|
+
|
263
|
+
|
264
|
+
def __iand__(self, other: Mol | Self) -> Self:
    """Operator `&=`. Re-assigns self with common subset.

    The common molecules are de-duplicated and keep the order in which
    they first appear in self (a plain set intersection would return them
    in arbitrary order).

    Args:
        other (Union[Mol,Self]): other molecule or library.

    Raises:
        TypeError: if `other` is not rdworks.Mol or rdworks.MolLibr.

    Returns:
        Self: rdworks.MolLibr object.
    """
    if isinstance(other, Mol):
        wanted = frozenset([other])
    elif isinstance(other, MolLibr):
        wanted = frozenset(other.libr)
    else:
        raise TypeError("'&=' operator expects rdworks.Mol or rdworks.MolLibr object")
    # dict.fromkeys removes duplicates while keeping first-seen order
    self.libr = list(dict.fromkeys(mol for mol in self.libr if mol in wanted))
    return self
|
284
|
+
|
285
|
+
|
286
|
+
@staticmethod
|
287
|
+
def _mask_similar(mol:Mol, targs:tuple) -> bool:
|
288
|
+
"""A mask function to return True if molecule is similar with target molecules, `targs`.
|
289
|
+
|
290
|
+
Args:
|
291
|
+
mol (Mol): subject rdworks.Mol object.
|
292
|
+
targs (tuple): a tuple of rdworks.Mol objects to compare.
|
293
|
+
|
294
|
+
Returns:
|
295
|
+
bool: True if molecule is similar with target molecules.
|
296
|
+
"""
|
297
|
+
return mol.is_similar(*targs) # unpack tuple of arguments
|
298
|
+
|
299
|
+
|
300
|
+
@staticmethod
|
301
|
+
def _mask_drop(mol:Mol, terms:str | Path) -> bool:
|
302
|
+
"""A mask function to return True if molecule matches `terms`.
|
303
|
+
|
304
|
+
Note that molecules matching the terms will be dropped (NOT be included) in the compression.
|
305
|
+
|
306
|
+
Args:
|
307
|
+
mol (Mol): subject rdworks.Mol object.
|
308
|
+
terms (str | Path): rule.
|
309
|
+
|
310
|
+
Returns:
|
311
|
+
bool: True if molecule matches the terms.
|
312
|
+
"""
|
313
|
+
return not mol.is_matching(terms)
|
314
|
+
|
315
|
+
@staticmethod
|
316
|
+
def _map_qed(mol:Mol, properties:list[str]=['QED', 'MolWt', 'LogP', 'TPSA', 'HBD']) -> dict:
|
317
|
+
"""A map function to apply Mol.qed(`properties`) on `mol`.
|
318
|
+
|
319
|
+
The default behavior of map() is to pass the elements of the iterable to the function by reference.
|
320
|
+
This means that if the function modifies the elements of the iterable,
|
321
|
+
those changes will be reflected in the iterable itself.
|
322
|
+
|
323
|
+
Args:
|
324
|
+
mol (Mol): subject rdworks.Mol object.
|
325
|
+
properties (list[str], optional): properties. Defaults to ['QED', 'MolWt', 'LogP', 'TPSA', 'HBD'].
|
326
|
+
|
327
|
+
Returns:
|
328
|
+
dict: dictionary of properties.
|
329
|
+
"""
|
330
|
+
return mol.qed(properties)
|
331
|
+
|
332
|
+
|
333
|
+
def compute(self, **kwargs) -> Self:
|
334
|
+
"""Change settings for parallel computing.
|
335
|
+
|
336
|
+
Args:
|
337
|
+
max_workers (Optional[int], optional): max number of workers. Defaults to None.
|
338
|
+
chunksize (Optional[int], optional): chunksize of splitted workload. Defaults to None.
|
339
|
+
progress (Optional[bool], optional): whether to show progress bar. Defaults to None.
|
340
|
+
|
341
|
+
Returns:
|
342
|
+
Self: rdworks.MolLibr object.
|
343
|
+
"""
|
344
|
+
self.max_workers = kwargs.get('max_workers', self.max_workers)
|
345
|
+
self.chunksize = kwargs.get('chunksize', self.chunksize)
|
346
|
+
self.progress = kwargs.get('progress', self.progress)
|
347
|
+
return self
|
348
|
+
|
349
|
+
|
350
|
+
def rename(self, prefix:Optional[str]=None, sep:str='.', start:int=1) -> Self:
|
351
|
+
"""Rename molecules with serial numbers in-place and their conformers.
|
352
|
+
|
353
|
+
Molecules will be named by a format, `{prefix}{sep}{serial_number}` and
|
354
|
+
conformers will be named accordingly.
|
355
|
+
|
356
|
+
Examples:
|
357
|
+
>>> a.rename(prefix='a')
|
358
|
+
|
359
|
+
Args:
|
360
|
+
prefix (str, optional): prefix for new name. If prefix is not given and set to None,
|
361
|
+
molecules will not renamed but conformers will be still renamed.
|
362
|
+
This is useful after dropping some conformers and rename them serially.
|
363
|
+
sep (str): separator between prefix and serial number (default: `.`)
|
364
|
+
start (int): start number of serial number.
|
365
|
+
|
366
|
+
Returns:
|
367
|
+
Self: rdworks.MolLibr object.
|
368
|
+
"""
|
369
|
+
|
370
|
+
num = self.count()
|
371
|
+
num_digits = len(str(num)) # ex. '100' -> 3
|
372
|
+
if prefix:
|
373
|
+
# use prefix to rename molecules AND conformers
|
374
|
+
for (serial, mol) in enumerate(self.libr, start=start):
|
375
|
+
if num > 1:
|
376
|
+
serial_str = str(serial)
|
377
|
+
while len(serial_str) < num_digits:
|
378
|
+
serial_str = '0' + serial_str
|
379
|
+
mol.rename(prefix=f"{prefix}{sep}{serial_str}")
|
380
|
+
else:
|
381
|
+
mol.rename(prefix)
|
382
|
+
else:
|
383
|
+
# rename molecules using serial numbers if they have duplicate names
|
384
|
+
# name -> name.1, name.2, ...
|
385
|
+
count_names = defaultdict(list)
|
386
|
+
for idx, mol in enumerate(self.libr):
|
387
|
+
count_names[mol.name].append(idx)
|
388
|
+
not_unique_names = [name for name, l in count_names.items() if len(l) > 1]
|
389
|
+
for idx, mol in enumerate(self.libr):
|
390
|
+
if mol.name in not_unique_names:
|
391
|
+
serial = count_names[mol.name].index(idx) + 1
|
392
|
+
mol.rename(f'{mol.name}.{serial}')
|
393
|
+
# rename conformers
|
394
|
+
for mol in self.libr:
|
395
|
+
mol.rename()
|
396
|
+
return self
|
397
|
+
|
398
|
+
|
399
|
+
def overlap(self, other:Self) -> Self:
    """Named form of the `&` operator.

    Args:
        other (Self): rdworks.MolLibr object.

    Returns:
        Self: common subset of rdworks.MolLibr, as returned by `__and__`.
    """
    common = self.__and__(other)
    return common
|
409
|
+
|
410
|
+
|
411
|
+
def similar(self, query:Mol, threshold:float=0.2, **kwargs) -> Self:
    """Returns a copy of subset that are similar to `query`.

    Each molecule is tested in a worker process with
    `Mol.is_similar(query, threshold)`.

    Args:
        query (Mol): query molecule.
        threshold (float, optional): similarity threshold. Defaults to 0.2.
        **kwargs: parallel computing settings (`max_workers`, `chunksize`,
            `progress`) applied to the returned copy only; see `compute()`.

    Raises:
        TypeError: if query is not rdworks.Mol type.

    Returns:
        Self: a copy of self.
    """
    # validate before paying for the deep copy
    if not isinstance(query, Mol):
        raise TypeError("MolLibr.similar() expects Mol object")
    obj = copy.deepcopy(self).compute(**kwargs)
    largs = [(query, threshold),] * obj.count()
    with ProcessPoolExecutor(max_workers=obj.max_workers) as executor:
        results = executor.map(MolLibr._mask_similar, obj.libr, largs, chunksize=obj.chunksize)
        # read the setting from obj so that a `progress=` keyword takes effect
        if obj.progress:
            results = tqdm(results, desc="Similar", total=obj.count())
        mask = list(results)
    obj.libr = list(itertools.compress(obj.libr, mask))
    return obj
|
438
|
+
|
439
|
+
|
440
|
+
|
441
|
+
def unique(self, report=False) -> Self:
|
442
|
+
"""Removes duplicates and returns a copy of unique library.
|
443
|
+
|
444
|
+
Args:
|
445
|
+
report (bool, optional): whether to report duplicates. Defaults to False.
|
446
|
+
|
447
|
+
Returns:
|
448
|
+
Self: a copy of self.
|
449
|
+
"""
|
450
|
+
obj = copy.deepcopy(self)
|
451
|
+
U = {} # unique SMILES
|
452
|
+
mask = []
|
453
|
+
for mol in obj.libr:
|
454
|
+
if mol.smiles in U:
|
455
|
+
mask.append(False)
|
456
|
+
# ignore the same name or recorded aka
|
457
|
+
if (mol.name != U[mol.smiles].name) and (mol.name not in U[mol.smiles].props['aka']):
|
458
|
+
U[mol.smiles].props['aka'].append(mol.name)
|
459
|
+
else:
|
460
|
+
mask.append(True)
|
461
|
+
U[mol.smiles] = mol
|
462
|
+
obj.libr = list(itertools.compress(obj.libr, mask))
|
463
|
+
if report:
|
464
|
+
print("duplicates:")
|
465
|
+
for mol in obj.libr:
|
466
|
+
if len(mol.props['aka']) > 0:
|
467
|
+
print(f" {mol.name}({len(mol.props['aka'])}) - {','.join(mol.props['aka'])}")
|
468
|
+
print(f"de-duplicated to {obj.count()} molecules")
|
469
|
+
return obj
|
470
|
+
|
471
|
+
|
472
|
+
def qed(self, properties:list[str]=['QED', 'MolWt', 'LogP', 'TPSA', 'HBD'], **kwargs) -> Self:
    """Calculate quantitative estimate of drug-likeness (QED) properties in parallel.

    This method works in place: the settings given in `kwargs` are applied to
    self through `compute()`, and `self.libr` is replaced by the list of
    results that `Mol.qed(properties)` returns for each molecule.

    Args:
        properties (list[str], optional): properties to calculate.
            Defaults to ['QED', 'MolWt', 'LogP', 'TPSA', 'HBD'].
        **kwargs: parallel computing settings; see `compute()`.

    Returns:
        Self: self.
    """
    self.compute(**kwargs)
    lprops = [ properties, ] * self.count()
    with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
        results = executor.map(MolLibr._map_qed, self.libr, lprops, chunksize=self.chunksize)
        if self.progress:
            results = tqdm(results, desc="QED Properties", total=self.count())
        self.libr = list(results)
    return self
|
495
|
+
|
496
|
+
|
497
|
+
def drop(self, terms:str | Path | None = None, invert:bool=False, **kwargs) -> Self:
    """Remove molecules matching `terms` and return the rest as a copy.

    Without `terms`, the output of `list_predefined_xml()` is printed and
    self itself (not a copy) is returned. Otherwise every molecule of a deep
    copy is tested in a worker process and only the molecules that do not
    match are kept; with `invert=True` only the matching molecules are kept.

    Args:
        terms (str | Path | None, optional): matching terms. Defaults to None.
        invert (bool, optional): whether to invert selection by the `terms`. Defaults to False.
        **kwargs: parallel computing settings applied to the copy; see `compute()`.

    Returns:
        Self: a copy of self.
    """
    if not terms:
        print(list_predefined_xml())
        return self
    obj = copy.deepcopy(self).compute(**kwargs)
    lterms = [ terms ] * obj.count()
    with ProcessPoolExecutor(max_workers=obj.max_workers) as executor:
        outcome = executor.map(MolLibr._mask_drop, obj.libr, lterms, chunksize=obj.chunksize)
        if obj.progress:
            outcome = tqdm(outcome, desc="Drop", total=obj.count())
        keep = list(outcome)
    if invert:
        keep = [not flag for flag in keep]
    obj.libr = [mol for mol, flag in zip(obj.libr, keep) if flag]
    return obj
|
525
|
+
|
526
|
+
|
527
|
+
def pick(self, n:int, **kwargs) -> Self:
|
528
|
+
"""Picks n diverse molecules.
|
529
|
+
|
530
|
+
Args:
|
531
|
+
n (int): number of molecules to pick.
|
532
|
+
|
533
|
+
Returns:
|
534
|
+
Self: a copy of self.
|
535
|
+
"""
|
536
|
+
obj = copy.deepcopy(self)
|
537
|
+
raise NotImplementedError
|
538
|
+
return obj
|
539
|
+
|
540
|
+
|
541
|
+
|
542
|
+
|
543
|
+
##################################################
|
544
|
+
### endpoints
|
545
|
+
##################################################
|
546
|
+
|
547
|
+
|
548
|
+
def count(self) -> int:
|
549
|
+
"""Returns number of molecules.
|
550
|
+
|
551
|
+
Returns:
|
552
|
+
int: count of molecules.
|
553
|
+
"""
|
554
|
+
return len(self.libr)
|
555
|
+
|
556
|
+
|
557
|
+
def cluster(self, threshold:float=0.3, ordered:bool=True, drop_singleton:bool=True) -> list:
    """Clusters molecules by fingerprint using Butina clustering.

    A missing fingerprint (`mol.fp`) is first generated with
    `mol.MFP2.GetFingerprint(mol.rdmol)` and stored on the molecule. The
    pairwise distances `1 - Tanimoto similarity` are then handed to
    `Butina.ClusterData` with `threshold` as its `distThresh`.
    NOTE(review): molecules whose fingerprint is falsy are left out of the
    distance matrix, so returned indices refer to the filtered list; confirm
    that this cannot happen in practice.

    Args:
        threshold (float, optional): distance threshold passed to Butina as
            `distThresh`. Defaults to 0.3.
        ordered (bool, optional): order clusters by size of cluster. Defaults to True.
        drop_singleton (bool, optional): exclude singletons. Defaults to True.

    Returns:
        list: [(centroid_1, idx, idx,), (centroid_2, idx, idx,), ...]
    """
    for entry in self.libr:
        if not entry.fp:
            entry.fp = entry.MFP2.GetFingerprint(entry.rdmol)
    fps = [ entry.fp for entry in self.libr if entry.fp ]
    n = len(fps)
    # condensed lower-triangle distance matrix, row by row
    dmat = []
    for i in range(1, n):
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        dmat.extend(1-s for s in sims)
    # clusters is a sequence of sequences of indices
    clusters = Butina.ClusterData(dmat,
                                  nPts=n,
                                  distThresh=threshold,
                                  isDistData=True,
                                  reordering=True)
    if ordered:
        # from the largest cluster to the smallest
        clusters = sorted(clusters, key=len, reverse=True)

    if drop_singleton:
        clusters = [members for members in clusters if len(members) > 1]

    return clusters
|
593
|
+
|
594
|
+
|
595
|
+
|
596
|
+
def to_sdf(self,
|
597
|
+
path:str | Path,
|
598
|
+
confs:bool=False,
|
599
|
+
props:bool=True,
|
600
|
+
separate:bool=False) -> None:
|
601
|
+
"""Writes to .sdf or .sdf.gz file.
|
602
|
+
|
603
|
+
Chem.SDWriter is supposed to write all non-private molecular properties.
|
604
|
+
|
605
|
+
`dirname/filename.sdf` -> `dirname/filename_{molecule name}.sdf`
|
606
|
+
`dirname/filename.sdf.gz` -> `dirname/filename_{molecule name}.sdf.gz`
|
607
|
+
|
608
|
+
Args:
|
609
|
+
path (str or PosixPath) : output filename or path
|
610
|
+
confs (bool) : whether to write 3D coordinates and conformer properties. Defaults to False.
|
611
|
+
props (bool) : whether to write SDF properties. Defaults to True.
|
612
|
+
separate (bool) : write each molecule to separate files. Defaults to False.
|
613
|
+
"""
|
614
|
+
if isinstance(path, str):
|
615
|
+
path = Path(path)
|
616
|
+
# PurePosixPath('my/dir/mol.sdf.gz').suffix -> '.gz'
|
617
|
+
# PurePosixPath('my/dir/mol.sdf.gz').suffixes -> ['.sdf', '.gz']
|
618
|
+
# PurePosixPath('my/dir/mol.sdf').name -> 'mol.sdf'
|
619
|
+
# PurePosixPath('my/dir/mol.sdf').with_name('mol2.sdf') -> PurePath('my/dir/mol2.sdf')
|
620
|
+
suffix = path.suffix
|
621
|
+
suffixes = ''.join(path.suffixes)
|
622
|
+
prefix = path.name.replace(suffixes, '')
|
623
|
+
if separate:
|
624
|
+
for mol in self.libr:
|
625
|
+
if suffix == '.gz':
|
626
|
+
with gzip.open(path.with_name(f'{prefix}_{mol.name}.sdf.gz'), "wt") as f:
|
627
|
+
f.write(mol.to_sdf(confs, props))
|
628
|
+
else:
|
629
|
+
with open(path.with_name(f'{prefix}_{mol.name}.sdf'), "w") as f:
|
630
|
+
f.write(mol.to_sdf(confs, props))
|
631
|
+
|
632
|
+
else:
|
633
|
+
if suffix == '.gz':
|
634
|
+
with gzip.open(path, "wt") as f:
|
635
|
+
for mol in self.libr:
|
636
|
+
f.write(mol.to_sdf(confs, props))
|
637
|
+
else:
|
638
|
+
with open(path, "w") as f:
|
639
|
+
for mol in self.libr:
|
640
|
+
f.write(mol.to_sdf(confs, props))
|
641
|
+
|
642
|
+
|
643
|
+
def to_smi(self, path:str | Path) -> None:
|
644
|
+
"""Writes to .smi file.
|
645
|
+
|
646
|
+
Args:
|
647
|
+
path (str | Path): output filename or path.
|
648
|
+
"""
|
649
|
+
if isinstance(path, Path):
|
650
|
+
path = path.as_posix() # convert to string
|
651
|
+
if path.endswith('.gz'):
|
652
|
+
with gzip.open(path, "wt") as smigz:
|
653
|
+
for mol in self.libr:
|
654
|
+
smigz.write(f'{mol.smiles} {mol.name}\n')
|
655
|
+
else:
|
656
|
+
with open(path, "w") as smi:
|
657
|
+
for mol in self.libr:
|
658
|
+
smi.write(f'{mol.smiles} {mol.name}\n')
|
659
|
+
|
660
|
+
|
661
|
+
def to_image(self, width:int=200, height:int=200, index:bool=False, mols_per_row:int=5) -> str:
    """Draws the library as an SVG grid, e.g. for a Jupyter notebook.

    Molecule names are used as legends. With `index=True` every atom gets
    an `atomNote` property holding its 1-based index; the property is set on
    the molecules of the library themselves and stays there afterwards.

    Args:
        width (int, optional): width of each molecule image. Defaults to 200.
        height (int, optional): height of each molecule image. Defaults to 200.
        index (bool, optional): whether to show atom index. Defaults to False.
        mols_per_row (int, optional): number of molecules per row. Defaults to 5.

    Returns:
        str: result of `Draw.MolsToGridImage(..., useSVG=True)`.
    """
    if index:
        for atom in (a for mol in self.libr for a in mol.rdmol.GetAtoms()):
            atom.SetProp("atomNote", str(atom.GetIdx()+1))
    rdmols = []
    legends = []
    for mol in self.libr:
        rdmols.append(mol.rdmol)
        legends.append(mol.name)
    per_row = min(mols_per_row, len(rdmols))
    return Draw.MolsToGridImage(rdmols,
                                legends=legends,
                                molsPerRow=per_row,
                                subImgSize=(width,height),
                                useSVG=True)
|
685
|
+
|
686
|
+
|
687
|
+
def to_png(self, path:str | Path, width:int=200, height:int=200, index:bool=False, mols_per_row:int=5) -> None:
    """Draws the library as a grid image and saves it to a .png file.

    Molecule names are used as legends. With `index=True` every atom gets
    an `atomNote` property holding its 1-based index; the property is set on
    the molecules of the library themselves and stays there afterwards.

    Args:
        path (str | Path): output filename or path.
        width (int, optional): width of each molecule image. Defaults to 200.
        height (int, optional): height of each molecule image. Defaults to 200.
        index (bool, optional): whether to show atom index. Defaults to False.
        mols_per_row (int, optional): number of molecules per row. Defaults to 5.
    """
    if isinstance(path, Path):
        path = path.as_posix() # convert to string
    if index:
        for atom in (a for mol in self.libr for a in mol.rdmol.GetAtoms()):
            atom.SetProp("atomNote", str(atom.GetIdx()+1))
    rdmols = []
    legends = []
    for mol in self.libr:
        rdmols.append(mol.rdmol)
        legends.append(mol.name)
    per_row = min(mols_per_row,len(rdmols))
    grid = Draw.MolsToGridImage(rdmols,
                                legends=legends,
                                molsPerRow=per_row,
                                subImgSize=(width,height),
                                useSVG=False)
    grid.save(path)
|
710
|
+
|
711
|
+
|
712
|
+
def to_html(self) -> str:
|
713
|
+
"""Writes to HTML strings.
|
714
|
+
|
715
|
+
Returns:
|
716
|
+
str: HTML strings.
|
717
|
+
"""
|
718
|
+
HTML = "<html><body>"
|
719
|
+
for mol in self.libr:
|
720
|
+
HTML += mol.to_html(htmlbody=False)
|
721
|
+
HTML += "</body></html>"
|
722
|
+
return HTML
|
723
|
+
|
724
|
+
|
725
|
+
def to_dataframe(self,
|
726
|
+
name:str='name',
|
727
|
+
smiles:str='smiles',
|
728
|
+
confs:bool=False) -> pd.DataFrame:
|
729
|
+
"""Returns a Pandas DataFrame.
|
730
|
+
|
731
|
+
Args:
|
732
|
+
name (str, optional): column name for name. Defaults to 'name'.
|
733
|
+
smiles (str, optional): column name for SMILES. Defaults to 'smiles'.
|
734
|
+
confs (bool, optional): whether to include conformer properties. Defaults to False.
|
735
|
+
|
736
|
+
Returns:
|
737
|
+
pd.DataFrame: pandas DataFrame.
|
738
|
+
"""
|
739
|
+
if confs:
|
740
|
+
exclude = ['coord']
|
741
|
+
property_columns = set()
|
742
|
+
for mol in self.libr:
|
743
|
+
for conf in mol.confs:
|
744
|
+
for k in conf.props:
|
745
|
+
if k not in exclude:
|
746
|
+
property_columns.add(k)
|
747
|
+
property_columns = property_columns - set([name, smiles])
|
748
|
+
data = {name:[], smiles:[]}
|
749
|
+
data.update({k:[] for k in property_columns})
|
750
|
+
for mol in self.libr:
|
751
|
+
for conf in mol.confs:
|
752
|
+
data[name].append(conf.name)
|
753
|
+
data[smiles].append(mol.smiles)
|
754
|
+
for k in property_columns:
|
755
|
+
if k in conf.props:
|
756
|
+
data[k].append(conf.props[k])
|
757
|
+
else:
|
758
|
+
data[k].append(None)
|
759
|
+
else:
|
760
|
+
property_columns = set()
|
761
|
+
for mol in self.libr:
|
762
|
+
for k in mol.props:
|
763
|
+
property_columns.add(k)
|
764
|
+
property_columns = property_columns - set([name, smiles])
|
765
|
+
data = {name:[], smiles:[]}
|
766
|
+
data.update({k:[] for k in property_columns})
|
767
|
+
for mol in self.libr:
|
768
|
+
data[name].append(mol.name)
|
769
|
+
data[smiles].append(mol.smiles)
|
770
|
+
for k in property_columns:
|
771
|
+
if k in mol.props:
|
772
|
+
data[k].append(mol.props[k])
|
773
|
+
else:
|
774
|
+
data[k].append(None)
|
775
|
+
return pd.DataFrame(data)
|
776
|
+
|
777
|
+
|
778
|
+
def to_csv(self,
|
779
|
+
path:str | Path,
|
780
|
+
confs:bool=False,
|
781
|
+
decimal_places:int=3) -> None:
|
782
|
+
"""Writes to a .csv file.
|
783
|
+
|
784
|
+
Args:
|
785
|
+
path (str | Path): output filename or path.
|
786
|
+
confs (bool, optional): whether to include conformer properties. Defaults to False.
|
787
|
+
decimal_places (int, optional): decimal places for float numbers. Defaults to 3.
|
788
|
+
"""
|
789
|
+
df = self.to_dataframe(confs=confs)
|
790
|
+
df.to_csv(path, index=False, float_format=f'%.{decimal_places}f')
|
791
|
+
|
792
|
+
|
793
|
+
@staticmethod
def _mask_nn_applicable(mol:Mol, model:str) -> bool:
    """Predicate telling whether a molecule can be handled by a NN model.

    Kept as a static method so it can be passed to an executor's `map()`.

    Args:
        mol (Mol): rdworks.Mol object to check.
        model (str): name of NN model.

    Returns:
        bool: result of `mol.is_nn_applicable(model)`.
    """
    applicable = mol.is_nn_applicable(model)
    return applicable
|
805
|
+
|
806
|
+
|
807
|
+
def nn_applicable(self, model:str, **kwargs) -> Self:
    """Returns a copy of subset of library that is applicable to given neural network `model`.

    Each molecule is checked with `_mask_nn_applicable()` in a process pool and
    only the molecules for which the check is true are kept in the returned copy.
    The original library is left untouched.

    Examples:
        >>> libr = rdworks.MolLibr(drug_smiles, drug_names)
        >>> ani2x_compatible_subset = libr.nn_applicable('ANI-2x', progress=False)

    Args:
        model (str): name of model.
        **kwargs: forwarded to `compute()` on the copied library
            (NOTE(review): presumably sets `progress`, `max_workers` and
            `chunksize`, which are read below - confirm against `compute()`).

    Returns:
        Self: subset of library.
    """
    obj = copy.deepcopy(self).compute(**kwargs)
    # executor.map() zips its iterables, so the model name is repeated once per molecule
    lmodel = [model] * obj.count()
    with ProcessPoolExecutor(max_workers=obj.max_workers) as executor:
        # Both the progress and the silent path must use the same worker
        # function; the progress path used to reference a non-existent
        # `mask_nn_applicable` attribute and failed with AttributeError.
        results = executor.map(self._mask_nn_applicable,
                               obj.libr,
                               lmodel,
                               chunksize=obj.chunksize)
        if obj.progress:
            results = tqdm(results, desc="NN applicable", total=obj.count())
        # consume the results while the executor is still open
        mask = list(results)
    obj.libr = list(itertools.compress(obj.libr, mask))
    return obj
|
833
|
+
|
834
|
+
|
835
|
+
def to_nnbatches(self, batchsize:int=1000) -> list:
    """Split workload flexibly into a number of batches.

    - Each batch has up to `batchsize` number of atoms, except that a single
      conformer with more than `batchsize` atoms forms a batch of its own.
    - Conformers originating from the same molecule can be split into multiple batches.
    - Or one batch can contain conformers originating from multiple molecules.
    - No empty batch is ever produced.

    Each batch is a tuple `(coord, numbers, charges, batch_confs, batch_mols)`:

        coord: per-conformer lists of (x, y, z) tuples, taken from
            `conf.rdmol.GetConformer().GetPositions()`.
        numbers: per-conformer lists of atomic numbers, taken from the
            conformer's own atoms (so hydrogens are included).
        charges: `mol.props['charge']` of the parent molecule of each conformer.
        batch_confs: conformer objects in the batch.
        batch_mols: parent molecule of each conformer (same length and order
            as `batch_confs`).

    Args:
        batchsize: max. number of atoms in a batch.

    Returns:
        list: list of batches.
    """
    pre_batches = []
    batch_mols = []
    batch_confs = []
    batch_n_atoms = 0

    for mol in self.libr:
        for conf in mol.confs:
            n_atoms = conf.props['atoms']
            # Close the running batch only if it holds something; otherwise an
            # oversized conformer arriving at an empty batch would emit an
            # empty batch ahead of itself.
            if batch_confs and (batch_n_atoms + n_atoms) > batchsize:
                pre_batches.append((batch_mols, batch_confs))
                batch_mols, batch_confs, batch_n_atoms = [], [], 0
            batch_mols.append(mol)
            batch_confs.append(conf)
            batch_n_atoms += n_atoms

    if batch_confs: # last remaining batch
        pre_batches.append((batch_mols, batch_confs))

    batches = []

    for batch_mols, batch_confs in pre_batches:
        charges = [mol.props['charge'] for mol in batch_mols]
        coord = [conf.rdmol.GetConformer().GetPositions().tolist() for conf in batch_confs]
        # to be consistent with legacy code
        coord = [[tuple(xyz) for xyz in inner] for inner in coord]
        # numbers should be got from conformers because of hydrogens
        numbers = [[a.GetAtomicNum() for a in conf.rdmol.GetAtoms()] for conf in batch_confs]
        batches.append((coord, numbers, charges, batch_confs, batch_mols))

    return batches
|