rdworks 0.25.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. rdworks/__init__.py +35 -0
  2. rdworks/autograph/__init__.py +4 -0
  3. rdworks/autograph/autograph.py +184 -0
  4. rdworks/autograph/centroid.py +90 -0
  5. rdworks/autograph/dynamictreecut.py +135 -0
  6. rdworks/autograph/nmrclust.py +123 -0
  7. rdworks/autograph/rckmeans.py +74 -0
  8. rdworks/bitqt/__init__.py +1 -0
  9. rdworks/bitqt/bitqt.py +355 -0
  10. rdworks/conf.py +374 -0
  11. rdworks/descriptor.py +36 -0
  12. rdworks/display.py +206 -0
  13. rdworks/ionized.py +170 -0
  14. rdworks/matchedseries.py +260 -0
  15. rdworks/mol.py +1522 -0
  16. rdworks/mollibr.py +887 -0
  17. rdworks/pka.py +38 -0
  18. rdworks/predefined/Asinex_fragment.xml +20 -0
  19. rdworks/predefined/Astex_RO3.xml +16 -0
  20. rdworks/predefined/Baell2010_PAINS/Baell2010A.xml +52 -0
  21. rdworks/predefined/Baell2010_PAINS/Baell2010B.xml +169 -0
  22. rdworks/predefined/Baell2010_PAINS/Baell2010C.xml +1231 -0
  23. rdworks/predefined/Baell2010_PAINS/PAINS-less-than-015-hits.xml +2048 -0
  24. rdworks/predefined/Baell2010_PAINS/PAINS-less-than-150-hits.xml +278 -0
  25. rdworks/predefined/Baell2010_PAINS/PAINS-more-than-150-hits.xml +83 -0
  26. rdworks/predefined/Baell2010_PAINS/makexml.py +70 -0
  27. rdworks/predefined/Brenk2008_Dundee/makexml.py +21 -0
  28. rdworks/predefined/CNS.xml +18 -0
  29. rdworks/predefined/ChEMBL_Walters/BMS.xml +543 -0
  30. rdworks/predefined/ChEMBL_Walters/Dundee.xml +318 -0
  31. rdworks/predefined/ChEMBL_Walters/Glaxo.xml +168 -0
  32. rdworks/predefined/ChEMBL_Walters/Inpharmatica.xml +276 -0
  33. rdworks/predefined/ChEMBL_Walters/LINT.xml +174 -0
  34. rdworks/predefined/ChEMBL_Walters/MLSMR.xml +351 -0
  35. rdworks/predefined/ChEMBL_Walters/PAINS.xml +1446 -0
  36. rdworks/predefined/ChEMBL_Walters/SureChEMBL.xml +501 -0
  37. rdworks/predefined/ChEMBL_Walters/makexml.py +40 -0
  38. rdworks/predefined/Hann1999_Glaxo/Hann1999.xml +168 -0
  39. rdworks/predefined/Hann1999_Glaxo/Hann1999Acid.xml +102 -0
  40. rdworks/predefined/Hann1999_Glaxo/Hann1999Base.xml +6 -0
  41. rdworks/predefined/Hann1999_Glaxo/Hann1999ElPh.xml +6 -0
  42. rdworks/predefined/Hann1999_Glaxo/Hann1999NuPh.xml +6 -0
  43. rdworks/predefined/Hann1999_Glaxo/makexml.py +83 -0
  44. rdworks/predefined/Kazius2005/Kazius2005.xml +114 -0
  45. rdworks/predefined/Kazius2005/makexml.py +66 -0
  46. rdworks/predefined/ZINC_druglike.xml +24 -0
  47. rdworks/predefined/ZINC_fragment.xml +14 -0
  48. rdworks/predefined/ZINC_leadlike.xml +15 -0
  49. rdworks/predefined/fragment.xml +7 -0
  50. rdworks/predefined/ionized/simple_smarts_pattern.csv +57 -0
  51. rdworks/predefined/ionized/smarts_pattern.csv +107 -0
  52. rdworks/predefined/misc/makexml.py +119 -0
  53. rdworks/predefined/misc/reactive-part-2.xml +104 -0
  54. rdworks/predefined/misc/reactive-part-3.xml +74 -0
  55. rdworks/predefined/misc/reactive.xml +321 -0
  56. rdworks/readin.py +312 -0
  57. rdworks/rgroup.py +2173 -0
  58. rdworks/scaffold.py +520 -0
  59. rdworks/std.py +143 -0
  60. rdworks/stereoisomers.py +127 -0
  61. rdworks/tautomers.py +20 -0
  62. rdworks/units.py +63 -0
  63. rdworks/utils.py +495 -0
  64. rdworks/xml.py +260 -0
  65. rdworks-0.25.7.dist-info/METADATA +37 -0
  66. rdworks-0.25.7.dist-info/RECORD +69 -0
  67. rdworks-0.25.7.dist-info/WHEEL +5 -0
  68. rdworks-0.25.7.dist-info/licenses/LICENSE +21 -0
  69. rdworks-0.25.7.dist-info/top_level.txt +1 -0
rdworks/mollibr.py ADDED
@@ -0,0 +1,887 @@
1
+ import copy
2
+ import itertools
3
+ import pandas as pd
4
+ import gzip
5
+
6
+ from pathlib import Path
7
+ from typing import Optional, Union, Self, Iterator
8
+ from collections import defaultdict
9
+ from concurrent.futures import ProcessPoolExecutor
10
+ from tqdm import tqdm
11
+
12
+ from rdkit import Chem, DataStructs
13
+ from rdkit.Chem import Draw
14
+ from rdkit.ML.Cluster import Butina
15
+ from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
16
+
17
+ from rdworks.conf import Conf
18
+ from rdworks.mol import Mol
19
+
20
+ from rdworks.xml import list_predefined_xml
21
+ from rdworks.utils import precheck_path, guess_mol_id
22
+
23
+
24
+ class MolLibr:
25
def __init__(self,
             molecules: list | tuple | set | None = None,
             names: list | tuple | set | None = None,
             std: bool = False,
             max_workers: int = 4,
             chunksize: int = 100,
             progress: bool = False) -> None:
    """Create a rdworks.MolLibr object.

    The type of the first element of `molecules` decides how every element
    is converted into a library entry.

    Args:
        molecules (list | tuple | set | None, optional): molecules given as
            rdworks.Mol, rdworks.Conf, SMILES strings or rdkit.Chem.Mol.
            Defaults to None (empty library).
        names (list | tuple | set | None, optional): names paired with
            `molecules` by position (only used for SMILES and rdkit.Chem.Mol
            input). Defaults to None.
        std (bool, optional): whether to standardize molecules. Defaults to False.
        max_workers (int, optional): max workers for parallel calculation. Defaults to 4.
        chunksize (int, optional): chunksize for parallel calculation. Defaults to 100.
        progress (bool, optional): whether to show progress bar. Defaults to False.

    Raises:
        ValueError: if counts of molecules and names differ.
        TypeError: if the first molecule is not one of the supported types.
    """
    self.libr = []
    self.max_workers = max_workers
    self.chunksize = chunksize
    self.progress = progress
    self.query = None
    self.threshold = None
    self.clusters = None

    if not (molecules and isinstance(molecules, (list, tuple, set))):
        return

    # Materialize as a list: sets cannot be indexed, and the library needs
    # its own mutable list rather than an alias of the caller's container.
    molecules = list(molecules)
    if names and isinstance(names, (list, tuple, set)):
        names = list(names)
        if len(names) != len(molecules):
            raise ValueError('MolLibr() counts of molecules and names are different')

    first = molecules[0]
    if isinstance(first, Mol):
        self.libr = molecules
    elif isinstance(first, Conf):
        # dict.update() returns None, so the Mol has to be built first and
        # appended afterwards (a comprehension over update() yields Nones).
        for conf in molecules:
            mol = Mol(conf.rdmol, name=conf.name)
            mol.props.update(conf.props)
            self.libr.append(mol)
    elif isinstance(first, str):  # SMILES string
        if names:
            self.libr = [Mol(smi, name=name, std=std) for (smi, name) in zip(molecules, names)]
        else:
            self.libr = [Mol(smi, std=std) for smi in molecules]
            self.rename(prefix='entry')  # default name
    elif isinstance(first, Chem.Mol):
        if names:
            self.libr = [Mol(rdmol, name=name, std=std) for (rdmol, name) in zip(molecules, names)]
        else:
            self.libr = [Mol(rdmol, std=std) for rdmol in molecules]
            self.rename(prefix='entry')  # default name
    else:
        raise TypeError('MolLibr() takes a list|tuple|set of Mol|SMILES|Chem.Mol')
78
+
79
def copy(self) -> Self:
    """Return a deep copy of this library.

    Returns:
        Self: a new rdworks.MolLibr object produced by `copy.deepcopy`.
    """
    from copy import deepcopy

    duplicate = deepcopy(self)
    return duplicate
86
+
87
+
88
def __str__(self) -> str:
    """Return a short text form showing the number of molecules.

    Returns:
        str: text of the form `<MolLibr(N)>` where N is `self.count()`.
    """
    n_molecules = self.count()
    return "<MolLibr(" + str(n_molecules) + ")>"
96
+
97
+
98
def __iter__(self) -> Iterator:
    """Return a fresh iterator over the molecules held in `self.libr`.

    Returns:
        Iterator: iterator of molecules.
    """
    molecules = self.libr
    return iter(molecules)
105
+
106
+
107
def __next__(self) -> Mol:
    """Return the next molecule, advancing an internal cursor.

    `self.libr` is a list, which is not an iterator, so `next(self.libr)`
    cannot work. A per-object position (`self._cursor`) is kept instead.
    It is independent of the iterators handed out by `__iter__`.

    Raises:
        StopIteration: when the cursor has passed the last molecule; the
            cursor is rewound so the next call starts from the beginning.

    Returns:
        Mol: next molecule (rdworks.Mol) object.
    """
    cursor = getattr(self, '_cursor', 0)
    if cursor >= len(self.libr):
        self._cursor = 0
        raise StopIteration
    self._cursor = cursor + 1
    return self.libr[cursor]
114
+
115
+
116
def __eq__(self, other: Self) -> bool:
    """Operator `==`.

    Two libraries are equal when they hold the same set of molecules,
    regardless of order or multiplicity. Both directions are compared, so
    a strict subset is not reported as equal to its superset.

    Args:
        other (rdworks.MolLibr): other rdworks.MolLibr object.

    Returns:
        bool: True if `other` is a MolLibr holding the same set of molecules.
    """
    if not isinstance(other, MolLibr):
        return False
    return frozenset(self.libr) == frozenset(other.libr)
129
+
130
+
131
def __getitem__(self, index: int | slice) -> Mol | list[Mol]:
    """Operator `[]`.

    Args:
        index (int | slice): index or slice of indexes.

    Raises:
        ValueError: if library is empty, or the index is out of range or
            of an unsupported type.

    Returns:
        Mol | list[Mol]: a single rdworks.Mol for an integer index, or a
            list of them for a slice.
    """
    if self.count() == 0:
        raise ValueError("library is empty")
    try:
        return self.libr[index]
    except (IndexError, TypeError) as err:
        # Only indexing failures are translated; anything else propagates.
        raise ValueError(f"index should be 0..{self.count()-1}") from err
149
+
150
+
151
def __add__(self, other: object) -> Self:
    """Operator `+`. Returns a deep copy of self extended with `other`.

    Args:
        other (object): rdworks.Mol (appended) or rdworks.MolLibr (its
            molecules are appended).

    Raises:
        TypeError: if `other` is not rdworks.Mol or rdworks.MolLibr.

    Returns:
        Self: new rdworks.MolLibr object; self is left untouched.
    """
    if not isinstance(other, (Mol, MolLibr)):
        raise TypeError("'+' operator expects rdworks.Mol or rdworks.MolLibr object")
    extended = copy.deepcopy(self)
    if isinstance(other, Mol):
        extended.libr.append(other)
    else:
        extended.libr.extend(other.libr)
    return extended
173
+
174
+
175
def __iadd__(self, other: Mol | Self) -> Self:
    """Operator `+=`. Extends self in place with a molecule or a library.

    Args:
        other (Mol | Self): rdworks.Mol or rdworks.MolLibr object.

    Raises:
        TypeError: if `other` is not rdworks.Mol or rdworks.MolLibr.

    Returns:
        Self: this rdworks.MolLibr object.
    """
    if isinstance(other, Mol):
        additions = [other]
    elif isinstance(other, MolLibr):
        additions = other.libr
    else:
        raise TypeError("'+=' operator expects Mol or MolLibr object")
    self.libr.extend(additions)
    return self
194
+
195
+
196
def __sub__(self, other: Mol | Self) -> Self:
    """Operator `-`. Returns a copy holding the molecules not found in `other`.

    The result has set semantics (each remaining molecule appears once)
    but keeps the order in which molecules appear in this library.

    Args:
        other (Mol | Self): other rdworks.Mol or rdworks.MolLibr object.

    Raises:
        TypeError: if `other` is not rdworks.Mol or rdworks.MolLibr.

    Returns:
        Self: a copy of subtractive subset.
    """
    if isinstance(other, Mol):
        excluded = frozenset([other])
    elif isinstance(other, MolLibr):
        excluded = frozenset(other.libr)
    else:
        raise TypeError("'-' operator expects rdworks.Mol or rdworks.MolLibr object")
    obj = copy.deepcopy(self)
    # dict.fromkeys() de-duplicates while preserving first-seen order,
    # unlike a set whose iteration order is arbitrary.
    obj.libr = [mol for mol in dict.fromkeys(self.libr) if mol not in excluded]
    return obj
217
+
218
+
219
def __isub__(self, other: Mol | Self) -> Self:
    """Operator `-=`. Removes from self the molecules found in `other`.

    The result has set semantics (each remaining molecule appears once)
    but keeps the order in which molecules appear in this library.

    Args:
        other (Mol | Self): other molecule or library.

    Raises:
        TypeError: if `other` is not rdworks.Mol or rdworks.MolLibr.

    Returns:
        Self: this rdworks.MolLibr object.
    """
    if isinstance(other, Mol):
        excluded = frozenset([other])
    elif isinstance(other, MolLibr):
        excluded = frozenset(other.libr)
    else:
        raise TypeError("'-=' operator expects rdworks.Mol or rdworks.MolLibr object")
    # dict.fromkeys() de-duplicates while preserving first-seen order,
    # unlike a set whose iteration order is arbitrary.
    self.libr = [mol for mol in dict.fromkeys(self.libr) if mol not in excluded]
    return self
239
+
240
+
241
def __and__(self, other: Mol | Self) -> Self:
    """Operator `&`. Returns a copy holding the molecules shared with `other`.

    The result has set semantics (each shared molecule appears once) but
    keeps the order in which molecules appear in this library.

    Args:
        other (Mol | Self): other molecule or library.

    Raises:
        TypeError: if `other` is not rdworks.Mol or rdworks.MolLibr.

    Returns:
        Self: a copy of rdworks.MolLibr object.
    """
    if isinstance(other, Mol):
        shared = frozenset([other])
    elif isinstance(other, MolLibr):
        shared = frozenset(other.libr)
    else:
        raise TypeError("'&' operator or overlap() expects rdworks.Mol or rdworks.MolLibr object")
    obj = copy.deepcopy(self)
    # dict.fromkeys() de-duplicates while preserving first-seen order,
    # unlike a set whose iteration order is arbitrary.
    obj.libr = [mol for mol in dict.fromkeys(self.libr) if mol in shared]
    return obj
262
+
263
+
264
def __iand__(self, other: Mol | Self) -> Self:
    """Operator `&=`. Keeps in self only the molecules shared with `other`.

    The result has set semantics (each shared molecule appears once) but
    keeps the order in which molecules appear in this library.

    Args:
        other (Mol | Self): other molecule or library.

    Raises:
        TypeError: if `other` is not rdworks.Mol or rdworks.MolLibr.

    Returns:
        Self: this rdworks.MolLibr object.
    """
    if isinstance(other, Mol):
        shared = frozenset([other])
    elif isinstance(other, MolLibr):
        shared = frozenset(other.libr)
    else:
        raise TypeError("'&=' operator expects rdworks.Mol or rdworks.MolLibr object")
    # dict.fromkeys() de-duplicates while preserving first-seen order,
    # unlike a set whose iteration order is arbitrary.
    self.libr = [mol for mol in dict.fromkeys(self.libr) if mol in shared]
    return self
284
+
285
+
286
@staticmethod
def _mask_similar(mol: Mol, targs: tuple) -> bool:
    """Mask function: forward `targs` to `mol.is_similar()`.

    Takes the extra arguments as one tuple so that it can be used with
    `executor.map`, which passes one item per iterable.

    Args:
        mol (Mol): subject rdworks.Mol object.
        targs (tuple): positional arguments unpacked into `mol.is_similar`.

    Returns:
        bool: whatever `mol.is_similar(*targs)` returns.
    """
    verdict = mol.is_similar(*targs)
    return verdict
298
+
299
+
300
@staticmethod
def _mask_drop(mol: Mol, terms: str | Path) -> bool:
    """Mask function: True when the molecule does NOT match `terms`.

    The value is the negation of `mol.is_matching(terms)`, i.e. a keep
    flag: entries flagged True survive `itertools.compress`, so matching
    molecules are the ones dropped.

    Args:
        mol (Mol): subject rdworks.Mol object.
        terms (str | Path): rule.

    Returns:
        bool: False if the molecule matches the terms, True otherwise.
    """
    matched = mol.is_matching(terms)
    return not matched
314
+
315
@staticmethod
def _map_qed(mol: Mol, properties: list[str] | None = None) -> dict:
    """A map function to apply Mol.qed(`properties`) on `mol`.

    NOTE(review): the return annotation says dict, but MolLibr.qed() stores
    the values returned here as the new library entries -- confirm what
    Mol.qed() actually returns.

    Args:
        mol (Mol): subject rdworks.Mol object.
        properties (list[str] | None, optional): properties. Defaults to
            None, meaning ['QED', 'MolWt', 'LogP', 'TPSA', 'HBD'].

    Returns:
        dict: the value returned by `mol.qed(properties)`.
    """
    if properties is None:
        # A fresh list per call; a list literal in the signature would be
        # shared between calls.
        properties = ['QED', 'MolWt', 'LogP', 'TPSA', 'HBD']
    return mol.qed(properties)
331
+
332
+
333
def compute(self, **kwargs) -> Self:
    """Update the parallel-computing settings that are given as keywords.

    Only `max_workers`, `chunksize` and `progress` are recognised; any
    other keyword is ignored and settings not mentioned keep their value.

    Args:
        max_workers (int, optional): max number of workers.
        chunksize (int, optional): chunksize of splitted workload.
        progress (bool, optional): whether to show progress bar.

    Returns:
        Self: rdworks.MolLibr object.
    """
    for option in ('max_workers', 'chunksize', 'progress'):
        if option in kwargs:
            setattr(self, option, kwargs[option])
    return self
348
+
349
+
350
def rename(self, prefix: Optional[str] = None, sep: str = '.', start: int = 1) -> Self:
    """Rename molecules (and their conformers) in place.

    With a `prefix`, each molecule is passed `{prefix}{sep}{serial_number}`
    where the serial number is left-padded with zeros to the number of
    digits in the molecule count; a library of one molecule is passed the
    bare prefix.

    Without a `prefix`, only molecules sharing a name are renamed
    (`name` -> `name.1`, `name.2`, ...) and then `mol.rename()` is called
    on every molecule to rename its conformers.

    Examples:
        >>> a.rename(prefix='a')

    Args:
        prefix (str, optional): prefix for new name. Defaults to None.
        sep (str): separator between prefix and serial number (default: `.`)
        start (int): start number of serial number.

    Returns:
        Self: rdworks.MolLibr object.
    """
    total = self.count()
    if prefix:
        width = len(str(total))  # ex. '100' -> 3
        for number, mol in enumerate(self.libr, start=start):
            if total > 1:
                padded = str(number).rjust(width, '0')
                mol.rename(prefix=f"{prefix}{sep}{padded}")
            else:
                mol.rename(prefix)
        return self

    # group positions by current name to find the names used more than once
    positions = defaultdict(list)
    for idx, mol in enumerate(self.libr):
        positions[mol.name].append(idx)
    duplicated = {name for name, where in positions.items() if len(where) > 1}
    for idx, mol in enumerate(self.libr):
        if mol.name in duplicated:
            rank = positions[mol.name].index(idx) + 1
            mol.rename(f'{mol.name}.{rank}')
    # rename conformers
    for mol in self.libr:
        mol.rename()
    return self
397
+
398
+
399
def overlap(self, other: Self) -> Self:
    """Return the subset shared with `other`; same as calling `__and__`.

    Args:
        other (Self): rdworks.MolLibr object.

    Returns:
        Self: common subset of rdworks.MolLibr.
    """
    common = self.__and__(other)
    return common
409
+
410
+
411
def similar(self, query: Mol, threshold: float = 0.2, **kwargs) -> Self:
    """Returns a copy of subset that are similar to `query`.

    Each molecule is tested in a process pool through
    `MolLibr._mask_similar`, which calls `mol.is_similar(query, threshold)`.

    Args:
        query (Mol): query molecule.
        threshold (float, optional): similarity threshold. Defaults to 0.2.
        **kwargs: `max_workers`, `chunksize` and `progress` overrides,
            applied to the returned copy via `compute()`.

    Raises:
        TypeError: if query is not rdworks.Mol type.

    Returns:
        Self: a copy of self holding only the similar molecules.
    """
    # Validate before paying for the deep copy.
    if not isinstance(query, Mol):
        raise TypeError("MolLibr.similar() expects Mol object")
    obj = copy.deepcopy(self).compute(**kwargs)
    largs = [(query, threshold)] * obj.count()
    with ProcessPoolExecutor(max_workers=obj.max_workers) as executor:
        flags = executor.map(MolLibr._mask_similar, obj.libr, largs, chunksize=obj.chunksize)
        # Read the setting from `obj`: it carries the `progress=` override
        # from kwargs, which `self` does not.
        if obj.progress:
            flags = tqdm(flags, desc="Similar", total=obj.count())
        mask = list(flags)
    obj.libr = list(itertools.compress(obj.libr, mask))
    return obj
438
+
439
+
440
+
441
def unique(self, report=False) -> Self:
    """Return a deep copy in which each SMILES occurs only once.

    The first molecule seen for a SMILES is kept. Names of later duplicates
    that differ from the kept molecule's name are collected, without
    repeats, in the kept molecule's `props['aka']` list.

    Args:
        report (bool, optional): whether to print the duplicates and the
            final count. Defaults to False.

    Returns:
        Self: a copy of self.
    """
    obj = copy.deepcopy(self)
    first_seen = {}  # SMILES -> first molecule carrying it
    survivors = []
    for mol in obj.libr:
        if mol.smiles not in first_seen:
            first_seen[mol.smiles] = mol
            survivors.append(mol)
            continue
        keeper = first_seen[mol.smiles]
        # ignore the same name or recorded aka
        if (mol.name != keeper.name) and (mol.name not in keeper.props['aka']):
            keeper.props['aka'].append(mol.name)
    obj.libr = survivors
    if report:
        print("duplicates:")
        for mol in obj.libr:
            aliases = mol.props['aka']
            if len(aliases) > 0:
                print(f" {mol.name}({len(aliases)}) - {','.join(aliases)}")
        print(f"de-duplicated to {obj.count()} molecules")
    return obj
470
+
471
+
472
def qed(self, properties: list[str] | None = None, **kwargs) -> Self:
    """Calculate quantitative estimate of drug-likeness (QED) properties in place.

    Every molecule is sent through `MolLibr._map_qed` in a process pool and
    `self.libr` is replaced by the values that come back; no copy of the
    library is made.

    Args:
        properties (list[str] | None, optional): properties to calculate.
            Defaults to None, meaning ['QED', 'MolWt', 'LogP', 'TPSA', 'HBD'].
        **kwargs: `max_workers`, `chunksize` and `progress` overrides,
            applied to self via `compute()`.

    Returns:
        Self: self.
    """
    if properties is None:
        # A fresh list per call; a list literal in the signature would be
        # shared between calls.
        properties = ['QED', 'MolWt', 'LogP', 'TPSA', 'HBD']
    self.compute(**kwargs)
    lprops = [properties] * self.count()
    with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
        results = executor.map(MolLibr._map_qed, self.libr, lprops, chunksize=self.chunksize)
        if self.progress:
            results = tqdm(results, desc="QED Properties", total=self.count())
        # Consume inside the `with` block, while the pool is still alive.
        self.libr = list(results)
    return self
495
+
496
+
497
def drop(self, terms: str | Path | None = None, invert: bool = False, **kwargs) -> Self:
    """Drops matched molecules and returns a copy of library with remaining molecules.

    When `terms` is empty, the result of `list_predefined_xml()` is printed
    and self (not a copy) is returned unchanged.

    Args:
        terms (str | Path | None, optional): matching terms. Defaults to None.
        invert (bool, optional): whether to invert selection by the `terms`,
            i.e. keep the matching molecules instead. Defaults to False.
        **kwargs: `max_workers`, `chunksize` and `progress` overrides,
            applied to the returned copy via `compute()`.

    Returns:
        Self: a copy of self.
    """
    if not terms:
        print(list_predefined_xml())
        return self
    obj = copy.deepcopy(self).compute(**kwargs)
    lterms = [terms] * obj.count()
    with ProcessPoolExecutor(max_workers=obj.max_workers) as executor:
        flags = executor.map(MolLibr._mask_drop, obj.libr, lterms, chunksize=obj.chunksize)
        if obj.progress:
            flags = tqdm(flags, desc="Drop", total=obj.count())
        mask = list(flags)
    if invert:
        mask = [not keep for keep in mask]
    obj.libr = list(itertools.compress(obj.libr, mask))
    return obj
525
+
526
+
527
def pick(self, n: int, **kwargs) -> Self:
    """Picks n diverse molecules (not implemented yet).

    Args:
        n (int): number of molecules to pick.

    Raises:
        NotImplementedError: always; the method is a placeholder.

    Returns:
        Self: a copy of self (once implemented).
    """
    # Raise immediately: deep-copying the library first would be wasted work.
    raise NotImplementedError("MolLibr.pick() is not implemented yet")
539
+
540
+
541
+
542
+
543
+ ##################################################
544
+ ### endpoints
545
+ ##################################################
546
+
547
+
548
def count(self) -> int:
    """Return how many molecules the library holds.

    Returns:
        int: length of `self.libr`.
    """
    molecules = self.libr
    return len(molecules)
555
+
556
+
557
def cluster(self, threshold: float = 0.3, ordered: bool = True, drop_singleton: bool = True) -> list:
    """Clusters molecules by fingerprint with Butina clustering.

    Missing fingerprints (`mol.fp`) are computed and stored on the
    molecules. Pairwise distances are `1 - Tanimoto similarity`.

    Args:
        threshold (float, optional): cutoff handed to `Butina.ClusterData`
            as `distThresh`; it therefore applies to the distance
            (1 - similarity), not to the similarity. Defaults to 0.3.
        ordered (bool, optional): order clusters by size, largest first.
            Defaults to True.
        drop_singleton (bool, optional): exclude clusters with a single
            member. Defaults to True.

    Returns:
        list: clusters as returned by `Butina.ClusterData`, each a
            collection of indices.
    """
    for mol in self.libr:
        if not mol.fp:
            mol.fp = mol.MFP2.GetFingerprint(mol.rdmol)
    fps = [mol.fp for mol in self.libr if mol.fp]
    n_points = len(fps)
    # flattened lower-triangle distance matrix, row by row
    distances = []
    for row in range(1, n_points):
        similarities = DataStructs.BulkTanimotoSimilarity(fps[row], fps[:row])
        distances.extend(1 - s for s in similarities)
    clusters = Butina.ClusterData(distances,
                                  nPts=n_points,
                                  distThresh=threshold,
                                  isDistData=True,
                                  reordering=True)
    if ordered:
        clusters = sorted(clusters, key=len, reverse=True)
    if drop_singleton:
        clusters = [members for members in clusters if len(members) > 1]
    return clusters
593
+
594
+
595
+
596
def to_sdf(self,
           path: str | Path,
           confs: bool = False,
           props: bool = True,
           separate: bool = False) -> None:
    """Writes to .sdf or .sdf.gz file.

    Output is gzip-compressed when the last suffix of `path` is `.gz`.
    With `separate=True` one file per molecule is written next to `path`:

    `dirname/filename.sdf` -> `dirname/filename_{molecule name}.sdf`
    `dirname/filename.sdf.gz` -> `dirname/filename_{molecule name}.sdf.gz`

    Args:
        path (str | Path): output filename or path.
        confs (bool): forwarded to `Mol.to_sdf`. Defaults to False.
        props (bool): forwarded to `Mol.to_sdf`. Defaults to True.
        separate (bool): write each molecule to separate files. Defaults to False.
    """
    if isinstance(path, str):
        path = Path(path)
    # e.g. 'mol.sdf.gz' -> all suffixes '.sdf.gz' -> stem 'mol'
    stem = path.name.replace(''.join(path.suffixes), '')
    if path.suffix == '.gz':
        opener, mode, extension = gzip.open, "wt", ".sdf.gz"
    else:
        opener, mode, extension = open, "w", ".sdf"

    if separate:
        for mol in self.libr:
            target = path.with_name(f'{stem}_{mol.name}{extension}')
            with opener(target, mode) as f:
                f.write(mol.to_sdf(confs, props))
    else:
        with opener(path, mode) as f:
            for mol in self.libr:
                f.write(mol.to_sdf(confs, props))
641
+
642
+
643
def to_smi(self, path: str | Path) -> None:
    """Writes one `SMILES name` line per molecule to a .smi file.

    The file is gzip-compressed when `path` ends with `.gz`.

    Args:
        path (str | Path): output filename or path.
    """
    target = path.as_posix() if isinstance(path, Path) else path
    if target.endswith('.gz'):
        handle = gzip.open(target, "wt")
    else:
        handle = open(target, "w")
    with handle as out:
        for mol in self.libr:
            out.write(f'{mol.smiles} {mol.name}\n')
659
+
660
+
661
def to_image(self, width: int = 200, height: int = 200, index: bool = False, mols_per_row: int = 5) -> str:
    """Draws the library as a grid via `Draw.MolsToGridImage(useSVG=True)`.

    With `index=True` every atom gets an `atomNote` property holding its
    1-based index; the property is set on the library's own molecules.

    Args:
        width (int, optional): width of each cell. Defaults to 200.
        height (int, optional): height of each cell. Defaults to 200.
        index (bool, optional): whether to show atom index. Defaults to False.
        mols_per_row (int, optional): number of molecules per row, capped at
            the number of molecules. Defaults to 5.

    Returns:
        str: SVG strings for Jupyter notebook.
    """
    if index:
        for mol in self.libr:
            for atom in mol.rdmol.GetAtoms():
                atom.SetProp("atomNote", str(atom.GetIdx()+1))
    rdmols = [mol.rdmol for mol in self.libr]
    legends = [mol.name for mol in self.libr]
    per_row = min(mols_per_row, len(rdmols))
    grid = Draw.MolsToGridImage(rdmols,
                                legends=legends,
                                molsPerRow=per_row,
                                subImgSize=(width, height),
                                useSVG=True)
    return grid
685
+
686
+
687
def to_png(self, path: str | Path, width: int = 200, height: int = 200, index: bool = False, mols_per_row: int = 5) -> None:
    """Draws the library as a grid and saves it to `path`.

    The image comes from `Draw.MolsToGridImage(useSVG=False)` and is written
    with its `.save()` method. With `index=True` every atom gets an
    `atomNote` property holding its 1-based index; the property is set on
    the library's own molecules.

    Args:
        path (str | Path): output filename or path.
        width (int, optional): width of each cell. Defaults to 200.
        height (int, optional): height of each cell. Defaults to 200.
        index (bool, optional): whether to show atom index. Defaults to False.
        mols_per_row (int, optional): number of molecules per row, capped at
            the number of molecules. Defaults to 5.
    """
    target = path.as_posix() if isinstance(path, Path) else path
    if index:
        for mol in self.libr:
            for atom in mol.rdmol.GetAtoms():
                atom.SetProp("atomNote", str(atom.GetIdx()+1))
    rdmols = [mol.rdmol for mol in self.libr]
    legends = [mol.name for mol in self.libr]
    per_row = min(mols_per_row, len(rdmols))
    grid = Draw.MolsToGridImage(rdmols,
                                legends=legends,
                                molsPerRow=per_row,
                                subImgSize=(width, height),
                                useSVG=False)
    grid.save(target)
710
+
711
+
712
def to_html(self) -> str:
    """Builds one HTML document from every molecule's HTML fragment.

    Each fragment comes from `mol.to_html(htmlbody=False)`; the fragments
    are concatenated inside a single `<html><body>` wrapper.

    Returns:
        str: HTML strings.
    """
    fragments = [mol.to_html(htmlbody=False) for mol in self.libr]
    return "<html><body>" + "".join(fragments) + "</body></html>"
723
+
724
+
725
def to_dataframe(self,
                 name: str = 'name',
                 smiles: str = 'smiles',
                 confs: bool = False) -> pd.DataFrame:
    """Returns a Pandas DataFrame.

    Without `confs`, there is one row per molecule built from `mol.props`.
    With `confs`, there is one row per conformer built from `conf.props`
    (the `coord` property is left out), labelled with the conformer name
    and the parent molecule's SMILES.

    Columns are the name column, the SMILES column, then the property keys
    in the order they are first encountered, so the layout is reproducible
    between runs. Properties whose key equals `name` or `smiles` are
    skipped; a property missing from a row is filled with None.

    Args:
        name (str, optional): column name for name. Defaults to 'name'.
        smiles (str, optional): column name for SMILES. Defaults to 'smiles'.
        confs (bool, optional): whether to include conformer properties. Defaults to False.

    Returns:
        pd.DataFrame: pandas DataFrame.
    """
    if confs:
        excluded = {name, smiles, 'coord'}
        records = [(conf.name, mol.smiles, conf.props)
                   for mol in self.libr for conf in mol.confs]
    else:
        excluded = {name, smiles}
        records = [(mol.name, mol.smiles, mol.props) for mol in self.libr]

    # A dict serves as an ordered set: a plain set of str keys iterates in
    # an order that changes from one interpreter run to the next.
    property_columns = {}
    for _, _, props in records:
        for key in props:
            if key not in excluded:
                property_columns.setdefault(key, None)

    data = {name: [], smiles: []}
    data.update({key: [] for key in property_columns})
    for record_name, record_smiles, props in records:
        data[name].append(record_name)
        data[smiles].append(record_smiles)
        for key in property_columns:
            data[key].append(props.get(key))
    return pd.DataFrame(data)
776
+
777
+
778
def to_csv(self,
           path: str | Path,
           confs: bool = False,
           decimal_places: int = 3) -> None:
    """Writes the table from `to_dataframe(confs=confs)` to a .csv file.

    The row index is not written and floats are formatted with a fixed
    number of decimal places.

    Args:
        path (str | Path): output filename or path.
        confs (bool, optional): whether to include conformer properties. Defaults to False.
        decimal_places (int, optional): decimal places for float numbers. Defaults to 3.
    """
    table = self.to_dataframe(confs=confs)
    float_format = '%.' + str(decimal_places) + 'f'
    table.to_csv(path, index=False, float_format=float_format)
791
+
792
+
793
@staticmethod
def _mask_nn_applicable(mol: Mol, model: str) -> bool:
    """Mask function: forward `model` to `mol.is_nn_applicable()`.

    Args:
        mol (Mol): rdworks.Mol object.
        model (str): name of NN model.

    Returns:
        bool: whatever `mol.is_nn_applicable(model)` returns.
    """
    applicable = mol.is_nn_applicable(model)
    return applicable
805
+
806
+
807
def nn_applicable(self, model: str, **kwargs) -> Self:
    """Returns a copy of subset of library that is applicable to given neural network `model`.

    Each molecule is tested in a process pool through
    `MolLibr._mask_nn_applicable`, which calls `mol.is_nn_applicable(model)`.

    Examples:
        >>> libr = rdworks.MolLibr(drug_smiles, drug_names)
        >>> ani2x_compatible_subset = libr.nn_applicable('ANI-2x', progress=False)

    Args:
        model (str): name of model.
        **kwargs: `max_workers`, `chunksize` and `progress` overrides,
            applied to the returned copy via `compute()`.

    Returns:
        Self: subset of library.
    """
    obj = copy.deepcopy(self).compute(**kwargs)
    lmodel = [model] * obj.count()
    with ProcessPoolExecutor(max_workers=obj.max_workers) as executor:
        # Both the progress and the plain path use the same mask function
        # (the helper's name carries a leading underscore).
        flags = executor.map(MolLibr._mask_nn_applicable, obj.libr, lmodel, chunksize=obj.chunksize)
        if obj.progress:
            flags = tqdm(flags, desc="NN applicable", total=obj.count())
        mask = list(flags)
    obj.libr = list(itertools.compress(obj.libr, mask))
    return obj
833
+
834
+
835
def to_nnbatches(self, batchsize: int = 1000) -> list:
    """Split workload flexibly into a number of batches.

    - Conformers are packed in library order; a batch is closed when adding
      the next conformer would push its atom count above `batchsize`.
    - A single conformer with more atoms than `batchsize` gets a batch of
      its own; empty batches are never produced.
    - Conformers originated from a same molecule can be split into multiple batches.
    - Or one batch can contain conformers originated from multiple molecules.

    Each batch is a tuple `(coord, numbers, charges, confs, mols)`:

    coord: per conformer, a list of (x, y, z) tuples, one per atom.
    numbers: per conformer, the atomic numbers of its atoms (read from the
        conformer so that hydrogens are included).
    charges: per conformer, `props['charge']` of its parent molecule.
    confs: the conformer objects in the batch.
    mols: the parent molecule of each conformer (same length as confs).

    Args:
        batchsize: max. number of atoms in a batch.

    Returns:
        list: list of batches.
    """
    pre_batches = []
    batch_mols, batch_confs, batch_n_atoms = [], [], 0

    for mol in self.libr:
        for conf in mol.confs:
            n_atoms = conf.props['atoms']
            # Close the current batch only if it holds something; otherwise
            # an oversized first conformer would emit an empty batch.
            if batch_confs and (batch_n_atoms + n_atoms) > batchsize:
                pre_batches.append((batch_mols, batch_confs))
                batch_mols, batch_confs, batch_n_atoms = [], [], 0
            batch_mols.append(mol)
            batch_confs.append(conf)
            batch_n_atoms += n_atoms

    if batch_confs:  # last remaining batch
        pre_batches.append((batch_mols, batch_confs))

    batches = []
    for batch_mols, batch_confs in pre_batches:
        charges = [mol.props['charge'] for mol in batch_mols]
        # tuples rather than lists, to be consistent with legacy code
        coord = [
            [tuple(xyz) for xyz in conf.rdmol.GetConformer().GetPositions().tolist()]
            for conf in batch_confs
        ]
        # numbers should be got from conformers because of hydrogens
        numbers = [[a.GetAtomicNum() for a in conf.rdmol.GetAtoms()] for conf in batch_confs]
        batches.append((coord, numbers, charges, batch_confs, batch_mols))

    return batches