rdkit-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdkit_cli/__init__.py +4 -0
- rdkit_cli/__main__.py +6 -0
- rdkit_cli/cli.py +162 -0
- rdkit_cli/commands/__init__.py +1 -0
- rdkit_cli/commands/conformers.py +220 -0
- rdkit_cli/commands/convert.py +162 -0
- rdkit_cli/commands/depict.py +311 -0
- rdkit_cli/commands/descriptors.py +251 -0
- rdkit_cli/commands/diversity.py +232 -0
- rdkit_cli/commands/enumerate.py +229 -0
- rdkit_cli/commands/filter.py +384 -0
- rdkit_cli/commands/fingerprints.py +179 -0
- rdkit_cli/commands/fragment.py +284 -0
- rdkit_cli/commands/mcs.py +162 -0
- rdkit_cli/commands/reactions.py +191 -0
- rdkit_cli/commands/scaffold.py +243 -0
- rdkit_cli/commands/similarity.py +359 -0
- rdkit_cli/commands/standardize.py +138 -0
- rdkit_cli/core/__init__.py +1 -0
- rdkit_cli/core/conformers.py +197 -0
- rdkit_cli/core/depict.py +241 -0
- rdkit_cli/core/descriptors.py +248 -0
- rdkit_cli/core/diversity.py +174 -0
- rdkit_cli/core/enumerate.py +190 -0
- rdkit_cli/core/filters.py +443 -0
- rdkit_cli/core/fingerprints.py +265 -0
- rdkit_cli/core/fragment.py +237 -0
- rdkit_cli/core/mcs.py +128 -0
- rdkit_cli/core/reactions.py +159 -0
- rdkit_cli/core/scaffold.py +174 -0
- rdkit_cli/core/similarity.py +206 -0
- rdkit_cli/core/standardizer.py +141 -0
- rdkit_cli/io/__init__.py +7 -0
- rdkit_cli/io/formats.py +109 -0
- rdkit_cli/io/readers.py +352 -0
- rdkit_cli/io/writers.py +275 -0
- rdkit_cli/parallel/__init__.py +5 -0
- rdkit_cli/parallel/batch.py +181 -0
- rdkit_cli/parallel/executor.py +180 -0
- rdkit_cli/progress/__init__.py +5 -0
- rdkit_cli/progress/ninja.py +195 -0
- rdkit_cli/utils/__init__.py +1 -0
- rdkit_cli-0.1.0.dist-info/METADATA +380 -0
- rdkit_cli-0.1.0.dist-info/RECORD +47 -0
- rdkit_cli-0.1.0.dist-info/WHEEL +4 -0
- rdkit_cli-0.1.0.dist-info/entry_points.txt +2 -0
- rdkit_cli-0.1.0.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,443 @@
|
|
|
1
|
+
"""Molecular filtering engine."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Optional, Any, Callable
|
|
5
|
+
|
|
6
|
+
from rdkit import Chem
|
|
7
|
+
from rdkit.Chem import Descriptors, FilterCatalog, rdfiltercatalog
|
|
8
|
+
|
|
9
|
+
from rdkit_cli.io.readers import MoleculeRecord
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class FilterResult:
    """Outcome of a single filter check."""

    # True when the check succeeded.
    passed: bool
    # Optional explanatory message; defaults to None when nothing is reported.
    reason: Optional[str] = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Drug-likeness rules.
#
# Each table maps a property name to an inclusive (minimum, maximum) pair;
# None on either side means "no bound on that side".
_LIPINSKI_LIMITS = {
    "MolWt": (None, 500),
    "MolLogP": (None, 5),
    "NumHDonors": (None, 5),
    "NumHAcceptors": (None, 10),
}

_VEBER_LIMITS = {
    "NumRotatableBonds": (None, 10),
    "TPSA": (None, 140),
}

_GHOSE_LIMITS = {
    "MolWt": (160, 480),
    "MolLogP": (-0.4, 5.6),
    "NumAtoms": (20, 70),
    "MolMR": (40, 130),
}

_EGAN_LIMITS = {
    "MolLogP": (None, 5.88),
    "TPSA": (None, 131.6),
}

_MUEGGE_LIMITS = {
    "MolWt": (200, 600),
    "MolLogP": (-2, 5),
    "TPSA": (None, 150),
    "RingCount": (None, 7),
    "NumHDonors": (None, 5),
    "NumHAcceptors": (None, 10),
    "NumRotatableBonds": (None, 15),
}

# Rule-set name -> property limits for that rule set.
DRUGLIKE_RULES = {
    "lipinski": _LIPINSKI_LIMITS,
    "veber": _VEBER_LIMITS,
    "ghose": _GHOSE_LIMITS,
    "egan": _EGAN_LIMITS,
    "muegge": _MUEGGE_LIMITS,
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def check_property_range(
    mol: Chem.Mol,
    property_name: str,
    min_val: Optional[float],
    max_val: Optional[float],
) -> bool:
    """
    Check if a property is within range.

    Args:
        mol: RDKit molecule
        property_name: "NumAtoms", or the name of a function in
            rdkit.Chem.Descriptors
        min_val: Inclusive lower bound, or None for no lower bound
        max_val: Inclusive upper bound, or None for no upper bound

    Returns:
        True if the value lies within [min_val, max_val]. Also True when
        the property name is not recognised (unknown properties pass).
    """
    if property_name == "NumAtoms":
        # Count every atom, including implicit hydrogens: the Ghose rule
        # that uses this property is defined on the total atom count, and
        # the default GetNumAtoms() only counts explicit graph atoms.
        value = mol.GetNumAtoms(onlyExplicit=False)
    else:
        func = getattr(Descriptors, property_name, None)
        # Unknown names, and module attributes that are not functions,
        # are treated as "no constraint" rather than an error.
        if not callable(func):
            return True  # Unknown property, pass
        value = func(mol)

    if min_val is not None and value < min_val:
        return False
    if max_val is not None and value > max_val:
        return False

    return True
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def check_druglike_rules(mol: Chem.Mol, rule_name: str) -> FilterResult:
    """
    Evaluate a molecule against one named drug-likeness rule set.

    Args:
        mol: RDKit molecule
        rule_name: Key into DRUGLIKE_RULES (lipinski, veber, etc.)

    Returns:
        FilterResult that passes when no property is out of range;
        otherwise a failed result whose reason lists the offending
        properties.

    Raises:
        ValueError: If rule_name is not a key of DRUGLIKE_RULES.
    """
    limits = DRUGLIKE_RULES.get(rule_name)
    if limits is None:
        raise ValueError(f"Unknown rule: {rule_name}")

    # Every property is evaluated so that the reason reports all violations.
    failed = [
        name
        for name, (lower, upper) in limits.items()
        if not check_property_range(mol, name, lower, upper)
    ]

    if not failed:
        return FilterResult(passed=True)
    return FilterResult(passed=False, reason=f"Failed: {', '.join(failed)}")
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class SubstructureFilter:
    """Keep or drop molecules depending on a SMARTS substructure match."""

    def __init__(
        self,
        smarts: str,
        exclude: bool = False,
        include_smiles: bool = True,
        include_name: bool = True,
    ):
        """
        Build the filter from a SMARTS query.

        Args:
            smarts: SMARTS pattern to match
            exclude: If True, matching molecules are rejected; if False,
                only matching molecules are kept
            include_smiles: Include SMILES in output
            include_name: Include molecule name in output

        Raises:
            ValueError: If the SMARTS string cannot be parsed.
        """
        query = Chem.MolFromSmarts(smarts)
        if query is None:
            raise ValueError(f"Invalid SMARTS pattern: {smarts}")

        self.pattern = query
        self.exclude = exclude
        self.include_smiles = include_smiles
        self.include_name = include_name

    def filter(self, record: MoleculeRecord) -> Optional[dict[str, Any]]:
        """
        Apply the substructure test to one record.

        Args:
            record: MoleculeRecord to check

        Returns:
            Output dictionary when the record is kept, None when it is
            rejected or has no molecule.
        """
        if record.mol is None:
            return None

        matched = record.mol.HasSubstructMatch(self.pattern)

        # exclude=True keeps non-matching molecules, exclude=False keeps
        # matching ones.
        if self.exclude:
            keep = not matched
        else:
            keep = matched
        if not keep:
            return None

        row: dict[str, Any] = {}
        if self.include_smiles:
            row["smiles"] = record.smiles
        if self.include_name and record.name:
            row["name"] = record.name

        # Metadata never overrides the smiles/name set above.
        for field_name, field_value in record.metadata.items():
            row.setdefault(field_name, field_value)

        return row
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
class PropertyFilter:
    """Keep molecules whose properties all fall inside given ranges."""

    def __init__(
        self,
        rules: dict[str, tuple[Optional[float], Optional[float]]],
        include_smiles: bool = True,
        include_name: bool = True,
    ):
        """
        Store the range rules and output options.

        Args:
            rules: Dictionary of property_name -> (min_val, max_val)
            include_smiles: Include SMILES in output
            include_name: Include molecule name in output
        """
        self.rules = rules
        self.include_smiles = include_smiles
        self.include_name = include_name

    def filter(self, record: MoleculeRecord) -> Optional[dict[str, Any]]:
        """
        Check one record against every rule.

        Returns:
            Output dictionary when all rules are satisfied, None when any
            rule fails or the record has no molecule.
        """
        if record.mol is None:
            return None

        # all() stops at the first failing rule.
        within_limits = all(
            check_property_range(record.mol, name, lower, upper)
            for name, (lower, upper) in self.rules.items()
        )
        if not within_limits:
            return None

        row: dict[str, Any] = {}
        if self.include_smiles:
            row["smiles"] = record.smiles
        if self.include_name and record.name:
            row["name"] = record.name

        # Metadata never overrides the smiles/name set above.
        for field_name, field_value in record.metadata.items():
            row.setdefault(field_name, field_value)

        return row
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
class DruglikeFilter:
    """Keep molecules that satisfy a named drug-likeness rule set."""

    def __init__(
        self,
        rule_name: str = "lipinski",
        max_violations: int = 0,
        include_smiles: bool = True,
        include_name: bool = True,
    ):
        """
        Select the rule set and the violation tolerance.

        Args:
            rule_name: Rule set to use (a key of DRUGLIKE_RULES)
            max_violations: Maximum allowed violations
            include_smiles: Include SMILES in output
            include_name: Include molecule name in output

        Raises:
            ValueError: If rule_name is not a known rule set.
        """
        if rule_name not in DRUGLIKE_RULES:
            raise ValueError(f"Unknown rule: {rule_name}. Available: {', '.join(DRUGLIKE_RULES.keys())}")

        self.rule_name = rule_name
        self.max_violations = max_violations
        self.include_smiles = include_smiles
        self.include_name = include_name

    def filter(self, record: MoleculeRecord) -> Optional[dict[str, Any]]:
        """
        Count rule violations for one record.

        Returns:
            Output dictionary when the number of violated properties does
            not exceed max_violations, None otherwise or when the record
            has no molecule.
        """
        if record.mol is None:
            return None

        limits = DRUGLIKE_RULES[self.rule_name]
        violation_count = sum(
            1
            for name, (lower, upper) in limits.items()
            if not check_property_range(record.mol, name, lower, upper)
        )
        if violation_count > self.max_violations:
            return None

        row: dict[str, Any] = {}
        if self.include_smiles:
            row["smiles"] = record.smiles
        if self.include_name and record.name:
            row["name"] = record.name

        # Metadata never overrides the smiles/name set above.
        for field_name, field_value in record.metadata.items():
            row.setdefault(field_name, field_value)

        return row
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
class PAINSFilter:
    """Filter molecules for PAINS (Pan-Assay Interference Compounds)."""

    def __init__(
        self,
        exclude: bool = True,
        include_smiles: bool = True,
        include_name: bool = True,
    ):
        """
        Store options and build the PAINS catalog.

        Args:
            exclude: If True, PAINS hits are rejected; if False, only
                PAINS hits are kept
            include_smiles: Include SMILES in output
            include_name: Include molecule name in output
        """
        self.exclude = exclude
        self.include_smiles = include_smiles
        self.include_name = include_name

        # The catalog is built once here and reused for every record.
        catalog_params = FilterCatalog.FilterCatalogParams()
        catalog_params.AddCatalog(FilterCatalog.FilterCatalogParams.FilterCatalogs.PAINS)
        self.catalog = FilterCatalog.FilterCatalog(catalog_params)

    def filter(self, record: MoleculeRecord) -> Optional[dict[str, Any]]:
        """Filter a molecule record (returns None if PAINS hit and exclude=True)."""
        if record.mol is None:
            return None

        # Any catalog entry at all counts as a PAINS hit.
        is_hit = self.catalog.GetFirstMatch(record.mol) is not None

        # exclude=True drops hits; exclude=False drops non-hits.
        if self.exclude:
            if is_hit:
                return None
        elif not is_hit:
            return None

        row: dict[str, Any] = {}
        if self.include_smiles:
            row["smiles"] = record.smiles
        if self.include_name and record.name:
            row["name"] = record.name

        # Metadata never overrides the smiles/name set above.
        for field_name, field_value in record.metadata.items():
            row.setdefault(field_name, field_value)

        return row
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
class ElementFilter:
    """Filter molecules by allowed/required/forbidden elements."""

    def __init__(
        self,
        allowed_elements: Optional[list[str]] = None,
        required_elements: Optional[list[str]] = None,
        forbidden_elements: Optional[list[str]] = None,
        include_smiles: bool = True,
        include_name: bool = True,
    ):
        """
        Store the element constraints as sets.

        An empty or missing list disables the corresponding constraint.

        Args:
            allowed_elements: Only these elements are allowed
            required_elements: Molecule must contain all of these
            forbidden_elements: Molecule must not contain any of these
            include_smiles: Include SMILES in output
            include_name: Include molecule name in output
        """
        self.allowed = set(allowed_elements) if allowed_elements else None
        self.required = set(required_elements) if required_elements else None
        self.forbidden = set(forbidden_elements) if forbidden_elements else None
        self.include_smiles = include_smiles
        self.include_name = include_name

    def filter(self, record: MoleculeRecord) -> Optional[dict[str, Any]]:
        """
        Check the element symbols present in one record.

        Returns:
            Output dictionary when every active constraint holds, None
            otherwise or when the record has no molecule.
        """
        if record.mol is None:
            return None

        present = {atom.GetSymbol() for atom in record.mol.GetAtoms()}

        # Nothing outside the allowed set may appear.
        if self.allowed is not None and not present <= self.allowed:
            return None

        # Every required element must appear.
        if self.required is not None and not self.required <= present:
            return None

        # No forbidden element may appear.
        if self.forbidden is not None and not present.isdisjoint(self.forbidden):
            return None

        row: dict[str, Any] = {}
        if self.include_smiles:
            row["smiles"] = record.smiles
        if self.include_name and record.name:
            row["name"] = record.name

        # Metadata never overrides the smiles/name set above.
        for field_name, field_value in record.metadata.items():
            row.setdefault(field_name, field_value)

        return row
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
class ComplexityFilter:
    """Filter molecules by size, ring count and rotatable-bond count."""

    def __init__(
        self,
        min_atoms: int = 1,
        max_atoms: int = 100,
        min_rings: int = 0,
        max_rings: int = 10,
        min_rotatable: int = 0,
        max_rotatable: int = 20,
        include_smiles: bool = True,
        include_name: bool = True,
    ):
        """
        Store the inclusive bounds for each complexity measure.

        Args:
            min_atoms: Minimum heavy atom count
            max_atoms: Maximum heavy atom count
            min_rings: Minimum ring count
            max_rings: Maximum ring count
            min_rotatable: Minimum rotatable bonds
            max_rotatable: Maximum rotatable bonds
            include_smiles: Include SMILES in output
            include_name: Include molecule name in output
        """
        self.min_atoms, self.max_atoms = min_atoms, max_atoms
        self.min_rings, self.max_rings = min_rings, max_rings
        self.min_rotatable, self.max_rotatable = min_rotatable, max_rotatable
        self.include_smiles = include_smiles
        self.include_name = include_name

    def filter(self, record: MoleculeRecord) -> Optional[dict[str, Any]]:
        """
        Check one record against the three complexity bounds.

        The checks run in order (heavy atoms, rings, rotatable bonds) and
        stop at the first one that fails.

        Returns:
            Output dictionary when all bounds hold, None otherwise or when
            the record has no molecule.
        """
        molecule = record.mol
        if molecule is None:
            return None

        n_heavy = molecule.GetNumHeavyAtoms()
        if n_heavy < self.min_atoms or self.max_atoms < n_heavy:
            return None

        n_rings = Descriptors.RingCount(molecule)
        if n_rings < self.min_rings or self.max_rings < n_rings:
            return None

        n_rotatable = Descriptors.NumRotatableBonds(molecule)
        if n_rotatable < self.min_rotatable or self.max_rotatable < n_rotatable:
            return None

        row: dict[str, Any] = {}
        if self.include_smiles:
            row["smiles"] = record.smiles
        if self.include_name and record.name:
            row["name"] = record.name

        # Metadata never overrides the smiles/name set above.
        for field_name, field_value in record.metadata.items():
            row.setdefault(field_name, field_value)

        return row
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
"""Molecular fingerprint computation engine."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Optional, Any
|
|
6
|
+
|
|
7
|
+
from rdkit import Chem, DataStructs
|
|
8
|
+
from rdkit.Chem import AllChem, MACCSkeys, rdMolDescriptors
|
|
9
|
+
|
|
10
|
+
from rdkit_cli.io.readers import MoleculeRecord
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class FingerprintType(Enum):
    """Fingerprint kinds understood by this module.

    Each member's value is its lowercase string name.
    """

    MORGAN = "morgan"
    MACCS = "maccs"
    RDKIT = "rdkit"
    ATOMPAIR = "atompair"
    TORSION = "torsion"
    PATTERN = "pattern"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class FingerprintInfo:
    """Descriptive metadata for one fingerprint type."""

    # Short identifier of the fingerprint type.
    name: str
    # One-line human-readable description.
    description: str
    # Bit-vector length used by default.
    default_bits: int
    # Whether the fingerprint takes a radius parameter.
    has_radius: bool
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# One row per fingerprint type: (type, description, default bits, has radius).
_FINGERPRINT_TABLE = (
    (FingerprintType.MORGAN, "Morgan/ECFP circular fingerprints", 2048, True),
    (FingerprintType.MACCS, "MACCS structural keys (166 bits)", 167, False),
    (FingerprintType.RDKIT, "RDKit/Daylight-like path-based fingerprints", 2048, False),
    (FingerprintType.ATOMPAIR, "Atom pair fingerprints", 2048, False),
    (FingerprintType.TORSION, "Topological torsion fingerprints", 2048, False),
    (FingerprintType.PATTERN, "SMARTS pattern fingerprints (for screening)", 2048, False),
)

# Metadata for every supported fingerprint type, in declaration order.
# Each entry's name is the enum member's string value.
FINGERPRINT_INFO: dict[FingerprintType, FingerprintInfo] = {
    kind: FingerprintInfo(
        name=kind.value,
        description=summary,
        default_bits=bits,
        has_radius=radius_used,
    )
    for kind, summary, bits, radius_used in _FINGERPRINT_TABLE
}
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def list_fingerprints() -> list[FingerprintInfo]:
    """Return a new list with the metadata of every registered fingerprint type."""
    return [*FINGERPRINT_INFO.values()]
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def compute_fingerprint(
    mol: Chem.Mol,
    fp_type: FingerprintType,
    n_bits: int = 2048,
    radius: int = 2,
    use_counts: bool = False,
) -> Optional[DataStructs.ExplicitBitVect]:
    """
    Compute fingerprint for a molecule.

    Args:
        mol: RDKit molecule
        fp_type: Type of fingerprint
        n_bits: Number of bits (not passed to the MACCS generator)
        radius: Radius for Morgan fingerprints
        use_counts: Use count fingerprints (Morgan only). In that case the
            result comes from GetHashedMorganFingerprint rather than a
            bit-vector generator.

    Returns:
        Fingerprint or None if the underlying RDKit call fails

    Raises:
        ValueError: If fp_type is not a supported fingerprint type.
    """
    try:
        if fp_type == FingerprintType.MORGAN:
            if use_counts:
                return rdMolDescriptors.GetHashedMorganFingerprint(
                    mol, radius, nBits=n_bits
                )
            return rdMolDescriptors.GetMorganFingerprintAsBitVect(
                mol, radius, nBits=n_bits
            )

        if fp_type == FingerprintType.MACCS:
            return MACCSkeys.GenMACCSKeys(mol)

        if fp_type == FingerprintType.RDKIT:
            return Chem.RDKFingerprint(mol, fpSize=n_bits)

        if fp_type == FingerprintType.ATOMPAIR:
            return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(
                mol, nBits=n_bits
            )

        if fp_type == FingerprintType.TORSION:
            return rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(
                mol, nBits=n_bits
            )

        if fp_type == FingerprintType.PATTERN:
            return Chem.PatternFingerprint(mol, fpSize=n_bits)

    except Exception:
        # Best effort: a molecule RDKit cannot fingerprint yields None
        # instead of aborting the caller.
        return None

    # Raised outside the try block so the handler above cannot swallow it:
    # an unsupported type is a caller error, not a per-molecule failure.
    raise ValueError(f"Unknown fingerprint type: {fp_type}")
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def fingerprint_to_hex(fp) -> str:
    """
    Convert fingerprint to a text encoding.

    Count fingerprints (objects exposing GetNonzeroElements) are reduced
    to presence bits and returned as a fixed-width hex string, one hex
    digit per four bits, with position 0 as the most significant bit.
    Plain bit vectors are returned as produced by their ToBase64() method
    (Base64 text, despite this function's name).

    Args:
        fp: Fingerprint object, or None

    Returns:
        Encoded fingerprint, or "" when fp is None or has zero length
    """
    if fp is None:
        return ""

    if hasattr(fp, "GetNonzeroElements"):
        if hasattr(fp, "ToBitString"):
            bit_string = fp.ToBitString()
        else:
            # Count vectors have no ToBitString(); rebuild the bit string
            # by marking every position that has a non-zero count.
            nonzero = fp.GetNonzeroElements()
            bit_string = "".join(
                "1" if position in nonzero else "0"
                for position in range(fp.GetLength())
            )

        if not bit_string:
            return ""

        # Zero-pad so leading zero bits are not lost in the hex text.
        width = (len(bit_string) + 3) // 4
        return f"{int(bit_string, 2):0{width}x}"

    # Bit vector
    return fp.ToBase64()
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def fingerprint_to_bitstring(fp) -> str:
    """
    Convert fingerprint to a string of "0"/"1" characters.

    Objects with a ToBitString() method are converted with it. Count
    fingerprints, which lack that method, are reduced to presence bits:
    a position is "1" when its count is non-zero.

    Args:
        fp: Fingerprint object, or None

    Returns:
        Bit string, or "" when fp is None
    """
    if fp is None:
        return ""

    to_bit_string = getattr(fp, "ToBitString", None)
    if to_bit_string is not None:
        return to_bit_string()

    # Count vector fallback: rebuild the string from the non-zero entries.
    nonzero = fp.GetNonzeroElements()
    return "".join(
        "1" if position in nonzero else "0"
        for position in range(fp.GetLength())
    )
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def fingerprint_to_numpy(fp):
    """
    Copy a fingerprint into a 1-D int8 numpy array.

    Returns:
        Array of length len(fp) filled by DataStructs.ConvertToNumpyArray,
        or None when fp is None
    """
    # numpy is imported lazily, and before the None check.
    import numpy as np

    if fp is None:
        return None

    target = np.zeros(len(fp), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, target)
    return target
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
class FingerprintCalculator:
    """Compute fingerprints for molecule records and format them as output rows."""

    def __init__(
        self,
        fp_type: FingerprintType = FingerprintType.MORGAN,
        n_bits: int = 2048,
        radius: int = 2,
        use_counts: bool = False,
        output_format: str = "hex",
        include_smiles: bool = True,
        include_name: bool = True,
    ):
        """
        Store the fingerprint settings and output options.

        Args:
            fp_type: Type of fingerprint
            n_bits: Number of bits (forced to 167 for MACCS)
            radius: Radius for Morgan fingerprints
            use_counts: Use count fingerprints
            output_format: Output format (hex, bitstring, bits)
            include_smiles: Include SMILES in output
            include_name: Include molecule name in output
        """
        self.fp_type = fp_type
        # MACCS has a fixed size, so the requested n_bits is ignored for it.
        self.n_bits = 167 if fp_type == FingerprintType.MACCS else n_bits
        self.radius = radius
        self.use_counts = use_counts
        self.output_format = output_format
        self.include_smiles = include_smiles
        self.include_name = include_name

    def compute(self, record: MoleculeRecord) -> Optional[dict[str, Any]]:
        """
        Compute and format the fingerprint of one record.

        Args:
            record: MoleculeRecord to process

        Returns:
            Output dictionary, or None when the record has no molecule or
            the fingerprint could not be computed
        """
        if record.mol is None:
            return None

        fingerprint = compute_fingerprint(
            record.mol,
            self.fp_type,
            n_bits=self.n_bits,
            radius=self.radius,
            use_counts=self.use_counts,
        )
        if fingerprint is None:
            return None

        row: dict[str, Any] = {}
        if self.include_smiles:
            row["smiles"] = record.smiles
        if self.include_name and record.name:
            row["name"] = record.name

        style = self.output_format
        if style == "bitstring":
            row["fingerprint"] = fingerprint_to_bitstring(fingerprint)
        elif style == "bits":
            # One "bit_<i>" column per position.
            for index, char in enumerate(fingerprint_to_bitstring(fingerprint)):
                row[f"bit_{index}"] = int(char)
        else:
            # "hex" and any unrecognised format both use the hex encoder.
            row["fingerprint"] = fingerprint_to_hex(fingerprint)

        return row

    def get_column_names(self) -> list[str]:
        """Return the output column names in order.

        "name" is listed whenever include_name is set, while compute()
        only emits it for records that have a non-empty name.
        """
        leading = (
            (self.include_smiles, "smiles"),
            (self.include_name, "name"),
        )
        columns = [label for enabled, label in leading if enabled]

        if self.output_format == "bits":
            columns += [f"bit_{index}" for index in range(self.n_bits)]
        else:
            columns.append("fingerprint")

        return columns
|