rdkit-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdkit_cli/__init__.py +4 -0
- rdkit_cli/__main__.py +6 -0
- rdkit_cli/cli.py +162 -0
- rdkit_cli/commands/__init__.py +1 -0
- rdkit_cli/commands/conformers.py +220 -0
- rdkit_cli/commands/convert.py +162 -0
- rdkit_cli/commands/depict.py +311 -0
- rdkit_cli/commands/descriptors.py +251 -0
- rdkit_cli/commands/diversity.py +232 -0
- rdkit_cli/commands/enumerate.py +229 -0
- rdkit_cli/commands/filter.py +384 -0
- rdkit_cli/commands/fingerprints.py +179 -0
- rdkit_cli/commands/fragment.py +284 -0
- rdkit_cli/commands/mcs.py +162 -0
- rdkit_cli/commands/reactions.py +191 -0
- rdkit_cli/commands/scaffold.py +243 -0
- rdkit_cli/commands/similarity.py +359 -0
- rdkit_cli/commands/standardize.py +138 -0
- rdkit_cli/core/__init__.py +1 -0
- rdkit_cli/core/conformers.py +197 -0
- rdkit_cli/core/depict.py +241 -0
- rdkit_cli/core/descriptors.py +248 -0
- rdkit_cli/core/diversity.py +174 -0
- rdkit_cli/core/enumerate.py +190 -0
- rdkit_cli/core/filters.py +443 -0
- rdkit_cli/core/fingerprints.py +265 -0
- rdkit_cli/core/fragment.py +237 -0
- rdkit_cli/core/mcs.py +128 -0
- rdkit_cli/core/reactions.py +159 -0
- rdkit_cli/core/scaffold.py +174 -0
- rdkit_cli/core/similarity.py +206 -0
- rdkit_cli/core/standardizer.py +141 -0
- rdkit_cli/io/__init__.py +7 -0
- rdkit_cli/io/formats.py +109 -0
- rdkit_cli/io/readers.py +352 -0
- rdkit_cli/io/writers.py +275 -0
- rdkit_cli/parallel/__init__.py +5 -0
- rdkit_cli/parallel/batch.py +181 -0
- rdkit_cli/parallel/executor.py +180 -0
- rdkit_cli/progress/__init__.py +5 -0
- rdkit_cli/progress/ninja.py +195 -0
- rdkit_cli/utils/__init__.py +1 -0
- rdkit_cli-0.1.0.dist-info/METADATA +380 -0
- rdkit_cli-0.1.0.dist-info/RECORD +47 -0
- rdkit_cli-0.1.0.dist-info/WHEEL +4 -0
- rdkit_cli-0.1.0.dist-info/entry_points.txt +2 -0
- rdkit_cli-0.1.0.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
"""Molecular descriptor computation engine."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Optional, Any
|
|
5
|
+
|
|
6
|
+
from rdkit import Chem
|
|
7
|
+
from rdkit.Chem import Descriptors, rdMolDescriptors, QED
|
|
8
|
+
|
|
9
|
+
from rdkit_cli.io.readers import MoleculeRecord
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Labels of the descriptor categories known to this module.
DESCRIPTOR_CATEGORIES = ["constitutional", "topological", "electronic", "geometric", "molecular"]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class DescriptorInfo:
    """Metadata record for a single descriptor: its name, description and category."""

    name: str  # descriptor name
    description: str  # descriptive text for the descriptor
    category: str  # category label the descriptor is filed under
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Build descriptor registry from RDKit
|
|
32
|
+
def _build_descriptor_registry() -> dict[str, tuple[callable, str, str]]:
    """Collect every entry of RDKit's ``Descriptors.descList`` into a lookup table.

    Each descriptor is assigned a category by case-insensitive substring
    matching on its name. Rules are tried in order and the first match wins;
    a name that matches no rule is filed under "molecular".

    Returns:
        Mapping of descriptor name to ``(function, description, category)``.
    """
    # Ordered (keywords, category) rules; earlier rules take precedence.
    rules = (
        (("chi", "kappa", "hall", "balaban", "bertz"), "topological"),
        (("tpsa", "labute", "peoe", "gasteiger"), "electronic"),
        (("num", "count", "heavy", "ring", "rotatable"), "constitutional"),
        (("mol", "exact", "weight", "logp", "mr"), "molecular"),
    )

    table = {}
    for desc_name, desc_func in Descriptors.descList:
        lowered = desc_name.lower()
        label = next(
            (cat for keywords, cat in rules if any(key in lowered for key in keywords)),
            "molecular",
        )
        table[desc_name] = (desc_func, f"RDKit descriptor: {desc_name}", label)

    return table
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Module-level registry, built once when the module is imported.
DESCRIPTOR_REGISTRY = _build_descriptor_registry()

# QED is not part of Descriptors.descList, so it is registered by hand.
DESCRIPTOR_REGISTRY.update(
    {"QED": (QED.qed, "Quantitative Estimate of Drug-likeness", "molecular")}
)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def compute_lipinski_violations(mol: Chem.Mol) -> int:
    """
    Tally how many Lipinski Rule of 5 criteria a molecule breaks.

    A criterion is broken when the measured value is strictly greater than
    its limit: MolWt > 500, MolLogP > 5, NumHDonors > 5, NumHAcceptors > 10.

    Args:
        mol: RDKit molecule

    Returns:
        Number of violations (0-4)
    """
    # (descriptor function, upper limit) pairs, evaluated in this order.
    criteria = (
        (Descriptors.MolWt, 500),
        (Descriptors.MolLogP, 5),
        (Descriptors.NumHDonors, 5),
        (Descriptors.NumHAcceptors, 10),
    )
    return sum(1 for measure, limit in criteria if measure(mol) > limit)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def list_descriptors(
    category: Optional[str] = None,
    verbose: bool = False,
) -> list[DescriptorInfo]:
    """
    Return the registered descriptors, sorted by name.

    Args:
        category: When given, keep only descriptors whose category equals it
        verbose: Accepted but not read by this function; descriptions are
            always filled in

    Returns:
        List of DescriptorInfo objects
    """
    return [
        DescriptorInfo(name=key, description=entry[1], category=entry[2])
        for key, entry in sorted(DESCRIPTOR_REGISTRY.items())
        if category is None or entry[2] == category
    ]
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def compute_descriptor(mol: Chem.Mol, name: str) -> Optional[float]:
    """
    Compute a single descriptor for a molecule.

    Args:
        mol: RDKit molecule
        name: Descriptor name (a key of DESCRIPTOR_REGISTRY)

    Returns:
        The descriptor value as a float, or None when the descriptor function
        raises, returns None, or yields a non-finite number (NaN or +/-inf)

    Raises:
        ValueError: If name is not a registered descriptor
    """
    import math  # function-scope import keeps this block self-contained

    if name not in DESCRIPTOR_REGISTRY:
        raise ValueError(f"Unknown descriptor: {name}")

    func = DESCRIPTOR_REGISTRY[name][0]

    try:
        value = func(mol)
        if value is None:
            return None
        # Convert before the finiteness check so NaN/inf is caught for every
        # numeric type, not only for instances of the built-in float.
        number = float(value)
    except Exception:
        # Best effort: a failing descriptor is reported as "no value".
        return None

    return number if math.isfinite(number) else None
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class DescriptorCalculator:
    """Calculator that evaluates a configured set of descriptors per molecule record."""

    def __init__(
        self,
        descriptors: Optional[list[str]] = None,
        include_smiles: bool = True,
        include_name: bool = True,
        precision: int = 4,
        error_value: str = "NaN",
    ):
        """
        Initialize descriptor calculator.

        Args:
            descriptors: List of descriptor names (None for all)
            include_smiles: Include SMILES in output
            include_name: Include molecule name in output
            precision: Decimal precision for float values
            error_value: Value to use for failed calculations

        Raises:
            ValueError: If any requested name is not in DESCRIPTOR_REGISTRY
        """
        if descriptors is None:
            self.descriptors = list(DESCRIPTOR_REGISTRY)
        else:
            # Sorted so the error message is deterministic across runs.
            unknown = sorted(set(descriptors).difference(DESCRIPTOR_REGISTRY))
            if unknown:
                raise ValueError(f"Unknown descriptors: {', '.join(unknown)}")
            # Copy so later mutation of the caller's list cannot bypass
            # the validation performed above.
            self.descriptors = list(descriptors)

        self.include_smiles = include_smiles
        self.include_name = include_name
        self.precision = precision
        self.error_value = error_value

    def _format_value(self, value: Optional[float]) -> Any:
        """Format a descriptor value with precision and error handling.

        None becomes ``error_value``, floats are rounded to ``precision``
        decimals, and any other value is passed through untouched.
        """
        if value is None:
            return self.error_value
        if isinstance(value, float):
            return round(value, self.precision)
        return value

    def compute(self, record: MoleculeRecord) -> Optional[dict[str, Any]]:
        """
        Compute descriptors for a molecule record.

        Args:
            record: MoleculeRecord to process

        Returns:
            Dictionary with descriptor values or None if molecule is invalid
        """
        if record.mol is None:
            return None

        result: dict[str, Any] = {}

        if self.include_smiles:
            result["smiles"] = record.smiles
        # NOTE: the "name" key is only set for records with a truthy name,
        # while get_column_names() lists "name" whenever include_name is set;
        # consumers of the row must tolerate the missing key.
        if self.include_name and record.name:
            result["name"] = record.name

        for desc_name in self.descriptors:
            value = compute_descriptor(record.mol, desc_name)
            result[desc_name] = self._format_value(value)

        return result

    def get_column_names(self) -> list[str]:
        """Get output column names in order: smiles, name, then the descriptors."""
        cols = []
        if self.include_smiles:
            cols.append("smiles")
        if self.include_name:
            cols.append("name")
        cols.extend(self.descriptors)
        return cols
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# Ready-made descriptor name lists for common use cases.
COMMON_DESCRIPTORS = [
    "MolWt", "ExactMolWt", "HeavyAtomCount",
    "NumHAcceptors", "NumHDonors", "NumRotatableBonds",
    "NumHeteroatoms", "NumAromaticRings", "RingCount",
    "TPSA", "MolLogP", "MolMR", "FractionCSP3",
]

# The four properties checked by the Lipinski violation counter.
LIPINSKI_DESCRIPTORS = ["MolWt", "MolLogP", "NumHDonors", "NumHAcceptors"]

DRUGLIKE_DESCRIPTORS = [
    "MolWt", "MolLogP", "NumHDonors", "NumHAcceptors",
    "TPSA", "NumRotatableBonds", "RingCount", "HeavyAtomCount",
]
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Molecular diversity analysis engine."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional, Any
|
|
4
|
+
|
|
5
|
+
from rdkit import Chem, DataStructs
|
|
6
|
+
from rdkit.Chem import rdMolDescriptors
|
|
7
|
+
from rdkit.SimDivFilters import rdSimDivPickers
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_morgan_fingerprint(mol: Chem.Mol, radius: int = 2, n_bits: int = 2048):
    """Build the Morgan bit-vector fingerprint of a molecule.

    Args:
        mol: RDKit molecule
        radius: Morgan fingerprint radius
        n_bits: Length of the bit vector

    Returns:
        The result of ``rdMolDescriptors.GetMorganFingerprintAsBitVect``.
    """
    bit_vector = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return bit_vector
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class DiversityPicker:
    """Select a diverse subset of molecules with RDKit's MaxMin or Leader picker."""

    def __init__(
        self,
        n_picks: int = 100,
        seed: Optional[int] = None,
        radius: int = 2,
        n_bits: int = 2048,
        method: str = "maxmin",
        leader_threshold: float = 0.65,
    ):
        """
        Initialize diversity picker.

        Args:
            n_picks: Number of molecules to pick
            seed: Random seed for reproducibility (used by 'maxmin' only)
            radius: Morgan fingerprint radius
            n_bits: Fingerprint bit size
            method: Picking method ('maxmin'; any other value selects 'leader')
            leader_threshold: Distance threshold handed to the Leader picker;
                ignored by 'maxmin'. NOTE(review): the 0.65 default is a
                starting point, tune it for the data set at hand.
        """
        self.n_picks = n_picks
        self.seed = seed
        self.radius = radius
        self.n_bits = n_bits
        self.method = method
        self.leader_threshold = leader_threshold

    def pick(
        self,
        mols: list[Chem.Mol],
        first_picks: Optional[list[int]] = None,
    ) -> list[int]:
        """
        Pick diverse subset of molecules.

        Args:
            mols: List of molecules (None entries are skipped)
            first_picks: Indices of molecules that must be included; indices
                that point at None or outside ``mols`` are ignored

        Returns:
            List of selected indices, expressed as positions in ``mols``
        """
        # Drop None molecules but remember where each survivor came from.
        valid_mols = []
        valid_indices = []
        for i, mol in enumerate(mols):
            if mol is not None:
                valid_mols.append(mol)
                valid_indices.append(i)

        if not valid_mols:
            return []

        fps = [get_morgan_fingerprint(mol, self.radius, self.n_bits) for mol in valid_mols]

        # Never ask for more picks than there are fingerprints.
        n_to_pick = min(self.n_picks, len(fps))

        # original index -> position in fps, for O(1) translation of first_picks
        position_of = {orig: pos for pos, orig in enumerate(valid_indices)}
        mapped_first = [position_of[i] for i in (first_picks or ()) if i in position_of]

        options: dict = {}
        if mapped_first:
            options["firstPicks"] = mapped_first

        if self.method == "maxmin":
            # The seed is forwarded whether or not first picks were supplied.
            if self.seed is not None:
                options["seed"] = self.seed
            picker = rdSimDivPickers.MaxMinPicker()
            picks = picker.LazyBitVectorPick(fps, len(fps), n_to_pick, **options)
        else:
            # LeaderPicker.LazyBitVectorPick takes the distance threshold as
            # its third argument and has no seed parameter; the pick count is
            # passed through the pickSize keyword.
            options["pickSize"] = n_to_pick
            picker = rdSimDivPickers.LeaderPicker()
            picks = picker.LazyBitVectorPick(fps, len(fps), self.leader_threshold, **options)

        # Translate fingerprint positions back to indices into ``mols``.
        return [valid_indices[i] for i in picks]
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class DiversityAnalyzer:
    """Analyze diversity of a molecule set via pairwise Tanimoto similarity."""

    def __init__(
        self,
        radius: int = 2,
        n_bits: int = 2048,
        sample_size: int = 1000,
        seed: Optional[int] = None,
    ):
        """
        Initialize diversity analyzer.

        Args:
            radius: Morgan fingerprint radius
            n_bits: Fingerprint bit size
            sample_size: Max molecules to sample for analysis
            seed: Optional seed making the sub-sampling reproducible; None
                samples through the module-level ``random`` state
        """
        self.radius = radius
        self.n_bits = n_bits
        self.sample_size = sample_size
        self.seed = seed

    def analyze(self, mols: list[Chem.Mol]) -> dict[str, Any]:
        """
        Analyze diversity of molecule set.

        Args:
            mols: List of molecules (None entries are ignored)

        Returns:
            Dictionary with diversity statistics, or a dictionary holding a
            single "error" message when no statistics can be computed
        """
        import random
        import statistics

        valid_mols = [mol for mol in mols if mol is not None]

        if len(valid_mols) < 2:
            return {"error": "Need at least 2 valid molecules"}

        # Cap the work: the pair count grows quadratically with the set size.
        if len(valid_mols) > self.sample_size:
            sampler = random if self.seed is None else random.Random(self.seed)
            valid_mols = sampler.sample(valid_mols, self.sample_size)

        fps = [get_morgan_fingerprint(mol, self.radius, self.n_bits) for mol in valid_mols]

        # Upper triangle of the similarity matrix, one bulk call per row.
        similarities = []
        for i in range(len(fps) - 1):
            similarities.extend(DataStructs.BulkTanimotoSimilarity(fps[i], fps[i + 1:]))

        # Reachable when sample_size is below 2 and leaves fewer than 2 molecules.
        if not similarities:
            return {"error": "Could not compute similarities"}

        mean_sim = statistics.mean(similarities)
        median_sim = statistics.median(similarities)
        min_sim = min(similarities)
        max_sim = max(similarities)
        # stdev needs at least two data points.
        stdev_sim = statistics.stdev(similarities) if len(similarities) > 1 else 0

        return {
            "n_molecules": len(valid_mols),
            "n_pairs": len(similarities),
            "mean_similarity": round(mean_sim, 4),
            "median_similarity": round(median_sim, 4),
            "min_similarity": round(min_sim, 4),
            "max_similarity": round(max_sim, 4),
            "stdev_similarity": round(stdev_sim, 4),
            "diversity_score": round(1 - mean_sim, 4),
        }
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Molecular enumeration engine."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional, Any
|
|
4
|
+
|
|
5
|
+
from rdkit import Chem
|
|
6
|
+
from rdkit.Chem import AllChem, EnumerateStereoisomers
|
|
7
|
+
from rdkit.Chem.MolStandardize import rdMolStandardize
|
|
8
|
+
|
|
9
|
+
from rdkit_cli.io.readers import MoleculeRecord
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class StereoisomerEnumerator:
    """Enumerate stereoisomers of molecules."""

    def __init__(
        self,
        max_isomers: int = 32,
        include_given: bool = True,
        only_unassigned: bool = True,
        include_smiles: bool = True,
        include_name: bool = True,
    ):
        """
        Initialize stereoisomer enumerator.

        Args:
            max_isomers: Maximum stereoisomers to generate
            include_given: Include the input stereoisomer (stored, but not
                consulted by enumerate())
            only_unassigned: Only enumerate unassigned stereocenters
            include_smiles: Include original SMILES in output
            include_name: When true, the record name is suffixed with
                ``_iso<idx>``; when false, the name is emitted unchanged
        """
        self.max_isomers = max_isomers
        self.include_given = include_given
        self.only_unassigned = only_unassigned
        self.include_smiles = include_smiles
        self.include_name = include_name

    def enumerate(self, record: MoleculeRecord) -> list[dict[str, Any]]:
        """
        Enumerate stereoisomers.

        Args:
            record: MoleculeRecord to process

        Returns:
            List of dictionaries with stereoisomer SMILES; empty when the
            record has no molecule or the enumeration raises
        """
        if record.mol is None:
            return []

        try:
            opts = EnumerateStereoisomers.StereoEnumerationOptions()
            opts.maxIsomers = self.max_isomers
            opts.onlyUnassigned = self.only_unassigned

            isomers = list(EnumerateStereoisomers.EnumerateStereoisomers(record.mol, opts))

            results = []
            for i, iso in enumerate(isomers[:self.max_isomers]):
                result: dict[str, Any] = {
                    "smiles": Chem.MolToSmiles(iso, isomericSmiles=True),
                }

                if record.name:
                    result["name"] = f"{record.name}_iso{i}" if self.include_name else record.name

                result["stereoisomer_idx"] = i
                # Honour the documented include_smiles switch.
                if self.include_smiles:
                    result["original_smiles"] = record.smiles

                results.append(result)

            return results

        except Exception:
            # Best effort: a molecule that cannot be enumerated yields no rows.
            return []
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class TautomerEnumerator:
    """Enumerate tautomers of molecules."""

    def __init__(
        self,
        max_tautomers: int = 50,
        max_transforms: int = 1000,
        include_smiles: bool = True,
        include_name: bool = True,
    ):
        """
        Initialize tautomer enumerator.

        Args:
            max_tautomers: Maximum tautomers to generate
            max_transforms: Maximum transforms to apply
            include_smiles: Include original SMILES in output
            include_name: When true, the record name is suffixed with
                ``_taut<idx>``; when false, the name is emitted unchanged
        """
        self.max_tautomers = max_tautomers
        self.max_transforms = max_transforms
        self.include_smiles = include_smiles
        self.include_name = include_name

        # One configured RDKit enumerator, reused for every record.
        self._enumerator = rdMolStandardize.TautomerEnumerator()
        self._enumerator.SetMaxTautomers(max_tautomers)
        self._enumerator.SetMaxTransforms(max_transforms)

    def enumerate(self, record: MoleculeRecord) -> list[dict[str, Any]]:
        """
        Enumerate tautomers.

        Args:
            record: MoleculeRecord to process

        Returns:
            List of dictionaries with tautomer SMILES; empty when the record
            has no molecule or the enumeration raises
        """
        if record.mol is None:
            return []

        try:
            tautomers = list(self._enumerator.Enumerate(record.mol))

            results = []
            for i, taut in enumerate(tautomers[:self.max_tautomers]):
                result: dict[str, Any] = {
                    "smiles": Chem.MolToSmiles(taut, isomericSmiles=True),
                }

                if record.name:
                    result["name"] = f"{record.name}_taut{i}" if self.include_name else record.name

                result["tautomer_idx"] = i
                # Honour the documented include_smiles switch.
                if self.include_smiles:
                    result["original_smiles"] = record.smiles

                results.append(result)

            return results

        except Exception:
            # Best effort: a molecule that cannot be enumerated yields no rows.
            return []
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class CanonicalTautomerizer:
    """Reduce molecules to their canonical tautomer."""

    def __init__(
        self,
        include_original: bool = False,
    ):
        """
        Set up the tautomerizer.

        Args:
            include_original: Add the record's own SMILES to each result
        """
        self._canonicalizer = rdMolStandardize.TautomerEnumerator()
        self.include_original = include_original

    def canonicalize(self, record: MoleculeRecord) -> Optional[dict[str, Any]]:
        """
        Produce the canonical tautomer of a record's molecule.

        Args:
            record: MoleculeRecord to process

        Returns:
            Dictionary with the canonical tautomer SMILES (plus the name when
            the record has one, and the original SMILES when requested), or
            None when the record has no molecule or canonicalization raises
        """
        if record.mol is None:
            return None

        try:
            canonical_mol = self._canonicalizer.Canonicalize(record.mol)
            row: dict[str, Any] = {
                "smiles": Chem.MolToSmiles(canonical_mol, isomericSmiles=True),
            }
            if record.name:
                row["name"] = record.name
            if self.include_original:
                row["original_smiles"] = record.smiles
        except Exception:
            return None

        return row
|