rdkit-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdkit_cli/__init__.py +4 -0
- rdkit_cli/__main__.py +6 -0
- rdkit_cli/cli.py +162 -0
- rdkit_cli/commands/__init__.py +1 -0
- rdkit_cli/commands/conformers.py +220 -0
- rdkit_cli/commands/convert.py +162 -0
- rdkit_cli/commands/depict.py +311 -0
- rdkit_cli/commands/descriptors.py +251 -0
- rdkit_cli/commands/diversity.py +232 -0
- rdkit_cli/commands/enumerate.py +229 -0
- rdkit_cli/commands/filter.py +384 -0
- rdkit_cli/commands/fingerprints.py +179 -0
- rdkit_cli/commands/fragment.py +284 -0
- rdkit_cli/commands/mcs.py +162 -0
- rdkit_cli/commands/reactions.py +191 -0
- rdkit_cli/commands/scaffold.py +243 -0
- rdkit_cli/commands/similarity.py +359 -0
- rdkit_cli/commands/standardize.py +138 -0
- rdkit_cli/core/__init__.py +1 -0
- rdkit_cli/core/conformers.py +197 -0
- rdkit_cli/core/depict.py +241 -0
- rdkit_cli/core/descriptors.py +248 -0
- rdkit_cli/core/diversity.py +174 -0
- rdkit_cli/core/enumerate.py +190 -0
- rdkit_cli/core/filters.py +443 -0
- rdkit_cli/core/fingerprints.py +265 -0
- rdkit_cli/core/fragment.py +237 -0
- rdkit_cli/core/mcs.py +128 -0
- rdkit_cli/core/reactions.py +159 -0
- rdkit_cli/core/scaffold.py +174 -0
- rdkit_cli/core/similarity.py +206 -0
- rdkit_cli/core/standardizer.py +141 -0
- rdkit_cli/io/__init__.py +7 -0
- rdkit_cli/io/formats.py +109 -0
- rdkit_cli/io/readers.py +352 -0
- rdkit_cli/io/writers.py +275 -0
- rdkit_cli/parallel/__init__.py +5 -0
- rdkit_cli/parallel/batch.py +181 -0
- rdkit_cli/parallel/executor.py +180 -0
- rdkit_cli/progress/__init__.py +5 -0
- rdkit_cli/progress/ninja.py +195 -0
- rdkit_cli/utils/__init__.py +1 -0
- rdkit_cli-0.1.0.dist-info/METADATA +380 -0
- rdkit_cli-0.1.0.dist-info/RECORD +47 -0
- rdkit_cli-0.1.0.dist-info/WHEEL +4 -0
- rdkit_cli-0.1.0.dist-info/entry_points.txt +2 -0
- rdkit_cli-0.1.0.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
"""Molecular fragmentation engine."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional, Any
|
|
4
|
+
from collections import Counter
|
|
5
|
+
|
|
6
|
+
from rdkit import Chem
|
|
7
|
+
from rdkit.Chem import BRICS, Recap, AllChem, rdMolDescriptors
|
|
8
|
+
|
|
9
|
+
from rdkit_cli.io.readers import MoleculeRecord
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BRICSFragmenter:
    """Fragment molecules using the BRICS algorithm.

    Every fragment SMILES produced by ``BRICS.BRICSDecompose`` that passes
    the size filter becomes one output row.
    """

    def __init__(
        self,
        min_fragment_size: int = 1,
        include_smiles: bool = True,
        include_name: bool = True,
    ):
        """
        Initialize BRICS fragmenter.

        Args:
            min_fragment_size: Minimum fragment heavy atom count
            include_smiles: Include original SMILES in output
            include_name: Include molecule name in output
        """
        self.min_fragment_size = min_fragment_size
        self.include_smiles = include_smiles
        self.include_name = include_name

    def fragment(self, record: MoleculeRecord) -> list[dict[str, Any]]:
        """
        Fragment a molecule using BRICS.

        Fragments are processed in sorted SMILES order so that row order and
        ``fragment_idx`` are reproducible from run to run. ``fragment_idx``
        is the position in that sorted sequence, counted before fragments
        are dropped by the parse or size checks, so indices may have gaps.

        Args:
            record: MoleculeRecord to process

        Returns:
            List of dictionaries, one per retained fragment, with keys
            ``fragment_smiles``, optionally ``smiles`` and ``name``, then
            ``fragment_idx`` and ``heavy_atom_count``. Empty list if the
            record has no molecule or fragmentation raises.
        """
        if record.mol is None:
            return []

        try:
            # BRICSDecompose hands back its fragment SMILES as an unordered
            # collection; sorting pins the enumeration order.
            fragments = sorted(BRICS.BRICSDecompose(record.mol))

            results = []
            for i, frag_smi in enumerate(fragments):
                # Re-parse the fragment only to measure its size.
                frag_mol = Chem.MolFromSmiles(frag_smi)
                if frag_mol is None:
                    continue

                heavy_atoms = frag_mol.GetNumHeavyAtoms()
                if heavy_atoms < self.min_fragment_size:
                    continue

                # Key insertion order is kept as-is because it may drive
                # column order in downstream writers.
                result: dict[str, Any] = {"fragment_smiles": frag_smi}

                if self.include_smiles:
                    result["smiles"] = record.smiles
                if self.include_name and record.name:
                    result["name"] = record.name

                result["fragment_idx"] = i
                result["heavy_atom_count"] = heavy_atoms

                results.append(result)

            return results

        except Exception:
            # Best-effort: a molecule that cannot be fragmented yields no rows.
            return []
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class RECAPFragmenter:
    """Fragment molecules using the RECAP algorithm.

    The leaves of the RECAP decomposition tree are reported, one output row
    per leaf that passes the size filter.
    """

    def __init__(
        self,
        min_fragment_size: int = 1,
        include_smiles: bool = True,
        include_name: bool = True,
    ):
        """
        Initialize RECAP fragmenter.

        Args:
            min_fragment_size: Minimum fragment heavy atom count
            include_smiles: Include original SMILES in output
            include_name: Include molecule name in output
        """
        self.min_fragment_size = min_fragment_size
        self.include_smiles = include_smiles
        self.include_name = include_name

    def fragment(self, record: MoleculeRecord) -> list[dict[str, Any]]:
        """
        Fragment a molecule using RECAP.

        ``fragment_idx`` is the leaf's position in the leaves mapping,
        counted before filtering, so indices may have gaps.

        Args:
            record: MoleculeRecord to process

        Returns:
            List of dictionaries, one per retained leaf fragment. Empty list
            if the record has no molecule or decomposition raises.
        """
        if record.mol is None:
            return []

        try:
            leaf_nodes = Recap.RecapDecompose(record.mol).GetLeaves()

            rows = []
            for position, (leaf_smiles, leaf) in enumerate(leaf_nodes.items()):
                leaf_mol = leaf.mol
                if leaf_mol is None:
                    continue

                n_heavy = leaf_mol.GetNumHeavyAtoms()
                if n_heavy >= self.min_fragment_size:
                    rows.append(self._build_row(record, leaf_smiles, position, n_heavy))

            return rows

        except Exception:
            # Best-effort: failures produce no rows instead of propagating.
            return []

    def _build_row(
        self,
        record: MoleculeRecord,
        leaf_smiles: str,
        position: int,
        n_heavy: int,
    ) -> dict[str, Any]:
        """Assemble one output row, keeping the established key order."""
        row: dict[str, Any] = {"fragment_smiles": leaf_smiles}

        if self.include_smiles:
            row["smiles"] = record.smiles
        if self.include_name and record.name:
            row["name"] = record.name

        row["fragment_idx"] = position
        row["heavy_atom_count"] = n_heavy
        return row
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class FunctionalGroupExtractor:
    """Count occurrences of common functional groups in molecules.

    Each group is defined by a SMARTS query; the output holds the number of
    substructure matches per group under the key ``n_<group name>``.
    """

    # Group name -> SMARTS query. Insertion order defines the order of the
    # ``n_<name>`` keys in the output.
    _FG_SMARTS: dict[str, str] = {
        "alcohol": "[OX2H]",
        "aldehyde": "[CX3H1](=O)[#6]",
        "ketone": "[#6][CX3](=O)[#6]",
        "carboxylic_acid": "[CX3](=O)[OX2H1]",
        "ester": "[#6][CX3](=O)[OX2][#6]",
        "ether": "[OD2]([#6])[#6]",
        "amine_primary": "[NX3H2][#6]",
        "amine_secondary": "[NX3H1]([#6])[#6]",
        "amine_tertiary": "[NX3]([#6])([#6])[#6]",
        "amide": "[NX3][CX3](=[OX1])[#6]",
        "nitro": "[$([NX3](=O)=O),$([NX3+](=O)[O-])]",
        "nitrile": "[NX1]#[CX2]",
        "halogen": "[F,Cl,Br,I]",
        "thiol": "[SX2H]",
        "sulfide": "[#16X2]([#6])[#6]",
        "aromatic_ring": "a1aaaaa1",
    }

    # Compiled query molecules, built lazily on first use and shared by all
    # instances. Kept on the class so instances carry only plain settings.
    _compiled_patterns: Optional[dict[str, Any]] = None

    def __init__(
        self,
        include_smiles: bool = True,
        include_name: bool = True,
    ):
        """
        Initialize functional group extractor.

        Args:
            include_smiles: Include original SMILES in output
            include_name: Include molecule name in output
        """
        self.include_smiles = include_smiles
        self.include_name = include_name

    @classmethod
    def _get_patterns(cls) -> dict[str, Any]:
        """Return the compiled SMARTS queries, compiling them on first call.

        Queries that fail to compile (``MolFromSmarts`` returns ``None``) are
        left out, so no ``n_<name>`` key is emitted for them.
        """
        if cls._compiled_patterns is None:
            compiled: dict[str, Any] = {}
            for name, smarts in cls._FG_SMARTS.items():
                pattern = Chem.MolFromSmarts(smarts)
                if pattern is not None:
                    compiled[name] = pattern
            cls._compiled_patterns = compiled
        return cls._compiled_patterns

    def extract(self, record: MoleculeRecord) -> Optional[dict[str, Any]]:
        """
        Extract functional groups from a molecule.

        Args:
            record: MoleculeRecord to process

        Returns:
            Dictionary with optional ``smiles``/``name`` entries followed by
            one ``n_<group>`` match count per functional group, or None if
            the record has no molecule or matching raises.
        """
        if record.mol is None:
            return None

        try:
            result: dict[str, Any] = {}

            if self.include_smiles:
                result["smiles"] = record.smiles
            if self.include_name and record.name:
                result["name"] = record.name

            # The queries are compiled once and reused for every molecule.
            for name, pattern in self._get_patterns().items():
                matches = record.mol.GetSubstructMatches(pattern)
                result[f"n_{name}"] = len(matches)

            return result

        except Exception:
            # Best-effort: a molecule that cannot be processed yields None.
            return None
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def analyze_fragments(fragments: list[str], top_n: int = 20) -> list[tuple[str, int, float]]:
    """
    Summarize how often each fragment occurs.

    Args:
        fragments: List of fragment SMILES (duplicates are what get counted)
        top_n: Number of most frequent fragments to report

    Returns:
        List of (fragment, count, percentage) tuples, most frequent first.
        The percentage is relative to ``len(fragments)`` and rounded to two
        decimals.
    """
    n_total = len(fragments)
    ranked = Counter(fragments).most_common(top_n)

    return [
        (smiles, hits, round((hits / n_total) * 100 if n_total > 0 else 0, 2))
        for smiles, hits in ranked
    ]
|
rdkit_cli/core/mcs.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Maximum Common Substructure engine."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional, Any
|
|
4
|
+
|
|
5
|
+
from rdkit import Chem
|
|
6
|
+
from rdkit.Chem import rdFMCS
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def find_mcs(
    mols: list[Chem.Mol],
    timeout: int = 60,
    threshold: float = 1.0,
    maximize: str = "atoms",
    ring_matches_ring_only: bool = True,
    complete_rings_only: bool = True,
    match_valences: bool = False,
    match_chiral_tag: bool = False,
    atom_compare: str = "elements",
    bond_compare: str = "order",
) -> Optional[dict[str, Any]]:
    """
    Compute the Maximum Common Substructure of a set of molecules.

    ``None`` entries in ``mols`` are ignored. Unrecognized ``atom_compare``
    or ``bond_compare`` values fall back to ``'elements'`` and ``'order'``;
    any ``maximize`` value other than ``'bonds'`` maximizes atoms.

    Args:
        mols: List of molecules
        timeout: Maximum time in seconds
        threshold: Fraction of molecules that must contain MCS
        maximize: What to maximize ('atoms' or 'bonds')
        ring_matches_ring_only: Ring atoms only match ring atoms
        complete_rings_only: Only return complete rings
        match_valences: Match atom valences
        match_chiral_tag: Match chirality
        atom_compare: Atom comparison ('any', 'elements', 'isotopes')
        bond_compare: Bond comparison ('any', 'order', 'orderexact')

    Returns:
        None when fewer than two molecules remain after dropping ``None``.
        Otherwise a dictionary in one of four shapes:
        ``{"canceled": True, "timeout": ...}`` when the search was canceled,
        ``{"smarts": "", "num_atoms": 0, "num_bonds": 0}`` when nothing is
        shared, ``{"smarts", "num_atoms", "num_bonds", "canceled"}`` on
        success, or ``{"error": message}`` if the search raised.
    """
    usable = [candidate for candidate in mols if candidate is not None]
    if len(usable) < 2:
        return None

    # Lookup tables translating the string options to rdFMCS enum members.
    atom_modes = {
        "any": rdFMCS.AtomCompare.CompareAny,
        "elements": rdFMCS.AtomCompare.CompareElements,
        "isotopes": rdFMCS.AtomCompare.CompareIsotopes,
    }
    bond_modes = {
        "any": rdFMCS.BondCompare.CompareAny,
        "order": rdFMCS.BondCompare.CompareOrder,
        "orderexact": rdFMCS.BondCompare.CompareOrderExact,
    }

    try:
        options = {
            "timeout": timeout,
            "threshold": threshold,
            "maximizeBonds": maximize == "bonds",
            "ringMatchesRingOnly": ring_matches_ring_only,
            "completeRingsOnly": complete_rings_only,
            "matchValences": match_valences,
            "matchChiralTag": match_chiral_tag,
            "atomCompare": atom_modes.get(atom_compare, rdFMCS.AtomCompare.CompareElements),
            "bondCompare": bond_modes.get(bond_compare, rdFMCS.BondCompare.CompareOrder),
        }
        mcs = rdFMCS.FindMCS(usable, **options)

        if mcs.canceled:
            return {"canceled": True, "timeout": timeout}

        if mcs.numAtoms == 0:
            return {"smarts": "", "num_atoms": 0, "num_bonds": 0}

        summary = {
            "smarts": mcs.smartsString,
            "num_atoms": mcs.numAtoms,
            "num_bonds": mcs.numBonds,
            "canceled": mcs.canceled,
        }
        return summary

    except Exception as e:
        # Failures are reported in-band so batch callers keep going.
        return {"error": str(e)}
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class MCSAligner:
    """Compare query molecules against a fixed reference via their MCS."""

    def __init__(
        self,
        reference_smiles: str,
        timeout: int = 30,
    ):
        """
        Initialize MCS aligner.

        Args:
            reference_smiles: Reference molecule SMILES
            timeout: MCS timeout in seconds

        Raises:
            ValueError: If the reference SMILES cannot be parsed
        """
        parsed = Chem.MolFromSmiles(reference_smiles)
        if parsed is None:
            raise ValueError(f"Invalid reference SMILES: {reference_smiles}")

        self.reference_mol = parsed
        self.timeout = timeout

    def find_common(self, mol: Chem.Mol) -> Optional[dict[str, Any]]:
        """
        Find MCS between reference and query molecule.

        Args:
            mol: Query molecule

        Returns:
            The dictionary produced by ``find_mcs`` for the pair
            (reference, query), or None when ``mol`` is None
        """
        if mol is None:
            return None

        pair = [self.reference_mol, mol]
        return find_mcs(pair, timeout=self.timeout)
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""Reaction transformation engine."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional, Any
|
|
4
|
+
|
|
5
|
+
from rdkit import Chem
|
|
6
|
+
from rdkit.Chem import AllChem, rdChemReactions
|
|
7
|
+
|
|
8
|
+
from rdkit_cli.io.readers import MoleculeRecord
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ReactionTransformer:
    """Apply SMIRKS transformations to molecules."""

    def __init__(
        self,
        smirks: str,
        max_products: int = 100,
    ):
        """
        Initialize reaction transformer.

        Args:
            smirks: SMIRKS reaction pattern
            max_products: Maximum number of product sets to inspect

        Raises:
            ValueError: If the SMIRKS pattern does not yield a reaction
        """
        self.reaction = AllChem.ReactionFromSmarts(smirks)
        if self.reaction is None:
            raise ValueError(f"Invalid SMIRKS pattern: {smirks}")

        self.max_products = max_products

    def transform(self, record: MoleculeRecord) -> Optional[dict[str, Any]]:
        """
        Apply transformation to a molecule.

        The molecule is passed as the sole reactant. Products from the first
        ``max_products`` product sets are sanitized and de-duplicated by
        SMILES; products that fail sanitization are skipped. The reported
        ``smiles`` is the first distinct product in generation order, so the
        result is reproducible when several distinct products exist.

        Args:
            record: MoleculeRecord to transform

        Returns:
            Dictionary with ``smiles`` (first product), ``reactant``,
            ``num_products`` (number of distinct product SMILES) and
            optionally ``name``; None if the record has no molecule, the
            reaction yields no usable product, or running it raises
        """
        if record.mol is None:
            return None

        try:
            products = self.reaction.RunReactants((record.mol,))

            if not products:
                return None

            # A dict is used as an insertion-ordered set: it de-duplicates
            # while remembering which product was generated first.
            unique_smiles: dict[str, None] = {}
            for product_set in products[:self.max_products]:
                for prod in product_set:
                    try:
                        Chem.SanitizeMol(prod)
                        unique_smiles.setdefault(Chem.MolToSmiles(prod), None)
                    except Exception:
                        # Skip products that cannot be sanitized/serialized.
                        continue

            if not unique_smiles:
                return None

            # Only the first product is reported; the rest are counted.
            product_smiles = next(iter(unique_smiles))

            result: dict[str, Any] = {
                "smiles": product_smiles,
                "reactant": record.smiles,
                "num_products": len(unique_smiles),
            }

            if record.name:
                result["name"] = record.name

            return result

        except Exception:
            # Best-effort: a molecule the reaction cannot process yields None.
            return None
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class ReactionEnumerator:
    """Enumerate products from reaction templates."""

    def __init__(
        self,
        reaction_smarts: str,
        max_products: int = 1000,
    ):
        """
        Initialize reaction enumerator.

        Args:
            reaction_smarts: Reaction SMARTS
            max_products: Maximum products to generate

        Raises:
            ValueError: If the reaction SMARTS does not yield a reaction
        """
        self.reaction = AllChem.ReactionFromSmarts(reaction_smarts)
        if self.reaction is None:
            raise ValueError(f"Invalid reaction SMARTS: {reaction_smarts}")

        self.max_products = max_products
        self.num_reactants = self.reaction.GetNumReactantTemplates()

    def enumerate(
        self,
        reactant_lists: list[list[Chem.Mol]],
    ) -> list[dict[str, Any]]:
        """
        Enumerate reaction products from lists of reactants.

        Every combination drawn from the reactant lists (one reactant per
        template) is run through the reaction. Products are sanitized and
        de-duplicated by SMILES across all combinations; each distinct
        product is reported once, attributed to the first combination that
        produced it. At most ``max_products`` results are returned.

        Args:
            reactant_lists: List of reactant lists (one per reactant template)

        Returns:
            List of product dictionaries with ``smiles`` and ``reactants``
            (dot-joined reactant SMILES)

        Raises:
            ValueError: If the number of reactant lists does not match the
                number of reactant templates of the reaction
        """
        if len(reactant_lists) != self.num_reactants:
            raise ValueError(
                f"Expected {self.num_reactants} reactant lists, got {len(reactant_lists)}"
            )

        from itertools import chain
        from itertools import product as iterproduct

        results: list[dict[str, Any]] = []
        unique_products: set[str] = set()

        for reactants in iterproduct(*reactant_lists):
            if len(results) >= self.max_products:
                break

            try:
                product_sets = self.reaction.RunReactants(reactants)
            except Exception:
                # Skip combinations the reaction cannot process.
                continue

            # Computed on the first new product of this combination and
            # reused for the rest.
            reactants_smiles: Optional[str] = None

            # Flattening the product sets gives a single loop, so the cap
            # check below stops every level of iteration at once.
            for prod in chain.from_iterable(product_sets):
                if len(results) >= self.max_products:
                    break

                try:
                    Chem.SanitizeMol(prod)
                    smi = Chem.MolToSmiles(prod)

                    if smi in unique_products:
                        continue

                    if reactants_smiles is None:
                        reactants_smiles = ".".join(
                            Chem.MolToSmiles(r) for r in reactants
                        )
                except Exception:
                    # Skip products that cannot be sanitized/serialized.
                    continue

                unique_products.add(smi)
                results.append({
                    "smiles": smi,
                    "reactants": reactants_smiles,
                })

        return results
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Scaffold analysis engine."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional, Any
|
|
4
|
+
from collections import Counter
|
|
5
|
+
|
|
6
|
+
from rdkit import Chem
|
|
7
|
+
from rdkit.Chem.Scaffolds import MurckoScaffold
|
|
8
|
+
|
|
9
|
+
from rdkit_cli.io.readers import MoleculeRecord
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_murcko_scaffold(mol: Chem.Mol, generic: bool = False) -> Optional[str]:
    """
    Compute the Murcko scaffold of a molecule as SMILES.

    Args:
        mol: RDKit molecule
        generic: If True, convert the scaffold to its generic form via
            ``MurckoScaffold.MakeScaffoldGeneric`` before serializing

    Returns:
        Scaffold SMILES, or None if any RDKit call raised
    """
    try:
        framework = MurckoScaffold.GetScaffoldForMol(mol)
        chosen = MurckoScaffold.MakeScaffoldGeneric(framework) if generic else framework
        return Chem.MolToSmiles(chosen)
    except Exception:
        # Best-effort: callers treat None as "no scaffold available".
        return None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def get_side_chains(mol: Chem.Mol) -> list[str]:
    """
    Get side chains (R-groups) for a molecule.

    The Murcko scaffold is computed and cut out of the molecule with
    ``Chem.ReplaceCore``; every remaining disconnected piece is returned as
    one side chain. Attachment points are expected to show up as dummy
    atoms in the SMILES, as produced by ``ReplaceCore``.

    Args:
        mol: RDKit molecule

    Returns:
        List of side chain SMILES. Empty if the molecule has no scaffold
        atoms, the scaffold cannot be matched back onto the molecule, there
        are no side chains, or any RDKit call raises.
    """
    try:
        core = MurckoScaffold.GetScaffoldForMol(mol)

        # With an empty scaffold there is no core to strip side chains from.
        if core.GetNumAtoms() == 0:
            return []

        remainder = Chem.ReplaceCore(mol, core)
        # No usable result means the core did not match the molecule.
        if remainder is None:
            return []

        pieces = Chem.GetMolFrags(remainder, asMols=True)
        return [Chem.MolToSmiles(piece) for piece in pieces]
    except Exception:
        # Best-effort, consistent with the other helpers in this module.
        return []
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class ScaffoldExtractor:
    """Produce one Murcko scaffold row per molecule."""

    def __init__(
        self,
        generic: bool = False,
        include_smiles: bool = True,
        include_name: bool = True,
    ):
        """
        Initialize scaffold extractor.

        Args:
            generic: Generate generic (element-agnostic) scaffolds
            include_smiles: Include original SMILES in output
            include_name: Include molecule name in output
        """
        self.generic = generic
        self.include_smiles = include_smiles
        self.include_name = include_name

    def extract(self, record: MoleculeRecord) -> Optional[dict[str, Any]]:
        """
        Extract scaffold from a molecule.

        Args:
            record: MoleculeRecord to process

        Returns:
            Dictionary with optional ``smiles``/``name`` entries followed by
            ``scaffold``, or None if the record has no molecule or the
            scaffold could not be computed
        """
        if record.mol is None:
            return None

        scaffold_smiles = get_murcko_scaffold(record.mol, generic=self.generic)
        if scaffold_smiles is None:
            return None

        # Assemble the row in the established key order.
        row: dict[str, Any] = {}
        if self.include_smiles:
            row["smiles"] = record.smiles
        if self.include_name and record.name:
            row["name"] = record.name
        row["scaffold"] = scaffold_smiles

        return row
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class ScaffoldDecomposer:
    """Report the Murcko scaffold and its generic form for molecules."""

    def __init__(
        self,
        include_smiles: bool = True,
        include_name: bool = True,
    ):
        """
        Initialize scaffold decomposer.

        Args:
            include_smiles: Include original SMILES in output
            include_name: Include molecule name in output
        """
        self.include_smiles = include_smiles
        self.include_name = include_name

    def decompose(self, record: MoleculeRecord) -> Optional[dict[str, Any]]:
        """
        Decompose a molecule into its scaffold and generic scaffold.

        Args:
            record: MoleculeRecord to process

        Returns:
            Dictionary with optional ``smiles``/``name`` entries followed by
            ``scaffold`` and ``generic_scaffold`` (the latter may be None if
            the generic form could not be computed), or None if the record
            has no molecule or the plain scaffold could not be computed
        """
        if record.mol is None:
            return None

        plain = get_murcko_scaffold(record.mol)
        if plain is None:
            return None

        # A failed generic scaffold does not abort the row; it is stored
        # as None.
        generic_form = get_murcko_scaffold(record.mol, generic=True)

        row: dict[str, Any] = {}
        if self.include_smiles:
            row["smiles"] = record.smiles
        if self.include_name and record.name:
            row["name"] = record.name
        row["scaffold"] = plain
        row["generic_scaffold"] = generic_form

        return row
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def analyze_scaffolds(
    scaffolds: list[str],
    top_n: int = 20,
) -> list[tuple[str, int, float]]:
    """
    Summarize how often each scaffold occurs.

    Args:
        scaffolds: List of scaffold SMILES (duplicates are what get counted)
        top_n: Number of most frequent scaffolds to report

    Returns:
        List of (scaffold, count, percentage) tuples, most frequent first.
        The percentage is relative to ``len(scaffolds)`` and rounded to two
        decimals.
    """
    n_total = len(scaffolds)
    tally = Counter(scaffolds)

    summary = []
    for smiles, hits in tally.most_common(top_n):
        if n_total > 0:
            share = (hits / n_total) * 100
        else:
            share = 0
        summary.append((smiles, hits, round(share, 2)))

    return summary
|