celltype-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- celltype_cli-0.1.0.dist-info/METADATA +267 -0
- celltype_cli-0.1.0.dist-info/RECORD +89 -0
- celltype_cli-0.1.0.dist-info/WHEEL +4 -0
- celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
- celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- ct/__init__.py +3 -0
- ct/agent/__init__.py +0 -0
- ct/agent/case_studies.py +426 -0
- ct/agent/config.py +523 -0
- ct/agent/doctor.py +544 -0
- ct/agent/knowledge.py +523 -0
- ct/agent/loop.py +99 -0
- ct/agent/mcp_server.py +478 -0
- ct/agent/orchestrator.py +733 -0
- ct/agent/runner.py +656 -0
- ct/agent/sandbox.py +481 -0
- ct/agent/session.py +145 -0
- ct/agent/system_prompt.py +186 -0
- ct/agent/trace_store.py +228 -0
- ct/agent/trajectory.py +169 -0
- ct/agent/types.py +182 -0
- ct/agent/workflows.py +462 -0
- ct/api/__init__.py +1 -0
- ct/api/app.py +211 -0
- ct/api/config.py +120 -0
- ct/api/engine.py +124 -0
- ct/cli.py +1448 -0
- ct/data/__init__.py +0 -0
- ct/data/compute_providers.json +59 -0
- ct/data/cro_database.json +395 -0
- ct/data/downloader.py +238 -0
- ct/data/loaders.py +252 -0
- ct/kb/__init__.py +5 -0
- ct/kb/benchmarks.py +147 -0
- ct/kb/governance.py +106 -0
- ct/kb/ingest.py +415 -0
- ct/kb/reasoning.py +129 -0
- ct/kb/schema_monitor.py +162 -0
- ct/kb/substrate.py +387 -0
- ct/models/__init__.py +0 -0
- ct/models/llm.py +370 -0
- ct/tools/__init__.py +195 -0
- ct/tools/_compound_resolver.py +297 -0
- ct/tools/biomarker.py +368 -0
- ct/tools/cellxgene.py +282 -0
- ct/tools/chemistry.py +1371 -0
- ct/tools/claude.py +390 -0
- ct/tools/clinical.py +1153 -0
- ct/tools/clue.py +249 -0
- ct/tools/code.py +1069 -0
- ct/tools/combination.py +397 -0
- ct/tools/compute.py +402 -0
- ct/tools/cro.py +413 -0
- ct/tools/data_api.py +2114 -0
- ct/tools/design.py +295 -0
- ct/tools/dna.py +575 -0
- ct/tools/experiment.py +604 -0
- ct/tools/expression.py +655 -0
- ct/tools/files.py +957 -0
- ct/tools/genomics.py +1387 -0
- ct/tools/http_client.py +146 -0
- ct/tools/imaging.py +319 -0
- ct/tools/intel.py +223 -0
- ct/tools/literature.py +743 -0
- ct/tools/network.py +422 -0
- ct/tools/notification.py +111 -0
- ct/tools/omics.py +3330 -0
- ct/tools/ops.py +1230 -0
- ct/tools/parity.py +649 -0
- ct/tools/pk.py +245 -0
- ct/tools/protein.py +678 -0
- ct/tools/regulatory.py +643 -0
- ct/tools/remote_data.py +179 -0
- ct/tools/report.py +181 -0
- ct/tools/repurposing.py +376 -0
- ct/tools/safety.py +1280 -0
- ct/tools/shell.py +178 -0
- ct/tools/singlecell.py +533 -0
- ct/tools/statistics.py +552 -0
- ct/tools/structure.py +882 -0
- ct/tools/target.py +901 -0
- ct/tools/translational.py +123 -0
- ct/tools/viability.py +218 -0
- ct/ui/__init__.py +0 -0
- ct/ui/markdown.py +31 -0
- ct/ui/status.py +258 -0
- ct/ui/suggestions.py +567 -0
- ct/ui/terminal.py +1456 -0
- ct/ui/traces.py +112 -0
ct/tools/chemistry.py
ADDED
|
@@ -0,0 +1,1371 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Chemistry tools: molecular descriptors, SAR analysis, similarity search.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from ct.tools import registry
|
|
6
|
+
from ct.tools.http_client import request
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _extract_smiles(smiles) -> str:
|
|
10
|
+
"""Extract a SMILES string from various input types and resolve drug names.
|
|
11
|
+
|
|
12
|
+
Handles the case where a dict (e.g., full pubchem_lookup result) is passed
|
|
13
|
+
instead of a plain SMILES string — typically when the planner uses $step.1
|
|
14
|
+
instead of $step.1.canonical_smiles.
|
|
15
|
+
|
|
16
|
+
Also resolves drug names (e.g. "lenalidomide") to SMILES via
|
|
17
|
+
_compound_resolver.resolve_to_smiles.
|
|
18
|
+
"""
|
|
19
|
+
if isinstance(smiles, dict):
|
|
20
|
+
smiles = (smiles.get("canonical_smiles") or smiles.get("smiles")
|
|
21
|
+
or smiles.get("summary", ""))
|
|
22
|
+
smiles = str(smiles).strip()
|
|
23
|
+
|
|
24
|
+
# Try to resolve name → SMILES (handles both valid SMILES and drug names)
|
|
25
|
+
try:
|
|
26
|
+
from ct.tools._compound_resolver import resolve_to_smiles
|
|
27
|
+
return resolve_to_smiles(smiles)
|
|
28
|
+
except (ValueError, ImportError):
|
|
29
|
+
return smiles # Fall through — tool will handle invalid SMILES
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@registry.register(
    name="chemistry.descriptors",
    description="Compute molecular descriptors and fingerprints for a compound from SMILES",
    category="chemistry",
    parameters={"smiles": "SMILES string"},
    usage_guide="You need molecular properties (MW, LogP, TPSA, Lipinski) for a compound. Use early in hit characterization to assess drug-likeness and physicochemical profile.",
)
def descriptors(smiles: str, **kwargs) -> dict:
    """Compute physicochemical properties for one compound.

    Args:
        smiles: SMILES string, compound name, or lookup-result dict; it is
            normalized through ``_extract_smiles`` first.
        **kwargs: Ignored; accepted so extra planner arguments do not fail.

    Returns:
        ``{"summary": str, "properties": dict}`` on success, where
        ``properties`` holds MW, LogP, H-bond donors/acceptors, TPSA,
        rotatable bonds, ring counts, heavy-atom count, formula, stereocenter
        count, the number of Lipinski rule-of-5 violations and two derived
        ratios. ``{"error": str, "summary": str}`` if the input cannot be
        parsed into a non-empty molecule.
    """
    smiles = _extract_smiles(smiles)
    from rdkit import Chem
    from rdkit.Chem import Descriptors, rdMolDescriptors

    mol = Chem.MolFromSmiles(smiles)
    # An empty SMILES parses to a molecule with zero atoms rather than None;
    # treat it as invalid instead of reporting an all-zero profile.
    if mol is None or mol.GetNumAtoms() == 0:
        return {"error": f"Invalid SMILES: {smiles}", "summary": f"Invalid SMILES: {smiles}"}

    props = {
        "smiles": smiles,
        "molecular_weight": Descriptors.MolWt(mol),
        "logp": Descriptors.MolLogP(mol),
        "hbd": Descriptors.NumHDonors(mol),
        "hba": Descriptors.NumHAcceptors(mol),
        "tpsa": Descriptors.TPSA(mol),
        "rotatable_bonds": Descriptors.NumRotatableBonds(mol),
        "rings": Descriptors.RingCount(mol),
        "aromatic_rings": Descriptors.NumAromaticRings(mol),
        "heavy_atoms": mol.GetNumHeavyAtoms(),
        "formula": rdMolDescriptors.CalcMolFormula(mol),
        "num_stereocenters": len(Chem.FindMolChiralCenters(mol)),
    }

    # Lipinski Rule of 5: count how many of the four thresholds are exceeded.
    props["lipinski_violations"] = sum([
        props["molecular_weight"] > 500,
        props["logp"] > 5,
        props["hbd"] > 5,
        props["hba"] > 10,
    ])

    # Molecular glue specific ratios. The epsilon keeps a LogP of exactly 0
    # from dividing by zero.
    mol_weight = props["molecular_weight"]
    props["mw_logp_ratio"] = mol_weight / (props["logp"] + 1e-6)
    # Dummy-atom-only inputs such as "*" have zero mass; avoid ZeroDivisionError.
    props["tpsa_per_mw"] = props["tpsa"] / mol_weight if mol_weight else 0.0

    return {
        "summary": f"Molecular profile for {props['formula']} (MW={props['molecular_weight']:.1f}, "
                   f"LogP={props['logp']:.2f}, Lipinski violations={props['lipinski_violations']})",
        "properties": props,
    }
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@registry.register(
    name="chemistry.pairwise_similarity",
    description="Compute pairwise Tanimoto similarity matrix for a list of compounds (by name or SMILES)",
    category="chemistry",
    parameters={
        "compounds": "List of compound names or SMILES strings",
        "fingerprint": "Fingerprint type: 'morgan' (default, ECFP4) or 'maccs'",
    },
    usage_guide="You need to compute fingerprint similarity between a set of named compounds. Use when the question asks to 'compare similarity', 'cluster by scaffold', or 'compute Tanimoto' between specific compounds. Returns a full pairwise similarity matrix.",
)
def pairwise_similarity(compounds: list = None, fingerprint: str = "morgan", **kwargs) -> dict:
    """Compute pairwise Tanimoto similarity for a set of compounds.

    Args:
        compounds: Two or more compound names, SMILES strings, or
            lookup-result dicts; each is normalized via ``_extract_smiles``.
        fingerprint: ``"maccs"`` (case-insensitive) selects MACCS keys; any
            other value uses Morgan radius-2, 2048-bit fingerprints.
        **kwargs: Ignored.

    Returns:
        A dict with ``summary``, ``fingerprint_type``, ``n_compounds``,
        ``pairs`` (sorted by descending similarity), ``matrix`` (one row dict
        per compound) and ``resolved_smiles``. Compounds whose SMILES cannot
        be parsed are kept and scored 0.0 against everything else. Returns an
        ``error``/``summary`` dict when fewer than two compounds are given.
    """
    if not compounds or len(compounds) < 2:
        return {"error": "Need at least 2 compounds", "summary": "Provide a list of 2+ compound names or SMILES"}

    from itertools import combinations

    from rdkit import Chem, DataStructs
    from rdkit.Chem import AllChem, MACCSkeys

    # Decide the fingerprint once so the computation and the reported label
    # can never disagree (anything that is not "maccs" means Morgan).
    use_maccs = str(fingerprint or "").strip().lower() == "maccs"

    # Resolve names to SMILES and parse them.
    resolved = []
    for cpd in compounds:
        smi = _extract_smiles(cpd)
        mol = Chem.MolFromSmiles(smi)
        # Names are used as dict keys in the matrix rows; non-string inputs
        # (e.g. lookup-result dicts) are unhashable, so label them by SMILES.
        label = cpd if isinstance(cpd, str) else smi
        if mol is None:
            resolved.append({"name": label, "smiles": smi, "mol": None, "error": f"Invalid SMILES: {smi}"})
        else:
            resolved.append({"name": label, "smiles": Chem.MolToSmiles(mol), "mol": mol})

    def _fingerprint(mol):
        """Return the selected fingerprint for *mol*, or None if unparsed."""
        if mol is None:
            return None
        if use_maccs:
            return MACCSkeys.GenMACCSKeys(mol)
        return AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)

    fps = [_fingerprint(r["mol"]) for r in resolved]

    # Compute each similarity exactly once and mirror it into a symmetric
    # table; the diagonal is 1.0 by definition.
    n = len(resolved)
    sims = [[1.0] * n for _ in range(n)]
    pairs = []
    for i, j in combinations(range(n), 2):
        if fps[i] is None or fps[j] is None:
            score = 0.0
        else:
            score = round(DataStructs.TanimotoSimilarity(fps[i], fps[j]), 4)
        sims[i][j] = sims[j][i] = score
        pairs.append({
            "compound_a": resolved[i]["name"],
            "compound_b": resolved[j]["name"],
            "smiles_a": resolved[i]["smiles"],
            "smiles_b": resolved[j]["smiles"],
            "tanimoto": score,
        })

    # Sort by similarity, most similar first (stable for ties).
    pairs.sort(key=lambda pair: pair["tanimoto"], reverse=True)
    most_similar = pairs[0] if pairs else {}
    least_similar = pairs[-1] if pairs else {}

    # Build readable matrix: one dict per compound keyed by the other names.
    names = [r["name"] for r in resolved]
    matrix_rows = [
        {"compound": names[i], **{names[j]: sims[i][j] for j in range(n)}}
        for i in range(n)
    ]

    fp_label = "MACCS keys (166 bits)" if use_maccs else "ECFP4 (Morgan r=2, 2048 bits)"

    summary_lines = [
        f"Pairwise Tanimoto similarity ({fp_label}) for {n} compounds:",
    ]
    for p in pairs:
        summary_lines.append(f"  {p['compound_a']} vs {p['compound_b']}: {p['tanimoto']:.4f}")
    if most_similar:
        summary_lines.append(f"Most similar: {most_similar['compound_a']} & {most_similar['compound_b']} ({most_similar['tanimoto']:.4f})")
    if least_similar:
        summary_lines.append(f"Most different: {least_similar['compound_a']} & {least_similar['compound_b']} ({least_similar['tanimoto']:.4f})")

    return {
        "summary": "\n".join(summary_lines),
        "fingerprint_type": fp_label,
        "n_compounds": n,
        "pairs": pairs,
        "matrix": matrix_rows,
        "resolved_smiles": [{"name": r["name"], "smiles": r["smiles"]} for r in resolved],
    }
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
@registry.register(
    name="chemistry.similarity_search",
    description="Find similar compounds in a library using Tanimoto similarity on Morgan fingerprints",
    category="chemistry",
    parameters={"smiles": "Query SMILES", "library_path": "Path to compound library CSV", "top_n": "Number of hits"},
    usage_guide="You have a hit compound and want to find structurally similar analogs in a library. Use for SAR expansion or finding backup compounds with similar scaffolds.",
)
def similarity_search(smiles: str, library_path: str = None, top_n: int = 10, **kwargs) -> dict:
    """Rank the compounds of a CSV library by similarity to a query.

    Args:
        smiles: Query SMILES, compound name, or lookup-result dict
            (normalized via ``_extract_smiles``).
        library_path: CSV file with a ``smiles`` or ``canonical_smiles``
            column (case-insensitive). Other columns are copied into each hit.
        top_n: Maximum number of hits to return.
        **kwargs: Ignored.

    Returns:
        ``{"summary", "hits", "library_size"}`` where ``hits`` are sorted by
        descending Tanimoto similarity (Morgan radius 2, 2048 bits) and
        ``library_size`` counts only library rows that parsed. Returns an
        ``error``/``summary`` dict for an invalid query, a missing or
        unreadable library, or a library without a SMILES column.
    """
    smiles = _extract_smiles(smiles)
    from rdkit import Chem, DataStructs
    from rdkit.Chem import AllChem
    import pandas as pd

    query_mol = Chem.MolFromSmiles(smiles)
    if query_mol is None:
        return {"error": f"Invalid SMILES: {smiles}", "summary": f"Invalid SMILES: {smiles}"}
    query_fp = AllChem.GetMorganFingerprintAsBitVect(query_mol, 2, nBits=2048)

    # Load library
    if not library_path:
        return {"error": "No compound library specified", "summary": "No compound library specified"}
    try:
        lib = pd.read_csv(library_path)
    except (OSError, ValueError) as exc:
        # Missing file, parser and decoding errors all land here; report them
        # in the tool's error-dict convention instead of raising.
        return {"error": f"Could not read CSV: {exc}", "summary": f"Failed to load {library_path}"}

    smiles_col = next((c for c in lib.columns if c.lower() in ['smiles', 'canonical_smiles']), None)
    if smiles_col is None:
        return {"error": "No SMILES column found in library", "summary": "No SMILES column found in library"}

    results = []
    for _, row in lib.iterrows():
        entry_smiles = row[smiles_col]
        # Blank cells are read as NaN (a float), which RDKit cannot parse.
        if not isinstance(entry_smiles, str):
            continue
        mol = Chem.MolFromSmiles(entry_smiles)
        if mol is None:
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
        hit = {
            "smiles": entry_smiles,
            "similarity": DataStructs.TanimotoSimilarity(query_fp, fp),
        }
        # Copy the remaining library columns without letting a column that
        # happens to be called "smiles" or "similarity" clobber the computed
        # values used for ranking.
        for column in row.index:
            if column != smiles_col:
                hit.setdefault(column, row[column])
        results.append(hit)

    results.sort(key=lambda hit: hit["similarity"], reverse=True)
    top_hits = results[:top_n]

    return {
        "summary": f"Top {len(top_hits)} similar compounds (max Tanimoto={top_hits[0]['similarity']:.3f})" if top_hits else "No hits",
        "hits": top_hits,
        "library_size": len(results),
    }
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
@registry.register(
    name="chemistry.sar_analyze",
    description="Analyze structure-activity relationships for a set of compounds with activity data",
    category="chemistry",
    parameters={
        "compounds_path": "CSV with SMILES and activity columns",
        "activity_col": "Name of the activity column (default 'activity')",
    },
    usage_guide="You have a set of compounds with activity data and want to understand which molecular features drive potency. Use for medicinal chemistry optimization guidance.",
)
def sar_analyze(compounds_path: str, activity_col: str = "activity", **kwargs) -> dict:
    """Correlate simple molecular descriptors with measured activity.

    Args:
        compounds_path: CSV file with a ``smiles``/``canonical_smiles`` column
            (case-insensitive) and an activity column.
        activity_col: Name of the activity column.
        **kwargs: Ignored.

    Returns:
        ``{"summary", "correlations", "n_compounds"}`` where ``correlations``
        maps each of mw, logp, tpsa, hbd, hba and rotbonds to its Pearson
        ``r`` and ``p`` against activity. Rows with an unparseable SMILES or
        a missing/non-numeric activity are skipped. Returns an
        ``error``/``summary`` dict if the file cannot be read, the columns
        are missing, or fewer than two usable compounds remain.
    """
    import pandas as pd
    from rdkit import Chem
    from rdkit.Chem import Descriptors

    try:
        df = pd.read_csv(compounds_path)
    except (OSError, ValueError) as exc:
        return {"error": f"Could not read CSV: {exc}", "summary": f"Failed to load {compounds_path}"}

    smiles_col = next((c for c in df.columns if c.lower() in ['smiles', 'canonical_smiles']), None)
    if smiles_col is None or activity_col not in df.columns:
        return {"error": "Need SMILES and activity columns", "summary": "Need SMILES and activity columns"}

    # Non-numeric activity cells become NaN here and are dropped below, so
    # the correlation only ever sees real numbers.
    activities = pd.to_numeric(df[activity_col], errors="coerce")

    # Compute descriptors
    features = []
    for entry_smiles, activity in zip(df[smiles_col], activities):
        if not isinstance(entry_smiles, str) or pd.isna(activity):
            continue
        mol = Chem.MolFromSmiles(entry_smiles)
        if mol is None:
            continue
        features.append({
            "smiles": entry_smiles,
            "activity": float(activity),
            "mw": Descriptors.MolWt(mol),
            "logp": Descriptors.MolLogP(mol),
            "tpsa": Descriptors.TPSA(mol),
            "hbd": Descriptors.NumHDonors(mol),
            "hba": Descriptors.NumHAcceptors(mol),
            "rotbonds": Descriptors.NumRotatableBonds(mol),
        })

    # pearsonr needs at least two observations, and an empty feature list
    # would yield a DataFrame without the descriptor columns at all.
    if len(features) < 2:
        message = "Need at least 2 valid compounds with numeric activity for SAR analysis"
        return {"error": message, "summary": message}

    feat_df = pd.DataFrame(features)

    # Correlate descriptors with activity
    from scipy import stats
    correlations = {}
    for col in ["mw", "logp", "tpsa", "hbd", "hba", "rotbonds"]:
        r, p = stats.pearsonr(feat_df[col], feat_df["activity"])
        # Cast to built-in floats so the result holds no numpy scalars.
        correlations[col] = {"r": round(float(r), 3), "p": round(float(p), 4)}

    return {
        "summary": f"SAR analysis on {len(feat_df)} compounds",
        "correlations": correlations,
        "n_compounds": len(feat_df),
    }
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
@registry.register(
    name="chemistry.mmp_analysis",
    description="Matched molecular pair analysis to identify R-group transformations that improve activity",
    category="chemistry",
    parameters={
        "compounds_csv": "Path to CSV with SMILES and activity columns",
        "activity_col": "Name of the activity column (default 'activity')",
    },
    usage_guide="You have a congeneric series of compounds and want to identify which single-point structural changes drive activity. Use for medicinal chemistry SAR optimization — finds matched molecular pairs and ranks R-group swaps by activity improvement.",
)
def mmp_analysis(compounds_csv: str = None, activity_col: str = "activity", **kwargs) -> dict:
    """Pair compounds that share a Murcko scaffold and rank activity changes.

    Compounds are grouped by their Murcko scaffold SMILES; every pair inside
    a group is reported with its activity difference (second minus first, in
    input order), heavy-atom difference and, when it can be computed, the
    SMARTS of the pair's maximum common substructure.

    Args:
        compounds_csv: CSV with a ``smiles``/``canonical_smiles`` column, an
            activity column and optionally a ``name`` column. When empty or
            omitted, a built-in benzamide demo series is analysed instead.
        activity_col: Name of the activity column.
        **kwargs: Ignored.

    Returns:
        A dict with ``summary``, counts, ``using_demo_data``,
        ``top_improvements`` (up to 10), ``top_decreases`` (up to 5),
        per-scaffold ``scaffold_stats`` and ``all_pairs`` (capped at 50), or
        an ``error``/``summary`` dict when RDKit is missing, the CSV is
        unusable, or fewer than two valid compounds remain.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem.Scaffolds import MurckoScaffold
    except ImportError:
        return {"error": "RDKit is required for MMP analysis. Install with: pip install rdkit", "summary": "RDKit is required for MMP analysis. Install with: pip install rdkit"}
    try:
        # Imported once here rather than inside the pair loop; the MCS is
        # optional, so its absence only disables the core_mcs field.
        from rdkit.Chem import rdFMCS
    except ImportError:
        rdFMCS = None

    from itertools import combinations

    import pandas as pd
    import numpy as np

    # The demo set is used for any falsy path (None or ""), so the flag must
    # follow the same truthiness test as the branch below.
    using_demo = not compounds_csv

    # Load or generate demo data
    if not using_demo:
        try:
            df = pd.read_csv(compounds_csv)
        except Exception as e:
            return {"error": f"Could not read CSV: {e}", "summary": f"Failed to load {compounds_csv}"}
        smiles_col = next((c for c in df.columns if c.lower() in ["smiles", "canonical_smiles"]), None)
        if smiles_col is None:
            return {"error": "No SMILES column found (expected 'smiles' or 'canonical_smiles')", "summary": "No SMILES column found (expected 'smiles' or 'canonical_smiles')"}
        if activity_col not in df.columns:
            return {"error": f"Activity column '{activity_col}' not found. Available: {list(df.columns)}", "summary": f"Activity column '{activity_col}' not found. Available: {list(df.columns)}"}
    else:
        # Demo dataset: simple benzamide series. The SMILES are written so
        # the substitution pattern matches each compound's name.
        demo_data = [
            ("c1ccc(C(=O)N)cc1", 5.2, "benzamide"),
            ("NC(=O)c1ccc(F)cc1", 6.1, "4-fluorobenzamide"),
            ("NC(=O)c1ccc(Cl)cc1", 5.8, "4-chlorobenzamide"),
            ("NC(=O)c1ccc(C)cc1", 5.5, "4-methylbenzamide"),
            ("NC(=O)c1ccc(OC)cc1", 6.4, "4-methoxybenzamide"),
            ("NC(=O)c1ccc(O)cc1", 6.0, "4-hydroxybenzamide"),
            ("c1ccc(C(=O)NC)cc1", 5.0, "N-methylbenzamide"),
            ("c1ccc(C(=O)NCC)cc1", 4.7, "N-ethylbenzamide"),
            ("NC(=O)c1cccc(F)c1", 5.9, "3-fluorobenzamide"),
            ("NC(=O)c1ccc(F)c(F)c1", 6.8, "3,4-difluorobenzamide"),
        ]
        df = pd.DataFrame(demo_data, columns=["smiles", activity_col, "name"])
        smiles_col = "smiles"

    # Parse molecules and compute Murcko scaffolds
    parsed = []
    for _, row in df.iterrows():
        entry_smiles = row[smiles_col]
        # Blank cells are read as NaN, which RDKit cannot parse.
        if not isinstance(entry_smiles, str):
            continue
        try:
            activity = float(row[activity_col])
        except (TypeError, ValueError):
            continue  # non-numeric activity cannot contribute a delta
        if np.isnan(activity):
            continue
        mol = Chem.MolFromSmiles(entry_smiles)
        if mol is None:
            continue
        try:
            scaffold_smi = Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(mol))
        except Exception:
            # Best effort: compounds whose scaffold cannot be derived are
            # grouped together under a placeholder.
            scaffold_smi = "unknown"
        label = row.get("name", entry_smiles)
        if pd.isna(label):
            label = entry_smiles
        parsed.append({
            "smiles": entry_smiles,
            "mol": mol,
            "activity": activity,
            "scaffold": scaffold_smi,
            "name": label,
        })

    if len(parsed) < 2:
        return {"error": "Need at least 2 valid compounds for MMP analysis",
                "summary": "Insufficient valid compounds for analysis"}

    # Identify matched pairs: same scaffold, different compounds
    scaffold_groups = {}
    for entry in parsed:
        scaffold_groups.setdefault(entry["scaffold"], []).append(entry)

    pairs = []
    transformations = {}  # scaffold -> list of {"from", "to", "delta"}

    for scaffold, members in scaffold_groups.items():
        # Groups of one yield no pairs; combinations() handles that itself.
        for m1, m2 in combinations(members, 2):
            delta = m2["activity"] - m1["activity"]

            # Find structural difference using MCS
            core_smarts = None
            if rdFMCS is not None:
                try:
                    mcs = rdFMCS.FindMCS(
                        [m1["mol"], m2["mol"]],
                        timeout=2,
                        matchValences=True,
                        ringMatchesRingOnly=True,
                    )
                    if mcs and mcs.numAtoms > 0:
                        core_smarts = mcs.smartsString
                except Exception:
                    # The MCS is informational only; keep the pair without it.
                    core_smarts = None

            # Characterize the transformation by atom count difference
            pairs.append({
                "compound_a": m1["smiles"],
                "compound_b": m2["smiles"],
                "name_a": m1["name"],
                "name_b": m2["name"],
                "activity_a": round(m1["activity"], 3),
                "activity_b": round(m2["activity"], 3),
                "delta_activity": round(delta, 3),
                "scaffold": scaffold,
                "core_mcs": core_smarts,
                "heavy_atom_diff": m2["mol"].GetNumHeavyAtoms() - m1["mol"].GetNumHeavyAtoms(),
            })

            # Track transformations by scaffold
            transformations.setdefault(scaffold, []).append({
                "from": m1["smiles"],
                "to": m2["smiles"],
                "delta": delta,
            })

    # Rank pairs by absolute activity change
    pairs.sort(key=lambda x: abs(x["delta_activity"]), reverse=True)

    # Aggregate transformation statistics per scaffold
    scaffold_stats = []
    for scaffold, trans_list in transformations.items():
        deltas = [t["delta"] for t in trans_list]
        scaffold_stats.append({
            "scaffold": scaffold,
            "n_pairs": len(trans_list),
            "mean_delta": round(float(np.mean(deltas)), 3),
            "max_delta": round(float(np.max(deltas)), 3),
            "min_delta": round(float(np.min(deltas)), 3),
            "std_delta": round(float(np.std(deltas)), 3) if len(deltas) > 1 else 0.0,
        })

    # Find top activity-improving transformations
    top_improvements = [p for p in pairs if p["delta_activity"] > 0][:10]
    top_decreases = sorted(
        (p for p in pairs if p["delta_activity"] < 0),
        key=lambda x: x["delta_activity"],
    )[:5]

    n_scaffolds = len(scaffold_groups)

    summary_lines = [
        f"MMP analysis: {len(parsed)} compounds, {len(pairs)} matched pairs, {n_scaffolds} scaffold(s)",
    ]
    if using_demo:
        summary_lines.append("(Using built-in demo dataset — provide compounds_csv for custom analysis)")
    if top_improvements:
        best = top_improvements[0]
        summary_lines.append(
            f"Best improvement: {best['name_a']} -> {best['name_b']} "
            f"(delta={best['delta_activity']:+.3f})"
        )

    return {
        "summary": "\n".join(summary_lines),
        "n_compounds": len(parsed),
        "n_pairs": len(pairs),
        "n_scaffolds": n_scaffolds,
        "using_demo_data": using_demo,
        "top_improvements": top_improvements,
        "top_decreases": top_decreases,
        "scaffold_stats": scaffold_stats,
        "all_pairs": pairs[:50],  # cap output
    }
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
@registry.register(
    name="chemistry.scaffold_hop",
    description="Suggest scaffold replacements and bioisosteres for a compound",
    category="chemistry",
    parameters={
        "smiles": "SMILES string for the input compound",
    },
    usage_guide="You want to explore alternative scaffolds for a hit compound — either to improve properties, escape a patent, or find novel chemical matter. Generates bioisosteric replacements for functional groups and suggests scaffold hops based on the Murcko framework.",
)
def scaffold_hop(smiles: str, **kwargs) -> dict:
    """Suggest scaffold replacements and bioisosteric substitutions.

    Extracts the Murcko scaffold and its generic framework, matches a fixed
    table of functional-group SMARTS against the molecule, and reports the
    bioisosteres and ring swaps associated with every pattern that matched.

    Args:
        smiles: Input passed through ``_extract_smiles`` before parsing.
        **kwargs: Accepted for registry compatibility; not read here.

    Returns:
        A dict with ``summary``, ``input_smiles``, ``murcko_scaffold``,
        ``generic_framework``, ``detected_functional_groups``,
        ``bioisostere_suggestions``, ``scaffold_replacements`` and
        ``property_context``. If RDKit is missing or the SMILES cannot be
        parsed, a dict with ``error`` and ``summary`` is returned instead.
    """
    smiles = _extract_smiles(smiles)
    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors, rdMolDescriptors
        from rdkit.Chem.Scaffolds import MurckoScaffold
    except ImportError:
        return {"error": "RDKit is required for scaffold hopping. Install with: pip install rdkit", "summary": "RDKit is required for scaffold hopping. Install with: pip install rdkit"}

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {"error": f"Invalid SMILES: {smiles}", "summary": f"Could not parse SMILES: {smiles}"}

    # Extract Murcko scaffold. Any RDKit failure degrades to "N/A" for both
    # values rather than aborting the whole analysis.
    try:
        scaffold_mol = MurckoScaffold.GetScaffoldForMol(mol)
        scaffold_smi = Chem.MolToSmiles(scaffold_mol)
        generic_scaffold = MurckoScaffold.MakeScaffoldGeneric(scaffold_mol)
        generic_smi = Chem.MolToSmiles(generic_scaffold)
    except Exception:
        scaffold_smi = "N/A"
        generic_smi = "N/A"

    # Identify functional groups via SMARTS matching
    # Each entry: (name, smarts, bioisosteres)
    fg_definitions = [
        ("carboxylic_acid", "[CX3](=O)[OX2H1]", [
            {"replacement": "tetrazole", "smiles_fragment": "c1nnn[nH]1",
             "rationale": "Classic carboxylic acid bioisostere — similar pKa, improved metabolic stability and permeability"},
            {"replacement": "acyl sulfonamide", "smiles_fragment": "C(=O)NS(=O)=O",
             "rationale": "Acidic NH mimics carboxylate — good for oral bioavailability"},
            {"replacement": "hydroxamic acid", "smiles_fragment": "C(=O)NO",
             "rationale": "Maintains H-bond donor/acceptor pattern — also a zinc-binding group"},
        ]),
        ("amide", "[NX3][CX3](=[OX1])[#6]", [
            {"replacement": "sulfonamide", "smiles_fragment": "NS(=O)(=O)",
             "rationale": "Similar geometry and H-bonding — often improved metabolic stability"},
            {"replacement": "urea", "smiles_fragment": "NC(=O)N",
             "rationale": "Additional H-bond donor — can improve target binding"},
            {"replacement": "reversed amide", "smiles_fragment": "C(=O)N (reversed)",
             "rationale": "Switching C(=O)NH to NHC(=O) — changes metabolic soft spot"},
            # c1ncon1 is the 1,2,4-isomer; the previous fragment (c1nonc1)
            # encoded 1,2,5-oxadiazole and contradicted the label.
            {"replacement": "1,2,4-oxadiazole", "smiles_fragment": "c1ncon1",
             "rationale": "Planar amide bioisostere — improved metabolic stability"},
        ]),
        ("phenyl", "c1ccccc1", [
            {"replacement": "pyridine", "smiles_fragment": "c1ccncc1",
             "rationale": "Introduces H-bond acceptor — improves solubility and can modulate pKa"},
            {"replacement": "pyrimidine", "smiles_fragment": "c1ncncc1",
             "rationale": "Two nitrogen atoms — further improved solubility vs pyridine"},
            {"replacement": "cyclohexane", "smiles_fragment": "C1CCCCC1",
             "rationale": "sp3-rich replacement — escape flatness, improve Fsp3 and solubility (Lovering)"},
            {"replacement": "thiophene", "smiles_fragment": "c1ccsc1",
             "rationale": "5-membered aromatic — different vector geometry, often similar binding"},
        ]),
        ("ester", "[#6][CX3](=O)[OX2][#6]", [
            {"replacement": "amide", "smiles_fragment": "C(=O)N",
             "rationale": "Much more metabolically stable — standard ester prodrug reversal"},
            # The previous fragment (c1nonn1) had three ring nitrogens, i.e.
            # an oxatriazole rather than an oxadiazole.
            {"replacement": "oxadiazole", "smiles_fragment": "c1ncon1",
             "rationale": "Planar ester bioisostere — metabolically stable"},
        ]),
        ("sulfonamide", "[NX3]S(=O)(=O)", [
            {"replacement": "amide", "smiles_fragment": "NC(=O)",
             "rationale": "Simpler, often similar activity — different metabolic profile"},
            {"replacement": "reverse sulfonamide", "smiles_fragment": "S(=O)(=O)N (reversed)",
             "rationale": "Switch N and C sides of sulfonamide"},
        ]),
        ("hydroxyl", "[OX2H]", [
            {"replacement": "fluorine", "smiles_fragment": "F",
             "rationale": "Similar size, H-bond acceptor only — blocks metabolic oxidation site"},
            {"replacement": "amine", "smiles_fragment": "N",
             "rationale": "H-bond donor and acceptor — different pKa profile"},
            {"replacement": "methoxy", "smiles_fragment": "OC",
             "rationale": "Caps the OH — blocks glucuronidation, changes H-bonding"},
        ]),
        ("nitrile", "[CX2]#[NX1]", [
            {"replacement": "isoxazole", "smiles_fragment": "c1ccon1",
             "rationale": "Ring-based CN mimic — similar dipole and H-bond accepting"},
        ]),
        ("fluorine", "[F]", [
            {"replacement": "chlorine", "smiles_fragment": "Cl",
             "rationale": "Larger halogen — increased lipophilicity, different steric profile"},
            {"replacement": "hydrogen", "smiles_fragment": "[H]",
             "rationale": "Remove halogen — simplify molecule, assess fluorine contribution"},
            {"replacement": "trifluoromethyl", "smiles_fragment": "C(F)(F)F",
             "rationale": "Strongly electron-withdrawing — metabolically stable, increases lipophilicity"},
        ]),
    ]

    # Match functional groups
    detected_groups = []
    all_bioisosteres = []

    for fg_name, smarts, bioisosteres in fg_definitions:
        pattern = Chem.MolFromSmarts(smarts)
        if pattern is None:
            continue
        matches = mol.GetSubstructMatches(pattern)
        if not matches:
            continue
        detected_groups.append({
            "group": fg_name,
            "count": len(matches),
            "atom_indices": [list(m) for m in matches],
        })
        # Suggestions are emitted once per detected group type, not per match.
        all_bioisosteres.extend(
            {"original_group": fg_name, **bio} for bio in bioisosteres
        )

    # Scaffold replacement suggestions
    scaffold_replacements = []

    # Detect ring systems in the scaffold
    if scaffold_smi != "N/A":
        scaf_mol = Chem.MolFromSmiles(scaffold_smi)
        if scaf_mol:
            # Common scaffold hops based on ring system
            ring_replacements = {
                "c1ccccc1": [  # benzene
                    ("c1ccncc1", "phenyl -> pyridyl (N-walk around ring)"),
                    ("c1ccoc1", "phenyl -> furanyl (ring contraction)"),
                    ("c1ccsc1", "phenyl -> thiophenyl (5-mem heterocycle)"),
                    ("C1CCCCC1", "phenyl -> cyclohexyl (sp3 escape)"),
                    ("c1cc[nH]c1", "phenyl -> pyrrolyl (electron-rich 5-mem)"),
                ],
                "c1ccncc1": [  # pyridine
                    ("c1ccccc1", "pyridyl -> phenyl (remove N)"),
                    ("c1ncncc1", "pyridyl -> pyrimidyl (add N)"),
                    ("c1ccnnc1", "pyridyl -> pyridazinyl (adjacent N)"),
                ],
                "c1cc[nH]c1": [  # pyrrole
                    ("c1ccoc1", "pyrrolyl -> furanyl"),
                    ("c1ccsc1", "pyrrolyl -> thiophenyl"),
                ],
            }

            for ring_smi, replacements in ring_replacements.items():
                # The ring SMILES doubles as the SMARTS query.
                ring_pat = Chem.MolFromSmarts(ring_smi)
                if ring_pat and scaf_mol.HasSubstructMatch(ring_pat):
                    scaffold_replacements.extend(
                        {
                            "original_ring": ring_smi,
                            "replacement_ring": repl_smi,
                            "description": description,
                        }
                        for repl_smi, description in replacements
                    )

    # Compute properties for context
    mw = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    tpsa = Descriptors.TPSA(mol)
    fsp3 = rdMolDescriptors.CalcFractionCSP3(mol)

    suggestions = []
    if fsp3 < 0.25:
        suggestions.append(
            f"Low Fsp3 ({fsp3:.2f}) — consider sp3-rich scaffold hops (phenyl->cyclohexyl) to improve solubility"
        )
    if logp > 4:
        suggestions.append(
            f"High LogP ({logp:.1f}) — add heteroatoms or polar groups to improve solubility"
        )
    if tpsa < 40:
        suggestions.append(
            f"Low TPSA ({tpsa:.0f}) — may have poor solubility; consider adding H-bond acceptors"
        )

    property_context = {
        "molecular_weight": round(mw, 1),
        "logp": round(logp, 2),
        "tpsa": round(tpsa, 1),
        "fsp3": round(fsp3, 3),
        "suggestions_for_improvement": suggestions,
    }

    # Summary
    if detected_groups:
        group_text = ", ".join(g["group"] for g in detected_groups)
    else:
        group_text = "none identified"
    summary_lines = [
        f"Scaffold analysis for {smiles}",
        f"Murcko scaffold: {scaffold_smi}",
        f"Generic framework: {generic_smi}",
        f"Detected functional groups: {group_text}",
        f"Bioisostere suggestions: {len(all_bioisosteres)}",
        f"Scaffold hop options: {len(scaffold_replacements)}",
    ]

    return {
        "summary": "\n".join(summary_lines),
        "input_smiles": smiles,
        "murcko_scaffold": scaffold_smi,
        "generic_framework": generic_smi,
        "detected_functional_groups": detected_groups,
        "bioisostere_suggestions": all_bioisosteres,
        "scaffold_replacements": scaffold_replacements,
        "property_context": property_context,
    }
|
|
699
|
+
|
|
700
|
+
|
|
701
|
+
@registry.register(
    name="chemistry.pubchem_lookup",
    description="Look up compound data from PubChem by name or SMILES",
    category="chemistry",
    parameters={
        "query": "Compound name or SMILES string",
        "query_type": "Type of query: 'name' or 'smiles' (default 'name')",
    },
    usage_guide="You need compound information (structure, properties, synonyms, CID) from PubChem. Use when identifying a compound by name or validating a SMILES string. Returns canonical SMILES, physicochemical properties, and identifiers.",
)
def pubchem_lookup(query: str, query_type: str = "name", **kwargs) -> dict:
    """Look up compound data from the PubChem PUG REST API.

    Resolves ``query`` to a CID, then fetches a property table and the
    synonym list for that CID. Property and synonym failures are tolerated:
    the result is still returned with whatever could be retrieved.

    Args:
        query: Compound name or SMILES string.
        query_type: ``"smiles"`` (case-insensitive) sends the query as a
            POSTed SMILES; any other value is treated as a compound name.
        **kwargs: Accepted for registry compatibility; not read here.

    Returns:
        A dict with ``summary``, ``cid``, ``canonical_smiles``,
        ``properties``, ``synonyms`` and ``pubchem_url``. When the CID cannot
        be resolved, a dict with ``error`` and ``summary`` is returned.
    """
    import urllib.parse

    base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
    # Normalise so "SMILES" / " smiles " do not silently take the name path.
    query_type = str(query_type or "name").strip().lower()

    # Step 1: Resolve query to CID
    if query_type == "smiles":
        # Use POST for SMILES to handle special characters
        resp, error = request(
            "POST",
            f"{base_url}/compound/smiles/JSON",
            data={"smiles": query},
            timeout=10,
            retries=2,
            raise_for_status=False,
        )
    else:
        # URL-encode the compound name
        encoded_query = urllib.parse.quote(query, safe="")
        resp, error = request(
            "GET",
            f"{base_url}/compound/name/{encoded_query}/JSON",
            timeout=10,
            retries=2,
            raise_for_status=False,
        )
    if error:
        return {"error": f"HTTP error: {error}", "summary": f"PubChem lookup failed: {error}"}

    if resp.status_code == 404:
        return {
            "error": f"Compound not found: {query}",
            "summary": f"PubChem: no compound found for '{query}' (query_type={query_type})",
        }
    if resp.status_code != 200:
        return {
            "error": f"PubChem API error (HTTP {resp.status_code})",
            "summary": f"PubChem lookup failed with status {resp.status_code}",
        }

    try:
        data = resp.json()
    except Exception:
        return {"error": "Failed to parse PubChem response", "summary": "PubChem returned invalid JSON"}

    # Extract CID
    compounds = data.get("PC_Compounds", [])
    if not compounds:
        return {"error": "No compound data in response", "summary": f"PubChem returned empty result for '{query}'"}

    cid = compounds[0].get("id", {}).get("id", {}).get("cid")
    if not cid:
        return {"error": "Could not extract CID", "summary": "PubChem response missing CID"}

    # Step 2: Get properties (best effort — an empty table is acceptable)
    props_url = (
        f"{base_url}/compound/cid/{cid}/property/"
        "MolecularFormula,MolecularWeight,CanonicalSMILES,IsomericSMILES,"
        "XLogP,ExactMass,TPSA,HBondDonorCount,HBondAcceptorCount,"
        "RotatableBondCount,HeavyAtomCount,Complexity/JSON"
    )
    props_resp, props_error = request(
        "GET",
        props_url,
        timeout=10,
        retries=2,
        raise_for_status=False,
    )
    props_data = {}
    if not props_error and props_resp.status_code == 200:
        try:
            props_data = props_resp.json()
        except Exception:
            props_data = {}

    properties = {}
    prop_table = props_data.get("PropertyTable", {}).get("Properties", [])
    if prop_table:
        p = prop_table[0]
        # PubChem may answer with the legacy keys or with their successors:
        # "ConnectivitySMILES" replaces CanonicalSMILES (no stereo/isotopes)
        # and "SMILES" replaces IsomericSMILES. Prefer the connectivity form
        # for `canonical` so it is not filled with the stereo-bearing string.
        canonical = (
            p.get("CanonicalSMILES")
            or p.get("ConnectivitySMILES")
            or p.get("SMILES")
        )
        isomeric = p.get("IsomericSMILES") or p.get("SMILES") or canonical
        properties = {
            "cid": p.get("CID"),
            "molecular_formula": p.get("MolecularFormula"),
            "molecular_weight": p.get("MolecularWeight"),
            "canonical_smiles": canonical,
            "isomeric_smiles": isomeric,
            "xlogp": p.get("XLogP"),
            "exact_mass": p.get("ExactMass"),
            "tpsa": p.get("TPSA"),
            "hbd": p.get("HBondDonorCount"),
            "hba": p.get("HBondAcceptorCount"),
            "rotatable_bonds": p.get("RotatableBondCount"),
            "heavy_atoms": p.get("HeavyAtomCount"),
            "complexity": p.get("Complexity"),
        }

    # Step 3: Get synonyms (best effort)
    synonyms = []
    syn_resp, syn_error = request(
        "GET",
        f"{base_url}/compound/cid/{cid}/synonyms/JSON",
        timeout=10,
        retries=2,
        raise_for_status=False,
    )
    if not syn_error and syn_resp.status_code == 200:
        try:
            syn_list = syn_resp.json().get("InformationList", {}).get("Information", [])
            if syn_list:
                synonyms = syn_list[0].get("Synonym", [])[:20]  # cap at 20
        except Exception:
            # Synonyms are optional; a malformed payload leaves the list empty.
            synonyms = []

    def _shown(key):
        # A key that is present but null should read "N/A", not "None";
        # an explicit `is None` test keeps legitimate zeros (e.g. XLogP 0).
        value = properties.get(key)
        return "N/A" if value is None else value

    # Build summary
    summary_lines = [
        f"PubChem: {query} (CID {cid})",
        f"Formula: {_shown('molecular_formula')}, MW: {_shown('molecular_weight')}, XLogP: {_shown('xlogp')}",
        f"SMILES: {_shown('canonical_smiles')}",
    ]
    if synonyms:
        summary_lines.append(f"Also known as: {', '.join(synonyms[:5])}")

    return {
        "summary": "\n".join(summary_lines),
        "cid": cid,
        "canonical_smiles": properties.get("canonical_smiles"),
        "properties": properties,
        "synonyms": synonyms,
        "pubchem_url": f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}",
    }
|
|
861
|
+
|
|
862
|
+
|
|
863
|
+
# ─── Retrosynthetic transforms (SMARTS) ─────────────────────────
|
|
864
|
+
# Each transform: (name, product_smarts, reactant_smarts_list, reagents, conditions)
|
|
865
|
+
_RETRO_TRANSFORMS = [
|
|
866
|
+
{
|
|
867
|
+
"name": "Amide bond disconnection",
|
|
868
|
+
"description": "Disconnect C(=O)-N amide bond → carboxylic acid + amine",
|
|
869
|
+
"product_smarts": "[C:1](=[O:2])-[N:3]",
|
|
870
|
+
"reagents": ["HATU", "DIPEA"],
|
|
871
|
+
"conditions": "Amide coupling, DMF, RT, 12h",
|
|
872
|
+
"reaction_class": "amide_coupling",
|
|
873
|
+
},
|
|
874
|
+
{
|
|
875
|
+
"name": "Suzuki coupling",
|
|
876
|
+
"description": "Disconnect Ar-Ar biaryl bond → aryl boronic acid + aryl halide",
|
|
877
|
+
"product_smarts": "[c:1]-[c:2]",
|
|
878
|
+
"reagents": ["Pd(PPh3)4", "K2CO3"],
|
|
879
|
+
"conditions": "Suzuki coupling, dioxane/H2O, 80°C, 16h",
|
|
880
|
+
"reaction_class": "cross_coupling",
|
|
881
|
+
},
|
|
882
|
+
{
|
|
883
|
+
"name": "Ester hydrolysis",
|
|
884
|
+
"description": "Disconnect C(=O)-O ester bond → carboxylic acid + alcohol",
|
|
885
|
+
"product_smarts": "[C:1](=[O:2])-[O:3][C:4]",
|
|
886
|
+
"reagents": ["DCC", "DMAP"],
|
|
887
|
+
"conditions": "Esterification, DCM, RT, 4h",
|
|
888
|
+
"reaction_class": "esterification",
|
|
889
|
+
},
|
|
890
|
+
{
|
|
891
|
+
"name": "Reductive amination",
|
|
892
|
+
"description": "Disconnect C-N bond adjacent to C-H → aldehyde/ketone + amine",
|
|
893
|
+
"product_smarts": "[C:1]-[NH:2]",
|
|
894
|
+
"reagents": ["NaBH3CN", "AcOH"],
|
|
895
|
+
"conditions": "Reductive amination, MeOH, RT, 16h",
|
|
896
|
+
"reaction_class": "reductive_amination",
|
|
897
|
+
},
|
|
898
|
+
{
|
|
899
|
+
"name": "N-alkylation",
|
|
900
|
+
"description": "Disconnect N-C(sp3) bond → amine + alkyl halide",
|
|
901
|
+
"product_smarts": "[N:1]-[CH2:2]",
|
|
902
|
+
"reagents": ["K2CO3"],
|
|
903
|
+
"conditions": "N-alkylation, DMF, 60°C, 12h",
|
|
904
|
+
"reaction_class": "alkylation",
|
|
905
|
+
},
|
|
906
|
+
{
|
|
907
|
+
"name": "Ether formation (Williamson)",
|
|
908
|
+
"description": "Disconnect C-O-C ether bond → alcohol + alkyl halide",
|
|
909
|
+
"product_smarts": "[C:1]-[O:2]-[C:3]",
|
|
910
|
+
"reagents": ["NaH"],
|
|
911
|
+
"conditions": "Williamson ether synthesis, THF, 0°C→RT, 6h",
|
|
912
|
+
"reaction_class": "etherification",
|
|
913
|
+
},
|
|
914
|
+
{
|
|
915
|
+
"name": "Sulfonamide formation",
|
|
916
|
+
"description": "Disconnect S(=O)(=O)-N bond → sulfonyl chloride + amine",
|
|
917
|
+
"product_smarts": "[S:1](=[O:2])(=[O:3])-[N:4]",
|
|
918
|
+
"reagents": ["Et3N"],
|
|
919
|
+
"conditions": "Sulfonamide coupling, DCM, 0°C→RT, 4h",
|
|
920
|
+
"reaction_class": "sulfonamide_formation",
|
|
921
|
+
},
|
|
922
|
+
{
|
|
923
|
+
"name": "Urea formation",
|
|
924
|
+
"description": "Disconnect N-C(=O)-N urea → isocyanate + amine",
|
|
925
|
+
"product_smarts": "[N:1]-[C:2](=[O:3])-[N:4]",
|
|
926
|
+
"reagents": ["CDI or triphosgene"],
|
|
927
|
+
"conditions": "Urea formation, DCM, RT, 12h",
|
|
928
|
+
"reaction_class": "urea_formation",
|
|
929
|
+
},
|
|
930
|
+
]
|
|
931
|
+
|
|
932
|
+
|
|
933
|
+
@registry.register(
    name="chemistry.retrosynthesis",
    description="Plan retrosynthetic routes for a target molecule — uses IBM RXN API if configured, otherwise heuristic SMARTS-based disconnections",
    category="chemistry",
    parameters={
        "smiles": "SMILES string of the target molecule",
        "max_steps": "Maximum retrosynthetic steps (default 3)",
    },
    usage_guide="You want to plan a synthetic route to make a target compound. Use for synthesis feasibility assessment, identifying key disconnections, and suggesting reagents/conditions. Provides heuristic retrosynthetic analysis using common transforms; optionally uses IBM RXN API if an API key is configured.",
)
def retrosynthesis(smiles: str, max_steps: int = 3, **kwargs) -> dict:
    """Plan retrosynthetic routes for a target molecule.

    Tries the IBM RXN API first when ``api.ibm_rxn_key`` is set on the
    session config, and otherwise (or if that call reports an error) falls
    back to the heuristic SMARTS-based analysis.

    Args:
        smiles: Target input, passed through ``_extract_smiles`` first.
        max_steps: Maximum number of steps requested. Values that cannot be
            converted to an integer fall back to 3; values below 1 become 1.
        **kwargs: ``_session`` is read if present; its ``config`` supplies
            the IBM RXN API key.

    Returns:
        The result dict of whichever backend produced the answer.
    """
    import logging

    smiles = _extract_smiles(smiles)

    # Tool callers may hand over "3" or None; normalise instead of forwarding
    # a non-integer to the backends.
    try:
        max_steps = max(1, int(max_steps))
    except (TypeError, ValueError):
        max_steps = 3

    # Try IBM RXN API first
    session = kwargs.get("_session")
    api_key = None
    if session and hasattr(session, "config"):
        api_key = session.config.get("api.ibm_rxn_key", None)

    if api_key:
        result = _retrosynthesis_ibm_rxn(smiles, max_steps, api_key)
        if result and "error" not in result:
            return result
        # Record why the configured API was not used instead of dropping the
        # failure silently; the heuristic result is still returned below.
        logging.getLogger("ct.tools.chemistry").info(
            "IBM RXN retrosynthesis unavailable, using heuristic fallback: %s",
            (result or {}).get("error", "no result"),
        )

    # Fall back to heuristic RDKit retrosynthesis
    return _retrosynthesis_heuristic(smiles, max_steps)
|
|
965
|
+
|
|
966
|
+
|
|
967
|
+
def _retrosynthesis_ibm_rxn(
    smiles: str,
    max_steps: int,
    api_key: str,
    poll_interval: float = 5.0,
    max_polls: int = 12,
) -> dict:
    """Call the IBM RXN API for a retrosynthesis prediction.

    Submits the prediction, then polls the results endpoint until it reports
    ``SUCCESS`` or ``FAILED`` or the polling budget is exhausted.

    Args:
        smiles: Target molecule sent as the request ``content``.
        max_steps: Sent as ``maxSteps``.
        api_key: Value placed in the ``Authorization`` header.
        poll_interval: Seconds slept before each poll attempt.
        max_polls: Number of poll attempts (defaults give up to 60 s of
            sleeping, excluding request time).

    Returns:
        The parsed routes from ``_parse_ibm_rxn_results`` on success,
        otherwise a dict with ``error`` and ``summary``.
    """
    import logging
    import time

    logger = logging.getLogger("ct.tools.chemistry")

    base_url = "https://rxn.res.ibm.com/rxn/api/api/v1"
    headers = {
        "Authorization": api_key,
        "Content-Type": "application/json",
    }

    def _failure(message: str) -> dict:
        # Every error result carries the same text under both keys.
        return {"error": message, "summary": message}

    # Submit retrosynthesis prediction
    resp, error = request(
        "POST",
        f"{base_url}/retrosynthesis/predict",
        json={"content": smiles, "maxSteps": max_steps},
        headers=headers,
        timeout=30,
        retries=2,
        raise_for_status=False,
    )
    if error:
        return _failure(f"IBM RXN API request failed: {error}")
    if resp.status_code != 200:
        return _failure(f"IBM RXN API returned status {resp.status_code}")
    try:
        submitted = resp.json()
    except Exception:
        return _failure("IBM RXN API returned invalid JSON")
    if not isinstance(submitted, dict):
        # A JSON body that is not an object cannot carry a prediction id.
        return _failure("IBM RXN API returned invalid JSON")
    prediction_id = submitted.get("prediction_id")
    if not prediction_id:
        return _failure("IBM RXN API did not return a prediction ID")

    # Poll for results (up to 60 seconds with the default settings)
    for _ in range(max_polls):
        time.sleep(poll_interval)
        poll_resp, poll_error = request(
            "GET",
            f"{base_url}/retrosynthesis/results/{prediction_id}",
            headers=headers,
            timeout=15,
            retries=1,
            raise_for_status=False,
        )
        if poll_error:
            logger.debug("IBM RXN poll attempt failed: %s", poll_error)
            continue
        if poll_resp.status_code != 200:
            continue
        try:
            data = poll_resp.json()
        except Exception:
            continue
        if not isinstance(data, dict):
            # Treat an unexpected payload shape as "not ready yet" rather
            # than crashing on data.get().
            continue
        status = data.get("status", "")
        if status == "SUCCESS":
            return _parse_ibm_rxn_results(smiles, data)
        if status == "FAILED":
            return _failure("IBM RXN retrosynthesis failed")

    return _failure("IBM RXN API timed out waiting for results")
|
|
1025
|
+
def _parse_ibm_rxn_results(smiles: str, data: dict) -> dict:
|
|
1026
|
+
"""Parse IBM RXN API retrosynthesis results into standard format."""
|
|
1027
|
+
routes = []
|
|
1028
|
+
retro_routes = data.get("retrosynthetic_paths", [])
|
|
1029
|
+
|
|
1030
|
+
for i, route in enumerate(retro_routes):
|
|
1031
|
+
steps = []
|
|
1032
|
+
for step in route.get("steps", []):
|
|
1033
|
+
steps.append({
|
|
1034
|
+
"reaction_smiles": step.get("reaction", ""),
|
|
1035
|
+
"reactants": step.get("reactants", []),
|
|
1036
|
+
"confidence": step.get("confidence", 0.0),
|
|
1037
|
+
})
|
|
1038
|
+
routes.append({
|
|
1039
|
+
"route_id": i + 1,
|
|
1040
|
+
"n_steps": len(steps),
|
|
1041
|
+
"steps": steps,
|
|
1042
|
+
"confidence": route.get("confidence", 0.0),
|
|
1043
|
+
})
|
|
1044
|
+
|
|
1045
|
+
routes.sort(key=lambda r: r["n_steps"])
|
|
1046
|
+
shortest = routes[0]["n_steps"] if routes else 0
|
|
1047
|
+
|
|
1048
|
+
return {
|
|
1049
|
+
"summary": f"Retrosynthesis for {smiles}: {len(routes)} routes found via IBM RXN, "
|
|
1050
|
+
f"shortest is {shortest} steps",
|
|
1051
|
+
"target": smiles,
|
|
1052
|
+
"source": "ibm_rxn",
|
|
1053
|
+
"n_routes": len(routes),
|
|
1054
|
+
"routes": routes,
|
|
1055
|
+
}
|
|
1056
|
+
|
|
1057
|
+
|
|
1058
|
+
def _retrosynthesis_heuristic(smiles: str, max_steps: int) -> dict:
    """Heuristic retrosynthesis using RDKit SMARTS-based disconnections.

    Every entry of ``_RETRO_TRANSFORMS`` whose ``product_smarts`` matches the
    molecule yields one single-step route. A BRICS decomposition is added as
    one further route when it produces more than one fragment.

    Args:
        smiles: SMILES string of the target molecule.
        max_steps: Accepted for signature parity with the IBM RXN backend;
            this implementation does not read it.

    Returns:
        A dict with ``summary``, ``target``, ``source`` ("heuristic"),
        ``n_routes``, ``routes`` and ``disconnections`` (plus ``formula``,
        ``molecular_weight`` and ``brics_fragments`` when at least one
        disconnection matched). If RDKit is missing or the SMILES is invalid,
        a dict with ``error`` and ``summary`` is returned.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors, rdMolDescriptors
    except ImportError:
        # Same convention as the other chemistry tools: report, don't raise.
        message = "RDKit is required for retrosynthesis. Install with: pip install rdkit"
        return {"error": message, "summary": message}

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {"error": f"Invalid SMILES: {smiles}", "summary": f"Could not parse SMILES: {smiles}"}

    # Find applicable disconnections
    disconnections = []
    for transform in _RETRO_TRANSFORMS:
        pattern = Chem.MolFromSmarts(transform["product_smarts"])
        if pattern is None:
            continue
        matches = mol.GetSubstructMatches(pattern)
        if not matches:
            continue
        disconnections.append({
            "transform_name": transform["name"],
            "description": transform["description"],
            "n_sites": len(matches),
            "atom_indices": [list(m) for m in matches[:3]],  # cap at 3
            "reagents": transform["reagents"],
            "conditions": transform["conditions"],
            "reaction_class": transform["reaction_class"],
        })

    if not disconnections:
        return {
            "summary": f"[HEURISTIC FALLBACK] Retrosynthesis for {smiles}: no heuristic disconnections found — "
                       "molecule may require specialized chemistry. Configure api.ibm_rxn_key for AI-powered retrosynthesis.",
            "target": smiles,
            "source": "heuristic",
            "n_routes": 0,
            "routes": [],
            "disconnections": [],
        }

    # Build routes: each disconnection is a potential first step.
    # Only single-step disconnections are reported; BRICS below provides the
    # deeper fragmentation.
    routes = [
        {
            "route_id": route_id,
            "strategy": disc["transform_name"],
            "n_steps": 1,
            "steps": [{
                "step": 1,
                "transform": disc["transform_name"],
                "description": disc["description"],
                "reagents": disc["reagents"],
                "conditions": disc["conditions"],
                "n_disconnection_sites": disc["n_sites"],
            }],
            "reaction_class": disc["reaction_class"],
        }
        for route_id, disc in enumerate(disconnections, start=1)
    ]

    # BRICS decomposition for deeper analysis (best effort).
    brics_fragments = []
    try:
        from rdkit.Chem import BRICS
        frags = BRICS.BRICSDecompose(mol, returnMols=False)
        # BRICSDecompose yields a set; sort before capping so the fragments
        # kept and their order do not vary with the string hash seed.
        brics_fragments = sorted(frags)[:10]  # cap output
    except Exception:
        # Deliberate: a BRICS failure only drops the optional extra route.
        brics_fragments = []

    # Add a BRICS-based route if fragments found
    if len(brics_fragments) > 1:
        routes.append({
            "route_id": len(routes) + 1,
            "strategy": "BRICS full decomposition",
            "n_steps": len(brics_fragments),
            "steps": [
                {
                    "step": number,
                    "fragment": frag,
                    "description": f"BRICS fragment {number}",
                }
                for number, frag in enumerate(brics_fragments, start=1)
            ],
            "reaction_class": "brics",
        })

    # Molecular properties for context
    mw = Descriptors.MolWt(mol)
    formula = rdMolDescriptors.CalcMolFormula(mol)

    # Sort routes by step count (stable, so ties keep discovery order)
    routes.sort(key=lambda r: r["n_steps"])

    return {
        "summary": f"[HEURISTIC FALLBACK] Retrosynthesis for {formula} ({smiles}): {len(routes)} routes found "
                   f"via SMARTS-based disconnection (not AI-predicted). Configure api.ibm_rxn_key for more accurate routes.",
        "target": smiles,
        "formula": formula,
        "molecular_weight": round(mw, 1),
        "source": "heuristic",
        "n_routes": len(routes),
        "routes": routes,
        "disconnections": disconnections,
        "brics_fragments": brics_fragments,
    }
|
|
1163
|
+
|
|
1164
|
+
|
|
1165
|
+
# ─── Pharmacophore feature SMARTS definitions ──────────────────
|
|
1166
|
+
_PHARMACOPHORE_FEATURES = {
|
|
1167
|
+
"HBD": {
|
|
1168
|
+
"name": "Hydrogen Bond Donor",
|
|
1169
|
+
"smarts": ["[#7!H0&!$(N-[SX4](=O)(=O)[CX4](F)(F)F)]", "[#8!H0&!$([OH][C,S,P]=O)]", "[#16!H0]"],
|
|
1170
|
+
},
|
|
1171
|
+
"HBA": {
|
|
1172
|
+
"name": "Hydrogen Bond Acceptor",
|
|
1173
|
+
"smarts": ["[#7&!$([nH])&!$(N-N=O)]", "[$([O])&!$([OX2](C)C=O)]", "[#16&X2]"],
|
|
1174
|
+
},
|
|
1175
|
+
"Aromatic": {
|
|
1176
|
+
"name": "Aromatic Ring",
|
|
1177
|
+
"smarts": ["a1aaaaa1", "a1aaaa1"],
|
|
1178
|
+
},
|
|
1179
|
+
"Hydrophobic": {
|
|
1180
|
+
"name": "Hydrophobic",
|
|
1181
|
+
"smarts": ["[CH2X4,CH1X4,CH0X4]", "[$([cX3](:*):*)&!$([cX3](-[OH])-[OH])]"],
|
|
1182
|
+
},
|
|
1183
|
+
"PosIonizable": {
|
|
1184
|
+
"name": "Positive Ionizable",
|
|
1185
|
+
"smarts": ["[+,+2,+3,+4]", "[$([NX3&!$([NX3]-O)](-C)(-C)-C)]", "[$(n1cc[nH]c1)]"],
|
|
1186
|
+
},
|
|
1187
|
+
"NegIonizable": {
|
|
1188
|
+
"name": "Negative Ionizable",
|
|
1189
|
+
"smarts": ["[-,-2,-3,-4]", "[$([OH]-[CX3]=[OX1])]", "[$([OH]-[SX4](=[OX1])(=[OX1]))]"],
|
|
1190
|
+
},
|
|
1191
|
+
}
|
|
1192
|
+
|
|
1193
|
+
|
|
1194
|
+
@registry.register(
    name="chemistry.pharmacophore",
    description="Generate a pharmacophore model from a set of active compounds identifying common molecular features",
    category="chemistry",
    parameters={
        "smiles_list": "List of SMILES strings for active compounds",
        "method": "Analysis method: 'common_features' (default) or 'fingerprints'",
    },
    usage_guide="You have a set of active compounds and want to identify the common pharmacophoric features that drive activity. Use for understanding SAR, virtual screening, and lead optimization. Identifies shared HBD, HBA, aromatic, hydrophobic, and ionizable features across the compound set.",
)
def pharmacophore(smiles_list: list = None, method: str = "common_features", **kwargs) -> dict:
    """Generate a pharmacophore model from a set of active compounds.

    Counts SMARTS matches for every feature type in
    ``_PHARMACOPHORE_FEATURES`` (HBD, HBA, Aromatic, Hydrophobic,
    PosIonizable, NegIonizable) in each compound, reports the features found
    in every compound, and derives a consensus score. The common-feature
    analysis always runs, whatever ``method`` is.

    Args:
        smiles_list: Compounds to analyse. Each entry is passed through
            ``_extract_smiles`` before parsing. A bare string is treated as a
            single entry, never as a sequence of characters.
        method: ``"fingerprints"`` or ``"both"`` additionally computes the
            mean pairwise Tanimoto similarity of Gobbi 2D pharmacophore
            fingerprints. Any other value runs the common-feature analysis
            only.
        **kwargs: Accepted and ignored.

    Returns:
        On success, a dict with ``summary``, ``n_compounds``, ``n_valid``,
        ``common_features``, ``feature_distribution``,
        ``per_molecule_features``, ``consensus_score`` and ``method``.
        ``invalid_smiles`` is added when some entries could not be parsed,
        ``pharmacophore_fp_similarity`` when fingerprints were computed, and
        ``pharmacophore_fp_error`` when fingerprinting was requested but
        failed. With fewer than two usable compounds, a dict with ``error``
        and ``summary`` (plus ``invalid_smiles`` after parsing) is returned.
    """
    from itertools import combinations

    from rdkit import Chem, DataStructs

    # A bare string would otherwise be iterated character by character, and
    # single characters such as "C" or "O" are themselves valid SMILES.
    if isinstance(smiles_list, str):
        smiles_list = [smiles_list]

    if not smiles_list or len(smiles_list) < 2:
        return {
            "error": "Need at least 2 SMILES strings",
            "summary": "Pharmacophore analysis requires at least 2 compounds",
        }

    # Resolve any drug names to SMILES, then parse.
    mols = []
    valid_smiles = []
    invalid = []
    for entry in smiles_list:
        smi = _extract_smiles(entry)
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        # RDKit parses an empty SMILES into an atom-less molecule rather than
        # None; counting it as a compound would zero out every common feature.
        if mol is None or mol.GetNumAtoms() == 0:
            invalid.append(smi)
        else:
            mols.append(mol)
            valid_smiles.append(smi)

    if len(mols) < 2:
        return {
            "error": f"Only {len(mols)} valid molecule(s) — need at least 2",
            "summary": "Insufficient valid molecules for pharmacophore analysis",
            "invalid_smiles": invalid,
        }

    n_compounds = len(mols)

    # Compile each SMARTS once instead of once per molecule. Patterns that
    # fail to compile are skipped.
    compiled_patterns = {}
    for feat_type, feat_def in _PHARMACOPHORE_FEATURES.items():
        candidates = (Chem.MolFromSmarts(smarts_str) for smarts_str in feat_def["smarts"])
        compiled_patterns[feat_type] = [p for p in candidates if p is not None]

    # Step 1: Detect pharmacophore features per molecule
    per_molecule_features = []
    for smi, mol in zip(valid_smiles, mols):
        mol_features = {
            feat_type: sum(len(mol.GetSubstructMatches(p)) for p in patterns)
            for feat_type, patterns in compiled_patterns.items()
        }
        per_molecule_features.append({"smiles": smi, "features": mol_features})

    # Step 2: Identify common features (present in all molecules) and, in the
    # same pass, collect the per-feature consistency used by the consensus.
    feature_types = list(_PHARMACOPHORE_FEATURES)
    common_features = []
    feature_distribution = {}
    consistency_scores = []

    for feat_type in feature_types:
        feat_name = _PHARMACOPHORE_FEATURES[feat_type]["name"]
        counts = [entry["features"][feat_type] for entry in per_molecule_features]
        min_count = min(counts)
        max_count = max(counts)
        present_in = sum(1 for c in counts if c > 0)

        feature_distribution[feat_type] = {
            "name": feat_name,
            "min_count": min_count,
            "max_count": max_count,
            "mean_count": round(sum(counts) / len(counts), 1),
            "present_in_n": present_in,
            "frequency": round(present_in / n_compounds, 3),
        }

        # A positive minimum means every molecule has the feature.
        if min_count > 0:
            common_features.append({
                "type": feat_type,
                "name": feat_name,
                "min_count": min_count,
                "conserved": min_count == max_count,
                "frequency": 1.0,
            })

        # Features absent from every molecule do not affect consistency.
        if max_count > 0:
            consistency_scores.append(1.0 - (max_count - min_count) / max_count)

    # Step 3: 2D pharmacophore fingerprints (if method includes fingerprints)
    pharm_fp_similarity = None
    pharm_fp_error = None
    if method in ("fingerprints", "both"):
        try:
            from rdkit.Chem.Pharm2D import Gobbi_Pharm2D, Generate

            factory = Gobbi_Pharm2D.factory
            fps = [Generate.Gen2DFingerprint(mol, factory) for mol in mols]
            similarities = [
                DataStructs.TanimotoSimilarity(fp_a, fp_b)
                for fp_a, fp_b in combinations(fps, 2)
            ]
            if similarities:
                pharm_fp_similarity = round(sum(similarities) / len(similarities), 3)
            else:
                pharm_fp_similarity = 0.0
        except Exception as exc:
            # Fingerprinting is best-effort: keep the feature analysis, but
            # report the failure instead of dropping it silently.
            pharm_fp_similarity = None
            pharm_fp_error = f"{type(exc).__name__}: {exc}"

    # Step 4: Consensus score
    # 60% fraction of feature types shared by all molecules, 40% how uniform
    # the per-feature counts are.
    common_frac = len(common_features) / len(feature_types) if feature_types else 0.0
    if consistency_scores:
        consistency = sum(consistency_scores) / len(consistency_scores)
    else:
        consistency = 0.0
    consensus_score = round((common_frac * 0.6 + consistency * 0.4), 3)

    # Build summary
    headline = (
        f"Pharmacophore from {n_compounds} compounds: "
        f"{len(common_features)} common features"
    )
    common_desc = [f"{cf['min_count']} {cf['name']}" for cf in common_features]
    if common_desc:
        headline += f" ({', '.join(common_desc)})"

    summary_parts = [headline, f"Consensus score: {consensus_score}"]
    if pharm_fp_similarity is not None:
        summary_parts.append(f"Mean pharmacophore fingerprint similarity: {pharm_fp_similarity}")

    result = {
        "summary": "\n".join(summary_parts),
        "n_compounds": n_compounds,
        "n_valid": len(mols),
        "common_features": common_features,
        "feature_distribution": feature_distribution,
        "per_molecule_features": per_molecule_features,
        "consensus_score": consensus_score,
        "method": method,
    }

    if invalid:
        result["invalid_smiles"] = invalid
    if pharm_fp_similarity is not None:
        result["pharmacophore_fp_similarity"] = pharm_fp_similarity
    if pharm_fp_error is not None:
        result["pharmacophore_fp_error"] = pharm_fp_error

    return result
|