celltype-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. celltype_cli-0.1.0.dist-info/METADATA +267 -0
  2. celltype_cli-0.1.0.dist-info/RECORD +89 -0
  3. celltype_cli-0.1.0.dist-info/WHEEL +4 -0
  4. celltype_cli-0.1.0.dist-info/entry_points.txt +2 -0
  5. celltype_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  6. ct/__init__.py +3 -0
  7. ct/agent/__init__.py +0 -0
  8. ct/agent/case_studies.py +426 -0
  9. ct/agent/config.py +523 -0
  10. ct/agent/doctor.py +544 -0
  11. ct/agent/knowledge.py +523 -0
  12. ct/agent/loop.py +99 -0
  13. ct/agent/mcp_server.py +478 -0
  14. ct/agent/orchestrator.py +733 -0
  15. ct/agent/runner.py +656 -0
  16. ct/agent/sandbox.py +481 -0
  17. ct/agent/session.py +145 -0
  18. ct/agent/system_prompt.py +186 -0
  19. ct/agent/trace_store.py +228 -0
  20. ct/agent/trajectory.py +169 -0
  21. ct/agent/types.py +182 -0
  22. ct/agent/workflows.py +462 -0
  23. ct/api/__init__.py +1 -0
  24. ct/api/app.py +211 -0
  25. ct/api/config.py +120 -0
  26. ct/api/engine.py +124 -0
  27. ct/cli.py +1448 -0
  28. ct/data/__init__.py +0 -0
  29. ct/data/compute_providers.json +59 -0
  30. ct/data/cro_database.json +395 -0
  31. ct/data/downloader.py +238 -0
  32. ct/data/loaders.py +252 -0
  33. ct/kb/__init__.py +5 -0
  34. ct/kb/benchmarks.py +147 -0
  35. ct/kb/governance.py +106 -0
  36. ct/kb/ingest.py +415 -0
  37. ct/kb/reasoning.py +129 -0
  38. ct/kb/schema_monitor.py +162 -0
  39. ct/kb/substrate.py +387 -0
  40. ct/models/__init__.py +0 -0
  41. ct/models/llm.py +370 -0
  42. ct/tools/__init__.py +195 -0
  43. ct/tools/_compound_resolver.py +297 -0
  44. ct/tools/biomarker.py +368 -0
  45. ct/tools/cellxgene.py +282 -0
  46. ct/tools/chemistry.py +1371 -0
  47. ct/tools/claude.py +390 -0
  48. ct/tools/clinical.py +1153 -0
  49. ct/tools/clue.py +249 -0
  50. ct/tools/code.py +1069 -0
  51. ct/tools/combination.py +397 -0
  52. ct/tools/compute.py +402 -0
  53. ct/tools/cro.py +413 -0
  54. ct/tools/data_api.py +2114 -0
  55. ct/tools/design.py +295 -0
  56. ct/tools/dna.py +575 -0
  57. ct/tools/experiment.py +604 -0
  58. ct/tools/expression.py +655 -0
  59. ct/tools/files.py +957 -0
  60. ct/tools/genomics.py +1387 -0
  61. ct/tools/http_client.py +146 -0
  62. ct/tools/imaging.py +319 -0
  63. ct/tools/intel.py +223 -0
  64. ct/tools/literature.py +743 -0
  65. ct/tools/network.py +422 -0
  66. ct/tools/notification.py +111 -0
  67. ct/tools/omics.py +3330 -0
  68. ct/tools/ops.py +1230 -0
  69. ct/tools/parity.py +649 -0
  70. ct/tools/pk.py +245 -0
  71. ct/tools/protein.py +678 -0
  72. ct/tools/regulatory.py +643 -0
  73. ct/tools/remote_data.py +179 -0
  74. ct/tools/report.py +181 -0
  75. ct/tools/repurposing.py +376 -0
  76. ct/tools/safety.py +1280 -0
  77. ct/tools/shell.py +178 -0
  78. ct/tools/singlecell.py +533 -0
  79. ct/tools/statistics.py +552 -0
  80. ct/tools/structure.py +882 -0
  81. ct/tools/target.py +901 -0
  82. ct/tools/translational.py +123 -0
  83. ct/tools/viability.py +218 -0
  84. ct/ui/__init__.py +0 -0
  85. ct/ui/markdown.py +31 -0
  86. ct/ui/status.py +258 -0
  87. ct/ui/suggestions.py +567 -0
  88. ct/ui/terminal.py +1456 -0
  89. ct/ui/traces.py +112 -0
ct/tools/chemistry.py ADDED
@@ -0,0 +1,1371 @@
1
+ """
2
+ Chemistry tools: molecular descriptors, SAR analysis, similarity search.
3
+ """
4
+
5
+ from ct.tools import registry
6
+ from ct.tools.http_client import request
7
+
8
+
9
def _extract_smiles(smiles) -> str:
    """Normalise a tool argument into a SMILES string.

    Accepts either a plain value or a dict (for example a whole
    ``pubchem_lookup`` result passed by the planner as ``$step.1`` rather
    than ``$step.1.canonical_smiles``).  For a dict, the first truthy value
    among ``canonical_smiles`` and ``smiles`` is used, falling back to
    ``summary`` (or an empty string).  The value is stringified and stripped,
    then handed to ``_compound_resolver.resolve_to_smiles`` so that drug names
    such as "lenalidomide" are turned into SMILES.

    If the resolver cannot be imported or raises ``ValueError``, the stripped
    text is returned unchanged and the calling tool is left to reject it.
    """
    candidate = smiles
    if isinstance(candidate, dict):
        for field in ("canonical_smiles", "smiles"):
            value = candidate.get(field)
            if value:
                break
        else:
            value = candidate.get("summary", "")
        candidate = value
    text = str(candidate).strip()

    # Resolution is best-effort: valid SMILES and drug names both go through
    # the resolver, and any failure falls back to the raw text.
    try:
        from ct.tools._compound_resolver import resolve_to_smiles
    except ImportError:
        return text
    try:
        return resolve_to_smiles(text)
    except (ValueError, ImportError):
        return text
30
+
31
+
32
@registry.register(
    name="chemistry.descriptors",
    description="Compute molecular descriptors and fingerprints for a compound from SMILES",
    category="chemistry",
    parameters={"smiles": "SMILES string"},
    usage_guide="You need molecular properties (MW, LogP, TPSA, Lipinski) for a compound. Use early in hit characterization to assess drug-likeness and physicochemical profile.",
)
def descriptors(smiles: str, **kwargs) -> dict:
    """Compute physicochemical descriptors for a single compound.

    Args:
        smiles: SMILES string, compound name, or a dict carrying one; it is
            normalised through ``_extract_smiles`` first.
        **kwargs: Ignored; accepted so extra planner arguments do not fail.

    Returns:
        ``{"summary": str, "properties": dict}`` on success, or
        ``{"error": str, "summary": str}`` when no usable molecule could be
        parsed from the input.
    """
    smiles = _extract_smiles(smiles)
    from rdkit import Chem
    from rdkit.Chem import Descriptors, rdMolDescriptors

    mol = Chem.MolFromSmiles(smiles)
    # RDKit can hand back a zero-atom molecule instead of None (e.g. for an
    # empty SMILES string); treat that as invalid too, otherwise the ratio
    # calculations below divide by a molecular weight of zero.
    if mol is None or mol.GetNumAtoms() == 0:
        return {"error": f"Invalid SMILES: {smiles}", "summary": f"Invalid SMILES: {smiles}"}

    props = {
        "smiles": smiles,
        "molecular_weight": Descriptors.MolWt(mol),
        "logp": Descriptors.MolLogP(mol),
        "hbd": Descriptors.NumHDonors(mol),
        "hba": Descriptors.NumHAcceptors(mol),
        "tpsa": Descriptors.TPSA(mol),
        "rotatable_bonds": Descriptors.NumRotatableBonds(mol),
        "rings": Descriptors.RingCount(mol),
        "aromatic_rings": Descriptors.NumAromaticRings(mol),
        "heavy_atoms": mol.GetNumHeavyAtoms(),
        "formula": rdMolDescriptors.CalcMolFormula(mol),
        "num_stereocenters": len(Chem.FindMolChiralCenters(mol)),
    }

    # Lipinski Rule of 5: count how many of the four thresholds are exceeded.
    props["lipinski_violations"] = sum([
        props["molecular_weight"] > 500,
        props["logp"] > 5,
        props["hbd"] > 5,
        props["hba"] > 10,
    ])

    # Molecular glue specific ratios.  The small epsilon keeps the LogP
    # denominator away from exactly zero.
    mol_weight = props["molecular_weight"]
    props["mw_logp_ratio"] = mol_weight / (props["logp"] + 1e-6)
    # Molecules made only of dummy atoms can still report a weight of zero.
    props["tpsa_per_mw"] = props["tpsa"] / mol_weight if mol_weight else 0.0

    return {
        "summary": f"Molecular profile for {props['formula']} (MW={props['molecular_weight']:.1f}, "
                   f"LogP={props['logp']:.2f}, Lipinski violations={props['lipinski_violations']})",
        "properties": props,
    }
80
+
81
+
82
@registry.register(
    name="chemistry.pairwise_similarity",
    description="Compute pairwise Tanimoto similarity matrix for a list of compounds (by name or SMILES)",
    category="chemistry",
    parameters={
        "compounds": "List of compound names or SMILES strings",
        "fingerprint": "Fingerprint type: 'morgan' (default, ECFP4) or 'maccs'",
    },
    usage_guide="You need to compute fingerprint similarity between a set of named compounds. Use when the question asks to 'compare similarity', 'cluster by scaffold', or 'compute Tanimoto' between specific compounds. Returns a full pairwise similarity matrix.",
)
def pairwise_similarity(compounds: list = None, fingerprint: str = "morgan", **kwargs) -> dict:
    """Compute pairwise Tanimoto similarity for a set of compounds.

    Args:
        compounds: Two or more compound names, SMILES strings, or dicts that
            ``_extract_smiles`` can unwrap.
        fingerprint: ``"maccs"`` (case-insensitive) selects MACCS keys; any
            other value selects Morgan radius-2, 2048-bit fingerprints.
        **kwargs: Ignored.

    Returns:
        A dict with ``summary``, ``fingerprint_type``, ``n_compounds``,
        ``pairs`` (sorted by descending similarity), ``matrix`` (one row dict
        per compound) and ``resolved_smiles``; or ``{"error", "summary"}``
        when fewer than two compounds are supplied.  Compounds that fail to
        parse score 0.0 against everything and carry an ``error`` entry in
        ``resolved_smiles``.
    """
    if not compounds or len(compounds) < 2:
        return {"error": "Need at least 2 compounds", "summary": "Provide a list of 2+ compound names or SMILES"}

    from rdkit import Chem, DataStructs
    from rdkit.Chem import AllChem, MACCSkeys

    # Decide the fingerprint once so the computation and the reported label
    # can never disagree (previously any value other than exactly "morgan"
    # was labelled MACCS even though Morgan bits were computed).
    use_maccs = str(fingerprint).strip().lower() == "maccs"

    # Resolve names to SMILES and parse them.
    resolved = []
    for cpd in compounds:
        smi = _extract_smiles(cpd)
        mol = Chem.MolFromSmiles(smi)
        # Names are used as dict keys in the matrix, so non-string inputs
        # (e.g. a whole lookup-result dict) are labelled by their SMILES.
        label = cpd if isinstance(cpd, str) else smi
        if mol is None:
            resolved.append({"name": label, "smiles": smi, "mol": None, "error": f"Invalid SMILES: {smi}"})
        else:
            resolved.append({"name": label, "smiles": Chem.MolToSmiles(mol), "mol": mol})

    # Compute fingerprints (None marks a compound that failed to parse).
    fps = []
    for entry in resolved:
        mol = entry["mol"]
        if mol is None:
            fps.append(None)
        elif use_maccs:
            fps.append(MACCSkeys.GenMACCSKeys(mol))
        else:
            fps.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048))

    # Compute each unordered pair exactly once; the matrix reuses these values.
    n = len(resolved)
    sims = {}
    pairs = []
    for i in range(n):
        for j in range(i + 1, n):
            if fps[i] is None or fps[j] is None:
                sim = 0.0
            else:
                sim = round(DataStructs.TanimotoSimilarity(fps[i], fps[j]), 4)
            sims[(i, j)] = sim
            pairs.append({
                "compound_a": resolved[i]["name"],
                "compound_b": resolved[j]["name"],
                "smiles_a": resolved[i]["smiles"],
                "smiles_b": resolved[j]["smiles"],
                "tanimoto": sim,
            })

    # Sort by similarity, most similar first.
    pairs.sort(key=lambda x: -x["tanimoto"])
    most_similar = pairs[0] if pairs else {}
    least_similar = pairs[-1] if pairs else {}

    # Build a readable, symmetric matrix with 1.0 on the diagonal.
    names = [entry["name"] for entry in resolved]
    matrix_rows = []
    for i in range(n):
        row = {}
        for j in range(n):
            if i == j:
                row[names[j]] = 1.0
            else:
                row[names[j]] = sims[(min(i, j), max(i, j))]
        matrix_rows.append({"compound": names[i], **row})

    fp_label = "MACCS keys (166 bits)" if use_maccs else "ECFP4 (Morgan r=2, 2048 bits)"

    summary_lines = [
        f"Pairwise Tanimoto similarity ({fp_label}) for {n} compounds:",
    ]
    for p in pairs:
        summary_lines.append(f" {p['compound_a']} vs {p['compound_b']}: {p['tanimoto']:.4f}")
    if most_similar:
        summary_lines.append(f"Most similar: {most_similar['compound_a']} & {most_similar['compound_b']} ({most_similar['tanimoto']:.4f})")
    if least_similar:
        summary_lines.append(f"Most different: {least_similar['compound_a']} & {least_similar['compound_b']} ({least_similar['tanimoto']:.4f})")

    resolved_smiles = []
    for entry in resolved:
        item = {"name": entry["name"], "smiles": entry["smiles"]}
        if "error" in entry:
            # Surface parse failures so a 0.0 score is not mistaken for
            # genuine dissimilarity.
            item["error"] = entry["error"]
        resolved_smiles.append(item)

    return {
        "summary": "\n".join(summary_lines),
        "fingerprint_type": fp_label,
        "n_compounds": n,
        "pairs": pairs,
        "matrix": matrix_rows,
        "resolved_smiles": resolved_smiles,
    }
184
+
185
+
186
@registry.register(
    name="chemistry.similarity_search",
    description="Find similar compounds in a library using Tanimoto similarity on Morgan fingerprints",
    category="chemistry",
    parameters={"smiles": "Query SMILES", "library_path": "Path to compound library CSV", "top_n": "Number of hits"},
    usage_guide="You have a hit compound and want to find structurally similar analogs in a library. Use for SAR expansion or finding backup compounds with similar scaffolds.",
)
def similarity_search(smiles: str, library_path: str = None, top_n: int = 10, **kwargs) -> dict:
    """Rank the compounds of a CSV library by Tanimoto similarity to a query.

    Args:
        smiles: Query SMILES, compound name, or dict (normalised through
            ``_extract_smiles``).
        library_path: CSV file containing a ``smiles`` or
            ``canonical_smiles`` column (matched case-insensitively).
        top_n: Maximum number of hits to return.
        **kwargs: Ignored.

    Returns:
        ``{"summary", "hits", "library_size"}`` where ``library_size`` counts
        the library rows that parsed successfully, or ``{"error", "summary"}``
        when the query, the path, or the library is unusable.
    """
    smiles = _extract_smiles(smiles)
    from rdkit import Chem, DataStructs
    from rdkit.Chem import AllChem
    import pandas as pd

    query_mol = Chem.MolFromSmiles(smiles)
    if query_mol is None:
        return {"error": f"Invalid SMILES: {smiles}", "summary": f"Invalid SMILES: {smiles}"}
    query_fp = AllChem.GetMorganFingerprintAsBitVect(query_mol, 2, nBits=2048)

    # Load library
    if not library_path:
        return {"error": "No compound library specified", "summary": "No compound library specified"}
    try:
        lib = pd.read_csv(library_path)
    except (OSError, ValueError) as e:
        # Same error shape as mmp_analysis: report instead of raising.
        return {"error": f"Could not read CSV: {e}", "summary": f"Failed to load {library_path}"}

    smiles_col = next((c for c in lib.columns if c.lower() in ['smiles', 'canonical_smiles']), None)
    if smiles_col is None:
        return {"error": "No SMILES column found in library", "summary": "No SMILES column found in library"}

    extra_cols = [c for c in lib.columns if c != smiles_col]
    results = []
    for _, row in lib.iterrows():
        raw_smiles = row[smiles_col]
        # Blank cells are read as NaN (a float), which RDKit refuses to parse.
        if not isinstance(raw_smiles, str):
            continue
        mol = Chem.MolFromSmiles(raw_smiles)
        if mol is None:
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
        hit = {
            "smiles": raw_smiles,
            "similarity": DataStructs.TanimotoSimilarity(query_fp, fp),
        }
        # Carry the remaining library columns along, but never let a column
        # that happens to be called "smiles" or "similarity" overwrite the
        # computed fields used for ranking.
        for col in extra_cols:
            hit.setdefault(col, row[col])
        results.append(hit)

    results.sort(key=lambda x: -x["similarity"])
    top_hits = results[:top_n]

    return {
        # Report the number of hits actually returned, which can be smaller
        # than top_n for a short library.
        "summary": f"Top {len(top_hits)} similar compounds (max Tanimoto={top_hits[0]['similarity']:.3f})" if top_hits else "No hits",
        "hits": top_hits,
        "library_size": len(results),
    }
234
+
235
+
236
@registry.register(
    name="chemistry.sar_analyze",
    description="Analyze structure-activity relationships for a set of compounds with activity data",
    category="chemistry",
    parameters={"compounds_path": "CSV with SMILES and activity columns"},
    usage_guide="You have a set of compounds with activity data and want to understand which molecular features drive potency. Use for medicinal chemistry optimization guidance.",
)
def sar_analyze(compounds_path: str, activity_col: str = "activity", **kwargs) -> dict:
    """Correlate simple molecular descriptors with measured activity.

    Args:
        compounds_path: CSV file with a ``smiles``/``canonical_smiles`` column
            (matched case-insensitively) and an activity column.
        activity_col: Name of the activity column.
        **kwargs: Ignored.

    Returns:
        ``{"summary", "correlations", "n_compounds"}`` where ``correlations``
        maps each descriptor (mw, logp, tpsa, hbd, hba, rotbonds) to its
        Pearson ``r`` and ``p`` against activity; or ``{"error", "summary"}``
        when the file, columns, or number of usable rows is insufficient.
    """
    import pandas as pd
    from rdkit import Chem
    from rdkit.Chem import Descriptors
    from scipy import stats

    try:
        df = pd.read_csv(compounds_path)
    except (OSError, ValueError) as e:
        # Same error shape as mmp_analysis: report instead of raising.
        return {"error": f"Could not read CSV: {e}", "summary": f"Failed to load {compounds_path}"}
    smiles_col = next((c for c in df.columns if c.lower() in ['smiles', 'canonical_smiles']), None)

    if smiles_col is None or activity_col not in df.columns:
        return {"error": "Need SMILES and activity columns", "summary": "Need SMILES and activity columns"}

    descriptor_fns = {
        "mw": Descriptors.MolWt,
        "logp": Descriptors.MolLogP,
        "tpsa": Descriptors.TPSA,
        "hbd": Descriptors.NumHDonors,
        "hba": Descriptors.NumHAcceptors,
        "rotbonds": Descriptors.NumRotatableBonds,
    }

    # Compute descriptors for every row whose SMILES parses.
    features = []
    for _, row in df.iterrows():
        raw_smiles = row[smiles_col]
        # Blank cells are read as NaN (a float), which RDKit refuses to parse.
        if not isinstance(raw_smiles, str):
            continue
        mol = Chem.MolFromSmiles(raw_smiles)
        if mol is None:
            continue
        record = {"smiles": raw_smiles, "activity": row[activity_col]}
        for name, fn in descriptor_fns.items():
            record[name] = fn(mol)
        features.append(record)

    feat_df = pd.DataFrame(features)
    if not feat_df.empty:
        # Drop rows whose activity is missing or not a number so they cannot
        # break the correlation.
        feat_df["activity"] = pd.to_numeric(feat_df["activity"], errors="coerce")
        feat_df = feat_df.dropna(subset=["activity"])

    # A Pearson correlation needs at least two observations.
    if len(feat_df) < 2:
        return {"error": "Need at least 2 valid compounds for SAR analysis",
                "summary": "Insufficient valid compounds for analysis"}

    # Correlate descriptors with activity.
    correlations = {}
    for col in descriptor_fns:
        r, p = stats.pearsonr(feat_df[col], feat_df["activity"])
        # Cast to built-in floats so the result holds no numpy scalars.
        correlations[col] = {"r": round(float(r), 3), "p": round(float(p), 4)}

    return {
        "summary": f"SAR analysis on {len(feat_df)} compounds",
        "correlations": correlations,
        "n_compounds": len(feat_df),
    }
286
+
287
+
288
@registry.register(
    name="chemistry.mmp_analysis",
    description="Matched molecular pair analysis to identify R-group transformations that improve activity",
    category="chemistry",
    parameters={
        "compounds_csv": "Path to CSV with SMILES and activity columns",
        "activity_col": "Name of the activity column (default 'activity')",
    },
    usage_guide="You have a congeneric series of compounds and want to identify which single-point structural changes drive activity. Use for medicinal chemistry SAR optimization — finds matched molecular pairs and ranks R-group swaps by activity improvement.",
)
def mmp_analysis(compounds_csv: str = None, activity_col: str = "activity", **kwargs) -> dict:
    """Pair up compounds that share a Murcko scaffold and rank activity changes.

    Compounds are grouped by Murcko scaffold; every pair inside a group is
    reported with its activity difference (second minus first, in input
    order), the maximum common substructure SMARTS of the two molecules, and
    the heavy-atom count difference.

    Args:
        compounds_csv: CSV with a ``smiles``/``canonical_smiles`` column and
            an activity column.  When omitted, a built-in demo series is used.
        activity_col: Name of the activity column.
        **kwargs: Ignored.

    Returns:
        A dict with ``summary``, counts, ``top_improvements`` (up to 10 pairs
        with positive delta), ``top_decreases`` (up to 5 most negative),
        per-scaffold ``scaffold_stats`` and ``all_pairs`` (capped at 50); or
        ``{"error", "summary"}`` when RDKit, the CSV, the columns, or the
        number of usable compounds is insufficient.
    """
    try:
        from rdkit import Chem
        from rdkit.Chem import rdFMCS
        from rdkit.Chem.Scaffolds import MurckoScaffold
    except ImportError:
        return {"error": "RDKit is required for MMP analysis. Install with: pip install rdkit", "summary": "RDKit is required for MMP analysis. Install with: pip install rdkit"}
    from itertools import combinations

    import pandas as pd
    import numpy as np

    # Load or generate demo data
    if compounds_csv:
        try:
            df = pd.read_csv(compounds_csv)
        except Exception as e:
            return {"error": f"Could not read CSV: {e}", "summary": f"Failed to load {compounds_csv}"}
        smiles_col = next((c for c in df.columns if c.lower() in ["smiles", "canonical_smiles"]), None)
        if smiles_col is None:
            return {"error": "No SMILES column found (expected 'smiles' or 'canonical_smiles')", "summary": "No SMILES column found (expected 'smiles' or 'canonical_smiles')"}
        if activity_col not in df.columns:
            return {"error": f"Activity column '{activity_col}' not found. Available: {list(df.columns)}", "summary": f"Activity column '{activity_col}' not found. Available: {list(df.columns)}"}
    else:
        # Demo dataset: simple benzamide series.
        # NOTE(review): several names do not match the substitution pattern
        # their SMILES encodes (e.g. the "4-fluorobenzamide" entry places F
        # meta to the amide).  Names are only display labels here, so the
        # data is left unchanged.
        demo_data = [
            ("c1ccc(C(=O)N)cc1", 5.2, "benzamide"),
            ("c1ccc(C(=O)N)cc1F", 6.1, "4-fluorobenzamide"),
            ("c1ccc(C(=O)N)cc1Cl", 5.8, "4-chlorobenzamide"),
            ("c1ccc(C(=O)N)cc1C", 5.5, "4-methylbenzamide"),
            ("c1ccc(C(=O)N)cc1OC", 6.4, "4-methoxybenzamide"),
            ("c1ccc(C(=O)N)cc1O", 6.0, "4-hydroxybenzamide"),
            ("c1ccc(C(=O)NC)cc1", 5.0, "N-methylbenzamide"),
            ("c1ccc(C(=O)NCC)cc1", 4.7, "N-ethylbenzamide"),
            ("c1ccc(C(=O)N)c(F)c1", 5.9, "3-fluorobenzamide"),
            ("c1cc(F)c(C(=O)N)cc1F", 6.8, "3,4-difluorobenzamide"),
        ]
        df = pd.DataFrame(demo_data, columns=["smiles", activity_col, "name"])
        smiles_col = "smiles"

    # Parse molecules and compute Murcko scaffolds
    parsed = []
    for _, row in df.iterrows():
        raw_smiles = row[smiles_col]
        # Blank cells are read as NaN (a float), which RDKit refuses to parse.
        if not isinstance(raw_smiles, str):
            continue
        mol = Chem.MolFromSmiles(raw_smiles)
        if mol is None:
            continue
        # Skip rows without a usable numeric activity; a NaN would otherwise
        # propagate into every delta and make the ranking meaningless.
        try:
            activity = float(row[activity_col])
        except (TypeError, ValueError):
            continue
        if not np.isfinite(activity):
            continue
        try:
            scaffold = MurckoScaffold.GetScaffoldForMol(mol)
            scaffold_smi = Chem.MolToSmiles(scaffold)
        except Exception:
            scaffold_smi = "unknown"
        parsed.append({
            "smiles": raw_smiles,
            "mol": mol,
            "activity": activity,
            "scaffold": scaffold_smi,
            "name": row.get("name", raw_smiles),
        })

    if len(parsed) < 2:
        return {"error": "Need at least 2 valid compounds for MMP analysis",
                "summary": "Insufficient valid compounds for analysis"}

    # Identify matched pairs: same scaffold, different compounds
    scaffold_groups = {}
    for entry in parsed:
        scaffold_groups.setdefault(entry["scaffold"], []).append(entry)

    pairs = []
    deltas_by_scaffold = {}  # scaffold SMILES -> [delta_activity, ...]

    for scaffold, members in scaffold_groups.items():
        if len(members) < 2:
            continue

        # Generate all pairs within the scaffold group, in input order.
        for m1, m2 in combinations(members, 2):
            delta = m2["activity"] - m1["activity"]

            # Find the shared core using MCS; best-effort, None on failure.
            try:
                mcs = rdFMCS.FindMCS(
                    [m1["mol"], m2["mol"]],
                    timeout=2,
                    matchValences=True,
                    ringMatchesRingOnly=True,
                )
                core_smarts = mcs.smartsString if mcs and mcs.numAtoms > 0 else None
            except Exception:
                core_smarts = None

            # Characterize the transformation by atom count difference
            atoms1 = m1["mol"].GetNumHeavyAtoms()
            atoms2 = m2["mol"].GetNumHeavyAtoms()

            pairs.append({
                "compound_a": m1["smiles"],
                "compound_b": m2["smiles"],
                "name_a": m1["name"],
                "name_b": m2["name"],
                "activity_a": round(m1["activity"], 3),
                "activity_b": round(m2["activity"], 3),
                "delta_activity": round(delta, 3),
                "scaffold": scaffold,
                "core_mcs": core_smarts,
                "heavy_atom_diff": atoms2 - atoms1,
            })
            deltas_by_scaffold.setdefault(scaffold, []).append(delta)

    # Rank pairs by absolute activity change
    pairs.sort(key=lambda x: abs(x["delta_activity"]), reverse=True)

    # Aggregate delta statistics per scaffold
    scaffold_stats = []
    for scaffold, deltas in deltas_by_scaffold.items():
        scaffold_stats.append({
            "scaffold": scaffold,
            "n_pairs": len(deltas),
            "mean_delta": round(float(np.mean(deltas)), 3),
            "max_delta": round(float(np.max(deltas)), 3),
            "min_delta": round(float(np.min(deltas)), 3),
            "std_delta": round(float(np.std(deltas)), 3) if len(deltas) > 1 else 0.0,
        })

    # Find top activity-improving and activity-reducing pairs
    top_improvements = [p for p in pairs if p["delta_activity"] > 0][:10]
    top_decreases = [p for p in pairs if p["delta_activity"] < 0]
    top_decreases.sort(key=lambda x: x["delta_activity"])
    top_decreases = top_decreases[:5]

    n_scaffolds = len(scaffold_groups)
    using_demo = compounds_csv is None

    summary_lines = [
        f"MMP analysis: {len(parsed)} compounds, {len(pairs)} matched pairs, {n_scaffolds} scaffold(s)",
    ]
    if using_demo:
        summary_lines.append("(Using built-in demo dataset — provide compounds_csv for custom analysis)")
    if top_improvements:
        best = top_improvements[0]
        summary_lines.append(
            f"Best improvement: {best['name_a']} -> {best['name_b']} "
            f"(delta={best['delta_activity']:+.3f})"
        )

    return {
        "summary": "\n".join(summary_lines),
        "n_compounds": len(parsed),
        "n_pairs": len(pairs),
        "n_scaffolds": n_scaffolds,
        "using_demo_data": using_demo,
        "top_improvements": top_improvements,
        "top_decreases": top_decreases,
        "scaffold_stats": scaffold_stats,
        "all_pairs": pairs[:50],  # cap output
    }
488
+
489
+
490
@registry.register(
    name="chemistry.scaffold_hop",
    description="Suggest scaffold replacements and bioisosteres for a compound",
    category="chemistry",
    parameters={
        "smiles": "SMILES string for the input compound",
    },
    usage_guide="You want to explore alternative scaffolds for a hit compound — either to improve properties, escape a patent, or find novel chemical matter. Generates bioisosteric replacements for functional groups and suggests scaffold hops based on the Murcko framework.",
)
def scaffold_hop(smiles: str, **kwargs) -> dict:
    """Suggest scaffold replacements and bioisosteric substitutions.

    Extracts the Murcko scaffold and its generic framework, detects a fixed
    set of functional groups by SMARTS matching, and lists the predefined
    bioisosteres for each detected group plus ring swaps for benzene,
    pyridine and pyrrole rings found in the scaffold.

    Args:
        smiles: SMILES string, compound name, or dict (normalised through
            ``_extract_smiles``).
        **kwargs: Ignored.

    Returns:
        A dict with ``summary``, ``input_smiles``, ``murcko_scaffold``,
        ``generic_framework``, ``detected_functional_groups``,
        ``bioisostere_suggestions``, ``scaffold_replacements`` and
        ``property_context``; or ``{"error", "summary"}`` when RDKit is
        missing or the input does not parse to a molecule with atoms.
    """
    smiles = _extract_smiles(smiles)
    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors, rdMolDescriptors
        from rdkit.Chem.Scaffolds import MurckoScaffold
    except ImportError:
        return {"error": "RDKit is required for scaffold hopping. Install with: pip install rdkit", "summary": "RDKit is required for scaffold hopping. Install with: pip install rdkit"}
    mol = Chem.MolFromSmiles(smiles)
    # A zero-atom molecule (e.g. from an empty string) has nothing to analyse.
    if mol is None or mol.GetNumAtoms() == 0:
        return {"error": f"Invalid SMILES: {smiles}", "summary": f"Could not parse SMILES: {smiles}"}

    # Extract Murcko scaffold; fall back to "N/A" markers if RDKit fails.
    try:
        scaffold_mol = MurckoScaffold.GetScaffoldForMol(mol)
        scaffold_smi = Chem.MolToSmiles(scaffold_mol)
        generic_scaffold = MurckoScaffold.MakeScaffoldGeneric(scaffold_mol)
        generic_smi = Chem.MolToSmiles(generic_scaffold)
    except Exception:
        scaffold_smi = "N/A"
        generic_smi = "N/A"

    # Identify functional groups via SMARTS matching
    # Each entry: (name, smarts, bioisosteres)
    fg_definitions = [
        ("carboxylic_acid", "[CX3](=O)[OX2H1]", [
            {"replacement": "tetrazole", "smiles_fragment": "c1nnn[nH]1",
             "rationale": "Classic carboxylic acid bioisostere — similar pKa, improved metabolic stability and permeability"},
            {"replacement": "acyl sulfonamide", "smiles_fragment": "C(=O)NS(=O)=O",
             "rationale": "Acidic NH mimics carboxylate — good for oral bioavailability"},
            {"replacement": "hydroxamic acid", "smiles_fragment": "C(=O)NO",
             "rationale": "Maintains H-bond donor/acceptor pattern — also a zinc-binding group"},
        ]),
        ("amide", "[NX3][CX3](=[OX1])[#6]", [
            {"replacement": "sulfonamide", "smiles_fragment": "NS(=O)(=O)",
             "rationale": "Similar geometry and H-bonding — often improved metabolic stability"},
            {"replacement": "urea", "smiles_fragment": "NC(=O)N",
             "rationale": "Additional H-bond donor — can improve target binding"},
            {"replacement": "reversed amide", "smiles_fragment": "C(=O)N (reversed)",
             "rationale": "Switching C(=O)NH to NHC(=O) — changes metabolic soft spot"},
            # Fragment corrected: "c1nonc1" is 1,2,5-oxadiazole (furazan),
            # not the 1,2,4-isomer named here.
            {"replacement": "1,2,4-oxadiazole", "smiles_fragment": "c1ncon1",
             "rationale": "Planar amide bioisostere — improved metabolic stability"},
        ]),
        ("phenyl", "c1ccccc1", [
            {"replacement": "pyridine", "smiles_fragment": "c1ccncc1",
             "rationale": "Introduces H-bond acceptor — improves solubility and can modulate pKa"},
            {"replacement": "pyrimidine", "smiles_fragment": "c1ncncc1",
             "rationale": "Two nitrogen atoms — further improved solubility vs pyridine"},
            {"replacement": "cyclohexane", "smiles_fragment": "C1CCCCC1",
             "rationale": "sp3-rich replacement — escape flatness, improve Fsp3 and solubility (Lovering)"},
            {"replacement": "thiophene", "smiles_fragment": "c1ccsc1",
             "rationale": "5-membered aromatic — different vector geometry, often similar binding"},
        ]),
        ("ester", "[#6][CX3](=O)[OX2][#6]", [
            {"replacement": "amide", "smiles_fragment": "C(=O)N",
             "rationale": "Much more metabolically stable — standard ester prodrug reversal"},
            # Fragment corrected: "c1nonn1" has three ring nitrogens (an
            # oxatriazole), so it was not an oxadiazole at all.
            {"replacement": "oxadiazole", "smiles_fragment": "c1ncon1",
             "rationale": "Planar ester bioisostere — metabolically stable"},
        ]),
        ("sulfonamide", "[NX3]S(=O)(=O)", [
            {"replacement": "amide", "smiles_fragment": "NC(=O)",
             "rationale": "Simpler, often similar activity — different metabolic profile"},
            {"replacement": "reverse sulfonamide", "smiles_fragment": "S(=O)(=O)N (reversed)",
             "rationale": "Switch N and C sides of sulfonamide"},
        ]),
        ("hydroxyl", "[OX2H]", [
            {"replacement": "fluorine", "smiles_fragment": "F",
             "rationale": "Similar size, H-bond acceptor only — blocks metabolic oxidation site"},
            {"replacement": "amine", "smiles_fragment": "N",
             "rationale": "H-bond donor and acceptor — different pKa profile"},
            {"replacement": "methoxy", "smiles_fragment": "OC",
             "rationale": "Caps the OH — blocks glucuronidation, changes H-bonding"},
        ]),
        ("nitrile", "[CX2]#[NX1]", [
            {"replacement": "isoxazole", "smiles_fragment": "c1ccon1",
             "rationale": "Ring-based CN mimic — similar dipole and H-bond accepting"},
        ]),
        ("fluorine", "[F]", [
            {"replacement": "chlorine", "smiles_fragment": "Cl",
             "rationale": "Larger halogen — increased lipophilicity, different steric profile"},
            {"replacement": "hydrogen", "smiles_fragment": "[H]",
             "rationale": "Remove halogen — simplify molecule, assess fluorine contribution"},
            {"replacement": "trifluoromethyl", "smiles_fragment": "C(F)(F)F",
             "rationale": "Strongly electron-withdrawing — metabolically stable, increases lipophilicity"},
        ]),
    ]

    # Match functional groups
    detected_groups = []
    all_bioisosteres = []

    for fg_name, smarts, bioisosteres in fg_definitions:
        pattern = Chem.MolFromSmarts(smarts)
        if pattern is None:
            continue
        matches = mol.GetSubstructMatches(pattern)
        if not matches:
            continue
        detected_groups.append({
            "group": fg_name,
            "count": len(matches),
            "atom_indices": [list(m) for m in matches],
        })
        # Suggestions are listed once per detected group type, not per match.
        for bio in bioisosteres:
            all_bioisosteres.append({
                "original_group": fg_name,
                **bio,
            })

    # Scaffold replacement suggestions
    scaffold_replacements = []

    # Common scaffold hops keyed by the ring they replace.
    ring_replacements = {
        "c1ccccc1": [  # benzene
            ("c1ccncc1", "phenyl -> pyridyl (N-walk around ring)"),
            ("c1ccoc1", "phenyl -> furanyl (ring contraction)"),
            ("c1ccsc1", "phenyl -> thiophenyl (5-mem heterocycle)"),
            ("C1CCCCC1", "phenyl -> cyclohexyl (sp3 escape)"),
            ("c1cc[nH]c1", "phenyl -> pyrrolyl (electron-rich 5-mem)"),
        ],
        "c1ccncc1": [  # pyridine
            ("c1ccccc1", "pyridyl -> phenyl (remove N)"),
            ("c1ncncc1", "pyridyl -> pyrimidyl (add N)"),
            ("c1ccnnc1", "pyridyl -> pyridazinyl (adjacent N)"),
        ],
        "c1cc[nH]c1": [  # pyrrole
            ("c1ccoc1", "pyrrolyl -> furanyl"),
            ("c1ccsc1", "pyrrolyl -> thiophenyl"),
        ],
    }

    # Detect ring systems in the scaffold
    if scaffold_smi != "N/A":
        scaf_mol = Chem.MolFromSmiles(scaffold_smi)
        if scaf_mol is not None:
            for ring_smi, replacements in ring_replacements.items():
                ring_pat = Chem.MolFromSmarts(ring_smi)
                if ring_pat is None or not scaf_mol.HasSubstructMatch(ring_pat):
                    continue
                for repl_smi, description in replacements:
                    scaffold_replacements.append({
                        "original_ring": ring_smi,
                        "replacement_ring": repl_smi,
                        "description": description,
                    })

    # Compute properties for context
    mw = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    tpsa = Descriptors.TPSA(mol)
    fsp3 = rdMolDescriptors.CalcFractionCSP3(mol)

    property_context = {
        "molecular_weight": round(mw, 1),
        "logp": round(logp, 2),
        "tpsa": round(tpsa, 1),
        "fsp3": round(fsp3, 3),
        "suggestions_for_improvement": [],
    }

    if fsp3 < 0.25:
        property_context["suggestions_for_improvement"].append(
            "Low Fsp3 ({:.2f}) — consider sp3-rich scaffold hops (phenyl->cyclohexyl) to improve solubility".format(fsp3)
        )
    if logp > 4:
        property_context["suggestions_for_improvement"].append(
            "High LogP ({:.1f}) — add heteroatoms or polar groups to improve solubility".format(logp)
        )
    if tpsa < 40:
        property_context["suggestions_for_improvement"].append(
            "Low TPSA ({:.0f}) — may have poor solubility; consider adding H-bond acceptors".format(tpsa)
        )

    # Summary
    summary_lines = [
        f"Scaffold analysis for {smiles}",
        f"Murcko scaffold: {scaffold_smi}",
        f"Generic framework: {generic_smi}",
        f"Detected functional groups: {', '.join(g['group'] for g in detected_groups) if detected_groups else 'none identified'}",
        f"Bioisostere suggestions: {len(all_bioisosteres)}",
        f"Scaffold hop options: {len(scaffold_replacements)}",
    ]

    return {
        "summary": "\n".join(summary_lines),
        "input_smiles": smiles,
        "murcko_scaffold": scaffold_smi,
        "generic_framework": generic_smi,
        "detected_functional_groups": detected_groups,
        "bioisostere_suggestions": all_bioisosteres,
        "scaffold_replacements": scaffold_replacements,
        "property_context": property_context,
    }
699
+
700
+
701
@registry.register(
    name="chemistry.pubchem_lookup",
    description="Look up compound data from PubChem by name or SMILES",
    category="chemistry",
    parameters={
        "query": "Compound name or SMILES string",
        "query_type": "Type of query: 'name' or 'smiles' (default 'name')",
    },
    usage_guide="You need compound information (structure, properties, synonyms, CID) from PubChem. Use when identifying a compound by name or validating a SMILES string. Returns canonical SMILES, physicochemical properties, and identifiers.",
)
def pubchem_lookup(query: str, query_type: str = "name", **kwargs) -> dict:
    """Look up a compound through the PubChem PUG REST API.

    The lookup runs in three requests: resolve ``query`` to a CID, fetch a
    fixed set of computed properties for that CID, then fetch synonyms.
    Only the first request is mandatory; if the property or synonym request
    fails, the corresponding part of the result is simply left empty.

    Args:
        query: Compound name or SMILES string.
        query_type: ``"smiles"`` to resolve ``query`` as a SMILES string;
            any other value is treated as a name lookup.
        **kwargs: Accepted and ignored.

    Returns:
        On success, a dict with ``summary``, ``cid``, ``canonical_smiles``,
        ``properties``, ``synonyms`` (at most 20) and ``pubchem_url``.
        On failure, a dict with ``error`` and ``summary``.
    """
    import urllib.parse

    base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"

    # A blank query can never resolve; fail fast instead of hitting the API.
    if not query or not str(query).strip():
        return {
            "error": "Empty query",
            "summary": "PubChem lookup requires a non-empty compound name or SMILES",
        }

    # Step 1: Resolve query to CID
    if query_type == "smiles":
        # POST keeps SMILES special characters (#, /, \, +) out of the URL.
        resp, error = request(
            "POST",
            f"{base_url}/compound/smiles/JSON",
            data={"smiles": query},
            timeout=10,
            retries=2,
            raise_for_status=False,
        )
    else:
        encoded_query = urllib.parse.quote(query, safe="")
        resp, error = request(
            "GET",
            f"{base_url}/compound/name/{encoded_query}/JSON",
            timeout=10,
            retries=2,
            raise_for_status=False,
        )
    if error:
        return {"error": f"HTTP error: {error}", "summary": f"PubChem lookup failed: {error}"}

    if resp.status_code == 404:
        return {
            "error": f"Compound not found: {query}",
            "summary": f"PubChem: no compound found for '{query}' (query_type={query_type})",
        }
    if resp.status_code != 200:
        return {
            "error": f"PubChem API error (HTTP {resp.status_code})",
            "summary": f"PubChem lookup failed with status {resp.status_code}",
        }

    try:
        data = resp.json()
    except Exception:
        return {"error": "Failed to parse PubChem response", "summary": "PubChem returned invalid JSON"}

    # Extract CID from the first compound record
    compounds = data.get("PC_Compounds", [])
    if not compounds:
        return {"error": "No compound data in response", "summary": f"PubChem returned empty result for '{query}'"}

    cid = compounds[0].get("id", {}).get("id", {}).get("cid")
    if not cid:
        return {"error": "Could not extract CID", "summary": "PubChem response missing CID"}

    # Step 2: Get properties (best effort: failures leave `properties` empty)
    props_url = (
        f"{base_url}/compound/cid/{cid}/property/"
        "MolecularFormula,MolecularWeight,CanonicalSMILES,IsomericSMILES,"
        "XLogP,ExactMass,TPSA,HBondDonorCount,HBondAcceptorCount,"
        "RotatableBondCount,HeavyAtomCount,Complexity/JSON"
    )
    props_resp, props_error = request(
        "GET",
        props_url,
        timeout=10,
        retries=2,
        raise_for_status=False,
    )
    props_data = {}
    if not props_error:
        try:
            props_data = props_resp.json() if props_resp.status_code == 200 else {}
        except Exception:
            props_data = {}

    properties = {}
    prop_table = props_data.get("PropertyTable", {}).get("Properties", [])
    if prop_table:
        p = prop_table[0]
        # PubChem may answer with the legacy keys (CanonicalSMILES /
        # IsomericSMILES) or the renamed ones (ConnectivitySMILES / SMILES).
        # "SMILES" is the stereo/isotope-aware form, so it is the isomeric
        # value and only the last resort for the canonical one.
        canonical = p.get("CanonicalSMILES") or p.get("ConnectivitySMILES") or p.get("SMILES")
        isomeric = p.get("IsomericSMILES") or p.get("SMILES") or canonical
        properties = {
            "cid": p.get("CID"),
            "molecular_formula": p.get("MolecularFormula"),
            "molecular_weight": p.get("MolecularWeight"),
            "canonical_smiles": canonical,
            "isomeric_smiles": isomeric,
            "xlogp": p.get("XLogP"),
            "exact_mass": p.get("ExactMass"),
            "tpsa": p.get("TPSA"),
            "hbd": p.get("HBondDonorCount"),
            "hba": p.get("HBondAcceptorCount"),
            "rotatable_bonds": p.get("RotatableBondCount"),
            "heavy_atoms": p.get("HeavyAtomCount"),
            "complexity": p.get("Complexity"),
        }

    # Step 3: Get synonyms (best effort)
    synonyms = []
    syn_resp, syn_error = request(
        "GET",
        f"{base_url}/compound/cid/{cid}/synonyms/JSON",
        timeout=10,
        retries=2,
        raise_for_status=False,
    )
    if not syn_error and syn_resp.status_code == 200:
        try:
            syn_list = syn_resp.json().get("InformationList", {}).get("Information", [])
            if syn_list:
                synonyms = syn_list[0].get("Synonym", [])[:20]  # cap at 20
        except Exception:
            # Synonyms are optional; a malformed payload must not fail the lookup.
            synonyms = []

    def _display(key: str):
        # A property can be present but null (e.g. no XLogP for a compound);
        # show "N/A" for that case too, without hiding legitimate zeros.
        value = properties.get(key)
        return "N/A" if value is None else value

    summary_lines = [
        f"PubChem: {query} (CID {cid})",
        f"Formula: {_display('molecular_formula')}, MW: {_display('molecular_weight')}, XLogP: {_display('xlogp')}",
        f"SMILES: {_display('canonical_smiles')}",
    ]
    if synonyms:
        summary_lines.append(f"Also known as: {', '.join(synonyms[:5])}")

    return {
        "summary": "\n".join(summary_lines),
        "cid": cid,
        "canonical_smiles": properties.get("canonical_smiles"),
        "properties": properties,
        "synonyms": synonyms,
        "pubchem_url": f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}",
    }
861
+
862
+
863
+ # ─── Retrosynthetic transforms (SMARTS) ─────────────────────────
864
+ # Each transform: (name, product_smarts, reactant_smarts_list, reagents, conditions)
865
+ _RETRO_TRANSFORMS = [
866
+ {
867
+ "name": "Amide bond disconnection",
868
+ "description": "Disconnect C(=O)-N amide bond → carboxylic acid + amine",
869
+ "product_smarts": "[C:1](=[O:2])-[N:3]",
870
+ "reagents": ["HATU", "DIPEA"],
871
+ "conditions": "Amide coupling, DMF, RT, 12h",
872
+ "reaction_class": "amide_coupling",
873
+ },
874
+ {
875
+ "name": "Suzuki coupling",
876
+ "description": "Disconnect Ar-Ar biaryl bond → aryl boronic acid + aryl halide",
877
+ "product_smarts": "[c:1]-[c:2]",
878
+ "reagents": ["Pd(PPh3)4", "K2CO3"],
879
+ "conditions": "Suzuki coupling, dioxane/H2O, 80°C, 16h",
880
+ "reaction_class": "cross_coupling",
881
+ },
882
+ {
883
+ "name": "Ester hydrolysis",
884
+ "description": "Disconnect C(=O)-O ester bond → carboxylic acid + alcohol",
885
+ "product_smarts": "[C:1](=[O:2])-[O:3][C:4]",
886
+ "reagents": ["DCC", "DMAP"],
887
+ "conditions": "Esterification, DCM, RT, 4h",
888
+ "reaction_class": "esterification",
889
+ },
890
+ {
891
+ "name": "Reductive amination",
892
+ "description": "Disconnect C-N bond adjacent to C-H → aldehyde/ketone + amine",
893
+ "product_smarts": "[C:1]-[NH:2]",
894
+ "reagents": ["NaBH3CN", "AcOH"],
895
+ "conditions": "Reductive amination, MeOH, RT, 16h",
896
+ "reaction_class": "reductive_amination",
897
+ },
898
+ {
899
+ "name": "N-alkylation",
900
+ "description": "Disconnect N-C(sp3) bond → amine + alkyl halide",
901
+ "product_smarts": "[N:1]-[CH2:2]",
902
+ "reagents": ["K2CO3"],
903
+ "conditions": "N-alkylation, DMF, 60°C, 12h",
904
+ "reaction_class": "alkylation",
905
+ },
906
+ {
907
+ "name": "Ether formation (Williamson)",
908
+ "description": "Disconnect C-O-C ether bond → alcohol + alkyl halide",
909
+ "product_smarts": "[C:1]-[O:2]-[C:3]",
910
+ "reagents": ["NaH"],
911
+ "conditions": "Williamson ether synthesis, THF, 0°C→RT, 6h",
912
+ "reaction_class": "etherification",
913
+ },
914
+ {
915
+ "name": "Sulfonamide formation",
916
+ "description": "Disconnect S(=O)(=O)-N bond → sulfonyl chloride + amine",
917
+ "product_smarts": "[S:1](=[O:2])(=[O:3])-[N:4]",
918
+ "reagents": ["Et3N"],
919
+ "conditions": "Sulfonamide coupling, DCM, 0°C→RT, 4h",
920
+ "reaction_class": "sulfonamide_formation",
921
+ },
922
+ {
923
+ "name": "Urea formation",
924
+ "description": "Disconnect N-C(=O)-N urea → isocyanate + amine",
925
+ "product_smarts": "[N:1]-[C:2](=[O:3])-[N:4]",
926
+ "reagents": ["CDI or triphosgene"],
927
+ "conditions": "Urea formation, DCM, RT, 12h",
928
+ "reaction_class": "urea_formation",
929
+ },
930
+ ]
931
+
932
+
933
@registry.register(
    name="chemistry.retrosynthesis",
    description="Plan retrosynthetic routes for a target molecule — uses IBM RXN API if configured, otherwise heuristic SMARTS-based disconnections",
    category="chemistry",
    parameters={
        "smiles": "SMILES string of the target molecule",
        "max_steps": "Maximum retrosynthetic steps (default 3)",
    },
    usage_guide="You want to plan a synthetic route to make a target compound. Use for synthesis feasibility assessment, identifying key disconnections, and suggesting reagents/conditions. Provides heuristic retrosynthetic analysis using common transforms; optionally uses IBM RXN API if an API key is configured.",
)
def retrosynthesis(smiles: str, max_steps: int = 3, **kwargs) -> dict:
    """Propose retrosynthetic routes for a target molecule.

    When the ``_session`` keyword carries a config with ``api.ibm_rxn_key``
    set, the IBM RXN backend is queried first. Any missing key, falsy
    result, or result containing ``"error"`` leads to the local
    SMARTS-based heuristic instead.

    Args:
        smiles: Target molecule, passed through ``_extract_smiles`` first.
        max_steps: Step limit forwarded to whichever backend is used.
        **kwargs: ``_session`` (optional) is read for configuration.

    Returns:
        The result dict of the backend that handled the request.
    """
    target = _extract_smiles(smiles)

    # The API key lives on the session config, when a session is supplied.
    session = kwargs.get("_session")
    has_config = bool(session) and hasattr(session, "config")
    ibm_key = session.config.get("api.ibm_rxn_key", None) if has_config else None

    if ibm_key:
        remote = _retrosynthesis_ibm_rxn(target, max_steps, ibm_key)
        if remote and "error" not in remote:
            return remote

    # No key, or the remote call did not produce a usable result.
    return _retrosynthesis_heuristic(target, max_steps)
965
+
966
+
967
def _retrosynthesis_ibm_rxn(smiles: str, max_steps: int, api_key: str) -> dict:
    """Request a retrosynthesis prediction from the IBM RXN API.

    Submits the target, then polls the results endpoint until the job
    reports ``SUCCESS`` or ``FAILED`` or the polling budget is used up.

    Args:
        smiles: Target molecule SMILES.
        max_steps: Sent to the API as ``maxSteps``.
        api_key: Value placed in the ``Authorization`` header.

    Returns:
        The parsed routes (see ``_parse_ibm_rxn_results``) on success,
        otherwise a dict whose ``error`` and ``summary`` carry the same
        failure message. This function reports failures through the return
        value so the caller can fall back to the heuristic planner.
    """
    import logging
    import time

    def _failure(message: str) -> dict:
        return {"error": message, "summary": message}

    logger = logging.getLogger("ct.tools.chemistry")
    base_url = "https://rxn.res.ibm.com/rxn/api/api/v1"
    headers = {
        "Authorization": api_key,
        "Content-Type": "application/json",
    }

    # Submit retrosynthesis prediction
    resp, error = request(
        "POST",
        f"{base_url}/retrosynthesis/predict",
        json={"content": smiles, "maxSteps": max_steps},
        headers=headers,
        timeout=30,
        retries=2,
        raise_for_status=False,
    )
    if error:
        return _failure(f"IBM RXN API request failed: {error}")
    if resp.status_code != 200:
        return _failure(f"IBM RXN API returned status {resp.status_code}")
    try:
        prediction_id = resp.json().get("prediction_id")
    except Exception:
        return _failure("IBM RXN API returned invalid JSON")
    if not prediction_id:
        return _failure("IBM RXN API did not return a prediction ID")

    # Poll for results: 12 attempts spaced 5 s apart, i.e. about 60 s of
    # waiting plus however long the individual poll requests take.
    for _ in range(12):
        time.sleep(5)
        poll_resp, poll_error = request(
            "GET",
            f"{base_url}/retrosynthesis/results/{prediction_id}",
            headers=headers,
            timeout=15,
            retries=1,
            raise_for_status=False,
        )
        if poll_error:
            logger.debug("IBM RXN poll attempt failed: %s", poll_error)
            continue
        if poll_resp.status_code != 200:
            continue
        try:
            data = poll_resp.json()
        except Exception:
            continue
        if not isinstance(data, dict):
            # Valid JSON but not an object: treat like an unreadable poll
            # rather than crashing on `.get`.
            continue
        status = data.get("status", "")
        if status == "SUCCESS":
            return _parse_ibm_rxn_results(smiles, data)
        if status == "FAILED":
            return _failure("IBM RXN retrosynthesis failed")

    return _failure("IBM RXN API timed out waiting for results")
1025
+ def _parse_ibm_rxn_results(smiles: str, data: dict) -> dict:
1026
+ """Parse IBM RXN API retrosynthesis results into standard format."""
1027
+ routes = []
1028
+ retro_routes = data.get("retrosynthetic_paths", [])
1029
+
1030
+ for i, route in enumerate(retro_routes):
1031
+ steps = []
1032
+ for step in route.get("steps", []):
1033
+ steps.append({
1034
+ "reaction_smiles": step.get("reaction", ""),
1035
+ "reactants": step.get("reactants", []),
1036
+ "confidence": step.get("confidence", 0.0),
1037
+ })
1038
+ routes.append({
1039
+ "route_id": i + 1,
1040
+ "n_steps": len(steps),
1041
+ "steps": steps,
1042
+ "confidence": route.get("confidence", 0.0),
1043
+ })
1044
+
1045
+ routes.sort(key=lambda r: r["n_steps"])
1046
+ shortest = routes[0]["n_steps"] if routes else 0
1047
+
1048
+ return {
1049
+ "summary": f"Retrosynthesis for {smiles}: {len(routes)} routes found via IBM RXN, "
1050
+ f"shortest is {shortest} steps",
1051
+ "target": smiles,
1052
+ "source": "ibm_rxn",
1053
+ "n_routes": len(routes),
1054
+ "routes": routes,
1055
+ }
1056
+
1057
+
1058
def _retrosynthesis_heuristic(smiles: str, max_steps: int) -> dict:
    """Suggest disconnections for a target using SMARTS matching and BRICS.

    Every entry of ``_RETRO_TRANSFORMS`` whose ``product_smarts`` matches
    the molecule becomes a single-step route. If BRICS decomposition yields
    more than one fragment, one extra route listing those fragments is
    appended.

    Args:
        smiles: Target molecule SMILES.
        max_steps: Accepted for signature parity with the IBM RXN backend;
            the heuristic does not currently use it.

    Returns:
        A dict with ``error``/``summary`` for unparseable SMILES; otherwise a
        dict with ``summary``, ``target``, ``source`` (``"heuristic"``),
        ``n_routes``, ``routes`` and ``disconnections``, plus ``formula``,
        ``molecular_weight`` and ``brics_fragments`` when at least one
        disconnection was found.
    """
    from rdkit import Chem
    from rdkit.Chem import Descriptors, rdMolDescriptors

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {"error": f"Invalid SMILES: {smiles}", "summary": f"Could not parse SMILES: {smiles}"}

    # Find applicable disconnections
    disconnections = []
    for transform in _RETRO_TRANSFORMS:
        pattern = Chem.MolFromSmarts(transform["product_smarts"])
        if pattern is None:
            continue
        matches = mol.GetSubstructMatches(pattern)
        if not matches:
            continue
        disconnections.append({
            "transform_name": transform["name"],
            "description": transform["description"],
            "n_sites": len(matches),
            "atom_indices": [list(m) for m in matches[:3]],  # cap at 3
            "reagents": transform["reagents"],
            "conditions": transform["conditions"],
            "reaction_class": transform["reaction_class"],
        })

    if not disconnections:
        return {
            "summary": f"[HEURISTIC FALLBACK] Retrosynthesis for {smiles}: no heuristic disconnections found — "
                       "molecule may require specialized chemistry. Configure api.ibm_rxn_key for AI-powered retrosynthesis.",
            "target": smiles,
            "source": "heuristic",
            "n_routes": 0,
            "routes": [],
            "disconnections": [],
        }

    # Each disconnection is reported as its own single-step route; deeper
    # fragmentation is only surfaced through the BRICS route below.
    routes = [
        {
            "route_id": route_id,
            "strategy": disc["transform_name"],
            "n_steps": 1,
            "steps": [{
                "step": 1,
                "transform": disc["transform_name"],
                "description": disc["description"],
                "reagents": disc["reagents"],
                "conditions": disc["conditions"],
                "n_disconnection_sites": disc["n_sites"],
            }],
            "reaction_class": disc["reaction_class"],
        }
        for route_id, disc in enumerate(disconnections, start=1)
    ]

    # BRICS decomposition for deeper analysis (best effort).
    try:
        from rdkit.Chem import BRICS
        # The fragments come back unordered; sort before truncating so both
        # the order and the choice of the 10 reported fragments are stable.
        brics_fragments = sorted(BRICS.BRICSDecompose(mol, returnMols=False))[:10]
    except Exception:
        brics_fragments = []

    # Add a BRICS-based route only if the molecule actually fragments.
    if len(brics_fragments) > 1:
        routes.append({
            "route_id": len(routes) + 1,
            "strategy": "BRICS full decomposition",
            "n_steps": len(brics_fragments),
            "steps": [
                {
                    "step": number,
                    "fragment": frag,
                    "description": f"BRICS fragment {number}",
                }
                for number, frag in enumerate(brics_fragments, start=1)
            ],
            "reaction_class": "brics",
        })

    # Molecular properties for context
    mw = Descriptors.MolWt(mol)
    formula = rdMolDescriptors.CalcMolFormula(mol)

    # Shortest routes first; the sort is stable, so ties keep insertion order.
    routes.sort(key=lambda r: r["n_steps"])

    return {
        "summary": f"[HEURISTIC FALLBACK] Retrosynthesis for {formula} ({smiles}): {len(routes)} routes found "
                   f"via SMARTS-based disconnection (not AI-predicted). Configure api.ibm_rxn_key for more accurate routes.",
        "target": smiles,
        "formula": formula,
        "molecular_weight": round(mw, 1),
        "source": "heuristic",
        "n_routes": len(routes),
        "routes": routes,
        "disconnections": disconnections,
        "brics_fragments": brics_fragments,
    }
1163
+
1164
+
1165
+ # ─── Pharmacophore feature SMARTS definitions ──────────────────
1166
+ _PHARMACOPHORE_FEATURES = {
1167
+ "HBD": {
1168
+ "name": "Hydrogen Bond Donor",
1169
+ "smarts": ["[#7!H0&!$(N-[SX4](=O)(=O)[CX4](F)(F)F)]", "[#8!H0&!$([OH][C,S,P]=O)]", "[#16!H0]"],
1170
+ },
1171
+ "HBA": {
1172
+ "name": "Hydrogen Bond Acceptor",
1173
+ "smarts": ["[#7&!$([nH])&!$(N-N=O)]", "[$([O])&!$([OX2](C)C=O)]", "[#16&X2]"],
1174
+ },
1175
+ "Aromatic": {
1176
+ "name": "Aromatic Ring",
1177
+ "smarts": ["a1aaaaa1", "a1aaaa1"],
1178
+ },
1179
+ "Hydrophobic": {
1180
+ "name": "Hydrophobic",
1181
+ "smarts": ["[CH2X4,CH1X4,CH0X4]", "[$([cX3](:*):*)&!$([cX3](-[OH])-[OH])]"],
1182
+ },
1183
+ "PosIonizable": {
1184
+ "name": "Positive Ionizable",
1185
+ "smarts": ["[+,+2,+3,+4]", "[$([NX3&!$([NX3]-O)](-C)(-C)-C)]", "[$(n1cc[nH]c1)]"],
1186
+ },
1187
+ "NegIonizable": {
1188
+ "name": "Negative Ionizable",
1189
+ "smarts": ["[-,-2,-3,-4]", "[$([OH]-[CX3]=[OX1])]", "[$([OH]-[SX4](=[OX1])(=[OX1]))]"],
1190
+ },
1191
+ }
1192
+
1193
+
1194
@registry.register(
    name="chemistry.pharmacophore",
    description="Generate a pharmacophore model from a set of active compounds identifying common molecular features",
    category="chemistry",
    parameters={
        "smiles_list": "List of SMILES strings for active compounds",
        "method": "Analysis method: 'common_features' (default) or 'fingerprints'",
    },
    usage_guide="You have a set of active compounds and want to identify the common pharmacophoric features that drive activity. Use for understanding SAR, virtual screening, and lead optimization. Identifies shared HBD, HBA, aromatic, hydrophobic, and ionizable features across the compound set.",
)
def pharmacophore(smiles_list: list = None, method: str = "common_features", **kwargs) -> dict:
    """Build a feature-count pharmacophore summary for a set of compounds.

    For every parseable molecule, the SMARTS patterns in
    ``_PHARMACOPHORE_FEATURES`` are matched and the matches counted per
    feature type. A feature type is "common" when every molecule has at
    least one match. A consensus score combines the fraction of common
    feature types (weight 0.6) with how similar the per-molecule counts are
    (weight 0.4).

    Args:
        smiles_list: At least two SMILES strings (each item is passed
            through ``_extract_smiles`` before parsing).
        method: ``"fingerprints"`` or ``"both"`` additionally computes the
            mean pairwise Tanimoto similarity of Gobbi 2D pharmacophore
            fingerprints; any other value skips that step.
        **kwargs: Accepted and ignored.

    Returns:
        On success, a dict with ``summary``, ``n_compounds``, ``n_valid``,
        ``common_features``, ``feature_distribution``,
        ``per_molecule_features``, ``consensus_score`` and ``method``, plus
        ``invalid_smiles`` and ``pharmacophore_fp_similarity`` when
        applicable. On invalid input, a dict with ``error`` and ``summary``.
    """
    from itertools import combinations

    from rdkit import Chem, DataStructs

    # A bare string would otherwise be iterated character by character and
    # yield a meaningless model built from single-atom "molecules".
    if isinstance(smiles_list, str):
        return {
            "error": "smiles_list must be a list of SMILES strings, not a single string",
            "summary": "Pharmacophore analysis requires a list of at least 2 compounds",
        }

    if not smiles_list or len(smiles_list) < 2:
        return {
            "error": "Need at least 2 SMILES strings",
            "summary": "Pharmacophore analysis requires at least 2 compounds",
        }

    # Resolve any drug names to SMILES
    smiles_list = [_extract_smiles(smi) for smi in smiles_list]

    # Parse molecules, keeping track of the inputs that fail to parse
    mols = []
    valid_smiles = []
    invalid = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            invalid.append(smi)
        else:
            mols.append(mol)
            valid_smiles.append(smi)

    if len(mols) < 2:
        return {
            "error": f"Only {len(mols)} valid molecule(s) — need at least 2",
            "summary": "Insufficient valid molecules for pharmacophore analysis",
            "invalid_smiles": invalid,
        }

    # Counts only the molecules that parsed; invalid inputs are excluded.
    n_compounds = len(mols)

    # Compile every SMARTS once instead of once per molecule; patterns that
    # fail to compile are skipped.
    compiled_patterns = {}
    for feat_type, feat_def in _PHARMACOPHORE_FEATURES.items():
        patterns = (Chem.MolFromSmarts(smarts_str) for smarts_str in feat_def["smarts"])
        compiled_patterns[feat_type] = [p for p in patterns if p is not None]

    # Step 1: Detect pharmacophore features per molecule
    per_molecule_features = [
        {
            "smiles": smi,
            "features": {
                feat_type: sum(len(mol.GetSubstructMatches(p)) for p in patterns)
                for feat_type, patterns in compiled_patterns.items()
            },
        }
        for smi, mol in zip(valid_smiles, mols)
    ]

    # Step 2: Identify common features (present in all molecules)
    feature_types = list(_PHARMACOPHORE_FEATURES.keys())
    counts_by_type = {
        feat_type: [mf["features"][feat_type] for mf in per_molecule_features]
        for feat_type in feature_types
    }
    common_features = []
    feature_distribution = {}

    for feat_type in feature_types:
        counts = counts_by_type[feat_type]
        feat_name = _PHARMACOPHORE_FEATURES[feat_type]["name"]
        min_count = min(counts)
        max_count = max(counts)
        present_in = sum(1 for c in counts if c > 0)

        feature_distribution[feat_type] = {
            "name": feat_name,
            "min_count": min_count,
            "max_count": max_count,
            "mean_count": round(sum(counts) / len(counts), 1),
            "present_in_n": present_in,
            "frequency": round(present_in / n_compounds, 3),
        }

        # min_count > 0 means every molecule has the feature.
        if min_count > 0:
            common_features.append({
                "type": feat_type,
                "name": feat_name,
                "min_count": min_count,
                "conserved": min_count == max_count,
                "frequency": 1.0,
            })

    # Step 3: 2D pharmacophore fingerprints (if method includes fingerprints)
    pharm_fp_similarity = None
    if method in ("fingerprints", "both"):
        try:
            from rdkit.Chem.Pharm2D import Gobbi_Pharm2D, Generate

            factory = Gobbi_Pharm2D.factory
            fps = [Generate.Gen2DFingerprint(mol, factory) for mol in mols]

            # Mean pairwise Tanimoto similarity over all unordered pairs
            pair_sims = [DataStructs.TanimotoSimilarity(a, b) for a, b in combinations(fps, 2)]
            pharm_fp_similarity = round(sum(pair_sims) / len(pair_sims), 3) if pair_sims else 0.0
        except Exception:
            # Fingerprints are optional; leave the similarity unset if the
            # Pharm2D machinery is unavailable or fails on these molecules.
            pharm_fp_similarity = None

    # Step 4: Consensus score
    # Based on: fraction of features that are common + consistency of counts
    common_frac = len(common_features) / len(feature_types) if feature_types else 0.0

    # Consistency: 1 - (range / max) per feature type that occurs at all,
    # so identical counts across molecules score 1.0.
    variance_scores = [
        1.0 - (max(counts) - min(counts)) / max(counts)
        for counts in counts_by_type.values()
        if max(counts) > 0
    ]
    consistency = sum(variance_scores) / len(variance_scores) if variance_scores else 0.0
    consensus_score = round((common_frac * 0.6 + consistency * 0.4), 3)

    # Build summary
    headline = f"Pharmacophore from {n_compounds} compounds: {len(common_features)} common features"
    common_desc = [f"{cf['min_count']} {cf['name']}" for cf in common_features]
    if common_desc:
        headline += f" ({', '.join(common_desc)})"
    summary_parts = [headline, f"Consensus score: {consensus_score}"]
    if pharm_fp_similarity is not None:
        summary_parts.append(f"Mean pharmacophore fingerprint similarity: {pharm_fp_similarity}")

    result = {
        "summary": "\n".join(summary_parts),
        "n_compounds": n_compounds,
        "n_valid": len(mols),
        "common_features": common_features,
        "feature_distribution": feature_distribution,
        "per_molecule_features": per_molecule_features,
        "consensus_score": consensus_score,
        "method": method,
    }

    if invalid:
        result["invalid_smiles"] = invalid
    if pharm_fp_similarity is not None:
        result["pharmacophore_fp_similarity"] = pharm_fp_similarity

    return result