cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,3722 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LLM-assisted molecule construction via IUPAC name manipulation.
|
|
3
|
+
|
|
4
|
+
Provides composable tools designed for use with an LLM orchestrator.
|
|
5
|
+
The LLM translates natural language descriptions of molecules into tool calls
|
|
6
|
+
that manipulate IUPAC names, which are then validated and converted to
|
|
7
|
+
structures.
|
|
8
|
+
|
|
9
|
+
Architecture::
|
|
10
|
+
|
|
11
|
+
NL description --> LLM orchestrator --> tool calls --> IUPAC name --> CDXML
|
|
12
|
+
|
|
13
|
+
The key insight: IUPAC names are a lossless text representation of molecules.
|
|
14
|
+
Instead of having an LLM generate SMILES or manipulate CDXML directly, we let
|
|
15
|
+
the LLM do "name surgery" — assembling, modifying, and validating IUPAC names
|
|
16
|
+
using grounded tools. This avoids hallucinated SMILES while leveraging LLMs'
|
|
17
|
+
strength with natural language.
|
|
18
|
+
|
|
19
|
+
Layer 2 — Name manipulation tools:
|
|
20
|
+
resolve_to_smiles — Resolve any chemical identifier to SMILES
|
|
21
|
+
get_prefix_form — Get IUPAC substituent prefix for a group
|
|
22
|
+
assemble_name — Build IUPAC name from parent + substituents
|
|
23
|
+
modify_name — Add/swap/remove substituents in an existing name
|
|
24
|
+
validate_name — Check if an IUPAC name resolves to a valid molecule
|
|
25
|
+
name_to_structure — Convert validated name to CDXML
|
|
26
|
+
enumerate_names — List alternative IUPAC name forms for a molecule
|
|
27
|
+
|
|
28
|
+
Layer 3 — Graph manipulation tools (for structural transformations):
|
|
29
|
+
list_reactions — List available named reaction templates
|
|
30
|
+
apply_reaction — Apply a reaction template (Suzuki, Buchwald, etc.)
|
|
31
|
+
deprotect — Remove protecting groups (Boc, Fmoc, Cbz, etc.)
|
|
32
|
+
|
|
33
|
+
Meta:
|
|
34
|
+
get_tool_definitions — Export all tool schemas for LLM function calling
|
|
35
|
+
|
|
36
|
+
Usage (Python)::
|
|
37
|
+
|
|
38
|
+
from cdxml_toolkit.naming.mol_builder import (
|
|
39
|
+
get_prefix_form, assemble_name, validate_name, name_to_structure,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
pf = get_prefix_form("CF3")
|
|
43
|
+
# {'prefix': 'trifluoromethyl', 'source': 'table', 'ok': True}
|
|
44
|
+
|
|
45
|
+
result = assemble_name("pyridine", [
|
|
46
|
+
{"locant": "2", "prefix": "chloro"},
|
|
47
|
+
{"locant": "3", "prefix": pf["prefix"]},
|
|
48
|
+
])
|
|
49
|
+
# {'name': '2-chloro-3-(trifluoromethyl)pyridine', 'valid': True,
|
|
50
|
+
# 'smiles': '...', 'ok': True}
|
|
51
|
+
|
|
52
|
+
cdxml = name_to_structure(result["name"])
|
|
53
|
+
# {'cdxml': '<?xml ...', 'ok': True}
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
import json
|
|
57
|
+
import logging
|
|
58
|
+
import os
|
|
59
|
+
import re
|
|
60
|
+
from typing import Any, Dict, List, Optional
|
|
61
|
+
|
|
62
|
+
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Lazy singletons — avoid import-time cost for heavy dependencies
# ---------------------------------------------------------------------------

# ChemScriptBridge created by the first successful _get_cs() call.
_cs_instance = None
# Flipped to True after a failed construction so later calls do not retry.
_cs_failed = False


def _get_cs():
    """Return the shared ChemScriptBridge, creating it on first use.

    The bridge module is imported inside the function so that importing
    this module does not pull it in.  If the import or the constructor
    raises, the failure is logged at debug level and remembered: that call
    and every later call return ``None`` without trying again.
    """
    global _cs_instance, _cs_failed
    if _cs_failed:
        return None
    if _cs_instance is None:
        try:
            from cdxml_toolkit.chemdraw.chemscript_bridge import ChemScriptBridge
            _cs_instance = ChemScriptBridge()
        except Exception as exc:
            logger.debug("ChemScript unavailable: %s", exc)
            _cs_failed = True
            return None
    return _cs_instance
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _rdkit_canonical(smiles: str) -> Optional[str]:
    """Round-trip *smiles* through RDKit and return the SMILES it writes.

    Returns ``None`` when RDKit cannot parse the input.  RDKit is imported
    inside the function so the module can be imported without it.
    """
    from rdkit import Chem

    parsed = Chem.MolFromSmiles(smiles)
    if not parsed:
        return None
    return Chem.MolToSmiles(parsed)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# ---------------------------------------------------------------------------
|
|
97
|
+
# Prefix lookup table — covers common med-chem substituents
|
|
98
|
+
# ---------------------------------------------------------------------------
|
|
99
|
+
|
|
100
|
+
# Maps group identifiers (abbreviations, names, formulae) to IUPAC prefix
|
|
101
|
+
# form suitable for direct insertion into a substituted name.
|
|
102
|
+
# The table is checked case-insensitively.
|
|
103
|
+
_PREFIX_TABLE: Dict[str, str] = {
    # Keys are stored in lower case: get_prefix_form() lower-cases the
    # stripped input before looking it up here.

    # --- Halogens ---
    "f": "fluoro",
    "cl": "chloro",
    "br": "bromo",
    "i": "iodo",
    "fluorine": "fluoro",
    "chlorine": "chloro",
    "bromine": "bromo",
    "iodine": "iodo",

    # --- Oxygen ---
    "oh": "hydroxy",
    "ome": "methoxy",
    "oet": "ethoxy",
    "oac": "acetyloxy",
    "obn": "benzyloxy",
    "oph": "phenoxy",
    "methoxy": "methoxy",
    "ethoxy": "ethoxy",
    "hydroxy": "hydroxy",
    "ocf3": "trifluoromethoxy",
    "oipr": "isopropoxy",

    # --- Nitrogen ---
    "nh2": "amino",
    "nhme": "methylamino",
    "nme2": "dimethylamino",
    "nhac": "acetamido",
    "no2": "nitro",
    "n3": "azido",
    "amino": "amino",
    "nitro": "nitro",
    "azido": "azido",

    # --- Simple carbon ---
    "me": "methyl",
    "et": "ethyl",
    "pr": "propyl",
    "npr": "propyl",
    "ipr": "propan-2-yl",
    "bu": "butyl",
    "nbu": "butyl",
    "tbu": "tert-butyl",
    "sbu": "sec-butyl",
    "ibu": "isobutyl",
    "methyl": "methyl",
    "ethyl": "ethyl",
    "vinyl": "ethenyl",
    "allyl": "prop-2-en-1-yl",
    "isopropyl": "propan-2-yl",

    # --- Cycloalkyl (both the -yl and the -ane spelling are accepted) ---
    "cyclopropyl": "cyclopropyl",
    "cyclobutyl": "cyclobutyl",
    "cyclopentyl": "cyclopentyl",
    "cyclohexyl": "cyclohexyl",
    "cyclopropane": "cyclopropyl",
    "cyclobutane": "cyclobutyl",
    "cyclopentane": "cyclopentyl",
    "cyclohexane": "cyclohexyl",

    # --- Aryl ---
    "ph": "phenyl",
    "bn": "benzyl",
    "bz": "benzoyl",
    "phenyl": "phenyl",
    "benzyl": "benzyl",

    # --- Functional groups (abbreviations) ---
    # Several spellings carry a leading "-" so that inputs written as a
    # dangling group (e.g. "-COOH") hit the table directly.
    "cn": "cyano",
    "cho": "formyl",
    "cooh": "carboxy",
    "co2h": "carboxy",
    "-cooh": "carboxy",
    "-co2h": "carboxy",
    "come": "acetyl",
    "ac": "acetyl",
    "conh2": "carbamoyl",
    "-conh2": "carbamoyl",
    "coome": "methoxycarbonyl",
    "co2me": "methoxycarbonyl",
    "meo2c": "methoxycarbonyl",
    "meoco": "methoxycarbonyl",
    "cooch3": "methoxycarbonyl",
    "-cooch3": "methoxycarbonyl",
    "-coome": "methoxycarbonyl",
    "-co2me": "methoxycarbonyl",
    "cooet": "ethoxycarbonyl",
    "co2et": "ethoxycarbonyl",
    "eto2c": "ethoxycarbonyl",
    "etoco": "ethoxycarbonyl",
    "cooipr": "isopropoxycarbonyl",
    "co2ipr": "isopropoxycarbonyl",
    "cootbu": "tert-butoxycarbonyl",
    "co2tbu": "tert-butoxycarbonyl",
    "-cho": "formyl",

    # --- Functional group descriptors (natural language → prefix) ---
    "methyl ester": "methoxycarbonyl",
    "me ester": "methoxycarbonyl",
    "ome ester": "methoxycarbonyl",
    "ethyl ester": "ethoxycarbonyl",
    "et ester": "ethoxycarbonyl",
    "isopropyl ester": "isopropoxycarbonyl",
    "tert-butyl ester": "tert-butoxycarbonyl",
    "aldehyde": "formyl",
    "ketone": "oxo",
    "carboxylic acid": "carboxy",
    "nitrile": "cyano",
    "amide": "carbamoyl",
    "primary amide": "carbamoyl",
    "alcohol": "hydroxy",
    "hydroxyl": "hydroxy",
    "thiol": "sulfanyl",
    "mercaptan": "sulfanyl",
    "sulfonic acid": "sulfo",
    "sulfonamide": "sulfamoyl",

    # --- Fluorocarbons ---
    "cf3": "trifluoromethyl",
    "chf2": "difluoromethyl",
    "ccl3": "trichloromethyl",

    # --- Sulphur ---
    "sh": "sulfanyl",
    "sme": "methylsulfanyl",
    "so2me": "methanesulfonyl",
    "ms": "methanesulfonyl",
    "so2nh2": "sulfamoyl",

    # --- Heterocycles as substituent prefix ---
    # Some entries fix an attachment position (e.g. "piperidin-1-yl",
    # "thiophen-2-yl"); others ("pyridinyl", "imidazolyl", ...) carry none.
    "morpholine": "morpholino",
    "morpholinyl": "morpholino",
    "morpholino": "morpholino",
    "piperidine": "piperidin-1-yl",
    "piperidinyl": "piperidin-1-yl",
    "piperazine": "piperazin-1-yl",
    "piperazinyl": "piperazin-1-yl",
    "pyrrolidine": "pyrrolidin-1-yl",
    "pyrrolidinyl": "pyrrolidin-1-yl",
    "pyridine": "pyridinyl",
    "pyridinyl": "pyridinyl",
    "pyrimidine": "pyrimidinyl",
    "thiophene": "thiophen-2-yl",
    "thienyl": "thiophen-2-yl",
    "furan": "furan-2-yl",
    "furyl": "furan-2-yl",
    "pyrrole": "pyrrol-1-yl",
    "imidazole": "imidazolyl",
    "imidazolyl": "imidazolyl",
    "thiazole": "thiazolyl",
    "thiazolyl": "thiazolyl",
    "oxazole": "oxazolyl",
    "oxazolyl": "oxazolyl",
    "indole": "indolyl",
}

# IUPAC multiplying prefixes for identical substituents, keyed by count.
_MULTIPLIERS = {
    2: "di",
    3: "tri",
    4: "tetra",
    5: "penta",
    6: "hexa",
}
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
# ---------------------------------------------------------------------------
|
|
205
|
+
# Internal helpers
|
|
206
|
+
# ---------------------------------------------------------------------------
|
|
207
|
+
|
|
208
|
+
def _resolve_query(query: str, use_network: bool = True) -> Optional[Dict]:
    """Resolve a chemical identifier to canonical SMILES via a tier chain.

    The tiers are tried in order and the first one that yields a SMILES
    which ``_rdkit_canonical`` accepts wins:

    1.  Reagent DB (``source="reagent_db"``)
    2.  Condensed formula (``source="formula"``)
    3.  ChemScript name → SMILES (``source="chemscript"``), only when
        ``_get_cs()`` returns a bridge
    3b. OPSIN via ``py2opsin`` (``source="opsin"``), only when
        ``ensure_java_on_path()`` reports success
    4.  PubChem (``source="pubchem"``), only when *use_network* is true

    Every tier is best-effort: an exception inside a tier (including a
    missing optional dependency) is logged at debug level and the chain
    moves on to the next tier.

    Args:
        query: Identifier to resolve.  Surrounding whitespace is ignored.
            Anything that is not a non-blank string resolves to ``None``.
        use_network: Allow the PubChem tier.

    Returns:
        ``{"smiles": <canonical SMILES>, "source": <tier name>}`` or
        ``None`` if no tier produced a usable structure.
    """
    clean = query.strip() if isinstance(query, str) else ""
    if not clean:
        return None

    def _accept(smi: Any, source: str) -> Optional[Dict]:
        # Canonicalise once; a falsy or unparsable SMILES is a miss.
        canon = _rdkit_canonical(smi) if smi else None
        return {"smiles": canon, "source": source} if canon else None

    # Tier 1: Reagent DB
    try:
        from cdxml_toolkit.resolve.reagent_db import get_reagent_db
        entry = get_reagent_db().entry_for_name(clean.lower())
        if entry:
            smi = entry.get("smiles")
            if isinstance(smi, list):
                # Only the first listed SMILES is considered.
                smi = smi[0] if smi else None
            hit = _accept(smi, "reagent_db")
            if hit:
                return hit
    except Exception as exc:
        logger.debug("Reagent DB lookup failed for %r: %s", clean, exc)

    # Tier 2: Condensed formula
    try:
        from cdxml_toolkit.resolve.condensed_formula import resolve_condensed_formula
        hit = _accept(resolve_condensed_formula(clean), "formula")
        if hit:
            return hit
    except Exception as exc:
        logger.debug("Condensed-formula resolution failed for %r: %s", clean, exc)

    # Tier 3: ChemScript (name → SMILES)
    cs = _get_cs()
    if cs is not None:
        try:
            hit = _accept(
                cs.write_data(clean, "smiles", source_format="name"),
                "chemscript",
            )
            if hit:
                return hit
        except Exception as exc:
            logger.debug("ChemScript resolution failed for %r: %s", clean, exc)

    # Tier 3b: OPSIN (offline IUPAC name → SMILES, bundled JRE)
    try:
        from cdxml_toolkit.resolve.jre_manager import ensure_java_on_path
        if ensure_java_on_path():
            import warnings
            from py2opsin import py2opsin as _py2opsin
            with warnings.catch_warnings():
                # RuntimeWarnings raised during the call are suppressed.
                warnings.simplefilter("ignore", RuntimeWarning)
                smi = _py2opsin(clean)
            hit = _accept(smi, "opsin")
            if hit:
                return hit
    except Exception as exc:
        # Also covers a missing py2opsin / Java (ImportError, FileNotFoundError).
        logger.debug("OPSIN resolution failed for %r: %s", clean, exc)

    # Tier 4: PubChem (online)
    if use_network:
        try:
            from cdxml_toolkit.resolve.cas_resolver import resolve_name_to_smiles
            hit = _accept(resolve_name_to_smiles(clean), "pubchem")
            if hit:
                return hit
        except Exception as exc:
            logger.debug("PubChem resolution failed for %r: %s", clean, exc)

    return None
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def _name_to_smiles_cs(name: str) -> Optional[str]:
    """Convert *name* to SMILES with ChemScript, canonicalised by RDKit.

    Returns ``None`` when no ChemScript bridge is available, when the
    bridge returns nothing, or when the conversion raises.
    """
    bridge = _get_cs()
    if bridge is not None:
        try:
            raw = bridge.write_data(name, "smiles", source_format="name")
            if raw:
                return _rdkit_canonical(raw)
        except Exception:
            pass
    return None
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def _smiles_to_name_cs(smiles: str) -> Optional[str]:
    """Ask ChemScript for the name of the structure given as *smiles*.

    Returns whatever the bridge's ``get_name`` returns, or ``None`` when
    no bridge is available or the call raises.
    """
    bridge = _get_cs()
    generated = None
    if bridge is not None:
        try:
            generated = bridge.get_name(smiles)
        except Exception:
            generated = None
    return generated
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def _is_complex_prefix(prefix: str) -> bool:
    """Tell whether *prefix* must be wrapped in parentheses in a name.

    A prefix counts as complex when it contains a hyphen *and* at least
    one digit or comma (e.g. ``"propan-2-yl"``, ``"1,1-difluoroethyl"``).
    A hyphen alone (``"tert-butyl"``) is not enough.  A prefix that already
    starts with ``"("`` and ends with ``")"`` is never reported as complex.
    """
    # Already parenthesised
    if prefix.startswith("(") and prefix.endswith(")"):
        return False
    # No hyphen means no internal locant structure to protect.
    if "-" not in prefix:
        return False
    return re.search(r"[\d,]", prefix) is not None
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def _locant_sort_key(loc: str):
    """Build a sort key that orders numeric locants before all others.

    A locant starting with digits yields ``(0, <int value>, <remainder>)``;
    anything else yields ``(1, 0, <locant>)``.  Numeric locants therefore
    sort by value (``"2"`` before ``"10"``) and precede locants such as
    ``"N"``.
    """
    parsed = re.match(r"(\d+)(.*)", loc)
    if parsed is None:
        return (1, 0, loc)
    number, remainder = parsed.groups()
    return (0, int(number), remainder)
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _prefix_alpha_key(prefix: str) -> str:
    """Alphabetisation key for a substituent prefix.

    A leading run of digits/commas followed by a hyphen (the prefix's own
    locants) is dropped and the rest is lower-cased.  Nothing else is
    removed: multipliers and descriptors such as ``tert-`` stay in the key.

    ``"1,1-difluoroethyl"`` → ``"difluoroethyl"``
    ``"tert-butyl"``        → ``"tert-butyl"``
    """
    locants = re.match(r"^[\d,]+-", prefix)
    remainder = prefix[locants.end():] if locants else prefix
    return remainder.lower()
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def _try_validate(name: str, use_network: bool = True) -> Optional[str]:
    """Try to resolve *name* to canonical SMILES.

    Two resolvers are tried, in order:

    1. ChemScript, through ``_name_to_smiles_cs``.
    2. PubChem, through ``resolve_name_to_smiles``, only when *use_network*
       is true.  The result is canonicalised with ``_rdkit_canonical``.

    Args:
        name: Chemical name to check.
        use_network: Allow the PubChem fallback.

    Returns:
        Canonical SMILES, or ``None`` when neither resolver produced one.
        A PubChem failure (including a missing resolver module) is logged
        at debug level and treated as "not resolved".
    """
    # ChemScript (most reliable for IUPAC names)
    smi = _name_to_smiles_cs(name)
    if smi:
        return smi

    if not use_network:
        return None

    # PubChem fallback (for common names)
    try:
        from cdxml_toolkit.resolve.cas_resolver import resolve_name_to_smiles
        smi = resolve_name_to_smiles(name)
        if smi:
            return _rdkit_canonical(smi)
    except Exception as exc:
        logger.debug("PubChem validation failed for %r: %s", name, exc)

    return None
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
# ---------------------------------------------------------------------------
|
|
367
|
+
# RDKit property helpers
|
|
368
|
+
# ---------------------------------------------------------------------------
|
|
369
|
+
|
|
370
|
+
def _rdkit_properties(smiles: str) -> Dict[str, Any]:
    """Derive formula, molecular weight and exact mass for *smiles*.

    The result always has the keys ``formula``, ``mw`` and ``exact_mass``.
    ``mw`` and ``exact_mass`` are rounded to four decimals.  This is
    best-effort: if RDKit cannot be imported, cannot parse the SMILES, or
    raises part-way through, the values not yet computed stay ``None``.
    """
    result: Dict[str, Any] = dict.fromkeys(("formula", "mw", "exact_mass"))
    try:
        from rdkit import Chem
        from rdkit.Chem import Descriptors, rdMolDescriptors

        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            # Filled in one at a time so an error keeps what was computed.
            result["formula"] = rdMolDescriptors.CalcMolFormula(mol)
            result["mw"] = round(Descriptors.MolWt(mol), 4)
            result["exact_mass"] = round(Descriptors.ExactMolWt(mol), 4)
    except Exception:
        pass
    return result
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
# ---------------------------------------------------------------------------
|
|
392
|
+
# Tool 1: resolve_compound (rich resolver)
|
|
393
|
+
# ---------------------------------------------------------------------------
|
|
394
|
+
|
|
395
|
+
def resolve_compound(query: str, use_network: bool = True) -> Dict[str, Any]:
    """Resolve any chemical identifier to a rich molecule descriptor.

    Resolves the SMILES through ``_resolve_query`` and enriches the result
    with molecular properties computed via RDKit, a ChemScript-generated
    name, metadata from the reagent database and a substituent prefix form.

    Args:
        query: Chemical identifier — common name, IUPAC name, abbreviation,
            condensed formula, or CAS number.  Examples:
            ``"aspirin"``, ``"PhB(OH)2"``, ``"2-chloropyridine"``,
            ``"534-17-8"``, ``"deucravacitinib"``.
        use_network: Allow online lookups.  When false, neither the
            resolution chain nor the prefix-form step is asked to use
            the network.

    Returns:
        Dict with keys:

        - ``ok`` (bool): True on success.
        - ``name`` (str): Input query echoed back.
        - ``smiles`` (str): Canonical SMILES from the resolution chain.
        - ``formula`` (str | None): Molecular formula (e.g. ``"C9H8O4"``).
        - ``mw`` (float | None): Molecular weight.
        - ``exact_mass`` (float | None): Monoisotopic mass.
        - ``iupac_name`` (str | None): Name generated by ChemScript from
          the resolved SMILES.  Not generated when the structure came from
          the reagent DB.
        - ``source`` (str): Which tier resolved the SMILES (``"reagent_db"``,
          ``"formula"``, ``"chemscript"``, ``"opsin"``, ``"pubchem"``).
        - ``role`` (str | None): Reagent role from the curated DB if known
          (e.g. ``"base"``, ``"solvent"``, ``"catalyst"``).
        - ``display_text`` (str | None): Preferred display name from the
          reagent DB, or the IUPAC name if available.
        - ``prefix_form`` (str | None): IUPAC substituent prefix for use in
          ``assemble_name`` (e.g. ``"trifluoromethyl"`` for ``CF3``,
          ``"morpholino"`` for morpholine).  ``None`` if no prefix could be
          determined.

        On failure: ``ok=False`` with an ``error`` key.

    Example::

        >>> resolve_compound("Cs2CO3")
        {'ok': True, 'name': 'Cs2CO3', 'smiles': 'O=C([O-])[O-].[Cs+].[Cs+]',
         'formula': 'CCs2O3', 'mw': 325.82, 'exact_mass': 325.82,
         'iupac_name': None, 'source': 'reagent_db',
         'role': 'base', 'display_text': 'Cs2CO3', 'prefix_form': None}

        >>> resolve_compound("Et3N")
        {'ok': True, 'name': 'Et3N', 'smiles': 'CCN(CC)CC',
         'formula': 'C6H15N', 'mw': 101.19, 'exact_mass': 101.12,
         'iupac_name': None, 'source': 'formula',
         'role': None, 'display_text': None, 'prefix_form': None}

        >>> resolve_compound("CF3")
        {'ok': True, ..., 'prefix_form': 'trifluoromethyl'}

        >>> resolve_compound("morpholine")
        {'ok': True, ..., 'prefix_form': 'morpholino'}
    """
    # --- Step 1: resolve SMILES via the tiered resolution chain ---
    resolved = _resolve_query(query, use_network=use_network)
    if not resolved:
        return {"ok": False, "error": f"Could not resolve '{query}' to a structure."}

    smiles = resolved["smiles"]
    source = resolved["source"]

    # --- Step 2: compute molecular properties via RDKit ---
    props = _rdkit_properties(smiles)

    # --- Step 3: IUPAC name via ChemScript ---
    # Reagent-DB hits are not renamed; every other source gets a
    # ChemScript-generated name when a bridge is available.
    iupac_name: Optional[str] = None
    if source != "reagent_db":
        iupac_name = _smiles_to_name_cs(smiles)

    # --- Step 4: role and display_text from reagent_db ---
    role: Optional[str] = None
    display_text: Optional[str] = None
    try:
        from cdxml_toolkit.resolve.reagent_db import get_reagent_db
        db = get_reagent_db()
        # Try by name first, then by resolved SMILES.  The name is stripped
        # so the key matches the one the resolution chain looked up.
        entry = db.entry_for_name(query.strip().lower())
        if entry is None:
            entry = db.entry_for_smiles(smiles)
        if entry is not None:
            role = entry.get("role")
            display_text = entry.get("display")
    except Exception as exc:
        logger.debug("Reagent DB metadata lookup failed for %r: %s", query, exc)

    # Fall back: display_text from IUPAC name if reagent_db had nothing
    if display_text is None and iupac_name:
        display_text = iupac_name

    # --- Step 5: IUPAC substituent prefix form ---
    # Try the query as given, then the resolved SMILES as a fallback.
    prefix_form: Optional[str] = None
    for candidate in (query, smiles):
        pf_result = get_prefix_form(candidate, use_network=use_network)
        if pf_result.get("ok"):
            prefix_form = pf_result["prefix"]
            break

    return {
        "ok": True,
        "name": query,
        "smiles": smiles,
        "formula": props["formula"],
        "mw": props["mw"],
        "exact_mass": props["exact_mass"],
        "iupac_name": iupac_name,
        "source": source,
        "role": role,
        "display_text": display_text,
        "prefix_form": prefix_form,
    }


# ---------------------------------------------------------------------------
# Tool 1b (legacy lightweight variant): resolve_to_smiles
# ---------------------------------------------------------------------------

def resolve_to_smiles(query: str, use_network: bool = True) -> Dict[str, Any]:
    """Resolve a chemical identifier to its canonical SMILES string.

    Accepts common names, IUPAC names, abbreviations, condensed formulae,
    and CAS numbers.  Runs the same resolution chain as
    :func:`resolve_compound` (``_resolve_query``) but skips the property,
    naming and prefix enrichment, whose results this function never
    returned.

    .. note::
        For richer output (formula, MW, exact mass, role, display text),
        use :func:`resolve_compound` instead.

    Args:
        query: Chemical identifier.  Examples: ``"aspirin"``,
            ``"PhB(OH)2"``, ``"2-chloropyridine"``, ``"534-17-8"``.
        use_network: Allow PubChem lookup (requires internet).

    Returns:
        Dict with keys ``ok``, ``smiles``, ``source``.
        On failure: ``ok=False`` with an ``error`` message.

    Example::

        >>> resolve_to_smiles("Et3N")
        {'ok': True, 'smiles': 'CCN(CC)CC', 'source': 'formula'}
    """
    resolved = _resolve_query(query, use_network=use_network)
    if resolved:
        return {"ok": True, "smiles": resolved["smiles"], "source": resolved["source"]}
    return {"ok": False, "error": f"Could not resolve '{query}' to a structure."}


# ---------------------------------------------------------------------------
# Tool 2: get_prefix_form
# ---------------------------------------------------------------------------

def get_prefix_form(group: str, use_network: bool = True) -> Dict[str, Any]:
    """Get the IUPAC substituent prefix form for a chemical group.

    Given a group name, abbreviation, or formula, returns the prefix
    string suitable for insertion into an IUPAC name.  Three strategies
    are tried in order:

    1. Lookup in ``_PREFIX_TABLE`` (case-insensitive, offline).
    2. Passthrough: if ``<group>benzene`` (with a ``1-`` locant added
       unless the group already starts with a digit) validates as a name,
       the input is already a prefix and is returned unchanged.
    3. Probe: resolve the group to a structure, attach a dummy atom to
       atom 0 and ask ``name_fragment_as_substituent`` for the prefix.

    Args:
        group: Group identifier.  Examples: ``"CF3"``, ``"morpholine"``,
            ``"NO2"``, ``"cyclopropyl"``, ``"OMe"``.
        use_network: Forwarded to the validation and resolution helpers
            used by strategies 2 and 3.  Defaults to ``True``, which is
            what those helpers were previously always called with.

    Returns:
        Dict with keys ``ok``, ``prefix``, ``source``.
        ``source`` is ``"table"`` for lookup hits, ``"probe"`` for
        the fragment probe, or ``"passthrough"`` if the input was already
        a valid prefix form.  On failure: ``ok=False`` with an ``error``
        message (also for a blank or non-string *group*).

    Examples::

        >>> get_prefix_form("CF3")
        {'ok': True, 'prefix': 'trifluoromethyl', 'source': 'table'}
        >>> get_prefix_form("morpholine")
        {'ok': True, 'prefix': 'morpholino', 'source': 'table'}
    """
    clean = group.strip() if isinstance(group, str) else ""
    if not clean:
        return {"ok": False, "error": "Empty group."}

    # --- Table lookup (case-insensitive) ---
    key = clean.lower()
    if key in _PREFIX_TABLE:
        return {"ok": True, "prefix": _PREFIX_TABLE[key], "source": "table"}

    # --- Check if it's already a valid prefix ---
    # If prepending it to "benzene" gives a valid name, it's a prefix.
    test_name = f"1-{clean}benzene" if not clean[0].isdigit() else f"{clean}benzene"
    if _try_validate(test_name, use_network=use_network):
        return {"ok": True, "prefix": clean, "source": "passthrough"}

    # --- Se-probe via name_fragment_as_substituent ---
    # Resolve group to SMILES, add [*] attachment, call the decomposer.
    resolved = _resolve_query(clean, use_network=use_network)
    if resolved:
        from rdkit import Chem
        mol = Chem.MolFromSmiles(resolved["smiles"])
        if mol:
            try:
                # Attach the dummy at atom 0 (the first atom of the
                # canonical SMILES).  This is a heuristic for the bonding
                # position; sanitisation rejects it when atom 0 cannot
                # take another bond.
                edit = Chem.RWMol(mol)
                dummy_idx = edit.AddAtom(Chem.Atom(0))  # [*]
                edit.AddBond(0, dummy_idx, Chem.BondType.SINGLE)
                Chem.SanitizeMol(edit)
                frag_smi = Chem.MolToSmiles(edit.GetMol())
                from .name_decomposer import name_fragment_as_substituent
                prefix = name_fragment_as_substituent(frag_smi, verbose=False)
                if prefix:
                    return {"ok": True, "prefix": prefix, "source": "probe"}
            except Exception as exc:
                logger.debug("Prefix probe failed for %r: %s", clean, exc)

    return {
        "ok": False,
        "error": f"Could not determine prefix form for '{group}'.",
    }
|
|
627
|
+
|
|
628
|
+
|
|
629
|
+
# ---------------------------------------------------------------------------
|
|
630
|
+
# Tool 3: assemble_name
|
|
631
|
+
# ---------------------------------------------------------------------------
|
|
632
|
+
|
|
633
|
+
def assemble_name(parent: str,
                  substituents: List[Dict[str, str]],
                  validate: bool = True,
                  use_network: bool = True) -> Dict[str, Any]:
    """Assemble a substituted name from a parent and a substituent list.

    Substituents sharing the same prefix are merged into one fragment with
    a comma-joined locant list and a multiplier taken from ``_MULTIPLIERS``.
    Fragments are ordered with ``_prefix_alpha_key``, locants with
    ``_locant_sort_key``, and prefixes flagged by ``_is_complex_prefix`` are
    wrapped in parentheses.  Fragments are joined with ``-`` and the parent
    name is appended directly.

    Args:
        parent: Parent ring or chain name (e.g. ``"pyridine"``).
        substituents: List of dicts with ``"locant"`` and ``"prefix"``
            entries, e.g. ``[{"locant": "2", "prefix": "chloro"}]``.
            Entries with an empty prefix are ignored.  ``None`` or an empty
            list yields the bare parent.  Non-string values (for example an
            integer locant) are converted with ``str``.
        validate: If True, pass the assembled name to ``_try_validate`` and
            report the outcome.
        use_network: Forwarded to ``_try_validate``.

    Returns:
        ``{"ok": False, "error": ...}`` when *parent* is empty.  Otherwise
        ``{"ok": True, "name": ...}``, plus ``"valid"`` and ``"smiles"``
        when *validate* is true.  If the parenthesised form does not
        validate but the form without parentheses does, the latter is
        returned instead.
    """
    if not parent:
        return {"ok": False, "error": "Parent name is required."}

    def _text(value: Any) -> str:
        # Tolerate None / int locants instead of crashing on .strip().
        return "" if value is None else str(value).strip()

    # --- Group identical prefixes so they can share one multiplier ---
    groups: Dict[str, List[str]] = {}
    for sub in substituents or []:
        prefix = _text(sub.get("prefix"))
        if prefix:
            groups.setdefault(prefix, []).append(_text(sub.get("locant")))

    if not groups:
        # Bare parent (no usable substituents) is still a valid request.
        if not validate:
            return {"ok": True, "name": parent}
        smi = _try_validate(parent, use_network=use_network)
        if smi:
            return {"ok": True, "name": parent, "valid": True, "smiles": smi}
        return {"ok": True, "name": parent, "valid": False, "smiles": None}

    # --- Sort once; both name variants below reuse this ordering ---
    ordered = []
    for prefix in sorted(groups, key=_prefix_alpha_key):
        locants = sorted(groups[prefix], key=_locant_sort_key)
        locant_str = ",".join(loc for loc in locants if loc)
        ordered.append((prefix, locant_str, len(locants)))

    def _join(parenthesise: bool) -> str:
        """Render the full name, optionally parenthesising complex prefixes."""
        parts = []
        for prefix, locant_str, count in ordered:
            if count > 1:
                # Counts missing from the table: the primary form falls back
                # to the bare number, the alternative form to no multiplier
                # (kept exactly as the two original code paths behaved).
                fallback = str(count) if parenthesise else ""
                mult = _MULTIPLIERS.get(count, fallback)
            else:
                mult = ""
            if parenthesise and _is_complex_prefix(prefix):
                shown = f"({prefix})"
            else:
                shown = prefix
            body = f"{mult}{shown}"
            parts.append(f"{locant_str}-{body}" if locant_str else body)
        return "-".join(parts) + parent

    name = _join(parenthesise=True)
    result: Dict[str, Any] = {"ok": True, "name": name}

    if validate:
        smi = _try_validate(name, use_network=use_network)
        result["valid"] = bool(smi)
        result["smiles"] = smi
        if not smi:
            # Retry without parentheses; some resolvers prefer that form.
            alt_name = _join(parenthesise=False)
            if alt_name != name:
                alt_smi = _try_validate(alt_name, use_network=use_network)
                if alt_smi:
                    result["name"] = alt_name
                    result["valid"] = True
                    result["smiles"] = alt_smi

    return result
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
# ---------------------------------------------------------------------------
|
|
743
|
+
# Tool 4: modify_name
|
|
744
|
+
# ---------------------------------------------------------------------------
|
|
745
|
+
|
|
746
|
+
def modify_name(name: str,
                operation: str,
                target: Optional[str] = None,
                replacement: Optional[str] = None,
                locant: Optional[str] = None,
                validate: bool = True,
                use_network: bool = True) -> Dict[str, Any]:
    """Swap, add, or remove one substituent prefix in a chemical name.

    The work is delegated to one helper per operation:

    - ``"swap"``   -> ``_modify_swap(name, target, replacement, ...)``
    - ``"add"``    -> ``_modify_add(name, replacement, locant, ...)``
    - ``"remove"`` -> ``_modify_remove(name, target, ...)``

    Args:
        name: The name to modify.
        operation: ``"swap"``, ``"add"``, or ``"remove"`` (exact match).
        target: Prefix to replace (swap) or delete (remove).
        replacement: New prefix (swap) or prefix to insert (add).
        locant: Position of the inserted prefix (add only).
        validate: Forwarded to the helper.
        use_network: Forwarded to the helper.

    Returns:
        Whatever the selected helper returns, or
        ``{"ok": False, "error": ...}`` for an unrecognised operation.

    Examples::

        >>> modify_name("4-nitropyridine", "swap",
        ...             target="nitro", replacement="amino")
        {'ok': True, 'name': '4-aminopyridine', ...}
    """
    if operation == "swap":
        return _modify_swap(name, target, replacement, validate, use_network)
    if operation == "add":
        return _modify_add(name, replacement, locant, validate, use_network)
    if operation == "remove":
        return _modify_remove(name, target, validate, use_network)

    message = (f"Unknown operation '{operation}'. "
               "Use 'swap', 'add', or 'remove'.")
    return {"ok": False, "error": message}
|
|
799
|
+
|
|
800
|
+
|
|
801
|
+
def _parse_name_components(name: str) -> Optional[Dict]:
|
|
802
|
+
"""Best-effort parse of a substituted IUPAC name into components.
|
|
803
|
+
|
|
804
|
+
Splits a name like ``"2-chloro-5-(trifluoromethyl)pyridine"`` into::
|
|
805
|
+
|
|
806
|
+
{"parent": "pyridine",
|
|
807
|
+
"substituents": [{"locant": "2", "prefix": "chloro"},
|
|
808
|
+
{"locant": "5", "prefix": "trifluoromethyl"}]}
|
|
809
|
+
|
|
810
|
+
Uses the aligned namer's ring system list for parent detection.
|
|
811
|
+
"""
|
|
812
|
+
try:
|
|
813
|
+
from .aligned_namer import _KNOWN_RINGS
|
|
814
|
+
rings = _KNOWN_RINGS
|
|
815
|
+
except ImportError:
|
|
816
|
+
rings = set()
|
|
817
|
+
|
|
818
|
+
# Also try common chain parents
|
|
819
|
+
chains = [
|
|
820
|
+
"icosane", "nonadecane", "octadecane", "heptadecane", "hexadecane",
|
|
821
|
+
"pentadecane", "tetradecane", "tridecane", "dodecane", "undecane",
|
|
822
|
+
"decane", "nonane", "octane", "heptane", "hexane", "pentane",
|
|
823
|
+
"butane", "propane", "ethane", "methane",
|
|
824
|
+
"icosanoic acid", "nonadecanoic acid", "octadecanoic acid",
|
|
825
|
+
]
|
|
826
|
+
all_parents = sorted(
|
|
827
|
+
list(rings) + chains, key=len, reverse=True
|
|
828
|
+
)
|
|
829
|
+
|
|
830
|
+
# Find the parent: longest known name that matches the tail
|
|
831
|
+
parent = None
|
|
832
|
+
prefix_part = ""
|
|
833
|
+
for p in all_parents:
|
|
834
|
+
if name.endswith(p):
|
|
835
|
+
prefix_part = name[:-len(p)]
|
|
836
|
+
parent = p
|
|
837
|
+
break
|
|
838
|
+
|
|
839
|
+
# Fallback: if no known parent, try splitting at the last segment
|
|
840
|
+
# that doesn't start with a digit
|
|
841
|
+
if parent is None:
|
|
842
|
+
# Try to identify parent as the last non-prefixed segment
|
|
843
|
+
# Pattern: everything after the last "-" that isn't a locant-prefix pair
|
|
844
|
+
parts = name.rsplit("-", 1)
|
|
845
|
+
if len(parts) == 2 and not re.match(r"^\d", parts[1]):
|
|
846
|
+
parent = parts[1]
|
|
847
|
+
prefix_part = parts[0] + "-"
|
|
848
|
+
else:
|
|
849
|
+
parent = name
|
|
850
|
+
prefix_part = ""
|
|
851
|
+
|
|
852
|
+
if not prefix_part.strip("-"):
|
|
853
|
+
return {"parent": parent, "substituents": []}
|
|
854
|
+
|
|
855
|
+
# Parse prefix_part into (locant, prefix) pairs
|
|
856
|
+
prefix_str = prefix_part.rstrip("-")
|
|
857
|
+
substituents = []
|
|
858
|
+
|
|
859
|
+
# Pattern: locant(s)-[multiplier][(]prefix[)] or locant(s)-[multiplier]prefix
|
|
860
|
+
# Walk through segments
|
|
861
|
+
segments = _split_prefix_segments(prefix_str)
|
|
862
|
+
for seg in segments:
|
|
863
|
+
parsed = _parse_single_prefix(seg)
|
|
864
|
+
if parsed:
|
|
865
|
+
substituents.extend(parsed)
|
|
866
|
+
|
|
867
|
+
return {"parent": parent, "substituents": substituents}
|
|
868
|
+
|
|
869
|
+
|
|
870
|
+
def _split_prefix_segments(prefix_str: str) -> List[str]:
|
|
871
|
+
"""Split a prefix string into individual prefix segments.
|
|
872
|
+
|
|
873
|
+
Handles parenthesised prefixes correctly:
|
|
874
|
+
``"2-chloro-3-(trifluoromethyl)"`` → ``["2-chloro", "3-(trifluoromethyl)"]``
|
|
875
|
+
"""
|
|
876
|
+
segments = []
|
|
877
|
+
current = ""
|
|
878
|
+
depth = 0
|
|
879
|
+
for ch in prefix_str:
|
|
880
|
+
if ch == "(":
|
|
881
|
+
depth += 1
|
|
882
|
+
current += ch
|
|
883
|
+
elif ch == ")":
|
|
884
|
+
depth -= 1
|
|
885
|
+
current += ch
|
|
886
|
+
elif ch == "-" and depth == 0:
|
|
887
|
+
if current:
|
|
888
|
+
# Check: is this a separator between segments, or within one?
|
|
889
|
+
# A segment boundary is after a prefix (lowercase letter or ')').
|
|
890
|
+
# Within a segment: after a locant (digit) or multiplier.
|
|
891
|
+
if current and (current[-1].isalpha() and current[-1].islower()
|
|
892
|
+
or current[-1] == ")"):
|
|
893
|
+
segments.append(current)
|
|
894
|
+
current = ""
|
|
895
|
+
else:
|
|
896
|
+
current += ch
|
|
897
|
+
else:
|
|
898
|
+
current += ch
|
|
899
|
+
else:
|
|
900
|
+
current += ch
|
|
901
|
+
if current:
|
|
902
|
+
segments.append(current)
|
|
903
|
+
return segments
|
|
904
|
+
|
|
905
|
+
|
|
906
|
+
def _parse_single_prefix(segment: str) -> Optional[List[Dict[str, str]]]:
|
|
907
|
+
"""Parse a single prefix segment like '2-chloro' or '2,4-dichloro'.
|
|
908
|
+
|
|
909
|
+
Returns list of {"locant": ..., "prefix": ...} dicts.
|
|
910
|
+
"""
|
|
911
|
+
# Handle multiplied: 2,4-dichloro
|
|
912
|
+
m = re.match(
|
|
913
|
+
r"^([\d,]+)-(?:di|tri|tetra|penta|hexa)"
|
|
914
|
+
r"[\(\[]?([a-zA-Z][\w,\-]*?)[\)\]]?$",
|
|
915
|
+
segment,
|
|
916
|
+
)
|
|
917
|
+
if m:
|
|
918
|
+
locants = m.group(1).split(",")
|
|
919
|
+
prefix = m.group(2)
|
|
920
|
+
return [{"locant": loc, "prefix": prefix} for loc in locants]
|
|
921
|
+
|
|
922
|
+
# Handle parenthesised: 3-(trifluoromethyl)
|
|
923
|
+
m = re.match(r"^(\d+\w?)-\((.+)\)$", segment)
|
|
924
|
+
if m:
|
|
925
|
+
return [{"locant": m.group(1), "prefix": m.group(2)}]
|
|
926
|
+
|
|
927
|
+
# Handle simple: 2-chloro
|
|
928
|
+
m = re.match(r"^(\d+\w?)-([a-zA-Z][\w\-]*)$", segment)
|
|
929
|
+
if m:
|
|
930
|
+
return [{"locant": m.group(1), "prefix": m.group(2)}]
|
|
931
|
+
|
|
932
|
+
# No locant: just a prefix (e.g., "amino" without locant)
|
|
933
|
+
if re.match(r"^[a-zA-Z]", segment):
|
|
934
|
+
return [{"locant": "", "prefix": segment}]
|
|
935
|
+
|
|
936
|
+
return None
|
|
937
|
+
|
|
938
|
+
|
|
939
|
+
def _modify_swap(name, target, replacement, validate, use_network):
|
|
940
|
+
"""Swap one prefix for another and re-assemble."""
|
|
941
|
+
if not target or not replacement:
|
|
942
|
+
return {"ok": False, "error": "Both 'target' and 'replacement' required for swap."}
|
|
943
|
+
|
|
944
|
+
parsed = _parse_name_components(name)
|
|
945
|
+
if parsed is None:
|
|
946
|
+
return {"ok": False, "error": f"Could not parse name '{name}'."}
|
|
947
|
+
|
|
948
|
+
subs = parsed["substituents"]
|
|
949
|
+
found = False
|
|
950
|
+
for sub in subs:
|
|
951
|
+
if sub["prefix"] == target:
|
|
952
|
+
sub["prefix"] = replacement
|
|
953
|
+
found = True
|
|
954
|
+
if not found:
|
|
955
|
+
return {
|
|
956
|
+
"ok": False,
|
|
957
|
+
"error": f"Prefix '{target}' not found in '{name}'.",
|
|
958
|
+
"found_prefixes": [s["prefix"] for s in subs],
|
|
959
|
+
}
|
|
960
|
+
|
|
961
|
+
return assemble_name(parsed["parent"], subs, validate=validate,
|
|
962
|
+
use_network=use_network)
|
|
963
|
+
|
|
964
|
+
|
|
965
|
+
def _modify_add(name, prefix, locant, validate, use_network):
|
|
966
|
+
"""Add a new substituent to an existing name."""
|
|
967
|
+
if not prefix:
|
|
968
|
+
return {"ok": False, "error": "'replacement' (prefix to add) is required."}
|
|
969
|
+
if not locant:
|
|
970
|
+
return {"ok": False, "error": "'locant' is required for add operation."}
|
|
971
|
+
|
|
972
|
+
parsed = _parse_name_components(name)
|
|
973
|
+
if parsed is None:
|
|
974
|
+
return {"ok": False, "error": f"Could not parse name '{name}'."}
|
|
975
|
+
|
|
976
|
+
parsed["substituents"].append({"locant": locant, "prefix": prefix})
|
|
977
|
+
return assemble_name(parsed["parent"], parsed["substituents"],
|
|
978
|
+
validate=validate, use_network=use_network)
|
|
979
|
+
|
|
980
|
+
|
|
981
|
+
def _modify_remove(name, target, validate, use_network):
|
|
982
|
+
"""Remove a substituent from a name."""
|
|
983
|
+
if not target:
|
|
984
|
+
return {"ok": False, "error": "'target' prefix is required for remove."}
|
|
985
|
+
|
|
986
|
+
parsed = _parse_name_components(name)
|
|
987
|
+
if parsed is None:
|
|
988
|
+
return {"ok": False, "error": f"Could not parse name '{name}'."}
|
|
989
|
+
|
|
990
|
+
original_len = len(parsed["substituents"])
|
|
991
|
+
parsed["substituents"] = [
|
|
992
|
+
s for s in parsed["substituents"] if s["prefix"] != target
|
|
993
|
+
]
|
|
994
|
+
if len(parsed["substituents"]) == original_len:
|
|
995
|
+
return {
|
|
996
|
+
"ok": False,
|
|
997
|
+
"error": f"Prefix '{target}' not found in '{name}'.",
|
|
998
|
+
"found_prefixes": [s["prefix"] for s in parsed["substituents"]],
|
|
999
|
+
}
|
|
1000
|
+
|
|
1001
|
+
return assemble_name(parsed["parent"], parsed["substituents"],
|
|
1002
|
+
validate=validate, use_network=use_network)
|
|
1003
|
+
|
|
1004
|
+
|
|
1005
|
+
# ---------------------------------------------------------------------------
|
|
1006
|
+
# Tool 5: validate_name
|
|
1007
|
+
# ---------------------------------------------------------------------------
|
|
1008
|
+
|
|
1009
|
+
def validate_name(name: str,
                  use_network: bool = True) -> Dict[str, Any]:
    """Check whether *name* resolves to a structure.

    Resolution is delegated to ``_try_validate``.  On success the SMILES it
    returned is also passed to ``_smiles_to_name_cs`` to obtain a canonical
    name.

    Args:
        name: Name to validate.
        use_network: Forwarded to ``_try_validate``.

    Returns:
        Always ``ok=True``.  When the name resolves: ``valid=True``,
        ``smiles``, ``name`` and ``canonical_name``.  Otherwise
        ``valid=False``, ``smiles=None`` and ``name``.
    """
    smi = _try_validate(name, use_network=use_network)
    if not smi:
        return {"ok": True, "valid": False, "smiles": None, "name": name}

    return {
        "ok": True,
        "valid": True,
        "smiles": smi,
        "name": name,
        "canonical_name": _smiles_to_name_cs(smi),
    }
|
|
1041
|
+
|
|
1042
|
+
|
|
1043
|
+
# ---------------------------------------------------------------------------
|
|
1044
|
+
# Tool 6: name_to_structure
|
|
1045
|
+
# ---------------------------------------------------------------------------
|
|
1046
|
+
|
|
1047
|
+
def name_to_structure(name: str,
                      output_format: str = "cdxml") -> Dict[str, Any]:
    """Convert a chemical name to a structure in the requested format.

    The format is checked first, so an unsupported format is reported
    without any lookup being attempted.

    - ``"smiles"``: the result of ``_try_validate(name)``.
    - ``"cdxml"`` / ``"mol"``: the object returned by ``_get_cs()`` is
      tried first with the name directly.  If that is unavailable or
      raises, the name is resolved to SMILES and converted from there
      (``smiles_to_cdxml`` for CDXML, RDKit 2D coordinates for MOL).

    Args:
        name: Chemical name to convert.
        output_format: ``"cdxml"`` (default), ``"smiles"``, or ``"mol"``;
            case-insensitive.

    Returns:
        ``{"ok": True, <format>: data}`` on success, otherwise
        ``{"ok": False, "error": ...}``.  When CDXML cannot be produced
        but the name did resolve, the error dict also carries ``"smiles"``.
    """
    fmt = output_format.lower()
    if fmt not in ("cdxml", "smiles", "mol"):
        # Reject early: previously this was only reported after the
        # lookups below had already run (and masked by a resolve error).
        return {"ok": False, "error": f"Unknown format '{fmt}'. Use cdxml, smiles, or mol."}

    if fmt == "smiles":
        smi = _try_validate(name)
        if smi:
            return {"ok": True, "smiles": smi}
        return {"ok": False, "error": f"Could not resolve '{name}'."}

    # For CDXML and MOL, prefer the name-based conversion.
    cs = _get_cs()
    if cs is not None:
        try:
            if fmt == "cdxml":
                return {"ok": True, "cdxml": cs.name_to_cdxml(name)}
            mol_data = cs.write_data(name, "mol", source_format="name")
            return {"ok": True, "mol": mol_data}
        except Exception:
            # Deliberate best effort: fall through to the SMILES route.
            pass

    # Fallback: resolve to SMILES, then generate the structure from it.
    smi = _try_validate(name)
    if not smi:
        return {"ok": False, "error": f"Could not resolve '{name}'."}

    if fmt == "cdxml":
        if cs is not None:
            try:
                return {"ok": True, "cdxml": cs.smiles_to_cdxml(smi)}
            except Exception:
                # Deliberate best effort: report the failure below.
                pass
        return {"ok": False,
                "error": "CDXML output requires ChemScript. "
                         f"Name resolved to SMILES: {smi}",
                "smiles": smi}

    # fmt == "mol"
    from rdkit import Chem
    from rdkit.Chem import AllChem
    mol = Chem.MolFromSmiles(smi)
    if mol:
        AllChem.Compute2DCoords(mol)
        return {"ok": True, "mol": Chem.MolToMolBlock(mol)}
    return {"ok": False, "error": "RDKit could not generate MOL block."}
|
|
1118
|
+
|
|
1119
|
+
|
|
1120
|
+
# ---------------------------------------------------------------------------
|
|
1121
|
+
# Tool 7: enumerate_names
|
|
1122
|
+
# ---------------------------------------------------------------------------
|
|
1123
|
+
|
|
1124
|
+
def enumerate_names(identifier: str,
                    use_network: bool = True) -> Dict[str, Any]:
    """List the canonical name and valid alternative names of a molecule.

    *identifier* is first tried as SMILES with RDKit; if that fails it is
    resolved with ``_resolve_query``.  The resulting SMILES is handed to
    ``decompose_name`` (from ``.name_decomposer``), and each name it
    reports is parsed with ``_parse_name_components`` to expose the
    prefixes visible in that form.

    Args:
        identifier: SMILES string or any identifier ``_resolve_query``
            accepts.
        use_network: Forwarded to ``_resolve_query``.

    Returns:
        On success a dict with:

        - ``ok``: True
        - ``canonical_name``: canonical name reported by the decomposer
        - ``smiles``: SMILES used for decomposition
        - ``names``: list of dicts with ``name``, ``valid``, ``strategy``
          and ``prefixes``.  The canonical name comes first, followed by
          the valid, de-duplicated alternatives.

        On failure ``{"ok": False, "error": ...}``, including ``smiles``
        once a structure had been resolved.
    """
    # Resolve to SMILES: direct SMILES parse first, then name resolution.
    from rdkit import Chem as _Chem

    parsed_mol = _Chem.MolFromSmiles(identifier)
    if parsed_mol is not None:
        smiles = _Chem.MolToSmiles(parsed_mol)
    else:
        resolved = _resolve_query(identifier, use_network=use_network)
        if not resolved:
            return {"ok": False,
                    "error": f"Could not resolve '{identifier}' to a structure."}
        smiles = resolved["smiles"]

    def _failure(message: str) -> Dict[str, Any]:
        # Every failure after resolution carries the SMILES along.
        return {"ok": False, "error": message, "smiles": smiles}

    # Run decomposition
    try:
        from .name_decomposer import decompose_name
        result = decompose_name(smiles, verbose=False, timeout=30.0)
    except Exception as exc:
        return _failure(f"Decomposition failed: {exc}")

    if result.errors:
        return _failure("; ".join(result.errors))

    canon = result.canonical_name
    if not canon:
        return _failure("Could not determine canonical name.")

    def _visible_prefixes(text: str) -> List[str]:
        components = _parse_name_components(text)
        if not components:
            return []
        return [entry["prefix"] for entry in components["substituents"]]

    # Canonical form always leads the list.
    names = [{
        "name": canon,
        "valid": True,
        "strategy": "canonical",
        "prefixes": _visible_prefixes(canon),
    }]

    # Then each valid alternative, skipping repeats.
    seen = {canon}
    for alt in result.alternatives:
        if not alt.valid or alt.name in seen:
            continue
        seen.add(alt.name)
        names.append({
            "name": alt.name,
            "valid": True,
            "strategy": alt.strategy,
            "prefixes": _visible_prefixes(alt.name),
        })

    return {
        "ok": True,
        "canonical_name": canon,
        "smiles": smiles,
        "names": names,
    }
|
|
1234
|
+
|
|
1235
|
+
|
|
1236
|
+
# ---------------------------------------------------------------------------
|
|
1237
|
+
# Layer 3: Graph manipulation — reaction templates
|
|
1238
|
+
# ---------------------------------------------------------------------------
|
|
1239
|
+
|
|
1240
|
+
# Hand-curated templates for common med-chem transformations that an LLM
|
|
1241
|
+
# will recognise by name. These supplement the larger collection loaded
|
|
1242
|
+
# from reactions_datamol.json.
|
|
1243
|
+
_CLASSIC_TEMPLATES: Dict[str, Dict[str, Any]] = {
|
|
1244
|
+
"suzuki_coupling": {
|
|
1245
|
+
"description": "Suzuki coupling: aryl halide + boronic acid to biaryl",
|
|
1246
|
+
"smarts": "[c:1][Br,I].[#6:2][B]([OH])[OH]>>[c:1]-[#6:2]",
|
|
1247
|
+
"n_reactants": 2,
|
|
1248
|
+
"substrate_hint": "aryl bromide or iodide",
|
|
1249
|
+
"reagent_hint": "boronic acid",
|
|
1250
|
+
"conditions": ["Pd(dppf)Cl2", "K2CO3", "dioxane/H2O", "80 °C"],
|
|
1251
|
+
"category": "coupling",
|
|
1252
|
+
},
|
|
1253
|
+
"buchwald_amination": {
|
|
1254
|
+
"description":
|
|
1255
|
+
"Buchwald-Hartwig amination: aryl halide + amine to aryl amine",
|
|
1256
|
+
"smarts": "[c:1][Cl,Br,I].[NX3;H2,H1:2]>>[c:1]-[N:2]",
|
|
1257
|
+
"n_reactants": 2,
|
|
1258
|
+
"substrate_hint": "aryl halide",
|
|
1259
|
+
"reagent_hint": "primary or secondary amine",
|
|
1260
|
+
"conditions": ["Pd2(dba)3", "XPhos", "Cs2CO3", "toluene", "100 °C"],
|
|
1261
|
+
"category": "coupling",
|
|
1262
|
+
},
|
|
1263
|
+
"snar": {
|
|
1264
|
+
"description":
|
|
1265
|
+
"Nucleophilic aromatic substitution: activated aryl halide + "
|
|
1266
|
+
"nucleophile",
|
|
1267
|
+
"smarts": "[c:1][F,Cl].[NX3;H2,H1:2]>>[c:1]-[N:2]",
|
|
1268
|
+
"n_reactants": 2,
|
|
1269
|
+
"substrate_hint": "electron-poor aryl fluoride or chloride",
|
|
1270
|
+
"reagent_hint": "amine nucleophile",
|
|
1271
|
+
"conditions": ["DIPEA", "DMSO or NMP", "80-120 °C"],
|
|
1272
|
+
"category": "coupling",
|
|
1273
|
+
},
|
|
1274
|
+
"amide_coupling": {
|
|
1275
|
+
"description": "Amide bond formation: carboxylic acid + amine",
|
|
1276
|
+
"smarts":
|
|
1277
|
+
"[C:1](=[O:2])[OH].[NX3;H2,H1:3]>>[C:1](=[O:2])-[N:3]",
|
|
1278
|
+
"n_reactants": 2,
|
|
1279
|
+
"substrate_hint": "carboxylic acid",
|
|
1280
|
+
"reagent_hint": "primary or secondary amine",
|
|
1281
|
+
"conditions": ["HATU", "DIPEA", "DMF", "rt"],
|
|
1282
|
+
"category": "coupling",
|
|
1283
|
+
},
|
|
1284
|
+
"reductive_amination": {
|
|
1285
|
+
"description":
|
|
1286
|
+
"Reductive amination: aldehyde or ketone + amine to amine",
|
|
1287
|
+
"smarts": "[C:1](=[O:2]).[NX3;H2,H1:3]>>[C:1]-[N:3]",
|
|
1288
|
+
"n_reactants": 2,
|
|
1289
|
+
"substrate_hint": "aldehyde or ketone",
|
|
1290
|
+
"reagent_hint": "primary or secondary amine",
|
|
1291
|
+
"conditions": ["NaBH(OAc)3", "AcOH", "DCE", "rt"],
|
|
1292
|
+
"category": "functional_group",
|
|
1293
|
+
},
|
|
1294
|
+
"nitro_reduction": {
|
|
1295
|
+
"description": "Nitro group reduction to amine (ArNO2 to ArNH2)",
|
|
1296
|
+
"smarts": "[c:1][N+](=[O])[O-]>>[c:1]N",
|
|
1297
|
+
"n_reactants": 1,
|
|
1298
|
+
"substrate_hint": "aromatic nitro compound",
|
|
1299
|
+
"reagent_hint": None,
|
|
1300
|
+
"conditions": ["SnCl2·2H2O", "EtOH", "80 °C"],
|
|
1301
|
+
"category": "functional_group",
|
|
1302
|
+
},
|
|
1303
|
+
"ester_hydrolysis": {
|
|
1304
|
+
"description": "Ester hydrolysis to carboxylic acid",
|
|
1305
|
+
"smarts":
|
|
1306
|
+
"[C:1](=[O:2])[O:3][C:4]>>[C:1](=[O:2])[OH]",
|
|
1307
|
+
"n_reactants": 1,
|
|
1308
|
+
"substrate_hint": "ester",
|
|
1309
|
+
"reagent_hint": None,
|
|
1310
|
+
"conditions": ["LiOH", "THF/H2O", "rt"],
|
|
1311
|
+
"category": "functional_group",
|
|
1312
|
+
},
|
|
1313
|
+
"n_alkylation": {
|
|
1314
|
+
"description": "N-Alkylation: amine + alkyl halide",
|
|
1315
|
+
"smarts":
|
|
1316
|
+
"[NX3;H2,H1:1].[C:2][Cl,Br,I]>>[N:1]-[C:2]",
|
|
1317
|
+
"n_reactants": 2,
|
|
1318
|
+
"substrate_hint": "amine",
|
|
1319
|
+
"reagent_hint": "alkyl halide",
|
|
1320
|
+
"conditions": ["K2CO3", "DMF", "60 °C"],
|
|
1321
|
+
"category": "coupling",
|
|
1322
|
+
},
|
|
1323
|
+
"sonogashira_coupling": {
|
|
1324
|
+
"description":
|
|
1325
|
+
"Sonogashira coupling: aryl halide + terminal alkyne",
|
|
1326
|
+
"smarts":
|
|
1327
|
+
"[c:1][Br,I].[CH:2]#[C:3]>>[c:1]-[C:2]#[C:3]",
|
|
1328
|
+
"n_reactants": 2,
|
|
1329
|
+
"substrate_hint": "aryl bromide or iodide",
|
|
1330
|
+
"reagent_hint": "terminal alkyne",
|
|
1331
|
+
"conditions": [
|
|
1332
|
+
"PdCl2(PPh3)2", "CuI", "Et3N", "THF", "rt",
|
|
1333
|
+
],
|
|
1334
|
+
"category": "coupling",
|
|
1335
|
+
},
|
|
1336
|
+
"heck_reaction": {
|
|
1337
|
+
"description":
|
|
1338
|
+
"Heck reaction: aryl halide + alkene to substituted alkene",
|
|
1339
|
+
"smarts":
|
|
1340
|
+
"[c:1][Br,I].[CH:2]=[CH2:3]>>[c:1]/[CH:2]=[CH2:3]",
|
|
1341
|
+
"n_reactants": 2,
|
|
1342
|
+
"substrate_hint": "aryl halide",
|
|
1343
|
+
"reagent_hint": "terminal alkene",
|
|
1344
|
+
"conditions": [
|
|
1345
|
+
"Pd(OAc)2", "P(o-tol)3", "Et3N", "DMF", "100 °C",
|
|
1346
|
+
],
|
|
1347
|
+
"category": "coupling",
|
|
1348
|
+
},
|
|
1349
|
+
"alcohol_oxidation": {
|
|
1350
|
+
"description": "Alcohol oxidation to aldehyde or ketone",
|
|
1351
|
+
"smarts":
|
|
1352
|
+
"[C:1][OH:2]>>[C:1]=[O:2]",
|
|
1353
|
+
"n_reactants": 1,
|
|
1354
|
+
"substrate_hint": "primary or secondary alcohol",
|
|
1355
|
+
"reagent_hint": None,
|
|
1356
|
+
"conditions": ["Dess-Martin periodinane", "DCM", "rt"],
|
|
1357
|
+
"category": "functional_group",
|
|
1358
|
+
},
|
|
1359
|
+
"grignard_addition": {
|
|
1360
|
+
"description":
|
|
1361
|
+
"Grignard / organometallic addition to aldehyde or ketone",
|
|
1362
|
+
"smarts":
|
|
1363
|
+
"[C:1](=[O:2])[#6:4].[#6:3][Mg]>>[C:1]([OH:2])([#6:4])-[#6:3]",
|
|
1364
|
+
"n_reactants": 2,
|
|
1365
|
+
"substrate_hint": "aldehyde or ketone",
|
|
1366
|
+
"reagent_hint": "Grignard reagent (RMgBr SMILES)",
|
|
1367
|
+
"conditions": ["THF", "-78 °C to rt"],
|
|
1368
|
+
"category": "functional_group",
|
|
1369
|
+
},
|
|
1370
|
+
# --- Hartenfeller-Schneider extras (not in datamol) ---
|
|
1371
|
+
"amide_hydrolysis": {
|
|
1372
|
+
"description":
|
|
1373
|
+
"Amide hydrolysis to carboxylic acid (RCONHR' \u2192 RCOOH)",
|
|
1374
|
+
"smarts": "[C:1](=[O:2])[NX3:3]>>[C:1](=[O:2])[OH]",
|
|
1375
|
+
"n_reactants": 1,
|
|
1376
|
+
"substrate_hint": "amide (primary, secondary, or tertiary)",
|
|
1377
|
+
"reagent_hint": None,
|
|
1378
|
+
"conditions": ["6M HCl or 2M NaOH", "reflux"],
|
|
1379
|
+
"category": "functional_group",
|
|
1380
|
+
},
|
|
1381
|
+
"wittig": {
|
|
1382
|
+
"description":
|
|
1383
|
+
"Wittig olefination: aldehyde/ketone + alkyl halide to alkene",
|
|
1384
|
+
"smarts":
|
|
1385
|
+
"[#6:3]-[C;H1,$([CH0](-[#6])[#6]);!$(CC=O):1]=[OD1]"
|
|
1386
|
+
".[Cl,Br,I][C;H2;$(C-[#6]);!$(CC[I,Br]);!$(CCO[CH3]):2]"
|
|
1387
|
+
">>[C:3][C:1]=[C:2]",
|
|
1388
|
+
"n_reactants": 2,
|
|
1389
|
+
"substrate_hint": "aldehyde or ketone",
|
|
1390
|
+
"reagent_hint": "alkyl halide (ylide precursor)",
|
|
1391
|
+
"conditions": ["PPh3", "n-BuLi", "THF", "0 \u00b0C to rt"],
|
|
1392
|
+
"category": "functional_group",
|
|
1393
|
+
},
|
|
1394
|
+
"niementowski_quinazoline": {
|
|
1395
|
+
"description":
|
|
1396
|
+
"Niementowski quinazoline: anthranilic acid + amide "
|
|
1397
|
+
"\u2192 4-quinazolinone",
|
|
1398
|
+
"smarts":
|
|
1399
|
+
"[c:1](-[C;$(C-c1ccccc1):2](=[OD1:3])-[OH1])"
|
|
1400
|
+
":[c:4](-[NH2:5])"
|
|
1401
|
+
".[N;!H0;!$(N-N);!$(N-C=N);!$(N(-C=O)-C=O):6]"
|
|
1402
|
+
"-[C;H1,$(C-[#6]):7]=[OD1]"
|
|
1403
|
+
">>[c:4]2:[c:1]-[C:2](=[O:3])-[N:6]-[C:7]=[N:5]-2",
|
|
1404
|
+
"n_reactants": 2,
|
|
1405
|
+
"substrate_hint": "anthranilic acid derivative",
|
|
1406
|
+
"reagent_hint": "amide or formamide",
|
|
1407
|
+
"conditions": ["neat or AcOH", "120\u2013150 \u00b0C"],
|
|
1408
|
+
"category": "heterocycle_formation",
|
|
1409
|
+
},
|
|
1410
|
+
"grignard_carbonyl": {
|
|
1411
|
+
"description":
|
|
1412
|
+
"Grignard on nitrile: nitrile + aryl/alkyl halide \u2192 ketone",
|
|
1413
|
+
"smarts":
|
|
1414
|
+
"[#6:1][C:2]#[#7;D1]"
|
|
1415
|
+
".[Cl,Br,I][#6;$([#6]~[#6]);"
|
|
1416
|
+
"!$([#6]([Cl,Br,I])[Cl,Br,I]);!$([#6]=O):3]"
|
|
1417
|
+
">>[#6:1][C:2](=O)[#6:3]",
|
|
1418
|
+
"n_reactants": 2,
|
|
1419
|
+
"substrate_hint": "nitrile (R\u2212C\u2261N)",
|
|
1420
|
+
"reagent_hint": "aryl or alkyl halide (Grignard precursor)",
|
|
1421
|
+
"conditions": ["Mg", "THF", "then H3O+"],
|
|
1422
|
+
"category": "functional_group",
|
|
1423
|
+
},
|
|
1424
|
+
# --- Deprotection templates (SMARTS from RDKit rdDeprotect source) ---
|
|
1425
|
+
"cbz_deprotection": {
|
|
1426
|
+
"description": "Remove Cbz (carbobenzyloxy) from amine",
|
|
1427
|
+
"smarts":
|
|
1428
|
+
"[NX3;H0,H1:1][C;R0](=O)[O;R0][C;R0]"
|
|
1429
|
+
"c1[c;H1][c;H1][c;H1][c;H1][c;H1]1>>[N:1]",
|
|
1430
|
+
"n_reactants": 1,
|
|
1431
|
+
"substrate_hint": "Cbz-protected amine",
|
|
1432
|
+
"reagent_hint": None,
|
|
1433
|
+
"conditions": ["H2", "Pd/C", "MeOH", "rt"],
|
|
1434
|
+
"category": "deprotection",
|
|
1435
|
+
},
|
|
1436
|
+
"fmoc_deprotection": {
|
|
1437
|
+
"description": "Remove Fmoc (9-fluorenylmethyloxycarbonyl) from amine",
|
|
1438
|
+
"smarts":
|
|
1439
|
+
"[NX3;H0,H1:1][#6](=O)-[#8]-[#6]-[#6]-1"
|
|
1440
|
+
"-c2ccccc2-c2ccccc-12>>[N:1]",
|
|
1441
|
+
"n_reactants": 1,
|
|
1442
|
+
"substrate_hint": "Fmoc-protected amine",
|
|
1443
|
+
"reagent_hint": None,
|
|
1444
|
+
"conditions": ["piperidine", "DMF", "rt"],
|
|
1445
|
+
"category": "deprotection",
|
|
1446
|
+
},
|
|
1447
|
+
"tbs_deprotection": {
|
|
1448
|
+
"description": "Remove TBS (tert-butyldimethylsilyl) from alcohol",
|
|
1449
|
+
"smarts": "CC(C)([Si](C)(C)[O;H0:1])C>>[O;H1:1]",
|
|
1450
|
+
"n_reactants": 1,
|
|
1451
|
+
"substrate_hint": "TBS-protected alcohol",
|
|
1452
|
+
"reagent_hint": None,
|
|
1453
|
+
"conditions": ["TBAF", "THF", "rt"],
|
|
1454
|
+
"category": "deprotection",
|
|
1455
|
+
},
|
|
1456
|
+
"bn_deprotection_o": {
|
|
1457
|
+
"description": "Remove benzyl (Bn) from alcohol",
|
|
1458
|
+
"smarts":
|
|
1459
|
+
"[O;!$(*C(=O)):1][CH2]"
|
|
1460
|
+
"c1[c;H1][c;H1][c;H1][c;H1][c;H1]1>>[O;H1:1]",
|
|
1461
|
+
"n_reactants": 1,
|
|
1462
|
+
"substrate_hint": "Bn-protected alcohol",
|
|
1463
|
+
"reagent_hint": None,
|
|
1464
|
+
"conditions": ["H2", "Pd/C", "EtOAc", "rt"],
|
|
1465
|
+
"category": "deprotection",
|
|
1466
|
+
},
|
|
1467
|
+
"bn_deprotection_n": {
|
|
1468
|
+
"description": "Remove benzyl (Bn) from amine",
|
|
1469
|
+
"smarts":
|
|
1470
|
+
"[NX3;H0,H1;!$(NC=O):1][C;H2]"
|
|
1471
|
+
"c1[c;H1][c;H1][c;H1][c;H1][c;H1]1>>[N:1]",
|
|
1472
|
+
"n_reactants": 1,
|
|
1473
|
+
"substrate_hint": "Bn-protected amine",
|
|
1474
|
+
"reagent_hint": None,
|
|
1475
|
+
"conditions": ["H2", "Pd/C", "MeOH", "rt"],
|
|
1476
|
+
"category": "deprotection",
|
|
1477
|
+
},
|
|
1478
|
+
"ac_deprotection_o": {
|
|
1479
|
+
"description": "Remove acetyl (Ac) from alcohol",
|
|
1480
|
+
"smarts": "[O;R0:1][C;R0](=O)[C;H3]>>[O:1]",
|
|
1481
|
+
"n_reactants": 1,
|
|
1482
|
+
"substrate_hint": "Ac-protected alcohol",
|
|
1483
|
+
"reagent_hint": None,
|
|
1484
|
+
"conditions": ["K2CO3", "MeOH", "rt"],
|
|
1485
|
+
"category": "deprotection",
|
|
1486
|
+
},
|
|
1487
|
+
"ac_deprotection_n": {
|
|
1488
|
+
"description": "Remove acetyl (Ac) from amine",
|
|
1489
|
+
"smarts": "[NX3;H0,H1:1][C;R0](=O)[C;H3]>>[N:1]",
|
|
1490
|
+
"n_reactants": 1,
|
|
1491
|
+
"substrate_hint": "Ac-protected amine",
|
|
1492
|
+
"reagent_hint": None,
|
|
1493
|
+
"conditions": ["6M HCl", "reflux"],
|
|
1494
|
+
"category": "deprotection",
|
|
1495
|
+
},
|
|
1496
|
+
"pmb_deprotection": {
|
|
1497
|
+
"description": "Remove PMB (para-methoxybenzyl) from alcohol",
|
|
1498
|
+
"smarts":
|
|
1499
|
+
"[c;H1]1[c;H1]c(O[C;H3])[c;H1][c;H1]c1"
|
|
1500
|
+
"[C;H2][O;D2&R0:1]>>[O;H1:1]",
|
|
1501
|
+
"n_reactants": 1,
|
|
1502
|
+
"substrate_hint": "PMB-protected alcohol",
|
|
1503
|
+
"reagent_hint": None,
|
|
1504
|
+
"conditions": ["DDQ", "DCM/H2O", "rt"],
|
|
1505
|
+
"category": "deprotection",
|
|
1506
|
+
},
|
|
1507
|
+
"ts_deprotection": {
|
|
1508
|
+
"description": "Remove tosyl (Ts) from amine",
|
|
1509
|
+
"smarts":
|
|
1510
|
+
"[C;H3]c1[c;H1][c;H1]c(S(=O)(=O)"
|
|
1511
|
+
"[NX3;H0,H1;!$(NC=O):1])[c;H1][c;H1]1>>[N:1]",
|
|
1512
|
+
"n_reactants": 1,
|
|
1513
|
+
"substrate_hint": "Ts-protected amine",
|
|
1514
|
+
"reagent_hint": None,
|
|
1515
|
+
"conditions": ["Mg", "MeOH", "sonication"],
|
|
1516
|
+
"category": "deprotection",
|
|
1517
|
+
},
|
|
1518
|
+
"tfa_deprotection": {
|
|
1519
|
+
"description": "Remove trifluoroacetyl (TFA) from amine",
|
|
1520
|
+
"smarts": "[N;H0,H1:1]C(=O)C(F)(F)F>>[N:1]",
|
|
1521
|
+
"n_reactants": 1,
|
|
1522
|
+
"substrate_hint": "TFA-protected amine",
|
|
1523
|
+
"reagent_hint": None,
|
|
1524
|
+
"conditions": ["K2CO3", "MeOH/H2O", "rt"],
|
|
1525
|
+
"category": "deprotection",
|
|
1526
|
+
},
|
|
1527
|
+
# --- Protection templates (reversed deprotection SMARTS, unimolecular) ---
|
|
1528
|
+
"cbz_protection": {
|
|
1529
|
+
"description": "Add Cbz (carbobenzyloxy) to amine",
|
|
1530
|
+
"smarts": "[NX3;H1,H2:1]>>[N:1]C(=O)OCc1ccccc1",
|
|
1531
|
+
"n_reactants": 1,
|
|
1532
|
+
"substrate_hint": "free amine",
|
|
1533
|
+
"reagent_hint": None,
|
|
1534
|
+
"conditions": ["CbzCl", "NaOH", "dioxane/H2O", "0 \u00b0C"],
|
|
1535
|
+
"category": "protection",
|
|
1536
|
+
},
|
|
1537
|
+
"fmoc_protection": {
|
|
1538
|
+
"description": "Add Fmoc (9-fluorenylmethyloxycarbonyl) to amine",
|
|
1539
|
+
"smarts":
|
|
1540
|
+
"[NX3;H1,H2:1]>>[N:1]C(=O)OCC1c2ccccc2-c2ccccc21",
|
|
1541
|
+
"n_reactants": 1,
|
|
1542
|
+
"substrate_hint": "free amine",
|
|
1543
|
+
"reagent_hint": None,
|
|
1544
|
+
"conditions": ["Fmoc-OSu", "NaHCO3", "dioxane/H2O", "rt"],
|
|
1545
|
+
"category": "protection",
|
|
1546
|
+
},
|
|
1547
|
+
"tbs_protection": {
|
|
1548
|
+
"description": "Add TBS (tert-butyldimethylsilyl) to alcohol",
|
|
1549
|
+
"smarts": "[O;H1:1]>>[O:1][Si](C)(C)C(C)(C)C",
|
|
1550
|
+
"n_reactants": 1,
|
|
1551
|
+
"substrate_hint": "free alcohol",
|
|
1552
|
+
"reagent_hint": None,
|
|
1553
|
+
"conditions": ["TBSCl", "imidazole", "DMF", "rt"],
|
|
1554
|
+
"category": "protection",
|
|
1555
|
+
},
|
|
1556
|
+
"bn_protection_o": {
|
|
1557
|
+
"description": "Add benzyl (Bn) to alcohol",
|
|
1558
|
+
"smarts": "[O;H1:1]>>[O:1]Cc1ccccc1",
|
|
1559
|
+
"n_reactants": 1,
|
|
1560
|
+
"substrate_hint": "free alcohol",
|
|
1561
|
+
"reagent_hint": None,
|
|
1562
|
+
"conditions": ["BnBr", "NaH", "DMF", "0 \u00b0C"],
|
|
1563
|
+
"category": "protection",
|
|
1564
|
+
},
|
|
1565
|
+
"bn_protection_n": {
|
|
1566
|
+
"description": "Add benzyl (Bn) to amine",
|
|
1567
|
+
"smarts": "[NX3;H1,H2;!$(NC=O):1]>>[N:1]Cc1ccccc1",
|
|
1568
|
+
"n_reactants": 1,
|
|
1569
|
+
"substrate_hint": "free amine",
|
|
1570
|
+
"reagent_hint": None,
|
|
1571
|
+
"conditions": ["BnBr", "K2CO3", "DMF", "60 \u00b0C"],
|
|
1572
|
+
"category": "protection",
|
|
1573
|
+
},
|
|
1574
|
+
"ac_protection_o": {
|
|
1575
|
+
"description": "Add acetyl (Ac) to alcohol",
|
|
1576
|
+
"smarts": "[O;H1:1]>>[O:1]C(C)=O",
|
|
1577
|
+
"n_reactants": 1,
|
|
1578
|
+
"substrate_hint": "free alcohol",
|
|
1579
|
+
"reagent_hint": None,
|
|
1580
|
+
"conditions": ["Ac2O", "pyridine", "rt"],
|
|
1581
|
+
"category": "protection",
|
|
1582
|
+
},
|
|
1583
|
+
"ac_protection_n": {
|
|
1584
|
+
"description": "Add acetyl (Ac) to amine",
|
|
1585
|
+
"smarts": "[NX3;H1,H2:1]>>[N:1]C(C)=O",
|
|
1586
|
+
"n_reactants": 1,
|
|
1587
|
+
"substrate_hint": "free amine",
|
|
1588
|
+
"reagent_hint": None,
|
|
1589
|
+
"conditions": ["Ac2O", "Et3N", "DCM", "rt"],
|
|
1590
|
+
"category": "protection",
|
|
1591
|
+
},
|
|
1592
|
+
"pmb_protection": {
|
|
1593
|
+
"description": "Add PMB (para-methoxybenzyl) to alcohol",
|
|
1594
|
+
"smarts": "[O;H1:1]>>[O:1]Cc1ccc(OC)cc1",
|
|
1595
|
+
"n_reactants": 1,
|
|
1596
|
+
"substrate_hint": "free alcohol",
|
|
1597
|
+
"reagent_hint": None,
|
|
1598
|
+
"conditions": ["PMBCl", "NaH", "DMF", "0 \u00b0C"],
|
|
1599
|
+
"category": "protection",
|
|
1600
|
+
},
|
|
1601
|
+
"ts_protection": {
|
|
1602
|
+
"description": "Add tosyl (Ts) to amine",
|
|
1603
|
+
"smarts":
|
|
1604
|
+
"[NX3;H1,H2;!$(NC=O):1]>>[N:1]S(=O)(=O)c1ccc(C)cc1",
|
|
1605
|
+
"n_reactants": 1,
|
|
1606
|
+
"substrate_hint": "free amine",
|
|
1607
|
+
"reagent_hint": None,
|
|
1608
|
+
"conditions": ["TsCl", "Et3N", "DCM", "0 \u00b0C"],
|
|
1609
|
+
"category": "protection",
|
|
1610
|
+
},
|
|
1611
|
+
}
|
|
1612
|
+
|
|
1613
|
+
|
|
1614
|
+
# ---------------------------------------------------------------------------
|
|
1615
|
+
# Dynamic loading of reaction templates from datamol
|
|
1616
|
+
# ---------------------------------------------------------------------------
|
|
1617
|
+
|
|
1618
|
+
_datamol_cache: Optional[Dict[str, Dict[str, Any]]] = None


def _category_from_tags(tags: set) -> str:
    """Map a set of datamol tags onto one of the registry's category names.

    The rules are tried in a fixed order and the first match wins:
    ring-forming tags, then amide tags, then protecting-group tags, then
    generic bond-formation / arylation tags.  Anything that matches none of
    them is classed as ``"functional_group"``.

    Args:
        tags: Tag strings attached to one datamol reaction entry.

    Returns:
        The category name for the entry.
    """
    ring_tags = {"heterocycle formation", "cyclization", "ring formation"}
    amide_tags = {"amide coupling", "amide"}
    # Datamol itself writes "protecting group"; the other two spellings are
    # accepted as well.
    pg_tags = {"protecting group", "protection", "deprotection"}
    bond_tags = {
        "C-C bond formation", "C-N bond formation",
        "C-O bond formation", "C-S bond formation",
        "N-arylation", "O-arylation", "S-arylation",
    }

    if tags & ring_tags:
        return "heterocycle_formation"
    if tags & amide_tags:
        return "coupling"
    if tags & pg_tags:
        lowered = [tag.lower() for tag in tags]
        # "deprotect" has to be tested first because any tag containing it
        # also contains "protect".
        for needle, category in (("deprotect", "deprotection"),
                                 ("protect", "protection")):
            if any(needle in tag for tag in lowered):
                return category
        # NOTE(review): every member of pg_tags contains "protect", so this
        # fallback looks unreachable — confirm the intended behaviour.
        return "protecting_group"
    if tags & bond_tags:
        return "coupling"
    return "functional_group"
|
|
1641
|
+
|
|
1642
|
+
|
|
1643
|
+
def _count_top_level_reactants(syn_smarts: str) -> int:
    """Count the reactant fragments on the left of a reaction SMARTS.

    Fragments are separated by ``.``; a dot nested inside a ``[...]`` atom
    expression is not a separator.  A string without ``>>`` is treated as
    having an empty reactant side, which yields 1.
    """
    reactant_part, arrow, _ = syn_smarts.partition(">>")
    if not arrow:
        return 1
    count = 1
    depth = 0
    for ch in reactant_part:
        if ch == "[":
            depth += 1
        elif ch == "]":
            depth -= 1
        elif ch == "." and depth == 0:
            count += 1
    return count


def _load_datamol_templates() -> Dict[str, Dict[str, Any]]:
    """Load all reaction templates from the bundled datamol JSON file.

    Reads ``reactions_datamol.json`` from the directory of this module and
    converts every entry that has a non-empty ``syn_smarts`` into the
    registry's template format (snake_case name, ``smarts``,
    ``n_reactants``, hints, ``category``, ``tags``, ``source``).  The
    result is cached in the module-level ``_datamol_cache``, so the file is
    read at most once per process.

    A missing, unreadable or malformed file is not fatal: a warning is
    logged and an empty registry is cached, so the hand-written templates
    stay usable.

    Returns:
        Dict mapping template name to template dict (possibly empty).
    """
    global _datamol_cache
    if _datamol_cache is not None:
        return _datamol_cache

    json_path = os.path.join(os.path.dirname(__file__), "reactions_datamol.json")
    # EAFP: open directly instead of exists()-then-open, and treat a broken
    # file the same way as a missing one.
    try:
        with open(json_path, encoding="utf-8") as fh:
            raw = json.load(fh)
    except FileNotFoundError:
        logger.warning("reactions_datamol.json not found — datamol "
                       "templates unavailable")
        _datamol_cache = {}
        return _datamol_cache
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("reactions_datamol.json could not be read (%s) — "
                       "datamol templates unavailable", exc)
        _datamol_cache = {}
        return _datamol_cache

    if not isinstance(raw, dict):
        logger.warning("reactions_datamol.json has an unexpected top-level "
                       "type (%s) — datamol templates unavailable",
                       type(raw).__name__)
        _datamol_cache = {}
        return _datamol_cache

    templates: Dict[str, Dict[str, Any]] = {}

    for key, entry in raw.items():
        if not isinstance(entry, dict):
            continue
        syn_smarts = entry.get("syn_smarts", "")
        if not syn_smarts:
            continue

        tags = set(entry.get("tags") or [])
        classes = entry.get("rhs_classes") or []

        # JSON keys are kebab-case; the registry uses snake_case names.
        tname = key.replace("-", "_")
        n_reactants = _count_top_level_reactants(syn_smarts)

        templates[tname] = {
            # Fall back when the description is absent *or* empty/null.
            "description": (entry.get("description")
                            or entry.get("long_name")
                            or key),
            "long_name": entry.get("long_name", ""),
            "smarts": syn_smarts,
            "n_reactants": n_reactants,
            "substrate_hint": ", ".join(classes),
            "reagent_hint": (", ".join(classes[1:])
                             if n_reactants > 1 and len(classes) > 1
                             else None),
            "conditions": [],  # literature conditions vary
            "category": _category_from_tags(tags),
            "tags": list(tags),
            "source": "datamol",
        }

    _datamol_cache = templates
    logger.debug("Loaded %d templates from datamol", len(templates))
    return _datamol_cache
|
|
1713
|
+
|
|
1714
|
+
|
|
1715
|
+
# Merged registry: classic hand-written + all datamol templates
_merged_templates: Optional[Dict[str, Dict[str, Any]]] = None


def _get_reaction_templates() -> Dict[str, Dict[str, Any]]:
    """Return the combined template registry, building it on first use.

    The datamol templates form the base layer and the hand-written
    ``_CLASSIC_TEMPLATES`` are laid on top, so a classic template wins
    whenever both sources define the same name.  The merged dict is stored
    in ``_merged_templates`` and handed back unchanged on later calls.
    """
    global _merged_templates
    if _merged_templates is None:
        # Later unpacking overrides earlier: classic beats datamol.
        _merged_templates = {**_load_datamol_templates(), **_CLASSIC_TEMPLATES}
    return _merged_templates
|
|
1736
|
+
|
|
1737
|
+
|
|
1738
|
+
# ---------------------------------------------------------------------------
|
|
1739
|
+
# Tool 7: list_reactions
|
|
1740
|
+
# ---------------------------------------------------------------------------
|
|
1741
|
+
|
|
1742
|
+
def list_reactions(category: Optional[str] = None) -> Dict[str, Any]:
    """List available named reaction templates.

    Returns a summary of each reaction: name, description, number of
    reactants required, and typical conditions.  Use this to find the
    right template before calling ``apply_reaction``.

    Args:
        category: Optional filter, compared for exact equality with each
            template's category, e.g. ``"coupling"``,
            ``"functional_group"``, ``"heterocycle_formation"``,
            ``"protection"`` or ``"deprotection"``.  The ``categories``
            entry of the result lists every category actually present.
            If *None* (or empty), all templates are returned.

    Returns:
        Dict with ``ok``, ``reactions`` (list of summaries), and
        ``categories`` (sorted list of all category names in the registry,
        regardless of the filter).

    Example::

        >>> result = list_reactions()
        >>> for r in result["reactions"]:
        ...     print(r["name"], "—", r["description"])
        suzuki_coupling — Suzuki coupling: aryl halide + boronic acid to biaryl
        ...
        >>> result = list_reactions(category="heterocycle_formation")
    """
    templates = _get_reaction_templates()
    rxns = []
    cats = set()
    for name, tmpl in templates.items():
        cat = tmpl.get("category", "other")
        # Collected before filtering so the caller always sees every category.
        cats.add(cat)
        if category and cat != category:
            continue
        rxns.append({
            "name": name,
            "description": tmpl["description"],
            "n_reactants": tmpl["n_reactants"],
            "substrate_hint": tmpl["substrate_hint"],
            "reagent_hint": tmpl.get("reagent_hint"),
            # Copy: the registry is cached, so handing out its own list
            # would let a caller's mutation leak into later results.
            "conditions": list(tmpl.get("conditions") or []),
            "category": cat,
        })
    return {"ok": True, "reactions": rxns, "categories": sorted(cats)}
|
|
1785
|
+
|
|
1786
|
+
|
|
1787
|
+
# ---------------------------------------------------------------------------
|
|
1788
|
+
# Tool 8: apply_reaction
|
|
1789
|
+
# ---------------------------------------------------------------------------
|
|
1790
|
+
|
|
1791
|
+
def _apply_reaction_suggestions(reaction_name: str, names) -> List[str]:
    """Return up to five template names that loosely resemble *reaction_name*.

    Substring matches (either direction) rank ahead of names that merely
    share an underscore-separated word; ties are ordered alphabetically.
    """
    query = reaction_name.lower().replace("-", "_").replace(" ", "_")
    query_words = set(query.split("_"))
    scored = []
    for candidate in names:
        cand = candidate.lower()
        if query in cand or cand in query:
            scored.append((0, candidate))
        elif query_words & set(cand.split("_")):
            scored.append((1, candidate))
    scored.sort()
    return [name for _, name in scored[:5]]


def _apply_reaction_parse_mol(text: str):
    """Parse *text* as SMILES, falling back to ``_resolve_query``.

    Returns an RDKit ``Mol`` or ``None`` when neither route succeeds.
    """
    from rdkit import Chem

    mol = Chem.MolFromSmiles(text)
    if mol is None:
        # Maybe it's a name — try resolving
        resolved = _resolve_query(text)
        if resolved:
            mol = Chem.MolFromSmiles(resolved["smiles"])
    return mol


def apply_reaction(reaction_name: str,
                   substrate: str,
                   reagent: Optional[str] = None) -> Dict[str, Any]:
    """Apply a named reaction template to transform a substrate.

    Takes a substrate SMILES (and optionally a reagent SMILES for
    bimolecular reactions) and returns the product(s).  For bimolecular
    templates the reactants are retried in swapped order when the given
    order yields nothing.

    Args:
        reaction_name: Template name from ``list_reactions()``.  Matched
            exactly first, then case-insensitively.
        substrate: SMILES of the main substrate.  Text that does not parse
            as SMILES is passed to ``_resolve_query``.
        reagent: SMILES of the coupling partner (for 2-reactant rxns).
            Can also be a chemical name or abbreviation.

    Returns:
        On success, a dict with ``ok`` (True), ``products`` (list of
        unique product dicts with ``smiles`` and ``name`` keys),
        ``conditions`` and ``reaction`` (the name as passed in).  On
        failure, a dict with ``ok`` (False) and ``error``; nothing is
        raised for unknown names, unparsable inputs or failed reactions.

    Example::

        >>> apply_reaction("nitro_reduction", "c1ccc([N+](=O)[O-])cc1")
        {'ok': True,
         'products': [{'smiles': 'Nc1ccccc1', 'name': 'aniline'}],
         'conditions': ['SnCl2·2H2O', 'EtOH', '80 °C'],
         'reaction': 'nitro_reduction'}
    """
    from rdkit import Chem
    from rdkit.Chem import AllChem

    # Look up template
    templates = _get_reaction_templates()
    tmpl = templates.get(reaction_name)
    # Case-insensitive fallback
    if tmpl is None:
        lower_map = {k.lower(): k for k in templates}
        real_key = lower_map.get(reaction_name.lower())
        if real_key:
            tmpl = templates[real_key]
    if tmpl is None:
        suggestions = _apply_reaction_suggestions(reaction_name, templates)
        if suggestions:
            hint = f"Did you mean: {', '.join(suggestions)}?"
        else:
            # Show a sample of available reactions
            all_names = sorted(templates.keys())
            hint = f"Some available reactions: {', '.join(all_names[:15])}... ({len(all_names)} total)"
        return {
            "ok": False,
            "error": f"Unknown reaction '{reaction_name}'. {hint}",
        }

    # Parse reaction SMARTS
    try:
        rxn = AllChem.ReactionFromSmarts(tmpl["smarts"])
    except Exception as exc:
        return {"ok": False, "error": f"Invalid reaction SMARTS: {exc}"}

    # Parse substrate
    sub_mol = _apply_reaction_parse_mol(substrate)
    if sub_mol is None:
        return {"ok": False, "error": f"Could not parse substrate '{substrate}'."}

    # Handle reagent for bimolecular reactions
    is_bimolecular = tmpl["n_reactants"] == 2
    if is_bimolecular:
        if not reagent:
            # `or` rather than a .get() default: the key is usually present
            # with value None, which would otherwise print "Expected: None."
            expected = tmpl.get("reagent_hint") or "coupling partner"
            return {
                "ok": False,
                "error": f"Reaction '{reaction_name}' requires a reagent. "
                         f"Expected: {expected}.",
            }
        rea_mol = _apply_reaction_parse_mol(reagent)
        if rea_mol is None:
            return {"ok": False,
                    "error": f"Could not parse reagent '{reagent}'."}
        reactants = (sub_mol, rea_mol)
    else:
        reactants = (sub_mol,)

    # Run reaction
    try:
        product_sets = rxn.RunReactants(reactants)
    except Exception as exc:
        return {"ok": False, "error": f"Reaction failed: {exc}"}

    if not product_sets:
        # Try swapped reactant order for bimolecular reactions
        if is_bimolecular:
            try:
                product_sets = rxn.RunReactants((reactants[1], reactants[0]))
            except Exception:
                # Best effort only; the "no products" error below covers it.
                pass
        if not product_sets:
            return {
                "ok": False,
                "error": "No products formed. Check that the substrate "
                         f"matches: {tmpl['substrate_hint']}.",
            }

    # Collect unique products
    seen = set()
    products = []
    for prod_tuple in product_sets:
        for prod in prod_tuple:
            try:
                Chem.SanitizeMol(prod)
                smi = Chem.MolToSmiles(prod)
                if smi not in seen:
                    seen.add(smi)
                    name = _smiles_to_name_cs(smi)
                    products.append({"smiles": smi, "name": name})
            except Exception:
                # Skip products that cannot be sanitised or named.
                continue

    if not products:
        return {"ok": False, "error": "Products could not be sanitised."}

    return {
        "ok": True,
        "products": products,
        # Copy so callers cannot mutate the cached registry entry.
        "conditions": list(tmpl.get("conditions") or []),
        "reaction": reaction_name,
    }
|
|
1938
|
+
|
|
1939
|
+
|
|
1940
|
+
# ---------------------------------------------------------------------------
|
|
1941
|
+
# Tool 9: deprotect
|
|
1942
|
+
# ---------------------------------------------------------------------------
|
|
1943
|
+
|
|
1944
|
+
def _detect_deprotection_templates(mol) -> List[str]:
    """List the unimolecular deprotection templates that react with *mol*.

    :func:`deprotect` uses the result to decide between delegating to
    :func:`apply_reaction` (exactly one hit) and falling back to RDKit's
    ``rdDeprotect`` library.

    Args:
        mol: RDKit ``Mol`` object.

    Returns:
        Names of the templates in the merged registry whose category is
        ``"deprotection"``, that take one reactant, and whose reaction
        produces at least one product set for *mol*.  Templates that raise
        while being parsed or run are skipped.
    """
    from rdkit.Chem import AllChem

    def _fires(template) -> bool:
        # Any failure (missing key, bad SMARTS, RDKit error) counts as
        # "does not fire".
        try:
            reaction = AllChem.ReactionFromSmarts(template["smarts"])
            return bool(reaction.RunReactants((mol,)))
        except Exception:
            return False

    return [
        template_name
        for template_name, template in _get_reaction_templates().items()
        if template.get("category") == "deprotection"
        and template.get("n_reactants", 1) == 1
        and _fires(template)
    ]
|
|
1974
|
+
|
|
1975
|
+
|
|
1976
|
+
# Map from apply_reaction template name to PG abbreviation used in the
# ``removed`` list that callers expect.  Keeps the public return format
# stable even when template names are refactored.
#
# ``deprotect`` looks entries up with ``.get(tname, tname)``, so a template
# that is missing here is reported under its raw template name instead.
# The O- and N- variants of Bn and Ac deliberately share one abbreviation.
# NOTE(review): "BOC_deprotection" is not among the hand-written templates
# visible in this part of the file — presumably it comes from the datamol
# JSON; confirm the exact key (including case).
_TEMPLATE_TO_PG_ABBREV: Dict[str, str] = {
    "BOC_deprotection": "Boc",
    "cbz_deprotection": "Cbz",
    "fmoc_deprotection": "Fmoc",
    "tbs_deprotection": "TBS",
    "bn_deprotection_o": "Bn",
    "bn_deprotection_n": "Bn",
    "ac_deprotection_o": "Ac",
    "ac_deprotection_n": "Ac",
    "pmb_deprotection": "PMB",
    "ts_deprotection": "Ts",
    "tfa_deprotection": "TFA",
}
|
|
1992
|
+
|
|
1993
|
+
|
|
1994
|
+
def _deprotect_entry_fires(deprotection, mol, original_smi: str) -> bool:
    """Tell whether one ``rdDeprotect`` entry changes *mol*.

    Returns True when the entry's reaction yields at least one product that
    sanitises and whose SMILES differs from *original_smi*.  Any error
    while parsing or running the entry counts as "does not fire", so one
    bad entry cannot hide the others.
    """
    from rdkit import Chem
    from rdkit.Chem import AllChem

    try:
        rxn = AllChem.ReactionFromSmarts(deprotection.reaction_smarts)
        product_sets = rxn.RunReactants((mol,))
    except Exception:
        return False

    for product_tuple in product_sets:
        for product in product_tuple:
            try:
                Chem.SanitizeMol(product)
                if Chem.MolToSmiles(product) != original_smi:
                    return True
            except Exception:
                continue
    return False


def deprotect(smiles: str) -> Dict[str, Any]:
    """Remove common protecting groups from a molecule.

    For substrates carrying a **single recognisable protecting group**,
    this function delegates to :func:`apply_reaction` using the
    appropriate named template (e.g. ``"BOC_deprotection"``,
    ``"fmoc_deprotection"``).  This keeps the deprotection logic
    centralised in the reaction-template registry and makes the
    single-PG path available to agents via :func:`apply_reaction`
    directly.

    For substrates with **multiple protecting groups**, or when the PG is
    not covered by the named-template registry, the function falls back to
    RDKit's built-in ``rdDeprotect`` library (25+ templates covering Boc,
    Fmoc, Cbz, TBS, THP, Bn, Ac, PMB, Tr, and more).  If that library
    cannot be imported, the named templates that fired are applied one
    after another instead.

    The return format is identical in all cases, so existing callers are
    unaffected.

    Args:
        smiles: SMILES of the protected molecule.  Text that does not
            parse as SMILES is passed to ``_resolve_query``.

    Returns:
        On success, a dict with ``ok`` (True), ``product_smiles``,
        ``product_name`` and ``removed`` (list of protecting group
        abbreviations removed); when nothing changed, ``removed`` is empty
        and a ``note`` is added.  On failure, a dict with ``ok`` (False)
        and ``error``.

    Example::

        >>> deprotect("O=C(OC(C)(C)C)Nc1ccccc1")  # Boc-aniline
        {'ok': True, 'product_smiles': 'Nc1ccccc1', 'product_name': 'aniline',
         'removed': ['Boc']}

    Note:
        Single-PG deprotections can also be called directly via
        :func:`apply_reaction`, e.g.
        ``apply_reaction("BOC_deprotection", smiles)``.  Use that form
        when you know the specific protecting group in advance.
    """
    from rdkit import Chem

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        resolved = _resolve_query(smiles)
        if resolved:
            mol = Chem.MolFromSmiles(resolved["smiles"])
    if mol is None:
        return {"ok": False, "error": f"Could not parse '{smiles}'."}

    original_smi = Chem.MolToSmiles(mol)

    # --- Fast path: exactly one named template fires → delegate to apply_reaction ---
    # NOTE(review): only the first product is used; if the same group occurs
    # more than once on the substrate this presumably removes a single
    # occurrence — confirm whether that is intended.
    fired = _detect_deprotection_templates(mol)
    if len(fired) == 1:
        tname = fired[0]
        ar_result = apply_reaction(tname, original_smi)
        if ar_result.get("ok") and ar_result.get("products"):
            product_smi = ar_result["products"][0]["smiles"]
            pg_abbrev = _TEMPLATE_TO_PG_ABBREV.get(tname, tname)
            return {
                "ok": True,
                "product_smiles": product_smi,
                "product_name": _smiles_to_name_cs(product_smi),
                "removed": [pg_abbrev],
            }
        # apply_reaction unexpectedly failed — fall through to rdDeprotect

    # --- Fallback: rdDeprotect handles multiple PGs or unrecognised ones ---
    try:
        from rdkit.Chem import rdDeprotect
        result = rdDeprotect.Deprotect(mol)
    except ImportError:
        if fired:
            # rdDeprotect unavailable but we know which PG(s) to remove —
            # run apply_reaction for each in sequence
            current_smi = original_smi
            removed_abbrevs: List[str] = []
            for tname in fired:
                ar = apply_reaction(tname, current_smi)
                if ar.get("ok") and ar.get("products"):
                    current_smi = ar["products"][0]["smiles"]
                    removed_abbrevs.append(_TEMPLATE_TO_PG_ABBREV.get(tname, tname))
            if removed_abbrevs:
                return {
                    "ok": True,
                    "product_smiles": current_smi,
                    "product_name": _smiles_to_name_cs(current_smi),
                    "removed": removed_abbrevs,
                }
        return {
            "ok": False,
            "error": (
                "rdDeprotect not available in this RDKit build. "
                "Use apply_reaction() with a specific template name "
                "(e.g. 'BOC_deprotection') for single-PG removal."
            ),
        }
    except Exception as exc:
        return {"ok": False, "error": f"Deprotection failed: {exc}"}

    product_smi = Chem.MolToSmiles(result)

    if product_smi == original_smi:
        return {
            "ok": True,
            "product_smiles": product_smi,
            "product_name": _smiles_to_name_cs(product_smi),
            "removed": [],
            "note": "No protecting groups detected.",
        }

    # Identify which PGs were removed by checking each rdDeprotect template
    # against the original molecule.  Each entry is judged on its own, so a
    # previous entry with the same abbreviation cannot short-circuit it.
    removed: List[str] = []
    try:
        for d in rdDeprotect.GetDeprotections():
            if _deprotect_entry_fires(d, mol, original_smi):
                removed.append(d.abbreviation)
    except Exception:
        # Best effort: the product is already known, so a failure here only
        # leaves ``removed`` incomplete rather than failing the call.
        pass

    name = _smiles_to_name_cs(product_smi)
    return {
        "ok": True,
        "product_smiles": product_smi,
        "product_name": name,
        "removed": removed,
    }
|
|
2138
|
+
|
|
2139
|
+
|
|
2140
|
+
# ---------------------------------------------------------------------------
|
|
2141
|
+
# Tool 10: draw_molecule
|
|
2142
|
+
# ---------------------------------------------------------------------------
|
|
2143
|
+
|
|
2144
|
+
def draw_molecule(
    mol_json: Dict[str, Any],
    output_path: Optional[str] = None,
) -> Dict[str, Any]:
    """Render a single molecule to a standalone CDXML document.

    Takes a molecule dict (as returned by ``resolve_compound`` or any dict
    containing at minimum a ``smiles`` field) and generates a CDXML string
    suitable for opening directly in ChemDraw.  No arrow, no reaction scheme —
    just the structure, centred on a page.

    An optional text label (compound name or custom label) is placed below the
    structure when the input dict contains a ``label``, ``name``, or
    ``iupac_name`` field (checked in that priority order).

    Args:
        mol_json: Dict with at minimum ``"smiles"``.  Optional display keys:
            ``"label"`` (used verbatim), ``"name"``, ``"iupac_name"``.
            Any other fields are ignored.  A value without a ``get`` method
            (for example ``None``) is reported as a missing ``smiles`` field
            rather than raising.
        output_path: If given, the CDXML string is also written to this file
            path (UTF-8).

    Returns:
        Dict with keys:

        - ``ok``: bool
        - ``cdxml``: CDXML document string (on success)
        - ``output_path``: echoed path if *output_path* was specified
        - ``error``: error message (when ``ok=False``)

    Example::

        >>> result = draw_molecule({"smiles": "CC(=O)Oc1ccccc1C(=O)O",
        ...                         "name": "aspirin"})
        >>> result["ok"]
        True
        >>> result["cdxml"].startswith("<?xml")
        True
    """
    # --- Validate input ---
    # EAFP: anything that is not dict-like is treated the same as a dict
    # without a SMILES, so the caller always receives the error dict.
    try:
        smiles = mol_json.get("smiles")
    except AttributeError:
        smiles = None
    if not smiles:
        return {"ok": False, "error": "mol_json must contain a 'smiles' field."}

    # --- Resolve display label (priority: label > name > iupac_name) ---
    label: Optional[str] = (
        mol_json.get("label")
        or mol_json.get("name")
        or mol_json.get("iupac_name")
    )

    # --- Import renderer internals (lazy — avoids import-time cost) ---
    try:
        from cdxml_toolkit.render.renderer import (
            _IDGen,
            _smiles_to_fragment_data,
            _build_fragment,
            _build_text_element,
            _fragment_bbox,
            _bbox_center,
            _shift_atoms,
        )
    except ImportError as exc:
        return {"ok": False, "error": f"Renderer not available: {exc}"}

    from cdxml_toolkit.constants import (
        CDXML_FOOTER,
        CDXML_HEADER,
        ACS_LABEL_FONT,
        ACS_LABEL_SIZE,
        ACS_LABEL_FACE,
        ACS_CAPTION_SIZE,
        ACS_HASH_SPACING,
        ACS_MARGIN_WIDTH,
        ACS_LINE_WIDTH,
        ACS_BOLD_WIDTH,
        ACS_BOND_LENGTH_STR,
        ACS_BOND_SPACING,
        ACS_CHAIN_ANGLE_STR,
    )

    # --- Generate 2D coordinates ---
    CENTER_X, CENTER_Y = 200.0, 200.0

    result = _smiles_to_fragment_data(smiles, CENTER_X, CENTER_Y)
    if result is None:
        return {
            "ok": False,
            "error": f"Could not generate 2D coordinates for SMILES: {smiles!r}",
        }

    atoms, bonds = result

    # Re-centre the structure at the desired origin
    bbox = _fragment_bbox(atoms)
    cx, cy = _bbox_center(bbox)
    _shift_atoms(atoms, CENTER_X - cx, CENTER_Y - cy)
    # Recompute the box after the shift so the label is positioned against
    # the moved structure.
    bbox = _fragment_bbox(atoms)

    # --- Build XML ---
    ids = _IDGen(1000)
    frag_xml, _, _ = _build_fragment(atoms, bonds, ids)

    xml_parts = [frag_xml]

    # --- Optional label below the structure ---
    if label:
        label_y = bbox[3] + 14.0  # 14 pt below the structure bottom
        lbl_xml, _ = _build_text_element(
            [label], CENTER_X, label_y, ids,
            justification="Center", use_formatting=False,
        )
        xml_parts.append(lbl_xml)

    inner_xml = "\n".join(xml_parts)

    # --- Wrap in CDXML document ---
    page_id = ids.next()

    header = CDXML_HEADER.format(
        bbox="0 0 1620 2160",
        label_font=ACS_LABEL_FONT,
        label_size=ACS_LABEL_SIZE,
        label_face=ACS_LABEL_FACE,
        caption_size=ACS_CAPTION_SIZE,
        hash_spacing=ACS_HASH_SPACING,
        margin_width=ACS_MARGIN_WIDTH,
        line_width=ACS_LINE_WIDTH,
        bold_width=ACS_BOLD_WIDTH,
        bond_length=ACS_BOND_LENGTH_STR,
        bond_spacing=ACS_BOND_SPACING,
        chain_angle=ACS_CHAIN_ANGLE_STR,
    )

    # Only the first fragment interpolates a value; the others are plain
    # literals joined by implicit concatenation.
    page_open = (
        f'<page id="{page_id}" BoundingBox="0 0 1620 2160" '
        'HeaderPosition="36" FooterPosition="36" '
        'PrintTrimMarks="yes" HeightPages="3" WidthPages="3">'
    )

    cdxml = "\n".join([header, page_open, inner_xml, "</page>", CDXML_FOOTER])

    # --- Write to file if requested ---
    ret: Dict[str, Any] = {"ok": True, "cdxml": cdxml}
    if output_path:
        try:
            with open(output_path, "w", encoding="utf-8") as fh:
                fh.write(cdxml)
            ret["output_path"] = output_path
        except OSError as exc:
            # A failed write is reported as an overall failure, without the
            # generated document.
            return {"ok": False, "error": f"Failed to write '{output_path}': {exc}"}

    return ret
|
|
2297
|
+
|
|
2298
|
+
|
|
2299
|
+
# ---------------------------------------------------------------------------
|
|
2300
|
+
# Tool 11: modify_molecule
|
|
2301
|
+
# ---------------------------------------------------------------------------
|
|
2302
|
+
|
|
2303
|
+
def _compute_formula(smiles: str) -> Optional[str]:
|
|
2304
|
+
"""Get molecular formula string from SMILES using RDKit."""
|
|
2305
|
+
try:
|
|
2306
|
+
from rdkit import Chem
|
|
2307
|
+
from rdkit.Chem import rdMolDescriptors
|
|
2308
|
+
mol = Chem.MolFromSmiles(smiles)
|
|
2309
|
+
if mol is None:
|
|
2310
|
+
return None
|
|
2311
|
+
return rdMolDescriptors.CalcMolFormula(mol)
|
|
2312
|
+
except Exception:
|
|
2313
|
+
return None
|
|
2314
|
+
|
|
2315
|
+
|
|
2316
|
+
def _compute_mw(smiles: str) -> Optional[float]:
|
|
2317
|
+
"""Get exact molecular weight (monoisotopic) from SMILES using RDKit."""
|
|
2318
|
+
try:
|
|
2319
|
+
from rdkit import Chem
|
|
2320
|
+
from rdkit.Chem import Descriptors
|
|
2321
|
+
mol = Chem.MolFromSmiles(smiles)
|
|
2322
|
+
if mol is None:
|
|
2323
|
+
return None
|
|
2324
|
+
return round(Descriptors.ExactMolWt(mol), 4)
|
|
2325
|
+
except Exception:
|
|
2326
|
+
return None
|
|
2327
|
+
|
|
2328
|
+
|
|
2329
|
+
def _parse_formula_counts(formula: str) -> Dict[str, int]:
|
|
2330
|
+
"""Parse a molecular formula string into element counts.
|
|
2331
|
+
|
|
2332
|
+
Handles simple formulas like ``C26H26N8O3``. Returns a dict mapping
|
|
2333
|
+
element symbol to count.
|
|
2334
|
+
"""
|
|
2335
|
+
counts: Dict[str, int] = {}
|
|
2336
|
+
for sym, n in re.findall(r"([A-Z][a-z]?)(\d*)", formula):
|
|
2337
|
+
if sym:
|
|
2338
|
+
counts[sym] = counts.get(sym, 0) + (int(n) if n else 1)
|
|
2339
|
+
return counts
|
|
2340
|
+
|
|
2341
|
+
|
|
2342
|
+
def _delta_formula(formula_in: str, formula_out: str) -> str:
    """Describe the element-wise change from *formula_in* to *formula_out*.

    Both formulas are parsed with ``_parse_formula_counts``.  Elements are
    visited in sorted symbol order; gains are concatenated after a ``+`` and
    losses after a ``-``, with a count of 1 written as the bare symbol.

    Example: ``C20H20`` to ``C26H26`` gives ``+C6H6``.
    Returns a string like ``"+C6H4, -D3"`` or ``"(no change)"``.
    """
    before = _parse_formula_counts(formula_in)
    after = _parse_formula_counts(formula_out)

    def _term(symbol: str, amount: int) -> str:
        # A single atom is written without a trailing "1".
        return symbol if amount == 1 else f"{symbol}{amount}"

    gained: List[str] = []
    lost: List[str] = []
    for symbol in sorted(set(before) | set(after)):
        change = after.get(symbol, 0) - before.get(symbol, 0)
        if change > 0:
            gained.append(_term(symbol, change))
        elif change < 0:
            lost.append(_term(symbol, -change))

    segments = []
    if gained:
        segments.append("+" + "".join(gained))
    if lost:
        segments.append("-" + "".join(lost))
    if not segments:
        return "(no change)"
    return ", ".join(segments)
|
|
2370
|
+
|
|
2371
|
+
|
|
2372
|
+
def _build_mol_diff(input_smiles: str, output_smiles: str) -> Dict[str, Any]:
    """Assemble the ``diff`` sub-dict comparing two molecules.

    The structural part (added / removed / changed fragment names and the MCS
    SMARTS) is best effort: any exception raised while computing it is
    swallowed and the corresponding fields keep their empty defaults.  The
    formula and MW deltas are computed separately afterwards.
    """
    diff: Dict[str, Any] = {
        "atoms_added": [],
        "atoms_removed": [],
        "atoms_changed": [],
        "mcs_smarts": None,
        "delta_formula": None,
        "delta_mw": None,
    }

    try:
        from rdkit import Chem
        from rdkit.Chem import rdFMCS
        from cdxml_toolkit.naming.aligned_namer import molecular_diff

        mol_diff = molecular_diff(input_smiles, output_smiles)

        # The MCS SMARTS is only attempted when molecular_diff did not have
        # to use its fallback path.
        if not mol_diff.fallback_used:
            try:
                start_mol = Chem.MolFromSmiles(input_smiles)
                end_mol = Chem.MolFromSmiles(output_smiles)
                if start_mol and end_mol:
                    common = rdFMCS.FindMCS(
                        [start_mol, end_mol],
                        threshold=1.0,
                        ringMatchesRingOnly=True,
                        completeRingsOnly=True,
                        atomCompare=rdFMCS.AtomCompare.CompareElements,
                        bondCompare=rdFMCS.BondCompare.CompareOrder,
                        timeout=5,
                    )
                    # Ignore timed-out searches and trivially small overlaps.
                    if not common.canceled and common.numAtoms >= 3:
                        diff["mcs_smarts"] = common.smartsString
            except Exception:
                pass

        for change in mol_diff.changes:
            kind = change.change_type
            if kind == "addition":
                diff["atoms_added"].append(change.prod_name)
            elif kind == "removal":
                diff["atoms_removed"].append(change.sm_name)
            elif kind == "replace":
                diff["atoms_changed"].append(
                    {"from": change.sm_name, "to": change.prod_name}
                )
    except Exception:
        pass

    # Formula and MW delta (always computed — does not need MCS)
    formula_before = _compute_formula(input_smiles)
    formula_after = _compute_formula(output_smiles)
    mw_before = _compute_mw(input_smiles)
    mw_after = _compute_mw(output_smiles)

    if formula_before and formula_after:
        diff["delta_formula"] = _delta_formula(formula_before, formula_after)
    if mw_before is not None and mw_after is not None:
        diff["delta_mw"] = round(mw_after - mw_before, 4)

    return diff
|
|
2433
|
+
|
|
2434
|
+
|
|
2435
|
+
def _build_aligned_names(input_smiles: str, output_smiles: str) -> str:
    """Produce a name-level comparison string for two SMILES.

    Preferred form is ``"X \u2192 Y\\n changes: ..."``, built from the aligned
    names found by ``find_aligned_names``.  If that path raises or yields no
    name pair, the result falls back to ``"name1 \u2192 name2"`` using
    ``_smiles_to_name_cs``, or an empty string when either name is missing.
    """
    try:
        from cdxml_toolkit.naming.aligned_namer import (
            find_aligned_names, format_name_diff,
        )

        aligned = find_aligned_names(input_smiles, output_smiles)
        if aligned.best_sm_name and aligned.best_prod_name:
            summary = format_name_diff(
                aligned.best_sm_name, aligned.best_prod_name
            )
            headline = f"{aligned.best_sm_name} \u2192 {aligned.best_prod_name}"
            return headline + f"\n changes: {summary}"
    except Exception:
        # Best effort: fall through to the plain-name comparison below.
        pass

    name_before = _smiles_to_name_cs(input_smiles) or ""
    name_after = _smiles_to_name_cs(output_smiles) or ""
    if not (name_before and name_after):
        return ""
    return f"{name_before} \u2192 {name_after}"
|
|
2460
|
+
|
|
2461
|
+
|
|
2462
|
+
def modify_molecule(mol_json: Dict[str, Any],
|
|
2463
|
+
operation: str,
|
|
2464
|
+
**kwargs: Any) -> Dict[str, Any]:
|
|
2465
|
+
"""Modify a molecule and verify the change with a structural diff.
|
|
2466
|
+
|
|
2467
|
+
This is the molecular editor for LLM orchestration. It takes a
|
|
2468
|
+
molecule (as a dict with at least a ``smiles`` key), applies an
|
|
2469
|
+
operation, and returns the modified molecule with a structural diff
|
|
2470
|
+
so the LLM can verify the change happened as intended.
|
|
2471
|
+
|
|
2472
|
+
Parameters
|
|
2473
|
+
----------
|
|
2474
|
+
mol_json : dict
|
|
2475
|
+
Source molecule dict. Must contain ``smiles`` (canonical SMILES).
|
|
2476
|
+
May also contain ``name`` or ``iupac_name`` for display.
|
|
2477
|
+
operation : str
|
|
2478
|
+
One of:
|
|
2479
|
+
|
|
2480
|
+
- ``"analyze"`` — inspect the molecule without modifying it.
|
|
2481
|
+
Returns functional groups, alternative IUPAC names, bracket
|
|
2482
|
+
tree, prefix form, formula, and MW. No additional kwargs.
|
|
2483
|
+
|
|
2484
|
+
- ``"name_surgery"`` — modify via IUPAC name manipulation.
|
|
2485
|
+
Additional kwargs:
|
|
2486
|
+
|
|
2487
|
+
- ``add``: list of ``{"locant": str, "prefix": str}`` dicts
|
|
2488
|
+
- ``remove``: list of prefix strings to remove
|
|
2489
|
+
|
|
2490
|
+
- ``"smarts"`` — apply a SMARTS reaction transform.
|
|
2491
|
+
Additional kwargs:
|
|
2492
|
+
|
|
2493
|
+
- ``smarts``: reaction SMARTS string, e.g. ``"[c:1][F]>>[c:1][Cl]"``
|
|
2494
|
+
- ``reaction_name``: name from ``list_reactions()`` (alternative)
|
|
2495
|
+
|
|
2496
|
+
- ``"set_smiles"`` — accept new SMILES from the LLM.
|
|
2497
|
+
Additional kwargs:
|
|
2498
|
+
|
|
2499
|
+
- ``new_smiles``: str (validated with RDKit)
|
|
2500
|
+
- ``description``: str (optional, for context)
|
|
2501
|
+
|
|
2502
|
+
- ``"reaction"`` — apply a named reaction template (calls
|
|
2503
|
+
``apply_reaction()`` internally). Additional kwargs:
|
|
2504
|
+
|
|
2505
|
+
- ``reaction_name``: str (required) — template from ``list_reactions()``
|
|
2506
|
+
- ``reagent``: dict with ``smiles`` key (for binary reactions)
|
|
2507
|
+
|
|
2508
|
+
Returns
|
|
2509
|
+
-------
|
|
2510
|
+
dict
|
|
2511
|
+
For ``"analyze"`` operation:
|
|
2512
|
+
|
|
2513
|
+
- ``ok``: bool
|
|
2514
|
+
- ``input_smiles``: canonical SMILES of input
|
|
2515
|
+
- ``canonical_name``: IUPAC name (from ChemScript, or empty)
|
|
2516
|
+
- ``alternative_names``: list of alternative IUPAC names (round-trip
|
|
2517
|
+
validated) showing different parent/substituent perspectives
|
|
2518
|
+
- ``functional_groups``: list of functional group names present
|
|
2519
|
+
(e.g. ``["aryl chloride", "pyridine", "amide"]``)
|
|
2520
|
+
- ``prefix_form``: IUPAC prefix if this could be a substituent, or
|
|
2521
|
+
``None``
|
|
2522
|
+
- ``bracket_tree``: the canonical IUPAC name with its bracket
|
|
2523
|
+
hierarchy preserved (same as ``canonical_name``); the caller can
|
|
2524
|
+
parse parenthesised groups to see substituents at each depth
|
|
2525
|
+
- ``formula``: molecular formula string
|
|
2526
|
+
- ``mw``: exact monoisotopic MW (float)
|
|
2527
|
+
|
|
2528
|
+
For modification operations (``"name_surgery"``, ``"smarts"``,
|
|
2529
|
+
``"set_smiles"``):
|
|
2530
|
+
|
|
2531
|
+
- ``ok``: bool
|
|
2532
|
+
- ``input_smiles``: canonical SMILES of input
|
|
2533
|
+
- ``output_smiles``: canonical SMILES of output
|
|
2534
|
+
- ``input_name``: IUPAC name of input
|
|
2535
|
+
- ``output_name``: IUPAC name of output (from ChemScript)
|
|
2536
|
+
- ``aligned_names``: side-by-side aligned name comparison string
|
|
2537
|
+
- ``diff``: sub-dict with:
|
|
2538
|
+
|
|
2539
|
+
- ``atoms_added``: list of fragment names added
|
|
2540
|
+
- ``atoms_removed``: list of fragment names removed
|
|
2541
|
+
- ``atoms_changed``: list of ``{"from": ..., "to": ...}`` dicts
|
|
2542
|
+
- ``mcs_smarts``: maximum common substructure SMARTS (str or None)
|
|
2543
|
+
- ``delta_formula``: formula difference (e.g. ``"+C6H5, -F"``)
|
|
2544
|
+
- ``delta_mw``: MW difference in Da (float)
|
|
2545
|
+
|
|
2546
|
+
- ``formula``: molecular formula of output
|
|
2547
|
+
- ``mw``: exact monoisotopic MW of output
|
|
2548
|
+
|
|
2549
|
+
Examples
|
|
2550
|
+
--------
|
|
2551
|
+
::
|
|
2552
|
+
|
|
2553
|
+
# Swap a CD3 for benzyl via SMARTS
|
|
2554
|
+
result = modify_molecule(
|
|
2555
|
+
{"smiles": "C([2H])([2H])[2H]"},
|
|
2556
|
+
"smarts",
|
|
2557
|
+
smarts="[C:1]([2H])([2H])[2H]>>[C:1]Cc1ccccc1",
|
|
2558
|
+
)
|
|
2559
|
+
|
|
2560
|
+
# Add a fluoro group via name surgery
|
|
2561
|
+
result = modify_molecule(
|
|
2562
|
+
{"smiles": "Clc1ccncc1"},
|
|
2563
|
+
"name_surgery",
|
|
2564
|
+
add=[{"locant": "3", "prefix": "fluoro"}],
|
|
2565
|
+
)
|
|
2566
|
+
|
|
2567
|
+
# Directly set new SMILES and verify
|
|
2568
|
+
result = modify_molecule(
|
|
2569
|
+
{"smiles": "Clc1ccncc1"},
|
|
2570
|
+
"set_smiles",
|
|
2571
|
+
new_smiles="Clc1cc(F)ncc1",
|
|
2572
|
+
description="added fluoro at C3",
|
|
2573
|
+
)
|
|
2574
|
+
"""
|
|
2575
|
+
from rdkit import Chem
|
|
2576
|
+
|
|
2577
|
+
# ---- Validate input ----
|
|
2578
|
+
input_smiles_raw = mol_json.get("smiles", "")
|
|
2579
|
+
if not input_smiles_raw:
|
|
2580
|
+
return {"ok": False, "error": "mol_json must contain 'smiles'."}
|
|
2581
|
+
|
|
2582
|
+
in_mol = Chem.MolFromSmiles(input_smiles_raw)
|
|
2583
|
+
if in_mol is None:
|
|
2584
|
+
return {"ok": False,
|
|
2585
|
+
"error": f"Could not parse input SMILES: '{input_smiles_raw}'."}
|
|
2586
|
+
input_smiles = Chem.MolToSmiles(in_mol)
|
|
2587
|
+
|
|
2588
|
+
output_smiles: Optional[str] = None
|
|
2589
|
+
alternative_products: List[Dict[str, Any]] = []
|
|
2590
|
+
|
|
2591
|
+
# ---- Dispatch operation ----
|
|
2592
|
+
if operation == "set_smiles":
|
|
2593
|
+
new_smiles = kwargs.get("new_smiles", "")
|
|
2594
|
+
if not new_smiles:
|
|
2595
|
+
return {"ok": False, "error": "'new_smiles' is required for set_smiles."}
|
|
2596
|
+
out_mol = Chem.MolFromSmiles(new_smiles)
|
|
2597
|
+
if out_mol is None:
|
|
2598
|
+
return {"ok": False,
|
|
2599
|
+
"error": f"'new_smiles' is not a valid SMILES: '{new_smiles}'."}
|
|
2600
|
+
output_smiles = Chem.MolToSmiles(out_mol)
|
|
2601
|
+
|
|
2602
|
+
elif operation == "smarts":
|
|
2603
|
+
smarts_str = kwargs.get("smarts", "")
|
|
2604
|
+
reaction_name = kwargs.get("reaction_name", "")
|
|
2605
|
+
|
|
2606
|
+
if reaction_name and not smarts_str:
|
|
2607
|
+
templates = _get_reaction_templates()
|
|
2608
|
+
tmpl = templates.get(reaction_name)
|
|
2609
|
+
if tmpl is None:
|
|
2610
|
+
return {"ok": False,
|
|
2611
|
+
"error": f"Unknown reaction_name '{reaction_name}'."}
|
|
2612
|
+
smarts_str = tmpl["smarts"]
|
|
2613
|
+
|
|
2614
|
+
if not smarts_str:
|
|
2615
|
+
return {"ok": False,
|
|
2616
|
+
"error": "'smarts' or 'reaction_name' is required for smarts."}
|
|
2617
|
+
|
|
2618
|
+
try:
|
|
2619
|
+
from rdkit.Chem import AllChem
|
|
2620
|
+
rxn = AllChem.ReactionFromSmarts(smarts_str)
|
|
2621
|
+
except Exception as exc:
|
|
2622
|
+
return {"ok": False, "error": f"Invalid reaction SMARTS: {exc}"}
|
|
2623
|
+
|
|
2624
|
+
try:
|
|
2625
|
+
product_sets = rxn.RunReactants((in_mol,))
|
|
2626
|
+
except Exception as exc:
|
|
2627
|
+
return {"ok": False, "error": f"SMARTS reaction failed: {exc}"}
|
|
2628
|
+
|
|
2629
|
+
if not product_sets:
|
|
2630
|
+
# Detect common patterns and suggest named reactions
|
|
2631
|
+
hints = []
|
|
2632
|
+
s = smarts_str
|
|
2633
|
+
# Check most specific patterns first
|
|
2634
|
+
if any(p in s for p in ["OC(C)(C)C", "Boc", "BOC", "boc", "tBu"]):
|
|
2635
|
+
hints.append("For Boc deprotection, try: operation='reaction', reaction_name='BOC_deprotection'")
|
|
2636
|
+
elif any(p in s for p in ["Fmoc", "fmoc", "fluorenyl"]):
|
|
2637
|
+
hints.append("For Fmoc deprotection, try: operation='reaction', reaction_name='fmoc_deprotection'")
|
|
2638
|
+
elif "C(=O)N" in s and "OC(=O)N" not in s:
|
|
2639
|
+
hints.append("For amide hydrolysis, try: operation='reaction', reaction_name='amide_hydrolysis'")
|
|
2640
|
+
elif "C(=O)O" in s:
|
|
2641
|
+
hints.append("For ester hydrolysis, try: operation='reaction', reaction_name='ester_hydrolysis'")
|
|
2642
|
+
if not hints:
|
|
2643
|
+
hints.append("Hint: use operation='reaction' with a reaction_name for common transformations. Call modify_molecule with operation='reaction' and no reaction_name to see all available reactions.")
|
|
2644
|
+
return {
|
|
2645
|
+
"ok": False,
|
|
2646
|
+
"error": (
|
|
2647
|
+
"SMARTS pattern did not match the input molecule. "
|
|
2648
|
+
+ " ".join(hints)
|
|
2649
|
+
),
|
|
2650
|
+
"input_smiles": input_smiles,
|
|
2651
|
+
}
|
|
2652
|
+
|
|
2653
|
+
for prod_tuple in product_sets:
|
|
2654
|
+
for prod in prod_tuple:
|
|
2655
|
+
try:
|
|
2656
|
+
Chem.SanitizeMol(prod)
|
|
2657
|
+
output_smiles = Chem.MolToSmiles(prod)
|
|
2658
|
+
break
|
|
2659
|
+
except Exception:
|
|
2660
|
+
continue
|
|
2661
|
+
if output_smiles:
|
|
2662
|
+
break
|
|
2663
|
+
|
|
2664
|
+
if not output_smiles:
|
|
2665
|
+
return {"ok": False,
|
|
2666
|
+
"error": "SMARTS reaction produced no valid products."}
|
|
2667
|
+
|
|
2668
|
+
elif operation == "name_surgery":
|
|
2669
|
+
iupac_name = (mol_json.get("iupac_name")
|
|
2670
|
+
or mol_json.get("name")
|
|
2671
|
+
or _smiles_to_name_cs(input_smiles))
|
|
2672
|
+
|
|
2673
|
+
if not iupac_name:
|
|
2674
|
+
return {
|
|
2675
|
+
"ok": False,
|
|
2676
|
+
"error": (
|
|
2677
|
+
"name_surgery requires an IUPAC name. "
|
|
2678
|
+
"Provide 'iupac_name' in mol_json, or ensure ChemScript "
|
|
2679
|
+
"is available to auto-generate one."
|
|
2680
|
+
),
|
|
2681
|
+
}
|
|
2682
|
+
|
|
2683
|
+
add_list: List[Dict[str, str]] = kwargs.get("add", [])
|
|
2684
|
+
remove_list: List[str] = kwargs.get("remove", [])
|
|
2685
|
+
|
|
2686
|
+
current_name = iupac_name
|
|
2687
|
+
|
|
2688
|
+
for prefix_to_remove in remove_list:
|
|
2689
|
+
# Auto-resolve abbreviations to IUPAC prefix form
|
|
2690
|
+
pfx_r = get_prefix_form(prefix_to_remove)
|
|
2691
|
+
if pfx_r.get("ok"):
|
|
2692
|
+
prefix_to_remove = pfx_r["prefix"]
|
|
2693
|
+
res = _modify_remove(current_name, prefix_to_remove,
|
|
2694
|
+
validate=True, use_network=False)
|
|
2695
|
+
if not res.get("ok"):
|
|
2696
|
+
return {
|
|
2697
|
+
"ok": False,
|
|
2698
|
+
"error": (f"name_surgery remove '{prefix_to_remove}' "
|
|
2699
|
+
f"failed: {res.get('error', '?')}"),
|
|
2700
|
+
"input_smiles": input_smiles,
|
|
2701
|
+
"tried_name": current_name,
|
|
2702
|
+
}
|
|
2703
|
+
if res.get("valid") and res.get("smiles"):
|
|
2704
|
+
current_name = res["name"]
|
|
2705
|
+
else:
|
|
2706
|
+
return {
|
|
2707
|
+
"ok": False,
|
|
2708
|
+
"error": (f"name_surgery remove '{prefix_to_remove}' "
|
|
2709
|
+
f"produced invalid name: '{res.get('name')}'."),
|
|
2710
|
+
"input_smiles": input_smiles,
|
|
2711
|
+
}
|
|
2712
|
+
|
|
2713
|
+
for sub in add_list:
|
|
2714
|
+
prefix = sub.get("prefix", "")
|
|
2715
|
+
locant = sub.get("locant", "")
|
|
2716
|
+
if not prefix:
|
|
2717
|
+
continue
|
|
2718
|
+
# Auto-resolve abbreviations/formulae to IUPAC prefix form
|
|
2719
|
+
# so the agent can say "CF3" instead of "trifluoromethyl".
|
|
2720
|
+
pfx_result = get_prefix_form(prefix)
|
|
2721
|
+
if pfx_result.get("ok"):
|
|
2722
|
+
prefix = pfx_result["prefix"]
|
|
2723
|
+
res = _modify_add(current_name, prefix, locant,
|
|
2724
|
+
validate=True, use_network=False)
|
|
2725
|
+
if not res.get("ok"):
|
|
2726
|
+
return {
|
|
2727
|
+
"ok": False,
|
|
2728
|
+
"error": (f"name_surgery add '{prefix}' at '{locant}' "
|
|
2729
|
+
f"failed: {res.get('error', '?')}"),
|
|
2730
|
+
"input_smiles": input_smiles,
|
|
2731
|
+
"tried_name": current_name,
|
|
2732
|
+
}
|
|
2733
|
+
if res.get("valid") and res.get("smiles"):
|
|
2734
|
+
current_name = res["name"]
|
|
2735
|
+
else:
|
|
2736
|
+
return {
|
|
2737
|
+
"ok": False,
|
|
2738
|
+
"error": (f"name_surgery add '{prefix}' produced "
|
|
2739
|
+
f"invalid name: '{res.get('name')}'."),
|
|
2740
|
+
"input_smiles": input_smiles,
|
|
2741
|
+
}
|
|
2742
|
+
|
|
2743
|
+
output_smiles = _try_validate(current_name, use_network=False)
|
|
2744
|
+
if not output_smiles:
|
|
2745
|
+
return {
|
|
2746
|
+
"ok": False,
|
|
2747
|
+
"error": f"Could not validate name surgery result: '{current_name}'.",
|
|
2748
|
+
"input_smiles": input_smiles,
|
|
2749
|
+
"output_name_attempted": current_name,
|
|
2750
|
+
}
|
|
2751
|
+
out_mol = Chem.MolFromSmiles(output_smiles)
|
|
2752
|
+
if out_mol:
|
|
2753
|
+
output_smiles = Chem.MolToSmiles(out_mol)
|
|
2754
|
+
|
|
2755
|
+
elif operation == "reaction":
|
|
2756
|
+
# ---- Reaction: apply a named reaction template via apply_reaction ----
|
|
2757
|
+
reaction_name = kwargs.get("reaction_name", "")
|
|
2758
|
+
if not reaction_name:
|
|
2759
|
+
rxn_list = list_reactions()
|
|
2760
|
+
names = [r["name"] for r in rxn_list.get("reactions", [])]
|
|
2761
|
+
return {
|
|
2762
|
+
"ok": False,
|
|
2763
|
+
"error": (
|
|
2764
|
+
"'reaction_name' is required for the reaction operation. "
|
|
2765
|
+
f"Available reactions: {', '.join(names)}"
|
|
2766
|
+
),
|
|
2767
|
+
"input_smiles": input_smiles,
|
|
2768
|
+
}
|
|
2769
|
+
|
|
2770
|
+
reagent_dict = kwargs.get("reagent", None)
|
|
2771
|
+
reagent_smiles = reagent_dict.get("smiles") if isinstance(reagent_dict, dict) else None
|
|
2772
|
+
|
|
2773
|
+
rxn_result = apply_reaction(reaction_name, input_smiles, reagent_smiles)
|
|
2774
|
+
if not rxn_result.get("ok"):
|
|
2775
|
+
return {
|
|
2776
|
+
"ok": False,
|
|
2777
|
+
"error": rxn_result.get("error", "Reaction failed."),
|
|
2778
|
+
"input_smiles": input_smiles,
|
|
2779
|
+
"reaction_name": reaction_name,
|
|
2780
|
+
}
|
|
2781
|
+
|
|
2782
|
+
products = rxn_result.get("products", [])
|
|
2783
|
+
if not products:
|
|
2784
|
+
return {
|
|
2785
|
+
"ok": False,
|
|
2786
|
+
"error": "Reaction produced no products.",
|
|
2787
|
+
"input_smiles": input_smiles,
|
|
2788
|
+
"reaction_name": reaction_name,
|
|
2789
|
+
}
|
|
2790
|
+
|
|
2791
|
+
# Primary product is first; store remaining as alternatives
|
|
2792
|
+
output_smiles = products[0]["smiles"]
|
|
2793
|
+
alternative_products = products[1:] if len(products) > 1 else []
|
|
2794
|
+
|
|
2795
|
+
elif operation == "analyze":
|
|
2796
|
+
# ---- Analyze: reason about a molecule without modifying it ----
|
|
2797
|
+
# Functional group SMARTS (name → SMARTS pattern).
|
|
2798
|
+
_FG_SMARTS: List[tuple] = [
|
|
2799
|
+
# Halogens
|
|
2800
|
+
("aryl fluoride", "[F][c]"),
|
|
2801
|
+
("aryl chloride", "[Cl][c]"),
|
|
2802
|
+
("aryl bromide", "[Br][c]"),
|
|
2803
|
+
("aryl iodide", "[I][c]"),
|
|
2804
|
+
("alkyl fluoride", "[F][CX4]"),
|
|
2805
|
+
("alkyl chloride", "[Cl][CX4]"),
|
|
2806
|
+
("alkyl bromide", "[Br][CX4]"),
|
|
2807
|
+
("alkyl iodide", "[I][CX4]"),
|
|
2808
|
+
# Nitrogen
|
|
2809
|
+
("primary amine", "[NH2][CX4]"),
|
|
2810
|
+
("secondary amine", "[NH1]([CX4])[CX4]"),
|
|
2811
|
+
("tertiary amine", "[NX3;!$(N=*)]([CX4])([CX4])[CX4]"),
|
|
2812
|
+
("aromatic amine", "[NH2][c]"),
|
|
2813
|
+
("amide", "[CX3](=[OX1])[NX3]"),
|
|
2814
|
+
("sulfonamide", "[SX4](=[OX1])(=[OX1])[NX3]"),
|
|
2815
|
+
("nitro", "[$([NX3](=O)=O),$([NX3+](=O)[O-])]"),
|
|
2816
|
+
("nitrile", "[CX2]#[NX1]"),
|
|
2817
|
+
("isocyanate", "[NX2]=[C]=[OX1]"),
|
|
2818
|
+
("urea", "[NX3][CX3](=[OX1])[NX3]"),
|
|
2819
|
+
("carbamate", "[NX3][CX3](=[OX1])[OX2]"),
|
|
2820
|
+
# Oxygen
|
|
2821
|
+
("carboxylic acid", "[CX3](=[OX1])[OX2H1]"),
|
|
2822
|
+
("ester", "[CX3](=[OX1])[OX2][CX4]"),
|
|
2823
|
+
("ketone", "[CX3](=[OX1])[CX4]"),
|
|
2824
|
+
("aldehyde", "[CX3H1](=[OX1])"),
|
|
2825
|
+
("alcohol", "[OX2H][CX4]"),
|
|
2826
|
+
("phenol", "[OX2H][c]"),
|
|
2827
|
+
("ether", "[OX2]([CX4])[CX4]"),
|
|
2828
|
+
("aryl ether", "[OX2]([c])[CX4,c]"),
|
|
2829
|
+
("epoxide", "[C]1[O][C]1"),
|
|
2830
|
+
("anhydride", "[CX3](=[OX1])[OX2][CX3](=[OX1])"),
|
|
2831
|
+
# Sulfur
|
|
2832
|
+
("thiol", "[SX2H]"),
|
|
2833
|
+
("thioether", "[SX2]([CX4])[CX4]"),
|
|
2834
|
+
("sulfoxide", "[$([SX3]=O)]"),
|
|
2835
|
+
("sulfone", "[$([SX4](=[OX1])(=[OX1]))]"),
|
|
2836
|
+
# Phosphorus
|
|
2837
|
+
("phosphate", "[PX4](=[OX1])([OX2])([OX2])[OX2]"),
|
|
2838
|
+
("phosphonic acid", "[PX4](=[OX1])([OX2H])([OX2H])"),
|
|
2839
|
+
# Boron
|
|
2840
|
+
("boronic acid", "[BX3]([OX2H])[OX2H]"),
|
|
2841
|
+
("boronate ester", "[BX3]([OX2])[OX2]"),
|
|
2842
|
+
# Heterocycles (aromatic)
|
|
2843
|
+
("pyridine", "c1ccncc1"),
|
|
2844
|
+
("pyrimidine", "c1cnccn1"),
|
|
2845
|
+
("pyrazine", "c1cnccn1"),
|
|
2846
|
+
("imidazole", "c1cnc[nH]1"),
|
|
2847
|
+
("pyrazole", "c1cc[nH]n1"),
|
|
2848
|
+
("triazole", "c1cn[nH]n1"),
|
|
2849
|
+
("tetrazole", "c1nnn[nH]1"),
|
|
2850
|
+
("oxazole", "c1cocn1"),
|
|
2851
|
+
("thiazole", "c1cscn1"),
|
|
2852
|
+
("indole", "c1ccc2[nH]ccc2c1"),
|
|
2853
|
+
("benzimidazole", "c1cnc2ccccc2n1"),
|
|
2854
|
+
("quinoline", "c1ccc2ncccc2c1"),
|
|
2855
|
+
("isoquinoline", "c1ccc2cnccc2c1"),
|
|
2856
|
+
("piperidine", "[NH]1CCCCC1"),
|
|
2857
|
+
("piperazine", "N1CCNCC1"),
|
|
2858
|
+
("morpholine", "O1CCNCC1"),
|
|
2859
|
+
("pyrrolidine", "[NH]1CCCC1"),
|
|
2860
|
+
("azetidine", "[NH]1CCC1"),
|
|
2861
|
+
# Protected amines
|
|
2862
|
+
("Boc-protected amine", "[NX3][CX3](=[OX1])OC(C)(C)C"),
|
|
2863
|
+
("Cbz-protected amine", "[NX3][CX3](=[OX1])OCc1ccccc1"),
|
|
2864
|
+
("Fmoc-protected amine", "[NX3][CX3](=[OX1])OCC1c2ccccc2-c2ccccc21"),
|
|
2865
|
+
]
|
|
2866
|
+
|
|
2867
|
+
# Remove any broken SMARTS (the sulfone pattern has a typo guard)
|
|
2868
|
+
valid_fg_patterns: List[tuple] = []
|
|
2869
|
+
for fg_name, fg_smarts in _FG_SMARTS:
|
|
2870
|
+
try:
|
|
2871
|
+
from rdkit.Chem import MolFromSmarts
|
|
2872
|
+
patt = MolFromSmarts(fg_smarts)
|
|
2873
|
+
if patt is not None:
|
|
2874
|
+
valid_fg_patterns.append((fg_name, patt))
|
|
2875
|
+
except Exception:
|
|
2876
|
+
pass
|
|
2877
|
+
|
|
2878
|
+
# Detect functional groups
|
|
2879
|
+
functional_groups: List[str] = []
|
|
2880
|
+
for fg_name, patt in valid_fg_patterns:
|
|
2881
|
+
if in_mol.HasSubstructMatch(patt):
|
|
2882
|
+
functional_groups.append(fg_name)
|
|
2883
|
+
|
|
2884
|
+
# Get canonical IUPAC name from ChemScript
|
|
2885
|
+
canonical_name = _smiles_to_name_cs(input_smiles) or ""
|
|
2886
|
+
|
|
2887
|
+
# Get decomposition (alternatives + bracket tree)
|
|
2888
|
+
alternative_names: List[str] = []
|
|
2889
|
+
bracket_tree_str: Optional[str] = None
|
|
2890
|
+
try:
|
|
2891
|
+
from cdxml_toolkit.naming.name_decomposer import decompose_name
|
|
2892
|
+
decomp = decompose_name(input_smiles)
|
|
2893
|
+
if decomp.alternatives:
|
|
2894
|
+
alternative_names = [a.name for a in decomp.alternatives
|
|
2895
|
+
if a.valid and a.name]
|
|
2896
|
+
if decomp.bracket_tree is not None:
|
|
2897
|
+
bracket_tree_str = decomp.canonical_name
|
|
2898
|
+
if not canonical_name and decomp.canonical_name:
|
|
2899
|
+
canonical_name = decomp.canonical_name
|
|
2900
|
+
except Exception:
|
|
2901
|
+
pass
|
|
2902
|
+
|
|
2903
|
+
# Get prefix form (substituent name)
|
|
2904
|
+
prefix_form: Optional[str] = None
|
|
2905
|
+
try:
|
|
2906
|
+
pfx_result = get_prefix_form(canonical_name or input_smiles)
|
|
2907
|
+
if pfx_result.get("ok"):
|
|
2908
|
+
prefix_form = pfx_result["prefix"]
|
|
2909
|
+
except Exception:
|
|
2910
|
+
pass
|
|
2911
|
+
|
|
2912
|
+
formula = _compute_formula(input_smiles)
|
|
2913
|
+
mw_val = _compute_mw(input_smiles)
|
|
2914
|
+
|
|
2915
|
+
return {
|
|
2916
|
+
"ok": True,
|
|
2917
|
+
"input_smiles": input_smiles,
|
|
2918
|
+
"canonical_name": canonical_name,
|
|
2919
|
+
"alternative_names": alternative_names,
|
|
2920
|
+
"functional_groups": functional_groups,
|
|
2921
|
+
"prefix_form": prefix_form,
|
|
2922
|
+
"bracket_tree": bracket_tree_str,
|
|
2923
|
+
"formula": formula,
|
|
2924
|
+
"mw": mw_val,
|
|
2925
|
+
}
|
|
2926
|
+
|
|
2927
|
+
elif operation == "set_name":
|
|
2928
|
+
# ---- Set name: resolve a new IUPAC name to SMILES, validate, diff ----
|
|
2929
|
+
new_name = kwargs.get("new_name", "")
|
|
2930
|
+
if not new_name:
|
|
2931
|
+
return {"ok": False, "error": "'new_name' is required for set_name."}
|
|
2932
|
+
|
|
2933
|
+
# Try to resolve the name to SMILES
|
|
2934
|
+
output_smiles = _try_validate(new_name, use_network=True)
|
|
2935
|
+
if not output_smiles:
|
|
2936
|
+
# Also try resolve_to_smiles in case it's a common name
|
|
2937
|
+
r = resolve_to_smiles(new_name, use_network=True)
|
|
2938
|
+
if r.get("ok"):
|
|
2939
|
+
output_smiles = r["smiles"]
|
|
2940
|
+
|
|
2941
|
+
if not output_smiles:
|
|
2942
|
+
return {
|
|
2943
|
+
"ok": False,
|
|
2944
|
+
"error": f"Could not resolve name '{new_name}' to a valid structure.",
|
|
2945
|
+
"input_smiles": input_smiles,
|
|
2946
|
+
}
|
|
2947
|
+
out_mol = Chem.MolFromSmiles(output_smiles)
|
|
2948
|
+
if out_mol is None:
|
|
2949
|
+
return {
|
|
2950
|
+
"ok": False,
|
|
2951
|
+
"error": f"Name '{new_name}' resolved but SMILES is invalid.",
|
|
2952
|
+
"input_smiles": input_smiles,
|
|
2953
|
+
}
|
|
2954
|
+
output_smiles = Chem.MolToSmiles(out_mol)
|
|
2955
|
+
|
|
2956
|
+
else:
|
|
2957
|
+
return {
|
|
2958
|
+
"ok": False,
|
|
2959
|
+
"error": (f"Unknown operation '{operation}'. "
|
|
2960
|
+
"Use 'analyze', 'name_surgery', 'smarts', "
|
|
2961
|
+
"'set_smiles', 'set_name', or 'reaction'."),
|
|
2962
|
+
}
|
|
2963
|
+
|
|
2964
|
+
# ---- Build output ----
|
|
2965
|
+
input_name = (mol_json.get("iupac_name")
|
|
2966
|
+
or mol_json.get("name")
|
|
2967
|
+
or _smiles_to_name_cs(input_smiles)
|
|
2968
|
+
or "")
|
|
2969
|
+
output_name = _smiles_to_name_cs(output_smiles) or ""
|
|
2970
|
+
|
|
2971
|
+
aligned_names = _build_aligned_names(input_smiles, output_smiles)
|
|
2972
|
+
diff = _build_mol_diff(input_smiles, output_smiles)
|
|
2973
|
+
|
|
2974
|
+
formula = _compute_formula(output_smiles)
|
|
2975
|
+
mw_out = _compute_mw(output_smiles)
|
|
2976
|
+
|
|
2977
|
+
result = {
|
|
2978
|
+
"ok": True,
|
|
2979
|
+
"input_smiles": input_smiles,
|
|
2980
|
+
"output_smiles": output_smiles,
|
|
2981
|
+
"input_name": input_name,
|
|
2982
|
+
"output_name": output_name,
|
|
2983
|
+
"aligned_names": aligned_names,
|
|
2984
|
+
"diff": diff,
|
|
2985
|
+
"formula": formula,
|
|
2986
|
+
"mw": mw_out,
|
|
2987
|
+
}
|
|
2988
|
+
if alternative_products:
|
|
2989
|
+
result["alternative_products"] = alternative_products
|
|
2990
|
+
return result
|
|
2991
|
+
|
|
2992
|
+
|
|
2993
|
+
# ---------------------------------------------------------------------------
|
|
2994
|
+
# Tool definitions for LLM function calling
|
|
2995
|
+
# ---------------------------------------------------------------------------
|
|
2996
|
+
|
|
2997
|
+
def get_tool_definitions() -> List[Dict[str, Any]]:
|
|
2998
|
+
"""Return tool schemas suitable for LLM function calling (Claude/OpenAI).
|
|
2999
|
+
|
|
3000
|
+
Each tool definition follows the Anthropic tool-use format::
|
|
3001
|
+
|
|
3002
|
+
{"name": "...", "description": "...", "input_schema": {...}}
|
|
3003
|
+
|
|
3004
|
+
The LLM orchestrator should register these as available tools and
|
|
3005
|
+
call the corresponding Python functions based on the LLM's output.
|
|
3006
|
+
|
|
3007
|
+
Returns:
|
|
3008
|
+
List of tool definition dicts.
|
|
3009
|
+
"""
|
|
3010
|
+
return [
|
|
3011
|
+
{
|
|
3012
|
+
"name": "resolve_compound",
|
|
3013
|
+
"description": (
|
|
3014
|
+
"Resolve a chemical identifier to a rich molecule descriptor "
|
|
3015
|
+
"with SMILES, molecular formula, MW, exact mass, IUPAC name, "
|
|
3016
|
+
"reagent role, display text, and IUPAC substituent prefix form. "
|
|
3017
|
+
"This is the preferred resolver — use it whenever you need more "
|
|
3018
|
+
"than just SMILES.\n\n"
|
|
3019
|
+
"Accepts common names, IUPAC names, abbreviations, condensed "
|
|
3020
|
+
"formulae, and CAS numbers. Resolution order:\n"
|
|
3021
|
+
" 1. Curated reagent DB (~186 entries with roles)\n"
|
|
3022
|
+
" 2. Generative condensed formula parser (offline)\n"
|
|
3023
|
+
" 3. ChemScript IUPAC name engine (offline)\n"
|
|
3024
|
+
" 4. PubChem API (online, if use_network=True)\n\n"
|
|
3025
|
+
"Output fields include:\n"
|
|
3026
|
+
" - smiles, formula, mw, exact_mass, iupac_name, source\n"
|
|
3027
|
+
" - role, display_text (from curated reagent DB if known)\n"
|
|
3028
|
+
" - prefix_form: IUPAC substituent prefix for use in "
|
|
3029
|
+
"assemble_name (e.g. 'trifluoromethyl' for CF3, 'morpholino' "
|
|
3030
|
+
"for morpholine); null if not a substituent group.\n\n"
|
|
3031
|
+
"Examples of valid queries:\n"
|
|
3032
|
+
' - Common names: "aspirin", "morpholine", "HATU"\n'
|
|
3033
|
+
' - Abbreviations: "Cs2CO3", "DIPEA", "Et3N"\n'
|
|
3034
|
+
' - IUPAC names: "2-chloropyridine"\n'
|
|
3035
|
+
' - Formulae: "PhB(OH)2", "CF3COOH"\n'
|
|
3036
|
+
' - CAS numbers: "534-17-8"\n'
|
|
3037
|
+
' - Drug names: "deucravacitinib"\n'
|
|
3038
|
+
),
|
|
3039
|
+
"input_schema": {
|
|
3040
|
+
"type": "object",
|
|
3041
|
+
"properties": {
|
|
3042
|
+
"query": {
|
|
3043
|
+
"type": "string",
|
|
3044
|
+
"description": "Chemical identifier to resolve.",
|
|
3045
|
+
},
|
|
3046
|
+
"use_network": {
|
|
3047
|
+
"type": "boolean",
|
|
3048
|
+
"description": (
|
|
3049
|
+
"Allow PubChem lookup (default: true). "
|
|
3050
|
+
"Set false for offline-only resolution."
|
|
3051
|
+
),
|
|
3052
|
+
},
|
|
3053
|
+
},
|
|
3054
|
+
"required": ["query"],
|
|
3055
|
+
},
|
|
3056
|
+
},
|
|
3057
|
+
{
|
|
3058
|
+
"name": "resolve_to_smiles",
|
|
3059
|
+
"description": (
|
|
3060
|
+
"Resolve a chemical identifier (name, abbreviation, formula, "
|
|
3061
|
+
"or CAS number) to a canonical SMILES string. Use this when "
|
|
3062
|
+
"you need only the SMILES; for richer output (formula, MW, "
|
|
3063
|
+
"exact mass, role) use resolve_compound instead.\n\n"
|
|
3064
|
+
"Examples of valid queries:\n"
|
|
3065
|
+
' - Common names: "aspirin", "morpholine", "HATU"\n'
|
|
3066
|
+
' - IUPAC names: "2-chloropyridine", "4-methylbenzoic acid"\n'
|
|
3067
|
+
' - Formulae: "PhB(OH)2", "Et3N", "CF3COOH"\n'
|
|
3068
|
+
' - CAS numbers: "534-17-8"\n'
|
|
3069
|
+
),
|
|
3070
|
+
"input_schema": {
|
|
3071
|
+
"type": "object",
|
|
3072
|
+
"properties": {
|
|
3073
|
+
"query": {
|
|
3074
|
+
"type": "string",
|
|
3075
|
+
"description": "Chemical identifier to resolve.",
|
|
3076
|
+
},
|
|
3077
|
+
},
|
|
3078
|
+
"required": ["query"],
|
|
3079
|
+
},
|
|
3080
|
+
},
|
|
3081
|
+
{
|
|
3082
|
+
"name": "get_prefix_form",
|
|
3083
|
+
"description": (
|
|
3084
|
+
"Get the IUPAC substituent prefix form for a chemical group "
|
|
3085
|
+
"so it can be used in assemble_name. Returns the prefix "
|
|
3086
|
+
"string (e.g. 'trifluoromethyl' for 'CF3', 'morpholino' for "
|
|
3087
|
+
"'morpholine').\n\n"
|
|
3088
|
+
"Use this when you know what group to attach but need its "
|
|
3089
|
+
"correct IUPAC prefix name.\n\n"
|
|
3090
|
+
"Examples:\n"
|
|
3091
|
+
' - "CF3" -> "trifluoromethyl"\n'
|
|
3092
|
+
' - "NO2" -> "nitro"\n'
|
|
3093
|
+
' - "OMe" -> "methoxy"\n'
|
|
3094
|
+
' - "morpholine" -> "morpholino"\n'
|
|
3095
|
+
' - "cyclopropane" -> "cyclopropyl"\n'
|
|
3096
|
+
),
|
|
3097
|
+
"input_schema": {
|
|
3098
|
+
"type": "object",
|
|
3099
|
+
"properties": {
|
|
3100
|
+
"group": {
|
|
3101
|
+
"type": "string",
|
|
3102
|
+
"description": (
|
|
3103
|
+
"Group to look up: abbreviation ('CF3', 'OMe'), "
|
|
3104
|
+
"name ('morpholine'), or formula ('CHF2')."
|
|
3105
|
+
),
|
|
3106
|
+
},
|
|
3107
|
+
},
|
|
3108
|
+
"required": ["group"],
|
|
3109
|
+
},
|
|
3110
|
+
},
|
|
3111
|
+
{
|
|
3112
|
+
"name": "assemble_name",
|
|
3113
|
+
"description": (
|
|
3114
|
+
"Build an IUPAC name from a parent ring/chain and a list of "
|
|
3115
|
+
"substituents. Handles alphabetical ordering and multiplying "
|
|
3116
|
+
"prefixes (di-, tri-) automatically. Validates the assembled "
|
|
3117
|
+
"name by resolving it to a structure.\n\n"
|
|
3118
|
+
"Example:\n"
|
|
3119
|
+
" parent: 'pyridine'\n"
|
|
3120
|
+
" substituents: [\n"
|
|
3121
|
+
' {"locant": "2", "prefix": "chloro"},\n'
|
|
3122
|
+
' {"locant": "5", "prefix": "trifluoromethyl"}\n'
|
|
3123
|
+
" ]\n"
|
|
3124
|
+
" -> '2-chloro-5-(trifluoromethyl)pyridine'\n"
|
|
3125
|
+
),
|
|
3126
|
+
"input_schema": {
|
|
3127
|
+
"type": "object",
|
|
3128
|
+
"properties": {
|
|
3129
|
+
"parent": {
|
|
3130
|
+
"type": "string",
|
|
3131
|
+
"description": (
|
|
3132
|
+
"Parent ring or chain name "
|
|
3133
|
+
"(e.g. 'pyridine', 'benzene', 'pentane')."
|
|
3134
|
+
),
|
|
3135
|
+
},
|
|
3136
|
+
"substituents": {
|
|
3137
|
+
"type": "array",
|
|
3138
|
+
"items": {
|
|
3139
|
+
"type": "object",
|
|
3140
|
+
"properties": {
|
|
3141
|
+
"locant": {
|
|
3142
|
+
"type": "string",
|
|
3143
|
+
"description": "Position number (e.g. '2', '3').",
|
|
3144
|
+
},
|
|
3145
|
+
"prefix": {
|
|
3146
|
+
"type": "string",
|
|
3147
|
+
"description": (
|
|
3148
|
+
"IUPAC prefix (e.g. 'chloro', 'methyl'). "
|
|
3149
|
+
"Use get_prefix_form first if unsure."
|
|
3150
|
+
),
|
|
3151
|
+
},
|
|
3152
|
+
},
|
|
3153
|
+
"required": ["locant", "prefix"],
|
|
3154
|
+
},
|
|
3155
|
+
"description": "List of substituents with positions.",
|
|
3156
|
+
},
|
|
3157
|
+
},
|
|
3158
|
+
"required": ["parent", "substituents"],
|
|
3159
|
+
},
|
|
3160
|
+
},
|
|
3161
|
+
{
|
|
3162
|
+
"name": "modify_name",
|
|
3163
|
+
"description": (
|
|
3164
|
+
"Modify an existing IUPAC name by swapping, adding, or "
|
|
3165
|
+
"removing a substituent. The name is re-alphabetised and "
|
|
3166
|
+
"validated automatically.\n\n"
|
|
3167
|
+
"Operations:\n"
|
|
3168
|
+
" - 'swap': Replace target prefix with replacement.\n"
|
|
3169
|
+
" Example: swap 'nitro' -> 'amino' in '4-nitropyridine'\n"
|
|
3170
|
+
" - 'add': Insert replacement prefix at locant.\n"
|
|
3171
|
+
" Example: add 'methyl' at '3' to '2-chloropyridine'\n"
|
|
3172
|
+
" - 'remove': Delete the target prefix.\n"
|
|
3173
|
+
" Example: remove 'chloro' from '2-chloro-3-methylpyridine'\n"
|
|
3174
|
+
),
|
|
3175
|
+
"input_schema": {
|
|
3176
|
+
"type": "object",
|
|
3177
|
+
"properties": {
|
|
3178
|
+
"name": {
|
|
3179
|
+
"type": "string",
|
|
3180
|
+
"description": "The IUPAC name to modify.",
|
|
3181
|
+
},
|
|
3182
|
+
"operation": {
|
|
3183
|
+
"type": "string",
|
|
3184
|
+
"enum": ["swap", "add", "remove"],
|
|
3185
|
+
"description": "Type of modification.",
|
|
3186
|
+
},
|
|
3187
|
+
"target": {
|
|
3188
|
+
"type": "string",
|
|
3189
|
+
"description": "Prefix to replace (swap) or remove (remove).",
|
|
3190
|
+
},
|
|
3191
|
+
"replacement": {
|
|
3192
|
+
"type": "string",
|
|
3193
|
+
"description": "New prefix (swap) or prefix to insert (add).",
|
|
3194
|
+
},
|
|
3195
|
+
"locant": {
|
|
3196
|
+
"type": "string",
|
|
3197
|
+
"description": "Position for insertion (add only).",
|
|
3198
|
+
},
|
|
3199
|
+
},
|
|
3200
|
+
"required": ["name", "operation"],
|
|
3201
|
+
},
|
|
3202
|
+
},
|
|
3203
|
+
{
|
|
3204
|
+
"name": "validate_name",
|
|
3205
|
+
"description": (
|
|
3206
|
+
"Check whether an IUPAC name is valid by attempting to "
|
|
3207
|
+
"resolve it to a molecular structure. Returns the canonical "
|
|
3208
|
+
"SMILES if valid. Use this to verify names before generating "
|
|
3209
|
+
"structures.\n\n"
|
|
3210
|
+
"Example:\n"
|
|
3211
|
+
' "2-chloro-3-(trifluoromethyl)pyridine" -> valid, SMILES\n'
|
|
3212
|
+
' "2-chloro-99-methylpyridine" -> invalid\n'
|
|
3213
|
+
),
|
|
3214
|
+
"input_schema": {
|
|
3215
|
+
"type": "object",
|
|
3216
|
+
"properties": {
|
|
3217
|
+
"name": {
|
|
3218
|
+
"type": "string",
|
|
3219
|
+
"description": "IUPAC name to validate.",
|
|
3220
|
+
},
|
|
3221
|
+
},
|
|
3222
|
+
"required": ["name"],
|
|
3223
|
+
},
|
|
3224
|
+
},
|
|
3225
|
+
{
|
|
3226
|
+
"name": "name_to_structure",
|
|
3227
|
+
"description": (
|
|
3228
|
+
"Convert a validated chemical name to a structure file "
|
|
3229
|
+
"(CDXML for ChemDraw, or SMILES/MOL). This is the final "
|
|
3230
|
+
"step: call this after assembling and validating the name.\n\n"
|
|
3231
|
+
"Output formats:\n"
|
|
3232
|
+
' - "cdxml": ChemDraw XML (requires ChemScript)\n'
|
|
3233
|
+
' - "smiles": canonical SMILES string\n'
|
|
3234
|
+
' - "mol": MDL MOL block with 2D coordinates\n'
|
|
3235
|
+
),
|
|
3236
|
+
"input_schema": {
|
|
3237
|
+
"type": "object",
|
|
3238
|
+
"properties": {
|
|
3239
|
+
"name": {
|
|
3240
|
+
"type": "string",
|
|
3241
|
+
"description": "Chemical name to convert.",
|
|
3242
|
+
},
|
|
3243
|
+
"output_format": {
|
|
3244
|
+
"type": "string",
|
|
3245
|
+
"enum": ["cdxml", "smiles", "mol"],
|
|
3246
|
+
"description": "Output format (default: cdxml).",
|
|
3247
|
+
},
|
|
3248
|
+
},
|
|
3249
|
+
"required": ["name"],
|
|
3250
|
+
},
|
|
3251
|
+
},
|
|
3252
|
+
{
|
|
3253
|
+
"name": "enumerate_names",
|
|
3254
|
+
"description": (
|
|
3255
|
+
"List alternative IUPAC name forms for a molecule. "
|
|
3256
|
+
"Given a name or SMILES, returns the canonical name plus "
|
|
3257
|
+
"alternative forms that express the same molecule using "
|
|
3258
|
+
"different parent rings/chains and substituent prefixes.\n\n"
|
|
3259
|
+
"IMPORTANT: Call this BEFORE modify_name when doing name "
|
|
3260
|
+
"surgery on functional groups that appear as suffixes in "
|
|
3261
|
+
"the canonical name (ketones '-one', alcohols '-ol', "
|
|
3262
|
+
"amines '-amine', acids '-oic acid', etc.). The "
|
|
3263
|
+
"alternatives expose these groups as swappable prefixes.\n\n"
|
|
3264
|
+
"Example:\n"
|
|
3265
|
+
" '1-(4-bromophenyl)ethan-1-one' (ketone as suffix)\n"
|
|
3266
|
+
" -> alternatives include '1-acetyl-4-bromobenzene'\n"
|
|
3267
|
+
" where the ketone is now the prefix 'acetyl'\n"
|
|
3268
|
+
" -> you can then swap 'acetyl' for another prefix\n\n"
|
|
3269
|
+
"Each name form includes a 'prefixes' list showing "
|
|
3270
|
+
"which substituent prefixes are visible and swappable.\n"
|
|
3271
|
+
),
|
|
3272
|
+
"input_schema": {
|
|
3273
|
+
"type": "object",
|
|
3274
|
+
"properties": {
|
|
3275
|
+
"identifier": {
|
|
3276
|
+
"type": "string",
|
|
3277
|
+
"description": (
|
|
3278
|
+
"Chemical name, SMILES, abbreviation, or any "
|
|
3279
|
+
"identifier. Will be resolved to a structure."
|
|
3280
|
+
),
|
|
3281
|
+
},
|
|
3282
|
+
},
|
|
3283
|
+
"required": ["identifier"],
|
|
3284
|
+
},
|
|
3285
|
+
},
|
|
3286
|
+
# --- Layer 3: Graph manipulation tools ---
|
|
3287
|
+
{
|
|
3288
|
+
"name": "list_reactions",
|
|
3289
|
+
"description": (
|
|
3290
|
+
"List available named reaction templates. Returns the "
|
|
3291
|
+
"name, description, number of reactants, and typical "
|
|
3292
|
+
"conditions for each reaction. Call this to find the "
|
|
3293
|
+
"right template before using apply_reaction.\n\n"
|
|
3294
|
+
"Categories:\n"
|
|
3295
|
+
" - 'coupling': Suzuki, Buchwald, SNAr, amide, "
|
|
3296
|
+
"Sonogashira, Heck, N-alkylation\n"
|
|
3297
|
+
" - 'functional_group': nitro reduction, ester hydrolysis, "
|
|
3298
|
+
"alcohol oxidation, reductive amination, Grignard\n"
|
|
3299
|
+
" - 'heterocycle_formation': ~60 ring-forming reactions "
|
|
3300
|
+
"including Huisgen triazole, Fischer indole, Paal-Knorr "
|
|
3301
|
+
"pyrrole, Hantzsch pyridine/thiazole, benzimidazole, "
|
|
3302
|
+
"benzoxazole, Pictet-Spengler, Biginelli, and many more\n\n"
|
|
3303
|
+
"Use the optional category filter to narrow results.\n"
|
|
3304
|
+
),
|
|
3305
|
+
"input_schema": {
|
|
3306
|
+
"type": "object",
|
|
3307
|
+
"properties": {
|
|
3308
|
+
"category": {
|
|
3309
|
+
"type": "string",
|
|
3310
|
+
"enum": [
|
|
3311
|
+
"coupling",
|
|
3312
|
+
"functional_group",
|
|
3313
|
+
"heterocycle_formation",
|
|
3314
|
+
],
|
|
3315
|
+
"description": (
|
|
3316
|
+
"Optional: filter by category. "
|
|
3317
|
+
"Omit to list all reactions."
|
|
3318
|
+
),
|
|
3319
|
+
},
|
|
3320
|
+
},
|
|
3321
|
+
"required": [],
|
|
3322
|
+
},
|
|
3323
|
+
},
|
|
3324
|
+
{
|
|
3325
|
+
"name": "apply_reaction",
|
|
3326
|
+
"description": (
|
|
3327
|
+
"Apply a named reaction template to a substrate molecule. "
|
|
3328
|
+
"For two-component reactions (e.g. Suzuki, Buchwald), "
|
|
3329
|
+
"provide both substrate and reagent SMILES. For single-"
|
|
3330
|
+
"component reactions (e.g. nitro reduction), only the "
|
|
3331
|
+
"substrate is needed.\n\n"
|
|
3332
|
+
"This tool covers ~70 reactions including:\n"
|
|
3333
|
+
" - Classic couplings (Suzuki, Buchwald, Heck, etc.)\n"
|
|
3334
|
+
" - Functional group transforms (reductions, oxidations)\n"
|
|
3335
|
+
" - Ring-forming heterocyclic reactions (Fischer indole, "
|
|
3336
|
+
"Huisgen triazole, Paal-Knorr pyrrole, Hantzsch thiazole, "
|
|
3337
|
+
"benzimidazole synthesis, Pictet-Spengler, etc.)\n\n"
|
|
3338
|
+
"Use list_reactions() to find the right template name.\n\n"
|
|
3339
|
+
"The substrate and reagent can be SMILES strings or "
|
|
3340
|
+
"chemical names/abbreviations (they will be resolved "
|
|
3341
|
+
"automatically).\n\n"
|
|
3342
|
+
"Returns the product SMILES, IUPAC name, and suggested "
|
|
3343
|
+
"reaction conditions.\n\n"
|
|
3344
|
+
"Examples:\n"
|
|
3345
|
+
' - apply_reaction("nitro_reduction", '
|
|
3346
|
+
'"c1ccc([N+](=O)[O-])cc1")\n'
|
|
3347
|
+
' - apply_reaction("suzuki_coupling", '
|
|
3348
|
+
'"c1ccc(Br)cc1", "c1ccc(B(O)O)cc1")\n'
|
|
3349
|
+
),
|
|
3350
|
+
"input_schema": {
|
|
3351
|
+
"type": "object",
|
|
3352
|
+
"properties": {
|
|
3353
|
+
"reaction_name": {
|
|
3354
|
+
"type": "string",
|
|
3355
|
+
"description": (
|
|
3356
|
+
"Reaction template name from list_reactions "
|
|
3357
|
+
"(e.g. 'suzuki_coupling', 'nitro_reduction')."
|
|
3358
|
+
),
|
|
3359
|
+
},
|
|
3360
|
+
"substrate": {
|
|
3361
|
+
"type": "string",
|
|
3362
|
+
"description": (
|
|
3363
|
+
"SMILES or name of the main substrate."
|
|
3364
|
+
),
|
|
3365
|
+
},
|
|
3366
|
+
"reagent": {
|
|
3367
|
+
"type": "string",
|
|
3368
|
+
"description": (
|
|
3369
|
+
"SMILES or name of the coupling partner "
|
|
3370
|
+
"(for 2-reactant reactions only)."
|
|
3371
|
+
),
|
|
3372
|
+
},
|
|
3373
|
+
},
|
|
3374
|
+
"required": ["reaction_name", "substrate"],
|
|
3375
|
+
},
|
|
3376
|
+
},
|
|
3377
|
+
{
|
|
3378
|
+
"name": "deprotect",
|
|
3379
|
+
"description": (
|
|
3380
|
+
"Remove common protecting groups from a molecule. "
|
|
3381
|
+
"Uses 25 built-in deprotection templates covering:\n"
|
|
3382
|
+
" Boc, Fmoc, Cbz (amines)\n"
|
|
3383
|
+
" TBS/TBDMS, THP, Bn, Ac, PMB, TMS (alcohols)\n"
|
|
3384
|
+
" Acetal/Ketal (carbonyls)\n\n"
|
|
3385
|
+
"Accepts SMILES or a chemical name. Returns the "
|
|
3386
|
+
"deprotected product and which PGs were removed.\n\n"
|
|
3387
|
+
"Example:\n"
|
|
3388
|
+
' deprotect("O=C(OC(C)(C)C)Nc1ccccc1") # Boc-aniline\n'
|
|
3389
|
+
' -> product: aniline, removed: [Boc]\n'
|
|
3390
|
+
),
|
|
3391
|
+
"input_schema": {
|
|
3392
|
+
"type": "object",
|
|
3393
|
+
"properties": {
|
|
3394
|
+
"smiles": {
|
|
3395
|
+
"type": "string",
|
|
3396
|
+
"description": (
|
|
3397
|
+
"SMILES or chemical name of the "
|
|
3398
|
+
"protected molecule."
|
|
3399
|
+
),
|
|
3400
|
+
},
|
|
3401
|
+
},
|
|
3402
|
+
"required": ["smiles"],
|
|
3403
|
+
},
|
|
3404
|
+
},
|
|
3405
|
+
# --- Reaction JSON summary ---
|
|
3406
|
+
{
|
|
3407
|
+
"name": "reaction_summary",
|
|
3408
|
+
"description": (
|
|
3409
|
+
"Load a reaction JSON file and return a slim summary "
|
|
3410
|
+
"with only the fields you need. Use this instead of "
|
|
3411
|
+
"reading the full JSON, which contains bulky geometry "
|
|
3412
|
+
"and mass data.\n\n"
|
|
3413
|
+
"Default fields (per species): id, name, role, "
|
|
3414
|
+
"role_detail, smiles, display_text, formula, mw.\n"
|
|
3415
|
+
"Default top-level: experiment, conditions.\n"
|
|
3416
|
+
"Default eln_data: product_yield, reaction_type.\n\n"
|
|
3417
|
+
"Request additional fields by name when needed:\n"
|
|
3418
|
+
" - LCMS: add species fields ['exact_mass', 'adducts']\n"
|
|
3419
|
+
" - Procedure: add species fields ['csv_mass', "
|
|
3420
|
+
"'csv_equiv', 'csv_volume'] and eln fields "
|
|
3421
|
+
"['procedure_plain', 'product_obtained', 'sm_mass']\n"
|
|
3422
|
+
" - Scheme drawing: defaults are sufficient\n"
|
|
3423
|
+
" - Pass ['*'] to any field list for all fields.\n\n"
|
|
3424
|
+
"Available species fields:\n"
|
|
3425
|
+
" id, name, role, role_detail, smiles, smiles_neutral, "
|
|
3426
|
+
"classification_method, is_sm, is_dp, is_substrate, "
|
|
3427
|
+
"is_solvent, exact_mass, exact_mass_full, mw, formula, "
|
|
3428
|
+
"adducts, source, source_id, csv_equiv, csv_mass, "
|
|
3429
|
+
"csv_name, csv_volume, csv_supplier, display_text, "
|
|
3430
|
+
"original_geometry\n\n"
|
|
3431
|
+
"Available top-level fields:\n"
|
|
3432
|
+
" version, experiment, input_files, reaction_smiles, "
|
|
3433
|
+
"reaction_class, reaction_name, "
|
|
3434
|
+
"classification_confidence, warnings, metadata, "
|
|
3435
|
+
"conditions\n\n"
|
|
3436
|
+
"Available eln_data fields:\n"
|
|
3437
|
+
" sm_mass, product_obtained, product_yield, "
|
|
3438
|
+
"procedure_text, procedure_plain, reaction_type, "
|
|
3439
|
+
"start_date, labbook_name, solvents, solvent_details\n"
|
|
3440
|
+
),
|
|
3441
|
+
"input_schema": {
|
|
3442
|
+
"type": "object",
|
|
3443
|
+
"properties": {
|
|
3444
|
+
"json_path": {
|
|
3445
|
+
"type": "string",
|
|
3446
|
+
"description": "Path to the reaction JSON file.",
|
|
3447
|
+
},
|
|
3448
|
+
"species_fields": {
|
|
3449
|
+
"type": "array",
|
|
3450
|
+
"items": {"type": "string"},
|
|
3451
|
+
"description": (
|
|
3452
|
+
"Per-species fields to include. Omit for "
|
|
3453
|
+
"defaults. Pass ['*'] for all fields."
|
|
3454
|
+
),
|
|
3455
|
+
},
|
|
3456
|
+
"top_fields": {
|
|
3457
|
+
"type": "array",
|
|
3458
|
+
"items": {"type": "string"},
|
|
3459
|
+
"description": (
|
|
3460
|
+
"Top-level fields to include. Omit for "
|
|
3461
|
+
"defaults. Pass ['*'] for all fields."
|
|
3462
|
+
),
|
|
3463
|
+
},
|
|
3464
|
+
"eln_fields": {
|
|
3465
|
+
"type": "array",
|
|
3466
|
+
"items": {"type": "string"},
|
|
3467
|
+
"description": (
|
|
3468
|
+
"eln_data sub-fields to include. Omit for "
|
|
3469
|
+
"defaults. Pass ['*'] for all. Pass [] to "
|
|
3470
|
+
"omit eln_data entirely."
|
|
3471
|
+
),
|
|
3472
|
+
},
|
|
3473
|
+
},
|
|
3474
|
+
"required": ["json_path"],
|
|
3475
|
+
},
|
|
3476
|
+
},
|
|
3477
|
+
# --- Single-molecule rendering ---
|
|
3478
|
+
{
|
|
3479
|
+
"name": "draw_molecule",
|
|
3480
|
+
"description": (
|
|
3481
|
+
"Render a single molecule structure to a standalone CDXML "
|
|
3482
|
+
"document (no arrow, no reaction scheme). The output opens "
|
|
3483
|
+
"directly in ChemDraw and uses ACS Document 1996 style.\n\n"
|
|
3484
|
+
"Input is a dict with at minimum a 'smiles' field. "
|
|
3485
|
+
"An optional label (compound name or custom text) is placed "
|
|
3486
|
+
"below the structure. Use the 'output_path' argument to "
|
|
3487
|
+
"write the CDXML to a file as well.\n\n"
|
|
3488
|
+
"Label priority: 'label' > 'name' > 'iupac_name'.\n\n"
|
|
3489
|
+
"Examples:\n"
|
|
3490
|
+
" draw_molecule({'smiles': 'CC(=O)Oc1ccccc1C(=O)O', "
|
|
3491
|
+
"'name': 'aspirin'})\n"
|
|
3492
|
+
" draw_molecule({'smiles': 'c1ccccc1'}, "
|
|
3493
|
+
"output_path='benzene.cdxml')\n"
|
|
3494
|
+
),
|
|
3495
|
+
"input_schema": {
|
|
3496
|
+
"type": "object",
|
|
3497
|
+
"properties": {
|
|
3498
|
+
"mol_json": {
|
|
3499
|
+
"type": "object",
|
|
3500
|
+
"description": (
|
|
3501
|
+
"Molecule dict. Required key: 'smiles'. "
|
|
3502
|
+
"Optional display keys: 'label', 'name', "
|
|
3503
|
+
"'iupac_name'."
|
|
3504
|
+
),
|
|
3505
|
+
"properties": {
|
|
3506
|
+
"smiles": {
|
|
3507
|
+
"type": "string",
|
|
3508
|
+
"description": "SMILES string of the molecule.",
|
|
3509
|
+
},
|
|
3510
|
+
"label": {
|
|
3511
|
+
"type": "string",
|
|
3512
|
+
"description": "Custom label shown below the structure.",
|
|
3513
|
+
},
|
|
3514
|
+
"name": {
|
|
3515
|
+
"type": "string",
|
|
3516
|
+
"description": "Compound name (used as label if 'label' not set).",
|
|
3517
|
+
},
|
|
3518
|
+
"iupac_name": {
|
|
3519
|
+
"type": "string",
|
|
3520
|
+
"description": "IUPAC name (used as label if 'name' not set).",
|
|
3521
|
+
},
|
|
3522
|
+
},
|
|
3523
|
+
"required": ["smiles"],
|
|
3524
|
+
},
|
|
3525
|
+
"output_path": {
|
|
3526
|
+
"type": "string",
|
|
3527
|
+
"description": (
|
|
3528
|
+
"Optional file path to write the CDXML to "
|
|
3529
|
+
"(e.g. 'molecule.cdxml'). The CDXML string is "
|
|
3530
|
+
"always returned in the response regardless."
|
|
3531
|
+
),
|
|
3532
|
+
},
|
|
3533
|
+
},
|
|
3534
|
+
"required": ["mol_json"],
|
|
3535
|
+
},
|
|
3536
|
+
},
|
|
3537
|
+
# --- Molecular editor ---
|
|
3538
|
+
{
|
|
3539
|
+
"name": "modify_molecule",
|
|
3540
|
+
"description": (
|
|
3541
|
+
"Modify a molecule and verify the change with a structural "
|
|
3542
|
+
"diff. This is the premier tool for editing chemical "
|
|
3543
|
+
"structures with verification — like drawing in ChemDraw "
|
|
3544
|
+
"and visually checking the result.\n\n"
|
|
3545
|
+
"Input is a mol_json dict (with at minimum a 'smiles' key, "
|
|
3546
|
+
"e.g. from resolve_compound). The tool applies the "
|
|
3547
|
+
"requested operation, validates the result with RDKit, "
|
|
3548
|
+
"and returns the output molecule with:\n\n"
|
|
3549
|
+
" - aligned_names: side-by-side IUPAC name comparison "
|
|
3550
|
+
"(so you can see what changed in words)\n"
|
|
3551
|
+
" - diff.atoms_changed: MCS-based fragment diff "
|
|
3552
|
+
"(so you can see what atoms were added/removed/replaced)\n"
|
|
3553
|
+
" - diff.delta_formula / diff.delta_mw: formula and MW "
|
|
3554
|
+
"change numbers for sanity-checking\n\n"
|
|
3555
|
+
"Six operation modes:\n\n"
|
|
3556
|
+
" 'analyze' — DOES NOT modify the molecule. Returns "
|
|
3557
|
+
"a rich description: functional groups present, alternative "
|
|
3558
|
+
"IUPAC names from different perspectives, canonical name, "
|
|
3559
|
+
"bracket tree (hierarchical name decomposition), substituent "
|
|
3560
|
+
"prefix form, formula, and MW. Call this FIRST when you "
|
|
3561
|
+
"need to understand a molecule before deciding what surgery "
|
|
3562
|
+
"to do.\n\n"
|
|
3563
|
+
" 'set_smiles' — LLM provides the new SMILES directly. "
|
|
3564
|
+
"Tool validates it and computes the diff. Use when you "
|
|
3565
|
+
"already know the exact SMILES.\n\n"
|
|
3566
|
+
" 'set_name' — LLM provides an IUPAC or common name for "
|
|
3567
|
+
"the desired product. Tool resolves to SMILES and computes "
|
|
3568
|
+
"the diff. Use when you know the target molecule by name.\n\n"
|
|
3569
|
+
" 'smarts' — apply a SMARTS reaction transform. "
|
|
3570
|
+
"Provide either a 'smarts' reaction SMARTS string "
|
|
3571
|
+
"(e.g. '[c:1][F]>>[c:1][Cl]') or a 'reaction_name' from "
|
|
3572
|
+
"list_reactions(). Good for specific bond transformations.\n\n"
|
|
3573
|
+
" 'reaction' — apply a named reaction template via "
|
|
3574
|
+
"apply_reaction(). Provide 'reaction_name' (required) and "
|
|
3575
|
+
"optionally 'reagent' dict (with 'smiles' key) for binary "
|
|
3576
|
+
"reactions. Returns the primary product with the standard "
|
|
3577
|
+
"diff fields; additional products go in 'alternative_products'."
|
|
3578
|
+
" Use list_reactions() to find available template names.\n\n"
|
|
3579
|
+
" 'name_surgery' — modify via IUPAC name manipulation. "
|
|
3580
|
+
"Requires ChemScript. Provide 'add' "
|
|
3581
|
+
"(list of {locant, prefix} dicts) and/or 'remove' "
|
|
3582
|
+
"(list of prefix strings). Best for simple substituent "
|
|
3583
|
+
"swaps on drug-like molecules.\n\n"
|
|
3584
|
+
"Examples:\n"
|
|
3585
|
+
" # CD3 → benzyl swap\n"
|
|
3586
|
+
" modify_molecule({'smiles': '...'}, 'smarts',\n"
|
|
3587
|
+
" smarts='[C:1]([2H])([2H])[2H]>>[C:1]Cc1ccccc1')\n\n"
|
|
3588
|
+
" # Add fluoro at C3\n"
|
|
3589
|
+
" modify_molecule({'smiles': 'Clc1ccncc1'}, 'name_surgery',\n"
|
|
3590
|
+
" add=[{'locant': '3', 'prefix': 'fluoro'}])\n\n"
|
|
3591
|
+
" # Set explicit SMILES\n"
|
|
3592
|
+
" modify_molecule({'smiles': 'Clc1ccncc1'}, 'set_smiles',\n"
|
|
3593
|
+
" new_smiles='Clc1cc(F)ncc1', "
|
|
3594
|
+
"description='fluoro at C3')\n"
|
|
3595
|
+
),
|
|
3596
|
+
"input_schema": {
|
|
3597
|
+
"type": "object",
|
|
3598
|
+
"properties": {
|
|
3599
|
+
"mol_json": {
|
|
3600
|
+
"type": "object",
|
|
3601
|
+
"description": (
|
|
3602
|
+
"Source molecule dict. Required key: 'smiles'. "
|
|
3603
|
+
"Optional: 'name', 'iupac_name' (used as "
|
|
3604
|
+
"starting point for name_surgery)."
|
|
3605
|
+
),
|
|
3606
|
+
"properties": {
|
|
3607
|
+
"smiles": {
|
|
3608
|
+
"type": "string",
|
|
3609
|
+
"description": "SMILES of the molecule to modify.",
|
|
3610
|
+
},
|
|
3611
|
+
"name": {
|
|
3612
|
+
"type": "string",
|
|
3613
|
+
"description": "Common name (optional).",
|
|
3614
|
+
},
|
|
3615
|
+
"iupac_name": {
|
|
3616
|
+
"type": "string",
|
|
3617
|
+
"description": (
|
|
3618
|
+
"IUPAC name (used as starting point for "
|
|
3619
|
+
"name_surgery if provided)."
|
|
3620
|
+
),
|
|
3621
|
+
},
|
|
3622
|
+
},
|
|
3623
|
+
"required": ["smiles"],
|
|
3624
|
+
},
|
|
3625
|
+
"operation": {
|
|
3626
|
+
"type": "string",
|
|
3627
|
+
"enum": ["analyze", "name_surgery", "smarts", "set_smiles", "set_name", "reaction"],
|
|
3628
|
+
"description": (
|
|
3629
|
+
"Operation to apply. Use 'analyze' to inspect a "
|
|
3630
|
+
"molecule without modifying it; use 'name_surgery', "
|
|
3631
|
+
"'smarts', 'set_smiles', 'set_name', or 'reaction' to edit it."
|
|
3632
|
+
),
|
|
3633
|
+
},
|
|
3634
|
+
"new_smiles": {
|
|
3635
|
+
"type": "string",
|
|
3636
|
+
"description": (
|
|
3637
|
+
"[set_smiles only] The new SMILES string. "
|
|
3638
|
+
"Will be validated with RDKit."
|
|
3639
|
+
),
|
|
3640
|
+
},
|
|
3641
|
+
"new_name": {
|
|
3642
|
+
"type": "string",
|
|
3643
|
+
"description": (
|
|
3644
|
+
"[set_name only] An IUPAC or common name for "
|
|
3645
|
+
"the desired product. Will be resolved to "
|
|
3646
|
+
"SMILES and validated."
|
|
3647
|
+
),
|
|
3648
|
+
},
|
|
3649
|
+
"description": {
|
|
3650
|
+
"type": "string",
|
|
3651
|
+
"description": (
|
|
3652
|
+
"[set_smiles/set_name] Optional description of "
|
|
3653
|
+
"the change (for logging/context)."
|
|
3654
|
+
),
|
|
3655
|
+
},
|
|
3656
|
+
"smarts": {
|
|
3657
|
+
"type": "string",
|
|
3658
|
+
"description": (
|
|
3659
|
+
"[smarts only] Reaction SMARTS string. "
|
|
3660
|
+
"Use atom-map numbers for bond-order-preserving "
|
|
3661
|
+
"transforms, e.g. '[c:1][F]>>[c:1][Cl]'."
|
|
3662
|
+
),
|
|
3663
|
+
},
|
|
3664
|
+
"reaction_name": {
|
|
3665
|
+
"type": "string",
|
|
3666
|
+
"description": (
|
|
3667
|
+
"[smarts, reaction] Named reaction from list_reactions(). "
|
|
3668
|
+
"For 'smarts': used as the SMARTS transform (alternative to "
|
|
3669
|
+
"providing 'smarts' directly). "
|
|
3670
|
+
"For 'reaction': required — selects the reaction template "
|
|
3671
|
+
"to apply via apply_reaction()."
|
|
3672
|
+
),
|
|
3673
|
+
},
|
|
3674
|
+
"reagent": {
|
|
3675
|
+
"type": "object",
|
|
3676
|
+
"description": (
|
|
3677
|
+
"[reaction only] The coupling partner for binary reactions "
|
|
3678
|
+
"(e.g. amide_coupling, suzuki_coupling). "
|
|
3679
|
+
"Must contain at minimum a 'smiles' key."
|
|
3680
|
+
),
|
|
3681
|
+
"properties": {
|
|
3682
|
+
"smiles": {
|
|
3683
|
+
"type": "string",
|
|
3684
|
+
"description": "SMILES of the reagent/coupling partner.",
|
|
3685
|
+
},
|
|
3686
|
+
},
|
|
3687
|
+
"required": ["smiles"],
|
|
3688
|
+
},
|
|
3689
|
+
"add": {
|
|
3690
|
+
"type": "array",
|
|
3691
|
+
"items": {
|
|
3692
|
+
"type": "object",
|
|
3693
|
+
"properties": {
|
|
3694
|
+
"locant": {
|
|
3695
|
+
"type": "string",
|
|
3696
|
+
"description": "Position number (e.g. '3').",
|
|
3697
|
+
},
|
|
3698
|
+
"prefix": {
|
|
3699
|
+
"type": "string",
|
|
3700
|
+
"description": "IUPAC prefix (e.g. 'fluoro', 'methyl').",
|
|
3701
|
+
},
|
|
3702
|
+
},
|
|
3703
|
+
"required": ["locant", "prefix"],
|
|
3704
|
+
},
|
|
3705
|
+
"description": (
|
|
3706
|
+
"[name_surgery only] Substituents to add. "
|
|
3707
|
+
"Each entry needs 'locant' and 'prefix'."
|
|
3708
|
+
),
|
|
3709
|
+
},
|
|
3710
|
+
"remove": {
|
|
3711
|
+
"type": "array",
|
|
3712
|
+
"items": {"type": "string"},
|
|
3713
|
+
"description": (
|
|
3714
|
+
"[name_surgery only] List of IUPAC prefix "
|
|
3715
|
+
"strings to remove (e.g. ['chloro', 'methyl'])."
|
|
3716
|
+
),
|
|
3717
|
+
},
|
|
3718
|
+
},
|
|
3719
|
+
"required": ["mol_json", "operation"],
|
|
3720
|
+
},
|
|
3721
|
+
},
|
|
3722
|
+
]
|