cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,3722 @@
1
+ """
2
+ LLM-assisted molecule construction via IUPAC name manipulation.
3
+
4
+ Provides composable tools designed for use with an LLM orchestrator.
5
+ The LLM translates natural language descriptions of molecules into tool calls
6
+ that manipulate IUPAC names, which are then validated and converted to
7
+ structures.
8
+
9
+ Architecture::
10
+
11
+ NL description --> LLM orchestrator --> tool calls --> IUPAC name --> CDXML
12
+
13
+ The key insight: IUPAC names are a lossless text representation of molecules.
14
+ Instead of having an LLM generate SMILES or manipulate CDXML directly, we let
15
+ the LLM do "name surgery" — assembling, modifying, and validating IUPAC names
16
+ using grounded tools. This avoids hallucinated SMILES while leveraging LLMs'
17
+ strength with natural language.
18
+
19
+ Layer 2 — Name manipulation tools:
20
+ resolve_to_smiles — Resolve any chemical identifier to SMILES
21
+ get_prefix_form — Get IUPAC substituent prefix for a group
22
+ assemble_name — Build IUPAC name from parent + substituents
23
+ modify_name — Add/swap/remove substituents in an existing name
24
+ validate_name — Check if an IUPAC name resolves to a valid molecule
25
+ name_to_structure — Convert validated name to CDXML
26
+ enumerate_names — List alternative IUPAC name forms for a molecule
27
+
28
+ Layer 3 — Graph manipulation tools (for structural transformations):
29
+ list_reactions — List available named reaction templates
30
+ apply_reaction — Apply a reaction template (Suzuki, Buchwald, etc.)
31
+ deprotect — Remove protecting groups (Boc, Fmoc, Cbz, etc.)
32
+
33
+ Meta:
34
+ get_tool_definitions — Export all tool schemas for LLM function calling
35
+
36
+ Usage (Python)::
37
+
38
+ from cdxml_toolkit.naming.mol_builder import (
39
+ get_prefix_form, assemble_name, validate_name, name_to_structure,
40
+ )
41
+
42
+ pf = get_prefix_form("CF3")
43
+ # {'prefix': 'trifluoromethyl', 'source': 'table', 'ok': True}
44
+
45
+ result = assemble_name("pyridine", [
46
+ {"locant": "2", "prefix": "chloro"},
47
+ {"locant": "3", "prefix": pf["prefix"]},
48
+ ])
49
+ # {'name': '2-chloro-3-trifluoromethylpyridine', 'valid': True,
50
+ # 'smiles': '...', 'ok': True}
51
+
52
+ cdxml = name_to_structure(result["name"])
53
+ # {'cdxml': '<?xml ...', 'ok': True}
54
+ """
55
+
56
+ import json
57
+ import logging
58
+ import os
59
+ import re
60
+ from typing import Any, Dict, List, Optional
61
+
62
+ logger = logging.getLogger(__name__)
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Lazy singletons — avoid import-time cost for heavy dependencies
66
+ # ---------------------------------------------------------------------------
67
+
68
+ _cs_instance = None
69
+ _cs_failed = False
70
+
71
+
72
def _get_cs():
    """Return the shared ChemScriptBridge, creating it on first use.

    Returns None when the bridge cannot be imported or constructed. A failure
    is remembered in ``_cs_failed`` so later calls return None immediately
    instead of retrying the import.
    """
    global _cs_instance, _cs_failed
    if not _cs_failed and _cs_instance is None:
        try:
            from cdxml_toolkit.chemdraw.chemscript_bridge import ChemScriptBridge
            _cs_instance = ChemScriptBridge()
        except Exception as exc:
            logger.debug("ChemScript unavailable: %s", exc)
            _cs_failed = True
    return None if _cs_failed else _cs_instance
87
+
88
+
89
def _rdkit_canonical(smiles: str) -> Optional[str]:
    """Parse *smiles* with RDKit and return its canonical SMILES.

    Returns None when RDKit cannot parse the input.
    """
    from rdkit import Chem

    parsed = Chem.MolFromSmiles(smiles)
    if not parsed:
        return None
    return Chem.MolToSmiles(parsed)
94
+
95
+
96
+ # ---------------------------------------------------------------------------
97
+ # Prefix lookup table — covers common med-chem substituents
98
+ # ---------------------------------------------------------------------------
99
+
100
+ # Maps group identifiers (abbreviations, names, formulae) to IUPAC prefix
101
+ # form suitable for direct insertion into a substituted name.
102
+ # The table is checked case-insensitively.
103
+ _PREFIX_TABLE: Dict[str, str] = {
104
+ # --- Halogens ---
105
+ "f": "fluoro", "cl": "chloro", "br": "bromo", "i": "iodo",
106
+ "fluorine": "fluoro", "chlorine": "chloro",
107
+ "bromine": "bromo", "iodine": "iodo",
108
+
109
+ # --- Oxygen ---
110
+ "oh": "hydroxy", "ome": "methoxy", "oet": "ethoxy",
111
+ "oac": "acetyloxy", "obn": "benzyloxy", "oph": "phenoxy",
112
+ "methoxy": "methoxy", "ethoxy": "ethoxy", "hydroxy": "hydroxy",
113
+ "ocf3": "trifluoromethoxy", "oipr": "isopropoxy",
114
+
115
+ # --- Nitrogen ---
116
+ "nh2": "amino", "nhme": "methylamino", "nme2": "dimethylamino",
117
+ "nhac": "acetamido", "no2": "nitro", "n3": "azido",
118
+ "amino": "amino", "nitro": "nitro", "azido": "azido",
119
+
120
+ # --- Simple carbon ---
121
+ "me": "methyl", "et": "ethyl", "pr": "propyl", "npr": "propyl",
122
+ "ipr": "propan-2-yl", "bu": "butyl", "nbu": "butyl",
123
+ "tbu": "tert-butyl", "sbu": "sec-butyl", "ibu": "isobutyl",
124
+ "methyl": "methyl", "ethyl": "ethyl",
125
+ "vinyl": "ethenyl", "allyl": "prop-2-en-1-yl",
126
+ "isopropyl": "propan-2-yl",
127
+
128
+ # --- Cycloalkyl ---
129
+ "cyclopropyl": "cyclopropyl", "cyclobutyl": "cyclobutyl",
130
+ "cyclopentyl": "cyclopentyl", "cyclohexyl": "cyclohexyl",
131
+ "cyclopropane": "cyclopropyl", "cyclobutane": "cyclobutyl",
132
+ "cyclopentane": "cyclopentyl", "cyclohexane": "cyclohexyl",
133
+
134
+ # --- Aryl ---
135
+ "ph": "phenyl", "bn": "benzyl", "bz": "benzoyl",
136
+ "phenyl": "phenyl", "benzyl": "benzyl",
137
+
138
+ # --- Functional groups (abbreviations) ---
139
+ "cn": "cyano", "cho": "formyl", "cooh": "carboxy",
140
+ "co2h": "carboxy", "-cooh": "carboxy", "-co2h": "carboxy",
141
+ "come": "acetyl", "ac": "acetyl",
142
+ "conh2": "carbamoyl", "-conh2": "carbamoyl",
143
+ "coome": "methoxycarbonyl", "co2me": "methoxycarbonyl",
144
+ "meo2c": "methoxycarbonyl", "meoco": "methoxycarbonyl",
145
+ "cooch3": "methoxycarbonyl", "-cooch3": "methoxycarbonyl",
146
+ "-coome": "methoxycarbonyl", "-co2me": "methoxycarbonyl",
147
+ "cooet": "ethoxycarbonyl", "co2et": "ethoxycarbonyl",
148
+ "eto2c": "ethoxycarbonyl", "etoco": "ethoxycarbonyl",
149
+ "cooipr": "isopropoxycarbonyl", "co2ipr": "isopropoxycarbonyl",
150
+ "cootbu": "tert-butoxycarbonyl", "co2tbu": "tert-butoxycarbonyl",
151
+ "-cho": "formyl",
152
+
153
+ # --- Functional group descriptors (natural language → prefix) ---
154
+ "methyl ester": "methoxycarbonyl",
155
+ "me ester": "methoxycarbonyl",
156
+ "ome ester": "methoxycarbonyl",
157
+ "ethyl ester": "ethoxycarbonyl",
158
+ "et ester": "ethoxycarbonyl",
159
+ "isopropyl ester": "isopropoxycarbonyl",
160
+ "tert-butyl ester": "tert-butoxycarbonyl",
161
+ "aldehyde": "formyl",
162
+ "ketone": "oxo",
163
+ "carboxylic acid": "carboxy",
164
+ "nitrile": "cyano",
165
+ "amide": "carbamoyl",
166
+ "primary amide": "carbamoyl",
167
+ "alcohol": "hydroxy",
168
+ "hydroxyl": "hydroxy",
169
+ "thiol": "sulfanyl",
170
+ "mercaptan": "sulfanyl",
171
+ "sulfonic acid": "sulfo",
172
+ "sulfonamide": "sulfamoyl",
173
+
174
+ # --- Fluorocarbons ---
175
+ "cf3": "trifluoromethyl", "chf2": "difluoromethyl",
176
+ "ccl3": "trichloromethyl",
177
+
178
+ # --- Sulphur ---
179
+ "sh": "sulfanyl", "sme": "methylsulfanyl",
180
+ "so2me": "methanesulfonyl", "ms": "methanesulfonyl",
181
+ "so2nh2": "sulfamoyl",
182
+
183
+ # --- Heterocycles as substituent prefix ---
184
+ "morpholine": "morpholino", "morpholinyl": "morpholino",
185
+ "morpholino": "morpholino",
186
+ "piperidine": "piperidin-1-yl", "piperidinyl": "piperidin-1-yl",
187
+ "piperazine": "piperazin-1-yl", "piperazinyl": "piperazin-1-yl",
188
+ "pyrrolidine": "pyrrolidin-1-yl", "pyrrolidinyl": "pyrrolidin-1-yl",
189
+ "pyridine": "pyridinyl", "pyridinyl": "pyridinyl",
190
+ "pyrimidine": "pyrimidinyl",
191
+ "thiophene": "thiophen-2-yl", "thienyl": "thiophen-2-yl",
192
+ "furan": "furan-2-yl", "furyl": "furan-2-yl",
193
+ "pyrrole": "pyrrol-1-yl",
194
+ "imidazole": "imidazolyl", "imidazolyl": "imidazolyl",
195
+ "thiazole": "thiazolyl", "thiazolyl": "thiazolyl",
196
+ "oxazole": "oxazolyl", "oxazolyl": "oxazolyl",
197
+ "indole": "indolyl",
198
+ }
199
+
200
+ # IUPAC multiplying prefixes for identical substituents
201
+ _MULTIPLIERS = {2: "di", 3: "tri", 4: "tetra", 5: "penta", 6: "hexa"}
202
+
203
+
204
+ # ---------------------------------------------------------------------------
205
+ # Internal helpers
206
+ # ---------------------------------------------------------------------------
207
+
208
+ def _resolve_query(query: str, use_network: bool = True) -> Optional[Dict]:
209
+ """Multi-tier resolution chain: reagent DB → formula → ChemScript → PubChem.
210
+
211
+ Returns {"smiles": ..., "source": ...} or None.
212
+ """
213
+ from rdkit import Chem
214
+
215
+ clean = query.strip()
216
+ if not clean:
217
+ return None
218
+
219
+ # Tier 1: Reagent DB
220
+ try:
221
+ from cdxml_toolkit.resolve.reagent_db import get_reagent_db
222
+ db = get_reagent_db()
223
+ entry = db.entry_for_name(clean.lower())
224
+ if entry:
225
+ smi = entry.get("smiles")
226
+ if isinstance(smi, list):
227
+ smi = smi[0]
228
+ if smi and Chem.MolFromSmiles(smi):
229
+ return {"smiles": _rdkit_canonical(smi), "source": "reagent_db"}
230
+ except Exception:
231
+ pass
232
+
233
+ # Tier 2: Condensed formula
234
+ try:
235
+ from cdxml_toolkit.resolve.condensed_formula import resolve_condensed_formula
236
+ smi = resolve_condensed_formula(clean)
237
+ if smi:
238
+ canon = _rdkit_canonical(smi)
239
+ if canon:
240
+ return {"smiles": canon, "source": "formula"}
241
+ except Exception:
242
+ pass
243
+
244
+ # Tier 3: ChemScript (name → SMILES)
245
+ cs = _get_cs()
246
+ if cs is not None:
247
+ try:
248
+ smi = cs.write_data(clean, "smiles", source_format="name")
249
+ if smi and Chem.MolFromSmiles(smi):
250
+ return {"smiles": _rdkit_canonical(smi), "source": "chemscript"}
251
+ except Exception:
252
+ pass
253
+
254
+ # Tier 3b: OPSIN (offline IUPAC name → SMILES, bundled JRE)
255
+ try:
256
+ from cdxml_toolkit.resolve.jre_manager import ensure_java_on_path
257
+ if ensure_java_on_path():
258
+ import warnings
259
+ from py2opsin import py2opsin as _py2opsin
260
+ with warnings.catch_warnings():
261
+ warnings.simplefilter("ignore", RuntimeWarning)
262
+ smi = _py2opsin(clean)
263
+ if smi and Chem.MolFromSmiles(smi):
264
+ return {"smiles": _rdkit_canonical(smi), "source": "opsin"}
265
+ except (ImportError, FileNotFoundError):
266
+ pass
267
+ except Exception:
268
+ pass
269
+
270
+ # Tier 4: PubChem (online)
271
+ if use_network:
272
+ try:
273
+ from cdxml_toolkit.resolve.cas_resolver import resolve_name_to_smiles
274
+ smi = resolve_name_to_smiles(clean)
275
+ if smi:
276
+ canon = _rdkit_canonical(smi)
277
+ if canon:
278
+ return {"smiles": canon, "source": "pubchem"}
279
+ except Exception:
280
+ pass
281
+
282
+ return None
283
+
284
+
285
def _name_to_smiles_cs(name: str) -> Optional[str]:
    """Convert a chemical name to canonical SMILES using ChemScript.

    Returns None when ChemScript is unavailable, returns nothing for the
    name, or raises while converting.
    """
    bridge = _get_cs()
    if bridge is None:
        return None
    try:
        raw = bridge.write_data(name, "smiles", source_format="name")
        return _rdkit_canonical(raw) if raw else None
    except Exception:
        return None
297
+
298
+
299
def _smiles_to_name_cs(smiles: str) -> Optional[str]:
    """Ask ChemScript for the name of the structure given as *smiles*.

    Returns None when ChemScript is unavailable or the call raises.
    """
    bridge = _get_cs()
    generated = None
    if bridge is not None:
        try:
            generated = bridge.get_name(smiles)
        except Exception:
            generated = None
    return generated
308
+
309
+
310
+ def _is_complex_prefix(prefix: str) -> bool:
311
+ """Check if a prefix needs parentheses when inserted into a name.
312
+
313
+ Complex prefixes contain hyphens, digits, or commas that would be
314
+ ambiguous without enclosing parentheses.
315
+ """
316
+ # Already parenthesised
317
+ if prefix.startswith("(") and prefix.endswith(")"):
318
+ return False
319
+ # Contains internal structure that needs brackets
320
+ return bool(re.search(r"[\d,]", prefix)) and "-" in prefix
321
+
322
+
323
+ def _locant_sort_key(loc: str):
324
+ """Sort locants: numeric before alphabetic, ascending."""
325
+ m = re.match(r"(\d+)(.*)", loc)
326
+ if m:
327
+ return (0, int(m.group(1)), m.group(2))
328
+ return (1, 0, loc)
329
+
330
+
331
+ def _prefix_alpha_key(prefix: str) -> str:
332
+ """IUPAC alphabetical sort key: ignore leading locants/multipliers.
333
+
334
+ ``"1,1-difluoroethyl"`` → ``"difluoroethyl"``
335
+ ``"tert-butyl"`` → ``"tert-butyl"``
336
+ """
337
+ stripped = re.sub(r"^[\d,]+-", "", prefix)
338
+ return stripped.lower()
339
+
340
+
341
def _try_validate(name: str, use_network: bool = True) -> Optional[str]:
    """Resolve *name* to canonical SMILES, or return None if it does not resolve.

    Two resolvers are tried in order: ChemScript first, then PubChem when
    *use_network* is true. A PubChem failure is logged at debug level and
    treated as "not resolved".

    Args:
        name: Candidate chemical name to check.
        use_network: Allow the PubChem fallback.

    Returns:
        Canonical SMILES, or None when neither resolver produced a structure.
    """
    # ChemScript first
    smi = _name_to_smiles_cs(name)
    if smi:
        return smi

    if not use_network:
        return None

    # PubChem fallback (for common names)
    try:
        from cdxml_toolkit.resolve.cas_resolver import resolve_name_to_smiles
        smi = resolve_name_to_smiles(name)
        return _rdkit_canonical(smi) if smi else None
    except Exception as exc:
        logger.debug("PubChem validation failed for %r: %s", name, exc)
        return None
364
+
365
+
366
+ # ---------------------------------------------------------------------------
367
+ # RDKit property helpers
368
+ # ---------------------------------------------------------------------------
369
+
370
+ def _rdkit_properties(smiles: str) -> Dict[str, Any]:
371
+ """Compute formula, MW, and exact mass from a SMILES via RDKit.
372
+
373
+ Returns a dict with keys ``formula``, ``mw``, ``exact_mass``.
374
+ Values are None if RDKit is unavailable or the molecule is invalid.
375
+ """
376
+ props: Dict[str, Any] = {"formula": None, "mw": None, "exact_mass": None}
377
+ try:
378
+ from rdkit import Chem
379
+ from rdkit.Chem import Descriptors, rdMolDescriptors
380
+ mol = Chem.MolFromSmiles(smiles)
381
+ if mol is None:
382
+ return props
383
+ props["formula"] = rdMolDescriptors.CalcMolFormula(mol)
384
+ props["mw"] = round(Descriptors.MolWt(mol), 4)
385
+ props["exact_mass"] = round(Descriptors.ExactMolWt(mol), 4)
386
+ except Exception:
387
+ pass
388
+ return props
389
+
390
+
391
+ # ---------------------------------------------------------------------------
392
+ # Tool 1: resolve_compound (rich resolver)
393
+ # ---------------------------------------------------------------------------
394
+
395
def resolve_compound(query: str, use_network: bool = True) -> Dict[str, Any]:
    """Resolve any chemical identifier to a rich molecule descriptor.

    Resolves *query* to SMILES with ``_resolve_query`` and then enriches the
    result with RDKit properties, a ChemScript-generated name, reagent
    database metadata, and the substituent prefix form.

    Args:
        query: Chemical identifier — common name, IUPAC name, abbreviation,
            condensed formula, or CAS number. Examples:
            ``"aspirin"``, ``"PhB(OH)2"``, ``"2-chloropyridine"``,
            ``"534-17-8"``, ``"deucravacitinib"``.
        use_network: Allow PubChem lookups. This is honoured by every step,
            including the prefix-form determination.

    Returns:
        Dict with keys:

        - ``ok`` (bool): True on success.
        - ``name`` (str): Input query echoed back.
        - ``smiles`` (str): Canonical SMILES.
        - ``formula`` (str | None): Molecular formula (e.g. ``"C9H8O4"``).
        - ``mw`` (float | None): Molecular weight.
        - ``exact_mass`` (float | None): Exact mass.
        - ``iupac_name`` (str | None): Name generated by ChemScript from the
          resolved SMILES; not generated for reagent database hits.
        - ``source`` (str): Which tier resolved the SMILES (``"reagent_db"``,
          ``"formula"``, ``"chemscript"``, ``"opsin"``, ``"pubchem"``).
        - ``role`` (str | None): ``role`` field of the matching reagent
          database entry, if any.
        - ``display_text`` (str | None): ``display`` field of the matching
          reagent database entry, falling back to ``iupac_name``.
        - ``prefix_form`` (str | None): IUPAC substituent prefix for use in
          ``assemble_name`` (e.g. ``"trifluoromethyl"`` for ``CF3``), or
          None if no prefix could be determined.

        On failure: ``ok=False`` with an ``error`` key.

    Example::

        >>> resolve_compound("Et3N")
        {'ok': True, 'name': 'Et3N', 'smiles': 'CCN(CC)CC',
         'formula': 'C6H15N', 'mw': 101.19, 'exact_mass': 101.12,
         'iupac_name': None, 'source': 'formula',
         'role': None, 'display_text': None, 'prefix_form': None}

        >>> resolve_compound("CF3")
        {'ok': True, ..., 'prefix_form': 'trifluoromethyl'}
    """
    # --- Step 1: resolve SMILES via the tiered resolver chain ---
    resolved = _resolve_query(query, use_network=use_network)
    if not resolved:
        return {"ok": False, "error": f"Could not resolve '{query}' to a structure."}

    smiles = resolved["smiles"]
    source = resolved["source"]

    # --- Step 2: compute molecular properties via RDKit ---
    props = _rdkit_properties(smiles)

    # --- Step 3: name via ChemScript, except for reagent database hits ---
    iupac_name: Optional[str] = None
    if source != "reagent_db":
        iupac_name = _smiles_to_name_cs(smiles)

    # --- Step 4: role and display_text from reagent_db ---
    role: Optional[str] = None
    display_text: Optional[str] = None
    try:
        from cdxml_toolkit.resolve.reagent_db import get_reagent_db
        db = get_reagent_db()
        # Look up by name first, using the same normalisation (strip + lower)
        # as the resolver, then fall back to the resolved SMILES.
        entry = db.entry_for_name(query.strip().lower())
        if entry is None:
            entry = db.entry_for_smiles(smiles)
        if entry is not None:
            role = entry.get("role")
            display_text = entry.get("display")
    except Exception as exc:
        logger.debug("Reagent DB metadata lookup failed for %r: %s", query, exc)

    # Fall back: display_text from IUPAC name if reagent_db had nothing
    if display_text is None and iupac_name:
        display_text = iupac_name

    # --- Step 5: IUPAC substituent prefix form ---
    # Try the raw query first, then the resolved SMILES as a fallback.
    prefix_form: Optional[str] = None
    for candidate in (query, smiles):
        pf_result = get_prefix_form(candidate, use_network=use_network)
        if pf_result.get("ok"):
            prefix_form = pf_result["prefix"]
            break

    return {
        "ok": True,
        "name": query,
        "smiles": smiles,
        "formula": props["formula"],
        "mw": props["mw"],
        "exact_mass": props["exact_mass"],
        "iupac_name": iupac_name,
        "source": source,
        "role": role,
        "display_text": display_text,
        "prefix_form": prefix_form,
    }


# ---------------------------------------------------------------------------
# Tool 1b (legacy thin wrapper): resolve_to_smiles
# ---------------------------------------------------------------------------

def resolve_to_smiles(query: str, use_network: bool = True) -> Dict[str, Any]:
    """Resolve a chemical identifier to its canonical SMILES string.

    Accepts common names, IUPAC names, abbreviations, condensed formulae,
    and CAS numbers. Calls the tiered resolver directly, so none of the
    enrichment work done by :func:`resolve_compound` (properties, naming,
    prefix probing) is performed.

    .. note::
        For richer output (formula, MW, exact mass, role, display text),
        use :func:`resolve_compound` instead.

    Args:
        query: Chemical identifier. Examples: ``"aspirin"``,
            ``"PhB(OH)2"``, ``"2-chloropyridine"``, ``"534-17-8"``.
        use_network: Allow PubChem lookup (requires internet).

    Returns:
        Dict with keys ``ok``, ``smiles``, ``source``.
        On failure: ``ok=False`` with an ``error`` message.

    Example::

        >>> resolve_to_smiles("Et3N")
        {'ok': True, 'smiles': 'CCN(CC)CC', 'source': 'formula'}
    """
    resolved = _resolve_query(query, use_network=use_network)
    if not resolved:
        return {"ok": False, "error": f"Could not resolve '{query}' to a structure."}
    return {"ok": True, "smiles": resolved["smiles"], "source": resolved["source"]}


# ---------------------------------------------------------------------------
# Tool 2: get_prefix_form
# ---------------------------------------------------------------------------

def get_prefix_form(group: str, use_network: bool = True) -> Dict[str, Any]:
    """Get the IUPAC substituent prefix form for a chemical group.

    Three strategies are tried in order:

    1. ``_PREFIX_TABLE`` lookup on the lower-cased input.
    2. Passthrough: if ``<group>benzene`` (with a ``1-`` locant added unless
       the input already starts with a digit) validates as a name, the input
       is treated as a prefix already.
    3. Probe: resolve the group to SMILES, attach a dummy atom to atom 0 and
       ask ``name_fragment_as_substituent`` to name the fragment.

    Args:
        group: Group identifier. Examples: ``"CF3"``, ``"morpholine"``,
            ``"NO2"``, ``"cyclopropyl"``, ``"OMe"``.
        use_network: Allow PubChem lookups in strategies 2 and 3. Defaults to
            True, which matches the behaviour before this parameter existed.

    Returns:
        Dict with keys ``ok``, ``prefix``, ``source``.
        ``source`` is ``"table"``, ``"passthrough"`` or ``"probe"``.
        On failure: ``ok=False`` with an ``error`` message.

    Examples::

        >>> get_prefix_form("CF3")
        {'ok': True, 'prefix': 'trifluoromethyl', 'source': 'table'}
        >>> get_prefix_form("morpholine")
        {'ok': True, 'prefix': 'morpholino', 'source': 'table'}
    """
    clean = group.strip()
    if not clean:
        return {"ok": False, "error": "Empty group."}

    # --- Table lookup (case-insensitive) ---
    key = clean.lower()
    if key in _PREFIX_TABLE:
        return {"ok": True, "prefix": _PREFIX_TABLE[key], "source": "table"}

    # --- Check if it's already a valid prefix ---
    # If prepending it to "benzene" gives a valid name, it's a prefix.
    test_name = f"{clean}benzene" if clean[0].isdigit() else f"1-{clean}benzene"
    if _try_validate(test_name, use_network=use_network):
        return {"ok": True, "prefix": clean, "source": "passthrough"}

    # --- Se-probe via name_fragment_as_substituent ---
    resolved = _resolve_query(clean, use_network=use_network)
    if resolved:
        try:
            from rdkit import Chem
            mol = Chem.MolFromSmiles(resolved["smiles"])
            if mol:
                # Attach a dummy atom ([*]) to atom 0 as the assumed bonding
                # position; for many simple groups this is correct.
                edit = Chem.RWMol(mol)
                dummy_idx = edit.AddAtom(Chem.Atom(0))
                edit.AddBond(0, dummy_idx, Chem.BondType.SINGLE)
                Chem.SanitizeMol(edit)
                frag_smi = Chem.MolToSmiles(edit.GetMol())
                from .name_decomposer import name_fragment_as_substituent
                prefix = name_fragment_as_substituent(frag_smi, verbose=False)
                if prefix:
                    return {"ok": True, "prefix": prefix, "source": "probe"}
        except Exception as exc:
            logger.debug("Prefix probe failed for %r: %s", clean, exc)

    return {
        "ok": False,
        "error": f"Could not determine prefix form for '{group}'.",
    }
627
+
628
+
629
+ # ---------------------------------------------------------------------------
630
+ # Tool 3: assemble_name
631
+ # ---------------------------------------------------------------------------
632
+
633
+ def assemble_name(parent: str,
634
+ substituents: List[Dict[str, str]],
635
+ validate: bool = True,
636
+ use_network: bool = True) -> Dict[str, Any]:
637
+ """Assemble an IUPAC name from a parent and substituent list.
638
+
639
+ Handles alphabetical ordering, multiplicative prefixes (di-, tri-),
640
+ and parenthesisation of complex substituents. Optionally validates
641
+ the assembled name by resolving it to SMILES.
642
+
643
+ Args:
644
+ parent: Parent ring or chain name (e.g. ``"pyridine"``,
645
+ ``"benzene"``, ``"pentane"``).
646
+ substituents: List of dicts, each with ``"locant"`` (str) and
647
+ ``"prefix"`` (str). Example::
648
+
649
+ [{"locant": "2", "prefix": "chloro"},
650
+ {"locant": "3", "prefix": "methyl"}]
651
+ validate: If True, resolve the assembled name and confirm validity.
652
+ use_network: Allow PubChem for validation.
653
+
654
+ Returns:
655
+ Dict with ``ok``, ``name``, and (if validated) ``valid``, ``smiles``.
656
+
657
+ Example::
658
+
659
+ >>> assemble_name("pyridine", [
660
+ ... {"locant": "2", "prefix": "chloro"},
661
+ ... {"locant": "5", "prefix": "nitro"},
662
+ ... ])
663
+ {'ok': True, 'name': '2-chloro-5-nitropyridine', 'valid': True,
664
+ 'smiles': '...'}
665
+ """
666
+ if not parent:
667
+ return {"ok": False, "error": "Parent name is required."}
668
+ if not substituents:
669
+ # Bare parent — still valid
670
+ if validate:
671
+ smi = _try_validate(parent, use_network=use_network)
672
+ if smi:
673
+ return {"ok": True, "name": parent, "valid": True, "smiles": smi}
674
+ return {"ok": True, "name": parent, "valid": False, "smiles": None}
675
+ return {"ok": True, "name": parent}
676
+
677
+ # --- Group identical prefixes for multipliers ---
678
+ from collections import defaultdict
679
+ groups: Dict[str, List[str]] = defaultdict(list)
680
+ for sub in substituents:
681
+ prefix = sub.get("prefix", "").strip()
682
+ locant = sub.get("locant", "").strip()
683
+ if prefix:
684
+ groups[prefix].append(locant)
685
+
686
+ # --- Build prefix fragments, sorted alphabetically by prefix ---
687
+ fragments = []
688
+ for prefix in sorted(groups.keys(), key=_prefix_alpha_key):
689
+ locants = sorted(groups[prefix], key=_locant_sort_key)
690
+ locant_str = ",".join(loc for loc in locants if loc)
691
+ n = len(locants)
692
+
693
+ # Format the prefix with optional multiplier
694
+ if n > 1 and prefix in _MULTIPLIERS:
695
+ mult = _MULTIPLIERS.get(n, str(n))
696
+ elif n > 1:
697
+ mult = _MULTIPLIERS.get(n, str(n))
698
+ else:
699
+ mult = ""
700
+
701
+ # Parenthesise complex prefixes
702
+ needs_parens = _is_complex_prefix(prefix)
703
+ pfx = f"({prefix})" if needs_parens else prefix
704
+
705
+ if mult:
706
+ part = f"{locant_str}-{mult}{pfx}" if locant_str else f"{mult}{pfx}"
707
+ else:
708
+ part = f"{locant_str}-{pfx}" if locant_str else pfx
709
+
710
+ fragments.append(part)
711
+
712
+ # --- Assemble final name ---
713
+ name = "-".join(fragments) + parent
714
+
715
+ result: Dict[str, Any] = {"ok": True, "name": name}
716
+
717
+ if validate:
718
+ smi = _try_validate(name, use_network=use_network)
719
+ result["valid"] = smi is not None
720
+ result["smiles"] = smi
721
+ if not smi:
722
+ # Try without parentheses as alternative
723
+ alt_frags = []
724
+ for prefix in sorted(groups.keys(), key=_prefix_alpha_key):
725
+ locants = sorted(groups[prefix], key=_locant_sort_key)
726
+ locant_str = ",".join(loc for loc in locants if loc)
727
+ n = len(locants)
728
+ mult = _MULTIPLIERS.get(n, "") if n > 1 else ""
729
+ part = f"{locant_str}-{mult}{prefix}" if locant_str else f"{mult}{prefix}"
730
+ alt_frags.append(part)
731
+ alt_name = "-".join(alt_frags) + parent
732
+ if alt_name != name:
733
+ alt_smi = _try_validate(alt_name, use_network=use_network)
734
+ if alt_smi:
735
+ result["name"] = alt_name
736
+ result["valid"] = True
737
+ result["smiles"] = alt_smi
738
+
739
+ return result
740
+
741
+
742
+ # ---------------------------------------------------------------------------
743
+ # Tool 4: modify_name
744
+ # ---------------------------------------------------------------------------
745
+
746
def modify_name(name: str,
                operation: str,
                target: Optional[str] = None,
                replacement: Optional[str] = None,
                locant: Optional[str] = None,
                validate: bool = True,
                use_network: bool = True) -> Dict[str, Any]:
    """Edit the substituent prefixes of an IUPAC name.

    This is a thin dispatcher: the actual work is delegated to
    ``_modify_swap``, ``_modify_add`` or ``_modify_remove`` depending on
    *operation*.

    Supported operations:

    - ``"swap"``: exchange the *target* prefix for *replacement*,
      e.g. "nitro" -> "amino" turns "4-nitropyridine" into
      "4-aminopyridine".
    - ``"add"``: attach *replacement* at position *locant*,
      e.g. adding "methyl" at "3" to "2-chloropyridine" gives
      "2-chloro-3-methylpyridine".
    - ``"remove"``: drop the *target* prefix,
      e.g. removing "chloro" from "2-chloro-3-methylpyridine" gives
      "3-methylpyridine".

    Args:
        name: The IUPAC name to edit.
        operation: One of ``"swap"``, ``"add"``, ``"remove"``.
        target: Prefix to be replaced (swap) or deleted (remove).
        replacement: Incoming prefix (swap) or prefix to attach (add).
        locant: Attachment position; only used by ``"add"``.
        validate: Forwarded to the helper; asks it to resolve the new name.
        use_network: Forwarded to the helper; permits PubChem lookups.

    Returns:
        The helper's result dict, or ``{"ok": False, "error": ...}`` when
        *operation* is not one of the three supported values.

    Examples::

        >>> modify_name("4-nitropyridine", "swap",
        ...             target="nitro", replacement="amino")
        {'ok': True, 'name': '4-aminopyridine', ...}

        >>> modify_name("2-chloropyridine", "add",
        ...             replacement="methyl", locant="3")
        {'ok': True, 'name': '2-chloro-3-methylpyridine', ...}
    """
    # Reject anything we cannot dispatch before touching the helpers.
    if operation not in ("swap", "add", "remove"):
        return {"ok": False, "error": f"Unknown operation '{operation}'. "
                                      "Use 'swap', 'add', or 'remove'."}

    if operation == "add":
        return _modify_add(name, replacement, locant, validate, use_network)
    if operation == "remove":
        return _modify_remove(name, target, validate, use_network)
    return _modify_swap(name, target, replacement, validate, use_network)
799
+
800
+
801
+ def _parse_name_components(name: str) -> Optional[Dict]:
802
+ """Best-effort parse of a substituted IUPAC name into components.
803
+
804
+ Splits a name like ``"2-chloro-5-(trifluoromethyl)pyridine"`` into::
805
+
806
+ {"parent": "pyridine",
807
+ "substituents": [{"locant": "2", "prefix": "chloro"},
808
+ {"locant": "5", "prefix": "trifluoromethyl"}]}
809
+
810
+ Uses the aligned namer's ring system list for parent detection.
811
+ """
812
+ try:
813
+ from .aligned_namer import _KNOWN_RINGS
814
+ rings = _KNOWN_RINGS
815
+ except ImportError:
816
+ rings = set()
817
+
818
+ # Also try common chain parents
819
+ chains = [
820
+ "icosane", "nonadecane", "octadecane", "heptadecane", "hexadecane",
821
+ "pentadecane", "tetradecane", "tridecane", "dodecane", "undecane",
822
+ "decane", "nonane", "octane", "heptane", "hexane", "pentane",
823
+ "butane", "propane", "ethane", "methane",
824
+ "icosanoic acid", "nonadecanoic acid", "octadecanoic acid",
825
+ ]
826
+ all_parents = sorted(
827
+ list(rings) + chains, key=len, reverse=True
828
+ )
829
+
830
+ # Find the parent: longest known name that matches the tail
831
+ parent = None
832
+ prefix_part = ""
833
+ for p in all_parents:
834
+ if name.endswith(p):
835
+ prefix_part = name[:-len(p)]
836
+ parent = p
837
+ break
838
+
839
+ # Fallback: if no known parent, try splitting at the last segment
840
+ # that doesn't start with a digit
841
+ if parent is None:
842
+ # Try to identify parent as the last non-prefixed segment
843
+ # Pattern: everything after the last "-" that isn't a locant-prefix pair
844
+ parts = name.rsplit("-", 1)
845
+ if len(parts) == 2 and not re.match(r"^\d", parts[1]):
846
+ parent = parts[1]
847
+ prefix_part = parts[0] + "-"
848
+ else:
849
+ parent = name
850
+ prefix_part = ""
851
+
852
+ if not prefix_part.strip("-"):
853
+ return {"parent": parent, "substituents": []}
854
+
855
+ # Parse prefix_part into (locant, prefix) pairs
856
+ prefix_str = prefix_part.rstrip("-")
857
+ substituents = []
858
+
859
+ # Pattern: locant(s)-[multiplier][(]prefix[)] or locant(s)-[multiplier]prefix
860
+ # Walk through segments
861
+ segments = _split_prefix_segments(prefix_str)
862
+ for seg in segments:
863
+ parsed = _parse_single_prefix(seg)
864
+ if parsed:
865
+ substituents.extend(parsed)
866
+
867
+ return {"parent": parent, "substituents": substituents}
868
+
869
+
870
+ def _split_prefix_segments(prefix_str: str) -> List[str]:
871
+ """Split a prefix string into individual prefix segments.
872
+
873
+ Handles parenthesised prefixes correctly:
874
+ ``"2-chloro-3-(trifluoromethyl)"`` → ``["2-chloro", "3-(trifluoromethyl)"]``
875
+ """
876
+ segments = []
877
+ current = ""
878
+ depth = 0
879
+ for ch in prefix_str:
880
+ if ch == "(":
881
+ depth += 1
882
+ current += ch
883
+ elif ch == ")":
884
+ depth -= 1
885
+ current += ch
886
+ elif ch == "-" and depth == 0:
887
+ if current:
888
+ # Check: is this a separator between segments, or within one?
889
+ # A segment boundary is after a prefix (lowercase letter or ')').
890
+ # Within a segment: after a locant (digit) or multiplier.
891
+ if current and (current[-1].isalpha() and current[-1].islower()
892
+ or current[-1] == ")"):
893
+ segments.append(current)
894
+ current = ""
895
+ else:
896
+ current += ch
897
+ else:
898
+ current += ch
899
+ else:
900
+ current += ch
901
+ if current:
902
+ segments.append(current)
903
+ return segments
904
+
905
+
906
+ def _parse_single_prefix(segment: str) -> Optional[List[Dict[str, str]]]:
907
+ """Parse a single prefix segment like '2-chloro' or '2,4-dichloro'.
908
+
909
+ Returns list of {"locant": ..., "prefix": ...} dicts.
910
+ """
911
+ # Handle multiplied: 2,4-dichloro
912
+ m = re.match(
913
+ r"^([\d,]+)-(?:di|tri|tetra|penta|hexa)"
914
+ r"[\(\[]?([a-zA-Z][\w,\-]*?)[\)\]]?$",
915
+ segment,
916
+ )
917
+ if m:
918
+ locants = m.group(1).split(",")
919
+ prefix = m.group(2)
920
+ return [{"locant": loc, "prefix": prefix} for loc in locants]
921
+
922
+ # Handle parenthesised: 3-(trifluoromethyl)
923
+ m = re.match(r"^(\d+\w?)-\((.+)\)$", segment)
924
+ if m:
925
+ return [{"locant": m.group(1), "prefix": m.group(2)}]
926
+
927
+ # Handle simple: 2-chloro
928
+ m = re.match(r"^(\d+\w?)-([a-zA-Z][\w\-]*)$", segment)
929
+ if m:
930
+ return [{"locant": m.group(1), "prefix": m.group(2)}]
931
+
932
+ # No locant: just a prefix (e.g., "amino" without locant)
933
+ if re.match(r"^[a-zA-Z]", segment):
934
+ return [{"locant": "", "prefix": segment}]
935
+
936
+ return None
937
+
938
+
939
+ def _modify_swap(name, target, replacement, validate, use_network):
940
+ """Swap one prefix for another and re-assemble."""
941
+ if not target or not replacement:
942
+ return {"ok": False, "error": "Both 'target' and 'replacement' required for swap."}
943
+
944
+ parsed = _parse_name_components(name)
945
+ if parsed is None:
946
+ return {"ok": False, "error": f"Could not parse name '{name}'."}
947
+
948
+ subs = parsed["substituents"]
949
+ found = False
950
+ for sub in subs:
951
+ if sub["prefix"] == target:
952
+ sub["prefix"] = replacement
953
+ found = True
954
+ if not found:
955
+ return {
956
+ "ok": False,
957
+ "error": f"Prefix '{target}' not found in '{name}'.",
958
+ "found_prefixes": [s["prefix"] for s in subs],
959
+ }
960
+
961
+ return assemble_name(parsed["parent"], subs, validate=validate,
962
+ use_network=use_network)
963
+
964
+
965
+ def _modify_add(name, prefix, locant, validate, use_network):
966
+ """Add a new substituent to an existing name."""
967
+ if not prefix:
968
+ return {"ok": False, "error": "'replacement' (prefix to add) is required."}
969
+ if not locant:
970
+ return {"ok": False, "error": "'locant' is required for add operation."}
971
+
972
+ parsed = _parse_name_components(name)
973
+ if parsed is None:
974
+ return {"ok": False, "error": f"Could not parse name '{name}'."}
975
+
976
+ parsed["substituents"].append({"locant": locant, "prefix": prefix})
977
+ return assemble_name(parsed["parent"], parsed["substituents"],
978
+ validate=validate, use_network=use_network)
979
+
980
+
981
+ def _modify_remove(name, target, validate, use_network):
982
+ """Remove a substituent from a name."""
983
+ if not target:
984
+ return {"ok": False, "error": "'target' prefix is required for remove."}
985
+
986
+ parsed = _parse_name_components(name)
987
+ if parsed is None:
988
+ return {"ok": False, "error": f"Could not parse name '{name}'."}
989
+
990
+ original_len = len(parsed["substituents"])
991
+ parsed["substituents"] = [
992
+ s for s in parsed["substituents"] if s["prefix"] != target
993
+ ]
994
+ if len(parsed["substituents"]) == original_len:
995
+ return {
996
+ "ok": False,
997
+ "error": f"Prefix '{target}' not found in '{name}'.",
998
+ "found_prefixes": [s["prefix"] for s in parsed["substituents"]],
999
+ }
1000
+
1001
+ return assemble_name(parsed["parent"], parsed["substituents"],
1002
+ validate=validate, use_network=use_network)
1003
+
1004
+
1005
+ # ---------------------------------------------------------------------------
1006
+ # Tool 5: validate_name
1007
+ # ---------------------------------------------------------------------------
1008
+
1009
def validate_name(name: str,
                  use_network: bool = True) -> Dict[str, Any]:
    """Check whether *name* resolves to a structure.

    Resolution is delegated to ``_try_validate``.  When it yields a
    SMILES, ``_smiles_to_name_cs`` is additionally asked for a canonical
    name for that SMILES.

    Args:
        name: IUPAC name to check.
        use_network: Forwarded to ``_try_validate``; permits PubChem lookup.

    Returns:
        Dict with ``ok``, ``valid``, ``smiles`` and ``name``; valid
        results also carry ``canonical_name``.

    Example::

        >>> validate_name("2-chloropyridine")
        {'ok': True, 'valid': True, 'smiles': 'Clc1ccccn1', 'name': '2-chloropyridine'}
    """
    smiles = _try_validate(name, use_network=use_network)
    if not smiles:
        return {"ok": True, "valid": False, "smiles": None, "name": name}

    return {
        "ok": True,
        "valid": True,
        "smiles": smiles,
        "name": name,
        "canonical_name": _smiles_to_name_cs(smiles),
    }
1041
+
1042
+
1043
+ # ---------------------------------------------------------------------------
1044
+ # Tool 6: name_to_structure
1045
+ # ---------------------------------------------------------------------------
1046
+
1047
def name_to_structure(name: str,
                      output_format: str = "cdxml") -> Dict[str, Any]:
    """Convert a chemical name to a structure in the requested format.

    The format is checked first, so an unsupported value is reported
    immediately without any ChemScript or resolver call.  For ``"cdxml"``
    and ``"mol"`` ChemScript is tried on the name directly; if that is
    unavailable or raises, the name is resolved to SMILES and converted
    from there (ChemScript again for CDXML, RDKit for MOL).

    Args:
        name: IUPAC or common name.
        output_format: ``"cdxml"`` (default), ``"smiles"``, or ``"mol"``;
            matched case-insensitively.

    Returns:
        On success, ``{"ok": True, <format>: <data>}``.  On failure,
        ``{"ok": False, "error": ...}``; the CDXML-without-ChemScript
        failure also includes the resolved ``smiles``.

    Example::

        >>> result = name_to_structure("2-chloropyridine")
        >>> result["ok"]
        True
        >>> result["cdxml"][:20]
        '<?xml version="1.0"'
    """
    fmt = output_format.lower()

    # Reject unsupported formats before doing any (possibly networked)
    # resolution work; otherwise a bad format could surface as a
    # misleading "Could not resolve" error.
    if fmt not in ("cdxml", "smiles", "mol"):
        return {"ok": False, "error": f"Unknown format '{fmt}'. Use cdxml, smiles, or mol."}

    if fmt == "smiles":
        smi = _try_validate(name)
        if smi:
            return {"ok": True, "smiles": smi}
        return {"ok": False, "error": f"Could not resolve '{name}'."}

    # For CDXML and MOL, prefer ChemScript (gives ACS-styled 2D)
    cs = _get_cs()
    if cs is not None:
        try:
            if fmt == "cdxml":
                cdxml = cs.name_to_cdxml(name)
                return {"ok": True, "cdxml": cdxml}
            mol_data = cs.write_data(name, "mol", source_format="name")
            return {"ok": True, "mol": mol_data}
        except Exception:
            # Deliberate best effort: any ChemScript failure falls
            # through to the SMILES-based path below.
            pass

    # Fallback: resolve to SMILES, then generate the structure from it
    smi = _try_validate(name)
    if not smi:
        return {"ok": False, "error": f"Could not resolve '{name}'."}

    if fmt == "cdxml":
        # Try ChemScript with SMILES input
        if cs is not None:
            try:
                cdxml = cs.smiles_to_cdxml(smi)
                return {"ok": True, "cdxml": cdxml}
            except Exception:
                # Best effort again; report the missing capability below.
                pass
        return {"ok": False,
                "error": "CDXML output requires ChemScript. "
                         f"Name resolved to SMILES: {smi}",
                "smiles": smi}

    # Only "mol" can reach this point.
    from rdkit import Chem
    from rdkit.Chem import AllChem
    mol = Chem.MolFromSmiles(smi)
    if mol:
        AllChem.Compute2DCoords(mol)
        return {"ok": True, "mol": Chem.MolToMolBlock(mol)}
    return {"ok": False, "error": "RDKit could not generate MOL block."}
1118
+
1119
+
1120
+ # ---------------------------------------------------------------------------
1121
+ # Tool 7: enumerate_names
1122
+ # ---------------------------------------------------------------------------
1123
+
1124
def enumerate_names(identifier: str,
                    use_network: bool = True) -> Dict[str, Any]:
    """List the canonical name and valid alternative name forms of a molecule.

    *identifier* is first tried as SMILES via RDKit; if that fails it is
    resolved through ``_resolve_query``.  The resulting SMILES is passed
    to ``name_decomposer.decompose_name``, and each valid, not yet seen
    alternative is reported together with the prefixes that
    ``_parse_name_components`` finds in it.  Exposing groups as prefixes
    is what makes them editable via ``modify_name``; for instance
    ``"1-(4-bromophenyl)ethan-1-one"`` may also appear as
    ``"1-acetyl-4-bromobenzene"``.

    Args:
        identifier: Chemical name, SMILES, abbreviation, or any
            identifier accepted by the resolver.
        use_network: Forwarded to ``_resolve_query``; permits PubChem.

    Returns:
        On success a dict with:

        - ``ok``: ``True``
        - ``canonical_name``: canonical name reported by the decomposer
        - ``smiles``: the SMILES that was decomposed
        - ``names``: list of dicts with ``name``, ``valid``,
          ``strategy`` and ``prefixes``; the canonical entry comes first.

        On failure ``{"ok": False, "error": ...}``, plus ``smiles`` once
        a SMILES has been obtained.

    Example::

        >>> result = enumerate_names("1-(4-bromophenyl)ethan-1-one")
        >>> for n in result["names"]:
        ...     print(n["name"], n["prefixes"])
        1-(4-bromophenyl)ethan-1-one ['(4-bromophenyl)']
        1-acetyl-4-bromobenzene ['acetyl', 'bromo']
        ...
    """
    from rdkit import Chem as _Chem

    def _visible_prefixes(candidate):
        # Prefixes exposed by one name form, in parse order.
        parsed = _parse_name_components(candidate)
        if not parsed:
            return []
        return [sub["prefix"] for sub in parsed["substituents"]]

    # A parseable SMILES wins; otherwise fall back to name resolution.
    probe = _Chem.MolFromSmiles(identifier)
    if probe is None:
        resolved = _resolve_query(identifier, use_network=use_network)
        if not resolved:
            return {"ok": False,
                    "error": f"Could not resolve '{identifier}' to a structure."}
        smiles = resolved["smiles"]
    else:
        smiles = _Chem.MolToSmiles(probe)

    # Run decomposition
    try:
        from .name_decomposer import decompose_name
        result = decompose_name(smiles, verbose=False, timeout=30.0)
    except Exception as exc:
        return {"ok": False,
                "error": f"Decomposition failed: {exc}",
                "smiles": smiles}

    if result.errors:
        return {"ok": False,
                "error": "; ".join(result.errors),
                "smiles": smiles}

    canon = result.canonical_name
    if not canon:
        return {"ok": False,
                "error": "Could not determine canonical name.",
                "smiles": smiles}

    # Canonical form always leads the list.
    names = [{
        "name": canon,
        "valid": True,
        "strategy": "canonical",
        "prefixes": _visible_prefixes(canon),
    }]

    seen = {canon}
    for alt in result.alternatives:
        if not alt.valid or alt.name in seen:
            continue
        seen.add(alt.name)
        names.append({
            "name": alt.name,
            "valid": True,
            "strategy": alt.strategy,
            "prefixes": _visible_prefixes(alt.name),
        })

    return {
        "ok": True,
        "canonical_name": canon,
        "smiles": smiles,
        "names": names,
    }
1234
+
1235
+
1236
+ # ---------------------------------------------------------------------------
1237
+ # Layer 3: Graph manipulation — reaction templates
1238
+ # ---------------------------------------------------------------------------
1239
+
1240
# Hand-curated templates for common med-chem transformations that an LLM
# will recognise by name.  These supplement the larger collection loaded
# from reactions_datamol.json.
#
# Each entry maps a template name to a dict with these keys:
#   description     - one-line human-readable summary
#   smarts          - reaction SMARTS ("reactants>>products"); reactants
#                     are separated by "."
#   n_reactants     - 1 or 2
#   substrate_hint  - short description of the substrate
#   reagent_hint    - short description of the second reactant; None for
#                     every single-reactant template
#   conditions      - list of reagent / solvent / temperature strings
#   category        - "coupling", "functional_group",
#                     "heterocycle_formation", "deprotection" or
#                     "protection"
_CLASSIC_TEMPLATES: Dict[str, Dict[str, Any]] = {
    "suzuki_coupling": {
        "description": "Suzuki coupling: aryl halide + boronic acid to biaryl",
        "smarts": "[c:1][Br,I].[#6:2][B]([OH])[OH]>>[c:1]-[#6:2]",
        "n_reactants": 2,
        "substrate_hint": "aryl bromide or iodide",
        "reagent_hint": "boronic acid",
        "conditions": ["Pd(dppf)Cl2", "K2CO3", "dioxane/H2O", "80 °C"],
        "category": "coupling",
    },
    "buchwald_amination": {
        "description":
            "Buchwald-Hartwig amination: aryl halide + amine to aryl amine",
        "smarts": "[c:1][Cl,Br,I].[NX3;H2,H1:2]>>[c:1]-[N:2]",
        "n_reactants": 2,
        "substrate_hint": "aryl halide",
        "reagent_hint": "primary or secondary amine",
        "conditions": ["Pd2(dba)3", "XPhos", "Cs2CO3", "toluene", "100 °C"],
        "category": "coupling",
    },
    "snar": {
        "description":
            "Nucleophilic aromatic substitution: activated aryl halide + "
            "nucleophile",
        "smarts": "[c:1][F,Cl].[NX3;H2,H1:2]>>[c:1]-[N:2]",
        "n_reactants": 2,
        "substrate_hint": "electron-poor aryl fluoride or chloride",
        "reagent_hint": "amine nucleophile",
        "conditions": ["DIPEA", "DMSO or NMP", "80-120 °C"],
        "category": "coupling",
    },
    "amide_coupling": {
        "description": "Amide bond formation: carboxylic acid + amine",
        "smarts":
            "[C:1](=[O:2])[OH].[NX3;H2,H1:3]>>[C:1](=[O:2])-[N:3]",
        "n_reactants": 2,
        "substrate_hint": "carboxylic acid",
        "reagent_hint": "primary or secondary amine",
        "conditions": ["HATU", "DIPEA", "DMF", "rt"],
        "category": "coupling",
    },
    "reductive_amination": {
        "description":
            "Reductive amination: aldehyde or ketone + amine to amine",
        "smarts": "[C:1](=[O:2]).[NX3;H2,H1:3]>>[C:1]-[N:3]",
        "n_reactants": 2,
        "substrate_hint": "aldehyde or ketone",
        "reagent_hint": "primary or secondary amine",
        "conditions": ["NaBH(OAc)3", "AcOH", "DCE", "rt"],
        "category": "functional_group",
    },
    "nitro_reduction": {
        "description": "Nitro group reduction to amine (ArNO2 to ArNH2)",
        "smarts": "[c:1][N+](=[O])[O-]>>[c:1]N",
        "n_reactants": 1,
        "substrate_hint": "aromatic nitro compound",
        "reagent_hint": None,
        "conditions": ["SnCl2·2H2O", "EtOH", "80 °C"],
        "category": "functional_group",
    },
    "ester_hydrolysis": {
        "description": "Ester hydrolysis to carboxylic acid",
        "smarts":
            "[C:1](=[O:2])[O:3][C:4]>>[C:1](=[O:2])[OH]",
        "n_reactants": 1,
        "substrate_hint": "ester",
        "reagent_hint": None,
        "conditions": ["LiOH", "THF/H2O", "rt"],
        "category": "functional_group",
    },
    "n_alkylation": {
        "description": "N-Alkylation: amine + alkyl halide",
        "smarts":
            "[NX3;H2,H1:1].[C:2][Cl,Br,I]>>[N:1]-[C:2]",
        "n_reactants": 2,
        "substrate_hint": "amine",
        "reagent_hint": "alkyl halide",
        "conditions": ["K2CO3", "DMF", "60 °C"],
        "category": "coupling",
    },
    "sonogashira_coupling": {
        "description":
            "Sonogashira coupling: aryl halide + terminal alkyne",
        "smarts":
            "[c:1][Br,I].[CH:2]#[C:3]>>[c:1]-[C:2]#[C:3]",
        "n_reactants": 2,
        "substrate_hint": "aryl bromide or iodide",
        "reagent_hint": "terminal alkyne",
        "conditions": [
            "PdCl2(PPh3)2", "CuI", "Et3N", "THF", "rt",
        ],
        "category": "coupling",
    },
    "heck_reaction": {
        "description":
            "Heck reaction: aryl halide + alkene to substituted alkene",
        "smarts":
            "[c:1][Br,I].[CH:2]=[CH2:3]>>[c:1]/[CH:2]=[CH2:3]",
        "n_reactants": 2,
        "substrate_hint": "aryl halide",
        "reagent_hint": "terminal alkene",
        "conditions": [
            "Pd(OAc)2", "P(o-tol)3", "Et3N", "DMF", "100 °C",
        ],
        "category": "coupling",
    },
    "alcohol_oxidation": {
        "description": "Alcohol oxidation to aldehyde or ketone",
        "smarts":
            "[C:1][OH:2]>>[C:1]=[O:2]",
        "n_reactants": 1,
        "substrate_hint": "primary or secondary alcohol",
        "reagent_hint": None,
        "conditions": ["Dess-Martin periodinane", "DCM", "rt"],
        "category": "functional_group",
    },
    "grignard_addition": {
        "description":
            "Grignard / organometallic addition to aldehyde or ketone",
        "smarts":
            "[C:1](=[O:2])[#6:4].[#6:3][Mg]>>[C:1]([OH:2])([#6:4])-[#6:3]",
        "n_reactants": 2,
        "substrate_hint": "aldehyde or ketone",
        "reagent_hint": "Grignard reagent (RMgBr SMILES)",
        "conditions": ["THF", "-78 °C to rt"],
        "category": "functional_group",
    },
    # --- Hartenfeller-Schneider extras (not in datamol) ---
    "amide_hydrolysis": {
        "description":
            "Amide hydrolysis to carboxylic acid (RCONHR' \u2192 RCOOH)",
        "smarts": "[C:1](=[O:2])[NX3:3]>>[C:1](=[O:2])[OH]",
        "n_reactants": 1,
        "substrate_hint": "amide (primary, secondary, or tertiary)",
        "reagent_hint": None,
        "conditions": ["6M HCl or 2M NaOH", "reflux"],
        "category": "functional_group",
    },
    "wittig": {
        "description":
            "Wittig olefination: aldehyde/ketone + alkyl halide to alkene",
        "smarts":
            "[#6:3]-[C;H1,$([CH0](-[#6])[#6]);!$(CC=O):1]=[OD1]"
            ".[Cl,Br,I][C;H2;$(C-[#6]);!$(CC[I,Br]);!$(CCO[CH3]):2]"
            ">>[C:3][C:1]=[C:2]",
        "n_reactants": 2,
        "substrate_hint": "aldehyde or ketone",
        "reagent_hint": "alkyl halide (ylide precursor)",
        "conditions": ["PPh3", "n-BuLi", "THF", "0 \u00b0C to rt"],
        "category": "functional_group",
    },
    "niementowski_quinazoline": {
        "description":
            "Niementowski quinazoline: anthranilic acid + amide "
            "\u2192 4-quinazolinone",
        "smarts":
            "[c:1](-[C;$(C-c1ccccc1):2](=[OD1:3])-[OH1])"
            ":[c:4](-[NH2:5])"
            ".[N;!H0;!$(N-N);!$(N-C=N);!$(N(-C=O)-C=O):6]"
            "-[C;H1,$(C-[#6]):7]=[OD1]"
            ">>[c:4]2:[c:1]-[C:2](=[O:3])-[N:6]-[C:7]=[N:5]-2",
        "n_reactants": 2,
        "substrate_hint": "anthranilic acid derivative",
        "reagent_hint": "amide or formamide",
        "conditions": ["neat or AcOH", "120\u2013150 \u00b0C"],
        "category": "heterocycle_formation",
    },
    # NOTE(review): the key says "carbonyl" but the description and the
    # SMARTS ([C]#[#7]) target a nitrile -- confirm the name is intended.
    "grignard_carbonyl": {
        "description":
            "Grignard on nitrile: nitrile + aryl/alkyl halide \u2192 ketone",
        "smarts":
            "[#6:1][C:2]#[#7;D1]"
            ".[Cl,Br,I][#6;$([#6]~[#6]);"
            "!$([#6]([Cl,Br,I])[Cl,Br,I]);!$([#6]=O):3]"
            ">>[#6:1][C:2](=O)[#6:3]",
        "n_reactants": 2,
        "substrate_hint": "nitrile (R\u2212C\u2261N)",
        "reagent_hint": "aryl or alkyl halide (Grignard precursor)",
        "conditions": ["Mg", "THF", "then H3O+"],
        "category": "functional_group",
    },
    # --- Deprotection templates (SMARTS from RDKit rdDeprotect source) ---
    "cbz_deprotection": {
        "description": "Remove Cbz (carbobenzyloxy) from amine",
        "smarts":
            "[NX3;H0,H1:1][C;R0](=O)[O;R0][C;R0]"
            "c1[c;H1][c;H1][c;H1][c;H1][c;H1]1>>[N:1]",
        "n_reactants": 1,
        "substrate_hint": "Cbz-protected amine",
        "reagent_hint": None,
        "conditions": ["H2", "Pd/C", "MeOH", "rt"],
        "category": "deprotection",
    },
    "fmoc_deprotection": {
        "description": "Remove Fmoc (9-fluorenylmethyloxycarbonyl) from amine",
        "smarts":
            "[NX3;H0,H1:1][#6](=O)-[#8]-[#6]-[#6]-1"
            "-c2ccccc2-c2ccccc-12>>[N:1]",
        "n_reactants": 1,
        "substrate_hint": "Fmoc-protected amine",
        "reagent_hint": None,
        "conditions": ["piperidine", "DMF", "rt"],
        "category": "deprotection",
    },
    "tbs_deprotection": {
        "description": "Remove TBS (tert-butyldimethylsilyl) from alcohol",
        "smarts": "CC(C)([Si](C)(C)[O;H0:1])C>>[O;H1:1]",
        "n_reactants": 1,
        "substrate_hint": "TBS-protected alcohol",
        "reagent_hint": None,
        "conditions": ["TBAF", "THF", "rt"],
        "category": "deprotection",
    },
    "bn_deprotection_o": {
        "description": "Remove benzyl (Bn) from alcohol",
        "smarts":
            "[O;!$(*C(=O)):1][CH2]"
            "c1[c;H1][c;H1][c;H1][c;H1][c;H1]1>>[O;H1:1]",
        "n_reactants": 1,
        "substrate_hint": "Bn-protected alcohol",
        "reagent_hint": None,
        "conditions": ["H2", "Pd/C", "EtOAc", "rt"],
        "category": "deprotection",
    },
    "bn_deprotection_n": {
        "description": "Remove benzyl (Bn) from amine",
        "smarts":
            "[NX3;H0,H1;!$(NC=O):1][C;H2]"
            "c1[c;H1][c;H1][c;H1][c;H1][c;H1]1>>[N:1]",
        "n_reactants": 1,
        "substrate_hint": "Bn-protected amine",
        "reagent_hint": None,
        "conditions": ["H2", "Pd/C", "MeOH", "rt"],
        "category": "deprotection",
    },
    "ac_deprotection_o": {
        "description": "Remove acetyl (Ac) from alcohol",
        "smarts": "[O;R0:1][C;R0](=O)[C;H3]>>[O:1]",
        "n_reactants": 1,
        "substrate_hint": "Ac-protected alcohol",
        "reagent_hint": None,
        "conditions": ["K2CO3", "MeOH", "rt"],
        "category": "deprotection",
    },
    "ac_deprotection_n": {
        "description": "Remove acetyl (Ac) from amine",
        "smarts": "[NX3;H0,H1:1][C;R0](=O)[C;H3]>>[N:1]",
        "n_reactants": 1,
        "substrate_hint": "Ac-protected amine",
        "reagent_hint": None,
        "conditions": ["6M HCl", "reflux"],
        "category": "deprotection",
    },
    "pmb_deprotection": {
        "description": "Remove PMB (para-methoxybenzyl) from alcohol",
        "smarts":
            "[c;H1]1[c;H1]c(O[C;H3])[c;H1][c;H1]c1"
            "[C;H2][O;D2&R0:1]>>[O;H1:1]",
        "n_reactants": 1,
        "substrate_hint": "PMB-protected alcohol",
        "reagent_hint": None,
        "conditions": ["DDQ", "DCM/H2O", "rt"],
        "category": "deprotection",
    },
    "ts_deprotection": {
        "description": "Remove tosyl (Ts) from amine",
        "smarts":
            "[C;H3]c1[c;H1][c;H1]c(S(=O)(=O)"
            "[NX3;H0,H1;!$(NC=O):1])[c;H1][c;H1]1>>[N:1]",
        "n_reactants": 1,
        "substrate_hint": "Ts-protected amine",
        "reagent_hint": None,
        "conditions": ["Mg", "MeOH", "sonication"],
        "category": "deprotection",
    },
    "tfa_deprotection": {
        "description": "Remove trifluoroacetyl (TFA) from amine",
        "smarts": "[N;H0,H1:1]C(=O)C(F)(F)F>>[N:1]",
        "n_reactants": 1,
        "substrate_hint": "TFA-protected amine",
        "reagent_hint": None,
        "conditions": ["K2CO3", "MeOH/H2O", "rt"],
        "category": "deprotection",
    },
    # --- Protection templates (reversed deprotection SMARTS, unimolecular) ---
    "cbz_protection": {
        "description": "Add Cbz (carbobenzyloxy) to amine",
        "smarts": "[NX3;H1,H2:1]>>[N:1]C(=O)OCc1ccccc1",
        "n_reactants": 1,
        "substrate_hint": "free amine",
        "reagent_hint": None,
        "conditions": ["CbzCl", "NaOH", "dioxane/H2O", "0 \u00b0C"],
        "category": "protection",
    },
    "fmoc_protection": {
        "description": "Add Fmoc (9-fluorenylmethyloxycarbonyl) to amine",
        "smarts":
            "[NX3;H1,H2:1]>>[N:1]C(=O)OCC1c2ccccc2-c2ccccc21",
        "n_reactants": 1,
        "substrate_hint": "free amine",
        "reagent_hint": None,
        "conditions": ["Fmoc-OSu", "NaHCO3", "dioxane/H2O", "rt"],
        "category": "protection",
    },
    "tbs_protection": {
        "description": "Add TBS (tert-butyldimethylsilyl) to alcohol",
        "smarts": "[O;H1:1]>>[O:1][Si](C)(C)C(C)(C)C",
        "n_reactants": 1,
        "substrate_hint": "free alcohol",
        "reagent_hint": None,
        "conditions": ["TBSCl", "imidazole", "DMF", "rt"],
        "category": "protection",
    },
    "bn_protection_o": {
        "description": "Add benzyl (Bn) to alcohol",
        "smarts": "[O;H1:1]>>[O:1]Cc1ccccc1",
        "n_reactants": 1,
        "substrate_hint": "free alcohol",
        "reagent_hint": None,
        "conditions": ["BnBr", "NaH", "DMF", "0 \u00b0C"],
        "category": "protection",
    },
    "bn_protection_n": {
        "description": "Add benzyl (Bn) to amine",
        "smarts": "[NX3;H1,H2;!$(NC=O):1]>>[N:1]Cc1ccccc1",
        "n_reactants": 1,
        "substrate_hint": "free amine",
        "reagent_hint": None,
        "conditions": ["BnBr", "K2CO3", "DMF", "60 \u00b0C"],
        "category": "protection",
    },
    "ac_protection_o": {
        "description": "Add acetyl (Ac) to alcohol",
        "smarts": "[O;H1:1]>>[O:1]C(C)=O",
        "n_reactants": 1,
        "substrate_hint": "free alcohol",
        "reagent_hint": None,
        "conditions": ["Ac2O", "pyridine", "rt"],
        "category": "protection",
    },
    "ac_protection_n": {
        "description": "Add acetyl (Ac) to amine",
        "smarts": "[NX3;H1,H2:1]>>[N:1]C(C)=O",
        "n_reactants": 1,
        "substrate_hint": "free amine",
        "reagent_hint": None,
        "conditions": ["Ac2O", "Et3N", "DCM", "rt"],
        "category": "protection",
    },
    "pmb_protection": {
        "description": "Add PMB (para-methoxybenzyl) to alcohol",
        "smarts": "[O;H1:1]>>[O:1]Cc1ccc(OC)cc1",
        "n_reactants": 1,
        "substrate_hint": "free alcohol",
        "reagent_hint": None,
        "conditions": ["PMBCl", "NaH", "DMF", "0 \u00b0C"],
        "category": "protection",
    },
    "ts_protection": {
        "description": "Add tosyl (Ts) to amine",
        "smarts":
            "[NX3;H1,H2;!$(NC=O):1]>>[N:1]S(=O)(=O)c1ccc(C)cc1",
        "n_reactants": 1,
        "substrate_hint": "free amine",
        "reagent_hint": None,
        "conditions": ["TsCl", "Et3N", "DCM", "0 \u00b0C"],
        "category": "protection",
    },
}
1612
+
1613
+
1614
+ # ---------------------------------------------------------------------------
1615
+ # Dynamic loading of reaction templates from datamol
1616
+ # ---------------------------------------------------------------------------
1617
+
1618
# Memo for _load_datamol_templates().  ``None`` means "not loaded yet"; an
# empty dict is stored when reactions_datamol.json is missing, so the file
# system is probed only once.
_datamol_cache: Optional[Dict[str, Dict[str, Any]]] = None
1619
+
1620
+
1621
+ def _category_from_tags(tags: set) -> str:
1622
+ """Derive a template category from datamol tags."""
1623
+ if tags & {"heterocycle formation", "cyclization", "ring formation"}:
1624
+ return "heterocycle_formation"
1625
+ if tags & {"amide coupling", "amide"}:
1626
+ return "coupling"
1627
+ # Datamol uses "protecting group", also match "protection"/"deprotection"
1628
+ if tags & {"protecting group", "protection", "deprotection"}:
1629
+ # Distinguish protection vs deprotection by tag name
1630
+ tag_lc = {t.lower() for t in tags}
1631
+ if any("deprotect" in t for t in tag_lc):
1632
+ return "deprotection"
1633
+ if any("protect" in t for t in tag_lc):
1634
+ return "protection"
1635
+ return "protecting_group"
1636
+ if tags & {"C-C bond formation", "C-N bond formation",
1637
+ "C-O bond formation", "C-S bond formation",
1638
+ "N-arylation", "O-arylation", "S-arylation"}:
1639
+ return "coupling"
1640
+ return "functional_group"
1641
+
1642
+
1643
def _count_top_level_reactants(reactant_part: str) -> int:
    """Count the reactant templates on the left side of a reaction SMARTS.

    A ``.`` separates two reactants only at the top level.  Dots inside an
    atom expression (``[...]``, e.g. a recursive SMARTS) or inside a
    parenthesised component group (``(A.B)``, RDKit's notation for an
    intramolecular reaction) belong to a single reactant template and are
    not counted.  An empty string counts as one reactant.
    """
    count = 1
    bracket_depth = 0
    paren_depth = 0
    for ch in reactant_part:
        if ch == "[":
            bracket_depth += 1
        elif ch == "]":
            bracket_depth -= 1
        elif bracket_depth == 0:
            # Parentheses are only tracked outside atom expressions.
            if ch == "(":
                paren_depth += 1
            elif ch == ")":
                paren_depth -= 1
            elif ch == "." and paren_depth == 0:
                count += 1
    return count


def _load_datamol_templates() -> Dict[str, Dict[str, Any]]:
    """Load all reaction templates from the bundled datamol JSON.

    Reads ``reactions_datamol.json`` from the directory of this module and
    converts each entry that has a non-empty ``syn_smarts`` into our
    standard template format.  Template names are the JSON keys with
    ``-`` replaced by ``_``.  The result is memoised in
    ``_datamol_cache``; when the JSON file is missing a warning is logged
    and an empty dict is cached.

    Returns:
        Dict mapping template name to template dict.
    """
    global _datamol_cache
    if _datamol_cache is not None:
        return _datamol_cache

    json_path = os.path.join(os.path.dirname(__file__), "reactions_datamol.json")
    if not os.path.exists(json_path):
        logger.warning("reactions_datamol.json not found — datamol "
                       "templates unavailable")
        _datamol_cache = {}
        return _datamol_cache

    with open(json_path, encoding="utf-8") as fh:
        raw = json.load(fh)

    templates: Dict[str, Dict[str, Any]] = {}

    for key, entry in raw.items():
        syn_smarts = entry.get("syn_smarts", "")
        if not syn_smarts:
            continue

        tags = set(entry.get("tags", []))

        # JSON keys are kebab-case; convert to snake_case for consistency
        # with the hand-written templates.
        tname = key.replace("-", "_")

        # Everything before ">>" is the reactant side; without ">>" the
        # SMARTS is treated as having a single reactant.
        reactant_part = syn_smarts.split(">>")[0] if ">>" in syn_smarts else ""
        n_reactants = _count_top_level_reactants(reactant_part)

        rhs_classes = entry.get("rhs_classes", [])
        has_partner_hint = n_reactants > 1 and len(rhs_classes) > 1

        templates[tname] = {
            "description": entry.get("description", entry.get("long_name", key)),
            "long_name": entry.get("long_name", ""),
            "smarts": syn_smarts,
            "n_reactants": n_reactants,
            "substrate_hint": ", ".join(rhs_classes),
            "reagent_hint": (", ".join(rhs_classes[1:])
                             if has_partner_hint else None),
            "conditions": [],  # literature conditions vary
            "category": _category_from_tags(tags),
            "tags": list(tags),
            "source": "datamol",
        }

    _datamol_cache = templates
    logger.debug("Loaded %d templates from datamol", len(templates))
    return _datamol_cache
1713
+
1714
+
1715
# Merged registry: classic hand-written + all datamol templates.
# Stays ``None`` until the first call to _get_reaction_templates(), which
# builds the merged dict once and stores it here.
_merged_templates: Optional[Dict[str, Dict[str, Any]]] = None
1717
+
1718
+
1719
def _get_reaction_templates() -> Dict[str, Dict[str, Any]]:
    """Return the combined template registry, building it on first use.

    The registry is the datamol templates overlaid with the hand-written
    ``_CLASSIC_TEMPLATES``; when both define the same name the classic
    entry is the one that survives.  The merged dict is stored in
    ``_merged_templates`` and returned as-is on later calls.
    """
    global _merged_templates
    if _merged_templates is None:
        # In a dict display later unpackings win, so classic entries
        # replace datamol entries of the same name.
        _merged_templates = {**_load_datamol_templates(), **_CLASSIC_TEMPLATES}
    return _merged_templates
1736
+
1737
+
1738
+ # ---------------------------------------------------------------------------
1739
+ # Tool 7: list_reactions
1740
+ # ---------------------------------------------------------------------------
1741
+
1742
def list_reactions(category: Optional[str] = None) -> Dict[str, Any]:
    """Summarise the reaction templates available to ``apply_reaction``.

    Each summary carries the template name, description, number of
    reactants, substrate/reagent hints, typical conditions and category.

    Args:
        category: Optional filter; only templates whose category equals
            this string are listed (for example ``"coupling"``,
            ``"functional_group"``, ``"heterocycle_formation"``,
            ``"protection"`` or ``"deprotection"``).  If *None* (or
            empty), all templates are returned.

    Returns:
        Dict with ``ok``, ``reactions`` (list of summaries), and
        ``categories`` (sorted names of every category in the registry,
        regardless of the filter).

    Example::

        >>> result = list_reactions()
        >>> for r in result["reactions"]:
        ...     print(r["name"], "—", r["description"])
        suzuki_coupling — Suzuki coupling: aryl halide + boronic acid to biaryl
        ...
        >>> result = list_reactions(category="heterocycle_formation")
    """
    registry = _get_reaction_templates()
    summaries = []
    known_categories = set()

    for tmpl_name, tmpl in registry.items():
        tmpl_category = tmpl.get("category", "other")
        known_categories.add(tmpl_category)

        selected = (not category) or tmpl_category == category
        if selected:
            summaries.append({
                "name": tmpl_name,
                "description": tmpl["description"],
                "n_reactants": tmpl["n_reactants"],
                "substrate_hint": tmpl["substrate_hint"],
                "reagent_hint": tmpl.get("reagent_hint"),
                "conditions": tmpl.get("conditions", []),
                "category": tmpl_category,
            })

    return {
        "ok": True,
        "reactions": summaries,
        "categories": sorted(known_categories),
    }
1785
+
1786
+
1787
+ # ---------------------------------------------------------------------------
1788
+ # Tool 8: apply_reaction
1789
+ # ---------------------------------------------------------------------------
1790
+
1791
def _find_reaction_template(templates: Dict[str, Dict[str, Any]],
                            reaction_name: str) -> Optional[Dict[str, Any]]:
    """Look up a template: exact name, then case-insensitive, then with
    hyphens/spaces normalised to underscores.  Returns None if not found."""
    tmpl = templates.get(reaction_name)
    if tmpl is not None:
        return tmpl
    lower_map = {k.lower(): k for k in templates}
    lowered = reaction_name.lower()
    for candidate in (lowered, lowered.replace("-", "_").replace(" ", "_")):
        real_key = lower_map.get(candidate)
        if real_key is not None:
            return templates[real_key]
    return None


def _unknown_reaction_hint(templates: Dict[str, Dict[str, Any]],
                           reaction_name: str) -> str:
    """Build the "did you mean" text for an unknown reaction name."""
    query = reaction_name.lower().replace("-", "_").replace(" ", "_")
    query_words = set(query.split("_"))
    scored = []
    for key in templates:
        key_lower = key.lower()
        if query in key_lower or key_lower in query:
            # Substring matches rank ahead of everything else.
            scored.append((0, 0, key))
        else:
            overlap = len(query_words & set(key_lower.split("_")))
            if overlap > 0:
                # More shared words first (hence the negated count).
                scored.append((1, -overlap, key))
    scored.sort()
    suggestions = [item[-1] for item in scored[:5]]
    if suggestions:
        return f"Did you mean: {', '.join(suggestions)}?"
    all_names = sorted(templates)
    return (f"Some available reactions: {', '.join(all_names[:15])}... "
            f"({len(all_names)} total)")


def _mol_from_smiles_or_name(text: str):
    """Parse *text* as SMILES; if that fails, resolve it as a compound
    name via ``_resolve_query``.  Returns an RDKit Mol or None."""
    from rdkit import Chem

    mol = Chem.MolFromSmiles(text)
    if mol is None:
        resolved = _resolve_query(text)
        if resolved:
            mol = Chem.MolFromSmiles(resolved["smiles"])
    return mol


def apply_reaction(reaction_name: str,
                   substrate: str,
                   reagent: Optional[str] = None) -> Dict[str, Any]:
    """Apply a named reaction template to transform a substrate.

    Takes a substrate SMILES (and optionally a reagent SMILES for
    bimolecular reactions) and returns the product(s).

    Args:
        reaction_name: Template name from ``list_reactions()``.  Matching
            is exact first, then case-insensitive, then with hyphens and
            spaces treated as underscores.
        substrate: SMILES of the main substrate.  If it does not parse as
            SMILES it is resolved as a chemical name.
        reagent: SMILES of the coupling partner (for 2-reactant rxns).
            Can also be a chemical name or abbreviation.

    Returns:
        On success a dict with ``ok``, ``products`` (list of product dicts
        with ``smiles`` and ``name`` keys, de-duplicated by SMILES),
        ``conditions`` and ``reaction``.  On failure a dict with
        ``ok=False`` and an ``error`` message; no exception is raised for
        unknown names, unparsable inputs or non-matching substrates.

    Example::

        >>> apply_reaction("nitro_reduction", "c1ccc([N+](=O)[O-])cc1")
        {'ok': True,
         'products': [{'smiles': 'Nc1ccccc1', 'name': 'aniline'}],
         'conditions': ['SnCl2·2H2O', 'EtOH', '80 °C']}
    """
    from rdkit import Chem
    from rdkit.Chem import AllChem

    templates = _get_reaction_templates()
    tmpl = _find_reaction_template(templates, reaction_name)
    if tmpl is None:
        hint = _unknown_reaction_hint(templates, reaction_name)
        return {
            "ok": False,
            "error": f"Unknown reaction '{reaction_name}'. {hint}",
        }

    # Parse reaction SMARTS
    try:
        rxn = AllChem.ReactionFromSmarts(tmpl["smarts"])
    except Exception as exc:
        return {"ok": False, "error": f"Invalid reaction SMARTS: {exc}"}

    sub_mol = _mol_from_smiles_or_name(substrate)
    if sub_mol is None:
        return {"ok": False, "error": f"Could not parse substrate '{substrate}'."}

    # Handle reagent for bimolecular reactions
    bimolecular = tmpl["n_reactants"] == 2
    if bimolecular:
        if not reagent:
            # reagent_hint is stored as None on many templates, so a plain
            # .get() default would never kick in.
            expected = tmpl.get("reagent_hint") or "coupling partner"
            return {
                "ok": False,
                "error": f"Reaction '{reaction_name}' requires a reagent. "
                         f"Expected: {expected}.",
            }
        rea_mol = _mol_from_smiles_or_name(reagent)
        if rea_mol is None:
            return {"ok": False,
                    "error": f"Could not parse reagent '{reagent}'."}
        reactants = (sub_mol, rea_mol)
    else:
        reactants = (sub_mol,)

    # Run reaction
    try:
        product_sets = rxn.RunReactants(reactants)
    except Exception as exc:
        return {"ok": False, "error": f"Reaction failed: {exc}"}

    if not product_sets and bimolecular:
        # The caller may have passed the partners in the opposite order to
        # the template; retry swapped before giving up.
        try:
            product_sets = rxn.RunReactants((reactants[1], reactants[0]))
        except Exception:
            product_sets = ()
    if not product_sets:
        return {
            "ok": False,
            "error": "No products formed. Check that the substrate "
                     f"matches: {tmpl['substrate_hint']}.",
        }

    # Collect unique products; products that fail sanitisation are skipped.
    seen = set()
    products = []
    for prod_tuple in product_sets:
        for prod in prod_tuple:
            try:
                Chem.SanitizeMol(prod)
                smi = Chem.MolToSmiles(prod)
                if smi not in seen:
                    seen.add(smi)
                    name = _smiles_to_name_cs(smi)
                    products.append({"smiles": smi, "name": name})
            except Exception:
                continue

    if not products:
        return {"ok": False, "error": "Products could not be sanitised."}

    return {
        "ok": True,
        "products": products,
        "conditions": tmpl["conditions"],
        "reaction": reaction_name,
    }
1938
+
1939
+
1940
+ # ---------------------------------------------------------------------------
1941
+ # Tool 9: deprotect
1942
+ # ---------------------------------------------------------------------------
1943
+
1944
def _detect_deprotection_templates(mol) -> List[str]:
    """List the unimolecular deprotection templates that match *mol*.

    A template "matches" when running its reaction SMARTS on *mol* yields
    at least one product set.  Templates whose SMARTS cannot be parsed or
    run are silently treated as non-matching.

    Args:
        mol: RDKit ``Mol`` object.

    Returns:
        Template names from the merged registry, in registry order, whose
        category is ``"deprotection"``, that take a single reactant, and
        that fire on *mol*.
    """
    from rdkit.Chem import AllChem

    def _fires(tmpl) -> bool:
        try:
            rxn = AllChem.ReactionFromSmarts(tmpl["smarts"])
            return bool(rxn.RunReactants((mol,)))
        except Exception:
            return False

    return [
        name
        for name, tmpl in _get_reaction_templates().items()
        if tmpl.get("category") == "deprotection"
        and tmpl.get("n_reactants", 1) == 1
        and _fires(tmpl)
    ]
1974
+
1975
+
1976
# Map from apply_reaction template name to the PG abbreviation used in the
# ``removed`` list that callers expect.  Keeps the public return format
# stable even when template names are refactored.  deprotect() looks names
# up with ``.get(tname, tname)``, so a template missing from this map is
# reported under its own template name.
_TEMPLATE_TO_PG_ABBREV: Dict[str, str] = {
    "BOC_deprotection": "Boc",
    "cbz_deprotection": "Cbz",
    "fmoc_deprotection": "Fmoc",
    "tbs_deprotection": "TBS",
    "bn_deprotection_o": "Bn",
    "bn_deprotection_n": "Bn",
    "ac_deprotection_o": "Ac",
    "ac_deprotection_n": "Ac",
    "pmb_deprotection": "PMB",
    "ts_deprotection": "Ts",
    "tfa_deprotection": "TFA",
}
1992
+
1993
+
1994
def _strip_pg_with_template(tname: str, smiles: str,
                            max_rounds: int = 32) -> Optional[str]:
    """Apply one named deprotection template until it stops matching.

    Only the first product of each ``apply_reaction`` call is carried
    forward, so a substrate bearing the same protecting group at several
    sites needs one pass per site.

    Returns the SMILES after the last pass that changed the molecule, or
    None if the template never changed it.  ``max_rounds`` is a safety
    bound against a template that keeps matching its own product.
    """
    current = smiles
    changed = False
    for _ in range(max_rounds):
        outcome = apply_reaction(tname, current)
        if not (outcome.get("ok") and outcome.get("products")):
            break
        next_smi = outcome["products"][0]["smiles"]
        if next_smi == current:
            break
        current = next_smi
        changed = True
    return current if changed else None


def _any_product_differs(product_sets, reference_smi: str) -> bool:
    """Return True if any sanitisable product has a SMILES other than
    *reference_smi*; products that fail sanitisation are ignored."""
    from rdkit import Chem

    for prod_tuple in product_sets:
        for prod in prod_tuple:
            try:
                Chem.SanitizeMol(prod)
                if Chem.MolToSmiles(prod) != reference_smi:
                    return True
            except Exception:
                continue
    return False


def deprotect(smiles: str) -> Dict[str, Any]:
    """Remove common protecting groups from a molecule.

    When exactly **one named deprotection template** fires on the
    substrate, this function delegates to :func:`apply_reaction` with that
    template (e.g. ``"BOC_deprotection"``, ``"fmoc_deprotection"``),
    re-applying it until it no longer matches so that every site carrying
    that protecting group is cleaved.

    When several templates fire, none fires, or the named template fails
    to change the molecule, the function falls back to RDKit's built-in
    ``rdDeprotect`` library.  If ``rdDeprotect`` cannot be imported, the
    templates that fired are applied one after another instead.

    The return format is identical in all cases.

    Args:
        smiles: SMILES of the protected molecule.  If it does not parse as
            SMILES it is resolved as a chemical name.

    Returns:
        Dict with ``ok``, ``product_smiles``, ``product_name``,
        and ``removed`` (list of protecting group abbreviations removed).
        When nothing was removed, ``removed`` is empty and a ``note`` key
        is added.  On failure, ``ok`` is False and ``error`` is set.

    Example::

        >>> deprotect("O=C(OC(C)(C)C)Nc1ccccc1")   # Boc-aniline
        {'ok': True, 'product_smiles': 'Nc1ccccc1', 'product_name': 'aniline',
         'removed': ['Boc']}

    Note:
        Single-PG deprotections can also be called directly via
        :func:`apply_reaction`, e.g.
        ``apply_reaction("BOC_deprotection", smiles)``.  Use that form
        when you know the specific protecting group in advance.
    """
    from rdkit import Chem

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        resolved = _resolve_query(smiles)
        if resolved:
            mol = Chem.MolFromSmiles(resolved["smiles"])
    if mol is None:
        return {"ok": False, "error": f"Could not parse '{smiles}'."}

    original_smi = Chem.MolToSmiles(mol)

    # --- Fast path: exactly one named template fires ---
    fired = _detect_deprotection_templates(mol)
    if len(fired) == 1:
        tname = fired[0]
        product_smi = _strip_pg_with_template(tname, original_smi)
        if product_smi is not None:
            return {
                "ok": True,
                "product_smiles": product_smi,
                "product_name": _smiles_to_name_cs(product_smi),
                "removed": [_TEMPLATE_TO_PG_ABBREV.get(tname, tname)],
            }
        # apply_reaction unexpectedly failed — fall through to rdDeprotect

    # --- Fallback: rdDeprotect handles multiple PGs or unrecognised ones ---
    try:
        from rdkit.Chem import rdDeprotect
        result = rdDeprotect.Deprotect(mol)
    except ImportError:
        # rdDeprotect unavailable: run each template that fired, in
        # sequence, each one to exhaustion.
        current_smi = original_smi
        removed_abbrevs: List[str] = []
        for tname in fired:
            stripped = _strip_pg_with_template(tname, current_smi)
            if stripped is not None:
                current_smi = stripped
                removed_abbrevs.append(_TEMPLATE_TO_PG_ABBREV.get(tname, tname))
        if removed_abbrevs:
            return {
                "ok": True,
                "product_smiles": current_smi,
                "product_name": _smiles_to_name_cs(current_smi),
                "removed": removed_abbrevs,
            }
        return {
            "ok": False,
            "error": (
                "rdDeprotect not available in this RDKit build. "
                "Use apply_reaction() with a specific template name "
                "(e.g. 'BOC_deprotection') for single-PG removal."
            ),
        }
    except Exception as exc:
        return {"ok": False, "error": f"Deprotection failed: {exc}"}

    product_smi = Chem.MolToSmiles(result)

    if product_smi == original_smi:
        return {
            "ok": True,
            "product_smiles": product_smi,
            "product_name": _smiles_to_name_cs(product_smi),
            "removed": [],
            "note": "No protecting groups detected.",
        }

    # Identify which PGs were removed: an rdDeprotect entry counts as
    # "removed" when its reaction changes the original molecule.
    removed: List[str] = []
    try:
        from rdkit.Chem import AllChem
        for d in rdDeprotect.GetDeprotections():
            try:
                rxn = AllChem.ReactionFromSmarts(d.reaction_smarts)
                product_sets = rxn.RunReactants((mol,))
            except Exception:
                # One bad entry must not hide the others.
                continue
            if _any_product_differs(product_sets, original_smi):
                removed.append(d.abbreviation)
    except Exception:
        # Labelling is best effort; the deprotected product is still valid.
        logger.debug("Could not identify removed protecting groups",
                     exc_info=True)

    return {
        "ok": True,
        "product_smiles": product_smi,
        "product_name": _smiles_to_name_cs(product_smi),
        "removed": removed,
    }
2138
+
2139
+
2140
+ # ---------------------------------------------------------------------------
2141
+ # Tool 10: draw_molecule
2142
+ # ---------------------------------------------------------------------------
2143
+
2144
def draw_molecule(
    mol_json: Dict[str, Any],
    output_path: Optional[str] = None,
) -> Dict[str, Any]:
    """Render one molecule as a standalone CDXML document.

    The structure is laid out from the ``smiles`` field of *mol_json*,
    re-centred on a fixed point, and wrapped in a CDXML header, a single
    page element and the footer.  No arrow or reaction scheme is drawn.

    If *mol_json* has a truthy ``label``, ``name`` or ``iupac_name``
    (checked in that order), that text is placed 14 pt below the bottom
    of the structure, centred on the same x position.

    Args:
        mol_json: Dict with at minimum ``"smiles"``.  Optional display
            keys: ``"label"``, ``"name"``, ``"iupac_name"``.  Any other
            fields are ignored.
        output_path: If given, the CDXML string is also written to this
            file path (UTF-8).

    Returns:
        Dict with keys:

        - ``ok``: bool
        - ``cdxml``: CDXML document string (on success)
        - ``output_path``: echoed path if *output_path* was specified
        - ``error``: error message (when ``ok=False``), e.g. when
          ``smiles`` is missing, the renderer cannot be imported, layout
          fails, or the file cannot be written

    Example::

        >>> result = draw_molecule({"smiles": "CC(=O)Oc1ccccc1C(=O)O",
        ...                         "name": "aspirin"})
        >>> result["ok"]
        True
        >>> result["cdxml"][:20]
        '<?xml version="1.0"'
    """
    smiles = mol_json.get("smiles")
    if not smiles:
        return {"ok": False, "error": "mol_json must contain a 'smiles' field."}

    # First truthy value among label / name / iupac_name wins.
    label: Optional[str] = next(
        (mol_json.get(key)
         for key in ("label", "name", "iupac_name")
         if mol_json.get(key)),
        None,
    )

    # Renderer internals are imported lazily to avoid import-time cost.
    try:
        from cdxml_toolkit.render.renderer import (
            _IDGen,
            _smiles_to_fragment_data,
            _build_fragment,
            _build_text_element,
            _fragment_bbox,
            _bbox_center,
            _shift_atoms,
        )
    except ImportError as exc:
        return {"ok": False, "error": f"Renderer not available: {exc}"}

    from cdxml_toolkit.constants import (
        CDXML_FOOTER,
        CDXML_HEADER,
        ACS_LABEL_FONT,
        ACS_LABEL_SIZE,
        ACS_LABEL_FACE,
        ACS_CAPTION_SIZE,
        ACS_HASH_SPACING,
        ACS_MARGIN_WIDTH,
        ACS_LINE_WIDTH,
        ACS_BOLD_WIDTH,
        ACS_BOND_LENGTH_STR,
        ACS_BOND_SPACING,
        ACS_CHAIN_ANGLE_STR,
    )

    # --- 2D layout ---
    target_x, target_y = 200.0, 200.0

    layout = _smiles_to_fragment_data(smiles, target_x, target_y)
    if layout is None:
        return {
            "ok": False,
            "error": f"Could not generate 2D coordinates for SMILES: {smiles!r}",
        }
    atoms, bonds = layout

    # Move the bounding-box centre onto the target point, then re-measure.
    mid_x, mid_y = _bbox_center(_fragment_bbox(atoms))
    _shift_atoms(atoms, target_x - mid_x, target_y - mid_y)
    bbox = _fragment_bbox(atoms)

    # --- XML body: fragment first, optional label second ---
    ids = _IDGen(1000)
    frag_xml, _, _ = _build_fragment(atoms, bonds, ids)
    body_parts = [frag_xml]

    if label:
        below_structure = bbox[3] + 14.0  # 14 pt below the structure bottom
        label_xml, _ = _build_text_element(
            [label], target_x, below_structure, ids,
            justification="Center", use_formatting=False,
        )
        body_parts.append(label_xml)

    body_xml = "\n".join(body_parts)

    # --- Document wrapper ---
    page_id = ids.next()

    header_fields = {
        "bbox": "0 0 1620 2160",
        "label_font": ACS_LABEL_FONT,
        "label_size": ACS_LABEL_SIZE,
        "label_face": ACS_LABEL_FACE,
        "caption_size": ACS_CAPTION_SIZE,
        "hash_spacing": ACS_HASH_SPACING,
        "margin_width": ACS_MARGIN_WIDTH,
        "line_width": ACS_LINE_WIDTH,
        "bold_width": ACS_BOLD_WIDTH,
        "bond_length": ACS_BOND_LENGTH_STR,
        "bond_spacing": ACS_BOND_SPACING,
        "chain_angle": ACS_CHAIN_ANGLE_STR,
    }
    header = CDXML_HEADER.format(**header_fields)

    page_open = (
        f'<page id="{page_id}" BoundingBox="0 0 1620 2160" '
        f'HeaderPosition="36" FooterPosition="36" '
        f'PrintTrimMarks="yes" HeightPages="3" WidthPages="3">'
    )

    cdxml = "\n".join([header, page_open, body_xml, "</page>", CDXML_FOOTER])

    if not output_path:
        return {"ok": True, "cdxml": cdxml}

    try:
        with open(output_path, "w", encoding="utf-8") as fh:
            fh.write(cdxml)
    except OSError as exc:
        return {"ok": False, "error": f"Failed to write '{output_path}': {exc}"}

    return {"ok": True, "cdxml": cdxml, "output_path": output_path}
2297
+
2298
+
2299
+ # ---------------------------------------------------------------------------
2300
+ # Tool 11: modify_molecule
2301
+ # ---------------------------------------------------------------------------
2302
+
2303
+ def _compute_formula(smiles: str) -> Optional[str]:
2304
+ """Get molecular formula string from SMILES using RDKit."""
2305
+ try:
2306
+ from rdkit import Chem
2307
+ from rdkit.Chem import rdMolDescriptors
2308
+ mol = Chem.MolFromSmiles(smiles)
2309
+ if mol is None:
2310
+ return None
2311
+ return rdMolDescriptors.CalcMolFormula(mol)
2312
+ except Exception:
2313
+ return None
2314
+
2315
+
2316
+ def _compute_mw(smiles: str) -> Optional[float]:
2317
+ """Get exact molecular weight (monoisotopic) from SMILES using RDKit."""
2318
+ try:
2319
+ from rdkit import Chem
2320
+ from rdkit.Chem import Descriptors
2321
+ mol = Chem.MolFromSmiles(smiles)
2322
+ if mol is None:
2323
+ return None
2324
+ return round(Descriptors.ExactMolWt(mol), 4)
2325
+ except Exception:
2326
+ return None
2327
+
2328
+
2329
+ def _parse_formula_counts(formula: str) -> Dict[str, int]:
2330
+ """Parse a molecular formula string into element counts.
2331
+
2332
+ Handles simple formulas like ``C26H26N8O3``. Returns a dict mapping
2333
+ element symbol to count.
2334
+ """
2335
+ counts: Dict[str, int] = {}
2336
+ for sym, n in re.findall(r"([A-Z][a-z]?)(\d*)", formula):
2337
+ if sym:
2338
+ counts[sym] = counts.get(sym, 0) + (int(n) if n else 1)
2339
+ return counts
2340
+
2341
+
2342
def _delta_formula(formula_in: str, formula_out: str) -> str:
    """Describe how the element counts change from one formula to another.

    Elements are visited in alphabetical order.  Gains are concatenated
    after a ``+`` and losses after a ``-``; a count of 1 is written as the
    bare symbol.  The two groups are joined with ``", "``.

    Example: ``C20H20`` to ``C26H26`` gives ``+C6H6``.
    Returns ``"(no change)"`` when every count is equal.
    """
    before = _parse_formula_counts(formula_in)
    after = _parse_formula_counts(formula_out)

    gained: List[str] = []
    lost: List[str] = []
    for symbol in sorted(before.keys() | after.keys()):
        change = after.get(symbol, 0) - before.get(symbol, 0)
        if change == 0:
            continue
        size = abs(change)
        token = symbol + (str(size) if size > 1 else "")
        (gained if change > 0 else lost).append(token)

    pieces = []
    if gained:
        pieces.append("+" + "".join(gained))
    if lost:
        pieces.append("-" + "".join(lost))
    if not pieces:
        return "(no change)"
    return ", ".join(pieces)
2370
+
2371
+
2372
def _build_mol_diff(input_smiles: str, output_smiles: str) -> Dict[str, Any]:
    """Build the ``diff`` sub-dict using MCS + formula comparison.

    The returned dict always has the keys ``atoms_added``,
    ``atoms_removed``, ``atoms_changed``, ``mcs_smarts``,
    ``delta_formula`` and ``delta_mw``.  The structural part is best
    effort: any exception raised while importing or running the
    structural comparison is swallowed, leaving whatever was filled in up
    to that point.  The formula and MW deltas are computed independently
    afterwards.
    """
    diff: Dict[str, Any] = {
        "atoms_added": [],
        "atoms_removed": [],
        "atoms_changed": [],
        "mcs_smarts": None,
        "delta_formula": None,
        "delta_mw": None,
    }

    try:
        from rdkit import Chem
        from rdkit.Chem import rdFMCS
        from cdxml_toolkit.naming.aligned_namer import molecular_diff

        md = molecular_diff(input_smiles, output_smiles)

        # The MCS SMARTS is only computed when molecular_diff did not
        # report using its fallback.
        if not md.fallback_used:
            try:
                sm_mol = Chem.MolFromSmiles(input_smiles)
                prod_mol = Chem.MolFromSmiles(output_smiles)
                if sm_mol and prod_mol:
                    mcs = rdFMCS.FindMCS(
                        [sm_mol, prod_mol],
                        threshold=1.0,
                        ringMatchesRingOnly=True,
                        completeRingsOnly=True,
                        atomCompare=rdFMCS.AtomCompare.CompareElements,
                        bondCompare=rdFMCS.BondCompare.CompareOrder,
                        timeout=5,
                    )
                    # Discard timed-out searches and cores under 3 atoms.
                    if not mcs.canceled and mcs.numAtoms >= 3:
                        diff["mcs_smarts"] = mcs.smartsString
            except Exception:
                pass

        # NOTE(review): the nesting here was reconstructed from a listing
        # whose indentation was lost; confirm whether this loop is meant
        # to run only when ``fallback_used`` is false.
        # Sort each reported change into added / removed / changed.
        for ch in md.changes:
            if ch.change_type == "addition":
                diff["atoms_added"].append(ch.prod_name)
            elif ch.change_type == "removal":
                diff["atoms_removed"].append(ch.sm_name)
            elif ch.change_type == "replace":
                diff["atoms_changed"].append(
                    {"from": ch.sm_name, "to": ch.prod_name}
                )
    except Exception:
        pass

    # Formula and MW delta (always computed — does not need MCS)
    formula_in = _compute_formula(input_smiles)
    formula_out = _compute_formula(output_smiles)
    mw_in = _compute_mw(input_smiles)
    mw_out = _compute_mw(output_smiles)

    # Each delta stays None unless both of its inputs could be computed.
    if formula_in and formula_out:
        diff["delta_formula"] = _delta_formula(formula_in, formula_out)
    if mw_in is not None and mw_out is not None:
        diff["delta_mw"] = round(mw_out - mw_in, 4)

    return diff
2433
+
2434
+
2435
def _build_aligned_names(input_smiles: str, output_smiles: str) -> str:
    """Return a one- or two-line name comparison for two SMILES.

    Preferred form: the aligned starting-material and product names from
    ``find_aligned_names`` joined by an arrow, followed on a second line
    by ``changes:`` and the output of ``format_name_diff``.  If that route
    raises or yields no pair of names, the result is the two names from
    ``_smiles_to_name_cs`` joined by an arrow, or an empty string when
    either name is missing.
    """
    try:
        from cdxml_toolkit.naming.aligned_namer import (
            find_aligned_names, format_name_diff,
        )
        aligned = find_aligned_names(input_smiles, output_smiles)
        if aligned.best_sm_name and aligned.best_prod_name:
            summary = format_name_diff(aligned.best_sm_name,
                                       aligned.best_prod_name)
            headline = f"{aligned.best_sm_name} \u2192 {aligned.best_prod_name}"
            return headline + f"\n changes: {summary}"
    except Exception:
        pass

    # Fallback: plain names, only when both could be generated.
    plain = [_smiles_to_name_cs(smi) or ""
             for smi in (input_smiles, output_smiles)]
    if all(plain):
        return f"{plain[0]} \u2192 {plain[1]}"
    return ""
2460
+
2461
+
2462
def modify_molecule(mol_json: Dict[str, Any],
                    operation: str,
                    **kwargs: Any) -> Dict[str, Any]:
    """Modify a molecule and verify the change with a structural diff.

    This is the molecular editor for LLM orchestration.  It takes a
    molecule (as a dict with at least a ``smiles`` key), applies an
    operation, and returns the modified molecule with a structural diff
    so the LLM can verify the change happened as intended.

    Failures are reported as ``{"ok": False, "error": ...}`` dicts (some
    with extra context keys such as ``input_smiles``) rather than raised.

    Parameters
    ----------
    mol_json : dict
        Source molecule dict.  Must contain ``smiles`` (canonical SMILES).
        May also contain ``name`` or ``iupac_name`` for display.
    operation : str
        One of:

        - ``"analyze"`` — inspect the molecule without modifying it.
          Returns functional groups, alternative IUPAC names, bracket
          tree, prefix form, formula, and MW.  No additional kwargs.

        - ``"name_surgery"`` — modify via IUPAC name manipulation.
          Removals are applied first, then additions.  Additional kwargs:

          - ``add``: list of ``{"locant": str, "prefix": str}`` dicts
          - ``remove``: list of prefix strings to remove

        - ``"smarts"`` — apply a SMARTS reaction transform.
          Additional kwargs:

          - ``smarts``: reaction SMARTS string, e.g. ``"[c:1][F]>>[c:1][Cl]"``
          - ``reaction_name``: name from ``list_reactions()`` (alternative)

        - ``"set_smiles"`` — accept new SMILES from the LLM.
          Additional kwargs:

          - ``new_smiles``: str (validated with RDKit)
          - ``description``: str (optional, for context)

        - ``"set_name"`` — accept a new chemical name from the LLM and
          resolve it to a structure.  Additional kwargs:

          - ``new_name``: str (required) — name to resolve to SMILES

        - ``"reaction"`` — apply a named reaction template (calls
          ``apply_reaction()`` internally).  Additional kwargs:

          - ``reaction_name``: str (required) — template from ``list_reactions()``
          - ``reagent``: dict with ``smiles`` key (for binary reactions)

    Returns
    -------
    dict
        For ``"analyze"`` operation:

        - ``ok``: bool
        - ``input_smiles``: canonical SMILES of input
        - ``canonical_name``: IUPAC name (from ChemScript, or empty)
        - ``alternative_names``: list of alternative IUPAC names (round-trip
          validated) showing different parent/substituent perspectives
        - ``functional_groups``: list of functional group names present
          (e.g. ``["aryl chloride", "pyridine", "amide"]``)
        - ``prefix_form``: IUPAC prefix if this could be a substituent, or
          ``None``
        - ``bracket_tree``: the canonical IUPAC name with its bracket
          hierarchy preserved (same as ``canonical_name``); the caller can
          parse parenthesised groups to see substituents at each depth
        - ``formula``: molecular formula string
        - ``mw``: exact monoisotopic MW (float)

        For modification operations (``"name_surgery"``, ``"smarts"``,
        ``"set_smiles"``, ``"set_name"``, ``"reaction"``):

        - ``ok``: bool
        - ``input_smiles``: canonical SMILES of input
        - ``output_smiles``: canonical SMILES of output
        - ``input_name``: IUPAC name of input
        - ``output_name``: IUPAC name of output (from ChemScript)
        - ``aligned_names``: side-by-side aligned name comparison string
        - ``diff``: sub-dict with:

          - ``atoms_added``: list of fragment names added
          - ``atoms_removed``: list of fragment names removed
          - ``atoms_changed``: list of ``{"from": ..., "to": ...}`` dicts
          - ``mcs_smarts``: maximum common substructure SMARTS (str or None)
          - ``delta_formula``: formula difference (e.g. ``"+C6H5, -F"``)
          - ``delta_mw``: MW difference in Da (float)

        - ``formula``: molecular formula of output
        - ``mw``: exact monoisotopic MW of output
        - ``alternative_products``: only present for ``"reaction"`` when the
          template yielded more than one product; the products after the
          first, as returned by ``apply_reaction()``

    Examples
    --------
    ::

        # Swap a CD3 for benzyl via SMARTS
        result = modify_molecule(
            {"smiles": "C([2H])([2H])[2H]"},
            "smarts",
            smarts="[C:1]([2H])([2H])[2H]>>[C:1]Cc1ccccc1",
        )

        # Add a fluoro group via name surgery
        result = modify_molecule(
            {"smiles": "Clc1ccncc1"},
            "name_surgery",
            add=[{"locant": "3", "prefix": "fluoro"}],
        )

        # Directly set new SMILES and verify
        result = modify_molecule(
            {"smiles": "Clc1ccncc1"},
            "set_smiles",
            new_smiles="Clc1c(F)cncc1",
            description="added fluoro at C3",
        )
    """
    from rdkit import Chem

    # ---- Validate input ----
    input_smiles_raw = mol_json.get("smiles", "")
    if not input_smiles_raw:
        return {"ok": False, "error": "mol_json must contain 'smiles'."}

    in_mol = Chem.MolFromSmiles(input_smiles_raw)
    if in_mol is None:
        return {"ok": False,
                "error": f"Could not parse input SMILES: '{input_smiles_raw}'."}
    input_smiles = Chem.MolToSmiles(in_mol)

    output_smiles: Optional[str] = None
    alternative_products: List[Dict[str, Any]] = []

    # ---- Dispatch operation ----
    if operation == "set_smiles":
        new_smiles = kwargs.get("new_smiles", "")
        if not new_smiles:
            return {"ok": False,
                    "error": "'new_smiles' is required for set_smiles."}
        out_mol = Chem.MolFromSmiles(new_smiles)
        if out_mol is None:
            return {"ok": False,
                    "error": f"'new_smiles' is not a valid SMILES: '{new_smiles}'."}
        output_smiles = Chem.MolToSmiles(out_mol)

    elif operation == "smarts":
        smarts_str = kwargs.get("smarts", "")
        reaction_name = kwargs.get("reaction_name", "")

        # An explicit SMARTS wins; the template is only looked up when
        # no SMARTS string was supplied.
        if reaction_name and not smarts_str:
            templates = _get_reaction_templates()
            tmpl = templates.get(reaction_name)
            if tmpl is None:
                return {"ok": False,
                        "error": f"Unknown reaction_name '{reaction_name}'."}
            smarts_str = tmpl["smarts"]

        if not smarts_str:
            return {"ok": False,
                    "error": "'smarts' or 'reaction_name' is required for smarts."}

        try:
            from rdkit.Chem import AllChem
            rxn = AllChem.ReactionFromSmarts(smarts_str)
        except Exception as exc:
            return {"ok": False, "error": f"Invalid reaction SMARTS: {exc}"}

        try:
            product_sets = rxn.RunReactants((in_mol,))
        except Exception as exc:
            return {"ok": False, "error": f"SMARTS reaction failed: {exc}"}

        if not product_sets:
            # Detect common patterns and suggest a named reaction.
            # Most specific patterns are checked first; exactly one hint
            # is emitted.
            s = smarts_str
            if any(p in s for p in ["OC(C)(C)C", "Boc", "BOC", "boc", "tBu"]):
                hint = ("For Boc deprotection, try: operation='reaction', "
                        "reaction_name='BOC_deprotection'")
            elif any(p in s for p in ["Fmoc", "fmoc", "fluorenyl"]):
                hint = ("For Fmoc deprotection, try: operation='reaction', "
                        "reaction_name='fmoc_deprotection'")
            elif "C(=O)N" in s and "OC(=O)N" not in s:
                hint = ("For amide hydrolysis, try: operation='reaction', "
                        "reaction_name='amide_hydrolysis'")
            elif "C(=O)O" in s:
                hint = ("For ester hydrolysis, try: operation='reaction', "
                        "reaction_name='ester_hydrolysis'")
            else:
                hint = ("Hint: use operation='reaction' with a reaction_name "
                        "for common transformations. Call modify_molecule "
                        "with operation='reaction' and no reaction_name to "
                        "see all available reactions.")
            return {
                "ok": False,
                "error": (
                    "SMARTS pattern did not match the input molecule. "
                    + hint
                ),
                "input_smiles": input_smiles,
            }

        # Take the first product (in RunReactants order) that sanitizes.
        for prod_tuple in product_sets:
            for prod in prod_tuple:
                try:
                    Chem.SanitizeMol(prod)
                    output_smiles = Chem.MolToSmiles(prod)
                    break
                except Exception:
                    continue
            if output_smiles:
                break

        if not output_smiles:
            return {"ok": False,
                    "error": "SMARTS reaction produced no valid products."}

    elif operation == "name_surgery":
        iupac_name = (mol_json.get("iupac_name")
                      or mol_json.get("name")
                      or _smiles_to_name_cs(input_smiles))

        if not iupac_name:
            return {
                "ok": False,
                "error": (
                    "name_surgery requires an IUPAC name. "
                    "Provide 'iupac_name' in mol_json, or ensure ChemScript "
                    "is available to auto-generate one."
                ),
            }

        # ``or []`` so an explicit ``add=None`` / ``remove=None`` (easy for
        # an LLM caller to emit) behaves like an omitted argument instead
        # of raising TypeError on iteration.
        add_list: List[Dict[str, str]] = kwargs.get("add") or []
        remove_list: List[str] = kwargs.get("remove") or []

        current_name = iupac_name

        for prefix_to_remove in remove_list:
            # Auto-resolve abbreviations to IUPAC prefix form
            pfx_r = get_prefix_form(prefix_to_remove)
            if pfx_r.get("ok"):
                prefix_to_remove = pfx_r["prefix"]
            res = _modify_remove(current_name, prefix_to_remove,
                                 validate=True, use_network=False)
            if not res.get("ok"):
                return {
                    "ok": False,
                    "error": (f"name_surgery remove '{prefix_to_remove}' "
                              f"failed: {res.get('error', '?')}"),
                    "input_smiles": input_smiles,
                    "tried_name": current_name,
                }
            if res.get("valid") and res.get("smiles"):
                current_name = res["name"]
            else:
                return {
                    "ok": False,
                    "error": (f"name_surgery remove '{prefix_to_remove}' "
                              f"produced invalid name: '{res.get('name')}'."),
                    "input_smiles": input_smiles,
                }

        for sub in add_list:
            prefix = sub.get("prefix", "")
            locant = sub.get("locant", "")
            if not prefix:
                continue
            # Auto-resolve abbreviations/formulae to IUPAC prefix form
            # so the agent can say "CF3" instead of "trifluoromethyl".
            pfx_result = get_prefix_form(prefix)
            if pfx_result.get("ok"):
                prefix = pfx_result["prefix"]
            res = _modify_add(current_name, prefix, locant,
                              validate=True, use_network=False)
            if not res.get("ok"):
                return {
                    "ok": False,
                    "error": (f"name_surgery add '{prefix}' at '{locant}' "
                              f"failed: {res.get('error', '?')}"),
                    "input_smiles": input_smiles,
                    "tried_name": current_name,
                }
            if res.get("valid") and res.get("smiles"):
                current_name = res["name"]
            else:
                return {
                    "ok": False,
                    "error": (f"name_surgery add '{prefix}' produced "
                              f"invalid name: '{res.get('name')}'."),
                    "input_smiles": input_smiles,
                }

        output_smiles = _try_validate(current_name, use_network=False)
        if not output_smiles:
            return {
                "ok": False,
                "error": f"Could not validate name surgery result: '{current_name}'.",
                "input_smiles": input_smiles,
                "output_name_attempted": current_name,
            }
        # Canonicalise when RDKit can parse the result; otherwise keep the
        # validator's SMILES as-is.
        out_mol = Chem.MolFromSmiles(output_smiles)
        if out_mol:
            output_smiles = Chem.MolToSmiles(out_mol)

    elif operation == "reaction":
        # ---- Reaction: apply a named reaction template via apply_reaction ----
        reaction_name = kwargs.get("reaction_name", "")
        if not reaction_name:
            # No template given: answer with the catalogue so the caller
            # can pick one.
            rxn_list = list_reactions()
            names = [r["name"] for r in rxn_list.get("reactions", [])]
            return {
                "ok": False,
                "error": (
                    "'reaction_name' is required for the reaction operation. "
                    f"Available reactions: {', '.join(names)}"
                ),
                "input_smiles": input_smiles,
            }

        reagent_dict = kwargs.get("reagent", None)
        reagent_smiles = (reagent_dict.get("smiles")
                          if isinstance(reagent_dict, dict) else None)

        rxn_result = apply_reaction(reaction_name, input_smiles, reagent_smiles)
        if not rxn_result.get("ok"):
            return {
                "ok": False,
                "error": rxn_result.get("error", "Reaction failed."),
                "input_smiles": input_smiles,
                "reaction_name": reaction_name,
            }

        products = rxn_result.get("products", [])
        if not products:
            return {
                "ok": False,
                "error": "Reaction produced no products.",
                "input_smiles": input_smiles,
                "reaction_name": reaction_name,
            }

        # Primary product is first; store remaining as alternatives
        output_smiles = products[0]["smiles"]
        alternative_products = products[1:]

    elif operation == "analyze":
        # ---- Analyze: reason about a molecule without modifying it ----
        # Functional group SMARTS (name → SMARTS pattern).
        _FG_SMARTS: List[tuple] = [
            # Halogens
            ("aryl fluoride", "[F][c]"),
            ("aryl chloride", "[Cl][c]"),
            ("aryl bromide", "[Br][c]"),
            ("aryl iodide", "[I][c]"),
            ("alkyl fluoride", "[F][CX4]"),
            ("alkyl chloride", "[Cl][CX4]"),
            ("alkyl bromide", "[Br][CX4]"),
            ("alkyl iodide", "[I][CX4]"),
            # Nitrogen
            ("primary amine", "[NH2][CX4]"),
            ("secondary amine", "[NH1]([CX4])[CX4]"),
            ("tertiary amine", "[NX3;!$(N=*)]([CX4])([CX4])[CX4]"),
            ("aromatic amine", "[NH2][c]"),
            ("amide", "[CX3](=[OX1])[NX3]"),
            ("sulfonamide", "[SX4](=[OX1])(=[OX1])[NX3]"),
            ("nitro", "[$([NX3](=O)=O),$([NX3+](=O)[O-])]"),
            ("nitrile", "[CX2]#[NX1]"),
            ("isocyanate", "[NX2]=[C]=[OX1]"),
            ("urea", "[NX3][CX3](=[OX1])[NX3]"),
            ("carbamate", "[NX3][CX3](=[OX1])[OX2]"),
            # Oxygen
            ("carboxylic acid", "[CX3](=[OX1])[OX2H1]"),
            ("ester", "[CX3](=[OX1])[OX2][CX4]"),
            ("ketone", "[CX3](=[OX1])[CX4]"),
            ("aldehyde", "[CX3H1](=[OX1])"),
            ("alcohol", "[OX2H][CX4]"),
            ("phenol", "[OX2H][c]"),
            ("ether", "[OX2]([CX4])[CX4]"),
            ("aryl ether", "[OX2]([c])[CX4,c]"),
            ("epoxide", "[C]1[O][C]1"),
            ("anhydride", "[CX3](=[OX1])[OX2][CX3](=[OX1])"),
            # Sulfur
            ("thiol", "[SX2H]"),
            ("thioether", "[SX2]([CX4])[CX4]"),
            ("sulfoxide", "[$([SX3]=O)]"),
            ("sulfone", "[$([SX4](=[OX1])(=[OX1]))]"),
            # Phosphorus
            ("phosphate", "[PX4](=[OX1])([OX2])([OX2])[OX2]"),
            ("phosphonic acid", "[PX4](=[OX1])([OX2H])([OX2H])"),
            # Boron
            ("boronic acid", "[BX3]([OX2H])[OX2H]"),
            ("boronate ester", "[BX3]([OX2])[OX2]"),
            # Heterocycles (aromatic)
            ("pyridine", "c1ccncc1"),
            # 1,3-diazine.  This entry previously reused the pyrazine
            # (1,4-diazine) pattern, so pyrimidines were never detected.
            ("pyrimidine", "c1cncnc1"),
            ("pyrazine", "c1cnccn1"),
            ("imidazole", "c1cnc[nH]1"),
            ("pyrazole", "c1cc[nH]n1"),
            ("triazole", "c1cn[nH]n1"),
            ("tetrazole", "c1nnn[nH]1"),
            ("oxazole", "c1cocn1"),
            ("thiazole", "c1cscn1"),
            ("indole", "c1ccc2[nH]ccc2c1"),
            # Benzo-fused imidazole (five-membered N-C-N ring).  The former
            # pattern described a six-membered diazine ring, i.e.
            # quinoxaline, which is now listed under its own name.
            ("benzimidazole", "c1nc2ccccc2[nH]1"),
            ("quinoxaline", "c1cnc2ccccc2n1"),
            ("quinoline", "c1ccc2ncccc2c1"),
            ("isoquinoline", "c1ccc2cnccc2c1"),
            ("piperidine", "[NH]1CCCCC1"),
            ("piperazine", "N1CCNCC1"),
            ("morpholine", "O1CCNCC1"),
            ("pyrrolidine", "[NH]1CCCC1"),
            ("azetidine", "[NH]1CCC1"),
            # Protected amines
            ("Boc-protected amine", "[NX3][CX3](=[OX1])OC(C)(C)C"),
            ("Cbz-protected amine", "[NX3][CX3](=[OX1])OCc1ccccc1"),
            ("Fmoc-protected amine", "[NX3][CX3](=[OX1])OCC1c2ccccc2-c2ccccc21"),
        ]

        # Compile the patterns, silently dropping any SMARTS RDKit cannot
        # parse (MolFromSmarts returns None for those).
        valid_fg_patterns: List[tuple] = []
        for fg_name, fg_smarts in _FG_SMARTS:
            patt = Chem.MolFromSmarts(fg_smarts)
            if patt is not None:
                valid_fg_patterns.append((fg_name, patt))

        # Detect functional groups (table order is preserved)
        functional_groups: List[str] = [
            fg_name for fg_name, patt in valid_fg_patterns
            if in_mol.HasSubstructMatch(patt)
        ]

        # Get canonical IUPAC name from ChemScript
        canonical_name = _smiles_to_name_cs(input_smiles) or ""

        # Get decomposition (alternatives + bracket tree).  Best-effort:
        # the decomposer is optional, so any failure leaves the defaults.
        alternative_names: List[str] = []
        bracket_tree_str: Optional[str] = None
        try:
            from cdxml_toolkit.naming.name_decomposer import decompose_name
            decomp = decompose_name(input_smiles)
            if decomp.alternatives:
                alternative_names = [a.name for a in decomp.alternatives
                                     if a.valid and a.name]
            if decomp.bracket_tree is not None:
                bracket_tree_str = decomp.canonical_name
            if not canonical_name and decomp.canonical_name:
                canonical_name = decomp.canonical_name
        except Exception:
            pass

        # Get prefix form (substituent name); best-effort as well
        prefix_form: Optional[str] = None
        try:
            pfx_result = get_prefix_form(canonical_name or input_smiles)
            if pfx_result.get("ok"):
                prefix_form = pfx_result["prefix"]
        except Exception:
            pass

        return {
            "ok": True,
            "input_smiles": input_smiles,
            "canonical_name": canonical_name,
            "alternative_names": alternative_names,
            "functional_groups": functional_groups,
            "prefix_form": prefix_form,
            "bracket_tree": bracket_tree_str,
            "formula": _compute_formula(input_smiles),
            "mw": _compute_mw(input_smiles),
        }

    elif operation == "set_name":
        # ---- Set name: resolve a new IUPAC name to SMILES, validate, diff ----
        new_name = kwargs.get("new_name", "")
        if not new_name:
            return {"ok": False, "error": "'new_name' is required for set_name."}

        # Try to resolve the name to SMILES
        output_smiles = _try_validate(new_name, use_network=True)
        if not output_smiles:
            # Also try resolve_to_smiles in case it's a common name
            r = resolve_to_smiles(new_name, use_network=True)
            if r.get("ok"):
                output_smiles = r["smiles"]

        if not output_smiles:
            return {
                "ok": False,
                "error": f"Could not resolve name '{new_name}' to a valid structure.",
                "input_smiles": input_smiles,
            }
        out_mol = Chem.MolFromSmiles(output_smiles)
        if out_mol is None:
            return {
                "ok": False,
                "error": f"Name '{new_name}' resolved but SMILES is invalid.",
                "input_smiles": input_smiles,
            }
        output_smiles = Chem.MolToSmiles(out_mol)

    else:
        return {
            "ok": False,
            "error": (f"Unknown operation '{operation}'. "
                      "Use 'analyze', 'name_surgery', 'smarts', "
                      "'set_smiles', 'set_name', or 'reaction'."),
        }

    # ---- Build output ----
    # A caller-supplied name is preferred over a ChemScript lookup.
    input_name = (mol_json.get("iupac_name")
                  or mol_json.get("name")
                  or _smiles_to_name_cs(input_smiles)
                  or "")
    output_name = _smiles_to_name_cs(output_smiles) or ""

    aligned_names = _build_aligned_names(input_smiles, output_smiles)
    diff = _build_mol_diff(input_smiles, output_smiles)

    formula = _compute_formula(output_smiles)
    mw_out = _compute_mw(output_smiles)

    result = {
        "ok": True,
        "input_smiles": input_smiles,
        "output_smiles": output_smiles,
        "input_name": input_name,
        "output_name": output_name,
        "aligned_names": aligned_names,
        "diff": diff,
        "formula": formula,
        "mw": mw_out,
    }
    if alternative_products:
        result["alternative_products"] = alternative_products
    return result
2991
+
2992
+
2993
+ # ---------------------------------------------------------------------------
2994
+ # Tool definitions for LLM function calling
2995
+ # ---------------------------------------------------------------------------
2996
+
2997
+ def get_tool_definitions() -> List[Dict[str, Any]]:
2998
+ """Return tool schemas suitable for LLM function calling (Claude/OpenAI).
2999
+
3000
+ Each tool definition follows the Anthropic tool-use format::
3001
+
3002
+ {"name": "...", "description": "...", "input_schema": {...}}
3003
+
3004
+ The LLM orchestrator should register these as available tools and
3005
+ call the corresponding Python functions based on the LLM's output.
3006
+
3007
+ Returns:
3008
+ List of tool definition dicts.
3009
+ """
3010
+ return [
3011
+ {
3012
+ "name": "resolve_compound",
3013
+ "description": (
3014
+ "Resolve a chemical identifier to a rich molecule descriptor "
3015
+ "with SMILES, molecular formula, MW, exact mass, IUPAC name, "
3016
+ "reagent role, display text, and IUPAC substituent prefix form. "
3017
+ "This is the preferred resolver — use it whenever you need more "
3018
+ "than just SMILES.\n\n"
3019
+ "Accepts common names, IUPAC names, abbreviations, condensed "
3020
+ "formulae, and CAS numbers. Resolution order:\n"
3021
+ " 1. Curated reagent DB (~186 entries with roles)\n"
3022
+ " 2. Generative condensed formula parser (offline)\n"
3023
+ " 3. ChemScript IUPAC name engine (offline)\n"
3024
+ " 4. PubChem API (online, if use_network=True)\n\n"
3025
+ "Output fields include:\n"
3026
+ " - smiles, formula, mw, exact_mass, iupac_name, source\n"
3027
+ " - role, display_text (from curated reagent DB if known)\n"
3028
+ " - prefix_form: IUPAC substituent prefix for use in "
3029
+ "assemble_name (e.g. 'trifluoromethyl' for CF3, 'morpholino' "
3030
+ "for morpholine); null if not a substituent group.\n\n"
3031
+ "Examples of valid queries:\n"
3032
+ ' - Common names: "aspirin", "morpholine", "HATU"\n'
3033
+ ' - Abbreviations: "Cs2CO3", "DIPEA", "Et3N"\n'
3034
+ ' - IUPAC names: "2-chloropyridine"\n'
3035
+ ' - Formulae: "PhB(OH)2", "CF3COOH"\n'
3036
+ ' - CAS numbers: "534-17-8"\n'
3037
+ ' - Drug names: "deucravacitinib"\n'
3038
+ ),
3039
+ "input_schema": {
3040
+ "type": "object",
3041
+ "properties": {
3042
+ "query": {
3043
+ "type": "string",
3044
+ "description": "Chemical identifier to resolve.",
3045
+ },
3046
+ "use_network": {
3047
+ "type": "boolean",
3048
+ "description": (
3049
+ "Allow PubChem lookup (default: true). "
3050
+ "Set false for offline-only resolution."
3051
+ ),
3052
+ },
3053
+ },
3054
+ "required": ["query"],
3055
+ },
3056
+ },
3057
+ {
3058
+ "name": "resolve_to_smiles",
3059
+ "description": (
3060
+ "Resolve a chemical identifier (name, abbreviation, formula, "
3061
+ "or CAS number) to a canonical SMILES string. Use this when "
3062
+ "you need only the SMILES; for richer output (formula, MW, "
3063
+ "exact mass, role) use resolve_compound instead.\n\n"
3064
+ "Examples of valid queries:\n"
3065
+ ' - Common names: "aspirin", "morpholine", "HATU"\n'
3066
+ ' - IUPAC names: "2-chloropyridine", "4-methylbenzoic acid"\n'
3067
+ ' - Formulae: "PhB(OH)2", "Et3N", "CF3COOH"\n'
3068
+ ' - CAS numbers: "534-17-8"\n'
3069
+ ),
3070
+ "input_schema": {
3071
+ "type": "object",
3072
+ "properties": {
3073
+ "query": {
3074
+ "type": "string",
3075
+ "description": "Chemical identifier to resolve.",
3076
+ },
3077
+ },
3078
+ "required": ["query"],
3079
+ },
3080
+ },
3081
+ {
3082
+ "name": "get_prefix_form",
3083
+ "description": (
3084
+ "Get the IUPAC substituent prefix form for a chemical group "
3085
+ "so it can be used in assemble_name. Returns the prefix "
3086
+ "string (e.g. 'trifluoromethyl' for 'CF3', 'morpholino' for "
3087
+ "'morpholine').\n\n"
3088
+ "Use this when you know what group to attach but need its "
3089
+ "correct IUPAC prefix name.\n\n"
3090
+ "Examples:\n"
3091
+ ' - "CF3" -> "trifluoromethyl"\n'
3092
+ ' - "NO2" -> "nitro"\n'
3093
+ ' - "OMe" -> "methoxy"\n'
3094
+ ' - "morpholine" -> "morpholino"\n'
3095
+ ' - "cyclopropane" -> "cyclopropyl"\n'
3096
+ ),
3097
+ "input_schema": {
3098
+ "type": "object",
3099
+ "properties": {
3100
+ "group": {
3101
+ "type": "string",
3102
+ "description": (
3103
+ "Group to look up: abbreviation ('CF3', 'OMe'), "
3104
+ "name ('morpholine'), or formula ('CHF2')."
3105
+ ),
3106
+ },
3107
+ },
3108
+ "required": ["group"],
3109
+ },
3110
+ },
3111
+ {
3112
+ "name": "assemble_name",
3113
+ "description": (
3114
+ "Build an IUPAC name from a parent ring/chain and a list of "
3115
+ "substituents. Handles alphabetical ordering and multiplying "
3116
+ "prefixes (di-, tri-) automatically. Validates the assembled "
3117
+ "name by resolving it to a structure.\n\n"
3118
+ "Example:\n"
3119
+ " parent: 'pyridine'\n"
3120
+ " substituents: [\n"
3121
+ ' {"locant": "2", "prefix": "chloro"},\n'
3122
+ ' {"locant": "5", "prefix": "trifluoromethyl"}\n'
3123
+ " ]\n"
3124
+ " -> '2-chloro-5-(trifluoromethyl)pyridine'\n"
3125
+ ),
3126
+ "input_schema": {
3127
+ "type": "object",
3128
+ "properties": {
3129
+ "parent": {
3130
+ "type": "string",
3131
+ "description": (
3132
+ "Parent ring or chain name "
3133
+ "(e.g. 'pyridine', 'benzene', 'pentane')."
3134
+ ),
3135
+ },
3136
+ "substituents": {
3137
+ "type": "array",
3138
+ "items": {
3139
+ "type": "object",
3140
+ "properties": {
3141
+ "locant": {
3142
+ "type": "string",
3143
+ "description": "Position number (e.g. '2', '3').",
3144
+ },
3145
+ "prefix": {
3146
+ "type": "string",
3147
+ "description": (
3148
+ "IUPAC prefix (e.g. 'chloro', 'methyl'). "
3149
+ "Use get_prefix_form first if unsure."
3150
+ ),
3151
+ },
3152
+ },
3153
+ "required": ["locant", "prefix"],
3154
+ },
3155
+ "description": "List of substituents with positions.",
3156
+ },
3157
+ },
3158
+ "required": ["parent", "substituents"],
3159
+ },
3160
+ },
3161
+ {
3162
+ "name": "modify_name",
3163
+ "description": (
3164
+ "Modify an existing IUPAC name by swapping, adding, or "
3165
+ "removing a substituent. The name is re-alphabetised and "
3166
+ "validated automatically.\n\n"
3167
+ "Operations:\n"
3168
+ " - 'swap': Replace target prefix with replacement.\n"
3169
+ " Example: swap 'nitro' -> 'amino' in '4-nitropyridine'\n"
3170
+ " - 'add': Insert replacement prefix at locant.\n"
3171
+ " Example: add 'methyl' at '3' to '2-chloropyridine'\n"
3172
+ " - 'remove': Delete the target prefix.\n"
3173
+ " Example: remove 'chloro' from '2-chloro-3-methylpyridine'\n"
3174
+ ),
3175
+ "input_schema": {
3176
+ "type": "object",
3177
+ "properties": {
3178
+ "name": {
3179
+ "type": "string",
3180
+ "description": "The IUPAC name to modify.",
3181
+ },
3182
+ "operation": {
3183
+ "type": "string",
3184
+ "enum": ["swap", "add", "remove"],
3185
+ "description": "Type of modification.",
3186
+ },
3187
+ "target": {
3188
+ "type": "string",
3189
+ "description": "Prefix to replace (swap) or remove (remove).",
3190
+ },
3191
+ "replacement": {
3192
+ "type": "string",
3193
+ "description": "New prefix (swap) or prefix to insert (add).",
3194
+ },
3195
+ "locant": {
3196
+ "type": "string",
3197
+ "description": "Position for insertion (add only).",
3198
+ },
3199
+ },
3200
+ "required": ["name", "operation"],
3201
+ },
3202
+ },
3203
+ {
3204
+ "name": "validate_name",
3205
+ "description": (
3206
+ "Check whether an IUPAC name is valid by attempting to "
3207
+ "resolve it to a molecular structure. Returns the canonical "
3208
+ "SMILES if valid. Use this to verify names before generating "
3209
+ "structures.\n\n"
3210
+ "Example:\n"
3211
+ ' "2-chloro-3-(trifluoromethyl)pyridine" -> valid, SMILES\n'
3212
+ ' "2-chloro-99-methylpyridine" -> invalid\n'
3213
+ ),
3214
+ "input_schema": {
3215
+ "type": "object",
3216
+ "properties": {
3217
+ "name": {
3218
+ "type": "string",
3219
+ "description": "IUPAC name to validate.",
3220
+ },
3221
+ },
3222
+ "required": ["name"],
3223
+ },
3224
+ },
3225
+ {
3226
+ "name": "name_to_structure",
3227
+ "description": (
3228
+ "Convert a validated chemical name to a structure file "
3229
+ "(CDXML for ChemDraw, or SMILES/MOL). This is the final "
3230
+ "step: call this after assembling and validating the name.\n\n"
3231
+ "Output formats:\n"
3232
+ ' - "cdxml": ChemDraw XML (requires ChemScript)\n'
3233
+ ' - "smiles": canonical SMILES string\n'
3234
+ ' - "mol": MDL MOL block with 2D coordinates\n'
3235
+ ),
3236
+ "input_schema": {
3237
+ "type": "object",
3238
+ "properties": {
3239
+ "name": {
3240
+ "type": "string",
3241
+ "description": "Chemical name to convert.",
3242
+ },
3243
+ "output_format": {
3244
+ "type": "string",
3245
+ "enum": ["cdxml", "smiles", "mol"],
3246
+ "description": "Output format (default: cdxml).",
3247
+ },
3248
+ },
3249
+ "required": ["name"],
3250
+ },
3251
+ },
3252
+ {
3253
+ "name": "enumerate_names",
3254
+ "description": (
3255
+ "List alternative IUPAC name forms for a molecule. "
3256
+ "Given a name or SMILES, returns the canonical name plus "
3257
+ "alternative forms that express the same molecule using "
3258
+ "different parent rings/chains and substituent prefixes.\n\n"
3259
+ "IMPORTANT: Call this BEFORE modify_name when doing name "
3260
+ "surgery on functional groups that appear as suffixes in "
3261
+ "the canonical name (ketones '-one', alcohols '-ol', "
3262
+ "amines '-amine', acids '-oic acid', etc.). The "
3263
+ "alternatives expose these groups as swappable prefixes.\n\n"
3264
+ "Example:\n"
3265
+ " '1-(4-bromophenyl)ethan-1-one' (ketone as suffix)\n"
3266
+ " -> alternatives include '1-acetyl-4-bromobenzene'\n"
3267
+ " where the ketone is now the prefix 'acetyl'\n"
3268
+ " -> you can then swap 'acetyl' for another prefix\n\n"
3269
+ "Each name form includes a 'prefixes' list showing "
3270
+ "which substituent prefixes are visible and swappable.\n"
3271
+ ),
3272
+ "input_schema": {
3273
+ "type": "object",
3274
+ "properties": {
3275
+ "identifier": {
3276
+ "type": "string",
3277
+ "description": (
3278
+ "Chemical name, SMILES, abbreviation, or any "
3279
+ "identifier. Will be resolved to a structure."
3280
+ ),
3281
+ },
3282
+ },
3283
+ "required": ["identifier"],
3284
+ },
3285
+ },
3286
+ # --- Layer 3: Graph manipulation tools ---
3287
+ {
3288
+ "name": "list_reactions",
3289
+ "description": (
3290
+ "List available named reaction templates. Returns the "
3291
+ "name, description, number of reactants, and typical "
3292
+ "conditions for each reaction. Call this to find the "
3293
+ "right template before using apply_reaction.\n\n"
3294
+ "Categories:\n"
3295
+ " - 'coupling': Suzuki, Buchwald, SNAr, amide, "
3296
+ "Sonogashira, Heck, N-alkylation\n"
3297
+ " - 'functional_group': nitro reduction, ester hydrolysis, "
3298
+ "alcohol oxidation, reductive amination, Grignard\n"
3299
+ " - 'heterocycle_formation': ~60 ring-forming reactions "
3300
+ "including Huisgen triazole, Fischer indole, Paal-Knorr "
3301
+ "pyrrole, Hantzsch pyridine/thiazole, benzimidazole, "
3302
+ "benzoxazole, Pictet-Spengler, Biginelli, and many more\n\n"
3303
+ "Use the optional category filter to narrow results.\n"
3304
+ ),
3305
+ "input_schema": {
3306
+ "type": "object",
3307
+ "properties": {
3308
+ "category": {
3309
+ "type": "string",
3310
+ "enum": [
3311
+ "coupling",
3312
+ "functional_group",
3313
+ "heterocycle_formation",
3314
+ ],
3315
+ "description": (
3316
+ "Optional: filter by category. "
3317
+ "Omit to list all reactions."
3318
+ ),
3319
+ },
3320
+ },
3321
+ "required": [],
3322
+ },
3323
+ },
3324
+ {
3325
+ "name": "apply_reaction",
3326
+ "description": (
3327
+ "Apply a named reaction template to a substrate molecule. "
3328
+ "For two-component reactions (e.g. Suzuki, Buchwald), "
3329
+ "provide both substrate and reagent SMILES. For single-"
3330
+ "component reactions (e.g. nitro reduction), only the "
3331
+ "substrate is needed.\n\n"
3332
+ "This tool covers ~70 reactions including:\n"
3333
+ " - Classic couplings (Suzuki, Buchwald, Heck, etc.)\n"
3334
+ " - Functional group transforms (reductions, oxidations)\n"
3335
+ " - Ring-forming heterocyclic reactions (Fischer indole, "
3336
+ "Huisgen triazole, Paal-Knorr pyrrole, Hantzsch thiazole, "
3337
+ "benzimidazole synthesis, Pictet-Spengler, etc.)\n\n"
3338
+ "Use list_reactions() to find the right template name.\n\n"
3339
+ "The substrate and reagent can be SMILES strings or "
3340
+ "chemical names/abbreviations (they will be resolved "
3341
+ "automatically).\n\n"
3342
+ "Returns the product SMILES, IUPAC name, and suggested "
3343
+ "reaction conditions.\n\n"
3344
+ "Examples:\n"
3345
+ ' - apply_reaction("nitro_reduction", '
3346
+ '"c1ccc([N+](=O)[O-])cc1")\n'
3347
+ ' - apply_reaction("suzuki_coupling", '
3348
+ '"c1ccc(Br)cc1", "c1ccc(B(O)O)cc1")\n'
3349
+ ),
3350
+ "input_schema": {
3351
+ "type": "object",
3352
+ "properties": {
3353
+ "reaction_name": {
3354
+ "type": "string",
3355
+ "description": (
3356
+ "Reaction template name from list_reactions "
3357
+ "(e.g. 'suzuki_coupling', 'nitro_reduction')."
3358
+ ),
3359
+ },
3360
+ "substrate": {
3361
+ "type": "string",
3362
+ "description": (
3363
+ "SMILES or name of the main substrate."
3364
+ ),
3365
+ },
3366
+ "reagent": {
3367
+ "type": "string",
3368
+ "description": (
3369
+ "SMILES or name of the coupling partner "
3370
+ "(for 2-reactant reactions only)."
3371
+ ),
3372
+ },
3373
+ },
3374
+ "required": ["reaction_name", "substrate"],
3375
+ },
3376
+ },
3377
+ {
3378
+ "name": "deprotect",
3379
+ "description": (
3380
+ "Remove common protecting groups from a molecule. "
3381
+ "Uses 25 built-in deprotection templates covering:\n"
3382
+ " Boc, Fmoc, Cbz (amines)\n"
3383
+ " TBS/TBDMS, THP, Bn, Ac, PMB, TMS (alcohols)\n"
3384
+ " Acetal/Ketal (carbonyls)\n\n"
3385
+ "Accepts SMILES or a chemical name. Returns the "
3386
+ "deprotected product and which PGs were removed.\n\n"
3387
+ "Example:\n"
3388
+ ' deprotect("O=C(OC(C)(C)C)Nc1ccccc1") # Boc-aniline\n'
3389
+ ' -> product: aniline, removed: [Boc]\n'
3390
+ ),
3391
+ "input_schema": {
3392
+ "type": "object",
3393
+ "properties": {
3394
+ "smiles": {
3395
+ "type": "string",
3396
+ "description": (
3397
+ "SMILES or chemical name of the "
3398
+ "protected molecule."
3399
+ ),
3400
+ },
3401
+ },
3402
+ "required": ["smiles"],
3403
+ },
3404
+ },
3405
+ # --- Reaction JSON summary ---
3406
+ {
3407
+ "name": "reaction_summary",
3408
+ "description": (
3409
+ "Load a reaction JSON file and return a slim summary "
3410
+ "with only the fields you need. Use this instead of "
3411
+ "reading the full JSON, which contains bulky geometry "
3412
+ "and mass data.\n\n"
3413
+ "Default fields (per species): id, name, role, "
3414
+ "role_detail, smiles, display_text, formula, mw.\n"
3415
+ "Default top-level: experiment, conditions.\n"
3416
+ "Default eln_data: product_yield, reaction_type.\n\n"
3417
+ "Request additional fields by name when needed:\n"
3418
+ " - LCMS: add species fields ['exact_mass', 'adducts']\n"
3419
+ " - Procedure: add species fields ['csv_mass', "
3420
+ "'csv_equiv', 'csv_volume'] and eln fields "
3421
+ "['procedure_plain', 'product_obtained', 'sm_mass']\n"
3422
+ " - Scheme drawing: defaults are sufficient\n"
3423
+ " - Pass ['*'] to any field list for all fields.\n\n"
3424
+ "Available species fields:\n"
3425
+ " id, name, role, role_detail, smiles, smiles_neutral, "
3426
+ "classification_method, is_sm, is_dp, is_substrate, "
3427
+ "is_solvent, exact_mass, exact_mass_full, mw, formula, "
3428
+ "adducts, source, source_id, csv_equiv, csv_mass, "
3429
+ "csv_name, csv_volume, csv_supplier, display_text, "
3430
+ "original_geometry\n\n"
3431
+ "Available top-level fields:\n"
3432
+ " version, experiment, input_files, reaction_smiles, "
3433
+ "reaction_class, reaction_name, "
3434
+ "classification_confidence, warnings, metadata, "
3435
+ "conditions\n\n"
3436
+ "Available eln_data fields:\n"
3437
+ " sm_mass, product_obtained, product_yield, "
3438
+ "procedure_text, procedure_plain, reaction_type, "
3439
+ "start_date, labbook_name, solvents, solvent_details\n"
3440
+ ),
3441
+ "input_schema": {
3442
+ "type": "object",
3443
+ "properties": {
3444
+ "json_path": {
3445
+ "type": "string",
3446
+ "description": "Path to the reaction JSON file.",
3447
+ },
3448
+ "species_fields": {
3449
+ "type": "array",
3450
+ "items": {"type": "string"},
3451
+ "description": (
3452
+ "Per-species fields to include. Omit for "
3453
+ "defaults. Pass ['*'] for all fields."
3454
+ ),
3455
+ },
3456
+ "top_fields": {
3457
+ "type": "array",
3458
+ "items": {"type": "string"},
3459
+ "description": (
3460
+ "Top-level fields to include. Omit for "
3461
+ "defaults. Pass ['*'] for all fields."
3462
+ ),
3463
+ },
3464
+ "eln_fields": {
3465
+ "type": "array",
3466
+ "items": {"type": "string"},
3467
+ "description": (
3468
+ "eln_data sub-fields to include. Omit for "
3469
+ "defaults. Pass ['*'] for all. Pass [] to "
3470
+ "omit eln_data entirely."
3471
+ ),
3472
+ },
3473
+ },
3474
+ "required": ["json_path"],
3475
+ },
3476
+ },
3477
+ # --- Single-molecule rendering ---
3478
+ {
3479
+ "name": "draw_molecule",
3480
+ "description": (
3481
+ "Render a single molecule structure to a standalone CDXML "
3482
+ "document (no arrow, no reaction scheme). The output opens "
3483
+ "directly in ChemDraw and uses ACS Document 1996 style.\n\n"
3484
+ "Input is a dict with at minimum a 'smiles' field. "
3485
+ "An optional label (compound name or custom text) is placed "
3486
+ "below the structure. Use the 'output_path' argument to "
3487
+ "write the CDXML to a file as well.\n\n"
3488
+ "Label priority: 'label' > 'name' > 'iupac_name'.\n\n"
3489
+ "Examples:\n"
3490
+ " draw_molecule({'smiles': 'CC(=O)Oc1ccccc1C(=O)O', "
3491
+ "'name': 'aspirin'})\n"
3492
+ " draw_molecule({'smiles': 'c1ccccc1'}, "
3493
+ "output_path='benzene.cdxml')\n"
3494
+ ),
3495
+ "input_schema": {
3496
+ "type": "object",
3497
+ "properties": {
3498
+ "mol_json": {
3499
+ "type": "object",
3500
+ "description": (
3501
+ "Molecule dict. Required key: 'smiles'. "
3502
+ "Optional display keys: 'label', 'name', "
3503
+ "'iupac_name'."
3504
+ ),
3505
+ "properties": {
3506
+ "smiles": {
3507
+ "type": "string",
3508
+ "description": "SMILES string of the molecule.",
3509
+ },
3510
+ "label": {
3511
+ "type": "string",
3512
+ "description": "Custom label shown below the structure.",
3513
+ },
3514
+ "name": {
3515
+ "type": "string",
3516
+ "description": "Compound name (used as label if 'label' not set).",
3517
+ },
3518
+ "iupac_name": {
3519
+ "type": "string",
3520
+ "description": "IUPAC name (used as label if 'name' not set).",
3521
+ },
3522
+ },
3523
+ "required": ["smiles"],
3524
+ },
3525
+ "output_path": {
3526
+ "type": "string",
3527
+ "description": (
3528
+ "Optional file path to write the CDXML to "
3529
+ "(e.g. 'molecule.cdxml'). The CDXML string is "
3530
+ "always returned in the response regardless."
3531
+ ),
3532
+ },
3533
+ },
3534
+ "required": ["mol_json"],
3535
+ },
3536
+ },
3537
+ # --- Molecular editor ---
3538
+ {
3539
+ "name": "modify_molecule",
3540
+ "description": (
3541
+ "Modify a molecule and verify the change with a structural "
3542
+ "diff. This is the premier tool for editing chemical "
3543
+ "structures with verification — like drawing in ChemDraw "
3544
+ "and visually checking the result.\n\n"
3545
+ "Input is a mol_json dict (with at minimum a 'smiles' key, "
3546
+ "e.g. from resolve_compound). The tool applies the "
3547
+ "requested operation, validates the result with RDKit, "
3548
+ "and returns the output molecule with:\n\n"
3549
+ " - aligned_names: side-by-side IUPAC name comparison "
3550
+ "(so you can see what changed in words)\n"
3551
+ " - diff.atoms_changed: MCS-based fragment diff "
3552
+ "(so you can see what atoms were added/removed/replaced)\n"
3553
+ " - diff.delta_formula / diff.delta_mw: formula and MW "
3554
+ "change numbers for sanity-checking\n\n"
3555
+ "Six operation modes:\n\n"
3556
+ " 'analyze' — DOES NOT modify the molecule. Returns "
3557
+ "a rich description: functional groups present, alternative "
3558
+ "IUPAC names from different perspectives, canonical name, "
3559
+ "bracket tree (hierarchical name decomposition), substituent "
3560
+ "prefix form, formula, and MW. Call this FIRST when you "
3561
+ "need to understand a molecule before deciding what surgery "
3562
+ "to do.\n\n"
3563
+ " 'set_smiles' — LLM provides the new SMILES directly. "
3564
+ "Tool validates it and computes the diff. Use when you "
3565
+ "already know the exact SMILES.\n\n"
3566
+ " 'set_name' — LLM provides an IUPAC or common name for "
3567
+ "the desired product. Tool resolves to SMILES and computes "
3568
+ "the diff. Use when you know the target molecule by name.\n\n"
3569
+ " 'smarts' — apply a SMARTS reaction transform. "
3570
+ "Provide either a 'smarts' reaction SMARTS string "
3571
+ "(e.g. '[c:1][F]>>[c:1][Cl]') or a 'reaction_name' from "
3572
+ "list_reactions(). Good for specific bond transformations.\n\n"
3573
+ " 'reaction' — apply a named reaction template via "
3574
+ "apply_reaction(). Provide 'reaction_name' (required) and "
3575
+ "optionally 'reagent' dict (with 'smiles' key) for binary "
3576
+ "reactions. Returns the primary product with the standard "
3577
+ "diff fields; additional products go in 'alternative_products'."
3578
+ " Use list_reactions() to find available template names.\n\n"
3579
+ " 'name_surgery' — modify via IUPAC name manipulation. "
3580
+ "Requires ChemScript. Provide 'add' "
3581
+ "(list of {locant, prefix} dicts) and/or 'remove' "
3582
+ "(list of prefix strings). Best for simple substituent "
3583
+ "swaps on drug-like molecules.\n\n"
3584
+ "Examples:\n"
3585
+ " # CD3 → benzyl swap\n"
3586
+ " modify_molecule({'smiles': '...'}, 'smarts',\n"
3587
+ " smarts='[C:1]([2H])([2H])[2H]>>[C:1]Cc1ccccc1')\n\n"
3588
+ " # Add fluoro at C3\n"
3589
+ " modify_molecule({'smiles': 'Clc1ccncc1'}, 'name_surgery',\n"
3590
+ " add=[{'locant': '3', 'prefix': 'fluoro'}])\n\n"
3591
+ " # Set explicit SMILES\n"
3592
+ " modify_molecule({'smiles': 'Clc1ccncc1'}, 'set_smiles',\n"
3593
+ " new_smiles='Clc1c(F)cncc1', "
3594
+ "description='fluoro at C3')\n"
3595
+ ),
3596
+ "input_schema": {
3597
+ "type": "object",
3598
+ "properties": {
3599
+ "mol_json": {
3600
+ "type": "object",
3601
+ "description": (
3602
+ "Source molecule dict. Required key: 'smiles'. "
3603
+ "Optional: 'name', 'iupac_name' (used as "
3604
+ "starting point for name_surgery)."
3605
+ ),
3606
+ "properties": {
3607
+ "smiles": {
3608
+ "type": "string",
3609
+ "description": "SMILES of the molecule to modify.",
3610
+ },
3611
+ "name": {
3612
+ "type": "string",
3613
+ "description": "Common name (optional).",
3614
+ },
3615
+ "iupac_name": {
3616
+ "type": "string",
3617
+ "description": (
3618
+ "IUPAC name (used as starting point for "
3619
+ "name_surgery if provided)."
3620
+ ),
3621
+ },
3622
+ },
3623
+ "required": ["smiles"],
3624
+ },
3625
+ "operation": {
3626
+ "type": "string",
3627
+ "enum": ["analyze", "name_surgery", "smarts", "set_smiles", "set_name", "reaction"],
3628
+ "description": (
3629
+ "Operation to apply. Use 'analyze' to inspect a "
3630
+ "molecule without modifying it; use 'name_surgery', "
3631
+ "'smarts', 'set_smiles', 'set_name', or 'reaction' to edit it."
3632
+ ),
3633
+ },
3634
+ "new_smiles": {
3635
+ "type": "string",
3636
+ "description": (
3637
+ "[set_smiles only] The new SMILES string. "
3638
+ "Will be validated with RDKit."
3639
+ ),
3640
+ },
3641
+ "new_name": {
3642
+ "type": "string",
3643
+ "description": (
3644
+ "[set_name only] An IUPAC or common name for "
3645
+ "the desired product. Will be resolved to "
3646
+ "SMILES and validated."
3647
+ ),
3648
+ },
3649
+ "description": {
3650
+ "type": "string",
3651
+ "description": (
3652
+ "[set_smiles/set_name] Optional description of "
3653
+ "the change (for logging/context)."
3654
+ ),
3655
+ },
3656
+ "smarts": {
3657
+ "type": "string",
3658
+ "description": (
3659
+ "[smarts only] Reaction SMARTS string. "
3660
+ "Use atom-map numbers for bond-order-preserving "
3661
+ "transforms, e.g. '[c:1][F]>>[c:1][Cl]'."
3662
+ ),
3663
+ },
3664
+ "reaction_name": {
3665
+ "type": "string",
3666
+ "description": (
3667
+ "[smarts, reaction] Named reaction from list_reactions(). "
3668
+ "For 'smarts': used as the SMARTS transform (alternative to "
3669
+ "providing 'smarts' directly). "
3670
+ "For 'reaction': required — selects the reaction template "
3671
+ "to apply via apply_reaction()."
3672
+ ),
3673
+ },
3674
+ "reagent": {
3675
+ "type": "object",
3676
+ "description": (
3677
+ "[reaction only] The coupling partner for binary reactions "
3678
+ "(e.g. amide_coupling, suzuki_coupling). "
3679
+ "Must contain at minimum a 'smiles' key."
3680
+ ),
3681
+ "properties": {
3682
+ "smiles": {
3683
+ "type": "string",
3684
+ "description": "SMILES of the reagent/coupling partner.",
3685
+ },
3686
+ },
3687
+ "required": ["smiles"],
3688
+ },
3689
+ "add": {
3690
+ "type": "array",
3691
+ "items": {
3692
+ "type": "object",
3693
+ "properties": {
3694
+ "locant": {
3695
+ "type": "string",
3696
+ "description": "Position number (e.g. '3').",
3697
+ },
3698
+ "prefix": {
3699
+ "type": "string",
3700
+ "description": "IUPAC prefix (e.g. 'fluoro', 'methyl').",
3701
+ },
3702
+ },
3703
+ "required": ["locant", "prefix"],
3704
+ },
3705
+ "description": (
3706
+ "[name_surgery only] Substituents to add. "
3707
+ "Each entry needs 'locant' and 'prefix'."
3708
+ ),
3709
+ },
3710
+ "remove": {
3711
+ "type": "array",
3712
+ "items": {"type": "string"},
3713
+ "description": (
3714
+ "[name_surgery only] List of IUPAC prefix "
3715
+ "strings to remove (e.g. ['chloro', 'methyl'])."
3716
+ ),
3717
+ },
3718
+ },
3719
+ "required": ["mol_json", "operation"],
3720
+ },
3721
+ },
3722
+ ]