cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1567 @@
1
+ """cdxml-toolkit MCP server — 15 chemistry tools via Model Context Protocol.
2
+
3
+ Tools:
4
+ resolve_name — Chemical name/formula/CAS → rich molecule JSON
5
+ modify_molecule — Analyze or transform a molecule (6 operations)
6
+ draw_molecule — Single molecule → CDXML document
7
+ render_scheme — YAML/compact text/reaction JSON → publication CDXML
8
+ parse_reaction — CDXML/CDX/CSV/RXN → semantic reaction JSON
9
+ summarize_reaction — Reaction JSON → compact context-efficient summary
10
+ extract_structures — Image/PDF → SMILES + bboxes via DECIMER
11
+ parse_scheme — CDXML scheme → structured species/steps/topology JSON
12
+ convert_cdx_cdxml — Bidirectional CDX ↔ CDXML file conversion
13
+ parse_analysis_file — LCMS/NMR PDF → peaks and data
14
+ format_lab_entry — Entry dicts → formatted lab book text
15
+ extract_cdxml_from_office — PPTX/DOCX → embedded CDXML files
16
+ embed_cdxml_in_office — CDXML → editable OLE object in PPTX/DOCX
17
+ search_compound — SMILES → exact/similar matches across experiments
18
+ render_to_png — CDXML → PNG via ChemDraw COM
19
+
20
+ Run:
21
+ python -m cdxml_toolkit.mcp_server # stdio (default)
22
+ python -m cdxml_toolkit.mcp_server --transport http # streamable-http
23
+ python -m cdxml_toolkit.mcp_server --transport http --port 8080
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import argparse
29
+ import json
30
+ import os
31
+ from pathlib import Path
32
+ from typing import Any, Dict, List, Optional, Union
33
+
34
+ from mcp.server.fastmcp import FastMCP
35
+ import functools
36
+
37
+ import asyncio
38
+ import inspect
39
+
40
def _json_safe_tool(fn):
    """Wrap *fn* so dict/list results come back as JSON text.

    Any other return value (strings, None, ...) is passed through untouched.
    Coroutine functions get an ``async`` wrapper and plain functions a
    synchronous one, so the wrapped callable keeps the calling convention
    of the original.
    """

    def _as_text(value):
        # Only containers are serialised; default=str stringifies anything
        # the json module cannot encode natively.
        if not isinstance(value, (dict, list)):
            return value
        return json.dumps(value, indent=2, default=str)

    if not inspect.iscoroutinefunction(fn):
        @functools.wraps(fn)
        def sync_call(*args, **kwargs):
            return _as_text(fn(*args, **kwargs))

        return sync_call

    @functools.wraps(fn)
    async def async_call(*args, **kwargs):
        return _as_text(await fn(*args, **kwargs))

    return async_call
59
+
60
# Guidance text handed to FastMCP as the server's `instructions`.
_SERVER_INSTRUCTIONS = " ".join((
    "Chemistry toolkit for parsing, rendering, and reasoning about reaction schemes.",
    "Core workflow: parse_reaction → summarize_reaction (inspect) → render_scheme.",
    "For structure work: resolve_name → modify_molecule → draw_molecule → render_scheme.",
    "Convention: atom-contributing species (reactants, products, key reagents) get drawn "
    "structures; text-only labels are for reagents/conditions.",
    "Use (IUPAC name + MW) as "
    "compound identifiers, not LLM-generated SMILES — always resolve names first.",
))

# Server instance that every @mcp.tool() registration in this module attaches to.
mcp = FastMCP("cdxml-toolkit", instructions=_SERVER_INSTRUCTIONS)
72
+
73
# Replace mcp.tool with a variant that passes each tool function through
# _json_safe_tool before registering it. The original bound method is kept
# so the replacement can delegate to it.
_original_mcp_tool_decorator = mcp.tool


def _safe_mcp_tool(*args, **kwargs):
    """Drop-in for ``mcp.tool`` that registers a JSON-safe version of the tool.

    Arguments are forwarded unchanged to the original ``mcp.tool``; the
    decorator it returns is then applied to ``_json_safe_tool(fn)`` rather
    than to ``fn`` itself, so dict/list results reach the client as strings.
    """
    register = _original_mcp_tool_decorator(*args, **kwargs)
    return lambda fn: register(_json_safe_tool(fn))


mcp.tool = _safe_mcp_tool
84
+
85
+
86
+ # ---------------------------------------------------------------------------
87
+ # Helpers
88
+ # ---------------------------------------------------------------------------
89
+
90
# Directory for auto-generated output files: the first of $TEMP / $TMP that is
# set in the environment, otherwise /tmp, plus a toolkit-specific subfolder.
_TEMP_DIR = os.path.join(
    next(
        (os.environ[var] for var in ("TEMP", "TMP") if var in os.environ),
        "/tmp",
    ),
    "cdxml-toolkit-output",
)
92
+
93
+
94
def _write_output(content, output_path: Optional[str], prefix: str, ext: str) -> dict:
    """Write *content* to a file and return metadata describing that file.

    Tool functions call this so that large payloads are handed back to the
    agent as a path instead of inline text.

    Args:
        content: A string is written verbatim as UTF-8 text; any other value
            is serialised with ``json.dump`` (``indent=2``, ``default=str``).
        output_path: Destination file. When ``None``, a new file named
            ``<prefix>_<8 hex chars><ext>`` is created under ``_TEMP_DIR``.
        prefix: Filename prefix used for auto-generated paths.
        ext: Filename extension (including the dot) for auto-generated paths.

    Returns:
        ``{"ok": True, "output_path": <path>, "size": <bytes on disk>}``.
    """
    if output_path is None:
        import uuid

        os.makedirs(_TEMP_DIR, exist_ok=True)
        # Random tag instead of a hash of the wall clock: two calls that see
        # the same clock reading would otherwise get the same filename and
        # the second would silently overwrite the first.
        tag = uuid.uuid4().hex[:8]
        output_path = os.path.join(_TEMP_DIR, f"{prefix}_{tag}{ext}")

    # dirname is "" for a bare filename; "." keeps makedirs happy in that case.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        if isinstance(content, str):
            f.write(content)
        else:
            json.dump(content, f, indent=2, default=str)

    return {"ok": True, "output_path": output_path, "size": os.path.getsize(output_path)}
115
+
116
+
117
def _coerce_mol_json(mol_json):
    """Wrap a bare SMILES string as {"smiles": ...}; return anything else as-is."""
    return {"smiles": mol_json} if isinstance(mol_json, str) else mol_json
122
+
123
+
124
def _parse_json_string(value):
    """Decode *value* when it is a string holding a JSON array or object.

    Non-strings are returned unchanged. Strings are stripped first; if the
    stripped text starts with "[" or "{" and parses as JSON, the parsed value
    is returned, otherwise the stripped text is.
    """
    if not isinstance(value, str):
        return value

    text = value.strip()
    if text.startswith(("[", "{")):
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            # Looked like JSON but is not — fall back to the stripped text.
            pass
    return text
134
+
135
+
136
# Known modify_molecule operations
_KNOWN_OPERATIONS = {"analyze", "name_surgery", "smarts", "set_smiles", "set_name", "reaction"}


def _normalize_operation(operation: str, reaction_name: Optional[str]) -> tuple:
    """Normalize the operation string for modify_molecule.

    Handles:
    - Case, whitespace and hyphens: "Set-SMILES" → "set_smiles".
    - If *operation* isn't one of the known ops, no *reaction_name* was
      given, and the text matches a reaction template (exactly, then by
      substring), redirect to operation="reaction",
      reaction_name=<matched template name>.

    Args:
        operation: Operation text as supplied by the caller.
        reaction_name: Explicit template name, or None.

    Returns:
        (operation, reaction_name). When nothing matches, the stripped input
        and the original *reaction_name* are returned unchanged.
    """

    def _canon(text: str) -> str:
        return text.lower().replace(" ", "_").replace("-", "_")

    op = operation.strip()
    op_key = _canon(op)
    if op_key in _KNOWN_OPERATIONS:
        return op_key, reaction_name

    # An explicit reaction_name is never overridden. A blank operation must
    # not reach the substring match: "" is contained in every template name
    # and would silently select the first template.
    if reaction_name is not None or not op_key:
        return op, reaction_name

    try:
        # Imported lazily; only needed for unrecognised operations.
        from cdxml_toolkit.naming.mol_builder import list_reactions

        listing = list_reactions()
        entries = listing.get("reactions", []) if isinstance(listing, dict) else []
        names = [e["name"] if isinstance(e, dict) else e for e in entries]
    except Exception:
        # Deliberately best-effort: if the template list cannot be loaded,
        # hand the operation back unchanged and let the caller reject it.
        return op, reaction_name

    # Template names are normalised the same way as the operation so that
    # spaces, hyphens and underscores are interchangeable in both passes.
    candidates = [(n, _canon(n)) for n in names if isinstance(n, str) and n.strip()]

    for name, key in candidates:
        if op_key == key:
            return "reaction", name
    for name, key in candidates:
        if op_key in key or key in op_key:
            return "reaction", name

    return op, reaction_name
176
+
177
+
178
def _validate_file(path: str, label: str) -> Path:
    """Resolve *path* and check that it points to an existing file.

    Args:
        path: File path as supplied by the caller.
        label: Human-readable description of the expected file (for example
            "CDXML file"); used in the error message.

    Returns:
        The resolved ``Path``.

    Raises:
        FileNotFoundError: If the path does not exist or is not a file.
    """
    p = Path(path).resolve()
    if p.is_file():
        return p

    what = label or "File"
    if p.exists():
        # Present but not a file (e.g. a directory): say so instead of
        # claiming the path is missing.
        raise FileNotFoundError(
            f"{what} is not a file: {p}. Check the path points to a file, not a directory."
        )
    raise FileNotFoundError(
        f"{what} not found: {p}. Check the path exists and use forward slashes."
    )
186
+
187
+
188
+ # ---------------------------------------------------------------------------
189
+ # Tool 1: resolve_name
190
+ # ---------------------------------------------------------------------------
191
+
192
@mcp.tool()
def resolve_name(query: str, use_network: bool = True) -> dict:
    """Resolve any chemical identifier to a rich molecule descriptor.

    Converts a name, abbreviation, condensed formula, or CAS number into a
    structured molecule dict with SMILES, formula, MW, exact mass, IUPAC name,
    reagent role, and display text. Uses a 4-tier resolution chain: curated
    reagent DB → condensed formula parser → ChemScript IUPAC → PubChem.

    Do NOT hand-construct SMILES — use this tool instead. The returned dict can
    be passed directly to modify_molecule, draw_molecule, or used to build a
    render_scheme input.

    Args:
        query: Chemical identifier — common name, IUPAC name, abbreviation,
            condensed formula (e.g. "PhB(OH)2"), or CAS number. Examples:
            "aspirin", "Cs2CO3", "2-chloropyridine", "534-17-8", "Et3N".
        use_network: Allow PubChem lookup (requires internet). Default True.

    Returns:
        Dict with keys: ok, name, smiles, formula, mw, exact_mass, iupac_name,
        source (which tier resolved it), role (if in reagent DB), display_text,
        prefix_form (IUPAC substituent prefix, if applicable).
        Returns {ok: False, error: "..."} if unresolvable.
    """
    # NOTE: the docstring above is published as the tool description, so it
    # is part of the tool's runtime surface.

    # A missing or whitespace-only query gets a usage reminder instead of
    # being forwarded to the resolver.
    if not (query and query.strip()):
        usage_lines = [
            "Usage: resolve_name(query='compound name or formula')",
            "Examples:",
            " resolve_name(query='aspirin')",
            " resolve_name(query='CF3')",
            " resolve_name(query='Cs2CO3')",
            " resolve_name(query='deucravacitinib')",
            " resolve_name(query='534-17-8') # CAS number",
            "Returns: {ok, name, smiles, formula, mw, exact_mass, role, display_text, source}",
        ]
        return "\n".join(usage_lines)

    # Imported on demand so that importing this module stays cheap.
    from cdxml_toolkit.naming.mol_builder import resolve_compound as _resolve

    return _resolve(query, use_network=use_network)
232
+
233
+
234
+ # ---------------------------------------------------------------------------
235
+ # Tool 2: modify_molecule
236
+ # ---------------------------------------------------------------------------
237
+
238
@mcp.tool()
def modify_molecule(
    mol_json: dict,
    operation: str,
    add: Optional[List[dict]] = None,
    remove: Optional[List[str]] = None,
    new_smiles: Optional[str] = None,
    new_name: Optional[str] = None,
    reaction_name: Optional[str] = None,
    reagent: Optional[dict] = None,
    smarts: Optional[str] = None,
    description: Optional[str] = None,
) -> dict:
    """Analyze or transform a molecule with structural verification.

    Takes a molecule dict (at minimum {"smiles": "..."}) and applies one of 6
    operations. Returns the modified molecule with a structural diff so you can
    verify the change was correct. This is the primary molecular editing tool.

    Do NOT set new_smiles to a hallucinated SMILES — only use "set_smiles" if
    you have a validated SMILES from resolve_name or apply_reaction output.

    Operations:
        "analyze" — Inspect without modifying: functional groups, IUPAC
            names, formula, MW, prefix form. No extra kwargs needed.
        "name_surgery" — Modify via IUPAC name: add/remove substituents.
            Pass add=[{"locant": "2", "prefix": "fluoro"}] and/or
            remove=["methyl"] kwargs.
        "smarts" — Apply a SMARTS reaction transform. Pass smarts=
            "reaction SMARTS" (e.g. "[c:1][F]>>[c:1][Cl]") or
            reaction_name= from list_reactions output.
        "set_smiles" — Accept a validated SMILES. Pass new_smiles= (validated
            by RDKit) and optional description= for context.
        "set_name" — Set the display name. Pass new_name=.
        "reaction" — Apply a named template from list_reactions. Pass
            reaction_name= and optionally reagent={"smiles": ...}
            for binary reactions (coupling, etc.).

    Args:
        mol_json: Source molecule dict with at least {"smiles": "..."}.
        operation: One of: "analyze", "name_surgery", "smarts", "set_smiles",
            "set_name", "reaction".
        add: For "name_surgery" — list of {"locant": str, "prefix": str} dicts.
        remove: For "name_surgery" — list of prefix strings to remove.
        new_smiles: For "set_smiles" — validated SMILES string.
        new_name: For "set_name" — new display name string.
        reaction_name: For "smarts"/"reaction" — template name.
        reagent: For "reaction" — dict with "smiles" key for the second reagent.
        smarts: For "smarts" — reaction SMARTS string.
        description: For "set_smiles" — optional context note.

    Returns:
        For "analyze": ok, input_smiles, canonical_name, alternative_names,
        functional_groups, prefix_form, bracket_tree, formula, mw.
        For modifications: ok, input_smiles, output_smiles, input_name,
        output_name, aligned_names, diff (atoms_added, atoms_removed,
        atoms_changed, mcs_smarts, delta_formula, delta_mw), formula, mw.
    """
    # NOTE: the docstring above is published as the tool description, so it
    # is part of the tool's runtime surface.

    # A bare SMILES string is accepted in place of the molecule dict.
    mol_json = _coerce_mol_json(mol_json)

    if not (mol_json and mol_json.get("smiles")):
        usage_lines = [
            "Usage: modify_molecule(mol_json={smiles: '...'}, operation='...')",
            "Operations: analyze, name_surgery, smarts, set_smiles, set_name, reaction",
            "Examples:",
            " analyze: modify_molecule({smiles:'Clc1ccncc1'}, 'analyze')",
            " name_surgery: modify_molecule({smiles:'...'}, 'name_surgery', add=[{locant:'4', prefix:'fluoro'}])",
            " reaction: modify_molecule({smiles:'...'}, 'reaction', reaction_name='amide_coupling', reagent={smiles:'...'})",
            " set_smiles: modify_molecule({smiles:'...'}, 'set_smiles', new_smiles='validated_smiles')",
            "Tip: get mol_json from resolve_name first, not hand-written SMILES.",
        ]
        return "\n".join(usage_lines)

    # Map free-form operation text (e.g. "BOC deprotection") onto a known
    # operation, possibly filling in reaction_name.
    operation, reaction_name = _normalize_operation(operation, reaction_name)

    from cdxml_toolkit.naming.mol_builder import modify_molecule as _modify

    # add/remove may arrive as stringified JSON ("[{...}]"); decode those.
    # Only arguments the caller actually supplied are forwarded.
    optional_args = (
        ("add", None if add is None else _parse_json_string(add)),
        ("remove", None if remove is None else _parse_json_string(remove)),
        ("new_smiles", new_smiles),
        ("new_name", new_name),
        ("reaction_name", reaction_name),
        ("reagent", None if reagent is None else _coerce_mol_json(reagent)),
        ("smarts", smarts),
        ("description", description),
    )
    kwargs: Dict[str, Any] = {
        key: value for key, value in optional_args if value is not None
    }

    return _modify(mol_json, operation, **kwargs)
341
+
342
+
343
+ # ---------------------------------------------------------------------------
344
+ # Tool 3: draw_molecule
345
+ # ---------------------------------------------------------------------------
346
+
347
@mcp.tool()
def draw_molecule(mol_json: dict, output_path: Optional[str] = None) -> dict:
    """Render a single molecule to a standalone CDXML file.

    Takes a molecule dict (at minimum {"smiles": "..."}) and generates a
    self-contained CDXML document with 2D coordinates in ACS Document 1996 style
    (BondLength=14.40, Arial 10pt). Optionally places a text label below the
    structure using the "label", "name", or "iupac_name" field (in that order).

    The CDXML is always written to a file and never returned inline, so the
    response stays small. Open the returned output_path in ChemDraw, or use
    this for quick structure visualisation before adding to a scheme.

    Args:
        mol_json: Molecule dict with at least {"smiles": "..."}. Optional display
            fields: "label" (used verbatim), "name", "iupac_name".
            Typically the output of resolve_name or modify_molecule.
        output_path: File path to write the CDXML to. If omitted, a temp file
            is created and its path is returned.

    Returns:
        Dict with keys: ok, output_path, size (bytes), name, smiles.
        If rendering does not succeed, the renderer's own result is returned
        unchanged (expected shape: {ok: False, error: "..."}).
    """
    # NOTE: the docstring above is published as the tool description; it was
    # corrected because it used to promise an inline "cdxml" key that this
    # function never returns.

    # A bare SMILES string is accepted in place of the molecule dict.
    mol_json = _coerce_mol_json(mol_json)

    # Anything that is not a dict with a SMILES gets the usage text rather
    # than an AttributeError from .get().
    if not isinstance(mol_json, dict) or not mol_json.get("smiles"):
        return (
            "Usage: draw_molecule(mol_json={smiles:'...'}, output_path='path.cdxml')\n"
            "Examples:\n"
            " draw_molecule({smiles:'c1ccncc1'}) # pyridine, written to a temp file\n"
            " draw_molecule({smiles:'c1ccncc1', label:'Pyridine'}, output_path='out.cdxml')\n"
            "Returns: {ok, output_path, size, name, smiles}\n"
            "Tip: get mol_json from resolve_name — do not hand-write SMILES."
        )

    from cdxml_toolkit.naming.mol_builder import draw_molecule as _draw

    result = _draw(mol_json, output_path=output_path)

    # Always write to file — don't return large CDXML inline
    if isinstance(result, dict) and result.get("ok") and "cdxml" in result:
        # Prefer the path reported by the renderer, then the caller's path;
        # _write_output picks a temp path when both are missing.
        path = result.get("output_path") or output_path
        out = _write_output(result["cdxml"], path, "molecule", ".cdxml")
        out["name"] = mol_json.get("name") or mol_json.get("iupac_name") or ""
        out["smiles"] = mol_json.get("smiles", "")
        return out

    return result
396
+
397
+
398
+ # ---------------------------------------------------------------------------
399
+ # Tool 4: render_scheme
400
+ # ---------------------------------------------------------------------------
401
+
402
def _scheme_error(stage: str, err: Exception, hint: str) -> dict:
    """Build the {ok: False, error: ...} payload render_scheme returns on failure."""
    return {"ok": False, "error": f"{stage}: {err}. {hint}"}


@mcp.tool()
def render_scheme(
    yaml_text: Optional[str] = None,
    compact_text: Optional[str] = None,
    json_path: Optional[str] = None,
    layout: str = "auto",
    output_path: Optional[str] = None,
) -> str:
    """Render a chemical reaction scheme to publication-ready CDXML.

    Accepts exactly ONE of: yaml_text, compact_text, or json_path.
    Call with NO arguments to see the full YAML schema reference.

    - yaml_text: Write YAML with structures (ID + SMILES) and steps
      (substrates, products, above/below arrow).
    - compact_text: 'ID: {SMILES}' definitions, then 'A + B --> C' reaction
      lines with optional 'above:' and 'below:' annotation lines.
    - json_path: Path to reaction JSON from parse_reaction (auto-layout).

    Convention: ONE substrate on center line per step. Additional reagents
    go in above_arrow (structures or text). This shares intermediates
    between sequential steps.

    The CDXML is always written to a file and never returned inline.

    Args:
        yaml_text: YAML scheme descriptor string.
        compact_text: Compact DSL syntax string.
        json_path: Path to a reaction JSON file.
        layout: Layout for json_path: "auto", "landscape", "portrait".
        output_path: File path to write the CDXML to. If omitted, a temp
            file is created and its path is returned.

    Returns:
        {ok, output_path, size} on success, {ok: False, error: "..."} when
        parsing or rendering fails, or the YAML schema reference if called
        with no arguments.
    """
    # NOTE: the docstring above is published as the tool description; it was
    # corrected because it used to promise a raw CDXML string when
    # output_path is omitted, which this function never returns.

    modes = sum(x is not None for x in (yaml_text, compact_text, json_path))
    if modes == 0:
        # The YAML below is indented so that the nesting of the schema is
        # visible to the reader.
        return (
            "No input provided. Call with exactly one of: yaml_text, compact_text, or json_path.\n\n"
            "YAML SCHEMA REFERENCE:\n"
            "  layout: sequential # linear | sequential | stacked-rows\n"
            "  source: reaction.json # optional — resolve SMILES by ID from this JSON\n"
            "  structures:\n"
            "    SM:\n"
            "      smiles: \"CCO\" # required unless source JSON provides it\n"
            "      label: \"Ethanol\" # optional label below structure\n"
            "    Product:\n"
            "      smiles: \"CC=O\"\n"
            "  steps:\n"
            "    - substrates: [SM]\n"
            "      products: [Product]\n"
            "      above_arrow:\n"
            "        structures: [Reagent] # drawn structures above arrow\n"
            "        text: [\"PCC (1.5 eq)\"] # text labels above arrow\n"
            "      below_arrow:\n"
            "        text: [\"DCM, rt, 2 h\"]\n\n"
            "For stacked-rows (independent reactions), use sections:\n"
            "  sections:\n"
            "    - label: \"(i)\"\n"
            "      steps: [{substrates: [A], products: [B], ...}]\n"
            "    - label: \"(ii)\"\n"
            "      steps: [{substrates: [C], products: [D], ...}]\n\n"
            "Convention: ONE substrate on center line per step. Reagents go in above_arrow.\n"
            "output_path: optional — write CDXML to file and get {ok, output_path, size} back."
        )
    if modes > 1:
        raise ValueError("Provide only ONE of: yaml_text, compact_text, or json_path.")

    # Imported only once there is something to render, so the no-argument
    # help above does not depend on the renderer importing cleanly.
    from cdxml_toolkit.render.renderer import render

    if json_path is not None:
        # json_path mode — auto-generate YAML then render
        import yaml

        from cdxml_toolkit.render.parser import parse_yaml
        from cdxml_toolkit.render.scheme_yaml_writer import build_scheme_yaml_dict

        # A missing file raises here; it is not converted to an error dict.
        p = _validate_file(json_path, "JSON file")
        try:
            yaml_dict = build_scheme_yaml_dict(str(p), layout=layout, include_run_arrows=True)
            yaml_str = yaml.dump(
                yaml_dict, default_flow_style=False, allow_unicode=True, sort_keys=False
            )
            cdxml = render(parse_yaml(yaml_str))
        except Exception as e:
            return _scheme_error(
                "Render from JSON failed",
                e,
                "Ensure the JSON file was produced by parse_reaction and is not corrupted. "
                "Try summarize_reaction on it first to verify its contents.",
            )
        return _write_output(cdxml, output_path, "scheme", ".cdxml")

    # Text modes differ only in the parser and in the hints attached to errors.
    if yaml_text is not None:
        from cdxml_toolkit.render.parser import parse_yaml as parse_text

        source_text = yaml_text
        parse_stage = "YAML parse error"
        parse_hint = (
            "Correct format: structures: {ID: {smiles: '...'}}, "
            "steps: [{substrates: [ID], products: [ID]}]"
        )
        render_hint = (
            "Check that all structure IDs referenced in steps exist in the structures block "
            "and that each SMILES was obtained from resolve_name, not hand-written."
        )
    else:
        from cdxml_toolkit.render.compact_parser import parse_compact as parse_text

        source_text = compact_text
        parse_stage = "Compact text parse error"
        parse_hint = (
            "Correct format: 'ID: {SMILES}' definitions, then 'A + B --> C' reaction lines "
            "with optional 'above:' and 'below:' annotation lines."
        )
        render_hint = (
            "Check that all structure IDs referenced in the reaction line are defined above "
            "and that SMILES strings came from resolve_name."
        )

    # Parsing and rendering are guarded separately so the error names the
    # stage that failed.
    try:
        scheme = parse_text(source_text)
    except Exception as e:
        return _scheme_error(parse_stage, e, parse_hint)
    try:
        cdxml = render(scheme)
    except Exception as e:
        return _scheme_error("Render failed", e, render_hint)

    return _write_output(cdxml, output_path, "scheme", ".cdxml")
547
+
548
+
549
+ # ---------------------------------------------------------------------------
550
+ # Tool 5: parse_reaction
551
+ # ---------------------------------------------------------------------------
552
+
553
def _discover_reaction_files(directory: Path) -> Dict[str, str]:
    """Pick one input file per supported extension from *directory*.

    Entries are visited in sorted order and the first regular file found for
    each of .cdxml / .cdx / .csv / .rxn wins. A .cdx file is only kept when
    the directory contains no .cdxml file.
    """
    key_for_ext = {".cdxml": "cdxml", ".cdx": "cdx", ".csv": "csv", ".rxn": "rxn"}
    found: Dict[str, str] = {}
    for entry in sorted(directory.iterdir()):
        key = key_for_ext.get(entry.suffix.lower())
        if key is not None and entry.is_file():
            found.setdefault(key, str(entry))
    # Decided after the scan: "x.cdx" sorts before "x.cdxml", so checking
    # during the loop would never see the .cdxml in time.
    if "cdxml" in found:
        found.pop("cdx", None)
    return found


@mcp.tool()
def parse_reaction(
    cdxml: Optional[str] = None,
    cdx: Optional[str] = None,
    csv: Optional[str] = None,
    rxn: Optional[str] = None,
    input_dir: Optional[str] = None,
    output_path: Optional[str] = None,
) -> dict:
    """Parse reaction files into a semantic JSON descriptor.

    Extracts every species with canonical SMILES, role classification (using
    Schneider fingerprint scoring for reactant/reagent binary, plus curated
    database for semantic roles like base/solvent/catalyst), display names,
    equivalents, mass data, and adducts. Produces a single JSON source of truth
    suitable for summarize_reaction, render_scheme, or LCMS analysis.

    Provide at least one file path. Multiple may be combined (e.g. cdxml + csv)
    to merge structural data with ELN metadata.

    The descriptor is always written to a JSON file and never returned inline;
    pass the returned output_path to summarize_reaction or render_scheme.

    Args:
        cdxml: Path to a .cdxml reaction file.
        cdx: Path to a .cdx reaction file (converted internally).
        csv: Path to a Findmolecule ELN CSV export.
        rxn: Path to a .rxn file.
        input_dir: Directory containing experiment files. When no explicit
            file is given, the first .cdxml (or .cdx if there is no .cdxml),
            .csv and .rxn file in the directory are used.
        output_path: File path to write the result JSON to. If omitted, a
            temp file is created and its path is returned.

    Returns:
        {ok, output_path, size} on success. The file holds the reaction
        descriptor with keys: version, experiment, input_files,
        reaction_smiles, reaction_class, species (list with role, smiles,
        formula, mw, etc.), conditions, and eln_data.
        Returns {ok: False, error: "..."} if parsing fails.
    """
    # NOTE: the docstring above is published as the tool description; it was
    # corrected because it used to promise an inline descriptor dict, which
    # this function never returns.

    if not any([cdxml, cdx, csv, rxn, input_dir]):
        return (
            "Usage: parse_reaction(cdxml='path.cdxml', csv='path.csv')\n"
            "Provide at least one input file; cdxml+csv is the most common combination.\n"
            "Examples:\n"
            " parse_reaction(cdxml='experiment.cdxml')\n"
            " parse_reaction(cdxml='experiment.cdxml', csv='experiment.csv')\n"
            " parse_reaction(input_dir='experiments/KL-CC-001/') # auto-discovers files\n"
            " parse_reaction(cdxml='experiment.cdxml', output_path='reaction.json') # choose the output file\n"
            "Returns: {ok, output_path, size}; the reaction descriptor JSON "
            "(species, roles, SMILES, conditions) is written to output_path."
        )

    from cdxml_toolkit.perception.reaction_parser import parse_reaction as _parse

    # Explicit file arguments are validated in a fixed order; a bad path
    # raises FileNotFoundError.
    explicit = (
        ("cdxml", cdxml, "CDXML file"),
        ("cdx", cdx, "CDX file"),
        ("csv", csv, "CSV file"),
        ("rxn", rxn, "RXN file"),
    )
    kwargs: Dict[str, Any] = {
        key: str(_validate_file(value, label)) for key, value, label in explicit if value
    }

    if input_dir:
        p = Path(input_dir).resolve()
        if not p.is_dir():
            raise FileNotFoundError(
                f"Directory not found: {p}. Check the path exists and use forward slashes."
            )
        if kwargs:
            # Explicit files were given: pass the directory through as-is.
            kwargs["input_dir"] = str(p)
        else:
            # Auto-discover files when only input_dir given (no explicit files)
            kwargs.update(_discover_reaction_files(p))

    try:
        descriptor = _parse(**kwargs, verbose=False)
        result = descriptor.to_dict()
    except Exception as e:
        return {
            "ok": False,
            "error": (
                f"Parse failed: {e}. "
                "Provide at least one of: cdxml=, csv=, rxn= file paths. "
                "If using a CDX binary file, convert it first with convert_cdx_cdxml."
            ),
        }

    return _write_output(result, output_path, "reaction", ".json")
647
+
648
+
649
+ # ---------------------------------------------------------------------------
650
+ # Tool 6: summarize_reaction
651
+ # ---------------------------------------------------------------------------
652
+
653
@mcp.tool()
def summarize_reaction(
    json_path: str,
    species_fields: Optional[List[str]] = None,
    top_fields: Optional[List[str]] = None,
    eln_fields: Optional[List[str]] = None,
) -> dict:
    """Return a compact, context-efficient view of a reaction JSON file.

    The full reaction JSON can be 3,000+ tokens with geometry data. This tool
    returns only the fields you need for a given task, making it practical for
    LLM reasoning without burning context.

    Default fields (when no arguments given):
        species: id, name, role, role_detail, smiles, display_text, formula, mw
        top-level: experiment, conditions
        eln_data: product_yield, reaction_type

    Pass ["*"] for any field set to get all fields (equivalent to loading the
    full JSON). Request specific fields by name for task-focused summaries.

    Args:
        json_path: Path to a reaction JSON file from parse_reaction.
        species_fields: Species fields to include. Available: id, name, role,
            role_detail, smiles, smiles_neutral, is_sm, is_dp,
            is_substrate, is_solvent, exact_mass, exact_mass_full,
            mw, formula, adducts, source, source_id, csv_equiv,
            csv_mass, csv_name, csv_volume, csv_supplier,
            display_text, original_geometry. Pass ["*"] for all.
        top_fields: Top-level fields. Available: version, experiment,
            input_files, reaction_smiles, reaction_class,
            reaction_name, classification_confidence, warnings,
            metadata, conditions. Pass ["*"] for all.
        eln_fields: ELN data fields. Available: sm_mass, product_obtained,
            product_yield, procedure_text, procedure_plain,
            reaction_type, start_date, labbook_name, solvents,
            solvent_details. Pass ["*"] for all.

    Returns:
        Compact dict with requested fields for each species and top-level keys.
    """
    # A missing or blank path yields the usage text instead of an error.
    if not (json_path and json_path.strip()):
        return (
            "Usage: summarize_reaction(json_path='reaction.json')\n"
            "Examples:\n"
            " summarize_reaction(json_path='reaction.json') # default fields\n"
            " summarize_reaction(json_path='reaction.json', species_fields=['id','name','smiles','adducts']) # LCMS task\n"
            " summarize_reaction(json_path='reaction.json', species_fields=['*']) # all fields\n"
            "Default species fields: id, name, role, smiles, display_text, formula, mw"
        )

    from cdxml_toolkit.perception.reaction_parser import reaction_summary

    # Callers sometimes send a field list as a stringified JSON array
    # ('["name","role"]'); normalise each selection that was supplied and
    # leave omitted ones as None.
    requested = (
        ("species_fields", species_fields),
        ("top_fields", top_fields),
        ("eln_fields", eln_fields),
    )
    selections = {}
    for keyword, raw_value in requested:
        selections[keyword] = (
            None if raw_value is None else _parse_json_string(raw_value)
        )

    json_file = _validate_file(json_path, "JSON file")
    return reaction_summary(str(json_file), **selections)
718
+
719
+
720
+ # ---------------------------------------------------------------------------
721
+ # Tool 7: extract_structures_from_image
722
+ # ---------------------------------------------------------------------------
723
+
724
@mcp.tool()
async def extract_structures_from_image(
    image_path: str,
    detect_labels: bool = True,
) -> dict:
    """Extract chemical structures from an image using DECIMER.

    Takes a PNG, JPG, or PDF image and returns SMILES + confidence scores +
    bounding boxes for every detected chemical structure. Segments the image
    into individual structure regions automatically. Optionally detects nearby
    text labels via OCR.

    DECIMER models download on first run (~570 MB to ~/.data/DECIMER-V2/).
    Requires: DECIMER, opencv-python, and optionally pytesseract/easyocr.

    The returned SMILES should be passed through resolve_name or modify_molecule
    to verify and enrich — DECIMER SMILES may not be canonical and can have
    low confidence for complex structures.

    Args:
        image_path: Path to PNG, JPG, or PDF file.
        detect_labels: Attempt OCR detection of text labels near structures.
            Requires pytesseract or easyocr; labels are null without
            an OCR library. Default True.

    Returns:
        Dict with keys: ok, image_path, structures (list of: smiles, confidence
        in [0,1], bbox [x0,y0,x1,y1], label or null). Returns {ok: False,
        error: "..."} if DECIMER is not installed or extraction fails.
    """
    # A missing or blank path yields the usage text instead of an error.
    if not image_path or not image_path.strip():
        return (
            "Usage: extract_structures_from_image(image_path='image.png')\n"
            "Supported formats: PNG, JPG, PDF\n"
            "Returns SMILES + confidence score + bounding box for each structure found.\n"
            "Note: DECIMER models download ~570 MB on first run.\n"
            "Tip: pass returned SMILES through resolve_name to validate and enrich."
        )

    # The extractor is an optional extra; report a structured error when it
    # (or one of its heavy dependencies) cannot be imported.
    try:
        from cdxml_toolkit.image.structure_from_image import (
            extract_structures_from_image as _extract,
        )
    except ImportError as e:
        return {
            "ok": False,
            "error": (
                f"DECIMER/OpenCV not available: {e}. "
                "Install with: pip install cdxml-toolkit[decimer]"
            ),
        }

    p = _validate_file(image_path, "Image file")
    try:
        # Run in a thread pool so the synchronous TF inference doesn't
        # block the MCP event loop (keeps keepalive pings responsive).
        # TF's C-level logging is suppressed via TF_CPP_MIN_LOG_LEVEL=3
        # set in main() before any TF import.
        # If --preload-decimer was used, the model is already warm and
        # this returns in ~2 s. Otherwise, first call pays ~25 s cold
        # start, subsequent calls are fast.
        #
        # get_running_loop() is the documented way to obtain the loop from
        # inside a coroutine; get_event_loop() has policy-dependent behaviour.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None,
            functools.partial(
                _extract, str(p), detect_labels=detect_labels,
            ),
        )
    except Exception as e:
        return {
            "ok": False,
            "error": (
                f"Image extraction failed: {e}. "
                "Try again (DECIMER may need more time on first run to load models), "
                "or use resolve_name() to look up compounds by name instead."
            ),
        }
801
+
802
+
803
+ # ---------------------------------------------------------------------------
804
+ # Tool 8: parse_scheme
805
+ # ---------------------------------------------------------------------------
806
+
807
@mcp.tool()
def parse_scheme(
    cdxml_path: str,
    output_path: Optional[str] = None,
) -> dict:
    """Parse a CDXML reaction scheme into a structured description.

    Reads a CDXML file containing a reaction scheme (single- or multi-step) and
    returns a structured JSON with a species registry, reaction graph, topology
    classification, and a natural language narrative suitable for LLM reasoning.

    Uses two strategies in order: step-attribute path (reads <scheme><step>
    attributes if present) then geometry-based fallback (spatial arrow detection).
    Text labels near arrows are classified as "chemical", "condition_ref",
    "footnote", "yield", "compound_label", "citation", or "bioactivity".

    Args:
        cdxml_path: Path to a CDXML file containing a reaction scheme.
        output_path: If given, write the result JSON to this file and return
            {ok, output_path, size} instead of the full dict.

    Returns:
        Dict with keys: source_file, species (dict of species records with
        smiles, name, formula, mw, role, text_category), steps (list with
        reactant/product/reagent species IDs and conditions), topology
        (linear/parallel/convergent/divergent), content_type, narrative
        (human-readable summary), and optionally sub_schemes for multi-panel
        files. When output_path is provided, returns {ok, output_path, size}.
    """
    # A missing or blank path yields the usage text instead of an error.
    if not (cdxml_path and cdxml_path.strip()):
        return (
            "Usage: parse_scheme(cdxml_path='scheme.cdxml')\n"
            "Reads a CDXML scheme and returns species, steps, topology, and narrative.\n"
            "Example: parse_scheme(cdxml_path='experiments/KL-CC-001/scheme.cdxml')\n"
            " parse_scheme(cdxml_path='scheme.cdxml', output_path='parsed.json') # save to file\n"
            "Returns: {species, steps, topology, content_type, narrative}"
        )

    from cdxml_toolkit.perception.scheme_reader import read_scheme

    scheme_file = _validate_file(cdxml_path, "CDXML file")
    scheme = read_scheme(str(scheme_file))

    # Preferred route: the description is expected to be a dataclass, so
    # dataclasses.asdict() converts it (recursively) to plain containers.
    try:
        from dataclasses import asdict
        payload = asdict(scheme)
    except Exception:
        if not hasattr(scheme, "to_dict"):
            # Last resort: assemble the dict by hand, turning any non-dict
            # species/step record into its attribute dictionary.
            source_file = getattr(scheme, "source_file", str(scheme_file))

            species_map = {}
            for species_id, record in (scheme.species or {}).items():
                species_map[species_id] = (
                    record if isinstance(record, dict) else vars(record)
                )

            step_list = []
            for step in (scheme.steps or []):
                step_list.append(step if isinstance(step, dict) else vars(step))

            payload = {
                "source_file": source_file,
                "species": species_map,
                "steps": step_list,
                "topology": getattr(scheme, "topology", None),
                "content_type": getattr(scheme, "content_type", None),
                "narrative": getattr(scheme, "narrative", None),
            }
        else:
            # The object knows how to serialise itself.
            payload = scheme.to_dict()

    return _write_output(payload, output_path, "scheme_parsed", ".json")
871
+
872
+
873
+ # ---------------------------------------------------------------------------
874
+ # Tool 9: convert_cdx_cdxml
875
+ # ---------------------------------------------------------------------------
876
+
877
@mcp.tool()
def convert_cdx_cdxml(
    input_path: str,
    output_path: Optional[str] = None,
) -> dict:
    """Convert bidirectionally between CDX (binary) and CDXML (XML) formats.

    Direction is detected from the file extension:
    - .cdx input → .cdxml output
    - .cdxml input → .cdx output

    Uses available backends in order: ChemDraw COM (best fidelity, Windows) →
    pycdxml (pure Python, partial support) → OpenBabel.

    ChemDraw COM requires ChemDraw to be installed and closed before running.

    Args:
        input_path: Path to .cdx or .cdxml file.
        output_path: Output file path. If not given, same directory as input
            with the swapped extension (e.g. foo.cdx → foo.cdxml).

    Returns:
        Dict with keys: ok, input, output (absolute path to written file).
        Returns {ok: False, error: "..."} if conversion fails.
    """
    # A missing or blank path yields the usage text instead of an error.
    if not (input_path and input_path.strip()):
        return (
            "Usage: convert_cdx_cdxml(input_path='file.cdx', output_path='file.cdxml')\n"
            "Direction is auto-detected from extension: .cdx → .cdxml or .cdxml → .cdx\n"
            "Examples:\n"
            " convert_cdx_cdxml(input_path='experiment.cdx')\n"
            " convert_cdx_cdxml(input_path='scheme.cdxml', output_path='scheme.cdx')\n"
            "Requires ChemDraw COM (Windows, ChemDraw must be closed)."
        )

    from cdxml_toolkit.chemdraw.cdx_converter import convert_file

    source = _validate_file(input_path, "Input file")
    suffix = source.suffix.lower()
    supported = suffix == ".cdx" or suffix == ".cdxml"
    if not supported:
        return {
            "ok": False,
            "error": f"Unsupported extension: {suffix}. Use .cdx or .cdxml.",
        }

    try:
        written = convert_file(str(source), output_path=output_path)
        # Resolve inside the try so that a bad return value from the
        # converter is reported as a conversion failure as well.
        absolute_output = str(Path(written).resolve())
        return {"ok": True, "input": str(source), "output": absolute_output}
    except Exception as exc:
        message = (
            f"Conversion failed: {exc}. "
            "Ensure ChemDraw is installed and not currently running."
        )
        return {"ok": False, "error": message, "input": str(source)}
934
+
935
+
936
+ # ---------------------------------------------------------------------------
937
+ # Tool 10: parse_analysis_file
938
+ # ---------------------------------------------------------------------------
939
+
940
@mcp.tool()
def parse_analysis_file(
    pdf_path: str,
    output_path: Optional[str] = None,
) -> dict:
    """Parse an LCMS or NMR analysis PDF to extract peaks and data.

    Supports Waters LCMS reports and MestReNova NMR PDFs. Returns structured
    peak data for LCMS species identification or NMR characterisation.

    This module is under active development. If unavailable, the tool returns
    a graceful error rather than crashing.

    Args:
        pdf_path: Path to an LCMS or NMR PDF report.
        output_path: If given, write the parsed data as JSON to this file and
            return {ok, output_path, size} instead of the full dict.

    Returns:
        For LCMS: dict with retention_times, peak_areas, masses, UV traces.
        For NMR: dict with chemical_shifts, multiplicities, integrations.
        When output_path is provided, returns {ok, output_path, size}.
        Returns {ok: False, error: "..."} if module unavailable or parse fails.
    """
    # A missing or blank path yields the usage text instead of an error.
    if not (pdf_path and pdf_path.strip()):
        return (
            "Usage: parse_analysis_file(pdf_path='report.pdf')\n"
            "Supports: Waters LCMS reports, MestReNova NMR PDFs.\n"
            "Examples:\n"
            " parse_analysis_file(pdf_path='lcms/t0.pdf')\n"
            " parse_analysis_file(pdf_path='nmr/product_1H.pdf')\n"
            " parse_analysis_file(pdf_path='lcms/t0.pdf', output_path='t0_parsed.json') # save to file\n"
            "Returns: retention times, peak areas, masses (LCMS) or shifts/multiplicities (NMR)."
        )

    # The parser lives in an optional extra; degrade to a structured error
    # when it cannot be imported.
    try:
        from cdxml_toolkit.analysis.parse_analysis_file import parse_analysis_file as _parse
    except ImportError as exc:
        unavailable = (
            f"parse_analysis_file module not available: {exc}. "
            "Install with: pip install cdxml-toolkit[analysis]"
        )
        return {"ok": False, "error": unavailable}

    report = _validate_file(pdf_path, "PDF file")
    try:
        parsed = _parse(str(report))
        # Non-dict results are wrapped so the tool always hands back a dict.
        payload = parsed if isinstance(parsed, dict) else {"ok": True, "data": parsed}
        return _write_output(payload, output_path, "analysis", ".json")
    except Exception as exc:
        failure = (
            f"Parse failed: {exc}. "
            "Ensure the PDF is a Waters LCMS report or MestReNova NMR export; "
            "scanned PDFs without embedded text are not supported."
        )
        return {"ok": False, "error": failure, "pdf_path": str(report)}
1003
+
1004
+
1005
+ # ---------------------------------------------------------------------------
1006
+ # Tool 11: format_lab_entry
1007
+ # ---------------------------------------------------------------------------
1008
+
1009
@mcp.tool()
def format_lab_entry(
    entries_json: Union[List[dict], dict, str],
    output_path: Optional[str] = None,
) -> dict:
    """Format a list of entry dicts into a structured lab book text entry.

    Takes a list of typed entry dicts (or a JSON string) and produces a
    formatted lab book entry. The tool re-parses LCMS PDFs to fill in
    exact numbers — you only provide peak identifications (name, approximate
    RT, ion) as search keys.

    IMPORTANT: Do NOT write free-form LCMS text. Use the structured entry
    types below. The tool will look up the actual RT, area%, m/z, and UV
    from the PDF.

    Entry types and their required fields:

    {"type": "text", "content": "Procedure paragraph or section header..."}

    {"type": "lcms-species",
     "file": "path/to/report.pdf",
     "label": "t = 0 min",
     "peaks": [
       {"name": "Product", "rt": 1.02, "ion": {"mode": "ES-", "mz": 444.1}},
       {"name": "SM", "rt": 0.65, "ion": {"mode": "ES+", "mz": 275.1}},
       {"name": "TPPO", "rt": 1.02, "ion": {"mode": "ES+", "mz": 279.1}}
     ]}

    {"type": "lcms-areas",
     "file": "path/to/report.pdf",
     "label": "t = 10 min",
     "peaks": [
       {"name": "Product", "rt": 1.03, "compound_related": true},
       {"name": "Byproduct", "rt": 1.26, "compound_related": false}
     ]}

    {"type": "lcms-species",
     "file": "path/to/report.pdf",
     "label": "Purified product",
     "peaks": [
       {"name": "Product", "rt": 1.01, "ion": {"mode": "ES-", "mz": 444.2},
        "purity": true, "detector": "220nm"}
     ]}

    {"type": "lcms-manual",
     "file": "path/to/manual_integration.pdf",
     "label": "Manual LC",
     "peaks": [
       {"name": "Product", "rt": 1.01, "compound_related": true}
     ]}

    {"type": "nmr", "content": "1H NMR (400 MHz, DMSO-d6): ..."}

    Workflow: First call parse_analysis_file on each PDF to see peaks/masses.
    Then build entries referencing those PDFs with approximate RT and ion as
    search keys. This tool re-reads the PDF and fills in exact numbers.

    Args:
        entries_json: List of entry dicts, or a JSON string, or {"entries": [...]}.
            Also accepted: a single typed entry dict, the shorthand
            {"procedure": "..."} / {"content": "..."}, and a plain
            (non-JSON) text string, which is wrapped as one text entry.
        output_path: If given, write the formatted text to this file and return
            {ok, output_path, size} instead of {ok, text}.

    Returns:
        Dict with keys: ok, text (formatted lab book entry string).
        When output_path is provided, returns {ok, output_path, size} instead.
    """
    # Return usage if called with empty/null-ish input. Whitespace-only
    # strings count as empty, matching the other tools' `.strip()` checks.
    if (
        entries_json is None
        or entries_json == []
        or entries_json == {}
        or (isinstance(entries_json, str) and not entries_json.strip())
    ):
        return (
            "Usage: format_lab_entry(entries_json=[...entry dicts...])\n"
            "Entry types:\n"
            " {type:'text', content:'Procedure paragraph...'}\n"
            " {type:'lcms-species', file:'report.pdf', label:'t=0', peaks:[{name:'Product', rt:1.02, ion:{mode:'ES+', mz:445.1}}]}\n"
            " {type:'lcms-areas', file:'report.pdf', label:'t=10 min', peaks:[{name:'Product', rt:1.03, compound_related:true}]}\n"
            " {type:'nmr', content:'1H NMR (400 MHz, DMSO-d6): ...'}\n"
            "Workflow: call parse_analysis_file on each PDF first to identify peaks/RTs.\n"
            "output_path: optional — write formatted text to file and get {ok, output_path, size} back."
        )

    from cdxml_toolkit.analysis.format_procedure_entry import process_entries

    # Accept a JSON string, a list, or a dict with "entries" key
    if isinstance(entries_json, str):
        raw_text = entries_json
        try:
            entries_json = json.loads(raw_text)
        except json.JSONDecodeError as e:
            # Text that was clearly meant to be a JSON array/object is a real
            # syntax error. Anything else is treated as plain prose and falls
            # through to the bare-text branch below; previously that branch
            # could never be reached for non-JSON text because this handler
            # always returned.
            if raw_text.lstrip().startswith(("[", "{")):
                return {"ok": False, "error": f"Invalid JSON: {e}"}
            entries_json = raw_text

    if isinstance(entries_json, dict):
        if "entries" in entries_json:
            entries = entries_json["entries"]
        elif "type" in entries_json:
            # A single typed entry passed without the surrounding list.
            # Checked before the shorthand so that e.g. an "nmr" entry keeps
            # its type and an "lcms-*" entry is not silently dropped.
            entries = [entries_json]
        elif "procedure" in entries_json or "content" in entries_json:
            # Shorthand: {procedure: "text"} or {content: "text"} → [{type: "text", content: "..."}]
            text = entries_json.get("procedure") or entries_json.get("content", "")
            entries = [{"type": "text", "content": text}]
        else:
            # Unrecognised dict shape: nothing to format.
            entries = []
    elif isinstance(entries_json, list):
        entries = entries_json
    elif isinstance(entries_json, str) and not entries_json.strip().startswith("["):
        # Bare text string → wrap as text entry
        entries = [{"type": "text", "content": entries_json}]
    else:
        return {
            "ok": False,
            "error": "entries_json must be a list, a JSON string, or {\"entries\": [...]}",
        }

    try:
        text = process_entries(entries)

        return _write_output(text, output_path, "lab_entry", ".txt")
    except Exception as e:
        return {
            "ok": False,
            "error": (
                f"Entry formatting failed: {e}. "
                "Check that each entry has a valid 'type' field (text, lcms-species, lcms-areas, "
                "lcms-manual, nmr) and that PDF paths in 'file' fields exist. "
                "Call parse_analysis_file on each PDF first to identify peak RTs."
            ),
        }
1132
+
1133
+
1134
+ # ---------------------------------------------------------------------------
1135
+ # Tool 12: extract_cdxml_from_office
1136
+ # ---------------------------------------------------------------------------
1137
+
1138
@mcp.tool()
def extract_cdxml_from_office(
    file_path: str,
    output_dir: Optional[str] = None,
) -> dict:
    """Extract embedded ChemDraw objects from a PPTX or DOCX file.

    Office files (PPTX/DOCX) are ZIP archives that may contain ChemDraw OLE
    objects as binary blobs. This tool extracts every ChemDraw object, converts
    it to CDXML, and writes the files to output_dir.

    Requires: olefile. CDX→CDXML conversion uses available backends.

    Args:
        file_path: Path to a .pptx or .docx file.
        output_dir: Directory for extracted CDXML files. Default: a folder
            named "<basename>_chemdraw/" next to the input file.

    Returns:
        Dict with keys: ok, input, output_dir, count (number of extracted
        objects), objects (list of: source_path, cdxml_output, cdx_output,
        error for each extracted object; optional keys appear only when set).
        Returns {ok: False, error: "..."} if extraction fails entirely.
    """
    # A missing or blank path yields the usage text instead of an error.
    if not file_path or not file_path.strip():
        return (
            "Usage: extract_cdxml_from_office(file_path='document.pptx', output_dir='out/')\n"
            "Extracts embedded ChemDraw objects from PPTX or DOCX files.\n"
            "Examples:\n"
            " extract_cdxml_from_office(file_path='presentation.pptx')\n"
            " extract_cdxml_from_office(file_path='report.docx', output_dir='extracted/')\n"
            "Returns: {ok, count, objects:[{cdxml_output, source_path}]}"
        )

    # Only the extraction entry point is needed here; the result objects are
    # accessed by attribute, so their class does not have to be imported.
    from cdxml_toolkit.office.ole_extractor import extract_from_office

    p = _validate_file(file_path, "Office file")

    try:
        results = extract_from_office(
            str(p),
            output_dir=output_dir,
            output_format="cdxml",
        )
    except Exception as e:
        return {
            "ok": False,
            "error": (
                f"Extraction failed: {e}. "
                "Ensure the file is a valid .pptx or .docx and that olefile is installed "
                "(pip install olefile). The file must contain embedded ChemDraw OLE objects."
            ),
            "input": str(p),
        }

    # Flatten each result into a plain dict, omitting unset optional fields.
    objects = []
    for obj in results:
        entry: Dict[str, Any] = {"source_path": obj.source_path}
        if obj.cdxml_output:
            entry["cdxml_output"] = obj.cdxml_output
        if obj.cdx_output:
            entry["cdx_output"] = obj.cdx_output
        if obj.error:
            entry["error"] = obj.error
        objects.append(entry)

    # Report the directory actually used: when at least one object was
    # written, derive it from the first result's output path; otherwise echo
    # the caller-supplied value (which may be None).
    resolved_output_dir = output_dir
    if results:
        first_out = results[0].cdxml_output or results[0].cdx_output
        if first_out:
            resolved_output_dir = str(Path(first_out).parent)

    return {
        "ok": True,
        "input": str(p),
        "output_dir": resolved_output_dir,
        "count": len(objects),
        "objects": objects,
    }
1217
+
1218
+
1219
+ # ---------------------------------------------------------------------------
1220
+ # Tool 13: embed_cdxml_in_office
1221
+ # ---------------------------------------------------------------------------
1222
+
1223
@mcp.tool()
def embed_cdxml_in_office(
    cdxml_path: str,
    office_path: str,
    output_path: Optional[str] = None,
) -> dict:
    """Embed a CDXML file as an editable ChemDraw OLE object in PPTX or DOCX.

    Converts CDXML → CDX + EMF preview via ChemDraw COM, builds a CFB OLE
    compound file, and injects it into a PPTX slide or DOCX paragraph as a
    double-clickable, editable ChemDraw object.

    Requires: ChemDraw COM (Windows, ChemDraw 16+), python-pptx or python-docx.
    ChemDraw must be installed and closed before calling this tool.

    The output format (.pptx or .docx) is detected from office_path extension.
    If office_path does not exist, a new file is created.

    Args:
        cdxml_path: Path to the CDXML file to embed.
        office_path: Path to the target .pptx or .docx file. Created if it
            does not exist.
        output_path: Output file path. If not given, writes to office_path
            (modifies in place via temp file).

    Returns:
        Dict with keys: ok, input_cdxml, output (absolute path to written
        Office file), format ("pptx" or "docx"), num_objects_embedded.
        Returns {ok: False, error: "..."} if embedding fails.
    """
    # A missing or blank CDXML path yields the usage text instead of an error.
    if not (cdxml_path and cdxml_path.strip()):
        return (
            "Usage: embed_cdxml_in_office(cdxml_path='scheme.cdxml', office_path='output.pptx')\n"
            "Embeds a CDXML file as a double-clickable ChemDraw OLE object.\n"
            "Examples:\n"
            " embed_cdxml_in_office(cdxml_path='scheme.cdxml', office_path='report.pptx')\n"
            " embed_cdxml_in_office(cdxml_path='scheme.cdxml', office_path='report.docx', output_path='report_v2.docx')\n"
            "Requires: ChemDraw COM (Windows, ChemDraw must be closed), python-pptx or python-docx."
        )

    from cdxml_toolkit.office.ole_embedder import (
        batch_convert,
        get_cdxml_content_size,
        build_ole_compound_file,
        build_pptx,
        build_docx,
    )

    scheme_file = _validate_file(cdxml_path, "CDXML file")
    suffix = Path(office_path).suffix.lower()
    if suffix != ".pptx" and suffix != ".docx":
        return {
            "ok": False,
            "error": f"Unsupported office format: {suffix}. Use .pptx or .docx.",
        }

    # Without an explicit output the Office file itself is the destination.
    destination = office_path if output_path is None else output_path

    try:
        # Step 1: convert CDXML → CDX + EMF via ChemDraw COM
        conversions = batch_convert([str(scheme_file)])
        if not conversions:
            return {"ok": False, "error": "ChemDraw COM conversion returned no output."}
        converted = conversions[0]

        # Step 2: compute display dimensions + build OLE compound file
        width_emu, height_emu = get_cdxml_content_size(str(scheme_file))
        ole_blob = build_ole_compound_file(converted["cdx_data"])

        embed_items = [
            {
                "ole_data": ole_blob,
                "emf_data": converted["emf_data"],
                "width_emu": width_emu,
                "height_emu": height_emu,
                "name": converted["name"],
            }
        ]

        # Step 3: build/inject into Office file
        office_format = suffix[1:]
        builder = build_pptx if suffix == ".pptx" else build_docx
        builder(embed_items, destination)

        return {
            "ok": True,
            "input_cdxml": str(scheme_file),
            "output": str(Path(destination).resolve()),
            "format": office_format,
            "num_objects_embedded": 1,
        }

    except ImportError as exc:
        # Raised when a dependency needed by the steps above is missing.
        missing = (
            f"ChemDraw COM or Office library not available: {exc}. "
            "Requires pywin32 (ChemDraw COM) and python-pptx/python-docx."
        )
        return {"ok": False, "error": missing}
    except Exception as exc:
        failure = (
            f"Embedding failed: {exc}. "
            "Ensure ChemDraw is installed and not currently running, "
            "and that the target file is a valid .pptx or .docx."
        )
        return {"ok": False, "error": failure}
1333
+
1334
+
1335
+ # ---------------------------------------------------------------------------
1336
+ # Tool 14: search_compound
1337
+ # ---------------------------------------------------------------------------
1338
+
1339
@mcp.tool()
def search_compound(
    smiles: str,
    experiment_dir: str,
    similarity_threshold: float = 0.85,
) -> dict:
    """Search for a compound across experiment JSON files by SMILES similarity.

    Scans a directory of reaction JSON files (from parse_reaction) and returns
    exact matches and structurally similar compounds above the given Tanimoto
    threshold. Useful for finding related experiments, checking if a compound
    has been made before, or tracing a compound through a multi-step synthesis.

    This module is under active development. If unavailable, the tool returns
    a graceful error rather than crashing.

    Args:
        smiles: SMILES string of the compound to search for.
            Use resolve_name to get a validated SMILES first.
        experiment_dir: Directory containing reaction JSON files to search.
        similarity_threshold: Tanimoto similarity cutoff (0–1). Default 0.85.

    Returns:
        Dict with keys: ok, query_smiles, exact_matches (list), similar_matches
        (list with similarity scores), total_files_searched.
        Returns {ok: False, error: "..."} if module unavailable or search fails.
    """
    # A missing or blank SMILES yields the usage text instead of an error.
    if not (smiles and smiles.strip()):
        return (
            "Usage: search_compound(smiles='...', experiment_dir='path/to/experiments')\n"
            "Searches reaction JSON files for exact or similar compounds by Tanimoto similarity.\n"
            "Examples:\n"
            " search_compound(smiles='c1ccncc1', experiment_dir='experiments/')\n"
            " search_compound(smiles='c1ccncc1', experiment_dir='experiments/', similarity_threshold=0.9)\n"
            "Tip: get a validated SMILES from resolve_name before searching."
        )

    # The search backend may be absent; report that as a structured error.
    try:
        from cdxml_toolkit.perception.compound_search import search_compound as _search
    except ImportError as exc:
        return {
            "ok": False,
            "error": f"compound_search module not available: {exc}",
        }

    search_root = Path(experiment_dir).resolve()
    if not search_root.is_dir():
        not_found = (
            f"File not found: {search_root}. "
            "Check the path exists and use forward slashes."
        )
        return {"ok": False, "error": not_found}

    try:
        outcome = _search(
            smiles,
            str(search_root),
            similarity_threshold=similarity_threshold,
        )
        # Dict results pass through untouched; anything else is wrapped.
        return outcome if isinstance(outcome, dict) else {"ok": True, "data": outcome}
    except Exception as exc:
        failure = (
            f"Search failed: {exc}. "
            "Ensure the SMILES is valid (use resolve_name to obtain one) "
            "and that the directory contains reaction JSON files from parse_reaction."
        )
        return {"ok": False, "error": failure}
1412
+
1413
+
1414
+ # ---------------------------------------------------------------------------
1415
+ # Tool 15: render_to_png
1416
+ # ---------------------------------------------------------------------------
1417
+
1418
@mcp.tool()
def render_to_png(
    cdxml_path: str,
    output_path: Optional[str] = None,
) -> dict:
    """Render a CDXML file to PNG using ChemDraw COM.

    Uses ChemDraw's native rendering engine (via COM automation) at 300 DPI
    with a solid white background. ChemDraw must be installed (Professional
    16+) and closed before calling this tool.

    This tool uses ChemDraw COM exclusively — no RDKit fallback. For a
    quick preview without ChemDraw, use draw_molecule which returns CDXML
    that can be opened directly.

    Args:
        cdxml_path: Path to the CDXML file to render.
        output_path: Output PNG path. If not given, writes to the same
            directory as the input with a .png extension.

    Returns:
        Dict with keys: ok, input, output (absolute path to PNG file).
        Returns {ok: False, error: "..."} if ChemDraw COM is unavailable
        or rendering fails.
    """
    # A missing or blank path yields the usage text instead of an error.
    if not (cdxml_path and cdxml_path.strip()):
        return (
            "Usage: render_to_png(cdxml_path='scheme.cdxml', output_path='scheme.png')\n"
            "Renders CDXML to PNG at 300 DPI using ChemDraw COM.\n"
            "Examples:\n"
            " render_to_png(cdxml_path='scheme.cdxml')\n"
            " render_to_png(cdxml_path='scheme.cdxml', output_path='figures/scheme.png')\n"
            "Requires: ChemDraw Professional 16+ on Windows, must be closed before calling."
        )

    # The renderer may fail to import; report that as a structured error.
    try:
        from cdxml_toolkit.chemdraw.cdxml_to_image import cdxml_to_image
    except ImportError as exc:
        unavailable = (
            f"ChemDraw COM not available: {exc}. "
            "Requires pywin32 and ChemDraw Professional 16+ on Windows."
        )
        return {"ok": False, "error": unavailable}

    scheme_file = _validate_file(cdxml_path, "CDXML file")

    try:
        rendered = cdxml_to_image(str(scheme_file), output_path=output_path, png_dpi=300)
        # Resolve inside the try so that a bad return value from the
        # renderer is reported as a rendering failure as well.
        absolute_png = str(Path(rendered).resolve())
        return {"ok": True, "input": str(scheme_file), "output": absolute_png}
    except Exception as exc:
        failure = (
            f"Rendering failed: {exc}. "
            "Ensure ChemDraw Professional 16+ is installed and not currently running."
        )
        return {"ok": False, "error": failure, "input": str(scheme_file)}
1478
+
1479
+
1480
+ # ---------------------------------------------------------------------------
1481
+ # CLI entry point
1482
+ # ---------------------------------------------------------------------------
1483
+
1484
def main():
    """Command-line entry point: parse options, warm up, run the MCP server.

    Options:
        --transport            "stdio" (default) or "http".
        --host                 Bind address for the HTTP transport
                               (default: 0.0.0.0, the previously hard-coded
                               value).
        --port                 Port for the HTTP transport (default: 8000).
        --no-preload-decimer   Skip the background DECIMER model load.

    Before the server starts, RDKit and the reagent DB are loaded
    synchronously and (unless disabled) the DECIMER model is loaded in a
    daemon thread.  Both warm-ups are best-effort: a failure is reported on
    stderr and the server starts regardless.
    """
    import sys
    import threading
    import time

    parser = argparse.ArgumentParser(
        description="cdxml-toolkit MCP server",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--transport",
        choices=["stdio", "http"],
        default="stdio",
        help="MCP transport mode (default: stdio)",
    )
    parser.add_argument(
        "--host",
        default="0.0.0.0",
        help=(
            "Bind address for HTTP transport (default: 0.0.0.0, all "
            "interfaces). Use 127.0.0.1 to accept local connections only."
        ),
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port for HTTP transport (default: 8000)",
    )
    parser.add_argument(
        "--no-preload-decimer",
        action="store_true",
        help=(
            "Skip pre-loading the DECIMER model at startup. "
            "Saves ~25 s startup time and ~800 MB RAM, but first "
            "extract_structures_from_image call will be slow."
        ),
    )
    args = parser.parse_args()

    # Suppress TensorFlow C-level logging before any TF import.
    # This prevents TF's stdout/stderr writes from corrupting the
    # MCP JSON-RPC stdio channel.
    os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3")
    os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")

    # Pre-import RDKit and load the reagent DB before the MCP event loop
    # starts.  On some Windows/Python configurations, importing the RDKit C
    # extension from inside the async event loop's thread causes a hang
    # (the DLL load never completes once anyio has wrapped stdout/stderr).
    # Doing it here -- before mcp.run() -- avoids the issue entirely and
    # also eliminates the 1-2 s first-call delay for resolve_name.
    try:
        from rdkit import Chem  # noqa: F401
        from cdxml_toolkit.resolve.reagent_db import get_reagent_db
        get_reagent_db()
    except Exception as exc:
        # Graceful: the server still works, just slower on first call.
        # Report on stderr (never stdout, which carries JSON-RPC in stdio
        # mode) so a broken install is visible instead of silently ignored.
        print(
            f"[cdxml-toolkit] RDKit/reagent DB preload skipped: {exc}",
            file=sys.stderr,
        )

    # Pre-load the DECIMER model in a background thread so the MCP server
    # is immediately available for other tools (resolve_name, etc.) while
    # the ~25 s model load happens asynchronously.  Disable with
    # --no-preload-decimer if DECIMER is not installed or RAM is tight.
    if not args.no_preload_decimer:

        def _bg_load_decimer():
            """Load the DECIMER model and log the outcome to stderr."""
            started = time.perf_counter()
            try:
                from cdxml_toolkit.image.structure_from_image import _load_decimer
                _load_decimer(hand_drawn=False)
                print(
                    f"[cdxml-toolkit] DECIMER preloaded in "
                    f"{time.perf_counter() - started:.1f}s",
                    file=sys.stderr,
                )
            except Exception as exc:
                print(
                    f"[cdxml-toolkit] DECIMER preload skipped: {exc}",
                    file=sys.stderr,
                )

        # Daemon thread: it must never keep the process alive on shutdown.
        threading.Thread(target=_bg_load_decimer, daemon=True).start()

    if args.transport == "stdio":
        mcp.run(transport="stdio")
    else:
        mcp.run(transport="streamable-http", host=args.host, port=args.port)
1564
+
1565
+
1566
# Script guard: start the server only when this module is executed directly
# (for example via ``python -m``), never as a side effect of importing it.
if __name__ == "__main__":
    main()