cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,901 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ChemScript Bridge — Python wrapper around PerkinElmer's ChemScript .NET library.
4
+
5
+ Provides native access to ChemDraw's chemical intelligence: format conversion,
6
+ name↔structure, structure cleanup, substructure search, reaction handling, and
7
+ more — all from Python.
8
+
9
+ Architecture:
10
+ ChemScript's .NET DLL is 32-bit, so we run a thin JSON-RPC server
11
+ (_chemscript_server.py) under a 32-bit Python environment (chemscript32)
12
+ and communicate via subprocess stdin/stdout.
13
+
14
+ Usage (CLI):
15
+ python chemscript_bridge.py convert input.cdx output.cdxml
16
+ python chemscript_bridge.py name2struct "morpholine" -o output.cdxml
17
+ python chemscript_bridge.py smiles2struct "C1COCCN1" -o morpholine.cdxml
18
+ python chemscript_bridge.py cleanup messy.cdxml -o clean.cdxml
19
+ python chemscript_bridge.py info structure.cdx
20
+ python chemscript_bridge.py search --target target.cdx --query query.cdx
21
+ python chemscript_bridge.py reaction input.cdx --list
22
+ python chemscript_bridge.py lcs mol1.cdx mol2.cdx
23
+
24
+ Python API:
25
+ from cdxml_toolkit.chemdraw.chemscript_bridge import ChemScriptBridge
26
+ cs = ChemScriptBridge()
27
+ cdxml = cs.name_to_cdxml("morpholine")
28
+ cs.convert_file("input.cdx", "output.cdxml")
29
+ """
30
+
31
+ import argparse
32
+ import json
33
+ import os
34
+ import re
35
+ import subprocess
36
+ import sys
37
+ import textwrap
38
+ from pathlib import Path
39
+ from typing import Any, Dict, List, Optional, Tuple
40
+
41
+ from ..constants import ACS_STYLE
42
+
43
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Per-user JSON file in which the bridge persists its settings between runs.
# Keys read or written by this module: "python32", "dll_dir", "assembly".
CONFIG_PATH = Path.home() / ".chemscript_config.json"

# Default 32-bit Python path (chemscript32 conda env).
# NOTE(review): not referenced by the code visible in this module; the
# interpreter is located by _find_python32() instead.
DEFAULT_PYTHON32 = None  # auto-detected

# ACS Document 1996 style attributes to inject into ChemScript CDXML output.
# Imported from constants.py — kept as module-level alias for backward compat.
ACS_STYLE_ATTRS = ACS_STYLE
55
+
56
+
57
def _find_python32() -> Optional[str]:
    """Locate the 32-bit Python interpreter for the chemscript32 conda env.

    Candidate locations, tried in order:

    1. ``~/miniconda3/envs/chemscript32/python.exe``
    2. next to the active conda env (``$CONDA_PREFIX/../chemscript32``)
    3. below the active conda env when that env is the base install
       (``$CONDA_PREFIX/envs/chemscript32``)
    4. ``%USERPROFILE%/miniconda3/envs/chemscript32``
    5. ``%USERPROFILE%/Anaconda3/envs/chemscript32``

    Locations derived from an environment variable are skipped when that
    variable is unset or empty; otherwise ``Path("")`` would silently turn
    them into paths relative to the current working directory.

    Returns:
        The resolved interpreter path as a string, or None if no candidate
        exists.
    """
    env_name = "chemscript32"
    exe_name = "python.exe"

    candidates = [Path.home() / "miniconda3" / "envs" / env_name / exe_name]

    conda_prefix = os.environ.get("CONDA_PREFIX")
    if conda_prefix:
        prefix = Path(conda_prefix)
        candidates.append(prefix / ".." / env_name / exe_name)
        candidates.append(prefix / "envs" / env_name / exe_name)

    user_profile = os.environ.get("USERPROFILE")
    if user_profile:
        profile = Path(user_profile)
        candidates.append(profile / "miniconda3" / "envs" / env_name / exe_name)
        candidates.append(profile / "Anaconda3" / "envs" / env_name / exe_name)

    for candidate in candidates:
        resolved = candidate.resolve()
        if resolved.is_file():
            return str(resolved)
    return None
70
+
71
+
72
def _load_config() -> dict:
    """Load saved config (python32 path, DLL path, etc.).

    Reads CONFIG_PATH as UTF-8 JSON.

    Returns:
        The stored settings, or an empty dict when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON object
        (callers use ``.get`` on the result, so anything else would crash
        them).
    """
    try:
        data = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return {}
    return data if isinstance(data, dict) else {}
80
+
81
+
82
def _save_config(cfg: dict) -> None:
    """Persist config.

    Writes *cfg* to CONFIG_PATH as indented UTF-8 JSON. This is best-effort:
    a failed write only prints a warning to stderr and does not raise.

    Args:
        cfg: Settings dict to store; must be JSON-serialisable.
    """
    try:
        CONFIG_PATH.write_text(json.dumps(cfg, indent=2), encoding="utf-8")
    except OSError as e:
        print(f"Warning: could not save config: {e}", file=sys.stderr)
88
+
89
+
90
+ # ---------------------------------------------------------------------------
91
+ # CDXML post-processing — inject ACS 1996 style
92
+ # ---------------------------------------------------------------------------
93
+
94
def _inject_acs_style(cdxml: str) -> str:
    """
    Post-process CDXML from ChemScript to inject ACS Document 1996 style.

    ChemScript defaults to BondLength=30; we rescale to the ACS bond length
    and apply every attribute in ACS_STYLE_ATTRS. An attribute that already
    occurs in the document is overwritten at every occurrence; one that does
    not occur is added to the root <CDXML> element.

    Args:
        cdxml: CDXML text. Empty input, or text without a ``<CDXML`` tag, is
            returned unchanged.

    Returns:
        The restyled CDXML text, with coordinates rescaled when the source
        bond length differs from the target.
    """
    if not cdxml or "<CDXML" not in cdxml:
        return cdxml

    # Parse the current BondLength from ChemScript output; fall back to the
    # ChemScript default when it is absent, unparsable or not positive.
    bl_match = re.search(r'BondLength="([^"]+)"', cdxml)
    try:
        cs_bond_length = float(bl_match.group(1)) if bl_match else 30.0
    except ValueError:
        cs_bond_length = 30.0
    if not cs_bond_length > 0:  # also rejects NaN
        cs_bond_length = 30.0

    target_bond_length = float(ACS_STYLE_ATTRS["BondLength"])
    scale = target_bond_length / cs_bond_length

    for attr, val in ACS_STYLE_ATTRS.items():
        assignment = f'{attr}="{val}"'
        pat = re.compile(rf'\b{re.escape(attr)}="[^"]*"')
        if pat.search(cdxml):
            # Function replacement: the text is inserted literally, so a
            # backslash in the value is not read as a regex escape.
            cdxml = pat.sub(lambda _m, text=assignment: text, cdxml)
        else:
            # Insert before the closing > (or />) of <CDXML ...>
            cdxml = re.sub(
                r"(<CDXML\b[^>]*?)(\s*/?>)",
                lambda m, text=assignment: f"{m.group(1)} {text}{m.group(2)}",
                cdxml,
                count=1,
            )

    # Rescale all coordinate values if scale != 1.0
    if abs(scale - 1.0) > 0.001:
        cdxml = _rescale_cdxml_coords(cdxml, scale)

    return cdxml
132
+
133
+
134
def _rescale_cdxml_coords(cdxml: str, scale: float) -> str:
    """Rescale point coordinates (p=, BoundingBox=) in CDXML by a factor.

    Only attributes named exactly ``p`` or ``BoundingBox`` are touched; an
    attribute whose name merely ends in ``p`` is left alone. Each number is
    multiplied by *scale* and written with two decimals. A value that is not
    purely numeric is kept as-is instead of raising.

    Args:
        cdxml: CDXML text.
        scale: Multiplier applied to every coordinate.

    Returns:
        The CDXML text with rescaled coordinates.
    """

    def _scale_point(match: re.Match) -> str:
        attr, values = match.group(1), match.group(2)
        try:
            scaled = " ".join(f"{float(n) * scale:.2f}" for n in values.split())
        except ValueError:
            return match.group(0)
        return f'{attr}="{scaled}"'

    # The lookbehind makes sure the match starts at the beginning of an
    # attribute name rather than in the middle of a longer one.
    return re.sub(
        r'(?<![\w:.-])(p|BoundingBox)="([^"]+)"', _scale_point, cdxml
    )
150
+
151
+
152
+ # ---------------------------------------------------------------------------
153
+ # ChemScriptBridge class — Python API
154
+ # ---------------------------------------------------------------------------
155
+
156
class ChemScriptBridge:
    """
    High-level Python interface to ChemScript via a 32-bit subprocess server.

    The server script (``_chemscript_server.py``, next to this module) is
    started lazily on the first call and exchanges one JSON object per line
    over stdin/stdout. Instances are context managers: leaving the ``with``
    block shuts the server down.

    Usage:
        cs = ChemScriptBridge()
        cdxml = cs.name_to_cdxml("morpholine")
        cs.convert_file("in.cdx", "out.cdxml")
    """

    def __init__(self, python32_path: Optional[str] = None):
        """
        Resolve the 32-bit interpreter and remember it in the user config.

        Args:
            python32_path: Explicit interpreter path. When omitted, the saved
                config value is used, then auto-detection.

        Raises:
            RuntimeError: If no 32-bit interpreter could be determined.
        """
        # Set first so close()/__del__ stay safe if the lookup below raises.
        self._proc: Optional[subprocess.Popen] = None

        cfg = _load_config()
        self._python32 = python32_path or cfg.get("python32") or _find_python32()
        if self._python32 is None:
            raise RuntimeError(
                "Could not find 32-bit Python (chemscript32 conda env).\n"
                "Create it with: CONDA_SUBDIR=win-32 conda create -n chemscript32 python=3.10\n"
                "Then install pythonnet: chemscript32/python.exe -m pip install pythonnet\n"
                "Or specify the path: ChemScriptBridge(python32_path=r'C:\\...\\python.exe')"
            )
        # Save for next time
        if cfg.get("python32") != self._python32:
            cfg["python32"] = self._python32
            _save_config(cfg)

        self._server_script = str(
            Path(__file__).resolve().parent / "_chemscript_server.py"
        )

    # -----------------------------------------------------------------------
    # Server lifecycle
    # -----------------------------------------------------------------------

    @staticmethod
    def _reap(proc) -> None:
        """Make sure *proc* is dead, waited for, and its pipes are closed."""
        try:
            if proc.poll() is None:
                proc.kill()
            proc.wait(timeout=5)
        except (OSError, subprocess.TimeoutExpired):
            pass
        for stream in (proc.stdin, proc.stdout, proc.stderr):
            if stream is None:
                continue
            try:
                stream.close()
            except (OSError, ValueError):
                # Closing stdin flushes it, which fails on a dead peer.
                pass

    def _ensure_server(self):
        """Start the server subprocess if not already running.

        Raises:
            RuntimeError: If the server exits before signalling readiness or
                its first line is not the expected ``{"ready": true}``.
        """
        current = getattr(self, "_proc", None)
        if current is not None:
            if current.poll() is None:
                return
            # A previous server died: release its pipes before restarting.
            self._reap(current)
            self._proc = None

        cmd = [self._python32, self._server_script]
        # Pass DLL config from ~/.chemscript_config.json so the server
        # can locate the correct ChemScript DLL (ChemDraw 15 vs 16).
        cfg = _load_config()
        if cfg.get("dll_dir"):
            cmd += ["--dll-dir", cfg["dll_dir"]]
        if cfg.get("assembly"):
            cmd += ["--assembly", cfg["assembly"]]
        # NOTE(review): stderr is piped but only read after the server goes
        # silent; a server that writes a lot to stderr could fill the pipe.
        proc = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding="utf-8",
        )
        # Wait for ready signal
        ready_line = proc.stdout.readline()
        if not ready_line:
            err = proc.stderr.read()
            self._reap(proc)
            raise RuntimeError(f"ChemScript server failed to start: {err}")
        try:
            ready = json.loads(ready_line)
        except ValueError:
            ready = ready_line.strip()
        if not isinstance(ready, dict) or not ready.get("ready"):
            self._reap(proc)
            raise RuntimeError(f"ChemScript server unexpected: {ready}")
        self._proc = proc

    def _call(self, cmd: str, **args) -> dict:
        """Send a command to the server and return the response.

        Args:
            cmd: Server command name.
            **args: Command arguments, sent as the request's ``args`` object.

        Returns:
            The decoded JSON response.

        Raises:
            RuntimeError: If the server cannot be started, closes its pipes,
                or answers with something that is not JSON.
        """
        self._ensure_server()
        request = json.dumps({"cmd": cmd, "args": args})
        try:
            self._proc.stdin.write(request + "\n")
            self._proc.stdin.flush()
        except OSError as exc:
            # Typically BrokenPipeError: the server exited after the liveness
            # check in _ensure_server().
            raise RuntimeError(f"ChemScript server died: {exc}") from exc
        resp_line = self._proc.stdout.readline()
        if not resp_line:
            err = self._proc.stderr.read() if self._proc.stderr else "no output"
            raise RuntimeError(f"ChemScript server died: {err}")
        try:
            return json.loads(resp_line)
        except ValueError as exc:
            raise RuntimeError(
                f"ChemScript server sent invalid JSON: {resp_line.strip()!r}"
            ) from exc

    def close(self):
        """Shut down the server.

        Sends ``quit`` and waits up to five seconds; if that fails the
        process is killed. Safe to call repeatedly and on an instance whose
        ``__init__`` did not complete.
        """
        proc = getattr(self, "_proc", None)
        self._proc = None
        if proc is None:
            return
        if proc.poll() is None:
            try:
                proc.stdin.write(json.dumps({"cmd": "quit"}) + "\n")
                proc.stdin.flush()
                proc.wait(timeout=5)
            except Exception:
                # Best effort: _reap() below kills whatever is still alive.
                pass
        self._reap(proc)

    def __del__(self):
        # A finalizer must never raise, including at interpreter shutdown
        # when module globals may already be gone.
        try:
            self.close()
        except Exception:
            pass

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    # -----------------------------------------------------------------------
    # Argument helpers
    # -----------------------------------------------------------------------

    @staticmethod
    def _abs_if_file(value: str) -> str:
        """Return the absolute path if *value* is an existing file, else *value*."""
        return os.path.abspath(value) if os.path.isfile(value) else value

    @staticmethod
    def _path_or_smiles_args(**inputs: str) -> Dict[str, Any]:
        """Build server args for inputs that are file paths or SMILES strings.

        An existing file is sent as its absolute path; anything else is sent
        unchanged together with ``<key>_format="smiles"``.
        """
        args: Dict[str, Any] = {}
        for key, value in inputs.items():
            if os.path.isfile(value):
                args[key] = os.path.abspath(value)
            else:
                args[key] = value
                args[f"{key}_format"] = "smiles"
        return args

    def _checked_call(self, _cmd: str, _fallback_error: str, **args) -> dict:
        """Run ``_call`` and raise RuntimeError unless the reply has ``ok``."""
        result = self._call(_cmd, **args)
        if not result.get("ok"):
            raise RuntimeError(result.get("error", _fallback_error))
        return result

    # -----------------------------------------------------------------------
    # Public API
    # -----------------------------------------------------------------------

    def convert_file(self, input_path: str, output_path: str) -> dict:
        """
        Convert a chemistry file between formats.

        Supported: CDX, CDXML, MOL, SDF, RXN, SMILES.
        Format determined by file extension. CDXML output is post-processed
        to ACS style.

        Returns:
            The server's response dict.

        Raises:
            RuntimeError: If the server reports a failure.
        """
        result = self._checked_call(
            "convert",
            "Conversion failed",
            input=os.path.abspath(input_path),
            output=os.path.abspath(output_path),
        )
        # Post-process CDXML output for ACS style
        if os.fspath(output_path).lower().endswith(".cdxml"):
            self._postprocess_cdxml(output_path)
        return result

    def name_to_cdxml(self, name: str, output: Optional[str] = None) -> str:
        """
        Convert a chemical name to CDXML string.

        Args:
            name: Chemical name (e.g. "morpholine", "benzene").
            output: Optional file path to write CDXML.

        Returns:
            CDXML string with ACS 1996 style.

        Raises:
            RuntimeError: If the server reports a failure.
        """
        result = self._checked_call(
            "name_to_cdxml",
            f"Name resolution failed: {name}",
            name=name,
            output=os.path.abspath(output) if output else None,
        )
        cdxml = _inject_acs_style(result["cdxml"])
        if output:
            # Overwrite what the server wrote with the restyled document.
            Path(output).write_text(cdxml, encoding="utf-8")
        return cdxml

    def smiles_to_cdxml(self, smiles: str, output: Optional[str] = None) -> str:
        """
        Convert a SMILES string to CDXML.

        Args:
            smiles: SMILES string (e.g. "C1COCCN1").
            output: Optional file path to write CDXML.

        Returns:
            CDXML string with ACS 1996 style.

        Raises:
            RuntimeError: If the server reports a failure.
        """
        result = self._checked_call(
            "smiles_to_cdxml",
            f"SMILES parse failed: {smiles}",
            smiles=smiles,
            output=os.path.abspath(output) if output else None,
        )
        cdxml = _inject_acs_style(result["cdxml"])
        if output:
            Path(output).write_text(cdxml, encoding="utf-8")
        return cdxml

    def cleanup(self, input_path: str, output: Optional[str] = None) -> str:
        """
        Clean up a structure file — normalize coordinates, bond lengths, etc.

        Args:
            input_path: Path to structure file.
            output: Output path (defaults to overwriting input).

        Returns:
            Output file path.

        Raises:
            RuntimeError: If the server reports a failure.
        """
        out = output or input_path
        self._checked_call(
            "cleanup",
            "Cleanup failed",
            input=os.path.abspath(input_path),
            output=os.path.abspath(out),
        )
        if os.fspath(out).lower().endswith(".cdxml"):
            self._postprocess_cdxml(out)
        return out

    def get_name(self, source: str) -> str:
        """Get IUPAC name for a structure file or SMILES string."""
        result = self._checked_call(
            "get_name", "Name lookup failed", source=self._abs_if_file(source)
        )
        return result["name"]

    def get_formula(self, source: str) -> str:
        """Get molecular formula for a structure file or SMILES string."""
        result = self._checked_call(
            "get_formula", "Formula lookup failed", source=self._abs_if_file(source)
        )
        return result["formula"]

    def get_info(self, source: str) -> dict:
        """
        Get full chemical info: name, formula, SMILES, InChI, atom/bond count.

        Works with both structure files and reaction files.
        """
        return self._checked_call(
            "get_info", "Info lookup failed", source=self._abs_if_file(source)
        )

    def contains_substructure(self, target: str, query: str) -> bool:
        """
        Check if target contains query as a substructure.

        Args can be file paths or SMILES strings.
        """
        result = self._checked_call(
            "contains_substructure",
            "Substructure search failed",
            **self._path_or_smiles_args(target=target, query=query),
        )
        return result["contains"]

    def substructure_search(self, target: str, query: str) -> dict:
        """
        Perform atom-by-atom substructure search.

        Returns dict with 'contains' bool and 'maps' list of atom mappings.
        """
        return self._checked_call(
            "substructure_search",
            "Substructure search failed",
            **self._path_or_smiles_args(target=target, query=query),
        )

    def load_reaction(self, source: str, include_cdxml: bool = False) -> dict:
        """
        Load a reaction file and return component information.

        Args:
            source: Path to reaction file (CDX, RXN) or reaction SMILES.
            include_cdxml: If True, include CDXML for each component.

        Returns:
            Dict with 'formula', 'reactants', 'products'.
        """
        args: Dict[str, Any] = {"source": source, "include_cdxml": include_cdxml}
        if os.path.isfile(source):
            args["source"] = os.path.abspath(source)
        else:
            args["format"] = "smiles"
        return self._checked_call("load_reaction", "Reaction load failed", **args)

    def largest_common_substructure(self, mol1: str, mol2: str) -> dict:
        """
        Find the largest common substructure between two molecules.

        Args can be file paths or SMILES strings.

        Returns:
            Dict with 'atom_map' and 'common_atom_count'.
        """
        return self._checked_call(
            "largest_common_substructure",
            "LCS failed",
            **self._path_or_smiles_args(mol1=mol1, mol2=mol2),
        )

    def overlay(self, source: str, target: str,
                source_format: Optional[str] = None,
                target_format: Optional[str] = None) -> Tuple[str, bool]:
        """
        Overlay (2D-align) a molecule onto a reference molecule.

        Args:
            source: File path or CDXML string of the molecule to align.
            target: File path or CDXML string of the reference molecule.
            source_format: Format hint for source (optional).
            target_format: Format hint for target (optional).

        Returns:
            Tuple of (aligned_cdxml_string, success_bool).
        """
        args = {
            "source": self._abs_if_file(source),
            "target": self._abs_if_file(target),
        }
        if source_format:
            args["source_format"] = source_format
        if target_format:
            args["target_format"] = target_format
        result = self._checked_call("overlay", "Overlay failed", **args)
        cdxml = _inject_acs_style(result["aligned_cdxml"])
        return cdxml, result.get("success", False)

    def substructure_align(self, query: str, target: str,
                           query_format: Optional[str] = None,
                           target_format: Optional[str] = None) -> Optional[List]:
        """
        Align a small molecule (query) to its substructure match in a
        larger molecule (target).

        Uses ChemScript to confirm the substructure match and to obtain MOL
        blocks and CDXML, then RDKit for atom-index mapping (avoiding
        ChemScript naming bugs).

        Returns a list with one entry per query atom (in the query's atom
        order): the (x, y) position of the matched target atom, or None when
        that atom has no usable position. Returns None if no substructure
        match was found, RDKit is unavailable, or the server data could not
        be parsed.
        """
        from xml.etree import ElementTree as ET

        args = {
            "query": self._abs_if_file(query),
            "target": self._abs_if_file(target),
        }
        if query_format:
            args["query_format"] = query_format
        if target_format:
            args["target_format"] = target_format
        result = self._call("substructure_align", **args)
        if not result.get("ok") or not result.get("contains"):
            return None

        target_cdxml = result.get("target_cdxml", "")
        target_mol_block = result.get("target_mol", "")
        query_mol_block = result.get("query_mol", "")
        if not target_cdxml or not target_mol_block or not query_mol_block:
            return None

        # --- Use RDKit for substructure matching via MOL blocks ---
        # MOL block atom order is guaranteed to match ChemScript's iteration
        # order (both come from the same StructureData), so RDKit atom indices
        # from the MOL block = ChemScript atom indices = CDXML <n> order.
        try:
            from rdkit import Chem
        except ImportError:
            return None

        # ChemScript MOL blocks may have aromatic bonds that RDKit can't
        # kekulize, so parse without sanitization then sanitize everything
        # except kekulization.
        relaxed = (
            Chem.SanitizeFlags.SANITIZE_ALL
            ^ Chem.SanitizeFlags.SANITIZE_KEKULIZE
        )
        target_mol = Chem.MolFromMolBlock(target_mol_block, sanitize=False)
        query_mol = Chem.MolFromMolBlock(query_mol_block, sanitize=False)
        if target_mol is None or query_mol is None:
            return None
        try:
            Chem.SanitizeMol(target_mol, relaxed)
            try:
                Chem.SanitizeMol(query_mol)
            except ValueError:
                # Full sanitization failed (RDKit's sanitize errors derive
                # from ValueError): retry on a fresh parse with the same
                # relaxed flags used for the target.
                query_mol = Chem.MolFromMolBlock(query_mol_block, sanitize=False)
                if query_mol is None:
                    return None
                Chem.SanitizeMol(query_mol, relaxed)
        except ValueError:
            return None

        # match[i] = target atom index that corresponds to query atom i
        match = target_mol.GetSubstructMatch(query_mol)
        if not match:
            return None

        # --- Parse target CDXML to extract atom positions ---
        # Drop the DOCTYPE and control characters ElementTree would reject.
        clean = re.sub(r'<!DOCTYPE[^>]*>', '', target_cdxml)
        clean = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', clean)
        try:
            troot = ET.fromstring(clean)
        except ET.ParseError:
            return None

        target_positions = []
        for node in troot.iter("n"):
            point = node.get("p", "")
            try:
                px, py = point.split()[:2]
                target_positions.append((float(px), float(py)))
            except ValueError:
                # Missing, short or non-numeric "p" attribute.
                target_positions.append(None)

        # Build positions for each query atom
        return [
            target_positions[ti] if ti < len(target_positions) else None
            for ti in match
        ]

    def write_data(self, source: str, target_format: str,
                   source_format: Optional[str] = None) -> str:
        """
        Convert a structure to a specific format string.

        Args:
            source: File path or data string.
            target_format: Output format (smiles, inchi, mol, cdxml, name, etc.).
            source_format: Input format hint (optional).

        Returns:
            Data string in target format.
        """
        args = {
            "target_format": target_format,
            "source": self._abs_if_file(source),
        }
        if source_format:
            args["source_format"] = source_format
        result = self._checked_call("write_data", "WriteData failed", **args)
        return result["data"]

    def mimetypes(self) -> List[str]:
        """List all supported mimetypes."""
        result = self._call("mimetypes")
        return result.get("mimetypes", [])

    # -----------------------------------------------------------------------
    # Internal helpers
    # -----------------------------------------------------------------------

    def _postprocess_cdxml(self, path: str):
        """Read a CDXML file written by ChemScript and inject ACS style.

        Best-effort: any failure is reported on stderr and otherwise ignored,
        leaving the file as ChemScript wrote it.
        """
        try:
            text = Path(path).read_text(encoding="utf-8")
            text = _inject_acs_style(text)
            Path(path).write_text(text, encoding="utf-8")
        except Exception as e:
            print(f"Warning: CDXML post-processing failed: {e}", file=sys.stderr)
605
+
606
+
607
+ # ---------------------------------------------------------------------------
608
+ # CLI interface
609
+ # ---------------------------------------------------------------------------
610
+
611
+
612
def _cli_configure(args) -> int:
    """Auto-detect ChemDraw version and save config.

    Records the 32-bit Python path and the first ChemScript assembly found,
    printing what was detected, then writes the config file.

    Args:
        args: Parsed CLI arguments (not used by this command).

    Returns:
        Always 0; missing components are reported as warnings.
    """
    cfg = _load_config()

    # Detect python32 path
    py32 = cfg.get("python32") or _find_python32()
    if py32:
        cfg["python32"] = py32
        print(f" 32-bit Python: {py32}")
    else:
        print(" WARNING: 32-bit Python (chemscript32 env) not found.")
        print(" Create it with: set CONDA_SUBDIR=win-32 && conda create -n chemscript32 python=3.10")

    # Detect ChemDraw / ChemScript DLL
    # Search order:
    # 1. Local chemscript_dlls/ directory (portable deployment with bundled DLLs)
    # 2. Standard PerkinElmerInformatics install paths (ChemOffice2016, then 2015)
    # 3. CambridgeSoft install paths (older naming convention)
    versions = [
        ("ChemOffice2016", "CambridgeSoft.ChemScript16"),
        ("ChemOffice2015", "CambridgeSoft.ChemScript15"),
    ]
    script_dir = os.path.dirname(os.path.abspath(__file__))
    local_dll_dir = os.path.join(script_dir, "chemscript_dlls")
    prog_x86 = os.environ.get("PROGRAMFILES(X86)", r"C:\Program Files (x86)")

    # (directory, assembly, bundled?) in priority order.
    candidates = [(local_dll_dir, assembly, True) for _, assembly in versions]
    for vendor in ("PerkinElmerInformatics", "CambridgeSoft"):
        for version_dir, assembly in versions:
            dll_dir = os.path.join(
                prog_x86, vendor, version_dir, "ChemScript", "Lib", "Net"
            )
            candidates.append((dll_dir, assembly, False))

    found = False
    for dll_dir, assembly, bundled in candidates:
        dll_file = os.path.join(dll_dir, f"{assembly}.dll")
        if not os.path.isfile(dll_file):
            continue
        cfg["dll_dir"] = dll_dir
        cfg["assembly"] = assembly
        suffix = " (bundled)" if bundled else ""
        print(f" ChemScript DLL: {dll_file}{suffix}")
        found = True
        break

    if not found:
        print(" WARNING: ChemScript DLL not found.")
        print(f" Searched: {local_dll_dir}")
        print(" Searched: Program Files (x86)\\PerkinElmerInformatics\\ChemOffice20XX")
        print(" Searched: Program Files (x86)\\CambridgeSoft\\ChemOffice20XX")
        print(" Either install ChemDraw with ChemScript, or copy the DLLs to:")
        print(f" {local_dll_dir}")
        print(" Required: CambridgeSoft.ChemScript16.dll + ChemScript160.dll")

    _save_config(cfg)
    print(f"\n Config saved to: {CONFIG_PATH}")
    return 0
679
+
680
+
681
def _cli_ping(args, cs: "ChemScriptBridge") -> int:
    """Test that the ChemScript bridge is working.

    Args:
        args: Parsed CLI arguments (not used by this command).
        cs: Bridge whose server is pinged.

    Returns:
        0 if the server answered with ``ok``, otherwise 1.
    """
    result = cs._call("ping")
    if result.get("ok"):
        print("ChemScript bridge OK: server is responding")
        return 0
    print(f"ChemScript bridge FAILED: {result.get('error', 'unknown')}", file=sys.stderr)
    return 1
690
+
691
+
692
def _cli_convert(args, cs: "ChemScriptBridge") -> int:
    """Convert ``args.input`` to ``args.output`` and report on stderr.

    Returns:
        0 on success; conversion errors propagate from ``cs.convert_file``.
    """
    result = cs.convert_file(args.input, args.output)
    kind = result.get("type", "unknown")
    formula = result.get("formula", "?")
    print(f"Converted ({kind}): {formula}", file=sys.stderr)
    print(f"Written to {args.output}", file=sys.stderr)
    return 0
699
+
700
+
701
def _cli_name2struct(args, cs: "ChemScriptBridge") -> int:
    """Resolve ``args.name`` to CDXML.

    With ``args.output == "-"`` the CDXML is printed to stdout; otherwise it
    is written to that path and a note goes to stderr.

    Returns:
        0 on success; resolution errors propagate from ``cs.name_to_cdxml``.
    """
    output = args.output
    if output == "-":
        print(cs.name_to_cdxml(args.name))
    else:
        cs.name_to_cdxml(args.name, output=output)
        print(f"Written to {output}", file=sys.stderr)
    return 0
710
+
711
+
712
def _cli_smiles2struct(args, cs: "ChemScriptBridge") -> int:
    """Convert ``args.smiles`` to CDXML.

    With ``args.output == "-"`` the CDXML is printed to stdout; otherwise it
    is written to that path and a note goes to stderr.

    Returns:
        0 on success; parse errors propagate from ``cs.smiles_to_cdxml``.
    """
    output = args.output
    if output == "-":
        print(cs.smiles_to_cdxml(args.smiles))
    else:
        cs.smiles_to_cdxml(args.smiles, output=output)
        print(f"Written to {output}", file=sys.stderr)
    return 0
721
+
722
+
723
def _cli_cleanup(args, cs: ChemScriptBridge) -> int:
    """Run ``cs.cleanup`` on ``args.input``.

    The result is directed to ``args.output`` when that is truthy; otherwise
    the input path itself is passed as the output, i.e. the file is
    overwritten in place.  A confirmation line is printed to stderr.

    Returns:
        Always 0; failures surface as exceptions raised by the bridge call.
    """
    if args.output:
        target = args.output
    else:
        target = args.input

    cs.cleanup(args.input, output=target)
    print(f"Cleaned up → {target}", file=sys.stderr)
    return 0
728
+
729
+
730
def _cli_info(args, cs: ChemScriptBridge) -> int:
    """Print a human-readable summary of ``cs.get_info(args.source)``.

    The layout depends on the ``type`` entry of the returned mapping:

    * ``"structure"`` -- name, formula, SMILES, InChI (only when truthy),
      atom count and bond count.
    * ``"reaction"`` -- overall formula followed by numbered reactant and
      product listings.
    * anything else -- the whole mapping dumped as indented JSON.

    Missing entries are shown as ``?`` (``(unknown)`` for names) instead of
    raising ``KeyError``; this now also holds for the per-component
    ``formula`` and ``smiles`` fields of a reaction.

    Args:
        args: Parsed CLI namespace; only ``args.source`` is read.
        cs: Bridge object providing ``get_info``.

    Returns:
        Always 0; failures surface as exceptions raised by the bridge call.
    """
    info = cs.get_info(args.source)
    kind = info.get("type")

    if kind == "structure":
        print("Type: structure")
        print(f"Name: {info.get('name', '(unknown)')}")
        print(f"Formula: {info.get('formula', '?')}")
        print(f"SMILES: {info.get('smiles', '?')}")
        if info.get("inchi"):
            print(f"InChI: {info['inchi']}")
        print(f"Atoms: {info.get('atom_count', '?')}")
        print(f"Bonds: {info.get('bond_count', '?')}")
    elif kind == "reaction":
        print("Type: reaction")
        print(f"Formula: {info.get('formula', '?')}")
        # Reactants and products share one layout, so drive both from data.
        for heading, key in (("Reactants", "reactants"), ("Products", "products")):
            # `or []` also covers an entry that is present but None.
            components = info.get(key) or []
            print(f"{heading} ({len(components)}):")
            for i, component in enumerate(components, 1):
                name = component.get("name") or "(unknown)"
                formula = component.get("formula", "?")
                smiles = component.get("smiles", "?")
                print(f" {i}. {formula} - {name} [{smiles}]")
    else:
        print(json.dumps(info, indent=2))
    return 0
755
+
756
+
757
def _cli_search(args, cs: ChemScriptBridge) -> int:
    """Report whether ``args.query`` occurs as a substructure of ``args.target``.

    Without ``args.atom_map`` only the value returned by
    ``cs.contains_substructure`` is printed.  With it,
    ``cs.substructure_search`` is used instead: its ``contains`` entry is
    printed, followed by every mapping in its ``maps`` entry, one
    ``key -> value`` line per item.

    Returns:
        Always 0; failures surface as exceptions raised by the bridge call.
    """
    if not args.atom_map:
        found = cs.contains_substructure(args.target, args.query)
        print(f"Contains substructure: {found}")
        return 0

    outcome = cs.substructure_search(args.target, args.query)
    print(f"Contains substructure: {outcome['contains']}")

    mappings = outcome["maps"]
    if mappings:
        for number, mapping in enumerate(mappings, 1):
            print(f" Map {number}:")
            for source_atom, matched_atom in mapping.items():
                print(f" {source_atom} -> {matched_atom}")
    return 0
770
+
771
+
772
def _cli_reaction(args, cs: ChemScriptBridge) -> int:
    """Load a reaction via ``cs.load_reaction`` and list its components.

    ``args.cdxml`` is forwarded as ``include_cdxml``.  The overall formula
    is printed first, then numbered reactant and product listings.  When
    ``args.json`` is set, the full mapping is additionally dumped as
    indented JSON after the listings.

    Returns:
        Always 0; failures surface as exceptions raised by the bridge call.
    """
    # NOTE: args.list_components (--list) is not read here; the listings
    # below are printed on every invocation.
    info = cs.load_reaction(args.input, include_cdxml=args.cdxml)
    print(f"Reaction: {info['formula']}")

    for heading, key in (("Reactants", "reactants"), ("Products", "products")):
        members = info[key]
        print(f"{heading} ({len(members)}):")
        for position, member in enumerate(members, 1):
            label = member.get("name") or "(unknown)"
            print(f" {position}. {member['formula']} - {label} [{member['smiles']}]")

    if args.json:
        print(json.dumps(info, indent=2))
    return 0
786
+
787
+
788
def _cli_lcs(args, cs: ChemScriptBridge) -> int:
    """Print the largest common substructure of ``args.mol1`` and ``args.mol2``.

    Shows the ``common_atom_count`` entry of the bridge result (0 when
    absent), then one line per ``atom_map`` entry giving its ``common``,
    ``mol1`` and ``mol2`` fields.

    Returns:
        Always 0; failures surface as exceptions raised by the bridge call.
    """
    outcome = cs.largest_common_substructure(args.mol1, args.mol2)

    shared_atoms = outcome.get("common_atom_count", 0)
    print(f"Common atoms: {shared_atoms}")

    pairings = outcome.get("atom_map", [])
    for pairing in pairings:
        print(f" {pairing['common']}: mol1={pairing['mol1']}, mol2={pairing['mol2']}")
    return 0
794
+
795
+
796
def _build_cli_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the ChemScript bridge command line."""
    parser = argparse.ArgumentParser(
        description="ChemScript Bridge — access ChemDraw's chemical intelligence from Python.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent("""\
            Examples:
              %(prog)s convert input.cdx output.cdxml
              %(prog)s name2struct "morpholine" -o morpholine.cdxml
              %(prog)s smiles2struct "C1COCCN1" -o morpholine.cdxml
              %(prog)s cleanup messy.cdxml -o clean.cdxml
              %(prog)s info structure.cdx
              %(prog)s search --target target.cdx --query query.cdx
              %(prog)s reaction input.cdx --list
              %(prog)s lcs "C1(C)CCCC1CCO" "C1CCCC1C"
        """),
    )

    sub = parser.add_subparsers(dest="command", required=True)

    # configure — no ChemScript needed
    sub.add_parser("configure", help="Auto-detect ChemDraw version and save config")

    # ping — test bridge connectivity
    sub.add_parser("ping", help="Test ChemScript bridge connectivity")

    # convert
    p = sub.add_parser("convert", help="Convert between chemical file formats")
    p.add_argument("input", help="Input file (CDX, CDXML, MOL, RXN, etc.)")
    p.add_argument("output", help="Output file (format from extension)")

    # name2struct
    p = sub.add_parser("name2struct", help="Chemical name → CDXML structure")
    p.add_argument("name", help="Chemical name (e.g. 'morpholine')")
    p.add_argument("-o", "--output", default="-", help="Output CDXML file (default: stdout)")

    # smiles2struct
    p = sub.add_parser("smiles2struct", help="SMILES → CDXML structure")
    p.add_argument("smiles", help="SMILES string")
    p.add_argument("-o", "--output", default="-", help="Output CDXML file (default: stdout)")

    # cleanup
    p = sub.add_parser("cleanup", help="Clean up structure coordinates")
    p.add_argument("input", help="Input structure file")
    p.add_argument("-o", "--output", default=None, help="Output file (default: overwrite input)")

    # info
    p = sub.add_parser("info", help="Get chemical info (name, formula, SMILES, etc.)")
    p.add_argument("source", help="Structure/reaction file or SMILES string")

    # search
    p = sub.add_parser("search", help="Substructure search")
    p.add_argument("--target", required=True, help="Target structure (file or SMILES)")
    p.add_argument("--query", required=True, help="Query substructure (file or SMILES)")
    p.add_argument("--atom-map", action="store_true", help="Show atom-by-atom mapping")

    # reaction
    p = sub.add_parser("reaction", help="Extract reaction components")
    p.add_argument("input", help="Reaction file (CDX, RXN) or reaction SMILES")
    p.add_argument("--list", action="store_true", dest="list_components",
                   help="List reactants and products")
    p.add_argument("--cdxml", action="store_true", help="Include CDXML for each component")
    p.add_argument("--json", action="store_true", help="Output full JSON")

    # lcs
    p = sub.add_parser("lcs", help="Largest common substructure")
    p.add_argument("mol1", help="First molecule (file or SMILES)")
    p.add_argument("mol2", help="Second molecule (file or SMILES)")

    return parser


def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point for the ChemScript bridge.

    Args:
        argv: Argument list to parse; ``None`` lets argparse fall back to
            ``sys.argv[1:]``.

    Returns:
        The exit status of the selected sub-command handler, or 1 when the
        bridge cannot be created (``RuntimeError``) or the handler raises.
        In both failure cases ``ERROR: <message>`` is printed to stderr.

    Raises:
        SystemExit: Raised by argparse on invalid usage or ``--help``.
    """
    args = _build_cli_parser().parse_args(argv)

    # 'configure' doesn't need a running ChemScript server
    if args.command == "configure":
        return _cli_configure(args)

    dispatch = {
        "ping": _cli_ping,
        "convert": _cli_convert,
        "name2struct": _cli_name2struct,
        "smiles2struct": _cli_smiles2struct,
        "cleanup": _cli_cleanup,
        "info": _cli_info,
        "search": _cli_search,
        "reaction": _cli_reaction,
        "lcs": _cli_lcs,
    }

    try:
        cs = ChemScriptBridge()
    except RuntimeError as e:
        print(f"ERROR: {e}", file=sys.stderr)
        return 1

    try:
        return dispatch[args.command](args, cs)
    except Exception as e:
        # Top-level CLI boundary: report and turn into an exit status.
        # RuntimeError is a subclass of Exception, so this single handler
        # covers what the two identical handlers did before.
        print(f"ERROR: {e}", file=sys.stderr)
        return 1
    finally:
        cs.close()
898
+
899
+
900
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())