cdxml-toolkit 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. cdxml_toolkit-0.5.0/LICENSE +21 -0
  2. cdxml_toolkit-0.5.0/NOTICE.md +37 -0
  3. cdxml_toolkit-0.5.0/PKG-INFO +318 -0
  4. cdxml_toolkit-0.5.0/README.md +257 -0
  5. cdxml_toolkit-0.5.0/cdxml_toolkit/__init__.py +18 -0
  6. cdxml_toolkit-0.5.0/cdxml_toolkit/_jre/__init__.py +2 -0
  7. cdxml_toolkit-0.5.0/cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  8. cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/__init__.py +35 -0
  9. cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  10. cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  11. cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  12. cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  13. cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  14. cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  15. cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  16. cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  17. cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/extract_nmr.py +47 -0
  18. cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  19. cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  20. cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  21. cdxml_toolkit-0.5.0/cdxml_toolkit/cdxml_builder.py +920 -0
  22. cdxml_toolkit-0.5.0/cdxml_toolkit/cdxml_utils.py +342 -0
  23. cdxml_toolkit-0.5.0/cdxml_toolkit/chemdraw/__init__.py +5 -0
  24. cdxml_toolkit-0.5.0/cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  25. cdxml_toolkit-0.5.0/cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  26. cdxml_toolkit-0.5.0/cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  27. cdxml_toolkit-0.5.0/cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  28. cdxml_toolkit-0.5.0/cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  29. cdxml_toolkit-0.5.0/cdxml_toolkit/constants.py +304 -0
  30. cdxml_toolkit-0.5.0/cdxml_toolkit/coord_normalizer.py +438 -0
  31. cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  32. cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  33. cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  34. cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  35. cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  36. cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  37. cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  38. cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  39. cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  40. cdxml_toolkit-0.5.0/cdxml_toolkit/image/__init__.py +15 -0
  41. cdxml_toolkit-0.5.0/cdxml_toolkit/image/reaction_from_image.py +2103 -0
  42. cdxml_toolkit-0.5.0/cdxml_toolkit/image/structure_from_image.py +1711 -0
  43. cdxml_toolkit-0.5.0/cdxml_toolkit/layout/__init__.py +5 -0
  44. cdxml_toolkit-0.5.0/cdxml_toolkit/layout/alignment.py +1642 -0
  45. cdxml_toolkit-0.5.0/cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  46. cdxml_toolkit-0.5.0/cdxml_toolkit/layout/scheme_merger.py +2260 -0
  47. cdxml_toolkit-0.5.0/cdxml_toolkit/mcp_server/__init__.py +0 -0
  48. cdxml_toolkit-0.5.0/cdxml_toolkit/mcp_server/__main__.py +5 -0
  49. cdxml_toolkit-0.5.0/cdxml_toolkit/mcp_server/server.py +1567 -0
  50. cdxml_toolkit-0.5.0/cdxml_toolkit/naming/__init__.py +6 -0
  51. cdxml_toolkit-0.5.0/cdxml_toolkit/naming/aligned_namer.py +2342 -0
  52. cdxml_toolkit-0.5.0/cdxml_toolkit/naming/mol_builder.py +3722 -0
  53. cdxml_toolkit-0.5.0/cdxml_toolkit/naming/name_decomposer.py +2843 -0
  54. cdxml_toolkit-0.5.0/cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  55. cdxml_toolkit-0.5.0/cdxml_toolkit/office/__init__.py +5 -0
  56. cdxml_toolkit-0.5.0/cdxml_toolkit/office/doc_from_template.py +722 -0
  57. cdxml_toolkit-0.5.0/cdxml_toolkit/office/ole_embedder.py +808 -0
  58. cdxml_toolkit-0.5.0/cdxml_toolkit/office/ole_extractor.py +272 -0
  59. cdxml_toolkit-0.5.0/cdxml_toolkit/perception/__init__.py +10 -0
  60. cdxml_toolkit-0.5.0/cdxml_toolkit/perception/compound_search.py +229 -0
  61. cdxml_toolkit-0.5.0/cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  62. cdxml_toolkit-0.5.0/cdxml_toolkit/perception/rdf_parser.py +664 -0
  63. cdxml_toolkit-0.5.0/cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  64. cdxml_toolkit-0.5.0/cdxml_toolkit/perception/reaction_parser.py +2150 -0
  65. cdxml_toolkit-0.5.0/cdxml_toolkit/perception/scheme_reader.py +2948 -0
  66. cdxml_toolkit-0.5.0/cdxml_toolkit/perception/scheme_refine.py +1404 -0
  67. cdxml_toolkit-0.5.0/cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  68. cdxml_toolkit-0.5.0/cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  69. cdxml_toolkit-0.5.0/cdxml_toolkit/rdkit_utils.py +605 -0
  70. cdxml_toolkit-0.5.0/cdxml_toolkit/render/__init__.py +17 -0
  71. cdxml_toolkit-0.5.0/cdxml_toolkit/render/auto_layout.py +229 -0
  72. cdxml_toolkit-0.5.0/cdxml_toolkit/render/compact_parser.py +632 -0
  73. cdxml_toolkit-0.5.0/cdxml_toolkit/render/parser.py +706 -0
  74. cdxml_toolkit-0.5.0/cdxml_toolkit/render/render_scheme.py +267 -0
  75. cdxml_toolkit-0.5.0/cdxml_toolkit/render/renderer.py +2387 -0
  76. cdxml_toolkit-0.5.0/cdxml_toolkit/render/schema.py +90 -0
  77. cdxml_toolkit-0.5.0/cdxml_toolkit/render/scheme_maker.py +1043 -0
  78. cdxml_toolkit-0.5.0/cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  79. cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/__init__.py +13 -0
  80. cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/cas_resolver.py +430 -0
  81. cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  82. cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/condensed_formula.py +493 -0
  83. cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/jre_manager.py +195 -0
  84. cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  85. cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/reagent_db.py +285 -0
  86. cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/superatom_data.json +2856 -0
  87. cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/superatom_table.py +146 -0
  88. cdxml_toolkit-0.5.0/cdxml_toolkit/text_formatting.py +298 -0
  89. cdxml_toolkit-0.5.0/cdxml_toolkit.egg-info/PKG-INFO +318 -0
  90. cdxml_toolkit-0.5.0/cdxml_toolkit.egg-info/SOURCES.txt +110 -0
  91. cdxml_toolkit-0.5.0/cdxml_toolkit.egg-info/dependency_links.txt +1 -0
  92. cdxml_toolkit-0.5.0/cdxml_toolkit.egg-info/entry_points.txt +17 -0
  93. cdxml_toolkit-0.5.0/cdxml_toolkit.egg-info/requires.txt +50 -0
  94. cdxml_toolkit-0.5.0/cdxml_toolkit.egg-info/top_level.txt +1 -0
  95. cdxml_toolkit-0.5.0/pyproject.toml +95 -0
  96. cdxml_toolkit-0.5.0/setup.cfg +4 -0
  97. cdxml_toolkit-0.5.0/tests/test_builder.py +122 -0
  98. cdxml_toolkit-0.5.0/tests/test_cdxml_utils.py +247 -0
  99. cdxml_toolkit-0.5.0/tests/test_condensed_formula.py +292 -0
  100. cdxml_toolkit-0.5.0/tests/test_constants.py +208 -0
  101. cdxml_toolkit-0.5.0/tests/test_merge_yaml.py +599 -0
  102. cdxml_toolkit-0.5.0/tests/test_mol_builder.py +858 -0
  103. cdxml_toolkit-0.5.0/tests/test_rdkit_utils.py +367 -0
  104. cdxml_toolkit-0.5.0/tests/test_reaction_parser.py +958 -0
  105. cdxml_toolkit-0.5.0/tests/test_reagent_db.py +345 -0
  106. cdxml_toolkit-0.5.0/tests/test_scheme_maker.py +391 -0
  107. cdxml_toolkit-0.5.0/tests/test_scheme_reader.py +693 -0
  108. cdxml_toolkit-0.5.0/tests/test_smoke.py +254 -0
  109. cdxml_toolkit-0.5.0/tests/test_smoke_extended.py +804 -0
  110. cdxml_toolkit-0.5.0/tests/test_spatial_assignment.py +608 -0
  111. cdxml_toolkit-0.5.0/tests/test_superatom_table.py +195 -0
  112. cdxml_toolkit-0.5.0/tests/test_text_formatting.py +143 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Hiu Fung Kevin Lee
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,37 @@
1
+ # Third-Party Notices
2
+
3
+ This project includes data and code derived from the following sources.
4
+
5
+ ## ChemScanner (data source)
6
+
7
+ The files `chemscanner_abbreviations.json` and `superatom_data.json` contain
8
+ chemical abbreviation-to-SMILES mappings derived from data files in the
9
+ [ChemScanner](https://github.com/ComPlat/chem_scanner) project, which is
10
+ licensed under AGPL-3.0.
11
+
12
+ The original data was extracted from ChemScanner's `abbreviations.yaml`,
13
+ `solvents.yaml`, and `superatom.txt` configuration files. The extraction
14
+ process involved:
15
+ - Parsing the original YAML/TXT files
16
+ - Validating all SMILES strings with RDKit
17
+ - Canonicalizing SMILES to a standard form
18
+ - Deduplicating entries by canonical SMILES
19
+ - Reorganizing into a different JSON schema with alias support
20
+
21
+ The resulting JSON files contain factual chemical information (abbreviation
22
+ labels mapped to their corresponding SMILES representations). Chemical
23
+ abbreviation-to-structure mappings are scientific facts — "OTs" represents
24
+ a tosylate group regardless of which source documents it.
25
+
26
+ ## RDKit Built-in Abbreviations (runtime data source)
27
+
28
+ At runtime, `superatom_table.py` supplements its JSON-backed lookup table
29
+ with abbreviation data from RDKit's `rdAbbreviations.GetDefaultAbbreviations()`
30
+ (approximately 40 entries). RDKit is licensed under the
31
+ [BSD 3-Clause License](https://github.com/rdkit/rdkit/blob/master/license.txt).
32
+
33
+ ## Reagent Database
34
+
35
+ The file `reagent_abbreviations.json` is an original curated database of
36
+ approximately 172 reagent entries commonly used in medicinal chemistry,
37
+ compiled by the project author. It is licensed under MIT as part of this project.
@@ -0,0 +1,318 @@
1
+ Metadata-Version: 2.4
2
+ Name: cdxml-toolkit
3
+ Version: 0.5.0
4
+ Summary: Python toolkit for ChemDraw CDXML reaction scheme processing, layout, and rendering.
5
+ Author: Hiu Fung Kevin Lee
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/leehiufung911/cdxml-toolkit
8
+ Project-URL: Repository, https://github.com/leehiufung911/cdxml-toolkit
9
+ Project-URL: Issues, https://github.com/leehiufung911/cdxml-toolkit/issues
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Topic :: Scientific/Engineering :: Chemistry
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Operating System :: Microsoft :: Windows
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ License-File: NOTICE.md
24
+ Requires-Dist: lxml>=4.6
25
+ Provides-Extra: rdkit
26
+ Requires-Dist: rdkit>=2023.03; extra == "rdkit"
27
+ Provides-Extra: chemdraw
28
+ Requires-Dist: pywin32>=300; extra == "chemdraw"
29
+ Provides-Extra: image
30
+ Requires-Dist: opencv-python>=4.5; extra == "image"
31
+ Requires-Dist: Pillow>=9.0; extra == "image"
32
+ Provides-Extra: office
33
+ Requires-Dist: python-pptx>=0.6; extra == "office"
34
+ Requires-Dist: python-docx>=0.8; extra == "office"
35
+ Requires-Dist: olefile>=0.46; extra == "office"
36
+ Provides-Extra: yaml
37
+ Requires-Dist: pyyaml>=5.4; extra == "yaml"
38
+ Provides-Extra: mcp
39
+ Requires-Dist: mcp>=1.0; extra == "mcp"
40
+ Requires-Dist: pyyaml>=5.4; extra == "mcp"
41
+ Provides-Extra: analysis
42
+ Requires-Dist: pdfplumber>=0.7; extra == "analysis"
43
+ Provides-Extra: chemscript
44
+ Requires-Dist: pythonnet>=3.0; extra == "chemscript"
45
+ Provides-Extra: decimer
46
+ Requires-Dist: tensorflow>=2.16; extra == "decimer"
47
+ Requires-Dist: decimer>=2.6; extra == "decimer"
48
+ Requires-Dist: pymupdf>=1.20; extra == "decimer"
49
+ Provides-Extra: ocr
50
+ Requires-Dist: pytesseract>=0.3; extra == "ocr"
51
+ Provides-Extra: opsin
52
+ Requires-Dist: py2opsin>=1.0; extra == "opsin"
53
+ Provides-Extra: all
54
+ Requires-Dist: cdxml-toolkit[analysis,chemdraw,chemscript,image,mcp,office,opsin,rdkit,yaml]; extra == "all"
55
+ Provides-Extra: full
56
+ Requires-Dist: cdxml-toolkit[all,decimer,ocr]; extra == "full"
57
+ Provides-Extra: dev
58
+ Requires-Dist: cdxml-toolkit[all]; extra == "dev"
59
+ Requires-Dist: pytest>=7.0; extra == "dev"
60
+ Dynamic: license-file
61
+
62
+ # cdxml-toolkit
63
+
64
+ Chemistry office automation toolkit with an MCP (Model Context Protocol) server. Lets LLM agents draw reaction schemes, parse ELN exports, analyze LCMS data, and produce publication-ready ChemDraw (CDXML) output.
65
+
66
+ The goal: any chemist with a consumer GPU can run a local LLM agent that helps with routine chemistry office tasks. The toolkit provides 15 grounded, validated chemistry tools that LLMs call via MCP — the agent reasons about chemistry while the tools handle SMILES resolution, 2D coordinate generation, and CDXML layout.
67
+
68
+ > Built and tested with Claude Code (Opus 4.6). I directed the design and architecture; Claude did the implementation. I'm a PhD organic chemist, not a programmer — this project wouldn't exist without Claude Code, and I thank Anthropic.
69
+
70
+ ## Quick start: MCP server
71
+
72
+ The primary interface is the MCP server. Connect it to any MCP-compatible agent (Claude Desktop, opencode, qwen-agent, etc.) and just chat naturally: "Draw deucravacitinib", "Help me complete my lab book", "Extract structures from this image".
73
+
74
+ ### Claude Desktop
75
+
76
+ Edit `%APPDATA%\Claude\claude_desktop_config.json` (Windows) or `~/Library/Application Support/Claude/claude_desktop_config.json` (Mac):
77
+
78
+ ```json
79
+ {
80
+ "mcpServers": {
81
+ "cdxml-toolkit": {
82
+ "command": "python",
83
+ "args": ["-m", "cdxml_toolkit.mcp_server"]
84
+ }
85
+ }
86
+ }
87
+ ```
88
+
89
+ ### opencode (for OpenRouter / local models)
90
+
91
+ Create `opencode.json`:
92
+
93
+ ```json
94
+ {
95
+ "provider": {
96
+ "openrouter": {
97
+ "models": { "qwen/qwen3.5-27b": {} }
98
+ }
99
+ },
100
+ "mcp": {
101
+ "cdxml-toolkit": {
102
+ "type": "local",
103
+ "command": ["python", "-m", "cdxml_toolkit.mcp_server"],
104
+ "enabled": true,
105
+ "timeout": 120000
106
+ }
107
+ }
108
+ }
109
+ ```
110
+
111
+ ### Verify it works
112
+
113
+ ```
114
+ > Use cdxml-toolkit. Resolve "aspirin", then draw it.
115
+ ```
116
+
117
+ Expected: two tool calls (`resolve_name`, then `draw_molecule`) that produce an aspirin CDXML file.
118
+
119
+ ## MCP tools (15)
120
+
121
+ ### Chemistry resolution
122
+ | Tool | Description |
123
+ |------|-------------|
124
+ | `resolve_name` | Name/abbreviation/CAS/formula to rich molecule JSON (5-tier: reagent DB, condensed formula, ChemScript, OPSIN, PubChem) |
125
+ | `modify_molecule` | 6 operations: analyze, name_surgery, smarts, set_smiles, set_name, reaction. 162 named reaction templates. Returns MCS-based structural diffs. |
126
+
127
+ ### Structure rendering
128
+ | Tool | Description |
129
+ |------|-------------|
130
+ | `draw_molecule` | Single molecule to CDXML |
131
+ | `render_scheme` | YAML/compact text/reaction JSON to publication-ready CDXML. Forgiving parser handles common LLM YAML mistakes. |
132
+
133
+ ### Perception (reading existing chemistry)
134
+ | Tool | Description |
135
+ |------|-------------|
136
+ | `parse_reaction` | ELN exports (CDXML/CDX/CSV/RXN) to semantic JSON with species, roles, SMILES, equivalents |
137
+ | `summarize_reaction` | Context-efficient view of reaction JSON (select only the fields you need) |
138
+ | `extract_structures_from_image` | Image to SMILES + confidence scores via DECIMER neural network |
139
+ | `parse_scheme` | CDXML scheme to structured species/steps/topology JSON |
140
+
141
+ ### Analysis
142
+ | Tool | Description |
143
+ |------|-------------|
144
+ | `parse_analysis_file` | LCMS (Waters/manual) or NMR (MestReNova) PDF to structured peak data |
145
+ | `format_lab_entry` | Structured entry dicts to formatted lab book text. Re-reads LCMS PDFs for exact numbers. |
146
+
147
+ ### Office integration
148
+ | Tool | Description |
149
+ |------|-------------|
150
+ | `extract_cdxml_from_office` | Pull embedded ChemDraw OLE objects from PPTX/DOCX |
151
+ | `embed_cdxml_in_office` | Inject CDXML as editable ChemDraw OLE into PPTX/DOCX |
152
+ | `convert_cdx_cdxml` | Bidirectional CDX/CDXML conversion |
153
+ | `search_compound` | Find a molecule across experiment directories by SMILES similarity |
154
+ | `render_to_png` | CDXML to PNG via ChemDraw COM |
155
+
156
+ ## Design principles
157
+
158
+ **Never trust LLM-generated SMILES.** The agent always goes through `resolve_name` to get grounded SMILES from databases. Direct SMILES generation is the #1 source of chemistry hallucination.
159
+
160
+ **Verify every transformation.** `modify_molecule` returns aligned IUPAC name diffs and MCS-based molecular diffs after every edit. The agent can confirm the transformation is correct.
161
+
162
+ **Never flood the agent.** Large outputs (CDXML, JSON) always write to files and return `{ok: true, output_path: "...", size: 23456}`. The agent never gets 30KB of XML in its context window.
163
+
164
+ **Forgiving inputs.** The YAML parser accepts 9+ common LLM mistakes (inline structures, `substrates` as alias for `structures`, text as string not list, bare SMILES, `above_arrow` as list/string). Input parameters accept bare SMILES strings, stringified JSON arrays, and fuzzy operation names.
165
+
166
+ **Actionable errors.** Every error tells the agent what to do instead: "Did you mean: BOC_deprotection?", not "KeyError".
167
+
168
+ **Progressive discovery.** Call any tool with no arguments to get usage examples and schema reference.
169
+
170
+ ## Installation
171
+
172
+ **Prerequisites:** Windows with ChemDraw (ChemOffice 2015+) installed. ChemScript (which comes with ChemOffice) is optional but recommended.
173
+
174
+ ```bash
175
+ # From GitHub — recommended install (includes RDKit, MCP server, ChemDraw COM,
176
+ # Office support, PDF parsing, image processing, and ChemScript bridge)
177
+ pip install "cdxml-toolkit[all] @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
178
+
179
+ # With DECIMER neural image extraction (extract_structures_from_image)
180
+ pip install "cdxml-toolkit[all,decimer] @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
181
+
182
+ # Development (editable install)
183
+ git clone https://github.com/leehiufung911/cdxml-toolkit.git
184
+ cd cdxml-toolkit
185
+ pip install -e ".[dev]"
186
+ ```
187
+
188
+ **Required:** `lxml>=4.6`. **Recommended:** `rdkit>=2023.03` (needed for scheme rendering).
189
+
190
+ ### Extras
191
+
192
+ | Extra | What it includes | Notes |
193
+ |-------|-----------------|-------|
194
+ | `[all]` | RDKit, pywin32, image, Office, YAML, PDF analysis, MCP server, pythonnet, py2opsin | **Use this.** Everything most users need. |
195
+ | `[decimer]` | TensorFlow, DECIMER, PyMuPDF | Neural image-to-SMILES. Adds ~1 GB. |
196
+ | `[full]` | `[all]` + `[decimer]` + `[ocr]` | Everything pip-installable. |
197
+ | `[rdkit]` | RDKit only | Minimal install for scripting. |
198
+ | `[mcp]` | MCP server + PyYAML | MCP server only (no RDKit/Office). |
199
+ | `[dev]` | `[all]` + pytest | For running the test suite. |
200
+
201
+ ### Name resolution tiers
202
+
203
+ `resolve_name` tries 5 tiers in order. The first tier to return a valid SMILES wins:
204
+
205
+ | Tier | Source | Deps | Coverage |
206
+ |------|--------|------|----------|
207
+ | 1 | Curated reagent DB (186 entries) | None | Common reagents, catalysts, solvents |
208
+ | 2 | Condensed formula parser | RDKit | Shorthand like PhB(OH)2, Et3N, CF3 |
209
+ | 3 | **ChemScript** (preferred) | ChemDraw + 32-bit Python | Full IUPAC names, any drawable structure |
210
+ | 4 | **OPSIN** (bundled fallback) | py2opsin + bundled JRE | Systematic IUPAC names, offline |
211
+ | 5 | PubChem | Network | CAS numbers, trade names, everything else |
212
+
213
+ ChemScript (Tier 3) is preferred because it handles the widest range of names and integrates with ChemDraw's structure engine. OPSIN (Tier 4) is a fully offline fallback that works out of the box — a JRE is bundled with the package, no Java install needed. If neither is available, PubChem provides a network-based last resort.
214
+
215
+ ### System dependencies (not pip-installable)
216
+
217
+ | Dependency | Required for | Setup |
218
+ |-----------|-------------|-------|
219
+ | **ChemDraw** (ChemOffice 2015+) | CDX conversion, PNG rendering | Must be **closed** before running COM tools. |
220
+ | **ChemScript .NET** | Name resolution Tier 3 (preferred, not required) | Comes with ChemOffice. See setup below. |
221
+ | **Microsoft Office** | OLE embedding into PPTX/DOCX | Optional. Only needed for `embed_cdxml_in_office`. |
222
+
223
+ ### ChemScript setup (optional but recommended)
224
+
225
+ ChemScript gives the best IUPAC name resolution but requires a 32-bit Python environment because the ChemScript .NET DLL is 32-bit. If you skip this, OPSIN handles IUPAC names as a fallback.
226
+
227
+ ```bash
228
+ # 1. Create 32-bit Python env
229
+ set "CONDA_SUBDIR=win-32" && conda create -n chemscript32 python=3.10 -y
230
+
231
+ # 2. Install pythonnet in the 32-bit env
232
+ C:\Users\%USERNAME%\miniconda3\envs\chemscript32\python.exe -m pip install pythonnet
233
+
234
+ # 3. Auto-detect ChemDraw and save config
235
+ cdxml-convert --configure
236
+ ```
237
+
238
+ Step 3 scans for ChemDraw (2015/2016/PerkinElmer paths) and writes `~/.chemscript_config.json`. If your ChemDraw is in a non-standard location, edit the config manually:
239
+
240
+ ```json
241
+ {
242
+ "python32": "C:\\Users\\YOU\\miniconda3\\envs\\chemscript32\\python.exe",
243
+ "dll_dir": "C:\\Program Files (x86)\\PerkinElmerInformatics\\ChemOffice2016\\ChemScript\\Lib\\Net",
244
+ "assembly": "CambridgeSoft.ChemScript16"
245
+ }
246
+ ```
247
+
248
+ ## CLI tools
249
+
250
+ All tools are also available as command-line scripts:
251
+
252
+ | Command | Description |
253
+ |---------|-------------|
254
+ | `cdxml-mcp` | MCP server (primary interface) |
255
+ | `cdxml-parse` | Parse reaction files to JSON |
256
+ | `cdxml-render` | Render JSON/YAML/compact text to CDXML |
257
+ | `cdxml-convert` | CDX/CDXML bidirectional conversion |
258
+ | `cdxml-image` | CDXML to PNG/SVG (ChemDraw COM) |
259
+ | `cdxml-merge` | Merge multiple reaction schemes |
260
+ | `cdxml-layout` | Clean up reaction layout (pure Python) |
261
+ | `cdxml-ole` | Embed CDXML as editable OLE in PPTX/DOCX |
262
+ | `cdxml-lcms` | Parse LCMS PDF reports |
263
+ | `cdxml-nmr` | Extract NMR data from MestReNova PDFs |
264
+ | `cdxml-format-entry` | Format lab book entries |
265
+ | `cdxml-discover` | Discover experiment files in a directory |
266
+
267
+ ## Scheme DSL
268
+
269
+ The renderer accepts three input formats:
270
+
271
+ **YAML** (what agents typically write):
272
+ ```yaml
273
+ layout: sequential
274
+ structures:
275
+ SM:
276
+ smiles: "Brc1ncnc2sccc12"
277
+ Product:
278
+ smiles: "c1nc(N2CCOCC2)c2ccsc2n1"
279
+ steps:
280
+ - substrates: [SM]
281
+ products: [Product]
282
+ above_arrow:
283
+ structures: [Morph]
284
+ below_arrow:
285
+ text: ["Pd2(dba)3", "BINAP", "Cs2CO3", "Dioxane, 105 C"]
286
+ ```
287
+
288
+ **Compact text** ("Mermaid for reactions"):
289
+ ```
290
+ SM: {Brc1ncnc2sccc12}
291
+ SM --> Product{c1nc(N2CCOCC2)c2ccsc2n1}
292
+ above: Morph{C1COCCN1}
293
+ below: "Pd2(dba)3", "BINAP", "Cs2CO3"
294
+ ```
295
+
296
+ **Reaction JSON** (from parse_reaction):
297
+ ```bash
298
+ cdxml-render --from-json reaction.json -o scheme.cdxml
299
+ ```
300
+
301
+ ## Running tests
302
+
303
+ ```bash
304
+ pip install -e ".[dev]"
305
+ pytest tests/ -v
306
+ ```
307
+
308
+ ## License
309
+
310
+ [MIT](LICENSE)
311
+
312
+ ## Attribution
313
+
314
+ See [NOTICE.md](NOTICE.md) for third-party data attribution (ChemScanner, RDKit).
315
+
316
+ ## Author
317
+
318
+ Hiu Fung Kevin Lee ([@leehiufung911](https://github.com/leehiufung911))
@@ -0,0 +1,257 @@
1
+ # cdxml-toolkit
2
+
3
+ Chemistry office automation toolkit with an MCP (Model Context Protocol) server. Lets LLM agents draw reaction schemes, parse ELN exports, analyze LCMS data, and produce publication-ready ChemDraw (CDXML) output.
4
+
5
+ The goal: any chemist with a consumer GPU can run a local LLM agent that helps with routine chemistry office tasks. The toolkit provides 15 grounded, validated chemistry tools that LLMs call via MCP — the agent reasons about chemistry while the tools handle SMILES resolution, 2D coordinate generation, and CDXML layout.
6
+
7
+ > Built and tested with Claude Code (Opus 4.6). I directed the design and architecture; Claude did the implementation. I'm a PhD organic chemist, not a programmer — this project wouldn't exist without Claude Code, and I thank Anthropic.
8
+
9
+ ## Quick start: MCP server
10
+
11
+ The primary interface is the MCP server. Connect it to any MCP-compatible agent (Claude Desktop, opencode, qwen-agent, etc.) and just chat naturally: "Draw deucravacitinib", "Help me complete my lab book", "Extract structures from this image".
12
+
13
+ ### Claude Desktop
14
+
15
+ Edit `%APPDATA%\Claude\claude_desktop_config.json` (Windows) or `~/Library/Application Support/Claude/claude_desktop_config.json` (Mac):
16
+
17
+ ```json
18
+ {
19
+ "mcpServers": {
20
+ "cdxml-toolkit": {
21
+ "command": "python",
22
+ "args": ["-m", "cdxml_toolkit.mcp_server"]
23
+ }
24
+ }
25
+ }
26
+ ```
27
+
28
+ ### opencode (for OpenRouter / local models)
29
+
30
+ Create `opencode.json`:
31
+
32
+ ```json
33
+ {
34
+ "provider": {
35
+ "openrouter": {
36
+ "models": { "qwen/qwen3.5-27b": {} }
37
+ }
38
+ },
39
+ "mcp": {
40
+ "cdxml-toolkit": {
41
+ "type": "local",
42
+ "command": ["python", "-m", "cdxml_toolkit.mcp_server"],
43
+ "enabled": true,
44
+ "timeout": 120000
45
+ }
46
+ }
47
+ }
48
+ ```
49
+
50
+ ### Verify it works
51
+
52
+ ```
53
+ > Use cdxml-toolkit. Resolve "aspirin", then draw it.
54
+ ```
55
+
56
+ Expected: two tool calls (`resolve_name`, then `draw_molecule`) that produce an aspirin CDXML file.
57
+
58
+ ## MCP tools (15)
59
+
60
+ ### Chemistry resolution
61
+ | Tool | Description |
62
+ |------|-------------|
63
+ | `resolve_name` | Name/abbreviation/CAS/formula to rich molecule JSON (5-tier: reagent DB, condensed formula, ChemScript, OPSIN, PubChem) |
64
+ | `modify_molecule` | 6 operations: analyze, name_surgery, smarts, set_smiles, set_name, reaction. 162 named reaction templates. Returns MCS-based structural diffs. |
65
+
66
+ ### Structure rendering
67
+ | Tool | Description |
68
+ |------|-------------|
69
+ | `draw_molecule` | Single molecule to CDXML |
70
+ | `render_scheme` | YAML/compact text/reaction JSON to publication-ready CDXML. Forgiving parser handles common LLM YAML mistakes. |
71
+
72
+ ### Perception (reading existing chemistry)
73
+ | Tool | Description |
74
+ |------|-------------|
75
+ | `parse_reaction` | ELN exports (CDXML/CDX/CSV/RXN) to semantic JSON with species, roles, SMILES, equivalents |
76
+ | `summarize_reaction` | Context-efficient view of reaction JSON (select only the fields you need) |
77
+ | `extract_structures_from_image` | Image to SMILES + confidence scores via DECIMER neural network |
78
+ | `parse_scheme` | CDXML scheme to structured species/steps/topology JSON |
79
+
80
+ ### Analysis
81
+ | Tool | Description |
82
+ |------|-------------|
83
+ | `parse_analysis_file` | LCMS (Waters/manual) or NMR (MestReNova) PDF to structured peak data |
84
+ | `format_lab_entry` | Structured entry dicts to formatted lab book text. Re-reads LCMS PDFs for exact numbers. |
85
+
86
+ ### Office integration
87
+ | Tool | Description |
88
+ |------|-------------|
89
+ | `extract_cdxml_from_office` | Pull embedded ChemDraw OLE objects from PPTX/DOCX |
90
+ | `embed_cdxml_in_office` | Inject CDXML as editable ChemDraw OLE into PPTX/DOCX |
91
+ | `convert_cdx_cdxml` | Bidirectional CDX/CDXML conversion |
92
+ | `search_compound` | Find a molecule across experiment directories by SMILES similarity |
93
+ | `render_to_png` | CDXML to PNG via ChemDraw COM |
94
+
95
+ ## Design principles
96
+
97
+ **Never trust LLM-generated SMILES.** The agent always goes through `resolve_name` to get grounded SMILES from databases. Direct SMILES generation is the #1 source of chemistry hallucination.
98
+
99
+ **Verify every transformation.** `modify_molecule` returns aligned IUPAC name diffs and MCS-based molecular diffs after every edit. The agent can confirm the transformation is correct.
100
+
101
+ **Never flood the agent.** Large outputs (CDXML, JSON) always write to files and return `{ok: true, output_path: "...", size: 23456}`. The agent never gets 30KB of XML in its context window.
102
+
103
+ **Forgiving inputs.** The YAML parser accepts 9+ common LLM mistakes (inline structures, `substrates` as alias for `structures`, text as string not list, bare SMILES, `above_arrow` as list/string). Input parameters accept bare SMILES strings, stringified JSON arrays, and fuzzy operation names.
104
+
105
+ **Actionable errors.** Every error tells the agent what to do instead: "Did you mean: BOC_deprotection?", not "KeyError".
106
+
107
+ **Progressive discovery.** Call any tool with no arguments to get usage examples and schema reference.
108
+
109
+ ## Installation
110
+
111
+ **Prerequisites:** Windows with ChemDraw (ChemOffice 2015+) installed. ChemScript (which comes with ChemOffice) is optional but recommended.
112
+
113
+ ```bash
114
+ # From GitHub — recommended install (includes RDKit, MCP server, ChemDraw COM,
115
+ # Office support, PDF parsing, image processing, and ChemScript bridge)
116
+ pip install "cdxml-toolkit[all] @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
117
+
118
+ # With DECIMER neural image extraction (extract_structures_from_image)
119
+ pip install "cdxml-toolkit[all,decimer] @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
120
+
121
+ # Development (editable install)
122
+ git clone https://github.com/leehiufung911/cdxml-toolkit.git
123
+ cd cdxml-toolkit
124
+ pip install -e ".[dev]"
125
+ ```
126
+
127
+ **Required:** `lxml>=4.6`. **Recommended:** `rdkit>=2023.03` (needed for scheme rendering).
128
+
129
+ ### Extras
130
+
131
+ | Extra | What it includes | Notes |
132
+ |-------|-----------------|-------|
133
+ | `[all]` | RDKit, pywin32, image, Office, YAML, PDF analysis, MCP server, pythonnet, py2opsin | **Use this.** Everything most users need. |
134
+ | `[decimer]` | TensorFlow, DECIMER, PyMuPDF | Neural image-to-SMILES. Adds ~1 GB. |
135
+ | `[full]` | `[all]` + `[decimer]` + `[ocr]` | Everything pip-installable. |
136
+ | `[rdkit]` | RDKit only | Minimal install for scripting. |
137
+ | `[mcp]` | MCP server + PyYAML | MCP server only (no RDKit/Office). |
138
+ | `[dev]` | `[all]` + pytest | For running the test suite. |
139
+
140
+ ### Name resolution tiers
141
+
142
+ `resolve_name` tries 5 tiers in order. The first tier to return a valid SMILES wins:
143
+
144
+ | Tier | Source | Deps | Coverage |
145
+ |------|--------|------|----------|
146
+ | 1 | Curated reagent DB (186 entries) | None | Common reagents, catalysts, solvents |
147
+ | 2 | Condensed formula parser | RDKit | Shorthand like PhB(OH)2, Et3N, CF3 |
148
+ | 3 | **ChemScript** (preferred) | ChemDraw + 32-bit Python | Full IUPAC names, any drawable structure |
149
+ | 4 | **OPSIN** (bundled fallback) | py2opsin + bundled JRE | Systematic IUPAC names, offline |
150
+ | 5 | PubChem | Network | CAS numbers, trade names, everything else |
151
+
152
+ ChemScript (Tier 3) is preferred because it handles the widest range of names and integrates with ChemDraw's structure engine. OPSIN (Tier 4) is a fully offline fallback that works out of the box — a JRE is bundled with the package, no Java install needed. If neither is available, PubChem provides a network-based last resort.
153
+
154
+ ### System dependencies (not pip-installable)
155
+
156
+ | Dependency | Required for | Setup |
157
+ |-----------|-------------|-------|
158
+ | **ChemDraw** (ChemOffice 2015+) | CDX conversion, PNG rendering | Must be **closed** before running COM tools. |
159
+ | **ChemScript .NET** | Name resolution Tier 3 (preferred, not required) | Comes with ChemOffice. See setup below. |
160
+ | **Microsoft Office** | OLE embedding into PPTX/DOCX | Optional. Only needed for `embed_cdxml_in_office`. |
161
+
162
+ ### ChemScript setup (optional but recommended)
163
+
164
+ ChemScript gives the best IUPAC name resolution but requires a 32-bit Python environment because the ChemScript .NET DLL is 32-bit. If you skip this, OPSIN handles IUPAC names as a fallback.
165
+
166
+ ```bash
167
+ # 1. Create 32-bit Python env
168
+ set CONDA_SUBDIR=win-32 && conda create -n chemscript32 python=3.10 -y
169
+
170
+ # 2. Install pythonnet in the 32-bit env
171
+ C:\Users\%USERNAME%\miniconda3\envs\chemscript32\python.exe -m pip install pythonnet
172
+
173
+ # 3. Auto-detect ChemDraw and save config
174
+ cdxml-convert --configure
175
+ ```
176
+
177
+ Step 3 scans for ChemDraw (2015/2016/PerkinElmer paths) and writes `~/.chemscript_config.json`. If your ChemDraw is in a non-standard location, edit the config manually:
178
+
179
+ ```json
180
+ {
181
+ "python32": "C:\\Users\\YOU\\miniconda3\\envs\\chemscript32\\python.exe",
182
+ "dll_dir": "C:\\Program Files (x86)\\PerkinElmerInformatics\\ChemOffice2016\\ChemScript\\Lib\\Net",
183
+ "assembly": "CambridgeSoft.ChemScript16"
184
+ }
185
+ ```
186
+
187
+ ## CLI tools
188
+
189
+ All tools are also available as command-line scripts:
190
+
191
+ | Command | Description |
192
+ |---------|-------------|
193
+ | `cdxml-mcp` | MCP server (primary interface) |
194
+ | `cdxml-parse` | Parse reaction files to JSON |
195
+ | `cdxml-render` | Render JSON/YAML/compact text to CDXML |
196
+ | `cdxml-convert` | CDX/CDXML bidirectional conversion |
197
+ | `cdxml-image` | CDXML to PNG/SVG (ChemDraw COM) |
198
+ | `cdxml-merge` | Merge multiple reaction schemes |
199
+ | `cdxml-layout` | Clean up reaction layout (pure Python) |
200
+ | `cdxml-ole` | Embed CDXML as editable OLE in PPTX/DOCX |
201
+ | `cdxml-lcms` | Parse LCMS PDF reports |
202
+ | `cdxml-nmr` | Extract NMR data from MestReNova PDFs |
203
+ | `cdxml-format-entry` | Format lab book entries |
204
+ | `cdxml-discover` | Discover experiment files in a directory |
205
+
206
+ ## Scheme DSL
207
+
208
+ The renderer accepts three input formats:
209
+
210
+ **YAML** (what agents typically write):
211
+ ```yaml
212
+ layout: sequential
213
+ structures:
214
+   SM:
215
+     smiles: "Brc1ncnc2sccc12"
216
+   Product:
217
+     smiles: "c1nc(N2CCOCC2)c2ccsc2n1"
218
+ steps:
219
+   - substrates: [SM]
220
+     products: [Product]
221
+     above_arrow:
222
+       structures: [Morph]
223
+     below_arrow:
224
+       text: ["Pd2(dba)3", "BINAP", "Cs2CO3", "Dioxane, 105 C"]
225
+ ```
226
+
227
+ **Compact text** ("Mermaid for reactions"):
228
+ ```
229
+ SM: {Brc1ncnc2sccc12}
230
+ SM --> Product{c1nc(N2CCOCC2)c2ccsc2n1}
231
+ above: Morph{C1COCCN1}
232
+ below: "Pd2(dba)3", "BINAP", "Cs2CO3"
233
+ ```
234
+
235
+ **Reaction JSON** (from parse_reaction):
236
+ ```bash
237
+ cdxml-render --from-json reaction.json -o scheme.cdxml
238
+ ```
239
+
240
+ ## Running tests
241
+
242
+ ```bash
243
+ pip install -e ".[dev]"
244
+ pytest tests/ -v
245
+ ```
246
+
247
+ ## License
248
+
249
+ [MIT](LICENSE)
250
+
251
+ ## Attribution
252
+
253
+ See [NOTICE.md](NOTICE.md) for third-party data attribution (ChemScanner, RDKit).
254
+
255
+ ## Author
256
+
257
+ Hiu Fung Kevin Lee ([@leehiufung911](https://github.com/leehiufung911))
@@ -0,0 +1,18 @@
1
+ """cdxml-toolkit: Python toolkit for ChemDraw CDXML reaction scheme processing.
2
+
3
+ Provides tools for reading, writing, manipulating, and rendering ChemDraw CDXML
4
+ files. Includes reaction scheme layout, reagent classification, structure
5
+ alignment, and a declarative DSL for building schemes from YAML or text.
6
+
7
+ Core utilities are available without optional dependencies. RDKit, ChemDraw COM,
8
+ and other heavy dependencies are lazy-imported and only required when their
9
+ specific features are used.
10
+ """
11
+
12
+ __version__ = "0.5.0"
13
+
14
+ # Core utilities — always available (stdlib + lxml only)
15
+ from .constants import ACS_BOND_LENGTH, ACS_CHAIN_ANGLE, ACS_STYLE
16
+ from .cdxml_utils import parse_cdxml, write_cdxml, fragment_bbox
17
+ from .text_formatting import build_formatted_s_xml
18
+ from .resolve.reagent_db import get_reagent_db
@@ -0,0 +1,2 @@
1
+ # Bundled Eclipse Temurin JRE 21 (Windows x64) for OPSIN.
2
+ # License: GPL v2 with Classpath Exception (allows bundling).