cdxml-toolkit 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit-0.5.0/LICENSE +21 -0
- cdxml_toolkit-0.5.0/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0/PKG-INFO +318 -0
- cdxml_toolkit-0.5.0/README.md +257 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit.egg-info/PKG-INFO +318 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit.egg-info/SOURCES.txt +110 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit.egg-info/dependency_links.txt +1 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit.egg-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit.egg-info/requires.txt +50 -0
- cdxml_toolkit-0.5.0/cdxml_toolkit.egg-info/top_level.txt +1 -0
- cdxml_toolkit-0.5.0/pyproject.toml +95 -0
- cdxml_toolkit-0.5.0/setup.cfg +4 -0
- cdxml_toolkit-0.5.0/tests/test_builder.py +122 -0
- cdxml_toolkit-0.5.0/tests/test_cdxml_utils.py +247 -0
- cdxml_toolkit-0.5.0/tests/test_condensed_formula.py +292 -0
- cdxml_toolkit-0.5.0/tests/test_constants.py +208 -0
- cdxml_toolkit-0.5.0/tests/test_merge_yaml.py +599 -0
- cdxml_toolkit-0.5.0/tests/test_mol_builder.py +858 -0
- cdxml_toolkit-0.5.0/tests/test_rdkit_utils.py +367 -0
- cdxml_toolkit-0.5.0/tests/test_reaction_parser.py +958 -0
- cdxml_toolkit-0.5.0/tests/test_reagent_db.py +345 -0
- cdxml_toolkit-0.5.0/tests/test_scheme_maker.py +391 -0
- cdxml_toolkit-0.5.0/tests/test_scheme_reader.py +693 -0
- cdxml_toolkit-0.5.0/tests/test_smoke.py +254 -0
- cdxml_toolkit-0.5.0/tests/test_smoke_extended.py +804 -0
- cdxml_toolkit-0.5.0/tests/test_spatial_assignment.py +608 -0
- cdxml_toolkit-0.5.0/tests/test_superatom_table.py +195 -0
- cdxml_toolkit-0.5.0/tests/test_text_formatting.py +143 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Hiu Fung Kevin Lee
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Third-Party Notices
|
|
2
|
+
|
|
3
|
+
This project includes data and code derived from the following sources.
|
|
4
|
+
|
|
5
|
+
## ChemScanner (data source)
|
|
6
|
+
|
|
7
|
+
The files `chemscanner_abbreviations.json` and `superatom_data.json` contain
|
|
8
|
+
chemical abbreviation-to-SMILES mappings derived from data files in the
|
|
9
|
+
[ChemScanner](https://github.com/ComPlat/chem_scanner) project, which is
|
|
10
|
+
licensed under AGPL-3.0.
|
|
11
|
+
|
|
12
|
+
The original data was extracted from ChemScanner's `abbreviations.yaml`,
|
|
13
|
+
`solvents.yaml`, and `superatom.txt` configuration files. The extraction
|
|
14
|
+
process involved:
|
|
15
|
+
- Parsing the original YAML/TXT files
|
|
16
|
+
- Validating all SMILES strings with RDKit
|
|
17
|
+
- Canonicalizing SMILES to a standard form
|
|
18
|
+
- Deduplicating entries by canonical SMILES
|
|
19
|
+
- Reorganizing into a different JSON schema with alias support
|
|
20
|
+
|
|
21
|
+
The resulting JSON files contain factual chemical information (abbreviation
|
|
22
|
+
labels mapped to their corresponding SMILES representations). Chemical
|
|
23
|
+
abbreviation-to-structure mappings are scientific facts — "OTs" represents
|
|
24
|
+
a tosylate group regardless of where that mapping is documented.
|
|
25
|
+
|
|
26
|
+
## RDKit Built-in Abbreviations (runtime data source)
|
|
27
|
+
|
|
28
|
+
At runtime, `superatom_table.py` supplements its JSON-backed lookup table
|
|
29
|
+
with abbreviation data from RDKit's `rdAbbreviations.GetDefaultAbbreviations()`
|
|
30
|
+
(approximately 40 entries). RDKit is licensed under the
|
|
31
|
+
[BSD 3-Clause License](https://github.com/rdkit/rdkit/blob/master/license.txt).
|
|
32
|
+
|
|
33
|
+
## Reagent Database
|
|
34
|
+
|
|
35
|
+
The file `reagent_abbreviations.json` is an original curated database of
|
|
36
|
+
approximately 172 reagent entries commonly used in medicinal chemistry,
|
|
37
|
+
compiled by the project author. It is licensed under MIT as part of this project.
|
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cdxml-toolkit
|
|
3
|
+
Version: 0.5.0
|
|
4
|
+
Summary: Python toolkit for ChemDraw CDXML reaction scheme processing, layout, and rendering.
|
|
5
|
+
Author: Hiu Fung Kevin Lee
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/leehiufung911/cdxml-toolkit
|
|
8
|
+
Project-URL: Repository, https://github.com/leehiufung911/cdxml-toolkit
|
|
9
|
+
Project-URL: Issues, https://github.com/leehiufung911/cdxml-toolkit/issues
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Chemistry
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
License-File: NOTICE.md
|
|
24
|
+
Requires-Dist: lxml>=4.6
|
|
25
|
+
Provides-Extra: rdkit
|
|
26
|
+
Requires-Dist: rdkit>=2023.03; extra == "rdkit"
|
|
27
|
+
Provides-Extra: chemdraw
|
|
28
|
+
Requires-Dist: pywin32>=300; extra == "chemdraw"
|
|
29
|
+
Provides-Extra: image
|
|
30
|
+
Requires-Dist: opencv-python>=4.5; extra == "image"
|
|
31
|
+
Requires-Dist: Pillow>=9.0; extra == "image"
|
|
32
|
+
Provides-Extra: office
|
|
33
|
+
Requires-Dist: python-pptx>=0.6; extra == "office"
|
|
34
|
+
Requires-Dist: python-docx>=0.8; extra == "office"
|
|
35
|
+
Requires-Dist: olefile>=0.46; extra == "office"
|
|
36
|
+
Provides-Extra: yaml
|
|
37
|
+
Requires-Dist: pyyaml>=5.4; extra == "yaml"
|
|
38
|
+
Provides-Extra: mcp
|
|
39
|
+
Requires-Dist: mcp>=1.0; extra == "mcp"
|
|
40
|
+
Requires-Dist: pyyaml>=5.4; extra == "mcp"
|
|
41
|
+
Provides-Extra: analysis
|
|
42
|
+
Requires-Dist: pdfplumber>=0.7; extra == "analysis"
|
|
43
|
+
Provides-Extra: chemscript
|
|
44
|
+
Requires-Dist: pythonnet>=3.0; extra == "chemscript"
|
|
45
|
+
Provides-Extra: decimer
|
|
46
|
+
Requires-Dist: tensorflow>=2.16; extra == "decimer"
|
|
47
|
+
Requires-Dist: decimer>=2.6; extra == "decimer"
|
|
48
|
+
Requires-Dist: pymupdf>=1.20; extra == "decimer"
|
|
49
|
+
Provides-Extra: ocr
|
|
50
|
+
Requires-Dist: pytesseract>=0.3; extra == "ocr"
|
|
51
|
+
Provides-Extra: opsin
|
|
52
|
+
Requires-Dist: py2opsin>=1.0; extra == "opsin"
|
|
53
|
+
Provides-Extra: all
|
|
54
|
+
Requires-Dist: cdxml-toolkit[analysis,chemdraw,chemscript,image,mcp,office,opsin,rdkit,yaml]; extra == "all"
|
|
55
|
+
Provides-Extra: full
|
|
56
|
+
Requires-Dist: cdxml-toolkit[all,decimer,ocr]; extra == "full"
|
|
57
|
+
Provides-Extra: dev
|
|
58
|
+
Requires-Dist: cdxml-toolkit[all]; extra == "dev"
|
|
59
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
60
|
+
Dynamic: license-file
|
|
61
|
+
|
|
62
|
+
# cdxml-toolkit
|
|
63
|
+
|
|
64
|
+
Chemistry office automation toolkit with an MCP (Model Context Protocol) server. Lets LLM agents draw reaction schemes, parse ELN exports, analyze LCMS data, and produce publication-ready ChemDraw (CDXML) output.
|
|
65
|
+
|
|
66
|
+
The goal: any chemist with a consumer GPU can run a local LLM agent that helps with routine chemistry office tasks. The toolkit provides 15 grounded, validated chemistry tools that LLMs call via MCP — the agent reasons about chemistry while the tools handle SMILES resolution, 2D coordinate generation, and CDXML layout.
|
|
67
|
+
|
|
68
|
+
> Built and tested with Claude Code (Opus 4.6). I directed the design and architecture; Claude did the implementation. I'm a PhD organic chemist, not a programmer — this project wouldn't exist without Claude Code, and I thank Anthropic.
|
|
69
|
+
|
|
70
|
+
## Quick start: MCP server
|
|
71
|
+
|
|
72
|
+
The primary interface is the MCP server. Connect it to any MCP-compatible agent (Claude Desktop, opencode, qwen-agent, etc.) and just chat naturally: "Draw deucravacitinib", "Help me complete my lab book", "Extract structures from this image".
|
|
73
|
+
|
|
74
|
+
### Claude Desktop
|
|
75
|
+
|
|
76
|
+
Edit `%APPDATA%\Claude\claude_desktop_config.json` (Windows) or `~/Library/Application Support/Claude/claude_desktop_config.json` (Mac):
|
|
77
|
+
|
|
78
|
+
```json
|
|
79
|
+
{
|
|
80
|
+
"mcpServers": {
|
|
81
|
+
"cdxml-toolkit": {
|
|
82
|
+
"command": "python",
|
|
83
|
+
"args": ["-m", "cdxml_toolkit.mcp_server"]
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### opencode (for OpenRouter / local models)
|
|
90
|
+
|
|
91
|
+
Create `opencode.json`:
|
|
92
|
+
|
|
93
|
+
```json
|
|
94
|
+
{
|
|
95
|
+
"provider": {
|
|
96
|
+
"openrouter": {
|
|
97
|
+
"models": { "qwen/qwen3.5-27b": {} }
|
|
98
|
+
}
|
|
99
|
+
},
|
|
100
|
+
"mcp": {
|
|
101
|
+
"cdxml-toolkit": {
|
|
102
|
+
"type": "local",
|
|
103
|
+
"command": ["python", "-m", "cdxml_toolkit.mcp_server"],
|
|
104
|
+
"enabled": true,
|
|
105
|
+
"timeout": 120000
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Verify it works
|
|
112
|
+
|
|
113
|
+
```
|
|
114
|
+
> Use cdxml-toolkit. Resolve "aspirin", then draw it.
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
Expected: 2 tool calls (`resolve_name`, then `draw_molecule`) that produce an aspirin CDXML file.
|
|
118
|
+
|
|
119
|
+
## MCP tools (15)
|
|
120
|
+
|
|
121
|
+
### Chemistry resolution
|
|
122
|
+
| Tool | Description |
|
|
123
|
+
|------|-------------|
|
|
124
|
+
| `resolve_name` | Name/abbreviation/CAS/formula to rich molecule JSON (5-tier: reagent DB, condensed formula, ChemScript, OPSIN, PubChem) |
|
|
125
|
+
| `modify_molecule` | 6 operations: analyze, name_surgery, smarts, set_smiles, set_name, reaction. 162 named reaction templates. Returns MCS-based structural diffs. |
|
|
126
|
+
|
|
127
|
+
### Structure rendering
|
|
128
|
+
| Tool | Description |
|
|
129
|
+
|------|-------------|
|
|
130
|
+
| `draw_molecule` | Single molecule to CDXML |
|
|
131
|
+
| `render_scheme` | YAML/compact text/reaction JSON to publication-ready CDXML. Forgiving parser handles common LLM YAML mistakes. |
|
|
132
|
+
|
|
133
|
+
### Perception (reading existing chemistry)
|
|
134
|
+
| Tool | Description |
|
|
135
|
+
|------|-------------|
|
|
136
|
+
| `parse_reaction` | ELN exports (CDXML/CDX/CSV/RXN) to semantic JSON with species, roles, SMILES, equivalents |
|
|
137
|
+
| `summarize_reaction` | Context-efficient view of reaction JSON (select only the fields you need) |
|
|
138
|
+
| `extract_structures_from_image` | Image to SMILES + confidence scores via DECIMER neural network |
|
|
139
|
+
| `parse_scheme` | CDXML scheme to structured species/steps/topology JSON |
|
|
140
|
+
|
|
141
|
+
### Analysis
|
|
142
|
+
| Tool | Description |
|
|
143
|
+
|------|-------------|
|
|
144
|
+
| `parse_analysis_file` | LCMS (Waters/manual) or NMR (MestReNova) PDF to structured peak data |
|
|
145
|
+
| `format_lab_entry` | Structured entry dicts to formatted lab book text. Re-reads LCMS PDFs for exact numbers. |
|
|
146
|
+
|
|
147
|
+
### Office integration
|
|
148
|
+
| Tool | Description |
|
|
149
|
+
|------|-------------|
|
|
150
|
+
| `extract_cdxml_from_office` | Pull embedded ChemDraw OLE objects from PPTX/DOCX |
|
|
151
|
+
| `embed_cdxml_in_office` | Inject CDXML as editable ChemDraw OLE into PPTX/DOCX |
|
|
152
|
+
| `convert_cdx_cdxml` | Bidirectional CDX/CDXML conversion |
|
|
153
|
+
| `search_compound` | Find a molecule across experiment directories by SMILES similarity |
|
|
154
|
+
| `render_to_png` | CDXML to PNG via ChemDraw COM |
|
|
155
|
+
|
|
156
|
+
## Design principles
|
|
157
|
+
|
|
158
|
+
**Never trust LLM-generated SMILES.** The agent always goes through `resolve_name` to get grounded SMILES from databases. Direct SMILES generation is the #1 source of chemistry hallucination.
|
|
159
|
+
|
|
160
|
+
**Verify every transformation.** `modify_molecule` returns aligned IUPAC name diffs and MCS-based molecular diffs after every edit. The agent can confirm the transformation is correct.
|
|
161
|
+
|
|
162
|
+
**Never flood the agent.** Large outputs (CDXML, JSON) always write to files and return `{ok: true, output_path: "...", size: 23456}`. The agent never gets 30KB of XML in its context window.
|
|
163
|
+
|
|
164
|
+
**Forgiving inputs.** The YAML parser accepts 9+ common LLM mistakes (inline structures, `substrates` as alias for `structures`, text as string not list, bare SMILES, `above_arrow` as list/string). Input parameters accept bare SMILES strings, stringified JSON arrays, and fuzzy operation names.
|
|
165
|
+
|
|
166
|
+
**Actionable errors.** Every error tells the agent what to do instead: "Did you mean: BOC_deprotection?", not "KeyError".
|
|
167
|
+
|
|
168
|
+
**Progressive discovery.** Call any tool with no arguments to get usage examples and schema reference.
|
|
169
|
+
|
|
170
|
+
## Installation
|
|
171
|
+
|
|
172
|
+
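**Prerequisites:** Windows with ChemDraw (ChemOffice 2015+) installed. ChemScript is recommended but not required — see "ChemScript setup" below; without it, OPSIN handles IUPAC names as a fallback.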
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
# From GitHub — recommended install (includes RDKit, MCP server, ChemDraw COM,
|
|
176
|
+
# Office support, PDF parsing, image processing, and ChemScript bridge)
|
|
177
|
+
pip install "cdxml-toolkit[all] @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
|
|
178
|
+
|
|
179
|
+
# With DECIMER neural image extraction (extract_structures_from_image)
|
|
180
|
+
pip install "cdxml-toolkit[all,decimer] @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
|
|
181
|
+
|
|
182
|
+
# Development (editable install)
|
|
183
|
+
git clone https://github.com/leehiufung911/cdxml-toolkit.git
|
|
184
|
+
cd cdxml-toolkit
|
|
185
|
+
pip install -e ".[dev]"
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
**Required:** `lxml>=4.6`. **Recommended:** `rdkit>=2023.03` (needed for scheme rendering).
|
|
189
|
+
|
|
190
|
+
### Extras
|
|
191
|
+
|
|
192
|
+
| Extra | What it includes | Notes |
|
|
193
|
+
|-------|-----------------|-------|
|
|
194
|
+
| `[all]` | RDKit, pywin32, image, Office, YAML, PDF analysis, MCP server, pythonnet, py2opsin | **Use this.** Everything most users need. |
|
|
195
|
+
| `[decimer]` | TensorFlow, DECIMER, PyMuPDF | Neural image-to-SMILES. Adds ~1 GB. |
|
|
196
|
+
| `[full]` | `[all]` + `[decimer]` + `[ocr]` | Everything pip-installable. |
|
|
197
|
+
| `[rdkit]` | RDKit only | Minimal install for scripting. |
|
|
198
|
+
| `[mcp]` | MCP server + PyYAML | MCP server only (no RDKit/Office). |
|
|
199
|
+
| `[dev]` | `[all]` + pytest | For running the test suite. |
|
|
200
|
+
|
|
201
|
+
### Name resolution tiers
|
|
202
|
+
|
|
203
|
+
`resolve_name` tries 5 tiers in order. The first tier to return a valid SMILES wins:
|
|
204
|
+
|
|
205
|
+
| Tier | Source | Deps | Coverage |
|
|
206
|
+
|------|--------|------|----------|
|
|
207
|
+
| 1 | Curated reagent DB (186 entries) | None | Common reagents, catalysts, solvents |
|
|
208
|
+
| 2 | Condensed formula parser | RDKit | Shorthand like PhB(OH)2, Et3N, CF3 |
|
|
209
|
+
| 3 | **ChemScript** (preferred) | ChemDraw + 32-bit Python | Full IUPAC names, any drawable structure |
|
|
210
|
+
| 4 | **OPSIN** (bundled fallback) | py2opsin + bundled JRE | Systematic IUPAC names, offline |
|
|
211
|
+
| 5 | PubChem | Network | CAS numbers, trade names, everything else |
|
|
212
|
+
|
|
213
|
+
ChemScript (Tier 3) is preferred because it handles the widest range of names and integrates with ChemDraw's structure engine. OPSIN (Tier 4) is a fully offline fallback that works out of the box — a JRE is bundled with the package, no Java install needed. If neither is available, PubChem provides a network-based last resort.
|
|
214
|
+
|
|
215
|
+
### System dependencies (not pip-installable)
|
|
216
|
+
|
|
217
|
+
| Dependency | Required for | Setup |
|
|
218
|
+
|-----------|-------------|-------|
|
|
219
|
+
| **ChemDraw** (ChemOffice 2015+) | CDX conversion, PNG rendering | Must be **closed** before running COM tools. |
|
|
220
|
+
| **ChemScript .NET** | Name resolution Tier 3 (preferred, not required) | Comes with ChemOffice. See setup below. |
|
|
221
|
+
| **Microsoft Office** | OLE embedding into PPTX/DOCX | Optional. Only needed for `embed_cdxml_in_office`. |
|
|
222
|
+
|
|
223
|
+
### ChemScript setup (optional but recommended)
|
|
224
|
+
|
|
225
|
+
ChemScript gives the best IUPAC name resolution but requires a 32-bit Python environment because the ChemScript .NET DLL is 32-bit. If you skip this, OPSIN handles IUPAC names as a fallback.
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
# 1. Create 32-bit Python env
|
|
229
|
+
set "CONDA_SUBDIR=win-32" && conda create -n chemscript32 python=3.10 -y
|
|
230
|
+
|
|
231
|
+
# 2. Install pythonnet in the 32-bit env
|
|
232
|
+
C:\Users\%USERNAME%\miniconda3\envs\chemscript32\python.exe -m pip install pythonnet
|
|
233
|
+
|
|
234
|
+
# 3. Auto-detect ChemDraw and save config
|
|
235
|
+
cdxml-convert --configure
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
Step 3 scans for ChemDraw (2015/2016/PerkinElmer paths) and writes `~/.chemscript_config.json`. If your ChemDraw is in a non-standard location, edit the config manually:
|
|
239
|
+
|
|
240
|
+
```json
|
|
241
|
+
{
|
|
242
|
+
"python32": "C:\\Users\\YOU\\miniconda3\\envs\\chemscript32\\python.exe",
|
|
243
|
+
"dll_dir": "C:\\Program Files (x86)\\PerkinElmerInformatics\\ChemOffice2016\\ChemScript\\Lib\\Net",
|
|
244
|
+
"assembly": "CambridgeSoft.ChemScript16"
|
|
245
|
+
}
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
## CLI tools
|
|
249
|
+
|
|
250
|
+
All tools are also available as command-line scripts:
|
|
251
|
+
|
|
252
|
+
| Command | Description |
|
|
253
|
+
|---------|-------------|
|
|
254
|
+
| `cdxml-mcp` | MCP server (primary interface) |
|
|
255
|
+
| `cdxml-parse` | Parse reaction files to JSON |
|
|
256
|
+
| `cdxml-render` | Render JSON/YAML/compact text to CDXML |
|
|
257
|
+
| `cdxml-convert` | CDX/CDXML bidirectional conversion |
|
|
258
|
+
| `cdxml-image` | CDXML to PNG/SVG (ChemDraw COM) |
|
|
259
|
+
| `cdxml-merge` | Merge multiple reaction schemes |
|
|
260
|
+
| `cdxml-layout` | Clean up reaction layout (pure Python) |
|
|
261
|
+
| `cdxml-ole` | Embed CDXML as editable OLE in PPTX/DOCX |
|
|
262
|
+
| `cdxml-lcms` | Parse LCMS PDF reports |
|
|
263
|
+
| `cdxml-nmr` | Extract NMR data from MestReNova PDFs |
|
|
264
|
+
| `cdxml-format-entry` | Format lab book entries |
|
|
265
|
+
| `cdxml-discover` | Discover experiment files in a directory |
|
|
266
|
+
|
|
267
|
+
## Scheme DSL
|
|
268
|
+
|
|
269
|
+
The renderer accepts three input formats:
|
|
270
|
+
|
|
271
|
+
**YAML** (what agents typically write):
|
|
272
|
+
```yaml
|
|
273
|
+
layout: sequential
|
|
274
|
+
structures:
|
|
275
|
+
SM:
|
|
276
|
+
smiles: "Brc1ncnc2sccc12"
|
|
277
|
+
Product:
|
|
278
|
+
smiles: "c1nc(N2CCOCC2)c2ccsc2n1"
|
|
279
|
+
steps:
|
|
280
|
+
- substrates: [SM]
|
|
281
|
+
products: [Product]
|
|
282
|
+
above_arrow:
|
|
283
|
+
structures: [Morph]
|
|
284
|
+
below_arrow:
|
|
285
|
+
text: ["Pd2(dba)3", "BINAP", "Cs2CO3", "Dioxane, 105 C"]
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
**Compact text** ("Mermaid for reactions"):
|
|
289
|
+
```
|
|
290
|
+
SM: {Brc1ncnc2sccc12}
|
|
291
|
+
SM --> Product{c1nc(N2CCOCC2)c2ccsc2n1}
|
|
292
|
+
above: Morph{C1COCCN1}
|
|
293
|
+
below: "Pd2(dba)3", "BINAP", "Cs2CO3"
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
**Reaction JSON** (from parse_reaction):
|
|
297
|
+
```bash
|
|
298
|
+
cdxml-render --from-json reaction.json -o scheme.cdxml
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
## Running tests
|
|
302
|
+
|
|
303
|
+
```bash
|
|
304
|
+
pip install -e ".[dev]"
|
|
305
|
+
pytest tests/ -v
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
## License
|
|
309
|
+
|
|
310
|
+
[MIT](LICENSE)
|
|
311
|
+
|
|
312
|
+
## Attribution
|
|
313
|
+
|
|
314
|
+
See [NOTICE.md](NOTICE.md) for third-party data attribution (ChemScanner, RDKit).
|
|
315
|
+
|
|
316
|
+
## Author
|
|
317
|
+
|
|
318
|
+
Hiu Fung Kevin Lee ([@leehiufung911](https://github.com/leehiufung911))
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
# cdxml-toolkit
|
|
2
|
+
|
|
3
|
+
A chemistry office-automation toolkit with an MCP (Model Context Protocol) server. It lets LLM agents draw reaction schemes, parse ELN exports, analyze LCMS data, and produce publication-ready ChemDraw (CDXML) output.
|
|
4
|
+
|
|
5
|
+
The goal: any chemist with a consumer GPU can run a local LLM agent that helps with routine chemistry office tasks. The toolkit provides 15 grounded, validated chemistry tools that LLMs call via MCP — the agent reasons about chemistry while the tools handle SMILES resolution, 2D coordinate generation, and CDXML layout.
|
|
6
|
+
|
|
7
|
+
> Built and tested with Claude Code (Opus 4.6). I directed the design and architecture; Claude did the implementation. I'm a PhD organic chemist, not a programmer — this project wouldn't exist without Claude Code, and I thank Anthropic.
|
|
8
|
+
|
|
9
|
+
## Quick start: MCP server
|
|
10
|
+
|
|
11
|
+
The primary interface is the MCP server. Connect it to any MCP-compatible agent (Claude Desktop, opencode, qwen-agent, etc.) and just chat naturally: "Draw deucravacitinib", "Help me complete my lab book", "Extract structures from this image".
|
|
12
|
+
|
|
13
|
+
### Claude Desktop
|
|
14
|
+
|
|
15
|
+
Edit `%APPDATA%\Claude\claude_desktop_config.json` (Windows) or `~/Library/Application Support/Claude/claude_desktop_config.json` (Mac):
|
|
16
|
+
|
|
17
|
+
```json
|
|
18
|
+
{
|
|
19
|
+
"mcpServers": {
|
|
20
|
+
"cdxml-toolkit": {
|
|
21
|
+
"command": "python",
|
|
22
|
+
"args": ["-m", "cdxml_toolkit.mcp_server"]
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### opencode (for OpenRouter / local models)
|
|
29
|
+
|
|
30
|
+
Create `opencode.json`:
|
|
31
|
+
|
|
32
|
+
```json
|
|
33
|
+
{
|
|
34
|
+
"provider": {
|
|
35
|
+
"openrouter": {
|
|
36
|
+
"models": { "qwen/qwen3.5-27b": {} }
|
|
37
|
+
}
|
|
38
|
+
},
|
|
39
|
+
"mcp": {
|
|
40
|
+
"cdxml-toolkit": {
|
|
41
|
+
"type": "local",
|
|
42
|
+
"command": ["python", "-m", "cdxml_toolkit.mcp_server"],
|
|
43
|
+
"enabled": true,
|
|
44
|
+
"timeout": 120000
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Verify it works
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
> Use cdxml-toolkit. Resolve "aspirin", then draw it.
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Expected: two tool calls (`resolve_name`, then `draw_molecule`), producing an aspirin CDXML file.
|
|
57
|
+
|
|
58
|
+
## MCP tools (15)
|
|
59
|
+
|
|
60
|
+
### Chemistry resolution
|
|
61
|
+
| Tool | Description |
|
|
62
|
+
|------|-------------|
|
|
63
|
+
| `resolve_name` | Name/abbreviation/CAS/formula to rich molecule JSON (5-tier: reagent DB, condensed formula, ChemScript, OPSIN, PubChem) |
|
|
64
|
+
| `modify_molecule` | 6 operations: analyze, name_surgery, smarts, set_smiles, set_name, reaction. 162 named reaction templates. Returns MCS-based structural diffs. |
|
|
65
|
+
|
|
66
|
+
### Structure rendering
|
|
67
|
+
| Tool | Description |
|
|
68
|
+
|------|-------------|
|
|
69
|
+
| `draw_molecule` | Single molecule to CDXML |
|
|
70
|
+
| `render_scheme` | YAML/compact text/reaction JSON to publication-ready CDXML. Forgiving parser handles common LLM YAML mistakes. |
|
|
71
|
+
|
|
72
|
+
### Perception (reading existing chemistry)
|
|
73
|
+
| Tool | Description |
|
|
74
|
+
|------|-------------|
|
|
75
|
+
| `parse_reaction` | ELN exports (CDXML/CDX/CSV/RXN) to semantic JSON with species, roles, SMILES, equivalents |
|
|
76
|
+
| `summarize_reaction` | Context-efficient view of reaction JSON (select only the fields you need) |
|
|
77
|
+
| `extract_structures_from_image` | Image to SMILES + confidence scores via DECIMER neural network |
|
|
78
|
+
| `parse_scheme` | CDXML scheme to structured species/steps/topology JSON |
|
|
79
|
+
|
|
80
|
+
### Analysis
|
|
81
|
+
| Tool | Description |
|
|
82
|
+
|------|-------------|
|
|
83
|
+
| `parse_analysis_file` | LCMS (Waters/manual) or NMR (MestReNova) PDF to structured peak data |
|
|
84
|
+
| `format_lab_entry` | Structured entry dicts to formatted lab book text. Re-reads LCMS PDFs for exact numbers. |
|
|
85
|
+
|
|
86
|
+
### Office integration
|
|
87
|
+
| Tool | Description |
|
|
88
|
+
|------|-------------|
|
|
89
|
+
| `extract_cdxml_from_office` | Pull embedded ChemDraw OLE objects from PPTX/DOCX |
|
|
90
|
+
| `embed_cdxml_in_office` | Inject CDXML as editable ChemDraw OLE into PPTX/DOCX |
|
|
91
|
+
| `convert_cdx_cdxml` | Bidirectional CDX/CDXML conversion |
|
|
92
|
+
| `search_compound` | Find a molecule across experiment directories by SMILES similarity |
|
|
93
|
+
| `render_to_png` | CDXML to PNG via ChemDraw COM |
|
|
94
|
+
|
|
95
|
+
## Design principles
|
|
96
|
+
|
|
97
|
+
**Never trust LLM-generated SMILES.** The agent always goes through `resolve_name` to get grounded SMILES from databases. Direct SMILES generation is the #1 source of chemistry hallucination.
|
|
98
|
+
|
|
99
|
+
**Verify every transformation.** `modify_molecule` returns aligned IUPAC name diffs and MCS-based molecular diffs after every edit. The agent can confirm the transformation is correct.
|
|
100
|
+
|
|
101
|
+
**Never flood the agent.** Large outputs (CDXML, JSON) always write to files and return `{ok: true, output_path: "...", size: 23456}`. The agent never gets 30KB of XML in its context window.
|
|
102
|
+
|
|
103
|
+
**Forgiving inputs.** The YAML parser accepts 9+ common LLM mistakes (inline structures, `substrates` as alias for `structures`, text as string not list, bare SMILES, `above_arrow` as list/string). Input parameters accept bare SMILES strings, stringified JSON arrays, and fuzzy operation names.
|
|
104
|
+
|
|
105
|
+
**Actionable errors.** Every error tells the agent what to do instead: "Did you mean: BOC_deprotection?", not "KeyError".
|
|
106
|
+
|
|
107
|
+
**Progressive discovery.** Call any tool with no arguments to get usage examples and schema reference.
|
|
108
|
+
|
|
109
|
+
## Installation
|
|
110
|
+
|
|
111
|
+
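**Prerequisites:** Windows with ChemDraw (ChemOffice 2015+) installed. ChemScript (bundled with ChemOffice) is optional but recommended; without it, name resolution falls back to OPSIN and PubChem.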
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
# From GitHub — recommended install (includes RDKit, MCP server, ChemDraw COM,
|
|
115
|
+
# Office support, PDF parsing, image processing, and ChemScript bridge)
|
|
116
|
+
pip install "cdxml-toolkit[all] @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
|
|
117
|
+
|
|
118
|
+
# With DECIMER neural image extraction (extract_structures_from_image)
|
|
119
|
+
pip install "cdxml-toolkit[all,decimer] @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
|
|
120
|
+
|
|
121
|
+
# Development (editable install)
|
|
122
|
+
git clone https://github.com/leehiufung911/cdxml-toolkit.git
|
|
123
|
+
cd cdxml-toolkit
|
|
124
|
+
pip install -e ".[dev]"
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
**Required:** `lxml>=4.6`. **Recommended:** `rdkit>=2023.03` (needed for scheme rendering).
|
|
128
|
+
|
|
129
|
+
### Extras
|
|
130
|
+
|
|
131
|
+
| Extra | What it includes | Notes |
|
|
132
|
+
|-------|-----------------|-------|
|
|
133
|
+
| `[all]` | RDKit, pywin32, image, Office, YAML, PDF analysis, MCP server, pythonnet, py2opsin | **Use this.** Everything most users need. |
|
|
134
|
+
| `[decimer]` | TensorFlow, DECIMER, PyMuPDF | Neural image-to-SMILES. Adds ~1 GB. |
|
|
135
|
+
| `[full]` | `[all]` + `[decimer]` + `[ocr]` | Everything pip-installable. |
|
|
136
|
+
| `[rdkit]` | RDKit only | Minimal install for scripting. |
|
|
137
|
+
| `[mcp]` | MCP server + PyYAML | MCP server only (no RDKit/Office). |
|
|
138
|
+
| `[dev]` | `[all]` + pytest | For running the test suite. |
|
|
139
|
+
|
|
140
|
+
### Name resolution tiers
|
|
141
|
+
|
|
142
|
+
`resolve_name` tries 5 tiers in order. The first tier to return a valid SMILES wins:
|
|
143
|
+
|
|
144
|
+
| Tier | Source | Deps | Coverage |
|
|
145
|
+
|------|--------|------|----------|
|
|
146
|
+
| 1 | Curated reagent DB (186 entries) | None | Common reagents, catalysts, solvents |
|
|
147
|
+
| 2 | Condensed formula parser | RDKit | Shorthand like PhB(OH)2, Et3N, CF3 |
|
|
148
|
+
| 3 | **ChemScript** (preferred) | ChemDraw + 32-bit Python | Full IUPAC names, any drawable structure |
|
|
149
|
+
| 4 | **OPSIN** (bundled fallback) | py2opsin + bundled JRE | Systematic IUPAC names, offline |
|
|
150
|
+
| 5 | PubChem | Network | CAS numbers, trade names, everything else |
|
|
151
|
+
|
|
152
|
+
ChemScript (Tier 3) is preferred because it handles the widest range of names and integrates with ChemDraw's structure engine. OPSIN (Tier 4) is a fully offline fallback that works out of the box — a JRE is bundled with the package, no Java install needed. If neither is available, PubChem provides a network-based last resort.
|
|
153
|
+
|
|
154
|
+
### System dependencies (not pip-installable)
|
|
155
|
+
|
|
156
|
+
| Dependency | Required for | Setup |
|
|
157
|
+
|-----------|-------------|-------|
|
|
158
|
+
| **ChemDraw** (ChemOffice 2015+) | CDX conversion, PNG rendering | Must be **closed** before running COM tools. |
|
|
159
|
+
| **ChemScript .NET** | Name resolution Tier 3 (preferred, not required) | Comes with ChemOffice. See setup below. |
|
|
160
|
+
| **Microsoft Office** | OLE embedding into PPTX/DOCX | Optional. Only needed for `embed_cdxml_in_office`. |
|
|
161
|
+
|
|
162
|
+
### ChemScript setup (optional but recommended)
|
|
163
|
+
|
|
164
|
+
ChemScript gives the best IUPAC name resolution but requires a 32-bit Python environment because the ChemScript .NET DLL is 32-bit. If you skip this, OPSIN handles IUPAC names as a fallback.
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
# 1. Create 32-bit Python env
|
|
168
|
+
set "CONDA_SUBDIR=win-32" && conda create -n chemscript32 python=3.10 -y
|
|
169
|
+
|
|
170
|
+
# 2. Install pythonnet in the 32-bit env
|
|
171
|
+
C:\Users\%USERNAME%\miniconda3\envs\chemscript32\python.exe -m pip install pythonnet
|
|
172
|
+
|
|
173
|
+
# 3. Auto-detect ChemDraw and save config
|
|
174
|
+
cdxml-convert --configure
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
Step 3 scans for ChemDraw (2015/2016/PerkinElmer paths) and writes `~/.chemscript_config.json`. If your ChemDraw is in a non-standard location, edit the config manually:
|
|
178
|
+
|
|
179
|
+
```json
|
|
180
|
+
{
|
|
181
|
+
"python32": "C:\\Users\\YOU\\miniconda3\\envs\\chemscript32\\python.exe",
|
|
182
|
+
"dll_dir": "C:\\Program Files (x86)\\PerkinElmerInformatics\\ChemOffice2016\\ChemScript\\Lib\\Net",
|
|
183
|
+
"assembly": "CambridgeSoft.ChemScript16"
|
|
184
|
+
}
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## CLI tools
|
|
188
|
+
|
|
189
|
+
All tools are also available as command-line scripts:
|
|
190
|
+
|
|
191
|
+
| Command | Description |
|
|
192
|
+
|---------|-------------|
|
|
193
|
+
| `cdxml-mcp` | MCP server (primary interface) |
|
|
194
|
+
| `cdxml-parse` | Parse reaction files to JSON |
|
|
195
|
+
| `cdxml-render` | Render JSON/YAML/compact text to CDXML |
|
|
196
|
+
| `cdxml-convert` | CDX/CDXML bidirectional conversion |
|
|
197
|
+
| `cdxml-image` | CDXML to PNG/SVG (ChemDraw COM) |
|
|
198
|
+
| `cdxml-merge` | Merge multiple reaction schemes |
|
|
199
|
+
| `cdxml-layout` | Clean up reaction layout (pure Python) |
|
|
200
|
+
| `cdxml-ole` | Embed CDXML as editable OLE in PPTX/DOCX |
|
|
201
|
+
| `cdxml-lcms` | Parse LCMS PDF reports |
|
|
202
|
+
| `cdxml-nmr` | Extract NMR data from MestReNova PDFs |
|
|
203
|
+
| `cdxml-format-entry` | Format lab book entries |
|
|
204
|
+
| `cdxml-discover` | Discover experiment files in a directory |
|
|
205
|
+
|
|
206
|
+
## Scheme DSL
|
|
207
|
+
|
|
208
|
+
The renderer accepts three input formats:
|
|
209
|
+
|
|
210
|
+
**YAML** (what agents typically write):
|
|
211
|
+
```yaml
|
|
212
|
+
layout: sequential
|
|
213
|
+
structures:
|
|
214
|
+
SM:
|
|
215
|
+
smiles: "Brc1ncnc2sccc12"
|
|
216
|
+
Product:
|
|
217
|
+
smiles: "c1nc(N2CCOCC2)c2ccsc2n1"
|
|
218
|
+
steps:
|
|
219
|
+
- substrates: [SM]
|
|
220
|
+
products: [Product]
|
|
221
|
+
above_arrow:
|
|
222
|
+
structures: [Morph]
|
|
223
|
+
below_arrow:
|
|
224
|
+
text: ["Pd2(dba)3", "BINAP", "Cs2CO3", "Dioxane, 105 C"]
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
**Compact text** ("Mermaid for reactions"):
|
|
228
|
+
```
|
|
229
|
+
SM: {Brc1ncnc2sccc12}
|
|
230
|
+
SM --> Product{c1nc(N2CCOCC2)c2ccsc2n1}
|
|
231
|
+
above: Morph{C1COCCN1}
|
|
232
|
+
below: "Pd2(dba)3", "BINAP", "Cs2CO3"
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
**Reaction JSON** (from parse_reaction):
|
|
236
|
+
```bash
|
|
237
|
+
cdxml-render --from-json reaction.json -o scheme.cdxml
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
## Running tests
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
pip install -e ".[dev]"
|
|
244
|
+
pytest tests/ -v
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
## License
|
|
248
|
+
|
|
249
|
+
[MIT](LICENSE)
|
|
250
|
+
|
|
251
|
+
## Attribution
|
|
252
|
+
|
|
253
|
+
See [NOTICE.md](NOTICE.md) for third-party data attribution (ChemScanner, RDKit).
|
|
254
|
+
|
|
255
|
+
## Author
|
|
256
|
+
|
|
257
|
+
Hiu Fung Kevin Lee ([@leehiufung911](https://github.com/leehiufung911))
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""cdxml-toolkit: Python toolkit for ChemDraw CDXML reaction scheme processing.
|
|
2
|
+
|
|
3
|
+
Provides tools for reading, writing, manipulating, and rendering ChemDraw CDXML
|
|
4
|
+
files. Includes reaction scheme layout, reagent classification, structure
|
|
5
|
+
alignment, and a declarative DSL for building schemes from YAML or text.
|
|
6
|
+
|
|
7
|
+
Core utilities are available without optional dependencies. RDKit, ChemDraw COM,
|
|
8
|
+
and other heavy dependencies are lazy-imported and only required when their
|
|
9
|
+
specific features are used.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
__version__ = "0.5.0"
|
|
13
|
+
|
|
14
|
+
# Core utilities — always available (stdlib + lxml only)
|
|
15
|
+
from .constants import ACS_BOND_LENGTH, ACS_CHAIN_ANGLE, ACS_STYLE
|
|
16
|
+
from .cdxml_utils import parse_cdxml, write_cdxml, fragment_bbox
|
|
17
|
+
from .text_formatting import build_formatted_s_xml
|
|
18
|
+
from .resolve.reagent_db import get_reagent_db
|
|
Binary file
|