cdxml-toolkit 0.5.0__tar.gz → 0.5.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cdxml_toolkit-0.5.0/cdxml_toolkit.egg-info → cdxml_toolkit-0.5.2}/PKG-INFO +25 -52
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/README.md +5 -17
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/mcp_server/server.py +13 -9
- cdxml_toolkit-0.5.2/cdxml_toolkit/office/ole_extractor.py +374 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2/cdxml_toolkit.egg-info}/PKG-INFO +25 -52
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit.egg-info/requires.txt +13 -41
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/pyproject.toml +18 -27
- cdxml_toolkit-0.5.0/cdxml_toolkit/office/ole_extractor.py +0 -272
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/LICENSE +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/NOTICE.md +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/__init__.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/_jre/__init__.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/__init__.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/deterministic/__init__.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/deterministic/lcms_identifier.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/deterministic/mass_resolver.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/deterministic/procedure_writer.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/extract_nmr.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/format_procedure_entry.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/lcms_analyzer.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/parse_analysis_file.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/cdxml_builder.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/cdxml_utils.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/chemdraw/__init__.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/chemdraw/_chemscript_server.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/chemdraw/cdx_converter.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/chemdraw/cdxml_to_image.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/chemdraw/chemscript_bridge.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/constants.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/coord_normalizer.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/__init__.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/image/__init__.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/image/reaction_from_image.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/image/structure_from_image.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/layout/__init__.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/layout/alignment.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/layout/reaction_cleanup.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/layout/scheme_merger.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/mcp_server/__init__.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/mcp_server/__main__.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/naming/__init__.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/naming/aligned_namer.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/naming/mol_builder.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/naming/name_decomposer.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/naming/reactions_datamol.json +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/office/__init__.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/office/doc_from_template.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/office/ole_embedder.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/__init__.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/compound_search.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/eln_csv_parser.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/rdf_parser.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/reactant_heuristic.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/reaction_parser.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/scheme_reader.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/scheme_refine.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/scheme_segmenter.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/spatial_assignment.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/rdkit_utils.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/__init__.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/auto_layout.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/compact_parser.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/parser.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/render_scheme.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/renderer.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/schema.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/scheme_maker.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/scheme_yaml_writer.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/__init__.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/cas_resolver.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/chemscanner_abbreviations.json +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/condensed_formula.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/jre_manager.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/reagent_abbreviations.json +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/reagent_db.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/superatom_data.json +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/superatom_table.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/text_formatting.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit.egg-info/SOURCES.txt +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit.egg-info/dependency_links.txt +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit.egg-info/entry_points.txt +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit.egg-info/top_level.txt +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/setup.cfg +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_builder.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_cdxml_utils.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_condensed_formula.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_constants.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_merge_yaml.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_mol_builder.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_rdkit_utils.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_reaction_parser.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_reagent_db.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_scheme_maker.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_scheme_reader.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_smoke.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_smoke_extended.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_spatial_assignment.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_superatom_table.py +0 -0
- {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_text_formatting.py +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cdxml-toolkit
|
|
3
|
-
Version: 0.5.0
|
|
4
|
-
Summary: Python toolkit for
|
|
3
|
+
Version: 0.5.2
|
|
4
|
+
Summary: MCP server and Python toolkit for perception, rendering, and analysis of molecules and reaction schemes in ChemDraw CDXML.
|
|
5
5
|
Author: Hiu Fung Kevin Lee
|
|
6
6
|
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/leehiufung911/cdxml-toolkit
|
|
@@ -22,40 +22,25 @@ Description-Content-Type: text/markdown
|
|
|
22
22
|
License-File: LICENSE
|
|
23
23
|
License-File: NOTICE.md
|
|
24
24
|
Requires-Dist: lxml>=4.6
|
|
25
|
-
|
|
26
|
-
Requires-Dist:
|
|
27
|
-
|
|
28
|
-
Requires-Dist:
|
|
29
|
-
|
|
30
|
-
Requires-Dist:
|
|
31
|
-
Requires-Dist:
|
|
32
|
-
|
|
33
|
-
Requires-Dist:
|
|
34
|
-
Requires-Dist:
|
|
35
|
-
Requires-Dist:
|
|
36
|
-
|
|
37
|
-
Requires-Dist:
|
|
38
|
-
|
|
39
|
-
Requires-Dist:
|
|
40
|
-
Requires-Dist:
|
|
41
|
-
|
|
42
|
-
Requires-Dist: pdfplumber>=0.7; extra == "analysis"
|
|
43
|
-
Provides-Extra: chemscript
|
|
44
|
-
Requires-Dist: pythonnet>=3.0; extra == "chemscript"
|
|
45
|
-
Provides-Extra: decimer
|
|
46
|
-
Requires-Dist: tensorflow>=2.16; extra == "decimer"
|
|
47
|
-
Requires-Dist: decimer>=2.6; extra == "decimer"
|
|
48
|
-
Requires-Dist: pymupdf>=1.20; extra == "decimer"
|
|
49
|
-
Provides-Extra: ocr
|
|
50
|
-
Requires-Dist: pytesseract>=0.3; extra == "ocr"
|
|
51
|
-
Provides-Extra: opsin
|
|
52
|
-
Requires-Dist: py2opsin>=1.0; extra == "opsin"
|
|
53
|
-
Provides-Extra: all
|
|
54
|
-
Requires-Dist: cdxml-toolkit[analysis,chemdraw,chemscript,image,mcp,office,opsin,rdkit,yaml]; extra == "all"
|
|
55
|
-
Provides-Extra: full
|
|
56
|
-
Requires-Dist: cdxml-toolkit[all,decimer,ocr]; extra == "full"
|
|
25
|
+
Requires-Dist: rdkit>=2023.03
|
|
26
|
+
Requires-Dist: pywin32>=300
|
|
27
|
+
Requires-Dist: opencv-python>=4.5
|
|
28
|
+
Requires-Dist: Pillow>=9.0
|
|
29
|
+
Requires-Dist: python-pptx>=0.6
|
|
30
|
+
Requires-Dist: python-docx>=0.8
|
|
31
|
+
Requires-Dist: olefile>=0.46
|
|
32
|
+
Requires-Dist: pyyaml>=5.4
|
|
33
|
+
Requires-Dist: mcp>=1.0
|
|
34
|
+
Requires-Dist: pdfplumber>=0.7
|
|
35
|
+
Requires-Dist: pythonnet>=3.0
|
|
36
|
+
Requires-Dist: tensorflow>=2.16
|
|
37
|
+
Requires-Dist: decimer>=2.6
|
|
38
|
+
Requires-Dist: pymupdf>=1.20
|
|
39
|
+
Requires-Dist: pytesseract>=0.3
|
|
40
|
+
Requires-Dist: py2opsin>=1.0
|
|
41
|
+
Requires-Dist: pytest>=7.0
|
|
57
42
|
Provides-Extra: dev
|
|
58
|
-
Requires-Dist: cdxml-toolkit
|
|
43
|
+
Requires-Dist: cdxml-toolkit; extra == "dev"
|
|
59
44
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
60
45
|
Dynamic: license-file
|
|
61
46
|
|
|
@@ -172,12 +157,11 @@ Expected: 2 tool calls (resolve_name, draw_molecule), produces an aspirin CDXML
|
|
|
172
157
|
**Prerequisites:** Windows with ChemDraw (ChemOffice 2015+) and ChemScript installed.
|
|
173
158
|
|
|
174
159
|
```bash
|
|
175
|
-
# From
|
|
176
|
-
|
|
177
|
-
pip install "cdxml-toolkit[all] @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
|
|
160
|
+
# From PyPI (recommended)
|
|
161
|
+
pip install cdxml-toolkit
|
|
178
162
|
|
|
179
|
-
#
|
|
180
|
-
pip install "cdxml-toolkit
|
|
163
|
+
# From GitHub (latest development version)
|
|
164
|
+
pip install "cdxml-toolkit @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
|
|
181
165
|
|
|
182
166
|
# Development (editable install)
|
|
183
167
|
git clone https://github.com/leehiufung911/cdxml-toolkit.git
|
|
@@ -185,18 +169,7 @@ cd cdxml-toolkit
|
|
|
185
169
|
pip install -e ".[dev]"
|
|
186
170
|
```
|
|
187
171
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
### Extras
|
|
191
|
-
|
|
192
|
-
| Extra | What it includes | Notes |
|
|
193
|
-
|-------|-----------------|-------|
|
|
194
|
-
| `[all]` | RDKit, pywin32, image, Office, YAML, PDF analysis, MCP server, pythonnet, py2opsin | **Use this.** Everything most users need. |
|
|
195
|
-
| `[decimer]` | TensorFlow, DECIMER, PyMuPDF | Neural image-to-SMILES. Adds ~1 GB. |
|
|
196
|
-
| `[full]` | `[all]` + `[decimer]` + `[ocr]` | Everything pip-installable. |
|
|
197
|
-
| `[rdkit]` | RDKit only | Minimal install for scripting. |
|
|
198
|
-
| `[mcp]` | MCP server + PyYAML | MCP server only (no RDKit/Office). |
|
|
199
|
-
| `[dev]` | `[all]` + pytest | For running the test suite. |
|
|
172
|
+
Everything is included by default: RDKit, MCP server, ChemDraw COM, Office support, PDF analysis, image processing, ChemScript bridge, DECIMER neural image extraction, OPSIN, and OCR.
|
|
200
173
|
|
|
201
174
|
### Name resolution tiers
|
|
202
175
|
|
|
@@ -111,12 +111,11 @@ Expected: 2 tool calls (resolve_name, draw_molecule), produces an aspirin CDXML
|
|
|
111
111
|
**Prerequisites:** Windows with ChemDraw (ChemOffice 2015+) and ChemScript installed.
|
|
112
112
|
|
|
113
113
|
```bash
|
|
114
|
-
# From
|
|
115
|
-
|
|
116
|
-
pip install "cdxml-toolkit[all] @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
|
|
114
|
+
# From PyPI (recommended)
|
|
115
|
+
pip install cdxml-toolkit
|
|
117
116
|
|
|
118
|
-
#
|
|
119
|
-
pip install "cdxml-toolkit
|
|
117
|
+
# From GitHub (latest development version)
|
|
118
|
+
pip install "cdxml-toolkit @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
|
|
120
119
|
|
|
121
120
|
# Development (editable install)
|
|
122
121
|
git clone https://github.com/leehiufung911/cdxml-toolkit.git
|
|
@@ -124,18 +123,7 @@ cd cdxml-toolkit
|
|
|
124
123
|
pip install -e ".[dev]"
|
|
125
124
|
```
|
|
126
125
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
### Extras
|
|
130
|
-
|
|
131
|
-
| Extra | What it includes | Notes |
|
|
132
|
-
|-------|-----------------|-------|
|
|
133
|
-
| `[all]` | RDKit, pywin32, image, Office, YAML, PDF analysis, MCP server, pythonnet, py2opsin | **Use this.** Everything most users need. |
|
|
134
|
-
| `[decimer]` | TensorFlow, DECIMER, PyMuPDF | Neural image-to-SMILES. Adds ~1 GB. |
|
|
135
|
-
| `[full]` | `[all]` + `[decimer]` + `[ocr]` | Everything pip-installable. |
|
|
136
|
-
| `[rdkit]` | RDKit only | Minimal install for scripting. |
|
|
137
|
-
| `[mcp]` | MCP server + PyYAML | MCP server only (no RDKit/Office). |
|
|
138
|
-
| `[dev]` | `[all]` + pytest | For running the test suite. |
|
|
126
|
+
Everything is included by default: RDKit, MCP server, ChemDraw COM, Office support, PDF analysis, image processing, ChemScript bridge, DECIMER neural image extraction, OPSIN, and OCR.
|
|
139
127
|
|
|
140
128
|
### Name resolution tiers
|
|
141
129
|
|
|
@@ -12,7 +12,7 @@ Tools:
|
|
|
12
12
|
convert_cdx_cdxml — Bidirectional CDX ↔ CDXML file conversion
|
|
13
13
|
parse_analysis_file — LCMS/NMR PDF → peaks and data
|
|
14
14
|
format_lab_entry — Entry dicts → formatted lab book text
|
|
15
|
-
extract_cdxml_from_office — PPTX/DOCX → embedded CDXML files
|
|
15
|
+
extract_cdxml_from_office — PPTX/DOCX/XLS/XLSX → embedded CDXML files
|
|
16
16
|
embed_cdxml_in_office — CDXML → editable OLE object in PPTX/DOCX
|
|
17
17
|
search_compound — SMILES → exact/similar matches across experiments
|
|
18
18
|
render_to_png — CDXML → PNG via ChemDraw COM
|
|
@@ -1140,16 +1140,17 @@ def extract_cdxml_from_office(
|
|
|
1140
1140
|
file_path: str,
|
|
1141
1141
|
output_dir: Optional[str] = None,
|
|
1142
1142
|
) -> dict:
|
|
1143
|
-
"""Extract embedded ChemDraw objects from a PPTX or
|
|
1143
|
+
"""Extract embedded ChemDraw objects from a PPTX, DOCX, XLS, or XLSX file.
|
|
1144
1144
|
|
|
1145
|
-
Office files (PPTX/DOCX) are ZIP archives that may contain ChemDraw OLE
|
|
1146
|
-
objects as binary blobs.
|
|
1147
|
-
|
|
1145
|
+
Office files (PPTX/DOCX/XLSX) are ZIP archives that may contain ChemDraw OLE
|
|
1146
|
+
objects as binary blobs. XLS files are OLE2 compound documents with embedded
|
|
1147
|
+
ChemDraw objects stored in MBD* sub-storages. This tool extracts every
|
|
1148
|
+
ChemDraw object, converts it to CDXML, and writes the files to output_dir.
|
|
1148
1149
|
|
|
1149
1150
|
Requires: olefile. CDX→CDXML conversion uses available backends.
|
|
1150
1151
|
|
|
1151
1152
|
Args:
|
|
1152
|
-
file_path: Path to a .pptx or .
|
|
1153
|
+
file_path: Path to a .pptx, .docx, .xlsx, or .xls file.
|
|
1153
1154
|
output_dir: Directory for extracted CDXML files. Default: a folder
|
|
1154
1155
|
named "<basename>_chemdraw/" next to the input file.
|
|
1155
1156
|
|
|
@@ -1161,10 +1162,12 @@ def extract_cdxml_from_office(
|
|
|
1161
1162
|
if not file_path or not file_path.strip():
|
|
1162
1163
|
return (
|
|
1163
1164
|
"Usage: extract_cdxml_from_office(file_path='document.pptx', output_dir='out/')\n"
|
|
1164
|
-
"Extracts embedded ChemDraw objects from PPTX or
|
|
1165
|
+
"Extracts embedded ChemDraw objects from PPTX, DOCX, XLS, or XLSX files.\n"
|
|
1165
1166
|
"Examples:\n"
|
|
1166
1167
|
" extract_cdxml_from_office(file_path='presentation.pptx')\n"
|
|
1167
1168
|
" extract_cdxml_from_office(file_path='report.docx', output_dir='extracted/')\n"
|
|
1169
|
+
" extract_cdxml_from_office(file_path='labbook.xls')\n"
|
|
1170
|
+
" extract_cdxml_from_office(file_path='labbook.xlsx')\n"
|
|
1168
1171
|
"Returns: {ok, count, objects:[{cdxml_output, source_path}]}"
|
|
1169
1172
|
)
|
|
1170
1173
|
|
|
@@ -1183,8 +1186,9 @@ def extract_cdxml_from_office(
|
|
|
1183
1186
|
"ok": False,
|
|
1184
1187
|
"error": (
|
|
1185
1188
|
f"Extraction failed: {e}. "
|
|
1186
|
-
"Ensure the file is a valid .pptx or .
|
|
1187
|
-
"(pip install olefile). The file must contain embedded
|
|
1189
|
+
"Ensure the file is a valid .pptx, .docx, .xlsx, or .xls and that olefile "
|
|
1190
|
+
"is installed (pip install olefile). The file must contain embedded "
|
|
1191
|
+
"ChemDraw OLE objects."
|
|
1188
1192
|
),
|
|
1189
1193
|
"input": str(p),
|
|
1190
1194
|
}
|
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
OLE Extractor — Extract embedded ChemDraw objects from Office files.
|
|
4
|
+
|
|
5
|
+
Supports four Office formats:
|
|
6
|
+
- PPTX / DOCX / XLSX — ZIP archives containing OLE blobs in known paths.
|
|
7
|
+
- XLS — OLE2 compound documents with MBD* sub-storages.
|
|
8
|
+
|
|
9
|
+
ChemDraw objects are stored as CDX data inside the OLE "CONTENTS" stream.
|
|
10
|
+
This tool extracts and optionally converts them to CDXML.
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
python ole_extractor.py input.pptx [-o output_dir/] [--format cdxml|cdx|both]
|
|
14
|
+
python ole_extractor.py input.docx [-o output_dir/] [--format cdxml|cdx|both]
|
|
15
|
+
python ole_extractor.py input.xlsx [-o output_dir/] [--format cdxml|cdx|both]
|
|
16
|
+
python ole_extractor.py input.xls [-o output_dir/] [--format cdxml|cdx|both]
|
|
17
|
+
|
|
18
|
+
Requires: olefile, cdx_converter (for CDXML conversion)
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import argparse
|
|
22
|
+
import io
|
|
23
|
+
import os
|
|
24
|
+
import sys
|
|
25
|
+
import zipfile
|
|
26
|
+
from dataclasses import dataclass, field
|
|
27
|
+
from typing import List, Optional
|
|
28
|
+
|
|
29
|
+
import olefile
|
|
30
|
+
|
|
31
|
+
# CLSIDs that mark an OLE container as a ChemDraw object. Only the
# "ChemDraw Drawing" class is listed here.
CHEMDRAW_CLSIDS = {
    "41BA6D21-A02E-11CE-8FD9-0020AFD1F20C",  # ChemDraw Drawing
}

# Magic bytes expected at the very start of binary CDX data.
CDX_MAGIC = b"VjCD"

# ZIP-based Office formats: file extension -> folder inside the archive
# where embedded OLE blobs are stored.
EMBEDDING_PATTERNS = {
    ".pptx": "ppt/embeddings/",
    ".docx": "word/embeddings/",
    ".xlsx": "xl/embeddings/",
}

# Extensions of OLE2-native (non-ZIP) formats; these are handled by a
# separate extraction path.
OLE2_FORMATS = {".xls"}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class ExtractedObject:
    """Result record for one ChemDraw object pulled out of an Office file.

    ``source_path`` and ``cdx_data`` are required; the remaining fields
    default to ``None`` and are filled in as output files are written.
    """

    # Location of the object inside the source document, e.g.
    # "ppt/embeddings/oleObject1.bin" for a ZIP-based file.
    source_path: str
    # Raw CDX bytes of the embedded object.
    cdx_data: bytes
    # Path of the .cdx file that was written, if any.
    cdx_output: Optional[str] = None
    # Path of the .cdxml file that was written, if any.
    cdxml_output: Optional[str] = None
    # Note describing a failed or unavailable conversion, if any.
    error: Optional[str] = None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def find_ole_entries(zip_path: str) -> List[str]:
    """Return the archive member names of OLE embeddings in an Office ZIP.

    The embedding folder is looked up in ``EMBEDDING_PATTERNS`` using the
    lower-cased extension of *zip_path*. Members under that folder whose
    name ends in ``.bin`` (compared case-insensitively) are returned in
    archive order.

    Args:
        zip_path: Path to a .pptx, .docx, or .xlsx file.

    Returns:
        List of matching member names inside the archive.

    Raises:
        ValueError: If the extension has no entry in ``EMBEDDING_PATTERNS``.
    """
    suffix = os.path.splitext(zip_path)[1].lower()
    folder = EMBEDDING_PATTERNS.get(suffix)
    if folder is None:
        raise ValueError(
            f"Unsupported ZIP-based file type: {suffix}. "
            f"Use .pptx, .docx, or .xlsx."
        )

    matches = []
    with zipfile.ZipFile(zip_path, "r") as archive:
        for member in archive.namelist():
            if member.startswith(folder) and member.lower().endswith(".bin"):
                matches.append(member)
    return matches
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def is_chemdraw_ole(ole: olefile.OleFileIO) -> bool:
    """Report whether an open OLE container looks like a ChemDraw object.

    Two checks are tried in order:

    1. The root entry's CLSID, upper-cased, is listed in ``CHEMDRAW_CLSIDS``.
    2. A ``CONTENTS`` stream exists and its first four bytes equal
       ``CDX_MAGIC``.

    Args:
        ole: An already opened ``olefile.OleFileIO`` instance.

    Returns:
        True if either check succeeds, otherwise False.
    """
    root_clsid = ole.root.clsid
    if root_clsid and root_clsid.upper() in CHEMDRAW_CLSIDS:
        return True

    # No CLSID match: fall back to sniffing the CONTENTS stream header.
    return (
        ole.exists("CONTENTS")
        and ole.openstream("CONTENTS").read(4) == CDX_MAGIC
    )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def extract_cdx_from_ole(ole_data: bytes) -> Optional[bytes]:
    """Pull the raw CDX payload out of an in-memory OLE compound document.

    Args:
        ole_data: Complete bytes of one OLE blob.

    Returns:
        The CDX bytes, or None when the blob is not an OLE file, is not
        recognised by ``is_chemdraw_ole``, or holds no stream that starts
        with ``CDX_MAGIC``.
    """
    if not olefile.isOleFile(io.BytesIO(ole_data)):
        return None

    # The context manager closes the container on every exit path.
    with olefile.OleFileIO(io.BytesIO(ole_data)) as container:
        if not is_chemdraw_ole(container):
            return None

        # Preferred location: the CONTENTS stream holds the CDX data as-is.
        if container.exists("CONTENTS"):
            payload = container.openstream("CONTENTS").read()
            if payload[:4] == CDX_MAGIC:
                return payload

        # Fallback location: the \x01Ole10Native stream, where the first
        # four bytes (a length prefix) precede the CDX data.
        native_stream = "\x01Ole10Native"
        if container.exists(native_stream):
            raw = container.openstream(native_stream).read()
            if len(raw) > 4 and raw[4:8] == CDX_MAGIC:
                return raw[4:]

        return None
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _get_converter(output_format: str):
    """Return the ``cdx_converter`` module when CDXML output is wanted.

    The import is deferred so that plain CDX extraction works without the
    converter being importable.

    Args:
        output_format: "cdx", "cdxml", or "both".

    Returns:
        The ``cdx_converter`` module, or None when CDXML output was not
        requested or the import fails (a warning goes to stderr in the
        latter case).
    """
    wants_cdxml = output_format == "cdxml" or output_format == "both"
    if not wants_cdxml:
        return None

    try:
        from ..chemdraw import cdx_converter
    except ImportError:
        print(
            "Warning: cdx_converter not found. CDX files will be saved "
            "but CDXML conversion is unavailable.",
            file=sys.stderr,
        )
        return None
    return cdx_converter
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _write_cdx_file(cdx_data: bytes, output_dir: str, entry_name: str) -> str:
    """Write *cdx_data* to ``<output_dir>/<entry_name>.cdx`` and return the path."""
    cdx_path = os.path.join(output_dir, f"{entry_name}.cdx")
    with open(cdx_path, "wb") as f:
        f.write(cdx_data)
    return cdx_path


def _save_extracted_object(
    cdx_data: bytes,
    source_path: str,
    entry_name: str,
    output_dir: str,
    output_format: str,
    convert_method: str,
    _converter,
) -> ExtractedObject:
    """Save a single extracted CDX blob to disk (CDX and/or CDXML).

    Args:
        cdx_data: Raw CDX bytes of the object.
        source_path: Location of the object in the source document; stored
            on the returned record unchanged.
        entry_name: Base file name (no extension) for the output files.
        output_dir: Existing directory that receives the output files.
        output_format: "cdx", "cdxml", or "both".
        convert_method: Passed as ``method=`` to
            ``_converter.convert_cdx_to_cdxml``.
        _converter: Converter module, or None when unavailable.

    Returns:
        An ``ExtractedObject`` whose ``cdx_output`` / ``cdxml_output`` hold
        the paths that were written. When CDXML was requested but could not
        be produced, ``error`` explains why and the CDX bytes are written
        instead so the data is not lost.
    """
    obj = ExtractedObject(source_path=source_path, cdx_data=cdx_data)

    # Explicitly requested CDX output.
    if output_format in ("cdx", "both"):
        obj.cdx_output = _write_cdx_file(cdx_data, output_dir, entry_name)

    if output_format not in ("cdxml", "both"):
        return obj

    if _converter is None:
        obj.error = "cdx_converter unavailable; saved CDX only"
    else:
        cdxml_path = os.path.join(output_dir, f"{entry_name}.cdxml")
        try:
            cdxml_str = _converter.convert_cdx_to_cdxml(
                cdx_data, method=convert_method
            )
            with open(cdxml_path, "w", encoding="utf-8") as f:
                f.write(cdxml_str)
            obj.cdxml_output = cdxml_path
        except Exception as e:
            # Best effort: a failed conversion must not abort the extraction.
            obj.error = f"CDXML conversion failed: {e}"

    # No CDXML was produced and no CDX written yet: save the CDX as fallback.
    if obj.cdxml_output is None and obj.cdx_output is None:
        obj.cdx_output = _write_cdx_file(cdx_data, output_dir, entry_name)

    return obj
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _extract_from_zip(
    input_path: str,
    output_dir: str,
    output_format: str,
    convert_method: str,
    _converter,
) -> List[ExtractedObject]:
    """Extract ChemDraw objects from a ZIP-based Office file (PPTX/DOCX/XLSX).

    Each embedding returned by ``find_ole_entries`` is read from the
    archive and passed to ``extract_cdx_from_ole``; blobs that yield CDX
    data are saved through ``_save_extracted_object`` under the member's
    base name without its extension.

    Returns:
        One ``ExtractedObject`` per ChemDraw object, in archive order.
    """
    extracted = []
    members = find_ole_entries(input_path)

    with zipfile.ZipFile(input_path, "r") as archive:
        for member in members:
            cdx_bytes = extract_cdx_from_ole(archive.read(member))
            # Blobs that are not ChemDraw objects are skipped silently.
            if cdx_bytes is not None:
                stem = os.path.splitext(os.path.basename(member))[0]
                extracted.append(
                    _save_extracted_object(
                        cdx_bytes, member, stem,
                        output_dir, output_format, convert_method, _converter,
                    )
                )

    return extracted
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _extract_from_xls(
    input_path: str,
    output_dir: str,
    output_format: str,
    convert_method: str,
    _converter,
) -> List[ExtractedObject]:
    """Extract ChemDraw objects from an OLE2-native XLS file.

    The workbook is opened as an OLE2 compound document. Top-level
    storages whose name starts with 'MBD' (e.g. MBD078DC381) are visited
    in sorted order; a storage counts as a ChemDraw object when it has a
    CONTENTS stream beginning with ``CDX_MAGIC``. Saved objects are named
    oleObject1, oleObject2, ... in the order they are found.

    Returns:
        One ``ExtractedObject`` per ChemDraw object found.
    """
    saved = []
    ole = olefile.OleFileIO(input_path)
    try:
        # Names of the top-level MBD* storages (embedded OLE objects).
        storage_names = {
            parts[0]
            for parts in ole.listdir(storages=True, streams=False)
            if parts and parts[0].startswith("MBD")
        }

        counter = 0
        for storage_name in sorted(storage_names):
            stream_path = f"{storage_name}/CONTENTS"
            if not ole.exists(stream_path):
                continue

            payload = ole.openstream(stream_path).read()
            if not payload.startswith(CDX_MAGIC):
                continue

            # Only storages that really hold CDX data consume a number.
            counter += 1
            saved.append(
                _save_extracted_object(
                    payload, stream_path, f"oleObject{counter}",
                    output_dir, output_format, convert_method, _converter,
                )
            )
    finally:
        ole.close()

    return saved
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def extract_from_office(
    input_path: str,
    output_dir: Optional[str] = None,
    output_format: str = "cdxml",
    convert_method: str = "auto",
) -> List[ExtractedObject]:
    """Extract every ChemDraw object embedded in an Office file.

    Args:
        input_path: Path to a .pptx, .docx, .xlsx, or .xls file.
        output_dir: Directory for the extracted files; it is created if
            missing. Default: "<basename>_chemdraw" next to the input file.
        output_format: "cdx", "cdxml", or "both".
        convert_method: Backend for CDX→CDXML conversion (passed to
            cdx_converter).

    Returns:
        List of ExtractedObject with the extraction results.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    ext = os.path.splitext(input_path)[1].lower()
    is_ole2 = ext in OLE2_FORMATS

    if not (ext in EMBEDDING_PATTERNS or is_ole2):
        raise ValueError(
            f"Unsupported file type: {ext}. "
            f"Use .pptx, .docx, .xlsx, or .xls."
        )

    if output_dir is None:
        parent = os.path.dirname(input_path) or "."
        stem = os.path.splitext(os.path.basename(input_path))[0]
        output_dir = os.path.join(parent, f"{stem}_chemdraw")

    os.makedirs(output_dir, exist_ok=True)
    _converter = _get_converter(output_format)

    # .xls is an OLE2 compound document; all other formats are ZIP archives.
    extractor = _extract_from_xls if is_ole2 else _extract_from_zip
    return extractor(
        input_path, output_dir, output_format, convert_method, _converter,
    )
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def print_summary(results: List[ExtractedObject], input_path: str) -> None:
    """Print a human-readable extraction report to stdout.

    Args:
        results: Objects returned by the extraction.
        input_path: Path of the processed file; only its base name is shown.
    """
    rule = "=" * 60
    print(rule)
    print(f"OLE Extractor - {os.path.basename(input_path)}")
    print(rule)

    if not results:
        print("No ChemDraw objects found.")
        return

    print(f"Found {len(results)} ChemDraw object(s):\n")
    for number, item in enumerate(results, 1):
        print(f"  [{number}] {item.source_path}")
        print(f"      CDX size: {len(item.cdx_data):,} bytes")
        if item.cdx_output:
            print(f"      CDX:   {item.cdx_output}")
        if item.cdxml_output:
            # Size is read back from disk, so the file must still exist.
            cdxml_size = os.path.getsize(item.cdxml_output)
            print(f"      CDXML: {item.cdxml_output} ({cdxml_size:,} bytes)")
        if item.error:
            print(f"      Note:  {item.error}")
        print()
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
# ---------------------------------------------------------------------------
|
|
329
|
+
# CLI
|
|
330
|
+
# ---------------------------------------------------------------------------
|
|
331
|
+
|
|
332
|
+
def main(argv=None) -> int:
    """Command-line entry point.

    Args:
        argv: Argument list passed to argparse; None makes argparse read
            ``sys.argv``.

    Returns:
        0 on success, 1 when the input file does not exist or extraction
        raises an exception (the message is printed to stderr).
    """
    parser = argparse.ArgumentParser(
        description="Extract embedded ChemDraw objects from Office files."
    )
    parser.add_argument(
        "input",
        help="Input file (.pptx, .docx, .xlsx, or .xls)",
    )
    parser.add_argument(
        "-o",
        "--output-dir",
        help="Output directory (default: <input_basename>_chemdraw/)",
    )
    parser.add_argument(
        "--format",
        choices=["cdxml", "cdx", "both"],
        default="cdxml",
        help="Output format (default: cdxml)",
    )
    parser.add_argument(
        "--method",
        choices=["auto", "com", "pycdxml", "obabel"],
        default="auto",
        help="CDX→CDXML conversion backend (default: auto)",
    )
    options = parser.parse_args(argv)

    source = options.input
    if not os.path.isfile(source):
        print(f"Error: file not found: {source}", file=sys.stderr)
        return 1

    # Top-level boundary: report any failure and signal it via the exit code.
    try:
        extracted = extract_from_office(
            source,
            output_dir=options.output_dir,
            output_format=options.format,
            convert_method=options.method,
        )
        print_summary(extracted, source)
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
|