cdxml-toolkit 0.5.0__tar.gz → 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. {cdxml_toolkit-0.5.0/cdxml_toolkit.egg-info → cdxml_toolkit-0.5.2}/PKG-INFO +25 -52
  2. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/README.md +5 -17
  3. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/mcp_server/server.py +13 -9
  4. cdxml_toolkit-0.5.2/cdxml_toolkit/office/ole_extractor.py +374 -0
  5. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2/cdxml_toolkit.egg-info}/PKG-INFO +25 -52
  6. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit.egg-info/requires.txt +13 -41
  7. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/pyproject.toml +18 -27
  8. cdxml_toolkit-0.5.0/cdxml_toolkit/office/ole_extractor.py +0 -272
  9. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/LICENSE +0 -0
  10. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/NOTICE.md +0 -0
  11. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/__init__.py +0 -0
  12. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/_jre/__init__.py +0 -0
  13. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  14. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/__init__.py +0 -0
  15. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/deterministic/__init__.py +0 -0
  16. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +0 -0
  17. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +0 -0
  18. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +0 -0
  19. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/deterministic/lcms_identifier.py +0 -0
  20. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/deterministic/mass_resolver.py +0 -0
  21. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +0 -0
  22. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/deterministic/procedure_writer.py +0 -0
  23. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/extract_nmr.py +0 -0
  24. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/format_procedure_entry.py +0 -0
  25. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/lcms_analyzer.py +0 -0
  26. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/analysis/parse_analysis_file.py +0 -0
  27. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/cdxml_builder.py +0 -0
  28. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/cdxml_utils.py +0 -0
  29. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/chemdraw/__init__.py +0 -0
  30. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/chemdraw/_chemscript_server.py +0 -0
  31. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/chemdraw/cdx_converter.py +0 -0
  32. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/chemdraw/cdxml_to_image.py +0 -0
  33. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +0 -0
  34. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/chemdraw/chemscript_bridge.py +0 -0
  35. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/constants.py +0 -0
  36. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/coord_normalizer.py +0 -0
  37. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/__init__.py +0 -0
  38. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +0 -0
  39. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +0 -0
  40. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +0 -0
  41. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +0 -0
  42. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +0 -0
  43. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +0 -0
  44. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +0 -0
  45. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +0 -0
  46. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/image/__init__.py +0 -0
  47. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/image/reaction_from_image.py +0 -0
  48. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/image/structure_from_image.py +0 -0
  49. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/layout/__init__.py +0 -0
  50. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/layout/alignment.py +0 -0
  51. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/layout/reaction_cleanup.py +0 -0
  52. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/layout/scheme_merger.py +0 -0
  53. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/mcp_server/__init__.py +0 -0
  54. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/mcp_server/__main__.py +0 -0
  55. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/naming/__init__.py +0 -0
  56. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/naming/aligned_namer.py +0 -0
  57. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/naming/mol_builder.py +0 -0
  58. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/naming/name_decomposer.py +0 -0
  59. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/naming/reactions_datamol.json +0 -0
  60. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/office/__init__.py +0 -0
  61. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/office/doc_from_template.py +0 -0
  62. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/office/ole_embedder.py +0 -0
  63. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/__init__.py +0 -0
  64. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/compound_search.py +0 -0
  65. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/eln_csv_parser.py +0 -0
  66. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/rdf_parser.py +0 -0
  67. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/reactant_heuristic.py +0 -0
  68. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/reaction_parser.py +0 -0
  69. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/scheme_reader.py +0 -0
  70. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/scheme_refine.py +0 -0
  71. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/scheme_segmenter.py +0 -0
  72. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/perception/spatial_assignment.py +0 -0
  73. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/rdkit_utils.py +0 -0
  74. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/__init__.py +0 -0
  75. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/auto_layout.py +0 -0
  76. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/compact_parser.py +0 -0
  77. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/parser.py +0 -0
  78. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/render_scheme.py +0 -0
  79. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/renderer.py +0 -0
  80. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/schema.py +0 -0
  81. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/scheme_maker.py +0 -0
  82. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/render/scheme_yaml_writer.py +0 -0
  83. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/__init__.py +0 -0
  84. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/cas_resolver.py +0 -0
  85. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/chemscanner_abbreviations.json +0 -0
  86. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/condensed_formula.py +0 -0
  87. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/jre_manager.py +0 -0
  88. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/reagent_abbreviations.json +0 -0
  89. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/reagent_db.py +0 -0
  90. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/superatom_data.json +0 -0
  91. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/resolve/superatom_table.py +0 -0
  92. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit/text_formatting.py +0 -0
  93. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit.egg-info/SOURCES.txt +0 -0
  94. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit.egg-info/dependency_links.txt +0 -0
  95. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit.egg-info/entry_points.txt +0 -0
  96. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/cdxml_toolkit.egg-info/top_level.txt +0 -0
  97. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/setup.cfg +0 -0
  98. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_builder.py +0 -0
  99. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_cdxml_utils.py +0 -0
  100. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_condensed_formula.py +0 -0
  101. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_constants.py +0 -0
  102. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_merge_yaml.py +0 -0
  103. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_mol_builder.py +0 -0
  104. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_rdkit_utils.py +0 -0
  105. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_reaction_parser.py +0 -0
  106. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_reagent_db.py +0 -0
  107. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_scheme_maker.py +0 -0
  108. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_scheme_reader.py +0 -0
  109. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_smoke.py +0 -0
  110. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_smoke_extended.py +0 -0
  111. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_spatial_assignment.py +0 -0
  112. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_superatom_table.py +0 -0
  113. {cdxml_toolkit-0.5.0 → cdxml_toolkit-0.5.2}/tests/test_text_formatting.py +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cdxml-toolkit
3
- Version: 0.5.0
4
- Summary: Python toolkit for ChemDraw CDXML reaction scheme processing, layout, and rendering.
3
+ Version: 0.5.2
4
+ Summary: MCP server and Python toolkit for perception, rendering, and analysis of molecules and reaction schemes in ChemDraw CDXML.
5
5
  Author: Hiu Fung Kevin Lee
6
6
  License-Expression: MIT
7
7
  Project-URL: Homepage, https://github.com/leehiufung911/cdxml-toolkit
@@ -22,40 +22,25 @@ Description-Content-Type: text/markdown
22
22
  License-File: LICENSE
23
23
  License-File: NOTICE.md
24
24
  Requires-Dist: lxml>=4.6
25
- Provides-Extra: rdkit
26
- Requires-Dist: rdkit>=2023.03; extra == "rdkit"
27
- Provides-Extra: chemdraw
28
- Requires-Dist: pywin32>=300; extra == "chemdraw"
29
- Provides-Extra: image
30
- Requires-Dist: opencv-python>=4.5; extra == "image"
31
- Requires-Dist: Pillow>=9.0; extra == "image"
32
- Provides-Extra: office
33
- Requires-Dist: python-pptx>=0.6; extra == "office"
34
- Requires-Dist: python-docx>=0.8; extra == "office"
35
- Requires-Dist: olefile>=0.46; extra == "office"
36
- Provides-Extra: yaml
37
- Requires-Dist: pyyaml>=5.4; extra == "yaml"
38
- Provides-Extra: mcp
39
- Requires-Dist: mcp>=1.0; extra == "mcp"
40
- Requires-Dist: pyyaml>=5.4; extra == "mcp"
41
- Provides-Extra: analysis
42
- Requires-Dist: pdfplumber>=0.7; extra == "analysis"
43
- Provides-Extra: chemscript
44
- Requires-Dist: pythonnet>=3.0; extra == "chemscript"
45
- Provides-Extra: decimer
46
- Requires-Dist: tensorflow>=2.16; extra == "decimer"
47
- Requires-Dist: decimer>=2.6; extra == "decimer"
48
- Requires-Dist: pymupdf>=1.20; extra == "decimer"
49
- Provides-Extra: ocr
50
- Requires-Dist: pytesseract>=0.3; extra == "ocr"
51
- Provides-Extra: opsin
52
- Requires-Dist: py2opsin>=1.0; extra == "opsin"
53
- Provides-Extra: all
54
- Requires-Dist: cdxml-toolkit[analysis,chemdraw,chemscript,image,mcp,office,opsin,rdkit,yaml]; extra == "all"
55
- Provides-Extra: full
56
- Requires-Dist: cdxml-toolkit[all,decimer,ocr]; extra == "full"
25
+ Requires-Dist: rdkit>=2023.03
26
+ Requires-Dist: pywin32>=300
27
+ Requires-Dist: opencv-python>=4.5
28
+ Requires-Dist: Pillow>=9.0
29
+ Requires-Dist: python-pptx>=0.6
30
+ Requires-Dist: python-docx>=0.8
31
+ Requires-Dist: olefile>=0.46
32
+ Requires-Dist: pyyaml>=5.4
33
+ Requires-Dist: mcp>=1.0
34
+ Requires-Dist: pdfplumber>=0.7
35
+ Requires-Dist: pythonnet>=3.0
36
+ Requires-Dist: tensorflow>=2.16
37
+ Requires-Dist: decimer>=2.6
38
+ Requires-Dist: pymupdf>=1.20
39
+ Requires-Dist: pytesseract>=0.3
40
+ Requires-Dist: py2opsin>=1.0
41
+ Requires-Dist: pytest>=7.0
57
42
  Provides-Extra: dev
58
- Requires-Dist: cdxml-toolkit[all]; extra == "dev"
43
+ Requires-Dist: cdxml-toolkit; extra == "dev"
59
44
  Requires-Dist: pytest>=7.0; extra == "dev"
60
45
  Dynamic: license-file
61
46
 
@@ -172,12 +157,11 @@ Expected: 2 tool calls (resolve_name, draw_molecule), produces an aspirin CDXML
172
157
  **Prerequisites:** Windows with ChemDraw (ChemOffice 2015+) and ChemScript installed.
173
158
 
174
159
  ```bash
175
- # From GitHub recommended install (includes RDKit, MCP server, ChemDraw COM,
176
- # Office support, PDF parsing, image processing, and ChemScript bridge)
177
- pip install "cdxml-toolkit[all] @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
160
+ # From PyPI (recommended)
161
+ pip install cdxml-toolkit
178
162
 
179
- # With DECIMER neural image extraction (extract_structures_from_image)
180
- pip install "cdxml-toolkit[all,decimer] @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
163
+ # From GitHub (latest development version)
164
+ pip install "cdxml-toolkit @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
181
165
 
182
166
  # Development (editable install)
183
167
  git clone https://github.com/leehiufung911/cdxml-toolkit.git
@@ -185,18 +169,7 @@ cd cdxml-toolkit
185
169
  pip install -e ".[dev]"
186
170
  ```
187
171
 
188
- **Required:** `lxml>=4.6`. **Recommended:** `rdkit>=2023.03` (needed for scheme rendering).
189
-
190
- ### Extras
191
-
192
- | Extra | What it includes | Notes |
193
- |-------|-----------------|-------|
194
- | `[all]` | RDKit, pywin32, image, Office, YAML, PDF analysis, MCP server, pythonnet, py2opsin | **Use this.** Everything most users need. |
195
- | `[decimer]` | TensorFlow, DECIMER, PyMuPDF | Neural image-to-SMILES. Adds ~1 GB. |
196
- | `[full]` | `[all]` + `[decimer]` + `[ocr]` | Everything pip-installable. |
197
- | `[rdkit]` | RDKit only | Minimal install for scripting. |
198
- | `[mcp]` | MCP server + PyYAML | MCP server only (no RDKit/Office). |
199
- | `[dev]` | `[all]` + pytest | For running the test suite. |
172
+ Everything is included by default: RDKit, MCP server, ChemDraw COM, Office support, PDF analysis, image processing, ChemScript bridge, DECIMER neural image extraction, OPSIN, and OCR.
200
173
 
201
174
  ### Name resolution tiers
202
175
 
@@ -111,12 +111,11 @@ Expected: 2 tool calls (resolve_name, draw_molecule), produces an aspirin CDXML
111
111
  **Prerequisites:** Windows with ChemDraw (ChemOffice 2015+) and ChemScript installed.
112
112
 
113
113
  ```bash
114
- # From GitHub recommended install (includes RDKit, MCP server, ChemDraw COM,
115
- # Office support, PDF parsing, image processing, and ChemScript bridge)
116
- pip install "cdxml-toolkit[all] @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
114
+ # From PyPI (recommended)
115
+ pip install cdxml-toolkit
117
116
 
118
- # With DECIMER neural image extraction (extract_structures_from_image)
119
- pip install "cdxml-toolkit[all,decimer] @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
117
+ # From GitHub (latest development version)
118
+ pip install "cdxml-toolkit @ git+https://github.com/leehiufung911/cdxml-toolkit.git@main"
120
119
 
121
120
  # Development (editable install)
122
121
  git clone https://github.com/leehiufung911/cdxml-toolkit.git
@@ -124,18 +123,7 @@ cd cdxml-toolkit
124
123
  pip install -e ".[dev]"
125
124
  ```
126
125
 
127
- **Required:** `lxml>=4.6`. **Recommended:** `rdkit>=2023.03` (needed for scheme rendering).
128
-
129
- ### Extras
130
-
131
- | Extra | What it includes | Notes |
132
- |-------|-----------------|-------|
133
- | `[all]` | RDKit, pywin32, image, Office, YAML, PDF analysis, MCP server, pythonnet, py2opsin | **Use this.** Everything most users need. |
134
- | `[decimer]` | TensorFlow, DECIMER, PyMuPDF | Neural image-to-SMILES. Adds ~1 GB. |
135
- | `[full]` | `[all]` + `[decimer]` + `[ocr]` | Everything pip-installable. |
136
- | `[rdkit]` | RDKit only | Minimal install for scripting. |
137
- | `[mcp]` | MCP server + PyYAML | MCP server only (no RDKit/Office). |
138
- | `[dev]` | `[all]` + pytest | For running the test suite. |
126
+ Everything is included by default: RDKit, MCP server, ChemDraw COM, Office support, PDF analysis, image processing, ChemScript bridge, DECIMER neural image extraction, OPSIN, and OCR.
139
127
 
140
128
  ### Name resolution tiers
141
129
 
@@ -12,7 +12,7 @@ Tools:
12
12
  convert_cdx_cdxml — Bidirectional CDX ↔ CDXML file conversion
13
13
  parse_analysis_file — LCMS/NMR PDF → peaks and data
14
14
  format_lab_entry — Entry dicts → formatted lab book text
15
- extract_cdxml_from_office — PPTX/DOCX → embedded CDXML files
15
+ extract_cdxml_from_office — PPTX/DOCX/XLS/XLSX → embedded CDXML files
16
16
  embed_cdxml_in_office — CDXML → editable OLE object in PPTX/DOCX
17
17
  search_compound — SMILES → exact/similar matches across experiments
18
18
  render_to_png — CDXML → PNG via ChemDraw COM
@@ -1140,16 +1140,17 @@ def extract_cdxml_from_office(
1140
1140
  file_path: str,
1141
1141
  output_dir: Optional[str] = None,
1142
1142
  ) -> dict:
1143
- """Extract embedded ChemDraw objects from a PPTX or DOCX file.
1143
+ """Extract embedded ChemDraw objects from a PPTX, DOCX, XLS, or XLSX file.
1144
1144
 
1145
- Office files (PPTX/DOCX) are ZIP archives that may contain ChemDraw OLE
1146
- objects as binary blobs. This tool extracts every ChemDraw object, converts
1147
- it to CDXML, and writes the files to output_dir.
1145
+ Office files (PPTX/DOCX/XLSX) are ZIP archives that may contain ChemDraw OLE
1146
+ objects as binary blobs. XLS files are OLE2 compound documents with embedded
1147
+ ChemDraw objects stored in MBD* sub-storages. This tool extracts every
1148
+ ChemDraw object, converts it to CDXML, and writes the files to output_dir.
1148
1149
 
1149
1150
  Requires: olefile. CDX→CDXML conversion uses available backends.
1150
1151
 
1151
1152
  Args:
1152
- file_path: Path to a .pptx or .docx file.
1153
+ file_path: Path to a .pptx, .docx, .xlsx, or .xls file.
1153
1154
  output_dir: Directory for extracted CDXML files. Default: a folder
1154
1155
  named "<basename>_chemdraw/" next to the input file.
1155
1156
 
@@ -1161,10 +1162,12 @@ def extract_cdxml_from_office(
1161
1162
  if not file_path or not file_path.strip():
1162
1163
  return (
1163
1164
  "Usage: extract_cdxml_from_office(file_path='document.pptx', output_dir='out/')\n"
1164
- "Extracts embedded ChemDraw objects from PPTX or DOCX files.\n"
1165
+ "Extracts embedded ChemDraw objects from PPTX, DOCX, XLS, or XLSX files.\n"
1165
1166
  "Examples:\n"
1166
1167
  " extract_cdxml_from_office(file_path='presentation.pptx')\n"
1167
1168
  " extract_cdxml_from_office(file_path='report.docx', output_dir='extracted/')\n"
1169
+ " extract_cdxml_from_office(file_path='labbook.xls')\n"
1170
+ " extract_cdxml_from_office(file_path='labbook.xlsx')\n"
1168
1171
  "Returns: {ok, count, objects:[{cdxml_output, source_path}]}"
1169
1172
  )
1170
1173
 
@@ -1183,8 +1186,9 @@ def extract_cdxml_from_office(
1183
1186
  "ok": False,
1184
1187
  "error": (
1185
1188
  f"Extraction failed: {e}. "
1186
- "Ensure the file is a valid .pptx or .docx and that olefile is installed "
1187
- "(pip install olefile). The file must contain embedded ChemDraw OLE objects."
1189
+ "Ensure the file is a valid .pptx, .docx, .xlsx, or .xls and that olefile "
1190
+ "is installed (pip install olefile). The file must contain embedded "
1191
+ "ChemDraw OLE objects."
1188
1192
  ),
1189
1193
  "input": str(p),
1190
1194
  }
@@ -0,0 +1,374 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ OLE Extractor — Extract embedded ChemDraw objects from Office files.
4
+
5
+ Supports four Office formats:
6
+ - PPTX / DOCX / XLSX — ZIP archives containing OLE blobs in known paths.
7
+ - XLS — OLE2 compound documents with MBD* sub-storages.
8
+
9
+ ChemDraw objects are stored as CDX data inside the OLE "CONTENTS" stream.
10
+ This tool extracts and optionally converts them to CDXML.
11
+
12
+ Usage:
13
+ python ole_extractor.py input.pptx [-o output_dir/] [--format cdxml|cdx|both]
14
+ python ole_extractor.py input.docx [-o output_dir/] [--format cdxml|cdx|both]
15
+ python ole_extractor.py input.xlsx [-o output_dir/] [--format cdxml|cdx|both]
16
+ python ole_extractor.py input.xls [-o output_dir/] [--format cdxml|cdx|both]
17
+
18
+ Requires: olefile, cdx_converter (for CDXML conversion)
19
+ """
20
+
21
+ import argparse
22
+ import io
23
+ import os
24
+ import sys
25
+ import zipfile
26
+ from dataclasses import dataclass, field
27
+ from typing import List, Optional
28
+
29
+ import olefile
30
+
31
+ # ChemDraw OLE CLSIDs recognised as ChemDraw objects (currently only CS ChemDraw Drawing)
32
+ CHEMDRAW_CLSIDS = {
33
+ "41BA6D21-A02E-11CE-8FD9-0020AFD1F20C", # ChemDraw Drawing
34
+ }
35
+
36
+ # CDX binary magic bytes
37
+ CDX_MAGIC = b"VjCD"
38
+
39
+ # Where Office stores OLE embeddings (ZIP-based formats)
40
+ EMBEDDING_PATTERNS = {
41
+ ".pptx": "ppt/embeddings/",
42
+ ".docx": "word/embeddings/",
43
+ ".xlsx": "xl/embeddings/",
44
+ }
45
+
46
+ # OLE2-native formats (not ZIP-based) — handled separately
47
+ OLE2_FORMATS = {".xls"}
48
+
49
+
50
+ @dataclass
51
+ class ExtractedObject:
52
+ """A single extracted ChemDraw object."""
53
+ source_path: str # path inside the ZIP (e.g. ppt/embeddings/oleObject1.bin) or, for XLS, the OLE stream path (e.g. MBD078DC381/CONTENTS)
54
+ cdx_data: bytes
55
+ cdx_output: Optional[str] = None # path where CDX was saved
56
+ cdxml_output: Optional[str] = None # path where CDXML was saved
57
+ error: Optional[str] = None
58
+
59
+
60
+ def find_ole_entries(zip_path: str) -> List[str]:
61
+ """List OLE embedding paths inside a PPTX/DOCX/XLSX ZIP."""
62
+ ext = os.path.splitext(zip_path)[1].lower()
63
+ prefix = EMBEDDING_PATTERNS.get(ext)
64
+ if prefix is None:
65
+ raise ValueError(
66
+ f"Unsupported ZIP-based file type: {ext}. "
67
+ f"Use .pptx, .docx, or .xlsx."
68
+ )
69
+
70
+ with zipfile.ZipFile(zip_path, "r") as zf:
71
+ return [
72
+ name for name in zf.namelist()
73
+ if name.startswith(prefix) and name.lower().endswith(".bin")
74
+ ]
75
+
76
+
77
+ def is_chemdraw_ole(ole: olefile.OleFileIO) -> bool:
78
+ """Check if an OLE container holds a ChemDraw object."""
79
+ # Check CLSID
80
+ clsid = ole.root.clsid.upper() if ole.root.clsid else ""
81
+ if clsid in CHEMDRAW_CLSIDS:
82
+ return True
83
+
84
+ # Check for CONTENTS stream with CDX magic
85
+ if ole.exists("CONTENTS"):
86
+ header = ole.openstream("CONTENTS").read(4)
87
+ if header == CDX_MAGIC:
88
+ return True
89
+
90
+ return False
91
+
92
+
93
+ def extract_cdx_from_ole(ole_data: bytes) -> Optional[bytes]:
94
+ """Extract raw CDX bytes from an OLE compound document."""
95
+ if not olefile.isOleFile(io.BytesIO(ole_data)):
96
+ return None
97
+
98
+ ole = olefile.OleFileIO(io.BytesIO(ole_data))
99
+ try:
100
+ if not is_chemdraw_ole(ole):
101
+ return None
102
+
103
+ if ole.exists("CONTENTS"):
104
+ cdx = ole.openstream("CONTENTS").read()
105
+ if cdx[:4] == CDX_MAGIC:
106
+ return cdx
107
+
108
+ # Fallback: check \x01Ole10Native stream
109
+ if ole.exists("\x01Ole10Native"):
110
+ data = ole.openstream("\x01Ole10Native").read()
111
+ # Skip 4-byte length prefix
112
+ if len(data) > 4 and data[4:8] == CDX_MAGIC:
113
+ return data[4:]
114
+
115
+ return None
116
+ finally:
117
+ ole.close()
118
+
119
+
120
+ def _get_converter(output_format: str):
121
+ """Lazy-import cdx_converter if CDXML output is requested."""
122
+ if output_format not in ("cdxml", "both"):
123
+ return None
124
+ try:
125
+ from ..chemdraw import cdx_converter
126
+ return cdx_converter
127
+ except ImportError:
128
+ print(
129
+ "Warning: cdx_converter not found. CDX files will be saved "
130
+ "but CDXML conversion is unavailable.",
131
+ file=sys.stderr,
132
+ )
133
+ return None
134
+
135
+
136
+ def _save_extracted_object(
137
+ cdx_data: bytes,
138
+ source_path: str,
139
+ entry_name: str,
140
+ output_dir: str,
141
+ output_format: str,
142
+ convert_method: str,
143
+ _converter,
144
+ ) -> ExtractedObject:
145
+ """Save a single extracted CDX blob to disk (CDX and/or CDXML)."""
146
+ obj = ExtractedObject(source_path=source_path, cdx_data=cdx_data)
147
+
148
+ # Save CDX
149
+ if output_format in ("cdx", "both"):
150
+ cdx_path = os.path.join(output_dir, f"{entry_name}.cdx")
151
+ with open(cdx_path, "wb") as f:
152
+ f.write(cdx_data)
153
+ obj.cdx_output = cdx_path
154
+
155
+ # Convert to CDXML
156
+ if output_format in ("cdxml", "both"):
157
+ cdxml_path = os.path.join(output_dir, f"{entry_name}.cdxml")
158
+ if _converter is not None:
159
+ try:
160
+ cdxml_str = _converter.convert_cdx_to_cdxml(
161
+ cdx_data, method=convert_method
162
+ )
163
+ with open(cdxml_path, "w", encoding="utf-8") as f:
164
+ f.write(cdxml_str)
165
+ obj.cdxml_output = cdxml_path
166
+ except Exception as e:
167
+ obj.error = f"CDXML conversion failed: {e}"
168
+ # Still save CDX as fallback
169
+ if obj.cdx_output is None:
170
+ fallback = os.path.join(output_dir, f"{entry_name}.cdx")
171
+ with open(fallback, "wb") as f:
172
+ f.write(cdx_data)
173
+ obj.cdx_output = fallback
174
+ else:
175
+ # No converter — save CDX instead
176
+ if obj.cdx_output is None:
177
+ fallback = os.path.join(output_dir, f"{entry_name}.cdx")
178
+ with open(fallback, "wb") as f:
179
+ f.write(cdx_data)
180
+ obj.cdx_output = fallback
181
+ obj.error = "cdx_converter unavailable; saved CDX only"
182
+
183
+ return obj
184
+
185
+
186
+ def _extract_from_zip(
187
+ input_path: str,
188
+ output_dir: str,
189
+ output_format: str,
190
+ convert_method: str,
191
+ _converter,
192
+ ) -> List[ExtractedObject]:
193
+ """Extract ChemDraw objects from a ZIP-based Office file (PPTX/DOCX/XLSX)."""
194
+ ole_entries = find_ole_entries(input_path)
195
+ results = []
196
+
197
+ with zipfile.ZipFile(input_path, "r") as zf:
198
+ for entry in ole_entries:
199
+ ole_data = zf.read(entry)
200
+ cdx_data = extract_cdx_from_ole(ole_data)
201
+
202
+ if cdx_data is None:
203
+ # Not a ChemDraw object — skip silently
204
+ continue
205
+
206
+ entry_name = os.path.splitext(os.path.basename(entry))[0]
207
+ obj = _save_extracted_object(
208
+ cdx_data, entry, entry_name,
209
+ output_dir, output_format, convert_method, _converter,
210
+ )
211
+ results.append(obj)
212
+
213
+ return results
214
+
215
+
216
+ def _extract_from_xls(
217
+ input_path: str,
218
+ output_dir: str,
219
+ output_format: str,
220
+ convert_method: str,
221
+ _converter,
222
+ ) -> List[ExtractedObject]:
223
+ """Extract ChemDraw objects from an OLE2-native XLS file.
224
+
225
+ XLS files are OLE2 compound documents. Embedded objects are stored as
226
+ sub-storages with names starting with 'MBD' (e.g. MBD078DC381).
227
+ ChemDraw objects contain a CONTENTS stream with CDX binary data.
228
+ """
229
+ results = []
230
+ ole = olefile.OleFileIO(input_path)
231
+ try:
232
+ # Find top-level MBD* storages (embedded OLE objects)
233
+ seen_storages = set()
234
+ for entry in ole.listdir(storages=True, streams=False):
235
+ if len(entry) >= 1 and entry[0].startswith("MBD"):
236
+ seen_storages.add(entry[0])
237
+
238
+ obj_index = 0
239
+ for storage_name in sorted(seen_storages):
240
+ contents_path = f"{storage_name}/CONTENTS"
241
+ if not ole.exists(contents_path):
242
+ continue
243
+
244
+ cdx = ole.openstream(contents_path).read()
245
+ if len(cdx) < 4 or cdx[:4] != CDX_MAGIC:
246
+ continue
247
+
248
+ obj_index += 1
249
+ source_path = f"{storage_name}/CONTENTS"
250
+ entry_name = f"oleObject{obj_index}"
251
+ obj = _save_extracted_object(
252
+ cdx, source_path, entry_name,
253
+ output_dir, output_format, convert_method, _converter,
254
+ )
255
+ results.append(obj)
256
+ finally:
257
+ ole.close()
258
+
259
+ return results
260
+
261
+
262
+ def extract_from_office(
263
+ input_path: str,
264
+ output_dir: Optional[str] = None,
265
+ output_format: str = "cdxml",
266
+ convert_method: str = "auto",
267
+ ) -> List[ExtractedObject]:
268
+ """Extract all ChemDraw objects from an Office file.
269
+
270
+ Args:
271
+ input_path: Path to .pptx, .docx, .xlsx, or .xls file.
272
+ output_dir: Directory for extracted files. Default: <basename>_chemdraw/
273
+ output_format: "cdx", "cdxml", or "both".
274
+ convert_method: Backend for CDX→CDXML conversion (passed to cdx_converter).
275
+
276
+ Returns:
277
+ List of ExtractedObject with extraction results.
278
+ """
279
+ ext = os.path.splitext(input_path)[1].lower()
280
+
281
+ if ext not in EMBEDDING_PATTERNS and ext not in OLE2_FORMATS:
282
+ raise ValueError(
283
+ f"Unsupported file type: {ext}. "
284
+ f"Use .pptx, .docx, .xlsx, or .xls."
285
+ )
286
+
287
+ if output_dir is None:
288
+ basename = os.path.splitext(os.path.basename(input_path))[0]
289
+ output_dir = os.path.join(os.path.dirname(input_path) or ".", f"{basename}_chemdraw")
290
+
291
+ os.makedirs(output_dir, exist_ok=True)
292
+ _converter = _get_converter(output_format)
293
+
294
+ if ext in OLE2_FORMATS:
295
+ return _extract_from_xls(
296
+ input_path, output_dir, output_format, convert_method, _converter,
297
+ )
298
+ else:
299
+ return _extract_from_zip(
300
+ input_path, output_dir, output_format, convert_method, _converter,
301
+ )
302
+
303
+
304
+ def print_summary(results: List[ExtractedObject], input_path: str) -> None:
305
+ """Print extraction summary to stdout."""
306
+ print(f"{'=' * 60}")
307
+ print(f"OLE Extractor - {os.path.basename(input_path)}")
308
+ print(f"{'=' * 60}")
309
+
310
+ if not results:
311
+ print("No ChemDraw objects found.")
312
+ return
313
+
314
+ print(f"Found {len(results)} ChemDraw object(s):\n")
315
+ for i, obj in enumerate(results, 1):
316
+ print(f" [{i}] {obj.source_path}")
317
+ print(f" CDX size: {len(obj.cdx_data):,} bytes")
318
+ if obj.cdx_output:
319
+ print(f" CDX: {obj.cdx_output}")
320
+ if obj.cdxml_output:
321
+ size = os.path.getsize(obj.cdxml_output)
322
+ print(f" CDXML: {obj.cdxml_output} ({size:,} bytes)")
323
+ if obj.error:
324
+ print(f" Note: {obj.error}")
325
+ print()
326
+
327
+
328
+ # ---------------------------------------------------------------------------
329
+ # CLI
330
+ # ---------------------------------------------------------------------------
331
+
332
+ def main(argv=None) -> int:
333
+ parser = argparse.ArgumentParser(
334
+ description="Extract embedded ChemDraw objects from Office files."
335
+ )
336
+ parser.add_argument("input", help="Input file (.pptx, .docx, .xlsx, or .xls)")
337
+ parser.add_argument(
338
+ "-o", "--output-dir",
339
+ help="Output directory (default: <input_basename>_chemdraw/)"
340
+ )
341
+ parser.add_argument(
342
+ "--format",
343
+ choices=["cdxml", "cdx", "both"],
344
+ default="cdxml",
345
+ help="Output format (default: cdxml)"
346
+ )
347
+ parser.add_argument(
348
+ "--method",
349
+ choices=["auto", "com", "pycdxml", "obabel"],
350
+ default="auto",
351
+ help="CDX→CDXML conversion backend (default: auto)"
352
+ )
353
+ args = parser.parse_args(argv)
354
+
355
+ if not os.path.isfile(args.input):
356
+ print(f"Error: file not found: {args.input}", file=sys.stderr)
357
+ return 1
358
+
359
+ try:
360
+ results = extract_from_office(
361
+ args.input,
362
+ output_dir=args.output_dir,
363
+ output_format=args.format,
364
+ convert_method=args.method,
365
+ )
366
+ print_summary(results, args.input)
367
+ return 0
368
+ except Exception as e:
369
+ print(f"Error: {e}", file=sys.stderr)
370
+ return 1
371
+
372
+
373
+ if __name__ == "__main__":
374
+ sys.exit(main())