cdxml-toolkit 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. cdxml_toolkit/__init__.py +18 -0
  2. cdxml_toolkit/_jre/__init__.py +2 -0
  3. cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
  4. cdxml_toolkit/analysis/__init__.py +35 -0
  5. cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
  6. cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
  7. cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
  8. cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
  9. cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
  10. cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
  11. cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
  12. cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
  13. cdxml_toolkit/analysis/extract_nmr.py +47 -0
  14. cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
  15. cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
  16. cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
  17. cdxml_toolkit/cdxml_builder.py +920 -0
  18. cdxml_toolkit/cdxml_utils.py +342 -0
  19. cdxml_toolkit/chemdraw/__init__.py +5 -0
  20. cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
  21. cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
  22. cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
  23. cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
  24. cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
  25. cdxml_toolkit/constants.py +304 -0
  26. cdxml_toolkit/coord_normalizer.py +438 -0
  27. cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
  28. cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
  29. cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
  30. cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
  31. cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
  32. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
  33. cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
  34. cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
  35. cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
  36. cdxml_toolkit/image/__init__.py +15 -0
  37. cdxml_toolkit/image/reaction_from_image.py +2103 -0
  38. cdxml_toolkit/image/structure_from_image.py +1711 -0
  39. cdxml_toolkit/layout/__init__.py +5 -0
  40. cdxml_toolkit/layout/alignment.py +1642 -0
  41. cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
  42. cdxml_toolkit/layout/scheme_merger.py +2260 -0
  43. cdxml_toolkit/mcp_server/__init__.py +0 -0
  44. cdxml_toolkit/mcp_server/__main__.py +5 -0
  45. cdxml_toolkit/mcp_server/server.py +1567 -0
  46. cdxml_toolkit/naming/__init__.py +6 -0
  47. cdxml_toolkit/naming/aligned_namer.py +2342 -0
  48. cdxml_toolkit/naming/mol_builder.py +3722 -0
  49. cdxml_toolkit/naming/name_decomposer.py +2843 -0
  50. cdxml_toolkit/naming/reactions_datamol.json +2414 -0
  51. cdxml_toolkit/office/__init__.py +5 -0
  52. cdxml_toolkit/office/doc_from_template.py +722 -0
  53. cdxml_toolkit/office/ole_embedder.py +808 -0
  54. cdxml_toolkit/office/ole_extractor.py +272 -0
  55. cdxml_toolkit/perception/__init__.py +10 -0
  56. cdxml_toolkit/perception/compound_search.py +229 -0
  57. cdxml_toolkit/perception/eln_csv_parser.py +240 -0
  58. cdxml_toolkit/perception/rdf_parser.py +664 -0
  59. cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
  60. cdxml_toolkit/perception/reaction_parser.py +2150 -0
  61. cdxml_toolkit/perception/scheme_reader.py +2948 -0
  62. cdxml_toolkit/perception/scheme_refine.py +1404 -0
  63. cdxml_toolkit/perception/scheme_segmenter.py +619 -0
  64. cdxml_toolkit/perception/spatial_assignment.py +1013 -0
  65. cdxml_toolkit/rdkit_utils.py +605 -0
  66. cdxml_toolkit/render/__init__.py +17 -0
  67. cdxml_toolkit/render/auto_layout.py +229 -0
  68. cdxml_toolkit/render/compact_parser.py +632 -0
  69. cdxml_toolkit/render/parser.py +706 -0
  70. cdxml_toolkit/render/render_scheme.py +267 -0
  71. cdxml_toolkit/render/renderer.py +2387 -0
  72. cdxml_toolkit/render/schema.py +90 -0
  73. cdxml_toolkit/render/scheme_maker.py +1043 -0
  74. cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
  75. cdxml_toolkit/resolve/__init__.py +13 -0
  76. cdxml_toolkit/resolve/cas_resolver.py +430 -0
  77. cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
  78. cdxml_toolkit/resolve/condensed_formula.py +493 -0
  79. cdxml_toolkit/resolve/jre_manager.py +195 -0
  80. cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
  81. cdxml_toolkit/resolve/reagent_db.py +285 -0
  82. cdxml_toolkit/resolve/superatom_data.json +2856 -0
  83. cdxml_toolkit/resolve/superatom_table.py +146 -0
  84. cdxml_toolkit/text_formatting.py +298 -0
  85. cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
  86. cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
  87. cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
  88. cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
  89. cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
  90. cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
  91. cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,342 @@
1
+ """Shared CDXML geometry and IO utilities.
2
+
3
+ Extracted from duplicated code across reaction_cleanup.py,
4
+ eln_enrichment.py, and scheme_polisher.py for v0.3 consolidation.
5
+
6
+ Key design decisions (from CLAUDE.md / reaction_cleanup FINDINGS):
7
+ - Fragment bounding boxes use direct-child <n> atom "p" positions ONLY.
8
+ XML BoundingBox attributes are unreliable, especially for
9
+ NodeType="Fragment" abbreviation groups (OTs, Boc) which report
10
+ the expanded inner structure, not the visible abbreviation.
11
+ - Hanging label detection: when N or P is the bottommost atom with
12
+ <=2 explicit bonds, ChemDraw renders H as a vertical stack below
13
+ the atom symbol, requiring extra layout gap (16pt vs 8pt).
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import xml.etree.ElementTree as ET
19
+ from typing import Dict, Optional, Tuple
20
+
21
+ from .constants import LAYOUT_HANGING_LABEL_GAP
22
+
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Fragment geometry
26
+ # ---------------------------------------------------------------------------
27
+
28
def fragment_bbox(
    frag: ET.Element,
) -> Optional[Tuple[float, float, float, float]]:
    """Compute the bounding box of a ``<fragment>`` from its atoms.

    Only the ``p`` attributes of the fragment's direct ``<n>`` children
    are considered; nested nodes are ignored. The element's own
    ``BoundingBox`` attribute is consulted only when no direct child
    supplies a usable position.

    Returns ``(min_x, min_y, max_x, max_y)``, or *None* when neither
    atom positions nor a four-value ``BoundingBox`` are available.
    Non-numeric coordinate text raises :class:`ValueError`.
    """
    # ``findall("n")`` matches direct children only, which is intended.
    positions = [
        (float(tokens[0]), float(tokens[1]))
        for tokens in (
            (atom.get("p") or "").split() for atom in frag.findall("n")
        )
        if len(tokens) >= 2
    ]
    if positions:
        xs, ys = zip(*positions)
        return min(xs), min(ys), max(xs), max(ys)

    # No positioned atoms: fall back to the declared BoundingBox.
    declared = frag.get("BoundingBox", "")
    if not declared:
        return None
    numbers = [float(token) for token in declared.split()]
    if len(numbers) < 4:
        return None
    return numbers[0], numbers[1], numbers[2], numbers[3]
63
+
64
+
65
def fragment_centroid(frag: ET.Element) -> Optional[Tuple[float, float]]:
    """Return the midpoint of the box given by :func:`fragment_bbox`.

    The result is ``(cx, cy)``; *None* is returned when
    :func:`fragment_bbox` cannot produce a box for *frag*.
    """
    box = fragment_bbox(frag)
    if box is not None:
        left, top, right, bottom = box
        return (left + right) / 2.0, (top + bottom) / 2.0
    return None
74
+
75
+
76
def fragment_bottom_has_hanging_label(frag: ET.Element) -> bool:
    """Report whether a bottom atom carries a label that hangs below it.

    ChemDraw stacks the implicit H underneath N (``Element="7"``) or
    P (``Element="15"``) when that atom sits at the bottom of the
    fragment and has at most two explicit bonds, so the label reaches
    below the atom coordinate.

    An atom counts as "bottom" when its y is within 1.0 of the largest
    y among the fragment's direct ``<n>`` children. Bonds are counted
    from the fragment's direct ``<b>`` children whose ``B`` or ``E``
    equals the atom's ``id``.

    Returns *True* when extra gap is needed below this fragment.
    """
    hanging_elements = {"7", "15"}

    # Collect (node, y) for every direct-child atom with a position.
    placed = []
    for atom in frag.findall("n"):
        tokens = (atom.get("p") or "").split()
        if len(tokens) >= 2:
            placed.append((atom, float(tokens[1])))
    if not placed:
        return False

    lowest_y = max(y for _, y in placed)

    def _explicit_bonds(atom_id: str) -> int:
        # Number of direct-child bonds touching ``atom_id`` at either end.
        return sum(
            1
            for bond in frag.findall("b")
            if bond.get("B") == atom_id or bond.get("E") == atom_id
        )

    return any(
        not (y < lowest_y - 1.0)
        and atom.get("Element", "") in hanging_elements
        and _explicit_bonds(atom.get("id", "")) <= 2
        for atom, y in placed
    )
117
+
118
+
119
def fragment_bbox_with_label_extension(
    frag: ET.Element,
) -> Optional[Tuple[float, float, float, float]]:
    """Bounding box from atoms, enlarged downward for hanging labels.

    Starts from :func:`fragment_bbox`. When
    :func:`fragment_bottom_has_hanging_label` reports a hanging label,
    ``max_y`` is increased by :data:`constants.LAYOUT_HANGING_LABEL_GAP`
    (described in this module as 16 pt) so that N-H / P-H labels drawn
    below the atom coordinate are covered.

    Returns ``(min_x, min_y, max_x, max_y)`` or *None* when no base
    box is available.
    """
    base = fragment_bbox(frag)
    if base is None:
        return None
    left, top, right, bottom = base
    if not fragment_bottom_has_hanging_label(frag):
        return (left, top, right, bottom)
    return (left, top, right, bottom + LAYOUT_HANGING_LABEL_GAP)
138
+
139
+
140
+ # ---------------------------------------------------------------------------
141
+ # Text geometry
142
+ # ---------------------------------------------------------------------------
143
+
144
def recompute_text_bbox(
    t_elem: ET.Element,
    *,
    char_width: float = 5.8,
    line_height: float = 12.0,
    descender: float = 3.0,
) -> None:
    """Recompute and set ``BoundingBox`` on a ``<t>`` element in place.

    The box is estimated from the ``p`` attribute (anchor position) and
    the concatenated text of all ``<s>`` descendants. Multi-line text
    (split on ``"\\n"``) and Left/Center/Right justification are handled.
    The element is left untouched when ``p`` is missing or has fewer
    than two values.

    Args:
        t_elem: The ``<t>`` element to update.
        char_width: Estimated width of one character in points. The
            default of 5.8 corresponds to Arial 10 pt.
        line_height: Height of one text line in points.
        descender: Allowance below the last line's baseline in points.

    Raises:
        ValueError: If ``p`` contains non-numeric text.
    """
    anchor = t_elem.get("p", "")
    if not anchor:
        return
    coords = [float(v) for v in anchor.split()]
    if len(coords) < 2:
        return
    px, py = coords[0], coords[1]

    text_content = "".join(s.text or "" for s in t_elem.iter("s"))
    # ``str.split`` always yields at least one item, even for "".
    lines = text_content.split("\n")
    width = max(len(line) for line in lines) * char_width

    # CaptionJustification takes precedence over Justification.
    just = t_elem.get(
        "CaptionJustification", t_elem.get("Justification", "Left")
    )
    if just == "Center":
        x1 = px - width / 2.0
        x2 = px + width / 2.0
    elif just == "Right":
        x1 = px - width
        x2 = px
    else:  # Left, and any other value
        x1 = px
        x2 = px + width

    # ``py`` is treated as the first line's baseline.
    y1 = py - line_height  # ascender above baseline
    y2 = py + (len(lines) - 1) * line_height + descender  # below last line

    t_elem.set("BoundingBox", f"{x1:.2f} {y1:.2f} {x2:.2f} {y2:.2f}")
187
+
188
+
189
+ # ---------------------------------------------------------------------------
190
+ # ID map
191
+ # ---------------------------------------------------------------------------
192
+
193
def build_id_map(parent: ET.Element) -> Dict[str, ET.Element]:
    """Map every non-empty ``id`` attribute in a subtree to its element.

    The walk uses ``parent.iter()``, so it is **recursive**: *parent*
    itself and elements at any depth below it are included. When the
    same ``id`` occurs more than once, the element visited last in
    document order wins.

    Note: ``reaction_cleanup._build_id_map()`` is intentionally
    **shallow** (direct children only via ``for el in page``).
    The two are NOT interchangeable.
    """
    return {
        node.get("id"): node
        for node in parent.iter()
        if node.get("id", "")
    }
210
+
211
+
212
+ # ---------------------------------------------------------------------------
213
+ # Arrow geometry
214
+ # ---------------------------------------------------------------------------
215
+
216
def arrow_endpoints(
    arrow: ET.Element,
) -> Tuple[float, float, float, float]:
    """Return ``(tail_x, tail_y, head_x, head_y)`` from an arrow element.

    Sources are tried in order:

    1. ``Head3D`` and ``Tail3D``, when both hold at least two values
       (any third component is ignored).
    2. ``BoundingBox``, when it holds at least four values. The arrow
       is then taken to run left-to-right along the box's vertical
       midline.
    3. The default ``(450, 250, 550, 250)``.

    An attribute that is missing *or too short* is skipped in favour
    of the next source instead of raising :class:`IndexError`.

    Raises:
        ValueError: If a value that is used contains non-numeric text.
    """
    head = arrow.get("Head3D", "").split()
    tail = arrow.get("Tail3D", "").split()
    if len(head) >= 2 and len(tail) >= 2:
        return float(tail[0]), float(tail[1]), float(head[0]), float(head[1])

    # Fallback: BoundingBox
    box = arrow.get("BoundingBox", "").split()
    if len(box) >= 4:
        left, top, right, bottom = (float(v) for v in box[:4])
        mid_y = (top + bottom) / 2
        return left, mid_y, right, mid_y

    return 450.0, 250.0, 550.0, 250.0
237
+
238
+
239
+ # ---------------------------------------------------------------------------
240
+ # CDXML IO
241
+ # ---------------------------------------------------------------------------
242
+
243
def parse_cdxml(path: str) -> ET.ElementTree:
    """Load the CDXML file at *path*.

    Thin wrapper over :func:`xml.etree.ElementTree.parse`; the result
    is an :class:`~xml.etree.ElementTree.ElementTree`.
    """
    tree = ET.parse(path)
    return tree
246
+
247
+
248
def write_cdxml(tree: ET.ElementTree, path: str) -> None:
    """Write *tree* to *path*, re-inserting the DOCTYPE declaration.

    ``ElementTree.write()`` drops the DOCTYPE. The document is therefore
    serialized into memory first, patched (DOCTYPE added after the XML
    declaration, ``ns0`` namespace prefixes stripped), and only then
    written to disk in a single pass. *path* is not touched if
    serialization fails, and no intermediate DOCTYPE-less file exists
    on disk.

    Args:
        tree: The document to serialize.
        path: Destination file path; it is overwritten if it exists.
    """
    import io  # function-scope import: only this function needs it

    buffer = io.BytesIO()
    tree.write(buffer, xml_declaration=True, encoding="UTF-8")
    content = buffer.getvalue().decode("utf-8")

    if "<!DOCTYPE" not in content:
        # Insert right after the XML declaration (first "?>" only).
        content = content.replace(
            "?>",
            '?>\n<!DOCTYPE CDXML SYSTEM "http://www.cambridgesoft.com/xml/cdxml.dtd" >',
            1,
        )

    # Strip namespace prefixes that ElementTree may inject.
    # NOTE(review): this is a plain substring replacement, so a literal
    # "ns0:" or ":ns0" inside text or attribute values is removed too.
    content = content.replace("ns0:", "").replace(":ns0", "")

    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
272
+
273
+
274
+ # ---------------------------------------------------------------------------
275
+ # Self-test
276
+ # ---------------------------------------------------------------------------
277
+
278
if __name__ == "__main__":
    # Smoke checks for the helpers above; run this module as a script.
    print("cdxml_utils self-test\n" + "=" * 40)

    # --- fragment with 3 atoms and 2 bonds; the N atom is not lowest ---
    frag = ET.fromstring(
        """\
<fragment id="100">
<n id="1" p="100 200" Element="6" />
<n id="2" p="120 180" Element="7" NumHydrogens="1" />
<n id="3" p="140 200" />
<b id="10" B="1" E="2" />
<b id="11" B="2" E="3" />
</fragment>"""
    )

    # fragment_bbox: extent of the three atom positions
    bbox = fragment_bbox(frag)
    assert bbox is not None, "bbox should not be None"
    assert bbox == (100.0, 180.0, 140.0, 200.0), f"unexpected bbox: {bbox}"
    print(f" fragment_bbox: {bbox} OK")

    # fragment_centroid: midpoint of that box
    c = fragment_centroid(frag)
    assert c is not None
    assert c == (120.0, 190.0), f"unexpected centroid: {c}"
    print(f" fragment_centroid: {c} OK")

    # hanging label: atom 2 (N, Element=7) sits at y=180, above y=200
    assert not fragment_bottom_has_hanging_label(frag), "should be False"
    print(" hanging_label (no): OK")

    # --- fragment where the N atom IS the lowest atom ---
    frag_hang = ET.fromstring(
        """\
<fragment id="200">
<n id="1" p="100 180" />
<n id="2" p="120 200" Element="7" />
<b id="10" B="1" E="2" />
</fragment>"""
    )
    assert fragment_bottom_has_hanging_label(frag_hang), "should be True"
    print(" hanging_label (yes): OK")

    # --- fragment without any atoms ---
    frag_empty = ET.fromstring('<fragment id="300" />')
    assert fragment_bbox(frag_empty) is None, "empty frag should return None"
    assert fragment_centroid(frag_empty) is None
    print(" empty fragment: OK")

    # --- recompute_text_bbox sets a four-number BoundingBox ---
    t_el = ET.fromstring('<t id="50" p="100 200"><s>Hello</s></t>')
    recompute_text_bbox(t_el)
    bb = t_el.get("BoundingBox")
    assert bb is not None, "BoundingBox should be set"
    vals = [float(v) for v in bb.split()]
    assert len(vals) == 4
    print(f" recompute_text_bbox: {bb} OK")

    # --- build_id_map reaches nested elements ---
    page = ET.fromstring('<page><n id="1" /><n id="2"><n id="3" /></n></page>')
    m = build_id_map(page)
    assert "1" in m and "2" in m and "3" in m, f"missing ids: {set(m)}"
    print(f" build_id_map: {len(m)} entries OK")

    print("\nAll tests passed.")
@@ -0,0 +1,5 @@
1
+ """ChemDraw — ChemDraw-specific integrations (COM, ChemScript).
2
+
3
+ Everything that requires ChemDraw to be installed: CDX/CDXML conversion,
4
+ image rendering, ChemScript .NET bridge, ELN CDX cleanup.
5
+ """