cdxml-toolkit 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdxml_toolkit/__init__.py +18 -0
- cdxml_toolkit/_jre/__init__.py +2 -0
- cdxml_toolkit/_jre/temurin-21-jre-win-x64.zip +0 -0
- cdxml_toolkit/analysis/__init__.py +35 -0
- cdxml_toolkit/analysis/deterministic/__init__.py +12 -0
- cdxml_toolkit/analysis/deterministic/discover_experiment_files.py +413 -0
- cdxml_toolkit/analysis/deterministic/lab_book_formatter.py +701 -0
- cdxml_toolkit/analysis/deterministic/lcms_file_categorizer.py +928 -0
- cdxml_toolkit/analysis/deterministic/lcms_identifier.py +598 -0
- cdxml_toolkit/analysis/deterministic/mass_resolver.py +654 -0
- cdxml_toolkit/analysis/deterministic/multi_lcms_analyzer.py +1412 -0
- cdxml_toolkit/analysis/deterministic/procedure_writer.py +446 -0
- cdxml_toolkit/analysis/extract_nmr.py +47 -0
- cdxml_toolkit/analysis/format_procedure_entry.py +479 -0
- cdxml_toolkit/analysis/lcms_analyzer.py +1299 -0
- cdxml_toolkit/analysis/parse_analysis_file.py +134 -0
- cdxml_toolkit/cdxml_builder.py +920 -0
- cdxml_toolkit/cdxml_utils.py +342 -0
- cdxml_toolkit/chemdraw/__init__.py +5 -0
- cdxml_toolkit/chemdraw/_chemscript_server.py +562 -0
- cdxml_toolkit/chemdraw/cdx_converter.py +527 -0
- cdxml_toolkit/chemdraw/cdxml_to_image.py +262 -0
- cdxml_toolkit/chemdraw/cdxml_to_image_rdkit.py +296 -0
- cdxml_toolkit/chemdraw/chemscript_bridge.py +901 -0
- cdxml_toolkit/constants.py +304 -0
- cdxml_toolkit/coord_normalizer.py +438 -0
- cdxml_toolkit/deterministic_pipeline/__init__.py +6 -0
- cdxml_toolkit/deterministic_pipeline/legacy/__init__.py +5 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_cdx_cleanup.py +509 -0
- cdxml_toolkit/deterministic_pipeline/legacy/eln_enrichment.py +1394 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_aligner.py +428 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher.py +1337 -0
- cdxml_toolkit/deterministic_pipeline/legacy/scheme_polisher_v2.py +1340 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_audit.py +931 -0
- cdxml_toolkit/deterministic_pipeline/scheme_reader_verify.py +1160 -0
- cdxml_toolkit/image/__init__.py +15 -0
- cdxml_toolkit/image/reaction_from_image.py +2103 -0
- cdxml_toolkit/image/structure_from_image.py +1711 -0
- cdxml_toolkit/layout/__init__.py +5 -0
- cdxml_toolkit/layout/alignment.py +1642 -0
- cdxml_toolkit/layout/reaction_cleanup.py +1002 -0
- cdxml_toolkit/layout/scheme_merger.py +2260 -0
- cdxml_toolkit/mcp_server/__init__.py +0 -0
- cdxml_toolkit/mcp_server/__main__.py +5 -0
- cdxml_toolkit/mcp_server/server.py +1567 -0
- cdxml_toolkit/naming/__init__.py +6 -0
- cdxml_toolkit/naming/aligned_namer.py +2342 -0
- cdxml_toolkit/naming/mol_builder.py +3722 -0
- cdxml_toolkit/naming/name_decomposer.py +2843 -0
- cdxml_toolkit/naming/reactions_datamol.json +2414 -0
- cdxml_toolkit/office/__init__.py +5 -0
- cdxml_toolkit/office/doc_from_template.py +722 -0
- cdxml_toolkit/office/ole_embedder.py +808 -0
- cdxml_toolkit/office/ole_extractor.py +272 -0
- cdxml_toolkit/perception/__init__.py +10 -0
- cdxml_toolkit/perception/compound_search.py +229 -0
- cdxml_toolkit/perception/eln_csv_parser.py +240 -0
- cdxml_toolkit/perception/rdf_parser.py +664 -0
- cdxml_toolkit/perception/reactant_heuristic.py +1045 -0
- cdxml_toolkit/perception/reaction_parser.py +2150 -0
- cdxml_toolkit/perception/scheme_reader.py +2948 -0
- cdxml_toolkit/perception/scheme_refine.py +1404 -0
- cdxml_toolkit/perception/scheme_segmenter.py +619 -0
- cdxml_toolkit/perception/spatial_assignment.py +1013 -0
- cdxml_toolkit/rdkit_utils.py +605 -0
- cdxml_toolkit/render/__init__.py +17 -0
- cdxml_toolkit/render/auto_layout.py +229 -0
- cdxml_toolkit/render/compact_parser.py +632 -0
- cdxml_toolkit/render/parser.py +706 -0
- cdxml_toolkit/render/render_scheme.py +267 -0
- cdxml_toolkit/render/renderer.py +2387 -0
- cdxml_toolkit/render/schema.py +90 -0
- cdxml_toolkit/render/scheme_maker.py +1043 -0
- cdxml_toolkit/render/scheme_yaml_writer.py +1487 -0
- cdxml_toolkit/resolve/__init__.py +13 -0
- cdxml_toolkit/resolve/cas_resolver.py +430 -0
- cdxml_toolkit/resolve/chemscanner_abbreviations.json +28813 -0
- cdxml_toolkit/resolve/condensed_formula.py +493 -0
- cdxml_toolkit/resolve/jre_manager.py +195 -0
- cdxml_toolkit/resolve/reagent_abbreviations.json +1046 -0
- cdxml_toolkit/resolve/reagent_db.py +285 -0
- cdxml_toolkit/resolve/superatom_data.json +2856 -0
- cdxml_toolkit/resolve/superatom_table.py +146 -0
- cdxml_toolkit/text_formatting.py +298 -0
- cdxml_toolkit-0.5.0.dist-info/METADATA +318 -0
- cdxml_toolkit-0.5.0.dist-info/RECORD +91 -0
- cdxml_toolkit-0.5.0.dist-info/WHEEL +5 -0
- cdxml_toolkit-0.5.0.dist-info/entry_points.txt +17 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/LICENSE +21 -0
- cdxml_toolkit-0.5.0.dist-info/licenses/NOTICE.md +37 -0
- cdxml_toolkit-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
"""Shared CDXML geometry and IO utilities.
|
|
2
|
+
|
|
3
|
+
Extracted from duplicated code across reaction_cleanup.py,
|
|
4
|
+
eln_enrichment.py, and scheme_polisher.py for v0.3 consolidation.
|
|
5
|
+
|
|
6
|
+
Key design decisions (from CLAUDE.md / reaction_cleanup FINDINGS):
|
|
7
|
+
- Fragment bounding boxes use direct-child <n> atom "p" positions ONLY.
|
|
8
|
+
XML BoundingBox attributes are unreliable, especially for
|
|
9
|
+
NodeType="Fragment" abbreviation groups (OTs, Boc) which report
|
|
10
|
+
the expanded inner structure, not the visible abbreviation.
|
|
11
|
+
- Hanging label detection: when N or P is the bottommost atom with
|
|
12
|
+
<=2 explicit bonds, ChemDraw renders H as a vertical stack below
|
|
13
|
+
the atom symbol, requiring extra layout gap (16pt vs 8pt).
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import xml.etree.ElementTree as ET
|
|
19
|
+
from typing import Dict, Optional, Tuple
|
|
20
|
+
|
|
21
|
+
from .constants import LAYOUT_HANGING_LABEL_GAP
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
# Fragment geometry
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
def fragment_bbox(
    frag: ET.Element,
) -> Optional[Tuple[float, float, float, float]]:
    """Compute the bounding box of a ``<fragment>`` from its atom positions.

    Only ``<n>`` elements that are direct children of *frag* contribute;
    nested nodes are ignored.  Each contributing atom must carry a ``p``
    attribute with at least two whitespace-separated numbers (x then y).

    When no direct-child atom yields a position, the fragment's own
    ``BoundingBox`` attribute is used instead, provided it holds at least
    four numbers.

    Returns ``(min_x, min_y, max_x, max_y)``, or *None* when neither atom
    positions nor a usable ``BoundingBox`` are available.
    """
    points = []
    for atom in frag.findall("n"):  # findall("n") matches direct children only
        tokens = (atom.get("p") or "").split()
        if len(tokens) < 2:
            continue
        points.append((float(tokens[0]), float(tokens[1])))

    if points:
        xs, ys = zip(*points)
        return min(xs), min(ys), max(xs), max(ys)

    # No atom coordinates: fall back to the stored XML BoundingBox.
    raw_box = frag.get("BoundingBox", "")
    if not raw_box:
        return None
    box = [float(tok) for tok in raw_box.split()]
    if len(box) < 4:
        return None
    return box[0], box[1], box[2], box[3]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def fragment_centroid(frag: ET.Element) -> Optional[Tuple[float, float]]:
    """Return the midpoint of the box produced by :func:`fragment_bbox`.

    The result is ``(cx, cy)``; *None* is returned when
    :func:`fragment_bbox` itself returns *None*.
    """
    box = fragment_bbox(frag)
    if box is not None:
        left, top, right, bottom = box
        return (left + right) / 2.0, (top + bottom) / 2.0
    return None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def fragment_bottom_has_hanging_label(frag: ET.Element) -> bool:
    """Report whether a bottom atom of *frag* carries a label hanging below it.

    Per this module's design notes, ChemDraw draws the implicit H of an
    N (``Element="7"``) or P (``Element="15"``) atom as a vertical stack
    under the symbol when that atom is at the bottom of the fragment and
    has two or fewer explicit bonds, so the label extends below the atom
    coordinate.

    An atom counts as "bottom" when its y value is within 1.0 of the
    largest y among the direct-child ``<n>`` atoms.  Bonds are the
    direct-child ``<b>`` elements whose ``B`` or ``E`` attribute equals
    the atom's ``id``.

    Returns *True* when at least one such atom exists, i.e. when extra
    gap is needed below the fragment; *False* otherwise (including when
    the fragment has no positioned atoms).
    """
    hanging_elements = {"7", "15"}

    # Collect (atom, y) for every direct-child atom with a usable position.
    positioned = []
    for atom in frag.findall("n"):
        tokens = (atom.get("p") or "").split()
        if len(tokens) >= 2:
            positioned.append((atom, float(tokens[1])))

    if not positioned:
        return False

    lowest_y = max(y for _, y in positioned)
    bonds = frag.findall("b")

    for atom, y in positioned:
        # Skip atoms that are not at the bottom or are not N / P.
        if y < lowest_y - 1.0 or atom.get("Element", "") not in hanging_elements:
            continue

        atom_id = atom.get("id", "")
        degree = sum(
            1
            for bond in bonds
            if bond.get("B") == atom_id or bond.get("E") == atom_id
        )
        if degree <= 2:
            return True

    return False
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def fragment_bbox_with_label_extension(
    frag: ET.Element,
) -> Optional[Tuple[float, float, float, float]]:
    """Bounding box from :func:`fragment_bbox`, padded for a hanging label.

    The base box comes from :func:`fragment_bbox`.  If
    :func:`fragment_bottom_has_hanging_label` reports a hanging N-H / P-H
    label, ``max_y`` is increased by
    :data:`constants.LAYOUT_HANGING_LABEL_GAP` (described in this module's
    notes as 16 pt) so the box covers the label drawn below the atom
    coordinate.

    Returns ``(min_x, min_y, max_x, max_y)``, or *None* when no base box
    can be computed.
    """
    base = fragment_bbox(frag)
    if base is None:
        return None

    left, top, right, bottom = base
    if not fragment_bottom_has_hanging_label(frag):
        return (left, top, right, bottom)
    return (left, top, right, bottom + LAYOUT_HANGING_LABEL_GAP)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# ---------------------------------------------------------------------------
|
|
141
|
+
# Text geometry
|
|
142
|
+
# ---------------------------------------------------------------------------
|
|
143
|
+
|
|
144
|
+
def recompute_text_bbox(t_elem: ET.Element) -> None:
    """Estimate and store the ``BoundingBox`` attribute of a ``<t>`` element.

    The box is derived from the element's ``p`` anchor and the
    concatenated text of all descendant ``<s>`` runs, using fixed
    metrics: 5.8 pt per character (Arial 10 pt), 12 pt per line, and a
    3 pt allowance below the last line.

    Horizontal placement follows ``CaptionJustification`` (or, if
    absent, ``Justification``; default ``"Left"``):

    * ``"Center"`` - box is centered on the anchor x.
    * ``"Right"``  - box ends at the anchor x.
    * anything else - box starts at the anchor x.

    The element is modified in place.  Nothing is written when ``p`` is
    missing, empty, or has fewer than two numbers.
    """
    char_w = 5.8   # estimated glyph advance
    line_h = 12.0  # line pitch; also used as the ascent above the anchor
    descent = 3.0  # allowance below the last line

    anchor = t_elem.get("p", "")
    if not anchor:
        return
    coords = [float(v) for v in anchor.split()]
    if len(coords) < 2:
        return
    px, py = coords[0], coords[1]

    text_content = "".join(s.text or "" for s in t_elem.iter("s"))
    # str.split always yields at least one item, so empty text is one
    # zero-length line.
    lines = text_content.split("\n")
    width = max(len(line) for line in lines) * char_w

    just = t_elem.get(
        "CaptionJustification", t_elem.get("Justification", "Left")
    )
    if just == "Center":
        x1 = px - width / 2.0
        x2 = px + width / 2.0
    elif just == "Right":
        x1 = px - width
        x2 = px
    else:  # Left (default)
        x1 = px
        x2 = px + width

    y1 = py - line_h  # one line height above the anchor y
    y2 = py + (len(lines) - 1) * line_h + descent  # below the last line

    t_elem.set("BoundingBox", f"{x1:.2f} {y1:.2f} {x2:.2f} {y2:.2f}")
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# ---------------------------------------------------------------------------
|
|
190
|
+
# ID map
|
|
191
|
+
# ---------------------------------------------------------------------------
|
|
192
|
+
|
|
193
|
+
def build_id_map(parent: ET.Element) -> Dict[str, ET.Element]:
    """Map every non-empty ``id`` attribute under *parent* to its element.

    The walk uses ``parent.iter()``, so it is **recursive**: *parent*
    itself and descendants at any depth are included.  When the same id
    occurs more than once, the element visited last wins.

    Note: ``reaction_cleanup._build_id_map()`` is described as
    intentionally **shallow** (direct children only), so the two are NOT
    interchangeable.
    """
    tagged = ((node.get("id", ""), node) for node in parent.iter())
    return {ident: node for ident, node in tagged if ident}
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
# ---------------------------------------------------------------------------
|
|
213
|
+
# Arrow geometry
|
|
214
|
+
# ---------------------------------------------------------------------------
|
|
215
|
+
|
|
216
|
+
def arrow_endpoints(
    arrow: ET.Element,
) -> Tuple[float, float, float, float]:
    """Return ``(tail_x, tail_y, head_x, head_y)`` for an arrow element.

    Sources are tried in order:

    1. ``Tail3D`` / ``Head3D`` - used when both are present and each has
       at least two numbers (x, y; any further components are ignored).
    2. ``BoundingBox`` - used when it has at least four numbers.  The
       arrow is then treated as horizontal: the tail is the first x, the
       head is the third x, and both share the vertical midpoint.
    3. A fixed default of ``(450, 250, 550, 250)``.

    An attribute with too few components is treated as unavailable and
    the next source is tried, mirroring the length guards used by
    :func:`fragment_bbox`.  Non-numeric components still raise
    ``ValueError``.
    """
    head = arrow.get("Head3D", "")
    tail = arrow.get("Tail3D", "")
    if head and tail:
        hp = [float(v) for v in head.split()]
        tp = [float(v) for v in tail.split()]
        # Guard against truncated values instead of raising IndexError.
        if len(hp) >= 2 and len(tp) >= 2:
            return tp[0], tp[1], hp[0], hp[1]

    # Fallback: BoundingBox
    bb = arrow.get("BoundingBox", "")
    if bb:
        vals = [float(v) for v in bb.split()]
        if len(vals) >= 4:
            mid_y = (vals[1] + vals[3]) / 2
            return vals[0], mid_y, vals[2], mid_y

    return 450.0, 250.0, 550.0, 250.0
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
# ---------------------------------------------------------------------------
|
|
240
|
+
# CDXML IO
|
|
241
|
+
# ---------------------------------------------------------------------------
|
|
242
|
+
|
|
243
|
+
def parse_cdxml(path: str) -> ET.ElementTree:
    """Load the CDXML file at *path* with :func:`xml.etree.ElementTree.parse`.

    Returns the resulting :class:`~xml.etree.ElementTree.ElementTree`;
    any exception raised by the parser propagates unchanged.
    """
    document = ET.parse(path)
    return document
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def write_cdxml(tree: ET.ElementTree, path: str) -> None:
    """Write *tree* to *path* as UTF-8, re-inserting the DOCTYPE declaration.

    ``ElementTree.write()`` drops the DOCTYPE, so the serialized XML is
    patched before it is saved:

    * if no ``<!DOCTYPE`` is present, the CDXML DOCTYPE is inserted right
      after the XML declaration;
    * ``ns0:`` / ``:ns0`` namespace prefixes that ElementTree may inject
      are stripped.

    The document is serialized into memory first and written to disk in
    a single pass.  A serialization error is therefore raised before
    *path* is opened, leaving any existing file untouched.
    """
    import io  # local import: only this function needs it

    buffer = io.BytesIO()
    tree.write(buffer, xml_declaration=True, encoding="UTF-8")
    content = buffer.getvalue().decode("utf-8")

    if "<!DOCTYPE" not in content:
        # The first "?>" closes the XML declaration emitted above.
        content = content.replace(
            "?>",
            '?>\n<!DOCTYPE CDXML SYSTEM "http://www.cambridgesoft.com/xml/cdxml.dtd" >',
            1,
        )

    # Strip namespace prefixes that ElementTree may inject.
    # NOTE(review): this is a plain textual replacement over the whole
    # document, so literal "ns0:" inside text or attribute values is
    # removed as well.
    content = content.replace("ns0:", "").replace(":ns0", "")

    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
# ---------------------------------------------------------------------------
|
|
275
|
+
# Self-test
|
|
276
|
+
# ---------------------------------------------------------------------------
|
|
277
|
+
|
|
278
|
+
if __name__ == "__main__":
    # Smoke test for the helpers in this module; runs only when the file
    # is executed as a script.
    print("cdxml_utils self-test\n" + "=" * 40)

    # Three atoms joined by two bonds.  The nitrogen (id 2, y=180) sits
    # above the other two atoms (y=200).
    basic_frag = ET.fromstring(
        """\
<fragment id="100">
<n id="1" p="100 200" Element="6" />
<n id="2" p="120 180" Element="7" NumHydrogens="1" />
<n id="3" p="140 200" />
<b id="10" B="1" E="2" />
<b id="11" B="2" E="3" />
</fragment>"""
    )

    # fragment_bbox
    box = fragment_bbox(basic_frag)
    assert box is not None, "bbox should not be None"
    assert box == (100.0, 180.0, 140.0, 200.0), f"unexpected bbox: {box}"
    print(f" fragment_bbox: {box} OK")

    # fragment_centroid
    center = fragment_centroid(basic_frag)
    assert center is not None
    assert center == (120.0, 190.0), f"unexpected centroid: {center}"
    print(f" fragment_centroid: {center} OK")

    # Hanging label: the nitrogen is not the bottommost atom here.
    assert not fragment_bottom_has_hanging_label(basic_frag), "should be False"
    print(" hanging_label (no): OK")

    # Hanging label: now the nitrogen is the bottommost atom.
    hanging_frag = ET.fromstring(
        """\
<fragment id="200">
<n id="1" p="100 180" />
<n id="2" p="120 200" Element="7" />
<b id="10" B="1" E="2" />
</fragment>"""
    )
    assert fragment_bottom_has_hanging_label(hanging_frag), "should be True"
    print(" hanging_label (yes): OK")

    # A fragment without atoms yields no bbox and no centroid.
    empty_frag = ET.fromstring('<fragment id="300" />')
    assert fragment_bbox(empty_frag) is None, "empty frag should return None"
    assert fragment_centroid(empty_frag) is None
    print(" empty fragment: OK")

    # recompute_text_bbox sets a four-number BoundingBox attribute.
    text_el = ET.fromstring('<t id="50" p="100 200"><s>Hello</s></t>')
    recompute_text_bbox(text_el)
    text_box = text_el.get("BoundingBox")
    assert text_box is not None, "BoundingBox should be set"
    assert len([float(v) for v in text_box.split()]) == 4
    print(f" recompute_text_bbox: {text_box} OK")

    # build_id_map reaches nested elements.
    page_el = ET.fromstring('<page><n id="1" /><n id="2"><n id="3" /></n></page>')
    id_map = build_id_map(page_el)
    assert {"1", "2", "3"} <= set(id_map), f"missing ids: {set(id_map)}"
    print(f" build_id_map: {len(id_map)} entries OK")

    print("\nAll tests passed.")
|