python-hwpx 1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,726 @@
1
+ """High-level routines for traversing text inside HWPX documents."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Dict, Iterator, Mapping, Optional, Sequence, Tuple, Union, Literal
9
+ from xml.etree import ElementTree as ET
10
+ from zipfile import ZipFile
11
+
12
# Names exported by ``from <module> import *``.
#
# ``tag_matches`` is listed next to the other general XML helpers defined in
# this module (``build_parent_map``, ``describe_element_path`` and
# ``strip_namespace``) so that the helper set exposed to importers is complete.
__all__ = [
    "DEFAULT_NAMESPACES",
    "AnnotationOptions",
    "ParagraphInfo",
    "SectionInfo",
    "TextExtractor",
    "build_parent_map",
    "describe_element_path",
    "strip_namespace",
    "tag_matches",
]
22
+
23
# Prefix -> namespace URI table used to resolve prefixed queries such as
# ``hp:p`` or ``opf:item``.  ``TextExtractor`` copies this mapping and lets
# callers override or extend individual prefixes.
DEFAULT_NAMESPACES: Dict[str, str] = {
    # Paragraph-level vocabularies.
    "hp": "http://www.hancom.co.kr/hwpml/2011/paragraph",
    "hp10": "http://www.hancom.co.kr/hwpml/2016/paragraph",
    # Remaining hwpml/2011 vocabularies.
    "hs": "http://www.hancom.co.kr/hwpml/2011/section",
    "hc": "http://www.hancom.co.kr/hwpml/2011/core",
    "ha": "http://www.hancom.co.kr/hwpml/2011/app",
    "hh": "http://www.hancom.co.kr/hwpml/2011/head",
    "hhs": "http://www.hancom.co.kr/hwpml/2011/history",
    "hm": "http://www.hancom.co.kr/hwpml/2011/master-page",
    # Package manifest and metadata vocabularies.
    "hpf": "http://www.hancom.co.kr/schema/2011/hpf",
    "dc": "http://purl.org/dc/elements/1.1/",
    "opf": "http://www.idpf.org/2007/opf/",
}
36
+
37
+ _SECTION_PATTERN = re.compile(r"^Contents/section(\d+)\.xml$")
38
+
39
+ _OBJECT_CONTAINERS = {
40
+ "tbl",
41
+ "container",
42
+ "line",
43
+ "rect",
44
+ "ellipse",
45
+ "arc",
46
+ "polygon",
47
+ "curve",
48
+ "connectLine",
49
+ "textart",
50
+ "pic",
51
+ "compose",
52
+ "switch",
53
+ "equation",
54
+ "ole",
55
+ "edit",
56
+ "btn",
57
+ "checkBtn",
58
+ "radioBtn",
59
+ }
60
+
61
+ _ObjectBehavior = Union[str, None]
62
+
63
+ HighlightBehavior = Literal["ignore", "markers"]
64
+ NoteBehavior = Literal["ignore", "placeholder", "inline"]
65
+ HyperlinkBehavior = Literal["ignore", "placeholder", "target"]
66
+ ControlBehavior = Literal["ignore", "placeholder", "nested"]
67
+
68
+
69
@dataclass(frozen=True)
class AnnotationOptions:
    """Immutable settings controlling how inline annotations are rendered.

    Each ``*Behavior`` field selects a rendering mode; the accompanying string
    fields are ``str.format`` templates (or joiners) used by that mode.  The
    ``*_summary`` templates are not referenced by the extractor in this module.
    """

    # Highlight (marker pen) ranges.  With "markers" the start/end templates
    # are emitted around the highlighted text; both receive ``color``.
    highlight: HighlightBehavior = "ignore"
    highlight_start: str = "[HIGHLIGHT color={color}]"
    highlight_end: str = "[/HIGHLIGHT]"
    highlight_summary: str = "color={color}"

    # Footnotes and endnotes.  "placeholder" formats ``note_placeholder`` with
    # ``kind`` and ``inst_id``; "inline" formats ``note_inline_format`` with
    # ``kind``, ``inst_id`` and ``text`` (note paragraphs joined by
    # ``note_joiner``).
    footnote: NoteBehavior = "ignore"
    endnote: NoteBehavior = "ignore"
    note_inline_format: str = "[{kind}:{text}]"
    note_placeholder: str = "[{kind}:{inst_id}]"
    note_summary: str = "{kind}:{inst_id}"
    note_joiner: str = " "

    # Hyperlink fields.  Both templates receive ``target``.
    hyperlink: HyperlinkBehavior = "ignore"
    hyperlink_target_format: str = "<{target}>"
    hyperlink_placeholder: str = "[LINK:{target}]"
    hyperlink_summary: str = "{target}"

    # Other controls.  "placeholder" formats ``control_placeholder`` with
    # ``name`` and ``type``; "nested" joins inner paragraph text with
    # ``control_joiner``.
    control: ControlBehavior = "ignore"
    control_placeholder: str = "[CTRL:{name}]"
    control_summary: str = "{name}"
    control_joiner: str = "\n"
94
+
95
+
96
@dataclass(frozen=True)
class SectionInfo:
    """Immutable record describing one section XML entry of an HWPX archive."""

    # Zero-based position of the section in the order the section entries
    # were enumerated by the extractor.
    index: int

    # Name (path) of the section XML entry inside the archive.
    name: str

    # Root element obtained by parsing the section XML entry.
    element: ET.Element
108
+
109
+
110
@dataclass(frozen=True)
class ParagraphInfo:
    """Immutable record for one paragraph found inside a section.

    ``path`` is the slash-separated location of the paragraph element and
    ``hierarchy`` holds the same path split into its segments.  ``_extractor``
    is the :class:`TextExtractor` that produced the record and is used to
    render the paragraph text.
    """

    section: SectionInfo
    index: int
    element: ET.Element
    path: str
    hierarchy: Tuple[str, ...]
    _extractor: "TextExtractor"

    @property
    def tag(self) -> str:
        """Local (namespace-free) tag name of the paragraph element."""

        return strip_namespace(self.element.tag)

    @property
    def ancestors(self) -> Tuple[str, ...]:
        """Path segments above the paragraph, i.e. ``hierarchy`` minus its last entry."""

        return self.hierarchy[:-1]

    @property
    def is_nested(self) -> bool:
        """``True`` when more than one path segment sits above the paragraph."""

        # ``ancestors`` drops exactly the last segment, so "more than one
        # ancestor" means "more than two segments in total".
        return len(self.hierarchy) > 2

    def text(
        self,
        *,
        object_behavior: _ObjectBehavior = "skip",
        object_placeholder: Optional[str] = None,
        preserve_breaks: bool = True,
        annotations: Optional[AnnotationOptions] = None,
    ) -> str:
        """Render this paragraph by delegating to the owning extractor.

        All keyword arguments are forwarded unchanged to
        :meth:`TextExtractor.paragraph_text`.
        """

        render = self._extractor.paragraph_text
        return render(
            self.element,
            object_behavior=object_behavior,
            object_placeholder=object_placeholder,
            preserve_breaks=preserve_breaks,
            annotations=annotations,
        )

    def __str__(self) -> str:  # pragma: no cover - convenience only
        return self.text()
159
+
160
+
161
class TextExtractor:
    """High level helper that walks through sections and paragraphs.

    The extractor lazily opens the archive given as *source*, parses the
    section XML entries and renders paragraph elements as plain text.  It can
    be used as a context manager; on exit the archive is closed only when the
    extractor opened it itself.
    """

    def __init__(
        self,
        source: Union[str, Path, ZipFile],
        *,
        namespaces: Optional[Mapping[str, str]] = None,
    ) -> None:
        """Remember *source* and build the namespace table.

        Args:
            source: Path of the archive, or an already opened ``ZipFile``.
            namespaces: Optional prefix -> URI entries that override or
                extend ``DEFAULT_NAMESPACES``.
        """

        self._source = source
        self._zip: Optional[ZipFile] = None
        self._owns_zip = False
        merged_namespaces = dict(DEFAULT_NAMESPACES)
        if namespaces:
            merged_namespaces.update(namespaces)
        self.namespaces: Dict[str, str] = merged_namespaces

    # ------------------------------------------------------------------
    # Context manager helpers
    # ------------------------------------------------------------------
    def open(self) -> ZipFile:
        """Open the underlying archive if necessary and return it."""

        if self._zip is None:
            if isinstance(self._source, ZipFile):
                # Caller-provided archive: use it, but never close it.
                self._zip = self._source
                self._owns_zip = False
            else:
                self._zip = ZipFile(self._source)  # type: ignore[arg-type]
                self._owns_zip = True
        return self._zip

    def close(self) -> None:
        """Close the archive when owned by the extractor."""

        if self._zip is not None and self._owns_zip:
            self._zip.close()
            self._zip = None
            self._owns_zip = False

    def __enter__(self) -> "TextExtractor":  # pragma: no cover - trivial
        self.open()
        return self

    def __exit__(self, *_exc: object) -> None:  # pragma: no cover - trivial
        self.close()

    # ------------------------------------------------------------------
    # Iteration helpers
    # ------------------------------------------------------------------
    def iter_sections(self) -> Iterator[SectionInfo]:
        """Yield :class:`SectionInfo` objects for each section XML entry."""

        archive = self.open()
        section_files = list(self._iter_section_files(archive))
        for index, name in enumerate(section_files):
            data = archive.read(name)
            element = ET.fromstring(data)
            yield SectionInfo(index=index, name=name, element=element)

    def iter_paragraphs(
        self,
        section: SectionInfo,
        *,
        include_nested: bool = True,
    ) -> Iterator[ParagraphInfo]:
        """Yield paragraphs contained inside *section* in document order.

        Args:
            section: Section whose root element is searched.
            include_nested: When true every ``hp:p`` descendant is yielded;
                otherwise only ``hp:p`` elements that are direct children of
                the section root.
        """

        root = section.element
        parent_map = build_parent_map(root)
        if include_nested:
            paragraph_elements = list(root.findall(".//hp:p", namespaces=self.namespaces))
        else:
            paragraph_elements = [
                child
                for child in root
                if tag_matches(child.tag, "hp:p", self.namespaces)
            ]

        for index, element in enumerate(paragraph_elements):
            path = describe_element_path(element, parent_map)
            hierarchy = tuple(path.split("/"))
            yield ParagraphInfo(
                section=section,
                index=index,
                element=element,
                path=path,
                hierarchy=hierarchy,
                _extractor=self,
            )

    def iter_document_paragraphs(
        self,
        *,
        include_nested: bool = True,
    ) -> Iterator[ParagraphInfo]:
        """Yield every paragraph across all sections."""

        for section in self.iter_sections():
            yield from self.iter_paragraphs(section, include_nested=include_nested)

    # ------------------------------------------------------------------
    # Text helpers
    # ------------------------------------------------------------------
    def paragraph_text(
        self,
        paragraph: ET.Element,
        *,
        object_behavior: _ObjectBehavior = "skip",
        object_placeholder: Optional[str] = None,
        preserve_breaks: bool = True,
        annotations: Optional[AnnotationOptions] = None,
    ) -> str:
        """Return a string representation of a paragraph element.

        Only the direct ``hp:run`` children of *paragraph* are visited; each
        child of a run is dispatched on its local tag name.

        Args:
            paragraph: The paragraph element to render.
            object_behavior: ``"skip"``/``None``, ``"placeholder"`` or
                ``"nested"``; controls how embedded objects are rendered.
            object_placeholder: ``str.format`` template (field ``type``) used
                in placeholder mode; defaults to ``"[{type}]"``.
            preserve_breaks: Emit ``"\\n"`` for line breaks and ``"\\t"`` for
                tabs; when false line breaks are dropped and tabs become a
                single space.
            annotations: Rendering options for highlights, notes, hyperlinks
                and controls; ``None`` disables all of them.

        Raises:
            ValueError: If *object_behavior* is not a supported value and a
                known object container is encountered.
        """

        fragments: list[str] = []
        for run in paragraph.findall("hp:run", namespaces=self.namespaces):
            for child in run:
                tag = strip_namespace(child.tag)
                if tag == "t":
                    self._render_text_element(child, fragments, annotations)
                elif tag == "lineBreak":
                    if preserve_breaks:
                        fragments.append("\n")
                elif tag == "tab":
                    fragments.append("\t" if preserve_breaks else " ")
                elif tag in {"footNote", "endNote"}:
                    self._handle_note(
                        child,
                        fragments,
                        tag,
                        annotations=annotations,
                        preserve_breaks=preserve_breaks,
                    )
                elif tag == "ctrl":
                    self._handle_control(
                        child,
                        fragments,
                        annotations=annotations,
                        preserve_breaks=preserve_breaks,
                    )
                elif tag in _OBJECT_CONTAINERS:
                    self._handle_object(
                        child,
                        fragments,
                        behavior=object_behavior,
                        placeholder=object_placeholder,
                        preserve_breaks=preserve_breaks,
                        annotations=annotations,
                    )
                else:
                    self._handle_unexpected(
                        child,
                        fragments,
                        behavior=object_behavior,
                        placeholder=object_placeholder,
                        preserve_breaks=preserve_breaks,
                        annotations=annotations,
                    )
        return "".join(fragments)

    def _nested_paragraph_text(
        self,
        element: ET.Element,
        *,
        behavior: _ObjectBehavior,
        placeholder: Optional[str],
        preserve_breaks: bool,
        annotations: Optional[AnnotationOptions],
    ) -> str:
        """Render all ``hp:p`` descendants of *element* into one string.

        Empty paragraphs are dropped; the remaining texts are separated by
        ``"\\n"`` when *preserve_breaks* is true and concatenated otherwise.
        The result is built from a private list so the caller's fragment list
        is never inspected or trimmed.
        """

        # NOTE(review): ``.//hp:p`` also returns paragraphs that live inside
        # objects nested in these paragraphs; in "nested" mode those are
        # rendered again by the recursive call below, so deeply nested text
        # may appear twice.  Confirm whether that is intended.
        texts: list[str] = []
        for inner_paragraph in element.findall(".//hp:p", namespaces=self.namespaces):
            text = self.paragraph_text(
                inner_paragraph,
                object_behavior=behavior,
                object_placeholder=placeholder,
                preserve_breaks=preserve_breaks,
                annotations=annotations,
            )
            if text:
                texts.append(text)
        separator = "\n" if preserve_breaks else ""
        return separator.join(texts)

    def _handle_object(
        self,
        element: ET.Element,
        fragments: list[str],
        *,
        behavior: _ObjectBehavior,
        placeholder: Optional[str],
        preserve_breaks: bool,
        annotations: Optional[AnnotationOptions],
    ) -> None:
        """Append the rendering of a known object container to *fragments*.

        Raises:
            ValueError: If *behavior* is not ``"skip"``, ``None``,
                ``"placeholder"`` or ``"nested"``.
        """

        if behavior == "skip" or behavior is None:
            return
        if behavior == "placeholder":
            placeholder = placeholder or "[{type}]"
            fragments.append(placeholder.format(type=strip_namespace(element.tag)))
            return
        if behavior == "nested":
            # Only text produced by the object itself is appended; fragments
            # that were already present (for example a preceding line break)
            # are left untouched even when the object yields no text.
            text = self._nested_paragraph_text(
                element,
                behavior=behavior,
                placeholder=placeholder,
                preserve_breaks=preserve_breaks,
                annotations=annotations,
            )
            if text:
                fragments.append(text)
            return
        raise ValueError(f"Unsupported object behavior: {behavior!r}")

    def _handle_unexpected(
        self,
        element: ET.Element,
        fragments: list[str],
        *,
        behavior: _ObjectBehavior,
        placeholder: Optional[str],
        preserve_breaks: bool,
        annotations: Optional[AnnotationOptions],
    ) -> None:
        """Append the rendering of an unrecognised run child to *fragments*.

        ``ctrl`` elements are forwarded to the control handler.  Otherwise the
        element is rendered like an object in placeholder or nested mode and
        silently ignored for every other *behavior* value.
        """

        tag = strip_namespace(element.tag)
        if tag == "ctrl":
            self._handle_control(
                element,
                fragments,
                annotations=annotations,
                preserve_breaks=preserve_breaks,
            )
            return
        if behavior == "placeholder":
            placeholder = placeholder or "[{type}]"
            fragments.append(placeholder.format(type=tag))
        elif behavior == "nested":
            # Attempt to gather nested paragraph text for unknown containers
            # without touching fragments added before this element.
            text = self._nested_paragraph_text(
                element,
                behavior=behavior,
                placeholder=placeholder,
                preserve_breaks=preserve_breaks,
                annotations=annotations,
            )
            if text:
                fragments.append(text)
        # Default: ignore the element silently.

    def _render_text_element(
        self,
        element: ET.Element,
        fragments: list[str],
        annotations: Optional[AnnotationOptions],
    ) -> None:
        """Append the text of *element* and of its children to *fragments*.

        ``markpenBegin``/``markpenEnd`` children delimit highlighted ranges;
        in "markers" mode the configured start/end templates are emitted for
        them.  Any other child is rendered recursively.  The tail text of
        every child is appended as well.
        """

        if element.text:
            fragments.append(element.text)

        # Colours of the highlight ranges that are currently open.
        highlight_stack: list[Optional[str]] = []
        highlight_mode = annotations.highlight if annotations else "ignore"

        for child in element:
            tag = strip_namespace(child.tag)
            if tag == "markpenBegin":
                color = child.get("color") or ""
                highlight_stack.append(color)
                if annotations and highlight_mode == "markers":
                    fragments.append(
                        annotations.highlight_start.format(color=color or "")
                    )
            elif tag == "markpenEnd":
                color = highlight_stack.pop() if highlight_stack else ""
                if annotations and highlight_mode == "markers":
                    fragments.append(
                        annotations.highlight_end.format(color=color or "")
                    )
            else:
                self._render_text_element(child, fragments, annotations)

            if child.tail:
                fragments.append(child.tail)

        # Close ranges that were opened but never terminated in this element.
        while highlight_stack:
            color = highlight_stack.pop()
            if annotations and highlight_mode == "markers":
                fragments.append(annotations.highlight_end.format(color=color or ""))

    def _handle_note(
        self,
        element: ET.Element,
        fragments: list[str],
        kind: str,
        *,
        annotations: Optional[AnnotationOptions],
        preserve_breaks: bool,
    ) -> None:
        """Append a footnote/endnote rendering according to *annotations*.

        *kind* is the local tag name (``"footNote"`` or anything else, which
        is treated as an endnote).  Nothing is appended when *annotations* is
        ``None`` or the selected option is ``"ignore"``.
        """

        if annotations is None:
            return
        option = annotations.footnote if kind == "footNote" else annotations.endnote
        if option == "ignore":
            return

        kind_name = "footnote" if kind == "footNote" else "endnote"
        inst_id = element.get("instId") or ""

        if option == "placeholder":
            fragments.append(
                annotations.note_placeholder.format(kind=kind_name, inst_id=inst_id)
            )
            return

        if option == "inline":
            note_text = _resolve_note_text(
                self,
                element,
                annotations,
                preserve_breaks=preserve_breaks,
            )
            fragments.append(
                annotations.note_inline_format.format(
                    kind=kind_name, inst_id=inst_id, text=note_text
                )
            )

    def _handle_control(
        self,
        element: ET.Element,
        fragments: list[str],
        *,
        annotations: Optional[AnnotationOptions],
        preserve_breaks: bool,
    ) -> None:
        """Append the rendering of a ``ctrl`` element to *fragments*.

        Hyperlink field starts are delegated to the hyperlink handler, field
        ends produce no output, and every other control follows
        ``annotations.control``.  Nothing happens when *annotations* is
        ``None``.
        """

        if annotations is None:
            return

        field_begin = element.find("hp:fieldBegin", namespaces=self.namespaces)
        if field_begin is not None:
            field_type = field_begin.get("type") or ""
            if field_type == "HYPERLINK":
                self._handle_hyperlink(field_begin, fragments, annotations)
                return

        if element.find("hp:fieldEnd", namespaces=self.namespaces) is not None:
            return

        behavior = annotations.control
        if behavior == "ignore":
            return
        if behavior == "nested":
            text = _resolve_control_nested_text(
                self,
                element,
                annotations,
                preserve_breaks=preserve_breaks,
            )
            if text:
                fragments.append(text)
            return
        if behavior == "placeholder":
            # The control is named after its first child element, falling
            # back to "ctrl" for an empty control.
            first_child = next(iter(element), None)
            name = strip_namespace(first_child.tag) if first_child is not None else "ctrl"
            ctrl_type = (
                first_child.get("type") if first_child is not None else element.get("type")
            )
            fragments.append(
                annotations.control_placeholder.format(name=name, type=ctrl_type or "")
            )

    def _handle_hyperlink(
        self,
        field_begin: ET.Element,
        fragments: list[str],
        annotations: AnnotationOptions,
    ) -> None:
        """Append the rendering of a hyperlink field start to *fragments*.

        In "placeholder" mode the placeholder template is always emitted
        (with an empty target when none was found); in "target" mode output
        is produced only for a non-empty target.
        """

        behavior = annotations.hyperlink
        target = _resolve_hyperlink_target(field_begin, self.namespaces)
        if behavior == "placeholder":
            fragments.append(
                annotations.hyperlink_placeholder.format(target=target or "")
            )
        elif behavior == "target":
            if target:
                fragments.append(
                    annotations.hyperlink_target_format.format(target=target)
                )

    def extract_text(
        self,
        *,
        paragraph_separator: str = "\n",
        skip_empty: bool = True,
        include_nested: bool = True,
        object_behavior: _ObjectBehavior = "skip",
        object_placeholder: Optional[str] = None,
        preserve_breaks: bool = True,
        annotations: Optional[AnnotationOptions] = None,
    ) -> str:
        """Return the plain text for all paragraphs in the document.

        Args:
            paragraph_separator: String placed between paragraph texts.
            skip_empty: Drop paragraphs whose text is empty or whitespace.
            include_nested: Forwarded to :meth:`iter_document_paragraphs`.
            object_behavior: Forwarded to :meth:`paragraph_text`.
            object_placeholder: Forwarded to :meth:`paragraph_text`.
            preserve_breaks: Forwarded to :meth:`paragraph_text`.
            annotations: Forwarded to :meth:`paragraph_text`.
        """

        texts: list[str] = []
        for paragraph in self.iter_document_paragraphs(include_nested=include_nested):
            text = paragraph.text(
                object_behavior=object_behavior,
                object_placeholder=object_placeholder,
                preserve_breaks=preserve_breaks,
                annotations=annotations,
            )
            if skip_empty and not text.strip():
                continue
            texts.append(text)
        return paragraph_separator.join(texts)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _iter_section_files(self, archive: ZipFile) -> Iterator[str]:
        """Return an iterator over the section entry names of *archive*.

        Section entries listed in ``Contents/content.hpf`` are used, in
        manifest order, when there are any.  Otherwise the archive listing is
        scanned and the matching names are sorted by section number.
        """

        try:
            manifest = archive.read("Contents/content.hpf")
        except KeyError:
            manifest = None

        if manifest:
            root = ET.fromstring(manifest)
            items = []
            for item in root.findall(".//opf:item", namespaces=self.namespaces):
                href = item.get("href")
                if href and _SECTION_PATTERN.match(href):
                    items.append(href)
            if items:
                return iter(items)

        section_files = [
            name
            for name in archive.namelist()
            if _SECTION_PATTERN.match(name)
        ]
        section_files.sort(key=_section_sort_key)
        return iter(section_files)
588
+
589
+
590
+ # ----------------------------------------------------------------------
591
+ # General XML helpers shared with the object finder
592
+ # ----------------------------------------------------------------------
593
+
594
+
595
+ def _resolve_note_text(
596
+ extractor: "TextExtractor",
597
+ element: ET.Element,
598
+ annotations: Optional[AnnotationOptions],
599
+ *,
600
+ preserve_breaks: bool,
601
+ ) -> str:
602
+ sub_list = element.find("hp:subList", namespaces=extractor.namespaces)
603
+ if sub_list is None:
604
+ return ""
605
+
606
+ texts: list[str] = []
607
+ for inner_paragraph in sub_list.findall(".//hp:p", namespaces=extractor.namespaces):
608
+ text = extractor.paragraph_text(
609
+ inner_paragraph,
610
+ object_behavior="skip",
611
+ object_placeholder=None,
612
+ preserve_breaks=preserve_breaks,
613
+ annotations=annotations,
614
+ )
615
+ if text:
616
+ texts.append(text)
617
+
618
+ joiner = annotations.note_joiner if annotations else " "
619
+ return joiner.join(texts)
620
+
621
+
622
+ def _resolve_control_nested_text(
623
+ extractor: "TextExtractor",
624
+ element: ET.Element,
625
+ annotations: Optional[AnnotationOptions],
626
+ *,
627
+ preserve_breaks: bool,
628
+ ) -> str:
629
+ texts: list[str] = []
630
+ for inner_paragraph in element.findall(".//hp:p", namespaces=extractor.namespaces):
631
+ text = extractor.paragraph_text(
632
+ inner_paragraph,
633
+ object_behavior="skip",
634
+ object_placeholder=None,
635
+ preserve_breaks=preserve_breaks,
636
+ annotations=annotations,
637
+ )
638
+ if text:
639
+ texts.append(text)
640
+
641
+ if not texts:
642
+ return ""
643
+ joiner = annotations.control_joiner if annotations else "\n"
644
+ return joiner.join(texts)
645
+
646
+
647
+ def _resolve_hyperlink_target(
648
+ field_begin: ET.Element,
649
+ namespaces: Mapping[str, str],
650
+ ) -> Optional[str]:
651
+ params = field_begin.find("hp:parameters", namespaces=namespaces)
652
+ if params is None:
653
+ return None
654
+
655
+ for string_param in params.findall("hp:stringParam", namespaces=namespaces):
656
+ if string_param.get("name") == "Command":
657
+ value = string_param.text or ""
658
+ if "|" in value:
659
+ return value.split("|", 1)[0]
660
+ return value
661
+ return None
662
+
663
+
664
def strip_namespace(tag: str) -> str:
    """Return *tag* without a leading ``{namespace}`` qualifier.

    Everything after the first ``}`` is returned; a tag without ``}`` is
    returned unchanged.
    """

    _, brace, local = tag.partition("}")
    return local if brace else tag
670
+
671
+
672
def tag_matches(candidate: str, query: Union[str, Sequence[str]], namespaces: Mapping[str, str]) -> bool:
    """Tell whether the element tag *candidate* satisfies *query*.

    A string query is interpreted by its form:

    * ``{uri}local`` must equal *candidate* exactly;
    * ``prefix:local`` is expanded through *namespaces* and compared exactly
      (an unknown prefix never matches);
    * a bare name is compared with the local part of *candidate*.

    A non-string sequence matches when any of its items matches.

    Raises:
        TypeError: If *query* is neither a string nor a sequence.
    """

    if isinstance(query, str):
        if query.startswith("{"):
            return candidate == query
        prefix, colon, local = query.partition(":")
        if not colon:
            return strip_namespace(candidate) == query
        namespace = namespaces.get(prefix)
        return namespace is not None and candidate == f"{{{namespace}}}{local}"

    if isinstance(query, Sequence):
        return any(tag_matches(candidate, item, namespaces) for item in query)

    raise TypeError("query must be a string or sequence of strings")
690
+
691
+
692
def build_parent_map(root: ET.Element) -> Dict[ET.Element, ET.Element]:
    """Map every element below *root* to its parent element.

    *root* itself has no entry because it has no parent inside the tree.
    """

    parents: Dict[ET.Element, ET.Element] = {}
    for node in root.iter():
        for child in node:
            parents[child] = node
    return parents
696
+
697
+
698
def describe_element_path(
    element: ET.Element,
    parent_map: Mapping[ET.Element, ET.Element],
) -> str:
    """Return a slash-separated, XPath-like location for *element*.

    The path is built from local tag names, walking upwards through
    *parent_map* until an element without a recorded parent is reached.  A
    segment gets a zero-based ``[index]`` suffix only when the parent has
    more than one child with the same local name.
    """

    segments: list[str] = []
    node: Optional[ET.Element] = element
    while node is not None:
        name = strip_namespace(node.tag)
        parent = parent_map.get(node)
        if parent is None:
            # Top of the known tree: emit the plain name and stop.
            segments.append(name)
            break
        same_named = [sibling for sibling in parent if strip_namespace(sibling.tag) == name]
        if len(same_named) > 1:
            segments.append(f"{name}[{same_named.index(node)}]")
        else:
            segments.append(name)
        node = parent
    segments.reverse()
    return "/".join(segments)
720
+
721
+
722
def _section_sort_key(name: str) -> Tuple[int, str]:
    """Return a sort key ordering section entries by their number, then name.

    Names that do not match the section pattern are keyed with number ``0``.
    """

    found = _SECTION_PATTERN.match(name)
    number = int(found.group(1)) if found else 0
    return (number, name)