regen.mde 0.2.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/LICENSE +16 -16
  2. package/README.md +409 -295
  3. package/bin/build-corpus-editor.js +83 -81
  4. package/bin/build-corpus.js +41 -41
  5. package/bin/postinstall.js +259 -187
  6. package/bin/regen-mdeditor-install.js +27 -27
  7. package/bin/regen-mdeditor-uninstall.js +19 -19
  8. package/bin/validate-katex.js +93 -93
  9. package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +493 -270
  10. package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -22
  11. package/desktop/BuildCorpusEditor/EditorForm.cs +853 -540
  12. package/desktop/BuildCorpusEditor/Program.cs +85 -81
  13. package/desktop/BuildCorpusEditor/app.manifest +16 -16
  14. package/dist/release/regen-mde-0.8.0-win-x64.zip +0 -0
  15. package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
  16. package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
  17. package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
  18. package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +1 -1
  19. package/dist/windows-editor/wwwroot/assets/index-C_VxJk4k.js +375 -0
  20. package/dist/windows-editor/wwwroot/assets/index-Wt9zSjIw.css +1 -0
  21. package/dist/windows-editor/wwwroot/index.html +22 -22
  22. package/editor-web/index.html +21 -21
  23. package/editor-web/src/main.jsx +1044 -399
  24. package/editor-web/src/styles.css +846 -602
  25. package/editor-web/vite.config.js +13 -13
  26. package/examples/build-corpus.config.example.json +21 -21
  27. package/installer/install-regen-mde.ps1 +214 -175
  28. package/installer/regen-mde.nsi +81 -81
  29. package/package.json +10 -6
  30. package/pyproject.toml +4 -3
  31. package/requirements.txt +5 -4
  32. package/scripts/build-windows-editor.ps1 +47 -47
  33. package/scripts/package-windows-editor.ps1 +90 -90
  34. package/scripts/release-dual.mjs +105 -0
  35. package/scripts/run-corpus.ps1 +28 -28
  36. package/scripts/run-editor-implementation-plane.ps1 +226 -203
  37. package/scripts/run-required-tests.ps1 +98 -98
  38. package/scripts/run-smoke.ps1 +28 -28
  39. package/src/build_corpus/__init__.py +1 -1
  40. package/src/build_corpus/docx_exporter.py +1055 -798
  41. package/src/build_corpus/equations.py +1345 -0
  42. package/src/build_corpus/exporter.py +1488 -1195
  43. package/src/build_corpus/frontmatter.py +302 -0
  44. package/src/build_corpus/ppt_exporter.py +543 -532
  45. package/src/build_corpus/templates/__init__.py +1 -1
  46. package/src/build_corpus/validate_assets.py +46 -46
  47. package/tools/audit_corpus.py +203 -203
  48. package/tools/collect_microsoft_word_templates.py +228 -228
  49. package/tools/collect_online_docx_corpus.py +272 -272
  50. package/tools/collect_online_pptx_corpus.py +252 -252
  51. package/tools/compare_pptx_inputs_outputs.py +87 -87
  52. package/tools/roundtrip_docx_corpus.py +171 -171
  53. package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
  54. package/dist/release/regen.mde-0.2.2-win-x64.zip +0 -0
  55. package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +0 -326
  56. package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +0 -1
@@ -0,0 +1,1345 @@
1
+ """LaTeX -> OMML (Office Math) conversion for Markdown -> Word export.
2
+
3
+ Word renders equations from OMML (`<m:oMath>`), not from raw text. The previous
4
+ exporter only set the font to "Cambria Math" and emitted the LaTeX source as
5
+ literal characters, so anything with real commands (\\sum, \\Delta, \\rightarrow,
6
+ \\leq) showed up as raw text. This module performs a genuine conversion:
7
+
8
+ LaTeX --(latex2mathml)--> presentation MathML --(owned converter)--> OMML
9
+
10
+ and returns a parsed `<m:oMath>` / `<m:oMathPara>` element ready to splice into a
11
+ python-docx paragraph.
12
+
13
+ OMML is REQUIRED, not optional. If the conversion backends are not importable
14
+ this raises loudly, so a missing dependency can never silently strip OMML from
15
+ the whole document. A *single* equation that fails to convert returns ``None``
16
+ so that one fragment falls back to text while the rest of the document keeps
17
+ real OMML.
18
+
19
+ This converter is owned (pure stdlib + latex2mathml); it does NOT use
20
+ ``mathml2omml``. The 113-equation conformance suite in
21
+ ``tests/test_omml_suite.py`` is the structural spec the emitted OMML matches.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from html import escape
27
+ from xml.etree import ElementTree as ET
28
+
29
+ from docx.oxml import parse_xml
30
+ from docx.oxml.ns import nsdecls
31
+
32
+ _MATH_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math"
33
+ _MATHML_NS = "http://www.w3.org/1998/Math/MathML"
34
+
35
+ # N-ary operator characters. Big operators (∑∏⋃⋂…) and integrals (∫∮…) all
36
+ # default in this corpus to limits placed beside the glyph (limLoc=subSup),
37
+ # matching the conformance fixture.
38
+ _NARY_CHARS = {
39
+ "∑", # ∑ summation
40
+ "∏", # ∏ product
41
+ "∐", # ∐ coproduct
42
+ "∫", # ∫ integral
43
+ "∬", # ∬ double integral
44
+ "∭", # ∭ triple integral
45
+ "⨌", # ⨌ quadruple integral
46
+ "∮", # ∮ contour integral
47
+ "∯", # ∯ surface integral
48
+ "∰", # ∰ volume integral
49
+ "⋃", # ⋃ n-ary union
50
+ "⋂", # ⋂ n-ary intersection
51
+ "⋁", # ⋁ n-ary logical or
52
+ "⋀", # ⋀ n-ary logical and
53
+ "⨁", # ⨁ n-ary circled plus
54
+ "⨂", # ⨂ n-ary circled times
55
+ "⨀", # ⨀ n-ary circled dot
56
+ }
57
+
58
+ # Fence/delimiter open->close pairs recognised on bare mrow content.
59
+ _FENCE_PAIRS = {
60
+ "(": ")",
61
+ "[": "]",
62
+ "{": "}",
63
+ "⟨": "⟩", # ⟨ ⟩ angle brackets
64
+ "⌈": "⌉", # ⌈ ⌉ ceiling
65
+ "⌊": "⌋", # ⌊ ⌋ floor
66
+ "|": "|",
67
+ "‖": "‖", # ‖ double vertical bar
68
+ }
69
+ _FENCE_OPENERS = set(_FENCE_PAIRS)
70
+ _FENCE_CLOSERS = set(_FENCE_PAIRS.values())
71
+
72
+ # Accent over/under glyphs (both spacing and combining forms latex2mathml may
73
+ # emit) mapped to the COMBINING mark Word stores on <m:acc><m:chr>.
74
+ _ACCENT_TO_COMBINING = {
75
+ "^": "̂", # circumflex / hat
76
+ "̂": "̂",
77
+ "~": "̃", # tilde
78
+ "̃": "̃",
79
+ "˜": "̃", # small tilde
80
+ "→": "⃗", # vector arrow
81
+ "⃗": "⃗",
82
+ "˙": "̇", # dot above
83
+ "̇": "̇",
84
+ "¨": "̈", # diaeresis / ddot
85
+ "̈": "̈",
86
+ "ˇ": "̌", # caron / check
87
+ "̌": "̌",
88
+ "˘": "̆", # breve
89
+ "̆": "̆",
90
+ "´": "́", # acute
91
+ "́": "́",
92
+ "`": "̀", # grave
93
+ "̀": "̀",
94
+ "´": "́", # acute accent variant
95
+ }
96
+ # Overline / bar (top) glyphs → <m:bar pos="top">.
97
+ _BAR_TOP_CHARS = {
98
+ "¯", # ¯ macron (spacing)
99
+ "̄", # ̄ combining macron
100
+ "̅", # ̅ combining overline
101
+ "‾", # ‾ overline (spacing)
102
+ }
103
+ # Underline glyph (―) is shared by overline; munder vs mover disambiguates.
104
+ _OVERLINE_CHARS = _BAR_TOP_CHARS | {"―"}
105
+ _BAR_BOT_CHARS = {
106
+ "̲", # ̲ combining low line
107
+ "_", # _ low line (spacing)
108
+ "―", # ― horizontal bar (latex2mathml \underline, under context)
109
+ }
110
+
111
+ # Group-character braces.
112
+ _GROUPCHR_TOP = {"⏞"} # over-brace
113
+ _GROUPCHR_BOT = {"⏟"} # under-brace
114
+
115
+ # Arrow glyphs (base of a decorated arrow like \xrightarrow).
116
+ _ARROW_CHARS = {"→", "←", "↔", "⇒", "⇐", "⇔", "↦", "⟶", "⟵", "⟷"}
117
+
118
+ # Known function names latex2mathml renders as a bare <mi>/<mo>. A function
119
+ # application wraps the name in <m:func> with the argument as <m:e>.
120
+ _FUNCTION_NAMES = {
121
+ "sin", "cos", "tan", "cot", "sec", "csc",
122
+ "sinh", "cosh", "tanh", "coth",
123
+ "arcsin", "arccos", "arctan",
124
+ "arsinh", "arcosh", "artanh",
125
+ "log", "ln", "lg", "exp",
126
+ "det", "dim", "hom", "ker", "deg", "gcd", "arg",
127
+ }
128
+ # Operator-name <mo> tokens that take an UNDER limit (limLow) when scripted.
129
+ _LIMIT_OP_NAMES = {"lim", "lim sup", "lim inf", "max", "min", "sup", "inf",
130
+ "limsup", "liminf", "argmax", "argmin"}
131
+
132
+ # Bold-italic / bold math-alphanumeric Unicode blocks → (ascii, style). We map
133
+ # Mathematical Bold and Bold-Italic letters/digits back to their ASCII base and
134
+ # carry the bold style as <m:sty m:val="b">. Other math-alphanumeric variants
135
+ # (blackboard, script/calligraphic) are intentionally LEFT as-is — the fixture
136
+ # keeps ℝ, 𝒩, 𝒳 verbatim.
137
def _bold_unmap(ch: str) -> "str | None":
    """Map a Mathematical Bold / Bold-Italic alphanumeric to its ASCII base.

    Args:
        ch: A single character.

    Returns:
        The ASCII letter or digit that ``ch`` is a bold (or bold-italic)
        rendering of, or ``None`` when ``ch`` is not in one of the handled
        ranges (plain text, italic, blackboard, script, … are left alone).
    """
    cp = ord(ch)
    # (first code point, last code point, ASCII character the range starts at)
    ranges = (
        (0x1D400, 0x1D419, "A"),  # Mathematical Bold Capital A-Z
        (0x1D41A, 0x1D433, "a"),  # Mathematical Bold Small a-z
        (0x1D468, 0x1D481, "A"),  # Mathematical Bold Italic Capital A-Z
        (0x1D482, 0x1D49B, "a"),  # Mathematical Bold Italic Small a-z
        (0x1D7CE, 0x1D7D7, "0"),  # Mathematical Bold Digit 0-9
    )
    for first, last, base in ranges:
        if first <= cp <= last:
            return chr(ord(base) + cp - first)
    return None
149
+
150
+
151
def _local_name(node: ET.Element) -> str:
    """Return the element's tag with any ``{namespace}`` prefix removed."""
    tag = node.tag
    if not tag.startswith("{"):
        return tag
    # Namespaced tags look like "{uri}local"; keep what follows the last "}".
    return tag.rsplit("}", 1)[1]
155
+
156
+
157
def _node_text(node: ET.Element) -> str:
    """Return every text fragment yielded by ``node.itertext()``, joined."""
    fragments = list(node.itertext())
    return "".join(fragments)
159
+
160
+
161
+ _SCRIPT_TAGS = {"msub", "msup", "msubsup", "munder", "mover", "munderover"}
162
+
163
+
164
+ # ---------------------------------------------------------------------------
165
+ # Run model — a small intermediate representation so adjacent simple text
166
+ # runs coalesce into ONE <m:r><m:t>…</m:t></m:r>, matching the fixture.
167
+ # ---------------------------------------------------------------------------
168
+
169
class _Run:
    """Text run that may be merged with a neighbouring run of equal style.

    ``text`` is the run's characters; ``style`` is ``'b'``, ``'p'`` or
    ``None`` when the run carries no explicit style.
    """

    __slots__ = "text", "style"

    def __init__(self, text: str, style: "str | None" = None) -> None:
        self.text, self.style = text, style
177
+
178
+
179
class _Raw:
    """Pre-serialized OMML fragment; it is never merged with adjacent parts.

    ``xml`` holds the fragment's markup exactly as it will be written out.
    """

    __slots__ = ("xml",)

    def __init__(self, xml: str) -> None:
        # Stored as given; no escaping or validation happens here.
        self.xml = xml
186
+
187
+
188
def _run_xml(text: str, style: "str | None") -> str:
    """Serialize one ``<m:r>`` run.

    ``text`` is escaped before being placed in ``<m:t>``. A truthy ``style``
    adds an ``<m:rPr><m:sty …/></m:rPr>`` header in front of the text element.
    """
    body = f"<m:t>{escape(text)}</m:t>"
    props = f'<m:rPr><m:sty m:val="{style}"/></m:rPr>' if style else ""
    return f"<m:r>{props}{body}</m:r>"
195
+
196
+
197
def _emit(parts: "list") -> str:
    """Serialize ``parts`` to OMML, merging neighbouring same-style runs.

    Consecutive ``_Run`` items whose ``style`` values are equal become a
    single ``<m:r><m:t>…</m:t></m:r>``. Any item that is not a ``_Run`` is
    treated as a ``_Raw`` fragment: it closes the run group being built and
    its ``xml`` is appended verbatim.
    """
    chunks: list[str] = []
    pending: list[str] = []  # texts of the run group currently being built
    pending_style: "str | None" = None

    def close_group() -> None:
        # A group holding only an empty string still counts as open.
        if pending:
            chunks.append(_run_xml("".join(pending), pending_style))
            pending.clear()

    for part in parts:
        if not isinstance(part, _Run):
            close_group()
            chunks.append(part.xml)
            continue
        if pending and part.style != pending_style:
            close_group()
        pending.append(part.text)
        pending_style = part.style

    close_group()
    return "".join(chunks)
227
+
228
+
229
+ # ---------------------------------------------------------------------------
230
+ # Token-level conversion: MathML node -> list of _Run/_Raw parts.
231
+ # ---------------------------------------------------------------------------
232
+
233
def _text_token_parts(node: ET.Element) -> "list":
    """Turn a leaf token (mi/mn/mo/mtext) into a one-element list of ``_Run``.

    The run is styled ``'b'`` when ``mathvariant`` is ``bold`` or
    ``bold-italic``, or when any character was a bold math-alphanumeric that
    ``_bold_unmap`` translated back to ASCII. Otherwise the style is ``None``.
    """
    source = "".join(node.itertext())

    # Per-character unmap of bold math-alphanumerics (what \mathbf produces).
    unmapped = [_bold_unmap(ch) for ch in source]
    has_bold_glyph = any(hit is not None for hit in unmapped)
    plain = "".join(
        ch if hit is None else hit for ch, hit in zip(source, unmapped)
    )

    text = _normalize_spaces(plain).translate(_GLYPH_ALIASES)

    # NB: ``mathvariant="normal"`` (\mathrm) is deliberately NOT styled upright
    # here. Upright runs are only marked in specific positions handled
    # elsewhere; styling every \mathrm run would over-apply <m:sty p>.
    variant_is_bold = node.attrib.get("mathvariant", "") in ("bold", "bold-italic")
    style = "b" if (variant_is_bold or has_bold_glyph) else None
    return [_Run(text, style)]
260
+
261
+
262
+ # Operator glyph aliases: latex2mathml renders some relations with an ASCII
263
+ # code point; the fixture uses the proper Unicode math symbol.
264
+ _GLYPH_ALIASES = {
265
+ ord("~"): "∼", # \sim → U+223C
266
+ }
267
+
268
+
269
+ # Unicode spaces latex2mathml emits inside operator names / \text → normalise to
270
+ # a regular ASCII space so "lim sup" compares as "lim sup", etc.
271
+ _SPACE_CHARS = "\xa0             "
272
+ _SPACE_TABLE = {ord(c): " " for c in _SPACE_CHARS}
273
+
274
+
275
def _normalize_spaces(text: str) -> str:
    """Return ``text`` with every character in ``_SPACE_TABLE`` replaced by
    its mapped value (a regular ASCII space)."""
    normalized = text.translate(_SPACE_TABLE)
    return normalized
277
+
278
+
279
def _node_parts(node: ET.Element) -> "list":
    """Convert a single MathML node to a list of _Run/_Raw parts.

    Dispatches on the node's local tag name. Container tags recurse through
    ``_sequence_parts``; leaf tokens become a coalescable ``_Run``; structural
    tags become a single ``_Raw`` holding the serialized OMML. Unknown tags
    fall back to their children, or to their text when they have none.
    """
    name = _local_name(node)
    # Transparent containers: only their children matter.
    if name in {"math", "mrow", "semantics", "mstyle", "mpadded"}:
        return _sequence_parts(list(node))
    if name in {"mi", "mn", "mo", "mtext"}:
        return _text_token_parts(node)
    if name == "mspace":
        # Math spacing commands (\, \; \! \quad …) carry no run content in the
        # fixture — drop them. (Row-break newlines are consumed upstream.)
        return []
    if name == "none":
        return []
    if name == "mphantom":
        inner = _emit(_sequence_parts(list(node)))
        return [_Raw(f"<m:phant><m:phantPr/><m:e>{inner}</m:e></m:phant>")]
    # The next three converters are defined outside this section of the file.
    if name == "mfenced":
        return [_Raw(_mfenced_to_omml(node))]
    if name == "mmultiscripts":
        return [_Raw(_mmultiscripts_to_omml(node))]
    if name == "menclose":
        return [_Raw(_menclose_to_omml(node))]
    if name == "msup":
        kids = list(node)
        # Prime superscript (f' → f′): fold the prime glyph(s) into a text run
        # rather than a real superscript.
        if len(kids) > 1:
            sup_txt = _node_text(kids[1]).strip()
            if sup_txt and all(ch in "′″‴⁗" for ch in sup_txt):
                return _node_parts(kids[0]) + [_Run(sup_txt)]
        # Only msup routes its script through _sup_omml (upright lone capital).
        return [_Raw(
            "<m:sSup>"
            f"<m:e>{_child_omml(node, 0)}</m:e>"
            f"<m:sup>{_sup_omml(node, 1)}</m:sup>"
            "</m:sSup>"
        )]
    if name == "msub":
        return [_Raw(
            "<m:sSub>"
            f"<m:e>{_child_omml(node, 0)}</m:e>"
            f"<m:sub>{_child_omml(node, 1)}</m:sub>"
            "</m:sSub>"
        )]
    if name == "msubsup":
        return [_Raw(
            "<m:sSubSup>"
            f"<m:e>{_child_omml(node, 0)}</m:e>"
            f"<m:sub>{_child_omml(node, 1)}</m:sub>"
            f"<m:sup>{_child_omml(node, 2)}</m:sup>"
            "</m:sSubSup>"
        )]
    if name == "mfrac":
        return [_Raw(
            "<m:f>"
            f"<m:num>{_child_omml(node, 0)}</m:num>"
            f"<m:den>{_child_omml(node, 1)}</m:den>"
            "</m:f>"
        )]
    if name == "msqrt":
        # All children of msqrt form the radicand; the degree is hidden.
        body = _emit(_sequence_parts(list(node)))
        return [_Raw(
            "<m:rad><m:radPr><m:degHide m:val=\"1\"/></m:radPr>"
            f"<m:deg/><m:e>{body}</m:e></m:rad>"
        )]
    if name == "mroot":
        # Child 0 is the radicand and child 1 the degree.
        return [_Raw(
            "<m:rad><m:radPr><m:degHide m:val=\"0\"/></m:radPr>"
            f"<m:deg>{_child_omml(node, 1)}</m:deg>"
            f"<m:e>{_child_omml(node, 0)}</m:e>"
            "</m:rad>"
        )]
    if name == "mover":
        # A labelled arrow collapses to the bare arrow glyph (label dropped).
        arrow = _decorated_arrow_text(node)
        if arrow is not None:
            return [_Run(arrow)]  # coalescable so "→𝒩" merges into one run
        return [_Raw(_mover_to_omml(node))]
    if name == "munder":
        return [_Raw(_munder_to_omml(node))]
    if name == "munderover":
        return [_Raw(_munderover_to_omml(node))]
    if name == "mtable":
        # _mtable_to_eqarr is defined outside this section of the file.
        return [_Raw(_mtable_to_eqarr(node))]
    # Fallback: descend.
    sub = _sequence_parts(list(node))
    if sub:
        return sub
    # No convertible children: keep whatever text the node holds.
    return [_Run("".join(node.itertext()))]
366
+
367
+
368
def _child_omml(node: ET.Element, index: int) -> str:
    """Return the serialized OMML of ``node``'s child at ``index``, or an
    empty string when ``index`` is at or past the number of children."""
    kids = list(node)
    return _emit(_node_parts(kids[index])) if index < len(kids) else ""
373
+
374
+
375
def _sup_omml(node: ET.Element, index: int) -> str:
    """Serialize the superscript child at ``index``.

    When the superscript is exactly one unstyled run holding a single ASCII
    uppercase letter (the transpose in ``A^{T}``), that run is re-emitted
    with the upright style ``'p'``. A missing child yields an empty string.
    """
    kids = list(node)
    if index >= len(kids):
        return ""
    parts = _node_parts(kids[index])
    if len(parts) == 1:
        only = parts[0]
        if isinstance(only, _Run) and only.style is None:
            letter = only.text
            if len(letter) == 1 and letter.isascii() and letter.isupper():
                parts = [_Run(letter, "p")]
    return _emit(parts)
390
+
391
+
392
+ # ---------------------------------------------------------------------------
393
+ # Group character (overbrace / underbrace) — mover/munder with a brace glyph.
394
+ # ---------------------------------------------------------------------------
395
+
396
def _try_groupchr(node: ET.Element) -> "str | None":
    """Emit ``<m:groupChr>`` for an over-/under-brace, else return ``None``.

    ``node`` must be an ``mover``/``munder`` with at least two children. If
    the second child is not a brace glyph, the first child is examined
    recursively instead, so a label wrapped around an inner brace is dropped
    and the inner brace decides the result.
    """
    if _local_name(node) not in ("mover", "munder"):
        return None
    kids = list(node)
    if len(kids) < 2:
        return None

    base, mark = kids[0], kids[1]
    glyph = _node_text(mark).strip()
    is_top = glyph in _GROUPCHR_TOP
    if not is_top and glyph not in _GROUPCHR_BOT:
        # Outer label around an inner brace (\overbrace{...}^{label}): the
        # label is discarded and the inner brace is authoritative.
        return _try_groupchr(base)

    if is_top:
        chr_val, pos, vert = "⏞", "top", "bot"
    else:
        chr_val, pos, vert = "⏟", "bot", "top"

    props = (
        f'<m:chr m:val="{escape(chr_val)}"/>'
        f'<m:pos m:val="{pos}"/>'
        f'<m:vertJc m:val="{vert}"/>'
    )
    body = _emit(_node_parts(base))
    return (
        f"<m:groupChr><m:groupChrPr>{props}</m:groupChrPr>"
        f"<m:e>{body}</m:e></m:groupChr>"
    )
425
+
426
+
427
def _decorated_arrow_text(node: ET.Element) -> "str | None":
    """Return the bare arrow glyph of a labelled arrow, else ``None``.

    A labelled arrow (\\xrightarrow{d} → mover(arrow, label)) has an arrow
    glyph as its first child and a second child that is neither an accent
    nor an overline glyph. The label itself is dropped.
    """
    kids = list(node)
    if len(kids) < 2:
        return None
    arrow = _node_text(kids[0]).strip()
    label = _node_text(kids[1]).strip()
    if arrow not in _ARROW_CHARS:
        return None
    # An accent or bar over an arrow is decoration, not a label.
    if label in _ACCENT_TO_COMBINING or label in _OVERLINE_CHARS:
        return None
    return arrow
440
+
441
+
442
def _mover_to_omml(node: ET.Element) -> str:
    """Serialize an ``mover`` node to OMML.

    Tried in order:
      1. an over-/under-brace → ``<m:groupChr>`` (via ``_try_groupchr``);
      2. an overline glyph → ``<m:bar>`` positioned on top;
      3. an over-script longer than one character that is not a known accent
         (e.g. ``\\overset{def}{=}``) → ``<m:limUpp>`` with the over-script
         converted as real content;
      4. anything else → ``<m:acc>`` with the matching combining mark.

    Returns:
        The serialized OMML fragment.
    """
    grp = _try_groupchr(node)
    if grp is not None:
        return grp
    children = list(node)
    # With no over-script child, fall back to a hat.
    over_char = _node_text(children[1]).strip() if len(children) > 1 else "^"
    base_xml = _child_omml(node, 0)
    if over_char in _OVERLINE_CHARS:
        return (
            "<m:bar><m:barPr><m:pos m:val=\"top\"/></m:barPr>"
            f"<m:e>{base_xml}</m:e></m:bar>"
        )
    if len(over_char) > 1 and over_char not in _ACCENT_TO_COMBINING:
        # <m:chr m:val> is limited to a single character, so multi-character
        # over-script content cannot be an accent mark. Emit it as an upper
        # limit, mirroring the limLow fallback used for munder.
        return (
            "<m:limUpp>"
            f"<m:e>{base_xml}</m:e>"
            f"<m:lim>{_child_omml(node, 1)}</m:lim>"
            "</m:limUpp>"
        )
    # "\u0302" is the combining circumflex, used when the over-script is empty.
    chr_val = _ACCENT_TO_COMBINING.get(over_char, over_char or "\u0302")
    return (
        "<m:acc>"
        f'<m:accPr><m:chr m:val="{escape(chr_val)}"/></m:accPr>'
        f"<m:e>{base_xml}</m:e></m:acc>"
    )
460
+
461
+
462
def _munder_to_omml(node: ET.Element) -> str:
    """Serialize a ``munder`` node to OMML.

    An under-brace becomes ``<m:groupChr>``, an underline glyph becomes a
    bottom ``<m:bar>``, and anything else becomes ``<m:limLow>`` with the
    second child as the limit.
    """
    brace = _try_groupchr(node)
    if brace is not None:
        return brace

    kids = list(node)
    mark = _node_text(kids[1]).strip() if len(kids) > 1 else ""
    base_xml = _child_omml(node, 0)
    if mark in _BAR_BOT_CHARS:
        return (
            '<m:bar><m:barPr><m:pos m:val="bot"/></m:barPr>'
            f"<m:e>{base_xml}</m:e></m:bar>"
        )

    # Otherwise a lower limit (rare in this corpus outside scripted ops).
    lim_xml = _child_omml(node, 1)
    return f"<m:limLow><m:e>{base_xml}</m:e><m:lim>{lim_xml}</m:lim></m:limLow>"
480
+
481
+
482
def _munderover_to_omml(node: ET.Element) -> str:
    """Serialize ``munderover`` as a ``<m:limLow>`` (base plus under-script)
    nested inside a ``<m:limUpp>`` that carries the over-script."""
    base_xml = _child_omml(node, 0)
    under_xml = _child_omml(node, 1)
    over_xml = _child_omml(node, 2)
    lower = f"<m:limLow><m:e>{base_xml}</m:e><m:lim>{under_xml}</m:lim></m:limLow>"
    return f"<m:limUpp><m:e>{lower}</m:e><m:lim>{over_xml}</m:lim></m:limUpp>"
491
+
492
+
493
+ # ---------------------------------------------------------------------------
494
+ # N-ary operators.
495
+ # ---------------------------------------------------------------------------
496
+
497
def _is_nary_operator_node(node: ET.Element) -> "str | None":
    """Return the operator glyph when ``node`` is an ``<mo>`` whose stripped
    text is in ``_NARY_CHARS``; otherwise return ``None``."""
    if _local_name(node) == "mo":
        glyph = _node_text(node).strip()
        if glyph in _NARY_CHARS:
            return glyph
    return None
502
+
503
+
504
def _script_base(node: ET.Element):
    """Return the first child of ``node`` (the base of a script element), or
    ``None`` when ``node`` has no children."""
    return next(iter(node), None)
507
+
508
+
509
def _nary_pr(base_char: str, has_sub: bool, has_sup: bool) -> str:
    """Build the ``<m:naryPr>`` property block for an n-ary operator.

    The operator glyph and ``limLoc=subSup`` are always written. A
    ``subHide`` / ``supHide`` flag is written only for a limit that is
    genuinely absent.
    """
    hide_sub = "" if has_sub else '<m:subHide m:val="1"/>'
    hide_sup = "" if has_sup else '<m:supHide m:val="1"/>'
    return (
        "<m:naryPr>"
        f'<m:chr m:val="{escape(base_char)}"/>'
        '<m:limLoc m:val="subSup"/>'
        f"{hide_sub}{hide_sup}"
        "</m:naryPr>"
    )
520
+
521
+
522
def _nary_from_script(node: ET.Element, base_char: str, body: str) -> str:
    """Build ``<m:nary>`` from a script container whose base is an n-ary glyph.

    ``node`` is the msub/msup/msubsup/munder/mover/munderover element,
    ``base_char`` the operator glyph and ``body`` the already-serialized
    operand. A limit that is missing or serializes to nothing is written as
    an empty element and flagged hidden in the properties.
    """
    kids = list(node)

    def rendered(idx: int) -> str:
        # Serialized child at idx, or "" when the child does not exist.
        return _emit(_node_parts(kids[idx])) if len(kids) > idx else ""

    kind = _local_name(node)
    sub_xml = sup_xml = ""
    if kind in ("msub", "munder"):
        sub_xml = rendered(1)
    elif kind in ("msup", "mover"):
        sup_xml = rendered(1)
    elif kind in ("msubsup", "munderover"):
        sub_xml = rendered(1)
        sup_xml = rendered(2)

    props = _nary_pr(base_char, bool(sub_xml), bool(sup_xml))
    sub_block = "<m:sub/>" if not sub_xml else f"<m:sub>{sub_xml}</m:sub>"
    sup_block = "<m:sup/>" if not sup_xml else f"<m:sup>{sup_xml}</m:sup>"
    return f"<m:nary>{props}{sub_block}{sup_block}<m:e>{body}</m:e></m:nary>"
538
+
539
+
540
+ # ---------------------------------------------------------------------------
541
+ # Limit operators (lim / limsup / max …) scripted via msub/munder.
542
+ # ---------------------------------------------------------------------------
543
+
544
def _is_limit_op_script(node: ET.Element) -> "ET.Element | None":
    """Return the base of an msub/munder whose base is a limit operator.

    The base must be an ``<mo>`` or ``<mi>`` whose space-normalized, stripped
    text is in ``_LIMIT_OP_NAMES``; in every other case ``None`` is returned.
    """
    if _local_name(node) not in ("msub", "munder"):
        return None
    base = _script_base(node)
    if base is None or _local_name(base) not in ("mo", "mi"):
        return None
    label = _normalize_spaces(_node_text(base)).strip()
    return base if label in _LIMIT_OP_NAMES else None
555
+
556
+
557
def _limit_low_from_script(node: ET.Element) -> str:
    """Serialize a scripted limit operator as ``<m:limLow>``.

    The first child is the operator and the second, when present, is the
    limit written beneath it. ``node`` must have at least one child.
    """
    kids = list(node)
    operator_xml = _emit(_node_parts(kids[0]))
    limit_xml = "" if len(kids) <= 1 else _emit(_node_parts(kids[1]))
    return f"<m:limLow><m:e>{operator_xml}</m:e><m:lim>{limit_xml}</m:lim></m:limLow>"
567
+
568
+
569
+ # ---------------------------------------------------------------------------
570
+ # Function detection (\sin x, \log_2 n, \ln(x+1), …).
571
+ # ---------------------------------------------------------------------------
572
+
573
def _function_name_node(node: ET.Element) -> "str | None":
    """Return the function name carried by ``node``, or ``None``.

    Two shapes are recognised: a bare ``<mi>``/``<mo>`` whose stripped text
    is in ``_FUNCTION_NAMES``, or an ``<msub>`` whose base is such a token.
    """
    kind = _local_name(node)
    if kind in ("mi", "mo"):
        token = node
    elif kind == "msub":
        token = _script_base(node)
        if token is None or _local_name(token) not in ("mi", "mo"):
            return None
    else:
        return None
    label = _node_text(token).strip()
    return label if label in _FUNCTION_NAMES else None
585
+
586
+
587
def _func_apply(name_xml: str, arg_xml: str) -> str:
    """Wrap a serialized function name and argument in ``<m:func>``.

    Both inputs are inserted verbatim; they must already be OMML.
    """
    name_block = f"<m:fName>{name_xml}</m:fName>"
    arg_block = f"<m:e>{arg_xml}</m:e>"
    return "<m:func><m:funcPr/>" + name_block + arg_block + "</m:func>"
593
+
594
+
595
+ # ---------------------------------------------------------------------------
596
+ # Sequence conversion — the heart of run coalescing + lookahead constructs.
597
+ # ---------------------------------------------------------------------------
598
+
599
def _sequence_parts(children: "list") -> "list":
    """Convert a flat sequence of MathML nodes to a list of _Run/_Raw parts.

    Handles, with sibling lookahead:
    • bare fence wrapping → <m:d>
    • n-ary operator body capture → <m:nary>
    • limit-op scripts (lim…) → <m:limLow> (no argument capture)
    • function application (sin/log/…) → <m:func> capturing the argument
    • empty-base prescript pattern → <m:sPre>

    The checks inside the loop run in a fixed priority order and the first
    one that matches consumes one or more siblings, so reordering them
    changes the output.
    """
    # Top-level bare fence (e.g. \left( a + b \right) flattened to mo … mo).
    # _try_bare_fence is defined outside this section of the file.
    fence = _try_bare_fence(children)
    if fence is not None:
        return [_Raw(fence)]

    parts: "list" = []
    i = 0
    n = len(children)
    while i < n:
        child = children[i]
        cname = _local_name(child)

        # Pre-script pattern: msub/msup/msubsup with EMPTY base, followed by the
        # real base run → <m:sPre>.
        # _try_prescript (defined elsewhere) yields (xml, nodes consumed).
        pre = _try_prescript(children, i)
        if pre is not None:
            xml, consumed = pre
            parts.append(_Raw(xml))
            i += consumed
            continue

        # N-ary operator with script container (msub/munder/…): capture body.
        if cname in _SCRIPT_TAGS:
            base = _script_base(child)
            base_char = _is_nary_operator_node(base) if base is not None else None
            if base_char:
                # j is the index of the first sibling NOT taken into the body.
                body_nodes, j = _capture_body(children, i + 1)
                body = _emit(_sequence_parts(body_nodes))
                parts.append(_Raw(_nary_from_script(child, base_char, body)))
                i = j
                continue
            # Limit operator (lim/limsup/max …) → limLow, no argument capture.
            if _is_limit_op_script(child) is not None:
                parts.append(_Raw(_limit_low_from_script(child)))
                i += 1
                continue
            sbase = _script_base(child)
            if (cname in ("msup", "msub") and sbase is not None
                    and _function_name_node(sbase) is not None):
                kids = list(child)
                if cname == "msub":
                    # Subscripted function NAME (e.g. \log_2 n): the whole sSub is
                    # the function name; the following operand is the argument.
                    name_xml = (
                        "<m:sSub>"
                        f"<m:e>{_emit(_node_parts(sbase))}</m:e>"
                        f"<m:sub>{_emit(_node_parts(kids[1])) if len(kids) > 1 else ''}</m:sub>"
                        "</m:sSub>"
                    )
                    arg_nodes, j = _capture_func_arg(children, i + 1)
                    parts.append(_Raw(_func_apply(
                        name_xml, _emit(_sequence_parts(arg_nodes)))))
                    i = j
                    continue
                # Superscripted function (e.g. \cos^2 θ): power of the result.
                name_xml = _emit(_node_parts(sbase))
                arg_nodes, j = _capture_func_arg(children, i + 1)
                func_xml = _func_apply(name_xml, _emit(_sequence_parts(arg_nodes)))
                script_xml = _emit(_node_parts(kids[1])) if len(kids) > 1 else ""
                parts.append(_Raw(
                    f"<m:sSup><m:e>{func_xml}</m:e>"
                    f"<m:sup>{script_xml}</m:sup></m:sSup>"
                ))
                i = j
                continue

        # Bare n-ary mo (no script container).
        bare_char = _is_nary_operator_node(child)
        if bare_char:
            body_nodes, j = _capture_body(children, i + 1)
            body = _emit(_sequence_parts(body_nodes))
            # No limits at all, so both are flagged hidden.
            pr = _nary_pr(bare_char, False, False)
            parts.append(_Raw(
                f"<m:nary>{pr}<m:sub/><m:sup/><m:e>{body}</m:e></m:nary>"
            ))
            i = j
            continue

        # Function application: name token (sin/log_2/…) consumes its argument.
        fname = _function_name_node(child)
        if fname is not None:
            name_xml = _emit(_node_parts(child))
            arg_nodes, j = _capture_func_arg(children, i + 1)
            # With no argument captured, fall through to the later checks so
            # the name is still emitted as an ordinary token.
            if arg_nodes:
                arg_xml = _emit(_sequence_parts(arg_nodes))
                parts.append(_Raw(_func_apply(name_xml, arg_xml)))
                i = j
                continue

        # Implicit function application: a single-letter operand (or scripted
        # operand) immediately followed by a parenthesised group → func, with the
        # surrounding parentheses STRIPPED (the inner content is the argument).
        # e.g. f(x) → func(f, x), f(x,y) → func(f, "x,y").
        # NOTE(review): _is_operand_callable only accepts a bare <mi>, so the
        # "scripted operand" case above never takes this branch.
        if _is_operand_callable(child) and _next_is_paren_group(children, i + 1):
            name_xml = _emit(_node_parts(child))
            inner_nodes, j = _capture_paren_inner(children, i + 1)
            parts.append(_Raw(_func_apply(
                name_xml, _emit(_sequence_parts(inner_nodes)))))
            i = j
            continue

        # Inline fence group: a parenthesised sub-expression in function-call
        # position (immediately after an identifier/scripted operand) → wrap in
        # <m:d>. e.g. f'(x) → f′ (x). Parens after an operator stay literal text.
        prev = children[i - 1] if i > 0 else None
        grp = (_inline_fence_group(children, i)
               if prev is not None and _local_name(prev) in (
                   "mi", "msub", "msup", "msubsup", "mrow", "mover", "munder")
               else None)
        if grp is not None:
            xml, j = grp
            parts.append(_Raw(xml))
            i = j
            continue

        # Default: token / structural node.
        parts.extend(_node_parts(child))
        i += 1
    return parts
728
+
729
+
730
def _inline_fence_group(children: "list", i: int) -> "tuple[str, int] | None":
    """Wrap the balanced ``( … )`` group that opens at ``children[i]``.

    Returns ``(delimiter_xml, index_after_close)``, or ``None`` when
    ``children[i]`` is not an ``<mo>`` holding ``(`` or the group never
    closes.
    """
    opener = children[i]
    if _local_name(opener) != "mo":
        return None
    open_ch = _node_text(opener).strip()
    # Only parenthesis groups are wrapped inline. Brackets/braces/bars appear in
    # the corpus as literal text inside limits and sets, so wrapping them would
    # over-fire; the whole-row fence handler still covers \left[ … \right].
    if open_ch != "(":
        return None
    close_ch = _FENCE_PAIRS[open_ch]

    depth = 0
    for j in range(i, len(children)):
        token = children[j]
        if _local_name(token) != "mo":
            continue
        glyph = _node_text(token).strip()
        if glyph == open_ch:
            depth += 1
        elif glyph == close_ch:
            depth -= 1
            if depth == 0:
                inner_xml = _emit(_sequence_parts(children[i + 1:j]))
                return _delimiter(open_ch, close_ch, [inner_xml]), j + 1
    return None
759
+
760
+
761
def _is_operand_callable(node: ET.Element) -> bool:
    """Report whether ``node`` can head an implicit call such as ``f(x)``.

    Only a bare ``<mi>`` whose stripped text is exactly one alphabetic
    character qualifies. Every other node, scripted ones included, yields
    ``False``.
    """
    if _local_name(node) != "mi":
        return False
    ident = _node_text(node).strip()
    return len(ident) == 1 and ident.isalpha()
769
+
770
+
771
def _capture_paren_inner(children: "list", start: int) -> "tuple[list, int]":
    """Collect the nodes between the next ``(`` and its matching ``)``.

    Leading ``mspace`` nodes are skipped first. Returns
    ``(inner_nodes, index_after_group)``:
      • no node left → ``([], len(children))``;
      • next node is not an ``<mo>`` holding ``(`` → that single node;
      • unbalanced group → everything after the opening parenthesis.
    """
    total = len(children)
    pos = start
    while pos < total and _local_name(children[pos]) == "mspace":
        pos += 1
    if pos >= total:
        return [], total

    opener = children[pos]
    if _local_name(opener) != "mo" or _node_text(opener).strip() != "(":
        return [opener], pos + 1

    depth = 0
    for j in range(pos, total):
        token = children[j]
        if _local_name(token) != "mo":
            continue
        glyph = _node_text(token).strip()
        if glyph == "(":
            depth += 1
        elif glyph == ")":
            depth -= 1
            if depth == 0:
                return children[pos + 1:j], j + 1
    return children[pos + 1:], total
794
+
795
+
796
+ def _next_is_paren_group(children: "list", start: int) -> bool:
797
+ # Skip spacing.
798
+ while start < len(children) and _local_name(children[start]) == "mspace":
799
+ start += 1
800
+ if start >= len(children):
801
+ return False
802
+ nxt = children[start]
803
+ if _local_name(nxt) == "mo" and _node_text(nxt).strip() == "(":
804
+ return True
805
+ if _local_name(nxt) == "mfenced":
806
+ return True
807
+ return False
808
+
809
+
810
+ _RELATION_OPS = {
811
+ "=", "≠", "≈", "≡", "∼", "≅", "≃", "∝",
812
+ "<", ">", "≤", "≥", "≪", "≫",
813
+ "→", "←", "↔", "⇒", "⇐", "⇔", "↦", "⟶",
814
+ "∈", "∉", "⊂", "⊆", "⊃", "⊇",
815
+ }
816
+
817
+
818
+ def _capture_body(children: "list", start: int) -> "tuple[list, int]":
819
+ """Capture nodes for an n-ary body: the summand/integrand.
820
+
821
+ A following n-ary operator is INCLUDED so adjacent operators nest (matching
822
+ ``\\sum_i \\sum_j a_{ij}`` → an outer nary whose body is the inner nary). The
823
+ body ends at a top-level relation operator (=, ∼, <, →, ∈, …), which marks
824
+ the end of the operand and the start of a new clause."""
825
+ body: "list" = []
826
+ j = start
827
+ n = len(children)
828
+ while j < n:
829
+ nxt = children[j]
830
+ if _local_name(nxt) in ("mo", "mi"):
831
+ t = _node_text(nxt).strip().translate(_GLYPH_ALIASES)
832
+ if t in _RELATION_OPS:
833
+ break
834
+ body.append(nxt)
835
+ j += 1
836
+ return body, j
837
+
838
+
839
+ def _capture_func_arg(children: "list", start: int) -> "tuple[list, int]":
840
+ """Capture the argument of a function application.
841
+
842
+ The argument is the immediately-following operand:
843
+ • a single delimiter group (mo '(' … mo ')'), or a node that already is a
844
+ delimiter / fenced / row, OR
845
+ • a single simple operand token.
846
+ We deliberately keep the argument tight so trailing terms stay outside.
847
+ """
848
+ n = len(children)
849
+ # Skip a spacing token (e.g. \! negative thin space between \exp and the
850
+ # parenthesised argument).
851
+ while start < n and _local_name(children[start]) == "mspace":
852
+ start += 1
853
+ if start >= n:
854
+ return [], start
855
+ first = children[start]
856
+ fname_local = _local_name(first)
857
+ # Parenthesised argument starting with an opening fence mo: take through the
858
+ # matching close fence.
859
+ if fname_local == "mo":
860
+ open_ch = _node_text(first).strip()
861
+ if open_ch in _FENCE_OPENERS:
862
+ close_ch = _FENCE_PAIRS[open_ch]
863
+ depth = 0
864
+ j = start
865
+ while j < n:
866
+ node = children[j]
867
+ if _local_name(node) == "mo":
868
+ t = _node_text(node).strip()
869
+ if t == open_ch:
870
+ depth += 1
871
+ elif t == close_ch:
872
+ depth -= 1
873
+ if depth == 0:
874
+ return children[start:j + 1], j + 1
875
+ j += 1
876
+ # Unbalanced — fall through to single token.
877
+ # A node that is itself a complete group (mfenced / row / delimiter) — take
878
+ # just it.
879
+ if fname_local in {"mfenced", "mrow"}:
880
+ return [first], start + 1
881
+ # Otherwise the single following operand token.
882
+ return [first], start + 1
883
+
884
+
885
def _try_prescript(children: "list", i: int) -> "tuple[str, int] | None":
    """Recognise the empty-base prescript pattern and emit ``<m:sPre>``.

    latex2mathml renders ``{}^{a}_{b}\\mathrm{X}`` as an msubsup/msup/msub
    whose BASE has no text and no children, immediately followed by the real
    base operand. When ``children[i]`` matches that shape and a following
    sibling exists, the scripts are attached to that sibling (coerced
    upright).

    Returns ``(xml, 2)`` on a match, otherwise ``None``. The ``2`` is
    presumably the number of sibling nodes consumed — confirm against the
    caller.
    """
    script = children[i]
    kind = _local_name(script)
    if kind not in ("msubsup", "msup", "msub"):
        return None
    parts = list(script)
    if not parts:
        return None
    placeholder = parts[0]
    # The placeholder base must be completely empty: no text, no children.
    if _node_text(placeholder).strip() != "" or list(placeholder):
        return None
    # A following operand is required to serve as the real base.
    if i + 1 >= len(children):
        return None

    def _render(idx: int) -> str:
        return _emit(_node_parts(parts[idx])) if len(parts) > idx else ""

    if kind == "msubsup":
        sub_xml, sup_xml = _render(1), _render(2)
    elif kind == "msub":
        sub_xml, sup_xml = _render(1), ""
    else:  # msup
        sub_xml, sup_xml = "", _render(1)

    base_xml = _emit(_coerce_upright(_node_parts(children[i + 1])))
    sub_block = "<m:sub/>" if not sub_xml else f"<m:sub>{sub_xml}</m:sub>"
    sup_block = "<m:sup/>" if not sup_xml else f"<m:sup>{sup_xml}</m:sup>"
    xml = f"<m:sPre><m:sPrePr/>{sub_block}{sup_block}<m:e>{base_xml}</m:e></m:sPre>"
    return xml, 2
923
+
924
+
925
+ def _coerce_upright(parts: "list") -> "list":
926
+ """Mark plain text runs in ``parts`` as upright (sty='p'). Used for the base
927
+ of a prescript (a chemical element symbol like \\mathrm{X}), which
928
+ latex2mathml strips ``mathvariant=normal`` from for single letters."""
929
+ out: "list" = []
930
+ for item in parts:
931
+ if isinstance(item, _Run) and item.style is None:
932
+ out.append(_Run(item.text, "p"))
933
+ else:
934
+ out.append(item)
935
+ return out
936
+
937
+
938
+ # ---------------------------------------------------------------------------
939
+ # Delimiters / fences.
940
+ # ---------------------------------------------------------------------------
941
+
942
+ def _delimiter(begin: str, end: str, inner_es: "list[str]", sep: "str | None" = None) -> str:
943
+ pr_parts = [f'<m:begChr m:val="{escape(begin)}"/>', f'<m:endChr m:val="{escape(end)}"/>']
944
+ if sep is not None:
945
+ pr_parts.append(f'<m:sepChr m:val="{escape(sep)}"/>')
946
+ pr = "<m:dPr>" + "".join(pr_parts) + "</m:dPr>"
947
+ es = "".join(f"<m:e>{e}</m:e>" for e in inner_es) or "<m:e/>"
948
+ return f"<m:d>{pr}{es}</m:d>"
949
+
950
+
951
def _is_fence_mo(node: ET.Element) -> bool:
    """True for an ``mo`` that is a fence: flagged ``fence="true"`` or whose
    stripped text is a known fence opener or closer."""
    if _local_name(node) != "mo":
        return False
    if node.attrib.get("fence") == "true":
        return True
    glyph = _node_text(node).strip()
    return glyph in _FENCE_OPENERS or glyph in _FENCE_CLOSERS
957
+
958
+
959
+ def _try_bare_fence(children: "list") -> "str | None":
960
+ """If ``children`` open with a fence ``mo`` and close with a matching fence
961
+ ``mo``, wrap the inner content in <m:d>. Handles inner ``\\middle|`` bars by
962
+ splitting into nested <m:d> with | … | when fence bars appear mid-row."""
963
+ if len(children) < 2:
964
+ return None
965
+ first, last = children[0], children[-1]
966
+ if _local_name(first) != "mo" or _local_name(last) != "mo":
967
+ return None
968
+ open_ch = _node_text(first).strip()
969
+ close_ch = _node_text(last).strip()
970
+ if open_ch not in _FENCE_OPENERS:
971
+ return None
972
+ # The close must be a recognised closer; allow asymmetric pairs that the
973
+ # fixture uses (e.g. ( … ] and | … ⟩).
974
+ if close_ch not in _FENCE_CLOSERS:
975
+ return None
976
+ inner = children[1:-1]
977
+ # Detect inner "middle" fence bars (lspace/rspace stretchy fence | ) that
978
+ # split the content into nested delimiters: ⟨ a | b | c ⟩.
979
+ mids = [
980
+ idx for idx, node in enumerate(inner)
981
+ if _local_name(node) == "mo"
982
+ and node.attrib.get("fence") == "true"
983
+ and node.attrib.get("lspace") is not None
984
+ and _node_text(node).strip() in ("|", "‖")
985
+ ]
986
+ if mids:
987
+ segs: "list[list]" = [[]]
988
+ bar_chars: "list[str]" = []
989
+ for node in inner:
990
+ if (_local_name(node) == "mo"
991
+ and node.attrib.get("fence") == "true"
992
+ and node.attrib.get("lspace") is not None
993
+ and _node_text(node).strip() in ("|", "‖")):
994
+ bar_chars.append(_node_text(node).strip())
995
+ segs.append([])
996
+ else:
997
+ segs[-1].append(node)
998
+ # Middle bars come in PAIRS: the content between bar[0] and bar[1] is
999
+ # wrapped in its own <m:d>|…|, the content after bar[1] is plain, etc.
1000
+ # ⟨ ϕ | X | ψ ⟩ → ϕ, <m:d>|X|</m:d>, ψ.
1001
+ es_parts: "list[str]" = []
1002
+ for k, seg in enumerate(segs):
1003
+ seg_xml = _emit(_sequence_parts(seg))
1004
+ if k >= 1 and k % 2 == 1:
1005
+ bar = bar_chars[k - 1] if k - 1 < len(bar_chars) else "|"
1006
+ es_parts.append(_delimiter(bar, bar, [seg_xml]))
1007
+ else:
1008
+ es_parts.append(seg_xml)
1009
+ body = "".join(es_parts)
1010
+ return _delimiter(open_ch, close_ch, [body])
1011
+ # Plain fence: the whole inner content is a single <m:e> (commas stay as
1012
+ # literal text — the fixture does not split argument lists into cells).
1013
+ return _delimiter(open_ch, close_ch, [_emit(_sequence_parts(inner))])
1014
+
1015
+
1016
def _mfenced_to_omml(node: ET.Element) -> str:
    """Convert an ``mfenced`` element into an OMML ``<m:d>``.

    ``open``/``close`` default to parentheses and ``separators`` to a comma.
    Every child becomes its own cell; the first separator character is
    emitted only when there is more than one cell.
    """
    attrs = node.attrib
    open_ch = attrs.get("open", "(")
    close_ch = attrs.get("close", ")")
    separators = attrs.get("separators", ",")
    cells = [_emit(_node_parts(child)) for child in node]
    sep = None
    if separators and len(cells) > 1:
        sep = separators[0]
    return _delimiter(open_ch, close_ch, cells, sep=sep)
1024
+
1025
+
1026
+ # ---------------------------------------------------------------------------
1027
+ # Multiscripts (mmultiscripts) — pre/post scripts.
1028
+ # ---------------------------------------------------------------------------
1029
+
1030
+ def _mmultiscripts_to_omml(node: ET.Element) -> str:
1031
+ children = list(node)
1032
+ if not children:
1033
+ return ""
1034
+ base = children[0]
1035
+ base_xml = _emit(_node_parts(base))
1036
+ post: "list[ET.Element]" = []
1037
+ pre: "list[ET.Element]" = []
1038
+ in_pre = False
1039
+ for child in children[1:]:
1040
+ if _local_name(child) == "mprescripts":
1041
+ in_pre = True
1042
+ continue
1043
+ (pre if in_pre else post).append(child)
1044
+
1045
+ def _sx(nodes: "list[ET.Element]", idx: int) -> str:
1046
+ if idx >= len(nodes):
1047
+ return ""
1048
+ nd = nodes[idx]
1049
+ if _local_name(nd) == "none":
1050
+ return ""
1051
+ return _emit(_node_parts(nd))
1052
+
1053
+ result = base_xml
1054
+ if post:
1055
+ sub_xml = _sx(post, 0)
1056
+ sup_xml = _sx(post, 1)
1057
+ if sub_xml and sup_xml:
1058
+ result = (
1059
+ "<m:sSubSup>"
1060
+ f"<m:e>{result}</m:e>"
1061
+ f"<m:sub>{sub_xml}</m:sub>"
1062
+ f"<m:sup>{sup_xml}</m:sup>"
1063
+ "</m:sSubSup>"
1064
+ )
1065
+ elif sub_xml:
1066
+ result = f"<m:sSub><m:e>{result}</m:e><m:sub>{sub_xml}</m:sub></m:sSub>"
1067
+ elif sup_xml:
1068
+ result = f"<m:sSup><m:e>{result}</m:e><m:sup>{sup_xml}</m:sup></m:sSup>"
1069
+ if pre:
1070
+ pre_sub = _sx(pre, 0)
1071
+ pre_sup = _sx(pre, 1)
1072
+ sub_block = f"<m:sub>{pre_sub}</m:sub>" if pre_sub else "<m:sub/>"
1073
+ sup_block = f"<m:sup>{pre_sup}</m:sup>" if pre_sup else "<m:sup/>"
1074
+ result = (
1075
+ "<m:sPre><m:sPrePr/>"
1076
+ f"{sub_block}{sup_block}"
1077
+ f"<m:e>{result}</m:e></m:sPre>"
1078
+ )
1079
+ return result
1080
+
1081
+
1082
+ # ---------------------------------------------------------------------------
1083
+ # Enclosures (menclose) — boxed / cancel / bcancel.
1084
+ # ---------------------------------------------------------------------------
1085
+
1086
def _menclose_to_omml(node: ET.Element) -> str:
    """Convert ``menclose`` into an OMML ``<m:borderBox>``.

    ``updiagonalstrike`` and ``downdiagonalstrike`` hide all four borders and
    draw one diagonal strike; every other notation (box, roundedbox,
    actuarial, …) becomes a plain bordered box. The notation is matched as a
    whole after trimming and lower-casing.
    """
    notation = node.attrib.get("notation", "").strip().lower()
    inner = _emit(_sequence_parts(list(node)))
    # NOTE(review): MathML defines updiagonalstrike as lower-left → upper-right,
    # yet it is mapped to strikeTLBR here (and downdiagonalstrike to
    # strikeBLTR). Confirm the swap is intentional before changing it.
    strike = {
        "updiagonalstrike": "strikeTLBR",
        "downdiagonalstrike": "strikeBLTR",
    }.get(notation)
    if strike is None:
        return f"<m:borderBox><m:borderBoxPr/><m:e>{inner}</m:e></m:borderBox>"
    props = (
        "<m:borderBoxPr>"
        '<m:hideTop m:val="1"/><m:hideBot m:val="1"/>'
        '<m:hideLeft m:val="1"/><m:hideRight m:val="1"/>'
        f'<m:{strike} m:val="1"/>'
        "</m:borderBoxPr>"
    )
    return f"<m:borderBox>{props}<m:e>{inner}</m:e></m:borderBox>"
1109
+
1110
+
1111
+ # ---------------------------------------------------------------------------
1112
+ # mtable → eqArr (aligned / cases). latex2mathml flattens \aligned to a single
1113
+ # mrow with raw '&' separators (handled in preprocessing), and \cases to an
1114
+ # mtable inside a prefix-fence mrow. Both become <m:eqArr> with '&' stripped.
1115
+ # ---------------------------------------------------------------------------
1116
+
1117
+ # Synthetic newline marker (an mspace[linebreak=newline]) used to insert an
1118
+ # explicit row break at <mtr> boundaries.
1119
+ _NEWLINE_MARK = ET.Element("mspace", {"linebreak": "newline"})
1120
+
1121
+
1122
def _mtable_rows(node: ET.Element) -> "list[list]":
    """Flatten an ``mtable`` into token rows for eqArr (aligned / cases).

    The contents of all ``mtd`` cells of an ``mtr`` are concatenated, so the
    columns collapse into one sequence, and a newline marker is appended
    after each ``mtr``. The stream is then split on ``\\\\`` break markers,
    '&' align markers are dropped, and empty rows are discarded.
    """
    tokens: "list" = []
    for tr in node:
        if _local_name(tr) != "mtr":
            continue
        for cell in tr:
            if _local_name(cell) == "mtd":
                tokens.extend(cell)
        tokens.append(_NEWLINE_MARK)  # an mtr boundary is a row break too
    rows = (
        [nd for nd in segment if not _is_align_marker(nd)]
        for segment in _split_rows_on_nbsp(tokens)
    )
    return [row for row in rows if row]
1135
+
1136
+
1137
def _mtable_to_eqarr(node: ET.Element) -> str:
    """Render an ``mtable`` as ``<m:eqArr>`` with one ``<m:e>`` per row."""
    cells = [
        f"<m:e>{_emit(_sequence_parts(row))}</m:e>"
        for row in _mtable_rows(node)
    ]
    return "<m:eqArr>" + "".join(cells) + "</m:eqArr>"
1143
+
1144
+
1145
+ def _mtable_grid(node: ET.Element) -> "list[list[list]]":
1146
+ """Reconstruct a 2-D cell grid from latex2mathml's degenerate matrix mtable.
1147
+
1148
+ latex2mathml drops '&' column markers in matrix environments and packs the
1149
+ table into one <mtr> with split <mtd>s plus a newline/nbsp ``\\`` break. We
1150
+ treat each <mtd> boundary as a column separator and each break as a new row.
1151
+ """
1152
+ rows: "list[list[list]]" = [[]]
1153
+ for tr in [c for c in list(node) if _local_name(c) == "mtr"]:
1154
+ for cell in [c for c in list(tr) if _local_name(c) == "mtd"]:
1155
+ segments = _split_rows_on_nbsp(list(cell)) or [[]]
1156
+ for k, seg in enumerate(segments):
1157
+ seg = [nd for nd in seg if not _is_align_marker(nd)]
1158
+ if k == 0:
1159
+ rows[-1].append(seg)
1160
+ else:
1161
+ rows.append([seg])
1162
+ rows.append([]) # mtr boundary ends a row
1163
+ return [r for r in rows if r]
1164
+
1165
+
1166
def _mtable_to_matrix(node: ET.Element) -> str:
    """Render an ``mtable`` as an OMML matrix ``<m:m>``.

    The column count written to ``<m:count>`` is the widest row of the
    reconstructed grid (0 for an empty table); shorter rows are emitted as-is.
    """
    grid = _mtable_grid(node)
    ncols = max(map(len, grid), default=0)
    row_xml = []
    for row in grid:
        cells = "".join(
            f"<m:e>{_emit(_sequence_parts(cell))}</m:e>" for cell in row
        )
        row_xml.append(f"<m:mr>{cells}</m:mr>")
    props = (
        "<m:mPr><m:mcs><m:mc><m:mcPr>"
        f'<m:count m:val="{ncols}"/>'
        "</m:mcPr></m:mc></m:mcs></m:mPr>"
    )
    return "<m:m>" + props + "".join(row_xml) + "</m:m>"
1181
+
1182
+
1183
+ # Private-use sentinel char standing in for the alignment '&' that
1184
+ # latex2mathml emits as invalid XML inside aligned environments.
1185
+ _ALIGN_SENTINEL = ""
1186
+
1187
+
1188
def _is_align_marker(node: ET.Element) -> bool:
    """True for an ``mi``/``mo`` whose stripped text is a literal '&' or the
    alignment sentinel that stands in for it."""
    if _local_name(node) in ("mi", "mo"):
        return _node_text(node).strip() in ("&", _ALIGN_SENTINEL)
    return False
1193
+
1194
+
1195
def _is_nbsp_break(node: ET.Element) -> bool:
    """Tell whether ``node`` is a row-break marker (LaTeX ``\\\\``).

    Two renderings are recognised:
      • an ``mspace`` with ``linebreak="newline"`` (what latex2mathml emits
        for ``\\\\`` in aligned/matrix environments);
      • a non-empty ``mtext`` made up solely of spaces and/or no-break spaces.
    """
    name = _local_name(node)
    if name == "mspace":
        return node.attrib.get("linebreak") == "newline"
    if name == "mtext":
        txt = _node_text(node)
        # The strip set is spelled with escapes (space + U+00A0) so the
        # no-break space stays visible and cannot be silently normalised
        # away by an editor or formatter.
        return txt != "" and txt.strip(" \u00a0") == ""
    return False
1204
+
1205
+
1206
+ def _split_rows_on_nbsp(nodes: "list") -> "list[list]":
1207
+ rows: "list[list]" = [[]]
1208
+ for nd in nodes:
1209
+ if _is_nbsp_break(nd):
1210
+ rows.append([])
1211
+ else:
1212
+ rows[-1].append(nd)
1213
+ return [r for r in rows if r]
1214
+
1215
+
1216
+ # ---------------------------------------------------------------------------
1217
+ # Top-level mrow special cases: aligned (flattened) and cases (fence+mtable).
1218
+ # ---------------------------------------------------------------------------
1219
+
1220
+ def _try_aligned_or_cases(root_children: "list") -> "str | None":
1221
+ """If the top-level content is an aligned/cases environment, emit eqArr.
1222
+
1223
+ • aligned → a single mrow flattened to tokens with raw '&' align markers and
1224
+ nbsp '\\' breaks (no mtable). Detect by presence of '&' align markers.
1225
+ • cases → an mrow whose children are [fence-mo '{', mtable]. Detect the
1226
+ mtable and convert it.
1227
+ """
1228
+ # Unwrap a single wrapping mrow.
1229
+ nodes = root_children
1230
+ while len(nodes) == 1 and _local_name(nodes[0]) in ("mrow", "mstyle"):
1231
+ nodes = list(nodes[0])
1232
+
1233
+ # matrix: fence-open + mtable + fence-close → <m:d><m:m>.
1234
+ if (len(nodes) == 3
1235
+ and _local_name(nodes[0]) == "mo"
1236
+ and _local_name(nodes[1]) == "mtable"
1237
+ and _local_name(nodes[2]) == "mo"):
1238
+ open_ch = _node_text(nodes[0]).strip()
1239
+ close_ch = _node_text(nodes[2]).strip()
1240
+ if open_ch in _FENCE_OPENERS and close_ch in _FENCE_CLOSERS:
1241
+ return _delimiter(open_ch, close_ch, [_mtable_to_matrix(nodes[1])])
1242
+ # cases: prefix fence '{' followed by mtable → eqArr (no fence).
1243
+ if (len(nodes) == 2
1244
+ and _local_name(nodes[0]) == "mo"
1245
+ and _node_text(nodes[0]).strip() == "{"
1246
+ and _local_name(nodes[1]) == "mtable"):
1247
+ return _mtable_to_eqarr(nodes[1])
1248
+ # A bare mtable (\begin{matrix} / \begin{array}, no fence) → matrix <m:m>.
1249
+ if len(nodes) == 1 and _local_name(nodes[0]) == "mtable":
1250
+ return _mtable_to_matrix(nodes[0])
1251
+
1252
+ # aligned: flattened tokens containing '&' align markers.
1253
+ if any(_is_align_marker(nd) for nd in nodes):
1254
+ rows: "list[str]" = []
1255
+ for seg in _split_rows_on_nbsp(nodes):
1256
+ seg = [nd for nd in seg if not _is_align_marker(nd)]
1257
+ rows.append(_emit(_sequence_parts(seg)))
1258
+ es = "".join(f"<m:e>{r}</m:e>" for r in rows)
1259
+ return f"<m:eqArr>{es}</m:eqArr>"
1260
+ return None
1261
+
1262
+
1263
+ # ---------------------------------------------------------------------------
1264
+ # MathML preprocessing — repair latex2mathml's invalid '&' in aligned output.
1265
+ # ---------------------------------------------------------------------------
1266
+
1267
def _sanitize_mathml(mathml: str) -> str:
    """Make latex2mathml's ``aligned`` output parseable.

    latex2mathml emits a literal ``<mi>&</mi>`` for the alignment ``&`` in
    ``aligned`` environments, which is invalid XML. Each occurrence is
    replaced by an ``mi`` holding ``_ALIGN_SENTINEL``, the token that
    ``_is_align_marker`` recognises downstream. The replacement is built from
    the constant rather than repeating the character inline, so the two
    cannot drift apart.
    """
    return mathml.replace("<mi>&</mi>", f"<mi>{_ALIGN_SENTINEL}</mi>")
1272
+
1273
+
1274
+
1275
def _mathml_to_omml(mathml: str) -> "str | None":
    """Convert a MathML document string to an ``<m:oMath>`` XML string.

    Returns ``None`` when the sanitised input does not parse, when the root is
    not a ``math`` element, or when ordinary conversion yields no content.
    Aligned / cases / matrix content is handled by the dedicated top-level
    path first.
    """
    cleaned = _sanitize_mathml(mathml)
    try:
        root = ET.fromstring(cleaned)
    except ET.ParseError:
        return None
    is_math = _local_name(root) == "math" or root.tag == f"{{{_MATHML_NS}}}math"
    if not is_math:
        return None
    top_level = list(root)
    content = _try_aligned_or_cases(top_level)
    if content is None:
        content = _emit(_sequence_parts(top_level))
        if not content:
            return None
    return f"<m:oMath>{content}</m:oMath>"
1291
+
1292
+
1293
+ def _latex_to_omml_string(latex: str) -> "str | None":
1294
+ """Convert a LaTeX fragment to an OMML `<m:oMath>` XML string, or None."""
1295
+ latex = latex.strip()
1296
+ if not latex:
1297
+ return None
1298
+ try:
1299
+ import latex2mathml.converter as _l2m
1300
+ except ImportError as exc:
1301
+ raise RuntimeError(
1302
+ "OMML conversion requires 'latex2mathml'; "
1303
+ "install the build-corpus dependencies (pip install -r requirements.txt)"
1304
+ ) from exc
1305
+ try:
1306
+ mathml = _l2m.convert(latex)
1307
+ omml = _mathml_to_omml(mathml)
1308
+ except Exception:
1309
+ return None
1310
+ if not omml or "oMath" not in omml:
1311
+ return None
1312
+ return omml
1313
+
1314
+
1315
def _parse_with_namespaces(omml: str):
    """Parse an OMML string after adding the ``m``/``w`` namespace
    declarations to its root tag.

    The string must begin with a bare ``<m:oMath>`` or ``<m:oMathPara>`` tag;
    otherwise, or if parsing fails, ``None`` is returned.
    """
    decls = nsdecls("m", "w")
    root_tag = next(
        (tag for tag in ("<m:oMath>", "<m:oMathPara>") if omml.startswith(tag)),
        None,
    )
    if root_tag is None:
        return None
    # Re-open the root tag with the declarations spliced in before '>'.
    document = f"{root_tag[:-1]} {decls}>" + omml[len(root_tag):]
    try:
        return parse_xml(document)
    except Exception:
        return None
1328
+
1329
+
1330
def latex_to_omath(latex: str):
    """Return an inline `<m:oMath>` element for ``latex``, or None on failure."""
    omml = _latex_to_omml_string(latex)
    return None if omml is None else _parse_with_namespaces(omml)
1336
+
1337
+
1338
def latex_to_omath_para(latex: str):
    """Return a display `<m:oMathPara>` element for ``latex``, or None.

    A converted string that starts with ``<m:oMath>`` is wrapped in
    ``<m:oMathPara>`` before parsing; anything else is parsed unchanged.
    """
    omml = _latex_to_omml_string(latex)
    if omml is None:
        return None
    wrapped = (
        "<m:oMathPara>" + omml + "</m:oMathPara>"
        if omml.startswith("<m:oMath>")
        else omml
    )
    return _parse_with_namespaces(wrapped)