regen.mde 0.2.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +16 -16
- package/README.md +409 -295
- package/bin/build-corpus-editor.js +83 -81
- package/bin/build-corpus.js +41 -41
- package/bin/postinstall.js +259 -187
- package/bin/regen-mdeditor-install.js +27 -27
- package/bin/regen-mdeditor-uninstall.js +19 -19
- package/bin/validate-katex.js +93 -93
- package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +493 -270
- package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -22
- package/desktop/BuildCorpusEditor/EditorForm.cs +853 -540
- package/desktop/BuildCorpusEditor/Program.cs +85 -81
- package/desktop/BuildCorpusEditor/app.manifest +16 -16
- package/dist/release/regen-mde-0.8.0-win-x64.zip +0 -0
- package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
- package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
- package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
- package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +1 -1
- package/dist/windows-editor/wwwroot/assets/index-C_VxJk4k.js +375 -0
- package/dist/windows-editor/wwwroot/assets/index-Wt9zSjIw.css +1 -0
- package/dist/windows-editor/wwwroot/index.html +22 -22
- package/editor-web/index.html +21 -21
- package/editor-web/src/main.jsx +1044 -399
- package/editor-web/src/styles.css +846 -602
- package/editor-web/vite.config.js +13 -13
- package/examples/build-corpus.config.example.json +21 -21
- package/installer/install-regen-mde.ps1 +214 -175
- package/installer/regen-mde.nsi +81 -81
- package/package.json +10 -6
- package/pyproject.toml +4 -3
- package/requirements.txt +5 -4
- package/scripts/build-windows-editor.ps1 +47 -47
- package/scripts/package-windows-editor.ps1 +90 -90
- package/scripts/release-dual.mjs +105 -0
- package/scripts/run-corpus.ps1 +28 -28
- package/scripts/run-editor-implementation-plane.ps1 +226 -203
- package/scripts/run-required-tests.ps1 +98 -98
- package/scripts/run-smoke.ps1 +28 -28
- package/src/build_corpus/__init__.py +1 -1
- package/src/build_corpus/docx_exporter.py +1055 -798
- package/src/build_corpus/equations.py +1345 -0
- package/src/build_corpus/exporter.py +1488 -1195
- package/src/build_corpus/frontmatter.py +302 -0
- package/src/build_corpus/ppt_exporter.py +543 -532
- package/src/build_corpus/templates/__init__.py +1 -1
- package/src/build_corpus/validate_assets.py +46 -46
- package/tools/audit_corpus.py +203 -203
- package/tools/collect_microsoft_word_templates.py +228 -228
- package/tools/collect_online_docx_corpus.py +272 -272
- package/tools/collect_online_pptx_corpus.py +252 -252
- package/tools/compare_pptx_inputs_outputs.py +87 -87
- package/tools/roundtrip_docx_corpus.py +171 -171
- package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
- package/dist/release/regen.mde-0.2.2-win-x64.zip +0 -0
- package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +0 -326
- package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +0 -1
|
@@ -0,0 +1,1345 @@
|
|
|
1
|
+
"""LaTeX -> OMML (Office Math) conversion for Markdown -> Word export.
|
|
2
|
+
|
|
3
|
+
Word renders equations from OMML (`<m:oMath>`), not from raw text. The previous
|
|
4
|
+
exporter only set the font to "Cambria Math" and emitted the LaTeX source as
|
|
5
|
+
literal characters, so anything with real commands (\\sum, \\Delta, \\rightarrow,
|
|
6
|
+
\\leq) showed up as raw text. This module performs a genuine conversion:
|
|
7
|
+
|
|
8
|
+
LaTeX --(latex2mathml)--> presentation MathML --(owned converter)--> OMML
|
|
9
|
+
|
|
10
|
+
and returns a parsed `<m:oMath>` / `<m:oMathPara>` element ready to splice into a
|
|
11
|
+
python-docx paragraph.
|
|
12
|
+
|
|
13
|
+
OMML is REQUIRED, not optional. If the conversion backends are not importable
|
|
14
|
+
this raises loudly, so a missing dependency can never silently strip OMML from
|
|
15
|
+
the whole document. A *single* equation that fails to convert returns ``None``
|
|
16
|
+
so that one fragment falls back to text while the rest of the document keeps
|
|
17
|
+
real OMML.
|
|
18
|
+
|
|
19
|
+
This converter is owned (pure stdlib + latex2mathml); it does NOT use
|
|
20
|
+
``mathml2omml``. The 113-equation conformance suite in
|
|
21
|
+
``tests/test_omml_suite.py`` is the structural spec the emitted OMML matches.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
from html import escape
|
|
27
|
+
from xml.etree import ElementTree as ET
|
|
28
|
+
|
|
29
|
+
from docx.oxml import parse_xml
|
|
30
|
+
from docx.oxml.ns import nsdecls
|
|
31
|
+
|
|
32
|
+
_MATH_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math"
|
|
33
|
+
_MATHML_NS = "http://www.w3.org/1998/Math/MathML"
|
|
34
|
+
|
|
35
|
+
# N-ary operator characters. Big operators (∑∏⋃⋂…) and integrals (∫∮…) all
|
|
36
|
+
# default in this corpus to limits placed beside the glyph (limLoc=subSup),
|
|
37
|
+
# matching the conformance fixture.
|
|
38
|
+
_NARY_CHARS = {
|
|
39
|
+
"∑", # ∑ summation
|
|
40
|
+
"∏", # ∏ product
|
|
41
|
+
"∐", # ∐ coproduct
|
|
42
|
+
"∫", # ∫ integral
|
|
43
|
+
"∬", # ∬ double integral
|
|
44
|
+
"∭", # ∭ triple integral
|
|
45
|
+
"⨌", # ⨌ quadruple integral
|
|
46
|
+
"∮", # ∮ contour integral
|
|
47
|
+
"∯", # ∯ surface integral
|
|
48
|
+
"∰", # ∰ volume integral
|
|
49
|
+
"⋃", # ⋃ n-ary union
|
|
50
|
+
"⋂", # ⋂ n-ary intersection
|
|
51
|
+
"⋁", # ⋁ n-ary logical or
|
|
52
|
+
"⋀", # ⋀ n-ary logical and
|
|
53
|
+
"⨁", # ⨁ n-ary circled plus
|
|
54
|
+
"⨂", # ⨂ n-ary circled times
|
|
55
|
+
"⨀", # ⨀ n-ary circled dot
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
# Fence/delimiter open->close pairs recognised on bare mrow content.
|
|
59
|
+
_FENCE_PAIRS = {
|
|
60
|
+
"(": ")",
|
|
61
|
+
"[": "]",
|
|
62
|
+
"{": "}",
|
|
63
|
+
"⟨": "⟩", # ⟨ ⟩ angle brackets
|
|
64
|
+
"⌈": "⌉", # ⌈ ⌉ ceiling
|
|
65
|
+
"⌊": "⌋", # ⌊ ⌋ floor
|
|
66
|
+
"|": "|",
|
|
67
|
+
"‖": "‖", # ‖ double vertical bar
|
|
68
|
+
}
|
|
69
|
+
_FENCE_OPENERS = set(_FENCE_PAIRS)
|
|
70
|
+
_FENCE_CLOSERS = set(_FENCE_PAIRS.values())
|
|
71
|
+
|
|
72
|
+
# Accent over/under glyphs (both spacing and combining forms latex2mathml may
|
|
73
|
+
# emit) mapped to the COMBINING mark Word stores on <m:acc><m:chr>.
|
|
74
|
+
_ACCENT_TO_COMBINING = {
|
|
75
|
+
"^": "̂", # circumflex / hat
|
|
76
|
+
"̂": "̂",
|
|
77
|
+
"~": "̃", # tilde
|
|
78
|
+
"̃": "̃",
|
|
79
|
+
"˜": "̃", # small tilde
|
|
80
|
+
"→": "⃗", # vector arrow
|
|
81
|
+
"⃗": "⃗",
|
|
82
|
+
"˙": "̇", # dot above
|
|
83
|
+
"̇": "̇",
|
|
84
|
+
"¨": "̈", # diaeresis / ddot
|
|
85
|
+
"̈": "̈",
|
|
86
|
+
"ˇ": "̌", # caron / check
|
|
87
|
+
"̌": "̌",
|
|
88
|
+
"˘": "̆", # breve
|
|
89
|
+
"̆": "̆",
|
|
90
|
+
"´": "́", # acute
|
|
91
|
+
"́": "́",
|
|
92
|
+
"`": "̀", # grave
|
|
93
|
+
"̀": "̀",
|
|
94
|
+
"´": "́", # acute accent variant
|
|
95
|
+
}
|
|
96
|
+
# Overline / bar (top) glyphs → <m:bar pos="top">.
|
|
97
|
+
_BAR_TOP_CHARS = {
|
|
98
|
+
"¯", # ¯ macron (spacing)
|
|
99
|
+
"̄", # ̄ combining macron
|
|
100
|
+
"̅", # ̅ combining overline
|
|
101
|
+
"‾", # ‾ overline (spacing)
|
|
102
|
+
}
|
|
103
|
+
# Underline glyph (―) is shared by overline; munder vs mover disambiguates.
|
|
104
|
+
_OVERLINE_CHARS = _BAR_TOP_CHARS | {"―"}
|
|
105
|
+
_BAR_BOT_CHARS = {
|
|
106
|
+
"̲", # ̲ combining low line
|
|
107
|
+
"_", # _ low line (spacing)
|
|
108
|
+
"―", # ― horizontal bar (latex2mathml \underline, under context)
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
# Group-character braces.
|
|
112
|
+
_GROUPCHR_TOP = {"⏞"} # over-brace
|
|
113
|
+
_GROUPCHR_BOT = {"⏟"} # under-brace
|
|
114
|
+
|
|
115
|
+
# Arrow glyphs (base of a decorated arrow like \xrightarrow).
|
|
116
|
+
_ARROW_CHARS = {"→", "←", "↔", "⇒", "⇐", "⇔", "↦", "⟶", "⟵", "⟷"}
|
|
117
|
+
|
|
118
|
+
# Known function names latex2mathml renders as a bare <mi>/<mo>. A function
|
|
119
|
+
# application wraps the name in <m:func> with the argument as <m:e>.
|
|
120
|
+
_FUNCTION_NAMES = {
|
|
121
|
+
"sin", "cos", "tan", "cot", "sec", "csc",
|
|
122
|
+
"sinh", "cosh", "tanh", "coth",
|
|
123
|
+
"arcsin", "arccos", "arctan",
|
|
124
|
+
"arsinh", "arcosh", "artanh",
|
|
125
|
+
"log", "ln", "lg", "exp",
|
|
126
|
+
"det", "dim", "hom", "ker", "deg", "gcd", "arg",
|
|
127
|
+
}
|
|
128
|
+
# Operator-name <mo> tokens that take an UNDER limit (limLow) when scripted.
|
|
129
|
+
_LIMIT_OP_NAMES = {"lim", "lim sup", "lim inf", "max", "min", "sup", "inf",
|
|
130
|
+
"limsup", "liminf", "argmax", "argmin"}
|
|
131
|
+
|
|
132
|
+
# Bold-italic / bold math-alphanumeric Unicode blocks → (ascii, style). We map
|
|
133
|
+
# Mathematical Bold and Bold-Italic letters/digits back to their ASCII base and
|
|
134
|
+
# carry the bold style as <m:sty m:val="b">. Other math-alphanumeric variants
|
|
135
|
+
# (blackboard, script/calligraphic) are intentionally LEFT as-is — the fixture
|
|
136
|
+
# keeps ℝ, 𝒩, 𝒳 verbatim.
|
|
137
|
+
# (first code point, last code point, ASCII character the first one maps to)
# for every Mathematical Bold / Bold-Italic range that is folded back to ASCII.
_BOLD_ALNUM_RANGES = (
    (0x1D400, 0x1D419, "A"),  # Mathematical Bold Capital A-Z
    (0x1D41A, 0x1D433, "a"),  # Mathematical Bold Small a-z
    (0x1D468, 0x1D481, "A"),  # Mathematical Bold Italic Capital A-Z
    (0x1D482, 0x1D49B, "a"),  # Mathematical Bold Italic Small a-z
    (0x1D7CE, 0x1D7D7, "0"),  # Mathematical Bold Digit 0-9
)


def _bold_unmap(ch: str) -> "str | None":
    """Return the ASCII base of a Mathematical Bold / Bold-Italic character.

    Args:
        ch: A single character (``ord(ch)`` is applied to it).

    Returns:
        The plain ASCII letter or digit when ``ch`` lies in one of the
        Mathematical Bold or Bold-Italic letter ranges or the Bold digit
        range; ``None`` for every other character. Blackboard, script and
        plain-italic math alphanumerics are deliberately not mapped.
    """
    cp = ord(ch)
    for first, last, ascii_start in _BOLD_ALNUM_RANGES:
        if first <= cp <= last:
            # Offset inside the block equals the offset from the ASCII start.
            return chr(ord(ascii_start) + cp - first)
    return None
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _local_name(node: ET.Element) -> str:
    """Return the element's tag without its ``{namespace}`` prefix.

    A tag that does not start with ``{`` is returned unchanged.
    """
    tag = node.tag
    if not tag.startswith("{"):
        return tag
    # Clark notation: "{uri}local" -> keep what follows the last "}".
    return tag.rsplit("}", 1)[1]
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _node_text(node: ET.Element) -> str:
    """Return all text inside ``node`` (own text, descendants and tails within
    it) concatenated in document order."""
    fragments = node.itertext()
    return "".join(fragments)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
_SCRIPT_TAGS = {"msub", "msup", "msubsup", "munder", "mover", "munderover"}
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# ---------------------------------------------------------------------------
|
|
165
|
+
# Run model — a small intermediate representation so adjacent simple text
|
|
166
|
+
# runs coalesce into ONE <m:r><m:t>…</m:t></m:r>, matching the fixture.
|
|
167
|
+
# ---------------------------------------------------------------------------
|
|
168
|
+
|
|
169
|
+
class _Run:
    """Text run that ``_emit`` may merge with neighbouring runs.

    Attributes are ``text`` (the run's characters) and ``style`` (the value
    written to ``<m:sty m:val=...>``: 'b', 'p', or ``None`` for unstyled).
    """

    __slots__ = ("text", "style")

    def __init__(self, text: str, style: "str | None" = None) -> None:
        self.text, self.style = text, style
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class _Raw:
    """Pre-serialized OMML fragment; ``_emit`` writes it verbatim and never
    merges it with adjacent text runs.

    The single attribute ``xml`` holds the serialized markup.
    """

    __slots__ = ("xml",)

    def __init__(self, xml: str) -> None:
        # The fragment is stored as given; no validation happens here.
        self.xml = xml
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _run_xml(text: str, style: "str | None") -> str:
    """Serialize one OMML run ``<m:r>`` holding ``text``.

    ``text`` is escaped with ``html.escape``. When ``style`` is truthy a
    ``<m:rPr><m:sty .../></m:rPr>`` property block precedes the ``<m:t>``
    element; ``style`` itself is inserted into the attribute as-is.
    """
    run_props = f"<m:rPr><m:sty m:val=\"{style}\"/></m:rPr>" if style else ""
    return f"<m:r>{run_props}<m:t>{escape(text)}</m:t></m:r>"
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _emit(parts: "list") -> str:
    """Serialize ``_Run``/``_Raw`` items to one OMML string.

    Consecutive ``_Run`` items with equal ``style`` are merged into a single
    ``<m:r><m:t>`` run. Any item that is not a ``_Run`` is treated as a
    ``_Raw``: it closes the run being built and its ``xml`` is appended
    verbatim.
    """
    chunks: "list[str]" = []
    # Text pieces of the run currently being built; None means "no open run".
    pending: "list[str] | None" = None
    pending_style: "str | None" = None

    for part in parts:
        if isinstance(part, _Run):
            if pending is not None and part.style == pending_style:
                # Same style as the open run: extend it.
                pending.append(part.text)
                continue
            if pending is not None:
                chunks.append(_run_xml("".join(pending), pending_style))
            pending = [part.text]
            pending_style = part.style
            continue

        # Raw fragment: close the open run first, then copy the markup.
        if pending is not None:
            chunks.append(_run_xml("".join(pending), pending_style))
            pending = None
            pending_style = None
        chunks.append(part.xml)

    if pending is not None:
        chunks.append(_run_xml("".join(pending), pending_style))
    return "".join(chunks)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
# ---------------------------------------------------------------------------
|
|
230
|
+
# Token-level conversion: MathML node -> list of _Run/_Raw parts.
|
|
231
|
+
# ---------------------------------------------------------------------------
|
|
232
|
+
|
|
233
|
+
def _text_token_parts(node: ET.Element) -> "list":
    """Turn a leaf token (mi/mn/mo/mtext) into a one-element run list.

    The run is styled bold ('b') when ``mathvariant`` is ``bold`` or
    ``bold-italic``, or when any character was folded back to ASCII by
    ``_bold_unmap``. The text is then space-normalised and passed through
    ``_GLYPH_ALIASES``.
    """
    source = "".join(node.itertext())
    bold_variant = node.attrib.get("mathvariant", "") in ("bold", "bold-italic")
    # NB: ``mathvariant="normal"`` is NOT auto-styled upright here. Upright
    # runs are only marked in specific positions handled elsewhere (see
    # _sup_omml); styling every such run would over-apply <m:sty p>.

    folded_any = False
    chars = []
    for glyph in source:
        plain = _bold_unmap(glyph)
        if plain is None:
            chars.append(glyph)
        else:
            chars.append(plain)
            folded_any = True

    text = _normalize_spaces("".join(chars)).translate(_GLYPH_ALIASES)
    style = "b" if (bold_variant or folded_any) else None
    return [_Run(text, style)]
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
# Operator glyph aliases: latex2mathml renders some relations with an ASCII
|
|
263
|
+
# code point; the fixture uses the proper Unicode math symbol.
|
|
264
|
+
_GLYPH_ALIASES = {
|
|
265
|
+
ord("~"): "∼", # \sim → U+223C
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
# Unicode spaces latex2mathml emits inside operator names / \text → normalise to
|
|
270
|
+
# a regular ASCII space so "lim sup" compares as "lim sup", etc.
|
|
271
|
+
_SPACE_CHARS = "\xa0 "
|
|
272
|
+
_SPACE_TABLE = {ord(c): " " for c in _SPACE_CHARS}
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def _normalize_spaces(text: str) -> str:
    """Replace every character listed in ``_SPACE_CHARS`` with an ASCII space
    (one-pass ``str.translate`` through ``_SPACE_TABLE``)."""
    normalized = text.translate(_SPACE_TABLE)
    return normalized
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def _node_parts(node: ET.Element) -> "list":
    """Translate one MathML element into a list of ``_Run``/``_Raw`` parts.

    Dispatches on the element's local tag name. Container elements are
    converted as a child sequence, leaf tokens become text runs, and
    structural elements (scripts, fractions, radicals, accents, tables, ...)
    become a single pre-serialized ``_Raw`` fragment. Unknown elements fall
    back to their children, or to their text when they have none.
    """
    tag = _local_name(node)

    # Transparent containers: convert the children as one sequence.
    if tag in ("math", "mrow", "semantics", "mstyle", "mpadded"):
        return _sequence_parts(list(node))

    # Leaf tokens.
    if tag in ("mi", "mn", "mo", "mtext"):
        return _text_token_parts(node)

    # Math spacing commands and empty script slots carry no run content.
    # (Row-break newlines are consumed upstream.)
    if tag in ("mspace", "none"):
        return []

    if tag == "mphantom":
        hidden = _emit(_sequence_parts(list(node)))
        return [_Raw(f"<m:phant><m:phantPr/><m:e>{hidden}</m:e></m:phant>")]

    if tag == "mfenced":
        return [_Raw(_mfenced_to_omml(node))]
    if tag == "mmultiscripts":
        return [_Raw(_mmultiscripts_to_omml(node))]
    if tag == "menclose":
        return [_Raw(_menclose_to_omml(node))]

    if tag == "msup":
        kids = list(node)
        if len(kids) > 1:
            # A superscript made only of prime glyphs is folded into the text
            # run after the base instead of becoming a real superscript.
            primes = _node_text(kids[1]).strip()
            if primes and all(ch in "′″‴⁗" for ch in primes):
                return _node_parts(kids[0]) + [_Run(primes)]
        base_xml = _child_omml(node, 0)
        sup_xml = _sup_omml(node, 1)
        return [_Raw(
            f"<m:sSup><m:e>{base_xml}</m:e><m:sup>{sup_xml}</m:sup></m:sSup>"
        )]

    if tag == "msub":
        base_xml = _child_omml(node, 0)
        sub_xml = _child_omml(node, 1)
        return [_Raw(
            f"<m:sSub><m:e>{base_xml}</m:e><m:sub>{sub_xml}</m:sub></m:sSub>"
        )]

    if tag == "msubsup":
        base_xml = _child_omml(node, 0)
        sub_xml = _child_omml(node, 1)
        sup_xml = _child_omml(node, 2)
        return [_Raw(
            f"<m:sSubSup><m:e>{base_xml}</m:e>"
            f"<m:sub>{sub_xml}</m:sub>"
            f"<m:sup>{sup_xml}</m:sup></m:sSubSup>"
        )]

    if tag == "mfrac":
        numerator = _child_omml(node, 0)
        denominator = _child_omml(node, 1)
        return [_Raw(
            f"<m:f><m:num>{numerator}</m:num><m:den>{denominator}</m:den></m:f>"
        )]

    if tag == "msqrt":
        radicand = _emit(_sequence_parts(list(node)))
        return [_Raw(
            "<m:rad><m:radPr><m:degHide m:val=\"1\"/></m:radPr>"
            f"<m:deg/><m:e>{radicand}</m:e></m:rad>"
        )]

    if tag == "mroot":
        # Child 1 is written as the degree, child 0 as the radicand.
        degree = _child_omml(node, 1)
        radicand = _child_omml(node, 0)
        return [_Raw(
            "<m:rad><m:radPr><m:degHide m:val=\"0\"/></m:radPr>"
            f"<m:deg>{degree}</m:deg><m:e>{radicand}</m:e></m:rad>"
        )]

    if tag == "mover":
        arrow = _decorated_arrow_text(node)
        if arrow is None:
            return [_Raw(_mover_to_omml(node))]
        # A plain run (not _Raw) so the arrow can merge with adjacent text.
        return [_Run(arrow)]

    if tag == "munder":
        return [_Raw(_munder_to_omml(node))]
    if tag == "munderover":
        return [_Raw(_munderover_to_omml(node))]
    if tag == "mtable":
        return [_Raw(_mtable_to_eqarr(node))]

    # Unknown element: use its children, else its text content.
    descended = _sequence_parts(list(node))
    if descended:
        return descended
    return [_Run("".join(node.itertext()))]
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def _child_omml(node: ET.Element, index: int) -> str:
    """Return the serialized OMML of ``node``'s child at ``index``, or an
    empty string when ``index`` is at or beyond the number of children."""
    kids = list(node)
    return _emit(_node_parts(kids[index])) if index < len(kids) else ""
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def _sup_omml(node: ET.Element, index: int) -> str:
    """Serialize the superscript child at ``index``.

    When the superscript converts to exactly one unstyled run holding a single
    uppercase ASCII letter, that run is re-styled upright ('p') - e.g. the
    transpose in ``A^{T}``. Returns an empty string when the child is missing.
    """
    kids = list(node)
    if index >= len(kids):
        return ""
    parts = _node_parts(kids[index])
    if len(parts) == 1:
        only = parts[0]
        lone_capital = (
            isinstance(only, _Run)
            and only.style is None
            and len(only.text) == 1
            and only.text.isascii()
            and only.text.isupper()
        )
        if lone_capital:
            parts = [_Run(only.text, "p")]
    return _emit(parts)
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
# ---------------------------------------------------------------------------
|
|
393
|
+
# Group character (overbrace / underbrace) — mover/munder with a brace glyph.
|
|
394
|
+
# ---------------------------------------------------------------------------
|
|
395
|
+
|
|
396
|
+
def _try_groupchr(node: ET.Element) -> "str | None":
    """Return ``<m:groupChr>`` markup for an over-/under-brace, else ``None``.

    ``node`` must be an ``mover``/``munder`` with at least two children. If its
    second child is the brace glyph (⏞ or ⏟) the first child becomes the
    grouped body. If the second child is anything else it is treated as a
    label on an inner brace: the label is dropped and the first child is
    examined recursively.
    """
    if _local_name(node) not in ("mover", "munder"):
        return None
    kids = list(node)
    if len(kids) < 2:
        return None

    base = kids[0]
    glyph = _node_text(kids[1]).strip()
    is_top = glyph in _GROUPCHR_TOP
    if not is_top and glyph not in _GROUPCHR_BOT:
        # e.g. an outer label wrapping the brace: only the inner brace counts.
        return _try_groupchr(base)

    chr_val, pos, vert = ("⏞", "top", "bot") if is_top else ("⏟", "bot", "top")
    body = _emit(_node_parts(base))
    pieces = [
        "<m:groupChr><m:groupChrPr>",
        f'<m:chr m:val="{escape(chr_val)}"/>',
        f'<m:pos m:val="{pos}"/>',
        f'<m:vertJc m:val="{vert}"/>',
        "</m:groupChrPr>",
        f"<m:e>{body}</m:e></m:groupChr>",
    ]
    return "".join(pieces)
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def _decorated_arrow_text(node: ET.Element) -> "str | None":
    """Return the bare arrow glyph of a labelled arrow, else ``None``.

    The node qualifies when it has at least two children, the first child's
    text is one of ``_ARROW_CHARS``, and the second child's text is neither an
    accent glyph nor an overline glyph. The label (second child) is dropped.
    """
    kids = list(node)
    if len(kids) < 2:
        return None
    base_txt = _node_text(kids[0]).strip()
    label_txt = _node_text(kids[1]).strip()
    if base_txt not in _ARROW_CHARS:
        return None
    # An accent or bar over an arrow is a real accent, not a decoration label.
    if label_txt in _ACCENT_TO_COMBINING or label_txt in _OVERLINE_CHARS:
        return None
    return base_txt
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def _mover_to_omml(node: ET.Element) -> str:
    """Serialize an ``mover`` element.

    Tried in order: over-brace group character, top bar (when the over glyph
    is in ``_OVERLINE_CHARS``), otherwise an accent ``<m:acc>`` whose character
    is looked up in ``_ACCENT_TO_COMBINING`` (unknown glyphs are used as-is).
    """
    brace_xml = _try_groupchr(node)
    if brace_xml is not None:
        return brace_xml

    kids = list(node)
    # With no over child at all, fall back to a hat.
    over_char = "^" if len(kids) <= 1 else _node_text(kids[1]).strip()
    base_xml = _child_omml(node, 0)

    if over_char in _OVERLINE_CHARS:
        return (
            "<m:bar><m:barPr><m:pos m:val=\"top\"/></m:barPr>"
            f"<m:e>{base_xml}</m:e></m:bar>"
        )

    # "\u0302" is COMBINING CIRCUMFLEX ACCENT, used when the over text is empty.
    fallback = over_char or "\u0302"
    chr_val = _ACCENT_TO_COMBINING.get(over_char, fallback)
    return (
        f'<m:acc><m:accPr><m:chr m:val="{escape(chr_val)}"/></m:accPr>'
        f"<m:e>{base_xml}</m:e></m:acc>"
    )
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def _munder_to_omml(node: ET.Element) -> str:
    """Serialize a ``munder`` element.

    Tried in order: under-brace group character, bottom bar (when the under
    glyph is in ``_BAR_BOT_CHARS``), otherwise a lower limit ``<m:limLow>``.
    """
    brace_xml = _try_groupchr(node)
    if brace_xml is not None:
        return brace_xml

    kids = list(node)
    under_char = "" if len(kids) <= 1 else _node_text(kids[1]).strip()
    base_xml = _child_omml(node, 0)

    if under_char in _BAR_BOT_CHARS:
        return (
            "<m:bar><m:barPr><m:pos m:val=\"bot\"/></m:barPr>"
            f"<m:e>{base_xml}</m:e></m:bar>"
        )

    # Anything else under the base is emitted as a lower limit.
    limit_xml = _child_omml(node, 1)
    return f"<m:limLow><m:e>{base_xml}</m:e><m:lim>{limit_xml}</m:lim></m:limLow>"
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def _munderover_to_omml(node: ET.Element) -> str:
    """Serialize ``munderover`` as an upper limit wrapped around a lower limit:
    child 0 is the base, child 1 the lower limit, child 2 the upper limit."""
    base_xml = _child_omml(node, 0)
    lower_xml = _child_omml(node, 1)
    upper_xml = _child_omml(node, 2)
    inner = f"<m:limLow><m:e>{base_xml}</m:e><m:lim>{lower_xml}</m:lim></m:limLow>"
    return f"<m:limUpp><m:e>{inner}</m:e><m:lim>{upper_xml}</m:lim></m:limUpp>"
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
# ---------------------------------------------------------------------------
|
|
494
|
+
# N-ary operators.
|
|
495
|
+
# ---------------------------------------------------------------------------
|
|
496
|
+
|
|
497
|
+
def _is_nary_operator_node(node: ET.Element) -> "str | None":
    """Return the operator glyph when ``node`` is an ``<mo>`` whose stripped
    text is in ``_NARY_CHARS``; otherwise ``None``."""
    if _local_name(node) == "mo":
        glyph = _node_text(node).strip()
        if glyph in _NARY_CHARS:
            return glyph
    return None
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
def _script_base(node: ET.Element):
    """Return the first child of ``node`` (the base of a script element), or
    ``None`` when the element has no children."""
    for first in node:
        return first
    return None
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
def _nary_pr(base_char: str, has_sub: bool, has_sup: bool) -> str:
    """Build the ``<m:naryPr>`` property block for an n-ary operator.

    The operator character (escaped) and ``limLoc=subSup`` are always written.
    ``subHide`` / ``supHide`` are added only for a limit that is absent.
    """
    props = f'<m:chr m:val="{escape(base_char)}"/><m:limLoc m:val="subSup"/>'
    if not has_sub:
        props += '<m:subHide m:val="1"/>'
    if not has_sup:
        props += '<m:supHide m:val="1"/>'
    return f"<m:naryPr>{props}</m:naryPr>"
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
def _nary_from_script(node: ET.Element, base_char: str, body: str) -> str:
    """Build ``<m:nary>`` from a script element whose base is an n-ary operator.

    Args:
        node: The script element (msub/munder, msup/mover, msubsup/munderover).
        base_char: The operator glyph written to ``<m:chr>``.
        body: Already-serialized OMML for the operand (``<m:e>``).

    A limit that is missing or serializes to an empty string is written as an
    empty element and flagged hidden in the property block.
    """
    tag = _local_name(node)
    kids = list(node)

    def slot(idx: int) -> str:
        # Serialized child at idx, or "" when the script has no such child.
        return _emit(_node_parts(kids[idx])) if len(kids) > idx else ""

    lower = ""
    upper = ""
    if tag in ("msub", "munder"):
        lower = slot(1)
    elif tag in ("msup", "mover"):
        upper = slot(1)
    elif tag in ("msubsup", "munderover"):
        lower = slot(1)
        upper = slot(2)

    pr = _nary_pr(base_char, bool(lower), bool(upper))
    sub_block = "<m:sub/>" if not lower else f"<m:sub>{lower}</m:sub>"
    sup_block = "<m:sup/>" if not upper else f"<m:sup>{upper}</m:sup>"
    return f"<m:nary>{pr}{sub_block}{sup_block}<m:e>{body}</m:e></m:nary>"
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
# ---------------------------------------------------------------------------
|
|
541
|
+
# Limit operators (lim / limsup / max …) scripted via msub/munder.
|
|
542
|
+
# ---------------------------------------------------------------------------
|
|
543
|
+
|
|
544
|
+
def _is_limit_op_script(node: ET.Element) -> "ET.Element | None":
    """Return the base element when ``node`` is an msub/munder whose base is an
    ``<mo>``/``<mi>`` naming a limit operator (``_LIMIT_OP_NAMES``); else
    ``None``. The base text is space-normalised and stripped before lookup."""
    if _local_name(node) not in ("msub", "munder"):
        return None
    base = _script_base(node)
    if base is None or _local_name(base) not in ("mo", "mi"):
        return None
    label = _normalize_spaces(_node_text(base)).strip()
    return base if label in _LIMIT_OP_NAMES else None
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
def _limit_low_from_script(node: ET.Element) -> str:
    """Serialize a scripted limit operator as ``<m:limLow>``: child 0 is the
    operator, child 1 (when present) the limit expression.

    The element must have at least one child; indexing child 0 is unguarded.
    """
    kids = list(node)
    operator_xml = _emit(_node_parts(kids[0]))
    limit_xml = "" if len(kids) <= 1 else _emit(_node_parts(kids[1]))
    return f"<m:limLow><m:e>{operator_xml}</m:e><m:lim>{limit_xml}</m:lim></m:limLow>"
|
|
567
|
+
|
|
568
|
+
|
|
569
|
+
# ---------------------------------------------------------------------------
|
|
570
|
+
# Function detection (\sin x, \log_2 n, \ln(x+1), …).
|
|
571
|
+
# ---------------------------------------------------------------------------
|
|
572
|
+
|
|
573
|
+
def _function_name_node(node: ET.Element) -> "str | None":
    """Return the function name carried by ``node``, else ``None``.

    ``node`` qualifies when it is an ``<mi>``/``<mo>`` whose stripped text is in
    ``_FUNCTION_NAMES``, or an ``<msub>`` whose base is such a token.
    """
    tag = _local_name(node)
    if tag in ("mi", "mo"):
        candidate = node
    elif tag == "msub":
        candidate = _script_base(node)
        if candidate is None or _local_name(candidate) not in ("mi", "mo"):
            return None
    else:
        return None
    label = _node_text(candidate).strip()
    return label if label in _FUNCTION_NAMES else None
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
def _func_apply(name_xml: str, arg_xml: str) -> str:
    """Wrap an already-serialized function name and argument in ``<m:func>``.

    Both inputs are inserted verbatim (no escaping is applied here).
    """
    name_block = f"<m:fName>{name_xml}</m:fName>"
    arg_block = f"<m:e>{arg_xml}</m:e>"
    return "<m:func><m:funcPr/>" + name_block + arg_block + "</m:func>"
|
|
593
|
+
|
|
594
|
+
|
|
595
|
+
# ---------------------------------------------------------------------------
|
|
596
|
+
# Sequence conversion — the heart of run coalescing + lookahead constructs.
|
|
597
|
+
# ---------------------------------------------------------------------------
|
|
598
|
+
|
|
599
|
+
def _sequence_parts(children: "list") -> "list":
    """Convert a flat list of sibling MathML nodes to ``_Run``/``_Raw`` parts.

    Each position is tested against the following constructs, in this order,
    and the first match consumes one or more siblings:

      1. whole-row bare fence                -> <m:d> (checked once, up front)
      2. empty-base prescript pattern        -> <m:sPre>
      3. scripted n-ary operator + body      -> <m:nary>
      4. scripted limit operator (lim, ...)  -> <m:limLow>, no argument capture
      5. scripted function name + argument   -> <m:func> (msub) or
                                                <m:sSup> around <m:func> (msup)
      6. bare n-ary operator + body          -> <m:nary>
      7. function name + argument            -> <m:func>
      8. single-letter head + "(...)"        -> <m:func>, parentheses stripped
      9. "(...)" right after an operand      -> <m:d>
     10. anything else                       -> _node_parts of that node
    """
    whole_row_fence = _try_bare_fence(children)
    if whole_row_fence is not None:
        return [_Raw(whole_row_fence)]

    out: "list" = []
    total = len(children)
    pos = 0
    while pos < total:
        node = children[pos]
        tag = _local_name(node)

        # Prescript: a script with an EMPTY base followed by the real base.
        prescript = _try_prescript(children, pos)
        if prescript is not None:
            pre_xml, used = prescript
            out.append(_Raw(pre_xml))
            pos += used
            continue

        if tag in _SCRIPT_TAGS:
            script_base = _script_base(node)

            # Scripted n-ary operator: the following siblings form its body.
            nary_char = (
                None if script_base is None
                else _is_nary_operator_node(script_base)
            )
            if nary_char:
                operand_nodes, pos_after = _capture_body(children, pos + 1)
                operand_xml = _emit(_sequence_parts(operand_nodes))
                out.append(_Raw(_nary_from_script(node, nary_char, operand_xml)))
                pos = pos_after
                continue

            # Scripted limit operator: emitted alone, nothing is captured.
            if _is_limit_op_script(node) is not None:
                out.append(_Raw(_limit_low_from_script(node)))
                pos += 1
                continue

            # Scripted function name.
            if (tag in ("msup", "msub") and script_base is not None
                    and _function_name_node(script_base) is not None):
                script_kids = list(node)
                head_xml = _emit(_node_parts(script_base))
                script_xml = (
                    _emit(_node_parts(script_kids[1]))
                    if len(script_kids) > 1 else ""
                )
                arg_nodes, pos_after = _capture_func_arg(children, pos + 1)
                arg_xml = _emit(_sequence_parts(arg_nodes))
                if tag == "msub":
                    # Subscripted NAME: the whole sSub is the function name
                    # and the following operand is its argument.
                    name_xml = (
                        f"<m:sSub><m:e>{head_xml}</m:e>"
                        f"<m:sub>{script_xml}</m:sub></m:sSub>"
                    )
                    out.append(_Raw(_func_apply(name_xml, arg_xml)))
                else:
                    # Superscripted function: the power applies to the result.
                    func_xml = _func_apply(head_xml, arg_xml)
                    out.append(_Raw(
                        f"<m:sSup><m:e>{func_xml}</m:e>"
                        f"<m:sup>{script_xml}</m:sup></m:sSup>"
                    ))
                pos = pos_after
                continue

        # Bare n-ary operator (no script container): both limits are absent.
        plain_nary = _is_nary_operator_node(node)
        if plain_nary:
            operand_nodes, pos_after = _capture_body(children, pos + 1)
            operand_xml = _emit(_sequence_parts(operand_nodes))
            nary_props = _nary_pr(plain_nary, False, False)
            out.append(_Raw(
                f"<m:nary>{nary_props}<m:sub/><m:sup/>"
                f"<m:e>{operand_xml}</m:e></m:nary>"
            ))
            pos = pos_after
            continue

        # Function name token: only becomes <m:func> when an argument follows;
        # otherwise the later rules (and finally the default) handle it.
        if _function_name_node(node) is not None:
            name_xml = _emit(_node_parts(node))
            arg_nodes, pos_after = _capture_func_arg(children, pos + 1)
            if arg_nodes:
                out.append(_Raw(_func_apply(
                    name_xml, _emit(_sequence_parts(arg_nodes)))))
                pos = pos_after
                continue

        # Implicit application: callable head directly followed by "(...)".
        # The parentheses are stripped; their content is the argument.
        if _is_operand_callable(node) and _next_is_paren_group(children, pos + 1):
            name_xml = _emit(_node_parts(node))
            inner_nodes, pos_after = _capture_paren_inner(children, pos + 1)
            out.append(_Raw(_func_apply(
                name_xml, _emit(_sequence_parts(inner_nodes)))))
            pos = pos_after
            continue

        # Inline fence: "(...)" right after an identifier / scripted operand is
        # wrapped in <m:d>. Parentheses after an operator stay literal text.
        fence_group = None
        if pos > 0 and _local_name(children[pos - 1]) in (
                "mi", "msub", "msup", "msubsup", "mrow", "mover", "munder"):
            fence_group = _inline_fence_group(children, pos)
        if fence_group is not None:
            fence_xml, pos_after = fence_group
            out.append(_Raw(fence_xml))
            pos = pos_after
            continue

        # Default: plain token or structural node.
        out.extend(_node_parts(node))
        pos += 1
    return out
|
|
728
|
+
|
|
729
|
+
|
|
730
|
+
def _inline_fence_group(children: "list", i: int) -> "tuple[str, int] | None":
    """Wrap the parenthesised group that opens at ``children[i]`` in ``<m:d>``.

    Returns ``(xml, index_after_close)`` when ``children[i]`` is an ``mo``
    holding ``(`` and a balancing ``)`` exists later in ``children``; returns
    ``None`` in every other case.
    """
    opener = children[i]
    if _local_name(opener) != "mo":
        return None
    open_ch = _node_text(opener).strip()
    # Parentheses are the only fence wrapped inline: brackets, braces and bars
    # occur in the corpus as literal text (limits, sets), so wrapping them here
    # would over-fire. Whole-row fences are handled elsewhere.
    if open_ch != "(":
        return None
    close_ch = _FENCE_PAIRS[open_ch]
    nesting = 0
    for pos in range(i, len(children)):
        candidate = children[pos]
        if _local_name(candidate) != "mo":
            continue
        glyph = _node_text(candidate).strip()
        if glyph == open_ch:
            nesting += 1
            continue
        if glyph != close_ch:
            continue
        nesting -= 1
        if nesting == 0:
            # Everything strictly between the two parentheses is the content.
            inner_xml = _emit(_sequence_parts(children[i + 1:pos]))
            return _delimiter(open_ch, close_ch, [inner_xml]), pos + 1
    # Ran off the end without balancing: leave the tokens to the caller.
    return None
|
|
759
|
+
|
|
760
|
+
|
|
761
|
+
def _is_operand_callable(node: ET.Element) -> bool:
    """Tell whether ``node`` can head an implicit call such as ``f(x)``.

    Only a bare ``mi`` whose stripped text is exactly one alphabetic character
    qualifies; every other element, scripted operands included, is rejected.
    """
    if _local_name(node) != "mi":
        return False
    label = _node_text(node).strip()
    return len(label) == 1 and label.isalpha()
|
|
769
|
+
|
|
770
|
+
|
|
771
|
+
def _capture_paren_inner(children: "list", start: int) -> "tuple[list, int]":
    """Collect the nodes inside the next ``(`` ... ``)`` group, parens dropped.

    Leading ``mspace`` nodes are skipped first. Returns
    ``(inner_nodes, index_after_close)``. If the next node is not an ``mo``
    holding ``(``, that single node is returned as the argument; if nothing is
    left, the result is ``([], len(children))``. An unbalanced group yields
    everything after the opening parenthesis.
    """
    total = len(children)
    pos = start
    while pos < total and _local_name(children[pos]) == "mspace":
        pos += 1
    if pos >= total:
        return [], total
    head = children[pos]
    if _local_name(head) != "mo" or _node_text(head).strip() != "(":
        # Not a literal parenthesis: hand back the node itself, unmodified.
        return [head], pos + 1
    nesting = 0
    for idx in range(pos, total):
        candidate = children[idx]
        if _local_name(candidate) != "mo":
            continue
        glyph = _node_text(candidate).strip()
        if glyph == "(":
            nesting += 1
        elif glyph == ")":
            nesting -= 1
            if nesting == 0:
                return children[pos + 1:idx], idx + 1
    return children[pos + 1:], total
|
|
794
|
+
|
|
795
|
+
|
|
796
|
+
def _next_is_paren_group(children: "list", start: int) -> bool:
    """Tell whether the next non-``mspace`` node at or after ``start`` opens a
    parenthesised group: an ``mo`` holding ``(`` or an ``mfenced`` element."""
    total = len(children)
    pos = start
    while pos < total and _local_name(children[pos]) == "mspace":
        pos += 1
    if pos >= total:
        return False
    candidate = children[pos]
    kind = _local_name(candidate)
    if kind == "mfenced":
        return True
    return kind == "mo" and _node_text(candidate).strip() == "("
|
|
808
|
+
|
|
809
|
+
|
|
810
|
+
# Relation glyphs, grouped per line: equality/similarity, ordering, arrows,
# set membership/inclusion. _capture_body stops collecting an n-ary body at
# the first ``mo``/``mi`` whose text is one of these.
_RELATION_OPS = {
    "=", "≠", "≈", "≡", "∼", "≅", "≃", "∝",
    "<", ">", "≤", "≥", "≪", "≫",
    "→", "←", "↔", "⇒", "⇐", "⇔", "↦", "⟶",
    "∈", "∉", "⊂", "⊆", "⊃", "⊇",
}
|
|
816
|
+
|
|
817
|
+
|
|
818
|
+
def _capture_body(children: "list", start: int) -> "tuple[list, int]":
    """Collect the body (summand/integrand) of an n-ary operator.

    Nodes are taken from ``start`` up to, but not including, the first
    ``mo``/``mi`` whose alias-normalised text is in ``_RELATION_OPS``; nothing
    else ends the body, so a directly following n-ary operator is part of it.
    Returns ``(body_nodes, index_where_scanning_stopped)``.
    """
    end = start
    total = len(children)
    while end < total:
        candidate = children[end]
        if _local_name(candidate) in ("mo", "mi"):
            glyph = _node_text(candidate).strip().translate(_GLYPH_ALIASES)
            if glyph in _RELATION_OPS:
                # A relation starts a new clause; the operand ends here.
                break
        end += 1
    return list(children[start:end]), end
|
|
837
|
+
|
|
838
|
+
|
|
839
|
+
def _capture_func_arg(children: "list", start: int) -> "tuple[list, int]":
    """Collect the argument of a function application starting at ``start``.

    Leading ``mspace`` nodes are skipped. The argument is then either

    * a balanced fence group, returned WITH its opening and closing ``mo``
      nodes, when the next node is an ``mo`` listed in ``_FENCE_OPENERS``; or
    * the single next node (also the fallback for an unbalanced fence).

    The capture is kept tight on purpose so trailing terms stay outside the
    function. Returns ``(arg_nodes, next_index)``; ``([], index)`` when no
    node is left.
    """
    total = len(children)
    pos = start
    while pos < total and _local_name(children[pos]) == "mspace":
        pos += 1
    if pos >= total:
        return [], pos
    head = children[pos]
    if _local_name(head) == "mo":
        opener = _node_text(head).strip()
        if opener in _FENCE_OPENERS:
            closer = _FENCE_PAIRS[opener]
            nesting = 0
            for idx in range(pos, total):
                candidate = children[idx]
                if _local_name(candidate) != "mo":
                    continue
                glyph = _node_text(candidate).strip()
                if glyph == opener:
                    nesting += 1
                elif glyph == closer:
                    nesting -= 1
                    if nesting == 0:
                        return children[pos:idx + 1], idx + 1
            # Unbalanced fence: fall back to the single-node case below.
    # A self-contained group node (mfenced / mrow) or any other operand.
    return [head], pos + 1
|
|
883
|
+
|
|
884
|
+
|
|
885
|
+
def _try_prescript(children: "list", i: int) -> "tuple[str, int] | None":
    """Recognise an empty-base script followed by its real base operand.

    The pattern is an ``msubsup``/``msup``/``msub`` at ``children[i]`` whose
    first child has neither text nor children, with at least one more node
    after it. That following node becomes the base of an ``<m:sPre>`` and is
    rendered upright. Returns ``(xml, 2)`` on a match, otherwise ``None``.
    """
    node = children[i]
    kind = _local_name(node)
    if kind not in ("msubsup", "msup", "msub"):
        return None
    kids = list(node)
    if not kids:
        return None
    placeholder = kids[0]
    # The script base must be empty: no text and no child elements.
    if _node_text(placeholder).strip() != "" or list(placeholder):
        return None
    # Without a following operand there is nothing to attach the scripts to.
    if i + 1 >= len(children):
        return None

    def script_xml(slot: int) -> str:
        return _emit(_node_parts(kids[slot])) if len(kids) > slot else ""

    if kind == "msubsup":
        sub_xml, sup_xml = script_xml(1), script_xml(2)
    elif kind == "msub":
        sub_xml, sup_xml = script_xml(1), ""
    else:  # msup
        sub_xml, sup_xml = "", script_xml(1)

    base_xml = _emit(_coerce_upright(_node_parts(children[i + 1])))
    sub_block = f"<m:sub>{sub_xml}</m:sub>" if sub_xml else "<m:sub/>"
    sup_block = f"<m:sup>{sup_xml}</m:sup>" if sup_xml else "<m:sup/>"
    prescript = "".join((
        "<m:sPre><m:sPrePr/>",
        sub_block,
        sup_block,
        f"<m:e>{base_xml}</m:e></m:sPre>",
    ))
    return prescript, 2
|
|
923
|
+
|
|
924
|
+
|
|
925
|
+
def _coerce_upright(parts: "list") -> "list":
    """Return a copy of ``parts`` in which every ``_Run`` without a style is
    replaced by an equivalent run styled ``"p"`` (upright).

    Used for the base of a prescript. Items that are not ``_Run`` instances,
    or that already carry a style, are passed through unchanged.
    """
    return [
        _Run(item.text, "p")
        if isinstance(item, _Run) and item.style is None
        else item
        for item in parts
    ]
|
|
936
|
+
|
|
937
|
+
|
|
938
|
+
# ---------------------------------------------------------------------------
|
|
939
|
+
# Delimiters / fences.
|
|
940
|
+
# ---------------------------------------------------------------------------
|
|
941
|
+
|
|
942
|
+
def _delimiter(begin: str, end: str, inner_es: "list[str]", sep: "str | None" = None) -> str:
|
|
943
|
+
pr_parts = [f'<m:begChr m:val="{escape(begin)}"/>', f'<m:endChr m:val="{escape(end)}"/>']
|
|
944
|
+
if sep is not None:
|
|
945
|
+
pr_parts.append(f'<m:sepChr m:val="{escape(sep)}"/>')
|
|
946
|
+
pr = "<m:dPr>" + "".join(pr_parts) + "</m:dPr>"
|
|
947
|
+
es = "".join(f"<m:e>{e}</m:e>" for e in inner_es) or "<m:e/>"
|
|
948
|
+
return f"<m:d>{pr}{es}</m:d>"
|
|
949
|
+
|
|
950
|
+
|
|
951
|
+
def _is_fence_mo(node: ET.Element) -> bool:
    """Tell whether ``node`` is an ``mo`` acting as a fence: either flagged
    ``fence="true"`` or holding a known opening/closing fence character."""
    if _local_name(node) != "mo":
        return False
    if node.attrib.get("fence") == "true":
        return True
    glyph = _node_text(node).strip()
    return glyph in _FENCE_OPENERS or glyph in _FENCE_CLOSERS
|
|
957
|
+
|
|
958
|
+
|
|
959
|
+
def _try_bare_fence(children: "list") -> "str | None":
    """Wrap a row that starts and ends with fence ``mo`` nodes in ``<m:d>``.

    The first node must hold a character from ``_FENCE_OPENERS`` and the last
    one a character from ``_FENCE_CLOSERS``; the two need not be a matching
    pair, so asymmetric fences are accepted. Returns ``None`` when the row
    does not have that shape.

    Middle bars (``mo`` with ``fence="true"``, an ``lspace`` attribute and
    text ``|`` or ``‖``) split the content: segments at odd positions are
    wrapped in their own bar-delimited ``<m:d>``, the others stay plain, and
    everything is joined into one ``<m:e>`` of the outer delimiter.
    """
    if len(children) < 2:
        return None
    first, last = children[0], children[-1]
    if _local_name(first) != "mo" or _local_name(last) != "mo":
        return None
    open_ch = _node_text(first).strip()
    close_ch = _node_text(last).strip()
    if open_ch not in _FENCE_OPENERS:
        return None
    if close_ch not in _FENCE_CLOSERS:
        return None

    def middle_bar(node) -> "str | None":
        """Return the bar character if ``node`` is a middle bar, else None."""
        if _local_name(node) != "mo":
            return None
        if node.attrib.get("fence") != "true":
            return None
        if node.attrib.get("lspace") is None:
            return None
        glyph = _node_text(node).strip()
        return glyph if glyph in ("|", "‖") else None

    inner = children[1:-1]
    segments: "list[list]" = [[]]
    bars: "list[str]" = []
    for node in inner:
        bar = middle_bar(node)
        if bar is None:
            segments[-1].append(node)
        else:
            bars.append(bar)
            segments.append([])

    if not bars:
        # Plain fence: the whole content is one <m:e>; commas stay literal.
        return _delimiter(open_ch, close_ch, [_emit(_sequence_parts(inner))])

    # With bars b0, b1, ...: segment k (odd) sits right after bar k-1 and is
    # the one that gets wrapped, e.g. "a | X | c" -> a, <m:d>|X|</m:d>, c.
    pieces: "list[str]" = []
    for k, segment in enumerate(segments):
        segment_xml = _emit(_sequence_parts(segment))
        if k % 2 == 1:
            pieces.append(_delimiter(bars[k - 1], bars[k - 1], [segment_xml]))
        else:
            pieces.append(segment_xml)
    return _delimiter(open_ch, close_ch, ["".join(pieces)])
|
|
1014
|
+
|
|
1015
|
+
|
|
1016
|
+
def _mfenced_to_omml(node: ET.Element) -> str:
    """Convert an ``mfenced`` element to an OMML ``<m:d>`` delimiter.

    ``open``/``close`` default to parentheses and ``separators`` to a comma.
    Each child becomes its own ``<m:e>`` cell. A separator is only written
    when there is more than one cell; OMML has a single separator character,
    so the first one listed is used.
    """
    open_ch = node.attrib.get("open", "(")
    close_ch = node.attrib.get("close", ")")
    # The attribute is a sequence of separator characters in which whitespace
    # is insignificant, so drop it before picking the first character;
    # otherwise separators=" ; " would yield a blank separator.
    seps = "".join(node.attrib.get("separators", ",").split())
    es = [_emit(_node_parts(child)) for child in node]
    sep = seps[0] if seps and len(es) > 1 else None
    return _delimiter(open_ch, close_ch, es, sep=sep)
|
|
1024
|
+
|
|
1025
|
+
|
|
1026
|
+
# ---------------------------------------------------------------------------
|
|
1027
|
+
# Multiscripts (mmultiscripts) — pre/post scripts.
|
|
1028
|
+
# ---------------------------------------------------------------------------
|
|
1029
|
+
|
|
1030
|
+
def _mmultiscripts_to_omml(node: ET.Element) -> str:
    """Convert ``mmultiscripts`` to nested OMML script elements.

    The first child is the base. Children before an ``mprescripts`` marker are
    postscripts, children after it are prescripts; in each group only the
    first two entries (subscript, then superscript) are used and a ``none``
    element counts as absent. Postscripts wrap the base in ``sSubSup`` /
    ``sSub`` / ``sSup``; prescripts then wrap that result in ``sPre``.
    Returns ``""`` for an element without children.
    """
    children = list(node)
    if not children:
        return ""
    result = _emit(_node_parts(children[0]))

    post: "list[ET.Element]" = []
    pre: "list[ET.Element]" = []
    bucket = post
    for child in children[1:]:
        if _local_name(child) == "mprescripts":
            bucket = pre  # everything from here on is a prescript
            continue
        bucket.append(child)

    def render(group: "list[ET.Element]", slot: int) -> str:
        """Serialise ``group[slot]``; missing or ``none`` entries give ""."""
        if slot >= len(group) or _local_name(group[slot]) == "none":
            return ""
        return _emit(_node_parts(group[slot]))

    if post:
        sub_xml = render(post, 0)
        sup_xml = render(post, 1)
        if sub_xml and sup_xml:
            result = "".join((
                "<m:sSubSup>",
                f"<m:e>{result}</m:e>",
                f"<m:sub>{sub_xml}</m:sub>",
                f"<m:sup>{sup_xml}</m:sup>",
                "</m:sSubSup>",
            ))
        elif sub_xml:
            result = f"<m:sSub><m:e>{result}</m:e><m:sub>{sub_xml}</m:sub></m:sSub>"
        elif sup_xml:
            result = f"<m:sSup><m:e>{result}</m:e><m:sup>{sup_xml}</m:sup></m:sSup>"

    if not pre:
        return result
    pre_sub = render(pre, 0)
    pre_sup = render(pre, 1)
    sub_block = f"<m:sub>{pre_sub}</m:sub>" if pre_sub else "<m:sub/>"
    sup_block = f"<m:sup>{pre_sup}</m:sup>" if pre_sup else "<m:sup/>"
    return "".join((
        "<m:sPre><m:sPrePr/>",
        sub_block,
        sup_block,
        f"<m:e>{result}</m:e></m:sPre>",
    ))
|
|
1080
|
+
|
|
1081
|
+
|
|
1082
|
+
# ---------------------------------------------------------------------------
|
|
1083
|
+
# Enclosures (menclose) — boxed / cancel / bcancel.
|
|
1084
|
+
# ---------------------------------------------------------------------------
|
|
1085
|
+
|
|
1086
|
+
def _menclose_to_omml(node: ET.Element) -> str:
    """Convert ``menclose`` to an OMML ``<m:borderBox>``.

    ``notation`` is read as a space-separated, case-insensitive list:

    * strike notations map to the OMML strike flags (``updiagonalstrike``
      runs lower-left to upper-right, i.e. ``strikeBLTR``;
      ``downdiagonalstrike`` runs upper-left to lower-right, i.e.
      ``strikeTLBR``), and several may be combined;
    * the four borders are hidden when a strike is requested, unless ``box``
      or ``roundedbox`` is listed as well;
    * anything without a strike (box, roundedbox, actuarial, a missing
      attribute, ...) becomes a plain bordered box.
    """
    # Listed in the order the elements must appear inside borderBoxPr.
    strike_tags = (
        ("horizontalstrike", "strikeH"),
        ("verticalstrike", "strikeV"),
        ("updiagonalstrike", "strikeBLTR"),
        ("downdiagonalstrike", "strikeTLBR"),
    )
    tokens = set(node.attrib.get("notation", "").lower().split())
    inner = _emit(_sequence_parts(list(node)))

    strikes = "".join(
        f'<m:{tag} m:val="1"/>' for name, tag in strike_tags if name in tokens
    )
    if not strikes:
        return f"<m:borderBox><m:borderBoxPr/><m:e>{inner}</m:e></m:borderBox>"

    hidden = ""
    if not tokens & {"box", "roundedbox"}:
        # A bare strike (cancel-style) must not draw the surrounding frame.
        hidden = (
            '<m:hideTop m:val="1"/><m:hideBot m:val="1"/>'
            '<m:hideLeft m:val="1"/><m:hideRight m:val="1"/>'
        )
    pr = f"<m:borderBoxPr>{hidden}{strikes}</m:borderBoxPr>"
    return f"<m:borderBox>{pr}<m:e>{inner}</m:e></m:borderBox>"
|
|
1109
|
+
|
|
1110
|
+
|
|
1111
|
+
# ---------------------------------------------------------------------------
|
|
1112
|
+
# mtable → eqArr (aligned / cases). latex2mathml flattens \aligned to a single
|
|
1113
|
+
# mrow with raw '&' separators (handled in preprocessing), and \cases to an
|
|
1114
|
+
# mtable inside a prefix-fence mrow. Both become <m:eqArr> with '&' stripped.
|
|
1115
|
+
# ---------------------------------------------------------------------------
|
|
1116
|
+
|
|
1117
|
+
# Synthetic newline marker: an mspace with linebreak="newline", the same shape
# _is_nbsp_break recognises as a row break. _mtable_rows appends this single
# shared element after every <mtr> so that row boundaries survive flattening.
_NEWLINE_MARK = ET.Element("mspace", {"linebreak": "newline"})
|
|
1120
|
+
|
|
1121
|
+
|
|
1122
|
+
def _mtable_rows(node: ET.Element) -> "list[list]":
    """Flatten an ``mtable`` into rows of tokens for an equation array.

    The contents of every ``mtd`` of every ``mtr`` are concatenated, with a
    newline marker added after each ``mtr``. The token stream is then split at
    row-break markers, alignment markers are removed, and rows left empty are
    discarded. Column structure is intentionally not kept.
    """
    tokens: "list" = []
    for row in node:
        if _local_name(row) != "mtr":
            continue
        for cell in row:
            if _local_name(cell) == "mtd":
                tokens.extend(cell)
        tokens.append(_NEWLINE_MARK)  # an mtr boundary is a row break too
    cleaned = [
        [tok for tok in chunk if not _is_align_marker(tok)]
        for chunk in _split_rows_on_nbsp(tokens)
    ]
    return [chunk for chunk in cleaned if chunk]
|
|
1135
|
+
|
|
1136
|
+
|
|
1137
|
+
def _mtable_to_eqarr(node: ET.Element) -> str:
    """Serialise an ``mtable`` as ``<m:eqArr>`` with one ``<m:e>`` per row
    produced by ``_mtable_rows``."""
    cells = [
        "<m:e>" + _emit(_sequence_parts(row)) + "</m:e>"
        for row in _mtable_rows(node)
    ]
    return "<m:eqArr>" + "".join(cells) + "</m:eqArr>"
|
|
1143
|
+
|
|
1144
|
+
|
|
1145
|
+
def _mtable_grid(node: ET.Element) -> "list[list[list]]":
    """Rebuild a rows-by-cells grid from a matrix ``mtable``.

    Each ``mtd`` contributes a cell to the current row. A row-break marker
    inside an ``mtd`` ends the current row: the tokens after it start a new
    one. The end of an ``mtr`` also ends the row. Alignment markers are
    removed from every cell and rows without any cell are dropped.
    """
    grid: "list[list[list]]" = [[]]
    for row in node:
        if _local_name(row) != "mtr":
            continue
        for cell in row:
            if _local_name(cell) != "mtd":
                continue
            pieces = _split_rows_on_nbsp(list(cell)) or [[]]
            cleaned = [
                [tok for tok in piece if not _is_align_marker(tok)]
                for piece in pieces
            ]
            # First piece continues the current row; each later piece follows
            # a break and therefore opens a row of its own.
            grid[-1].append(cleaned[0])
            grid.extend([later] for later in cleaned[1:])
        grid.append([])  # mtr boundary ends a row
    return [cells for cells in grid if cells]
|
|
1164
|
+
|
|
1165
|
+
|
|
1166
|
+
def _mtable_to_matrix(node: ET.Element) -> str:
    """Serialise an ``mtable`` as an OMML matrix ``<m:m>``.

    Rows and cells come from ``_mtable_grid``; the declared column count is
    the length of the longest row (0 for an empty grid).
    """
    grid = _mtable_grid(node)
    ncols = max((len(cells) for cells in grid), default=0)
    row_xml = []
    for cells in grid:
        cell_xml = [f"<m:e>{_emit(_sequence_parts(cell))}</m:e>" for cell in cells]
        row_xml.append("<m:mr>" + "".join(cell_xml) + "</m:mr>")
    properties = "".join((
        "<m:mPr><m:mcs><m:mc><m:mcPr>",
        f'<m:count m:val="{ncols}"/>',
        "</m:mcPr></m:mc></m:mcs></m:mPr>",
    ))
    return "<m:m>" + properties + "".join(row_xml) + "</m:m>"
|
|
1181
|
+
|
|
1182
|
+
|
|
1183
|
+
# Private-use sentinel char standing in for the alignment '&' that
|
|
1184
|
+
# latex2mathml emits as invalid XML inside aligned environments.
|
|
1185
|
+
_ALIGN_SENTINEL = ""
|
|
1186
|
+
|
|
1187
|
+
|
|
1188
|
+
def _is_align_marker(node: ET.Element) -> bool:
    """Tell whether ``node`` is an alignment marker: an ``mi`` or ``mo`` whose
    stripped text is ``&`` or the ``_ALIGN_SENTINEL`` stand-in."""
    return (
        _local_name(node) in ("mi", "mo")
        and _node_text(node).strip() in ("&", _ALIGN_SENTINEL)
    )
|
|
1193
|
+
|
|
1194
|
+
|
|
1195
|
+
def _is_nbsp_break(node: ET.Element) -> bool:
    """Tell whether ``node`` is a row-break marker.

    Two encodings are recognised: an ``mspace`` whose ``linebreak`` attribute
    is ``newline``, and a non-empty ``mtext`` made up solely of spaces and/or
    no-break spaces (U+00A0).
    """
    kind = _local_name(node)
    if kind == "mspace" and node.attrib.get("linebreak") == "newline":
        return True
    if kind == "mtext":
        txt = _node_text(node)
        # The no-break space is spelled as an escape: written literally it is
        # indistinguishable from a plain space and easily lost, after which an
        # NBSP-only mtext would no longer be detected as a break.
        return txt != "" and txt.strip(" \u00a0") == ""
    return False
|
|
1204
|
+
|
|
1205
|
+
|
|
1206
|
+
def _split_rows_on_nbsp(nodes: "list") -> "list[list]":
    """Split ``nodes`` into rows at every row-break marker.

    The markers themselves are dropped, and so are rows that end up empty
    (leading, trailing or consecutive breaks).
    """
    groups: "list[list]" = []
    current: "list" = []
    for node in nodes:
        if _is_nbsp_break(node):
            groups.append(current)
            current = []
        else:
            current.append(node)
    groups.append(current)
    return [group for group in groups if group]
|
|
1214
|
+
|
|
1215
|
+
|
|
1216
|
+
# ---------------------------------------------------------------------------
|
|
1217
|
+
# Top-level mrow special cases: aligned (flattened) and cases (fence+mtable).
|
|
1218
|
+
# ---------------------------------------------------------------------------
|
|
1219
|
+
|
|
1220
|
+
def _try_aligned_or_cases(root_children: "list") -> "str | None":
    """Handle top-level matrix / cases / aligned layouts.

    After unwrapping any chain of single ``mrow``/``mstyle`` wrappers, the
    node list is matched against these shapes, in order:

    * ``mo, mtable, mo`` with a known opening and closing fence: a matrix
      inside a delimiter;
    * ``mo("{"), mtable``: an equation array without a fence (cases);
    * a lone ``mtable``: a bare matrix;
    * any list containing an alignment marker: an equation array with one
      entry per row-break-separated segment, markers removed (aligned).

    Returns the OMML string, or ``None`` when nothing matches.
    """
    nodes = root_children
    while len(nodes) == 1 and _local_name(nodes[0]) in ("mrow", "mstyle"):
        nodes = list(nodes[0])

    # Only lists of up to three nodes can match one of the table shapes.
    shape = tuple(_local_name(nd) for nd in nodes) if len(nodes) <= 3 else ()

    if shape == ("mo", "mtable", "mo"):
        open_ch = _node_text(nodes[0]).strip()
        close_ch = _node_text(nodes[2]).strip()
        if open_ch in _FENCE_OPENERS and close_ch in _FENCE_CLOSERS:
            return _delimiter(open_ch, close_ch, [_mtable_to_matrix(nodes[1])])
    if shape == ("mo", "mtable") and _node_text(nodes[0]).strip() == "{":
        return _mtable_to_eqarr(nodes[1])
    if shape == ("mtable",):
        return _mtable_to_matrix(nodes[0])

    if not any(_is_align_marker(nd) for nd in nodes):
        return None
    cells: "list[str]" = []
    for segment in _split_rows_on_nbsp(nodes):
        kept = [nd for nd in segment if not _is_align_marker(nd)]
        cells.append(f"<m:e>{_emit(_sequence_parts(kept))}</m:e>")
    return "<m:eqArr>" + "".join(cells) + "</m:eqArr>"
|
|
1261
|
+
|
|
1262
|
+
|
|
1263
|
+
# ---------------------------------------------------------------------------
|
|
1264
|
+
# MathML preprocessing — repair latex2mathml's invalid '&' in aligned output.
|
|
1265
|
+
# ---------------------------------------------------------------------------
|
|
1266
|
+
|
|
1267
|
+
def _sanitize_mathml(mathml: str) -> str:
    """Make latex2mathml's aligned output parseable.

    The alignment ``&`` is emitted as a literal ``<mi>&</mi>``, which is not
    well-formed XML. Each occurrence is replaced by an ``mi`` holding
    ``_ALIGN_SENTINEL``, the same constant ``_is_align_marker`` tests for, so
    the writer and the detector of the marker cannot drift apart.
    """
    return mathml.replace("<mi>&</mi>", f"<mi>{_ALIGN_SENTINEL}</mi>")
|
|
1272
|
+
|
|
1273
|
+
|
|
1274
|
+
|
|
1275
|
+
def _mathml_to_omml(mathml: str) -> "str | None":
    """Convert a MathML document string to an ``<m:oMath>`` XML string.

    Returns ``None`` when the (sanitised) input does not parse, when the root
    element is not ``math``, or when the conversion yields an empty body.
    """
    try:
        root = ET.fromstring(_sanitize_mathml(mathml))
    except ET.ParseError:
        return None
    is_math = _local_name(root) == "math" or root.tag == f"{{{_MATHML_NS}}}math"
    if not is_math:
        return None
    children = list(root)
    # Table-like layouts (matrix / cases / aligned) get dedicated handling.
    special = _try_aligned_or_cases(children)
    if special is not None:
        return "<m:oMath>" + special + "</m:oMath>"
    body = _emit(_sequence_parts(children))
    return f"<m:oMath>{body}</m:oMath>" if body else None
|
|
1291
|
+
|
|
1292
|
+
|
|
1293
|
+
def _latex_to_omml_string(latex: str) -> "str | None":
|
|
1294
|
+
"""Convert a LaTeX fragment to an OMML `<m:oMath>` XML string, or None."""
|
|
1295
|
+
latex = latex.strip()
|
|
1296
|
+
if not latex:
|
|
1297
|
+
return None
|
|
1298
|
+
try:
|
|
1299
|
+
import latex2mathml.converter as _l2m
|
|
1300
|
+
except ImportError as exc:
|
|
1301
|
+
raise RuntimeError(
|
|
1302
|
+
"OMML conversion requires 'latex2mathml'; "
|
|
1303
|
+
"install the build-corpus dependencies (pip install -r requirements.txt)"
|
|
1304
|
+
) from exc
|
|
1305
|
+
try:
|
|
1306
|
+
mathml = _l2m.convert(latex)
|
|
1307
|
+
omml = _mathml_to_omml(mathml)
|
|
1308
|
+
except Exception:
|
|
1309
|
+
return None
|
|
1310
|
+
if not omml or "oMath" not in omml:
|
|
1311
|
+
return None
|
|
1312
|
+
return omml
|
|
1313
|
+
|
|
1314
|
+
|
|
1315
|
+
def _parse_with_namespaces(omml: str):
    """Parse an OMML string into an element after adding namespace
    declarations for the ``m`` and ``w`` prefixes to its root tag.

    The string must start with exactly ``<m:oMath>`` or ``<m:oMathPara>``;
    otherwise, or when parsing fails, ``None`` is returned.
    """
    decls = nsdecls("m", "w")
    root_tag = next(
        (tag for tag in ("<m:oMath>", "<m:oMathPara>") if omml.startswith(tag)),
        None,
    )
    if root_tag is None:
        return None
    # Re-open the root tag with the declarations spliced in before its ">".
    declared = root_tag[:-1] + f" {decls}>" + omml[len(root_tag):]
    try:
        return parse_xml(declared)
    except Exception:
        return None
|
|
1328
|
+
|
|
1329
|
+
|
|
1330
|
+
def latex_to_omath(latex: str):
    """Build an inline ``<m:oMath>`` element from ``latex``.

    Returns ``None`` when the fragment cannot be converted or parsed.
    """
    omml = _latex_to_omml_string(latex)
    return None if omml is None else _parse_with_namespaces(omml)
|
|
1336
|
+
|
|
1337
|
+
|
|
1338
|
+
def latex_to_omath_para(latex: str):
    """Build a display ``<m:oMathPara>`` element from ``latex``.

    The converted ``<m:oMath>`` string is wrapped in ``<m:oMathPara>`` before
    parsing. Returns ``None`` when the fragment cannot be converted or parsed.
    """
    omml = _latex_to_omml_string(latex)
    if omml is None:
        return None
    if omml.startswith("<m:oMath>"):
        omml = f"<m:oMathPara>{omml}</m:oMathPara>"
    return _parse_with_namespaces(omml)
|