mathfmt 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mathfmt/__init__.py +6 -0
- mathfmt/__main__.py +3 -0
- mathfmt/cli.py +136 -0
- mathfmt/core.py +779 -0
- mathfmt-0.1.0.dist-info/METADATA +128 -0
- mathfmt-0.1.0.dist-info/RECORD +10 -0
- mathfmt-0.1.0.dist-info/WHEEL +5 -0
- mathfmt-0.1.0.dist-info/entry_points.txt +2 -0
- mathfmt-0.1.0.dist-info/licenses/LICENSE +21 -0
- mathfmt-0.1.0.dist-info/top_level.txt +1 -0
mathfmt/__init__.py
ADDED
mathfmt/__main__.py
ADDED
mathfmt/cli.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Command-line interface for MathFmt."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
import platform
|
|
9
|
+
import sys
|
|
10
|
+
import tempfile
|
|
11
|
+
from collections.abc import Sequence
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from lxml import etree
|
|
15
|
+
|
|
16
|
+
from . import __version__
|
|
17
|
+
from .core import apply_docx, find_xsl, scan_docx
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def default_output(input_path: Path) -> Path:
    """Return the default converted-document path for *input_path*.

    The result lives in the same directory and is named
    ``<stem>.mathfmt.docx``, where ``<stem>`` is the input name without its
    final suffix.
    """
    converted_name = f"{input_path.stem}.mathfmt.docx"
    return input_path.with_name(converted_name)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def default_result_report(output_path: Path) -> Path:
    """Return the default JSON report path that accompanies *output_path*.

    The report sits next to the output document and is named
    ``<stem>.report.json``.
    """
    report_name = f"{output_path.stem}.report.json"
    return output_path.with_name(report_name)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def doctor_data(explicit_xsl: Path | None = None) -> dict[str, object]:
    """Collect environment diagnostics for the ``doctor`` command.

    Args:
        explicit_xsl: Optional stylesheet path forwarded to ``find_xsl``.

    Returns:
        A dict with the MathFmt, Python, platform and lxml versions, a
        ``windows`` flag, the resolved stylesheet path under ``xsl`` (or
        ``None``), a ``ready`` flag, and an ``error`` message only when
        ``find_xsl`` raised ``FileNotFoundError``.
    """
    resolved_xsl: str | None = None
    failure: str | None = None
    try:
        resolved_xsl = str(find_xsl(explicit_xsl).resolve())
    except FileNotFoundError as exc:
        failure = str(exc)

    # Key order is kept stable because the dict is dumped as JSON by the CLI.
    data: dict[str, object] = {
        "mathfmt": __version__,
        "python": platform.python_version(),
        "platform": platform.platform(),
        "windows": os.name == "nt",
        "lxml": etree.LXML_VERSION,
        "xsl": resolved_xsl,
        "ready": resolved_xsl is not None,
    }
    if failure is not None:
        data["error"] = failure
    return data
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the ``mathfmt`` argument parser.

    The parser requires one of four sub-commands, stored in
    ``args.command``: ``scan``, ``apply``, ``convert`` or ``doctor``.
    """
    root = argparse.ArgumentParser(
        prog="mathfmt",
        description="Typeset plain-text DOCX formulas as native Word equations.",
    )
    root.add_argument("--version", action="version", version=f"MathFmt {__version__}")
    commands = root.add_subparsers(dest="command", required=True)

    # scan: produce a candidate report for human review.
    scan_cmd = commands.add_parser("scan", help="create a reviewable formula candidate report")
    scan_cmd.add_argument("input", type=Path)
    scan_cmd.add_argument("--report", type=Path, required=True)

    # apply: consume a reviewed report; every path is mandatory except --xsl.
    apply_cmd = commands.add_parser("apply", help="apply a reviewed candidate report")
    apply_cmd.add_argument("input", type=Path)
    apply_cmd.add_argument("--review", type=Path, required=True)
    apply_cmd.add_argument("--output", "--out", dest="output", type=Path, required=True)
    apply_cmd.add_argument("--report", type=Path, required=True)
    apply_cmd.add_argument("--xsl", type=Path)

    # convert: scan + apply in one step; output/report paths are optional.
    convert_cmd = commands.add_parser(
        "convert",
        help="conservatively convert detected formulas in one step",
    )
    convert_cmd.add_argument("input", type=Path)
    convert_cmd.add_argument("--output", "--out", dest="output", type=Path)
    convert_cmd.add_argument("--report", type=Path)
    convert_cmd.add_argument("--xsl", type=Path)

    # doctor: environment check, optionally printed as JSON.
    doctor_cmd = commands.add_parser("doctor", help="check the local MathFmt environment")
    doctor_cmd.add_argument("--xsl", type=Path)
    doctor_cmd.add_argument("--json", action="store_true", dest="as_json")

    return root
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def run_convert(args: argparse.Namespace) -> int:
    """Run the one-step ``convert`` command.

    Scans ``args.input`` into a candidate file inside a temporary directory,
    applies that file immediately, then prints a summary.

    Returns:
        ``0`` when nothing was skipped, otherwise ``2``.
    """
    output = args.output if args.output else default_output(args.input)
    report_path = args.report if args.report else default_result_report(output)
    stylesheet = find_xsl(args.xsl)

    # The intermediate candidate file only needs to live for the apply step.
    with tempfile.TemporaryDirectory(prefix="mathfmt-") as workdir:
        review_path = Path(workdir) / "candidates.json"
        scan_report = scan_docx(args.input, review_path)
        result = apply_docx(args.input, review_path, output, report_path, stylesheet)

    summary_lines = (
        f"Candidates: {scan_report['summary']['candidates']}",
        f"Converted: {result['converted_count']}",
        f"Skipped: {result['skipped_count']}",
        f"Output: {output}",
        f"Report: {report_path}",
    )
    for line in summary_lines:
        print(line)

    if result["skipped_count"] == 0:
        return 0
    return 2
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """Run the ``mathfmt`` command line and return the process exit status.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        * ``scan``: ``0``.
        * ``apply`` / ``convert``: ``0`` when nothing was skipped, else ``2``.
        * ``doctor``: ``0`` when the environment is ready, else ``1``.
        * ``1`` when a handled error occurred; the message goes to stderr.
    """
    # Imported locally: this module has no top-level zipfile import, and the
    # name is needed only for the exception tuple below.
    import zipfile

    args = build_parser().parse_args(argv)
    try:
        if args.command == "scan":
            report = scan_docx(args.input, args.report)
            print(f"Candidates: {report['summary']['candidates']}")
            print(f"Report: {args.report}")
            return 0
        if args.command == "apply":
            result = apply_docx(
                args.input,
                args.review,
                args.output,
                args.report,
                find_xsl(args.xsl),
            )
            print(f"Converted: {result['converted_count']}")
            print(f"Skipped: {result['skipped_count']}")
            print(f"Output: {args.output}")
            print(f"Report: {args.report}")
            return 0 if result["skipped_count"] == 0 else 2
        if args.command == "convert":
            return run_convert(args)
        if args.command == "doctor":
            data = doctor_data(args.xsl)
            if args.as_json:
                print(json.dumps(data, ensure_ascii=False, indent=2))
            else:
                print(f"MathFmt: {data['mathfmt']}")
                print(f"Python: {data['python']}")
                print(f"Platform: {data['platform']}")
                print(f"MML2OMML.XSL: {data['xsl'] or 'not found'}")
                print(f"Ready: {'yes' if data['ready'] else 'no'}")
                if data.get("error"):
                    print(f"Hint: {data['error']}")
            return 0 if data["ready"] else 1
    except (
        FileNotFoundError,
        ValueError,
        json.JSONDecodeError,
        etree.XMLSyntaxError,
        # A .docx that is not a valid zip archive makes zipfile.ZipFile raise
        # BadZipFile; report it like the other input errors instead of
        # letting a traceback escape.
        zipfile.BadZipFile,
    ) as exc:
        print(f"mathfmt: error: {exc}", file=sys.stderr)
        return 1
    # Only reached if the parser ever yields a command not handled above.
    return 1
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|
mathfmt/core.py
ADDED
|
@@ -0,0 +1,779 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Scan and typeset plain-text DOCX formulas as native Word OMML equations."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import copy
|
|
7
|
+
import json
|
|
8
|
+
import re
|
|
9
|
+
import shutil
|
|
10
|
+
import zipfile
|
|
11
|
+
from collections.abc import Sequence
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from lxml import etree
|
|
16
|
+
|
|
17
|
+
# XML namespaces used when reading and writing DOCX parts.
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
M_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math"
MML_NS = "http://www.w3.org/1998/Math/MathML"
XML_NS = "http://www.w3.org/XML/1998/namespace"
# Prefix map passed to lxml XPath queries.
NS = {"w": W_NS, "m": M_NS}

# Archive members that are scanned: the main document plus numbered
# headers and footers.
TARGET_PART_RE = re.compile(r"^word/(document|header\d+|footer\d+)\.xml$")

# Translation table from Unicode subscript characters to plain ASCII.
SUBSCRIPT_MAP = str.maketrans(
    "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓ",
    "0123456789+-=()aehijklmnoprstuvx",
)
# Unicode superscript character -> plain ASCII equivalent.
SUPERSCRIPT_MAP = dict(zip("⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ", "0123456789+-=()n"))
SUPERSCRIPT_CHARS = "".join(SUPERSCRIPT_MAP)

# Prefixes that mark a paragraph as program code rather than prose.
CODE_START_RE = re.compile(
    r"^(?:%|#|pkg\s|clear\b|close\b|plot\b|grid\b|xlabel\b|ylabel\b|"
    r"title\b|legend\b|hold\b|for\b|while\b|if\b|function\b|import\b|from\b)",
    re.IGNORECASE,
)
# A candidate must contain at least one of these to count as a formula.
FORMULA_ANCHOR_RE = re.compile(r"(?:=|≠|<=|>=|!=|→|->|±|\+/-|√|sqrt|lim)")
# Characters allowed inside a contiguous formula candidate run.
MATH_CHARS = set(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    + "₀₁₂₃₄₅₆₇₈₉ₚᵥₜ⁰¹²³⁴⁵⁶⁷⁸⁹+-*/^=<>!~→⇒±≠≤≥≈√∞ΔπΓ"
    + "()[]{}.,'′˙¨·×÷_ \t"
)
# Whitespace and punctuation stripped from both ends of a candidate.
TRIM_PUNCT = " \t,,.。;;::"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def qname(ns: str, local: str) -> str:
    """Return the qualified tag name ``{ns}local`` for a namespace URI and local name."""
    return "{" + ns + "}" + local
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def mml(local: str, text: str | None = None, **attrs: str) -> etree._Element:
    """Create an element in the MathML namespace.

    Args:
        local: Local tag name, e.g. ``"mi"``.
        text: Optional text content; left unset when ``None``.
        **attrs: Extra keyword arguments forwarded to ``etree.Element``.
    """
    node = etree.Element(qname(MML_NS, local), **attrs)
    if text is None:
        return node
    node.text = text
    return node
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def mrow(*children: etree._Element) -> etree._Element:
    """Return a MathML ``mrow`` element containing *children* in order."""
    container = mml("mrow")
    container.extend(children)
    return container
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass(frozen=True)
class Token:
    """Immutable lexical token consumed by the formula parser."""

    # Token category; the parser checks for "NUMBER", "IDENT", "OP",
    # "LPAREN", "RPAREN", "COMMA" and the end marker "EOF".
    kind: str
    # Matched source text (empty string for the "EOF" marker).
    value: str
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass
class Node:
    """One node of the parsed formula tree."""

    # Node category, e.g. "number", "identifier", "binary", "power", "group".
    kind: str
    # Payload whose meaning depends on ``kind`` (operator symbol, name, digits).
    value: str | None = None
    # Child nodes in left-to-right order.
    children: tuple[Node, ...] = ()
    # Optional string metadata (derivative nodes store their "order" here).
    meta: dict[str, str] | None = None
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class FormulaError(ValueError):
    """Raised when a formula cannot be tokenized, parsed or converted."""
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# Tokenizer pattern: optional leading whitespace followed by exactly one
# named alternative. Alternatives are tried left to right, so multi-character
# names and operators are listed before the single-character fallbacks.
TOKEN_RE = re.compile(
    r"\s*(?:"
    + "|".join(
        (
            r"(?P<NUMBER>\d+(?:[\.,]\d+)?)",
            r"(?P<IDENT>sqrt|lim|exp|sin|cos|tan|Delta|pi|inf|e[pv]|pPAIR|DERV\d+|[A-Za-z](?:\d+)?|[ΔπΓ∞])",
            r"(?P<OP><=|>=|!=|~=|->|=>|\+/-|[+\-*/^=<>±≠≤≥≈→⇒·×÷])",
            r"(?P<LPAREN>[\(\[\{])",
            r"(?P<RPAREN>[\)\]\}])",
            r"(?P<COMMA>,)",
        )
    )
    + r")"
)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def preprocess_formula(source: str) -> tuple[str, dict[str, tuple[int, str, str]]]:
    """Normalize raw formula text before tokenizing.

    Derivative notations are cut out and replaced by placeholder identifiers
    ``DERV<n>``; Unicode super/subscripts and ASCII operator spellings are
    rewritten to the forms the tokenizer understands.

    Returns:
        ``(text, derivatives)`` where ``derivatives`` maps each placeholder to
        ``(order, function_letter, second_capture)``. The second capture is
        the differentiation variable for Leibniz forms and the parenthesized
        argument for prime/dot forms.
    """
    text = source.strip()
    derivatives: dict[str, tuple[int, str, str]] = {}

    def stash(order: int, pattern: re.Pattern[str], second_group: int) -> None:
        # Replace every match, left to right, with a fresh placeholder.
        nonlocal text
        while (found := pattern.search(text)) is not None:
            key = f"DERV{len(derivatives)}"
            derivatives[key] = (order, found.group(1), found.group(second_group))
            text = text[: found.start()] + key + text[found.end() :]

    # Leibniz notation, second order first so "d2x(t)/dt2" is not half-matched.
    stash(2, re.compile(r"\bd(?:\^?2|²)([A-Za-z])\(([^()]*)\)/d([A-Za-z])(?:\^?2|²)"), 3)
    stash(1, re.compile(r"\bd([A-Za-z])\(([^()]*)\)/d([A-Za-z])"), 3)
    # Prime / dot notation, again second order first.
    stash(2, re.compile(r"([A-Za-z])(?:''|¨)\(([^()]*)\)"), 2)
    stash(1, re.compile(r"([A-Za-z])(?:'|′|˙)\(([^()]*)\)"), 2)

    # Subscripted limits must be rewritten before subscripts are flattened.
    for old, new in (("limₚ→0", "lim(p->0)"), ("limₜ→∞", "lim(t->inf)")):
        text = text.replace(old, new)

    def expand_superscript(found: re.Match[str]) -> str:
        digits = "".join(SUPERSCRIPT_MAP[char] for char in found.group(2))
        return found.group(1) + "^" + digits

    text = re.sub(
        r"([A-Za-z0-9)\]])([" + re.escape(SUPERSCRIPT_CHARS) + r"]+)",
        expand_superscript,
        text,
    )
    text = text.translate(SUBSCRIPT_MAP)
    text = re.sub(r"\bp1\s*,\s*2\b", "pPAIR", text)

    # Literal spellings -> canonical operators; the order matters and is kept.
    literal_rewrites = (
        ("√", "sqrt"),
        ("+/-", "±"),
        ("!=", "≠"),
        ("<=", "≤"),
        (">=", "≥"),
        ("->", "→"),
        ("=>", "⇒"),
        ("×", "*"),
        ("·", "*"),
        ("÷", "/"),
    )
    for old, new in literal_rewrites:
        text = text.replace(old, new)

    pattern_rewrites = (
        (r"(?:Γ|1)\(t\)", "u(t)"),
        (r"\bDelta\b", "Δ"),
        (r"\binf\b", "∞"),
        (r"\bpi\b", "π"),
        (r"e\^\{([^{}]+)\}", r"e^(\1)"),
    )
    for pattern, replacement in pattern_rewrites:
        text = re.sub(pattern, replacement, text)
    return text, derivatives
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def tokenize(text: str) -> list[Token]:
    """Split normalized formula text into tokens, ending with an ``EOF`` token.

    Raises:
        FormulaError: If non-whitespace text cannot be matched by ``TOKEN_RE``.
    """
    tokens: list[Token] = []
    position, length = 0, len(text)
    while position < length:
        match = TOKEN_RE.match(text, position)
        if match is None:
            remainder = text[position:]
            if not remainder.strip():
                # Only trailing whitespace is left.
                break
            raise FormulaError(f"Unrecognized formula text near: {text[position:position + 24]!r}")
        kind = match.lastgroup
        if kind is None:
            raise FormulaError("Tokenizer produced an empty token")
        tokens.append(Token(kind, match.group(kind)))
        position = match.end()
    tokens.append(Token("EOF", ""))
    return tokens
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class Parser:
    """Recursive-descent parser turning a token sequence into a ``Node`` tree.

    Precedence from loosest to tightest: comma sequence, relation, additive,
    multiplicative (explicit or implicit), power (right-associative),
    unary sign, atom.
    """

    def __init__(self, tokens: Sequence[Token], derivatives: dict[str, tuple[int, str, str]]):
        self.tokens = tokens
        # Placeholder name -> (order, function letter, variable/argument).
        self.derivatives = derivatives
        self.index = 0

    @property
    def current(self) -> Token:
        """Token under the cursor, without consuming it."""
        return self.tokens[self.index]

    def advance(self) -> Token:
        """Consume and return the token under the cursor."""
        consumed = self.tokens[self.index]
        self.index += 1
        return consumed

    def accept(self, kind: str, value: str | None = None) -> Token | None:
        """Consume the current token if it matches *kind* (and *value*, if given)."""
        head = self.current
        if head.kind != kind:
            return None
        if value is not None and head.value != value:
            return None
        return self.advance()

    def expect(self, kind: str) -> Token:
        """Consume a token of *kind* or raise ``FormulaError``."""
        token = self.accept(kind)
        if token is not None:
            return token
        raise FormulaError(f"Expected {kind}, got {self.current.kind} {self.current.value!r}")

    def parse(self) -> Node:
        """Parse the whole token stream; trailing tokens are an error."""
        tree = self.parse_sequence()
        if self.current.kind == "EOF":
            return tree
        raise FormulaError(f"Unexpected token: {self.current.value!r}")

    def parse_sequence(self) -> Node:
        """Parse one or more comma-separated relations."""
        items = [self.parse_relation()]
        while self.accept("COMMA") is not None:
            items.append(self.parse_relation())
        if len(items) == 1:
            return items[0]
        return Node("sequence", children=tuple(items))

    def parse_relation(self) -> Node:
        """Parse a left-associative chain of relational operators."""
        relations = {"=", "<", ">", "≤", "≥", "≠", "~=", "→", "⇒"}
        left = self.parse_add()
        while self.current.kind == "OP" and self.current.value in relations:
            symbol = self.advance().value
            left = Node("binary", symbol, (left, self.parse_add()))
        return left

    def parse_add(self) -> Node:
        """Parse a left-associative chain of ``+``, ``-`` and ``±``."""
        additive = {"+", "-", "±"}
        left = self.parse_mul()
        while self.current.kind == "OP" and self.current.value in additive:
            symbol = self.advance().value
            left = Node("binary", symbol, (left, self.parse_mul()))
        return left

    def starts_atom(self) -> bool:
        """Return whether the current token can begin an atom."""
        return self.current.kind in {"NUMBER", "IDENT", "LPAREN"}

    def parse_mul(self) -> Node:
        """Parse products and quotients, including implicit multiplication."""
        product = self.parse_power()
        while True:
            head = self.current
            if head.kind == "OP" and head.value in {"*", "·", "×", "/", "÷"}:
                symbol = self.advance().value
                operator = "/" if symbol in {"/", "÷"} else "*"
                product = Node("binary", operator, (product, self.parse_power()))
                continue
            if not self.starts_atom():
                return product
            # Two adjacent atoms multiply implicitly.
            product = Node("binary", "implicit", (product, self.parse_power()))

    def parse_power(self) -> Node:
        """Parse ``base ^ exponent``; recursion makes it right-associative."""
        base = self.parse_unary()
        if not (self.current.kind == "OP" and self.current.value == "^"):
            return base
        self.advance()
        return Node("power", children=(base, self.parse_power()))

    def parse_unary(self) -> Node:
        """Parse any number of leading ``+`` / ``-`` signs, then an atom."""
        if self.current.kind == "OP" and self.current.value in {"+", "-"}:
            sign = self.advance().value
            return Node("unary", sign, (self.parse_unary(),))
        return self.parse_atom()

    def parse_group(self) -> Node:
        """Parse a bracketed sequence and check that the brackets match."""
        opener = self.expect("LPAREN").value
        closer = {"(": ")", "[": "]", "{": "}"}[opener]
        inner = self.parse_sequence()
        closing = self.expect("RPAREN")
        if closing.value != closer:
            raise FormulaError(f"Mismatched group: {opener}{closing.value}")
        return Node("group", opener + closer, (inner,))

    def parse_atom(self) -> Node:
        """Parse a number, group, derivative placeholder, call or identifier."""
        number = self.accept("NUMBER")
        if number is not None:
            return Node("number", number.value)
        if self.current.kind == "LPAREN":
            return self.parse_group()

        ident = self.accept("IDENT")
        if ident is None:
            raise FormulaError(f"Expected formula atom, got {self.current.kind} {self.current.value!r}")

        name = ident.value
        if name in self.derivatives:
            order, variable, argument = self.derivatives[name]
            return Node(
                "derivative",
                children=(Node("identifier", variable), Node("identifier", argument)),
                meta={"order": str(order)},
            )
        if self.current.kind != "LPAREN":
            return Node("identifier", name)

        # An identifier directly followed by a group is a call-like form.
        group = self.parse_group()
        if name in {"sqrt", "√"}:
            return Node("sqrt", children=group.children)
        if name == "lim":
            return Node("limit", children=group.children)
        return Node("function", name, group.children)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def identifier_mathml(value: str) -> etree._Element:
    """Render an identifier as MathML.

    Infinity becomes an ``mo``; ``Delta``/``pi`` become Greek ``mi`` elements;
    ``pPAIR`` becomes ``p`` with subscript ``1,2``; a letter followed by
    digits or by ``p``/``v`` becomes an ``msub``; anything else is a plain ``mi``.
    """
    if value == "∞" or value == "inf":
        return mml("mo", "∞")

    greek = {"Delta": "Δ", "Δ": "Δ", "pi": "π", "π": "π"}
    symbol = greek.get(value)
    if symbol is not None:
        return mml("mi", symbol)

    if value == "pPAIR":
        paired = mml("msub")
        paired.append(mml("mi", "p"))
        paired.append(mrow(mml("mn", "1"), mml("mo", ","), mml("mn", "2")))
        return paired

    subscripted = re.fullmatch(r"([A-Za-z])([0-9]+|[pv])", value)
    if subscripted is None:
        return mml("mi", value)

    base, suffix = subscripted.group(1), subscripted.group(2)
    # Numeric subscripts are numbers; letter subscripts are identifiers.
    suffix_tag = "mn" if suffix.isdigit() else "mi"
    indexed = mml("msub")
    indexed.append(mml("mi", base))
    indexed.append(mml(suffix_tag, suffix))
    return indexed
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def derivative_mathml(node: Node) -> etree._Element:
    """Render a derivative node as a Leibniz fraction.

    ``node.children`` holds the function identifier and the variable;
    ``node.meta["order"]`` (default ``"1"``) is the derivative order. For
    orders above one, both the ``d`` in the numerator and the variable in
    the denominator are raised to that order.
    """
    order = int((node.meta or {}).get("order", "1"))

    def raised(base: etree._Element) -> etree._Element:
        # Wrap *base* in ``msup`` only for higher-order derivatives.
        if order <= 1:
            return base
        sup = mml("msup")
        sup.append(base)
        sup.append(mml("mn", str(order)))
        return sup

    function_name = node_to_mathml(node.children[0])
    # The variable is rendered twice because an element has a single parent.
    call_argument = node_to_mathml(node.children[1])
    call = mrow(function_name, fenced(call_argument, "()"))
    numerator = mrow(raised(mml("mi", "d")), call)

    denominator = mrow(mml("mi", "d"), raised(node_to_mathml(node.children[1])))

    fraction = mml("mfrac")
    fraction.append(numerator)
    fraction.append(denominator)
    return fraction
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def fenced(child: etree._Element, brackets: str) -> etree._Element:
    """Wrap *child* in an ``mfenced`` using ``brackets[0]`` and ``brackets[1]``."""
    opening = brackets[0]
    closing = brackets[1]
    wrapper = mml("mfenced", open=opening, close=closing)
    wrapper.append(child)
    return wrapper
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def node_to_mathml(node: Node) -> etree._Element:
    """Convert a parsed formula node (recursively) to a MathML element.

    Raises:
        FormulaError: If ``node.kind`` is not a supported node kind.
    """
    kind = node.kind
    kids = node.children

    if kind == "number":
        return mml("mn", node.value or "")
    if kind == "identifier":
        return identifier_mathml(node.value or "")
    if kind == "derivative":
        return derivative_mathml(node)
    if kind == "group":
        return fenced(node_to_mathml(kids[0]), node.value or "()")
    if kind == "sqrt":
        radical = mml("msqrt")
        radical.append(node_to_mathml(kids[0]))
        return radical
    if kind == "function":
        name = identifier_mathml(node.value or "")
        return mrow(name, fenced(node_to_mathml(kids[0]), "()"))
    if kind == "limit":
        limit = mml("munder")
        limit.append(mml("mi", "lim"))
        limit.append(node_to_mathml(kids[0]))
        return limit
    if kind == "unary":
        return mrow(mml("mo", node.value or ""), node_to_mathml(kids[0]))
    if kind == "power":
        base, exponent = kids[0], kids[1]
        # Parentheses around an exponent are redundant once it is raised.
        if exponent.kind == "group" and exponent.value == "()":
            exponent = exponent.children[0]
        raised = mml("msup")
        raised.append(node_to_mathml(base))
        raised.append(node_to_mathml(exponent))
        return raised
    if kind == "binary":
        operator = node.value
        if operator == "/":
            # The fraction bar already groups, so drop one level of brackets
            # around the numerator and the denominator.
            top, bottom = kids
            if top.kind == "group":
                top = top.children[0]
            if bottom.kind == "group":
                bottom = bottom.children[0]
            fraction = mml("mfrac")
            fraction.append(node_to_mathml(top))
            fraction.append(node_to_mathml(bottom))
            return fraction
        left = node_to_mathml(kids[0])
        right = node_to_mathml(kids[1])
        if operator in {"*", "implicit"}:
            # U+2062 INVISIBLE TIMES
            return mrow(left, mml("mo", "\u2062"), right)
        symbols = {"~=": "≈"}
        return mrow(left, mml("mo", symbols.get(operator or "", operator or "")), right)
    if kind == "sequence":
        row = mml("mrow")
        for position, child in enumerate(kids):
            if position > 0:
                row.append(mml("mo", ","))
            row.append(node_to_mathml(child))
        return row
    raise FormulaError(f"Unsupported AST node: {node.kind}")
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def formula_to_mathml(source: str) -> etree._Element:
    """Convert plain-text formula *source* into an inline MathML ``math`` element.

    The text is preprocessed, tokenized and parsed; any ``FormulaError``
    raised by those steps propagates to the caller.
    """
    normalized, derivatives = preprocess_formula(source)
    tokens = tokenize(normalized)
    tree = Parser(tokens, derivatives).parse()

    math = mml("math", display="inline", nsmap={None: MML_NS})
    math.append(node_to_mathml(tree))
    return math
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def find_xsl(explicit: Path | None = None) -> Path:
|
|
387
|
+
if explicit is not None and not explicit.is_file():
|
|
388
|
+
raise FileNotFoundError(f"MML2OMML.XSL was not found at: {explicit}")
|
|
389
|
+
candidates = [
|
|
390
|
+
explicit,
|
|
391
|
+
Path(r"C:\Program Files\Microsoft Office\root\Office16\MML2OMML.XSL"),
|
|
392
|
+
Path(r"C:\Program Files (x86)\Microsoft Office\root\Office16\MML2OMML.XSL"),
|
|
393
|
+
]
|
|
394
|
+
for candidate in candidates:
|
|
395
|
+
if candidate and candidate.exists():
|
|
396
|
+
return candidate
|
|
397
|
+
raise FileNotFoundError("MML2OMML.XSL was not found; pass --xsl with its path")
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def mathml_to_omml(math: etree._Element, transform: etree.XSLT) -> etree._Element:
    """Run *transform* on *math* and return a detached copy of the ``m:oMath`` result.

    If the transform's root is itself ``m:oMath`` it is copied directly;
    otherwise the first ``m:oMath`` descendant is used.

    Raises:
        FormulaError: If the transform yields no root or no ``m:oMath``.
    """
    root = transform(etree.ElementTree(math)).getroot()
    if root is None:
        raise FormulaError("MML2OMML produced no root element")

    if root.tag == qname(M_NS, "oMath"):
        return copy.deepcopy(root)

    equations = root.xpath(".//m:oMath", namespaces=NS)
    if equations:
        return copy.deepcopy(equations[0])

    # No equation found: the message depends on what the root was.
    if root.tag == qname(M_NS, "oMathPara"):
        raise FormulaError("MML2OMML output contains no m:oMath")
    raise FormulaError(f"Unexpected MML2OMML root: {root.tag}")
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def paragraph_text(paragraph: etree._Element) -> str:
    """Return the concatenated text of every ``w:t`` descendant of *paragraph*."""
    fragments = paragraph.xpath(".//w:t/text()", namespaces=NS)
    return "".join(fragments)
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def likely_code(text: str) -> bool:
    """Heuristically decide whether a paragraph is program code, not prose.

    A paragraph counts as code when it starts with a known code prefix,
    when it contains ``;`` together with a ``name =`` assignment, or when it
    ends with ``;`` and calls one of ``tf``, ``step``, ``roots`` or ``exp``.
    """
    line = text.strip()
    if CODE_START_RE.search(line) is not None:
        return True

    has_assignment = re.search(r"\b[A-Za-z_]\w*\s*=", line) is not None
    if ";" in line and has_assignment:
        return True

    has_known_call = re.search(r"\b(?:tf|step|roots|exp)\s*\(", line) is not None
    return has_known_call and line.endswith(";")
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def math_score(source: str) -> int:
    """Return a heuristic score of how formula-like *source* is.

    Relation signs weigh 3 each, arithmetic/special symbols 2 each,
    call-like ``name(...)`` fragments 1 each, and every two digits add 1.
    """
    weighted_patterns = (
        (3, r"=|≠|<=|>=|!="),
        (2, r"[+*/^√±∞→]"),
        (1, r"[A-Za-z]\w*\([^)]*\)"),
    )
    symbol_score = sum(
        weight * len(re.findall(pattern, source))
        for weight, pattern in weighted_patterns
    )
    digit_count = len(re.findall(r"\d", source))
    return symbol_score + digit_count // 2
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def candidate_runs(text: str) -> list[tuple[int, int, str]]:
    """Find formula-like spans in a paragraph's text.

    The text is cut into maximal runs of ``MATH_CHARS``. Each run is trimmed,
    must contain a formula anchor and score at least 4, is then stripped of
    surrounding prose words, and must still qualify afterwards.

    Returns:
        ``(start, end, source)`` tuples where ``text[start:end] == source``,
        with consecutive entries sharing the same span removed.
    """

    def qualifies(fragment: str) -> bool:
        if not fragment:
            return False
        if FORMULA_ANCHOR_RE.search(fragment) is None:
            return False
        return math_score(fragment) >= 4

    found: list[tuple[int, int, str]] = []
    cursor, length = 0, len(text)
    while cursor < length:
        if text[cursor] not in MATH_CHARS:
            cursor += 1
            continue

        # Consume one maximal run of math-compatible characters.
        start = cursor
        while cursor < length and text[cursor] in MATH_CHARS:
            cursor += 1
        end = cursor

        # Trim punctuation/whitespace while keeping the offsets in sync.
        raw = text[start:end]
        start += len(raw) - len(raw.lstrip(TRIM_PUNCT))
        end -= len(raw) - len(raw.rstrip(TRIM_PUNCT))
        source = text[start:end]
        if not qualifies(source):
            continue

        # Cut at a sentence break, then at a trailing connective word.
        # NOTE(review): with IGNORECASE the "C" alternative also matches a
        # standalone lowercase "c" after whitespace - confirm this is intended.
        source = re.split(r"\.\s+(?=[A-Za-z])", source, maxsplit=1)[0]
        source = re.split(
            r",?\s+(?:avec|si|et|Elle|C|La|Pour)\b",
            source,
            maxsplit=1,
            flags=re.IGNORECASE,
        )[0]
        # Drop leading prose up to and including a verb such as "est" / "is".
        source = re.sub(
            r"^.*\b(?:est|sont|vaut|discriminant|vers|equals?|is)\s+",
            "",
            source,
            flags=re.IGNORECASE,
        )
        source = source.strip(TRIM_PUNCT)
        if not qualifies(source):
            continue

        # Re-locate the reduced text inside the trimmed run.
        located = text.find(source, start, end)
        if located >= 0:
            found.append((located, located + len(source), source))

    deduped: list[tuple[int, int, str]] = []
    for item in found:
        if deduped and item[:2] == deduped[-1][:2]:
            continue
        deduped.append(item)
    return deduped
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def inspect_docx(input_path: Path) -> tuple[list[zipfile.ZipInfo], dict[str, bytes]]:
    """Read every member of the DOCX zip archive into memory.

    Returns:
        ``(infos, data)``: the archive's ``ZipInfo`` entries in archive order
        and a mapping from member filename to its raw bytes.
    """
    data: dict[str, bytes] = {}
    with zipfile.ZipFile(input_path, "r") as archive:
        infos = archive.infolist()
        for info in infos:
            data[info.filename] = archive.read(info.filename)
    return infos, data
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
def scan_docx(input_path: Path, report_path: Path) -> dict[str, object]:
    """Scan a DOCX for plain-text formula candidates and write a JSON report.

    Paragraphs in the main document, headers and footers are inspected.
    Paragraphs that already hold equations or drawings, are empty, or look
    like code are counted and skipped. Each remaining candidate is test-parsed;
    candidates that fail are kept but de-selected and flagged for review.

    Args:
        input_path: Path to the ``.docx`` file.
        report_path: Destination of the UTF-8 JSON report; parent directories
            are created as needed.

    Returns:
        The report dict that was written.

    Raises:
        ValueError: If *input_path* does not end in ``.docx``.
        FileNotFoundError: If *input_path* is not an existing file.
    """
    if input_path.suffix.lower() != ".docx":
        raise ValueError("Input must be a .docx file")
    if not input_path.is_file():
        raise FileNotFoundError(f"Input DOCX was not found: {input_path}")

    _, parts = inspect_docx(input_path)

    summary: dict[str, int] = {
        "paragraphs": 0,
        "candidates": 0,
        "existing_equations": 0,
        "drawing_paragraphs": 0,
        "code_paragraphs": 0,
    }
    candidates: list[dict[str, object]] = []

    for part_name, raw in parts.items():
        if TARGET_PART_RE.match(part_name) is None:
            continue
        root = etree.fromstring(raw)
        for paragraph_index, paragraph in enumerate(root.xpath(".//w:p", namespaces=NS)):
            summary["paragraphs"] += 1
            if paragraph.xpath(".//m:oMath | .//m:oMathPara", namespaces=NS):
                summary["existing_equations"] += 1
                continue
            if paragraph.xpath(".//w:drawing | .//w:pict", namespaces=NS):
                summary["drawing_paragraphs"] += 1
                continue
            text = paragraph_text(paragraph)
            if not text.strip():
                continue
            if likely_code(text):
                summary["code_paragraphs"] += 1
                continue

            for start, end, source in candidate_runs(text):
                candidate: dict[str, object] = {
                    "id": f"f{len(candidates) + 1:04d}",
                    "selected": True,
                    "part": part_name,
                    "paragraph_index": paragraph_index,
                    "start": start,
                    "end": end,
                    "source": source,
                    "linear": source,
                    # A candidate spanning the whole paragraph is a display formula.
                    "display": text.strip() == source.strip(),
                    "paragraph_text": text,
                }
                try:
                    formula_to_mathml(source)
                except Exception as exc:
                    # Deliberately broad: any failure only demotes the
                    # candidate to manual review instead of aborting the scan.
                    candidate["selected"] = False
                    candidate["parse_status"] = "review"
                    candidate["parse_error"] = str(exc)
                else:
                    candidate["parse_status"] = "ok"
                candidates.append(candidate)

    summary["candidates"] = len(candidates)
    report: dict[str, object] = {
        "schema_version": 1,
        "input": str(input_path.resolve()),
        "profile": {"derivatives": "fraction", "unit_step": "u(t)", "output": "native_word_omml"},
        "summary": summary,
        "candidates": candidates,
    }

    report_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
    return report
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
def ancestor_run(text_node: etree._Element, paragraph: etree._Element) -> etree._Element | None:
    """Return the ``w:r`` ancestor of *text_node* that is a direct child of *paragraph*.

    Returns ``None`` when no such run exists, for example when the text sits
    in a run nested inside another element of the paragraph.
    """
    run_tag = qname(W_NS, "r")
    node = text_node.getparent()
    while node is not None and node is not paragraph:
        parent = node.getparent()
        if node.tag == run_tag and parent is paragraph:
            return node
        node = parent
    return None
|
|
563
|
+
|
|
564
|
+
|
|
565
|
+
def run_with_text_like(run: etree._Element, text: str) -> etree._Element:
    """Build a new ``w:r`` holding *text* with a copy of *run*'s ``w:rPr``.

    ``xml:space="preserve"`` is set on the new ``w:t`` when *text* starts or
    ends with a space.
    """
    clone = etree.Element(qname(W_NS, "r"))
    properties = run.find(qname(W_NS, "rPr"))
    if properties is not None:
        clone.append(copy.deepcopy(properties))

    holder = etree.SubElement(clone, qname(W_NS, "t"))
    has_edge_space = text[:1] == " " or text[-1:] == " "
    if has_edge_space:
        holder.set(qname(XML_NS, "space"), "preserve")
    holder.text = text
    return clone
|
|
575
|
+
|
|
576
|
+
|
|
577
|
+
def _set_text_preserving_space(node: etree._Element, text: str) -> None:
    """Assign *text* to a ``w:t`` node, flagging edge whitespace as significant."""
    node.text = text
    if text != text.strip():
        # Without xml:space="preserve" the consumer may drop leading/trailing
        # whitespace, gluing the remaining prose to the inserted formula.
        node.set(qname(XML_NS, "space"), "preserve")


def replace_inline_span(paragraph: etree._Element, start: int, end: int, omath: etree._Element) -> None:
    """Replace the characters ``[start, end)`` of a paragraph with *omath*.

    Offsets are measured over the concatenated text of every ``w:t``
    descendant of *paragraph*, in document order. The text inside the span is
    removed from the nodes it touches, *omath* is inserted into *paragraph*
    right after the run that holds the start of the span, and any text that
    followed the span inside that same run is moved into a new run (built
    with ``run_with_text_like``) placed after *omath*.

    Args:
        paragraph: The ``w:p`` element to edit in place.
        start: Inclusive start offset of the span.
        end: Exclusive end offset of the span.
        omath: The element to insert in place of the span.

    Raises:
        FormulaError: If the span overlaps no text node, or if the first or
            last touched text node has no run that is a direct child of
            *paragraph*. Both checks run before anything is modified.
    """
    # Map each w:t node to the offset at which its text begins.
    spans: list[tuple[etree._Element, int, int]] = []
    cursor = 0
    for node in paragraph.xpath(".//w:t", namespaces=NS):
        length = len(node.text or "")
        spans.append((node, cursor, cursor + length))
        cursor += length

    touched = [(node, lo) for node, lo, hi in spans if hi > start and lo < end]
    if not touched:
        raise FormulaError("Candidate span does not overlap any text node")
    start_node, start_lo = touched[0]
    end_node, end_lo = touched[-1]
    start_run = ancestor_run(start_node, paragraph)
    end_run = ancestor_run(end_node, paragraph)
    if start_run is None or end_run is None:
        raise FormulaError("Formula span crosses a hyperlink or unsupported nested run")

    # Text that survives on either side of the span.
    prefix = (start_node.text or "")[: max(0, start - start_lo)]
    suffix = (end_node.text or "")[max(0, end - end_lo) :]
    same_run = start_run is end_run

    for node, _ in touched:
        node.text = ""
    _set_text_preserving_space(start_node, prefix)
    if not same_run:
        # The end run already follows the insertion point, so its tail can
        # stay where it is. When both ends share one run the tail is moved
        # into a new run below instead; leaving it here as well would
        # duplicate it in front of the formula.
        _set_text_preserving_space(end_node, suffix)

    insert_index = paragraph.index(start_run) + 1
    paragraph.insert(insert_index, omath)
    if same_run and suffix:
        paragraph.insert(insert_index + 1, run_with_text_like(start_run, suffix))
|
|
613
|
+
|
|
614
|
+
|
|
615
|
+
def replace_display_paragraph(paragraph: etree._Element, omath: etree._Element) -> None:
    """Turn *paragraph* into a paragraph holding only *omath*.

    Every child of *paragraph* except ``w:pPr`` is removed, then *omath* is
    wrapped in a new ``m:oMathPara`` element that is appended to the
    paragraph. The paragraph is modified in place.
    """
    properties_tag = qname(W_NS, "pPr")
    discard = [child for child in paragraph if child.tag != properties_tag]
    for child in discard:
        paragraph.remove(child)
    wrapper = etree.Element(qname(M_NS, "oMathPara"))
    wrapper.append(omath)
    paragraph.append(wrapper)
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
def replace_multiline_table_formula(
    paragraph: etree._Element,
    equations: Sequence[etree._Element],
    suffix: str = "",
) -> None:
    """Replace a paragraph's content with several equations separated by breaks.

    Every child of *paragraph* except ``w:pPr`` is removed. The equations are
    then appended in order, with a run containing a single ``w:br`` inserted
    before each equation after the first. If *suffix* is non-empty it is
    appended as a final text run.

    Args:
        paragraph: The ``w:p`` element to rewrite in place.
        equations: Equation elements to append, one per line.
        suffix: Optional trailing text placed after the last equation.
    """
    properties_tag = qname(W_NS, "pPr")
    for child in list(paragraph):
        if child.tag != properties_tag:
            paragraph.remove(child)
    for index, equation in enumerate(equations):
        if index:
            break_run = etree.SubElement(paragraph, qname(W_NS, "r"))
            etree.SubElement(break_run, qname(W_NS, "br"))
        paragraph.append(equation)
    if suffix:
        run = etree.SubElement(paragraph, qname(W_NS, "r"))
        text = etree.SubElement(run, qname(W_NS, "t"))
        if suffix != suffix.strip():
            # Same precaution run_with_text_like takes: without this flag,
            # whitespace at the edges of the suffix may be discarded.
            text.set(qname(XML_NS, "space"), "preserve")
        text.text = suffix
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
def split_top_level_additive(text: str, target_length: int = 30) -> list[str]:
    """Break a formula into lines at top-level ``+`` / ``-`` operators.

    A ``+`` or ``-`` is a break point when it is outside all brackets
    (``()``, ``[]``, ``{}``) and acts as a binary operator. It is treated as
    a sign, not a break point, when nothing but whitespace precedes it or
    when the closest preceding non-whitespace character is one of
    ``= < > + - * / ^ ( ,``. Whitespace is skipped in that check so that
    ``"y = -a"`` is not split between ``=`` and the sign.

    The resulting terms are packed greedily: a term starts a new line when
    adding it to the current line would exceed *target_length* characters.
    A single term longer than *target_length* is kept whole. Joining the
    returned lines always reproduces *text*.

    Args:
        text: The linear formula text.
        target_length: Soft maximum number of characters per line.

    Returns:
        The list of lines; ``[text]`` when there is no break point.
    """
    sign_leaders = "=<>+-*/^(,"
    depth = 0
    starts = [0]
    last_visible = ""  # closest preceding non-whitespace character, if any
    for index, char in enumerate(text):
        if char in "([{":
            depth += 1
        elif char in ")]}":
            # Clamp so a stray closing bracket cannot push depth negative.
            depth = max(0, depth - 1)
        elif depth == 0 and char in "+-" and last_visible and last_visible not in sign_leaders:
            starts.append(index)
        if not char.isspace():
            last_visible = char
    if len(starts) == 1:
        return [text]

    bounds = starts + [len(text)]
    terms = [text[lo:hi] for lo, hi in zip(bounds, bounds[1:])]

    lines: list[str] = []
    current = ""
    for term in terms:
        if current and len(current) + len(term) > target_length:
            lines.append(current)
            current = term
        else:
            current += term
    if current:
        lines.append(current)
    return lines
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
def estimated_formula_width(text: str) -> int:
    """Return a rough width score for the formula *text*.

    The score is the character count of *text* plus 18 for each derivative
    mark (``''``, ``'``, ``¨`` or ``˙``) that is followed, after optional
    whitespace, by an opening parenthesis.
    """
    derivative_calls = sum(1 for _ in re.finditer(r"(?:''|'|¨|˙)\s*\(", text))
    return derivative_calls * 18 + len(text)
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
def set_math_font_size(omath: etree._Element, half_points: int) -> None:
    """Write a font size onto every ``m:r`` run beneath *omath*.

    For each math run, a ``w:rPr`` child is reused or created. A new one is
    inserted at index 1 when the run has an ``m:rPr`` child and at index 0
    otherwise. Its ``w:sz`` and ``w:szCs`` children are then created if
    missing and their ``w:val`` attribute is set to ``str(half_points)``.

    Args:
        omath: Root element to search for math runs; modified in place.
        half_points: Value stored in ``w:val`` (the name indicates the unit
            is half-points).
    """
    size_value = str(half_points)
    val_attr = qname(W_NS, "val")
    for math_run in omath.xpath(".//m:r", namespaces=NS):
        props = math_run.find(qname(W_NS, "rPr"))
        if props is None:
            props = etree.Element(qname(W_NS, "rPr"))
            has_math_props = math_run.find(qname(M_NS, "rPr")) is not None
            math_run.insert(1 if has_math_props else 0, props)
        for local in ("sz", "szCs"):
            size_tag = qname(W_NS, local)
            size_element = props.find(size_tag)
            if size_element is None:
                size_element = etree.SubElement(props, size_tag)
            size_element.set(val_attr, size_value)
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
def apply_docx(
    input_path: Path,
    review_path: Path,
    output_path: Path,
    result_path: Path,
    xsl_path: Path,
) -> dict[str, object]:
    """Apply reviewed formula candidates to a DOCX and write a new DOCX.

    The review JSON at *review_path* is loaded and every candidate with a
    truthy ``selected`` flag is converted (``formula_to_mathml`` then
    ``mathml_to_omml`` with the stylesheet at *xsl_path*) and substituted
    into its paragraph. Failures for individual candidates are recorded
    under ``"skipped"`` instead of aborting the run. The modified package is
    written to *output_path* through a temporary sibling file, and the result
    report is written to *result_path* as JSON.

    Args:
        input_path: Source ``.docx``; never modified.
        review_path: JSON file with a ``"candidates"`` list.
        output_path: Destination ``.docx``; must differ from *input_path*.
        result_path: Destination of the JSON result report.
        xsl_path: Stylesheet parsed with lxml and used for the conversion.

    Returns:
        The result report: resolved paths, ``converted`` and ``skipped``
        entries, and their counts.

    Raises:
        ValueError: If either DOCX path lacks the ``.docx`` suffix or both
            resolve to the same file.
    """
    if input_path.suffix.lower() != ".docx" or output_path.suffix.lower() != ".docx":
        raise ValueError("Input and output must be .docx files")
    if input_path.resolve() == output_path.resolve():
        raise ValueError("Refusing to overwrite the input DOCX")
    review = json.loads(review_path.read_text(encoding="utf-8"))
    candidates = [c for c in review.get("candidates", []) if c.get("selected")]
    infos, parts = inspect_docx(input_path)
    transform = etree.XSLT(etree.parse(str(xsl_path)))
    converted: list[dict[str, object]] = []
    skipped: list[dict[str, object]] = []
    result: dict[str, object] = {
        "input": str(input_path.resolve()),
        "output": str(output_path.resolve()),
        "review": str(review_path.resolve()),
        "xsl": str(xsl_path.resolve()),
        "converted": converted,
        "skipped": skipped,
    }

    # Collect the candidates of each paragraph so they can be applied together.
    grouped: dict[tuple[str, int], list[dict[str, object]]] = {}
    for candidate in candidates:
        key = (str(candidate["part"]), int(candidate["paragraph_index"]))
        grouped.setdefault(key, []).append(candidate)

    for (part_name, paragraph_index), group in grouped.items():
        if part_name not in parts:
            for candidate in group:
                skipped.append({"id": candidate.get("id"), "error": "DOCX part not found"})
            continue
        root = etree.fromstring(parts[part_name])
        paragraphs = root.xpath(".//w:p", namespaces=NS)
        # Reject negative indices too: they would otherwise wrap around and
        # silently address a paragraph counted from the end of the part.
        if not 0 <= paragraph_index < len(paragraphs):
            for candidate in group:
                skipped.append({"id": candidate.get("id"), "error": "Paragraph index out of range"})
            continue
        paragraph = paragraphs[paragraph_index]
        original_text = paragraph_text(paragraph)
        # Work from the highest start offset down so that an edit never
        # shifts the offsets of candidates still waiting to be applied.
        for candidate in sorted(group, key=lambda c: int(c["start"]), reverse=True):
            try:
                start, end = int(candidate["start"]), int(candidate["end"])
                source = str(candidate["source"])
                if original_text[start:end] != source:
                    raise FormulaError("Reviewed source no longer matches the paragraph span")
                linear = str(candidate.get("linear", source))
                in_table = bool(paragraph.xpath("ancestor::w:tc", namespaces=NS))
                is_display = bool(candidate.get("display")) and source.strip() == original_text.strip()
                outside_formula = original_text[:start] + original_text[end:]
                covers_formula_paragraph = not outside_formula.strip(TRIM_PUNCT)
                # Only a table-cell formula that fills its paragraph and is
                # estimated to be wide is split into several lines.
                table_lines = (
                    split_top_level_additive(linear)
                    if in_table and covers_formula_paragraph and estimated_formula_width(linear) > 65
                    else [linear]
                )
                equations = [mathml_to_omml(formula_to_mathml(line), transform) for line in table_lines]
                if in_table:
                    for equation in equations:
                        # 16 is passed as half_points (presumably 8 pt).
                        set_math_font_size(equation, 16)
                if len(equations) > 1:
                    replace_multiline_table_formula(paragraph, equations, original_text[end:])
                elif is_display:
                    replace_display_paragraph(paragraph, equations[0])
                else:
                    replace_inline_span(paragraph, start, end, equations[0])
                converted.append({"id": candidate.get("id"), "source": source, "part": part_name, "lines": len(equations)})
            except Exception as exc:
                # Deliberately broad: one bad candidate must not abort the
                # whole document; the failure is reported to the caller.
                skipped.append({"id": candidate.get("id"), "source": candidate.get("source"), "error": str(exc)})
        parts[part_name] = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone=True)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = output_path.with_suffix(output_path.suffix + ".tmp")
    try:
        with zipfile.ZipFile(tmp_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
            for info in infos:
                archive.writestr(info, parts[info.filename])
        # The temp file sits next to the target, so Path.replace can swap it
        # in as a single rename, also when the target already exists.
        tmp_path.replace(output_path)
    finally:
        tmp_path.unlink(missing_ok=True)
    result["converted_count"] = len(converted)
    result["skipped_count"] = len(skipped)
    result_path.parent.mkdir(parents=True, exist_ok=True)
    result_path.write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8")
    return result
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mathfmt
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Typeset plain-text formulas in DOCX files as native Word equations.
|
|
5
|
+
Author: Leo
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/gml853503962-creator/mathfmt
|
|
8
|
+
Project-URL: Repository, https://github.com/gml853503962-creator/mathfmt
|
|
9
|
+
Project-URL: Issues, https://github.com/gml853503962-creator/mathfmt/issues
|
|
10
|
+
Keywords: docx,word,math,omml,equation,typesetting
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Education
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Office/Business :: Office Suites
|
|
22
|
+
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: lxml>=5.0
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
30
|
+
Requires-Dist: ruff>=0.6; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# MathFmt
|
|
34
|
+
|
|
35
|
+
[中文](#中文) | [English](#english)
|
|
36
|
+
|
|
37
|
+
MathFmt turns awkward plain-text formulas in Word documents into native Word equations suitable for
|
|
38
|
+
textbooks, exams, and technical reports. Processing stays on your computer.
|
|
39
|
+
|
|
40
|
+
## 中文
|
|
41
|
+
|
|
42
|
+
MathFmt 将 DOCX 文档中的普通文本公式转换为 Word 原生 OMML 公式,例如把 `ds(t)/dt`、
|
|
43
|
+
`sqrt(x^2+1)` 和 `p1` 排版为纸质教材中常见的分式、根号与上下标。
|
|
44
|
+
|
|
45
|
+
### 特点
|
|
46
|
+
|
|
47
|
+
- 保留原始 DOCX,始终写入新文件。
|
|
48
|
+
- 支持正文、表格、页眉、页脚和混合文本公式。
|
|
49
|
+
- 自动跳过代码、图片公式和已有 Word 原生公式。
|
|
50
|
+
- 提供可审核流程和保守的一键转换。
|
|
51
|
+
- 完全本地运行,不上传文档。
|
|
52
|
+
|
|
53
|
+
### 环境要求
|
|
54
|
+
|
|
55
|
+
- Windows 10/11
|
|
56
|
+
- Python 3.10 或更高版本
|
|
57
|
+
- Microsoft Word/Office,并包含 `MML2OMML.XSL`
|
|
58
|
+
|
|
59
|
+
MathFmt 不分发微软的 XSL 文件。它会自动检查常见 Office 安装目录,也可使用 `--xsl` 指定路径。
|
|
60
|
+
|
|
61
|
+
### 安装
|
|
62
|
+
|
|
63
|
+
```powershell
|
|
64
|
+
pip install mathfmt
|
|
65
|
+
mathfmt doctor
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
从源码开发安装:
|
|
69
|
+
|
|
70
|
+
```powershell
|
|
71
|
+
git clone https://github.com/gml853503962-creator/mathfmt.git
|
|
72
|
+
cd mathfmt
|
|
73
|
+
python -m pip install -e ".[dev]"
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### 使用
|
|
77
|
+
|
|
78
|
+
```powershell
|
|
79
|
+
# 保守的一键转换
|
|
80
|
+
mathfmt convert input.docx
|
|
81
|
+
|
|
82
|
+
# 审核后转换
|
|
83
|
+
mathfmt scan input.docx --report candidates.json
|
|
84
|
+
mathfmt apply input.docx --review candidates.json --output output.docx --report result.json
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
一键转换默认生成 `input.mathfmt.docx` 和 `input.mathfmt.report.json`,不会覆盖原文件。编辑
|
|
88
|
+
`candidates.json` 中的 `selected`、`source` 或 `linear` 后再执行 `apply`。如果 Office 安装在
|
|
89
|
+
非标准目录,为 `apply`、`convert` 或 `doctor` 添加 `--xsl C:\path\to\MML2OMML.XSL`。
|
|
90
|
+
|
|
91
|
+
### Codex Skill
|
|
92
|
+
|
|
93
|
+
仓库中的 `skills/mathfmt` 可复制到 Codex 技能目录。技能依赖已安装的 `mathfmt` 命令。
|
|
94
|
+
|
|
95
|
+
## English
|
|
96
|
+
|
|
97
|
+
MathFmt converts plain-text formulas in DOCX files into native Word OMML equations with stacked
|
|
98
|
+
fractions, radicals, superscripts, subscripts, derivatives, and standard mathematical operators.
|
|
99
|
+
|
|
100
|
+
### Highlights
|
|
101
|
+
|
|
102
|
+
- Never overwrites the source DOCX.
|
|
103
|
+
- Handles body text, tables, headers, footers, and formulas mixed with prose.
|
|
104
|
+
- Skips likely code, image formulas, and existing Word equations.
|
|
105
|
+
- Offers both a review-first workflow and conservative one-step conversion.
|
|
106
|
+
- Processes documents locally without uploading them.
|
|
107
|
+
|
|
108
|
+
### Requirements and installation
|
|
109
|
+
|
|
110
|
+
MathFmt 0.1 supports Windows, Python 3.10+, and a Microsoft Office installation containing
|
|
111
|
+
`MML2OMML.XSL`. The Microsoft stylesheet is detected locally and is not distributed with MathFmt.
|
|
112
|
+
|
|
113
|
+
```powershell
|
|
114
|
+
pip install mathfmt
|
|
115
|
+
mathfmt doctor
|
|
116
|
+
mathfmt convert input.docx
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
For review-first conversion, run `scan`, edit the generated JSON, then run `apply` as shown above.
|
|
120
|
+
|
|
121
|
+
## Contributing
|
|
122
|
+
|
|
123
|
+
Issues and pull requests are welcome. See [CONTRIBUTING.md](CONTRIBUTING.md),
|
|
124
|
+
[CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md), and [SECURITY.md](SECURITY.md).
|
|
125
|
+
|
|
126
|
+
## License
|
|
127
|
+
|
|
128
|
+
MIT License. Copyright (c) 2026 Leo.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
mathfmt/__init__.py,sha256=3pPdh3QI-DGkkCcIzF9Qs80tuHoB3rldk6L3wGJgZe4,190
|
|
2
|
+
mathfmt/__main__.py,sha256=k1ocEWawweo1qCJWNFAAvyxz3tcY13dzvCenHszij30,48
|
|
3
|
+
mathfmt/cli.py,sha256=qLQ_BunCTG1S4AzV4GyFgcHgTAGmJrxux50EtM0Xb7c,5073
|
|
4
|
+
mathfmt/core.py,sha256=t-QTXLwZT63TfEC5vVkKW7gtvhkk806SBzB82bCF-0Y,30707
|
|
5
|
+
mathfmt-0.1.0.dist-info/licenses/LICENSE,sha256=D9t3XgDm_d4jmEtPEYTNE8NBNcqsHnFK1IkUHCOXBLw,1060
|
|
6
|
+
mathfmt-0.1.0.dist-info/METADATA,sha256=VTZPuSbCPirrMMJNlhauzb5EIIx11OmpeQR6gwQYBGM,4450
|
|
7
|
+
mathfmt-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
8
|
+
mathfmt-0.1.0.dist-info/entry_points.txt,sha256=ZZ8BY4-gwAGDz1vLaq68V5TY0BSk9SM34EWRI6hF1qI,45
|
|
9
|
+
mathfmt-0.1.0.dist-info/top_level.txt,sha256=ZSUOpS_ciCol-UY9sp5gA39eSEirrTFSwwCpF2eKXb0,8
|
|
10
|
+
mathfmt-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Leo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
mathfmt
|