tex2word 0.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tex2word-0.8.1/LICENSE +21 -0
- tex2word-0.8.1/PKG-INFO +209 -0
- tex2word-0.8.1/README.md +168 -0
- tex2word-0.8.1/pyproject.toml +103 -0
- tex2word-0.8.1/src/latex2word/__init__.py +13 -0
- tex2word-0.8.1/src/latex2word/backend/__init__.py +1 -0
- tex2word-0.8.1/src/latex2word/backend/document.py +1138 -0
- tex2word-0.8.1/src/latex2word/backend/fields.py +97 -0
- tex2word-0.8.1/src/latex2word/backend/images.py +94 -0
- tex2word-0.8.1/src/latex2word/backend/latex_writer.py +425 -0
- tex2word-0.8.1/src/latex2word/backend/numbering.py +127 -0
- tex2word-0.8.1/src/latex2word/backend/ooxml.py +80 -0
- tex2word-0.8.1/src/latex2word/backend/package.py +198 -0
- tex2word-0.8.1/src/latex2word/backend/raster.py +61 -0
- tex2word-0.8.1/src/latex2word/benchmark.py +101 -0
- tex2word-0.8.1/src/latex2word/bib/__init__.py +12 -0
- tex2word-0.8.1/src/latex2word/bib/bbl.py +69 -0
- tex2word-0.8.1/src/latex2word/bib/bibtex.py +195 -0
- tex2word-0.8.1/src/latex2word/bib/csl_engine.py +89 -0
- tex2word-0.8.1/src/latex2word/bib/render.py +330 -0
- tex2word-0.8.1/src/latex2word/bib/zotero.py +78 -0
- tex2word-0.8.1/src/latex2word/cli.py +214 -0
- tex2word-0.8.1/src/latex2word/frontend/__init__.py +7 -0
- tex2word-0.8.1/src/latex2word/frontend/algorithms.py +223 -0
- tex2word-0.8.1/src/latex2word/frontend/colors.py +134 -0
- tex2word-0.8.1/src/latex2word/frontend/docx_reader.py +779 -0
- tex2word-0.8.1/src/latex2word/frontend/latexml.py +283 -0
- tex2word-0.8.1/src/latex2word/frontend/macros.py +444 -0
- tex2word-0.8.1/src/latex2word/frontend/parser.py +1701 -0
- tex2word-0.8.1/src/latex2word/frontend/preprocess.py +118 -0
- tex2word-0.8.1/src/latex2word/frontend/siunitx.py +101 -0
- tex2word-0.8.1/src/latex2word/ir.py +432 -0
- tex2word-0.8.1/src/latex2word/mathml/__init__.py +7 -0
- tex2word-0.8.1/src/latex2word/mathml/cascade.py +129 -0
- tex2word-0.8.1/src/latex2word/mathml/imagemath.py +107 -0
- tex2word-0.8.1/src/latex2word/mathml/latex_math.py +596 -0
- tex2word-0.8.1/src/latex2word/mathml/mathml_to_omml.py +184 -0
- tex2word-0.8.1/src/latex2word/mathml/omml.py +300 -0
- tex2word-0.8.1/src/latex2word/mathml/omml_reader.py +242 -0
- tex2word-0.8.1/src/latex2word/mathml/symbols.py +122 -0
- tex2word-0.8.1/src/latex2word/pipeline.py +172 -0
- tex2word-0.8.1/src/latex2word/py.typed +0 -0
- tex2word-0.8.1/src/latex2word/render_check.py +128 -0
- tex2word-0.8.1/src/latex2word/report.py +175 -0
- tex2word-0.8.1/src/latex2word/roundtrip.py +361 -0
- tex2word-0.8.1/src/latex2word/templates/__init__.py +10 -0
- tex2word-0.8.1/src/latex2word/templates/reference.py +244 -0
- tex2word-0.8.1/src/latex2word/templates/styles.xml +146 -0
- tex2word-0.8.1/src/latex2word/transforms/__init__.py +7 -0
- tex2word-0.8.1/src/latex2word/transforms/crossref.py +134 -0
- tex2word-0.8.1/src/latex2word/validate.py +203 -0
tex2word-0.8.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Yifan Yang
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
tex2word-0.8.1/PKG-INFO
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tex2word
|
|
3
|
+
Version: 0.8.1
|
|
4
|
+
Summary: Open-source, production-grade LaTeX -> Microsoft Word (.docx) converter with native OMML math and live fields
|
|
5
|
+
Keywords: latex,word,docx,converter,ooxml,omml,tex,office,document
|
|
6
|
+
Author: Yifan Yang
|
|
7
|
+
Author-email: Yifan Yang <yfyang.86@hotmail.com>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Text Processing :: Markup :: LaTeX
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
20
|
+
Classifier: Topic :: Office/Business :: Office Suites
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
Requires-Dist: lxml>=5.0
|
|
23
|
+
Requires-Dist: pylatexenc>=2.10
|
|
24
|
+
Requires-Dist: citeproc-py>=0.6 ; extra == 'csl'
|
|
25
|
+
Requires-Dist: matplotlib>=3.7 ; extra == 'mathimg'
|
|
26
|
+
Requires-Dist: latex2mathml>=3.77 ; extra == 'mathml'
|
|
27
|
+
Requires-Dist: pypdfium2>=4 ; extra == 'pdf'
|
|
28
|
+
Requires-Dist: pillow>=10 ; extra == 'pdf'
|
|
29
|
+
Maintainer: Yifan Yang
|
|
30
|
+
Maintainer-email: Yifan Yang <yfyang.86@hotmail.com>
|
|
31
|
+
Requires-Python: >=3.12
|
|
32
|
+
Project-URL: Changelog, https://github.com/yfyang86/tex2word/blob/main/CHANGELOG.md
|
|
33
|
+
Project-URL: Homepage, https://github.com/yfyang86/tex2word
|
|
34
|
+
Project-URL: Issues, https://github.com/yfyang86/tex2word/issues
|
|
35
|
+
Project-URL: Repository, https://github.com/yfyang86/tex2word
|
|
36
|
+
Provides-Extra: csl
|
|
37
|
+
Provides-Extra: mathimg
|
|
38
|
+
Provides-Extra: mathml
|
|
39
|
+
Provides-Extra: pdf
|
|
40
|
+
Description-Content-Type: text/markdown
|
|
41
|
+
|
|
42
|
+
# latex2word
|
|
43
|
+
|
|
44
|
+
An open-source, cross-platform **LaTeX → Microsoft Word (`.docx`)** converter
|
|
45
|
+
that produces *genuinely editable* Word: native paragraph styles, **native OMML
|
|
46
|
+
equations** (editable in Word's equation editor, not images), and **live,
|
|
47
|
+
auto-renumbering fields** for equation/figure/table numbers and
|
|
48
|
+
cross-references.
|
|
49
|
+
|
|
50
|
+
> **Status: production-grade.** Foundation, math core (direct LaTeX→OMML), the
|
|
51
|
+
> live cross-reference/field plumbing (the differentiator), image embedding, the
|
|
52
|
+
> BibTeX bibliography, and the robustness layer (math cascade, coverage report,
|
|
53
|
+
> OOXML validator, round-trip manifest) are all in. See
|
|
54
|
+
> [`CHANGELOG.md`](CHANGELOG.md) for the release history.
|
|
55
|
+
|
|
56
|
+
## Why
|
|
57
|
+
|
|
58
|
+
Pandoc/`texmath` is the open-source reference but **drops equation numbers**,
|
|
59
|
+
can dump raw LaTeX for labelled equations, and emits *static* cross-references.
|
|
60
|
+
No open tool produces editable styles **and** native OMML **and** live
|
|
61
|
+
field-based numbering. That gap is the product.
|
|
62
|
+
|
|
63
|
+
## Install & use
|
|
64
|
+
|
|
65
|
+
Requires Python 3.12+.
|
|
66
|
+
|
|
67
|
+
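From PyPI (the distribution is published as `tex2word`; the command-line tool and import package are named `latex2word`):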
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install tex2word # core (PNG/JPEG figures)
|
|
71
|
+
pip install "tex2word[pdf]" # + PDF figure rasterisation (pypdfium2, Apache-2.0)
|
|
72
|
+
pip install "tex2word[mathml]" # + LaTeX->MathML->OMML for hard math (latex2mathml)
|
|
73
|
+
pip install "tex2word[csl]" # + real CSL citation styles (citeproc-py)
|
|
74
|
+
pip install "tex2word[pdf,mathml,csl,mathimg]" # everything
|
|
75
|
+
|
|
76
|
+
latex2word convert paper.tex -o paper.docx
|
|
77
|
+
latex2word convert paper.tex -o paper.docx --report report.json
|
|
78
|
+
latex2word convert paper.tex -o paper.docx --reference-doc journal.docx
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Or, for a development checkout with [uv](https://docs.astral.sh/uv/):
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
uv sync --all-extras
|
|
85
|
+
uv run latex2word convert paper.tex -o paper.docx
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Or from Python:
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from latex2word import convert_source, convert_file
|
|
92
|
+
|
|
93
|
+
out_path, result = convert_file("paper.tex")
|
|
94
|
+
print(result.report.summary()) # math coverage + warnings
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## What works today
|
|
98
|
+
|
|
99
|
+
- **Reference Word templates** ★: `--reference-doc TEMPLATE.docx` adopts a
|
|
100
|
+
journal/corporate template's styles, theme and page geometry (size + margins),
|
|
101
|
+
so the output matches the required look — while keeping the live fields below.
|
|
102
|
+
Our custom styles are merged in so nothing renders unstyled.
|
|
103
|
+
- **Structure & styles**: `\title`/`\author`/`\date`/`abstract`, `\section`…
|
|
104
|
+
`\subparagraph` → Word Title/Heading 1–4 (visible in the Navigation pane),
|
|
105
|
+
paragraphs, `\textbf`/`\emph`/`\texttt`/`\underline`/`\textsc`, quotes, code.
|
|
106
|
+
Sections are **auto-numbered** (multilevel `1` / `1.1` / `1.1.1`) like LaTeX,
|
|
107
|
+
with `\section*` unnumbered; `\ref` to a section shows its live number. In
|
|
108
|
+
**book/report** documents `\chapter` is the top level (sections nest under it)
|
|
109
|
+
and `\appendix` switches to lettered headings (`A`, `A.1`).
|
|
110
|
+
- **Math (direct LaTeX→OMML)**: inline `$…$`, display `\[…\]`,
|
|
111
|
+
`equation`/`align`/`gather`; fractions, sub/superscripts, roots, `\sum`/`\int`
|
|
112
|
+
with limits, accents, `\left…\right` delimiters, matrices/`cases`, Greek and
|
|
113
|
+
hundreds of symbols, `\mathbb`/`\mathcal`/`\mathbf`, functions (`\sin`, `\lim`).
|
|
114
|
+
`align*`/`aligned` line up at the `&` (a column-justified matrix); numbered
|
|
115
|
+
`align` keeps a live number per line.
|
|
116
|
+
- **Live fields** ★: numbered equations get `SEQ Equation` fields inside
|
|
117
|
+
bookmarks; `\ref`/`\eqref`/`\pageref` become `REF`/`PAGEREF` fields; figure
|
|
118
|
+
and table captions get `SEQ Figure`/`SEQ Table`. Numbers auto-renumber in
|
|
119
|
+
Word on field refresh. `--number-by-section` switches to `N.M` per-section
|
|
120
|
+
numbering (`STYLEREF` + `SEQ \s`), book/report style.
|
|
121
|
+
- **Table of contents** ★: `\tableofcontents` → a live Word `TOC` field (rebuilds
|
|
122
|
+
from heading styles on refresh); `\listoffigures`/`\listoftables` →
|
|
123
|
+
caption-sequence lists. Schema-valid and round-tripping.
|
|
124
|
+
- **Lists, tables, figures**: `itemize`/`enumerate`, `tabular`/`longtable` with
|
|
125
|
+
`booktabs`, `\multicolumn`→column span, `\multirow`→vertical merge, and
|
|
126
|
+
repeating header rows; captioned `figure`/`table`, `\includegraphics`
|
|
127
|
+
(PNG/JPEG embedded directly; **PDF figures rasterised** to PNG when the
|
|
128
|
+
optional `tex2word[pdf]` extra — pypdfium2 — is installed). An
|
|
129
|
+
`\includegraphics` in running text (an icon/logo) is embedded **inline**.
|
|
130
|
+
- **Custom macros**: `\newcommand`/`\renewcommand`/`\def` are expanded before
|
|
131
|
+
parsing. Common `mathtools`/`physics` math (`\abs`, `\norm`, `\dv`, `\ket`, …)
|
|
132
|
+
and `siunitx` (`\SI{9.81}{\meter\per\second\squared}` → `9.81 m/s²`, `\num`,
|
|
133
|
+
`\ang`) work as built-ins when not user-defined. **Acronyms** (`glossaries`):
|
|
134
|
+
`\newacronym` + `\gls`/`\acrshort`/`\acrlong`/`\acrfull` expand with the
|
|
135
|
+
first-use "long (short)" rule.
|
|
136
|
+
- **Footnotes**: `\footnote` → native Word footnotes (`footnotes.xml`), not
|
|
137
|
+
inlined text; footnote bodies keep their formatting and math.
|
|
138
|
+
- **Inline verbatim & smart refs**: `\verb|...|` → literal monospace;
|
|
139
|
+
`\cref`/`\Cref`/`\autoref` add cleveref-style type prefixes ("fig. N" /
|
|
140
|
+
"Figure N").
|
|
141
|
+
- **Theorem environments**: `theorem`/`lemma`/`proof`/`definition`/… render
|
|
142
|
+
with a bold numbered lead (live `SEQ` per kind), optional `[title]`, and a
|
|
143
|
+
QED mark for proofs; `\ref` to a theorem shows its number.
|
|
144
|
+
- **Algorithms**: `algorithm` + `algorithmic`/`algpseudocode`/`algorithm2e` →
|
|
145
|
+
numbered, indented pseudocode with bold keywords, inline OMML math, and a live
|
|
146
|
+
`SEQ Algorithm` caption.
|
|
147
|
+
- **Graceful degradation**: unknown constructs never abort; they pass through
|
|
148
|
+
best-effort and are logged to the conversion report (math coverage telemetry
|
|
149
|
+
included). The math **decision-cascade** (direct OMML → LaTeX→MathML→OMML
|
|
150
|
+
secondary path → image fallback `--math-image-fallback` → raw) records which
|
|
151
|
+
path each equation took.
|
|
152
|
+
- **Round-trip**: the IR is embedded as a JSON manifest custom part, so the
|
|
153
|
+
exact IR can be recovered from the `.docx` (`latex2word.roundtrip.recover_ir`)
|
|
154
|
+
and converted **back to LaTeX** (`latex2word to-latex out.docx`); the corpus
|
|
155
|
+
`latex→docx→latex` round trip keeps the same block structure. Reconcile (on by default)
|
|
156
|
+
merges Word edits against the manifest, and **Word Track Changes are accepted**
|
|
157
|
+
on read (insertions kept, deletions dropped).
|
|
158
|
+
- **Reports & validation**: `--report report.json|report.html` writes a coverage
|
|
159
|
+
report; `latex2word.validate.validate_docx` structurally validates output;
|
|
160
|
+
`latex2word benchmark <dir>` reports a quantitative baseline (math-OMML %,
|
|
161
|
+
validity, warnings, 0-abort) across a paper set (CI-gated on the corpus + UATs:
|
|
162
|
+
currently 100% native-OMML math, 100% valid, 0 aborts).
|
|
163
|
+
- **Reproducible**: set `SOURCE_DATE_EPOCH` and the same input yields
|
|
164
|
+
byte-identical output (the `.docx` ZIP is built deterministically).
|
|
165
|
+
- **Live citations** (opt-in `--citations zotero`): emit
|
|
166
|
+
`ADDIN ZOTERO_ITEM CSL_CITATION` / `CSL_BIBLIOGRAPHY` fields so citations are
|
|
167
|
+
editable by Zotero/Mendeley in Word (default is static formatted text).
|
|
168
|
+
- **Real CSL styles** (opt-in `--csl style.csl`, needs `tex2word[csl]`): a
|
|
169
|
+
genuine `citeproc-py` engine formats in-text citations and the reference list
|
|
170
|
+
against any `.csl` style, with proper sorting; the built-in heuristic is the
|
|
171
|
+
fallback. `\nocite{key}`/`\nocite{*}` are honoured.
|
|
172
|
+
|
|
173
|
+
- **Front-end choice**: the default **`pure`** front-end (pylatexenc-based) is
|
|
174
|
+
the validated engine — it converts the corpus and three real-paper UATs at
|
|
175
|
+
100% native-OMML math, 100% valid output, 0 aborts. `--frontend latexml` is
|
|
176
|
+
**experimental**: it shells out to a real `latexml` install for genuine TeX
|
|
177
|
+
expansion, but is not yet proven end-to-end (it silently falls back to `pure`
|
|
178
|
+
on any failure; see the advisory `real-tool` CI lane).
|
|
179
|
+
|
|
180
|
+
## Architecture
|
|
181
|
+
|
|
182
|
+
```
|
|
183
|
+
LaTeX ─▶ front-end (preprocess, macro-expand, pylatexenc walk) ─▶ IR
|
|
184
|
+
─▶ transforms (cross-reference resolution) ─▶ IR
|
|
185
|
+
─▶ back-end (raw OOXML via lxml: document/styles/numbering) ─▶ .docx
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
The **IR** ([`src/latex2word/ir.py`](src/latex2word/ir.py)) is the format-neutral seam, so a LaTeXML front-end can replace the static parser post-V1 without touching the back-end.
|
|
189
|
+
|
|
190
|
+
## Development
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
uv run pytest # tests
|
|
194
|
+
uv run ruff check src tests
|
|
195
|
+
uv run mypy src
|
|
196
|
+
uv run pre-commit install # optional: run the lint/type gate on every commit
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
Releases: pushing a `vX.Y.Z` tag builds the wheel/sdist and publishes to PyPI
|
|
200
|
+
(via the `Release` workflow, using PyPI Trusted Publishing). Notable changes are
|
|
201
|
+
recorded in [`CHANGELOG.md`](CHANGELOG.md).
|
|
202
|
+
|
|
203
|
+
## License
|
|
204
|
+
|
|
205
|
+
MIT — see [`LICENSE`](LICENSE).
|
|
206
|
+
|
|
207
|
+
## Author
|
|
208
|
+
|
|
209
|
+
Yifan Yang <yfyang.86@hotmail.com>
|
tex2word-0.8.1/README.md
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# latex2word
|
|
2
|
+
|
|
3
|
+
An open-source, cross-platform **LaTeX → Microsoft Word (`.docx`)** converter
|
|
4
|
+
that produces *genuinely editable* Word: native paragraph styles, **native OMML
|
|
5
|
+
equations** (editable in Word's equation editor, not images), and **live,
|
|
6
|
+
auto-renumbering fields** for equation/figure/table numbers and
|
|
7
|
+
cross-references.
|
|
8
|
+
|
|
9
|
+
> **Status: production-grade.** Foundation, math core (direct LaTeX→OMML), the
|
|
10
|
+
> live cross-reference/field plumbing (the differentiator), image embedding, the
|
|
11
|
+
> BibTeX bibliography, and the robustness layer (math cascade, coverage report,
|
|
12
|
+
> OOXML validator, round-trip manifest) are all in. See
|
|
13
|
+
> [`CHANGELOG.md`](CHANGELOG.md) for the release history.
|
|
14
|
+
|
|
15
|
+
## Why
|
|
16
|
+
|
|
17
|
+
Pandoc/`texmath` is the open-source reference but **drops equation numbers**,
|
|
18
|
+
can dump raw LaTeX for labelled equations, and emits *static* cross-references.
|
|
19
|
+
No open tool produces editable styles **and** native OMML **and** live
|
|
20
|
+
field-based numbering. That gap is the product.
|
|
21
|
+
|
|
22
|
+
## Install & use
|
|
23
|
+
|
|
24
|
+
Requires Python 3.12+.
|
|
25
|
+
|
|
26
|
+
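From PyPI (the distribution is published as `tex2word`; the command-line tool and import package are named `latex2word`):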
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install tex2word # core (PNG/JPEG figures)
|
|
30
|
+
pip install "tex2word[pdf]" # + PDF figure rasterisation (pypdfium2, Apache-2.0)
|
|
31
|
+
pip install "tex2word[mathml]" # + LaTeX->MathML->OMML for hard math (latex2mathml)
|
|
32
|
+
pip install "tex2word[csl]" # + real CSL citation styles (citeproc-py)
|
|
33
|
+
pip install "tex2word[pdf,mathml,csl,mathimg]" # everything
|
|
34
|
+
|
|
35
|
+
latex2word convert paper.tex -o paper.docx
|
|
36
|
+
latex2word convert paper.tex -o paper.docx --report report.json
|
|
37
|
+
latex2word convert paper.tex -o paper.docx --reference-doc journal.docx
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Or, for a development checkout with [uv](https://docs.astral.sh/uv/):
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
uv sync --all-extras
|
|
44
|
+
uv run latex2word convert paper.tex -o paper.docx
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Or from Python:
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from latex2word import convert_source, convert_file
|
|
51
|
+
|
|
52
|
+
out_path, result = convert_file("paper.tex")
|
|
53
|
+
print(result.report.summary()) # math coverage + warnings
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## What works today
|
|
57
|
+
|
|
58
|
+
- **Reference Word templates** ★: `--reference-doc TEMPLATE.docx` adopts a
|
|
59
|
+
journal/corporate template's styles, theme and page geometry (size + margins),
|
|
60
|
+
so the output matches the required look — while keeping the live fields below.
|
|
61
|
+
Our custom styles are merged in so nothing renders unstyled.
|
|
62
|
+
- **Structure & styles**: `\title`/`\author`/`\date`/`abstract`, `\section`…
|
|
63
|
+
`\subparagraph` → Word Title/Heading 1–4 (visible in the Navigation pane),
|
|
64
|
+
paragraphs, `\textbf`/`\emph`/`\texttt`/`\underline`/`\textsc`, quotes, code.
|
|
65
|
+
Sections are **auto-numbered** (multilevel `1` / `1.1` / `1.1.1`) like LaTeX,
|
|
66
|
+
with `\section*` unnumbered; `\ref` to a section shows its live number. In
|
|
67
|
+
**book/report** documents `\chapter` is the top level (sections nest under it)
|
|
68
|
+
and `\appendix` switches to lettered headings (`A`, `A.1`).
|
|
69
|
+
- **Math (direct LaTeX→OMML)**: inline `$…$`, display `\[…\]`,
|
|
70
|
+
`equation`/`align`/`gather`; fractions, sub/superscripts, roots, `\sum`/`\int`
|
|
71
|
+
with limits, accents, `\left…\right` delimiters, matrices/`cases`, Greek and
|
|
72
|
+
hundreds of symbols, `\mathbb`/`\mathcal`/`\mathbf`, functions (`\sin`, `\lim`).
|
|
73
|
+
`align*`/`aligned` line up at the `&` (a column-justified matrix); numbered
|
|
74
|
+
`align` keeps a live number per line.
|
|
75
|
+
- **Live fields** ★: numbered equations get `SEQ Equation` fields inside
|
|
76
|
+
bookmarks; `\ref`/`\eqref`/`\pageref` become `REF`/`PAGEREF` fields; figure
|
|
77
|
+
and table captions get `SEQ Figure`/`SEQ Table`. Numbers auto-renumber in
|
|
78
|
+
Word on field refresh. `--number-by-section` switches to `N.M` per-section
|
|
79
|
+
numbering (`STYLEREF` + `SEQ \s`), book/report style.
|
|
80
|
+
- **Table of contents** ★: `\tableofcontents` → a live Word `TOC` field (rebuilds
|
|
81
|
+
from heading styles on refresh); `\listoffigures`/`\listoftables` →
|
|
82
|
+
caption-sequence lists. Schema-valid and round-tripping.
|
|
83
|
+
- **Lists, tables, figures**: `itemize`/`enumerate`, `tabular`/`longtable` with
|
|
84
|
+
`booktabs`, `\multicolumn`→column span, `\multirow`→vertical merge, and
|
|
85
|
+
repeating header rows; captioned `figure`/`table`, `\includegraphics`
|
|
86
|
+
(PNG/JPEG embedded directly; **PDF figures rasterised** to PNG when the
|
|
87
|
+
optional `tex2word[pdf]` extra — pypdfium2 — is installed). An
|
|
88
|
+
`\includegraphics` in running text (an icon/logo) is embedded **inline**.
|
|
89
|
+
- **Custom macros**: `\newcommand`/`\renewcommand`/`\def` are expanded before
|
|
90
|
+
parsing. Common `mathtools`/`physics` math (`\abs`, `\norm`, `\dv`, `\ket`, …)
|
|
91
|
+
and `siunitx` (`\SI{9.81}{\meter\per\second\squared}` → `9.81 m/s²`, `\num`,
|
|
92
|
+
`\ang`) work as built-ins when not user-defined. **Acronyms** (`glossaries`):
|
|
93
|
+
`\newacronym` + `\gls`/`\acrshort`/`\acrlong`/`\acrfull` expand with the
|
|
94
|
+
first-use "long (short)" rule.
|
|
95
|
+
- **Footnotes**: `\footnote` → native Word footnotes (`footnotes.xml`), not
|
|
96
|
+
inlined text; footnote bodies keep their formatting and math.
|
|
97
|
+
- **Inline verbatim & smart refs**: `\verb|...|` → literal monospace;
|
|
98
|
+
`\cref`/`\Cref`/`\autoref` add cleveref-style type prefixes ("fig. N" /
|
|
99
|
+
"Figure N").
|
|
100
|
+
- **Theorem environments**: `theorem`/`lemma`/`proof`/`definition`/… render
|
|
101
|
+
with a bold numbered lead (live `SEQ` per kind), optional `[title]`, and a
|
|
102
|
+
QED mark for proofs; `\ref` to a theorem shows its number.
|
|
103
|
+
- **Algorithms**: `algorithm` + `algorithmic`/`algpseudocode`/`algorithm2e` →
|
|
104
|
+
numbered, indented pseudocode with bold keywords, inline OMML math, and a live
|
|
105
|
+
`SEQ Algorithm` caption.
|
|
106
|
+
- **Graceful degradation**: unknown constructs never abort; they pass through
|
|
107
|
+
best-effort and are logged to the conversion report (math coverage telemetry
|
|
108
|
+
included). The math **decision-cascade** (direct OMML → LaTeX→MathML→OMML
|
|
109
|
+
secondary path → image fallback `--math-image-fallback` → raw) records which
|
|
110
|
+
path each equation took.
|
|
111
|
+
- **Round-trip**: the IR is embedded as a JSON manifest custom part, so the
|
|
112
|
+
exact IR can be recovered from the `.docx` (`latex2word.roundtrip.recover_ir`)
|
|
113
|
+
and converted **back to LaTeX** (`latex2word to-latex out.docx`); the corpus
|
|
114
|
+
`latex→docx→latex` round trip keeps the same block structure. Reconcile (on by default)
|
|
115
|
+
merges Word edits against the manifest, and **Word Track Changes are accepted**
|
|
116
|
+
on read (insertions kept, deletions dropped).
|
|
117
|
+
- **Reports & validation**: `--report report.json|report.html` writes a coverage
|
|
118
|
+
report; `latex2word.validate.validate_docx` structurally validates output;
|
|
119
|
+
`latex2word benchmark <dir>` reports a quantitative baseline (math-OMML %,
|
|
120
|
+
validity, warnings, 0-abort) across a paper set (CI-gated on the corpus + UATs:
|
|
121
|
+
currently 100% native-OMML math, 100% valid, 0 aborts).
|
|
122
|
+
- **Reproducible**: set `SOURCE_DATE_EPOCH` and the same input yields
|
|
123
|
+
byte-identical output (the `.docx` ZIP is built deterministically).
|
|
124
|
+
- **Live citations** (opt-in `--citations zotero`): emit
|
|
125
|
+
`ADDIN ZOTERO_ITEM CSL_CITATION` / `CSL_BIBLIOGRAPHY` fields so citations are
|
|
126
|
+
editable by Zotero/Mendeley in Word (default is static formatted text).
|
|
127
|
+
- **Real CSL styles** (opt-in `--csl style.csl`, needs `tex2word[csl]`): a
|
|
128
|
+
genuine `citeproc-py` engine formats in-text citations and the reference list
|
|
129
|
+
against any `.csl` style, with proper sorting; the built-in heuristic is the
|
|
130
|
+
fallback. `\nocite{key}`/`\nocite{*}` are honoured.
|
|
131
|
+
|
|
132
|
+
- **Front-end choice**: the default **`pure`** front-end (pylatexenc-based) is
|
|
133
|
+
the validated engine — it converts the corpus and three real-paper UATs at
|
|
134
|
+
100% native-OMML math, 100% valid output, 0 aborts. `--frontend latexml` is
|
|
135
|
+
**experimental**: it shells out to a real `latexml` install for genuine TeX
|
|
136
|
+
expansion, but is not yet proven end-to-end (it silently falls back to `pure`
|
|
137
|
+
on any failure; see the advisory `real-tool` CI lane).
|
|
138
|
+
|
|
139
|
+
## Architecture
|
|
140
|
+
|
|
141
|
+
```
|
|
142
|
+
LaTeX ─▶ front-end (preprocess, macro-expand, pylatexenc walk) ─▶ IR
|
|
143
|
+
─▶ transforms (cross-reference resolution) ─▶ IR
|
|
144
|
+
─▶ back-end (raw OOXML via lxml: document/styles/numbering) ─▶ .docx
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
The **IR** ([`src/latex2word/ir.py`](src/latex2word/ir.py)) is the format-neutral seam, so a LaTeXML front-end can replace the static parser post-V1 without touching the back-end.
|
|
148
|
+
|
|
149
|
+
## Development
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
uv run pytest # tests
|
|
153
|
+
uv run ruff check src tests
|
|
154
|
+
uv run mypy src
|
|
155
|
+
uv run pre-commit install # optional: run the lint/type gate on every commit
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
Releases: pushing a `vX.Y.Z` tag builds the wheel/sdist and publishes to PyPI
|
|
159
|
+
(via the `Release` workflow, using PyPI Trusted Publishing). Notable changes are
|
|
160
|
+
recorded in [`CHANGELOG.md`](CHANGELOG.md).
|
|
161
|
+
|
|
162
|
+
## License
|
|
163
|
+
|
|
164
|
+
MIT — see [`LICENSE`](LICENSE).
|
|
165
|
+
|
|
166
|
+
## Author
|
|
167
|
+
|
|
168
|
+
Yifan Yang <yfyang.86@hotmail.com>
|
tex2word-0.8.1/pyproject.toml
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "tex2word"
|
|
3
|
+
version = "0.8.1"
|
|
4
|
+
description = "Open-source, production-grade LaTeX -> Microsoft Word (.docx) converter with native OMML math and live fields"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "Yifan Yang", email = "yfyang.86@hotmail.com" }
|
|
9
|
+
]
|
|
10
|
+
maintainers = [
|
|
11
|
+
{ name = "Yifan Yang", email = "yfyang.86@hotmail.com" }
|
|
12
|
+
]
|
|
13
|
+
license = "MIT"
|
|
14
|
+
license-files = ["LICENSE"]
|
|
15
|
+
keywords = [
|
|
16
|
+
"latex",
|
|
17
|
+
"word",
|
|
18
|
+
"docx",
|
|
19
|
+
"converter",
|
|
20
|
+
"ooxml",
|
|
21
|
+
"omml",
|
|
22
|
+
"tex",
|
|
23
|
+
"office",
|
|
24
|
+
"document",
|
|
25
|
+
]
|
|
26
|
+
classifiers = [
|
|
27
|
+
"Development Status :: 4 - Beta",
|
|
28
|
+
"Environment :: Console",
|
|
29
|
+
"Intended Audience :: Science/Research",
|
|
30
|
+
"Intended Audience :: End Users/Desktop",
|
|
31
|
+
"Operating System :: OS Independent",
|
|
32
|
+
"Programming Language :: Python :: 3",
|
|
33
|
+
"Programming Language :: Python :: 3.12",
|
|
34
|
+
"Programming Language :: Python :: 3.13",
|
|
35
|
+
"Topic :: Text Processing :: Markup :: LaTeX",
|
|
36
|
+
"Topic :: Text Processing :: Markup :: XML",
|
|
37
|
+
"Topic :: Office/Business :: Office Suites",
|
|
38
|
+
"Typing :: Typed",
|
|
39
|
+
]
|
|
40
|
+
dependencies = [
|
|
41
|
+
"lxml>=5.0",
|
|
42
|
+
"pylatexenc>=2.10",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[project.urls]
|
|
46
|
+
Homepage = "https://github.com/yfyang86/tex2word"
|
|
47
|
+
Repository = "https://github.com/yfyang86/tex2word"
|
|
48
|
+
Issues = "https://github.com/yfyang86/tex2word/issues"
|
|
49
|
+
Changelog = "https://github.com/yfyang86/tex2word/blob/main/CHANGELOG.md"
|
|
50
|
+
|
|
51
|
+
[project.optional-dependencies]
|
|
52
|
+
# Rasterise PDF figures to embeddable PNG (Word can't embed PDF directly).
|
|
53
|
+
# pypdfium2 (PDFium) + Pillow are both permissive (Apache-2.0/BSD/HPND) -- no GPL.
|
|
54
|
+
pdf = ["pypdfium2>=4", "pillow>=10"]
|
|
55
|
+
# Secondary math path: LaTeX -> presentation MathML -> OMML for the hard 20%.
|
|
56
|
+
mathml = ["latex2mathml>=3.77"]
|
|
57
|
+
# Image fallback for math that can't become OMML (pure-Python; no TeX needed).
|
|
58
|
+
mathimg = ["matplotlib>=3.7"]
|
|
59
|
+
# Real CSL citation processor: format citations/bibliography against a .csl style.
|
|
60
|
+
csl = ["citeproc-py>=0.6"]
|
|
61
|
+
|
|
62
|
+
[project.scripts]
|
|
63
|
+
latex2word = "latex2word.cli:main"
|
|
64
|
+
|
|
65
|
+
[build-system]
|
|
66
|
+
requires = ["uv_build>=0.8.17,<0.9.0"]
|
|
67
|
+
build-backend = "uv_build"
|
|
68
|
+
|
|
69
|
+
# The PyPI distribution is published as "tex2word", but the import package
|
|
70
|
+
# remains "latex2word", so point the build backend at the actual module dir.
|
|
71
|
+
[tool.uv.build-backend]
|
|
72
|
+
module-name = "latex2word"
|
|
73
|
+
|
|
74
|
+
[dependency-groups]
|
|
75
|
+
dev = [
|
|
76
|
+
"pytest>=8.0",
|
|
77
|
+
"ruff>=0.6",
|
|
78
|
+
"mypy>=1.11",
|
|
79
|
+
"lxml-stubs>=0.5.1",
|
|
80
|
+
"pypdfium2>=4",
|
|
81
|
+
"pillow>=10",
|
|
82
|
+
"latex2mathml>=3.77",
|
|
83
|
+
"matplotlib>=3.7",
|
|
84
|
+
]
|
|
85
|
+
|
|
86
|
+
[tool.ruff]
|
|
87
|
+
line-length = 100
|
|
88
|
+
target-version = "py312"
|
|
89
|
+
|
|
90
|
+
[tool.ruff.lint]
|
|
91
|
+
select = ["E", "F", "I", "UP", "B"]
|
|
92
|
+
|
|
93
|
+
[tool.ruff.lint.per-file-ignores]
|
|
94
|
+
# These modules embed long XML/HTML string literals (templates / reports).
|
|
95
|
+
"src/latex2word/backend/package.py" = ["E501"]
|
|
96
|
+
"src/latex2word/report.py" = ["E501"]
|
|
97
|
+
|
|
98
|
+
[tool.mypy]
|
|
99
|
+
python_version = "3.12"
|
|
100
|
+
ignore_missing_imports = true
|
|
101
|
+
|
|
102
|
+
[tool.pytest.ini_options]
|
|
103
|
+
testpaths = ["tests"]
|
tex2word-0.8.1/src/latex2word/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""latex2word: an open-source LaTeX -> Microsoft Word (.docx) converter.
|
|
2
|
+
|
|
3
|
+
See ``README.md`` for the architecture overview. The public entry points are
|
|
4
|
+
:func:`~latex2word.pipeline.convert_source` and
|
|
5
|
+
:func:`~latex2word.pipeline.convert_file`.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from .pipeline import ConversionResult, convert_file, convert_source
|
|
11
|
+
|
|
12
|
+
__all__ = ["ConversionResult", "convert_file", "convert_source"]
|
|
13
|
+
__version__ = "0.8.1"
|
tex2word-0.8.1/src/latex2word/backend/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""OOXML back-end: turns the IR into a `.docx` (OPC) package."""
|