tex2word 0.8.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. tex2word-0.8.1/LICENSE +21 -0
  2. tex2word-0.8.1/PKG-INFO +209 -0
  3. tex2word-0.8.1/README.md +168 -0
  4. tex2word-0.8.1/pyproject.toml +103 -0
  5. tex2word-0.8.1/src/latex2word/__init__.py +13 -0
  6. tex2word-0.8.1/src/latex2word/backend/__init__.py +1 -0
  7. tex2word-0.8.1/src/latex2word/backend/document.py +1138 -0
  8. tex2word-0.8.1/src/latex2word/backend/fields.py +97 -0
  9. tex2word-0.8.1/src/latex2word/backend/images.py +94 -0
  10. tex2word-0.8.1/src/latex2word/backend/latex_writer.py +425 -0
  11. tex2word-0.8.1/src/latex2word/backend/numbering.py +127 -0
  12. tex2word-0.8.1/src/latex2word/backend/ooxml.py +80 -0
  13. tex2word-0.8.1/src/latex2word/backend/package.py +198 -0
  14. tex2word-0.8.1/src/latex2word/backend/raster.py +61 -0
  15. tex2word-0.8.1/src/latex2word/benchmark.py +101 -0
  16. tex2word-0.8.1/src/latex2word/bib/__init__.py +12 -0
  17. tex2word-0.8.1/src/latex2word/bib/bbl.py +69 -0
  18. tex2word-0.8.1/src/latex2word/bib/bibtex.py +195 -0
  19. tex2word-0.8.1/src/latex2word/bib/csl_engine.py +89 -0
  20. tex2word-0.8.1/src/latex2word/bib/render.py +330 -0
  21. tex2word-0.8.1/src/latex2word/bib/zotero.py +78 -0
  22. tex2word-0.8.1/src/latex2word/cli.py +214 -0
  23. tex2word-0.8.1/src/latex2word/frontend/__init__.py +7 -0
  24. tex2word-0.8.1/src/latex2word/frontend/algorithms.py +223 -0
  25. tex2word-0.8.1/src/latex2word/frontend/colors.py +134 -0
  26. tex2word-0.8.1/src/latex2word/frontend/docx_reader.py +779 -0
  27. tex2word-0.8.1/src/latex2word/frontend/latexml.py +283 -0
  28. tex2word-0.8.1/src/latex2word/frontend/macros.py +444 -0
  29. tex2word-0.8.1/src/latex2word/frontend/parser.py +1701 -0
  30. tex2word-0.8.1/src/latex2word/frontend/preprocess.py +118 -0
  31. tex2word-0.8.1/src/latex2word/frontend/siunitx.py +101 -0
  32. tex2word-0.8.1/src/latex2word/ir.py +432 -0
  33. tex2word-0.8.1/src/latex2word/mathml/__init__.py +7 -0
  34. tex2word-0.8.1/src/latex2word/mathml/cascade.py +129 -0
  35. tex2word-0.8.1/src/latex2word/mathml/imagemath.py +107 -0
  36. tex2word-0.8.1/src/latex2word/mathml/latex_math.py +596 -0
  37. tex2word-0.8.1/src/latex2word/mathml/mathml_to_omml.py +184 -0
  38. tex2word-0.8.1/src/latex2word/mathml/omml.py +300 -0
  39. tex2word-0.8.1/src/latex2word/mathml/omml_reader.py +242 -0
  40. tex2word-0.8.1/src/latex2word/mathml/symbols.py +122 -0
  41. tex2word-0.8.1/src/latex2word/pipeline.py +172 -0
  42. tex2word-0.8.1/src/latex2word/py.typed +0 -0
  43. tex2word-0.8.1/src/latex2word/render_check.py +128 -0
  44. tex2word-0.8.1/src/latex2word/report.py +175 -0
  45. tex2word-0.8.1/src/latex2word/roundtrip.py +361 -0
  46. tex2word-0.8.1/src/latex2word/templates/__init__.py +10 -0
  47. tex2word-0.8.1/src/latex2word/templates/reference.py +244 -0
  48. tex2word-0.8.1/src/latex2word/templates/styles.xml +146 -0
  49. tex2word-0.8.1/src/latex2word/transforms/__init__.py +7 -0
  50. tex2word-0.8.1/src/latex2word/transforms/crossref.py +134 -0
  51. tex2word-0.8.1/src/latex2word/validate.py +203 -0
tex2word-0.8.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Yifan Yang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
tex2word-0.8.1/PKG-INFO ADDED
@@ -0,0 +1,209 @@
1
+ Metadata-Version: 2.4
2
+ Name: tex2word
3
+ Version: 0.8.1
4
+ Summary: Open-source, production-grade LaTeX -> Microsoft Word (.docx) converter with native OMML math and live fields
5
+ Keywords: latex,word,docx,converter,ooxml,omml,tex,office,document
6
+ Author: Yifan Yang
7
+ Author-email: Yifan Yang <yfyang.86@hotmail.com>
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Environment :: Console
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Intended Audience :: End Users/Desktop
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Text Processing :: Markup :: LaTeX
19
+ Classifier: Topic :: Text Processing :: Markup :: XML
20
+ Classifier: Topic :: Office/Business :: Office Suites
21
+ Classifier: Typing :: Typed
22
+ Requires-Dist: lxml>=5.0
23
+ Requires-Dist: pylatexenc>=2.10
24
+ Requires-Dist: citeproc-py>=0.6 ; extra == 'csl'
25
+ Requires-Dist: matplotlib>=3.7 ; extra == 'mathimg'
26
+ Requires-Dist: latex2mathml>=3.77 ; extra == 'mathml'
27
+ Requires-Dist: pypdfium2>=4 ; extra == 'pdf'
28
+ Requires-Dist: pillow>=10 ; extra == 'pdf'
29
+ Maintainer: Yifan Yang
30
+ Maintainer-email: Yifan Yang <yfyang.86@hotmail.com>
31
+ Requires-Python: >=3.12
32
+ Project-URL: Changelog, https://github.com/yfyang86/tex2word/blob/main/CHANGELOG.md
33
+ Project-URL: Homepage, https://github.com/yfyang86/tex2word
34
+ Project-URL: Issues, https://github.com/yfyang86/tex2word/issues
35
+ Project-URL: Repository, https://github.com/yfyang86/tex2word
36
+ Provides-Extra: csl
37
+ Provides-Extra: mathimg
38
+ Provides-Extra: mathml
39
+ Provides-Extra: pdf
40
+ Description-Content-Type: text/markdown
41
+
42
+ # latex2word
43
+
44
+ An open-source, cross-platform **LaTeX → Microsoft Word (`.docx`)** converter
45
+ that produces *genuinely editable* Word: native paragraph styles, **native OMML
46
+ equations** (editable in Word's equation editor, not images), and **live,
47
+ auto-renumbering fields** for equation/figure/table numbers and
48
+ cross-references.
49
+
50
+ > **Status: production-grade.** Foundation, math core (direct LaTeX→OMML), the
51
+ > live cross-reference/field plumbing (the differentiator), image embedding, the
52
+ > BibTeX bibliography, and the robustness layer (math cascade, coverage report,
53
+ > OOXML validator, round-trip manifest) are all in. See
54
+ > [`CHANGELOG.md`](CHANGELOG.md) for the release history.
55
+
56
+ ## Why
57
+
58
+ Pandoc/`texmath` is the open-source reference but **drops equation numbers**,
59
+ can dump raw LaTeX for labelled equations, and emits *static* cross-references.
60
+ No open tool produces editable styles **and** native OMML **and** live
61
+ field-based numbering. That gap is the product.
62
+
63
+ ## Install & use
64
+
65
+ Requires Python 3.12+.
66
+
67
+ From PyPI:
68
+
69
+ ```bash
70
+ pip install tex2word # core (PNG/JPEG figures)
71
+ pip install "tex2word[pdf]" # + PDF figure rasterisation (pypdfium2, Apache-2.0)
72
+ pip install "tex2word[mathml]" # + LaTeX->MathML->OMML for hard math (latex2mathml)
73
+ pip install "tex2word[csl]" # + real CSL citation styles (citeproc-py)
74
+ pip install "tex2word[pdf,mathml,csl,mathimg]" # everything
75
+
76
+ latex2word convert paper.tex -o paper.docx
77
+ latex2word convert paper.tex -o paper.docx --report report.json
78
+ latex2word convert paper.tex -o paper.docx --reference-doc journal.docx
79
+ ```
80
+
81
+ Or, for a development checkout with [uv](https://docs.astral.sh/uv/):
82
+
83
+ ```bash
84
+ uv sync --all-extras
85
+ uv run latex2word convert paper.tex -o paper.docx
86
+ ```
87
+
88
+ Or from Python:
89
+
90
+ ```python
91
+ from latex2word import convert_source, convert_file
92
+
93
+ out_path, result = convert_file("paper.tex")
94
+ print(result.report.summary()) # math coverage + warnings
95
+ ```
96
+
97
+ ## What works today
98
+
99
+ - **Reference Word templates** ★: `--reference-doc TEMPLATE.docx` adopts a
100
+ journal/corporate template's styles, theme and page geometry (size + margins),
101
+ so the output matches the required look — while keeping the live fields below.
102
+ Our custom styles are merged in so nothing renders unstyled.
103
+ - **Structure & styles**: `\title`/`\author`/`\date`/`abstract`, `\section`…
104
+ `\subparagraph` → Word Title/Heading 1–4 (visible in the Navigation pane),
105
+ paragraphs, `\textbf`/`\emph`/`\texttt`/`\underline`/`\textsc`, quotes, code.
106
+ Sections are **auto-numbered** (multilevel `1` / `1.1` / `1.1.1`) like LaTeX,
107
+ with `\section*` unnumbered; `\ref` to a section shows its live number. In
108
+ **book/report** documents `\chapter` is the top level (sections nest under it)
109
+ and `\appendix` switches to lettered headings (`A`, `A.1`).
110
+ - **Math (direct LaTeX→OMML)**: inline `$…$`, display `\[…\]`,
111
+ `equation`/`align`/`gather`; fractions, sub/superscripts, roots, `\sum`/`\int`
112
+ with limits, accents, `\left…\right` delimiters, matrices/`cases`, Greek and
113
+ hundreds of symbols, `\mathbb`/`\mathcal`/`\mathbf`, functions (`\sin`, `\lim`).
114
+ `align*`/`aligned` line up at the `&` (a column-justified matrix); numbered
115
+ `align` keeps a live number per line.
116
+ - **Live fields** ★: numbered equations get `SEQ Equation` fields inside
117
+ bookmarks; `\ref`/`\eqref`/`\pageref` become `REF`/`PAGEREF` fields; figure
118
+ and table captions get `SEQ Figure`/`SEQ Table`. Numbers auto-renumber in
119
+ Word on field refresh. `--number-by-section` switches to `N.M` per-section
120
+ numbering (`STYLEREF` + `SEQ \s`), book/report style.
121
+ - **Table of contents** ★: `\tableofcontents` → a live Word `TOC` field (rebuilds
122
+ from heading styles on refresh); `\listoffigures`/`\listoftables` →
123
+ caption-sequence lists. Schema-valid and round-tripping.
124
+ - **Lists, tables, figures**: `itemize`/`enumerate`, `tabular`/`longtable` with
125
+ `booktabs`, `\multicolumn`→column span, `\multirow`→vertical merge, and
126
+ repeating header rows; captioned `figure`/`table`, `\includegraphics`
127
+ (PNG/JPEG embedded directly; **PDF figures rasterised** to PNG when the
128
+ optional `tex2word[pdf]` extra — pypdfium2 — is installed). An
129
+ `\includegraphics` in running text (an icon/logo) is embedded **inline**.
130
+ - **Custom macros**: `\newcommand`/`\renewcommand`/`\def` are expanded before
131
+ parsing. Common `mathtools`/`physics` math (`\abs`, `\norm`, `\dv`, `\ket`, …)
132
+ and `siunitx` (`\SI{9.81}{\meter\per\second\squared}` → `9.81 m/s²`, `\num`,
133
+ `\ang`) work as built-ins when not user-defined. **Acronyms** (`glossaries`):
134
+ `\newacronym` + `\gls`/`\acrshort`/`\acrlong`/`\acrfull` expand with the
135
+ first-use "long (short)" rule.
136
+ - **Footnotes**: `\footnote` → native Word footnotes (`footnotes.xml`), not
137
+ inlined text; footnote bodies keep their formatting and math.
138
+ - **Inline verbatim & smart refs**: `\verb|...|` → literal monospace;
139
+ `\cref`/`\Cref`/`\autoref` add cleveref-style type prefixes ("fig. N" /
140
+ "Figure N").
141
+ - **Theorem environments**: `theorem`/`lemma`/`proof`/`definition`/… render
142
+ with a bold numbered lead (live `SEQ` per kind), optional `[title]`, and a
143
+ QED mark for proofs; `\ref` to a theorem shows its number.
144
+ - **Algorithms**: `algorithm` + `algorithmic`/`algpseudocode`/`algorithm2e` →
145
+ numbered, indented pseudocode with bold keywords, inline OMML math, and a live
146
+ `SEQ Algorithm` caption.
147
+ - **Graceful degradation**: unknown constructs never abort; they pass through
148
+ best-effort and are logged to the conversion report (math coverage telemetry
149
+ included). The math **decision-cascade** (direct OMML → LaTeX→MathML→OMML
150
+ secondary path → image fallback `--math-image-fallback` → raw) records which
151
+ path each equation took.
152
+ - **Round-trip**: the IR is embedded as a JSON manifest custom part, so the
153
+ exact IR can be recovered from the `.docx` (`latex2word.roundtrip.recover_ir`)
154
+ and converted **back to LaTeX** (`latex2word to-latex out.docx`); the corpus
155
+ `latex→docx→latex` keeps the same block structure. Reconcile (on by default)
156
+ merges Word edits against the manifest, and **Word Track Changes are accepted**
157
+ on read (insertions kept, deletions dropped).
158
+ - **Reports & validation**: `--report report.json|report.html` writes a coverage
159
+ report; `latex2word.validate.validate_docx` structurally validates output;
160
+ `latex2word benchmark <dir>` reports a quantitative baseline (math-OMML %,
161
+ validity, warnings, 0-abort) across a paper set (CI-gated on the corpus + UATs:
162
+ currently 100% native-OMML math, 100% valid, 0 aborts).
163
+ - **Reproducible**: set `SOURCE_DATE_EPOCH` and the same input yields
164
+ byte-identical output (the `.docx` ZIP is built deterministically).
165
+ - **Live citations** (opt-in `--citations zotero`): emit
166
+ `ADDIN ZOTERO_ITEM CSL_CITATION` / `CSL_BIBLIOGRAPHY` fields so citations are
167
+ editable by Zotero/Mendeley in Word (default is static formatted text).
168
+ - **Real CSL styles** (opt-in `--csl style.csl`, needs `tex2word[csl]`): a
169
+ genuine `citeproc-py` engine formats in-text citations and the reference list
170
+ against any `.csl` style, with proper sorting; the built-in heuristic is the
171
+ fallback. `\nocite{key}`/`\nocite{*}` are honoured.
172
+
173
+ - **Front-end choice**: the default **`pure`** front-end (pylatexenc-based) is
174
+ the validated engine — it converts the corpus and three real-paper UATs at
175
+ 100% native-OMML math, 100% valid output, 0 aborts. `--frontend latexml` is
176
+ **experimental**: it shells out to a real `latexml` install for genuine TeX
177
+ expansion, but is not yet proven end-to-end (it silently falls back to `pure`
178
+ on any failure; see the advisory `real-tool` CI lane).
179
+
180
+ ## Architecture
181
+
182
+ ```
183
+ LaTeX ─▶ front-end (preprocess, macro-expand, pylatexenc walk) ─▶ IR
184
+ ─▶ transforms (cross-reference resolution) ─▶ IR
185
+ ─▶ back-end (raw OOXML via lxml: document/styles/numbering) ─▶ .docx
186
+ ```
187
+
188
+ The **IR** ([`src/latex2word/ir.py`](src/latex2word/ir.py)) is the format-neutral seam, so a LaTeXML front-end can replace the static parser post-V1 without touching the back-end.
189
+
190
+ ## Development
191
+
192
+ ```bash
193
+ uv run pytest # tests
194
+ uv run ruff check src tests
195
+ uv run mypy src
196
+ uv run pre-commit install # optional: run the lint/type gate on every commit
197
+ ```
198
+
199
+ Releases: pushing a `vX.Y.Z` tag builds the wheel/sdist and publishes to PyPI
200
+ (via the `Release` workflow, using PyPI Trusted Publishing). Notable changes are
201
+ recorded in [`CHANGELOG.md`](CHANGELOG.md).
202
+
203
+ ## License
204
+
205
+ MIT — see [`LICENSE`](LICENSE).
206
+
207
+ ## Author
208
+
209
+ Yifan Yang <yfyang.86@hotmail.com>
tex2word-0.8.1/README.md ADDED
@@ -0,0 +1,168 @@
1
+ # latex2word
2
+
3
+ An open-source, cross-platform **LaTeX → Microsoft Word (`.docx`)** converter
4
+ that produces *genuinely editable* Word: native paragraph styles, **native OMML
5
+ equations** (editable in Word's equation editor, not images), and **live,
6
+ auto-renumbering fields** for equation/figure/table numbers and
7
+ cross-references.
8
+
9
+ > **Status: production-grade.** Foundation, math core (direct LaTeX→OMML), the
10
+ > live cross-reference/field plumbing (the differentiator), image embedding, the
11
+ > BibTeX bibliography, and the robustness layer (math cascade, coverage report,
12
+ > OOXML validator, round-trip manifest) are all in. See
13
+ > [`CHANGELOG.md`](CHANGELOG.md) for the release history.
14
+
15
+ ## Why
16
+
17
+ Pandoc/`texmath` is the open-source reference but **drops equation numbers**,
18
+ can dump raw LaTeX for labelled equations, and emits *static* cross-references.
19
+ No open tool produces editable styles **and** native OMML **and** live
20
+ field-based numbering. That gap is the product.
21
+
22
+ ## Install & use
23
+
24
+ Requires Python 3.12+.
25
+
26
+ From PyPI:
27
+
28
+ ```bash
29
+ pip install tex2word # core (PNG/JPEG figures)
30
+ pip install "tex2word[pdf]" # + PDF figure rasterisation (pypdfium2, Apache-2.0)
31
+ pip install "tex2word[mathml]" # + LaTeX->MathML->OMML for hard math (latex2mathml)
32
+ pip install "tex2word[csl]" # + real CSL citation styles (citeproc-py)
33
+ pip install "tex2word[pdf,mathml,csl,mathimg]" # everything
34
+
35
+ latex2word convert paper.tex -o paper.docx
36
+ latex2word convert paper.tex -o paper.docx --report report.json
37
+ latex2word convert paper.tex -o paper.docx --reference-doc journal.docx
38
+ ```
39
+
40
+ Or, for a development checkout with [uv](https://docs.astral.sh/uv/):
41
+
42
+ ```bash
43
+ uv sync --all-extras
44
+ uv run latex2word convert paper.tex -o paper.docx
45
+ ```
46
+
47
+ Or from Python:
48
+
49
+ ```python
50
+ from latex2word import convert_source, convert_file
51
+
52
+ out_path, result = convert_file("paper.tex")
53
+ print(result.report.summary()) # math coverage + warnings
54
+ ```
55
+
56
+ ## What works today
57
+
58
+ - **Reference Word templates** ★: `--reference-doc TEMPLATE.docx` adopts a
59
+ journal/corporate template's styles, theme and page geometry (size + margins),
60
+ so the output matches the required look — while keeping the live fields below.
61
+ Our custom styles are merged in so nothing renders unstyled.
62
+ - **Structure & styles**: `\title`/`\author`/`\date`/`abstract`, `\section`…
63
+ `\subparagraph` → Word Title/Heading 1–4 (visible in the Navigation pane),
64
+ paragraphs, `\textbf`/`\emph`/`\texttt`/`\underline`/`\textsc`, quotes, code.
65
+ Sections are **auto-numbered** (multilevel `1` / `1.1` / `1.1.1`) like LaTeX,
66
+ with `\section*` unnumbered; `\ref` to a section shows its live number. In
67
+ **book/report** documents `\chapter` is the top level (sections nest under it)
68
+ and `\appendix` switches to lettered headings (`A`, `A.1`).
69
+ - **Math (direct LaTeX→OMML)**: inline `$…$`, display `\[…\]`,
70
+ `equation`/`align`/`gather`; fractions, sub/superscripts, roots, `\sum`/`\int`
71
+ with limits, accents, `\left…\right` delimiters, matrices/`cases`, Greek and
72
+ hundreds of symbols, `\mathbb`/`\mathcal`/`\mathbf`, functions (`\sin`, `\lim`).
73
+ `align*`/`aligned` line up at the `&` (a column-justified matrix); numbered
74
+ `align` keeps a live number per line.
75
+ - **Live fields** ★: numbered equations get `SEQ Equation` fields inside
76
+ bookmarks; `\ref`/`\eqref`/`\pageref` become `REF`/`PAGEREF` fields; figure
77
+ and table captions get `SEQ Figure`/`SEQ Table`. Numbers auto-renumber in
78
+ Word on field refresh. `--number-by-section` switches to `N.M` per-section
79
+ numbering (`STYLEREF` + `SEQ \s`), book/report style.
80
+ - **Table of contents** ★: `\tableofcontents` → a live Word `TOC` field (rebuilds
81
+ from heading styles on refresh); `\listoffigures`/`\listoftables` →
82
+ caption-sequence lists. Schema-valid and round-tripping.
83
+ - **Lists, tables, figures**: `itemize`/`enumerate`, `tabular`/`longtable` with
84
+ `booktabs`, `\multicolumn`→column span, `\multirow`→vertical merge, and
85
+ repeating header rows; captioned `figure`/`table`, `\includegraphics`
86
+ (PNG/JPEG embedded directly; **PDF figures rasterised** to PNG when the
87
+ optional `tex2word[pdf]` extra — pypdfium2 — is installed). An
88
+ `\includegraphics` in running text (an icon/logo) is embedded **inline**.
89
+ - **Custom macros**: `\newcommand`/`\renewcommand`/`\def` are expanded before
90
+ parsing. Common `mathtools`/`physics` math (`\abs`, `\norm`, `\dv`, `\ket`, …)
91
+ and `siunitx` (`\SI{9.81}{\meter\per\second\squared}` → `9.81 m/s²`, `\num`,
92
+ `\ang`) work as built-ins when not user-defined. **Acronyms** (`glossaries`):
93
+ `\newacronym` + `\gls`/`\acrshort`/`\acrlong`/`\acrfull` expand with the
94
+ first-use "long (short)" rule.
95
+ - **Footnotes**: `\footnote` → native Word footnotes (`footnotes.xml`), not
96
+ inlined text; footnote bodies keep their formatting and math.
97
+ - **Inline verbatim & smart refs**: `\verb|...|` → literal monospace;
98
+ `\cref`/`\Cref`/`\autoref` add cleveref-style type prefixes ("fig. N" /
99
+ "Figure N").
100
+ - **Theorem environments**: `theorem`/`lemma`/`proof`/`definition`/… render
101
+ with a bold numbered lead (live `SEQ` per kind), optional `[title]`, and a
102
+ QED mark for proofs; `\ref` to a theorem shows its number.
103
+ - **Algorithms**: `algorithm` + `algorithmic`/`algpseudocode`/`algorithm2e` →
104
+ numbered, indented pseudocode with bold keywords, inline OMML math, and a live
105
+ `SEQ Algorithm` caption.
106
+ - **Graceful degradation**: unknown constructs never abort; they pass through
107
+ best-effort and are logged to the conversion report (math coverage telemetry
108
+ included). The math **decision-cascade** (direct OMML → LaTeX→MathML→OMML
109
+ secondary path → image fallback `--math-image-fallback` → raw) records which
110
+ path each equation took.
111
+ - **Round-trip**: the IR is embedded as a JSON manifest custom part, so the
112
+ exact IR can be recovered from the `.docx` (`latex2word.roundtrip.recover_ir`)
113
+ and converted **back to LaTeX** (`latex2word to-latex out.docx`); the corpus
114
+ `latex→docx→latex` keeps the same block structure. Reconcile (on by default)
115
+ merges Word edits against the manifest, and **Word Track Changes are accepted**
116
+ on read (insertions kept, deletions dropped).
117
+ - **Reports & validation**: `--report report.json|report.html` writes a coverage
118
+ report; `latex2word.validate.validate_docx` structurally validates output;
119
+ `latex2word benchmark <dir>` reports a quantitative baseline (math-OMML %,
120
+ validity, warnings, 0-abort) across a paper set (CI-gated on the corpus + UATs:
121
+ currently 100% native-OMML math, 100% valid, 0 aborts).
122
+ - **Reproducible**: set `SOURCE_DATE_EPOCH` and the same input yields
123
+ byte-identical output (the `.docx` ZIP is built deterministically).
124
+ - **Live citations** (opt-in `--citations zotero`): emit
125
+ `ADDIN ZOTERO_ITEM CSL_CITATION` / `CSL_BIBLIOGRAPHY` fields so citations are
126
+ editable by Zotero/Mendeley in Word (default is static formatted text).
127
+ - **Real CSL styles** (opt-in `--csl style.csl`, needs `tex2word[csl]`): a
128
+ genuine `citeproc-py` engine formats in-text citations and the reference list
129
+ against any `.csl` style, with proper sorting; the built-in heuristic is the
130
+ fallback. `\nocite{key}`/`\nocite{*}` are honoured.
131
+
132
+ - **Front-end choice**: the default **`pure`** front-end (pylatexenc-based) is
133
+ the validated engine — it converts the corpus and three real-paper UATs at
134
+ 100% native-OMML math, 100% valid output, 0 aborts. `--frontend latexml` is
135
+ **experimental**: it shells out to a real `latexml` install for genuine TeX
136
+ expansion, but is not yet proven end-to-end (it silently falls back to `pure`
137
+ on any failure; see the advisory `real-tool` CI lane).
138
+
139
+ ## Architecture
140
+
141
+ ```
142
+ LaTeX ─▶ front-end (preprocess, macro-expand, pylatexenc walk) ─▶ IR
143
+ ─▶ transforms (cross-reference resolution) ─▶ IR
144
+ ─▶ back-end (raw OOXML via lxml: document/styles/numbering) ─▶ .docx
145
+ ```
146
+
147
+ The **IR** ([`src/latex2word/ir.py`](src/latex2word/ir.py)) is the format-neutral seam, so a LaTeXML front-end can replace the static parser post-V1 without touching the back-end.
148
+
149
+ ## Development
150
+
151
+ ```bash
152
+ uv run pytest # tests
153
+ uv run ruff check src tests
154
+ uv run mypy src
155
+ uv run pre-commit install # optional: run the lint/type gate on every commit
156
+ ```
157
+
158
+ Releases: pushing a `vX.Y.Z` tag builds the wheel/sdist and publishes to PyPI
159
+ (via the `Release` workflow, using PyPI Trusted Publishing). Notable changes are
160
+ recorded in [`CHANGELOG.md`](CHANGELOG.md).
161
+
162
+ ## License
163
+
164
+ MIT — see [`LICENSE`](LICENSE).
165
+
166
+ ## Author
167
+
168
+ Yifan Yang <yfyang.86@hotmail.com>
tex2word-0.8.1/pyproject.toml ADDED
@@ -0,0 +1,103 @@
1
+ [project]
2
+ name = "tex2word"
3
+ version = "0.8.1"
4
+ description = "Open-source, production-grade LaTeX -> Microsoft Word (.docx) converter with native OMML math and live fields"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ authors = [
8
+ { name = "Yifan Yang", email = "yfyang.86@hotmail.com" }
9
+ ]
10
+ maintainers = [
11
+ { name = "Yifan Yang", email = "yfyang.86@hotmail.com" }
12
+ ]
13
+ license = "MIT"
14
+ license-files = ["LICENSE"]
15
+ keywords = [
16
+ "latex",
17
+ "word",
18
+ "docx",
19
+ "converter",
20
+ "ooxml",
21
+ "omml",
22
+ "tex",
23
+ "office",
24
+ "document",
25
+ ]
26
+ classifiers = [
27
+ "Development Status :: 4 - Beta",
28
+ "Environment :: Console",
29
+ "Intended Audience :: Science/Research",
30
+ "Intended Audience :: End Users/Desktop",
31
+ "Operating System :: OS Independent",
32
+ "Programming Language :: Python :: 3",
33
+ "Programming Language :: Python :: 3.12",
34
+ "Programming Language :: Python :: 3.13",
35
+ "Topic :: Text Processing :: Markup :: LaTeX",
36
+ "Topic :: Text Processing :: Markup :: XML",
37
+ "Topic :: Office/Business :: Office Suites",
38
+ "Typing :: Typed",
39
+ ]
40
+ dependencies = [
41
+ "lxml>=5.0",
42
+ "pylatexenc>=2.10",
43
+ ]
44
+
45
+ [project.urls]
46
+ Homepage = "https://github.com/yfyang86/tex2word"
47
+ Repository = "https://github.com/yfyang86/tex2word"
48
+ Issues = "https://github.com/yfyang86/tex2word/issues"
49
+ Changelog = "https://github.com/yfyang86/tex2word/blob/main/CHANGELOG.md"
50
+
51
+ [project.optional-dependencies]
52
+ # Rasterise PDF figures to embeddable PNG (Word can't embed PDF directly).
53
+ # pypdfium2 (PDFium) + Pillow are both permissive (Apache-2.0/BSD/HPND) -- no GPL.
54
+ pdf = ["pypdfium2>=4", "pillow>=10"]
55
+ # Secondary math path: LaTeX -> presentation MathML -> OMML for the hard 20%.
56
+ mathml = ["latex2mathml>=3.77"]
57
+ # Image fallback for math that can't become OMML (pure-Python; no TeX needed).
58
+ mathimg = ["matplotlib>=3.7"]
59
+ # Real CSL citation processor: format citations/bibliography against a .csl style.
60
+ csl = ["citeproc-py>=0.6"]
61
+
62
+ [project.scripts]
63
+ latex2word = "latex2word.cli:main"
64
+
65
+ [build-system]
66
+ requires = ["uv_build>=0.8.17,<0.9.0"]
67
+ build-backend = "uv_build"
68
+
69
+ # The PyPI distribution is published as "tex2word", but the import package
70
+ # remains "latex2word", so point the build backend at the actual module dir.
71
+ [tool.uv.build-backend]
72
+ module-name = "latex2word"
73
+
74
+ [dependency-groups]
75
+ dev = [
76
+ "pytest>=8.0",
77
+ "ruff>=0.6",
78
+ "mypy>=1.11",
79
+ "lxml-stubs>=0.5.1",
80
+ "pypdfium2>=4",
81
+ "pillow>=10",
82
+ "latex2mathml>=3.77",
83
+ "matplotlib>=3.7",
84
+ ]
85
+
86
+ [tool.ruff]
87
+ line-length = 100
88
+ target-version = "py312"
89
+
90
+ [tool.ruff.lint]
91
+ select = ["E", "F", "I", "UP", "B"]
92
+
93
+ [tool.ruff.lint.per-file-ignores]
94
+ # These modules embed long XML/HTML string literals (templates / reports).
95
+ "src/latex2word/backend/package.py" = ["E501"]
96
+ "src/latex2word/report.py" = ["E501"]
97
+
98
+ [tool.mypy]
99
+ python_version = "3.12"
100
+ ignore_missing_imports = true
101
+
102
+ [tool.pytest.ini_options]
103
+ testpaths = ["tests"]
tex2word-0.8.1/src/latex2word/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ """latex2word: an open-source LaTeX -> Microsoft Word (.docx) converter.
2
+
3
+ See ``README.md`` for the architecture overview. The public entry points are
4
+ :func:`~latex2word.pipeline.convert_source` and
5
+ :func:`~latex2word.pipeline.convert_file`.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from .pipeline import ConversionResult, convert_file, convert_source
11
+
12
+ __all__ = ["ConversionResult", "convert_file", "convert_source"]
13
+ __version__ = "0.8.1"
tex2word-0.8.1/src/latex2word/backend/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """OOXML back-end: turns the IR into a `.docx` (OPC) package."""