wps2md 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .pytest_cache/
7
+ *.pyc
wps2md-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 listeng
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
wps2md-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,88 @@
1
+ Metadata-Version: 2.4
2
+ Name: wps2md
3
+ Version: 0.1.0
4
+ Summary: Parse legacy WPS Writer (.wps) OLE2 binary files into structured text and Markdown.
5
+ Project-URL: Homepage, https://github.com/listeng/wps2md
6
+ Project-URL: Issues, https://github.com/listeng/wps2md/issues
7
+ Author-email: listeng <aosp@163.com>
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: markdown,office,ole2,parser,word,wps
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Office/Business :: Office Suites
20
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
21
+ Requires-Python: >=3.10
22
+ Requires-Dist: olefile>=0.46
23
+ Description-Content-Type: text/markdown
24
+
25
+ # wps2md
26
+
27
+ A tiny Python library and CLI for converting legacy **WPS Writer `.wps`**
28
+ files (OLE2 Word-binary format, FIB magic `0xA5EC`/`0xA5DC`) into
29
+ structured text and Markdown.
30
+
31
+ Unlike `.docx` (which is OOXML/zip and can be read by `python-docx`),
32
+ `.wps` files saved by WPS Office are binary OLE2 compound documents.
33
+ This library reads the `WordDocument` stream, validates the FIB,
34
+ recovers paragraph style indices (`istd`) via `PlcfBtePapx` → FKPs,
35
+ and renders Heading 1-9 styles as `#`..`#########` in Markdown.
36
+
37
+ ## Install
38
+
39
+ ```bash
40
+ pip install wps2md
41
+ ```
42
+
43
+ ## CLI
44
+
45
+ ```bash
46
+ wps2md example.wps # print Markdown to stdout
47
+ wps2md example.wps > example.md
48
+ python -m wps2md example.wps # equivalent
49
+ ```
50
+
51
+ ## Library
52
+
53
+ ```python
54
+ from wps2md import parse, to_markdown
55
+
56
+ doc = parse("example.wps")
57
+ print(doc.main_text) # plain text of the main body
58
+ print(doc.num_pages) # from OLE SummaryInformation
59
+ print(to_markdown(doc.paragraphs)) # Markdown with H1-H9 from Word styles
60
+
61
+ for p in doc.paragraphs:
62
+ print(p.heading_level, p.text) # 0 for normal text, 1-9 for headings
63
+ ```
64
+
65
+ ## API
66
+
67
+ - `parse(path) -> WpsDocument` — parse a `.wps` file.
68
+ - `WpsDocument` — dataclass with `main_text`, `paragraphs`, `footnotes`,
69
+ `headers_footers`, `annotations`, `encoding`, `num_pages`.
70
+ - `Paragraph(istd: int, text: str)` — one paragraph; `heading_level`
71
+ returns 1-9 for built-in Heading styles, else 0.
72
+ - `to_markdown(paragraphs) -> str` — render paragraphs as Markdown.
73
+ - `WpsParseError` — raised for non-`.wps` inputs, encrypted files,
74
+ or unreadable streams.
75
+
76
+ ## Limitations
77
+
78
+ - Images, footnote/header paragraph styles, complex fields, and CHPX
79
+ (character formatting like bold/italic) are not currently surfaced —
80
+ Markdown output is driven by paragraph-level properties only (Heading styles, plus basic table and list flags).
81
+ - Encrypted/password-protected files are rejected.
82
+ - Only the OLE2 Word-binary variant of `.wps` is supported (modern WPS
83
+ Office still writes this for `.wps`; the OOXML `.docx` variant should
84
+ be read with `python-docx` instead).
85
+
86
+ ## License
87
+
88
+ MIT
wps2md-0.1.0/README.md ADDED
@@ -0,0 +1,64 @@
1
+ # wps2md
2
+
3
+ A tiny Python library and CLI for converting legacy **WPS Writer `.wps`**
4
+ files (OLE2 Word-binary format, FIB magic `0xA5EC`/`0xA5DC`) into
5
+ structured text and Markdown.
6
+
7
+ Unlike `.docx` (which is OOXML/zip and can be read by `python-docx`),
8
+ `.wps` files saved by WPS Office are binary OLE2 compound documents.
9
+ This library reads the `WordDocument` stream, validates the FIB,
10
+ recovers paragraph style indices (`istd`) via `PlcfBtePapx` → FKPs,
11
+ and renders Heading 1-9 styles as `#`..`#########` in Markdown.
12
+
13
+ ## Install
14
+
15
+ ```bash
16
+ pip install wps2md
17
+ ```
18
+
19
+ ## CLI
20
+
21
+ ```bash
22
+ wps2md example.wps # print Markdown to stdout
23
+ wps2md example.wps > example.md
24
+ python -m wps2md example.wps # equivalent
25
+ ```
26
+
27
+ ## Library
28
+
29
+ ```python
30
+ from wps2md import parse, to_markdown
31
+
32
+ doc = parse("example.wps")
33
+ print(doc.main_text) # plain text of the main body
34
+ print(doc.num_pages) # from OLE SummaryInformation
35
+ print(to_markdown(doc.paragraphs)) # Markdown with H1-H9 from Word styles
36
+
37
+ for p in doc.paragraphs:
38
+ print(p.heading_level, p.text) # 0 for normal text, 1-9 for headings
39
+ ```
40
+
41
+ ## API
42
+
43
+ - `parse(path) -> WpsDocument` — parse a `.wps` file.
44
+ - `WpsDocument` — dataclass with `main_text`, `paragraphs`, `footnotes`,
45
+ `headers_footers`, `annotations`, `encoding`, `num_pages`.
46
+ - `Paragraph(istd: int, text: str)` — one paragraph; `heading_level`
47
+ returns 1-9 for built-in Heading styles, else 0.
48
+ - `to_markdown(paragraphs) -> str` — render paragraphs as Markdown.
49
+ - `WpsParseError` — raised for non-`.wps` inputs, encrypted files,
50
+ or unreadable streams.
51
+
52
+ ## Limitations
53
+
54
+ - Images, footnote/header paragraph styles, complex fields, and CHPX
55
+ (character formatting like bold/italic) are not currently surfaced —
56
+ Markdown output is driven by paragraph-level properties only (Heading styles, plus basic table and list flags).
57
+ - Encrypted/password-protected files are rejected.
58
+ - Only the OLE2 Word-binary variant of `.wps` is supported (modern WPS
59
+ Office still writes this for `.wps`; the OOXML `.docx` variant should
60
+ be read with `python-docx` instead).
61
+
62
+ ## License
63
+
64
+ MIT
@@ -0,0 +1,46 @@
1
+ [project]
2
+ name = "wps2md"
3
+ version = "0.1.0"
4
+ description = "Parse legacy WPS Writer (.wps) OLE2 binary files into structured text and Markdown."
5
+ readme = "README.md"
6
+ license = { text = "MIT" }
7
+ requires-python = ">=3.10"
8
+ authors = [{ name = "listeng", email = "aosp@163.com" }]
9
+ keywords = ["wps", "office", "parser", "ole2", "word", "markdown"]
10
+ classifiers = [
11
+ "Development Status :: 3 - Alpha",
12
+ "Intended Audience :: Developers",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Operating System :: OS Independent",
15
+ "Programming Language :: Python :: 3",
16
+ "Programming Language :: Python :: 3.10",
17
+ "Programming Language :: Python :: 3.11",
18
+ "Programming Language :: Python :: 3.12",
19
+ "Topic :: Office/Business :: Office Suites",
20
+ "Topic :: Text Processing :: Markup :: Markdown",
21
+ ]
22
+ dependencies = [
23
+ "olefile>=0.46",
24
+ ]
25
+
26
+ [project.urls]
27
+ Homepage = "https://github.com/listeng/wps2md"
28
+ Issues = "https://github.com/listeng/wps2md/issues"
29
+
30
+ [project.scripts]
31
+ wps2md = "wps2md.__main__:main"
32
+
33
+ [build-system]
34
+ requires = ["hatchling"]
35
+ build-backend = "hatchling.build"
36
+
37
+ [tool.hatch.build.targets.wheel]
38
+ packages = ["src/wps2md"]
39
+
40
+ [tool.hatch.build.targets.sdist]
41
+ include = ["src", "tests", "README.md", "LICENSE", "pyproject.toml"]
42
+
43
+ [dependency-groups]
44
+ dev = [
45
+ "pytest>=8",
46
+ ]
@@ -0,0 +1,26 @@
1
+ """wps2md — parse legacy WPS (.wps, OLE2 Word-binary) files into Markdown.
2
+
3
+ Quick start:
4
+
5
+ from wps2md import parse, to_markdown
6
+
7
+ doc = parse("file.wps")
8
+ print(doc.main_text)
9
+ print(to_markdown(doc.paragraphs))
10
+ """
11
+ from wps2md.core import (
12
+ Paragraph,
13
+ WpsDocument,
14
+ WpsParseError,
15
+ parse,
16
+ to_markdown,
17
+ )
18
+
19
+ __all__ = [
20
+ "Paragraph",
21
+ "WpsDocument",
22
+ "WpsParseError",
23
+ "parse",
24
+ "to_markdown",
25
+ ]
26
+ __version__ = "0.1.0"
@@ -0,0 +1,21 @@
1
+ """CLI: wps2md <file.wps> (or: python -m wps2md <file.wps>)"""
2
+ import sys
3
+
4
+ from wps2md.core import WpsParseError, parse, to_markdown
5
+
6
+
7
+ def main() -> int:
8
+ if len(sys.argv) < 2:
9
+ print("usage: wps2md <file.wps>", file=sys.stderr)
10
+ return 2
11
+ try:
12
+ doc = parse(sys.argv[1])
13
+ except WpsParseError as e:
14
+ print(f"parse error: {e}", file=sys.stderr)
15
+ return 1
16
+ print(to_markdown(doc.paragraphs), end="")
17
+ return 0
18
+
19
+
20
+ if __name__ == "__main__":
21
+ sys.exit(main())
@@ -0,0 +1,475 @@
1
+ """Core parser for legacy WPS (.wps) OLE2 Word-binary files.
2
+
3
+ Reads the OLE2 compound document, validates the FIB, walks PlcfBtePapx →
4
+ FKPs to recover paragraph style indices (istd), and exposes structured
5
+ output plus a Markdown renderer that respects Heading 1-9 styles.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import re
10
+ import struct
11
+ from dataclasses import dataclass, field
12
+ from pathlib import Path
13
+ from typing import Union
14
+
15
+ import olefile # type: ignore[import-untyped]
16
+
17
+ # ---------------------------------------------------------------------------
18
+ # FIB constants (MS-DOC §2.5.1)
19
+ # ---------------------------------------------------------------------------
20
+ _FIB_MAGIC_WORD97 = 0xA5EC
21
+ _FIB_MAGIC_WORD95 = 0xA5DC
22
+ _FIB_FLAGS_OFFSET = 0x0A
23
+ _FIB_ENCRYPTED_FLAG = 0x0100
24
+ _FIB_FCMIN = 0x18
25
+ _FIB_FCMAC = 0x1C
26
+ _FIB_CCP_TEXT = 0x4C
27
+ _FIB_CCP_FTN = 0x50
28
+ _FIB_CCP_HDD = 0x54
29
+ _FIB_CCP_ATN = 0x5C
30
+ _FIB_FC_PLCFBTEPAPX = 0x102
31
+ _FIB_LCB_PLCFBTEPAPX = 0x106
32
+ _MIN_DOC_SIZE = 0x200
33
+ _FKP_SIZE = 512
34
+
35
+ _U16 = struct.Struct("<H")
36
+ _U32 = struct.Struct("<I")
37
+
38
+ _TRANS = str.maketrans({
39
+ "\x07": "\t", "\x0b": "\n", "\x0c": "\n\n", "\x0d": "\n",
40
+ "\x13": None, "\x14": " ", "\x15": None,
41
+ "\x01": None, "\x08": None, "\x19": None, "\x1e": None, "\x1f": None,
42
+ "\xa0": " ", "\x00": None, "\x7f": None,
43
+ })
44
+
45
+
46
class WpsParseError(Exception):
    """Signal that an input could not be handled as a Word/WPS binary document."""
48
+
49
+
50
+ @dataclass
51
+ class Paragraph:
52
+ """A single paragraph with its style and list/table flags.
53
+
54
+ Attributes:
55
+ istd: Word style index. Heading 1-9 → 1-9, Normal → 0.
56
+ text: Cleaned paragraph text.
57
+ in_table / is_row_end: From sprmPFInTable (0x2416) and sprmPTtp
58
+ (0x2417). ``is_row_end`` marks the row-terminator paragraph.
59
+ ilfo: 1-based list reference (sprmPIlfo). 0 → not a list item.
60
+ ilvl: 0-based indent level (sprmPIlvl). Defaults to 0.
61
+ list_ordered: True for ordered lists, False for bullets, None when
62
+ the paragraph is not a list item.
63
+ """
64
+
65
+ istd: int
66
+ text: str
67
+ in_table: bool = False
68
+ is_row_end: bool = False
69
+ ilfo: int = 0
70
+ ilvl: int = 0
71
+ list_ordered: bool | None = None
72
+
73
+ @property
74
+ def heading_level(self) -> int:
75
+ """Return 1-9 for Heading styles, else 0."""
76
+ return self.istd if 1 <= self.istd <= 9 else 0
77
+
78
+
79
@dataclass
class WpsDocument:
    """Result of parsing one WPS/.doc file.

    Only ``main_text`` is required; every other field has a default so a
    document without footnotes, headers or annotations can be built from
    the body text alone.
    """

    # Cleaned text of the main document body.
    main_text: str
    # Body paragraphs in document order (empty when none were recovered).
    paragraphs: list[Paragraph] = field(default_factory=list)
    # Cleaned text of the footnote, header/footer and annotation ranges.
    footnotes: str = ""
    headers_footers: str = ""
    annotations: str = ""
    # Codec name used to decode the text ranges.
    encoding: str = "utf-16-le"
    # Page count taken from the OLE metadata, when present.
    num_pages: int | None = None
90
+
91
+
92
def _clean(text: str) -> str:
    """Normalise raw Word text into plain text.

    Word control characters are first mapped through ``_TRANS``; any control
    characters that remain are dropped, runs of spaces/tabs collapse to one
    space, three or more consecutive newlines collapse to two, and the
    result is stripped. Empty input yields an empty string.
    """
    if text:
        cleaned = text.translate(_TRANS)
        # The passes are order-sensitive: strip leftovers, then squeeze
        # horizontal whitespace, then squeeze blank lines.
        for pattern, replacement in (
            (r"[\x00-\x08\x0e-\x1f\x7f]", ""),
            (r"[ \t]+", " "),
            (r"\n{3,}", "\n\n"),
        ):
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()
    return ""
100
+
101
+
102
+ # Sprm operation codes used here (MS-DOC §2.6.4).
103
+ _SPRM_PF_IN_TABLE = 0x2416 # paragraph is inside a table cell
104
+ _SPRM_PT_TP = 0x2417 # paragraph terminates a table row
105
+ _SPRM_P_ILVL = 0x260A # list indent level (0..8)
106
+ _SPRM_P_ILFO = 0x460B # 1-based index into PlfLfo (0 = not a list item)
107
+
108
+ # Numbering Format Codes (MS-DOC §2.9.166). 23 = bullet (unordered) and
109
+ # 255 = no numbering; values 0-7, 22, 45-47 are common ordered formats. We
110
+ # treat 23 as the only unordered case, leave 255 unclassified, and treat every other code as ordered.
111
+ _NFC_BULLET = 23
112
+ _NFC_NONE = 255
113
+
114
+
115
def _read_list_kinds(wd: bytes, table: bytes) -> dict[int, bool]:
    """Return an ``{ilfo: is_ordered}`` mapping built from PlfLst and PlfLfo.

    The mapping tells whether the list referenced by a 1-based ``ilfo``
    index uses an ordered numbering format (True) or a bullet (False),
    judged by the numbering format of the list's first level. Lists that
    cannot be located, or whose first level has no numbering, are absent.

    Layout used here (MS-DOC FibRgFcLcb97, PlfLst, LSTF, LVL/LVLF, PlfLfo):
        * FibRgFcLcb97 starts at 0x9A in the WordDocument stream and is an
          array of 8-byte (fc, lcb) pairs; PlfLst is pair 73, PlfLfo pair 74.
        * PlfLst: U16 cLst followed by cLst LSTF records of 28 bytes.
          ``lcbPlfLst`` covers only this part; the LVL records for all lists
          follow immediately after it in the table stream.
        * LSTF: lsid (I32) at offset 0; flag byte at offset 26 whose bit 0
          (fSimpleList) selects 1 level instead of 9.
        * LVL: 28-byte LVLF (nfc at offset 4, cbGrpprlChpx at 24,
          cbGrpprlPapx at 25), then both grpprls, then an Xst (U16 cch
          followed by cch UTF-16 code units).
        * PlfLfo: U32 lfoMac followed by lfoMac LFO records of 16 bytes,
          each starting with the lsid (I32) of the list it refers to.

    Args:
        wd: Contents of the WordDocument stream.
        table: Contents of the table stream.

    Returns:
        Mapping from 1-based ilfo to True (ordered) or False (bullet); empty
        when the FIB is too short or the list structures are out of range.
    """
    fib_base = 0x9A
    lst_pair = fib_base + 73 * 8
    lfo_pair = fib_base + 74 * 8
    # Older/shorter FIBs do not reach the list slots at all.
    if not table or len(wd) < lfo_pair + 8:
        return {}
    fc_lst = _U32.unpack_from(wd, lst_pair)[0]
    lcb_lst = _U32.unpack_from(wd, lst_pair + 4)[0]
    fc_lfo = _U32.unpack_from(wd, lfo_pair)[0]
    lcb_lfo = _U32.unpack_from(wd, lfo_pair + 4)[0]
    if lcb_lst < 2 or lcb_lfo < 4:
        return {}
    if fc_lst + lcb_lst > len(table) or fc_lfo + lcb_lfo > len(table):
        return {}

    # --- PlfLst: cLst + LSTF[cLst], then the LVL records after lcbPlfLst ---
    lstf_size = 28
    lvlf_size = 28
    c_lst = _U16.unpack_from(table, fc_lst)[0]
    if 2 + c_lst * lstf_size > lcb_lst:
        return {}
    lstf_base = fc_lst + 2
    cursor = lstf_base + c_lst * lstf_size  # first LVL of the first list
    table_end = len(table)
    lsid_to_ordered: dict[int, bool] = {}
    for i in range(c_lst):
        lstf_off = lstf_base + i * lstf_size
        lsid = struct.unpack_from("<i", table, lstf_off)[0]
        is_simple = table[lstf_off + 26] & 0x01  # fSimpleList
        n_lvl = 1 if is_simple else 9
        nfc_first: int | None = None
        walked_all = True
        for level in range(n_lvl):
            if cursor + lvlf_size > table_end:
                walked_all = False
                break
            nfc = table[cursor + 4]
            cb_grpprl_chpx = table[cursor + 24]
            cb_grpprl_papx = table[cursor + 25]
            xst_off = cursor + lvlf_size + cb_grpprl_papx + cb_grpprl_chpx
            if xst_off + 2 > table_end:
                walked_all = False
                break
            cch = _U16.unpack_from(table, xst_off)[0]
            cursor = xst_off + 2 + cch * 2
            if level == 0:
                nfc_first = nfc
        if nfc_first is not None and nfc_first != _NFC_NONE:
            lsid_to_ordered[lsid] = nfc_first != _NFC_BULLET
        if not walked_all:
            # LVLs are packed back to back, so once one cannot be read the
            # positions of all later lists' levels are unknown.
            break

    # --- PlfLfo: lfoMac + LFO[lfoMac] ---
    lfo_size = 16
    lfo_mac = _U32.unpack_from(table, fc_lfo)[0]
    if 4 + lfo_mac * lfo_size > lcb_lfo:
        return {}
    ilfo_to_ordered: dict[int, bool] = {}
    for i in range(lfo_mac):
        lsid = struct.unpack_from("<i", table, fc_lfo + 4 + i * lfo_size)[0]
        if lsid in lsid_to_ordered:
            ilfo_to_ordered[i + 1] = lsid_to_ordered[lsid]  # ilfo is 1-based
    return ilfo_to_ordered
188
+
189
+
190
+ def _iter_sprms(grpprl: bytes):
191
+ """Yield (opcode, value_bytes) for each sprm in a grpprl byte string.
192
+
193
+ Operand size is encoded in bits 13-15 of the opcode (spra):
194
+ 0,1 → 1 byte; 2,4,5 → 2 bytes; 3 → 4 bytes; 7 → 3 bytes;
195
+ 6 → variable, length byte follows opcode (length includes itself).
196
+ """
197
+ j = 0
198
+ n = len(grpprl)
199
+ while j + 2 <= n:
200
+ op = _U16.unpack_from(grpprl, j)[0]
201
+ j += 2
202
+ spra = (op >> 13) & 0x7
203
+ if spra in (0, 1):
204
+ oplen = 1
205
+ elif spra in (2, 4, 5):
206
+ oplen = 2
207
+ elif spra == 3:
208
+ oplen = 4
209
+ elif spra == 7:
210
+ oplen = 3
211
+ elif spra == 6:
212
+ if j >= n:
213
+ break
214
+ oplen = grpprl[j] + 1 # length byte itself counts
215
+ else:
216
+ break
217
+ if j + oplen > n:
218
+ break
219
+ yield op, grpprl[j:j + oplen]
220
+ j += oplen
221
+
222
+
223
def _papx_properties(
    fkp: bytes, papx_off: int,
) -> tuple[int, bool, bool, int, int]:
    """Decode one PapxInFkp and return ``(istd, in_table, is_row_end, ilfo, ilvl)``.

    PapxInFkp layout (MS-DOC PapxInFkp / GrpPrlAndIstd):
        * ``cb != 0``: a GrpPrlAndIstd of ``2*cb - 1`` bytes follows ``cb``.
        * ``cb == 0``: the next byte ``cb'`` gives a GrpPrlAndIstd of
          ``2*cb'`` bytes that follows ``cb'``.
    A GrpPrlAndIstd is a 2-byte istd followed by the grpprl.
    """
    istd = 0
    in_table = False
    is_row_end = False
    ilfo = 0
    ilvl = 0
    cb = fkp[papx_off]
    if cb != 0:
        istd_pos = papx_off + 1
        body_len = cb * 2 - 1
    else:
        cb2 = fkp[papx_off + 1] if papx_off + 1 < len(fkp) else 0
        istd_pos = papx_off + 2
        body_len = cb2 * 2
    grp_start = istd_pos + 2
    grp_len = body_len - 2  # everything after the istd is the grpprl
    if istd_pos + 2 <= len(fkp):
        istd = _U16.unpack_from(fkp, istd_pos)[0] & 0x0FFF
    if grp_len > 0 and grp_start + grp_len <= len(fkp):
        for op, val in _iter_sprms(fkp[grp_start:grp_start + grp_len]):
            if op == _SPRM_PF_IN_TABLE and val and val[0] != 0:
                in_table = True
            elif op == _SPRM_PT_TP and val and val[0] != 0:
                # A row terminator is by definition part of a table.
                is_row_end = True
                in_table = True
            elif op == _SPRM_P_ILFO and len(val) >= 2:
                ilfo = _U16.unpack_from(val, 0)[0]
            elif op == _SPRM_P_ILVL and val:
                ilvl = val[0]
    return istd, in_table, is_row_end, ilfo, ilvl


def _read_paragraphs(
    wd: bytes, table: bytes, fc_min: int, fc_mac: int,
    ilfo_ordered: dict[int, bool] | None = None,
) -> list[Paragraph]:
    """Walk PlcfBtePapx -> FKPs and return paragraphs in FKP order.

    Args:
        wd: Contents of the WordDocument stream.
        table: Contents of the table stream (holds PlcfBtePapx).
        fc_min: First byte offset in ``wd`` to take text from.
        fc_mac: Byte offset in ``wd`` at which to stop taking text.
        ilfo_ordered: Optional ``{ilfo: is_ordered}`` mapping; list items
            whose ilfo is missing from it are assumed to be ordered.

    Returns:
        One :class:`Paragraph` per PAPX run that overlaps
        ``[fc_min, fc_mac)``; an empty list when the PlcfBtePapx cannot be
        located. FKP pages that are truncated or internally inconsistent
        are skipped. Text is decoded as UTF-16-LE.
    """
    if len(wd) < _FIB_LCB_PLCFBTEPAPX + 4 or not table:
        return []
    fc_plcf = _U32.unpack_from(wd, _FIB_FC_PLCFBTEPAPX)[0]
    lcb_plcf = _U32.unpack_from(wd, _FIB_LCB_PLCFBTEPAPX)[0]
    if lcb_plcf < 12 or fc_plcf + lcb_plcf > len(table):
        return []

    # PlcfBtePapx = (n + 1) FCs followed by n PnFkpPapx entries, 4 bytes each.
    plcf = table[fc_plcf:fc_plcf + lcb_plcf]
    n = (lcb_plcf - 4) // 8
    a_pn = struct.unpack_from(f"<{n}I", plcf, (n + 1) * 4)

    bx_size = 13  # BxPap: 1-byte bOffset + 12 bytes that are ignored here
    paragraphs: list[Paragraph] = []
    for raw_pn in a_pn:
        # Only the low 22 bits of a PnFkpPapx are the page number.
        pn = raw_pn & 0x3FFFFF
        fkp = wd[pn * _FKP_SIZE:(pn + 1) * _FKP_SIZE]
        if len(fkp) < _FKP_SIZE:
            continue
        cpara = fkp[_FKP_SIZE - 1]
        rgbx_off = (cpara + 1) * 4
        # rgfc and rgbx must both fit in front of the trailing cpara byte;
        # otherwise the page is corrupt and unpacking it would raise.
        if rgbx_off + cpara * bx_size > _FKP_SIZE - 1:
            continue
        rgfc = struct.unpack_from(f"<{cpara + 1}I", fkp, 0)
        for i in range(cpara):
            fc_start, fc_end = rgfc[i], rgfc[i + 1]
            b_off = fkp[rgbx_off + i * bx_size]
            if b_off != 0:
                istd, in_table, is_row_end, ilfo, ilvl = _papx_properties(
                    fkp, b_off * 2
                )
            else:
                # bOffset 0 means "no PAPX": keep default properties.
                istd, in_table, is_row_end, ilfo, ilvl = 0, False, False, 0, 0
            s = max(fc_start, fc_min)
            e = min(fc_end, fc_mac)
            if e <= s:
                continue
            raw = wd[s:e].decode("utf-16-le", errors="replace")
            # _clean maps the cell/row mark \x07 to whitespace and strips
            # the result, so a cell paragraph comes out without its mark.
            text = _clean(raw)
            ordered: bool | None = None
            if ilfo and not in_table:
                if ilfo_ordered is not None and ilfo in ilfo_ordered:
                    ordered = ilfo_ordered[ilfo]
                else:
                    ordered = True  # list item without list-table info
            paragraphs.append(
                Paragraph(
                    istd=istd,
                    text=text,
                    in_table=in_table,
                    is_row_end=is_row_end,
                    ilfo=ilfo,
                    ilvl=ilvl,
                    list_ordered=ordered,
                )
            )
    return paragraphs
314
+
315
+
316
def parse(path: Union[str, Path]) -> WpsDocument:
    """Parse a .wps file and return a :class:`WpsDocument`.

    Only WPS Writer ``.wps`` files in OLE2 Word-binary form are supported.

    Args:
        path: Location of the ``.wps`` file.

    Returns:
        The parsed document. ``paragraphs`` is only populated when the text
        is stored as UTF-16-LE; for 8-bit text it stays empty.

    Raises:
        WpsParseError: For a non-``.wps`` suffix, a file that is not a
            readable OLE2 compound document, a missing or too-small
            WordDocument stream, a bad FIB magic, or an encrypted file.
        FileNotFoundError, PermissionError: Propagated unchanged when the
            file itself cannot be opened.
    """
    path = Path(path)
    if path.suffix.lower() != ".wps":
        raise WpsParseError(f"Only .wps files are supported (got {path.suffix!r})")
    try:
        ole = olefile.OleFileIO(str(path))
    except (FileNotFoundError, PermissionError):
        raise
    except OSError as exc:
        # Any other OSError from opening the container is reported as the
        # documented parse error (e.g. the file is not an OLE2 document).
        raise WpsParseError(f"Not a readable OLE2 compound file: {exc}") from exc
    try:
        if not ole.exists("WordDocument"):
            raise WpsParseError("No WordDocument stream — not a Word binary file")
        wd = ole.openstream("WordDocument").read()
        if len(wd) < _MIN_DOC_SIZE:
            raise WpsParseError("WordDocument stream too small")

        magic = _U16.unpack_from(wd, 0)[0]
        if magic not in (_FIB_MAGIC_WORD97, _FIB_MAGIC_WORD95):
            raise WpsParseError(f"Bad FIB magic: {hex(magic)}")

        flags = _U16.unpack_from(wd, _FIB_FLAGS_OFFSET)[0]
        if flags & _FIB_ENCRYPTED_FLAG:
            raise WpsParseError("File is encrypted / password-protected")

        # FIB flag fWhichTblStm (bit 9) names the table stream in use:
        # set -> 1Table, clear -> 0Table. A file may contain both, so honour
        # the flag first and fall back to the other stream only when the
        # named one is missing.
        which_tbl_stm_flag = 0x0200
        if flags & which_tbl_stm_flag:
            stream_order = ("1Table", "0Table")
        else:
            stream_order = ("0Table", "1Table")
        table = b""
        for stream_name in stream_order:
            if ole.exists(stream_name):
                table = ole.openstream(stream_name).read()
                break

        fc_min = _U32.unpack_from(wd, _FIB_FCMIN)[0]
        fc_mac = _U32.unpack_from(wd, _FIB_FCMAC)[0]
        ccp_text = _U32.unpack_from(wd, _FIB_CCP_TEXT)[0]
        ccp_ftn = _U32.unpack_from(wd, _FIB_CCP_FTN)[0]
        ccp_hdd = _U32.unpack_from(wd, _FIB_CCP_HDD)[0]
        ccp_atn = _U32.unpack_from(wd, _FIB_CCP_ATN)[0]

        # Guess the character width by comparing the byte span of the text
        # with the total character count; default to UTF-16-LE.
        text_bytes = fc_mac - fc_min
        total = ccp_text + ccp_ftn + ccp_hdd + ccp_atn
        if total > 0 and text_bytes == total * 2:
            mult, enc = 2, "utf-16-le"
        elif total > 0 and text_bytes == total:
            mult, enc = 1, "cp1252"
        else:
            mult, enc = 2, "utf-16-le"

        # The four text ranges are stored back to back starting at fc_min.
        pos = fc_min
        main = wd[pos:pos + ccp_text * mult]
        pos += ccp_text * mult
        ftn = wd[pos:pos + ccp_ftn * mult]
        pos += ccp_ftn * mult
        hdd = wd[pos:pos + ccp_hdd * mult]
        pos += ccp_hdd * mult
        atn = wd[pos:pos + ccp_atn * mult]

        # Paragraph recovery decodes text as UTF-16-LE, so it is skipped
        # for 8-bit documents.
        if mult == 2:
            ilfo_ordered = _read_list_kinds(wd, table)
            paragraphs = _read_paragraphs(
                wd, table, fc_min, fc_min + ccp_text * mult, ilfo_ordered
            )
        else:
            paragraphs = []

        meta = ole.get_metadata()
        return WpsDocument(
            main_text=_clean(main.decode(enc, errors="replace")),
            paragraphs=paragraphs,
            footnotes=_clean(ftn.decode(enc, errors="replace")) if ftn else "",
            headers_footers=_clean(hdd.decode(enc, errors="replace")) if hdd else "",
            annotations=_clean(atn.decode(enc, errors="replace")) if atn else "",
            encoding=enc,
            num_pages=getattr(meta, "num_pages", None),
        )
    finally:
        ole.close()
394
+
395
+
396
+ def _escape_cell(text: str) -> str:
397
+ """Escape characters that would break a Markdown table cell."""
398
+ return text.replace("\\", "\\\\").replace("|", "\\|").replace("\n", " ").strip()
399
+
400
+
401
def _flush_table(rows: list[list[str]], out: list[str]) -> None:
    """Append the collected rows to ``out`` as one Markdown pipe table.

    The first row becomes the header, followed by a ``---`` separator row
    and the remaining rows; short rows are padded with empty cells. The
    table is appended as a single newline-joined block so that the caller's
    blank-line join separates whole blocks, not individual table rows.
    ``rows`` is emptied afterwards. Nothing happens when ``rows`` is empty.
    """
    if not rows:
        return
    n_cols = max(map(len, rows))

    def render(cells: list[str]) -> str:
        padded = cells + [""] * (n_cols - len(cells))
        return "| " + " | ".join(_escape_cell(cell) for cell in padded) + " |"

    separator = "|" + "|".join(" --- " for _ in range(n_cols)) + "|"
    rendered = [render(rows[0]), separator]
    rendered.extend(render(row) for row in rows[1:])
    out.append("\n".join(rendered))
    rows.clear()
422
+
423
+
424
def to_markdown(paragraphs: list[Paragraph]) -> str:
    """Render paragraphs to Markdown.

    * Heading 1-9 (``heading_level`` 1-9) become ``#``..``#########``.
    * Paragraphs flagged ``in_table`` are collected into a Markdown pipe
      table; each one is a cell and ``is_row_end`` closes the current row.
    * List items (``list_ordered`` is not None) are emitted as ``N.`` or
      ``-`` items indented two spaces per ``ilvl``. Ordered numbering is
      tracked per ``(ilfo, ilvl)``; emitting an item discards the counters
      of deeper levels in the same list, so a nested run that follows a
      new parent item starts again at 1. Headings and plain paragraphs
      reset all counters.
    * Everything else is emitted as plain text; empty paragraphs are
      dropped.

    Args:
        paragraphs: Paragraphs in document order.

    Returns:
        Blocks joined by blank lines with a single trailing newline, or an
        empty string when nothing was rendered.
    """
    out: list[str] = []
    table_rows: list[list[str]] = []
    current_row: list[str] = []
    # Counters keyed by (ilfo, ilvl) for ordered list numbering.
    list_counters: dict[tuple[int, int], int] = {}

    def close_table() -> None:
        """Commit a half-built row and emit the pending table, if any."""
        nonlocal current_row
        if current_row:
            table_rows.append(current_row)
            current_row = []
        if table_rows:
            _flush_table(table_rows, out)

    for p in paragraphs:
        if p.in_table:
            if not p.is_row_end:
                current_row.append(p.text)
            elif current_row:
                # Row terminator paragraph: commit the accumulated cells.
                table_rows.append(current_row)
                current_row = []
            continue
        # First paragraph after a table region ends that table.
        close_table()
        if not p.text:
            continue
        if p.heading_level:
            out.append(f"{'#' * p.heading_level} {p.text}")
            list_counters.clear()
        elif p.list_ordered is not None:
            # Returning to this level ends any deeper run of the same list.
            stale = [
                k for k in list_counters if k[0] == p.ilfo and k[1] > p.ilvl
            ]
            for k in stale:
                del list_counters[k]
            indent = "  " * max(0, p.ilvl)
            if p.list_ordered:
                key = (p.ilfo, p.ilvl)
                list_counters[key] = list_counters.get(key, 0) + 1
                marker = f"{list_counters[key]}."
            else:
                marker = "-"
            out.append(f"{indent}{marker} {p.text}")
        else:
            out.append(p.text)
            list_counters.clear()
    # Flush any trailing table.
    close_table()
    return "\n\n".join(out) + "\n" if out else ""
Binary file
@@ -0,0 +1,41 @@
1
+ import pathlib
2
+
3
+ import pytest
4
+
5
+ from wps2md import WpsParseError, parse, to_markdown
6
+
7
+ SAMPLE = pathlib.Path(__file__).parent / "sample.wps"
8
+
9
+
10
@pytest.fixture(scope="module")
def doc():
    """Parse the bundled sample once per module; skip when it is not shipped."""
    if SAMPLE.exists():
        return parse(SAMPLE)
    pytest.skip("sample.wps not present")
15
+
16
+
17
def test_main_text_nonempty(doc):
    """The sample's main body must decode to at least one character."""
    body_length = len(doc.main_text)
    assert body_length > 0
19
+
20
+
21
def test_paragraphs_have_first_heading(doc):
    """The sample yields paragraphs and its first one is a level-1 heading."""
    assert doc.paragraphs, "expected at least one paragraph"
    first = doc.paragraphs[0]
    assert first.heading_level == 1
24
+
25
+
26
def test_markdown_starts_with_h1(doc):
    """Rendering the sample must open with an H1 line."""
    rendered = to_markdown(doc.paragraphs)
    assert rendered.startswith("# ")
29
+
30
+
31
def test_only_h1_in_sample(doc):
    """Every line of the rendered sample that starts with ``#`` is an H1."""
    rendered = to_markdown(doc.paragraphs)
    heading_lines = filter(lambda line: line.startswith("#"), rendered.splitlines())
    assert all(line.startswith("# ") for line in heading_lines)
35
+
36
+
37
def test_rejects_non_wps_extension(tmp_path):
    """A file whose suffix is not ``.wps`` is refused with WpsParseError."""
    wrong_suffix = tmp_path / "foo.doc"
    # Content is just the start of the OLE2 signature.
    wrong_suffix.write_bytes(b"\xd0\xcf\x11\xe0")
    with pytest.raises(WpsParseError):
        parse(wrong_suffix)