wps2md 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wps2md/__init__.py +26 -0
- wps2md/__main__.py +21 -0
- wps2md/core.py +475 -0
- wps2md-0.1.0.dist-info/METADATA +88 -0
- wps2md-0.1.0.dist-info/RECORD +8 -0
- wps2md-0.1.0.dist-info/WHEEL +4 -0
- wps2md-0.1.0.dist-info/entry_points.txt +2 -0
- wps2md-0.1.0.dist-info/licenses/LICENSE +21 -0
wps2md/__init__.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""wps2md — parse legacy WPS (.wps, OLE2 Word-binary) files into Markdown.
|
|
2
|
+
|
|
3
|
+
Quick start:
|
|
4
|
+
|
|
5
|
+
from wps2md import parse, to_markdown
|
|
6
|
+
|
|
7
|
+
doc = parse("file.wps")
|
|
8
|
+
print(doc.main_text)
|
|
9
|
+
print(to_markdown(doc.paragraphs))
|
|
10
|
+
"""
|
|
11
|
+
from wps2md.core import (
|
|
12
|
+
Paragraph,
|
|
13
|
+
WpsDocument,
|
|
14
|
+
WpsParseError,
|
|
15
|
+
parse,
|
|
16
|
+
to_markdown,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"Paragraph",
|
|
21
|
+
"WpsDocument",
|
|
22
|
+
"WpsParseError",
|
|
23
|
+
"parse",
|
|
24
|
+
"to_markdown",
|
|
25
|
+
]
|
|
26
|
+
__version__ = "0.1.0"
|
wps2md/__main__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""CLI: wps2md <file.wps> (or: python -m wps2md <file.wps>)"""
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
from wps2md.core import WpsParseError, parse, to_markdown
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def main() -> int:
    """Run the ``wps2md`` command-line interface.

    The input path is taken from ``sys.argv[1]``; the rendered Markdown is
    written to stdout and diagnostics go to stderr.

    Returns:
        Process exit status: 0 on success, 1 when the file cannot be read
        or parsed, 2 when no input path was given.
    """
    if len(sys.argv) < 2:
        print("usage: wps2md <file.wps>", file=sys.stderr)
        return 2
    try:
        doc = parse(sys.argv[1])
    except WpsParseError as e:
        print(f"parse error: {e}", file=sys.stderr)
        return 1
    except OSError as e:
        # A missing or unreadable file is a user error, not a crash: report
        # it on stderr instead of letting the traceback escape.
        print(f"error: {e}", file=sys.stderr)
        return 1
    print(to_markdown(doc.paragraphs), end="")
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
wps2md/core.py
ADDED
|
@@ -0,0 +1,475 @@
|
|
|
1
|
+
"""Core parser for legacy WPS (.wps) OLE2 Word-binary files.
|
|
2
|
+
|
|
3
|
+
Reads the OLE2 compound document, validates the FIB, walks PlcfBtePapx →
|
|
4
|
+
FKPs to recover paragraph style indices (istd), and exposes structured
|
|
5
|
+
output plus a Markdown renderer that respects Heading 1-9 styles.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
import struct
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Union
|
|
14
|
+
|
|
15
|
+
import olefile # type: ignore[import-untyped]
|
|
16
|
+
|
|
17
|
+
# ---------------------------------------------------------------------------
# FIB constants (MS-DOC §2.5.1)
# ---------------------------------------------------------------------------
# Unless noted otherwise, the _FIB_* values are byte offsets into the
# WordDocument stream.
_FIB_MAGIC_WORD97 = 0xA5EC  # accepted value of the 16-bit word at offset 0
_FIB_MAGIC_WORD95 = 0xA5DC  # second accepted value of that word
_FIB_FLAGS_OFFSET = 0x0A  # 16-bit flag word that carries the encrypted bit
_FIB_ENCRYPTED_FLAG = 0x0100  # bit mask (not an offset) inside the flag word
_FIB_FCMIN = 0x18  # U32: stream offset where the text starts
_FIB_FCMAC = 0x1C  # U32: end offset of the text (fcMac - fcMin = byte count)
_FIB_CCP_TEXT = 0x4C  # U32: character count of the main body
_FIB_CCP_FTN = 0x50  # U32: character count of the footnote text
_FIB_CCP_HDD = 0x54  # U32: character count of the header/footer text
_FIB_CCP_ATN = 0x5C  # U32: character count of the annotation text
_FIB_FC_PLCFBTEPAPX = 0x102  # U32: table-stream offset of PlcfBtePapx
_FIB_LCB_PLCFBTEPAPX = 0x106  # U32: byte length of PlcfBtePapx
_MIN_DOC_SIZE = 0x200  # parse() rejects WordDocument streams shorter than this
_FKP_SIZE = 512  # byte size of one FKP page sliced out of the WordDocument stream

# Pre-compiled little-endian unpackers for unsigned 16-bit / 32-bit integers.
_U16 = struct.Struct("<H")
_U32 = struct.Struct("<I")

# Translation table applied by _clean(): selected control characters are
# mapped to plain whitespace, and a value of None deletes the character.
_TRANS = str.maketrans({
    "\x07": "\t", "\x0b": "\n", "\x0c": "\n\n", "\x0d": "\n",
    "\x13": None, "\x14": " ", "\x15": None,
    "\x01": None, "\x08": None, "\x19": None, "\x1e": None, "\x1f": None,
    "\xa0": " ", "\x00": None, "\x7f": None,
})
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class WpsParseError(Exception):
    """Raised when the file cannot be parsed as a Word/WPS binary document.

    ``parse`` raises it for a wrong file extension, a missing or too-short
    ``WordDocument`` stream, an unrecognised FIB magic number, and
    encrypted files.
    """
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class Paragraph:
    """One paragraph of the document together with its recovered properties.

    Attributes:
        istd: Word style index. Heading 1-9 → 1-9, Normal → 0.
        text: Cleaned paragraph text.
        in_table: True when the paragraph lies inside a table
            (sprmPFInTable, 0x2416).
        is_row_end: True for the paragraph that terminates a table row
            (sprmPTtp, 0x2417).
        ilfo: 1-based list reference (sprmPIlfo). 0 → not a list item.
        ilvl: 0-based indent level (sprmPIlvl). Defaults to 0.
        list_ordered: True for ordered lists, False for bullets, None when
            the paragraph is not a list item.
    """

    istd: int
    text: str
    in_table: bool = False
    is_row_end: bool = False
    ilfo: int = 0
    ilvl: int = 0
    list_ordered: bool | None = None

    @property
    def heading_level(self) -> int:
        """Heading depth (1-9) taken from ``istd``; 0 for any other style."""
        if not 1 <= self.istd <= 9:
            return 0
        return self.istd
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
class WpsDocument:
    """Parsed WPS/.doc document, as returned by ``parse``.

    The string attributes hold cleaned text; ``paragraphs`` carries the
    per-paragraph style information consumed by ``to_markdown``.
    """

    # Cleaned text of the main document body.
    main_text: str
    # Main-body paragraphs with style/list/table flags; may be empty.
    paragraphs: list[Paragraph] = field(default_factory=list)
    # Cleaned footnote text, "" when there is none.
    footnotes: str = ""
    # Cleaned header/footer text, "" when there is none.
    headers_footers: str = ""
    # Cleaned annotation text, "" when there is none.
    annotations: str = ""
    # Name of the codec used to decode the text.
    encoding: str = "utf-16-le"
    # Page count taken from the OLE metadata, None when unavailable.
    num_pages: int | None = None
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _clean(text: str) -> str:
    """Normalise raw document text into plain, printable text.

    Control characters are translated or removed via ``_TRANS``, any
    remaining C0 controls are dropped, runs of spaces/tabs collapse to one
    space, three or more newlines collapse to two, and the result is
    stripped. An empty input yields an empty string.
    """
    if text:
        cleaned = text.translate(_TRANS)
        substitutions = (
            (r"[\x00-\x08\x0e-\x1f\x7f]", ""),
            (r"[ \t]+", " "),
            (r"\n{3,}", "\n\n"),
        )
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()
    return ""
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# Sprm operation codes used here (MS-DOC §2.6.4). These are the full 16-bit
# opcodes that _read_paragraphs() compares against the values yielded by
# _iter_sprms().
_SPRM_PF_IN_TABLE = 0x2416  # paragraph is inside a table cell
_SPRM_PT_TP = 0x2417  # paragraph terminates a table row
_SPRM_P_ILVL = 0x260A  # list indent level (0..8)
_SPRM_P_ILFO = 0x460B  # 1-based index into PlfLfo (0 = not a list item)
|
|
107
|
+
|
|
108
|
+
# Numbering Format Codes (MS-DOC §2.9.166). 23 = bullet (unordered) and
# 255 = no numbering; values 0-7, 22, 45-47 are common ordered formats. We
# treat 23 as the only strictly unordered case and everything else with a
# list reference as ordered.
_NFC_BULLET = 23
_NFC_NONE = 255


def _read_list_kinds(wd: bytes, table: bytes) -> dict[int, bool]:
    """Return an ``{ilfo: is_ordered}`` mapping built from PlfLst and PlfLfo.

    The returned dict tells whether the list referenced by a 1-based ``ilfo``
    index uses an ordered numbering format (True) or a bullet (False).
    Unknown / unparseable lists are simply absent from the mapping.

    Layout (MS-DOC: FibRgFcLcb97, PlfLst, LSTF, LVL/LVLF, PlfLfo, LFO):
        * FibRgFcLcb97 starts at offset 0x9A of the WordDocument stream and
          is an array of (fc, lcb) U32 pairs; PlfLst is entry 73 and PlfLfo
          is entry 74.
        * PlfLst: U16 cLst, then cLst LSTF records (28 bytes each). The LVL
          records follow the LSTF array and are NOT counted in lcbPlfLst.
        * LSTF: lsid (I32) at offset 0; bit 0x01 of the byte at offset 26 is
          fSimpleList (the list owns 1 LVL instead of 9).
        * LVL: LVLF (28 bytes; nfc at offset 4, cbGrpprlChpx at 24,
          cbGrpprlPapx at 25), then grpprlPapx, grpprlChpx and an xst
          (U16 cch followed by cch UTF-16 code units).
        * PlfLfo: U32 lfoMac, then lfoMac LFO records (16 bytes each, lsid at
          offset 0), then LFOData.

    Only the first level of every list is inspected, which is enough to
    classify the list as ordered or bulleted.

    Args:
        wd: Raw bytes of the WordDocument stream.
        table: Raw bytes of the table stream.

    Returns:
        Mapping from 1-based ``ilfo`` to True (ordered) / False (bullet).
        Empty when the list tables are absent, truncated or out of bounds.
    """
    fib_rg_fc_lcb = 0x9A
    plf_lst_entry = fib_rg_fc_lcb + 73 * 8
    plf_lfo_entry = fib_rg_fc_lcb + 74 * 8
    # A FIB too short to hold both entries simply has no list information.
    if not table or len(wd) < plf_lfo_entry + 8:
        return {}
    fc_lst, lcb_lst = struct.unpack_from("<II", wd, plf_lst_entry)
    fc_lfo, lcb_lfo = struct.unpack_from("<II", wd, plf_lfo_entry)
    if lcb_lst < 2 or lcb_lfo < 4:
        return {}
    table_len = len(table)
    if fc_lst + lcb_lst > table_len or fc_lfo + lcb_lfo > table_len:
        return {}

    # --- PlfLst: cLst + LSTF[cLst], followed by the LVL records ---
    lstf_size = 28
    lvlf_size = 28
    c_lst = struct.unpack_from("<H", table, fc_lst)[0]
    if 2 + c_lst * lstf_size > lcb_lst:
        return {}
    lsid_to_ordered: dict[int, bool] = {}
    # The LVLs start right after the LSTF array; they may extend past
    # fc_lst + lcb_lst, so they are bounded by the table stream itself.
    cursor = fc_lst + 2 + c_lst * lstf_size
    truncated = False
    for i in range(c_lst):
        lstf_off = fc_lst + 2 + i * lstf_size
        lsid = struct.unpack_from("<i", table, lstf_off)[0]
        # fSimpleList → the list owns a single LVL, otherwise nine.
        n_lvl = 1 if table[lstf_off + 26] & 0x01 else 9
        nfc_first: int | None = None
        for level in range(n_lvl):
            if cursor + lvlf_size > table_len:
                truncated = True
                break
            nfc = table[cursor + 4]
            cb_grpprl_chpx = table[cursor + 24]
            cb_grpprl_papx = table[cursor + 25]
            xst_off = cursor + lvlf_size + cb_grpprl_papx + cb_grpprl_chpx
            if xst_off + 2 > table_len:
                truncated = True
                break
            cch = struct.unpack_from("<H", table, xst_off)[0]
            cursor = xst_off + 2 + cch * 2
            if level == 0:
                nfc_first = nfc
        if nfc_first is not None and nfc_first != _NFC_NONE:
            lsid_to_ordered[lsid] = nfc_first != _NFC_BULLET
        if truncated:
            # The cursor can no longer be trusted for the remaining lists.
            break

    # --- PlfLfo: lfoMac + LFO[lfoMac] ---
    lfo_size = 16
    lfo_mac = struct.unpack_from("<I", table, fc_lfo)[0]
    if 4 + lfo_mac * lfo_size > lcb_lfo:
        return {}
    ilfo_to_ordered: dict[int, bool] = {}
    for i in range(lfo_mac):
        lsid = struct.unpack_from("<i", table, fc_lfo + 4 + i * lfo_size)[0]
        if lsid in lsid_to_ordered:
            ilfo_to_ordered[i + 1] = lsid_to_ordered[lsid]  # ilfo is 1-based
    return ilfo_to_ordered
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# Variable-length sprms whose size prefix does not follow the generic
# "one length byte" rule (MS-DOC §2.6.1).
_SPRM_T_DEF_TABLE = 0xD608  # operand starts with a 2-byte length
_SPRM_P_CHG_TABS = 0xC615  # length byte 255 → size derived from the tab arrays

# Operand size in bytes for every fixed-size spra value (opcode bits 13-15).
_SPRA_FIXED_SIZES = {0: 1, 1: 1, 2: 2, 3: 4, 4: 2, 5: 2, 7: 3}


def _iter_sprms(grpprl: bytes):
    """Yield ``(opcode, operand_bytes)`` for each sprm in a grpprl byte string.

    Operand size is encoded in bits 13-15 of the opcode (spra):
        0,1 → 1 byte; 2,4,5 → 2 bytes; 3 → 4 bytes; 7 → 3 bytes;
        6 → variable: a length byte follows the opcode and the operand is
        that byte plus the number of bytes it announces.

    Two spra-6 sprms are special-cased:
        * sprmTDefTable (0xD608) carries a 2-byte length whose value is the
          size of the remainder plus one.
        * sprmPChgTabs (0xC615) with a length byte of 255 has its size
          derived from the delete/add tab counts inside the operand.

    For variable-length sprms the yielded operand includes the length
    prefix. Iteration stops silently at the first sprm that does not fit in
    ``grpprl``.
    """
    pos = 0
    end = len(grpprl)
    while pos + 2 <= end:
        op = int.from_bytes(grpprl[pos:pos + 2], "little")
        pos += 2
        spra = (op >> 13) & 0x7
        if spra != 6:
            oplen = _SPRA_FIXED_SIZES[spra]
        elif op == _SPRM_T_DEF_TABLE:
            if pos + 2 > end:
                break
            cb = int.from_bytes(grpprl[pos:pos + 2], "little")
            # 2 length bytes + (cb - 1) payload bytes; never less than the
            # length field itself so the walk always makes progress.
            oplen = max(cb + 1, 2)
        else:
            if pos >= end:
                break
            oplen = grpprl[pos] + 1  # length byte itself counts
            if op == _SPRM_P_CHG_TABS and grpprl[pos] == 0xFF:
                # cb == 255: real size is cb byte + (1 + 4*cDel) bytes of
                # delete data + (1 + 3*cAdd) bytes of add data.
                del_at = pos + 1
                if del_at >= end:
                    break
                add_at = del_at + 1 + 4 * grpprl[del_at]
                if add_at >= end:
                    break
                oplen = (add_at - pos) + 1 + 3 * grpprl[add_at]
        if pos + oplen > end:
            break
        yield op, grpprl[pos:pos + oplen]
        pos += oplen
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _read_paragraphs(
    wd: bytes, table: bytes, fc_min: int, fc_mac: int,
    ilfo_ordered: dict[int, bool] | None = None,
) -> list[Paragraph]:
    """Walk PlcfBtePapx → FKPs and return paragraphs in document order.

    Args:
        wd: Raw bytes of the WordDocument stream.
        table: Raw bytes of the table stream.
        fc_min: Stream offset of the first text byte to include.
        fc_mac: Stream offset just past the last text byte to include.
        ilfo_ordered: Optional ``{ilfo: is_ordered}`` mapping. A list item
            whose ``ilfo`` is missing from it is assumed to be ordered.

    Returns:
        One :class:`Paragraph` per FKP run that overlaps ``[fc_min, fc_mac)``;
        the text is decoded as UTF-16-LE and cleaned. An empty list when
        PlcfBtePapx is missing or lies outside the table stream. FKP pages
        that are truncated or carry an impossible run count are skipped.
    """
    if len(wd) < _FIB_LCB_PLCFBTEPAPX + 4 or not table:
        return []
    fc_plcf = _U32.unpack_from(wd, _FIB_FC_PLCFBTEPAPX)[0]
    lcb_plcf = _U32.unpack_from(wd, _FIB_LCB_PLCFBTEPAPX)[0]
    if lcb_plcf < 12 or fc_plcf + lcb_plcf > len(table):
        return []

    plcf = table[fc_plcf:fc_plcf + lcb_plcf]
    n = (lcb_plcf - 4) // 8
    # PnFkpPapx: only the low 22 bits are the page number; the upper 10 bits
    # are undefined and must be ignored.
    a_pn = [
        pn & 0x3FFFFF
        for pn in struct.unpack_from(f"<{n}I", plcf, (n + 1) * 4)
    ]

    # A page holds (cpara + 1) U32 FCs, cpara 13-byte BxPap entries and the
    # trailing count byte, which caps cpara at 29 for a 512-byte page.
    max_cpara = (_FKP_SIZE - 5) // 17

    paragraphs: list[Paragraph] = []
    for pn in a_pn:
        fkp = wd[pn * _FKP_SIZE:(pn + 1) * _FKP_SIZE]
        if len(fkp) < _FKP_SIZE:
            continue
        cpara = fkp[_FKP_SIZE - 1]
        if cpara > max_cpara:
            # Corrupt page: the arrays would run past the page boundary.
            continue
        rgfc = struct.unpack_from(f"<{cpara + 1}I", fkp, 0)
        rgbx_off = (cpara + 1) * 4
        for i in range(cpara):
            fc_start, fc_end = rgfc[i], rgfc[i + 1]
            b_off = fkp[rgbx_off + i * 13]
            istd = 0
            in_table = False
            is_row_end = False
            ilfo = 0
            ilvl = 0
            if b_off != 0:
                papx_off = b_off * 2
                cb = fkp[papx_off]
                # PapxInFkp (MS-DOC §2.9.175). When cb != 0 the body
                # (istd + grpprl) is 2*cb - 1 bytes and starts at +1.
                # When cb == 0 the next byte cb' gives a body of 2*cb'
                # bytes that starts at +2.
                if cb != 0:
                    body_off = papx_off + 1
                    body_len = cb * 2 - 1
                else:
                    body_off = papx_off + 2
                    body_len = fkp[papx_off + 1] * 2
                istd_pos = body_off
                grp_start = body_off + 2  # grpprl follows the 2-byte istd
                grp_len = body_len - 2
                if istd_pos + 2 <= len(fkp):
                    istd = _U16.unpack_from(fkp, istd_pos)[0] & 0x0FFF
                if grp_len > 0 and grp_start + grp_len <= len(fkp):
                    for op, val in _iter_sprms(fkp[grp_start:grp_start + grp_len]):
                        if op == _SPRM_PF_IN_TABLE and val and val[0] != 0:
                            in_table = True
                        elif op == _SPRM_PT_TP and val and val[0] != 0:
                            is_row_end = True
                            in_table = True
                        elif op == _SPRM_P_ILFO and len(val) >= 2:
                            ilfo = _U16.unpack_from(val, 0)[0]
                        elif op == _SPRM_P_ILVL and val:
                            ilvl = val[0]
            # Clip the run to the requested text window.
            s = max(fc_start, fc_min)
            e = min(fc_end, fc_mac)
            if e <= s:
                continue
            raw = wd[s:e].decode("utf-16-le", errors="replace")
            text = _clean(raw)
            ordered: bool | None = None
            if ilfo and not in_table:
                if ilfo_ordered is not None and ilfo in ilfo_ordered:
                    ordered = ilfo_ordered[ilfo]
                else:
                    ordered = True  # have ilfo but no list-table info; assume ordered
            paragraphs.append(
                Paragraph(
                    istd=istd,
                    text=text,
                    in_table=in_table,
                    is_row_end=is_row_end,
                    ilfo=ilfo,
                    ilvl=ilvl,
                    list_ordered=ordered,
                )
            )
    return paragraphs
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def parse(path: Union[str, Path]) -> WpsDocument:
    """Parse a .wps file and return a :class:`WpsDocument`.

    Only WPS Writer ``.wps`` files in OLE2 Word-binary form are supported.

    Args:
        path: Location of the ``.wps`` file.

    Returns:
        The parsed document. ``paragraphs`` is populated only when the text
        is detected as UTF-16; for single-byte text it is an empty list.

    Raises:
        WpsParseError: For non-.wps inputs, files that are not OLE2
            containers, non-WPS binaries, encrypted files, or corrupt /
            truncated document structures.
        OSError: When the file itself cannot be opened (e.g. it is missing).
    """
    path = Path(path)
    if path.suffix.lower() != ".wps":
        raise WpsParseError(f"Only .wps files are supported (got {path.suffix!r})")
    # isOleFile() only checks the header signature, so a missing file still
    # surfaces as the usual OSError while a wrong file type becomes a
    # WpsParseError as documented.
    if not olefile.isOleFile(str(path)):
        raise WpsParseError("Not an OLE2 compound document")
    try:
        ole = olefile.OleFileIO(str(path))
    except OSError as exc:
        raise WpsParseError(f"Cannot read OLE2 container: {exc}") from exc
    try:
        if not ole.exists("WordDocument"):
            raise WpsParseError("No WordDocument stream — not a Word binary file")
        wd = ole.openstream("WordDocument").read()
        if len(wd) < _MIN_DOC_SIZE:
            raise WpsParseError("WordDocument stream too small")

        magic = _U16.unpack_from(wd, 0)[0]
        if magic not in (_FIB_MAGIC_WORD97, _FIB_MAGIC_WORD95):
            raise WpsParseError(f"Bad FIB magic: {hex(magic)}")

        flags = _U16.unpack_from(wd, _FIB_FLAGS_OFFSET)[0]
        if flags & _FIB_ENCRYPTED_FLAG:
            raise WpsParseError("File is encrypted / password-protected")

        # FIB flag fWhichTblStm (0x0200) names the live table stream: set →
        # 1Table, clear → 0Table. Files often contain both, so the flag must
        # decide; the other stream is only a fallback when the named one is
        # missing.
        which_tbl_stm = 0x0200
        if flags & which_tbl_stm:
            preferred, fallback = "1Table", "0Table"
        else:
            preferred, fallback = "0Table", "1Table"
        if ole.exists(preferred):
            table = ole.openstream(preferred).read()
        elif ole.exists(fallback):
            table = ole.openstream(fallback).read()
        else:
            table = b""

        fc_min = _U32.unpack_from(wd, _FIB_FCMIN)[0]
        fc_mac = _U32.unpack_from(wd, _FIB_FCMAC)[0]
        ccp_text = _U32.unpack_from(wd, _FIB_CCP_TEXT)[0]
        ccp_ftn = _U32.unpack_from(wd, _FIB_CCP_FTN)[0]
        ccp_hdd = _U32.unpack_from(wd, _FIB_CCP_HDD)[0]
        ccp_atn = _U32.unpack_from(wd, _FIB_CCP_ATN)[0]

        # Guess the character width by comparing the text byte span with the
        # character counts; fall back to UTF-16 when neither matches.
        text_bytes = fc_mac - fc_min
        total = ccp_text + ccp_ftn + ccp_hdd + ccp_atn
        if total > 0 and text_bytes == total * 2:
            mult, enc = 2, "utf-16-le"
        elif total > 0 and text_bytes == total:
            mult, enc = 1, "cp1252"
        else:
            mult, enc = 2, "utf-16-le"

        # The sub-documents are stored back to back starting at fc_min.
        pos = fc_min
        main = wd[pos:pos + ccp_text * mult]
        pos += ccp_text * mult
        ftn = wd[pos:pos + ccp_ftn * mult]
        pos += ccp_ftn * mult
        hdd = wd[pos:pos + ccp_hdd * mult]
        pos += ccp_hdd * mult
        atn = wd[pos:pos + ccp_atn * mult]

        ilfo_ordered = _read_list_kinds(wd, table) if mult == 2 else {}
        paragraphs = (
            _read_paragraphs(
                wd, table, fc_min, fc_min + ccp_text * mult, ilfo_ordered
            )
            if mult == 2
            else []
        )

        meta = ole.get_metadata()
        return WpsDocument(
            main_text=_clean(main.decode(enc, errors="replace")),
            paragraphs=paragraphs,
            footnotes=_clean(ftn.decode(enc, errors="replace")) if ftn else "",
            headers_footers=_clean(hdd.decode(enc, errors="replace")) if hdd else "",
            annotations=_clean(atn.decode(enc, errors="replace")) if atn else "",
            encoding=enc,
            num_pages=getattr(meta, "num_pages", None),
        )
    except (struct.error, IndexError) as exc:
        # Out-of-range offsets in a damaged file must surface as the
        # documented parse error rather than a low-level exception.
        raise WpsParseError(f"Corrupt or truncated document structure: {exc}") from exc
    finally:
        ole.close()
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def _escape_cell(text: str) -> str:
    """Make *text* safe to place inside one cell of a Markdown pipe table.

    Backslashes and pipes are backslash-escaped, newlines become spaces and
    surrounding whitespace is trimmed.
    """
    substitutions = str.maketrans({"\\": "\\\\", "|": "\\|", "\n": " "})
    escaped = text.translate(substitutions)
    return escaped.strip()
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def _flush_table(rows: list[list[str]], out: list[str]) -> None:
    """Render *rows* as one Markdown pipe table and append it to *out*.

    The first row becomes the header and is followed by a separator line;
    rows shorter than the widest one are padded with empty cells. The table
    is appended as a single newline-joined block, so a caller that joins
    *out* with blank lines gets them between blocks only, never between the
    rows of one table. *rows* is emptied afterwards. Does nothing when
    *rows* is empty.
    """
    if not rows:
        return
    width = max(map(len, rows))
    rendered: list[str] = []
    for index, row in enumerate(rows):
        cells = row + [""] * (width - len(row))
        rendered.append("| " + " | ".join(map(_escape_cell, cells)) + " |")
        if index == 0:
            # The separator goes directly under the header row.
            rendered.append("|" + "|".join([" --- "] * width) + "|")
    out.append("\n".join(rendered))
    rows.clear()
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def to_markdown(paragraphs: list[Paragraph]) -> str:
    """Render paragraphs to Markdown.

    Heading 1-9 (istd 1-9) become ``#``..``#########``. Paragraphs flagged
    as in-table are accumulated into a Markdown pipe table, with row
    boundaries from ``is_row_end``. List items (``list_ordered`` is not
    None) become ``-`` bullets or ``N.`` items indented two spaces per
    ``ilvl``; numbering is tracked per ``(ilfo, ilvl)`` and a sub-level
    restarts at 1 whenever a shallower item of the same list appears. All
    other paragraphs are emitted as plain text. Paragraphs with empty text
    outside tables are skipped.

    Args:
        paragraphs: Paragraphs in document order.

    Returns:
        The blocks joined by blank lines with a single trailing newline, or
        an empty string when nothing was rendered.
    """
    out: list[str] = []
    table_rows: list[list[str]] = []
    current_row: list[str] = []
    # Counters keyed by (ilfo, ilvl) for ordered list numbering.
    list_counters: dict[tuple[int, int], int] = {}
    for p in paragraphs:
        if p.in_table:
            if p.is_row_end:
                # Row terminator paragraph; commit the row we've accumulated.
                if current_row:
                    table_rows.append(current_row)
                    current_row = []
            else:
                current_row.append(p.text)
            continue
        # Leaving a table region: flush whatever we collected.
        if current_row:
            table_rows.append(current_row)
            current_row = []
        if table_rows:
            _flush_table(table_rows, out)
        if not p.text:
            continue
        if p.heading_level:
            out.append(f"{'#' * p.heading_level} {p.text}")
            list_counters.clear()
        elif p.list_ordered is not None:
            # A shallower (or same-level) item closes every deeper level of
            # this list, so their numbering must start over next time.
            stale = [
                key for key in list_counters
                if key[0] == p.ilfo and key[1] > p.ilvl
            ]
            for key in stale:
                del list_counters[key]
            indent = "  " * max(0, p.ilvl)
            key = (p.ilfo, p.ilvl)
            if p.list_ordered:
                list_counters[key] = list_counters.get(key, 0) + 1
                marker = f"{list_counters[key]}."
            else:
                marker = "-"
            out.append(f"{indent}{marker} {p.text}")
        else:
            out.append(p.text)
            list_counters.clear()
    # Flush any trailing table.
    if current_row:
        table_rows.append(current_row)
    if table_rows:
        _flush_table(table_rows, out)
    return "\n\n".join(out) + "\n" if out else ""
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: wps2md
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Parse legacy WPS Writer (.wps) OLE2 binary files into structured text and Markdown.
|
|
5
|
+
Project-URL: Homepage, https://github.com/listeng/wps2md
|
|
6
|
+
Project-URL: Issues, https://github.com/listeng/wps2md/issues
|
|
7
|
+
Author-email: listeng <aosp@163.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: markdown,office,ole2,parser,word,wps
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Office/Business :: Office Suites
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: olefile>=0.46
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# wps2md
|
|
26
|
+
|
|
27
|
+
A tiny Python library and CLI for converting legacy **WPS Writer `.wps`**
|
|
28
|
+
files (OLE2 Word-binary format, FIB magic `0xA5EC`/`0xA5DC`) into
|
|
29
|
+
structured text and Markdown.
|
|
30
|
+
|
|
31
|
+
Unlike `.docx` (which is OOXML/zip and can be read by `python-docx`),
|
|
32
|
+
`.wps` files saved by WPS Office are binary OLE2 compound documents.
|
|
33
|
+
This library reads the `WordDocument` stream, validates the FIB,
|
|
34
|
+
recovers paragraph style indices (`istd`) via `PlcfBtePapx` → FKPs,
|
|
35
|
+
and renders Heading 1-9 styles as `#`..`#########` in Markdown.
|
|
36
|
+
|
|
37
|
+
## Install
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install wps2md
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## CLI
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
wps2md example.wps # print Markdown to stdout
|
|
47
|
+
wps2md example.wps > example.md
|
|
48
|
+
python -m wps2md example.wps # equivalent
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Library
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from wps2md import parse, to_markdown
|
|
55
|
+
|
|
56
|
+
doc = parse("example.wps")
|
|
57
|
+
print(doc.main_text) # plain text of the main body
|
|
58
|
+
print(doc.num_pages) # from OLE SummaryInformation
|
|
59
|
+
print(to_markdown(doc.paragraphs)) # Markdown with H1-H9 from Word styles
|
|
60
|
+
|
|
61
|
+
for p in doc.paragraphs:
|
|
62
|
+
print(p.heading_level, p.text) # 0 for normal text, 1-9 for headings
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## API
|
|
66
|
+
|
|
67
|
+
- `parse(path) -> WpsDocument` — parse a `.wps` file.
|
|
68
|
+
- `WpsDocument` — dataclass with `main_text`, `paragraphs`, `footnotes`,
|
|
69
|
+
`headers_footers`, `annotations`, `encoding`, `num_pages`.
|
|
70
|
+
- `Paragraph` — dataclass for one paragraph: `istd`, `text`, plus the
  table/list flags `in_table`, `is_row_end`, `ilfo`, `ilvl` and
  `list_ordered`; `heading_level` returns 1-9 for built-in Heading
  styles, else 0.
|
|
72
|
+
- `to_markdown(paragraphs) -> str` — render paragraphs as Markdown.
|
|
73
|
+
- `WpsParseError` — raised for non-`.wps` inputs, encrypted files,
|
|
74
|
+
or unreadable streams.
|
|
75
|
+
|
|
76
|
+
## Limitations
|
|
77
|
+
|
|
78
|
+
- Images, footnote/header paragraph styles, complex fields, and CHPX
  (character formatting like bold/italic) are not currently surfaced.
  Markdown output is driven by paragraph-level properties only: Heading
  styles, list membership, and table cell/row flags (tables are rendered
  as simple pipe tables).
|
|
81
|
+
- Encrypted/password-protected files are rejected.
|
|
82
|
+
- Only the OLE2 Word-binary variant of `.wps` is supported (modern WPS
|
|
83
|
+
Office still writes this for `.wps`; the OOXML `.docx` variant should
|
|
84
|
+
be read with `python-docx` instead).
|
|
85
|
+
|
|
86
|
+
## License
|
|
87
|
+
|
|
88
|
+
MIT
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
wps2md/__init__.py,sha256=K_F2i_q2ilOY_iRP_oXzTlJkYHOyQznIU_78jUfHMhY,464
|
|
2
|
+
wps2md/__main__.py,sha256=ezUxufYfOWyJ1BpG4QjP5-xXXG-8xivDO8YWol_nQtQ,509
|
|
3
|
+
wps2md/core.py,sha256=xLJcdDwKOS9PPVEmyvitefnRGGiJ_BbOEW-nlVjoGBY,18023
|
|
4
|
+
wps2md-0.1.0.dist-info/METADATA,sha256=RXw3zpZWKX7lBgOV28qWyygW4sHe_r7LSMgbLJQt9tw,3068
|
|
5
|
+
wps2md-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
6
|
+
wps2md-0.1.0.dist-info/entry_points.txt,sha256=bjL38tRwUVfoH1Wj-xAPGAqiUyY_CGpPyoHpzuEoUis,48
|
|
7
|
+
wps2md-0.1.0.dist-info/licenses/LICENSE,sha256=NI8IXX5oLwwNgDs7cheXIUtNYoxgHMTWTqnMKJ7VHIs,1064
|
|
8
|
+
wps2md-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 listeng
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|