hwp2md 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +7 -0
- package/README.md +174 -0
- package/dist/browser.d.mts +179 -0
- package/dist/browser.d.mts.map +1 -0
- package/dist/browser.mjs +27 -0
- package/dist/browser.mjs.map +1 -0
- package/dist/cli/index.d.mts +1 -0
- package/dist/cli/index.mjs +687 -0
- package/dist/cli/index.mjs.map +1 -0
- package/dist/converter-C0C25ssg.mjs +3 -0
- package/dist/converter-D6LrZNSL.mjs +4804 -0
- package/dist/converter-D6LrZNSL.mjs.map +1 -0
- package/dist/index.cjs +1200 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +454 -0
- package/dist/index.d.cts.map +1 -0
- package/dist/index.d.mts +454 -0
- package/dist/index.d.mts.map +1 -0
- package/dist/index.mjs +1126 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +92 -0
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,1126 @@
|
|
|
1
|
+
import * as CFB from "cfb";
|
|
2
|
+
import { inflateRaw } from "pako";
|
|
3
|
+
import { unzipSync } from "fflate";
|
|
4
|
+
import { XMLParser } from "fast-xml-parser";
|
|
5
|
+
|
|
6
|
+
//#region src/utils/compression.ts
|
|
7
|
+
/**
 * Decompress raw deflate data (no zlib header).
 * Equivalent to Python: zlib.decompress(data, -15)
 *
 * HWP files use raw deflate compression without zlib wrapper headers.
 * The windowBits=-15 in Python indicates raw deflate mode.
 *
 * @param data - Compressed data
 * @returns Decompressed data
 * @throws {Error} "Failed to decompress data: <reason>" when inflation fails;
 *   the value thrown by the inflater is attached as `cause`.
 */
function decompressRaw(data) {
  try {
    return inflateRaw(data);
  } catch (error) {
    // The inflater is not guaranteed to throw an Error instance (it may throw a
    // plain string), so never read `.message` blindly or the text would end in
    // "undefined".
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to decompress data: ${reason}`, { cause: error });
  }
}
|
|
24
|
+
|
|
25
|
+
//#endregion
|
|
26
|
+
//#region src/parser.ts
|
|
27
|
+
/**
|
|
28
|
+
* HWP 5.0 File Parser
|
|
29
|
+
* Parses OLE Compound File format HWP files
|
|
30
|
+
*/
|
|
31
|
+
/**
 * HWP File Parser
 * Reads HWP 5.0 files stored in the OLE Compound File format.
 *
 * Typical use: create an instance (constructor or a static factory), call
 * open(), read streams/sections, then close().
 */
var HWPFile = class HWPFile {
  /** Parsed compound-file container; null until open() succeeds and after close(). */
  cfb = null;
  /** Result of parseFileHeader(); null until open() succeeds and after close(). */
  _fileHeader = null;
  /** Mirrors the "compressed" flag of the file header. */
  _isCompressed = false;
  /**
   * Create HWPFile from raw data
   * @param data - Raw HWP file data
   */
  constructor(data) {
    this.data = data;
  }
  /**
   * Create HWPFile from file path (Node.js only)
   * @param path - Path to HWP file
   */
  static async fromFile(path) {
    const { readFile } = await import("node:fs/promises");
    return new HWPFile(new Uint8Array(await readFile(path)));
  }
  /**
   * Create HWPFile from ArrayBuffer
   * @param data - ArrayBuffer data
   */
  static fromArrayBuffer(data) {
    return new HWPFile(new Uint8Array(data));
  }
  /**
   * Create HWPFile from Uint8Array
   * @param data - Uint8Array data
   */
  static fromUint8Array(data) {
    return new HWPFile(data);
  }
  /**
   * Return stream content as a Uint8Array.
   * open() hands the container library a plain number array when Buffer is
   * unavailable, so stream content is not guaranteed to be a typed array;
   * everything below needs `.buffer`/`.byteOffset` and TextDecoder support.
   */
  static #toBytes(content) {
    return content instanceof Uint8Array ? content : new Uint8Array(content);
  }
  /** Human-readable reason for any thrown value (not only Error instances). */
  static #reason(error) {
    return error instanceof Error ? error.message : String(error);
  }
  /**
   * Open and parse HWP file
   * @throws {Error} for HWP 3.0 input, an unreadable container, or an invalid FileHeader
   */
  open() {
    const uint8Array = this.data instanceof Uint8Array ? this.data : new Uint8Array(this.data);
    if (uint8Array.length >= 30) {
      // HWP 3.0 files are not compound files; they start with a plain-text signature.
      const leading = new TextDecoder("utf-8").decode(uint8Array.slice(0, 30)).replace(/\0+$/, "");
      if (leading.startsWith("HWP Document File V3")) throw new Error("HWP 3.0 format (HWP 97, 2002, etc.) is not supported. Please use HWP 5.0 or later format. You can convert HWP 3.0 files to HWP 5.0 format using Hancom Office.");
    }
    try {
      if (typeof Buffer !== "undefined") {
        this.cfb = CFB.read(Buffer.from(uint8Array), { type: "buffer" });
      } else {
        this.cfb = CFB.read(Array.from(uint8Array), { type: "array" });
      }
    } catch (error) {
      throw new Error(`Failed to parse HWP file: ${HWPFile.#reason(error)}`, { cause: error });
    }
    this._fileHeader = this.parseFileHeader();
    this._isCompressed = this._fileHeader.isCompressed;
  }
  /**
   * Close HWP file and release resources
   */
  close() {
    this.cfb = null;
    this._fileHeader = null;
    // Do not keep reporting the previous file's compression flag.
    this._isCompressed = false;
  }
  /**
   * Get file header information
   */
  get fileHeader() {
    return this._fileHeader;
  }
  /**
   * Check if file is compressed
   */
  get isCompressed() {
    return this._isCompressed;
  }
  /**
   * Parse FileHeader stream (256 bytes fixed)
   *
   * Layout read here: bytes 0-31 signature text, little-endian uint32 version
   * at offset 32 (one byte each for major.minor.patch.rev, most significant
   * first), little-endian uint32 property flags at offset 36
   * (bit 0 = compressed, bit 1 = encrypted).
   *
   * @returns { signature, version, isCompressed, isEncrypted, rawProperties }
   * @throws {Error} when the file is not open, the stream is missing, its
   *   size is not 256 bytes, or the signature is wrong
   */
  parseFileHeader() {
    if (!this.cfb) throw new Error("HWP file not opened");
    const entry = CFB.find(this.cfb, "FileHeader");
    if (!entry) throw new Error("FileHeader not found in HWP file");
    const data = HWPFile.#toBytes(entry.content ?? []);
    if (data.length !== 256) throw new Error(`Invalid FileHeader size: ${data.length} (expected 256)`);
    const signature = new TextDecoder("utf-8").decode(data.subarray(0, 32)).replace(/\0+$/, "");
    if (!signature.startsWith("HWP Document File")) throw new Error(`Invalid HWP signature: ${signature}`);
    const view = new DataView(data.buffer, data.byteOffset, data.byteLength);
    const versionRaw = view.getUint32(32, true);
    const major = versionRaw >> 24 & 255;
    const minor = versionRaw >> 16 & 255;
    const patch = versionRaw >> 8 & 255;
    const rev = versionRaw & 255;
    const properties = view.getUint32(36, true);
    return {
      signature,
      version: `${major}.${minor}.${patch}.${rev}`,
      isCompressed: Boolean(properties & 1),
      isEncrypted: Boolean(properties & 2),
      rawProperties: properties
    };
  }
  /**
   * Read and decompress stream
   * @param streamPath - Stream path (e.g., 'DocInfo', 'BodyText/Section0')
   * @returns Stream bytes (inflated when the header marks the file as
   *   compressed), or null if the file is not open or the stream doesn't exist
   * @throws {Error} when decompression fails
   */
  readStream(streamPath) {
    if (!this.cfb) return null;
    const entry = CFB.find(this.cfb, streamPath);
    if (!entry || entry.content == null) return null;
    const raw = HWPFile.#toBytes(entry.content);
    if (!this._isCompressed) return raw;
    try {
      return decompressRaw(raw);
    } catch (error) {
      throw new Error(`Failed to decompress ${streamPath}: ${HWPFile.#reason(error)}`, { cause: error });
    }
  }
  /**
   * List all streams in HWP file
   * @returns One entry per stream (container entries of type 2); each entry
   *   is the array of non-empty "/"-separated segments of the entry name
   */
  listStreams() {
    if (!this.cfb) return [];
    const streams = [];
    for (const entry of this.cfb.FileIndex) {
      if (entry.type !== 2) continue;
      const path = entry.name.split("/").filter((p) => p);
      if (path.length > 0) streams.push(path);
    }
    return streams;
  }
  /**
   * Get file information
   * @returns Summary object, or {} when no header has been parsed
   */
  getFileInfo() {
    if (!this._fileHeader) return {};
    return {
      signature: this._fileHeader.signature,
      version: this._fileHeader.version,
      compressed: this._isCompressed,
      encrypted: this._fileHeader.isEncrypted,
      streams: this.listStreams()
    };
  }
  /**
   * Get number of sections in BodyText
   * Counts consecutive Section0, Section1, ... entries and stops at the
   * first index that cannot be found.
   */
  getSectionCount() {
    if (!this.cfb) return 0;
    let count = 0;
    while (CFB.find(this.cfb, `BodyText/Section${count}`) || CFB.find(this.cfb, `Section${count}`)) count++;
    return count;
  }
  /**
   * Read section data
   * @param sectionIndex - Section index (0-based)
   * @returns Decompressed section data (null when the section is missing)
   */
  readSection(sectionIndex) {
    let data = this.readStream(`BodyText/Section${sectionIndex}`);
    if (!data) data = this.readStream(`Section${sectionIndex}`);
    return data;
  }
};
|
|
201
|
+
|
|
202
|
+
//#endregion
|
|
203
|
+
//#region src/record.ts
|
|
204
|
+
// Record tag identifiers. Every tag is expressed as an offset from
// HWPTAG_BEGIN so the numbering scheme is visible at a glance.
const HWPTAG_BEGIN = 16;

// Offsets 0-10.
const HWPTAG_DOCUMENT_PROPERTIES = HWPTAG_BEGIN + 0;
const HWPTAG_ID_MAPPINGS = HWPTAG_BEGIN + 1;
const HWPTAG_BIN_DATA = HWPTAG_BEGIN + 2;
const HWPTAG_FACE_NAME = HWPTAG_BEGIN + 3;
const HWPTAG_BORDER_FILL = HWPTAG_BEGIN + 4;
const HWPTAG_CHAR_SHAPE = HWPTAG_BEGIN + 5;
const HWPTAG_TAB_DEF = HWPTAG_BEGIN + 6;
const HWPTAG_NUMBERING = HWPTAG_BEGIN + 7;
const HWPTAG_BULLET = HWPTAG_BEGIN + 8;
const HWPTAG_PARA_SHAPE = HWPTAG_BEGIN + 9;
const HWPTAG_STYLE = HWPTAG_BEGIN + 10;

// Offsets 50 and up.
const HWPTAG_PARA_HEADER = HWPTAG_BEGIN + 50;
const HWPTAG_PARA_TEXT = HWPTAG_BEGIN + 51;
const HWPTAG_PARA_CHAR_SHAPE = HWPTAG_BEGIN + 52;
const HWPTAG_PARA_LINE_SEG = HWPTAG_BEGIN + 53;
const HWPTAG_PARA_RANGE_TAG = HWPTAG_BEGIN + 54;
const HWPTAG_CTRL_HEADER = HWPTAG_BEGIN + 55;
const HWPTAG_LIST_HEADER = HWPTAG_BEGIN + 56;
const HWPTAG_PAGE_DEF = HWPTAG_BEGIN + 57;
const HWPTAG_FOOTNOTE_SHAPE = HWPTAG_BEGIN + 58;
const HWPTAG_PAGE_BORDER_FILL = HWPTAG_BEGIN + 59;
const HWPTAG_SHAPE_COMPONENT = HWPTAG_BEGIN + 60;
const HWPTAG_TABLE = HWPTAG_BEGIN + 61;
const HWPTAG_SHAPE_COMPONENT_LINE = HWPTAG_BEGIN + 62;
const HWPTAG_CTRL_DATA = HWPTAG_BEGIN + 71;
|
|
230
|
+
/**
 * HWP Record Reader
 * Reads binary records sequentially from HWP stream data.
 *
 * Each record starts with a little-endian uint32 header:
 * bits 0-9 tag id, bits 10-19 level, bits 20-31 size. A size field of 4095
 * means the real size follows as another little-endian uint32.
 */
var RecordReader = class {
  data;
  offset = 0;
  /**
   * @param data - Stream bytes (Uint8Array) to read records from
   */
  constructor(data) {
    this.data = data;
  }
  /**
   * Check if there are more records to read
   */
  hasMore() {
    return this.offset < this.data.length;
  }
  /**
   * Decode the header at the current offset without moving the cursor.
   * @returns { tagId, level, size, dataOffset } or null when the header
   *   (including an extended size field) does not fit in the remaining bytes
   */
  #decodeHeader() {
    const total = this.data.length;
    if (this.offset + 4 > total) return null;
    const view = new DataView(this.data.buffer, this.data.byteOffset, this.data.byteLength);
    const word = view.getUint32(this.offset, true);
    const tagId = word & 1023;
    const level = word >> 10 & 1023;
    let size = word >> 20 & 4095;
    let dataOffset = this.offset + 4;
    if (size === 4095) {
      if (dataOffset + 4 > total) return null;
      size = view.getUint32(dataOffset, true);
      dataOffset += 4;
    }
    return { tagId, level, size, dataOffset };
  }
  /**
   * Read next record
   *
   * When the remaining bytes do not hold a complete record the rest of the
   * stream is treated as consumed, so hasMore() turns false. Without this a
   * truncated stream would make every `while (reader.hasMore())` loop that
   * calls readRecord() spin forever.
   *
   * @returns Next record { tagId, level, data, size }, or null if no more
   *   (complete) records
   */
  readRecord() {
    const header = this.#decodeHeader();
    if (!header || header.dataOffset + header.size > this.data.length) {
      this.offset = this.data.length;
      return null;
    }
    const { tagId, level, size, dataOffset } = header;
    const end = dataOffset + size;
    const recordData = this.data.slice(dataOffset, end);
    this.offset = end;
    return {
      tagId,
      level,
      data: recordData,
      size
    };
  }
  /**
   * Peek at next record header without consuming it
   * @returns Header info with tagId, level, size (the payload is not checked
   *   for completeness), or null when no full header remains
   */
  peekRecordHeader() {
    const header = this.#decodeHeader();
    if (!header) return null;
    const { tagId, level, size } = header;
    return { tagId, level, size };
  }
  /**
   * Read all records (considering hierarchy)
   * @param parentLevel - Stop when reaching this level or below; omit to
   *   read until the data is exhausted
   * @returns The records read, in stream order
   */
  readAllRecords(parentLevel) {
    const records = [];
    while (this.hasMore()) {
      const header = this.peekRecordHeader();
      if (!header) break;
      if (parentLevel !== void 0 && header.level <= parentLevel) break;
      const record = this.readRecord();
      if (!record) break;
      records.push(record);
    }
    return records;
  }
  /**
   * Get current position
   */
  get position() {
    return this.offset;
  }
  /**
   * Get remaining bytes
   */
  get remaining() {
    return this.data.length - this.offset;
  }
};
|
|
322
|
+
|
|
323
|
+
//#endregion
|
|
324
|
+
//#region src/table.ts
|
|
325
|
+
/**
|
|
326
|
+
* HWP Table Parser
|
|
327
|
+
* Parses table records and converts to Markdown
|
|
328
|
+
*/
|
|
329
|
+
/**
 * Decode the fixed leading fields of a TABLE record.
 *
 * Reads a little-endian uint32 property word at offset 0, followed by the
 * row count (uint16, offset 4) and the column count (uint16, offset 6).
 *
 * @param data - TABLE record data
 * @returns { properties, rows, cols }
 * @throws {Error} when the record holds fewer than 8 bytes
 */
function parseTableProperties(data) {
  const byteCount = data.length;
  if (byteCount < 8) {
    throw new Error(`Invalid TABLE record size: ${byteCount}`);
  }
  const fields = new DataView(data.buffer, data.byteOffset, data.byteLength);
  const properties = fields.getUint32(0, true);
  const rows = fields.getUint16(4, true);
  const cols = fields.getUint16(6, true);
  return { properties, rows, cols };
}
|
|
343
|
+
/**
 * Decode cell position and span from cell property data.
 *
 * Four consecutive little-endian uint16 values are read: column, row,
 * column span, row span. Input shorter than 8 bytes yields the default
 * cell at (0, 0) spanning a single row and column.
 *
 * @param data - Cell property data
 * @returns { col, row, colspan, rowspan }
 */
function parseCellProperties(data) {
  const tooShort = data.length < 8;
  if (tooShort) {
    return { col: 0, row: 0, colspan: 1, rowspan: 1 };
  }
  const fields = new DataView(data.buffer, data.byteOffset, data.byteLength);
  const [col, row, colspan, rowspan] = [0, 2, 4, 6].map((at) => fields.getUint16(at, true));
  return { col, row, colspan, rowspan };
}
|
|
363
|
+
/**
 * Parse table from TABLE record and subsequent records
 *
 * Consumes records from `reader` until rows * cols cells were read, a record
 * with level below 2 shows up, or the stream ends. Every LIST_HEADER record
 * starts a cell; the text of the PARA_HEADER/PARA_TEXT pairs that follow it
 * becomes the cell text.
 *
 * A failed read (reader.readRecord() returning null, e.g. on a truncated
 * stream) ends parsing and returns the cells collected so far. Ignoring that
 * null would leave the cursor in place and loop forever.
 *
 * @param tableRecordData - TABLE record data
 * @param reader - Record reader for reading cell data
 * @param lineBreakStyle - How to handle line breaks in cells: "br" joins
 *   paragraphs/line breaks with "<br>", anything else with a space
 * @returns Parsed table { rows, cols, cells }
 * @throws {Error} when the TABLE record is too short (from parseTableProperties)
 */
function parseTable(tableRecordData, reader, lineBreakStyle = "space") {
  const { rows, cols } = parseTableProperties(tableRecordData);
  const cells = [];
  let cellIndex = 0;
  while (reader.hasMore() && cellIndex < rows * cols) {
    const header = reader.peekRecordHeader();
    if (!header) break;
    if (header.level < 2) break;
    if (header.tagId !== HWPTAG_LIST_HEADER) {
      // Unrelated record: skip it, but stop if it cannot be read.
      if (!reader.readRecord()) break;
      continue;
    }
    const listRecord = reader.readRecord();
    if (!listRecord) break;
    let row, col, colspan, rowspan;
    if (listRecord.data.length >= 16) {
      // Cell address and span are read from offsets 8-15 of the list header.
      const view = new DataView(listRecord.data.buffer, listRecord.data.byteOffset, listRecord.data.byteLength);
      col = view.getUint16(8, true);
      row = view.getUint16(10, true);
      colspan = view.getUint16(12, true);
      rowspan = view.getUint16(14, true);
      if (colspan === 0) colspan = 1;
      if (rowspan === 0) rowspan = 1;
    } else {
      // No address available: assume cells arrive in row-major order.
      row = Math.floor(cellIndex / cols);
      col = cellIndex % cols;
      colspan = 1;
      rowspan = 1;
    }
    const textParts = [];
    while (reader.hasMore()) {
      const paraHeader = reader.peekRecordHeader();
      if (!paraHeader) break;
      if (paraHeader.tagId === HWPTAG_LIST_HEADER) break;
      if (paraHeader.level < 2) break;
      if (paraHeader.tagId !== HWPTAG_PARA_HEADER) {
        if (!reader.readRecord()) break;
        continue;
      }
      const paraRec = reader.readRecord();
      if (!paraRec) break;
      if (paraRec.data.length < 4) continue;
      // Character count: low 31 bits of the first uint32.
      const nchars = new DataView(paraRec.data.buffer, paraRec.data.byteOffset, paraRec.data.byteLength).getUint32(0, true) & 2147483647;
      if (nchars === 0 || !reader.hasMore()) continue;
      const nextH = reader.peekRecordHeader();
      if (!nextH || nextH.tagId !== HWPTAG_PARA_TEXT) continue;
      const textRec = reader.readRecord();
      if (!textRec) break;
      const text = processControlChars(parseParaText(textRec.data, nchars)).text.trim();
      if (text) textParts.push(text);
    }
    const separator = lineBreakStyle === "br" ? "<br>" : " ";
    cells.push({
      row,
      col,
      rowspan,
      colspan,
      text: textParts.join(separator).replace(/\n/g, separator)
    });
    cellIndex++;
  }
  return {
    rows,
    cols,
    cells
  };
}
|
|
447
|
+
/**
 * Convert table to Markdown
 *
 * Output is a pipe table with an empty header row, a `---` separator row and
 * one line per table row. Pipe characters inside cell text are escaped as
 * `\|` so they cannot be mistaken for column delimiters.
 *
 * @param table - Table object { rows, cols, cells }
 * @param mergeStrategy - 'repeat' (default) copies a merged cell's text into
 *   every position it spans; any other value ('blank') writes it only at the
 *   cell's own row/col
 * @returns Markdown table, or "[Empty Table]" when rows or cols is 0
 */
function tableToMarkdown(table, mergeStrategy = "repeat") {
  const { rows, cols, cells } = table;
  if (rows === 0 || cols === 0) return "[Empty Table]";
  const matrix = Array.from({ length: rows }, () => Array(cols).fill(""));
  for (const cell of cells) {
    const text = String(cell.text ?? "").replace(/\|/g, "\\|");
    if (mergeStrategy === "repeat") {
      // Clamp the spanned area to the declared table size.
      const lastRow = Math.min(cell.row + cell.rowspan, rows);
      const lastCol = Math.min(cell.col + cell.colspan, cols);
      for (let r = cell.row; r < lastRow; r++) {
        for (let c = cell.col; c < lastCol; c++) matrix[r][c] = text;
      }
    } else if (cell.row < rows && cell.col < cols) {
      matrix[cell.row][cell.col] = text;
    }
  }
  const toLine = (values) => "| " + values.join(" | ") + " |";
  const lines = [
    toLine(Array(cols).fill("")),
    toLine(Array(cols).fill("---")),
    ...matrix.map(toLine)
  ];
  return lines.join("\n");
}
|
|
466
|
+
|
|
467
|
+
//#endregion
|
|
468
|
+
//#region src/paragraph.ts
|
|
469
|
+
/**
|
|
470
|
+
* HWP Paragraph Parser
|
|
471
|
+
* Parses paragraph records and extracts text
|
|
472
|
+
*/
|
|
473
|
+
/**
 * Decode the leading fields of a PARA_HEADER record.
 *
 * All multi-byte values are little-endian. The text count is the first
 * uint32 with its top bit masked off.
 *
 * @param data - PARA_HEADER record data
 * @returns { textCount, controlMask, paraShapeId, styleId, columnType, charShapeCount }
 * @throws {Error} when the record holds fewer than 22 bytes
 */
function parseParaHeader(data) {
  const MIN_BYTES = 22;
  if (data.length < MIN_BYTES) {
    throw new Error(`Invalid PARA_HEADER size: ${data.length}`);
  }
  const fields = new DataView(data.buffer, data.byteOffset, data.byteLength);
  const textCount = fields.getUint32(0, true) & 2147483647;
  const controlMask = fields.getUint32(4, true);
  const paraShapeId = fields.getUint16(8, true);
  const styleId = fields.getUint8(10);
  const columnType = fields.getUint8(11);
  const charShapeCount = fields.getUint16(12, true);
  return { textCount, controlMask, paraShapeId, styleId, columnType, charShapeCount };
}
|
|
490
|
+
/**
 * Decode the text of a PARA_TEXT record.
 *
 * The record is scanned from the start in 16-bit little-endian units. While
 * the unit at the cursor is a control code (below 32) other than 0, 10, 13,
 * 30 or 31, a 16-byte block is skipped. Everything after the skipped blocks
 * is decoded as UTF-16LE.
 *
 * Only blocks at the very start of the record are skipped; control codes
 * that appear later stay in the returned string.
 *
 * @param data - PARA_TEXT record data
 * @param _nchars - Character count from the paragraph header (currently unused)
 * @returns Decoded text; "" when nothing remains after skipping or decoding fails
 */
function parseParaText(data, _nchars) {
  const BLOCK_BYTES = 16;
  const keptControlCodes = [0, 10, 13, 30, 31];
  const codeAt = (at) => new DataView(data.buffer, data.byteOffset + at, 2).getUint16(0, true);
  const startsBlock = (code) => code < 32 && !keptControlCodes.includes(code);

  let cursor = 0;
  while (cursor + 2 <= data.length && startsBlock(codeAt(cursor))) {
    cursor += BLOCK_BYTES;
    // A block running past the end leaves nothing to decode.
    if (cursor > data.length) break;
  }

  const remainder = data.slice(cursor);
  if (remainder.length === 0) return "";
  try {
    const decoder = new TextDecoder("utf-16le");
    return decoder.decode(remainder);
  } catch {
    return "";
  }
}
|
|
530
|
+
/**
 * Strip control characters from decoded paragraph text.
 *
 * Code points below 32 are handled as follows: 10 and 13 become "\n", 11
 * sets the table flag and is dropped, every other one is dropped. All other
 * characters are kept unchanged.
 *
 * @param text - Raw text with control characters
 * @returns { text, hasTable } - cleaned text and whether code 11 was seen
 */
function processControlChars(text) {
  let cleaned = "";
  let hasTable = false;
  for (const ch of text) {
    const codePoint = ch.codePointAt(0) ?? 0;
    if (codePoint >= 32) {
      cleaned += ch;
      continue;
    }
    switch (codePoint) {
      case 10:
      case 13:
        cleaned += "\n";
        break;
      case 11:
        hasTable = true;
        break;
      default:
        break;
    }
  }
  return { text: cleaned, hasTable };
}
|
|
551
|
+
/**
 * Paragraph Parser
 * Parses paragraphs (and the tables attached to them) from an HWP record
 * stream.
 */
var ParagraphParser = class {
  /**
   * @param reader - Record reader positioned at the records to parse
   * @param options - Parser options; `tableLineBreakStyle` falls back to
   *   "space" when missing or empty. The object is copied, never modified.
   */
  constructor(reader, options = {}) {
    this.reader = reader;
    this.options = {
      ...options,
      tableLineBreakStyle: options.tableLineBreakStyle || "space"
    };
  }
  /**
   * Parse next paragraph
   *
   * Skips records until a PARA_HEADER is found, reads the PARA_TEXT record
   * that directly follows it (if any), then scans the remaining records of
   * the paragraph for CTRL_HEADER + TABLE pairs and renders those tables as
   * Markdown. A record that cannot be read (truncated stream) ends the scan
   * instead of being retried forever.
   *
   * @returns Parsed paragraph { text, header }, or null if no more paragraphs
   * @throws {Error} when the PARA_HEADER record is too short (from parseParaHeader)
   */
  parseParagraph() {
    let record = null;
    while (this.reader.hasMore()) {
      const candidate = this.reader.readRecord();
      if (!candidate) return null;
      if (candidate.tagId === HWPTAG_PARA_HEADER) {
        record = candidate;
        break;
      }
    }
    if (!record) return null;
    const header = parseParaHeader(record.data);
    let text = "";
    if (header.textCount > 0) {
      const nextHeader = this.reader.peekRecordHeader();
      if (nextHeader && nextHeader.tagId === HWPTAG_PARA_TEXT) {
        const textRecord = this.reader.readRecord();
        if (textRecord) text = processControlChars(parseParaText(textRecord.data, header.textCount)).text;
      }
    }
    const tables = [];
    while (this.reader.hasMore()) {
      const nextHeader = this.reader.peekRecordHeader();
      if (!nextHeader) break;
      if (nextHeader.tagId === HWPTAG_PARA_HEADER) break;
      if (nextHeader.level === 0) break;
      const isControl = nextHeader.tagId === HWPTAG_CTRL_HEADER;
      // Consume the record (control header or unrelated record).
      if (!this.reader.readRecord()) break;
      if (!isControl) continue;
      const tableHeader = this.reader.peekRecordHeader();
      if (!tableHeader || tableHeader.tagId !== HWPTAG_TABLE) continue;
      const tableRecord = this.reader.readRecord();
      if (!tableRecord) break;
      try {
        const tableMd = tableToMarkdown(parseTable(tableRecord.data, this.reader, this.options.tableLineBreakStyle));
        tables.push(tableMd);
      } catch (error) {
        console.warn("Warning: Failed to parse table:", error);
        tables.push("[TABLE - Parse Error]");
      }
    }
    if (tables.length > 0) {
      text = text.trimEnd();
      // Very short text made only of non-ASCII/whitespace characters is
      // replaced by the tables rather than printed in front of them.
      const isPlaceholder = text.length > 0 && text.length < 5 && [...text].every((c) => {
        return (c.codePointAt(0) ?? 0) > 127 || /\s/.test(c);
      });
      if (isPlaceholder) text = tables.join("\n\n");
      else {
        if (text) text += "\n\n";
        text += tables.join("\n\n");
      }
    }
    return {
      text,
      header
    };
  }
  /**
   * Parse all paragraphs in section
   * @returns All paragraphs, in stream order
   */
  parseAllParagraphs() {
    const paragraphs = [];
    while (this.reader.hasMore()) {
      const para = this.parseParagraph();
      if (!para) break;
      paragraphs.push(para);
    }
    return paragraphs;
  }
};
|
|
633
|
+
|
|
634
|
+
//#endregion
|
|
635
|
+
//#region src/hwpx_parser.ts
|
|
636
|
+
/**
|
|
637
|
+
* HWPX (ZIP+XML) File Parser
|
|
638
|
+
* Parses HWPX files (ZIP archives containing XML) into Paragraphs and Tables
|
|
639
|
+
*/
|
|
640
|
+
/** Expected content of the archive's `mimetype` entry. */
const HWPX_MIMETYPE = "application/hwp+zip";
/** Element names that are always parsed as arrays, even when they occur once. */
const HWPX_ARRAY_ELEMENTS = new Set([
  "hp:p",
  "hp:run",
  "hp:t",
  "hp:tr",
  "hp:tc",
  "hp:tbl",
  "opf:item",
  "opf:itemref"
]);
/** Options for the XML parser */
const xmlParserOptions = {
  ignoreAttributes: false,
  attributeNamePrefix: "@_",
  removeNSPrefix: false,
  // Keep element text as written. With value parsing enabled the parser
  // converts numeric-looking text to numbers, which rewrites document text
  // (e.g. leading zeros are lost).
  parseTagValue: false,
  isArray: (name) => HWPX_ARRAY_ELEMENTS.has(name),
  textNodeName: "#text"
};
|
|
660
|
+
/**
 * HWPX file parser
 *
 * Opens a HWPX archive (ZIP), validates its `mimetype` entry, reads the
 * version from `version.xml` and determines the ordered list of section
 * XML files.
 */
var HWPXFile = class HWPXFile {
  data;
  /** Archive entries by path; null until open() succeeds and after close(). */
  entries = null;
  /** Archive paths of the section XML files, in reading order. */
  sectionPaths = [];
  version = "";
  xmlParser;
  /**
   * @param data - Raw HWPX (ZIP) bytes
   */
  constructor(data) {
    this.data = data;
    this.xmlParser = new XMLParser(xmlParserOptions);
  }
  /**
   * Create HWPXFile from file path (Node.js only)
   * @param path - Path to HWPX file
   */
  static async fromFile(path) {
    // Asynchronous read: this method is async, so do not block the event loop.
    const { readFile } = await import("node:fs/promises");
    return new HWPXFile(new Uint8Array(await readFile(path)));
  }
  /**
   * Create HWPXFile from ArrayBuffer
   */
  static fromArrayBuffer(data) {
    return new HWPXFile(new Uint8Array(data));
  }
  /**
   * Create HWPXFile from Uint8Array
   */
  static fromUint8Array(data) {
    return new HWPXFile(data);
  }
  /** Last run of digits in a name, or +Infinity when there is none. */
  static #sectionNumber(name) {
    const match = /(\d+)\D*$/.exec(name);
    return match ? Number(match[1]) : Number.POSITIVE_INFINITY;
  }
  /**
   * Order names by their trailing number so "section2" sorts before
   * "section10"; ties fall back to plain string order.
   */
  static #compareSectionNames(a, b) {
    const numA = HWPXFile.#sectionNumber(a);
    const numB = HWPXFile.#sectionNumber(b);
    if (numA !== numB) return numA < numB ? -1 : 1;
    if (a === b) return 0;
    return a < b ? -1 : 1;
  }
  /**
   * Unzip the archive, validate it and discover its sections.
   * @throws {Error} when the data is not a ZIP archive, cannot be unzipped,
   *   has no `mimetype` entry, or the mimetype is not the HWPX one
   */
  open() {
    // ZIP local file header magic: "PK\x03\x04".
    if (this.data.length < 4 || this.data[0] !== 80 || this.data[1] !== 75 || this.data[2] !== 3 || this.data[3] !== 4) throw new Error("Not a valid HWPX file (not a ZIP archive)");
    try {
      this.entries = unzipSync(this.data);
    } catch (e) {
      throw new Error(`Failed to parse HWPX file: ${e instanceof Error ? e.message : e}`);
    }
    const mimetypeEntry = this.entries["mimetype"];
    if (!mimetypeEntry) throw new Error("Not a valid HWPX file: missing 'mimetype' entry");
    const mimetype = new TextDecoder().decode(mimetypeEntry).trim();
    if (mimetype !== HWPX_MIMETYPE) throw new Error(`Invalid HWPX mimetype: '${mimetype}' (expected '${HWPX_MIMETYPE}')`);
    this.parseVersion();
    this.discoverSections();
  }
  /**
   * Release the archive entries and forget the discovered sections.
   */
  close() {
    this.entries = null;
    this.sectionPaths = [];
  }
  /**
   * Set `version` from `version.xml`: "major.minor.micro" when a major
   * attribute exists, else the `Version` attribute, else "unknown" (also
   * used when the entry is missing or cannot be parsed).
   */
  parseVersion() {
    const versionEntry = this.entries?.["version.xml"];
    if (!versionEntry) {
      this.version = "unknown";
      return;
    }
    try {
      const xml = new TextDecoder().decode(versionEntry);
      const parsed = this.xmlParser.parse(xml);
      const root = parsed["hv:HCFVersion"] ?? parsed["HCFVersion"] ?? parsed;
      const major = root["@_major"] ?? "";
      const minor = root["@_minor"] ?? "";
      const micro = root["@_micro"] ?? "";
      if (major) this.version = `${major}.${minor}.${micro}`;
      else this.version = root["@_Version"] ?? "unknown";
    } catch {
      this.version = "unknown";
    }
  }
  /**
   * Fallback discovery: every `Contents/section*.xml` entry of the archive,
   * ordered by section number. Replaces any partially collected result.
   */
  #discoverSectionsByName() {
    this.sectionPaths = Object.keys(this.entries ?? {})
      .filter((name) => name.startsWith("Contents/section") && name.endsWith(".xml"))
      .sort(HWPXFile.#compareSectionNames);
  }
  /**
   * Fill `sectionPaths`.
   *
   * Preferred source is `Contents/content.hpf`: spine item references whose
   * id contains "section" are resolved through the manifest, keeping spine
   * order. If the spine yields nothing, manifest items whose href contains
   * "section" are used, ordered by item id. Without a readable
   * `content.hpf` the archive entry names are used instead.
   */
  discoverSections() {
    this.sectionPaths = [];
    const hpfEntry = this.entries?.["Contents/content.hpf"];
    if (!hpfEntry) {
      this.#discoverSectionsByName();
      return;
    }
    try {
      const parsed = this.xmlParser.parse(new TextDecoder().decode(hpfEntry));
      const pkg = parsed["opf:package"] ?? parsed["package"] ?? {};
      const manifest = pkg["opf:manifest"] ?? pkg["manifest"] ?? {};
      const spine = pkg["opf:spine"] ?? pkg["spine"] ?? {};
      const asList = (value) => Array.isArray(value) ? value : [value];
      const toContentsPath = (href) => href.startsWith("Contents/") ? href : "Contents/" + href;
      // A Map keeps arbitrary ids from colliding with Object.prototype keys.
      const hrefById = new Map();
      for (const item of asList(manifest["opf:item"] ?? manifest["item"] ?? [])) {
        const id = item["@_id"] ?? "";
        const href = item["@_href"] ?? "";
        if (href) hrefById.set(id, href);
      }
      for (const ref of asList(spine["opf:itemref"] ?? spine["itemref"] ?? [])) {
        const idref = ref["@_idref"] ?? "";
        if (hrefById.has(idref) && idref.toLowerCase().includes("section")) {
          this.sectionPaths.push(toContentsPath(hrefById.get(idref)));
        }
      }
      if (this.sectionPaths.length === 0) {
        const byId = [...hrefById.entries()].sort(([a], [b]) => HWPXFile.#compareSectionNames(a, b));
        for (const [, href] of byId) {
          if (href.toLowerCase().includes("section")) this.sectionPaths.push(toContentsPath(href));
        }
      }
    } catch {
      this.#discoverSectionsByName();
    }
  }
  /**
   * Summary of the open file.
   * @throws {Error} when the file is not open
   */
  get fileInfo() {
    this.ensureOpen();
    return {
      format: "HWPX",
      version: this.version,
      sectionCount: this.sectionPaths.length,
      contents: Object.keys(this.entries ?? {})
    };
  }
  /**
   * Number of discovered sections.
   * @throws {Error} when the file is not open
   */
  getSectionCount() {
    this.ensureOpen();
    return this.sectionPaths.length;
  }
  /**
   * Parse one section's XML.
   * @param index - Section index (0-based)
   * @returns Parsed XML object of the section
   * @throws {RangeError} when the index is out of range
   * @throws {Error} when the file is not open or the section entry is missing
   */
  getSectionXml(index) {
    this.ensureOpen();
    if (index < 0 || index >= this.sectionPaths.length) throw new RangeError(`Section index ${index} out of range (0-${this.sectionPaths.length - 1})`);
    const path = this.sectionPaths[index];
    const entry = this.entries?.[path];
    if (!entry) throw new Error(`Section file not found: ${path}`);
    const xml = new TextDecoder().decode(entry);
    return this.xmlParser.parse(xml);
  }
  /**
   * Paths of all archive entries.
   * @throws {Error} when the file is not open
   */
  listContents() {
    this.ensureOpen();
    return Object.keys(this.entries ?? {});
  }
  /**
   * @throws {Error} when open() has not been called (or close() was)
   */
  ensureOpen() {
    if (!this.entries) throw new Error("HWPX file is not open. Call open() first.");
  }
};
|
|
794
|
+
/**
 * Normalize a value to an array.
 * null/undefined become [], an array is returned as-is (same reference),
 * and anything else is wrapped in a one-element array.
 */
function ensureArray(val) {
  if (val == null) {
    return [];
  }
  if (Array.isArray(val)) {
    return val;
  }
  return [val];
}
|
|
798
|
+
/**
 * Extract text from a paragraph object (hp:p).
 *
 * Walks every hp:run of the paragraph and concatenates, in the key order of
 * each run object: the text of hp:t children, a newline for hp:lineBreak and
 * a single space for hp:tab.
 *
 * @param pObj - Parsed hp:p object; a nullish value yields ""
 * @returns Concatenated paragraph text
 */
function extractParagraphText(pObj) {
  const parts = [];
  for (const run of ensureArray(pObj?.["hp:run"])) {
    // Only object runs can carry child elements; skipping the rest also avoids
    // Object.entries(null) throwing on a null element inside an array of runs.
    if (run === null || typeof run !== "object") continue;
    for (const [key, value] of Object.entries(run)) {
      switch (key) {
        case "hp:t":
          for (const t of ensureArray(value)) {
            // An element with attributes arrives as an object holding "#text";
            // otherwise the value itself is the text. It is not always a
            // string: a parser that converts tag values (fast-xml-parser does
            // this by default; the parser options are not visible here) yields
            // numbers for content such as "2024", which must not be dropped.
            const text = t !== null && typeof t === "object" ? t["#text"] : t;
            if (text !== undefined && text !== null) parts.push(String(text));
          }
          break;
        case "hp:lineBreak":
          parts.push("\n");
          break;
        case "hp:tab":
          parts.push(" ");
          break;
        default:
          break;
      }
    }
  }
  return parts.join("");
}
|
|
811
|
+
/**
 * Extract text from a table cell (hp:tc).
 *
 * Collects the trimmed, non-empty text of every hp:p under the cell's
 * hp:subList and joins them on a single line.
 *
 * @param tcObj - Parsed hp:tc object
 * @param lineBreakStyle - "br" joins with "<br>"; any other value joins with a space
 * @returns Cell text with paragraph boundaries and newlines replaced by the separator
 */
function extractCellText(tcObj, lineBreakStyle = "space") {
  const separator = lineBreakStyle === "br" ? "<br>" : " ";
  const container = tcObj["hp:subList"];
  const cellParagraphs = container ? ensureArray(container["hp:p"]) : [];
  const pieces = [];
  for (const paragraph of cellParagraphs) {
    const trimmed = extractParagraphText(paragraph).trim();
    if (trimmed) {
      pieces.push(trimmed);
    }
  }
  // Newlines inside a paragraph get the same separator as paragraph boundaries.
  return pieces.join(separator).replace(/\n/g, separator);
}
|
|
824
|
+
/**
 * Parse a table XML object (hp:tbl) into a Table.
 *
 * @param tblObj - Parsed hp:tbl object
 * @param lineBreakStyle - Passed through to extractCellText
 * @returns `{ rows, cols, cells }` where rows/cols come from the rowCnt/colCnt
 *   attributes (0 when absent or unusable) and each cell carries
 *   `{ row, col, rowspan, colspan, text }`
 */
function parseHwpxTable(tblObj, lineBreakStyle = "space") {
  // Number() turns a malformed attribute into NaN, which would slip past the
  // "< 1" span clamp and leak into the result; fall back to the default instead.
  const toFiniteNumber = (raw, fallback) => {
    const parsed = Number(raw ?? fallback);
    return Number.isFinite(parsed) ? parsed : fallback;
  };

  const rows = toFiniteNumber(tblObj["@_rowCnt"], 0);
  const cols = toFiniteNumber(tblObj["@_colCnt"], 0);
  const cells = [];
  for (const tr of ensureArray(tblObj["hp:tr"])) {
    for (const tc of ensureArray(tr["hp:tc"])) {
      const addr = tc["hp:cellAddr"];
      const span = tc["hp:cellSpan"];
      cells.push({
        row: toFiniteNumber(addr?.["@_rowAddr"], 0),
        col: toFiniteNumber(addr?.["@_colAddr"], 0),
        // A span below 1 is clamped to 1.
        rowspan: Math.max(1, toFiniteNumber(span?.["@_rowSpan"], 1)),
        colspan: Math.max(1, toFiniteNumber(span?.["@_colSpan"], 1)),
        text: extractCellText(tc, lineBreakStyle)
      });
    }
  }
  return { rows, cols, cells };
}
|
|
855
|
+
/**
 * Parse a section XML object into paragraphs.
 *
 * Each hp:p becomes one paragraph whose text is the paragraph text followed by
 * the Markdown of any tables found in its runs (hp:run > hp:tbl) or directly
 * under it (hp:p > hp:tbl). Tables found directly under the section are
 * appended afterwards as paragraphs of their own. A table that fails to parse
 * or render is represented by the "[TABLE - Parse Error]" placeholder in all
 * three positions.
 *
 * @param sectionObj - Parsed section XML, either the hs:sec element or an
 *   object wrapping it; a nullish value yields []
 * @param options - Conversion options; only tableLineBreakStyle is read here
 *   (default "space")
 * @returns Paragraph list of `{ text, header }`; header is a zero-filled stub
 */
function parseHwpxSection(sectionObj, options = {}) {
  const lineBreakStyle = options?.tableLineBreakStyle ?? "space";
  const paragraphs = [];
  if (sectionObj === undefined || sectionObj === null) return paragraphs;

  // Every paragraph gets its own header object so that mutating one
  // paragraph's header can never affect another.
  const makeHeader = () => ({
    textCount: 0,
    controlMask: 0,
    paraShapeId: 0,
    styleId: 0,
    columnType: 0,
    charShapeCount: 0
  });

  // One failing table must not abort the whole section.
  const renderTable = (tbl) => {
    try {
      return tableToMarkdown(parseHwpxTable(tbl, lineBreakStyle));
    } catch {
      return "[TABLE - Parse Error]";
    }
  };

  const sec = sectionObj["hs:sec"] ? sectionObj["hs:sec"] : sectionObj;

  for (const p of ensureArray(sec["hp:p"])) {
    let text = extractParagraphText(p);
    const tableMds = [];
    for (const run of ensureArray(p["hp:run"])) {
      for (const tbl of ensureArray(run?.["hp:tbl"])) tableMds.push(renderTable(tbl));
    }
    for (const tbl of ensureArray(p["hp:tbl"])) tableMds.push(renderTable(tbl));

    if (tableMds.length > 0) {
      // Separate the paragraph text from its tables with a blank line.
      text = text.trimEnd();
      if (text) text += "\n\n";
      text += tableMds.join("\n\n");
    }
    if (text.trim()) paragraphs.push({ text, header: makeHeader() });
  }

  for (const tbl of ensureArray(sec["hp:tbl"])) {
    // Previously a failure here was swallowed and the table vanished from the
    // output; emit the same placeholder the paragraph-level tables use.
    const md = renderTable(tbl);
    if (typeof md === "string" && md.trim()) {
      paragraphs.push({ text: md, header: makeHeader() });
    }
  }
  return paragraphs;
}
|
|
911
|
+
/**
 * Check whether binary data starts with the ZIP local-file signature
 * ("PK\x03\x04"), which is how HWPX data is recognised here.
 *
 * @param data - Uint8Array, or anything accepted by `new Uint8Array(data)`
 * @returns true when at least four bytes are present and they match the signature
 */
function isHwpxData(data) {
  const ZIP_SIGNATURE = [80, 75, 3, 4];
  const bytes = data instanceof Uint8Array ? data : new Uint8Array(data);
  if (bytes.length < ZIP_SIGNATURE.length) {
    return false;
  }
  return ZIP_SIGNATURE.every((expected, position) => bytes[position] === expected);
}
|
|
918
|
+
/**
 * Check whether a file path ends with the ".hwpx" extension (case-insensitive).
 *
 * @param path - File path string
 * @returns true when the lower-cased path ends with ".hwpx"
 */
function isHwpxPath(path) {
  const HWPX_EXTENSION = ".hwpx";
  const normalized = path.toLowerCase();
  return normalized.endsWith(HWPX_EXTENSION);
}
|
|
924
|
+
|
|
925
|
+
//#endregion
|
|
926
|
+
//#region src/converter.ts
|
|
927
|
+
/**
|
|
928
|
+
* HWP/HWPX to Markdown Converter
|
|
929
|
+
* Orchestrates conversion from HWP/HWPX to Markdown
|
|
930
|
+
*/
|
|
931
|
+
/**
 * Convert paragraphs to Markdown.
 *
 * Each paragraph's text is trimmed; empty results are dropped, and so are
 * fragments shorter than 5 UTF-16 units that consist solely of non-ASCII
 * characters (unless they start with "|", i.e. look like a table row).
 * The survivors are joined with a blank line between them.
 *
 * @param paragraphs - List of paragraphs (objects with a string `text`)
 * @returns Markdown text
 */
function paragraphsToMarkdown(paragraphs) {
  const isShortNonAsciiFragment = (candidate) => {
    if (candidate.length >= 5 || candidate.startsWith("|")) {
      return false;
    }
    for (const ch of candidate) {
      if ((ch.codePointAt(0) ?? 0) <= 127) {
        return false;
      }
    }
    return true;
  };

  const blocks = [];
  for (const { text: rawText } of paragraphs) {
    const cleaned = rawText.trim();
    if (cleaned === "" || isShortNonAsciiFragment(cleaned)) {
      continue;
    }
    blocks.push(cleaned);
  }
  return blocks.join("\n\n");
}
|
|
946
|
+
/**
 * Convert HWP file to Markdown.
 *
 * Reads every section of the opened file, parses its records into paragraphs
 * and renders the combined list with paragraphsToMarkdown. Sections for which
 * readSection() returns a falsy value are skipped.
 *
 * @param hwp - Opened HWP file (must provide getSectionCount() and readSection(i))
 * @param options - Conversion options; tableLineBreakStyle defaults to "space".
 *   The caller's object is not modified.
 * @returns Markdown content
 */
function convertHwpToMarkdown(hwp, options = {}) {
  // Work on a copy: writing the default back into `options` would mutate a
  // caller-owned object and throw outright on a frozen one.
  const effectiveOptions = {
    ...options,
    tableLineBreakStyle: options?.tableLineBreakStyle || "space"
  };
  const allParagraphs = [];
  const sectionCount = hwp.getSectionCount();
  for (let i = 0; i < sectionCount; i++) {
    const sectionData = hwp.readSection(i);
    if (!sectionData) continue;
    const parser = new ParagraphParser(new RecordReader(sectionData), effectiveOptions);
    // Append one by one: push(...list) passes every element as a call
    // argument and can exceed the engine's argument limit on huge sections.
    for (const paragraph of parser.parseAllParagraphs()) allParagraphs.push(paragraph);
  }
  return paragraphsToMarkdown(allParagraphs);
}
|
|
964
|
+
/**
 * Convert HWPX file to Markdown.
 *
 * Parses every section of the opened package with parseHwpxSection and renders
 * the combined paragraph list with paragraphsToMarkdown.
 *
 * @param hwpx - Opened HWPX file (must provide getSectionCount() and getSectionXml(i))
 * @param options - Conversion options, forwarded unchanged to parseHwpxSection
 * @returns Markdown content
 */
function convertHwpxToMarkdown(hwpx, options = {}) {
  const allParagraphs = [];
  const sectionCount = hwpx.getSectionCount();
  for (let i = 0; i < sectionCount; i++) {
    const paragraphs = parseHwpxSection(hwpx.getSectionXml(i), options);
    // Append one by one: push(...paragraphs) passes every element as a call
    // argument and throws a RangeError once a section is large enough.
    for (const paragraph of paragraphs) allParagraphs.push(paragraph);
  }
  return paragraphsToMarkdown(allParagraphs);
}
|
|
979
|
+
/**
 * High-level API: Convert HWP/HWPX file to Markdown.
 *
 * Format selection: a string path is treated as HWPX when it has the .hwpx
 * extension; binary input is treated as HWPX when it starts with the ZIP
 * magic bytes. Everything else is handled as HWP. The file object is always
 * closed, whether or not the conversion succeeds.
 *
 * @param input - File path (Node.js), ArrayBuffer, or Uint8Array
 * @param options - Conversion options
 * @returns Markdown content
 */
async function convert(input, options) {
  const isPath = typeof input === "string";
  const isBytes = input instanceof Uint8Array;
  const isBuffer = input instanceof ArrayBuffer;

  let file;
  let toMarkdown;
  if (isPath && isHwpxPath(input)) {
    file = await HWPXFile.fromFile(input);
    toMarkdown = convertHwpxToMarkdown;
  } else if ((isBytes || isBuffer) && isHwpxData(isBuffer ? new Uint8Array(input) : input)) {
    file = isBytes ? HWPXFile.fromUint8Array(input) : HWPXFile.fromArrayBuffer(input);
    toMarkdown = convertHwpxToMarkdown;
  } else if (isPath) {
    file = await HWPFile.fromFile(input);
    toMarkdown = convertHwpToMarkdown;
  } else if (isBytes) {
    file = HWPFile.fromUint8Array(input);
    toMarkdown = convertHwpToMarkdown;
  } else {
    file = HWPFile.fromArrayBuffer(input);
    toMarkdown = convertHwpToMarkdown;
  }

  try {
    file.open();
    return toMarkdown(file, options);
  } finally {
    file.close();
  }
}
|
|
1016
|
+
|
|
1017
|
+
//#endregion
|
|
1018
|
+
//#region src/utils/binary.ts
|
|
1019
|
+
/**
 * Binary Reader Utility
 * Replaces Python's struct.unpack functionality
 * All multi-byte values are little-endian
 *
 * Wraps an ArrayBuffer, or any ArrayBuffer view (Uint8Array, other typed
 * arrays, DataView), in a DataView with a moving read offset. No bytes are
 * copied: peekBytes(), readBytes() and slice() return views over the same
 * underlying buffer.
 */
var BinaryReader = class BinaryReader {
  view;
  offset = 0;
  /**
   * @param data - ArrayBuffer or ArrayBuffer view to read from. For a view,
   *   only its own window (byteOffset/byteLength) of the buffer is exposed.
   */
  constructor(data) {
    if (ArrayBuffer.isView(data)) {
      // Covers every typed array and DataView, not only Uint8Array.
      this.view = new DataView(data.buffer, data.byteOffset, data.byteLength);
    } else {
      this.view = new DataView(data, 0, data.byteLength);
    }
  }
  /**
   * Read unsigned 8-bit integer (BYTE)
   */
  readUint8() {
    const value = this.view.getUint8(this.offset);
    this.offset += 1;
    return value;
  }
  /**
   * Read unsigned 16-bit integer (WORD) - little-endian
   */
  readUint16LE() {
    const value = this.view.getUint16(this.offset, true);
    this.offset += 2;
    return value;
  }
  /**
   * Read unsigned 32-bit integer (DWORD) - little-endian
   */
  readUint32LE() {
    const value = this.view.getUint32(this.offset, true);
    this.offset += 4;
    return value;
  }
  /**
   * Read signed 32-bit integer - little-endian
   */
  readInt32LE() {
    const value = this.view.getInt32(this.offset, true);
    this.offset += 4;
    return value;
  }
  /**
   * Read bytes without advancing offset (peek).
   * Returns a Uint8Array view over the underlying buffer, not a copy.
   */
  peekBytes(length) {
    return new Uint8Array(this.view.buffer, this.view.byteOffset + this.offset, length);
  }
  /**
   * Read bytes and advance offset.
   * Returns a Uint8Array view over the underlying buffer, not a copy.
   */
  readBytes(length) {
    const bytes = new Uint8Array(this.view.buffer, this.view.byteOffset + this.offset, length);
    this.offset += length;
    return bytes;
  }
  /**
   * Skip bytes (the new offset is not validated)
   */
  skip(length) {
    this.offset += length;
  }
  /**
   * Set absolute position (the new offset is not validated)
   */
  seek(offset) {
    this.offset = offset;
  }
  /**
   * Get current position
   */
  get position() {
    return this.offset;
  }
  /**
   * Get remaining bytes
   */
  get remaining() {
    return this.view.byteLength - this.offset;
  }
  /**
   * Check if more data is available
   */
  hasMore(minBytes = 1) {
    return this.remaining >= minBytes;
  }
  /**
   * Get total length
   */
  get length() {
    return this.view.byteLength;
  }
  /**
   * Create a new BinaryReader for a subset of data.
   * start/end are relative to the beginning of this reader's data (not to the
   * current offset); end defaults to the end of the data.
   */
  slice(start, end) {
    const actualEnd = end ?? this.view.byteLength;
    return new BinaryReader(new Uint8Array(this.view.buffer, this.view.byteOffset + start, actualEnd - start));
  }
};
|
|
1123
|
+
|
|
1124
|
+
//#endregion
|
|
1125
|
+
export { BinaryReader, HWPFile, HWPTAG_BEGIN, HWPTAG_BIN_DATA, HWPTAG_BORDER_FILL, HWPTAG_BULLET, HWPTAG_CHAR_SHAPE, HWPTAG_CTRL_DATA, HWPTAG_CTRL_HEADER, HWPTAG_DOCUMENT_PROPERTIES, HWPTAG_FACE_NAME, HWPTAG_FOOTNOTE_SHAPE, HWPTAG_ID_MAPPINGS, HWPTAG_LIST_HEADER, HWPTAG_NUMBERING, HWPTAG_PAGE_BORDER_FILL, HWPTAG_PAGE_DEF, HWPTAG_PARA_CHAR_SHAPE, HWPTAG_PARA_HEADER, HWPTAG_PARA_LINE_SEG, HWPTAG_PARA_RANGE_TAG, HWPTAG_PARA_SHAPE, HWPTAG_PARA_TEXT, HWPTAG_SHAPE_COMPONENT, HWPTAG_SHAPE_COMPONENT_LINE, HWPTAG_STYLE, HWPTAG_TABLE, HWPTAG_TAB_DEF, HWPXFile, ParagraphParser, RecordReader, convert, convertHwpToMarkdown, convertHwpxToMarkdown, decompressRaw, isHwpxData, isHwpxPath, paragraphsToMarkdown, parseCellProperties, parseHwpxSection, parseHwpxTable, parseParaHeader, parseParaText, parseTable, parseTableProperties, processControlChars, tableToMarkdown };
|
|
1126
|
+
//# sourceMappingURL=index.mjs.map
|