hwp2md 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +7 -0
- package/README.md +174 -0
- package/dist/browser.d.mts +179 -0
- package/dist/browser.d.mts.map +1 -0
- package/dist/browser.mjs +27 -0
- package/dist/browser.mjs.map +1 -0
- package/dist/cli/index.d.mts +1 -0
- package/dist/cli/index.mjs +687 -0
- package/dist/cli/index.mjs.map +1 -0
- package/dist/converter-C0C25ssg.mjs +3 -0
- package/dist/converter-D6LrZNSL.mjs +4804 -0
- package/dist/converter-D6LrZNSL.mjs.map +1 -0
- package/dist/index.cjs +1200 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +454 -0
- package/dist/index.d.cts.map +1 -0
- package/dist/index.d.mts +454 -0
- package/dist/index.d.mts.map +1 -0
- package/dist/index.mjs +1126 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +92 -0
|
@@ -0,0 +1,687 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { Command } from "commander";
|
|
3
|
+
import { readFile, writeFile } from "node:fs/promises";
|
|
4
|
+
import * as CFB from "cfb";
|
|
5
|
+
import { inflateRaw } from "pako";
|
|
6
|
+
import "fflate";
|
|
7
|
+
import "fast-xml-parser";
|
|
8
|
+
|
|
9
|
+
//#region src/utils/compression.ts
|
|
10
|
+
/**
 * Inflate a raw DEFLATE payload (no zlib header).
 *
 * Equivalent to Python's `zlib.decompress(data, -15)`; HWP streams are stored
 * as raw deflate data without the zlib wrapper.
 *
 * @param data - Compressed data
 * @returns Decompressed data
 * @throws {Error} "Failed to decompress data: <reason>" when inflation fails;
 *   the original thrown value is attached as `cause`.
 */
function decompressRaw(data) {
  try {
    return inflateRaw(data);
  } catch (error) {
    // The thrown value is not guaranteed to be an Error instance (the inflate
    // library may throw a plain message string), so never rely on `.message`.
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to decompress data: ${reason}`, { cause: error });
  }
}
|
|
27
|
+
|
|
28
|
+
//#endregion
|
|
29
|
+
//#region src/parser.ts
|
|
30
|
+
/**
|
|
31
|
+
* HWP 5.0 File Parser
|
|
32
|
+
* Parses OLE Compound File format HWP files
|
|
33
|
+
*/
|
|
34
|
+
/**
 * Reader for HWP 5.0 documents stored in an OLE Compound File container.
 *
 * Usage: construct (directly or through a static factory), call `open()`,
 * read streams or sections, then call `close()`.
 */
var HWPFile = class HWPFile {
  /** Parsed compound-file container; null until open() succeeds and after close(). */
  cfb = null;
  /** Parsed FileHeader fields; null until open() succeeds and after close(). */
  _fileHeader = null;
  /** Copy of the FileHeader "compressed" flag while the file is open. */
  _isCompressed = false;

  /**
   * Create HWPFile from raw data
   * @param data - Raw HWP file bytes (a Uint8Array, or anything `new Uint8Array()` accepts)
   */
  constructor(data) {
    this.data = data;
  }

  /**
   * Create HWPFile from a file path (Node.js only)
   * @param path - Path to HWP file
   * @returns Promise resolving to an unopened HWPFile
   */
  static async fromFile(path) {
    const { readFile } = await import("node:fs/promises");
    const contents = await readFile(path);
    return new HWPFile(new Uint8Array(contents));
  }

  /**
   * Create HWPFile from ArrayBuffer
   * @param data - ArrayBuffer data
   */
  static fromArrayBuffer(data) {
    return new HWPFile(new Uint8Array(data));
  }

  /**
   * Create HWPFile from Uint8Array
   * @param data - Uint8Array data
   */
  static fromUint8Array(data) {
    return new HWPFile(data);
  }

  /**
   * Parse the container and the FileHeader stream.
   *
   * @throws {Error} for HWP 3.0 files, for data the compound-file parser
   *   rejects, and for a missing or invalid FileHeader stream.
   */
  open() {
    const bytes = this.data instanceof Uint8Array ? this.data : new Uint8Array(this.data);
    if (bytes.length >= 30) {
      // HWP 3.0 files are not compound files; they start with a plain-text signature.
      const magic = new TextDecoder("utf-8").decode(bytes.slice(0, 30)).replace(/\0+$/, "");
      if (magic.startsWith("HWP Document File V3")) throw new Error("HWP 3.0 format (HWP 97, 2002, etc.) is not supported. Please use HWP 5.0 or later format. You can convert HWP 3.0 files to HWP 5.0 format using Hancom Office.");
    }
    try {
      // Hand the parser a Buffer when the runtime has one, a plain array otherwise.
      this.cfb = typeof Buffer !== "undefined"
        ? CFB.read(Buffer.from(bytes), { type: "buffer" })
        : CFB.read(Array.from(bytes), { type: "array" });
    } catch (error) {
      throw new Error(`Failed to parse HWP file: ${error.message}`, { cause: error });
    }
    this._fileHeader = this.parseFileHeader();
    this._isCompressed = this._fileHeader.isCompressed;
  }

  /**
   * Close HWP file and release resources
   */
  close() {
    this.cfb = null;
    this._fileHeader = null;
    // Reset the flag too, so a closed instance does not report the previous document's state.
    this._isCompressed = false;
  }

  /**
   * Get file header information (null while the file is not open)
   */
  get fileHeader() {
    return this._fileHeader;
  }

  /**
   * Check if file is compressed (false while the file is not open)
   */
  get isCompressed() {
    return this._isCompressed;
  }

  /**
   * Parse FileHeader stream (256 bytes fixed)
   *
   * @returns Object with `signature`, dotted `version`, `isCompressed`,
   *   `isEncrypted` and `rawProperties`
   * @throws {Error} when the file is not open, the stream is missing, its size
   *   is not 256 bytes, or the signature does not start with "HWP Document File"
   */
  parseFileHeader() {
    if (!this.cfb) throw new Error("HWP file not opened");
    const entry = CFB.find(this.cfb, "FileHeader");
    if (!entry) throw new Error("FileHeader not found in HWP file");
    const data = entry.content;
    if (data.length !== 256) throw new Error(`Invalid FileHeader size: ${data.length} (expected 256)`);
    const signature = new TextDecoder("utf-8").decode(data.slice(0, 32)).replace(/\0+$/, "");
    if (!signature.startsWith("HWP Document File")) throw new Error(`Invalid HWP signature: ${signature}`);
    const view = new DataView(data.buffer, data.byteOffset, data.byteLength);
    // Version is one little-endian dword: major.minor.patch.rev from the high byte down.
    const versionRaw = view.getUint32(32, true);
    const version = [24, 16, 8, 0].map((shift) => versionRaw >>> shift & 255).join(".");
    // Property flags: bit 0 = compressed, bit 1 = encrypted.
    const properties = view.getUint32(36, true);
    return {
      signature,
      version,
      isCompressed: Boolean(properties & 1),
      isEncrypted: Boolean(properties & 2),
      rawProperties: properties
    };
  }

  /**
   * Read a stream, inflating it when the file header marks the file as compressed
   * @param streamPath - Stream path (e.g., 'DocInfo', 'BodyText/Section0')
   * @returns Stream data, or null if the file is not open or the stream doesn't exist
   * @throws {Error} "Failed to decompress <streamPath>: ..." when inflation fails
   */
  readStream(streamPath) {
    if (!this.cfb) return null;
    const entry = CFB.find(this.cfb, streamPath);
    if (!entry) return null;
    if (!this._isCompressed) return entry.content;
    try {
      return decompressRaw(entry.content);
    } catch (error) {
      throw new Error(`Failed to decompress ${streamPath}: ${error.message}`, { cause: error });
    }
  }

  /**
   * List entries of type 2 in the container's file index
   * @returns Array with one item per entry: the entry name split on "/" into
   *   its non-empty segments (an array of strings)
   */
  listStreams() {
    if (!this.cfb) return [];
    const streams = [];
    for (const entry of this.cfb.FileIndex) {
      if (entry.type !== 2) continue;
      const segments = entry.name.split("/").filter((part) => part);
      if (segments.length > 0) streams.push(segments);
    }
    return streams;
  }

  /**
   * Get file information
   * @returns Summary object, or an empty object while the file is not open
   */
  getFileInfo() {
    const header = this._fileHeader;
    if (!header) return {};
    return {
      signature: header.signature,
      version: header.version,
      compressed: this._isCompressed,
      encrypted: header.isEncrypted,
      streams: this.listStreams()
    };
  }

  /**
   * Get number of sections in BodyText
   *
   * Counts consecutive `BodyText/Section<N>` (or bare `Section<N>`) streams
   * starting at N = 0 and stops at the first missing index.
   */
  getSectionCount() {
    if (!this.cfb) return 0;
    const exists = (index) => Boolean(CFB.find(this.cfb, `BodyText/Section${index}`) || CFB.find(this.cfb, `Section${index}`));
    let count = 0;
    while (exists(count)) count++;
    return count;
  }

  /**
   * Read section data
   * @param sectionIndex - Section index (0-based)
   * @returns Section data (inflated when the file is compressed), or null if absent
   */
  readSection(sectionIndex) {
    // Prefer the BodyText storage; fall back to a bare top-level name.
    return this.readStream(`BodyText/Section${sectionIndex}`) || this.readStream(`Section${sectionIndex}`);
  }
};
|
|
204
|
+
|
|
205
|
+
//#endregion
|
|
206
|
+
//#region src/record.ts
|
|
207
|
+
// Record tag identifiers. Every tag is defined as an offset from HWPTAG_BEGIN,
// so the numeric value of each one is shown alongside for quick lookup.
const HWPTAG_BEGIN = 0x10; // 16
const HWPTAG_PARA_HEADER = HWPTAG_BEGIN + 50; // 66
const HWPTAG_PARA_TEXT = HWPTAG_BEGIN + 51; // 67
const HWPTAG_PARA_CHAR_SHAPE = HWPTAG_BEGIN + 52; // 68
const HWPTAG_PARA_LINE_SEG = HWPTAG_BEGIN + 53; // 69
const HWPTAG_PARA_RANGE_TAG = HWPTAG_BEGIN + 54; // 70
const HWPTAG_CTRL_HEADER = HWPTAG_BEGIN + 55; // 71
const HWPTAG_LIST_HEADER = HWPTAG_BEGIN + 56; // 72
const HWPTAG_PAGE_DEF = HWPTAG_BEGIN + 57; // 73
const HWPTAG_FOOTNOTE_SHAPE = HWPTAG_BEGIN + 58; // 74
const HWPTAG_PAGE_BORDER_FILL = HWPTAG_BEGIN + 59; // 75
const HWPTAG_SHAPE_COMPONENT = HWPTAG_BEGIN + 60; // 76
const HWPTAG_TABLE = HWPTAG_BEGIN + 61; // 77
const HWPTAG_SHAPE_COMPONENT_LINE = HWPTAG_BEGIN + 62; // 78
const HWPTAG_CTRL_DATA = HWPTAG_BEGIN + 71; // 87
|
|
222
|
+
/**
 * HWP Record Reader
 * Sequentially decodes binary records from HWP stream data.
 *
 * Each record starts with a little-endian 32-bit header word:
 * bits 0-9 hold the tag id, bits 10-19 the level and bits 20-31 the payload
 * size. A size field of 4095 means the real size follows as an extra
 * little-endian 32-bit value.
 */
var RecordReader = class {
  /** Stream bytes being decoded. */
  data;
  /** Byte offset of the next unread record. */
  offset = 0;

  /**
   * @param data - Stream bytes (Uint8Array or Buffer)
   */
  constructor(data) {
    this.data = data;
  }

  /**
   * Decode the record header starting at `start` without moving the cursor.
   * @returns `{ tagId, level, size, dataOffset }`, or null when the header
   *   itself does not fit in the remaining bytes
   */
  #headerAt(start) {
    const bytes = this.data;
    if (start + 4 > bytes.length) return null;
    const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    const word = view.getUint32(start, true);
    let size = word >>> 20 & 4095;
    let dataOffset = start + 4;
    if (size === 4095) {
      // Extended form: the payload size is stored in the following dword.
      if (dataOffset + 4 > bytes.length) return null;
      size = view.getUint32(dataOffset, true);
      dataOffset += 4;
    }
    return {
      tagId: word & 1023,
      level: word >>> 10 & 1023,
      size,
      dataOffset
    };
  }

  /**
   * Check if there are more records to read
   */
  hasMore() {
    return this.offset < this.data.length;
  }

  /**
   * Read next record
   *
   * When the remaining bytes cannot hold a complete record (truncated header
   * or payload), the reader is marked as exhausted so that loops driven by
   * `hasMore()` terminate instead of retrying the same unreadable bytes forever.
   *
   * @returns Next record `{ tagId, level, data, size }`, or null if no
   *   complete record remains
   */
  readRecord() {
    const header = this.#headerAt(this.offset);
    if (!header || header.dataOffset + header.size > this.data.length) {
      this.offset = this.data.length;
      return null;
    }
    const { tagId, level, size, dataOffset } = header;
    const recordData = this.data.slice(dataOffset, dataOffset + size);
    this.offset = dataOffset + size;
    return {
      tagId,
      level,
      data: recordData,
      size
    };
  }

  /**
   * Peek at next record header without consuming it
   * @returns Header info with tagId, level, size, or null when no complete
   *   header remains. The payload itself is not checked for completeness.
   */
  peekRecordHeader() {
    const header = this.#headerAt(this.offset);
    if (!header) return null;
    const { tagId, level, size } = header;
    return {
      tagId,
      level,
      size
    };
  }

  /**
   * Read records until the data ends or the hierarchy boundary is reached
   * @param parentLevel - When given, stop (without consuming) at the first
   *   record whose level is this value or lower
   * @returns Records read, in stream order
   */
  readAllRecords(parentLevel) {
    const records = [];
    while (this.hasMore()) {
      const header = this.peekRecordHeader();
      if (!header) break;
      if (parentLevel !== void 0 && header.level <= parentLevel) break;
      const record = this.readRecord();
      // A null here means the payload is truncated; nothing more can be read.
      if (!record) break;
      records.push(record);
    }
    return records;
  }

  /**
   * Get current position
   */
  get position() {
    return this.offset;
  }

  /**
   * Get remaining bytes
   */
  get remaining() {
    return this.data.length - this.offset;
  }
};
|
|
314
|
+
|
|
315
|
+
//#endregion
|
|
316
|
+
//#region src/table.ts
|
|
317
|
+
/**
|
|
318
|
+
* HWP Table Parser
|
|
319
|
+
* Parses table records and converts to Markdown
|
|
320
|
+
*/
|
|
321
|
+
/**
 * Decode the fixed leading fields of a TABLE record.
 *
 * Layout read here (little-endian): 32-bit properties at offset 0,
 * 16-bit row count at offset 4, 16-bit column count at offset 6.
 *
 * @param data - TABLE record data
 * @returns `{ properties, rows, cols }`
 * @throws {Error} when the record is shorter than 8 bytes
 */
function parseTableProperties(data) {
  const byteCount = data.length;
  if (byteCount < 8) {
    throw new Error(`Invalid TABLE record size: ${byteCount}`);
  }
  const reader = new DataView(data.buffer, data.byteOffset, data.byteLength);
  const properties = reader.getUint32(0, true);
  const rows = reader.getUint16(4, true);
  const cols = reader.getUint16(6, true);
  return { properties, rows, cols };
}
|
|
335
|
+
/**
 * Work out a cell's position and span from its LIST_HEADER record.
 *
 * When the record holds at least 16 bytes, the little-endian 16-bit values at
 * offsets 8/10/12/14 are used as col/row/colspan/rowspan (a span of 0 is
 * treated as 1). Shorter records fall back to row-major placement by index.
 */
function readCellGeometry(listData, cellIndex, cols) {
  if (listData.length < 16) {
    return {
      row: Math.floor(cellIndex / cols),
      col: cellIndex % cols,
      rowspan: 1,
      colspan: 1
    };
  }
  const view = new DataView(listData.buffer, listData.byteOffset, listData.byteLength);
  return {
    row: view.getUint16(10, true),
    col: view.getUint16(8, true),
    rowspan: view.getUint16(14, true) || 1,
    colspan: view.getUint16(12, true) || 1
  };
}

/**
 * Consume the records that belong to one cell and collect the trimmed,
 * non-empty text of each paragraph. Stops (without consuming) at the next
 * LIST_HEADER or at any record whose level is below 2.
 */
function readCellParagraphs(reader) {
  const parts = [];
  while (reader.hasMore()) {
    const next = reader.peekRecordHeader();
    if (!next || next.tagId === HWPTAG_LIST_HEADER || next.level < 2) break;
    if (next.tagId !== HWPTAG_PARA_HEADER) {
      // Skip unrelated records; a null read means the reader cannot advance.
      if (!reader.readRecord()) break;
      continue;
    }
    const paraRec = reader.readRecord();
    if (!paraRec) break;
    if (paraRec.data.length < 4) continue;
    // The top bit of the first dword is masked off to obtain the character count.
    const nchars = new DataView(paraRec.data.buffer, paraRec.data.byteOffset, paraRec.data.byteLength).getUint32(0, true) & 2147483647;
    if (nchars === 0 || !reader.hasMore()) continue;
    const textHeader = reader.peekRecordHeader();
    if (!textHeader || textHeader.tagId !== HWPTAG_PARA_TEXT) continue;
    const textRec = reader.readRecord();
    if (!textRec) break;
    const text = processControlChars(parseParaText(textRec.data, nchars)).text.trim();
    if (text) parts.push(text);
  }
  return parts;
}

/**
 * Parse table from TABLE record and subsequent records
 *
 * Reads at most rows * cols cells. Each cell starts at a LIST_HEADER record;
 * parsing stops early at the first record whose level is below 2, or as soon
 * as the reader can no longer deliver a record (e.g. a truncated stream), so
 * malformed input cannot cause an endless loop.
 *
 * @param tableRecordData - TABLE record data
 * @param reader - Record reader positioned right after the TABLE record
 * @param lineBreakStyle - "br" joins paragraphs and line breaks with `<br>`;
 *   any other value (default "space") joins them with a single space
 * @returns Parsed table `{ rows, cols, cells }`, each cell being
 *   `{ row, col, rowspan, colspan, text }`
 * @throws {Error} when the TABLE record is too short to hold its properties
 */
function parseTable(tableRecordData, reader, lineBreakStyle = "space") {
  const { rows, cols } = parseTableProperties(tableRecordData);
  const cells = [];
  const separator = lineBreakStyle === "br" ? "<br>" : " ";
  const maxCells = rows * cols;
  while (reader.hasMore() && cells.length < maxCells) {
    const header = reader.peekRecordHeader();
    if (!header || header.level < 2) break;
    if (header.tagId !== HWPTAG_LIST_HEADER) {
      // Skip records preceding the next cell; stop if the reader cannot advance.
      if (!reader.readRecord()) break;
      continue;
    }
    const listRecord = reader.readRecord();
    if (!listRecord) break;
    const { row, col, rowspan, colspan } = readCellGeometry(listRecord.data, cells.length, cols);
    const text = readCellParagraphs(reader).join(separator).replace(/\n/g, separator);
    cells.push({
      row,
      col,
      rowspan,
      colspan,
      text
    });
  }
  return {
    rows,
    cols,
    cells
  };
}
|
|
419
|
+
/**
 * Convert table to Markdown
 *
 * Output shape: an empty header row, a `---` separator row, then one row per
 * table row. Literal `|` characters in cell text are escaped as `\|` so they
 * cannot be mistaken for column delimiters.
 *
 * @param table - Table object `{ rows, cols, cells }`
 * @param mergeStrategy - 'repeat' (default) copies a merged cell's text into
 *   every position it spans; any other value writes it to its top-left
 *   position only
 * @returns Markdown table, or "[Empty Table]" when rows or cols is 0
 */
function tableToMarkdown(table, mergeStrategy = "repeat") {
  const { rows, cols, cells } = table;
  if (rows === 0 || cols === 0) return "[Empty Table]";
  const matrix = Array.from({ length: rows }, () => Array(cols).fill(""));
  const fillSpans = mergeStrategy === "repeat";
  for (const cell of cells) {
    const text = cell.text.replace(/\|/g, "\\|");
    const rowEnd = Math.min(rows, fillSpans ? cell.row + cell.rowspan : cell.row + 1);
    const colEnd = Math.min(cols, fillSpans ? cell.col + cell.colspan : cell.col + 1);
    // Positions outside the declared grid are ignored.
    for (let r = cell.row; r < rowEnd; r++) {
      for (let c = cell.col; c < colEnd; c++) matrix[r][c] = text;
    }
  }
  const formatRow = (values) => `| ${values.join(" | ")} |`;
  return [
    formatRow(Array(cols).fill("")),
    formatRow(Array(cols).fill("---")),
    ...matrix.map(formatRow)
  ].join("\n");
}
|
|
438
|
+
|
|
439
|
+
//#endregion
|
|
440
|
+
//#region src/paragraph.ts
|
|
441
|
+
/**
|
|
442
|
+
* HWP Paragraph Parser
|
|
443
|
+
* Parses paragraph records and extracts text
|
|
444
|
+
*/
|
|
445
|
+
/**
 * Decode the leading fields of a PARA_HEADER record.
 *
 * All multi-byte fields are little-endian. The top bit of the first dword is
 * masked off to obtain the character count.
 *
 * @param data - PARA_HEADER record data
 * @returns `{ textCount, controlMask, paraShapeId, styleId, columnType, charShapeCount }`
 * @throws {Error} when the record is shorter than 22 bytes
 */
function parseParaHeader(data) {
  if (data.length < 22) {
    throw new Error(`Invalid PARA_HEADER size: ${data.length}`);
  }
  const fields = new DataView(data.buffer, data.byteOffset, data.byteLength);
  const u8 = (at) => fields.getUint8(at);
  const u16 = (at) => fields.getUint16(at, true);
  const u32 = (at) => fields.getUint32(at, true);
  const textCount = u32(0) & 0x7fffffff;
  const controlMask = u32(4);
  const paraShapeId = u16(8);
  const styleId = u8(10);
  const columnType = u8(11);
  const charShapeCount = u16(12);
  return { textCount, controlMask, paraShapeId, styleId, columnType, charShapeCount };
}
|
|
462
|
+
/**
 * Decode the UTF-16LE text of a PARA_TEXT record.
 *
 * Code units below 32 are control codes. Per the HWP 5.0 format, the
 * "char" controls (0, 10, 13 and 24-31) occupy a single WCHAR, while every
 * other control code is an inline/extended control that occupies 8 WCHARs
 * (16 bytes): the code, 12 bytes of control data, and the code repeated.
 * Such controls may appear anywhere in the paragraph, not only at the start.
 *
 * For each 16-byte control only the leading control code is kept in the
 * result (so callers can still detect it); its payload is skipped instead of
 * being decoded as if it were text. A trailing odd byte is ignored.
 *
 * @param data - PARA_TEXT record data
 * @param _nchars - Number of WCHAR characters in the paragraph; currently
 *   unused, the whole record is decoded
 * @returns Decoded text, still containing control codes below 32
 */
function parseParaText(data, _nchars) {
  const SINGLE_WCHAR_CONTROLS = new Set([
    0,
    10,
    13,
    24,
    25,
    26,
    27,
    28,
    29,
    30,
    31
  ]);
  const WIDE_CONTROL_BYTES = 16;
  const view = new DataView(data.buffer, data.byteOffset, data.byteLength);
  // Kept code units are copied here and decoded in one go at the end.
  const kept = new Uint8Array(data.length);
  let keptLength = 0;
  let offset = 0;
  while (offset + 2 <= data.length) {
    const code = view.getUint16(offset, true);
    kept[keptLength] = data[offset];
    kept[keptLength + 1] = data[offset + 1];
    keptLength += 2;
    const isWideControl = code < 32 && !SINGLE_WCHAR_CONTROLS.has(code);
    offset += isWideControl ? WIDE_CONTROL_BYTES : 2;
  }
  if (keptLength === 0) return "";
  return new TextDecoder("utf-16le").decode(kept.subarray(0, keptLength));
}
|
|
502
|
+
/**
 * Translate HWP control characters (code points below 32) in decoded text.
 *
 * - 10 (line break) and 13 (paragraph break) become "\n"
 * - 9 (tab) becomes "\t"
 * - 24 (hyphen) becomes "-"
 * - 30 and 31 (non-breaking / fixed-width space) become " "
 * - 11 is removed and reported through `hasTable`
 * - every other control code is removed
 *
 * Mapping the spacing and hyphen controls to visible characters keeps
 * neighbouring words from being glued together.
 *
 * @param text - Raw text with control characters
 * @returns `{ text, hasTable }`: processed text and whether code 11 was seen
 */
function processControlChars(text) {
  const replacements = new Map([
    [9, "\t"],
    [10, "\n"],
    [13, "\n"],
    [24, "-"],
    [30, " "],
    [31, " "]
  ]);
  const kept = [];
  let hasTable = false;
  for (const char of text) {
    const code = char.codePointAt(0) ?? 0;
    if (code >= 32) {
      kept.push(char);
      continue;
    }
    if (code === 11) {
      hasTable = true;
      continue;
    }
    const replacement = replacements.get(code);
    if (replacement !== undefined) kept.push(replacement);
  }
  return {
    text: kept.join(""),
    hasTable
  };
}
|
|
523
|
+
/**
 * Paragraph Parser
 * Turns an HWP record stream into paragraphs, rendering any tables attached
 * to a paragraph as Markdown.
 */
var ParagraphParser = class {
  /**
   * @param reader - Record reader (needs hasMore, readRecord, peekRecordHeader)
   * @param options - Conversion options; `tableLineBreakStyle` defaults to
   *   "space". The caller's object is copied, never modified.
   */
  constructor(reader, options = {}) {
    this.reader = reader;
    this.options = {
      ...options,
      tableLineBreakStyle: options.tableLineBreakStyle || "space"
    };
  }

  /**
   * Consume records up to and including the next PARA_HEADER.
   * @returns The PARA_HEADER record, or null when none can be read
   */
  #nextParaHeader() {
    while (this.reader.hasMore()) {
      const record = this.reader.readRecord();
      // A null read with data remaining means the stream is truncated; stop
      // instead of retrying the same bytes forever.
      if (!record) return null;
      if (record.tagId === HWPTAG_PARA_HEADER) return record;
    }
    return null;
  }

  /**
   * Read the PARA_TEXT record that directly follows the header, if present.
   * @returns Processed text, or "" when there is no readable PARA_TEXT record
   */
  #readText(textCount) {
    const next = this.reader.peekRecordHeader();
    if (!next || next.tagId !== HWPTAG_PARA_TEXT) return "";
    const record = this.reader.readRecord();
    return record ? processControlChars(parseParaText(record.data, textCount)).text : "";
  }

  /**
   * Consume the paragraph's remaining child records, converting each
   * CTRL_HEADER + TABLE pair into a Markdown table. Stops (without consuming)
   * at the next PARA_HEADER or at any level-0 record.
   */
  #readTables() {
    const tables = [];
    while (this.reader.hasMore()) {
      const next = this.reader.peekRecordHeader();
      if (!next || next.tagId === HWPTAG_PARA_HEADER || next.level === 0) break;
      // Consume the record; stop if the reader cannot advance.
      if (!this.reader.readRecord()) break;
      if (next.tagId !== HWPTAG_CTRL_HEADER) continue;
      const tableHeader = this.reader.peekRecordHeader();
      if (!tableHeader || tableHeader.tagId !== HWPTAG_TABLE) continue;
      const tableRecord = this.reader.readRecord();
      if (!tableRecord) break;
      try {
        tables.push(tableToMarkdown(parseTable(tableRecord.data, this.reader, this.options.tableLineBreakStyle)));
      } catch (error) {
        console.warn("Warning: Failed to parse table:", error);
        tables.push("[TABLE - Parse Error]");
      }
    }
    return tables;
  }

  /**
   * Parse next paragraph
   * @returns Parsed paragraph `{ text, header }`, or null if no more paragraphs
   * @throws {Error} when the PARA_HEADER record is too short to decode
   */
  parseParagraph() {
    const headerRecord = this.#nextParaHeader();
    if (!headerRecord) return null;
    const header = parseParaHeader(headerRecord.data);
    let text = header.textCount > 0 ? this.#readText(header.textCount) : "";
    const tables = this.#readTables();
    if (tables.length > 0) {
      const tablesMd = tables.join("\n\n");
      text = text.trimEnd();
      // A very short text made only of non-ASCII characters or whitespace is
      // replaced by the tables rather than kept in front of them.
      const isPlaceholder = text.length > 0 && text.length < 5 && [...text].every((c) => (c.codePointAt(0) ?? 0) > 127 || /\s/.test(c));
      if (isPlaceholder || !text) text = tablesMd;
      else text = `${text}\n\n${tablesMd}`;
    }
    return {
      text,
      header
    };
  }

  /**
   * Parse all paragraphs in section
   * @returns All paragraphs, in stream order
   */
  parseAllParagraphs() {
    const paragraphs = [];
    while (this.reader.hasMore()) {
      const para = this.parseParagraph();
      if (!para) break;
      paragraphs.push(para);
    }
    return paragraphs;
  }
};
|
|
605
|
+
|
|
606
|
+
//#endregion
|
|
607
|
+
//#region src/converter.ts
|
|
608
|
+
/**
 * Join paragraph texts into a Markdown document.
 *
 * Each paragraph's text is trimmed. Empty paragraphs are skipped, and so are
 * paragraphs shorter than 5 UTF-16 units that consist solely of characters
 * above code point 127 and do not start with "|". The remaining texts are
 * separated by blank lines.
 *
 * @param paragraphs - List of paragraphs (objects with a `text` string)
 * @returns Markdown text
 */
function paragraphsToMarkdown(paragraphs) {
  const isShortNonAscii = (candidate) => {
    if (candidate.length >= 5 || candidate.startsWith("|")) return false;
    return [...candidate].every((ch) => (ch.codePointAt(0) ?? 0) > 127);
  };
  return paragraphs
    .map((paragraph) => paragraph.text.trim())
    .filter((trimmed) => trimmed !== "" && !isShortNonAscii(trimmed))
    .join("\n\n");
}
|
|
623
|
+
/**
 * Convert HWP file to Markdown
 *
 * Parses every section reported by `hwp.getSectionCount()` in order, skipping
 * sections whose data cannot be read, and renders all paragraphs as one
 * Markdown document.
 *
 * @param hwp - Opened HWP file
 * @param options - Conversion options; `tableLineBreakStyle` defaults to
 *   "space". The caller's object is copied, never modified.
 * @returns Markdown content
 */
function convertHwpToMarkdown(hwp, options = {}) {
  const parserOptions = {
    ...options,
    tableLineBreakStyle: options.tableLineBreakStyle || "space"
  };
  const allParagraphs = [];
  const sectionCount = hwp.getSectionCount();
  for (let index = 0; index < sectionCount; index++) {
    const sectionData = hwp.readSection(index);
    if (!sectionData) continue;
    const parser = new ParagraphParser(new RecordReader(sectionData), parserOptions);
    // Append one by one: spreading a very large array into push() can exceed
    // the engine's argument limit.
    for (const paragraph of parser.parseAllParagraphs()) allParagraphs.push(paragraph);
  }
  return paragraphsToMarkdown(allParagraphs);
}
|
|
641
|
+
|
|
642
|
+
//#endregion
|
|
643
|
+
//#region src/cli/index.ts
|
|
644
|
+
/**
|
|
645
|
+
* HWP2MD CLI
|
|
646
|
+
* Command-line interface for HWP to Markdown conversion
|
|
647
|
+
*/
|
|
648
|
+
const program = new Command();
// Reported by `hwp2md --version`; keep in sync with the "version" field of package.json.
program.name("hwp2md").description("HWP to Markdown converter").version("1.0.0");

// `hwp2md info <file>`: print the FileHeader fields and the section count.
program.command("info <file>").description("Display HWP file information").action(async (file) => {
  try {
    const hwp = new HWPFile(new Uint8Array(await readFile(file)));
    hwp.open();
    try {
      const header = hwp.fileHeader;
      console.log(`File: ${file}`);
      console.log(`Signature: ${header?.signature}`);
      console.log(`Version: ${header?.version}`);
      console.log(`Compressed: ${hwp.isCompressed}`);
      console.log(`Encrypted: ${header?.isEncrypted}`);
      console.log(`Sections: ${hwp.getSectionCount()}`);
    } finally {
      hwp.close();
    }
  } catch (error) {
    console.error(`Error: ${error.message}`);
    process.exit(1);
  }
});

// `hwp2md convert <input> [output]`: write the Markdown rendering of <input>.
program.command("convert <input> [output]").description("Convert HWP file to Markdown").option("--table-line-breaks <style>", "Line break style in table cells: space or br", "space").action(async (input, output, options) => {
  // Default output: swap a trailing ".hwp" for ".md". When the input has no
  // such suffix, append ".md" so the source file is never overwritten.
  const hwpSuffix = /\.hwp$/i;
  const outputPath = output ?? (hwpSuffix.test(input) ? input.replace(hwpSuffix, ".md") : `${input}.md`);
  try {
    console.log(`Converting ${input} to ${outputPath}...`);
    const hwp = new HWPFile(new Uint8Array(await readFile(input)));
    let markdown;
    try {
      hwp.open();
      markdown = convertHwpToMarkdown(hwp, { tableLineBreakStyle: options.tableLineBreaks });
    } finally {
      hwp.close();
    }
    await writeFile(outputPath, markdown, "utf-8");
    console.log(`Conversion completed: ${outputPath}`);
  } catch (error) {
    console.error(`Error: ${error.message}`);
    process.exit(1);
  }
});

// The action handlers are async, so use parseAsync() to await them.
await program.parseAsync();
|
|
684
|
+
|
|
685
|
+
//#endregion
|
|
686
|
+
export { };
|
|
687
|
+
//# sourceMappingURL=index.mjs.map
|