hwp2md 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs ADDED
@@ -0,0 +1,1126 @@
1
+ import * as CFB from "cfb";
2
+ import { inflateRaw } from "pako";
3
+ import { unzipSync } from "fflate";
4
+ import { XMLParser } from "fast-xml-parser";
5
+
6
+ //#region src/utils/compression.ts
7
/**
 * Inflate a raw DEFLATE stream (no zlib header or trailer).
 * Equivalent to Python's `zlib.decompress(data, -15)`.
 *
 * @param data - Compressed bytes
 * @returns Decompressed bytes
 * @throws {Error} "Failed to decompress data: …" when inflation fails; the
 *   original failure is attached as `cause`
 */
function decompressRaw(data) {
  try {
    return inflateRaw(data);
  } catch (error) {
    // The inflater may throw a plain string instead of an Error, in which
    // case `.message` would be undefined; normalise before formatting.
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to decompress data: ${reason}`, { cause: error });
  }
}
24
+
25
+ //#endregion
26
+ //#region src/parser.ts
27
+ /**
28
+ * HWP 5.0 File Parser
29
+ * Parses OLE Compound File format HWP files
30
+ */
31
/**
 * HWP 5.0 file reader.
 *
 * Wraps an OLE Compound File container (parsed with the `cfb` package) and
 * exposes the FileHeader information plus (optionally decompressed) streams.
 * Call `open()` before using any of the read methods.
 */
var HWPFile = class HWPFile {
	cfb = null;
	_fileHeader = null;
	_isCompressed = false;
	/**
	 * Create HWPFile from raw data
	 * @param data - Raw HWP file data
	 */
	constructor(data) {
		this.data = data;
	}
	/**
	 * Create HWPFile from file path (Node.js only)
	 * @param path - Path to HWP file
	 */
	static async fromFile(path) {
		const { readFile } = await import("node:fs/promises");
		return new HWPFile(new Uint8Array(await readFile(path)));
	}
	/**
	 * Create HWPFile from ArrayBuffer
	 * @param data - ArrayBuffer data
	 */
	static fromArrayBuffer(data) {
		return new HWPFile(new Uint8Array(data));
	}
	/**
	 * Create HWPFile from Uint8Array
	 * @param data - Uint8Array data
	 */
	static fromUint8Array(data) {
		return new HWPFile(data);
	}
	/**
	 * Open and parse the HWP container, then read the FileHeader stream.
	 * @throws {Error} for HWP 3.0 input, for data that is not a readable
	 *   compound file, or when the FileHeader stream is missing/invalid
	 */
	open() {
		const bytes = this.data instanceof Uint8Array ? this.data : new Uint8Array(this.data);
		if (bytes.length >= 30) {
			// HWP 3.0 files are not compound files; they begin with a plain-text signature.
			const magic = new TextDecoder("utf-8").decode(bytes.slice(0, 30)).replace(/\0+$/, "");
			if (magic.startsWith("HWP Document File V3")) throw new Error("HWP 3.0 format (HWP 97, 2002, etc.) is not supported. Please use HWP 5.0 or later format. You can convert HWP 3.0 files to HWP 5.0 format using Hancom Office.");
		}
		try {
			// Use a Buffer when the runtime has one, otherwise fall back to a plain array.
			if (typeof Buffer !== "undefined") {
				this.cfb = CFB.read(Buffer.from(bytes), { type: "buffer" });
			} else {
				this.cfb = CFB.read(Array.from(bytes), { type: "array" });
			}
		} catch (error) {
			const reason = error instanceof Error ? error.message : String(error);
			throw new Error(`Failed to parse HWP file: ${reason}`, { cause: error });
		}
		this._fileHeader = this.parseFileHeader();
		this._isCompressed = this._fileHeader.isCompressed;
	}
	/**
	 * Close HWP file and release resources.
	 * All state derived from the file is reset, including the compression flag.
	 */
	close() {
		this.cfb = null;
		this._fileHeader = null;
		this._isCompressed = false;
	}
	/**
	 * Get file header information (null until `open()` succeeds)
	 */
	get fileHeader() {
		return this._fileHeader;
	}
	/**
	 * Check if file is compressed
	 */
	get isCompressed() {
		return this._isCompressed;
	}
	/**
	 * Parse FileHeader stream (256 bytes fixed).
	 *
	 * Layout read here: bytes 0-31 signature, uint32 LE version at offset 32
	 * (one byte each for major/minor/patch/revision), uint32 LE property flags
	 * at offset 36 (bit 0 = compressed, bit 1 = encrypted).
	 *
	 * @returns signature, dotted version string, flags and raw property word
	 * @throws {Error} when the file is not opened or the header is invalid
	 */
	parseFileHeader() {
		if (!this.cfb) throw new Error("HWP file not opened");
		const entry = CFB.find(this.cfb, "FileHeader");
		if (!entry) throw new Error("FileHeader not found in HWP file");
		const data = entry.content;
		if (data.length !== 256) throw new Error(`Invalid FileHeader size: ${data.length} (expected 256)`);
		const signature = new TextDecoder("utf-8").decode(data.slice(0, 32)).replace(/\0+$/, "");
		if (!signature.startsWith("HWP Document File")) throw new Error(`Invalid HWP signature: ${signature}`);
		const view = new DataView(data.buffer, data.byteOffset, data.byteLength);
		const versionRaw = view.getUint32(32, true);
		const major = versionRaw >> 24 & 255;
		const minor = versionRaw >> 16 & 255;
		const patch = versionRaw >> 8 & 255;
		const rev = versionRaw & 255;
		const properties = view.getUint32(36, true);
		return {
			signature,
			version: `${major}.${minor}.${patch}.${rev}`,
			isCompressed: Boolean(properties & 1),
			isEncrypted: Boolean(properties & 2),
			rawProperties: properties
		};
	}
	/**
	 * Read a stream, decompressing it when the file is flagged as compressed.
	 * @param streamPath - Stream path (e.g., 'DocInfo', 'BodyText/Section0')
	 * @returns Stream data, or null if the file is not open or the stream doesn't exist
	 * @throws {Error} when decompression fails
	 */
	readStream(streamPath) {
		if (!this.cfb) return null;
		const entry = CFB.find(this.cfb, streamPath);
		if (!entry) return null;
		if (!this._isCompressed) return entry.content;
		try {
			return decompressRaw(entry.content);
		} catch (error) {
			const reason = error instanceof Error ? error.message : String(error);
			throw new Error(`Failed to decompress ${streamPath}: ${reason}`, { cause: error });
		}
	}
	/**
	 * List all streams in HWP file
	 * @returns One array of non-empty name segments per stream entry (type 2)
	 */
	listStreams() {
		if (!this.cfb) return [];
		const streams = [];
		for (const entry of this.cfb.FileIndex) {
			if (entry.type !== 2) continue;
			const path = entry.name.split("/").filter((p) => p);
			if (path.length > 0) streams.push(path);
		}
		return streams;
	}
	/**
	 * Get file information
	 * @returns Summary object, or `{}` when no header has been parsed
	 */
	getFileInfo() {
		if (!this._fileHeader) return {};
		return {
			signature: this._fileHeader.signature,
			version: this._fileHeader.version,
			compressed: this._isCompressed,
			encrypted: this._fileHeader.isEncrypted,
			streams: this.listStreams()
		};
	}
	/**
	 * Get number of sections in BodyText.
	 * Counts consecutive `BodyText/Section{n}` (or bare `Section{n}`) entries from 0.
	 */
	getSectionCount() {
		if (!this.cfb) return 0;
		let count = 0;
		while (CFB.find(this.cfb, `BodyText/Section${count}`) || CFB.find(this.cfb, `Section${count}`)) count++;
		return count;
	}
	/**
	 * Read section data
	 * @param sectionIndex - Section index (0-based)
	 * @returns Section data (decompressed when applicable), or null when absent
	 */
	readSection(sectionIndex) {
		let data = this.readStream(`BodyText/Section${sectionIndex}`);
		if (!data) data = this.readStream(`Section${sectionIndex}`);
		return data;
	}
};
201
+
202
+ //#endregion
203
+ //#region src/record.ts
204
// Record tag identifiers. Every tag is expressed as an offset from
// HWPTAG_BEGIN so the numbering scheme is visible in one place.
const HWPTAG_BEGIN = 16;

// Tags 16-26 (offsets 0-10)
const HWPTAG_DOCUMENT_PROPERTIES = HWPTAG_BEGIN + 0;
const HWPTAG_ID_MAPPINGS = HWPTAG_BEGIN + 1;
const HWPTAG_BIN_DATA = HWPTAG_BEGIN + 2;
const HWPTAG_FACE_NAME = HWPTAG_BEGIN + 3;
const HWPTAG_BORDER_FILL = HWPTAG_BEGIN + 4;
const HWPTAG_CHAR_SHAPE = HWPTAG_BEGIN + 5;
const HWPTAG_TAB_DEF = HWPTAG_BEGIN + 6;
const HWPTAG_NUMBERING = HWPTAG_BEGIN + 7;
const HWPTAG_BULLET = HWPTAG_BEGIN + 8;
const HWPTAG_PARA_SHAPE = HWPTAG_BEGIN + 9;
const HWPTAG_STYLE = HWPTAG_BEGIN + 10;

// Tags 66-87 (offsets 50-71)
const HWPTAG_PARA_HEADER = HWPTAG_BEGIN + 50;
const HWPTAG_PARA_TEXT = HWPTAG_BEGIN + 51;
const HWPTAG_PARA_CHAR_SHAPE = HWPTAG_BEGIN + 52;
const HWPTAG_PARA_LINE_SEG = HWPTAG_BEGIN + 53;
const HWPTAG_PARA_RANGE_TAG = HWPTAG_BEGIN + 54;
const HWPTAG_CTRL_HEADER = HWPTAG_BEGIN + 55;
const HWPTAG_LIST_HEADER = HWPTAG_BEGIN + 56;
const HWPTAG_PAGE_DEF = HWPTAG_BEGIN + 57;
const HWPTAG_FOOTNOTE_SHAPE = HWPTAG_BEGIN + 58;
const HWPTAG_PAGE_BORDER_FILL = HWPTAG_BEGIN + 59;
const HWPTAG_SHAPE_COMPONENT = HWPTAG_BEGIN + 60;
const HWPTAG_TABLE = HWPTAG_BEGIN + 61;
const HWPTAG_SHAPE_COMPONENT_LINE = HWPTAG_BEGIN + 62;
const HWPTAG_CTRL_DATA = HWPTAG_BEGIN + 71;
230
/**
 * HWP Record Reader
 * Reads binary records from HWP stream data.
 *
 * Each record starts with a 32-bit little-endian header word:
 * bits 0-9 tag ID, bits 10-19 level, bits 20-31 size. A size field of 4095
 * means the real size follows as a separate 32-bit little-endian value.
 */
var RecordReader = class {
	data;
	offset = 0;
	constructor(data) {
		this.data = data;
	}
	/**
	 * Check if there are more bytes to read
	 */
	hasMore() {
		return this.offset < this.data.length;
	}
	/**
	 * Decode the header at the current offset without moving it.
	 * @returns tagId, level, size and the payload start offset, or null when
	 *   the header itself is cut short
	 */
	#decodeHeader() {
		const { data, offset } = this;
		if (offset + 4 > data.length) return null;
		const view = new DataView(data.buffer, data.byteOffset, data.byteLength);
		const word = view.getUint32(offset, true);
		let size = word >> 20 & 4095;
		let dataOffset = offset + 4;
		if (size === 4095) {
			// Extended form: the actual size is stored in the next 4 bytes.
			if (dataOffset + 4 > data.length) return null;
			size = view.getUint32(dataOffset, true);
			dataOffset += 4;
		}
		return {
			tagId: word & 1023,
			level: word >> 10 & 1023,
			size,
			dataOffset
		};
	}
	/**
	 * Read next record
	 * @returns Next record, or null if no complete record remains. When the
	 *   remaining bytes are truncated the reader moves to the end of the data,
	 *   so `hasMore()` turns false and read loops cannot spin on the same bytes.
	 */
	readRecord() {
		const header = this.#decodeHeader();
		if (!header || header.dataOffset + header.size > this.data.length) {
			this.offset = this.data.length;
			return null;
		}
		const { tagId, level, size, dataOffset } = header;
		const end = dataOffset + size;
		const recordData = this.data.slice(dataOffset, end);
		this.offset = end;
		return {
			tagId,
			level,
			data: recordData,
			size
		};
	}
	/**
	 * Peek at next record header without consuming it
	 * @returns Header info with tagId, level, size (payload is not validated),
	 *   or null when the header is cut short
	 */
	peekRecordHeader() {
		const header = this.#decodeHeader();
		if (!header) return null;
		const { tagId, level, size } = header;
		return {
			tagId,
			level,
			size
		};
	}
	/**
	 * Read all records (considering hierarchy)
	 * @param parentLevel - Stop when reaching this level or below
	 * @returns Records read until the stop condition or the end of data
	 */
	readAllRecords(parentLevel) {
		const records = [];
		while (this.hasMore()) {
			const header = this.peekRecordHeader();
			if (!header) break;
			if (parentLevel !== void 0 && header.level <= parentLevel) break;
			const record = this.readRecord();
			if (!record) break;
			records.push(record);
		}
		return records;
	}
	/**
	 * Get current position
	 */
	get position() {
		return this.offset;
	}
	/**
	 * Get remaining bytes
	 */
	get remaining() {
		return this.data.length - this.offset;
	}
};
322
+
323
+ //#endregion
324
+ //#region src/table.ts
325
+ /**
326
+ * HWP Table Parser
327
+ * Parses table records and converts to Markdown
328
+ */
329
/**
 * Decode the fixed leading fields of a TABLE record.
 * Reads a uint32 property word at offset 0 and two uint16 counts at
 * offsets 4 and 6, all little-endian.
 * @param data - TABLE record data
 * @returns Object with `properties`, `rows` and `cols`
 * @throws {Error} when the record is shorter than 8 bytes
 */
function parseTableProperties(data) {
	const available = data.length;
	if (available < 8) {
		throw new Error(`Invalid TABLE record size: ${available}`);
	}
	const fields = new DataView(data.buffer, data.byteOffset, data.byteLength);
	const properties = fields.getUint32(0, true);
	const rows = fields.getUint16(4, true);
	const cols = fields.getUint16(6, true);
	return { properties, rows, cols };
}
343
/**
 * Decode cell address and span from cell property data.
 * Four little-endian uint16 values are read in order: col, row, colspan, rowspan.
 * @param data - Cell property data
 * @returns Cell properties; when fewer than 8 bytes are supplied a default
 *   cell at (0, 0) spanning 1x1 is returned
 */
function parseCellProperties(data) {
	if (data.length >= 8) {
		const fields = new DataView(data.buffer, data.byteOffset, data.byteLength);
		const [col, row, colspan, rowspan] = [0, 2, 4, 6].map((at) => fields.getUint16(at, true));
		return { col, row, colspan, rowspan };
	}
	return { col: 0, row: 0, colspan: 1, rowspan: 1 };
}
363
/**
 * Collect the trimmed, non-empty paragraph texts that belong to one table cell.
 * Consumes records until the next LIST_HEADER, a record whose level is below 2,
 * the end of data, or a record that cannot be read.
 * @param reader - Record reader positioned right after the cell's LIST_HEADER
 * @returns Text of each non-empty paragraph, in order
 */
function readTableCellTextParts(reader) {
	const parts = [];
	while (reader.hasMore()) {
		const next = reader.peekRecordHeader();
		if (!next || next.tagId === HWPTAG_LIST_HEADER || next.level < 2) break;
		const record = reader.readRecord();
		// A null record means the data is truncated; stop rather than retry the same bytes.
		if (!record) break;
		if (record.tagId !== HWPTAG_PARA_HEADER || record.data.length < 4) continue;
		// Character count: low 31 bits of the first uint32 of PARA_HEADER.
		const nchars = new DataView(record.data.buffer, record.data.byteOffset, record.data.byteLength).getUint32(0, true) & 2147483647;
		if (nchars === 0 || !reader.hasMore()) continue;
		const following = reader.peekRecordHeader();
		if (!following || following.tagId !== HWPTAG_PARA_TEXT) continue;
		const textRecord = reader.readRecord();
		if (!textRecord) break;
		const text = processControlChars(parseParaText(textRecord.data, nchars)).text.trim();
		if (text) parts.push(text);
	}
	return parts;
}
/**
 * Parse table from TABLE record and subsequent records.
 *
 * Each cell starts at a LIST_HEADER record. When that record has at least
 * 16 bytes, col/row/colspan/rowspan are read as uint16 LE at offsets
 * 8/10/12/14 (a span of 0 is treated as 1); otherwise the position is derived
 * from the running cell index. Reading stops after rows*cols cells, at a
 * record whose level is below 2, or at unreadable (truncated) data.
 *
 * @param tableRecordData - TABLE record data
 * @param reader - Record reader for reading cell data
 * @param lineBreakStyle - "br" joins cell paragraphs/line breaks with `<br>`,
 *   anything else joins them with a space
 * @returns Parsed table with `rows`, `cols` and `cells`
 */
function parseTable(tableRecordData, reader, lineBreakStyle = "space") {
	const { rows, cols } = parseTableProperties(tableRecordData);
	const cellLimit = rows * cols;
	const separator = lineBreakStyle === "br" ? "<br>" : " ";
	const cells = [];
	while (reader.hasMore() && cells.length < cellLimit) {
		const header = reader.peekRecordHeader();
		if (!header || header.level < 2) break;
		const listRecord = reader.readRecord();
		// Previously a failed skip was ignored, which looped forever on truncated input.
		if (!listRecord) break;
		if (listRecord.tagId !== HWPTAG_LIST_HEADER) continue;
		let row, col, colspan, rowspan;
		if (listRecord.data.length >= 16) {
			const view = new DataView(listRecord.data.buffer, listRecord.data.byteOffset, listRecord.data.byteLength);
			col = view.getUint16(8, true);
			row = view.getUint16(10, true);
			colspan = view.getUint16(12, true) || 1;
			rowspan = view.getUint16(14, true) || 1;
		} else {
			const cellIndex = cells.length;
			row = Math.floor(cellIndex / cols);
			col = cellIndex % cols;
			colspan = 1;
			rowspan = 1;
		}
		const text = readTableCellTextParts(reader).join(separator).replace(/\n/g, separator);
		cells.push({
			row,
			col,
			rowspan,
			colspan,
			text
		});
	}
	return {
		rows,
		cols,
		cells
	};
}
447
/**
 * Convert table to Markdown.
 *
 * The output has an empty header row, a `---` separator row, then one line
 * per table row. Literal `|` characters in cell text are escaped as `\|` so
 * they cannot be mistaken for column separators.
 *
 * @param table - Table object with `rows`, `cols` and `cells`
 * @param mergeStrategy - 'repeat' (default) copies a merged cell's text into
 *   every position it spans; any other value fills only the cell's own
 *   row/col position
 * @returns Markdown table, or "[Empty Table]" when rows or cols is 0
 */
function tableToMarkdown(table, mergeStrategy = "repeat") {
	const { rows, cols, cells } = table;
	if (rows === 0 || cols === 0) return "[Empty Table]";
	const escapeCell = (text) => text == null ? "" : String(text).replace(/\|/g, "\\|");
	const matrix = Array.from({ length: rows }, () => Array(cols).fill(""));
	for (const cell of cells) {
		const text = escapeCell(cell.text);
		if (mergeStrategy === "repeat") {
			// Clamp the span to the declared table size; out-of-range positions are ignored.
			const lastRow = Math.min(cell.row + cell.rowspan, rows);
			const lastCol = Math.min(cell.col + cell.colspan, cols);
			for (let r = cell.row; r < lastRow; r++) for (let c = cell.col; c < lastCol; c++) matrix[r][c] = text;
		} else if (cell.row < rows && cell.col < cols) matrix[cell.row][cell.col] = text;
	}
	const toLine = (values) => "| " + values.join(" | ") + " |";
	const lines = [toLine(Array(cols).fill("")), toLine(Array(cols).fill("---"))];
	for (const row of matrix) lines.push(toLine(row));
	return lines.join("\n");
}
466
+
467
+ //#endregion
468
+ //#region src/paragraph.ts
469
+ /**
470
+ * HWP Paragraph Parser
471
+ * Parses paragraph records and extracts text
472
+ */
473
/**
 * Decode the leading fields of a PARA_HEADER record (little-endian).
 *
 * Offsets read: 0 uint32 text count (top bit masked off), 4 uint32 control
 * mask, 8 uint16 paragraph shape ID, 10 uint8 style ID, 11 uint8 column type,
 * 12 uint16 character shape count.
 *
 * @param data - PARA_HEADER record data
 * @returns Paragraph header information
 * @throws {Error} when the record is shorter than 22 bytes
 */
function parseParaHeader(data) {
	const { length } = data;
	if (length < 22) {
		throw new Error(`Invalid PARA_HEADER size: ${length}`);
	}
	const fields = new DataView(data.buffer, data.byteOffset, data.byteLength);
	const textCount = fields.getUint32(0, true) & 0x7fffffff;
	const controlMask = fields.getUint32(4, true);
	const paraShapeId = fields.getUint16(8, true);
	const styleId = fields.getUint8(10);
	const columnType = fields.getUint8(11);
	const charShapeCount = fields.getUint16(12, true);
	return { textCount, controlMask, paraShapeId, styleId, columnType, charShapeCount };
}
490
/**
 * Decode the text of a PARA_TEXT record.
 *
 * The record is a sequence of UTF-16LE code units. Per the HWP 5.0 format
 * specification, code units below 32 are control characters:
 * - 0, 10, 13 and 24-31 are single-unit ("char") controls and are kept in the
 *   returned string for the caller to interpret;
 * - every other value below 32 starts a 16-byte block (control code, payload,
 *   repeated control code) that may appear anywhere in the paragraph. These
 *   blocks carry no text and are removed entirely, wherever they occur.
 *
 * @param data - PARA_TEXT record data
 * @param _nchars - Number of WCHAR characters (unused; the record length is authoritative)
 * @returns Decoded text with 16-byte control blocks stripped, or "" when
 *   nothing remains or decoding fails
 */
function parseParaText(data, _nchars) {
	const CHAR_CONTROLS = new Set([0, 10, 13, 24, 25, 26, 27, 28, 29, 30, 31]);
	const CONTROL_BLOCK_BYTES = 16;
	try {
		const view = new DataView(data.buffer, data.byteOffset, data.byteLength);
		const decoder = new TextDecoder("utf-16le");
		const pieces = [];
		let runStart = 0;
		let offset = 0;
		while (offset + 2 <= data.length) {
			const code = view.getUint16(offset, true);
			if (code < 32 && !CHAR_CONTROLS.has(code)) {
				// Flush the text collected so far, then jump over the control block.
				if (offset > runStart) pieces.push(decoder.decode(data.subarray(runStart, offset)));
				offset += CONTROL_BLOCK_BYTES;
				runStart = offset;
			} else offset += 2;
		}
		// A truncated trailing control block leaves runStart past the end: nothing to add.
		if (runStart < data.length) pieces.push(decoder.decode(data.subarray(runStart)));
		return pieces.join("");
	} catch (error) {
		return "";
	}
}
530
/**
 * Normalise control characters in decoded paragraph text.
 *
 * Code points below 32 are handled as follows: 10 and 13 become "\n",
 * 11 sets the `hasTable` flag and emits nothing, all others are dropped.
 * Every other code point is copied through unchanged.
 *
 * @param text - Raw text with control characters
 * @returns Object with the processed `text` and the `hasTable` flag
 */
function processControlChars(text) {
	let hasTable = false;
	let cleaned = "";
	for (const symbol of text) {
		const codePoint = symbol.codePointAt(0) ?? 0;
		if (codePoint >= 32) {
			cleaned += symbol;
			continue;
		}
		switch (codePoint) {
			case 10:
			case 13:
				cleaned += "\n";
				break;
			case 11:
				hasTable = true;
				break;
			default:
				break;
		}
	}
	return {
		text: cleaned,
		hasTable
	};
}
551
/**
 * Paragraph Parser
 * Parses paragraphs from HWP record stream.
 *
 * A paragraph starts at a PARA_HEADER record, optionally followed by a
 * PARA_TEXT record. Records after that are consumed until the next
 * PARA_HEADER or a level-0 record; a CTRL_HEADER immediately followed by a
 * TABLE record is rendered as a Markdown table and attached to the paragraph.
 */
var ParagraphParser = class {
	/**
	 * @param reader - Record reader positioned inside a section stream
	 * @param options - Conversion options; `tableLineBreakStyle` defaults to
	 *   "space". The object passed in is copied, never modified.
	 */
	constructor(reader, options = {}) {
		this.reader = reader;
		this.options = {
			...options,
			tableLineBreakStyle: options.tableLineBreakStyle || "space"
		};
	}
	/**
	 * Parse next paragraph
	 * @returns Parsed paragraph (`text` and `header`), or null if no more
	 *   paragraphs can be read (end of data or truncated records)
	 */
	parseParagraph() {
		const reader = this.reader;
		if (!reader.hasMore()) return null;
		// Skip forward to the next PARA_HEADER. A null record means the stream
		// is truncated; stop instead of retrying the same bytes forever.
		let headerRecord = null;
		while (reader.hasMore()) {
			const candidate = reader.readRecord();
			if (!candidate) break;
			if (candidate.tagId === HWPTAG_PARA_HEADER) {
				headerRecord = candidate;
				break;
			}
		}
		if (!headerRecord) return null;
		const header = parseParaHeader(headerRecord.data);
		let text = "";
		if (header.textCount > 0) {
			const nextHeader = reader.peekRecordHeader();
			if (nextHeader && nextHeader.tagId === HWPTAG_PARA_TEXT) {
				const textRecord = reader.readRecord();
				if (textRecord) text = processControlChars(parseParaText(textRecord.data, header.textCount)).text;
			}
		}
		const tables = [];
		while (reader.hasMore()) {
			const nextHeader = reader.peekRecordHeader();
			if (!nextHeader) break;
			if (nextHeader.tagId === HWPTAG_PARA_HEADER) break;
			if (nextHeader.level === 0) break;
			if (!reader.readRecord()) break;
			if (nextHeader.tagId !== HWPTAG_CTRL_HEADER) continue;
			const tableHeader = reader.peekRecordHeader();
			if (!tableHeader || tableHeader.tagId !== HWPTAG_TABLE) continue;
			const tableRecord = reader.readRecord();
			if (!tableRecord) break;
			try {
				const tableMd = tableToMarkdown(parseTable(tableRecord.data, reader, this.options.tableLineBreakStyle));
				tables.push(tableMd);
			} catch (error) {
				console.warn("Warning: Failed to parse table:", error);
				tables.push("[TABLE - Parse Error]");
			}
		}
		if (tables.length > 0) {
			text = text.trimEnd();
			// Very short text made only of non-ASCII/whitespace characters is
			// replaced by the tables rather than kept in front of them.
			const isPlaceholder = text.length > 0 && text.length < 5 && [...text].every((c) => {
				return (c.codePointAt(0) ?? 0) > 127 || /\s/.test(c);
			});
			if (isPlaceholder) text = tables.join("\n\n");
			else {
				if (text) text += "\n\n";
				text += tables.join("\n\n");
			}
		}
		return {
			text,
			header
		};
	}
	/**
	 * Parse all paragraphs in section
	 * @returns All paragraphs that could be read
	 */
	parseAllParagraphs() {
		const paragraphs = [];
		while (this.reader.hasMore()) {
			const para = this.parseParagraph();
			if (!para) break;
			paragraphs.push(para);
		}
		return paragraphs;
	}
};
633
+
634
+ //#endregion
635
+ //#region src/hwpx_parser.ts
636
+ /**
637
+ * HWPX (ZIP+XML) File Parser
638
+ * Parses HWPX files (ZIP archives containing XML) into Paragraphs and Tables
639
+ */
640
/** Expected content of the `mimetype` entry of an HWPX archive */
const HWPX_MIMETYPE = "application/hwp+zip";
/** Elements that are always represented as arrays, even when they occur once */
const HWPX_ARRAY_TAGS = new Set([
	"hp:p",
	"hp:run",
	"hp:t",
	"hp:tr",
	"hp:tc",
	"hp:tbl",
	"opf:item",
	"opf:itemref"
]);
/** Options for the XML parser */
const xmlParserOptions = {
	ignoreAttributes: false,
	attributeNamePrefix: "@_",
	removeNSPrefix: false,
	// Keep element text as strings. With the parser's default value conversion
	// text such as "123", "007" or "true" is turned into a number/boolean,
	// which loses the original formatting of document text.
	parseTagValue: false,
	isArray: (name) => HWPX_ARRAY_TAGS.has(name),
	textNodeName: "#text"
};
660
/**
 * HWPX file parser.
 *
 * An HWPX file is a ZIP archive of XML parts. `open()` unpacks the archive,
 * validates the `mimetype` entry, reads `version.xml` and determines the
 * ordered list of section files. All accessors require an opened file.
 */
var HWPXFile = class HWPXFile {
	data;
	entries = null;
	sectionPaths = [];
	version = "";
	xmlParser;
	/**
	 * @param data - Raw HWPX (ZIP) bytes
	 */
	constructor(data) {
		this.data = data;
		this.xmlParser = new XMLParser(xmlParserOptions);
	}
	/**
	 * Create HWPXFile from file path (Node.js only)
	 * @param path - Path to HWPX file
	 */
	static async fromFile(path) {
		const { readFile } = await import("node:fs/promises");
		return new HWPXFile(new Uint8Array(await readFile(path)));
	}
	/** Create HWPXFile from ArrayBuffer */
	static fromArrayBuffer(data) {
		return new HWPXFile(new Uint8Array(data));
	}
	/** Create HWPXFile from Uint8Array */
	static fromUint8Array(data) {
		return new HWPXFile(data);
	}
	/**
	 * Compare two names so that embedded numbers order numerically
	 * ("section2" before "section10"); other text compares by code unit.
	 */
	static #compareNatural(a, b) {
		const chunksA = a.match(/\d+|\D+/g) ?? [];
		const chunksB = b.match(/\d+|\D+/g) ?? [];
		const shared = Math.min(chunksA.length, chunksB.length);
		for (let i = 0; i < shared; i++) {
			const left = chunksA[i];
			const right = chunksB[i];
			if (left === right) continue;
			if (/^\d/.test(left) && /^\d/.test(right)) {
				const delta = Number(left) - Number(right);
				if (delta !== 0) return delta;
				continue;
			}
			return left < right ? -1 : 1;
		}
		return chunksA.length - chunksB.length;
	}
	/**
	 * Fallback section discovery: archive entries named
	 * `Contents/section*.xml`, in natural (numeric-aware) order.
	 */
	static #sectionEntryNames(entries) {
		return Object.keys(entries ?? {}).filter((name) => name.startsWith("Contents/section") && name.endsWith(".xml")).sort(HWPXFile.#compareNatural);
	}
	/**
	 * Unpack and validate the archive, then load version and section list.
	 * @throws {Error} when the data is not a ZIP archive, cannot be unpacked,
	 *   or has a missing/unexpected `mimetype` entry
	 */
	open() {
		const bytes = this.data;
		// ZIP local file header magic: "PK\x03\x04"
		const looksLikeZip = bytes.length >= 4 && bytes[0] === 80 && bytes[1] === 75 && bytes[2] === 3 && bytes[3] === 4;
		if (!looksLikeZip) throw new Error("Not a valid HWPX file (not a ZIP archive)");
		try {
			this.entries = unzipSync(bytes);
		} catch (e) {
			throw new Error(`Failed to parse HWPX file: ${e instanceof Error ? e.message : e}`);
		}
		const mimetypeEntry = this.entries["mimetype"];
		if (!mimetypeEntry) throw new Error("Not a valid HWPX file: missing 'mimetype' entry");
		const mimetype = new TextDecoder().decode(mimetypeEntry).trim();
		if (mimetype !== HWPX_MIMETYPE) throw new Error(`Invalid HWPX mimetype: '${mimetype}' (expected '${HWPX_MIMETYPE}')`);
		this.parseVersion();
		this.discoverSections();
	}
	/** Release the unpacked archive and forget the section list */
	close() {
		this.entries = null;
		this.sectionPaths = [];
	}
	/**
	 * Read the format version from `version.xml`.
	 * Best effort: a missing or unparsable entry yields "unknown".
	 */
	parseVersion() {
		const versionEntry = this.entries?.["version.xml"];
		if (!versionEntry) {
			this.version = "unknown";
			return;
		}
		try {
			const parsed = this.xmlParser.parse(new TextDecoder().decode(versionEntry));
			const root = parsed["hv:HCFVersion"] ?? parsed["HCFVersion"] ?? parsed;
			const major = root["@_major"] ?? "";
			const minor = root["@_minor"] ?? "";
			const micro = root["@_micro"] ?? "";
			this.version = major ? `${major}.${minor}.${micro}` : root["@_Version"] ?? "unknown";
		} catch {
			this.version = "unknown";
		}
	}
	/**
	 * Determine the ordered section file list.
	 *
	 * Preferred source is the spine of `Contents/content.hpf`: every itemref
	 * whose id contains "section" and resolves through the manifest. If the
	 * spine yields nothing, manifest hrefs containing "section" are used in
	 * natural id order. If the package file is missing or cannot be processed,
	 * the archive is scanned for `Contents/section*.xml` entries instead.
	 */
	discoverSections() {
		this.sectionPaths = [];
		const hpfEntry = this.entries?.["Contents/content.hpf"];
		if (!hpfEntry) {
			this.sectionPaths = HWPXFile.#sectionEntryNames(this.entries);
			return;
		}
		const toContentsPath = (href) => href.startsWith("Contents/") ? href : "Contents/" + href;
		try {
			const parsed = this.xmlParser.parse(new TextDecoder().decode(hpfEntry));
			const pkg = parsed["opf:package"] ?? parsed["package"] ?? {};
			const manifest = pkg["opf:manifest"] ?? pkg["manifest"] ?? {};
			const spine = pkg["opf:spine"] ?? pkg["spine"] ?? {};
			const items = manifest["opf:item"] ?? manifest["item"] ?? [];
			const idToHref = new Map();
			for (const item of Array.isArray(items) ? items : [items]) {
				const id = item["@_id"] ?? "";
				const href = item["@_href"] ?? "";
				if (href) idToHref.set(id, href);
			}
			const itemrefs = spine["opf:itemref"] ?? spine["itemref"] ?? [];
			for (const ref of Array.isArray(itemrefs) ? itemrefs : [itemrefs]) {
				const idref = ref["@_idref"] ?? "";
				if (idToHref.has(idref) && idref.toLowerCase().includes("section")) this.sectionPaths.push(toContentsPath(idToHref.get(idref)));
			}
			if (this.sectionPaths.length === 0) {
				const byId = [...idToHref.entries()].sort(([idA], [idB]) => HWPXFile.#compareNatural(idA, idB));
				for (const [, href] of byId) if (href.toLowerCase().includes("section")) this.sectionPaths.push(toContentsPath(href));
			}
		} catch {
			// The package file could not be processed: discard any partial
			// result and fall back to scanning the archive.
			this.sectionPaths = HWPXFile.#sectionEntryNames(this.entries);
		}
	}
	/** Summary of the opened file: format, version, section count, entry names */
	get fileInfo() {
		this.ensureOpen();
		return {
			format: "HWPX",
			version: this.version,
			sectionCount: this.sectionPaths.length,
			contents: Object.keys(this.entries ?? {})
		};
	}
	/** Number of discovered sections */
	getSectionCount() {
		this.ensureOpen();
		return this.sectionPaths.length;
	}
	/**
	 * Parse one section's XML.
	 * @param index - Section index (0-based)
	 * @returns Parsed XML object for the section
	 * @throws {RangeError} when the index is out of range
	 * @throws {Error} when the section file is not in the archive
	 */
	getSectionXml(index) {
		this.ensureOpen();
		if (index < 0 || index >= this.sectionPaths.length) throw new RangeError(`Section index ${index} out of range (0-${this.sectionPaths.length - 1})`);
		const path = this.sectionPaths[index];
		const entry = this.entries?.[path];
		if (!entry) throw new Error(`Section file not found: ${path}`);
		return this.xmlParser.parse(new TextDecoder().decode(entry));
	}
	/** Names of all archive entries */
	listContents() {
		this.ensureOpen();
		return Object.keys(this.entries ?? {});
	}
	/** @throws {Error} when `open()` has not been called (or `close()` was) */
	ensureOpen() {
		if (!this.entries) throw new Error("HWPX file is not open. Call open() first.");
	}
};
794
/**
 * Normalise a value to an array: `undefined`/`null` become `[]`, an array is
 * returned as-is, anything else is wrapped in a one-element array.
 */
function ensureArray(val) {
	if (Array.isArray(val)) return val;
	return val == null ? [] : [val];
}
798
/**
 * Extract text from a paragraph object (hp:p).
 *
 * Walks every `hp:run` and concatenates, in property order: the text of each
 * `hp:t` item, "\n" for an `hp:lineBreak` key and " " for an `hp:tab` key.
 * An `hp:t` item may be a primitive (string, or a number/boolean when the XML
 * parser converted the value) or an object carrying the text under "#text";
 * primitives are stringified instead of being dropped.
 *
 * @param pObj - Parsed hp:p element
 * @returns Paragraph text
 */
function extractParagraphText(pObj) {
	const parts = [];
	for (const run of ensureArray(pObj["hp:run"])) {
		for (const [key, value] of Object.entries(run)) {
			if (key === "hp:t") {
				for (const t of ensureArray(value)) {
					const text = t !== null && typeof t === "object" ? t["#text"] : t;
					if (text !== void 0 && text !== null) parts.push(String(text));
				}
			} else if (key === "hp:lineBreak") parts.push("\n");
			else if (key === "hp:tab") parts.push(" ");
		}
	}
	return parts.join("");
}
811
/**
 * Extract text from a table cell (hp:tc).
 *
 * Takes the paragraphs under the cell's `hp:subList`, trims each paragraph's
 * text, drops the empty ones and joins the rest. With `lineBreakStyle` "br"
 * both the joiner and any remaining "\n" become `<br>`; otherwise both become
 * a single space.
 *
 * @param tcObj - Parsed hp:tc element
 * @param lineBreakStyle - "br" or anything else (treated as "space")
 * @returns Single-line cell text
 */
function extractCellText(tcObj, lineBreakStyle = "space") {
	const container = tcObj["hp:subList"];
	const paragraphs = container ? ensureArray(container["hp:p"]) : [];
	const pieces = paragraphs.map((p) => extractParagraphText(p).trim()).filter((piece) => piece.length > 0);
	const joiner = lineBreakStyle === "br" ? "<br>" : " ";
	return pieces.join(joiner).replace(/\n/g, joiner);
}
824
/**
 * Parse a table XML object (hp:tbl) into a Table.
 *
 * Row/column counts come from the `rowCnt`/`colCnt` attributes (0 when
 * absent). Each `hp:tc` contributes one cell: address from `hp:cellAddr`
 * (0 when absent), span from `hp:cellSpan` (1 when absent, and raised to 1
 * when below 1), text via `extractCellText`.
 *
 * @param tblObj - Parsed hp:tbl element
 * @param lineBreakStyle - Passed through to `extractCellText`
 * @returns Table with `rows`, `cols` and `cells` in document order
 */
function parseHwpxTable(tblObj, lineBreakStyle = "space") {
	const rows = Number(tblObj["@_rowCnt"] ?? 0);
	const cols = Number(tblObj["@_colCnt"] ?? 0);
	const toCell = (tc) => {
		const address = tc["hp:cellAddr"];
		const spans = tc["hp:cellSpan"];
		const col = Number(address?.["@_colAddr"] ?? 0);
		const row = Number(address?.["@_rowAddr"] ?? 0);
		const rawColspan = Number(spans?.["@_colSpan"] ?? 1);
		const rawRowspan = Number(spans?.["@_rowSpan"] ?? 1);
		return {
			row,
			col,
			rowspan: rawRowspan < 1 ? 1 : rawRowspan,
			colspan: rawColspan < 1 ? 1 : rawColspan,
			text: extractCellText(tc, lineBreakStyle)
		};
	};
	const cells = ensureArray(tblObj["hp:tr"]).flatMap((tr) => ensureArray(tr["hp:tc"]).map(toCell));
	return { rows, cols, cells };
}
855
/**
 * Parse a section XML object into paragraphs.
 *
 * For every `hp:p` the paragraph text is extracted and any tables found in
 * its runs (`hp:run` > `hp:tbl`) or directly under it (`hp:tbl`) are appended
 * as Markdown, separated by blank lines. Paragraphs with no visible content
 * are skipped. Tables placed directly under the section are appended last,
 * each as its own paragraph. A table that fails to parse is represented by
 * "[TABLE - Parse Error]" in all three positions.
 *
 * @param sectionObj - Parsed section XML (the `hs:sec` wrapper is unwrapped when present)
 * @param options - Conversion options; `tableLineBreakStyle` defaults to "space"
 * @returns Paragraphs with `text` and a zero-filled `header`
 */
function parseHwpxSection(sectionObj, options = {}) {
	const lineBreakStyle = options.tableLineBreakStyle ?? "space";
	// HWPX carries no binary paragraph header; emit a zero-filled one.
	const emptyHeader = () => ({
		textCount: 0,
		controlMask: 0,
		paraShapeId: 0,
		styleId: 0,
		columnType: 0,
		charShapeCount: 0
	});
	const renderTable = (tbl) => {
		try {
			return tableToMarkdown(parseHwpxTable(tbl, lineBreakStyle));
		} catch {
			return "[TABLE - Parse Error]";
		}
	};
	const sec = sectionObj["hs:sec"] ? sectionObj["hs:sec"] : sectionObj;
	const paragraphs = [];
	for (const p of ensureArray(sec["hp:p"])) {
		let text = extractParagraphText(p);
		const tableMds = [];
		for (const run of ensureArray(p["hp:run"])) for (const tbl of ensureArray(run["hp:tbl"])) tableMds.push(renderTable(tbl));
		for (const tbl of ensureArray(p["hp:tbl"])) tableMds.push(renderTable(tbl));
		if (tableMds.length > 0) {
			text = text.trimEnd();
			if (text) text += "\n\n";
			text += tableMds.join("\n\n");
		}
		if (text.trim()) paragraphs.push({
			text,
			header: emptyHeader()
		});
	}
	for (const tbl of ensureArray(sec["hp:tbl"])) {
		const md = renderTable(tbl);
		if (md.trim()) paragraphs.push({
			text: md,
			header: emptyHeader()
		});
	}
	return paragraphs;
}
911
/**
 * Check whether binary data starts with the ZIP local-file signature
 * (bytes 80, 75, 3, 4 — "PK\x03\x04"), which is how HWPX input is recognised.
 * @param data - Uint8Array, or anything accepted by the Uint8Array constructor
 * @returns true when at least 4 bytes are present and they match the signature
 */
function isHwpxData(data) {
	const view = data instanceof Uint8Array ? data : new Uint8Array(data);
	if (view.length < 4) return false;
	const ZIP_SIGNATURE = [80, 75, 3, 4];
	return ZIP_SIGNATURE.every((expected, index) => view[index] === expected);
}
918
/**
 * Check if a file path has the `.hwpx` extension (case-insensitive).
 * @param path - File path
 * @returns true when the lower-cased path ends with ".hwpx"
 */
function isHwpxPath(path) {
	const EXTENSION = ".hwpx";
	const lowered = path.toLowerCase();
	return lowered.slice(-EXTENSION.length) === EXTENSION;
}
924
+
925
+ //#endregion
926
+ //#region src/converter.ts
927
+ /**
928
+ * HWP/HWPX to Markdown Converter
929
+ * Orchestrates conversion from HWP/HWPX to Markdown
930
+ */
931
/**
 * Convert paragraphs to Markdown.
 *
 * Each paragraph's text is trimmed; empty results are dropped. A paragraph
 * is also dropped when it is shorter than 5 characters, does not start with
 * "|" and consists solely of code points above 127. The remaining texts are
 * joined with a blank line between them.
 *
 * @param paragraphs - List of paragraphs (objects with a `text` string)
 * @returns Markdown text
 */
function paragraphsToMarkdown(paragraphs) {
	const isShortNonAscii = (text) => text.length < 5 && !text.startsWith("|") && [...text].every((c) => (c.codePointAt(0) ?? 0) > 127);
	return paragraphs.map((para) => para.text.trim()).filter((text) => text !== "" && !isShortNonAscii(text)).join("\n\n");
}
946
/**
 * Convert an opened HWP file to Markdown.
 *
 * Every section reported by `hwp.getSectionCount()` is read with
 * `hwp.readSection(i)`; sections that yield a falsy value are skipped. Each
 * remaining section is parsed with ParagraphParser over a RecordReader and
 * the collected paragraphs are rendered with paragraphsToMarkdown.
 *
 * The caller's `options` object is never modified: the parser receives a
 * shallow copy in which `tableLineBreakStyle` falls back to "space" when it
 * is missing or falsy.
 *
 * @param hwp - Opened HWP file
 * @param options - Conversion options (may be omitted or null)
 * @returns Markdown content
 */
function convertHwpToMarkdown(hwp, options = {}) {
	// Copy instead of assigning onto `options`: the object belongs to the caller
	// and may be shared between calls or frozen.
	const parserOptions = {
		...options,
		tableLineBreakStyle: options?.tableLineBreakStyle || "space"
	};
	const allParagraphs = [];
	const sectionCount = hwp.getSectionCount();
	for (let i = 0; i < sectionCount; i++) {
		const sectionData = hwp.readSection(i);
		if (!sectionData) continue;
		const parser = new ParagraphParser(new RecordReader(sectionData), parserOptions);
		// Append one by one; spreading a very large array into push() can exceed
		// the engine's argument limit.
		for (const paragraph of parser.parseAllParagraphs()) allParagraphs.push(paragraph);
	}
	return paragraphsToMarkdown(allParagraphs);
}
964
/**
 * Convert an opened HWPX file to Markdown.
 *
 * Sections are visited in index order; each section's XML is handed to
 * parseHwpxSection together with `options`, and the paragraphs of all
 * sections are rendered in one pass by paragraphsToMarkdown.
 *
 * @param hwpx - Opened HWPX file
 * @param options - Conversion options
 * @returns Markdown content
 */
function convertHwpxToMarkdown(hwpx, options = {}) {
	const collected = [];
	const total = hwpx.getSectionCount();
	let index = 0;
	while (index < total) {
		const sectionXml = hwpx.getSectionXml(index);
		const sectionParagraphs = parseHwpxSection(sectionXml, options);
		collected.push(...sectionParagraphs);
		index += 1;
	}
	return paragraphsToMarkdown(collected);
}
979
/**
 * High-level API: convert an HWP or HWPX document to Markdown.
 *
 * Format selection:
 * - a string input is treated as a file path; it is HWPX when isHwpxPath
 *   accepts it, otherwise HWP;
 * - a Uint8Array or ArrayBuffer input is HWPX when isHwpxData accepts its
 *   bytes, otherwise HWP;
 * - any other input is handed to HWPFile.fromArrayBuffer.
 *
 * The document is opened, converted, and always closed afterwards, even when
 * opening or converting throws.
 *
 * @param input - File path (Node.js), ArrayBuffer, or Uint8Array
 * @param options - Conversion options
 * @returns Markdown content
 */
async function convert(input, options) {
	const isPath = typeof input === "string";
	const isBytes = input instanceof Uint8Array;
	const isBuffer = input instanceof ArrayBuffer;

	let doc;
	let render;
	if (isPath && isHwpxPath(input)) {
		doc = await HWPXFile.fromFile(input);
		render = convertHwpxToMarkdown;
	} else if ((isBytes || isBuffer) && isHwpxData(isBuffer ? new Uint8Array(input) : input)) {
		doc = isBytes ? HWPXFile.fromUint8Array(input) : HWPXFile.fromArrayBuffer(input);
		render = convertHwpxToMarkdown;
	} else {
		if (isPath) doc = await HWPFile.fromFile(input);
		else if (isBytes) doc = HWPFile.fromUint8Array(input);
		else doc = HWPFile.fromArrayBuffer(input);
		render = convertHwpToMarkdown;
	}

	try {
		doc.open();
		return render(doc, options);
	} finally {
		doc.close();
	}
}
1016
+
1017
+ //#endregion
1018
+ //#region src/utils/binary.ts
1019
/**
 * Binary Reader Utility
 * Replaces Python's struct.unpack functionality.
 * All multi-byte values are little-endian.
 *
 * The reader wraps a DataView over the supplied bytes and keeps a cursor
 * (`offset`) that the read methods advance. Reads past the end of the data
 * throw the RangeError raised by the underlying DataView / Uint8Array.
 */
var BinaryReader = class BinaryReader {
	view;
	offset = 0;
	/**
	 * @param data - ArrayBuffer, or any ArrayBuffer view (Uint8Array, Node.js
	 *   Buffer, DataView, other typed arrays). For a view, only the window it
	 *   covers is readable; no bytes are copied.
	 */
	constructor(data) {
		if (ArrayBuffer.isView(data)) {
			// Respect the view's own window into its backing buffer.
			this.view = new DataView(data.buffer, data.byteOffset, data.byteLength);
		} else {
			this.view = new DataView(data, 0, data.byteLength);
		}
	}
	/**
	 * Read unsigned 8-bit integer (BYTE)
	 */
	readUint8() {
		const value = this.view.getUint8(this.offset);
		this.offset += 1;
		return value;
	}
	/**
	 * Read unsigned 16-bit integer (WORD) - little-endian
	 */
	readUint16LE() {
		const value = this.view.getUint16(this.offset, true);
		this.offset += 2;
		return value;
	}
	/**
	 * Read unsigned 32-bit integer (DWORD) - little-endian
	 */
	readUint32LE() {
		const value = this.view.getUint32(this.offset, true);
		this.offset += 4;
		return value;
	}
	/**
	 * Read signed 32-bit integer - little-endian
	 */
	readInt32LE() {
		const value = this.view.getInt32(this.offset, true);
		this.offset += 4;
		return value;
	}
	/**
	 * Read bytes without advancing offset (peek).
	 * The result is a view onto the same memory, not a copy.
	 */
	peekBytes(length) {
		return new Uint8Array(this.view.buffer, this.view.byteOffset + this.offset, length);
	}
	/**
	 * Read bytes and advance offset.
	 * The result is a view onto the same memory, not a copy.
	 */
	readBytes(length) {
		// peekBytes throws before the cursor moves if the range is invalid.
		const bytes = this.peekBytes(length);
		this.offset += length;
		return bytes;
	}
	/**
	 * Skip bytes (moves the cursor without validating the new position)
	 */
	skip(length) {
		this.offset += length;
	}
	/**
	 * Set absolute position (not validated against the data length)
	 */
	seek(offset) {
		this.offset = offset;
	}
	/**
	 * Get current position
	 */
	get position() {
		return this.offset;
	}
	/**
	 * Get remaining bytes
	 */
	get remaining() {
		return this.view.byteLength - this.offset;
	}
	/**
	 * Check if more data is available
	 */
	hasMore(minBytes = 1) {
		return this.remaining >= minBytes;
	}
	/**
	 * Get total length
	 */
	get length() {
		return this.view.byteLength;
	}
	/**
	 * Create a new BinaryReader for a subset of data.
	 * `start` and `end` are relative to the beginning of this reader's data
	 * (not to the cursor); `end` defaults to the end of the data. The new
	 * reader shares memory with this one and starts at position 0.
	 */
	slice(start, end) {
		const actualEnd = end ?? this.view.byteLength;
		return new BinaryReader(new Uint8Array(this.view.buffer, this.view.byteOffset + start, actualEnd - start));
	}
};
1123
+
1124
+ //#endregion
1125
+ export { BinaryReader, HWPFile, HWPTAG_BEGIN, HWPTAG_BIN_DATA, HWPTAG_BORDER_FILL, HWPTAG_BULLET, HWPTAG_CHAR_SHAPE, HWPTAG_CTRL_DATA, HWPTAG_CTRL_HEADER, HWPTAG_DOCUMENT_PROPERTIES, HWPTAG_FACE_NAME, HWPTAG_FOOTNOTE_SHAPE, HWPTAG_ID_MAPPINGS, HWPTAG_LIST_HEADER, HWPTAG_NUMBERING, HWPTAG_PAGE_BORDER_FILL, HWPTAG_PAGE_DEF, HWPTAG_PARA_CHAR_SHAPE, HWPTAG_PARA_HEADER, HWPTAG_PARA_LINE_SEG, HWPTAG_PARA_RANGE_TAG, HWPTAG_PARA_SHAPE, HWPTAG_PARA_TEXT, HWPTAG_SHAPE_COMPONENT, HWPTAG_SHAPE_COMPONENT_LINE, HWPTAG_STYLE, HWPTAG_TABLE, HWPTAG_TAB_DEF, HWPXFile, ParagraphParser, RecordReader, convert, convertHwpToMarkdown, convertHwpxToMarkdown, decompressRaw, isHwpxData, isHwpxPath, paragraphsToMarkdown, parseCellProperties, parseHwpxSection, parseHwpxTable, parseParaHeader, parseParaText, parseTable, parseTableProperties, processControlChars, tableToMarkdown };
1126
+ //# sourceMappingURL=index.mjs.map