kordoc 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,813 @@
1
+ #!/usr/bin/env node
2
+ var __defProp = Object.defineProperty;
3
+ var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
4
+ get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
5
+ }) : x)(function(x) {
6
+ if (typeof require !== "undefined") return require.apply(this, arguments);
7
+ throw Error('Dynamic require of "' + x + '" is not supported');
8
+ });
9
+ var __export = (target, all) => {
10
+ for (var name in all)
11
+ __defProp(target, name, { get: all[name], enumerable: true });
12
+ };
13
+
14
+ // src/detect.ts
15
/**
 * Tells whether the data begins with the four bytes 0x50 0x4B 0x03 0x04
 * ("PK\x03\x04"), the signature that detectFormat reports as "hwpx".
 *
 * @param {ArrayBuffer|Uint8Array} buffer Raw file bytes; only the first four are read.
 * @returns {boolean} True when all four signature bytes match.
 */
function isHwpxFile(buffer) {
  const [b0, b1, b2, b3] = new Uint8Array(buffer.slice(0, 4));
  return b0 === 0x50 && b1 === 0x4b && b2 === 0x03 && b3 === 0x04;
}
19
/**
 * Tells whether the data begins with the four bytes 0xD0 0xCF 0x11 0xE0,
 * the signature that detectFormat reports as "hwp".
 *
 * @param {ArrayBuffer|Uint8Array} buffer Raw file bytes; only the first four are read.
 * @returns {boolean} True when all four signature bytes match.
 */
function isOldHwpFile(buffer) {
  const [b0, b1, b2, b3] = new Uint8Array(buffer.slice(0, 4));
  return b0 === 0xd0 && b1 === 0xcf && b2 === 0x11 && b3 === 0xe0;
}
23
/**
 * Tells whether the data begins with the ASCII bytes "%PDF" (0x25 0x50 0x44 0x46).
 *
 * @param {ArrayBuffer|Uint8Array} buffer Raw file bytes; only the first four are read.
 * @returns {boolean} True when all four signature bytes match.
 */
function isPdfFile(buffer) {
  const [b0, b1, b2, b3] = new Uint8Array(buffer.slice(0, 4));
  return b0 === 0x25 && b1 === 0x50 && b2 === 0x44 && b3 === 0x46;
}
27
/**
 * Classifies a file by its leading signature bytes.
 * The probes run in a fixed order (hwpx, hwp, pdf) and the first match wins.
 *
 * @param {ArrayBuffer|Uint8Array} buffer Raw file bytes.
 * @returns {"hwpx"|"hwp"|"pdf"|"unknown"} The detected format label.
 */
function detectFormat(buffer) {
  const probes = [
    ["hwpx", isHwpxFile],
    ["hwp", isOldHwpFile],
    ["pdf", isPdfFile]
  ];
  const hit = probes.find(([, matches]) => matches(buffer));
  return hit ? hit[0] : "unknown";
}
33
+
34
+ // src/hwpx/parser.ts
35
+ import JSZip from "jszip";
36
+ import { DOMParser } from "@xmldom/xmldom";
37
+
38
+ // src/table/builder.ts
39
/**
 * Lays parsed table rows out on a rectangular grid, honouring column and row spans.
 *
 * Two passes are made. The first only measures how many columns are needed once
 * cells pushed sideways by row spans from earlier rows are taken into account.
 * The second places every cell at the first free column of its row and marks the
 * positions its spans cover; covered positions keep an empty 1x1 placeholder.
 *
 * @param {Array<Array<{text: string, colSpan: number, rowSpan: number}>>} rows
 *   Cells grouped by source row, in document order.
 * @param {number} [maxColumns=100] Upper bound on the columns scanned while
 *   measuring. Cells that would start at or beyond this column are dropped.
 *   Invalid values fall back to 100.
 * @returns {{rows: number, cols: number, cells: Array<Array<{text: string, colSpan: number, rowSpan: number}>>, hasHeader: boolean}}
 *   The grid description; `hasHeader` is true whenever there is more than one row.
 */
function buildTable(rows, maxColumns = 100) {
  const columnLimit = Number.isInteger(maxColumns) && maxColumns > 0 ? maxColumns : 100;
  const numRows = rows.length;

  // Flags every position covered by `cell` when it is anchored at (rowIdx, colIdx).
  const markSpan = (occupied, rowIdx, colIdx, cell, colLimit) => {
    const rowEnd = Math.min(rowIdx + cell.rowSpan, numRows);
    const colEnd = Math.min(colIdx + cell.colSpan, colLimit);
    for (let r = rowIdx; r < rowEnd; r++) {
      for (let c = colIdx; c < colEnd; c++) {
        occupied[r][c] = true;
      }
    }
  };

  // Pass 1: measure the required column count.
  const measured = Array.from({ length: numRows }, () => Array(columnLimit).fill(false));
  let maxCols = 0;
  for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
    let colIdx = 0;
    for (const cell of rows[rowIdx]) {
      while (colIdx < columnLimit && measured[rowIdx][colIdx]) colIdx++;
      if (colIdx >= columnLimit) break;
      markSpan(measured, rowIdx, colIdx, cell, columnLimit);
      colIdx += cell.colSpan;
      if (colIdx > maxCols) maxCols = colIdx;
    }
  }
  if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };

  // Pass 2: place the cells on a grid pre-filled with empty placeholders.
  const grid = Array.from(
    { length: numRows },
    () => Array.from({ length: maxCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
  );
  const occupied = Array.from({ length: numRows }, () => Array(maxCols).fill(false));
  for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
    let colIdx = 0;
    let cellIdx = 0;
    while (colIdx < maxCols && cellIdx < rows[rowIdx].length) {
      while (colIdx < maxCols && occupied[rowIdx][colIdx]) colIdx++;
      if (colIdx >= maxCols) break;
      const cell = rows[rowIdx][cellIdx];
      grid[rowIdx][colIdx] = {
        text: cell.text.trim(),
        colSpan: cell.colSpan,
        rowSpan: cell.rowSpan
      };
      markSpan(occupied, rowIdx, colIdx, cell, maxCols);
      colIdx += cell.colSpan;
      cellIdx++;
    }
  }
  return { rows: numRows, cols: maxCols, cells: grid, hasHeader: numRows > 1 };
}
86
/**
 * Flattens table rows into plain text: one line per row, cells joined by " | ".
 * Newlines inside a cell become spaces; empty cells and empty rows are left out.
 *
 * @param {Array<Array<{text: string}>>} rows Cells grouped by row.
 * @returns {string} The rows joined with "\n".
 */
function convertTableToText(rows) {
  const lines = [];
  for (const row of rows) {
    const parts = [];
    for (const cell of row) {
      const flat = cell.text.trim().replace(/\n/g, " ");
      if (flat) parts.push(flat);
    }
    const line = parts.join(" | ");
    if (line) lines.push(line);
  }
  return lines.join("\n");
}
91
/**
 * Renders parsed blocks (paragraphs and tables) as a Markdown string.
 *
 * Paragraph rules:
 * - text starting with "[별표 N" becomes a level-2 heading; when the next paragraph
 *   ends in "관련" or "관련)", it is folded into the same heading line;
 * - a fully parenthesised "(...조...관련)" line is emitted in italics;
 * - anything else is emitted verbatim.
 *
 * Tables are separated from neighbouring content by blank lines so that a
 * following paragraph line is not absorbed into the table as an extra row.
 * Tables that render to an empty string are skipped.
 *
 * @param {Array<{type: string, text?: string, table?: object}>} blocks
 * @returns {string} Markdown with leading and trailing whitespace trimmed.
 */
function blocksToMarkdown(blocks) {
  const lines = [];
  for (let i = 0; i < blocks.length; i++) {
    const block = blocks[i];

    if (block.type === "table" && block.table) {
      const rendered = tableToMarkdown(block.table);
      if (!rendered) continue;
      // Avoid stacking a second blank line when the previous entry is already blank.
      if (lines.length > 0 && lines[lines.length - 1] !== "") lines.push("");
      lines.push(rendered, "");
      continue;
    }
    if (block.type !== "paragraph" || !block.text) continue;

    const text = block.text;
    if (/^\[별표\s*\d+/.test(text)) {
      const nextBlock = blocks[i + 1];
      if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
        lines.push("", `## ${text} ${nextBlock.text}`, "");
        i++; // the follow-up paragraph was merged into the heading
      } else {
        lines.push("", `## ${text}`, "");
      }
      continue;
    }
    if (/^\([^)]*조[^)]*관련\)$/.test(text)) {
      lines.push(`*${text}*`, "");
      continue;
    }
    lines.push(text);
  }
  return lines.join("\n").trim();
}
118
/**
 * Renders a grid produced by buildTable as a Markdown pipe table.
 *
 * - An empty table yields "".
 * - A 1x1 table is treated as a text box, not a table: its lines are emitted as
 *   text, with "N. " lines in bold and "가. "-style lines indented by a space.
 * - Otherwise the first distinct row becomes the header. Positions covered by a
 *   span are rendered empty, and rows whose rendered content duplicates an
 *   earlier row are dropped.
 *
 * Inside cells, newlines become "<br>" and "|" is escaped as "\|" so that cell
 * content cannot break the column structure.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string, colSpan: number, rowSpan: number}>>}} table
 * @returns {string} Markdown text (no trailing newline).
 */
function tableToMarkdown(table) {
  if (table.rows === 0 || table.cols === 0) return "";
  const { cells, rows: numRows, cols: numCols } = table;

  if (numRows === 1 && numCols === 1) {
    const content = cells[0][0].text;
    return content.split(/\n/).map((line) => {
      const trimmed = line.trim();
      if (!trimmed) return "";
      if (/^\d+\.\s/.test(trimmed)) return `**${trimmed}**`;
      if (/^[가-힣]\.\s/.test(trimmed)) return ` ${trimmed}`;
      return trimmed;
    }).filter(Boolean).join("\n");
  }

  const renderCell = (text) => text.replace(/\|/g, "\\|").replace(/\n/g, "<br>");
  const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
  const skip = /* @__PURE__ */ new Set();
  for (let r = 0; r < numRows; r++) {
    for (let c = 0; c < numCols; c++) {
      if (skip.has(`${r},${c}`)) continue;
      const cell = cells[r][c];
      display[r][c] = renderCell(cell.text);
      // Every other position under this cell's span stays blank.
      for (let dr = 0; dr < cell.rowSpan; dr++) {
        for (let dc = 0; dc < cell.colSpan; dc++) {
          if (dr === 0 && dc === 0) continue;
          if (r + dr < numRows && c + dc < numCols) {
            skip.add(`${r + dr},${c + dc}`);
          }
        }
      }
    }
  }

  const uniqueRows = [];
  const seen = /* @__PURE__ */ new Set();
  for (const row of display) {
    // JSON keeps cell boundaries unambiguous, so distinct rows cannot share a key.
    const key = JSON.stringify(row);
    if (!seen.has(key)) {
      seen.add(key);
      uniqueRows.push(row);
    }
  }
  if (uniqueRows.length === 0) return "";

  const toLine = (row) => "| " + row.join(" | ") + " |";
  const md = [toLine(uniqueRows[0]), toLine(uniqueRows[0].map(() => "---"))];
  for (let i = 1; i < uniqueRows.length; i++) {
    md.push(toLine(uniqueRows[i]));
  }
  return md.join("\n");
}
166
+
167
+ // src/hwpx/parser.ts
168
/**
 * Converts an HWPX (ZIP + XML) document to Markdown.
 *
 * The archive is opened with JSZip; if that fails, the raw bytes are handed to
 * extractFromBrokenZip as a best-effort recovery. Section XML files are located
 * through resolveSectionPaths and parsed in the order it returns them.
 *
 * Section paths taken from the manifest may carry a leading "/", which never
 * matches an archive entry name, so a lookup that misses is retried without it.
 * Paths that still cannot be found are skipped.
 *
 * @param {ArrayBuffer|Uint8Array} buffer Raw HWPX bytes.
 * @returns {Promise<string>} Markdown for all sections.
 * @throws {Error} When no section paths can be resolved, or when recovery from a
 *   broken archive fails.
 */
async function parseHwpxDocument(buffer) {
  let zip;
  try {
    zip = await JSZip.loadAsync(buffer);
  } catch {
    return extractFromBrokenZip(buffer);
  }
  const sectionPaths = await resolveSectionPaths(zip);
  if (sectionPaths.length === 0) throw new Error("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
  const blocks = [];
  for (const path of sectionPaths) {
    const file = zip.file(path) ?? zip.file(path.replace(/^\/+/, ""));
    if (!file) continue;
    const xml = await file.async("text");
    blocks.push(...parseSectionXml(xml));
  }
  return blocksToMarkdown(blocks);
}
186
/**
 * Best-effort recovery of section text from a ZIP that JSZip could not open.
 *
 * Walks consecutive local file headers ("PK\x03\x04") from the start of the data,
 * reading method, compressed size, name length and extra length straight from
 * each 30-byte header. Scanning stops at the first position that is not a local
 * header. Entries whose name contains "section" (case-insensitive) and ends in
 * ".xml" are decoded (stored or raw-deflate) and converted to Markdown; entries
 * that fail to decode or use another method are skipped.
 *
 * Decompression uses the statically imported `inflateRawSync`; the bundler's
 * dynamic `__require` throws inside an ES module, which previously made every
 * deflated entry fail silently.
 *
 * @param {ArrayBuffer|Uint8Array} buffer Raw archive bytes (a Node Buffer is accepted too).
 * @returns {string} Recovered sections joined by a blank line.
 * @throws {Error} When no section text could be recovered.
 */
function extractFromBrokenZip(buffer) {
  // Work on a byte view without copying; honour the view's offset for typed arrays.
  const data = buffer instanceof Uint8Array ? buffer : new Uint8Array(buffer);
  const view = new DataView(data.buffer, data.byteOffset, data.byteLength);
  const decoder = new TextDecoder();
  const texts = [];
  let pos = 0;
  while (pos < data.length - 30) {
    const isLocalHeader = data[pos] === 80 && data[pos + 1] === 75 && data[pos + 2] === 3 && data[pos + 3] === 4;
    if (!isLocalHeader) break;
    const method = view.getUint16(pos + 8, true);
    const compSize = view.getUint32(pos + 18, true);
    const nameLen = view.getUint16(pos + 26, true);
    const extraLen = view.getUint16(pos + 28, true);
    const name = decoder.decode(data.subarray(pos + 30, pos + 30 + nameLen));
    const fileStart = pos + 30 + nameLen + extraLen;
    const fileData = data.subarray(fileStart, fileStart + compSize);
    pos = fileStart + compSize;
    if (!name.toLowerCase().includes("section") || !name.endsWith(".xml")) continue;
    try {
      let content;
      if (method === 0) {
        content = decoder.decode(fileData);
      } else if (method === 8) {
        content = decoder.decode(inflateRawSync(fileData));
      } else {
        continue;
      }
      const sectionText = blocksToMarkdown(parseSectionXml(content));
      if (sectionText) texts.push(sectionText);
    } catch {
      // Deliberate: one damaged entry must not abort recovery of the others.
      continue;
    }
  }
  if (texts.length === 0) throw new Error("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
  return texts.join("\n\n");
}
222
/**
 * Finds the section XML entries of an HWPX archive, in reading order.
 *
 * The package manifest ("Contents/content.hpf", then "content.hpf", matched
 * case-insensitively) is consulted first. If no manifest yields any path, every
 * entry whose name ends in "sectionN.xml" / "SectionN.xml" is used instead,
 * ordered by the numeric value of N (so section2 precedes section10), with the
 * full name as a tie-breaker.
 *
 * @param {object} zip A loaded JSZip instance.
 * @returns {Promise<string[]>} Archive paths of the sections; empty when none are found.
 */
async function resolveSectionPaths(zip) {
  const manifestPaths = ["Contents/content.hpf", "content.hpf"];
  for (const mp of manifestPaths) {
    const file = zip.file(new RegExp(`^${mp.replace(/\./g, "\\.")}$`, "i"))[0];
    if (!file) continue;
    const xml = await file.async("text");
    const paths = parseSectionPathsFromManifest(xml);
    if (paths.length > 0) return paths;
  }
  const sectionFiles = zip.file(/[Ss]ection\d+\.xml$/);
  // The lookup pattern guarantees a trailing "<digits>.xml", so the match cannot fail.
  const sectionNumber = (name) => Number(/(\d+)\.xml$/.exec(name)[1]);
  return sectionFiles
    .map((f) => f.name)
    .sort((a, b) => sectionNumber(a) - sectionNumber(b) || (a < b ? -1 : a > b ? 1 : 0));
}
234
/**
 * Extracts section file paths from a package manifest (content.hpf) document.
 *
 * Every <opf:item> is kept when its id looks like a section id (starts with "s"
 * or contains "section", case-insensitively) or its media-type mentions "xml".
 * For section-like ids, an href that starts with neither "/" nor "Contents/" is
 * prefixed with "Contents/".
 *
 * When the <opf:itemref> spine resolves to at least one kept item, spine order is
 * returned. Otherwise the section-like items are returned sorted by id.
 *
 * @param {string} xml Manifest XML text.
 * @returns {string[]} Section paths; empty when nothing qualifies.
 */
function parseSectionPathsFromManifest(xml) {
  const doc = new DOMParser().parseFromString(xml, "text/xml");
  const manifestItems = doc.getElementsByTagName("opf:item");
  const spineRefs = doc.getElementsByTagName("opf:itemref");
  const looksLikeSection = (id) => /^s/i.test(id) || id.toLowerCase().includes("section");

  const hrefById = /* @__PURE__ */ new Map();
  for (let n = 0; n < manifestItems.length; n++) {
    const node = manifestItems[n];
    const id = node.getAttribute("id") || "";
    const rawHref = node.getAttribute("href") || "";
    const mediaType = node.getAttribute("media-type") || "";
    const sectionLike = looksLikeSection(id);
    if (!sectionLike && !mediaType.includes("xml")) continue;
    const needsPrefix = sectionLike && !rawHref.startsWith("/") && !rawHref.startsWith("Contents/");
    hrefById.set(id, needsPrefix ? "Contents/" + rawHref : rawHref);
  }

  if (spineRefs.length > 0) {
    const ordered = [];
    for (let n = 0; n < spineRefs.length; n++) {
      const target = hrefById.get(spineRefs[n].getAttribute("idref") || "");
      if (target) ordered.push(target);
    }
    if (ordered.length > 0) return ordered;
  }

  const sectionEntries = Array.from(hrefById.entries()).filter(([id]) => looksLikeSection(id));
  sectionEntries.sort((left, right) => left[0].localeCompare(right[0]));
  return sectionEntries.map(([, href]) => href);
}
261
/**
 * Parses one section XML document into a flat list of paragraph/table blocks.
 *
 * @param {string} xml Section XML text.
 * @returns {Array<object>} Blocks collected by walkSection; empty when the parsed
 *   document has no root element.
 */
function parseSectionXml(xml) {
  const root = new DOMParser().parseFromString(xml, "text/xml").documentElement;
  const blocks = [];
  if (root) walkSection(root, blocks, null, []);
  return blocks;
}
269
/**
 * Recursively walks section XML, appending paragraph and table blocks to `blocks`.
 * Tags are matched on their local name (namespace prefix removed).
 *
 * - "tbl" starts a new table context. A finished top-level table with rows is
 *   pushed as a table block; a finished nested table is flattened to text and
 *   appended to the cell of the enclosing table that was open when it started.
 * - "tr" / "tc" collect rows and cells, and are ignored outside a table.
 * - "cellSpan" copies positive colSpan/rowSpan attributes onto the open cell.
 * - "p" adds its text to the open cell, or emits a paragraph block when not in
 *   a table, and is then descended into to pick up tables nested in it.
 * - Any other element is simply descended into.
 *
 * @param {object} node DOM node whose children are visited.
 * @param {Array<object>} blocks Output list, mutated in place.
 * @param {?{rows: Array, currentRow: Array, cell: ?object}} tableCtx Table being
 *   built, or null when outside any table.
 * @param {Array<object>} tableStack Enclosing table contexts, innermost last.
 * @returns {void}
 */
function walkSection(node, blocks, tableCtx, tableStack) {
  const kids = node.childNodes;
  if (!kids) return;
  for (let idx = 0; idx < kids.length; idx++) {
    const el = kids[idx];
    if (el.nodeType !== 1) continue; // elements only
    const qualifiedName = el.tagName || el.localName || "";
    const localTag = qualifiedName.replace(/^[^:]+:/, "");

    if (localTag === "tbl") {
      // Remember the enclosing table so the nested one can report back to it.
      if (tableCtx) tableStack.push(tableCtx);
      const inner = { rows: [], currentRow: [], cell: null };
      walkSection(el, blocks, inner, tableStack);
      const hasRows = inner.rows.length > 0;
      if (hasRows && tableStack.length > 0) {
        const outer = tableStack.pop();
        const nestedText = convertTableToText(inner.rows);
        if (outer.cell) {
          outer.cell.text += (outer.cell.text ? "\n" : "") + nestedText;
        }
        tableCtx = outer;
      } else if (hasRows) {
        blocks.push({ type: "table", table: buildTable(inner.rows) });
        tableCtx = null;
      } else {
        tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
      }
      continue;
    }

    if (localTag === "tr") {
      if (!tableCtx) continue;
      tableCtx.currentRow = [];
      walkSection(el, blocks, tableCtx, tableStack);
      if (tableCtx.currentRow.length > 0) tableCtx.rows.push(tableCtx.currentRow);
      tableCtx.currentRow = [];
      continue;
    }

    if (localTag === "tc") {
      if (!tableCtx) continue;
      tableCtx.cell = { text: "", colSpan: 1, rowSpan: 1 };
      walkSection(el, blocks, tableCtx, tableStack);
      if (tableCtx.cell) {
        tableCtx.currentRow.push(tableCtx.cell);
        tableCtx.cell = null;
      }
      continue;
    }

    if (localTag === "cellSpan") {
      if (tableCtx?.cell) {
        const cs = parseInt(el.getAttribute("colSpan") || "1", 10);
        const rs = parseInt(el.getAttribute("rowSpan") || "1", 10);
        // NaN and non-positive values fail the comparison and leave the default of 1.
        if (cs > 0) tableCtx.cell.colSpan = cs;
        if (rs > 0) tableCtx.cell.rowSpan = rs;
      }
      continue;
    }

    if (localTag === "p") {
      const text = extractParagraphText(el);
      if (text) {
        if (tableCtx?.cell) {
          tableCtx.cell.text += (tableCtx.cell.text ? "\n" : "") + text;
        } else if (!tableCtx) {
          blocks.push({ type: "paragraph", text });
        }
      }
    }

    // "p" and every unrecognised element: keep descending.
    walkSection(el, blocks, tableCtx, tableStack);
  }
}
343
/**
 * Collects the visible text of a paragraph element.
 *
 * Text nodes and "t" elements contribute their textContent; "tab", "fwSpace" and
 * "hwSpace" contribute a space; "br" contributes a newline when its type is
 * "line" (the default when the attribute is absent). Nested "tbl" elements are
 * skipped because tables are handled by walkSection. Every other element is
 * descended into. Runs of spaces/tabs are collapsed to one space at the end.
 *
 * @param {object} para Paragraph DOM element.
 * @returns {string} Normalised, trimmed paragraph text.
 */
function extractParagraphText(para) {
  const pieces = [];
  const spaceTags = /* @__PURE__ */ new Set(["tab", "fwSpace", "hwSpace"]);

  const collect = (node) => {
    const kids = node.childNodes;
    if (!kids) return;
    for (let k = 0; k < kids.length; k++) {
      const child = kids[k];
      if (child.nodeType === 3) {
        pieces.push(child.textContent || "");
        continue;
      }
      if (child.nodeType !== 1) continue;
      const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
      if (tag === "t") {
        pieces.push(child.textContent || "");
      } else if (spaceTags.has(tag)) {
        pieces.push(" ");
      } else if (tag === "br") {
        if ((child.getAttribute("type") || "line") === "line") pieces.push("\n");
      } else if (tag !== "tbl") {
        // Tables are handled by walkSection, so only non-table containers are entered.
        collect(child);
      }
    }
  };

  collect(para);
  return pieces.join("").replace(/[ \t]+/g, " ").trim();
}
382
+
383
+ // src/hwp5/record.ts
384
+ import { inflateRawSync, inflateSync } from "zlib";
385
// HWP 5.0 record tag IDs used by the body-text parser.
var TAG_PARA_HEADER = 66;
var TAG_PARA_TEXT = 67;
var TAG_CTRL_HEADER = 71;
var TAG_LIST_HEADER = 72;
var TAG_TABLE = 77;

// Control character codes inside PARA_TEXT, per the HWP 5.0 control-character
// table: 9 = tab, 10 = line break, 13 = paragraph break, 24 = hyphen,
// 30 = non-breaking space, 31 = fixed-width space. (Code 0 is "unusable" in that
// table, not a line break.)
var CHAR_LINE = 10;
var CHAR_PARA = 13;
var CHAR_TAB = 9;
var CHAR_HYPHEN = 24;
var CHAR_NBSP = 30;
var CHAR_FIXED_NBSP = 31;

// FileHeader property bits checked by parseHwp5Document.
var FLAG_COMPRESSED = 1 << 0;
var FLAG_ENCRYPTED = 1 << 1;
var FLAG_DRM = 1 << 4;
399
/**
 * Splits a decompressed stream into its sequence of records.
 *
 * Each record starts with a 32-bit little-endian header: bits 0-9 hold the tag
 * id, bits 10-19 the nesting level and bits 20-31 the payload size. A size field
 * of 4095 means the real size follows as a separate 32-bit value. Parsing stops
 * quietly at the first header or payload that would run past the end of the data.
 *
 * @param {Buffer} data Record stream.
 * @returns {Array<{tagId: number, level: number, size: number, data: Buffer}>}
 *   Records in stream order; `data` is a view into the input, not a copy.
 */
function readRecords(data) {
  const records = [];
  let cursor = 0;
  while (cursor + 4 <= data.length) {
    const header = data.readUInt32LE(cursor);
    cursor += 4;
    const tagId = header & 0x3ff;
    const level = (header >> 10) & 0x3ff;
    let size = (header >> 20) & 0xfff;
    if (size === 0xfff) {
      if (cursor + 4 > data.length) break;
      size = data.readUInt32LE(cursor);
      cursor += 4;
    }
    const end = cursor + size;
    if (end > data.length) break;
    records.push({ tagId, level, size, data: data.subarray(cursor, end) });
    cursor = end;
  }
  return records;
}
419
/**
 * Inflates a compressed stream, accepting both zlib-wrapped and raw deflate data.
 *
 * Data that starts with byte 0x78 is first tried as a zlib stream; if that fails,
 * or the first byte differs, the data is inflated as raw deflate.
 *
 * @param {Buffer} data Compressed bytes.
 * @returns {Buffer} Decompressed bytes.
 * @throws {Error} When raw inflation fails as well.
 */
function decompressStream(data) {
  const hasZlibHeader = data.length >= 2 && data[0] === 0x78;
  if (!hasZlibHeader) return inflateRawSync(data);
  try {
    return inflateSync(data);
  } catch {
    // Not a valid zlib stream after all; fall back to raw deflate.
    return inflateRawSync(data);
  }
}
428
/**
 * Reads the fields of the FileHeader stream that the parser needs.
 *
 * @param {Buffer} data FileHeader bytes (at least 40 are read).
 * @returns {{signature: string, versionMajor: number, flags: number}}
 *   `signature` is bytes 0-31 as UTF-8 without trailing NULs, `versionMajor` is
 *   byte 35 and `flags` is the 32-bit little-endian value at offset 36.
 * @throws {RangeError} When the data is too short to hold the flags field.
 */
function parseFileHeader(data) {
  const signature = data.toString("utf8", 0, 32).replace(/\0+$/, "");
  const versionMajor = data[35];
  const flags = data.readUInt32LE(36);
  return { signature, versionMajor, flags };
}
436
/**
 * Decodes the UTF-16LE payload of a PARA_TEXT record into plain text.
 *
 * Ordinary code units (>= 32) are appended as-is. The line-break code becomes
 * "\n", the tab code a space, the hyphen code "-", and both space codes a space;
 * the paragraph-break code is dropped. Control codes in the ranges 1-3, 11-18 and
 * 21-23 ("extended") and 4-9, 19-20 ("inline") carry a further 14 bytes of
 * payload, which is skipped when that many bytes remain.
 *
 * Tab (code 9) lies in the inline range, so its payload is skipped as well;
 * otherwise those bytes would be decoded as spurious characters after every tab.
 *
 * @param {Buffer} data PARA_TEXT record payload.
 * @returns {string} Decoded text (untrimmed).
 */
function extractText(data) {
  // Bytes that follow the code unit of an inline/extended control.
  const CONTROL_PAYLOAD_BYTES = 14;
  let result = "";
  let i = 0;
  const skipControlPayload = () => {
    if (i + CONTROL_PAYLOAD_BYTES <= data.length) i += CONTROL_PAYLOAD_BYTES;
  };
  while (i + 1 < data.length) {
    const ch = data.readUInt16LE(i);
    i += 2;
    switch (ch) {
      case CHAR_LINE:
        result += "\n";
        break;
      case CHAR_PARA:
        break;
      case CHAR_TAB:
        result += " ";
        skipControlPayload();
        break;
      case CHAR_HYPHEN:
        result += "-";
        break;
      case CHAR_NBSP:
      case CHAR_FIXED_NBSP:
        result += " ";
        break;
      default:
        if (ch >= 32) {
          result += String.fromCharCode(ch);
        } else if (ch >= 1) {
          const isExt = ch >= 1 && ch <= 3 || ch >= 11 && ch <= 18 || ch >= 21 && ch <= 23;
          const isInline = ch >= 4 && ch <= 9 || ch >= 19 && ch <= 20;
          if (isExt || isInline) skipControlPayload();
        }
        break;
    }
  }
  return result;
}
471
+
472
+ // src/hwp5/parser.ts
473
+ import { createRequire } from "module";
474
// The "cfb" package is loaded through a CommonJS-style require() created for
// this ES module, rather than through a static import.
const require2 = createRequire(import.meta.url);
const CFB = require2("cfb");
476
/**
 * Converts a binary HWP 5 (compound file) document to Markdown.
 *
 * The FileHeader stream is validated first: the signature must be
 * "HWP Document File", and encrypted or DRM-protected files are rejected. Every
 * body-text section is then decompressed when the compressed flag is set, split
 * into records, parsed into blocks, and the blocks are rendered together.
 *
 * @param {Buffer} buffer Raw file bytes.
 * @returns {string} Markdown for the whole document.
 * @throws {Error} When the header stream is missing, the signature does not
 *   match, the file is encrypted or DRM-protected, or no section is found.
 */
function parseHwp5Document(buffer) {
  const cfb = CFB.parse(buffer);
  const headerEntry = CFB.find(cfb, "/FileHeader");
  if (!headerEntry?.content) throw new Error("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");

  const { signature, flags } = parseFileHeader(Buffer.from(headerEntry.content));
  if (signature !== "HWP Document File") throw new Error("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
  if (flags & FLAG_ENCRYPTED) throw new Error("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
  if (flags & FLAG_DRM) throw new Error("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
  const isCompressed = (flags & FLAG_COMPRESSED) !== 0;

  const sections = findSections(cfb);
  if (sections.length === 0) throw new Error("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");

  const blocks = sections.flatMap((sectionData) => {
    const raw = Buffer.from(sectionData);
    const records = readRecords(isCompressed ? decompressStream(raw) : raw);
    return parseSection(records);
  });
  return blocksToMarkdown(blocks);
}
495
/**
 * Collects the body-text section streams of a parsed compound file.
 *
 * Streams "/BodyText/Section0", "/BodyText/Section1", ... are looked up until the
 * first index that is missing or has no content. If none is found that way, the
 * file index is scanned for entries whose name starts with "Section"; the number
 * after that prefix (0 when it cannot be parsed) decides their order.
 *
 * @param {object} cfb Container returned by CFB.parse.
 * @returns {Buffer[]} Section contents in ascending index order.
 */
function findSections(cfb) {
  const found = [];
  let n = 0;
  while (true) {
    const entry = CFB.find(cfb, `/BodyText/Section${n}`);
    if (!entry?.content) break;
    found.push({ idx: n, content: Buffer.from(entry.content) });
    n += 1;
  }

  if (found.length === 0 && cfb.FileIndex) {
    for (const entry of cfb.FileIndex) {
      if (!entry.name?.startsWith("Section") || !entry.content) continue;
      const parsed = parseInt(entry.name.replace("Section", ""), 10);
      found.push({ idx: parsed || 0, content: Buffer.from(entry.content) });
    }
  }

  found.sort((a, b) => a.idx - b.idx);
  return found.map((section) => section.content);
}
512
/**
 * Turns the record list of one section into paragraph and table blocks.
 *
 * A PARA_HEADER at level 0 starts a paragraph, which is parsed together with the
 * tables found inside it. A CTRL_HEADER at level 0 or 1 whose first four bytes
 * read " lbt" or "tbl " is parsed as a stand-alone table. Other records are
 * skipped one at a time.
 *
 * @param {Array<{tagId: number, level: number, data: Buffer}>} records
 * @returns {Array<{type: "paragraph", text: string}|{type: "table", table: object}>}
 */
function parseSection(records) {
  const blocks = [];
  let cursor = 0;
  while (cursor < records.length) {
    const rec = records[cursor];

    if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
      const parsed = parseParagraphWithTables(records, cursor);
      if (parsed.paragraph) blocks.push({ type: "paragraph", text: parsed.paragraph });
      parsed.tables.forEach((table) => {
        blocks.push({ type: "table", table });
      });
      cursor = parsed.nextIdx;
      continue;
    }

    const isShallowControl = rec.tagId === TAG_CTRL_HEADER && rec.level <= 1 && rec.data.length >= 4;
    if (isShallowControl) {
      const ctrlId = rec.data.subarray(0, 4).toString("ascii");
      if (ctrlId === " lbt" || ctrlId === "tbl ") {
        const parsed = parseTableBlock(records, cursor);
        if (parsed.table) blocks.push({ type: "table", table: parsed.table });
        cursor = parsed.nextIdx;
        continue;
      }
    }

    cursor += 1;
  }
  return blocks;
}
537
/**
 * Parses one paragraph and the tables anchored inside it.
 *
 * Scanning starts after the PARA_HEADER at `startIdx` and stops at the next
 * PARA_HEADER whose level is not deeper than this paragraph's. CTRL_HEADER
 * records identified as " lbt" / "tbl " are handed to parseTableBlock.
 *
 * Text selection: the paragraph's own PARA_TEXT wins. It is assumed to sit at
 * most one level below the PARA_HEADER, as in the HWP 5.0 record tree (confirm
 * against the spec if other producers differ). PARA_TEXT records nested deeper
 * belong to non-table controls and are used only as a fallback when the
 * paragraph has no text of its own. Previously any nested text silently replaced
 * the paragraph's text.
 *
 * @param {Array<{tagId: number, level: number, data: Buffer}>} records
 * @param {number} startIdx Index of the paragraph's PARA_HEADER.
 * @returns {{paragraph: ?string, tables: object[], nextIdx: number}}
 *   Trimmed text (null when empty), the tables found, and the index of the first
 *   record after the paragraph.
 */
function parseParagraphWithTables(records, startIdx) {
  const startLevel = records[startIdx].level;
  let ownText = "";
  let nestedText = "";
  const tables = [];
  let i = startIdx + 1;
  while (i < records.length) {
    const rec = records[i];
    if (rec.tagId === TAG_PARA_HEADER && rec.level <= startLevel) break;
    if (rec.tagId === TAG_PARA_TEXT) {
      const decoded = extractText(rec.data);
      if (rec.level <= startLevel + 1) {
        ownText = decoded;
      } else {
        nestedText = decoded;
      }
    }
    if (rec.tagId === TAG_CTRL_HEADER && rec.data.length >= 4) {
      const ctrlId = rec.data.subarray(0, 4).toString("ascii");
      if (ctrlId === " lbt" || ctrlId === "tbl ") {
        const { table, nextIdx } = parseTableBlock(records, i);
        if (table) tables.push(table);
        i = nextIdx;
        continue;
      }
    }
    i++;
  }
  const trimmed = ownText.trim() || nestedText.trim();
  return { paragraph: trimmed || null, tables, nextIdx: i };
}
562
/**
 * Parses the records of one table control into a table grid.
 *
 * Scanning starts after the CTRL_HEADER at `startIdx` and ends at the next
 * PARA_HEADER or CTRL_HEADER whose level is not deeper than the table's. The
 * TABLE record supplies the row and column counts (16-bit values at offsets 4
 * and 6); every LIST_HEADER starts one cell.
 *
 * @param {Array<{tagId: number, level: number, data: Buffer}>} records
 * @param {number} startIdx Index of the table's CTRL_HEADER.
 * @returns {{table: ?object, nextIdx: number}} The built table, or null when the
 *   counts or the cells are missing, plus the index where scanning stopped.
 */
function parseTableBlock(records, startIdx) {
  const tableLevel = records[startIdx].level;
  const cells = [];
  let rowCount = 0;
  let colCount = 0;
  let cursor = startIdx + 1;

  while (cursor < records.length) {
    const rec = records[cursor];
    const closesTable = rec.level <= tableLevel && (rec.tagId === TAG_PARA_HEADER || rec.tagId === TAG_CTRL_HEADER);
    if (closesTable) break;

    if (rec.tagId === TAG_TABLE && rec.data.length >= 8) {
      rowCount = rec.data.readUInt16LE(4);
      colCount = rec.data.readUInt16LE(6);
    }
    if (rec.tagId === TAG_LIST_HEADER) {
      const parsed = parseCellBlock(records, cursor, tableLevel);
      if (parsed.cell) cells.push(parsed.cell);
      cursor = parsed.nextIdx;
      continue;
    }
    cursor += 1;
  }

  const incomplete = rowCount === 0 || colCount === 0 || cells.length === 0;
  if (incomplete) return { table: null, nextIdx: cursor };
  return { table: buildTable(arrangeCells(rowCount, colCount, cells)), nextIdx: cursor };
}
587
/**
 * Collects the text of one table cell.
 *
 * Scanning starts after the LIST_HEADER at `startIdx` and ends at the next
 * LIST_HEADER that is not nested deeper than this one, or at a PARA_HEADER or
 * CTRL_HEADER whose level is not deeper than the table's. Each non-empty
 * PARA_TEXT becomes one line of the cell. Spans are always reported as 1x1.
 *
 * @param {Array<{tagId: number, level: number, data: Buffer}>} records
 * @param {number} startIdx Index of the cell's LIST_HEADER.
 * @param {number} tableLevel Level of the enclosing table's CTRL_HEADER.
 * @returns {{cell: {text: string, colSpan: number, rowSpan: number}, nextIdx: number}}
 */
function parseCellBlock(records, startIdx, tableLevel) {
  const cellLevel = records[startIdx].level;
  const paragraphs = [];
  let cursor = startIdx + 1;
  for (; cursor < records.length; cursor++) {
    const rec = records[cursor];
    if (rec.tagId === TAG_LIST_HEADER && rec.level <= cellLevel) break;
    const closesTable = rec.level <= tableLevel && (rec.tagId === TAG_PARA_HEADER || rec.tagId === TAG_CTRL_HEADER);
    if (closesTable) break;
    if (rec.tagId !== TAG_PARA_TEXT) continue;
    const line = extractText(rec.data).trim();
    if (line) paragraphs.push(line);
  }
  const cell = { text: paragraphs.join("\n"), colSpan: 1, rowSpan: 1 };
  return { cell, nextIdx: cursor };
}
603
/**
 * Distributes a flat list of cells over a rows x cols grid in row-major order.
 *
 * Each cell takes the next free position; the other positions under its span are
 * filled with empty 1x1 placeholders so later cells skip them. Positions still
 * unfilled when the cells run out also receive placeholders.
 *
 * @param {number} rows Number of grid rows.
 * @param {number} cols Number of grid columns.
 * @param {Array<{text: string, colSpan: number, rowSpan: number}>} cells Cells in reading order.
 * @returns {Array<Array<{text: string, colSpan: number, rowSpan: number}>>} Grid rows.
 */
function arrangeCells(rows, cols, cells) {
  const blank = () => ({ text: "", colSpan: 1, rowSpan: 1 });
  const grid = Array.from({ length: rows }, () => new Array(cols).fill(null));
  let next = 0;
  for (let r = 0; r < rows && next < cells.length; r++) {
    for (let c = 0; c < cols && next < cells.length; c++) {
      if (grid[r][c] !== null) continue;
      const cell = cells[next];
      next += 1;
      grid[r][c] = cell;
      // Reserve the remainder of the span, clipped to the grid.
      const rowEnd = Math.min(r + cell.rowSpan, rows);
      const colEnd = Math.min(c + cell.colSpan, cols);
      for (let rr = r; rr < rowEnd; rr++) {
        for (let cc = c; cc < colEnd; cc++) {
          if (rr === r && cc === c) continue;
          grid[rr][cc] = blank();
        }
      }
    }
  }
  return grid.map((row) => row.map((slot) => slot || blank()));
}
622
+
623
+ // src/pdf/parser.ts
624
+ import { createRequire as createRequire2 } from "module";
625
+ import { pathToFileURL } from "url";
626
// Cached pdf.js module; stays null until a load has fully succeeded.
var pdfjsModule = null;

/**
 * Lazily loads pdf.js (legacy build) and points its worker at the bundled
 * worker file resolved relative to this module.
 *
 * The module is cached only after the worker path has been configured, so a
 * failure part-way through can never leave a half-initialised module behind for
 * later calls to pick up.
 *
 * @returns {Promise<?object>} The pdf.js module, or null when it cannot be
 *   imported or its worker file cannot be resolved.
 */
async function loadPdfjs() {
  if (pdfjsModule) return pdfjsModule;
  try {
    const loaded = await import("pdfjs-dist/legacy/build/pdf.mjs");
    const req = createRequire2(import.meta.url);
    const workerPath = req.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs");
    loaded.GlobalWorkerOptions.workerSrc = pathToFileURL(workerPath).href;
    pdfjsModule = loaded;
    return pdfjsModule;
  } catch {
    // Deliberate: an absent or unusable pdf.js is reported to callers as null.
    return null;
  }
}
639
/**
 * Extracts the text of a PDF and renders it as Markdown.
 *
 * Every page's text items are grouped into lines. If the document averages
 * fewer than 10 non-whitespace characters per page it is reported as
 * image-based. Otherwise the pages are cleaned, joined with blank lines and
 * passed through table reconstruction.
 *
 * This function does not throw: all failures are reported through the result
 * object. The pdf.js document is destroyed on every exit path so its resources
 * are released.
 *
 * @param {ArrayBuffer|Uint8Array} buffer Raw PDF bytes.
 * @returns {Promise<{success: boolean, fileType: "pdf", pageCount: number, markdown?: string, isImageBased?: boolean, error?: string}>}
 */
async function parsePdfDocument(buffer) {
  const pdfjs = await loadPdfjs();
  if (!pdfjs) {
    return {
      success: false,
      fileType: "pdf",
      pageCount: 0,
      error: "pdfjs-dist\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4. npm install pdfjs-dist"
    };
  }
  let doc = null;
  try {
    const data = new Uint8Array(buffer);
    doc = await pdfjs.getDocument({
      data,
      useSystemFonts: true,
      disableFontFace: true,
      isEvalSupported: false,
      workerSrc: ""
    }).promise;
    const pageCount = doc.numPages;
    if (pageCount === 0) {
      return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
    }
    const pageTexts = [];
    let totalChars = 0;
    for (let i = 1; i <= pageCount; i++) {
      const page = await doc.getPage(i);
      const textContent = await page.getTextContent();
      const lines = groupTextItemsByLine(textContent.items);
      const pageText = lines.join("\n");
      totalChars += pageText.replace(/\s/g, "").length;
      pageTexts.push(pageText);
    }
    const avgCharsPerPage = totalChars / pageCount;
    if (avgCharsPerPage < 10) {
      return {
        success: false,
        fileType: "pdf",
        pageCount,
        isImageBased: true,
        error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF\uB85C \uCD94\uC815\uB429\uB2C8\uB2E4 (${pageCount}\uD398\uC774\uC9C0, \uCD94\uCD9C \uD14D\uC2A4\uD2B8 ${totalChars}\uC790).`
      };
    }
    let markdown = "";
    for (let i = 0; i < pageTexts.length; i++) {
      const cleaned = cleanPdfText(pageTexts[i]);
      if (cleaned.trim()) {
        if (i > 0 && markdown) markdown += "\n\n";
        markdown += cleaned;
      }
    }
    markdown = reconstructTables(markdown);
    return { success: true, fileType: "pdf", markdown, pageCount, isImageBased: false };
  } catch (err) {
    return {
      success: false,
      fileType: "pdf",
      pageCount: 0,
      error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328"
    };
  } finally {
    if (doc && typeof doc.destroy === "function") {
      try {
        await doc.destroy();
      } catch {
        // Cleanup is best-effort; the parse result has already been decided.
      }
    }
  }
}
701
/**
 * Groups positioned text items into lines of text.
 *
 * Items without non-blank string content are ignored. The rest are ordered top
 * to bottom by transform[5]; items whose y differs by less than 2 are ordered
 * left to right by transform[4]. A new line starts whenever an item's y differs
 * from the current line's y by more than max(item.height / 2, 2).
 *
 * @param {Array<{str?: string, transform: number[], width: number, height: number}>} items
 * @returns {string[]} One merged string per detected line.
 */
function groupTextItemsByLine(items) {
  if (items.length === 0) return [];
  const textItems = items.filter((item) => typeof item.str === "string" && item.str.trim() !== "");
  if (textItems.length === 0) return [];

  textItems.sort((a, b) => {
    const yDiff = b.transform[5] - a.transform[5];
    return Math.abs(yDiff) < 2 ? a.transform[4] - b.transform[4] : yDiff;
  });

  const lines = [];
  let pending = [];
  let lineY = textItems[0].transform[5];
  const flush = () => {
    if (pending.length > 0) lines.push(mergeLineItems(pending));
  };

  for (const item of textItems) {
    const y = item.transform[5];
    const tolerance = Math.max(item.height * 0.5, 2);
    if (Math.abs(lineY - y) > tolerance) {
      flush();
      pending = [];
      lineY = y;
    }
    pending.push({ text: item.str, x: item.transform[4], width: item.width });
  }
  flush();
  return lines;
}
727
// Marker written between text runs that are far enough apart to be table
// columns. A tab is used so that ordinary spaces inside a line are never taken
// for column breaks; reconstructTables consumes every marker it finds.
const PDF_COLUMN_SEPARATOR = "\t";

/**
 * Joins the text runs of one line from left to right.
 *
 * The horizontal gap between consecutive runs decides what goes between them: a
 * gap above 15 inserts the column separator, a gap above 3 inserts a single
 * space, and anything smaller joins the runs directly. The input array is left
 * untouched.
 *
 * @param {Array<{text: string, x: number, width: number}>} items Runs of one line.
 * @returns {string} The merged line ("" for an empty list).
 */
function mergeLineItems(items) {
  if (items.length <= 1) return items[0]?.text || "";
  const ordered = [...items].sort((a, b) => a.x - b.x);
  let result = ordered[0].text;
  for (let i = 1; i < ordered.length; i++) {
    const previous = ordered[i - 1];
    const gap = ordered[i].x - (previous.x + previous.width);
    if (gap > 15) result += PDF_COLUMN_SEPARATOR;
    else if (gap > 3) result += " ";
    result += ordered[i].text;
  }
  return result;
}

/**
 * Removes page furniture from one page of extracted text.
 *
 * In order: drops "- 12 -" style page-number lines, drops "3 / 10" style page
 * counters, re-joins lines broken between a Hangul character, "·", "," or "-"
 * and a following Hangul character or "(", and collapses three or more newlines
 * into two.
 *
 * @param {string} text Text of one page.
 * @returns {string} Cleaned, trimmed text.
 */
function cleanPdfText(text) {
  const withoutDashedPageNumbers = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "");
  const withoutPageCounters = withoutDashedPageNumbers.replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "");
  const rejoined = withoutPageCounters.replace(/([가-힣·,\-])\n([가-힣(])/g, "$1 $2");
  return rejoined.replace(/\n{3,}/g, "\n\n").trim();
}

/**
 * Turns runs of column-separated lines into Markdown tables.
 *
 * A line counts as a table row only when it contains the column separator
 * written by mergeLineItems, so ordinary prose is passed through unchanged. Two
 * or more consecutive rows become a Markdown table; a single isolated row is
 * emitted as its cells joined by " | ".
 *
 * @param {string} text Cleaned document text.
 * @returns {string} Text with the table runs replaced.
 */
function reconstructTables(text) {
  const result = [];
  let tableBuffer = [];
  const flushTable = () => {
    if (tableBuffer.length >= 2) result.push(formatAsMarkdownTable(tableBuffer));
    else if (tableBuffer.length === 1) result.push(tableBuffer[0].join(" | "));
    tableBuffer = [];
  };
  for (const line of text.split("\n")) {
    if (line.includes(PDF_COLUMN_SEPARATOR)) {
      tableBuffer.push(line.split(PDF_COLUMN_SEPARATOR).map((c) => c.trim()));
    } else {
      flushTable();
      result.push(line);
    }
  }
  flushTable();
  return result.join("\n");
}
760
/**
 * Formats rows of cell strings as a Markdown pipe table; the first row is the header.
 *
 * Short rows are padded with empty cells up to the widest row, and "|" inside a
 * cell is escaped as "\|" so it cannot introduce extra columns. The input rows
 * are not modified.
 *
 * @param {string[][]} rows Table rows.
 * @returns {string} Markdown table, or "" when there are no rows.
 */
function formatAsMarkdownTable(rows) {
  if (rows.length === 0) return "";
  const maxCols = Math.max(...rows.map((r) => r.length));
  const escapeCell = (cell) => String(cell).replace(/\|/g, "\\|");
  const normalized = rows.map(
    (r) => Array.from({ length: maxCols }, (_, col) => escapeCell(r[col] ?? ""))
  );
  const toLine = (cells) => "| " + cells.join(" | ") + " |";
  const [header, ...body] = normalized;
  return [toLine(header), toLine(header.map(() => "---")), ...body.map(toLine)].join("\n");
}
774
+
775
+ // src/index.ts
776
/**
 * Detects the format of a document and converts it with the matching parser.
 *
 * @param {ArrayBuffer|Uint8Array} buffer Raw file bytes.
 * @returns {Promise<object>} The parser's result object, or
 *   `{ success: false, fileType: "unknown", error }` for unrecognised data.
 */
async function parse(buffer) {
  const format = detectFormat(buffer);
  if (format === "hwpx") return parseHwpx(buffer);
  if (format === "hwp") return parseHwp(buffer);
  if (format === "pdf") return parsePdf(buffer);
  return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4." };
}
789
/**
 * Converts an HWPX document, reporting failures as a result object instead of throwing.
 *
 * @param {ArrayBuffer|Uint8Array} buffer Raw HWPX bytes.
 * @returns {Promise<{success: boolean, fileType: "hwpx", markdown?: string, error?: string}>}
 */
async function parseHwpx(buffer) {
  let markdown;
  try {
    markdown = await parseHwpxDocument(buffer);
  } catch (err) {
    const error = err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwpx", error };
  }
  return { success: true, fileType: "hwpx", markdown };
}
797
/**
 * Converts a binary HWP document, reporting failures as a result object instead of throwing.
 * The input is wrapped in a Node Buffer before parsing.
 *
 * @param {ArrayBuffer|Uint8Array} buffer Raw HWP bytes.
 * @returns {Promise<{success: boolean, fileType: "hwp", markdown?: string, error?: string}>}
 */
async function parseHwp(buffer) {
  let markdown;
  try {
    markdown = parseHwp5Document(Buffer.from(buffer));
  } catch (err) {
    const error = err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwp", error };
  }
  return { success: true, fileType: "hwp", markdown };
}
805
/**
 * Converts a PDF document by delegating to parsePdfDocument.
 *
 * @param {ArrayBuffer|Uint8Array} buffer Raw PDF bytes.
 * @returns {Promise<object>} The result object produced by parsePdfDocument.
 */
async function parsePdf(buffer) {
  const result = await parsePdfDocument(buffer);
  return result;
}
808
+
809
+ export {
810
+ __export,
811
+ detectFormat,
812
+ parse
813
+ };