kordoc 1.7.2 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,22 @@
1
1
  #!/usr/bin/env node
2
+ import {
3
+ KordocError,
4
+ classifyError,
5
+ isPathTraversal,
6
+ precheckZipSize,
7
+ sanitizeHref,
8
+ toArrayBuffer
9
+ } from "./chunk-AHW56LNX.js";
10
+ import {
11
+ parsePageRange
12
+ } from "./chunk-MOL7MDBG.js";
2
13
 
3
14
  // src/detect.ts
15
+ import JSZip from "jszip";
4
16
  function magicBytes(buffer) {
5
17
  return new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
6
18
  }
7
- function isHwpxFile(buffer) {
19
+ function isZipFile(buffer) {
8
20
  const b = magicBytes(buffer);
9
21
  return b[0] === 80 && b[1] === 75 && b[2] === 3 && b[3] === 4;
10
22
  }
@@ -18,15 +30,28 @@ function isPdfFile(buffer) {
18
30
  }
19
31
  function detectFormat(buffer) {
20
32
  if (buffer.byteLength < 4) return "unknown";
21
- if (isHwpxFile(buffer)) return "hwpx";
33
+ if (isZipFile(buffer)) return "hwpx";
22
34
  if (isOldHwpFile(buffer)) return "hwp";
23
35
  if (isPdfFile(buffer)) return "pdf";
24
36
  return "unknown";
25
37
  }
38
+ async function detectZipFormat(buffer) {
39
+ try {
40
+ const zip = await JSZip.loadAsync(buffer);
41
+ if (zip.file("xl/workbook.xml")) return "xlsx";
42
+ if (zip.file("word/document.xml")) return "docx";
43
+ if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
44
+ const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
45
+ if (hasSection) return "hwpx";
46
+ return "unknown";
47
+ } catch {
48
+ return "unknown";
49
+ }
50
+ }
26
51
 
27
52
  // src/table/builder.ts
28
53
  var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
29
- function sanitizeHref(href) {
54
+ function sanitizeHref2(href) {
30
55
  const trimmed = href.trim();
31
56
  if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
32
57
  return trimmed;
@@ -36,23 +61,24 @@ var MAX_ROWS = 1e4;
36
61
  function buildTable(rows) {
37
62
  if (rows.length > MAX_ROWS) rows = rows.slice(0, MAX_ROWS);
38
63
  const numRows = rows.length;
39
- const tempOccupied = /* @__PURE__ */ new Set();
64
+ const hasAddr = rows.some((row) => row.some((c) => c.colAddr !== void 0 && c.rowAddr !== void 0));
65
+ if (hasAddr) return buildTableDirect(rows, numRows);
40
66
  let maxCols = 0;
67
+ const tempOccupied = Array.from({ length: numRows }, () => []);
41
68
  for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
42
69
  let colIdx = 0;
43
70
  for (const cell of rows[rowIdx]) {
44
- while (colIdx < MAX_COLS && tempOccupied.has(rowIdx * MAX_COLS + colIdx)) colIdx++;
71
+ while (colIdx < MAX_COLS && tempOccupied[rowIdx][colIdx]) colIdx++;
45
72
  if (colIdx >= MAX_COLS) break;
46
73
  for (let r = rowIdx; r < Math.min(rowIdx + cell.rowSpan, numRows); r++) {
47
74
  for (let c = colIdx; c < Math.min(colIdx + cell.colSpan, MAX_COLS); c++) {
48
- tempOccupied.add(r * MAX_COLS + c);
75
+ tempOccupied[r][c] = true;
49
76
  }
50
77
  }
51
78
  colIdx += cell.colSpan;
52
79
  if (colIdx > maxCols) maxCols = colIdx;
53
80
  }
54
81
  }
55
- tempOccupied.clear();
56
82
  if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
57
83
  const grid = Array.from(
58
84
  { length: numRows },
@@ -80,6 +106,50 @@ function buildTable(rows) {
80
106
  cellIdx++;
81
107
  }
82
108
  }
109
+ return trimAndReturn(grid, numRows, maxCols);
110
+ }
111
+ function buildTableDirect(rows, numRows) {
112
+ let maxCols = 0;
113
+ for (const row of rows) {
114
+ for (const cell of row) {
115
+ const end = (cell.colAddr ?? 0) + cell.colSpan;
116
+ if (end > maxCols) maxCols = end;
117
+ }
118
+ }
119
+ if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
120
+ const grid = Array.from(
121
+ { length: numRows },
122
+ () => Array.from({ length: maxCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
123
+ );
124
+ for (const row of rows) {
125
+ for (const cell of row) {
126
+ const r = cell.rowAddr ?? 0;
127
+ const c = cell.colAddr ?? 0;
128
+ if (r >= numRows || c >= maxCols) continue;
129
+ grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
130
+ for (let dr = 0; dr < cell.rowSpan; dr++) {
131
+ for (let dc = 0; dc < cell.colSpan; dc++) {
132
+ if (dr === 0 && dc === 0) continue;
133
+ if (r + dr < numRows && c + dc < maxCols) {
134
+ grid[r + dr][c + dc] = { text: "", colSpan: 1, rowSpan: 1 };
135
+ }
136
+ }
137
+ }
138
+ }
139
+ }
140
+ return trimAndReturn(grid, numRows, maxCols);
141
+ }
142
+ function trimAndReturn(grid, numRows, maxCols) {
143
+ let effectiveCols = maxCols;
144
+ while (effectiveCols > 0) {
145
+ const colEmpty = grid.every((row) => !row[effectiveCols - 1]?.text?.trim());
146
+ if (!colEmpty) break;
147
+ effectiveCols--;
148
+ }
149
+ if (effectiveCols < maxCols && effectiveCols > 0) {
150
+ const trimmed = grid.map((row) => row.slice(0, effectiveCols));
151
+ return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
152
+ }
83
153
  return { rows: numRows, cols: maxCols, cells: grid, hasHeader: numRows > 1 };
84
154
  }
85
155
  function convertTableToText(rows) {
@@ -87,13 +157,26 @@ function convertTableToText(rows) {
87
157
  (row) => row.map((c) => c.text.trim().replace(/\n/g, " ")).filter(Boolean).join(" | ")
88
158
  ).filter(Boolean).join("\n");
89
159
  }
160
+ var HWP_SHAPE_ALT_TEXT_RE = /(?:모서리가 둥근 |둥근 )?(?:사각형|직사각형|정사각형|원|타원|삼각형|이등변 삼각형|직각 삼각형|선|직선|곡선|화살표|굵은 화살표|이중 화살표|오각형|육각형|팔각형|별|[4-8]점별|십자|십자형|구름|구름형|마름모|도넛|평행사변형|사다리꼴|부채꼴|호|반원|물결|번개|하트|빗금|블록 화살표|수식|표|그림|개체|그리기\s?개체|묶음\s?개체|글상자|수식\s?개체|OLE\s?개체)\s?입니다\.?/g;
161
+ function sanitizeText(text) {
162
+ let result = text.replace(/[\u{F0000}-\u{FFFFD}]/gu, "").replace(HWP_SHAPE_ALT_TEXT_RE, "").replace(/ +/g, " ").trim();
163
+ if (result.length <= 30 && result.includes(" ")) {
164
+ const tokens = result.split(" ");
165
+ const koreanSingleCharCount = tokens.filter((t) => t.length === 1 && /[\uAC00-\uD7AF\u3131-\u318E]/.test(t)).length;
166
+ if (tokens.length >= 3 && koreanSingleCharCount / tokens.length >= 0.7) {
167
+ result = tokens.join("");
168
+ }
169
+ }
170
+ return result;
171
+ }
90
172
  function blocksToMarkdown(blocks) {
91
173
  const lines = [];
92
174
  for (let i = 0; i < blocks.length; i++) {
93
175
  const block = blocks[i];
94
176
  if (block.type === "heading" && block.text) {
95
177
  const prefix = "#".repeat(Math.min(block.level || 2, 6));
96
- lines.push("", `${prefix} ${block.text}`, "");
178
+ const headingText = sanitizeText(block.text);
179
+ if (headingText) lines.push("", `${prefix} ${headingText}`, "");
97
180
  continue;
98
181
  }
99
182
  if (block.type === "image" && block.text) {
@@ -105,9 +188,11 @@ function blocksToMarkdown(blocks) {
105
188
  continue;
106
189
  }
107
190
  if (block.type === "list" && block.text) {
108
- const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(block.text);
191
+ const listText = sanitizeText(block.text);
192
+ if (!listText) continue;
193
+ const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(listText);
109
194
  const prefix = alreadyNumbered ? "" : block.listType === "ordered" ? "1. " : "- ";
110
- lines.push(`${prefix}${block.text}`);
195
+ lines.push(`${prefix}${listText}`);
111
196
  if (block.children) {
112
197
  for (const child of block.children) {
113
198
  const childPrefix = child.listType === "ordered" ? "1." : "-";
@@ -117,7 +202,8 @@ function blocksToMarkdown(blocks) {
117
202
  continue;
118
203
  }
119
204
  if (block.type === "paragraph" && block.text) {
120
- let text = block.text;
205
+ let text = sanitizeText(block.text);
206
+ if (!text) continue;
121
207
  if (/^\[별표\s*\d+/.test(text)) {
122
208
  const nextBlock = blocks[i + 1];
123
209
  if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
@@ -133,7 +219,7 @@ function blocksToMarkdown(blocks) {
133
219
  continue;
134
220
  }
135
221
  if (block.href) {
136
- const href = sanitizeHref(block.href);
222
+ const href = sanitizeHref2(block.href);
137
223
  if (href) text = `[${text}](${href})`;
138
224
  }
139
225
  if (block.footnoteText) {
@@ -154,7 +240,7 @@ function tableToMarkdown(table) {
154
240
  if (table.rows === 0 || table.cols === 0) return "";
155
241
  const { cells, rows: numRows, cols: numCols } = table;
156
242
  if (numRows === 1 && numCols === 1) {
157
- const content = cells[0][0].text;
243
+ const content = sanitizeText(cells[0][0].text);
158
244
  return content.split(/\n/).map((line) => {
159
245
  const trimmed = line.trim();
160
246
  if (!trimmed) return "";
@@ -163,13 +249,17 @@ function tableToMarkdown(table) {
163
249
  return trimmed;
164
250
  }).filter(Boolean).join("\n");
165
251
  }
252
+ if (numCols === 1 && numRows >= 2) {
253
+ return cells.map((row) => sanitizeText(row[0].text).replace(/\n/g, " ")).filter(Boolean).join("\n");
254
+ }
166
255
  const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
167
256
  const skip = /* @__PURE__ */ new Set();
168
257
  for (let r = 0; r < numRows; r++) {
169
258
  for (let c = 0; c < numCols; c++) {
170
259
  if (skip.has(`${r},${c}`)) continue;
171
- const cell = cells[r][c];
172
- display[r][c] = cell.text.replace(/\n/g, "<br>");
260
+ const cell = cells[r]?.[c];
261
+ if (!cell) continue;
262
+ display[r][c] = sanitizeText(cell.text).replace(/\n/g, "<br>");
173
263
  for (let dr = 0; dr < cell.rowSpan; dr++) {
174
264
  for (let dc = 0; dc < cell.colSpan; dc++) {
175
265
  if (dr === 0 && dc === 0) continue;
@@ -178,12 +268,28 @@ function tableToMarkdown(table) {
178
268
  }
179
269
  }
180
270
  }
271
+ c += cell.colSpan - 1;
181
272
  }
182
273
  }
183
274
  const uniqueRows = [];
184
- for (const row of display) {
275
+ let pendingFirstCol = "";
276
+ for (let r = 0; r < display.length; r++) {
277
+ const row = display[r];
185
278
  const isEmptyPlaceholder = row.every((cell) => cell === "");
186
- if (!isEmptyPlaceholder) uniqueRows.push(row);
279
+ if (isEmptyPlaceholder) continue;
280
+ const hasSkippedCols = row.some((cell, c) => cell === "" && skip.has(`${r},${c}`));
281
+ const nonEmptyCols = row.filter((cell) => cell !== "");
282
+ if (!hasSkippedCols && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
283
+ pendingFirstCol = row[0];
284
+ continue;
285
+ }
286
+ if (pendingFirstCol && row[0] === "") {
287
+ row[0] = pendingFirstCol;
288
+ pendingFirstCol = "";
289
+ } else {
290
+ pendingFirstCol = "";
291
+ }
292
+ uniqueRows.push(row);
187
293
  }
188
294
  if (uniqueRows.length === 0) return "";
189
295
  const md = [];
@@ -195,75 +301,15 @@ function tableToMarkdown(table) {
195
301
  return md.join("\n");
196
302
  }
197
303
 
198
- // src/utils.ts
199
- var VERSION = true ? "1.7.2" : "0.0.0-dev";
200
- function toArrayBuffer(buf) {
201
- if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
202
- return buf.buffer;
203
- }
204
- return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
205
- }
206
- var KordocError = class extends Error {
207
- constructor(message) {
208
- super(message);
209
- this.name = "KordocError";
210
- }
211
- };
212
- function sanitizeError(err) {
213
- if (err instanceof KordocError) return err.message;
214
- return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
215
- }
216
- function isPathTraversal(name) {
217
- if (name.includes("\0")) return true;
218
- const normalized = name.replace(/\\/g, "/");
219
- return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
220
- }
221
- function classifyError(err) {
222
- if (!(err instanceof Error)) return "PARSE_ERROR";
223
- const msg = err.message;
224
- if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
225
- if (msg.includes("DRM")) return "DRM_PROTECTED";
226
- if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
227
- if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
228
- if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
229
- if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
230
- if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
231
- return "PARSE_ERROR";
232
- }
233
-
234
304
  // src/hwpx/parser.ts
235
- import JSZip from "jszip";
305
+ import JSZip2 from "jszip";
236
306
  import { inflateRawSync } from "zlib";
237
307
  import { DOMParser } from "@xmldom/xmldom";
238
308
 
239
- // src/page-range.ts
240
- function parsePageRange(spec, maxPages) {
241
- const result = /* @__PURE__ */ new Set();
242
- if (maxPages <= 0) return result;
243
- if (Array.isArray(spec)) {
244
- for (const n of spec) {
245
- const page = Math.round(n);
246
- if (page >= 1 && page <= maxPages) result.add(page);
247
- }
248
- return result;
249
- }
250
- if (typeof spec !== "string" || spec.trim() === "") return result;
251
- const parts = spec.split(",");
252
- for (const part of parts) {
253
- const trimmed = part.trim();
254
- if (!trimmed) continue;
255
- const rangeMatch = trimmed.match(/^(\d+)\s*-\s*(\d+)$/);
256
- if (rangeMatch) {
257
- const start = Math.max(1, parseInt(rangeMatch[1], 10));
258
- const end = Math.min(maxPages, parseInt(rangeMatch[2], 10));
259
- for (let i = start; i <= end; i++) result.add(i);
260
- } else {
261
- const page = parseInt(trimmed, 10);
262
- if (!isNaN(page) && page >= 1 && page <= maxPages) result.add(page);
263
- }
264
- }
265
- return result;
266
- }
309
+ // src/types.ts
310
+ var HEADING_RATIO_H1 = 1.5;
311
+ var HEADING_RATIO_H2 = 1.3;
312
+ var HEADING_RATIO_H3 = 1.15;
267
313
 
268
314
  // src/hwpx/parser.ts
269
315
  var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
@@ -357,16 +403,10 @@ function stripDtd(xml) {
357
403
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
358
404
  }
359
405
  async function parseHwpxDocument(buffer, options) {
360
- const precheck = precheckZipSize(buffer);
361
- if (precheck.totalUncompressed > MAX_DECOMPRESS_SIZE) {
362
- throw new KordocError("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
363
- }
364
- if (precheck.entryCount > MAX_ZIP_ENTRIES) {
365
- throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
366
- }
406
+ precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
367
407
  let zip;
368
408
  try {
369
- zip = await JSZip.loadAsync(buffer);
409
+ zip = await JSZip2.loadAsync(buffer);
370
410
  } catch {
371
411
  return extractFromBrokenZip(buffer);
372
412
  }
@@ -529,7 +569,7 @@ function parseDublinCoreMetadata(xml, metadata) {
529
569
  async function extractHwpxMetadataOnly(buffer) {
530
570
  let zip;
531
571
  try {
532
- zip = await JSZip.loadAsync(buffer);
572
+ zip = await JSZip2.loadAsync(buffer);
533
573
  } catch {
534
574
  throw new KordocError("HWPX ZIP\uC744 \uC5F4 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
535
575
  }
@@ -539,46 +579,17 @@ async function extractHwpxMetadataOnly(buffer) {
539
579
  metadata.pageCount = sectionPaths.length;
540
580
  return metadata;
541
581
  }
542
- function precheckZipSize(buffer) {
543
- try {
544
- const data = new DataView(buffer);
545
- const len = buffer.byteLength;
546
- if (len < 22) return { totalUncompressed: 0, entryCount: 0 };
547
- const searchStart = Math.max(0, len - 22 - 65535);
548
- let eocdOffset = -1;
549
- for (let i = len - 22; i >= searchStart; i--) {
550
- if (data.getUint32(i, true) === 101010256) {
551
- eocdOffset = i;
552
- break;
553
- }
554
- }
555
- if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
556
- const entryCount = data.getUint16(eocdOffset + 10, true);
557
- const cdSize = data.getUint32(eocdOffset + 12, true);
558
- const cdOffset = data.getUint32(eocdOffset + 16, true);
559
- if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
560
- let totalUncompressed = 0;
561
- let pos = cdOffset;
562
- for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
563
- if (data.getUint32(pos, true) !== 33639248) break;
564
- totalUncompressed += data.getUint32(pos + 24, true);
565
- const nameLen = data.getUint16(pos + 28, true);
566
- const extraLen = data.getUint16(pos + 30, true);
567
- const commentLen = data.getUint16(pos + 32, true);
568
- pos += 46 + nameLen + extraLen + commentLen;
569
- }
570
- return { totalUncompressed, entryCount };
571
- } catch {
572
- return { totalUncompressed: 0, entryCount: 0 };
573
- }
574
- }
575
582
  function extractFromBrokenZip(buffer) {
576
583
  const data = new Uint8Array(buffer);
577
584
  const view = new DataView(buffer);
578
585
  let pos = 0;
579
586
  const blocks = [];
587
+ const warnings = [
588
+ { code: "BROKEN_ZIP_RECOVERY", message: "\uC190\uC0C1\uB41C ZIP \uAD6C\uC870 \u2014 Local File Header \uAE30\uBC18 \uBCF5\uAD6C \uBAA8\uB4DC" }
589
+ ];
580
590
  let totalDecompressed = 0;
581
591
  let entryCount = 0;
592
+ let sectionNum = 0;
582
593
  while (pos < data.length - 30) {
583
594
  if (data[pos] !== 80 || data[pos + 1] !== 75 || data[pos + 2] !== 3 || data[pos + 3] !== 4) {
584
595
  pos++;
@@ -624,14 +635,15 @@ function extractFromBrokenZip(buffer) {
624
635
  }
625
636
  totalDecompressed += content.length * 2;
626
637
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
627
- blocks.push(...parseSectionXml(content));
638
+ sectionNum++;
639
+ blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum));
628
640
  } catch {
629
641
  continue;
630
642
  }
631
643
  }
632
644
  if (blocks.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
633
645
  const markdown = blocksToMarkdown(blocks);
634
- return { markdown, blocks };
646
+ return { markdown, blocks, warnings: warnings.length > 0 ? warnings : void 0 };
635
647
  }
636
648
  async function resolveSectionPaths(zip) {
637
649
  const manifestPaths = ["Contents/content.hpf", "content.hpf"];
@@ -695,9 +707,9 @@ function detectHwpxHeadings(blocks, styleMap) {
695
707
  let level = 0;
696
708
  if (baseFontSize > 0 && block.style?.fontSize) {
697
709
  const ratio = block.style.fontSize / baseFontSize;
698
- if (ratio >= 1.5) level = 1;
699
- else if (ratio >= 1.3) level = 2;
700
- else if (ratio >= 1.15) level = 3;
710
+ if (ratio >= HEADING_RATIO_H1) level = 1;
711
+ else if (ratio >= HEADING_RATIO_H2) level = 2;
712
+ else if (ratio >= HEADING_RATIO_H3) level = 3;
701
713
  }
702
714
  if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
703
715
  if (level === 0) level = 3;
@@ -783,6 +795,14 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
783
795
  }
784
796
  }
785
797
  break;
798
+ case "cellAddr":
799
+ if (tableCtx?.cell) {
800
+ const ca = parseInt(el.getAttribute("colAddr") || "", 10);
801
+ const ra = parseInt(el.getAttribute("rowAddr") || "", 10);
802
+ if (!isNaN(ca)) tableCtx.cell.colAddr = ca;
803
+ if (!isNaN(ra)) tableCtx.cell.rowAddr = ra;
804
+ }
805
+ break;
786
806
  case "cellSpan":
787
807
  if (tableCtx?.cell) {
788
808
  const cs = parseInt(el.getAttribute("colSpan") || "1", 10);
@@ -829,39 +849,47 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
829
849
  if (depth > MAX_XML_DEPTH) return tableCtx;
830
850
  const children = node.childNodes;
831
851
  if (!children) return tableCtx;
832
- for (let i = 0; i < children.length; i++) {
833
- const el = children[i];
834
- if (el.nodeType !== 1) continue;
835
- const tag = el.tagName || el.localName || "";
836
- const localTag = tag.replace(/^[^:]+:/, "");
837
- if (localTag === "tbl") {
838
- if (tableCtx) tableStack.push(tableCtx);
839
- const newTable = { rows: [], currentRow: [], cell: null };
840
- walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum, depth + 1);
841
- if (newTable.rows.length > 0) {
842
- if (tableStack.length > 0) {
843
- const parentTable = tableStack.pop();
844
- const nestedText = convertTableToText(newTable.rows);
845
- if (parentTable.cell) {
846
- parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
852
+ const walkChildren = (parent, d) => {
853
+ if (d > MAX_XML_DEPTH) return;
854
+ const kids = parent.childNodes;
855
+ if (!kids) return;
856
+ for (let i = 0; i < kids.length; i++) {
857
+ const el = kids[i];
858
+ if (el.nodeType !== 1) continue;
859
+ const tag = el.tagName || el.localName || "";
860
+ const localTag = tag.replace(/^[^:]+:/, "");
861
+ if (localTag === "tbl") {
862
+ if (tableCtx) tableStack.push(tableCtx);
863
+ const newTable = { rows: [], currentRow: [], cell: null };
864
+ walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum, d + 1);
865
+ if (newTable.rows.length > 0) {
866
+ if (tableStack.length > 0) {
867
+ const parentTable = tableStack.pop();
868
+ const nestedText = convertTableToText(newTable.rows);
869
+ if (parentTable.cell) {
870
+ parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
871
+ }
872
+ tableCtx = parentTable;
873
+ } else {
874
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
875
+ tableCtx = null;
847
876
  }
848
- tableCtx = parentTable;
849
877
  } else {
850
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
851
- tableCtx = null;
878
+ tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
852
879
  }
853
- } else {
854
- tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
855
- }
856
- } else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
857
- const imgRef = extractImageRef(el);
858
- if (imgRef) {
859
- blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
860
- } else if (warnings && sectionNum) {
861
- warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
880
+ } else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
881
+ const imgRef = extractImageRef(el);
882
+ if (imgRef) {
883
+ blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
884
+ } else if (warnings && sectionNum) {
885
+ warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
886
+ }
887
+ } else if (localTag === "r" || localTag === "run" || localTag === "ctrl") {
888
+ walkChildren(el, d + 1);
862
889
  }
863
890
  }
864
- }
891
+ };
892
+ walkChildren(node, depth);
865
893
  return tableCtx;
866
894
  }
867
895
  function extractParagraphInfo(para, styleMap) {
@@ -900,7 +928,10 @@ function extractParagraphInfo(para, styleMap) {
900
928
  // 하이퍼링크
901
929
  case "hyperlink": {
902
930
  const url = child.getAttribute("url") || child.getAttribute("href") || "";
903
- if (url) href = url;
931
+ if (url) {
932
+ const safe = sanitizeHref(url);
933
+ if (safe) href = safe;
934
+ }
904
935
  walk(child);
905
936
  break;
906
937
  }
@@ -913,6 +944,29 @@ function extractParagraphInfo(para, styleMap) {
913
944
  if (noteText) footnote = (footnote ? footnote + "; " : "") + noteText;
914
945
  break;
915
946
  }
947
+ // 제어 요소 — 필드, 컨트롤, 매개변수 등 스킵
948
+ case "ctrl":
949
+ case "fieldBegin":
950
+ case "fieldEnd":
951
+ case "parameters":
952
+ case "stringParam":
953
+ case "integerParam":
954
+ case "boolParam":
955
+ case "floatParam":
956
+ case "secPr":
957
+ // 섹션 속성 (페이지 설정 등)
958
+ case "colPr":
959
+ // 다단 속성
960
+ case "linesegarray":
961
+ case "lineseg":
962
+ // 레이아웃 정보
963
+ // 도형/이미지 요소 — 대체텍스트("사각형입니다." 등) 누출 방지
964
+ case "pic":
965
+ case "shape":
966
+ case "drawingObject":
967
+ case "shapeComment":
968
+ case "drawText":
969
+ break;
916
970
  // run 요소에서 charPrIDRef 추출
917
971
  case "r": {
918
972
  const runCharPr = child.getAttribute("charPrIDRef");
@@ -927,7 +981,10 @@ function extractParagraphInfo(para, styleMap) {
927
981
  }
928
982
  };
929
983
  walk(para);
930
- const cleanText = text.replace(/[ \t]+/g, " ").trim();
984
+ let cleanText = text.replace(/[ \t]+/g, " ").trim();
985
+ if (/^그림입니다\.?\s*원본\s*그림의\s*(이름|크기)/.test(cleanText)) cleanText = "";
986
+ cleanText = cleanText.replace(/그림입니다\.?\s*원본\s*그림의\s*(이름|크기)[^\n]*(\n[^\n]*원본\s*그림의\s*(이름|크기)[^\n]*)*/g, "").trim();
987
+ cleanText = cleanText.replace(/(?:모서리가 둥근 |둥근 )?(?:사각형|직사각형|정사각형|원|타원|삼각형|선|직선|곡선|화살표|오각형|육각형|팔각형|별|십자|구름|마름모|도넛|평행사변형|사다리꼴|개체|그리기\s?개체|묶음\s?개체|글상자|수식|표|그림|OLE\s?개체)\s?입니다\.?/g, "").trim();
931
988
  let style;
932
989
  if (styleMap && charPrId) {
933
990
  const charProp = styleMap.charProperties.get(charPrId);
@@ -1205,9 +1262,9 @@ function detectHwp5Headings(blocks, docInfo) {
1205
1262
  if (/^\d+$/.test(text)) continue;
1206
1263
  const ratio = block.style.fontSize / baseFontSize;
1207
1264
  let level = 0;
1208
- if (ratio >= 1.5) level = 1;
1209
- else if (ratio >= 1.3) level = 2;
1210
- else if (ratio >= 1.15) level = 3;
1265
+ if (ratio >= HEADING_RATIO_H1) level = 1;
1266
+ else if (ratio >= HEADING_RATIO_H2) level = 2;
1267
+ else if (ratio >= HEADING_RATIO_H3) level = 3;
1211
1268
  if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
1212
1269
  if (level === 0) level = 3;
1213
1270
  }
@@ -1308,20 +1365,22 @@ function detectImageMime(data) {
1308
1365
  }
1309
1366
  function extractHwp5Images(cfb, blocks, compressed, warnings) {
1310
1367
  const binDataMap = /* @__PURE__ */ new Map();
1311
- for (let idx = 0; idx < 1e4; idx++) {
1312
- const entry = CFB.find(cfb, `/BinData/BIN${String(idx).padStart(4, "0")}`) || CFB.find(cfb, `/BinData/Bin${String(idx).padStart(4, "0")}`);
1313
- if (!entry?.content) {
1314
- if (idx > 0) break;
1315
- continue;
1316
- }
1317
- let data = Buffer.from(entry.content);
1318
- if (compressed) {
1319
- try {
1320
- data = decompressStream(data);
1321
- } catch {
1368
+ const binDataRe = /\/BinData\/[Bb][Ii][Nn](\d{4})$/;
1369
+ if (cfb.FileIndex) {
1370
+ for (const entry of cfb.FileIndex) {
1371
+ if (!entry?.name || !entry.content) continue;
1372
+ const match = entry.name.match(binDataRe);
1373
+ if (!match) continue;
1374
+ const idx = parseInt(match[1], 10);
1375
+ let data = Buffer.from(entry.content);
1376
+ if (compressed) {
1377
+ try {
1378
+ data = decompressStream(data);
1379
+ } catch {
1380
+ }
1322
1381
  }
1382
+ binDataMap.set(idx, { data, name: entry.name });
1323
1383
  }
1324
- binDataMap.set(idx, { data, name: entry.name || `BIN${idx}` });
1325
1384
  }
1326
1385
  if (binDataMap.size === 0) return [];
1327
1386
  const images = [];
@@ -1468,6 +1527,16 @@ function parseTableBlock(records, startIdx) {
1468
1527
  i++;
1469
1528
  }
1470
1529
  if (rows === 0 || cols === 0 || cells.length === 0) return { table: null, nextIdx: i };
1530
+ const hasAddr = cells.some((c) => c.colAddr !== void 0 && c.rowAddr !== void 0);
1531
+ if (hasAddr) {
1532
+ const cellRows2 = arrangeCells(rows, cols, cells);
1533
+ const irCells = cellRows2.map((row) => row.map((c) => ({
1534
+ text: c.text.trim(),
1535
+ colSpan: c.colSpan,
1536
+ rowSpan: c.rowSpan
1537
+ })));
1538
+ return { table: { rows, cols, cells: irCells, hasHeader: rows > 1 }, nextIdx: i };
1539
+ }
1471
1540
  const cellRows = arrangeCells(rows, cols, cells);
1472
1541
  return { table: buildTable(cellRows), nextIdx: i };
1473
1542
  }
@@ -1731,7 +1800,36 @@ function buildTableGrids(horizontals, verticals) {
1731
1800
  };
1732
1801
  grids.push({ rowYs, colXs, bbox });
1733
1802
  }
1734
- return grids;
1803
+ return mergeAdjacentGrids(grids);
1804
+ }
1805
+ function mergeAdjacentGrids(grids) {
1806
+ if (grids.length <= 1) return grids;
1807
+ const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
1808
+ const merged = [sorted[0]];
1809
+ for (let i = 1; i < sorted.length; i++) {
1810
+ const prev = merged[merged.length - 1];
1811
+ const curr = sorted[i];
1812
+ if (prev.colXs.length === curr.colXs.length) {
1813
+ const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= COORD_MERGE_TOL * 3);
1814
+ const verticalGap = prev.bbox.y1 - curr.bbox.y2;
1815
+ if (colMatch && verticalGap >= -COORD_MERGE_TOL && verticalGap <= 20) {
1816
+ const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
1817
+ merged[merged.length - 1] = {
1818
+ rowYs: allRowYs,
1819
+ colXs: prev.colXs,
1820
+ bbox: {
1821
+ x1: Math.min(prev.bbox.x1, curr.bbox.x1),
1822
+ y1: Math.min(prev.bbox.y1, curr.bbox.y1),
1823
+ x2: Math.max(prev.bbox.x2, curr.bbox.x2),
1824
+ y2: Math.max(prev.bbox.y2, curr.bbox.y2)
1825
+ }
1826
+ };
1827
+ continue;
1828
+ }
1829
+ }
1830
+ merged.push(curr);
1831
+ }
1832
+ return merged;
1735
1833
  }
1736
1834
  function clusterCoordinates(values) {
1737
1835
  if (values.length === 0) return [];
@@ -1918,7 +2016,11 @@ function cellTextToString(items) {
1918
2016
  for (let j = 1; j < s.length; j++) {
1919
2017
  const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
1920
2018
  const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
1921
- if (gap < avgFs * 0.3 && /[가-힣]$/.test(result) && /^[가-힣]/.test(s[j].text)) {
2019
+ const prevIsKorean = /[가-힣]$/.test(result);
2020
+ const currIsKorean = /^[가-힣]/.test(s[j].text);
2021
+ if (gap < avgFs * 0.15) {
2022
+ result += s[j].text;
2023
+ } else if (gap < avgFs * 0.35 && (prevIsKorean || currIsKorean)) {
1922
2024
  result += s[j].text;
1923
2025
  } else {
1924
2026
  result += " " + s[j].text;
@@ -1933,6 +2035,12 @@ function cellTextToString(items) {
1933
2035
  const curr = textLines[i];
1934
2036
  if (/[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <= 8 && !curr.includes(" ")) {
1935
2037
  merged[merged.length - 1] = prev + curr;
2038
+ } else if (curr.trim().length <= 3 && /^[)\]%}]/.test(curr.trim())) {
2039
+ merged[merged.length - 1] = prev + curr.trim();
2040
+ } else if (/[,(]$/.test(prev.trim()) && curr.trim().length <= 15) {
2041
+ merged[merged.length - 1] = prev + curr.trim();
2042
+ } else if (/[\d,]$/.test(prev) && /^[\d,]+[)\]]?$/.test(curr.trim()) && curr.trim().length <= 10) {
2043
+ merged[merged.length - 1] = prev + curr.trim();
1936
2044
  } else {
1937
2045
  merged.push(curr);
1938
2046
  }
@@ -2145,21 +2253,26 @@ async function loadPdfWithTimeout(buffer) {
2145
2253
  disableFontFace: true,
2146
2254
  isEvalSupported: false
2147
2255
  });
2148
- return Promise.race([
2149
- loadingTask.promise,
2150
- new Promise(
2151
- (_, reject) => setTimeout(() => {
2152
- loadingTask.destroy();
2153
- reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
2154
- }, PDF_LOAD_TIMEOUT_MS)
2155
- )
2156
- ]);
2256
+ let timer;
2257
+ try {
2258
+ return await Promise.race([
2259
+ loadingTask.promise,
2260
+ new Promise((_, reject) => {
2261
+ timer = setTimeout(() => {
2262
+ loadingTask.destroy();
2263
+ reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
2264
+ }, PDF_LOAD_TIMEOUT_MS);
2265
+ })
2266
+ ]);
2267
+ } finally {
2268
+ if (timer !== void 0) clearTimeout(timer);
2269
+ }
2157
2270
  }
2158
2271
  async function parsePdfDocument(buffer, options) {
2159
2272
  const doc = await loadPdfWithTimeout(buffer);
2160
2273
  try {
2161
2274
  const pageCount = doc.numPages;
2162
- if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
2275
+ if (pageCount === 0) throw new KordocError("PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
2163
2276
  const metadata = { pageCount };
2164
2277
  await extractPdfMetadata(doc, metadata);
2165
2278
  const blocks = [];
@@ -2212,14 +2325,14 @@ async function parsePdfDocument(buffer, options) {
2212
2325
  const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
2213
2326
  if (ocrBlocks.length > 0) {
2214
2327
  const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
2215
- return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true, warnings };
2328
+ return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true };
2216
2329
  }
2217
2330
  } catch {
2218
2331
  }
2219
2332
  }
2220
- return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`, code: "IMAGE_BASED_PDF" };
2333
+ throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
2221
2334
  }
2222
- if (options?.removeHeaderFooter && parsedPageCount >= 3) {
2335
+ if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
2223
2336
  const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
2224
2337
  for (let ri = removed.length - 1; ri >= 0; ri--) {
2225
2338
  blocks.splice(removed[ri], 1);
@@ -2229,9 +2342,10 @@ async function parsePdfDocument(buffer, options) {
2229
2342
  if (medianFontSize > 0) {
2230
2343
  detectHeadings(blocks, medianFontSize);
2231
2344
  }
2345
+ detectMarkerHeadings(blocks);
2232
2346
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2233
2347
  let markdown = cleanPdfText(blocksToMarkdown(blocks));
2234
- return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
2348
+ return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
2235
2349
  } finally {
2236
2350
  await doc.destroy().catch(() => {
2237
2351
  });
@@ -2302,12 +2416,67 @@ function detectHeadings(blocks, medianFontSize) {
2302
2416
  if (/^\d+$/.test(text)) continue;
2303
2417
  const ratio = block.style.fontSize / medianFontSize;
2304
2418
  let level = 0;
2305
- if (ratio >= 1.5) level = 1;
2306
- else if (ratio >= 1.3) level = 2;
2307
- else if (ratio >= 1.15) level = 3;
2419
+ if (ratio >= HEADING_RATIO_H1) level = 1;
2420
+ else if (ratio >= HEADING_RATIO_H2) level = 2;
2421
+ else if (ratio >= HEADING_RATIO_H3) level = 3;
2308
2422
  if (level > 0) {
2309
2423
  block.type = "heading";
2310
2424
  block.level = level;
2425
+ block.text = collapseEvenSpacing(text);
2426
+ }
2427
+ }
2428
+ }
2429
/**
 * Removes the spaces from text that was laid out one character per token
 * (e.g. "제 1 장" style letter-spaced titles).
 *
 * The text is collapsed only when it splits into at least three
 * space-separated tokens and at least 70% of them are exactly one character
 * long; otherwise it is returned unchanged.
 *
 * @param {string} text - Candidate line.
 * @returns {string} The collapsed text, or the original text.
 */
function collapseEvenSpacing(text) {
  const pieces = text.split(" ");
  if (pieces.length < 3) {
    return text;
  }
  let oneCharPieces = 0;
  for (const piece of pieces) {
    if (piece.length === 1) oneCharPieces += 1;
  }
  const looksLetterSpaced = oneCharPieces / pieces.length >= 0.7;
  return looksLetterSpaced ? pieces.join("") : text;
}
2437
/**
 * Decides whether a detected table is really just boxed text that should be
 * rendered as a paragraph instead.
 *
 * A table is demoted when its combined text is at most 200 characters and any
 * of the following holds:
 *  - it contains a bullet-like marker glyph and has at most 3 rows;
 *  - it has at most 2 rows and more than half of its cells are empty;
 *  - it has exactly one row and no run of two or more digits.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {boolean} True when the table should be demoted to text.
 */
function shouldDemoteTable(table) {
  const filledTexts = [];
  for (const row of table.cells) {
    for (const cell of row) {
      const trimmed = cell.text.trim();
      if (trimmed) filledTexts.push(trimmed);
    }
  }
  const joined = filledTexts.join(" ");
  if (joined.length > 200) return false;

  const hasMarkerGlyph = /[□■◆○●▶]/.test(joined);
  if (hasMarkerGlyph && table.rows <= 3) return true;

  const cellTotal = table.rows * table.cols;
  const blankCells = cellTotal - filledTexts.length;
  const mostlyBlank = table.rows <= 2 && blankCells > cellTotal * 0.5;
  if (mostlyBlank) return true;

  return table.rows === 1 && !/\d{2,}/.test(joined);
}
2448
/**
 * Flattens a table into plain text, one output line per non-empty row.
 *
 * Empty cells are dropped. In a two-column table a row with both cells filled
 * is rendered as "key : value"; every other row joins its cells with a space.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {string} Newline-joined text ("" when every row is empty).
 */
function demoteTableToText(table) {
  const output = [];
  for (let rowIndex = 0; rowIndex < table.rows; rowIndex += 1) {
    const filled = [];
    for (const cell of table.cells[rowIndex]) {
      const trimmed = cell.text.trim();
      if (trimmed) filled.push(trimmed);
    }
    if (filled.length === 0) continue;
    const isKeyValueRow = table.cols === 2 && filled.length === 2;
    output.push(isKeyValueRow ? `${filled[0]} : ${filled[1]}` : filled.join(" "));
  }
  return output.join("\n");
}
2461
/**
 * Promotes paragraph blocks that look like section markers to headings.
 * Blocks are mutated in place, in order, so an earlier promotion is visible
 * when the following block inspects its predecessor.
 *
 * Two rules are applied to each paragraph with text:
 *  1. Shorter than 50 characters and starting with a box/diamond/arrow glyph
 *     followed by Hangul -> heading level 4.
 *  2. Consisting solely of 2-6 Hangul syllables, having a font size, and
 *     sitting next to a structural neighbour (or the start/end of the list)
 *     -> heading level 3.
 *
 * @param {Array<object>} blocks - Page blocks; modified in place.
 * @returns {void}
 */
function detectMarkerHeadings(blocks) {
  const structuralBefore = ["table", "heading", "separator"];
  blocks.forEach((block, index) => {
    if (block.type !== "paragraph" || !block.text) return;
    const trimmed = block.text.trim();

    const isMarkerLine = trimmed.length < 50 && /^[□■◆◇▶]\s*[가-힣]/.test(trimmed);
    if (isMarkerLine) {
      block.type = "heading";
      block.level = 4;
      return;
    }

    const isShortHangulLabel = /^[가-힣]{2,6}$/.test(trimmed) && block.style?.fontSize;
    if (!isShortHangulLabel) return;

    const before = blocks[index - 1];
    const after = blocks[index + 1];
    const anchoredBefore = !before || structuralBefore.includes(before.type);
    const anchoredAfter = !after
      || after.type === "table"
      || after.type === "heading"
      || (after.type === "paragraph" && after.text && /^[□■◆○●]/.test(after.text.trim()));
    if (anchoredBefore || anchoredAfter) {
      block.type = "heading";
      block.level = 3;
    }
  });
}
@@ -2344,7 +2513,7 @@ function computeRegion(items) {
2344
2513
  }
2345
2514
  return { items, minX, minY, maxX, maxY };
2346
2515
  }
2347
- function findYSplit(items, region, gapThreshold) {
2516
+ function findYSplit(items, _region, gapThreshold) {
2348
2517
  const sorted = [...items].sort((a, b) => b.y - a.y);
2349
2518
  let bestGap = gapThreshold;
2350
2519
  let bestSplit = null;
@@ -2359,7 +2528,7 @@ function findYSplit(items, region, gapThreshold) {
2359
2528
  }
2360
2529
  return bestSplit;
2361
2530
  }
2362
- function findXSplit(items, region, gapThreshold) {
2531
+ function findXSplit(items, _region, gapThreshold) {
2363
2532
  const sorted = [...items].sort((a, b) => a.x - b.x);
2364
2533
  let bestGap = gapThreshold;
2365
2534
  let bestSplit = null;
@@ -2418,7 +2587,8 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
2418
2587
  );
2419
2588
  for (const cell of cells) {
2420
2589
  const cellItems = cellTextMap.get(cell) || [];
2421
- const text = cellTextToString(cellItems);
2590
+ let text = cellTextToString(cellItems);
2591
+ text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
2422
2592
  irGrid[cell.row][cell.col] = {
2423
2593
  text,
2424
2594
  colSpan: cell.colSpan,
@@ -2433,18 +2603,21 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
2433
2603
  };
2434
2604
  const hasContent = irGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
2435
2605
  if (!hasContent) continue;
2436
- blocks.push({
2437
- type: "table",
2438
- table: irTable,
2439
- pageNumber: pageNum,
2440
- bbox: {
2441
- page: pageNum,
2442
- x: grid.bbox.x1,
2443
- y: grid.bbox.y1,
2444
- width: grid.bbox.x2 - grid.bbox.x1,
2445
- height: grid.bbox.y2 - grid.bbox.y1
2606
+ const tableBbox = {
2607
+ page: pageNum,
2608
+ x: grid.bbox.x1,
2609
+ y: grid.bbox.y1,
2610
+ width: grid.bbox.x2 - grid.bbox.x1,
2611
+ height: grid.bbox.y2 - grid.bbox.y1
2612
+ };
2613
+ if (shouldDemoteTable(irTable)) {
2614
+ const demoted = demoteTableToText(irTable);
2615
+ if (demoted) {
2616
+ blocks.push({ type: "paragraph", text: demoted, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
2446
2617
  }
2447
- });
2618
+ continue;
2619
+ }
2620
+ blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
2448
2621
  }
2449
2622
  const remaining = items.filter((i) => !usedItems.has(i));
2450
2623
  if (remaining.length > 0) {
@@ -2456,9 +2629,29 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
2456
2629
  const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
2457
2630
  return by - ay;
2458
2631
  });
2459
- return allBlocks;
2632
+ return mergeAdjacentTableBlocks(allBlocks);
2460
2633
  }
2461
- return blocks;
2634
+ return mergeAdjacentTableBlocks(blocks);
2635
+ }
2636
/**
 * Merges consecutive table blocks that have the same column count into a
 * single table block (for tables split into several grids on one page).
 *
 * The merged block keeps every property of the first block; its table gets
 * the combined rows/cells and the first table's `hasHeader`. When both blocks
 * carry a bounding box, the merged block's bbox becomes the union of the two,
 * so it covers the appended rows rather than only the first fragment.
 *
 * @param {Array<object>} blocks - Blocks in reading order; not mutated.
 * @returns {Array<object>} The input array itself when it has at most one
 *   block, otherwise a new array with adjacent compatible tables merged.
 */
function mergeAdjacentTableBlocks(blocks) {
  if (blocks.length <= 1) return blocks;

  // Smallest rectangle covering both boxes; extra fields (e.g. page) come from `a`.
  const unionBbox = (a, b) => {
    if (!a || !b) return a;
    const x = Math.min(a.x, b.x);
    const y = Math.min(a.y, b.y);
    const right = Math.max(a.x + a.width, b.x + b.width);
    const top = Math.max(a.y + a.height, b.y + b.height);
    return { ...a, x, y, width: right - x, height: top - y };
  };

  const result = [blocks[0]];
  for (let i = 1; i < blocks.length; i++) {
    const prev = result[result.length - 1];
    const curr = blocks[i];
    const mergeable = prev.type === "table" && curr.type === "table"
      && prev.table && curr.table
      && prev.table.cols === curr.table.cols;
    if (!mergeable) {
      result.push(curr);
      continue;
    }
    const mergedBlock = {
      ...prev,
      table: {
        rows: prev.table.rows + curr.table.rows,
        cols: prev.table.cols,
        cells: [...prev.table.cells, ...curr.table.cells],
        hasHeader: prev.table.hasHeader,
      },
    };
    const bbox = unionBbox(prev.bbox, curr.bbox);
    // Only set bbox when there is one, so blocks without a bbox keep their shape.
    if (bbox) mergedBlock.bbox = bbox;
    result[result.length - 1] = mergedBlock;
  }
  return result;
}
2463
2656
  function extractPageBlocksFallback(items, pageNum) {
2464
2657
  if (items.length === 0) return [];
@@ -2481,11 +2674,13 @@ function extractPageBlocksFallback(items, pageNum) {
2481
2674
  }));
2482
2675
  const clusterResults = detectClusterTables(clusterItems, pageNum);
2483
2676
  if (clusterResults.length > 0) {
2677
+ const ciToIdx = /* @__PURE__ */ new Map();
2678
+ for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
2484
2679
  const usedIndices = /* @__PURE__ */ new Set();
2485
2680
  for (const cr of clusterResults) {
2486
2681
  for (const ci of cr.usedItems) {
2487
- const idx = clusterItems.indexOf(ci);
2488
- if (idx >= 0) usedIndices.add(idx);
2682
+ const idx = ciToIdx.get(ci);
2683
+ if (idx !== void 0) usedIndices.add(idx);
2489
2684
  }
2490
2685
  blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
2491
2686
  }
@@ -2796,7 +2991,8 @@ function mergeLineSimple(items) {
2796
2991
  const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
2797
2992
  const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
2798
2993
  if (gap > 15) result += " ";
2799
- else if (gap < avgFs * 0.3 && /[가-힣]$/.test(result) && /^[가-힣]/.test(sorted[i].text)) {
2994
+ else if (gap < avgFs * 0.15) {
2995
+ } else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
2800
2996
  } else if (gap > 3) result += " ";
2801
2997
  result += sorted[i].text;
2802
2998
  }
@@ -2804,8 +3000,8 @@ function mergeLineSimple(items) {
2804
3000
  }
2805
3001
  function cleanPdfText(text) {
2806
3002
  return mergeKoreanLines(
2807
- text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "")
2808
- ).replace(/\n{3,}/g, "\n\n").trim();
3003
+ text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
3004
+ ).replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line)).replace(/\n{3,}/g, "\n\n").trim();
2809
3005
  }
2810
3006
  function startsWithMarker(line) {
2811
3007
  const t = line.trimStart();
@@ -2819,15 +3015,13 @@ function detectListBlocks(blocks) {
2819
3015
  for (let i = 0; i < blocks.length; i++) {
2820
3016
  const block = blocks[i];
2821
3017
  if (block.type === "paragraph" && block.text) {
2822
- const match = block.text.match(/^(\d+)\.\s/);
2823
- if (match) {
2824
- result.push({
2825
- ...block,
2826
- type: "list",
2827
- listType: "ordered",
2828
- // 원래 번호를 text에 보존 (blocksToMarkdown에서 그대로 출력)
2829
- text: block.text
2830
- });
3018
+ const text = block.text.trim();
3019
+ if (/^\d+\.\s/.test(text)) {
3020
+ result.push({ ...block, type: "list", listType: "ordered", text: block.text });
3021
+ continue;
3022
+ }
3023
+ if (/^[○●·※▶▷◆◇\-]\s/.test(text)) {
3024
+ result.push({ ...block, type: "list", listType: "unordered", text: block.text });
2831
3025
  continue;
2832
3026
  }
2833
3027
  }
@@ -2986,11 +3180,20 @@ function mergeKoreanLines(text) {
2986
3180
  for (let i = 1; i < lines.length; i++) {
2987
3181
  const prev = result[result.length - 1];
2988
3182
  const curr = lines[i];
2989
- if (/^#{1,6}\s/.test(prev) || /^#{1,6}\s/.test(curr)) {
3183
+ const currTrimmed = curr.trim();
3184
+ if (/^#{1,6}\s/.test(prev) || /^#{1,6}\s/.test(curr) || /^\|/.test(currTrimmed) || /^---/.test(currTrimmed)) {
2990
3185
  result.push(curr);
2991
3186
  continue;
2992
3187
  }
2993
- if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !curr.trimStart().startsWith("|") && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
3188
+ if (/,$/.test(prev.trim()) && currTrimmed.length > 0) {
3189
+ result[result.length - 1] = prev + "\n" + curr;
3190
+ continue;
3191
+ }
3192
+ if (/^\(※/.test(currTrimmed)) {
3193
+ result[result.length - 1] = prev + " " + currTrimmed;
3194
+ continue;
3195
+ }
3196
+ if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
2994
3197
  result[result.length - 1] = prev + " " + curr;
2995
3198
  } else {
2996
3199
  result.push(curr);
@@ -3002,6 +3205,716 @@ function mergeKoreanLines(text) {
3002
3205
  // src/index.ts
3003
3206
  import { readFile } from "fs/promises";
3004
3207
 
3208
+ // src/xlsx/parser.ts
3209
+ import JSZip3 from "jszip";
3210
+ import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
3211
+ var MAX_SHEETS = 100;
3212
+ var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024;
3213
+ var MAX_ROWS2 = 1e4;
3214
+ var MAX_COLS2 = 200;
3215
/**
 * Strips binary floating-point noise from a stored decimal cell value
 * (e.g. "0.30000000000000004" -> "0.3") by rounding to 15 significant digits.
 *
 * Only plain decimals of the form `-?digits.digits` are touched. If the
 * rounded number would print in exponent notation (very small or very large
 * magnitudes such as "0.0000001"), the raw text is returned unchanged so that
 * a plain decimal never turns into "1e-7".
 *
 * @param {string} raw - Raw text of the cell's value element.
 * @returns {string} Cleaned decimal text, or `raw` when no safe cleanup applies.
 */
function cleanNumericValue(raw) {
  if (!/^-?\d+\.\d+$/.test(raw)) return raw;
  const num = Number.parseFloat(raw);
  if (!Number.isFinite(num)) return raw;
  const cleaned = Number.parseFloat(num.toPrecision(15)).toString();
  // Number#toString switches to exponent form below 1e-6 and from 1e21 up.
  if (/e/i.test(cleaned)) return raw;
  return cleaned;
}
3222
/**
 * Converts an A1-style cell reference into zero-based coordinates.
 *
 * Only upper-case column letters followed by digits are accepted; anything
 * else (lower case, `$` anchors, ranges) yields null.
 *
 * @param {string} ref - Reference such as "B7" or "AA10".
 * @returns {{col: number, row: number} | null} Zero-based position, or null.
 */
function parseCellRef(ref) {
  const match = ref.match(/^([A-Z]+)(\d+)$/);
  if (!match) return null;
  const [, letters, digits] = match;
  // Column letters are a bijective base-26 number: A=1 ... Z=26, AA=27.
  const oneBasedCol = [...letters].reduce(
    (acc, letter) => acc * 26 + (letter.charCodeAt(0) - 64),
    0,
  );
  return { col: oneBasedCol - 1, row: parseInt(digits, 10) - 1 };
}
3229
/**
 * Parses a merged-range reference such as "A1:C3" into zero-based corners.
 *
 * @param {string} ref - Value of a mergeCell element's `ref` attribute.
 * @returns {{startCol: number, startRow: number, endCol: number, endRow: number} | null}
 *   The range corners, or null when the reference is not exactly two valid
 *   cell references separated by ":".
 */
function parseMergeRef(ref) {
  const corners = ref.split(":");
  if (corners.length !== 2) return null;
  const [topLeft, bottomRight] = corners.map((corner) => parseCellRef(corner));
  if (!topLeft || !bottomRight) return null;
  return {
    startCol: topLeft.col,
    startRow: topLeft.row,
    endCol: bottomRight.col,
    endRow: bottomRight.row,
  };
}
3237
/**
 * Returns all descendants of `parent` with the given tag name as a plain array.
 *
 * @param {{getElementsByTagName: Function}} parent - Element or document to search.
 * @param {string} tagName - Tag name passed to `getElementsByTagName`.
 * @returns {Array} Matching elements copied out of the returned list, in order.
 */
function getElements(parent, tagName) {
  const matches = parent.getElementsByTagName(tagName);
  const copied = [];
  let index = 0;
  while (index < matches.length) {
    copied.push(matches[index]);
    index += 1;
  }
  return copied;
}
3243
/**
 * Returns an element's text content with surrounding whitespace removed.
 *
 * @param {{textContent?: string | null}} el - Element to read.
 * @returns {string} Trimmed text, or "" when the element has no text content.
 */
function getTextContent(el) {
  const content = el.textContent;
  if (content === null || content === undefined) {
    return "";
  }
  return content.trim();
}
3246
/**
 * Parses an XML string into a DOM document using the xmldom parser.
 *
 * @param {string} text - XML source text.
 * @returns {object} The parsed document.
 */
function parseXml(text) {
  const parser = new DOMParser2();
  return parser.parseFromString(text, "text/xml");
}
3249
/**
 * Parses the shared-strings part (xl/sharedStrings.xml) into an array indexed
 * by shared-string id.
 *
 * Each `<si>` item contributes the concatenation of its `<t>` texts (plain
 * text and rich-text runs). `<t>` elements that belong to a phonetic run
 * (`<rPh>`, reading hints attached to East Asian text) are skipped: they are
 * annotations, not part of the cell's visible value, and including them
 * would duplicate the text.
 *
 * @param {string} xml - Contents of the shared-strings part.
 * @returns {string[]} One string per `<si>` element, in document order.
 */
function parseSharedStrings(xml) {
  const doc = parseXml(xml);
  const strings = [];
  for (const si of getElements(doc.documentElement, "si")) {
    const visibleText = getElements(si, "t")
      .filter((t) => {
        const parent = t.parentNode;
        const parentName = parent ? parent.localName ?? parent.nodeName : "";
        return parentName !== "rPh";
      })
      .map((t) => t.textContent ?? "")
      .join("");
    strings.push(visibleText);
  }
  return strings;
}
3259
/**
 * Extracts the sheet list from the workbook part (xl/workbook.xml).
 *
 * @param {string} xml - Contents of the workbook part.
 * @returns {Array<{name: string, sheetId: string, rId: string}>} One entry
 *   per `<sheet>` element in document order. A null `name` attribute falls
 *   back to "Sheet<N>" (1-based); null `sheetId` / `r:id` fall back to "".
 */
function parseWorkbook(xml) {
  const root = parseXml(xml).documentElement;
  return getElements(root, "sheet").map((sheetEl, index) => ({
    name: sheetEl.getAttribute("name") ?? `Sheet${index + 1}`,
    sheetId: sheetEl.getAttribute("sheetId") ?? "",
    rId: sheetEl.getAttribute("r:id") ?? "",
  }));
}
3272
/**
 * Builds a relationship-id -> target lookup from a relationships part.
 *
 * @param {string} xml - Contents of a `.rels` part.
 * @returns {Map<string, string>} Map from `Id` to `Target`; relationships
 *   missing either attribute are left out.
 */
function parseRels(xml) {
  const root = parseXml(xml).documentElement;
  const targetsById = new Map();
  getElements(root, "Relationship").forEach((relationship) => {
    const relId = relationship.getAttribute("Id");
    const relTarget = relationship.getAttribute("Target");
    if (!relId || !relTarget) return;
    targetsById.set(relId, relTarget);
  });
  return targetsById;
}
3283
/**
 * Parses one worksheet part into a sparse text grid plus its merged ranges.
 *
 * Cell values are resolved by type: shared strings ("s") through
 * `sharedStrings`, booleans ("b") as "TRUE"/"FALSE", inline strings from
 * `<is><t>`, and numeric cells (no type or "n") through `cleanNumericValue`.
 * Other typed values (e.g. "str", "e") are kept verbatim. A cell with no
 * value but a formula is rendered as "=<formula>".
 *
 * Size limits: rows whose `r` attribute is outside [1, MAX_ROWS2] and cells
 * whose reference is outside the MAX_ROWS2 x MAX_COLS2 window are skipped.
 * The check on the cell reference matters because the grid is indexed by the
 * reference, not by the enclosing row element: without it a single crafted
 * reference could allocate an arbitrarily large grid, and a row-0 reference
 * ("A0") would index grid[-1] and throw.
 *
 * @param {string} xml - Contents of the worksheet part.
 * @param {string[]} sharedStrings - Shared-string table for type "s" cells.
 * @returns {{grid: string[][], merges: Array<object>, maxRow: number, maxCol: number}}
 *   `grid[row][col]` holds cell text; `maxRow`/`maxCol` are the largest
 *   zero-based indices written (0 when nothing was written).
 */
function parseWorksheet(xml, sharedStrings) {
  const doc = parseXml(xml);
  const grid = [];
  let maxRow = 0;
  let maxCol = 0;
  for (const rowEl of getElements(doc.documentElement, "row")) {
    const rowNum = parseInt(rowEl.getAttribute("r") ?? "0", 10) - 1;
    if (rowNum < 0 || rowNum >= MAX_ROWS2) continue;
    for (const cellEl of getElements(rowEl, "c")) {
      const ref = cellEl.getAttribute("r");
      if (!ref) continue;
      const pos = parseCellRef(ref);
      if (!pos || pos.col >= MAX_COLS2) continue;
      // Bound the row taken from the cell reference itself (see header comment).
      if (pos.row < 0 || pos.row >= MAX_ROWS2) continue;
      const type = cellEl.getAttribute("t");
      const vElements = getElements(cellEl, "v");
      const fElements = getElements(cellEl, "f");
      let value = "";
      if (vElements.length > 0) {
        const raw = getTextContent(vElements[0]);
        if (type === "s") {
          value = sharedStrings[parseInt(raw, 10)] ?? "";
        } else if (type === "b") {
          value = raw === "1" ? "TRUE" : "FALSE";
        } else if (!type || type === "n") {
          value = cleanNumericValue(raw);
        } else {
          // Non-numeric typed values (formula strings, errors, dates) stay as stored.
          value = raw;
        }
      } else if (type === "inlineStr") {
        const isEl = getElements(cellEl, "is");
        if (isEl.length > 0) {
          value = getElements(isEl[0], "t").map((t) => t.textContent ?? "").join("");
        }
      }
      if (!value && fElements.length > 0) {
        value = `=${getTextContent(fElements[0])}`;
      }
      while (grid.length <= pos.row) grid.push([]);
      while (grid[pos.row].length <= pos.col) grid[pos.row].push("");
      grid[pos.row][pos.col] = value;
      if (pos.row > maxRow) maxRow = pos.row;
      if (pos.col > maxCol) maxCol = pos.col;
    }
  }
  const merges = [];
  for (const el of getElements(doc.documentElement, "mergeCell")) {
    const ref = el.getAttribute("ref");
    if (!ref) continue;
    const m = parseMergeRef(ref);
    if (m) merges.push(m);
  }
  return { grid, merges, maxRow, maxCol };
}
3339
/**
 * Converts one parsed worksheet into document blocks: an optional level-2
 * heading carrying the sheet name, followed by a single table block spanning
 * the first to the last non-empty row.
 *
 * Merged ranges are represented by a colSpan/rowSpan on the range's top-left
 * cell; the remaining cells of the range are omitted from their rows.
 *
 * The covered cells are only enumerated inside the populated grid
 * (rows 0..maxRow, columns 0..maxCol). Cells outside it are never emitted,
 * so this does not change the output, but it keeps a huge declared range
 * (e.g. "A1:XFD1048576") from costing billions of iterations.
 *
 * @param {string} sheetName - Sheet title; a falsy value suppresses the heading.
 * @param {string[][]} grid - Sparse cell text grid indexed [row][col].
 * @param {Array<{startRow: number, startCol: number, endRow: number, endCol: number}>} merges
 * @param {number} maxRow - Largest populated zero-based row index.
 * @param {number} maxCol - Largest populated zero-based column index.
 * @param {number} sheetIndex - Zero-based sheet index; pageNumber is sheetIndex + 1.
 * @returns {Array<object>} Heading and/or table blocks for the sheet.
 */
function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
  const blocks = [];
  if (sheetName) {
    blocks.push({
      type: "heading",
      text: sheetName,
      level: 2,
      pageNumber: sheetIndex + 1
    });
  }
  if (maxRow < 0 || maxCol < 0 || grid.length === 0) return blocks;
  const mergeMap = new Map();
  const mergeSkip = new Set();
  for (const m of merges) {
    const colSpan = m.endCol - m.startCol + 1;
    const rowSpan = m.endRow - m.startRow + 1;
    mergeMap.set(`${m.startRow},${m.startCol}`, { colSpan, rowSpan });
    // Clamp to the populated grid; keys outside it are never looked up below.
    const rowStart = Math.max(m.startRow, 0);
    const rowEnd = Math.min(m.endRow, maxRow);
    const colStart = Math.max(m.startCol, 0);
    const colEnd = Math.min(m.endCol, maxCol);
    for (let r = rowStart; r <= rowEnd; r++) {
      for (let c = colStart; c <= colEnd; c++) {
        if (r !== m.startRow || c !== m.startCol) {
          mergeSkip.add(`${r},${c}`);
        }
      }
    }
  }
  let firstRow = -1;
  let lastRow = -1;
  for (let r = 0; r <= maxRow; r++) {
    const row = grid[r];
    if (row && row.some((cell) => cell !== "")) {
      if (firstRow === -1) firstRow = r;
      lastRow = r;
    }
  }
  if (firstRow === -1) return blocks;
  const cellRows = [];
  for (let r = firstRow; r <= lastRow; r++) {
    const row = [];
    for (let c = 0; c <= maxCol; c++) {
      const key = `${r},${c}`;
      if (mergeSkip.has(key)) continue;
      const text = (grid[r] && grid[r][c]) ?? "";
      const merge = mergeMap.get(key);
      row.push({
        text,
        colSpan: merge?.colSpan ?? 1,
        rowSpan: merge?.rowSpan ?? 1
      });
    }
    cellRows.push(row);
  }
  if (cellRows.length > 0) {
    const table = buildTable(cellRows);
    if (table.rows > 0) {
      blocks.push({ type: "table", table, pageNumber: sheetIndex + 1 });
    }
  }
  return blocks;
}
3398
/**
 * Parses an XLSX workbook into markdown, blocks and metadata.
 *
 * Each processed sheet is treated as one "page": `options.pages` filters by
 * 1-based sheet number and `options.onProgress` is called with
 * (sheetNumber, processedSheetCount) for every sheet that is not filtered
 * out. At most MAX_SHEETS sheets are processed. A sheet whose part is missing
 * or fails to parse produces a PARTIAL_PARSE warning instead of an error.
 *
 * @param {ArrayBuffer} buffer - Raw workbook bytes (checked by precheckZipSize first).
 * @param {object} [options] - Optional `pages` and `onProgress`.
 * @returns {Promise<{markdown: string, blocks: Array<object>, metadata: object, warnings: (Array<object>|undefined)}>}
 * @throws {KordocError} When xl/workbook.xml is missing or lists no sheets.
 */
async function parseXlsxDocument(buffer, options) {
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
  const zip = await JSZip3.loadAsync(buffer);
  const readText = (entry) => entry.async("text");
  const warnings = [];

  const workbookFile = zip.file("xl/workbook.xml");
  if (!workbookFile) {
    throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
  }

  const ssFile = zip.file("xl/sharedStrings.xml");
  const sharedStrings = ssFile ? parseSharedStrings(await readText(ssFile)) : [];

  const sheets = parseWorkbook(await readText(workbookFile));
  if (sheets.length === 0) {
    throw new KordocError("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
  }

  const relsFile = zip.file("xl/_rels/workbook.xml.rels");
  const relsMap = relsFile ? parseRels(await readText(relsFile)) : new Map();

  let pageFilter = null;
  if (options?.pages) {
    const pageRangeModule = await import("./page-range-737B4EZW.js");
    pageFilter = pageRangeModule.parsePageRange(options.pages, sheets.length);
  }

  const blocks = [];
  const processedSheets = Math.min(sheets.length, MAX_SHEETS);
  for (let i = 0; i < processedSheets; i++) {
    const sheetNumber = i + 1;
    if (pageFilter && !pageFilter.has(sheetNumber)) continue;
    const sheet = sheets[i];
    options?.onProgress?.(sheetNumber, processedSheets);

    // Resolve the relationship target to a zip entry path; targets may be
    // absolute ("/xl/..."), already rooted ("xl/...") or relative to xl/.
    const target = relsMap.get(sheet.rId);
    let sheetPath;
    if (!target) {
      sheetPath = `xl/worksheets/sheet${sheetNumber}.xml`;
    } else if (target.startsWith("/")) {
      sheetPath = target.slice(1);
    } else if (target.startsWith("xl/")) {
      sheetPath = target;
    } else {
      sheetPath = `xl/${target}`;
    }

    const sheetFile = zip.file(sheetPath);
    if (!sheetFile) {
      warnings.push({
        page: sheetNumber,
        message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${sheetPath}`,
        code: "PARTIAL_PARSE"
      });
      continue;
    }

    try {
      const parsed = parseWorksheet(await readText(sheetFile), sharedStrings);
      blocks.push(
        ...sheetToBlocks(sheet.name, parsed.grid, parsed.merges, parsed.maxRow, parsed.maxCol, i),
      );
    } catch (err) {
      warnings.push({
        page: sheetNumber,
        message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC2F1 \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
        code: "PARTIAL_PARSE"
      });
    }
  }

  const metadata = { pageCount: processedSheets };
  const coreFile = zip.file("docProps/core.xml");
  if (coreFile) {
    try {
      const coreDoc = parseXml(await readText(coreFile));
      const firstText = (tag) => {
        const found = coreDoc.getElementsByTagName(tag);
        if (found.length === 0) return void 0;
        return (found[0].textContent ?? "").trim();
      };
      metadata.title = firstText("dc:title") || firstText("dcterms:title");
      metadata.author = firstText("dc:creator");
      metadata.description = firstText("dc:description");
      const created = firstText("dcterms:created");
      if (created) metadata.createdAt = created;
      const modified = firstText("dcterms:modified");
      if (modified) metadata.modifiedAt = modified;
    } catch {
      // Core properties are optional; a malformed part is ignored on purpose.
    }
  }

  const markdown = blocksToMarkdown(blocks);
  return { markdown, blocks, metadata, warnings: warnings.length > 0 ? warnings : void 0 };
}
3488
+
3489
+ // src/docx/parser.ts
3490
+ import JSZip4 from "jszip";
3491
+ import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
3492
+ var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
3493
/**
 * Returns the direct child elements of `parent` whose name matches
 * `localName`, ignoring any namespace prefix.
 *
 * A child matches when its `localName` equals `localName` or its `tagName`
 * ends with ":<localName>".
 *
 * @param {{childNodes: ArrayLike<object>}} parent - Node whose children are scanned.
 * @param {string} localName - Unprefixed element name, e.g. "pPr".
 * @returns {Array<object>} Matching child elements in document order.
 */
function getChildElements(parent, localName) {
  const ELEMENT_NODE = 1;
  const prefixedSuffix = `:${localName}`;
  const kids = parent.childNodes;
  const matches = [];
  for (let index = 0; index < kids.length; index += 1) {
    const candidate = kids[index];
    if (candidate.nodeType !== ELEMENT_NODE) continue;
    if (candidate.localName === localName || candidate.tagName?.endsWith(prefixedSuffix)) {
      matches.push(candidate);
    }
  }
  return matches;
}
3507
/**
 * Collects every descendant element of `parent` whose name matches
 * `localName` (namespace prefix ignored), in document order.
 *
 * Only element nodes are visited and descended into; `parent` itself is
 * never included in the result.
 *
 * @param {{childNodes: ArrayLike<object>}} parent - Root of the search.
 * @param {string} localName - Unprefixed element name, e.g. "r".
 * @returns {Array<object>} Matching descendants in pre-order.
 */
function findElements(parent, localName) {
  const ELEMENT_NODE = 1;
  const prefixedSuffix = `:${localName}`;
  const found = [];
  const pending = [];

  // Push element children in reverse so that pop() yields them in document order.
  const enqueueChildren = (node) => {
    const kids = node.childNodes;
    for (let index = kids.length - 1; index >= 0; index -= 1) {
      if (kids[index].nodeType === ELEMENT_NODE) pending.push(kids[index]);
    }
  };

  enqueueChildren(parent);
  while (pending.length > 0) {
    const el = pending.pop();
    if (el.localName === localName || el.tagName?.endsWith(prefixedSuffix)) {
      found.push(el);
    }
    enqueueChildren(el);
  }
  return found;
}
3525
/**
 * Looks up an attribute by unprefixed name.
 *
 * The first attribute whose `localName` or full `name` equals `localName`
 * wins, so "val" finds both `val` and `w:val`.
 *
 * @param {{attributes: ArrayLike<{localName?: string, name: string, value: string}>}} el
 * @param {string} localName - Attribute name without namespace prefix.
 * @returns {string | null} The attribute value, or null when absent.
 */
function getAttr(el, localName) {
  const attributeList = el.attributes;
  let index = 0;
  while (index < attributeList.length) {
    const candidate = attributeList[index];
    const matches = candidate.localName === localName || candidate.name === localName;
    if (matches) {
      return candidate.value;
    }
    index += 1;
  }
  return null;
}
3533
/**
 * Parses an XML string into a DOM document using the xmldom parser.
 *
 * @param {string} text - XML source text.
 * @returns {object} The parsed document.
 */
function parseXml2(text) {
  const parser = new DOMParser3();
  return parser.parseFromString(text, "text/xml");
}
3536
/**
 * Parses the styles part into a lookup keyed by style id.
 *
 * For each `<style>` with a `styleId` the entry records the style's display
 * name, its `basedOn` parent id (if any) and an outline level. The outline
 * level comes from `pPr/outlineLvl` when present; otherwise a name of the
 * form "heading N" (case-insensitive) maps to level N - 1.
 *
 * @param {string} xml - Contents of the styles part.
 * @returns {Map<string, {name: string, basedOn: (string|undefined), outlineLevel: (number|undefined)}>}
 */
function parseStyles(xml) {
  const doc = parseXml2(xml);
  const styles = new Map();

  // `val` attribute of the first matching child, or null when there is none.
  const firstChildVal = (el, childName) => {
    const [first] = getChildElements(el, childName);
    return first ? getAttr(first, "val") : null;
  };

  findElements(doc, "style").forEach((styleEl) => {
    const styleId = getAttr(styleEl, "styleId");
    if (!styleId) return;

    const name = firstChildVal(styleEl, "name") ?? "";
    const basedOn = firstChildVal(styleEl, "basedOn") ?? void 0;

    let outlineLevel;
    const [pPr] = getChildElements(styleEl, "pPr");
    if (pPr) {
      const declared = firstChildVal(pPr, "outlineLvl");
      if (declared !== null) outlineLevel = parseInt(declared, 10);
    }
    if (outlineLevel === void 0) {
      const headingMatch = name.match(/^(?:heading|Heading)\s*(\d+)$/i);
      if (headingMatch) outlineLevel = parseInt(headingMatch[1], 10) - 1;
    }

    styles.set(styleId, { name, basedOn, outlineLevel });
  });
  return styles;
}
3564
/**
 * Parses the numbering part into a lookup from numbering instance id to its
 * level definitions.
 *
 * `<abstractNum>` elements are read first (per level: `ilvl` and `numFmt`,
 * with "bullet" as the fallback format); each `<num>` is then linked to the
 * abstract definition it references. `<num>` elements that reference an
 * unknown abstract definition are omitted.
 *
 * @param {string} xml - Contents of the numbering part.
 * @returns {Map<string, Map<number, {numFmt: string, level: number}>>}
 */
function parseNumbering(xml) {
  const doc = parseXml2(xml);

  const abstractDefs = new Map();
  findElements(doc, "abstractNum").forEach((abstractEl) => {
    const abstractNumId = getAttr(abstractEl, "abstractNumId");
    if (!abstractNumId) return;
    const levels = new Map();
    getChildElements(abstractEl, "lvl").forEach((lvlEl) => {
      const ilvl = parseInt(getAttr(lvlEl, "ilvl") ?? "0", 10);
      const [numFmtEl] = getChildElements(lvlEl, "numFmt");
      const declaredFmt = numFmtEl ? getAttr(numFmtEl, "val") : null;
      levels.set(ilvl, { numFmt: declaredFmt ?? "bullet", level: ilvl });
    });
    abstractDefs.set(abstractNumId, levels);
  });

  const nums = new Map();
  findElements(doc, "num").forEach((numEl) => {
    const numId = getAttr(numEl, "numId");
    if (!numId) return;
    const [abstractRef] = getChildElements(numEl, "abstractNumId");
    if (!abstractRef) return;
    const refId = getAttr(abstractRef, "val");
    if (refId && abstractDefs.has(refId)) {
      nums.set(numId, abstractDefs.get(refId));
    }
  });
  return nums;
}
3596
/**
 * Builds a relationship-id -> target lookup from a relationships part.
 *
 * @param {string} xml - Contents of a `.rels` part.
 * @returns {Map<string, string>} Map from `Id` to `Target`; relationships
 *   missing either attribute are left out.
 */
function parseRels2(xml) {
  const targetsById = new Map();
  findElements(parseXml2(xml), "Relationship").forEach((relationship) => {
    const relId = getAttr(relationship, "Id");
    const relTarget = getAttr(relationship, "Target");
    if (!relId || !relTarget) return;
    targetsById.set(relId, relTarget);
  });
  return targetsById;
}
3607
/**
 * Parses the footnotes part into a footnote-id -> text lookup.
 *
 * Footnotes with id "0" or "-1" are skipped. The text of a footnote is the
 * concatenation of every `<t>` directly under every run found in its
 * paragraphs, trimmed.
 *
 * @param {string} xml - Contents of the footnotes part.
 * @returns {Map<string, string>} Map from footnote id to its plain text.
 */
function parseFootnotes(xml) {
  const doc = parseXml2(xml);
  const notes = new Map();
  findElements(doc, "footnote").forEach((footnoteEl) => {
    const id = getAttr(footnoteEl, "id");
    if (!id || id === "0" || id === "-1") return;
    const pieces = findElements(footnoteEl, "p")
      .flatMap((paragraphEl) => findElements(paragraphEl, "r"))
      .flatMap((runEl) => getChildElements(runEl, "t"))
      .map((textEl) => textEl.textContent ?? "");
    notes.set(id, pieces.join("").trim());
  });
  return notes;
}
3627
/**
 * Extracts the text and basic character formatting of a run (`<w:r>`).
 *
 * Bold and italic are on/off properties: `<w:b/>` turns the property on, but
 * `<w:b w:val="0"/>`, `"false"` or `"off"` explicitly turns it off (commonly
 * used to cancel formatting inherited from a style). The attribute value is
 * therefore honoured rather than treating mere presence as "on".
 *
 * @param {object} r - Run element.
 * @returns {{text: string, bold: boolean, italic: boolean}} Concatenated text
 *   of the run's direct `<t>` children and its bold/italic flags.
 */
function extractRun(r) {
  const text = getChildElements(r, "t").map((t) => t.textContent ?? "").join("");

  // True when the toggle element exists and is not explicitly switched off.
  const isToggleOn = (rPr, name) => {
    const [toggle] = getChildElements(rPr, name);
    if (!toggle) return false;
    const val = getAttr(toggle, "val");
    return val === null || !["0", "false", "off"].includes(val.toLowerCase());
  };

  let bold = false;
  let italic = false;
  const [rPr] = getChildElements(r, "rPr");
  if (rPr) {
    bold = isToggleOn(rPr, "b");
    italic = isToggleOn(rPr, "i");
  }
  return { text, bold, italic };
}
3639
/**
 * Converts a paragraph element (`<w:p>`) into a heading, list or paragraph
 * block, or null when the paragraph has no text.
 *
 * Runs and hyperlinks are read in document order, so a link in the middle of
 * a sentence keeps its position in the resulting text.
 *
 * Classification:
 *  - a paragraph style with outline level 0-5 -> heading (level = outline + 1);
 *  - otherwise a numbering id other than "0" -> list, unordered when the
 *    level's format is "bullet", ordered in every other case;
 *  - otherwise a plain paragraph, optionally carrying style (bold/italic if
 *    any plain run has it), `href` (target of the last resolvable hyperlink)
 *    and `footnoteText` (text of the last resolvable footnote reference).
 *
 * @param {object} p - Paragraph element.
 * @param {Map<string, object>} styles - Result of parseStyles.
 * @param {Map<string, Map<number, object>>} numbering - Result of parseNumbering.
 * @param {Map<string, string>} footnotes - Result of parseFootnotes.
 * @param {Map<string, string>} rels - Relationship id -> target lookup.
 * @returns {object | null} The block, or null for an empty paragraph.
 */
function parseParagraph(p, styles, numbering, footnotes, rels) {
  let styleId = "";
  let numId = "";
  let ilvl = 0;
  const pPrEls = getChildElements(p, "pPr");
  if (pPrEls.length > 0) {
    const pStyleEls = getChildElements(pPrEls[0], "pStyle");
    if (pStyleEls.length > 0) styleId = getAttr(pStyleEls[0], "val") ?? "";
    const numPrEls = getChildElements(pPrEls[0], "numPr");
    if (numPrEls.length > 0) {
      const numIdEls = getChildElements(numPrEls[0], "numId");
      const ilvlEls = getChildElements(numPrEls[0], "ilvl");
      numId = numIdEls.length > 0 ? getAttr(numIdEls[0], "val") ?? "" : "";
      ilvl = ilvlEls.length > 0 ? parseInt(getAttr(ilvlEls[0], "val") ?? "0", 10) : 0;
    }
  }

  const parts = [];
  let hasBold = false;
  let hasItalic = false;
  let href;
  let footnoteText;

  // Same name test as getChildElements: local name, or any namespace prefix.
  const isNamed = (el, name) => el.localName === name || el.tagName?.endsWith(`:${name}`);

  // Single pass over the direct children keeps runs and hyperlinks in reading order.
  const children = p.childNodes;
  for (let i = 0; i < children.length; i++) {
    const child = children[i];
    if (child.nodeType !== 1) continue;

    if (isNamed(child, "hyperlink")) {
      const linkText = findElements(child, "r").map((r) => extractRun(r).text).join("");
      if (!linkText) continue;
      const rId = getAttr(child, "id");
      if (rId && rels.has(rId)) href = rels.get(rId);
      parts.push(linkText);
    } else if (isNamed(child, "r")) {
      const result = extractRun(child);
      if (result.bold) hasBold = true;
      if (result.italic) hasItalic = true;
      const fnRefEls = getChildElements(child, "footnoteReference");
      if (fnRefEls.length > 0) {
        const fnId = getAttr(fnRefEls[0], "id");
        if (fnId && footnotes.has(fnId)) {
          footnoteText = footnotes.get(fnId);
        }
      }
      if (result.text) parts.push(result.text);
    }
  }

  const text = parts.join("").trim();
  if (!text) return null;

  const style = styles.get(styleId);
  if (style?.outlineLevel !== void 0 && style.outlineLevel >= 0 && style.outlineLevel <= 5) {
    return {
      type: "heading",
      text,
      level: style.outlineLevel + 1
    };
  }
  if (numId && numId !== "0") {
    const numDef = numbering.get(numId);
    const levelInfo = numDef?.get(ilvl);
    const listType = levelInfo?.numFmt === "bullet" ? "unordered" : "ordered";
    return { type: "list", text, listType };
  }
  const block = { type: "paragraph", text };
  if (hasBold || hasItalic) {
    block.style = { bold: hasBold || void 0, italic: hasItalic || void 0 };
  }
  if (href) block.href = href;
  if (footnoteText) block.footnoteText = footnoteText;
  return block;
}
3720
/**
 * Converts a WordprocessingML table element (`w:tbl`) into a table block.
 *
 * Horizontal merges come from `w:gridSpan`; vertical merges come from
 * `w:vMerge`. Continuation cells of a vertical merge are removed from the
 * output and folded into the `rowSpan` of the cell that starts the merge.
 *
 * @param tbl - The `w:tbl` element.
 * @param styles - Style lookup forwarded to `parseParagraph`.
 * @param numbering - Numbering lookup forwarded to `parseParagraph`.
 * @param footnotes - Footnote lookup forwarded to `parseParagraph`.
 * @param rels - Relationship lookup forwarded to `parseParagraph`.
 * @returns `{ type: "table", table }`, or `null` when the table has no rows.
 */
function parseTable(tbl, styles, numbering, footnotes, rels) {
  const trElements = getChildElements(tbl, "tr");
  if (trElements.length === 0) return null;

  // Each row is a list of { cell, gridCol, isContinuation }. `gridCol` is the
  // grid column where the cell starts, i.e. the sum of the colSpans before it.
  // Vertical merges must be matched on that position, not on the array index,
  // because a preceding gridSpan shifts the index of every later cell.
  const rows = [];
  for (const tr of trElements) {
    const entries = [];
    let gridCol = 0;
    for (const tc of getChildElements(tr, "tc")) {
      let colSpan = 1;
      let isContinuation = false;
      const tcPrEls = getChildElements(tc, "tcPr");
      if (tcPrEls.length > 0) {
        const gridSpanEls = getChildElements(tcPrEls[0], "gridSpan");
        if (gridSpanEls.length > 0) {
          const parsedSpan = parseInt(getAttr(gridSpanEls[0], "val") ?? "1", 10);
          // A missing or malformed value must not poison the column count with NaN.
          colSpan = Number.isFinite(parsedSpan) && parsedSpan >= 1 ? parsedSpan : 1;
        }
        const vMergeEls = getChildElements(tcPrEls[0], "vMerge");
        if (vMergeEls.length > 0) {
          // Per ECMA-376 an omitted `val` means "continue", so only an explicit
          // "restart" begins a new merged region.
          isContinuation = getAttr(vMergeEls[0], "val") !== "restart";
        }
      }
      let text = "";
      if (!isContinuation) {
        const cellTexts = [];
        for (const p of getChildElements(tc, "p")) {
          const block = parseParagraph(p, styles, numbering, footnotes, rels);
          if (block?.text) cellTexts.push(block.text);
        }
        text = cellTexts.join("\n");
      }
      entries.push({ cell: { text, colSpan, rowSpan: 1 }, gridCol, isContinuation });
      gridCol += colSpan;
    }
    rows.push(entries);
  }

  // For every row, the grid columns at which a continuation cell starts.
  const continuationCols = rows.map(
    (entries) => new Set(entries.filter((e) => e.isContinuation).map((e) => e.gridCol))
  );

  // Grow each real cell's rowSpan over the continuation cells directly below it.
  for (let r = 0; r < rows.length; r++) {
    for (const entry of rows[r]) {
      if (entry.isContinuation) continue;
      let span = 1;
      for (let nr = r + 1; nr < rows.length; nr++) {
        if (!continuationCols[nr].has(entry.gridCol)) break;
        span++;
      }
      entry.cell.rowSpan = span;
    }
  }

  // Continuation cells are now represented by the rowSpan above them.
  const cleanRows = rows.map(
    (entries) => entries.filter((e) => !e.isContinuation).map((e) => e.cell)
  );
  if (cleanRows.length === 0) return null;

  // Column count is the widest remaining row, measured in grid columns.
  let cols = 0;
  for (const row of cleanRows) {
    let width = 0;
    for (const cell of row) width += cell.colSpan;
    if (width > cols) cols = width;
  }

  const table = {
    rows: cleanRows.length,
    cols,
    cells: cleanRows,
    hasHeader: cleanRows.length > 1
  };
  return { type: "table", table };
}
3789
/**
 * Collects the pictures embedded in the document via `drawing` / `blip`
 * elements and reads their bytes from the archive.
 *
 * Each extracted image gets a sequential name (`image_001.png`, ...) and a
 * matching `{ type: "image", text: filename }` block. Blips without an
 * `embed` id, without a relationship target, or whose target is not present
 * in the archive are skipped. An entry that fails to decompress is skipped
 * as well (best effort).
 *
 * @param zip - Loaded archive; `zip.file(path)` and `entry.async(type)` are used.
 * @param rels - Map from relationship id to target path.
 * @param doc - Parsed `word/document.xml` DOM document.
 * @returns `{ blocks, images }`; `images` items are `{ filename, data, mimeType }`.
 */
async function extractImages(zip, rels, doc) {
  // A Map (not an object literal) so extensions such as "constructor" cannot
  // resolve to inherited Object.prototype members.
  const mimeByExt = new Map([
    ["png", "image/png"],
    ["jpg", "image/jpeg"],
    ["jpeg", "image/jpeg"],
    ["gif", "image/gif"],
    ["bmp", "image/bmp"],
    ["wmf", "image/wmf"],
    ["emf", "image/emf"],
    ["tif", "image/tiff"],
    ["tiff", "image/tiff"],
    ["webp", "image/webp"],
    ["svg", "image/svg+xml"]
  ]);
  const blocks = [];
  const images = [];
  let imgIdx = 0;
  for (const drawing of findElements(doc.documentElement, "drawing")) {
    for (const blip of findElements(drawing, "blip")) {
      const embedId = getAttr(blip, "embed");
      if (!embedId) continue;
      const target = rels.get(embedId);
      if (!target) continue;
      // Targets are either package-absolute ("/word/media/x.png") or relative
      // to the "word/" folder ("media/x.png").
      let imgPath;
      if (target.startsWith("/")) imgPath = target.slice(1);
      else if (target.startsWith("word/")) imgPath = target;
      else imgPath = `word/${target}`;
      const imgFile = zip.file(imgPath);
      if (!imgFile) continue;
      let data;
      try {
        data = await imgFile.async("uint8array");
      } catch {
        // Best effort: an unreadable image must not abort the whole parse.
        continue;
      }
      imgIdx++;
      // Take the extension from the last path segment only, and accept it only
      // if it is purely alphanumeric. The path comes from the (untrusted)
      // archive, so a dot-less or crafted name must never leak path separators
      // into the generated filename.
      const baseName = imgPath.slice(imgPath.lastIndexOf("/") + 1);
      const dotAt = baseName.lastIndexOf(".");
      const rawExt = dotAt >= 0 ? baseName.slice(dotAt + 1).toLowerCase() : "";
      const ext = /^[a-z0-9]+$/.test(rawExt) ? rawExt : "png";
      const filename = `image_${String(imgIdx).padStart(3, "0")}.${ext}`;
      images.push({ filename, data, mimeType: mimeByExt.get(ext) ?? "image/png" });
      blocks.push({ type: "image", text: filename });
    }
  }
  return { blocks, images };
}
3826
/**
 * Parses a DOCX archive into markdown, structured blocks, metadata, an
 * outline of headings and the embedded images.
 *
 * `word/document.xml` is mandatory. Styles, numbering, footnotes and
 * `docProps/core.xml` are optional and read on a best-effort basis: if one of
 * them is missing or fails to parse, parsing continues without it.
 *
 * @param buffer - The raw DOCX bytes.
 * @param options - Parse options (accepted for interface parity; not read here).
 * @returns `{ markdown, blocks, metadata, outline, warnings, images }`, where
 *   `outline`, `warnings` and `images` are `undefined` when empty.
 * @throws {KordocError} When `word/document.xml` or its body element is missing.
 */
async function parseDocxDocument(buffer, options) {
  // NOTE(review): presumably rejects archives that would decompress past the
  // limit before any entry is inflated — confirm against precheckZipSize.
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
  const zip = await JSZip4.loadAsync(buffer);
  const warnings = [];
  const docFile = zip.file("word/document.xml");
  if (!docFile) {
    throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
  }

  // Reads an optional package part; a missing or unparsable part yields an
  // empty Map so the rest of the document can still be converted.
  const readOptionalPart = async (path, parsePart) => {
    const part = zip.file(path);
    if (!part) return new Map();
    try {
      return parsePart(await part.async("text"));
    } catch {
      return new Map();
    }
  };

  // Direct element children (nodeType 1) of a DOM node, in document order.
  const elementChildren = (parent) => {
    const found = [];
    const nodes = parent.childNodes;
    for (let i = 0; i < nodes.length; i++) {
      if (nodes[i].nodeType === 1) found.push(nodes[i]);
    }
    return found;
  };

  // Unlike the parts below, a relationships part that fails to parse is
  // allowed to propagate its error.
  let rels = new Map();
  const relsFile = zip.file("word/_rels/document.xml.rels");
  if (relsFile) {
    rels = parseRels2(await relsFile.async("text"));
  }
  const styles = await readOptionalPart("word/styles.xml", parseStyles);
  const numbering = await readOptionalPart("word/numbering.xml", parseNumbering);
  const footnotes = await readOptionalPart("word/footnotes.xml", parseFootnotes);

  const docXml = await docFile.async("text");
  const doc = parseXml2(docXml);
  const body = findElements(doc, "body");
  if (body.length === 0) {
    throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
  }

  // Walk the body in document order with an explicit stack (no recursion, so
  // deeply nested input cannot overflow the call stack). Block-level `w:sdt`
  // containers (content controls, generated tables of contents) keep their
  // paragraphs and tables inside `w:sdtContent`; they are unwrapped so that
  // content is not silently dropped.
  const blocks = [];
  const pending = elementChildren(body[0]).reverse();
  while (pending.length > 0) {
    const el = pending.pop();
    const localName = el.localName ?? el.tagName?.split(":").pop();
    if (localName === "p") {
      const block = parseParagraph(el, styles, numbering, footnotes, rels);
      if (block) blocks.push(block);
    } else if (localName === "tbl") {
      const block = parseTable(el, styles, numbering, footnotes, rels);
      if (block) blocks.push(block);
    } else if (localName === "sdt") {
      const content = getChildElements(el, "sdtContent")[0];
      if (content) {
        const inner = elementChildren(content);
        // Push in reverse so the first child is popped first.
        for (let i = inner.length - 1; i >= 0; i--) pending.push(inner[i]);
      }
    }
  }

  // Only the image payloads are returned; the image blocks produced by
  // extractImages are not merged into `blocks`.
  const { images } = await extractImages(zip, rels, doc);

  const metadata = {};
  const coreFile = zip.file("docProps/core.xml");
  if (coreFile) {
    try {
      const coreXml = await coreFile.async("text");
      const coreDoc = parseXml2(coreXml);
      // Trimmed text of the first element with this qualified tag name.
      const getFirst = (tag) => {
        const els = coreDoc.getElementsByTagName(tag);
        return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
      };
      metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
      metadata.author = getFirst("dc:creator");
      metadata.description = getFirst("dc:description");
      const created = getFirst("dcterms:created");
      if (created) metadata.createdAt = created;
      const modified = getFirst("dcterms:modified");
      if (modified) metadata.modifiedAt = modified;
    } catch {
      // Metadata is optional; keep whatever was collected before the failure.
    }
  }

  const outline = blocks
    .filter((b) => b.type === "heading")
    .map((b) => ({ level: b.level ?? 2, text: b.text ?? "" }));
  const markdown = blocksToMarkdown(blocks);
  return {
    markdown,
    blocks,
    metadata,
    outline: outline.length > 0 ? outline : void 0,
    warnings: warnings.length > 0 ? warnings : void 0,
    images: images.length > 0 ? images : void 0
  };
}
3917
+
3005
3918
  // src/form/recognize.ts
3006
3919
  var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
3007
3920
  "\uC131\uBA85",
@@ -3129,7 +4042,7 @@ function extractInlineFields(text) {
3129
4042
  }
3130
4043
 
3131
4044
  // src/hwpx/generator.ts
3132
- import JSZip2 from "jszip";
4045
+ import JSZip5 from "jszip";
3133
4046
 
3134
4047
  // src/index.ts
3135
4048
  async function parse(input, options) {
@@ -3152,8 +4065,12 @@ async function parse(input, options) {
3152
4065
  }
3153
4066
  const format = detectFormat(buffer);
3154
4067
  switch (format) {
3155
- case "hwpx":
4068
+ case "hwpx": {
4069
+ const zipFormat = await detectZipFormat(buffer);
4070
+ if (zipFormat === "xlsx") return parseXlsx(buffer, options);
4071
+ if (zipFormat === "docx") return parseDocx(buffer, options);
3156
4072
  return parseHwpx(buffer, options);
4073
+ }
3157
4074
  case "hwp":
3158
4075
  return parseHwp(buffer, options);
3159
4076
  case "pdf":
@@ -3180,9 +4097,27 @@ async function parseHwp(buffer, options) {
3180
4097
  }
3181
4098
/**
 * Runs the PDF parser and wraps its outcome in the common result envelope.
 *
 * On success the parsed fields are returned with `success: true`. On failure
 * the result carries the error message, the code from `classifyError`, and
 * `isImageBased: true` when the thrown Error has an `isImageBased` property.
 */
async function parsePdf(buffer, options) {
  try {
    const parsed = await parsePdfDocument(buffer, options);
    const { markdown, blocks, metadata, outline, warnings, isImageBased } = parsed;
    return {
      success: true,
      fileType: "pdf",
      markdown,
      blocks,
      metadata,
      outline,
      warnings,
      isImageBased
    };
  } catch (failure) {
    const isError = failure instanceof Error;
    let isImageBased = void 0;
    if (isError && "isImageBased" in failure) {
      isImageBased = true;
    }
    const message = isError ? failure.message : "PDF \uD30C\uC2F1 \uC2E4\uD328";
    return {
      success: false,
      fileType: "pdf",
      error: message,
      code: classifyError(failure),
      isImageBased
    };
  }
}
4107
/**
 * Runs the XLSX parser and wraps its outcome in the common result envelope.
 *
 * Success yields the parsed markdown, blocks, metadata and warnings; failure
 * yields the error message plus the code from `classifyError`.
 */
async function parseXlsx(buffer, options) {
  try {
    const parsed = await parseXlsxDocument(buffer, options);
    const { markdown, blocks, metadata, warnings } = parsed;
    return {
      success: true,
      fileType: "xlsx",
      markdown,
      blocks,
      metadata,
      warnings
    };
  } catch (failure) {
    const message = failure instanceof Error ? failure.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328";
    return {
      success: false,
      fileType: "xlsx",
      error: message,
      code: classifyError(failure)
    };
  }
}
4115
/**
 * Runs the DOCX parser and wraps its outcome in the common result envelope.
 *
 * Success yields the parsed fields, with `images` collapsed to `undefined`
 * when the list is missing or empty; failure yields the error message plus
 * the code from `classifyError`.
 */
async function parseDocx(buffer, options) {
  try {
    const parsed = await parseDocxDocument(buffer, options);
    const { markdown, blocks, metadata, outline, warnings, images } = parsed;
    const nonEmptyImages = images?.length ? images : void 0;
    return {
      success: true,
      fileType: "docx",
      markdown,
      blocks,
      metadata,
      outline,
      warnings,
      images: nonEmptyImages
    };
  } catch (failure) {
    const message = failure instanceof Error ? failure.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328";
    return {
      success: false,
      fileType: "docx",
      error: message,
      code: classifyError(failure)
    };
  }
}
3188
4123
 
@@ -3360,10 +4295,6 @@ function diffTableCells(a, b) {
3360
4295
  export {
3361
4296
  detectFormat,
3362
4297
  blocksToMarkdown,
3363
- VERSION,
3364
- toArrayBuffer,
3365
- KordocError,
3366
- sanitizeError,
3367
4298
  extractHwpxMetadataOnly,
3368
4299
  extractHwp5MetadataOnly,
3369
4300
  extractPdfMetadataOnly,
@@ -3371,4 +4302,4 @@ export {
3371
4302
  extractFormFields,
3372
4303
  parse
3373
4304
  };
3374
- //# sourceMappingURL=chunk-NJ3R7LNR.js.map
4305
+ //# sourceMappingURL=chunk-MDRW3HYC.js.map