kordoc 1.7.2 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,22 @@
1
1
  #!/usr/bin/env node
2
+ import {
3
+ KordocError,
4
+ classifyError,
5
+ isPathTraversal,
6
+ precheckZipSize,
7
+ sanitizeHref,
8
+ toArrayBuffer
9
+ } from "./chunk-UUKFY5P5.js";
10
+ import {
11
+ parsePageRange
12
+ } from "./chunk-MOL7MDBG.js";
2
13
 
3
14
  // src/detect.ts
15
+ import JSZip from "jszip";
4
16
/**
 * Exposes the leading bytes of a buffer for signature sniffing.
 *
 * @param {ArrayBuffer} buffer - Raw file contents.
 * @returns {Uint8Array} A view (not a copy) over the first four bytes, or
 *   over the whole buffer when it holds fewer than four bytes.
 */
function magicBytes(buffer) {
  const headerLength = Math.min(4, buffer.byteLength);
  return new Uint8Array(buffer, 0, headerLength);
}
7
- function isHwpxFile(buffer) {
19
/**
 * Reports whether the buffer starts with the bytes 80, 75, 3, 4
 * ("PK\x03\x04"), the same local-file-header signature the broken-ZIP
 * recovery path scans for.
 *
 * @param {ArrayBuffer} buffer - Raw file contents.
 * @returns {boolean} True when all four leading bytes match; false when the
 *   buffer is shorter than the signature or any byte differs.
 */
function isZipFile(buffer) {
  const signature = [80, 75, 3, 4];
  const head = magicBytes(buffer);
  // A short buffer yields `undefined` for the missing bytes, which never matches.
  return signature.every((expected, i) => head[i] === expected);
}
@@ -18,15 +30,28 @@ function isPdfFile(buffer) {
18
30
  }
19
31
/**
 * Classifies a document by its leading signature bytes.
 *
 * Any buffer carrying the ZIP signature is reported as "hwpx" here; this
 * function does not look inside the archive.
 *
 * @param {ArrayBuffer} buffer - Raw file contents.
 * @returns {"hwpx"|"hwp"|"pdf"|"unknown"} The detected format, or "unknown"
 *   when the buffer holds fewer than four bytes or no signature matches.
 */
function detectFormat(buffer) {
  if (buffer.byteLength < 4) return "unknown";
  // Probes run in order; the first matching signature wins.
  const probes = [
    [isZipFile, "hwpx"],
    [isOldHwpFile, "hwp"],
    [isPdfFile, "pdf"]
  ];
  const hit = probes.find(([matches]) => matches(buffer));
  return hit ? hit[1] : "unknown";
}
38
/**
 * Opens a ZIP container and decides which document format it holds by
 * probing for well-known entry names.
 *
 * Probe order: "xl/workbook.xml" → "xlsx", "word/document.xml" → "docx",
 * then "Contents/content.hpf", "mimetype", or any entry under "Contents/"
 * → "hwpx".
 *
 * @param {ArrayBuffer} buffer - Raw bytes of the suspected ZIP archive.
 * @returns {Promise<"xlsx"|"docx"|"hwpx"|"unknown">} The detected format;
 *   "unknown" when no probe matches or when anything throws while loading
 *   or inspecting the archive.
 */
async function detectZipFormat(buffer) {
  try {
    const archive = await JSZip.loadAsync(buffer);
    const hasEntry = (path) => Boolean(archive.file(path));
    if (hasEntry("xl/workbook.xml")) return "xlsx";
    if (hasEntry("word/document.xml")) return "docx";
    if (hasEntry("Contents/content.hpf") || hasEntry("mimetype")) return "hwpx";
    // Last resort: treat any entry below "Contents/" as evidence of HWPX.
    for (const entryName of Object.keys(archive.files)) {
      if (entryName.startsWith("Contents/")) return "hwpx";
    }
    return "unknown";
  } catch {
    // Detection is best-effort: every failure is reported as "unknown".
    return "unknown";
  }
}
26
51
 
27
52
  // src/table/builder.ts
28
53
  var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
29
- function sanitizeHref(href) {
54
+ function sanitizeHref2(href) {
30
55
  const trimmed = href.trim();
31
56
  if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
32
57
  return trimmed;
@@ -80,6 +105,16 @@ function buildTable(rows) {
80
105
  cellIdx++;
81
106
  }
82
107
  }
108
+ let effectiveCols = maxCols;
109
+ while (effectiveCols > 0) {
110
+ const colEmpty = grid.every((row) => !row[effectiveCols - 1]?.text?.trim());
111
+ if (!colEmpty) break;
112
+ effectiveCols--;
113
+ }
114
+ if (effectiveCols < maxCols && effectiveCols > 0) {
115
+ const trimmed = grid.map((row) => row.slice(0, effectiveCols));
116
+ return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
117
+ }
83
118
  return { rows: numRows, cols: maxCols, cells: grid, hasHeader: numRows > 1 };
84
119
  }
85
120
  function convertTableToText(rows) {
@@ -87,13 +122,26 @@ function convertTableToText(rows) {
87
122
  (row) => row.map((c) => c.text.trim().replace(/\n/g, " ")).filter(Boolean).join(" | ")
88
123
  ).filter(Boolean).join("\n");
89
124
  }
125
/**
 * Matches auto-generated shape alt-text of the form "<shape name>입니다."
 * (optionally prefixed with "모서리가 둥근 " / "둥근 ").
 *
 * The single-syllable shape names (원, 선, 별, 호, 표) are only accepted when
 * they are NOT directly preceded by another Hangul syllable. Without that
 * guard they match the tail of ordinary words, e.g. "전화번호입니다." was
 * truncated to "전화번" and "회원입니다" to "회".
 */
var HWP_SHAPE_ALT_TEXT_RE = /(?:모서리가 둥근 |둥근 )?(?:(?<![가-힣])(?:원|선|별|호|표)|사각형|직사각형|정사각형|타원|삼각형|이등변 삼각형|직각 삼각형|직선|곡선|화살표|굵은 화살표|이중 화살표|오각형|육각형|팔각형|[4-8]점별|십자|십자형|구름|구름형|마름모|도넛|평행사변형|사다리꼴|부채꼴|반원|물결|번개|하트|빗금|블록 화살표|수식|그림|개체|그리기\s?개체|묶음\s?개체|글상자|수식\s?개체|OLE\s?개체)\s?입니다\.?/g;

/**
 * Cleans extracted text before it is written to Markdown.
 *
 * Steps, in order:
 *  1. drop code points U+F0000–U+FFFFD;
 *  2. drop shape alt-text phrases matched by HWP_SHAPE_ALT_TEXT_RE;
 *  3. collapse runs of spaces and trim;
 *  4. for short results (≤ 30 chars) made of at least three space-separated
 *     tokens of which ≥ 70 % are single Hangul characters, remove the spaces
 *     (undoes letter-spaced text such as "가 나 다 라").
 *
 * @param {string} text - Raw block or cell text.
 * @returns {string} The cleaned text; may be empty.
 */
function sanitizeText(text) {
  let result = text
    .replace(/[\u{F0000}-\u{FFFFD}]/gu, "")
    .replace(HWP_SHAPE_ALT_TEXT_RE, "")
    .replace(/ +/g, " ")
    .trim();
  if (result.length <= 30 && result.includes(" ")) {
    const tokens = result.split(" ");
    const koreanSingleCharCount = tokens.filter(
      (t) => t.length === 1 && /[\uAC00-\uD7AF\u3131-\u318E]/.test(t)
    ).length;
    if (tokens.length >= 3 && koreanSingleCharCount / tokens.length >= 0.7) {
      result = tokens.join("");
    }
  }
  return result;
}
90
137
  function blocksToMarkdown(blocks) {
91
138
  const lines = [];
92
139
  for (let i = 0; i < blocks.length; i++) {
93
140
  const block = blocks[i];
94
141
  if (block.type === "heading" && block.text) {
95
142
  const prefix = "#".repeat(Math.min(block.level || 2, 6));
96
- lines.push("", `${prefix} ${block.text}`, "");
143
+ const headingText = sanitizeText(block.text);
144
+ if (headingText) lines.push("", `${prefix} ${headingText}`, "");
97
145
  continue;
98
146
  }
99
147
  if (block.type === "image" && block.text) {
@@ -105,9 +153,11 @@ function blocksToMarkdown(blocks) {
105
153
  continue;
106
154
  }
107
155
  if (block.type === "list" && block.text) {
108
- const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(block.text);
156
+ const listText = sanitizeText(block.text);
157
+ if (!listText) continue;
158
+ const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(listText);
109
159
  const prefix = alreadyNumbered ? "" : block.listType === "ordered" ? "1. " : "- ";
110
- lines.push(`${prefix}${block.text}`);
160
+ lines.push(`${prefix}${listText}`);
111
161
  if (block.children) {
112
162
  for (const child of block.children) {
113
163
  const childPrefix = child.listType === "ordered" ? "1." : "-";
@@ -117,7 +167,8 @@ function blocksToMarkdown(blocks) {
117
167
  continue;
118
168
  }
119
169
  if (block.type === "paragraph" && block.text) {
120
- let text = block.text;
170
+ let text = sanitizeText(block.text);
171
+ if (!text) continue;
121
172
  if (/^\[별표\s*\d+/.test(text)) {
122
173
  const nextBlock = blocks[i + 1];
123
174
  if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
@@ -133,7 +184,7 @@ function blocksToMarkdown(blocks) {
133
184
  continue;
134
185
  }
135
186
  if (block.href) {
136
- const href = sanitizeHref(block.href);
187
+ const href = sanitizeHref2(block.href);
137
188
  if (href) text = `[${text}](${href})`;
138
189
  }
139
190
  if (block.footnoteText) {
@@ -154,7 +205,7 @@ function tableToMarkdown(table) {
154
205
  if (table.rows === 0 || table.cols === 0) return "";
155
206
  const { cells, rows: numRows, cols: numCols } = table;
156
207
  if (numRows === 1 && numCols === 1) {
157
- const content = cells[0][0].text;
208
+ const content = sanitizeText(cells[0][0].text);
158
209
  return content.split(/\n/).map((line) => {
159
210
  const trimmed = line.trim();
160
211
  if (!trimmed) return "";
@@ -163,13 +214,19 @@ function tableToMarkdown(table) {
163
214
  return trimmed;
164
215
  }).filter(Boolean).join("\n");
165
216
  }
217
+ if (numCols === 1 && numRows >= 2) {
218
+ return cells.map((row) => sanitizeText(row[0].text).replace(/\n/g, " ")).filter(Boolean).join("\n");
219
+ }
166
220
  const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
167
221
  const skip = /* @__PURE__ */ new Set();
168
222
  for (let r = 0; r < numRows; r++) {
223
+ let cellIdx = 0;
169
224
  for (let c = 0; c < numCols; c++) {
170
225
  if (skip.has(`${r},${c}`)) continue;
171
- const cell = cells[r][c];
172
- display[r][c] = cell.text.replace(/\n/g, "<br>");
226
+ const cell = cells[r]?.[cellIdx];
227
+ if (!cell) break;
228
+ cellIdx++;
229
+ display[r][c] = sanitizeText(cell.text).replace(/\n/g, "<br>");
173
230
  for (let dr = 0; dr < cell.rowSpan; dr++) {
174
231
  for (let dc = 0; dc < cell.colSpan; dc++) {
175
232
  if (dr === 0 && dc === 0) continue;
@@ -178,12 +235,28 @@ function tableToMarkdown(table) {
178
235
  }
179
236
  }
180
237
  }
238
+ c += cell.colSpan - 1;
181
239
  }
182
240
  }
183
241
  const uniqueRows = [];
184
- for (const row of display) {
242
+ let pendingFirstCol = "";
243
+ for (let r = 0; r < display.length; r++) {
244
+ const row = display[r];
185
245
  const isEmptyPlaceholder = row.every((cell) => cell === "");
186
- if (!isEmptyPlaceholder) uniqueRows.push(row);
246
+ if (isEmptyPlaceholder) continue;
247
+ const hasSkippedCols = row.some((cell, c) => cell === "" && skip.has(`${r},${c}`));
248
+ const nonEmptyCols = row.filter((cell) => cell !== "");
249
+ if (!hasSkippedCols && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
250
+ pendingFirstCol = row[0];
251
+ continue;
252
+ }
253
+ if (pendingFirstCol && row[0] === "") {
254
+ row[0] = pendingFirstCol;
255
+ pendingFirstCol = "";
256
+ } else {
257
+ pendingFirstCol = "";
258
+ }
259
+ uniqueRows.push(row);
187
260
  }
188
261
  if (uniqueRows.length === 0) return "";
189
262
  const md = [];
@@ -195,75 +268,15 @@ function tableToMarkdown(table) {
195
268
  return md.join("\n");
196
269
  }
197
270
 
198
- // src/utils.ts
199
- var VERSION = true ? "1.7.2" : "0.0.0-dev";
200
- function toArrayBuffer(buf) {
201
- if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
202
- return buf.buffer;
203
- }
204
- return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
205
- }
206
- var KordocError = class extends Error {
207
- constructor(message) {
208
- super(message);
209
- this.name = "KordocError";
210
- }
211
- };
212
- function sanitizeError(err) {
213
- if (err instanceof KordocError) return err.message;
214
- return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
215
- }
216
- function isPathTraversal(name) {
217
- if (name.includes("\0")) return true;
218
- const normalized = name.replace(/\\/g, "/");
219
- return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
220
- }
221
- function classifyError(err) {
222
- if (!(err instanceof Error)) return "PARSE_ERROR";
223
- const msg = err.message;
224
- if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
225
- if (msg.includes("DRM")) return "DRM_PROTECTED";
226
- if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
227
- if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
228
- if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
229
- if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
230
- if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
231
- return "PARSE_ERROR";
232
- }
233
-
234
271
  // src/hwpx/parser.ts
235
- import JSZip from "jszip";
272
+ import JSZip2 from "jszip";
236
273
  import { inflateRawSync } from "zlib";
237
274
  import { DOMParser } from "@xmldom/xmldom";
238
275
 
239
- // src/page-range.ts
240
- function parsePageRange(spec, maxPages) {
241
- const result = /* @__PURE__ */ new Set();
242
- if (maxPages <= 0) return result;
243
- if (Array.isArray(spec)) {
244
- for (const n of spec) {
245
- const page = Math.round(n);
246
- if (page >= 1 && page <= maxPages) result.add(page);
247
- }
248
- return result;
249
- }
250
- if (typeof spec !== "string" || spec.trim() === "") return result;
251
- const parts = spec.split(",");
252
- for (const part of parts) {
253
- const trimmed = part.trim();
254
- if (!trimmed) continue;
255
- const rangeMatch = trimmed.match(/^(\d+)\s*-\s*(\d+)$/);
256
- if (rangeMatch) {
257
- const start = Math.max(1, parseInt(rangeMatch[1], 10));
258
- const end = Math.min(maxPages, parseInt(rangeMatch[2], 10));
259
- for (let i = start; i <= end; i++) result.add(i);
260
- } else {
261
- const page = parseInt(trimmed, 10);
262
- if (!isNaN(page) && page >= 1 && page <= maxPages) result.add(page);
263
- }
264
- }
265
- return result;
266
- }
276
+ // src/types.ts
277
+ var HEADING_RATIO_H1 = 1.5;
278
+ var HEADING_RATIO_H2 = 1.3;
279
+ var HEADING_RATIO_H3 = 1.15;
267
280
 
268
281
  // src/hwpx/parser.ts
269
282
  var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
@@ -357,16 +370,10 @@ function stripDtd(xml) {
357
370
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
358
371
  }
359
372
  async function parseHwpxDocument(buffer, options) {
360
- const precheck = precheckZipSize(buffer);
361
- if (precheck.totalUncompressed > MAX_DECOMPRESS_SIZE) {
362
- throw new KordocError("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
363
- }
364
- if (precheck.entryCount > MAX_ZIP_ENTRIES) {
365
- throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
366
- }
373
+ precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
367
374
  let zip;
368
375
  try {
369
- zip = await JSZip.loadAsync(buffer);
376
+ zip = await JSZip2.loadAsync(buffer);
370
377
  } catch {
371
378
  return extractFromBrokenZip(buffer);
372
379
  }
@@ -529,7 +536,7 @@ function parseDublinCoreMetadata(xml, metadata) {
529
536
  async function extractHwpxMetadataOnly(buffer) {
530
537
  let zip;
531
538
  try {
532
- zip = await JSZip.loadAsync(buffer);
539
+ zip = await JSZip2.loadAsync(buffer);
533
540
  } catch {
534
541
  throw new KordocError("HWPX ZIP\uC744 \uC5F4 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
535
542
  }
@@ -539,46 +546,17 @@ async function extractHwpxMetadataOnly(buffer) {
539
546
  metadata.pageCount = sectionPaths.length;
540
547
  return metadata;
541
548
  }
542
- function precheckZipSize(buffer) {
543
- try {
544
- const data = new DataView(buffer);
545
- const len = buffer.byteLength;
546
- if (len < 22) return { totalUncompressed: 0, entryCount: 0 };
547
- const searchStart = Math.max(0, len - 22 - 65535);
548
- let eocdOffset = -1;
549
- for (let i = len - 22; i >= searchStart; i--) {
550
- if (data.getUint32(i, true) === 101010256) {
551
- eocdOffset = i;
552
- break;
553
- }
554
- }
555
- if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
556
- const entryCount = data.getUint16(eocdOffset + 10, true);
557
- const cdSize = data.getUint32(eocdOffset + 12, true);
558
- const cdOffset = data.getUint32(eocdOffset + 16, true);
559
- if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
560
- let totalUncompressed = 0;
561
- let pos = cdOffset;
562
- for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
563
- if (data.getUint32(pos, true) !== 33639248) break;
564
- totalUncompressed += data.getUint32(pos + 24, true);
565
- const nameLen = data.getUint16(pos + 28, true);
566
- const extraLen = data.getUint16(pos + 30, true);
567
- const commentLen = data.getUint16(pos + 32, true);
568
- pos += 46 + nameLen + extraLen + commentLen;
569
- }
570
- return { totalUncompressed, entryCount };
571
- } catch {
572
- return { totalUncompressed: 0, entryCount: 0 };
573
- }
574
- }
575
549
  function extractFromBrokenZip(buffer) {
576
550
  const data = new Uint8Array(buffer);
577
551
  const view = new DataView(buffer);
578
552
  let pos = 0;
579
553
  const blocks = [];
554
+ const warnings = [
555
+ { code: "BROKEN_ZIP_RECOVERY", message: "\uC190\uC0C1\uB41C ZIP \uAD6C\uC870 \u2014 Local File Header \uAE30\uBC18 \uBCF5\uAD6C \uBAA8\uB4DC" }
556
+ ];
580
557
  let totalDecompressed = 0;
581
558
  let entryCount = 0;
559
+ let sectionNum = 0;
582
560
  while (pos < data.length - 30) {
583
561
  if (data[pos] !== 80 || data[pos + 1] !== 75 || data[pos + 2] !== 3 || data[pos + 3] !== 4) {
584
562
  pos++;
@@ -624,14 +602,15 @@ function extractFromBrokenZip(buffer) {
624
602
  }
625
603
  totalDecompressed += content.length * 2;
626
604
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
627
- blocks.push(...parseSectionXml(content));
605
+ sectionNum++;
606
+ blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum));
628
607
  } catch {
629
608
  continue;
630
609
  }
631
610
  }
632
611
  if (blocks.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
633
612
  const markdown = blocksToMarkdown(blocks);
634
- return { markdown, blocks };
613
+ return { markdown, blocks, warnings: warnings.length > 0 ? warnings : void 0 };
635
614
  }
636
615
  async function resolveSectionPaths(zip) {
637
616
  const manifestPaths = ["Contents/content.hpf", "content.hpf"];
@@ -695,9 +674,9 @@ function detectHwpxHeadings(blocks, styleMap) {
695
674
  let level = 0;
696
675
  if (baseFontSize > 0 && block.style?.fontSize) {
697
676
  const ratio = block.style.fontSize / baseFontSize;
698
- if (ratio >= 1.5) level = 1;
699
- else if (ratio >= 1.3) level = 2;
700
- else if (ratio >= 1.15) level = 3;
677
+ if (ratio >= HEADING_RATIO_H1) level = 1;
678
+ else if (ratio >= HEADING_RATIO_H2) level = 2;
679
+ else if (ratio >= HEADING_RATIO_H3) level = 3;
701
680
  }
702
681
  if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
703
682
  if (level === 0) level = 3;
@@ -829,39 +808,47 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
829
808
  if (depth > MAX_XML_DEPTH) return tableCtx;
830
809
  const children = node.childNodes;
831
810
  if (!children) return tableCtx;
832
- for (let i = 0; i < children.length; i++) {
833
- const el = children[i];
834
- if (el.nodeType !== 1) continue;
835
- const tag = el.tagName || el.localName || "";
836
- const localTag = tag.replace(/^[^:]+:/, "");
837
- if (localTag === "tbl") {
838
- if (tableCtx) tableStack.push(tableCtx);
839
- const newTable = { rows: [], currentRow: [], cell: null };
840
- walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum, depth + 1);
841
- if (newTable.rows.length > 0) {
842
- if (tableStack.length > 0) {
843
- const parentTable = tableStack.pop();
844
- const nestedText = convertTableToText(newTable.rows);
845
- if (parentTable.cell) {
846
- parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
811
+ const walkChildren = (parent, d) => {
812
+ if (d > MAX_XML_DEPTH) return;
813
+ const kids = parent.childNodes;
814
+ if (!kids) return;
815
+ for (let i = 0; i < kids.length; i++) {
816
+ const el = kids[i];
817
+ if (el.nodeType !== 1) continue;
818
+ const tag = el.tagName || el.localName || "";
819
+ const localTag = tag.replace(/^[^:]+:/, "");
820
+ if (localTag === "tbl") {
821
+ if (tableCtx) tableStack.push(tableCtx);
822
+ const newTable = { rows: [], currentRow: [], cell: null };
823
+ walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum, d + 1);
824
+ if (newTable.rows.length > 0) {
825
+ if (tableStack.length > 0) {
826
+ const parentTable = tableStack.pop();
827
+ const nestedText = convertTableToText(newTable.rows);
828
+ if (parentTable.cell) {
829
+ parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
830
+ }
831
+ tableCtx = parentTable;
832
+ } else {
833
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
834
+ tableCtx = null;
847
835
  }
848
- tableCtx = parentTable;
849
836
  } else {
850
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
851
- tableCtx = null;
837
+ tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
852
838
  }
853
- } else {
854
- tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
855
- }
856
- } else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
857
- const imgRef = extractImageRef(el);
858
- if (imgRef) {
859
- blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
860
- } else if (warnings && sectionNum) {
861
- warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
839
+ } else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
840
+ const imgRef = extractImageRef(el);
841
+ if (imgRef) {
842
+ blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
843
+ } else if (warnings && sectionNum) {
844
+ warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
845
+ }
846
+ } else if (localTag === "r" || localTag === "run" || localTag === "ctrl") {
847
+ walkChildren(el, d + 1);
862
848
  }
863
849
  }
864
- }
850
+ };
851
+ walkChildren(node, depth);
865
852
  return tableCtx;
866
853
  }
867
854
  function extractParagraphInfo(para, styleMap) {
@@ -900,7 +887,10 @@ function extractParagraphInfo(para, styleMap) {
900
887
  // 하이퍼링크
901
888
  case "hyperlink": {
902
889
  const url = child.getAttribute("url") || child.getAttribute("href") || "";
903
- if (url) href = url;
890
+ if (url) {
891
+ const safe = sanitizeHref(url);
892
+ if (safe) href = safe;
893
+ }
904
894
  walk(child);
905
895
  break;
906
896
  }
@@ -913,6 +903,29 @@ function extractParagraphInfo(para, styleMap) {
913
903
  if (noteText) footnote = (footnote ? footnote + "; " : "") + noteText;
914
904
  break;
915
905
  }
906
+ // 제어 요소 — 필드, 컨트롤, 매개변수 등 스킵
907
+ case "ctrl":
908
+ case "fieldBegin":
909
+ case "fieldEnd":
910
+ case "parameters":
911
+ case "stringParam":
912
+ case "integerParam":
913
+ case "boolParam":
914
+ case "floatParam":
915
+ case "secPr":
916
+ // 섹션 속성 (페이지 설정 등)
917
+ case "colPr":
918
+ // 다단 속성
919
+ case "linesegarray":
920
+ case "lineseg":
921
+ // 레이아웃 정보
922
+ // 도형/이미지 요소 — 대체텍스트("사각형입니다." 등) 누출 방지
923
+ case "pic":
924
+ case "shape":
925
+ case "drawingObject":
926
+ case "shapeComment":
927
+ case "drawText":
928
+ break;
916
929
  // run 요소에서 charPrIDRef 추출
917
930
  case "r": {
918
931
  const runCharPr = child.getAttribute("charPrIDRef");
@@ -927,7 +940,10 @@ function extractParagraphInfo(para, styleMap) {
927
940
  }
928
941
  };
929
942
  walk(para);
930
- const cleanText = text.replace(/[ \t]+/g, " ").trim();
943
+ let cleanText = text.replace(/[ \t]+/g, " ").trim();
944
+ if (/^그림입니다\.?\s*원본\s*그림의\s*(이름|크기)/.test(cleanText)) cleanText = "";
945
+ cleanText = cleanText.replace(/그림입니다\.?\s*원본\s*그림의\s*(이름|크기)[^\n]*(\n[^\n]*원본\s*그림의\s*(이름|크기)[^\n]*)*/g, "").trim();
946
+ cleanText = cleanText.replace(/(?:모서리가 둥근 |둥근 )?(?:사각형|직사각형|정사각형|원|타원|삼각형|선|직선|곡선|화살표|오각형|육각형|팔각형|별|십자|구름|마름모|도넛|평행사변형|사다리꼴|개체|그리기\s?개체|묶음\s?개체|글상자|수식|표|그림|OLE\s?개체)\s?입니다\.?/g, "").trim();
931
947
  let style;
932
948
  if (styleMap && charPrId) {
933
949
  const charProp = styleMap.charProperties.get(charPrId);
@@ -1205,9 +1221,9 @@ function detectHwp5Headings(blocks, docInfo) {
1205
1221
  if (/^\d+$/.test(text)) continue;
1206
1222
  const ratio = block.style.fontSize / baseFontSize;
1207
1223
  let level = 0;
1208
- if (ratio >= 1.5) level = 1;
1209
- else if (ratio >= 1.3) level = 2;
1210
- else if (ratio >= 1.15) level = 3;
1224
+ if (ratio >= HEADING_RATIO_H1) level = 1;
1225
+ else if (ratio >= HEADING_RATIO_H2) level = 2;
1226
+ else if (ratio >= HEADING_RATIO_H3) level = 3;
1211
1227
  if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
1212
1228
  if (level === 0) level = 3;
1213
1229
  }
@@ -1308,20 +1324,22 @@ function detectImageMime(data) {
1308
1324
  }
1309
1325
  function extractHwp5Images(cfb, blocks, compressed, warnings) {
1310
1326
  const binDataMap = /* @__PURE__ */ new Map();
1311
- for (let idx = 0; idx < 1e4; idx++) {
1312
- const entry = CFB.find(cfb, `/BinData/BIN${String(idx).padStart(4, "0")}`) || CFB.find(cfb, `/BinData/Bin${String(idx).padStart(4, "0")}`);
1313
- if (!entry?.content) {
1314
- if (idx > 0) break;
1315
- continue;
1316
- }
1317
- let data = Buffer.from(entry.content);
1318
- if (compressed) {
1319
- try {
1320
- data = decompressStream(data);
1321
- } catch {
1327
+ const binDataRe = /\/BinData\/[Bb][Ii][Nn](\d{4})$/;
1328
+ if (cfb.FileIndex) {
1329
+ for (const entry of cfb.FileIndex) {
1330
+ if (!entry?.name || !entry.content) continue;
1331
+ const match = entry.name.match(binDataRe);
1332
+ if (!match) continue;
1333
+ const idx = parseInt(match[1], 10);
1334
+ let data = Buffer.from(entry.content);
1335
+ if (compressed) {
1336
+ try {
1337
+ data = decompressStream(data);
1338
+ } catch {
1339
+ }
1322
1340
  }
1341
+ binDataMap.set(idx, { data, name: entry.name });
1323
1342
  }
1324
- binDataMap.set(idx, { data, name: entry.name || `BIN${idx}` });
1325
1343
  }
1326
1344
  if (binDataMap.size === 0) return [];
1327
1345
  const images = [];
@@ -1468,6 +1486,16 @@ function parseTableBlock(records, startIdx) {
1468
1486
  i++;
1469
1487
  }
1470
1488
  if (rows === 0 || cols === 0 || cells.length === 0) return { table: null, nextIdx: i };
1489
+ const hasAddr = cells.some((c) => c.colAddr !== void 0 && c.rowAddr !== void 0);
1490
+ if (hasAddr) {
1491
+ const cellRows2 = arrangeCells(rows, cols, cells);
1492
+ const irCells = cellRows2.map((row) => row.map((c) => ({
1493
+ text: c.text.trim(),
1494
+ colSpan: c.colSpan,
1495
+ rowSpan: c.rowSpan
1496
+ })));
1497
+ return { table: { rows, cols, cells: irCells, hasHeader: rows > 1 }, nextIdx: i };
1498
+ }
1471
1499
  const cellRows = arrangeCells(rows, cols, cells);
1472
1500
  return { table: buildTable(cellRows), nextIdx: i };
1473
1501
  }
@@ -1731,7 +1759,36 @@ function buildTableGrids(horizontals, verticals) {
1731
1759
  };
1732
1760
  grids.push({ rowYs, colXs, bbox });
1733
1761
  }
1734
- return grids;
1762
+ return mergeAdjacentGrids(grids);
1763
+ }
1764
/**
 * Fuses table grids that sit directly on top of each other and share the
 * same column layout, so a table split into several grids is treated as one.
 *
 * Grids are visited in descending `bbox.y2` order. A grid is merged into the
 * previously kept one when:
 *  - both have the same number of column coordinates,
 *  - every column coordinate differs by at most `COORD_MERGE_TOL * 3`, and
 *  - the gap `previous.bbox.y1 - current.bbox.y2` lies within
 *    `[-COORD_MERGE_TOL, 20]`.
 * The merged grid keeps the previous grid's `colXs`, the de-duplicated union
 * of both `rowYs` (sorted descending), and the union of both bounding boxes.
 *
 * @param {Array<{rowYs: number[], colXs: number[], bbox: {x1: number, y1: number, x2: number, y2: number}}>} grids
 * @returns {Array} The input array itself when it has at most one grid,
 *   otherwise a new array; the input is never reordered.
 */
function mergeAdjacentGrids(grids) {
  if (grids.length <= 1) return grids;
  const ordered = Array.from(grids).sort((g1, g2) => g2.bbox.y2 - g1.bbox.y2);
  const result = [ordered[0]];
  for (const lower of ordered.slice(1)) {
    const lastIdx = result.length - 1;
    const upper = result[lastIdx];
    let stackable = false;
    if (upper.colXs.length === lower.colXs.length) {
      const sameColumns = upper.colXs.every(
        (x, ci) => Math.abs(x - lower.colXs[ci]) <= COORD_MERGE_TOL * 3
      );
      const gap = upper.bbox.y1 - lower.bbox.y2;
      stackable = sameColumns && gap >= -COORD_MERGE_TOL && gap <= 20;
    }
    if (!stackable) {
      result.push(lower);
      continue;
    }
    const rowYs = Array.from(/* @__PURE__ */ new Set([...upper.rowYs, ...lower.rowYs]));
    rowYs.sort((a, b) => b - a);
    result[lastIdx] = {
      rowYs,
      colXs: upper.colXs,
      bbox: {
        x1: Math.min(upper.bbox.x1, lower.bbox.x1),
        y1: Math.min(upper.bbox.y1, lower.bbox.y1),
        x2: Math.max(upper.bbox.x2, lower.bbox.x2),
        y2: Math.max(upper.bbox.y2, lower.bbox.y2)
      }
    };
  }
  return result;
}
1736
1793
  function clusterCoordinates(values) {
1737
1794
  if (values.length === 0) return [];
@@ -1918,7 +1975,11 @@ function cellTextToString(items) {
1918
1975
  for (let j = 1; j < s.length; j++) {
1919
1976
  const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
1920
1977
  const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
1921
- if (gap < avgFs * 0.3 && /[가-힣]$/.test(result) && /^[가-힣]/.test(s[j].text)) {
1978
+ const prevIsKorean = /[가-힣]$/.test(result);
1979
+ const currIsKorean = /^[가-힣]/.test(s[j].text);
1980
+ if (gap < avgFs * 0.15) {
1981
+ result += s[j].text;
1982
+ } else if (gap < avgFs * 0.35 && (prevIsKorean || currIsKorean)) {
1922
1983
  result += s[j].text;
1923
1984
  } else {
1924
1985
  result += " " + s[j].text;
@@ -1933,6 +1994,12 @@ function cellTextToString(items) {
1933
1994
  const curr = textLines[i];
1934
1995
  if (/[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <= 8 && !curr.includes(" ")) {
1935
1996
  merged[merged.length - 1] = prev + curr;
1997
+ } else if (curr.trim().length <= 3 && /^[)\]%}]/.test(curr.trim())) {
1998
+ merged[merged.length - 1] = prev + curr.trim();
1999
+ } else if (/[,(]$/.test(prev.trim()) && curr.trim().length <= 15) {
2000
+ merged[merged.length - 1] = prev + curr.trim();
2001
+ } else if (/[\d,]$/.test(prev) && /^[\d,]+[)\]]?$/.test(curr.trim()) && curr.trim().length <= 10) {
2002
+ merged[merged.length - 1] = prev + curr.trim();
1936
2003
  } else {
1937
2004
  merged.push(curr);
1938
2005
  }
@@ -2145,21 +2212,26 @@ async function loadPdfWithTimeout(buffer) {
2145
2212
  disableFontFace: true,
2146
2213
  isEvalSupported: false
2147
2214
  });
2148
- return Promise.race([
2149
- loadingTask.promise,
2150
- new Promise(
2151
- (_, reject) => setTimeout(() => {
2152
- loadingTask.destroy();
2153
- reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
2154
- }, PDF_LOAD_TIMEOUT_MS)
2155
- )
2156
- ]);
2215
+ let timer;
2216
+ try {
2217
+ return await Promise.race([
2218
+ loadingTask.promise,
2219
+ new Promise((_, reject) => {
2220
+ timer = setTimeout(() => {
2221
+ loadingTask.destroy();
2222
+ reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
2223
+ }, PDF_LOAD_TIMEOUT_MS);
2224
+ })
2225
+ ]);
2226
+ } finally {
2227
+ if (timer !== void 0) clearTimeout(timer);
2228
+ }
2157
2229
  }
2158
2230
  async function parsePdfDocument(buffer, options) {
2159
2231
  const doc = await loadPdfWithTimeout(buffer);
2160
2232
  try {
2161
2233
  const pageCount = doc.numPages;
2162
- if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
2234
+ if (pageCount === 0) throw new KordocError("PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
2163
2235
  const metadata = { pageCount };
2164
2236
  await extractPdfMetadata(doc, metadata);
2165
2237
  const blocks = [];
@@ -2212,14 +2284,14 @@ async function parsePdfDocument(buffer, options) {
2212
2284
  const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
2213
2285
  if (ocrBlocks.length > 0) {
2214
2286
  const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
2215
- return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true, warnings };
2287
+ return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true };
2216
2288
  }
2217
2289
  } catch {
2218
2290
  }
2219
2291
  }
2220
- return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`, code: "IMAGE_BASED_PDF" };
2292
+ throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
2221
2293
  }
2222
- if (options?.removeHeaderFooter && parsedPageCount >= 3) {
2294
+ if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
2223
2295
  const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
2224
2296
  for (let ri = removed.length - 1; ri >= 0; ri--) {
2225
2297
  blocks.splice(removed[ri], 1);
@@ -2229,9 +2301,10 @@ async function parsePdfDocument(buffer, options) {
2229
2301
  if (medianFontSize > 0) {
2230
2302
  detectHeadings(blocks, medianFontSize);
2231
2303
  }
2304
+ detectMarkerHeadings(blocks);
2232
2305
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2233
2306
  let markdown = cleanPdfText(blocksToMarkdown(blocks));
2234
- return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
2307
+ return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
2235
2308
  } finally {
2236
2309
  await doc.destroy().catch(() => {
2237
2310
  });
@@ -2302,12 +2375,67 @@ function detectHeadings(blocks, medianFontSize) {
2302
2375
  if (/^\d+$/.test(text)) continue;
2303
2376
  const ratio = block.style.fontSize / medianFontSize;
2304
2377
  let level = 0;
2305
- if (ratio >= 1.5) level = 1;
2306
- else if (ratio >= 1.3) level = 2;
2307
- else if (ratio >= 1.15) level = 3;
2378
+ if (ratio >= HEADING_RATIO_H1) level = 1;
2379
+ else if (ratio >= HEADING_RATIO_H2) level = 2;
2380
+ else if (ratio >= HEADING_RATIO_H3) level = 3;
2308
2381
  if (level > 0) {
2309
2382
  block.type = "heading";
2310
2383
  block.level = level;
2384
+ block.text = collapseEvenSpacing(text);
2385
+ }
2386
+ }
2387
+ }
2388
/**
 * Collapses text whose characters were laid out with a space between each
 * one (evenly spaced titles).
 *
 * The text is split on single spaces. When there are at least three tokens
 * and 70% or more of them are exactly one UTF-16 code unit long, the tokens
 * are concatenated without separators; otherwise the input is returned as is.
 *
 * @param {string} text - Line or heading text to inspect.
 * @returns {string} The collapsed text, or `text` unchanged.
 */
function collapseEvenSpacing(text) {
  const pieces = text.split(" ");
  if (pieces.length < 3) return text;
  let oneCharPieces = 0;
  for (const piece of pieces) {
    if (piece.length === 1) oneCharPieces += 1;
  }
  return oneCharPieces / pieces.length >= 0.7 ? pieces.join("") : text;
}
2396
/**
 * Decides whether a detected table should be rendered as plain text instead.
 *
 * Rules, applied in order:
 *  1. More than 200 characters of joined cell text: keep as a table.
 *  2. Contains a box/bullet marker and has at most 3 rows: demote.
 *  3. At most 2 rows and more than half of rows * cols is empty: demote.
 *  4. A single row without any run of two or more digits: demote.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {boolean} True when the table should be demoted to a paragraph.
 */
function shouldDemoteTable(table) {
  const filled = [];
  for (const row of table.cells) {
    for (const cell of row) {
      const trimmed = cell.text.trim();
      if (trimmed) filled.push(trimmed);
    }
  }
  const joined = filled.join(" ");
  if (joined.length > 200) return false;

  const { rows, cols } = table;
  if (rows <= 3 && /[□■◆○●▶]/.test(joined)) return true;

  const capacity = rows * cols;
  const blankCount = capacity - filled.length;
  if (rows <= 2 && blankCount > capacity * 0.5) return true;

  return rows === 1 && !/\d{2,}/.test(joined);
}
2407
/**
 * Flattens a table into newline-separated text lines, one per non-empty row.
 *
 * Empty cells are dropped. In a two-column table, a row with both cells
 * filled is rendered as "left : right"; every other row joins its cells with
 * a single space.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {string} The text rendering ("" when every row is empty).
 */
function demoteTableToText(table) {
  const isPairLayout = table.cols === 2;
  const out = [];
  let rowIndex = 0;
  while (rowIndex < table.rows) {
    const filled = table.cells[rowIndex].map((cell) => cell.text.trim()).filter((value) => value !== "");
    rowIndex += 1;
    if (filled.length === 0) continue;
    const line = isPairLayout && filled.length === 2 ? `${filled[0]} : ${filled[1]}` : filled.join(" ");
    out.push(line);
  }
  return out.join("\n");
}
2420
/**
 * Promotes marker-led and short label paragraphs to headings, in place.
 *
 * - A paragraph shorter than 50 characters that starts with a box/diamond
 *   marker followed by Hangul becomes a level-4 heading.
 * - A paragraph of 2-6 Hangul characters with a font size becomes a level-3
 *   heading when the previous block is missing/table/heading/separator, or
 *   the next block is missing/table/heading/a marker-led paragraph.
 *
 * Blocks are visited in order, so a promotion made earlier in the pass is
 * visible when the following block inspects its predecessor.
 *
 * @param {Array<object>} blocks - Document blocks; mutated in place.
 * @returns {void}
 */
function detectMarkerHeadings(blocks) {
  const markerTitle = /^[□■◆◇▶]\s*[가-힣]/;
  const shortLabel = /^[가-힣]{2,6}$/;
  const markerLead = /^[□■◆○●]/;

  for (const [index, block] of blocks.entries()) {
    if (block.type !== "paragraph" || !block.text) continue;
    const trimmed = block.text.trim();

    if (trimmed.length < 50 && markerTitle.test(trimmed)) {
      block.type = "heading";
      block.level = 4;
      continue;
    }

    if (!shortLabel.test(trimmed) || !block.style?.fontSize) continue;

    const before = blocks[index - 1];
    const after = blocks[index + 1];
    const beforeIsStructural = !before || ["table", "heading", "separator"].includes(before.type);
    const afterIsMarkerParagraph = Boolean(
      after && after.type === "paragraph" && after.text && markerLead.test(after.text.trim())
    );
    const afterIsStructural = !after || after.type === "table" || after.type === "heading" || afterIsMarkerParagraph;

    if (beforeIsStructural || afterIsStructural) {
      block.type = "heading";
      block.level = 3;
    }
  }
}
@@ -2344,7 +2472,7 @@ function computeRegion(items) {
2344
2472
  }
2345
2473
  return { items, minX, minY, maxX, maxY };
2346
2474
  }
2347
- function findYSplit(items, region, gapThreshold) {
2475
+ function findYSplit(items, _region, gapThreshold) {
2348
2476
  const sorted = [...items].sort((a, b) => b.y - a.y);
2349
2477
  let bestGap = gapThreshold;
2350
2478
  let bestSplit = null;
@@ -2359,7 +2487,7 @@ function findYSplit(items, region, gapThreshold) {
2359
2487
  }
2360
2488
  return bestSplit;
2361
2489
  }
2362
- function findXSplit(items, region, gapThreshold) {
2490
+ function findXSplit(items, _region, gapThreshold) {
2363
2491
  const sorted = [...items].sort((a, b) => a.x - b.x);
2364
2492
  let bestGap = gapThreshold;
2365
2493
  let bestSplit = null;
@@ -2418,7 +2546,8 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
2418
2546
  );
2419
2547
  for (const cell of cells) {
2420
2548
  const cellItems = cellTextMap.get(cell) || [];
2421
- const text = cellTextToString(cellItems);
2549
+ let text = cellTextToString(cellItems);
2550
+ text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
2422
2551
  irGrid[cell.row][cell.col] = {
2423
2552
  text,
2424
2553
  colSpan: cell.colSpan,
@@ -2433,18 +2562,21 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
2433
2562
  };
2434
2563
  const hasContent = irGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
2435
2564
  if (!hasContent) continue;
2436
- blocks.push({
2437
- type: "table",
2438
- table: irTable,
2439
- pageNumber: pageNum,
2440
- bbox: {
2441
- page: pageNum,
2442
- x: grid.bbox.x1,
2443
- y: grid.bbox.y1,
2444
- width: grid.bbox.x2 - grid.bbox.x1,
2445
- height: grid.bbox.y2 - grid.bbox.y1
2565
+ const tableBbox = {
2566
+ page: pageNum,
2567
+ x: grid.bbox.x1,
2568
+ y: grid.bbox.y1,
2569
+ width: grid.bbox.x2 - grid.bbox.x1,
2570
+ height: grid.bbox.y2 - grid.bbox.y1
2571
+ };
2572
+ if (shouldDemoteTable(irTable)) {
2573
+ const demoted = demoteTableToText(irTable);
2574
+ if (demoted) {
2575
+ blocks.push({ type: "paragraph", text: demoted, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
2446
2576
  }
2447
- });
2577
+ continue;
2578
+ }
2579
+ blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
2448
2580
  }
2449
2581
  const remaining = items.filter((i) => !usedItems.has(i));
2450
2582
  if (remaining.length > 0) {
@@ -2456,9 +2588,29 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
2456
2588
  const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
2457
2589
  return by - ay;
2458
2590
  });
2459
- return allBlocks;
2591
+ return mergeAdjacentTableBlocks(allBlocks);
2460
2592
  }
2461
- return blocks;
2593
+ return mergeAdjacentTableBlocks(blocks);
2594
+ }
2595
/**
 * Merges consecutive table blocks that have the same column count into one.
 *
 * The merged table concatenates the rows of both tables and keeps the first
 * table's `hasHeader`. Other block properties come from the first block,
 * except `bbox`: when both blocks carry one, the merged block gets the union
 * of the two rectangles so it covers every row it now contains (previously
 * it kept only the first table's box).
 *
 * Input blocks are not mutated.
 *
 * @param {Array<object>} blocks - Blocks in reading order.
 * @returns {Array<object>} `blocks` itself when it has 0 or 1 entries,
 *   otherwise a new array with adjacent compatible tables merged.
 */
function mergeAdjacentTableBlocks(blocks) {
  if (blocks.length <= 1) return blocks;

  // Smallest rectangle containing both boxes; extra fields (e.g. page) come from `a`.
  const unionBbox = (a, b) => {
    const x = Math.min(a.x, b.x);
    const y = Math.min(a.y, b.y);
    return {
      ...a,
      x,
      y,
      width: Math.max(a.x + a.width, b.x + b.width) - x,
      height: Math.max(a.y + a.height, b.y + b.height) - y
    };
  };

  const result = [blocks[0]];
  for (let i = 1; i < blocks.length; i++) {
    const prev = result[result.length - 1];
    const curr = blocks[i];
    const mergeable = prev.type === "table" && curr.type === "table" && prev.table && curr.table && prev.table.cols === curr.table.cols;
    if (!mergeable) {
      result.push(curr);
      continue;
    }
    const combined = {
      ...prev,
      table: {
        rows: prev.table.rows + curr.table.rows,
        cols: prev.table.cols,
        cells: [...prev.table.cells, ...curr.table.cells],
        hasHeader: prev.table.hasHeader
      }
    };
    if (prev.bbox && curr.bbox) combined.bbox = unionBbox(prev.bbox, curr.bbox);
    result[result.length - 1] = combined;
  }
  return result;
}
2463
2615
  function extractPageBlocksFallback(items, pageNum) {
2464
2616
  if (items.length === 0) return [];
@@ -2481,11 +2633,13 @@ function extractPageBlocksFallback(items, pageNum) {
2481
2633
  }));
2482
2634
  const clusterResults = detectClusterTables(clusterItems, pageNum);
2483
2635
  if (clusterResults.length > 0) {
2636
+ const ciToIdx = /* @__PURE__ */ new Map();
2637
+ for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
2484
2638
  const usedIndices = /* @__PURE__ */ new Set();
2485
2639
  for (const cr of clusterResults) {
2486
2640
  for (const ci of cr.usedItems) {
2487
- const idx = clusterItems.indexOf(ci);
2488
- if (idx >= 0) usedIndices.add(idx);
2641
+ const idx = ciToIdx.get(ci);
2642
+ if (idx !== void 0) usedIndices.add(idx);
2489
2643
  }
2490
2644
  blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
2491
2645
  }
@@ -2796,7 +2950,8 @@ function mergeLineSimple(items) {
2796
2950
  const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
2797
2951
  const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
2798
2952
  if (gap > 15) result += " ";
2799
- else if (gap < avgFs * 0.3 && /[가-힣]$/.test(result) && /^[가-힣]/.test(sorted[i].text)) {
2953
+ else if (gap < avgFs * 0.15) {
2954
+ } else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
2800
2955
  } else if (gap > 3) result += " ";
2801
2956
  result += sorted[i].text;
2802
2957
  }
@@ -2804,8 +2959,8 @@ function mergeLineSimple(items) {
2804
2959
  }
2805
2960
  function cleanPdfText(text) {
2806
2961
  return mergeKoreanLines(
2807
- text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "")
2808
- ).replace(/\n{3,}/g, "\n\n").trim();
2962
+ text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
2963
+ ).replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line)).replace(/\n{3,}/g, "\n\n").trim();
2809
2964
  }
2810
2965
  function startsWithMarker(line) {
2811
2966
  const t = line.trimStart();
@@ -2819,15 +2974,13 @@ function detectListBlocks(blocks) {
2819
2974
  for (let i = 0; i < blocks.length; i++) {
2820
2975
  const block = blocks[i];
2821
2976
  if (block.type === "paragraph" && block.text) {
2822
- const match = block.text.match(/^(\d+)\.\s/);
2823
- if (match) {
2824
- result.push({
2825
- ...block,
2826
- type: "list",
2827
- listType: "ordered",
2828
- // 원래 번호를 text에 보존 (blocksToMarkdown에서 그대로 출력)
2829
- text: block.text
2830
- });
2977
+ const text = block.text.trim();
2978
+ if (/^\d+\.\s/.test(text)) {
2979
+ result.push({ ...block, type: "list", listType: "ordered", text: block.text });
2980
+ continue;
2981
+ }
2982
+ if (/^[○●·※▶▷◆◇\-]\s/.test(text)) {
2983
+ result.push({ ...block, type: "list", listType: "unordered", text: block.text });
2831
2984
  continue;
2832
2985
  }
2833
2986
  }
@@ -2986,11 +3139,20 @@ function mergeKoreanLines(text) {
2986
3139
  for (let i = 1; i < lines.length; i++) {
2987
3140
  const prev = result[result.length - 1];
2988
3141
  const curr = lines[i];
2989
- if (/^#{1,6}\s/.test(prev) || /^#{1,6}\s/.test(curr)) {
3142
+ const currTrimmed = curr.trim();
3143
+ if (/^#{1,6}\s/.test(prev) || /^#{1,6}\s/.test(curr) || /^\|/.test(currTrimmed) || /^---/.test(currTrimmed)) {
2990
3144
  result.push(curr);
2991
3145
  continue;
2992
3146
  }
2993
- if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !curr.trimStart().startsWith("|") && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
3147
+ if (/,$/.test(prev.trim()) && currTrimmed.length > 0) {
3148
+ result[result.length - 1] = prev + "\n" + curr;
3149
+ continue;
3150
+ }
3151
+ if (/^\(※/.test(currTrimmed)) {
3152
+ result[result.length - 1] = prev + " " + currTrimmed;
3153
+ continue;
3154
+ }
3155
+ if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
2994
3156
  result[result.length - 1] = prev + " " + curr;
2995
3157
  } else {
2996
3158
  result.push(curr);
@@ -3002,6 +3164,716 @@ function mergeKoreanLines(text) {
3002
3164
  // src/index.ts
3003
3165
  import { readFile } from "fs/promises";
3004
3166
 
3167
+ // src/xlsx/parser.ts
3168
+ import JSZip3 from "jszip";
3169
+ import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
3170
+ var MAX_SHEETS = 100;
3171
+ var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024;
3172
+ var MAX_ROWS2 = 1e4;
3173
+ var MAX_COLS2 = 200;
3174
/**
 * Removes binary floating-point noise from a plain decimal cell value
 * (e.g. "0.30000000000000004" -> "0.3").
 *
 * Only strings of the form `-?digits.digits` are touched. The value is
 * rounded to 15 significant digits. If the rounded number would print in
 * exponent notation (very small or very large magnitudes, e.g. "1e-7"), the
 * original text is returned instead, so a plain decimal is never rewritten
 * into scientific notation.
 *
 * @param {string} raw - Raw `<v>` text of a worksheet cell.
 * @returns {string} The cleaned decimal string, or `raw` unchanged.
 */
function cleanNumericValue(raw) {
  if (!/^-?\d+\.\d+$/.test(raw)) return raw;
  const parsed = Number.parseFloat(raw);
  if (!Number.isFinite(parsed)) return raw;
  const cleaned = String(Number.parseFloat(parsed.toPrecision(15)));
  // Number#toString switches to exponent form below 1e-6 and from 1e21 up.
  return /e/i.test(cleaned) ? raw : cleaned;
}
3181
/**
 * Converts an A1-style cell reference into zero-based coordinates.
 *
 * Column letters are read as a bijective base-26 number (A=1 ... Z=26,
 * AA=27). Only upper-case letters followed by digits are accepted.
 *
 * @param {string} ref - Cell reference such as "B12".
 * @returns {{col: number, row: number} | null} Zero-based position, or null
 *   when `ref` does not match the expected form.
 */
function parseCellRef(ref) {
  const match = ref.match(/^([A-Z]+)(\d+)$/);
  if (match === null) return null;
  const [, letters, digits] = match;
  const columnNumber = [...letters].reduce((acc, letter) => acc * 26 + (letter.charCodeAt(0) - 64), 0);
  return { col: columnNumber - 1, row: parseInt(digits, 10) - 1 };
}
3188
/**
 * Parses a merge range such as "A1:C3" into zero-based corner coordinates.
 *
 * @param {string} ref - Value of a `mergeCell` `ref` attribute.
 * @returns {{startCol: number, startRow: number, endCol: number, endRow: number} | null}
 *   The range corners, or null when `ref` is not two valid cell references
 *   separated by a single colon.
 */
function parseMergeRef(ref) {
  const corners = ref.split(":");
  if (corners.length !== 2) return null;
  const [topLeft, bottomRight] = corners.map((corner) => parseCellRef(corner));
  if (!topLeft || !bottomRight) return null;
  return {
    startCol: topLeft.col,
    startRow: topLeft.row,
    endCol: bottomRight.col,
    endRow: bottomRight.row
  };
}
3196
/**
 * Returns the descendants of `parent` with the given tag name as a plain
 * array (a snapshot of the node list returned by `getElementsByTagName`).
 *
 * @param {{getElementsByTagName: Function}} parent - Element or document to search.
 * @param {string} tagName - Tag name passed to `getElementsByTagName`.
 * @returns {Array<object>} Matching elements in node-list order.
 */
function getElements(parent, tagName) {
  const matches = parent.getElementsByTagName(tagName);
  const snapshot = [];
  let index = 0;
  while (index < matches.length) {
    snapshot.push(matches[index]);
    index += 1;
  }
  return snapshot;
}
3202
/**
 * Returns the trimmed text content of a node, or "" when it has none.
 *
 * @param {{textContent?: string | null}} el - Node to read.
 * @returns {string} Trimmed text.
 */
function getTextContent(el) {
  const raw = el.textContent;
  if (raw === null || raw === undefined) return "";
  return raw.trim();
}
3205
/**
 * Parses an XML string into a DOM document using the xmldom parser.
 *
 * @param {string} text - XML source.
 * @returns {object} The parsed document.
 */
function parseXml(text) {
  const parser = new DOMParser2();
  return parser.parseFromString(text, "text/xml");
}
3208
/**
 * Parses `xl/sharedStrings.xml` into the list of shared strings.
 *
 * Each `<si>` yields exactly one entry, so the array index equals the shared
 * string index referenced by cells of type "s". The text of an entry is the
 * concatenation of its `<t>` elements (plain text and rich-text runs).
 * `<t>` elements that sit inside a phonetic run (`<rPh>`) are excluded: they
 * carry pronunciation hints, not cell text, and would otherwise be appended
 * to the visible string.
 *
 * @param {string} xml - Contents of the shared string table part.
 * @returns {string[]} Shared strings in declaration order.
 */
function parseSharedStrings(xml) {
  const doc = parseXml(xml);
  const isPhoneticText = (t) => {
    const parent = t.parentNode;
    return Boolean(parent) && parent.nodeType === 1 && /(?:^|:)rPh$/.test(parent.tagName ?? "");
  };
  return getElements(doc.documentElement, "si").map(
    (si) => getElements(si, "t").filter((t) => !isPhoneticText(t)).map((t) => t.textContent ?? "").join("")
  );
}
3218
/**
 * Reads the sheet list from `xl/workbook.xml`.
 *
 * The sheet name falls back to "Sheet<N>" when the `name` attribute is
 * missing OR empty. The fallback uses `||` rather than `??` because DOM
 * implementations differ on whether a missing attribute reads as null or as
 * an empty string; with `??` the fallback is skipped in the latter case.
 *
 * @param {string} xml - Contents of the workbook part.
 * @returns {Array<{name: string, sheetId: string, rId: string}>} Sheets in
 *   workbook order; `rId` is the relationship id used to locate the sheet part.
 */
function parseWorkbook(xml) {
  const doc = parseXml(xml);
  const sheets = [];
  for (const el of getElements(doc.documentElement, "sheet")) {
    sheets.push({
      name: el.getAttribute("name") || `Sheet${sheets.length + 1}`,
      sheetId: el.getAttribute("sheetId") ?? "",
      rId: el.getAttribute("r:id") ?? ""
    });
  }
  return sheets;
}
3231
/**
 * Parses a relationships part (`*.rels`) into an id -> target map.
 *
 * Relationships lacking either `Id` or `Target` are ignored.
 *
 * @param {string} xml - Contents of the relationships part.
 * @returns {Map<string, string>} Relationship id mapped to its target path.
 */
function parseRels(xml) {
  const root = parseXml(xml).documentElement;
  const targetsById = /* @__PURE__ */ new Map();
  getElements(root, "Relationship").forEach((relationship) => {
    const id = relationship.getAttribute("Id");
    const target = relationship.getAttribute("Target");
    if (id && target) targetsById.set(id, target);
  });
  return targetsById;
}
3242
/**
 * Parses a worksheet part into a sparse-filled text grid plus merge ranges.
 *
 * Cell values are resolved as follows: shared-string cells (`t="s"`) look up
 * `sharedStrings`, boolean cells (`t="b"`) become "TRUE"/"FALSE", other
 * cells with a `<v>` go through `cleanNumericValue`, inline strings read
 * their `<is><t>` text, and a cell with no value but a formula is rendered
 * as "=<formula>".
 *
 * Cells are placed by their own `r` reference. That reference is validated
 * against both MAX_COLS2 and MAX_ROWS2: the `<row r>` check alone does not
 * stop a cell such as `<c r="A99999999">` inside row 1 from growing the grid
 * without bound, and a row index below zero ("A0") would index outside the
 * grid and throw.
 *
 * @param {string} xml - Contents of the worksheet part.
 * @param {string[]} sharedStrings - Shared string table of the workbook.
 * @returns {{grid: string[][], merges: Array<object>, maxRow: number, maxCol: number}}
 *   The grid (rows may be shorter than `maxCol + 1`), parsed merge ranges,
 *   and the largest zero-based row/column index that received a cell.
 */
function parseWorksheet(xml, sharedStrings) {
  const doc = parseXml(xml);
  const grid = [];
  let maxRow = 0;
  let maxCol = 0;
  for (const rowEl of getElements(doc.documentElement, "row")) {
    const rowNum = parseInt(rowEl.getAttribute("r") ?? "0", 10) - 1;
    if (rowNum < 0 || rowNum >= MAX_ROWS2) continue;
    for (const cellEl of getElements(rowEl, "c")) {
      const ref = cellEl.getAttribute("r");
      if (!ref) continue;
      const pos = parseCellRef(ref);
      if (!pos || pos.col >= MAX_COLS2) continue;
      // The grid is indexed by the cell's own reference, so bound that too.
      if (pos.row < 0 || pos.row >= MAX_ROWS2) continue;
      const type = cellEl.getAttribute("t");
      const vElements = getElements(cellEl, "v");
      const fElements = getElements(cellEl, "f");
      let value = "";
      if (vElements.length > 0) {
        const raw = getTextContent(vElements[0]);
        if (type === "s") {
          const idx = parseInt(raw, 10);
          value = sharedStrings[idx] ?? "";
        } else if (type === "b") {
          value = raw === "1" ? "TRUE" : "FALSE";
        } else {
          value = cleanNumericValue(raw);
        }
      } else if (type === "inlineStr") {
        const isEl = getElements(cellEl, "is");
        if (isEl.length > 0) {
          value = getElements(isEl[0], "t").map((t) => t.textContent ?? "").join("");
        }
      }
      // No cached value: show the formula text instead of an empty cell.
      if (!value && fElements.length > 0) {
        value = `=${getTextContent(fElements[0])}`;
      }
      while (grid.length <= pos.row) grid.push([]);
      while (grid[pos.row].length <= pos.col) grid[pos.row].push("");
      grid[pos.row][pos.col] = value;
      if (pos.row > maxRow) maxRow = pos.row;
      if (pos.col > maxCol) maxCol = pos.col;
    }
  }
  const merges = [];
  for (const el of getElements(doc.documentElement, "mergeCell")) {
    const ref = el.getAttribute("ref");
    if (!ref) continue;
    const m = parseMergeRef(ref);
    if (m) merges.push(m);
  }
  return { grid, merges, maxRow, maxCol };
}
3298
/**
 * Converts one parsed worksheet into document blocks: a level-2 heading with
 * the sheet name (when non-empty) followed by one table covering the rows
 * between the first and last non-empty row.
 *
 * Merge ranges set colSpan/rowSpan on their top-left cell; the other cells
 * of a range are omitted from the row. Only positions inside the populated
 * grid (rows <= maxRow, columns <= maxCol) are ever looked up, so the skip
 * set is built for that area only. Iterating the full declared range would
 * make a merge such as A1:XFD1048576 allocate billions of keys for no
 * change in output.
 *
 * @param {string} sheetName - Sheet title; falsy values suppress the heading.
 * @param {string[][]} grid - Cell text by [row][col]; rows may be ragged.
 * @param {Array<{startRow: number, startCol: number, endRow: number, endCol: number}>} merges
 * @param {number} maxRow - Largest populated zero-based row index.
 * @param {number} maxCol - Largest populated zero-based column index.
 * @param {number} sheetIndex - Zero-based sheet index; `pageNumber` is this + 1.
 * @returns {Array<object>} Heading and/or table blocks for the sheet.
 */
function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
  const blocks = [];
  if (sheetName) {
    blocks.push({
      type: "heading",
      text: sheetName,
      level: 2,
      pageNumber: sheetIndex + 1
    });
  }
  if (maxRow < 0 || maxCol < 0 || grid.length === 0) return blocks;
  const mergeMap = /* @__PURE__ */ new Map();
  const mergeSkip = /* @__PURE__ */ new Set();
  for (const m of merges) {
    const colSpan = m.endCol - m.startCol + 1;
    const rowSpan = m.endRow - m.startRow + 1;
    mergeMap.set(`${m.startRow},${m.startCol}`, { colSpan, rowSpan });
    // Clamp to the populated area; keys beyond it are never queried below.
    const lastSkipRow = Math.min(m.endRow, maxRow);
    const lastSkipCol = Math.min(m.endCol, maxCol);
    for (let r = m.startRow; r <= lastSkipRow; r++) {
      for (let c = m.startCol; c <= lastSkipCol; c++) {
        if (r !== m.startRow || c !== m.startCol) {
          mergeSkip.add(`${r},${c}`);
        }
      }
    }
  }
  let firstRow = -1;
  let lastRow = -1;
  for (let r = 0; r <= maxRow; r++) {
    const row = grid[r];
    if (row && row.some((cell) => cell !== "")) {
      if (firstRow === -1) firstRow = r;
      lastRow = r;
    }
  }
  if (firstRow === -1) return blocks;
  const cellRows = [];
  for (let r = firstRow; r <= lastRow; r++) {
    const row = [];
    for (let c = 0; c <= maxCol; c++) {
      const key = `${r},${c}`;
      if (mergeSkip.has(key)) continue;
      const text = (grid[r] && grid[r][c]) ?? "";
      const merge = mergeMap.get(key);
      row.push({
        text,
        colSpan: merge?.colSpan ?? 1,
        rowSpan: merge?.rowSpan ?? 1
      });
    }
    cellRows.push(row);
  }
  if (cellRows.length > 0) {
    const table = buildTable(cellRows);
    if (table.rows > 0) {
      blocks.push({ type: "table", table, pageNumber: sheetIndex + 1 });
    }
  }
  return blocks;
}
3357
/**
 * Parses an XLSX workbook into markdown, blocks and metadata.
 *
 * Steps: size pre-check, unzip, read workbook/shared strings/relationships,
 * optionally restrict to the sheets selected by `options.pages`, convert
 * each sheet (at most MAX_SHEETS) to blocks, then read core properties.
 * A sheet whose part is missing or fails to parse is reported as a
 * PARTIAL_PARSE warning and skipped; unreadable core properties are ignored.
 *
 * @param {ArrayBuffer} buffer - Raw workbook bytes.
 * @param {{pages?: unknown, onProgress?: Function}} [options] - `pages` is
 *   handed to `parsePageRange`; `onProgress(current, total)` is called once
 *   per processed sheet.
 * @returns {Promise<{markdown: string, blocks: Array<object>, metadata: object, warnings: (Array<object>|undefined)}>}
 * @throws {KordocError} When `xl/workbook.xml` is missing or lists no sheets.
 */
async function parseXlsxDocument(buffer, options) {
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
  const archive = await JSZip3.loadAsync(buffer);
  const warnings = [];

  const workbookEntry = archive.file("xl/workbook.xml");
  if (!workbookEntry) {
    throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
  }

  const sharedStringsEntry = archive.file("xl/sharedStrings.xml");
  const sharedStrings = sharedStringsEntry ? parseSharedStrings(await sharedStringsEntry.async("text")) : [];

  const sheets = parseWorkbook(await workbookEntry.async("text"));
  if (sheets.length === 0) {
    throw new KordocError("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
  }

  const relsEntry = archive.file("xl/_rels/workbook.xml.rels");
  const relsMap = relsEntry ? parseRels(await relsEntry.async("text")) : /* @__PURE__ */ new Map();

  let pageFilter = null;
  if (options?.pages) {
    const { parsePageRange: parsePageRange2 } = await import("./page-range-737B4EZW.js");
    pageFilter = parsePageRange2(options.pages, sheets.length);
  }

  // Relationship targets are relative to xl/ unless they start with "/".
  const resolveSheetPath = (target, index) => {
    if (!target) return `xl/worksheets/sheet${index + 1}.xml`;
    if (target.startsWith("/")) return target.slice(1);
    return target.startsWith("xl/") ? target : `xl/${target}`;
  };

  const blocks = [];
  const processedSheets = Math.min(sheets.length, MAX_SHEETS);
  for (let i = 0; i < processedSheets; i++) {
    if (pageFilter && !pageFilter.has(i + 1)) continue;
    const sheet = sheets[i];
    options?.onProgress?.(i + 1, processedSheets);
    const sheetPath = resolveSheetPath(relsMap.get(sheet.rId), i);
    const sheetFile = archive.file(sheetPath);
    if (!sheetFile) {
      warnings.push({
        page: i + 1,
        message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${sheetPath}`,
        code: "PARTIAL_PARSE"
      });
      continue;
    }
    try {
      const parsed = parseWorksheet(await sheetFile.async("text"), sharedStrings);
      blocks.push(...sheetToBlocks(sheet.name, parsed.grid, parsed.merges, parsed.maxRow, parsed.maxCol, i));
    } catch (err) {
      warnings.push({
        page: i + 1,
        message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC2F1 \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
        code: "PARTIAL_PARSE"
      });
    }
  }

  const metadata = { pageCount: processedSheets };
  const coreEntry = archive.file("docProps/core.xml");
  if (coreEntry) {
    try {
      const coreDoc = parseXml(await coreEntry.async("text"));
      const firstText = (tag) => {
        const found = coreDoc.getElementsByTagName(tag);
        if (found.length === 0) return void 0;
        return (found[0].textContent ?? "").trim();
      };
      metadata.title = firstText("dc:title") || firstText("dcterms:title");
      metadata.author = firstText("dc:creator");
      metadata.description = firstText("dc:description");
      const created = firstText("dcterms:created");
      if (created) metadata.createdAt = created;
      const modified = firstText("dcterms:modified");
      if (modified) metadata.modifiedAt = modified;
    } catch {
      // Core properties are optional; a malformed part must not fail the parse.
    }
  }

  return {
    markdown: blocksToMarkdown(blocks),
    blocks,
    metadata,
    warnings: warnings.length > 0 ? warnings : void 0
  };
}
3447
+
3448
+ // src/docx/parser.ts
3449
+ import JSZip4 from "jszip";
3450
+ import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
3451
+ var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
3452
/**
 * Returns the direct element children of `parent` whose local name is
 * `localName`, or whose tag name ends with ":<localName>" (prefixed form).
 *
 * @param {{childNodes: ArrayLike<object>}} parent - Node whose children are scanned.
 * @param {string} localName - Element name without namespace prefix.
 * @returns {Array<object>} Matching children in document order.
 */
function getChildElements(parent, localName) {
  const prefixedSuffix = `:${localName}`;
  const matches = [];
  const nodes = parent.childNodes;
  for (let index = 0; index < nodes.length; index += 1) {
    const candidate = nodes[index];
    if (candidate.nodeType !== 1) continue;
    if (candidate.localName === localName || candidate.tagName?.endsWith(prefixedSuffix)) {
      matches.push(candidate);
    }
  }
  return matches;
}
3466
/**
 * Returns every descendant element of `parent` whose local name is
 * `localName`, or whose tag name ends with ":<localName>", in document
 * (pre-order) order. `parent` itself is never included.
 *
 * @param {{childNodes: ArrayLike<object>}} parent - Root of the search.
 * @param {string} localName - Element name without namespace prefix.
 * @returns {Array<object>} Matching descendants.
 */
function findElements(parent, localName) {
  const prefixedSuffix = `:${localName}`;
  const found = [];
  // Explicit stack; children are pushed in reverse so they pop in document order.
  const pending = [parent];
  while (pending.length > 0) {
    const current = pending.pop();
    if (current !== parent && (current.localName === localName || current.tagName?.endsWith(prefixedSuffix))) {
      found.push(current);
    }
    const kids = current.childNodes;
    for (let i = kids.length - 1; i >= 0; i -= 1) {
      if (kids[i].nodeType === 1) pending.push(kids[i]);
    }
  }
  return found;
}
3484
/**
 * Looks up an attribute by local name, ignoring any namespace prefix.
 *
 * The first attribute whose `localName` or full `name` equals `localName`
 * wins.
 *
 * @param {{attributes: ArrayLike<{localName?: string, name: string, value: string}>}} el
 * @param {string} localName - Attribute name without prefix.
 * @returns {string | null} The attribute value, or null when absent.
 */
function getAttr(el, localName) {
  const attributes = el.attributes;
  let index = 0;
  while (index < attributes.length) {
    const candidate = attributes[index];
    const isMatch = candidate.localName === localName || candidate.name === localName;
    if (isMatch) return candidate.value;
    index += 1;
  }
  return null;
}
3492
/**
 * Parses an XML string into a DOM document using the xmldom parser.
 *
 * @param {string} text - XML source.
 * @returns {object} The parsed document.
 */
function parseXml2(text) {
  const parser = new DOMParser3();
  return parser.parseFromString(text, "text/xml");
}
3495
/**
 * Parses `word/styles.xml` into a map keyed by style id.
 *
 * For each style the entry records its display name, the id of the style it
 * is based on (if any) and an outline level. The outline level comes from
 * `pPr/outlineLvl` when present; otherwise a name of the form "heading N"
 * (case-insensitive) yields level N - 1.
 *
 * @param {string} xml - Contents of the styles part.
 * @returns {Map<string, {name: string, basedOn: (string|undefined), outlineLevel: (number|undefined)}>}
 */
function parseStyles(xml) {
  const styles = /* @__PURE__ */ new Map();
  const firstChild = (el, tag) => getChildElements(el, tag)[0];

  for (const styleEl of findElements(parseXml2(xml), "style")) {
    const styleId = getAttr(styleEl, "styleId");
    if (!styleId) continue;

    const nameEl = firstChild(styleEl, "name");
    const name = nameEl ? getAttr(nameEl, "val") ?? "" : "";

    const basedOnEl = firstChild(styleEl, "basedOn");
    const basedOn = basedOnEl ? getAttr(basedOnEl, "val") ?? void 0 : void 0;

    let outlineLevel;
    const pPrEl = firstChild(styleEl, "pPr");
    const outlineEl = pPrEl ? firstChild(pPrEl, "outlineLvl") : void 0;
    if (outlineEl) {
      const val = getAttr(outlineEl, "val");
      if (val !== null) outlineLevel = parseInt(val, 10);
    }
    if (outlineLevel === void 0) {
      const headingMatch = name.match(/^(?:heading|Heading)\s*(\d+)$/i);
      if (headingMatch) outlineLevel = parseInt(headingMatch[1], 10) - 1;
    }

    styles.set(styleId, { name, basedOn, outlineLevel });
  }
  return styles;
}
3523
/**
 * Parses `word/numbering.xml` into a map from numbering id to level
 * definitions.
 *
 * Abstract numbering definitions are read first (level index -> number
 * format, defaulting to "bullet"); each `num` element is then resolved to
 * the level map of the abstract definition it references. `num` elements
 * with no id or an unknown reference are skipped.
 *
 * @param {string} xml - Contents of the numbering part.
 * @returns {Map<string, Map<number, {numFmt: string, level: number}>>}
 */
function parseNumbering(xml) {
  const doc = parseXml2(xml);

  const levelsByAbstractId = /* @__PURE__ */ new Map();
  for (const abstractEl of findElements(doc, "abstractNum")) {
    const abstractNumId = getAttr(abstractEl, "abstractNumId");
    if (!abstractNumId) continue;
    const levels = /* @__PURE__ */ new Map();
    for (const lvlEl of getChildElements(abstractEl, "lvl")) {
      const ilvl = parseInt(getAttr(lvlEl, "ilvl") ?? "0", 10);
      const [numFmtEl] = getChildElements(lvlEl, "numFmt");
      const numFmt = numFmtEl ? getAttr(numFmtEl, "val") ?? "bullet" : "bullet";
      levels.set(ilvl, { numFmt, level: ilvl });
    }
    levelsByAbstractId.set(abstractNumId, levels);
  }

  const nums = /* @__PURE__ */ new Map();
  for (const numEl of findElements(doc, "num")) {
    const numId = getAttr(numEl, "numId");
    if (!numId) continue;
    const [abstractRef] = getChildElements(numEl, "abstractNumId");
    if (!abstractRef) continue;
    const ref = getAttr(abstractRef, "val");
    if (ref && levelsByAbstractId.has(ref)) {
      nums.set(numId, levelsByAbstractId.get(ref));
    }
  }
  return nums;
}
3555
/**
 * Parses a relationships part (`*.rels`) into an id -> target map.
 *
 * Relationships lacking either `Id` or `Target` are ignored.
 *
 * @param {string} xml - Contents of the relationships part.
 * @returns {Map<string, string>} Relationship id mapped to its target.
 */
function parseRels2(xml) {
  const targetsById = /* @__PURE__ */ new Map();
  findElements(parseXml2(xml), "Relationship").forEach((relationship) => {
    const id = getAttr(relationship, "Id");
    const target = getAttr(relationship, "Target");
    if (id && target) targetsById.set(id, target);
  });
  return targetsById;
}
3566
/**
 * Parses `word/footnotes.xml` into a map from footnote id to its text.
 *
 * Footnotes with id "0" or "-1" are skipped. The text of a footnote is the
 * concatenation of every `t` directly under every run found in its
 * paragraphs, trimmed.
 *
 * @param {string} xml - Contents of the footnotes part.
 * @returns {Map<string, string>} Footnote id mapped to plain text.
 */
function parseFootnotes(xml) {
  const notes = /* @__PURE__ */ new Map();
  for (const footnoteEl of findElements(parseXml2(xml), "footnote")) {
    const id = getAttr(footnoteEl, "id");
    if (!id || id === "0" || id === "-1") continue;
    const content = findElements(footnoteEl, "p")
      .flatMap((paragraph) => findElements(paragraph, "r"))
      .flatMap((run) => getChildElements(run, "t"))
      .map((t) => t.textContent ?? "")
      .join("")
      .trim();
    notes.set(id, content);
  }
  return notes;
}
3586
/**
 * Extracts the text and bold/italic state of a WordprocessingML run.
 *
 * `b` and `i` are toggle properties: the element alone (or with a true-like
 * `val`) switches the format on, while `val` of "0", "false" or "off"
 * switches it off. Treating mere presence as "on" misreports runs that
 * explicitly disable formatting inherited from their style.
 *
 * @param {object} r - A run (`w:r`) element.
 * @returns {{text: string, bold: boolean, italic: boolean}} Concatenated text
 *   of the run's direct `t` children and its direct formatting flags.
 */
function extractRun(r) {
  const text = getChildElements(r, "t").map((t) => t.textContent ?? "").join("");

  // True when the toggle element exists and is not explicitly turned off.
  const isToggleOn = (rPr, tag) => {
    const [toggle] = getChildElements(rPr, tag);
    if (!toggle) return false;
    const val = getAttr(toggle, "val");
    return val === null || !["0", "false", "off"].includes(val.toLowerCase());
  };

  const [rPr] = getChildElements(r, "rPr");
  const bold = rPr ? isToggleOn(rPr, "b") : false;
  const italic = rPr ? isToggleOn(rPr, "i") : false;
  return { text, bold, italic };
}
3598
/**
 * Converts a WordprocessingML paragraph into a heading, list or paragraph
 * block.
 *
 * Text is assembled from the paragraph's direct children in document order:
 * plain runs contribute their text, hyperlinks contribute the text of all
 * runs inside them. (Collecting hyperlinks first and plain runs afterwards
 * moves link text to the front of the paragraph.)
 *
 * Classification:
 *  - style with outline level 0-5  -> heading (level = outline level + 1)
 *  - numbering id other than "0"   -> list ("unordered" for bullet format,
 *                                     otherwise "ordered")
 *  - anything else                 -> paragraph, with optional `style`
 *                                     (bold/italic seen on any plain run),
 *                                     `href` (target of the last hyperlink
 *                                     with a known relationship) and
 *                                     `footnoteText` (last resolvable
 *                                     footnote reference).
 *
 * @param {object} p - Paragraph (`w:p`) element.
 * @param {Map<string, {outlineLevel?: number}>} styles - Result of parseStyles.
 * @param {Map<string, Map<number, {numFmt: string}>>} numbering - Result of parseNumbering.
 * @param {Map<string, string>} footnotes - Footnote id -> text.
 * @param {Map<string, string>} rels - Relationship id -> target.
 * @returns {object | null} The block, or null when the paragraph has no text.
 */
function parseParagraph(p, styles, numbering, footnotes, rels) {
  let styleId = "";
  let numId = "";
  let ilvl = 0;
  const [pPr] = getChildElements(p, "pPr");
  if (pPr) {
    const [pStyle] = getChildElements(pPr, "pStyle");
    if (pStyle) styleId = getAttr(pStyle, "val") ?? "";
    const [numPr] = getChildElements(pPr, "numPr");
    if (numPr) {
      const [numIdEl] = getChildElements(numPr, "numId");
      const [ilvlEl] = getChildElements(numPr, "ilvl");
      numId = numIdEl ? getAttr(numIdEl, "val") ?? "" : "";
      ilvl = ilvlEl ? parseInt(getAttr(ilvlEl, "val") ?? "0", 10) : 0;
    }
  }

  const isTag = (el, name) => el.localName === name || Boolean(el.tagName?.endsWith(`:${name}`));
  const parts = [];
  let hasBold = false;
  let hasItalic = false;
  let href;
  let footnoteText;

  // Walk direct children once so runs and hyperlinks keep their relative order.
  const children = p.childNodes;
  for (let i = 0; i < children.length; i++) {
    const child = children[i];
    if (child.nodeType !== 1) continue;

    if (isTag(child, "hyperlink")) {
      const linkText = findElements(child, "r").map((r) => extractRun(r).text).join("");
      if (!linkText) continue;
      const rId = getAttr(child, "id");
      if (rId && rels.has(rId)) href = rels.get(rId);
      parts.push(linkText);
      continue;
    }

    if (!isTag(child, "r")) continue;
    const run = extractRun(child);
    if (run.bold) hasBold = true;
    if (run.italic) hasItalic = true;
    const [fnRef] = getChildElements(child, "footnoteReference");
    if (fnRef) {
      const fnId = getAttr(fnRef, "id");
      if (fnId && footnotes.has(fnId)) footnoteText = footnotes.get(fnId);
    }
    if (run.text) parts.push(run.text);
  }

  const text = parts.join("").trim();
  if (!text) return null;

  const style = styles.get(styleId);
  if (style?.outlineLevel !== void 0 && style.outlineLevel >= 0 && style.outlineLevel <= 5) {
    return {
      type: "heading",
      text,
      level: style.outlineLevel + 1
    };
  }

  if (numId && numId !== "0") {
    const levelInfo = numbering.get(numId)?.get(ilvl);
    const listType = levelInfo?.numFmt === "bullet" ? "unordered" : "ordered";
    return { type: "list", text, listType };
  }

  const block = { type: "paragraph", text };
  if (hasBold || hasItalic) {
    block.style = { bold: hasBold || void 0, italic: hasItalic || void 0 };
  }
  if (href) block.href = href;
  if (footnoteText) block.footnoteText = footnoteText;
  return block;
}
3679
/**
 * Converts a WordprocessingML table (`w:tbl`) into a table block.
 *
 * Horizontal merges come from `gridSpan`. Vertical merges come from
 * `vMerge`: `val="restart"` starts a merged cell, and any other `vMerge`
 * (including the attribute-less `<w:vMerge/>`, whose default value is
 * "continue") continues the cell above. Continuation cells are not emitted;
 * they increase the `rowSpan` of the cell they continue.
 *
 * Continuations are matched to the cell above by grid column (the sum of the
 * preceding colSpans), not by position in the row, so rows that use
 * different `gridSpan` layouts still line up. A continuation with no cell
 * directly above it in the previous row is dropped.
 *
 * @param {object} tbl - Table element.
 * @param {Map} styles - Passed through to parseParagraph.
 * @param {Map} numbering - Passed through to parseParagraph.
 * @param {Map} footnotes - Passed through to parseParagraph.
 * @param {Map} rels - Passed through to parseParagraph.
 * @returns {{type: "table", table: {rows: number, cols: number, cells: Array<Array<object>>, hasHeader: boolean}} | null}
 *   The table block, or null when the table has no rows.
 */
function parseTable(tbl, styles, numbering, footnotes, rels) {
  const trElements = getChildElements(tbl, "tr");
  if (trElements.length === 0) return null;

  const cleanRows = [];
  // Grid column -> cell that a continuation in the NEXT row would extend.
  let anchors = /* @__PURE__ */ new Map();

  for (const tr of trElements) {
    const row = [];
    const nextAnchors = /* @__PURE__ */ new Map();
    let gridCol = 0;

    for (const tc of getChildElements(tr, "tc")) {
      let colSpan = 1;
      let continuesMerge = false;
      const [tcPr] = getChildElements(tc, "tcPr");
      if (tcPr) {
        const [gridSpan] = getChildElements(tcPr, "gridSpan");
        if (gridSpan) {
          const parsed = parseInt(getAttr(gridSpan, "val") ?? "1", 10);
          // Ignore malformed spans instead of propagating NaN/0 into the layout.
          if (Number.isInteger(parsed) && parsed > 0) colSpan = parsed;
        }
        const [vMerge] = getChildElements(tcPr, "vMerge");
        // An omitted val means "continue".
        continuesMerge = vMerge !== void 0 && getAttr(vMerge, "val") !== "restart";
      }

      if (continuesMerge) {
        const anchor = anchors.get(gridCol);
        if (anchor) {
          anchor.rowSpan += 1;
          nextAnchors.set(gridCol, anchor);
        }
        gridCol += colSpan;
        continue;
      }

      const cellTexts = [];
      for (const p of getChildElements(tc, "p")) {
        const block = parseParagraph(p, styles, numbering, footnotes, rels);
        if (block?.text) cellTexts.push(block.text);
      }
      const cell = { text: cellTexts.join("\n"), colSpan, rowSpan: 1 };
      row.push(cell);
      nextAnchors.set(gridCol, cell);
      gridCol += colSpan;
    }

    cleanRows.push(row);
    anchors = nextAnchors;
  }

  let cols = 0;
  for (const row of cleanRows) {
    const width = row.reduce((sum, cell) => sum + cell.colSpan, 0);
    if (width > cols) cols = width;
  }

  return {
    type: "table",
    table: {
      rows: cleanRows.length,
      cols,
      cells: cleanRows,
      hasHeader: cleanRows.length > 1
    }
  };
}
3748
/**
 * Extracts the images referenced by `drawing`/`blip` elements of a DOCX
 * document from its zip package.
 *
 * For each blip, the `embed` relationship id is resolved through `rels` to a
 * zip entry path. Images are numbered in the order they are successfully
 * read and are named `image_NNN.<ext>`.
 *
 * @param {object} zip - Loaded zip; `zip.file(path)` must return an entry with
 *   an `async("uint8array")` method, or a falsy value when the entry is missing.
 * @param {Map<string, string>} rels - Relationship id to target path.
 * @param {Document} doc - Parsed document; searched from `documentElement`.
 * @returns {Promise<{blocks: Array<{type: "image", text: string}>, images: Array<{filename: string, data: Uint8Array, mimeType: string}>}>}
 */
async function extractImages(zip, rels, doc) {
  // A Map (rather than an object literal) so that extensions such as
  // "constructor" cannot resolve to Object.prototype members.
  const mimeTypes = new Map([
    ["png", "image/png"],
    ["jpg", "image/jpeg"],
    ["jpeg", "image/jpeg"],
    ["gif", "image/gif"],
    ["bmp", "image/bmp"],
    ["wmf", "image/wmf"],
    ["emf", "image/emf"],
    ["tif", "image/tiff"],
    ["tiff", "image/tiff"],
    ["svg", "image/svg+xml"],
    ["webp", "image/webp"]
  ]);
  const blocks = [];
  const images = [];
  let imgIdx = 0;

  for (const drawing of findElements(doc.documentElement, "drawing")) {
    for (const blip of findElements(drawing, "blip")) {
      const embedId = getAttr(blip, "embed");
      if (!embedId) continue;
      const target = rels.get(embedId);
      if (!target) continue;

      let imgPath;
      if (target.startsWith("/")) imgPath = target.slice(1);
      else if (target.startsWith("word/")) imgPath = target;
      else imgPath = `word/${target}`;

      const imgFile = zip.file(imgPath);
      if (!imgFile) continue;

      let data;
      try {
        data = await imgFile.async("uint8array");
      } catch {
        // Best effort: an unreadable image is skipped and does not consume an
        // index, so the rest of the document can still be extracted.
        continue;
      }
      imgIdx++;

      // The entry name comes from the (untrusted) package. Derive the
      // extension from the last path segment only and accept just short
      // alphanumeric values, so the generated filename can never contain
      // path separators.
      const baseName = imgPath.slice(imgPath.lastIndexOf("/") + 1);
      const dot = baseName.lastIndexOf(".");
      const candidate = dot >= 0 ? baseName.slice(dot + 1).toLowerCase() : "";
      const ext = /^[a-z0-9]{1,8}$/.test(candidate) ? candidate : "png";

      const filename = `image_${String(imgIdx).padStart(3, "0")}.${ext}`;
      images.push({ filename, data, mimeType: mimeTypes.get(ext) ?? "image/png" });
      blocks.push({ type: "image", text: filename });
    }
  }
  return { blocks, images };
}
3785
/**
 * Parses a DOCX package into markdown, blocks, metadata, outline and images.
 *
 * Steps, in order: size pre-check, zip load, relationship/style/numbering/
 * footnote parts, body walk (top-level `p` and `tbl` children only), image
 * extraction, then core-properties metadata.
 *
 * @param {ArrayBuffer} buffer - Raw DOCX bytes.
 * @param {object} [options] - Accepted for signature parity; not read here.
 * @returns {Promise<{markdown: string, blocks: Array, metadata: object, outline: (Array|undefined), warnings: (Array|undefined), images: (Array|undefined)}>}
 * @throws {KordocError} When `word/document.xml` or the document body is missing.
 */
async function parseDocxDocument(buffer, options) {
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
  const zip = await JSZip4.loadAsync(buffer);
  const warnings = [];

  const docFile = zip.file("word/document.xml");
  if (!docFile) {
    throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
  }

  // Relationships are parsed without a guard: a failure here propagates.
  const relsFile = zip.file("word/_rels/document.xml.rels");
  const rels = relsFile ? parseRels2(await relsFile.async("text")) : new Map();

  // Styles, numbering and footnotes are best effort: a missing or unparsable
  // part yields an empty map.
  const readOptionalPart = async (path, parser) => {
    const part = zip.file(path);
    if (!part) return new Map();
    try {
      return parser(await part.async("text"));
    } catch {
      return new Map();
    }
  };
  const styles = await readOptionalPart("word/styles.xml", parseStyles);
  const numbering = await readOptionalPart("word/numbering.xml", parseNumbering);
  const footnotes = await readOptionalPart("word/footnotes.xml", parseFootnotes);

  const doc = parseXml2(await docFile.async("text"));
  const bodies = findElements(doc, "body");
  if (bodies.length === 0) {
    throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
  }

  const blocks = [];
  const bodyChildren = bodies[0].childNodes;
  for (let idx = 0; idx < bodyChildren.length; idx++) {
    const child = bodyChildren[idx];
    // nodeType 1 is an element node; everything else is skipped.
    if (child.nodeType !== 1) continue;
    const tag = child.localName ?? child.tagName?.split(":").pop();
    let parsed = null;
    switch (tag) {
      case "p":
        parsed = parseParagraph(child, styles, numbering, footnotes, rels);
        break;
      case "tbl":
        parsed = parseTable(child, styles, numbering, footnotes, rels);
        break;
      default:
        break;
    }
    if (parsed) blocks.push(parsed);
  }

  // Only the extracted image payloads are used; the image blocks returned
  // alongside them are not appended to `blocks`.
  const { images } = await extractImages(zip, rels, doc);

  const metadata = {};
  const coreFile = zip.file("docProps/core.xml");
  if (coreFile) {
    try {
      const coreDoc = parseXml2(await coreFile.async("text"));
      const firstText = (tag) => {
        const matches = coreDoc.getElementsByTagName(tag);
        if (matches.length === 0) return void 0;
        return (matches[0].textContent ?? "").trim();
      };
      metadata.title = firstText("dc:title") || firstText("dcterms:title");
      metadata.author = firstText("dc:creator");
      metadata.description = firstText("dc:description");
      const createdAt = firstText("dcterms:created");
      if (createdAt) metadata.createdAt = createdAt;
      const modifiedAt = firstText("dcterms:modified");
      if (modifiedAt) metadata.modifiedAt = modifiedAt;
    } catch {
      // Metadata is optional; keep whatever was collected before the failure.
    }
  }

  const outline = [];
  for (const block of blocks) {
    if (block.type === "heading") {
      outline.push({ level: block.level ?? 2, text: block.text ?? "" });
    }
  }

  return {
    markdown: blocksToMarkdown(blocks),
    blocks,
    metadata,
    outline: outline.length > 0 ? outline : void 0,
    warnings: warnings.length > 0 ? warnings : void 0,
    images: images.length > 0 ? images : void 0
  };
}
3876
+
3005
3877
  // src/form/recognize.ts
3006
3878
  var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
3007
3879
  "\uC131\uBA85",
@@ -3129,7 +4001,7 @@ function extractInlineFields(text) {
3129
4001
  }
3130
4002
 
3131
4003
  // src/hwpx/generator.ts
3132
- import JSZip2 from "jszip";
4004
+ import JSZip5 from "jszip";
3133
4005
 
3134
4006
  // src/index.ts
3135
4007
  async function parse(input, options) {
@@ -3152,8 +4024,12 @@ async function parse(input, options) {
3152
4024
  }
3153
4025
  const format = detectFormat(buffer);
3154
4026
  switch (format) {
3155
- case "hwpx":
4027
+ case "hwpx": {
4028
+ const zipFormat = await detectZipFormat(buffer);
4029
+ if (zipFormat === "xlsx") return parseXlsx(buffer, options);
4030
+ if (zipFormat === "docx") return parseDocx(buffer, options);
3156
4031
  return parseHwpx(buffer, options);
4032
+ }
3157
4033
  case "hwp":
3158
4034
  return parseHwp(buffer, options);
3159
4035
  case "pdf":
@@ -3180,9 +4056,27 @@ async function parseHwp(buffer, options) {
3180
4056
  }
3181
4057
  async function parsePdf(buffer, options) {
3182
4058
  try {
3183
- return await parsePdfDocument(buffer, options);
4059
+ const { markdown, blocks, metadata, outline, warnings, isImageBased } = await parsePdfDocument(buffer, options);
4060
+ return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased };
3184
4061
  } catch (err) {
3185
- return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
4062
+ const isImageBased = err instanceof Error && "isImageBased" in err ? true : void 0;
4063
+ return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
4064
+ }
4065
+ }
4066
/**
 * Runs the XLSX parser and wraps its outcome in a result object.
 *
 * @param {ArrayBuffer} buffer - Raw XLSX bytes.
 * @param {object} [options] - Forwarded to `parseXlsxDocument`.
 * @returns {Promise<object>} `{ success: true, fileType: "xlsx", ... }` on
 *   success, or `{ success: false, fileType: "xlsx", error, code }` when
 *   parsing throws.
 */
async function parseXlsx(buffer, options) {
  try {
    const parsed = await parseXlsxDocument(buffer, options);
    return {
      success: true,
      fileType: "xlsx",
      markdown: parsed.markdown,
      blocks: parsed.blocks,
      metadata: parsed.metadata,
      warnings: parsed.warnings
    };
  } catch (err) {
    const error = err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "xlsx", error, code: classifyError(err) };
  }
}
4074
/**
 * Runs the DOCX parser and wraps its outcome in a result object.
 *
 * @param {ArrayBuffer} buffer - Raw DOCX bytes.
 * @param {object} [options] - Forwarded to `parseDocxDocument`.
 * @returns {Promise<object>} `{ success: true, fileType: "docx", ... }` on
 *   success (with `images` omitted when empty), or
 *   `{ success: false, fileType: "docx", error, code }` when parsing throws.
 */
async function parseDocx(buffer, options) {
  try {
    const parsed = await parseDocxDocument(buffer, options);
    const extracted = parsed.images;
    return {
      success: true,
      fileType: "docx",
      markdown: parsed.markdown,
      blocks: parsed.blocks,
      metadata: parsed.metadata,
      outline: parsed.outline,
      warnings: parsed.warnings,
      // An empty image list is reported as undefined.
      images: extracted?.length ? extracted : void 0
    };
  } catch (err) {
    const error = err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "docx", error, code: classifyError(err) };
  }
}
3188
4082
 
@@ -3360,10 +4254,6 @@ function diffTableCells(a, b) {
3360
4254
  export {
3361
4255
  detectFormat,
3362
4256
  blocksToMarkdown,
3363
- VERSION,
3364
- toArrayBuffer,
3365
- KordocError,
3366
- sanitizeError,
3367
4257
  extractHwpxMetadataOnly,
3368
4258
  extractHwp5MetadataOnly,
3369
4259
  extractPdfMetadataOnly,
@@ -3371,4 +4261,4 @@ export {
3371
4261
  extractFormFields,
3372
4262
  parse
3373
4263
  };
3374
- //# sourceMappingURL=chunk-NJ3R7LNR.js.map
4264
+ //# sourceMappingURL=chunk-QQ6PZADA.js.map