kordoc 2.0.3 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,53 +1,105 @@
1
1
  #!/usr/bin/env node
2
2
  import {
3
- KordocError,
4
- classifyError,
5
- isPathTraversal,
6
- precheckZipSize,
7
- sanitizeHref,
8
- toArrayBuffer
9
- } from "./chunk-25TXW6EP.js";
3
+ detectFormat,
4
+ detectZipFormat
5
+ } from "./chunk-5Y2Q3BRW.js";
10
6
  import {
11
7
  parsePageRange
12
- } from "./chunk-3TBUDJDE.js";
8
+ } from "./chunk-MOL7MDBG.js";
13
9
 
14
- // src/detect.ts
15
- import JSZip from "jszip";
16
- function magicBytes(buffer) {
17
- return new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
18
- }
19
- function isZipFile(buffer) {
20
- const b = magicBytes(buffer);
21
- return b[0] === 80 && b[1] === 75 && b[2] === 3 && b[3] === 4;
22
- }
23
- function isOldHwpFile(buffer) {
24
- const b = magicBytes(buffer);
25
- return b[0] === 208 && b[1] === 207 && b[2] === 17 && b[3] === 224;
10
+ // src/utils.ts
11
+ var VERSION = true ? "2.2.0" : "0.0.0-dev";
12
+ function toArrayBuffer(buf) {
13
+ if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
14
+ return buf.buffer;
15
+ }
16
+ return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
26
17
  }
27
- function isPdfFile(buffer) {
28
- const b = magicBytes(buffer);
29
- return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
18
+ var KordocError = class extends Error {
19
+ constructor(message) {
20
+ super(message);
21
+ this.name = "KordocError";
22
+ }
23
+ };
24
+ function sanitizeError(err) {
25
+ if (err instanceof KordocError) return err.message;
26
+ return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
30
27
  }
31
- function detectFormat(buffer) {
32
- if (buffer.byteLength < 4) return "unknown";
33
- if (isZipFile(buffer)) return "hwpx";
34
- if (isOldHwpFile(buffer)) return "hwp";
35
- if (isPdfFile(buffer)) return "pdf";
36
- return "unknown";
28
+ function isPathTraversal(name) {
29
+ if (name.includes("\0")) return true;
30
+ const normalized = name.replace(/\\/g, "/");
31
+ const segments = normalized.split("/");
32
+ return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
37
33
  }
38
- async function detectZipFormat(buffer) {
34
+ function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
39
35
  try {
40
- const zip = await JSZip.loadAsync(buffer);
41
- if (zip.file("xl/workbook.xml")) return "xlsx";
42
- if (zip.file("word/document.xml")) return "docx";
43
- if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
44
- const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
45
- if (hasSection) return "hwpx";
46
- return "unknown";
47
- } catch {
48
- return "unknown";
36
+ const data = new DataView(buffer);
37
+ const len = buffer.byteLength;
38
+ let eocdOffset = -1;
39
+ for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
40
+ if (data.getUint32(i, true) === 101010256) {
41
+ eocdOffset = i;
42
+ break;
43
+ }
44
+ }
45
+ if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
46
+ const entryCount = data.getUint16(eocdOffset + 10, true);
47
+ if (entryCount > maxEntries) {
48
+ throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
49
+ }
50
+ const cdSize = data.getUint32(eocdOffset + 12, true);
51
+ const cdOffset = data.getUint32(eocdOffset + 16, true);
52
+ if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
53
+ let totalUncompressed = 0;
54
+ let pos = cdOffset;
55
+ for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
56
+ if (data.getUint32(pos, true) !== 33639248) break;
57
+ totalUncompressed += data.getUint32(pos + 24, true);
58
+ const nameLen = data.getUint16(pos + 28, true);
59
+ const extraLen = data.getUint16(pos + 30, true);
60
+ const commentLen = data.getUint16(pos + 32, true);
61
+ pos += 46 + nameLen + extraLen + commentLen;
62
+ }
63
+ if (totalUncompressed > maxUncompressedSize) {
64
+ throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
65
+ }
66
+ return { totalUncompressed, entryCount };
67
+ } catch (err) {
68
+ if (err instanceof KordocError) throw err;
69
+ return { totalUncompressed: 0, entryCount: 0 };
49
70
  }
50
71
  }
72
+ function stripDtd(xml) {
73
+ return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
74
+ }
75
+ var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
76
+ function sanitizeHref(href) {
77
+ const trimmed = href.trim();
78
+ if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
79
+ return trimmed;
80
+ }
81
+ function safeMin(arr) {
82
+ let min = Infinity;
83
+ for (let i = 0; i < arr.length; i++) if (arr[i] < min) min = arr[i];
84
+ return min;
85
+ }
86
+ function safeMax(arr) {
87
+ let max = -Infinity;
88
+ for (let i = 0; i < arr.length; i++) if (arr[i] > max) max = arr[i];
89
+ return max;
90
+ }
91
+ function classifyError(err) {
92
+ if (!(err instanceof Error)) return "PARSE_ERROR";
93
+ const msg = err.message;
94
+ if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
95
+ if (msg.includes("DRM")) return "DRM_PROTECTED";
96
+ if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
97
+ if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
98
+ if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
99
+ if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
100
+ if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
101
+ return "PARSE_ERROR";
102
+ }
51
103
 
52
104
  // src/table/builder.ts
53
105
  var MAX_COLS = 200;
@@ -110,6 +162,7 @@ function buildTableDirect(rows, numRows) {
110
162
  if (end > maxCols) maxCols = end;
111
163
  }
112
164
  }
165
+ if (maxCols > MAX_COLS) maxCols = MAX_COLS;
113
166
  if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
114
167
  const grid = Array.from(
115
168
  { length: numRows },
@@ -119,7 +172,7 @@ function buildTableDirect(rows, numRows) {
119
172
  for (const cell of row) {
120
173
  const r = cell.rowAddr ?? 0;
121
174
  const c = cell.colAddr ?? 0;
122
- if (r >= numRows || c >= maxCols) continue;
175
+ if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
123
176
  grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
124
177
  for (let dr = 0; dr < cell.rowSpan; dr++) {
125
178
  for (let dc = 0; dc < cell.colSpan; dc++) {
@@ -341,7 +394,7 @@ function tableToMarkdown(table) {
341
394
  }
342
395
 
343
396
  // src/hwpx/parser.ts
344
- import JSZip2 from "jszip";
397
+ import JSZip from "jszip";
345
398
  import { inflateRawSync } from "zlib";
346
399
  import { DOMParser } from "@xmldom/xmldom";
347
400
 
@@ -403,7 +456,12 @@ function parseCharProperties(doc, map) {
403
456
  if (!id) continue;
404
457
  const prop = {};
405
458
  const height = el.getAttribute("height");
406
- if (height) prop.fontSize = parseInt(height, 10) / 100;
459
+ if (height) {
460
+ const parsedHeight = parseInt(height, 10);
461
+ if (!isNaN(parsedHeight) && parsedHeight > 0) {
462
+ prop.fontSize = parsedHeight / 100;
463
+ }
464
+ }
407
465
  const bold = el.getAttribute("bold");
408
466
  if (bold === "true" || bold === "1") prop.bold = true;
409
467
  const italic = el.getAttribute("italic");
@@ -438,14 +496,11 @@ function parseStyleElements(doc, map) {
438
496
  }
439
497
  }
440
498
  }
441
- function stripDtd(xml) {
442
- return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
443
- }
444
499
  async function parseHwpxDocument(buffer, options) {
445
500
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
446
501
  let zip;
447
502
  try {
448
- zip = await JSZip2.loadAsync(buffer);
503
+ zip = await JSZip.loadAsync(buffer);
449
504
  } catch {
450
505
  return extractFromBrokenZip(buffer);
451
506
  }
@@ -543,7 +598,7 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
543
598
  const data = await file.async("uint8array");
544
599
  decompressed.total += data.length;
545
600
  if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
546
- const ext = ref.includes(".") ? ref.split(".").pop() : "png";
601
+ const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
547
602
  const mimeType = imageExtToMime(ext);
548
603
  imageIndex++;
549
604
  const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
@@ -608,7 +663,7 @@ function parseDublinCoreMetadata(xml, metadata) {
608
663
  async function extractHwpxMetadataOnly(buffer) {
609
664
  let zip;
610
665
  try {
611
- zip = await JSZip2.loadAsync(buffer);
666
+ zip = await JSZip.loadAsync(buffer);
612
667
  } catch {
613
668
  throw new KordocError("HWPX ZIP\uC744 \uC5F4 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
614
669
  }
@@ -803,7 +858,8 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
803
858
  if (newTable.rows.length > 0) {
804
859
  if (tableStack.length > 0) {
805
860
  const parentTable = tableStack.pop();
806
- const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
861
+ let nestedCols = 0;
862
+ for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
807
863
  if (newTable.rows.length >= 3 && nestedCols >= 2) {
808
864
  blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
809
865
  } else {
@@ -850,8 +906,10 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
850
906
  break;
851
907
  case "cellSpan":
852
908
  if (tableCtx?.cell) {
853
- const cs = parseInt(el.getAttribute("colSpan") || "1", 10);
854
- const rs = parseInt(el.getAttribute("rowSpan") || "1", 10);
909
+ const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
910
+ const cs = isNaN(rawCs) ? 1 : rawCs;
911
+ const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
912
+ const rs = isNaN(rawRs) ? 1 : rawRs;
855
913
  tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
856
914
  tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
857
915
  }
@@ -910,7 +968,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
910
968
  if (newTable.rows.length > 0) {
911
969
  if (tableStack.length > 0) {
912
970
  const parentTable = tableStack.pop();
913
- const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
971
+ let nestedCols = 0;
972
+ for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
914
973
  if (newTable.rows.length >= 3 && nestedCols >= 2) {
915
974
  blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
916
975
  } else {
@@ -943,6 +1002,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
943
1002
  extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
944
1003
  } else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
945
1004
  walkChildren(el, d + 1);
1005
+ } else if (localTag === "run") {
1006
+ tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
946
1007
  }
947
1008
  }
948
1009
  };
@@ -2006,6 +2067,7 @@ function parseLenientCfb(data) {
2006
2067
  if (miniSectorSizeShift > 16) throw new Error("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uBBF8\uB2C8 \uC139\uD130 \uD06C\uAE30 \uC2DC\uD504\uD2B8: " + miniSectorSizeShift);
2007
2068
  const miniSectorSize = 1 << miniSectorSizeShift;
2008
2069
  const fatSectorCount = data.readUInt32LE(44);
2070
+ if (fatSectorCount > 1e4) throw new Error("FAT \uC139\uD130 \uC218\uAC00 \uB108\uBB34 \uB9CE\uC2B5\uB2C8\uB2E4: " + fatSectorCount);
2009
2071
  const firstDirSector = data.readUInt32LE(48);
2010
2072
  const miniStreamCutoff = data.readUInt32LE(56);
2011
2073
  const firstMiniFatSector = data.readUInt32LE(60);
@@ -2394,10 +2456,14 @@ function findSections(cfb) {
2394
2456
  }
2395
2457
  function findSectionsLenient(lcfb, compressed) {
2396
2458
  const sections = [];
2459
+ let totalDecompressed = 0;
2397
2460
  for (let i = 0; i < MAX_SECTIONS; i++) {
2398
2461
  const raw = lcfb.findStream(`/BodyText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
2399
2462
  if (!raw) break;
2400
- sections.push({ idx: i, content: compressed ? decompressStream(raw) : raw });
2463
+ const content = compressed ? decompressStream(raw) : raw;
2464
+ totalDecompressed += content.length;
2465
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2466
+ sections.push({ idx: i, content });
2401
2467
  }
2402
2468
  if (sections.length === 0) {
2403
2469
  for (const e of lcfb.entries()) {
@@ -2405,7 +2471,12 @@ function findSectionsLenient(lcfb, compressed) {
2405
2471
  if (e.name.startsWith("Section")) {
2406
2472
  const idx = parseInt(e.name.replace("Section", ""), 10) || 0;
2407
2473
  const raw = lcfb.findStream(e.name);
2408
- if (raw) sections.push({ idx, content: compressed ? decompressStream(raw) : raw });
2474
+ if (raw) {
2475
+ const content = compressed ? decompressStream(raw) : raw;
2476
+ totalDecompressed += content.length;
2477
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2478
+ sections.push({ idx, content });
2479
+ }
2409
2480
  }
2410
2481
  }
2411
2482
  }
@@ -2413,11 +2484,15 @@ function findSectionsLenient(lcfb, compressed) {
2413
2484
  }
2414
2485
  function findViewTextSectionsLenient(lcfb, compressed) {
2415
2486
  const sections = [];
2487
+ let totalDecompressed = 0;
2416
2488
  for (let i = 0; i < MAX_SECTIONS; i++) {
2417
2489
  const raw = lcfb.findStream(`/ViewText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
2418
2490
  if (!raw) break;
2419
2491
  try {
2420
- sections.push({ idx: i, content: decryptViewText(raw, compressed) });
2492
+ const content = decryptViewText(raw, compressed);
2493
+ totalDecompressed += content.length;
2494
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2495
+ sections.push({ idx: i, content });
2421
2496
  } catch {
2422
2497
  break;
2423
2498
  }
@@ -2816,10 +2891,14 @@ function arrangeCells(rows, cols, cells) {
2816
2891
  // src/pdf/line-detector.ts
2817
2892
  import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
2818
2893
  var ORIENTATION_TOL = 2;
2819
- var MIN_LINE_LENGTH = 10;
2820
- var COORD_MERGE_TOL = 3;
2894
+ var MIN_LINE_LENGTH = 15;
2895
+ var MAX_LINE_WIDTH = 5;
2821
2896
  var CONNECT_TOL = 5;
2822
2897
  var CELL_PADDING = 2;
2898
+ var MIN_COL_WIDTH = 15;
2899
+ var MIN_ROW_HEIGHT = 6;
2900
+ var VERTEX_MERGE_FACTOR = 4;
2901
+ var MIN_COORD_MERGE_TOL = 8;
2823
2902
  function extractLines(fnArray, argsArray) {
2824
2903
  const horizontals = [];
2825
2904
  const verticals = [];
@@ -2971,6 +3050,55 @@ function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
2971
3050
  verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
2972
3051
  }
2973
3052
  }
3053
+ function preprocessLines(horizontals, verticals) {
3054
+ let h = horizontals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
3055
+ let v = verticals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
3056
+ h = mergeParallelLines(h, "h");
3057
+ v = mergeParallelLines(v, "v");
3058
+ return { horizontals: h, verticals: v };
3059
+ }
3060
+ function mergeParallelLines(lines, dir) {
3061
+ if (lines.length <= 1) return lines;
3062
+ const sorted = [...lines].sort((a, b) => {
3063
+ const posA = dir === "h" ? a.y1 : a.x1;
3064
+ const posB = dir === "h" ? b.y1 : b.x1;
3065
+ if (Math.abs(posA - posB) > 0.1) return posA - posB;
3066
+ return dir === "h" ? a.x1 - b.x1 : a.y1 - b.y1;
3067
+ });
3068
+ const MERGE_TOL = 3;
3069
+ const result = [sorted[0]];
3070
+ for (let i = 1; i < sorted.length; i++) {
3071
+ const prev = result[result.length - 1];
3072
+ const curr = sorted[i];
3073
+ const prevPos = dir === "h" ? prev.y1 : prev.x1;
3074
+ const currPos = dir === "h" ? curr.y1 : curr.x1;
3075
+ if (Math.abs(prevPos - currPos) <= MERGE_TOL) {
3076
+ const prevStart = dir === "h" ? prev.x1 : prev.y1;
3077
+ const prevEnd = dir === "h" ? prev.x2 : prev.y2;
3078
+ const currStart = dir === "h" ? curr.x1 : curr.y1;
3079
+ const currEnd = dir === "h" ? curr.x2 : curr.y2;
3080
+ const overlap = Math.min(prevEnd, currEnd) - Math.max(prevStart, currStart);
3081
+ const minLen = Math.min(prevEnd - prevStart, currEnd - currStart);
3082
+ if (overlap > minLen * 0.3) {
3083
+ if (dir === "h") {
3084
+ prev.x1 = Math.min(prev.x1, curr.x1);
3085
+ prev.x2 = Math.max(prev.x2, curr.x2);
3086
+ prev.y1 = (prev.y1 + curr.y1) / 2;
3087
+ prev.y2 = prev.y1;
3088
+ } else {
3089
+ prev.y1 = Math.min(prev.y1, curr.y1);
3090
+ prev.y2 = Math.max(prev.y2, curr.y2);
3091
+ prev.x1 = (prev.x1 + curr.x1) / 2;
3092
+ prev.x2 = prev.x1;
3093
+ }
3094
+ prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
3095
+ continue;
3096
+ }
3097
+ }
3098
+ result.push(curr);
3099
+ }
3100
+ return result;
3101
+ }
2974
3102
  function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
2975
3103
  const margin = 5;
2976
3104
  return {
@@ -2982,8 +3110,49 @@ function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
2982
3110
  )
2983
3111
  };
2984
3112
  }
3113
+ function buildVertices(horizontals, verticals) {
3114
+ const vertices = [];
3115
+ const tol = CONNECT_TOL;
3116
+ for (const h of horizontals) {
3117
+ for (const v of verticals) {
3118
+ if (v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol && h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol) {
3119
+ const radius = Math.max(h.lineWidth, v.lineWidth, 1);
3120
+ vertices.push({ x: v.x1, y: h.y1, radius });
3121
+ }
3122
+ }
3123
+ }
3124
+ return vertices;
3125
+ }
3126
+ function mergeVertices(vertices) {
3127
+ if (vertices.length <= 1) return vertices;
3128
+ const merged = [];
3129
+ const used = new Array(vertices.length).fill(false);
3130
+ for (let i = 0; i < vertices.length; i++) {
3131
+ if (used[i]) continue;
3132
+ let sumX = vertices[i].x, sumY = vertices[i].y;
3133
+ let maxRadius = vertices[i].radius;
3134
+ let count = 1;
3135
+ for (let j = i + 1; j < vertices.length; j++) {
3136
+ if (used[j]) continue;
3137
+ const mergeTol = VERTEX_MERGE_FACTOR * Math.max(maxRadius, vertices[j].radius);
3138
+ if (Math.abs(vertices[i].x - vertices[j].x) <= mergeTol && Math.abs(vertices[i].y - vertices[j].y) <= mergeTol) {
3139
+ sumX += vertices[j].x;
3140
+ sumY += vertices[j].y;
3141
+ maxRadius = Math.max(maxRadius, vertices[j].radius);
3142
+ count++;
3143
+ used[j] = true;
3144
+ }
3145
+ }
3146
+ merged.push({ x: sumX / count, y: sumY / count, radius: maxRadius });
3147
+ }
3148
+ return merged;
3149
+ }
2985
3150
  function buildTableGrids(horizontals, verticals) {
2986
3151
  if (horizontals.length < 2 || verticals.length < 2) return [];
3152
+ const allVertices = buildVertices(horizontals, verticals);
3153
+ const vertices = mergeVertices(allVertices);
3154
+ if (vertices.length < 4) return [];
3155
+ const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
2987
3156
  const allLines = [
2988
3157
  ...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
2989
3158
  ...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
@@ -2994,21 +3163,74 @@ function buildTableGrids(horizontals, verticals) {
2994
3163
  const hLines = group.filter((l) => l.type === "h");
2995
3164
  const vLines = group.filter((l) => l.type === "v");
2996
3165
  if (hLines.length < 2 || vLines.length < 2) continue;
2997
- const rawYs = hLines.map((l) => l.y1);
2998
- const rowYs = clusterCoordinates(rawYs).sort((a, b) => b - a);
2999
- const rawXs = vLines.map((l) => l.x1);
3000
- const colXs = clusterCoordinates(rawXs).sort((a, b) => a - b);
3166
+ let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
3167
+ for (const l of vLines) {
3168
+ if (l.x1 < gx1) gx1 = l.x1;
3169
+ if (l.x1 > gx2) gx2 = l.x1;
3170
+ }
3171
+ for (const l of hLines) {
3172
+ if (l.y1 < gy1) gy1 = l.y1;
3173
+ if (l.y1 > gy2) gy2 = l.y1;
3174
+ }
3175
+ const groupBbox = {
3176
+ x1: gx1 - CONNECT_TOL,
3177
+ y1: gy1 - CONNECT_TOL,
3178
+ x2: gx2 + CONNECT_TOL,
3179
+ y2: gy2 + CONNECT_TOL
3180
+ };
3181
+ const groupVertices = vertices.filter(
3182
+ (v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
3183
+ );
3184
+ const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
3185
+ const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
3186
+ const rawYs = [
3187
+ ...hLines.map((l) => l.y1),
3188
+ ...groupVertices.map((v) => v.y)
3189
+ ];
3190
+ const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
3191
+ const rawXs = [
3192
+ ...vLines.map((l) => l.x1),
3193
+ ...groupVertices.map((v) => v.x)
3194
+ ];
3195
+ const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
3001
3196
  if (rowYs.length < 2 || colXs.length < 2) continue;
3197
+ const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
3198
+ const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
3199
+ if (validRowYs.length < 2 || validColXs.length < 2) continue;
3002
3200
  const bbox = {
3003
- x1: colXs[0],
3004
- y1: rowYs[rowYs.length - 1],
3005
- x2: colXs[colXs.length - 1],
3006
- y2: rowYs[0]
3201
+ x1: validColXs[0],
3202
+ y1: validRowYs[validRowYs.length - 1],
3203
+ x2: validColXs[validColXs.length - 1],
3204
+ y2: validRowYs[0]
3007
3205
  };
3008
- grids.push({ rowYs, colXs, bbox });
3206
+ grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
3009
3207
  }
3010
3208
  return mergeAdjacentGrids(grids);
3011
3209
  }
3210
+ function enforceMinWidth(colXs, minWidth) {
3211
+ if (colXs.length <= 2) return colXs;
3212
+ const result = [colXs[0]];
3213
+ for (let i = 1; i < colXs.length; i++) {
3214
+ const prevX = result[result.length - 1];
3215
+ if (colXs[i] - prevX < minWidth && i < colXs.length - 1) {
3216
+ continue;
3217
+ }
3218
+ result.push(colXs[i]);
3219
+ }
3220
+ return result;
3221
+ }
3222
+ function enforceMinHeight(rowYs, minHeight) {
3223
+ if (rowYs.length <= 2) return rowYs;
3224
+ const result = [rowYs[0]];
3225
+ for (let i = 1; i < rowYs.length; i++) {
3226
+ const prevY = result[result.length - 1];
3227
+ if (prevY - rowYs[i] < minHeight && i < rowYs.length - 1) {
3228
+ continue;
3229
+ }
3230
+ result.push(rowYs[i]);
3231
+ }
3232
+ return result;
3233
+ }
3012
3234
  function mergeAdjacentGrids(grids) {
3013
3235
  if (grids.length <= 1) return grids;
3014
3236
  const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
@@ -3017,9 +3239,10 @@ function mergeAdjacentGrids(grids) {
3017
3239
  const prev = merged[merged.length - 1];
3018
3240
  const curr = sorted[i];
3019
3241
  if (prev.colXs.length === curr.colXs.length) {
3020
- const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= COORD_MERGE_TOL * 3);
3242
+ const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
3243
+ const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
3021
3244
  const verticalGap = prev.bbox.y1 - curr.bbox.y2;
3022
- if (colMatch && verticalGap >= -COORD_MERGE_TOL && verticalGap <= 20) {
3245
+ if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
3023
3246
  const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
3024
3247
  merged[merged.length - 1] = {
3025
3248
  rowYs: allRowYs,
@@ -3029,7 +3252,8 @@ function mergeAdjacentGrids(grids) {
3029
3252
  y1: Math.min(prev.bbox.y1, curr.bbox.y1),
3030
3253
  x2: Math.max(prev.bbox.x2, curr.bbox.x2),
3031
3254
  y2: Math.max(prev.bbox.y2, curr.bbox.y2)
3032
- }
3255
+ },
3256
+ vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
3033
3257
  };
3034
3258
  continue;
3035
3259
  }
@@ -3038,14 +3262,14 @@ function mergeAdjacentGrids(grids) {
3038
3262
  }
3039
3263
  return merged;
3040
3264
  }
3041
- function clusterCoordinates(values) {
3265
+ function clusterCoordinates(values, tolerance) {
3042
3266
  if (values.length === 0) return [];
3043
3267
  const sorted = [...values].sort((a, b) => a - b);
3044
3268
  const clusters = [{ sum: sorted[0], count: 1 }];
3045
3269
  for (let i = 1; i < sorted.length; i++) {
3046
3270
  const last = clusters[clusters.length - 1];
3047
3271
  const avg = last.sum / last.count;
3048
- if (Math.abs(sorted[i] - avg) <= COORD_MERGE_TOL) {
3272
+ if (Math.abs(sorted[i] - avg) <= tolerance) {
3049
3273
  last.sum += sorted[i];
3050
3274
  last.count++;
3051
3275
  } else {
@@ -3102,6 +3326,20 @@ function extractCells(grid, horizontals, verticals) {
3102
3326
  const numRows = rowYs.length - 1;
3103
3327
  const numCols = colXs.length - 1;
3104
3328
  if (numRows <= 0 || numCols <= 0) return [];
3329
+ const vBorders = Array.from(
3330
+ { length: numRows },
3331
+ (_, r) => Array.from(
3332
+ { length: numCols + 1 },
3333
+ (_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
3334
+ )
3335
+ );
3336
+ const hBorders = Array.from(
3337
+ { length: numRows + 1 },
3338
+ (_, r) => Array.from(
3339
+ { length: numCols },
3340
+ (_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
3341
+ )
3342
+ );
3105
3343
  const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
3106
3344
  const cells = [];
3107
3345
  for (let r = 0; r < numRows; r++) {
@@ -3109,18 +3347,26 @@ function extractCells(grid, horizontals, verticals) {
3109
3347
  if (occupied[r][c]) continue;
3110
3348
  let colSpan = 1;
3111
3349
  let rowSpan = 1;
3112
- while (c + colSpan < numCols) {
3113
- const borderX = colXs[c + colSpan];
3114
- const topY = rowYs[r];
3115
- const botY = rowYs[r + 1];
3116
- if (hasVerticalLine(verticals, borderX, topY, botY)) break;
3350
+ while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
3351
+ let canExpand = true;
3352
+ for (let dr = 0; dr < rowSpan; dr++) {
3353
+ if (vBorders[r + dr][c + colSpan]) {
3354
+ canExpand = false;
3355
+ break;
3356
+ }
3357
+ }
3358
+ if (!canExpand) break;
3117
3359
  colSpan++;
3118
3360
  }
3119
3361
  while (r + rowSpan < numRows) {
3120
- const borderY = rowYs[r + rowSpan];
3121
- const leftX = colXs[c];
3122
- const rightX = colXs[c + colSpan];
3123
- if (hasHorizontalLine(horizontals, borderY, leftX, rightX)) break;
3362
+ let hasLine = false;
3363
+ for (let dc = 0; dc < colSpan; dc++) {
3364
+ if (hBorders[r + rowSpan][c + dc]) {
3365
+ hasLine = true;
3366
+ break;
3367
+ }
3368
+ }
3369
+ if (hasLine) break;
3124
3370
  rowSpan++;
3125
3371
  }
3126
3372
  for (let dr = 0; dr < rowSpan; dr++) {
@@ -3144,28 +3390,30 @@ function extractCells(grid, horizontals, verticals) {
3144
3390
  }
3145
3391
  return cells;
3146
3392
  }
3147
- function hasVerticalLine(verticals, x, topY, botY) {
3148
- const tol = COORD_MERGE_TOL + 1;
3393
+ function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
3394
+ const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
3149
3395
  for (const v of verticals) {
3150
3396
  if (Math.abs(v.x1 - x) <= tol) {
3151
3397
  const cellH = Math.abs(topY - botY);
3398
+ if (cellH < 0.1) continue;
3152
3399
  const overlapTop = Math.min(v.y2, topY);
3153
3400
  const overlapBot = Math.max(v.y1, botY);
3154
3401
  const overlap = overlapTop - overlapBot;
3155
- if (overlap >= cellH * 0.5) return true;
3402
+ if (overlap >= cellH * 0.75) return true;
3156
3403
  }
3157
3404
  }
3158
3405
  return false;
3159
3406
  }
3160
- function hasHorizontalLine(horizontals, y, leftX, rightX) {
3161
- const tol = COORD_MERGE_TOL + 1;
3407
+ function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
3408
+ const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
3162
3409
  for (const h of horizontals) {
3163
3410
  if (Math.abs(h.y1 - y) <= tol) {
3164
3411
  const cellW = Math.abs(rightX - leftX);
3412
+ if (cellW < 0.1) continue;
3165
3413
  const overlapLeft = Math.max(h.x1, leftX);
3166
3414
  const overlapRight = Math.min(h.x2, rightX);
3167
3415
  const overlap = overlapRight - overlapLeft;
3168
- if (overlap >= cellW * 0.5) return true;
3416
+ if (overlap >= cellW * 0.75) return true;
3169
3417
  }
3170
3418
  }
3171
3419
  return false;
@@ -3176,23 +3424,24 @@ function mapTextToCells(items, cells) {
3176
3424
  result.set(cell, []);
3177
3425
  }
3178
3426
  for (const item of items) {
3179
- const cx = item.x + item.w / 2;
3180
- const cy = item.y;
3181
3427
  const pad = CELL_PADDING;
3182
3428
  let bestCell = null;
3183
- let bestDist = Infinity;
3429
+ let bestScore = 0;
3184
3430
  for (const cell of cells) {
3185
- if (cx >= cell.bbox.x1 - pad && cx <= cell.bbox.x2 + pad && cy >= cell.bbox.y1 - pad && cy <= cell.bbox.y2 + pad) {
3186
- const cellCx = (cell.bbox.x1 + cell.bbox.x2) / 2;
3187
- const cellCy = (cell.bbox.y1 + cell.bbox.y2) / 2;
3188
- const dist = Math.abs(cx - cellCx) + Math.abs(cy - cellCy);
3189
- if (dist < bestDist) {
3190
- bestDist = dist;
3191
- bestCell = cell;
3192
- }
3431
+ const ix1 = Math.max(item.x, cell.bbox.x1 - pad);
3432
+ const ix2 = Math.min(item.x + item.w, cell.bbox.x2 + pad);
3433
+ const iy1 = Math.max(item.y, cell.bbox.y1 - pad);
3434
+ const iy2 = Math.min(item.y + (item.h || item.fontSize), cell.bbox.y2 + pad);
3435
+ if (ix1 >= ix2 || iy1 >= iy2) continue;
3436
+ const intersectArea = (ix2 - ix1) * (iy2 - iy1);
3437
+ const itemArea = Math.max(item.w, 1) * Math.max(item.h || item.fontSize, 1);
3438
+ const score = intersectArea / itemArea;
3439
+ if (score > bestScore) {
3440
+ bestScore = score;
3441
+ bestCell = cell;
3193
3442
  }
3194
3443
  }
3195
- if (bestCell) {
3444
+ if (bestCell && bestScore > 0.3) {
3196
3445
  result.get(bestCell).push(item);
3197
3446
  }
3198
3447
  }
@@ -3219,8 +3468,13 @@ function cellTextToString(items) {
3219
3468
  const textLines = lines.map((line) => {
3220
3469
  const s = line.sort((a, b) => a.x - b.x);
3221
3470
  if (s.length === 1) return s[0].text;
3471
+ const evenSpaced = detectEvenSpacedItems(s);
3222
3472
  let result = s[0].text;
3223
3473
  for (let j = 1; j < s.length; j++) {
3474
+ if (evenSpaced[j]) {
3475
+ result += s[j].text;
3476
+ continue;
3477
+ }
3224
3478
  const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
3225
3479
  const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
3226
3480
  const prevIsKorean = /[가-힣]$/.test(result);
@@ -3235,6 +3489,57 @@ function cellTextToString(items) {
3235
3489
  }
3236
3490
  return result;
3237
3491
  });
3492
+ return mergeCellTextLines(textLines);
3493
+ }
3494
/**
 * Flags the items of one text line that belong to a run of evenly spaced
 * single characters (one Hangul syllable or one digit per item).
 *
 * @param {Array<{text: string, x: number, w: number, fontSize: number}>} items
 *   Items of a single line; the caller passes them sorted by ascending x.
 * @returns {boolean[]} One flag per item. The flags are written by
 *   markEvenRun for every candidate run of at least three items.
 */
function detectEvenSpacedItems(items) {
  const flags = new Array(items.length).fill(false);
  if (items.length < 3) return flags;

  const isSingleGlyph = (text) => /^[가-힣]{1}$/.test(text) || /^[\d]{1}$/.test(text);

  // Hands a finished run [from, to) to markEvenRun when it is long enough.
  const closeRun = (from, to) => {
    if (from >= 0 && to - from >= 3) markEvenRun(items, flags, from, to);
  };

  let runStart = -1;
  items.forEach((item, idx) => {
    if (!isSingleGlyph(item.text)) {
      // Any longer token terminates the current run.
      closeRun(runStart, idx);
      runStart = -1;
      return;
    }
    if (runStart < 0) {
      runStart = idx;
      return;
    }
    // A single glyph that sits too far from its predecessor starts a new run.
    const prev = items[idx - 1];
    const gap = item.x - (prev.x + prev.w);
    const maxRunGap = Math.max(item.fontSize * 3, 30);
    if (gap > maxRunGap) {
      closeRun(runStart, idx);
      runStart = idx;
    }
  });

  closeRun(runStart, items.length);
  return flags;
}
3523
/**
 * Marks items[start + 1 .. end - 1] in `result` when the run items[start .. end - 1]
 * is evenly spaced.
 *
 * Only strictly positive gaps are measured. The run qualifies when there are
 * at least two such gaps, the smallest is at least 10% and the largest at
 * most 300% of the first item's font size, and largest/smallest is at most 3.
 * The first item of the run is never marked.
 *
 * @param {Array<{x: number, w: number, fontSize: number}>} items
 * @param {boolean[]} result Flag array mutated in place.
 * @param {number} start Index of the first item of the run (inclusive).
 * @param {number} end Index one past the last item of the run.
 * @returns {void}
 */
function markEvenRun(items, result, start, end) {
  let positiveGapCount = 0;
  let smallest = Infinity;
  let largest = -Infinity;
  for (let idx = start + 1; idx < end; idx++) {
    const prev = items[idx - 1];
    const gap = items[idx].x - (prev.x + prev.w);
    if (!(gap > 0)) continue; // overlapping or touching items are ignored
    positiveGapCount += 1;
    if (gap < smallest) smallest = gap;
    if (gap > largest) largest = gap;
  }
  if (positiveGapCount < 2) return;

  // The reference size is the font size of the run's first item.
  const refSize = items[start].fontSize;
  const withinBounds = smallest >= refSize * 0.1 && largest <= refSize * 3;
  const evenlySpread = largest / Math.max(smallest, 0.1) <= 3;
  if (!(withinBounds && evenlySpread)) return;

  let idx = start + 1;
  while (idx < end) {
    result[idx] = true;
    idx += 1;
  }
}
3542
+ function mergeCellTextLines(textLines) {
3238
3543
  if (textLines.length <= 1) return textLines[0] || "";
3239
3544
  const merged = [textLines[0]];
3240
3545
  for (let i = 1; i < textLines.length; i++) {
@@ -3260,24 +3565,172 @@ var Y_TOL = 3;
3260
3565
  var COL_CLUSTER_TOL = 15;
3261
3566
  var MIN_ROWS = 3;
3262
3567
  var MIN_COLS = 2;
3263
- var MIN_GAP_FACTOR = 1.5;
3264
- var MIN_COL_FILL_RATIO = 0.3;
3568
+ var MIN_GAP_FACTOR = 2;
3569
+ var MIN_GAP_ABSOLUTE = 20;
3570
+ var MIN_COL_FILL_RATIO = 0.4;
3265
3571
  function detectClusterTables(items, pageNum) {
3266
3572
  if (items.length < MIN_ROWS * MIN_COLS) return [];
3267
- const rows = groupByBaseline(items);
3573
+ const { merged, originMap } = mergeEvenSpacedClusters(items);
3574
+ const rows = groupByBaseline(merged);
3268
3575
  if (rows.length < MIN_ROWS) return [];
3269
- const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
3270
- if (suspiciousRows.length < MIN_ROWS) return [];
3271
- const columns = extractColumnClusters(suspiciousRows);
3272
- if (columns.length < MIN_COLS) return [];
3273
- const tableRegions = findTableRegions(rows, columns);
3274
3576
  const results = [];
3275
- for (const region of tableRegions) {
3276
- const table = buildClusterTable(region.rows, columns, pageNum);
3277
- if (table) results.push(table);
3577
+ const headerResult = detectHeaderRow(rows);
3578
+ if (headerResult) {
3579
+ const { columns, headerIdx } = headerResult;
3580
+ const headerRow = rows[headerIdx];
3581
+ const headerItems = [...headerRow.items].sort((a, b) => a.x - b.x);
3582
+ const headerAndBelow = rows.slice(headerIdx);
3583
+ const mergedRows = mergeMultiLineRows(headerAndBelow, columns);
3584
+ const tableRegions = findTableRegionsByHeader(mergedRows, columns, headerItems);
3585
+ for (const region of tableRegions) {
3586
+ const table = buildClusterTable(region.rows, columns, pageNum);
3587
+ if (table) {
3588
+ expandUsedItems(table.usedItems, originMap);
3589
+ results.push(table);
3590
+ }
3591
+ }
3592
+ }
3593
+ if (results.length === 0) {
3594
+ const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
3595
+ if (suspiciousRows.length >= MIN_ROWS) {
3596
+ const columns = extractColumnClusters(suspiciousRows);
3597
+ if (columns.length >= MIN_COLS) {
3598
+ const tableRegions = findTableRegions(rows, columns);
3599
+ for (const region of tableRegions) {
3600
+ const mergedRows = mergeMultiLineRows(region.rows, columns);
3601
+ const table = buildClusterTable(mergedRows, columns, pageNum);
3602
+ if (table) {
3603
+ expandUsedItems(table.usedItems, originMap);
3604
+ results.push(table);
3605
+ }
3606
+ }
3607
+ }
3608
+ }
3278
3609
  }
3279
3610
  return results;
3280
3611
  }
3612
/**
 * Collapses runs of evenly spaced single characters (one Hangul syllable or
 * one digit per item) into a single synthetic item per run.
 *
 * Items are processed row by row, using the rows returned by
 * groupByBaseline, and left to right within each row. A run needs at least
 * three items, every gap between 10% and 300% of the following item's font
 * size, and a largest/smallest gap ratio of at most 3.
 *
 * @param {Array<object>} items Text items with text, x, y, w, h, fontSize, fontName.
 * @returns {{merged: Array<object>, originMap: Map<object, Array<object>>}}
 *   `merged` holds the resulting items; `originMap` maps every synthetic
 *   item to the original items it replaced.
 */
function mergeEvenSpacedClusters(items) {
  const originMap = new Map();
  const merged = [];

  const isSingleGlyph = (item) => /^[가-힣\d]$/.test(item.text);
  const gapBefore = (line, idx) => line[idx].x - (line[idx - 1].x + line[idx - 1].w);

  // Returns the exclusive end of the single-glyph run starting at `from`,
  // or `from` itself when that item is not a single glyph.
  const scanRun = (line, from) => {
    if (!isSingleGlyph(line[from])) return from;
    let end = from + 1;
    while (end < line.length && isSingleGlyph(line[end])) {
      const gap = gapBefore(line, end);
      const fs = line[end].fontSize;
      if (gap < fs * 0.1 || gap > fs * 3) break;
      end++;
    }
    return end;
  };

  // True when the gaps inside [from, to) are all positive and similar in size.
  const hasEvenGaps = (line, from, to) => {
    let minG = Infinity;
    let maxG = -Infinity;
    for (let idx = from + 1; idx < to; idx++) {
      const gap = gapBefore(line, idx);
      if (gap < minG) minG = gap;
      if (gap > maxG) maxG = gap;
    }
    return minG > 0 && maxG / minG <= 3;
  };

  for (const row of groupByBaseline(items)) {
    const line = [...row.items].sort((a, b) => a.x - b.x);
    let pos = 0;
    while (pos < line.length) {
      const runEnd = scanRun(line, pos);
      if (runEnd - pos >= 3 && hasEvenGaps(line, pos, runEnd)) {
        const run = line.slice(pos, runEnd);
        const first = run[0];
        const last = run[run.length - 1];
        const item = {
          text: run.map((r) => r.text).join(""),
          x: first.x,
          y: first.y,
          w: last.x + last.w - first.x,
          h: first.h,
          fontSize: first.fontSize,
          fontName: first.fontName
        };
        originMap.set(item, run);
        merged.push(item);
        pos = runEnd;
      } else {
        // Not a mergeable run: keep this item and retry from the next one.
        merged.push(line[pos]);
        pos++;
      }
    }
  }
  return { merged, originMap };
}
3664
/**
 * Adds to `usedItems` the original items behind every merged item it contains.
 *
 * @param {Set<object>} usedItems Set mutated in place.
 * @param {Map<object, Iterable<object>>} originMap Merged item -> original items.
 * @returns {void}
 */
function expandUsedItems(usedItems, originMap) {
  // Iterate over a snapshot so additions do not affect the traversal.
  const snapshot = Array.from(usedItems);
  for (const item of snapshot) {
    const origins = originMap.get(item);
    if (!origins) continue;
    for (const origin of origins) {
      usedItems.add(origin);
    }
  }
}
3672
/**
 * Searches the rows top-down for the first row that looks like a table
 * header and is followed by enough rows aligned with it.
 *
 * A header candidate has between MIN_COLS and 6 items, no item longer than
 * 8 characters, at least one Hangul character, and no item starting with a
 * bullet-like symbol. It must span at least 40% of the horizontal extent of
 * all items and contain at least one gap of 2.5x the average font size.
 * It is accepted once MIN_ROWS following rows match at least MIN_COLS of
 * its columns (counting stops at MIN_ROWS + 2 matches).
 *
 * @param {Array<{items: Array<object>}>} rows Baseline rows.
 * @returns {{columns: Array<{x: number, count: number}>, headerIdx: number} | null}
 */
function detectHeaderRow(rows) {
  const everyItem = rows.flatMap((r) => r.items);
  if (everyItem.length === 0) return null;

  // Horizontal extent covered by all items.
  let leftEdge = Infinity;
  let rightEdge = -Infinity;
  for (const item of everyItem) {
    if (item.x < leftEdge) leftEdge = item.x;
    const itemRight = item.x + item.w;
    if (itemRight > rightEdge) rightEdge = itemRight;
  }
  const pageSpan = rightEdge - leftEdge;
  if (pageSpan <= 0) return null;

  const hasHeaderShape = (cells) => {
    if (cells.length < MIN_COLS || cells.length > 6) return false;
    if (cells.some((c) => c.text.length > 8)) return false;
    if (!cells.some((c) => /[가-힣]/.test(c.text))) return false;
    if (cells.some((c) => /^[□■○●·※▶▷◆◇\-]/.test(c.text))) return false;
    return true;
  };

  for (let ri = 0; ri < rows.length; ri++) {
    if (!hasHeaderShape(rows[ri].items)) continue;

    const sorted = [...rows[ri].items].sort((a, b) => a.x - b.x);
    const rightmost = sorted[sorted.length - 1];
    const xSpan = rightmost.x + rightmost.w - sorted[0].x;
    if (xSpan / pageSpan < 0.4) continue;

    let fontSizeSum = 0;
    for (const item of sorted) fontSizeSum += item.fontSize;
    const avgFs = fontSizeSum / sorted.length;
    const hasLargeGap = sorted.some(
      (item, k) => k > 0 && item.x - (sorted[k - 1].x + sorted[k - 1].w) >= avgFs * 2.5
    );
    if (!hasLargeGap) continue;

    const columns = sorted.map((item) => ({ x: item.x, count: 0 }));
    let matchCount = 0;
    for (let j = ri + 1; j < rows.length && matchCount < MIN_ROWS + 2; j++) {
      if (countMatchedColumnsRange(rows[j], columns, sorted) >= MIN_COLS) matchCount++;
    }
    if (matchCount >= MIN_ROWS) return { columns, headerIdx: ri };
  }
  return null;
}
3713
/**
 * Folds continuation lines into the preceding row.
 *
 * A row is treated as a continuation when it lies within 1.8x the average
 * font size of the previous kept row, holds at most two items, and either
 * matches fewer than MIN_COLS columns or consists of a single item. The
 * merged row keeps the previous row's y and concatenates both item lists.
 *
 * @param {Array<{y: number, items: Array<object>}>} rows
 * @param {Array<object>} columns Column descriptors passed to countMatchedColumns.
 * @returns {Array<{y: number, items: Array<object>}>} The input array itself
 *   when it has at most one row, otherwise a new array.
 */
function mergeMultiLineRows(rows, columns) {
  if (rows.length <= 1) return rows;

  // Average font size over every item; 12 is the fallback when there are none.
  let fontSizeSum = 0;
  let fontSizeCount = 0;
  for (const fontSize of rows.flatMap((r) => r.items).map((i) => i.fontSize)) {
    fontSizeSum += fontSize;
    fontSizeCount += 1;
  }
  const avgFontSize = fontSizeCount > 0 ? fontSizeSum / fontSizeCount : 12;

  const out = [rows[0]];
  for (const curr of rows.slice(1)) {
    const prev = out[out.length - 1];
    const yGap = Math.abs(prev.y - curr.y);
    const matchedCols = countMatchedColumns(curr, columns);

    const isClose = yGap < avgFontSize * 1.8;
    const isSparse = curr.items.length <= 2;
    const isContinuation = matchedCols < MIN_COLS || curr.items.length === 1;
    if (isClose && isSparse && isContinuation) {
      out[out.length - 1] = {
        y: prev.y,
        items: [...prev.items, ...curr.items]
      };
      continue;
    }
    out.push(curr);
  }
  return out;
}
3281
3734
  function groupByBaseline(items) {
3282
3735
  if (items.length === 0) return [];
3283
3736
  const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
@@ -3299,8 +3752,9 @@ function groupByBaseline(items) {
3299
3752
  function hasSuspiciousGaps(row) {
3300
3753
  if (row.items.length < 2) return false;
3301
3754
  const sorted = [...row.items].sort((a, b) => a.x - b.x);
3755
+ if (sorted.length === 2 && sorted[1].text.length > 20) return false;
3302
3756
  const avgFontSize = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
3303
- const minGap = avgFontSize * MIN_GAP_FACTOR;
3757
+ const minGap = Math.max(avgFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
3304
3758
  for (let i = 1; i < sorted.length; i++) {
3305
3759
  const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
3306
3760
  if (gap >= minGap) return true;
@@ -3327,6 +3781,41 @@ function extractColumnClusters(rows) {
3327
3781
  const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
3328
3782
  return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
3329
3783
  }
3784
/**
 * Splits rows into table regions using header-derived column ranges.
 *
 * A row joins the current region when it matches at least MIN_COLS header
 * columns. A non-matching row is still tolerated inside an open region when
 * it has at most two items or directly follows a matching row. Any other
 * row closes the region and is itself discarded. Closed regions lose their
 * trailing non-matching rows and are kept only with at least MIN_ROWS rows.
 *
 * @param {Array<{items: Array<object>}>} allRows
 * @param {Array<object>} columns Forwarded to countMatchedColumnsRange.
 * @param {Array<object>} headerItems Header items sorted by x.
 * @returns {Array<{rows: Array<object>}>}
 */
function findTableRegionsByHeader(allRows, columns, headerItems) {
  const regions = [];
  const fitsHeader = (row) => countMatchedColumnsRange(row, columns, headerItems) >= MIN_COLS;

  let pending = [];
  let missStreak = 0;

  // Trims trailing non-matching rows, stores the region if large enough,
  // and resets the accumulator.
  const flush = () => {
    while (pending.length > 0 && !fitsHeader(pending[pending.length - 1])) {
      pending.pop();
    }
    if (pending.length >= MIN_ROWS) {
      regions.push({ rows: pending });
    }
    pending = [];
    missStreak = 0;
  };

  for (const row of allRows) {
    if (fitsHeader(row)) {
      pending.push(row);
      missStreak = 0;
      continue;
    }
    const tolerated = pending.length > 0 && (row.items.length <= 2 || missStreak === 0);
    if (tolerated) {
      pending.push(row);
      missStreak++;
      continue;
    }
    flush();
  }
  flush();
  return regions;
}
3330
3819
  function findTableRegions(allRows, columns) {
3331
3820
  const regions = [];
3332
3821
  let currentRegion = [];
@@ -3362,18 +3851,81 @@ function countMatchedColumns(row, columns) {
3362
3851
  }
3363
3852
  return matched.size;
3364
3853
  }
3365
- function assignToColumn(item, columns) {
3366
- const MAX_DIST = COL_CLUSTER_TOL * 3;
3367
- let bestCol = -1;
3368
- let bestDist = Infinity;
3369
- for (let ci = 0; ci < columns.length; ci++) {
3370
- const dist = Math.abs(item.x - columns[ci].x);
3371
- if (dist < bestDist) {
3372
- bestDist = dist;
3373
- bestCol = ci;
3854
/**
 * Counts how many distinct header columns are hit by the items of a row.
 *
 * Column ranges are derived from the header items: the boundary between two
 * neighbouring columns is the midpoint between the left header's right edge
 * and the right header's left edge. The first range is open to the left and
 * the last one open to the right, so every item falls into exactly one
 * column. (The left edge used to be fixed at 0, which silently dropped
 * items with a negative x coordinate.)
 *
 * @param {{items: Array<{x: number}>}} row Row whose items are classified by x.
 * @param {Array<object>} columns Unused; kept so existing call sites stay valid.
 * @param {Array<{x: number, w: number}>} headerItems Header items sorted by ascending x.
 * @returns {number} Number of distinct columns containing at least one item.
 */
function countMatchedColumnsRange(row, columns, headerItems) {
  const lastIdx = headerItems.length - 1;
  const boundaries = headerItems.map((header, ci) => {
    const left = ci === 0
      ? -Infinity
      : (headerItems[ci - 1].x + headerItems[ci - 1].w + header.x) / 2;
    const right = ci === lastIdx
      ? Infinity
      : (header.x + header.w + headerItems[ci + 1].x) / 2;
    return { left, right };
  });

  const matched = new Set();
  for (const item of row.items) {
    // Ranges are half-open [left, right); the first hit wins.
    const ci = boundaries.findIndex((b) => item.x >= b.left && item.x < b.right);
    if (ci >= 0) matched.add(ci);
  }
  return matched.size;
}
3872
/**
 * Splits the items of one row into groups at its largest horizontal gaps
 * and assigns each group to a column.
 *
 * At most numCols - 1 gaps are used as split points. A gap qualifies when it
 * is at least 12 wide or, for rows with more than numCols + 1 items, at
 * least 2.5x the median gap (whichever is larger). Groups are matched to
 * columns greedily by the distance between the group's horizontal centre
 * and the column's x, each column being used at most once. Any group left
 * over is placed in its nearest column.
 *
 * @param {Array<{x: number, w: number}>} items Items of a single row.
 * @param {Array<{x: number}>} columns Column descriptors.
 * @param {number} numCols Number of columns to consider.
 * @returns {Array<{col: number, items: Array<object>}>} Assignments in the
 *   order they were made (closest match first), not in column order.
 */
function assignRowItems(items, columns, numCols) {
  if (items.length === 0) return [];
  const ordered = [...items].sort((a, b) => a.x - b.x);
  const columnXs = columns.map((c) => c.x);

  // Gap in front of every item except the first.
  const gaps = ordered.slice(1).map((item, k) => ({
    idx: k + 1,
    size: item.x - (ordered[k].x + ordered[k].w)
  }));
  const ascendingSizes = gaps.map((g2) => g2.size).sort((a, b) => a - b);
  const medianGap = ascendingSizes.length > 0 ? ascendingSizes[Math.floor(ascendingSizes.length / 2)] : 0;
  let gapThreshold;
  if (ordered.length <= numCols + 1) {
    gapThreshold = 12;
  } else {
    gapThreshold = Math.max(medianGap * 2.5, 12);
  }

  // Keep the widest qualifying gaps, then restore left-to-right order.
  const cutPoints = gaps
    .filter((g2) => g2.size >= gapThreshold)
    .sort((a, b) => b.size - a.size)
    .slice(0, numCols - 1)
    .sort((a, b) => a.idx - b.idx)
    .map((g2) => g2.idx);

  const edges = [0, ...cutPoints, ordered.length];
  const groups = [];
  for (let k = 0; k + 1 < edges.length; k++) {
    groups.push(ordered.slice(edges[k], edges[k + 1]));
  }

  const groupCenters = groups.map((group) => {
    let minX = Infinity;
    let maxX = -Infinity;
    for (const member of group) {
      if (member.x < minX) minX = member.x;
      const right = member.x + member.w;
      if (right > maxX) maxX = right;
    }
    return (minX + maxX) / 2;
  });

  // Every (group, column) pair, closest first.
  const candidates = [];
  groups.forEach((_, gi) => {
    for (let ci = 0; ci < numCols; ci++) {
      candidates.push({ gi, ci, dist: Math.abs(groupCenters[gi] - columnXs[ci]) });
    }
  });
  candidates.sort((a, b) => a.dist - b.dist);

  const result = [];
  const takenCols = new Set();
  const placedGroups = new Set();
  for (const { gi, ci } of candidates) {
    if (placedGroups.has(gi) || takenCols.has(ci)) continue;
    result.push({ col: ci, items: groups[gi] });
    placedGroups.add(gi);
    takenCols.add(ci);
  }

  // Groups that found no free column share their nearest one.
  groups.forEach((group, gi) => {
    if (placedGroups.has(gi)) return;
    let nearestCol = 0;
    let nearestDist = Infinity;
    for (let ci = 0; ci < numCols; ci++) {
      const d = Math.abs(groupCenters[gi] - columnXs[ci]);
      if (d < nearestDist) {
        nearestDist = d;
        nearestCol = ci;
      }
    }
    result.push({ col: nearestCol, items: group });
  });
  return result;
}
3378
3930
  function buildClusterTable(rows, columns, pageNum) {
3379
3931
  const numCols = columns.length;
@@ -3391,12 +3943,12 @@ function buildClusterTable(rows, columns, pageNum) {
3391
3943
  usedItems.add(row.items[0]);
3392
3944
  continue;
3393
3945
  }
3394
- for (const item of row.items) {
3395
- const col = assignToColumn(item, columns);
3396
- if (col < 0) continue;
3946
+ const assignments = assignRowItems(row.items, columns, numCols);
3947
+ for (const { col, items } of assignments) {
3948
+ const text = items.map((i) => i.text).join(" ");
3397
3949
  const existing = cells[r][col].text;
3398
- cells[r][col].text = existing ? existing + " " + item.text : item.text;
3399
- usedItems.add(item);
3950
+ cells[r][col].text = existing ? existing + " " + text : text;
3951
+ for (const item of items) usedItems.add(item);
3400
3952
  }
3401
3953
  }
3402
3954
  let emptyRows = 0;
@@ -3408,11 +3960,48 @@ function buildClusterTable(rows, columns, pageNum) {
3408
3960
  const hasValue = cells.some((row) => row[c].text !== "");
3409
3961
  if (!hasValue) return null;
3410
3962
  }
3963
+ for (let r = numRows - 1; r >= 1; r--) {
3964
+ const nonEmptyCols = cells[r].filter((c) => c.text.trim()).length;
3965
+ if (nonEmptyCols !== 1) continue;
3966
+ if (cells[r][0].text.trim() !== "") continue;
3967
+ const contentText = cells[r].find((c) => c.text.trim())?.text.trim() || "";
3968
+ if (/^[○●▶\-·]/.test(contentText)) continue;
3969
+ for (let pr = r - 1; pr >= 0; pr--) {
3970
+ if (cells[pr].some((c) => c.text.trim())) {
3971
+ for (let c = 0; c < numCols; c++) {
3972
+ const prev = cells[pr][c].text.trim();
3973
+ const curr = cells[r][c].text.trim();
3974
+ if (curr) cells[pr][c].text = prev ? prev + " " + curr : curr;
3975
+ }
3976
+ for (let c = 0; c < numCols; c++) cells[r][c].text = "";
3977
+ break;
3978
+ }
3979
+ }
3980
+ }
3981
+ for (let r = 0; r < cells.length - 1; r++) {
3982
+ const row = cells[r];
3983
+ const hasCol0 = row[0].text.trim() !== "";
3984
+ const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
3985
+ const midEmpty = row.slice(1, numCols - 1).every((c) => c.text.trim() === "");
3986
+ if (hasCol0 && hasColLast && midEmpty) {
3987
+ const next = cells[r + 1];
3988
+ if (next[0].text.trim() === "" && next.some((c) => c.text.trim())) {
3989
+ for (let c = 1; c < numCols; c++) {
3990
+ const curr = next[c].text.trim();
3991
+ if (curr) row[c].text = row[c].text.trim() ? row[c].text.trim() + " " + curr : curr;
3992
+ }
3993
+ for (let c = 0; c < numCols; c++) next[c].text = "";
3994
+ }
3995
+ }
3996
+ }
3997
+ const filteredCells = cells.filter((row) => row.some((c) => c.text.trim()));
3998
+ const finalRowCount = filteredCells.length;
3999
+ if (finalRowCount < MIN_ROWS) return null;
3411
4000
  const irTable = {
3412
- rows: numRows,
4001
+ rows: finalRowCount,
3413
4002
  cols: numCols,
3414
- cells,
3415
- hasHeader: numRows > 1
4003
+ cells: filteredCells,
4004
+ hasHeader: finalRowCount > 1
3416
4005
  };
3417
4006
  const allItems = rows.flatMap((r) => r.items);
3418
4007
  let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
@@ -3489,7 +4078,7 @@ async function parsePdfDocument(buffer, options) {
3489
4078
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
3490
4079
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
3491
4080
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
3492
- const allFontSizes = [];
4081
+ const fontSizeFreq = /* @__PURE__ */ new Map();
3493
4082
  const pageHeights = /* @__PURE__ */ new Map();
3494
4083
  let parsedPages = 0;
3495
4084
  for (let i = 1; i <= effectivePageCount; i++) {
@@ -3506,7 +4095,7 @@ async function parsePdfDocument(buffer, options) {
3506
4095
  warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
3507
4096
  }
3508
4097
  for (const item of visible) {
3509
- if (item.fontSize > 0) allFontSizes.push(item.fontSize);
4098
+ if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
3510
4099
  }
3511
4100
  const opList = await page.getOperatorList();
3512
4101
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
@@ -3528,7 +4117,7 @@ async function parsePdfDocument(buffer, options) {
3528
4117
  if (totalChars / Math.max(parsedPageCount, 1) < 10) {
3529
4118
  if (options?.ocr) {
3530
4119
  try {
3531
- const { ocrPages } = await import("./provider-EU3CG724.js");
4120
+ const { ocrPages } = await import("./provider-7H4CPZYS.js");
3532
4121
  const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
3533
4122
  if (ocrBlocks.length > 0) {
3534
4123
  const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
@@ -3545,7 +4134,7 @@ async function parsePdfDocument(buffer, options) {
3545
4134
  blocks.splice(removed[ri], 1);
3546
4135
  }
3547
4136
  }
3548
- const medianFontSize = computeMedianFontSize(allFontSizes);
4137
+ const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
3549
4138
  if (medianFontSize > 0) {
3550
4139
  detectHeadings(blocks, medianFontSize);
3551
4140
  }
@@ -3609,11 +4198,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
3609
4198
  }
3610
4199
  return { visible, hiddenCount };
3611
4200
  }
3612
- function computeMedianFontSize(sizes) {
3613
- if (sizes.length === 0) return 0;
3614
- const sorted = [...sizes].sort((a, b) => a - b);
3615
- const mid = Math.floor(sorted.length / 2);
3616
- return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
4201
/**
 * Computes the median font size from a frequency table.
 *
 * For an odd number of samples this is the middle sample. For an even
 * number it is the mean of the two middle samples, matching the array-based
 * median this helper replaces (a plain "first size past the midpoint" scan
 * would return the upper of the two instead).
 *
 * @param {Map<number, number>} freq Font size -> number of occurrences.
 * @returns {number} The median font size, or 0 when the table is empty.
 */
function computeMedianFontSizeFromFreq(freq) {
  if (freq.size === 0) return 0;
  let total = 0;
  for (const count of freq.values()) total += count;

  const sorted = [...freq.entries()].sort((a, b) => a[0] - b[0]);
  // Zero-based index of the upper middle sample in the expanded, sorted list.
  const upperIdx = Math.floor(total / 2);
  const isEven = total % 2 === 0;

  let lowerMiddle;
  let cumulative = 0;
  for (const [size, count] of sorted) {
    cumulative += count;
    // First bucket that reaches index upperIdx - 1 holds the lower middle sample.
    if (lowerMiddle === undefined && cumulative >= upperIdx) lowerMiddle = size;
    if (cumulative > upperIdx) {
      return isEven ? (lowerMiddle + size) / 2 : size;
    }
  }
  // Only reachable when the counts do not add up to a positive total.
  return sorted[sorted.length - 1][0];
}
3618
4214
  function detectHeadings(blocks, medianFontSize) {
3619
4215
  for (const block of blocks) {
@@ -3639,11 +4235,21 @@ function collapseEvenSpacing(text) {
3639
4235
  if (tokens.length >= 3 && singleCharCount / tokens.length >= 0.7) {
3640
4236
  return tokens.join("");
3641
4237
  }
3642
- return text;
4238
+ return text.replace(
4239
+ /(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
4240
+ (match) => match.replace(/ /g, "")
4241
+ );
3643
4242
  }
3644
4243
  function shouldDemoteTable(table) {
3645
4244
  const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
3646
4245
  const allText = allCells.join(" ");
4246
+ if (table.rows <= 3 && table.cols <= 3) {
4247
+ const totalCells2 = table.rows * table.cols;
4248
+ const emptyCells2 = totalCells2 - allCells.length;
4249
+ if (emptyCells2 >= totalCells2 * 0.3) return true;
4250
+ if (/[□■◆○●▶ㅇ]/.test(allText)) return true;
4251
+ if (/<[^>]+>/.test(allText)) return true;
4252
+ }
3647
4253
  if (allText.length > 200) return false;
3648
4254
  if (/[□■◆○●▶]/.test(allText) && table.rows <= 3) return true;
3649
4255
  const totalCells = table.rows * table.cols;
@@ -3754,6 +4360,7 @@ function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeigh
3754
4360
  if (items.length === 0) return [];
3755
4361
  let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
3756
4362
  ({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
4363
+ ({ horizontals, verticals } = preprocessLines(horizontals, verticals));
3757
4364
  const grids = buildTableGrids(horizontals, verticals);
3758
4365
  if (grids.length > 0) {
3759
4366
  return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
@@ -3765,14 +4372,19 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
3765
4372
  const usedItems = /* @__PURE__ */ new Set();
3766
4373
  const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
3767
4374
  for (const grid of sortedGrids) {
4375
+ const numGridRows = grid.rowYs.length - 1;
4376
+ const numGridCols = grid.colXs.length - 1;
4377
+ if (numGridRows === 1 && numGridCols >= 2) continue;
3768
4378
  const tableItems = [];
3769
4379
  const pad = 3;
4380
+ const gridW = grid.bbox.x2 - grid.bbox.x1;
3770
4381
  for (const item of items) {
3771
4382
  if (usedItems.has(item)) continue;
3772
- if (item.x >= grid.bbox.x1 - pad && item.x + item.w <= grid.bbox.x2 + pad && item.y >= grid.bbox.y1 - pad && item.y <= grid.bbox.y2 + pad) {
3773
- tableItems.push(item);
3774
- usedItems.add(item);
3775
- }
4383
+ if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
4384
+ if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
4385
+ if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
4386
+ tableItems.push(item);
4387
+ usedItems.add(item);
3776
4388
  }
3777
4389
  const cells = extractCells(grid, horizontals, verticals);
3778
4390
  if (cells.length === 0) continue;
@@ -3796,6 +4408,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
3796
4408
  const cellItems = cellTextMap.get(cell) || [];
3797
4409
  let text = cellTextToString(cellItems);
3798
4410
  text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
4411
+ text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
3799
4412
  irGrid[cell.row][cell.col] = {
3800
4413
  text,
3801
4414
  colSpan: cell.colSpan,
@@ -3820,23 +4433,58 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
3820
4433
  if (shouldDemoteTable(irTable)) {
3821
4434
  const demoted = demoteTableToText(irTable);
3822
4435
  if (demoted) {
3823
- blocks.push({ type: "paragraph", text: demoted, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
4436
+ const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
4437
+ blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
3824
4438
  }
3825
4439
  continue;
3826
4440
  }
3827
4441
  blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
3828
4442
  }
3829
- const remaining = items.filter((i) => !usedItems.has(i));
4443
+ let remaining = items.filter((i) => !usedItems.has(i));
3830
4444
  if (remaining.length > 0) {
3831
4445
  remaining.sort((a, b) => b.y - a.y || a.x - b.x);
3832
- const textBlocks = detectListBlocks(extractPageBlocksFallback(remaining, pageNum));
3833
- const allBlocks = [...blocks, ...textBlocks];
3834
- allBlocks.sort((a, b) => {
4446
+ const clusterItems = remaining.map((i) => ({
4447
+ text: i.text,
4448
+ x: i.x,
4449
+ y: i.y,
4450
+ w: i.w,
4451
+ h: i.h,
4452
+ fontSize: i.fontSize,
4453
+ fontName: i.fontName
4454
+ }));
4455
+ const clusterResults = detectClusterTables(clusterItems, pageNum);
4456
+ if (clusterResults.length > 0) {
4457
+ const ciToIdx = /* @__PURE__ */ new Map();
4458
+ for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4459
+ const usedClusterIndices = /* @__PURE__ */ new Set();
4460
+ for (const cr of clusterResults) {
4461
+ for (const ci of cr.usedItems) {
4462
+ const idx = ciToIdx.get(ci);
4463
+ if (idx !== void 0) usedClusterIndices.add(idx);
4464
+ }
4465
+ blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4466
+ }
4467
+ remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
4468
+ }
4469
+ if (remaining.length > 0) {
4470
+ const allY = remaining.map((i) => i.y);
4471
+ const pageH = safeMax(allY) - safeMin(allY);
4472
+ const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
4473
+ const textBlocks = [];
4474
+ for (const group of groups) {
4475
+ if (group.length === 0) continue;
4476
+ const groupBlocks = extractPageBlocksFallback(group, pageNum);
4477
+ for (const b of groupBlocks) textBlocks.push(b);
4478
+ }
4479
+ const finalTextBlocks = detectListBlocks(textBlocks);
4480
+ for (const b of finalTextBlocks) blocks.push(b);
4481
+ }
4482
+ blocks.sort((a, b) => {
3835
4483
  const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
3836
4484
  const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
3837
4485
  return by - ay;
3838
4486
  });
3839
- return mergeAdjacentTableBlocks(allBlocks);
4487
+ return mergeAdjacentTableBlocks(blocks);
3840
4488
  }
3841
4489
  return mergeAdjacentTableBlocks(blocks);
3842
4490
  }
@@ -3863,52 +4511,52 @@ function mergeAdjacentTableBlocks(blocks) {
3863
4511
  function extractPageBlocksFallback(items, pageNum) {
3864
4512
  if (items.length === 0) return [];
3865
4513
  const blocks = [];
3866
- const allYLines = groupByY(items);
3867
- const columns = detectColumns(allYLines);
3868
- if (columns && columns.length >= 3) {
3869
- const tableText = extractWithColumns(allYLines, columns);
3870
- const bbox = computeBBox(items, pageNum);
3871
- blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
3872
- } else {
3873
- const clusterItems = items.map((i) => ({
3874
- text: i.text,
3875
- x: i.x,
3876
- y: i.y,
3877
- w: i.w,
3878
- h: i.h,
3879
- fontSize: i.fontSize,
3880
- fontName: i.fontName
3881
- }));
3882
- const clusterResults = detectClusterTables(clusterItems, pageNum);
3883
- if (clusterResults.length > 0) {
3884
- const ciToIdx = /* @__PURE__ */ new Map();
3885
- for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
3886
- const usedIndices = /* @__PURE__ */ new Set();
3887
- for (const cr of clusterResults) {
3888
- for (const ci of cr.usedItems) {
3889
- const idx = ciToIdx.get(ci);
3890
- if (idx !== void 0) usedIndices.add(idx);
3891
- }
3892
- blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4514
+ const clusterItems = items.map((i) => ({
4515
+ text: i.text,
4516
+ x: i.x,
4517
+ y: i.y,
4518
+ w: i.w,
4519
+ h: i.h,
4520
+ fontSize: i.fontSize,
4521
+ fontName: i.fontName
4522
+ }));
4523
+ const clusterResults = detectClusterTables(clusterItems, pageNum);
4524
+ if (clusterResults.length > 0) {
4525
+ const ciToIdx = /* @__PURE__ */ new Map();
4526
+ for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4527
+ const usedIndices = /* @__PURE__ */ new Set();
4528
+ for (const cr of clusterResults) {
4529
+ for (const ci of cr.usedItems) {
4530
+ const idx = ciToIdx.get(ci);
4531
+ if (idx !== void 0) usedIndices.add(idx);
3893
4532
  }
3894
- const remaining = items.filter((_, idx) => !usedIndices.has(idx));
3895
- if (remaining.length > 0) {
3896
- const yLines = groupByY(remaining);
3897
- for (const line of yLines) {
3898
- const text = mergeLineSimple(line);
3899
- if (!text.trim()) continue;
3900
- const bbox = computeBBox(line, pageNum);
3901
- blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
3902
- }
4533
+ blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4534
+ }
4535
+ const remaining = items.filter((_, idx) => !usedIndices.has(idx));
4536
+ if (remaining.length > 0) {
4537
+ const yLines = groupByY(remaining);
4538
+ for (const line of yLines) {
4539
+ const text = mergeLineSimple(line);
4540
+ if (!text.trim()) continue;
4541
+ const bbox = computeBBox(line, pageNum);
4542
+ blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
3903
4543
  }
3904
- blocks.sort((a, b) => {
3905
- const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
3906
- const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
3907
- return by - ay;
3908
- });
4544
+ }
4545
+ blocks.sort((a, b) => {
4546
+ const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4547
+ const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4548
+ return by - ay;
4549
+ });
4550
+ } else {
4551
+ const allYLines = groupByY(items);
4552
+ const columns = detectColumns(allYLines);
4553
+ if (columns && columns.length >= 3) {
4554
+ const tableText = extractWithColumns(allYLines, columns);
4555
+ const bbox = computeBBox(items, pageNum);
4556
+ blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
3909
4557
  } else {
3910
4558
  const allY = items.map((i) => i.y);
3911
- const pageHeight = Math.max(...allY) - Math.min(...allY);
4559
+ const pageHeight = safeMax(allY) - safeMin(allY);
3912
4560
  const gapThreshold = Math.max(15, pageHeight * 0.03);
3913
4561
  const orderedGroups = xyCutOrder(items, gapThreshold);
3914
4562
  for (const group of orderedGroups) {
@@ -3961,22 +4609,76 @@ function dominantStyle(items) {
3961
4609
  return { fontSize: dominantSize, fontName };
3962
4610
  }
3963
4611
  function normalizeItems(rawItems) {
3964
- return rawItems.filter((i) => typeof i.str === "string" && i.str.trim() !== "").map((i) => {
4612
+ const items = [];
4613
+ const spacePositions = [];
4614
+ for (const i of rawItems) {
4615
+ if (typeof i.str !== "string") continue;
4616
+ const x = Math.round(i.transform[4]);
4617
+ const y = Math.round(i.transform[5]);
4618
+ if (!i.str.trim()) {
4619
+ spacePositions.push({ x, y });
4620
+ continue;
4621
+ }
3965
4622
  const scaleY = Math.abs(i.transform[3]);
3966
4623
  const scaleX = Math.abs(i.transform[0]);
3967
4624
  const fontSize = Math.round(Math.max(scaleY, scaleX));
3968
- return {
3969
- text: i.str.trim(),
3970
- x: Math.round(i.transform[4]),
3971
- y: Math.round(i.transform[5]),
3972
- w: Math.round(i.width),
3973
- h: Math.round(i.height),
3974
- fontSize,
3975
- fontName: i.fontName || "",
3976
- // 0pt 폰트이거나 너비 0 → hidden text (prompt injection 의심)
3977
- isHidden: fontSize === 0 || i.width === 0 && i.str.trim().length > 0
3978
- };
3979
- }).sort((a, b) => b.y - a.y || a.x - b.x);
4625
+ const w = Math.round(i.width);
4626
+ const h = Math.round(i.height);
4627
+ const isHidden = fontSize === 0 || i.width === 0 && i.str.trim().length > 0;
4628
+ let text = i.str.trim();
4629
+ if (/^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text)) {
4630
+ text = text.replace(/ /g, "");
4631
+ }
4632
+ const split = splitEvenSpacedItem(text, x, w, fontSize);
4633
+ if (split) {
4634
+ for (const s of split) {
4635
+ items.push({ text: s.text, x: s.x, y, w: s.w, h, fontSize, fontName: i.fontName || "", isHidden });
4636
+ }
4637
+ } else {
4638
+ items.push({ text, x, y, w, h, fontSize, fontName: i.fontName || "", isHidden });
4639
+ }
4640
+ }
4641
+ const sorted = items.sort((a, b) => b.y - a.y || a.x - b.x);
4642
+ const deduped = [];
4643
+ for (let i = 0; i < sorted.length; i++) {
4644
+ let isDup = false;
4645
+ for (let j = deduped.length - 1; j >= 0; j--) {
4646
+ const prev = deduped[j];
4647
+ if (prev.y - sorted[i].y > 3) break;
4648
+ if (Math.abs(prev.y - sorted[i].y) <= 3 && prev.text === sorted[i].text && Math.abs(prev.x - sorted[i].x) <= 3) {
4649
+ isDup = true;
4650
+ break;
4651
+ }
4652
+ }
4653
+ if (!isDup) deduped.push(sorted[i]);
4654
+ }
4655
+ if (spacePositions.length > 0) {
4656
+ for (const item of deduped) {
4657
+ for (const sp of spacePositions) {
4658
+ if (Math.abs(sp.y - item.y) <= 3) {
4659
+ const dist = item.x - sp.x;
4660
+ if (dist >= 0 && dist <= 20) {
4661
+ item.hasSpaceBefore = true;
4662
+ break;
4663
+ }
4664
+ }
4665
+ }
4666
+ }
4667
+ }
4668
+ return deduped;
4669
+ }
4670
/**
 * Splits an evenly spaced text item (single Hangul syllables or digits
 * separated by single spaces, e.g. "가 나 다") into one positioned item per
 * character.
 *
 * @param {string} text - Trimmed text of the item.
 * @param {number} itemX - Left x coordinate of the item.
 * @param {number} itemW - Total width of the item.
 * @param {number} fontSize - Font size of the item.
 * @returns {Array<{text: string, x: number, w: number}> | null} The
 *   per-character pieces, or null when the item should be kept as a whole.
 */
function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
  // At least three single characters, each separated by exactly one space.
  if (!/^[가-힣\d](?: [가-힣\d]){2,}$/.test(text)) return null;
  // Without a positive, finite-comparable width there is no usable character
  // pitch: every piece would land on the same x (or on NaN), and pieces with
  // identical text would then be treated as duplicates of each other.
  if (!(itemW > 0)) return null;
  const chars = text.split(" ");
  const charW = itemW / chars.length;
  // Characters spread further apart than two font sizes are not split.
  if (charW > fontSize * 2) return null;
  return chars.map((ch, idx) => ({
    text: ch,
    x: Math.round(itemX + idx * charW),
    // The actual glyph is narrower than the spacing between characters.
    w: Math.round(charW * 0.8)
  }));
}
3981
4683
  function groupByY(items) {
3982
4684
  if (items.length === 0) return [];
@@ -4001,14 +4703,14 @@ function isProseSpread(items) {
4001
4703
  for (let i = 1; i < sorted.length; i++) {
4002
4704
  gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
4003
4705
  }
4004
- const maxGap = Math.max(...gaps);
4706
+ const maxGap = safeMax(gaps);
4005
4707
  const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
4006
4708
  return maxGap < 40 && avgLen < 5;
4007
4709
  }
4008
4710
  function detectColumns(yLines) {
4009
4711
  const allItems = yLines.flat();
4010
4712
  if (allItems.length === 0) return null;
4011
- const pageWidth = Math.max(...allItems.map((i) => i.x + i.w)) - Math.min(...allItems.map((i) => i.x));
4713
+ const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
4012
4714
  if (pageWidth < 100) return null;
4013
4715
  let bigoLineIdx = -1;
4014
4716
  for (let i = 0; i < yLines.length; i++) {
@@ -4040,7 +4742,7 @@ function detectColumns(yLines) {
4040
4742
  }
4041
4743
  const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
4042
4744
  if (peaks.length < 3) return null;
4043
- const MERGE_TOL = 30;
4745
+ const MERGE_TOL = 40;
4044
4746
  const merged = [peaks[0]];
4045
4747
  for (let i = 1; i < peaks.length; i++) {
4046
4748
  const prev = merged[merged.length - 1];
@@ -4054,7 +4756,14 @@ function detectColumns(yLines) {
4054
4756
  merged.push({ ...peaks[i] });
4055
4757
  }
4056
4758
  }
4057
- const columns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
4759
+ const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
4760
+ if (rawColumns.length < 3) return null;
4761
+ const MIN_DETECT_COL_WIDTH = 30;
4762
+ const columns = [rawColumns[0]];
4763
+ for (let i = 1; i < rawColumns.length; i++) {
4764
+ if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
4765
+ columns.push(rawColumns[i]);
4766
+ }
4058
4767
  return columns.length >= 3 ? columns : null;
4059
4768
  }
4060
4769
  function findColumn(x, columns) {
@@ -4182,6 +4891,16 @@ function buildGridTable(lines, columns) {
4182
4891
  }
4183
4892
  merged.splice(0, headerEnd, headerRow);
4184
4893
  }
4894
+ for (const row of merged) {
4895
+ for (let c = 0; c < row.length; c++) {
4896
+ if (row[c]) row[c] = collapseEvenSpacing(row[c]);
4897
+ }
4898
+ }
4899
+ const totalCells = merged.length * numCols;
4900
+ const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
4901
+ if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
4902
+ return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
4903
+ }
4185
4904
  const md = [];
4186
4905
  md.push("| " + merged[0].join(" | ") + " |");
4187
4906
  md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
@@ -4193,12 +4912,32 @@ function buildGridTable(lines, columns) {
4193
4912
  function mergeLineSimple(items) {
4194
4913
  if (items.length <= 1) return items[0]?.text || "";
4195
4914
  const sorted = [...items].sort((a, b) => a.x - b.x);
4915
+ const isEvenSpaced = detectEvenSpacedItems(sorted);
4196
4916
  let result = sorted[0].text;
4197
4917
  for (let i = 1; i < sorted.length; i++) {
4198
4918
  const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
4199
4919
  const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
4200
- if (gap > 15) result += " ";
4201
- else if (gap < avgFs * 0.15) {
4920
+ const tabThreshold = Math.max(avgFs * 2, 30);
4921
+ if (gap > tabThreshold) {
4922
+ result += " ";
4923
+ result += sorted[i].text;
4924
+ continue;
4925
+ }
4926
+ if (isEvenSpaced[i]) {
4927
+ result += sorted[i].text;
4928
+ continue;
4929
+ }
4930
+ if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
4931
+ result += " ";
4932
+ result += sorted[i].text;
4933
+ continue;
4934
+ }
4935
+ if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
4936
+ result += " ";
4937
+ result += sorted[i].text;
4938
+ continue;
4939
+ }
4940
+ if (gap < avgFs * 0.15) {
4202
4941
  } else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
4203
4942
  } else if (gap > 3) result += " ";
4204
4943
  result += sorted[i].text;
@@ -4207,8 +4946,8 @@ function mergeLineSimple(items) {
4207
4946
  }
4208
4947
  function cleanPdfText(text) {
4209
4948
  return mergeKoreanLines(
4210
- text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
4211
- ).replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line)).replace(/\n{3,}/g, "\n\n").trim();
4949
+ text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
4950
+ ).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
4212
4951
  }
4213
4952
  function startsWithMarker(line) {
4214
4953
  const t = line.trimStart();
@@ -4400,7 +5139,7 @@ function mergeKoreanLines(text) {
4400
5139
  result[result.length - 1] = prev + " " + currTrimmed;
4401
5140
  continue;
4402
5141
  }
4403
- if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
5142
+ if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev) && !startsWithMarker(prev)) {
4404
5143
  result[result.length - 1] = prev + " " + curr;
4405
5144
  } else {
4406
5145
  result.push(curr);
@@ -4413,7 +5152,7 @@ function mergeKoreanLines(text) {
4413
5152
  import { readFile } from "fs/promises";
4414
5153
 
4415
5154
  // src/xlsx/parser.ts
4416
- import JSZip3 from "jszip";
5155
+ import JSZip2 from "jszip";
4417
5156
  import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
4418
5157
  var MAX_SHEETS = 100;
4419
5158
  var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024;
@@ -4451,7 +5190,7 @@ function getTextContent(el) {
4451
5190
  return el.textContent?.trim() ?? "";
4452
5191
  }
4453
5192
  function parseXml(text) {
4454
- return new DOMParser2().parseFromString(text, "text/xml");
5193
+ return new DOMParser2().parseFromString(stripDtd(text), "text/xml");
4455
5194
  }
4456
5195
  function parseSharedStrings(xml) {
4457
5196
  const doc = parseXml(xml);
@@ -4604,7 +5343,7 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
4604
5343
  }
4605
5344
  async function parseXlsxDocument(buffer, options) {
4606
5345
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
4607
- const zip = await JSZip3.loadAsync(buffer);
5346
+ const zip = await JSZip2.loadAsync(buffer);
4608
5347
  const warnings = [];
4609
5348
  const workbookFile = zip.file("xl/workbook.xml");
4610
5349
  if (!workbookFile) {
@@ -4626,7 +5365,7 @@ async function parseXlsxDocument(buffer, options) {
4626
5365
  }
4627
5366
  let pageFilter = null;
4628
5367
  if (options?.pages) {
4629
- const { parsePageRange: parsePageRange2 } = await import("./page-range-OF5I4PQY.js");
5368
+ const { parsePageRange: parsePageRange2 } = await import("./page-range-737B4EZW.js");
4630
5369
  pageFilter = parsePageRange2(options.pages, sheets.length);
4631
5370
  }
4632
5371
  const blocks = [];
@@ -4694,7 +5433,7 @@ async function parseXlsxDocument(buffer, options) {
4694
5433
  }
4695
5434
 
4696
5435
  // src/docx/parser.ts
4697
- import JSZip4 from "jszip";
5436
+ import JSZip3 from "jszip";
4698
5437
  import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
4699
5438
  var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
4700
5439
  function getChildElements(parent, localName) {
@@ -4738,7 +5477,7 @@ function getAttr(el, localName) {
4738
5477
  return null;
4739
5478
  }
4740
5479
  function parseXml2(text) {
4741
- return new DOMParser3().parseFromString(text, "text/xml");
5480
+ return new DOMParser3().parseFromString(stripDtd(text), "text/xml");
4742
5481
  }
4743
5482
  function parseStyles(xml) {
4744
5483
  const doc = parseXml2(xml);
@@ -5032,7 +5771,7 @@ async function extractImages(zip, rels, doc) {
5032
5771
  }
5033
5772
  async function parseDocxDocument(buffer, options) {
5034
5773
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
5035
- const zip = await JSZip4.loadAsync(buffer);
5774
+ const zip = await JSZip3.loadAsync(buffer);
5036
5775
  const warnings = [];
5037
5776
  const docFile = zip.file("word/document.xml");
5038
5777
  if (!docFile) {
@@ -5249,7 +5988,7 @@ function extractInlineFields(text) {
5249
5988
  }
5250
5989
 
5251
5990
  // src/hwpx/generator.ts
5252
- import JSZip5 from "jszip";
5991
+ import JSZip4 from "jszip";
5253
5992
 
5254
5993
  // src/index.ts
5255
5994
  async function parse(input, options) {
@@ -5344,7 +6083,13 @@ function normalize(s) {
5344
6083
  }
5345
6084
  var MAX_LEVENSHTEIN_LEN = 1e4;
5346
6085
  function levenshtein(a, b) {
5347
- if (a.length + b.length > MAX_LEVENSHTEIN_LEN) return Math.abs(a.length - b.length);
6086
+ if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
6087
+ const sampleLen = Math.min(500, a.length, b.length);
6088
+ let diffs = 0;
6089
+ for (let i = 0; i < sampleLen; i++) if (a[i] !== b[i]) diffs++;
6090
+ const sampleRate = sampleLen > 0 ? diffs / sampleLen : 1;
6091
+ return Math.abs(a.length - b.length) + Math.round(Math.min(a.length, b.length) * sampleRate);
6092
+ }
5348
6093
  if (a.length > b.length) [a, b] = [b, a];
5349
6094
  const m = a.length;
5350
6095
  const n = b.length;
@@ -5500,7 +6245,10 @@ function diffTableCells(a, b) {
5500
6245
  }
5501
6246
 
5502
6247
  export {
5503
- detectFormat,
6248
+ VERSION,
6249
+ toArrayBuffer,
6250
+ KordocError,
6251
+ sanitizeError,
5504
6252
  blocksToMarkdown,
5505
6253
  extractHwpxMetadataOnly,
5506
6254
  extractHwp5MetadataOnly,
@@ -5509,4 +6257,4 @@ export {
5509
6257
  extractFormFields,
5510
6258
  parse
5511
6259
  };
5512
- //# sourceMappingURL=chunk-4UH6ABAY.js.map
6260
+ //# sourceMappingURL=chunk-LYFG7AUT.js.map