kordoc 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,53 +1,105 @@
1
1
  #!/usr/bin/env node
2
2
  import {
3
- KordocError,
4
- classifyError,
5
- isPathTraversal,
6
- precheckZipSize,
7
- sanitizeHref,
8
- toArrayBuffer
9
- } from "./chunk-PKIJLEV6.js";
3
+ detectFormat,
4
+ detectZipFormat
5
+ } from "./chunk-5Y2Q3BRW.js";
10
6
  import {
11
7
  parsePageRange
12
8
  } from "./chunk-MOL7MDBG.js";
13
9
 
14
- // src/detect.ts
15
- import JSZip from "jszip";
16
- function magicBytes(buffer) {
17
- return new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
18
- }
19
- function isZipFile(buffer) {
20
- const b = magicBytes(buffer);
21
- return b[0] === 80 && b[1] === 75 && b[2] === 3 && b[3] === 4;
22
- }
23
- function isOldHwpFile(buffer) {
24
- const b = magicBytes(buffer);
25
- return b[0] === 208 && b[1] === 207 && b[2] === 17 && b[3] === 224;
10
+ // src/utils.ts
11
+ var VERSION = true ? "2.2.0" : "0.0.0-dev";
12
/**
 * Returns an ArrayBuffer holding exactly the bytes that `buf` views.
 *
 * When the view starts at offset 0 and spans its whole backing buffer, the
 * backing buffer itself is returned (no copy). Otherwise the viewed byte range
 * is copied out with `slice`.
 *
 * @param {{buffer: ArrayBuffer, byteOffset: number, byteLength: number}} buf
 *   A view exposing `buffer`, `byteOffset` and `byteLength`.
 * @returns {ArrayBuffer} The backing buffer, or a copy of the viewed range.
 */
function toArrayBuffer(buf) {
  const { buffer, byteOffset, byteLength } = buf;
  const coversWholeBuffer = byteOffset === 0 && byteLength === buffer.byteLength;
  return coversWholeBuffer ? buffer : buffer.slice(byteOffset, byteOffset + byteLength);
}
27
- function isPdfFile(buffer) {
28
- const b = magicBytes(buffer);
29
- return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
18
/**
 * Error subclass whose `name` is "KordocError".
 *
 * The optional second argument is forwarded to the built-in `Error`
 * constructor, so callers may attach the underlying failure as
 * `{ cause: err }`. Omitting it behaves exactly like `new Error(message)`.
 *
 * @param {string} message - Error message.
 * @param {{cause?: unknown}} [options] - Standard `Error` options.
 */
var KordocError = class extends Error {
  constructor(message, options) {
    super(message, options);
    this.name = "KordocError";
  }
};
24
/**
 * Maps an arbitrary thrown value to a message string.
 *
 * A `KordocError` instance yields its own `message`; every other value yields
 * one fixed generic (Korean) message instead of whatever the value carried.
 *
 * @param {unknown} err - The caught value.
 * @returns {string} The message to report.
 */
function sanitizeError(err) {
  const isKnownError = err instanceof KordocError;
  return isKnownError
    ? err.message
    : "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
}
31
- function detectFormat(buffer) {
32
- if (buffer.byteLength < 4) return "unknown";
33
- if (isZipFile(buffer)) return "hwpx";
34
- if (isOldHwpFile(buffer)) return "hwp";
35
- if (isPdfFile(buffer)) return "pdf";
36
- return "unknown";
28
/**
 * Reports whether an entry name looks like a path-traversal attempt.
 *
 * A name is flagged when it contains a NUL character, or when, after turning
 * backslashes into forward slashes, it starts with "/", starts with a drive
 * prefix such as "C:", or has any segment equal to "..".
 *
 * @param {string} name - Entry name to inspect.
 * @returns {boolean} `true` when the name must be rejected.
 */
function isPathTraversal(name) {
  if (name.indexOf("\0") !== -1) return true;
  const unixStyle = name.split("\\").join("/");
  if (unixStyle.startsWith("/")) return true;
  if (/^[A-Za-z]:/.test(unixStyle)) return true;
  return unixStyle.split("/").includes("..");
}
38
- async function detectZipFormat(buffer) {
34
/**
 * Inspects a ZIP's central directory, without decompressing anything, and
 * rejects archives that declare too many entries or too much uncompressed data.
 *
 * `buffer` may be an ArrayBuffer or any ArrayBuffer view (Uint8Array, Node
 * Buffer, DataView). Previously a view made `new DataView(buffer)` throw, and
 * the catch-all below turned that into a `{0, 0}` result, silently skipping
 * the check.
 *
 * @param {ArrayBuffer|ArrayBufferView} buffer - Raw ZIP bytes.
 * @param {number} [maxUncompressedSize=104857600] - Limit on the summed declared uncompressed sizes, in bytes.
 * @param {number} [maxEntries=500] - Limit on the entry count declared in the end-of-central-directory record.
 * @returns {{totalUncompressed: number, entryCount: number}} Declared totals;
 *   zeros when no end-of-central-directory record is found or parsing fails.
 * @throws {KordocError} When either limit is exceeded.
 */
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
  try {
    const data = ArrayBuffer.isView(buffer)
      ? new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength)
      : new DataView(buffer);
    const len = data.byteLength;
    // Search backwards for the end-of-central-directory signature (0x06054b50).
    // The record is 22 bytes plus a trailing comment of at most 65535 bytes.
    let eocdOffset = -1;
    const scanFloor = Math.max(0, len - 65557);
    for (let i = len - 22; i >= scanFloor; i--) {
      if (data.getUint32(i, true) === 101010256) {
        eocdOffset = i;
        break;
      }
    }
    if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
    const entryCount = data.getUint16(eocdOffset + 10, true);
    if (entryCount > maxEntries) {
      throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
    }
    const cdSize = data.getUint32(eocdOffset + 12, true);
    const cdOffset = data.getUint32(eocdOffset + 16, true);
    // A central directory that claims to extend past the data cannot be walked safely.
    if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
    let totalUncompressed = 0;
    let pos = cdOffset;
    for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
      // Each central directory file header starts with signature 0x02014b50.
      if (data.getUint32(pos, true) !== 33639248) break;
      totalUncompressed += data.getUint32(pos + 24, true);
      const nameLen = data.getUint16(pos + 28, true);
      const extraLen = data.getUint16(pos + 30, true);
      const commentLen = data.getUint16(pos + 32, true);
      pos += 46 + nameLen + extraLen + commentLen;
    }
    if (totalUncompressed > maxUncompressedSize) {
      throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
    }
    return { totalUncompressed, entryCount };
  } catch (err) {
    // Limit violations propagate; any other parse failure is treated as
    // "nothing measurable" so the caller can continue with its own handling.
    if (err instanceof KordocError) throw err;
    return { totalUncompressed: 0, entryCount: 0 };
  }
}
72
/**
 * Removes every `<!DOCTYPE ...>` declaration from an XML string, including an
 * optional bracketed internal subset. Matching is case-insensitive.
 *
 * @param {string} xml - XML text.
 * @returns {string} The text with DOCTYPE declarations removed.
 */
function stripDtd(xml) {
  const doctypePattern = /<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi;
  return xml.replace(doctypePattern, "");
}
75
// Allow-list of link prefixes: http(s), mailto, tel, and in-document fragments.
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;

/**
 * Returns the trimmed href when it starts with an allow-listed prefix.
 *
 * @param {string} href - Raw link target.
 * @returns {string|null} The trimmed href, or `null` when it is empty or does
 *   not match `SAFE_HREF_RE`.
 */
function sanitizeHref(href) {
  const candidate = href.trim();
  const isAllowed = candidate !== "" && SAFE_HREF_RE.test(candidate);
  return isAllowed ? candidate : null;
}
81
/**
 * Finds the smallest element by scanning indices 0..length-1 with `<`.
 *
 * @param {ArrayLike<number>} arr - Values to scan.
 * @returns {number} The smallest element, or `Infinity` when nothing compares
 *   below it (for example, an empty input).
 */
function safeMin(arr) {
  let smallest = Infinity;
  for (let idx = 0; idx < arr.length; idx += 1) {
    const candidate = arr[idx];
    if (candidate < smallest) {
      smallest = candidate;
    }
  }
  return smallest;
}
86
/**
 * Finds the largest element by scanning indices 0..length-1 with `>`.
 *
 * @param {ArrayLike<number>} arr - Values to scan.
 * @returns {number} The largest element, or `-Infinity` when nothing compares
 *   above it (for example, an empty input).
 */
function safeMax(arr) {
  let largest = -Infinity;
  for (let idx = 0; idx < arr.length; idx += 1) {
    const candidate = arr[idx];
    if (candidate > largest) {
      largest = candidate;
    }
  }
  return largest;
}
91
/**
 * Derives an error code from the text of an error message.
 *
 * Rules are tried in order and the first match wins, so the ZIP-specific rule
 * must stay ahead of the broader decompression-bomb rule. Values that are not
 * `Error` instances, and messages matching no rule, yield "PARSE_ERROR".
 *
 * @param {unknown} err - The caught value.
 * @returns {string} One of "ENCRYPTED", "DRM_PROTECTED", "ZIP_BOMB",
 *   "DECOMPRESSION_BOMB", "IMAGE_BASED_PDF", "NO_SECTIONS", "CORRUPTED",
 *   "PARSE_ERROR".
 */
function classifyError(err) {
  if (!(err instanceof Error)) return "PARSE_ERROR";
  const text = err.message;
  const has = (needle) => text.includes(needle);
  const rules = [
    ["ENCRYPTED", () => has("\uC554\uD638\uD654")],
    ["DRM_PROTECTED", () => has("DRM")],
    [
      "ZIP_BOMB",
      () => has("ZIP bomb") || has("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || has("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")
    ],
    [
      "DECOMPRESSION_BOMB",
      () => has("bomb") || has("\uD06C\uAE30 \uCD08\uACFC") || has("\uC555\uCD95 \uD574\uC81C")
    ],
    ["IMAGE_BASED_PDF", () => has("\uC774\uBBF8\uC9C0 \uAE30\uBC18")],
    [
      "NO_SECTIONS",
      () => has("\uC139\uC158") && (has("\uCC3E\uC744 \uC218 \uC5C6") || has("\uC5C6\uC74C"))
    ],
    ["CORRUPTED", () => has("\uC2DC\uADF8\uB2C8\uCC98") || has("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")]
  ];
  const hit = rules.find(([, matches]) => matches());
  return hit ? hit[0] : "PARSE_ERROR";
}
51
103
 
52
104
  // src/table/builder.ts
53
105
  var MAX_COLS = 200;
@@ -110,6 +162,7 @@ function buildTableDirect(rows, numRows) {
110
162
  if (end > maxCols) maxCols = end;
111
163
  }
112
164
  }
165
+ if (maxCols > MAX_COLS) maxCols = MAX_COLS;
113
166
  if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
114
167
  const grid = Array.from(
115
168
  { length: numRows },
@@ -119,7 +172,7 @@ function buildTableDirect(rows, numRows) {
119
172
  for (const cell of row) {
120
173
  const r = cell.rowAddr ?? 0;
121
174
  const c = cell.colAddr ?? 0;
122
- if (r >= numRows || c >= maxCols) continue;
175
+ if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
123
176
  grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
124
177
  for (let dr = 0; dr < cell.rowSpan; dr++) {
125
178
  for (let dc = 0; dc < cell.colSpan; dc++) {
@@ -304,9 +357,6 @@ function tableToMarkdown(table) {
304
357
  if (dr === 0 && dc === 0) continue;
305
358
  if (r + dr < numRows && c + dc < numCols) {
306
359
  skip.add(`${r + dr},${c + dc}`);
307
- if (dr === 0) {
308
- display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
309
- }
310
360
  }
311
361
  }
312
362
  }
@@ -344,7 +394,7 @@ function tableToMarkdown(table) {
344
394
  }
345
395
 
346
396
  // src/hwpx/parser.ts
347
- import JSZip2 from "jszip";
397
+ import JSZip from "jszip";
348
398
  import { inflateRawSync } from "zlib";
349
399
  import { DOMParser } from "@xmldom/xmldom";
350
400
 
@@ -446,14 +496,11 @@ function parseStyleElements(doc, map) {
446
496
  }
447
497
  }
448
498
  }
449
- function stripDtd(xml) {
450
- return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
451
- }
452
499
  async function parseHwpxDocument(buffer, options) {
453
500
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
454
501
  let zip;
455
502
  try {
456
- zip = await JSZip2.loadAsync(buffer);
503
+ zip = await JSZip.loadAsync(buffer);
457
504
  } catch {
458
505
  return extractFromBrokenZip(buffer);
459
506
  }
@@ -616,7 +663,7 @@ function parseDublinCoreMetadata(xml, metadata) {
616
663
  async function extractHwpxMetadataOnly(buffer) {
617
664
  let zip;
618
665
  try {
619
- zip = await JSZip2.loadAsync(buffer);
666
+ zip = await JSZip.loadAsync(buffer);
620
667
  } catch {
621
668
  throw new KordocError("HWPX ZIP\uC744 \uC5F4 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
622
669
  }
@@ -811,7 +858,8 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
811
858
  if (newTable.rows.length > 0) {
812
859
  if (tableStack.length > 0) {
813
860
  const parentTable = tableStack.pop();
814
- const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
861
+ let nestedCols = 0;
862
+ for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
815
863
  if (newTable.rows.length >= 3 && nestedCols >= 2) {
816
864
  blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
817
865
  } else {
@@ -920,7 +968,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
920
968
  if (newTable.rows.length > 0) {
921
969
  if (tableStack.length > 0) {
922
970
  const parentTable = tableStack.pop();
923
- const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
971
+ let nestedCols = 0;
972
+ for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
924
973
  if (newTable.rows.length >= 3 && nestedCols >= 2) {
925
974
  blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
926
975
  } else {
@@ -2018,6 +2067,7 @@ function parseLenientCfb(data) {
2018
2067
  if (miniSectorSizeShift > 16) throw new Error("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uBBF8\uB2C8 \uC139\uD130 \uD06C\uAE30 \uC2DC\uD504\uD2B8: " + miniSectorSizeShift);
2019
2068
  const miniSectorSize = 1 << miniSectorSizeShift;
2020
2069
  const fatSectorCount = data.readUInt32LE(44);
2070
+ if (fatSectorCount > 1e4) throw new Error("FAT \uC139\uD130 \uC218\uAC00 \uB108\uBB34 \uB9CE\uC2B5\uB2C8\uB2E4: " + fatSectorCount);
2021
2071
  const firstDirSector = data.readUInt32LE(48);
2022
2072
  const miniStreamCutoff = data.readUInt32LE(56);
2023
2073
  const firstMiniFatSector = data.readUInt32LE(60);
@@ -2406,10 +2456,14 @@ function findSections(cfb) {
2406
2456
  }
2407
2457
  function findSectionsLenient(lcfb, compressed) {
2408
2458
  const sections = [];
2459
+ let totalDecompressed = 0;
2409
2460
  for (let i = 0; i < MAX_SECTIONS; i++) {
2410
2461
  const raw = lcfb.findStream(`/BodyText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
2411
2462
  if (!raw) break;
2412
- sections.push({ idx: i, content: compressed ? decompressStream(raw) : raw });
2463
+ const content = compressed ? decompressStream(raw) : raw;
2464
+ totalDecompressed += content.length;
2465
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2466
+ sections.push({ idx: i, content });
2413
2467
  }
2414
2468
  if (sections.length === 0) {
2415
2469
  for (const e of lcfb.entries()) {
@@ -2417,7 +2471,12 @@ function findSectionsLenient(lcfb, compressed) {
2417
2471
  if (e.name.startsWith("Section")) {
2418
2472
  const idx = parseInt(e.name.replace("Section", ""), 10) || 0;
2419
2473
  const raw = lcfb.findStream(e.name);
2420
- if (raw) sections.push({ idx, content: compressed ? decompressStream(raw) : raw });
2474
+ if (raw) {
2475
+ const content = compressed ? decompressStream(raw) : raw;
2476
+ totalDecompressed += content.length;
2477
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2478
+ sections.push({ idx, content });
2479
+ }
2421
2480
  }
2422
2481
  }
2423
2482
  }
@@ -2425,11 +2484,15 @@ function findSectionsLenient(lcfb, compressed) {
2425
2484
  }
2426
2485
  function findViewTextSectionsLenient(lcfb, compressed) {
2427
2486
  const sections = [];
2487
+ let totalDecompressed = 0;
2428
2488
  for (let i = 0; i < MAX_SECTIONS; i++) {
2429
2489
  const raw = lcfb.findStream(`/ViewText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
2430
2490
  if (!raw) break;
2431
2491
  try {
2432
- sections.push({ idx: i, content: decryptViewText(raw, compressed) });
2492
+ const content = decryptViewText(raw, compressed);
2493
+ totalDecompressed += content.length;
2494
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2495
+ sections.push({ idx: i, content });
2433
2496
  } catch {
2434
2497
  break;
2435
2498
  }
@@ -2828,37 +2891,18 @@ function arrangeCells(rows, cols, cells) {
2828
2891
  // src/pdf/line-detector.ts
2829
2892
  import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
2830
2893
  var ORIENTATION_TOL = 2;
2831
- var MIN_LINE_LENGTH = 10;
2832
- var COORD_MERGE_TOL = 3;
2894
+ var MIN_LINE_LENGTH = 15;
2895
+ var MAX_LINE_WIDTH = 5;
2833
2896
  var CONNECT_TOL = 5;
2834
2897
  var CELL_PADDING = 2;
2835
- var MAX_LINE_WIDTH = 5;
2836
- var IDENTITY = [1, 0, 0, 1, 0, 0];
2837
- function matMultiply(m1, m2) {
2838
- return [
2839
- m1[0] * m2[0] + m1[2] * m2[1],
2840
- m1[1] * m2[0] + m1[3] * m2[1],
2841
- m1[0] * m2[2] + m1[2] * m2[3],
2842
- m1[1] * m2[2] + m1[3] * m2[3],
2843
- m1[0] * m2[4] + m1[2] * m2[5] + m1[4],
2844
- m1[1] * m2[4] + m1[3] * m2[5] + m1[5]
2845
- ];
2846
- }
2847
- function matTransformPoint(m, x, y) {
2848
- return [m[0] * x + m[2] * y + m[4], m[1] * x + m[3] * y + m[5]];
2849
- }
2850
- function matScale(m) {
2851
- return Math.max(
2852
- Math.sqrt(m[1] * m[1] + m[3] * m[3]),
2853
- Math.sqrt(m[0] * m[0] + m[2] * m[2])
2854
- );
2855
- }
2898
+ var MIN_COL_WIDTH = 15;
2899
+ var MIN_ROW_HEIGHT = 6;
2900
+ var VERTEX_MERGE_FACTOR = 4;
2901
+ var MIN_COORD_MERGE_TOL = 8;
2856
2902
  function extractLines(fnArray, argsArray) {
2857
2903
  const horizontals = [];
2858
2904
  const verticals = [];
2859
- let ctm = [...IDENTITY];
2860
2905
  let lineWidth = 1;
2861
- const stateStack = [];
2862
2906
  let currentPath = [];
2863
2907
  let pathStartX = 0, pathStartY = 0;
2864
2908
  let curX = 0, curY = 0;
@@ -2876,53 +2920,13 @@ function extractLines(fnArray, argsArray) {
2876
2920
  );
2877
2921
  }
2878
2922
  }
2879
- function tryConvertLinesToRectangle(path) {
2880
- if (path.length < 3 || path.length > 5) return false;
2881
- const first = path[0], last = path[path.length - 1];
2882
- const closed = Math.abs(first.x1 - last.x2) < 1 && Math.abs(first.y1 - last.y2) < 1;
2883
- if (!closed) return false;
2884
- let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
2885
- for (const seg of path) {
2886
- minX = Math.min(minX, seg.x1, seg.x2);
2887
- minY = Math.min(minY, seg.y1, seg.y2);
2888
- maxX = Math.max(maxX, seg.x1, seg.x2);
2889
- maxY = Math.max(maxY, seg.y1, seg.y2);
2890
- }
2891
- const w = maxX - minX, h = maxY - minY;
2892
- if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;
2893
- path.length = 0;
2894
- if (h < ORIENTATION_TOL * 2 || w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH) {
2895
- path.push({ x1: minX, y1: (minY + maxY) / 2, x2: maxX, y2: (minY + maxY) / 2 });
2896
- } else if (w < ORIENTATION_TOL * 2 || h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH) {
2897
- path.push({ x1: (minX + maxX) / 2, y1: minY, x2: (minX + maxX) / 2, y2: maxY });
2898
- } else {
2899
- pushRectangle(path, minX, minY, w, h);
2900
- }
2901
- return true;
2902
- }
2903
- function flushPath(isStroke, isFill) {
2904
- if (!isStroke && !isFill) {
2905
- currentPath = [];
2906
- return;
2907
- }
2908
- if (isFill && !isStroke && currentPath.length >= 3) {
2909
- tryConvertLinesToRectangle(currentPath);
2910
- }
2911
- const scale = matScale(ctm);
2912
- const effectiveLW = lineWidth * scale;
2913
- if (effectiveLW > MAX_LINE_WIDTH && isStroke && !isFill) {
2923
+ function flushPath(isStroke) {
2924
+ if (!isStroke) {
2914
2925
  currentPath = [];
2915
2926
  return;
2916
2927
  }
2917
2928
  for (const seg of currentPath) {
2918
- const [px1, py1] = matTransformPoint(ctm, seg.x1, seg.y1);
2919
- const [px2, py2] = matTransformPoint(ctm, seg.x2, seg.y2);
2920
- classifyAndAdd(
2921
- { x1: px1, y1: py1, x2: px2, y2: py2 },
2922
- effectiveLW,
2923
- horizontals,
2924
- verticals
2925
- );
2929
+ classifyAndAdd(seg, lineWidth, horizontals, verticals);
2926
2930
  }
2927
2931
  currentPath = [];
2928
2932
  }
@@ -2930,28 +2934,9 @@ function extractLines(fnArray, argsArray) {
2930
2934
  const op = fnArray[i];
2931
2935
  const args = argsArray[i];
2932
2936
  switch (op) {
2933
- // ── Graphics State ──
2934
- case OPS.save:
2935
- stateStack.push({ ctm: [...ctm], lineWidth });
2936
- break;
2937
- case OPS.restore:
2938
- if (stateStack.length > 0) {
2939
- const state = stateStack.pop();
2940
- ctm = state.ctm;
2941
- lineWidth = state.lineWidth;
2942
- }
2943
- break;
2944
- case OPS.transform: {
2945
- const m = args;
2946
- if (m.length >= 6) {
2947
- ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
2948
- }
2949
- break;
2950
- }
2951
2937
  case OPS.setLineWidth:
2952
2938
  lineWidth = args[0] || 1;
2953
2939
  break;
2954
- // ── Path Construction ──
2955
2940
  case OPS.constructPath: {
2956
2941
  const arg0 = args[0];
2957
2942
  if (Array.isArray(arg0)) {
@@ -3019,60 +3004,34 @@ function extractLines(fnArray, argsArray) {
3019
3004
  }
3020
3005
  }
3021
3006
  }
3022
- const isStroke5 = afterOp === OPS.stroke || afterOp === OPS.closeStroke;
3023
- const isFill5 = afterOp === OPS.fill || afterOp === OPS.eoFill;
3024
- const isBoth5 = afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke;
3025
- if (isStroke5 || isFill5 || isBoth5) {
3026
- flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
3007
+ if (afterOp === OPS.stroke || afterOp === OPS.closeStroke) {
3008
+ flushPath(true);
3009
+ } else if (afterOp === OPS.fill || afterOp === OPS.eoFill || afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke) {
3010
+ flushPath(true);
3027
3011
  } else if (afterOp === OPS.endPath) {
3028
- flushPath(false, false);
3012
+ flushPath(false);
3029
3013
  }
3030
3014
  }
3031
3015
  break;
3032
3016
  }
3033
- // ── Paint Operations ──
3034
3017
  case OPS.stroke:
3035
3018
  case OPS.closeStroke:
3036
- flushPath(true, false);
3019
+ flushPath(true);
3037
3020
  break;
3038
3021
  case OPS.fill:
3039
3022
  case OPS.eoFill:
3040
- flushPath(false, true);
3041
- break;
3042
3023
  case OPS.fillStroke:
3043
3024
  case OPS.eoFillStroke:
3044
3025
  case OPS.closeFillStroke:
3045
3026
  case OPS.closeEOFillStroke:
3046
- flushPath(true, true);
3027
+ flushPath(true);
3047
3028
  break;
3048
3029
  case OPS.endPath:
3049
- flushPath(false, false);
3030
+ flushPath(false);
3050
3031
  break;
3051
3032
  }
3052
3033
  }
3053
- return {
3054
- horizontals: deduplicateLines(horizontals),
3055
- verticals: deduplicateLines(verticals)
3056
- };
3057
- }
3058
- function deduplicateLines(lines) {
3059
- if (lines.length <= 1) return lines;
3060
- const result = [];
3061
- const tol = COORD_MERGE_TOL;
3062
- for (const line of lines) {
3063
- let isDuplicate = false;
3064
- for (const existing of result) {
3065
- if (Math.abs(line.y1 - existing.y1) <= tol && Math.abs(line.y2 - existing.y2) <= tol && Math.abs(line.x1 - existing.x1) <= tol && Math.abs(line.x2 - existing.x2) <= tol) {
3066
- if (line.lineWidth > existing.lineWidth) {
3067
- existing.lineWidth = line.lineWidth;
3068
- }
3069
- isDuplicate = true;
3070
- break;
3071
- }
3072
- }
3073
- if (!isDuplicate) result.push(line);
3074
- }
3075
- return result;
3034
+ return { horizontals, verticals };
3076
3035
  }
3077
3036
  function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
3078
3037
  const dx = Math.abs(seg.x2 - seg.x1);
@@ -3091,6 +3050,55 @@ function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
3091
3050
  verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
3092
3051
  }
3093
3052
  }
3053
/**
 * Drops lines whose `lineWidth` exceeds `MAX_LINE_WIDTH`, then merges
 * near-parallel lines within each orientation via `mergeParallelLines`.
 *
 * @param {Array<{lineWidth: number}>} horizontals - Horizontal line segments.
 * @param {Array<{lineWidth: number}>} verticals - Vertical line segments.
 * @returns {{horizontals: Array, verticals: Array}} The filtered, merged lines.
 */
function preprocessLines(horizontals, verticals) {
  const isThinEnough = (line) => line.lineWidth <= MAX_LINE_WIDTH;
  // Both filters run before either merge, matching the original statement order.
  const thinHorizontals = horizontals.filter(isThinEnough);
  const thinVerticals = verticals.filter(isThinEnough);
  return {
    horizontals: mergeParallelLines(thinHorizontals, "h"),
    verticals: mergeParallelLines(thinVerticals, "v")
  };
}
3060
/**
 * Collapses near-duplicate parallel lines (e.g. the two edges of a double
 * border) into single lines.
 *
 * Lines are sorted by their fixed coordinate (y for "h", x for "v"), then by
 * start. A line is folded into the previously kept one when their fixed
 * coordinates differ by at most 3 units and their extents overlap by more than
 * 30% of the shorter line. The merged line spans both extents, sits midway
 * between the two fixed coordinates, and takes the larger `lineWidth`.
 *
 * Merging is done on copies: the objects in `lines` are never modified.
 * (Previously the kept line was an input object mutated in place, so the
 * caller's data changed as a side effect.)
 *
 * @param {Array<{x1: number, y1: number, x2: number, y2: number, lineWidth: number}>} lines
 *   Lines of one orientation.
 * @param {"h"|"v"} dir - "h" for horizontal lines; any other value is treated as vertical.
 * @returns {Array<{x1: number, y1: number, x2: number, y2: number, lineWidth: number}>}
 *   The merged lines; the input array itself when it has at most one element.
 */
function mergeParallelLines(lines, dir) {
  if (lines.length <= 1) return lines;
  const isHorizontal = dir === "h";
  const sorted = [...lines].sort((a, b) => {
    const posA = isHorizontal ? a.y1 : a.x1;
    const posB = isHorizontal ? b.y1 : b.x1;
    if (Math.abs(posA - posB) > 0.1) return posA - posB;
    return isHorizontal ? a.x1 - b.x1 : a.y1 - b.y1;
  });
  const MERGE_TOL = 3;
  // Copy before accumulating so merges never write into caller-owned objects.
  const result = [{ ...sorted[0] }];
  for (let i = 1; i < sorted.length; i++) {
    const prev = result[result.length - 1];
    const curr = sorted[i];
    const prevPos = isHorizontal ? prev.y1 : prev.x1;
    const currPos = isHorizontal ? curr.y1 : curr.x1;
    if (Math.abs(prevPos - currPos) <= MERGE_TOL) {
      const prevStart = isHorizontal ? prev.x1 : prev.y1;
      const prevEnd = isHorizontal ? prev.x2 : prev.y2;
      const currStart = isHorizontal ? curr.x1 : curr.y1;
      const currEnd = isHorizontal ? curr.x2 : curr.y2;
      const overlap = Math.min(prevEnd, currEnd) - Math.max(prevStart, currStart);
      const minLen = Math.min(prevEnd - prevStart, currEnd - currStart);
      if (overlap > minLen * 0.3) {
        if (isHorizontal) {
          prev.x1 = Math.min(prev.x1, curr.x1);
          prev.x2 = Math.max(prev.x2, curr.x2);
          prev.y1 = (prev.y1 + curr.y1) / 2;
          prev.y2 = prev.y1;
        } else {
          prev.y1 = Math.min(prev.y1, curr.y1);
          prev.y2 = Math.max(prev.y2, curr.y2);
          prev.x1 = (prev.x1 + curr.x1) / 2;
          prev.x2 = prev.x1;
        }
        prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
        continue;
      }
    }
    result.push({ ...curr });
  }
  return result;
}
3094
3102
  function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
3095
3103
  const margin = 5;
3096
3104
  return {
@@ -3102,8 +3110,49 @@ function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
3102
3110
  )
3103
3111
  };
3104
3112
  }
3113
/**
 * Collects the crossing points of horizontal and vertical lines.
 *
 * A pair counts as crossing when the vertical's x lies within the horizontal's
 * x-extent and the horizontal's y lies within the vertical's y-extent, each
 * extent widened by `CONNECT_TOL`. Each vertex records a `radius` equal to the
 * larger of the two line widths, with a floor of 1.
 *
 * @param {Array<{x1: number, x2: number, y1: number, lineWidth: number}>} horizontals
 * @param {Array<{x1: number, y1: number, y2: number, lineWidth: number}>} verticals
 * @returns {Array<{x: number, y: number, radius: number}>} One vertex per crossing pair.
 */
function buildVertices(horizontals, verticals) {
  const reach = CONNECT_TOL;
  const found = [];
  for (const hLine of horizontals) {
    const minX = hLine.x1 - reach;
    const maxX = hLine.x2 + reach;
    for (const vLine of verticals) {
      const withinHorizontalSpan = vLine.x1 >= minX && vLine.x1 <= maxX;
      if (!withinHorizontalSpan) continue;
      const withinVerticalSpan = hLine.y1 >= vLine.y1 - reach && hLine.y1 <= vLine.y2 + reach;
      if (!withinVerticalSpan) continue;
      found.push({
        x: vLine.x1,
        y: hLine.y1,
        radius: Math.max(hLine.lineWidth, vLine.lineWidth, 1)
      });
    }
  }
  return found;
}
3126
/**
 * Greedily clusters vertices that lie close together and replaces each
 * cluster with its centroid.
 *
 * Each not-yet-consumed vertex seeds a cluster. A later vertex joins when both
 * its x and y distance to the seed (not to the running centroid) are within
 * `VERTEX_MERGE_FACTOR` times the larger of the cluster's current maximum
 * radius and the candidate's radius. The cluster keeps the largest radius seen.
 *
 * @param {Array<{x: number, y: number, radius: number}>} vertices
 * @returns {Array<{x: number, y: number, radius: number}>} Merged vertices; the
 *   input array itself when it has at most one element.
 */
function mergeVertices(vertices) {
  if (vertices.length <= 1) return vertices;
  const consumed = new Set();
  const clusters = [];
  for (const [seedIndex, seed] of vertices.entries()) {
    if (consumed.has(seedIndex)) continue;
    let xTotal = seed.x;
    let yTotal = seed.y;
    let widest = seed.radius;
    let members = 1;
    for (let k = seedIndex + 1; k < vertices.length; k++) {
      if (consumed.has(k)) continue;
      const other = vertices[k];
      const reach = VERTEX_MERGE_FACTOR * Math.max(widest, other.radius);
      const isNear = Math.abs(seed.x - other.x) <= reach && Math.abs(seed.y - other.y) <= reach;
      if (!isNear) continue;
      xTotal += other.x;
      yTotal += other.y;
      widest = Math.max(widest, other.radius);
      members += 1;
      consumed.add(k);
    }
    clusters.push({ x: xTotal / members, y: yTotal / members, radius: widest });
  }
  return clusters;
}
3105
3150
  function buildTableGrids(horizontals, verticals) {
3106
3151
  if (horizontals.length < 2 || verticals.length < 2) return [];
3152
+ const allVertices = buildVertices(horizontals, verticals);
3153
+ const vertices = mergeVertices(allVertices);
3154
+ if (vertices.length < 4) return [];
3155
+ const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
3107
3156
  const allLines = [
3108
3157
  ...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
3109
3158
  ...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
@@ -3114,21 +3163,74 @@ function buildTableGrids(horizontals, verticals) {
3114
3163
  const hLines = group.filter((l) => l.type === "h");
3115
3164
  const vLines = group.filter((l) => l.type === "v");
3116
3165
  if (hLines.length < 2 || vLines.length < 2) continue;
3117
- const rawYs = hLines.map((l) => l.y1);
3118
- const rowYs = clusterCoordinates(rawYs).sort((a, b) => b - a);
3119
- const rawXs = vLines.map((l) => l.x1);
3120
- const colXs = clusterCoordinates(rawXs).sort((a, b) => a - b);
3166
+ let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
3167
+ for (const l of vLines) {
3168
+ if (l.x1 < gx1) gx1 = l.x1;
3169
+ if (l.x1 > gx2) gx2 = l.x1;
3170
+ }
3171
+ for (const l of hLines) {
3172
+ if (l.y1 < gy1) gy1 = l.y1;
3173
+ if (l.y1 > gy2) gy2 = l.y1;
3174
+ }
3175
+ const groupBbox = {
3176
+ x1: gx1 - CONNECT_TOL,
3177
+ y1: gy1 - CONNECT_TOL,
3178
+ x2: gx2 + CONNECT_TOL,
3179
+ y2: gy2 + CONNECT_TOL
3180
+ };
3181
+ const groupVertices = vertices.filter(
3182
+ (v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
3183
+ );
3184
+ const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
3185
+ const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
3186
+ const rawYs = [
3187
+ ...hLines.map((l) => l.y1),
3188
+ ...groupVertices.map((v) => v.y)
3189
+ ];
3190
+ const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
3191
+ const rawXs = [
3192
+ ...vLines.map((l) => l.x1),
3193
+ ...groupVertices.map((v) => v.x)
3194
+ ];
3195
+ const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
3121
3196
  if (rowYs.length < 2 || colXs.length < 2) continue;
3197
+ const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
3198
+ const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
3199
+ if (validRowYs.length < 2 || validColXs.length < 2) continue;
3122
3200
  const bbox = {
3123
- x1: colXs[0],
3124
- y1: rowYs[rowYs.length - 1],
3125
- x2: colXs[colXs.length - 1],
3126
- y2: rowYs[0]
3201
+ x1: validColXs[0],
3202
+ y1: validRowYs[validRowYs.length - 1],
3203
+ x2: validColXs[validColXs.length - 1],
3204
+ y2: validRowYs[0]
3127
3205
  };
3128
- grids.push({ rowYs, colXs, bbox });
3206
+ grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
3129
3207
  }
3130
3208
  return mergeAdjacentGrids(grids);
3131
3209
  }
3210
/**
 * Drops interior column boundaries that would create a column narrower than
 * `minWidth`, measured from the previously kept boundary.
 *
 * The first and last boundaries are always kept, so the final column may still
 * be narrower than `minWidth`. Inputs with at most two boundaries are returned
 * unchanged (same array).
 *
 * @param {number[]} colXs - Column boundaries (callers pass them sorted ascending).
 * @param {number} minWidth - Minimum spacing for interior boundaries.
 * @returns {number[]} The retained boundaries.
 */
function enforceMinWidth(colXs, minWidth) {
  if (colXs.length <= 2) return colXs;
  const lastIndex = colXs.length - 1;
  const kept = [colXs[0]];
  for (let index = 1; index <= lastIndex; index += 1) {
    const boundary = colXs[index];
    const tooNarrow = boundary - kept[kept.length - 1] < minWidth;
    const isInterior = index < lastIndex;
    if (!(tooNarrow && isInterior)) {
      kept.push(boundary);
    }
  }
  return kept;
}
3222
/**
 * Drops interior row boundaries that would create a row shorter than
 * `minHeight`, measured downward from the previously kept boundary
 * (`previous - current`).
 *
 * The first and last boundaries are always kept, so the final row may still be
 * shorter than `minHeight`. Inputs with at most two boundaries are returned
 * unchanged (same array).
 *
 * @param {number[]} rowYs - Row boundaries (callers pass them sorted descending).
 * @param {number} minHeight - Minimum spacing for interior boundaries.
 * @returns {number[]} The retained boundaries.
 */
function enforceMinHeight(rowYs, minHeight) {
  if (rowYs.length <= 2) return rowYs;
  const lastIndex = rowYs.length - 1;
  const kept = [rowYs[0]];
  for (let index = 1; index <= lastIndex; index += 1) {
    const boundary = rowYs[index];
    const tooShort = kept[kept.length - 1] - boundary < minHeight;
    const isInterior = index < lastIndex;
    if (!(tooShort && isInterior)) {
      kept.push(boundary);
    }
  }
  return kept;
}
3132
3234
  function mergeAdjacentGrids(grids) {
3133
3235
  if (grids.length <= 1) return grids;
3134
3236
  const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
@@ -3137,9 +3239,10 @@ function mergeAdjacentGrids(grids) {
3137
3239
  const prev = merged[merged.length - 1];
3138
3240
  const curr = sorted[i];
3139
3241
  if (prev.colXs.length === curr.colXs.length) {
3140
- const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= COORD_MERGE_TOL * 3);
3242
+ const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
3243
+ const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
3141
3244
  const verticalGap = prev.bbox.y1 - curr.bbox.y2;
3142
- if (colMatch && verticalGap >= -COORD_MERGE_TOL && verticalGap <= 20) {
3245
+ if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
3143
3246
  const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
3144
3247
  merged[merged.length - 1] = {
3145
3248
  rowYs: allRowYs,
@@ -3149,7 +3252,8 @@ function mergeAdjacentGrids(grids) {
3149
3252
  y1: Math.min(prev.bbox.y1, curr.bbox.y1),
3150
3253
  x2: Math.max(prev.bbox.x2, curr.bbox.x2),
3151
3254
  y2: Math.max(prev.bbox.y2, curr.bbox.y2)
3152
- }
3255
+ },
3256
+ vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
3153
3257
  };
3154
3258
  continue;
3155
3259
  }
@@ -3158,14 +3262,14 @@ function mergeAdjacentGrids(grids) {
3158
3262
  }
3159
3263
  return merged;
3160
3264
  }
3161
- function clusterCoordinates(values) {
3265
+ function clusterCoordinates(values, tolerance) {
3162
3266
  if (values.length === 0) return [];
3163
3267
  const sorted = [...values].sort((a, b) => a - b);
3164
3268
  const clusters = [{ sum: sorted[0], count: 1 }];
3165
3269
  for (let i = 1; i < sorted.length; i++) {
3166
3270
  const last = clusters[clusters.length - 1];
3167
3271
  const avg = last.sum / last.count;
3168
- if (Math.abs(sorted[i] - avg) <= COORD_MERGE_TOL) {
3272
+ if (Math.abs(sorted[i] - avg) <= tolerance) {
3169
3273
  last.sum += sorted[i];
3170
3274
  last.count++;
3171
3275
  } else {
@@ -3222,6 +3326,20 @@ function extractCells(grid, horizontals, verticals) {
3222
3326
  const numRows = rowYs.length - 1;
3223
3327
  const numCols = colXs.length - 1;
3224
3328
  if (numRows <= 0 || numCols <= 0) return [];
3329
+ const vBorders = Array.from(
3330
+ { length: numRows },
3331
+ (_, r) => Array.from(
3332
+ { length: numCols + 1 },
3333
+ (_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
3334
+ )
3335
+ );
3336
+ const hBorders = Array.from(
3337
+ { length: numRows + 1 },
3338
+ (_, r) => Array.from(
3339
+ { length: numCols },
3340
+ (_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
3341
+ )
3342
+ );
3225
3343
  const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
3226
3344
  const cells = [];
3227
3345
  for (let r = 0; r < numRows; r++) {
@@ -3229,18 +3347,26 @@ function extractCells(grid, horizontals, verticals) {
3229
3347
  if (occupied[r][c]) continue;
3230
3348
  let colSpan = 1;
3231
3349
  let rowSpan = 1;
3232
- while (c + colSpan < numCols) {
3233
- const borderX = colXs[c + colSpan];
3234
- const topY = rowYs[r];
3235
- const botY = rowYs[r + 1];
3236
- if (hasVerticalLine(verticals, borderX, topY, botY)) break;
3350
+ while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
3351
+ let canExpand = true;
3352
+ for (let dr = 0; dr < rowSpan; dr++) {
3353
+ if (vBorders[r + dr][c + colSpan]) {
3354
+ canExpand = false;
3355
+ break;
3356
+ }
3357
+ }
3358
+ if (!canExpand) break;
3237
3359
  colSpan++;
3238
3360
  }
3239
3361
  while (r + rowSpan < numRows) {
3240
- const borderY = rowYs[r + rowSpan];
3241
- const leftX = colXs[c];
3242
- const rightX = colXs[c + colSpan];
3243
- if (hasHorizontalLine(horizontals, borderY, leftX, rightX)) break;
3362
+ let hasLine = false;
3363
+ for (let dc = 0; dc < colSpan; dc++) {
3364
+ if (hBorders[r + rowSpan][c + dc]) {
3365
+ hasLine = true;
3366
+ break;
3367
+ }
3368
+ }
3369
+ if (hasLine) break;
3244
3370
  rowSpan++;
3245
3371
  }
3246
3372
  for (let dr = 0; dr < rowSpan; dr++) {
@@ -3264,28 +3390,30 @@ function extractCells(grid, horizontals, verticals) {
3264
3390
  }
3265
3391
  return cells;
3266
3392
  }
3267
- function hasVerticalLine(verticals, x, topY, botY) {
3268
- const tol = COORD_MERGE_TOL + 1;
3393
+ function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
3394
+ const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
3269
3395
  for (const v of verticals) {
3270
3396
  if (Math.abs(v.x1 - x) <= tol) {
3271
3397
  const cellH = Math.abs(topY - botY);
3398
+ if (cellH < 0.1) continue;
3272
3399
  const overlapTop = Math.min(v.y2, topY);
3273
3400
  const overlapBot = Math.max(v.y1, botY);
3274
3401
  const overlap = overlapTop - overlapBot;
3275
- if (overlap >= cellH * 0.5) return true;
3402
+ if (overlap >= cellH * 0.75) return true;
3276
3403
  }
3277
3404
  }
3278
3405
  return false;
3279
3406
  }
3280
- function hasHorizontalLine(horizontals, y, leftX, rightX) {
3281
- const tol = COORD_MERGE_TOL + 1;
3407
+ function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
3408
+ const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
3282
3409
  for (const h of horizontals) {
3283
3410
  if (Math.abs(h.y1 - y) <= tol) {
3284
3411
  const cellW = Math.abs(rightX - leftX);
3412
+ if (cellW < 0.1) continue;
3285
3413
  const overlapLeft = Math.max(h.x1, leftX);
3286
3414
  const overlapRight = Math.min(h.x2, rightX);
3287
3415
  const overlap = overlapRight - overlapLeft;
3288
- if (overlap >= cellW * 0.5) return true;
3416
+ if (overlap >= cellW * 0.75) return true;
3289
3417
  }
3290
3418
  }
3291
3419
  return false;
@@ -3296,23 +3424,24 @@ function mapTextToCells(items, cells) {
3296
3424
  result.set(cell, []);
3297
3425
  }
3298
3426
  for (const item of items) {
3299
- const cx = item.x + item.w / 2;
3300
- const cy = item.y;
3301
3427
  const pad = CELL_PADDING;
3302
3428
  let bestCell = null;
3303
- let bestDist = Infinity;
3429
+ let bestScore = 0;
3304
3430
  for (const cell of cells) {
3305
- if (cx >= cell.bbox.x1 - pad && cx <= cell.bbox.x2 + pad && cy >= cell.bbox.y1 - pad && cy <= cell.bbox.y2 + pad) {
3306
- const cellCx = (cell.bbox.x1 + cell.bbox.x2) / 2;
3307
- const cellCy = (cell.bbox.y1 + cell.bbox.y2) / 2;
3308
- const dist = Math.abs(cx - cellCx) + Math.abs(cy - cellCy);
3309
- if (dist < bestDist) {
3310
- bestDist = dist;
3311
- bestCell = cell;
3312
- }
3431
+ const ix1 = Math.max(item.x, cell.bbox.x1 - pad);
3432
+ const ix2 = Math.min(item.x + item.w, cell.bbox.x2 + pad);
3433
+ const iy1 = Math.max(item.y, cell.bbox.y1 - pad);
3434
+ const iy2 = Math.min(item.y + (item.h || item.fontSize), cell.bbox.y2 + pad);
3435
+ if (ix1 >= ix2 || iy1 >= iy2) continue;
3436
+ const intersectArea = (ix2 - ix1) * (iy2 - iy1);
3437
+ const itemArea = Math.max(item.w, 1) * Math.max(item.h || item.fontSize, 1);
3438
+ const score = intersectArea / itemArea;
3439
+ if (score > bestScore) {
3440
+ bestScore = score;
3441
+ bestCell = cell;
3313
3442
  }
3314
3443
  }
3315
- if (bestCell) {
3444
+ if (bestCell && bestScore > 0.3) {
3316
3445
  result.get(bestCell).push(item);
3317
3446
  }
3318
3447
  }
@@ -3339,8 +3468,13 @@ function cellTextToString(items) {
3339
3468
  const textLines = lines.map((line) => {
3340
3469
  const s = line.sort((a, b) => a.x - b.x);
3341
3470
  if (s.length === 1) return s[0].text;
3471
+ const evenSpaced = detectEvenSpacedItems(s);
3342
3472
  let result = s[0].text;
3343
3473
  for (let j = 1; j < s.length; j++) {
3474
+ if (evenSpaced[j]) {
3475
+ result += s[j].text;
3476
+ continue;
3477
+ }
3344
3478
  const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
3345
3479
  const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
3346
3480
  const prevIsKorean = /[가-힣]$/.test(result);
@@ -3355,6 +3489,57 @@ function cellTextToString(items) {
3355
3489
  }
3356
3490
  return result;
3357
3491
  });
3492
+ return mergeCellTextLines(textLines);
3493
+ }
3494
+ function detectEvenSpacedItems(items) {
3495
+ const result = new Array(items.length).fill(false);
3496
+ if (items.length < 3) return result;
3497
+ let runStart = -1;
3498
+ for (let i = 0; i < items.length; i++) {
3499
+ const isShortKorean = /^[가-힣]{1}$/.test(items[i].text) || /^[\d]{1}$/.test(items[i].text);
3500
+ if (isShortKorean && runStart >= 0 && i > 0) {
3501
+ const gap = items[i].x - (items[i - 1].x + items[i - 1].w);
3502
+ const maxRunGap = Math.max(items[i].fontSize * 3, 30);
3503
+ if (gap > maxRunGap) {
3504
+ if (i - runStart >= 3) markEvenRun(items, result, runStart, i);
3505
+ runStart = i;
3506
+ continue;
3507
+ }
3508
+ }
3509
+ if (isShortKorean) {
3510
+ if (runStart < 0) runStart = i;
3511
+ } else {
3512
+ if (runStart >= 0 && i - runStart >= 3) {
3513
+ markEvenRun(items, result, runStart, i);
3514
+ }
3515
+ runStart = -1;
3516
+ }
3517
+ }
3518
+ if (runStart >= 0 && items.length - runStart >= 3) {
3519
+ markEvenRun(items, result, runStart, items.length);
3520
+ }
3521
+ return result;
3522
+ }
3523
+ function markEvenRun(items, result, start, end) {
3524
+ const gaps = [];
3525
+ for (let i = start + 1; i < end; i++) {
3526
+ gaps.push(items[i].x - (items[i - 1].x + items[i - 1].w));
3527
+ }
3528
+ const posGaps = gaps.filter((g2) => g2 > 0);
3529
+ if (posGaps.length < 2) return;
3530
+ let minGap = Infinity, maxGap = -Infinity;
3531
+ for (const g2 of posGaps) {
3532
+ if (g2 < minGap) minGap = g2;
3533
+ if (g2 > maxGap) maxGap = g2;
3534
+ }
3535
+ const avgFs = items[start].fontSize;
3536
+ if (minGap >= avgFs * 0.1 && maxGap <= avgFs * 3 && maxGap / Math.max(minGap, 0.1) <= 3) {
3537
+ for (let i = start + 1; i < end; i++) {
3538
+ result[i] = true;
3539
+ }
3540
+ }
3541
+ }
3542
+ function mergeCellTextLines(textLines) {
3358
3543
  if (textLines.length <= 1) return textLines[0] || "";
3359
3544
  const merged = [textLines[0]];
3360
3545
  for (let i = 1; i < textLines.length; i++) {
@@ -3380,24 +3565,172 @@ var Y_TOL = 3;
3380
3565
  var COL_CLUSTER_TOL = 15;
3381
3566
  var MIN_ROWS = 3;
3382
3567
  var MIN_COLS = 2;
3383
- var MIN_GAP_FACTOR = 1.5;
3384
- var MIN_COL_FILL_RATIO = 0.3;
3568
+ var MIN_GAP_FACTOR = 2;
3569
+ var MIN_GAP_ABSOLUTE = 20;
3570
+ var MIN_COL_FILL_RATIO = 0.4;
3385
3571
  function detectClusterTables(items, pageNum) {
3386
3572
  if (items.length < MIN_ROWS * MIN_COLS) return [];
3387
- const rows = groupByBaseline(items);
3573
+ const { merged, originMap } = mergeEvenSpacedClusters(items);
3574
+ const rows = groupByBaseline(merged);
3388
3575
  if (rows.length < MIN_ROWS) return [];
3389
- const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
3390
- if (suspiciousRows.length < MIN_ROWS) return [];
3391
- const columns = extractColumnClusters(suspiciousRows);
3392
- if (columns.length < MIN_COLS) return [];
3393
- const tableRegions = findTableRegions(rows, columns);
3394
3576
  const results = [];
3395
- for (const region of tableRegions) {
3396
- const table = buildClusterTable(region.rows, columns, pageNum);
3397
- if (table) results.push(table);
3577
+ const headerResult = detectHeaderRow(rows);
3578
+ if (headerResult) {
3579
+ const { columns, headerIdx } = headerResult;
3580
+ const headerRow = rows[headerIdx];
3581
+ const headerItems = [...headerRow.items].sort((a, b) => a.x - b.x);
3582
+ const headerAndBelow = rows.slice(headerIdx);
3583
+ const mergedRows = mergeMultiLineRows(headerAndBelow, columns);
3584
+ const tableRegions = findTableRegionsByHeader(mergedRows, columns, headerItems);
3585
+ for (const region of tableRegions) {
3586
+ const table = buildClusterTable(region.rows, columns, pageNum);
3587
+ if (table) {
3588
+ expandUsedItems(table.usedItems, originMap);
3589
+ results.push(table);
3590
+ }
3591
+ }
3592
+ }
3593
+ if (results.length === 0) {
3594
+ const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
3595
+ if (suspiciousRows.length >= MIN_ROWS) {
3596
+ const columns = extractColumnClusters(suspiciousRows);
3597
+ if (columns.length >= MIN_COLS) {
3598
+ const tableRegions = findTableRegions(rows, columns);
3599
+ for (const region of tableRegions) {
3600
+ const mergedRows = mergeMultiLineRows(region.rows, columns);
3601
+ const table = buildClusterTable(mergedRows, columns, pageNum);
3602
+ if (table) {
3603
+ expandUsedItems(table.usedItems, originMap);
3604
+ results.push(table);
3605
+ }
3606
+ }
3607
+ }
3608
+ }
3398
3609
  }
3399
3610
  return results;
3400
3611
  }
3612
+ function mergeEvenSpacedClusters(items) {
3613
+ const originMap = /* @__PURE__ */ new Map();
3614
+ const rows = groupByBaseline(items);
3615
+ const merged = [];
3616
+ for (const row of rows) {
3617
+ const sorted = [...row.items].sort((a, b) => a.x - b.x);
3618
+ let i = 0;
3619
+ while (i < sorted.length) {
3620
+ if (/^[가-힣\d]$/.test(sorted[i].text)) {
3621
+ let runEnd = i + 1;
3622
+ while (runEnd < sorted.length && /^[가-힣\d]$/.test(sorted[runEnd].text)) {
3623
+ const gap = sorted[runEnd].x - (sorted[runEnd - 1].x + sorted[runEnd - 1].w);
3624
+ const fs = sorted[runEnd].fontSize;
3625
+ if (gap < fs * 0.1 || gap > fs * 3) break;
3626
+ runEnd++;
3627
+ }
3628
+ if (runEnd - i >= 3) {
3629
+ const gaps = [];
3630
+ for (let g2 = i + 1; g2 < runEnd; g2++) {
3631
+ gaps.push(sorted[g2].x - (sorted[g2 - 1].x + sorted[g2 - 1].w));
3632
+ }
3633
+ let minG = Infinity, maxG = -Infinity;
3634
+ for (const g2 of gaps) {
3635
+ if (g2 < minG) minG = g2;
3636
+ if (g2 > maxG) maxG = g2;
3637
+ }
3638
+ if (minG > 0 && maxG / minG <= 3) {
3639
+ const run = sorted.slice(i, runEnd);
3640
+ const text = run.map((r) => r.text).join("");
3641
+ const first = run[0], last = run[runEnd - i - 1];
3642
+ const item = {
3643
+ text,
3644
+ x: first.x,
3645
+ y: first.y,
3646
+ w: last.x + last.w - first.x,
3647
+ h: first.h,
3648
+ fontSize: first.fontSize,
3649
+ fontName: first.fontName
3650
+ };
3651
+ originMap.set(item, run);
3652
+ merged.push(item);
3653
+ i = runEnd;
3654
+ continue;
3655
+ }
3656
+ }
3657
+ }
3658
+ merged.push(sorted[i]);
3659
+ i++;
3660
+ }
3661
+ }
3662
+ return { merged, originMap };
3663
+ }
3664
+ function expandUsedItems(usedItems, originMap) {
3665
+ const toAdd = [];
3666
+ for (const item of usedItems) {
3667
+ const origins = originMap.get(item);
3668
+ if (origins) for (const o of origins) toAdd.push(o);
3669
+ }
3670
+ for (const a of toAdd) usedItems.add(a);
3671
+ }
3672
+ function detectHeaderRow(rows) {
3673
+ const allItems = rows.flatMap((r) => r.items);
3674
+ if (allItems.length === 0) return null;
3675
+ let allMinX = Infinity, allMaxX = -Infinity;
3676
+ for (const i of allItems) {
3677
+ if (i.x < allMinX) allMinX = i.x;
3678
+ const r = i.x + i.w;
3679
+ if (r > allMaxX) allMaxX = r;
3680
+ }
3681
+ const pageSpan = allMaxX - allMinX;
3682
+ if (pageSpan <= 0) return null;
3683
+ for (let ri = 0; ri < rows.length; ri++) {
3684
+ const row = rows[ri];
3685
+ if (row.items.length < MIN_COLS || row.items.length > 6) continue;
3686
+ if (row.items.some((i) => i.text.length > 8)) continue;
3687
+ if (!row.items.some((i) => /[가-힣]/.test(i.text))) continue;
3688
+ if (row.items.some((i) => /^[□■○●·※▶▷◆◇\-]/.test(i.text))) continue;
3689
+ const sorted = [...row.items].sort((a, b) => a.x - b.x);
3690
+ const xSpan = sorted[sorted.length - 1].x + sorted[sorted.length - 1].w - sorted[0].x;
3691
+ if (xSpan / pageSpan < 0.4) continue;
3692
+ const avgFs = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
3693
+ let hasLargeGap = false;
3694
+ for (let i = 1; i < sorted.length; i++) {
3695
+ const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
3696
+ if (gap >= avgFs * 2.5) {
3697
+ hasLargeGap = true;
3698
+ break;
3699
+ }
3700
+ }
3701
+ if (!hasLargeGap) continue;
3702
+ const columns = sorted.map((item) => ({ x: item.x, count: 0 }));
3703
+ let matchCount = 0;
3704
+ for (let j = ri + 1; j < rows.length && matchCount < MIN_ROWS + 2; j++) {
3705
+ const matched = countMatchedColumnsRange(rows[j], columns, sorted);
3706
+ if (matched >= MIN_COLS) matchCount++;
3707
+ }
3708
+ if (matchCount < MIN_ROWS) continue;
3709
+ return { columns, headerIdx: ri };
3710
+ }
3711
+ return null;
3712
+ }
3713
+ function mergeMultiLineRows(rows, columns) {
3714
+ if (rows.length <= 1) return rows;
3715
+ const result = [rows[0]];
3716
+ const allFontSizes = rows.flatMap((r) => r.items).map((i) => i.fontSize);
3717
+ const avgFontSize = allFontSizes.length > 0 ? allFontSizes.reduce((s, v) => s + v, 0) / allFontSizes.length : 12;
3718
+ for (let i = 1; i < rows.length; i++) {
3719
+ const prev = result[result.length - 1];
3720
+ const curr = rows[i];
3721
+ const yGap = Math.abs(prev.y - curr.y);
3722
+ const matchedCols = countMatchedColumns(curr, columns);
3723
+ if (yGap < avgFontSize * 1.8 && curr.items.length <= 2 && (matchedCols < MIN_COLS || curr.items.length === 1)) {
3724
+ result[result.length - 1] = {
3725
+ y: prev.y,
3726
+ items: [...prev.items, ...curr.items]
3727
+ };
3728
+ } else {
3729
+ result.push(curr);
3730
+ }
3731
+ }
3732
+ return result;
3733
+ }
3401
3734
  function groupByBaseline(items) {
3402
3735
  if (items.length === 0) return [];
3403
3736
  const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
@@ -3419,8 +3752,9 @@ function groupByBaseline(items) {
3419
3752
  function hasSuspiciousGaps(row) {
3420
3753
  if (row.items.length < 2) return false;
3421
3754
  const sorted = [...row.items].sort((a, b) => a.x - b.x);
3755
+ if (sorted.length === 2 && sorted[1].text.length > 20) return false;
3422
3756
  const avgFontSize = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
3423
- const minGap = avgFontSize * MIN_GAP_FACTOR;
3757
+ const minGap = Math.max(avgFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
3424
3758
  for (let i = 1; i < sorted.length; i++) {
3425
3759
  const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
3426
3760
  if (gap >= minGap) return true;
@@ -3447,6 +3781,41 @@ function extractColumnClusters(rows) {
3447
3781
  const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
3448
3782
  return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
3449
3783
  }
3784
+ function findTableRegionsByHeader(allRows, columns, headerItems) {
3785
+ const regions = [];
3786
+ let currentRegion = [];
3787
+ let missStreak = 0;
3788
+ for (const row of allRows) {
3789
+ const matchedCols = countMatchedColumnsRange(row, columns, headerItems);
3790
+ if (matchedCols >= MIN_COLS) {
3791
+ currentRegion.push(row);
3792
+ missStreak = 0;
3793
+ } else if (currentRegion.length > 0 && (row.items.length <= 2 || missStreak === 0)) {
3794
+ currentRegion.push(row);
3795
+ missStreak++;
3796
+ } else {
3797
+ while (currentRegion.length > 0) {
3798
+ const last = currentRegion[currentRegion.length - 1];
3799
+ if (countMatchedColumnsRange(last, columns, headerItems) >= MIN_COLS) break;
3800
+ currentRegion.pop();
3801
+ }
3802
+ if (currentRegion.length >= MIN_ROWS) {
3803
+ regions.push({ rows: [...currentRegion] });
3804
+ }
3805
+ currentRegion = [];
3806
+ missStreak = 0;
3807
+ }
3808
+ }
3809
+ while (currentRegion.length > 0) {
3810
+ const last = currentRegion[currentRegion.length - 1];
3811
+ if (countMatchedColumnsRange(last, columns, headerItems) >= MIN_COLS) break;
3812
+ currentRegion.pop();
3813
+ }
3814
+ if (currentRegion.length >= MIN_ROWS) {
3815
+ regions.push({ rows: currentRegion });
3816
+ }
3817
+ return regions;
3818
+ }
3450
3819
  function findTableRegions(allRows, columns) {
3451
3820
  const regions = [];
3452
3821
  let currentRegion = [];
@@ -3482,18 +3851,81 @@ function countMatchedColumns(row, columns) {
3482
3851
  }
3483
3852
  return matched.size;
3484
3853
  }
3485
- function assignToColumn(item, columns) {
3486
- const MAX_DIST = COL_CLUSTER_TOL * 3;
3487
- let bestCol = -1;
3488
- let bestDist = Infinity;
3489
- for (let ci = 0; ci < columns.length; ci++) {
3490
- const dist = Math.abs(item.x - columns[ci].x);
3491
- if (dist < bestDist) {
3492
- bestDist = dist;
3493
- bestCol = ci;
3854
+ function countMatchedColumnsRange(row, columns, headerItems) {
3855
+ const boundaries = [];
3856
+ for (let ci = 0; ci < headerItems.length; ci++) {
3857
+ const left = ci === 0 ? 0 : (headerItems[ci - 1].x + headerItems[ci - 1].w + headerItems[ci].x) / 2;
3858
+ const right = ci === headerItems.length - 1 ? Infinity : (headerItems[ci].x + headerItems[ci].w + headerItems[ci + 1].x) / 2;
3859
+ boundaries.push({ left, right });
3860
+ }
3861
+ const matched = /* @__PURE__ */ new Set();
3862
+ for (const item of row.items) {
3863
+ for (let ci = 0; ci < boundaries.length; ci++) {
3864
+ if (item.x >= boundaries[ci].left && item.x < boundaries[ci].right) {
3865
+ matched.add(ci);
3866
+ break;
3867
+ }
3868
+ }
3869
+ }
3870
+ return matched.size;
3871
+ }
3872
+ function assignRowItems(items, columns, numCols) {
3873
+ if (items.length === 0) return [];
3874
+ const sorted = [...items].sort((a, b) => a.x - b.x);
3875
+ const colCenters = columns.map((c) => c.x);
3876
+ const gaps = [];
3877
+ for (let i = 1; i < sorted.length; i++) {
3878
+ gaps.push({ idx: i, size: sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w) });
3879
+ }
3880
+ const gapSizes = gaps.map((g2) => g2.size).sort((a, b) => a - b);
3881
+ const medianGap = gapSizes.length > 0 ? gapSizes[Math.floor(gapSizes.length / 2)] : 0;
3882
+ const gapThreshold = sorted.length <= numCols + 1 ? 12 : Math.max(medianGap * 2.5, 12);
3883
+ const significantGaps = gaps.filter((g2) => g2.size >= gapThreshold).sort((a, b) => b.size - a.size).slice(0, numCols - 1).sort((a, b) => a.idx - b.idx);
3884
+ const groups = [];
3885
+ let start = 0;
3886
+ for (const gap of significantGaps) {
3887
+ groups.push(sorted.slice(start, gap.idx));
3888
+ start = gap.idx;
3889
+ }
3890
+ groups.push(sorted.slice(start));
3891
+ const result = [];
3892
+ const usedCols = /* @__PURE__ */ new Set();
3893
+ const groupCenters = groups.map((g2) => {
3894
+ let minX = Infinity, maxX = -Infinity;
3895
+ for (const i of g2) {
3896
+ if (i.x < minX) minX = i.x;
3897
+ const r = i.x + i.w;
3898
+ if (r > maxX) maxX = r;
3899
+ }
3900
+ return (minX + maxX) / 2;
3901
+ });
3902
+ const assignments = [];
3903
+ for (let gi = 0; gi < groups.length; gi++) {
3904
+ for (let ci = 0; ci < numCols; ci++) {
3905
+ assignments.push({ gi, ci, dist: Math.abs(groupCenters[gi] - colCenters[ci]) });
3906
+ }
3907
+ }
3908
+ assignments.sort((a, b) => a.dist - b.dist);
3909
+ const assignedGroups = /* @__PURE__ */ new Set();
3910
+ for (const { gi, ci } of assignments) {
3911
+ if (assignedGroups.has(gi) || usedCols.has(ci)) continue;
3912
+ result.push({ col: ci, items: groups[gi] });
3913
+ assignedGroups.add(gi);
3914
+ usedCols.add(ci);
3915
+ }
3916
+ for (let gi = 0; gi < groups.length; gi++) {
3917
+ if (assignedGroups.has(gi)) continue;
3918
+ let bestCol = 0, bestDist = Infinity;
3919
+ for (let ci = 0; ci < numCols; ci++) {
3920
+ const d = Math.abs(groupCenters[gi] - colCenters[ci]);
3921
+ if (d < bestDist) {
3922
+ bestDist = d;
3923
+ bestCol = ci;
3924
+ }
3494
3925
  }
3926
+ result.push({ col: bestCol, items: groups[gi] });
3495
3927
  }
3496
- return bestDist <= MAX_DIST ? bestCol : -1;
3928
+ return result;
3497
3929
  }
3498
3930
  function buildClusterTable(rows, columns, pageNum) {
3499
3931
  const numCols = columns.length;
@@ -3511,12 +3943,12 @@ function buildClusterTable(rows, columns, pageNum) {
3511
3943
  usedItems.add(row.items[0]);
3512
3944
  continue;
3513
3945
  }
3514
- for (const item of row.items) {
3515
- const col = assignToColumn(item, columns);
3516
- if (col < 0) continue;
3946
+ const assignments = assignRowItems(row.items, columns, numCols);
3947
+ for (const { col, items } of assignments) {
3948
+ const text = items.map((i) => i.text).join(" ");
3517
3949
  const existing = cells[r][col].text;
3518
- cells[r][col].text = existing ? existing + " " + item.text : item.text;
3519
- usedItems.add(item);
3950
+ cells[r][col].text = existing ? existing + " " + text : text;
3951
+ for (const item of items) usedItems.add(item);
3520
3952
  }
3521
3953
  }
3522
3954
  let emptyRows = 0;
@@ -3528,11 +3960,48 @@ function buildClusterTable(rows, columns, pageNum) {
3528
3960
  const hasValue = cells.some((row) => row[c].text !== "");
3529
3961
  if (!hasValue) return null;
3530
3962
  }
3963
+ for (let r = numRows - 1; r >= 1; r--) {
3964
+ const nonEmptyCols = cells[r].filter((c) => c.text.trim()).length;
3965
+ if (nonEmptyCols !== 1) continue;
3966
+ if (cells[r][0].text.trim() !== "") continue;
3967
+ const contentText = cells[r].find((c) => c.text.trim())?.text.trim() || "";
3968
+ if (/^[○●▶\-·]/.test(contentText)) continue;
3969
+ for (let pr = r - 1; pr >= 0; pr--) {
3970
+ if (cells[pr].some((c) => c.text.trim())) {
3971
+ for (let c = 0; c < numCols; c++) {
3972
+ const prev = cells[pr][c].text.trim();
3973
+ const curr = cells[r][c].text.trim();
3974
+ if (curr) cells[pr][c].text = prev ? prev + " " + curr : curr;
3975
+ }
3976
+ for (let c = 0; c < numCols; c++) cells[r][c].text = "";
3977
+ break;
3978
+ }
3979
+ }
3980
+ }
3981
+ for (let r = 0; r < cells.length - 1; r++) {
3982
+ const row = cells[r];
3983
+ const hasCol0 = row[0].text.trim() !== "";
3984
+ const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
3985
+ const midEmpty = row.slice(1, numCols - 1).every((c) => c.text.trim() === "");
3986
+ if (hasCol0 && hasColLast && midEmpty) {
3987
+ const next = cells[r + 1];
3988
+ if (next[0].text.trim() === "" && next.some((c) => c.text.trim())) {
3989
+ for (let c = 1; c < numCols; c++) {
3990
+ const curr = next[c].text.trim();
3991
+ if (curr) row[c].text = row[c].text.trim() ? row[c].text.trim() + " " + curr : curr;
3992
+ }
3993
+ for (let c = 0; c < numCols; c++) next[c].text = "";
3994
+ }
3995
+ }
3996
+ }
3997
+ const filteredCells = cells.filter((row) => row.some((c) => c.text.trim()));
3998
+ const finalRowCount = filteredCells.length;
3999
+ if (finalRowCount < MIN_ROWS) return null;
3531
4000
  const irTable = {
3532
- rows: numRows,
4001
+ rows: finalRowCount,
3533
4002
  cols: numCols,
3534
- cells,
3535
- hasHeader: numRows > 1
4003
+ cells: filteredCells,
4004
+ hasHeader: finalRowCount > 1
3536
4005
  };
3537
4006
  const allItems = rows.flatMap((r) => r.items);
3538
4007
  let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
@@ -3609,7 +4078,7 @@ async function parsePdfDocument(buffer, options) {
3609
4078
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
3610
4079
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
3611
4080
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
3612
- const allFontSizes = [];
4081
+ const fontSizeFreq = /* @__PURE__ */ new Map();
3613
4082
  const pageHeights = /* @__PURE__ */ new Map();
3614
4083
  let parsedPages = 0;
3615
4084
  for (let i = 1; i <= effectivePageCount; i++) {
@@ -3626,7 +4095,7 @@ async function parsePdfDocument(buffer, options) {
3626
4095
  warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
3627
4096
  }
3628
4097
  for (const item of visible) {
3629
- if (item.fontSize > 0) allFontSizes.push(item.fontSize);
4098
+ if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
3630
4099
  }
3631
4100
  const opList = await page.getOperatorList();
3632
4101
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
@@ -3665,10 +4134,9 @@ async function parsePdfDocument(buffer, options) {
3665
4134
  blocks.splice(removed[ri], 1);
3666
4135
  }
3667
4136
  }
3668
- const medianFontSize = computeMedianFontSize(allFontSizes);
4137
+ const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
3669
4138
  if (medianFontSize > 0) {
3670
4139
  detectHeadings(blocks, medianFontSize);
3671
- mergeAdjacentHeadings(blocks);
3672
4140
  }
3673
4141
  detectMarkerHeadings(blocks);
3674
4142
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
@@ -3730,11 +4198,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
3730
4198
  }
3731
4199
  return { visible, hiddenCount };
3732
4200
  }
3733
- function computeMedianFontSize(sizes) {
3734
- if (sizes.length === 0) return 0;
3735
- const sorted = [...sizes].sort((a, b) => a - b);
3736
- const mid = Math.floor(sorted.length / 2);
3737
- return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
4201
+ function computeMedianFontSizeFromFreq(freq) {
4202
+ if (freq.size === 0) return 0;
4203
+ let total = 0;
4204
+ for (const count of freq.values()) total += count;
4205
+ const sorted = [...freq.entries()].sort((a, b) => a[0] - b[0]);
4206
+ const mid = Math.floor(total / 2);
4207
+ let cumulative = 0;
4208
+ for (const [size, count] of sorted) {
4209
+ cumulative += count;
4210
+ if (cumulative > mid) return size;
4211
+ }
4212
+ return sorted[sorted.length - 1][0];
3738
4213
  }
3739
4214
  function detectHeadings(blocks, medianFontSize) {
3740
4215
  for (const block of blocks) {
@@ -3754,220 +4229,27 @@ function detectHeadings(blocks, medianFontSize) {
3754
4229
  }
3755
4230
  }
3756
4231
  }
3757
- function mergeAdjacentHeadings(blocks) {
3758
- let i = 0;
3759
- while (i < blocks.length - 1) {
3760
- const curr = blocks[i];
3761
- const next = blocks[i + 1];
3762
- if (curr.type !== "heading" || next.type !== "heading") {
3763
- i++;
3764
- continue;
3765
- }
3766
- if (!curr.bbox || !next.bbox || !curr.text || !next.text) {
3767
- i++;
3768
- continue;
3769
- }
3770
- const currBaseline = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
3771
- const nextBaseline = next.bbox.y + (next.style?.fontSize || next.bbox.height);
3772
- const yDiff = Math.abs(currBaseline - nextBaseline);
3773
- const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
3774
- const sameY = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
3775
- const sameLevel = curr.level === next.level;
3776
- if (sameY && sameLevel) {
3777
- const currX = curr.bbox.x;
3778
- const nextX = next.bbox.x;
3779
- if (currX <= nextX) {
3780
- curr.text = curr.text + " " + next.text;
3781
- } else {
3782
- curr.text = next.text + " " + curr.text;
3783
- }
3784
- curr.bbox = {
3785
- page: curr.bbox.page,
3786
- x: Math.min(curr.bbox.x, next.bbox.x),
3787
- y: Math.min(curr.bbox.y, next.bbox.y),
3788
- width: Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width) - Math.min(curr.bbox.x, next.bbox.x),
3789
- height: Math.max(curr.bbox.height, next.bbox.height)
3790
- };
3791
- blocks.splice(i + 1, 1);
3792
- } else {
3793
- i++;
3794
- }
3795
- }
3796
- }
3797
4232
  function collapseEvenSpacing(text) {
3798
4233
  const tokens = text.split(" ");
3799
4234
  const singleCharCount = tokens.filter((t) => t.length === 1).length;
3800
4235
  if (tokens.length >= 3 && singleCharCount / tokens.length >= 0.7) {
3801
4236
  return tokens.join("");
3802
4237
  }
3803
- return text;
3804
- }
3805
- function buildXyCutBlocks(items, pageNum) {
3806
- const allY = items.map((i) => i.y);
3807
- const pageHeight = Math.max(...allY) - Math.min(...allY);
3808
- const gapThreshold = Math.max(15, pageHeight * 0.03);
3809
- const orderedGroups = xyCutOrder(items, gapThreshold);
3810
- const blocks = [];
3811
- for (const group of orderedGroups) {
3812
- if (group.length === 0) continue;
3813
- const yLines = groupByY(group);
3814
- for (const line of yLines) {
3815
- const text = mergeLineSimple(line);
3816
- if (!text.trim()) continue;
3817
- blocks.push({
3818
- type: "paragraph",
3819
- text,
3820
- pageNumber: pageNum,
3821
- bbox: computeBBox(line, pageNum),
3822
- style: dominantStyle(line)
3823
- });
3824
- }
3825
- }
3826
- return blocks.length > 0 ? blocks : null;
3827
- }
3828
- function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
3829
- const totalCells = table.cells.reduce((sum, row) => sum + row.filter((c) => c.text.trim()).length, 0);
3830
- const totalTextLines = table.cells.reduce((sum, row) => sum + row.reduce((s, c) => s + (c.text.trim() ? c.text.split("\n").length : 0), 0), 0);
3831
- const isUnderSegmented = table.rows === 1 && table.cols === 1 || totalCells <= 2 && totalTextLines >= 8 || totalCells <= 2 && items.length >= 6;
3832
- if (!isUnderSegmented) return null;
3833
- if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);
3834
- const directTable = buildTableFromTextLayout(items, pageNum, bbox);
3835
- if (directTable) return directTable;
3836
- const clusterItems = items.map((i) => ({
3837
- text: i.text,
3838
- x: i.x,
3839
- y: i.y,
3840
- w: i.w,
3841
- h: i.h,
3842
- fontSize: i.fontSize,
3843
- fontName: i.fontName
3844
- }));
3845
- const clusterResults = detectClusterTables(clusterItems, pageNum);
3846
- if (clusterResults.length > 0) {
3847
- const blocks = [];
3848
- const ciToIdx = /* @__PURE__ */ new Map();
3849
- for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
3850
- const usedIndices = /* @__PURE__ */ new Set();
3851
- for (const cr of clusterResults) {
3852
- for (const ci of cr.usedItems) {
3853
- const idx = ciToIdx.get(ci);
3854
- if (idx !== void 0) usedIndices.add(idx);
3855
- }
3856
- blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
3857
- }
3858
- const remaining = items.filter((_, idx) => !usedIndices.has(idx));
3859
- for (const item of remaining) {
3860
- if (!item.text.trim()) continue;
3861
- blocks.push({
3862
- type: "paragraph",
3863
- text: item.text,
3864
- pageNumber: pageNum,
3865
- bbox: computeBBox([item], pageNum),
3866
- style: { fontSize: item.fontSize, fontName: item.fontName }
3867
- });
3868
- }
3869
- blocks.sort((a, b) => {
3870
- const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
3871
- const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
3872
- return by - ay;
3873
- });
3874
- return blocks.length > 0 ? blocks : null;
3875
- }
3876
- return null;
3877
- }
3878
- function buildTableFromTextLayout(items, pageNum, bbox) {
3879
- if (items.length < 4) return null;
3880
- const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
3881
- const yTol = 3;
3882
- const rows = [];
3883
- let curRow = [sorted[0]];
3884
- let curY = sorted[0].y;
3885
- for (let i = 1; i < sorted.length; i++) {
3886
- if (Math.abs(sorted[i].y - curY) <= yTol) {
3887
- curRow.push(sorted[i]);
3888
- } else {
3889
- rows.push(curRow);
3890
- curRow = [sorted[i]];
3891
- curY = sorted[i].y;
3892
- }
3893
- }
3894
- rows.push(curRow);
3895
- if (rows.length < 2) return null;
3896
- const gapPositions = [];
3897
- for (const row of rows) {
3898
- if (row.length < 2) continue;
3899
- const sortedX = [...row].sort((a, b) => a.x - b.x);
3900
- const avgFs = sortedX.reduce((s, i) => s + i.fontSize, 0) / sortedX.length;
3901
- for (let j = 1; j < sortedX.length; j++) {
3902
- const gap = sortedX[j].x - (sortedX[j - 1].x + sortedX[j - 1].w);
3903
- if (gap >= avgFs * 1.5) {
3904
- gapPositions.push(sortedX[j - 1].x + sortedX[j - 1].w + gap / 2);
3905
- }
3906
- }
3907
- }
3908
- if (gapPositions.length < 2) return null;
3909
- gapPositions.sort((a, b) => a - b);
3910
- const colBoundaries = [];
3911
- let clusterSum = gapPositions[0], clusterCount = 1;
3912
- for (let i = 1; i < gapPositions.length; i++) {
3913
- const avg = clusterSum / clusterCount;
3914
- if (Math.abs(gapPositions[i] - avg) <= 15) {
3915
- clusterSum += gapPositions[i];
3916
- clusterCount++;
3917
- } else {
3918
- if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
3919
- clusterSum = gapPositions[i];
3920
- clusterCount = 1;
3921
- }
3922
- }
3923
- if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
3924
- if (colBoundaries.length === 0) return null;
3925
- const numCols = colBoundaries.length + 1;
3926
- const tableRows = [];
3927
- for (const row of rows) {
3928
- const cells = Array(numCols).fill("");
3929
- const sortedX = [...row].sort((a, b) => a.x - b.x);
3930
- for (const item of sortedX) {
3931
- const cx = item.x + item.w / 2;
3932
- let col = 0;
3933
- for (let b = 0; b < colBoundaries.length; b++) {
3934
- if (cx > colBoundaries[b]) col = b + 1;
3935
- }
3936
- cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
3937
- }
3938
- if (cells[0].trim() === "" && tableRows.length > 0) {
3939
- const prevCells = tableRows[tableRows.length - 1].cells;
3940
- for (let c = 0; c < numCols; c++) {
3941
- if (cells[c].trim()) {
3942
- prevCells[c] = prevCells[c] ? prevCells[c] + " " + cells[c].trim() : cells[c].trim();
3943
- }
3944
- }
3945
- } else {
3946
- tableRows.push({ cells });
3947
- }
3948
- }
3949
- if (tableRows.length < 2) return null;
3950
- const nonEmptyCount = tableRows.reduce((sum, r) => sum + r.cells.filter((c) => c.trim()).length, 0);
3951
- const totalCount = tableRows.length * numCols;
3952
- if (nonEmptyCount < totalCount * 0.3) return null;
3953
- const irCells = tableRows.map(
3954
- (r) => r.cells.map((text, colIdx) => {
3955
- let cleaned = text.trim();
3956
- if (colIdx > 0) cleaned = cleaned.replace(/^[•○·\-]\s*/, "");
3957
- return { text: cleaned, colSpan: 1, rowSpan: 1 };
3958
- })
4238
+ return text.replace(
4239
+ /(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
4240
+ (match) => match.replace(/ /g, "")
3959
4241
  );
3960
- const irTable = {
3961
- rows: tableRows.length,
3962
- cols: numCols,
3963
- cells: irCells,
3964
- hasHeader: tableRows.length > 1
3965
- };
3966
- return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
3967
4242
  }
3968
4243
  function shouldDemoteTable(table) {
3969
4244
  const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
3970
4245
  const allText = allCells.join(" ");
4246
+ if (table.rows <= 3 && table.cols <= 3) {
4247
+ const totalCells2 = table.rows * table.cols;
4248
+ const emptyCells2 = totalCells2 - allCells.length;
4249
+ if (emptyCells2 >= totalCells2 * 0.3) return true;
4250
+ if (/[□■◆○●▶ㅇ]/.test(allText)) return true;
4251
+ if (/<[^>]+>/.test(allText)) return true;
4252
+ }
3971
4253
  if (allText.length > 200) return false;
3972
4254
  if (/[□■◆○●▶]/.test(allText) && table.rows <= 3) return true;
3973
4255
  const totalCells = table.rows * table.cols;
@@ -4011,32 +4293,6 @@ function detectMarkerHeadings(blocks) {
4011
4293
  }
4012
4294
  }
4013
4295
  }
4014
- function hasMultiColumnLayout(items) {
4015
- if (items.length < 30) return false;
4016
- const sorted = [...items].sort((a, b) => a.x - b.x);
4017
- const minX = sorted[0].x;
4018
- let maxX = minX;
4019
- for (const i of sorted) if (i.x + i.w > maxX) maxX = i.x + i.w;
4020
- const pageWidth = maxX - minX;
4021
- if (pageWidth < 200) return false;
4022
- let bestGap = 0;
4023
- let bestSplit = 0;
4024
- for (let j = 1; j < sorted.length; j++) {
4025
- const gap = sorted[j].x - (sorted[j - 1].x + sorted[j - 1].w);
4026
- if (gap > bestGap) {
4027
- bestGap = gap;
4028
- bestSplit = (sorted[j - 1].x + sorted[j - 1].w + sorted[j].x) / 2;
4029
- }
4030
- }
4031
- if (bestGap < 20) return false;
4032
- const splitRatio = (bestSplit - minX) / pageWidth;
4033
- if (splitRatio < 0.35 || splitRatio > 0.65) return false;
4034
- const leftCount = items.filter((i) => i.x + i.w / 2 < bestSplit).length;
4035
- const rightCount = items.filter((i) => i.x + i.w / 2 >= bestSplit).length;
4036
- if (leftCount < 15 || rightCount < 15) return false;
4037
- if (Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount) < 0.35) return false;
4038
- return true;
4039
- }
4040
4296
  var MAX_XYCUT_DEPTH = 50;
4041
4297
  function xyCutOrder(items, gapThreshold, depth = 0) {
4042
4298
  if (items.length === 0) return [];
@@ -4104,6 +4360,7 @@ function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeigh
4104
4360
  if (items.length === 0) return [];
4105
4361
  let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
4106
4362
  ({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
4363
+ ({ horizontals, verticals } = preprocessLines(horizontals, verticals));
4107
4364
  const grids = buildTableGrids(horizontals, verticals);
4108
4365
  if (grids.length > 0) {
4109
4366
  return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
@@ -4115,14 +4372,19 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
4115
4372
  const usedItems = /* @__PURE__ */ new Set();
4116
4373
  const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
4117
4374
  for (const grid of sortedGrids) {
4375
+ const numGridRows = grid.rowYs.length - 1;
4376
+ const numGridCols = grid.colXs.length - 1;
4377
+ if (numGridRows === 1 && numGridCols >= 2) continue;
4118
4378
  const tableItems = [];
4119
4379
  const pad = 3;
4380
+ const gridW = grid.bbox.x2 - grid.bbox.x1;
4120
4381
  for (const item of items) {
4121
4382
  if (usedItems.has(item)) continue;
4122
- if (item.x >= grid.bbox.x1 - pad && item.x + item.w <= grid.bbox.x2 + pad && item.y >= grid.bbox.y1 - pad && item.y <= grid.bbox.y2 + pad) {
4123
- tableItems.push(item);
4124
- usedItems.add(item);
4125
- }
4383
+ if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
4384
+ if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
4385
+ if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
4386
+ tableItems.push(item);
4387
+ usedItems.add(item);
4126
4388
  }
4127
4389
  const cells = extractCells(grid, horizontals, verticals);
4128
4390
  if (cells.length === 0) continue;
@@ -4146,6 +4408,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
4146
4408
  const cellItems = cellTextMap.get(cell) || [];
4147
4409
  let text = cellTextToString(cellItems);
4148
4410
  text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
4411
+ text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
4149
4412
  irGrid[cell.row][cell.col] = {
4150
4413
  text,
4151
4414
  colSpan: cell.colSpan,
@@ -4167,31 +4430,61 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
4167
4430
  width: grid.bbox.x2 - grid.bbox.x1,
4168
4431
  height: grid.bbox.y2 - grid.bbox.y1
4169
4432
  };
4170
- const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
4171
- if (normalized) {
4172
- blocks.push(...normalized);
4173
- continue;
4174
- }
4175
4433
  if (shouldDemoteTable(irTable)) {
4176
4434
  const demoted = demoteTableToText(irTable);
4177
4435
  if (demoted) {
4178
- blocks.push({ type: "paragraph", text: demoted, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
4436
+ const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
4437
+ blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
4179
4438
  }
4180
4439
  continue;
4181
4440
  }
4182
4441
  blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
4183
4442
  }
4184
- const remaining = items.filter((i) => !usedItems.has(i));
4443
+ let remaining = items.filter((i) => !usedItems.has(i));
4185
4444
  if (remaining.length > 0) {
4186
4445
  remaining.sort((a, b) => b.y - a.y || a.x - b.x);
4187
- const textBlocks = detectListBlocks(extractPageBlocksFallback(remaining, pageNum));
4188
- const allBlocks = [...blocks, ...textBlocks];
4189
- allBlocks.sort((a, b) => {
4446
+ const clusterItems = remaining.map((i) => ({
4447
+ text: i.text,
4448
+ x: i.x,
4449
+ y: i.y,
4450
+ w: i.w,
4451
+ h: i.h,
4452
+ fontSize: i.fontSize,
4453
+ fontName: i.fontName
4454
+ }));
4455
+ const clusterResults = detectClusterTables(clusterItems, pageNum);
4456
+ if (clusterResults.length > 0) {
4457
+ const ciToIdx = /* @__PURE__ */ new Map();
4458
+ for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4459
+ const usedClusterIndices = /* @__PURE__ */ new Set();
4460
+ for (const cr of clusterResults) {
4461
+ for (const ci of cr.usedItems) {
4462
+ const idx = ciToIdx.get(ci);
4463
+ if (idx !== void 0) usedClusterIndices.add(idx);
4464
+ }
4465
+ blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4466
+ }
4467
+ remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
4468
+ }
4469
+ if (remaining.length > 0) {
4470
+ const allY = remaining.map((i) => i.y);
4471
+ const pageH = safeMax(allY) - safeMin(allY);
4472
+ const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
4473
+ const textBlocks = [];
4474
+ for (const group of groups) {
4475
+ if (group.length === 0) continue;
4476
+ const groupBlocks = extractPageBlocksFallback(group, pageNum);
4477
+ for (const b of groupBlocks) textBlocks.push(b);
4478
+ }
4479
+ const finalTextBlocks = detectListBlocks(textBlocks);
4480
+ for (const b of finalTextBlocks) blocks.push(b);
4481
+ }
4482
+ blocks.sort((a, b) => {
4190
4483
  const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4191
4484
  const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4192
4485
  return by - ay;
4193
4486
  });
4194
- return mergeAdjacentTableBlocks(allBlocks);
4487
+ return mergeAdjacentTableBlocks(blocks);
4195
4488
  }
4196
4489
  return mergeAdjacentTableBlocks(blocks);
4197
4490
  }
@@ -4217,57 +4510,53 @@ function mergeAdjacentTableBlocks(blocks) {
4217
4510
  }
4218
4511
  function extractPageBlocksFallback(items, pageNum) {
4219
4512
  if (items.length === 0) return [];
4220
- if (hasMultiColumnLayout(items)) {
4221
- const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
4222
- return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
4223
- }
4224
4513
  const blocks = [];
4225
- const allYLines = groupByY(items);
4226
- const columns = detectColumns(allYLines);
4227
- if (columns && columns.length >= 3) {
4228
- const tableText = extractWithColumns(allYLines, columns);
4229
- const bbox = computeBBox(items, pageNum);
4230
- blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
4231
- } else {
4232
- const clusterItems = items.map((i) => ({
4233
- text: i.text,
4234
- x: i.x,
4235
- y: i.y,
4236
- w: i.w,
4237
- h: i.h,
4238
- fontSize: i.fontSize,
4239
- fontName: i.fontName
4240
- }));
4241
- const clusterResults = hasMultiColumnLayout(items) ? [] : detectClusterTables(clusterItems, pageNum);
4242
- if (clusterResults.length > 0) {
4243
- const ciToIdx = /* @__PURE__ */ new Map();
4244
- for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4245
- const usedIndices = /* @__PURE__ */ new Set();
4246
- for (const cr of clusterResults) {
4247
- for (const ci of cr.usedItems) {
4248
- const idx = ciToIdx.get(ci);
4249
- if (idx !== void 0) usedIndices.add(idx);
4250
- }
4251
- blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4514
+ const clusterItems = items.map((i) => ({
4515
+ text: i.text,
4516
+ x: i.x,
4517
+ y: i.y,
4518
+ w: i.w,
4519
+ h: i.h,
4520
+ fontSize: i.fontSize,
4521
+ fontName: i.fontName
4522
+ }));
4523
+ const clusterResults = detectClusterTables(clusterItems, pageNum);
4524
+ if (clusterResults.length > 0) {
4525
+ const ciToIdx = /* @__PURE__ */ new Map();
4526
+ for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4527
+ const usedIndices = /* @__PURE__ */ new Set();
4528
+ for (const cr of clusterResults) {
4529
+ for (const ci of cr.usedItems) {
4530
+ const idx = ciToIdx.get(ci);
4531
+ if (idx !== void 0) usedIndices.add(idx);
4252
4532
  }
4253
- const remaining = items.filter((_, idx) => !usedIndices.has(idx));
4254
- if (remaining.length > 0) {
4255
- const yLines = groupByY(remaining);
4256
- for (const line of yLines) {
4257
- const text = mergeLineSimple(line);
4258
- if (!text.trim()) continue;
4259
- const bbox = computeBBox(line, pageNum);
4260
- blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
4261
- }
4533
+ blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4534
+ }
4535
+ const remaining = items.filter((_, idx) => !usedIndices.has(idx));
4536
+ if (remaining.length > 0) {
4537
+ const yLines = groupByY(remaining);
4538
+ for (const line of yLines) {
4539
+ const text = mergeLineSimple(line);
4540
+ if (!text.trim()) continue;
4541
+ const bbox = computeBBox(line, pageNum);
4542
+ blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
4262
4543
  }
4263
- blocks.sort((a, b) => {
4264
- const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4265
- const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4266
- return by - ay;
4267
- });
4544
+ }
4545
+ blocks.sort((a, b) => {
4546
+ const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4547
+ const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4548
+ return by - ay;
4549
+ });
4550
+ } else {
4551
+ const allYLines = groupByY(items);
4552
+ const columns = detectColumns(allYLines);
4553
+ if (columns && columns.length >= 3) {
4554
+ const tableText = extractWithColumns(allYLines, columns);
4555
+ const bbox = computeBBox(items, pageNum);
4556
+ blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
4268
4557
  } else {
4269
4558
  const allY = items.map((i) => i.y);
4270
- const pageHeight = Math.max(...allY) - Math.min(...allY);
4559
+ const pageHeight = safeMax(allY) - safeMin(allY);
4271
4560
  const gapThreshold = Math.max(15, pageHeight * 0.03);
4272
4561
  const orderedGroups = xyCutOrder(items, gapThreshold);
4273
4562
  for (const group of orderedGroups) {
@@ -4320,22 +4609,76 @@ function dominantStyle(items) {
4320
4609
  return { fontSize: dominantSize, fontName };
4321
4610
  }
4322
4611
  function normalizeItems(rawItems) {
4323
- return rawItems.filter((i) => typeof i.str === "string" && i.str.trim() !== "").map((i) => {
4612
+ const items = [];
4613
+ const spacePositions = [];
4614
+ for (const i of rawItems) {
4615
+ if (typeof i.str !== "string") continue;
4616
+ const x = Math.round(i.transform[4]);
4617
+ const y = Math.round(i.transform[5]);
4618
+ if (!i.str.trim()) {
4619
+ spacePositions.push({ x, y });
4620
+ continue;
4621
+ }
4324
4622
  const scaleY = Math.abs(i.transform[3]);
4325
4623
  const scaleX = Math.abs(i.transform[0]);
4326
4624
  const fontSize = Math.round(Math.max(scaleY, scaleX));
4327
- return {
4328
- text: i.str.trim(),
4329
- x: Math.round(i.transform[4]),
4330
- y: Math.round(i.transform[5]),
4331
- w: Math.round(i.width),
4332
- h: Math.round(i.height),
4333
- fontSize,
4334
- fontName: i.fontName || "",
4335
- // 0pt 폰트이거나 너비 0 → hidden text (prompt injection 의심)
4336
- isHidden: fontSize === 0 || i.width === 0 && i.str.trim().length > 0
4337
- };
4338
- }).sort((a, b) => b.y - a.y || a.x - b.x);
4625
+ const w = Math.round(i.width);
4626
+ const h = Math.round(i.height);
4627
+ const isHidden = fontSize === 0 || i.width === 0 && i.str.trim().length > 0;
4628
+ let text = i.str.trim();
4629
+ if (/^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text)) {
4630
+ text = text.replace(/ /g, "");
4631
+ }
4632
+ const split = splitEvenSpacedItem(text, x, w, fontSize);
4633
+ if (split) {
4634
+ for (const s of split) {
4635
+ items.push({ text: s.text, x: s.x, y, w: s.w, h, fontSize, fontName: i.fontName || "", isHidden });
4636
+ }
4637
+ } else {
4638
+ items.push({ text, x, y, w, h, fontSize, fontName: i.fontName || "", isHidden });
4639
+ }
4640
+ }
4641
+ const sorted = items.sort((a, b) => b.y - a.y || a.x - b.x);
4642
+ const deduped = [];
4643
+ for (let i = 0; i < sorted.length; i++) {
4644
+ let isDup = false;
4645
+ for (let j = deduped.length - 1; j >= 0; j--) {
4646
+ const prev = deduped[j];
4647
+ if (prev.y - sorted[i].y > 3) break;
4648
+ if (Math.abs(prev.y - sorted[i].y) <= 3 && prev.text === sorted[i].text && Math.abs(prev.x - sorted[i].x) <= 3) {
4649
+ isDup = true;
4650
+ break;
4651
+ }
4652
+ }
4653
+ if (!isDup) deduped.push(sorted[i]);
4654
+ }
4655
+ if (spacePositions.length > 0) {
4656
+ for (const item of deduped) {
4657
+ for (const sp of spacePositions) {
4658
+ if (Math.abs(sp.y - item.y) <= 3) {
4659
+ const dist = item.x - sp.x;
4660
+ if (dist >= 0 && dist <= 20) {
4661
+ item.hasSpaceBefore = true;
4662
+ break;
4663
+ }
4664
+ }
4665
+ }
4666
+ }
4667
+ }
4668
+ return deduped;
4669
+ }
4670
+ function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
4671
+ if (!/^[가-힣\d](?: [가-힣\d]){2,}$/.test(text)) return null;
4672
+ const chars = text.split(" ");
4673
+ if (chars.length < 3) return null;
4674
+ const charW = itemW / chars.length;
4675
+ if (charW > fontSize * 2) return null;
4676
+ return chars.map((ch, idx) => ({
4677
+ text: ch,
4678
+ x: Math.round(itemX + idx * charW),
4679
+ w: Math.round(charW * 0.8)
4680
+ // 실제 글자 폭은 간격보다 좁음
4681
+ }));
4339
4682
  }
4340
4683
  function groupByY(items) {
4341
4684
  if (items.length === 0) return [];
@@ -4360,14 +4703,14 @@ function isProseSpread(items) {
4360
4703
  for (let i = 1; i < sorted.length; i++) {
4361
4704
  gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
4362
4705
  }
4363
- const maxGap = Math.max(...gaps);
4706
+ const maxGap = safeMax(gaps);
4364
4707
  const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
4365
4708
  return maxGap < 40 && avgLen < 5;
4366
4709
  }
4367
4710
  function detectColumns(yLines) {
4368
4711
  const allItems = yLines.flat();
4369
4712
  if (allItems.length === 0) return null;
4370
- const pageWidth = Math.max(...allItems.map((i) => i.x + i.w)) - Math.min(...allItems.map((i) => i.x));
4713
+ const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
4371
4714
  if (pageWidth < 100) return null;
4372
4715
  let bigoLineIdx = -1;
4373
4716
  for (let i = 0; i < yLines.length; i++) {
@@ -4399,7 +4742,7 @@ function detectColumns(yLines) {
4399
4742
  }
4400
4743
  const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
4401
4744
  if (peaks.length < 3) return null;
4402
- const MERGE_TOL = 30;
4745
+ const MERGE_TOL = 40;
4403
4746
  const merged = [peaks[0]];
4404
4747
  for (let i = 1; i < peaks.length; i++) {
4405
4748
  const prev = merged[merged.length - 1];
@@ -4413,7 +4756,14 @@ function detectColumns(yLines) {
4413
4756
  merged.push({ ...peaks[i] });
4414
4757
  }
4415
4758
  }
4416
- const columns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
4759
+ const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
4760
+ if (rawColumns.length < 3) return null;
4761
+ const MIN_DETECT_COL_WIDTH = 30;
4762
+ const columns = [rawColumns[0]];
4763
+ for (let i = 1; i < rawColumns.length; i++) {
4764
+ if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
4765
+ columns.push(rawColumns[i]);
4766
+ }
4417
4767
  return columns.length >= 3 ? columns : null;
4418
4768
  }
4419
4769
  function findColumn(x, columns) {
@@ -4541,6 +4891,16 @@ function buildGridTable(lines, columns) {
4541
4891
  }
4542
4892
  merged.splice(0, headerEnd, headerRow);
4543
4893
  }
4894
+ for (const row of merged) {
4895
+ for (let c = 0; c < row.length; c++) {
4896
+ if (row[c]) row[c] = collapseEvenSpacing(row[c]);
4897
+ }
4898
+ }
4899
+ const totalCells = merged.length * numCols;
4900
+ const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
4901
+ if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
4902
+ return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
4903
+ }
4544
4904
  const md = [];
4545
4905
  md.push("| " + merged[0].join(" | ") + " |");
4546
4906
  md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
@@ -4552,12 +4912,32 @@ function buildGridTable(lines, columns) {
4552
4912
  function mergeLineSimple(items) {
4553
4913
  if (items.length <= 1) return items[0]?.text || "";
4554
4914
  const sorted = [...items].sort((a, b) => a.x - b.x);
4915
+ const isEvenSpaced = detectEvenSpacedItems(sorted);
4555
4916
  let result = sorted[0].text;
4556
4917
  for (let i = 1; i < sorted.length; i++) {
4557
4918
  const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
4558
4919
  const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
4559
- if (gap > 15) result += " ";
4560
- else if (gap < avgFs * 0.15) {
4920
+ const tabThreshold = Math.max(avgFs * 2, 30);
4921
+ if (gap > tabThreshold) {
4922
+ result += " ";
4923
+ result += sorted[i].text;
4924
+ continue;
4925
+ }
4926
+ if (isEvenSpaced[i]) {
4927
+ result += sorted[i].text;
4928
+ continue;
4929
+ }
4930
+ if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
4931
+ result += " ";
4932
+ result += sorted[i].text;
4933
+ continue;
4934
+ }
4935
+ if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
4936
+ result += " ";
4937
+ result += sorted[i].text;
4938
+ continue;
4939
+ }
4940
+ if (gap < avgFs * 0.15) {
4561
4941
  } else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
4562
4942
  } else if (gap > 3) result += " ";
4563
4943
  result += sorted[i].text;
@@ -4566,8 +4946,8 @@ function mergeLineSimple(items) {
4566
4946
  }
4567
4947
  function cleanPdfText(text) {
4568
4948
  return mergeKoreanLines(
4569
- text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
4570
- ).replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line)).replace(/\n{3,}/g, "\n\n").trim();
4949
+ text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
4950
+ ).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
4571
4951
  }
4572
4952
  function startsWithMarker(line) {
4573
4953
  const t = line.trimStart();
@@ -4759,7 +5139,7 @@ function mergeKoreanLines(text) {
4759
5139
  result[result.length - 1] = prev + " " + currTrimmed;
4760
5140
  continue;
4761
5141
  }
4762
- if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
5142
+ if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev) && !startsWithMarker(prev)) {
4763
5143
  result[result.length - 1] = prev + " " + curr;
4764
5144
  } else {
4765
5145
  result.push(curr);
@@ -4772,7 +5152,7 @@ function mergeKoreanLines(text) {
4772
5152
  import { readFile } from "fs/promises";
4773
5153
 
4774
5154
  // src/xlsx/parser.ts
4775
- import JSZip3 from "jszip";
5155
+ import JSZip2 from "jszip";
4776
5156
  import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
4777
5157
  var MAX_SHEETS = 100;
4778
5158
  var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024;
@@ -4810,7 +5190,7 @@ function getTextContent(el) {
4810
5190
  return el.textContent?.trim() ?? "";
4811
5191
  }
4812
5192
  function parseXml(text) {
4813
- return new DOMParser2().parseFromString(text, "text/xml");
5193
+ return new DOMParser2().parseFromString(stripDtd(text), "text/xml");
4814
5194
  }
4815
5195
  function parseSharedStrings(xml) {
4816
5196
  const doc = parseXml(xml);
@@ -4963,7 +5343,7 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
4963
5343
  }
4964
5344
  async function parseXlsxDocument(buffer, options) {
4965
5345
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
4966
- const zip = await JSZip3.loadAsync(buffer);
5346
+ const zip = await JSZip2.loadAsync(buffer);
4967
5347
  const warnings = [];
4968
5348
  const workbookFile = zip.file("xl/workbook.xml");
4969
5349
  if (!workbookFile) {
@@ -5053,7 +5433,7 @@ async function parseXlsxDocument(buffer, options) {
5053
5433
  }
5054
5434
 
5055
5435
  // src/docx/parser.ts
5056
- import JSZip4 from "jszip";
5436
+ import JSZip3 from "jszip";
5057
5437
  import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
5058
5438
  var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
5059
5439
  function getChildElements(parent, localName) {
@@ -5097,7 +5477,7 @@ function getAttr(el, localName) {
5097
5477
  return null;
5098
5478
  }
5099
5479
  function parseXml2(text) {
5100
- return new DOMParser3().parseFromString(text, "text/xml");
5480
+ return new DOMParser3().parseFromString(stripDtd(text), "text/xml");
5101
5481
  }
5102
5482
  function parseStyles(xml) {
5103
5483
  const doc = parseXml2(xml);
@@ -5391,7 +5771,7 @@ async function extractImages(zip, rels, doc) {
5391
5771
  }
5392
5772
  async function parseDocxDocument(buffer, options) {
5393
5773
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
5394
- const zip = await JSZip4.loadAsync(buffer);
5774
+ const zip = await JSZip3.loadAsync(buffer);
5395
5775
  const warnings = [];
5396
5776
  const docFile = zip.file("word/document.xml");
5397
5777
  if (!docFile) {
@@ -5608,7 +5988,7 @@ function extractInlineFields(text) {
5608
5988
  }
5609
5989
 
5610
5990
  // src/hwpx/generator.ts
5611
- import JSZip5 from "jszip";
5991
+ import JSZip4 from "jszip";
5612
5992
 
5613
5993
  // src/index.ts
5614
5994
  async function parse(input, options) {
@@ -5703,7 +6083,13 @@ function normalize(s) {
5703
6083
  }
5704
6084
  var MAX_LEVENSHTEIN_LEN = 1e4;
5705
6085
  function levenshtein(a, b) {
5706
- if (a.length + b.length > MAX_LEVENSHTEIN_LEN) return Math.abs(a.length - b.length);
6086
+ if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
6087
+ const sampleLen = Math.min(500, a.length, b.length);
6088
+ let diffs = 0;
6089
+ for (let i = 0; i < sampleLen; i++) if (a[i] !== b[i]) diffs++;
6090
+ const sampleRate = sampleLen > 0 ? diffs / sampleLen : 1;
6091
+ return Math.abs(a.length - b.length) + Math.round(Math.min(a.length, b.length) * sampleRate);
6092
+ }
5707
6093
  if (a.length > b.length) [a, b] = [b, a];
5708
6094
  const m = a.length;
5709
6095
  const n = b.length;
@@ -5859,7 +6245,10 @@ function diffTableCells(a, b) {
5859
6245
  }
5860
6246
 
5861
6247
  export {
5862
- detectFormat,
6248
+ VERSION,
6249
+ toArrayBuffer,
6250
+ KordocError,
6251
+ sanitizeError,
5863
6252
  blocksToMarkdown,
5864
6253
  extractHwpxMetadataOnly,
5865
6254
  extractHwp5MetadataOnly,
@@ -5868,4 +6257,4 @@ export {
5868
6257
  extractFormFields,
5869
6258
  parse
5870
6259
  };
5871
- //# sourceMappingURL=chunk-GJ2S6IMC.js.map
6260
+ //# sourceMappingURL=chunk-LYFG7AUT.js.map