kordoc 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -139,7 +139,7 @@ import { inflateRawSync } from "zlib";
139
139
  import { DOMParser } from "@xmldom/xmldom";
140
140
 
141
141
  // src/utils.ts
142
- var VERSION = true ? "2.1.0" : "0.0.0-dev";
142
+ var VERSION = true ? "2.2.0" : "0.0.0-dev";
143
143
  function toArrayBuffer(buf) {
144
144
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
145
145
  return buf.buffer;
@@ -155,7 +155,8 @@ var KordocError = class extends Error {
155
155
  function isPathTraversal(name) {
156
156
  if (name.includes("\0")) return true;
157
157
  const normalized = name.replace(/\\/g, "/");
158
- return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
158
+ const segments = normalized.split("/");
159
+ return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
159
160
  }
160
161
  function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
161
162
  try {
@@ -195,12 +196,25 @@ function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEnt
195
196
  return { totalUncompressed: 0, entryCount: 0 };
196
197
  }
197
198
  }
199
+ function stripDtd(xml) {
200
+ return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
201
+ }
198
202
  var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
199
203
  function sanitizeHref(href) {
200
204
  const trimmed = href.trim();
201
205
  if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
202
206
  return trimmed;
203
207
  }
208
+ function safeMin(arr) {
209
+ let min = Infinity;
210
+ for (let i = 0; i < arr.length; i++) if (arr[i] < min) min = arr[i];
211
+ return min;
212
+ }
213
+ function safeMax(arr) {
214
+ let max = -Infinity;
215
+ for (let i = 0; i < arr.length; i++) if (arr[i] > max) max = arr[i];
216
+ return max;
217
+ }
204
218
  function classifyError(err) {
205
219
  if (!(err instanceof Error)) return "PARSE_ERROR";
206
220
  const msg = err.message;
@@ -275,6 +289,7 @@ function buildTableDirect(rows, numRows) {
275
289
  if (end > maxCols) maxCols = end;
276
290
  }
277
291
  }
292
+ if (maxCols > MAX_COLS) maxCols = MAX_COLS;
278
293
  if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
279
294
  const grid = Array.from(
280
295
  { length: numRows },
@@ -284,7 +299,7 @@ function buildTableDirect(rows, numRows) {
284
299
  for (const cell of row) {
285
300
  const r = cell.rowAddr ?? 0;
286
301
  const c = cell.colAddr ?? 0;
287
- if (r >= numRows || c >= maxCols) continue;
302
+ if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
288
303
  grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
289
304
  for (let dr = 0; dr < cell.rowSpan; dr++) {
290
305
  for (let dc = 0; dc < cell.colSpan; dc++) {
@@ -469,9 +484,6 @@ function tableToMarkdown(table) {
469
484
  if (dr === 0 && dc === 0) continue;
470
485
  if (r + dr < numRows && c + dc < numCols) {
471
486
  skip.add(`${r + dr},${c + dc}`);
472
- if (dr === 0) {
473
- display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
474
- }
475
487
  }
476
488
  }
477
489
  }
@@ -607,9 +619,6 @@ function parseStyleElements(doc, map) {
607
619
  }
608
620
  }
609
621
  }
610
- function stripDtd(xml) {
611
- return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
612
- }
613
622
  async function parseHwpxDocument(buffer, options) {
614
623
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
615
624
  let zip;
@@ -959,7 +968,8 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
959
968
  if (newTable.rows.length > 0) {
960
969
  if (tableStack.length > 0) {
961
970
  const parentTable = tableStack.pop();
962
- const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
971
+ let nestedCols = 0;
972
+ for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
963
973
  if (newTable.rows.length >= 3 && nestedCols >= 2) {
964
974
  blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
965
975
  } else {
@@ -1068,7 +1078,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
1068
1078
  if (newTable.rows.length > 0) {
1069
1079
  if (tableStack.length > 0) {
1070
1080
  const parentTable = tableStack.pop();
1071
- const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
1081
+ let nestedCols = 0;
1082
+ for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
1072
1083
  if (newTable.rows.length >= 3 && nestedCols >= 2) {
1073
1084
  blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
1074
1085
  } else {
@@ -2166,6 +2177,7 @@ function parseLenientCfb(data) {
2166
2177
  if (miniSectorSizeShift > 16) throw new Error("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uBBF8\uB2C8 \uC139\uD130 \uD06C\uAE30 \uC2DC\uD504\uD2B8: " + miniSectorSizeShift);
2167
2178
  const miniSectorSize = 1 << miniSectorSizeShift;
2168
2179
  const fatSectorCount = data.readUInt32LE(44);
2180
+ if (fatSectorCount > 1e4) throw new Error("FAT \uC139\uD130 \uC218\uAC00 \uB108\uBB34 \uB9CE\uC2B5\uB2C8\uB2E4: " + fatSectorCount);
2169
2181
  const firstDirSector = data.readUInt32LE(48);
2170
2182
  const miniStreamCutoff = data.readUInt32LE(56);
2171
2183
  const firstMiniFatSector = data.readUInt32LE(60);
@@ -2541,10 +2553,14 @@ function findSections(cfb) {
2541
2553
  }
2542
2554
  function findSectionsLenient(lcfb, compressed) {
2543
2555
  const sections = [];
2556
+ let totalDecompressed = 0;
2544
2557
  for (let i = 0; i < MAX_SECTIONS; i++) {
2545
2558
  const raw = lcfb.findStream(`/BodyText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
2546
2559
  if (!raw) break;
2547
- sections.push({ idx: i, content: compressed ? decompressStream(raw) : raw });
2560
+ const content = compressed ? decompressStream(raw) : raw;
2561
+ totalDecompressed += content.length;
2562
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2563
+ sections.push({ idx: i, content });
2548
2564
  }
2549
2565
  if (sections.length === 0) {
2550
2566
  for (const e of lcfb.entries()) {
@@ -2552,7 +2568,12 @@ function findSectionsLenient(lcfb, compressed) {
2552
2568
  if (e.name.startsWith("Section")) {
2553
2569
  const idx = parseInt(e.name.replace("Section", ""), 10) || 0;
2554
2570
  const raw = lcfb.findStream(e.name);
2555
- if (raw) sections.push({ idx, content: compressed ? decompressStream(raw) : raw });
2571
+ if (raw) {
2572
+ const content = compressed ? decompressStream(raw) : raw;
2573
+ totalDecompressed += content.length;
2574
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2575
+ sections.push({ idx, content });
2576
+ }
2556
2577
  }
2557
2578
  }
2558
2579
  }
@@ -2560,11 +2581,15 @@ function findSectionsLenient(lcfb, compressed) {
2560
2581
  }
2561
2582
  function findViewTextSectionsLenient(lcfb, compressed) {
2562
2583
  const sections = [];
2584
+ let totalDecompressed = 0;
2563
2585
  for (let i = 0; i < MAX_SECTIONS; i++) {
2564
2586
  const raw = lcfb.findStream(`/ViewText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
2565
2587
  if (!raw) break;
2566
2588
  try {
2567
- sections.push({ idx: i, content: decryptViewText(raw, compressed) });
2589
+ const content = decryptViewText(raw, compressed);
2590
+ totalDecompressed += content.length;
2591
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2592
+ sections.push({ idx: i, content });
2568
2593
  } catch {
2569
2594
  break;
2570
2595
  }
@@ -2966,37 +2991,18 @@ init_page_range();
2966
2991
  // src/pdf/line-detector.ts
2967
2992
  import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
2968
2993
  var ORIENTATION_TOL = 2;
2969
- var MIN_LINE_LENGTH = 10;
2970
- var COORD_MERGE_TOL = 3;
2994
+ var MIN_LINE_LENGTH = 15;
2995
+ var MAX_LINE_WIDTH = 5;
2971
2996
  var CONNECT_TOL = 5;
2972
2997
  var CELL_PADDING = 2;
2973
- var MAX_LINE_WIDTH = 5;
2974
- var IDENTITY = [1, 0, 0, 1, 0, 0];
2975
- function matMultiply(m1, m2) {
2976
- return [
2977
- m1[0] * m2[0] + m1[2] * m2[1],
2978
- m1[1] * m2[0] + m1[3] * m2[1],
2979
- m1[0] * m2[2] + m1[2] * m2[3],
2980
- m1[1] * m2[2] + m1[3] * m2[3],
2981
- m1[0] * m2[4] + m1[2] * m2[5] + m1[4],
2982
- m1[1] * m2[4] + m1[3] * m2[5] + m1[5]
2983
- ];
2984
- }
2985
- function matTransformPoint(m, x, y) {
2986
- return [m[0] * x + m[2] * y + m[4], m[1] * x + m[3] * y + m[5]];
2987
- }
2988
- function matScale(m) {
2989
- return Math.max(
2990
- Math.sqrt(m[1] * m[1] + m[3] * m[3]),
2991
- Math.sqrt(m[0] * m[0] + m[2] * m[2])
2992
- );
2993
- }
2998
+ var MIN_COL_WIDTH = 15;
2999
+ var MIN_ROW_HEIGHT = 6;
3000
+ var VERTEX_MERGE_FACTOR = 4;
3001
+ var MIN_COORD_MERGE_TOL = 8;
2994
3002
  function extractLines(fnArray, argsArray) {
2995
3003
  const horizontals = [];
2996
3004
  const verticals = [];
2997
- let ctm = [...IDENTITY];
2998
3005
  let lineWidth = 1;
2999
- const stateStack = [];
3000
3006
  let currentPath = [];
3001
3007
  let pathStartX = 0, pathStartY = 0;
3002
3008
  let curX = 0, curY = 0;
@@ -3014,53 +3020,13 @@ function extractLines(fnArray, argsArray) {
3014
3020
  );
3015
3021
  }
3016
3022
  }
3017
- function tryConvertLinesToRectangle(path) {
3018
- if (path.length < 3 || path.length > 5) return false;
3019
- const first = path[0], last = path[path.length - 1];
3020
- const closed = Math.abs(first.x1 - last.x2) < 1 && Math.abs(first.y1 - last.y2) < 1;
3021
- if (!closed) return false;
3022
- let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
3023
- for (const seg of path) {
3024
- minX = Math.min(minX, seg.x1, seg.x2);
3025
- minY = Math.min(minY, seg.y1, seg.y2);
3026
- maxX = Math.max(maxX, seg.x1, seg.x2);
3027
- maxY = Math.max(maxY, seg.y1, seg.y2);
3028
- }
3029
- const w = maxX - minX, h = maxY - minY;
3030
- if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;
3031
- path.length = 0;
3032
- if (h < ORIENTATION_TOL * 2 || w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH) {
3033
- path.push({ x1: minX, y1: (minY + maxY) / 2, x2: maxX, y2: (minY + maxY) / 2 });
3034
- } else if (w < ORIENTATION_TOL * 2 || h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH) {
3035
- path.push({ x1: (minX + maxX) / 2, y1: minY, x2: (minX + maxX) / 2, y2: maxY });
3036
- } else {
3037
- pushRectangle(path, minX, minY, w, h);
3038
- }
3039
- return true;
3040
- }
3041
- function flushPath(isStroke, isFill) {
3042
- if (!isStroke && !isFill) {
3043
- currentPath = [];
3044
- return;
3045
- }
3046
- if (isFill && !isStroke && currentPath.length >= 3) {
3047
- tryConvertLinesToRectangle(currentPath);
3048
- }
3049
- const scale = matScale(ctm);
3050
- const effectiveLW = lineWidth * scale;
3051
- if (effectiveLW > MAX_LINE_WIDTH && isStroke && !isFill) {
3023
+ function flushPath(isStroke) {
3024
+ if (!isStroke) {
3052
3025
  currentPath = [];
3053
3026
  return;
3054
3027
  }
3055
3028
  for (const seg of currentPath) {
3056
- const [px1, py1] = matTransformPoint(ctm, seg.x1, seg.y1);
3057
- const [px2, py2] = matTransformPoint(ctm, seg.x2, seg.y2);
3058
- classifyAndAdd(
3059
- { x1: px1, y1: py1, x2: px2, y2: py2 },
3060
- effectiveLW,
3061
- horizontals,
3062
- verticals
3063
- );
3029
+ classifyAndAdd(seg, lineWidth, horizontals, verticals);
3064
3030
  }
3065
3031
  currentPath = [];
3066
3032
  }
@@ -3068,28 +3034,9 @@ function extractLines(fnArray, argsArray) {
3068
3034
  const op = fnArray[i];
3069
3035
  const args = argsArray[i];
3070
3036
  switch (op) {
3071
- // ── Graphics State ──
3072
- case OPS.save:
3073
- stateStack.push({ ctm: [...ctm], lineWidth });
3074
- break;
3075
- case OPS.restore:
3076
- if (stateStack.length > 0) {
3077
- const state = stateStack.pop();
3078
- ctm = state.ctm;
3079
- lineWidth = state.lineWidth;
3080
- }
3081
- break;
3082
- case OPS.transform: {
3083
- const m = args;
3084
- if (m.length >= 6) {
3085
- ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
3086
- }
3087
- break;
3088
- }
3089
3037
  case OPS.setLineWidth:
3090
3038
  lineWidth = args[0] || 1;
3091
3039
  break;
3092
- // ── Path Construction ──
3093
3040
  case OPS.constructPath: {
3094
3041
  const arg0 = args[0];
3095
3042
  if (Array.isArray(arg0)) {
@@ -3157,60 +3104,34 @@ function extractLines(fnArray, argsArray) {
3157
3104
  }
3158
3105
  }
3159
3106
  }
3160
- const isStroke5 = afterOp === OPS.stroke || afterOp === OPS.closeStroke;
3161
- const isFill5 = afterOp === OPS.fill || afterOp === OPS.eoFill;
3162
- const isBoth5 = afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke;
3163
- if (isStroke5 || isFill5 || isBoth5) {
3164
- flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
3107
+ if (afterOp === OPS.stroke || afterOp === OPS.closeStroke) {
3108
+ flushPath(true);
3109
+ } else if (afterOp === OPS.fill || afterOp === OPS.eoFill || afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke) {
3110
+ flushPath(true);
3165
3111
  } else if (afterOp === OPS.endPath) {
3166
- flushPath(false, false);
3112
+ flushPath(false);
3167
3113
  }
3168
3114
  }
3169
3115
  break;
3170
3116
  }
3171
- // ── Paint Operations ──
3172
3117
  case OPS.stroke:
3173
3118
  case OPS.closeStroke:
3174
- flushPath(true, false);
3119
+ flushPath(true);
3175
3120
  break;
3176
3121
  case OPS.fill:
3177
3122
  case OPS.eoFill:
3178
- flushPath(false, true);
3179
- break;
3180
3123
  case OPS.fillStroke:
3181
3124
  case OPS.eoFillStroke:
3182
3125
  case OPS.closeFillStroke:
3183
3126
  case OPS.closeEOFillStroke:
3184
- flushPath(true, true);
3127
+ flushPath(true);
3185
3128
  break;
3186
3129
  case OPS.endPath:
3187
- flushPath(false, false);
3188
- break;
3189
- }
3190
- }
3191
- return {
3192
- horizontals: deduplicateLines(horizontals),
3193
- verticals: deduplicateLines(verticals)
3194
- };
3195
- }
3196
- function deduplicateLines(lines) {
3197
- if (lines.length <= 1) return lines;
3198
- const result = [];
3199
- const tol = COORD_MERGE_TOL;
3200
- for (const line of lines) {
3201
- let isDuplicate = false;
3202
- for (const existing of result) {
3203
- if (Math.abs(line.y1 - existing.y1) <= tol && Math.abs(line.y2 - existing.y2) <= tol && Math.abs(line.x1 - existing.x1) <= tol && Math.abs(line.x2 - existing.x2) <= tol) {
3204
- if (line.lineWidth > existing.lineWidth) {
3205
- existing.lineWidth = line.lineWidth;
3206
- }
3207
- isDuplicate = true;
3130
+ flushPath(false);
3208
3131
  break;
3209
- }
3210
3132
  }
3211
- if (!isDuplicate) result.push(line);
3212
3133
  }
3213
- return result;
3134
+ return { horizontals, verticals };
3214
3135
  }
3215
3136
  function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
3216
3137
  const dx = Math.abs(seg.x2 - seg.x1);
@@ -3229,6 +3150,55 @@ function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
3229
3150
  verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
3230
3151
  }
3231
3152
  }
3153
+ function preprocessLines(horizontals, verticals) {
3154
+ let h = horizontals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
3155
+ let v = verticals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
3156
+ h = mergeParallelLines(h, "h");
3157
+ v = mergeParallelLines(v, "v");
3158
+ return { horizontals: h, verticals: v };
3159
+ }
3160
+ function mergeParallelLines(lines, dir) {
3161
+ if (lines.length <= 1) return lines;
3162
+ const sorted = [...lines].sort((a, b) => {
3163
+ const posA = dir === "h" ? a.y1 : a.x1;
3164
+ const posB = dir === "h" ? b.y1 : b.x1;
3165
+ if (Math.abs(posA - posB) > 0.1) return posA - posB;
3166
+ return dir === "h" ? a.x1 - b.x1 : a.y1 - b.y1;
3167
+ });
3168
+ const MERGE_TOL = 3;
3169
+ const result = [sorted[0]];
3170
+ for (let i = 1; i < sorted.length; i++) {
3171
+ const prev = result[result.length - 1];
3172
+ const curr = sorted[i];
3173
+ const prevPos = dir === "h" ? prev.y1 : prev.x1;
3174
+ const currPos = dir === "h" ? curr.y1 : curr.x1;
3175
+ if (Math.abs(prevPos - currPos) <= MERGE_TOL) {
3176
+ const prevStart = dir === "h" ? prev.x1 : prev.y1;
3177
+ const prevEnd = dir === "h" ? prev.x2 : prev.y2;
3178
+ const currStart = dir === "h" ? curr.x1 : curr.y1;
3179
+ const currEnd = dir === "h" ? curr.x2 : curr.y2;
3180
+ const overlap = Math.min(prevEnd, currEnd) - Math.max(prevStart, currStart);
3181
+ const minLen = Math.min(prevEnd - prevStart, currEnd - currStart);
3182
+ if (overlap > minLen * 0.3) {
3183
+ if (dir === "h") {
3184
+ prev.x1 = Math.min(prev.x1, curr.x1);
3185
+ prev.x2 = Math.max(prev.x2, curr.x2);
3186
+ prev.y1 = (prev.y1 + curr.y1) / 2;
3187
+ prev.y2 = prev.y1;
3188
+ } else {
3189
+ prev.y1 = Math.min(prev.y1, curr.y1);
3190
+ prev.y2 = Math.max(prev.y2, curr.y2);
3191
+ prev.x1 = (prev.x1 + curr.x1) / 2;
3192
+ prev.x2 = prev.x1;
3193
+ }
3194
+ prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
3195
+ continue;
3196
+ }
3197
+ }
3198
+ result.push(curr);
3199
+ }
3200
+ return result;
3201
+ }
3232
3202
  function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
3233
3203
  const margin = 5;
3234
3204
  return {
@@ -3240,8 +3210,49 @@ function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
3240
3210
  )
3241
3211
  };
3242
3212
  }
3213
+ function buildVertices(horizontals, verticals) {
3214
+ const vertices = [];
3215
+ const tol = CONNECT_TOL;
3216
+ for (const h of horizontals) {
3217
+ for (const v of verticals) {
3218
+ if (v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol && h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol) {
3219
+ const radius = Math.max(h.lineWidth, v.lineWidth, 1);
3220
+ vertices.push({ x: v.x1, y: h.y1, radius });
3221
+ }
3222
+ }
3223
+ }
3224
+ return vertices;
3225
+ }
3226
+ function mergeVertices(vertices) {
3227
+ if (vertices.length <= 1) return vertices;
3228
+ const merged = [];
3229
+ const used = new Array(vertices.length).fill(false);
3230
+ for (let i = 0; i < vertices.length; i++) {
3231
+ if (used[i]) continue;
3232
+ let sumX = vertices[i].x, sumY = vertices[i].y;
3233
+ let maxRadius = vertices[i].radius;
3234
+ let count = 1;
3235
+ for (let j = i + 1; j < vertices.length; j++) {
3236
+ if (used[j]) continue;
3237
+ const mergeTol = VERTEX_MERGE_FACTOR * Math.max(maxRadius, vertices[j].radius);
3238
+ if (Math.abs(vertices[i].x - vertices[j].x) <= mergeTol && Math.abs(vertices[i].y - vertices[j].y) <= mergeTol) {
3239
+ sumX += vertices[j].x;
3240
+ sumY += vertices[j].y;
3241
+ maxRadius = Math.max(maxRadius, vertices[j].radius);
3242
+ count++;
3243
+ used[j] = true;
3244
+ }
3245
+ }
3246
+ merged.push({ x: sumX / count, y: sumY / count, radius: maxRadius });
3247
+ }
3248
+ return merged;
3249
+ }
3243
3250
  function buildTableGrids(horizontals, verticals) {
3244
3251
  if (horizontals.length < 2 || verticals.length < 2) return [];
3252
+ const allVertices = buildVertices(horizontals, verticals);
3253
+ const vertices = mergeVertices(allVertices);
3254
+ if (vertices.length < 4) return [];
3255
+ const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
3245
3256
  const allLines = [
3246
3257
  ...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
3247
3258
  ...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
@@ -3252,21 +3263,74 @@ function buildTableGrids(horizontals, verticals) {
3252
3263
  const hLines = group.filter((l) => l.type === "h");
3253
3264
  const vLines = group.filter((l) => l.type === "v");
3254
3265
  if (hLines.length < 2 || vLines.length < 2) continue;
3255
- const rawYs = hLines.map((l) => l.y1);
3256
- const rowYs = clusterCoordinates(rawYs).sort((a, b) => b - a);
3257
- const rawXs = vLines.map((l) => l.x1);
3258
- const colXs = clusterCoordinates(rawXs).sort((a, b) => a - b);
3266
+ let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
3267
+ for (const l of vLines) {
3268
+ if (l.x1 < gx1) gx1 = l.x1;
3269
+ if (l.x1 > gx2) gx2 = l.x1;
3270
+ }
3271
+ for (const l of hLines) {
3272
+ if (l.y1 < gy1) gy1 = l.y1;
3273
+ if (l.y1 > gy2) gy2 = l.y1;
3274
+ }
3275
+ const groupBbox = {
3276
+ x1: gx1 - CONNECT_TOL,
3277
+ y1: gy1 - CONNECT_TOL,
3278
+ x2: gx2 + CONNECT_TOL,
3279
+ y2: gy2 + CONNECT_TOL
3280
+ };
3281
+ const groupVertices = vertices.filter(
3282
+ (v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
3283
+ );
3284
+ const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
3285
+ const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
3286
+ const rawYs = [
3287
+ ...hLines.map((l) => l.y1),
3288
+ ...groupVertices.map((v) => v.y)
3289
+ ];
3290
+ const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
3291
+ const rawXs = [
3292
+ ...vLines.map((l) => l.x1),
3293
+ ...groupVertices.map((v) => v.x)
3294
+ ];
3295
+ const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
3259
3296
  if (rowYs.length < 2 || colXs.length < 2) continue;
3297
+ const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
3298
+ const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
3299
+ if (validRowYs.length < 2 || validColXs.length < 2) continue;
3260
3300
  const bbox = {
3261
- x1: colXs[0],
3262
- y1: rowYs[rowYs.length - 1],
3263
- x2: colXs[colXs.length - 1],
3264
- y2: rowYs[0]
3301
+ x1: validColXs[0],
3302
+ y1: validRowYs[validRowYs.length - 1],
3303
+ x2: validColXs[validColXs.length - 1],
3304
+ y2: validRowYs[0]
3265
3305
  };
3266
- grids.push({ rowYs, colXs, bbox });
3306
+ grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
3267
3307
  }
3268
3308
  return mergeAdjacentGrids(grids);
3269
3309
  }
3310
+ function enforceMinWidth(colXs, minWidth) {
3311
+ if (colXs.length <= 2) return colXs;
3312
+ const result = [colXs[0]];
3313
+ for (let i = 1; i < colXs.length; i++) {
3314
+ const prevX = result[result.length - 1];
3315
+ if (colXs[i] - prevX < minWidth && i < colXs.length - 1) {
3316
+ continue;
3317
+ }
3318
+ result.push(colXs[i]);
3319
+ }
3320
+ return result;
3321
+ }
3322
+ function enforceMinHeight(rowYs, minHeight) {
3323
+ if (rowYs.length <= 2) return rowYs;
3324
+ const result = [rowYs[0]];
3325
+ for (let i = 1; i < rowYs.length; i++) {
3326
+ const prevY = result[result.length - 1];
3327
+ if (prevY - rowYs[i] < minHeight && i < rowYs.length - 1) {
3328
+ continue;
3329
+ }
3330
+ result.push(rowYs[i]);
3331
+ }
3332
+ return result;
3333
+ }
3270
3334
  function mergeAdjacentGrids(grids) {
3271
3335
  if (grids.length <= 1) return grids;
3272
3336
  const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
@@ -3275,9 +3339,10 @@ function mergeAdjacentGrids(grids) {
3275
3339
  const prev = merged[merged.length - 1];
3276
3340
  const curr = sorted[i];
3277
3341
  if (prev.colXs.length === curr.colXs.length) {
3278
- const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= COORD_MERGE_TOL * 3);
3342
+ const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
3343
+ const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
3279
3344
  const verticalGap = prev.bbox.y1 - curr.bbox.y2;
3280
- if (colMatch && verticalGap >= -COORD_MERGE_TOL && verticalGap <= 20) {
3345
+ if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
3281
3346
  const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
3282
3347
  merged[merged.length - 1] = {
3283
3348
  rowYs: allRowYs,
@@ -3287,7 +3352,8 @@ function mergeAdjacentGrids(grids) {
3287
3352
  y1: Math.min(prev.bbox.y1, curr.bbox.y1),
3288
3353
  x2: Math.max(prev.bbox.x2, curr.bbox.x2),
3289
3354
  y2: Math.max(prev.bbox.y2, curr.bbox.y2)
3290
- }
3355
+ },
3356
+ vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
3291
3357
  };
3292
3358
  continue;
3293
3359
  }
@@ -3296,14 +3362,14 @@ function mergeAdjacentGrids(grids) {
3296
3362
  }
3297
3363
  return merged;
3298
3364
  }
3299
- function clusterCoordinates(values) {
3365
+ function clusterCoordinates(values, tolerance) {
3300
3366
  if (values.length === 0) return [];
3301
3367
  const sorted = [...values].sort((a, b) => a - b);
3302
3368
  const clusters = [{ sum: sorted[0], count: 1 }];
3303
3369
  for (let i = 1; i < sorted.length; i++) {
3304
3370
  const last = clusters[clusters.length - 1];
3305
3371
  const avg = last.sum / last.count;
3306
- if (Math.abs(sorted[i] - avg) <= COORD_MERGE_TOL) {
3372
+ if (Math.abs(sorted[i] - avg) <= tolerance) {
3307
3373
  last.sum += sorted[i];
3308
3374
  last.count++;
3309
3375
  } else {
@@ -3360,6 +3426,20 @@ function extractCells(grid, horizontals, verticals) {
3360
3426
  const numRows = rowYs.length - 1;
3361
3427
  const numCols = colXs.length - 1;
3362
3428
  if (numRows <= 0 || numCols <= 0) return [];
3429
+ const vBorders = Array.from(
3430
+ { length: numRows },
3431
+ (_, r) => Array.from(
3432
+ { length: numCols + 1 },
3433
+ (_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
3434
+ )
3435
+ );
3436
+ const hBorders = Array.from(
3437
+ { length: numRows + 1 },
3438
+ (_, r) => Array.from(
3439
+ { length: numCols },
3440
+ (_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
3441
+ )
3442
+ );
3363
3443
  const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
3364
3444
  const cells = [];
3365
3445
  for (let r = 0; r < numRows; r++) {
@@ -3367,18 +3447,26 @@ function extractCells(grid, horizontals, verticals) {
3367
3447
  if (occupied[r][c]) continue;
3368
3448
  let colSpan = 1;
3369
3449
  let rowSpan = 1;
3370
- while (c + colSpan < numCols) {
3371
- const borderX = colXs[c + colSpan];
3372
- const topY = rowYs[r];
3373
- const botY = rowYs[r + 1];
3374
- if (hasVerticalLine(verticals, borderX, topY, botY)) break;
3450
+ while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
3451
+ let canExpand = true;
3452
+ for (let dr = 0; dr < rowSpan; dr++) {
3453
+ if (vBorders[r + dr][c + colSpan]) {
3454
+ canExpand = false;
3455
+ break;
3456
+ }
3457
+ }
3458
+ if (!canExpand) break;
3375
3459
  colSpan++;
3376
3460
  }
3377
3461
  while (r + rowSpan < numRows) {
3378
- const borderY = rowYs[r + rowSpan];
3379
- const leftX = colXs[c];
3380
- const rightX = colXs[c + colSpan];
3381
- if (hasHorizontalLine(horizontals, borderY, leftX, rightX)) break;
3462
+ let hasLine = false;
3463
+ for (let dc = 0; dc < colSpan; dc++) {
3464
+ if (hBorders[r + rowSpan][c + dc]) {
3465
+ hasLine = true;
3466
+ break;
3467
+ }
3468
+ }
3469
+ if (hasLine) break;
3382
3470
  rowSpan++;
3383
3471
  }
3384
3472
  for (let dr = 0; dr < rowSpan; dr++) {
@@ -3402,28 +3490,30 @@ function extractCells(grid, horizontals, verticals) {
3402
3490
  }
3403
3491
  return cells;
3404
3492
  }
3405
- function hasVerticalLine(verticals, x, topY, botY) {
3406
- const tol = COORD_MERGE_TOL + 1;
3493
+ function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
3494
+ const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
3407
3495
  for (const v of verticals) {
3408
3496
  if (Math.abs(v.x1 - x) <= tol) {
3409
3497
  const cellH = Math.abs(topY - botY);
3498
+ if (cellH < 0.1) continue;
3410
3499
  const overlapTop = Math.min(v.y2, topY);
3411
3500
  const overlapBot = Math.max(v.y1, botY);
3412
3501
  const overlap = overlapTop - overlapBot;
3413
- if (overlap >= cellH * 0.5) return true;
3502
+ if (overlap >= cellH * 0.75) return true;
3414
3503
  }
3415
3504
  }
3416
3505
  return false;
3417
3506
  }
3418
- function hasHorizontalLine(horizontals, y, leftX, rightX) {
3419
- const tol = COORD_MERGE_TOL + 1;
3507
+ function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
3508
+ const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
3420
3509
  for (const h of horizontals) {
3421
3510
  if (Math.abs(h.y1 - y) <= tol) {
3422
3511
  const cellW = Math.abs(rightX - leftX);
3512
+ if (cellW < 0.1) continue;
3423
3513
  const overlapLeft = Math.max(h.x1, leftX);
3424
3514
  const overlapRight = Math.min(h.x2, rightX);
3425
3515
  const overlap = overlapRight - overlapLeft;
3426
- if (overlap >= cellW * 0.5) return true;
3516
+ if (overlap >= cellW * 0.75) return true;
3427
3517
  }
3428
3518
  }
3429
3519
  return false;
@@ -3434,23 +3524,24 @@ function mapTextToCells(items, cells) {
3434
3524
  result.set(cell, []);
3435
3525
  }
3436
3526
  for (const item of items) {
3437
- const cx = item.x + item.w / 2;
3438
- const cy = item.y;
3439
3527
  const pad = CELL_PADDING;
3440
3528
  let bestCell = null;
3441
- let bestDist = Infinity;
3529
+ let bestScore = 0;
3442
3530
  for (const cell of cells) {
3443
- if (cx >= cell.bbox.x1 - pad && cx <= cell.bbox.x2 + pad && cy >= cell.bbox.y1 - pad && cy <= cell.bbox.y2 + pad) {
3444
- const cellCx = (cell.bbox.x1 + cell.bbox.x2) / 2;
3445
- const cellCy = (cell.bbox.y1 + cell.bbox.y2) / 2;
3446
- const dist = Math.abs(cx - cellCx) + Math.abs(cy - cellCy);
3447
- if (dist < bestDist) {
3448
- bestDist = dist;
3449
- bestCell = cell;
3450
- }
3531
+ const ix1 = Math.max(item.x, cell.bbox.x1 - pad);
3532
+ const ix2 = Math.min(item.x + item.w, cell.bbox.x2 + pad);
3533
+ const iy1 = Math.max(item.y, cell.bbox.y1 - pad);
3534
+ const iy2 = Math.min(item.y + (item.h || item.fontSize), cell.bbox.y2 + pad);
3535
+ if (ix1 >= ix2 || iy1 >= iy2) continue;
3536
+ const intersectArea = (ix2 - ix1) * (iy2 - iy1);
3537
+ const itemArea = Math.max(item.w, 1) * Math.max(item.h || item.fontSize, 1);
3538
+ const score = intersectArea / itemArea;
3539
+ if (score > bestScore) {
3540
+ bestScore = score;
3541
+ bestCell = cell;
3451
3542
  }
3452
3543
  }
3453
- if (bestCell) {
3544
+ if (bestCell && bestScore > 0.3) {
3454
3545
  result.get(bestCell).push(item);
3455
3546
  }
3456
3547
  }
@@ -3477,8 +3568,13 @@ function cellTextToString(items) {
3477
3568
  const textLines = lines.map((line) => {
3478
3569
  const s = line.sort((a, b) => a.x - b.x);
3479
3570
  if (s.length === 1) return s[0].text;
3571
+ const evenSpaced = detectEvenSpacedItems(s);
3480
3572
  let result = s[0].text;
3481
3573
  for (let j = 1; j < s.length; j++) {
3574
+ if (evenSpaced[j]) {
3575
+ result += s[j].text;
3576
+ continue;
3577
+ }
3482
3578
  const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
3483
3579
  const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
3484
3580
  const prevIsKorean = /[가-힣]$/.test(result);
@@ -3493,6 +3589,57 @@ function cellTextToString(items) {
3493
3589
  }
3494
3590
  return result;
3495
3591
  });
3592
+ return mergeCellTextLines(textLines);
3593
+ }
3594
/**
 * Flags items that belong to an evenly spaced run of single-character items.
 *
 * A run is a maximal sequence of consecutive items whose text is exactly one
 * Hangul syllable or one digit. A run is split when the horizontal gap to the
 * previous item exceeds max(fontSize * 3, 30). Runs of at least three items
 * are handed to `markEvenRun`, which decides whether to flag them.
 *
 * @param {Array<{text: string, x: number, w: number, fontSize: number}>} items
 *   Items of one line; the caller passes them sorted left to right.
 * @returns {boolean[]} One flag per item, same order as `items`.
 */
function detectEvenSpacedItems(items) {
  const flags = new Array(items.length).fill(false);
  if (items.length < 3) return flags;

  // Hands the half-open range [from, to) to markEvenRun when it is long enough.
  const closeRun = (from, to) => {
    if (from >= 0 && to - from >= 3) markEvenRun(items, flags, from, to);
  };

  let runStart = -1;
  for (const [idx, item] of items.entries()) {
    const isSingleGlyph = /^[가-힣]{1}$/.test(item.text) || /^[\d]{1}$/.test(item.text);
    if (!isSingleGlyph) {
      closeRun(runStart, idx);
      runStart = -1;
      continue;
    }
    if (runStart < 0) {
      runStart = idx;
      continue;
    }
    // Inside a run: a gap that is too wide ends it and starts a new one here.
    const prev = items[idx - 1];
    const gap = item.x - (prev.x + prev.w);
    if (gap > Math.max(item.fontSize * 3, 30)) {
      closeRun(runStart, idx);
      runStart = idx;
    }
  }
  closeRun(runStart, items.length);
  return flags;
}
3623
/**
 * Marks items (start, end) of `result` as `true` when the run items[start..end)
 * is evenly spaced.
 *
 * Only strictly positive gaps are considered, and at least two of them are
 * required. The run qualifies when the smallest gap is at least 10% of the
 * first item's font size, the largest gap is at most 3x that font size, and
 * the largest/smallest ratio does not exceed 3. The first item of the run is
 * never marked.
 *
 * @param {Array<{x: number, w: number, fontSize: number}>} items
 * @param {boolean[]} result Flag array mutated in place.
 * @param {number} start Index of the first item of the run.
 * @param {number} end Index one past the last item of the run.
 * @returns {void}
 */
function markEvenRun(items, result, start, end) {
  const positiveGaps = [];
  for (let idx = start + 1; idx < end; idx += 1) {
    const prev = items[idx - 1];
    const gap = items[idx].x - (prev.x + prev.w);
    if (gap > 0) positiveGaps.push(gap);
  }
  if (positiveGaps.length < 2) return;

  const smallest = positiveGaps.reduce((best, g) => (g < best ? g : best), Infinity);
  const largest = positiveGaps.reduce((best, g) => (g > best ? g : best), -Infinity);

  // The first item's font size serves as the reference for the whole run.
  const refSize = items[start].fontSize;
  const isEven =
    smallest >= refSize * 0.1 &&
    largest <= refSize * 3 &&
    largest / Math.max(smallest, 0.1) <= 3;
  if (!isEven) return;

  for (let idx = start + 1; idx < end; idx += 1) {
    result[idx] = true;
  }
}
3642
+ function mergeCellTextLines(textLines) {
3496
3643
  if (textLines.length <= 1) return textLines[0] || "";
3497
3644
  const merged = [textLines[0]];
3498
3645
  for (let i = 1; i < textLines.length; i++) {
@@ -3518,24 +3665,172 @@ var Y_TOL = 3;
3518
3665
  var COL_CLUSTER_TOL = 15;
3519
3666
  var MIN_ROWS = 3;
3520
3667
  var MIN_COLS = 2;
3521
- var MIN_GAP_FACTOR = 1.5;
3522
- var MIN_COL_FILL_RATIO = 0.3;
3668
+ var MIN_GAP_FACTOR = 2;
3669
+ var MIN_GAP_ABSOLUTE = 20;
3670
+ var MIN_COL_FILL_RATIO = 0.4;
3523
3671
  function detectClusterTables(items, pageNum) {
3524
3672
  if (items.length < MIN_ROWS * MIN_COLS) return [];
3525
- const rows = groupByBaseline(items);
3673
+ const { merged, originMap } = mergeEvenSpacedClusters(items);
3674
+ const rows = groupByBaseline(merged);
3526
3675
  if (rows.length < MIN_ROWS) return [];
3527
- const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
3528
- if (suspiciousRows.length < MIN_ROWS) return [];
3529
- const columns = extractColumnClusters(suspiciousRows);
3530
- if (columns.length < MIN_COLS) return [];
3531
- const tableRegions = findTableRegions(rows, columns);
3532
3676
  const results = [];
3533
- for (const region of tableRegions) {
3534
- const table = buildClusterTable(region.rows, columns, pageNum);
3535
- if (table) results.push(table);
3677
+ const headerResult = detectHeaderRow(rows);
3678
+ if (headerResult) {
3679
+ const { columns, headerIdx } = headerResult;
3680
+ const headerRow = rows[headerIdx];
3681
+ const headerItems = [...headerRow.items].sort((a, b) => a.x - b.x);
3682
+ const headerAndBelow = rows.slice(headerIdx);
3683
+ const mergedRows = mergeMultiLineRows(headerAndBelow, columns);
3684
+ const tableRegions = findTableRegionsByHeader(mergedRows, columns, headerItems);
3685
+ for (const region of tableRegions) {
3686
+ const table = buildClusterTable(region.rows, columns, pageNum);
3687
+ if (table) {
3688
+ expandUsedItems(table.usedItems, originMap);
3689
+ results.push(table);
3690
+ }
3691
+ }
3692
+ }
3693
+ if (results.length === 0) {
3694
+ const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
3695
+ if (suspiciousRows.length >= MIN_ROWS) {
3696
+ const columns = extractColumnClusters(suspiciousRows);
3697
+ if (columns.length >= MIN_COLS) {
3698
+ const tableRegions = findTableRegions(rows, columns);
3699
+ for (const region of tableRegions) {
3700
+ const mergedRows = mergeMultiLineRows(region.rows, columns);
3701
+ const table = buildClusterTable(mergedRows, columns, pageNum);
3702
+ if (table) {
3703
+ expandUsedItems(table.usedItems, originMap);
3704
+ results.push(table);
3705
+ }
3706
+ }
3707
+ }
3708
+ }
3536
3709
  }
3537
3710
  return results;
3538
3711
  }
3712
/**
 * Collapses evenly spaced runs of single-character items into one item each.
 *
 * Items are grouped into baseline rows and scanned left to right. A run is a
 * sequence of items whose text is a single Hangul syllable or digit and whose
 * gap to the previous item lies within [0.1, 3] x font size. A run of three or
 * more items is merged when its smallest gap is positive and the
 * largest/smallest gap ratio is at most 3. All other items are passed through
 * unchanged.
 *
 * @param {Array<object>} items Text items with text, x, y, w, h, fontSize, fontName.
 * @returns {{merged: Array<object>, originMap: Map<object, Array<object>>}}
 *   `merged` is the resulting item list (row by row, left to right);
 *   `originMap` maps each synthesized item to the items it replaced.
 */
function mergeEvenSpacedClusters(items) {
  const SINGLE_GLYPH = /^[가-힣\d]$/;
  const gapBefore = (line, k) => line[k].x - (line[k - 1].x + line[k - 1].w);

  // True when the gaps inside line[from..to) are positive and within a 3x spread.
  const hasUniformGaps = (line, from, to) => {
    let minG = Infinity;
    let maxG = -Infinity;
    for (let k = from + 1; k < to; k += 1) {
      const gap = gapBefore(line, k);
      if (gap < minG) minG = gap;
      if (gap > maxG) maxG = gap;
    }
    return minG > 0 && maxG / minG <= 3;
  };

  const originMap = /* @__PURE__ */ new Map();
  const merged = [];
  for (const { items: rowItems } of groupByBaseline(items)) {
    const line = [...rowItems].sort((a, b) => a.x - b.x);
    let pos = 0;
    while (pos < line.length) {
      // Extend the candidate run [pos, stop) while glyphs stay close together.
      let stop = pos + 1;
      if (SINGLE_GLYPH.test(line[pos].text)) {
        while (stop < line.length && SINGLE_GLYPH.test(line[stop].text)) {
          const gap = gapBefore(line, stop);
          const size = line[stop].fontSize;
          if (gap < size * 0.1 || gap > size * 3) break;
          stop += 1;
        }
      }

      if (stop - pos >= 3 && hasUniformGaps(line, pos, stop)) {
        const run = line.slice(pos, stop);
        const head = run[0];
        const tail = run[run.length - 1];
        const combined = {
          text: run.map((part) => part.text).join(""),
          x: head.x,
          y: head.y,
          w: tail.x + tail.w - head.x,
          h: head.h,
          fontSize: head.fontSize,
          fontName: head.fontName
        };
        originMap.set(combined, run);
        merged.push(combined);
        pos = stop;
      } else {
        merged.push(line[pos]);
        pos += 1;
      }
    }
  }
  return { merged, originMap };
}
3764
/**
 * Adds to `usedItems` the source items behind every entry that has a mapping
 * in `originMap`. Entries already in the set are kept.
 *
 * @param {Set<object>} usedItems Set mutated in place.
 * @param {Map<object, Iterable<object>>} originMap Entry -> items it stands for.
 * @returns {void}
 */
function expandUsedItems(usedItems, originMap) {
  // Collect first, then add, so the set is not grown while it is being walked.
  const pending = [];
  usedItems.forEach((entry) => {
    const sources = originMap.get(entry);
    if (sources) pending.push(...sources);
  });
  pending.forEach((source) => usedItems.add(source));
}
3772
/**
 * Searches the rows, top of the list first, for one that looks like a table
 * header and is followed by enough rows aligned with it.
 *
 * A candidate row must: hold between MIN_COLS and 6 items; have no item text
 * longer than 8 characters; contain at least one Hangul syllable; have no
 * item starting with a bullet/marker character; span at least 40% of the
 * horizontal extent of all items; and contain at least one gap of 2.5x its
 * average font size. It is accepted when at least MIN_ROWS of the following
 * rows match MIN_COLS or more of its column ranges (counting stops once
 * MIN_ROWS + 2 matches are seen).
 *
 * @param {Array<{items: Array<object>}>} rows Baseline rows.
 * @returns {{columns: Array<{x: number, count: number}>, headerIdx: number} | null}
 *   Column anchors (header item x positions) and the header's row index, or
 *   null when no row qualifies.
 */
function detectHeaderRow(rows) {
  // Horizontal extent covered by every item on the page.
  let leftEdge = Infinity;
  let rightEdge = -Infinity;
  let itemCount = 0;
  for (const { items } of rows) {
    for (const item of items) {
      itemCount += 1;
      if (item.x < leftEdge) leftEdge = item.x;
      const itemRight = item.x + item.w;
      if (itemRight > rightEdge) rightEdge = itemRight;
    }
  }
  if (itemCount === 0) return null;
  const pageSpan = rightEdge - leftEdge;
  if (pageSpan <= 0) return null;

  // Cheap textual checks a header row has to pass.
  const hasHeaderShape = (cells) => {
    if (cells.length < MIN_COLS || cells.length > 6) return false;
    if (cells.some((cell) => cell.text.length > 8)) return false;
    if (!cells.some((cell) => /[가-힣]/.test(cell.text))) return false;
    return !cells.some((cell) => /^[□■○●·※▶▷◆◇\-]/.test(cell.text));
  };

  for (let ri = 0; ri < rows.length; ri += 1) {
    if (!hasHeaderShape(rows[ri].items)) continue;

    const ordered = [...rows[ri].items].sort((a, b) => a.x - b.x);
    const tail = ordered[ordered.length - 1];
    const xSpan = tail.x + tail.w - ordered[0].x;
    if (xSpan / pageSpan < 0.4) continue;

    const avgFs = ordered.reduce((sum, cell) => sum + cell.fontSize, 0) / ordered.length;
    const hasLargeGap = ordered.some(
      (cell, k) => k > 0 && cell.x - (ordered[k - 1].x + ordered[k - 1].w) >= avgFs * 2.5
    );
    if (!hasLargeGap) continue;

    const columns = ordered.map((cell) => ({ x: cell.x, count: 0 }));
    let matchCount = 0;
    for (let j = ri + 1; j < rows.length && matchCount < MIN_ROWS + 2; j += 1) {
      if (countMatchedColumnsRange(rows[j], columns, ordered) >= MIN_COLS) matchCount += 1;
    }
    if (matchCount >= MIN_ROWS) return { columns, headerIdx: ri };
  }
  return null;
}
3813
/**
 * Folds continuation lines into the row above them.
 *
 * A row is folded into the previously emitted row when all of the following
 * hold: its y distance to that row is below 1.8x the average font size of all
 * items; it has at most two items; and it either matches fewer than MIN_COLS
 * columns or consists of a single item. The folded row keeps the y of the
 * row it was merged into.
 *
 * @param {Array<{y: number, items: Array<object>}>} rows
 * @param {Array<object>} columns Column descriptors passed to `countMatchedColumns`.
 * @returns {Array<{y: number, items: Array<object>}>} The input array itself
 *   when it has at most one row, otherwise a new array.
 */
function mergeMultiLineRows(rows, columns) {
  if (rows.length <= 1) return rows;

  let sizeTotal = 0;
  let sizeCount = 0;
  for (const { items } of rows) {
    for (const item of items) {
      sizeTotal += item.fontSize;
      sizeCount += 1;
    }
  }
  const avgFontSize = sizeCount > 0 ? sizeTotal / sizeCount : 12;
  const maxLineGap = avgFontSize * 1.8;

  const [firstRow, ...laterRows] = rows;
  const out = [firstRow];
  for (const curr of laterRows) {
    const prev = out[out.length - 1];
    const matchedCols = countMatchedColumns(curr, columns);
    const isClose = Math.abs(prev.y - curr.y) < maxLineGap;
    const isSparse = curr.items.length <= 2;
    const isContinuation = matchedCols < MIN_COLS || curr.items.length === 1;
    if (isClose && isSparse && isContinuation) {
      out[out.length - 1] = { y: prev.y, items: [...prev.items, ...curr.items] };
    } else {
      out.push(curr);
    }
  }
  return out;
}
3539
3834
  function groupByBaseline(items) {
3540
3835
  if (items.length === 0) return [];
3541
3836
  const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
@@ -3557,8 +3852,9 @@ function groupByBaseline(items) {
3557
3852
  function hasSuspiciousGaps(row) {
3558
3853
  if (row.items.length < 2) return false;
3559
3854
  const sorted = [...row.items].sort((a, b) => a.x - b.x);
3855
+ if (sorted.length === 2 && sorted[1].text.length > 20) return false;
3560
3856
  const avgFontSize = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
3561
- const minGap = avgFontSize * MIN_GAP_FACTOR;
3857
+ const minGap = Math.max(avgFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
3562
3858
  for (let i = 1; i < sorted.length; i++) {
3563
3859
  const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
3564
3860
  if (gap >= minGap) return true;
@@ -3585,6 +3881,41 @@ function extractColumnClusters(rows) {
3585
3881
  const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
3586
3882
  return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
3587
3883
  }
3884
/**
 * Splits rows into table regions using the header's column ranges.
 *
 * A row that matches at least MIN_COLS header ranges extends the current
 * region. A non-matching row is still tolerated inside a region when it has
 * at most two items or when it directly follows a matching row. Any other row
 * closes the region. On close, trailing non-matching rows are dropped and the
 * region is kept only if at least MIN_ROWS rows remain.
 *
 * @param {Array<{items: Array<object>}>} allRows Rows to scan, in order.
 * @param {Array<object>} columns Forwarded to `countMatchedColumnsRange`.
 * @param {Array<object>} headerItems Header items sorted by x.
 * @returns {Array<{rows: Array<object>}>} Detected regions.
 */
function findTableRegionsByHeader(allRows, columns, headerItems) {
  const fitsHeader = (row) => countMatchedColumnsRange(row, columns, headerItems) >= MIN_COLS;

  const regions = [];
  let pending = [];
  let missStreak = 0;

  // Trims trailing non-matching rows, stores the region if long enough, resets.
  const flush = () => {
    while (pending.length > 0 && !fitsHeader(pending[pending.length - 1])) {
      pending.pop();
    }
    if (pending.length >= MIN_ROWS) regions.push({ rows: pending });
    pending = [];
    missStreak = 0;
  };

  for (const row of allRows) {
    if (fitsHeader(row)) {
      pending.push(row);
      missStreak = 0;
    } else if (pending.length > 0 && (row.items.length <= 2 || missStreak === 0)) {
      pending.push(row);
      missStreak += 1;
    } else {
      flush();
    }
  }
  flush();
  return regions;
}
3588
3919
  function findTableRegions(allRows, columns) {
3589
3920
  const regions = [];
3590
3921
  let currentRegion = [];
@@ -3620,18 +3951,81 @@ function countMatchedColumns(row, columns) {
3620
3951
  }
3621
3952
  return matched.size;
3622
3953
  }
3623
- function assignToColumn(item, columns) {
3624
- const MAX_DIST = COL_CLUSTER_TOL * 3;
3625
- let bestCol = -1;
3626
- let bestDist = Infinity;
3627
- for (let ci = 0; ci < columns.length; ci++) {
3628
- const dist = Math.abs(item.x - columns[ci].x);
3629
- if (dist < bestDist) {
3630
- bestDist = dist;
3631
- bestCol = ci;
3954
/**
 * Counts how many distinct header column ranges are hit by the items of a row.
 *
 * Each header item defines a half-open x range [left, right). Interior
 * boundaries sit halfway between the right edge of one header item and the
 * left edge of the next. The outermost ranges are open-ended on both sides
 * (-Infinity on the left, +Infinity on the right), so an item positioned
 * left of the origin still falls into the first column, mirroring how items
 * far to the right fall into the last one.
 *
 * @param {{items: Array<{x: number}>}} row Row whose items are bucketed by `x`.
 * @param {Array<object>} columns Unused; kept so the signature stays compatible
 *   with existing callers.
 * @param {Array<{x: number, w: number}>} headerItems Header items sorted by x.
 * @returns {number} Number of distinct columns containing at least one item.
 */
function countMatchedColumnsRange(row, columns, headerItems) {
  const lastIdx = headerItems.length - 1;
  const boundaries = headerItems.map((header, ci) => {
    const prev = headerItems[ci - 1];
    const next = headerItems[ci + 1];
    return {
      left: ci === 0 ? -Infinity : (prev.x + prev.w + header.x) / 2,
      right: ci === lastIdx ? Infinity : (header.x + header.w + next.x) / 2
    };
  });

  const matched = /* @__PURE__ */ new Set();
  for (const item of row.items) {
    const ci = boundaries.findIndex((b) => item.x >= b.left && item.x < b.right);
    if (ci >= 0) matched.add(ci);
  }
  return matched.size;
}
3972
/**
 * Splits the items of one row into groups at its widest gaps and assigns each
 * group to a distinct column.
 *
 * Steps:
 *  1. Sort items by x and measure the gap before each item.
 *  2. Keep gaps of at least the threshold (12 when the row has at most
 *     colCount + 1 items, otherwise max(2.5 x median gap, 12)), retaining only
 *     the colCount - 1 widest ones; they become the group boundaries.
 *  3. Greedily pair groups with columns by ascending distance between the
 *     group's horizontal midpoint and the column's `x`, using each group and
 *     each column at most once.
 *
 * The effective column count is clamped to `columns.length`, so a `numCols`
 * larger than the available columns can no longer yield NaN distances or
 * out-of-range column indices, and a column count of zero yields no
 * assignments. Because step 2 never produces more groups than columns,
 * step 3 always places every group.
 *
 * @param {Array<{x: number, w: number}>} items Items of a single row.
 * @param {Array<{x: number}>} columns Column anchors.
 * @param {number} [numCols=columns.length] Number of leading columns to use.
 * @returns {Array<{col: number, items: Array<object>}>} One entry per group,
 *   ordered by ascending group-to-column distance.
 */
function assignRowItems(items, columns, numCols = columns.length) {
  const colCount = Math.floor(Math.min(numCols, columns.length));
  if (items.length === 0 || !(colCount >= 1)) return [];

  const sorted = [...items].sort((a, b) => a.x - b.x);
  const colCenters = columns.slice(0, colCount).map((c) => c.x);

  // Gap in front of every item except the first.
  const gaps = sorted.slice(1).map((item, k) => ({
    idx: k + 1,
    size: item.x - (sorted[k].x + sorted[k].w)
  }));
  const gapSizes = gaps.map((g) => g.size).sort((a, b) => a - b);
  const medianGap = gapSizes.length > 0 ? gapSizes[Math.floor(gapSizes.length / 2)] : 0;
  const gapThreshold = sorted.length <= colCount + 1 ? 12 : Math.max(medianGap * 2.5, 12);

  // Widest qualifying gaps, restored to left-to-right order.
  const splitPoints = gaps
    .filter((g) => g.size >= gapThreshold)
    .sort((a, b) => b.size - a.size)
    .slice(0, colCount - 1)
    .sort((a, b) => a.idx - b.idx);

  const groups = [];
  let start = 0;
  for (const { idx } of splitPoints) {
    groups.push(sorted.slice(start, idx));
    start = idx;
  }
  groups.push(sorted.slice(start));

  const groupCenters = groups.map((group) => {
    let minX = Infinity;
    let maxX = -Infinity;
    for (const item of group) {
      if (item.x < minX) minX = item.x;
      const right = item.x + item.w;
      if (right > maxX) maxX = right;
    }
    return (minX + maxX) / 2;
  });

  // Every (group, column) pairing, cheapest first.
  const pairings = [];
  for (let gi = 0; gi < groups.length; gi += 1) {
    for (let ci = 0; ci < colCount; ci += 1) {
      pairings.push({ gi, ci, dist: Math.abs(groupCenters[gi] - colCenters[ci]) });
    }
  }
  pairings.sort((a, b) => a.dist - b.dist);

  const result = [];
  const assignedGroups = /* @__PURE__ */ new Set();
  const usedCols = /* @__PURE__ */ new Set();
  for (const { gi, ci } of pairings) {
    if (assignedGroups.has(gi) || usedCols.has(ci)) continue;
    result.push({ col: ci, items: groups[gi] });
    assignedGroups.add(gi);
    usedCols.add(ci);
  }
  return result;
}
3636
4030
  function buildClusterTable(rows, columns, pageNum) {
3637
4031
  const numCols = columns.length;
@@ -3649,12 +4043,12 @@ function buildClusterTable(rows, columns, pageNum) {
3649
4043
  usedItems.add(row.items[0]);
3650
4044
  continue;
3651
4045
  }
3652
- for (const item of row.items) {
3653
- const col = assignToColumn(item, columns);
3654
- if (col < 0) continue;
4046
+ const assignments = assignRowItems(row.items, columns, numCols);
4047
+ for (const { col, items } of assignments) {
4048
+ const text = items.map((i) => i.text).join(" ");
3655
4049
  const existing = cells[r][col].text;
3656
- cells[r][col].text = existing ? existing + " " + item.text : item.text;
3657
- usedItems.add(item);
4050
+ cells[r][col].text = existing ? existing + " " + text : text;
4051
+ for (const item of items) usedItems.add(item);
3658
4052
  }
3659
4053
  }
3660
4054
  let emptyRows = 0;
@@ -3666,11 +4060,48 @@ function buildClusterTable(rows, columns, pageNum) {
3666
4060
  const hasValue = cells.some((row) => row[c].text !== "");
3667
4061
  if (!hasValue) return null;
3668
4062
  }
4063
+ for (let r = numRows - 1; r >= 1; r--) {
4064
+ const nonEmptyCols = cells[r].filter((c) => c.text.trim()).length;
4065
+ if (nonEmptyCols !== 1) continue;
4066
+ if (cells[r][0].text.trim() !== "") continue;
4067
+ const contentText = cells[r].find((c) => c.text.trim())?.text.trim() || "";
4068
+ if (/^[○●▶\-·]/.test(contentText)) continue;
4069
+ for (let pr = r - 1; pr >= 0; pr--) {
4070
+ if (cells[pr].some((c) => c.text.trim())) {
4071
+ for (let c = 0; c < numCols; c++) {
4072
+ const prev = cells[pr][c].text.trim();
4073
+ const curr = cells[r][c].text.trim();
4074
+ if (curr) cells[pr][c].text = prev ? prev + " " + curr : curr;
4075
+ }
4076
+ for (let c = 0; c < numCols; c++) cells[r][c].text = "";
4077
+ break;
4078
+ }
4079
+ }
4080
+ }
4081
+ for (let r = 0; r < cells.length - 1; r++) {
4082
+ const row = cells[r];
4083
+ const hasCol0 = row[0].text.trim() !== "";
4084
+ const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
4085
+ const midEmpty = row.slice(1, numCols - 1).every((c) => c.text.trim() === "");
4086
+ if (hasCol0 && hasColLast && midEmpty) {
4087
+ const next = cells[r + 1];
4088
+ if (next[0].text.trim() === "" && next.some((c) => c.text.trim())) {
4089
+ for (let c = 1; c < numCols; c++) {
4090
+ const curr = next[c].text.trim();
4091
+ if (curr) row[c].text = row[c].text.trim() ? row[c].text.trim() + " " + curr : curr;
4092
+ }
4093
+ for (let c = 0; c < numCols; c++) next[c].text = "";
4094
+ }
4095
+ }
4096
+ }
4097
+ const filteredCells = cells.filter((row) => row.some((c) => c.text.trim()));
4098
+ const finalRowCount = filteredCells.length;
4099
+ if (finalRowCount < MIN_ROWS) return null;
3669
4100
  const irTable = {
3670
- rows: numRows,
4101
+ rows: finalRowCount,
3671
4102
  cols: numCols,
3672
- cells,
3673
- hasHeader: numRows > 1
4103
+ cells: filteredCells,
4104
+ hasHeader: finalRowCount > 1
3674
4105
  };
3675
4106
  const allItems = rows.flatMap((r) => r.items);
3676
4107
  let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
@@ -3747,7 +4178,7 @@ async function parsePdfDocument(buffer, options) {
3747
4178
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
3748
4179
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
3749
4180
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
3750
- const allFontSizes = [];
4181
+ const fontSizeFreq = /* @__PURE__ */ new Map();
3751
4182
  const pageHeights = /* @__PURE__ */ new Map();
3752
4183
  let parsedPages = 0;
3753
4184
  for (let i = 1; i <= effectivePageCount; i++) {
@@ -3764,7 +4195,7 @@ async function parsePdfDocument(buffer, options) {
3764
4195
  warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
3765
4196
  }
3766
4197
  for (const item of visible) {
3767
- if (item.fontSize > 0) allFontSizes.push(item.fontSize);
4198
+ if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
3768
4199
  }
3769
4200
  const opList = await page.getOperatorList();
3770
4201
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
@@ -3803,10 +4234,9 @@ async function parsePdfDocument(buffer, options) {
3803
4234
  blocks.splice(removed[ri], 1);
3804
4235
  }
3805
4236
  }
3806
- const medianFontSize = computeMedianFontSize(allFontSizes);
4237
+ const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
3807
4238
  if (medianFontSize > 0) {
3808
4239
  detectHeadings(blocks, medianFontSize);
3809
- mergeAdjacentHeadings(blocks);
3810
4240
  }
3811
4241
  detectMarkerHeadings(blocks);
3812
4242
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
@@ -3857,11 +4287,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
3857
4287
  }
3858
4288
  return { visible, hiddenCount };
3859
4289
  }
3860
- function computeMedianFontSize(sizes) {
3861
- if (sizes.length === 0) return 0;
3862
- const sorted = [...sizes].sort((a, b) => a - b);
3863
- const mid = Math.floor(sorted.length / 2);
3864
- return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
4290
/**
 * Computes the median font size from a frequency table.
 *
 * The result equals the median of the expanded sample (each size repeated
 * `count` times): the middle value for an odd sample count, and the mean of
 * the two middle values for an even count.
 *
 * @param {Map<number, number>} freq Font size -> number of occurrences.
 * @returns {number} Median font size, or 0 when `freq` is empty.
 */
function computeMedianFontSizeFromFreq(freq) {
  if (freq.size === 0) return 0;
  const entries = [...freq.entries()].sort((a, b) => a[0] - b[0]);
  const total = entries.reduce((sum, [, count]) => sum + count, 0);

  // Zero-based ranks of the two middle samples (identical when total is odd).
  const upperRank = Math.floor(total / 2);
  const lowerRank = total % 2 === 0 ? upperRank - 1 : upperRank;

  let lowerSize;
  let seen = 0;
  for (const [size, count] of entries) {
    seen += count;
    if (lowerSize === undefined && seen > lowerRank) lowerSize = size;
    if (seen > upperRank) return (lowerSize + size) / 2;
  }
  // Only reachable when every count is zero; fall back to the largest size.
  return entries[entries.length - 1][0];
}
3866
4303
  function detectHeadings(blocks, medianFontSize) {
3867
4304
  for (const block of blocks) {
@@ -3881,220 +4318,27 @@ function detectHeadings(blocks, medianFontSize) {
3881
4318
  }
3882
4319
  }
3883
4320
  }
3884
- function mergeAdjacentHeadings(blocks) {
3885
- let i = 0;
3886
- while (i < blocks.length - 1) {
3887
- const curr = blocks[i];
3888
- const next = blocks[i + 1];
3889
- if (curr.type !== "heading" || next.type !== "heading") {
3890
- i++;
3891
- continue;
3892
- }
3893
- if (!curr.bbox || !next.bbox || !curr.text || !next.text) {
3894
- i++;
3895
- continue;
3896
- }
3897
- const currBaseline = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
3898
- const nextBaseline = next.bbox.y + (next.style?.fontSize || next.bbox.height);
3899
- const yDiff = Math.abs(currBaseline - nextBaseline);
3900
- const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
3901
- const sameY = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
3902
- const sameLevel = curr.level === next.level;
3903
- if (sameY && sameLevel) {
3904
- const currX = curr.bbox.x;
3905
- const nextX = next.bbox.x;
3906
- if (currX <= nextX) {
3907
- curr.text = curr.text + " " + next.text;
3908
- } else {
3909
- curr.text = next.text + " " + curr.text;
3910
- }
3911
- curr.bbox = {
3912
- page: curr.bbox.page,
3913
- x: Math.min(curr.bbox.x, next.bbox.x),
3914
- y: Math.min(curr.bbox.y, next.bbox.y),
3915
- width: Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width) - Math.min(curr.bbox.x, next.bbox.x),
3916
- height: Math.max(curr.bbox.height, next.bbox.height)
3917
- };
3918
- blocks.splice(i + 1, 1);
3919
- } else {
3920
- i++;
3921
- }
3922
- }
3923
- }
3924
4321
  function collapseEvenSpacing(text) {
3925
4322
  const tokens = text.split(" ");
3926
4323
  const singleCharCount = tokens.filter((t) => t.length === 1).length;
3927
4324
  if (tokens.length >= 3 && singleCharCount / tokens.length >= 0.7) {
3928
4325
  return tokens.join("");
3929
4326
  }
3930
- return text;
3931
- }
3932
- function buildXyCutBlocks(items, pageNum) {
3933
- const allY = items.map((i) => i.y);
3934
- const pageHeight = Math.max(...allY) - Math.min(...allY);
3935
- const gapThreshold = Math.max(15, pageHeight * 0.03);
3936
- const orderedGroups = xyCutOrder(items, gapThreshold);
3937
- const blocks = [];
3938
- for (const group of orderedGroups) {
3939
- if (group.length === 0) continue;
3940
- const yLines = groupByY(group);
3941
- for (const line of yLines) {
3942
- const text = mergeLineSimple(line);
3943
- if (!text.trim()) continue;
3944
- blocks.push({
3945
- type: "paragraph",
3946
- text,
3947
- pageNumber: pageNum,
3948
- bbox: computeBBox(line, pageNum),
3949
- style: dominantStyle(line)
3950
- });
3951
- }
3952
- }
3953
- return blocks.length > 0 ? blocks : null;
3954
- }
3955
- function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
3956
- const totalCells = table.cells.reduce((sum, row) => sum + row.filter((c) => c.text.trim()).length, 0);
3957
- const totalTextLines = table.cells.reduce((sum, row) => sum + row.reduce((s, c) => s + (c.text.trim() ? c.text.split("\n").length : 0), 0), 0);
3958
- const isUnderSegmented = table.rows === 1 && table.cols === 1 || totalCells <= 2 && totalTextLines >= 8 || totalCells <= 2 && items.length >= 6;
3959
- if (!isUnderSegmented) return null;
3960
- if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);
3961
- const directTable = buildTableFromTextLayout(items, pageNum, bbox);
3962
- if (directTable) return directTable;
3963
- const clusterItems = items.map((i) => ({
3964
- text: i.text,
3965
- x: i.x,
3966
- y: i.y,
3967
- w: i.w,
3968
- h: i.h,
3969
- fontSize: i.fontSize,
3970
- fontName: i.fontName
3971
- }));
3972
- const clusterResults = detectClusterTables(clusterItems, pageNum);
3973
- if (clusterResults.length > 0) {
3974
- const blocks = [];
3975
- const ciToIdx = /* @__PURE__ */ new Map();
3976
- for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
3977
- const usedIndices = /* @__PURE__ */ new Set();
3978
- for (const cr of clusterResults) {
3979
- for (const ci of cr.usedItems) {
3980
- const idx = ciToIdx.get(ci);
3981
- if (idx !== void 0) usedIndices.add(idx);
3982
- }
3983
- blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
3984
- }
3985
- const remaining = items.filter((_, idx) => !usedIndices.has(idx));
3986
- for (const item of remaining) {
3987
- if (!item.text.trim()) continue;
3988
- blocks.push({
3989
- type: "paragraph",
3990
- text: item.text,
3991
- pageNumber: pageNum,
3992
- bbox: computeBBox([item], pageNum),
3993
- style: { fontSize: item.fontSize, fontName: item.fontName }
3994
- });
3995
- }
3996
- blocks.sort((a, b) => {
3997
- const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
3998
- const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
3999
- return by - ay;
4000
- });
4001
- return blocks.length > 0 ? blocks : null;
4002
- }
4003
- return null;
4004
- }
4005
- function buildTableFromTextLayout(items, pageNum, bbox) {
4006
- if (items.length < 4) return null;
4007
- const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
4008
- const yTol = 3;
4009
- const rows = [];
4010
- let curRow = [sorted[0]];
4011
- let curY = sorted[0].y;
4012
- for (let i = 1; i < sorted.length; i++) {
4013
- if (Math.abs(sorted[i].y - curY) <= yTol) {
4014
- curRow.push(sorted[i]);
4015
- } else {
4016
- rows.push(curRow);
4017
- curRow = [sorted[i]];
4018
- curY = sorted[i].y;
4019
- }
4020
- }
4021
- rows.push(curRow);
4022
- if (rows.length < 2) return null;
4023
- const gapPositions = [];
4024
- for (const row of rows) {
4025
- if (row.length < 2) continue;
4026
- const sortedX = [...row].sort((a, b) => a.x - b.x);
4027
- const avgFs = sortedX.reduce((s, i) => s + i.fontSize, 0) / sortedX.length;
4028
- for (let j = 1; j < sortedX.length; j++) {
4029
- const gap = sortedX[j].x - (sortedX[j - 1].x + sortedX[j - 1].w);
4030
- if (gap >= avgFs * 1.5) {
4031
- gapPositions.push(sortedX[j - 1].x + sortedX[j - 1].w + gap / 2);
4032
- }
4033
- }
4034
- }
4035
- if (gapPositions.length < 2) return null;
4036
- gapPositions.sort((a, b) => a - b);
4037
- const colBoundaries = [];
4038
- let clusterSum = gapPositions[0], clusterCount = 1;
4039
- for (let i = 1; i < gapPositions.length; i++) {
4040
- const avg = clusterSum / clusterCount;
4041
- if (Math.abs(gapPositions[i] - avg) <= 15) {
4042
- clusterSum += gapPositions[i];
4043
- clusterCount++;
4044
- } else {
4045
- if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
4046
- clusterSum = gapPositions[i];
4047
- clusterCount = 1;
4048
- }
4049
- }
4050
- if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
4051
- if (colBoundaries.length === 0) return null;
4052
- const numCols = colBoundaries.length + 1;
4053
- const tableRows = [];
4054
- for (const row of rows) {
4055
- const cells = Array(numCols).fill("");
4056
- const sortedX = [...row].sort((a, b) => a.x - b.x);
4057
- for (const item of sortedX) {
4058
- const cx = item.x + item.w / 2;
4059
- let col = 0;
4060
- for (let b = 0; b < colBoundaries.length; b++) {
4061
- if (cx > colBoundaries[b]) col = b + 1;
4062
- }
4063
- cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
4064
- }
4065
- if (cells[0].trim() === "" && tableRows.length > 0) {
4066
- const prevCells = tableRows[tableRows.length - 1].cells;
4067
- for (let c = 0; c < numCols; c++) {
4068
- if (cells[c].trim()) {
4069
- prevCells[c] = prevCells[c] ? prevCells[c] + " " + cells[c].trim() : cells[c].trim();
4070
- }
4071
- }
4072
- } else {
4073
- tableRows.push({ cells });
4074
- }
4075
- }
4076
- if (tableRows.length < 2) return null;
4077
- const nonEmptyCount = tableRows.reduce((sum, r) => sum + r.cells.filter((c) => c.trim()).length, 0);
4078
- const totalCount = tableRows.length * numCols;
4079
- if (nonEmptyCount < totalCount * 0.3) return null;
4080
- const irCells = tableRows.map(
4081
- (r) => r.cells.map((text, colIdx) => {
4082
- let cleaned = text.trim();
4083
- if (colIdx > 0) cleaned = cleaned.replace(/^[•○·\-]\s*/, "");
4084
- return { text: cleaned, colSpan: 1, rowSpan: 1 };
4085
- })
4327
+ return text.replace(
4328
+ /(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
4329
+ (match) => match.replace(/ /g, "")
4086
4330
  );
4087
- const irTable = {
4088
- rows: tableRows.length,
4089
- cols: numCols,
4090
- cells: irCells,
4091
- hasHeader: tableRows.length > 1
4092
- };
4093
- return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
4094
4331
  }
4095
4332
  function shouldDemoteTable(table) {
4096
4333
  const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
4097
4334
  const allText = allCells.join(" ");
4335
+ if (table.rows <= 3 && table.cols <= 3) {
4336
+ const totalCells2 = table.rows * table.cols;
4337
+ const emptyCells2 = totalCells2 - allCells.length;
4338
+ if (emptyCells2 >= totalCells2 * 0.3) return true;
4339
+ if (/[□■◆○●▶ㅇ]/.test(allText)) return true;
4340
+ if (/<[^>]+>/.test(allText)) return true;
4341
+ }
4098
4342
  if (allText.length > 200) return false;
4099
4343
  if (/[□■◆○●▶]/.test(allText) && table.rows <= 3) return true;
4100
4344
  const totalCells = table.rows * table.cols;
@@ -4138,32 +4382,6 @@ function detectMarkerHeadings(blocks) {
4138
4382
  }
4139
4383
  }
4140
4384
  }
4141
- function hasMultiColumnLayout(items) {
4142
- if (items.length < 30) return false;
4143
- const sorted = [...items].sort((a, b) => a.x - b.x);
4144
- const minX = sorted[0].x;
4145
- let maxX = minX;
4146
- for (const i of sorted) if (i.x + i.w > maxX) maxX = i.x + i.w;
4147
- const pageWidth = maxX - minX;
4148
- if (pageWidth < 200) return false;
4149
- let bestGap = 0;
4150
- let bestSplit = 0;
4151
- for (let j = 1; j < sorted.length; j++) {
4152
- const gap = sorted[j].x - (sorted[j - 1].x + sorted[j - 1].w);
4153
- if (gap > bestGap) {
4154
- bestGap = gap;
4155
- bestSplit = (sorted[j - 1].x + sorted[j - 1].w + sorted[j].x) / 2;
4156
- }
4157
- }
4158
- if (bestGap < 20) return false;
4159
- const splitRatio = (bestSplit - minX) / pageWidth;
4160
- if (splitRatio < 0.35 || splitRatio > 0.65) return false;
4161
- const leftCount = items.filter((i) => i.x + i.w / 2 < bestSplit).length;
4162
- const rightCount = items.filter((i) => i.x + i.w / 2 >= bestSplit).length;
4163
- if (leftCount < 15 || rightCount < 15) return false;
4164
- if (Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount) < 0.35) return false;
4165
- return true;
4166
- }
4167
4385
  var MAX_XYCUT_DEPTH = 50;
4168
4386
  function xyCutOrder(items, gapThreshold, depth = 0) {
4169
4387
  if (items.length === 0) return [];
@@ -4231,6 +4449,7 @@ function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeigh
4231
4449
  if (items.length === 0) return [];
4232
4450
  let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
4233
4451
  ({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
4452
+ ({ horizontals, verticals } = preprocessLines(horizontals, verticals));
4234
4453
  const grids = buildTableGrids(horizontals, verticals);
4235
4454
  if (grids.length > 0) {
4236
4455
  return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
@@ -4242,14 +4461,19 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
4242
4461
  const usedItems = /* @__PURE__ */ new Set();
4243
4462
  const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
4244
4463
  for (const grid of sortedGrids) {
4464
+ const numGridRows = grid.rowYs.length - 1;
4465
+ const numGridCols = grid.colXs.length - 1;
4466
+ if (numGridRows === 1 && numGridCols >= 2) continue;
4245
4467
  const tableItems = [];
4246
4468
  const pad = 3;
4469
+ const gridW = grid.bbox.x2 - grid.bbox.x1;
4247
4470
  for (const item of items) {
4248
4471
  if (usedItems.has(item)) continue;
4249
- if (item.x >= grid.bbox.x1 - pad && item.x + item.w <= grid.bbox.x2 + pad && item.y >= grid.bbox.y1 - pad && item.y <= grid.bbox.y2 + pad) {
4250
- tableItems.push(item);
4251
- usedItems.add(item);
4252
- }
4472
+ if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
4473
+ if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
4474
+ if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
4475
+ tableItems.push(item);
4476
+ usedItems.add(item);
4253
4477
  }
4254
4478
  const cells = extractCells(grid, horizontals, verticals);
4255
4479
  if (cells.length === 0) continue;
@@ -4273,6 +4497,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
4273
4497
  const cellItems = cellTextMap.get(cell) || [];
4274
4498
  let text = cellTextToString(cellItems);
4275
4499
  text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
4500
+ text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
4276
4501
  irGrid[cell.row][cell.col] = {
4277
4502
  text,
4278
4503
  colSpan: cell.colSpan,
@@ -4294,31 +4519,61 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
4294
4519
  width: grid.bbox.x2 - grid.bbox.x1,
4295
4520
  height: grid.bbox.y2 - grid.bbox.y1
4296
4521
  };
4297
- const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
4298
- if (normalized) {
4299
- blocks.push(...normalized);
4300
- continue;
4301
- }
4302
4522
  if (shouldDemoteTable(irTable)) {
4303
4523
  const demoted = demoteTableToText(irTable);
4304
4524
  if (demoted) {
4305
- blocks.push({ type: "paragraph", text: demoted, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
4525
+ const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
4526
+ blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
4306
4527
  }
4307
4528
  continue;
4308
4529
  }
4309
4530
  blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
4310
4531
  }
4311
- const remaining = items.filter((i) => !usedItems.has(i));
4532
+ let remaining = items.filter((i) => !usedItems.has(i));
4312
4533
  if (remaining.length > 0) {
4313
4534
  remaining.sort((a, b) => b.y - a.y || a.x - b.x);
4314
- const textBlocks = detectListBlocks(extractPageBlocksFallback(remaining, pageNum));
4315
- const allBlocks = [...blocks, ...textBlocks];
4316
- allBlocks.sort((a, b) => {
4535
+ const clusterItems = remaining.map((i) => ({
4536
+ text: i.text,
4537
+ x: i.x,
4538
+ y: i.y,
4539
+ w: i.w,
4540
+ h: i.h,
4541
+ fontSize: i.fontSize,
4542
+ fontName: i.fontName
4543
+ }));
4544
+ const clusterResults = detectClusterTables(clusterItems, pageNum);
4545
+ if (clusterResults.length > 0) {
4546
+ const ciToIdx = /* @__PURE__ */ new Map();
4547
+ for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4548
+ const usedClusterIndices = /* @__PURE__ */ new Set();
4549
+ for (const cr of clusterResults) {
4550
+ for (const ci of cr.usedItems) {
4551
+ const idx = ciToIdx.get(ci);
4552
+ if (idx !== void 0) usedClusterIndices.add(idx);
4553
+ }
4554
+ blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4555
+ }
4556
+ remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
4557
+ }
4558
+ if (remaining.length > 0) {
4559
+ const allY = remaining.map((i) => i.y);
4560
+ const pageH = safeMax(allY) - safeMin(allY);
4561
+ const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
4562
+ const textBlocks = [];
4563
+ for (const group of groups) {
4564
+ if (group.length === 0) continue;
4565
+ const groupBlocks = extractPageBlocksFallback(group, pageNum);
4566
+ for (const b of groupBlocks) textBlocks.push(b);
4567
+ }
4568
+ const finalTextBlocks = detectListBlocks(textBlocks);
4569
+ for (const b of finalTextBlocks) blocks.push(b);
4570
+ }
4571
+ blocks.sort((a, b) => {
4317
4572
  const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4318
4573
  const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4319
4574
  return by - ay;
4320
4575
  });
4321
- return mergeAdjacentTableBlocks(allBlocks);
4576
+ return mergeAdjacentTableBlocks(blocks);
4322
4577
  }
4323
4578
  return mergeAdjacentTableBlocks(blocks);
4324
4579
  }
@@ -4344,57 +4599,53 @@ function mergeAdjacentTableBlocks(blocks) {
4344
4599
  }
4345
4600
  function extractPageBlocksFallback(items, pageNum) {
4346
4601
  if (items.length === 0) return [];
4347
- if (hasMultiColumnLayout(items)) {
4348
- const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
4349
- return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
4350
- }
4351
4602
  const blocks = [];
4352
- const allYLines = groupByY(items);
4353
- const columns = detectColumns(allYLines);
4354
- if (columns && columns.length >= 3) {
4355
- const tableText = extractWithColumns(allYLines, columns);
4356
- const bbox = computeBBox(items, pageNum);
4357
- blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
4358
- } else {
4359
- const clusterItems = items.map((i) => ({
4360
- text: i.text,
4361
- x: i.x,
4362
- y: i.y,
4363
- w: i.w,
4364
- h: i.h,
4365
- fontSize: i.fontSize,
4366
- fontName: i.fontName
4367
- }));
4368
- const clusterResults = hasMultiColumnLayout(items) ? [] : detectClusterTables(clusterItems, pageNum);
4369
- if (clusterResults.length > 0) {
4370
- const ciToIdx = /* @__PURE__ */ new Map();
4371
- for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4372
- const usedIndices = /* @__PURE__ */ new Set();
4373
- for (const cr of clusterResults) {
4374
- for (const ci of cr.usedItems) {
4375
- const idx = ciToIdx.get(ci);
4376
- if (idx !== void 0) usedIndices.add(idx);
4377
- }
4378
- blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4603
+ const clusterItems = items.map((i) => ({
4604
+ text: i.text,
4605
+ x: i.x,
4606
+ y: i.y,
4607
+ w: i.w,
4608
+ h: i.h,
4609
+ fontSize: i.fontSize,
4610
+ fontName: i.fontName
4611
+ }));
4612
+ const clusterResults = detectClusterTables(clusterItems, pageNum);
4613
+ if (clusterResults.length > 0) {
4614
+ const ciToIdx = /* @__PURE__ */ new Map();
4615
+ for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4616
+ const usedIndices = /* @__PURE__ */ new Set();
4617
+ for (const cr of clusterResults) {
4618
+ for (const ci of cr.usedItems) {
4619
+ const idx = ciToIdx.get(ci);
4620
+ if (idx !== void 0) usedIndices.add(idx);
4379
4621
  }
4380
- const remaining = items.filter((_, idx) => !usedIndices.has(idx));
4381
- if (remaining.length > 0) {
4382
- const yLines = groupByY(remaining);
4383
- for (const line of yLines) {
4384
- const text = mergeLineSimple(line);
4385
- if (!text.trim()) continue;
4386
- const bbox = computeBBox(line, pageNum);
4387
- blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
4388
- }
4622
+ blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4623
+ }
4624
+ const remaining = items.filter((_, idx) => !usedIndices.has(idx));
4625
+ if (remaining.length > 0) {
4626
+ const yLines = groupByY(remaining);
4627
+ for (const line of yLines) {
4628
+ const text = mergeLineSimple(line);
4629
+ if (!text.trim()) continue;
4630
+ const bbox = computeBBox(line, pageNum);
4631
+ blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
4389
4632
  }
4390
- blocks.sort((a, b) => {
4391
- const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4392
- const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4393
- return by - ay;
4394
- });
4633
+ }
4634
+ blocks.sort((a, b) => {
4635
+ const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4636
+ const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4637
+ return by - ay;
4638
+ });
4639
+ } else {
4640
+ const allYLines = groupByY(items);
4641
+ const columns = detectColumns(allYLines);
4642
+ if (columns && columns.length >= 3) {
4643
+ const tableText = extractWithColumns(allYLines, columns);
4644
+ const bbox = computeBBox(items, pageNum);
4645
+ blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
4395
4646
  } else {
4396
4647
  const allY = items.map((i) => i.y);
4397
- const pageHeight = Math.max(...allY) - Math.min(...allY);
4648
+ const pageHeight = safeMax(allY) - safeMin(allY);
4398
4649
  const gapThreshold = Math.max(15, pageHeight * 0.03);
4399
4650
  const orderedGroups = xyCutOrder(items, gapThreshold);
4400
4651
  for (const group of orderedGroups) {
@@ -4447,22 +4698,76 @@ function dominantStyle(items) {
4447
4698
  return { fontSize: dominantSize, fontName };
4448
4699
  }
4449
4700
  function normalizeItems(rawItems) {
4450
- return rawItems.filter((i) => typeof i.str === "string" && i.str.trim() !== "").map((i) => {
4701
+ const items = [];
4702
+ const spacePositions = [];
4703
+ for (const i of rawItems) {
4704
+ if (typeof i.str !== "string") continue;
4705
+ const x = Math.round(i.transform[4]);
4706
+ const y = Math.round(i.transform[5]);
4707
+ if (!i.str.trim()) {
4708
+ spacePositions.push({ x, y });
4709
+ continue;
4710
+ }
4451
4711
  const scaleY = Math.abs(i.transform[3]);
4452
4712
  const scaleX = Math.abs(i.transform[0]);
4453
4713
  const fontSize = Math.round(Math.max(scaleY, scaleX));
4454
- return {
4455
- text: i.str.trim(),
4456
- x: Math.round(i.transform[4]),
4457
- y: Math.round(i.transform[5]),
4458
- w: Math.round(i.width),
4459
- h: Math.round(i.height),
4460
- fontSize,
4461
- fontName: i.fontName || "",
4462
- // 0pt 폰트이거나 너비 0 → hidden text (prompt injection 의심)
4463
- isHidden: fontSize === 0 || i.width === 0 && i.str.trim().length > 0
4464
- };
4465
- }).sort((a, b) => b.y - a.y || a.x - b.x);
4714
+ const w = Math.round(i.width);
4715
+ const h = Math.round(i.height);
4716
+ const isHidden = fontSize === 0 || i.width === 0 && i.str.trim().length > 0;
4717
+ let text = i.str.trim();
4718
+ if (/^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text)) {
4719
+ text = text.replace(/ /g, "");
4720
+ }
4721
+ const split = splitEvenSpacedItem(text, x, w, fontSize);
4722
+ if (split) {
4723
+ for (const s of split) {
4724
+ items.push({ text: s.text, x: s.x, y, w: s.w, h, fontSize, fontName: i.fontName || "", isHidden });
4725
+ }
4726
+ } else {
4727
+ items.push({ text, x, y, w, h, fontSize, fontName: i.fontName || "", isHidden });
4728
+ }
4729
+ }
4730
+ const sorted = items.sort((a, b) => b.y - a.y || a.x - b.x);
4731
+ const deduped = [];
4732
+ for (let i = 0; i < sorted.length; i++) {
4733
+ let isDup = false;
4734
+ for (let j = deduped.length - 1; j >= 0; j--) {
4735
+ const prev = deduped[j];
4736
+ if (prev.y - sorted[i].y > 3) break;
4737
+ if (Math.abs(prev.y - sorted[i].y) <= 3 && prev.text === sorted[i].text && Math.abs(prev.x - sorted[i].x) <= 3) {
4738
+ isDup = true;
4739
+ break;
4740
+ }
4741
+ }
4742
+ if (!isDup) deduped.push(sorted[i]);
4743
+ }
4744
+ if (spacePositions.length > 0) {
4745
+ for (const item of deduped) {
4746
+ for (const sp of spacePositions) {
4747
+ if (Math.abs(sp.y - item.y) <= 3) {
4748
+ const dist = item.x - sp.x;
4749
+ if (dist >= 0 && dist <= 20) {
4750
+ item.hasSpaceBefore = true;
4751
+ break;
4752
+ }
4753
+ }
4754
+ }
4755
+ }
4756
+ }
4757
+ return deduped;
4758
+ }
4759
+ function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
4760
+ if (!/^[가-힣\d](?: [가-힣\d]){2,}$/.test(text)) return null;
4761
+ const chars = text.split(" ");
4762
+ if (chars.length < 3) return null;
4763
+ const charW = itemW / chars.length;
4764
+ if (charW > fontSize * 2) return null;
4765
+ return chars.map((ch, idx) => ({
4766
+ text: ch,
4767
+ x: Math.round(itemX + idx * charW),
4768
+ w: Math.round(charW * 0.8)
4769
+ // 실제 글자 폭은 간격보다 좁음
4770
+ }));
4466
4771
  }
4467
4772
  function groupByY(items) {
4468
4773
  if (items.length === 0) return [];
@@ -4487,14 +4792,14 @@ function isProseSpread(items) {
4487
4792
  for (let i = 1; i < sorted.length; i++) {
4488
4793
  gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
4489
4794
  }
4490
- const maxGap = Math.max(...gaps);
4795
+ const maxGap = safeMax(gaps);
4491
4796
  const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
4492
4797
  return maxGap < 40 && avgLen < 5;
4493
4798
  }
4494
4799
  function detectColumns(yLines) {
4495
4800
  const allItems = yLines.flat();
4496
4801
  if (allItems.length === 0) return null;
4497
- const pageWidth = Math.max(...allItems.map((i) => i.x + i.w)) - Math.min(...allItems.map((i) => i.x));
4802
+ const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
4498
4803
  if (pageWidth < 100) return null;
4499
4804
  let bigoLineIdx = -1;
4500
4805
  for (let i = 0; i < yLines.length; i++) {
@@ -4526,7 +4831,7 @@ function detectColumns(yLines) {
4526
4831
  }
4527
4832
  const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
4528
4833
  if (peaks.length < 3) return null;
4529
- const MERGE_TOL = 30;
4834
+ const MERGE_TOL = 40;
4530
4835
  const merged = [peaks[0]];
4531
4836
  for (let i = 1; i < peaks.length; i++) {
4532
4837
  const prev = merged[merged.length - 1];
@@ -4540,7 +4845,14 @@ function detectColumns(yLines) {
4540
4845
  merged.push({ ...peaks[i] });
4541
4846
  }
4542
4847
  }
4543
- const columns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
4848
+ const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
4849
+ if (rawColumns.length < 3) return null;
4850
+ const MIN_DETECT_COL_WIDTH = 30;
4851
+ const columns = [rawColumns[0]];
4852
+ for (let i = 1; i < rawColumns.length; i++) {
4853
+ if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
4854
+ columns.push(rawColumns[i]);
4855
+ }
4544
4856
  return columns.length >= 3 ? columns : null;
4545
4857
  }
4546
4858
  function findColumn(x, columns) {
@@ -4668,6 +4980,16 @@ function buildGridTable(lines, columns) {
4668
4980
  }
4669
4981
  merged.splice(0, headerEnd, headerRow);
4670
4982
  }
4983
+ for (const row of merged) {
4984
+ for (let c = 0; c < row.length; c++) {
4985
+ if (row[c]) row[c] = collapseEvenSpacing(row[c]);
4986
+ }
4987
+ }
4988
+ const totalCells = merged.length * numCols;
4989
+ const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
4990
+ if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
4991
+ return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
4992
+ }
4671
4993
  const md = [];
4672
4994
  md.push("| " + merged[0].join(" | ") + " |");
4673
4995
  md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
@@ -4679,12 +5001,32 @@ function buildGridTable(lines, columns) {
4679
5001
  function mergeLineSimple(items) {
4680
5002
  if (items.length <= 1) return items[0]?.text || "";
4681
5003
  const sorted = [...items].sort((a, b) => a.x - b.x);
5004
+ const isEvenSpaced = detectEvenSpacedItems(sorted);
4682
5005
  let result = sorted[0].text;
4683
5006
  for (let i = 1; i < sorted.length; i++) {
4684
5007
  const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
4685
5008
  const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
4686
- if (gap > 15) result += " ";
4687
- else if (gap < avgFs * 0.15) {
5009
+ const tabThreshold = Math.max(avgFs * 2, 30);
5010
+ if (gap > tabThreshold) {
5011
+ result += " ";
5012
+ result += sorted[i].text;
5013
+ continue;
5014
+ }
5015
+ if (isEvenSpaced[i]) {
5016
+ result += sorted[i].text;
5017
+ continue;
5018
+ }
5019
+ if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
5020
+ result += " ";
5021
+ result += sorted[i].text;
5022
+ continue;
5023
+ }
5024
+ if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
5025
+ result += " ";
5026
+ result += sorted[i].text;
5027
+ continue;
5028
+ }
5029
+ if (gap < avgFs * 0.15) {
4688
5030
  } else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
4689
5031
  } else if (gap > 3) result += " ";
4690
5032
  result += sorted[i].text;
@@ -4693,8 +5035,8 @@ function mergeLineSimple(items) {
4693
5035
  }
4694
5036
  function cleanPdfText(text) {
4695
5037
  return mergeKoreanLines(
4696
- text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
4697
- ).replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line)).replace(/\n{3,}/g, "\n\n").trim();
5038
+ text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
5039
+ ).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
4698
5040
  }
4699
5041
  function startsWithMarker(line) {
4700
5042
  const t = line.trimStart();
@@ -4886,7 +5228,7 @@ function mergeKoreanLines(text) {
4886
5228
  result[result.length - 1] = prev + " " + currTrimmed;
4887
5229
  continue;
4888
5230
  }
4889
- if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
5231
+ if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev) && !startsWithMarker(prev)) {
4890
5232
  result[result.length - 1] = prev + " " + curr;
4891
5233
  } else {
4892
5234
  result.push(curr);
@@ -4934,7 +5276,7 @@ function getTextContent(el) {
4934
5276
  return el.textContent?.trim() ?? "";
4935
5277
  }
4936
5278
  function parseXml(text) {
4937
- return new DOMParser2().parseFromString(text, "text/xml");
5279
+ return new DOMParser2().parseFromString(stripDtd(text), "text/xml");
4938
5280
  }
4939
5281
  function parseSharedStrings(xml) {
4940
5282
  const doc = parseXml(xml);
@@ -5221,7 +5563,7 @@ function getAttr(el, localName) {
5221
5563
  return null;
5222
5564
  }
5223
5565
  function parseXml2(text) {
5224
- return new DOMParser3().parseFromString(text, "text/xml");
5566
+ return new DOMParser3().parseFromString(stripDtd(text), "text/xml");
5225
5567
  }
5226
5568
  function parseStyles(xml) {
5227
5569
  const doc = parseXml2(xml);
@@ -5621,7 +5963,13 @@ function normalize(s) {
5621
5963
  }
5622
5964
  var MAX_LEVENSHTEIN_LEN = 1e4;
5623
5965
  function levenshtein(a, b) {
5624
- if (a.length + b.length > MAX_LEVENSHTEIN_LEN) return Math.abs(a.length - b.length);
5966
+ if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
5967
+ const sampleLen = Math.min(500, a.length, b.length);
5968
+ let diffs = 0;
5969
+ for (let i = 0; i < sampleLen; i++) if (a[i] !== b[i]) diffs++;
5970
+ const sampleRate = sampleLen > 0 ? diffs / sampleLen : 1;
5971
+ return Math.abs(a.length - b.length) + Math.round(Math.min(a.length, b.length) * sampleRate);
5972
+ }
5625
5973
  if (a.length > b.length) [a, b] = [b, a];
5626
5974
  const m = a.length;
5627
5975
  const n = b.length;
@@ -5904,13 +6252,20 @@ function extractInlineFields(text) {
5904
6252
 
5905
6253
  // src/hwpx/generator.ts
5906
6254
  import JSZip5 from "jszip";
5907
- var HWPML_NS = "http://www.hancom.co.kr/hwpml/2016/HwpMl";
6255
+ var NS_SECTION = "http://www.hancom.co.kr/hwpml/2011/section";
6256
+ var NS_PARA = "http://www.hancom.co.kr/hwpml/2011/paragraph";
6257
+ var NS_HEAD = "http://www.hancom.co.kr/hwpml/2011/head";
6258
+ var NS_OPF = "http://www.idpf.org/2007/opf/";
6259
+ var NS_HPF = "http://www.hancom.co.kr/schema/2011/hpf";
6260
+ var NS_OCF = "urn:oasis:names:tc:opendocument:xmlns:container";
5908
6261
  async function markdownToHwpx(markdown) {
5909
6262
  const blocks = parseMarkdownToBlocks(markdown);
5910
6263
  const sectionXml = blocksToSectionXml(blocks);
5911
6264
  const zip = new JSZip5();
5912
6265
  zip.file("mimetype", "application/hwp+zip", { compression: "STORE" });
6266
+ zip.file("META-INF/container.xml", generateContainerXml());
5913
6267
  zip.file("Contents/content.hpf", generateManifest());
6268
+ zip.file("Contents/header.xml", generateHeaderXml());
5914
6269
  zip.file("Contents/section0.xml", sectionXml);
5915
6270
  return await zip.generateAsync({ type: "arraybuffer" });
5916
6271
  }
@@ -5955,8 +6310,111 @@ function parseMarkdownToBlocks(md) {
5955
6310
  function escapeXml(text) {
5956
6311
  return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
5957
6312
  }
6313
+ function generateContainerXml() {
6314
+ return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
6315
+ <ocf:container xmlns:ocf="${NS_OCF}" xmlns:hpf="${NS_HPF}">
6316
+ <ocf:rootfiles>
6317
+ <ocf:rootfile full-path="Contents/content.hpf" media-type="application/hwpml-package+xml"/>
6318
+ </ocf:rootfiles>
6319
+ </ocf:container>`;
6320
+ }
6321
+ function generateManifest() {
6322
+ return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
6323
+ <opf:package xmlns:opf="${NS_OPF}" xmlns:hpf="${NS_HPF}" xmlns:hh="${NS_HEAD}">
6324
+ <opf:manifest>
6325
+ <opf:item id="header" href="Contents/header.xml" media-type="application/xml"/>
6326
+ <opf:item id="section0" href="Contents/section0.xml" media-type="application/xml"/>
6327
+ </opf:manifest>
6328
+ <opf:spine>
6329
+ <opf:itemref idref="header" linear="no"/>
6330
+ <opf:itemref idref="section0" linear="yes"/>
6331
+ </opf:spine>
6332
+ </opf:package>`;
6333
+ }
6334
+ function generateHeaderXml() {
6335
+ return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
6336
+ <hh:head xmlns:hh="${NS_HEAD}" xmlns:hp="${NS_PARA}" version="1.4" secCnt="1">
6337
+ <hh:beginNum page="1" footnote="1" endnote="1" pic="1" tbl="1" equation="1"/>
6338
+ <hh:refList>
6339
+ <hh:fontfaces itemCnt="7">
6340
+ <hh:fontface lang="HANGUL" fontCnt="1">
6341
+ <hh:font id="0" face="\uD568\uCD08\uB86C\uBC14\uD0D5" type="TTF" isEmbedded="0">
6342
+ <hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="4" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
6343
+ </hh:font>
6344
+ </hh:fontface>
6345
+ <hh:fontface lang="LATIN" fontCnt="1">
6346
+ <hh:font id="0" face="Times New Roman" type="TTF" isEmbedded="0">
6347
+ <hh:typeInfo familyType="FCAT_OLDSTYLE" weight="5" proportion="4" contrast="2" strokeVariation="0" armStyle="0" letterform="0" midline="0" xHeight="4"/>
6348
+ </hh:font>
6349
+ </hh:fontface>
6350
+ <hh:fontface lang="HANJA" fontCnt="1">
6351
+ <hh:font id="0" face="\uD568\uCD08\uB86C\uBC14\uD0D5" type="TTF" isEmbedded="0">
6352
+ <hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="4" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
6353
+ </hh:font>
6354
+ </hh:fontface>
6355
+ <hh:fontface lang="JAPANESE" fontCnt="1">
6356
+ <hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
6357
+ <hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
6358
+ </hh:font>
6359
+ </hh:fontface>
6360
+ <hh:fontface lang="OTHER" fontCnt="1">
6361
+ <hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
6362
+ <hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
6363
+ </hh:font>
6364
+ </hh:fontface>
6365
+ <hh:fontface lang="SYMBOL" fontCnt="1">
6366
+ <hh:font id="0" face="Symbol" type="TTF" isEmbedded="0">
6367
+ <hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
6368
+ </hh:font>
6369
+ </hh:fontface>
6370
+ <hh:fontface lang="USER" fontCnt="1">
6371
+ <hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
6372
+ <hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
6373
+ </hh:font>
6374
+ </hh:fontface>
6375
+ </hh:fontfaces>
6376
+ <hh:borderFills itemCnt="1">
6377
+ <hh:borderFill id="0" threeD="0" shadow="0" centerLine="0" breakCellSeparateLine="0">
6378
+ <hh:slash type="NONE" Crooked="0" isCounter="0"/>
6379
+ <hh:backSlash type="NONE" Crooked="0" isCounter="0"/>
6380
+ <hh:leftBorder type="NONE" width="0.1mm" color="0"/>
6381
+ <hh:rightBorder type="NONE" width="0.1mm" color="0"/>
6382
+ <hh:topBorder type="NONE" width="0.1mm" color="0"/>
6383
+ <hh:bottomBorder type="NONE" width="0.1mm" color="0"/>
6384
+ <hh:diagonal type="NONE" width="0.1mm" color="0"/>
6385
+ <hh:fillInfo/>
6386
+ </hh:borderFill>
6387
+ </hh:borderFills>
6388
+ <hh:charProperties itemCnt="1">
6389
+ <hh:charPr id="0" height="1000" textColor="0" shadeColor="-1" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="0">
6390
+ <hh:fontRef hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
6391
+ <hh:ratio hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
6392
+ <hh:spacing hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
6393
+ <hh:relSz hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
6394
+ <hh:offset hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
6395
+ </hh:charPr>
6396
+ </hh:charProperties>
6397
+ <hh:tabProperties itemCnt="0"/>
6398
+ <hh:numberings itemCnt="0"/>
6399
+ <hh:bullets itemCnt="0"/>
6400
+ <hh:paraProperties itemCnt="1">
6401
+ <hh:paraPr id="0" tabIDRef="0" condense="0" fontLineHeight="0" snapToGrid="0" suppressOverlap="0" checked="0">
6402
+ <hh:parLineBreak lineBreak="BREAK_LINE" wordBreak="BREAK_WORD" breakLatinWord="BREAK_WORD" breakNonLatinWord="BREAK_WORD"/>
6403
+ <hh:parMargin left="0" right="0" prev="0" next="0" indent="0"/>
6404
+ <hh:parBorder borderFillIDRef="0" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>
6405
+ <hh:parShade borderFillIDRef="0"/>
6406
+ <hh:parTabList/>
6407
+ </hh:paraPr>
6408
+ </hh:paraProperties>
6409
+ <hh:styles itemCnt="1">
6410
+ <hh:style id="0" type="PARA" name="\uBC14\uD0D5\uAE00" engName="Normal" paraPrIDRef="0" charPrIDRef="0" nextStyleIDRef="0" langIDRef="1042" lockForm="0"/>
6411
+ </hh:styles>
6412
+ </hh:refList>
6413
+ <hh:compatibleDocument targetProgram="HWP2018"/>
6414
+ </hh:head>`;
6415
+ }
5958
6416
  function generateParagraph(text) {
5959
- return `<hp:p><hp:run><hp:t>${escapeXml(text)}</hp:t></hp:run></hp:p>`;
6417
+ return `<hp:p paraPrIDRef="0" styleIDRef="0"><hp:run charPrIDRef="0"><hp:t>${escapeXml(text)}</hp:t></hp:run></hp:p>`;
5960
6418
  }
5961
6419
  function generateTable(rows) {
5962
6420
  const trElements = rows.map((row) => {
@@ -5980,22 +6438,11 @@ function blocksToSectionXml(blocks) {
5980
6438
  return "";
5981
6439
  }
5982
6440
  }).join("\n ");
5983
- return `<?xml version="1.0" encoding="UTF-8"?>
5984
- <hs:sec xmlns:hs="${HWPML_NS}" xmlns:hp="${HWPML_NS}">
6441
+ return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
6442
+ <hs:sec xmlns:hs="${NS_SECTION}" xmlns:hp="${NS_PARA}">
5985
6443
  ${body}
5986
6444
  </hs:sec>`;
5987
6445
  }
5988
- function generateManifest() {
5989
- return `<?xml version="1.0" encoding="UTF-8"?>
5990
- <opf:package xmlns:opf="http://www.idpf.org/2007/opf">
5991
- <opf:manifest>
5992
- <opf:item id="s0" href="section0.xml" media-type="application/xml"/>
5993
- </opf:manifest>
5994
- <opf:spine>
5995
- <opf:itemref idref="s0"/>
5996
- </opf:spine>
5997
- </opf:package>`;
5998
- }
5999
6446
 
6000
6447
  // src/index.ts
6001
6448
  async function parse(input, options) {