kordoc 2.1.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -139,7 +139,7 @@ import { inflateRawSync } from "zlib";
139
139
  import { DOMParser } from "@xmldom/xmldom";
140
140
 
141
141
  // src/utils.ts
142
- var VERSION = true ? "2.1.0" : "0.0.0-dev";
142
+ var VERSION = true ? "2.2.1" : "0.0.0-dev";
143
143
  function toArrayBuffer(buf) {
144
144
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
145
145
  return buf.buffer;
@@ -155,7 +155,8 @@ var KordocError = class extends Error {
155
155
  function isPathTraversal(name) {
156
156
  if (name.includes("\0")) return true;
157
157
  const normalized = name.replace(/\\/g, "/");
158
- return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
158
+ const segments = normalized.split("/");
159
+ return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
159
160
  }
160
161
  function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
161
162
  try {
@@ -195,12 +196,25 @@ function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEnt
195
196
  return { totalUncompressed: 0, entryCount: 0 };
196
197
  }
197
198
  }
199
/**
 * Removes DOCTYPE declarations from an XML string.
 *
 * Matches `<!DOCTYPE ...>` case-insensitively, including an optional
 * bracketed internal subset (`[ ... ]`), and deletes every occurrence.
 *
 * @param {string} xml - XML text.
 * @returns {string} The same text with all matched DOCTYPE declarations removed.
 */
function stripDtd(xml) {
  const doctypeDeclaration = /<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi;
  const withoutDoctype = xml.replace(doctypeDeclaration, "");
  return withoutDoctype;
}
198
202
  var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
199
203
  function sanitizeHref(href) {
200
204
  const trimmed = href.trim();
201
205
  if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
202
206
  return trimmed;
203
207
  }
208
/**
 * Finds the smallest element of an array with a plain loop.
 *
 * NOTE(review): presumably exists to avoid spreading large arrays into
 * Math.min — confirm against callers.
 *
 * @param {ArrayLike<number>} arr - Values to scan.
 * @returns {number} The smallest element, or Infinity when nothing compares
 *   below Infinity (empty input, or only values such as NaN).
 */
function safeMin(arr) {
  let smallest = Infinity;
  for (let idx = 0, len = arr.length; idx < len; idx += 1) {
    const candidate = arr[idx];
    smallest = candidate < smallest ? candidate : smallest;
  }
  return smallest;
}
213
/**
 * Finds the largest element of an array with a plain loop.
 *
 * NOTE(review): presumably exists to avoid spreading large arrays into
 * Math.max — confirm against callers.
 *
 * @param {ArrayLike<number>} arr - Values to scan.
 * @returns {number} The largest element, or -Infinity when nothing compares
 *   above -Infinity (empty input, or only values such as NaN).
 */
function safeMax(arr) {
  let largest = -Infinity;
  for (let idx = 0, len = arr.length; idx < len; idx += 1) {
    const candidate = arr[idx];
    largest = candidate > largest ? candidate : largest;
  }
  return largest;
}
204
218
  function classifyError(err) {
205
219
  if (!(err instanceof Error)) return "PARSE_ERROR";
206
220
  const msg = err.message;
@@ -275,6 +289,7 @@ function buildTableDirect(rows, numRows) {
275
289
  if (end > maxCols) maxCols = end;
276
290
  }
277
291
  }
292
+ if (maxCols > MAX_COLS) maxCols = MAX_COLS;
278
293
  if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
279
294
  const grid = Array.from(
280
295
  { length: numRows },
@@ -284,7 +299,7 @@ function buildTableDirect(rows, numRows) {
284
299
  for (const cell of row) {
285
300
  const r = cell.rowAddr ?? 0;
286
301
  const c = cell.colAddr ?? 0;
287
- if (r >= numRows || c >= maxCols) continue;
302
+ if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
288
303
  grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
289
304
  for (let dr = 0; dr < cell.rowSpan; dr++) {
290
305
  for (let dc = 0; dc < cell.colSpan; dc++) {
@@ -313,9 +328,12 @@ function trimAndReturn(grid, numRows, maxCols) {
313
328
  }
314
329
  function convertTableToText(rows) {
315
330
  return rows.map(
316
- (row) => row.map((c) => c.text.trim().replace(/\n/g, " ")).filter(Boolean).join(" | ")
331
+ (row) => row.map((c) => c.text.trim().replace(/\n/g, " ").replace(/\|/g, "\\|")).filter(Boolean).join(" / ")
317
332
  ).filter(Boolean).join("\n");
318
333
  }
334
/**
 * Backslash-escapes every tilde in the text.
 *
 * NOTE(review): presumably prevents GFM from reading `~~` as strikethrough —
 * the name suggests this; confirm against the Markdown renderer in use.
 *
 * @param {string} text - Text to escape.
 * @returns {string} The text with each `~` replaced by `\~`.
 */
function escapeGfm(text) {
  const pieces = text.split("~");
  return pieces.join("\\~");
}
319
337
  var HWP_SHAPE_ALT_TEXT_RE = /(?:모서리가 둥근 |둥근 )?(?:사각형|직사각형|정사각형|원|타원|삼각형|이등변 삼각형|직각 삼각형|선|직선|곡선|화살표|굵은 화살표|이중 화살표|오각형|육각형|팔각형|별|[4-8]점별|십자|십자형|구름|구름형|마름모|도넛|평행사변형|사다리꼴|부채꼴|호|반원|물결|번개|하트|빗금|블록 화살표|수식|표|그림|개체|그리기\s?개체|묶음\s?개체|글상자|수식\s?개체|OLE\s?개체)\s?입니다\.?/g;
320
338
  function sanitizeText(text) {
321
339
  let result = text.replace(/[\u{F0000}-\u{FFFFD}]/gu, "").replace(HWP_SHAPE_ALT_TEXT_RE, "").replace(/ +/g, " ").trim();
@@ -425,7 +443,7 @@ function blocksToMarkdown(blocks) {
425
443
  if (block.footnoteText) {
426
444
  text += ` (\uC8FC: ${block.footnoteText})`;
427
445
  }
428
- lines.push(text);
446
+ lines.push(escapeGfm(text), "");
429
447
  } else if (block.type === "table" && block.table) {
430
448
  if (lines.length > 0 && lines[lines.length - 1] !== "") {
431
449
  lines.push("");
@@ -448,13 +466,13 @@ function tableToMarkdown(table) {
448
466
  return content.split(/\n/).map((line) => {
449
467
  const trimmed = line.trim();
450
468
  if (!trimmed) return "";
451
- if (/^\d+\.\s/.test(trimmed)) return `**${trimmed}**`;
452
- if (/^[가-힣]\.\s/.test(trimmed)) return ` ${trimmed}`;
453
- return trimmed;
469
+ if (/^\d+\.\s/.test(trimmed)) return `**${escapeGfm(trimmed)}**`;
470
+ if (/^[가-힣]\.\s/.test(trimmed)) return ` ${escapeGfm(trimmed)}`;
471
+ return escapeGfm(trimmed);
454
472
  }).filter(Boolean).join("\n");
455
473
  }
456
474
  if (numCols === 1 && numRows >= 2) {
457
- return cells.map((row) => sanitizeText(row[0].text).replace(/\n/g, " ")).filter(Boolean).join("\n");
475
+ return cells.map((row) => escapeGfm(sanitizeText(row[0].text)).replace(/\n/g, " ")).filter(Boolean).join("\n");
458
476
  }
459
477
  const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
460
478
  const skip = /* @__PURE__ */ new Set();
@@ -463,15 +481,12 @@ function tableToMarkdown(table) {
463
481
  if (skip.has(`${r},${c}`)) continue;
464
482
  const cell = cells[r]?.[c];
465
483
  if (!cell) continue;
466
- display[r][c] = sanitizeText(cell.text).replace(/\n/g, "<br>");
484
+ display[r][c] = escapeGfm(sanitizeText(cell.text)).replace(/\|/g, "\\|").replace(/\n/g, "<br>");
467
485
  for (let dr = 0; dr < cell.rowSpan; dr++) {
468
486
  for (let dc = 0; dc < cell.colSpan; dc++) {
469
487
  if (dr === 0 && dc === 0) continue;
470
488
  if (r + dr < numRows && c + dc < numCols) {
471
489
  skip.add(`${r + dr},${c + dc}`);
472
- if (dr === 0) {
473
- display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
474
- }
475
490
  }
476
491
  }
477
492
  }
@@ -607,9 +622,6 @@ function parseStyleElements(doc, map) {
607
622
  }
608
623
  }
609
624
  }
610
- function stripDtd(xml) {
611
- return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
612
- }
613
625
  async function parseHwpxDocument(buffer, options) {
614
626
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
615
627
  let zip;
@@ -959,7 +971,8 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
959
971
  if (newTable.rows.length > 0) {
960
972
  if (tableStack.length > 0) {
961
973
  const parentTable = tableStack.pop();
962
- const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
974
+ let nestedCols = 0;
975
+ for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
963
976
  if (newTable.rows.length >= 3 && nestedCols >= 2) {
964
977
  blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
965
978
  } else {
@@ -1068,7 +1081,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
1068
1081
  if (newTable.rows.length > 0) {
1069
1082
  if (tableStack.length > 0) {
1070
1083
  const parentTable = tableStack.pop();
1071
- const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
1084
+ let nestedCols = 0;
1085
+ for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
1072
1086
  if (newTable.rows.length >= 3 && nestedCols >= 2) {
1073
1087
  blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
1074
1088
  } else {
@@ -2166,6 +2180,7 @@ function parseLenientCfb(data) {
2166
2180
  if (miniSectorSizeShift > 16) throw new Error("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uBBF8\uB2C8 \uC139\uD130 \uD06C\uAE30 \uC2DC\uD504\uD2B8: " + miniSectorSizeShift);
2167
2181
  const miniSectorSize = 1 << miniSectorSizeShift;
2168
2182
  const fatSectorCount = data.readUInt32LE(44);
2183
+ if (fatSectorCount > 1e4) throw new Error("FAT \uC139\uD130 \uC218\uAC00 \uB108\uBB34 \uB9CE\uC2B5\uB2C8\uB2E4: " + fatSectorCount);
2169
2184
  const firstDirSector = data.readUInt32LE(48);
2170
2185
  const miniStreamCutoff = data.readUInt32LE(56);
2171
2186
  const firstMiniFatSector = data.readUInt32LE(60);
@@ -2541,10 +2556,14 @@ function findSections(cfb) {
2541
2556
  }
2542
2557
  function findSectionsLenient(lcfb, compressed) {
2543
2558
  const sections = [];
2559
+ let totalDecompressed = 0;
2544
2560
  for (let i = 0; i < MAX_SECTIONS; i++) {
2545
2561
  const raw = lcfb.findStream(`/BodyText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
2546
2562
  if (!raw) break;
2547
- sections.push({ idx: i, content: compressed ? decompressStream(raw) : raw });
2563
+ const content = compressed ? decompressStream(raw) : raw;
2564
+ totalDecompressed += content.length;
2565
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2566
+ sections.push({ idx: i, content });
2548
2567
  }
2549
2568
  if (sections.length === 0) {
2550
2569
  for (const e of lcfb.entries()) {
@@ -2552,7 +2571,12 @@ function findSectionsLenient(lcfb, compressed) {
2552
2571
  if (e.name.startsWith("Section")) {
2553
2572
  const idx = parseInt(e.name.replace("Section", ""), 10) || 0;
2554
2573
  const raw = lcfb.findStream(e.name);
2555
- if (raw) sections.push({ idx, content: compressed ? decompressStream(raw) : raw });
2574
+ if (raw) {
2575
+ const content = compressed ? decompressStream(raw) : raw;
2576
+ totalDecompressed += content.length;
2577
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2578
+ sections.push({ idx, content });
2579
+ }
2556
2580
  }
2557
2581
  }
2558
2582
  }
@@ -2560,11 +2584,15 @@ function findSectionsLenient(lcfb, compressed) {
2560
2584
  }
2561
2585
  function findViewTextSectionsLenient(lcfb, compressed) {
2562
2586
  const sections = [];
2587
+ let totalDecompressed = 0;
2563
2588
  for (let i = 0; i < MAX_SECTIONS; i++) {
2564
2589
  const raw = lcfb.findStream(`/ViewText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
2565
2590
  if (!raw) break;
2566
2591
  try {
2567
- sections.push({ idx: i, content: decryptViewText(raw, compressed) });
2592
+ const content = decryptViewText(raw, compressed);
2593
+ totalDecompressed += content.length;
2594
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2595
+ sections.push({ idx: i, content });
2568
2596
  } catch {
2569
2597
  break;
2570
2598
  }
@@ -2966,37 +2994,18 @@ init_page_range();
2966
2994
  // src/pdf/line-detector.ts
2967
2995
  import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
2968
2996
  var ORIENTATION_TOL = 2;
2969
- var MIN_LINE_LENGTH = 10;
2970
- var COORD_MERGE_TOL = 3;
2997
+ var MIN_LINE_LENGTH = 15;
2998
+ var MAX_LINE_WIDTH = 5;
2971
2999
  var CONNECT_TOL = 5;
2972
3000
  var CELL_PADDING = 2;
2973
- var MAX_LINE_WIDTH = 5;
2974
- var IDENTITY = [1, 0, 0, 1, 0, 0];
2975
- function matMultiply(m1, m2) {
2976
- return [
2977
- m1[0] * m2[0] + m1[2] * m2[1],
2978
- m1[1] * m2[0] + m1[3] * m2[1],
2979
- m1[0] * m2[2] + m1[2] * m2[3],
2980
- m1[1] * m2[2] + m1[3] * m2[3],
2981
- m1[0] * m2[4] + m1[2] * m2[5] + m1[4],
2982
- m1[1] * m2[4] + m1[3] * m2[5] + m1[5]
2983
- ];
2984
- }
2985
- function matTransformPoint(m, x, y) {
2986
- return [m[0] * x + m[2] * y + m[4], m[1] * x + m[3] * y + m[5]];
2987
- }
2988
- function matScale(m) {
2989
- return Math.max(
2990
- Math.sqrt(m[1] * m[1] + m[3] * m[3]),
2991
- Math.sqrt(m[0] * m[0] + m[2] * m[2])
2992
- );
2993
- }
3001
+ var MIN_COL_WIDTH = 15;
3002
+ var MIN_ROW_HEIGHT = 6;
3003
+ var VERTEX_MERGE_FACTOR = 4;
3004
+ var MIN_COORD_MERGE_TOL = 8;
2994
3005
  function extractLines(fnArray, argsArray) {
2995
3006
  const horizontals = [];
2996
3007
  const verticals = [];
2997
- let ctm = [...IDENTITY];
2998
3008
  let lineWidth = 1;
2999
- const stateStack = [];
3000
3009
  let currentPath = [];
3001
3010
  let pathStartX = 0, pathStartY = 0;
3002
3011
  let curX = 0, curY = 0;
@@ -3014,53 +3023,13 @@ function extractLines(fnArray, argsArray) {
3014
3023
  );
3015
3024
  }
3016
3025
  }
3017
- function tryConvertLinesToRectangle(path) {
3018
- if (path.length < 3 || path.length > 5) return false;
3019
- const first = path[0], last = path[path.length - 1];
3020
- const closed = Math.abs(first.x1 - last.x2) < 1 && Math.abs(first.y1 - last.y2) < 1;
3021
- if (!closed) return false;
3022
- let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
3023
- for (const seg of path) {
3024
- minX = Math.min(minX, seg.x1, seg.x2);
3025
- minY = Math.min(minY, seg.y1, seg.y2);
3026
- maxX = Math.max(maxX, seg.x1, seg.x2);
3027
- maxY = Math.max(maxY, seg.y1, seg.y2);
3028
- }
3029
- const w = maxX - minX, h = maxY - minY;
3030
- if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;
3031
- path.length = 0;
3032
- if (h < ORIENTATION_TOL * 2 || w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH) {
3033
- path.push({ x1: minX, y1: (minY + maxY) / 2, x2: maxX, y2: (minY + maxY) / 2 });
3034
- } else if (w < ORIENTATION_TOL * 2 || h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH) {
3035
- path.push({ x1: (minX + maxX) / 2, y1: minY, x2: (minX + maxX) / 2, y2: maxY });
3036
- } else {
3037
- pushRectangle(path, minX, minY, w, h);
3038
- }
3039
- return true;
3040
- }
3041
- function flushPath(isStroke, isFill) {
3042
- if (!isStroke && !isFill) {
3043
- currentPath = [];
3044
- return;
3045
- }
3046
- if (isFill && !isStroke && currentPath.length >= 3) {
3047
- tryConvertLinesToRectangle(currentPath);
3048
- }
3049
- const scale = matScale(ctm);
3050
- const effectiveLW = lineWidth * scale;
3051
- if (effectiveLW > MAX_LINE_WIDTH && isStroke && !isFill) {
3026
+ function flushPath(isStroke) {
3027
+ if (!isStroke) {
3052
3028
  currentPath = [];
3053
3029
  return;
3054
3030
  }
3055
3031
  for (const seg of currentPath) {
3056
- const [px1, py1] = matTransformPoint(ctm, seg.x1, seg.y1);
3057
- const [px2, py2] = matTransformPoint(ctm, seg.x2, seg.y2);
3058
- classifyAndAdd(
3059
- { x1: px1, y1: py1, x2: px2, y2: py2 },
3060
- effectiveLW,
3061
- horizontals,
3062
- verticals
3063
- );
3032
+ classifyAndAdd(seg, lineWidth, horizontals, verticals);
3064
3033
  }
3065
3034
  currentPath = [];
3066
3035
  }
@@ -3068,28 +3037,9 @@ function extractLines(fnArray, argsArray) {
3068
3037
  const op = fnArray[i];
3069
3038
  const args = argsArray[i];
3070
3039
  switch (op) {
3071
- // ── Graphics State ──
3072
- case OPS.save:
3073
- stateStack.push({ ctm: [...ctm], lineWidth });
3074
- break;
3075
- case OPS.restore:
3076
- if (stateStack.length > 0) {
3077
- const state = stateStack.pop();
3078
- ctm = state.ctm;
3079
- lineWidth = state.lineWidth;
3080
- }
3081
- break;
3082
- case OPS.transform: {
3083
- const m = args;
3084
- if (m.length >= 6) {
3085
- ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
3086
- }
3087
- break;
3088
- }
3089
3040
  case OPS.setLineWidth:
3090
3041
  lineWidth = args[0] || 1;
3091
3042
  break;
3092
- // ── Path Construction ──
3093
3043
  case OPS.constructPath: {
3094
3044
  const arg0 = args[0];
3095
3045
  if (Array.isArray(arg0)) {
@@ -3157,60 +3107,34 @@ function extractLines(fnArray, argsArray) {
3157
3107
  }
3158
3108
  }
3159
3109
  }
3160
- const isStroke5 = afterOp === OPS.stroke || afterOp === OPS.closeStroke;
3161
- const isFill5 = afterOp === OPS.fill || afterOp === OPS.eoFill;
3162
- const isBoth5 = afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke;
3163
- if (isStroke5 || isFill5 || isBoth5) {
3164
- flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
3110
+ if (afterOp === OPS.stroke || afterOp === OPS.closeStroke) {
3111
+ flushPath(true);
3112
+ } else if (afterOp === OPS.fill || afterOp === OPS.eoFill || afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke) {
3113
+ flushPath(true);
3165
3114
  } else if (afterOp === OPS.endPath) {
3166
- flushPath(false, false);
3115
+ flushPath(false);
3167
3116
  }
3168
3117
  }
3169
3118
  break;
3170
3119
  }
3171
- // ── Paint Operations ──
3172
3120
  case OPS.stroke:
3173
3121
  case OPS.closeStroke:
3174
- flushPath(true, false);
3122
+ flushPath(true);
3175
3123
  break;
3176
3124
  case OPS.fill:
3177
3125
  case OPS.eoFill:
3178
- flushPath(false, true);
3179
- break;
3180
3126
  case OPS.fillStroke:
3181
3127
  case OPS.eoFillStroke:
3182
3128
  case OPS.closeFillStroke:
3183
3129
  case OPS.closeEOFillStroke:
3184
- flushPath(true, true);
3130
+ flushPath(true);
3185
3131
  break;
3186
3132
  case OPS.endPath:
3187
- flushPath(false, false);
3188
- break;
3189
- }
3190
- }
3191
- return {
3192
- horizontals: deduplicateLines(horizontals),
3193
- verticals: deduplicateLines(verticals)
3194
- };
3195
- }
3196
- function deduplicateLines(lines) {
3197
- if (lines.length <= 1) return lines;
3198
- const result = [];
3199
- const tol = COORD_MERGE_TOL;
3200
- for (const line of lines) {
3201
- let isDuplicate = false;
3202
- for (const existing of result) {
3203
- if (Math.abs(line.y1 - existing.y1) <= tol && Math.abs(line.y2 - existing.y2) <= tol && Math.abs(line.x1 - existing.x1) <= tol && Math.abs(line.x2 - existing.x2) <= tol) {
3204
- if (line.lineWidth > existing.lineWidth) {
3205
- existing.lineWidth = line.lineWidth;
3206
- }
3207
- isDuplicate = true;
3133
+ flushPath(false);
3208
3134
  break;
3209
- }
3210
3135
  }
3211
- if (!isDuplicate) result.push(line);
3212
3136
  }
3213
- return result;
3137
+ return { horizontals, verticals };
3214
3138
  }
3215
3139
  function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
3216
3140
  const dx = Math.abs(seg.x2 - seg.x1);
@@ -3229,6 +3153,55 @@ function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
3229
3153
  verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
3230
3154
  }
3231
3155
  }
3156
/**
 * Prepares detected line segments for grid building.
 *
 * Discards lines whose lineWidth exceeds MAX_LINE_WIDTH, then passes the
 * horizontal and vertical sets separately through mergeParallelLines.
 *
 * @param {Array<object>} horizontals - Horizontal line segments.
 * @param {Array<object>} verticals - Vertical line segments.
 * @returns {{horizontals: Array<object>, verticals: Array<object>}} The
 *   filtered and merged line sets.
 */
function preprocessLines(horizontals, verticals) {
  const isThinEnough = (line) => line.lineWidth <= MAX_LINE_WIDTH;
  const thinHorizontals = horizontals.filter((line) => isThinEnough(line));
  const thinVerticals = verticals.filter((line) => isThinEnough(line));
  const mergedHorizontals = mergeParallelLines(thinHorizontals, "h");
  const mergedVerticals = mergeParallelLines(thinVerticals, "v");
  return { horizontals: mergedHorizontals, verticals: mergedVerticals };
}
3163
/**
 * Collapses nearly coincident parallel lines into single lines.
 *
 * Lines are ordered by their position on the perpendicular axis (y for
 * "h", x for "v"), with the start coordinate along the line as tie-breaker.
 * A line is folded into the previously kept one when the two positions
 * differ by at most 3 units and their extents overlap by more than 30% of
 * the shorter line. The merged line spans both extents, sits at the average
 * of the two positions and takes the larger lineWidth.
 *
 * The input line objects are never modified: merging happens on copies.
 * Inputs with zero or one line are returned as-is.
 *
 * @param {Array<{x1: number, y1: number, x2: number, y2: number, lineWidth: number}>} lines
 *   Line segments that all share the orientation given by `dir`.
 * @param {"h"|"v"} dir - "h" for horizontal lines; anything else is handled as vertical.
 * @returns {Array<object>} The merged lines, sorted by position.
 */
function mergeParallelLines(lines, dir) {
  if (lines.length <= 1) return lines;

  const MERGE_TOL = 3;
  const isHorizontal = dir === "h";
  const positionOf = (line) => (isHorizontal ? line.y1 : line.x1);
  const startOf = (line) => (isHorizontal ? line.x1 : line.y1);
  const endOf = (line) => (isHorizontal ? line.x2 : line.y2);

  const sorted = [...lines].sort((a, b) => {
    const delta = positionOf(a) - positionOf(b);
    if (Math.abs(delta) > 0.1) return delta;
    return startOf(a) - startOf(b);
  });

  // Copy before accumulating so the caller's line objects stay untouched.
  const result = [{ ...sorted[0] }];
  for (let i = 1; i < sorted.length; i++) {
    const prev = result[result.length - 1];
    const curr = sorted[i];
    if (Math.abs(positionOf(prev) - positionOf(curr)) <= MERGE_TOL) {
      const overlap = Math.min(endOf(prev), endOf(curr)) - Math.max(startOf(prev), startOf(curr));
      const minLen = Math.min(endOf(prev) - startOf(prev), endOf(curr) - startOf(curr));
      if (overlap > minLen * 0.3) {
        if (isHorizontal) {
          prev.x1 = Math.min(prev.x1, curr.x1);
          prev.x2 = Math.max(prev.x2, curr.x2);
          prev.y1 = (prev.y1 + curr.y1) / 2;
          prev.y2 = prev.y1;
        } else {
          prev.y1 = Math.min(prev.y1, curr.y1);
          prev.y2 = Math.max(prev.y2, curr.y2);
          prev.x1 = (prev.x1 + curr.x1) / 2;
          prev.x2 = prev.x1;
        }
        prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
        continue;
      }
    }
    result.push({ ...curr });
  }
  return result;
}
3232
3205
  function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
3233
3206
  const margin = 5;
3234
3207
  return {
@@ -3240,8 +3213,49 @@ function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
3240
3213
  )
3241
3214
  };
3242
3215
  }
3216
/**
 * Collects the crossing points of horizontal and vertical lines.
 *
 * A horizontal/vertical pair yields a vertex when the vertical's x lies
 * within the horizontal's x-extent and the horizontal's y lies within the
 * vertical's y-extent, both widened by CONNECT_TOL. The vertex sits at
 * (vertical x, horizontal y), and its radius is the larger of the two line
 * widths, with a minimum of 1.
 *
 * @param {Array<object>} horizontals - Horizontal lines (x1 <= x2, y in y1).
 * @param {Array<object>} verticals - Vertical lines (y1 <= y2, x in x1).
 * @returns {Array<{x: number, y: number, radius: number}>} One vertex per
 *   crossing pair, horizontals in the outer iteration order.
 */
function buildVertices(horizontals, verticals) {
  const slack = CONNECT_TOL;
  const crossings = [];
  for (const hLine of horizontals) {
    for (const vLine of verticals) {
      const xInsideHorizontal = vLine.x1 >= hLine.x1 - slack && vLine.x1 <= hLine.x2 + slack;
      if (!xInsideHorizontal) continue;
      const yInsideVertical = hLine.y1 >= vLine.y1 - slack && hLine.y1 <= vLine.y2 + slack;
      if (!yInsideVertical) continue;
      crossings.push({
        x: vLine.x1,
        y: hLine.y1,
        radius: Math.max(hLine.lineWidth, vLine.lineWidth, 1)
      });
    }
  }
  return crossings;
}
3229
/**
 * Greedily clusters nearby vertices and replaces each cluster by its centroid.
 *
 * Vertices are visited in order. Each not-yet-absorbed vertex becomes a
 * seed and absorbs every later, unabsorbed vertex whose x and y both lie
 * within VERTEX_MERGE_FACTOR times the larger of the cluster's current
 * radius and the candidate's radius, measured from the seed (not from the
 * running centroid). The cluster is emitted at the mean position with the
 * largest radius seen.
 *
 * @param {Array<{x: number, y: number, radius: number}>} vertices - Input vertices.
 * @returns {Array<{x: number, y: number, radius: number}>} Merged vertices;
 *   the input array itself when it has at most one element.
 */
function mergeVertices(vertices) {
  const total = vertices.length;
  if (total <= 1) return vertices;

  const absorbed = new Set();
  const clusters = [];
  for (let seedIdx = 0; seedIdx < total; seedIdx += 1) {
    if (absorbed.has(seedIdx)) continue;
    const seed = vertices[seedIdx];
    let xTotal = seed.x;
    let yTotal = seed.y;
    let radius = seed.radius;
    let members = 1;

    for (let otherIdx = seedIdx + 1; otherIdx < total; otherIdx += 1) {
      if (absorbed.has(otherIdx)) continue;
      const other = vertices[otherIdx];
      // The reach grows as thicker vertices join the cluster.
      const reach = VERTEX_MERGE_FACTOR * Math.max(radius, other.radius);
      const closeInX = Math.abs(seed.x - other.x) <= reach;
      const closeInY = Math.abs(seed.y - other.y) <= reach;
      if (!(closeInX && closeInY)) continue;

      xTotal += other.x;
      yTotal += other.y;
      radius = Math.max(radius, other.radius);
      members += 1;
      absorbed.add(otherIdx);
    }

    clusters.push({ x: xTotal / members, y: yTotal / members, radius });
  }
  return clusters;
}
3243
3253
  function buildTableGrids(horizontals, verticals) {
3244
3254
  if (horizontals.length < 2 || verticals.length < 2) return [];
3255
+ const allVertices = buildVertices(horizontals, verticals);
3256
+ const vertices = mergeVertices(allVertices);
3257
+ if (vertices.length < 4) return [];
3258
+ const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
3245
3259
  const allLines = [
3246
3260
  ...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
3247
3261
  ...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
@@ -3252,21 +3266,74 @@ function buildTableGrids(horizontals, verticals) {
3252
3266
  const hLines = group.filter((l) => l.type === "h");
3253
3267
  const vLines = group.filter((l) => l.type === "v");
3254
3268
  if (hLines.length < 2 || vLines.length < 2) continue;
3255
- const rawYs = hLines.map((l) => l.y1);
3256
- const rowYs = clusterCoordinates(rawYs).sort((a, b) => b - a);
3257
- const rawXs = vLines.map((l) => l.x1);
3258
- const colXs = clusterCoordinates(rawXs).sort((a, b) => a - b);
3269
+ let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
3270
+ for (const l of vLines) {
3271
+ if (l.x1 < gx1) gx1 = l.x1;
3272
+ if (l.x1 > gx2) gx2 = l.x1;
3273
+ }
3274
+ for (const l of hLines) {
3275
+ if (l.y1 < gy1) gy1 = l.y1;
3276
+ if (l.y1 > gy2) gy2 = l.y1;
3277
+ }
3278
+ const groupBbox = {
3279
+ x1: gx1 - CONNECT_TOL,
3280
+ y1: gy1 - CONNECT_TOL,
3281
+ x2: gx2 + CONNECT_TOL,
3282
+ y2: gy2 + CONNECT_TOL
3283
+ };
3284
+ const groupVertices = vertices.filter(
3285
+ (v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
3286
+ );
3287
+ const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
3288
+ const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
3289
+ const rawYs = [
3290
+ ...hLines.map((l) => l.y1),
3291
+ ...groupVertices.map((v) => v.y)
3292
+ ];
3293
+ const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
3294
+ const rawXs = [
3295
+ ...vLines.map((l) => l.x1),
3296
+ ...groupVertices.map((v) => v.x)
3297
+ ];
3298
+ const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
3259
3299
  if (rowYs.length < 2 || colXs.length < 2) continue;
3300
+ const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
3301
+ const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
3302
+ if (validRowYs.length < 2 || validColXs.length < 2) continue;
3260
3303
  const bbox = {
3261
- x1: colXs[0],
3262
- y1: rowYs[rowYs.length - 1],
3263
- x2: colXs[colXs.length - 1],
3264
- y2: rowYs[0]
3304
+ x1: validColXs[0],
3305
+ y1: validRowYs[validRowYs.length - 1],
3306
+ x2: validColXs[validColXs.length - 1],
3307
+ y2: validRowYs[0]
3265
3308
  };
3266
- grids.push({ rowYs, colXs, bbox });
3309
+ grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
3267
3310
  }
3268
3311
  return mergeAdjacentGrids(grids);
3269
3312
  }
3313
/**
 * Thins out column boundaries that would create columns narrower than minWidth.
 *
 * Walks the boundaries in order and drops an interior boundary when it lies
 * less than minWidth after the last boundary kept. The first and the last
 * boundary are always kept. Inputs with two or fewer boundaries are
 * returned unchanged.
 *
 * @param {number[]} colXs - Column boundary x-coordinates, ascending.
 * @param {number} minWidth - Minimum distance between kept interior boundaries.
 * @returns {number[]} The retained boundaries.
 */
function enforceMinWidth(colXs, minWidth) {
  if (colXs.length <= 2) return colXs;

  const lastIndex = colXs.length - 1;
  const kept = [colXs[0]];
  for (let idx = 1; idx <= lastIndex; idx += 1) {
    const x = colXs[idx];
    const tooNarrow = x - kept[kept.length - 1] < minWidth;
    const isInterior = idx < lastIndex;
    if (!(tooNarrow && isInterior)) kept.push(x);
  }
  return kept;
}
3325
/**
 * Thins out row boundaries that would create rows shorter than minHeight.
 *
 * Walks the boundaries in order and drops an interior boundary when it lies
 * less than minHeight below the last boundary kept. The first and the last
 * boundary are always kept. Inputs with two or fewer boundaries are
 * returned unchanged.
 *
 * @param {number[]} rowYs - Row boundary y-coordinates, descending.
 * @param {number} minHeight - Minimum distance between kept interior boundaries.
 * @returns {number[]} The retained boundaries.
 */
function enforceMinHeight(rowYs, minHeight) {
  if (rowYs.length <= 2) return rowYs;

  const lastIndex = rowYs.length - 1;
  const kept = [rowYs[0]];
  for (let idx = 1; idx <= lastIndex; idx += 1) {
    const y = rowYs[idx];
    const tooShort = kept[kept.length - 1] - y < minHeight;
    const isInterior = idx < lastIndex;
    if (!(tooShort && isInterior)) kept.push(y);
  }
  return kept;
}
3270
3337
  function mergeAdjacentGrids(grids) {
3271
3338
  if (grids.length <= 1) return grids;
3272
3339
  const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
@@ -3275,9 +3342,10 @@ function mergeAdjacentGrids(grids) {
3275
3342
  const prev = merged[merged.length - 1];
3276
3343
  const curr = sorted[i];
3277
3344
  if (prev.colXs.length === curr.colXs.length) {
3278
- const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= COORD_MERGE_TOL * 3);
3345
+ const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
3346
+ const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
3279
3347
  const verticalGap = prev.bbox.y1 - curr.bbox.y2;
3280
- if (colMatch && verticalGap >= -COORD_MERGE_TOL && verticalGap <= 20) {
3348
+ if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
3281
3349
  const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
3282
3350
  merged[merged.length - 1] = {
3283
3351
  rowYs: allRowYs,
@@ -3287,7 +3355,8 @@ function mergeAdjacentGrids(grids) {
3287
3355
  y1: Math.min(prev.bbox.y1, curr.bbox.y1),
3288
3356
  x2: Math.max(prev.bbox.x2, curr.bbox.x2),
3289
3357
  y2: Math.max(prev.bbox.y2, curr.bbox.y2)
3290
- }
3358
+ },
3359
+ vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
3291
3360
  };
3292
3361
  continue;
3293
3362
  }
@@ -3296,14 +3365,14 @@ function mergeAdjacentGrids(grids) {
3296
3365
  }
3297
3366
  return merged;
3298
3367
  }
3299
- function clusterCoordinates(values) {
3368
+ function clusterCoordinates(values, tolerance) {
3300
3369
  if (values.length === 0) return [];
3301
3370
  const sorted = [...values].sort((a, b) => a - b);
3302
3371
  const clusters = [{ sum: sorted[0], count: 1 }];
3303
3372
  for (let i = 1; i < sorted.length; i++) {
3304
3373
  const last = clusters[clusters.length - 1];
3305
3374
  const avg = last.sum / last.count;
3306
- if (Math.abs(sorted[i] - avg) <= COORD_MERGE_TOL) {
3375
+ if (Math.abs(sorted[i] - avg) <= tolerance) {
3307
3376
  last.sum += sorted[i];
3308
3377
  last.count++;
3309
3378
  } else {
@@ -3360,6 +3429,20 @@ function extractCells(grid, horizontals, verticals) {
3360
3429
  const numRows = rowYs.length - 1;
3361
3430
  const numCols = colXs.length - 1;
3362
3431
  if (numRows <= 0 || numCols <= 0) return [];
3432
+ const vBorders = Array.from(
3433
+ { length: numRows },
3434
+ (_, r) => Array.from(
3435
+ { length: numCols + 1 },
3436
+ (_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
3437
+ )
3438
+ );
3439
+ const hBorders = Array.from(
3440
+ { length: numRows + 1 },
3441
+ (_, r) => Array.from(
3442
+ { length: numCols },
3443
+ (_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
3444
+ )
3445
+ );
3363
3446
  const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
3364
3447
  const cells = [];
3365
3448
  for (let r = 0; r < numRows; r++) {
@@ -3367,18 +3450,26 @@ function extractCells(grid, horizontals, verticals) {
3367
3450
  if (occupied[r][c]) continue;
3368
3451
  let colSpan = 1;
3369
3452
  let rowSpan = 1;
3370
- while (c + colSpan < numCols) {
3371
- const borderX = colXs[c + colSpan];
3372
- const topY = rowYs[r];
3373
- const botY = rowYs[r + 1];
3374
- if (hasVerticalLine(verticals, borderX, topY, botY)) break;
3453
+ while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
3454
+ let canExpand = true;
3455
+ for (let dr = 0; dr < rowSpan; dr++) {
3456
+ if (vBorders[r + dr][c + colSpan]) {
3457
+ canExpand = false;
3458
+ break;
3459
+ }
3460
+ }
3461
+ if (!canExpand) break;
3375
3462
  colSpan++;
3376
3463
  }
3377
3464
  while (r + rowSpan < numRows) {
3378
- const borderY = rowYs[r + rowSpan];
3379
- const leftX = colXs[c];
3380
- const rightX = colXs[c + colSpan];
3381
- if (hasHorizontalLine(horizontals, borderY, leftX, rightX)) break;
3465
+ let hasLine = false;
3466
+ for (let dc = 0; dc < colSpan; dc++) {
3467
+ if (hBorders[r + rowSpan][c + dc]) {
3468
+ hasLine = true;
3469
+ break;
3470
+ }
3471
+ }
3472
+ if (hasLine) break;
3382
3473
  rowSpan++;
3383
3474
  }
3384
3475
  for (let dr = 0; dr < rowSpan; dr++) {
@@ -3402,28 +3493,30 @@ function extractCells(grid, horizontals, verticals) {
3402
3493
  }
3403
3494
  return cells;
3404
3495
  }
3405
/**
 * Tests whether a vertical line covers the border at `x` between two rows.
 *
 * A line counts when its x is within the tolerance of `x` and it overlaps
 * at least 75% of the span between topY and botY. The tolerance is
 * VERTEX_MERGE_FACTOR * vertexRadius, but never less than 4. When
 * vertexRadius is missing or not a number, the tolerance falls back to that
 * floor of 4 instead of becoming NaN, which would reject every line.
 *
 * @param {Array<{x1: number, y1: number, y2: number}>} verticals - Vertical
 *   lines with y1 <= y2.
 * @param {number} x - x-coordinate of the border to test.
 * @param {number} topY - Upper y of the cell (the larger value).
 * @param {number} botY - Lower y of the cell.
 * @param {number} [vertexRadius] - Vertex radius of the grid, used to scale
 *   the x tolerance.
 * @returns {boolean} true when some line covers the border; false for cells
 *   whose height is below 0.1.
 */
function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
  const scaledTol = VERTEX_MERGE_FACTOR * vertexRadius;
  // undefined/NaN radius would make every comparison below fail silently.
  const tol = Number.isNaN(scaledTol) ? 4 : Math.max(scaledTol, 4);

  // The cell height does not depend on the line, so compute it once.
  const cellH = Math.abs(topY - botY);
  if (cellH < 0.1) return false;
  const requiredOverlap = cellH * 0.75;

  for (const v of verticals) {
    if (Math.abs(v.x1 - x) > tol) continue;
    const overlapTop = Math.min(v.y2, topY);
    const overlapBot = Math.max(v.y1, botY);
    if (overlapTop - overlapBot >= requiredOverlap) return true;
  }
  return false;
}
3418
- function hasHorizontalLine(horizontals, y, leftX, rightX) {
3419
- const tol = COORD_MERGE_TOL + 1;
3510
+ function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
3511
+ const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
3420
3512
  for (const h of horizontals) {
3421
3513
  if (Math.abs(h.y1 - y) <= tol) {
3422
3514
  const cellW = Math.abs(rightX - leftX);
3515
+ if (cellW < 0.1) continue;
3423
3516
  const overlapLeft = Math.max(h.x1, leftX);
3424
3517
  const overlapRight = Math.min(h.x2, rightX);
3425
3518
  const overlap = overlapRight - overlapLeft;
3426
- if (overlap >= cellW * 0.5) return true;
3519
+ if (overlap >= cellW * 0.75) return true;
3427
3520
  }
3428
3521
  }
3429
3522
  return false;
@@ -3434,23 +3527,24 @@ function mapTextToCells(items, cells) {
3434
3527
  result.set(cell, []);
3435
3528
  }
3436
3529
  for (const item of items) {
3437
- const cx = item.x + item.w / 2;
3438
- const cy = item.y;
3439
3530
  const pad = CELL_PADDING;
3440
3531
  let bestCell = null;
3441
- let bestDist = Infinity;
3532
+ let bestScore = 0;
3442
3533
  for (const cell of cells) {
3443
- if (cx >= cell.bbox.x1 - pad && cx <= cell.bbox.x2 + pad && cy >= cell.bbox.y1 - pad && cy <= cell.bbox.y2 + pad) {
3444
- const cellCx = (cell.bbox.x1 + cell.bbox.x2) / 2;
3445
- const cellCy = (cell.bbox.y1 + cell.bbox.y2) / 2;
3446
- const dist = Math.abs(cx - cellCx) + Math.abs(cy - cellCy);
3447
- if (dist < bestDist) {
3448
- bestDist = dist;
3449
- bestCell = cell;
3450
- }
3534
+ const ix1 = Math.max(item.x, cell.bbox.x1 - pad);
3535
+ const ix2 = Math.min(item.x + item.w, cell.bbox.x2 + pad);
3536
+ const iy1 = Math.max(item.y, cell.bbox.y1 - pad);
3537
+ const iy2 = Math.min(item.y + (item.h || item.fontSize), cell.bbox.y2 + pad);
3538
+ if (ix1 >= ix2 || iy1 >= iy2) continue;
3539
+ const intersectArea = (ix2 - ix1) * (iy2 - iy1);
3540
+ const itemArea = Math.max(item.w, 1) * Math.max(item.h || item.fontSize, 1);
3541
+ const score = intersectArea / itemArea;
3542
+ if (score > bestScore) {
3543
+ bestScore = score;
3544
+ bestCell = cell;
3451
3545
  }
3452
3546
  }
3453
- if (bestCell) {
3547
+ if (bestCell && bestScore > 0.3) {
3454
3548
  result.get(bestCell).push(item);
3455
3549
  }
3456
3550
  }
@@ -3477,8 +3571,13 @@ function cellTextToString(items) {
3477
3571
  const textLines = lines.map((line) => {
3478
3572
  const s = line.sort((a, b) => a.x - b.x);
3479
3573
  if (s.length === 1) return s[0].text;
3574
+ const evenSpaced = detectEvenSpacedItems(s);
3480
3575
  let result = s[0].text;
3481
3576
  for (let j = 1; j < s.length; j++) {
3577
+ if (evenSpaced[j]) {
3578
+ result += s[j].text;
3579
+ continue;
3580
+ }
3482
3581
  const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
3483
3582
  const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
3484
3583
  const prevIsKorean = /[가-힣]$/.test(result);
@@ -3493,6 +3592,57 @@ function cellTextToString(items) {
3493
3592
  }
3494
3593
  return result;
3495
3594
  });
3595
+ return mergeCellTextLines(textLines);
3596
+ }
3597
/**
 * Flags items that belong to a run of single-character tokens (one Hangul
 * syllable or one digit) so the caller can join them without separators.
 *
 * A run is a stretch of at least three consecutive single-character items.
 * It is split whenever the gap to the previous item exceeds
 * `max(fontSize * 3, 30)`. Each finished run is handed to `markEvenRun`,
 * which decides whether its members actually get flagged.
 *
 * @param {Array<{text: string, x: number, w: number, fontSize: number}>} items
 *   Items of one line, in the order the caller wants them scanned.
 * @returns {boolean[]} One flag per item; all false when fewer than 3 items.
 */
function detectEvenSpacedItems(items) {
  const flags = new Array(items.length).fill(false);
  if (items.length < 3) return flags;

  const isSingleGlyph = (text) => /^[가-힣]{1}$/.test(text) || /^[\d]{1}$/.test(text);
  // Hands [from, to) to markEvenRun when it is an open run of 3+ items.
  const closeRun = (from, to) => {
    if (from >= 0 && to - from >= 3) markEvenRun(items, flags, from, to);
  };

  let runStart = -1;
  for (let idx = 0; idx < items.length; idx++) {
    const current = items[idx];
    if (!isSingleGlyph(current.text)) {
      closeRun(runStart, idx);
      runStart = -1;
      continue;
    }
    if (runStart < 0) {
      runStart = idx;
      continue;
    }
    const previous = items[idx - 1];
    const gap = current.x - (previous.x + previous.w);
    if (gap > Math.max(current.fontSize * 3, 30)) {
      // Too far from its neighbour: end the run here and start a new one.
      closeRun(runStart, idx);
      runStart = idx;
    }
  }
  closeRun(runStart, items.length);
  return flags;
}
3626
/**
 * Sets `result[i] = true` for every item after the first in `[start, end)`
 * when the positive gaps inside that range are evenly sized.
 *
 * "Evenly sized" means: at least two positive gaps exist, the smallest is at
 * least 10% of the first item's font size, the largest is at most 3x that
 * font size, and largest / smallest (smallest floored at 0.1) is at most 3.
 * Non-positive gaps are ignored when measuring.
 *
 * @param {Array<{x: number, w: number, fontSize: number}>} items Line items.
 * @param {boolean[]} result Flag array, mutated in place.
 * @param {number} start Index of the first item in the run (inclusive).
 * @param {number} end Index just past the run (exclusive).
 * @returns {void}
 */
function markEvenRun(items, result, start, end) {
  const positiveGaps = [];
  for (let k = start + 1; k < end; k++) {
    const left = items[k - 1];
    const gap = items[k].x - (left.x + left.w);
    if (gap > 0) positiveGaps.push(gap);
  }
  if (positiveGaps.length < 2) return;

  const smallest = positiveGaps.reduce((acc, g) => (g < acc ? g : acc), Infinity);
  const largest = positiveGaps.reduce((acc, g) => (g > acc ? g : acc), -Infinity);
  // The run's first item supplies the reference font size.
  const refFontSize = items[start].fontSize;

  const evenlySpaced =
    smallest >= refFontSize * 0.1 &&
    largest <= refFontSize * 3 &&
    largest / Math.max(smallest, 0.1) <= 3;
  if (!evenlySpaced) return;

  for (let k = start + 1; k < end; k++) {
    result[k] = true;
  }
}
3645
+ function mergeCellTextLines(textLines) {
3496
3646
  if (textLines.length <= 1) return textLines[0] || "";
3497
3647
  const merged = [textLines[0]];
3498
3648
  for (let i = 1; i < textLines.length; i++) {
@@ -3518,24 +3668,172 @@ var Y_TOL = 3;
3518
3668
  var COL_CLUSTER_TOL = 15;
3519
3669
  var MIN_ROWS = 3;
3520
3670
  var MIN_COLS = 2;
3521
- var MIN_GAP_FACTOR = 1.5;
3522
- var MIN_COL_FILL_RATIO = 0.3;
3671
+ var MIN_GAP_FACTOR = 2;
3672
+ var MIN_GAP_ABSOLUTE = 20;
3673
+ var MIN_COL_FILL_RATIO = 0.4;
3523
3674
  function detectClusterTables(items, pageNum) {
3524
3675
  if (items.length < MIN_ROWS * MIN_COLS) return [];
3525
- const rows = groupByBaseline(items);
3676
+ const { merged, originMap } = mergeEvenSpacedClusters(items);
3677
+ const rows = groupByBaseline(merged);
3526
3678
  if (rows.length < MIN_ROWS) return [];
3527
- const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
3528
- if (suspiciousRows.length < MIN_ROWS) return [];
3529
- const columns = extractColumnClusters(suspiciousRows);
3530
- if (columns.length < MIN_COLS) return [];
3531
- const tableRegions = findTableRegions(rows, columns);
3532
3679
  const results = [];
3533
- for (const region of tableRegions) {
3534
- const table = buildClusterTable(region.rows, columns, pageNum);
3535
- if (table) results.push(table);
3680
+ const headerResult = detectHeaderRow(rows);
3681
+ if (headerResult) {
3682
+ const { columns, headerIdx } = headerResult;
3683
+ const headerRow = rows[headerIdx];
3684
+ const headerItems = [...headerRow.items].sort((a, b) => a.x - b.x);
3685
+ const headerAndBelow = rows.slice(headerIdx);
3686
+ const mergedRows = mergeMultiLineRows(headerAndBelow, columns);
3687
+ const tableRegions = findTableRegionsByHeader(mergedRows, columns, headerItems);
3688
+ for (const region of tableRegions) {
3689
+ const table = buildClusterTable(region.rows, columns, pageNum);
3690
+ if (table) {
3691
+ expandUsedItems(table.usedItems, originMap);
3692
+ results.push(table);
3693
+ }
3694
+ }
3695
+ }
3696
+ if (results.length === 0) {
3697
+ const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
3698
+ if (suspiciousRows.length >= MIN_ROWS) {
3699
+ const columns = extractColumnClusters(suspiciousRows);
3700
+ if (columns.length >= MIN_COLS) {
3701
+ const tableRegions = findTableRegions(rows, columns);
3702
+ for (const region of tableRegions) {
3703
+ const mergedRows = mergeMultiLineRows(region.rows, columns);
3704
+ const table = buildClusterTable(mergedRows, columns, pageNum);
3705
+ if (table) {
3706
+ expandUsedItems(table.usedItems, originMap);
3707
+ results.push(table);
3708
+ }
3709
+ }
3710
+ }
3711
+ }
3536
3712
  }
3537
3713
  return results;
3538
3714
  }
3715
/**
 * Collapses runs of evenly spaced single-character items (one Hangul
 * syllable or one digit) into a single synthetic item per run.
 *
 * Items are grouped into rows with `groupByBaseline` and scanned left to
 * right. A run needs at least three members, every gap between 10% and 300%
 * of the following item's font size, and a largest/smallest gap ratio of at
 * most 3. The synthetic item copies y, h, fontSize and fontName from the
 * first member and spans from the first member's left edge to the last
 * member's right edge.
 *
 * @param {Array<object>} items Positioned text items.
 * @returns {{merged: Array<object>, originMap: Map<object, Array<object>>}}
 *   `merged` is the new item list; `originMap` maps each synthetic item to
 *   the original items it replaced.
 */
function mergeEvenSpacedClusters(items) {
  const originMap = new Map();
  const merged = [];
  const isSingleGlyph = (item) => /^[가-힣\d]$/.test(item.text);
  const gapBefore = (line, k) => line[k].x - (line[k - 1].x + line[k - 1].w);

  for (const row of groupByBaseline(items)) {
    const line = [...row.items].sort((a, b) => a.x - b.x);
    let pos = 0;
    while (pos < line.length) {
      // Extend the candidate run while the neighbours stay single glyphs
      // and the gaps stay within the per-item font-size bounds.
      let runEnd = pos;
      if (isSingleGlyph(line[pos])) {
        runEnd = pos + 1;
        while (runEnd < line.length && isSingleGlyph(line[runEnd])) {
          const gap = gapBefore(line, runEnd);
          const fs = line[runEnd].fontSize;
          if (gap < fs * 0.1 || gap > fs * 3) break;
          runEnd++;
        }
      }

      let collapsed = false;
      if (runEnd - pos >= 3) {
        let smallest = Infinity;
        let largest = -Infinity;
        for (let k = pos + 1; k < runEnd; k++) {
          const gap = gapBefore(line, k);
          if (gap < smallest) smallest = gap;
          if (gap > largest) largest = gap;
        }
        if (smallest > 0 && largest / smallest <= 3) {
          const run = line.slice(pos, runEnd);
          const head = run[0];
          const tail = run[run.length - 1];
          const synthetic = {
            text: run.map((member) => member.text).join(""),
            x: head.x,
            y: head.y,
            w: tail.x + tail.w - head.x,
            h: head.h,
            fontSize: head.fontSize,
            fontName: head.fontName
          };
          originMap.set(synthetic, run);
          merged.push(synthetic);
          pos = runEnd;
          collapsed = true;
        }
      }

      if (!collapsed) {
        // Not part of a mergeable run: keep the item and rescan from the next one.
        merged.push(line[pos]);
        pos++;
      }
    }
  }
  return { merged, originMap };
}
3767
/**
 * Adds to `usedItems` the original items behind every merged item it
 * already contains.
 *
 * Only the entries present when the call starts are looked up; items added
 * by this call are not themselves expanded.
 *
 * @param {Set<object>} usedItems Set of consumed items, mutated in place.
 * @param {Map<object, Iterable<object>>} originMap Merged item -> originals.
 * @returns {void}
 */
function expandUsedItems(usedItems, originMap) {
  // Iterate over a snapshot so that additions do not extend the iteration.
  const snapshot = Array.from(usedItems);
  for (const entry of snapshot) {
    const originals = originMap.get(entry);
    if (!originals) continue;
    for (const original of originals) {
      usedItems.add(original);
    }
  }
}
3775
/**
 * Looks for the first row that qualifies as a table header and derives
 * column positions from it.
 *
 * A candidate row must: have between MIN_COLS and 6 items; contain no item
 * longer than 8 characters; contain at least one Hangul syllable; contain no
 * item starting with a bullet-like marker; span at least 40% of the overall
 * horizontal extent of all items; and contain at least one gap of 2.5x the
 * row's average font size. It is accepted once at least MIN_ROWS of the
 * following rows match MIN_COLS or more of its columns (counting stops at
 * MIN_ROWS + 2 matches).
 *
 * @param {Array<{items: Array<object>}>} rows Rows in reading order.
 * @returns {{columns: Array<{x: number, count: number}>, headerIdx: number} | null}
 *   Column descriptors (left x of each header item, count 0) and the header
 *   row index, or null when no row qualifies.
 */
function detectHeaderRow(rows) {
  const everyItem = rows.flatMap((row) => row.items);
  if (everyItem.length === 0) return null;

  let leftEdge = Infinity;
  let rightEdge = -Infinity;
  for (const item of everyItem) {
    if (item.x < leftEdge) leftEdge = item.x;
    const itemRight = item.x + item.w;
    if (itemRight > rightEdge) rightEdge = itemRight;
  }
  const pageSpan = rightEdge - leftEdge;
  if (pageSpan <= 0) return null;

  for (let ri = 0; ri < rows.length; ri++) {
    const candidate = rows[ri].items;
    if (candidate.length < MIN_COLS || candidate.length > 6) continue;
    if (candidate.some((item) => item.text.length > 8)) continue;
    if (!candidate.some((item) => /[가-힣]/.test(item.text))) continue;
    if (candidate.some((item) => /^[□■○●·※▶▷◆◇\-]/.test(item.text))) continue;

    const ordered = [...candidate].sort((a, b) => a.x - b.x);
    const rightmost = ordered[ordered.length - 1];
    const xSpan = rightmost.x + rightmost.w - ordered[0].x;
    if (xSpan / pageSpan < 0.4) continue;

    let fontSizeSum = 0;
    for (const item of ordered) fontSizeSum += item.fontSize;
    const avgFs = fontSizeSum / ordered.length;
    const hasLargeGap = ordered.some(
      (item, k) => k > 0 && item.x - (ordered[k - 1].x + ordered[k - 1].w) >= avgFs * 2.5
    );
    if (!hasLargeGap) continue;

    const columns = ordered.map((item) => ({ x: item.x, count: 0 }));
    // Count following rows that line up with the header columns.
    let matchCount = 0;
    for (let j = ri + 1; j < rows.length && matchCount < MIN_ROWS + 2; j++) {
      if (countMatchedColumnsRange(rows[j], columns, ordered) >= MIN_COLS) matchCount++;
    }
    if (matchCount < MIN_ROWS) continue;
    return { columns, headerIdx: ri };
  }
  return null;
}
3816
/**
 * Folds short continuation rows into the row above them.
 *
 * A row is folded into the previously kept row when it is vertically closer
 * than 1.8x the average font size, has at most two items, and either matches
 * fewer than MIN_COLS columns or consists of a single item. The combined row
 * keeps the earlier row's `y`.
 *
 * @param {Array<{y: number, items: Array<object>}>} rows Rows in order.
 * @param {Array<object>} columns Column descriptors passed to `countMatchedColumns`.
 * @returns {Array<{y: number, items: Array<object>}>} The input array itself
 *   when it has at most one row, otherwise a new array.
 */
function mergeMultiLineRows(rows, columns) {
  if (rows.length <= 1) return rows;

  // Average font size over every item; 12 when there are no items at all.
  let fontSizeSum = 0;
  let fontSizeCount = 0;
  for (const row of rows) {
    for (const item of row.items) {
      fontSizeSum += item.fontSize;
      fontSizeCount++;
    }
  }
  const avgFontSize = fontSizeCount > 0 ? fontSizeSum / fontSizeCount : 12;
  const maxYGap = avgFontSize * 1.8;

  const kept = [rows[0]];
  for (const curr of rows.slice(1)) {
    const prev = kept[kept.length - 1];
    const yGap = Math.abs(prev.y - curr.y);
    const matchedCols = countMatchedColumns(curr, columns);
    const itemCount = curr.items.length;
    const isContinuation =
      yGap < maxYGap && itemCount <= 2 && (matchedCols < MIN_COLS || itemCount === 1);
    if (isContinuation) {
      kept[kept.length - 1] = { y: prev.y, items: [...prev.items, ...curr.items] };
    } else {
      kept.push(curr);
    }
  }
  return kept;
}
3539
3837
  function groupByBaseline(items) {
3540
3838
  if (items.length === 0) return [];
3541
3839
  const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
@@ -3557,8 +3855,9 @@ function groupByBaseline(items) {
3557
3855
  function hasSuspiciousGaps(row) {
3558
3856
  if (row.items.length < 2) return false;
3559
3857
  const sorted = [...row.items].sort((a, b) => a.x - b.x);
3858
+ if (sorted.length === 2 && sorted[1].text.length > 20) return false;
3560
3859
  const avgFontSize = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
3561
- const minGap = avgFontSize * MIN_GAP_FACTOR;
3860
+ const minGap = Math.max(avgFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
3562
3861
  for (let i = 1; i < sorted.length; i++) {
3563
3862
  const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
3564
3863
  if (gap >= minGap) return true;
@@ -3585,6 +3884,41 @@ function extractColumnClusters(rows) {
3585
3884
  const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
3586
3885
  return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
3587
3886
  }
3887
/**
 * Splits rows into table regions using header-derived column ranges.
 *
 * A row that matches at least MIN_COLS header columns extends the open
 * region. A non-matching row is tolerated inside an open region when it has
 * at most two items or directly follows a matching row. Any other row closes
 * the region and is itself discarded. When a region closes, trailing
 * non-matching rows are trimmed, and the region is kept only if at least
 * MIN_ROWS rows remain.
 *
 * @param {Array<{items: Array<object>}>} allRows Rows in order.
 * @param {Array<object>} columns Column descriptors (forwarded to the matcher).
 * @param {Array<object>} headerItems Header items sorted by x.
 * @returns {Array<{rows: Array<object>}>} Detected regions.
 */
function findTableRegionsByHeader(allRows, columns, headerItems) {
  const fitsHeader = (row) => countMatchedColumnsRange(row, columns, headerItems) >= MIN_COLS;
  const regions = [];
  let open = [];
  let missStreak = 0;

  // Trims trailing non-matching rows, keeps the region if it is large enough,
  // then resets the scan state.
  const closeRegion = () => {
    while (open.length > 0 && !fitsHeader(open[open.length - 1])) {
      open.pop();
    }
    if (open.length >= MIN_ROWS) regions.push({ rows: open });
    open = [];
    missStreak = 0;
  };

  for (const row of allRows) {
    if (fitsHeader(row)) {
      open.push(row);
      missStreak = 0;
      continue;
    }
    const tolerated = open.length > 0 && (row.items.length <= 2 || missStreak === 0);
    if (tolerated) {
      open.push(row);
      missStreak++;
    } else {
      closeRegion();
    }
  }
  closeRegion();
  return regions;
}
3588
3922
  function findTableRegions(allRows, columns) {
3589
3923
  const regions = [];
3590
3924
  let currentRegion = [];
@@ -3620,18 +3954,81 @@ function countMatchedColumns(row, columns) {
3620
3954
  }
3621
3955
  return matched.size;
3622
3956
  }
3623
- function assignToColumn(item, columns) {
3624
- const MAX_DIST = COL_CLUSTER_TOL * 3;
3625
- let bestCol = -1;
3626
- let bestDist = Infinity;
3627
- for (let ci = 0; ci < columns.length; ci++) {
3628
- const dist = Math.abs(item.x - columns[ci].x);
3629
- if (dist < bestDist) {
3630
- bestDist = dist;
3631
- bestCol = ci;
3957
/**
 * Counts how many header-defined column ranges contain at least one item of
 * the row.
 *
 * Each column's range runs from the midpoint between the previous header's
 * right edge and this header's left edge, to the midpoint between this
 * header's right edge and the next header's left edge. The outermost ranges
 * are open-ended on both sides, so an item left of the first header
 * (including one at a negative x) counts for the first column, just as an
 * item right of the last header counts for the last column.
 *
 * @param {{items: Array<{x: number}>}} row Row whose items are tested by left x.
 * @param {Array<object>} columns Unused; kept so the signature matches callers.
 * @param {Array<{x: number, w: number}>} headerItems Header items sorted by x.
 * @returns {number} Number of distinct columns hit (0 when there are no headers).
 */
function countMatchedColumnsRange(row, columns, headerItems) {
  const lastIdx = headerItems.length - 1;
  const bounds = headerItems.map((header, ci) => {
    const left = ci === 0
      ? -Infinity
      : (headerItems[ci - 1].x + headerItems[ci - 1].w + header.x) / 2;
    const right = ci === lastIdx
      ? Infinity
      : (header.x + header.w + headerItems[ci + 1].x) / 2;
    return { left, right };
  });

  const matched = new Set();
  for (const item of row.items) {
    // First range that contains the item's left edge wins.
    const ci = bounds.findIndex((b) => item.x >= b.left && item.x < b.right);
    if (ci >= 0) matched.add(ci);
  }
  return matched.size;
}
3975
/**
 * Splits a row's items into groups at its widest gaps and assigns each group
 * to a column.
 *
 * Items are sorted by x. A gap is a split point when it is at least the
 * threshold (12, or max(2.5 x median gap, 12) when the row has more than
 * numCols + 1 items); only the numCols - 1 widest split points are kept.
 * Groups are matched to columns greedily by increasing distance between the
 * group's horizontal midpoint and the column's `x`, one group per column.
 * Any group still unassigned goes to its nearest column.
 *
 * NOTE(review): group midpoints are compared against `columns[i].x`; if that
 * value is a left edge rather than a centre, wide groups may drift to the
 * next column — confirm against the column producers.
 *
 * @param {Array<{x: number, w: number}>} items Items of one row.
 * @param {Array<{x: number}>} columns Column descriptors.
 * @param {number} numCols Number of columns to assign into.
 * @returns {Array<{col: number, items: Array<object>}>} Assignments in the
 *   order they were made (closest pairs first), or [] for an empty row.
 */
function assignRowItems(items, columns, numCols) {
  if (items.length === 0) return [];
  const byX = [...items].sort((a, b) => a.x - b.x);
  const columnXs = columns.map((column) => column.x);

  // Gap in front of every item except the first.
  const gaps = byX.slice(1).map((item, k) => ({
    idx: k + 1,
    size: item.x - (byX[k].x + byX[k].w)
  }));
  const ascendingSizes = gaps.map((gap) => gap.size).sort((a, b) => a - b);
  const medianGap = ascendingSizes.length > 0
    ? ascendingSizes[Math.floor(ascendingSizes.length / 2)]
    : 0;
  const gapThreshold = byX.length <= numCols + 1 ? 12 : Math.max(medianGap * 2.5, 12);

  // Keep the widest qualifying gaps, then restore left-to-right order.
  const cutPoints = gaps
    .filter((gap) => gap.size >= gapThreshold)
    .sort((a, b) => b.size - a.size)
    .slice(0, numCols - 1)
    .sort((a, b) => a.idx - b.idx)
    .map((gap) => gap.idx);

  const groups = [];
  let from = 0;
  for (const cut of cutPoints) {
    groups.push(byX.slice(from, cut));
    from = cut;
  }
  groups.push(byX.slice(from));

  const midpoints = groups.map((group) => {
    let lo = Infinity;
    let hi = -Infinity;
    for (const member of group) {
      if (member.x < lo) lo = member.x;
      const memberRight = member.x + member.w;
      if (memberRight > hi) hi = memberRight;
    }
    return (lo + hi) / 2;
  });
  const distance = (gi, ci) => Math.abs(midpoints[gi] - columnXs[ci]);

  // Every (group, column) pair, closest first.
  const candidates = [];
  groups.forEach((_, gi) => {
    for (let ci = 0; ci < numCols; ci++) {
      candidates.push({ gi, ci, dist: distance(gi, ci) });
    }
  });
  candidates.sort((a, b) => a.dist - b.dist);

  const placed = [];
  const takenCols = new Set();
  const placedGroups = new Set();
  for (const { gi, ci } of candidates) {
    if (placedGroups.has(gi) || takenCols.has(ci)) continue;
    placed.push({ col: ci, items: groups[gi] });
    placedGroups.add(gi);
    takenCols.add(ci);
  }

  // Leftover groups share whichever column is nearest.
  groups.forEach((group, gi) => {
    if (placedGroups.has(gi)) return;
    let bestCol = 0;
    let bestDist = Infinity;
    for (let ci = 0; ci < numCols; ci++) {
      const d = distance(gi, ci);
      if (d < bestDist) {
        bestDist = d;
        bestCol = ci;
      }
    }
    placed.push({ col: bestCol, items: group });
  });
  return placed;
}
3636
4033
  function buildClusterTable(rows, columns, pageNum) {
3637
4034
  const numCols = columns.length;
@@ -3649,12 +4046,12 @@ function buildClusterTable(rows, columns, pageNum) {
3649
4046
  usedItems.add(row.items[0]);
3650
4047
  continue;
3651
4048
  }
3652
- for (const item of row.items) {
3653
- const col = assignToColumn(item, columns);
3654
- if (col < 0) continue;
4049
+ const assignments = assignRowItems(row.items, columns, numCols);
4050
+ for (const { col, items } of assignments) {
4051
+ const text = items.map((i) => i.text).join(" ");
3655
4052
  const existing = cells[r][col].text;
3656
- cells[r][col].text = existing ? existing + " " + item.text : item.text;
3657
- usedItems.add(item);
4053
+ cells[r][col].text = existing ? existing + " " + text : text;
4054
+ for (const item of items) usedItems.add(item);
3658
4055
  }
3659
4056
  }
3660
4057
  let emptyRows = 0;
@@ -3666,11 +4063,48 @@ function buildClusterTable(rows, columns, pageNum) {
3666
4063
  const hasValue = cells.some((row) => row[c].text !== "");
3667
4064
  if (!hasValue) return null;
3668
4065
  }
4066
+ for (let r = numRows - 1; r >= 1; r--) {
4067
+ const nonEmptyCols = cells[r].filter((c) => c.text.trim()).length;
4068
+ if (nonEmptyCols !== 1) continue;
4069
+ if (cells[r][0].text.trim() !== "") continue;
4070
+ const contentText = cells[r].find((c) => c.text.trim())?.text.trim() || "";
4071
+ if (/^[○●▶\-·]/.test(contentText)) continue;
4072
+ for (let pr = r - 1; pr >= 0; pr--) {
4073
+ if (cells[pr].some((c) => c.text.trim())) {
4074
+ for (let c = 0; c < numCols; c++) {
4075
+ const prev = cells[pr][c].text.trim();
4076
+ const curr = cells[r][c].text.trim();
4077
+ if (curr) cells[pr][c].text = prev ? prev + " " + curr : curr;
4078
+ }
4079
+ for (let c = 0; c < numCols; c++) cells[r][c].text = "";
4080
+ break;
4081
+ }
4082
+ }
4083
+ }
4084
+ for (let r = 0; r < cells.length - 1; r++) {
4085
+ const row = cells[r];
4086
+ const hasCol0 = row[0].text.trim() !== "";
4087
+ const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
4088
+ const midEmpty = row.slice(1, numCols - 1).every((c) => c.text.trim() === "");
4089
+ if (hasCol0 && hasColLast && midEmpty) {
4090
+ const next = cells[r + 1];
4091
+ if (next[0].text.trim() === "" && next.some((c) => c.text.trim())) {
4092
+ for (let c = 1; c < numCols; c++) {
4093
+ const curr = next[c].text.trim();
4094
+ if (curr) row[c].text = row[c].text.trim() ? row[c].text.trim() + " " + curr : curr;
4095
+ }
4096
+ for (let c = 0; c < numCols; c++) next[c].text = "";
4097
+ }
4098
+ }
4099
+ }
4100
+ const filteredCells = cells.filter((row) => row.some((c) => c.text.trim()));
4101
+ const finalRowCount = filteredCells.length;
4102
+ if (finalRowCount < MIN_ROWS) return null;
3669
4103
  const irTable = {
3670
- rows: numRows,
4104
+ rows: finalRowCount,
3671
4105
  cols: numCols,
3672
- cells,
3673
- hasHeader: numRows > 1
4106
+ cells: filteredCells,
4107
+ hasHeader: finalRowCount > 1
3674
4108
  };
3675
4109
  const allItems = rows.flatMap((r) => r.items);
3676
4110
  let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
@@ -3747,7 +4181,7 @@ async function parsePdfDocument(buffer, options) {
3747
4181
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
3748
4182
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
3749
4183
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
3750
- const allFontSizes = [];
4184
+ const fontSizeFreq = /* @__PURE__ */ new Map();
3751
4185
  const pageHeights = /* @__PURE__ */ new Map();
3752
4186
  let parsedPages = 0;
3753
4187
  for (let i = 1; i <= effectivePageCount; i++) {
@@ -3764,7 +4198,7 @@ async function parsePdfDocument(buffer, options) {
3764
4198
  warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
3765
4199
  }
3766
4200
  for (const item of visible) {
3767
- if (item.fontSize > 0) allFontSizes.push(item.fontSize);
4201
+ if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
3768
4202
  }
3769
4203
  const opList = await page.getOperatorList();
3770
4204
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
@@ -3803,10 +4237,9 @@ async function parsePdfDocument(buffer, options) {
3803
4237
  blocks.splice(removed[ri], 1);
3804
4238
  }
3805
4239
  }
3806
- const medianFontSize = computeMedianFontSize(allFontSizes);
4240
+ const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
3807
4241
  if (medianFontSize > 0) {
3808
4242
  detectHeadings(blocks, medianFontSize);
3809
- mergeAdjacentHeadings(blocks);
3810
4243
  }
3811
4244
  detectMarkerHeadings(blocks);
3812
4245
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
@@ -3857,11 +4290,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
3857
4290
  }
3858
4291
  return { visible, hiddenCount };
3859
4292
  }
3860
- function computeMedianFontSize(sizes) {
3861
- if (sizes.length === 0) return 0;
3862
- const sorted = [...sizes].sort((a, b) => a - b);
3863
- const mid = Math.floor(sorted.length / 2);
3864
- return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
4293
/**
 * Computes the median font size from a size -> occurrence-count histogram,
 * without expanding the histogram into a flat list.
 *
 * For an odd number of samples the middle sample is returned. For an even
 * number the two middle samples are averaged, matching the usual definition
 * of a median (and the list-based `computeMedianFontSize` this replaces).
 *
 * @param {Map<number, number>} freq Font size -> number of occurrences.
 * @returns {number} The median size, or 0 when the histogram is empty.
 */
function computeMedianFontSizeFromFreq(freq) {
  if (freq.size === 0) return 0;
  const sorted = [...freq.entries()].sort((a, b) => a[0] - b[0]);
  let total = 0;
  for (const [, count] of sorted) total += count;

  // Zero-based positions of the middle sample(s) in the sorted sample list.
  const upperIdx = Math.floor(total / 2);
  const lowerIdx = total % 2 === 0 ? upperIdx - 1 : upperIdx;

  let lowerSize;
  let cumulative = 0;
  for (const [size, count] of sorted) {
    cumulative += count;
    if (lowerSize === undefined && cumulative > lowerIdx) lowerSize = size;
    if (cumulative > upperIdx) return (lowerSize + size) / 2;
  }
  // Only reachable when the counts do not add up to a positive total.
  return sorted[sorted.length - 1][0];
}
3866
4306
  function detectHeadings(blocks, medianFontSize) {
3867
4307
  for (const block of blocks) {
@@ -3881,220 +4321,27 @@ function detectHeadings(blocks, medianFontSize) {
3881
4321
  }
3882
4322
  }
3883
4323
  }
3884
- function mergeAdjacentHeadings(blocks) {
3885
- let i = 0;
3886
- while (i < blocks.length - 1) {
3887
- const curr = blocks[i];
3888
- const next = blocks[i + 1];
3889
- if (curr.type !== "heading" || next.type !== "heading") {
3890
- i++;
3891
- continue;
3892
- }
3893
- if (!curr.bbox || !next.bbox || !curr.text || !next.text) {
3894
- i++;
3895
- continue;
3896
- }
3897
- const currBaseline = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
3898
- const nextBaseline = next.bbox.y + (next.style?.fontSize || next.bbox.height);
3899
- const yDiff = Math.abs(currBaseline - nextBaseline);
3900
- const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
3901
- const sameY = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
3902
- const sameLevel = curr.level === next.level;
3903
- if (sameY && sameLevel) {
3904
- const currX = curr.bbox.x;
3905
- const nextX = next.bbox.x;
3906
- if (currX <= nextX) {
3907
- curr.text = curr.text + " " + next.text;
3908
- } else {
3909
- curr.text = next.text + " " + curr.text;
3910
- }
3911
- curr.bbox = {
3912
- page: curr.bbox.page,
3913
- x: Math.min(curr.bbox.x, next.bbox.x),
3914
- y: Math.min(curr.bbox.y, next.bbox.y),
3915
- width: Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width) - Math.min(curr.bbox.x, next.bbox.x),
3916
- height: Math.max(curr.bbox.height, next.bbox.height)
3917
- };
3918
- blocks.splice(i + 1, 1);
3919
- } else {
3920
- i++;
3921
- }
3922
- }
3923
- }
3924
4324
  function collapseEvenSpacing(text) {
3925
4325
  const tokens = text.split(" ");
3926
4326
  const singleCharCount = tokens.filter((t) => t.length === 1).length;
3927
4327
  if (tokens.length >= 3 && singleCharCount / tokens.length >= 0.7) {
3928
4328
  return tokens.join("");
3929
4329
  }
3930
- return text;
3931
- }
3932
- function buildXyCutBlocks(items, pageNum) {
3933
- const allY = items.map((i) => i.y);
3934
- const pageHeight = Math.max(...allY) - Math.min(...allY);
3935
- const gapThreshold = Math.max(15, pageHeight * 0.03);
3936
- const orderedGroups = xyCutOrder(items, gapThreshold);
3937
- const blocks = [];
3938
- for (const group of orderedGroups) {
3939
- if (group.length === 0) continue;
3940
- const yLines = groupByY(group);
3941
- for (const line of yLines) {
3942
- const text = mergeLineSimple(line);
3943
- if (!text.trim()) continue;
3944
- blocks.push({
3945
- type: "paragraph",
3946
- text,
3947
- pageNumber: pageNum,
3948
- bbox: computeBBox(line, pageNum),
3949
- style: dominantStyle(line)
3950
- });
3951
- }
3952
- }
3953
- return blocks.length > 0 ? blocks : null;
3954
- }
3955
- function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
3956
- const totalCells = table.cells.reduce((sum, row) => sum + row.filter((c) => c.text.trim()).length, 0);
3957
- const totalTextLines = table.cells.reduce((sum, row) => sum + row.reduce((s, c) => s + (c.text.trim() ? c.text.split("\n").length : 0), 0), 0);
3958
- const isUnderSegmented = table.rows === 1 && table.cols === 1 || totalCells <= 2 && totalTextLines >= 8 || totalCells <= 2 && items.length >= 6;
3959
- if (!isUnderSegmented) return null;
3960
- if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);
3961
- const directTable = buildTableFromTextLayout(items, pageNum, bbox);
3962
- if (directTable) return directTable;
3963
- const clusterItems = items.map((i) => ({
3964
- text: i.text,
3965
- x: i.x,
3966
- y: i.y,
3967
- w: i.w,
3968
- h: i.h,
3969
- fontSize: i.fontSize,
3970
- fontName: i.fontName
3971
- }));
3972
- const clusterResults = detectClusterTables(clusterItems, pageNum);
3973
- if (clusterResults.length > 0) {
3974
- const blocks = [];
3975
- const ciToIdx = /* @__PURE__ */ new Map();
3976
- for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
3977
- const usedIndices = /* @__PURE__ */ new Set();
3978
- for (const cr of clusterResults) {
3979
- for (const ci of cr.usedItems) {
3980
- const idx = ciToIdx.get(ci);
3981
- if (idx !== void 0) usedIndices.add(idx);
3982
- }
3983
- blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
3984
- }
3985
- const remaining = items.filter((_, idx) => !usedIndices.has(idx));
3986
- for (const item of remaining) {
3987
- if (!item.text.trim()) continue;
3988
- blocks.push({
3989
- type: "paragraph",
3990
- text: item.text,
3991
- pageNumber: pageNum,
3992
- bbox: computeBBox([item], pageNum),
3993
- style: { fontSize: item.fontSize, fontName: item.fontName }
3994
- });
3995
- }
3996
- blocks.sort((a, b) => {
3997
- const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
3998
- const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
3999
- return by - ay;
4000
- });
4001
- return blocks.length > 0 ? blocks : null;
4002
- }
4003
- return null;
4004
- }
4005
- function buildTableFromTextLayout(items, pageNum, bbox) {
4006
- if (items.length < 4) return null;
4007
- const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
4008
- const yTol = 3;
4009
- const rows = [];
4010
- let curRow = [sorted[0]];
4011
- let curY = sorted[0].y;
4012
- for (let i = 1; i < sorted.length; i++) {
4013
- if (Math.abs(sorted[i].y - curY) <= yTol) {
4014
- curRow.push(sorted[i]);
4015
- } else {
4016
- rows.push(curRow);
4017
- curRow = [sorted[i]];
4018
- curY = sorted[i].y;
4019
- }
4020
- }
4021
- rows.push(curRow);
4022
- if (rows.length < 2) return null;
4023
- const gapPositions = [];
4024
- for (const row of rows) {
4025
- if (row.length < 2) continue;
4026
- const sortedX = [...row].sort((a, b) => a.x - b.x);
4027
- const avgFs = sortedX.reduce((s, i) => s + i.fontSize, 0) / sortedX.length;
4028
- for (let j = 1; j < sortedX.length; j++) {
4029
- const gap = sortedX[j].x - (sortedX[j - 1].x + sortedX[j - 1].w);
4030
- if (gap >= avgFs * 1.5) {
4031
- gapPositions.push(sortedX[j - 1].x + sortedX[j - 1].w + gap / 2);
4032
- }
4033
- }
4034
- }
4035
- if (gapPositions.length < 2) return null;
4036
- gapPositions.sort((a, b) => a - b);
4037
- const colBoundaries = [];
4038
- let clusterSum = gapPositions[0], clusterCount = 1;
4039
- for (let i = 1; i < gapPositions.length; i++) {
4040
- const avg = clusterSum / clusterCount;
4041
- if (Math.abs(gapPositions[i] - avg) <= 15) {
4042
- clusterSum += gapPositions[i];
4043
- clusterCount++;
4044
- } else {
4045
- if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
4046
- clusterSum = gapPositions[i];
4047
- clusterCount = 1;
4048
- }
4049
- }
4050
- if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
4051
- if (colBoundaries.length === 0) return null;
4052
- const numCols = colBoundaries.length + 1;
4053
- const tableRows = [];
4054
- for (const row of rows) {
4055
- const cells = Array(numCols).fill("");
4056
- const sortedX = [...row].sort((a, b) => a.x - b.x);
4057
- for (const item of sortedX) {
4058
- const cx = item.x + item.w / 2;
4059
- let col = 0;
4060
- for (let b = 0; b < colBoundaries.length; b++) {
4061
- if (cx > colBoundaries[b]) col = b + 1;
4062
- }
4063
- cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
4064
- }
4065
- if (cells[0].trim() === "" && tableRows.length > 0) {
4066
- const prevCells = tableRows[tableRows.length - 1].cells;
4067
- for (let c = 0; c < numCols; c++) {
4068
- if (cells[c].trim()) {
4069
- prevCells[c] = prevCells[c] ? prevCells[c] + " " + cells[c].trim() : cells[c].trim();
4070
- }
4071
- }
4072
- } else {
4073
- tableRows.push({ cells });
4074
- }
4075
- }
4076
- if (tableRows.length < 2) return null;
4077
- const nonEmptyCount = tableRows.reduce((sum, r) => sum + r.cells.filter((c) => c.trim()).length, 0);
4078
- const totalCount = tableRows.length * numCols;
4079
- if (nonEmptyCount < totalCount * 0.3) return null;
4080
- const irCells = tableRows.map(
4081
- (r) => r.cells.map((text, colIdx) => {
4082
- let cleaned = text.trim();
4083
- if (colIdx > 0) cleaned = cleaned.replace(/^[•○·\-]\s*/, "");
4084
- return { text: cleaned, colSpan: 1, rowSpan: 1 };
4085
- })
4330
+ return text.replace(
4331
+ /(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
4332
+ (match) => match.replace(/ /g, "")
4086
4333
  );
4087
- const irTable = {
4088
- rows: tableRows.length,
4089
- cols: numCols,
4090
- cells: irCells,
4091
- hasHeader: tableRows.length > 1
4092
- };
4093
- return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
4094
4334
  }
4095
4335
  function shouldDemoteTable(table) {
4096
4336
  const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
4097
4337
  const allText = allCells.join(" ");
4338
+ if (table.rows <= 3 && table.cols <= 3) {
4339
+ const totalCells2 = table.rows * table.cols;
4340
+ const emptyCells2 = totalCells2 - allCells.length;
4341
+ if (emptyCells2 >= totalCells2 * 0.3) return true;
4342
+ if (/[□■◆○●▶ㅇ]/.test(allText)) return true;
4343
+ if (/<[^>]+>/.test(allText)) return true;
4344
+ }
4098
4345
  if (allText.length > 200) return false;
4099
4346
  if (/[□■◆○●▶]/.test(allText) && table.rows <= 3) return true;
4100
4347
  const totalCells = table.rows * table.cols;
@@ -4138,32 +4385,6 @@ function detectMarkerHeadings(blocks) {
4138
4385
  }
4139
4386
  }
4140
4387
  }
4141
- function hasMultiColumnLayout(items) {
4142
- if (items.length < 30) return false;
4143
- const sorted = [...items].sort((a, b) => a.x - b.x);
4144
- const minX = sorted[0].x;
4145
- let maxX = minX;
4146
- for (const i of sorted) if (i.x + i.w > maxX) maxX = i.x + i.w;
4147
- const pageWidth = maxX - minX;
4148
- if (pageWidth < 200) return false;
4149
- let bestGap = 0;
4150
- let bestSplit = 0;
4151
- for (let j = 1; j < sorted.length; j++) {
4152
- const gap = sorted[j].x - (sorted[j - 1].x + sorted[j - 1].w);
4153
- if (gap > bestGap) {
4154
- bestGap = gap;
4155
- bestSplit = (sorted[j - 1].x + sorted[j - 1].w + sorted[j].x) / 2;
4156
- }
4157
- }
4158
- if (bestGap < 20) return false;
4159
- const splitRatio = (bestSplit - minX) / pageWidth;
4160
- if (splitRatio < 0.35 || splitRatio > 0.65) return false;
4161
- const leftCount = items.filter((i) => i.x + i.w / 2 < bestSplit).length;
4162
- const rightCount = items.filter((i) => i.x + i.w / 2 >= bestSplit).length;
4163
- if (leftCount < 15 || rightCount < 15) return false;
4164
- if (Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount) < 0.35) return false;
4165
- return true;
4166
- }
4167
4388
  var MAX_XYCUT_DEPTH = 50;
4168
4389
  function xyCutOrder(items, gapThreshold, depth = 0) {
4169
4390
  if (items.length === 0) return [];
@@ -4231,6 +4452,7 @@ function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeigh
4231
4452
  if (items.length === 0) return [];
4232
4453
  let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
4233
4454
  ({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
4455
+ ({ horizontals, verticals } = preprocessLines(horizontals, verticals));
4234
4456
  const grids = buildTableGrids(horizontals, verticals);
4235
4457
  if (grids.length > 0) {
4236
4458
  return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
@@ -4242,14 +4464,19 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
4242
4464
  const usedItems = /* @__PURE__ */ new Set();
4243
4465
  const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
4244
4466
  for (const grid of sortedGrids) {
4467
+ const numGridRows = grid.rowYs.length - 1;
4468
+ const numGridCols = grid.colXs.length - 1;
4469
+ if (numGridRows === 1 && numGridCols >= 2) continue;
4245
4470
  const tableItems = [];
4246
4471
  const pad = 3;
4472
+ const gridW = grid.bbox.x2 - grid.bbox.x1;
4247
4473
  for (const item of items) {
4248
4474
  if (usedItems.has(item)) continue;
4249
- if (item.x >= grid.bbox.x1 - pad && item.x + item.w <= grid.bbox.x2 + pad && item.y >= grid.bbox.y1 - pad && item.y <= grid.bbox.y2 + pad) {
4250
- tableItems.push(item);
4251
- usedItems.add(item);
4252
- }
4475
+ if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
4476
+ if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
4477
+ if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
4478
+ tableItems.push(item);
4479
+ usedItems.add(item);
4253
4480
  }
4254
4481
  const cells = extractCells(grid, horizontals, verticals);
4255
4482
  if (cells.length === 0) continue;
@@ -4273,6 +4500,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
4273
4500
  const cellItems = cellTextMap.get(cell) || [];
4274
4501
  let text = cellTextToString(cellItems);
4275
4502
  text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
4503
+ text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
4276
4504
  irGrid[cell.row][cell.col] = {
4277
4505
  text,
4278
4506
  colSpan: cell.colSpan,
@@ -4294,31 +4522,61 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
4294
4522
  width: grid.bbox.x2 - grid.bbox.x1,
4295
4523
  height: grid.bbox.y2 - grid.bbox.y1
4296
4524
  };
4297
- const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
4298
- if (normalized) {
4299
- blocks.push(...normalized);
4300
- continue;
4301
- }
4302
4525
  if (shouldDemoteTable(irTable)) {
4303
4526
  const demoted = demoteTableToText(irTable);
4304
4527
  if (demoted) {
4305
- blocks.push({ type: "paragraph", text: demoted, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
4528
+ const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
4529
+ blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
4306
4530
  }
4307
4531
  continue;
4308
4532
  }
4309
4533
  blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
4310
4534
  }
4311
- const remaining = items.filter((i) => !usedItems.has(i));
4535
+ let remaining = items.filter((i) => !usedItems.has(i));
4312
4536
  if (remaining.length > 0) {
4313
4537
  remaining.sort((a, b) => b.y - a.y || a.x - b.x);
4314
- const textBlocks = detectListBlocks(extractPageBlocksFallback(remaining, pageNum));
4315
- const allBlocks = [...blocks, ...textBlocks];
4316
- allBlocks.sort((a, b) => {
4538
+ const clusterItems = remaining.map((i) => ({
4539
+ text: i.text,
4540
+ x: i.x,
4541
+ y: i.y,
4542
+ w: i.w,
4543
+ h: i.h,
4544
+ fontSize: i.fontSize,
4545
+ fontName: i.fontName
4546
+ }));
4547
+ const clusterResults = detectClusterTables(clusterItems, pageNum);
4548
+ if (clusterResults.length > 0) {
4549
+ const ciToIdx = /* @__PURE__ */ new Map();
4550
+ for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4551
+ const usedClusterIndices = /* @__PURE__ */ new Set();
4552
+ for (const cr of clusterResults) {
4553
+ for (const ci of cr.usedItems) {
4554
+ const idx = ciToIdx.get(ci);
4555
+ if (idx !== void 0) usedClusterIndices.add(idx);
4556
+ }
4557
+ blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4558
+ }
4559
+ remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
4560
+ }
4561
+ if (remaining.length > 0) {
4562
+ const allY = remaining.map((i) => i.y);
4563
+ const pageH = safeMax(allY) - safeMin(allY);
4564
+ const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
4565
+ const textBlocks = [];
4566
+ for (const group of groups) {
4567
+ if (group.length === 0) continue;
4568
+ const groupBlocks = extractPageBlocksFallback(group, pageNum);
4569
+ for (const b of groupBlocks) textBlocks.push(b);
4570
+ }
4571
+ const finalTextBlocks = detectListBlocks(textBlocks);
4572
+ for (const b of finalTextBlocks) blocks.push(b);
4573
+ }
4574
+ blocks.sort((a, b) => {
4317
4575
  const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4318
4576
  const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4319
4577
  return by - ay;
4320
4578
  });
4321
- return mergeAdjacentTableBlocks(allBlocks);
4579
+ return mergeAdjacentTableBlocks(blocks);
4322
4580
  }
4323
4581
  return mergeAdjacentTableBlocks(blocks);
4324
4582
  }
@@ -4344,57 +4602,53 @@ function mergeAdjacentTableBlocks(blocks) {
4344
4602
  }
4345
4603
  function extractPageBlocksFallback(items, pageNum) {
4346
4604
  if (items.length === 0) return [];
4347
- if (hasMultiColumnLayout(items)) {
4348
- const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
4349
- return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
4350
- }
4351
4605
  const blocks = [];
4352
- const allYLines = groupByY(items);
4353
- const columns = detectColumns(allYLines);
4354
- if (columns && columns.length >= 3) {
4355
- const tableText = extractWithColumns(allYLines, columns);
4356
- const bbox = computeBBox(items, pageNum);
4357
- blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
4358
- } else {
4359
- const clusterItems = items.map((i) => ({
4360
- text: i.text,
4361
- x: i.x,
4362
- y: i.y,
4363
- w: i.w,
4364
- h: i.h,
4365
- fontSize: i.fontSize,
4366
- fontName: i.fontName
4367
- }));
4368
- const clusterResults = hasMultiColumnLayout(items) ? [] : detectClusterTables(clusterItems, pageNum);
4369
- if (clusterResults.length > 0) {
4370
- const ciToIdx = /* @__PURE__ */ new Map();
4371
- for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4372
- const usedIndices = /* @__PURE__ */ new Set();
4373
- for (const cr of clusterResults) {
4374
- for (const ci of cr.usedItems) {
4375
- const idx = ciToIdx.get(ci);
4376
- if (idx !== void 0) usedIndices.add(idx);
4377
- }
4378
- blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4606
+ const clusterItems = items.map((i) => ({
4607
+ text: i.text,
4608
+ x: i.x,
4609
+ y: i.y,
4610
+ w: i.w,
4611
+ h: i.h,
4612
+ fontSize: i.fontSize,
4613
+ fontName: i.fontName
4614
+ }));
4615
+ const clusterResults = detectClusterTables(clusterItems, pageNum);
4616
+ if (clusterResults.length > 0) {
4617
+ const ciToIdx = /* @__PURE__ */ new Map();
4618
+ for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4619
+ const usedIndices = /* @__PURE__ */ new Set();
4620
+ for (const cr of clusterResults) {
4621
+ for (const ci of cr.usedItems) {
4622
+ const idx = ciToIdx.get(ci);
4623
+ if (idx !== void 0) usedIndices.add(idx);
4379
4624
  }
4380
- const remaining = items.filter((_, idx) => !usedIndices.has(idx));
4381
- if (remaining.length > 0) {
4382
- const yLines = groupByY(remaining);
4383
- for (const line of yLines) {
4384
- const text = mergeLineSimple(line);
4385
- if (!text.trim()) continue;
4386
- const bbox = computeBBox(line, pageNum);
4387
- blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
4388
- }
4625
+ blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4626
+ }
4627
+ const remaining = items.filter((_, idx) => !usedIndices.has(idx));
4628
+ if (remaining.length > 0) {
4629
+ const yLines = groupByY(remaining);
4630
+ for (const line of yLines) {
4631
+ const text = mergeLineSimple(line);
4632
+ if (!text.trim()) continue;
4633
+ const bbox = computeBBox(line, pageNum);
4634
+ blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
4389
4635
  }
4390
- blocks.sort((a, b) => {
4391
- const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4392
- const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4393
- return by - ay;
4394
- });
4636
+ }
4637
+ blocks.sort((a, b) => {
4638
+ const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4639
+ const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4640
+ return by - ay;
4641
+ });
4642
+ } else {
4643
+ const allYLines = groupByY(items);
4644
+ const columns = detectColumns(allYLines);
4645
+ if (columns && columns.length >= 3) {
4646
+ const tableText = extractWithColumns(allYLines, columns);
4647
+ const bbox = computeBBox(items, pageNum);
4648
+ blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
4395
4649
  } else {
4396
4650
  const allY = items.map((i) => i.y);
4397
- const pageHeight = Math.max(...allY) - Math.min(...allY);
4651
+ const pageHeight = safeMax(allY) - safeMin(allY);
4398
4652
  const gapThreshold = Math.max(15, pageHeight * 0.03);
4399
4653
  const orderedGroups = xyCutOrder(items, gapThreshold);
4400
4654
  for (const group of orderedGroups) {
@@ -4447,22 +4701,76 @@ function dominantStyle(items) {
4447
4701
  return { fontSize: dominantSize, fontName };
4448
4702
  }
4449
4703
  function normalizeItems(rawItems) {
4450
- return rawItems.filter((i) => typeof i.str === "string" && i.str.trim() !== "").map((i) => {
4704
+ const items = [];
4705
+ const spacePositions = [];
4706
+ for (const i of rawItems) {
4707
+ if (typeof i.str !== "string") continue;
4708
+ const x = Math.round(i.transform[4]);
4709
+ const y = Math.round(i.transform[5]);
4710
+ if (!i.str.trim()) {
4711
+ spacePositions.push({ x, y });
4712
+ continue;
4713
+ }
4451
4714
  const scaleY = Math.abs(i.transform[3]);
4452
4715
  const scaleX = Math.abs(i.transform[0]);
4453
4716
  const fontSize = Math.round(Math.max(scaleY, scaleX));
4454
- return {
4455
- text: i.str.trim(),
4456
- x: Math.round(i.transform[4]),
4457
- y: Math.round(i.transform[5]),
4458
- w: Math.round(i.width),
4459
- h: Math.round(i.height),
4460
- fontSize,
4461
- fontName: i.fontName || "",
4462
- // 0pt 폰트이거나 너비 0 → hidden text (prompt injection 의심)
4463
- isHidden: fontSize === 0 || i.width === 0 && i.str.trim().length > 0
4464
- };
4465
- }).sort((a, b) => b.y - a.y || a.x - b.x);
4717
+ const w = Math.round(i.width);
4718
+ const h = Math.round(i.height);
4719
+ const isHidden = fontSize === 0 || i.width === 0 && i.str.trim().length > 0;
4720
+ let text = i.str.trim();
4721
+ if (/^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text)) {
4722
+ text = text.replace(/ /g, "");
4723
+ }
4724
+ const split = splitEvenSpacedItem(text, x, w, fontSize);
4725
+ if (split) {
4726
+ for (const s of split) {
4727
+ items.push({ text: s.text, x: s.x, y, w: s.w, h, fontSize, fontName: i.fontName || "", isHidden });
4728
+ }
4729
+ } else {
4730
+ items.push({ text, x, y, w, h, fontSize, fontName: i.fontName || "", isHidden });
4731
+ }
4732
+ }
4733
+ const sorted = items.sort((a, b) => b.y - a.y || a.x - b.x);
4734
+ const deduped = [];
4735
+ for (let i = 0; i < sorted.length; i++) {
4736
+ let isDup = false;
4737
+ for (let j = deduped.length - 1; j >= 0; j--) {
4738
+ const prev = deduped[j];
4739
+ if (prev.y - sorted[i].y > 3) break;
4740
+ if (Math.abs(prev.y - sorted[i].y) <= 3 && prev.text === sorted[i].text && Math.abs(prev.x - sorted[i].x) <= 3) {
4741
+ isDup = true;
4742
+ break;
4743
+ }
4744
+ }
4745
+ if (!isDup) deduped.push(sorted[i]);
4746
+ }
4747
+ if (spacePositions.length > 0) {
4748
+ for (const item of deduped) {
4749
+ for (const sp of spacePositions) {
4750
+ if (Math.abs(sp.y - item.y) <= 3) {
4751
+ const dist = item.x - sp.x;
4752
+ if (dist >= 0 && dist <= 20) {
4753
+ item.hasSpaceBefore = true;
4754
+ break;
4755
+ }
4756
+ }
4757
+ }
4758
+ }
4759
+ }
4760
+ return deduped;
4761
+ }
4762
+ function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
4763
+ if (!/^[가-힣\d](?: [가-힣\d]){2,}$/.test(text)) return null;
4764
+ const chars = text.split(" ");
4765
+ if (chars.length < 3) return null;
4766
+ const charW = itemW / chars.length;
4767
+ if (charW > fontSize * 2) return null;
4768
+ return chars.map((ch, idx) => ({
4769
+ text: ch,
4770
+ x: Math.round(itemX + idx * charW),
4771
+ w: Math.round(charW * 0.8)
4772
+ // 실제 글자 폭은 간격보다 좁음
4773
+ }));
4466
4774
  }
4467
4775
  function groupByY(items) {
4468
4776
  if (items.length === 0) return [];
@@ -4487,14 +4795,14 @@ function isProseSpread(items) {
4487
4795
  for (let i = 1; i < sorted.length; i++) {
4488
4796
  gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
4489
4797
  }
4490
- const maxGap = Math.max(...gaps);
4798
+ const maxGap = safeMax(gaps);
4491
4799
  const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
4492
4800
  return maxGap < 40 && avgLen < 5;
4493
4801
  }
4494
4802
  function detectColumns(yLines) {
4495
4803
  const allItems = yLines.flat();
4496
4804
  if (allItems.length === 0) return null;
4497
- const pageWidth = Math.max(...allItems.map((i) => i.x + i.w)) - Math.min(...allItems.map((i) => i.x));
4805
+ const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
4498
4806
  if (pageWidth < 100) return null;
4499
4807
  let bigoLineIdx = -1;
4500
4808
  for (let i = 0; i < yLines.length; i++) {
@@ -4526,7 +4834,7 @@ function detectColumns(yLines) {
4526
4834
  }
4527
4835
  const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
4528
4836
  if (peaks.length < 3) return null;
4529
- const MERGE_TOL = 30;
4837
+ const MERGE_TOL = 40;
4530
4838
  const merged = [peaks[0]];
4531
4839
  for (let i = 1; i < peaks.length; i++) {
4532
4840
  const prev = merged[merged.length - 1];
@@ -4540,7 +4848,14 @@ function detectColumns(yLines) {
4540
4848
  merged.push({ ...peaks[i] });
4541
4849
  }
4542
4850
  }
4543
- const columns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
4851
+ const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
4852
+ if (rawColumns.length < 3) return null;
4853
+ const MIN_DETECT_COL_WIDTH = 30;
4854
+ const columns = [rawColumns[0]];
4855
+ for (let i = 1; i < rawColumns.length; i++) {
4856
+ if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
4857
+ columns.push(rawColumns[i]);
4858
+ }
4544
4859
  return columns.length >= 3 ? columns : null;
4545
4860
  }
4546
4861
  function findColumn(x, columns) {
@@ -4668,6 +4983,16 @@ function buildGridTable(lines, columns) {
4668
4983
  }
4669
4984
  merged.splice(0, headerEnd, headerRow);
4670
4985
  }
4986
+ for (const row of merged) {
4987
+ for (let c = 0; c < row.length; c++) {
4988
+ if (row[c]) row[c] = collapseEvenSpacing(row[c]);
4989
+ }
4990
+ }
4991
+ const totalCells = merged.length * numCols;
4992
+ const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
4993
+ if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
4994
+ return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
4995
+ }
4671
4996
  const md = [];
4672
4997
  md.push("| " + merged[0].join(" | ") + " |");
4673
4998
  md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
@@ -4679,12 +5004,32 @@ function buildGridTable(lines, columns) {
4679
5004
  function mergeLineSimple(items) {
4680
5005
  if (items.length <= 1) return items[0]?.text || "";
4681
5006
  const sorted = [...items].sort((a, b) => a.x - b.x);
5007
+ const isEvenSpaced = detectEvenSpacedItems(sorted);
4682
5008
  let result = sorted[0].text;
4683
5009
  for (let i = 1; i < sorted.length; i++) {
4684
5010
  const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
4685
5011
  const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
4686
- if (gap > 15) result += " ";
4687
- else if (gap < avgFs * 0.15) {
5012
+ const tabThreshold = Math.max(avgFs * 2, 30);
5013
+ if (gap > tabThreshold) {
5014
+ result += " ";
5015
+ result += sorted[i].text;
5016
+ continue;
5017
+ }
5018
+ if (isEvenSpaced[i]) {
5019
+ result += sorted[i].text;
5020
+ continue;
5021
+ }
5022
+ if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
5023
+ result += " ";
5024
+ result += sorted[i].text;
5025
+ continue;
5026
+ }
5027
+ if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
5028
+ result += " ";
5029
+ result += sorted[i].text;
5030
+ continue;
5031
+ }
5032
+ if (gap < avgFs * 0.15) {
4688
5033
  } else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
4689
5034
  } else if (gap > 3) result += " ";
4690
5035
  result += sorted[i].text;
@@ -4693,8 +5038,8 @@ function mergeLineSimple(items) {
4693
5038
  }
4694
5039
  function cleanPdfText(text) {
4695
5040
  return mergeKoreanLines(
4696
- text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
4697
- ).replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line)).replace(/\n{3,}/g, "\n\n").trim();
5041
+ text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
5042
+ ).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
4698
5043
  }
4699
5044
  function startsWithMarker(line) {
4700
5045
  const t = line.trimStart();
@@ -4886,7 +5231,7 @@ function mergeKoreanLines(text) {
4886
5231
  result[result.length - 1] = prev + " " + currTrimmed;
4887
5232
  continue;
4888
5233
  }
4889
- if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
5234
+ if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev) && !startsWithMarker(prev)) {
4890
5235
  result[result.length - 1] = prev + " " + curr;
4891
5236
  } else {
4892
5237
  result.push(curr);
@@ -4934,7 +5279,7 @@ function getTextContent(el) {
4934
5279
  return el.textContent?.trim() ?? "";
4935
5280
  }
4936
5281
  function parseXml(text) {
4937
- return new DOMParser2().parseFromString(text, "text/xml");
5282
+ return new DOMParser2().parseFromString(stripDtd(text), "text/xml");
4938
5283
  }
4939
5284
  function parseSharedStrings(xml) {
4940
5285
  const doc = parseXml(xml);
@@ -5221,7 +5566,7 @@ function getAttr(el, localName) {
5221
5566
  return null;
5222
5567
  }
5223
5568
  function parseXml2(text) {
5224
- return new DOMParser3().parseFromString(text, "text/xml");
5569
+ return new DOMParser3().parseFromString(stripDtd(text), "text/xml");
5225
5570
  }
5226
5571
  function parseStyles(xml) {
5227
5572
  const doc = parseXml2(xml);
@@ -5621,7 +5966,13 @@ function normalize(s) {
5621
5966
  }
5622
5967
  var MAX_LEVENSHTEIN_LEN = 1e4;
5623
5968
  function levenshtein(a, b) {
5624
- if (a.length + b.length > MAX_LEVENSHTEIN_LEN) return Math.abs(a.length - b.length);
5969
+ if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
5970
+ const sampleLen = Math.min(500, a.length, b.length);
5971
+ let diffs = 0;
5972
+ for (let i = 0; i < sampleLen; i++) if (a[i] !== b[i]) diffs++;
5973
+ const sampleRate = sampleLen > 0 ? diffs / sampleLen : 1;
5974
+ return Math.abs(a.length - b.length) + Math.round(Math.min(a.length, b.length) * sampleRate);
5975
+ }
5625
5976
  if (a.length > b.length) [a, b] = [b, a];
5626
5977
  const m = a.length;
5627
5978
  const n = b.length;
@@ -5904,13 +6255,20 @@ function extractInlineFields(text) {
5904
6255
 
5905
6256
  // src/hwpx/generator.ts
5906
6257
  import JSZip5 from "jszip";
5907
- var HWPML_NS = "http://www.hancom.co.kr/hwpml/2016/HwpMl";
6258
+ var NS_SECTION = "http://www.hancom.co.kr/hwpml/2011/section";
6259
+ var NS_PARA = "http://www.hancom.co.kr/hwpml/2011/paragraph";
6260
+ var NS_HEAD = "http://www.hancom.co.kr/hwpml/2011/head";
6261
+ var NS_OPF = "http://www.idpf.org/2007/opf/";
6262
+ var NS_HPF = "http://www.hancom.co.kr/schema/2011/hpf";
6263
+ var NS_OCF = "urn:oasis:names:tc:opendocument:xmlns:container";
5908
6264
  async function markdownToHwpx(markdown) {
5909
6265
  const blocks = parseMarkdownToBlocks(markdown);
5910
6266
  const sectionXml = blocksToSectionXml(blocks);
5911
6267
  const zip = new JSZip5();
5912
6268
  zip.file("mimetype", "application/hwp+zip", { compression: "STORE" });
6269
+ zip.file("META-INF/container.xml", generateContainerXml());
5913
6270
  zip.file("Contents/content.hpf", generateManifest());
6271
+ zip.file("Contents/header.xml", generateHeaderXml());
5914
6272
  zip.file("Contents/section0.xml", sectionXml);
5915
6273
  return await zip.generateAsync({ type: "arraybuffer" });
5916
6274
  }
@@ -5955,8 +6313,111 @@ function parseMarkdownToBlocks(md) {
5955
6313
  function escapeXml(text) {
5956
6314
  return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
5957
6315
  }
6316
+ function generateContainerXml() {
6317
+ return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
6318
+ <ocf:container xmlns:ocf="${NS_OCF}" xmlns:hpf="${NS_HPF}">
6319
+ <ocf:rootfiles>
6320
+ <ocf:rootfile full-path="Contents/content.hpf" media-type="application/hwpml-package+xml"/>
6321
+ </ocf:rootfiles>
6322
+ </ocf:container>`;
6323
+ }
6324
+ function generateManifest() {
6325
+ return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
6326
+ <opf:package xmlns:opf="${NS_OPF}" xmlns:hpf="${NS_HPF}" xmlns:hh="${NS_HEAD}">
6327
+ <opf:manifest>
6328
+ <opf:item id="header" href="Contents/header.xml" media-type="application/xml"/>
6329
+ <opf:item id="section0" href="Contents/section0.xml" media-type="application/xml"/>
6330
+ </opf:manifest>
6331
+ <opf:spine>
6332
+ <opf:itemref idref="header" linear="no"/>
6333
+ <opf:itemref idref="section0" linear="yes"/>
6334
+ </opf:spine>
6335
+ </opf:package>`;
6336
+ }
6337
+ function generateHeaderXml() {
6338
+ return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
6339
+ <hh:head xmlns:hh="${NS_HEAD}" xmlns:hp="${NS_PARA}" version="1.4" secCnt="1">
6340
+ <hh:beginNum page="1" footnote="1" endnote="1" pic="1" tbl="1" equation="1"/>
6341
+ <hh:refList>
6342
+ <hh:fontfaces itemCnt="7">
6343
+ <hh:fontface lang="HANGUL" fontCnt="1">
6344
+ <hh:font id="0" face="\uD568\uCD08\uB86C\uBC14\uD0D5" type="TTF" isEmbedded="0">
6345
+ <hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="4" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
6346
+ </hh:font>
6347
+ </hh:fontface>
6348
+ <hh:fontface lang="LATIN" fontCnt="1">
6349
+ <hh:font id="0" face="Times New Roman" type="TTF" isEmbedded="0">
6350
+ <hh:typeInfo familyType="FCAT_OLDSTYLE" weight="5" proportion="4" contrast="2" strokeVariation="0" armStyle="0" letterform="0" midline="0" xHeight="4"/>
6351
+ </hh:font>
6352
+ </hh:fontface>
6353
+ <hh:fontface lang="HANJA" fontCnt="1">
6354
+ <hh:font id="0" face="\uD568\uCD08\uB86C\uBC14\uD0D5" type="TTF" isEmbedded="0">
6355
+ <hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="4" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
6356
+ </hh:font>
6357
+ </hh:fontface>
6358
+ <hh:fontface lang="JAPANESE" fontCnt="1">
6359
+ <hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
6360
+ <hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
6361
+ </hh:font>
6362
+ </hh:fontface>
6363
+ <hh:fontface lang="OTHER" fontCnt="1">
6364
+ <hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
6365
+ <hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
6366
+ </hh:font>
6367
+ </hh:fontface>
6368
+ <hh:fontface lang="SYMBOL" fontCnt="1">
6369
+ <hh:font id="0" face="Symbol" type="TTF" isEmbedded="0">
6370
+ <hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
6371
+ </hh:font>
6372
+ </hh:fontface>
6373
+ <hh:fontface lang="USER" fontCnt="1">
6374
+ <hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
6375
+ <hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
6376
+ </hh:font>
6377
+ </hh:fontface>
6378
+ </hh:fontfaces>
6379
+ <hh:borderFills itemCnt="1">
6380
+ <hh:borderFill id="0" threeD="0" shadow="0" centerLine="0" breakCellSeparateLine="0">
6381
+ <hh:slash type="NONE" Crooked="0" isCounter="0"/>
6382
+ <hh:backSlash type="NONE" Crooked="0" isCounter="0"/>
6383
+ <hh:leftBorder type="NONE" width="0.1mm" color="0"/>
6384
+ <hh:rightBorder type="NONE" width="0.1mm" color="0"/>
6385
+ <hh:topBorder type="NONE" width="0.1mm" color="0"/>
6386
+ <hh:bottomBorder type="NONE" width="0.1mm" color="0"/>
6387
+ <hh:diagonal type="NONE" width="0.1mm" color="0"/>
6388
+ <hh:fillInfo/>
6389
+ </hh:borderFill>
6390
+ </hh:borderFills>
6391
+ <hh:charProperties itemCnt="1">
6392
+ <hh:charPr id="0" height="1000" textColor="0" shadeColor="-1" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="0">
6393
+ <hh:fontRef hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
6394
+ <hh:ratio hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
6395
+ <hh:spacing hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
6396
+ <hh:relSz hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
6397
+ <hh:offset hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
6398
+ </hh:charPr>
6399
+ </hh:charProperties>
6400
+ <hh:tabProperties itemCnt="0"/>
6401
+ <hh:numberings itemCnt="0"/>
6402
+ <hh:bullets itemCnt="0"/>
6403
+ <hh:paraProperties itemCnt="1">
6404
+ <hh:paraPr id="0" tabIDRef="0" condense="0" fontLineHeight="0" snapToGrid="0" suppressOverlap="0" checked="0">
6405
+ <hh:parLineBreak lineBreak="BREAK_LINE" wordBreak="BREAK_WORD" breakLatinWord="BREAK_WORD" breakNonLatinWord="BREAK_WORD"/>
6406
+ <hh:parMargin left="0" right="0" prev="0" next="0" indent="0"/>
6407
+ <hh:parBorder borderFillIDRef="0" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>
6408
+ <hh:parShade borderFillIDRef="0"/>
6409
+ <hh:parTabList/>
6410
+ </hh:paraPr>
6411
+ </hh:paraProperties>
6412
+ <hh:styles itemCnt="1">
6413
+ <hh:style id="0" type="PARA" name="\uBC14\uD0D5\uAE00" engName="Normal" paraPrIDRef="0" charPrIDRef="0" nextStyleIDRef="0" langIDRef="1042" lockForm="0"/>
6414
+ </hh:styles>
6415
+ </hh:refList>
6416
+ <hh:compatibleDocument targetProgram="HWP2018"/>
6417
+ </hh:head>`;
6418
+ }
5958
6419
  function generateParagraph(text) {
5959
- return `<hp:p><hp:run><hp:t>${escapeXml(text)}</hp:t></hp:run></hp:p>`;
6420
+ return `<hp:p paraPrIDRef="0" styleIDRef="0"><hp:run charPrIDRef="0"><hp:t>${escapeXml(text)}</hp:t></hp:run></hp:p>`;
5960
6421
  }
5961
6422
  function generateTable(rows) {
5962
6423
  const trElements = rows.map((row) => {
@@ -5980,22 +6441,11 @@ function blocksToSectionXml(blocks) {
5980
6441
  return "";
5981
6442
  }
5982
6443
  }).join("\n ");
5983
- return `<?xml version="1.0" encoding="UTF-8"?>
5984
- <hs:sec xmlns:hs="${HWPML_NS}" xmlns:hp="${HWPML_NS}">
6444
+ return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
6445
+ <hs:sec xmlns:hs="${NS_SECTION}" xmlns:hp="${NS_PARA}">
5985
6446
  ${body}
5986
6447
  </hs:sec>`;
5987
6448
  }
5988
- function generateManifest() {
5989
- return `<?xml version="1.0" encoding="UTF-8"?>
5990
- <opf:package xmlns:opf="http://www.idpf.org/2007/opf">
5991
- <opf:manifest>
5992
- <opf:item id="s0" href="section0.xml" media-type="application/xml"/>
5993
- </opf:manifest>
5994
- <opf:spine>
5995
- <opf:itemref idref="s0"/>
5996
- </opf:spine>
5997
- </opf:package>`;
5998
- }
5999
6449
 
6000
6450
  // src/index.ts
6001
6451
  async function parse(input, options) {