kordoc 2.0.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -63,6 +63,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
63
63
  blocks.push({ type: "paragraph", text: text.trim(), pageNumber: i });
64
64
  }
65
65
  } catch {
66
+ blocks.push({ type: "paragraph", text: `[OCR \uC2E4\uD328: \uD398\uC774\uC9C0 ${i}]` });
66
67
  }
67
68
  }
68
69
  return blocks;
@@ -138,7 +139,7 @@ import { inflateRawSync } from "zlib";
138
139
  import { DOMParser } from "@xmldom/xmldom";
139
140
 
140
141
  // src/utils.ts
141
- var VERSION = true ? "2.0.3" : "0.0.0-dev";
142
+ var VERSION = true ? "2.1.0" : "0.0.0-dev";
142
143
  function toArrayBuffer(buf) {
143
144
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
144
145
  return buf.buffer;
@@ -468,6 +469,9 @@ function tableToMarkdown(table) {
468
469
  if (dr === 0 && dc === 0) continue;
469
470
  if (r + dr < numRows && c + dc < numCols) {
470
471
  skip.add(`${r + dr},${c + dc}`);
472
+ if (dr === 0) {
473
+ display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
474
+ }
471
475
  }
472
476
  }
473
477
  }
@@ -563,7 +567,12 @@ function parseCharProperties(doc, map) {
563
567
  if (!id) continue;
564
568
  const prop = {};
565
569
  const height = el.getAttribute("height");
566
- if (height) prop.fontSize = parseInt(height, 10) / 100;
570
+ if (height) {
571
+ const parsedHeight = parseInt(height, 10);
572
+ if (!isNaN(parsedHeight) && parsedHeight > 0) {
573
+ prop.fontSize = parsedHeight / 100;
574
+ }
575
+ }
567
576
  const bold = el.getAttribute("bold");
568
577
  if (bold === "true" || bold === "1") prop.bold = true;
569
578
  const italic = el.getAttribute("italic");
@@ -703,7 +712,7 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
703
712
  const data = await file.async("uint8array");
704
713
  decompressed.total += data.length;
705
714
  if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
706
- const ext = ref.includes(".") ? ref.split(".").pop() : "png";
715
+ const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
707
716
  const mimeType = imageExtToMime(ext);
708
717
  imageIndex++;
709
718
  const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
@@ -997,8 +1006,10 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
997
1006
  break;
998
1007
  case "cellSpan":
999
1008
  if (tableCtx?.cell) {
1000
- const cs = parseInt(el.getAttribute("colSpan") || "1", 10);
1001
- const rs = parseInt(el.getAttribute("rowSpan") || "1", 10);
1009
+ const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
1010
+ const cs = isNaN(rawCs) ? 1 : rawCs;
1011
+ const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
1012
+ const rs = isNaN(rawRs) ? 1 : rawRs;
1002
1013
  tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
1003
1014
  tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
1004
1015
  }
@@ -1090,6 +1101,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
1090
1101
  extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
1091
1102
  } else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
1092
1103
  walkChildren(el, d + 1);
1104
+ } else if (localTag === "run") {
1105
+ tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
1093
1106
  }
1094
1107
  }
1095
1108
  };
@@ -2957,10 +2970,33 @@ var MIN_LINE_LENGTH = 10;
2957
2970
  var COORD_MERGE_TOL = 3;
2958
2971
  var CONNECT_TOL = 5;
2959
2972
  var CELL_PADDING = 2;
2973
+ var MAX_LINE_WIDTH = 5;
2974
+ var IDENTITY = [1, 0, 0, 1, 0, 0];
2975
/**
 * Compose two 2D affine transforms stored as six-number arrays
 * [a, b, c, d, e, f].
 *
 * With the point mapping used elsewhere in this file
 * (x' = a*x + c*y + e, y' = b*x + d*y + f), the returned matrix maps a
 * point the same way as applying `m2` first and then `m1`.
 *
 * @param {number[]} m1 - Outer transform.
 * @param {number[]} m2 - Inner transform.
 * @returns {number[]} A new six-element matrix; neither input is modified.
 */
function matMultiply(m1, m2) {
  const [a1, b1, c1, d1, e1, f1] = m1;
  const [a2, b2, c2, d2, e2, f2] = m2;
  const a = a1 * a2 + c1 * b2;
  const b = b1 * a2 + d1 * b2;
  const c = a1 * c2 + c1 * d2;
  const d = b1 * c2 + d1 * d2;
  const e = a1 * e2 + c1 * f2 + e1;
  const f = b1 * e2 + d1 * f2 + f1;
  return [a, b, c, d, e, f];
}
2985
/**
 * Apply a six-element affine matrix [a, b, c, d, e, f] to a point.
 *
 * @param {number[]} m - Affine matrix.
 * @param {number} x - Input x coordinate.
 * @param {number} y - Input y coordinate.
 * @returns {number[]} The transformed point as [x', y'].
 */
function matTransformPoint(m, x, y) {
  const mappedX = m[0] * x + m[2] * y + m[4];
  const mappedY = m[1] * x + m[3] * y + m[5];
  return [mappedX, mappedY];
}
2988
/**
 * Estimate a single scale factor for an affine matrix [a, b, c, d, e, f].
 *
 * Takes the Euclidean length of (b, d) and of (a, c) and returns the larger
 * of the two; the translation components are ignored.
 *
 * @param {number[]} m - Affine matrix.
 * @returns {number} The larger of the two lengths.
 */
function matScale(m) {
  const [a, b, c, d] = m;
  const lengthBD = Math.sqrt(b * b + d * d);
  const lengthAC = Math.sqrt(a * a + c * c);
  return Math.max(lengthBD, lengthAC);
}
2960
2994
  function extractLines(fnArray, argsArray) {
2961
2995
  const horizontals = [];
2962
2996
  const verticals = [];
2997
+ let ctm = [...IDENTITY];
2963
2998
  let lineWidth = 1;
2999
+ const stateStack = [];
2964
3000
  let currentPath = [];
2965
3001
  let pathStartX = 0, pathStartY = 0;
2966
3002
  let curX = 0, curY = 0;
@@ -2978,13 +3014,53 @@ function extractLines(fnArray, argsArray) {
2978
3014
  );
2979
3015
  }
2980
3016
  }
2981
- function flushPath(isStroke) {
2982
- if (!isStroke) {
3017
+ function tryConvertLinesToRectangle(path) {
3018
+ if (path.length < 3 || path.length > 5) return false;
3019
+ const first = path[0], last = path[path.length - 1];
3020
+ const closed = Math.abs(first.x1 - last.x2) < 1 && Math.abs(first.y1 - last.y2) < 1;
3021
+ if (!closed) return false;
3022
+ let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
3023
+ for (const seg of path) {
3024
+ minX = Math.min(minX, seg.x1, seg.x2);
3025
+ minY = Math.min(minY, seg.y1, seg.y2);
3026
+ maxX = Math.max(maxX, seg.x1, seg.x2);
3027
+ maxY = Math.max(maxY, seg.y1, seg.y2);
3028
+ }
3029
+ const w = maxX - minX, h = maxY - minY;
3030
+ if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;
3031
+ path.length = 0;
3032
+ if (h < ORIENTATION_TOL * 2 || w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH) {
3033
+ path.push({ x1: minX, y1: (minY + maxY) / 2, x2: maxX, y2: (minY + maxY) / 2 });
3034
+ } else if (w < ORIENTATION_TOL * 2 || h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH) {
3035
+ path.push({ x1: (minX + maxX) / 2, y1: minY, x2: (minX + maxX) / 2, y2: maxY });
3036
+ } else {
3037
+ pushRectangle(path, minX, minY, w, h);
3038
+ }
3039
+ return true;
3040
+ }
3041
+ function flushPath(isStroke, isFill) {
3042
+ if (!isStroke && !isFill) {
3043
+ currentPath = [];
3044
+ return;
3045
+ }
3046
+ if (isFill && !isStroke && currentPath.length >= 3) {
3047
+ tryConvertLinesToRectangle(currentPath);
3048
+ }
3049
+ const scale = matScale(ctm);
3050
+ const effectiveLW = lineWidth * scale;
3051
+ if (effectiveLW > MAX_LINE_WIDTH && isStroke && !isFill) {
2983
3052
  currentPath = [];
2984
3053
  return;
2985
3054
  }
2986
3055
  for (const seg of currentPath) {
2987
- classifyAndAdd(seg, lineWidth, horizontals, verticals);
3056
+ const [px1, py1] = matTransformPoint(ctm, seg.x1, seg.y1);
3057
+ const [px2, py2] = matTransformPoint(ctm, seg.x2, seg.y2);
3058
+ classifyAndAdd(
3059
+ { x1: px1, y1: py1, x2: px2, y2: py2 },
3060
+ effectiveLW,
3061
+ horizontals,
3062
+ verticals
3063
+ );
2988
3064
  }
2989
3065
  currentPath = [];
2990
3066
  }
@@ -2992,9 +3068,28 @@ function extractLines(fnArray, argsArray) {
2992
3068
  const op = fnArray[i];
2993
3069
  const args = argsArray[i];
2994
3070
  switch (op) {
3071
+ // ── Graphics State ──
3072
+ case OPS.save:
3073
+ stateStack.push({ ctm: [...ctm], lineWidth });
3074
+ break;
3075
+ case OPS.restore:
3076
+ if (stateStack.length > 0) {
3077
+ const state = stateStack.pop();
3078
+ ctm = state.ctm;
3079
+ lineWidth = state.lineWidth;
3080
+ }
3081
+ break;
3082
+ case OPS.transform: {
3083
+ const m = args;
3084
+ if (m.length >= 6) {
3085
+ ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
3086
+ }
3087
+ break;
3088
+ }
2995
3089
  case OPS.setLineWidth:
2996
3090
  lineWidth = args[0] || 1;
2997
3091
  break;
3092
+ // ── Path Construction ──
2998
3093
  case OPS.constructPath: {
2999
3094
  const arg0 = args[0];
3000
3095
  if (Array.isArray(arg0)) {
@@ -3062,34 +3157,60 @@ function extractLines(fnArray, argsArray) {
3062
3157
  }
3063
3158
  }
3064
3159
  }
3065
- if (afterOp === OPS.stroke || afterOp === OPS.closeStroke) {
3066
- flushPath(true);
3067
- } else if (afterOp === OPS.fill || afterOp === OPS.eoFill || afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke) {
3068
- flushPath(true);
3160
+ const isStroke5 = afterOp === OPS.stroke || afterOp === OPS.closeStroke;
3161
+ const isFill5 = afterOp === OPS.fill || afterOp === OPS.eoFill;
3162
+ const isBoth5 = afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke;
3163
+ if (isStroke5 || isFill5 || isBoth5) {
3164
+ flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
3069
3165
  } else if (afterOp === OPS.endPath) {
3070
- flushPath(false);
3166
+ flushPath(false, false);
3071
3167
  }
3072
3168
  }
3073
3169
  break;
3074
3170
  }
3171
+ // ── Paint Operations ──
3075
3172
  case OPS.stroke:
3076
3173
  case OPS.closeStroke:
3077
- flushPath(true);
3174
+ flushPath(true, false);
3078
3175
  break;
3079
3176
  case OPS.fill:
3080
3177
  case OPS.eoFill:
3178
+ flushPath(false, true);
3179
+ break;
3081
3180
  case OPS.fillStroke:
3082
3181
  case OPS.eoFillStroke:
3083
3182
  case OPS.closeFillStroke:
3084
3183
  case OPS.closeEOFillStroke:
3085
- flushPath(true);
3184
+ flushPath(true, true);
3086
3185
  break;
3087
3186
  case OPS.endPath:
3088
- flushPath(false);
3187
+ flushPath(false, false);
3089
3188
  break;
3090
3189
  }
3091
3190
  }
3092
- return { horizontals, verticals };
3191
+ return {
3192
+ horizontals: deduplicateLines(horizontals),
3193
+ verticals: deduplicateLines(verticals)
3194
+ };
3195
+ }
3196
/**
 * Collapse line segments whose four endpoint coordinates all lie within
 * COORD_MERGE_TOL of an already-kept segment.
 *
 * The first segment of each near-identical group is kept; its `lineWidth` is
 * raised (in place) to the largest `lineWidth` seen among its duplicates.
 *
 * @param {Array<{x1:number,y1:number,x2:number,y2:number,lineWidth:number}>} lines
 * @returns {Array} The input array itself when it has at most one entry,
 *   otherwise a new array of the kept segments in original order.
 */
function deduplicateLines(lines) {
  if (lines.length <= 1) return lines;
  const tolerance = COORD_MERGE_TOL;
  const isClose = (p, q) => Math.abs(p - q) <= tolerance;
  const kept = [];
  for (const candidate of lines) {
    // First kept segment matching on all four coordinates, if any.
    const twin = kept.find(
      (seg) => isClose(candidate.y1, seg.y1) && isClose(candidate.y2, seg.y2) && isClose(candidate.x1, seg.x1) && isClose(candidate.x2, seg.x2)
    );
    if (twin === undefined) {
      kept.push(candidate);
      continue;
    }
    if (candidate.lineWidth > twin.lineWidth) {
      twin.lineWidth = candidate.lineWidth;
    }
  }
  return kept;
}
3094
3215
  function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
3095
3216
  const dx = Math.abs(seg.x2 - seg.x1);
@@ -3685,6 +3806,7 @@ async function parsePdfDocument(buffer, options) {
3685
3806
  const medianFontSize = computeMedianFontSize(allFontSizes);
3686
3807
  if (medianFontSize > 0) {
3687
3808
  detectHeadings(blocks, medianFontSize);
3809
+ mergeAdjacentHeadings(blocks);
3688
3810
  }
3689
3811
  detectMarkerHeadings(blocks);
3690
3812
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
@@ -3759,6 +3881,46 @@ function detectHeadings(blocks, medianFontSize) {
3759
3881
  }
3760
3882
  }
3761
3883
  }
3884
/**
 * Merge consecutive heading blocks that appear to be fragments of one heading.
 *
 * Two neighbouring blocks are merged when both are headings with text and a
 * bbox, share the same `level`, sit on the same page, and their reference y
 * positions (bbox.y plus font size, falling back to bbox height) differ by
 * less than 1.5x the larger font size. Text is joined with a single space,
 * left-most fragment first.
 *
 * The array is modified in place: the first block absorbs the second, which
 * is removed. The merged bbox is the union of both boxes on both axes.
 * (Previously only the width was a union; the height was the max of the two
 * heights, which did not cover both fragments when their y differed.)
 *
 * @param {Array<object>} blocks - Block list to rewrite in place.
 * @returns {void}
 */
function mergeAdjacentHeadings(blocks) {
  let i = 0;
  while (i < blocks.length - 1) {
    const curr = blocks[i];
    const next = blocks[i + 1];
    const bothHeadings = curr.type === "heading" && next.type === "heading";
    if (!bothHeadings || !curr.bbox || !next.bbox || !curr.text || !next.text) {
      i++;
      continue;
    }
    const currRefY = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
    const nextRefY = next.bbox.y + (next.style?.fontSize || next.bbox.height);
    const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
    const sameY = curr.bbox.page === next.bbox.page && Math.abs(currRefY - nextRefY) < maxFs * 1.5;
    if (!sameY || curr.level !== next.level) {
      i++;
      continue;
    }
    // Order the text by horizontal position, not by array order.
    curr.text = curr.bbox.x <= next.bbox.x ? curr.text + " " + next.text : next.text + " " + curr.text;
    const minX = Math.min(curr.bbox.x, next.bbox.x);
    const minY = Math.min(curr.bbox.y, next.bbox.y);
    const maxX = Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width);
    const maxY = Math.max(curr.bbox.y + curr.bbox.height, next.bbox.y + next.bbox.height);
    curr.bbox = {
      page: curr.bbox.page,
      x: minX,
      y: minY,
      width: maxX - minX,
      height: maxY - minY
    };
    // Do not advance: the merged block may absorb the following heading too.
    blocks.splice(i + 1, 1);
  }
}
3762
3924
  function collapseEvenSpacing(text) {
3763
3925
  const tokens = text.split(" ");
3764
3926
  const singleCharCount = tokens.filter((t) => t.length === 1).length;
@@ -3767,6 +3929,169 @@ function collapseEvenSpacing(text) {
3767
3929
  }
3768
3930
  return text;
3769
3931
  }
3932
/**
 * Build paragraph blocks for one page from text items ordered by xyCutOrder.
 *
 * The gap threshold passed to xyCutOrder is 3% of the vertical extent of the
 * items (max y minus min y), with a floor of 15. Each ordered group is split
 * into lines by groupByY, and every line with non-blank text becomes one
 * paragraph block.
 *
 * The vertical extent is found with a single scan rather than
 * Math.max(...ys) / Math.min(...ys), because spreading a very large item
 * list into call arguments can throw a RangeError.
 *
 * @param {Array<object>} items - Text items; each must expose a numeric `y`.
 * @param {number} pageNum - Page number stored on every produced block.
 * @returns {Array<object>|null} Paragraph blocks, or null when none result.
 */
function buildXyCutBlocks(items, pageNum) {
  // No items can never yield a block; skip the helper calls.
  if (items.length === 0) return null;
  let minY = Infinity;
  let maxY = -Infinity;
  for (const item of items) {
    if (item.y < minY) minY = item.y;
    if (item.y > maxY) maxY = item.y;
  }
  const pageHeight = maxY - minY;
  const gapThreshold = Math.max(15, pageHeight * 0.03);
  const orderedGroups = xyCutOrder(items, gapThreshold);
  const blocks = [];
  for (const group of orderedGroups) {
    if (group.length === 0) continue;
    for (const line of groupByY(group)) {
      const text = mergeLineSimple(line);
      if (!text.trim()) continue;
      blocks.push({
        type: "paragraph",
        text,
        pageNumber: pageNum,
        bbox: computeBBox(line, pageNum),
        style: dominantStyle(line)
      });
    }
  }
  return blocks.length > 0 ? blocks : null;
}
3955
/**
 * Re-derive structure for a grid table that looks under-segmented.
 *
 * A table counts as under-segmented when it is 1x1, or when it has at most
 * two non-blank cells and either at least 8 text lines inside those cells or
 * at least 6 source text items. For such tables the following are tried in
 * order:
 *   1. multi-column layout -> buildXyCutBlocks result,
 *   2. buildTableFromTextLayout result,
 *   3. detectClusterTables results plus paragraphs for unused items, sorted
 *      by descending (bbox.y + bbox.height).
 *
 * @param {object} table - Table with `rows`, `cols` and `cells[][]` ({text}).
 * @param {Array<object>} items - Text items that fell inside the table.
 * @param {number} pageNum - Page number stored on produced blocks.
 * @param {object} bbox - Table bounding box forwarded to buildTableFromTextLayout.
 * @returns {Array<object>|null} Replacement blocks, or null to keep the table.
 */
function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
  let filledCells = 0;
  let textLines = 0;
  for (const row of table.cells) {
    for (const cell of row) {
      if (!cell.text.trim()) continue;
      filledCells += 1;
      textLines += cell.text.split("\n").length;
    }
  }
  const isSingleCell = table.rows === 1 && table.cols === 1;
  const isSparse = filledCells <= 2;
  const isUnderSegmented = isSingleCell || (isSparse && (textLines >= 8 || items.length >= 6));
  if (!isUnderSegmented) return null;

  if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);

  const fromLayout = buildTableFromTextLayout(items, pageNum, bbox);
  if (fromLayout) return fromLayout;

  const clusterItems = items.map((src) => ({
    text: src.text,
    x: src.x,
    y: src.y,
    w: src.w,
    h: src.h,
    fontSize: src.fontSize,
    fontName: src.fontName
  }));
  const clusterResults = detectClusterTables(clusterItems, pageNum);
  if (clusterResults.length === 0) return null;

  // Map each cluster item back to the index of its source item.
  const positionOf = new Map(clusterItems.map((ci, idx) => [ci, idx]));
  const consumed = new Set();
  const out = [];
  for (const found of clusterResults) {
    for (const used of found.usedItems) {
      const idx = positionOf.get(used);
      if (idx !== undefined) consumed.add(idx);
    }
    out.push({ type: "table", table: found.table, pageNumber: pageNum, bbox: found.bbox });
  }
  items.forEach((item, idx) => {
    if (consumed.has(idx)) return;
    if (!item.text.trim()) return;
    out.push({
      type: "paragraph",
      text: item.text,
      pageNumber: pageNum,
      bbox: computeBBox([item], pageNum),
      style: { fontSize: item.fontSize, fontName: item.fontName }
    });
  });
  const upperEdge = (block) => (block.bbox ? block.bbox.y + block.bbox.height : 0);
  out.sort((a, b) => upperEdge(b) - upperEdge(a));
  return out.length > 0 ? out : null;
}
4005
/**
 * Infer a table purely from the geometric layout of text items.
 *
 * Steps:
 *   1. Group items into rows by y (tolerance 3), highest y first.
 *   2. Within each multi-item row, record the midpoint of every horizontal
 *      gap that is at least 1.5x the row's average font size.
 *   3. Cluster gap midpoints (within 15 units of the running cluster mean);
 *      a cluster seen at least twice becomes a column boundary.
 *   4. Assign each item to a column by its horizontal centre. A row whose
 *      first column is empty is treated as a continuation and folded into the
 *      previous row.
 *   5. Reject the result when fewer than 30% of cells are filled.
 *
 * In columns after the first, a leading list marker is stripped from the cell
 * text. A hyphen directly followed by a digit is kept, so negative numbers
 * such as "-5" are no longer turned into "5".
 *
 * @param {Array<object>} items - Text items with `text`, `x`, `y`, `w`, `fontSize`.
 * @param {number} pageNum - Page number stored on the produced block.
 * @param {object} bbox - Bounding box stored on the produced block.
 * @returns {Array<object>|null} A one-element array holding the table block,
 *   or null when no convincing table structure is found.
 */
function buildTableFromTextLayout(items, pageNum, bbox) {
  if (items.length < 4) return null;

  // 1. Row grouping.
  const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
  const yTol = 3;
  const rows = [];
  let curRow = [sorted[0]];
  let curY = sorted[0].y;
  for (let i = 1; i < sorted.length; i++) {
    if (Math.abs(sorted[i].y - curY) <= yTol) {
      curRow.push(sorted[i]);
    } else {
      rows.push(curRow);
      curRow = [sorted[i]];
      curY = sorted[i].y;
    }
  }
  rows.push(curRow);
  if (rows.length < 2) return null;

  // 2. Candidate column gaps.
  const gapPositions = [];
  for (const row of rows) {
    if (row.length < 2) continue;
    const sortedX = [...row].sort((a, b) => a.x - b.x);
    const avgFs = sortedX.reduce((s, it) => s + it.fontSize, 0) / sortedX.length;
    for (let j = 1; j < sortedX.length; j++) {
      const prevRight = sortedX[j - 1].x + sortedX[j - 1].w;
      const gap = sortedX[j].x - prevRight;
      if (gap >= avgFs * 1.5) {
        gapPositions.push(prevRight + gap / 2);
      }
    }
  }
  if (gapPositions.length < 2) return null;

  // 3. Cluster gaps into column boundaries; singletons are treated as noise.
  gapPositions.sort((a, b) => a - b);
  const colBoundaries = [];
  let clusterSum = gapPositions[0];
  let clusterCount = 1;
  for (let i = 1; i < gapPositions.length; i++) {
    const avg = clusterSum / clusterCount;
    if (Math.abs(gapPositions[i] - avg) <= 15) {
      clusterSum += gapPositions[i];
      clusterCount++;
    } else {
      if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
      clusterSum = gapPositions[i];
      clusterCount = 1;
    }
  }
  if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
  if (colBoundaries.length === 0) return null;

  // 4. Fill cells.
  const numCols = colBoundaries.length + 1;
  const tableRows = [];
  for (const row of rows) {
    const cells = Array(numCols).fill("");
    const sortedX = [...row].sort((a, b) => a.x - b.x);
    for (const item of sortedX) {
      const cx = item.x + item.w / 2;
      let col = 0;
      for (let b = 0; b < colBoundaries.length; b++) {
        if (cx > colBoundaries[b]) col = b + 1;
      }
      cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
    }
    if (cells[0].trim() === "" && tableRows.length > 0) {
      // Continuation line: append its text to the matching cells of the previous row.
      const prevCells = tableRows[tableRows.length - 1].cells;
      for (let c = 0; c < numCols; c++) {
        if (cells[c].trim()) {
          prevCells[c] = prevCells[c] ? prevCells[c] + " " + cells[c].trim() : cells[c].trim();
        }
      }
    } else {
      tableRows.push({ cells });
    }
  }
  if (tableRows.length < 2) return null;

  // 5. Density check.
  const nonEmptyCount = tableRows.reduce((sum, r) => sum + r.cells.filter((c) => c.trim()).length, 0);
  const totalCount = tableRows.length * numCols;
  if (nonEmptyCount < totalCount * 0.3) return null;

  const irCells = tableRows.map(
    (r) => r.cells.map((text, colIdx) => {
      let cleaned = text.trim();
      // Strip a leading bullet; keep "-<digit>" so negative numbers keep their sign.
      if (colIdx > 0) cleaned = cleaned.replace(/^(?:[•○·]|-(?!\d))\s*/, "");
      return { text: cleaned, colSpan: 1, rowSpan: 1 };
    })
  );
  const irTable = {
    rows: tableRows.length,
    cols: numCols,
    cells: irCells,
    hasHeader: tableRows.length > 1
  };
  return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
}
3770
4095
  function shouldDemoteTable(table) {
3771
4096
  const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
3772
4097
  const allText = allCells.join(" ");
@@ -3813,6 +4138,32 @@ function detectMarkerHeadings(blocks) {
3813
4138
  }
3814
4139
  }
3815
4140
  }
4141
/**
 * Heuristically decide whether a page's text items form two side-by-side
 * columns.
 *
 * Requires at least 30 items spanning at least 200 units horizontally. Items
 * are ordered by left edge and the widest gap between an item's left edge and
 * the previous item's right edge is located. The layout is multi-column when
 * that gap is at least 20 wide, its midpoint lies within the central 35%-65%
 * of the span, each side holds at least 15 items (by horizontal centre), and
 * the smaller side has at least 35% as many items as the larger side.
 *
 * @param {Array<{x:number,w:number}>} items - Text items of one page.
 * @returns {boolean} True when a balanced two-column split is detected.
 */
function hasMultiColumnLayout(items) {
  if (items.length < 30) return false;

  const byLeftEdge = [...items].sort((a, b) => a.x - b.x);
  const leftmost = byLeftEdge[0].x;
  const rightmost = byLeftEdge.reduce(
    (edge, it) => (it.x + it.w > edge ? it.x + it.w : edge),
    leftmost
  );
  const span = rightmost - leftmost;
  if (span < 200) return false;

  let widestGap = 0;
  let splitX = 0;
  byLeftEdge.forEach((it, idx) => {
    if (idx === 0) return;
    const prev = byLeftEdge[idx - 1];
    const gap = it.x - (prev.x + prev.w);
    if (gap > widestGap) {
      widestGap = gap;
      splitX = (prev.x + prev.w + it.x) / 2;
    }
  });
  if (widestGap < 20) return false;

  const splitRatio = (splitX - leftmost) / span;
  if (splitRatio < 0.35 || splitRatio > 0.65) return false;

  let leftCount = 0;
  let rightCount = 0;
  for (const it of items) {
    const centre = it.x + it.w / 2;
    if (centre < splitX) leftCount++;
    if (centre >= splitX) rightCount++;
  }
  if (leftCount < 15 || rightCount < 15) return false;

  const balance = Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount);
  return balance >= 0.35;
}
3816
4167
  var MAX_XYCUT_DEPTH = 50;
3817
4168
  function xyCutOrder(items, gapThreshold, depth = 0) {
3818
4169
  if (items.length === 0) return [];
@@ -3943,6 +4294,11 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
3943
4294
  width: grid.bbox.x2 - grid.bbox.x1,
3944
4295
  height: grid.bbox.y2 - grid.bbox.y1
3945
4296
  };
4297
+ const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
4298
+ if (normalized) {
4299
+ blocks.push(...normalized);
4300
+ continue;
4301
+ }
3946
4302
  if (shouldDemoteTable(irTable)) {
3947
4303
  const demoted = demoteTableToText(irTable);
3948
4304
  if (demoted) {
@@ -3988,6 +4344,10 @@ function mergeAdjacentTableBlocks(blocks) {
3988
4344
  }
3989
4345
  function extractPageBlocksFallback(items, pageNum) {
3990
4346
  if (items.length === 0) return [];
4347
+ if (hasMultiColumnLayout(items)) {
4348
+ const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
4349
+ return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
4350
+ }
3991
4351
  const blocks = [];
3992
4352
  const allYLines = groupByY(items);
3993
4353
  const columns = detectColumns(allYLines);
@@ -4005,7 +4365,7 @@ function extractPageBlocksFallback(items, pageNum) {
4005
4365
  fontSize: i.fontSize,
4006
4366
  fontName: i.fontName
4007
4367
  }));
4008
- const clusterResults = detectClusterTables(clusterItems, pageNum);
4368
+ const clusterResults = hasMultiColumnLayout(items) ? [] : detectClusterTables(clusterItems, pageNum);
4009
4369
  if (clusterResults.length > 0) {
4010
4370
  const ciToIdx = /* @__PURE__ */ new Map();
4011
4371
  for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);