kordoc 2.0.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -85,6 +85,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
85
85
  blocks.push({ type: "paragraph", text: text.trim(), pageNumber: i });
86
86
  }
87
87
  } catch {
88
+ blocks.push({ type: "paragraph", text: `[OCR \uC2E4\uD328: \uD398\uC774\uC9C0 ${i}]` });
88
89
  }
89
90
  }
90
91
  return blocks;
@@ -182,7 +183,7 @@ var import_zlib = require("zlib");
182
183
  var import_xmldom = require("@xmldom/xmldom");
183
184
 
184
185
  // src/utils.ts
185
- var VERSION = true ? "2.0.3" : "0.0.0-dev";
186
+ var VERSION = true ? "2.1.0" : "0.0.0-dev";
186
187
  function toArrayBuffer(buf) {
187
188
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
188
189
  return buf.buffer;
@@ -512,6 +513,9 @@ function tableToMarkdown(table) {
512
513
  if (dr === 0 && dc === 0) continue;
513
514
  if (r + dr < numRows && c + dc < numCols) {
514
515
  skip.add(`${r + dr},${c + dc}`);
516
+ if (dr === 0) {
517
+ display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
518
+ }
515
519
  }
516
520
  }
517
521
  }
@@ -607,7 +611,12 @@ function parseCharProperties(doc, map) {
607
611
  if (!id) continue;
608
612
  const prop = {};
609
613
  const height = el.getAttribute("height");
610
- if (height) prop.fontSize = parseInt(height, 10) / 100;
614
+ if (height) {
615
+ const parsedHeight = parseInt(height, 10);
616
+ if (!isNaN(parsedHeight) && parsedHeight > 0) {
617
+ prop.fontSize = parsedHeight / 100;
618
+ }
619
+ }
611
620
  const bold = el.getAttribute("bold");
612
621
  if (bold === "true" || bold === "1") prop.bold = true;
613
622
  const italic = el.getAttribute("italic");
@@ -747,7 +756,7 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
747
756
  const data = await file.async("uint8array");
748
757
  decompressed.total += data.length;
749
758
  if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
750
- const ext = ref.includes(".") ? ref.split(".").pop() : "png";
759
+ const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
751
760
  const mimeType = imageExtToMime(ext);
752
761
  imageIndex++;
753
762
  const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
@@ -1041,8 +1050,10 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
1041
1050
  break;
1042
1051
  case "cellSpan":
1043
1052
  if (tableCtx?.cell) {
1044
- const cs = parseInt(el.getAttribute("colSpan") || "1", 10);
1045
- const rs = parseInt(el.getAttribute("rowSpan") || "1", 10);
1053
+ const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
1054
+ const cs = isNaN(rawCs) ? 1 : rawCs;
1055
+ const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
1056
+ const rs = isNaN(rawRs) ? 1 : rawRs;
1046
1057
  tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
1047
1058
  tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
1048
1059
  }
@@ -1134,6 +1145,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
1134
1145
  extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
1135
1146
  } else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
1136
1147
  walkChildren(el, d + 1);
1148
+ } else if (localTag === "run") {
1149
+ tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
1137
1150
  }
1138
1151
  }
1139
1152
  };
@@ -3002,10 +3015,33 @@ var MIN_LINE_LENGTH = 10;
3002
3015
  var COORD_MERGE_TOL = 3;
3003
3016
  var CONNECT_TOL = 5;
3004
3017
  var CELL_PADDING = 2;
3018
+ var MAX_LINE_WIDTH = 5;
3019
+ var IDENTITY = [1, 0, 0, 1, 0, 0];
3020
/**
 * Multiplies two 2D affine matrices stored as six-element arrays
 * `[a, b, c, d, e, f]`.
 *
 * Transforming a point with the result is the same as transforming it with
 * `m2` first and then with `m1`.
 *
 * @param {number[]} m1 - Outer (applied last) matrix.
 * @param {number[]} m2 - Inner (applied first) matrix.
 * @returns {number[]} A new six-element matrix; neither input is modified.
 */
function matMultiply(m1, m2) {
  const a1 = m1[0];
  const b1 = m1[1];
  const c1 = m1[2];
  const d1 = m1[3];
  const a2 = m2[0];
  const b2 = m2[1];
  const c2 = m2[2];
  const d2 = m2[3];
  const e2 = m2[4];
  const f2 = m2[5];

  const product = new Array(6);
  // Linear part.
  product[0] = a1 * a2 + c1 * b2;
  product[1] = b1 * a2 + d1 * b2;
  product[2] = a1 * c2 + c1 * d2;
  product[3] = b1 * c2 + d1 * d2;
  // Translation part: m2's translation pushed through m1, plus m1's own.
  product[4] = a1 * e2 + c1 * f2 + m1[4];
  product[5] = b1 * e2 + d1 * f2 + m1[5];
  return product;
}
3030
/**
 * Applies a six-element affine matrix `[a, b, c, d, e, f]` to a point.
 *
 * @param {number[]} m - Affine matrix.
 * @param {number} x - Source x coordinate.
 * @param {number} y - Source y coordinate.
 * @returns {number[]} The transformed point as `[x', y']`.
 */
function matTransformPoint(m, x, y) {
  const mappedX = m[0] * x + m[2] * y + m[4];
  const mappedY = m[1] * x + m[3] * y + m[5];
  return [mappedX, mappedY];
}
3033
/**
 * Returns a single scale factor for an affine matrix `[a, b, c, d, e, f]`:
 * the larger of the Euclidean norms of `(b, d)` and `(a, c)`.
 * The translation components are ignored.
 *
 * @param {number[]} m - Affine matrix.
 * @returns {number} The larger of the two norms.
 */
function matScale(m) {
  const normBD = Math.sqrt(m[1] * m[1] + m[3] * m[3]);
  const normAC = Math.sqrt(m[0] * m[0] + m[2] * m[2]);
  return Math.max(normBD, normAC);
}
3005
3039
  function extractLines(fnArray, argsArray) {
3006
3040
  const horizontals = [];
3007
3041
  const verticals = [];
3042
+ let ctm = [...IDENTITY];
3008
3043
  let lineWidth = 1;
3044
+ const stateStack = [];
3009
3045
  let currentPath = [];
3010
3046
  let pathStartX = 0, pathStartY = 0;
3011
3047
  let curX = 0, curY = 0;
@@ -3023,13 +3059,53 @@ function extractLines(fnArray, argsArray) {
3023
3059
  );
3024
3060
  }
3025
3061
  }
3026
- function flushPath(isStroke) {
3027
- if (!isStroke) {
3062
/**
 * Attempts to reinterpret a short closed path of line segments as a
 * rectangle-like shape, rewriting `path` in place.
 *
 * The path must have 3 to 5 segments and end within 1 unit of where it
 * starts. Its bounding box is then replaced by:
 *   - one horizontal centre line when the box is thin vertically,
 *   - one vertical centre line when the box is thin horizontally,
 *   - otherwise whatever `pushRectangle` appends for the box
 *     (helper defined outside this block).
 *
 * @param {Array<{x1:number,y1:number,x2:number,y2:number}>} path - Segments; mutated on success.
 * @returns {boolean} `true` when `path` was rewritten, `false` when left untouched.
 */
function tryConvertLinesToRectangle(path) {
  const segmentCount = path.length;
  if (segmentCount < 3 || segmentCount > 5) return false;

  const head = path[0];
  const tail = path[segmentCount - 1];
  const endsWhereItStarts =
    Math.abs(head.x1 - tail.x2) < 1 && Math.abs(head.y1 - tail.y2) < 1;
  if (!endsWhereItStarts) return false;

  // Bounding box over both end points of every segment.
  const xs = path.flatMap((s) => [s.x1, s.x2]);
  const ys = path.flatMap((s) => [s.y1, s.y2]);
  const minX = Math.min(Infinity, ...xs);
  const minY = Math.min(Infinity, ...ys);
  const maxX = Math.max(-Infinity, ...xs);
  const maxY = Math.max(-Infinity, ...ys);
  const w = maxX - minX;
  const h = maxY - minY;

  // Too small in both directions to be a table line or cell border.
  if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;

  // From here on the original segments are discarded.
  path.length = 0;

  const flatHorizontally =
    h < ORIENTATION_TOL * 2 || (w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH);
  if (flatHorizontally) {
    const midY = (minY + maxY) / 2;
    path.push({ x1: minX, y1: midY, x2: maxX, y2: midY });
    return true;
  }

  const flatVertically =
    w < ORIENTATION_TOL * 2 || (h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH);
  if (flatVertically) {
    const midX = (minX + maxX) / 2;
    path.push({ x1: midX, y1: minY, x2: midX, y2: maxY });
    return true;
  }

  pushRectangle(path, minX, minY, w, h);
  return true;
}
3086
/**
 * Paints the pending path: classifies its segments as horizontal/vertical
 * lines and then clears `currentPath`.
 *
 * Reads and writes variables of the enclosing function (`currentPath`,
 * `ctm`, `lineWidth`, `horizontals`, `verticals`).
 *
 * @param {boolean} isStroke - The paint operator strokes the path.
 * @param {boolean} isFill - The paint operator fills the path.
 */
function flushPath(isStroke, isFill) {
  // Neither stroked nor filled: nothing is drawn, drop the path.
  if (!(isStroke || isFill)) {
    currentPath = [];
    return;
  }

  // Fill-only paths may be thin filled boxes standing in for lines.
  const fillOnly = isFill && !isStroke;
  if (fillOnly && currentPath.length >= 3) {
    tryConvertLinesToRectangle(currentPath);
  }

  // Line width scaled by the current transformation matrix.
  const effectiveLW = lineWidth * matScale(ctm);

  // Stroke-only paths wider than the limit are discarded.
  const strokeOnly = isStroke && !isFill;
  if (strokeOnly && effectiveLW > MAX_LINE_WIDTH) {
    currentPath = [];
    return;
  }

  for (const seg of currentPath) {
    // Map both end points through the current transformation matrix.
    const start = matTransformPoint(ctm, seg.x1, seg.y1);
    const end = matTransformPoint(ctm, seg.x2, seg.y2);
    const transformed = { x1: start[0], y1: start[1], x2: end[0], y2: end[1] };
    classifyAndAdd(transformed, effectiveLW, horizontals, verticals);
  }
  currentPath = [];
}
@@ -3037,9 +3113,28 @@ function extractLines(fnArray, argsArray) {
3037
3113
  const op = fnArray[i];
3038
3114
  const args = argsArray[i];
3039
3115
  switch (op) {
3116
+ // ── Graphics State ──
3117
+ case import_pdf.OPS.save:
3118
+ stateStack.push({ ctm: [...ctm], lineWidth });
3119
+ break;
3120
+ case import_pdf.OPS.restore:
3121
+ if (stateStack.length > 0) {
3122
+ const state = stateStack.pop();
3123
+ ctm = state.ctm;
3124
+ lineWidth = state.lineWidth;
3125
+ }
3126
+ break;
3127
+ case import_pdf.OPS.transform: {
3128
+ const m = args;
3129
+ if (m.length >= 6) {
3130
+ ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
3131
+ }
3132
+ break;
3133
+ }
3040
3134
  case import_pdf.OPS.setLineWidth:
3041
3135
  lineWidth = args[0] || 1;
3042
3136
  break;
3137
+ // ── Path Construction ──
3043
3138
  case import_pdf.OPS.constructPath: {
3044
3139
  const arg0 = args[0];
3045
3140
  if (Array.isArray(arg0)) {
@@ -3107,34 +3202,60 @@ function extractLines(fnArray, argsArray) {
3107
3202
  }
3108
3203
  }
3109
3204
  }
3110
- if (afterOp === import_pdf.OPS.stroke || afterOp === import_pdf.OPS.closeStroke) {
3111
- flushPath(true);
3112
- } else if (afterOp === import_pdf.OPS.fill || afterOp === import_pdf.OPS.eoFill || afterOp === import_pdf.OPS.fillStroke || afterOp === import_pdf.OPS.eoFillStroke || afterOp === import_pdf.OPS.closeFillStroke || afterOp === import_pdf.OPS.closeEOFillStroke) {
3113
- flushPath(true);
3205
+ const isStroke5 = afterOp === import_pdf.OPS.stroke || afterOp === import_pdf.OPS.closeStroke;
3206
+ const isFill5 = afterOp === import_pdf.OPS.fill || afterOp === import_pdf.OPS.eoFill;
3207
+ const isBoth5 = afterOp === import_pdf.OPS.fillStroke || afterOp === import_pdf.OPS.eoFillStroke || afterOp === import_pdf.OPS.closeFillStroke || afterOp === import_pdf.OPS.closeEOFillStroke;
3208
+ if (isStroke5 || isFill5 || isBoth5) {
3209
+ flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
3114
3210
  } else if (afterOp === import_pdf.OPS.endPath) {
3115
- flushPath(false);
3211
+ flushPath(false, false);
3116
3212
  }
3117
3213
  }
3118
3214
  break;
3119
3215
  }
3216
+ // ── Paint Operations ──
3120
3217
  case import_pdf.OPS.stroke:
3121
3218
  case import_pdf.OPS.closeStroke:
3122
- flushPath(true);
3219
+ flushPath(true, false);
3123
3220
  break;
3124
3221
  case import_pdf.OPS.fill:
3125
3222
  case import_pdf.OPS.eoFill:
3223
+ flushPath(false, true);
3224
+ break;
3126
3225
  case import_pdf.OPS.fillStroke:
3127
3226
  case import_pdf.OPS.eoFillStroke:
3128
3227
  case import_pdf.OPS.closeFillStroke:
3129
3228
  case import_pdf.OPS.closeEOFillStroke:
3130
- flushPath(true);
3229
+ flushPath(true, true);
3131
3230
  break;
3132
3231
  case import_pdf.OPS.endPath:
3133
- flushPath(false);
3232
+ flushPath(false, false);
3134
3233
  break;
3135
3234
  }
3136
3235
  }
3137
- return { horizontals, verticals };
3236
+ return {
3237
+ horizontals: deduplicateLines(horizontals),
3238
+ verticals: deduplicateLines(verticals)
3239
+ };
3240
+ }
3241
/**
 * Removes near-duplicate lines.
 *
 * Two lines are duplicates when each of `x1`, `y1`, `x2`, `y2` differs by at
 * most `COORD_MERGE_TOL`. The first occurrence is kept; if a later duplicate
 * has a larger `lineWidth`, the kept line's `lineWidth` is raised to it
 * (the kept object is mutated).
 *
 * @param {Array<{x1:number,y1:number,x2:number,y2:number,lineWidth:number}>} lines
 * @returns {Array} The input array itself when it has at most one entry,
 *   otherwise a new array of the surviving line objects in original order.
 */
function deduplicateLines(lines) {
  if (lines.length <= 1) return lines;

  const tol = COORD_MERGE_TOL;
  const within = (p, q) => Math.abs(p - q) <= tol;
  const kept = [];

  for (const candidate of lines) {
    const twin = kept.find(
      (k) =>
        within(candidate.y1, k.y1) &&
        within(candidate.y2, k.y2) &&
        within(candidate.x1, k.x1) &&
        within(candidate.x2, k.x2)
    );
    if (twin === undefined) {
      kept.push(candidate);
      continue;
    }
    // Keep the widest stroke seen for this position.
    if (candidate.lineWidth > twin.lineWidth) {
      twin.lineWidth = candidate.lineWidth;
    }
  }
  return kept;
}
3139
3260
  function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
3140
3261
  const dx = Math.abs(seg.x2 - seg.x1);
@@ -3730,6 +3851,7 @@ async function parsePdfDocument(buffer, options) {
3730
3851
  const medianFontSize = computeMedianFontSize(allFontSizes);
3731
3852
  if (medianFontSize > 0) {
3732
3853
  detectHeadings(blocks, medianFontSize);
3854
+ mergeAdjacentHeadings(blocks);
3733
3855
  }
3734
3856
  detectMarkerHeadings(blocks);
3735
3857
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
@@ -3804,6 +3926,46 @@ function detectHeadings(blocks, medianFontSize) {
3804
3926
  }
3805
3927
  }
3806
3928
  }
3929
/**
 * Merges consecutive heading blocks that sit on the same visual line into a
 * single heading, modifying `blocks` in place.
 *
 * Two neighbouring blocks are merged when both are headings with text and a
 * bbox, are on the same page, have the same `level`, and their estimated
 * baselines (`bbox.y` plus font size, or bbox height when no font size is
 * set) differ by less than 1.5x the larger font size (12 is assumed when a
 * font size is missing). Texts are joined with a space, left-most block
 * first. The merged block is compared again with its new neighbour, so runs
 * of three or more headings collapse into one.
 *
 * The merged bbox is the union of both boxes on both axes.
 *
 * @param {Array<object>} blocks - Block list; mutated (entries removed and
 *   the surviving heading's `text`/`bbox` replaced).
 * @returns {void}
 */
function mergeAdjacentHeadings(blocks) {
  let i = 0;
  while (i < blocks.length - 1) {
    const curr = blocks[i];
    const next = blocks[i + 1];

    if (curr.type !== "heading" || next.type !== "heading") {
      i++;
      continue;
    }
    if (!curr.bbox || !next.bbox || !curr.text || !next.text) {
      i++;
      continue;
    }

    const currBaseline = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
    const nextBaseline = next.bbox.y + (next.style?.fontSize || next.bbox.height);
    const yDiff = Math.abs(currBaseline - nextBaseline);
    const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
    const sameY = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
    const sameLevel = curr.level === next.level;

    if (!(sameY && sameLevel)) {
      i++;
      continue;
    }

    // Reading order: the block that starts further left comes first.
    curr.text = curr.bbox.x <= next.bbox.x
      ? curr.text + " " + next.text
      : next.text + " " + curr.text;

    // Union of both boxes. The height must be derived from the union's
    // extent: taking max(height) alone leaves part of one heading outside
    // the box whenever the two y origins differ.
    const left = Math.min(curr.bbox.x, next.bbox.x);
    const right = Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width);
    const low = Math.min(curr.bbox.y, next.bbox.y);
    const high = Math.max(curr.bbox.y + curr.bbox.height, next.bbox.y + next.bbox.height);
    curr.bbox = {
      page: curr.bbox.page,
      x: left,
      y: low,
      width: right - left,
      height: high - low
    };

    // Remove the absorbed block; `i` is not advanced so the merged heading
    // is compared with its new neighbour on the next iteration.
    blocks.splice(i + 1, 1);
  }
}
3807
3969
  function collapseEvenSpacing(text) {
3808
3970
  const tokens = text.split(" ");
3809
3971
  const singleCharCount = tokens.filter((t) => t.length === 1).length;
@@ -3812,6 +3974,169 @@ function collapseEvenSpacing(text) {
3812
3974
  }
3813
3975
  return text;
3814
3976
  }
3977
/**
 * Builds paragraph blocks from text items, using the order produced by
 * `xyCutOrder`.
 *
 * The gap threshold passed to `xyCutOrder` is 3% of the vertical span of the
 * items' `y` values, but never less than 15. Each ordered group is split
 * into lines by `groupByY`; every line with non-blank text becomes one
 * paragraph block.
 *
 * @param {Array<object>} items - Text items of one page (each has a `y`).
 * @param {number} pageNum - Page number stored on every produced block.
 * @returns {Array<object>|null} Paragraph blocks, or `null` when none were produced.
 */
function buildXyCutBlocks(items, pageNum) {
  const yValues = items.map((item) => item.y);
  const ySpan = Math.max(...yValues) - Math.min(...yValues);
  const gapThreshold = Math.max(15, ySpan * 0.03);

  const paragraphs = [];
  const groups = xyCutOrder(items, gapThreshold);
  for (const group of groups) {
    if (group.length === 0) continue;
    for (const line of groupByY(group)) {
      const text = mergeLineSimple(line);
      if (!text.trim()) continue;
      const bbox = computeBBox(line, pageNum);
      const style = dominantStyle(line);
      paragraphs.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style });
    }
  }

  if (paragraphs.length > 0) return paragraphs;
  return null;
}
4000
/**
 * Re-analyses a grid table that looks under-segmented (far fewer filled
 * cells than its text content suggests) and returns replacement blocks.
 *
 * A table counts as under-segmented when it is 1x1, or when it has at most
 * two non-blank cells and either at least 8 text lines in those cells or at
 * least 6 text items. For such a table, in order:
 *   1. if the items form a multi-column layout, return the XY-cut paragraphs
 *      (this may be `null`);
 *   2. otherwise try to rebuild a table from the text layout;
 *   3. otherwise run cluster table detection and return the found tables
 *      plus one paragraph per unused, non-blank item, sorted by
 *      `bbox.y + bbox.height` descending.
 *
 * @param {object} table - Table with `rows`, `cols` and `cells[][]` (`text`).
 * @param {Array<object>} items - Text items lying inside the table area.
 * @param {number} pageNum - Page number stored on produced blocks.
 * @param {object} bbox - Bounding box forwarded to the layout-based rebuild.
 * @returns {Array<object>|null} Replacement blocks, or `null` to keep the table.
 */
function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
  let filledCells = 0;
  let filledTextLines = 0;
  for (const row of table.cells) {
    for (const cell of row) {
      if (!cell.text.trim()) continue;
      filledCells += 1;
      filledTextLines += cell.text.split("\n").length;
    }
  }

  const isSingleCell = table.rows === 1 && table.cols === 1;
  const isSparse = filledCells <= 2;
  const isUnderSegmented =
    isSingleCell || (isSparse && (filledTextLines >= 8 || items.length >= 6));
  if (!isUnderSegmented) return null;

  if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);

  const directTable = buildTableFromTextLayout(items, pageNum, bbox);
  if (directTable) return directTable;

  // Plain copies of the items; the same position in both arrays refers to
  // the same source item.
  const clusterItems = items.map((src) => ({
    text: src.text,
    x: src.x,
    y: src.y,
    w: src.w,
    h: src.h,
    fontSize: src.fontSize,
    fontName: src.fontName
  }));
  const clusterResults = detectClusterTables(clusterItems, pageNum);
  if (!(clusterResults.length > 0)) return null;

  const positionOf = /* @__PURE__ */ new Map();
  clusterItems.forEach((ci, position) => positionOf.set(ci, position));

  const consumed = /* @__PURE__ */ new Set();
  const blocks = [];
  for (const result of clusterResults) {
    for (const used of result.usedItems) {
      const position = positionOf.get(used);
      if (position !== void 0) consumed.add(position);
    }
    blocks.push({ type: "table", table: result.table, pageNumber: pageNum, bbox: result.bbox });
  }

  // Items not absorbed by any detected table become standalone paragraphs.
  items.forEach((item, position) => {
    if (consumed.has(position)) return;
    if (!item.text.trim()) return;
    blocks.push({
      type: "paragraph",
      text: item.text,
      pageNumber: pageNum,
      bbox: computeBBox([item], pageNum),
      style: { fontSize: item.fontSize, fontName: item.fontName }
    });
  });

  const upperEdge = (block) => (block.bbox ? block.bbox.y + block.bbox.height : 0);
  blocks.sort((a, b) => upperEdge(b) - upperEdge(a));

  return blocks.length > 0 ? blocks : null;
}
4050
/**
 * Tries to rebuild a table purely from the positions of text items.
 *
 * Steps:
 *   1. Group items into rows: items whose `y` is within 3 units of the
 *      row's first item share a row (rows ordered by descending `y`).
 *   2. Within each row, record the midpoint of every horizontal gap that is
 *      at least 1.5x the row's average font size.
 *   3. Cluster those midpoints (within 15 units of the running cluster
 *      average); clusters with at least two members become column
 *      boundaries.
 *   4. Assign each item to a column by its horizontal centre. A row whose
 *      first column is empty is treated as a continuation and appended to
 *      the previous row's cells.
 *   5. Reject the result if fewer than two rows remain or fewer than 30% of
 *      the cells hold text.
 *
 * In columns after the first, a leading bullet (`•`, `○`, `·`, or `-`) is
 * stripped. A hyphen is NOT treated as a bullet when it is the whole cell
 * (a "-" placeholder) or is directly followed by a digit or a decimal
 * separator plus digit (a negative number), so numeric cells keep their sign.
 *
 * @param {Array<{text:string,x:number,y:number,w:number,fontSize:number}>} items
 * @param {number} pageNum - Page number stored on the produced block.
 * @param {object} bbox - Bounding box stored on the produced block as-is.
 * @returns {Array<object>|null} A one-element array holding the table block,
 *   or `null` when no convincing table structure is found.
 */
function buildTableFromTextLayout(items, pageNum, bbox) {
  if (items.length < 4) return null;

  // 1. Rows, top to bottom (larger y first), left to right inside a row.
  const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
  const yTol = 3;
  const rows = [];
  let curRow = [sorted[0]];
  let curY = sorted[0].y;
  for (let i = 1; i < sorted.length; i++) {
    if (Math.abs(sorted[i].y - curY) <= yTol) {
      curRow.push(sorted[i]);
    } else {
      rows.push(curRow);
      curRow = [sorted[i]];
      curY = sorted[i].y;
    }
  }
  rows.push(curRow);
  if (rows.length < 2) return null;

  // 2. Midpoints of wide horizontal gaps, collected over all rows.
  const gapPositions = [];
  for (const row of rows) {
    if (row.length < 2) continue;
    const sortedX = [...row].sort((a, b) => a.x - b.x);
    const avgFs = sortedX.reduce((s, item) => s + item.fontSize, 0) / sortedX.length;
    for (let j = 1; j < sortedX.length; j++) {
      const prevRight = sortedX[j - 1].x + sortedX[j - 1].w;
      const gap = sortedX[j].x - prevRight;
      if (gap >= avgFs * 1.5) {
        gapPositions.push(prevRight + gap / 2);
      }
    }
  }
  if (gapPositions.length < 2) return null;

  // 3. Cluster the midpoints; a boundary needs support from >= 2 gaps.
  gapPositions.sort((a, b) => a - b);
  const colBoundaries = [];
  let clusterSum = gapPositions[0];
  let clusterCount = 1;
  for (let i = 1; i < gapPositions.length; i++) {
    const avg = clusterSum / clusterCount;
    if (Math.abs(gapPositions[i] - avg) <= 15) {
      clusterSum += gapPositions[i];
      clusterCount++;
    } else {
      if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
      clusterSum = gapPositions[i];
      clusterCount = 1;
    }
  }
  if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
  if (colBoundaries.length === 0) return null;

  // 4. Distribute items into columns.
  const numCols = colBoundaries.length + 1;
  const tableRows = [];
  for (const row of rows) {
    const cells = Array(numCols).fill("");
    const sortedX = [...row].sort((a, b) => a.x - b.x);
    for (const item of sortedX) {
      const cx = item.x + item.w / 2;
      let col = 0;
      for (let b = 0; b < colBoundaries.length; b++) {
        if (cx > colBoundaries[b]) col = b + 1;
      }
      cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
    }
    if (cells[0].trim() === "" && tableRows.length > 0) {
      // Continuation line: fold its text into the previous row.
      const prevCells = tableRows[tableRows.length - 1].cells;
      for (let c = 0; c < numCols; c++) {
        const extra = cells[c].trim();
        if (extra) {
          prevCells[c] = prevCells[c] ? prevCells[c] + " " + extra : extra;
        }
      }
    } else {
      tableRows.push({ cells });
    }
  }
  if (tableRows.length < 2) return null;

  // 5. Density check.
  const nonEmptyCount = tableRows.reduce(
    (sum, r) => sum + r.cells.filter((c) => c.trim()).length,
    0
  );
  const totalCount = tableRows.length * numCols;
  if (nonEmptyCount < totalCount * 0.3) return null;

  // Leading bullet marker. The hyphen alternative refuses to match a lone
  // "-" or a numeric sign so that "-5" / "-.5" / "-" survive intact.
  const leadingBullet = /^(?:[•○·]|-(?!\d|[.,]\d|$))\s*/;
  const irCells = tableRows.map((r) =>
    r.cells.map((text, colIdx) => {
      let cleaned = text.trim();
      if (colIdx > 0) cleaned = cleaned.replace(leadingBullet, "");
      return { text: cleaned, colSpan: 1, rowSpan: 1 };
    })
  );
  const irTable = {
    rows: tableRows.length,
    cols: numCols,
    cells: irCells,
    hasHeader: tableRows.length > 1
  };
  return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
}
3815
4140
  function shouldDemoteTable(table) {
3816
4141
  const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
3817
4142
  const allText = allCells.join(" ");
@@ -3858,6 +4183,32 @@ function detectMarkerHeadings(blocks) {
3858
4183
  }
3859
4184
  }
3860
4185
  }
4186
/**
 * Heuristically decides whether the text items are laid out in two
 * side-by-side columns.
 *
 * Returns `true` only when all of the following hold:
 *   - there are at least 30 items;
 *   - the horizontal extent of the items is at least 200 units;
 *   - with items sorted by `x`, the widest gap between one item's right
 *     edge and the next item's left edge is at least 20 units;
 *   - the middle of that gap lies between 35% and 65% of the extent;
 *   - at least 15 item centres fall on each side of it, and the smaller
 *     side holds at least 35% as many items as the larger side.
 *
 * @param {Array<{x:number,w:number}>} items - Text items of one region/page.
 * @returns {boolean} Whether a two-column layout was detected.
 */
function hasMultiColumnLayout(items) {
  if (items.length < 30) return false;

  const byX = [...items].sort((a, b) => a.x - b.x);
  const minX = byX[0].x;
  let maxX = minX;
  byX.forEach((item) => {
    const rightEdge = item.x + item.w;
    if (rightEdge > maxX) maxX = rightEdge;
  });

  const pageWidth = maxX - minX;
  if (pageWidth < 200) return false;

  // Widest gap between consecutive items (in x order) and its midpoint.
  let bestGap = 0;
  let bestSplit = 0;
  for (let j = 1; j < byX.length; j++) {
    const prevRight = byX[j - 1].x + byX[j - 1].w;
    const gap = byX[j].x - prevRight;
    if (gap > bestGap) {
      bestGap = gap;
      bestSplit = (prevRight + byX[j].x) / 2;
    }
  }
  if (bestGap < 20) return false;

  // The split must be roughly central.
  const splitRatio = (bestSplit - minX) / pageWidth;
  if (splitRatio < 0.35 || splitRatio > 0.65) return false;

  // Both sides need enough items and must be reasonably balanced.
  let leftCount = 0;
  let rightCount = 0;
  for (const item of items) {
    const centre = item.x + item.w / 2;
    if (centre < bestSplit) leftCount++;
    if (centre >= bestSplit) rightCount++;
  }
  if (leftCount < 15 || rightCount < 15) return false;

  const balance = Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount);
  return !(balance < 0.35);
}
3861
4212
  var MAX_XYCUT_DEPTH = 50;
3862
4213
  function xyCutOrder(items, gapThreshold, depth = 0) {
3863
4214
  if (items.length === 0) return [];
@@ -3988,6 +4339,11 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
3988
4339
  width: grid.bbox.x2 - grid.bbox.x1,
3989
4340
  height: grid.bbox.y2 - grid.bbox.y1
3990
4341
  };
4342
+ const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
4343
+ if (normalized) {
4344
+ blocks.push(...normalized);
4345
+ continue;
4346
+ }
3991
4347
  if (shouldDemoteTable(irTable)) {
3992
4348
  const demoted = demoteTableToText(irTable);
3993
4349
  if (demoted) {
@@ -4033,6 +4389,10 @@ function mergeAdjacentTableBlocks(blocks) {
4033
4389
  }
4034
4390
  function extractPageBlocksFallback(items, pageNum) {
4035
4391
  if (items.length === 0) return [];
4392
+ if (hasMultiColumnLayout(items)) {
4393
+ const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
4394
+ return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
4395
+ }
4036
4396
  const blocks = [];
4037
4397
  const allYLines = groupByY(items);
4038
4398
  const columns = detectColumns(allYLines);
@@ -4050,7 +4410,7 @@ function extractPageBlocksFallback(items, pageNum) {
4050
4410
  fontSize: i.fontSize,
4051
4411
  fontName: i.fontName
4052
4412
  }));
4053
- const clusterResults = detectClusterTables(clusterItems, pageNum);
4413
+ const clusterResults = hasMultiColumnLayout(items) ? [] : detectClusterTables(clusterItems, pageNum);
4054
4414
  if (clusterResults.length > 0) {
4055
4415
  const ciToIdx = /* @__PURE__ */ new Map();
4056
4416
  for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);