kordoc 2.0.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,10 +6,10 @@ import {
6
6
  precheckZipSize,
7
7
  sanitizeHref,
8
8
  toArrayBuffer
9
- } from "./chunk-25TXW6EP.js";
9
+ } from "./chunk-PKIJLEV6.js";
10
10
  import {
11
11
  parsePageRange
12
- } from "./chunk-3TBUDJDE.js";
12
+ } from "./chunk-MOL7MDBG.js";
13
13
 
14
14
  // src/detect.ts
15
15
  import JSZip from "jszip";
@@ -304,6 +304,9 @@ function tableToMarkdown(table) {
304
304
  if (dr === 0 && dc === 0) continue;
305
305
  if (r + dr < numRows && c + dc < numCols) {
306
306
  skip.add(`${r + dr},${c + dc}`);
307
+ if (dr === 0) {
308
+ display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
309
+ }
307
310
  }
308
311
  }
309
312
  }
@@ -403,7 +406,12 @@ function parseCharProperties(doc, map) {
403
406
  if (!id) continue;
404
407
  const prop = {};
405
408
  const height = el.getAttribute("height");
406
- if (height) prop.fontSize = parseInt(height, 10) / 100;
409
+ if (height) {
410
+ const parsedHeight = parseInt(height, 10);
411
+ if (!isNaN(parsedHeight) && parsedHeight > 0) {
412
+ prop.fontSize = parsedHeight / 100;
413
+ }
414
+ }
407
415
  const bold = el.getAttribute("bold");
408
416
  if (bold === "true" || bold === "1") prop.bold = true;
409
417
  const italic = el.getAttribute("italic");
@@ -543,7 +551,7 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
543
551
  const data = await file.async("uint8array");
544
552
  decompressed.total += data.length;
545
553
  if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
546
- const ext = ref.includes(".") ? ref.split(".").pop() : "png";
554
+ const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
547
555
  const mimeType = imageExtToMime(ext);
548
556
  imageIndex++;
549
557
  const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
@@ -850,8 +858,10 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
850
858
  break;
851
859
  case "cellSpan":
852
860
  if (tableCtx?.cell) {
853
- const cs = parseInt(el.getAttribute("colSpan") || "1", 10);
854
- const rs = parseInt(el.getAttribute("rowSpan") || "1", 10);
861
+ const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
862
+ const cs = isNaN(rawCs) ? 1 : rawCs;
863
+ const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
864
+ const rs = isNaN(rawRs) ? 1 : rawRs;
855
865
  tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
856
866
  tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
857
867
  }
@@ -943,6 +953,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
943
953
  extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
944
954
  } else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
945
955
  walkChildren(el, d + 1);
956
+ } else if (localTag === "run") {
957
+ tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
946
958
  }
947
959
  }
948
960
  };
@@ -2820,10 +2832,33 @@ var MIN_LINE_LENGTH = 10;
2820
2832
  var COORD_MERGE_TOL = 3;
2821
2833
  var CONNECT_TOL = 5;
2822
2834
  var CELL_PADDING = 2;
2835
+ var MAX_LINE_WIDTH = 5;
2836
+ var IDENTITY = [1, 0, 0, 1, 0, 0];
2837
+ function matMultiply(m1, m2) {
2838
+ return [
2839
+ m1[0] * m2[0] + m1[2] * m2[1],
2840
+ m1[1] * m2[0] + m1[3] * m2[1],
2841
+ m1[0] * m2[2] + m1[2] * m2[3],
2842
+ m1[1] * m2[2] + m1[3] * m2[3],
2843
+ m1[0] * m2[4] + m1[2] * m2[5] + m1[4],
2844
+ m1[1] * m2[4] + m1[3] * m2[5] + m1[5]
2845
+ ];
2846
+ }
2847
+ function matTransformPoint(m, x, y) {
2848
+ return [m[0] * x + m[2] * y + m[4], m[1] * x + m[3] * y + m[5]];
2849
+ }
2850
+ function matScale(m) {
2851
+ return Math.max(
2852
+ Math.sqrt(m[1] * m[1] + m[3] * m[3]),
2853
+ Math.sqrt(m[0] * m[0] + m[2] * m[2])
2854
+ );
2855
+ }
2823
2856
  function extractLines(fnArray, argsArray) {
2824
2857
  const horizontals = [];
2825
2858
  const verticals = [];
2859
+ let ctm = [...IDENTITY];
2826
2860
  let lineWidth = 1;
2861
+ const stateStack = [];
2827
2862
  let currentPath = [];
2828
2863
  let pathStartX = 0, pathStartY = 0;
2829
2864
  let curX = 0, curY = 0;
@@ -2841,13 +2876,53 @@ function extractLines(fnArray, argsArray) {
2841
2876
  );
2842
2877
  }
2843
2878
  }
2844
- function flushPath(isStroke) {
2845
- if (!isStroke) {
2879
+ function tryConvertLinesToRectangle(path) {
2880
+ if (path.length < 3 || path.length > 5) return false;
2881
+ const first = path[0], last = path[path.length - 1];
2882
+ const closed = Math.abs(first.x1 - last.x2) < 1 && Math.abs(first.y1 - last.y2) < 1;
2883
+ if (!closed) return false;
2884
+ let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
2885
+ for (const seg of path) {
2886
+ minX = Math.min(minX, seg.x1, seg.x2);
2887
+ minY = Math.min(minY, seg.y1, seg.y2);
2888
+ maxX = Math.max(maxX, seg.x1, seg.x2);
2889
+ maxY = Math.max(maxY, seg.y1, seg.y2);
2890
+ }
2891
+ const w = maxX - minX, h = maxY - minY;
2892
+ if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;
2893
+ path.length = 0;
2894
+ if (h < ORIENTATION_TOL * 2 || w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH) {
2895
+ path.push({ x1: minX, y1: (minY + maxY) / 2, x2: maxX, y2: (minY + maxY) / 2 });
2896
+ } else if (w < ORIENTATION_TOL * 2 || h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH) {
2897
+ path.push({ x1: (minX + maxX) / 2, y1: minY, x2: (minX + maxX) / 2, y2: maxY });
2898
+ } else {
2899
+ pushRectangle(path, minX, minY, w, h);
2900
+ }
2901
+ return true;
2902
+ }
2903
+ function flushPath(isStroke, isFill) {
2904
+ if (!isStroke && !isFill) {
2905
+ currentPath = [];
2906
+ return;
2907
+ }
2908
+ if (isFill && !isStroke && currentPath.length >= 3) {
2909
+ tryConvertLinesToRectangle(currentPath);
2910
+ }
2911
+ const scale = matScale(ctm);
2912
+ const effectiveLW = lineWidth * scale;
2913
+ if (effectiveLW > MAX_LINE_WIDTH && isStroke && !isFill) {
2846
2914
  currentPath = [];
2847
2915
  return;
2848
2916
  }
2849
2917
  for (const seg of currentPath) {
2850
- classifyAndAdd(seg, lineWidth, horizontals, verticals);
2918
+ const [px1, py1] = matTransformPoint(ctm, seg.x1, seg.y1);
2919
+ const [px2, py2] = matTransformPoint(ctm, seg.x2, seg.y2);
2920
+ classifyAndAdd(
2921
+ { x1: px1, y1: py1, x2: px2, y2: py2 },
2922
+ effectiveLW,
2923
+ horizontals,
2924
+ verticals
2925
+ );
2851
2926
  }
2852
2927
  currentPath = [];
2853
2928
  }
@@ -2855,9 +2930,28 @@ function extractLines(fnArray, argsArray) {
2855
2930
  const op = fnArray[i];
2856
2931
  const args = argsArray[i];
2857
2932
  switch (op) {
2933
+ // ── Graphics State ──
2934
+ case OPS.save:
2935
+ stateStack.push({ ctm: [...ctm], lineWidth });
2936
+ break;
2937
+ case OPS.restore:
2938
+ if (stateStack.length > 0) {
2939
+ const state = stateStack.pop();
2940
+ ctm = state.ctm;
2941
+ lineWidth = state.lineWidth;
2942
+ }
2943
+ break;
2944
+ case OPS.transform: {
2945
+ const m = args;
2946
+ if (m.length >= 6) {
2947
+ ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
2948
+ }
2949
+ break;
2950
+ }
2858
2951
  case OPS.setLineWidth:
2859
2952
  lineWidth = args[0] || 1;
2860
2953
  break;
2954
+ // ── Path Construction ──
2861
2955
  case OPS.constructPath: {
2862
2956
  const arg0 = args[0];
2863
2957
  if (Array.isArray(arg0)) {
@@ -2925,34 +3019,60 @@ function extractLines(fnArray, argsArray) {
2925
3019
  }
2926
3020
  }
2927
3021
  }
2928
- if (afterOp === OPS.stroke || afterOp === OPS.closeStroke) {
2929
- flushPath(true);
2930
- } else if (afterOp === OPS.fill || afterOp === OPS.eoFill || afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke) {
2931
- flushPath(true);
3022
+ const isStroke5 = afterOp === OPS.stroke || afterOp === OPS.closeStroke;
3023
+ const isFill5 = afterOp === OPS.fill || afterOp === OPS.eoFill;
3024
+ const isBoth5 = afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke;
3025
+ if (isStroke5 || isFill5 || isBoth5) {
3026
+ flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
2932
3027
  } else if (afterOp === OPS.endPath) {
2933
- flushPath(false);
3028
+ flushPath(false, false);
2934
3029
  }
2935
3030
  }
2936
3031
  break;
2937
3032
  }
3033
+ // ── Paint Operations ──
2938
3034
  case OPS.stroke:
2939
3035
  case OPS.closeStroke:
2940
- flushPath(true);
3036
+ flushPath(true, false);
2941
3037
  break;
2942
3038
  case OPS.fill:
2943
3039
  case OPS.eoFill:
3040
+ flushPath(false, true);
3041
+ break;
2944
3042
  case OPS.fillStroke:
2945
3043
  case OPS.eoFillStroke:
2946
3044
  case OPS.closeFillStroke:
2947
3045
  case OPS.closeEOFillStroke:
2948
- flushPath(true);
3046
+ flushPath(true, true);
2949
3047
  break;
2950
3048
  case OPS.endPath:
2951
- flushPath(false);
3049
+ flushPath(false, false);
3050
+ break;
3051
+ }
3052
+ }
3053
+ return {
3054
+ horizontals: deduplicateLines(horizontals),
3055
+ verticals: deduplicateLines(verticals)
3056
+ };
3057
+ }
3058
+ function deduplicateLines(lines) {
3059
+ if (lines.length <= 1) return lines;
3060
+ const result = [];
3061
+ const tol = COORD_MERGE_TOL;
3062
+ for (const line of lines) {
3063
+ let isDuplicate = false;
3064
+ for (const existing of result) {
3065
+ if (Math.abs(line.y1 - existing.y1) <= tol && Math.abs(line.y2 - existing.y2) <= tol && Math.abs(line.x1 - existing.x1) <= tol && Math.abs(line.x2 - existing.x2) <= tol) {
3066
+ if (line.lineWidth > existing.lineWidth) {
3067
+ existing.lineWidth = line.lineWidth;
3068
+ }
3069
+ isDuplicate = true;
2952
3070
  break;
3071
+ }
2953
3072
  }
3073
+ if (!isDuplicate) result.push(line);
2954
3074
  }
2955
- return { horizontals, verticals };
3075
+ return result;
2956
3076
  }
2957
3077
  function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
2958
3078
  const dx = Math.abs(seg.x2 - seg.x1);
@@ -3528,7 +3648,7 @@ async function parsePdfDocument(buffer, options) {
3528
3648
  if (totalChars / Math.max(parsedPageCount, 1) < 10) {
3529
3649
  if (options?.ocr) {
3530
3650
  try {
3531
- const { ocrPages } = await import("./provider-EU3CG724.js");
3651
+ const { ocrPages } = await import("./provider-7H4CPZYS.js");
3532
3652
  const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
3533
3653
  if (ocrBlocks.length > 0) {
3534
3654
  const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
@@ -3548,6 +3668,7 @@ async function parsePdfDocument(buffer, options) {
3548
3668
  const medianFontSize = computeMedianFontSize(allFontSizes);
3549
3669
  if (medianFontSize > 0) {
3550
3670
  detectHeadings(blocks, medianFontSize);
3671
+ mergeAdjacentHeadings(blocks);
3551
3672
  }
3552
3673
  detectMarkerHeadings(blocks);
3553
3674
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
@@ -3633,6 +3754,46 @@ function detectHeadings(blocks, medianFontSize) {
3633
3754
  }
3634
3755
  }
3635
3756
  }
3757
+ function mergeAdjacentHeadings(blocks) {
3758
+ let i = 0;
3759
+ while (i < blocks.length - 1) {
3760
+ const curr = blocks[i];
3761
+ const next = blocks[i + 1];
3762
+ if (curr.type !== "heading" || next.type !== "heading") {
3763
+ i++;
3764
+ continue;
3765
+ }
3766
+ if (!curr.bbox || !next.bbox || !curr.text || !next.text) {
3767
+ i++;
3768
+ continue;
3769
+ }
3770
+ const currBaseline = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
3771
+ const nextBaseline = next.bbox.y + (next.style?.fontSize || next.bbox.height);
3772
+ const yDiff = Math.abs(currBaseline - nextBaseline);
3773
+ const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
3774
+ const sameY = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
3775
+ const sameLevel = curr.level === next.level;
3776
+ if (sameY && sameLevel) {
3777
+ const currX = curr.bbox.x;
3778
+ const nextX = next.bbox.x;
3779
+ if (currX <= nextX) {
3780
+ curr.text = curr.text + " " + next.text;
3781
+ } else {
3782
+ curr.text = next.text + " " + curr.text;
3783
+ }
3784
+ curr.bbox = {
3785
+ page: curr.bbox.page,
3786
+ x: Math.min(curr.bbox.x, next.bbox.x),
3787
+ y: Math.min(curr.bbox.y, next.bbox.y),
3788
+ width: Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width) - Math.min(curr.bbox.x, next.bbox.x),
3789
+ height: Math.max(curr.bbox.height, next.bbox.height)
3790
+ };
3791
+ blocks.splice(i + 1, 1);
3792
+ } else {
3793
+ i++;
3794
+ }
3795
+ }
3796
+ }
3636
3797
  function collapseEvenSpacing(text) {
3637
3798
  const tokens = text.split(" ");
3638
3799
  const singleCharCount = tokens.filter((t) => t.length === 1).length;
@@ -3641,6 +3802,169 @@ function collapseEvenSpacing(text) {
3641
3802
  }
3642
3803
  return text;
3643
3804
  }
3805
+ function buildXyCutBlocks(items, pageNum) {
3806
+ const allY = items.map((i) => i.y);
3807
+ const pageHeight = Math.max(...allY) - Math.min(...allY);
3808
+ const gapThreshold = Math.max(15, pageHeight * 0.03);
3809
+ const orderedGroups = xyCutOrder(items, gapThreshold);
3810
+ const blocks = [];
3811
+ for (const group of orderedGroups) {
3812
+ if (group.length === 0) continue;
3813
+ const yLines = groupByY(group);
3814
+ for (const line of yLines) {
3815
+ const text = mergeLineSimple(line);
3816
+ if (!text.trim()) continue;
3817
+ blocks.push({
3818
+ type: "paragraph",
3819
+ text,
3820
+ pageNumber: pageNum,
3821
+ bbox: computeBBox(line, pageNum),
3822
+ style: dominantStyle(line)
3823
+ });
3824
+ }
3825
+ }
3826
+ return blocks.length > 0 ? blocks : null;
3827
+ }
3828
+ function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
3829
+ const totalCells = table.cells.reduce((sum, row) => sum + row.filter((c) => c.text.trim()).length, 0);
3830
+ const totalTextLines = table.cells.reduce((sum, row) => sum + row.reduce((s, c) => s + (c.text.trim() ? c.text.split("\n").length : 0), 0), 0);
3831
+ const isUnderSegmented = table.rows === 1 && table.cols === 1 || totalCells <= 2 && totalTextLines >= 8 || totalCells <= 2 && items.length >= 6;
3832
+ if (!isUnderSegmented) return null;
3833
+ if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);
3834
+ const directTable = buildTableFromTextLayout(items, pageNum, bbox);
3835
+ if (directTable) return directTable;
3836
+ const clusterItems = items.map((i) => ({
3837
+ text: i.text,
3838
+ x: i.x,
3839
+ y: i.y,
3840
+ w: i.w,
3841
+ h: i.h,
3842
+ fontSize: i.fontSize,
3843
+ fontName: i.fontName
3844
+ }));
3845
+ const clusterResults = detectClusterTables(clusterItems, pageNum);
3846
+ if (clusterResults.length > 0) {
3847
+ const blocks = [];
3848
+ const ciToIdx = /* @__PURE__ */ new Map();
3849
+ for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
3850
+ const usedIndices = /* @__PURE__ */ new Set();
3851
+ for (const cr of clusterResults) {
3852
+ for (const ci of cr.usedItems) {
3853
+ const idx = ciToIdx.get(ci);
3854
+ if (idx !== void 0) usedIndices.add(idx);
3855
+ }
3856
+ blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
3857
+ }
3858
+ const remaining = items.filter((_, idx) => !usedIndices.has(idx));
3859
+ for (const item of remaining) {
3860
+ if (!item.text.trim()) continue;
3861
+ blocks.push({
3862
+ type: "paragraph",
3863
+ text: item.text,
3864
+ pageNumber: pageNum,
3865
+ bbox: computeBBox([item], pageNum),
3866
+ style: { fontSize: item.fontSize, fontName: item.fontName }
3867
+ });
3868
+ }
3869
+ blocks.sort((a, b) => {
3870
+ const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
3871
+ const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
3872
+ return by - ay;
3873
+ });
3874
+ return blocks.length > 0 ? blocks : null;
3875
+ }
3876
+ return null;
3877
+ }
3878
+ function buildTableFromTextLayout(items, pageNum, bbox) {
3879
+ if (items.length < 4) return null;
3880
+ const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
3881
+ const yTol = 3;
3882
+ const rows = [];
3883
+ let curRow = [sorted[0]];
3884
+ let curY = sorted[0].y;
3885
+ for (let i = 1; i < sorted.length; i++) {
3886
+ if (Math.abs(sorted[i].y - curY) <= yTol) {
3887
+ curRow.push(sorted[i]);
3888
+ } else {
3889
+ rows.push(curRow);
3890
+ curRow = [sorted[i]];
3891
+ curY = sorted[i].y;
3892
+ }
3893
+ }
3894
+ rows.push(curRow);
3895
+ if (rows.length < 2) return null;
3896
+ const gapPositions = [];
3897
+ for (const row of rows) {
3898
+ if (row.length < 2) continue;
3899
+ const sortedX = [...row].sort((a, b) => a.x - b.x);
3900
+ const avgFs = sortedX.reduce((s, i) => s + i.fontSize, 0) / sortedX.length;
3901
+ for (let j = 1; j < sortedX.length; j++) {
3902
+ const gap = sortedX[j].x - (sortedX[j - 1].x + sortedX[j - 1].w);
3903
+ if (gap >= avgFs * 1.5) {
3904
+ gapPositions.push(sortedX[j - 1].x + sortedX[j - 1].w + gap / 2);
3905
+ }
3906
+ }
3907
+ }
3908
+ if (gapPositions.length < 2) return null;
3909
+ gapPositions.sort((a, b) => a - b);
3910
+ const colBoundaries = [];
3911
+ let clusterSum = gapPositions[0], clusterCount = 1;
3912
+ for (let i = 1; i < gapPositions.length; i++) {
3913
+ const avg = clusterSum / clusterCount;
3914
+ if (Math.abs(gapPositions[i] - avg) <= 15) {
3915
+ clusterSum += gapPositions[i];
3916
+ clusterCount++;
3917
+ } else {
3918
+ if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
3919
+ clusterSum = gapPositions[i];
3920
+ clusterCount = 1;
3921
+ }
3922
+ }
3923
+ if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
3924
+ if (colBoundaries.length === 0) return null;
3925
+ const numCols = colBoundaries.length + 1;
3926
+ const tableRows = [];
3927
+ for (const row of rows) {
3928
+ const cells = Array(numCols).fill("");
3929
+ const sortedX = [...row].sort((a, b) => a.x - b.x);
3930
+ for (const item of sortedX) {
3931
+ const cx = item.x + item.w / 2;
3932
+ let col = 0;
3933
+ for (let b = 0; b < colBoundaries.length; b++) {
3934
+ if (cx > colBoundaries[b]) col = b + 1;
3935
+ }
3936
+ cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
3937
+ }
3938
+ if (cells[0].trim() === "" && tableRows.length > 0) {
3939
+ const prevCells = tableRows[tableRows.length - 1].cells;
3940
+ for (let c = 0; c < numCols; c++) {
3941
+ if (cells[c].trim()) {
3942
+ prevCells[c] = prevCells[c] ? prevCells[c] + " " + cells[c].trim() : cells[c].trim();
3943
+ }
3944
+ }
3945
+ } else {
3946
+ tableRows.push({ cells });
3947
+ }
3948
+ }
3949
+ if (tableRows.length < 2) return null;
3950
+ const nonEmptyCount = tableRows.reduce((sum, r) => sum + r.cells.filter((c) => c.trim()).length, 0);
3951
+ const totalCount = tableRows.length * numCols;
3952
+ if (nonEmptyCount < totalCount * 0.3) return null;
3953
+ const irCells = tableRows.map(
3954
+ (r) => r.cells.map((text, colIdx) => {
3955
+ let cleaned = text.trim();
3956
+ if (colIdx > 0) cleaned = cleaned.replace(/^[•○·\-]\s*/, "");
3957
+ return { text: cleaned, colSpan: 1, rowSpan: 1 };
3958
+ })
3959
+ );
3960
+ const irTable = {
3961
+ rows: tableRows.length,
3962
+ cols: numCols,
3963
+ cells: irCells,
3964
+ hasHeader: tableRows.length > 1
3965
+ };
3966
+ return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
3967
+ }
3644
3968
  function shouldDemoteTable(table) {
3645
3969
  const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
3646
3970
  const allText = allCells.join(" ");
@@ -3687,6 +4011,32 @@ function detectMarkerHeadings(blocks) {
3687
4011
  }
3688
4012
  }
3689
4013
  }
4014
+ function hasMultiColumnLayout(items) {
4015
+ if (items.length < 30) return false;
4016
+ const sorted = [...items].sort((a, b) => a.x - b.x);
4017
+ const minX = sorted[0].x;
4018
+ let maxX = minX;
4019
+ for (const i of sorted) if (i.x + i.w > maxX) maxX = i.x + i.w;
4020
+ const pageWidth = maxX - minX;
4021
+ if (pageWidth < 200) return false;
4022
+ let bestGap = 0;
4023
+ let bestSplit = 0;
4024
+ for (let j = 1; j < sorted.length; j++) {
4025
+ const gap = sorted[j].x - (sorted[j - 1].x + sorted[j - 1].w);
4026
+ if (gap > bestGap) {
4027
+ bestGap = gap;
4028
+ bestSplit = (sorted[j - 1].x + sorted[j - 1].w + sorted[j].x) / 2;
4029
+ }
4030
+ }
4031
+ if (bestGap < 20) return false;
4032
+ const splitRatio = (bestSplit - minX) / pageWidth;
4033
+ if (splitRatio < 0.35 || splitRatio > 0.65) return false;
4034
+ const leftCount = items.filter((i) => i.x + i.w / 2 < bestSplit).length;
4035
+ const rightCount = items.filter((i) => i.x + i.w / 2 >= bestSplit).length;
4036
+ if (leftCount < 15 || rightCount < 15) return false;
4037
+ if (Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount) < 0.35) return false;
4038
+ return true;
4039
+ }
3690
4040
  var MAX_XYCUT_DEPTH = 50;
3691
4041
  function xyCutOrder(items, gapThreshold, depth = 0) {
3692
4042
  if (items.length === 0) return [];
@@ -3817,6 +4167,11 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
3817
4167
  width: grid.bbox.x2 - grid.bbox.x1,
3818
4168
  height: grid.bbox.y2 - grid.bbox.y1
3819
4169
  };
4170
+ const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
4171
+ if (normalized) {
4172
+ blocks.push(...normalized);
4173
+ continue;
4174
+ }
3820
4175
  if (shouldDemoteTable(irTable)) {
3821
4176
  const demoted = demoteTableToText(irTable);
3822
4177
  if (demoted) {
@@ -3862,6 +4217,10 @@ function mergeAdjacentTableBlocks(blocks) {
3862
4217
  }
3863
4218
  function extractPageBlocksFallback(items, pageNum) {
3864
4219
  if (items.length === 0) return [];
4220
+ if (hasMultiColumnLayout(items)) {
4221
+ const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
4222
+ return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
4223
+ }
3865
4224
  const blocks = [];
3866
4225
  const allYLines = groupByY(items);
3867
4226
  const columns = detectColumns(allYLines);
@@ -3879,7 +4238,7 @@ function extractPageBlocksFallback(items, pageNum) {
3879
4238
  fontSize: i.fontSize,
3880
4239
  fontName: i.fontName
3881
4240
  }));
3882
- const clusterResults = detectClusterTables(clusterItems, pageNum);
4241
+ const clusterResults = hasMultiColumnLayout(items) ? [] : detectClusterTables(clusterItems, pageNum);
3883
4242
  if (clusterResults.length > 0) {
3884
4243
  const ciToIdx = /* @__PURE__ */ new Map();
3885
4244
  for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
@@ -4626,7 +4985,7 @@ async function parseXlsxDocument(buffer, options) {
4626
4985
  }
4627
4986
  let pageFilter = null;
4628
4987
  if (options?.pages) {
4629
- const { parsePageRange: parsePageRange2 } = await import("./page-range-OF5I4PQY.js");
4988
+ const { parsePageRange: parsePageRange2 } = await import("./page-range-737B4EZW.js");
4630
4989
  pageFilter = parsePageRange2(options.pages, sheets.length);
4631
4990
  }
4632
4991
  const blocks = [];
@@ -5509,4 +5868,4 @@ export {
5509
5868
  extractFormFields,
5510
5869
  parse
5511
5870
  };
5512
- //# sourceMappingURL=chunk-4UH6ABAY.js.map
5871
+ //# sourceMappingURL=chunk-GJ2S6IMC.js.map