kordoc 2.0.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +302 -291
- package/dist/{chunk-4UH6ABAY.js → chunk-GJ2S6IMC.js} +381 -22
- package/dist/chunk-GJ2S6IMC.js.map +1 -0
- package/dist/{chunk-3TBUDJDE.js → chunk-MOL7MDBG.js} +1 -1
- package/dist/chunk-MOL7MDBG.js.map +1 -0
- package/dist/{chunk-25TXW6EP.js → chunk-PKIJLEV6.js} +2 -2
- package/dist/chunk-PKIJLEV6.js.map +1 -0
- package/dist/cli.js +5 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +378 -18
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +378 -18
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-737B4EZW.js +8 -0
- package/dist/{provider-EU3CG724.js → provider-7H4CPZYS.js} +2 -1
- package/dist/provider-7H4CPZYS.js.map +1 -0
- package/dist/{utils-BTZ4WSYX.js → utils-BWQ2RGUD.js} +2 -2
- package/dist/{watch-QD3PDNXQ.js → watch-X7IC7MLF.js} +10 -6
- package/dist/watch-X7IC7MLF.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-25TXW6EP.js.map +0 -1
- package/dist/chunk-3TBUDJDE.js.map +0 -1
- package/dist/chunk-4UH6ABAY.js.map +0 -1
- package/dist/page-range-OF5I4PQY.js +0 -8
- package/dist/provider-EU3CG724.js.map +0 -1
- package/dist/watch-QD3PDNXQ.js.map +0 -1
- /package/dist/{page-range-OF5I4PQY.js.map → page-range-737B4EZW.js.map} +0 -0
- /package/dist/{utils-BTZ4WSYX.js.map → utils-BWQ2RGUD.js.map} +0 -0
|
@@ -6,10 +6,10 @@ import {
|
|
|
6
6
|
precheckZipSize,
|
|
7
7
|
sanitizeHref,
|
|
8
8
|
toArrayBuffer
|
|
9
|
-
} from "./chunk-25TXW6EP.js";
|
|
9
|
+
} from "./chunk-PKIJLEV6.js";
|
|
10
10
|
import {
|
|
11
11
|
parsePageRange
|
|
12
|
-
} from "./chunk-3TBUDJDE.js";
|
|
12
|
+
} from "./chunk-MOL7MDBG.js";
|
|
13
13
|
|
|
14
14
|
// src/detect.ts
|
|
15
15
|
import JSZip from "jszip";
|
|
@@ -304,6 +304,9 @@ function tableToMarkdown(table) {
|
|
|
304
304
|
if (dr === 0 && dc === 0) continue;
|
|
305
305
|
if (r + dr < numRows && c + dc < numCols) {
|
|
306
306
|
skip.add(`${r + dr},${c + dc}`);
|
|
307
|
+
if (dr === 0) {
|
|
308
|
+
display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
|
|
309
|
+
}
|
|
307
310
|
}
|
|
308
311
|
}
|
|
309
312
|
}
|
|
@@ -403,7 +406,12 @@ function parseCharProperties(doc, map) {
|
|
|
403
406
|
if (!id) continue;
|
|
404
407
|
const prop = {};
|
|
405
408
|
const height = el.getAttribute("height");
|
|
406
|
-
if (height)
|
|
409
|
+
if (height) {
|
|
410
|
+
const parsedHeight = parseInt(height, 10);
|
|
411
|
+
if (!isNaN(parsedHeight) && parsedHeight > 0) {
|
|
412
|
+
prop.fontSize = parsedHeight / 100;
|
|
413
|
+
}
|
|
414
|
+
}
|
|
407
415
|
const bold = el.getAttribute("bold");
|
|
408
416
|
if (bold === "true" || bold === "1") prop.bold = true;
|
|
409
417
|
const italic = el.getAttribute("italic");
|
|
@@ -543,7 +551,7 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
543
551
|
const data = await file.async("uint8array");
|
|
544
552
|
decompressed.total += data.length;
|
|
545
553
|
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
546
|
-
const ext = ref.includes(".") ? ref.split(".").pop() : "png";
|
|
554
|
+
const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
|
|
547
555
|
const mimeType = imageExtToMime(ext);
|
|
548
556
|
imageIndex++;
|
|
549
557
|
const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
|
|
@@ -850,8 +858,10 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
850
858
|
break;
|
|
851
859
|
case "cellSpan":
|
|
852
860
|
if (tableCtx?.cell) {
|
|
853
|
-
const
|
|
854
|
-
const
|
|
861
|
+
const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
|
|
862
|
+
const cs = isNaN(rawCs) ? 1 : rawCs;
|
|
863
|
+
const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
|
|
864
|
+
const rs = isNaN(rawRs) ? 1 : rawRs;
|
|
855
865
|
tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
|
|
856
866
|
tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
|
|
857
867
|
}
|
|
@@ -943,6 +953,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
943
953
|
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
944
954
|
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
945
955
|
walkChildren(el, d + 1);
|
|
956
|
+
} else if (localTag === "run") {
|
|
957
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
|
|
946
958
|
}
|
|
947
959
|
}
|
|
948
960
|
};
|
|
@@ -2820,10 +2832,33 @@ var MIN_LINE_LENGTH = 10;
|
|
|
2820
2832
|
var COORD_MERGE_TOL = 3;
|
|
2821
2833
|
var CONNECT_TOL = 5;
|
|
2822
2834
|
var CELL_PADDING = 2;
|
|
2835
|
+
var MAX_LINE_WIDTH = 5;
|
|
2836
|
+
var IDENTITY = [1, 0, 0, 1, 0, 0];
|
|
2837
|
+
/**
 * Composes two 2D affine transforms given as six-element arrays
 * `[a, b, c, d, e, f]`.
 *
 * With the point convention used by `matTransformPoint`
 * (`x' = a*x + c*y + e`, `y' = b*x + d*y + f`), the returned matrix maps a
 * point through `m2` first and then through `m1`.
 *
 * @param {ArrayLike<number>} m1 - Outer transform.
 * @param {ArrayLike<number>} m2 - Inner transform.
 * @returns {number[]} A new six-element matrix; neither input is modified.
 */
function matMultiply(m1, m2) {
  const a1 = m1[0];
  const b1 = m1[1];
  const c1 = m1[2];
  const d1 = m1[3];
  const a2 = m2[0];
  const b2 = m2[1];
  const c2 = m2[2];
  const d2 = m2[3];
  const e2 = m2[4];
  const f2 = m2[5];

  // Linear part (first four entries).
  const a = a1 * a2 + c1 * b2;
  const b = b1 * a2 + d1 * b2;
  const c = a1 * c2 + c1 * d2;
  const d = b1 * c2 + d1 * d2;

  // Translation part: m2's translation pushed through m1, plus m1's own.
  const e = a1 * e2 + c1 * f2 + m1[4];
  const f = b1 * e2 + d1 * f2 + m1[5];

  return [a, b, c, d, e, f];
}
|
|
2847
|
+
/**
 * Applies a 2D affine transform `[a, b, c, d, e, f]` to a point.
 *
 * @param {ArrayLike<number>} m - Six-element transform matrix.
 * @param {number} x - Input x coordinate.
 * @param {number} y - Input y coordinate.
 * @returns {[number, number]} The transformed `[x, y]` pair.
 */
function matTransformPoint(m, x, y) {
  const outX = m[0] * x + m[2] * y + m[4];
  const outY = m[1] * x + m[3] * y + m[5];
  return [outX, outY];
}
|
|
2850
|
+
/**
 * Estimates a single scale factor for an affine transform
 * `[a, b, c, d, e, f]`.
 *
 * Returns the larger of the Euclidean lengths of `(b, d)` and `(a, c)`.
 *
 * @param {ArrayLike<number>} m - Six-element transform matrix.
 * @returns {number} The larger of the two lengths.
 */
function matScale(m) {
  const lengthBD = Math.sqrt(m[1] * m[1] + m[3] * m[3]);
  const lengthAC = Math.sqrt(m[0] * m[0] + m[2] * m[2]);
  return Math.max(lengthBD, lengthAC);
}
|
|
2823
2856
|
function extractLines(fnArray, argsArray) {
|
|
2824
2857
|
const horizontals = [];
|
|
2825
2858
|
const verticals = [];
|
|
2859
|
+
let ctm = [...IDENTITY];
|
|
2826
2860
|
let lineWidth = 1;
|
|
2861
|
+
const stateStack = [];
|
|
2827
2862
|
let currentPath = [];
|
|
2828
2863
|
let pathStartX = 0, pathStartY = 0;
|
|
2829
2864
|
let curX = 0, curY = 0;
|
|
@@ -2841,13 +2876,53 @@ function extractLines(fnArray, argsArray) {
|
|
|
2841
2876
|
);
|
|
2842
2877
|
}
|
|
2843
2878
|
}
|
|
2844
|
-
function
|
|
2845
|
-
if (
|
|
2879
|
+
/**
 * Tries to reinterpret a short closed path as one line or one rectangle,
 * based on the bounding box of its segments.
 *
 * The path must have 3 to 5 segments and end within 1 unit of where it
 * started. On success `path` is emptied and refilled in place with either a
 * single centred horizontal segment, a single centred vertical segment, or
 * whatever `pushRectangle` appends for the bounding box.
 *
 * @param {Array<{x1: number, y1: number, x2: number, y2: number}>} path -
 *   Segments to inspect; mutated only when the function returns `true`.
 * @returns {boolean} `true` when the path was replaced, `false` otherwise.
 */
function tryConvertLinesToRectangle(path) {
  const segmentCount = path.length;
  if (segmentCount < 3 || segmentCount > 5) return false;

  const head = path[0];
  const tail = path[segmentCount - 1];
  const isClosed = Math.abs(head.x1 - tail.x2) < 1 && Math.abs(head.y1 - tail.y2) < 1;
  if (!isClosed) return false;

  let minX = Infinity;
  let minY = Infinity;
  let maxX = -Infinity;
  let maxY = -Infinity;
  for (const { x1, y1, x2, y2 } of path) {
    minX = Math.min(minX, x1, x2);
    minY = Math.min(minY, y1, y2);
    maxX = Math.max(maxX, x1, x2);
    maxY = Math.max(maxY, y1, y2);
  }

  const w = maxX - minX;
  const h = maxY - minY;
  // Too small in both directions to be a useful line or box.
  if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;

  path.length = 0;
  const isFlat = h < ORIENTATION_TOL * 2 || (w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH);
  if (isFlat) {
    const midY = (minY + maxY) / 2;
    path.push({ x1: minX, y1: midY, x2: maxX, y2: midY });
    return true;
  }

  const isThin = w < ORIENTATION_TOL * 2 || (h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH);
  if (isThin) {
    const midX = (minX + maxX) / 2;
    path.push({ x1: midX, y1: minY, x2: midX, y2: maxY });
    return true;
  }

  pushRectangle(path, minX, minY, w, h);
  return true;
}
|
|
2903
|
+
/**
 * Finishes the path accumulated in `currentPath` and always resets it.
 *
 * - Neither stroked nor filled: the path is discarded.
 * - Fill-only with at least 3 segments: first offered to
 *   `tryConvertLinesToRectangle`.
 * - Stroke-only whose effective width (`lineWidth` times `matScale(ctm)`)
 *   exceeds `MAX_LINE_WIDTH`: discarded.
 * - Otherwise each segment is transformed by `ctm` and passed to
 *   `classifyAndAdd` with the effective line width.
 *
 * @param {boolean} isStroke - Whether the paint operation strokes the path.
 * @param {boolean} isFill - Whether the paint operation fills the path.
 * @returns {void}
 */
function flushPath(isStroke, isFill) {
  if (isStroke || isFill) {
    const fillOnly = isFill && !isStroke;
    if (fillOnly && currentPath.length >= 3) {
      tryConvertLinesToRectangle(currentPath);
    }

    const effectiveLW = lineWidth * matScale(ctm);
    const tooThickStroke = effectiveLW > MAX_LINE_WIDTH && isStroke && !isFill;
    if (!tooThickStroke) {
      for (const { x1, y1, x2, y2 } of currentPath) {
        const [startX, startY] = matTransformPoint(ctm, x1, y1);
        const [endX, endY] = matTransformPoint(ctm, x2, y2);
        const transformed = { x1: startX, y1: startY, x2: endX, y2: endY };
        classifyAndAdd(transformed, effectiveLW, horizontals, verticals);
      }
    }
  }
  currentPath = [];
}
|
|
@@ -2855,9 +2930,28 @@ function extractLines(fnArray, argsArray) {
|
|
|
2855
2930
|
const op = fnArray[i];
|
|
2856
2931
|
const args = argsArray[i];
|
|
2857
2932
|
switch (op) {
|
|
2933
|
+
// ── Graphics State ──
|
|
2934
|
+
case OPS.save:
|
|
2935
|
+
stateStack.push({ ctm: [...ctm], lineWidth });
|
|
2936
|
+
break;
|
|
2937
|
+
case OPS.restore:
|
|
2938
|
+
if (stateStack.length > 0) {
|
|
2939
|
+
const state = stateStack.pop();
|
|
2940
|
+
ctm = state.ctm;
|
|
2941
|
+
lineWidth = state.lineWidth;
|
|
2942
|
+
}
|
|
2943
|
+
break;
|
|
2944
|
+
case OPS.transform: {
|
|
2945
|
+
const m = args;
|
|
2946
|
+
if (m.length >= 6) {
|
|
2947
|
+
ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
|
|
2948
|
+
}
|
|
2949
|
+
break;
|
|
2950
|
+
}
|
|
2858
2951
|
case OPS.setLineWidth:
|
|
2859
2952
|
lineWidth = args[0] || 1;
|
|
2860
2953
|
break;
|
|
2954
|
+
// ── Path Construction ──
|
|
2861
2955
|
case OPS.constructPath: {
|
|
2862
2956
|
const arg0 = args[0];
|
|
2863
2957
|
if (Array.isArray(arg0)) {
|
|
@@ -2925,34 +3019,60 @@ function extractLines(fnArray, argsArray) {
|
|
|
2925
3019
|
}
|
|
2926
3020
|
}
|
|
2927
3021
|
}
|
|
2928
|
-
|
|
2929
|
-
|
|
2930
|
-
|
|
2931
|
-
|
|
3022
|
+
const isStroke5 = afterOp === OPS.stroke || afterOp === OPS.closeStroke;
|
|
3023
|
+
const isFill5 = afterOp === OPS.fill || afterOp === OPS.eoFill;
|
|
3024
|
+
const isBoth5 = afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke;
|
|
3025
|
+
if (isStroke5 || isFill5 || isBoth5) {
|
|
3026
|
+
flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
|
|
2932
3027
|
} else if (afterOp === OPS.endPath) {
|
|
2933
|
-
flushPath(false);
|
|
3028
|
+
flushPath(false, false);
|
|
2934
3029
|
}
|
|
2935
3030
|
}
|
|
2936
3031
|
break;
|
|
2937
3032
|
}
|
|
3033
|
+
// ── Paint Operations ──
|
|
2938
3034
|
case OPS.stroke:
|
|
2939
3035
|
case OPS.closeStroke:
|
|
2940
|
-
flushPath(true);
|
|
3036
|
+
flushPath(true, false);
|
|
2941
3037
|
break;
|
|
2942
3038
|
case OPS.fill:
|
|
2943
3039
|
case OPS.eoFill:
|
|
3040
|
+
flushPath(false, true);
|
|
3041
|
+
break;
|
|
2944
3042
|
case OPS.fillStroke:
|
|
2945
3043
|
case OPS.eoFillStroke:
|
|
2946
3044
|
case OPS.closeFillStroke:
|
|
2947
3045
|
case OPS.closeEOFillStroke:
|
|
2948
|
-
flushPath(true);
|
|
3046
|
+
flushPath(true, true);
|
|
2949
3047
|
break;
|
|
2950
3048
|
case OPS.endPath:
|
|
2951
|
-
flushPath(false);
|
|
3049
|
+
flushPath(false, false);
|
|
3050
|
+
break;
|
|
3051
|
+
}
|
|
3052
|
+
}
|
|
3053
|
+
return {
|
|
3054
|
+
horizontals: deduplicateLines(horizontals),
|
|
3055
|
+
verticals: deduplicateLines(verticals)
|
|
3056
|
+
};
|
|
3057
|
+
}
|
|
3058
|
+
/**
 * Removes near-duplicate line segments.
 *
 * Two lines are duplicates when all four of their coordinates (`x1`, `y1`,
 * `x2`, `y2`) differ by at most `tol`. The first occurrence is kept; if a
 * later duplicate has a larger `lineWidth`, the kept line's `lineWidth` is
 * raised to match (the kept object is mutated in place).
 *
 * @param {Array<{x1: number, y1: number, x2: number, y2: number, lineWidth: number}>} lines -
 *   Candidate lines, in priority order.
 * @param {number} [tol=COORD_MERGE_TOL] - Maximum per-coordinate difference
 *   for two lines to count as the same line.
 * @returns {Array} The input array itself when it has 0 or 1 entries,
 *   otherwise a new array containing the kept line objects.
 */
function deduplicateLines(lines, tol = COORD_MERGE_TOL) {
  if (lines.length <= 1) return lines;

  const result = [];
  for (const line of lines) {
    const existing = result.find(
      (kept) =>
        Math.abs(line.y1 - kept.y1) <= tol &&
        Math.abs(line.y2 - kept.y2) <= tol &&
        Math.abs(line.x1 - kept.x1) <= tol &&
        Math.abs(line.x2 - kept.x2) <= tol
    );
    if (existing === undefined) {
      result.push(line);
      continue;
    }
    // Keep the widest stroke seen for this line.
    if (line.lineWidth > existing.lineWidth) {
      existing.lineWidth = line.lineWidth;
    }
  }
  return result;
}
|
|
2957
3077
|
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
2958
3078
|
const dx = Math.abs(seg.x2 - seg.x1);
|
|
@@ -3528,7 +3648,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3528
3648
|
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
3529
3649
|
if (options?.ocr) {
|
|
3530
3650
|
try {
|
|
3531
|
-
const { ocrPages } = await import("./provider-EU3CG724.js");
|
|
3651
|
+
const { ocrPages } = await import("./provider-7H4CPZYS.js");
|
|
3532
3652
|
const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
|
|
3533
3653
|
if (ocrBlocks.length > 0) {
|
|
3534
3654
|
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
@@ -3548,6 +3668,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3548
3668
|
const medianFontSize = computeMedianFontSize(allFontSizes);
|
|
3549
3669
|
if (medianFontSize > 0) {
|
|
3550
3670
|
detectHeadings(blocks, medianFontSize);
|
|
3671
|
+
mergeAdjacentHeadings(blocks);
|
|
3551
3672
|
}
|
|
3552
3673
|
detectMarkerHeadings(blocks);
|
|
3553
3674
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
@@ -3633,6 +3754,46 @@ function detectHeadings(blocks, medianFontSize) {
|
|
|
3633
3754
|
}
|
|
3634
3755
|
}
|
|
3635
3756
|
}
|
|
3757
|
+
/**
 * Merges consecutive heading blocks that sit on (roughly) the same line.
 *
 * Two neighbouring blocks are merged when both are headings with text and a
 * bbox, share the same `bbox.page` and `level`, and their reference
 * positions (`bbox.y` plus font size, or plus bbox height when no font size
 * is set) differ by less than 1.5x the larger font size (12 is assumed when
 * a font size is missing).
 *
 * The merged text is ordered left-to-right by `bbox.x`. The merged bbox is
 * the union of both boxes on both axes, so it always encloses the two
 * original headings even when their `y` values differ slightly.
 *
 * @param {Array<object>} blocks - Block list; mutated in place (the first
 *   block of each merged pair is updated, the second is removed).
 * @returns {void}
 */
function mergeAdjacentHeadings(blocks) {
  let i = 0;
  while (i < blocks.length - 1) {
    const curr = blocks[i];
    const next = blocks[i + 1];

    const bothHeadings = curr.type === "heading" && next.type === "heading";
    if (!bothHeadings || !curr.bbox || !next.bbox || !curr.text || !next.text) {
      i++;
      continue;
    }

    const currBaseline = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
    const nextBaseline = next.bbox.y + (next.style?.fontSize || next.bbox.height);
    const yDiff = Math.abs(currBaseline - nextBaseline);
    const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
    const sameY = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
    const sameLevel = curr.level === next.level;
    if (!sameY || !sameLevel) {
      i++;
      continue;
    }

    curr.text = curr.bbox.x <= next.bbox.x
      ? `${curr.text} ${next.text}`
      : `${next.text} ${curr.text}`;

    // Union of both boxes on each axis.
    const xStart = Math.min(curr.bbox.x, next.bbox.x);
    const xEnd = Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width);
    const yStart = Math.min(curr.bbox.y, next.bbox.y);
    const yEnd = Math.max(curr.bbox.y + curr.bbox.height, next.bbox.y + next.bbox.height);
    curr.bbox = {
      page: curr.bbox.page,
      x: xStart,
      y: yStart,
      width: xEnd - xStart,
      height: yEnd - yStart
    };

    // Do not advance: the merged block may also merge with the new neighbour.
    blocks.splice(i + 1, 1);
  }
}
|
|
3636
3797
|
function collapseEvenSpacing(text) {
|
|
3637
3798
|
const tokens = text.split(" ");
|
|
3638
3799
|
const singleCharCount = tokens.filter((t) => t.length === 1).length;
|
|
@@ -3641,6 +3802,169 @@ function collapseEvenSpacing(text) {
|
|
|
3641
3802
|
}
|
|
3642
3803
|
return text;
|
|
3643
3804
|
}
|
|
3805
|
+
/**
 * Builds paragraph blocks for a page by ordering its text items with
 * `xyCutOrder` and then emitting one paragraph per line found by `groupByY`.
 *
 * The gap threshold passed to `xyCutOrder` is 3% of the vertical extent
 * covered by the items' `y` values, but never less than 15.
 *
 * @param {Array<object>} items - Text items; each is read for its `y` here
 *   and then handed to the grouping/merging helpers.
 * @param {number} pageNum - Page number stored on every emitted block.
 * @returns {Array<object>|null} Paragraph blocks, or `null` when no
 *   non-blank line was produced.
 */
function buildXyCutBlocks(items, pageNum) {
  // Single pass instead of Math.max(...ys): spreading a very large array
  // into a call can exceed the engine's argument limit and throw RangeError.
  let minY = Infinity;
  let maxY = -Infinity;
  for (const item of items) {
    minY = Math.min(minY, item.y);
    maxY = Math.max(maxY, item.y);
  }
  const pageHeight = maxY - minY;
  const gapThreshold = Math.max(15, pageHeight * 0.03);

  const orderedGroups = xyCutOrder(items, gapThreshold);
  const blocks = [];
  for (const group of orderedGroups) {
    if (group.length === 0) continue;
    for (const line of groupByY(group)) {
      const text = mergeLineSimple(line);
      if (!text.trim()) continue;
      blocks.push({
        type: "paragraph",
        text,
        pageNumber: pageNum,
        bbox: computeBBox(line, pageNum),
        style: dominantStyle(line)
      });
    }
  }
  return blocks.length > 0 ? blocks : null;
}
|
|
3828
|
+
/**
 * Re-analyses a detected table that looks under-segmented, i.e. too much
 * text packed into too few cells.
 *
 * A table counts as under-segmented when it is 1x1, or when it has at most
 * two non-blank cells and either at least 8 text lines in those cells or at
 * least 6 source items. For such tables the function tries, in order:
 *   1. `buildXyCutBlocks` when `hasMultiColumnLayout(items)` is true;
 *   2. `buildTableFromTextLayout`;
 *   3. `detectClusterTables`, emitting leftover items as paragraphs and
 *      sorting all blocks by descending `bbox.y + bbox.height`.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table -
 *   The table as originally detected.
 * @param {Array<object>} items - Text items that fell inside the table.
 * @param {number} pageNum - Page number stored on emitted blocks.
 * @param {object} bbox - Bounding box forwarded to `buildTableFromTextLayout`.
 * @returns {Array<object>|null} Replacement blocks, or `null` when the table
 *   is not under-segmented or no alternative could be built.
 */
function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
  let filledCellCount = 0;
  let textLineCount = 0;
  for (const row of table.cells) {
    for (const cell of row) {
      if (!cell.text.trim()) continue;
      filledCellCount += 1;
      textLineCount += cell.text.split("\n").length;
    }
  }

  const isSingleCell = table.rows === 1 && table.cols === 1;
  const isSparse = filledCellCount <= 2;
  const isUnderSegmented =
    isSingleCell || (isSparse && textLineCount >= 8) || (isSparse && items.length >= 6);
  if (!isUnderSegmented) return null;

  if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);

  const directTable = buildTableFromTextLayout(items, pageNum, bbox);
  if (directTable) return directTable;

  const clusterItems = items.map(({ text, x, y, w, h, fontSize, fontName }) => ({
    text,
    x,
    y,
    w,
    h,
    fontSize,
    fontName
  }));
  const clusterResults = detectClusterTables(clusterItems, pageNum);
  if (!(clusterResults.length > 0)) return null;

  // Map each cluster item back to its position in `items`.
  const indexByClusterItem = new Map();
  clusterItems.forEach((clusterItem, idx) => indexByClusterItem.set(clusterItem, idx));

  const consumed = new Set();
  const blocks = [];
  for (const result of clusterResults) {
    for (const used of result.usedItems) {
      const idx = indexByClusterItem.get(used);
      if (idx !== undefined) consumed.add(idx);
    }
    blocks.push({ type: "table", table: result.table, pageNumber: pageNum, bbox: result.bbox });
  }

  // Anything no cluster table claimed becomes its own paragraph.
  items.forEach((item, idx) => {
    if (consumed.has(idx)) return;
    if (!item.text.trim()) return;
    blocks.push({
      type: "paragraph",
      text: item.text,
      pageNumber: pageNum,
      bbox: computeBBox([item], pageNum),
      style: { fontSize: item.fontSize, fontName: item.fontName }
    });
  });

  const upperEdge = (block) => (block.bbox ? block.bbox.y + block.bbox.height : 0);
  blocks.sort((a, b) => upperEdge(b) - upperEdge(a));
  return blocks.length > 0 ? blocks : null;
}
|
|
3878
|
+
/**
 * Infers a table purely from the positions of text items.
 *
 * Steps:
 *   1. Group items into rows: sort by descending `y`, then start a new row
 *      whenever an item's `y` is more than 3 away from the row's first item.
 *   2. In every row with 2+ items, record the midpoint of each horizontal
 *      gap that is at least 1.5x the row's average font size.
 *   3. Cluster those midpoints (within 15 of the running cluster mean); a
 *      cluster seen at least twice becomes a column boundary.
 *   4. Assign each item to a column by its horizontal centre. A row whose
 *      first column is blank is folded into the previous row as a
 *      continuation.
 *   5. Reject the result when fewer than 2 rows remain or fewer than 30% of
 *      the cells have text.
 *
 * @param {Array<{text: string, x: number, y: number, w: number, fontSize: number}>} items -
 *   Text items to lay out.
 * @param {number} pageNum - Page number stored on the emitted block.
 * @param {object} bbox - Bounding box stored on the emitted block as-is.
 * @returns {Array<object>|null} A one-element array holding the table block,
 *   or `null` when no convincing table structure was found.
 */
function buildTableFromTextLayout(items, pageNum, bbox) {
  if (items.length < 4) return null;

  // 1. Rows.
  const ordered = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
  const rowTolerance = 3;
  const rows = [];
  let activeRow = [];
  let anchorY = ordered[0].y;
  for (const item of ordered) {
    const fitsActiveRow = Math.abs(item.y - anchorY) <= rowTolerance;
    if (activeRow.length > 0 && !fitsActiveRow) {
      rows.push(activeRow);
      activeRow = [];
      anchorY = item.y;
    }
    activeRow.push(item);
  }
  rows.push(activeRow);
  if (rows.length < 2) return null;

  // 2. Candidate column gaps.
  const gapPositions = [];
  for (const row of rows) {
    if (row.length < 2) continue;
    const leftToRight = [...row].sort((a, b) => a.x - b.x);
    let fontSizeSum = 0;
    for (const item of leftToRight) fontSizeSum += item.fontSize;
    const minGap = (fontSizeSum / leftToRight.length) * 1.5;
    for (let j = 1; j < leftToRight.length; j++) {
      const prevRight = leftToRight[j - 1].x + leftToRight[j - 1].w;
      const gap = leftToRight[j].x - prevRight;
      if (gap >= minGap) gapPositions.push(prevRight + gap / 2);
    }
  }
  if (gapPositions.length < 2) return null;

  // 3. Cluster gap midpoints into column boundaries.
  gapPositions.sort((a, b) => a - b);
  const colBoundaries = [];
  let clusterSum = gapPositions[0];
  let clusterCount = 1;
  const commitCluster = () => {
    if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
  };
  for (const position of gapPositions.slice(1)) {
    const clusterMean = clusterSum / clusterCount;
    if (Math.abs(position - clusterMean) <= 15) {
      clusterSum += position;
      clusterCount++;
      continue;
    }
    commitCluster();
    clusterSum = position;
    clusterCount = 1;
  }
  commitCluster();
  if (colBoundaries.length === 0) return null;

  // 4. Fill the grid.
  const numCols = colBoundaries.length + 1;
  const columnOf = (centerX) => {
    let col = 0;
    colBoundaries.forEach((boundary, b) => {
      if (centerX > boundary) col = b + 1;
    });
    return col;
  };
  const grid = [];
  for (const row of rows) {
    const cells = Array.from({ length: numCols }, () => "");
    for (const item of [...row].sort((a, b) => a.x - b.x)) {
      const col = columnOf(item.x + item.w / 2);
      cells[col] = cells[col] ? `${cells[col]} ${item.text}` : item.text;
    }
    const isContinuation = cells[0].trim() === "" && grid.length > 0;
    if (!isContinuation) {
      grid.push(cells);
      continue;
    }
    const prevCells = grid[grid.length - 1];
    cells.forEach((cellText, c) => {
      const addition = cellText.trim();
      if (!addition) return;
      prevCells[c] = prevCells[c] ? `${prevCells[c]} ${addition}` : addition;
    });
  }

  // 5. Sanity checks.
  if (grid.length < 2) return null;
  let nonEmptyCount = 0;
  for (const cells of grid) {
    for (const cellText of cells) {
      if (cellText.trim()) nonEmptyCount++;
    }
  }
  if (nonEmptyCount < grid.length * numCols * 0.3) return null;

  const irCells = grid.map((cells) =>
    cells.map((rawText, colIdx) => {
      const trimmed = rawText.trim();
      // Strip a leading bullet marker from every column except the first.
      const text = colIdx > 0 ? trimmed.replace(/^[•○·\-]\s*/, "") : trimmed;
      return { text, colSpan: 1, rowSpan: 1 };
    })
  );
  const irTable = {
    rows: grid.length,
    cols: numCols,
    cells: irCells,
    hasHeader: grid.length > 1
  };
  return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
}
|
|
3644
3968
|
function shouldDemoteTable(table) {
|
|
3645
3969
|
const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
|
|
3646
3970
|
const allText = allCells.join(" ");
|
|
@@ -3687,6 +4011,32 @@ function detectMarkerHeadings(blocks) {
|
|
|
3687
4011
|
}
|
|
3688
4012
|
}
|
|
3689
4013
|
}
|
|
4014
|
+
/**
 * Heuristically decides whether a set of text items is laid out in two
 * side-by-side columns.
 *
 * Returns `true` only when all of the following hold:
 *   - there are at least 30 items;
 *   - the horizontal extent they cover is at least 200;
 *   - the widest gap between x-sorted neighbours is at least 20;
 *   - the middle of that gap lies between 35% and 65% of the extent;
 *   - each side of the split holds at least 15 items (by item centre);
 *   - the smaller side has at least 35% as many items as the larger one.
 *
 * @param {Array<{x: number, w: number}>} items - Text items on the page.
 * @returns {boolean} Whether a two-column layout was detected.
 */
function hasMultiColumnLayout(items) {
  if (items.length < 30) return false;

  const byLeftEdge = [...items].sort((a, b) => a.x - b.x);
  const minX = byLeftEdge[0].x;
  const maxX = byLeftEdge.reduce((edge, item) => {
    const right = item.x + item.w;
    return right > edge ? right : edge;
  }, minX);
  const pageWidth = maxX - minX;
  if (pageWidth < 200) return false;

  // Widest gap between x-sorted neighbours, and the midpoint of that gap.
  let bestGap = 0;
  let bestSplit = 0;
  for (let j = 1; j < byLeftEdge.length; j++) {
    const prevRight = byLeftEdge[j - 1].x + byLeftEdge[j - 1].w;
    const nextLeft = byLeftEdge[j].x;
    const gap = nextLeft - prevRight;
    if (gap > bestGap) {
      bestGap = gap;
      bestSplit = (prevRight + nextLeft) / 2;
    }
  }
  if (bestGap < 20) return false;

  const splitRatio = (bestSplit - minX) / pageWidth;
  if (splitRatio < 0.35 || splitRatio > 0.65) return false;

  let leftCount = 0;
  let rightCount = 0;
  for (const item of items) {
    const center = item.x + item.w / 2;
    if (center < bestSplit) leftCount++;
    if (center >= bestSplit) rightCount++;
  }
  if (leftCount < 15 || rightCount < 15) return false;

  const balance = Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount);
  return !(balance < 0.35);
}
|
|
3690
4040
|
var MAX_XYCUT_DEPTH = 50;
|
|
3691
4041
|
function xyCutOrder(items, gapThreshold, depth = 0) {
|
|
3692
4042
|
if (items.length === 0) return [];
|
|
@@ -3817,6 +4167,11 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
3817
4167
|
width: grid.bbox.x2 - grid.bbox.x1,
|
|
3818
4168
|
height: grid.bbox.y2 - grid.bbox.y1
|
|
3819
4169
|
};
|
|
4170
|
+
const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
|
|
4171
|
+
if (normalized) {
|
|
4172
|
+
blocks.push(...normalized);
|
|
4173
|
+
continue;
|
|
4174
|
+
}
|
|
3820
4175
|
if (shouldDemoteTable(irTable)) {
|
|
3821
4176
|
const demoted = demoteTableToText(irTable);
|
|
3822
4177
|
if (demoted) {
|
|
@@ -3862,6 +4217,10 @@ function mergeAdjacentTableBlocks(blocks) {
|
|
|
3862
4217
|
}
|
|
3863
4218
|
function extractPageBlocksFallback(items, pageNum) {
|
|
3864
4219
|
if (items.length === 0) return [];
|
|
4220
|
+
if (hasMultiColumnLayout(items)) {
|
|
4221
|
+
const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
|
|
4222
|
+
return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
|
|
4223
|
+
}
|
|
3865
4224
|
const blocks = [];
|
|
3866
4225
|
const allYLines = groupByY(items);
|
|
3867
4226
|
const columns = detectColumns(allYLines);
|
|
@@ -3879,7 +4238,7 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
3879
4238
|
fontSize: i.fontSize,
|
|
3880
4239
|
fontName: i.fontName
|
|
3881
4240
|
}));
|
|
3882
|
-
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4241
|
+
const clusterResults = hasMultiColumnLayout(items) ? [] : detectClusterTables(clusterItems, pageNum);
|
|
3883
4242
|
if (clusterResults.length > 0) {
|
|
3884
4243
|
const ciToIdx = /* @__PURE__ */ new Map();
|
|
3885
4244
|
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
@@ -4626,7 +4985,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
4626
4985
|
}
|
|
4627
4986
|
let pageFilter = null;
|
|
4628
4987
|
if (options?.pages) {
|
|
4629
|
-
const { parsePageRange: parsePageRange2 } = await import("./page-range-OF5I4PQY.js");
|
|
4988
|
+
const { parsePageRange: parsePageRange2 } = await import("./page-range-737B4EZW.js");
|
|
4630
4989
|
pageFilter = parsePageRange2(options.pages, sheets.length);
|
|
4631
4990
|
}
|
|
4632
4991
|
const blocks = [];
|
|
@@ -5509,4 +5868,4 @@ export {
|
|
|
5509
5868
|
extractFormFields,
|
|
5510
5869
|
parse
|
|
5511
5870
|
};
|
|
5512
|
-
//# sourceMappingURL=chunk-4UH6ABAY.js.map
|
|
5871
|
+
//# sourceMappingURL=chunk-GJ2S6IMC.js.map
|