kordoc 2.0.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +302 -291
- package/dist/{chunk-4UH6ABAY.js → chunk-GJ2S6IMC.js} +381 -22
- package/dist/chunk-GJ2S6IMC.js.map +1 -0
- package/dist/{chunk-3TBUDJDE.js → chunk-MOL7MDBG.js} +1 -1
- package/dist/chunk-MOL7MDBG.js.map +1 -0
- package/dist/{chunk-25TXW6EP.js → chunk-PKIJLEV6.js} +2 -2
- package/dist/chunk-PKIJLEV6.js.map +1 -0
- package/dist/cli.js +5 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +378 -18
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +378 -18
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-737B4EZW.js +8 -0
- package/dist/{provider-EU3CG724.js → provider-7H4CPZYS.js} +2 -1
- package/dist/provider-7H4CPZYS.js.map +1 -0
- package/dist/{utils-BTZ4WSYX.js → utils-BWQ2RGUD.js} +2 -2
- package/dist/{watch-QD3PDNXQ.js → watch-X7IC7MLF.js} +10 -6
- package/dist/watch-X7IC7MLF.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-25TXW6EP.js.map +0 -1
- package/dist/chunk-3TBUDJDE.js.map +0 -1
- package/dist/chunk-4UH6ABAY.js.map +0 -1
- package/dist/page-range-OF5I4PQY.js +0 -8
- package/dist/provider-EU3CG724.js.map +0 -1
- package/dist/watch-QD3PDNXQ.js.map +0 -1
- /package/dist/{page-range-OF5I4PQY.js.map → page-range-737B4EZW.js.map} +0 -0
- /package/dist/{utils-BTZ4WSYX.js.map → utils-BWQ2RGUD.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -63,6 +63,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
|
|
|
63
63
|
blocks.push({ type: "paragraph", text: text.trim(), pageNumber: i });
|
|
64
64
|
}
|
|
65
65
|
} catch {
|
|
66
|
+
blocks.push({ type: "paragraph", text: `[OCR \uC2E4\uD328: \uD398\uC774\uC9C0 ${i}]` });
|
|
66
67
|
}
|
|
67
68
|
}
|
|
68
69
|
return blocks;
|
|
@@ -138,7 +139,7 @@ import { inflateRawSync } from "zlib";
|
|
|
138
139
|
import { DOMParser } from "@xmldom/xmldom";
|
|
139
140
|
|
|
140
141
|
// src/utils.ts
|
|
141
|
-
var VERSION = true ? "2.0
|
|
142
|
+
var VERSION = true ? "2.1.0" : "0.0.0-dev";
|
|
142
143
|
function toArrayBuffer(buf) {
|
|
143
144
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
144
145
|
return buf.buffer;
|
|
@@ -468,6 +469,9 @@ function tableToMarkdown(table) {
|
|
|
468
469
|
if (dr === 0 && dc === 0) continue;
|
|
469
470
|
if (r + dr < numRows && c + dc < numCols) {
|
|
470
471
|
skip.add(`${r + dr},${c + dc}`);
|
|
472
|
+
if (dr === 0) {
|
|
473
|
+
display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
|
|
474
|
+
}
|
|
471
475
|
}
|
|
472
476
|
}
|
|
473
477
|
}
|
|
@@ -563,7 +567,12 @@ function parseCharProperties(doc, map) {
|
|
|
563
567
|
if (!id) continue;
|
|
564
568
|
const prop = {};
|
|
565
569
|
const height = el.getAttribute("height");
|
|
566
|
-
if (height)
|
|
570
|
+
if (height) {
|
|
571
|
+
const parsedHeight = parseInt(height, 10);
|
|
572
|
+
if (!isNaN(parsedHeight) && parsedHeight > 0) {
|
|
573
|
+
prop.fontSize = parsedHeight / 100;
|
|
574
|
+
}
|
|
575
|
+
}
|
|
567
576
|
const bold = el.getAttribute("bold");
|
|
568
577
|
if (bold === "true" || bold === "1") prop.bold = true;
|
|
569
578
|
const italic = el.getAttribute("italic");
|
|
@@ -703,7 +712,7 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
703
712
|
const data = await file.async("uint8array");
|
|
704
713
|
decompressed.total += data.length;
|
|
705
714
|
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
706
|
-
const ext = ref.includes(".") ? ref.split(".").pop() : "png";
|
|
715
|
+
const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
|
|
707
716
|
const mimeType = imageExtToMime(ext);
|
|
708
717
|
imageIndex++;
|
|
709
718
|
const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
|
|
@@ -997,8 +1006,10 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
997
1006
|
break;
|
|
998
1007
|
case "cellSpan":
|
|
999
1008
|
if (tableCtx?.cell) {
|
|
1000
|
-
const
|
|
1001
|
-
const
|
|
1009
|
+
const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
|
|
1010
|
+
const cs = isNaN(rawCs) ? 1 : rawCs;
|
|
1011
|
+
const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
|
|
1012
|
+
const rs = isNaN(rawRs) ? 1 : rawRs;
|
|
1002
1013
|
tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
|
|
1003
1014
|
tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
|
|
1004
1015
|
}
|
|
@@ -1090,6 +1101,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1090
1101
|
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
1091
1102
|
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
1092
1103
|
walkChildren(el, d + 1);
|
|
1104
|
+
} else if (localTag === "run") {
|
|
1105
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
|
|
1093
1106
|
}
|
|
1094
1107
|
}
|
|
1095
1108
|
};
|
|
@@ -2957,10 +2970,33 @@ var MIN_LINE_LENGTH = 10;
|
|
|
2957
2970
|
var COORD_MERGE_TOL = 3;
|
|
2958
2971
|
var CONNECT_TOL = 5;
|
|
2959
2972
|
var CELL_PADDING = 2;
|
|
2973
|
+
var MAX_LINE_WIDTH = 5;
|
|
2974
|
+
var IDENTITY = [1, 0, 0, 1, 0, 0];
|
|
2975
|
+
/**
 * Compose two 2D affine transforms stored as 6-element arrays
 * `[a, b, c, d, e, f]` (the layout consumed by `matTransformPoint`).
 *
 * The result is the transform that applies `m2` first and `m1` second, i.e.
 * transforming a point with the result equals transforming it with `m2` and
 * then transforming that with `m1`.
 *
 * @param {number[]} m1 - Outer transform.
 * @param {number[]} m2 - Inner transform.
 * @returns {number[]} A new 6-element array; neither input is modified.
 */
function matMultiply(m1, m2) {
  const a1 = m1[0];
  const b1 = m1[1];
  const c1 = m1[2];
  const d1 = m1[3];
  const a2 = m2[0];
  const b2 = m2[1];
  const c2 = m2[2];
  const d2 = m2[3];
  const e2 = m2[4];
  const f2 = m2[5];

  // Linear part: 2x2 product.
  const a = a1 * a2 + c1 * b2;
  const b = b1 * a2 + d1 * b2;
  const c = a1 * c2 + c1 * d2;
  const d = b1 * c2 + d1 * d2;

  // Translation part: m2's offset pushed through m1, plus m1's own offset.
  const e = a1 * e2 + c1 * f2 + m1[4];
  const f = b1 * e2 + d1 * f2 + m1[5];

  return [a, b, c, d, e, f];
}
|
|
2985
|
+
/**
 * Apply a 2D affine transform `[a, b, c, d, e, f]` to the point `(x, y)`.
 *
 * @param {number[]} m - 6-element transform.
 * @param {number} x - Input x coordinate.
 * @param {number} y - Input y coordinate.
 * @returns {[number, number]} The transformed `[x, y]` pair.
 */
function matTransformPoint(m, x, y) {
  const outX = m[0] * x + m[2] * y + m[4];
  const outY = m[1] * x + m[3] * y + m[5];
  return [outX, outY];
}
|
|
2988
|
+
/**
 * Estimate the scale factor of a 2D affine transform `[a, b, c, d, e, f]`.
 *
 * Returns the larger of the Euclidean norms of `(b, d)` and `(a, c)`; the
 * translation components `e` and `f` are ignored.
 *
 * `Math.hypot` is used instead of `Math.sqrt(x * x + y * y)` so that very
 * large or very small components do not overflow/underflow while squaring.
 *
 * @param {number[]} m - 6-element transform.
 * @returns {number} Non-negative scale estimate.
 */
function matScale(m) {
  return Math.max(Math.hypot(m[1], m[3]), Math.hypot(m[0], m[2]));
}
|
|
2960
2994
|
function extractLines(fnArray, argsArray) {
|
|
2961
2995
|
const horizontals = [];
|
|
2962
2996
|
const verticals = [];
|
|
2997
|
+
let ctm = [...IDENTITY];
|
|
2963
2998
|
let lineWidth = 1;
|
|
2999
|
+
const stateStack = [];
|
|
2964
3000
|
let currentPath = [];
|
|
2965
3001
|
let pathStartX = 0, pathStartY = 0;
|
|
2966
3002
|
let curX = 0, curY = 0;
|
|
@@ -2978,13 +3014,53 @@ function extractLines(fnArray, argsArray) {
|
|
|
2978
3014
|
);
|
|
2979
3015
|
}
|
|
2980
3016
|
}
|
|
2981
|
-
function
|
|
2982
|
-
if (
|
|
3017
|
+
/**
 * Try to reinterpret a short closed path as a single line or a rectangle.
 *
 * Accepts only paths of 3 to 5 segments whose last end point lies within one
 * unit of the first start point. On success `path` is rewritten in place:
 * a flat bounding box becomes one horizontal or vertical centre line,
 * anything else is handed to `pushRectangle` with the bounding box.
 *
 * @param {Array<{x1: number, y1: number, x2: number, y2: number}>} path -
 *   Segments of the path; mutated when the function returns `true`.
 * @returns {boolean} `true` when `path` was replaced, `false` when it was
 *   left untouched.
 */
function tryConvertLinesToRectangle(path) {
  const count = path.length;
  if (count < 3 || count > 5) return false;

  const head = path[0];
  const tail = path[count - 1];
  const isClosed = Math.abs(head.x1 - tail.x2) < 1 && Math.abs(head.y1 - tail.y2) < 1;
  if (!isClosed) return false;

  // Bounding box over every segment end point.
  const xs = path.flatMap((seg) => [seg.x1, seg.x2]);
  const ys = path.flatMap((seg) => [seg.y1, seg.y2]);
  const minX = Math.min(Infinity, ...xs);
  const maxX = Math.max(-Infinity, ...xs);
  const minY = Math.min(Infinity, ...ys);
  const maxY = Math.max(-Infinity, ...ys);
  const w = maxX - minX;
  const h = maxY - minY;

  // Too small in both directions to be worth keeping as a line.
  if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;

  path.length = 0;

  const looksHorizontal = h < ORIENTATION_TOL * 2 || (w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH);
  if (looksHorizontal) {
    const midY = (minY + maxY) / 2;
    path.push({ x1: minX, y1: midY, x2: maxX, y2: midY });
    return true;
  }

  const looksVertical = w < ORIENTATION_TOL * 2 || (h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH);
  if (looksVertical) {
    const midX = (minX + maxX) / 2;
    path.push({ x1: midX, y1: minY, x2: midX, y2: maxY });
    return true;
  }

  pushRectangle(path, minX, minY, w, h);
  return true;
}
|
|
3041
|
+
/**
 * Finish the path accumulated in `currentPath` and reset it.
 *
 * - Neither stroked nor filled: the path is discarded.
 * - Filled only, with at least 3 segments: first offered to
 *   `tryConvertLinesToRectangle`.
 * - Stroked only, with an effective line width (line width times the CTM
 *   scale) above `MAX_LINE_WIDTH`: the path is discarded.
 * - Otherwise every segment is mapped through the current transform and
 *   passed to `classifyAndAdd` together with the effective line width.
 *
 * @param {boolean} isStroke - Whether the paint operator strokes the path.
 * @param {boolean} isFill - Whether the paint operator fills the path.
 * @returns {void}
 */
function flushPath(isStroke, isFill) {
  const isPainted = isStroke || isFill;
  if (isPainted) {
    // A painted path that is not stroked is necessarily fill-only.
    if (!isStroke && currentPath.length >= 3) {
      tryConvertLinesToRectangle(currentPath);
    }

    const effectiveLW = lineWidth * matScale(ctm);
    const isThickStrokeOnly = isStroke && !isFill && effectiveLW > MAX_LINE_WIDTH;

    if (!isThickStrokeOnly) {
      for (const seg of currentPath) {
        const start = matTransformPoint(ctm, seg.x1, seg.y1);
        const end = matTransformPoint(ctm, seg.x2, seg.y2);
        classifyAndAdd(
          { x1: start[0], y1: start[1], x2: end[0], y2: end[1] },
          effectiveLW,
          horizontals,
          verticals
        );
      }
    }
  }
  currentPath = [];
}
|
|
@@ -2992,9 +3068,28 @@ function extractLines(fnArray, argsArray) {
|
|
|
2992
3068
|
const op = fnArray[i];
|
|
2993
3069
|
const args = argsArray[i];
|
|
2994
3070
|
switch (op) {
|
|
3071
|
+
// ── Graphics State ──
|
|
3072
|
+
case OPS.save:
|
|
3073
|
+
stateStack.push({ ctm: [...ctm], lineWidth });
|
|
3074
|
+
break;
|
|
3075
|
+
case OPS.restore:
|
|
3076
|
+
if (stateStack.length > 0) {
|
|
3077
|
+
const state = stateStack.pop();
|
|
3078
|
+
ctm = state.ctm;
|
|
3079
|
+
lineWidth = state.lineWidth;
|
|
3080
|
+
}
|
|
3081
|
+
break;
|
|
3082
|
+
case OPS.transform: {
|
|
3083
|
+
const m = args;
|
|
3084
|
+
if (m.length >= 6) {
|
|
3085
|
+
ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
|
|
3086
|
+
}
|
|
3087
|
+
break;
|
|
3088
|
+
}
|
|
2995
3089
|
case OPS.setLineWidth:
|
|
2996
3090
|
lineWidth = args[0] || 1;
|
|
2997
3091
|
break;
|
|
3092
|
+
// ── Path Construction ──
|
|
2998
3093
|
case OPS.constructPath: {
|
|
2999
3094
|
const arg0 = args[0];
|
|
3000
3095
|
if (Array.isArray(arg0)) {
|
|
@@ -3062,34 +3157,60 @@ function extractLines(fnArray, argsArray) {
|
|
|
3062
3157
|
}
|
|
3063
3158
|
}
|
|
3064
3159
|
}
|
|
3065
|
-
|
|
3066
|
-
|
|
3067
|
-
|
|
3068
|
-
|
|
3160
|
+
const isStroke5 = afterOp === OPS.stroke || afterOp === OPS.closeStroke;
|
|
3161
|
+
const isFill5 = afterOp === OPS.fill || afterOp === OPS.eoFill;
|
|
3162
|
+
const isBoth5 = afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke;
|
|
3163
|
+
if (isStroke5 || isFill5 || isBoth5) {
|
|
3164
|
+
flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
|
|
3069
3165
|
} else if (afterOp === OPS.endPath) {
|
|
3070
|
-
flushPath(false);
|
|
3166
|
+
flushPath(false, false);
|
|
3071
3167
|
}
|
|
3072
3168
|
}
|
|
3073
3169
|
break;
|
|
3074
3170
|
}
|
|
3171
|
+
// ── Paint Operations ──
|
|
3075
3172
|
case OPS.stroke:
|
|
3076
3173
|
case OPS.closeStroke:
|
|
3077
|
-
flushPath(true);
|
|
3174
|
+
flushPath(true, false);
|
|
3078
3175
|
break;
|
|
3079
3176
|
case OPS.fill:
|
|
3080
3177
|
case OPS.eoFill:
|
|
3178
|
+
flushPath(false, true);
|
|
3179
|
+
break;
|
|
3081
3180
|
case OPS.fillStroke:
|
|
3082
3181
|
case OPS.eoFillStroke:
|
|
3083
3182
|
case OPS.closeFillStroke:
|
|
3084
3183
|
case OPS.closeEOFillStroke:
|
|
3085
|
-
flushPath(true);
|
|
3184
|
+
flushPath(true, true);
|
|
3086
3185
|
break;
|
|
3087
3186
|
case OPS.endPath:
|
|
3088
|
-
flushPath(false);
|
|
3187
|
+
flushPath(false, false);
|
|
3089
3188
|
break;
|
|
3090
3189
|
}
|
|
3091
3190
|
}
|
|
3092
|
-
return {
|
|
3191
|
+
return {
|
|
3192
|
+
horizontals: deduplicateLines(horizontals),
|
|
3193
|
+
verticals: deduplicateLines(verticals)
|
|
3194
|
+
};
|
|
3195
|
+
}
|
|
3196
|
+
/**
 * Drop line segments that duplicate an earlier one.
 *
 * Two segments count as duplicates when all four coordinates differ by at
 * most `COORD_MERGE_TOL`. The first occurrence is kept (with its own
 * coordinates) and its `lineWidth` is raised to the largest width seen among
 * its duplicates. Kept segment objects are the input objects themselves.
 *
 * @param {Array<{x1: number, y1: number, x2: number, y2: number, lineWidth: number}>} lines
 * @returns {Array} The input array itself when it has 0 or 1 entries,
 *   otherwise a new array of the retained segments in input order.
 */
function deduplicateLines(lines) {
  if (lines.length <= 1) return lines;

  const isNear = (a, b) => Math.abs(a - b) <= COORD_MERGE_TOL;
  const kept = [];

  for (const candidate of lines) {
    const twin = kept.find(
      (other) =>
        isNear(candidate.y1, other.y1) &&
        isNear(candidate.y2, other.y2) &&
        isNear(candidate.x1, other.x1) &&
        isNear(candidate.x2, other.x2)
    );

    if (twin === undefined) {
      kept.push(candidate);
      continue;
    }

    if (candidate.lineWidth > twin.lineWidth) {
      twin.lineWidth = candidate.lineWidth;
    }
  }

  return kept;
}
|
|
3094
3215
|
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
3095
3216
|
const dx = Math.abs(seg.x2 - seg.x1);
|
|
@@ -3685,6 +3806,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3685
3806
|
const medianFontSize = computeMedianFontSize(allFontSizes);
|
|
3686
3807
|
if (medianFontSize > 0) {
|
|
3687
3808
|
detectHeadings(blocks, medianFontSize);
|
|
3809
|
+
mergeAdjacentHeadings(blocks);
|
|
3688
3810
|
}
|
|
3689
3811
|
detectMarkerHeadings(blocks);
|
|
3690
3812
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
@@ -3759,6 +3881,46 @@ function detectHeadings(blocks, medianFontSize) {
|
|
|
3759
3881
|
}
|
|
3760
3882
|
}
|
|
3761
3883
|
}
|
|
3884
|
+
/**
 * Merge consecutive heading blocks that form one visual heading.
 *
 * Two neighbouring blocks are merged (in place, inside `blocks`) when both
 * are headings with a `bbox` and non-empty `text`, share the same `level`
 * and page, and their approximate baselines (`bbox.y` plus font size, or
 * plus `bbox.height` when no font size is known) differ by less than 1.5x
 * the larger font size (12 is assumed when a font size is missing).
 *
 * Text order:
 * - fragments on the same line (baseline difference below half the larger
 *   font size) are joined left-to-right by `bbox.x`;
 * - fragments on different lines keep their order in `blocks`, so a wrapped
 *   heading whose second line starts further left is not reversed.
 *
 * The merged `bbox` is the union of both boxes on both axes.
 *
 * @param {Array<object>} blocks - Block list; mutated in place.
 * @returns {void}
 */
function mergeAdjacentHeadings(blocks) {
  let i = 0;
  while (i < blocks.length - 1) {
    const curr = blocks[i];
    const next = blocks[i + 1];

    const bothUsableHeadings =
      curr.type === "heading" &&
      next.type === "heading" &&
      Boolean(curr.bbox && next.bbox && curr.text && next.text);
    if (!bothUsableHeadings) {
      i++;
      continue;
    }

    const currBaseline = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
    const nextBaseline = next.bbox.y + (next.style?.fontSize || next.bbox.height);
    const yDiff = Math.abs(currBaseline - nextBaseline);
    const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);

    const closeInY = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
    if (!closeInY || curr.level !== next.level) {
      i++;
      continue;
    }

    // Only reorder by x when both fragments really share a line; stacked
    // lines must keep document order.
    const onSameLine = yDiff < maxFs * 0.5;
    const nextComesFirst = onSameLine && next.bbox.x < curr.bbox.x;
    curr.text = nextComesFirst
      ? `${next.text} ${curr.text}`
      : `${curr.text} ${next.text}`;

    // Union of both boxes on each axis.
    const minX = Math.min(curr.bbox.x, next.bbox.x);
    const minY = Math.min(curr.bbox.y, next.bbox.y);
    const maxXEdge = Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width);
    const maxYEdge = Math.max(curr.bbox.y + curr.bbox.height, next.bbox.y + next.bbox.height);
    curr.bbox = {
      page: curr.bbox.page,
      x: minX,
      y: minY,
      width: maxXEdge - minX,
      height: maxYEdge - minY
    };

    // Stay on the same index so the merged heading can absorb further
    // neighbours.
    blocks.splice(i + 1, 1);
  }
}
|
|
3762
3924
|
function collapseEvenSpacing(text) {
|
|
3763
3925
|
const tokens = text.split(" ");
|
|
3764
3926
|
const singleCharCount = tokens.filter((t) => t.length === 1).length;
|
|
@@ -3767,6 +3929,169 @@ function collapseEvenSpacing(text) {
|
|
|
3767
3929
|
}
|
|
3768
3930
|
return text;
|
|
3769
3931
|
}
|
|
3932
|
+
/**
 * Build paragraph blocks for one page by ordering its text items with
 * `xyCutOrder` and emitting one paragraph per line.
 *
 * The gap threshold passed to `xyCutOrder` is 3% of the vertical extent of
 * the items (max `y` minus min `y`), but never below 15.
 *
 * The vertical extent is computed with a single pass instead of
 * `Math.max(...ys)` / `Math.min(...ys)`: spreading one argument per text item
 * can exceed the engine's argument limit on very dense pages and throw a
 * `RangeError`.
 *
 * @param {Array<object>} items - Text items of the page; each must expose a
 *   numeric `y`. Other fields are consumed by the helpers called here.
 * @param {number} pageNum - Page number stored on every emitted block.
 * @returns {Array<object>|null} Paragraph blocks, or `null` when nothing
 *   non-blank was produced.
 */
function buildXyCutBlocks(items, pageNum) {
  if (items.length === 0) return null;

  let minY = Infinity;
  let maxY = -Infinity;
  for (const item of items) {
    if (item.y < minY) minY = item.y;
    if (item.y > maxY) maxY = item.y;
  }
  const pageHeight = maxY - minY;
  const gapThreshold = Math.max(15, pageHeight * 0.03);

  const blocks = [];
  for (const group of xyCutOrder(items, gapThreshold)) {
    if (group.length === 0) continue;
    for (const line of groupByY(group)) {
      const text = mergeLineSimple(line);
      if (!text.trim()) continue; // skip whitespace-only lines
      blocks.push({
        type: "paragraph",
        text,
        pageNumber: pageNum,
        bbox: computeBBox(line, pageNum),
        style: dominantStyle(line)
      });
    }
  }
  return blocks.length > 0 ? blocks : null;
}
|
|
3955
|
+
/**
 * Re-interpret a grid table that looks under-segmented, i.e. where the grid
 * captured far fewer cells than its text content suggests.
 *
 * A table is treated as under-segmented when it is 1x1, or when it has at
 * most two non-blank cells and either at least 8 text lines inside them or
 * at least 6 text items.
 *
 * Recovery strategies, tried in order:
 *  1. multi-column text (`hasMultiColumnLayout`) -> `buildXyCutBlocks`;
 *  2. a table inferred from text alignment (`buildTableFromTextLayout`);
 *  3. cluster-detected tables (`detectClusterTables`) plus one paragraph per
 *     leftover non-blank item, sorted by descending `bbox.y + bbox.height`.
 *
 * @param {object} table - Grid table with `rows`, `cols` and `cells`.
 * @param {Array<object>} items - Text items located inside the table.
 * @param {number} pageNum - Page number stored on emitted blocks.
 * @param {object} bbox - Bounding box forwarded to `buildTableFromTextLayout`.
 * @returns {Array<object>|null} Replacement blocks, or `null` when the table
 *   should be kept as is (or no strategy produced anything).
 */
function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
  // Count non-blank cells and the text lines they contain in one sweep.
  let filledCells = 0;
  let textLines = 0;
  for (const row of table.cells) {
    for (const cell of row) {
      if (!cell.text.trim()) continue;
      filledCells += 1;
      textLines += cell.text.split("\n").length;
    }
  }

  const isSingleCell = table.rows === 1 && table.cols === 1;
  const isSparse = filledCells <= 2 && (textLines >= 8 || items.length >= 6);
  if (!isSingleCell && !isSparse) return null;

  if (hasMultiColumnLayout(items)) {
    return buildXyCutBlocks(items, pageNum);
  }

  const fromLayout = buildTableFromTextLayout(items, pageNum, bbox);
  if (fromLayout) return fromLayout;

  // Plain copies keyed by identity so cluster results can be mapped back to
  // positions in `items`.
  const clusterItems = items.map(({ text, x, y, w, h, fontSize, fontName }) => ({
    text,
    x,
    y,
    w,
    h,
    fontSize,
    fontName
  }));
  const clusterResults = detectClusterTables(clusterItems, pageNum);
  if (clusterResults.length === 0) return null;

  const indexOfClusterItem = new Map(clusterItems.map((entry, position) => [entry, position]));
  const consumed = new Set();
  const blocks = [];

  for (const result of clusterResults) {
    for (const used of result.usedItems) {
      const position = indexOfClusterItem.get(used);
      if (position !== void 0) consumed.add(position);
    }
    blocks.push({ type: "table", table: result.table, pageNumber: pageNum, bbox: result.bbox });
  }

  items.forEach((item, position) => {
    if (consumed.has(position)) return;
    if (!item.text.trim()) return;
    blocks.push({
      type: "paragraph",
      text: item.text,
      pageNumber: pageNum,
      bbox: computeBBox([item], pageNum),
      style: { fontSize: item.fontSize, fontName: item.fontName }
    });
  });

  const upperEdge = (block) => (block.bbox ? block.bbox.y + block.bbox.height : 0);
  blocks.sort((a, b) => upperEdge(b) - upperEdge(a));

  return blocks.length > 0 ? blocks : null;
}
|
|
4005
|
+
/**
 * Infer a table purely from how text items are laid out.
 *
 * Steps:
 *  1. Group items into rows: items sorted by descending `y` join the current
 *     row while their `y` stays within 3 units of the row's first item.
 *  2. Inside each row, every horizontal gap of at least 1.5x the row's mean
 *     font size contributes a candidate column boundary (the gap midpoint).
 *  3. Candidates within 15 units of a running cluster mean are clustered;
 *     only clusters with at least 2 members become column boundaries.
 *  4. Items are assigned to columns by their horizontal centre. A row whose
 *     first column is blank is treated as a continuation and appended to the
 *     previous row.
 *  5. The table is rejected when it has fewer than 2 rows or fewer than 30%
 *     non-blank cells.
 *
 * In columns after the first, one leading bullet marker is removed from the
 * cell text. A hyphen is only treated as a bullet when it is followed by
 * further content and not directly by a digit or `.`, so negative numbers
 * (`-5`, `-.5`) and a lone `-` placeholder cell survive.
 *
 * @param {Array<object>} items - Text items with `text`, `x`, `y`, `w` and
 *   `fontSize`.
 * @param {number} pageNum - Page number stored on the emitted block.
 * @param {object} bbox - Bounding box stored on the emitted block.
 * @returns {Array<object>|null} A one-element array holding the table block,
 *   or `null` when no convincing table could be inferred.
 */
function buildTableFromTextLayout(items, pageNum, bbox) {
  if (items.length < 4) return null;

  // 1. Rows.
  const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
  const yTol = 3;
  const rows = [];
  let curRow = [sorted[0]];
  let curY = sorted[0].y;
  for (let i = 1; i < sorted.length; i++) {
    if (Math.abs(sorted[i].y - curY) <= yTol) {
      curRow.push(sorted[i]);
    } else {
      rows.push(curRow);
      curRow = [sorted[i]];
      curY = sorted[i].y;
    }
  }
  rows.push(curRow);
  if (rows.length < 2) return null;

  // 2. Candidate column boundaries from wide intra-row gaps.
  const gapPositions = [];
  for (const row of rows) {
    if (row.length < 2) continue;
    const sortedX = [...row].sort((a, b) => a.x - b.x);
    const avgFs = sortedX.reduce((s, i) => s + i.fontSize, 0) / sortedX.length;
    for (let j = 1; j < sortedX.length; j++) {
      const prevRight = sortedX[j - 1].x + sortedX[j - 1].w;
      const gap = sortedX[j].x - prevRight;
      if (gap >= avgFs * 1.5) {
        gapPositions.push(prevRight + gap / 2);
      }
    }
  }
  if (gapPositions.length < 2) return null;

  // 3. Cluster the candidates; a boundary needs support from >= 2 gaps.
  gapPositions.sort((a, b) => a - b);
  const colBoundaries = [];
  let clusterSum = gapPositions[0];
  let clusterCount = 1;
  for (let i = 1; i < gapPositions.length; i++) {
    const avg = clusterSum / clusterCount;
    if (Math.abs(gapPositions[i] - avg) <= 15) {
      clusterSum += gapPositions[i];
      clusterCount++;
    } else {
      if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
      clusterSum = gapPositions[i];
      clusterCount = 1;
    }
  }
  if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
  if (colBoundaries.length === 0) return null;

  // 4. Fill cells.
  const numCols = colBoundaries.length + 1;
  const tableRows = [];
  for (const row of rows) {
    const cells = Array(numCols).fill("");
    const sortedX = [...row].sort((a, b) => a.x - b.x);
    for (const item of sortedX) {
      const cx = item.x + item.w / 2;
      let col = 0;
      for (let b = 0; b < colBoundaries.length; b++) {
        if (cx > colBoundaries[b]) col = b + 1;
      }
      cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
    }
    if (cells[0].trim() === "" && tableRows.length > 0) {
      // Continuation line: fold its text into the previous row.
      const prevCells = tableRows[tableRows.length - 1].cells;
      for (let c = 0; c < numCols; c++) {
        if (cells[c].trim()) {
          prevCells[c] = prevCells[c] ? prevCells[c] + " " + cells[c].trim() : cells[c].trim();
        }
      }
    } else {
      tableRows.push({ cells });
    }
  }

  // 5. Sanity checks.
  if (tableRows.length < 2) return null;
  const nonEmptyCount = tableRows.reduce((sum, r) => sum + r.cells.filter((c) => c.trim()).length, 0);
  const totalCount = tableRows.length * numCols;
  if (nonEmptyCount < totalCount * 0.3) return null;

  // A hyphen counts as a bullet only when more content follows and it is not
  // the sign of a number.
  const leadingBullet = /^(?:[•○·]\s*|-(?![\d.])\s*(?=\S))/;
  const irCells = tableRows.map(
    (r) => r.cells.map((text, colIdx) => {
      let cleaned = text.trim();
      if (colIdx > 0) cleaned = cleaned.replace(leadingBullet, "");
      return { text: cleaned, colSpan: 1, rowSpan: 1 };
    })
  );
  const irTable = {
    rows: tableRows.length,
    cols: numCols,
    cells: irCells,
    hasHeader: tableRows.length > 1
  };
  return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
}
|
|
3770
4095
|
function shouldDemoteTable(table) {
|
|
3771
4096
|
const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
|
|
3772
4097
|
const allText = allCells.join(" ");
|
|
@@ -3813,6 +4138,32 @@ function detectMarkerHeadings(blocks) {
|
|
|
3813
4138
|
}
|
|
3814
4139
|
}
|
|
3815
4140
|
}
|
|
4141
|
+
function hasMultiColumnLayout(items) {
|
|
4142
|
+
if (items.length < 30) return false;
|
|
4143
|
+
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
4144
|
+
const minX = sorted[0].x;
|
|
4145
|
+
let maxX = minX;
|
|
4146
|
+
for (const i of sorted) if (i.x + i.w > maxX) maxX = i.x + i.w;
|
|
4147
|
+
const pageWidth = maxX - minX;
|
|
4148
|
+
if (pageWidth < 200) return false;
|
|
4149
|
+
let bestGap = 0;
|
|
4150
|
+
let bestSplit = 0;
|
|
4151
|
+
for (let j = 1; j < sorted.length; j++) {
|
|
4152
|
+
const gap = sorted[j].x - (sorted[j - 1].x + sorted[j - 1].w);
|
|
4153
|
+
if (gap > bestGap) {
|
|
4154
|
+
bestGap = gap;
|
|
4155
|
+
bestSplit = (sorted[j - 1].x + sorted[j - 1].w + sorted[j].x) / 2;
|
|
4156
|
+
}
|
|
4157
|
+
}
|
|
4158
|
+
if (bestGap < 20) return false;
|
|
4159
|
+
const splitRatio = (bestSplit - minX) / pageWidth;
|
|
4160
|
+
if (splitRatio < 0.35 || splitRatio > 0.65) return false;
|
|
4161
|
+
const leftCount = items.filter((i) => i.x + i.w / 2 < bestSplit).length;
|
|
4162
|
+
const rightCount = items.filter((i) => i.x + i.w / 2 >= bestSplit).length;
|
|
4163
|
+
if (leftCount < 15 || rightCount < 15) return false;
|
|
4164
|
+
if (Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount) < 0.35) return false;
|
|
4165
|
+
return true;
|
|
4166
|
+
}
|
|
3816
4167
|
var MAX_XYCUT_DEPTH = 50;
|
|
3817
4168
|
function xyCutOrder(items, gapThreshold, depth = 0) {
|
|
3818
4169
|
if (items.length === 0) return [];
|
|
@@ -3943,6 +4294,11 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
3943
4294
|
width: grid.bbox.x2 - grid.bbox.x1,
|
|
3944
4295
|
height: grid.bbox.y2 - grid.bbox.y1
|
|
3945
4296
|
};
|
|
4297
|
+
const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
|
|
4298
|
+
if (normalized) {
|
|
4299
|
+
blocks.push(...normalized);
|
|
4300
|
+
continue;
|
|
4301
|
+
}
|
|
3946
4302
|
if (shouldDemoteTable(irTable)) {
|
|
3947
4303
|
const demoted = demoteTableToText(irTable);
|
|
3948
4304
|
if (demoted) {
|
|
@@ -3988,6 +4344,10 @@ function mergeAdjacentTableBlocks(blocks) {
|
|
|
3988
4344
|
}
|
|
3989
4345
|
function extractPageBlocksFallback(items, pageNum) {
|
|
3990
4346
|
if (items.length === 0) return [];
|
|
4347
|
+
if (hasMultiColumnLayout(items)) {
|
|
4348
|
+
const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
|
|
4349
|
+
return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
|
|
4350
|
+
}
|
|
3991
4351
|
const blocks = [];
|
|
3992
4352
|
const allYLines = groupByY(items);
|
|
3993
4353
|
const columns = detectColumns(allYLines);
|
|
@@ -4005,7 +4365,7 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
4005
4365
|
fontSize: i.fontSize,
|
|
4006
4366
|
fontName: i.fontName
|
|
4007
4367
|
}));
|
|
4008
|
-
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4368
|
+
const clusterResults = hasMultiColumnLayout(items) ? [] : detectClusterTables(clusterItems, pageNum);
|
|
4009
4369
|
if (clusterResults.length > 0) {
|
|
4010
4370
|
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4011
4371
|
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|