kordoc 2.0.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +302 -291
- package/dist/{chunk-4UH6ABAY.js → chunk-GJ2S6IMC.js} +381 -22
- package/dist/chunk-GJ2S6IMC.js.map +1 -0
- package/dist/{chunk-3TBUDJDE.js → chunk-MOL7MDBG.js} +1 -1
- package/dist/chunk-MOL7MDBG.js.map +1 -0
- package/dist/{chunk-25TXW6EP.js → chunk-PKIJLEV6.js} +2 -2
- package/dist/chunk-PKIJLEV6.js.map +1 -0
- package/dist/cli.js +5 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +378 -18
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +378 -18
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-737B4EZW.js +8 -0
- package/dist/{provider-EU3CG724.js → provider-7H4CPZYS.js} +2 -1
- package/dist/provider-7H4CPZYS.js.map +1 -0
- package/dist/{utils-BTZ4WSYX.js → utils-BWQ2RGUD.js} +2 -2
- package/dist/{watch-QD3PDNXQ.js → watch-X7IC7MLF.js} +10 -6
- package/dist/watch-X7IC7MLF.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-25TXW6EP.js.map +0 -1
- package/dist/chunk-3TBUDJDE.js.map +0 -1
- package/dist/chunk-4UH6ABAY.js.map +0 -1
- package/dist/page-range-OF5I4PQY.js +0 -8
- package/dist/provider-EU3CG724.js.map +0 -1
- package/dist/watch-QD3PDNXQ.js.map +0 -1
- /package/dist/{page-range-OF5I4PQY.js.map → page-range-737B4EZW.js.map} +0 -0
- /package/dist/{utils-BTZ4WSYX.js.map → utils-BWQ2RGUD.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -85,6 +85,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
|
|
|
85
85
|
blocks.push({ type: "paragraph", text: text.trim(), pageNumber: i });
|
|
86
86
|
}
|
|
87
87
|
} catch {
|
|
88
|
+
blocks.push({ type: "paragraph", text: `[OCR \uC2E4\uD328: \uD398\uC774\uC9C0 ${i}]` });
|
|
88
89
|
}
|
|
89
90
|
}
|
|
90
91
|
return blocks;
|
|
@@ -182,7 +183,7 @@ var import_zlib = require("zlib");
|
|
|
182
183
|
var import_xmldom = require("@xmldom/xmldom");
|
|
183
184
|
|
|
184
185
|
// src/utils.ts
|
|
185
|
-
var VERSION = true ? "2.0.3" : "0.0.0-dev";
|
|
186
|
+
var VERSION = true ? "2.1.0" : "0.0.0-dev";
|
|
186
187
|
function toArrayBuffer(buf) {
|
|
187
188
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
188
189
|
return buf.buffer;
|
|
@@ -512,6 +513,9 @@ function tableToMarkdown(table) {
|
|
|
512
513
|
if (dr === 0 && dc === 0) continue;
|
|
513
514
|
if (r + dr < numRows && c + dc < numCols) {
|
|
514
515
|
skip.add(`${r + dr},${c + dc}`);
|
|
516
|
+
if (dr === 0) {
|
|
517
|
+
display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
|
|
518
|
+
}
|
|
515
519
|
}
|
|
516
520
|
}
|
|
517
521
|
}
|
|
@@ -607,7 +611,12 @@ function parseCharProperties(doc, map) {
|
|
|
607
611
|
if (!id) continue;
|
|
608
612
|
const prop = {};
|
|
609
613
|
const height = el.getAttribute("height");
|
|
610
|
-
if (height)
|
|
614
|
+
if (height) {
|
|
615
|
+
const parsedHeight = parseInt(height, 10);
|
|
616
|
+
if (!isNaN(parsedHeight) && parsedHeight > 0) {
|
|
617
|
+
prop.fontSize = parsedHeight / 100;
|
|
618
|
+
}
|
|
619
|
+
}
|
|
611
620
|
const bold = el.getAttribute("bold");
|
|
612
621
|
if (bold === "true" || bold === "1") prop.bold = true;
|
|
613
622
|
const italic = el.getAttribute("italic");
|
|
@@ -747,7 +756,7 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
747
756
|
const data = await file.async("uint8array");
|
|
748
757
|
decompressed.total += data.length;
|
|
749
758
|
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
750
|
-
const ext = ref.includes(".") ? ref.split(".").pop() : "png";
|
|
759
|
+
const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
|
|
751
760
|
const mimeType = imageExtToMime(ext);
|
|
752
761
|
imageIndex++;
|
|
753
762
|
const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
|
|
@@ -1041,8 +1050,10 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
1041
1050
|
break;
|
|
1042
1051
|
case "cellSpan":
|
|
1043
1052
|
if (tableCtx?.cell) {
|
|
1044
|
-
const
|
|
1045
|
-
const
|
|
1053
|
+
const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
|
|
1054
|
+
const cs = isNaN(rawCs) ? 1 : rawCs;
|
|
1055
|
+
const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
|
|
1056
|
+
const rs = isNaN(rawRs) ? 1 : rawRs;
|
|
1046
1057
|
tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
|
|
1047
1058
|
tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
|
|
1048
1059
|
}
|
|
@@ -1134,6 +1145,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1134
1145
|
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
1135
1146
|
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
1136
1147
|
walkChildren(el, d + 1);
|
|
1148
|
+
} else if (localTag === "run") {
|
|
1149
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
|
|
1137
1150
|
}
|
|
1138
1151
|
}
|
|
1139
1152
|
};
|
|
@@ -3002,10 +3015,33 @@ var MIN_LINE_LENGTH = 10;
|
|
|
3002
3015
|
var COORD_MERGE_TOL = 3;
|
|
3003
3016
|
var CONNECT_TOL = 5;
|
|
3004
3017
|
var CELL_PADDING = 2;
|
|
3018
|
+
var MAX_LINE_WIDTH = 5;
|
|
3019
|
+
var IDENTITY = [1, 0, 0, 1, 0, 0];
|
|
3020
|
+
/**
 * Multiply two affine matrices stored as six-element arrays
 * `[a, b, c, d, e, f]` (indices 0-3 are the linear part, 4-5 the translation).
 *
 * The result applies `m2` first and `m1` second, i.e. for any point p:
 * matTransformPoint(matMultiply(m1, m2), p) equals
 * matTransformPoint(m1, matTransformPoint(m2, p)).
 *
 * @param {number[]} m1 - Outer (left-hand) matrix.
 * @param {number[]} m2 - Inner (right-hand) matrix.
 * @returns {number[]} A new six-element matrix; neither input is modified.
 */
function matMultiply(m1, m2) {
  // Linear 2x2 part.
  const a = m1[0] * m2[0] + m1[2] * m2[1];
  const b = m1[1] * m2[0] + m1[3] * m2[1];
  const c = m1[0] * m2[2] + m1[2] * m2[3];
  const d = m1[1] * m2[2] + m1[3] * m2[3];
  // Translation: m2's offset pushed through m1, plus m1's own offset.
  const e = m1[0] * m2[4] + m1[2] * m2[5] + m1[4];
  const f = m1[1] * m2[4] + m1[3] * m2[5] + m1[5];
  return [a, b, c, d, e, f];
}
|
|
3030
|
+
/**
 * Apply a six-element affine matrix `[a, b, c, d, e, f]` to a point.
 *
 * @param {number[]} m - Matrix to apply.
 * @param {number} x - Input x coordinate.
 * @param {number} y - Input y coordinate.
 * @returns {[number, number]} The transformed `[x, y]` pair.
 */
function matTransformPoint(m, x, y) {
  const outX = m[0] * x + m[2] * y + m[4];
  const outY = m[1] * x + m[3] * y + m[5];
  return [outX, outY];
}
|
|
3033
|
+
/**
 * Reduce a six-element affine matrix to a single scale factor.
 *
 * Computes the Euclidean norm of (m[1], m[3]) and of (m[0], m[2]) and returns
 * the larger one. flushPath multiplies the current stroke width by this value
 * to get the effective line width.
 *
 * @param {number[]} m - Matrix `[a, b, c, d, e, f]`; only indices 0-3 are read.
 * @returns {number} The larger of the two norms.
 */
function matScale(m) {
  const normBD = Math.sqrt(m[1] * m[1] + m[3] * m[3]);
  const normAC = Math.sqrt(m[0] * m[0] + m[2] * m[2]);
  return Math.max(normBD, normAC);
}
|
|
3005
3039
|
function extractLines(fnArray, argsArray) {
|
|
3006
3040
|
const horizontals = [];
|
|
3007
3041
|
const verticals = [];
|
|
3042
|
+
let ctm = [...IDENTITY];
|
|
3008
3043
|
let lineWidth = 1;
|
|
3044
|
+
const stateStack = [];
|
|
3009
3045
|
let currentPath = [];
|
|
3010
3046
|
let pathStartX = 0, pathStartY = 0;
|
|
3011
3047
|
let curX = 0, curY = 0;
|
|
@@ -3023,13 +3059,53 @@ function extractLines(fnArray, argsArray) {
|
|
|
3023
3059
|
);
|
|
3024
3060
|
}
|
|
3025
3061
|
}
|
|
3026
|
-
function
|
|
3027
|
-
if (
|
|
3062
|
+
/**
 * Collapse a short closed path of line segments into either a single centre
 * line or a rectangle, rewriting `path` in place.
 *
 * Only paths of 3-5 segments whose last end point is within 1 unit of the
 * first start point are considered. The path's bounding box is then
 * classified:
 *   - flat box   -> one horizontal segment through the vertical middle
 *   - narrow box -> one vertical segment through the horizontal middle
 *   - otherwise  -> delegated to pushRectangle(path, x, y, w, h)
 *
 * @param {Array<{x1: number, y1: number, x2: number, y2: number}>} path
 *   Segments of the current path; emptied and refilled on success.
 * @returns {boolean} true when `path` was rewritten, false when left untouched.
 */
function tryConvertLinesToRectangle(path) {
  const segmentCount = path.length;
  if (segmentCount < 3 || segmentCount > 5) return false;

  const head = path[0];
  const tail = path[segmentCount - 1];
  const isClosed = Math.abs(head.x1 - tail.x2) < 1 && Math.abs(head.y1 - tail.y2) < 1;
  if (!isClosed) return false;

  // Bounding box over every end point (at most ten values, so spreading is safe).
  const xs = path.flatMap((seg) => [seg.x1, seg.x2]);
  const ys = path.flatMap((seg) => [seg.y1, seg.y2]);
  const minX = Math.min(Infinity, ...xs);
  const minY = Math.min(Infinity, ...ys);
  const maxX = Math.max(-Infinity, ...xs);
  const maxY = Math.max(-Infinity, ...ys);
  const w = maxX - minX;
  const h = maxY - minY;

  // Too small in both directions to be a table rule.
  if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;

  path.length = 0;
  const isFlat = h < ORIENTATION_TOL * 2 || (w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH);
  const isNarrow = w < ORIENTATION_TOL * 2 || (h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH);
  if (isFlat) {
    const midY = (minY + maxY) / 2;
    path.push({ x1: minX, y1: midY, x2: maxX, y2: midY });
  } else if (isNarrow) {
    const midX = (minX + maxX) / 2;
    path.push({ x1: midX, y1: minY, x2: midX, y2: maxY });
  } else {
    pushRectangle(path, minX, minY, w, h);
  }
  return true;
}
|
|
3086
|
+
function flushPath(isStroke, isFill) {
|
|
3087
|
+
if (!isStroke && !isFill) {
|
|
3088
|
+
currentPath = [];
|
|
3089
|
+
return;
|
|
3090
|
+
}
|
|
3091
|
+
if (isFill && !isStroke && currentPath.length >= 3) {
|
|
3092
|
+
tryConvertLinesToRectangle(currentPath);
|
|
3093
|
+
}
|
|
3094
|
+
const scale = matScale(ctm);
|
|
3095
|
+
const effectiveLW = lineWidth * scale;
|
|
3096
|
+
if (effectiveLW > MAX_LINE_WIDTH && isStroke && !isFill) {
|
|
3028
3097
|
currentPath = [];
|
|
3029
3098
|
return;
|
|
3030
3099
|
}
|
|
3031
3100
|
for (const seg of currentPath) {
|
|
3032
|
-
|
|
3101
|
+
const [px1, py1] = matTransformPoint(ctm, seg.x1, seg.y1);
|
|
3102
|
+
const [px2, py2] = matTransformPoint(ctm, seg.x2, seg.y2);
|
|
3103
|
+
classifyAndAdd(
|
|
3104
|
+
{ x1: px1, y1: py1, x2: px2, y2: py2 },
|
|
3105
|
+
effectiveLW,
|
|
3106
|
+
horizontals,
|
|
3107
|
+
verticals
|
|
3108
|
+
);
|
|
3033
3109
|
}
|
|
3034
3110
|
currentPath = [];
|
|
3035
3111
|
}
|
|
@@ -3037,9 +3113,28 @@ function extractLines(fnArray, argsArray) {
|
|
|
3037
3113
|
const op = fnArray[i];
|
|
3038
3114
|
const args = argsArray[i];
|
|
3039
3115
|
switch (op) {
|
|
3116
|
+
// ── Graphics State ──
|
|
3117
|
+
case import_pdf.OPS.save:
|
|
3118
|
+
stateStack.push({ ctm: [...ctm], lineWidth });
|
|
3119
|
+
break;
|
|
3120
|
+
case import_pdf.OPS.restore:
|
|
3121
|
+
if (stateStack.length > 0) {
|
|
3122
|
+
const state = stateStack.pop();
|
|
3123
|
+
ctm = state.ctm;
|
|
3124
|
+
lineWidth = state.lineWidth;
|
|
3125
|
+
}
|
|
3126
|
+
break;
|
|
3127
|
+
case import_pdf.OPS.transform: {
|
|
3128
|
+
const m = args;
|
|
3129
|
+
if (m.length >= 6) {
|
|
3130
|
+
ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
|
|
3131
|
+
}
|
|
3132
|
+
break;
|
|
3133
|
+
}
|
|
3040
3134
|
case import_pdf.OPS.setLineWidth:
|
|
3041
3135
|
lineWidth = args[0] || 1;
|
|
3042
3136
|
break;
|
|
3137
|
+
// ── Path Construction ──
|
|
3043
3138
|
case import_pdf.OPS.constructPath: {
|
|
3044
3139
|
const arg0 = args[0];
|
|
3045
3140
|
if (Array.isArray(arg0)) {
|
|
@@ -3107,34 +3202,60 @@ function extractLines(fnArray, argsArray) {
|
|
|
3107
3202
|
}
|
|
3108
3203
|
}
|
|
3109
3204
|
}
|
|
3110
|
-
|
|
3111
|
-
|
|
3112
|
-
|
|
3113
|
-
|
|
3205
|
+
const isStroke5 = afterOp === import_pdf.OPS.stroke || afterOp === import_pdf.OPS.closeStroke;
|
|
3206
|
+
const isFill5 = afterOp === import_pdf.OPS.fill || afterOp === import_pdf.OPS.eoFill;
|
|
3207
|
+
const isBoth5 = afterOp === import_pdf.OPS.fillStroke || afterOp === import_pdf.OPS.eoFillStroke || afterOp === import_pdf.OPS.closeFillStroke || afterOp === import_pdf.OPS.closeEOFillStroke;
|
|
3208
|
+
if (isStroke5 || isFill5 || isBoth5) {
|
|
3209
|
+
flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
|
|
3114
3210
|
} else if (afterOp === import_pdf.OPS.endPath) {
|
|
3115
|
-
flushPath(false);
|
|
3211
|
+
flushPath(false, false);
|
|
3116
3212
|
}
|
|
3117
3213
|
}
|
|
3118
3214
|
break;
|
|
3119
3215
|
}
|
|
3216
|
+
// ── Paint Operations ──
|
|
3120
3217
|
case import_pdf.OPS.stroke:
|
|
3121
3218
|
case import_pdf.OPS.closeStroke:
|
|
3122
|
-
flushPath(true);
|
|
3219
|
+
flushPath(true, false);
|
|
3123
3220
|
break;
|
|
3124
3221
|
case import_pdf.OPS.fill:
|
|
3125
3222
|
case import_pdf.OPS.eoFill:
|
|
3223
|
+
flushPath(false, true);
|
|
3224
|
+
break;
|
|
3126
3225
|
case import_pdf.OPS.fillStroke:
|
|
3127
3226
|
case import_pdf.OPS.eoFillStroke:
|
|
3128
3227
|
case import_pdf.OPS.closeFillStroke:
|
|
3129
3228
|
case import_pdf.OPS.closeEOFillStroke:
|
|
3130
|
-
flushPath(true);
|
|
3229
|
+
flushPath(true, true);
|
|
3131
3230
|
break;
|
|
3132
3231
|
case import_pdf.OPS.endPath:
|
|
3133
|
-
flushPath(false);
|
|
3232
|
+
flushPath(false, false);
|
|
3134
3233
|
break;
|
|
3135
3234
|
}
|
|
3136
3235
|
}
|
|
3137
|
-
return {
|
|
3236
|
+
return {
|
|
3237
|
+
horizontals: deduplicateLines(horizontals),
|
|
3238
|
+
verticals: deduplicateLines(verticals)
|
|
3239
|
+
};
|
|
3240
|
+
}
|
|
3241
|
+
/**
 * Drop line segments that duplicate an earlier one within a coordinate
 * tolerance.
 *
 * Two lines are duplicates when all four coordinates (x1, y1, x2, y2) differ
 * by at most `tol`. The first occurrence is kept; if a later duplicate has a
 * larger `lineWidth`, the kept line's `lineWidth` is raised to it (the kept
 * object is modified in place).
 *
 * @param {Array<{x1: number, y1: number, x2: number, y2: number, lineWidth: number}>} lines
 *   Candidate lines, in the order they were collected.
 * @param {number} [tol=COORD_MERGE_TOL] - Maximum per-coordinate difference
 *   for two lines to count as the same line. Defaults to the module constant,
 *   so existing single-argument callers are unaffected.
 * @returns {Array} `lines` itself when it has at most one entry, otherwise a
 *   new array holding the surviving line objects in first-seen order.
 */
function deduplicateLines(lines, tol = COORD_MERGE_TOL) {
  if (lines.length <= 1) return lines;
  const kept = [];
  for (const line of lines) {
    const twin = kept.find(
      (existing) =>
        Math.abs(line.y1 - existing.y1) <= tol &&
        Math.abs(line.y2 - existing.y2) <= tol &&
        Math.abs(line.x1 - existing.x1) <= tol &&
        Math.abs(line.x2 - existing.x2) <= tol
    );
    if (twin === undefined) {
      kept.push(line);
      continue;
    }
    // Keep the thicker stroke of the two duplicates.
    if (line.lineWidth > twin.lineWidth) {
      twin.lineWidth = line.lineWidth;
    }
  }
  return kept;
}
|
|
3139
3260
|
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
3140
3261
|
const dx = Math.abs(seg.x2 - seg.x1);
|
|
@@ -3730,6 +3851,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3730
3851
|
const medianFontSize = computeMedianFontSize(allFontSizes);
|
|
3731
3852
|
if (medianFontSize > 0) {
|
|
3732
3853
|
detectHeadings(blocks, medianFontSize);
|
|
3854
|
+
mergeAdjacentHeadings(blocks);
|
|
3733
3855
|
}
|
|
3734
3856
|
detectMarkerHeadings(blocks);
|
|
3735
3857
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
@@ -3804,6 +3926,46 @@ function detectHeadings(blocks, medianFontSize) {
|
|
|
3804
3926
|
}
|
|
3805
3927
|
}
|
|
3806
3928
|
}
|
|
3929
|
+
/**
 * Merge consecutive heading blocks that lie on roughly the same line into a
 * single heading. Operates in place on `blocks`.
 *
 * Two neighbouring blocks are merged when both are headings with text and a
 * bbox, they are on the same bbox page, have the same `level`, and their
 * reference y values (bbox.y plus font size, or bbox height when no font size
 * is set) differ by less than 1.5x the larger font size (12 is assumed when a
 * font size is missing). The texts are joined with a space, left-most block
 * first, and the second block is removed.
 *
 * The merged bbox is the union of both boxes on both axes. Because blocks up
 * to 1.5x font size apart vertically can be merged, taking only the taller of
 * the two heights would produce a box that does not contain both headings.
 *
 * @param {Array<object>} blocks - Page blocks in reading order; mutated.
 * @returns {void}
 */
function mergeAdjacentHeadings(blocks) {
  let i = 0;
  while (i < blocks.length - 1) {
    const curr = blocks[i];
    const next = blocks[i + 1];

    const bothHeadings = curr.type === "heading" && next.type === "heading";
    if (!bothHeadings || !curr.bbox || !next.bbox || !curr.text || !next.text) {
      i++;
      continue;
    }

    const currRefY = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
    const nextRefY = next.bbox.y + (next.style?.fontSize || next.bbox.height);
    const yDiff = Math.abs(currRefY - nextRefY);
    const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
    const sameY = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
    const sameLevel = curr.level === next.level;
    if (!sameY || !sameLevel) {
      i++;
      continue;
    }

    // Left-most fragment goes first in the joined text.
    curr.text = curr.bbox.x <= next.bbox.x
      ? curr.text + " " + next.text
      : next.text + " " + curr.text;

    // Union of the two boxes on both axes.
    const xStart = Math.min(curr.bbox.x, next.bbox.x);
    const xEnd = Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width);
    const yStart = Math.min(curr.bbox.y, next.bbox.y);
    const yEnd = Math.max(curr.bbox.y + curr.bbox.height, next.bbox.y + next.bbox.height);
    curr.bbox = {
      page: curr.bbox.page,
      x: xStart,
      y: yStart,
      width: xEnd - xStart,
      height: yEnd - yStart
    };

    // Do not advance i: the merged heading may absorb the following block too.
    blocks.splice(i + 1, 1);
  }
}
|
|
3807
3969
|
function collapseEvenSpacing(text) {
|
|
3808
3970
|
const tokens = text.split(" ");
|
|
3809
3971
|
const singleCharCount = tokens.filter((t) => t.length === 1).length;
|
|
@@ -3812,6 +3974,169 @@ function collapseEvenSpacing(text) {
|
|
|
3812
3974
|
}
|
|
3813
3975
|
return text;
|
|
3814
3976
|
}
|
|
3977
|
+
/**
 * Build paragraph blocks for a page region by ordering its text items with
 * xyCutOrder and emitting one paragraph per y-line.
 *
 * The gap threshold passed to xyCutOrder is 3% of the items' y extent, with a
 * floor of 15.
 *
 * The y extent is computed with a reduce rather than Math.max(...ys) /
 * Math.min(...ys): spreading one argument per text item can exceed the
 * engine's argument limit and throw RangeError on very large pages.
 *
 * @param {Array<object>} items - Text items; `y` is read here, the rest is
 *   consumed by the helpers.
 * @param {number} pageNum - Page number stamped on every emitted block.
 * @returns {Array<object>|null} Paragraph blocks, or null when nothing
 *   non-blank was produced.
 */
function buildXyCutBlocks(items, pageNum) {
  // Same results as Math.min/max over the spread list, including NaN
  // propagation and the +/-Infinity values for an empty list.
  const minY = items.reduce((lowest, item) => Math.min(lowest, item.y), Infinity);
  const maxY = items.reduce((highest, item) => Math.max(highest, item.y), -Infinity);
  const pageHeight = maxY - minY;
  const gapThreshold = Math.max(15, pageHeight * 0.03);

  const orderedGroups = xyCutOrder(items, gapThreshold);
  const blocks = [];
  for (const group of orderedGroups) {
    if (group.length === 0) continue;
    for (const line of groupByY(group)) {
      const text = mergeLineSimple(line);
      if (!text.trim()) continue;
      blocks.push({
        type: "paragraph",
        text,
        pageNumber: pageNum,
        bbox: computeBBox(line, pageNum),
        style: dominantStyle(line)
      });
    }
  }
  return blocks.length > 0 ? blocks : null;
}
|
|
4000
|
+
/**
 * Re-interpret a grid-detected table that captured too little structure.
 *
 * A table counts as under-segmented when it is 1x1, or when it has at most
 * two non-blank cells while either those cells hold eight or more text lines
 * or the region contains six or more text items. For such a table the region
 * is rebuilt, trying in order:
 *   1. XY-cut paragraphs, if the items look like a multi-column layout;
 *   2. a table inferred from text positions (buildTableFromTextLayout);
 *   3. cluster-detected tables, plus one paragraph per unused non-blank item,
 *      sorted by descending `bbox.y + bbox.height`.
 *
 * @param {object} table - IR table (`rows`, `cols`, `cells[][]` with `text`).
 * @param {Array<object>} items - Text items inside the table region.
 * @param {number} pageNum - Page number for emitted blocks.
 * @param {object} bbox - Bounding box of the table region.
 * @returns {Array<object>|null} Replacement blocks, or null to keep the
 *   original table.
 */
function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
  // One pass over the cells: count non-blank cells and their text lines.
  let filledCells = 0;
  let textLines = 0;
  for (const row of table.cells) {
    for (const cell of row) {
      if (!cell.text.trim()) continue;
      filledCells += 1;
      textLines += cell.text.split("\n").length;
    }
  }

  const underSegmented =
    (table.rows === 1 && table.cols === 1) ||
    (filledCells <= 2 && (textLines >= 8 || items.length >= 6));
  if (!underSegmented) return null;

  if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);

  const layoutTable = buildTableFromTextLayout(items, pageNum, bbox);
  if (layoutTable) return layoutTable;

  // detectClusterTables gets plain copies; identity of each copy maps back to
  // the index of the item it was made from.
  const clusterItems = items.map((src) => ({
    text: src.text,
    x: src.x,
    y: src.y,
    w: src.w,
    h: src.h,
    fontSize: src.fontSize,
    fontName: src.fontName
  }));
  const clusterResults = detectClusterTables(clusterItems, pageNum);
  if (!(clusterResults.length > 0)) return null;

  const indexByCopy = new Map();
  clusterItems.forEach((copy, idx) => indexByCopy.set(copy, idx));

  const blocks = [];
  const consumed = new Set();
  for (const result of clusterResults) {
    for (const used of result.usedItems) {
      const idx = indexByCopy.get(used);
      if (idx !== undefined) consumed.add(idx);
    }
    blocks.push({ type: "table", table: result.table, pageNumber: pageNum, bbox: result.bbox });
  }

  // Everything the cluster tables did not claim becomes its own paragraph.
  items.forEach((item, idx) => {
    if (consumed.has(idx)) return;
    if (!item.text.trim()) return;
    blocks.push({
      type: "paragraph",
      text: item.text,
      pageNumber: pageNum,
      bbox: computeBBox([item], pageNum),
      style: { fontSize: item.fontSize, fontName: item.fontName }
    });
  });

  const upperEdge = (block) => (block.bbox ? block.bbox.y + block.bbox.height : 0);
  blocks.sort((a, b) => upperEdge(b) - upperEdge(a));
  return blocks.length > 0 ? blocks : null;
}
|
|
4050
|
+
/**
 * Infer a table purely from the positions of text items.
 *
 * Steps:
 *   1. Group items into rows: sorted by descending y, an item joins the
 *      current row while its y is within 3 of the row's first item.
 *   2. In every row with two or more items, record the midpoint of each
 *      horizontal gap that is at least 1.5x the row's mean font size.
 *   3. Cluster the sorted midpoints (within 15 of the running mean); every
 *      cluster with two or more members becomes a column boundary.
 *   4. Drop each item into the column its horizontal centre falls in. A row
 *      whose first column is blank is treated as a continuation and appended
 *      to the previous row's cells.
 *   5. Reject the result if fewer than two rows remain or fewer than 30% of
 *      the cells are filled.
 *
 * @param {Array<object>} items - Text items (`text`, `x`, `y`, `w`, `fontSize`).
 * @param {number} pageNum - Page number for the emitted block.
 * @param {object} bbox - Bounding box passed through to the emitted block.
 * @returns {Array<object>|null} A one-element array holding the table block,
 *   or null when no convincing table was found.
 */
function buildTableFromTextLayout(items, pageNum, bbox) {
  if (items.length < 4) return null;

  // 1. Rows.
  const rowTolerance = 3;
  const rows = [];
  let rowAnchorY;
  for (const item of [...items].sort((a, b) => b.y - a.y || a.x - b.x)) {
    if (rows.length > 0 && Math.abs(item.y - rowAnchorY) <= rowTolerance) {
      rows[rows.length - 1].push(item);
    } else {
      rows.push([item]);
      rowAnchorY = item.y;
    }
  }
  if (rows.length < 2) return null;

  // 2. Midpoints of wide gaps.
  const leftToRight = (a, b) => a.x - b.x;
  const gapPositions = [];
  for (const row of rows) {
    if (row.length < 2) continue;
    const ordered = [...row].sort(leftToRight);
    const meanFontSize = ordered.reduce((total, it) => total + it.fontSize, 0) / ordered.length;
    for (let j = 1; j < ordered.length; j++) {
      const prevRight = ordered[j - 1].x + ordered[j - 1].w;
      const gap = ordered[j].x - prevRight;
      if (gap >= meanFontSize * 1.5) {
        gapPositions.push(prevRight + gap / 2);
      }
    }
  }
  if (gapPositions.length < 2) return null;
  gapPositions.sort((a, b) => a - b);

  // 3. Column boundaries from clusters of midpoints.
  const colBoundaries = [];
  let clusterSum = gapPositions[0];
  let clusterCount = 1;
  const closeCluster = () => {
    // A boundary needs support from at least two gaps.
    if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
  };
  for (const position of gapPositions.slice(1)) {
    if (Math.abs(position - clusterSum / clusterCount) <= 15) {
      clusterSum += position;
      clusterCount++;
    } else {
      closeCluster();
      clusterSum = position;
      clusterCount = 1;
    }
  }
  closeCluster();
  if (colBoundaries.length === 0) return null;

  // 4. Fill the grid.
  const numCols = colBoundaries.length + 1;
  const grid = [];
  for (const row of rows) {
    const cells = new Array(numCols).fill("");
    for (const item of [...row].sort(leftToRight)) {
      const centerX = item.x + item.w / 2;
      // Column = one past the last boundary lying left of the centre.
      let col = 0;
      for (let b = colBoundaries.length - 1; b >= 0; b--) {
        if (centerX > colBoundaries[b]) {
          col = b + 1;
          break;
        }
      }
      cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
    }
    if (cells[0].trim() === "" && grid.length > 0) {
      // Continuation line: append to the previous row.
      const previous = grid[grid.length - 1];
      cells.forEach((cell, c) => {
        const extra = cell.trim();
        if (!extra) return;
        previous[c] = previous[c] ? previous[c] + " " + extra : extra;
      });
    } else {
      grid.push(cells);
    }
  }

  // 5. Sanity checks.
  if (grid.length < 2) return null;
  const filled = grid.reduce((total, cells) => total + cells.filter((c) => c.trim()).length, 0);
  if (filled < grid.length * numCols * 0.3) return null;

  const irCells = grid.map((cells) =>
    cells.map((raw, colIdx) => {
      const trimmed = raw.trim();
      // Leading bullet markers are stripped everywhere except the first column.
      const text = colIdx > 0 ? trimmed.replace(/^[•○·\-]\s*/, "") : trimmed;
      return { text, colSpan: 1, rowSpan: 1 };
    })
  );
  const irTable = {
    rows: grid.length,
    cols: numCols,
    cells: irCells,
    hasHeader: grid.length > 1
  };
  return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
}
|
|
3815
4140
|
function shouldDemoteTable(table) {
|
|
3816
4141
|
const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
|
|
3817
4142
|
const allText = allCells.join(" ");
|
|
@@ -3858,6 +4183,32 @@ function detectMarkerHeadings(blocks) {
|
|
|
3858
4183
|
}
|
|
3859
4184
|
}
|
|
3860
4185
|
}
|
|
4186
|
+
/**
 * Heuristically decide whether a set of text items is laid out in two
 * side-by-side columns.
 *
 * Requirements, all of which must hold:
 *   - at least 30 items spanning at least 200 units horizontally;
 *   - with items sorted by left edge, the widest gap between one item's right
 *     edge and the next item's left edge is at least 20;
 *   - the middle of that gap lies between 35% and 65% of the horizontal span;
 *   - each side holds at least 15 items (by horizontal centre), and the
 *     smaller side has at least 35% as many items as the larger.
 *
 * @param {Array<{x: number, w: number}>} items - Text items of a page region.
 * @returns {boolean} true when the items look like a two-column layout.
 */
function hasMultiColumnLayout(items) {
  if (items.length < 30) return false;

  const byLeftEdge = [...items].sort((a, b) => a.x - b.x);
  const minX = byLeftEdge[0].x;
  let maxX = minX;
  byLeftEdge.forEach((item) => {
    const right = item.x + item.w;
    if (right > maxX) maxX = right;
  });
  const pageWidth = maxX - minX;
  if (pageWidth < 200) return false;

  // Widest gap between consecutive items in left-edge order.
  let bestGap = 0;
  let bestSplit = 0;
  for (let j = 1; j < byLeftEdge.length; j++) {
    const prevRight = byLeftEdge[j - 1].x + byLeftEdge[j - 1].w;
    const nextLeft = byLeftEdge[j].x;
    const gap = nextLeft - prevRight;
    if (gap > bestGap) {
      bestGap = gap;
      bestSplit = (prevRight + nextLeft) / 2;
    }
  }
  if (bestGap < 20) return false;

  const splitRatio = (bestSplit - minX) / pageWidth;
  if (splitRatio < 0.35 || splitRatio > 0.65) return false;

  // Count items on either side of the split by their horizontal centre.
  let leftCount = 0;
  let rightCount = 0;
  items.forEach((item) => {
    const center = item.x + item.w / 2;
    if (center < bestSplit) leftCount++;
    if (center >= bestSplit) rightCount++;
  });
  if (leftCount < 15 || rightCount < 15) return false;

  const balance = Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount);
  return !(balance < 0.35);
}
|
|
3861
4212
|
var MAX_XYCUT_DEPTH = 50;
|
|
3862
4213
|
function xyCutOrder(items, gapThreshold, depth = 0) {
|
|
3863
4214
|
if (items.length === 0) return [];
|
|
@@ -3988,6 +4339,11 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
3988
4339
|
width: grid.bbox.x2 - grid.bbox.x1,
|
|
3989
4340
|
height: grid.bbox.y2 - grid.bbox.y1
|
|
3990
4341
|
};
|
|
4342
|
+
const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
|
|
4343
|
+
if (normalized) {
|
|
4344
|
+
blocks.push(...normalized);
|
|
4345
|
+
continue;
|
|
4346
|
+
}
|
|
3991
4347
|
if (shouldDemoteTable(irTable)) {
|
|
3992
4348
|
const demoted = demoteTableToText(irTable);
|
|
3993
4349
|
if (demoted) {
|
|
@@ -4033,6 +4389,10 @@ function mergeAdjacentTableBlocks(blocks) {
|
|
|
4033
4389
|
}
|
|
4034
4390
|
function extractPageBlocksFallback(items, pageNum) {
|
|
4035
4391
|
if (items.length === 0) return [];
|
|
4392
|
+
if (hasMultiColumnLayout(items)) {
|
|
4393
|
+
const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
|
|
4394
|
+
return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
|
|
4395
|
+
}
|
|
4036
4396
|
const blocks = [];
|
|
4037
4397
|
const allYLines = groupByY(items);
|
|
4038
4398
|
const columns = detectColumns(allYLines);
|
|
@@ -4050,7 +4410,7 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
4050
4410
|
fontSize: i.fontSize,
|
|
4051
4411
|
fontName: i.fontName
|
|
4052
4412
|
}));
|
|
4053
|
-
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4413
|
+
const clusterResults = hasMultiColumnLayout(items) ? [] : detectClusterTables(clusterItems, pageNum);
|
|
4054
4414
|
if (clusterResults.length > 0) {
|
|
4055
4415
|
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4056
4416
|
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|