kordoc 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-5Y2Q3BRW.js +52 -0
- package/dist/chunk-5Y2Q3BRW.js.map +1 -0
- package/dist/{chunk-GJ2S6IMC.js → chunk-LYFG7AUT.js} +966 -577
- package/dist/chunk-LYFG7AUT.js.map +1 -0
- package/dist/cli.js +12 -8
- package/dist/cli.js.map +1 -1
- package/dist/detect-GYK3HKD5.js +18 -0
- package/dist/index.cjs +993 -546
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +993 -546
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +40 -11
- package/dist/mcp.js.map +1 -1
- package/dist/{watch-X7IC7MLF.js → watch-Q5OXA73S.js} +31 -15
- package/dist/watch-Q5OXA73S.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-GJ2S6IMC.js.map +0 -1
- package/dist/chunk-PKIJLEV6.js +0 -93
- package/dist/chunk-PKIJLEV6.js.map +0 -1
- package/dist/utils-BWQ2RGUD.js +0 -22
- package/dist/watch-X7IC7MLF.js.map +0 -1
- /package/dist/{utils-BWQ2RGUD.js.map → detect-GYK3HKD5.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -139,7 +139,7 @@ import { inflateRawSync } from "zlib";
|
|
|
139
139
|
import { DOMParser } from "@xmldom/xmldom";
|
|
140
140
|
|
|
141
141
|
// src/utils.ts
|
|
142
|
-
var VERSION = true ? "2.
|
|
142
|
+
var VERSION = true ? "2.2.0" : "0.0.0-dev";
|
|
143
143
|
function toArrayBuffer(buf) {
|
|
144
144
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
145
145
|
return buf.buffer;
|
|
@@ -155,7 +155,8 @@ var KordocError = class extends Error {
|
|
|
155
155
|
function isPathTraversal(name) {
|
|
156
156
|
if (name.includes("\0")) return true;
|
|
157
157
|
const normalized = name.replace(/\\/g, "/");
|
|
158
|
-
|
|
158
|
+
const segments = normalized.split("/");
|
|
159
|
+
return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
159
160
|
}
|
|
160
161
|
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
161
162
|
try {
|
|
@@ -195,12 +196,25 @@ function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEnt
|
|
|
195
196
|
return { totalUncompressed: 0, entryCount: 0 };
|
|
196
197
|
}
|
|
197
198
|
}
|
|
199
|
+
/**
 * Remove every DOCTYPE declaration (with or without an internal `[...]`
 * subset) from an XML string before it is handed to the XML parser.
 *
 * The pattern accepts exactly the same declarations as the previous
 * `/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi`, but the trailing `\s*` now lives
 * inside the optional subset group. In the old form `[^[>]*` and `\s*` could
 * both consume the same whitespace, so an unterminated `<!DOCTYPE` followed by
 * a long run of whitespace caused quadratic backtracking.
 *
 * @param {string} xml - Raw XML text.
 * @returns {string} The XML text with all DOCTYPE declarations removed.
 */
function stripDtd(xml) {
  return xml.replace(/<!DOCTYPE\s[^[>]*(?:\[[\s\S]*?\]\s*)?>/gi, "");
}
|
|
198
202
|
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
199
203
|
function sanitizeHref(href) {
|
|
200
204
|
const trimmed = href.trim();
|
|
201
205
|
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
202
206
|
return trimmed;
|
|
203
207
|
}
|
|
208
|
+
/**
 * Smallest element of an array-like, found with a plain loop so that very
 * large inputs do not hit the argument-count limit of `Math.min(...arr)`.
 *
 * Elements that never compare as smaller (e.g. `NaN`) are ignored.
 *
 * @param {ArrayLike<number>} arr - Values to scan.
 * @returns {number} The minimum value, or `Infinity` when `arr` is empty.
 */
function safeMin(arr) {
  let min = Infinity;
  for (let i = 0; i < arr.length; i++) {
    if (arr[i] < min) {
      min = arr[i];
    }
  }
  return min;
}
|
|
213
|
+
/**
 * Largest element of an array-like, found with a plain loop so that very
 * large inputs do not hit the argument-count limit of `Math.max(...arr)`.
 *
 * Elements that never compare as larger (e.g. `NaN`) are ignored.
 *
 * @param {ArrayLike<number>} arr - Values to scan.
 * @returns {number} The maximum value, or `-Infinity` when `arr` is empty.
 */
function safeMax(arr) {
  let max = -Infinity;
  for (let i = 0; i < arr.length; i++) {
    if (arr[i] > max) {
      max = arr[i];
    }
  }
  return max;
}
|
|
204
218
|
function classifyError(err) {
|
|
205
219
|
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
206
220
|
const msg = err.message;
|
|
@@ -275,6 +289,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
275
289
|
if (end > maxCols) maxCols = end;
|
|
276
290
|
}
|
|
277
291
|
}
|
|
292
|
+
if (maxCols > MAX_COLS) maxCols = MAX_COLS;
|
|
278
293
|
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
279
294
|
const grid = Array.from(
|
|
280
295
|
{ length: numRows },
|
|
@@ -284,7 +299,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
284
299
|
for (const cell of row) {
|
|
285
300
|
const r = cell.rowAddr ?? 0;
|
|
286
301
|
const c = cell.colAddr ?? 0;
|
|
287
|
-
if (r >= numRows || c >= maxCols) continue;
|
|
302
|
+
if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
|
|
288
303
|
grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
|
|
289
304
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
290
305
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
@@ -469,9 +484,6 @@ function tableToMarkdown(table) {
|
|
|
469
484
|
if (dr === 0 && dc === 0) continue;
|
|
470
485
|
if (r + dr < numRows && c + dc < numCols) {
|
|
471
486
|
skip.add(`${r + dr},${c + dc}`);
|
|
472
|
-
if (dr === 0) {
|
|
473
|
-
display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
|
|
474
|
-
}
|
|
475
487
|
}
|
|
476
488
|
}
|
|
477
489
|
}
|
|
@@ -607,9 +619,6 @@ function parseStyleElements(doc, map) {
|
|
|
607
619
|
}
|
|
608
620
|
}
|
|
609
621
|
}
|
|
610
|
-
function stripDtd(xml) {
|
|
611
|
-
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
612
|
-
}
|
|
613
622
|
async function parseHwpxDocument(buffer, options) {
|
|
614
623
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
615
624
|
let zip;
|
|
@@ -959,7 +968,8 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
959
968
|
if (newTable.rows.length > 0) {
|
|
960
969
|
if (tableStack.length > 0) {
|
|
961
970
|
const parentTable = tableStack.pop();
|
|
962
|
-
|
|
971
|
+
let nestedCols = 0;
|
|
972
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
963
973
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
964
974
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
965
975
|
} else {
|
|
@@ -1068,7 +1078,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1068
1078
|
if (newTable.rows.length > 0) {
|
|
1069
1079
|
if (tableStack.length > 0) {
|
|
1070
1080
|
const parentTable = tableStack.pop();
|
|
1071
|
-
|
|
1081
|
+
let nestedCols = 0;
|
|
1082
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
1072
1083
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
1073
1084
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
1074
1085
|
} else {
|
|
@@ -2166,6 +2177,7 @@ function parseLenientCfb(data) {
|
|
|
2166
2177
|
if (miniSectorSizeShift > 16) throw new Error("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uBBF8\uB2C8 \uC139\uD130 \uD06C\uAE30 \uC2DC\uD504\uD2B8: " + miniSectorSizeShift);
|
|
2167
2178
|
const miniSectorSize = 1 << miniSectorSizeShift;
|
|
2168
2179
|
const fatSectorCount = data.readUInt32LE(44);
|
|
2180
|
+
if (fatSectorCount > 1e4) throw new Error("FAT \uC139\uD130 \uC218\uAC00 \uB108\uBB34 \uB9CE\uC2B5\uB2C8\uB2E4: " + fatSectorCount);
|
|
2169
2181
|
const firstDirSector = data.readUInt32LE(48);
|
|
2170
2182
|
const miniStreamCutoff = data.readUInt32LE(56);
|
|
2171
2183
|
const firstMiniFatSector = data.readUInt32LE(60);
|
|
@@ -2541,10 +2553,14 @@ function findSections(cfb) {
|
|
|
2541
2553
|
}
|
|
2542
2554
|
function findSectionsLenient(lcfb, compressed) {
|
|
2543
2555
|
const sections = [];
|
|
2556
|
+
let totalDecompressed = 0;
|
|
2544
2557
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2545
2558
|
const raw = lcfb.findStream(`/BodyText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2546
2559
|
if (!raw) break;
|
|
2547
|
-
|
|
2560
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2561
|
+
totalDecompressed += content.length;
|
|
2562
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2563
|
+
sections.push({ idx: i, content });
|
|
2548
2564
|
}
|
|
2549
2565
|
if (sections.length === 0) {
|
|
2550
2566
|
for (const e of lcfb.entries()) {
|
|
@@ -2552,7 +2568,12 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2552
2568
|
if (e.name.startsWith("Section")) {
|
|
2553
2569
|
const idx = parseInt(e.name.replace("Section", ""), 10) || 0;
|
|
2554
2570
|
const raw = lcfb.findStream(e.name);
|
|
2555
|
-
if (raw)
|
|
2571
|
+
if (raw) {
|
|
2572
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2573
|
+
totalDecompressed += content.length;
|
|
2574
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2575
|
+
sections.push({ idx, content });
|
|
2576
|
+
}
|
|
2556
2577
|
}
|
|
2557
2578
|
}
|
|
2558
2579
|
}
|
|
@@ -2560,11 +2581,15 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2560
2581
|
}
|
|
2561
2582
|
function findViewTextSectionsLenient(lcfb, compressed) {
|
|
2562
2583
|
const sections = [];
|
|
2584
|
+
let totalDecompressed = 0;
|
|
2563
2585
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2564
2586
|
const raw = lcfb.findStream(`/ViewText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2565
2587
|
if (!raw) break;
|
|
2566
2588
|
try {
|
|
2567
|
-
|
|
2589
|
+
const content = decryptViewText(raw, compressed);
|
|
2590
|
+
totalDecompressed += content.length;
|
|
2591
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2592
|
+
sections.push({ idx: i, content });
|
|
2568
2593
|
} catch {
|
|
2569
2594
|
break;
|
|
2570
2595
|
}
|
|
@@ -2966,37 +2991,18 @@ init_page_range();
|
|
|
2966
2991
|
// src/pdf/line-detector.ts
|
|
2967
2992
|
import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
2968
2993
|
var ORIENTATION_TOL = 2;
|
|
2969
|
-
var MIN_LINE_LENGTH =
|
|
2970
|
-
var
|
|
2994
|
+
var MIN_LINE_LENGTH = 15;
|
|
2995
|
+
var MAX_LINE_WIDTH = 5;
|
|
2971
2996
|
var CONNECT_TOL = 5;
|
|
2972
2997
|
var CELL_PADDING = 2;
|
|
2973
|
-
var
|
|
2974
|
-
var
|
|
2975
|
-
|
|
2976
|
-
|
|
2977
|
-
m1[0] * m2[0] + m1[2] * m2[1],
|
|
2978
|
-
m1[1] * m2[0] + m1[3] * m2[1],
|
|
2979
|
-
m1[0] * m2[2] + m1[2] * m2[3],
|
|
2980
|
-
m1[1] * m2[2] + m1[3] * m2[3],
|
|
2981
|
-
m1[0] * m2[4] + m1[2] * m2[5] + m1[4],
|
|
2982
|
-
m1[1] * m2[4] + m1[3] * m2[5] + m1[5]
|
|
2983
|
-
];
|
|
2984
|
-
}
|
|
2985
|
-
function matTransformPoint(m, x, y) {
|
|
2986
|
-
return [m[0] * x + m[2] * y + m[4], m[1] * x + m[3] * y + m[5]];
|
|
2987
|
-
}
|
|
2988
|
-
function matScale(m) {
|
|
2989
|
-
return Math.max(
|
|
2990
|
-
Math.sqrt(m[1] * m[1] + m[3] * m[3]),
|
|
2991
|
-
Math.sqrt(m[0] * m[0] + m[2] * m[2])
|
|
2992
|
-
);
|
|
2993
|
-
}
|
|
2998
|
+
var MIN_COL_WIDTH = 15;
|
|
2999
|
+
var MIN_ROW_HEIGHT = 6;
|
|
3000
|
+
var VERTEX_MERGE_FACTOR = 4;
|
|
3001
|
+
var MIN_COORD_MERGE_TOL = 8;
|
|
2994
3002
|
function extractLines(fnArray, argsArray) {
|
|
2995
3003
|
const horizontals = [];
|
|
2996
3004
|
const verticals = [];
|
|
2997
|
-
let ctm = [...IDENTITY];
|
|
2998
3005
|
let lineWidth = 1;
|
|
2999
|
-
const stateStack = [];
|
|
3000
3006
|
let currentPath = [];
|
|
3001
3007
|
let pathStartX = 0, pathStartY = 0;
|
|
3002
3008
|
let curX = 0, curY = 0;
|
|
@@ -3014,53 +3020,13 @@ function extractLines(fnArray, argsArray) {
|
|
|
3014
3020
|
);
|
|
3015
3021
|
}
|
|
3016
3022
|
}
|
|
3017
|
-
function
|
|
3018
|
-
if (
|
|
3019
|
-
const first = path[0], last = path[path.length - 1];
|
|
3020
|
-
const closed = Math.abs(first.x1 - last.x2) < 1 && Math.abs(first.y1 - last.y2) < 1;
|
|
3021
|
-
if (!closed) return false;
|
|
3022
|
-
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
3023
|
-
for (const seg of path) {
|
|
3024
|
-
minX = Math.min(minX, seg.x1, seg.x2);
|
|
3025
|
-
minY = Math.min(minY, seg.y1, seg.y2);
|
|
3026
|
-
maxX = Math.max(maxX, seg.x1, seg.x2);
|
|
3027
|
-
maxY = Math.max(maxY, seg.y1, seg.y2);
|
|
3028
|
-
}
|
|
3029
|
-
const w = maxX - minX, h = maxY - minY;
|
|
3030
|
-
if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;
|
|
3031
|
-
path.length = 0;
|
|
3032
|
-
if (h < ORIENTATION_TOL * 2 || w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH) {
|
|
3033
|
-
path.push({ x1: minX, y1: (minY + maxY) / 2, x2: maxX, y2: (minY + maxY) / 2 });
|
|
3034
|
-
} else if (w < ORIENTATION_TOL * 2 || h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH) {
|
|
3035
|
-
path.push({ x1: (minX + maxX) / 2, y1: minY, x2: (minX + maxX) / 2, y2: maxY });
|
|
3036
|
-
} else {
|
|
3037
|
-
pushRectangle(path, minX, minY, w, h);
|
|
3038
|
-
}
|
|
3039
|
-
return true;
|
|
3040
|
-
}
|
|
3041
|
-
function flushPath(isStroke, isFill) {
|
|
3042
|
-
if (!isStroke && !isFill) {
|
|
3043
|
-
currentPath = [];
|
|
3044
|
-
return;
|
|
3045
|
-
}
|
|
3046
|
-
if (isFill && !isStroke && currentPath.length >= 3) {
|
|
3047
|
-
tryConvertLinesToRectangle(currentPath);
|
|
3048
|
-
}
|
|
3049
|
-
const scale = matScale(ctm);
|
|
3050
|
-
const effectiveLW = lineWidth * scale;
|
|
3051
|
-
if (effectiveLW > MAX_LINE_WIDTH && isStroke && !isFill) {
|
|
3023
|
+
function flushPath(isStroke) {
|
|
3024
|
+
if (!isStroke) {
|
|
3052
3025
|
currentPath = [];
|
|
3053
3026
|
return;
|
|
3054
3027
|
}
|
|
3055
3028
|
for (const seg of currentPath) {
|
|
3056
|
-
|
|
3057
|
-
const [px2, py2] = matTransformPoint(ctm, seg.x2, seg.y2);
|
|
3058
|
-
classifyAndAdd(
|
|
3059
|
-
{ x1: px1, y1: py1, x2: px2, y2: py2 },
|
|
3060
|
-
effectiveLW,
|
|
3061
|
-
horizontals,
|
|
3062
|
-
verticals
|
|
3063
|
-
);
|
|
3029
|
+
classifyAndAdd(seg, lineWidth, horizontals, verticals);
|
|
3064
3030
|
}
|
|
3065
3031
|
currentPath = [];
|
|
3066
3032
|
}
|
|
@@ -3068,28 +3034,9 @@ function extractLines(fnArray, argsArray) {
|
|
|
3068
3034
|
const op = fnArray[i];
|
|
3069
3035
|
const args = argsArray[i];
|
|
3070
3036
|
switch (op) {
|
|
3071
|
-
// ── Graphics State ──
|
|
3072
|
-
case OPS.save:
|
|
3073
|
-
stateStack.push({ ctm: [...ctm], lineWidth });
|
|
3074
|
-
break;
|
|
3075
|
-
case OPS.restore:
|
|
3076
|
-
if (stateStack.length > 0) {
|
|
3077
|
-
const state = stateStack.pop();
|
|
3078
|
-
ctm = state.ctm;
|
|
3079
|
-
lineWidth = state.lineWidth;
|
|
3080
|
-
}
|
|
3081
|
-
break;
|
|
3082
|
-
case OPS.transform: {
|
|
3083
|
-
const m = args;
|
|
3084
|
-
if (m.length >= 6) {
|
|
3085
|
-
ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
|
|
3086
|
-
}
|
|
3087
|
-
break;
|
|
3088
|
-
}
|
|
3089
3037
|
case OPS.setLineWidth:
|
|
3090
3038
|
lineWidth = args[0] || 1;
|
|
3091
3039
|
break;
|
|
3092
|
-
// ── Path Construction ──
|
|
3093
3040
|
case OPS.constructPath: {
|
|
3094
3041
|
const arg0 = args[0];
|
|
3095
3042
|
if (Array.isArray(arg0)) {
|
|
@@ -3157,60 +3104,34 @@ function extractLines(fnArray, argsArray) {
|
|
|
3157
3104
|
}
|
|
3158
3105
|
}
|
|
3159
3106
|
}
|
|
3160
|
-
|
|
3161
|
-
|
|
3162
|
-
|
|
3163
|
-
|
|
3164
|
-
flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
|
|
3107
|
+
if (afterOp === OPS.stroke || afterOp === OPS.closeStroke) {
|
|
3108
|
+
flushPath(true);
|
|
3109
|
+
} else if (afterOp === OPS.fill || afterOp === OPS.eoFill || afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke) {
|
|
3110
|
+
flushPath(true);
|
|
3165
3111
|
} else if (afterOp === OPS.endPath) {
|
|
3166
|
-
flushPath(false
|
|
3112
|
+
flushPath(false);
|
|
3167
3113
|
}
|
|
3168
3114
|
}
|
|
3169
3115
|
break;
|
|
3170
3116
|
}
|
|
3171
|
-
// ── Paint Operations ──
|
|
3172
3117
|
case OPS.stroke:
|
|
3173
3118
|
case OPS.closeStroke:
|
|
3174
|
-
flushPath(true
|
|
3119
|
+
flushPath(true);
|
|
3175
3120
|
break;
|
|
3176
3121
|
case OPS.fill:
|
|
3177
3122
|
case OPS.eoFill:
|
|
3178
|
-
flushPath(false, true);
|
|
3179
|
-
break;
|
|
3180
3123
|
case OPS.fillStroke:
|
|
3181
3124
|
case OPS.eoFillStroke:
|
|
3182
3125
|
case OPS.closeFillStroke:
|
|
3183
3126
|
case OPS.closeEOFillStroke:
|
|
3184
|
-
flushPath(true
|
|
3127
|
+
flushPath(true);
|
|
3185
3128
|
break;
|
|
3186
3129
|
case OPS.endPath:
|
|
3187
|
-
flushPath(false
|
|
3188
|
-
break;
|
|
3189
|
-
}
|
|
3190
|
-
}
|
|
3191
|
-
return {
|
|
3192
|
-
horizontals: deduplicateLines(horizontals),
|
|
3193
|
-
verticals: deduplicateLines(verticals)
|
|
3194
|
-
};
|
|
3195
|
-
}
|
|
3196
|
-
function deduplicateLines(lines) {
|
|
3197
|
-
if (lines.length <= 1) return lines;
|
|
3198
|
-
const result = [];
|
|
3199
|
-
const tol = COORD_MERGE_TOL;
|
|
3200
|
-
for (const line of lines) {
|
|
3201
|
-
let isDuplicate = false;
|
|
3202
|
-
for (const existing of result) {
|
|
3203
|
-
if (Math.abs(line.y1 - existing.y1) <= tol && Math.abs(line.y2 - existing.y2) <= tol && Math.abs(line.x1 - existing.x1) <= tol && Math.abs(line.x2 - existing.x2) <= tol) {
|
|
3204
|
-
if (line.lineWidth > existing.lineWidth) {
|
|
3205
|
-
existing.lineWidth = line.lineWidth;
|
|
3206
|
-
}
|
|
3207
|
-
isDuplicate = true;
|
|
3130
|
+
flushPath(false);
|
|
3208
3131
|
break;
|
|
3209
|
-
}
|
|
3210
3132
|
}
|
|
3211
|
-
if (!isDuplicate) result.push(line);
|
|
3212
3133
|
}
|
|
3213
|
-
return
|
|
3134
|
+
return { horizontals, verticals };
|
|
3214
3135
|
}
|
|
3215
3136
|
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
3216
3137
|
const dx = Math.abs(seg.x2 - seg.x1);
|
|
@@ -3229,6 +3150,55 @@ function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
|
3229
3150
|
verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
|
|
3230
3151
|
}
|
|
3231
3152
|
}
|
|
3153
|
+
/**
 * Prepare raw line segments for table-grid detection.
 *
 * Segments whose `lineWidth` exceeds `MAX_LINE_WIDTH` are discarded, then the
 * remaining segments of each orientation are passed through
 * `mergeParallelLines`.
 *
 * @param {Array<{x1:number,y1:number,x2:number,y2:number,lineWidth:number}>} horizontals
 * @param {Array<{x1:number,y1:number,x2:number,y2:number,lineWidth:number}>} verticals
 * @returns {{horizontals: Array, verticals: Array}} Filtered and merged segments.
 */
function preprocessLines(horizontals, verticals) {
  const isThinEnough = (line) => line.lineWidth <= MAX_LINE_WIDTH;
  const thinHorizontals = horizontals.filter(isThinEnough);
  const thinVerticals = verticals.filter(isThinEnough);
  return {
    horizontals: mergeParallelLines(thinHorizontals, "h"),
    verticals: mergeParallelLines(thinVerticals, "v")
  };
}
|
|
3160
|
+
/**
 * Collapse near-coincident parallel segments (e.g. both edges of a thick
 * border drawn as two strokes) into single segments.
 *
 * Segments are ordered by their cross-axis position (y for "h", x for "v"),
 * then by their start along the main axis. A segment is folded into the
 * previously kept one when their positions differ by at most 3 units and
 * their main-axis overlap exceeds 30% of the shorter segment. The merged
 * segment spans the union of both extents, sits at the midpoint of the two
 * positions and takes the larger `lineWidth`.
 *
 * The input segment objects are never modified: every segment in the result
 * is a copy. (Inputs with 0 or 1 segments are returned as-is.)
 *
 * @param {Array<{x1:number,y1:number,x2:number,y2:number,lineWidth:number}>} lines
 * @param {"h"|"v"} dir - Orientation of all segments in `lines`.
 * @returns {Array<{x1:number,y1:number,x2:number,y2:number,lineWidth:number}>}
 */
function mergeParallelLines(lines, dir) {
  if (lines.length <= 1) return lines;

  const isHorizontal = dir === "h";
  const position = (l) => (isHorizontal ? l.y1 : l.x1);
  const start = (l) => (isHorizontal ? l.x1 : l.y1);
  const end = (l) => (isHorizontal ? l.x2 : l.y2);

  const sorted = [...lines].sort((a, b) => {
    const delta = position(a) - position(b);
    if (Math.abs(delta) > 0.1) return delta;
    return start(a) - start(b);
  });

  const MERGE_TOL = 3;
  // Copy each kept segment so merging never writes through to caller-owned objects.
  const result = [{ ...sorted[0] }];

  for (let i = 1; i < sorted.length; i++) {
    const prev = result[result.length - 1];
    const curr = sorted[i];

    if (Math.abs(position(prev) - position(curr)) <= MERGE_TOL) {
      const overlap = Math.min(end(prev), end(curr)) - Math.max(start(prev), start(curr));
      const minLen = Math.min(end(prev) - start(prev), end(curr) - start(curr));
      if (overlap > minLen * 0.3) {
        if (isHorizontal) {
          prev.x1 = Math.min(prev.x1, curr.x1);
          prev.x2 = Math.max(prev.x2, curr.x2);
          prev.y1 = (prev.y1 + curr.y1) / 2;
          prev.y2 = prev.y1;
        } else {
          prev.y1 = Math.min(prev.y1, curr.y1);
          prev.y2 = Math.max(prev.y2, curr.y2);
          prev.x1 = (prev.x1 + curr.x1) / 2;
          prev.x2 = prev.x1;
        }
        prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
        continue;
      }
    }

    result.push({ ...curr });
  }

  return result;
}
|
|
3232
3202
|
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
3233
3203
|
const margin = 5;
|
|
3234
3204
|
return {
|
|
@@ -3240,8 +3210,49 @@ function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
|
3240
3210
|
)
|
|
3241
3211
|
};
|
|
3242
3212
|
}
|
|
3213
|
+
/**
 * Find the intersection points between horizontal and vertical segments.
 *
 * A pair intersects when the vertical's x lies within the horizontal's x
 * extent and the horizontal's y lies within the vertical's y extent, each
 * extent widened by `tol` on both sides so that lines which stop slightly
 * short of each other still connect.
 *
 * @param {Array<{x1:number,y1:number,x2:number,lineWidth:number}>} horizontals
 * @param {Array<{x1:number,y1:number,y2:number,lineWidth:number}>} verticals
 * @param {number} [tol=CONNECT_TOL] - Connection tolerance; defaults to the
 *   module-level `CONNECT_TOL`, which was previously hard-coded here.
 * @returns {Array<{x:number,y:number,radius:number}>} One vertex per
 *   intersecting pair; `radius` is the larger of the two line widths, at
 *   least 1.
 */
function buildVertices(horizontals, verticals, tol = CONNECT_TOL) {
  const vertices = [];
  for (const h of horizontals) {
    for (const v of verticals) {
      const xInRange = v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol;
      const yInRange = h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol;
      if (xInRange && yInRange) {
        vertices.push({
          x: v.x1,
          y: h.y1,
          radius: Math.max(h.lineWidth, v.lineWidth, 1)
        });
      }
    }
  }
  return vertices;
}
|
|
3226
|
+
/**
 * Cluster vertices that lie close together and replace each cluster with a
 * single vertex at the mean position.
 *
 * Each not-yet-consumed vertex seeds a cluster. A later vertex joins when
 * both its x and y distance to the seed are within
 * `mergeFactor * max(cluster radius so far, candidate radius)`. The merged
 * vertex carries the largest radius seen in its cluster.
 *
 * @param {Array<{x:number,y:number,radius:number}>} vertices
 * @param {number} [mergeFactor=VERTEX_MERGE_FACTOR] - Radius multiplier for
 *   the merge distance; defaults to the module-level `VERTEX_MERGE_FACTOR`,
 *   which was previously hard-coded here.
 * @returns {Array<{x:number,y:number,radius:number}>} Merged vertices (inputs
 *   with 0 or 1 vertices are returned as-is).
 */
function mergeVertices(vertices, mergeFactor = VERTEX_MERGE_FACTOR) {
  if (vertices.length <= 1) return vertices;

  const merged = [];
  const used = new Array(vertices.length).fill(false);

  for (let i = 0; i < vertices.length; i++) {
    if (used[i]) continue;

    const seed = vertices[i];
    let sumX = seed.x;
    let sumY = seed.y;
    let maxRadius = seed.radius;
    let count = 1;

    for (let j = i + 1; j < vertices.length; j++) {
      if (used[j]) continue;

      const candidate = vertices[j];
      const mergeTol = mergeFactor * Math.max(maxRadius, candidate.radius);
      // Distance is measured from the seed, not from the running centroid.
      const closeX = Math.abs(seed.x - candidate.x) <= mergeTol;
      const closeY = Math.abs(seed.y - candidate.y) <= mergeTol;
      if (closeX && closeY) {
        sumX += candidate.x;
        sumY += candidate.y;
        maxRadius = Math.max(maxRadius, candidate.radius);
        count++;
        used[j] = true;
      }
    }

    merged.push({ x: sumX / count, y: sumY / count, radius: maxRadius });
  }

  return merged;
}
|
|
3243
3250
|
function buildTableGrids(horizontals, verticals) {
|
|
3244
3251
|
if (horizontals.length < 2 || verticals.length < 2) return [];
|
|
3252
|
+
const allVertices = buildVertices(horizontals, verticals);
|
|
3253
|
+
const vertices = mergeVertices(allVertices);
|
|
3254
|
+
if (vertices.length < 4) return [];
|
|
3255
|
+
const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
|
|
3245
3256
|
const allLines = [
|
|
3246
3257
|
...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
|
|
3247
3258
|
...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
|
|
@@ -3252,21 +3263,74 @@ function buildTableGrids(horizontals, verticals) {
|
|
|
3252
3263
|
const hLines = group.filter((l) => l.type === "h");
|
|
3253
3264
|
const vLines = group.filter((l) => l.type === "v");
|
|
3254
3265
|
if (hLines.length < 2 || vLines.length < 2) continue;
|
|
3255
|
-
|
|
3256
|
-
const
|
|
3257
|
-
|
|
3258
|
-
|
|
3266
|
+
let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
|
|
3267
|
+
for (const l of vLines) {
|
|
3268
|
+
if (l.x1 < gx1) gx1 = l.x1;
|
|
3269
|
+
if (l.x1 > gx2) gx2 = l.x1;
|
|
3270
|
+
}
|
|
3271
|
+
for (const l of hLines) {
|
|
3272
|
+
if (l.y1 < gy1) gy1 = l.y1;
|
|
3273
|
+
if (l.y1 > gy2) gy2 = l.y1;
|
|
3274
|
+
}
|
|
3275
|
+
const groupBbox = {
|
|
3276
|
+
x1: gx1 - CONNECT_TOL,
|
|
3277
|
+
y1: gy1 - CONNECT_TOL,
|
|
3278
|
+
x2: gx2 + CONNECT_TOL,
|
|
3279
|
+
y2: gy2 + CONNECT_TOL
|
|
3280
|
+
};
|
|
3281
|
+
const groupVertices = vertices.filter(
|
|
3282
|
+
(v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
|
|
3283
|
+
);
|
|
3284
|
+
const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
|
|
3285
|
+
const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
|
|
3286
|
+
const rawYs = [
|
|
3287
|
+
...hLines.map((l) => l.y1),
|
|
3288
|
+
...groupVertices.map((v) => v.y)
|
|
3289
|
+
];
|
|
3290
|
+
const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
|
|
3291
|
+
const rawXs = [
|
|
3292
|
+
...vLines.map((l) => l.x1),
|
|
3293
|
+
...groupVertices.map((v) => v.x)
|
|
3294
|
+
];
|
|
3295
|
+
const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
|
|
3259
3296
|
if (rowYs.length < 2 || colXs.length < 2) continue;
|
|
3297
|
+
const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
|
|
3298
|
+
const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
|
|
3299
|
+
if (validRowYs.length < 2 || validColXs.length < 2) continue;
|
|
3260
3300
|
const bbox = {
|
|
3261
|
-
x1:
|
|
3262
|
-
y1:
|
|
3263
|
-
x2:
|
|
3264
|
-
y2:
|
|
3301
|
+
x1: validColXs[0],
|
|
3302
|
+
y1: validRowYs[validRowYs.length - 1],
|
|
3303
|
+
x2: validColXs[validColXs.length - 1],
|
|
3304
|
+
y2: validRowYs[0]
|
|
3265
3305
|
};
|
|
3266
|
-
grids.push({ rowYs, colXs, bbox });
|
|
3306
|
+
grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
|
|
3267
3307
|
}
|
|
3268
3308
|
return mergeAdjacentGrids(grids);
|
|
3269
3309
|
}
|
|
3310
|
+
/**
 * Drop column boundaries that would create columns narrower than `minWidth`.
 *
 * `colXs` is expected in ascending order. The first and last boundaries (the
 * outer edges of the grid) are always kept. An interior boundary is dropped
 * when it lies closer than `minWidth` to the previously kept boundary.
 *
 * Previously the last boundary was appended unconditionally, so a final
 * column could still end up narrower than `minWidth`. Now, when the last
 * boundary crowds a kept interior boundary, that interior boundary is removed
 * instead, so every remaining column satisfies the minimum (unless only the
 * two outer edges are left).
 *
 * @param {number[]} colXs - Ascending column boundary x coordinates.
 * @param {number} minWidth - Minimum allowed column width.
 * @returns {number[]} Filtered boundaries (inputs with ≤ 2 entries are
 *   returned as-is).
 */
function enforceMinWidth(colXs, minWidth) {
  if (colXs.length <= 2) return colXs;

  const lastIndex = colXs.length - 1;
  const result = [colXs[0]];

  for (let i = 1; i <= lastIndex; i++) {
    const x = colXs[i];
    const tooNarrow = x - result[result.length - 1] < minWidth;

    if (!tooNarrow) {
      result.push(x);
    } else if (i === lastIndex) {
      if (result.length > 1) {
        // The outer edge wins over the interior boundary that crowds it.
        result[result.length - 1] = x;
      } else {
        // Only the left edge is kept so far; a grid still needs two edges.
        result.push(x);
      }
    }
    // Interior boundary that is too close: skip it.
  }

  return result;
}
|
|
3322
|
+
/**
 * Drop row boundaries that would create rows shorter than `minHeight`.
 *
 * `rowYs` is expected in descending order (the height test is
 * `previous - current`). The first and last boundaries (the outer edges of
 * the grid) are always kept. An interior boundary is dropped when it lies
 * closer than `minHeight` to the previously kept boundary.
 *
 * Previously the last boundary was appended unconditionally, so a final row
 * could still end up shorter than `minHeight`. Now, when the last boundary
 * crowds a kept interior boundary, that interior boundary is removed instead,
 * so every remaining row satisfies the minimum (unless only the two outer
 * edges are left).
 *
 * @param {number[]} rowYs - Descending row boundary y coordinates.
 * @param {number} minHeight - Minimum allowed row height.
 * @returns {number[]} Filtered boundaries (inputs with ≤ 2 entries are
 *   returned as-is).
 */
function enforceMinHeight(rowYs, minHeight) {
  if (rowYs.length <= 2) return rowYs;

  const lastIndex = rowYs.length - 1;
  const result = [rowYs[0]];

  for (let i = 1; i <= lastIndex; i++) {
    const y = rowYs[i];
    const tooShort = result[result.length - 1] - y < minHeight;

    if (!tooShort) {
      result.push(y);
    } else if (i === lastIndex) {
      if (result.length > 1) {
        // The outer edge wins over the interior boundary that crowds it.
        result[result.length - 1] = y;
      } else {
        // Only the top edge is kept so far; a grid still needs two edges.
        result.push(y);
      }
    }
    // Interior boundary that is too close: skip it.
  }

  return result;
}
|
|
3270
3334
|
function mergeAdjacentGrids(grids) {
|
|
3271
3335
|
if (grids.length <= 1) return grids;
|
|
3272
3336
|
const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
@@ -3275,9 +3339,10 @@ function mergeAdjacentGrids(grids) {
|
|
|
3275
3339
|
const prev = merged[merged.length - 1];
|
|
3276
3340
|
const curr = sorted[i];
|
|
3277
3341
|
if (prev.colXs.length === curr.colXs.length) {
|
|
3278
|
-
const
|
|
3342
|
+
const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
|
|
3343
|
+
const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
|
|
3279
3344
|
const verticalGap = prev.bbox.y1 - curr.bbox.y2;
|
|
3280
|
-
if (colMatch && verticalGap >= -
|
|
3345
|
+
if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
|
|
3281
3346
|
const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
|
|
3282
3347
|
merged[merged.length - 1] = {
|
|
3283
3348
|
rowYs: allRowYs,
|
|
@@ -3287,7 +3352,8 @@ function mergeAdjacentGrids(grids) {
|
|
|
3287
3352
|
y1: Math.min(prev.bbox.y1, curr.bbox.y1),
|
|
3288
3353
|
x2: Math.max(prev.bbox.x2, curr.bbox.x2),
|
|
3289
3354
|
y2: Math.max(prev.bbox.y2, curr.bbox.y2)
|
|
3290
|
-
}
|
|
3355
|
+
},
|
|
3356
|
+
vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
|
|
3291
3357
|
};
|
|
3292
3358
|
continue;
|
|
3293
3359
|
}
|
|
@@ -3296,14 +3362,14 @@ function mergeAdjacentGrids(grids) {
|
|
|
3296
3362
|
}
|
|
3297
3363
|
return merged;
|
|
3298
3364
|
}
|
|
3299
|
-
function clusterCoordinates(values) {
|
|
3365
|
+
function clusterCoordinates(values, tolerance) {
|
|
3300
3366
|
if (values.length === 0) return [];
|
|
3301
3367
|
const sorted = [...values].sort((a, b) => a - b);
|
|
3302
3368
|
const clusters = [{ sum: sorted[0], count: 1 }];
|
|
3303
3369
|
for (let i = 1; i < sorted.length; i++) {
|
|
3304
3370
|
const last = clusters[clusters.length - 1];
|
|
3305
3371
|
const avg = last.sum / last.count;
|
|
3306
|
-
if (Math.abs(sorted[i] - avg) <=
|
|
3372
|
+
if (Math.abs(sorted[i] - avg) <= tolerance) {
|
|
3307
3373
|
last.sum += sorted[i];
|
|
3308
3374
|
last.count++;
|
|
3309
3375
|
} else {
|
|
@@ -3360,6 +3426,20 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3360
3426
|
const numRows = rowYs.length - 1;
|
|
3361
3427
|
const numCols = colXs.length - 1;
|
|
3362
3428
|
if (numRows <= 0 || numCols <= 0) return [];
|
|
3429
|
+
const vBorders = Array.from(
|
|
3430
|
+
{ length: numRows },
|
|
3431
|
+
(_, r) => Array.from(
|
|
3432
|
+
{ length: numCols + 1 },
|
|
3433
|
+
(_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
|
|
3434
|
+
)
|
|
3435
|
+
);
|
|
3436
|
+
const hBorders = Array.from(
|
|
3437
|
+
{ length: numRows + 1 },
|
|
3438
|
+
(_, r) => Array.from(
|
|
3439
|
+
{ length: numCols },
|
|
3440
|
+
(_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
|
|
3441
|
+
)
|
|
3442
|
+
);
|
|
3363
3443
|
const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
|
|
3364
3444
|
const cells = [];
|
|
3365
3445
|
for (let r = 0; r < numRows; r++) {
|
|
@@ -3367,18 +3447,26 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3367
3447
|
if (occupied[r][c]) continue;
|
|
3368
3448
|
let colSpan = 1;
|
|
3369
3449
|
let rowSpan = 1;
|
|
3370
|
-
while (c + colSpan < numCols) {
|
|
3371
|
-
|
|
3372
|
-
|
|
3373
|
-
|
|
3374
|
-
|
|
3450
|
+
while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
|
|
3451
|
+
let canExpand = true;
|
|
3452
|
+
for (let dr = 0; dr < rowSpan; dr++) {
|
|
3453
|
+
if (vBorders[r + dr][c + colSpan]) {
|
|
3454
|
+
canExpand = false;
|
|
3455
|
+
break;
|
|
3456
|
+
}
|
|
3457
|
+
}
|
|
3458
|
+
if (!canExpand) break;
|
|
3375
3459
|
colSpan++;
|
|
3376
3460
|
}
|
|
3377
3461
|
while (r + rowSpan < numRows) {
|
|
3378
|
-
|
|
3379
|
-
|
|
3380
|
-
|
|
3381
|
-
|
|
3462
|
+
let hasLine = false;
|
|
3463
|
+
for (let dc = 0; dc < colSpan; dc++) {
|
|
3464
|
+
if (hBorders[r + rowSpan][c + dc]) {
|
|
3465
|
+
hasLine = true;
|
|
3466
|
+
break;
|
|
3467
|
+
}
|
|
3468
|
+
}
|
|
3469
|
+
if (hasLine) break;
|
|
3382
3470
|
rowSpan++;
|
|
3383
3471
|
}
|
|
3384
3472
|
for (let dr = 0; dr < rowSpan; dr++) {
|
|
@@ -3402,28 +3490,30 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3402
3490
|
}
|
|
3403
3491
|
return cells;
|
|
3404
3492
|
}
|
|
3405
|
-
function hasVerticalLine(verticals, x, topY, botY) {
|
|
3406
|
-
const tol =
|
|
3493
|
+
function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
|
|
3494
|
+
const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
|
|
3407
3495
|
for (const v of verticals) {
|
|
3408
3496
|
if (Math.abs(v.x1 - x) <= tol) {
|
|
3409
3497
|
const cellH = Math.abs(topY - botY);
|
|
3498
|
+
if (cellH < 0.1) continue;
|
|
3410
3499
|
const overlapTop = Math.min(v.y2, topY);
|
|
3411
3500
|
const overlapBot = Math.max(v.y1, botY);
|
|
3412
3501
|
const overlap = overlapTop - overlapBot;
|
|
3413
|
-
if (overlap >= cellH * 0.
|
|
3502
|
+
if (overlap >= cellH * 0.75) return true;
|
|
3414
3503
|
}
|
|
3415
3504
|
}
|
|
3416
3505
|
return false;
|
|
3417
3506
|
}
|
|
3418
|
-
function hasHorizontalLine(horizontals, y, leftX, rightX) {
|
|
3419
|
-
const tol =
|
|
3507
|
+
function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
|
|
3508
|
+
const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
|
|
3420
3509
|
for (const h of horizontals) {
|
|
3421
3510
|
if (Math.abs(h.y1 - y) <= tol) {
|
|
3422
3511
|
const cellW = Math.abs(rightX - leftX);
|
|
3512
|
+
if (cellW < 0.1) continue;
|
|
3423
3513
|
const overlapLeft = Math.max(h.x1, leftX);
|
|
3424
3514
|
const overlapRight = Math.min(h.x2, rightX);
|
|
3425
3515
|
const overlap = overlapRight - overlapLeft;
|
|
3426
|
-
if (overlap >= cellW * 0.
|
|
3516
|
+
if (overlap >= cellW * 0.75) return true;
|
|
3427
3517
|
}
|
|
3428
3518
|
}
|
|
3429
3519
|
return false;
|
|
@@ -3434,23 +3524,24 @@ function mapTextToCells(items, cells) {
|
|
|
3434
3524
|
result.set(cell, []);
|
|
3435
3525
|
}
|
|
3436
3526
|
for (const item of items) {
|
|
3437
|
-
const cx = item.x + item.w / 2;
|
|
3438
|
-
const cy = item.y;
|
|
3439
3527
|
const pad = CELL_PADDING;
|
|
3440
3528
|
let bestCell = null;
|
|
3441
|
-
let
|
|
3529
|
+
let bestScore = 0;
|
|
3442
3530
|
for (const cell of cells) {
|
|
3443
|
-
|
|
3444
|
-
|
|
3445
|
-
|
|
3446
|
-
|
|
3447
|
-
|
|
3448
|
-
|
|
3449
|
-
|
|
3450
|
-
|
|
3531
|
+
const ix1 = Math.max(item.x, cell.bbox.x1 - pad);
|
|
3532
|
+
const ix2 = Math.min(item.x + item.w, cell.bbox.x2 + pad);
|
|
3533
|
+
const iy1 = Math.max(item.y, cell.bbox.y1 - pad);
|
|
3534
|
+
const iy2 = Math.min(item.y + (item.h || item.fontSize), cell.bbox.y2 + pad);
|
|
3535
|
+
if (ix1 >= ix2 || iy1 >= iy2) continue;
|
|
3536
|
+
const intersectArea = (ix2 - ix1) * (iy2 - iy1);
|
|
3537
|
+
const itemArea = Math.max(item.w, 1) * Math.max(item.h || item.fontSize, 1);
|
|
3538
|
+
const score = intersectArea / itemArea;
|
|
3539
|
+
if (score > bestScore) {
|
|
3540
|
+
bestScore = score;
|
|
3541
|
+
bestCell = cell;
|
|
3451
3542
|
}
|
|
3452
3543
|
}
|
|
3453
|
-
if (bestCell) {
|
|
3544
|
+
if (bestCell && bestScore > 0.3) {
|
|
3454
3545
|
result.get(bestCell).push(item);
|
|
3455
3546
|
}
|
|
3456
3547
|
}
|
|
@@ -3477,8 +3568,13 @@ function cellTextToString(items) {
|
|
|
3477
3568
|
const textLines = lines.map((line) => {
|
|
3478
3569
|
const s = line.sort((a, b) => a.x - b.x);
|
|
3479
3570
|
if (s.length === 1) return s[0].text;
|
|
3571
|
+
const evenSpaced = detectEvenSpacedItems(s);
|
|
3480
3572
|
let result = s[0].text;
|
|
3481
3573
|
for (let j = 1; j < s.length; j++) {
|
|
3574
|
+
if (evenSpaced[j]) {
|
|
3575
|
+
result += s[j].text;
|
|
3576
|
+
continue;
|
|
3577
|
+
}
|
|
3482
3578
|
const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
|
|
3483
3579
|
const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
|
|
3484
3580
|
const prevIsKorean = /[가-힣]$/.test(result);
|
|
@@ -3493,6 +3589,57 @@ function cellTextToString(items) {
|
|
|
3493
3589
|
}
|
|
3494
3590
|
return result;
|
|
3495
3591
|
});
|
|
3592
|
+
return mergeCellTextLines(textLines);
|
|
3593
|
+
}
|
|
3594
|
+
function detectEvenSpacedItems(items) {
|
|
3595
|
+
const result = new Array(items.length).fill(false);
|
|
3596
|
+
if (items.length < 3) return result;
|
|
3597
|
+
let runStart = -1;
|
|
3598
|
+
for (let i = 0; i < items.length; i++) {
|
|
3599
|
+
const isShortKorean = /^[가-힣]{1}$/.test(items[i].text) || /^[\d]{1}$/.test(items[i].text);
|
|
3600
|
+
if (isShortKorean && runStart >= 0 && i > 0) {
|
|
3601
|
+
const gap = items[i].x - (items[i - 1].x + items[i - 1].w);
|
|
3602
|
+
const maxRunGap = Math.max(items[i].fontSize * 3, 30);
|
|
3603
|
+
if (gap > maxRunGap) {
|
|
3604
|
+
if (i - runStart >= 3) markEvenRun(items, result, runStart, i);
|
|
3605
|
+
runStart = i;
|
|
3606
|
+
continue;
|
|
3607
|
+
}
|
|
3608
|
+
}
|
|
3609
|
+
if (isShortKorean) {
|
|
3610
|
+
if (runStart < 0) runStart = i;
|
|
3611
|
+
} else {
|
|
3612
|
+
if (runStart >= 0 && i - runStart >= 3) {
|
|
3613
|
+
markEvenRun(items, result, runStart, i);
|
|
3614
|
+
}
|
|
3615
|
+
runStart = -1;
|
|
3616
|
+
}
|
|
3617
|
+
}
|
|
3618
|
+
if (runStart >= 0 && items.length - runStart >= 3) {
|
|
3619
|
+
markEvenRun(items, result, runStart, items.length);
|
|
3620
|
+
}
|
|
3621
|
+
return result;
|
|
3622
|
+
}
|
|
3623
|
+
function markEvenRun(items, result, start, end) {
|
|
3624
|
+
const gaps = [];
|
|
3625
|
+
for (let i = start + 1; i < end; i++) {
|
|
3626
|
+
gaps.push(items[i].x - (items[i - 1].x + items[i - 1].w));
|
|
3627
|
+
}
|
|
3628
|
+
const posGaps = gaps.filter((g2) => g2 > 0);
|
|
3629
|
+
if (posGaps.length < 2) return;
|
|
3630
|
+
let minGap = Infinity, maxGap = -Infinity;
|
|
3631
|
+
for (const g2 of posGaps) {
|
|
3632
|
+
if (g2 < minGap) minGap = g2;
|
|
3633
|
+
if (g2 > maxGap) maxGap = g2;
|
|
3634
|
+
}
|
|
3635
|
+
const avgFs = items[start].fontSize;
|
|
3636
|
+
if (minGap >= avgFs * 0.1 && maxGap <= avgFs * 3 && maxGap / Math.max(minGap, 0.1) <= 3) {
|
|
3637
|
+
for (let i = start + 1; i < end; i++) {
|
|
3638
|
+
result[i] = true;
|
|
3639
|
+
}
|
|
3640
|
+
}
|
|
3641
|
+
}
|
|
3642
|
+
function mergeCellTextLines(textLines) {
|
|
3496
3643
|
if (textLines.length <= 1) return textLines[0] || "";
|
|
3497
3644
|
const merged = [textLines[0]];
|
|
3498
3645
|
for (let i = 1; i < textLines.length; i++) {
|
|
@@ -3518,24 +3665,172 @@ var Y_TOL = 3;
|
|
|
3518
3665
|
var COL_CLUSTER_TOL = 15;
|
|
3519
3666
|
var MIN_ROWS = 3;
|
|
3520
3667
|
var MIN_COLS = 2;
|
|
3521
|
-
var MIN_GAP_FACTOR =
|
|
3522
|
-
var
|
|
3668
|
+
var MIN_GAP_FACTOR = 2;
|
|
3669
|
+
var MIN_GAP_ABSOLUTE = 20;
|
|
3670
|
+
var MIN_COL_FILL_RATIO = 0.4;
|
|
3523
3671
|
function detectClusterTables(items, pageNum) {
|
|
3524
3672
|
if (items.length < MIN_ROWS * MIN_COLS) return [];
|
|
3525
|
-
const
|
|
3673
|
+
const { merged, originMap } = mergeEvenSpacedClusters(items);
|
|
3674
|
+
const rows = groupByBaseline(merged);
|
|
3526
3675
|
if (rows.length < MIN_ROWS) return [];
|
|
3527
|
-
const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
|
|
3528
|
-
if (suspiciousRows.length < MIN_ROWS) return [];
|
|
3529
|
-
const columns = extractColumnClusters(suspiciousRows);
|
|
3530
|
-
if (columns.length < MIN_COLS) return [];
|
|
3531
|
-
const tableRegions = findTableRegions(rows, columns);
|
|
3532
3676
|
const results = [];
|
|
3533
|
-
|
|
3534
|
-
|
|
3535
|
-
|
|
3677
|
+
const headerResult = detectHeaderRow(rows);
|
|
3678
|
+
if (headerResult) {
|
|
3679
|
+
const { columns, headerIdx } = headerResult;
|
|
3680
|
+
const headerRow = rows[headerIdx];
|
|
3681
|
+
const headerItems = [...headerRow.items].sort((a, b) => a.x - b.x);
|
|
3682
|
+
const headerAndBelow = rows.slice(headerIdx);
|
|
3683
|
+
const mergedRows = mergeMultiLineRows(headerAndBelow, columns);
|
|
3684
|
+
const tableRegions = findTableRegionsByHeader(mergedRows, columns, headerItems);
|
|
3685
|
+
for (const region of tableRegions) {
|
|
3686
|
+
const table = buildClusterTable(region.rows, columns, pageNum);
|
|
3687
|
+
if (table) {
|
|
3688
|
+
expandUsedItems(table.usedItems, originMap);
|
|
3689
|
+
results.push(table);
|
|
3690
|
+
}
|
|
3691
|
+
}
|
|
3692
|
+
}
|
|
3693
|
+
if (results.length === 0) {
|
|
3694
|
+
const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
|
|
3695
|
+
if (suspiciousRows.length >= MIN_ROWS) {
|
|
3696
|
+
const columns = extractColumnClusters(suspiciousRows);
|
|
3697
|
+
if (columns.length >= MIN_COLS) {
|
|
3698
|
+
const tableRegions = findTableRegions(rows, columns);
|
|
3699
|
+
for (const region of tableRegions) {
|
|
3700
|
+
const mergedRows = mergeMultiLineRows(region.rows, columns);
|
|
3701
|
+
const table = buildClusterTable(mergedRows, columns, pageNum);
|
|
3702
|
+
if (table) {
|
|
3703
|
+
expandUsedItems(table.usedItems, originMap);
|
|
3704
|
+
results.push(table);
|
|
3705
|
+
}
|
|
3706
|
+
}
|
|
3707
|
+
}
|
|
3708
|
+
}
|
|
3536
3709
|
}
|
|
3537
3710
|
return results;
|
|
3538
3711
|
}
|
|
3712
|
+
function mergeEvenSpacedClusters(items) {
|
|
3713
|
+
const originMap = /* @__PURE__ */ new Map();
|
|
3714
|
+
const rows = groupByBaseline(items);
|
|
3715
|
+
const merged = [];
|
|
3716
|
+
for (const row of rows) {
|
|
3717
|
+
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3718
|
+
let i = 0;
|
|
3719
|
+
while (i < sorted.length) {
|
|
3720
|
+
if (/^[가-힣\d]$/.test(sorted[i].text)) {
|
|
3721
|
+
let runEnd = i + 1;
|
|
3722
|
+
while (runEnd < sorted.length && /^[가-힣\d]$/.test(sorted[runEnd].text)) {
|
|
3723
|
+
const gap = sorted[runEnd].x - (sorted[runEnd - 1].x + sorted[runEnd - 1].w);
|
|
3724
|
+
const fs = sorted[runEnd].fontSize;
|
|
3725
|
+
if (gap < fs * 0.1 || gap > fs * 3) break;
|
|
3726
|
+
runEnd++;
|
|
3727
|
+
}
|
|
3728
|
+
if (runEnd - i >= 3) {
|
|
3729
|
+
const gaps = [];
|
|
3730
|
+
for (let g2 = i + 1; g2 < runEnd; g2++) {
|
|
3731
|
+
gaps.push(sorted[g2].x - (sorted[g2 - 1].x + sorted[g2 - 1].w));
|
|
3732
|
+
}
|
|
3733
|
+
let minG = Infinity, maxG = -Infinity;
|
|
3734
|
+
for (const g2 of gaps) {
|
|
3735
|
+
if (g2 < minG) minG = g2;
|
|
3736
|
+
if (g2 > maxG) maxG = g2;
|
|
3737
|
+
}
|
|
3738
|
+
if (minG > 0 && maxG / minG <= 3) {
|
|
3739
|
+
const run = sorted.slice(i, runEnd);
|
|
3740
|
+
const text = run.map((r) => r.text).join("");
|
|
3741
|
+
const first = run[0], last = run[runEnd - i - 1];
|
|
3742
|
+
const item = {
|
|
3743
|
+
text,
|
|
3744
|
+
x: first.x,
|
|
3745
|
+
y: first.y,
|
|
3746
|
+
w: last.x + last.w - first.x,
|
|
3747
|
+
h: first.h,
|
|
3748
|
+
fontSize: first.fontSize,
|
|
3749
|
+
fontName: first.fontName
|
|
3750
|
+
};
|
|
3751
|
+
originMap.set(item, run);
|
|
3752
|
+
merged.push(item);
|
|
3753
|
+
i = runEnd;
|
|
3754
|
+
continue;
|
|
3755
|
+
}
|
|
3756
|
+
}
|
|
3757
|
+
}
|
|
3758
|
+
merged.push(sorted[i]);
|
|
3759
|
+
i++;
|
|
3760
|
+
}
|
|
3761
|
+
}
|
|
3762
|
+
return { merged, originMap };
|
|
3763
|
+
}
|
|
3764
|
+
function expandUsedItems(usedItems, originMap) {
|
|
3765
|
+
const toAdd = [];
|
|
3766
|
+
for (const item of usedItems) {
|
|
3767
|
+
const origins = originMap.get(item);
|
|
3768
|
+
if (origins) for (const o of origins) toAdd.push(o);
|
|
3769
|
+
}
|
|
3770
|
+
for (const a of toAdd) usedItems.add(a);
|
|
3771
|
+
}
|
|
3772
|
+
function detectHeaderRow(rows) {
|
|
3773
|
+
const allItems = rows.flatMap((r) => r.items);
|
|
3774
|
+
if (allItems.length === 0) return null;
|
|
3775
|
+
let allMinX = Infinity, allMaxX = -Infinity;
|
|
3776
|
+
for (const i of allItems) {
|
|
3777
|
+
if (i.x < allMinX) allMinX = i.x;
|
|
3778
|
+
const r = i.x + i.w;
|
|
3779
|
+
if (r > allMaxX) allMaxX = r;
|
|
3780
|
+
}
|
|
3781
|
+
const pageSpan = allMaxX - allMinX;
|
|
3782
|
+
if (pageSpan <= 0) return null;
|
|
3783
|
+
for (let ri = 0; ri < rows.length; ri++) {
|
|
3784
|
+
const row = rows[ri];
|
|
3785
|
+
if (row.items.length < MIN_COLS || row.items.length > 6) continue;
|
|
3786
|
+
if (row.items.some((i) => i.text.length > 8)) continue;
|
|
3787
|
+
if (!row.items.some((i) => /[가-힣]/.test(i.text))) continue;
|
|
3788
|
+
if (row.items.some((i) => /^[□■○●·※▶▷◆◇\-]/.test(i.text))) continue;
|
|
3789
|
+
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3790
|
+
const xSpan = sorted[sorted.length - 1].x + sorted[sorted.length - 1].w - sorted[0].x;
|
|
3791
|
+
if (xSpan / pageSpan < 0.4) continue;
|
|
3792
|
+
const avgFs = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
|
|
3793
|
+
let hasLargeGap = false;
|
|
3794
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
3795
|
+
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
3796
|
+
if (gap >= avgFs * 2.5) {
|
|
3797
|
+
hasLargeGap = true;
|
|
3798
|
+
break;
|
|
3799
|
+
}
|
|
3800
|
+
}
|
|
3801
|
+
if (!hasLargeGap) continue;
|
|
3802
|
+
const columns = sorted.map((item) => ({ x: item.x, count: 0 }));
|
|
3803
|
+
let matchCount = 0;
|
|
3804
|
+
for (let j = ri + 1; j < rows.length && matchCount < MIN_ROWS + 2; j++) {
|
|
3805
|
+
const matched = countMatchedColumnsRange(rows[j], columns, sorted);
|
|
3806
|
+
if (matched >= MIN_COLS) matchCount++;
|
|
3807
|
+
}
|
|
3808
|
+
if (matchCount < MIN_ROWS) continue;
|
|
3809
|
+
return { columns, headerIdx: ri };
|
|
3810
|
+
}
|
|
3811
|
+
return null;
|
|
3812
|
+
}
|
|
3813
|
+
function mergeMultiLineRows(rows, columns) {
|
|
3814
|
+
if (rows.length <= 1) return rows;
|
|
3815
|
+
const result = [rows[0]];
|
|
3816
|
+
const allFontSizes = rows.flatMap((r) => r.items).map((i) => i.fontSize);
|
|
3817
|
+
const avgFontSize = allFontSizes.length > 0 ? allFontSizes.reduce((s, v) => s + v, 0) / allFontSizes.length : 12;
|
|
3818
|
+
for (let i = 1; i < rows.length; i++) {
|
|
3819
|
+
const prev = result[result.length - 1];
|
|
3820
|
+
const curr = rows[i];
|
|
3821
|
+
const yGap = Math.abs(prev.y - curr.y);
|
|
3822
|
+
const matchedCols = countMatchedColumns(curr, columns);
|
|
3823
|
+
if (yGap < avgFontSize * 1.8 && curr.items.length <= 2 && (matchedCols < MIN_COLS || curr.items.length === 1)) {
|
|
3824
|
+
result[result.length - 1] = {
|
|
3825
|
+
y: prev.y,
|
|
3826
|
+
items: [...prev.items, ...curr.items]
|
|
3827
|
+
};
|
|
3828
|
+
} else {
|
|
3829
|
+
result.push(curr);
|
|
3830
|
+
}
|
|
3831
|
+
}
|
|
3832
|
+
return result;
|
|
3833
|
+
}
|
|
3539
3834
|
function groupByBaseline(items) {
|
|
3540
3835
|
if (items.length === 0) return [];
|
|
3541
3836
|
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
@@ -3557,8 +3852,9 @@ function groupByBaseline(items) {
|
|
|
3557
3852
|
function hasSuspiciousGaps(row) {
|
|
3558
3853
|
if (row.items.length < 2) return false;
|
|
3559
3854
|
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3855
|
+
if (sorted.length === 2 && sorted[1].text.length > 20) return false;
|
|
3560
3856
|
const avgFontSize = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
|
|
3561
|
-
const minGap = avgFontSize * MIN_GAP_FACTOR;
|
|
3857
|
+
const minGap = Math.max(avgFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
|
|
3562
3858
|
for (let i = 1; i < sorted.length; i++) {
|
|
3563
3859
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
3564
3860
|
if (gap >= minGap) return true;
|
|
@@ -3585,6 +3881,41 @@ function extractColumnClusters(rows) {
|
|
|
3585
3881
|
const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
|
|
3586
3882
|
return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
|
|
3587
3883
|
}
|
|
3884
|
+
function findTableRegionsByHeader(allRows, columns, headerItems) {
|
|
3885
|
+
const regions = [];
|
|
3886
|
+
let currentRegion = [];
|
|
3887
|
+
let missStreak = 0;
|
|
3888
|
+
for (const row of allRows) {
|
|
3889
|
+
const matchedCols = countMatchedColumnsRange(row, columns, headerItems);
|
|
3890
|
+
if (matchedCols >= MIN_COLS) {
|
|
3891
|
+
currentRegion.push(row);
|
|
3892
|
+
missStreak = 0;
|
|
3893
|
+
} else if (currentRegion.length > 0 && (row.items.length <= 2 || missStreak === 0)) {
|
|
3894
|
+
currentRegion.push(row);
|
|
3895
|
+
missStreak++;
|
|
3896
|
+
} else {
|
|
3897
|
+
while (currentRegion.length > 0) {
|
|
3898
|
+
const last = currentRegion[currentRegion.length - 1];
|
|
3899
|
+
if (countMatchedColumnsRange(last, columns, headerItems) >= MIN_COLS) break;
|
|
3900
|
+
currentRegion.pop();
|
|
3901
|
+
}
|
|
3902
|
+
if (currentRegion.length >= MIN_ROWS) {
|
|
3903
|
+
regions.push({ rows: [...currentRegion] });
|
|
3904
|
+
}
|
|
3905
|
+
currentRegion = [];
|
|
3906
|
+
missStreak = 0;
|
|
3907
|
+
}
|
|
3908
|
+
}
|
|
3909
|
+
while (currentRegion.length > 0) {
|
|
3910
|
+
const last = currentRegion[currentRegion.length - 1];
|
|
3911
|
+
if (countMatchedColumnsRange(last, columns, headerItems) >= MIN_COLS) break;
|
|
3912
|
+
currentRegion.pop();
|
|
3913
|
+
}
|
|
3914
|
+
if (currentRegion.length >= MIN_ROWS) {
|
|
3915
|
+
regions.push({ rows: currentRegion });
|
|
3916
|
+
}
|
|
3917
|
+
return regions;
|
|
3918
|
+
}
|
|
3588
3919
|
function findTableRegions(allRows, columns) {
|
|
3589
3920
|
const regions = [];
|
|
3590
3921
|
let currentRegion = [];
|
|
@@ -3620,18 +3951,81 @@ function countMatchedColumns(row, columns) {
|
|
|
3620
3951
|
}
|
|
3621
3952
|
return matched.size;
|
|
3622
3953
|
}
|
|
3623
|
-
function
|
|
3624
|
-
const
|
|
3625
|
-
let
|
|
3626
|
-
|
|
3627
|
-
|
|
3628
|
-
|
|
3629
|
-
|
|
3630
|
-
|
|
3631
|
-
|
|
3954
|
+
function countMatchedColumnsRange(row, columns, headerItems) {
|
|
3955
|
+
const boundaries = [];
|
|
3956
|
+
for (let ci = 0; ci < headerItems.length; ci++) {
|
|
3957
|
+
const left = ci === 0 ? 0 : (headerItems[ci - 1].x + headerItems[ci - 1].w + headerItems[ci].x) / 2;
|
|
3958
|
+
const right = ci === headerItems.length - 1 ? Infinity : (headerItems[ci].x + headerItems[ci].w + headerItems[ci + 1].x) / 2;
|
|
3959
|
+
boundaries.push({ left, right });
|
|
3960
|
+
}
|
|
3961
|
+
const matched = /* @__PURE__ */ new Set();
|
|
3962
|
+
for (const item of row.items) {
|
|
3963
|
+
for (let ci = 0; ci < boundaries.length; ci++) {
|
|
3964
|
+
if (item.x >= boundaries[ci].left && item.x < boundaries[ci].right) {
|
|
3965
|
+
matched.add(ci);
|
|
3966
|
+
break;
|
|
3967
|
+
}
|
|
3968
|
+
}
|
|
3969
|
+
}
|
|
3970
|
+
return matched.size;
|
|
3971
|
+
}
|
|
3972
|
+
function assignRowItems(items, columns, numCols) {
|
|
3973
|
+
if (items.length === 0) return [];
|
|
3974
|
+
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
3975
|
+
const colCenters = columns.map((c) => c.x);
|
|
3976
|
+
const gaps = [];
|
|
3977
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
3978
|
+
gaps.push({ idx: i, size: sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w) });
|
|
3979
|
+
}
|
|
3980
|
+
const gapSizes = gaps.map((g2) => g2.size).sort((a, b) => a - b);
|
|
3981
|
+
const medianGap = gapSizes.length > 0 ? gapSizes[Math.floor(gapSizes.length / 2)] : 0;
|
|
3982
|
+
const gapThreshold = sorted.length <= numCols + 1 ? 12 : Math.max(medianGap * 2.5, 12);
|
|
3983
|
+
const significantGaps = gaps.filter((g2) => g2.size >= gapThreshold).sort((a, b) => b.size - a.size).slice(0, numCols - 1).sort((a, b) => a.idx - b.idx);
|
|
3984
|
+
const groups = [];
|
|
3985
|
+
let start = 0;
|
|
3986
|
+
for (const gap of significantGaps) {
|
|
3987
|
+
groups.push(sorted.slice(start, gap.idx));
|
|
3988
|
+
start = gap.idx;
|
|
3989
|
+
}
|
|
3990
|
+
groups.push(sorted.slice(start));
|
|
3991
|
+
const result = [];
|
|
3992
|
+
const usedCols = /* @__PURE__ */ new Set();
|
|
3993
|
+
const groupCenters = groups.map((g2) => {
|
|
3994
|
+
let minX = Infinity, maxX = -Infinity;
|
|
3995
|
+
for (const i of g2) {
|
|
3996
|
+
if (i.x < minX) minX = i.x;
|
|
3997
|
+
const r = i.x + i.w;
|
|
3998
|
+
if (r > maxX) maxX = r;
|
|
3999
|
+
}
|
|
4000
|
+
return (minX + maxX) / 2;
|
|
4001
|
+
});
|
|
4002
|
+
const assignments = [];
|
|
4003
|
+
for (let gi = 0; gi < groups.length; gi++) {
|
|
4004
|
+
for (let ci = 0; ci < numCols; ci++) {
|
|
4005
|
+
assignments.push({ gi, ci, dist: Math.abs(groupCenters[gi] - colCenters[ci]) });
|
|
4006
|
+
}
|
|
4007
|
+
}
|
|
4008
|
+
assignments.sort((a, b) => a.dist - b.dist);
|
|
4009
|
+
const assignedGroups = /* @__PURE__ */ new Set();
|
|
4010
|
+
for (const { gi, ci } of assignments) {
|
|
4011
|
+
if (assignedGroups.has(gi) || usedCols.has(ci)) continue;
|
|
4012
|
+
result.push({ col: ci, items: groups[gi] });
|
|
4013
|
+
assignedGroups.add(gi);
|
|
4014
|
+
usedCols.add(ci);
|
|
4015
|
+
}
|
|
4016
|
+
for (let gi = 0; gi < groups.length; gi++) {
|
|
4017
|
+
if (assignedGroups.has(gi)) continue;
|
|
4018
|
+
let bestCol = 0, bestDist = Infinity;
|
|
4019
|
+
for (let ci = 0; ci < numCols; ci++) {
|
|
4020
|
+
const d = Math.abs(groupCenters[gi] - colCenters[ci]);
|
|
4021
|
+
if (d < bestDist) {
|
|
4022
|
+
bestDist = d;
|
|
4023
|
+
bestCol = ci;
|
|
4024
|
+
}
|
|
3632
4025
|
}
|
|
4026
|
+
result.push({ col: bestCol, items: groups[gi] });
|
|
3633
4027
|
}
|
|
3634
|
-
return
|
|
4028
|
+
return result;
|
|
3635
4029
|
}
|
|
3636
4030
|
function buildClusterTable(rows, columns, pageNum) {
|
|
3637
4031
|
const numCols = columns.length;
|
|
@@ -3649,12 +4043,12 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3649
4043
|
usedItems.add(row.items[0]);
|
|
3650
4044
|
continue;
|
|
3651
4045
|
}
|
|
3652
|
-
|
|
3653
|
-
|
|
3654
|
-
|
|
4046
|
+
const assignments = assignRowItems(row.items, columns, numCols);
|
|
4047
|
+
for (const { col, items } of assignments) {
|
|
4048
|
+
const text = items.map((i) => i.text).join(" ");
|
|
3655
4049
|
const existing = cells[r][col].text;
|
|
3656
|
-
cells[r][col].text = existing ? existing + " " +
|
|
3657
|
-
usedItems.add(item);
|
|
4050
|
+
cells[r][col].text = existing ? existing + " " + text : text;
|
|
4051
|
+
for (const item of items) usedItems.add(item);
|
|
3658
4052
|
}
|
|
3659
4053
|
}
|
|
3660
4054
|
let emptyRows = 0;
|
|
@@ -3666,11 +4060,48 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3666
4060
|
const hasValue = cells.some((row) => row[c].text !== "");
|
|
3667
4061
|
if (!hasValue) return null;
|
|
3668
4062
|
}
|
|
4063
|
+
for (let r = numRows - 1; r >= 1; r--) {
|
|
4064
|
+
const nonEmptyCols = cells[r].filter((c) => c.text.trim()).length;
|
|
4065
|
+
if (nonEmptyCols !== 1) continue;
|
|
4066
|
+
if (cells[r][0].text.trim() !== "") continue;
|
|
4067
|
+
const contentText = cells[r].find((c) => c.text.trim())?.text.trim() || "";
|
|
4068
|
+
if (/^[○●▶\-·]/.test(contentText)) continue;
|
|
4069
|
+
for (let pr = r - 1; pr >= 0; pr--) {
|
|
4070
|
+
if (cells[pr].some((c) => c.text.trim())) {
|
|
4071
|
+
for (let c = 0; c < numCols; c++) {
|
|
4072
|
+
const prev = cells[pr][c].text.trim();
|
|
4073
|
+
const curr = cells[r][c].text.trim();
|
|
4074
|
+
if (curr) cells[pr][c].text = prev ? prev + " " + curr : curr;
|
|
4075
|
+
}
|
|
4076
|
+
for (let c = 0; c < numCols; c++) cells[r][c].text = "";
|
|
4077
|
+
break;
|
|
4078
|
+
}
|
|
4079
|
+
}
|
|
4080
|
+
}
|
|
4081
|
+
for (let r = 0; r < cells.length - 1; r++) {
|
|
4082
|
+
const row = cells[r];
|
|
4083
|
+
const hasCol0 = row[0].text.trim() !== "";
|
|
4084
|
+
const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
|
|
4085
|
+
const midEmpty = row.slice(1, numCols - 1).every((c) => c.text.trim() === "");
|
|
4086
|
+
if (hasCol0 && hasColLast && midEmpty) {
|
|
4087
|
+
const next = cells[r + 1];
|
|
4088
|
+
if (next[0].text.trim() === "" && next.some((c) => c.text.trim())) {
|
|
4089
|
+
for (let c = 1; c < numCols; c++) {
|
|
4090
|
+
const curr = next[c].text.trim();
|
|
4091
|
+
if (curr) row[c].text = row[c].text.trim() ? row[c].text.trim() + " " + curr : curr;
|
|
4092
|
+
}
|
|
4093
|
+
for (let c = 0; c < numCols; c++) next[c].text = "";
|
|
4094
|
+
}
|
|
4095
|
+
}
|
|
4096
|
+
}
|
|
4097
|
+
const filteredCells = cells.filter((row) => row.some((c) => c.text.trim()));
|
|
4098
|
+
const finalRowCount = filteredCells.length;
|
|
4099
|
+
if (finalRowCount < MIN_ROWS) return null;
|
|
3669
4100
|
const irTable = {
|
|
3670
|
-
rows:
|
|
4101
|
+
rows: finalRowCount,
|
|
3671
4102
|
cols: numCols,
|
|
3672
|
-
cells,
|
|
3673
|
-
hasHeader:
|
|
4103
|
+
cells: filteredCells,
|
|
4104
|
+
hasHeader: finalRowCount > 1
|
|
3674
4105
|
};
|
|
3675
4106
|
const allItems = rows.flatMap((r) => r.items);
|
|
3676
4107
|
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
@@ -3747,7 +4178,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3747
4178
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
3748
4179
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
3749
4180
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
3750
|
-
const
|
|
4181
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
3751
4182
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
3752
4183
|
let parsedPages = 0;
|
|
3753
4184
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
@@ -3764,7 +4195,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3764
4195
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
3765
4196
|
}
|
|
3766
4197
|
for (const item of visible) {
|
|
3767
|
-
if (item.fontSize > 0)
|
|
4198
|
+
if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
|
|
3768
4199
|
}
|
|
3769
4200
|
const opList = await page.getOperatorList();
|
|
3770
4201
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -3803,10 +4234,9 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3803
4234
|
blocks.splice(removed[ri], 1);
|
|
3804
4235
|
}
|
|
3805
4236
|
}
|
|
3806
|
-
const medianFontSize =
|
|
4237
|
+
const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
|
|
3807
4238
|
if (medianFontSize > 0) {
|
|
3808
4239
|
detectHeadings(blocks, medianFontSize);
|
|
3809
|
-
mergeAdjacentHeadings(blocks);
|
|
3810
4240
|
}
|
|
3811
4241
|
detectMarkerHeadings(blocks);
|
|
3812
4242
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
@@ -3857,11 +4287,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
3857
4287
|
}
|
|
3858
4288
|
return { visible, hiddenCount };
|
|
3859
4289
|
}
|
|
3860
|
-
function
|
|
3861
|
-
if (
|
|
3862
|
-
|
|
3863
|
-
const
|
|
3864
|
-
|
|
4290
|
+
function computeMedianFontSizeFromFreq(freq) {
|
|
4291
|
+
if (freq.size === 0) return 0;
|
|
4292
|
+
let total = 0;
|
|
4293
|
+
for (const count of freq.values()) total += count;
|
|
4294
|
+
const sorted = [...freq.entries()].sort((a, b) => a[0] - b[0]);
|
|
4295
|
+
const mid = Math.floor(total / 2);
|
|
4296
|
+
let cumulative = 0;
|
|
4297
|
+
for (const [size, count] of sorted) {
|
|
4298
|
+
cumulative += count;
|
|
4299
|
+
if (cumulative > mid) return size;
|
|
4300
|
+
}
|
|
4301
|
+
return sorted[sorted.length - 1][0];
|
|
3865
4302
|
}
|
|
3866
4303
|
function detectHeadings(blocks, medianFontSize) {
|
|
3867
4304
|
for (const block of blocks) {
|
|
@@ -3881,220 +4318,27 @@ function detectHeadings(blocks, medianFontSize) {
|
|
|
3881
4318
|
}
|
|
3882
4319
|
}
|
|
3883
4320
|
}
|
|
3884
|
-
function mergeAdjacentHeadings(blocks) {
|
|
3885
|
-
let i = 0;
|
|
3886
|
-
while (i < blocks.length - 1) {
|
|
3887
|
-
const curr = blocks[i];
|
|
3888
|
-
const next = blocks[i + 1];
|
|
3889
|
-
if (curr.type !== "heading" || next.type !== "heading") {
|
|
3890
|
-
i++;
|
|
3891
|
-
continue;
|
|
3892
|
-
}
|
|
3893
|
-
if (!curr.bbox || !next.bbox || !curr.text || !next.text) {
|
|
3894
|
-
i++;
|
|
3895
|
-
continue;
|
|
3896
|
-
}
|
|
3897
|
-
const currBaseline = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
|
|
3898
|
-
const nextBaseline = next.bbox.y + (next.style?.fontSize || next.bbox.height);
|
|
3899
|
-
const yDiff = Math.abs(currBaseline - nextBaseline);
|
|
3900
|
-
const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
|
|
3901
|
-
const sameY = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
|
|
3902
|
-
const sameLevel = curr.level === next.level;
|
|
3903
|
-
if (sameY && sameLevel) {
|
|
3904
|
-
const currX = curr.bbox.x;
|
|
3905
|
-
const nextX = next.bbox.x;
|
|
3906
|
-
if (currX <= nextX) {
|
|
3907
|
-
curr.text = curr.text + " " + next.text;
|
|
3908
|
-
} else {
|
|
3909
|
-
curr.text = next.text + " " + curr.text;
|
|
3910
|
-
}
|
|
3911
|
-
curr.bbox = {
|
|
3912
|
-
page: curr.bbox.page,
|
|
3913
|
-
x: Math.min(curr.bbox.x, next.bbox.x),
|
|
3914
|
-
y: Math.min(curr.bbox.y, next.bbox.y),
|
|
3915
|
-
width: Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width) - Math.min(curr.bbox.x, next.bbox.x),
|
|
3916
|
-
height: Math.max(curr.bbox.height, next.bbox.height)
|
|
3917
|
-
};
|
|
3918
|
-
blocks.splice(i + 1, 1);
|
|
3919
|
-
} else {
|
|
3920
|
-
i++;
|
|
3921
|
-
}
|
|
3922
|
-
}
|
|
3923
|
-
}
|
|
3924
4321
|
function collapseEvenSpacing(text) {
|
|
3925
4322
|
const tokens = text.split(" ");
|
|
3926
4323
|
const singleCharCount = tokens.filter((t) => t.length === 1).length;
|
|
3927
4324
|
if (tokens.length >= 3 && singleCharCount / tokens.length >= 0.7) {
|
|
3928
4325
|
return tokens.join("");
|
|
3929
4326
|
}
|
|
3930
|
-
return text
|
|
3931
|
-
}
|
|
3932
|
-
|
|
3933
|
-
const allY = items.map((i) => i.y);
|
|
3934
|
-
const pageHeight = Math.max(...allY) - Math.min(...allY);
|
|
3935
|
-
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
3936
|
-
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
3937
|
-
const blocks = [];
|
|
3938
|
-
for (const group of orderedGroups) {
|
|
3939
|
-
if (group.length === 0) continue;
|
|
3940
|
-
const yLines = groupByY(group);
|
|
3941
|
-
for (const line of yLines) {
|
|
3942
|
-
const text = mergeLineSimple(line);
|
|
3943
|
-
if (!text.trim()) continue;
|
|
3944
|
-
blocks.push({
|
|
3945
|
-
type: "paragraph",
|
|
3946
|
-
text,
|
|
3947
|
-
pageNumber: pageNum,
|
|
3948
|
-
bbox: computeBBox(line, pageNum),
|
|
3949
|
-
style: dominantStyle(line)
|
|
3950
|
-
});
|
|
3951
|
-
}
|
|
3952
|
-
}
|
|
3953
|
-
return blocks.length > 0 ? blocks : null;
|
|
3954
|
-
}
|
|
3955
|
-
function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
|
|
3956
|
-
const totalCells = table.cells.reduce((sum, row) => sum + row.filter((c) => c.text.trim()).length, 0);
|
|
3957
|
-
const totalTextLines = table.cells.reduce((sum, row) => sum + row.reduce((s, c) => s + (c.text.trim() ? c.text.split("\n").length : 0), 0), 0);
|
|
3958
|
-
const isUnderSegmented = table.rows === 1 && table.cols === 1 || totalCells <= 2 && totalTextLines >= 8 || totalCells <= 2 && items.length >= 6;
|
|
3959
|
-
if (!isUnderSegmented) return null;
|
|
3960
|
-
if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);
|
|
3961
|
-
const directTable = buildTableFromTextLayout(items, pageNum, bbox);
|
|
3962
|
-
if (directTable) return directTable;
|
|
3963
|
-
const clusterItems = items.map((i) => ({
|
|
3964
|
-
text: i.text,
|
|
3965
|
-
x: i.x,
|
|
3966
|
-
y: i.y,
|
|
3967
|
-
w: i.w,
|
|
3968
|
-
h: i.h,
|
|
3969
|
-
fontSize: i.fontSize,
|
|
3970
|
-
fontName: i.fontName
|
|
3971
|
-
}));
|
|
3972
|
-
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
3973
|
-
if (clusterResults.length > 0) {
|
|
3974
|
-
const blocks = [];
|
|
3975
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
3976
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
3977
|
-
const usedIndices = /* @__PURE__ */ new Set();
|
|
3978
|
-
for (const cr of clusterResults) {
|
|
3979
|
-
for (const ci of cr.usedItems) {
|
|
3980
|
-
const idx = ciToIdx.get(ci);
|
|
3981
|
-
if (idx !== void 0) usedIndices.add(idx);
|
|
3982
|
-
}
|
|
3983
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
3984
|
-
}
|
|
3985
|
-
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
3986
|
-
for (const item of remaining) {
|
|
3987
|
-
if (!item.text.trim()) continue;
|
|
3988
|
-
blocks.push({
|
|
3989
|
-
type: "paragraph",
|
|
3990
|
-
text: item.text,
|
|
3991
|
-
pageNumber: pageNum,
|
|
3992
|
-
bbox: computeBBox([item], pageNum),
|
|
3993
|
-
style: { fontSize: item.fontSize, fontName: item.fontName }
|
|
3994
|
-
});
|
|
3995
|
-
}
|
|
3996
|
-
blocks.sort((a, b) => {
|
|
3997
|
-
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
3998
|
-
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
3999
|
-
return by - ay;
|
|
4000
|
-
});
|
|
4001
|
-
return blocks.length > 0 ? blocks : null;
|
|
4002
|
-
}
|
|
4003
|
-
return null;
|
|
4004
|
-
}
|
|
4005
|
-
function buildTableFromTextLayout(items, pageNum, bbox) {
|
|
4006
|
-
if (items.length < 4) return null;
|
|
4007
|
-
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4008
|
-
const yTol = 3;
|
|
4009
|
-
const rows = [];
|
|
4010
|
-
let curRow = [sorted[0]];
|
|
4011
|
-
let curY = sorted[0].y;
|
|
4012
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
4013
|
-
if (Math.abs(sorted[i].y - curY) <= yTol) {
|
|
4014
|
-
curRow.push(sorted[i]);
|
|
4015
|
-
} else {
|
|
4016
|
-
rows.push(curRow);
|
|
4017
|
-
curRow = [sorted[i]];
|
|
4018
|
-
curY = sorted[i].y;
|
|
4019
|
-
}
|
|
4020
|
-
}
|
|
4021
|
-
rows.push(curRow);
|
|
4022
|
-
if (rows.length < 2) return null;
|
|
4023
|
-
const gapPositions = [];
|
|
4024
|
-
for (const row of rows) {
|
|
4025
|
-
if (row.length < 2) continue;
|
|
4026
|
-
const sortedX = [...row].sort((a, b) => a.x - b.x);
|
|
4027
|
-
const avgFs = sortedX.reduce((s, i) => s + i.fontSize, 0) / sortedX.length;
|
|
4028
|
-
for (let j = 1; j < sortedX.length; j++) {
|
|
4029
|
-
const gap = sortedX[j].x - (sortedX[j - 1].x + sortedX[j - 1].w);
|
|
4030
|
-
if (gap >= avgFs * 1.5) {
|
|
4031
|
-
gapPositions.push(sortedX[j - 1].x + sortedX[j - 1].w + gap / 2);
|
|
4032
|
-
}
|
|
4033
|
-
}
|
|
4034
|
-
}
|
|
4035
|
-
if (gapPositions.length < 2) return null;
|
|
4036
|
-
gapPositions.sort((a, b) => a - b);
|
|
4037
|
-
const colBoundaries = [];
|
|
4038
|
-
let clusterSum = gapPositions[0], clusterCount = 1;
|
|
4039
|
-
for (let i = 1; i < gapPositions.length; i++) {
|
|
4040
|
-
const avg = clusterSum / clusterCount;
|
|
4041
|
-
if (Math.abs(gapPositions[i] - avg) <= 15) {
|
|
4042
|
-
clusterSum += gapPositions[i];
|
|
4043
|
-
clusterCount++;
|
|
4044
|
-
} else {
|
|
4045
|
-
if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
|
|
4046
|
-
clusterSum = gapPositions[i];
|
|
4047
|
-
clusterCount = 1;
|
|
4048
|
-
}
|
|
4049
|
-
}
|
|
4050
|
-
if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
|
|
4051
|
-
if (colBoundaries.length === 0) return null;
|
|
4052
|
-
const numCols = colBoundaries.length + 1;
|
|
4053
|
-
const tableRows = [];
|
|
4054
|
-
for (const row of rows) {
|
|
4055
|
-
const cells = Array(numCols).fill("");
|
|
4056
|
-
const sortedX = [...row].sort((a, b) => a.x - b.x);
|
|
4057
|
-
for (const item of sortedX) {
|
|
4058
|
-
const cx = item.x + item.w / 2;
|
|
4059
|
-
let col = 0;
|
|
4060
|
-
for (let b = 0; b < colBoundaries.length; b++) {
|
|
4061
|
-
if (cx > colBoundaries[b]) col = b + 1;
|
|
4062
|
-
}
|
|
4063
|
-
cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
|
|
4064
|
-
}
|
|
4065
|
-
if (cells[0].trim() === "" && tableRows.length > 0) {
|
|
4066
|
-
const prevCells = tableRows[tableRows.length - 1].cells;
|
|
4067
|
-
for (let c = 0; c < numCols; c++) {
|
|
4068
|
-
if (cells[c].trim()) {
|
|
4069
|
-
prevCells[c] = prevCells[c] ? prevCells[c] + " " + cells[c].trim() : cells[c].trim();
|
|
4070
|
-
}
|
|
4071
|
-
}
|
|
4072
|
-
} else {
|
|
4073
|
-
tableRows.push({ cells });
|
|
4074
|
-
}
|
|
4075
|
-
}
|
|
4076
|
-
if (tableRows.length < 2) return null;
|
|
4077
|
-
const nonEmptyCount = tableRows.reduce((sum, r) => sum + r.cells.filter((c) => c.trim()).length, 0);
|
|
4078
|
-
const totalCount = tableRows.length * numCols;
|
|
4079
|
-
if (nonEmptyCount < totalCount * 0.3) return null;
|
|
4080
|
-
const irCells = tableRows.map(
|
|
4081
|
-
(r) => r.cells.map((text, colIdx) => {
|
|
4082
|
-
let cleaned = text.trim();
|
|
4083
|
-
if (colIdx > 0) cleaned = cleaned.replace(/^[•○·\-]\s*/, "");
|
|
4084
|
-
return { text: cleaned, colSpan: 1, rowSpan: 1 };
|
|
4085
|
-
})
|
|
4327
|
+
return text.replace(
|
|
4328
|
+
/(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
|
|
4329
|
+
(match) => match.replace(/ /g, "")
|
|
4086
4330
|
);
|
|
4087
|
-
const irTable = {
|
|
4088
|
-
rows: tableRows.length,
|
|
4089
|
-
cols: numCols,
|
|
4090
|
-
cells: irCells,
|
|
4091
|
-
hasHeader: tableRows.length > 1
|
|
4092
|
-
};
|
|
4093
|
-
return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
|
|
4094
4331
|
}
|
|
4095
4332
|
function shouldDemoteTable(table) {
|
|
4096
4333
|
const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
|
|
4097
4334
|
const allText = allCells.join(" ");
|
|
4335
|
+
if (table.rows <= 3 && table.cols <= 3) {
|
|
4336
|
+
const totalCells2 = table.rows * table.cols;
|
|
4337
|
+
const emptyCells2 = totalCells2 - allCells.length;
|
|
4338
|
+
if (emptyCells2 >= totalCells2 * 0.3) return true;
|
|
4339
|
+
if (/[□■◆○●▶ㅇ]/.test(allText)) return true;
|
|
4340
|
+
if (/<[^>]+>/.test(allText)) return true;
|
|
4341
|
+
}
|
|
4098
4342
|
if (allText.length > 200) return false;
|
|
4099
4343
|
if (/[□■◆○●▶]/.test(allText) && table.rows <= 3) return true;
|
|
4100
4344
|
const totalCells = table.rows * table.cols;
|
|
@@ -4138,32 +4382,6 @@ function detectMarkerHeadings(blocks) {
|
|
|
4138
4382
|
}
|
|
4139
4383
|
}
|
|
4140
4384
|
}
|
|
4141
|
-
function hasMultiColumnLayout(items) {
|
|
4142
|
-
if (items.length < 30) return false;
|
|
4143
|
-
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
4144
|
-
const minX = sorted[0].x;
|
|
4145
|
-
let maxX = minX;
|
|
4146
|
-
for (const i of sorted) if (i.x + i.w > maxX) maxX = i.x + i.w;
|
|
4147
|
-
const pageWidth = maxX - minX;
|
|
4148
|
-
if (pageWidth < 200) return false;
|
|
4149
|
-
let bestGap = 0;
|
|
4150
|
-
let bestSplit = 0;
|
|
4151
|
-
for (let j = 1; j < sorted.length; j++) {
|
|
4152
|
-
const gap = sorted[j].x - (sorted[j - 1].x + sorted[j - 1].w);
|
|
4153
|
-
if (gap > bestGap) {
|
|
4154
|
-
bestGap = gap;
|
|
4155
|
-
bestSplit = (sorted[j - 1].x + sorted[j - 1].w + sorted[j].x) / 2;
|
|
4156
|
-
}
|
|
4157
|
-
}
|
|
4158
|
-
if (bestGap < 20) return false;
|
|
4159
|
-
const splitRatio = (bestSplit - minX) / pageWidth;
|
|
4160
|
-
if (splitRatio < 0.35 || splitRatio > 0.65) return false;
|
|
4161
|
-
const leftCount = items.filter((i) => i.x + i.w / 2 < bestSplit).length;
|
|
4162
|
-
const rightCount = items.filter((i) => i.x + i.w / 2 >= bestSplit).length;
|
|
4163
|
-
if (leftCount < 15 || rightCount < 15) return false;
|
|
4164
|
-
if (Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount) < 0.35) return false;
|
|
4165
|
-
return true;
|
|
4166
|
-
}
|
|
4167
4385
|
var MAX_XYCUT_DEPTH = 50;
|
|
4168
4386
|
function xyCutOrder(items, gapThreshold, depth = 0) {
|
|
4169
4387
|
if (items.length === 0) return [];
|
|
@@ -4231,6 +4449,7 @@ function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeigh
|
|
|
4231
4449
|
if (items.length === 0) return [];
|
|
4232
4450
|
let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
|
|
4233
4451
|
({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
|
|
4452
|
+
({ horizontals, verticals } = preprocessLines(horizontals, verticals));
|
|
4234
4453
|
const grids = buildTableGrids(horizontals, verticals);
|
|
4235
4454
|
if (grids.length > 0) {
|
|
4236
4455
|
return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
|
|
@@ -4242,14 +4461,19 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4242
4461
|
const usedItems = /* @__PURE__ */ new Set();
|
|
4243
4462
|
const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
4244
4463
|
for (const grid of sortedGrids) {
|
|
4464
|
+
const numGridRows = grid.rowYs.length - 1;
|
|
4465
|
+
const numGridCols = grid.colXs.length - 1;
|
|
4466
|
+
if (numGridRows === 1 && numGridCols >= 2) continue;
|
|
4245
4467
|
const tableItems = [];
|
|
4246
4468
|
const pad = 3;
|
|
4469
|
+
const gridW = grid.bbox.x2 - grid.bbox.x1;
|
|
4247
4470
|
for (const item of items) {
|
|
4248
4471
|
if (usedItems.has(item)) continue;
|
|
4249
|
-
if (item.
|
|
4250
|
-
|
|
4251
|
-
|
|
4252
|
-
|
|
4472
|
+
if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
|
|
4473
|
+
if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
|
|
4474
|
+
if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
|
|
4475
|
+
tableItems.push(item);
|
|
4476
|
+
usedItems.add(item);
|
|
4253
4477
|
}
|
|
4254
4478
|
const cells = extractCells(grid, horizontals, verticals);
|
|
4255
4479
|
if (cells.length === 0) continue;
|
|
@@ -4273,6 +4497,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4273
4497
|
const cellItems = cellTextMap.get(cell) || [];
|
|
4274
4498
|
let text = cellTextToString(cellItems);
|
|
4275
4499
|
text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
|
|
4500
|
+
text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
|
|
4276
4501
|
irGrid[cell.row][cell.col] = {
|
|
4277
4502
|
text,
|
|
4278
4503
|
colSpan: cell.colSpan,
|
|
@@ -4294,31 +4519,61 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4294
4519
|
width: grid.bbox.x2 - grid.bbox.x1,
|
|
4295
4520
|
height: grid.bbox.y2 - grid.bbox.y1
|
|
4296
4521
|
};
|
|
4297
|
-
const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
|
|
4298
|
-
if (normalized) {
|
|
4299
|
-
blocks.push(...normalized);
|
|
4300
|
-
continue;
|
|
4301
|
-
}
|
|
4302
4522
|
if (shouldDemoteTable(irTable)) {
|
|
4303
4523
|
const demoted = demoteTableToText(irTable);
|
|
4304
4524
|
if (demoted) {
|
|
4305
|
-
|
|
4525
|
+
const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
|
|
4526
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
|
|
4306
4527
|
}
|
|
4307
4528
|
continue;
|
|
4308
4529
|
}
|
|
4309
4530
|
blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
|
|
4310
4531
|
}
|
|
4311
|
-
|
|
4532
|
+
let remaining = items.filter((i) => !usedItems.has(i));
|
|
4312
4533
|
if (remaining.length > 0) {
|
|
4313
4534
|
remaining.sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4314
|
-
const
|
|
4315
|
-
|
|
4316
|
-
|
|
4535
|
+
const clusterItems = remaining.map((i) => ({
|
|
4536
|
+
text: i.text,
|
|
4537
|
+
x: i.x,
|
|
4538
|
+
y: i.y,
|
|
4539
|
+
w: i.w,
|
|
4540
|
+
h: i.h,
|
|
4541
|
+
fontSize: i.fontSize,
|
|
4542
|
+
fontName: i.fontName
|
|
4543
|
+
}));
|
|
4544
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4545
|
+
if (clusterResults.length > 0) {
|
|
4546
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4547
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4548
|
+
const usedClusterIndices = /* @__PURE__ */ new Set();
|
|
4549
|
+
for (const cr of clusterResults) {
|
|
4550
|
+
for (const ci of cr.usedItems) {
|
|
4551
|
+
const idx = ciToIdx.get(ci);
|
|
4552
|
+
if (idx !== void 0) usedClusterIndices.add(idx);
|
|
4553
|
+
}
|
|
4554
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4555
|
+
}
|
|
4556
|
+
remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
|
|
4557
|
+
}
|
|
4558
|
+
if (remaining.length > 0) {
|
|
4559
|
+
const allY = remaining.map((i) => i.y);
|
|
4560
|
+
const pageH = safeMax(allY) - safeMin(allY);
|
|
4561
|
+
const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
|
|
4562
|
+
const textBlocks = [];
|
|
4563
|
+
for (const group of groups) {
|
|
4564
|
+
if (group.length === 0) continue;
|
|
4565
|
+
const groupBlocks = extractPageBlocksFallback(group, pageNum);
|
|
4566
|
+
for (const b of groupBlocks) textBlocks.push(b);
|
|
4567
|
+
}
|
|
4568
|
+
const finalTextBlocks = detectListBlocks(textBlocks);
|
|
4569
|
+
for (const b of finalTextBlocks) blocks.push(b);
|
|
4570
|
+
}
|
|
4571
|
+
blocks.sort((a, b) => {
|
|
4317
4572
|
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4318
4573
|
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4319
4574
|
return by - ay;
|
|
4320
4575
|
});
|
|
4321
|
-
return mergeAdjacentTableBlocks(
|
|
4576
|
+
return mergeAdjacentTableBlocks(blocks);
|
|
4322
4577
|
}
|
|
4323
4578
|
return mergeAdjacentTableBlocks(blocks);
|
|
4324
4579
|
}
|
|
@@ -4344,57 +4599,53 @@ function mergeAdjacentTableBlocks(blocks) {
|
|
|
4344
4599
|
}
|
|
4345
4600
|
function extractPageBlocksFallback(items, pageNum) {
|
|
4346
4601
|
if (items.length === 0) return [];
|
|
4347
|
-
if (hasMultiColumnLayout(items)) {
|
|
4348
|
-
const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
|
|
4349
|
-
return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
|
|
4350
|
-
}
|
|
4351
4602
|
const blocks = [];
|
|
4352
|
-
const
|
|
4353
|
-
|
|
4354
|
-
|
|
4355
|
-
|
|
4356
|
-
|
|
4357
|
-
|
|
4358
|
-
|
|
4359
|
-
|
|
4360
|
-
|
|
4361
|
-
|
|
4362
|
-
|
|
4363
|
-
|
|
4364
|
-
|
|
4365
|
-
|
|
4366
|
-
|
|
4367
|
-
|
|
4368
|
-
|
|
4369
|
-
|
|
4370
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4371
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4372
|
-
const usedIndices = /* @__PURE__ */ new Set();
|
|
4373
|
-
for (const cr of clusterResults) {
|
|
4374
|
-
for (const ci of cr.usedItems) {
|
|
4375
|
-
const idx = ciToIdx.get(ci);
|
|
4376
|
-
if (idx !== void 0) usedIndices.add(idx);
|
|
4377
|
-
}
|
|
4378
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4603
|
+
const clusterItems = items.map((i) => ({
|
|
4604
|
+
text: i.text,
|
|
4605
|
+
x: i.x,
|
|
4606
|
+
y: i.y,
|
|
4607
|
+
w: i.w,
|
|
4608
|
+
h: i.h,
|
|
4609
|
+
fontSize: i.fontSize,
|
|
4610
|
+
fontName: i.fontName
|
|
4611
|
+
}));
|
|
4612
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4613
|
+
if (clusterResults.length > 0) {
|
|
4614
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4615
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4616
|
+
const usedIndices = /* @__PURE__ */ new Set();
|
|
4617
|
+
for (const cr of clusterResults) {
|
|
4618
|
+
for (const ci of cr.usedItems) {
|
|
4619
|
+
const idx = ciToIdx.get(ci);
|
|
4620
|
+
if (idx !== void 0) usedIndices.add(idx);
|
|
4379
4621
|
}
|
|
4380
|
-
|
|
4381
|
-
|
|
4382
|
-
|
|
4383
|
-
|
|
4384
|
-
|
|
4385
|
-
|
|
4386
|
-
|
|
4387
|
-
|
|
4388
|
-
|
|
4622
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4623
|
+
}
|
|
4624
|
+
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
4625
|
+
if (remaining.length > 0) {
|
|
4626
|
+
const yLines = groupByY(remaining);
|
|
4627
|
+
for (const line of yLines) {
|
|
4628
|
+
const text = mergeLineSimple(line);
|
|
4629
|
+
if (!text.trim()) continue;
|
|
4630
|
+
const bbox = computeBBox(line, pageNum);
|
|
4631
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
|
|
4389
4632
|
}
|
|
4390
|
-
|
|
4391
|
-
|
|
4392
|
-
|
|
4393
|
-
|
|
4394
|
-
|
|
4633
|
+
}
|
|
4634
|
+
blocks.sort((a, b) => {
|
|
4635
|
+
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4636
|
+
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4637
|
+
return by - ay;
|
|
4638
|
+
});
|
|
4639
|
+
} else {
|
|
4640
|
+
const allYLines = groupByY(items);
|
|
4641
|
+
const columns = detectColumns(allYLines);
|
|
4642
|
+
if (columns && columns.length >= 3) {
|
|
4643
|
+
const tableText = extractWithColumns(allYLines, columns);
|
|
4644
|
+
const bbox = computeBBox(items, pageNum);
|
|
4645
|
+
blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
|
|
4395
4646
|
} else {
|
|
4396
4647
|
const allY = items.map((i) => i.y);
|
|
4397
|
-
const pageHeight =
|
|
4648
|
+
const pageHeight = safeMax(allY) - safeMin(allY);
|
|
4398
4649
|
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
4399
4650
|
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
4400
4651
|
for (const group of orderedGroups) {
|
|
@@ -4447,22 +4698,76 @@ function dominantStyle(items) {
|
|
|
4447
4698
|
return { fontSize: dominantSize, fontName };
|
|
4448
4699
|
}
|
|
4449
4700
|
function normalizeItems(rawItems) {
|
|
4450
|
-
|
|
4701
|
+
const items = [];
|
|
4702
|
+
const spacePositions = [];
|
|
4703
|
+
for (const i of rawItems) {
|
|
4704
|
+
if (typeof i.str !== "string") continue;
|
|
4705
|
+
const x = Math.round(i.transform[4]);
|
|
4706
|
+
const y = Math.round(i.transform[5]);
|
|
4707
|
+
if (!i.str.trim()) {
|
|
4708
|
+
spacePositions.push({ x, y });
|
|
4709
|
+
continue;
|
|
4710
|
+
}
|
|
4451
4711
|
const scaleY = Math.abs(i.transform[3]);
|
|
4452
4712
|
const scaleX = Math.abs(i.transform[0]);
|
|
4453
4713
|
const fontSize = Math.round(Math.max(scaleY, scaleX));
|
|
4454
|
-
|
|
4455
|
-
|
|
4456
|
-
|
|
4457
|
-
|
|
4458
|
-
|
|
4459
|
-
|
|
4460
|
-
|
|
4461
|
-
|
|
4462
|
-
|
|
4463
|
-
|
|
4464
|
-
|
|
4465
|
-
|
|
4714
|
+
const w = Math.round(i.width);
|
|
4715
|
+
const h = Math.round(i.height);
|
|
4716
|
+
const isHidden = fontSize === 0 || i.width === 0 && i.str.trim().length > 0;
|
|
4717
|
+
let text = i.str.trim();
|
|
4718
|
+
if (/^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text)) {
|
|
4719
|
+
text = text.replace(/ /g, "");
|
|
4720
|
+
}
|
|
4721
|
+
const split = splitEvenSpacedItem(text, x, w, fontSize);
|
|
4722
|
+
if (split) {
|
|
4723
|
+
for (const s of split) {
|
|
4724
|
+
items.push({ text: s.text, x: s.x, y, w: s.w, h, fontSize, fontName: i.fontName || "", isHidden });
|
|
4725
|
+
}
|
|
4726
|
+
} else {
|
|
4727
|
+
items.push({ text, x, y, w, h, fontSize, fontName: i.fontName || "", isHidden });
|
|
4728
|
+
}
|
|
4729
|
+
}
|
|
4730
|
+
const sorted = items.sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4731
|
+
const deduped = [];
|
|
4732
|
+
for (let i = 0; i < sorted.length; i++) {
|
|
4733
|
+
let isDup = false;
|
|
4734
|
+
for (let j = deduped.length - 1; j >= 0; j--) {
|
|
4735
|
+
const prev = deduped[j];
|
|
4736
|
+
if (prev.y - sorted[i].y > 3) break;
|
|
4737
|
+
if (Math.abs(prev.y - sorted[i].y) <= 3 && prev.text === sorted[i].text && Math.abs(prev.x - sorted[i].x) <= 3) {
|
|
4738
|
+
isDup = true;
|
|
4739
|
+
break;
|
|
4740
|
+
}
|
|
4741
|
+
}
|
|
4742
|
+
if (!isDup) deduped.push(sorted[i]);
|
|
4743
|
+
}
|
|
4744
|
+
if (spacePositions.length > 0) {
|
|
4745
|
+
for (const item of deduped) {
|
|
4746
|
+
for (const sp of spacePositions) {
|
|
4747
|
+
if (Math.abs(sp.y - item.y) <= 3) {
|
|
4748
|
+
const dist = item.x - sp.x;
|
|
4749
|
+
if (dist >= 0 && dist <= 20) {
|
|
4750
|
+
item.hasSpaceBefore = true;
|
|
4751
|
+
break;
|
|
4752
|
+
}
|
|
4753
|
+
}
|
|
4754
|
+
}
|
|
4755
|
+
}
|
|
4756
|
+
}
|
|
4757
|
+
return deduped;
|
|
4758
|
+
}
|
|
4759
|
+
function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
|
|
4760
|
+
if (!/^[가-힣\d](?: [가-힣\d]){2,}$/.test(text)) return null;
|
|
4761
|
+
const chars = text.split(" ");
|
|
4762
|
+
if (chars.length < 3) return null;
|
|
4763
|
+
const charW = itemW / chars.length;
|
|
4764
|
+
if (charW > fontSize * 2) return null;
|
|
4765
|
+
return chars.map((ch, idx) => ({
|
|
4766
|
+
text: ch,
|
|
4767
|
+
x: Math.round(itemX + idx * charW),
|
|
4768
|
+
w: Math.round(charW * 0.8)
|
|
4769
|
+
// 실제 글자 폭은 간격보다 좁음
|
|
4770
|
+
}));
|
|
4466
4771
|
}
|
|
4467
4772
|
function groupByY(items) {
|
|
4468
4773
|
if (items.length === 0) return [];
|
|
@@ -4487,14 +4792,14 @@ function isProseSpread(items) {
|
|
|
4487
4792
|
for (let i = 1; i < sorted.length; i++) {
|
|
4488
4793
|
gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
|
|
4489
4794
|
}
|
|
4490
|
-
const maxGap =
|
|
4795
|
+
const maxGap = safeMax(gaps);
|
|
4491
4796
|
const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
|
|
4492
4797
|
return maxGap < 40 && avgLen < 5;
|
|
4493
4798
|
}
|
|
4494
4799
|
function detectColumns(yLines) {
|
|
4495
4800
|
const allItems = yLines.flat();
|
|
4496
4801
|
if (allItems.length === 0) return null;
|
|
4497
|
-
const pageWidth =
|
|
4802
|
+
const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
|
|
4498
4803
|
if (pageWidth < 100) return null;
|
|
4499
4804
|
let bigoLineIdx = -1;
|
|
4500
4805
|
for (let i = 0; i < yLines.length; i++) {
|
|
@@ -4526,7 +4831,7 @@ function detectColumns(yLines) {
|
|
|
4526
4831
|
}
|
|
4527
4832
|
const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
|
|
4528
4833
|
if (peaks.length < 3) return null;
|
|
4529
|
-
const MERGE_TOL =
|
|
4834
|
+
const MERGE_TOL = 40;
|
|
4530
4835
|
const merged = [peaks[0]];
|
|
4531
4836
|
for (let i = 1; i < peaks.length; i++) {
|
|
4532
4837
|
const prev = merged[merged.length - 1];
|
|
@@ -4540,7 +4845,14 @@ function detectColumns(yLines) {
|
|
|
4540
4845
|
merged.push({ ...peaks[i] });
|
|
4541
4846
|
}
|
|
4542
4847
|
}
|
|
4543
|
-
const
|
|
4848
|
+
const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
|
|
4849
|
+
if (rawColumns.length < 3) return null;
|
|
4850
|
+
const MIN_DETECT_COL_WIDTH = 30;
|
|
4851
|
+
const columns = [rawColumns[0]];
|
|
4852
|
+
for (let i = 1; i < rawColumns.length; i++) {
|
|
4853
|
+
if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
|
|
4854
|
+
columns.push(rawColumns[i]);
|
|
4855
|
+
}
|
|
4544
4856
|
return columns.length >= 3 ? columns : null;
|
|
4545
4857
|
}
|
|
4546
4858
|
function findColumn(x, columns) {
|
|
@@ -4668,6 +4980,16 @@ function buildGridTable(lines, columns) {
|
|
|
4668
4980
|
}
|
|
4669
4981
|
merged.splice(0, headerEnd, headerRow);
|
|
4670
4982
|
}
|
|
4983
|
+
for (const row of merged) {
|
|
4984
|
+
for (let c = 0; c < row.length; c++) {
|
|
4985
|
+
if (row[c]) row[c] = collapseEvenSpacing(row[c]);
|
|
4986
|
+
}
|
|
4987
|
+
}
|
|
4988
|
+
const totalCells = merged.length * numCols;
|
|
4989
|
+
const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
|
|
4990
|
+
if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
|
|
4991
|
+
return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
|
|
4992
|
+
}
|
|
4671
4993
|
const md = [];
|
|
4672
4994
|
md.push("| " + merged[0].join(" | ") + " |");
|
|
4673
4995
|
md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
|
|
@@ -4679,12 +5001,32 @@ function buildGridTable(lines, columns) {
|
|
|
4679
5001
|
function mergeLineSimple(items) {
|
|
4680
5002
|
if (items.length <= 1) return items[0]?.text || "";
|
|
4681
5003
|
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
5004
|
+
const isEvenSpaced = detectEvenSpacedItems(sorted);
|
|
4682
5005
|
let result = sorted[0].text;
|
|
4683
5006
|
for (let i = 1; i < sorted.length; i++) {
|
|
4684
5007
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
4685
5008
|
const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
|
|
4686
|
-
|
|
4687
|
-
|
|
5009
|
+
const tabThreshold = Math.max(avgFs * 2, 30);
|
|
5010
|
+
if (gap > tabThreshold) {
|
|
5011
|
+
result += " ";
|
|
5012
|
+
result += sorted[i].text;
|
|
5013
|
+
continue;
|
|
5014
|
+
}
|
|
5015
|
+
if (isEvenSpaced[i]) {
|
|
5016
|
+
result += sorted[i].text;
|
|
5017
|
+
continue;
|
|
5018
|
+
}
|
|
5019
|
+
if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
|
|
5020
|
+
result += " ";
|
|
5021
|
+
result += sorted[i].text;
|
|
5022
|
+
continue;
|
|
5023
|
+
}
|
|
5024
|
+
if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
|
|
5025
|
+
result += " ";
|
|
5026
|
+
result += sorted[i].text;
|
|
5027
|
+
continue;
|
|
5028
|
+
}
|
|
5029
|
+
if (gap < avgFs * 0.15) {
|
|
4688
5030
|
} else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
|
|
4689
5031
|
} else if (gap > 3) result += " ";
|
|
4690
5032
|
result += sorted[i].text;
|
|
@@ -4693,8 +5035,8 @@ function mergeLineSimple(items) {
|
|
|
4693
5035
|
}
|
|
4694
5036
|
function cleanPdfText(text) {
|
|
4695
5037
|
return mergeKoreanLines(
|
|
4696
|
-
text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
|
|
4697
|
-
).replace(/^(?!\|)
|
|
5038
|
+
text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
|
|
5039
|
+
).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
|
|
4698
5040
|
}
|
|
4699
5041
|
function startsWithMarker(line) {
|
|
4700
5042
|
const t = line.trimStart();
|
|
@@ -4886,7 +5228,7 @@ function mergeKoreanLines(text) {
|
|
|
4886
5228
|
result[result.length - 1] = prev + " " + currTrimmed;
|
|
4887
5229
|
continue;
|
|
4888
5230
|
}
|
|
4889
|
-
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
|
|
5231
|
+
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev) && !startsWithMarker(prev)) {
|
|
4890
5232
|
result[result.length - 1] = prev + " " + curr;
|
|
4891
5233
|
} else {
|
|
4892
5234
|
result.push(curr);
|
|
@@ -4934,7 +5276,7 @@ function getTextContent(el) {
|
|
|
4934
5276
|
return el.textContent?.trim() ?? "";
|
|
4935
5277
|
}
|
|
4936
5278
|
function parseXml(text) {
|
|
4937
|
-
return new DOMParser2().parseFromString(text, "text/xml");
|
|
5279
|
+
return new DOMParser2().parseFromString(stripDtd(text), "text/xml");
|
|
4938
5280
|
}
|
|
4939
5281
|
function parseSharedStrings(xml) {
|
|
4940
5282
|
const doc = parseXml(xml);
|
|
@@ -5221,7 +5563,7 @@ function getAttr(el, localName) {
|
|
|
5221
5563
|
return null;
|
|
5222
5564
|
}
|
|
5223
5565
|
function parseXml2(text) {
|
|
5224
|
-
return new DOMParser3().parseFromString(text, "text/xml");
|
|
5566
|
+
return new DOMParser3().parseFromString(stripDtd(text), "text/xml");
|
|
5225
5567
|
}
|
|
5226
5568
|
function parseStyles(xml) {
|
|
5227
5569
|
const doc = parseXml2(xml);
|
|
@@ -5621,7 +5963,13 @@ function normalize(s) {
|
|
|
5621
5963
|
}
|
|
5622
5964
|
var MAX_LEVENSHTEIN_LEN = 1e4;
|
|
5623
5965
|
function levenshtein(a, b) {
|
|
5624
|
-
if (a.length + b.length > MAX_LEVENSHTEIN_LEN)
|
|
5966
|
+
if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
|
|
5967
|
+
const sampleLen = Math.min(500, a.length, b.length);
|
|
5968
|
+
let diffs = 0;
|
|
5969
|
+
for (let i = 0; i < sampleLen; i++) if (a[i] !== b[i]) diffs++;
|
|
5970
|
+
const sampleRate = sampleLen > 0 ? diffs / sampleLen : 1;
|
|
5971
|
+
return Math.abs(a.length - b.length) + Math.round(Math.min(a.length, b.length) * sampleRate);
|
|
5972
|
+
}
|
|
5625
5973
|
if (a.length > b.length) [a, b] = [b, a];
|
|
5626
5974
|
const m = a.length;
|
|
5627
5975
|
const n = b.length;
|
|
@@ -5904,13 +6252,20 @@ function extractInlineFields(text) {
|
|
|
5904
6252
|
|
|
5905
6253
|
// src/hwpx/generator.ts
|
|
5906
6254
|
import JSZip5 from "jszip";
|
|
5907
|
-
var
|
|
6255
|
+
var NS_SECTION = "http://www.hancom.co.kr/hwpml/2011/section";
|
|
6256
|
+
var NS_PARA = "http://www.hancom.co.kr/hwpml/2011/paragraph";
|
|
6257
|
+
var NS_HEAD = "http://www.hancom.co.kr/hwpml/2011/head";
|
|
6258
|
+
var NS_OPF = "http://www.idpf.org/2007/opf/";
|
|
6259
|
+
var NS_HPF = "http://www.hancom.co.kr/schema/2011/hpf";
|
|
6260
|
+
var NS_OCF = "urn:oasis:names:tc:opendocument:xmlns:container";
|
|
5908
6261
|
async function markdownToHwpx(markdown) {
|
|
5909
6262
|
const blocks = parseMarkdownToBlocks(markdown);
|
|
5910
6263
|
const sectionXml = blocksToSectionXml(blocks);
|
|
5911
6264
|
const zip = new JSZip5();
|
|
5912
6265
|
zip.file("mimetype", "application/hwp+zip", { compression: "STORE" });
|
|
6266
|
+
zip.file("META-INF/container.xml", generateContainerXml());
|
|
5913
6267
|
zip.file("Contents/content.hpf", generateManifest());
|
|
6268
|
+
zip.file("Contents/header.xml", generateHeaderXml());
|
|
5914
6269
|
zip.file("Contents/section0.xml", sectionXml);
|
|
5915
6270
|
return await zip.generateAsync({ type: "arraybuffer" });
|
|
5916
6271
|
}
|
|
@@ -5955,8 +6310,111 @@ function parseMarkdownToBlocks(md) {
|
|
|
5955
6310
|
function escapeXml(text) {
|
|
5956
6311
|
return text.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, """);
|
|
5957
6312
|
}
|
|
6313
|
+
function generateContainerXml() {
|
|
6314
|
+
return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
|
6315
|
+
<ocf:container xmlns:ocf="${NS_OCF}" xmlns:hpf="${NS_HPF}">
|
|
6316
|
+
<ocf:rootfiles>
|
|
6317
|
+
<ocf:rootfile full-path="Contents/content.hpf" media-type="application/hwpml-package+xml"/>
|
|
6318
|
+
</ocf:rootfiles>
|
|
6319
|
+
</ocf:container>`;
|
|
6320
|
+
}
|
|
6321
|
+
function generateManifest() {
|
|
6322
|
+
return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
|
6323
|
+
<opf:package xmlns:opf="${NS_OPF}" xmlns:hpf="${NS_HPF}" xmlns:hh="${NS_HEAD}">
|
|
6324
|
+
<opf:manifest>
|
|
6325
|
+
<opf:item id="header" href="Contents/header.xml" media-type="application/xml"/>
|
|
6326
|
+
<opf:item id="section0" href="Contents/section0.xml" media-type="application/xml"/>
|
|
6327
|
+
</opf:manifest>
|
|
6328
|
+
<opf:spine>
|
|
6329
|
+
<opf:itemref idref="header" linear="no"/>
|
|
6330
|
+
<opf:itemref idref="section0" linear="yes"/>
|
|
6331
|
+
</opf:spine>
|
|
6332
|
+
</opf:package>`;
|
|
6333
|
+
}
|
|
6334
|
+
function generateHeaderXml() {
|
|
6335
|
+
return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
|
6336
|
+
<hh:head xmlns:hh="${NS_HEAD}" xmlns:hp="${NS_PARA}" version="1.4" secCnt="1">
|
|
6337
|
+
<hh:beginNum page="1" footnote="1" endnote="1" pic="1" tbl="1" equation="1"/>
|
|
6338
|
+
<hh:refList>
|
|
6339
|
+
<hh:fontfaces itemCnt="7">
|
|
6340
|
+
<hh:fontface lang="HANGUL" fontCnt="1">
|
|
6341
|
+
<hh:font id="0" face="\uD568\uCD08\uB86C\uBC14\uD0D5" type="TTF" isEmbedded="0">
|
|
6342
|
+
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="4" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
|
|
6343
|
+
</hh:font>
|
|
6344
|
+
</hh:fontface>
|
|
6345
|
+
<hh:fontface lang="LATIN" fontCnt="1">
|
|
6346
|
+
<hh:font id="0" face="Times New Roman" type="TTF" isEmbedded="0">
|
|
6347
|
+
<hh:typeInfo familyType="FCAT_OLDSTYLE" weight="5" proportion="4" contrast="2" strokeVariation="0" armStyle="0" letterform="0" midline="0" xHeight="4"/>
|
|
6348
|
+
</hh:font>
|
|
6349
|
+
</hh:fontface>
|
|
6350
|
+
<hh:fontface lang="HANJA" fontCnt="1">
|
|
6351
|
+
<hh:font id="0" face="\uD568\uCD08\uB86C\uBC14\uD0D5" type="TTF" isEmbedded="0">
|
|
6352
|
+
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="4" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
|
|
6353
|
+
</hh:font>
|
|
6354
|
+
</hh:fontface>
|
|
6355
|
+
<hh:fontface lang="JAPANESE" fontCnt="1">
|
|
6356
|
+
<hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
|
|
6357
|
+
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
|
|
6358
|
+
</hh:font>
|
|
6359
|
+
</hh:fontface>
|
|
6360
|
+
<hh:fontface lang="OTHER" fontCnt="1">
|
|
6361
|
+
<hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
|
|
6362
|
+
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
|
|
6363
|
+
</hh:font>
|
|
6364
|
+
</hh:fontface>
|
|
6365
|
+
<hh:fontface lang="SYMBOL" fontCnt="1">
|
|
6366
|
+
<hh:font id="0" face="Symbol" type="TTF" isEmbedded="0">
|
|
6367
|
+
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
|
|
6368
|
+
</hh:font>
|
|
6369
|
+
</hh:fontface>
|
|
6370
|
+
<hh:fontface lang="USER" fontCnt="1">
|
|
6371
|
+
<hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
|
|
6372
|
+
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
|
|
6373
|
+
</hh:font>
|
|
6374
|
+
</hh:fontface>
|
|
6375
|
+
</hh:fontfaces>
|
|
6376
|
+
<hh:borderFills itemCnt="1">
|
|
6377
|
+
<hh:borderFill id="0" threeD="0" shadow="0" centerLine="0" breakCellSeparateLine="0">
|
|
6378
|
+
<hh:slash type="NONE" Crooked="0" isCounter="0"/>
|
|
6379
|
+
<hh:backSlash type="NONE" Crooked="0" isCounter="0"/>
|
|
6380
|
+
<hh:leftBorder type="NONE" width="0.1mm" color="0"/>
|
|
6381
|
+
<hh:rightBorder type="NONE" width="0.1mm" color="0"/>
|
|
6382
|
+
<hh:topBorder type="NONE" width="0.1mm" color="0"/>
|
|
6383
|
+
<hh:bottomBorder type="NONE" width="0.1mm" color="0"/>
|
|
6384
|
+
<hh:diagonal type="NONE" width="0.1mm" color="0"/>
|
|
6385
|
+
<hh:fillInfo/>
|
|
6386
|
+
</hh:borderFill>
|
|
6387
|
+
</hh:borderFills>
|
|
6388
|
+
<hh:charProperties itemCnt="1">
|
|
6389
|
+
<hh:charPr id="0" height="1000" textColor="0" shadeColor="-1" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="0">
|
|
6390
|
+
<hh:fontRef hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
|
|
6391
|
+
<hh:ratio hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
|
|
6392
|
+
<hh:spacing hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
|
|
6393
|
+
<hh:relSz hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
|
|
6394
|
+
<hh:offset hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
|
|
6395
|
+
</hh:charPr>
|
|
6396
|
+
</hh:charProperties>
|
|
6397
|
+
<hh:tabProperties itemCnt="0"/>
|
|
6398
|
+
<hh:numberings itemCnt="0"/>
|
|
6399
|
+
<hh:bullets itemCnt="0"/>
|
|
6400
|
+
<hh:paraProperties itemCnt="1">
|
|
6401
|
+
<hh:paraPr id="0" tabIDRef="0" condense="0" fontLineHeight="0" snapToGrid="0" suppressOverlap="0" checked="0">
|
|
6402
|
+
<hh:parLineBreak lineBreak="BREAK_LINE" wordBreak="BREAK_WORD" breakLatinWord="BREAK_WORD" breakNonLatinWord="BREAK_WORD"/>
|
|
6403
|
+
<hh:parMargin left="0" right="0" prev="0" next="0" indent="0"/>
|
|
6404
|
+
<hh:parBorder borderFillIDRef="0" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>
|
|
6405
|
+
<hh:parShade borderFillIDRef="0"/>
|
|
6406
|
+
<hh:parTabList/>
|
|
6407
|
+
</hh:paraPr>
|
|
6408
|
+
</hh:paraProperties>
|
|
6409
|
+
<hh:styles itemCnt="1">
|
|
6410
|
+
<hh:style id="0" type="PARA" name="\uBC14\uD0D5\uAE00" engName="Normal" paraPrIDRef="0" charPrIDRef="0" nextStyleIDRef="0" langIDRef="1042" lockForm="0"/>
|
|
6411
|
+
</hh:styles>
|
|
6412
|
+
</hh:refList>
|
|
6413
|
+
<hh:compatibleDocument targetProgram="HWP2018"/>
|
|
6414
|
+
</hh:head>`;
|
|
6415
|
+
}
|
|
5958
6416
|
function generateParagraph(text) {
|
|
5959
|
-
return `<hp:p><hp:run><hp:t>${escapeXml(text)}</hp:t></hp:run></hp:p>`;
|
|
6417
|
+
return `<hp:p paraPrIDRef="0" styleIDRef="0"><hp:run charPrIDRef="0"><hp:t>${escapeXml(text)}</hp:t></hp:run></hp:p>`;
|
|
5960
6418
|
}
|
|
5961
6419
|
function generateTable(rows) {
|
|
5962
6420
|
const trElements = rows.map((row) => {
|
|
@@ -5980,22 +6438,11 @@ function blocksToSectionXml(blocks) {
|
|
|
5980
6438
|
return "";
|
|
5981
6439
|
}
|
|
5982
6440
|
}).join("\n ");
|
|
5983
|
-
return `<?xml version="1.0" encoding="UTF-8"?>
|
|
5984
|
-
<hs:sec xmlns:hs="${
|
|
6441
|
+
return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
|
6442
|
+
<hs:sec xmlns:hs="${NS_SECTION}" xmlns:hp="${NS_PARA}">
|
|
5985
6443
|
${body}
|
|
5986
6444
|
</hs:sec>`;
|
|
5987
6445
|
}
|
|
5988
|
-
function generateManifest() {
|
|
5989
|
-
return `<?xml version="1.0" encoding="UTF-8"?>
|
|
5990
|
-
<opf:package xmlns:opf="http://www.idpf.org/2007/opf">
|
|
5991
|
-
<opf:manifest>
|
|
5992
|
-
<opf:item id="s0" href="section0.xml" media-type="application/xml"/>
|
|
5993
|
-
</opf:manifest>
|
|
5994
|
-
<opf:spine>
|
|
5995
|
-
<opf:itemref idref="s0"/>
|
|
5996
|
-
</opf:spine>
|
|
5997
|
-
</opf:package>`;
|
|
5998
|
-
}
|
|
5999
6446
|
|
|
6000
6447
|
// src/index.ts
|
|
6001
6448
|
async function parse(input, options) {
|