kordoc 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-5Y2Q3BRW.js +52 -0
- package/dist/chunk-5Y2Q3BRW.js.map +1 -0
- package/dist/{chunk-GJ2S6IMC.js → chunk-LYFG7AUT.js} +966 -577
- package/dist/chunk-LYFG7AUT.js.map +1 -0
- package/dist/cli.js +12 -8
- package/dist/cli.js.map +1 -1
- package/dist/detect-GYK3HKD5.js +18 -0
- package/dist/index.cjs +993 -546
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +993 -546
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +40 -11
- package/dist/mcp.js.map +1 -1
- package/dist/{watch-X7IC7MLF.js → watch-Q5OXA73S.js} +31 -15
- package/dist/watch-Q5OXA73S.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-GJ2S6IMC.js.map +0 -1
- package/dist/chunk-PKIJLEV6.js +0 -93
- package/dist/chunk-PKIJLEV6.js.map +0 -1
- package/dist/utils-BWQ2RGUD.js +0 -22
- package/dist/watch-X7IC7MLF.js.map +0 -1
- /package/dist/{utils-BWQ2RGUD.js.map → detect-GYK3HKD5.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -183,7 +183,7 @@ var import_zlib = require("zlib");
|
|
|
183
183
|
var import_xmldom = require("@xmldom/xmldom");
|
|
184
184
|
|
|
185
185
|
// src/utils.ts
|
|
186
|
-
var VERSION = true ? "2.
|
|
186
|
+
var VERSION = true ? "2.2.0" : "0.0.0-dev";
|
|
187
187
|
function toArrayBuffer(buf) {
|
|
188
188
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
189
189
|
return buf.buffer;
|
|
@@ -199,7 +199,8 @@ var KordocError = class extends Error {
|
|
|
199
199
|
function isPathTraversal(name) {
|
|
200
200
|
if (name.includes("\0")) return true;
|
|
201
201
|
const normalized = name.replace(/\\/g, "/");
|
|
202
|
-
|
|
202
|
+
const segments = normalized.split("/");
|
|
203
|
+
return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
203
204
|
}
|
|
204
205
|
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
205
206
|
try {
|
|
@@ -239,12 +240,25 @@ function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEnt
|
|
|
239
240
|
return { totalUncompressed: 0, entryCount: 0 };
|
|
240
241
|
}
|
|
241
242
|
}
|
|
243
|
+
function stripDtd(xml) {
|
|
244
|
+
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
245
|
+
}
|
|
242
246
|
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
243
247
|
function sanitizeHref(href) {
|
|
244
248
|
const trimmed = href.trim();
|
|
245
249
|
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
246
250
|
return trimmed;
|
|
247
251
|
}
|
|
252
|
+
function safeMin(arr) {
|
|
253
|
+
let min = Infinity;
|
|
254
|
+
for (let i = 0; i < arr.length; i++) if (arr[i] < min) min = arr[i];
|
|
255
|
+
return min;
|
|
256
|
+
}
|
|
257
|
+
function safeMax(arr) {
|
|
258
|
+
let max = -Infinity;
|
|
259
|
+
for (let i = 0; i < arr.length; i++) if (arr[i] > max) max = arr[i];
|
|
260
|
+
return max;
|
|
261
|
+
}
|
|
248
262
|
function classifyError(err) {
|
|
249
263
|
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
250
264
|
const msg = err.message;
|
|
@@ -319,6 +333,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
319
333
|
if (end > maxCols) maxCols = end;
|
|
320
334
|
}
|
|
321
335
|
}
|
|
336
|
+
if (maxCols > MAX_COLS) maxCols = MAX_COLS;
|
|
322
337
|
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
323
338
|
const grid = Array.from(
|
|
324
339
|
{ length: numRows },
|
|
@@ -328,7 +343,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
328
343
|
for (const cell of row) {
|
|
329
344
|
const r = cell.rowAddr ?? 0;
|
|
330
345
|
const c = cell.colAddr ?? 0;
|
|
331
|
-
if (r >= numRows || c >= maxCols) continue;
|
|
346
|
+
if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
|
|
332
347
|
grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
|
|
333
348
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
334
349
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
@@ -513,9 +528,6 @@ function tableToMarkdown(table) {
|
|
|
513
528
|
if (dr === 0 && dc === 0) continue;
|
|
514
529
|
if (r + dr < numRows && c + dc < numCols) {
|
|
515
530
|
skip.add(`${r + dr},${c + dc}`);
|
|
516
|
-
if (dr === 0) {
|
|
517
|
-
display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
|
|
518
|
-
}
|
|
519
531
|
}
|
|
520
532
|
}
|
|
521
533
|
}
|
|
@@ -651,9 +663,6 @@ function parseStyleElements(doc, map) {
|
|
|
651
663
|
}
|
|
652
664
|
}
|
|
653
665
|
}
|
|
654
|
-
function stripDtd(xml) {
|
|
655
|
-
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
656
|
-
}
|
|
657
666
|
async function parseHwpxDocument(buffer, options) {
|
|
658
667
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
659
668
|
let zip;
|
|
@@ -1003,7 +1012,8 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
1003
1012
|
if (newTable.rows.length > 0) {
|
|
1004
1013
|
if (tableStack.length > 0) {
|
|
1005
1014
|
const parentTable = tableStack.pop();
|
|
1006
|
-
|
|
1015
|
+
let nestedCols = 0;
|
|
1016
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
1007
1017
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
1008
1018
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
1009
1019
|
} else {
|
|
@@ -1112,7 +1122,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1112
1122
|
if (newTable.rows.length > 0) {
|
|
1113
1123
|
if (tableStack.length > 0) {
|
|
1114
1124
|
const parentTable = tableStack.pop();
|
|
1115
|
-
|
|
1125
|
+
let nestedCols = 0;
|
|
1126
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
1116
1127
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
1117
1128
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
1118
1129
|
} else {
|
|
@@ -2210,6 +2221,7 @@ function parseLenientCfb(data) {
|
|
|
2210
2221
|
if (miniSectorSizeShift > 16) throw new Error("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uBBF8\uB2C8 \uC139\uD130 \uD06C\uAE30 \uC2DC\uD504\uD2B8: " + miniSectorSizeShift);
|
|
2211
2222
|
const miniSectorSize = 1 << miniSectorSizeShift;
|
|
2212
2223
|
const fatSectorCount = data.readUInt32LE(44);
|
|
2224
|
+
if (fatSectorCount > 1e4) throw new Error("FAT \uC139\uD130 \uC218\uAC00 \uB108\uBB34 \uB9CE\uC2B5\uB2C8\uB2E4: " + fatSectorCount);
|
|
2213
2225
|
const firstDirSector = data.readUInt32LE(48);
|
|
2214
2226
|
const miniStreamCutoff = data.readUInt32LE(56);
|
|
2215
2227
|
const firstMiniFatSector = data.readUInt32LE(60);
|
|
@@ -2586,10 +2598,14 @@ function findSections(cfb) {
|
|
|
2586
2598
|
}
|
|
2587
2599
|
function findSectionsLenient(lcfb, compressed) {
|
|
2588
2600
|
const sections = [];
|
|
2601
|
+
let totalDecompressed = 0;
|
|
2589
2602
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2590
2603
|
const raw = lcfb.findStream(`/BodyText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2591
2604
|
if (!raw) break;
|
|
2592
|
-
|
|
2605
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2606
|
+
totalDecompressed += content.length;
|
|
2607
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2608
|
+
sections.push({ idx: i, content });
|
|
2593
2609
|
}
|
|
2594
2610
|
if (sections.length === 0) {
|
|
2595
2611
|
for (const e of lcfb.entries()) {
|
|
@@ -2597,7 +2613,12 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2597
2613
|
if (e.name.startsWith("Section")) {
|
|
2598
2614
|
const idx = parseInt(e.name.replace("Section", ""), 10) || 0;
|
|
2599
2615
|
const raw = lcfb.findStream(e.name);
|
|
2600
|
-
if (raw)
|
|
2616
|
+
if (raw) {
|
|
2617
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2618
|
+
totalDecompressed += content.length;
|
|
2619
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2620
|
+
sections.push({ idx, content });
|
|
2621
|
+
}
|
|
2601
2622
|
}
|
|
2602
2623
|
}
|
|
2603
2624
|
}
|
|
@@ -2605,11 +2626,15 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2605
2626
|
}
|
|
2606
2627
|
function findViewTextSectionsLenient(lcfb, compressed) {
|
|
2607
2628
|
const sections = [];
|
|
2629
|
+
let totalDecompressed = 0;
|
|
2608
2630
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2609
2631
|
const raw = lcfb.findStream(`/ViewText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2610
2632
|
if (!raw) break;
|
|
2611
2633
|
try {
|
|
2612
|
-
|
|
2634
|
+
const content = decryptViewText(raw, compressed);
|
|
2635
|
+
totalDecompressed += content.length;
|
|
2636
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2637
|
+
sections.push({ idx: i, content });
|
|
2613
2638
|
} catch {
|
|
2614
2639
|
break;
|
|
2615
2640
|
}
|
|
@@ -3011,37 +3036,18 @@ init_page_range();
|
|
|
3011
3036
|
// src/pdf/line-detector.ts
|
|
3012
3037
|
var import_pdf = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
3013
3038
|
var ORIENTATION_TOL = 2;
|
|
3014
|
-
var MIN_LINE_LENGTH =
|
|
3015
|
-
var
|
|
3039
|
+
var MIN_LINE_LENGTH = 15;
|
|
3040
|
+
var MAX_LINE_WIDTH = 5;
|
|
3016
3041
|
var CONNECT_TOL = 5;
|
|
3017
3042
|
var CELL_PADDING = 2;
|
|
3018
|
-
var
|
|
3019
|
-
var
|
|
3020
|
-
|
|
3021
|
-
|
|
3022
|
-
m1[0] * m2[0] + m1[2] * m2[1],
|
|
3023
|
-
m1[1] * m2[0] + m1[3] * m2[1],
|
|
3024
|
-
m1[0] * m2[2] + m1[2] * m2[3],
|
|
3025
|
-
m1[1] * m2[2] + m1[3] * m2[3],
|
|
3026
|
-
m1[0] * m2[4] + m1[2] * m2[5] + m1[4],
|
|
3027
|
-
m1[1] * m2[4] + m1[3] * m2[5] + m1[5]
|
|
3028
|
-
];
|
|
3029
|
-
}
|
|
3030
|
-
function matTransformPoint(m, x, y) {
|
|
3031
|
-
return [m[0] * x + m[2] * y + m[4], m[1] * x + m[3] * y + m[5]];
|
|
3032
|
-
}
|
|
3033
|
-
function matScale(m) {
|
|
3034
|
-
return Math.max(
|
|
3035
|
-
Math.sqrt(m[1] * m[1] + m[3] * m[3]),
|
|
3036
|
-
Math.sqrt(m[0] * m[0] + m[2] * m[2])
|
|
3037
|
-
);
|
|
3038
|
-
}
|
|
3043
|
+
var MIN_COL_WIDTH = 15;
|
|
3044
|
+
var MIN_ROW_HEIGHT = 6;
|
|
3045
|
+
var VERTEX_MERGE_FACTOR = 4;
|
|
3046
|
+
var MIN_COORD_MERGE_TOL = 8;
|
|
3039
3047
|
function extractLines(fnArray, argsArray) {
|
|
3040
3048
|
const horizontals = [];
|
|
3041
3049
|
const verticals = [];
|
|
3042
|
-
let ctm = [...IDENTITY];
|
|
3043
3050
|
let lineWidth = 1;
|
|
3044
|
-
const stateStack = [];
|
|
3045
3051
|
let currentPath = [];
|
|
3046
3052
|
let pathStartX = 0, pathStartY = 0;
|
|
3047
3053
|
let curX = 0, curY = 0;
|
|
@@ -3059,53 +3065,13 @@ function extractLines(fnArray, argsArray) {
|
|
|
3059
3065
|
);
|
|
3060
3066
|
}
|
|
3061
3067
|
}
|
|
3062
|
-
function
|
|
3063
|
-
if (
|
|
3064
|
-
const first = path[0], last = path[path.length - 1];
|
|
3065
|
-
const closed = Math.abs(first.x1 - last.x2) < 1 && Math.abs(first.y1 - last.y2) < 1;
|
|
3066
|
-
if (!closed) return false;
|
|
3067
|
-
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
3068
|
-
for (const seg of path) {
|
|
3069
|
-
minX = Math.min(minX, seg.x1, seg.x2);
|
|
3070
|
-
minY = Math.min(minY, seg.y1, seg.y2);
|
|
3071
|
-
maxX = Math.max(maxX, seg.x1, seg.x2);
|
|
3072
|
-
maxY = Math.max(maxY, seg.y1, seg.y2);
|
|
3073
|
-
}
|
|
3074
|
-
const w = maxX - minX, h = maxY - minY;
|
|
3075
|
-
if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;
|
|
3076
|
-
path.length = 0;
|
|
3077
|
-
if (h < ORIENTATION_TOL * 2 || w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH) {
|
|
3078
|
-
path.push({ x1: minX, y1: (minY + maxY) / 2, x2: maxX, y2: (minY + maxY) / 2 });
|
|
3079
|
-
} else if (w < ORIENTATION_TOL * 2 || h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH) {
|
|
3080
|
-
path.push({ x1: (minX + maxX) / 2, y1: minY, x2: (minX + maxX) / 2, y2: maxY });
|
|
3081
|
-
} else {
|
|
3082
|
-
pushRectangle(path, minX, minY, w, h);
|
|
3083
|
-
}
|
|
3084
|
-
return true;
|
|
3085
|
-
}
|
|
3086
|
-
function flushPath(isStroke, isFill) {
|
|
3087
|
-
if (!isStroke && !isFill) {
|
|
3088
|
-
currentPath = [];
|
|
3089
|
-
return;
|
|
3090
|
-
}
|
|
3091
|
-
if (isFill && !isStroke && currentPath.length >= 3) {
|
|
3092
|
-
tryConvertLinesToRectangle(currentPath);
|
|
3093
|
-
}
|
|
3094
|
-
const scale = matScale(ctm);
|
|
3095
|
-
const effectiveLW = lineWidth * scale;
|
|
3096
|
-
if (effectiveLW > MAX_LINE_WIDTH && isStroke && !isFill) {
|
|
3068
|
+
function flushPath(isStroke) {
|
|
3069
|
+
if (!isStroke) {
|
|
3097
3070
|
currentPath = [];
|
|
3098
3071
|
return;
|
|
3099
3072
|
}
|
|
3100
3073
|
for (const seg of currentPath) {
|
|
3101
|
-
|
|
3102
|
-
const [px2, py2] = matTransformPoint(ctm, seg.x2, seg.y2);
|
|
3103
|
-
classifyAndAdd(
|
|
3104
|
-
{ x1: px1, y1: py1, x2: px2, y2: py2 },
|
|
3105
|
-
effectiveLW,
|
|
3106
|
-
horizontals,
|
|
3107
|
-
verticals
|
|
3108
|
-
);
|
|
3074
|
+
classifyAndAdd(seg, lineWidth, horizontals, verticals);
|
|
3109
3075
|
}
|
|
3110
3076
|
currentPath = [];
|
|
3111
3077
|
}
|
|
@@ -3113,28 +3079,9 @@ function extractLines(fnArray, argsArray) {
|
|
|
3113
3079
|
const op = fnArray[i];
|
|
3114
3080
|
const args = argsArray[i];
|
|
3115
3081
|
switch (op) {
|
|
3116
|
-
// ── Graphics State ──
|
|
3117
|
-
case import_pdf.OPS.save:
|
|
3118
|
-
stateStack.push({ ctm: [...ctm], lineWidth });
|
|
3119
|
-
break;
|
|
3120
|
-
case import_pdf.OPS.restore:
|
|
3121
|
-
if (stateStack.length > 0) {
|
|
3122
|
-
const state = stateStack.pop();
|
|
3123
|
-
ctm = state.ctm;
|
|
3124
|
-
lineWidth = state.lineWidth;
|
|
3125
|
-
}
|
|
3126
|
-
break;
|
|
3127
|
-
case import_pdf.OPS.transform: {
|
|
3128
|
-
const m = args;
|
|
3129
|
-
if (m.length >= 6) {
|
|
3130
|
-
ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
|
|
3131
|
-
}
|
|
3132
|
-
break;
|
|
3133
|
-
}
|
|
3134
3082
|
case import_pdf.OPS.setLineWidth:
|
|
3135
3083
|
lineWidth = args[0] || 1;
|
|
3136
3084
|
break;
|
|
3137
|
-
// ── Path Construction ──
|
|
3138
3085
|
case import_pdf.OPS.constructPath: {
|
|
3139
3086
|
const arg0 = args[0];
|
|
3140
3087
|
if (Array.isArray(arg0)) {
|
|
@@ -3202,60 +3149,34 @@ function extractLines(fnArray, argsArray) {
|
|
|
3202
3149
|
}
|
|
3203
3150
|
}
|
|
3204
3151
|
}
|
|
3205
|
-
|
|
3206
|
-
|
|
3207
|
-
|
|
3208
|
-
|
|
3209
|
-
flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
|
|
3152
|
+
if (afterOp === import_pdf.OPS.stroke || afterOp === import_pdf.OPS.closeStroke) {
|
|
3153
|
+
flushPath(true);
|
|
3154
|
+
} else if (afterOp === import_pdf.OPS.fill || afterOp === import_pdf.OPS.eoFill || afterOp === import_pdf.OPS.fillStroke || afterOp === import_pdf.OPS.eoFillStroke || afterOp === import_pdf.OPS.closeFillStroke || afterOp === import_pdf.OPS.closeEOFillStroke) {
|
|
3155
|
+
flushPath(true);
|
|
3210
3156
|
} else if (afterOp === import_pdf.OPS.endPath) {
|
|
3211
|
-
flushPath(false
|
|
3157
|
+
flushPath(false);
|
|
3212
3158
|
}
|
|
3213
3159
|
}
|
|
3214
3160
|
break;
|
|
3215
3161
|
}
|
|
3216
|
-
// ── Paint Operations ──
|
|
3217
3162
|
case import_pdf.OPS.stroke:
|
|
3218
3163
|
case import_pdf.OPS.closeStroke:
|
|
3219
|
-
flushPath(true
|
|
3164
|
+
flushPath(true);
|
|
3220
3165
|
break;
|
|
3221
3166
|
case import_pdf.OPS.fill:
|
|
3222
3167
|
case import_pdf.OPS.eoFill:
|
|
3223
|
-
flushPath(false, true);
|
|
3224
|
-
break;
|
|
3225
3168
|
case import_pdf.OPS.fillStroke:
|
|
3226
3169
|
case import_pdf.OPS.eoFillStroke:
|
|
3227
3170
|
case import_pdf.OPS.closeFillStroke:
|
|
3228
3171
|
case import_pdf.OPS.closeEOFillStroke:
|
|
3229
|
-
flushPath(true
|
|
3172
|
+
flushPath(true);
|
|
3230
3173
|
break;
|
|
3231
3174
|
case import_pdf.OPS.endPath:
|
|
3232
|
-
flushPath(false
|
|
3233
|
-
break;
|
|
3234
|
-
}
|
|
3235
|
-
}
|
|
3236
|
-
return {
|
|
3237
|
-
horizontals: deduplicateLines(horizontals),
|
|
3238
|
-
verticals: deduplicateLines(verticals)
|
|
3239
|
-
};
|
|
3240
|
-
}
|
|
3241
|
-
function deduplicateLines(lines) {
|
|
3242
|
-
if (lines.length <= 1) return lines;
|
|
3243
|
-
const result = [];
|
|
3244
|
-
const tol = COORD_MERGE_TOL;
|
|
3245
|
-
for (const line of lines) {
|
|
3246
|
-
let isDuplicate = false;
|
|
3247
|
-
for (const existing of result) {
|
|
3248
|
-
if (Math.abs(line.y1 - existing.y1) <= tol && Math.abs(line.y2 - existing.y2) <= tol && Math.abs(line.x1 - existing.x1) <= tol && Math.abs(line.x2 - existing.x2) <= tol) {
|
|
3249
|
-
if (line.lineWidth > existing.lineWidth) {
|
|
3250
|
-
existing.lineWidth = line.lineWidth;
|
|
3251
|
-
}
|
|
3252
|
-
isDuplicate = true;
|
|
3175
|
+
flushPath(false);
|
|
3253
3176
|
break;
|
|
3254
|
-
}
|
|
3255
3177
|
}
|
|
3256
|
-
if (!isDuplicate) result.push(line);
|
|
3257
3178
|
}
|
|
3258
|
-
return
|
|
3179
|
+
return { horizontals, verticals };
|
|
3259
3180
|
}
|
|
3260
3181
|
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
3261
3182
|
const dx = Math.abs(seg.x2 - seg.x1);
|
|
@@ -3274,6 +3195,55 @@ function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
|
3274
3195
|
verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
|
|
3275
3196
|
}
|
|
3276
3197
|
}
|
|
3198
|
+
function preprocessLines(horizontals, verticals) {
|
|
3199
|
+
let h = horizontals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3200
|
+
let v = verticals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3201
|
+
h = mergeParallelLines(h, "h");
|
|
3202
|
+
v = mergeParallelLines(v, "v");
|
|
3203
|
+
return { horizontals: h, verticals: v };
|
|
3204
|
+
}
|
|
3205
|
+
function mergeParallelLines(lines, dir) {
|
|
3206
|
+
if (lines.length <= 1) return lines;
|
|
3207
|
+
const sorted = [...lines].sort((a, b) => {
|
|
3208
|
+
const posA = dir === "h" ? a.y1 : a.x1;
|
|
3209
|
+
const posB = dir === "h" ? b.y1 : b.x1;
|
|
3210
|
+
if (Math.abs(posA - posB) > 0.1) return posA - posB;
|
|
3211
|
+
return dir === "h" ? a.x1 - b.x1 : a.y1 - b.y1;
|
|
3212
|
+
});
|
|
3213
|
+
const MERGE_TOL = 3;
|
|
3214
|
+
const result = [sorted[0]];
|
|
3215
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
3216
|
+
const prev = result[result.length - 1];
|
|
3217
|
+
const curr = sorted[i];
|
|
3218
|
+
const prevPos = dir === "h" ? prev.y1 : prev.x1;
|
|
3219
|
+
const currPos = dir === "h" ? curr.y1 : curr.x1;
|
|
3220
|
+
if (Math.abs(prevPos - currPos) <= MERGE_TOL) {
|
|
3221
|
+
const prevStart = dir === "h" ? prev.x1 : prev.y1;
|
|
3222
|
+
const prevEnd = dir === "h" ? prev.x2 : prev.y2;
|
|
3223
|
+
const currStart = dir === "h" ? curr.x1 : curr.y1;
|
|
3224
|
+
const currEnd = dir === "h" ? curr.x2 : curr.y2;
|
|
3225
|
+
const overlap = Math.min(prevEnd, currEnd) - Math.max(prevStart, currStart);
|
|
3226
|
+
const minLen = Math.min(prevEnd - prevStart, currEnd - currStart);
|
|
3227
|
+
if (overlap > minLen * 0.3) {
|
|
3228
|
+
if (dir === "h") {
|
|
3229
|
+
prev.x1 = Math.min(prev.x1, curr.x1);
|
|
3230
|
+
prev.x2 = Math.max(prev.x2, curr.x2);
|
|
3231
|
+
prev.y1 = (prev.y1 + curr.y1) / 2;
|
|
3232
|
+
prev.y2 = prev.y1;
|
|
3233
|
+
} else {
|
|
3234
|
+
prev.y1 = Math.min(prev.y1, curr.y1);
|
|
3235
|
+
prev.y2 = Math.max(prev.y2, curr.y2);
|
|
3236
|
+
prev.x1 = (prev.x1 + curr.x1) / 2;
|
|
3237
|
+
prev.x2 = prev.x1;
|
|
3238
|
+
}
|
|
3239
|
+
prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
|
|
3240
|
+
continue;
|
|
3241
|
+
}
|
|
3242
|
+
}
|
|
3243
|
+
result.push(curr);
|
|
3244
|
+
}
|
|
3245
|
+
return result;
|
|
3246
|
+
}
|
|
3277
3247
|
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
3278
3248
|
const margin = 5;
|
|
3279
3249
|
return {
|
|
@@ -3285,8 +3255,49 @@ function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
|
3285
3255
|
)
|
|
3286
3256
|
};
|
|
3287
3257
|
}
|
|
3258
|
+
function buildVertices(horizontals, verticals) {
|
|
3259
|
+
const vertices = [];
|
|
3260
|
+
const tol = CONNECT_TOL;
|
|
3261
|
+
for (const h of horizontals) {
|
|
3262
|
+
for (const v of verticals) {
|
|
3263
|
+
if (v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol && h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol) {
|
|
3264
|
+
const radius = Math.max(h.lineWidth, v.lineWidth, 1);
|
|
3265
|
+
vertices.push({ x: v.x1, y: h.y1, radius });
|
|
3266
|
+
}
|
|
3267
|
+
}
|
|
3268
|
+
}
|
|
3269
|
+
return vertices;
|
|
3270
|
+
}
|
|
3271
|
+
function mergeVertices(vertices) {
|
|
3272
|
+
if (vertices.length <= 1) return vertices;
|
|
3273
|
+
const merged = [];
|
|
3274
|
+
const used = new Array(vertices.length).fill(false);
|
|
3275
|
+
for (let i = 0; i < vertices.length; i++) {
|
|
3276
|
+
if (used[i]) continue;
|
|
3277
|
+
let sumX = vertices[i].x, sumY = vertices[i].y;
|
|
3278
|
+
let maxRadius = vertices[i].radius;
|
|
3279
|
+
let count = 1;
|
|
3280
|
+
for (let j = i + 1; j < vertices.length; j++) {
|
|
3281
|
+
if (used[j]) continue;
|
|
3282
|
+
const mergeTol = VERTEX_MERGE_FACTOR * Math.max(maxRadius, vertices[j].radius);
|
|
3283
|
+
if (Math.abs(vertices[i].x - vertices[j].x) <= mergeTol && Math.abs(vertices[i].y - vertices[j].y) <= mergeTol) {
|
|
3284
|
+
sumX += vertices[j].x;
|
|
3285
|
+
sumY += vertices[j].y;
|
|
3286
|
+
maxRadius = Math.max(maxRadius, vertices[j].radius);
|
|
3287
|
+
count++;
|
|
3288
|
+
used[j] = true;
|
|
3289
|
+
}
|
|
3290
|
+
}
|
|
3291
|
+
merged.push({ x: sumX / count, y: sumY / count, radius: maxRadius });
|
|
3292
|
+
}
|
|
3293
|
+
return merged;
|
|
3294
|
+
}
|
|
3288
3295
|
function buildTableGrids(horizontals, verticals) {
|
|
3289
3296
|
if (horizontals.length < 2 || verticals.length < 2) return [];
|
|
3297
|
+
const allVertices = buildVertices(horizontals, verticals);
|
|
3298
|
+
const vertices = mergeVertices(allVertices);
|
|
3299
|
+
if (vertices.length < 4) return [];
|
|
3300
|
+
const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
|
|
3290
3301
|
const allLines = [
|
|
3291
3302
|
...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
|
|
3292
3303
|
...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
|
|
@@ -3297,21 +3308,74 @@ function buildTableGrids(horizontals, verticals) {
|
|
|
3297
3308
|
const hLines = group.filter((l) => l.type === "h");
|
|
3298
3309
|
const vLines = group.filter((l) => l.type === "v");
|
|
3299
3310
|
if (hLines.length < 2 || vLines.length < 2) continue;
|
|
3300
|
-
|
|
3301
|
-
const
|
|
3302
|
-
|
|
3303
|
-
|
|
3311
|
+
let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
|
|
3312
|
+
for (const l of vLines) {
|
|
3313
|
+
if (l.x1 < gx1) gx1 = l.x1;
|
|
3314
|
+
if (l.x1 > gx2) gx2 = l.x1;
|
|
3315
|
+
}
|
|
3316
|
+
for (const l of hLines) {
|
|
3317
|
+
if (l.y1 < gy1) gy1 = l.y1;
|
|
3318
|
+
if (l.y1 > gy2) gy2 = l.y1;
|
|
3319
|
+
}
|
|
3320
|
+
const groupBbox = {
|
|
3321
|
+
x1: gx1 - CONNECT_TOL,
|
|
3322
|
+
y1: gy1 - CONNECT_TOL,
|
|
3323
|
+
x2: gx2 + CONNECT_TOL,
|
|
3324
|
+
y2: gy2 + CONNECT_TOL
|
|
3325
|
+
};
|
|
3326
|
+
const groupVertices = vertices.filter(
|
|
3327
|
+
(v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
|
|
3328
|
+
);
|
|
3329
|
+
const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
|
|
3330
|
+
const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
|
|
3331
|
+
const rawYs = [
|
|
3332
|
+
...hLines.map((l) => l.y1),
|
|
3333
|
+
...groupVertices.map((v) => v.y)
|
|
3334
|
+
];
|
|
3335
|
+
const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
|
|
3336
|
+
const rawXs = [
|
|
3337
|
+
...vLines.map((l) => l.x1),
|
|
3338
|
+
...groupVertices.map((v) => v.x)
|
|
3339
|
+
];
|
|
3340
|
+
const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
|
|
3304
3341
|
if (rowYs.length < 2 || colXs.length < 2) continue;
|
|
3342
|
+
const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
|
|
3343
|
+
const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
|
|
3344
|
+
if (validRowYs.length < 2 || validColXs.length < 2) continue;
|
|
3305
3345
|
const bbox = {
|
|
3306
|
-
x1:
|
|
3307
|
-
y1:
|
|
3308
|
-
x2:
|
|
3309
|
-
y2:
|
|
3346
|
+
x1: validColXs[0],
|
|
3347
|
+
y1: validRowYs[validRowYs.length - 1],
|
|
3348
|
+
x2: validColXs[validColXs.length - 1],
|
|
3349
|
+
y2: validRowYs[0]
|
|
3310
3350
|
};
|
|
3311
|
-
grids.push({ rowYs, colXs, bbox });
|
|
3351
|
+
grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
|
|
3312
3352
|
}
|
|
3313
3353
|
return mergeAdjacentGrids(grids);
|
|
3314
3354
|
}
|
|
3355
|
+
function enforceMinWidth(colXs, minWidth) {
|
|
3356
|
+
if (colXs.length <= 2) return colXs;
|
|
3357
|
+
const result = [colXs[0]];
|
|
3358
|
+
for (let i = 1; i < colXs.length; i++) {
|
|
3359
|
+
const prevX = result[result.length - 1];
|
|
3360
|
+
if (colXs[i] - prevX < minWidth && i < colXs.length - 1) {
|
|
3361
|
+
continue;
|
|
3362
|
+
}
|
|
3363
|
+
result.push(colXs[i]);
|
|
3364
|
+
}
|
|
3365
|
+
return result;
|
|
3366
|
+
}
|
|
3367
|
+
function enforceMinHeight(rowYs, minHeight) {
|
|
3368
|
+
if (rowYs.length <= 2) return rowYs;
|
|
3369
|
+
const result = [rowYs[0]];
|
|
3370
|
+
for (let i = 1; i < rowYs.length; i++) {
|
|
3371
|
+
const prevY = result[result.length - 1];
|
|
3372
|
+
if (prevY - rowYs[i] < minHeight && i < rowYs.length - 1) {
|
|
3373
|
+
continue;
|
|
3374
|
+
}
|
|
3375
|
+
result.push(rowYs[i]);
|
|
3376
|
+
}
|
|
3377
|
+
return result;
|
|
3378
|
+
}
|
|
3315
3379
|
function mergeAdjacentGrids(grids) {
|
|
3316
3380
|
if (grids.length <= 1) return grids;
|
|
3317
3381
|
const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
@@ -3320,9 +3384,10 @@ function mergeAdjacentGrids(grids) {
|
|
|
3320
3384
|
const prev = merged[merged.length - 1];
|
|
3321
3385
|
const curr = sorted[i];
|
|
3322
3386
|
if (prev.colXs.length === curr.colXs.length) {
|
|
3323
|
-
const
|
|
3387
|
+
const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
|
|
3388
|
+
const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
|
|
3324
3389
|
const verticalGap = prev.bbox.y1 - curr.bbox.y2;
|
|
3325
|
-
if (colMatch && verticalGap >= -
|
|
3390
|
+
if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
|
|
3326
3391
|
const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
|
|
3327
3392
|
merged[merged.length - 1] = {
|
|
3328
3393
|
rowYs: allRowYs,
|
|
@@ -3332,7 +3397,8 @@ function mergeAdjacentGrids(grids) {
|
|
|
3332
3397
|
y1: Math.min(prev.bbox.y1, curr.bbox.y1),
|
|
3333
3398
|
x2: Math.max(prev.bbox.x2, curr.bbox.x2),
|
|
3334
3399
|
y2: Math.max(prev.bbox.y2, curr.bbox.y2)
|
|
3335
|
-
}
|
|
3400
|
+
},
|
|
3401
|
+
vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
|
|
3336
3402
|
};
|
|
3337
3403
|
continue;
|
|
3338
3404
|
}
|
|
@@ -3341,14 +3407,14 @@ function mergeAdjacentGrids(grids) {
|
|
|
3341
3407
|
}
|
|
3342
3408
|
return merged;
|
|
3343
3409
|
}
|
|
3344
|
-
function clusterCoordinates(values) {
|
|
3410
|
+
function clusterCoordinates(values, tolerance) {
|
|
3345
3411
|
if (values.length === 0) return [];
|
|
3346
3412
|
const sorted = [...values].sort((a, b) => a - b);
|
|
3347
3413
|
const clusters = [{ sum: sorted[0], count: 1 }];
|
|
3348
3414
|
for (let i = 1; i < sorted.length; i++) {
|
|
3349
3415
|
const last = clusters[clusters.length - 1];
|
|
3350
3416
|
const avg = last.sum / last.count;
|
|
3351
|
-
if (Math.abs(sorted[i] - avg) <=
|
|
3417
|
+
if (Math.abs(sorted[i] - avg) <= tolerance) {
|
|
3352
3418
|
last.sum += sorted[i];
|
|
3353
3419
|
last.count++;
|
|
3354
3420
|
} else {
|
|
@@ -3405,6 +3471,20 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3405
3471
|
const numRows = rowYs.length - 1;
|
|
3406
3472
|
const numCols = colXs.length - 1;
|
|
3407
3473
|
if (numRows <= 0 || numCols <= 0) return [];
|
|
3474
|
+
const vBorders = Array.from(
|
|
3475
|
+
{ length: numRows },
|
|
3476
|
+
(_, r) => Array.from(
|
|
3477
|
+
{ length: numCols + 1 },
|
|
3478
|
+
(_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
|
|
3479
|
+
)
|
|
3480
|
+
);
|
|
3481
|
+
const hBorders = Array.from(
|
|
3482
|
+
{ length: numRows + 1 },
|
|
3483
|
+
(_, r) => Array.from(
|
|
3484
|
+
{ length: numCols },
|
|
3485
|
+
(_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
|
|
3486
|
+
)
|
|
3487
|
+
);
|
|
3408
3488
|
const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
|
|
3409
3489
|
const cells = [];
|
|
3410
3490
|
for (let r = 0; r < numRows; r++) {
|
|
@@ -3412,18 +3492,26 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3412
3492
|
if (occupied[r][c]) continue;
|
|
3413
3493
|
let colSpan = 1;
|
|
3414
3494
|
let rowSpan = 1;
|
|
3415
|
-
while (c + colSpan < numCols) {
|
|
3416
|
-
|
|
3417
|
-
|
|
3418
|
-
|
|
3419
|
-
|
|
3495
|
+
while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
|
|
3496
|
+
let canExpand = true;
|
|
3497
|
+
for (let dr = 0; dr < rowSpan; dr++) {
|
|
3498
|
+
if (vBorders[r + dr][c + colSpan]) {
|
|
3499
|
+
canExpand = false;
|
|
3500
|
+
break;
|
|
3501
|
+
}
|
|
3502
|
+
}
|
|
3503
|
+
if (!canExpand) break;
|
|
3420
3504
|
colSpan++;
|
|
3421
3505
|
}
|
|
3422
3506
|
while (r + rowSpan < numRows) {
|
|
3423
|
-
|
|
3424
|
-
|
|
3425
|
-
|
|
3426
|
-
|
|
3507
|
+
let hasLine = false;
|
|
3508
|
+
for (let dc = 0; dc < colSpan; dc++) {
|
|
3509
|
+
if (hBorders[r + rowSpan][c + dc]) {
|
|
3510
|
+
hasLine = true;
|
|
3511
|
+
break;
|
|
3512
|
+
}
|
|
3513
|
+
}
|
|
3514
|
+
if (hasLine) break;
|
|
3427
3515
|
rowSpan++;
|
|
3428
3516
|
}
|
|
3429
3517
|
for (let dr = 0; dr < rowSpan; dr++) {
|
|
@@ -3447,28 +3535,30 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3447
3535
|
}
|
|
3448
3536
|
return cells;
|
|
3449
3537
|
}
|
|
3450
|
-
function hasVerticalLine(verticals, x, topY, botY) {
|
|
3451
|
-
const tol =
|
|
3538
|
+
function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
|
|
3539
|
+
const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
|
|
3452
3540
|
for (const v of verticals) {
|
|
3453
3541
|
if (Math.abs(v.x1 - x) <= tol) {
|
|
3454
3542
|
const cellH = Math.abs(topY - botY);
|
|
3543
|
+
if (cellH < 0.1) continue;
|
|
3455
3544
|
const overlapTop = Math.min(v.y2, topY);
|
|
3456
3545
|
const overlapBot = Math.max(v.y1, botY);
|
|
3457
3546
|
const overlap = overlapTop - overlapBot;
|
|
3458
|
-
if (overlap >= cellH * 0.
|
|
3547
|
+
if (overlap >= cellH * 0.75) return true;
|
|
3459
3548
|
}
|
|
3460
3549
|
}
|
|
3461
3550
|
return false;
|
|
3462
3551
|
}
|
|
3463
|
-
function hasHorizontalLine(horizontals, y, leftX, rightX) {
|
|
3464
|
-
const tol =
|
|
3552
|
+
function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
|
|
3553
|
+
const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
|
|
3465
3554
|
for (const h of horizontals) {
|
|
3466
3555
|
if (Math.abs(h.y1 - y) <= tol) {
|
|
3467
3556
|
const cellW = Math.abs(rightX - leftX);
|
|
3557
|
+
if (cellW < 0.1) continue;
|
|
3468
3558
|
const overlapLeft = Math.max(h.x1, leftX);
|
|
3469
3559
|
const overlapRight = Math.min(h.x2, rightX);
|
|
3470
3560
|
const overlap = overlapRight - overlapLeft;
|
|
3471
|
-
if (overlap >= cellW * 0.
|
|
3561
|
+
if (overlap >= cellW * 0.75) return true;
|
|
3472
3562
|
}
|
|
3473
3563
|
}
|
|
3474
3564
|
return false;
|
|
@@ -3479,23 +3569,24 @@ function mapTextToCells(items, cells) {
|
|
|
3479
3569
|
result.set(cell, []);
|
|
3480
3570
|
}
|
|
3481
3571
|
for (const item of items) {
|
|
3482
|
-
const cx = item.x + item.w / 2;
|
|
3483
|
-
const cy = item.y;
|
|
3484
3572
|
const pad = CELL_PADDING;
|
|
3485
3573
|
let bestCell = null;
|
|
3486
|
-
let
|
|
3574
|
+
let bestScore = 0;
|
|
3487
3575
|
for (const cell of cells) {
|
|
3488
|
-
|
|
3489
|
-
|
|
3490
|
-
|
|
3491
|
-
|
|
3492
|
-
|
|
3493
|
-
|
|
3494
|
-
|
|
3495
|
-
|
|
3576
|
+
const ix1 = Math.max(item.x, cell.bbox.x1 - pad);
|
|
3577
|
+
const ix2 = Math.min(item.x + item.w, cell.bbox.x2 + pad);
|
|
3578
|
+
const iy1 = Math.max(item.y, cell.bbox.y1 - pad);
|
|
3579
|
+
const iy2 = Math.min(item.y + (item.h || item.fontSize), cell.bbox.y2 + pad);
|
|
3580
|
+
if (ix1 >= ix2 || iy1 >= iy2) continue;
|
|
3581
|
+
const intersectArea = (ix2 - ix1) * (iy2 - iy1);
|
|
3582
|
+
const itemArea = Math.max(item.w, 1) * Math.max(item.h || item.fontSize, 1);
|
|
3583
|
+
const score = intersectArea / itemArea;
|
|
3584
|
+
if (score > bestScore) {
|
|
3585
|
+
bestScore = score;
|
|
3586
|
+
bestCell = cell;
|
|
3496
3587
|
}
|
|
3497
3588
|
}
|
|
3498
|
-
if (bestCell) {
|
|
3589
|
+
if (bestCell && bestScore > 0.3) {
|
|
3499
3590
|
result.get(bestCell).push(item);
|
|
3500
3591
|
}
|
|
3501
3592
|
}
|
|
@@ -3522,8 +3613,13 @@ function cellTextToString(items) {
|
|
|
3522
3613
|
const textLines = lines.map((line) => {
|
|
3523
3614
|
const s = line.sort((a, b) => a.x - b.x);
|
|
3524
3615
|
if (s.length === 1) return s[0].text;
|
|
3616
|
+
const evenSpaced = detectEvenSpacedItems(s);
|
|
3525
3617
|
let result = s[0].text;
|
|
3526
3618
|
for (let j = 1; j < s.length; j++) {
|
|
3619
|
+
if (evenSpaced[j]) {
|
|
3620
|
+
result += s[j].text;
|
|
3621
|
+
continue;
|
|
3622
|
+
}
|
|
3527
3623
|
const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
|
|
3528
3624
|
const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
|
|
3529
3625
|
const prevIsKorean = /[가-힣]$/.test(result);
|
|
@@ -3538,6 +3634,57 @@ function cellTextToString(items) {
|
|
|
3538
3634
|
}
|
|
3539
3635
|
return result;
|
|
3540
3636
|
});
|
|
3637
|
+
return mergeCellTextLines(textLines);
|
|
3638
|
+
}
|
|
3639
|
+
function detectEvenSpacedItems(items) {
|
|
3640
|
+
const result = new Array(items.length).fill(false);
|
|
3641
|
+
if (items.length < 3) return result;
|
|
3642
|
+
let runStart = -1;
|
|
3643
|
+
for (let i = 0; i < items.length; i++) {
|
|
3644
|
+
const isShortKorean = /^[가-힣]{1}$/.test(items[i].text) || /^[\d]{1}$/.test(items[i].text);
|
|
3645
|
+
if (isShortKorean && runStart >= 0 && i > 0) {
|
|
3646
|
+
const gap = items[i].x - (items[i - 1].x + items[i - 1].w);
|
|
3647
|
+
const maxRunGap = Math.max(items[i].fontSize * 3, 30);
|
|
3648
|
+
if (gap > maxRunGap) {
|
|
3649
|
+
if (i - runStart >= 3) markEvenRun(items, result, runStart, i);
|
|
3650
|
+
runStart = i;
|
|
3651
|
+
continue;
|
|
3652
|
+
}
|
|
3653
|
+
}
|
|
3654
|
+
if (isShortKorean) {
|
|
3655
|
+
if (runStart < 0) runStart = i;
|
|
3656
|
+
} else {
|
|
3657
|
+
if (runStart >= 0 && i - runStart >= 3) {
|
|
3658
|
+
markEvenRun(items, result, runStart, i);
|
|
3659
|
+
}
|
|
3660
|
+
runStart = -1;
|
|
3661
|
+
}
|
|
3662
|
+
}
|
|
3663
|
+
if (runStart >= 0 && items.length - runStart >= 3) {
|
|
3664
|
+
markEvenRun(items, result, runStart, items.length);
|
|
3665
|
+
}
|
|
3666
|
+
return result;
|
|
3667
|
+
}
|
|
3668
|
+
function markEvenRun(items, result, start, end) {
|
|
3669
|
+
const gaps = [];
|
|
3670
|
+
for (let i = start + 1; i < end; i++) {
|
|
3671
|
+
gaps.push(items[i].x - (items[i - 1].x + items[i - 1].w));
|
|
3672
|
+
}
|
|
3673
|
+
const posGaps = gaps.filter((g2) => g2 > 0);
|
|
3674
|
+
if (posGaps.length < 2) return;
|
|
3675
|
+
let minGap = Infinity, maxGap = -Infinity;
|
|
3676
|
+
for (const g2 of posGaps) {
|
|
3677
|
+
if (g2 < minGap) minGap = g2;
|
|
3678
|
+
if (g2 > maxGap) maxGap = g2;
|
|
3679
|
+
}
|
|
3680
|
+
const avgFs = items[start].fontSize;
|
|
3681
|
+
if (minGap >= avgFs * 0.1 && maxGap <= avgFs * 3 && maxGap / Math.max(minGap, 0.1) <= 3) {
|
|
3682
|
+
for (let i = start + 1; i < end; i++) {
|
|
3683
|
+
result[i] = true;
|
|
3684
|
+
}
|
|
3685
|
+
}
|
|
3686
|
+
}
|
|
3687
|
+
function mergeCellTextLines(textLines) {
|
|
3541
3688
|
if (textLines.length <= 1) return textLines[0] || "";
|
|
3542
3689
|
const merged = [textLines[0]];
|
|
3543
3690
|
for (let i = 1; i < textLines.length; i++) {
|
|
@@ -3563,24 +3710,172 @@ var Y_TOL = 3;
|
|
|
3563
3710
|
var COL_CLUSTER_TOL = 15;
|
|
3564
3711
|
var MIN_ROWS = 3;
|
|
3565
3712
|
var MIN_COLS = 2;
|
|
3566
|
-
var MIN_GAP_FACTOR =
|
|
3567
|
-
var
|
|
3713
|
+
var MIN_GAP_FACTOR = 2;
|
|
3714
|
+
var MIN_GAP_ABSOLUTE = 20;
|
|
3715
|
+
var MIN_COL_FILL_RATIO = 0.4;
|
|
3568
3716
|
function detectClusterTables(items, pageNum) {
|
|
3569
3717
|
if (items.length < MIN_ROWS * MIN_COLS) return [];
|
|
3570
|
-
const
|
|
3718
|
+
const { merged, originMap } = mergeEvenSpacedClusters(items);
|
|
3719
|
+
const rows = groupByBaseline(merged);
|
|
3571
3720
|
if (rows.length < MIN_ROWS) return [];
|
|
3572
|
-
const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
|
|
3573
|
-
if (suspiciousRows.length < MIN_ROWS) return [];
|
|
3574
|
-
const columns = extractColumnClusters(suspiciousRows);
|
|
3575
|
-
if (columns.length < MIN_COLS) return [];
|
|
3576
|
-
const tableRegions = findTableRegions(rows, columns);
|
|
3577
3721
|
const results = [];
|
|
3578
|
-
|
|
3579
|
-
|
|
3580
|
-
|
|
3722
|
+
const headerResult = detectHeaderRow(rows);
|
|
3723
|
+
if (headerResult) {
|
|
3724
|
+
const { columns, headerIdx } = headerResult;
|
|
3725
|
+
const headerRow = rows[headerIdx];
|
|
3726
|
+
const headerItems = [...headerRow.items].sort((a, b) => a.x - b.x);
|
|
3727
|
+
const headerAndBelow = rows.slice(headerIdx);
|
|
3728
|
+
const mergedRows = mergeMultiLineRows(headerAndBelow, columns);
|
|
3729
|
+
const tableRegions = findTableRegionsByHeader(mergedRows, columns, headerItems);
|
|
3730
|
+
for (const region of tableRegions) {
|
|
3731
|
+
const table = buildClusterTable(region.rows, columns, pageNum);
|
|
3732
|
+
if (table) {
|
|
3733
|
+
expandUsedItems(table.usedItems, originMap);
|
|
3734
|
+
results.push(table);
|
|
3735
|
+
}
|
|
3736
|
+
}
|
|
3737
|
+
}
|
|
3738
|
+
if (results.length === 0) {
|
|
3739
|
+
const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
|
|
3740
|
+
if (suspiciousRows.length >= MIN_ROWS) {
|
|
3741
|
+
const columns = extractColumnClusters(suspiciousRows);
|
|
3742
|
+
if (columns.length >= MIN_COLS) {
|
|
3743
|
+
const tableRegions = findTableRegions(rows, columns);
|
|
3744
|
+
for (const region of tableRegions) {
|
|
3745
|
+
const mergedRows = mergeMultiLineRows(region.rows, columns);
|
|
3746
|
+
const table = buildClusterTable(mergedRows, columns, pageNum);
|
|
3747
|
+
if (table) {
|
|
3748
|
+
expandUsedItems(table.usedItems, originMap);
|
|
3749
|
+
results.push(table);
|
|
3750
|
+
}
|
|
3751
|
+
}
|
|
3752
|
+
}
|
|
3753
|
+
}
|
|
3581
3754
|
}
|
|
3582
3755
|
return results;
|
|
3583
3756
|
}
|
|
3757
|
+
function mergeEvenSpacedClusters(items) {
|
|
3758
|
+
const originMap = /* @__PURE__ */ new Map();
|
|
3759
|
+
const rows = groupByBaseline(items);
|
|
3760
|
+
const merged = [];
|
|
3761
|
+
for (const row of rows) {
|
|
3762
|
+
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3763
|
+
let i = 0;
|
|
3764
|
+
while (i < sorted.length) {
|
|
3765
|
+
if (/^[가-힣\d]$/.test(sorted[i].text)) {
|
|
3766
|
+
let runEnd = i + 1;
|
|
3767
|
+
while (runEnd < sorted.length && /^[가-힣\d]$/.test(sorted[runEnd].text)) {
|
|
3768
|
+
const gap = sorted[runEnd].x - (sorted[runEnd - 1].x + sorted[runEnd - 1].w);
|
|
3769
|
+
const fs = sorted[runEnd].fontSize;
|
|
3770
|
+
if (gap < fs * 0.1 || gap > fs * 3) break;
|
|
3771
|
+
runEnd++;
|
|
3772
|
+
}
|
|
3773
|
+
if (runEnd - i >= 3) {
|
|
3774
|
+
const gaps = [];
|
|
3775
|
+
for (let g2 = i + 1; g2 < runEnd; g2++) {
|
|
3776
|
+
gaps.push(sorted[g2].x - (sorted[g2 - 1].x + sorted[g2 - 1].w));
|
|
3777
|
+
}
|
|
3778
|
+
let minG = Infinity, maxG = -Infinity;
|
|
3779
|
+
for (const g2 of gaps) {
|
|
3780
|
+
if (g2 < minG) minG = g2;
|
|
3781
|
+
if (g2 > maxG) maxG = g2;
|
|
3782
|
+
}
|
|
3783
|
+
if (minG > 0 && maxG / minG <= 3) {
|
|
3784
|
+
const run = sorted.slice(i, runEnd);
|
|
3785
|
+
const text = run.map((r) => r.text).join("");
|
|
3786
|
+
const first = run[0], last = run[runEnd - i - 1];
|
|
3787
|
+
const item = {
|
|
3788
|
+
text,
|
|
3789
|
+
x: first.x,
|
|
3790
|
+
y: first.y,
|
|
3791
|
+
w: last.x + last.w - first.x,
|
|
3792
|
+
h: first.h,
|
|
3793
|
+
fontSize: first.fontSize,
|
|
3794
|
+
fontName: first.fontName
|
|
3795
|
+
};
|
|
3796
|
+
originMap.set(item, run);
|
|
3797
|
+
merged.push(item);
|
|
3798
|
+
i = runEnd;
|
|
3799
|
+
continue;
|
|
3800
|
+
}
|
|
3801
|
+
}
|
|
3802
|
+
}
|
|
3803
|
+
merged.push(sorted[i]);
|
|
3804
|
+
i++;
|
|
3805
|
+
}
|
|
3806
|
+
}
|
|
3807
|
+
return { merged, originMap };
|
|
3808
|
+
}
|
|
3809
|
+
function expandUsedItems(usedItems, originMap) {
|
|
3810
|
+
const toAdd = [];
|
|
3811
|
+
for (const item of usedItems) {
|
|
3812
|
+
const origins = originMap.get(item);
|
|
3813
|
+
if (origins) for (const o of origins) toAdd.push(o);
|
|
3814
|
+
}
|
|
3815
|
+
for (const a of toAdd) usedItems.add(a);
|
|
3816
|
+
}
|
|
3817
|
+
function detectHeaderRow(rows) {
|
|
3818
|
+
const allItems = rows.flatMap((r) => r.items);
|
|
3819
|
+
if (allItems.length === 0) return null;
|
|
3820
|
+
let allMinX = Infinity, allMaxX = -Infinity;
|
|
3821
|
+
for (const i of allItems) {
|
|
3822
|
+
if (i.x < allMinX) allMinX = i.x;
|
|
3823
|
+
const r = i.x + i.w;
|
|
3824
|
+
if (r > allMaxX) allMaxX = r;
|
|
3825
|
+
}
|
|
3826
|
+
const pageSpan = allMaxX - allMinX;
|
|
3827
|
+
if (pageSpan <= 0) return null;
|
|
3828
|
+
for (let ri = 0; ri < rows.length; ri++) {
|
|
3829
|
+
const row = rows[ri];
|
|
3830
|
+
if (row.items.length < MIN_COLS || row.items.length > 6) continue;
|
|
3831
|
+
if (row.items.some((i) => i.text.length > 8)) continue;
|
|
3832
|
+
if (!row.items.some((i) => /[가-힣]/.test(i.text))) continue;
|
|
3833
|
+
if (row.items.some((i) => /^[□■○●·※▶▷◆◇\-]/.test(i.text))) continue;
|
|
3834
|
+
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3835
|
+
const xSpan = sorted[sorted.length - 1].x + sorted[sorted.length - 1].w - sorted[0].x;
|
|
3836
|
+
if (xSpan / pageSpan < 0.4) continue;
|
|
3837
|
+
const avgFs = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
|
|
3838
|
+
let hasLargeGap = false;
|
|
3839
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
3840
|
+
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
3841
|
+
if (gap >= avgFs * 2.5) {
|
|
3842
|
+
hasLargeGap = true;
|
|
3843
|
+
break;
|
|
3844
|
+
}
|
|
3845
|
+
}
|
|
3846
|
+
if (!hasLargeGap) continue;
|
|
3847
|
+
const columns = sorted.map((item) => ({ x: item.x, count: 0 }));
|
|
3848
|
+
let matchCount = 0;
|
|
3849
|
+
for (let j = ri + 1; j < rows.length && matchCount < MIN_ROWS + 2; j++) {
|
|
3850
|
+
const matched = countMatchedColumnsRange(rows[j], columns, sorted);
|
|
3851
|
+
if (matched >= MIN_COLS) matchCount++;
|
|
3852
|
+
}
|
|
3853
|
+
if (matchCount < MIN_ROWS) continue;
|
|
3854
|
+
return { columns, headerIdx: ri };
|
|
3855
|
+
}
|
|
3856
|
+
return null;
|
|
3857
|
+
}
|
|
3858
|
+
function mergeMultiLineRows(rows, columns) {
|
|
3859
|
+
if (rows.length <= 1) return rows;
|
|
3860
|
+
const result = [rows[0]];
|
|
3861
|
+
const allFontSizes = rows.flatMap((r) => r.items).map((i) => i.fontSize);
|
|
3862
|
+
const avgFontSize = allFontSizes.length > 0 ? allFontSizes.reduce((s, v) => s + v, 0) / allFontSizes.length : 12;
|
|
3863
|
+
for (let i = 1; i < rows.length; i++) {
|
|
3864
|
+
const prev = result[result.length - 1];
|
|
3865
|
+
const curr = rows[i];
|
|
3866
|
+
const yGap = Math.abs(prev.y - curr.y);
|
|
3867
|
+
const matchedCols = countMatchedColumns(curr, columns);
|
|
3868
|
+
if (yGap < avgFontSize * 1.8 && curr.items.length <= 2 && (matchedCols < MIN_COLS || curr.items.length === 1)) {
|
|
3869
|
+
result[result.length - 1] = {
|
|
3870
|
+
y: prev.y,
|
|
3871
|
+
items: [...prev.items, ...curr.items]
|
|
3872
|
+
};
|
|
3873
|
+
} else {
|
|
3874
|
+
result.push(curr);
|
|
3875
|
+
}
|
|
3876
|
+
}
|
|
3877
|
+
return result;
|
|
3878
|
+
}
|
|
3584
3879
|
function groupByBaseline(items) {
|
|
3585
3880
|
if (items.length === 0) return [];
|
|
3586
3881
|
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
@@ -3602,8 +3897,9 @@ function groupByBaseline(items) {
|
|
|
3602
3897
|
function hasSuspiciousGaps(row) {
|
|
3603
3898
|
if (row.items.length < 2) return false;
|
|
3604
3899
|
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3900
|
+
if (sorted.length === 2 && sorted[1].text.length > 20) return false;
|
|
3605
3901
|
const avgFontSize = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
|
|
3606
|
-
const minGap = avgFontSize * MIN_GAP_FACTOR;
|
|
3902
|
+
const minGap = Math.max(avgFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
|
|
3607
3903
|
for (let i = 1; i < sorted.length; i++) {
|
|
3608
3904
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
3609
3905
|
if (gap >= minGap) return true;
|
|
@@ -3630,6 +3926,41 @@ function extractColumnClusters(rows) {
|
|
|
3630
3926
|
const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
|
|
3631
3927
|
return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
|
|
3632
3928
|
}
|
|
3929
|
+
function findTableRegionsByHeader(allRows, columns, headerItems) {
|
|
3930
|
+
const regions = [];
|
|
3931
|
+
let currentRegion = [];
|
|
3932
|
+
let missStreak = 0;
|
|
3933
|
+
for (const row of allRows) {
|
|
3934
|
+
const matchedCols = countMatchedColumnsRange(row, columns, headerItems);
|
|
3935
|
+
if (matchedCols >= MIN_COLS) {
|
|
3936
|
+
currentRegion.push(row);
|
|
3937
|
+
missStreak = 0;
|
|
3938
|
+
} else if (currentRegion.length > 0 && (row.items.length <= 2 || missStreak === 0)) {
|
|
3939
|
+
currentRegion.push(row);
|
|
3940
|
+
missStreak++;
|
|
3941
|
+
} else {
|
|
3942
|
+
while (currentRegion.length > 0) {
|
|
3943
|
+
const last = currentRegion[currentRegion.length - 1];
|
|
3944
|
+
if (countMatchedColumnsRange(last, columns, headerItems) >= MIN_COLS) break;
|
|
3945
|
+
currentRegion.pop();
|
|
3946
|
+
}
|
|
3947
|
+
if (currentRegion.length >= MIN_ROWS) {
|
|
3948
|
+
regions.push({ rows: [...currentRegion] });
|
|
3949
|
+
}
|
|
3950
|
+
currentRegion = [];
|
|
3951
|
+
missStreak = 0;
|
|
3952
|
+
}
|
|
3953
|
+
}
|
|
3954
|
+
while (currentRegion.length > 0) {
|
|
3955
|
+
const last = currentRegion[currentRegion.length - 1];
|
|
3956
|
+
if (countMatchedColumnsRange(last, columns, headerItems) >= MIN_COLS) break;
|
|
3957
|
+
currentRegion.pop();
|
|
3958
|
+
}
|
|
3959
|
+
if (currentRegion.length >= MIN_ROWS) {
|
|
3960
|
+
regions.push({ rows: currentRegion });
|
|
3961
|
+
}
|
|
3962
|
+
return regions;
|
|
3963
|
+
}
|
|
3633
3964
|
function findTableRegions(allRows, columns) {
|
|
3634
3965
|
const regions = [];
|
|
3635
3966
|
let currentRegion = [];
|
|
@@ -3665,18 +3996,81 @@ function countMatchedColumns(row, columns) {
|
|
|
3665
3996
|
}
|
|
3666
3997
|
return matched.size;
|
|
3667
3998
|
}
|
|
3668
|
-
function
|
|
3669
|
-
const
|
|
3670
|
-
let
|
|
3671
|
-
|
|
3672
|
-
|
|
3673
|
-
|
|
3674
|
-
|
|
3675
|
-
|
|
3676
|
-
|
|
3999
|
+
function countMatchedColumnsRange(row, columns, headerItems) {
|
|
4000
|
+
const boundaries = [];
|
|
4001
|
+
for (let ci = 0; ci < headerItems.length; ci++) {
|
|
4002
|
+
const left = ci === 0 ? 0 : (headerItems[ci - 1].x + headerItems[ci - 1].w + headerItems[ci].x) / 2;
|
|
4003
|
+
const right = ci === headerItems.length - 1 ? Infinity : (headerItems[ci].x + headerItems[ci].w + headerItems[ci + 1].x) / 2;
|
|
4004
|
+
boundaries.push({ left, right });
|
|
4005
|
+
}
|
|
4006
|
+
const matched = /* @__PURE__ */ new Set();
|
|
4007
|
+
for (const item of row.items) {
|
|
4008
|
+
for (let ci = 0; ci < boundaries.length; ci++) {
|
|
4009
|
+
if (item.x >= boundaries[ci].left && item.x < boundaries[ci].right) {
|
|
4010
|
+
matched.add(ci);
|
|
4011
|
+
break;
|
|
4012
|
+
}
|
|
4013
|
+
}
|
|
4014
|
+
}
|
|
4015
|
+
return matched.size;
|
|
4016
|
+
}
|
|
4017
|
+
function assignRowItems(items, columns, numCols) {
|
|
4018
|
+
if (items.length === 0) return [];
|
|
4019
|
+
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
4020
|
+
const colCenters = columns.map((c) => c.x);
|
|
4021
|
+
const gaps = [];
|
|
4022
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
4023
|
+
gaps.push({ idx: i, size: sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w) });
|
|
4024
|
+
}
|
|
4025
|
+
const gapSizes = gaps.map((g2) => g2.size).sort((a, b) => a - b);
|
|
4026
|
+
const medianGap = gapSizes.length > 0 ? gapSizes[Math.floor(gapSizes.length / 2)] : 0;
|
|
4027
|
+
const gapThreshold = sorted.length <= numCols + 1 ? 12 : Math.max(medianGap * 2.5, 12);
|
|
4028
|
+
const significantGaps = gaps.filter((g2) => g2.size >= gapThreshold).sort((a, b) => b.size - a.size).slice(0, numCols - 1).sort((a, b) => a.idx - b.idx);
|
|
4029
|
+
const groups = [];
|
|
4030
|
+
let start = 0;
|
|
4031
|
+
for (const gap of significantGaps) {
|
|
4032
|
+
groups.push(sorted.slice(start, gap.idx));
|
|
4033
|
+
start = gap.idx;
|
|
4034
|
+
}
|
|
4035
|
+
groups.push(sorted.slice(start));
|
|
4036
|
+
const result = [];
|
|
4037
|
+
const usedCols = /* @__PURE__ */ new Set();
|
|
4038
|
+
const groupCenters = groups.map((g2) => {
|
|
4039
|
+
let minX = Infinity, maxX = -Infinity;
|
|
4040
|
+
for (const i of g2) {
|
|
4041
|
+
if (i.x < minX) minX = i.x;
|
|
4042
|
+
const r = i.x + i.w;
|
|
4043
|
+
if (r > maxX) maxX = r;
|
|
4044
|
+
}
|
|
4045
|
+
return (minX + maxX) / 2;
|
|
4046
|
+
});
|
|
4047
|
+
const assignments = [];
|
|
4048
|
+
for (let gi = 0; gi < groups.length; gi++) {
|
|
4049
|
+
for (let ci = 0; ci < numCols; ci++) {
|
|
4050
|
+
assignments.push({ gi, ci, dist: Math.abs(groupCenters[gi] - colCenters[ci]) });
|
|
4051
|
+
}
|
|
4052
|
+
}
|
|
4053
|
+
assignments.sort((a, b) => a.dist - b.dist);
|
|
4054
|
+
const assignedGroups = /* @__PURE__ */ new Set();
|
|
4055
|
+
for (const { gi, ci } of assignments) {
|
|
4056
|
+
if (assignedGroups.has(gi) || usedCols.has(ci)) continue;
|
|
4057
|
+
result.push({ col: ci, items: groups[gi] });
|
|
4058
|
+
assignedGroups.add(gi);
|
|
4059
|
+
usedCols.add(ci);
|
|
4060
|
+
}
|
|
4061
|
+
for (let gi = 0; gi < groups.length; gi++) {
|
|
4062
|
+
if (assignedGroups.has(gi)) continue;
|
|
4063
|
+
let bestCol = 0, bestDist = Infinity;
|
|
4064
|
+
for (let ci = 0; ci < numCols; ci++) {
|
|
4065
|
+
const d = Math.abs(groupCenters[gi] - colCenters[ci]);
|
|
4066
|
+
if (d < bestDist) {
|
|
4067
|
+
bestDist = d;
|
|
4068
|
+
bestCol = ci;
|
|
4069
|
+
}
|
|
3677
4070
|
}
|
|
4071
|
+
result.push({ col: bestCol, items: groups[gi] });
|
|
3678
4072
|
}
|
|
3679
|
-
return
|
|
4073
|
+
return result;
|
|
3680
4074
|
}
|
|
3681
4075
|
function buildClusterTable(rows, columns, pageNum) {
|
|
3682
4076
|
const numCols = columns.length;
|
|
@@ -3694,12 +4088,12 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3694
4088
|
usedItems.add(row.items[0]);
|
|
3695
4089
|
continue;
|
|
3696
4090
|
}
|
|
3697
|
-
|
|
3698
|
-
|
|
3699
|
-
|
|
4091
|
+
const assignments = assignRowItems(row.items, columns, numCols);
|
|
4092
|
+
for (const { col, items } of assignments) {
|
|
4093
|
+
const text = items.map((i) => i.text).join(" ");
|
|
3700
4094
|
const existing = cells[r][col].text;
|
|
3701
|
-
cells[r][col].text = existing ? existing + " " +
|
|
3702
|
-
usedItems.add(item);
|
|
4095
|
+
cells[r][col].text = existing ? existing + " " + text : text;
|
|
4096
|
+
for (const item of items) usedItems.add(item);
|
|
3703
4097
|
}
|
|
3704
4098
|
}
|
|
3705
4099
|
let emptyRows = 0;
|
|
@@ -3711,11 +4105,48 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3711
4105
|
const hasValue = cells.some((row) => row[c].text !== "");
|
|
3712
4106
|
if (!hasValue) return null;
|
|
3713
4107
|
}
|
|
4108
|
+
for (let r = numRows - 1; r >= 1; r--) {
|
|
4109
|
+
const nonEmptyCols = cells[r].filter((c) => c.text.trim()).length;
|
|
4110
|
+
if (nonEmptyCols !== 1) continue;
|
|
4111
|
+
if (cells[r][0].text.trim() !== "") continue;
|
|
4112
|
+
const contentText = cells[r].find((c) => c.text.trim())?.text.trim() || "";
|
|
4113
|
+
if (/^[○●▶\-·]/.test(contentText)) continue;
|
|
4114
|
+
for (let pr = r - 1; pr >= 0; pr--) {
|
|
4115
|
+
if (cells[pr].some((c) => c.text.trim())) {
|
|
4116
|
+
for (let c = 0; c < numCols; c++) {
|
|
4117
|
+
const prev = cells[pr][c].text.trim();
|
|
4118
|
+
const curr = cells[r][c].text.trim();
|
|
4119
|
+
if (curr) cells[pr][c].text = prev ? prev + " " + curr : curr;
|
|
4120
|
+
}
|
|
4121
|
+
for (let c = 0; c < numCols; c++) cells[r][c].text = "";
|
|
4122
|
+
break;
|
|
4123
|
+
}
|
|
4124
|
+
}
|
|
4125
|
+
}
|
|
4126
|
+
for (let r = 0; r < cells.length - 1; r++) {
|
|
4127
|
+
const row = cells[r];
|
|
4128
|
+
const hasCol0 = row[0].text.trim() !== "";
|
|
4129
|
+
const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
|
|
4130
|
+
const midEmpty = row.slice(1, numCols - 1).every((c) => c.text.trim() === "");
|
|
4131
|
+
if (hasCol0 && hasColLast && midEmpty) {
|
|
4132
|
+
const next = cells[r + 1];
|
|
4133
|
+
if (next[0].text.trim() === "" && next.some((c) => c.text.trim())) {
|
|
4134
|
+
for (let c = 1; c < numCols; c++) {
|
|
4135
|
+
const curr = next[c].text.trim();
|
|
4136
|
+
if (curr) row[c].text = row[c].text.trim() ? row[c].text.trim() + " " + curr : curr;
|
|
4137
|
+
}
|
|
4138
|
+
for (let c = 0; c < numCols; c++) next[c].text = "";
|
|
4139
|
+
}
|
|
4140
|
+
}
|
|
4141
|
+
}
|
|
4142
|
+
const filteredCells = cells.filter((row) => row.some((c) => c.text.trim()));
|
|
4143
|
+
const finalRowCount = filteredCells.length;
|
|
4144
|
+
if (finalRowCount < MIN_ROWS) return null;
|
|
3714
4145
|
const irTable = {
|
|
3715
|
-
rows:
|
|
4146
|
+
rows: finalRowCount,
|
|
3716
4147
|
cols: numCols,
|
|
3717
|
-
cells,
|
|
3718
|
-
hasHeader:
|
|
4148
|
+
cells: filteredCells,
|
|
4149
|
+
hasHeader: finalRowCount > 1
|
|
3719
4150
|
};
|
|
3720
4151
|
const allItems = rows.flatMap((r) => r.items);
|
|
3721
4152
|
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
@@ -3792,7 +4223,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3792
4223
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
3793
4224
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
3794
4225
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
3795
|
-
const
|
|
4226
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
3796
4227
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
3797
4228
|
let parsedPages = 0;
|
|
3798
4229
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
@@ -3809,7 +4240,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3809
4240
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
3810
4241
|
}
|
|
3811
4242
|
for (const item of visible) {
|
|
3812
|
-
if (item.fontSize > 0)
|
|
4243
|
+
if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
|
|
3813
4244
|
}
|
|
3814
4245
|
const opList = await page.getOperatorList();
|
|
3815
4246
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -3848,10 +4279,9 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3848
4279
|
blocks.splice(removed[ri], 1);
|
|
3849
4280
|
}
|
|
3850
4281
|
}
|
|
3851
|
-
const medianFontSize =
|
|
4282
|
+
const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
|
|
3852
4283
|
if (medianFontSize > 0) {
|
|
3853
4284
|
detectHeadings(blocks, medianFontSize);
|
|
3854
|
-
mergeAdjacentHeadings(blocks);
|
|
3855
4285
|
}
|
|
3856
4286
|
detectMarkerHeadings(blocks);
|
|
3857
4287
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
@@ -3902,11 +4332,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
3902
4332
|
}
|
|
3903
4333
|
return { visible, hiddenCount };
|
|
3904
4334
|
}
|
|
3905
|
-
function
|
|
3906
|
-
if (
|
|
3907
|
-
|
|
3908
|
-
const
|
|
3909
|
-
|
|
4335
|
+
function computeMedianFontSizeFromFreq(freq) {
|
|
4336
|
+
if (freq.size === 0) return 0;
|
|
4337
|
+
let total = 0;
|
|
4338
|
+
for (const count of freq.values()) total += count;
|
|
4339
|
+
const sorted = [...freq.entries()].sort((a, b) => a[0] - b[0]);
|
|
4340
|
+
const mid = Math.floor(total / 2);
|
|
4341
|
+
let cumulative = 0;
|
|
4342
|
+
for (const [size, count] of sorted) {
|
|
4343
|
+
cumulative += count;
|
|
4344
|
+
if (cumulative > mid) return size;
|
|
4345
|
+
}
|
|
4346
|
+
return sorted[sorted.length - 1][0];
|
|
3910
4347
|
}
|
|
3911
4348
|
function detectHeadings(blocks, medianFontSize) {
|
|
3912
4349
|
for (const block of blocks) {
|
|
@@ -3926,220 +4363,27 @@ function detectHeadings(blocks, medianFontSize) {
|
|
|
3926
4363
|
}
|
|
3927
4364
|
}
|
|
3928
4365
|
}
|
|
3929
|
-
function mergeAdjacentHeadings(blocks) {
|
|
3930
|
-
let i = 0;
|
|
3931
|
-
while (i < blocks.length - 1) {
|
|
3932
|
-
const curr = blocks[i];
|
|
3933
|
-
const next = blocks[i + 1];
|
|
3934
|
-
if (curr.type !== "heading" || next.type !== "heading") {
|
|
3935
|
-
i++;
|
|
3936
|
-
continue;
|
|
3937
|
-
}
|
|
3938
|
-
if (!curr.bbox || !next.bbox || !curr.text || !next.text) {
|
|
3939
|
-
i++;
|
|
3940
|
-
continue;
|
|
3941
|
-
}
|
|
3942
|
-
const currBaseline = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
|
|
3943
|
-
const nextBaseline = next.bbox.y + (next.style?.fontSize || next.bbox.height);
|
|
3944
|
-
const yDiff = Math.abs(currBaseline - nextBaseline);
|
|
3945
|
-
const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
|
|
3946
|
-
const sameY = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
|
|
3947
|
-
const sameLevel = curr.level === next.level;
|
|
3948
|
-
if (sameY && sameLevel) {
|
|
3949
|
-
const currX = curr.bbox.x;
|
|
3950
|
-
const nextX = next.bbox.x;
|
|
3951
|
-
if (currX <= nextX) {
|
|
3952
|
-
curr.text = curr.text + " " + next.text;
|
|
3953
|
-
} else {
|
|
3954
|
-
curr.text = next.text + " " + curr.text;
|
|
3955
|
-
}
|
|
3956
|
-
curr.bbox = {
|
|
3957
|
-
page: curr.bbox.page,
|
|
3958
|
-
x: Math.min(curr.bbox.x, next.bbox.x),
|
|
3959
|
-
y: Math.min(curr.bbox.y, next.bbox.y),
|
|
3960
|
-
width: Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width) - Math.min(curr.bbox.x, next.bbox.x),
|
|
3961
|
-
height: Math.max(curr.bbox.height, next.bbox.height)
|
|
3962
|
-
};
|
|
3963
|
-
blocks.splice(i + 1, 1);
|
|
3964
|
-
} else {
|
|
3965
|
-
i++;
|
|
3966
|
-
}
|
|
3967
|
-
}
|
|
3968
|
-
}
|
|
3969
4366
|
function collapseEvenSpacing(text) {
|
|
3970
4367
|
const tokens = text.split(" ");
|
|
3971
4368
|
const singleCharCount = tokens.filter((t) => t.length === 1).length;
|
|
3972
4369
|
if (tokens.length >= 3 && singleCharCount / tokens.length >= 0.7) {
|
|
3973
4370
|
return tokens.join("");
|
|
3974
4371
|
}
|
|
3975
|
-
return text
|
|
3976
|
-
}
|
|
3977
|
-
|
|
3978
|
-
const allY = items.map((i) => i.y);
|
|
3979
|
-
const pageHeight = Math.max(...allY) - Math.min(...allY);
|
|
3980
|
-
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
3981
|
-
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
3982
|
-
const blocks = [];
|
|
3983
|
-
for (const group of orderedGroups) {
|
|
3984
|
-
if (group.length === 0) continue;
|
|
3985
|
-
const yLines = groupByY(group);
|
|
3986
|
-
for (const line of yLines) {
|
|
3987
|
-
const text = mergeLineSimple(line);
|
|
3988
|
-
if (!text.trim()) continue;
|
|
3989
|
-
blocks.push({
|
|
3990
|
-
type: "paragraph",
|
|
3991
|
-
text,
|
|
3992
|
-
pageNumber: pageNum,
|
|
3993
|
-
bbox: computeBBox(line, pageNum),
|
|
3994
|
-
style: dominantStyle(line)
|
|
3995
|
-
});
|
|
3996
|
-
}
|
|
3997
|
-
}
|
|
3998
|
-
return blocks.length > 0 ? blocks : null;
|
|
3999
|
-
}
|
|
4000
|
-
function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
|
|
4001
|
-
const totalCells = table.cells.reduce((sum, row) => sum + row.filter((c) => c.text.trim()).length, 0);
|
|
4002
|
-
const totalTextLines = table.cells.reduce((sum, row) => sum + row.reduce((s, c) => s + (c.text.trim() ? c.text.split("\n").length : 0), 0), 0);
|
|
4003
|
-
const isUnderSegmented = table.rows === 1 && table.cols === 1 || totalCells <= 2 && totalTextLines >= 8 || totalCells <= 2 && items.length >= 6;
|
|
4004
|
-
if (!isUnderSegmented) return null;
|
|
4005
|
-
if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);
|
|
4006
|
-
const directTable = buildTableFromTextLayout(items, pageNum, bbox);
|
|
4007
|
-
if (directTable) return directTable;
|
|
4008
|
-
const clusterItems = items.map((i) => ({
|
|
4009
|
-
text: i.text,
|
|
4010
|
-
x: i.x,
|
|
4011
|
-
y: i.y,
|
|
4012
|
-
w: i.w,
|
|
4013
|
-
h: i.h,
|
|
4014
|
-
fontSize: i.fontSize,
|
|
4015
|
-
fontName: i.fontName
|
|
4016
|
-
}));
|
|
4017
|
-
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4018
|
-
if (clusterResults.length > 0) {
|
|
4019
|
-
const blocks = [];
|
|
4020
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4021
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4022
|
-
const usedIndices = /* @__PURE__ */ new Set();
|
|
4023
|
-
for (const cr of clusterResults) {
|
|
4024
|
-
for (const ci of cr.usedItems) {
|
|
4025
|
-
const idx = ciToIdx.get(ci);
|
|
4026
|
-
if (idx !== void 0) usedIndices.add(idx);
|
|
4027
|
-
}
|
|
4028
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4029
|
-
}
|
|
4030
|
-
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
4031
|
-
for (const item of remaining) {
|
|
4032
|
-
if (!item.text.trim()) continue;
|
|
4033
|
-
blocks.push({
|
|
4034
|
-
type: "paragraph",
|
|
4035
|
-
text: item.text,
|
|
4036
|
-
pageNumber: pageNum,
|
|
4037
|
-
bbox: computeBBox([item], pageNum),
|
|
4038
|
-
style: { fontSize: item.fontSize, fontName: item.fontName }
|
|
4039
|
-
});
|
|
4040
|
-
}
|
|
4041
|
-
blocks.sort((a, b) => {
|
|
4042
|
-
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4043
|
-
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4044
|
-
return by - ay;
|
|
4045
|
-
});
|
|
4046
|
-
return blocks.length > 0 ? blocks : null;
|
|
4047
|
-
}
|
|
4048
|
-
return null;
|
|
4049
|
-
}
|
|
4050
|
-
function buildTableFromTextLayout(items, pageNum, bbox) {
|
|
4051
|
-
if (items.length < 4) return null;
|
|
4052
|
-
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4053
|
-
const yTol = 3;
|
|
4054
|
-
const rows = [];
|
|
4055
|
-
let curRow = [sorted[0]];
|
|
4056
|
-
let curY = sorted[0].y;
|
|
4057
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
4058
|
-
if (Math.abs(sorted[i].y - curY) <= yTol) {
|
|
4059
|
-
curRow.push(sorted[i]);
|
|
4060
|
-
} else {
|
|
4061
|
-
rows.push(curRow);
|
|
4062
|
-
curRow = [sorted[i]];
|
|
4063
|
-
curY = sorted[i].y;
|
|
4064
|
-
}
|
|
4065
|
-
}
|
|
4066
|
-
rows.push(curRow);
|
|
4067
|
-
if (rows.length < 2) return null;
|
|
4068
|
-
const gapPositions = [];
|
|
4069
|
-
for (const row of rows) {
|
|
4070
|
-
if (row.length < 2) continue;
|
|
4071
|
-
const sortedX = [...row].sort((a, b) => a.x - b.x);
|
|
4072
|
-
const avgFs = sortedX.reduce((s, i) => s + i.fontSize, 0) / sortedX.length;
|
|
4073
|
-
for (let j = 1; j < sortedX.length; j++) {
|
|
4074
|
-
const gap = sortedX[j].x - (sortedX[j - 1].x + sortedX[j - 1].w);
|
|
4075
|
-
if (gap >= avgFs * 1.5) {
|
|
4076
|
-
gapPositions.push(sortedX[j - 1].x + sortedX[j - 1].w + gap / 2);
|
|
4077
|
-
}
|
|
4078
|
-
}
|
|
4079
|
-
}
|
|
4080
|
-
if (gapPositions.length < 2) return null;
|
|
4081
|
-
gapPositions.sort((a, b) => a - b);
|
|
4082
|
-
const colBoundaries = [];
|
|
4083
|
-
let clusterSum = gapPositions[0], clusterCount = 1;
|
|
4084
|
-
for (let i = 1; i < gapPositions.length; i++) {
|
|
4085
|
-
const avg = clusterSum / clusterCount;
|
|
4086
|
-
if (Math.abs(gapPositions[i] - avg) <= 15) {
|
|
4087
|
-
clusterSum += gapPositions[i];
|
|
4088
|
-
clusterCount++;
|
|
4089
|
-
} else {
|
|
4090
|
-
if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
|
|
4091
|
-
clusterSum = gapPositions[i];
|
|
4092
|
-
clusterCount = 1;
|
|
4093
|
-
}
|
|
4094
|
-
}
|
|
4095
|
-
if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
|
|
4096
|
-
if (colBoundaries.length === 0) return null;
|
|
4097
|
-
const numCols = colBoundaries.length + 1;
|
|
4098
|
-
const tableRows = [];
|
|
4099
|
-
for (const row of rows) {
|
|
4100
|
-
const cells = Array(numCols).fill("");
|
|
4101
|
-
const sortedX = [...row].sort((a, b) => a.x - b.x);
|
|
4102
|
-
for (const item of sortedX) {
|
|
4103
|
-
const cx = item.x + item.w / 2;
|
|
4104
|
-
let col = 0;
|
|
4105
|
-
for (let b = 0; b < colBoundaries.length; b++) {
|
|
4106
|
-
if (cx > colBoundaries[b]) col = b + 1;
|
|
4107
|
-
}
|
|
4108
|
-
cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
|
|
4109
|
-
}
|
|
4110
|
-
if (cells[0].trim() === "" && tableRows.length > 0) {
|
|
4111
|
-
const prevCells = tableRows[tableRows.length - 1].cells;
|
|
4112
|
-
for (let c = 0; c < numCols; c++) {
|
|
4113
|
-
if (cells[c].trim()) {
|
|
4114
|
-
prevCells[c] = prevCells[c] ? prevCells[c] + " " + cells[c].trim() : cells[c].trim();
|
|
4115
|
-
}
|
|
4116
|
-
}
|
|
4117
|
-
} else {
|
|
4118
|
-
tableRows.push({ cells });
|
|
4119
|
-
}
|
|
4120
|
-
}
|
|
4121
|
-
if (tableRows.length < 2) return null;
|
|
4122
|
-
const nonEmptyCount = tableRows.reduce((sum, r) => sum + r.cells.filter((c) => c.trim()).length, 0);
|
|
4123
|
-
const totalCount = tableRows.length * numCols;
|
|
4124
|
-
if (nonEmptyCount < totalCount * 0.3) return null;
|
|
4125
|
-
const irCells = tableRows.map(
|
|
4126
|
-
(r) => r.cells.map((text, colIdx) => {
|
|
4127
|
-
let cleaned = text.trim();
|
|
4128
|
-
if (colIdx > 0) cleaned = cleaned.replace(/^[•○·\-]\s*/, "");
|
|
4129
|
-
return { text: cleaned, colSpan: 1, rowSpan: 1 };
|
|
4130
|
-
})
|
|
4372
|
+
return text.replace(
|
|
4373
|
+
/(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
|
|
4374
|
+
(match) => match.replace(/ /g, "")
|
|
4131
4375
|
);
|
|
4132
|
-
const irTable = {
|
|
4133
|
-
rows: tableRows.length,
|
|
4134
|
-
cols: numCols,
|
|
4135
|
-
cells: irCells,
|
|
4136
|
-
hasHeader: tableRows.length > 1
|
|
4137
|
-
};
|
|
4138
|
-
return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
|
|
4139
4376
|
}
|
|
4140
4377
|
function shouldDemoteTable(table) {
|
|
4141
4378
|
const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
|
|
4142
4379
|
const allText = allCells.join(" ");
|
|
4380
|
+
if (table.rows <= 3 && table.cols <= 3) {
|
|
4381
|
+
const totalCells2 = table.rows * table.cols;
|
|
4382
|
+
const emptyCells2 = totalCells2 - allCells.length;
|
|
4383
|
+
if (emptyCells2 >= totalCells2 * 0.3) return true;
|
|
4384
|
+
if (/[□■◆○●▶ㅇ]/.test(allText)) return true;
|
|
4385
|
+
if (/<[^>]+>/.test(allText)) return true;
|
|
4386
|
+
}
|
|
4143
4387
|
if (allText.length > 200) return false;
|
|
4144
4388
|
if (/[□■◆○●▶]/.test(allText) && table.rows <= 3) return true;
|
|
4145
4389
|
const totalCells = table.rows * table.cols;
|
|
@@ -4183,32 +4427,6 @@ function detectMarkerHeadings(blocks) {
|
|
|
4183
4427
|
}
|
|
4184
4428
|
}
|
|
4185
4429
|
}
|
|
4186
|
-
function hasMultiColumnLayout(items) {
|
|
4187
|
-
if (items.length < 30) return false;
|
|
4188
|
-
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
4189
|
-
const minX = sorted[0].x;
|
|
4190
|
-
let maxX = minX;
|
|
4191
|
-
for (const i of sorted) if (i.x + i.w > maxX) maxX = i.x + i.w;
|
|
4192
|
-
const pageWidth = maxX - minX;
|
|
4193
|
-
if (pageWidth < 200) return false;
|
|
4194
|
-
let bestGap = 0;
|
|
4195
|
-
let bestSplit = 0;
|
|
4196
|
-
for (let j = 1; j < sorted.length; j++) {
|
|
4197
|
-
const gap = sorted[j].x - (sorted[j - 1].x + sorted[j - 1].w);
|
|
4198
|
-
if (gap > bestGap) {
|
|
4199
|
-
bestGap = gap;
|
|
4200
|
-
bestSplit = (sorted[j - 1].x + sorted[j - 1].w + sorted[j].x) / 2;
|
|
4201
|
-
}
|
|
4202
|
-
}
|
|
4203
|
-
if (bestGap < 20) return false;
|
|
4204
|
-
const splitRatio = (bestSplit - minX) / pageWidth;
|
|
4205
|
-
if (splitRatio < 0.35 || splitRatio > 0.65) return false;
|
|
4206
|
-
const leftCount = items.filter((i) => i.x + i.w / 2 < bestSplit).length;
|
|
4207
|
-
const rightCount = items.filter((i) => i.x + i.w / 2 >= bestSplit).length;
|
|
4208
|
-
if (leftCount < 15 || rightCount < 15) return false;
|
|
4209
|
-
if (Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount) < 0.35) return false;
|
|
4210
|
-
return true;
|
|
4211
|
-
}
|
|
4212
4430
|
var MAX_XYCUT_DEPTH = 50;
|
|
4213
4431
|
function xyCutOrder(items, gapThreshold, depth = 0) {
|
|
4214
4432
|
if (items.length === 0) return [];
|
|
@@ -4276,6 +4494,7 @@ function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeigh
|
|
|
4276
4494
|
if (items.length === 0) return [];
|
|
4277
4495
|
let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
|
|
4278
4496
|
({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
|
|
4497
|
+
({ horizontals, verticals } = preprocessLines(horizontals, verticals));
|
|
4279
4498
|
const grids = buildTableGrids(horizontals, verticals);
|
|
4280
4499
|
if (grids.length > 0) {
|
|
4281
4500
|
return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
|
|
@@ -4287,14 +4506,19 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4287
4506
|
const usedItems = /* @__PURE__ */ new Set();
|
|
4288
4507
|
const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
4289
4508
|
for (const grid of sortedGrids) {
|
|
4509
|
+
const numGridRows = grid.rowYs.length - 1;
|
|
4510
|
+
const numGridCols = grid.colXs.length - 1;
|
|
4511
|
+
if (numGridRows === 1 && numGridCols >= 2) continue;
|
|
4290
4512
|
const tableItems = [];
|
|
4291
4513
|
const pad = 3;
|
|
4514
|
+
const gridW = grid.bbox.x2 - grid.bbox.x1;
|
|
4292
4515
|
for (const item of items) {
|
|
4293
4516
|
if (usedItems.has(item)) continue;
|
|
4294
|
-
if (item.
|
|
4295
|
-
|
|
4296
|
-
|
|
4297
|
-
|
|
4517
|
+
if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
|
|
4518
|
+
if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
|
|
4519
|
+
if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
|
|
4520
|
+
tableItems.push(item);
|
|
4521
|
+
usedItems.add(item);
|
|
4298
4522
|
}
|
|
4299
4523
|
const cells = extractCells(grid, horizontals, verticals);
|
|
4300
4524
|
if (cells.length === 0) continue;
|
|
@@ -4318,6 +4542,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4318
4542
|
const cellItems = cellTextMap.get(cell) || [];
|
|
4319
4543
|
let text = cellTextToString(cellItems);
|
|
4320
4544
|
text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
|
|
4545
|
+
text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
|
|
4321
4546
|
irGrid[cell.row][cell.col] = {
|
|
4322
4547
|
text,
|
|
4323
4548
|
colSpan: cell.colSpan,
|
|
@@ -4339,31 +4564,61 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4339
4564
|
width: grid.bbox.x2 - grid.bbox.x1,
|
|
4340
4565
|
height: grid.bbox.y2 - grid.bbox.y1
|
|
4341
4566
|
};
|
|
4342
|
-
const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
|
|
4343
|
-
if (normalized) {
|
|
4344
|
-
blocks.push(...normalized);
|
|
4345
|
-
continue;
|
|
4346
|
-
}
|
|
4347
4567
|
if (shouldDemoteTable(irTable)) {
|
|
4348
4568
|
const demoted = demoteTableToText(irTable);
|
|
4349
4569
|
if (demoted) {
|
|
4350
|
-
|
|
4570
|
+
const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
|
|
4571
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
|
|
4351
4572
|
}
|
|
4352
4573
|
continue;
|
|
4353
4574
|
}
|
|
4354
4575
|
blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
|
|
4355
4576
|
}
|
|
4356
|
-
|
|
4577
|
+
let remaining = items.filter((i) => !usedItems.has(i));
|
|
4357
4578
|
if (remaining.length > 0) {
|
|
4358
4579
|
remaining.sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4359
|
-
const
|
|
4360
|
-
|
|
4361
|
-
|
|
4580
|
+
const clusterItems = remaining.map((i) => ({
|
|
4581
|
+
text: i.text,
|
|
4582
|
+
x: i.x,
|
|
4583
|
+
y: i.y,
|
|
4584
|
+
w: i.w,
|
|
4585
|
+
h: i.h,
|
|
4586
|
+
fontSize: i.fontSize,
|
|
4587
|
+
fontName: i.fontName
|
|
4588
|
+
}));
|
|
4589
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4590
|
+
if (clusterResults.length > 0) {
|
|
4591
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4592
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4593
|
+
const usedClusterIndices = /* @__PURE__ */ new Set();
|
|
4594
|
+
for (const cr of clusterResults) {
|
|
4595
|
+
for (const ci of cr.usedItems) {
|
|
4596
|
+
const idx = ciToIdx.get(ci);
|
|
4597
|
+
if (idx !== void 0) usedClusterIndices.add(idx);
|
|
4598
|
+
}
|
|
4599
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4600
|
+
}
|
|
4601
|
+
remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
|
|
4602
|
+
}
|
|
4603
|
+
if (remaining.length > 0) {
|
|
4604
|
+
const allY = remaining.map((i) => i.y);
|
|
4605
|
+
const pageH = safeMax(allY) - safeMin(allY);
|
|
4606
|
+
const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
|
|
4607
|
+
const textBlocks = [];
|
|
4608
|
+
for (const group of groups) {
|
|
4609
|
+
if (group.length === 0) continue;
|
|
4610
|
+
const groupBlocks = extractPageBlocksFallback(group, pageNum);
|
|
4611
|
+
for (const b of groupBlocks) textBlocks.push(b);
|
|
4612
|
+
}
|
|
4613
|
+
const finalTextBlocks = detectListBlocks(textBlocks);
|
|
4614
|
+
for (const b of finalTextBlocks) blocks.push(b);
|
|
4615
|
+
}
|
|
4616
|
+
blocks.sort((a, b) => {
|
|
4362
4617
|
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4363
4618
|
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4364
4619
|
return by - ay;
|
|
4365
4620
|
});
|
|
4366
|
-
return mergeAdjacentTableBlocks(
|
|
4621
|
+
return mergeAdjacentTableBlocks(blocks);
|
|
4367
4622
|
}
|
|
4368
4623
|
return mergeAdjacentTableBlocks(blocks);
|
|
4369
4624
|
}
|
|
@@ -4389,57 +4644,53 @@ function mergeAdjacentTableBlocks(blocks) {
|
|
|
4389
4644
|
}
|
|
4390
4645
|
function extractPageBlocksFallback(items, pageNum) {
|
|
4391
4646
|
if (items.length === 0) return [];
|
|
4392
|
-
if (hasMultiColumnLayout(items)) {
|
|
4393
|
-
const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
|
|
4394
|
-
return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
|
|
4395
|
-
}
|
|
4396
4647
|
const blocks = [];
|
|
4397
|
-
const
|
|
4398
|
-
|
|
4399
|
-
|
|
4400
|
-
|
|
4401
|
-
|
|
4402
|
-
|
|
4403
|
-
|
|
4404
|
-
|
|
4405
|
-
|
|
4406
|
-
|
|
4407
|
-
|
|
4408
|
-
|
|
4409
|
-
|
|
4410
|
-
|
|
4411
|
-
|
|
4412
|
-
|
|
4413
|
-
|
|
4414
|
-
|
|
4415
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4416
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4417
|
-
const usedIndices = /* @__PURE__ */ new Set();
|
|
4418
|
-
for (const cr of clusterResults) {
|
|
4419
|
-
for (const ci of cr.usedItems) {
|
|
4420
|
-
const idx = ciToIdx.get(ci);
|
|
4421
|
-
if (idx !== void 0) usedIndices.add(idx);
|
|
4422
|
-
}
|
|
4423
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4648
|
+
const clusterItems = items.map((i) => ({
|
|
4649
|
+
text: i.text,
|
|
4650
|
+
x: i.x,
|
|
4651
|
+
y: i.y,
|
|
4652
|
+
w: i.w,
|
|
4653
|
+
h: i.h,
|
|
4654
|
+
fontSize: i.fontSize,
|
|
4655
|
+
fontName: i.fontName
|
|
4656
|
+
}));
|
|
4657
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4658
|
+
if (clusterResults.length > 0) {
|
|
4659
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4660
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4661
|
+
const usedIndices = /* @__PURE__ */ new Set();
|
|
4662
|
+
for (const cr of clusterResults) {
|
|
4663
|
+
for (const ci of cr.usedItems) {
|
|
4664
|
+
const idx = ciToIdx.get(ci);
|
|
4665
|
+
if (idx !== void 0) usedIndices.add(idx);
|
|
4424
4666
|
}
|
|
4425
|
-
|
|
4426
|
-
|
|
4427
|
-
|
|
4428
|
-
|
|
4429
|
-
|
|
4430
|
-
|
|
4431
|
-
|
|
4432
|
-
|
|
4433
|
-
|
|
4667
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4668
|
+
}
|
|
4669
|
+
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
4670
|
+
if (remaining.length > 0) {
|
|
4671
|
+
const yLines = groupByY(remaining);
|
|
4672
|
+
for (const line of yLines) {
|
|
4673
|
+
const text = mergeLineSimple(line);
|
|
4674
|
+
if (!text.trim()) continue;
|
|
4675
|
+
const bbox = computeBBox(line, pageNum);
|
|
4676
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
|
|
4434
4677
|
}
|
|
4435
|
-
|
|
4436
|
-
|
|
4437
|
-
|
|
4438
|
-
|
|
4439
|
-
|
|
4678
|
+
}
|
|
4679
|
+
blocks.sort((a, b) => {
|
|
4680
|
+
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4681
|
+
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4682
|
+
return by - ay;
|
|
4683
|
+
});
|
|
4684
|
+
} else {
|
|
4685
|
+
const allYLines = groupByY(items);
|
|
4686
|
+
const columns = detectColumns(allYLines);
|
|
4687
|
+
if (columns && columns.length >= 3) {
|
|
4688
|
+
const tableText = extractWithColumns(allYLines, columns);
|
|
4689
|
+
const bbox = computeBBox(items, pageNum);
|
|
4690
|
+
blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
|
|
4440
4691
|
} else {
|
|
4441
4692
|
const allY = items.map((i) => i.y);
|
|
4442
|
-
const pageHeight =
|
|
4693
|
+
const pageHeight = safeMax(allY) - safeMin(allY);
|
|
4443
4694
|
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
4444
4695
|
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
4445
4696
|
for (const group of orderedGroups) {
|
|
@@ -4492,22 +4743,76 @@ function dominantStyle(items) {
|
|
|
4492
4743
|
return { fontSize: dominantSize, fontName };
|
|
4493
4744
|
}
|
|
4494
4745
|
/**
 * Turns raw text-content entries into positioned text items.
 *
 * Entries whose `str` is not a string are ignored. Whitespace-only entries are
 * not emitted; their rounded positions are remembered and later used to flag
 * the items that follow them with `hasSpaceBefore`.
 *
 * @param {Iterable<object>} rawItems entries exposing `str`, `transform`,
 *   `width`, `height` and optionally `fontName`.
 * @returns {Array<object>} items sorted top-to-bottom (descending y) then
 *   left-to-right, with near-identical duplicates removed.
 */
function normalizeItems(rawItems) {
  const collected = [];
  const blankRuns = [];
  for (const raw of rawItems) {
    if (typeof raw.str !== "string") continue;
    const x = Math.round(raw.transform[4]);
    const y = Math.round(raw.transform[5]);
    const trimmed = raw.str.trim();
    if (!trimmed) {
      // Whitespace-only run: keep only where it sits.
      blankRuns.push({ x, y });
      continue;
    }
    // Font size is taken from the larger of the two scale components.
    const fontSize = Math.round(Math.max(Math.abs(raw.transform[3]), Math.abs(raw.transform[0])));
    const w = Math.round(raw.width);
    const h = Math.round(raw.height);
    // `trimmed` is non-empty here, so a zero raw width alone marks the item hidden.
    const isHidden = fontSize === 0 || raw.width === 0;
    // Digit/punctuation-only strings that contain a digit and a space lose their spaces.
    const isSpacedNumber = /^[\d\s\-().·,☎]+$/.test(trimmed) && /\d/.test(trimmed) && / /.test(trimmed);
    const text = isSpacedNumber ? trimmed.replace(/ /g, "") : trimmed;
    const fontName = raw.fontName || "";
    // Evenly spaced single characters become one item per character.
    const pieces = splitEvenSpacedItem(text, x, w, fontSize) || [{ text, x, w }];
    for (const piece of pieces) {
      collected.push({ text: piece.text, x: piece.x, y, w: piece.w, h, fontSize, fontName, isHidden });
    }
  }
  collected.sort((a, b) => b.y - a.y || a.x - b.x);
  // Drop items that repeat the same text within 3 units of an already kept item.
  const unique = [];
  for (const current of collected) {
    let duplicate = false;
    for (let k = unique.length - 1; k >= 0 && !duplicate; k--) {
      const kept = unique[k];
      // Kept items are in descending y, so anything further back is even farther away.
      if (kept.y - current.y > 3) break;
      duplicate = Math.abs(kept.y - current.y) <= 3 && kept.text === current.text && Math.abs(kept.x - current.x) <= 3;
    }
    if (!duplicate) unique.push(current);
  }
  // Flag items that start 0..20 units to the right of a whitespace run on the same line.
  for (const item of unique) {
    const followsBlank = blankRuns.some((sp) => {
      const dist = item.x - sp.x;
      return Math.abs(sp.y - item.y) <= 3 && dist >= 0 && dist <= 20;
    });
    if (followsBlank) item.hasSpaceBefore = true;
  }
  return unique;
}
|
|
4804
|
+
/**
 * Splits a run such as "가 나 다" (single Hangul syllables or digits separated
 * by single spaces, at least three of them) into one fragment per character,
 * spreading the fragments evenly across the item's width.
 *
 * @param {string} text trimmed item text.
 * @param {number} itemX left edge of the item.
 * @param {number} itemW width of the item.
 * @param {number} fontSize font size of the item.
 * @returns {Array<{text: string, x: number, w: number}>|null} the fragments, or
 *   `null` when the text does not match, the width is unusable, or the
 *   per-character pitch exceeds twice the font size.
 */
function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
  // The pattern itself guarantees at least three characters.
  if (!/^[가-힣\d](?: [가-힣\d]){2,}$/.test(text)) return null;
  // A zero, negative or NaN width would stack every fragment on the same x
  // (or yield NaN coordinates); leave such items unsplit instead.
  if (!(itemW > 0)) return null;
  const chars = text.split(" ");
  const charW = itemW / chars.length;
  if (charW > fontSize * 2) return null;
  return chars.map((ch, idx) => ({
    text: ch,
    x: Math.round(itemX + idx * charW),
    // The actual glyph is narrower than the spacing pitch.
    w: Math.round(charW * 0.8)
  }));
}
|
|
4512
4817
|
function groupByY(items) {
|
|
4513
4818
|
if (items.length === 0) return [];
|
|
@@ -4532,14 +4837,14 @@ function isProseSpread(items) {
|
|
|
4532
4837
|
for (let i = 1; i < sorted.length; i++) {
|
|
4533
4838
|
gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
|
|
4534
4839
|
}
|
|
4535
|
-
const maxGap =
|
|
4840
|
+
const maxGap = safeMax(gaps);
|
|
4536
4841
|
const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
|
|
4537
4842
|
return maxGap < 40 && avgLen < 5;
|
|
4538
4843
|
}
|
|
4539
4844
|
function detectColumns(yLines) {
|
|
4540
4845
|
const allItems = yLines.flat();
|
|
4541
4846
|
if (allItems.length === 0) return null;
|
|
4542
|
-
const pageWidth =
|
|
4847
|
+
const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
|
|
4543
4848
|
if (pageWidth < 100) return null;
|
|
4544
4849
|
let bigoLineIdx = -1;
|
|
4545
4850
|
for (let i = 0; i < yLines.length; i++) {
|
|
@@ -4571,7 +4876,7 @@ function detectColumns(yLines) {
|
|
|
4571
4876
|
}
|
|
4572
4877
|
const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
|
|
4573
4878
|
if (peaks.length < 3) return null;
|
|
4574
|
-
const MERGE_TOL =
|
|
4879
|
+
const MERGE_TOL = 40;
|
|
4575
4880
|
const merged = [peaks[0]];
|
|
4576
4881
|
for (let i = 1; i < peaks.length; i++) {
|
|
4577
4882
|
const prev = merged[merged.length - 1];
|
|
@@ -4585,7 +4890,14 @@ function detectColumns(yLines) {
|
|
|
4585
4890
|
merged.push({ ...peaks[i] });
|
|
4586
4891
|
}
|
|
4587
4892
|
}
|
|
4588
|
-
const
|
|
4893
|
+
const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
|
|
4894
|
+
if (rawColumns.length < 3) return null;
|
|
4895
|
+
const MIN_DETECT_COL_WIDTH = 30;
|
|
4896
|
+
const columns = [rawColumns[0]];
|
|
4897
|
+
for (let i = 1; i < rawColumns.length; i++) {
|
|
4898
|
+
if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
|
|
4899
|
+
columns.push(rawColumns[i]);
|
|
4900
|
+
}
|
|
4589
4901
|
return columns.length >= 3 ? columns : null;
|
|
4590
4902
|
}
|
|
4591
4903
|
function findColumn(x, columns) {
|
|
@@ -4713,6 +5025,16 @@ function buildGridTable(lines, columns) {
|
|
|
4713
5025
|
}
|
|
4714
5026
|
merged.splice(0, headerEnd, headerRow);
|
|
4715
5027
|
}
|
|
5028
|
+
for (const row of merged) {
|
|
5029
|
+
for (let c = 0; c < row.length; c++) {
|
|
5030
|
+
if (row[c]) row[c] = collapseEvenSpacing(row[c]);
|
|
5031
|
+
}
|
|
5032
|
+
}
|
|
5033
|
+
const totalCells = merged.length * numCols;
|
|
5034
|
+
const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
|
|
5035
|
+
if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
|
|
5036
|
+
return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
|
|
5037
|
+
}
|
|
4716
5038
|
const md = [];
|
|
4717
5039
|
md.push("| " + merged[0].join(" | ") + " |");
|
|
4718
5040
|
md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
|
|
@@ -4724,12 +5046,32 @@ function buildGridTable(lines, columns) {
|
|
|
4724
5046
|
function mergeLineSimple(items) {
|
|
4725
5047
|
if (items.length <= 1) return items[0]?.text || "";
|
|
4726
5048
|
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
5049
|
+
const isEvenSpaced = detectEvenSpacedItems(sorted);
|
|
4727
5050
|
let result = sorted[0].text;
|
|
4728
5051
|
for (let i = 1; i < sorted.length; i++) {
|
|
4729
5052
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
4730
5053
|
const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
|
|
4731
|
-
|
|
4732
|
-
|
|
5054
|
+
const tabThreshold = Math.max(avgFs * 2, 30);
|
|
5055
|
+
if (gap > tabThreshold) {
|
|
5056
|
+
result += " ";
|
|
5057
|
+
result += sorted[i].text;
|
|
5058
|
+
continue;
|
|
5059
|
+
}
|
|
5060
|
+
if (isEvenSpaced[i]) {
|
|
5061
|
+
result += sorted[i].text;
|
|
5062
|
+
continue;
|
|
5063
|
+
}
|
|
5064
|
+
if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
|
|
5065
|
+
result += " ";
|
|
5066
|
+
result += sorted[i].text;
|
|
5067
|
+
continue;
|
|
5068
|
+
}
|
|
5069
|
+
if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
|
|
5070
|
+
result += " ";
|
|
5071
|
+
result += sorted[i].text;
|
|
5072
|
+
continue;
|
|
5073
|
+
}
|
|
5074
|
+
if (gap < avgFs * 0.15) {
|
|
4733
5075
|
} else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
|
|
4734
5076
|
} else if (gap > 3) result += " ";
|
|
4735
5077
|
result += sorted[i].text;
|
|
@@ -4738,8 +5080,8 @@ function mergeLineSimple(items) {
|
|
|
4738
5080
|
}
|
|
4739
5081
|
/**
 * Post-processes extracted PDF text: strips digit-only / dash-number / "n / m"
 * lines (presumably page-number artifacts), merges lines via mergeKoreanLines,
 * collapses evenly spaced characters line by line, and normalizes blank lines.
 *
 * @param {string} text raw extracted text.
 * @returns {string} cleaned, trimmed text.
 */
function cleanPdfText(text) {
  const stripped = text
    // Leading line made only of 1-4 digits.
    .replace(/^\d{1,4}\n/, "")
    // Lines consisting of dashes around a number.
    .replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "")
    // Lines of the form "n / m".
    .replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "")
    // 1-4 digit lines in the middle and at the very end of the text.
    .replace(/\n\d{1,4}\n/g, "\n")
    .replace(/\n\d{1,4}$/, "")
    // Markdown heading lines whose only content is a 1-4 digit number.
    .replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "");
  const merged = mergeKoreanLines(stripped);
  // Every line except those starting with "| ---" goes through collapseEvenSpacing.
  const collapsed = merged.replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line));
  return collapsed
    // Marker followed by two spaced Hangul syllables: keep one space after the marker only.
    .replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
|
|
4744
5086
|
function startsWithMarker(line) {
|
|
4745
5087
|
const t = line.trimStart();
|
|
@@ -4931,7 +5273,7 @@ function mergeKoreanLines(text) {
|
|
|
4931
5273
|
result[result.length - 1] = prev + " " + currTrimmed;
|
|
4932
5274
|
continue;
|
|
4933
5275
|
}
|
|
4934
|
-
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
|
|
5276
|
+
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev) && !startsWithMarker(prev)) {
|
|
4935
5277
|
result[result.length - 1] = prev + " " + curr;
|
|
4936
5278
|
} else {
|
|
4937
5279
|
result.push(curr);
|
|
@@ -4979,7 +5321,7 @@ function getTextContent(el) {
|
|
|
4979
5321
|
return el.textContent?.trim() ?? "";
|
|
4980
5322
|
}
|
|
4981
5323
|
/**
 * Parses XML text into a DOM document using xmldom's DOMParser.
 * The text is first passed through stripDtd (presumably removes DOCTYPE
 * declarations before parsing; confirm against its definition).
 *
 * @param {string} text XML source.
 * @returns {Document} document parsed with the "text/xml" MIME type.
 */
function parseXml(text) {
  const sanitized = stripDtd(text);
  const parser = new import_xmldom2.DOMParser();
  return parser.parseFromString(sanitized, "text/xml");
}
|
|
4984
5326
|
function parseSharedStrings(xml) {
|
|
4985
5327
|
const doc = parseXml(xml);
|
|
@@ -5266,7 +5608,7 @@ function getAttr(el, localName) {
|
|
|
5266
5608
|
return null;
|
|
5267
5609
|
}
|
|
5268
5610
|
/**
 * Parses XML text into a DOM document using xmldom's DOMParser.
 * The text is first passed through stripDtd (presumably removes DOCTYPE
 * declarations before parsing; confirm against its definition).
 *
 * @param {string} text XML source.
 * @returns {Document} document parsed with the "text/xml" MIME type.
 */
function parseXml2(text) {
  const sanitized = stripDtd(text);
  const parser = new import_xmldom3.DOMParser();
  return parser.parseFromString(sanitized, "text/xml");
}
|
|
5271
5613
|
function parseStyles(xml) {
|
|
5272
5614
|
const doc = parseXml2(xml);
|
|
@@ -5666,7 +6008,13 @@ function normalize(s) {
|
|
|
5666
6008
|
}
|
|
5667
6009
|
var MAX_LEVENSHTEIN_LEN = 1e4;
|
|
5668
6010
|
function levenshtein(a, b) {
|
|
5669
|
-
if (a.length + b.length > MAX_LEVENSHTEIN_LEN)
|
|
6011
|
+
if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
|
|
6012
|
+
const sampleLen = Math.min(500, a.length, b.length);
|
|
6013
|
+
let diffs = 0;
|
|
6014
|
+
for (let i = 0; i < sampleLen; i++) if (a[i] !== b[i]) diffs++;
|
|
6015
|
+
const sampleRate = sampleLen > 0 ? diffs / sampleLen : 1;
|
|
6016
|
+
return Math.abs(a.length - b.length) + Math.round(Math.min(a.length, b.length) * sampleRate);
|
|
6017
|
+
}
|
|
5670
6018
|
if (a.length > b.length) [a, b] = [b, a];
|
|
5671
6019
|
const m = a.length;
|
|
5672
6020
|
const n = b.length;
|
|
@@ -5949,13 +6297,20 @@ function extractInlineFields(text) {
|
|
|
5949
6297
|
|
|
5950
6298
|
// src/hwpx/generator.ts
|
|
5951
6299
|
var import_jszip5 = __toESM(require("jszip"), 1);
|
|
5952
|
-
var
|
|
6300
|
+
// XML namespace URIs interpolated into the generated HWPX parts below.
// Section document root (hs:sec).
var NS_SECTION = "http://www.hancom.co.kr/hwpml/2011/section";
// Paragraph-level elements (hp:p, hp:run, hp:t).
var NS_PARA = "http://www.hancom.co.kr/hwpml/2011/paragraph";
// Header document (hh:head).
var NS_HEAD = "http://www.hancom.co.kr/hwpml/2011/head";
// Package manifest (opf:package).
// NOTE(review): the trailing slash differs from the IDPF OPF namespace
// "http://www.idpf.org/2007/opf"; presumably it mirrors what Hancom writes. Confirm.
var NS_OPF = "http://www.idpf.org/2007/opf/";
// Declared as the hpf prefix on the container and manifest roots.
var NS_HPF = "http://www.hancom.co.kr/schema/2011/hpf";
// Container document (ocf:container).
var NS_OCF = "urn:oasis:names:tc:opendocument:xmlns:container";
|
|
5953
6306
|
/**
 * Builds an HWPX package (a zip archive) from Markdown text.
 *
 * The archive holds, in insertion order: an uncompressed `mimetype` entry,
 * `META-INF/container.xml`, `Contents/content.hpf`, `Contents/header.xml`
 * and `Contents/section0.xml` (the converted Markdown body).
 *
 * @param {string} markdown Markdown source.
 * @returns {Promise<ArrayBuffer>} the generated archive.
 */
async function markdownToHwpx(markdown) {
  const sectionXml = blocksToSectionXml(parseMarkdownToBlocks(markdown));
  const zip = new import_jszip5.default();
  // The mimetype entry is stored without compression.
  zip.file("mimetype", "application/hwp+zip", { compression: "STORE" });
  const parts = [
    ["META-INF/container.xml", generateContainerXml()],
    ["Contents/content.hpf", generateManifest()],
    ["Contents/header.xml", generateHeaderXml()],
    ["Contents/section0.xml", sectionXml]
  ];
  for (const [path, content] of parts) {
    zip.file(path, content);
  }
  const archive = await zip.generateAsync({ type: "arraybuffer" });
  return archive;
}
|
|
@@ -6000,8 +6355,111 @@ function parseMarkdownToBlocks(md) {
|
|
|
6000
6355
|
/**
 * Escapes the characters that are significant in XML text and double-quoted
 * attribute values.
 *
 * `&` is replaced first so the ampersands introduced by the later
 * replacements are not escaped a second time.
 *
 * @param {string} text raw text.
 * @returns {string} text with `&`, `<`, `>` and `"` replaced by entities.
 */
function escapeXml(text) {
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
}
|
|
6358
|
+
/**
 * Produces the container document that points at the package manifest
 * `Contents/content.hpf`.
 *
 * @returns {string} XML text of the container document.
 */
function generateContainerXml() {
  const containerXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
<ocf:container xmlns:ocf="${NS_OCF}" xmlns:hpf="${NS_HPF}">
<ocf:rootfiles>
<ocf:rootfile full-path="Contents/content.hpf" media-type="application/hwpml-package+xml"/>
</ocf:rootfiles>
</ocf:container>`;
  return containerXml;
}
|
|
6366
|
+
/**
 * Produces the package manifest. It declares `Contents/header.xml` and
 * `Contents/section0.xml`; the spine lists the header as non-linear and the
 * section as linear.
 *
 * @returns {string} XML text of the manifest.
 */
function generateManifest() {
  const manifestXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
<opf:package xmlns:opf="${NS_OPF}" xmlns:hpf="${NS_HPF}" xmlns:hh="${NS_HEAD}">
<opf:manifest>
<opf:item id="header" href="Contents/header.xml" media-type="application/xml"/>
<opf:item id="section0" href="Contents/section0.xml" media-type="application/xml"/>
</opf:manifest>
<opf:spine>
<opf:itemref idref="header" linear="no"/>
<opf:itemref idref="section0" linear="yes"/>
</opf:spine>
</opf:package>`;
  return manifestXml;
}
|
|
6379
|
+
/**
 * Produces the header document: seven font-face groups, one border fill, one
 * character property set, one paragraph property set and one paragraph style,
 * each declared with id 0.
 *
 * The paragraph and run elements emitted for the section refer to these
 * entries through their `paraPrIDRef`, `styleIDRef` and `charPrIDRef` values.
 *
 * @returns {string} XML text of the header document.
 */
function generateHeaderXml() {
  const headerXml = `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
<hh:head xmlns:hh="${NS_HEAD}" xmlns:hp="${NS_PARA}" version="1.4" secCnt="1">
<hh:beginNum page="1" footnote="1" endnote="1" pic="1" tbl="1" equation="1"/>
<hh:refList>
<hh:fontfaces itemCnt="7">
<hh:fontface lang="HANGUL" fontCnt="1">
<hh:font id="0" face="\uD568\uCD08\uB86C\uBC14\uD0D5" type="TTF" isEmbedded="0">
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="4" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
</hh:font>
</hh:fontface>
<hh:fontface lang="LATIN" fontCnt="1">
<hh:font id="0" face="Times New Roman" type="TTF" isEmbedded="0">
<hh:typeInfo familyType="FCAT_OLDSTYLE" weight="5" proportion="4" contrast="2" strokeVariation="0" armStyle="0" letterform="0" midline="0" xHeight="4"/>
</hh:font>
</hh:fontface>
<hh:fontface lang="HANJA" fontCnt="1">
<hh:font id="0" face="\uD568\uCD08\uB86C\uBC14\uD0D5" type="TTF" isEmbedded="0">
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="4" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
</hh:font>
</hh:fontface>
<hh:fontface lang="JAPANESE" fontCnt="1">
<hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
</hh:font>
</hh:fontface>
<hh:fontface lang="OTHER" fontCnt="1">
<hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
</hh:font>
</hh:fontface>
<hh:fontface lang="SYMBOL" fontCnt="1">
<hh:font id="0" face="Symbol" type="TTF" isEmbedded="0">
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
</hh:font>
</hh:fontface>
<hh:fontface lang="USER" fontCnt="1">
<hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
</hh:font>
</hh:fontface>
</hh:fontfaces>
<hh:borderFills itemCnt="1">
<hh:borderFill id="0" threeD="0" shadow="0" centerLine="0" breakCellSeparateLine="0">
<hh:slash type="NONE" Crooked="0" isCounter="0"/>
<hh:backSlash type="NONE" Crooked="0" isCounter="0"/>
<hh:leftBorder type="NONE" width="0.1mm" color="0"/>
<hh:rightBorder type="NONE" width="0.1mm" color="0"/>
<hh:topBorder type="NONE" width="0.1mm" color="0"/>
<hh:bottomBorder type="NONE" width="0.1mm" color="0"/>
<hh:diagonal type="NONE" width="0.1mm" color="0"/>
<hh:fillInfo/>
</hh:borderFill>
</hh:borderFills>
<hh:charProperties itemCnt="1">
<hh:charPr id="0" height="1000" textColor="0" shadeColor="-1" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="0">
<hh:fontRef hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
<hh:ratio hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
<hh:spacing hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
<hh:relSz hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
<hh:offset hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
</hh:charPr>
</hh:charProperties>
<hh:tabProperties itemCnt="0"/>
<hh:numberings itemCnt="0"/>
<hh:bullets itemCnt="0"/>
<hh:paraProperties itemCnt="1">
<hh:paraPr id="0" tabIDRef="0" condense="0" fontLineHeight="0" snapToGrid="0" suppressOverlap="0" checked="0">
<hh:parLineBreak lineBreak="BREAK_LINE" wordBreak="BREAK_WORD" breakLatinWord="BREAK_WORD" breakNonLatinWord="BREAK_WORD"/>
<hh:parMargin left="0" right="0" prev="0" next="0" indent="0"/>
<hh:parBorder borderFillIDRef="0" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>
<hh:parShade borderFillIDRef="0"/>
<hh:parTabList/>
</hh:paraPr>
</hh:paraProperties>
<hh:styles itemCnt="1">
<hh:style id="0" type="PARA" name="\uBC14\uD0D5\uAE00" engName="Normal" paraPrIDRef="0" charPrIDRef="0" nextStyleIDRef="0" langIDRef="1042" lockForm="0"/>
</hh:styles>
</hh:refList>
<hh:compatibleDocument targetProgram="HWP2018"/>
</hh:head>`;
  return headerXml;
}
|
|
6003
6461
|
/**
 * Wraps text in a paragraph element with a single run. The paragraph and run
 * carry `paraPrIDRef`, `styleIDRef` and `charPrIDRef` values of "0".
 *
 * @param {string} text paragraph text; XML-escaped before insertion.
 * @returns {string} the `<hp:p>` element as XML text.
 */
function generateParagraph(text) {
  const escaped = escapeXml(text);
  return `<hp:p paraPrIDRef="0" styleIDRef="0"><hp:run charPrIDRef="0"><hp:t>${escaped}</hp:t></hp:run></hp:p>`;
}
|
|
6006
6464
|
function generateTable(rows) {
|
|
6007
6465
|
const trElements = rows.map((row) => {
|
|
@@ -6025,22 +6483,11 @@ function blocksToSectionXml(blocks) {
|
|
|
6025
6483
|
return "";
|
|
6026
6484
|
}
|
|
6027
6485
|
}).join("\n ");
|
|
6028
|
-
return `<?xml version="1.0" encoding="UTF-8"?>
|
|
6029
|
-
<hs:sec xmlns:hs="${
|
|
6486
|
+
return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
|
6487
|
+
<hs:sec xmlns:hs="${NS_SECTION}" xmlns:hp="${NS_PARA}">
|
|
6030
6488
|
${body}
|
|
6031
6489
|
</hs:sec>`;
|
|
6032
6490
|
}
|
|
6033
|
-
function generateManifest() {
|
|
6034
|
-
return `<?xml version="1.0" encoding="UTF-8"?>
|
|
6035
|
-
<opf:package xmlns:opf="http://www.idpf.org/2007/opf">
|
|
6036
|
-
<opf:manifest>
|
|
6037
|
-
<opf:item id="s0" href="section0.xml" media-type="application/xml"/>
|
|
6038
|
-
</opf:manifest>
|
|
6039
|
-
<opf:spine>
|
|
6040
|
-
<opf:itemref idref="s0"/>
|
|
6041
|
-
</opf:spine>
|
|
6042
|
-
</opf:package>`;
|
|
6043
|
-
}
|
|
6044
6491
|
|
|
6045
6492
|
// src/index.ts
|
|
6046
6493
|
async function parse(input, options) {
|