kordoc 2.0.3 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +302 -291
- package/dist/chunk-5Y2Q3BRW.js +52 -0
- package/dist/chunk-5Y2Q3BRW.js.map +1 -0
- package/dist/{chunk-4UH6ABAY.js → chunk-LYFG7AUT.js} +971 -223
- package/dist/chunk-LYFG7AUT.js.map +1 -0
- package/dist/{chunk-3TBUDJDE.js → chunk-MOL7MDBG.js} +1 -1
- package/dist/chunk-MOL7MDBG.js.map +1 -0
- package/dist/cli.js +13 -9
- package/dist/cli.js.map +1 -1
- package/dist/detect-GYK3HKD5.js +18 -0
- package/dist/index.cjs +996 -189
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +996 -189
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +41 -12
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-737B4EZW.js +8 -0
- package/dist/{provider-EU3CG724.js → provider-7H4CPZYS.js} +2 -1
- package/dist/provider-7H4CPZYS.js.map +1 -0
- package/dist/{watch-QD3PDNXQ.js → watch-Q5OXA73S.js} +38 -18
- package/dist/watch-Q5OXA73S.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-25TXW6EP.js +0 -93
- package/dist/chunk-25TXW6EP.js.map +0 -1
- package/dist/chunk-3TBUDJDE.js.map +0 -1
- package/dist/chunk-4UH6ABAY.js.map +0 -1
- package/dist/page-range-OF5I4PQY.js +0 -8
- package/dist/provider-EU3CG724.js.map +0 -1
- package/dist/utils-BTZ4WSYX.js +0 -22
- package/dist/watch-QD3PDNXQ.js.map +0 -1
- /package/dist/{page-range-OF5I4PQY.js.map → detect-GYK3HKD5.js.map} +0 -0
- /package/dist/{utils-BTZ4WSYX.js.map → page-range-737B4EZW.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -85,6 +85,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
|
|
|
85
85
|
blocks.push({ type: "paragraph", text: text.trim(), pageNumber: i });
|
|
86
86
|
}
|
|
87
87
|
} catch {
|
|
88
|
+
blocks.push({ type: "paragraph", text: `[OCR \uC2E4\uD328: \uD398\uC774\uC9C0 ${i}]` });
|
|
88
89
|
}
|
|
89
90
|
}
|
|
90
91
|
return blocks;
|
|
@@ -182,7 +183,7 @@ var import_zlib = require("zlib");
|
|
|
182
183
|
var import_xmldom = require("@xmldom/xmldom");
|
|
183
184
|
|
|
184
185
|
// src/utils.ts
|
|
185
|
-
var VERSION = true ? "2.0
|
|
186
|
+
var VERSION = true ? "2.2.0" : "0.0.0-dev";
|
|
186
187
|
function toArrayBuffer(buf) {
|
|
187
188
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
188
189
|
return buf.buffer;
|
|
@@ -198,7 +199,8 @@ var KordocError = class extends Error {
|
|
|
198
199
|
function isPathTraversal(name) {
|
|
199
200
|
if (name.includes("\0")) return true;
|
|
200
201
|
const normalized = name.replace(/\\/g, "/");
|
|
201
|
-
|
|
202
|
+
const segments = normalized.split("/");
|
|
203
|
+
return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
202
204
|
}
|
|
203
205
|
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
204
206
|
try {
|
|
@@ -238,12 +240,25 @@ function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEnt
|
|
|
238
240
|
return { totalUncompressed: 0, entryCount: 0 };
|
|
239
241
|
}
|
|
240
242
|
}
|
|
243
|
+
function stripDtd(xml) {
|
|
244
|
+
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
245
|
+
}
|
|
241
246
|
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
242
247
|
function sanitizeHref(href) {
|
|
243
248
|
const trimmed = href.trim();
|
|
244
249
|
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
245
250
|
return trimmed;
|
|
246
251
|
}
|
|
252
|
+
function safeMin(arr) {
|
|
253
|
+
let min = Infinity;
|
|
254
|
+
for (let i = 0; i < arr.length; i++) if (arr[i] < min) min = arr[i];
|
|
255
|
+
return min;
|
|
256
|
+
}
|
|
257
|
+
function safeMax(arr) {
|
|
258
|
+
let max = -Infinity;
|
|
259
|
+
for (let i = 0; i < arr.length; i++) if (arr[i] > max) max = arr[i];
|
|
260
|
+
return max;
|
|
261
|
+
}
|
|
247
262
|
function classifyError(err) {
|
|
248
263
|
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
249
264
|
const msg = err.message;
|
|
@@ -318,6 +333,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
318
333
|
if (end > maxCols) maxCols = end;
|
|
319
334
|
}
|
|
320
335
|
}
|
|
336
|
+
if (maxCols > MAX_COLS) maxCols = MAX_COLS;
|
|
321
337
|
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
322
338
|
const grid = Array.from(
|
|
323
339
|
{ length: numRows },
|
|
@@ -327,7 +343,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
327
343
|
for (const cell of row) {
|
|
328
344
|
const r = cell.rowAddr ?? 0;
|
|
329
345
|
const c = cell.colAddr ?? 0;
|
|
330
|
-
if (r >= numRows || c >= maxCols) continue;
|
|
346
|
+
if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
|
|
331
347
|
grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
|
|
332
348
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
333
349
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
@@ -607,7 +623,12 @@ function parseCharProperties(doc, map) {
|
|
|
607
623
|
if (!id) continue;
|
|
608
624
|
const prop = {};
|
|
609
625
|
const height = el.getAttribute("height");
|
|
610
|
-
if (height)
|
|
626
|
+
if (height) {
|
|
627
|
+
const parsedHeight = parseInt(height, 10);
|
|
628
|
+
if (!isNaN(parsedHeight) && parsedHeight > 0) {
|
|
629
|
+
prop.fontSize = parsedHeight / 100;
|
|
630
|
+
}
|
|
631
|
+
}
|
|
611
632
|
const bold = el.getAttribute("bold");
|
|
612
633
|
if (bold === "true" || bold === "1") prop.bold = true;
|
|
613
634
|
const italic = el.getAttribute("italic");
|
|
@@ -642,9 +663,6 @@ function parseStyleElements(doc, map) {
|
|
|
642
663
|
}
|
|
643
664
|
}
|
|
644
665
|
}
|
|
645
|
-
function stripDtd(xml) {
|
|
646
|
-
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
647
|
-
}
|
|
648
666
|
async function parseHwpxDocument(buffer, options) {
|
|
649
667
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
650
668
|
let zip;
|
|
@@ -747,7 +765,7 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
747
765
|
const data = await file.async("uint8array");
|
|
748
766
|
decompressed.total += data.length;
|
|
749
767
|
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
750
|
-
const ext = ref.includes(".") ? ref.split(".").pop() : "png";
|
|
768
|
+
const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
|
|
751
769
|
const mimeType = imageExtToMime(ext);
|
|
752
770
|
imageIndex++;
|
|
753
771
|
const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
|
|
@@ -994,7 +1012,8 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
994
1012
|
if (newTable.rows.length > 0) {
|
|
995
1013
|
if (tableStack.length > 0) {
|
|
996
1014
|
const parentTable = tableStack.pop();
|
|
997
|
-
|
|
1015
|
+
let nestedCols = 0;
|
|
1016
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
998
1017
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
999
1018
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
1000
1019
|
} else {
|
|
@@ -1041,8 +1060,10 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
1041
1060
|
break;
|
|
1042
1061
|
case "cellSpan":
|
|
1043
1062
|
if (tableCtx?.cell) {
|
|
1044
|
-
const
|
|
1045
|
-
const
|
|
1063
|
+
const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
|
|
1064
|
+
const cs = isNaN(rawCs) ? 1 : rawCs;
|
|
1065
|
+
const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
|
|
1066
|
+
const rs = isNaN(rawRs) ? 1 : rawRs;
|
|
1046
1067
|
tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
|
|
1047
1068
|
tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
|
|
1048
1069
|
}
|
|
@@ -1101,7 +1122,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1101
1122
|
if (newTable.rows.length > 0) {
|
|
1102
1123
|
if (tableStack.length > 0) {
|
|
1103
1124
|
const parentTable = tableStack.pop();
|
|
1104
|
-
|
|
1125
|
+
let nestedCols = 0;
|
|
1126
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
1105
1127
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
1106
1128
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
1107
1129
|
} else {
|
|
@@ -1134,6 +1156,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1134
1156
|
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
1135
1157
|
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
1136
1158
|
walkChildren(el, d + 1);
|
|
1159
|
+
} else if (localTag === "run") {
|
|
1160
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
|
|
1137
1161
|
}
|
|
1138
1162
|
}
|
|
1139
1163
|
};
|
|
@@ -2197,6 +2221,7 @@ function parseLenientCfb(data) {
|
|
|
2197
2221
|
if (miniSectorSizeShift > 16) throw new Error("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uBBF8\uB2C8 \uC139\uD130 \uD06C\uAE30 \uC2DC\uD504\uD2B8: " + miniSectorSizeShift);
|
|
2198
2222
|
const miniSectorSize = 1 << miniSectorSizeShift;
|
|
2199
2223
|
const fatSectorCount = data.readUInt32LE(44);
|
|
2224
|
+
if (fatSectorCount > 1e4) throw new Error("FAT \uC139\uD130 \uC218\uAC00 \uB108\uBB34 \uB9CE\uC2B5\uB2C8\uB2E4: " + fatSectorCount);
|
|
2200
2225
|
const firstDirSector = data.readUInt32LE(48);
|
|
2201
2226
|
const miniStreamCutoff = data.readUInt32LE(56);
|
|
2202
2227
|
const firstMiniFatSector = data.readUInt32LE(60);
|
|
@@ -2573,10 +2598,14 @@ function findSections(cfb) {
|
|
|
2573
2598
|
}
|
|
2574
2599
|
function findSectionsLenient(lcfb, compressed) {
|
|
2575
2600
|
const sections = [];
|
|
2601
|
+
let totalDecompressed = 0;
|
|
2576
2602
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2577
2603
|
const raw = lcfb.findStream(`/BodyText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2578
2604
|
if (!raw) break;
|
|
2579
|
-
|
|
2605
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2606
|
+
totalDecompressed += content.length;
|
|
2607
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2608
|
+
sections.push({ idx: i, content });
|
|
2580
2609
|
}
|
|
2581
2610
|
if (sections.length === 0) {
|
|
2582
2611
|
for (const e of lcfb.entries()) {
|
|
@@ -2584,7 +2613,12 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2584
2613
|
if (e.name.startsWith("Section")) {
|
|
2585
2614
|
const idx = parseInt(e.name.replace("Section", ""), 10) || 0;
|
|
2586
2615
|
const raw = lcfb.findStream(e.name);
|
|
2587
|
-
if (raw)
|
|
2616
|
+
if (raw) {
|
|
2617
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2618
|
+
totalDecompressed += content.length;
|
|
2619
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2620
|
+
sections.push({ idx, content });
|
|
2621
|
+
}
|
|
2588
2622
|
}
|
|
2589
2623
|
}
|
|
2590
2624
|
}
|
|
@@ -2592,11 +2626,15 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2592
2626
|
}
|
|
2593
2627
|
function findViewTextSectionsLenient(lcfb, compressed) {
|
|
2594
2628
|
const sections = [];
|
|
2629
|
+
let totalDecompressed = 0;
|
|
2595
2630
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2596
2631
|
const raw = lcfb.findStream(`/ViewText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2597
2632
|
if (!raw) break;
|
|
2598
2633
|
try {
|
|
2599
|
-
|
|
2634
|
+
const content = decryptViewText(raw, compressed);
|
|
2635
|
+
totalDecompressed += content.length;
|
|
2636
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2637
|
+
sections.push({ idx: i, content });
|
|
2600
2638
|
} catch {
|
|
2601
2639
|
break;
|
|
2602
2640
|
}
|
|
@@ -2998,10 +3036,14 @@ init_page_range();
|
|
|
2998
3036
|
// src/pdf/line-detector.ts
|
|
2999
3037
|
var import_pdf = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
3000
3038
|
var ORIENTATION_TOL = 2;
|
|
3001
|
-
var MIN_LINE_LENGTH =
|
|
3002
|
-
var
|
|
3039
|
+
var MIN_LINE_LENGTH = 15;
|
|
3040
|
+
var MAX_LINE_WIDTH = 5;
|
|
3003
3041
|
var CONNECT_TOL = 5;
|
|
3004
3042
|
var CELL_PADDING = 2;
|
|
3043
|
+
var MIN_COL_WIDTH = 15;
|
|
3044
|
+
var MIN_ROW_HEIGHT = 6;
|
|
3045
|
+
var VERTEX_MERGE_FACTOR = 4;
|
|
3046
|
+
var MIN_COORD_MERGE_TOL = 8;
|
|
3005
3047
|
function extractLines(fnArray, argsArray) {
|
|
3006
3048
|
const horizontals = [];
|
|
3007
3049
|
const verticals = [];
|
|
@@ -3153,6 +3195,55 @@ function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
|
3153
3195
|
verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
|
|
3154
3196
|
}
|
|
3155
3197
|
}
|
|
3198
|
+
function preprocessLines(horizontals, verticals) {
|
|
3199
|
+
let h = horizontals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3200
|
+
let v = verticals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3201
|
+
h = mergeParallelLines(h, "h");
|
|
3202
|
+
v = mergeParallelLines(v, "v");
|
|
3203
|
+
return { horizontals: h, verticals: v };
|
|
3204
|
+
}
|
|
3205
|
+
function mergeParallelLines(lines, dir) {
|
|
3206
|
+
if (lines.length <= 1) return lines;
|
|
3207
|
+
const sorted = [...lines].sort((a, b) => {
|
|
3208
|
+
const posA = dir === "h" ? a.y1 : a.x1;
|
|
3209
|
+
const posB = dir === "h" ? b.y1 : b.x1;
|
|
3210
|
+
if (Math.abs(posA - posB) > 0.1) return posA - posB;
|
|
3211
|
+
return dir === "h" ? a.x1 - b.x1 : a.y1 - b.y1;
|
|
3212
|
+
});
|
|
3213
|
+
const MERGE_TOL = 3;
|
|
3214
|
+
const result = [sorted[0]];
|
|
3215
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
3216
|
+
const prev = result[result.length - 1];
|
|
3217
|
+
const curr = sorted[i];
|
|
3218
|
+
const prevPos = dir === "h" ? prev.y1 : prev.x1;
|
|
3219
|
+
const currPos = dir === "h" ? curr.y1 : curr.x1;
|
|
3220
|
+
if (Math.abs(prevPos - currPos) <= MERGE_TOL) {
|
|
3221
|
+
const prevStart = dir === "h" ? prev.x1 : prev.y1;
|
|
3222
|
+
const prevEnd = dir === "h" ? prev.x2 : prev.y2;
|
|
3223
|
+
const currStart = dir === "h" ? curr.x1 : curr.y1;
|
|
3224
|
+
const currEnd = dir === "h" ? curr.x2 : curr.y2;
|
|
3225
|
+
const overlap = Math.min(prevEnd, currEnd) - Math.max(prevStart, currStart);
|
|
3226
|
+
const minLen = Math.min(prevEnd - prevStart, currEnd - currStart);
|
|
3227
|
+
if (overlap > minLen * 0.3) {
|
|
3228
|
+
if (dir === "h") {
|
|
3229
|
+
prev.x1 = Math.min(prev.x1, curr.x1);
|
|
3230
|
+
prev.x2 = Math.max(prev.x2, curr.x2);
|
|
3231
|
+
prev.y1 = (prev.y1 + curr.y1) / 2;
|
|
3232
|
+
prev.y2 = prev.y1;
|
|
3233
|
+
} else {
|
|
3234
|
+
prev.y1 = Math.min(prev.y1, curr.y1);
|
|
3235
|
+
prev.y2 = Math.max(prev.y2, curr.y2);
|
|
3236
|
+
prev.x1 = (prev.x1 + curr.x1) / 2;
|
|
3237
|
+
prev.x2 = prev.x1;
|
|
3238
|
+
}
|
|
3239
|
+
prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
|
|
3240
|
+
continue;
|
|
3241
|
+
}
|
|
3242
|
+
}
|
|
3243
|
+
result.push(curr);
|
|
3244
|
+
}
|
|
3245
|
+
return result;
|
|
3246
|
+
}
|
|
3156
3247
|
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
3157
3248
|
const margin = 5;
|
|
3158
3249
|
return {
|
|
@@ -3164,8 +3255,49 @@ function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
|
3164
3255
|
)
|
|
3165
3256
|
};
|
|
3166
3257
|
}
|
|
3258
|
+
function buildVertices(horizontals, verticals) {
|
|
3259
|
+
const vertices = [];
|
|
3260
|
+
const tol = CONNECT_TOL;
|
|
3261
|
+
for (const h of horizontals) {
|
|
3262
|
+
for (const v of verticals) {
|
|
3263
|
+
if (v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol && h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol) {
|
|
3264
|
+
const radius = Math.max(h.lineWidth, v.lineWidth, 1);
|
|
3265
|
+
vertices.push({ x: v.x1, y: h.y1, radius });
|
|
3266
|
+
}
|
|
3267
|
+
}
|
|
3268
|
+
}
|
|
3269
|
+
return vertices;
|
|
3270
|
+
}
|
|
3271
|
+
function mergeVertices(vertices) {
|
|
3272
|
+
if (vertices.length <= 1) return vertices;
|
|
3273
|
+
const merged = [];
|
|
3274
|
+
const used = new Array(vertices.length).fill(false);
|
|
3275
|
+
for (let i = 0; i < vertices.length; i++) {
|
|
3276
|
+
if (used[i]) continue;
|
|
3277
|
+
let sumX = vertices[i].x, sumY = vertices[i].y;
|
|
3278
|
+
let maxRadius = vertices[i].radius;
|
|
3279
|
+
let count = 1;
|
|
3280
|
+
for (let j = i + 1; j < vertices.length; j++) {
|
|
3281
|
+
if (used[j]) continue;
|
|
3282
|
+
const mergeTol = VERTEX_MERGE_FACTOR * Math.max(maxRadius, vertices[j].radius);
|
|
3283
|
+
if (Math.abs(vertices[i].x - vertices[j].x) <= mergeTol && Math.abs(vertices[i].y - vertices[j].y) <= mergeTol) {
|
|
3284
|
+
sumX += vertices[j].x;
|
|
3285
|
+
sumY += vertices[j].y;
|
|
3286
|
+
maxRadius = Math.max(maxRadius, vertices[j].radius);
|
|
3287
|
+
count++;
|
|
3288
|
+
used[j] = true;
|
|
3289
|
+
}
|
|
3290
|
+
}
|
|
3291
|
+
merged.push({ x: sumX / count, y: sumY / count, radius: maxRadius });
|
|
3292
|
+
}
|
|
3293
|
+
return merged;
|
|
3294
|
+
}
|
|
3167
3295
|
function buildTableGrids(horizontals, verticals) {
|
|
3168
3296
|
if (horizontals.length < 2 || verticals.length < 2) return [];
|
|
3297
|
+
const allVertices = buildVertices(horizontals, verticals);
|
|
3298
|
+
const vertices = mergeVertices(allVertices);
|
|
3299
|
+
if (vertices.length < 4) return [];
|
|
3300
|
+
const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
|
|
3169
3301
|
const allLines = [
|
|
3170
3302
|
...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
|
|
3171
3303
|
...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
|
|
@@ -3176,21 +3308,74 @@ function buildTableGrids(horizontals, verticals) {
|
|
|
3176
3308
|
const hLines = group.filter((l) => l.type === "h");
|
|
3177
3309
|
const vLines = group.filter((l) => l.type === "v");
|
|
3178
3310
|
if (hLines.length < 2 || vLines.length < 2) continue;
|
|
3179
|
-
|
|
3180
|
-
const
|
|
3181
|
-
|
|
3182
|
-
|
|
3311
|
+
let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
|
|
3312
|
+
for (const l of vLines) {
|
|
3313
|
+
if (l.x1 < gx1) gx1 = l.x1;
|
|
3314
|
+
if (l.x1 > gx2) gx2 = l.x1;
|
|
3315
|
+
}
|
|
3316
|
+
for (const l of hLines) {
|
|
3317
|
+
if (l.y1 < gy1) gy1 = l.y1;
|
|
3318
|
+
if (l.y1 > gy2) gy2 = l.y1;
|
|
3319
|
+
}
|
|
3320
|
+
const groupBbox = {
|
|
3321
|
+
x1: gx1 - CONNECT_TOL,
|
|
3322
|
+
y1: gy1 - CONNECT_TOL,
|
|
3323
|
+
x2: gx2 + CONNECT_TOL,
|
|
3324
|
+
y2: gy2 + CONNECT_TOL
|
|
3325
|
+
};
|
|
3326
|
+
const groupVertices = vertices.filter(
|
|
3327
|
+
(v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
|
|
3328
|
+
);
|
|
3329
|
+
const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
|
|
3330
|
+
const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
|
|
3331
|
+
const rawYs = [
|
|
3332
|
+
...hLines.map((l) => l.y1),
|
|
3333
|
+
...groupVertices.map((v) => v.y)
|
|
3334
|
+
];
|
|
3335
|
+
const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
|
|
3336
|
+
const rawXs = [
|
|
3337
|
+
...vLines.map((l) => l.x1),
|
|
3338
|
+
...groupVertices.map((v) => v.x)
|
|
3339
|
+
];
|
|
3340
|
+
const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
|
|
3183
3341
|
if (rowYs.length < 2 || colXs.length < 2) continue;
|
|
3342
|
+
const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
|
|
3343
|
+
const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
|
|
3344
|
+
if (validRowYs.length < 2 || validColXs.length < 2) continue;
|
|
3184
3345
|
const bbox = {
|
|
3185
|
-
x1:
|
|
3186
|
-
y1:
|
|
3187
|
-
x2:
|
|
3188
|
-
y2:
|
|
3346
|
+
x1: validColXs[0],
|
|
3347
|
+
y1: validRowYs[validRowYs.length - 1],
|
|
3348
|
+
x2: validColXs[validColXs.length - 1],
|
|
3349
|
+
y2: validRowYs[0]
|
|
3189
3350
|
};
|
|
3190
|
-
grids.push({ rowYs, colXs, bbox });
|
|
3351
|
+
grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
|
|
3191
3352
|
}
|
|
3192
3353
|
return mergeAdjacentGrids(grids);
|
|
3193
3354
|
}
|
|
3355
|
+
function enforceMinWidth(colXs, minWidth) {
|
|
3356
|
+
if (colXs.length <= 2) return colXs;
|
|
3357
|
+
const result = [colXs[0]];
|
|
3358
|
+
for (let i = 1; i < colXs.length; i++) {
|
|
3359
|
+
const prevX = result[result.length - 1];
|
|
3360
|
+
if (colXs[i] - prevX < minWidth && i < colXs.length - 1) {
|
|
3361
|
+
continue;
|
|
3362
|
+
}
|
|
3363
|
+
result.push(colXs[i]);
|
|
3364
|
+
}
|
|
3365
|
+
return result;
|
|
3366
|
+
}
|
|
3367
|
+
function enforceMinHeight(rowYs, minHeight) {
|
|
3368
|
+
if (rowYs.length <= 2) return rowYs;
|
|
3369
|
+
const result = [rowYs[0]];
|
|
3370
|
+
for (let i = 1; i < rowYs.length; i++) {
|
|
3371
|
+
const prevY = result[result.length - 1];
|
|
3372
|
+
if (prevY - rowYs[i] < minHeight && i < rowYs.length - 1) {
|
|
3373
|
+
continue;
|
|
3374
|
+
}
|
|
3375
|
+
result.push(rowYs[i]);
|
|
3376
|
+
}
|
|
3377
|
+
return result;
|
|
3378
|
+
}
|
|
3194
3379
|
function mergeAdjacentGrids(grids) {
|
|
3195
3380
|
if (grids.length <= 1) return grids;
|
|
3196
3381
|
const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
@@ -3199,9 +3384,10 @@ function mergeAdjacentGrids(grids) {
|
|
|
3199
3384
|
const prev = merged[merged.length - 1];
|
|
3200
3385
|
const curr = sorted[i];
|
|
3201
3386
|
if (prev.colXs.length === curr.colXs.length) {
|
|
3202
|
-
const
|
|
3387
|
+
const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
|
|
3388
|
+
const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
|
|
3203
3389
|
const verticalGap = prev.bbox.y1 - curr.bbox.y2;
|
|
3204
|
-
if (colMatch && verticalGap >= -
|
|
3390
|
+
if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
|
|
3205
3391
|
const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
|
|
3206
3392
|
merged[merged.length - 1] = {
|
|
3207
3393
|
rowYs: allRowYs,
|
|
@@ -3211,7 +3397,8 @@ function mergeAdjacentGrids(grids) {
|
|
|
3211
3397
|
y1: Math.min(prev.bbox.y1, curr.bbox.y1),
|
|
3212
3398
|
x2: Math.max(prev.bbox.x2, curr.bbox.x2),
|
|
3213
3399
|
y2: Math.max(prev.bbox.y2, curr.bbox.y2)
|
|
3214
|
-
}
|
|
3400
|
+
},
|
|
3401
|
+
vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
|
|
3215
3402
|
};
|
|
3216
3403
|
continue;
|
|
3217
3404
|
}
|
|
@@ -3220,14 +3407,14 @@ function mergeAdjacentGrids(grids) {
|
|
|
3220
3407
|
}
|
|
3221
3408
|
return merged;
|
|
3222
3409
|
}
|
|
3223
|
-
function clusterCoordinates(values) {
|
|
3410
|
+
function clusterCoordinates(values, tolerance) {
|
|
3224
3411
|
if (values.length === 0) return [];
|
|
3225
3412
|
const sorted = [...values].sort((a, b) => a - b);
|
|
3226
3413
|
const clusters = [{ sum: sorted[0], count: 1 }];
|
|
3227
3414
|
for (let i = 1; i < sorted.length; i++) {
|
|
3228
3415
|
const last = clusters[clusters.length - 1];
|
|
3229
3416
|
const avg = last.sum / last.count;
|
|
3230
|
-
if (Math.abs(sorted[i] - avg) <=
|
|
3417
|
+
if (Math.abs(sorted[i] - avg) <= tolerance) {
|
|
3231
3418
|
last.sum += sorted[i];
|
|
3232
3419
|
last.count++;
|
|
3233
3420
|
} else {
|
|
@@ -3284,6 +3471,20 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3284
3471
|
const numRows = rowYs.length - 1;
|
|
3285
3472
|
const numCols = colXs.length - 1;
|
|
3286
3473
|
if (numRows <= 0 || numCols <= 0) return [];
|
|
3474
|
+
const vBorders = Array.from(
|
|
3475
|
+
{ length: numRows },
|
|
3476
|
+
(_, r) => Array.from(
|
|
3477
|
+
{ length: numCols + 1 },
|
|
3478
|
+
(_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
|
|
3479
|
+
)
|
|
3480
|
+
);
|
|
3481
|
+
const hBorders = Array.from(
|
|
3482
|
+
{ length: numRows + 1 },
|
|
3483
|
+
(_, r) => Array.from(
|
|
3484
|
+
{ length: numCols },
|
|
3485
|
+
(_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
|
|
3486
|
+
)
|
|
3487
|
+
);
|
|
3287
3488
|
const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
|
|
3288
3489
|
const cells = [];
|
|
3289
3490
|
for (let r = 0; r < numRows; r++) {
|
|
@@ -3291,18 +3492,26 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3291
3492
|
if (occupied[r][c]) continue;
|
|
3292
3493
|
let colSpan = 1;
|
|
3293
3494
|
let rowSpan = 1;
|
|
3294
|
-
while (c + colSpan < numCols) {
|
|
3295
|
-
|
|
3296
|
-
|
|
3297
|
-
|
|
3298
|
-
|
|
3495
|
+
while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
|
|
3496
|
+
let canExpand = true;
|
|
3497
|
+
for (let dr = 0; dr < rowSpan; dr++) {
|
|
3498
|
+
if (vBorders[r + dr][c + colSpan]) {
|
|
3499
|
+
canExpand = false;
|
|
3500
|
+
break;
|
|
3501
|
+
}
|
|
3502
|
+
}
|
|
3503
|
+
if (!canExpand) break;
|
|
3299
3504
|
colSpan++;
|
|
3300
3505
|
}
|
|
3301
3506
|
while (r + rowSpan < numRows) {
|
|
3302
|
-
|
|
3303
|
-
|
|
3304
|
-
|
|
3305
|
-
|
|
3507
|
+
let hasLine = false;
|
|
3508
|
+
for (let dc = 0; dc < colSpan; dc++) {
|
|
3509
|
+
if (hBorders[r + rowSpan][c + dc]) {
|
|
3510
|
+
hasLine = true;
|
|
3511
|
+
break;
|
|
3512
|
+
}
|
|
3513
|
+
}
|
|
3514
|
+
if (hasLine) break;
|
|
3306
3515
|
rowSpan++;
|
|
3307
3516
|
}
|
|
3308
3517
|
for (let dr = 0; dr < rowSpan; dr++) {
|
|
@@ -3326,28 +3535,30 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3326
3535
|
}
|
|
3327
3536
|
return cells;
|
|
3328
3537
|
}
|
|
3329
|
-
function hasVerticalLine(verticals, x, topY, botY) {
|
|
3330
|
-
const tol =
|
|
3538
|
+
function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
|
|
3539
|
+
const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
|
|
3331
3540
|
for (const v of verticals) {
|
|
3332
3541
|
if (Math.abs(v.x1 - x) <= tol) {
|
|
3333
3542
|
const cellH = Math.abs(topY - botY);
|
|
3543
|
+
if (cellH < 0.1) continue;
|
|
3334
3544
|
const overlapTop = Math.min(v.y2, topY);
|
|
3335
3545
|
const overlapBot = Math.max(v.y1, botY);
|
|
3336
3546
|
const overlap = overlapTop - overlapBot;
|
|
3337
|
-
if (overlap >= cellH * 0.
|
|
3547
|
+
if (overlap >= cellH * 0.75) return true;
|
|
3338
3548
|
}
|
|
3339
3549
|
}
|
|
3340
3550
|
return false;
|
|
3341
3551
|
}
|
|
3342
|
-
function hasHorizontalLine(horizontals, y, leftX, rightX) {
|
|
3343
|
-
const tol =
|
|
3552
|
+
function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
|
|
3553
|
+
const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
|
|
3344
3554
|
for (const h of horizontals) {
|
|
3345
3555
|
if (Math.abs(h.y1 - y) <= tol) {
|
|
3346
3556
|
const cellW = Math.abs(rightX - leftX);
|
|
3557
|
+
if (cellW < 0.1) continue;
|
|
3347
3558
|
const overlapLeft = Math.max(h.x1, leftX);
|
|
3348
3559
|
const overlapRight = Math.min(h.x2, rightX);
|
|
3349
3560
|
const overlap = overlapRight - overlapLeft;
|
|
3350
|
-
if (overlap >= cellW * 0.
|
|
3561
|
+
if (overlap >= cellW * 0.75) return true;
|
|
3351
3562
|
}
|
|
3352
3563
|
}
|
|
3353
3564
|
return false;
|
|
@@ -3358,23 +3569,24 @@ function mapTextToCells(items, cells) {
|
|
|
3358
3569
|
result.set(cell, []);
|
|
3359
3570
|
}
|
|
3360
3571
|
for (const item of items) {
|
|
3361
|
-
const cx = item.x + item.w / 2;
|
|
3362
|
-
const cy = item.y;
|
|
3363
3572
|
const pad = CELL_PADDING;
|
|
3364
3573
|
let bestCell = null;
|
|
3365
|
-
let
|
|
3574
|
+
let bestScore = 0;
|
|
3366
3575
|
for (const cell of cells) {
|
|
3367
|
-
|
|
3368
|
-
|
|
3369
|
-
|
|
3370
|
-
|
|
3371
|
-
|
|
3372
|
-
|
|
3373
|
-
|
|
3374
|
-
|
|
3576
|
+
const ix1 = Math.max(item.x, cell.bbox.x1 - pad);
|
|
3577
|
+
const ix2 = Math.min(item.x + item.w, cell.bbox.x2 + pad);
|
|
3578
|
+
const iy1 = Math.max(item.y, cell.bbox.y1 - pad);
|
|
3579
|
+
const iy2 = Math.min(item.y + (item.h || item.fontSize), cell.bbox.y2 + pad);
|
|
3580
|
+
if (ix1 >= ix2 || iy1 >= iy2) continue;
|
|
3581
|
+
const intersectArea = (ix2 - ix1) * (iy2 - iy1);
|
|
3582
|
+
const itemArea = Math.max(item.w, 1) * Math.max(item.h || item.fontSize, 1);
|
|
3583
|
+
const score = intersectArea / itemArea;
|
|
3584
|
+
if (score > bestScore) {
|
|
3585
|
+
bestScore = score;
|
|
3586
|
+
bestCell = cell;
|
|
3375
3587
|
}
|
|
3376
3588
|
}
|
|
3377
|
-
if (bestCell) {
|
|
3589
|
+
if (bestCell && bestScore > 0.3) {
|
|
3378
3590
|
result.get(bestCell).push(item);
|
|
3379
3591
|
}
|
|
3380
3592
|
}
|
|
@@ -3401,8 +3613,13 @@ function cellTextToString(items) {
|
|
|
3401
3613
|
const textLines = lines.map((line) => {
|
|
3402
3614
|
const s = line.sort((a, b) => a.x - b.x);
|
|
3403
3615
|
if (s.length === 1) return s[0].text;
|
|
3616
|
+
const evenSpaced = detectEvenSpacedItems(s);
|
|
3404
3617
|
let result = s[0].text;
|
|
3405
3618
|
for (let j = 1; j < s.length; j++) {
|
|
3619
|
+
if (evenSpaced[j]) {
|
|
3620
|
+
result += s[j].text;
|
|
3621
|
+
continue;
|
|
3622
|
+
}
|
|
3406
3623
|
const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
|
|
3407
3624
|
const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
|
|
3408
3625
|
const prevIsKorean = /[가-힣]$/.test(result);
|
|
@@ -3417,6 +3634,57 @@ function cellTextToString(items) {
|
|
|
3417
3634
|
}
|
|
3418
3635
|
return result;
|
|
3419
3636
|
});
|
|
3637
|
+
return mergeCellTextLines(textLines);
|
|
3638
|
+
}
|
|
3639
|
+
function detectEvenSpacedItems(items) {
|
|
3640
|
+
const result = new Array(items.length).fill(false);
|
|
3641
|
+
if (items.length < 3) return result;
|
|
3642
|
+
let runStart = -1;
|
|
3643
|
+
for (let i = 0; i < items.length; i++) {
|
|
3644
|
+
const isShortKorean = /^[가-힣]{1}$/.test(items[i].text) || /^[\d]{1}$/.test(items[i].text);
|
|
3645
|
+
if (isShortKorean && runStart >= 0 && i > 0) {
|
|
3646
|
+
const gap = items[i].x - (items[i - 1].x + items[i - 1].w);
|
|
3647
|
+
const maxRunGap = Math.max(items[i].fontSize * 3, 30);
|
|
3648
|
+
if (gap > maxRunGap) {
|
|
3649
|
+
if (i - runStart >= 3) markEvenRun(items, result, runStart, i);
|
|
3650
|
+
runStart = i;
|
|
3651
|
+
continue;
|
|
3652
|
+
}
|
|
3653
|
+
}
|
|
3654
|
+
if (isShortKorean) {
|
|
3655
|
+
if (runStart < 0) runStart = i;
|
|
3656
|
+
} else {
|
|
3657
|
+
if (runStart >= 0 && i - runStart >= 3) {
|
|
3658
|
+
markEvenRun(items, result, runStart, i);
|
|
3659
|
+
}
|
|
3660
|
+
runStart = -1;
|
|
3661
|
+
}
|
|
3662
|
+
}
|
|
3663
|
+
if (runStart >= 0 && items.length - runStart >= 3) {
|
|
3664
|
+
markEvenRun(items, result, runStart, items.length);
|
|
3665
|
+
}
|
|
3666
|
+
return result;
|
|
3667
|
+
}
|
|
3668
|
+
/**
 * Marks the items of a run as evenly spaced when its gaps are uniform enough.
 *
 * Only strictly positive gaps between neighbours in `[start, end)` are
 * considered, and at least two are required. The run qualifies when, relative
 * to the font size of the first item in the run, the smallest gap is at least
 * 0.1x, the largest gap is at most 3x, and the largest gap is no more than
 * three times the smallest (with the smallest floored at 0.1).
 *
 * On success every index from `start + 1` up to `end - 1` is set to `true`;
 * `result[start]` is left untouched.
 *
 * @param {Array<{x: number, w: number, fontSize: number}>} items
 * @param {boolean[]} result Flag array mutated in place.
 * @param {number} start Index of the first item of the run.
 * @param {number} end Index one past the last item of the run.
 * @returns {void}
 */
function markEvenRun(items, result, start, end) {
  const positiveGaps = [];
  for (let idx = start + 1; idx < end; idx += 1) {
    const before = items[idx - 1];
    const gap = items[idx].x - (before.x + before.w);
    if (gap > 0) positiveGaps.push(gap);
  }
  if (positiveGaps.length < 2) return;

  const smallest = positiveGaps.reduce((lo, g) => (g < lo ? g : lo), Infinity);
  const largest = positiveGaps.reduce((hi, g) => (g > hi ? g : hi), -Infinity);

  // The reference size is the first item's font size, not an average.
  const refFontSize = items[start].fontSize;
  const evenlySpaced =
    smallest >= refFontSize * 0.1 &&
    largest <= refFontSize * 3 &&
    largest / Math.max(smallest, 0.1) <= 3;

  if (evenlySpaced) {
    for (let idx = start + 1; idx < end; idx += 1) {
      result[idx] = true;
    }
  }
}
|
|
3687
|
+
function mergeCellTextLines(textLines) {
|
|
3420
3688
|
if (textLines.length <= 1) return textLines[0] || "";
|
|
3421
3689
|
const merged = [textLines[0]];
|
|
3422
3690
|
for (let i = 1; i < textLines.length; i++) {
|
|
@@ -3442,24 +3710,172 @@ var Y_TOL = 3;
|
|
|
3442
3710
|
var COL_CLUSTER_TOL = 15;
|
|
3443
3711
|
var MIN_ROWS = 3;
|
|
3444
3712
|
var MIN_COLS = 2;
|
|
3445
|
-
var MIN_GAP_FACTOR =
|
|
3446
|
-
var
|
|
3713
|
+
var MIN_GAP_FACTOR = 2;
|
|
3714
|
+
var MIN_GAP_ABSOLUTE = 20;
|
|
3715
|
+
var MIN_COL_FILL_RATIO = 0.4;
|
|
3447
3716
|
/**
 * Detects borderless tables on a page from the positions of its text items.
 *
 * Evenly spaced single-character runs are merged first, then the items are
 * grouped into baseline rows. Two strategies are tried in order:
 *   1. header-driven: find a header row, fold continuation lines into the
 *      rows below it, and cut table regions by the header's column ranges;
 *   2. gap-driven (only when step 1 produced no table): cluster columns from
 *      rows with suspicious gaps and cut regions against those columns.
 *
 * Every table produced has its `usedItems` expanded through the origin map of
 * the merge step via `expandUsedItems`.
 *
 * @param {Array<object>} items Text items of one page.
 * @param {number} pageNum Page number forwarded to `buildClusterTable`.
 * @returns {Array<object>} Tables returned by `buildClusterTable`; empty when
 *   there are fewer than `MIN_ROWS * MIN_COLS` items, fewer than `MIN_ROWS`
 *   rows, or nothing table-like is found.
 */
function detectClusterTables(items, pageNum) {
  if (items.length < MIN_ROWS * MIN_COLS) return [];

  const { merged, originMap } = mergeEvenSpacedClusters(items);
  const rows = groupByBaseline(merged);
  if (rows.length < MIN_ROWS) return [];

  const results = [];
  // Keeps a built table (if any) after mapping merged items back to originals.
  const collect = (table) => {
    if (!table) return;
    expandUsedItems(table.usedItems, originMap);
    results.push(table);
  };

  // Strategy 1: header-driven detection.
  const headerResult = detectHeaderRow(rows);
  if (headerResult) {
    const { columns, headerIdx } = headerResult;
    const headerItems = [...rows[headerIdx].items].sort((a, b) => a.x - b.x);
    const mergedRows = mergeMultiLineRows(rows.slice(headerIdx), columns);
    for (const region of findTableRegionsByHeader(mergedRows, columns, headerItems)) {
      collect(buildClusterTable(region.rows, columns, pageNum));
    }
  }
  if (results.length > 0) return results;

  // Strategy 2: gap-driven fallback.
  const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
  if (suspiciousRows.length < MIN_ROWS) return results;

  const columns = extractColumnClusters(suspiciousRows);
  if (columns.length < MIN_COLS) return results;

  for (const region of findTableRegions(rows, columns)) {
    const mergedRows = mergeMultiLineRows(region.rows, columns);
    collect(buildClusterTable(mergedRows, columns, pageNum));
  }
  return results;
}
|
|
3757
|
+
/**
 * Collapses evenly spaced single-character runs into one synthetic item each.
 *
 * Items are grouped into baseline rows and scanned left to right. A run is a
 * sequence of items whose text is a single Hangul syllable or digit and whose
 * gap to the previous item lies within [0.1, 3] times the font size. Runs of
 * three or more items whose smallest gap is positive and whose largest gap is
 * at most three times the smallest are replaced by one item spanning the run;
 * everything else is passed through unchanged.
 *
 * @param {Array<object>} items Text items with `text`, `x`, `y`, `w`, `h`,
 *   `fontSize` and `fontName`.
 * @returns {{merged: Array<object>, originMap: Map<object, Array<object>>}}
 *   `merged` is the new item list; `originMap` maps each synthetic item to the
 *   original items it replaces.
 */
function mergeEvenSpacedClusters(items) {
  const singleChar = /^[가-힣\d]$/;
  const originMap = /* @__PURE__ */ new Map();
  const merged = [];

  // Horizontal distance between list[k - 1]'s right edge and list[k]'s left edge.
  const gapBefore = (list, k) => list[k].x - (list[k - 1].x + list[k - 1].w);

  // True when the gaps inside list[from..to) are positive and within a 3x ratio.
  const hasUniformGaps = (list, from, to) => {
    let smallest = Infinity;
    let largest = -Infinity;
    for (let k = from + 1; k < to; k += 1) {
      const gap = gapBefore(list, k);
      if (gap < smallest) smallest = gap;
      if (gap > largest) largest = gap;
    }
    return smallest > 0 && largest / smallest <= 3;
  };

  for (const row of groupByBaseline(items)) {
    const ordered = [...row.items].sort((a, b) => a.x - b.x);
    let cursor = 0;

    while (cursor < ordered.length) {
      // Extend a candidate run starting at `cursor` as far as it goes.
      let runEnd = cursor;
      if (singleChar.test(ordered[cursor].text)) {
        runEnd = cursor + 1;
        while (runEnd < ordered.length && singleChar.test(ordered[runEnd].text)) {
          const gap = gapBefore(ordered, runEnd);
          const size = ordered[runEnd].fontSize;
          if (gap < size * 0.1 || gap > size * 3) break;
          runEnd += 1;
        }
      }

      if (runEnd - cursor >= 3 && hasUniformGaps(ordered, cursor, runEnd)) {
        const run = ordered.slice(cursor, runEnd);
        const first = run[0];
        const last = run[run.length - 1];
        const item = {
          text: run.map((r) => r.text).join(""),
          x: first.x,
          y: first.y,
          w: last.x + last.w - first.x,
          h: first.h,
          fontSize: first.fontSize,
          fontName: first.fontName
        };
        originMap.set(item, run);
        merged.push(item);
        cursor = runEnd;
        continue;
      }

      // No usable run here: keep the item and retry from the next position.
      merged.push(ordered[cursor]);
      cursor += 1;
    }
  }

  return { merged, originMap };
}
|
|
3809
|
+
/**
 * Adds, for every used item that has an entry in `originMap`, the mapped
 * original items to the same set.
 *
 * The set is snapshotted before anything is added, so items introduced by
 * this call are not themselves looked up.
 *
 * @param {Set<object>} usedItems Set mutated in place.
 * @param {Map<object, Iterable<object>>} originMap Item -> originals.
 * @returns {void}
 */
function expandUsedItems(usedItems, originMap) {
  const snapshot = Array.from(usedItems);
  snapshot.forEach((item) => {
    const origins = originMap.get(item);
    if (!origins) return;
    for (const origin of origins) usedItems.add(origin);
  });
}
|
|
3817
|
+
/**
 * Searches the rows top to bottom for the first one that looks like a table
 * header and is followed by enough rows matching its columns.
 *
 * A candidate row must
 *   - hold between `MIN_COLS` and 6 items,
 *   - contain no item longer than 8 characters,
 *   - contain at least one Hangul syllable,
 *   - contain no item starting with a bullet-like marker,
 *   - span at least 40% of the horizontal extent of all items, and
 *   - have at least one gap of 2.5x the row's average font size or more.
 * It is accepted when at least `MIN_ROWS` of the rows after it hit
 * `MIN_COLS` or more of its column ranges (counting stops at `MIN_ROWS + 2`).
 *
 * @param {Array<{items: Array<object>}>} rows Baseline rows.
 * @returns {{columns: Array<{x: number, count: number}>, headerIdx: number} | null}
 *   Column anchors taken from the header items' x positions plus the header's
 *   row index, or `null` when no row qualifies.
 */
function detectHeaderRow(rows) {
  const everyItem = rows.flatMap((r) => r.items);
  if (everyItem.length === 0) return null;

  // Horizontal extent covered by all items.
  let leftEdge = Infinity;
  let rightEdge = -Infinity;
  for (const item of everyItem) {
    if (item.x < leftEdge) leftEdge = item.x;
    const itemRight = item.x + item.w;
    if (itemRight > rightEdge) rightEdge = itemRight;
  }
  const pageSpan = rightEdge - leftEdge;
  if (pageSpan <= 0) return null;

  for (let headerIdx = 0; headerIdx < rows.length; headerIdx += 1) {
    const cells = rows[headerIdx].items;

    const looksLikeHeader =
      cells.length >= MIN_COLS &&
      cells.length <= 6 &&
      !cells.some((cell) => cell.text.length > 8) &&
      cells.some((cell) => /[가-힣]/.test(cell.text)) &&
      !cells.some((cell) => /^[□■○●·※▶▷◆◇\-]/.test(cell.text));
    if (!looksLikeHeader) continue;

    const sorted = [...cells].sort((a, b) => a.x - b.x);
    const rightmost = sorted[sorted.length - 1];
    const xSpan = rightmost.x + rightmost.w - sorted[0].x;
    if (xSpan / pageSpan < 0.4) continue;

    const avgFs = sorted.reduce((sum, cell) => sum + cell.fontSize, 0) / sorted.length;
    const hasLargeGap = sorted
      .slice(1)
      .some((cell, k) => cell.x - (sorted[k].x + sorted[k].w) >= avgFs * 2.5);
    if (!hasLargeGap) continue;

    const columns = sorted.map((cell) => ({ x: cell.x, count: 0 }));

    // Count the following rows that line up with the header's columns.
    let matchCount = 0;
    for (let j = headerIdx + 1; j < rows.length && matchCount < MIN_ROWS + 2; j += 1) {
      if (countMatchedColumnsRange(rows[j], columns, sorted) >= MIN_COLS) {
        matchCount += 1;
      }
    }
    if (matchCount < MIN_ROWS) continue;

    return { columns, headerIdx };
  }

  return null;
}
|
|
3858
|
+
/**
 * Folds wrapped continuation lines into the row above them.
 *
 * A row is treated as a continuation of the previously kept row when it is
 * vertically closer than 1.8x the average font size of all items, holds at
 * most two items, and either hits fewer than `MIN_COLS` columns or consists
 * of a single item. The merged row keeps the `y` of the row it was folded
 * into and the concatenation of both item lists.
 *
 * @param {Array<{y: number, items: Array<object>}>} rows Rows, top to bottom.
 * @param {Array<object>} columns Column descriptors for `countMatchedColumns`.
 * @returns {Array<{y: number, items: Array<object>}>} The input array itself
 *   when it has at most one row, otherwise a new array.
 */
function mergeMultiLineRows(rows, columns) {
  if (rows.length <= 1) return rows;

  // Average font size over every item of every row (12 when there are none).
  let fontTotal = 0;
  let fontCount = 0;
  for (const row of rows) {
    for (const item of row.items) {
      fontTotal += item.fontSize;
      fontCount += 1;
    }
  }
  const avgFontSize = fontCount > 0 ? fontTotal / fontCount : 12;

  const out = [rows[0]];
  for (const curr of rows.slice(1)) {
    const tail = out.length - 1;
    const prev = out[tail];
    const yGap = Math.abs(prev.y - curr.y);
    const matchedCols = countMatchedColumns(curr, columns);

    const isContinuation =
      yGap < avgFontSize * 1.8 &&
      curr.items.length <= 2 &&
      (matchedCols < MIN_COLS || curr.items.length === 1);

    if (isContinuation) {
      out[tail] = { y: prev.y, items: prev.items.concat(curr.items) };
    } else {
      out.push(curr);
    }
  }
  return out;
}
|
|
3463
3879
|
function groupByBaseline(items) {
|
|
3464
3880
|
if (items.length === 0) return [];
|
|
3465
3881
|
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
@@ -3481,8 +3897,9 @@ function groupByBaseline(items) {
|
|
|
3481
3897
|
function hasSuspiciousGaps(row) {
|
|
3482
3898
|
if (row.items.length < 2) return false;
|
|
3483
3899
|
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3900
|
+
if (sorted.length === 2 && sorted[1].text.length > 20) return false;
|
|
3484
3901
|
const avgFontSize = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
|
|
3485
|
-
const minGap = avgFontSize * MIN_GAP_FACTOR;
|
|
3902
|
+
const minGap = Math.max(avgFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
|
|
3486
3903
|
for (let i = 1; i < sorted.length; i++) {
|
|
3487
3904
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
3488
3905
|
if (gap >= minGap) return true;
|
|
@@ -3509,6 +3926,41 @@ function extractColumnClusters(rows) {
|
|
|
3509
3926
|
const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
|
|
3510
3927
|
return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
|
|
3511
3928
|
}
|
|
3929
|
+
/**
 * Cuts the rows into table regions using the header's column ranges.
 *
 * A row that hits at least `MIN_COLS` header column ranges extends the
 * current region. A non-matching row is tolerated inside an open region when
 * it has at most two items or is the first miss after a match; any other row
 * closes the region. When a region is closed (or the input ends), trailing
 * tolerated rows are dropped and the region is kept only if at least
 * `MIN_ROWS` rows remain.
 *
 * The position just after the last matching row is tracked while scanning, so
 * closing a region is a single truncation rather than a second pass that
 * re-evaluates rows already classified.
 *
 * @param {Array<{items: Array<object>}>} allRows Rows, top to bottom.
 * @param {Array<object>} columns Column descriptors forwarded to
 *   `countMatchedColumnsRange`.
 * @param {Array<{x: number, w: number}>} headerItems Header items sorted by x.
 * @returns {Array<{rows: Array<object>}>} Detected regions in input order.
 */
function findTableRegionsByHeader(allRows, columns, headerItems) {
  const regions = [];
  let currentRegion = [];
  // Length of currentRegion up to and including its last column-matching row.
  let matchedLength = 0;
  let missStreak = 0;

  // Drops trailing tolerated rows, stores the region if large enough, resets.
  const closeRegion = () => {
    if (matchedLength >= MIN_ROWS) {
      regions.push({ rows: currentRegion.slice(0, matchedLength) });
    }
    currentRegion = [];
    matchedLength = 0;
    missStreak = 0;
  };

  for (const row of allRows) {
    const matchedCols = countMatchedColumnsRange(row, columns, headerItems);
    if (matchedCols >= MIN_COLS) {
      currentRegion.push(row);
      matchedLength = currentRegion.length;
      missStreak = 0;
    } else if (currentRegion.length > 0 && (row.items.length <= 2 || missStreak === 0)) {
      // Tolerated miss: kept provisionally, trimmed if nothing matches after it.
      currentRegion.push(row);
      missStreak += 1;
    } else {
      closeRegion();
    }
  }
  closeRegion();

  return regions;
}
|
|
3512
3964
|
function findTableRegions(allRows, columns) {
|
|
3513
3965
|
const regions = [];
|
|
3514
3966
|
let currentRegion = [];
|
|
@@ -3544,18 +3996,81 @@ function countMatchedColumns(row, columns) {
|
|
|
3544
3996
|
}
|
|
3545
3997
|
return matched.size;
|
|
3546
3998
|
}
|
|
3547
|
-
function
|
|
3548
|
-
const
|
|
3549
|
-
let
|
|
3550
|
-
|
|
3551
|
-
|
|
3552
|
-
|
|
3553
|
-
|
|
3554
|
-
|
|
3555
|
-
|
|
3999
|
+
/**
 * Counts how many distinct header columns are hit by the items of a row.
 *
 * Each header item defines a half-open x range `[left, right)`. Interior
 * boundaries lie halfway between the right edge of one header item and the
 * left edge of the next. The outer ranges are open-ended on both sides
 * (`-Infinity` on the left, `Infinity` on the right), so an item positioned
 * left of the origin still falls into the first column instead of matching
 * no column at all. An item is assigned by its left edge `x`.
 *
 * @param {{items: Array<{x: number}>}} row Row whose items are tested.
 * @param {Array<object>} columns Unused here; kept so the signature lines up
 *   with the call sites.
 * @param {Array<{x: number, w: number}>} headerItems Header items sorted by x.
 * @returns {number} Number of distinct header columns containing an item.
 */
function countMatchedColumnsRange(row, columns, headerItems) {
  const lastIdx = headerItems.length - 1;
  const boundaries = headerItems.map((header, ci) => {
    const before = headerItems[ci - 1];
    const after = headerItems[ci + 1];
    return {
      left: ci === 0 ? -Infinity : (before.x + before.w + header.x) / 2,
      right: ci === lastIdx ? Infinity : (header.x + header.w + after.x) / 2,
    };
  });

  const matched = /* @__PURE__ */ new Set();
  for (const item of row.items) {
    const ci = boundaries.findIndex((b) => item.x >= b.left && item.x < b.right);
    if (ci !== -1) matched.add(ci);
  }
  return matched.size;
}
|
|
4017
|
+
/**
 * Splits the items of one row into cell groups and assigns each group to a
 * column.
 *
 * The items are sorted by x and cut at the largest gaps: a gap qualifies when
 * it is at least 12 (or at least `max(2.5 * median gap, 12)` when the row has
 * more than `numCols + 1` items), and at most `numCols - 1` cuts are made.
 * Groups are then matched to columns greedily by the smallest distance
 * between the group's horizontal midpoint and the column's `x`, each column
 * being used at most once. Any group still unassigned goes to its nearest
 * column, even if that column is already taken.
 *
 * NOTE(review): the greedy matching does not enforce that groups keep their
 * left-to-right order across columns — confirm this is acceptable.
 *
 * @param {Array<{x: number, w: number}>} items Items of a single row.
 * @param {Array<{x: number}>} columns Column descriptors.
 * @param {number} numCols Number of columns to assign to.
 * @returns {Array<{col: number, items: Array<object>}>} One entry per group,
 *   in the order the assignments were made; empty for an empty row.
 */
function assignRowItems(items, columns, numCols) {
  if (items.length === 0) return [];

  const ordered = [...items].sort((a, b) => a.x - b.x);
  const colCenters = columns.map((c) => c.x);

  // gaps[k] describes the space in front of ordered[k + 1].
  const gaps = ordered.slice(1).map((item, k) => ({
    idx: k + 1,
    size: item.x - (ordered[k].x + ordered[k].w)
  }));
  const ascendingSizes = gaps.map((gap) => gap.size).sort((a, b) => a - b);
  const medianGap = ascendingSizes.length > 0
    ? ascendingSizes[Math.floor(ascendingSizes.length / 2)]
    : 0;
  const gapThreshold = ordered.length <= numCols + 1 ? 12 : Math.max(medianGap * 2.5, 12);

  // Keep the widest qualifying gaps, then restore positional order.
  const cuts = gaps
    .filter((gap) => gap.size >= gapThreshold)
    .sort((a, b) => b.size - a.size)
    .slice(0, numCols - 1)
    .sort((a, b) => a.idx - b.idx);

  const groups = [];
  let from = 0;
  for (const { idx } of cuts) {
    groups.push(ordered.slice(from, idx));
    from = idx;
  }
  groups.push(ordered.slice(from));

  const groupCenters = groups.map((group) => {
    let left = Infinity;
    let right = -Infinity;
    for (const member of group) {
      if (member.x < left) left = member.x;
      const memberRight = member.x + member.w;
      if (memberRight > right) right = memberRight;
    }
    return (left + right) / 2;
  });

  // Every (group, column) pairing, cheapest first.
  const candidates = [];
  groupCenters.forEach((center, gi) => {
    for (let ci = 0; ci < numCols; ci += 1) {
      candidates.push({ gi, ci, dist: Math.abs(center - colCenters[ci]) });
    }
  });
  candidates.sort((a, b) => a.dist - b.dist);

  const result = [];
  const takenGroups = /* @__PURE__ */ new Set();
  const takenCols = /* @__PURE__ */ new Set();
  for (const { gi, ci } of candidates) {
    if (takenGroups.has(gi) || takenCols.has(ci)) continue;
    result.push({ col: ci, items: groups[gi] });
    takenGroups.add(gi);
    takenCols.add(ci);
  }

  // Leftover groups fall back to the nearest column regardless of occupancy.
  groups.forEach((group, gi) => {
    if (takenGroups.has(gi)) return;
    let bestCol = 0;
    let bestDist = Infinity;
    for (let ci = 0; ci < numCols; ci += 1) {
      const dist = Math.abs(groupCenters[gi] - colCenters[ci]);
      if (dist < bestDist) {
        bestDist = dist;
        bestCol = ci;
      }
    }
    result.push({ col: bestCol, items: group });
  });

  return result;
}
|
|
3560
4075
|
function buildClusterTable(rows, columns, pageNum) {
|
|
3561
4076
|
const numCols = columns.length;
|
|
@@ -3573,12 +4088,12 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3573
4088
|
usedItems.add(row.items[0]);
|
|
3574
4089
|
continue;
|
|
3575
4090
|
}
|
|
3576
|
-
|
|
3577
|
-
|
|
3578
|
-
|
|
4091
|
+
const assignments = assignRowItems(row.items, columns, numCols);
|
|
4092
|
+
for (const { col, items } of assignments) {
|
|
4093
|
+
const text = items.map((i) => i.text).join(" ");
|
|
3579
4094
|
const existing = cells[r][col].text;
|
|
3580
|
-
cells[r][col].text = existing ? existing + " " +
|
|
3581
|
-
usedItems.add(item);
|
|
4095
|
+
cells[r][col].text = existing ? existing + " " + text : text;
|
|
4096
|
+
for (const item of items) usedItems.add(item);
|
|
3582
4097
|
}
|
|
3583
4098
|
}
|
|
3584
4099
|
let emptyRows = 0;
|
|
@@ -3590,11 +4105,48 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3590
4105
|
const hasValue = cells.some((row) => row[c].text !== "");
|
|
3591
4106
|
if (!hasValue) return null;
|
|
3592
4107
|
}
|
|
4108
|
+
for (let r = numRows - 1; r >= 1; r--) {
|
|
4109
|
+
const nonEmptyCols = cells[r].filter((c) => c.text.trim()).length;
|
|
4110
|
+
if (nonEmptyCols !== 1) continue;
|
|
4111
|
+
if (cells[r][0].text.trim() !== "") continue;
|
|
4112
|
+
const contentText = cells[r].find((c) => c.text.trim())?.text.trim() || "";
|
|
4113
|
+
if (/^[○●▶\-·]/.test(contentText)) continue;
|
|
4114
|
+
for (let pr = r - 1; pr >= 0; pr--) {
|
|
4115
|
+
if (cells[pr].some((c) => c.text.trim())) {
|
|
4116
|
+
for (let c = 0; c < numCols; c++) {
|
|
4117
|
+
const prev = cells[pr][c].text.trim();
|
|
4118
|
+
const curr = cells[r][c].text.trim();
|
|
4119
|
+
if (curr) cells[pr][c].text = prev ? prev + " " + curr : curr;
|
|
4120
|
+
}
|
|
4121
|
+
for (let c = 0; c < numCols; c++) cells[r][c].text = "";
|
|
4122
|
+
break;
|
|
4123
|
+
}
|
|
4124
|
+
}
|
|
4125
|
+
}
|
|
4126
|
+
for (let r = 0; r < cells.length - 1; r++) {
|
|
4127
|
+
const row = cells[r];
|
|
4128
|
+
const hasCol0 = row[0].text.trim() !== "";
|
|
4129
|
+
const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
|
|
4130
|
+
const midEmpty = row.slice(1, numCols - 1).every((c) => c.text.trim() === "");
|
|
4131
|
+
if (hasCol0 && hasColLast && midEmpty) {
|
|
4132
|
+
const next = cells[r + 1];
|
|
4133
|
+
if (next[0].text.trim() === "" && next.some((c) => c.text.trim())) {
|
|
4134
|
+
for (let c = 1; c < numCols; c++) {
|
|
4135
|
+
const curr = next[c].text.trim();
|
|
4136
|
+
if (curr) row[c].text = row[c].text.trim() ? row[c].text.trim() + " " + curr : curr;
|
|
4137
|
+
}
|
|
4138
|
+
for (let c = 0; c < numCols; c++) next[c].text = "";
|
|
4139
|
+
}
|
|
4140
|
+
}
|
|
4141
|
+
}
|
|
4142
|
+
const filteredCells = cells.filter((row) => row.some((c) => c.text.trim()));
|
|
4143
|
+
const finalRowCount = filteredCells.length;
|
|
4144
|
+
if (finalRowCount < MIN_ROWS) return null;
|
|
3593
4145
|
const irTable = {
|
|
3594
|
-
rows:
|
|
4146
|
+
rows: finalRowCount,
|
|
3595
4147
|
cols: numCols,
|
|
3596
|
-
cells,
|
|
3597
|
-
hasHeader:
|
|
4148
|
+
cells: filteredCells,
|
|
4149
|
+
hasHeader: finalRowCount > 1
|
|
3598
4150
|
};
|
|
3599
4151
|
const allItems = rows.flatMap((r) => r.items);
|
|
3600
4152
|
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
@@ -3671,7 +4223,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3671
4223
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
3672
4224
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
3673
4225
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
3674
|
-
const
|
|
4226
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
3675
4227
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
3676
4228
|
let parsedPages = 0;
|
|
3677
4229
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
@@ -3688,7 +4240,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3688
4240
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
3689
4241
|
}
|
|
3690
4242
|
for (const item of visible) {
|
|
3691
|
-
if (item.fontSize > 0)
|
|
4243
|
+
if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
|
|
3692
4244
|
}
|
|
3693
4245
|
const opList = await page.getOperatorList();
|
|
3694
4246
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -3727,7 +4279,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3727
4279
|
blocks.splice(removed[ri], 1);
|
|
3728
4280
|
}
|
|
3729
4281
|
}
|
|
3730
|
-
const medianFontSize =
|
|
4282
|
+
const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
|
|
3731
4283
|
if (medianFontSize > 0) {
|
|
3732
4284
|
detectHeadings(blocks, medianFontSize);
|
|
3733
4285
|
}
|
|
@@ -3780,11 +4332,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
3780
4332
|
}
|
|
3781
4333
|
return { visible, hiddenCount };
|
|
3782
4334
|
}
|
|
3783
|
-
function
|
|
3784
|
-
if (
|
|
3785
|
-
|
|
3786
|
-
const
|
|
3787
|
-
|
|
4335
|
+
/**
 * Computes the median font size from a size -> occurrence-count histogram.
 *
 * Sizes are walked in ascending order while accumulating their counts; the
 * first size whose cumulative count exceeds `floor(total / 2)` is returned.
 * For an even total this is the upper of the two middle values.
 *
 * @param {Map<number, number>} freq Font size mapped to how often it occurs.
 * @returns {number} The median font size, or 0 for an empty histogram.
 */
function computeMedianFontSizeFromFreq(freq) {
  if (freq.size === 0) return 0;

  const entries = [...freq.entries()];
  const total = entries.reduce((sum, [, count]) => sum + count, 0);
  const mid = Math.floor(total / 2);

  entries.sort(([sizeA], [sizeB]) => sizeA - sizeB);

  let seen = 0;
  for (const [size, count] of entries) {
    seen += count;
    if (seen > mid) return size;
  }
  // Fallback for histograms whose counts never exceed the midpoint.
  return entries[entries.length - 1][0];
}
|
|
3789
4348
|
function detectHeadings(blocks, medianFontSize) {
|
|
3790
4349
|
for (const block of blocks) {
|
|
@@ -3810,11 +4369,21 @@ function collapseEvenSpacing(text) {
|
|
|
3810
4369
|
if (tokens.length >= 3 && singleCharCount / tokens.length >= 0.7) {
|
|
3811
4370
|
return tokens.join("");
|
|
3812
4371
|
}
|
|
3813
|
-
return text
|
|
4372
|
+
return text.replace(
|
|
4373
|
+
/(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
|
|
4374
|
+
(match) => match.replace(/ /g, "")
|
|
4375
|
+
);
|
|
3814
4376
|
}
|
|
3815
4377
|
function shouldDemoteTable(table) {
|
|
3816
4378
|
const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
|
|
3817
4379
|
const allText = allCells.join(" ");
|
|
4380
|
+
if (table.rows <= 3 && table.cols <= 3) {
|
|
4381
|
+
const totalCells2 = table.rows * table.cols;
|
|
4382
|
+
const emptyCells2 = totalCells2 - allCells.length;
|
|
4383
|
+
if (emptyCells2 >= totalCells2 * 0.3) return true;
|
|
4384
|
+
if (/[□■◆○●▶ㅇ]/.test(allText)) return true;
|
|
4385
|
+
if (/<[^>]+>/.test(allText)) return true;
|
|
4386
|
+
}
|
|
3818
4387
|
if (allText.length > 200) return false;
|
|
3819
4388
|
if (/[□■◆○●▶]/.test(allText) && table.rows <= 3) return true;
|
|
3820
4389
|
const totalCells = table.rows * table.cols;
|
|
@@ -3925,6 +4494,7 @@ function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeigh
|
|
|
3925
4494
|
if (items.length === 0) return [];
|
|
3926
4495
|
let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
|
|
3927
4496
|
({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
|
|
4497
|
+
({ horizontals, verticals } = preprocessLines(horizontals, verticals));
|
|
3928
4498
|
const grids = buildTableGrids(horizontals, verticals);
|
|
3929
4499
|
if (grids.length > 0) {
|
|
3930
4500
|
return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
|
|
@@ -3936,14 +4506,19 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
3936
4506
|
const usedItems = /* @__PURE__ */ new Set();
|
|
3937
4507
|
const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
3938
4508
|
for (const grid of sortedGrids) {
|
|
4509
|
+
const numGridRows = grid.rowYs.length - 1;
|
|
4510
|
+
const numGridCols = grid.colXs.length - 1;
|
|
4511
|
+
if (numGridRows === 1 && numGridCols >= 2) continue;
|
|
3939
4512
|
const tableItems = [];
|
|
3940
4513
|
const pad = 3;
|
|
4514
|
+
const gridW = grid.bbox.x2 - grid.bbox.x1;
|
|
3941
4515
|
for (const item of items) {
|
|
3942
4516
|
if (usedItems.has(item)) continue;
|
|
3943
|
-
if (item.
|
|
3944
|
-
|
|
3945
|
-
|
|
3946
|
-
|
|
4517
|
+
if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
|
|
4518
|
+
if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
|
|
4519
|
+
if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
|
|
4520
|
+
tableItems.push(item);
|
|
4521
|
+
usedItems.add(item);
|
|
3947
4522
|
}
|
|
3948
4523
|
const cells = extractCells(grid, horizontals, verticals);
|
|
3949
4524
|
if (cells.length === 0) continue;
|
|
@@ -3967,6 +4542,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
3967
4542
|
const cellItems = cellTextMap.get(cell) || [];
|
|
3968
4543
|
let text = cellTextToString(cellItems);
|
|
3969
4544
|
text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
|
|
4545
|
+
text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
|
|
3970
4546
|
irGrid[cell.row][cell.col] = {
|
|
3971
4547
|
text,
|
|
3972
4548
|
colSpan: cell.colSpan,
|
|
@@ -3991,23 +4567,58 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
3991
4567
|
if (shouldDemoteTable(irTable)) {
|
|
3992
4568
|
const demoted = demoteTableToText(irTable);
|
|
3993
4569
|
if (demoted) {
|
|
3994
|
-
|
|
4570
|
+
const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
|
|
4571
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
|
|
3995
4572
|
}
|
|
3996
4573
|
continue;
|
|
3997
4574
|
}
|
|
3998
4575
|
blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
|
|
3999
4576
|
}
|
|
4000
|
-
|
|
4577
|
+
let remaining = items.filter((i) => !usedItems.has(i));
|
|
4001
4578
|
if (remaining.length > 0) {
|
|
4002
4579
|
remaining.sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4003
|
-
const
|
|
4004
|
-
|
|
4005
|
-
|
|
4580
|
+
const clusterItems = remaining.map((i) => ({
|
|
4581
|
+
text: i.text,
|
|
4582
|
+
x: i.x,
|
|
4583
|
+
y: i.y,
|
|
4584
|
+
w: i.w,
|
|
4585
|
+
h: i.h,
|
|
4586
|
+
fontSize: i.fontSize,
|
|
4587
|
+
fontName: i.fontName
|
|
4588
|
+
}));
|
|
4589
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4590
|
+
if (clusterResults.length > 0) {
|
|
4591
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4592
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4593
|
+
const usedClusterIndices = /* @__PURE__ */ new Set();
|
|
4594
|
+
for (const cr of clusterResults) {
|
|
4595
|
+
for (const ci of cr.usedItems) {
|
|
4596
|
+
const idx = ciToIdx.get(ci);
|
|
4597
|
+
if (idx !== void 0) usedClusterIndices.add(idx);
|
|
4598
|
+
}
|
|
4599
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4600
|
+
}
|
|
4601
|
+
remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
|
|
4602
|
+
}
|
|
4603
|
+
if (remaining.length > 0) {
|
|
4604
|
+
const allY = remaining.map((i) => i.y);
|
|
4605
|
+
const pageH = safeMax(allY) - safeMin(allY);
|
|
4606
|
+
const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
|
|
4607
|
+
const textBlocks = [];
|
|
4608
|
+
for (const group of groups) {
|
|
4609
|
+
if (group.length === 0) continue;
|
|
4610
|
+
const groupBlocks = extractPageBlocksFallback(group, pageNum);
|
|
4611
|
+
for (const b of groupBlocks) textBlocks.push(b);
|
|
4612
|
+
}
|
|
4613
|
+
const finalTextBlocks = detectListBlocks(textBlocks);
|
|
4614
|
+
for (const b of finalTextBlocks) blocks.push(b);
|
|
4615
|
+
}
|
|
4616
|
+
blocks.sort((a, b) => {
|
|
4006
4617
|
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4007
4618
|
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4008
4619
|
return by - ay;
|
|
4009
4620
|
});
|
|
4010
|
-
return mergeAdjacentTableBlocks(
|
|
4621
|
+
return mergeAdjacentTableBlocks(blocks);
|
|
4011
4622
|
}
|
|
4012
4623
|
return mergeAdjacentTableBlocks(blocks);
|
|
4013
4624
|
}
|
|
@@ -4034,52 +4645,52 @@ function mergeAdjacentTableBlocks(blocks) {
|
|
|
4034
4645
|
function extractPageBlocksFallback(items, pageNum) {
|
|
4035
4646
|
if (items.length === 0) return [];
|
|
4036
4647
|
const blocks = [];
|
|
4037
|
-
const
|
|
4038
|
-
|
|
4039
|
-
|
|
4040
|
-
|
|
4041
|
-
|
|
4042
|
-
|
|
4043
|
-
|
|
4044
|
-
|
|
4045
|
-
|
|
4046
|
-
|
|
4047
|
-
|
|
4048
|
-
|
|
4049
|
-
|
|
4050
|
-
|
|
4051
|
-
|
|
4052
|
-
|
|
4053
|
-
|
|
4054
|
-
|
|
4055
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4056
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4057
|
-
const usedIndices = /* @__PURE__ */ new Set();
|
|
4058
|
-
for (const cr of clusterResults) {
|
|
4059
|
-
for (const ci of cr.usedItems) {
|
|
4060
|
-
const idx = ciToIdx.get(ci);
|
|
4061
|
-
if (idx !== void 0) usedIndices.add(idx);
|
|
4062
|
-
}
|
|
4063
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4648
|
+
const clusterItems = items.map((i) => ({
|
|
4649
|
+
text: i.text,
|
|
4650
|
+
x: i.x,
|
|
4651
|
+
y: i.y,
|
|
4652
|
+
w: i.w,
|
|
4653
|
+
h: i.h,
|
|
4654
|
+
fontSize: i.fontSize,
|
|
4655
|
+
fontName: i.fontName
|
|
4656
|
+
}));
|
|
4657
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4658
|
+
if (clusterResults.length > 0) {
|
|
4659
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4660
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4661
|
+
const usedIndices = /* @__PURE__ */ new Set();
|
|
4662
|
+
for (const cr of clusterResults) {
|
|
4663
|
+
for (const ci of cr.usedItems) {
|
|
4664
|
+
const idx = ciToIdx.get(ci);
|
|
4665
|
+
if (idx !== void 0) usedIndices.add(idx);
|
|
4064
4666
|
}
|
|
4065
|
-
|
|
4066
|
-
|
|
4067
|
-
|
|
4068
|
-
|
|
4069
|
-
|
|
4070
|
-
|
|
4071
|
-
|
|
4072
|
-
|
|
4073
|
-
|
|
4667
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4668
|
+
}
|
|
4669
|
+
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
4670
|
+
if (remaining.length > 0) {
|
|
4671
|
+
const yLines = groupByY(remaining);
|
|
4672
|
+
for (const line of yLines) {
|
|
4673
|
+
const text = mergeLineSimple(line);
|
|
4674
|
+
if (!text.trim()) continue;
|
|
4675
|
+
const bbox = computeBBox(line, pageNum);
|
|
4676
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
|
|
4074
4677
|
}
|
|
4075
|
-
|
|
4076
|
-
|
|
4077
|
-
|
|
4078
|
-
|
|
4079
|
-
|
|
4678
|
+
}
|
|
4679
|
+
blocks.sort((a, b) => {
|
|
4680
|
+
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4681
|
+
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4682
|
+
return by - ay;
|
|
4683
|
+
});
|
|
4684
|
+
} else {
|
|
4685
|
+
const allYLines = groupByY(items);
|
|
4686
|
+
const columns = detectColumns(allYLines);
|
|
4687
|
+
if (columns && columns.length >= 3) {
|
|
4688
|
+
const tableText = extractWithColumns(allYLines, columns);
|
|
4689
|
+
const bbox = computeBBox(items, pageNum);
|
|
4690
|
+
blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
|
|
4080
4691
|
} else {
|
|
4081
4692
|
const allY = items.map((i) => i.y);
|
|
4082
|
-
const pageHeight =
|
|
4693
|
+
const pageHeight = safeMax(allY) - safeMin(allY);
|
|
4083
4694
|
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
4084
4695
|
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
4085
4696
|
for (const group of orderedGroups) {
|
|
@@ -4132,22 +4743,76 @@ function dominantStyle(items) {
|
|
|
4132
4743
|
return { fontSize: dominantSize, fontName };
|
|
4133
4744
|
}
|
|
4134
4745
|
/**
 * Normalizes raw text items into layout items, drops near-duplicate draws and
 * marks items that are preceded by a whitespace-only item on the same line.
 *
 * Each raw entry is read through `str`, `transform[0|3|4|5]`, `width`,
 * `height` and `fontName` (presumably pdf.js text-content items — TODO confirm
 * against the caller). Entries whose `str` is not a string are ignored.
 *
 * @param {Iterable<object>} rawItems Raw text items.
 * @returns {Array<object>} Items shaped as
 *   `{ text, x, y, w, h, fontSize, fontName, isHidden, hasSpaceBefore? }`,
 *   sorted top-to-bottom (descending y) and then left-to-right.
 */
function normalizeItems(rawItems) {
  const items = [];
  const spacePositions = [];
  for (const raw of rawItems) {
    if (typeof raw.str !== "string") continue;
    const x = Math.round(raw.transform[4]);
    const y = Math.round(raw.transform[5]);
    let text = raw.str.trim();
    if (!text) {
      // Whitespace-only items carry no text but record where a space was drawn.
      spacePositions.push({ x, y });
      continue;
    }
    const scaleY = Math.abs(raw.transform[3]);
    const scaleX = Math.abs(raw.transform[0]);
    const fontSize = Math.round(Math.max(scaleY, scaleX));
    const w = Math.round(raw.width);
    const h = Math.round(raw.height);
    // `text` is known to be non-empty here, so a zero width alone marks the item hidden.
    const isHidden = fontSize === 0 || raw.width === 0;
    // Runs made only of digits, whitespace and number punctuation have their
    // spaces removed (e.g. "02 - 123" -> "02-123").
    // NOTE(review): this also fuses space-separated numbers such as "10 20"
    // inside a single item — confirm that is acceptable.
    if (/^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text)) {
      text = text.replace(/ /g, "");
    }
    const fontName = raw.fontName || "";
    const split = splitEvenSpacedItem(text, x, w, fontSize);
    if (split) {
      for (const part of split) {
        items.push({ text: part.text, x: part.x, y, w: part.w, h, fontSize, fontName, isHidden });
      }
    } else {
      items.push({ text, x, y, w, h, fontSize, fontName, isHidden });
    }
  }

  const sorted = items.sort((a, b) => b.y - a.y || a.x - b.x);

  // Drop items repeating the same text within 3 units in both axes. The
  // backward scan stops once earlier items are more than 3 units above.
  const deduped = [];
  for (const candidate of sorted) {
    let isDup = false;
    for (let j = deduped.length - 1; j >= 0; j--) {
      const prev = deduped[j];
      if (prev.y - candidate.y > 3) break;
      if (Math.abs(prev.y - candidate.y) <= 3 && prev.text === candidate.text && Math.abs(prev.x - candidate.x) <= 3) {
        isDup = true;
        break;
      }
    }
    if (!isDup) deduped.push(candidate);
  }

  // An item has a space before it when a whitespace-only item sits on the
  // same line (|dy| <= 3) at most 20 units to its left.
  if (spacePositions.length > 0) {
    for (const item of deduped) {
      const preceded = spacePositions.some((sp) => {
        if (Math.abs(sp.y - item.y) > 3) return false;
        const dist = item.x - sp.x;
        return dist >= 0 && dist <= 20;
      });
      if (preceded) item.hasSpaceBefore = true;
    }
  }
  return deduped;
}
|
|
4804
|
+
/**
 * Splits a single text item made of space-separated Hangul syllables or
 * digits (e.g. "가 나 다") into one positioned item per character.
 *
 * @param {string} text Trimmed item text.
 * @param {number} itemX Left x coordinate of the item.
 * @param {number} itemW Width of the whole item.
 * @param {number} fontSize Font size of the item.
 * @returns {Array<{text: string, x: number, w: number}>|null} Per-character
 *   pieces, or `null` when the text does not match the pattern or the
 *   per-character pitch exceeds twice the font size.
 */
function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
  // The pattern requires at least three single-character tokens, so no
  // separate length check is needed after splitting.
  if (!/^[가-힣\d](?: [가-힣\d]){2,}$/.test(text)) return null;
  const chars = text.split(" ");
  const charW = itemW / chars.length;
  if (charW > fontSize * 2) return null;
  return chars.map((ch, idx) => ({
    text: ch,
    x: Math.round(itemX + idx * charW),
    // The actual glyph is narrower than the spacing pitch.
    w: Math.round(charW * 0.8)
  }));
}
|
|
4152
4817
|
function groupByY(items) {
|
|
4153
4818
|
if (items.length === 0) return [];
|
|
@@ -4172,14 +4837,14 @@ function isProseSpread(items) {
|
|
|
4172
4837
|
for (let i = 1; i < sorted.length; i++) {
|
|
4173
4838
|
gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
|
|
4174
4839
|
}
|
|
4175
|
-
const maxGap =
|
|
4840
|
+
const maxGap = safeMax(gaps);
|
|
4176
4841
|
const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
|
|
4177
4842
|
return maxGap < 40 && avgLen < 5;
|
|
4178
4843
|
}
|
|
4179
4844
|
function detectColumns(yLines) {
|
|
4180
4845
|
const allItems = yLines.flat();
|
|
4181
4846
|
if (allItems.length === 0) return null;
|
|
4182
|
-
const pageWidth =
|
|
4847
|
+
const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
|
|
4183
4848
|
if (pageWidth < 100) return null;
|
|
4184
4849
|
let bigoLineIdx = -1;
|
|
4185
4850
|
for (let i = 0; i < yLines.length; i++) {
|
|
@@ -4211,7 +4876,7 @@ function detectColumns(yLines) {
|
|
|
4211
4876
|
}
|
|
4212
4877
|
const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
|
|
4213
4878
|
if (peaks.length < 3) return null;
|
|
4214
|
-
const MERGE_TOL =
|
|
4879
|
+
const MERGE_TOL = 40;
|
|
4215
4880
|
const merged = [peaks[0]];
|
|
4216
4881
|
for (let i = 1; i < peaks.length; i++) {
|
|
4217
4882
|
const prev = merged[merged.length - 1];
|
|
@@ -4225,7 +4890,14 @@ function detectColumns(yLines) {
|
|
|
4225
4890
|
merged.push({ ...peaks[i] });
|
|
4226
4891
|
}
|
|
4227
4892
|
}
|
|
4228
|
-
const
|
|
4893
|
+
const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
|
|
4894
|
+
if (rawColumns.length < 3) return null;
|
|
4895
|
+
const MIN_DETECT_COL_WIDTH = 30;
|
|
4896
|
+
const columns = [rawColumns[0]];
|
|
4897
|
+
for (let i = 1; i < rawColumns.length; i++) {
|
|
4898
|
+
if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
|
|
4899
|
+
columns.push(rawColumns[i]);
|
|
4900
|
+
}
|
|
4229
4901
|
return columns.length >= 3 ? columns : null;
|
|
4230
4902
|
}
|
|
4231
4903
|
function findColumn(x, columns) {
|
|
@@ -4353,6 +5025,16 @@ function buildGridTable(lines, columns) {
|
|
|
4353
5025
|
}
|
|
4354
5026
|
merged.splice(0, headerEnd, headerRow);
|
|
4355
5027
|
}
|
|
5028
|
+
for (const row of merged) {
|
|
5029
|
+
for (let c = 0; c < row.length; c++) {
|
|
5030
|
+
if (row[c]) row[c] = collapseEvenSpacing(row[c]);
|
|
5031
|
+
}
|
|
5032
|
+
}
|
|
5033
|
+
const totalCells = merged.length * numCols;
|
|
5034
|
+
const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
|
|
5035
|
+
if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
|
|
5036
|
+
return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
|
|
5037
|
+
}
|
|
4356
5038
|
const md = [];
|
|
4357
5039
|
md.push("| " + merged[0].join(" | ") + " |");
|
|
4358
5040
|
md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
|
|
@@ -4364,12 +5046,32 @@ function buildGridTable(lines, columns) {
|
|
|
4364
5046
|
function mergeLineSimple(items) {
|
|
4365
5047
|
if (items.length <= 1) return items[0]?.text || "";
|
|
4366
5048
|
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
5049
|
+
const isEvenSpaced = detectEvenSpacedItems(sorted);
|
|
4367
5050
|
let result = sorted[0].text;
|
|
4368
5051
|
for (let i = 1; i < sorted.length; i++) {
|
|
4369
5052
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
4370
5053
|
const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
|
|
4371
|
-
|
|
4372
|
-
|
|
5054
|
+
const tabThreshold = Math.max(avgFs * 2, 30);
|
|
5055
|
+
if (gap > tabThreshold) {
|
|
5056
|
+
result += " ";
|
|
5057
|
+
result += sorted[i].text;
|
|
5058
|
+
continue;
|
|
5059
|
+
}
|
|
5060
|
+
if (isEvenSpaced[i]) {
|
|
5061
|
+
result += sorted[i].text;
|
|
5062
|
+
continue;
|
|
5063
|
+
}
|
|
5064
|
+
if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
|
|
5065
|
+
result += " ";
|
|
5066
|
+
result += sorted[i].text;
|
|
5067
|
+
continue;
|
|
5068
|
+
}
|
|
5069
|
+
if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
|
|
5070
|
+
result += " ";
|
|
5071
|
+
result += sorted[i].text;
|
|
5072
|
+
continue;
|
|
5073
|
+
}
|
|
5074
|
+
if (gap < avgFs * 0.15) {
|
|
4373
5075
|
} else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
|
|
4374
5076
|
} else if (gap > 3) result += " ";
|
|
4375
5077
|
result += sorted[i].text;
|
|
@@ -4378,8 +5080,8 @@ function mergeLineSimple(items) {
|
|
|
4378
5080
|
}
|
|
4379
5081
|
/**
 * Post-processes extracted PDF text: strips page-number artefacts, merges
 * wrapped lines, collapses evenly spaced characters and normalizes blank lines.
 *
 * @param {string} text Extracted text/markdown.
 * @returns {string} Cleaned, trimmed text.
 */
function cleanPdfText(text) {
  // Stage 1: remove things that look like page numbers — a leading bare
  // number, "- 3 -" style lines, "3 / 10" style lines, bare 1-4 digit lines,
  // a trailing bare number and headings that contain only a number.
  const withoutPageNumbers = text
    .replace(/^\d{1,4}\n/, "")
    .replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "")
    .replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "")
    .replace(/\n\d{1,4}\n/g, "\n")
    .replace(/\n\d{1,4}$/, "")
    .replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "");

  // Stage 2: join lines that were wrapped mid-sentence.
  const mergedLines = mergeKoreanLines(withoutPageNumbers);

  // Stage 3: collapse even spacing on every line except table separator rows.
  const collapsed = mergedLines.replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line));

  // Stage 4: after a bullet glyph, keep one space and rejoin the first two
  // Hangul syllables that were split apart.
  const bulletsFixed = collapsed.replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3");

  // Stage 5: limit runs of blank lines to one and trim the result.
  return bulletsFixed.replace(/\n{3,}/g, "\n\n").trim();
}
|
|
4384
5086
|
function startsWithMarker(line) {
|
|
4385
5087
|
const t = line.trimStart();
|
|
@@ -4571,7 +5273,7 @@ function mergeKoreanLines(text) {
|
|
|
4571
5273
|
result[result.length - 1] = prev + " " + currTrimmed;
|
|
4572
5274
|
continue;
|
|
4573
5275
|
}
|
|
4574
|
-
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
|
|
5276
|
+
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev) && !startsWithMarker(prev)) {
|
|
4575
5277
|
result[result.length - 1] = prev + " " + curr;
|
|
4576
5278
|
} else {
|
|
4577
5279
|
result.push(curr);
|
|
@@ -4619,7 +5321,7 @@ function getTextContent(el) {
|
|
|
4619
5321
|
return el.textContent?.trim() ?? "";
|
|
4620
5322
|
}
|
|
4621
5323
|
/**
 * Parses an XML string into a DOM document after removing any DTD from it.
 *
 * @param {string} text Raw XML text.
 * @returns {Document} Document produced by the xmldom `DOMParser`.
 */
function parseXml(text) {
  const parser = new import_xmldom2.DOMParser();
  // The DTD is stripped before parsing (see stripDtd).
  const withoutDtd = stripDtd(text);
  return parser.parseFromString(withoutDtd, "text/xml");
}
|
|
4624
5326
|
function parseSharedStrings(xml) {
|
|
4625
5327
|
const doc = parseXml(xml);
|
|
@@ -4906,7 +5608,7 @@ function getAttr(el, localName) {
|
|
|
4906
5608
|
return null;
|
|
4907
5609
|
}
|
|
4908
5610
|
/**
 * Parses an XML string into a DOM document after removing any DTD from it.
 *
 * @param {string} text Raw XML text.
 * @returns {Document} Document produced by the xmldom `DOMParser`.
 */
function parseXml2(text) {
  const parser = new import_xmldom3.DOMParser();
  // The DTD is stripped before parsing (see stripDtd).
  const withoutDtd = stripDtd(text);
  return parser.parseFromString(withoutDtd, "text/xml");
}
|
|
4911
5613
|
function parseStyles(xml) {
|
|
4912
5614
|
const doc = parseXml2(xml);
|
|
@@ -5306,7 +6008,13 @@ function normalize(s) {
|
|
|
5306
6008
|
}
|
|
5307
6009
|
var MAX_LEVENSHTEIN_LEN = 1e4;
|
|
5308
6010
|
function levenshtein(a, b) {
|
|
5309
|
-
if (a.length + b.length > MAX_LEVENSHTEIN_LEN)
|
|
6011
|
+
if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
|
|
6012
|
+
const sampleLen = Math.min(500, a.length, b.length);
|
|
6013
|
+
let diffs = 0;
|
|
6014
|
+
for (let i = 0; i < sampleLen; i++) if (a[i] !== b[i]) diffs++;
|
|
6015
|
+
const sampleRate = sampleLen > 0 ? diffs / sampleLen : 1;
|
|
6016
|
+
return Math.abs(a.length - b.length) + Math.round(Math.min(a.length, b.length) * sampleRate);
|
|
6017
|
+
}
|
|
5310
6018
|
if (a.length > b.length) [a, b] = [b, a];
|
|
5311
6019
|
const m = a.length;
|
|
5312
6020
|
const n = b.length;
|
|
@@ -5589,13 +6297,20 @@ function extractInlineFields(text) {
|
|
|
5589
6297
|
|
|
5590
6298
|
// src/hwpx/generator.ts
|
|
5591
6299
|
var import_jszip5 = __toESM(require("jszip"), 1);
|
|
5592
|
-
var
|
|
6300
|
+
// XML namespace URIs used by the HWPX generator below.

// Section documents (`hs:` prefix).
var NS_SECTION = "http://www.hancom.co.kr/hwpml/2011/section";
// Paragraph-level elements (`hp:` prefix).
var NS_PARA = "http://www.hancom.co.kr/hwpml/2011/paragraph";
// Header document (`hh:` prefix).
var NS_HEAD = "http://www.hancom.co.kr/hwpml/2011/head";
// Package manifest (`opf:` prefix). The trailing slash is part of the URI as written here.
var NS_OPF = "http://www.idpf.org/2007/opf/";
// Hancom package schema (`hpf:` prefix).
var NS_HPF = "http://www.hancom.co.kr/schema/2011/hpf";
// Container descriptor (`ocf:` prefix).
var NS_OCF = "urn:oasis:names:tc:opendocument:xmlns:container";
|
|
5593
6306
|
/**
 * Converts Markdown into an HWPX package (a ZIP archive).
 *
 * The archive receives, in this order: `mimetype` (stored uncompressed),
 * `META-INF/container.xml`, `Contents/content.hpf`, `Contents/header.xml`
 * and `Contents/section0.xml`.
 *
 * @param {string} markdown Markdown source.
 * @returns {Promise<ArrayBuffer>} The generated archive.
 */
async function markdownToHwpx(markdown) {
  const sectionXml = blocksToSectionXml(parseMarkdownToBlocks(markdown));
  const archive = new import_jszip5.default();

  // The mimetype entry is added first and without compression.
  archive.file("mimetype", "application/hwp+zip", { compression: "STORE" });

  // Remaining parts; each producer runs right before its entry is added.
  const parts = [
    ["META-INF/container.xml", generateContainerXml],
    ["Contents/content.hpf", generateManifest],
    ["Contents/header.xml", generateHeaderXml],
    ["Contents/section0.xml", () => sectionXml],
  ];
  for (const [path, produce] of parts) {
    archive.file(path, produce());
  }

  const buffer = await archive.generateAsync({ type: "arraybuffer" });
  return buffer;
}
|
|
@@ -5640,8 +6355,111 @@ function parseMarkdownToBlocks(md) {
|
|
|
5640
6355
|
/**
 * Escapes the XML markup characters `&`, `<`, `>` and `"` so that arbitrary
 * text can be embedded in element content or double-quoted attribute values.
 *
 * @param {string} text Text to escape.
 * @returns {string} Text with the four characters replaced by entity references.
 */
function escapeXml(text) {
  // `&` must be replaced first; otherwise the ampersands introduced by the
  // later replacements would be escaped a second time.
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
}
|
|
6358
|
+
/**
 * Builds the `META-INF/container.xml` document, which points at
 * `Contents/content.hpf` as the package root file.
 *
 * @returns {string} XML text, one element per line, joined with "\n".
 */
function generateContainerXml() {
  const lines = [
    '<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>',
    `<ocf:container xmlns:ocf="${NS_OCF}" xmlns:hpf="${NS_HPF}">`,
    "<ocf:rootfiles>",
    '<ocf:rootfile full-path="Contents/content.hpf" media-type="application/hwpml-package+xml"/>',
    "</ocf:rootfiles>",
    "</ocf:container>",
  ];
  return lines.join("\n");
}
|
|
6366
|
+
/**
 * Builds the `Contents/content.hpf` package manifest listing the header and
 * the single section, with the section as the only linear spine item.
 *
 * @returns {string} XML text, one element per line, joined with "\n".
 */
function generateManifest() {
  const manifestItems = ["header", "section0"].map(
    (id) => `<opf:item id="${id}" href="Contents/${id}.xml" media-type="application/xml"/>`
  );
  const lines = [
    '<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>',
    `<opf:package xmlns:opf="${NS_OPF}" xmlns:hpf="${NS_HPF}" xmlns:hh="${NS_HEAD}">`,
    "<opf:manifest>",
    ...manifestItems,
    "</opf:manifest>",
    "<opf:spine>",
    '<opf:itemref idref="header" linear="no"/>',
    '<opf:itemref idref="section0" linear="yes"/>',
    "</opf:spine>",
    "</opf:package>",
  ];
  return lines.join("\n");
}
|
|
6379
|
+
/**
 * Builds the `Contents/header.xml` document: one font face per language
 * group, and a single border fill, character property, paragraph property
 * and style, each with id 0.
 *
 * @returns {string} XML text, one element per line, joined with "\n".
 */
function generateHeaderXml() {
  // Font face names, written as escapes exactly as the markup needs them.
  const hamchorom = "\uD568\uCD08\uB86C\uBC14\uD0D5";
  const gulim = "\uAD74\uB9BC";

  // typeInfo attribute sets; the gothic variants differ only in `proportion`.
  const gothic = (proportion) =>
    `familyType="FCAT_GOTHIC" weight="6" proportion="${proportion}" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"`;
  const oldStyle =
    'familyType="FCAT_OLDSTYLE" weight="5" proportion="4" contrast="2" strokeVariation="0" armStyle="0" letterform="0" midline="0" xHeight="4"';

  // [language group, face name, typeInfo attributes] in output order.
  const fontFaces = [
    ["HANGUL", hamchorom, gothic(4)],
    ["LATIN", "Times New Roman", oldStyle],
    ["HANJA", hamchorom, gothic(4)],
    ["JAPANESE", gulim, gothic(0)],
    ["OTHER", gulim, gothic(0)],
    ["SYMBOL", "Symbol", gothic(0)],
    ["USER", gulim, gothic(0)],
  ];
  const fontFaceLines = fontFaces.flatMap(([lang, face, typeInfo]) => [
    `<hh:fontface lang="${lang}" fontCnt="1">`,
    `<hh:font id="0" face="${face}" type="TTF" isEmbedded="0">`,
    `<hh:typeInfo ${typeInfo}/>`,
    "</hh:font>",
    "</hh:fontface>",
  ]);

  // The same value repeated for each of the seven language groups.
  const perLanguage = (value) =>
    ["hangul", "latin", "hanja", "japanese", "other", "symbol", "user"]
      .map((lang) => `${lang}="${value}"`)
      .join(" ");

  const borderLines = ["leftBorder", "rightBorder", "topBorder", "bottomBorder", "diagonal"].map(
    (name) => `<hh:${name} type="NONE" width="0.1mm" color="0"/>`
  );

  const lines = [
    '<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>',
    `<hh:head xmlns:hh="${NS_HEAD}" xmlns:hp="${NS_PARA}" version="1.4" secCnt="1">`,
    '<hh:beginNum page="1" footnote="1" endnote="1" pic="1" tbl="1" equation="1"/>',
    "<hh:refList>",
    `<hh:fontfaces itemCnt="${fontFaces.length}">`,
    ...fontFaceLines,
    "</hh:fontfaces>",
    '<hh:borderFills itemCnt="1">',
    '<hh:borderFill id="0" threeD="0" shadow="0" centerLine="0" breakCellSeparateLine="0">',
    '<hh:slash type="NONE" Crooked="0" isCounter="0"/>',
    '<hh:backSlash type="NONE" Crooked="0" isCounter="0"/>',
    ...borderLines,
    "<hh:fillInfo/>",
    "</hh:borderFill>",
    "</hh:borderFills>",
    '<hh:charProperties itemCnt="1">',
    '<hh:charPr id="0" height="1000" textColor="0" shadeColor="-1" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="0">',
    `<hh:fontRef ${perLanguage("0")}/>`,
    `<hh:ratio ${perLanguage("100")}/>`,
    `<hh:spacing ${perLanguage("0")}/>`,
    `<hh:relSz ${perLanguage("100")}/>`,
    `<hh:offset ${perLanguage("0")}/>`,
    "</hh:charPr>",
    "</hh:charProperties>",
    '<hh:tabProperties itemCnt="0"/>',
    '<hh:numberings itemCnt="0"/>',
    '<hh:bullets itemCnt="0"/>',
    '<hh:paraProperties itemCnt="1">',
    // NOTE(review): tabIDRef="0" is emitted although tabProperties is empty — confirm readers accept this.
    '<hh:paraPr id="0" tabIDRef="0" condense="0" fontLineHeight="0" snapToGrid="0" suppressOverlap="0" checked="0">',
    '<hh:parLineBreak lineBreak="BREAK_LINE" wordBreak="BREAK_WORD" breakLatinWord="BREAK_WORD" breakNonLatinWord="BREAK_WORD"/>',
    '<hh:parMargin left="0" right="0" prev="0" next="0" indent="0"/>',
    '<hh:parBorder borderFillIDRef="0" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>',
    '<hh:parShade borderFillIDRef="0"/>',
    "<hh:parTabList/>",
    "</hh:paraPr>",
    "</hh:paraProperties>",
    '<hh:styles itemCnt="1">',
    '<hh:style id="0" type="PARA" name="\uBC14\uD0D5\uAE00" engName="Normal" paraPrIDRef="0" charPrIDRef="0" nextStyleIDRef="0" langIDRef="1042" lockForm="0"/>',
    "</hh:styles>",
    "</hh:refList>",
    '<hh:compatibleDocument targetProgram="HWP2018"/>',
    "</hh:head>",
  ];
  return lines.join("\n");
}
|
|
5643
6461
|
/**
 * Wraps text in a paragraph element with a single run, referencing paragraph
 * property, style and character property id 0.
 *
 * @param {string} text Paragraph text; it is XML-escaped before embedding.
 * @returns {string} The `<hp:p>` element markup.
 */
function generateParagraph(text) {
  const escaped = escapeXml(text);
  const run = `<hp:run charPrIDRef="0"><hp:t>${escaped}</hp:t></hp:run>`;
  return `<hp:p paraPrIDRef="0" styleIDRef="0">${run}</hp:p>`;
}
|
|
5646
6464
|
function generateTable(rows) {
|
|
5647
6465
|
const trElements = rows.map((row) => {
|
|
@@ -5665,22 +6483,11 @@ function blocksToSectionXml(blocks) {
|
|
|
5665
6483
|
return "";
|
|
5666
6484
|
}
|
|
5667
6485
|
}).join("\n ");
|
|
5668
|
-
return `<?xml version="1.0" encoding="UTF-8"?>
|
|
5669
|
-
<hs:sec xmlns:hs="${
|
|
6486
|
+
return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
|
6487
|
+
<hs:sec xmlns:hs="${NS_SECTION}" xmlns:hp="${NS_PARA}">
|
|
5670
6488
|
${body}
|
|
5671
6489
|
</hs:sec>`;
|
|
5672
6490
|
}
|
|
5673
|
-
function generateManifest() {
|
|
5674
|
-
return `<?xml version="1.0" encoding="UTF-8"?>
|
|
5675
|
-
<opf:package xmlns:opf="http://www.idpf.org/2007/opf">
|
|
5676
|
-
<opf:manifest>
|
|
5677
|
-
<opf:item id="s0" href="section0.xml" media-type="application/xml"/>
|
|
5678
|
-
</opf:manifest>
|
|
5679
|
-
<opf:spine>
|
|
5680
|
-
<opf:itemref idref="s0"/>
|
|
5681
|
-
</opf:spine>
|
|
5682
|
-
</opf:package>`;
|
|
5683
|
-
}
|
|
5684
6491
|
|
|
5685
6492
|
// src/index.ts
|
|
5686
6493
|
async function parse(input, options) {
|