kordoc 2.0.3 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +302 -291
- package/dist/chunk-5Y2Q3BRW.js +52 -0
- package/dist/chunk-5Y2Q3BRW.js.map +1 -0
- package/dist/{chunk-4UH6ABAY.js → chunk-LYFG7AUT.js} +971 -223
- package/dist/chunk-LYFG7AUT.js.map +1 -0
- package/dist/{chunk-3TBUDJDE.js → chunk-MOL7MDBG.js} +1 -1
- package/dist/chunk-MOL7MDBG.js.map +1 -0
- package/dist/cli.js +13 -9
- package/dist/cli.js.map +1 -1
- package/dist/detect-GYK3HKD5.js +18 -0
- package/dist/index.cjs +996 -189
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +996 -189
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +41 -12
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-737B4EZW.js +8 -0
- package/dist/{provider-EU3CG724.js → provider-7H4CPZYS.js} +2 -1
- package/dist/provider-7H4CPZYS.js.map +1 -0
- package/dist/{watch-QD3PDNXQ.js → watch-Q5OXA73S.js} +38 -18
- package/dist/watch-Q5OXA73S.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-25TXW6EP.js +0 -93
- package/dist/chunk-25TXW6EP.js.map +0 -1
- package/dist/chunk-3TBUDJDE.js.map +0 -1
- package/dist/chunk-4UH6ABAY.js.map +0 -1
- package/dist/page-range-OF5I4PQY.js +0 -8
- package/dist/provider-EU3CG724.js.map +0 -1
- package/dist/utils-BTZ4WSYX.js +0 -22
- package/dist/watch-QD3PDNXQ.js.map +0 -1
- /package/dist/{page-range-OF5I4PQY.js.map → detect-GYK3HKD5.js.map} +0 -0
- /package/dist/{utils-BTZ4WSYX.js.map → page-range-737B4EZW.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -63,6 +63,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
|
|
|
63
63
|
blocks.push({ type: "paragraph", text: text.trim(), pageNumber: i });
|
|
64
64
|
}
|
|
65
65
|
} catch {
|
|
66
|
+
blocks.push({ type: "paragraph", text: `[OCR \uC2E4\uD328: \uD398\uC774\uC9C0 ${i}]` });
|
|
66
67
|
}
|
|
67
68
|
}
|
|
68
69
|
return blocks;
|
|
@@ -138,7 +139,7 @@ import { inflateRawSync } from "zlib";
|
|
|
138
139
|
import { DOMParser } from "@xmldom/xmldom";
|
|
139
140
|
|
|
140
141
|
// src/utils.ts
|
|
141
|
-
var VERSION = true ? "2.0
|
|
142
|
+
var VERSION = true ? "2.2.0" : "0.0.0-dev";
|
|
142
143
|
function toArrayBuffer(buf) {
|
|
143
144
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
144
145
|
return buf.buffer;
|
|
@@ -154,7 +155,8 @@ var KordocError = class extends Error {
|
|
|
154
155
|
function isPathTraversal(name) {
|
|
155
156
|
if (name.includes("\0")) return true;
|
|
156
157
|
const normalized = name.replace(/\\/g, "/");
|
|
157
|
-
|
|
158
|
+
const segments = normalized.split("/");
|
|
159
|
+
return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
158
160
|
}
|
|
159
161
|
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
160
162
|
try {
|
|
@@ -194,12 +196,25 @@ function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEnt
|
|
|
194
196
|
return { totalUncompressed: 0, entryCount: 0 };
|
|
195
197
|
}
|
|
196
198
|
}
|
|
199
|
+
function stripDtd(xml) {
|
|
200
|
+
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
201
|
+
}
|
|
197
202
|
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
198
203
|
function sanitizeHref(href) {
|
|
199
204
|
const trimmed = href.trim();
|
|
200
205
|
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
201
206
|
return trimmed;
|
|
202
207
|
}
|
|
208
|
+
function safeMin(arr) {
|
|
209
|
+
let min = Infinity;
|
|
210
|
+
for (let i = 0; i < arr.length; i++) if (arr[i] < min) min = arr[i];
|
|
211
|
+
return min;
|
|
212
|
+
}
|
|
213
|
+
function safeMax(arr) {
|
|
214
|
+
let max = -Infinity;
|
|
215
|
+
for (let i = 0; i < arr.length; i++) if (arr[i] > max) max = arr[i];
|
|
216
|
+
return max;
|
|
217
|
+
}
|
|
203
218
|
function classifyError(err) {
|
|
204
219
|
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
205
220
|
const msg = err.message;
|
|
@@ -274,6 +289,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
274
289
|
if (end > maxCols) maxCols = end;
|
|
275
290
|
}
|
|
276
291
|
}
|
|
292
|
+
if (maxCols > MAX_COLS) maxCols = MAX_COLS;
|
|
277
293
|
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
278
294
|
const grid = Array.from(
|
|
279
295
|
{ length: numRows },
|
|
@@ -283,7 +299,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
283
299
|
for (const cell of row) {
|
|
284
300
|
const r = cell.rowAddr ?? 0;
|
|
285
301
|
const c = cell.colAddr ?? 0;
|
|
286
|
-
if (r >= numRows || c >= maxCols) continue;
|
|
302
|
+
if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
|
|
287
303
|
grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
|
|
288
304
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
289
305
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
@@ -563,7 +579,12 @@ function parseCharProperties(doc, map) {
|
|
|
563
579
|
if (!id) continue;
|
|
564
580
|
const prop = {};
|
|
565
581
|
const height = el.getAttribute("height");
|
|
566
|
-
if (height)
|
|
582
|
+
if (height) {
|
|
583
|
+
const parsedHeight = parseInt(height, 10);
|
|
584
|
+
if (!isNaN(parsedHeight) && parsedHeight > 0) {
|
|
585
|
+
prop.fontSize = parsedHeight / 100;
|
|
586
|
+
}
|
|
587
|
+
}
|
|
567
588
|
const bold = el.getAttribute("bold");
|
|
568
589
|
if (bold === "true" || bold === "1") prop.bold = true;
|
|
569
590
|
const italic = el.getAttribute("italic");
|
|
@@ -598,9 +619,6 @@ function parseStyleElements(doc, map) {
|
|
|
598
619
|
}
|
|
599
620
|
}
|
|
600
621
|
}
|
|
601
|
-
function stripDtd(xml) {
|
|
602
|
-
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
603
|
-
}
|
|
604
622
|
async function parseHwpxDocument(buffer, options) {
|
|
605
623
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
606
624
|
let zip;
|
|
@@ -703,7 +721,7 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
703
721
|
const data = await file.async("uint8array");
|
|
704
722
|
decompressed.total += data.length;
|
|
705
723
|
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
706
|
-
const ext = ref.includes(".") ? ref.split(".").pop() : "png";
|
|
724
|
+
const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
|
|
707
725
|
const mimeType = imageExtToMime(ext);
|
|
708
726
|
imageIndex++;
|
|
709
727
|
const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
|
|
@@ -950,7 +968,8 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
950
968
|
if (newTable.rows.length > 0) {
|
|
951
969
|
if (tableStack.length > 0) {
|
|
952
970
|
const parentTable = tableStack.pop();
|
|
953
|
-
|
|
971
|
+
let nestedCols = 0;
|
|
972
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
954
973
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
955
974
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
956
975
|
} else {
|
|
@@ -997,8 +1016,10 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
997
1016
|
break;
|
|
998
1017
|
case "cellSpan":
|
|
999
1018
|
if (tableCtx?.cell) {
|
|
1000
|
-
const
|
|
1001
|
-
const
|
|
1019
|
+
const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
|
|
1020
|
+
const cs = isNaN(rawCs) ? 1 : rawCs;
|
|
1021
|
+
const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
|
|
1022
|
+
const rs = isNaN(rawRs) ? 1 : rawRs;
|
|
1002
1023
|
tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
|
|
1003
1024
|
tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
|
|
1004
1025
|
}
|
|
@@ -1057,7 +1078,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1057
1078
|
if (newTable.rows.length > 0) {
|
|
1058
1079
|
if (tableStack.length > 0) {
|
|
1059
1080
|
const parentTable = tableStack.pop();
|
|
1060
|
-
|
|
1081
|
+
let nestedCols = 0;
|
|
1082
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
1061
1083
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
1062
1084
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
1063
1085
|
} else {
|
|
@@ -1090,6 +1112,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1090
1112
|
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
1091
1113
|
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
1092
1114
|
walkChildren(el, d + 1);
|
|
1115
|
+
} else if (localTag === "run") {
|
|
1116
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
|
|
1093
1117
|
}
|
|
1094
1118
|
}
|
|
1095
1119
|
};
|
|
@@ -2153,6 +2177,7 @@ function parseLenientCfb(data) {
|
|
|
2153
2177
|
if (miniSectorSizeShift > 16) throw new Error("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uBBF8\uB2C8 \uC139\uD130 \uD06C\uAE30 \uC2DC\uD504\uD2B8: " + miniSectorSizeShift);
|
|
2154
2178
|
const miniSectorSize = 1 << miniSectorSizeShift;
|
|
2155
2179
|
const fatSectorCount = data.readUInt32LE(44);
|
|
2180
|
+
if (fatSectorCount > 1e4) throw new Error("FAT \uC139\uD130 \uC218\uAC00 \uB108\uBB34 \uB9CE\uC2B5\uB2C8\uB2E4: " + fatSectorCount);
|
|
2156
2181
|
const firstDirSector = data.readUInt32LE(48);
|
|
2157
2182
|
const miniStreamCutoff = data.readUInt32LE(56);
|
|
2158
2183
|
const firstMiniFatSector = data.readUInt32LE(60);
|
|
@@ -2528,10 +2553,14 @@ function findSections(cfb) {
|
|
|
2528
2553
|
}
|
|
2529
2554
|
function findSectionsLenient(lcfb, compressed) {
|
|
2530
2555
|
const sections = [];
|
|
2556
|
+
let totalDecompressed = 0;
|
|
2531
2557
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2532
2558
|
const raw = lcfb.findStream(`/BodyText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2533
2559
|
if (!raw) break;
|
|
2534
|
-
|
|
2560
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2561
|
+
totalDecompressed += content.length;
|
|
2562
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2563
|
+
sections.push({ idx: i, content });
|
|
2535
2564
|
}
|
|
2536
2565
|
if (sections.length === 0) {
|
|
2537
2566
|
for (const e of lcfb.entries()) {
|
|
@@ -2539,7 +2568,12 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2539
2568
|
if (e.name.startsWith("Section")) {
|
|
2540
2569
|
const idx = parseInt(e.name.replace("Section", ""), 10) || 0;
|
|
2541
2570
|
const raw = lcfb.findStream(e.name);
|
|
2542
|
-
if (raw)
|
|
2571
|
+
if (raw) {
|
|
2572
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2573
|
+
totalDecompressed += content.length;
|
|
2574
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2575
|
+
sections.push({ idx, content });
|
|
2576
|
+
}
|
|
2543
2577
|
}
|
|
2544
2578
|
}
|
|
2545
2579
|
}
|
|
@@ -2547,11 +2581,15 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2547
2581
|
}
|
|
2548
2582
|
function findViewTextSectionsLenient(lcfb, compressed) {
|
|
2549
2583
|
const sections = [];
|
|
2584
|
+
let totalDecompressed = 0;
|
|
2550
2585
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2551
2586
|
const raw = lcfb.findStream(`/ViewText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2552
2587
|
if (!raw) break;
|
|
2553
2588
|
try {
|
|
2554
|
-
|
|
2589
|
+
const content = decryptViewText(raw, compressed);
|
|
2590
|
+
totalDecompressed += content.length;
|
|
2591
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2592
|
+
sections.push({ idx: i, content });
|
|
2555
2593
|
} catch {
|
|
2556
2594
|
break;
|
|
2557
2595
|
}
|
|
@@ -2953,10 +2991,14 @@ init_page_range();
|
|
|
2953
2991
|
// src/pdf/line-detector.ts
|
|
2954
2992
|
import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
2955
2993
|
var ORIENTATION_TOL = 2;
|
|
2956
|
-
var MIN_LINE_LENGTH =
|
|
2957
|
-
var
|
|
2994
|
+
var MIN_LINE_LENGTH = 15;
|
|
2995
|
+
var MAX_LINE_WIDTH = 5;
|
|
2958
2996
|
var CONNECT_TOL = 5;
|
|
2959
2997
|
var CELL_PADDING = 2;
|
|
2998
|
+
var MIN_COL_WIDTH = 15;
|
|
2999
|
+
var MIN_ROW_HEIGHT = 6;
|
|
3000
|
+
var VERTEX_MERGE_FACTOR = 4;
|
|
3001
|
+
var MIN_COORD_MERGE_TOL = 8;
|
|
2960
3002
|
function extractLines(fnArray, argsArray) {
|
|
2961
3003
|
const horizontals = [];
|
|
2962
3004
|
const verticals = [];
|
|
@@ -3108,6 +3150,55 @@ function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
|
3108
3150
|
verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
|
|
3109
3151
|
}
|
|
3110
3152
|
}
|
|
3153
|
+
function preprocessLines(horizontals, verticals) {
|
|
3154
|
+
let h = horizontals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3155
|
+
let v = verticals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3156
|
+
h = mergeParallelLines(h, "h");
|
|
3157
|
+
v = mergeParallelLines(v, "v");
|
|
3158
|
+
return { horizontals: h, verticals: v };
|
|
3159
|
+
}
|
|
3160
|
+
function mergeParallelLines(lines, dir) {
|
|
3161
|
+
if (lines.length <= 1) return lines;
|
|
3162
|
+
const sorted = [...lines].sort((a, b) => {
|
|
3163
|
+
const posA = dir === "h" ? a.y1 : a.x1;
|
|
3164
|
+
const posB = dir === "h" ? b.y1 : b.x1;
|
|
3165
|
+
if (Math.abs(posA - posB) > 0.1) return posA - posB;
|
|
3166
|
+
return dir === "h" ? a.x1 - b.x1 : a.y1 - b.y1;
|
|
3167
|
+
});
|
|
3168
|
+
const MERGE_TOL = 3;
|
|
3169
|
+
const result = [sorted[0]];
|
|
3170
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
3171
|
+
const prev = result[result.length - 1];
|
|
3172
|
+
const curr = sorted[i];
|
|
3173
|
+
const prevPos = dir === "h" ? prev.y1 : prev.x1;
|
|
3174
|
+
const currPos = dir === "h" ? curr.y1 : curr.x1;
|
|
3175
|
+
if (Math.abs(prevPos - currPos) <= MERGE_TOL) {
|
|
3176
|
+
const prevStart = dir === "h" ? prev.x1 : prev.y1;
|
|
3177
|
+
const prevEnd = dir === "h" ? prev.x2 : prev.y2;
|
|
3178
|
+
const currStart = dir === "h" ? curr.x1 : curr.y1;
|
|
3179
|
+
const currEnd = dir === "h" ? curr.x2 : curr.y2;
|
|
3180
|
+
const overlap = Math.min(prevEnd, currEnd) - Math.max(prevStart, currStart);
|
|
3181
|
+
const minLen = Math.min(prevEnd - prevStart, currEnd - currStart);
|
|
3182
|
+
if (overlap > minLen * 0.3) {
|
|
3183
|
+
if (dir === "h") {
|
|
3184
|
+
prev.x1 = Math.min(prev.x1, curr.x1);
|
|
3185
|
+
prev.x2 = Math.max(prev.x2, curr.x2);
|
|
3186
|
+
prev.y1 = (prev.y1 + curr.y1) / 2;
|
|
3187
|
+
prev.y2 = prev.y1;
|
|
3188
|
+
} else {
|
|
3189
|
+
prev.y1 = Math.min(prev.y1, curr.y1);
|
|
3190
|
+
prev.y2 = Math.max(prev.y2, curr.y2);
|
|
3191
|
+
prev.x1 = (prev.x1 + curr.x1) / 2;
|
|
3192
|
+
prev.x2 = prev.x1;
|
|
3193
|
+
}
|
|
3194
|
+
prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
|
|
3195
|
+
continue;
|
|
3196
|
+
}
|
|
3197
|
+
}
|
|
3198
|
+
result.push(curr);
|
|
3199
|
+
}
|
|
3200
|
+
return result;
|
|
3201
|
+
}
|
|
3111
3202
|
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
3112
3203
|
const margin = 5;
|
|
3113
3204
|
return {
|
|
@@ -3119,8 +3210,49 @@ function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
|
3119
3210
|
)
|
|
3120
3211
|
};
|
|
3121
3212
|
}
|
|
3213
|
+
function buildVertices(horizontals, verticals) {
|
|
3214
|
+
const vertices = [];
|
|
3215
|
+
const tol = CONNECT_TOL;
|
|
3216
|
+
for (const h of horizontals) {
|
|
3217
|
+
for (const v of verticals) {
|
|
3218
|
+
if (v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol && h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol) {
|
|
3219
|
+
const radius = Math.max(h.lineWidth, v.lineWidth, 1);
|
|
3220
|
+
vertices.push({ x: v.x1, y: h.y1, radius });
|
|
3221
|
+
}
|
|
3222
|
+
}
|
|
3223
|
+
}
|
|
3224
|
+
return vertices;
|
|
3225
|
+
}
|
|
3226
|
+
function mergeVertices(vertices) {
|
|
3227
|
+
if (vertices.length <= 1) return vertices;
|
|
3228
|
+
const merged = [];
|
|
3229
|
+
const used = new Array(vertices.length).fill(false);
|
|
3230
|
+
for (let i = 0; i < vertices.length; i++) {
|
|
3231
|
+
if (used[i]) continue;
|
|
3232
|
+
let sumX = vertices[i].x, sumY = vertices[i].y;
|
|
3233
|
+
let maxRadius = vertices[i].radius;
|
|
3234
|
+
let count = 1;
|
|
3235
|
+
for (let j = i + 1; j < vertices.length; j++) {
|
|
3236
|
+
if (used[j]) continue;
|
|
3237
|
+
const mergeTol = VERTEX_MERGE_FACTOR * Math.max(maxRadius, vertices[j].radius);
|
|
3238
|
+
if (Math.abs(vertices[i].x - vertices[j].x) <= mergeTol && Math.abs(vertices[i].y - vertices[j].y) <= mergeTol) {
|
|
3239
|
+
sumX += vertices[j].x;
|
|
3240
|
+
sumY += vertices[j].y;
|
|
3241
|
+
maxRadius = Math.max(maxRadius, vertices[j].radius);
|
|
3242
|
+
count++;
|
|
3243
|
+
used[j] = true;
|
|
3244
|
+
}
|
|
3245
|
+
}
|
|
3246
|
+
merged.push({ x: sumX / count, y: sumY / count, radius: maxRadius });
|
|
3247
|
+
}
|
|
3248
|
+
return merged;
|
|
3249
|
+
}
|
|
3122
3250
|
function buildTableGrids(horizontals, verticals) {
|
|
3123
3251
|
if (horizontals.length < 2 || verticals.length < 2) return [];
|
|
3252
|
+
const allVertices = buildVertices(horizontals, verticals);
|
|
3253
|
+
const vertices = mergeVertices(allVertices);
|
|
3254
|
+
if (vertices.length < 4) return [];
|
|
3255
|
+
const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
|
|
3124
3256
|
const allLines = [
|
|
3125
3257
|
...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
|
|
3126
3258
|
...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
|
|
@@ -3131,21 +3263,74 @@ function buildTableGrids(horizontals, verticals) {
|
|
|
3131
3263
|
const hLines = group.filter((l) => l.type === "h");
|
|
3132
3264
|
const vLines = group.filter((l) => l.type === "v");
|
|
3133
3265
|
if (hLines.length < 2 || vLines.length < 2) continue;
|
|
3134
|
-
|
|
3135
|
-
const
|
|
3136
|
-
|
|
3137
|
-
|
|
3266
|
+
let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
|
|
3267
|
+
for (const l of vLines) {
|
|
3268
|
+
if (l.x1 < gx1) gx1 = l.x1;
|
|
3269
|
+
if (l.x1 > gx2) gx2 = l.x1;
|
|
3270
|
+
}
|
|
3271
|
+
for (const l of hLines) {
|
|
3272
|
+
if (l.y1 < gy1) gy1 = l.y1;
|
|
3273
|
+
if (l.y1 > gy2) gy2 = l.y1;
|
|
3274
|
+
}
|
|
3275
|
+
const groupBbox = {
|
|
3276
|
+
x1: gx1 - CONNECT_TOL,
|
|
3277
|
+
y1: gy1 - CONNECT_TOL,
|
|
3278
|
+
x2: gx2 + CONNECT_TOL,
|
|
3279
|
+
y2: gy2 + CONNECT_TOL
|
|
3280
|
+
};
|
|
3281
|
+
const groupVertices = vertices.filter(
|
|
3282
|
+
(v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
|
|
3283
|
+
);
|
|
3284
|
+
const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
|
|
3285
|
+
const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
|
|
3286
|
+
const rawYs = [
|
|
3287
|
+
...hLines.map((l) => l.y1),
|
|
3288
|
+
...groupVertices.map((v) => v.y)
|
|
3289
|
+
];
|
|
3290
|
+
const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
|
|
3291
|
+
const rawXs = [
|
|
3292
|
+
...vLines.map((l) => l.x1),
|
|
3293
|
+
...groupVertices.map((v) => v.x)
|
|
3294
|
+
];
|
|
3295
|
+
const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
|
|
3138
3296
|
if (rowYs.length < 2 || colXs.length < 2) continue;
|
|
3297
|
+
const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
|
|
3298
|
+
const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
|
|
3299
|
+
if (validRowYs.length < 2 || validColXs.length < 2) continue;
|
|
3139
3300
|
const bbox = {
|
|
3140
|
-
x1:
|
|
3141
|
-
y1:
|
|
3142
|
-
x2:
|
|
3143
|
-
y2:
|
|
3301
|
+
x1: validColXs[0],
|
|
3302
|
+
y1: validRowYs[validRowYs.length - 1],
|
|
3303
|
+
x2: validColXs[validColXs.length - 1],
|
|
3304
|
+
y2: validRowYs[0]
|
|
3144
3305
|
};
|
|
3145
|
-
grids.push({ rowYs, colXs, bbox });
|
|
3306
|
+
grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
|
|
3146
3307
|
}
|
|
3147
3308
|
return mergeAdjacentGrids(grids);
|
|
3148
3309
|
}
|
|
3310
|
+
function enforceMinWidth(colXs, minWidth) {
|
|
3311
|
+
if (colXs.length <= 2) return colXs;
|
|
3312
|
+
const result = [colXs[0]];
|
|
3313
|
+
for (let i = 1; i < colXs.length; i++) {
|
|
3314
|
+
const prevX = result[result.length - 1];
|
|
3315
|
+
if (colXs[i] - prevX < minWidth && i < colXs.length - 1) {
|
|
3316
|
+
continue;
|
|
3317
|
+
}
|
|
3318
|
+
result.push(colXs[i]);
|
|
3319
|
+
}
|
|
3320
|
+
return result;
|
|
3321
|
+
}
|
|
3322
|
+
function enforceMinHeight(rowYs, minHeight) {
|
|
3323
|
+
if (rowYs.length <= 2) return rowYs;
|
|
3324
|
+
const result = [rowYs[0]];
|
|
3325
|
+
for (let i = 1; i < rowYs.length; i++) {
|
|
3326
|
+
const prevY = result[result.length - 1];
|
|
3327
|
+
if (prevY - rowYs[i] < minHeight && i < rowYs.length - 1) {
|
|
3328
|
+
continue;
|
|
3329
|
+
}
|
|
3330
|
+
result.push(rowYs[i]);
|
|
3331
|
+
}
|
|
3332
|
+
return result;
|
|
3333
|
+
}
|
|
3149
3334
|
function mergeAdjacentGrids(grids) {
|
|
3150
3335
|
if (grids.length <= 1) return grids;
|
|
3151
3336
|
const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
@@ -3154,9 +3339,10 @@ function mergeAdjacentGrids(grids) {
|
|
|
3154
3339
|
const prev = merged[merged.length - 1];
|
|
3155
3340
|
const curr = sorted[i];
|
|
3156
3341
|
if (prev.colXs.length === curr.colXs.length) {
|
|
3157
|
-
const
|
|
3342
|
+
const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
|
|
3343
|
+
const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
|
|
3158
3344
|
const verticalGap = prev.bbox.y1 - curr.bbox.y2;
|
|
3159
|
-
if (colMatch && verticalGap >= -
|
|
3345
|
+
if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
|
|
3160
3346
|
const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
|
|
3161
3347
|
merged[merged.length - 1] = {
|
|
3162
3348
|
rowYs: allRowYs,
|
|
@@ -3166,7 +3352,8 @@ function mergeAdjacentGrids(grids) {
|
|
|
3166
3352
|
y1: Math.min(prev.bbox.y1, curr.bbox.y1),
|
|
3167
3353
|
x2: Math.max(prev.bbox.x2, curr.bbox.x2),
|
|
3168
3354
|
y2: Math.max(prev.bbox.y2, curr.bbox.y2)
|
|
3169
|
-
}
|
|
3355
|
+
},
|
|
3356
|
+
vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
|
|
3170
3357
|
};
|
|
3171
3358
|
continue;
|
|
3172
3359
|
}
|
|
@@ -3175,14 +3362,14 @@ function mergeAdjacentGrids(grids) {
|
|
|
3175
3362
|
}
|
|
3176
3363
|
return merged;
|
|
3177
3364
|
}
|
|
3178
|
-
function clusterCoordinates(values) {
|
|
3365
|
+
function clusterCoordinates(values, tolerance) {
|
|
3179
3366
|
if (values.length === 0) return [];
|
|
3180
3367
|
const sorted = [...values].sort((a, b) => a - b);
|
|
3181
3368
|
const clusters = [{ sum: sorted[0], count: 1 }];
|
|
3182
3369
|
for (let i = 1; i < sorted.length; i++) {
|
|
3183
3370
|
const last = clusters[clusters.length - 1];
|
|
3184
3371
|
const avg = last.sum / last.count;
|
|
3185
|
-
if (Math.abs(sorted[i] - avg) <=
|
|
3372
|
+
if (Math.abs(sorted[i] - avg) <= tolerance) {
|
|
3186
3373
|
last.sum += sorted[i];
|
|
3187
3374
|
last.count++;
|
|
3188
3375
|
} else {
|
|
@@ -3239,6 +3426,20 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3239
3426
|
const numRows = rowYs.length - 1;
|
|
3240
3427
|
const numCols = colXs.length - 1;
|
|
3241
3428
|
if (numRows <= 0 || numCols <= 0) return [];
|
|
3429
|
+
const vBorders = Array.from(
|
|
3430
|
+
{ length: numRows },
|
|
3431
|
+
(_, r) => Array.from(
|
|
3432
|
+
{ length: numCols + 1 },
|
|
3433
|
+
(_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
|
|
3434
|
+
)
|
|
3435
|
+
);
|
|
3436
|
+
const hBorders = Array.from(
|
|
3437
|
+
{ length: numRows + 1 },
|
|
3438
|
+
(_, r) => Array.from(
|
|
3439
|
+
{ length: numCols },
|
|
3440
|
+
(_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
|
|
3441
|
+
)
|
|
3442
|
+
);
|
|
3242
3443
|
const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
|
|
3243
3444
|
const cells = [];
|
|
3244
3445
|
for (let r = 0; r < numRows; r++) {
|
|
@@ -3246,18 +3447,26 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3246
3447
|
if (occupied[r][c]) continue;
|
|
3247
3448
|
let colSpan = 1;
|
|
3248
3449
|
let rowSpan = 1;
|
|
3249
|
-
while (c + colSpan < numCols) {
|
|
3250
|
-
|
|
3251
|
-
|
|
3252
|
-
|
|
3253
|
-
|
|
3450
|
+
while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
|
|
3451
|
+
let canExpand = true;
|
|
3452
|
+
for (let dr = 0; dr < rowSpan; dr++) {
|
|
3453
|
+
if (vBorders[r + dr][c + colSpan]) {
|
|
3454
|
+
canExpand = false;
|
|
3455
|
+
break;
|
|
3456
|
+
}
|
|
3457
|
+
}
|
|
3458
|
+
if (!canExpand) break;
|
|
3254
3459
|
colSpan++;
|
|
3255
3460
|
}
|
|
3256
3461
|
while (r + rowSpan < numRows) {
|
|
3257
|
-
|
|
3258
|
-
|
|
3259
|
-
|
|
3260
|
-
|
|
3462
|
+
let hasLine = false;
|
|
3463
|
+
for (let dc = 0; dc < colSpan; dc++) {
|
|
3464
|
+
if (hBorders[r + rowSpan][c + dc]) {
|
|
3465
|
+
hasLine = true;
|
|
3466
|
+
break;
|
|
3467
|
+
}
|
|
3468
|
+
}
|
|
3469
|
+
if (hasLine) break;
|
|
3261
3470
|
rowSpan++;
|
|
3262
3471
|
}
|
|
3263
3472
|
for (let dr = 0; dr < rowSpan; dr++) {
|
|
@@ -3281,28 +3490,30 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3281
3490
|
}
|
|
3282
3491
|
return cells;
|
|
3283
3492
|
}
|
|
3284
|
-
function hasVerticalLine(verticals, x, topY, botY) {
|
|
3285
|
-
const tol =
|
|
3493
|
+
function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
|
|
3494
|
+
const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
|
|
3286
3495
|
for (const v of verticals) {
|
|
3287
3496
|
if (Math.abs(v.x1 - x) <= tol) {
|
|
3288
3497
|
const cellH = Math.abs(topY - botY);
|
|
3498
|
+
if (cellH < 0.1) continue;
|
|
3289
3499
|
const overlapTop = Math.min(v.y2, topY);
|
|
3290
3500
|
const overlapBot = Math.max(v.y1, botY);
|
|
3291
3501
|
const overlap = overlapTop - overlapBot;
|
|
3292
|
-
if (overlap >= cellH * 0.
|
|
3502
|
+
if (overlap >= cellH * 0.75) return true;
|
|
3293
3503
|
}
|
|
3294
3504
|
}
|
|
3295
3505
|
return false;
|
|
3296
3506
|
}
|
|
3297
|
-
function hasHorizontalLine(horizontals, y, leftX, rightX) {
|
|
3298
|
-
const tol =
|
|
3507
|
+
function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
|
|
3508
|
+
const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
|
|
3299
3509
|
for (const h of horizontals) {
|
|
3300
3510
|
if (Math.abs(h.y1 - y) <= tol) {
|
|
3301
3511
|
const cellW = Math.abs(rightX - leftX);
|
|
3512
|
+
if (cellW < 0.1) continue;
|
|
3302
3513
|
const overlapLeft = Math.max(h.x1, leftX);
|
|
3303
3514
|
const overlapRight = Math.min(h.x2, rightX);
|
|
3304
3515
|
const overlap = overlapRight - overlapLeft;
|
|
3305
|
-
if (overlap >= cellW * 0.
|
|
3516
|
+
if (overlap >= cellW * 0.75) return true;
|
|
3306
3517
|
}
|
|
3307
3518
|
}
|
|
3308
3519
|
return false;
|
|
@@ -3313,23 +3524,24 @@ function mapTextToCells(items, cells) {
|
|
|
3313
3524
|
result.set(cell, []);
|
|
3314
3525
|
}
|
|
3315
3526
|
for (const item of items) {
|
|
3316
|
-
const cx = item.x + item.w / 2;
|
|
3317
|
-
const cy = item.y;
|
|
3318
3527
|
const pad = CELL_PADDING;
|
|
3319
3528
|
let bestCell = null;
|
|
3320
|
-
let
|
|
3529
|
+
let bestScore = 0;
|
|
3321
3530
|
for (const cell of cells) {
|
|
3322
|
-
|
|
3323
|
-
|
|
3324
|
-
|
|
3325
|
-
|
|
3326
|
-
|
|
3327
|
-
|
|
3328
|
-
|
|
3329
|
-
|
|
3531
|
+
const ix1 = Math.max(item.x, cell.bbox.x1 - pad);
|
|
3532
|
+
const ix2 = Math.min(item.x + item.w, cell.bbox.x2 + pad);
|
|
3533
|
+
const iy1 = Math.max(item.y, cell.bbox.y1 - pad);
|
|
3534
|
+
const iy2 = Math.min(item.y + (item.h || item.fontSize), cell.bbox.y2 + pad);
|
|
3535
|
+
if (ix1 >= ix2 || iy1 >= iy2) continue;
|
|
3536
|
+
const intersectArea = (ix2 - ix1) * (iy2 - iy1);
|
|
3537
|
+
const itemArea = Math.max(item.w, 1) * Math.max(item.h || item.fontSize, 1);
|
|
3538
|
+
const score = intersectArea / itemArea;
|
|
3539
|
+
if (score > bestScore) {
|
|
3540
|
+
bestScore = score;
|
|
3541
|
+
bestCell = cell;
|
|
3330
3542
|
}
|
|
3331
3543
|
}
|
|
3332
|
-
if (bestCell) {
|
|
3544
|
+
if (bestCell && bestScore > 0.3) {
|
|
3333
3545
|
result.get(bestCell).push(item);
|
|
3334
3546
|
}
|
|
3335
3547
|
}
|
|
@@ -3356,8 +3568,13 @@ function cellTextToString(items) {
|
|
|
3356
3568
|
const textLines = lines.map((line) => {
|
|
3357
3569
|
const s = line.sort((a, b) => a.x - b.x);
|
|
3358
3570
|
if (s.length === 1) return s[0].text;
|
|
3571
|
+
const evenSpaced = detectEvenSpacedItems(s);
|
|
3359
3572
|
let result = s[0].text;
|
|
3360
3573
|
for (let j = 1; j < s.length; j++) {
|
|
3574
|
+
if (evenSpaced[j]) {
|
|
3575
|
+
result += s[j].text;
|
|
3576
|
+
continue;
|
|
3577
|
+
}
|
|
3361
3578
|
const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
|
|
3362
3579
|
const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
|
|
3363
3580
|
const prevIsKorean = /[가-힣]$/.test(result);
|
|
@@ -3372,6 +3589,57 @@ function cellTextToString(items) {
|
|
|
3372
3589
|
}
|
|
3373
3590
|
return result;
|
|
3374
3591
|
});
|
|
3592
|
+
return mergeCellTextLines(textLines);
|
|
3593
|
+
}
|
|
3594
|
+
function detectEvenSpacedItems(items) {
|
|
3595
|
+
const result = new Array(items.length).fill(false);
|
|
3596
|
+
if (items.length < 3) return result;
|
|
3597
|
+
let runStart = -1;
|
|
3598
|
+
for (let i = 0; i < items.length; i++) {
|
|
3599
|
+
const isShortKorean = /^[가-힣]{1}$/.test(items[i].text) || /^[\d]{1}$/.test(items[i].text);
|
|
3600
|
+
if (isShortKorean && runStart >= 0 && i > 0) {
|
|
3601
|
+
const gap = items[i].x - (items[i - 1].x + items[i - 1].w);
|
|
3602
|
+
const maxRunGap = Math.max(items[i].fontSize * 3, 30);
|
|
3603
|
+
if (gap > maxRunGap) {
|
|
3604
|
+
if (i - runStart >= 3) markEvenRun(items, result, runStart, i);
|
|
3605
|
+
runStart = i;
|
|
3606
|
+
continue;
|
|
3607
|
+
}
|
|
3608
|
+
}
|
|
3609
|
+
if (isShortKorean) {
|
|
3610
|
+
if (runStart < 0) runStart = i;
|
|
3611
|
+
} else {
|
|
3612
|
+
if (runStart >= 0 && i - runStart >= 3) {
|
|
3613
|
+
markEvenRun(items, result, runStart, i);
|
|
3614
|
+
}
|
|
3615
|
+
runStart = -1;
|
|
3616
|
+
}
|
|
3617
|
+
}
|
|
3618
|
+
if (runStart >= 0 && items.length - runStart >= 3) {
|
|
3619
|
+
markEvenRun(items, result, runStart, items.length);
|
|
3620
|
+
}
|
|
3621
|
+
return result;
|
|
3622
|
+
}
|
|
3623
|
+
/**
 * Marks the items of a candidate run as evenly spaced when its gaps are uniform.
 *
 * Only strictly positive gaps between neighbours are considered, and at least
 * two of them are required. Relative to the font size of the first item of the
 * run, the smallest gap must be >= 0.1x and the largest <= 3x; additionally the
 * largest gap may be at most three times the smallest. When all of that holds,
 * every item after the first one in the run gets its flag set.
 *
 * @param {Array<{x: number, w: number, fontSize: number}>} items Items of the line.
 * @param {boolean[]} result Flag array to mutate, parallel to `items`.
 * @param {number} start Index of the first item of the run (inclusive).
 * @param {number} end Index just past the last item of the run (exclusive).
 * @returns {void}
 */
function markEvenRun(items, result, start, end) {
  const positiveGaps = [];
  for (let idx = start + 1; idx < end; idx++) {
    const before = items[idx - 1];
    const distance = items[idx].x - (before.x + before.w);
    if (distance > 0) positiveGaps.push(distance);
  }
  if (positiveGaps.length < 2) return;

  const narrowest = positiveGaps.reduce((lo, gap) => (gap < lo ? gap : lo), Infinity);
  const widest = positiveGaps.reduce((hi, gap) => (gap > hi ? gap : hi), -Infinity);

  // The reference size is the first item's font size, not an average.
  const referenceSize = items[start].fontSize;
  const isUniform =
    narrowest >= referenceSize * 0.1 &&
    widest <= referenceSize * 3 &&
    widest / Math.max(narrowest, 0.1) <= 3;
  if (!isUniform) return;

  // The first item of the run is intentionally left unflagged.
  for (let idx = start + 1; idx < end; idx++) {
    result[idx] = true;
  }
}
|
|
3642
|
+
function mergeCellTextLines(textLines) {
|
|
3375
3643
|
if (textLines.length <= 1) return textLines[0] || "";
|
|
3376
3644
|
const merged = [textLines[0]];
|
|
3377
3645
|
for (let i = 1; i < textLines.length; i++) {
|
|
@@ -3397,24 +3665,172 @@ var Y_TOL = 3;
|
|
|
3397
3665
|
var COL_CLUSTER_TOL = 15;
|
|
3398
3666
|
var MIN_ROWS = 3;
|
|
3399
3667
|
var MIN_COLS = 2;
|
|
3400
|
-
var MIN_GAP_FACTOR =
|
|
3401
|
-
var
|
|
3668
|
+
var MIN_GAP_FACTOR = 2;
|
|
3669
|
+
var MIN_GAP_ABSOLUTE = 20;
|
|
3670
|
+
var MIN_COL_FILL_RATIO = 0.4;
|
|
3402
3671
|
/**
 * Detects border-less tables on a page from the positions of its text items.
 *
 * Evenly spaced single-character runs are merged first, then the items are
 * grouped into baseline rows. A header-driven pass runs first; only if it
 * yields no table does the gap/column-cluster pass run. Every table that is
 * built has its `usedItems` set extended with the original items behind any
 * merged item, so callers can match them against the items they passed in.
 *
 * @param {Array<object>} items Text items of the page (text, x, y, w, h, fontSize, fontName).
 * @param {number} pageNum Page number forwarded to `buildClusterTable`.
 * @returns {Array<object>} Tables produced by `buildClusterTable`; empty when none qualify.
 */
function detectClusterTables(items, pageNum) {
  if (items.length < MIN_ROWS * MIN_COLS) return [];
  const { merged, originMap } = mergeEvenSpacedClusters(items);
  const rows = groupByBaseline(merged);
  if (rows.length < MIN_ROWS) return [];

  const tables = [];
  // Builds one table and records it, mapping merged items back to their origins.
  const collect = (tableRows, tableColumns) => {
    const table = buildClusterTable(tableRows, tableColumns, pageNum);
    if (!table) return;
    expandUsedItems(table.usedItems, originMap);
    tables.push(table);
  };

  // Pass 1: anchor the columns on a detected header row.
  const header = detectHeaderRow(rows);
  if (header) {
    const { columns, headerIdx } = header;
    const headerItems = [...rows[headerIdx].items].sort((a, b) => a.x - b.x);
    const candidateRows = mergeMultiLineRows(rows.slice(headerIdx), columns);
    for (const region of findTableRegionsByHeader(candidateRows, columns, headerItems)) {
      collect(region.rows, columns);
    }
  }
  if (tables.length > 0) return tables;

  // Pass 2: fall back to column clusters derived from rows with large gaps.
  const gappedRows = rows.filter((row) => hasSuspiciousGaps(row));
  if (gappedRows.length < MIN_ROWS) return tables;
  const columns = extractColumnClusters(gappedRows);
  if (columns.length < MIN_COLS) return tables;
  for (const region of findTableRegions(rows, columns)) {
    collect(mergeMultiLineRows(region.rows, columns), columns);
  }
  return tables;
}
|
|
3712
|
+
/**
 * Collapses runs of evenly spaced single characters into one synthetic item.
 *
 * Items are grouped into rows with `groupByBaseline` and scanned left to right.
 * A run consists of items whose text is exactly one Hangul syllable or digit,
 * each separated from its predecessor by a gap within [0.1, 3] times its font
 * size. A run of three or more items whose smallest gap is positive and whose
 * largest gap is at most three times the smallest is replaced by one item
 * spanning the run (position, height and font taken from the first item).
 * Everything else is passed through unchanged, one item at a time.
 *
 * @param {Array<object>} items Text items (text, x, y, w, h, fontSize, fontName).
 * @returns {{merged: Array<object>, originMap: Map<object, Array<object>>}}
 *   `merged` is the resulting item list; `originMap` maps each synthetic item
 *   to the original items it replaced.
 */
function mergeEvenSpacedClusters(items) {
  const originMap = /* @__PURE__ */ new Map();
  const merged = [];
  const isSingleChar = (entry) => /^[가-힣\d]$/.test(entry.text);

  for (const row of groupByBaseline(items)) {
    const ordered = [...row.items].sort((a, b) => a.x - b.x);
    const gapBefore = (idx) => ordered[idx].x - (ordered[idx - 1].x + ordered[idx - 1].w);

    let pos = 0;
    while (pos < ordered.length) {
      // Extend the run as far as single characters with plausible gaps go.
      let stop = pos + 1;
      if (isSingleChar(ordered[pos])) {
        while (stop < ordered.length && isSingleChar(ordered[stop])) {
          const gap = gapBefore(stop);
          const size = ordered[stop].fontSize;
          if (gap < size * 0.1 || gap > size * 3) break;
          stop++;
        }
      }

      let isUniform = false;
      if (stop - pos >= 3) {
        let narrowest = Infinity;
        let widest = -Infinity;
        for (let idx = pos + 1; idx < stop; idx++) {
          const gap = gapBefore(idx);
          if (gap < narrowest) narrowest = gap;
          if (gap > widest) widest = gap;
        }
        isUniform = narrowest > 0 && widest / narrowest <= 3;
      }

      if (!isUniform) {
        // Not mergeable: emit only the current item and retry from the next one.
        merged.push(ordered[pos]);
        pos++;
        continue;
      }

      const cluster = ordered.slice(pos, stop);
      const head = cluster[0];
      const tail = cluster[cluster.length - 1];
      const combined = {
        text: cluster.map((part) => part.text).join(""),
        x: head.x,
        y: head.y,
        w: tail.x + tail.w - head.x,
        h: head.h,
        fontSize: head.fontSize,
        fontName: head.fontName
      };
      originMap.set(combined, cluster);
      merged.push(combined);
      pos = stop;
    }
  }
  return { merged, originMap };
}
|
|
3764
|
+
/**
 * Adds, for every merged item in `usedItems`, the original items it stands for.
 *
 * The originals are gathered before anything is added, so the set is never
 * modified while it is being traversed. Items without an entry in `originMap`
 * are left as they are; nothing is removed from the set.
 *
 * @param {Set<object>} usedItems Set of items consumed by a table; mutated in place.
 * @param {Map<object, Array<object>>} originMap Merged item -> original items.
 * @returns {void}
 */
function expandUsedItems(usedItems, originMap) {
  const pending = [];
  usedItems.forEach((entry) => {
    const sources = originMap.get(entry);
    if (!sources) return;
    for (const source of sources) pending.push(source);
  });
  pending.forEach((source) => usedItems.add(source));
}
|
|
3772
|
+
/**
 * Finds the first row that looks like a table header and derives columns from it.
 *
 * A row qualifies when it has between MIN_COLS and 6 items, no item longer
 * than 8 characters, at least one item containing Hangul, no item starting
 * with a bullet-like symbol, a horizontal extent of at least 40% of the extent
 * of all items, and at least one gap of 2.5x the row's mean font size or more.
 * The rows below it are then checked with `countMatchedColumnsRange`; at least
 * MIN_ROWS of them must match MIN_COLS columns (counting stops at MIN_ROWS + 2).
 *
 * @param {Array<{y: number, items: Array<object>}>} rows Baseline rows, top to bottom.
 * @returns {{columns: Array<{x: number, count: number}>, headerIdx: number} | null}
 *   Columns (one per header item, left to right) and the header's row index,
 *   or null when no row qualifies.
 */
function detectHeaderRow(rows) {
  const everyItem = rows.flatMap((row) => row.items);
  if (everyItem.length === 0) return null;

  // Horizontal extent covered by all items together.
  let leftEdge = Infinity;
  let rightEdge = -Infinity;
  for (const item of everyItem) {
    if (item.x < leftEdge) leftEdge = item.x;
    const itemRight = item.x + item.w;
    if (itemRight > rightEdge) rightEdge = itemRight;
  }
  const pageSpan = rightEdge - leftEdge;
  if (pageSpan <= 0) return null;

  for (const [rowIdx, row] of rows.entries()) {
    const cells = row.items;
    if (cells.length < MIN_COLS || cells.length > 6) continue;
    if (cells.some((cell) => cell.text.length > 8)) continue;
    if (!cells.some((cell) => /[가-힣]/.test(cell.text))) continue;
    if (cells.some((cell) => /^[□■○●·※▶▷◆◇\-]/.test(cell.text))) continue;

    const ordered = [...cells].sort((a, b) => a.x - b.x);
    const rightmost = ordered[ordered.length - 1];
    const rowSpan = rightmost.x + rightmost.w - ordered[0].x;
    if (rowSpan / pageSpan < 0.4) continue;

    const meanFontSize = ordered.reduce((sum, cell) => sum + cell.fontSize, 0) / ordered.length;
    const hasWideGap = ordered.some((cell, idx) => {
      if (idx === 0) return false;
      const before = ordered[idx - 1];
      return cell.x - (before.x + before.w) >= meanFontSize * 2.5;
    });
    if (!hasWideGap) continue;

    const columns = ordered.map((cell) => ({ x: cell.x, count: 0 }));
    let matchingRows = 0;
    for (let below = rowIdx + 1; below < rows.length && matchingRows < MIN_ROWS + 2; below++) {
      if (countMatchedColumnsRange(rows[below], columns, ordered) >= MIN_COLS) matchingRows++;
    }
    if (matchingRows >= MIN_ROWS) return { columns, headerIdx: rowIdx };
  }
  return null;
}
|
|
3813
|
+
/**
 * Folds continuation lines into the row above them.
 *
 * A row is treated as a continuation of the previously kept row when it is
 * vertically close (less than 1.8x the mean font size of all items, or of 12
 * when there are no items), has at most two items, and either matches fewer
 * than MIN_COLS columns or consists of a single item. The kept row retains its
 * own `y` and gains the continuation's items; input rows are not mutated.
 *
 * @param {Array<{y: number, items: Array<object>}>} rows Rows in reading order.
 * @param {Array<object>} columns Column descriptors passed to `countMatchedColumns`.
 * @returns {Array<{y: number, items: Array<object>}>} The same array when it has
 *   at most one row, otherwise a new array of merged rows.
 */
function mergeMultiLineRows(rows, columns) {
  if (rows.length <= 1) return rows;

  const fontSizes = rows.flatMap((row) => row.items).map((item) => item.fontSize);
  const meanFontSize = fontSizes.length > 0
    ? fontSizes.reduce((sum, size) => sum + size, 0) / fontSizes.length
    : 12;

  const mergedRows = [rows[0]];
  for (const row of rows.slice(1)) {
    const lastIdx = mergedRows.length - 1;
    const target = mergedRows[lastIdx];
    const lineGap = Math.abs(target.y - row.y);
    const matchedCols = countMatchedColumns(row, columns);
    const isContinuation =
      lineGap < meanFontSize * 1.8 &&
      row.items.length <= 2 &&
      (matchedCols < MIN_COLS || row.items.length === 1);
    if (!isContinuation) {
      mergedRows.push(row);
      continue;
    }
    // Keep the upper row's baseline; append the continuation's items.
    mergedRows[lastIdx] = { y: target.y, items: [...target.items, ...row.items] };
  }
  return mergedRows;
}
|
|
3418
3834
|
function groupByBaseline(items) {
|
|
3419
3835
|
if (items.length === 0) return [];
|
|
3420
3836
|
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
@@ -3436,8 +3852,9 @@ function groupByBaseline(items) {
|
|
|
3436
3852
|
function hasSuspiciousGaps(row) {
|
|
3437
3853
|
if (row.items.length < 2) return false;
|
|
3438
3854
|
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3855
|
+
if (sorted.length === 2 && sorted[1].text.length > 20) return false;
|
|
3439
3856
|
const avgFontSize = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
|
|
3440
|
-
const minGap = avgFontSize * MIN_GAP_FACTOR;
|
|
3857
|
+
const minGap = Math.max(avgFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
|
|
3441
3858
|
for (let i = 1; i < sorted.length; i++) {
|
|
3442
3859
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
3443
3860
|
if (gap >= minGap) return true;
|
|
@@ -3464,6 +3881,41 @@ function extractColumnClusters(rows) {
|
|
|
3464
3881
|
const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
|
|
3465
3882
|
return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
|
|
3466
3883
|
}
|
|
3884
|
+
/**
 * Splits rows into table regions using the column ranges implied by a header.
 *
 * Rows matching at least MIN_COLS header columns extend the current region.
 * A non-matching row is still tolerated inside a started region when it has at
 * most two items or when the row before it was not itself a tolerated miss.
 * Any other row closes the region: trailing non-matching rows are trimmed and
 * the region is kept if at least MIN_ROWS rows remain. The row that closed the
 * region is discarded.
 *
 * @param {Array<{items: Array<object>}>} allRows Candidate rows, header first.
 * @param {Array<object>} columns Column descriptors (forwarded to the matcher).
 * @param {Array<{x: number, w: number}>} headerItems Header items, left to right.
 * @returns {Array<{rows: Array<object>}>} Detected regions in document order.
 */
function findTableRegionsByHeader(allRows, columns, headerItems) {
  const regions = [];
  const fitsHeader = (row) => countMatchedColumnsRange(row, columns, headerItems) >= MIN_COLS;

  let pending = [];
  let missStreak = 0;

  // Trims trailing non-matching rows, stores the region if large enough, resets state.
  const closeRegion = () => {
    while (pending.length > 0 && !fitsHeader(pending[pending.length - 1])) {
      pending.pop();
    }
    if (pending.length >= MIN_ROWS) {
      regions.push({ rows: pending });
    }
    pending = [];
    missStreak = 0;
  };

  for (const row of allRows) {
    if (fitsHeader(row)) {
      pending.push(row);
      missStreak = 0;
      continue;
    }
    const isTolerated = pending.length > 0 && (row.items.length <= 2 || missStreak === 0);
    if (isTolerated) {
      pending.push(row);
      missStreak++;
      continue;
    }
    closeRegion();
  }
  closeRegion();
  return regions;
}
|
|
3467
3919
|
function findTableRegions(allRows, columns) {
|
|
3468
3920
|
const regions = [];
|
|
3469
3921
|
let currentRegion = [];
|
|
@@ -3499,18 +3951,81 @@ function countMatchedColumns(row, columns) {
|
|
|
3499
3951
|
}
|
|
3500
3952
|
return matched.size;
|
|
3501
3953
|
}
|
|
3502
|
-
function
|
|
3503
|
-
const
|
|
3504
|
-
let
|
|
3505
|
-
|
|
3506
|
-
|
|
3507
|
-
|
|
3508
|
-
|
|
3509
|
-
|
|
3510
|
-
|
|
3954
|
+
/**
 * Counts how many header-defined columns contain at least one item of a row.
 *
 * Each header item owns a half-open x range [left, right). Neighbouring ranges
 * meet halfway between the right edge of one header item and the left edge of
 * the next. The outer ranges are open-ended on both sides: the first column
 * extends to -Infinity and the last to +Infinity, so every item with a finite
 * `x` falls into exactly one column. (Previously the first column started at
 * 0, which silently dropped items with a negative x coordinate.)
 *
 * @param {{items: Array<{x: number}>}} row Row whose items are classified by `x`.
 * @param {Array<object>} columns Unused; kept for signature compatibility with callers.
 * @param {Array<{x: number, w: number}>} headerItems Header items sorted left to right.
 * @returns {number} Number of distinct columns hit by the row's items.
 */
function countMatchedColumnsRange(row, columns, headerItems) {
  const lastIdx = headerItems.length - 1;
  const bounds = headerItems.map((header, ci) => ({
    left: ci === 0
      ? -Infinity
      : (headerItems[ci - 1].x + headerItems[ci - 1].w + header.x) / 2,
    right: ci === lastIdx
      ? Infinity
      : (header.x + header.w + headerItems[ci + 1].x) / 2
  }));

  const matched = /* @__PURE__ */ new Set();
  for (const item of row.items) {
    // Ranges do not overlap, so the first hit is the only hit.
    const ci = bounds.findIndex(({ left, right }) => item.x >= left && item.x < right);
    if (ci !== -1) matched.add(ci);
  }
  return matched.size;
}
|
|
3972
|
+
/**
 * Splits the items of one row into groups and assigns each group to a column.
 *
 * Items are sorted by `x` and cut at the largest horizontal gaps (at most
 * `numCols - 1` cuts). A gap qualifies as a cut when it is at least 12, or,
 * for rows with more than `numCols + 1` items, at least max(2.5 x median gap, 12).
 * Groups are then matched to columns greedily by the distance between the
 * group's horizontal midpoint and the column's `x`, closest pairs first, each
 * column taken at most once. Any group left over goes to its nearest column.
 *
 * @param {Array<{x: number, w: number}>} items Items of the row.
 * @param {Array<{x: number}>} columns Column descriptors.
 * @param {number} numCols Number of columns to consider.
 * @returns {Array<{col: number, items: Array<object>}>} Assignments in the order
 *   they were made (closest pair first), not in column order.
 */
function assignRowItems(items, columns, numCols) {
  if (items.length === 0) return [];

  const ordered = [...items].sort((a, b) => a.x - b.x);
  const columnXs = columns.map((column) => column.x);

  // Gap in front of every item except the first, tagged with that item's index.
  const gaps = ordered.slice(1).map((item, k) => ({
    idx: k + 1,
    size: item.x - (ordered[k].x + ordered[k].w)
  }));
  const ascendingSizes = gaps.map((gap) => gap.size).sort((a, b) => a - b);
  const medianGap = ascendingSizes.length > 0
    ? ascendingSizes[Math.floor(ascendingSizes.length / 2)]
    : 0;
  const gapThreshold = ordered.length <= numCols + 1 ? 12 : Math.max(medianGap * 2.5, 12);

  // Keep the widest qualifying gaps, then restore left-to-right order.
  const cutIndices = gaps
    .filter((gap) => gap.size >= gapThreshold)
    .sort((a, b) => b.size - a.size)
    .slice(0, numCols - 1)
    .map((gap) => gap.idx)
    .sort((a, b) => a - b);

  const groups = [];
  let from = 0;
  for (const cut of cutIndices) {
    groups.push(ordered.slice(from, cut));
    from = cut;
  }
  groups.push(ordered.slice(from));

  const centers = groups.map((group) => {
    let left = Infinity;
    let right = -Infinity;
    for (const member of group) {
      if (member.x < left) left = member.x;
      const memberRight = member.x + member.w;
      if (memberRight > right) right = memberRight;
    }
    return (left + right) / 2;
  });
  const distance = (gi, ci) => Math.abs(centers[gi] - columnXs[ci]);

  // Every (group, column) pair, closest first; ties keep generation order.
  const pairs = [];
  groups.forEach((_, gi) => {
    for (let ci = 0; ci < numCols; ci++) {
      pairs.push({ gi, ci, dist: distance(gi, ci) });
    }
  });
  pairs.sort((a, b) => a.dist - b.dist);

  const placed = [];
  const takenGroups = /* @__PURE__ */ new Set();
  const takenCols = /* @__PURE__ */ new Set();
  for (const { gi, ci } of pairs) {
    if (takenGroups.has(gi) || takenCols.has(ci)) continue;
    placed.push({ col: ci, items: groups[gi] });
    takenGroups.add(gi);
    takenCols.add(ci);
  }

  // Groups that found no free column share the nearest one (column 0 by default).
  groups.forEach((group, gi) => {
    if (takenGroups.has(gi)) return;
    let bestCol = 0;
    let bestDist = Infinity;
    for (let ci = 0; ci < numCols; ci++) {
      const candidate = distance(gi, ci);
      if (candidate < bestDist) {
        bestDist = candidate;
        bestCol = ci;
      }
    }
    placed.push({ col: bestCol, items: group });
  });
  return placed;
}
|
|
3515
4030
|
function buildClusterTable(rows, columns, pageNum) {
|
|
3516
4031
|
const numCols = columns.length;
|
|
@@ -3528,12 +4043,12 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3528
4043
|
usedItems.add(row.items[0]);
|
|
3529
4044
|
continue;
|
|
3530
4045
|
}
|
|
3531
|
-
|
|
3532
|
-
|
|
3533
|
-
|
|
4046
|
+
const assignments = assignRowItems(row.items, columns, numCols);
|
|
4047
|
+
for (const { col, items } of assignments) {
|
|
4048
|
+
const text = items.map((i) => i.text).join(" ");
|
|
3534
4049
|
const existing = cells[r][col].text;
|
|
3535
|
-
cells[r][col].text = existing ? existing + " " +
|
|
3536
|
-
usedItems.add(item);
|
|
4050
|
+
cells[r][col].text = existing ? existing + " " + text : text;
|
|
4051
|
+
for (const item of items) usedItems.add(item);
|
|
3537
4052
|
}
|
|
3538
4053
|
}
|
|
3539
4054
|
let emptyRows = 0;
|
|
@@ -3545,11 +4060,48 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3545
4060
|
const hasValue = cells.some((row) => row[c].text !== "");
|
|
3546
4061
|
if (!hasValue) return null;
|
|
3547
4062
|
}
|
|
4063
|
+
for (let r = numRows - 1; r >= 1; r--) {
|
|
4064
|
+
const nonEmptyCols = cells[r].filter((c) => c.text.trim()).length;
|
|
4065
|
+
if (nonEmptyCols !== 1) continue;
|
|
4066
|
+
if (cells[r][0].text.trim() !== "") continue;
|
|
4067
|
+
const contentText = cells[r].find((c) => c.text.trim())?.text.trim() || "";
|
|
4068
|
+
if (/^[○●▶\-·]/.test(contentText)) continue;
|
|
4069
|
+
for (let pr = r - 1; pr >= 0; pr--) {
|
|
4070
|
+
if (cells[pr].some((c) => c.text.trim())) {
|
|
4071
|
+
for (let c = 0; c < numCols; c++) {
|
|
4072
|
+
const prev = cells[pr][c].text.trim();
|
|
4073
|
+
const curr = cells[r][c].text.trim();
|
|
4074
|
+
if (curr) cells[pr][c].text = prev ? prev + " " + curr : curr;
|
|
4075
|
+
}
|
|
4076
|
+
for (let c = 0; c < numCols; c++) cells[r][c].text = "";
|
|
4077
|
+
break;
|
|
4078
|
+
}
|
|
4079
|
+
}
|
|
4080
|
+
}
|
|
4081
|
+
for (let r = 0; r < cells.length - 1; r++) {
|
|
4082
|
+
const row = cells[r];
|
|
4083
|
+
const hasCol0 = row[0].text.trim() !== "";
|
|
4084
|
+
const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
|
|
4085
|
+
const midEmpty = row.slice(1, numCols - 1).every((c) => c.text.trim() === "");
|
|
4086
|
+
if (hasCol0 && hasColLast && midEmpty) {
|
|
4087
|
+
const next = cells[r + 1];
|
|
4088
|
+
if (next[0].text.trim() === "" && next.some((c) => c.text.trim())) {
|
|
4089
|
+
for (let c = 1; c < numCols; c++) {
|
|
4090
|
+
const curr = next[c].text.trim();
|
|
4091
|
+
if (curr) row[c].text = row[c].text.trim() ? row[c].text.trim() + " " + curr : curr;
|
|
4092
|
+
}
|
|
4093
|
+
for (let c = 0; c < numCols; c++) next[c].text = "";
|
|
4094
|
+
}
|
|
4095
|
+
}
|
|
4096
|
+
}
|
|
4097
|
+
const filteredCells = cells.filter((row) => row.some((c) => c.text.trim()));
|
|
4098
|
+
const finalRowCount = filteredCells.length;
|
|
4099
|
+
if (finalRowCount < MIN_ROWS) return null;
|
|
3548
4100
|
const irTable = {
|
|
3549
|
-
rows:
|
|
4101
|
+
rows: finalRowCount,
|
|
3550
4102
|
cols: numCols,
|
|
3551
|
-
cells,
|
|
3552
|
-
hasHeader:
|
|
4103
|
+
cells: filteredCells,
|
|
4104
|
+
hasHeader: finalRowCount > 1
|
|
3553
4105
|
};
|
|
3554
4106
|
const allItems = rows.flatMap((r) => r.items);
|
|
3555
4107
|
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
@@ -3626,7 +4178,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3626
4178
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
3627
4179
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
3628
4180
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
3629
|
-
const
|
|
4181
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
3630
4182
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
3631
4183
|
let parsedPages = 0;
|
|
3632
4184
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
@@ -3643,7 +4195,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3643
4195
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
3644
4196
|
}
|
|
3645
4197
|
for (const item of visible) {
|
|
3646
|
-
if (item.fontSize > 0)
|
|
4198
|
+
if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
|
|
3647
4199
|
}
|
|
3648
4200
|
const opList = await page.getOperatorList();
|
|
3649
4201
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -3682,7 +4234,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3682
4234
|
blocks.splice(removed[ri], 1);
|
|
3683
4235
|
}
|
|
3684
4236
|
}
|
|
3685
|
-
const medianFontSize =
|
|
4237
|
+
const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
|
|
3686
4238
|
if (medianFontSize > 0) {
|
|
3687
4239
|
detectHeadings(blocks, medianFontSize);
|
|
3688
4240
|
}
|
|
@@ -3735,11 +4287,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
3735
4287
|
}
|
|
3736
4288
|
return { visible, hiddenCount };
|
|
3737
4289
|
}
|
|
3738
|
-
function
|
|
3739
|
-
if (
|
|
3740
|
-
|
|
3741
|
-
const
|
|
3742
|
-
|
|
4290
|
+
/**
 * Computes the median font size from a size -> occurrence-count histogram.
 *
 * Sizes are walked in ascending order while accumulating their counts; the
 * first size whose running total exceeds floor(total / 2) is the median. For
 * an even number of observations this yields the upper of the two middle values.
 *
 * @param {Map<number, number>} freq Font size mapped to how often it occurred.
 * @returns {number} The median font size, or 0 when the histogram is empty.
 */
function computeMedianFontSizeFromFreq(freq) {
  if (freq.size === 0) return 0;

  let totalCount = 0;
  freq.forEach((count) => {
    totalCount += count;
  });
  const half = Math.floor(totalCount / 2);

  const ascending = [...freq.entries()].sort(([sizeA], [sizeB]) => sizeA - sizeB);
  let seen = 0;
  for (let idx = 0; idx < ascending.length; idx++) {
    seen += ascending[idx][1];
    if (seen > half) return ascending[idx][0];
  }
  // Only reachable when the counts never exceed the midpoint (e.g. all zero).
  return ascending[ascending.length - 1][0];
}
|
|
3744
4303
|
function detectHeadings(blocks, medianFontSize) {
|
|
3745
4304
|
for (const block of blocks) {
|
|
@@ -3765,11 +4324,21 @@ function collapseEvenSpacing(text) {
|
|
|
3765
4324
|
if (tokens.length >= 3 && singleCharCount / tokens.length >= 0.7) {
|
|
3766
4325
|
return tokens.join("");
|
|
3767
4326
|
}
|
|
3768
|
-
return text
|
|
4327
|
+
return text.replace(
|
|
4328
|
+
/(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
|
|
4329
|
+
(match) => match.replace(/ /g, "")
|
|
4330
|
+
);
|
|
3769
4331
|
}
|
|
3770
4332
|
function shouldDemoteTable(table) {
|
|
3771
4333
|
const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
|
|
3772
4334
|
const allText = allCells.join(" ");
|
|
4335
|
+
if (table.rows <= 3 && table.cols <= 3) {
|
|
4336
|
+
const totalCells2 = table.rows * table.cols;
|
|
4337
|
+
const emptyCells2 = totalCells2 - allCells.length;
|
|
4338
|
+
if (emptyCells2 >= totalCells2 * 0.3) return true;
|
|
4339
|
+
if (/[□■◆○●▶ㅇ]/.test(allText)) return true;
|
|
4340
|
+
if (/<[^>]+>/.test(allText)) return true;
|
|
4341
|
+
}
|
|
3773
4342
|
if (allText.length > 200) return false;
|
|
3774
4343
|
if (/[□■◆○●▶]/.test(allText) && table.rows <= 3) return true;
|
|
3775
4344
|
const totalCells = table.rows * table.cols;
|
|
@@ -3880,6 +4449,7 @@ function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeigh
|
|
|
3880
4449
|
if (items.length === 0) return [];
|
|
3881
4450
|
let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
|
|
3882
4451
|
({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
|
|
4452
|
+
({ horizontals, verticals } = preprocessLines(horizontals, verticals));
|
|
3883
4453
|
const grids = buildTableGrids(horizontals, verticals);
|
|
3884
4454
|
if (grids.length > 0) {
|
|
3885
4455
|
return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
|
|
@@ -3891,14 +4461,19 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
3891
4461
|
const usedItems = /* @__PURE__ */ new Set();
|
|
3892
4462
|
const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
3893
4463
|
for (const grid of sortedGrids) {
|
|
4464
|
+
const numGridRows = grid.rowYs.length - 1;
|
|
4465
|
+
const numGridCols = grid.colXs.length - 1;
|
|
4466
|
+
if (numGridRows === 1 && numGridCols >= 2) continue;
|
|
3894
4467
|
const tableItems = [];
|
|
3895
4468
|
const pad = 3;
|
|
4469
|
+
const gridW = grid.bbox.x2 - grid.bbox.x1;
|
|
3896
4470
|
for (const item of items) {
|
|
3897
4471
|
if (usedItems.has(item)) continue;
|
|
3898
|
-
if (item.
|
|
3899
|
-
|
|
3900
|
-
|
|
3901
|
-
|
|
4472
|
+
if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
|
|
4473
|
+
if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
|
|
4474
|
+
if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
|
|
4475
|
+
tableItems.push(item);
|
|
4476
|
+
usedItems.add(item);
|
|
3902
4477
|
}
|
|
3903
4478
|
const cells = extractCells(grid, horizontals, verticals);
|
|
3904
4479
|
if (cells.length === 0) continue;
|
|
@@ -3922,6 +4497,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
3922
4497
|
const cellItems = cellTextMap.get(cell) || [];
|
|
3923
4498
|
let text = cellTextToString(cellItems);
|
|
3924
4499
|
text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
|
|
4500
|
+
text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
|
|
3925
4501
|
irGrid[cell.row][cell.col] = {
|
|
3926
4502
|
text,
|
|
3927
4503
|
colSpan: cell.colSpan,
|
|
@@ -3946,23 +4522,58 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
3946
4522
|
if (shouldDemoteTable(irTable)) {
|
|
3947
4523
|
const demoted = demoteTableToText(irTable);
|
|
3948
4524
|
if (demoted) {
|
|
3949
|
-
|
|
4525
|
+
const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
|
|
4526
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
|
|
3950
4527
|
}
|
|
3951
4528
|
continue;
|
|
3952
4529
|
}
|
|
3953
4530
|
blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
|
|
3954
4531
|
}
|
|
3955
|
-
|
|
4532
|
+
let remaining = items.filter((i) => !usedItems.has(i));
|
|
3956
4533
|
if (remaining.length > 0) {
|
|
3957
4534
|
remaining.sort((a, b) => b.y - a.y || a.x - b.x);
|
|
3958
|
-
const
|
|
3959
|
-
|
|
3960
|
-
|
|
4535
|
+
const clusterItems = remaining.map((i) => ({
|
|
4536
|
+
text: i.text,
|
|
4537
|
+
x: i.x,
|
|
4538
|
+
y: i.y,
|
|
4539
|
+
w: i.w,
|
|
4540
|
+
h: i.h,
|
|
4541
|
+
fontSize: i.fontSize,
|
|
4542
|
+
fontName: i.fontName
|
|
4543
|
+
}));
|
|
4544
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4545
|
+
if (clusterResults.length > 0) {
|
|
4546
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4547
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4548
|
+
const usedClusterIndices = /* @__PURE__ */ new Set();
|
|
4549
|
+
for (const cr of clusterResults) {
|
|
4550
|
+
for (const ci of cr.usedItems) {
|
|
4551
|
+
const idx = ciToIdx.get(ci);
|
|
4552
|
+
if (idx !== void 0) usedClusterIndices.add(idx);
|
|
4553
|
+
}
|
|
4554
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4555
|
+
}
|
|
4556
|
+
remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
|
|
4557
|
+
}
|
|
4558
|
+
if (remaining.length > 0) {
|
|
4559
|
+
const allY = remaining.map((i) => i.y);
|
|
4560
|
+
const pageH = safeMax(allY) - safeMin(allY);
|
|
4561
|
+
const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
|
|
4562
|
+
const textBlocks = [];
|
|
4563
|
+
for (const group of groups) {
|
|
4564
|
+
if (group.length === 0) continue;
|
|
4565
|
+
const groupBlocks = extractPageBlocksFallback(group, pageNum);
|
|
4566
|
+
for (const b of groupBlocks) textBlocks.push(b);
|
|
4567
|
+
}
|
|
4568
|
+
const finalTextBlocks = detectListBlocks(textBlocks);
|
|
4569
|
+
for (const b of finalTextBlocks) blocks.push(b);
|
|
4570
|
+
}
|
|
4571
|
+
blocks.sort((a, b) => {
|
|
3961
4572
|
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
3962
4573
|
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
3963
4574
|
return by - ay;
|
|
3964
4575
|
});
|
|
3965
|
-
return mergeAdjacentTableBlocks(
|
|
4576
|
+
return mergeAdjacentTableBlocks(blocks);
|
|
3966
4577
|
}
|
|
3967
4578
|
return mergeAdjacentTableBlocks(blocks);
|
|
3968
4579
|
}
|
|
@@ -3989,52 +4600,52 @@ function mergeAdjacentTableBlocks(blocks) {
|
|
|
3989
4600
|
function extractPageBlocksFallback(items, pageNum) {
|
|
3990
4601
|
if (items.length === 0) return [];
|
|
3991
4602
|
const blocks = [];
|
|
3992
|
-
const
|
|
3993
|
-
|
|
3994
|
-
|
|
3995
|
-
|
|
3996
|
-
|
|
3997
|
-
|
|
3998
|
-
|
|
3999
|
-
|
|
4000
|
-
|
|
4001
|
-
|
|
4002
|
-
|
|
4003
|
-
|
|
4004
|
-
|
|
4005
|
-
|
|
4006
|
-
|
|
4007
|
-
|
|
4008
|
-
|
|
4009
|
-
|
|
4010
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4011
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4012
|
-
const usedIndices = /* @__PURE__ */ new Set();
|
|
4013
|
-
for (const cr of clusterResults) {
|
|
4014
|
-
for (const ci of cr.usedItems) {
|
|
4015
|
-
const idx = ciToIdx.get(ci);
|
|
4016
|
-
if (idx !== void 0) usedIndices.add(idx);
|
|
4017
|
-
}
|
|
4018
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4603
|
+
const clusterItems = items.map((i) => ({
|
|
4604
|
+
text: i.text,
|
|
4605
|
+
x: i.x,
|
|
4606
|
+
y: i.y,
|
|
4607
|
+
w: i.w,
|
|
4608
|
+
h: i.h,
|
|
4609
|
+
fontSize: i.fontSize,
|
|
4610
|
+
fontName: i.fontName
|
|
4611
|
+
}));
|
|
4612
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4613
|
+
if (clusterResults.length > 0) {
|
|
4614
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4615
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4616
|
+
const usedIndices = /* @__PURE__ */ new Set();
|
|
4617
|
+
for (const cr of clusterResults) {
|
|
4618
|
+
for (const ci of cr.usedItems) {
|
|
4619
|
+
const idx = ciToIdx.get(ci);
|
|
4620
|
+
if (idx !== void 0) usedIndices.add(idx);
|
|
4019
4621
|
}
|
|
4020
|
-
|
|
4021
|
-
|
|
4022
|
-
|
|
4023
|
-
|
|
4024
|
-
|
|
4025
|
-
|
|
4026
|
-
|
|
4027
|
-
|
|
4028
|
-
|
|
4622
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4623
|
+
}
|
|
4624
|
+
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
4625
|
+
if (remaining.length > 0) {
|
|
4626
|
+
const yLines = groupByY(remaining);
|
|
4627
|
+
for (const line of yLines) {
|
|
4628
|
+
const text = mergeLineSimple(line);
|
|
4629
|
+
if (!text.trim()) continue;
|
|
4630
|
+
const bbox = computeBBox(line, pageNum);
|
|
4631
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
|
|
4029
4632
|
}
|
|
4030
|
-
|
|
4031
|
-
|
|
4032
|
-
|
|
4033
|
-
|
|
4034
|
-
|
|
4633
|
+
}
|
|
4634
|
+
blocks.sort((a, b) => {
|
|
4635
|
+
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4636
|
+
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4637
|
+
return by - ay;
|
|
4638
|
+
});
|
|
4639
|
+
} else {
|
|
4640
|
+
const allYLines = groupByY(items);
|
|
4641
|
+
const columns = detectColumns(allYLines);
|
|
4642
|
+
if (columns && columns.length >= 3) {
|
|
4643
|
+
const tableText = extractWithColumns(allYLines, columns);
|
|
4644
|
+
const bbox = computeBBox(items, pageNum);
|
|
4645
|
+
blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
|
|
4035
4646
|
} else {
|
|
4036
4647
|
const allY = items.map((i) => i.y);
|
|
4037
|
-
const pageHeight =
|
|
4648
|
+
const pageHeight = safeMax(allY) - safeMin(allY);
|
|
4038
4649
|
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
4039
4650
|
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
4040
4651
|
for (const group of orderedGroups) {
|
|
@@ -4087,22 +4698,76 @@ function dominantStyle(items) {
|
|
|
4087
4698
|
return { fontSize: dominantSize, fontName };
|
|
4088
4699
|
}
|
|
4089
4700
|
/**
 * Convert raw text items into normalized, de-duplicated glyph records.
 *
 * Each raw item is expected to expose `str`, `transform`, `width`, `height`
 * and optionally `fontName` (presumably pdf.js text-content items — confirm
 * against the caller). Whitespace-only items are not emitted; their positions
 * are remembered and used to flag the item that follows them on the same line
 * with `hasSpaceBefore`.
 *
 * @param {Array<object>} rawItems - Raw text items.
 * @returns {Array<object>} Records `{ text, x, y, w, h, fontSize, fontName,
 *   isHidden, hasSpaceBefore? }`, sorted by descending `y` then ascending `x`.
 */
function normalizeItems(rawItems) {
  const glyphs = [];
  const blanks = [];

  for (const raw of rawItems) {
    if (typeof raw.str !== "string") continue;

    const x = Math.round(raw.transform[4]);
    const y = Math.round(raw.transform[5]);
    if (!raw.str.trim()) {
      // Whitespace-only item: keep only where it was.
      blanks.push({ x, y });
      continue;
    }

    const scaleY = Math.abs(raw.transform[3]);
    const scaleX = Math.abs(raw.transform[0]);
    const fontSize = Math.round(Math.max(scaleY, scaleX));
    const w = Math.round(raw.width);
    const h = Math.round(raw.height);
    const isHidden = fontSize === 0 || (raw.width === 0 && raw.str.trim().length > 0);
    const fontName = raw.fontName || "";

    let text = raw.str.trim();
    // Digit/punctuation-only strings that contain spaces have the spaces removed.
    const looksNumeric = /^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text);
    if (looksNumeric) {
      text = text.replace(/ /g, "");
    }

    const pieces = splitEvenSpacedItem(text, x, w, fontSize);
    if (!pieces) {
      glyphs.push({ text, x, y, w, h, fontSize, fontName, isHidden });
      continue;
    }
    for (const piece of pieces) {
      glyphs.push({ text: piece.text, x: piece.x, y, w: piece.w, h, fontSize, fontName, isHidden });
    }
  }

  // Top-to-bottom (descending y), then left-to-right.
  glyphs.sort((a, b) => b.y - a.y || a.x - b.x);

  // Drop items that repeat the same text within 3 units in both x and y of an
  // already-kept item. Scanning backwards stops once kept items are more than
  // 3 units above the candidate.
  const unique = [];
  for (const candidate of glyphs) {
    let duplicate = false;
    for (let k = unique.length - 1; k >= 0 && !(unique[k].y - candidate.y > 3); k--) {
      const kept = unique[k];
      const sameLine = Math.abs(kept.y - candidate.y) <= 3;
      const sameSpot = Math.abs(kept.x - candidate.x) <= 3;
      if (sameLine && kept.text === candidate.text && sameSpot) {
        duplicate = true;
        break;
      }
    }
    if (!duplicate) unique.push(candidate);
  }

  // Flag items that have a whitespace item 0..20 units to their left on the same line.
  if (blanks.length > 0) {
    for (const glyph of unique) {
      const precededBySpace = blanks.some((blank) => {
        if (Math.abs(blank.y - glyph.y) > 3) return false;
        const dist = glyph.x - blank.x;
        return dist >= 0 && dist <= 20;
      });
      if (precededBySpace) glyph.hasSpaceBefore = true;
    }
  }

  return unique;
}
|
|
4759
|
+
/**
 * Split an evenly spaced run such as "가 나 다" into one positioned piece per
 * character.
 *
 * Only text made of three or more single Hangul syllables or digits separated
 * by single spaces qualifies. The item width is divided equally between the
 * characters to estimate each piece's x position.
 *
 * @param {string} text - Trimmed item text.
 * @param {number} itemX - Left x coordinate of the item.
 * @param {number} itemW - Width of the item.
 * @param {number} fontSize - Font size of the item.
 * @returns {Array<{text: string, x: number, w: number}>|null} The pieces, or
 *   `null` when the text does not qualify, the width is not a positive number,
 *   or the per-character slot is wider than twice the font size.
 */
function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
  if (!/^[가-힣\d](?: [가-힣\d]){2,}$/.test(text)) return null;
  // A zero or NaN width cannot be distributed: every piece would land on the
  // same (or a NaN) x, so repeated characters would look like duplicates.
  if (!(itemW > 0)) return null;

  // The pattern above guarantees at least three single-character tokens.
  const chars = text.split(" ");
  const charW = itemW / chars.length;
  if (charW > fontSize * 2) return null;

  return chars.map((ch, idx) => ({
    text: ch,
    x: Math.round(itemX + idx * charW),
    // The actual glyph is narrower than the slot it is spaced in.
    w: Math.round(charW * 0.8)
  }));
}
|
|
4107
4772
|
function groupByY(items) {
|
|
4108
4773
|
if (items.length === 0) return [];
|
|
@@ -4127,14 +4792,14 @@ function isProseSpread(items) {
|
|
|
4127
4792
|
for (let i = 1; i < sorted.length; i++) {
|
|
4128
4793
|
gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
|
|
4129
4794
|
}
|
|
4130
|
-
const maxGap =
|
|
4795
|
+
const maxGap = safeMax(gaps);
|
|
4131
4796
|
const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
|
|
4132
4797
|
return maxGap < 40 && avgLen < 5;
|
|
4133
4798
|
}
|
|
4134
4799
|
function detectColumns(yLines) {
|
|
4135
4800
|
const allItems = yLines.flat();
|
|
4136
4801
|
if (allItems.length === 0) return null;
|
|
4137
|
-
const pageWidth =
|
|
4802
|
+
const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
|
|
4138
4803
|
if (pageWidth < 100) return null;
|
|
4139
4804
|
let bigoLineIdx = -1;
|
|
4140
4805
|
for (let i = 0; i < yLines.length; i++) {
|
|
@@ -4166,7 +4831,7 @@ function detectColumns(yLines) {
|
|
|
4166
4831
|
}
|
|
4167
4832
|
const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
|
|
4168
4833
|
if (peaks.length < 3) return null;
|
|
4169
|
-
const MERGE_TOL =
|
|
4834
|
+
const MERGE_TOL = 40;
|
|
4170
4835
|
const merged = [peaks[0]];
|
|
4171
4836
|
for (let i = 1; i < peaks.length; i++) {
|
|
4172
4837
|
const prev = merged[merged.length - 1];
|
|
@@ -4180,7 +4845,14 @@ function detectColumns(yLines) {
|
|
|
4180
4845
|
merged.push({ ...peaks[i] });
|
|
4181
4846
|
}
|
|
4182
4847
|
}
|
|
4183
|
-
const
|
|
4848
|
+
const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
|
|
4849
|
+
if (rawColumns.length < 3) return null;
|
|
4850
|
+
const MIN_DETECT_COL_WIDTH = 30;
|
|
4851
|
+
const columns = [rawColumns[0]];
|
|
4852
|
+
for (let i = 1; i < rawColumns.length; i++) {
|
|
4853
|
+
if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
|
|
4854
|
+
columns.push(rawColumns[i]);
|
|
4855
|
+
}
|
|
4184
4856
|
return columns.length >= 3 ? columns : null;
|
|
4185
4857
|
}
|
|
4186
4858
|
function findColumn(x, columns) {
|
|
@@ -4308,6 +4980,16 @@ function buildGridTable(lines, columns) {
|
|
|
4308
4980
|
}
|
|
4309
4981
|
merged.splice(0, headerEnd, headerRow);
|
|
4310
4982
|
}
|
|
4983
|
+
for (const row of merged) {
|
|
4984
|
+
for (let c = 0; c < row.length; c++) {
|
|
4985
|
+
if (row[c]) row[c] = collapseEvenSpacing(row[c]);
|
|
4986
|
+
}
|
|
4987
|
+
}
|
|
4988
|
+
const totalCells = merged.length * numCols;
|
|
4989
|
+
const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
|
|
4990
|
+
if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
|
|
4991
|
+
return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
|
|
4992
|
+
}
|
|
4311
4993
|
const md = [];
|
|
4312
4994
|
md.push("| " + merged[0].join(" | ") + " |");
|
|
4313
4995
|
md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
|
|
@@ -4319,12 +5001,32 @@ function buildGridTable(lines, columns) {
|
|
|
4319
5001
|
function mergeLineSimple(items) {
|
|
4320
5002
|
if (items.length <= 1) return items[0]?.text || "";
|
|
4321
5003
|
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
5004
|
+
const isEvenSpaced = detectEvenSpacedItems(sorted);
|
|
4322
5005
|
let result = sorted[0].text;
|
|
4323
5006
|
for (let i = 1; i < sorted.length; i++) {
|
|
4324
5007
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
4325
5008
|
const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
|
|
4326
|
-
|
|
4327
|
-
|
|
5009
|
+
const tabThreshold = Math.max(avgFs * 2, 30);
|
|
5010
|
+
if (gap > tabThreshold) {
|
|
5011
|
+
result += " ";
|
|
5012
|
+
result += sorted[i].text;
|
|
5013
|
+
continue;
|
|
5014
|
+
}
|
|
5015
|
+
if (isEvenSpaced[i]) {
|
|
5016
|
+
result += sorted[i].text;
|
|
5017
|
+
continue;
|
|
5018
|
+
}
|
|
5019
|
+
if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
|
|
5020
|
+
result += " ";
|
|
5021
|
+
result += sorted[i].text;
|
|
5022
|
+
continue;
|
|
5023
|
+
}
|
|
5024
|
+
if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
|
|
5025
|
+
result += " ";
|
|
5026
|
+
result += sorted[i].text;
|
|
5027
|
+
continue;
|
|
5028
|
+
}
|
|
5029
|
+
if (gap < avgFs * 0.15) {
|
|
4328
5030
|
} else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
|
|
4329
5031
|
} else if (gap > 3) result += " ";
|
|
4330
5032
|
result += sorted[i].text;
|
|
@@ -4333,8 +5035,8 @@ function mergeLineSimple(items) {
|
|
|
4333
5035
|
}
|
|
4334
5036
|
/**
 * Clean extracted PDF text: strip lines that look like page numbers, merge
 * lines via `mergeKoreanLines`, collapse evenly spaced text and squeeze blank
 * lines.
 *
 * @param {string} text - Raw extracted text.
 * @returns {string} Cleaned, trimmed text.
 */
function cleanPdfText(text) {
  // Applied in order before line merging.
  const pageNoiseRules = [
    // Leading 1-4 digit line at the very start of the text.
    [/^\d{1,4}\n/, ""],
    // Dash-decorated numbers such as "- 3 -".
    [/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, ""],
    // "3 / 10" style counters.
    [/^\s*\d+\s*\/\s*\d+\s*$/gm, ""],
    // Bare 1-4 digit lines in the middle and at the end.
    [/\n\d{1,4}\n/g, "\n"],
    [/\n\d{1,4}$/, ""],
    // Headings that contain only a number.
    [/^#{1,6}\s*\d{1,4}\s*$/gm, ""]
  ];
  const withoutPageNoise = pageNoiseRules.reduce(
    (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
    text
  );

  const merged = mergeKoreanLines(withoutPageNoise);

  // Every line except table separator rows goes through collapseEvenSpacing.
  const collapsed = merged.replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line));
  // Re-join two Hangul syllables that were spaced apart right after a bullet marker.
  const bulletsFixed = collapsed.replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3");
  const squeezed = bulletsFixed.replace(/\n{3,}/g, "\n\n");
  return squeezed.trim();
}
|
|
4339
5041
|
function startsWithMarker(line) {
|
|
4340
5042
|
const t = line.trimStart();
|
|
@@ -4526,7 +5228,7 @@ function mergeKoreanLines(text) {
|
|
|
4526
5228
|
result[result.length - 1] = prev + " " + currTrimmed;
|
|
4527
5229
|
continue;
|
|
4528
5230
|
}
|
|
4529
|
-
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
|
|
5231
|
+
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev) && !startsWithMarker(prev)) {
|
|
4530
5232
|
result[result.length - 1] = prev + " " + curr;
|
|
4531
5233
|
} else {
|
|
4532
5234
|
result.push(curr);
|
|
@@ -4574,7 +5276,7 @@ function getTextContent(el) {
|
|
|
4574
5276
|
return el.textContent?.trim() ?? "";
|
|
4575
5277
|
}
|
|
4576
5278
|
/**
 * Parse an XML string into a DOM document after passing it through `stripDtd`.
 *
 * @param {string} text - XML source.
 * @returns {Document} Document produced by `DOMParser2` with MIME type "text/xml".
 */
function parseXml(text) {
  const parser = new DOMParser2();
  const sanitized = stripDtd(text);
  return parser.parseFromString(sanitized, "text/xml");
}
|
|
4579
5281
|
function parseSharedStrings(xml) {
|
|
4580
5282
|
const doc = parseXml(xml);
|
|
@@ -4861,7 +5563,7 @@ function getAttr(el, localName) {
|
|
|
4861
5563
|
return null;
|
|
4862
5564
|
}
|
|
4863
5565
|
/**
 * Parse an XML string into a DOM document after passing it through `stripDtd`.
 *
 * @param {string} text - XML source.
 * @returns {Document} Document produced by `DOMParser3` with MIME type "text/xml".
 */
function parseXml2(text) {
  const parser = new DOMParser3();
  const sanitized = stripDtd(text);
  return parser.parseFromString(sanitized, "text/xml");
}
|
|
4866
5568
|
function parseStyles(xml) {
|
|
4867
5569
|
const doc = parseXml2(xml);
|
|
@@ -5261,7 +5963,13 @@ function normalize(s) {
|
|
|
5261
5963
|
}
|
|
5262
5964
|
var MAX_LEVENSHTEIN_LEN = 1e4;
|
|
5263
5965
|
function levenshtein(a, b) {
|
|
5264
|
-
if (a.length + b.length > MAX_LEVENSHTEIN_LEN)
|
|
5966
|
+
if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
|
|
5967
|
+
const sampleLen = Math.min(500, a.length, b.length);
|
|
5968
|
+
let diffs = 0;
|
|
5969
|
+
for (let i = 0; i < sampleLen; i++) if (a[i] !== b[i]) diffs++;
|
|
5970
|
+
const sampleRate = sampleLen > 0 ? diffs / sampleLen : 1;
|
|
5971
|
+
return Math.abs(a.length - b.length) + Math.round(Math.min(a.length, b.length) * sampleRate);
|
|
5972
|
+
}
|
|
5265
5973
|
if (a.length > b.length) [a, b] = [b, a];
|
|
5266
5974
|
const m = a.length;
|
|
5267
5975
|
const n = b.length;
|
|
@@ -5544,13 +6252,20 @@ function extractInlineFields(text) {
|
|
|
5544
6252
|
|
|
5545
6253
|
// src/hwpx/generator.ts
|
|
5546
6254
|
import JSZip5 from "jszip";
|
|
5547
|
-
var
|
|
6255
|
+
// XML namespace URIs used by the HWPX generator below.

// Packaging namespaces (container.xml and content.hpf).
var NS_OCF = "urn:oasis:names:tc:opendocument:xmlns:container";
var NS_OPF = "http://www.idpf.org/2007/opf/";
var NS_HPF = "http://www.hancom.co.kr/schema/2011/hpf";

// Document namespaces (header.xml and section XML).
var NS_HEAD = "http://www.hancom.co.kr/hwpml/2011/head";
var NS_SECTION = "http://www.hancom.co.kr/hwpml/2011/section";
var NS_PARA = "http://www.hancom.co.kr/hwpml/2011/paragraph";
|
|
5548
6261
|
/**
 * Build an HWPX package (a zip archive) from Markdown text.
 *
 * The archive holds, in this order: `mimetype` (stored uncompressed),
 * `META-INF/container.xml`, `Contents/content.hpf`, `Contents/header.xml`
 * and `Contents/section0.xml`.
 *
 * @param {string} markdown - Markdown source.
 * @returns {Promise<ArrayBuffer>} The generated archive.
 */
async function markdownToHwpx(markdown) {
  const sectionXml = blocksToSectionXml(parseMarkdownToBlocks(markdown));

  const archive = new JSZip5();
  // The mimetype entry is added first and left uncompressed.
  archive.file("mimetype", "application/hwp+zip", { compression: "STORE" });
  archive.file("META-INF/container.xml", generateContainerXml());
  archive.file("Contents/content.hpf", generateManifest());
  archive.file("Contents/header.xml", generateHeaderXml());
  archive.file("Contents/section0.xml", sectionXml);

  const buffer = await archive.generateAsync({ type: "arraybuffer" });
  return buffer;
}
|
|
@@ -5595,8 +6310,111 @@ function parseMarkdownToBlocks(md) {
|
|
|
5595
6310
|
/**
 * Escape the characters that are unsafe in XML text and double-quoted
 * attribute values.
 *
 * `&` is replaced first so the ampersands introduced by the later
 * replacements are not escaped a second time.
 *
 * @param {string} text - Raw text.
 * @returns {string} Text with `&`, `<`, `>` and `"` replaced by entity references.
 */
function escapeXml(text) {
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
}
|
|
6313
|
+
/**
 * Produce the XML for `META-INF/container.xml`, which names
 * `Contents/content.hpf` as the package root file.
 *
 * @returns {string} XML document text.
 */
function generateContainerXml() {
  const lines = [
    '<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>',
    `<ocf:container xmlns:ocf="${NS_OCF}" xmlns:hpf="${NS_HPF}">`,
    "<ocf:rootfiles>",
    '<ocf:rootfile full-path="Contents/content.hpf" media-type="application/hwpml-package+xml"/>',
    "</ocf:rootfiles>",
    "</ocf:container>"
  ];
  return lines.join("\n");
}
|
|
6321
|
+
/**
 * Produce the XML for `Contents/content.hpf`: a manifest listing the header
 * and the single section, plus a spine that references both.
 *
 * @returns {string} XML document text.
 */
function generateManifest() {
  const manifestItems = [
    '<opf:item id="header" href="Contents/header.xml" media-type="application/xml"/>',
    '<opf:item id="section0" href="Contents/section0.xml" media-type="application/xml"/>'
  ];
  const spineRefs = [
    '<opf:itemref idref="header" linear="no"/>',
    '<opf:itemref idref="section0" linear="yes"/>'
  ];
  return [
    '<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>',
    `<opf:package xmlns:opf="${NS_OPF}" xmlns:hpf="${NS_HPF}" xmlns:hh="${NS_HEAD}">`,
    "<opf:manifest>",
    ...manifestItems,
    "</opf:manifest>",
    "<opf:spine>",
    ...spineRefs,
    "</opf:spine>",
    "</opf:package>"
  ].join("\n");
}
|
|
6334
|
+
/**
 * Produce the XML for `Contents/header.xml`.
 *
 * The header declares seven font faces (one per script category), one border
 * fill, one character property set, one paragraph property set and one
 * paragraph style, all with id 0.
 *
 * @returns {string} XML document text.
 */
function generateHeaderXml() {
  const GOTHIC_WIDE = '<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="4" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>';
  const GOTHIC_PLAIN = '<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>';
  const OLDSTYLE = '<hh:typeInfo familyType="FCAT_OLDSTYLE" weight="5" proportion="4" contrast="2" strokeVariation="0" armStyle="0" letterform="0" midline="0" xHeight="4"/>';

  // One <hh:fontface> element holding a single font with id 0.
  const fontface = (lang, face, typeInfo) => [
    `<hh:fontface lang="${lang}" fontCnt="1">`,
    `<hh:font id="0" face="${face}" type="TTF" isEmbedded="0">`,
    typeInfo,
    "</hh:font>",
    "</hh:fontface>"
  ];

  // An element carrying the same value for each of the seven script categories.
  const scripts = ["hangul", "latin", "hanja", "japanese", "other", "symbol", "user"];
  const perScript = (tag, value) =>
    `<hh:${tag} ${scripts.map((script) => `${script}="${value}"`).join(" ")}/>`;

  const lines = [
    '<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>',
    `<hh:head xmlns:hh="${NS_HEAD}" xmlns:hp="${NS_PARA}" version="1.4" secCnt="1">`,
    '<hh:beginNum page="1" footnote="1" endnote="1" pic="1" tbl="1" equation="1"/>',
    "<hh:refList>",
    '<hh:fontfaces itemCnt="7">',
    ...fontface("HANGUL", "\uD568\uCD08\uB86C\uBC14\uD0D5", GOTHIC_WIDE),
    ...fontface("LATIN", "Times New Roman", OLDSTYLE),
    ...fontface("HANJA", "\uD568\uCD08\uB86C\uBC14\uD0D5", GOTHIC_WIDE),
    ...fontface("JAPANESE", "\uAD74\uB9BC", GOTHIC_PLAIN),
    ...fontface("OTHER", "\uAD74\uB9BC", GOTHIC_PLAIN),
    ...fontface("SYMBOL", "Symbol", GOTHIC_PLAIN),
    ...fontface("USER", "\uAD74\uB9BC", GOTHIC_PLAIN),
    "</hh:fontfaces>",
    '<hh:borderFills itemCnt="1">',
    '<hh:borderFill id="0" threeD="0" shadow="0" centerLine="0" breakCellSeparateLine="0">',
    '<hh:slash type="NONE" Crooked="0" isCounter="0"/>',
    '<hh:backSlash type="NONE" Crooked="0" isCounter="0"/>',
    '<hh:leftBorder type="NONE" width="0.1mm" color="0"/>',
    '<hh:rightBorder type="NONE" width="0.1mm" color="0"/>',
    '<hh:topBorder type="NONE" width="0.1mm" color="0"/>',
    '<hh:bottomBorder type="NONE" width="0.1mm" color="0"/>',
    '<hh:diagonal type="NONE" width="0.1mm" color="0"/>',
    "<hh:fillInfo/>",
    "</hh:borderFill>",
    "</hh:borderFills>",
    '<hh:charProperties itemCnt="1">',
    '<hh:charPr id="0" height="1000" textColor="0" shadeColor="-1" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="0">',
    perScript("fontRef", "0"),
    perScript("ratio", "100"),
    perScript("spacing", "0"),
    perScript("relSz", "100"),
    perScript("offset", "0"),
    "</hh:charPr>",
    "</hh:charProperties>",
    '<hh:tabProperties itemCnt="0"/>',
    '<hh:numberings itemCnt="0"/>',
    '<hh:bullets itemCnt="0"/>',
    '<hh:paraProperties itemCnt="1">',
    '<hh:paraPr id="0" tabIDRef="0" condense="0" fontLineHeight="0" snapToGrid="0" suppressOverlap="0" checked="0">',
    '<hh:parLineBreak lineBreak="BREAK_LINE" wordBreak="BREAK_WORD" breakLatinWord="BREAK_WORD" breakNonLatinWord="BREAK_WORD"/>',
    '<hh:parMargin left="0" right="0" prev="0" next="0" indent="0"/>',
    '<hh:parBorder borderFillIDRef="0" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>',
    '<hh:parShade borderFillIDRef="0"/>',
    "<hh:parTabList/>",
    "</hh:paraPr>",
    "</hh:paraProperties>",
    '<hh:styles itemCnt="1">',
    '<hh:style id="0" type="PARA" name="\uBC14\uD0D5\uAE00" engName="Normal" paraPrIDRef="0" charPrIDRef="0" nextStyleIDRef="0" langIDRef="1042" lockForm="0"/>',
    "</hh:styles>",
    "</hh:refList>",
    '<hh:compatibleDocument targetProgram="HWP2018"/>',
    "</hh:head>"
  ];
  return lines.join("\n");
}
|
|
5598
6416
|
/**
 * Wrap text in a single-run paragraph element that references paragraph
 * property 0, style 0 and character property 0.
 *
 * @param {string} text - Paragraph text; passed through `escapeXml`.
 * @returns {string} The `<hp:p>` element markup.
 */
function generateParagraph(text) {
  const run = `<hp:run charPrIDRef="0"><hp:t>${escapeXml(text)}</hp:t></hp:run>`;
  return `<hp:p paraPrIDRef="0" styleIDRef="0">${run}</hp:p>`;
}
|
|
5601
6419
|
function generateTable(rows) {
|
|
5602
6420
|
const trElements = rows.map((row) => {
|
|
@@ -5620,22 +6438,11 @@ function blocksToSectionXml(blocks) {
|
|
|
5620
6438
|
return "";
|
|
5621
6439
|
}
|
|
5622
6440
|
}).join("\n ");
|
|
5623
|
-
return `<?xml version="1.0" encoding="UTF-8"?>
|
|
5624
|
-
<hs:sec xmlns:hs="${
|
|
6441
|
+
return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
|
6442
|
+
<hs:sec xmlns:hs="${NS_SECTION}" xmlns:hp="${NS_PARA}">
|
|
5625
6443
|
${body}
|
|
5626
6444
|
</hs:sec>`;
|
|
5627
6445
|
}
|
|
5628
|
-
function generateManifest() {
|
|
5629
|
-
return `<?xml version="1.0" encoding="UTF-8"?>
|
|
5630
|
-
<opf:package xmlns:opf="http://www.idpf.org/2007/opf">
|
|
5631
|
-
<opf:manifest>
|
|
5632
|
-
<opf:item id="s0" href="section0.xml" media-type="application/xml"/>
|
|
5633
|
-
</opf:manifest>
|
|
5634
|
-
<opf:spine>
|
|
5635
|
-
<opf:itemref idref="s0"/>
|
|
5636
|
-
</opf:spine>
|
|
5637
|
-
</opf:package>`;
|
|
5638
|
-
}
|
|
5639
6446
|
|
|
5640
6447
|
// src/index.ts
|
|
5641
6448
|
async function parse(input, options) {
|