kordoc 2.0.3 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +302 -291
- package/dist/chunk-5Y2Q3BRW.js +52 -0
- package/dist/chunk-5Y2Q3BRW.js.map +1 -0
- package/dist/{chunk-4UH6ABAY.js → chunk-LYFG7AUT.js} +971 -223
- package/dist/chunk-LYFG7AUT.js.map +1 -0
- package/dist/{chunk-3TBUDJDE.js → chunk-MOL7MDBG.js} +1 -1
- package/dist/chunk-MOL7MDBG.js.map +1 -0
- package/dist/cli.js +13 -9
- package/dist/cli.js.map +1 -1
- package/dist/detect-GYK3HKD5.js +18 -0
- package/dist/index.cjs +996 -189
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +996 -189
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +41 -12
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-737B4EZW.js +8 -0
- package/dist/{provider-EU3CG724.js → provider-7H4CPZYS.js} +2 -1
- package/dist/provider-7H4CPZYS.js.map +1 -0
- package/dist/{watch-QD3PDNXQ.js → watch-Q5OXA73S.js} +38 -18
- package/dist/watch-Q5OXA73S.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-25TXW6EP.js +0 -93
- package/dist/chunk-25TXW6EP.js.map +0 -1
- package/dist/chunk-3TBUDJDE.js.map +0 -1
- package/dist/chunk-4UH6ABAY.js.map +0 -1
- package/dist/page-range-OF5I4PQY.js +0 -8
- package/dist/provider-EU3CG724.js.map +0 -1
- package/dist/utils-BTZ4WSYX.js +0 -22
- package/dist/watch-QD3PDNXQ.js.map +0 -1
- /package/dist/{page-range-OF5I4PQY.js.map → detect-GYK3HKD5.js.map} +0 -0
- /package/dist/{utils-BTZ4WSYX.js.map → page-range-737B4EZW.js.map} +0 -0
|
@@ -1,53 +1,105 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import {
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
precheckZipSize,
|
|
7
|
-
sanitizeHref,
|
|
8
|
-
toArrayBuffer
|
|
9
|
-
} from "./chunk-25TXW6EP.js";
|
|
3
|
+
detectFormat,
|
|
4
|
+
detectZipFormat
|
|
5
|
+
} from "./chunk-5Y2Q3BRW.js";
|
|
10
6
|
import {
|
|
11
7
|
parsePageRange
|
|
12
|
-
} from "./chunk-
|
|
8
|
+
} from "./chunk-MOL7MDBG.js";
|
|
13
9
|
|
|
14
|
-
// src/
|
|
15
|
-
|
|
16
|
-
function
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
return b[0] === 80 && b[1] === 75 && b[2] === 3 && b[3] === 4;
|
|
22
|
-
}
|
|
23
|
-
function isOldHwpFile(buffer) {
|
|
24
|
-
const b = magicBytes(buffer);
|
|
25
|
-
return b[0] === 208 && b[1] === 207 && b[2] === 17 && b[3] === 224;
|
|
10
|
+
// src/utils.ts
|
|
11
|
+
var VERSION = true ? "2.2.0" : "0.0.0-dev";
|
|
12
|
+
function toArrayBuffer(buf) {
|
|
13
|
+
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
14
|
+
return buf.buffer;
|
|
15
|
+
}
|
|
16
|
+
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
26
17
|
}
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
18
|
+
var KordocError = class extends Error {
|
|
19
|
+
constructor(message) {
|
|
20
|
+
super(message);
|
|
21
|
+
this.name = "KordocError";
|
|
22
|
+
}
|
|
23
|
+
};
|
|
24
|
+
function sanitizeError(err) {
|
|
25
|
+
if (err instanceof KordocError) return err.message;
|
|
26
|
+
return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
|
|
30
27
|
}
|
|
31
|
-
function
|
|
32
|
-
if (
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
return "unknown";
|
|
28
|
+
function isPathTraversal(name) {
|
|
29
|
+
if (name.includes("\0")) return true;
|
|
30
|
+
const normalized = name.replace(/\\/g, "/");
|
|
31
|
+
const segments = normalized.split("/");
|
|
32
|
+
return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
37
33
|
}
|
|
38
|
-
|
|
34
|
+
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
39
35
|
try {
|
|
40
|
-
const
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
36
|
+
const data = new DataView(buffer);
|
|
37
|
+
const len = buffer.byteLength;
|
|
38
|
+
let eocdOffset = -1;
|
|
39
|
+
for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
|
|
40
|
+
if (data.getUint32(i, true) === 101010256) {
|
|
41
|
+
eocdOffset = i;
|
|
42
|
+
break;
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
46
|
+
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
47
|
+
if (entryCount > maxEntries) {
|
|
48
|
+
throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
|
|
49
|
+
}
|
|
50
|
+
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
51
|
+
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
52
|
+
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
53
|
+
let totalUncompressed = 0;
|
|
54
|
+
let pos = cdOffset;
|
|
55
|
+
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
56
|
+
if (data.getUint32(pos, true) !== 33639248) break;
|
|
57
|
+
totalUncompressed += data.getUint32(pos + 24, true);
|
|
58
|
+
const nameLen = data.getUint16(pos + 28, true);
|
|
59
|
+
const extraLen = data.getUint16(pos + 30, true);
|
|
60
|
+
const commentLen = data.getUint16(pos + 32, true);
|
|
61
|
+
pos += 46 + nameLen + extraLen + commentLen;
|
|
62
|
+
}
|
|
63
|
+
if (totalUncompressed > maxUncompressedSize) {
|
|
64
|
+
throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
|
|
65
|
+
}
|
|
66
|
+
return { totalUncompressed, entryCount };
|
|
67
|
+
} catch (err) {
|
|
68
|
+
if (err instanceof KordocError) throw err;
|
|
69
|
+
return { totalUncompressed: 0, entryCount: 0 };
|
|
49
70
|
}
|
|
50
71
|
}
|
|
72
|
+
function stripDtd(xml) {
|
|
73
|
+
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
74
|
+
}
|
|
75
|
+
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
76
|
+
function sanitizeHref(href) {
|
|
77
|
+
const trimmed = href.trim();
|
|
78
|
+
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
79
|
+
return trimmed;
|
|
80
|
+
}
|
|
81
|
+
function safeMin(arr) {
|
|
82
|
+
let min = Infinity;
|
|
83
|
+
for (let i = 0; i < arr.length; i++) if (arr[i] < min) min = arr[i];
|
|
84
|
+
return min;
|
|
85
|
+
}
|
|
86
|
+
function safeMax(arr) {
|
|
87
|
+
let max = -Infinity;
|
|
88
|
+
for (let i = 0; i < arr.length; i++) if (arr[i] > max) max = arr[i];
|
|
89
|
+
return max;
|
|
90
|
+
}
|
|
91
|
+
function classifyError(err) {
|
|
92
|
+
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
93
|
+
const msg = err.message;
|
|
94
|
+
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
95
|
+
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
96
|
+
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
97
|
+
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
98
|
+
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
99
|
+
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
100
|
+
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
101
|
+
return "PARSE_ERROR";
|
|
102
|
+
}
|
|
51
103
|
|
|
52
104
|
// src/table/builder.ts
|
|
53
105
|
var MAX_COLS = 200;
|
|
@@ -110,6 +162,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
110
162
|
if (end > maxCols) maxCols = end;
|
|
111
163
|
}
|
|
112
164
|
}
|
|
165
|
+
if (maxCols > MAX_COLS) maxCols = MAX_COLS;
|
|
113
166
|
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
114
167
|
const grid = Array.from(
|
|
115
168
|
{ length: numRows },
|
|
@@ -119,7 +172,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
119
172
|
for (const cell of row) {
|
|
120
173
|
const r = cell.rowAddr ?? 0;
|
|
121
174
|
const c = cell.colAddr ?? 0;
|
|
122
|
-
if (r >= numRows || c >= maxCols) continue;
|
|
175
|
+
if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
|
|
123
176
|
grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
|
|
124
177
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
125
178
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
@@ -341,7 +394,7 @@ function tableToMarkdown(table) {
|
|
|
341
394
|
}
|
|
342
395
|
|
|
343
396
|
// src/hwpx/parser.ts
|
|
344
|
-
import
|
|
397
|
+
import JSZip from "jszip";
|
|
345
398
|
import { inflateRawSync } from "zlib";
|
|
346
399
|
import { DOMParser } from "@xmldom/xmldom";
|
|
347
400
|
|
|
@@ -403,7 +456,12 @@ function parseCharProperties(doc, map) {
|
|
|
403
456
|
if (!id) continue;
|
|
404
457
|
const prop = {};
|
|
405
458
|
const height = el.getAttribute("height");
|
|
406
|
-
if (height)
|
|
459
|
+
if (height) {
|
|
460
|
+
const parsedHeight = parseInt(height, 10);
|
|
461
|
+
if (!isNaN(parsedHeight) && parsedHeight > 0) {
|
|
462
|
+
prop.fontSize = parsedHeight / 100;
|
|
463
|
+
}
|
|
464
|
+
}
|
|
407
465
|
const bold = el.getAttribute("bold");
|
|
408
466
|
if (bold === "true" || bold === "1") prop.bold = true;
|
|
409
467
|
const italic = el.getAttribute("italic");
|
|
@@ -438,14 +496,11 @@ function parseStyleElements(doc, map) {
|
|
|
438
496
|
}
|
|
439
497
|
}
|
|
440
498
|
}
|
|
441
|
-
function stripDtd(xml) {
|
|
442
|
-
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
443
|
-
}
|
|
444
499
|
async function parseHwpxDocument(buffer, options) {
|
|
445
500
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
446
501
|
let zip;
|
|
447
502
|
try {
|
|
448
|
-
zip = await
|
|
503
|
+
zip = await JSZip.loadAsync(buffer);
|
|
449
504
|
} catch {
|
|
450
505
|
return extractFromBrokenZip(buffer);
|
|
451
506
|
}
|
|
@@ -543,7 +598,7 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
543
598
|
const data = await file.async("uint8array");
|
|
544
599
|
decompressed.total += data.length;
|
|
545
600
|
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
546
|
-
const ext = ref.includes(".") ? ref.split(".").pop() : "png";
|
|
601
|
+
const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
|
|
547
602
|
const mimeType = imageExtToMime(ext);
|
|
548
603
|
imageIndex++;
|
|
549
604
|
const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
|
|
@@ -608,7 +663,7 @@ function parseDublinCoreMetadata(xml, metadata) {
|
|
|
608
663
|
async function extractHwpxMetadataOnly(buffer) {
|
|
609
664
|
let zip;
|
|
610
665
|
try {
|
|
611
|
-
zip = await
|
|
666
|
+
zip = await JSZip.loadAsync(buffer);
|
|
612
667
|
} catch {
|
|
613
668
|
throw new KordocError("HWPX ZIP\uC744 \uC5F4 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
614
669
|
}
|
|
@@ -803,7 +858,8 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
803
858
|
if (newTable.rows.length > 0) {
|
|
804
859
|
if (tableStack.length > 0) {
|
|
805
860
|
const parentTable = tableStack.pop();
|
|
806
|
-
|
|
861
|
+
let nestedCols = 0;
|
|
862
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
807
863
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
808
864
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
809
865
|
} else {
|
|
@@ -850,8 +906,10 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
850
906
|
break;
|
|
851
907
|
case "cellSpan":
|
|
852
908
|
if (tableCtx?.cell) {
|
|
853
|
-
const
|
|
854
|
-
const
|
|
909
|
+
const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
|
|
910
|
+
const cs = isNaN(rawCs) ? 1 : rawCs;
|
|
911
|
+
const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
|
|
912
|
+
const rs = isNaN(rawRs) ? 1 : rawRs;
|
|
855
913
|
tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
|
|
856
914
|
tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
|
|
857
915
|
}
|
|
@@ -910,7 +968,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
910
968
|
if (newTable.rows.length > 0) {
|
|
911
969
|
if (tableStack.length > 0) {
|
|
912
970
|
const parentTable = tableStack.pop();
|
|
913
|
-
|
|
971
|
+
let nestedCols = 0;
|
|
972
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
914
973
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
915
974
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
916
975
|
} else {
|
|
@@ -943,6 +1002,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
943
1002
|
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
944
1003
|
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
945
1004
|
walkChildren(el, d + 1);
|
|
1005
|
+
} else if (localTag === "run") {
|
|
1006
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
|
|
946
1007
|
}
|
|
947
1008
|
}
|
|
948
1009
|
};
|
|
@@ -2006,6 +2067,7 @@ function parseLenientCfb(data) {
|
|
|
2006
2067
|
if (miniSectorSizeShift > 16) throw new Error("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uBBF8\uB2C8 \uC139\uD130 \uD06C\uAE30 \uC2DC\uD504\uD2B8: " + miniSectorSizeShift);
|
|
2007
2068
|
const miniSectorSize = 1 << miniSectorSizeShift;
|
|
2008
2069
|
const fatSectorCount = data.readUInt32LE(44);
|
|
2070
|
+
if (fatSectorCount > 1e4) throw new Error("FAT \uC139\uD130 \uC218\uAC00 \uB108\uBB34 \uB9CE\uC2B5\uB2C8\uB2E4: " + fatSectorCount);
|
|
2009
2071
|
const firstDirSector = data.readUInt32LE(48);
|
|
2010
2072
|
const miniStreamCutoff = data.readUInt32LE(56);
|
|
2011
2073
|
const firstMiniFatSector = data.readUInt32LE(60);
|
|
@@ -2394,10 +2456,14 @@ function findSections(cfb) {
|
|
|
2394
2456
|
}
|
|
2395
2457
|
function findSectionsLenient(lcfb, compressed) {
|
|
2396
2458
|
const sections = [];
|
|
2459
|
+
let totalDecompressed = 0;
|
|
2397
2460
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2398
2461
|
const raw = lcfb.findStream(`/BodyText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2399
2462
|
if (!raw) break;
|
|
2400
|
-
|
|
2463
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2464
|
+
totalDecompressed += content.length;
|
|
2465
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2466
|
+
sections.push({ idx: i, content });
|
|
2401
2467
|
}
|
|
2402
2468
|
if (sections.length === 0) {
|
|
2403
2469
|
for (const e of lcfb.entries()) {
|
|
@@ -2405,7 +2471,12 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2405
2471
|
if (e.name.startsWith("Section")) {
|
|
2406
2472
|
const idx = parseInt(e.name.replace("Section", ""), 10) || 0;
|
|
2407
2473
|
const raw = lcfb.findStream(e.name);
|
|
2408
|
-
if (raw)
|
|
2474
|
+
if (raw) {
|
|
2475
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2476
|
+
totalDecompressed += content.length;
|
|
2477
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2478
|
+
sections.push({ idx, content });
|
|
2479
|
+
}
|
|
2409
2480
|
}
|
|
2410
2481
|
}
|
|
2411
2482
|
}
|
|
@@ -2413,11 +2484,15 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2413
2484
|
}
|
|
2414
2485
|
function findViewTextSectionsLenient(lcfb, compressed) {
|
|
2415
2486
|
const sections = [];
|
|
2487
|
+
let totalDecompressed = 0;
|
|
2416
2488
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2417
2489
|
const raw = lcfb.findStream(`/ViewText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2418
2490
|
if (!raw) break;
|
|
2419
2491
|
try {
|
|
2420
|
-
|
|
2492
|
+
const content = decryptViewText(raw, compressed);
|
|
2493
|
+
totalDecompressed += content.length;
|
|
2494
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2495
|
+
sections.push({ idx: i, content });
|
|
2421
2496
|
} catch {
|
|
2422
2497
|
break;
|
|
2423
2498
|
}
|
|
@@ -2816,10 +2891,14 @@ function arrangeCells(rows, cols, cells) {
|
|
|
2816
2891
|
// src/pdf/line-detector.ts
|
|
2817
2892
|
import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
2818
2893
|
var ORIENTATION_TOL = 2;
|
|
2819
|
-
var MIN_LINE_LENGTH =
|
|
2820
|
-
var
|
|
2894
|
+
var MIN_LINE_LENGTH = 15;
|
|
2895
|
+
var MAX_LINE_WIDTH = 5;
|
|
2821
2896
|
var CONNECT_TOL = 5;
|
|
2822
2897
|
var CELL_PADDING = 2;
|
|
2898
|
+
var MIN_COL_WIDTH = 15;
|
|
2899
|
+
var MIN_ROW_HEIGHT = 6;
|
|
2900
|
+
var VERTEX_MERGE_FACTOR = 4;
|
|
2901
|
+
var MIN_COORD_MERGE_TOL = 8;
|
|
2823
2902
|
function extractLines(fnArray, argsArray) {
|
|
2824
2903
|
const horizontals = [];
|
|
2825
2904
|
const verticals = [];
|
|
@@ -2971,6 +3050,55 @@ function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
|
2971
3050
|
verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
|
|
2972
3051
|
}
|
|
2973
3052
|
}
|
|
3053
|
+
function preprocessLines(horizontals, verticals) {
|
|
3054
|
+
let h = horizontals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3055
|
+
let v = verticals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3056
|
+
h = mergeParallelLines(h, "h");
|
|
3057
|
+
v = mergeParallelLines(v, "v");
|
|
3058
|
+
return { horizontals: h, verticals: v };
|
|
3059
|
+
}
|
|
3060
|
+
function mergeParallelLines(lines, dir) {
|
|
3061
|
+
if (lines.length <= 1) return lines;
|
|
3062
|
+
const sorted = [...lines].sort((a, b) => {
|
|
3063
|
+
const posA = dir === "h" ? a.y1 : a.x1;
|
|
3064
|
+
const posB = dir === "h" ? b.y1 : b.x1;
|
|
3065
|
+
if (Math.abs(posA - posB) > 0.1) return posA - posB;
|
|
3066
|
+
return dir === "h" ? a.x1 - b.x1 : a.y1 - b.y1;
|
|
3067
|
+
});
|
|
3068
|
+
const MERGE_TOL = 3;
|
|
3069
|
+
const result = [sorted[0]];
|
|
3070
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
3071
|
+
const prev = result[result.length - 1];
|
|
3072
|
+
const curr = sorted[i];
|
|
3073
|
+
const prevPos = dir === "h" ? prev.y1 : prev.x1;
|
|
3074
|
+
const currPos = dir === "h" ? curr.y1 : curr.x1;
|
|
3075
|
+
if (Math.abs(prevPos - currPos) <= MERGE_TOL) {
|
|
3076
|
+
const prevStart = dir === "h" ? prev.x1 : prev.y1;
|
|
3077
|
+
const prevEnd = dir === "h" ? prev.x2 : prev.y2;
|
|
3078
|
+
const currStart = dir === "h" ? curr.x1 : curr.y1;
|
|
3079
|
+
const currEnd = dir === "h" ? curr.x2 : curr.y2;
|
|
3080
|
+
const overlap = Math.min(prevEnd, currEnd) - Math.max(prevStart, currStart);
|
|
3081
|
+
const minLen = Math.min(prevEnd - prevStart, currEnd - currStart);
|
|
3082
|
+
if (overlap > minLen * 0.3) {
|
|
3083
|
+
if (dir === "h") {
|
|
3084
|
+
prev.x1 = Math.min(prev.x1, curr.x1);
|
|
3085
|
+
prev.x2 = Math.max(prev.x2, curr.x2);
|
|
3086
|
+
prev.y1 = (prev.y1 + curr.y1) / 2;
|
|
3087
|
+
prev.y2 = prev.y1;
|
|
3088
|
+
} else {
|
|
3089
|
+
prev.y1 = Math.min(prev.y1, curr.y1);
|
|
3090
|
+
prev.y2 = Math.max(prev.y2, curr.y2);
|
|
3091
|
+
prev.x1 = (prev.x1 + curr.x1) / 2;
|
|
3092
|
+
prev.x2 = prev.x1;
|
|
3093
|
+
}
|
|
3094
|
+
prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
|
|
3095
|
+
continue;
|
|
3096
|
+
}
|
|
3097
|
+
}
|
|
3098
|
+
result.push(curr);
|
|
3099
|
+
}
|
|
3100
|
+
return result;
|
|
3101
|
+
}
|
|
2974
3102
|
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
2975
3103
|
const margin = 5;
|
|
2976
3104
|
return {
|
|
@@ -2982,8 +3110,49 @@ function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
|
2982
3110
|
)
|
|
2983
3111
|
};
|
|
2984
3112
|
}
|
|
3113
|
+
function buildVertices(horizontals, verticals) {
|
|
3114
|
+
const vertices = [];
|
|
3115
|
+
const tol = CONNECT_TOL;
|
|
3116
|
+
for (const h of horizontals) {
|
|
3117
|
+
for (const v of verticals) {
|
|
3118
|
+
if (v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol && h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol) {
|
|
3119
|
+
const radius = Math.max(h.lineWidth, v.lineWidth, 1);
|
|
3120
|
+
vertices.push({ x: v.x1, y: h.y1, radius });
|
|
3121
|
+
}
|
|
3122
|
+
}
|
|
3123
|
+
}
|
|
3124
|
+
return vertices;
|
|
3125
|
+
}
|
|
3126
|
+
function mergeVertices(vertices) {
|
|
3127
|
+
if (vertices.length <= 1) return vertices;
|
|
3128
|
+
const merged = [];
|
|
3129
|
+
const used = new Array(vertices.length).fill(false);
|
|
3130
|
+
for (let i = 0; i < vertices.length; i++) {
|
|
3131
|
+
if (used[i]) continue;
|
|
3132
|
+
let sumX = vertices[i].x, sumY = vertices[i].y;
|
|
3133
|
+
let maxRadius = vertices[i].radius;
|
|
3134
|
+
let count = 1;
|
|
3135
|
+
for (let j = i + 1; j < vertices.length; j++) {
|
|
3136
|
+
if (used[j]) continue;
|
|
3137
|
+
const mergeTol = VERTEX_MERGE_FACTOR * Math.max(maxRadius, vertices[j].radius);
|
|
3138
|
+
if (Math.abs(vertices[i].x - vertices[j].x) <= mergeTol && Math.abs(vertices[i].y - vertices[j].y) <= mergeTol) {
|
|
3139
|
+
sumX += vertices[j].x;
|
|
3140
|
+
sumY += vertices[j].y;
|
|
3141
|
+
maxRadius = Math.max(maxRadius, vertices[j].radius);
|
|
3142
|
+
count++;
|
|
3143
|
+
used[j] = true;
|
|
3144
|
+
}
|
|
3145
|
+
}
|
|
3146
|
+
merged.push({ x: sumX / count, y: sumY / count, radius: maxRadius });
|
|
3147
|
+
}
|
|
3148
|
+
return merged;
|
|
3149
|
+
}
|
|
2985
3150
|
function buildTableGrids(horizontals, verticals) {
|
|
2986
3151
|
if (horizontals.length < 2 || verticals.length < 2) return [];
|
|
3152
|
+
const allVertices = buildVertices(horizontals, verticals);
|
|
3153
|
+
const vertices = mergeVertices(allVertices);
|
|
3154
|
+
if (vertices.length < 4) return [];
|
|
3155
|
+
const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
|
|
2987
3156
|
const allLines = [
|
|
2988
3157
|
...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
|
|
2989
3158
|
...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
|
|
@@ -2994,21 +3163,74 @@ function buildTableGrids(horizontals, verticals) {
|
|
|
2994
3163
|
const hLines = group.filter((l) => l.type === "h");
|
|
2995
3164
|
const vLines = group.filter((l) => l.type === "v");
|
|
2996
3165
|
if (hLines.length < 2 || vLines.length < 2) continue;
|
|
2997
|
-
|
|
2998
|
-
const
|
|
2999
|
-
|
|
3000
|
-
|
|
3166
|
+
let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
|
|
3167
|
+
for (const l of vLines) {
|
|
3168
|
+
if (l.x1 < gx1) gx1 = l.x1;
|
|
3169
|
+
if (l.x1 > gx2) gx2 = l.x1;
|
|
3170
|
+
}
|
|
3171
|
+
for (const l of hLines) {
|
|
3172
|
+
if (l.y1 < gy1) gy1 = l.y1;
|
|
3173
|
+
if (l.y1 > gy2) gy2 = l.y1;
|
|
3174
|
+
}
|
|
3175
|
+
const groupBbox = {
|
|
3176
|
+
x1: gx1 - CONNECT_TOL,
|
|
3177
|
+
y1: gy1 - CONNECT_TOL,
|
|
3178
|
+
x2: gx2 + CONNECT_TOL,
|
|
3179
|
+
y2: gy2 + CONNECT_TOL
|
|
3180
|
+
};
|
|
3181
|
+
const groupVertices = vertices.filter(
|
|
3182
|
+
(v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
|
|
3183
|
+
);
|
|
3184
|
+
const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
|
|
3185
|
+
const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
|
|
3186
|
+
const rawYs = [
|
|
3187
|
+
...hLines.map((l) => l.y1),
|
|
3188
|
+
...groupVertices.map((v) => v.y)
|
|
3189
|
+
];
|
|
3190
|
+
const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
|
|
3191
|
+
const rawXs = [
|
|
3192
|
+
...vLines.map((l) => l.x1),
|
|
3193
|
+
...groupVertices.map((v) => v.x)
|
|
3194
|
+
];
|
|
3195
|
+
const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
|
|
3001
3196
|
if (rowYs.length < 2 || colXs.length < 2) continue;
|
|
3197
|
+
const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
|
|
3198
|
+
const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
|
|
3199
|
+
if (validRowYs.length < 2 || validColXs.length < 2) continue;
|
|
3002
3200
|
const bbox = {
|
|
3003
|
-
x1:
|
|
3004
|
-
y1:
|
|
3005
|
-
x2:
|
|
3006
|
-
y2:
|
|
3201
|
+
x1: validColXs[0],
|
|
3202
|
+
y1: validRowYs[validRowYs.length - 1],
|
|
3203
|
+
x2: validColXs[validColXs.length - 1],
|
|
3204
|
+
y2: validRowYs[0]
|
|
3007
3205
|
};
|
|
3008
|
-
grids.push({ rowYs, colXs, bbox });
|
|
3206
|
+
grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
|
|
3009
3207
|
}
|
|
3010
3208
|
return mergeAdjacentGrids(grids);
|
|
3011
3209
|
}
|
|
3210
|
+
function enforceMinWidth(colXs, minWidth) {
|
|
3211
|
+
if (colXs.length <= 2) return colXs;
|
|
3212
|
+
const result = [colXs[0]];
|
|
3213
|
+
for (let i = 1; i < colXs.length; i++) {
|
|
3214
|
+
const prevX = result[result.length - 1];
|
|
3215
|
+
if (colXs[i] - prevX < minWidth && i < colXs.length - 1) {
|
|
3216
|
+
continue;
|
|
3217
|
+
}
|
|
3218
|
+
result.push(colXs[i]);
|
|
3219
|
+
}
|
|
3220
|
+
return result;
|
|
3221
|
+
}
|
|
3222
|
+
function enforceMinHeight(rowYs, minHeight) {
|
|
3223
|
+
if (rowYs.length <= 2) return rowYs;
|
|
3224
|
+
const result = [rowYs[0]];
|
|
3225
|
+
for (let i = 1; i < rowYs.length; i++) {
|
|
3226
|
+
const prevY = result[result.length - 1];
|
|
3227
|
+
if (prevY - rowYs[i] < minHeight && i < rowYs.length - 1) {
|
|
3228
|
+
continue;
|
|
3229
|
+
}
|
|
3230
|
+
result.push(rowYs[i]);
|
|
3231
|
+
}
|
|
3232
|
+
return result;
|
|
3233
|
+
}
|
|
3012
3234
|
function mergeAdjacentGrids(grids) {
|
|
3013
3235
|
if (grids.length <= 1) return grids;
|
|
3014
3236
|
const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
@@ -3017,9 +3239,10 @@ function mergeAdjacentGrids(grids) {
|
|
|
3017
3239
|
const prev = merged[merged.length - 1];
|
|
3018
3240
|
const curr = sorted[i];
|
|
3019
3241
|
if (prev.colXs.length === curr.colXs.length) {
|
|
3020
|
-
const
|
|
3242
|
+
const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
|
|
3243
|
+
const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
|
|
3021
3244
|
const verticalGap = prev.bbox.y1 - curr.bbox.y2;
|
|
3022
|
-
if (colMatch && verticalGap >= -
|
|
3245
|
+
if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
|
|
3023
3246
|
const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
|
|
3024
3247
|
merged[merged.length - 1] = {
|
|
3025
3248
|
rowYs: allRowYs,
|
|
@@ -3029,7 +3252,8 @@ function mergeAdjacentGrids(grids) {
|
|
|
3029
3252
|
y1: Math.min(prev.bbox.y1, curr.bbox.y1),
|
|
3030
3253
|
x2: Math.max(prev.bbox.x2, curr.bbox.x2),
|
|
3031
3254
|
y2: Math.max(prev.bbox.y2, curr.bbox.y2)
|
|
3032
|
-
}
|
|
3255
|
+
},
|
|
3256
|
+
vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
|
|
3033
3257
|
};
|
|
3034
3258
|
continue;
|
|
3035
3259
|
}
|
|
@@ -3038,14 +3262,14 @@ function mergeAdjacentGrids(grids) {
|
|
|
3038
3262
|
}
|
|
3039
3263
|
return merged;
|
|
3040
3264
|
}
|
|
3041
|
-
function clusterCoordinates(values) {
|
|
3265
|
+
function clusterCoordinates(values, tolerance) {
|
|
3042
3266
|
if (values.length === 0) return [];
|
|
3043
3267
|
const sorted = [...values].sort((a, b) => a - b);
|
|
3044
3268
|
const clusters = [{ sum: sorted[0], count: 1 }];
|
|
3045
3269
|
for (let i = 1; i < sorted.length; i++) {
|
|
3046
3270
|
const last = clusters[clusters.length - 1];
|
|
3047
3271
|
const avg = last.sum / last.count;
|
|
3048
|
-
if (Math.abs(sorted[i] - avg) <=
|
|
3272
|
+
if (Math.abs(sorted[i] - avg) <= tolerance) {
|
|
3049
3273
|
last.sum += sorted[i];
|
|
3050
3274
|
last.count++;
|
|
3051
3275
|
} else {
|
|
@@ -3102,6 +3326,20 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3102
3326
|
const numRows = rowYs.length - 1;
|
|
3103
3327
|
const numCols = colXs.length - 1;
|
|
3104
3328
|
if (numRows <= 0 || numCols <= 0) return [];
|
|
3329
|
+
const vBorders = Array.from(
|
|
3330
|
+
{ length: numRows },
|
|
3331
|
+
(_, r) => Array.from(
|
|
3332
|
+
{ length: numCols + 1 },
|
|
3333
|
+
(_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
|
|
3334
|
+
)
|
|
3335
|
+
);
|
|
3336
|
+
const hBorders = Array.from(
|
|
3337
|
+
{ length: numRows + 1 },
|
|
3338
|
+
(_, r) => Array.from(
|
|
3339
|
+
{ length: numCols },
|
|
3340
|
+
(_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
|
|
3341
|
+
)
|
|
3342
|
+
);
|
|
3105
3343
|
const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
|
|
3106
3344
|
const cells = [];
|
|
3107
3345
|
for (let r = 0; r < numRows; r++) {
|
|
@@ -3109,18 +3347,26 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3109
3347
|
if (occupied[r][c]) continue;
|
|
3110
3348
|
let colSpan = 1;
|
|
3111
3349
|
let rowSpan = 1;
|
|
3112
|
-
while (c + colSpan < numCols) {
|
|
3113
|
-
|
|
3114
|
-
|
|
3115
|
-
|
|
3116
|
-
|
|
3350
|
+
while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
|
|
3351
|
+
let canExpand = true;
|
|
3352
|
+
for (let dr = 0; dr < rowSpan; dr++) {
|
|
3353
|
+
if (vBorders[r + dr][c + colSpan]) {
|
|
3354
|
+
canExpand = false;
|
|
3355
|
+
break;
|
|
3356
|
+
}
|
|
3357
|
+
}
|
|
3358
|
+
if (!canExpand) break;
|
|
3117
3359
|
colSpan++;
|
|
3118
3360
|
}
|
|
3119
3361
|
while (r + rowSpan < numRows) {
|
|
3120
|
-
|
|
3121
|
-
|
|
3122
|
-
|
|
3123
|
-
|
|
3362
|
+
let hasLine = false;
|
|
3363
|
+
for (let dc = 0; dc < colSpan; dc++) {
|
|
3364
|
+
if (hBorders[r + rowSpan][c + dc]) {
|
|
3365
|
+
hasLine = true;
|
|
3366
|
+
break;
|
|
3367
|
+
}
|
|
3368
|
+
}
|
|
3369
|
+
if (hasLine) break;
|
|
3124
3370
|
rowSpan++;
|
|
3125
3371
|
}
|
|
3126
3372
|
for (let dr = 0; dr < rowSpan; dr++) {
|
|
@@ -3144,28 +3390,30 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3144
3390
|
}
|
|
3145
3391
|
return cells;
|
|
3146
3392
|
}
|
|
3147
|
-
function hasVerticalLine(verticals, x, topY, botY) {
|
|
3148
|
-
const tol =
|
|
3393
|
+
function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
|
|
3394
|
+
const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
|
|
3149
3395
|
for (const v of verticals) {
|
|
3150
3396
|
if (Math.abs(v.x1 - x) <= tol) {
|
|
3151
3397
|
const cellH = Math.abs(topY - botY);
|
|
3398
|
+
if (cellH < 0.1) continue;
|
|
3152
3399
|
const overlapTop = Math.min(v.y2, topY);
|
|
3153
3400
|
const overlapBot = Math.max(v.y1, botY);
|
|
3154
3401
|
const overlap = overlapTop - overlapBot;
|
|
3155
|
-
if (overlap >= cellH * 0.
|
|
3402
|
+
if (overlap >= cellH * 0.75) return true;
|
|
3156
3403
|
}
|
|
3157
3404
|
}
|
|
3158
3405
|
return false;
|
|
3159
3406
|
}
|
|
3160
|
-
function hasHorizontalLine(horizontals, y, leftX, rightX) {
|
|
3161
|
-
const tol =
|
|
3407
|
+
function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
|
|
3408
|
+
const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
|
|
3162
3409
|
for (const h of horizontals) {
|
|
3163
3410
|
if (Math.abs(h.y1 - y) <= tol) {
|
|
3164
3411
|
const cellW = Math.abs(rightX - leftX);
|
|
3412
|
+
if (cellW < 0.1) continue;
|
|
3165
3413
|
const overlapLeft = Math.max(h.x1, leftX);
|
|
3166
3414
|
const overlapRight = Math.min(h.x2, rightX);
|
|
3167
3415
|
const overlap = overlapRight - overlapLeft;
|
|
3168
|
-
if (overlap >= cellW * 0.
|
|
3416
|
+
if (overlap >= cellW * 0.75) return true;
|
|
3169
3417
|
}
|
|
3170
3418
|
}
|
|
3171
3419
|
return false;
|
|
@@ -3176,23 +3424,24 @@ function mapTextToCells(items, cells) {
|
|
|
3176
3424
|
result.set(cell, []);
|
|
3177
3425
|
}
|
|
3178
3426
|
for (const item of items) {
|
|
3179
|
-
const cx = item.x + item.w / 2;
|
|
3180
|
-
const cy = item.y;
|
|
3181
3427
|
const pad = CELL_PADDING;
|
|
3182
3428
|
let bestCell = null;
|
|
3183
|
-
let
|
|
3429
|
+
let bestScore = 0;
|
|
3184
3430
|
for (const cell of cells) {
|
|
3185
|
-
|
|
3186
|
-
|
|
3187
|
-
|
|
3188
|
-
|
|
3189
|
-
|
|
3190
|
-
|
|
3191
|
-
|
|
3192
|
-
|
|
3431
|
+
const ix1 = Math.max(item.x, cell.bbox.x1 - pad);
|
|
3432
|
+
const ix2 = Math.min(item.x + item.w, cell.bbox.x2 + pad);
|
|
3433
|
+
const iy1 = Math.max(item.y, cell.bbox.y1 - pad);
|
|
3434
|
+
const iy2 = Math.min(item.y + (item.h || item.fontSize), cell.bbox.y2 + pad);
|
|
3435
|
+
if (ix1 >= ix2 || iy1 >= iy2) continue;
|
|
3436
|
+
const intersectArea = (ix2 - ix1) * (iy2 - iy1);
|
|
3437
|
+
const itemArea = Math.max(item.w, 1) * Math.max(item.h || item.fontSize, 1);
|
|
3438
|
+
const score = intersectArea / itemArea;
|
|
3439
|
+
if (score > bestScore) {
|
|
3440
|
+
bestScore = score;
|
|
3441
|
+
bestCell = cell;
|
|
3193
3442
|
}
|
|
3194
3443
|
}
|
|
3195
|
-
if (bestCell) {
|
|
3444
|
+
if (bestCell && bestScore > 0.3) {
|
|
3196
3445
|
result.get(bestCell).push(item);
|
|
3197
3446
|
}
|
|
3198
3447
|
}
|
|
@@ -3219,8 +3468,13 @@ function cellTextToString(items) {
|
|
|
3219
3468
|
const textLines = lines.map((line) => {
|
|
3220
3469
|
const s = line.sort((a, b) => a.x - b.x);
|
|
3221
3470
|
if (s.length === 1) return s[0].text;
|
|
3471
|
+
const evenSpaced = detectEvenSpacedItems(s);
|
|
3222
3472
|
let result = s[0].text;
|
|
3223
3473
|
for (let j = 1; j < s.length; j++) {
|
|
3474
|
+
if (evenSpaced[j]) {
|
|
3475
|
+
result += s[j].text;
|
|
3476
|
+
continue;
|
|
3477
|
+
}
|
|
3224
3478
|
const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
|
|
3225
3479
|
const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
|
|
3226
3480
|
const prevIsKorean = /[가-힣]$/.test(result);
|
|
@@ -3235,6 +3489,57 @@ function cellTextToString(items) {
|
|
|
3235
3489
|
}
|
|
3236
3490
|
return result;
|
|
3237
3491
|
});
|
|
3492
|
+
return mergeCellTextLines(textLines);
|
|
3493
|
+
}
|
|
3494
|
+
/**
 * Flags items that belong to a run of single-character tokens (one Hangul
 * syllable or one digit) laid out with roughly even spacing.
 *
 * The items are scanned in the given order. A run is a maximal stretch of
 * single-character items; it is also split whenever the horizontal gap to the
 * previous item exceeds max(3 * fontSize, 30). Every run of three or more
 * items is handed to markEvenRun, which decides whether to flag it.
 *
 * @param {Array<{text: string, x: number, w: number, fontSize: number}>} items
 *   Items of one text line (the caller in this file passes them sorted by x).
 * @returns {boolean[]} Array parallel to `items`; all false when fewer than three items.
 */
function detectEvenSpacedItems(items) {
  const result = new Array(items.length).fill(false);
  if (items.length < 3) return result;

  const isSingleGlyph = (text) => /^[가-힣]{1}$/.test(text) || /^[\d]{1}$/.test(text);
  // Submits the half-open range [from, to) when it is an open run of at least three items.
  const flushRun = (from, to) => {
    if (from >= 0 && to - from >= 3) markEvenRun(items, result, from, to);
  };

  let runStart = -1;
  for (let idx = 0; idx < items.length; idx += 1) {
    const item = items[idx];

    if (!isSingleGlyph(item.text)) {
      // Any other token terminates the current run.
      flushRun(runStart, idx);
      runStart = -1;
      continue;
    }
    if (runStart < 0) {
      runStart = idx;
      continue;
    }

    // Inside a run: an oversized gap closes it and opens a new one here.
    const prev = items[idx - 1];
    const gap = item.x - (prev.x + prev.w);
    if (gap > Math.max(item.fontSize * 3, 30)) {
      flushRun(runStart, idx);
      runStart = idx;
    }
  }
  flushRun(runStart, items.length);
  return result;
}
|
|
3523
|
+
/**
 * Flags items[start + 1 .. end - 1] in `result` when the gaps inside the run
 * items[start .. end - 1] look evenly spaced.
 *
 * Only strictly positive gaps are considered, and at least two are required.
 * The run is accepted when the smallest gap is at least 10% of the first
 * item's font size, the largest is at most 3x that font size, and the
 * largest/smallest ratio does not exceed 3. The first item of the run is never
 * flagged.
 *
 * @param {Array<{x: number, w: number, fontSize: number}>} items Items of one line.
 * @param {boolean[]} result Output flags, mutated in place.
 * @param {number} start Index of the first item of the run (inclusive).
 * @param {number} end Index one past the last item of the run.
 * @returns {void}
 */
function markEvenRun(items, result, start, end) {
  let positiveGaps = 0;
  let smallest = Infinity;
  let largest = -Infinity;

  for (let k = start + 1; k < end; k += 1) {
    const prev = items[k - 1];
    const gap = items[k].x - (prev.x + prev.w);
    // Touching or overlapping neighbours do not take part in the evenness check.
    if (gap > 0) {
      positiveGaps += 1;
      if (gap < smallest) smallest = gap;
      if (gap > largest) largest = gap;
    }
  }
  if (positiveGaps < 2) return;

  // The font size of the run's first item is the yardstick for the whole run.
  const referenceSize = items[start].fontSize;
  const looksEven =
    smallest >= referenceSize * 0.1 &&
    largest <= referenceSize * 3 &&
    largest / Math.max(smallest, 0.1) <= 3;
  if (!looksEven) return;

  result.fill(true, start + 1, Math.max(end, start + 1));
}
|
|
3542
|
+
function mergeCellTextLines(textLines) {
|
|
3238
3543
|
if (textLines.length <= 1) return textLines[0] || "";
|
|
3239
3544
|
const merged = [textLines[0]];
|
|
3240
3545
|
for (let i = 1; i < textLines.length; i++) {
|
|
@@ -3260,24 +3565,172 @@ var Y_TOL = 3;
|
|
|
3260
3565
|
var COL_CLUSTER_TOL = 15;
|
|
3261
3566
|
var MIN_ROWS = 3;
|
|
3262
3567
|
var MIN_COLS = 2;
|
|
3263
|
-
var MIN_GAP_FACTOR =
|
|
3264
|
-
var
|
|
3568
|
+
var MIN_GAP_FACTOR = 2;
|
|
3569
|
+
var MIN_GAP_ABSOLUTE = 20;
|
|
3570
|
+
var MIN_COL_FILL_RATIO = 0.4;
|
|
3265
3571
|
/**
 * Detects borderless tables on a page from text-item positions alone.
 *
 * Two strategies are tried in order:
 *   1. Header-driven: find a header-like row, derive columns from it, and
 *      grow table regions from that row downwards.
 *   2. Gap-driven (only when strategy 1 produced nothing): cluster columns
 *      from rows that contain suspiciously wide gaps.
 *
 * Evenly spaced single-character runs are merged into one item before
 * detection; afterwards each table's `usedItems` set is expanded so it also
 * contains the original items behind every merged one.
 *
 * @param {Array<object>} items Text items of one page.
 * @param {number} pageNum Page number forwarded to buildClusterTable.
 * @returns {Array<object>} Tables produced by buildClusterTable (possibly empty).
 */
function detectClusterTables(items, pageNum) {
  if (items.length < MIN_ROWS * MIN_COLS) return [];

  const { merged, originMap } = mergeEvenSpacedClusters(items);
  const rows = groupByBaseline(merged);
  if (rows.length < MIN_ROWS) return [];

  const results = [];
  // Builds one table and, when it succeeds, records it with expanded used-items.
  const emitTable = (tableRows, columns) => {
    const table = buildClusterTable(tableRows, columns, pageNum);
    if (!table) return;
    expandUsedItems(table.usedItems, originMap);
    results.push(table);
  };

  // Strategy 1: header-driven detection.
  const header = detectHeaderRow(rows);
  if (header) {
    const { columns, headerIdx } = header;
    const headerItems = [...rows[headerIdx].items].sort((a, b) => a.x - b.x);
    const candidateRows = mergeMultiLineRows(rows.slice(headerIdx), columns);
    for (const region of findTableRegionsByHeader(candidateRows, columns, headerItems)) {
      emitTable(region.rows, columns);
    }
  }
  if (results.length > 0) return results;

  // Strategy 2: gap-driven fallback.
  const gappedRows = rows.filter((row) => hasSuspiciousGaps(row));
  if (gappedRows.length < MIN_ROWS) return results;

  const columns = extractColumnClusters(gappedRows);
  if (columns.length < MIN_COLS) return results;

  for (const region of findTableRegions(rows, columns)) {
    emitTable(mergeMultiLineRows(region.rows, columns), columns);
  }
  return results;
}
|
|
3612
|
+
/**
 * Collapses evenly spaced runs of single-character items (one Hangul syllable
 * or one digit) into a single synthetic item per run.
 *
 * Items are grouped into baseline rows and each row is scanned left to right.
 * A run needs at least three items, every gap within [0.1, 3] x the font size
 * of the item on the gap's right, a strictly positive smallest gap, and a
 * largest/smallest gap ratio of at most 3. All other items pass through
 * unchanged.
 *
 * @param {Array<{text: string, x: number, y: number, w: number, h: number, fontSize: number, fontName: string}>} items
 * @returns {{merged: Array<object>, originMap: Map<object, Array<object>>}}
 *   `merged` is the new item list (row by row, left to right); `originMap`
 *   maps each synthetic item to the original items it replaced.
 */
function mergeEvenSpacedClusters(items) {
  const originMap = /* @__PURE__ */ new Map();
  const merged = [];

  const isSingleGlyph = (item) => /^[가-힣\d]$/.test(item.text);
  // Horizontal gap between seq[k] and the item directly to its left.
  const gapBefore = (seq, k) => seq[k].x - (seq[k - 1].x + seq[k - 1].w);

  for (const row of groupByBaseline(items)) {
    const sorted = [...row.items].sort((a, b) => a.x - b.x);
    let pos = 0;

    while (pos < sorted.length) {
      // Extend a candidate run [pos, runEnd) of single glyphs with plausible gaps.
      let runEnd = pos;
      if (isSingleGlyph(sorted[pos])) {
        runEnd = pos + 1;
        while (runEnd < sorted.length && isSingleGlyph(sorted[runEnd])) {
          const gap = gapBefore(sorted, runEnd);
          const fs = sorted[runEnd].fontSize;
          if (gap < fs * 0.1 || gap > fs * 3) break;
          runEnd += 1;
        }
      }

      if (runEnd - pos >= 3) {
        const run = sorted.slice(pos, runEnd);
        let minG = Infinity;
        let maxG = -Infinity;
        for (let k = 1; k < run.length; k += 1) {
          const gap = gapBefore(run, k);
          if (gap < minG) minG = gap;
          if (gap > maxG) maxG = gap;
        }

        if (minG > 0 && maxG / minG <= 3) {
          const first = run[0];
          const last = run[run.length - 1];
          const item = {
            text: run.map((part) => part.text).join(""),
            x: first.x,
            y: first.y,
            w: last.x + last.w - first.x,
            h: first.h,
            fontSize: first.fontSize,
            fontName: first.fontName
          };
          originMap.set(item, run);
          merged.push(item);
          pos = runEnd;
          continue;
        }
      }

      // No merge: keep this item and retry from the next one.
      merged.push(sorted[pos]);
      pos += 1;
    }
  }
  return { merged, originMap };
}
|
|
3664
|
+
/**
 * Adds to `usedItems` the original items behind every merged item it contains.
 *
 * @param {Set<object>} usedItems Set of consumed items; mutated in place.
 * @param {Map<object, Iterable<object>>} originMap Maps a merged item to the items it replaced.
 * @returns {void}
 */
function expandUsedItems(usedItems, originMap) {
  // Iterate over a snapshot so that entries added below are not visited again.
  for (const item of [...usedItems]) {
    const origins = originMap.get(item);
    if (!origins) continue;
    for (const origin of origins) usedItems.add(origin);
  }
}
|
|
3672
|
+
/**
 * Finds the first row that looks like a table header and derives one column
 * per header item.
 *
 * A row qualifies when all of the following hold:
 *   - it has between MIN_COLS and 6 items;
 *   - no item text is longer than 8 characters;
 *   - at least one item contains a Hangul syllable;
 *   - no item starts with a bullet-like marker;
 *   - it spans at least 40% of the horizontal extent of all items;
 *   - at least one gap between neighbours is >= 2.5x the row's mean font size;
 *   - at least MIN_ROWS following rows hit MIN_COLS or more of its columns
 *     (the scan stops early once MIN_ROWS + 2 matches are found).
 *
 * @param {Array<{items: Array<{text: string, x: number, w: number, fontSize: number}>}>} rows
 * @returns {{columns: Array<{x: number, count: number}>, headerIdx: number} | null}
 *   Columns (left edge of each header item, count initialised to 0) and the
 *   index of the header row, or null when no row qualifies.
 */
function detectHeaderRow(rows) {
  const everyItem = rows.flatMap((row) => row.items);
  if (everyItem.length === 0) return null;

  // Horizontal extent covered by all items.
  let leftEdge = Infinity;
  let rightEdge = -Infinity;
  for (const item of everyItem) {
    if (item.x < leftEdge) leftEdge = item.x;
    const itemRight = item.x + item.w;
    if (itemRight > rightEdge) rightEdge = itemRight;
  }
  const pageSpan = rightEdge - leftEdge;
  if (pageSpan <= 0) return null;

  const hasHangul = (item) => /[가-힣]/.test(item.text);
  const startsWithBullet = (item) => /^[□■○●·※▶▷◆◇\-]/.test(item.text);

  for (let headerIdx = 0; headerIdx < rows.length; headerIdx += 1) {
    const rowItems = rows[headerIdx].items;
    if (rowItems.length < MIN_COLS || rowItems.length > 6) continue;
    if (rowItems.some((item) => item.text.length > 8)) continue;
    if (!rowItems.some(hasHangul)) continue;
    if (rowItems.some(startsWithBullet)) continue;

    const sorted = [...rowItems].sort((a, b) => a.x - b.x);
    const rightmost = sorted[sorted.length - 1];
    const xSpan = rightmost.x + rightmost.w - sorted[0].x;
    if (xSpan / pageSpan < 0.4) continue;

    const avgFs = sorted.reduce((sum, item) => sum + item.fontSize, 0) / sorted.length;
    const hasLargeGap = sorted.some(
      (item, k) => k > 0 && item.x - (sorted[k - 1].x + sorted[k - 1].w) >= avgFs * 2.5
    );
    if (!hasLargeGap) continue;

    const columns = sorted.map((item) => ({ x: item.x, count: 0 }));

    // Count following rows that line up with the candidate header.
    let matchCount = 0;
    for (let j = headerIdx + 1; j < rows.length && matchCount < MIN_ROWS + 2; j += 1) {
      if (countMatchedColumnsRange(rows[j], columns, sorted) >= MIN_COLS) matchCount += 1;
    }
    if (matchCount < MIN_ROWS) continue;

    return { columns, headerIdx };
  }
  return null;
}
|
|
3713
|
+
/**
 * Folds continuation lines into the row above them.
 *
 * A row is treated as a continuation when it is vertically close to the
 * previous output row (|dy| < 1.8x the mean font size of all items), has at
 * most two items, and either matches fewer than MIN_COLS columns or consists
 * of a single item. A folded row keeps the previous row's `y` and gets the
 * concatenated items; no other properties are carried over.
 *
 * @param {Array<{y: number, items: Array<{fontSize: number}>}>} rows Rows in reading order.
 * @param {Array<object>} columns Column descriptors forwarded to countMatchedColumns.
 * @returns {Array<{y: number, items: Array<object>}>} The input array itself when
 *   it has at most one row, otherwise a new array.
 */
function mergeMultiLineRows(rows, columns) {
  if (rows.length <= 1) return rows;

  // Mean font size over every item; falls back to 12 when there are no items.
  let sizeSum = 0;
  let sizeCount = 0;
  for (const row of rows) {
    for (const item of row.items) {
      sizeSum += item.fontSize;
      sizeCount += 1;
    }
  }
  const avgFontSize = sizeCount > 0 ? sizeSum / sizeCount : 12;
  const maxLineGap = avgFontSize * 1.8;

  const result = [rows[0]];
  for (const curr of rows.slice(1)) {
    const prev = result[result.length - 1];
    const matchedCols = countMatchedColumns(curr, columns);

    const isClose = Math.abs(prev.y - curr.y) < maxLineGap;
    const isSparse = curr.items.length <= 2;
    const isPartial = matchedCols < MIN_COLS || curr.items.length === 1;

    if (isClose && isSparse && isPartial) {
      result[result.length - 1] = {
        y: prev.y,
        items: [...prev.items, ...curr.items]
      };
    } else {
      result.push(curr);
    }
  }
  return result;
}
|
|
3281
3734
|
function groupByBaseline(items) {
|
|
3282
3735
|
if (items.length === 0) return [];
|
|
3283
3736
|
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
@@ -3299,8 +3752,9 @@ function groupByBaseline(items) {
|
|
|
3299
3752
|
function hasSuspiciousGaps(row) {
|
|
3300
3753
|
if (row.items.length < 2) return false;
|
|
3301
3754
|
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3755
|
+
if (sorted.length === 2 && sorted[1].text.length > 20) return false;
|
|
3302
3756
|
const avgFontSize = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
|
|
3303
|
-
const minGap = avgFontSize * MIN_GAP_FACTOR;
|
|
3757
|
+
const minGap = Math.max(avgFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
|
|
3304
3758
|
for (let i = 1; i < sorted.length; i++) {
|
|
3305
3759
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
3306
3760
|
if (gap >= minGap) return true;
|
|
@@ -3327,6 +3781,41 @@ function extractColumnClusters(rows) {
|
|
|
3327
3781
|
const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
|
|
3328
3782
|
return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
|
|
3329
3783
|
}
|
|
3784
|
+
/**
 * Splits rows into contiguous table regions using header-derived column ranges.
 *
 * A row joins the current region when it hits at least MIN_COLS columns. A
 * non-matching row is still tolerated inside an open region when it has at
 * most two items or when it directly follows a matching row. Anything else
 * closes the region. On close, trailing non-matching rows are dropped and the
 * region is kept only if at least MIN_ROWS rows remain.
 *
 * @param {Array<{items: Array<object>}>} allRows Rows in reading order.
 * @param {Array<object>} columns Column descriptors (forwarded to countMatchedColumnsRange).
 * @param {Array<{x: number, w: number}>} headerItems Header items sorted by x.
 * @returns {Array<{rows: Array<object>}>} Detected regions, in order.
 */
function findTableRegionsByHeader(allRows, columns, headerItems) {
  const regions = [];
  const fitsColumns = (row) => countMatchedColumnsRange(row, columns, headerItems) >= MIN_COLS;

  let pending = [];
  let missStreak = 0;

  // Trims trailing non-matching rows, stores the region if large enough, and resets state.
  const closeRegion = () => {
    while (pending.length > 0 && !fitsColumns(pending[pending.length - 1])) {
      pending.pop();
    }
    if (pending.length >= MIN_ROWS) {
      regions.push({ rows: pending });
    }
    pending = [];
    missStreak = 0;
  };

  for (const row of allRows) {
    if (fitsColumns(row)) {
      pending.push(row);
      missStreak = 0;
      continue;
    }
    const tolerated = pending.length > 0 && (row.items.length <= 2 || missStreak === 0);
    if (tolerated) {
      pending.push(row);
      missStreak += 1;
      continue;
    }
    closeRegion();
  }
  closeRegion();
  return regions;
}
|
|
3330
3819
|
function findTableRegions(allRows, columns) {
|
|
3331
3820
|
const regions = [];
|
|
3332
3821
|
let currentRegion = [];
|
|
@@ -3362,18 +3851,81 @@ function countMatchedColumns(row, columns) {
|
|
|
3362
3851
|
}
|
|
3363
3852
|
return matched.size;
|
|
3364
3853
|
}
|
|
3365
|
-
function
|
|
3366
|
-
const
|
|
3367
|
-
let
|
|
3368
|
-
|
|
3369
|
-
|
|
3370
|
-
|
|
3371
|
-
|
|
3372
|
-
|
|
3373
|
-
|
|
3854
|
+
/**
 * Counts how many distinct header columns are hit by the items of a row.
 *
 * Each header item owns a half-open x-range [left, right). Neighbouring ranges
 * meet halfway between the right edge of one header and the left edge of the
 * next. The outer ranges are open-ended on both sides: the first column
 * extends to -Infinity and the last to +Infinity. The first column's left
 * bound used to be 0, which silently dropped items with a negative x.
 *
 * @param {{items: Array<{x: number}>}} row Row whose items are tested by their left edge.
 * @param {Array<object>} columns Unused; kept so the signature stays compatible with callers.
 * @param {Array<{x: number, w: number}>} headerItems Header items sorted by x.
 * @returns {number} Number of distinct columns containing at least one item.
 */
function countMatchedColumnsRange(row, columns, headerItems) {
  const lastIdx = headerItems.length - 1;
  const boundaries = headerItems.map((header, ci) => {
    const left = ci === 0
      ? -Infinity
      : (headerItems[ci - 1].x + headerItems[ci - 1].w + header.x) / 2;
    const right = ci === lastIdx
      ? Infinity
      : (header.x + header.w + headerItems[ci + 1].x) / 2;
    return { left, right };
  });

  const matched = /* @__PURE__ */ new Set();
  for (const item of row.items) {
    // Ranges are tested left to right; the first one that contains the item wins.
    const ci = boundaries.findIndex(({ left, right }) => item.x >= left && item.x < right);
    if (ci !== -1) matched.add(ci);
  }
  return matched.size;
}
|
|
3872
|
+
/**
 * Splits the items of one row into groups at the widest gaps and assigns each
 * group to a column.
 *
 * Steps:
 *   1. Sort items by x and measure the gap in front of each item.
 *   2. Keep gaps of at least `gapThreshold` (12 when the row has at most
 *      numCols + 1 items, otherwise max(2.5 x median gap, 12)), take the
 *      numCols - 1 widest, and cut the row there.
 *   3. Greedily pair groups with columns by ascending distance between the
 *      group's horizontal midpoint and the column's x; each group and each
 *      column is used at most once.
 *   4. Any group still unassigned goes to its nearest column (which may
 *      already be taken).
 *
 * @param {Array<{x: number, w: number}>} items Items of a single row.
 * @param {Array<{x: number}>} columns Column descriptors.
 * @param {number} numCols Number of columns to consider.
 * @returns {Array<{col: number, items: Array<object>}>} Assignments in the
 *   order they were made (greedy pairs first, then leftovers); empty for an empty row.
 */
function assignRowItems(items, columns, numCols) {
  if (items.length === 0) return [];

  const byX = [...items].sort((a, b) => a.x - b.x);
  const columnXs = columns.map((column) => column.x);

  // Gap in front of every item except the first.
  const gaps = byX.slice(1).map((item, k) => {
    const prev = byX[k];
    return { idx: k + 1, size: item.x - (prev.x + prev.w) };
  });
  const ascendingSizes = gaps.map((gap) => gap.size).sort((a, b) => a - b);
  const medianGap = ascendingSizes.length > 0
    ? ascendingSizes[Math.floor(ascendingSizes.length / 2)]
    : 0;
  const gapThreshold = byX.length <= numCols + 1 ? 12 : Math.max(medianGap * 2.5, 12);

  // Cut positions: the widest qualifying gaps, restored to left-to-right order.
  const cutPoints = gaps
    .filter((gap) => gap.size >= gapThreshold)
    .sort((a, b) => b.size - a.size)
    .slice(0, numCols - 1)
    .sort((a, b) => a.idx - b.idx)
    .map((gap) => gap.idx);

  const groups = [];
  let from = 0;
  for (const cut of cutPoints) {
    groups.push(byX.slice(from, cut));
    from = cut;
  }
  groups.push(byX.slice(from));

  // Horizontal midpoint of a group's bounding span.
  const midpointOf = (group) => {
    let minX = Infinity;
    let maxX = -Infinity;
    for (const member of group) {
      if (member.x < minX) minX = member.x;
      const memberRight = member.x + member.w;
      if (memberRight > maxX) maxX = memberRight;
    }
    return (minX + maxX) / 2;
  };
  const groupCenters = groups.map(midpointOf);
  const distance = (gi, ci) => Math.abs(groupCenters[gi] - columnXs[ci]);

  // Every (group, column) pair, closest first.
  const candidates = [];
  for (let gi = 0; gi < groups.length; gi += 1) {
    for (let ci = 0; ci < numCols; ci += 1) {
      candidates.push({ gi, ci, dist: distance(gi, ci) });
    }
  }
  candidates.sort((a, b) => a.dist - b.dist);

  const result = [];
  const takenGroups = /* @__PURE__ */ new Set();
  const takenCols = /* @__PURE__ */ new Set();
  for (const { gi, ci } of candidates) {
    if (takenGroups.has(gi) || takenCols.has(ci)) continue;
    result.push({ col: ci, items: groups[gi] });
    takenGroups.add(gi);
    takenCols.add(ci);
  }

  // Leftover groups fall back to their nearest column, shared if necessary.
  groups.forEach((group, gi) => {
    if (takenGroups.has(gi)) return;
    let bestCol = 0;
    let bestDist = Infinity;
    for (let ci = 0; ci < numCols; ci += 1) {
      const d = distance(gi, ci);
      if (d < bestDist) {
        bestDist = d;
        bestCol = ci;
      }
    }
    result.push({ col: bestCol, items: group });
  });
  return result;
}
|
|
3378
3930
|
function buildClusterTable(rows, columns, pageNum) {
|
|
3379
3931
|
const numCols = columns.length;
|
|
@@ -3391,12 +3943,12 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3391
3943
|
usedItems.add(row.items[0]);
|
|
3392
3944
|
continue;
|
|
3393
3945
|
}
|
|
3394
|
-
|
|
3395
|
-
|
|
3396
|
-
|
|
3946
|
+
const assignments = assignRowItems(row.items, columns, numCols);
|
|
3947
|
+
for (const { col, items } of assignments) {
|
|
3948
|
+
const text = items.map((i) => i.text).join(" ");
|
|
3397
3949
|
const existing = cells[r][col].text;
|
|
3398
|
-
cells[r][col].text = existing ? existing + " " +
|
|
3399
|
-
usedItems.add(item);
|
|
3950
|
+
cells[r][col].text = existing ? existing + " " + text : text;
|
|
3951
|
+
for (const item of items) usedItems.add(item);
|
|
3400
3952
|
}
|
|
3401
3953
|
}
|
|
3402
3954
|
let emptyRows = 0;
|
|
@@ -3408,11 +3960,48 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3408
3960
|
const hasValue = cells.some((row) => row[c].text !== "");
|
|
3409
3961
|
if (!hasValue) return null;
|
|
3410
3962
|
}
|
|
3963
|
+
for (let r = numRows - 1; r >= 1; r--) {
|
|
3964
|
+
const nonEmptyCols = cells[r].filter((c) => c.text.trim()).length;
|
|
3965
|
+
if (nonEmptyCols !== 1) continue;
|
|
3966
|
+
if (cells[r][0].text.trim() !== "") continue;
|
|
3967
|
+
const contentText = cells[r].find((c) => c.text.trim())?.text.trim() || "";
|
|
3968
|
+
if (/^[○●▶\-·]/.test(contentText)) continue;
|
|
3969
|
+
for (let pr = r - 1; pr >= 0; pr--) {
|
|
3970
|
+
if (cells[pr].some((c) => c.text.trim())) {
|
|
3971
|
+
for (let c = 0; c < numCols; c++) {
|
|
3972
|
+
const prev = cells[pr][c].text.trim();
|
|
3973
|
+
const curr = cells[r][c].text.trim();
|
|
3974
|
+
if (curr) cells[pr][c].text = prev ? prev + " " + curr : curr;
|
|
3975
|
+
}
|
|
3976
|
+
for (let c = 0; c < numCols; c++) cells[r][c].text = "";
|
|
3977
|
+
break;
|
|
3978
|
+
}
|
|
3979
|
+
}
|
|
3980
|
+
}
|
|
3981
|
+
for (let r = 0; r < cells.length - 1; r++) {
|
|
3982
|
+
const row = cells[r];
|
|
3983
|
+
const hasCol0 = row[0].text.trim() !== "";
|
|
3984
|
+
const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
|
|
3985
|
+
const midEmpty = row.slice(1, numCols - 1).every((c) => c.text.trim() === "");
|
|
3986
|
+
if (hasCol0 && hasColLast && midEmpty) {
|
|
3987
|
+
const next = cells[r + 1];
|
|
3988
|
+
if (next[0].text.trim() === "" && next.some((c) => c.text.trim())) {
|
|
3989
|
+
for (let c = 1; c < numCols; c++) {
|
|
3990
|
+
const curr = next[c].text.trim();
|
|
3991
|
+
if (curr) row[c].text = row[c].text.trim() ? row[c].text.trim() + " " + curr : curr;
|
|
3992
|
+
}
|
|
3993
|
+
for (let c = 0; c < numCols; c++) next[c].text = "";
|
|
3994
|
+
}
|
|
3995
|
+
}
|
|
3996
|
+
}
|
|
3997
|
+
const filteredCells = cells.filter((row) => row.some((c) => c.text.trim()));
|
|
3998
|
+
const finalRowCount = filteredCells.length;
|
|
3999
|
+
if (finalRowCount < MIN_ROWS) return null;
|
|
3411
4000
|
const irTable = {
|
|
3412
|
-
rows:
|
|
4001
|
+
rows: finalRowCount,
|
|
3413
4002
|
cols: numCols,
|
|
3414
|
-
cells,
|
|
3415
|
-
hasHeader:
|
|
4003
|
+
cells: filteredCells,
|
|
4004
|
+
hasHeader: finalRowCount > 1
|
|
3416
4005
|
};
|
|
3417
4006
|
const allItems = rows.flatMap((r) => r.items);
|
|
3418
4007
|
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
@@ -3489,7 +4078,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3489
4078
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
3490
4079
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
3491
4080
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
3492
|
-
const
|
|
4081
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
3493
4082
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
3494
4083
|
let parsedPages = 0;
|
|
3495
4084
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
@@ -3506,7 +4095,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3506
4095
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
3507
4096
|
}
|
|
3508
4097
|
for (const item of visible) {
|
|
3509
|
-
if (item.fontSize > 0)
|
|
4098
|
+
if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
|
|
3510
4099
|
}
|
|
3511
4100
|
const opList = await page.getOperatorList();
|
|
3512
4101
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -3528,7 +4117,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3528
4117
|
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
3529
4118
|
if (options?.ocr) {
|
|
3530
4119
|
try {
|
|
3531
|
-
const { ocrPages } = await import("./provider-
|
|
4120
|
+
const { ocrPages } = await import("./provider-7H4CPZYS.js");
|
|
3532
4121
|
const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
|
|
3533
4122
|
if (ocrBlocks.length > 0) {
|
|
3534
4123
|
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
@@ -3545,7 +4134,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3545
4134
|
blocks.splice(removed[ri], 1);
|
|
3546
4135
|
}
|
|
3547
4136
|
}
|
|
3548
|
-
const medianFontSize =
|
|
4137
|
+
const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
|
|
3549
4138
|
if (medianFontSize > 0) {
|
|
3550
4139
|
detectHeadings(blocks, medianFontSize);
|
|
3551
4140
|
}
|
|
@@ -3609,11 +4198,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
3609
4198
|
}
|
|
3610
4199
|
return { visible, hiddenCount };
|
|
3611
4200
|
}
|
|
3612
|
-
function
|
|
3613
|
-
if (
|
|
3614
|
-
|
|
3615
|
-
const
|
|
3616
|
-
|
|
4201
|
+
/**
 * Computes the median font size from a size -> occurrence-count histogram.
 *
 * The result is the size found at index floor(total / 2) of the conceptual
 * sorted list of all occurrences, i.e. the upper median for an even total.
 *
 * @param {Map<number, number>} freq Font size mapped to how many times it occurred.
 * @returns {number} The median font size, or 0 for an empty histogram.
 */
function computeMedianFontSizeFromFreq(freq) {
  if (freq.size === 0) return 0;

  const entries = [...freq.entries()];
  const total = entries.reduce((sum, [, count]) => sum + count, 0);
  entries.sort(([sizeA], [sizeB]) => sizeA - sizeB);

  const mid = Math.floor(total / 2);
  // Walk the ascending sizes until the running count passes the midpoint;
  // if it never does, the largest size is returned.
  let idx = 0;
  let seen = entries[0][1];
  while (!(seen > mid) && idx < entries.length - 1) {
    idx += 1;
    seen += entries[idx][1];
  }
  return entries[idx][0];
}
|
|
3618
4214
|
function detectHeadings(blocks, medianFontSize) {
|
|
3619
4215
|
for (const block of blocks) {
|
|
@@ -3639,11 +4235,21 @@ function collapseEvenSpacing(text) {
|
|
|
3639
4235
|
if (tokens.length >= 3 && singleCharCount / tokens.length >= 0.7) {
|
|
3640
4236
|
return tokens.join("");
|
|
3641
4237
|
}
|
|
3642
|
-
return text
|
|
4238
|
+
return text.replace(
|
|
4239
|
+
/(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
|
|
4240
|
+
(match) => match.replace(/ /g, "")
|
|
4241
|
+
);
|
|
3643
4242
|
}
|
|
3644
4243
|
function shouldDemoteTable(table) {
|
|
3645
4244
|
const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
|
|
3646
4245
|
const allText = allCells.join(" ");
|
|
4246
|
+
if (table.rows <= 3 && table.cols <= 3) {
|
|
4247
|
+
const totalCells2 = table.rows * table.cols;
|
|
4248
|
+
const emptyCells2 = totalCells2 - allCells.length;
|
|
4249
|
+
if (emptyCells2 >= totalCells2 * 0.3) return true;
|
|
4250
|
+
if (/[□■◆○●▶ㅇ]/.test(allText)) return true;
|
|
4251
|
+
if (/<[^>]+>/.test(allText)) return true;
|
|
4252
|
+
}
|
|
3647
4253
|
if (allText.length > 200) return false;
|
|
3648
4254
|
if (/[□■◆○●▶]/.test(allText) && table.rows <= 3) return true;
|
|
3649
4255
|
const totalCells = table.rows * table.cols;
|
|
@@ -3754,6 +4360,7 @@ function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeigh
|
|
|
3754
4360
|
if (items.length === 0) return [];
|
|
3755
4361
|
let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
|
|
3756
4362
|
({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
|
|
4363
|
+
({ horizontals, verticals } = preprocessLines(horizontals, verticals));
|
|
3757
4364
|
const grids = buildTableGrids(horizontals, verticals);
|
|
3758
4365
|
if (grids.length > 0) {
|
|
3759
4366
|
return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
|
|
@@ -3765,14 +4372,19 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
3765
4372
|
const usedItems = /* @__PURE__ */ new Set();
|
|
3766
4373
|
const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
3767
4374
|
for (const grid of sortedGrids) {
|
|
4375
|
+
const numGridRows = grid.rowYs.length - 1;
|
|
4376
|
+
const numGridCols = grid.colXs.length - 1;
|
|
4377
|
+
if (numGridRows === 1 && numGridCols >= 2) continue;
|
|
3768
4378
|
const tableItems = [];
|
|
3769
4379
|
const pad = 3;
|
|
4380
|
+
const gridW = grid.bbox.x2 - grid.bbox.x1;
|
|
3770
4381
|
for (const item of items) {
|
|
3771
4382
|
if (usedItems.has(item)) continue;
|
|
3772
|
-
if (item.
|
|
3773
|
-
|
|
3774
|
-
|
|
3775
|
-
|
|
4383
|
+
if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
|
|
4384
|
+
if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
|
|
4385
|
+
if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
|
|
4386
|
+
tableItems.push(item);
|
|
4387
|
+
usedItems.add(item);
|
|
3776
4388
|
}
|
|
3777
4389
|
const cells = extractCells(grid, horizontals, verticals);
|
|
3778
4390
|
if (cells.length === 0) continue;
|
|
@@ -3796,6 +4408,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
3796
4408
|
const cellItems = cellTextMap.get(cell) || [];
|
|
3797
4409
|
let text = cellTextToString(cellItems);
|
|
3798
4410
|
text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
|
|
4411
|
+
text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
|
|
3799
4412
|
irGrid[cell.row][cell.col] = {
|
|
3800
4413
|
text,
|
|
3801
4414
|
colSpan: cell.colSpan,
|
|
@@ -3820,23 +4433,58 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
3820
4433
|
if (shouldDemoteTable(irTable)) {
|
|
3821
4434
|
const demoted = demoteTableToText(irTable);
|
|
3822
4435
|
if (demoted) {
|
|
3823
|
-
|
|
4436
|
+
const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
|
|
4437
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
|
|
3824
4438
|
}
|
|
3825
4439
|
continue;
|
|
3826
4440
|
}
|
|
3827
4441
|
blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
|
|
3828
4442
|
}
|
|
3829
|
-
|
|
4443
|
+
let remaining = items.filter((i) => !usedItems.has(i));
|
|
3830
4444
|
if (remaining.length > 0) {
|
|
3831
4445
|
remaining.sort((a, b) => b.y - a.y || a.x - b.x);
|
|
3832
|
-
const
|
|
3833
|
-
|
|
3834
|
-
|
|
4446
|
+
const clusterItems = remaining.map((i) => ({
|
|
4447
|
+
text: i.text,
|
|
4448
|
+
x: i.x,
|
|
4449
|
+
y: i.y,
|
|
4450
|
+
w: i.w,
|
|
4451
|
+
h: i.h,
|
|
4452
|
+
fontSize: i.fontSize,
|
|
4453
|
+
fontName: i.fontName
|
|
4454
|
+
}));
|
|
4455
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4456
|
+
if (clusterResults.length > 0) {
|
|
4457
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4458
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4459
|
+
const usedClusterIndices = /* @__PURE__ */ new Set();
|
|
4460
|
+
for (const cr of clusterResults) {
|
|
4461
|
+
for (const ci of cr.usedItems) {
|
|
4462
|
+
const idx = ciToIdx.get(ci);
|
|
4463
|
+
if (idx !== void 0) usedClusterIndices.add(idx);
|
|
4464
|
+
}
|
|
4465
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4466
|
+
}
|
|
4467
|
+
remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
|
|
4468
|
+
}
|
|
4469
|
+
if (remaining.length > 0) {
|
|
4470
|
+
const allY = remaining.map((i) => i.y);
|
|
4471
|
+
const pageH = safeMax(allY) - safeMin(allY);
|
|
4472
|
+
const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
|
|
4473
|
+
const textBlocks = [];
|
|
4474
|
+
for (const group of groups) {
|
|
4475
|
+
if (group.length === 0) continue;
|
|
4476
|
+
const groupBlocks = extractPageBlocksFallback(group, pageNum);
|
|
4477
|
+
for (const b of groupBlocks) textBlocks.push(b);
|
|
4478
|
+
}
|
|
4479
|
+
const finalTextBlocks = detectListBlocks(textBlocks);
|
|
4480
|
+
for (const b of finalTextBlocks) blocks.push(b);
|
|
4481
|
+
}
|
|
4482
|
+
blocks.sort((a, b) => {
|
|
3835
4483
|
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
3836
4484
|
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
3837
4485
|
return by - ay;
|
|
3838
4486
|
});
|
|
3839
|
-
return mergeAdjacentTableBlocks(
|
|
4487
|
+
return mergeAdjacentTableBlocks(blocks);
|
|
3840
4488
|
}
|
|
3841
4489
|
return mergeAdjacentTableBlocks(blocks);
|
|
3842
4490
|
}
|
|
@@ -3863,52 +4511,52 @@ function mergeAdjacentTableBlocks(blocks) {
|
|
|
3863
4511
|
function extractPageBlocksFallback(items, pageNum) {
|
|
3864
4512
|
if (items.length === 0) return [];
|
|
3865
4513
|
const blocks = [];
|
|
3866
|
-
const
|
|
3867
|
-
|
|
3868
|
-
|
|
3869
|
-
|
|
3870
|
-
|
|
3871
|
-
|
|
3872
|
-
|
|
3873
|
-
|
|
3874
|
-
|
|
3875
|
-
|
|
3876
|
-
|
|
3877
|
-
|
|
3878
|
-
|
|
3879
|
-
|
|
3880
|
-
|
|
3881
|
-
|
|
3882
|
-
|
|
3883
|
-
|
|
3884
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
3885
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
3886
|
-
const usedIndices = /* @__PURE__ */ new Set();
|
|
3887
|
-
for (const cr of clusterResults) {
|
|
3888
|
-
for (const ci of cr.usedItems) {
|
|
3889
|
-
const idx = ciToIdx.get(ci);
|
|
3890
|
-
if (idx !== void 0) usedIndices.add(idx);
|
|
3891
|
-
}
|
|
3892
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4514
|
+
const clusterItems = items.map((i) => ({
|
|
4515
|
+
text: i.text,
|
|
4516
|
+
x: i.x,
|
|
4517
|
+
y: i.y,
|
|
4518
|
+
w: i.w,
|
|
4519
|
+
h: i.h,
|
|
4520
|
+
fontSize: i.fontSize,
|
|
4521
|
+
fontName: i.fontName
|
|
4522
|
+
}));
|
|
4523
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4524
|
+
if (clusterResults.length > 0) {
|
|
4525
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4526
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4527
|
+
const usedIndices = /* @__PURE__ */ new Set();
|
|
4528
|
+
for (const cr of clusterResults) {
|
|
4529
|
+
for (const ci of cr.usedItems) {
|
|
4530
|
+
const idx = ciToIdx.get(ci);
|
|
4531
|
+
if (idx !== void 0) usedIndices.add(idx);
|
|
3893
4532
|
}
|
|
3894
|
-
|
|
3895
|
-
|
|
3896
|
-
|
|
3897
|
-
|
|
3898
|
-
|
|
3899
|
-
|
|
3900
|
-
|
|
3901
|
-
|
|
3902
|
-
|
|
4533
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4534
|
+
}
|
|
4535
|
+
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
4536
|
+
if (remaining.length > 0) {
|
|
4537
|
+
const yLines = groupByY(remaining);
|
|
4538
|
+
for (const line of yLines) {
|
|
4539
|
+
const text = mergeLineSimple(line);
|
|
4540
|
+
if (!text.trim()) continue;
|
|
4541
|
+
const bbox = computeBBox(line, pageNum);
|
|
4542
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
|
|
3903
4543
|
}
|
|
3904
|
-
|
|
3905
|
-
|
|
3906
|
-
|
|
3907
|
-
|
|
3908
|
-
|
|
4544
|
+
}
|
|
4545
|
+
blocks.sort((a, b) => {
|
|
4546
|
+
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4547
|
+
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4548
|
+
return by - ay;
|
|
4549
|
+
});
|
|
4550
|
+
} else {
|
|
4551
|
+
const allYLines = groupByY(items);
|
|
4552
|
+
const columns = detectColumns(allYLines);
|
|
4553
|
+
if (columns && columns.length >= 3) {
|
|
4554
|
+
const tableText = extractWithColumns(allYLines, columns);
|
|
4555
|
+
const bbox = computeBBox(items, pageNum);
|
|
4556
|
+
blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
|
|
3909
4557
|
} else {
|
|
3910
4558
|
const allY = items.map((i) => i.y);
|
|
3911
|
-
const pageHeight =
|
|
4559
|
+
const pageHeight = safeMax(allY) - safeMin(allY);
|
|
3912
4560
|
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
3913
4561
|
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
3914
4562
|
for (const group of orderedGroups) {
|
|
@@ -3961,22 +4609,76 @@ function dominantStyle(items) {
|
|
|
3961
4609
|
return { fontSize: dominantSize, fontName };
|
|
3962
4610
|
}
|
|
3963
4611
|
function normalizeItems(rawItems) {
|
|
3964
|
-
|
|
4612
|
+
const items = [];
|
|
4613
|
+
const spacePositions = [];
|
|
4614
|
+
for (const i of rawItems) {
|
|
4615
|
+
if (typeof i.str !== "string") continue;
|
|
4616
|
+
const x = Math.round(i.transform[4]);
|
|
4617
|
+
const y = Math.round(i.transform[5]);
|
|
4618
|
+
if (!i.str.trim()) {
|
|
4619
|
+
spacePositions.push({ x, y });
|
|
4620
|
+
continue;
|
|
4621
|
+
}
|
|
3965
4622
|
const scaleY = Math.abs(i.transform[3]);
|
|
3966
4623
|
const scaleX = Math.abs(i.transform[0]);
|
|
3967
4624
|
const fontSize = Math.round(Math.max(scaleY, scaleX));
|
|
3968
|
-
|
|
3969
|
-
|
|
3970
|
-
|
|
3971
|
-
|
|
3972
|
-
|
|
3973
|
-
|
|
3974
|
-
|
|
3975
|
-
|
|
3976
|
-
|
|
3977
|
-
|
|
3978
|
-
|
|
3979
|
-
|
|
4625
|
+
const w = Math.round(i.width);
|
|
4626
|
+
const h = Math.round(i.height);
|
|
4627
|
+
const isHidden = fontSize === 0 || i.width === 0 && i.str.trim().length > 0;
|
|
4628
|
+
let text = i.str.trim();
|
|
4629
|
+
if (/^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text)) {
|
|
4630
|
+
text = text.replace(/ /g, "");
|
|
4631
|
+
}
|
|
4632
|
+
const split = splitEvenSpacedItem(text, x, w, fontSize);
|
|
4633
|
+
if (split) {
|
|
4634
|
+
for (const s of split) {
|
|
4635
|
+
items.push({ text: s.text, x: s.x, y, w: s.w, h, fontSize, fontName: i.fontName || "", isHidden });
|
|
4636
|
+
}
|
|
4637
|
+
} else {
|
|
4638
|
+
items.push({ text, x, y, w, h, fontSize, fontName: i.fontName || "", isHidden });
|
|
4639
|
+
}
|
|
4640
|
+
}
|
|
4641
|
+
const sorted = items.sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4642
|
+
const deduped = [];
|
|
4643
|
+
for (let i = 0; i < sorted.length; i++) {
|
|
4644
|
+
let isDup = false;
|
|
4645
|
+
for (let j = deduped.length - 1; j >= 0; j--) {
|
|
4646
|
+
const prev = deduped[j];
|
|
4647
|
+
if (prev.y - sorted[i].y > 3) break;
|
|
4648
|
+
if (Math.abs(prev.y - sorted[i].y) <= 3 && prev.text === sorted[i].text && Math.abs(prev.x - sorted[i].x) <= 3) {
|
|
4649
|
+
isDup = true;
|
|
4650
|
+
break;
|
|
4651
|
+
}
|
|
4652
|
+
}
|
|
4653
|
+
if (!isDup) deduped.push(sorted[i]);
|
|
4654
|
+
}
|
|
4655
|
+
if (spacePositions.length > 0) {
|
|
4656
|
+
for (const item of deduped) {
|
|
4657
|
+
for (const sp of spacePositions) {
|
|
4658
|
+
if (Math.abs(sp.y - item.y) <= 3) {
|
|
4659
|
+
const dist = item.x - sp.x;
|
|
4660
|
+
if (dist >= 0 && dist <= 20) {
|
|
4661
|
+
item.hasSpaceBefore = true;
|
|
4662
|
+
break;
|
|
4663
|
+
}
|
|
4664
|
+
}
|
|
4665
|
+
}
|
|
4666
|
+
}
|
|
4667
|
+
}
|
|
4668
|
+
return deduped;
|
|
4669
|
+
}
|
|
4670
|
+
function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
|
|
4671
|
+
if (!/^[가-힣\d](?: [가-힣\d]){2,}$/.test(text)) return null;
|
|
4672
|
+
const chars = text.split(" ");
|
|
4673
|
+
if (chars.length < 3) return null;
|
|
4674
|
+
const charW = itemW / chars.length;
|
|
4675
|
+
if (charW > fontSize * 2) return null;
|
|
4676
|
+
return chars.map((ch, idx) => ({
|
|
4677
|
+
text: ch,
|
|
4678
|
+
x: Math.round(itemX + idx * charW),
|
|
4679
|
+
w: Math.round(charW * 0.8)
|
|
4680
|
+
// 실제 글자 폭은 간격보다 좁음
|
|
4681
|
+
}));
|
|
3980
4682
|
}
|
|
3981
4683
|
function groupByY(items) {
|
|
3982
4684
|
if (items.length === 0) return [];
|
|
@@ -4001,14 +4703,14 @@ function isProseSpread(items) {
|
|
|
4001
4703
|
for (let i = 1; i < sorted.length; i++) {
|
|
4002
4704
|
gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
|
|
4003
4705
|
}
|
|
4004
|
-
const maxGap =
|
|
4706
|
+
const maxGap = safeMax(gaps);
|
|
4005
4707
|
const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
|
|
4006
4708
|
return maxGap < 40 && avgLen < 5;
|
|
4007
4709
|
}
|
|
4008
4710
|
function detectColumns(yLines) {
|
|
4009
4711
|
const allItems = yLines.flat();
|
|
4010
4712
|
if (allItems.length === 0) return null;
|
|
4011
|
-
const pageWidth =
|
|
4713
|
+
const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
|
|
4012
4714
|
if (pageWidth < 100) return null;
|
|
4013
4715
|
let bigoLineIdx = -1;
|
|
4014
4716
|
for (let i = 0; i < yLines.length; i++) {
|
|
@@ -4040,7 +4742,7 @@ function detectColumns(yLines) {
|
|
|
4040
4742
|
}
|
|
4041
4743
|
const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
|
|
4042
4744
|
if (peaks.length < 3) return null;
|
|
4043
|
-
const MERGE_TOL =
|
|
4745
|
+
const MERGE_TOL = 40;
|
|
4044
4746
|
const merged = [peaks[0]];
|
|
4045
4747
|
for (let i = 1; i < peaks.length; i++) {
|
|
4046
4748
|
const prev = merged[merged.length - 1];
|
|
@@ -4054,7 +4756,14 @@ function detectColumns(yLines) {
|
|
|
4054
4756
|
merged.push({ ...peaks[i] });
|
|
4055
4757
|
}
|
|
4056
4758
|
}
|
|
4057
|
-
const
|
|
4759
|
+
const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
|
|
4760
|
+
if (rawColumns.length < 3) return null;
|
|
4761
|
+
const MIN_DETECT_COL_WIDTH = 30;
|
|
4762
|
+
const columns = [rawColumns[0]];
|
|
4763
|
+
for (let i = 1; i < rawColumns.length; i++) {
|
|
4764
|
+
if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
|
|
4765
|
+
columns.push(rawColumns[i]);
|
|
4766
|
+
}
|
|
4058
4767
|
return columns.length >= 3 ? columns : null;
|
|
4059
4768
|
}
|
|
4060
4769
|
function findColumn(x, columns) {
|
|
@@ -4182,6 +4891,16 @@ function buildGridTable(lines, columns) {
|
|
|
4182
4891
|
}
|
|
4183
4892
|
merged.splice(0, headerEnd, headerRow);
|
|
4184
4893
|
}
|
|
4894
|
+
for (const row of merged) {
|
|
4895
|
+
for (let c = 0; c < row.length; c++) {
|
|
4896
|
+
if (row[c]) row[c] = collapseEvenSpacing(row[c]);
|
|
4897
|
+
}
|
|
4898
|
+
}
|
|
4899
|
+
const totalCells = merged.length * numCols;
|
|
4900
|
+
const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
|
|
4901
|
+
if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
|
|
4902
|
+
return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
|
|
4903
|
+
}
|
|
4185
4904
|
const md = [];
|
|
4186
4905
|
md.push("| " + merged[0].join(" | ") + " |");
|
|
4187
4906
|
md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
|
|
@@ -4193,12 +4912,32 @@ function buildGridTable(lines, columns) {
|
|
|
4193
4912
|
function mergeLineSimple(items) {
|
|
4194
4913
|
if (items.length <= 1) return items[0]?.text || "";
|
|
4195
4914
|
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
4915
|
+
const isEvenSpaced = detectEvenSpacedItems(sorted);
|
|
4196
4916
|
let result = sorted[0].text;
|
|
4197
4917
|
for (let i = 1; i < sorted.length; i++) {
|
|
4198
4918
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
4199
4919
|
const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
|
|
4200
|
-
|
|
4201
|
-
|
|
4920
|
+
const tabThreshold = Math.max(avgFs * 2, 30);
|
|
4921
|
+
if (gap > tabThreshold) {
|
|
4922
|
+
result += " ";
|
|
4923
|
+
result += sorted[i].text;
|
|
4924
|
+
continue;
|
|
4925
|
+
}
|
|
4926
|
+
if (isEvenSpaced[i]) {
|
|
4927
|
+
result += sorted[i].text;
|
|
4928
|
+
continue;
|
|
4929
|
+
}
|
|
4930
|
+
if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
|
|
4931
|
+
result += " ";
|
|
4932
|
+
result += sorted[i].text;
|
|
4933
|
+
continue;
|
|
4934
|
+
}
|
|
4935
|
+
if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
|
|
4936
|
+
result += " ";
|
|
4937
|
+
result += sorted[i].text;
|
|
4938
|
+
continue;
|
|
4939
|
+
}
|
|
4940
|
+
if (gap < avgFs * 0.15) {
|
|
4202
4941
|
} else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
|
|
4203
4942
|
} else if (gap > 3) result += " ";
|
|
4204
4943
|
result += sorted[i].text;
|
|
@@ -4207,8 +4946,8 @@ function mergeLineSimple(items) {
|
|
|
4207
4946
|
}
|
|
4208
4947
|
function cleanPdfText(text) {
|
|
4209
4948
|
return mergeKoreanLines(
|
|
4210
|
-
text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
|
|
4211
|
-
).replace(/^(?!\|)
|
|
4949
|
+
text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
|
|
4950
|
+
).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
|
|
4212
4951
|
}
|
|
4213
4952
|
function startsWithMarker(line) {
|
|
4214
4953
|
const t = line.trimStart();
|
|
@@ -4400,7 +5139,7 @@ function mergeKoreanLines(text) {
|
|
|
4400
5139
|
result[result.length - 1] = prev + " " + currTrimmed;
|
|
4401
5140
|
continue;
|
|
4402
5141
|
}
|
|
4403
|
-
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
|
|
5142
|
+
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev) && !startsWithMarker(prev)) {
|
|
4404
5143
|
result[result.length - 1] = prev + " " + curr;
|
|
4405
5144
|
} else {
|
|
4406
5145
|
result.push(curr);
|
|
@@ -4413,7 +5152,7 @@ function mergeKoreanLines(text) {
|
|
|
4413
5152
|
import { readFile } from "fs/promises";
|
|
4414
5153
|
|
|
4415
5154
|
// src/xlsx/parser.ts
|
|
4416
|
-
import
|
|
5155
|
+
import JSZip2 from "jszip";
|
|
4417
5156
|
import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
|
|
4418
5157
|
var MAX_SHEETS = 100;
|
|
4419
5158
|
var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024;
|
|
@@ -4451,7 +5190,7 @@ function getTextContent(el) {
|
|
|
4451
5190
|
return el.textContent?.trim() ?? "";
|
|
4452
5191
|
}
|
|
4453
5192
|
function parseXml(text) {
|
|
4454
|
-
return new DOMParser2().parseFromString(text, "text/xml");
|
|
5193
|
+
return new DOMParser2().parseFromString(stripDtd(text), "text/xml");
|
|
4455
5194
|
}
|
|
4456
5195
|
function parseSharedStrings(xml) {
|
|
4457
5196
|
const doc = parseXml(xml);
|
|
@@ -4604,7 +5343,7 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
4604
5343
|
}
|
|
4605
5344
|
async function parseXlsxDocument(buffer, options) {
|
|
4606
5345
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
|
|
4607
|
-
const zip = await
|
|
5346
|
+
const zip = await JSZip2.loadAsync(buffer);
|
|
4608
5347
|
const warnings = [];
|
|
4609
5348
|
const workbookFile = zip.file("xl/workbook.xml");
|
|
4610
5349
|
if (!workbookFile) {
|
|
@@ -4626,7 +5365,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
4626
5365
|
}
|
|
4627
5366
|
let pageFilter = null;
|
|
4628
5367
|
if (options?.pages) {
|
|
4629
|
-
const { parsePageRange: parsePageRange2 } = await import("./page-range-
|
|
5368
|
+
const { parsePageRange: parsePageRange2 } = await import("./page-range-737B4EZW.js");
|
|
4630
5369
|
pageFilter = parsePageRange2(options.pages, sheets.length);
|
|
4631
5370
|
}
|
|
4632
5371
|
const blocks = [];
|
|
@@ -4694,7 +5433,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
4694
5433
|
}
|
|
4695
5434
|
|
|
4696
5435
|
// src/docx/parser.ts
|
|
4697
|
-
import
|
|
5436
|
+
import JSZip3 from "jszip";
|
|
4698
5437
|
import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
|
|
4699
5438
|
var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
|
|
4700
5439
|
function getChildElements(parent, localName) {
|
|
@@ -4738,7 +5477,7 @@ function getAttr(el, localName) {
|
|
|
4738
5477
|
return null;
|
|
4739
5478
|
}
|
|
4740
5479
|
function parseXml2(text) {
|
|
4741
|
-
return new DOMParser3().parseFromString(text, "text/xml");
|
|
5480
|
+
return new DOMParser3().parseFromString(stripDtd(text), "text/xml");
|
|
4742
5481
|
}
|
|
4743
5482
|
function parseStyles(xml) {
|
|
4744
5483
|
const doc = parseXml2(xml);
|
|
@@ -5032,7 +5771,7 @@ async function extractImages(zip, rels, doc) {
|
|
|
5032
5771
|
}
|
|
5033
5772
|
async function parseDocxDocument(buffer, options) {
|
|
5034
5773
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
|
|
5035
|
-
const zip = await
|
|
5774
|
+
const zip = await JSZip3.loadAsync(buffer);
|
|
5036
5775
|
const warnings = [];
|
|
5037
5776
|
const docFile = zip.file("word/document.xml");
|
|
5038
5777
|
if (!docFile) {
|
|
@@ -5249,7 +5988,7 @@ function extractInlineFields(text) {
|
|
|
5249
5988
|
}
|
|
5250
5989
|
|
|
5251
5990
|
// src/hwpx/generator.ts
|
|
5252
|
-
import
|
|
5991
|
+
import JSZip4 from "jszip";
|
|
5253
5992
|
|
|
5254
5993
|
// src/index.ts
|
|
5255
5994
|
async function parse(input, options) {
|
|
@@ -5344,7 +6083,13 @@ function normalize(s) {
|
|
|
5344
6083
|
}
|
|
5345
6084
|
var MAX_LEVENSHTEIN_LEN = 1e4;
|
|
5346
6085
|
function levenshtein(a, b) {
|
|
5347
|
-
if (a.length + b.length > MAX_LEVENSHTEIN_LEN)
|
|
6086
|
+
if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
|
|
6087
|
+
const sampleLen = Math.min(500, a.length, b.length);
|
|
6088
|
+
let diffs = 0;
|
|
6089
|
+
for (let i = 0; i < sampleLen; i++) if (a[i] !== b[i]) diffs++;
|
|
6090
|
+
const sampleRate = sampleLen > 0 ? diffs / sampleLen : 1;
|
|
6091
|
+
return Math.abs(a.length - b.length) + Math.round(Math.min(a.length, b.length) * sampleRate);
|
|
6092
|
+
}
|
|
5348
6093
|
if (a.length > b.length) [a, b] = [b, a];
|
|
5349
6094
|
const m = a.length;
|
|
5350
6095
|
const n = b.length;
|
|
@@ -5500,7 +6245,10 @@ function diffTableCells(a, b) {
|
|
|
5500
6245
|
}
|
|
5501
6246
|
|
|
5502
6247
|
export {
|
|
5503
|
-
|
|
6248
|
+
VERSION,
|
|
6249
|
+
toArrayBuffer,
|
|
6250
|
+
KordocError,
|
|
6251
|
+
sanitizeError,
|
|
5504
6252
|
blocksToMarkdown,
|
|
5505
6253
|
extractHwpxMetadataOnly,
|
|
5506
6254
|
extractHwp5MetadataOnly,
|
|
@@ -5509,4 +6257,4 @@ export {
|
|
|
5509
6257
|
extractFormFields,
|
|
5510
6258
|
parse
|
|
5511
6259
|
};
|
|
5512
|
-
//# sourceMappingURL=chunk-
|
|
6260
|
+
//# sourceMappingURL=chunk-LYFG7AUT.js.map
|