kordoc 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-5Y2Q3BRW.js +52 -0
- package/dist/chunk-5Y2Q3BRW.js.map +1 -0
- package/dist/{chunk-GJ2S6IMC.js → chunk-LYFG7AUT.js} +966 -577
- package/dist/chunk-LYFG7AUT.js.map +1 -0
- package/dist/cli.js +12 -8
- package/dist/cli.js.map +1 -1
- package/dist/detect-GYK3HKD5.js +18 -0
- package/dist/index.cjs +993 -546
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +993 -546
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +40 -11
- package/dist/mcp.js.map +1 -1
- package/dist/{watch-X7IC7MLF.js → watch-Q5OXA73S.js} +31 -15
- package/dist/watch-Q5OXA73S.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-GJ2S6IMC.js.map +0 -1
- package/dist/chunk-PKIJLEV6.js +0 -93
- package/dist/chunk-PKIJLEV6.js.map +0 -1
- package/dist/utils-BWQ2RGUD.js +0 -22
- package/dist/watch-X7IC7MLF.js.map +0 -1
- /package/dist/{utils-BWQ2RGUD.js.map → detect-GYK3HKD5.js.map} +0 -0
|
@@ -1,53 +1,105 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import {
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
precheckZipSize,
|
|
7
|
-
sanitizeHref,
|
|
8
|
-
toArrayBuffer
|
|
9
|
-
} from "./chunk-PKIJLEV6.js";
|
|
3
|
+
detectFormat,
|
|
4
|
+
detectZipFormat
|
|
5
|
+
} from "./chunk-5Y2Q3BRW.js";
|
|
10
6
|
import {
|
|
11
7
|
parsePageRange
|
|
12
8
|
} from "./chunk-MOL7MDBG.js";
|
|
13
9
|
|
|
14
|
-
// src/
|
|
15
|
-
|
|
16
|
-
function
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
return b[0] === 80 && b[1] === 75 && b[2] === 3 && b[3] === 4;
|
|
22
|
-
}
|
|
23
|
-
function isOldHwpFile(buffer) {
|
|
24
|
-
const b = magicBytes(buffer);
|
|
25
|
-
return b[0] === 208 && b[1] === 207 && b[2] === 17 && b[3] === 224;
|
|
10
|
+
// src/utils.ts
|
|
11
|
+
var VERSION = true ? "2.2.0" : "0.0.0-dev";
|
|
12
|
+
function toArrayBuffer(buf) {
|
|
13
|
+
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
14
|
+
return buf.buffer;
|
|
15
|
+
}
|
|
16
|
+
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
26
17
|
}
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
18
|
+
var KordocError = class extends Error {
|
|
19
|
+
constructor(message) {
|
|
20
|
+
super(message);
|
|
21
|
+
this.name = "KordocError";
|
|
22
|
+
}
|
|
23
|
+
};
|
|
24
|
+
function sanitizeError(err) {
|
|
25
|
+
if (err instanceof KordocError) return err.message;
|
|
26
|
+
return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
|
|
30
27
|
}
|
|
31
|
-
function
|
|
32
|
-
if (
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
return "unknown";
|
|
28
|
+
function isPathTraversal(name) {
|
|
29
|
+
if (name.includes("\0")) return true;
|
|
30
|
+
const normalized = name.replace(/\\/g, "/");
|
|
31
|
+
const segments = normalized.split("/");
|
|
32
|
+
return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
37
33
|
}
|
|
38
|
-
|
|
34
|
+
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
39
35
|
try {
|
|
40
|
-
const
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
36
|
+
const data = new DataView(buffer);
|
|
37
|
+
const len = buffer.byteLength;
|
|
38
|
+
let eocdOffset = -1;
|
|
39
|
+
for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
|
|
40
|
+
if (data.getUint32(i, true) === 101010256) {
|
|
41
|
+
eocdOffset = i;
|
|
42
|
+
break;
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
46
|
+
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
47
|
+
if (entryCount > maxEntries) {
|
|
48
|
+
throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
|
|
49
|
+
}
|
|
50
|
+
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
51
|
+
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
52
|
+
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
53
|
+
let totalUncompressed = 0;
|
|
54
|
+
let pos = cdOffset;
|
|
55
|
+
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
56
|
+
if (data.getUint32(pos, true) !== 33639248) break;
|
|
57
|
+
totalUncompressed += data.getUint32(pos + 24, true);
|
|
58
|
+
const nameLen = data.getUint16(pos + 28, true);
|
|
59
|
+
const extraLen = data.getUint16(pos + 30, true);
|
|
60
|
+
const commentLen = data.getUint16(pos + 32, true);
|
|
61
|
+
pos += 46 + nameLen + extraLen + commentLen;
|
|
62
|
+
}
|
|
63
|
+
if (totalUncompressed > maxUncompressedSize) {
|
|
64
|
+
throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
|
|
65
|
+
}
|
|
66
|
+
return { totalUncompressed, entryCount };
|
|
67
|
+
} catch (err) {
|
|
68
|
+
if (err instanceof KordocError) throw err;
|
|
69
|
+
return { totalUncompressed: 0, entryCount: 0 };
|
|
49
70
|
}
|
|
50
71
|
}
|
|
72
|
+
function stripDtd(xml) {
|
|
73
|
+
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
74
|
+
}
|
|
75
|
+
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
76
|
+
function sanitizeHref(href) {
|
|
77
|
+
const trimmed = href.trim();
|
|
78
|
+
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
79
|
+
return trimmed;
|
|
80
|
+
}
|
|
81
|
+
function safeMin(arr) {
|
|
82
|
+
let min = Infinity;
|
|
83
|
+
for (let i = 0; i < arr.length; i++) if (arr[i] < min) min = arr[i];
|
|
84
|
+
return min;
|
|
85
|
+
}
|
|
86
|
+
function safeMax(arr) {
|
|
87
|
+
let max = -Infinity;
|
|
88
|
+
for (let i = 0; i < arr.length; i++) if (arr[i] > max) max = arr[i];
|
|
89
|
+
return max;
|
|
90
|
+
}
|
|
91
|
+
function classifyError(err) {
|
|
92
|
+
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
93
|
+
const msg = err.message;
|
|
94
|
+
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
95
|
+
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
96
|
+
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
97
|
+
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
98
|
+
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
99
|
+
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
100
|
+
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
101
|
+
return "PARSE_ERROR";
|
|
102
|
+
}
|
|
51
103
|
|
|
52
104
|
// src/table/builder.ts
|
|
53
105
|
var MAX_COLS = 200;
|
|
@@ -110,6 +162,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
110
162
|
if (end > maxCols) maxCols = end;
|
|
111
163
|
}
|
|
112
164
|
}
|
|
165
|
+
if (maxCols > MAX_COLS) maxCols = MAX_COLS;
|
|
113
166
|
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
114
167
|
const grid = Array.from(
|
|
115
168
|
{ length: numRows },
|
|
@@ -119,7 +172,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
119
172
|
for (const cell of row) {
|
|
120
173
|
const r = cell.rowAddr ?? 0;
|
|
121
174
|
const c = cell.colAddr ?? 0;
|
|
122
|
-
if (r >= numRows || c >= maxCols) continue;
|
|
175
|
+
if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
|
|
123
176
|
grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
|
|
124
177
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
125
178
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
@@ -304,9 +357,6 @@ function tableToMarkdown(table) {
|
|
|
304
357
|
if (dr === 0 && dc === 0) continue;
|
|
305
358
|
if (r + dr < numRows && c + dc < numCols) {
|
|
306
359
|
skip.add(`${r + dr},${c + dc}`);
|
|
307
|
-
if (dr === 0) {
|
|
308
|
-
display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
|
|
309
|
-
}
|
|
310
360
|
}
|
|
311
361
|
}
|
|
312
362
|
}
|
|
@@ -344,7 +394,7 @@ function tableToMarkdown(table) {
|
|
|
344
394
|
}
|
|
345
395
|
|
|
346
396
|
// src/hwpx/parser.ts
|
|
347
|
-
import
|
|
397
|
+
import JSZip from "jszip";
|
|
348
398
|
import { inflateRawSync } from "zlib";
|
|
349
399
|
import { DOMParser } from "@xmldom/xmldom";
|
|
350
400
|
|
|
@@ -446,14 +496,11 @@ function parseStyleElements(doc, map) {
|
|
|
446
496
|
}
|
|
447
497
|
}
|
|
448
498
|
}
|
|
449
|
-
function stripDtd(xml) {
|
|
450
|
-
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
451
|
-
}
|
|
452
499
|
async function parseHwpxDocument(buffer, options) {
|
|
453
500
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
454
501
|
let zip;
|
|
455
502
|
try {
|
|
456
|
-
zip = await
|
|
503
|
+
zip = await JSZip.loadAsync(buffer);
|
|
457
504
|
} catch {
|
|
458
505
|
return extractFromBrokenZip(buffer);
|
|
459
506
|
}
|
|
@@ -616,7 +663,7 @@ function parseDublinCoreMetadata(xml, metadata) {
|
|
|
616
663
|
async function extractHwpxMetadataOnly(buffer) {
|
|
617
664
|
let zip;
|
|
618
665
|
try {
|
|
619
|
-
zip = await
|
|
666
|
+
zip = await JSZip.loadAsync(buffer);
|
|
620
667
|
} catch {
|
|
621
668
|
throw new KordocError("HWPX ZIP\uC744 \uC5F4 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
622
669
|
}
|
|
@@ -811,7 +858,8 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
811
858
|
if (newTable.rows.length > 0) {
|
|
812
859
|
if (tableStack.length > 0) {
|
|
813
860
|
const parentTable = tableStack.pop();
|
|
814
|
-
|
|
861
|
+
let nestedCols = 0;
|
|
862
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
815
863
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
816
864
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
817
865
|
} else {
|
|
@@ -920,7 +968,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
920
968
|
if (newTable.rows.length > 0) {
|
|
921
969
|
if (tableStack.length > 0) {
|
|
922
970
|
const parentTable = tableStack.pop();
|
|
923
|
-
|
|
971
|
+
let nestedCols = 0;
|
|
972
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
924
973
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
925
974
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
926
975
|
} else {
|
|
@@ -2018,6 +2067,7 @@ function parseLenientCfb(data) {
|
|
|
2018
2067
|
if (miniSectorSizeShift > 16) throw new Error("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uBBF8\uB2C8 \uC139\uD130 \uD06C\uAE30 \uC2DC\uD504\uD2B8: " + miniSectorSizeShift);
|
|
2019
2068
|
const miniSectorSize = 1 << miniSectorSizeShift;
|
|
2020
2069
|
const fatSectorCount = data.readUInt32LE(44);
|
|
2070
|
+
if (fatSectorCount > 1e4) throw new Error("FAT \uC139\uD130 \uC218\uAC00 \uB108\uBB34 \uB9CE\uC2B5\uB2C8\uB2E4: " + fatSectorCount);
|
|
2021
2071
|
const firstDirSector = data.readUInt32LE(48);
|
|
2022
2072
|
const miniStreamCutoff = data.readUInt32LE(56);
|
|
2023
2073
|
const firstMiniFatSector = data.readUInt32LE(60);
|
|
@@ -2406,10 +2456,14 @@ function findSections(cfb) {
|
|
|
2406
2456
|
}
|
|
2407
2457
|
function findSectionsLenient(lcfb, compressed) {
|
|
2408
2458
|
const sections = [];
|
|
2459
|
+
let totalDecompressed = 0;
|
|
2409
2460
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2410
2461
|
const raw = lcfb.findStream(`/BodyText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2411
2462
|
if (!raw) break;
|
|
2412
|
-
|
|
2463
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2464
|
+
totalDecompressed += content.length;
|
|
2465
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2466
|
+
sections.push({ idx: i, content });
|
|
2413
2467
|
}
|
|
2414
2468
|
if (sections.length === 0) {
|
|
2415
2469
|
for (const e of lcfb.entries()) {
|
|
@@ -2417,7 +2471,12 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2417
2471
|
if (e.name.startsWith("Section")) {
|
|
2418
2472
|
const idx = parseInt(e.name.replace("Section", ""), 10) || 0;
|
|
2419
2473
|
const raw = lcfb.findStream(e.name);
|
|
2420
|
-
if (raw)
|
|
2474
|
+
if (raw) {
|
|
2475
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2476
|
+
totalDecompressed += content.length;
|
|
2477
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2478
|
+
sections.push({ idx, content });
|
|
2479
|
+
}
|
|
2421
2480
|
}
|
|
2422
2481
|
}
|
|
2423
2482
|
}
|
|
@@ -2425,11 +2484,15 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2425
2484
|
}
|
|
2426
2485
|
function findViewTextSectionsLenient(lcfb, compressed) {
|
|
2427
2486
|
const sections = [];
|
|
2487
|
+
let totalDecompressed = 0;
|
|
2428
2488
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2429
2489
|
const raw = lcfb.findStream(`/ViewText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2430
2490
|
if (!raw) break;
|
|
2431
2491
|
try {
|
|
2432
|
-
|
|
2492
|
+
const content = decryptViewText(raw, compressed);
|
|
2493
|
+
totalDecompressed += content.length;
|
|
2494
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2495
|
+
sections.push({ idx: i, content });
|
|
2433
2496
|
} catch {
|
|
2434
2497
|
break;
|
|
2435
2498
|
}
|
|
@@ -2828,37 +2891,18 @@ function arrangeCells(rows, cols, cells) {
|
|
|
2828
2891
|
// src/pdf/line-detector.ts
|
|
2829
2892
|
import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
2830
2893
|
var ORIENTATION_TOL = 2;
|
|
2831
|
-
var MIN_LINE_LENGTH =
|
|
2832
|
-
var
|
|
2894
|
+
var MIN_LINE_LENGTH = 15;
|
|
2895
|
+
var MAX_LINE_WIDTH = 5;
|
|
2833
2896
|
var CONNECT_TOL = 5;
|
|
2834
2897
|
var CELL_PADDING = 2;
|
|
2835
|
-
var
|
|
2836
|
-
var
|
|
2837
|
-
|
|
2838
|
-
|
|
2839
|
-
m1[0] * m2[0] + m1[2] * m2[1],
|
|
2840
|
-
m1[1] * m2[0] + m1[3] * m2[1],
|
|
2841
|
-
m1[0] * m2[2] + m1[2] * m2[3],
|
|
2842
|
-
m1[1] * m2[2] + m1[3] * m2[3],
|
|
2843
|
-
m1[0] * m2[4] + m1[2] * m2[5] + m1[4],
|
|
2844
|
-
m1[1] * m2[4] + m1[3] * m2[5] + m1[5]
|
|
2845
|
-
];
|
|
2846
|
-
}
|
|
2847
|
-
function matTransformPoint(m, x, y) {
|
|
2848
|
-
return [m[0] * x + m[2] * y + m[4], m[1] * x + m[3] * y + m[5]];
|
|
2849
|
-
}
|
|
2850
|
-
function matScale(m) {
|
|
2851
|
-
return Math.max(
|
|
2852
|
-
Math.sqrt(m[1] * m[1] + m[3] * m[3]),
|
|
2853
|
-
Math.sqrt(m[0] * m[0] + m[2] * m[2])
|
|
2854
|
-
);
|
|
2855
|
-
}
|
|
2898
|
+
var MIN_COL_WIDTH = 15;
|
|
2899
|
+
var MIN_ROW_HEIGHT = 6;
|
|
2900
|
+
var VERTEX_MERGE_FACTOR = 4;
|
|
2901
|
+
var MIN_COORD_MERGE_TOL = 8;
|
|
2856
2902
|
function extractLines(fnArray, argsArray) {
|
|
2857
2903
|
const horizontals = [];
|
|
2858
2904
|
const verticals = [];
|
|
2859
|
-
let ctm = [...IDENTITY];
|
|
2860
2905
|
let lineWidth = 1;
|
|
2861
|
-
const stateStack = [];
|
|
2862
2906
|
let currentPath = [];
|
|
2863
2907
|
let pathStartX = 0, pathStartY = 0;
|
|
2864
2908
|
let curX = 0, curY = 0;
|
|
@@ -2876,53 +2920,13 @@ function extractLines(fnArray, argsArray) {
|
|
|
2876
2920
|
);
|
|
2877
2921
|
}
|
|
2878
2922
|
}
|
|
2879
|
-
function
|
|
2880
|
-
if (
|
|
2881
|
-
const first = path[0], last = path[path.length - 1];
|
|
2882
|
-
const closed = Math.abs(first.x1 - last.x2) < 1 && Math.abs(first.y1 - last.y2) < 1;
|
|
2883
|
-
if (!closed) return false;
|
|
2884
|
-
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
2885
|
-
for (const seg of path) {
|
|
2886
|
-
minX = Math.min(minX, seg.x1, seg.x2);
|
|
2887
|
-
minY = Math.min(minY, seg.y1, seg.y2);
|
|
2888
|
-
maxX = Math.max(maxX, seg.x1, seg.x2);
|
|
2889
|
-
maxY = Math.max(maxY, seg.y1, seg.y2);
|
|
2890
|
-
}
|
|
2891
|
-
const w = maxX - minX, h = maxY - minY;
|
|
2892
|
-
if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;
|
|
2893
|
-
path.length = 0;
|
|
2894
|
-
if (h < ORIENTATION_TOL * 2 || w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH) {
|
|
2895
|
-
path.push({ x1: minX, y1: (minY + maxY) / 2, x2: maxX, y2: (minY + maxY) / 2 });
|
|
2896
|
-
} else if (w < ORIENTATION_TOL * 2 || h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH) {
|
|
2897
|
-
path.push({ x1: (minX + maxX) / 2, y1: minY, x2: (minX + maxX) / 2, y2: maxY });
|
|
2898
|
-
} else {
|
|
2899
|
-
pushRectangle(path, minX, minY, w, h);
|
|
2900
|
-
}
|
|
2901
|
-
return true;
|
|
2902
|
-
}
|
|
2903
|
-
function flushPath(isStroke, isFill) {
|
|
2904
|
-
if (!isStroke && !isFill) {
|
|
2905
|
-
currentPath = [];
|
|
2906
|
-
return;
|
|
2907
|
-
}
|
|
2908
|
-
if (isFill && !isStroke && currentPath.length >= 3) {
|
|
2909
|
-
tryConvertLinesToRectangle(currentPath);
|
|
2910
|
-
}
|
|
2911
|
-
const scale = matScale(ctm);
|
|
2912
|
-
const effectiveLW = lineWidth * scale;
|
|
2913
|
-
if (effectiveLW > MAX_LINE_WIDTH && isStroke && !isFill) {
|
|
2923
|
+
function flushPath(isStroke) {
|
|
2924
|
+
if (!isStroke) {
|
|
2914
2925
|
currentPath = [];
|
|
2915
2926
|
return;
|
|
2916
2927
|
}
|
|
2917
2928
|
for (const seg of currentPath) {
|
|
2918
|
-
|
|
2919
|
-
const [px2, py2] = matTransformPoint(ctm, seg.x2, seg.y2);
|
|
2920
|
-
classifyAndAdd(
|
|
2921
|
-
{ x1: px1, y1: py1, x2: px2, y2: py2 },
|
|
2922
|
-
effectiveLW,
|
|
2923
|
-
horizontals,
|
|
2924
|
-
verticals
|
|
2925
|
-
);
|
|
2929
|
+
classifyAndAdd(seg, lineWidth, horizontals, verticals);
|
|
2926
2930
|
}
|
|
2927
2931
|
currentPath = [];
|
|
2928
2932
|
}
|
|
@@ -2930,28 +2934,9 @@ function extractLines(fnArray, argsArray) {
|
|
|
2930
2934
|
const op = fnArray[i];
|
|
2931
2935
|
const args = argsArray[i];
|
|
2932
2936
|
switch (op) {
|
|
2933
|
-
// ── Graphics State ──
|
|
2934
|
-
case OPS.save:
|
|
2935
|
-
stateStack.push({ ctm: [...ctm], lineWidth });
|
|
2936
|
-
break;
|
|
2937
|
-
case OPS.restore:
|
|
2938
|
-
if (stateStack.length > 0) {
|
|
2939
|
-
const state = stateStack.pop();
|
|
2940
|
-
ctm = state.ctm;
|
|
2941
|
-
lineWidth = state.lineWidth;
|
|
2942
|
-
}
|
|
2943
|
-
break;
|
|
2944
|
-
case OPS.transform: {
|
|
2945
|
-
const m = args;
|
|
2946
|
-
if (m.length >= 6) {
|
|
2947
|
-
ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
|
|
2948
|
-
}
|
|
2949
|
-
break;
|
|
2950
|
-
}
|
|
2951
2937
|
case OPS.setLineWidth:
|
|
2952
2938
|
lineWidth = args[0] || 1;
|
|
2953
2939
|
break;
|
|
2954
|
-
// ── Path Construction ──
|
|
2955
2940
|
case OPS.constructPath: {
|
|
2956
2941
|
const arg0 = args[0];
|
|
2957
2942
|
if (Array.isArray(arg0)) {
|
|
@@ -3019,60 +3004,34 @@ function extractLines(fnArray, argsArray) {
|
|
|
3019
3004
|
}
|
|
3020
3005
|
}
|
|
3021
3006
|
}
|
|
3022
|
-
|
|
3023
|
-
|
|
3024
|
-
|
|
3025
|
-
|
|
3026
|
-
flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
|
|
3007
|
+
if (afterOp === OPS.stroke || afterOp === OPS.closeStroke) {
|
|
3008
|
+
flushPath(true);
|
|
3009
|
+
} else if (afterOp === OPS.fill || afterOp === OPS.eoFill || afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke) {
|
|
3010
|
+
flushPath(true);
|
|
3027
3011
|
} else if (afterOp === OPS.endPath) {
|
|
3028
|
-
flushPath(false
|
|
3012
|
+
flushPath(false);
|
|
3029
3013
|
}
|
|
3030
3014
|
}
|
|
3031
3015
|
break;
|
|
3032
3016
|
}
|
|
3033
|
-
// ── Paint Operations ──
|
|
3034
3017
|
case OPS.stroke:
|
|
3035
3018
|
case OPS.closeStroke:
|
|
3036
|
-
flushPath(true
|
|
3019
|
+
flushPath(true);
|
|
3037
3020
|
break;
|
|
3038
3021
|
case OPS.fill:
|
|
3039
3022
|
case OPS.eoFill:
|
|
3040
|
-
flushPath(false, true);
|
|
3041
|
-
break;
|
|
3042
3023
|
case OPS.fillStroke:
|
|
3043
3024
|
case OPS.eoFillStroke:
|
|
3044
3025
|
case OPS.closeFillStroke:
|
|
3045
3026
|
case OPS.closeEOFillStroke:
|
|
3046
|
-
flushPath(true
|
|
3027
|
+
flushPath(true);
|
|
3047
3028
|
break;
|
|
3048
3029
|
case OPS.endPath:
|
|
3049
|
-
flushPath(false
|
|
3030
|
+
flushPath(false);
|
|
3050
3031
|
break;
|
|
3051
3032
|
}
|
|
3052
3033
|
}
|
|
3053
|
-
return {
|
|
3054
|
-
horizontals: deduplicateLines(horizontals),
|
|
3055
|
-
verticals: deduplicateLines(verticals)
|
|
3056
|
-
};
|
|
3057
|
-
}
|
|
3058
|
-
function deduplicateLines(lines) {
|
|
3059
|
-
if (lines.length <= 1) return lines;
|
|
3060
|
-
const result = [];
|
|
3061
|
-
const tol = COORD_MERGE_TOL;
|
|
3062
|
-
for (const line of lines) {
|
|
3063
|
-
let isDuplicate = false;
|
|
3064
|
-
for (const existing of result) {
|
|
3065
|
-
if (Math.abs(line.y1 - existing.y1) <= tol && Math.abs(line.y2 - existing.y2) <= tol && Math.abs(line.x1 - existing.x1) <= tol && Math.abs(line.x2 - existing.x2) <= tol) {
|
|
3066
|
-
if (line.lineWidth > existing.lineWidth) {
|
|
3067
|
-
existing.lineWidth = line.lineWidth;
|
|
3068
|
-
}
|
|
3069
|
-
isDuplicate = true;
|
|
3070
|
-
break;
|
|
3071
|
-
}
|
|
3072
|
-
}
|
|
3073
|
-
if (!isDuplicate) result.push(line);
|
|
3074
|
-
}
|
|
3075
|
-
return result;
|
|
3034
|
+
return { horizontals, verticals };
|
|
3076
3035
|
}
|
|
3077
3036
|
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
3078
3037
|
const dx = Math.abs(seg.x2 - seg.x1);
|
|
@@ -3091,6 +3050,55 @@ function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
|
3091
3050
|
verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
|
|
3092
3051
|
}
|
|
3093
3052
|
}
|
|
3053
|
+
function preprocessLines(horizontals, verticals) {
|
|
3054
|
+
let h = horizontals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3055
|
+
let v = verticals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3056
|
+
h = mergeParallelLines(h, "h");
|
|
3057
|
+
v = mergeParallelLines(v, "v");
|
|
3058
|
+
return { horizontals: h, verticals: v };
|
|
3059
|
+
}
|
|
3060
|
+
function mergeParallelLines(lines, dir) {
|
|
3061
|
+
if (lines.length <= 1) return lines;
|
|
3062
|
+
const sorted = [...lines].sort((a, b) => {
|
|
3063
|
+
const posA = dir === "h" ? a.y1 : a.x1;
|
|
3064
|
+
const posB = dir === "h" ? b.y1 : b.x1;
|
|
3065
|
+
if (Math.abs(posA - posB) > 0.1) return posA - posB;
|
|
3066
|
+
return dir === "h" ? a.x1 - b.x1 : a.y1 - b.y1;
|
|
3067
|
+
});
|
|
3068
|
+
const MERGE_TOL = 3;
|
|
3069
|
+
const result = [sorted[0]];
|
|
3070
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
3071
|
+
const prev = result[result.length - 1];
|
|
3072
|
+
const curr = sorted[i];
|
|
3073
|
+
const prevPos = dir === "h" ? prev.y1 : prev.x1;
|
|
3074
|
+
const currPos = dir === "h" ? curr.y1 : curr.x1;
|
|
3075
|
+
if (Math.abs(prevPos - currPos) <= MERGE_TOL) {
|
|
3076
|
+
const prevStart = dir === "h" ? prev.x1 : prev.y1;
|
|
3077
|
+
const prevEnd = dir === "h" ? prev.x2 : prev.y2;
|
|
3078
|
+
const currStart = dir === "h" ? curr.x1 : curr.y1;
|
|
3079
|
+
const currEnd = dir === "h" ? curr.x2 : curr.y2;
|
|
3080
|
+
const overlap = Math.min(prevEnd, currEnd) - Math.max(prevStart, currStart);
|
|
3081
|
+
const minLen = Math.min(prevEnd - prevStart, currEnd - currStart);
|
|
3082
|
+
if (overlap > minLen * 0.3) {
|
|
3083
|
+
if (dir === "h") {
|
|
3084
|
+
prev.x1 = Math.min(prev.x1, curr.x1);
|
|
3085
|
+
prev.x2 = Math.max(prev.x2, curr.x2);
|
|
3086
|
+
prev.y1 = (prev.y1 + curr.y1) / 2;
|
|
3087
|
+
prev.y2 = prev.y1;
|
|
3088
|
+
} else {
|
|
3089
|
+
prev.y1 = Math.min(prev.y1, curr.y1);
|
|
3090
|
+
prev.y2 = Math.max(prev.y2, curr.y2);
|
|
3091
|
+
prev.x1 = (prev.x1 + curr.x1) / 2;
|
|
3092
|
+
prev.x2 = prev.x1;
|
|
3093
|
+
}
|
|
3094
|
+
prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
|
|
3095
|
+
continue;
|
|
3096
|
+
}
|
|
3097
|
+
}
|
|
3098
|
+
result.push(curr);
|
|
3099
|
+
}
|
|
3100
|
+
return result;
|
|
3101
|
+
}
|
|
3094
3102
|
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
3095
3103
|
const margin = 5;
|
|
3096
3104
|
return {
|
|
@@ -3102,8 +3110,49 @@ function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
|
3102
3110
|
)
|
|
3103
3111
|
};
|
|
3104
3112
|
}
|
|
3113
|
+
function buildVertices(horizontals, verticals) {
|
|
3114
|
+
const vertices = [];
|
|
3115
|
+
const tol = CONNECT_TOL;
|
|
3116
|
+
for (const h of horizontals) {
|
|
3117
|
+
for (const v of verticals) {
|
|
3118
|
+
if (v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol && h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol) {
|
|
3119
|
+
const radius = Math.max(h.lineWidth, v.lineWidth, 1);
|
|
3120
|
+
vertices.push({ x: v.x1, y: h.y1, radius });
|
|
3121
|
+
}
|
|
3122
|
+
}
|
|
3123
|
+
}
|
|
3124
|
+
return vertices;
|
|
3125
|
+
}
|
|
3126
|
+
function mergeVertices(vertices) {
|
|
3127
|
+
if (vertices.length <= 1) return vertices;
|
|
3128
|
+
const merged = [];
|
|
3129
|
+
const used = new Array(vertices.length).fill(false);
|
|
3130
|
+
for (let i = 0; i < vertices.length; i++) {
|
|
3131
|
+
if (used[i]) continue;
|
|
3132
|
+
let sumX = vertices[i].x, sumY = vertices[i].y;
|
|
3133
|
+
let maxRadius = vertices[i].radius;
|
|
3134
|
+
let count = 1;
|
|
3135
|
+
for (let j = i + 1; j < vertices.length; j++) {
|
|
3136
|
+
if (used[j]) continue;
|
|
3137
|
+
const mergeTol = VERTEX_MERGE_FACTOR * Math.max(maxRadius, vertices[j].radius);
|
|
3138
|
+
if (Math.abs(vertices[i].x - vertices[j].x) <= mergeTol && Math.abs(vertices[i].y - vertices[j].y) <= mergeTol) {
|
|
3139
|
+
sumX += vertices[j].x;
|
|
3140
|
+
sumY += vertices[j].y;
|
|
3141
|
+
maxRadius = Math.max(maxRadius, vertices[j].radius);
|
|
3142
|
+
count++;
|
|
3143
|
+
used[j] = true;
|
|
3144
|
+
}
|
|
3145
|
+
}
|
|
3146
|
+
merged.push({ x: sumX / count, y: sumY / count, radius: maxRadius });
|
|
3147
|
+
}
|
|
3148
|
+
return merged;
|
|
3149
|
+
}
|
|
3105
3150
|
function buildTableGrids(horizontals, verticals) {
|
|
3106
3151
|
if (horizontals.length < 2 || verticals.length < 2) return [];
|
|
3152
|
+
const allVertices = buildVertices(horizontals, verticals);
|
|
3153
|
+
const vertices = mergeVertices(allVertices);
|
|
3154
|
+
if (vertices.length < 4) return [];
|
|
3155
|
+
const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
|
|
3107
3156
|
const allLines = [
|
|
3108
3157
|
...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
|
|
3109
3158
|
...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
|
|
@@ -3114,21 +3163,74 @@ function buildTableGrids(horizontals, verticals) {
|
|
|
3114
3163
|
const hLines = group.filter((l) => l.type === "h");
|
|
3115
3164
|
const vLines = group.filter((l) => l.type === "v");
|
|
3116
3165
|
if (hLines.length < 2 || vLines.length < 2) continue;
|
|
3117
|
-
|
|
3118
|
-
const
|
|
3119
|
-
|
|
3120
|
-
|
|
3166
|
+
let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
|
|
3167
|
+
for (const l of vLines) {
|
|
3168
|
+
if (l.x1 < gx1) gx1 = l.x1;
|
|
3169
|
+
if (l.x1 > gx2) gx2 = l.x1;
|
|
3170
|
+
}
|
|
3171
|
+
for (const l of hLines) {
|
|
3172
|
+
if (l.y1 < gy1) gy1 = l.y1;
|
|
3173
|
+
if (l.y1 > gy2) gy2 = l.y1;
|
|
3174
|
+
}
|
|
3175
|
+
const groupBbox = {
|
|
3176
|
+
x1: gx1 - CONNECT_TOL,
|
|
3177
|
+
y1: gy1 - CONNECT_TOL,
|
|
3178
|
+
x2: gx2 + CONNECT_TOL,
|
|
3179
|
+
y2: gy2 + CONNECT_TOL
|
|
3180
|
+
};
|
|
3181
|
+
const groupVertices = vertices.filter(
|
|
3182
|
+
(v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
|
|
3183
|
+
);
|
|
3184
|
+
const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
|
|
3185
|
+
const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
|
|
3186
|
+
const rawYs = [
|
|
3187
|
+
...hLines.map((l) => l.y1),
|
|
3188
|
+
...groupVertices.map((v) => v.y)
|
|
3189
|
+
];
|
|
3190
|
+
const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
|
|
3191
|
+
const rawXs = [
|
|
3192
|
+
...vLines.map((l) => l.x1),
|
|
3193
|
+
...groupVertices.map((v) => v.x)
|
|
3194
|
+
];
|
|
3195
|
+
const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
|
|
3121
3196
|
if (rowYs.length < 2 || colXs.length < 2) continue;
|
|
3197
|
+
const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
|
|
3198
|
+
const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
|
|
3199
|
+
if (validRowYs.length < 2 || validColXs.length < 2) continue;
|
|
3122
3200
|
const bbox = {
|
|
3123
|
-
x1:
|
|
3124
|
-
y1:
|
|
3125
|
-
x2:
|
|
3126
|
-
y2:
|
|
3201
|
+
x1: validColXs[0],
|
|
3202
|
+
y1: validRowYs[validRowYs.length - 1],
|
|
3203
|
+
x2: validColXs[validColXs.length - 1],
|
|
3204
|
+
y2: validRowYs[0]
|
|
3127
3205
|
};
|
|
3128
|
-
grids.push({ rowYs, colXs, bbox });
|
|
3206
|
+
grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
|
|
3129
3207
|
}
|
|
3130
3208
|
return mergeAdjacentGrids(grids);
|
|
3131
3209
|
}
|
|
3210
|
+
function enforceMinWidth(colXs, minWidth) {
|
|
3211
|
+
if (colXs.length <= 2) return colXs;
|
|
3212
|
+
const result = [colXs[0]];
|
|
3213
|
+
for (let i = 1; i < colXs.length; i++) {
|
|
3214
|
+
const prevX = result[result.length - 1];
|
|
3215
|
+
if (colXs[i] - prevX < minWidth && i < colXs.length - 1) {
|
|
3216
|
+
continue;
|
|
3217
|
+
}
|
|
3218
|
+
result.push(colXs[i]);
|
|
3219
|
+
}
|
|
3220
|
+
return result;
|
|
3221
|
+
}
|
|
3222
|
+
function enforceMinHeight(rowYs, minHeight) {
|
|
3223
|
+
if (rowYs.length <= 2) return rowYs;
|
|
3224
|
+
const result = [rowYs[0]];
|
|
3225
|
+
for (let i = 1; i < rowYs.length; i++) {
|
|
3226
|
+
const prevY = result[result.length - 1];
|
|
3227
|
+
if (prevY - rowYs[i] < minHeight && i < rowYs.length - 1) {
|
|
3228
|
+
continue;
|
|
3229
|
+
}
|
|
3230
|
+
result.push(rowYs[i]);
|
|
3231
|
+
}
|
|
3232
|
+
return result;
|
|
3233
|
+
}
|
|
3132
3234
|
function mergeAdjacentGrids(grids) {
|
|
3133
3235
|
if (grids.length <= 1) return grids;
|
|
3134
3236
|
const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
@@ -3137,9 +3239,10 @@ function mergeAdjacentGrids(grids) {
|
|
|
3137
3239
|
const prev = merged[merged.length - 1];
|
|
3138
3240
|
const curr = sorted[i];
|
|
3139
3241
|
if (prev.colXs.length === curr.colXs.length) {
|
|
3140
|
-
const
|
|
3242
|
+
const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
|
|
3243
|
+
const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
|
|
3141
3244
|
const verticalGap = prev.bbox.y1 - curr.bbox.y2;
|
|
3142
|
-
if (colMatch && verticalGap >= -
|
|
3245
|
+
if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
|
|
3143
3246
|
const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
|
|
3144
3247
|
merged[merged.length - 1] = {
|
|
3145
3248
|
rowYs: allRowYs,
|
|
@@ -3149,7 +3252,8 @@ function mergeAdjacentGrids(grids) {
|
|
|
3149
3252
|
y1: Math.min(prev.bbox.y1, curr.bbox.y1),
|
|
3150
3253
|
x2: Math.max(prev.bbox.x2, curr.bbox.x2),
|
|
3151
3254
|
y2: Math.max(prev.bbox.y2, curr.bbox.y2)
|
|
3152
|
-
}
|
|
3255
|
+
},
|
|
3256
|
+
vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
|
|
3153
3257
|
};
|
|
3154
3258
|
continue;
|
|
3155
3259
|
}
|
|
@@ -3158,14 +3262,14 @@ function mergeAdjacentGrids(grids) {
|
|
|
3158
3262
|
}
|
|
3159
3263
|
return merged;
|
|
3160
3264
|
}
|
|
3161
|
-
function clusterCoordinates(values) {
|
|
3265
|
+
function clusterCoordinates(values, tolerance) {
|
|
3162
3266
|
if (values.length === 0) return [];
|
|
3163
3267
|
const sorted = [...values].sort((a, b) => a - b);
|
|
3164
3268
|
const clusters = [{ sum: sorted[0], count: 1 }];
|
|
3165
3269
|
for (let i = 1; i < sorted.length; i++) {
|
|
3166
3270
|
const last = clusters[clusters.length - 1];
|
|
3167
3271
|
const avg = last.sum / last.count;
|
|
3168
|
-
if (Math.abs(sorted[i] - avg) <=
|
|
3272
|
+
if (Math.abs(sorted[i] - avg) <= tolerance) {
|
|
3169
3273
|
last.sum += sorted[i];
|
|
3170
3274
|
last.count++;
|
|
3171
3275
|
} else {
|
|
@@ -3222,6 +3326,20 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3222
3326
|
const numRows = rowYs.length - 1;
|
|
3223
3327
|
const numCols = colXs.length - 1;
|
|
3224
3328
|
if (numRows <= 0 || numCols <= 0) return [];
|
|
3329
|
+
const vBorders = Array.from(
|
|
3330
|
+
{ length: numRows },
|
|
3331
|
+
(_, r) => Array.from(
|
|
3332
|
+
{ length: numCols + 1 },
|
|
3333
|
+
(_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
|
|
3334
|
+
)
|
|
3335
|
+
);
|
|
3336
|
+
const hBorders = Array.from(
|
|
3337
|
+
{ length: numRows + 1 },
|
|
3338
|
+
(_, r) => Array.from(
|
|
3339
|
+
{ length: numCols },
|
|
3340
|
+
(_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
|
|
3341
|
+
)
|
|
3342
|
+
);
|
|
3225
3343
|
const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
|
|
3226
3344
|
const cells = [];
|
|
3227
3345
|
for (let r = 0; r < numRows; r++) {
|
|
@@ -3229,18 +3347,26 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3229
3347
|
if (occupied[r][c]) continue;
|
|
3230
3348
|
let colSpan = 1;
|
|
3231
3349
|
let rowSpan = 1;
|
|
3232
|
-
while (c + colSpan < numCols) {
|
|
3233
|
-
|
|
3234
|
-
|
|
3235
|
-
|
|
3236
|
-
|
|
3350
|
+
while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
|
|
3351
|
+
let canExpand = true;
|
|
3352
|
+
for (let dr = 0; dr < rowSpan; dr++) {
|
|
3353
|
+
if (vBorders[r + dr][c + colSpan]) {
|
|
3354
|
+
canExpand = false;
|
|
3355
|
+
break;
|
|
3356
|
+
}
|
|
3357
|
+
}
|
|
3358
|
+
if (!canExpand) break;
|
|
3237
3359
|
colSpan++;
|
|
3238
3360
|
}
|
|
3239
3361
|
while (r + rowSpan < numRows) {
|
|
3240
|
-
|
|
3241
|
-
|
|
3242
|
-
|
|
3243
|
-
|
|
3362
|
+
let hasLine = false;
|
|
3363
|
+
for (let dc = 0; dc < colSpan; dc++) {
|
|
3364
|
+
if (hBorders[r + rowSpan][c + dc]) {
|
|
3365
|
+
hasLine = true;
|
|
3366
|
+
break;
|
|
3367
|
+
}
|
|
3368
|
+
}
|
|
3369
|
+
if (hasLine) break;
|
|
3244
3370
|
rowSpan++;
|
|
3245
3371
|
}
|
|
3246
3372
|
for (let dr = 0; dr < rowSpan; dr++) {
|
|
@@ -3264,28 +3390,30 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3264
3390
|
}
|
|
3265
3391
|
return cells;
|
|
3266
3392
|
}
|
|
3267
|
-
function hasVerticalLine(verticals, x, topY, botY) {
|
|
3268
|
-
const tol =
|
|
3393
|
+
/**
 * Reports whether any vertical segment lies on column `x` and covers at least
 * 75% of the span between `topY` and `botY`.
 *
 * A segment is considered to lie on the column when `|v.x1 - x|` is within
 * `max(VERTEX_MERGE_FACTOR * vertexRadius, 4)`. Spans shorter than 0.1 never
 * match.
 *
 * @param {Iterable<{x1: number, y1: number, y2: number}>} verticals - Candidate segments.
 * @param {number} x - Column coordinate to test.
 * @param {number} topY - One end of the cell span.
 * @param {number} botY - Other end of the cell span.
 * @param {number} vertexRadius - Radius used to scale the position tolerance.
 * @returns {boolean} True when a sufficiently overlapping segment exists.
 */
function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
  const tolerance = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
  const cellHeight = Math.abs(topY - botY);
  const requiredOverlap = cellHeight * 0.75;
  for (const segment of verticals) {
    if (!(Math.abs(segment.x1 - x) <= tolerance)) continue;
    // A degenerate (near zero-height) cell cannot be bounded by a line.
    if (cellHeight < 0.1) continue;
    const covered = Math.min(segment.y2, topY) - Math.max(segment.y1, botY);
    if (covered >= requiredOverlap) return true;
  }
  return false;
}
|
|
3280
|
-
function hasHorizontalLine(horizontals, y, leftX, rightX) {
|
|
3281
|
-
const tol =
|
|
3407
|
+
function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
|
|
3408
|
+
const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
|
|
3282
3409
|
for (const h of horizontals) {
|
|
3283
3410
|
if (Math.abs(h.y1 - y) <= tol) {
|
|
3284
3411
|
const cellW = Math.abs(rightX - leftX);
|
|
3412
|
+
if (cellW < 0.1) continue;
|
|
3285
3413
|
const overlapLeft = Math.max(h.x1, leftX);
|
|
3286
3414
|
const overlapRight = Math.min(h.x2, rightX);
|
|
3287
3415
|
const overlap = overlapRight - overlapLeft;
|
|
3288
|
-
if (overlap >= cellW * 0.
|
|
3416
|
+
if (overlap >= cellW * 0.75) return true;
|
|
3289
3417
|
}
|
|
3290
3418
|
}
|
|
3291
3419
|
return false;
|
|
@@ -3296,23 +3424,24 @@ function mapTextToCells(items, cells) {
|
|
|
3296
3424
|
result.set(cell, []);
|
|
3297
3425
|
}
|
|
3298
3426
|
for (const item of items) {
|
|
3299
|
-
const cx = item.x + item.w / 2;
|
|
3300
|
-
const cy = item.y;
|
|
3301
3427
|
const pad = CELL_PADDING;
|
|
3302
3428
|
let bestCell = null;
|
|
3303
|
-
let
|
|
3429
|
+
let bestScore = 0;
|
|
3304
3430
|
for (const cell of cells) {
|
|
3305
|
-
|
|
3306
|
-
|
|
3307
|
-
|
|
3308
|
-
|
|
3309
|
-
|
|
3310
|
-
|
|
3311
|
-
|
|
3312
|
-
|
|
3431
|
+
const ix1 = Math.max(item.x, cell.bbox.x1 - pad);
|
|
3432
|
+
const ix2 = Math.min(item.x + item.w, cell.bbox.x2 + pad);
|
|
3433
|
+
const iy1 = Math.max(item.y, cell.bbox.y1 - pad);
|
|
3434
|
+
const iy2 = Math.min(item.y + (item.h || item.fontSize), cell.bbox.y2 + pad);
|
|
3435
|
+
if (ix1 >= ix2 || iy1 >= iy2) continue;
|
|
3436
|
+
const intersectArea = (ix2 - ix1) * (iy2 - iy1);
|
|
3437
|
+
const itemArea = Math.max(item.w, 1) * Math.max(item.h || item.fontSize, 1);
|
|
3438
|
+
const score = intersectArea / itemArea;
|
|
3439
|
+
if (score > bestScore) {
|
|
3440
|
+
bestScore = score;
|
|
3441
|
+
bestCell = cell;
|
|
3313
3442
|
}
|
|
3314
3443
|
}
|
|
3315
|
-
if (bestCell) {
|
|
3444
|
+
if (bestCell && bestScore > 0.3) {
|
|
3316
3445
|
result.get(bestCell).push(item);
|
|
3317
3446
|
}
|
|
3318
3447
|
}
|
|
@@ -3339,8 +3468,13 @@ function cellTextToString(items) {
|
|
|
3339
3468
|
const textLines = lines.map((line) => {
|
|
3340
3469
|
const s = line.sort((a, b) => a.x - b.x);
|
|
3341
3470
|
if (s.length === 1) return s[0].text;
|
|
3471
|
+
const evenSpaced = detectEvenSpacedItems(s);
|
|
3342
3472
|
let result = s[0].text;
|
|
3343
3473
|
for (let j = 1; j < s.length; j++) {
|
|
3474
|
+
if (evenSpaced[j]) {
|
|
3475
|
+
result += s[j].text;
|
|
3476
|
+
continue;
|
|
3477
|
+
}
|
|
3344
3478
|
const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
|
|
3345
3479
|
const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
|
|
3346
3480
|
const prevIsKorean = /[가-힣]$/.test(result);
|
|
@@ -3355,6 +3489,57 @@ function cellTextToString(items) {
|
|
|
3355
3489
|
}
|
|
3356
3490
|
return result;
|
|
3357
3491
|
});
|
|
3492
|
+
return mergeCellTextLines(textLines);
|
|
3493
|
+
}
|
|
3494
|
+
/**
 * Flags items that belong to a run of evenly spaced single-character tokens.
 *
 * A token qualifies when its text is exactly one Hangul syllable or one digit.
 * Consecutive qualifying tokens form a run; a run is closed when a
 * non-qualifying token appears, when the gap to the previous token exceeds
 * `max(fontSize * 3, 30)`, or at the end of the list. Closed runs of three or
 * more tokens are handed to `markEvenRun`, which decides whether to set flags.
 *
 * @param {Array<{text: string, x: number, w: number, fontSize: number}>} items
 *   Items of one line, expected in left-to-right order.
 * @returns {boolean[]} One flag per item; all false when fewer than 3 items.
 */
function detectEvenSpacedItems(items) {
  const flags = new Array(items.length).fill(false);
  if (items.length < 3) return flags;

  // Hands a finished run to markEvenRun when it is long enough to judge.
  const closeRun = (from, to) => {
    if (from >= 0 && to - from >= 3) markEvenRun(items, flags, from, to);
  };

  let runStart = -1;
  for (let idx = 0; idx < items.length; idx++) {
    const current = items[idx];
    const isSingleGlyph = /^[가-힣]{1}$/.test(current.text) || /^[\d]{1}$/.test(current.text);

    if (!isSingleGlyph) {
      closeRun(runStart, idx);
      runStart = -1;
      continue;
    }
    if (runStart < 0) {
      runStart = idx;
      continue;
    }

    // Inside a run: an oversized gap ends it and starts a new one here.
    const previous = items[idx - 1];
    const gap = current.x - (previous.x + previous.w);
    const maxRunGap = Math.max(current.fontSize * 3, 30);
    if (gap > maxRunGap) {
      closeRun(runStart, idx);
      runStart = idx;
    }
  }
  closeRun(runStart, items.length);
  return flags;
}
|
|
3523
|
+
/**
 * Marks `result[start + 1 .. end - 1]` as true when the items in
 * `[start, end)` are spaced evenly enough.
 *
 * Only strictly positive gaps are considered, and at least two are required.
 * With `fs = items[start].fontSize`, the run counts as even when the smallest
 * gap is at least `fs * 0.1`, the largest is at most `fs * 3`, and the largest
 * is no more than three times the smallest (smallest floored at 0.1).
 * `result[start]` itself is never touched.
 *
 * @param {Array<{x: number, w: number, fontSize: number}>} items - Line items.
 * @param {boolean[]} result - Flag array mutated in place.
 * @param {number} start - Index of the first item of the run (inclusive).
 * @param {number} end - Index just past the run (exclusive).
 * @returns {void}
 */
function markEvenRun(items, result, start, end) {
  const positiveGaps = [];
  for (let idx = start + 1; idx < end; idx++) {
    const previous = items[idx - 1];
    const gap = items[idx].x - (previous.x + previous.w);
    if (gap > 0) positiveGaps.push(gap);
  }
  if (positiveGaps.length < 2) return;

  const smallest = positiveGaps.reduce((lo, gap) => (gap < lo ? gap : lo), Infinity);
  const largest = positiveGaps.reduce((hi, gap) => (gap > hi ? gap : hi), -Infinity);
  const fontSize = items[start].fontSize;

  const withinFontBounds = smallest >= fontSize * 0.1 && largest <= fontSize * 3;
  const evenlySpaced = withinFontBounds && largest / Math.max(smallest, 0.1) <= 3;
  if (!evenlySpaced) return;

  for (let idx = start + 1; idx < end; idx++) {
    result[idx] = true;
  }
}
|
|
3542
|
+
function mergeCellTextLines(textLines) {
|
|
3358
3543
|
if (textLines.length <= 1) return textLines[0] || "";
|
|
3359
3544
|
const merged = [textLines[0]];
|
|
3360
3545
|
for (let i = 1; i < textLines.length; i++) {
|
|
@@ -3380,24 +3565,172 @@ var Y_TOL = 3;
|
|
|
3380
3565
|
var COL_CLUSTER_TOL = 15;
|
|
3381
3566
|
var MIN_ROWS = 3;
|
|
3382
3567
|
var MIN_COLS = 2;
|
|
3383
|
-
var MIN_GAP_FACTOR =
|
|
3384
|
-
var
|
|
3568
|
+
var MIN_GAP_FACTOR = 2;
|
|
3569
|
+
var MIN_GAP_ABSOLUTE = 20;
|
|
3570
|
+
var MIN_COL_FILL_RATIO = 0.4;
|
|
3385
3571
|
/**
 * Detects borderless tables on a page from text item positions.
 *
 * Items are first passed through `mergeEvenSpacedClusters` and grouped into
 * baseline rows. Detection then runs in two stages:
 *   1. Header-based: if `detectHeaderRow` finds a header, rows from the header
 *      downward are merged and split into regions by header column ranges.
 *   2. Gap-based fallback: used only when stage 1 produced no table; rows with
 *      suspicious gaps define the column clusters.
 * For every table built, `expandUsedItems` is called on its `usedItems` with
 * the origin map returned by `mergeEvenSpacedClusters`.
 *
 * @param {Array<object>} items - Text items of one page.
 * @param {number} pageNum - Page number forwarded to `buildClusterTable`.
 * @returns {Array<object>} Detected tables (possibly empty).
 */
function detectClusterTables(items, pageNum) {
  if (items.length < MIN_ROWS * MIN_COLS) return [];
  const { merged, originMap } = mergeEvenSpacedClusters(items);
  const rows = groupByBaseline(merged);
  if (rows.length < MIN_ROWS) return [];

  const results = [];
  // Registers a built table, restoring the original items behind merged ones.
  const accept = (table) => {
    if (!table) return;
    expandUsedItems(table.usedItems, originMap);
    results.push(table);
  };

  const headerResult = detectHeaderRow(rows);
  if (headerResult) {
    const { columns, headerIdx } = headerResult;
    const headerItems = [...rows[headerIdx].items].sort((a, b) => a.x - b.x);
    const mergedRows = mergeMultiLineRows(rows.slice(headerIdx), columns);
    for (const region of findTableRegionsByHeader(mergedRows, columns, headerItems)) {
      accept(buildClusterTable(region.rows, columns, pageNum));
    }
  }
  if (results.length > 0) return results;

  // Fallback: infer columns from rows that contain wide horizontal gaps.
  const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
  if (suspiciousRows.length < MIN_ROWS) return results;
  const columns = extractColumnClusters(suspiciousRows);
  if (columns.length < MIN_COLS) return results;
  for (const region of findTableRegions(rows, columns)) {
    const mergedRows = mergeMultiLineRows(region.rows, columns);
    accept(buildClusterTable(mergedRows, columns, pageNum));
  }
  return results;
}
|
|
3612
|
+
/**
 * Collapses runs of evenly spaced single-character items into one item each.
 *
 * Items are grouped with `groupByBaseline` and sorted left to right per row.
 * A run consists of consecutive items whose text is a single Hangul syllable
 * or digit, where each gap lies within `[fontSize * 0.1, fontSize * 3]`.
 * Runs of three or more items whose largest gap is at most three times the
 * smallest (and whose smallest gap is positive) become a single synthetic
 * item: concatenated text, position/height/font taken from the first item,
 * width spanning to the right edge of the last item. Everything else is
 * passed through unchanged.
 *
 * @param {Array<object>} items - Text items with `text`, `x`, `y`, `w`, `h`,
 *   `fontSize` and `fontName`.
 * @returns {{merged: Array<object>, originMap: Map<object, Array<object>>}}
 *   The resulting item list and a map from each synthetic item to the
 *   original items it replaced.
 */
function mergeEvenSpacedClusters(items) {
  const originMap = new Map();
  const merged = [];

  for (const row of groupByBaseline(items)) {
    const ordered = [...row.items].sort((a, b) => a.x - b.x);
    // Gap between the item at `idx` and its left neighbour.
    const gapBefore = (idx) => ordered[idx].x - (ordered[idx - 1].x + ordered[idx - 1].w);

    let pos = 0;
    while (pos < ordered.length) {
      let stop = pos;
      if (/^[가-힣\d]$/.test(ordered[pos].text)) {
        stop = pos + 1;
        while (stop < ordered.length && /^[가-힣\d]$/.test(ordered[stop].text)) {
          const gap = gapBefore(stop);
          const fs = ordered[stop].fontSize;
          if (gap < fs * 0.1 || gap > fs * 3) break;
          stop++;
        }
      }

      if (stop - pos >= 3) {
        let smallest = Infinity;
        let largest = -Infinity;
        for (let idx = pos + 1; idx < stop; idx++) {
          const gap = gapBefore(idx);
          if (gap < smallest) smallest = gap;
          if (gap > largest) largest = gap;
        }
        if (smallest > 0 && largest / smallest <= 3) {
          const run = ordered.slice(pos, stop);
          const first = run[0];
          const last = run[run.length - 1];
          const item = {
            text: run.map((part) => part.text).join(""),
            x: first.x,
            y: first.y,
            w: last.x + last.w - first.x,
            h: first.h,
            fontSize: first.fontSize,
            fontName: first.fontName
          };
          originMap.set(item, run);
          merged.push(item);
          pos = stop;
          continue;
        }
      }

      // Not part of a mergeable run: keep the item as it is.
      merged.push(ordered[pos]);
      pos++;
    }
  }
  return { merged, originMap };
}
|
|
3664
|
+
/**
 * Adds to `usedItems` the original items behind every entry that has a
 * mapping in `originMap`.
 *
 * The additions are collected first and applied afterwards, so the set is not
 * modified while it is being walked. Existing entries are kept.
 *
 * @param {Set<object>} usedItems - Set mutated in place.
 * @param {Map<object, Iterable<object>>} originMap - Maps an item to the items
 *   it was built from.
 * @returns {void}
 */
function expandUsedItems(usedItems, originMap) {
  const pending = [];
  usedItems.forEach((item) => {
    const origins = originMap.get(item);
    if (!origins) return;
    for (const origin of origins) pending.push(origin);
  });
  pending.forEach((origin) => {
    usedItems.add(origin);
  });
}
|
|
3672
|
+
/**
 * Searches the rows top-down for the first row that looks like a table header.
 *
 * A row qualifies when all of the following hold:
 *   - it has between `MIN_COLS` and 6 items, none longer than 8 characters;
 *   - at least one item contains a Hangul syllable;
 *   - no item starts with a bullet-like marker;
 *   - its horizontal extent covers at least 40% of the extent of all items;
 *   - at least one gap between neighbours is >= 2.5 x the row's mean font size;
 *   - at least `MIN_ROWS` of the following rows match `MIN_COLS` or more of
 *     its column ranges (scanning stops once `MIN_ROWS + 2` matches are seen).
 *
 * @param {Array<{items: Array<object>}>} rows - Baseline rows.
 * @returns {{columns: Array<{x: number, count: number}>, headerIdx: number}|null}
 *   Column seeds (one per header item, left to right) and the header's row
 *   index, or null when no row qualifies.
 */
function detectHeaderRow(rows) {
  const everyItem = rows.flatMap((r) => r.items);
  if (everyItem.length === 0) return null;

  let leftMost = Infinity;
  let rightMost = -Infinity;
  for (const item of everyItem) {
    if (item.x < leftMost) leftMost = item.x;
    const rightEdge = item.x + item.w;
    if (rightEdge > rightMost) rightMost = rightEdge;
  }
  const pageSpan = rightMost - leftMost;
  if (pageSpan <= 0) return null;

  for (const [ri, row] of rows.entries()) {
    const candidates = row.items;
    if (candidates.length < MIN_COLS || candidates.length > 6) continue;
    if (candidates.some((i) => i.text.length > 8)) continue;
    if (!candidates.some((i) => /[가-힣]/.test(i.text))) continue;
    if (candidates.some((i) => /^[□■○●·※▶▷◆◇\-]/.test(i.text))) continue;

    const sorted = [...candidates].sort((a, b) => a.x - b.x);
    const rightmostItem = sorted[sorted.length - 1];
    const xSpan = rightmostItem.x + rightmostItem.w - sorted[0].x;
    if (xSpan / pageSpan < 0.4) continue;

    const avgFs = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
    const hasLargeGap = sorted.some(
      (item, k) => k > 0 && item.x - (sorted[k - 1].x + sorted[k - 1].w) >= avgFs * 2.5
    );
    if (!hasLargeGap) continue;

    // Confirm the candidate by checking that enough rows below line up with it.
    const columns = sorted.map((item) => ({ x: item.x, count: 0 }));
    let matchCount = 0;
    for (let j = ri + 1; j < rows.length && matchCount < MIN_ROWS + 2; j++) {
      if (countMatchedColumnsRange(rows[j], columns, sorted) >= MIN_COLS) matchCount++;
    }
    if (matchCount >= MIN_ROWS) return { columns, headerIdx: ri };
  }
  return null;
}
|
|
3713
|
+
/**
 * Folds short continuation rows into the row above them.
 *
 * A row is folded into the last kept row when it is vertically close (less
 * than 1.8 x the mean font size of all items), has at most two items, and
 * either matches fewer than `MIN_COLS` columns or consists of a single item.
 * The combined row keeps the upper row's `y` and lists the upper row's items
 * followed by the folded row's items. Input with one row or none is returned
 * unchanged (the same array instance).
 *
 * @param {Array<{y: number, items: Array<object>}>} rows - Rows in reading order.
 * @param {Array<object>} columns - Column clusters passed to `countMatchedColumns`.
 * @returns {Array<{y: number, items: Array<object>}>} Rows after folding.
 */
function mergeMultiLineRows(rows, columns) {
  if (rows.length <= 1) return rows;

  const fontSizes = rows.flatMap((r) => r.items).map((i) => i.fontSize);
  // 12 is the fallback when the rows contain no items at all.
  const avgFontSize = fontSizes.length > 0
    ? fontSizes.reduce((s, v) => s + v, 0) / fontSizes.length
    : 12;

  const [firstRow, ...remaining] = rows;
  const folded = [firstRow];
  for (const curr of remaining) {
    const prev = folded[folded.length - 1];
    const yGap = Math.abs(prev.y - curr.y);
    const matchedCols = countMatchedColumns(curr, columns);
    const isClose = yGap < avgFontSize * 1.8;
    const isShort = curr.items.length <= 2;
    const isContinuation = matchedCols < MIN_COLS || curr.items.length === 1;
    if (isClose && isShort && isContinuation) {
      folded[folded.length - 1] = { y: prev.y, items: [...prev.items, ...curr.items] };
    } else {
      folded.push(curr);
    }
  }
  return folded;
}
|
|
3401
3734
|
function groupByBaseline(items) {
|
|
3402
3735
|
if (items.length === 0) return [];
|
|
3403
3736
|
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
@@ -3419,8 +3752,9 @@ function groupByBaseline(items) {
|
|
|
3419
3752
|
function hasSuspiciousGaps(row) {
|
|
3420
3753
|
if (row.items.length < 2) return false;
|
|
3421
3754
|
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3755
|
+
if (sorted.length === 2 && sorted[1].text.length > 20) return false;
|
|
3422
3756
|
const avgFontSize = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
|
|
3423
|
-
const minGap = avgFontSize * MIN_GAP_FACTOR;
|
|
3757
|
+
const minGap = Math.max(avgFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
|
|
3424
3758
|
for (let i = 1; i < sorted.length; i++) {
|
|
3425
3759
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
3426
3760
|
if (gap >= minGap) return true;
|
|
@@ -3447,6 +3781,41 @@ function extractColumnClusters(rows) {
|
|
|
3447
3781
|
const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
|
|
3448
3782
|
return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
|
|
3449
3783
|
}
|
|
3784
|
+
/**
 * Splits rows into contiguous table regions using header column ranges.
 *
 * A row "matches" when `countMatchedColumnsRange` reports at least `MIN_COLS`
 * columns. Matching rows extend the current region. A non-matching row is
 * still absorbed when a region is open and either the row has at most two
 * items or it is the first miss after a match. Any other row closes the
 * region and is itself discarded. When a region closes (or the input ends),
 * trailing non-matching rows are trimmed and the region is kept only if at
 * least `MIN_ROWS` rows remain.
 *
 * @param {Array<{items: Array<object>}>} allRows - Candidate rows in order.
 * @param {Array<object>} columns - Column seeds forwarded to the matcher.
 * @param {Array<object>} headerItems - Header items defining column ranges.
 * @returns {Array<{rows: Array<object>}>} Regions found, in order.
 */
function findTableRegionsByHeader(allRows, columns, headerItems) {
  const isMatch = (row) => countMatchedColumnsRange(row, columns, headerItems) >= MIN_COLS;
  const regions = [];
  let pending = [];
  let missStreak = 0;

  // Closes the open region: trim unmatched tail rows, keep it if long enough.
  const flush = () => {
    while (pending.length > 0 && !isMatch(pending[pending.length - 1])) {
      pending.pop();
    }
    if (pending.length >= MIN_ROWS) {
      regions.push({ rows: pending });
    }
    pending = [];
    missStreak = 0;
  };

  for (const row of allRows) {
    if (isMatch(row)) {
      pending.push(row);
      missStreak = 0;
      continue;
    }
    const tolerated = pending.length > 0 && (row.items.length <= 2 || missStreak === 0);
    if (tolerated) {
      pending.push(row);
      missStreak++;
    } else {
      flush();
    }
  }
  flush();
  return regions;
}
|
|
3450
3819
|
function findTableRegions(allRows, columns) {
|
|
3451
3820
|
const regions = [];
|
|
3452
3821
|
let currentRegion = [];
|
|
@@ -3482,18 +3851,81 @@ function countMatchedColumns(row, columns) {
|
|
|
3482
3851
|
}
|
|
3483
3852
|
return matched.size;
|
|
3484
3853
|
}
|
|
3485
|
-
function
|
|
3486
|
-
const
|
|
3487
|
-
let
|
|
3488
|
-
|
|
3489
|
-
|
|
3490
|
-
|
|
3491
|
-
|
|
3492
|
-
|
|
3493
|
-
|
|
3854
|
+
/**
 * Counts how many distinct header columns receive at least one item of `row`.
 *
 * Each header item defines a half-open x-range `[left, right)`. Interior
 * boundaries sit halfway between the right edge of one header item and the
 * left edge of the next; the first range starts at 0 and the last extends to
 * Infinity. An item is assigned by its `x` (left edge) to the first range
 * containing it; items with `x < 0` fall into no range.
 *
 * @param {{items: Array<{x: number}>}} row - Row whose items are classified.
 * @param {Array<object>} columns - Not used by this function; kept so the
 *   signature parallels the other column matchers.
 * @param {Array<{x: number, w: number}>} headerItems - Header items sorted
 *   left to right.
 * @returns {number} Number of distinct columns hit.
 */
function countMatchedColumnsRange(row, columns, headerItems) {
  const lastHeader = headerItems.length - 1;
  const rightEdge = (header) => header.x + header.w;
  const bands = headerItems.map((header, ci) => ({
    left: ci === 0 ? 0 : (rightEdge(headerItems[ci - 1]) + header.x) / 2,
    right: ci === lastHeader ? Infinity : (rightEdge(header) + headerItems[ci + 1].x) / 2
  }));

  const hitColumns = row.items
    .map((item) => bands.findIndex((band) => item.x >= band.left && item.x < band.right))
    .filter((ci) => ci !== -1);
  return new Set(hitColumns).size;
}
|
|
3872
|
+
/**
 * Splits the items of one row into groups and assigns each group to a column.
 *
 * Steps:
 *   1. Sort items left to right and measure the gap before each item.
 *   2. Pick a gap threshold: 12 when the row has at most `numCols + 1` items,
 *      otherwise `max(2.5 x median gap, 12)`.
 *   3. Cut the row at the (up to `numCols - 1`) widest gaps that reach the
 *      threshold, producing groups.
 *   4. Greedily pair groups with columns in order of increasing distance
 *      between the group's horizontal midpoint and the column's `x`, using
 *      each group and each column at most once.
 *   5. Any group left unpaired goes to its nearest column (column 0 if there
 *      is none to compare against).
 *
 * @param {Array<{x: number, w: number}>} items - Items of one table row.
 * @param {Array<{x: number}>} columns - Column clusters.
 * @param {number} numCols - Number of columns to assign into.
 * @returns {Array<{col: number, items: Array<object>}>} Assignments, in the
 *   order they were made (greedy pairs first, then leftovers).
 */
function assignRowItems(items, columns, numCols) {
  if (items.length === 0) return [];
  const ordered = [...items].sort((a, b) => a.x - b.x);
  const colCenters = columns.map((c) => c.x);

  const gaps = ordered.slice(1).map((item, k) => ({
    idx: k + 1,
    size: item.x - (ordered[k].x + ordered[k].w)
  }));
  const ascendingSizes = gaps.map((g) => g.size).sort((a, b) => a - b);
  const medianGap = ascendingSizes.length > 0
    ? ascendingSizes[Math.floor(ascendingSizes.length / 2)]
    : 0;
  const gapThreshold = ordered.length <= numCols + 1 ? 12 : Math.max(medianGap * 2.5, 12);

  // Keep only the widest qualifying gaps, then restore left-to-right order.
  const cutPoints = gaps
    .filter((g) => g.size >= gapThreshold)
    .sort((a, b) => b.size - a.size)
    .slice(0, numCols - 1)
    .map((g) => g.idx)
    .sort((a, b) => a - b);

  const groups = [];
  let from = 0;
  for (const cut of cutPoints) {
    groups.push(ordered.slice(from, cut));
    from = cut;
  }
  groups.push(ordered.slice(from));

  const groupCenters = groups.map((group) => {
    let lo = Infinity;
    let hi = -Infinity;
    for (const item of group) {
      if (item.x < lo) lo = item.x;
      const rightEdge = item.x + item.w;
      if (rightEdge > hi) hi = rightEdge;
    }
    return (lo + hi) / 2;
  });
  const distance = (gi, ci) => Math.abs(groupCenters[gi] - colCenters[ci]);

  const candidates = [];
  groups.forEach((_, gi) => {
    for (let ci = 0; ci < numCols; ci++) {
      candidates.push({ gi, ci, dist: distance(gi, ci) });
    }
  });
  candidates.sort((a, b) => a.dist - b.dist);

  const result = [];
  const takenCols = new Set();
  const placedGroups = new Set();
  for (const { gi, ci } of candidates) {
    if (placedGroups.has(gi) || takenCols.has(ci)) continue;
    result.push({ col: ci, items: groups[gi] });
    placedGroups.add(gi);
    takenCols.add(ci);
  }

  // Leftover groups share the nearest column with whatever is already there.
  groups.forEach((group, gi) => {
    if (placedGroups.has(gi)) return;
    let bestCol = 0;
    let bestDist = Infinity;
    for (let ci = 0; ci < numCols; ci++) {
      const d = distance(gi, ci);
      if (d < bestDist) {
        bestDist = d;
        bestCol = ci;
      }
    }
    result.push({ col: bestCol, items: group });
  });
  return result;
}
|
|
3498
3930
|
function buildClusterTable(rows, columns, pageNum) {
|
|
3499
3931
|
const numCols = columns.length;
|
|
@@ -3511,12 +3943,12 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3511
3943
|
usedItems.add(row.items[0]);
|
|
3512
3944
|
continue;
|
|
3513
3945
|
}
|
|
3514
|
-
|
|
3515
|
-
|
|
3516
|
-
|
|
3946
|
+
const assignments = assignRowItems(row.items, columns, numCols);
|
|
3947
|
+
for (const { col, items } of assignments) {
|
|
3948
|
+
const text = items.map((i) => i.text).join(" ");
|
|
3517
3949
|
const existing = cells[r][col].text;
|
|
3518
|
-
cells[r][col].text = existing ? existing + " " +
|
|
3519
|
-
usedItems.add(item);
|
|
3950
|
+
cells[r][col].text = existing ? existing + " " + text : text;
|
|
3951
|
+
for (const item of items) usedItems.add(item);
|
|
3520
3952
|
}
|
|
3521
3953
|
}
|
|
3522
3954
|
let emptyRows = 0;
|
|
@@ -3528,11 +3960,48 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3528
3960
|
const hasValue = cells.some((row) => row[c].text !== "");
|
|
3529
3961
|
if (!hasValue) return null;
|
|
3530
3962
|
}
|
|
3963
|
+
for (let r = numRows - 1; r >= 1; r--) {
|
|
3964
|
+
const nonEmptyCols = cells[r].filter((c) => c.text.trim()).length;
|
|
3965
|
+
if (nonEmptyCols !== 1) continue;
|
|
3966
|
+
if (cells[r][0].text.trim() !== "") continue;
|
|
3967
|
+
const contentText = cells[r].find((c) => c.text.trim())?.text.trim() || "";
|
|
3968
|
+
if (/^[○●▶\-·]/.test(contentText)) continue;
|
|
3969
|
+
for (let pr = r - 1; pr >= 0; pr--) {
|
|
3970
|
+
if (cells[pr].some((c) => c.text.trim())) {
|
|
3971
|
+
for (let c = 0; c < numCols; c++) {
|
|
3972
|
+
const prev = cells[pr][c].text.trim();
|
|
3973
|
+
const curr = cells[r][c].text.trim();
|
|
3974
|
+
if (curr) cells[pr][c].text = prev ? prev + " " + curr : curr;
|
|
3975
|
+
}
|
|
3976
|
+
for (let c = 0; c < numCols; c++) cells[r][c].text = "";
|
|
3977
|
+
break;
|
|
3978
|
+
}
|
|
3979
|
+
}
|
|
3980
|
+
}
|
|
3981
|
+
for (let r = 0; r < cells.length - 1; r++) {
|
|
3982
|
+
const row = cells[r];
|
|
3983
|
+
const hasCol0 = row[0].text.trim() !== "";
|
|
3984
|
+
const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
|
|
3985
|
+
const midEmpty = row.slice(1, numCols - 1).every((c) => c.text.trim() === "");
|
|
3986
|
+
if (hasCol0 && hasColLast && midEmpty) {
|
|
3987
|
+
const next = cells[r + 1];
|
|
3988
|
+
if (next[0].text.trim() === "" && next.some((c) => c.text.trim())) {
|
|
3989
|
+
for (let c = 1; c < numCols; c++) {
|
|
3990
|
+
const curr = next[c].text.trim();
|
|
3991
|
+
if (curr) row[c].text = row[c].text.trim() ? row[c].text.trim() + " " + curr : curr;
|
|
3992
|
+
}
|
|
3993
|
+
for (let c = 0; c < numCols; c++) next[c].text = "";
|
|
3994
|
+
}
|
|
3995
|
+
}
|
|
3996
|
+
}
|
|
3997
|
+
const filteredCells = cells.filter((row) => row.some((c) => c.text.trim()));
|
|
3998
|
+
const finalRowCount = filteredCells.length;
|
|
3999
|
+
if (finalRowCount < MIN_ROWS) return null;
|
|
3531
4000
|
const irTable = {
|
|
3532
|
-
rows:
|
|
4001
|
+
rows: finalRowCount,
|
|
3533
4002
|
cols: numCols,
|
|
3534
|
-
cells,
|
|
3535
|
-
hasHeader:
|
|
4003
|
+
cells: filteredCells,
|
|
4004
|
+
hasHeader: finalRowCount > 1
|
|
3536
4005
|
};
|
|
3537
4006
|
const allItems = rows.flatMap((r) => r.items);
|
|
3538
4007
|
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
@@ -3609,7 +4078,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3609
4078
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
3610
4079
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
3611
4080
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
3612
|
-
const
|
|
4081
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
3613
4082
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
3614
4083
|
let parsedPages = 0;
|
|
3615
4084
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
@@ -3626,7 +4095,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3626
4095
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
3627
4096
|
}
|
|
3628
4097
|
for (const item of visible) {
|
|
3629
|
-
if (item.fontSize > 0)
|
|
4098
|
+
if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
|
|
3630
4099
|
}
|
|
3631
4100
|
const opList = await page.getOperatorList();
|
|
3632
4101
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -3665,10 +4134,9 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3665
4134
|
blocks.splice(removed[ri], 1);
|
|
3666
4135
|
}
|
|
3667
4136
|
}
|
|
3668
|
-
const medianFontSize =
|
|
4137
|
+
const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
|
|
3669
4138
|
if (medianFontSize > 0) {
|
|
3670
4139
|
detectHeadings(blocks, medianFontSize);
|
|
3671
|
-
mergeAdjacentHeadings(blocks);
|
|
3672
4140
|
}
|
|
3673
4141
|
detectMarkerHeadings(blocks);
|
|
3674
4142
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
@@ -3730,11 +4198,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
3730
4198
|
}
|
|
3731
4199
|
return { visible, hiddenCount };
|
|
3732
4200
|
}
|
|
3733
|
-
function
|
|
3734
|
-
if (
|
|
3735
|
-
|
|
3736
|
-
const
|
|
3737
|
-
|
|
4201
|
+
/**
 * Computes the median font size from a frequency table.
 *
 * Sizes are walked in ascending order while accumulating their counts; the
 * first size whose cumulative count exceeds `floor(total / 2)` is returned.
 * For an even total this yields the upper of the two middle values.
 *
 * @param {Map<number, number>} freq - Maps font size to number of occurrences.
 * @returns {number} The median font size, or 0 for an empty table.
 */
function computeMedianFontSizeFromFreq(freq) {
  if (freq.size === 0) return 0;

  let total = 0;
  freq.forEach((count) => {
    total += count;
  });
  const half = Math.floor(total / 2);

  const ascending = Array.from(freq).sort(([sizeA], [sizeB]) => sizeA - sizeB);
  let running = 0;
  const hit = ascending.find(([, count]) => {
    running += count;
    return running > half;
  });
  // Fall back to the largest size if no bucket crosses the midpoint.
  return hit ? hit[0] : ascending[ascending.length - 1][0];
}
|
|
3739
4214
|
function detectHeadings(blocks, medianFontSize) {
|
|
3740
4215
|
for (const block of blocks) {
|
|
@@ -3754,220 +4229,27 @@ function detectHeadings(blocks, medianFontSize) {
|
|
|
3754
4229
|
}
|
|
3755
4230
|
}
|
|
3756
4231
|
}
|
|
3757
|
-
function mergeAdjacentHeadings(blocks) {
|
|
3758
|
-
let i = 0;
|
|
3759
|
-
while (i < blocks.length - 1) {
|
|
3760
|
-
const curr = blocks[i];
|
|
3761
|
-
const next = blocks[i + 1];
|
|
3762
|
-
if (curr.type !== "heading" || next.type !== "heading") {
|
|
3763
|
-
i++;
|
|
3764
|
-
continue;
|
|
3765
|
-
}
|
|
3766
|
-
if (!curr.bbox || !next.bbox || !curr.text || !next.text) {
|
|
3767
|
-
i++;
|
|
3768
|
-
continue;
|
|
3769
|
-
}
|
|
3770
|
-
const currBaseline = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
|
|
3771
|
-
const nextBaseline = next.bbox.y + (next.style?.fontSize || next.bbox.height);
|
|
3772
|
-
const yDiff = Math.abs(currBaseline - nextBaseline);
|
|
3773
|
-
const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
|
|
3774
|
-
const sameY = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
|
|
3775
|
-
const sameLevel = curr.level === next.level;
|
|
3776
|
-
if (sameY && sameLevel) {
|
|
3777
|
-
const currX = curr.bbox.x;
|
|
3778
|
-
const nextX = next.bbox.x;
|
|
3779
|
-
if (currX <= nextX) {
|
|
3780
|
-
curr.text = curr.text + " " + next.text;
|
|
3781
|
-
} else {
|
|
3782
|
-
curr.text = next.text + " " + curr.text;
|
|
3783
|
-
}
|
|
3784
|
-
curr.bbox = {
|
|
3785
|
-
page: curr.bbox.page,
|
|
3786
|
-
x: Math.min(curr.bbox.x, next.bbox.x),
|
|
3787
|
-
y: Math.min(curr.bbox.y, next.bbox.y),
|
|
3788
|
-
width: Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width) - Math.min(curr.bbox.x, next.bbox.x),
|
|
3789
|
-
height: Math.max(curr.bbox.height, next.bbox.height)
|
|
3790
|
-
};
|
|
3791
|
-
blocks.splice(i + 1, 1);
|
|
3792
|
-
} else {
|
|
3793
|
-
i++;
|
|
3794
|
-
}
|
|
3795
|
-
}
|
|
3796
|
-
}
|
|
3797
4232
|
function collapseEvenSpacing(text) {
|
|
3798
4233
|
const tokens = text.split(" ");
|
|
3799
4234
|
const singleCharCount = tokens.filter((t) => t.length === 1).length;
|
|
3800
4235
|
if (tokens.length >= 3 && singleCharCount / tokens.length >= 0.7) {
|
|
3801
4236
|
return tokens.join("");
|
|
3802
4237
|
}
|
|
3803
|
-
return text
|
|
3804
|
-
}
|
|
3805
|
-
|
|
3806
|
-
const allY = items.map((i) => i.y);
|
|
3807
|
-
const pageHeight = Math.max(...allY) - Math.min(...allY);
|
|
3808
|
-
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
3809
|
-
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
3810
|
-
const blocks = [];
|
|
3811
|
-
for (const group of orderedGroups) {
|
|
3812
|
-
if (group.length === 0) continue;
|
|
3813
|
-
const yLines = groupByY(group);
|
|
3814
|
-
for (const line of yLines) {
|
|
3815
|
-
const text = mergeLineSimple(line);
|
|
3816
|
-
if (!text.trim()) continue;
|
|
3817
|
-
blocks.push({
|
|
3818
|
-
type: "paragraph",
|
|
3819
|
-
text,
|
|
3820
|
-
pageNumber: pageNum,
|
|
3821
|
-
bbox: computeBBox(line, pageNum),
|
|
3822
|
-
style: dominantStyle(line)
|
|
3823
|
-
});
|
|
3824
|
-
}
|
|
3825
|
-
}
|
|
3826
|
-
return blocks.length > 0 ? blocks : null;
|
|
3827
|
-
}
|
|
3828
|
-
function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
|
|
3829
|
-
const totalCells = table.cells.reduce((sum, row) => sum + row.filter((c) => c.text.trim()).length, 0);
|
|
3830
|
-
const totalTextLines = table.cells.reduce((sum, row) => sum + row.reduce((s, c) => s + (c.text.trim() ? c.text.split("\n").length : 0), 0), 0);
|
|
3831
|
-
const isUnderSegmented = table.rows === 1 && table.cols === 1 || totalCells <= 2 && totalTextLines >= 8 || totalCells <= 2 && items.length >= 6;
|
|
3832
|
-
if (!isUnderSegmented) return null;
|
|
3833
|
-
if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);
|
|
3834
|
-
const directTable = buildTableFromTextLayout(items, pageNum, bbox);
|
|
3835
|
-
if (directTable) return directTable;
|
|
3836
|
-
const clusterItems = items.map((i) => ({
|
|
3837
|
-
text: i.text,
|
|
3838
|
-
x: i.x,
|
|
3839
|
-
y: i.y,
|
|
3840
|
-
w: i.w,
|
|
3841
|
-
h: i.h,
|
|
3842
|
-
fontSize: i.fontSize,
|
|
3843
|
-
fontName: i.fontName
|
|
3844
|
-
}));
|
|
3845
|
-
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
3846
|
-
if (clusterResults.length > 0) {
|
|
3847
|
-
const blocks = [];
|
|
3848
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
3849
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
3850
|
-
const usedIndices = /* @__PURE__ */ new Set();
|
|
3851
|
-
for (const cr of clusterResults) {
|
|
3852
|
-
for (const ci of cr.usedItems) {
|
|
3853
|
-
const idx = ciToIdx.get(ci);
|
|
3854
|
-
if (idx !== void 0) usedIndices.add(idx);
|
|
3855
|
-
}
|
|
3856
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
3857
|
-
}
|
|
3858
|
-
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
3859
|
-
for (const item of remaining) {
|
|
3860
|
-
if (!item.text.trim()) continue;
|
|
3861
|
-
blocks.push({
|
|
3862
|
-
type: "paragraph",
|
|
3863
|
-
text: item.text,
|
|
3864
|
-
pageNumber: pageNum,
|
|
3865
|
-
bbox: computeBBox([item], pageNum),
|
|
3866
|
-
style: { fontSize: item.fontSize, fontName: item.fontName }
|
|
3867
|
-
});
|
|
3868
|
-
}
|
|
3869
|
-
blocks.sort((a, b) => {
|
|
3870
|
-
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
3871
|
-
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
3872
|
-
return by - ay;
|
|
3873
|
-
});
|
|
3874
|
-
return blocks.length > 0 ? blocks : null;
|
|
3875
|
-
}
|
|
3876
|
-
return null;
|
|
3877
|
-
}
|
|
3878
|
-
function buildTableFromTextLayout(items, pageNum, bbox) {
|
|
3879
|
-
if (items.length < 4) return null;
|
|
3880
|
-
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
3881
|
-
const yTol = 3;
|
|
3882
|
-
const rows = [];
|
|
3883
|
-
let curRow = [sorted[0]];
|
|
3884
|
-
let curY = sorted[0].y;
|
|
3885
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
3886
|
-
if (Math.abs(sorted[i].y - curY) <= yTol) {
|
|
3887
|
-
curRow.push(sorted[i]);
|
|
3888
|
-
} else {
|
|
3889
|
-
rows.push(curRow);
|
|
3890
|
-
curRow = [sorted[i]];
|
|
3891
|
-
curY = sorted[i].y;
|
|
3892
|
-
}
|
|
3893
|
-
}
|
|
3894
|
-
rows.push(curRow);
|
|
3895
|
-
if (rows.length < 2) return null;
|
|
3896
|
-
const gapPositions = [];
|
|
3897
|
-
for (const row of rows) {
|
|
3898
|
-
if (row.length < 2) continue;
|
|
3899
|
-
const sortedX = [...row].sort((a, b) => a.x - b.x);
|
|
3900
|
-
const avgFs = sortedX.reduce((s, i) => s + i.fontSize, 0) / sortedX.length;
|
|
3901
|
-
for (let j = 1; j < sortedX.length; j++) {
|
|
3902
|
-
const gap = sortedX[j].x - (sortedX[j - 1].x + sortedX[j - 1].w);
|
|
3903
|
-
if (gap >= avgFs * 1.5) {
|
|
3904
|
-
gapPositions.push(sortedX[j - 1].x + sortedX[j - 1].w + gap / 2);
|
|
3905
|
-
}
|
|
3906
|
-
}
|
|
3907
|
-
}
|
|
3908
|
-
if (gapPositions.length < 2) return null;
|
|
3909
|
-
gapPositions.sort((a, b) => a - b);
|
|
3910
|
-
const colBoundaries = [];
|
|
3911
|
-
let clusterSum = gapPositions[0], clusterCount = 1;
|
|
3912
|
-
for (let i = 1; i < gapPositions.length; i++) {
|
|
3913
|
-
const avg = clusterSum / clusterCount;
|
|
3914
|
-
if (Math.abs(gapPositions[i] - avg) <= 15) {
|
|
3915
|
-
clusterSum += gapPositions[i];
|
|
3916
|
-
clusterCount++;
|
|
3917
|
-
} else {
|
|
3918
|
-
if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
|
|
3919
|
-
clusterSum = gapPositions[i];
|
|
3920
|
-
clusterCount = 1;
|
|
3921
|
-
}
|
|
3922
|
-
}
|
|
3923
|
-
if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
|
|
3924
|
-
if (colBoundaries.length === 0) return null;
|
|
3925
|
-
const numCols = colBoundaries.length + 1;
|
|
3926
|
-
const tableRows = [];
|
|
3927
|
-
for (const row of rows) {
|
|
3928
|
-
const cells = Array(numCols).fill("");
|
|
3929
|
-
const sortedX = [...row].sort((a, b) => a.x - b.x);
|
|
3930
|
-
for (const item of sortedX) {
|
|
3931
|
-
const cx = item.x + item.w / 2;
|
|
3932
|
-
let col = 0;
|
|
3933
|
-
for (let b = 0; b < colBoundaries.length; b++) {
|
|
3934
|
-
if (cx > colBoundaries[b]) col = b + 1;
|
|
3935
|
-
}
|
|
3936
|
-
cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
|
|
3937
|
-
}
|
|
3938
|
-
if (cells[0].trim() === "" && tableRows.length > 0) {
|
|
3939
|
-
const prevCells = tableRows[tableRows.length - 1].cells;
|
|
3940
|
-
for (let c = 0; c < numCols; c++) {
|
|
3941
|
-
if (cells[c].trim()) {
|
|
3942
|
-
prevCells[c] = prevCells[c] ? prevCells[c] + " " + cells[c].trim() : cells[c].trim();
|
|
3943
|
-
}
|
|
3944
|
-
}
|
|
3945
|
-
} else {
|
|
3946
|
-
tableRows.push({ cells });
|
|
3947
|
-
}
|
|
3948
|
-
}
|
|
3949
|
-
if (tableRows.length < 2) return null;
|
|
3950
|
-
const nonEmptyCount = tableRows.reduce((sum, r) => sum + r.cells.filter((c) => c.trim()).length, 0);
|
|
3951
|
-
const totalCount = tableRows.length * numCols;
|
|
3952
|
-
if (nonEmptyCount < totalCount * 0.3) return null;
|
|
3953
|
-
const irCells = tableRows.map(
|
|
3954
|
-
(r) => r.cells.map((text, colIdx) => {
|
|
3955
|
-
let cleaned = text.trim();
|
|
3956
|
-
if (colIdx > 0) cleaned = cleaned.replace(/^[•○·\-]\s*/, "");
|
|
3957
|
-
return { text: cleaned, colSpan: 1, rowSpan: 1 };
|
|
3958
|
-
})
|
|
4238
|
+
return text.replace(
|
|
4239
|
+
/(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
|
|
4240
|
+
(match) => match.replace(/ /g, "")
|
|
3959
4241
|
);
|
|
3960
|
-
const irTable = {
|
|
3961
|
-
rows: tableRows.length,
|
|
3962
|
-
cols: numCols,
|
|
3963
|
-
cells: irCells,
|
|
3964
|
-
hasHeader: tableRows.length > 1
|
|
3965
|
-
};
|
|
3966
|
-
return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
|
|
3967
4242
|
}
|
|
3968
4243
|
function shouldDemoteTable(table) {
|
|
3969
4244
|
const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
|
|
3970
4245
|
const allText = allCells.join(" ");
|
|
4246
|
+
if (table.rows <= 3 && table.cols <= 3) {
|
|
4247
|
+
const totalCells2 = table.rows * table.cols;
|
|
4248
|
+
const emptyCells2 = totalCells2 - allCells.length;
|
|
4249
|
+
if (emptyCells2 >= totalCells2 * 0.3) return true;
|
|
4250
|
+
if (/[□■◆○●▶ㅇ]/.test(allText)) return true;
|
|
4251
|
+
if (/<[^>]+>/.test(allText)) return true;
|
|
4252
|
+
}
|
|
3971
4253
|
if (allText.length > 200) return false;
|
|
3972
4254
|
if (/[□■◆○●▶]/.test(allText) && table.rows <= 3) return true;
|
|
3973
4255
|
const totalCells = table.rows * table.cols;
|
|
@@ -4011,32 +4293,6 @@ function detectMarkerHeadings(blocks) {
|
|
|
4011
4293
|
}
|
|
4012
4294
|
}
|
|
4013
4295
|
}
|
|
4014
|
-
function hasMultiColumnLayout(items) {
|
|
4015
|
-
if (items.length < 30) return false;
|
|
4016
|
-
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
4017
|
-
const minX = sorted[0].x;
|
|
4018
|
-
let maxX = minX;
|
|
4019
|
-
for (const i of sorted) if (i.x + i.w > maxX) maxX = i.x + i.w;
|
|
4020
|
-
const pageWidth = maxX - minX;
|
|
4021
|
-
if (pageWidth < 200) return false;
|
|
4022
|
-
let bestGap = 0;
|
|
4023
|
-
let bestSplit = 0;
|
|
4024
|
-
for (let j = 1; j < sorted.length; j++) {
|
|
4025
|
-
const gap = sorted[j].x - (sorted[j - 1].x + sorted[j - 1].w);
|
|
4026
|
-
if (gap > bestGap) {
|
|
4027
|
-
bestGap = gap;
|
|
4028
|
-
bestSplit = (sorted[j - 1].x + sorted[j - 1].w + sorted[j].x) / 2;
|
|
4029
|
-
}
|
|
4030
|
-
}
|
|
4031
|
-
if (bestGap < 20) return false;
|
|
4032
|
-
const splitRatio = (bestSplit - minX) / pageWidth;
|
|
4033
|
-
if (splitRatio < 0.35 || splitRatio > 0.65) return false;
|
|
4034
|
-
const leftCount = items.filter((i) => i.x + i.w / 2 < bestSplit).length;
|
|
4035
|
-
const rightCount = items.filter((i) => i.x + i.w / 2 >= bestSplit).length;
|
|
4036
|
-
if (leftCount < 15 || rightCount < 15) return false;
|
|
4037
|
-
if (Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount) < 0.35) return false;
|
|
4038
|
-
return true;
|
|
4039
|
-
}
|
|
4040
4296
|
var MAX_XYCUT_DEPTH = 50;
|
|
4041
4297
|
function xyCutOrder(items, gapThreshold, depth = 0) {
|
|
4042
4298
|
if (items.length === 0) return [];
|
|
@@ -4104,6 +4360,7 @@ function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeigh
|
|
|
4104
4360
|
if (items.length === 0) return [];
|
|
4105
4361
|
let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
|
|
4106
4362
|
({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
|
|
4363
|
+
({ horizontals, verticals } = preprocessLines(horizontals, verticals));
|
|
4107
4364
|
const grids = buildTableGrids(horizontals, verticals);
|
|
4108
4365
|
if (grids.length > 0) {
|
|
4109
4366
|
return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
|
|
@@ -4115,14 +4372,19 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4115
4372
|
const usedItems = /* @__PURE__ */ new Set();
|
|
4116
4373
|
const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
4117
4374
|
for (const grid of sortedGrids) {
|
|
4375
|
+
const numGridRows = grid.rowYs.length - 1;
|
|
4376
|
+
const numGridCols = grid.colXs.length - 1;
|
|
4377
|
+
if (numGridRows === 1 && numGridCols >= 2) continue;
|
|
4118
4378
|
const tableItems = [];
|
|
4119
4379
|
const pad = 3;
|
|
4380
|
+
const gridW = grid.bbox.x2 - grid.bbox.x1;
|
|
4120
4381
|
for (const item of items) {
|
|
4121
4382
|
if (usedItems.has(item)) continue;
|
|
4122
|
-
if (item.
|
|
4123
|
-
|
|
4124
|
-
|
|
4125
|
-
|
|
4383
|
+
if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
|
|
4384
|
+
if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
|
|
4385
|
+
if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
|
|
4386
|
+
tableItems.push(item);
|
|
4387
|
+
usedItems.add(item);
|
|
4126
4388
|
}
|
|
4127
4389
|
const cells = extractCells(grid, horizontals, verticals);
|
|
4128
4390
|
if (cells.length === 0) continue;
|
|
@@ -4146,6 +4408,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4146
4408
|
const cellItems = cellTextMap.get(cell) || [];
|
|
4147
4409
|
let text = cellTextToString(cellItems);
|
|
4148
4410
|
text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
|
|
4411
|
+
text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
|
|
4149
4412
|
irGrid[cell.row][cell.col] = {
|
|
4150
4413
|
text,
|
|
4151
4414
|
colSpan: cell.colSpan,
|
|
@@ -4167,31 +4430,61 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4167
4430
|
width: grid.bbox.x2 - grid.bbox.x1,
|
|
4168
4431
|
height: grid.bbox.y2 - grid.bbox.y1
|
|
4169
4432
|
};
|
|
4170
|
-
const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
|
|
4171
|
-
if (normalized) {
|
|
4172
|
-
blocks.push(...normalized);
|
|
4173
|
-
continue;
|
|
4174
|
-
}
|
|
4175
4433
|
if (shouldDemoteTable(irTable)) {
|
|
4176
4434
|
const demoted = demoteTableToText(irTable);
|
|
4177
4435
|
if (demoted) {
|
|
4178
|
-
|
|
4436
|
+
const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
|
|
4437
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
|
|
4179
4438
|
}
|
|
4180
4439
|
continue;
|
|
4181
4440
|
}
|
|
4182
4441
|
blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
|
|
4183
4442
|
}
|
|
4184
|
-
|
|
4443
|
+
let remaining = items.filter((i) => !usedItems.has(i));
|
|
4185
4444
|
if (remaining.length > 0) {
|
|
4186
4445
|
remaining.sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4187
|
-
const
|
|
4188
|
-
|
|
4189
|
-
|
|
4446
|
+
const clusterItems = remaining.map((i) => ({
|
|
4447
|
+
text: i.text,
|
|
4448
|
+
x: i.x,
|
|
4449
|
+
y: i.y,
|
|
4450
|
+
w: i.w,
|
|
4451
|
+
h: i.h,
|
|
4452
|
+
fontSize: i.fontSize,
|
|
4453
|
+
fontName: i.fontName
|
|
4454
|
+
}));
|
|
4455
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4456
|
+
if (clusterResults.length > 0) {
|
|
4457
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4458
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4459
|
+
const usedClusterIndices = /* @__PURE__ */ new Set();
|
|
4460
|
+
for (const cr of clusterResults) {
|
|
4461
|
+
for (const ci of cr.usedItems) {
|
|
4462
|
+
const idx = ciToIdx.get(ci);
|
|
4463
|
+
if (idx !== void 0) usedClusterIndices.add(idx);
|
|
4464
|
+
}
|
|
4465
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4466
|
+
}
|
|
4467
|
+
remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
|
|
4468
|
+
}
|
|
4469
|
+
if (remaining.length > 0) {
|
|
4470
|
+
const allY = remaining.map((i) => i.y);
|
|
4471
|
+
const pageH = safeMax(allY) - safeMin(allY);
|
|
4472
|
+
const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
|
|
4473
|
+
const textBlocks = [];
|
|
4474
|
+
for (const group of groups) {
|
|
4475
|
+
if (group.length === 0) continue;
|
|
4476
|
+
const groupBlocks = extractPageBlocksFallback(group, pageNum);
|
|
4477
|
+
for (const b of groupBlocks) textBlocks.push(b);
|
|
4478
|
+
}
|
|
4479
|
+
const finalTextBlocks = detectListBlocks(textBlocks);
|
|
4480
|
+
for (const b of finalTextBlocks) blocks.push(b);
|
|
4481
|
+
}
|
|
4482
|
+
blocks.sort((a, b) => {
|
|
4190
4483
|
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4191
4484
|
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4192
4485
|
return by - ay;
|
|
4193
4486
|
});
|
|
4194
|
-
return mergeAdjacentTableBlocks(
|
|
4487
|
+
return mergeAdjacentTableBlocks(blocks);
|
|
4195
4488
|
}
|
|
4196
4489
|
return mergeAdjacentTableBlocks(blocks);
|
|
4197
4490
|
}
|
|
@@ -4217,57 +4510,53 @@ function mergeAdjacentTableBlocks(blocks) {
|
|
|
4217
4510
|
}
|
|
4218
4511
|
function extractPageBlocksFallback(items, pageNum) {
|
|
4219
4512
|
if (items.length === 0) return [];
|
|
4220
|
-
if (hasMultiColumnLayout(items)) {
|
|
4221
|
-
const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
|
|
4222
|
-
return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
|
|
4223
|
-
}
|
|
4224
4513
|
const blocks = [];
|
|
4225
|
-
const
|
|
4226
|
-
|
|
4227
|
-
|
|
4228
|
-
|
|
4229
|
-
|
|
4230
|
-
|
|
4231
|
-
|
|
4232
|
-
|
|
4233
|
-
|
|
4234
|
-
|
|
4235
|
-
|
|
4236
|
-
|
|
4237
|
-
|
|
4238
|
-
|
|
4239
|
-
|
|
4240
|
-
|
|
4241
|
-
|
|
4242
|
-
|
|
4243
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4244
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4245
|
-
const usedIndices = /* @__PURE__ */ new Set();
|
|
4246
|
-
for (const cr of clusterResults) {
|
|
4247
|
-
for (const ci of cr.usedItems) {
|
|
4248
|
-
const idx = ciToIdx.get(ci);
|
|
4249
|
-
if (idx !== void 0) usedIndices.add(idx);
|
|
4250
|
-
}
|
|
4251
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4514
|
+
const clusterItems = items.map((i) => ({
|
|
4515
|
+
text: i.text,
|
|
4516
|
+
x: i.x,
|
|
4517
|
+
y: i.y,
|
|
4518
|
+
w: i.w,
|
|
4519
|
+
h: i.h,
|
|
4520
|
+
fontSize: i.fontSize,
|
|
4521
|
+
fontName: i.fontName
|
|
4522
|
+
}));
|
|
4523
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4524
|
+
if (clusterResults.length > 0) {
|
|
4525
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4526
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4527
|
+
const usedIndices = /* @__PURE__ */ new Set();
|
|
4528
|
+
for (const cr of clusterResults) {
|
|
4529
|
+
for (const ci of cr.usedItems) {
|
|
4530
|
+
const idx = ciToIdx.get(ci);
|
|
4531
|
+
if (idx !== void 0) usedIndices.add(idx);
|
|
4252
4532
|
}
|
|
4253
|
-
|
|
4254
|
-
|
|
4255
|
-
|
|
4256
|
-
|
|
4257
|
-
|
|
4258
|
-
|
|
4259
|
-
|
|
4260
|
-
|
|
4261
|
-
|
|
4533
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4534
|
+
}
|
|
4535
|
+
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
4536
|
+
if (remaining.length > 0) {
|
|
4537
|
+
const yLines = groupByY(remaining);
|
|
4538
|
+
for (const line of yLines) {
|
|
4539
|
+
const text = mergeLineSimple(line);
|
|
4540
|
+
if (!text.trim()) continue;
|
|
4541
|
+
const bbox = computeBBox(line, pageNum);
|
|
4542
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
|
|
4262
4543
|
}
|
|
4263
|
-
|
|
4264
|
-
|
|
4265
|
-
|
|
4266
|
-
|
|
4267
|
-
|
|
4544
|
+
}
|
|
4545
|
+
blocks.sort((a, b) => {
|
|
4546
|
+
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4547
|
+
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4548
|
+
return by - ay;
|
|
4549
|
+
});
|
|
4550
|
+
} else {
|
|
4551
|
+
const allYLines = groupByY(items);
|
|
4552
|
+
const columns = detectColumns(allYLines);
|
|
4553
|
+
if (columns && columns.length >= 3) {
|
|
4554
|
+
const tableText = extractWithColumns(allYLines, columns);
|
|
4555
|
+
const bbox = computeBBox(items, pageNum);
|
|
4556
|
+
blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
|
|
4268
4557
|
} else {
|
|
4269
4558
|
const allY = items.map((i) => i.y);
|
|
4270
|
-
const pageHeight =
|
|
4559
|
+
const pageHeight = safeMax(allY) - safeMin(allY);
|
|
4271
4560
|
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
4272
4561
|
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
4273
4562
|
for (const group of orderedGroups) {
|
|
@@ -4320,22 +4609,76 @@ function dominantStyle(items) {
|
|
|
4320
4609
|
return { fontSize: dominantSize, fontName };
|
|
4321
4610
|
}
|
|
4322
4611
|
function normalizeItems(rawItems) {
|
|
4323
|
-
|
|
4612
|
+
const items = [];
|
|
4613
|
+
const spacePositions = [];
|
|
4614
|
+
for (const i of rawItems) {
|
|
4615
|
+
if (typeof i.str !== "string") continue;
|
|
4616
|
+
const x = Math.round(i.transform[4]);
|
|
4617
|
+
const y = Math.round(i.transform[5]);
|
|
4618
|
+
if (!i.str.trim()) {
|
|
4619
|
+
spacePositions.push({ x, y });
|
|
4620
|
+
continue;
|
|
4621
|
+
}
|
|
4324
4622
|
const scaleY = Math.abs(i.transform[3]);
|
|
4325
4623
|
const scaleX = Math.abs(i.transform[0]);
|
|
4326
4624
|
const fontSize = Math.round(Math.max(scaleY, scaleX));
|
|
4327
|
-
|
|
4328
|
-
|
|
4329
|
-
|
|
4330
|
-
|
|
4331
|
-
|
|
4332
|
-
|
|
4333
|
-
|
|
4334
|
-
|
|
4335
|
-
|
|
4336
|
-
|
|
4337
|
-
|
|
4338
|
-
|
|
4625
|
+
const w = Math.round(i.width);
|
|
4626
|
+
const h = Math.round(i.height);
|
|
4627
|
+
const isHidden = fontSize === 0 || i.width === 0 && i.str.trim().length > 0;
|
|
4628
|
+
let text = i.str.trim();
|
|
4629
|
+
if (/^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text)) {
|
|
4630
|
+
text = text.replace(/ /g, "");
|
|
4631
|
+
}
|
|
4632
|
+
const split = splitEvenSpacedItem(text, x, w, fontSize);
|
|
4633
|
+
if (split) {
|
|
4634
|
+
for (const s of split) {
|
|
4635
|
+
items.push({ text: s.text, x: s.x, y, w: s.w, h, fontSize, fontName: i.fontName || "", isHidden });
|
|
4636
|
+
}
|
|
4637
|
+
} else {
|
|
4638
|
+
items.push({ text, x, y, w, h, fontSize, fontName: i.fontName || "", isHidden });
|
|
4639
|
+
}
|
|
4640
|
+
}
|
|
4641
|
+
const sorted = items.sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4642
|
+
const deduped = [];
|
|
4643
|
+
for (let i = 0; i < sorted.length; i++) {
|
|
4644
|
+
let isDup = false;
|
|
4645
|
+
for (let j = deduped.length - 1; j >= 0; j--) {
|
|
4646
|
+
const prev = deduped[j];
|
|
4647
|
+
if (prev.y - sorted[i].y > 3) break;
|
|
4648
|
+
if (Math.abs(prev.y - sorted[i].y) <= 3 && prev.text === sorted[i].text && Math.abs(prev.x - sorted[i].x) <= 3) {
|
|
4649
|
+
isDup = true;
|
|
4650
|
+
break;
|
|
4651
|
+
}
|
|
4652
|
+
}
|
|
4653
|
+
if (!isDup) deduped.push(sorted[i]);
|
|
4654
|
+
}
|
|
4655
|
+
if (spacePositions.length > 0) {
|
|
4656
|
+
for (const item of deduped) {
|
|
4657
|
+
for (const sp of spacePositions) {
|
|
4658
|
+
if (Math.abs(sp.y - item.y) <= 3) {
|
|
4659
|
+
const dist = item.x - sp.x;
|
|
4660
|
+
if (dist >= 0 && dist <= 20) {
|
|
4661
|
+
item.hasSpaceBefore = true;
|
|
4662
|
+
break;
|
|
4663
|
+
}
|
|
4664
|
+
}
|
|
4665
|
+
}
|
|
4666
|
+
}
|
|
4667
|
+
}
|
|
4668
|
+
return deduped;
|
|
4669
|
+
}
|
|
4670
|
+
function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
|
|
4671
|
+
if (!/^[가-힣\d](?: [가-힣\d]){2,}$/.test(text)) return null;
|
|
4672
|
+
const chars = text.split(" ");
|
|
4673
|
+
if (chars.length < 3) return null;
|
|
4674
|
+
const charW = itemW / chars.length;
|
|
4675
|
+
if (charW > fontSize * 2) return null;
|
|
4676
|
+
return chars.map((ch, idx) => ({
|
|
4677
|
+
text: ch,
|
|
4678
|
+
x: Math.round(itemX + idx * charW),
|
|
4679
|
+
w: Math.round(charW * 0.8)
|
|
4680
|
+
// 실제 글자 폭은 간격보다 좁음
|
|
4681
|
+
}));
|
|
4339
4682
|
}
|
|
4340
4683
|
function groupByY(items) {
|
|
4341
4684
|
if (items.length === 0) return [];
|
|
@@ -4360,14 +4703,14 @@ function isProseSpread(items) {
|
|
|
4360
4703
|
for (let i = 1; i < sorted.length; i++) {
|
|
4361
4704
|
gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
|
|
4362
4705
|
}
|
|
4363
|
-
const maxGap =
|
|
4706
|
+
const maxGap = safeMax(gaps);
|
|
4364
4707
|
const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
|
|
4365
4708
|
return maxGap < 40 && avgLen < 5;
|
|
4366
4709
|
}
|
|
4367
4710
|
function detectColumns(yLines) {
|
|
4368
4711
|
const allItems = yLines.flat();
|
|
4369
4712
|
if (allItems.length === 0) return null;
|
|
4370
|
-
const pageWidth =
|
|
4713
|
+
const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
|
|
4371
4714
|
if (pageWidth < 100) return null;
|
|
4372
4715
|
let bigoLineIdx = -1;
|
|
4373
4716
|
for (let i = 0; i < yLines.length; i++) {
|
|
@@ -4399,7 +4742,7 @@ function detectColumns(yLines) {
|
|
|
4399
4742
|
}
|
|
4400
4743
|
const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
|
|
4401
4744
|
if (peaks.length < 3) return null;
|
|
4402
|
-
const MERGE_TOL =
|
|
4745
|
+
const MERGE_TOL = 40;
|
|
4403
4746
|
const merged = [peaks[0]];
|
|
4404
4747
|
for (let i = 1; i < peaks.length; i++) {
|
|
4405
4748
|
const prev = merged[merged.length - 1];
|
|
@@ -4413,7 +4756,14 @@ function detectColumns(yLines) {
|
|
|
4413
4756
|
merged.push({ ...peaks[i] });
|
|
4414
4757
|
}
|
|
4415
4758
|
}
|
|
4416
|
-
const
|
|
4759
|
+
const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
|
|
4760
|
+
if (rawColumns.length < 3) return null;
|
|
4761
|
+
const MIN_DETECT_COL_WIDTH = 30;
|
|
4762
|
+
const columns = [rawColumns[0]];
|
|
4763
|
+
for (let i = 1; i < rawColumns.length; i++) {
|
|
4764
|
+
if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
|
|
4765
|
+
columns.push(rawColumns[i]);
|
|
4766
|
+
}
|
|
4417
4767
|
return columns.length >= 3 ? columns : null;
|
|
4418
4768
|
}
|
|
4419
4769
|
function findColumn(x, columns) {
|
|
@@ -4541,6 +4891,16 @@ function buildGridTable(lines, columns) {
|
|
|
4541
4891
|
}
|
|
4542
4892
|
merged.splice(0, headerEnd, headerRow);
|
|
4543
4893
|
}
|
|
4894
|
+
for (const row of merged) {
|
|
4895
|
+
for (let c = 0; c < row.length; c++) {
|
|
4896
|
+
if (row[c]) row[c] = collapseEvenSpacing(row[c]);
|
|
4897
|
+
}
|
|
4898
|
+
}
|
|
4899
|
+
const totalCells = merged.length * numCols;
|
|
4900
|
+
const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
|
|
4901
|
+
if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
|
|
4902
|
+
return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
|
|
4903
|
+
}
|
|
4544
4904
|
const md = [];
|
|
4545
4905
|
md.push("| " + merged[0].join(" | ") + " |");
|
|
4546
4906
|
md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
|
|
@@ -4552,12 +4912,32 @@ function buildGridTable(lines, columns) {
|
|
|
4552
4912
|
function mergeLineSimple(items) {
|
|
4553
4913
|
if (items.length <= 1) return items[0]?.text || "";
|
|
4554
4914
|
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
4915
|
+
const isEvenSpaced = detectEvenSpacedItems(sorted);
|
|
4555
4916
|
let result = sorted[0].text;
|
|
4556
4917
|
for (let i = 1; i < sorted.length; i++) {
|
|
4557
4918
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
4558
4919
|
const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
|
|
4559
|
-
|
|
4560
|
-
|
|
4920
|
+
const tabThreshold = Math.max(avgFs * 2, 30);
|
|
4921
|
+
if (gap > tabThreshold) {
|
|
4922
|
+
result += " ";
|
|
4923
|
+
result += sorted[i].text;
|
|
4924
|
+
continue;
|
|
4925
|
+
}
|
|
4926
|
+
if (isEvenSpaced[i]) {
|
|
4927
|
+
result += sorted[i].text;
|
|
4928
|
+
continue;
|
|
4929
|
+
}
|
|
4930
|
+
if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
|
|
4931
|
+
result += " ";
|
|
4932
|
+
result += sorted[i].text;
|
|
4933
|
+
continue;
|
|
4934
|
+
}
|
|
4935
|
+
if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
|
|
4936
|
+
result += " ";
|
|
4937
|
+
result += sorted[i].text;
|
|
4938
|
+
continue;
|
|
4939
|
+
}
|
|
4940
|
+
if (gap < avgFs * 0.15) {
|
|
4561
4941
|
} else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
|
|
4562
4942
|
} else if (gap > 3) result += " ";
|
|
4563
4943
|
result += sorted[i].text;
|
|
@@ -4566,8 +4946,8 @@ function mergeLineSimple(items) {
|
|
|
4566
4946
|
}
|
|
4567
4947
|
function cleanPdfText(text) {
|
|
4568
4948
|
return mergeKoreanLines(
|
|
4569
|
-
text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
|
|
4570
|
-
).replace(/^(?!\|)
|
|
4949
|
+
text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
|
|
4950
|
+
).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
|
|
4571
4951
|
}
|
|
4572
4952
|
function startsWithMarker(line) {
|
|
4573
4953
|
const t = line.trimStart();
|
|
@@ -4759,7 +5139,7 @@ function mergeKoreanLines(text) {
|
|
|
4759
5139
|
result[result.length - 1] = prev + " " + currTrimmed;
|
|
4760
5140
|
continue;
|
|
4761
5141
|
}
|
|
4762
|
-
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
|
|
5142
|
+
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev) && !startsWithMarker(prev)) {
|
|
4763
5143
|
result[result.length - 1] = prev + " " + curr;
|
|
4764
5144
|
} else {
|
|
4765
5145
|
result.push(curr);
|
|
@@ -4772,7 +5152,7 @@ function mergeKoreanLines(text) {
|
|
|
4772
5152
|
import { readFile } from "fs/promises";
|
|
4773
5153
|
|
|
4774
5154
|
// src/xlsx/parser.ts
|
|
4775
|
-
import
|
|
5155
|
+
import JSZip2 from "jszip";
|
|
4776
5156
|
import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
|
|
4777
5157
|
var MAX_SHEETS = 100;
|
|
4778
5158
|
var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024;
|
|
@@ -4810,7 +5190,7 @@ function getTextContent(el) {
|
|
|
4810
5190
|
return el.textContent?.trim() ?? "";
|
|
4811
5191
|
}
|
|
4812
5192
|
function parseXml(text) {
|
|
4813
|
-
return new DOMParser2().parseFromString(text, "text/xml");
|
|
5193
|
+
return new DOMParser2().parseFromString(stripDtd(text), "text/xml");
|
|
4814
5194
|
}
|
|
4815
5195
|
function parseSharedStrings(xml) {
|
|
4816
5196
|
const doc = parseXml(xml);
|
|
@@ -4963,7 +5343,7 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
4963
5343
|
}
|
|
4964
5344
|
async function parseXlsxDocument(buffer, options) {
|
|
4965
5345
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
|
|
4966
|
-
const zip = await
|
|
5346
|
+
const zip = await JSZip2.loadAsync(buffer);
|
|
4967
5347
|
const warnings = [];
|
|
4968
5348
|
const workbookFile = zip.file("xl/workbook.xml");
|
|
4969
5349
|
if (!workbookFile) {
|
|
@@ -5053,7 +5433,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
5053
5433
|
}
|
|
5054
5434
|
|
|
5055
5435
|
// src/docx/parser.ts
|
|
5056
|
-
import
|
|
5436
|
+
import JSZip3 from "jszip";
|
|
5057
5437
|
import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
|
|
5058
5438
|
var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
|
|
5059
5439
|
function getChildElements(parent, localName) {
|
|
@@ -5097,7 +5477,7 @@ function getAttr(el, localName) {
|
|
|
5097
5477
|
return null;
|
|
5098
5478
|
}
|
|
5099
5479
|
function parseXml2(text) {
|
|
5100
|
-
return new DOMParser3().parseFromString(text, "text/xml");
|
|
5480
|
+
return new DOMParser3().parseFromString(stripDtd(text), "text/xml");
|
|
5101
5481
|
}
|
|
5102
5482
|
function parseStyles(xml) {
|
|
5103
5483
|
const doc = parseXml2(xml);
|
|
@@ -5391,7 +5771,7 @@ async function extractImages(zip, rels, doc) {
|
|
|
5391
5771
|
}
|
|
5392
5772
|
async function parseDocxDocument(buffer, options) {
|
|
5393
5773
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
|
|
5394
|
-
const zip = await
|
|
5774
|
+
const zip = await JSZip3.loadAsync(buffer);
|
|
5395
5775
|
const warnings = [];
|
|
5396
5776
|
const docFile = zip.file("word/document.xml");
|
|
5397
5777
|
if (!docFile) {
|
|
@@ -5608,7 +5988,7 @@ function extractInlineFields(text) {
|
|
|
5608
5988
|
}
|
|
5609
5989
|
|
|
5610
5990
|
// src/hwpx/generator.ts
|
|
5611
|
-
import
|
|
5991
|
+
import JSZip4 from "jszip";
|
|
5612
5992
|
|
|
5613
5993
|
// src/index.ts
|
|
5614
5994
|
async function parse(input, options) {
|
|
@@ -5703,7 +6083,13 @@ function normalize(s) {
|
|
|
5703
6083
|
}
|
|
5704
6084
|
var MAX_LEVENSHTEIN_LEN = 1e4;
|
|
5705
6085
|
function levenshtein(a, b) {
|
|
5706
|
-
if (a.length + b.length > MAX_LEVENSHTEIN_LEN)
|
|
6086
|
+
if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
|
|
6087
|
+
const sampleLen = Math.min(500, a.length, b.length);
|
|
6088
|
+
let diffs = 0;
|
|
6089
|
+
for (let i = 0; i < sampleLen; i++) if (a[i] !== b[i]) diffs++;
|
|
6090
|
+
const sampleRate = sampleLen > 0 ? diffs / sampleLen : 1;
|
|
6091
|
+
return Math.abs(a.length - b.length) + Math.round(Math.min(a.length, b.length) * sampleRate);
|
|
6092
|
+
}
|
|
5707
6093
|
if (a.length > b.length) [a, b] = [b, a];
|
|
5708
6094
|
const m = a.length;
|
|
5709
6095
|
const n = b.length;
|
|
@@ -5859,7 +6245,10 @@ function diffTableCells(a, b) {
|
|
|
5859
6245
|
}
|
|
5860
6246
|
|
|
5861
6247
|
export {
|
|
5862
|
-
|
|
6248
|
+
VERSION,
|
|
6249
|
+
toArrayBuffer,
|
|
6250
|
+
KordocError,
|
|
6251
|
+
sanitizeError,
|
|
5863
6252
|
blocksToMarkdown,
|
|
5864
6253
|
extractHwpxMetadataOnly,
|
|
5865
6254
|
extractHwp5MetadataOnly,
|
|
@@ -5868,4 +6257,4 @@ export {
|
|
|
5868
6257
|
extractFormFields,
|
|
5869
6258
|
parse
|
|
5870
6259
|
};
|
|
5871
|
-
//# sourceMappingURL=chunk-
|
|
6260
|
+
//# sourceMappingURL=chunk-LYFG7AUT.js.map
|