kordoc 2.1.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +318 -302
- package/dist/{chunk-MOL7MDBG.js → chunk-3TBUDJDE.js} +1 -1
- package/dist/chunk-3TBUDJDE.js.map +1 -0
- package/dist/{chunk-GJ2S6IMC.js → chunk-FINXMRCH.js} +978 -586
- package/dist/chunk-FINXMRCH.js.map +1 -0
- package/dist/chunk-MUAWCQDY.js +52 -0
- package/dist/chunk-MUAWCQDY.js.map +1 -0
- package/dist/cli.js +13 -9
- package/dist/cli.js.map +1 -1
- package/dist/detect-63IGCXTH.js +18 -0
- package/dist/index.cjs +1003 -553
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +1003 -553
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +41 -12
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-OF5I4PQY.js +8 -0
- package/dist/{watch-X7IC7MLF.js → watch-Q6L4UBTC.js} +32 -16
- package/dist/watch-Q6L4UBTC.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-GJ2S6IMC.js.map +0 -1
- package/dist/chunk-MOL7MDBG.js.map +0 -1
- package/dist/chunk-PKIJLEV6.js +0 -93
- package/dist/chunk-PKIJLEV6.js.map +0 -1
- package/dist/page-range-737B4EZW.js +0 -8
- package/dist/utils-BWQ2RGUD.js +0 -22
- package/dist/watch-X7IC7MLF.js.map +0 -1
- /package/dist/{page-range-737B4EZW.js.map → detect-63IGCXTH.js.map} +0 -0
- /package/dist/{utils-BWQ2RGUD.js.map → page-range-OF5I4PQY.js.map} +0 -0
|
@@ -1,53 +1,105 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import {
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
precheckZipSize,
|
|
7
|
-
sanitizeHref,
|
|
8
|
-
toArrayBuffer
|
|
9
|
-
} from "./chunk-PKIJLEV6.js";
|
|
3
|
+
detectFormat,
|
|
4
|
+
detectZipFormat
|
|
5
|
+
} from "./chunk-MUAWCQDY.js";
|
|
10
6
|
import {
|
|
11
7
|
parsePageRange
|
|
12
|
-
} from "./chunk-
|
|
8
|
+
} from "./chunk-3TBUDJDE.js";
|
|
13
9
|
|
|
14
|
-
// src/
|
|
15
|
-
|
|
16
|
-
function
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
return b[0] === 80 && b[1] === 75 && b[2] === 3 && b[3] === 4;
|
|
22
|
-
}
|
|
23
|
-
function isOldHwpFile(buffer) {
|
|
24
|
-
const b = magicBytes(buffer);
|
|
25
|
-
return b[0] === 208 && b[1] === 207 && b[2] === 17 && b[3] === 224;
|
|
10
|
+
// src/utils.ts
|
|
11
|
+
var VERSION = true ? "2.2.1" : "0.0.0-dev";
|
|
12
|
+
function toArrayBuffer(buf) {
|
|
13
|
+
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
14
|
+
return buf.buffer;
|
|
15
|
+
}
|
|
16
|
+
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
26
17
|
}
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
18
|
+
var KordocError = class extends Error {
|
|
19
|
+
constructor(message) {
|
|
20
|
+
super(message);
|
|
21
|
+
this.name = "KordocError";
|
|
22
|
+
}
|
|
23
|
+
};
|
|
24
|
+
function sanitizeError(err) {
|
|
25
|
+
if (err instanceof KordocError) return err.message;
|
|
26
|
+
return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
|
|
30
27
|
}
|
|
31
|
-
function
|
|
32
|
-
if (
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
return "unknown";
|
|
28
|
+
function isPathTraversal(name) {
|
|
29
|
+
if (name.includes("\0")) return true;
|
|
30
|
+
const normalized = name.replace(/\\/g, "/");
|
|
31
|
+
const segments = normalized.split("/");
|
|
32
|
+
return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
37
33
|
}
|
|
38
|
-
|
|
34
|
+
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
39
35
|
try {
|
|
40
|
-
const
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
36
|
+
const data = new DataView(buffer);
|
|
37
|
+
const len = buffer.byteLength;
|
|
38
|
+
let eocdOffset = -1;
|
|
39
|
+
for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
|
|
40
|
+
if (data.getUint32(i, true) === 101010256) {
|
|
41
|
+
eocdOffset = i;
|
|
42
|
+
break;
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
46
|
+
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
47
|
+
if (entryCount > maxEntries) {
|
|
48
|
+
throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
|
|
49
|
+
}
|
|
50
|
+
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
51
|
+
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
52
|
+
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
53
|
+
let totalUncompressed = 0;
|
|
54
|
+
let pos = cdOffset;
|
|
55
|
+
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
56
|
+
if (data.getUint32(pos, true) !== 33639248) break;
|
|
57
|
+
totalUncompressed += data.getUint32(pos + 24, true);
|
|
58
|
+
const nameLen = data.getUint16(pos + 28, true);
|
|
59
|
+
const extraLen = data.getUint16(pos + 30, true);
|
|
60
|
+
const commentLen = data.getUint16(pos + 32, true);
|
|
61
|
+
pos += 46 + nameLen + extraLen + commentLen;
|
|
62
|
+
}
|
|
63
|
+
if (totalUncompressed > maxUncompressedSize) {
|
|
64
|
+
throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
|
|
65
|
+
}
|
|
66
|
+
return { totalUncompressed, entryCount };
|
|
67
|
+
} catch (err) {
|
|
68
|
+
if (err instanceof KordocError) throw err;
|
|
69
|
+
return { totalUncompressed: 0, entryCount: 0 };
|
|
49
70
|
}
|
|
50
71
|
}
|
|
72
|
+
function stripDtd(xml) {
|
|
73
|
+
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
74
|
+
}
|
|
75
|
+
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
76
|
+
function sanitizeHref(href) {
|
|
77
|
+
const trimmed = href.trim();
|
|
78
|
+
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
79
|
+
return trimmed;
|
|
80
|
+
}
|
|
81
|
+
function safeMin(arr) {
|
|
82
|
+
let min = Infinity;
|
|
83
|
+
for (let i = 0; i < arr.length; i++) if (arr[i] < min) min = arr[i];
|
|
84
|
+
return min;
|
|
85
|
+
}
|
|
86
|
+
function safeMax(arr) {
|
|
87
|
+
let max = -Infinity;
|
|
88
|
+
for (let i = 0; i < arr.length; i++) if (arr[i] > max) max = arr[i];
|
|
89
|
+
return max;
|
|
90
|
+
}
|
|
91
|
+
function classifyError(err) {
|
|
92
|
+
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
93
|
+
const msg = err.message;
|
|
94
|
+
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
95
|
+
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
96
|
+
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
97
|
+
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
98
|
+
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
99
|
+
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
100
|
+
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
101
|
+
return "PARSE_ERROR";
|
|
102
|
+
}
|
|
51
103
|
|
|
52
104
|
// src/table/builder.ts
|
|
53
105
|
var MAX_COLS = 200;
|
|
@@ -110,6 +162,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
110
162
|
if (end > maxCols) maxCols = end;
|
|
111
163
|
}
|
|
112
164
|
}
|
|
165
|
+
if (maxCols > MAX_COLS) maxCols = MAX_COLS;
|
|
113
166
|
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
114
167
|
const grid = Array.from(
|
|
115
168
|
{ length: numRows },
|
|
@@ -119,7 +172,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
119
172
|
for (const cell of row) {
|
|
120
173
|
const r = cell.rowAddr ?? 0;
|
|
121
174
|
const c = cell.colAddr ?? 0;
|
|
122
|
-
if (r >= numRows || c >= maxCols) continue;
|
|
175
|
+
if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
|
|
123
176
|
grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
|
|
124
177
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
125
178
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
@@ -148,9 +201,12 @@ function trimAndReturn(grid, numRows, maxCols) {
|
|
|
148
201
|
}
|
|
149
202
|
function convertTableToText(rows) {
|
|
150
203
|
return rows.map(
|
|
151
|
-
(row) => row.map((c) => c.text.trim().replace(/\n/g, " ")).filter(Boolean).join("
|
|
204
|
+
(row) => row.map((c) => c.text.trim().replace(/\n/g, " ").replace(/\|/g, "\\|")).filter(Boolean).join(" / ")
|
|
152
205
|
).filter(Boolean).join("\n");
|
|
153
206
|
}
|
|
207
|
+
function escapeGfm(text) {
|
|
208
|
+
return text.replace(/~/g, "\\~");
|
|
209
|
+
}
|
|
154
210
|
var HWP_SHAPE_ALT_TEXT_RE = /(?:모서리가 둥근 |둥근 )?(?:사각형|직사각형|정사각형|원|타원|삼각형|이등변 삼각형|직각 삼각형|선|직선|곡선|화살표|굵은 화살표|이중 화살표|오각형|육각형|팔각형|별|[4-8]점별|십자|십자형|구름|구름형|마름모|도넛|평행사변형|사다리꼴|부채꼴|호|반원|물결|번개|하트|빗금|블록 화살표|수식|표|그림|개체|그리기\s?개체|묶음\s?개체|글상자|수식\s?개체|OLE\s?개체)\s?입니다\.?/g;
|
|
155
211
|
function sanitizeText(text) {
|
|
156
212
|
let result = text.replace(/[\u{F0000}-\u{FFFFD}]/gu, "").replace(HWP_SHAPE_ALT_TEXT_RE, "").replace(/ +/g, " ").trim();
|
|
@@ -260,7 +316,7 @@ function blocksToMarkdown(blocks) {
|
|
|
260
316
|
if (block.footnoteText) {
|
|
261
317
|
text += ` (\uC8FC: ${block.footnoteText})`;
|
|
262
318
|
}
|
|
263
|
-
lines.push(text);
|
|
319
|
+
lines.push(escapeGfm(text), "");
|
|
264
320
|
} else if (block.type === "table" && block.table) {
|
|
265
321
|
if (lines.length > 0 && lines[lines.length - 1] !== "") {
|
|
266
322
|
lines.push("");
|
|
@@ -283,13 +339,13 @@ function tableToMarkdown(table) {
|
|
|
283
339
|
return content.split(/\n/).map((line) => {
|
|
284
340
|
const trimmed = line.trim();
|
|
285
341
|
if (!trimmed) return "";
|
|
286
|
-
if (/^\d+\.\s/.test(trimmed)) return `**${trimmed}**`;
|
|
287
|
-
if (/^[가-힣]\.\s/.test(trimmed)) return ` ${trimmed}`;
|
|
288
|
-
return trimmed;
|
|
342
|
+
if (/^\d+\.\s/.test(trimmed)) return `**${escapeGfm(trimmed)}**`;
|
|
343
|
+
if (/^[가-힣]\.\s/.test(trimmed)) return ` ${escapeGfm(trimmed)}`;
|
|
344
|
+
return escapeGfm(trimmed);
|
|
289
345
|
}).filter(Boolean).join("\n");
|
|
290
346
|
}
|
|
291
347
|
if (numCols === 1 && numRows >= 2) {
|
|
292
|
-
return cells.map((row) => sanitizeText(row[0].text).replace(/\n/g, " ")).filter(Boolean).join("\n");
|
|
348
|
+
return cells.map((row) => escapeGfm(sanitizeText(row[0].text)).replace(/\n/g, " ")).filter(Boolean).join("\n");
|
|
293
349
|
}
|
|
294
350
|
const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
|
|
295
351
|
const skip = /* @__PURE__ */ new Set();
|
|
@@ -298,15 +354,12 @@ function tableToMarkdown(table) {
|
|
|
298
354
|
if (skip.has(`${r},${c}`)) continue;
|
|
299
355
|
const cell = cells[r]?.[c];
|
|
300
356
|
if (!cell) continue;
|
|
301
|
-
display[r][c] = sanitizeText(cell.text).replace(/\n/g, "<br>");
|
|
357
|
+
display[r][c] = escapeGfm(sanitizeText(cell.text)).replace(/\|/g, "\\|").replace(/\n/g, "<br>");
|
|
302
358
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
303
359
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
304
360
|
if (dr === 0 && dc === 0) continue;
|
|
305
361
|
if (r + dr < numRows && c + dc < numCols) {
|
|
306
362
|
skip.add(`${r + dr},${c + dc}`);
|
|
307
|
-
if (dr === 0) {
|
|
308
|
-
display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
|
|
309
|
-
}
|
|
310
363
|
}
|
|
311
364
|
}
|
|
312
365
|
}
|
|
@@ -344,7 +397,7 @@ function tableToMarkdown(table) {
|
|
|
344
397
|
}
|
|
345
398
|
|
|
346
399
|
// src/hwpx/parser.ts
|
|
347
|
-
import
|
|
400
|
+
import JSZip from "jszip";
|
|
348
401
|
import { inflateRawSync } from "zlib";
|
|
349
402
|
import { DOMParser } from "@xmldom/xmldom";
|
|
350
403
|
|
|
@@ -446,14 +499,11 @@ function parseStyleElements(doc, map) {
|
|
|
446
499
|
}
|
|
447
500
|
}
|
|
448
501
|
}
|
|
449
|
-
function stripDtd(xml) {
|
|
450
|
-
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
451
|
-
}
|
|
452
502
|
async function parseHwpxDocument(buffer, options) {
|
|
453
503
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
454
504
|
let zip;
|
|
455
505
|
try {
|
|
456
|
-
zip = await
|
|
506
|
+
zip = await JSZip.loadAsync(buffer);
|
|
457
507
|
} catch {
|
|
458
508
|
return extractFromBrokenZip(buffer);
|
|
459
509
|
}
|
|
@@ -616,7 +666,7 @@ function parseDublinCoreMetadata(xml, metadata) {
|
|
|
616
666
|
async function extractHwpxMetadataOnly(buffer) {
|
|
617
667
|
let zip;
|
|
618
668
|
try {
|
|
619
|
-
zip = await
|
|
669
|
+
zip = await JSZip.loadAsync(buffer);
|
|
620
670
|
} catch {
|
|
621
671
|
throw new KordocError("HWPX ZIP\uC744 \uC5F4 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
622
672
|
}
|
|
@@ -811,7 +861,8 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
811
861
|
if (newTable.rows.length > 0) {
|
|
812
862
|
if (tableStack.length > 0) {
|
|
813
863
|
const parentTable = tableStack.pop();
|
|
814
|
-
|
|
864
|
+
let nestedCols = 0;
|
|
865
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
815
866
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
816
867
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
817
868
|
} else {
|
|
@@ -920,7 +971,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
920
971
|
if (newTable.rows.length > 0) {
|
|
921
972
|
if (tableStack.length > 0) {
|
|
922
973
|
const parentTable = tableStack.pop();
|
|
923
|
-
|
|
974
|
+
let nestedCols = 0;
|
|
975
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
924
976
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
925
977
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
926
978
|
} else {
|
|
@@ -2018,6 +2070,7 @@ function parseLenientCfb(data) {
|
|
|
2018
2070
|
if (miniSectorSizeShift > 16) throw new Error("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uBBF8\uB2C8 \uC139\uD130 \uD06C\uAE30 \uC2DC\uD504\uD2B8: " + miniSectorSizeShift);
|
|
2019
2071
|
const miniSectorSize = 1 << miniSectorSizeShift;
|
|
2020
2072
|
const fatSectorCount = data.readUInt32LE(44);
|
|
2073
|
+
if (fatSectorCount > 1e4) throw new Error("FAT \uC139\uD130 \uC218\uAC00 \uB108\uBB34 \uB9CE\uC2B5\uB2C8\uB2E4: " + fatSectorCount);
|
|
2021
2074
|
const firstDirSector = data.readUInt32LE(48);
|
|
2022
2075
|
const miniStreamCutoff = data.readUInt32LE(56);
|
|
2023
2076
|
const firstMiniFatSector = data.readUInt32LE(60);
|
|
@@ -2406,10 +2459,14 @@ function findSections(cfb) {
|
|
|
2406
2459
|
}
|
|
2407
2460
|
function findSectionsLenient(lcfb, compressed) {
|
|
2408
2461
|
const sections = [];
|
|
2462
|
+
let totalDecompressed = 0;
|
|
2409
2463
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2410
2464
|
const raw = lcfb.findStream(`/BodyText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2411
2465
|
if (!raw) break;
|
|
2412
|
-
|
|
2466
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2467
|
+
totalDecompressed += content.length;
|
|
2468
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2469
|
+
sections.push({ idx: i, content });
|
|
2413
2470
|
}
|
|
2414
2471
|
if (sections.length === 0) {
|
|
2415
2472
|
for (const e of lcfb.entries()) {
|
|
@@ -2417,7 +2474,12 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2417
2474
|
if (e.name.startsWith("Section")) {
|
|
2418
2475
|
const idx = parseInt(e.name.replace("Section", ""), 10) || 0;
|
|
2419
2476
|
const raw = lcfb.findStream(e.name);
|
|
2420
|
-
if (raw)
|
|
2477
|
+
if (raw) {
|
|
2478
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2479
|
+
totalDecompressed += content.length;
|
|
2480
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2481
|
+
sections.push({ idx, content });
|
|
2482
|
+
}
|
|
2421
2483
|
}
|
|
2422
2484
|
}
|
|
2423
2485
|
}
|
|
@@ -2425,11 +2487,15 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2425
2487
|
}
|
|
2426
2488
|
function findViewTextSectionsLenient(lcfb, compressed) {
|
|
2427
2489
|
const sections = [];
|
|
2490
|
+
let totalDecompressed = 0;
|
|
2428
2491
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2429
2492
|
const raw = lcfb.findStream(`/ViewText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2430
2493
|
if (!raw) break;
|
|
2431
2494
|
try {
|
|
2432
|
-
|
|
2495
|
+
const content = decryptViewText(raw, compressed);
|
|
2496
|
+
totalDecompressed += content.length;
|
|
2497
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2498
|
+
sections.push({ idx: i, content });
|
|
2433
2499
|
} catch {
|
|
2434
2500
|
break;
|
|
2435
2501
|
}
|
|
@@ -2828,37 +2894,18 @@ function arrangeCells(rows, cols, cells) {
|
|
|
2828
2894
|
// src/pdf/line-detector.ts
|
|
2829
2895
|
import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
2830
2896
|
var ORIENTATION_TOL = 2;
|
|
2831
|
-
var MIN_LINE_LENGTH =
|
|
2832
|
-
var
|
|
2897
|
+
var MIN_LINE_LENGTH = 15;
|
|
2898
|
+
var MAX_LINE_WIDTH = 5;
|
|
2833
2899
|
var CONNECT_TOL = 5;
|
|
2834
2900
|
var CELL_PADDING = 2;
|
|
2835
|
-
var
|
|
2836
|
-
var
|
|
2837
|
-
|
|
2838
|
-
|
|
2839
|
-
m1[0] * m2[0] + m1[2] * m2[1],
|
|
2840
|
-
m1[1] * m2[0] + m1[3] * m2[1],
|
|
2841
|
-
m1[0] * m2[2] + m1[2] * m2[3],
|
|
2842
|
-
m1[1] * m2[2] + m1[3] * m2[3],
|
|
2843
|
-
m1[0] * m2[4] + m1[2] * m2[5] + m1[4],
|
|
2844
|
-
m1[1] * m2[4] + m1[3] * m2[5] + m1[5]
|
|
2845
|
-
];
|
|
2846
|
-
}
|
|
2847
|
-
function matTransformPoint(m, x, y) {
|
|
2848
|
-
return [m[0] * x + m[2] * y + m[4], m[1] * x + m[3] * y + m[5]];
|
|
2849
|
-
}
|
|
2850
|
-
function matScale(m) {
|
|
2851
|
-
return Math.max(
|
|
2852
|
-
Math.sqrt(m[1] * m[1] + m[3] * m[3]),
|
|
2853
|
-
Math.sqrt(m[0] * m[0] + m[2] * m[2])
|
|
2854
|
-
);
|
|
2855
|
-
}
|
|
2901
|
+
var MIN_COL_WIDTH = 15;
|
|
2902
|
+
var MIN_ROW_HEIGHT = 6;
|
|
2903
|
+
var VERTEX_MERGE_FACTOR = 4;
|
|
2904
|
+
var MIN_COORD_MERGE_TOL = 8;
|
|
2856
2905
|
function extractLines(fnArray, argsArray) {
|
|
2857
2906
|
const horizontals = [];
|
|
2858
2907
|
const verticals = [];
|
|
2859
|
-
let ctm = [...IDENTITY];
|
|
2860
2908
|
let lineWidth = 1;
|
|
2861
|
-
const stateStack = [];
|
|
2862
2909
|
let currentPath = [];
|
|
2863
2910
|
let pathStartX = 0, pathStartY = 0;
|
|
2864
2911
|
let curX = 0, curY = 0;
|
|
@@ -2876,53 +2923,13 @@ function extractLines(fnArray, argsArray) {
|
|
|
2876
2923
|
);
|
|
2877
2924
|
}
|
|
2878
2925
|
}
|
|
2879
|
-
function
|
|
2880
|
-
if (
|
|
2881
|
-
const first = path[0], last = path[path.length - 1];
|
|
2882
|
-
const closed = Math.abs(first.x1 - last.x2) < 1 && Math.abs(first.y1 - last.y2) < 1;
|
|
2883
|
-
if (!closed) return false;
|
|
2884
|
-
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
2885
|
-
for (const seg of path) {
|
|
2886
|
-
minX = Math.min(minX, seg.x1, seg.x2);
|
|
2887
|
-
minY = Math.min(minY, seg.y1, seg.y2);
|
|
2888
|
-
maxX = Math.max(maxX, seg.x1, seg.x2);
|
|
2889
|
-
maxY = Math.max(maxY, seg.y1, seg.y2);
|
|
2890
|
-
}
|
|
2891
|
-
const w = maxX - minX, h = maxY - minY;
|
|
2892
|
-
if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;
|
|
2893
|
-
path.length = 0;
|
|
2894
|
-
if (h < ORIENTATION_TOL * 2 || w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH) {
|
|
2895
|
-
path.push({ x1: minX, y1: (minY + maxY) / 2, x2: maxX, y2: (minY + maxY) / 2 });
|
|
2896
|
-
} else if (w < ORIENTATION_TOL * 2 || h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH) {
|
|
2897
|
-
path.push({ x1: (minX + maxX) / 2, y1: minY, x2: (minX + maxX) / 2, y2: maxY });
|
|
2898
|
-
} else {
|
|
2899
|
-
pushRectangle(path, minX, minY, w, h);
|
|
2900
|
-
}
|
|
2901
|
-
return true;
|
|
2902
|
-
}
|
|
2903
|
-
function flushPath(isStroke, isFill) {
|
|
2904
|
-
if (!isStroke && !isFill) {
|
|
2905
|
-
currentPath = [];
|
|
2906
|
-
return;
|
|
2907
|
-
}
|
|
2908
|
-
if (isFill && !isStroke && currentPath.length >= 3) {
|
|
2909
|
-
tryConvertLinesToRectangle(currentPath);
|
|
2910
|
-
}
|
|
2911
|
-
const scale = matScale(ctm);
|
|
2912
|
-
const effectiveLW = lineWidth * scale;
|
|
2913
|
-
if (effectiveLW > MAX_LINE_WIDTH && isStroke && !isFill) {
|
|
2926
|
+
function flushPath(isStroke) {
|
|
2927
|
+
if (!isStroke) {
|
|
2914
2928
|
currentPath = [];
|
|
2915
2929
|
return;
|
|
2916
2930
|
}
|
|
2917
2931
|
for (const seg of currentPath) {
|
|
2918
|
-
|
|
2919
|
-
const [px2, py2] = matTransformPoint(ctm, seg.x2, seg.y2);
|
|
2920
|
-
classifyAndAdd(
|
|
2921
|
-
{ x1: px1, y1: py1, x2: px2, y2: py2 },
|
|
2922
|
-
effectiveLW,
|
|
2923
|
-
horizontals,
|
|
2924
|
-
verticals
|
|
2925
|
-
);
|
|
2932
|
+
classifyAndAdd(seg, lineWidth, horizontals, verticals);
|
|
2926
2933
|
}
|
|
2927
2934
|
currentPath = [];
|
|
2928
2935
|
}
|
|
@@ -2930,28 +2937,9 @@ function extractLines(fnArray, argsArray) {
|
|
|
2930
2937
|
const op = fnArray[i];
|
|
2931
2938
|
const args = argsArray[i];
|
|
2932
2939
|
switch (op) {
|
|
2933
|
-
// ── Graphics State ──
|
|
2934
|
-
case OPS.save:
|
|
2935
|
-
stateStack.push({ ctm: [...ctm], lineWidth });
|
|
2936
|
-
break;
|
|
2937
|
-
case OPS.restore:
|
|
2938
|
-
if (stateStack.length > 0) {
|
|
2939
|
-
const state = stateStack.pop();
|
|
2940
|
-
ctm = state.ctm;
|
|
2941
|
-
lineWidth = state.lineWidth;
|
|
2942
|
-
}
|
|
2943
|
-
break;
|
|
2944
|
-
case OPS.transform: {
|
|
2945
|
-
const m = args;
|
|
2946
|
-
if (m.length >= 6) {
|
|
2947
|
-
ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
|
|
2948
|
-
}
|
|
2949
|
-
break;
|
|
2950
|
-
}
|
|
2951
2940
|
case OPS.setLineWidth:
|
|
2952
2941
|
lineWidth = args[0] || 1;
|
|
2953
2942
|
break;
|
|
2954
|
-
// ── Path Construction ──
|
|
2955
2943
|
case OPS.constructPath: {
|
|
2956
2944
|
const arg0 = args[0];
|
|
2957
2945
|
if (Array.isArray(arg0)) {
|
|
@@ -3019,60 +3007,34 @@ function extractLines(fnArray, argsArray) {
|
|
|
3019
3007
|
}
|
|
3020
3008
|
}
|
|
3021
3009
|
}
|
|
3022
|
-
|
|
3023
|
-
|
|
3024
|
-
|
|
3025
|
-
|
|
3026
|
-
flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
|
|
3010
|
+
if (afterOp === OPS.stroke || afterOp === OPS.closeStroke) {
|
|
3011
|
+
flushPath(true);
|
|
3012
|
+
} else if (afterOp === OPS.fill || afterOp === OPS.eoFill || afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke) {
|
|
3013
|
+
flushPath(true);
|
|
3027
3014
|
} else if (afterOp === OPS.endPath) {
|
|
3028
|
-
flushPath(false
|
|
3015
|
+
flushPath(false);
|
|
3029
3016
|
}
|
|
3030
3017
|
}
|
|
3031
3018
|
break;
|
|
3032
3019
|
}
|
|
3033
|
-
// ── Paint Operations ──
|
|
3034
3020
|
case OPS.stroke:
|
|
3035
3021
|
case OPS.closeStroke:
|
|
3036
|
-
flushPath(true
|
|
3022
|
+
flushPath(true);
|
|
3037
3023
|
break;
|
|
3038
3024
|
case OPS.fill:
|
|
3039
3025
|
case OPS.eoFill:
|
|
3040
|
-
flushPath(false, true);
|
|
3041
|
-
break;
|
|
3042
3026
|
case OPS.fillStroke:
|
|
3043
3027
|
case OPS.eoFillStroke:
|
|
3044
3028
|
case OPS.closeFillStroke:
|
|
3045
3029
|
case OPS.closeEOFillStroke:
|
|
3046
|
-
flushPath(true
|
|
3030
|
+
flushPath(true);
|
|
3047
3031
|
break;
|
|
3048
3032
|
case OPS.endPath:
|
|
3049
|
-
flushPath(false
|
|
3050
|
-
break;
|
|
3051
|
-
}
|
|
3052
|
-
}
|
|
3053
|
-
return {
|
|
3054
|
-
horizontals: deduplicateLines(horizontals),
|
|
3055
|
-
verticals: deduplicateLines(verticals)
|
|
3056
|
-
};
|
|
3057
|
-
}
|
|
3058
|
-
function deduplicateLines(lines) {
|
|
3059
|
-
if (lines.length <= 1) return lines;
|
|
3060
|
-
const result = [];
|
|
3061
|
-
const tol = COORD_MERGE_TOL;
|
|
3062
|
-
for (const line of lines) {
|
|
3063
|
-
let isDuplicate = false;
|
|
3064
|
-
for (const existing of result) {
|
|
3065
|
-
if (Math.abs(line.y1 - existing.y1) <= tol && Math.abs(line.y2 - existing.y2) <= tol && Math.abs(line.x1 - existing.x1) <= tol && Math.abs(line.x2 - existing.x2) <= tol) {
|
|
3066
|
-
if (line.lineWidth > existing.lineWidth) {
|
|
3067
|
-
existing.lineWidth = line.lineWidth;
|
|
3068
|
-
}
|
|
3069
|
-
isDuplicate = true;
|
|
3033
|
+
flushPath(false);
|
|
3070
3034
|
break;
|
|
3071
|
-
}
|
|
3072
3035
|
}
|
|
3073
|
-
if (!isDuplicate) result.push(line);
|
|
3074
3036
|
}
|
|
3075
|
-
return
|
|
3037
|
+
return { horizontals, verticals };
|
|
3076
3038
|
}
|
|
3077
3039
|
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
3078
3040
|
const dx = Math.abs(seg.x2 - seg.x1);
|
|
@@ -3091,6 +3053,55 @@ function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
|
3091
3053
|
verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
|
|
3092
3054
|
}
|
|
3093
3055
|
}
|
|
3056
|
+
function preprocessLines(horizontals, verticals) {
|
|
3057
|
+
let h = horizontals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3058
|
+
let v = verticals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3059
|
+
h = mergeParallelLines(h, "h");
|
|
3060
|
+
v = mergeParallelLines(v, "v");
|
|
3061
|
+
return { horizontals: h, verticals: v };
|
|
3062
|
+
}
|
|
3063
|
+
function mergeParallelLines(lines, dir) {
|
|
3064
|
+
if (lines.length <= 1) return lines;
|
|
3065
|
+
const sorted = [...lines].sort((a, b) => {
|
|
3066
|
+
const posA = dir === "h" ? a.y1 : a.x1;
|
|
3067
|
+
const posB = dir === "h" ? b.y1 : b.x1;
|
|
3068
|
+
if (Math.abs(posA - posB) > 0.1) return posA - posB;
|
|
3069
|
+
return dir === "h" ? a.x1 - b.x1 : a.y1 - b.y1;
|
|
3070
|
+
});
|
|
3071
|
+
const MERGE_TOL = 3;
|
|
3072
|
+
const result = [sorted[0]];
|
|
3073
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
3074
|
+
const prev = result[result.length - 1];
|
|
3075
|
+
const curr = sorted[i];
|
|
3076
|
+
const prevPos = dir === "h" ? prev.y1 : prev.x1;
|
|
3077
|
+
const currPos = dir === "h" ? curr.y1 : curr.x1;
|
|
3078
|
+
if (Math.abs(prevPos - currPos) <= MERGE_TOL) {
|
|
3079
|
+
const prevStart = dir === "h" ? prev.x1 : prev.y1;
|
|
3080
|
+
const prevEnd = dir === "h" ? prev.x2 : prev.y2;
|
|
3081
|
+
const currStart = dir === "h" ? curr.x1 : curr.y1;
|
|
3082
|
+
const currEnd = dir === "h" ? curr.x2 : curr.y2;
|
|
3083
|
+
const overlap = Math.min(prevEnd, currEnd) - Math.max(prevStart, currStart);
|
|
3084
|
+
const minLen = Math.min(prevEnd - prevStart, currEnd - currStart);
|
|
3085
|
+
if (overlap > minLen * 0.3) {
|
|
3086
|
+
if (dir === "h") {
|
|
3087
|
+
prev.x1 = Math.min(prev.x1, curr.x1);
|
|
3088
|
+
prev.x2 = Math.max(prev.x2, curr.x2);
|
|
3089
|
+
prev.y1 = (prev.y1 + curr.y1) / 2;
|
|
3090
|
+
prev.y2 = prev.y1;
|
|
3091
|
+
} else {
|
|
3092
|
+
prev.y1 = Math.min(prev.y1, curr.y1);
|
|
3093
|
+
prev.y2 = Math.max(prev.y2, curr.y2);
|
|
3094
|
+
prev.x1 = (prev.x1 + curr.x1) / 2;
|
|
3095
|
+
prev.x2 = prev.x1;
|
|
3096
|
+
}
|
|
3097
|
+
prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
|
|
3098
|
+
continue;
|
|
3099
|
+
}
|
|
3100
|
+
}
|
|
3101
|
+
result.push(curr);
|
|
3102
|
+
}
|
|
3103
|
+
return result;
|
|
3104
|
+
}
|
|
3094
3105
|
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
3095
3106
|
const margin = 5;
|
|
3096
3107
|
return {
|
|
@@ -3102,8 +3113,49 @@ function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
|
3102
3113
|
)
|
|
3103
3114
|
};
|
|
3104
3115
|
}
|
|
3116
|
+
function buildVertices(horizontals, verticals) {
|
|
3117
|
+
const vertices = [];
|
|
3118
|
+
const tol = CONNECT_TOL;
|
|
3119
|
+
for (const h of horizontals) {
|
|
3120
|
+
for (const v of verticals) {
|
|
3121
|
+
if (v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol && h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol) {
|
|
3122
|
+
const radius = Math.max(h.lineWidth, v.lineWidth, 1);
|
|
3123
|
+
vertices.push({ x: v.x1, y: h.y1, radius });
|
|
3124
|
+
}
|
|
3125
|
+
}
|
|
3126
|
+
}
|
|
3127
|
+
return vertices;
|
|
3128
|
+
}
|
|
3129
|
+
function mergeVertices(vertices) {
|
|
3130
|
+
if (vertices.length <= 1) return vertices;
|
|
3131
|
+
const merged = [];
|
|
3132
|
+
const used = new Array(vertices.length).fill(false);
|
|
3133
|
+
for (let i = 0; i < vertices.length; i++) {
|
|
3134
|
+
if (used[i]) continue;
|
|
3135
|
+
let sumX = vertices[i].x, sumY = vertices[i].y;
|
|
3136
|
+
let maxRadius = vertices[i].radius;
|
|
3137
|
+
let count = 1;
|
|
3138
|
+
for (let j = i + 1; j < vertices.length; j++) {
|
|
3139
|
+
if (used[j]) continue;
|
|
3140
|
+
const mergeTol = VERTEX_MERGE_FACTOR * Math.max(maxRadius, vertices[j].radius);
|
|
3141
|
+
if (Math.abs(vertices[i].x - vertices[j].x) <= mergeTol && Math.abs(vertices[i].y - vertices[j].y) <= mergeTol) {
|
|
3142
|
+
sumX += vertices[j].x;
|
|
3143
|
+
sumY += vertices[j].y;
|
|
3144
|
+
maxRadius = Math.max(maxRadius, vertices[j].radius);
|
|
3145
|
+
count++;
|
|
3146
|
+
used[j] = true;
|
|
3147
|
+
}
|
|
3148
|
+
}
|
|
3149
|
+
merged.push({ x: sumX / count, y: sumY / count, radius: maxRadius });
|
|
3150
|
+
}
|
|
3151
|
+
return merged;
|
|
3152
|
+
}
|
|
3105
3153
|
function buildTableGrids(horizontals, verticals) {
|
|
3106
3154
|
if (horizontals.length < 2 || verticals.length < 2) return [];
|
|
3155
|
+
const allVertices = buildVertices(horizontals, verticals);
|
|
3156
|
+
const vertices = mergeVertices(allVertices);
|
|
3157
|
+
if (vertices.length < 4) return [];
|
|
3158
|
+
const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
|
|
3107
3159
|
const allLines = [
|
|
3108
3160
|
...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
|
|
3109
3161
|
...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
|
|
@@ -3114,21 +3166,74 @@ function buildTableGrids(horizontals, verticals) {
|
|
|
3114
3166
|
const hLines = group.filter((l) => l.type === "h");
|
|
3115
3167
|
const vLines = group.filter((l) => l.type === "v");
|
|
3116
3168
|
if (hLines.length < 2 || vLines.length < 2) continue;
|
|
3117
|
-
|
|
3118
|
-
const
|
|
3119
|
-
|
|
3120
|
-
|
|
3169
|
+
let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
|
|
3170
|
+
for (const l of vLines) {
|
|
3171
|
+
if (l.x1 < gx1) gx1 = l.x1;
|
|
3172
|
+
if (l.x1 > gx2) gx2 = l.x1;
|
|
3173
|
+
}
|
|
3174
|
+
for (const l of hLines) {
|
|
3175
|
+
if (l.y1 < gy1) gy1 = l.y1;
|
|
3176
|
+
if (l.y1 > gy2) gy2 = l.y1;
|
|
3177
|
+
}
|
|
3178
|
+
const groupBbox = {
|
|
3179
|
+
x1: gx1 - CONNECT_TOL,
|
|
3180
|
+
y1: gy1 - CONNECT_TOL,
|
|
3181
|
+
x2: gx2 + CONNECT_TOL,
|
|
3182
|
+
y2: gy2 + CONNECT_TOL
|
|
3183
|
+
};
|
|
3184
|
+
const groupVertices = vertices.filter(
|
|
3185
|
+
(v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
|
|
3186
|
+
);
|
|
3187
|
+
const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
|
|
3188
|
+
const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
|
|
3189
|
+
const rawYs = [
|
|
3190
|
+
...hLines.map((l) => l.y1),
|
|
3191
|
+
...groupVertices.map((v) => v.y)
|
|
3192
|
+
];
|
|
3193
|
+
const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
|
|
3194
|
+
const rawXs = [
|
|
3195
|
+
...vLines.map((l) => l.x1),
|
|
3196
|
+
...groupVertices.map((v) => v.x)
|
|
3197
|
+
];
|
|
3198
|
+
const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
|
|
3121
3199
|
if (rowYs.length < 2 || colXs.length < 2) continue;
|
|
3200
|
+
const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
|
|
3201
|
+
const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
|
|
3202
|
+
if (validRowYs.length < 2 || validColXs.length < 2) continue;
|
|
3122
3203
|
const bbox = {
|
|
3123
|
-
x1:
|
|
3124
|
-
y1:
|
|
3125
|
-
x2:
|
|
3126
|
-
y2:
|
|
3204
|
+
x1: validColXs[0],
|
|
3205
|
+
y1: validRowYs[validRowYs.length - 1],
|
|
3206
|
+
x2: validColXs[validColXs.length - 1],
|
|
3207
|
+
y2: validRowYs[0]
|
|
3127
3208
|
};
|
|
3128
|
-
grids.push({ rowYs, colXs, bbox });
|
|
3209
|
+
grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
|
|
3129
3210
|
}
|
|
3130
3211
|
return mergeAdjacentGrids(grids);
|
|
3131
3212
|
}
|
|
3213
|
+
/**
 * Removes column boundaries that would create columns narrower than `minWidth`.
 *
 * The first and last boundaries are always preserved so the overall horizontal
 * extent of the grid does not change. When the last boundary sits too close to
 * the previously kept interior boundary, that interior boundary is dropped
 * (instead of keeping both), so the final column also honours the minimum.
 *
 * @param {number[]} colXs - Column boundary x-coordinates, sorted ascending.
 * @param {number} minWidth - Minimum distance required between two kept boundaries.
 * @returns {number[]} `colXs` itself when it holds two or fewer boundaries,
 *   otherwise a new array with the too-close boundaries removed.
 */
function enforceMinWidth(colXs, minWidth) {
  if (colXs.length <= 2) return colXs;
  const lastIndex = colXs.length - 1;
  const kept = [colXs[0]];
  for (let i = 1; i < lastIndex; i++) {
    // Skip interior boundaries that are too close to the last kept one.
    if (colXs[i] - kept[kept.length - 1] < minWidth) continue;
    kept.push(colXs[i]);
  }
  const lastX = colXs[lastIndex];
  // The outer boundary must survive; sacrifice the neighbouring interior
  // boundary rather than emit a sliver column. The first boundary is never
  // removed, so a fully collapsed grid still returns [first, last].
  if (kept.length > 1 && lastX - kept[kept.length - 1] < minWidth) {
    kept.pop();
  }
  kept.push(lastX);
  return kept;
}
|
|
3225
|
+
/**
 * Removes row boundaries that would create rows shorter than `minHeight`.
 *
 * The first and last boundaries are always preserved so the overall vertical
 * extent of the grid does not change. When the last boundary sits too close to
 * the previously kept interior boundary, that interior boundary is dropped
 * (instead of keeping both), so the final row also honours the minimum.
 *
 * @param {number[]} rowYs - Row boundary y-coordinates, sorted descending.
 * @param {number} minHeight - Minimum distance required between two kept boundaries.
 * @returns {number[]} `rowYs` itself when it holds two or fewer boundaries,
 *   otherwise a new array with the too-close boundaries removed.
 */
function enforceMinHeight(rowYs, minHeight) {
  if (rowYs.length <= 2) return rowYs;
  const lastIndex = rowYs.length - 1;
  const kept = [rowYs[0]];
  for (let i = 1; i < lastIndex; i++) {
    // Coordinates descend, so the height is previous minus current.
    if (kept[kept.length - 1] - rowYs[i] < minHeight) continue;
    kept.push(rowYs[i]);
  }
  const lastY = rowYs[lastIndex];
  // The outer boundary must survive; sacrifice the neighbouring interior
  // boundary rather than emit a sliver row. The first boundary is never
  // removed, so a fully collapsed grid still returns [first, last].
  if (kept.length > 1 && kept[kept.length - 1] - lastY < minHeight) {
    kept.pop();
  }
  kept.push(lastY);
  return kept;
}
|
|
3132
3237
|
function mergeAdjacentGrids(grids) {
|
|
3133
3238
|
if (grids.length <= 1) return grids;
|
|
3134
3239
|
const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
@@ -3137,9 +3242,10 @@ function mergeAdjacentGrids(grids) {
|
|
|
3137
3242
|
const prev = merged[merged.length - 1];
|
|
3138
3243
|
const curr = sorted[i];
|
|
3139
3244
|
if (prev.colXs.length === curr.colXs.length) {
|
|
3140
|
-
const
|
|
3245
|
+
const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
|
|
3246
|
+
const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
|
|
3141
3247
|
const verticalGap = prev.bbox.y1 - curr.bbox.y2;
|
|
3142
|
-
if (colMatch && verticalGap >= -
|
|
3248
|
+
if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
|
|
3143
3249
|
const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
|
|
3144
3250
|
merged[merged.length - 1] = {
|
|
3145
3251
|
rowYs: allRowYs,
|
|
@@ -3149,7 +3255,8 @@ function mergeAdjacentGrids(grids) {
|
|
|
3149
3255
|
y1: Math.min(prev.bbox.y1, curr.bbox.y1),
|
|
3150
3256
|
x2: Math.max(prev.bbox.x2, curr.bbox.x2),
|
|
3151
3257
|
y2: Math.max(prev.bbox.y2, curr.bbox.y2)
|
|
3152
|
-
}
|
|
3258
|
+
},
|
|
3259
|
+
vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
|
|
3153
3260
|
};
|
|
3154
3261
|
continue;
|
|
3155
3262
|
}
|
|
@@ -3158,14 +3265,14 @@ function mergeAdjacentGrids(grids) {
|
|
|
3158
3265
|
}
|
|
3159
3266
|
return merged;
|
|
3160
3267
|
}
|
|
3161
|
-
function clusterCoordinates(values) {
|
|
3268
|
+
function clusterCoordinates(values, tolerance) {
|
|
3162
3269
|
if (values.length === 0) return [];
|
|
3163
3270
|
const sorted = [...values].sort((a, b) => a - b);
|
|
3164
3271
|
const clusters = [{ sum: sorted[0], count: 1 }];
|
|
3165
3272
|
for (let i = 1; i < sorted.length; i++) {
|
|
3166
3273
|
const last = clusters[clusters.length - 1];
|
|
3167
3274
|
const avg = last.sum / last.count;
|
|
3168
|
-
if (Math.abs(sorted[i] - avg) <=
|
|
3275
|
+
if (Math.abs(sorted[i] - avg) <= tolerance) {
|
|
3169
3276
|
last.sum += sorted[i];
|
|
3170
3277
|
last.count++;
|
|
3171
3278
|
} else {
|
|
@@ -3222,6 +3329,20 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3222
3329
|
const numRows = rowYs.length - 1;
|
|
3223
3330
|
const numCols = colXs.length - 1;
|
|
3224
3331
|
if (numRows <= 0 || numCols <= 0) return [];
|
|
3332
|
+
const vBorders = Array.from(
|
|
3333
|
+
{ length: numRows },
|
|
3334
|
+
(_, r) => Array.from(
|
|
3335
|
+
{ length: numCols + 1 },
|
|
3336
|
+
(_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
|
|
3337
|
+
)
|
|
3338
|
+
);
|
|
3339
|
+
const hBorders = Array.from(
|
|
3340
|
+
{ length: numRows + 1 },
|
|
3341
|
+
(_, r) => Array.from(
|
|
3342
|
+
{ length: numCols },
|
|
3343
|
+
(_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
|
|
3344
|
+
)
|
|
3345
|
+
);
|
|
3225
3346
|
const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
|
|
3226
3347
|
const cells = [];
|
|
3227
3348
|
for (let r = 0; r < numRows; r++) {
|
|
@@ -3229,18 +3350,26 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3229
3350
|
if (occupied[r][c]) continue;
|
|
3230
3351
|
let colSpan = 1;
|
|
3231
3352
|
let rowSpan = 1;
|
|
3232
|
-
while (c + colSpan < numCols) {
|
|
3233
|
-
|
|
3234
|
-
|
|
3235
|
-
|
|
3236
|
-
|
|
3353
|
+
while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
|
|
3354
|
+
let canExpand = true;
|
|
3355
|
+
for (let dr = 0; dr < rowSpan; dr++) {
|
|
3356
|
+
if (vBorders[r + dr][c + colSpan]) {
|
|
3357
|
+
canExpand = false;
|
|
3358
|
+
break;
|
|
3359
|
+
}
|
|
3360
|
+
}
|
|
3361
|
+
if (!canExpand) break;
|
|
3237
3362
|
colSpan++;
|
|
3238
3363
|
}
|
|
3239
3364
|
while (r + rowSpan < numRows) {
|
|
3240
|
-
|
|
3241
|
-
|
|
3242
|
-
|
|
3243
|
-
|
|
3365
|
+
let hasLine = false;
|
|
3366
|
+
for (let dc = 0; dc < colSpan; dc++) {
|
|
3367
|
+
if (hBorders[r + rowSpan][c + dc]) {
|
|
3368
|
+
hasLine = true;
|
|
3369
|
+
break;
|
|
3370
|
+
}
|
|
3371
|
+
}
|
|
3372
|
+
if (hasLine) break;
|
|
3244
3373
|
rowSpan++;
|
|
3245
3374
|
}
|
|
3246
3375
|
for (let dr = 0; dr < rowSpan; dr++) {
|
|
@@ -3264,28 +3393,30 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3264
3393
|
}
|
|
3265
3394
|
return cells;
|
|
3266
3395
|
}
|
|
3267
|
-
function hasVerticalLine(verticals, x, topY, botY) {
|
|
3268
|
-
const tol =
|
|
3396
|
+
/**
 * Reports whether a vertical line sits on the boundary at `x` and covers at
 * least 75% of the span between `topY` and `botY`.
 *
 * @param {Array<{x1: number, y1: number, y2: number}>} verticals - Candidate vertical lines.
 * @param {number} x - X-coordinate of the boundary being tested.
 * @param {number} topY - Upper y-coordinate of the cell edge.
 * @param {number} botY - Lower y-coordinate of the cell edge.
 * @param {number} vertexRadius - Vertex radius of the grid; scales the x tolerance.
 * @returns {boolean} True when a sufficiently overlapping line exists.
 */
function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
  // The tolerance never drops below 4 units, whatever the vertex radius.
  const xTolerance = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
  const span = Math.abs(topY - botY);
  // A degenerate span cannot be covered by any line.
  if (span < 0.1) return false;
  const requiredOverlap = span * 0.75;
  return verticals.some((line) => {
    if (!(Math.abs(line.x1 - x) <= xTolerance)) return false;
    const upper = Math.min(line.y2, topY);
    const lower = Math.max(line.y1, botY);
    return upper - lower >= requiredOverlap;
  });
}
|
|
3280
|
-
function hasHorizontalLine(horizontals, y, leftX, rightX) {
|
|
3281
|
-
const tol =
|
|
3410
|
+
function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
|
|
3411
|
+
const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
|
|
3282
3412
|
for (const h of horizontals) {
|
|
3283
3413
|
if (Math.abs(h.y1 - y) <= tol) {
|
|
3284
3414
|
const cellW = Math.abs(rightX - leftX);
|
|
3415
|
+
if (cellW < 0.1) continue;
|
|
3285
3416
|
const overlapLeft = Math.max(h.x1, leftX);
|
|
3286
3417
|
const overlapRight = Math.min(h.x2, rightX);
|
|
3287
3418
|
const overlap = overlapRight - overlapLeft;
|
|
3288
|
-
if (overlap >= cellW * 0.
|
|
3419
|
+
if (overlap >= cellW * 0.75) return true;
|
|
3289
3420
|
}
|
|
3290
3421
|
}
|
|
3291
3422
|
return false;
|
|
@@ -3296,23 +3427,24 @@ function mapTextToCells(items, cells) {
|
|
|
3296
3427
|
result.set(cell, []);
|
|
3297
3428
|
}
|
|
3298
3429
|
for (const item of items) {
|
|
3299
|
-
const cx = item.x + item.w / 2;
|
|
3300
|
-
const cy = item.y;
|
|
3301
3430
|
const pad = CELL_PADDING;
|
|
3302
3431
|
let bestCell = null;
|
|
3303
|
-
let
|
|
3432
|
+
let bestScore = 0;
|
|
3304
3433
|
for (const cell of cells) {
|
|
3305
|
-
|
|
3306
|
-
|
|
3307
|
-
|
|
3308
|
-
|
|
3309
|
-
|
|
3310
|
-
|
|
3311
|
-
|
|
3312
|
-
|
|
3434
|
+
const ix1 = Math.max(item.x, cell.bbox.x1 - pad);
|
|
3435
|
+
const ix2 = Math.min(item.x + item.w, cell.bbox.x2 + pad);
|
|
3436
|
+
const iy1 = Math.max(item.y, cell.bbox.y1 - pad);
|
|
3437
|
+
const iy2 = Math.min(item.y + (item.h || item.fontSize), cell.bbox.y2 + pad);
|
|
3438
|
+
if (ix1 >= ix2 || iy1 >= iy2) continue;
|
|
3439
|
+
const intersectArea = (ix2 - ix1) * (iy2 - iy1);
|
|
3440
|
+
const itemArea = Math.max(item.w, 1) * Math.max(item.h || item.fontSize, 1);
|
|
3441
|
+
const score = intersectArea / itemArea;
|
|
3442
|
+
if (score > bestScore) {
|
|
3443
|
+
bestScore = score;
|
|
3444
|
+
bestCell = cell;
|
|
3313
3445
|
}
|
|
3314
3446
|
}
|
|
3315
|
-
if (bestCell) {
|
|
3447
|
+
if (bestCell && bestScore > 0.3) {
|
|
3316
3448
|
result.get(bestCell).push(item);
|
|
3317
3449
|
}
|
|
3318
3450
|
}
|
|
@@ -3339,8 +3471,13 @@ function cellTextToString(items) {
|
|
|
3339
3471
|
const textLines = lines.map((line) => {
|
|
3340
3472
|
const s = line.sort((a, b) => a.x - b.x);
|
|
3341
3473
|
if (s.length === 1) return s[0].text;
|
|
3474
|
+
const evenSpaced = detectEvenSpacedItems(s);
|
|
3342
3475
|
let result = s[0].text;
|
|
3343
3476
|
for (let j = 1; j < s.length; j++) {
|
|
3477
|
+
if (evenSpaced[j]) {
|
|
3478
|
+
result += s[j].text;
|
|
3479
|
+
continue;
|
|
3480
|
+
}
|
|
3344
3481
|
const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
|
|
3345
3482
|
const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
|
|
3346
3483
|
const prevIsKorean = /[가-힣]$/.test(result);
|
|
@@ -3355,6 +3492,57 @@ function cellTextToString(items) {
|
|
|
3355
3492
|
}
|
|
3356
3493
|
return result;
|
|
3357
3494
|
});
|
|
3495
|
+
return mergeCellTextLines(textLines);
|
|
3496
|
+
}
|
|
3497
|
+
/**
 * Flags items that belong to a run of single-character items (one Hangul
 * syllable or one digit) so the caller can join them without separators.
 *
 * Runs are split wherever the gap to the previous item exceeds
 * `max(fontSize * 3, 30)`. Runs of at least three items are handed to
 * `markEvenRun`, which decides whether the run is actually flagged.
 *
 * @param {Array<{text: string, x: number, w: number, fontSize: number}>} items -
 *   Items of one text line, in the order the caller wants them joined.
 * @returns {boolean[]} One flag per item; `true` marks an item to be appended
 *   directly to its predecessor.
 */
function detectEvenSpacedItems(items) {
  const flags = new Array(items.length).fill(false);
  if (items.length < 3) return flags;

  // Hands a finished run to markEvenRun when it is long enough to matter.
  const closeRun = (from, to) => {
    if (to - from >= 3) markEvenRun(items, flags, from, to);
  };

  let runStart = -1;
  for (let idx = 0; idx < items.length; idx++) {
    const current = items[idx];
    const isSingleGlyph = /^[가-힣]{1}$/.test(current.text) || /^[\d]{1}$/.test(current.text);

    if (!isSingleGlyph) {
      if (runStart >= 0) closeRun(runStart, idx);
      runStart = -1;
      continue;
    }
    if (runStart < 0) {
      runStart = idx;
      continue;
    }

    // Inside a run: an overly wide gap ends it and starts a new one here.
    const before = items[idx - 1];
    const gap = current.x - (before.x + before.w);
    const maxRunGap = Math.max(current.fontSize * 3, 30);
    if (gap > maxRunGap) {
      closeRun(runStart, idx);
      runStart = idx;
    }
  }

  if (runStart >= 0) closeRun(runStart, items.length);
  return flags;
}
|
|
3526
|
+
/**
 * Marks the items of a run as evenly spaced when their gaps are consistent.
 *
 * Only positive gaps are considered, and at least two are required. The run
 * qualifies when the smallest gap is at least 10% of the first item's font
 * size, the largest is at most 3x that font size, and the largest gap is no
 * more than 3x the smallest. On success every item after the first one in
 * `[start, end)` is set to `true` in `result`.
 *
 * @param {Array<{x: number, w: number, fontSize: number}>} items - Items of the line.
 * @param {boolean[]} result - Flag array mutated in place.
 * @param {number} start - Index of the first item of the run (inclusive).
 * @param {number} end - Index one past the last item of the run (exclusive).
 * @returns {void}
 */
function markEvenRun(items, result, start, end) {
  const positiveGaps = [];
  for (let i = start + 1; i < end; i++) {
    const before = items[i - 1];
    const gap = items[i].x - (before.x + before.w);
    if (gap > 0) positiveGaps.push(gap);
  }
  if (positiveGaps.length < 2) return;

  const narrowest = positiveGaps.reduce((lo, g) => (g < lo ? g : lo), Infinity);
  const widest = positiveGaps.reduce((hi, g) => (g > hi ? g : hi), -Infinity);
  // The run's first item supplies the reference font size.
  const referenceSize = items[start].fontSize;

  const evenlySpaced =
    narrowest >= referenceSize * 0.1 &&
    widest <= referenceSize * 3 &&
    widest / Math.max(narrowest, 0.1) <= 3;
  if (!evenlySpaced) return;

  for (let i = end - 1; i > start; i--) {
    result[i] = true;
  }
}
|
|
3545
|
+
function mergeCellTextLines(textLines) {
|
|
3358
3546
|
if (textLines.length <= 1) return textLines[0] || "";
|
|
3359
3547
|
const merged = [textLines[0]];
|
|
3360
3548
|
for (let i = 1; i < textLines.length; i++) {
|
|
@@ -3380,24 +3568,172 @@ var Y_TOL = 3;
|
|
|
3380
3568
|
var COL_CLUSTER_TOL = 15;
|
|
3381
3569
|
var MIN_ROWS = 3;
|
|
3382
3570
|
var MIN_COLS = 2;
|
|
3383
|
-
var MIN_GAP_FACTOR =
|
|
3384
|
-
var
|
|
3571
|
+
var MIN_GAP_FACTOR = 2;
|
|
3572
|
+
var MIN_GAP_ABSOLUTE = 20;
|
|
3573
|
+
var MIN_COL_FILL_RATIO = 0.4;
|
|
3385
3574
|
/**
 * Detects tables on a page from text positions alone.
 *
 * Evenly spaced single-character runs are merged first, then the items are
 * grouped into baseline rows. A header-driven pass runs first; only when it
 * yields no table does the gap-based pass run. Every accepted table gets its
 * `usedItems` expanded to include the original items behind merged runs.
 *
 * @param {Array<object>} items - Text items of one page.
 * @param {number} pageNum - Page number forwarded to `buildClusterTable`.
 * @returns {Array<object>} Tables produced by `buildClusterTable`; may be empty.
 */
function detectClusterTables(items, pageNum) {
  if (items.length < MIN_ROWS * MIN_COLS) return [];
  const { merged, originMap } = mergeEvenSpacedClusters(items);
  const rows = groupByBaseline(merged);
  if (rows.length < MIN_ROWS) return [];

  const results = [];
  // Records a table, if one was built, and restores the pre-merge items.
  const keep = (table) => {
    if (!table) return;
    expandUsedItems(table.usedItems, originMap);
    results.push(table);
  };

  // Pass 1: columns derived from a detected header row.
  const header = detectHeaderRow(rows);
  if (header) {
    const { columns, headerIdx } = header;
    const headerItems = [...rows[headerIdx].items].sort((a, b) => a.x - b.x);
    const candidateRows = mergeMultiLineRows(rows.slice(headerIdx), columns);
    for (const region of findTableRegionsByHeader(candidateRows, columns, headerItems)) {
      keep(buildClusterTable(region.rows, columns, pageNum));
    }
  }
  if (results.length > 0) return results;

  // Pass 2: columns clustered from rows with suspiciously wide gaps.
  const gappedRows = rows.filter((row) => hasSuspiciousGaps(row));
  if (gappedRows.length < MIN_ROWS) return results;
  const columns = extractColumnClusters(gappedRows);
  if (columns.length < MIN_COLS) return results;
  for (const region of findTableRegions(rows, columns)) {
    const regionRows = mergeMultiLineRows(region.rows, columns);
    keep(buildClusterTable(regionRows, columns, pageNum));
  }
  return results;
}
|
|
3615
|
+
/**
 * Collapses runs of evenly spaced single-character items into one item each.
 *
 * Within each baseline row, a run is three or more consecutive items whose
 * text is a single Hangul syllable or digit and whose gaps lie between 10%
 * and 300% of the font size. A run is merged only when all its gaps are
 * positive and the largest is at most 3x the smallest. All other items are
 * passed through unchanged.
 *
 * @param {Array<object>} items - Text items with `text`, `x`, `y`, `w`, `h`,
 *   `fontSize` and `fontName`.
 * @returns {{merged: Array<object>, originMap: Map<object, Array<object>>}}
 *   The resulting items, and a map from each synthesized item to the original
 *   items it replaced.
 */
function mergeEvenSpacedClusters(items) {
  const originMap = new Map();
  const merged = [];
  const isSingleGlyph = (entry) => /^[가-힣\d]$/.test(entry.text);
  const gapBefore = (line, idx) => line[idx].x - (line[idx - 1].x + line[idx - 1].w);

  // True when every gap inside [from, to) is positive and max/min <= 3.
  const hasEvenGaps = (line, from, to) => {
    let smallest = Infinity;
    let largest = -Infinity;
    for (let k = from + 1; k < to; k++) {
      const gap = gapBefore(line, k);
      if (gap < smallest) smallest = gap;
      if (gap > largest) largest = gap;
    }
    return smallest > 0 && largest / smallest <= 3;
  };

  for (const row of groupByBaseline(items)) {
    const line = [...row.items].sort((a, b) => a.x - b.x);
    let cursor = 0;
    while (cursor < line.length) {
      let runEnd = cursor;
      if (isSingleGlyph(line[cursor])) {
        // Extend the run while neighbours are single glyphs at a sane distance.
        runEnd = cursor + 1;
        for (; runEnd < line.length && isSingleGlyph(line[runEnd]); runEnd++) {
          const gap = gapBefore(line, runEnd);
          const size = line[runEnd].fontSize;
          if (gap < size * 0.1 || gap > size * 3) break;
        }
      }

      if (runEnd - cursor >= 3 && hasEvenGaps(line, cursor, runEnd)) {
        const run = line.slice(cursor, runEnd);
        const first = run[0];
        const last = run[run.length - 1];
        const item = {
          text: run.map((part) => part.text).join(""),
          x: first.x,
          y: first.y,
          w: last.x + last.w - first.x,
          h: first.h,
          fontSize: first.fontSize,
          fontName: first.fontName
        };
        originMap.set(item, run);
        merged.push(item);
        cursor = runEnd;
        continue;
      }

      // Not a mergeable run: emit only the current item and rescan from the next.
      merged.push(line[cursor]);
      cursor++;
    }
  }
  return { merged, originMap };
}
|
|
3667
|
+
/**
 * Adds to `usedItems` the original items behind every entry that has a
 * mapping in `originMap`.
 *
 * @param {Set<object>} usedItems - Set of consumed items; mutated in place.
 * @param {Map<object, Iterable<object>>} originMap - Maps an item to the
 *   original items it stands for.
 * @returns {void}
 */
function expandUsedItems(usedItems, originMap) {
  // Snapshot first so entries added below are not visited by this pass.
  const snapshot = Array.from(usedItems);
  const additions = [];
  snapshot.forEach((entry) => {
    const sources = originMap.get(entry);
    if (!sources) return;
    for (const source of sources) {
      additions.push(source);
    }
  });
  additions.forEach((source) => usedItems.add(source));
}
|
|
3675
|
+
/**
 * Searches for the first row that looks like a table header.
 *
 * A candidate row has between `MIN_COLS` and 6 items, none longer than 8
 * characters, at least one containing Hangul, and none starting with a
 * bullet-like marker. It must span at least 40% of the horizontal extent of
 * all items and contain a gap of at least 2.5x its average font size. It is
 * accepted once at least `MIN_ROWS` of the following rows match `MIN_COLS` or
 * more of its columns.
 *
 * @param {Array<{items: Array<object>}>} rows - Baseline rows.
 * @returns {{columns: Array<{x: number, count: number}>, headerIdx: number} | null}
 *   Column anchors and the header row index, or `null` when nothing qualifies.
 */
function detectHeaderRow(rows) {
  const everyItem = rows.flatMap((r) => r.items);
  if (everyItem.length === 0) return null;

  let leftEdge = Infinity;
  let rightEdge = -Infinity;
  everyItem.forEach((entry) => {
    if (entry.x < leftEdge) leftEdge = entry.x;
    const right = entry.x + entry.w;
    if (right > rightEdge) rightEdge = right;
  });
  const pageSpan = rightEdge - leftEdge;
  if (pageSpan <= 0) return null;

  // Textual checks: short labels, some Hangul, no bullet markers.
  const hasHeaderLikeText = (cells) => {
    if (cells.length < MIN_COLS || cells.length > 6) return false;
    if (cells.some((i) => i.text.length > 8)) return false;
    if (!cells.some((i) => /[가-힣]/.test(i.text))) return false;
    return !cells.some((i) => /^[□■○●·※▶▷◆◇\-]/.test(i.text));
  };

  // Geometric checks on the x-sorted items: wide enough, with a real column gap.
  const hasHeaderLikeLayout = (sorted) => {
    const tail = sorted[sorted.length - 1];
    const xSpan = tail.x + tail.w - sorted[0].x;
    if (xSpan / pageSpan < 0.4) return false;
    const avgFs = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
    return sorted.some((item, k) => {
      if (k === 0) return false;
      const gap = item.x - (sorted[k - 1].x + sorted[k - 1].w);
      return gap >= avgFs * 2.5;
    });
  };

  for (let ri = 0; ri < rows.length; ri++) {
    const candidate = rows[ri];
    if (!hasHeaderLikeText(candidate.items)) continue;
    const sorted = [...candidate.items].sort((a, b) => a.x - b.x);
    if (!hasHeaderLikeLayout(sorted)) continue;

    const columns = sorted.map((item) => ({ x: item.x, count: 0 }));
    // Count following rows that line up; stop early once enough are found.
    let matchCount = 0;
    for (let j = ri + 1; j < rows.length && matchCount < MIN_ROWS + 2; j++) {
      if (countMatchedColumnsRange(rows[j], columns, sorted) >= MIN_COLS) {
        matchCount++;
      }
    }
    if (matchCount >= MIN_ROWS) {
      return { columns, headerIdx: ri };
    }
  }
  return null;
}
|
|
3716
|
+
/**
 * Folds continuation lines into the row above them.
 *
 * A row is treated as a continuation when it is closer than 1.8x the average
 * font size to the previous kept row, has at most two items, and either
 * matches fewer than `MIN_COLS` columns or has exactly one item. Its items
 * are appended to the previous row, which keeps its own `y`.
 *
 * @param {Array<{y: number, items: Array<object>}>} rows - Rows in reading order.
 * @param {Array<object>} columns - Column descriptors passed to `countMatchedColumns`.
 * @returns {Array<{y: number, items: Array<object>}>} `rows` itself when it
 *   has one entry or fewer, otherwise a new array of merged rows.
 */
function mergeMultiLineRows(rows, columns) {
  if (rows.length <= 1) return rows;

  const sizes = rows.flatMap((r) => r.items.map((i) => i.fontSize));
  // Falls back to 12 when there are no items to average.
  const avgFontSize = sizes.length > 0 ? sizes.reduce((s, v) => s + v, 0) / sizes.length : 12;

  const output = [rows[0]];
  for (const curr of rows.slice(1)) {
    const prev = output[output.length - 1];
    const yGap = Math.abs(prev.y - curr.y);
    const matchedCols = countMatchedColumns(curr, columns);
    const isNear = yGap < avgFontSize * 1.8;
    const isSparse = curr.items.length <= 2;
    const isOffGrid = matchedCols < MIN_COLS || curr.items.length === 1;

    if (isNear && isSparse && isOffGrid) {
      output[output.length - 1] = {
        y: prev.y,
        items: [...prev.items, ...curr.items]
      };
      continue;
    }
    output.push(curr);
  }
  return output;
}
|
|
3401
3737
|
function groupByBaseline(items) {
|
|
3402
3738
|
if (items.length === 0) return [];
|
|
3403
3739
|
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
@@ -3419,8 +3755,9 @@ function groupByBaseline(items) {
|
|
|
3419
3755
|
function hasSuspiciousGaps(row) {
|
|
3420
3756
|
if (row.items.length < 2) return false;
|
|
3421
3757
|
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3758
|
+
if (sorted.length === 2 && sorted[1].text.length > 20) return false;
|
|
3422
3759
|
const avgFontSize = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
|
|
3423
|
-
const minGap = avgFontSize * MIN_GAP_FACTOR;
|
|
3760
|
+
const minGap = Math.max(avgFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
|
|
3424
3761
|
for (let i = 1; i < sorted.length; i++) {
|
|
3425
3762
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
3426
3763
|
if (gap >= minGap) return true;
|
|
@@ -3447,6 +3784,41 @@ function extractColumnClusters(rows) {
|
|
|
3447
3784
|
const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
|
|
3448
3785
|
return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
|
|
3449
3786
|
}
|
|
3787
|
+
/**
 * Splits rows into contiguous table regions using header-derived column ranges.
 *
 * A row that matches at least `MIN_COLS` header columns extends the current
 * region. A non-matching row is still tolerated inside an open region when it
 * has at most two items or directly follows a matching row. Any other row
 * closes the region: trailing non-matching rows are trimmed and the region is
 * kept if at least `MIN_ROWS` rows remain.
 *
 * @param {Array<{items: Array<object>}>} allRows - Rows in reading order.
 * @param {Array<object>} columns - Column descriptors forwarded to `countMatchedColumnsRange`.
 * @param {Array<object>} headerItems - Header items forwarded to `countMatchedColumnsRange`.
 * @returns {Array<{rows: Array<object>}>} Detected regions.
 */
function findTableRegionsByHeader(allRows, columns, headerItems) {
  const regions = [];
  let pending = [];
  let missStreak = 0;

  const alignsWithHeader = (row) => countMatchedColumnsRange(row, columns, headerItems) >= MIN_COLS;

  // Trims trailing rows that do not align, stores the region if it is long
  // enough, then starts over with an empty region.
  const flush = () => {
    while (pending.length > 0 && !alignsWithHeader(pending[pending.length - 1])) {
      pending.pop();
    }
    if (pending.length >= MIN_ROWS) {
      regions.push({ rows: pending });
    }
    pending = [];
    missStreak = 0;
  };

  for (const row of allRows) {
    if (alignsWithHeader(row)) {
      pending.push(row);
      missStreak = 0;
      continue;
    }
    const tolerated = pending.length > 0 && (row.items.length <= 2 || missStreak === 0);
    if (tolerated) {
      pending.push(row);
      missStreak++;
    } else {
      // The breaking row itself is not part of any region.
      flush();
    }
  }
  flush();
  return regions;
}
|
|
3450
3822
|
function findTableRegions(allRows, columns) {
|
|
3451
3823
|
const regions = [];
|
|
3452
3824
|
let currentRegion = [];
|
|
@@ -3482,18 +3854,81 @@ function countMatchedColumns(row, columns) {
|
|
|
3482
3854
|
}
|
|
3483
3855
|
return matched.size;
|
|
3484
3856
|
}
|
|
3485
|
-
function
|
|
3486
|
-
const
|
|
3487
|
-
let
|
|
3488
|
-
|
|
3489
|
-
|
|
3490
|
-
|
|
3491
|
-
|
|
3492
|
-
|
|
3493
|
-
|
|
3857
|
+
/**
 * Counts how many distinct header columns are hit by the items of a row.
 *
 * Each header item owns a half-open x-range `[left, right)`. Neighbouring
 * ranges meet halfway between the right edge of one header item and the left
 * edge of the next; the first range starts at 0 and the last is unbounded to
 * the right. An item is assigned by its left edge `x`.
 *
 * @param {{items: Array<{x: number}>}} row - Row whose items are tested.
 * @param {Array<object>} columns - Unused here; kept for signature parity with callers.
 * @param {Array<{x: number, w: number}>} headerItems - Header items in left-to-right order.
 * @returns {number} Number of distinct columns containing at least one item.
 */
function countMatchedColumnsRange(row, columns, headerItems) {
  const lastCol = headerItems.length - 1;
  const rightEdgeOf = (header) => header.x + header.w;

  const bands = headerItems.map((header, ci) => ({
    left: ci === 0 ? 0 : (rightEdgeOf(headerItems[ci - 1]) + header.x) / 2,
    right: ci === lastCol ? Infinity : (rightEdgeOf(header) + headerItems[ci + 1].x) / 2
  }));

  const hitColumns = new Set();
  for (const item of row.items) {
    const ci = bands.findIndex((band) => item.x >= band.left && item.x < band.right);
    if (ci !== -1) hitColumns.add(ci);
  }
  return hitColumns.size;
}
|
|
3875
|
+
/**
 * Splits the items of a row into groups at the widest gaps and assigns each
 * group to a column.
 *
 * Items are sorted by `x` and cut at no more than `numCols - 1` gaps, chosen
 * largest first among those meeting the threshold. The threshold is 12 for
 * rows with at most `numCols + 1` items, otherwise `max(2.5 * medianGap, 12)`.
 * Groups and columns are then paired greedily by ascending distance between
 * the group's horizontal centre and the column's `x`, each column used once.
 * Any group left over falls back to its nearest column.
 *
 * @param {Array<{x: number, w: number}>} items - Items of one row.
 * @param {Array<{x: number}>} columns - Column anchors.
 * @param {number} numCols - Number of columns to assign to.
 * @returns {Array<{col: number, items: Array<object>}>} Assignments in the
 *   order they were decided (closest pairs first).
 */
function assignRowItems(items, columns, numCols) {
  if (items.length === 0) return [];
  const ordered = [...items].sort((a, b) => a.x - b.x);
  const columnXs = columns.map((c) => c.x);

  // Gap in front of every item except the first; idx is the item's position.
  const gaps = ordered.slice(1).map((item, k) => {
    const before = ordered[k];
    return { idx: k + 1, size: item.x - (before.x + before.w) };
  });
  const ascendingSizes = gaps.map((g) => g.size).sort((a, b) => a - b);
  const medianGap = ascendingSizes.length > 0 ? ascendingSizes[Math.floor(ascendingSizes.length / 2)] : 0;
  const gapThreshold = ordered.length <= numCols + 1 ? 12 : Math.max(medianGap * 2.5, 12);

  // Keep the widest qualifying gaps, then restore left-to-right order.
  const cuts = gaps
    .filter((g) => g.size >= gapThreshold)
    .sort((a, b) => b.size - a.size)
    .slice(0, numCols - 1)
    .sort((a, b) => a.idx - b.idx);

  const groups = [];
  let from = 0;
  cuts.forEach(({ idx }) => {
    groups.push(ordered.slice(from, idx));
    from = idx;
  });
  groups.push(ordered.slice(from));

  const centers = groups.map((group) => {
    const lo = group.reduce((acc, it) => (it.x < acc ? it.x : acc), Infinity);
    const hi = group.reduce((acc, it) => (it.x + it.w > acc ? it.x + it.w : acc), -Infinity);
    return (lo + hi) / 2;
  });
  const distance = (gi, ci) => Math.abs(centers[gi] - columnXs[ci]);

  // Every (group, column) pair, closest first.
  const candidates = [];
  groups.forEach((_, gi) => {
    for (let ci = 0; ci < numCols; ci++) {
      candidates.push({ gi, ci, dist: distance(gi, ci) });
    }
  });
  candidates.sort((a, b) => a.dist - b.dist);

  const placed = [];
  const takenGroups = new Set();
  const takenCols = new Set();
  candidates.forEach(({ gi, ci }) => {
    if (takenGroups.has(gi) || takenCols.has(ci)) return;
    placed.push({ col: ci, items: groups[gi] });
    takenGroups.add(gi);
    takenCols.add(ci);
  });

  // Fallback: groups without a free column go to the nearest column anyway.
  groups.forEach((group, gi) => {
    if (takenGroups.has(gi)) return;
    let nearestCol = 0;
    let nearestDist = Infinity;
    for (let ci = 0; ci < numCols; ci++) {
      const d = distance(gi, ci);
      if (d < nearestDist) {
        nearestDist = d;
        nearestCol = ci;
      }
    }
    placed.push({ col: nearestCol, items: group });
  });
  return placed;
}
|
|
3498
3933
|
function buildClusterTable(rows, columns, pageNum) {
|
|
3499
3934
|
const numCols = columns.length;
|
|
@@ -3511,12 +3946,12 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3511
3946
|
usedItems.add(row.items[0]);
|
|
3512
3947
|
continue;
|
|
3513
3948
|
}
|
|
3514
|
-
|
|
3515
|
-
|
|
3516
|
-
|
|
3949
|
+
const assignments = assignRowItems(row.items, columns, numCols);
|
|
3950
|
+
for (const { col, items } of assignments) {
|
|
3951
|
+
const text = items.map((i) => i.text).join(" ");
|
|
3517
3952
|
const existing = cells[r][col].text;
|
|
3518
|
-
cells[r][col].text = existing ? existing + " " +
|
|
3519
|
-
usedItems.add(item);
|
|
3953
|
+
cells[r][col].text = existing ? existing + " " + text : text;
|
|
3954
|
+
for (const item of items) usedItems.add(item);
|
|
3520
3955
|
}
|
|
3521
3956
|
}
|
|
3522
3957
|
let emptyRows = 0;
|
|
@@ -3528,11 +3963,48 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3528
3963
|
const hasValue = cells.some((row) => row[c].text !== "");
|
|
3529
3964
|
if (!hasValue) return null;
|
|
3530
3965
|
}
|
|
3966
|
+
for (let r = numRows - 1; r >= 1; r--) {
|
|
3967
|
+
const nonEmptyCols = cells[r].filter((c) => c.text.trim()).length;
|
|
3968
|
+
if (nonEmptyCols !== 1) continue;
|
|
3969
|
+
if (cells[r][0].text.trim() !== "") continue;
|
|
3970
|
+
const contentText = cells[r].find((c) => c.text.trim())?.text.trim() || "";
|
|
3971
|
+
if (/^[○●▶\-·]/.test(contentText)) continue;
|
|
3972
|
+
for (let pr = r - 1; pr >= 0; pr--) {
|
|
3973
|
+
if (cells[pr].some((c) => c.text.trim())) {
|
|
3974
|
+
for (let c = 0; c < numCols; c++) {
|
|
3975
|
+
const prev = cells[pr][c].text.trim();
|
|
3976
|
+
const curr = cells[r][c].text.trim();
|
|
3977
|
+
if (curr) cells[pr][c].text = prev ? prev + " " + curr : curr;
|
|
3978
|
+
}
|
|
3979
|
+
for (let c = 0; c < numCols; c++) cells[r][c].text = "";
|
|
3980
|
+
break;
|
|
3981
|
+
}
|
|
3982
|
+
}
|
|
3983
|
+
}
|
|
3984
|
+
for (let r = 0; r < cells.length - 1; r++) {
|
|
3985
|
+
const row = cells[r];
|
|
3986
|
+
const hasCol0 = row[0].text.trim() !== "";
|
|
3987
|
+
const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
|
|
3988
|
+
const midEmpty = row.slice(1, numCols - 1).every((c) => c.text.trim() === "");
|
|
3989
|
+
if (hasCol0 && hasColLast && midEmpty) {
|
|
3990
|
+
const next = cells[r + 1];
|
|
3991
|
+
if (next[0].text.trim() === "" && next.some((c) => c.text.trim())) {
|
|
3992
|
+
for (let c = 1; c < numCols; c++) {
|
|
3993
|
+
const curr = next[c].text.trim();
|
|
3994
|
+
if (curr) row[c].text = row[c].text.trim() ? row[c].text.trim() + " " + curr : curr;
|
|
3995
|
+
}
|
|
3996
|
+
for (let c = 0; c < numCols; c++) next[c].text = "";
|
|
3997
|
+
}
|
|
3998
|
+
}
|
|
3999
|
+
}
|
|
4000
|
+
const filteredCells = cells.filter((row) => row.some((c) => c.text.trim()));
|
|
4001
|
+
const finalRowCount = filteredCells.length;
|
|
4002
|
+
if (finalRowCount < MIN_ROWS) return null;
|
|
3531
4003
|
const irTable = {
|
|
3532
|
-
rows:
|
|
4004
|
+
rows: finalRowCount,
|
|
3533
4005
|
cols: numCols,
|
|
3534
|
-
cells,
|
|
3535
|
-
hasHeader:
|
|
4006
|
+
cells: filteredCells,
|
|
4007
|
+
hasHeader: finalRowCount > 1
|
|
3536
4008
|
};
|
|
3537
4009
|
const allItems = rows.flatMap((r) => r.items);
|
|
3538
4010
|
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
@@ -3609,7 +4081,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3609
4081
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
3610
4082
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
3611
4083
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
3612
|
-
const
|
|
4084
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
3613
4085
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
3614
4086
|
let parsedPages = 0;
|
|
3615
4087
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
@@ -3626,7 +4098,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3626
4098
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
3627
4099
|
}
|
|
3628
4100
|
for (const item of visible) {
|
|
3629
|
-
if (item.fontSize > 0)
|
|
4101
|
+
if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
|
|
3630
4102
|
}
|
|
3631
4103
|
const opList = await page.getOperatorList();
|
|
3632
4104
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -3665,10 +4137,9 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3665
4137
|
blocks.splice(removed[ri], 1);
|
|
3666
4138
|
}
|
|
3667
4139
|
}
|
|
3668
|
-
const medianFontSize =
|
|
4140
|
+
const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
|
|
3669
4141
|
if (medianFontSize > 0) {
|
|
3670
4142
|
detectHeadings(blocks, medianFontSize);
|
|
3671
|
-
mergeAdjacentHeadings(blocks);
|
|
3672
4143
|
}
|
|
3673
4144
|
detectMarkerHeadings(blocks);
|
|
3674
4145
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
@@ -3730,11 +4201,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
3730
4201
|
}
|
|
3731
4202
|
return { visible, hiddenCount };
|
|
3732
4203
|
}
|
|
3733
|
-
function
|
|
3734
|
-
if (
|
|
3735
|
-
|
|
3736
|
-
const
|
|
3737
|
-
|
|
4204
|
+
function computeMedianFontSizeFromFreq(freq) {
|
|
4205
|
+
if (freq.size === 0) return 0;
|
|
4206
|
+
let total = 0;
|
|
4207
|
+
for (const count of freq.values()) total += count;
|
|
4208
|
+
const sorted = [...freq.entries()].sort((a, b) => a[0] - b[0]);
|
|
4209
|
+
const mid = Math.floor(total / 2);
|
|
4210
|
+
let cumulative = 0;
|
|
4211
|
+
for (const [size, count] of sorted) {
|
|
4212
|
+
cumulative += count;
|
|
4213
|
+
if (cumulative > mid) return size;
|
|
4214
|
+
}
|
|
4215
|
+
return sorted[sorted.length - 1][0];
|
|
3738
4216
|
}
|
|
3739
4217
|
function detectHeadings(blocks, medianFontSize) {
|
|
3740
4218
|
for (const block of blocks) {
|
|
@@ -3754,220 +4232,27 @@ function detectHeadings(blocks, medianFontSize) {
|
|
|
3754
4232
|
}
|
|
3755
4233
|
}
|
|
3756
4234
|
}
|
|
3757
|
-
function mergeAdjacentHeadings(blocks) {
|
|
3758
|
-
let i = 0;
|
|
3759
|
-
while (i < blocks.length - 1) {
|
|
3760
|
-
const curr = blocks[i];
|
|
3761
|
-
const next = blocks[i + 1];
|
|
3762
|
-
if (curr.type !== "heading" || next.type !== "heading") {
|
|
3763
|
-
i++;
|
|
3764
|
-
continue;
|
|
3765
|
-
}
|
|
3766
|
-
if (!curr.bbox || !next.bbox || !curr.text || !next.text) {
|
|
3767
|
-
i++;
|
|
3768
|
-
continue;
|
|
3769
|
-
}
|
|
3770
|
-
const currBaseline = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
|
|
3771
|
-
const nextBaseline = next.bbox.y + (next.style?.fontSize || next.bbox.height);
|
|
3772
|
-
const yDiff = Math.abs(currBaseline - nextBaseline);
|
|
3773
|
-
const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
|
|
3774
|
-
const sameY = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
|
|
3775
|
-
const sameLevel = curr.level === next.level;
|
|
3776
|
-
if (sameY && sameLevel) {
|
|
3777
|
-
const currX = curr.bbox.x;
|
|
3778
|
-
const nextX = next.bbox.x;
|
|
3779
|
-
if (currX <= nextX) {
|
|
3780
|
-
curr.text = curr.text + " " + next.text;
|
|
3781
|
-
} else {
|
|
3782
|
-
curr.text = next.text + " " + curr.text;
|
|
3783
|
-
}
|
|
3784
|
-
curr.bbox = {
|
|
3785
|
-
page: curr.bbox.page,
|
|
3786
|
-
x: Math.min(curr.bbox.x, next.bbox.x),
|
|
3787
|
-
y: Math.min(curr.bbox.y, next.bbox.y),
|
|
3788
|
-
width: Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width) - Math.min(curr.bbox.x, next.bbox.x),
|
|
3789
|
-
height: Math.max(curr.bbox.height, next.bbox.height)
|
|
3790
|
-
};
|
|
3791
|
-
blocks.splice(i + 1, 1);
|
|
3792
|
-
} else {
|
|
3793
|
-
i++;
|
|
3794
|
-
}
|
|
3795
|
-
}
|
|
3796
|
-
}
|
|
3797
4235
|
function collapseEvenSpacing(text) {
|
|
3798
4236
|
const tokens = text.split(" ");
|
|
3799
4237
|
const singleCharCount = tokens.filter((t) => t.length === 1).length;
|
|
3800
4238
|
if (tokens.length >= 3 && singleCharCount / tokens.length >= 0.7) {
|
|
3801
4239
|
return tokens.join("");
|
|
3802
4240
|
}
|
|
3803
|
-
return text
|
|
3804
|
-
}
|
|
3805
|
-
|
|
3806
|
-
const allY = items.map((i) => i.y);
|
|
3807
|
-
const pageHeight = Math.max(...allY) - Math.min(...allY);
|
|
3808
|
-
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
3809
|
-
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
3810
|
-
const blocks = [];
|
|
3811
|
-
for (const group of orderedGroups) {
|
|
3812
|
-
if (group.length === 0) continue;
|
|
3813
|
-
const yLines = groupByY(group);
|
|
3814
|
-
for (const line of yLines) {
|
|
3815
|
-
const text = mergeLineSimple(line);
|
|
3816
|
-
if (!text.trim()) continue;
|
|
3817
|
-
blocks.push({
|
|
3818
|
-
type: "paragraph",
|
|
3819
|
-
text,
|
|
3820
|
-
pageNumber: pageNum,
|
|
3821
|
-
bbox: computeBBox(line, pageNum),
|
|
3822
|
-
style: dominantStyle(line)
|
|
3823
|
-
});
|
|
3824
|
-
}
|
|
3825
|
-
}
|
|
3826
|
-
return blocks.length > 0 ? blocks : null;
|
|
3827
|
-
}
|
|
3828
|
-
function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
|
|
3829
|
-
const totalCells = table.cells.reduce((sum, row) => sum + row.filter((c) => c.text.trim()).length, 0);
|
|
3830
|
-
const totalTextLines = table.cells.reduce((sum, row) => sum + row.reduce((s, c) => s + (c.text.trim() ? c.text.split("\n").length : 0), 0), 0);
|
|
3831
|
-
const isUnderSegmented = table.rows === 1 && table.cols === 1 || totalCells <= 2 && totalTextLines >= 8 || totalCells <= 2 && items.length >= 6;
|
|
3832
|
-
if (!isUnderSegmented) return null;
|
|
3833
|
-
if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);
|
|
3834
|
-
const directTable = buildTableFromTextLayout(items, pageNum, bbox);
|
|
3835
|
-
if (directTable) return directTable;
|
|
3836
|
-
const clusterItems = items.map((i) => ({
|
|
3837
|
-
text: i.text,
|
|
3838
|
-
x: i.x,
|
|
3839
|
-
y: i.y,
|
|
3840
|
-
w: i.w,
|
|
3841
|
-
h: i.h,
|
|
3842
|
-
fontSize: i.fontSize,
|
|
3843
|
-
fontName: i.fontName
|
|
3844
|
-
}));
|
|
3845
|
-
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
3846
|
-
if (clusterResults.length > 0) {
|
|
3847
|
-
const blocks = [];
|
|
3848
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
3849
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
3850
|
-
const usedIndices = /* @__PURE__ */ new Set();
|
|
3851
|
-
for (const cr of clusterResults) {
|
|
3852
|
-
for (const ci of cr.usedItems) {
|
|
3853
|
-
const idx = ciToIdx.get(ci);
|
|
3854
|
-
if (idx !== void 0) usedIndices.add(idx);
|
|
3855
|
-
}
|
|
3856
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
3857
|
-
}
|
|
3858
|
-
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
3859
|
-
for (const item of remaining) {
|
|
3860
|
-
if (!item.text.trim()) continue;
|
|
3861
|
-
blocks.push({
|
|
3862
|
-
type: "paragraph",
|
|
3863
|
-
text: item.text,
|
|
3864
|
-
pageNumber: pageNum,
|
|
3865
|
-
bbox: computeBBox([item], pageNum),
|
|
3866
|
-
style: { fontSize: item.fontSize, fontName: item.fontName }
|
|
3867
|
-
});
|
|
3868
|
-
}
|
|
3869
|
-
blocks.sort((a, b) => {
|
|
3870
|
-
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
3871
|
-
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
3872
|
-
return by - ay;
|
|
3873
|
-
});
|
|
3874
|
-
return blocks.length > 0 ? blocks : null;
|
|
3875
|
-
}
|
|
3876
|
-
return null;
|
|
3877
|
-
}
|
|
3878
|
-
function buildTableFromTextLayout(items, pageNum, bbox) {
|
|
3879
|
-
if (items.length < 4) return null;
|
|
3880
|
-
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
3881
|
-
const yTol = 3;
|
|
3882
|
-
const rows = [];
|
|
3883
|
-
let curRow = [sorted[0]];
|
|
3884
|
-
let curY = sorted[0].y;
|
|
3885
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
3886
|
-
if (Math.abs(sorted[i].y - curY) <= yTol) {
|
|
3887
|
-
curRow.push(sorted[i]);
|
|
3888
|
-
} else {
|
|
3889
|
-
rows.push(curRow);
|
|
3890
|
-
curRow = [sorted[i]];
|
|
3891
|
-
curY = sorted[i].y;
|
|
3892
|
-
}
|
|
3893
|
-
}
|
|
3894
|
-
rows.push(curRow);
|
|
3895
|
-
if (rows.length < 2) return null;
|
|
3896
|
-
const gapPositions = [];
|
|
3897
|
-
for (const row of rows) {
|
|
3898
|
-
if (row.length < 2) continue;
|
|
3899
|
-
const sortedX = [...row].sort((a, b) => a.x - b.x);
|
|
3900
|
-
const avgFs = sortedX.reduce((s, i) => s + i.fontSize, 0) / sortedX.length;
|
|
3901
|
-
for (let j = 1; j < sortedX.length; j++) {
|
|
3902
|
-
const gap = sortedX[j].x - (sortedX[j - 1].x + sortedX[j - 1].w);
|
|
3903
|
-
if (gap >= avgFs * 1.5) {
|
|
3904
|
-
gapPositions.push(sortedX[j - 1].x + sortedX[j - 1].w + gap / 2);
|
|
3905
|
-
}
|
|
3906
|
-
}
|
|
3907
|
-
}
|
|
3908
|
-
if (gapPositions.length < 2) return null;
|
|
3909
|
-
gapPositions.sort((a, b) => a - b);
|
|
3910
|
-
const colBoundaries = [];
|
|
3911
|
-
let clusterSum = gapPositions[0], clusterCount = 1;
|
|
3912
|
-
for (let i = 1; i < gapPositions.length; i++) {
|
|
3913
|
-
const avg = clusterSum / clusterCount;
|
|
3914
|
-
if (Math.abs(gapPositions[i] - avg) <= 15) {
|
|
3915
|
-
clusterSum += gapPositions[i];
|
|
3916
|
-
clusterCount++;
|
|
3917
|
-
} else {
|
|
3918
|
-
if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
|
|
3919
|
-
clusterSum = gapPositions[i];
|
|
3920
|
-
clusterCount = 1;
|
|
3921
|
-
}
|
|
3922
|
-
}
|
|
3923
|
-
if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
|
|
3924
|
-
if (colBoundaries.length === 0) return null;
|
|
3925
|
-
const numCols = colBoundaries.length + 1;
|
|
3926
|
-
const tableRows = [];
|
|
3927
|
-
for (const row of rows) {
|
|
3928
|
-
const cells = Array(numCols).fill("");
|
|
3929
|
-
const sortedX = [...row].sort((a, b) => a.x - b.x);
|
|
3930
|
-
for (const item of sortedX) {
|
|
3931
|
-
const cx = item.x + item.w / 2;
|
|
3932
|
-
let col = 0;
|
|
3933
|
-
for (let b = 0; b < colBoundaries.length; b++) {
|
|
3934
|
-
if (cx > colBoundaries[b]) col = b + 1;
|
|
3935
|
-
}
|
|
3936
|
-
cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
|
|
3937
|
-
}
|
|
3938
|
-
if (cells[0].trim() === "" && tableRows.length > 0) {
|
|
3939
|
-
const prevCells = tableRows[tableRows.length - 1].cells;
|
|
3940
|
-
for (let c = 0; c < numCols; c++) {
|
|
3941
|
-
if (cells[c].trim()) {
|
|
3942
|
-
prevCells[c] = prevCells[c] ? prevCells[c] + " " + cells[c].trim() : cells[c].trim();
|
|
3943
|
-
}
|
|
3944
|
-
}
|
|
3945
|
-
} else {
|
|
3946
|
-
tableRows.push({ cells });
|
|
3947
|
-
}
|
|
3948
|
-
}
|
|
3949
|
-
if (tableRows.length < 2) return null;
|
|
3950
|
-
const nonEmptyCount = tableRows.reduce((sum, r) => sum + r.cells.filter((c) => c.trim()).length, 0);
|
|
3951
|
-
const totalCount = tableRows.length * numCols;
|
|
3952
|
-
if (nonEmptyCount < totalCount * 0.3) return null;
|
|
3953
|
-
const irCells = tableRows.map(
|
|
3954
|
-
(r) => r.cells.map((text, colIdx) => {
|
|
3955
|
-
let cleaned = text.trim();
|
|
3956
|
-
if (colIdx > 0) cleaned = cleaned.replace(/^[•○·\-]\s*/, "");
|
|
3957
|
-
return { text: cleaned, colSpan: 1, rowSpan: 1 };
|
|
3958
|
-
})
|
|
4241
|
+
return text.replace(
|
|
4242
|
+
/(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
|
|
4243
|
+
(match) => match.replace(/ /g, "")
|
|
3959
4244
|
);
|
|
3960
|
-
const irTable = {
|
|
3961
|
-
rows: tableRows.length,
|
|
3962
|
-
cols: numCols,
|
|
3963
|
-
cells: irCells,
|
|
3964
|
-
hasHeader: tableRows.length > 1
|
|
3965
|
-
};
|
|
3966
|
-
return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
|
|
3967
4245
|
}
|
|
3968
4246
|
function shouldDemoteTable(table) {
|
|
3969
4247
|
const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
|
|
3970
4248
|
const allText = allCells.join(" ");
|
|
4249
|
+
if (table.rows <= 3 && table.cols <= 3) {
|
|
4250
|
+
const totalCells2 = table.rows * table.cols;
|
|
4251
|
+
const emptyCells2 = totalCells2 - allCells.length;
|
|
4252
|
+
if (emptyCells2 >= totalCells2 * 0.3) return true;
|
|
4253
|
+
if (/[□■◆○●▶ㅇ]/.test(allText)) return true;
|
|
4254
|
+
if (/<[^>]+>/.test(allText)) return true;
|
|
4255
|
+
}
|
|
3971
4256
|
if (allText.length > 200) return false;
|
|
3972
4257
|
if (/[□■◆○●▶]/.test(allText) && table.rows <= 3) return true;
|
|
3973
4258
|
const totalCells = table.rows * table.cols;
|
|
@@ -4011,32 +4296,6 @@ function detectMarkerHeadings(blocks) {
|
|
|
4011
4296
|
}
|
|
4012
4297
|
}
|
|
4013
4298
|
}
|
|
4014
|
-
function hasMultiColumnLayout(items) {
|
|
4015
|
-
if (items.length < 30) return false;
|
|
4016
|
-
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
4017
|
-
const minX = sorted[0].x;
|
|
4018
|
-
let maxX = minX;
|
|
4019
|
-
for (const i of sorted) if (i.x + i.w > maxX) maxX = i.x + i.w;
|
|
4020
|
-
const pageWidth = maxX - minX;
|
|
4021
|
-
if (pageWidth < 200) return false;
|
|
4022
|
-
let bestGap = 0;
|
|
4023
|
-
let bestSplit = 0;
|
|
4024
|
-
for (let j = 1; j < sorted.length; j++) {
|
|
4025
|
-
const gap = sorted[j].x - (sorted[j - 1].x + sorted[j - 1].w);
|
|
4026
|
-
if (gap > bestGap) {
|
|
4027
|
-
bestGap = gap;
|
|
4028
|
-
bestSplit = (sorted[j - 1].x + sorted[j - 1].w + sorted[j].x) / 2;
|
|
4029
|
-
}
|
|
4030
|
-
}
|
|
4031
|
-
if (bestGap < 20) return false;
|
|
4032
|
-
const splitRatio = (bestSplit - minX) / pageWidth;
|
|
4033
|
-
if (splitRatio < 0.35 || splitRatio > 0.65) return false;
|
|
4034
|
-
const leftCount = items.filter((i) => i.x + i.w / 2 < bestSplit).length;
|
|
4035
|
-
const rightCount = items.filter((i) => i.x + i.w / 2 >= bestSplit).length;
|
|
4036
|
-
if (leftCount < 15 || rightCount < 15) return false;
|
|
4037
|
-
if (Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount) < 0.35) return false;
|
|
4038
|
-
return true;
|
|
4039
|
-
}
|
|
4040
4299
|
var MAX_XYCUT_DEPTH = 50;
|
|
4041
4300
|
function xyCutOrder(items, gapThreshold, depth = 0) {
|
|
4042
4301
|
if (items.length === 0) return [];
|
|
@@ -4104,6 +4363,7 @@ function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeigh
|
|
|
4104
4363
|
if (items.length === 0) return [];
|
|
4105
4364
|
let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
|
|
4106
4365
|
({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
|
|
4366
|
+
({ horizontals, verticals } = preprocessLines(horizontals, verticals));
|
|
4107
4367
|
const grids = buildTableGrids(horizontals, verticals);
|
|
4108
4368
|
if (grids.length > 0) {
|
|
4109
4369
|
return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
|
|
@@ -4115,14 +4375,19 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4115
4375
|
const usedItems = /* @__PURE__ */ new Set();
|
|
4116
4376
|
const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
4117
4377
|
for (const grid of sortedGrids) {
|
|
4378
|
+
const numGridRows = grid.rowYs.length - 1;
|
|
4379
|
+
const numGridCols = grid.colXs.length - 1;
|
|
4380
|
+
if (numGridRows === 1 && numGridCols >= 2) continue;
|
|
4118
4381
|
const tableItems = [];
|
|
4119
4382
|
const pad = 3;
|
|
4383
|
+
const gridW = grid.bbox.x2 - grid.bbox.x1;
|
|
4120
4384
|
for (const item of items) {
|
|
4121
4385
|
if (usedItems.has(item)) continue;
|
|
4122
|
-
if (item.
|
|
4123
|
-
|
|
4124
|
-
|
|
4125
|
-
|
|
4386
|
+
if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
|
|
4387
|
+
if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
|
|
4388
|
+
if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
|
|
4389
|
+
tableItems.push(item);
|
|
4390
|
+
usedItems.add(item);
|
|
4126
4391
|
}
|
|
4127
4392
|
const cells = extractCells(grid, horizontals, verticals);
|
|
4128
4393
|
if (cells.length === 0) continue;
|
|
@@ -4146,6 +4411,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4146
4411
|
const cellItems = cellTextMap.get(cell) || [];
|
|
4147
4412
|
let text = cellTextToString(cellItems);
|
|
4148
4413
|
text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
|
|
4414
|
+
text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
|
|
4149
4415
|
irGrid[cell.row][cell.col] = {
|
|
4150
4416
|
text,
|
|
4151
4417
|
colSpan: cell.colSpan,
|
|
@@ -4167,31 +4433,61 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4167
4433
|
width: grid.bbox.x2 - grid.bbox.x1,
|
|
4168
4434
|
height: grid.bbox.y2 - grid.bbox.y1
|
|
4169
4435
|
};
|
|
4170
|
-
const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
|
|
4171
|
-
if (normalized) {
|
|
4172
|
-
blocks.push(...normalized);
|
|
4173
|
-
continue;
|
|
4174
|
-
}
|
|
4175
4436
|
if (shouldDemoteTable(irTable)) {
|
|
4176
4437
|
const demoted = demoteTableToText(irTable);
|
|
4177
4438
|
if (demoted) {
|
|
4178
|
-
|
|
4439
|
+
const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
|
|
4440
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
|
|
4179
4441
|
}
|
|
4180
4442
|
continue;
|
|
4181
4443
|
}
|
|
4182
4444
|
blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
|
|
4183
4445
|
}
|
|
4184
|
-
|
|
4446
|
+
let remaining = items.filter((i) => !usedItems.has(i));
|
|
4185
4447
|
if (remaining.length > 0) {
|
|
4186
4448
|
remaining.sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4187
|
-
const
|
|
4188
|
-
|
|
4189
|
-
|
|
4449
|
+
const clusterItems = remaining.map((i) => ({
|
|
4450
|
+
text: i.text,
|
|
4451
|
+
x: i.x,
|
|
4452
|
+
y: i.y,
|
|
4453
|
+
w: i.w,
|
|
4454
|
+
h: i.h,
|
|
4455
|
+
fontSize: i.fontSize,
|
|
4456
|
+
fontName: i.fontName
|
|
4457
|
+
}));
|
|
4458
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4459
|
+
if (clusterResults.length > 0) {
|
|
4460
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4461
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4462
|
+
const usedClusterIndices = /* @__PURE__ */ new Set();
|
|
4463
|
+
for (const cr of clusterResults) {
|
|
4464
|
+
for (const ci of cr.usedItems) {
|
|
4465
|
+
const idx = ciToIdx.get(ci);
|
|
4466
|
+
if (idx !== void 0) usedClusterIndices.add(idx);
|
|
4467
|
+
}
|
|
4468
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4469
|
+
}
|
|
4470
|
+
remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
|
|
4471
|
+
}
|
|
4472
|
+
if (remaining.length > 0) {
|
|
4473
|
+
const allY = remaining.map((i) => i.y);
|
|
4474
|
+
const pageH = safeMax(allY) - safeMin(allY);
|
|
4475
|
+
const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
|
|
4476
|
+
const textBlocks = [];
|
|
4477
|
+
for (const group of groups) {
|
|
4478
|
+
if (group.length === 0) continue;
|
|
4479
|
+
const groupBlocks = extractPageBlocksFallback(group, pageNum);
|
|
4480
|
+
for (const b of groupBlocks) textBlocks.push(b);
|
|
4481
|
+
}
|
|
4482
|
+
const finalTextBlocks = detectListBlocks(textBlocks);
|
|
4483
|
+
for (const b of finalTextBlocks) blocks.push(b);
|
|
4484
|
+
}
|
|
4485
|
+
blocks.sort((a, b) => {
|
|
4190
4486
|
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4191
4487
|
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4192
4488
|
return by - ay;
|
|
4193
4489
|
});
|
|
4194
|
-
return mergeAdjacentTableBlocks(
|
|
4490
|
+
return mergeAdjacentTableBlocks(blocks);
|
|
4195
4491
|
}
|
|
4196
4492
|
return mergeAdjacentTableBlocks(blocks);
|
|
4197
4493
|
}
|
|
@@ -4217,57 +4513,53 @@ function mergeAdjacentTableBlocks(blocks) {
|
|
|
4217
4513
|
}
|
|
4218
4514
|
function extractPageBlocksFallback(items, pageNum) {
|
|
4219
4515
|
if (items.length === 0) return [];
|
|
4220
|
-
if (hasMultiColumnLayout(items)) {
|
|
4221
|
-
const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
|
|
4222
|
-
return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
|
|
4223
|
-
}
|
|
4224
4516
|
const blocks = [];
|
|
4225
|
-
const
|
|
4226
|
-
|
|
4227
|
-
|
|
4228
|
-
|
|
4229
|
-
|
|
4230
|
-
|
|
4231
|
-
|
|
4232
|
-
|
|
4233
|
-
|
|
4234
|
-
|
|
4235
|
-
|
|
4236
|
-
|
|
4237
|
-
|
|
4238
|
-
|
|
4239
|
-
|
|
4240
|
-
|
|
4241
|
-
|
|
4242
|
-
|
|
4243
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4244
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4245
|
-
const usedIndices = /* @__PURE__ */ new Set();
|
|
4246
|
-
for (const cr of clusterResults) {
|
|
4247
|
-
for (const ci of cr.usedItems) {
|
|
4248
|
-
const idx = ciToIdx.get(ci);
|
|
4249
|
-
if (idx !== void 0) usedIndices.add(idx);
|
|
4250
|
-
}
|
|
4251
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4517
|
+
const clusterItems = items.map((i) => ({
|
|
4518
|
+
text: i.text,
|
|
4519
|
+
x: i.x,
|
|
4520
|
+
y: i.y,
|
|
4521
|
+
w: i.w,
|
|
4522
|
+
h: i.h,
|
|
4523
|
+
fontSize: i.fontSize,
|
|
4524
|
+
fontName: i.fontName
|
|
4525
|
+
}));
|
|
4526
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4527
|
+
if (clusterResults.length > 0) {
|
|
4528
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4529
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4530
|
+
const usedIndices = /* @__PURE__ */ new Set();
|
|
4531
|
+
for (const cr of clusterResults) {
|
|
4532
|
+
for (const ci of cr.usedItems) {
|
|
4533
|
+
const idx = ciToIdx.get(ci);
|
|
4534
|
+
if (idx !== void 0) usedIndices.add(idx);
|
|
4252
4535
|
}
|
|
4253
|
-
|
|
4254
|
-
|
|
4255
|
-
|
|
4256
|
-
|
|
4257
|
-
|
|
4258
|
-
|
|
4259
|
-
|
|
4260
|
-
|
|
4261
|
-
|
|
4536
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4537
|
+
}
|
|
4538
|
+
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
4539
|
+
if (remaining.length > 0) {
|
|
4540
|
+
const yLines = groupByY(remaining);
|
|
4541
|
+
for (const line of yLines) {
|
|
4542
|
+
const text = mergeLineSimple(line);
|
|
4543
|
+
if (!text.trim()) continue;
|
|
4544
|
+
const bbox = computeBBox(line, pageNum);
|
|
4545
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
|
|
4262
4546
|
}
|
|
4263
|
-
|
|
4264
|
-
|
|
4265
|
-
|
|
4266
|
-
|
|
4267
|
-
|
|
4547
|
+
}
|
|
4548
|
+
blocks.sort((a, b) => {
|
|
4549
|
+
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4550
|
+
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4551
|
+
return by - ay;
|
|
4552
|
+
});
|
|
4553
|
+
} else {
|
|
4554
|
+
const allYLines = groupByY(items);
|
|
4555
|
+
const columns = detectColumns(allYLines);
|
|
4556
|
+
if (columns && columns.length >= 3) {
|
|
4557
|
+
const tableText = extractWithColumns(allYLines, columns);
|
|
4558
|
+
const bbox = computeBBox(items, pageNum);
|
|
4559
|
+
blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
|
|
4268
4560
|
} else {
|
|
4269
4561
|
const allY = items.map((i) => i.y);
|
|
4270
|
-
const pageHeight =
|
|
4562
|
+
const pageHeight = safeMax(allY) - safeMin(allY);
|
|
4271
4563
|
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
4272
4564
|
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
4273
4565
|
for (const group of orderedGroups) {
|
|
@@ -4320,22 +4612,76 @@ function dominantStyle(items) {
|
|
|
4320
4612
|
return { fontSize: dominantSize, fontName };
|
|
4321
4613
|
}
|
|
4322
4614
|
function normalizeItems(rawItems) {
|
|
4323
|
-
|
|
4615
|
+
const items = [];
|
|
4616
|
+
const spacePositions = [];
|
|
4617
|
+
for (const i of rawItems) {
|
|
4618
|
+
if (typeof i.str !== "string") continue;
|
|
4619
|
+
const x = Math.round(i.transform[4]);
|
|
4620
|
+
const y = Math.round(i.transform[5]);
|
|
4621
|
+
if (!i.str.trim()) {
|
|
4622
|
+
spacePositions.push({ x, y });
|
|
4623
|
+
continue;
|
|
4624
|
+
}
|
|
4324
4625
|
const scaleY = Math.abs(i.transform[3]);
|
|
4325
4626
|
const scaleX = Math.abs(i.transform[0]);
|
|
4326
4627
|
const fontSize = Math.round(Math.max(scaleY, scaleX));
|
|
4327
|
-
|
|
4328
|
-
|
|
4329
|
-
|
|
4330
|
-
|
|
4331
|
-
|
|
4332
|
-
|
|
4333
|
-
|
|
4334
|
-
|
|
4335
|
-
|
|
4336
|
-
|
|
4337
|
-
|
|
4338
|
-
|
|
4628
|
+
const w = Math.round(i.width);
|
|
4629
|
+
const h = Math.round(i.height);
|
|
4630
|
+
const isHidden = fontSize === 0 || i.width === 0 && i.str.trim().length > 0;
|
|
4631
|
+
let text = i.str.trim();
|
|
4632
|
+
if (/^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text)) {
|
|
4633
|
+
text = text.replace(/ /g, "");
|
|
4634
|
+
}
|
|
4635
|
+
const split = splitEvenSpacedItem(text, x, w, fontSize);
|
|
4636
|
+
if (split) {
|
|
4637
|
+
for (const s of split) {
|
|
4638
|
+
items.push({ text: s.text, x: s.x, y, w: s.w, h, fontSize, fontName: i.fontName || "", isHidden });
|
|
4639
|
+
}
|
|
4640
|
+
} else {
|
|
4641
|
+
items.push({ text, x, y, w, h, fontSize, fontName: i.fontName || "", isHidden });
|
|
4642
|
+
}
|
|
4643
|
+
}
|
|
4644
|
+
const sorted = items.sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4645
|
+
const deduped = [];
|
|
4646
|
+
for (let i = 0; i < sorted.length; i++) {
|
|
4647
|
+
let isDup = false;
|
|
4648
|
+
for (let j = deduped.length - 1; j >= 0; j--) {
|
|
4649
|
+
const prev = deduped[j];
|
|
4650
|
+
if (prev.y - sorted[i].y > 3) break;
|
|
4651
|
+
if (Math.abs(prev.y - sorted[i].y) <= 3 && prev.text === sorted[i].text && Math.abs(prev.x - sorted[i].x) <= 3) {
|
|
4652
|
+
isDup = true;
|
|
4653
|
+
break;
|
|
4654
|
+
}
|
|
4655
|
+
}
|
|
4656
|
+
if (!isDup) deduped.push(sorted[i]);
|
|
4657
|
+
}
|
|
4658
|
+
if (spacePositions.length > 0) {
|
|
4659
|
+
for (const item of deduped) {
|
|
4660
|
+
for (const sp of spacePositions) {
|
|
4661
|
+
if (Math.abs(sp.y - item.y) <= 3) {
|
|
4662
|
+
const dist = item.x - sp.x;
|
|
4663
|
+
if (dist >= 0 && dist <= 20) {
|
|
4664
|
+
item.hasSpaceBefore = true;
|
|
4665
|
+
break;
|
|
4666
|
+
}
|
|
4667
|
+
}
|
|
4668
|
+
}
|
|
4669
|
+
}
|
|
4670
|
+
}
|
|
4671
|
+
return deduped;
|
|
4672
|
+
}
|
|
4673
|
+
function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
|
|
4674
|
+
if (!/^[가-힣\d](?: [가-힣\d]){2,}$/.test(text)) return null;
|
|
4675
|
+
const chars = text.split(" ");
|
|
4676
|
+
if (chars.length < 3) return null;
|
|
4677
|
+
const charW = itemW / chars.length;
|
|
4678
|
+
if (charW > fontSize * 2) return null;
|
|
4679
|
+
return chars.map((ch, idx) => ({
|
|
4680
|
+
text: ch,
|
|
4681
|
+
x: Math.round(itemX + idx * charW),
|
|
4682
|
+
w: Math.round(charW * 0.8)
|
|
4683
|
+
// 실제 글자 폭은 간격보다 좁음
|
|
4684
|
+
}));
|
|
4339
4685
|
}
|
|
4340
4686
|
function groupByY(items) {
|
|
4341
4687
|
if (items.length === 0) return [];
|
|
@@ -4360,14 +4706,14 @@ function isProseSpread(items) {
|
|
|
4360
4706
|
for (let i = 1; i < sorted.length; i++) {
|
|
4361
4707
|
gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
|
|
4362
4708
|
}
|
|
4363
|
-
const maxGap =
|
|
4709
|
+
const maxGap = safeMax(gaps);
|
|
4364
4710
|
const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
|
|
4365
4711
|
return maxGap < 40 && avgLen < 5;
|
|
4366
4712
|
}
|
|
4367
4713
|
function detectColumns(yLines) {
|
|
4368
4714
|
const allItems = yLines.flat();
|
|
4369
4715
|
if (allItems.length === 0) return null;
|
|
4370
|
-
const pageWidth =
|
|
4716
|
+
const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
|
|
4371
4717
|
if (pageWidth < 100) return null;
|
|
4372
4718
|
let bigoLineIdx = -1;
|
|
4373
4719
|
for (let i = 0; i < yLines.length; i++) {
|
|
@@ -4399,7 +4745,7 @@ function detectColumns(yLines) {
|
|
|
4399
4745
|
}
|
|
4400
4746
|
const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
|
|
4401
4747
|
if (peaks.length < 3) return null;
|
|
4402
|
-
const MERGE_TOL =
|
|
4748
|
+
const MERGE_TOL = 40;
|
|
4403
4749
|
const merged = [peaks[0]];
|
|
4404
4750
|
for (let i = 1; i < peaks.length; i++) {
|
|
4405
4751
|
const prev = merged[merged.length - 1];
|
|
@@ -4413,7 +4759,14 @@ function detectColumns(yLines) {
|
|
|
4413
4759
|
merged.push({ ...peaks[i] });
|
|
4414
4760
|
}
|
|
4415
4761
|
}
|
|
4416
|
-
const
|
|
4762
|
+
const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
|
|
4763
|
+
if (rawColumns.length < 3) return null;
|
|
4764
|
+
const MIN_DETECT_COL_WIDTH = 30;
|
|
4765
|
+
const columns = [rawColumns[0]];
|
|
4766
|
+
for (let i = 1; i < rawColumns.length; i++) {
|
|
4767
|
+
if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
|
|
4768
|
+
columns.push(rawColumns[i]);
|
|
4769
|
+
}
|
|
4417
4770
|
return columns.length >= 3 ? columns : null;
|
|
4418
4771
|
}
|
|
4419
4772
|
function findColumn(x, columns) {
|
|
@@ -4541,6 +4894,16 @@ function buildGridTable(lines, columns) {
|
|
|
4541
4894
|
}
|
|
4542
4895
|
merged.splice(0, headerEnd, headerRow);
|
|
4543
4896
|
}
|
|
4897
|
+
for (const row of merged) {
|
|
4898
|
+
for (let c = 0; c < row.length; c++) {
|
|
4899
|
+
if (row[c]) row[c] = collapseEvenSpacing(row[c]);
|
|
4900
|
+
}
|
|
4901
|
+
}
|
|
4902
|
+
const totalCells = merged.length * numCols;
|
|
4903
|
+
const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
|
|
4904
|
+
if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
|
|
4905
|
+
return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
|
|
4906
|
+
}
|
|
4544
4907
|
const md = [];
|
|
4545
4908
|
md.push("| " + merged[0].join(" | ") + " |");
|
|
4546
4909
|
md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
|
|
@@ -4552,12 +4915,32 @@ function buildGridTable(lines, columns) {
|
|
|
4552
4915
|
function mergeLineSimple(items) {
|
|
4553
4916
|
if (items.length <= 1) return items[0]?.text || "";
|
|
4554
4917
|
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
4918
|
+
const isEvenSpaced = detectEvenSpacedItems(sorted);
|
|
4555
4919
|
let result = sorted[0].text;
|
|
4556
4920
|
for (let i = 1; i < sorted.length; i++) {
|
|
4557
4921
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
4558
4922
|
const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
|
|
4559
|
-
|
|
4560
|
-
|
|
4923
|
+
const tabThreshold = Math.max(avgFs * 2, 30);
|
|
4924
|
+
if (gap > tabThreshold) {
|
|
4925
|
+
result += " ";
|
|
4926
|
+
result += sorted[i].text;
|
|
4927
|
+
continue;
|
|
4928
|
+
}
|
|
4929
|
+
if (isEvenSpaced[i]) {
|
|
4930
|
+
result += sorted[i].text;
|
|
4931
|
+
continue;
|
|
4932
|
+
}
|
|
4933
|
+
if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
|
|
4934
|
+
result += " ";
|
|
4935
|
+
result += sorted[i].text;
|
|
4936
|
+
continue;
|
|
4937
|
+
}
|
|
4938
|
+
if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
|
|
4939
|
+
result += " ";
|
|
4940
|
+
result += sorted[i].text;
|
|
4941
|
+
continue;
|
|
4942
|
+
}
|
|
4943
|
+
if (gap < avgFs * 0.15) {
|
|
4561
4944
|
} else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
|
|
4562
4945
|
} else if (gap > 3) result += " ";
|
|
4563
4946
|
result += sorted[i].text;
|
|
@@ -4566,8 +4949,8 @@ function mergeLineSimple(items) {
|
|
|
4566
4949
|
}
|
|
4567
4950
|
function cleanPdfText(text) {
|
|
4568
4951
|
return mergeKoreanLines(
|
|
4569
|
-
text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
|
|
4570
|
-
).replace(/^(?!\|)
|
|
4952
|
+
text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
|
|
4953
|
+
).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
|
|
4571
4954
|
}
|
|
4572
4955
|
function startsWithMarker(line) {
|
|
4573
4956
|
const t = line.trimStart();
|
|
@@ -4759,7 +5142,7 @@ function mergeKoreanLines(text) {
|
|
|
4759
5142
|
result[result.length - 1] = prev + " " + currTrimmed;
|
|
4760
5143
|
continue;
|
|
4761
5144
|
}
|
|
4762
|
-
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
|
|
5145
|
+
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev) && !startsWithMarker(prev)) {
|
|
4763
5146
|
result[result.length - 1] = prev + " " + curr;
|
|
4764
5147
|
} else {
|
|
4765
5148
|
result.push(curr);
|
|
@@ -4772,7 +5155,7 @@ function mergeKoreanLines(text) {
|
|
|
4772
5155
|
import { readFile } from "fs/promises";
|
|
4773
5156
|
|
|
4774
5157
|
// src/xlsx/parser.ts
|
|
4775
|
-
import
|
|
5158
|
+
import JSZip2 from "jszip";
|
|
4776
5159
|
import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
|
|
4777
5160
|
var MAX_SHEETS = 100;
|
|
4778
5161
|
var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024;
|
|
@@ -4810,7 +5193,7 @@ function getTextContent(el) {
|
|
|
4810
5193
|
return el.textContent?.trim() ?? "";
|
|
4811
5194
|
}
|
|
4812
5195
|
function parseXml(text) {
|
|
4813
|
-
return new DOMParser2().parseFromString(text, "text/xml");
|
|
5196
|
+
return new DOMParser2().parseFromString(stripDtd(text), "text/xml");
|
|
4814
5197
|
}
|
|
4815
5198
|
function parseSharedStrings(xml) {
|
|
4816
5199
|
const doc = parseXml(xml);
|
|
@@ -4963,7 +5346,7 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
4963
5346
|
}
|
|
4964
5347
|
async function parseXlsxDocument(buffer, options) {
|
|
4965
5348
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
|
|
4966
|
-
const zip = await
|
|
5349
|
+
const zip = await JSZip2.loadAsync(buffer);
|
|
4967
5350
|
const warnings = [];
|
|
4968
5351
|
const workbookFile = zip.file("xl/workbook.xml");
|
|
4969
5352
|
if (!workbookFile) {
|
|
@@ -4985,7 +5368,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
4985
5368
|
}
|
|
4986
5369
|
let pageFilter = null;
|
|
4987
5370
|
if (options?.pages) {
|
|
4988
|
-
const { parsePageRange: parsePageRange2 } = await import("./page-range-
|
|
5371
|
+
const { parsePageRange: parsePageRange2 } = await import("./page-range-OF5I4PQY.js");
|
|
4989
5372
|
pageFilter = parsePageRange2(options.pages, sheets.length);
|
|
4990
5373
|
}
|
|
4991
5374
|
const blocks = [];
|
|
@@ -5053,7 +5436,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
5053
5436
|
}
|
|
5054
5437
|
|
|
5055
5438
|
// src/docx/parser.ts
|
|
5056
|
-
import
|
|
5439
|
+
import JSZip3 from "jszip";
|
|
5057
5440
|
import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
|
|
5058
5441
|
var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
|
|
5059
5442
|
function getChildElements(parent, localName) {
|
|
@@ -5097,7 +5480,7 @@ function getAttr(el, localName) {
|
|
|
5097
5480
|
return null;
|
|
5098
5481
|
}
|
|
5099
5482
|
function parseXml2(text) {
|
|
5100
|
-
return new DOMParser3().parseFromString(text, "text/xml");
|
|
5483
|
+
return new DOMParser3().parseFromString(stripDtd(text), "text/xml");
|
|
5101
5484
|
}
|
|
5102
5485
|
function parseStyles(xml) {
|
|
5103
5486
|
const doc = parseXml2(xml);
|
|
@@ -5391,7 +5774,7 @@ async function extractImages(zip, rels, doc) {
|
|
|
5391
5774
|
}
|
|
5392
5775
|
async function parseDocxDocument(buffer, options) {
|
|
5393
5776
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
|
|
5394
|
-
const zip = await
|
|
5777
|
+
const zip = await JSZip3.loadAsync(buffer);
|
|
5395
5778
|
const warnings = [];
|
|
5396
5779
|
const docFile = zip.file("word/document.xml");
|
|
5397
5780
|
if (!docFile) {
|
|
@@ -5608,7 +5991,7 @@ function extractInlineFields(text) {
|
|
|
5608
5991
|
}
|
|
5609
5992
|
|
|
5610
5993
|
// src/hwpx/generator.ts
|
|
5611
|
-
import
|
|
5994
|
+
import JSZip4 from "jszip";
|
|
5612
5995
|
|
|
5613
5996
|
// src/index.ts
|
|
5614
5997
|
async function parse(input, options) {
|
|
@@ -5703,7 +6086,13 @@ function normalize(s) {
|
|
|
5703
6086
|
}
|
|
5704
6087
|
var MAX_LEVENSHTEIN_LEN = 1e4;
|
|
5705
6088
|
function levenshtein(a, b) {
|
|
5706
|
-
if (a.length + b.length > MAX_LEVENSHTEIN_LEN)
|
|
6089
|
+
if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
|
|
6090
|
+
const sampleLen = Math.min(500, a.length, b.length);
|
|
6091
|
+
let diffs = 0;
|
|
6092
|
+
for (let i = 0; i < sampleLen; i++) if (a[i] !== b[i]) diffs++;
|
|
6093
|
+
const sampleRate = sampleLen > 0 ? diffs / sampleLen : 1;
|
|
6094
|
+
return Math.abs(a.length - b.length) + Math.round(Math.min(a.length, b.length) * sampleRate);
|
|
6095
|
+
}
|
|
5707
6096
|
if (a.length > b.length) [a, b] = [b, a];
|
|
5708
6097
|
const m = a.length;
|
|
5709
6098
|
const n = b.length;
|
|
@@ -5859,7 +6248,10 @@ function diffTableCells(a, b) {
|
|
|
5859
6248
|
}
|
|
5860
6249
|
|
|
5861
6250
|
export {
|
|
5862
|
-
|
|
6251
|
+
VERSION,
|
|
6252
|
+
toArrayBuffer,
|
|
6253
|
+
KordocError,
|
|
6254
|
+
sanitizeError,
|
|
5863
6255
|
blocksToMarkdown,
|
|
5864
6256
|
extractHwpxMetadataOnly,
|
|
5865
6257
|
extractHwp5MetadataOnly,
|
|
@@ -5868,4 +6260,4 @@ export {
|
|
|
5868
6260
|
extractFormFields,
|
|
5869
6261
|
parse
|
|
5870
6262
|
};
|
|
5871
|
-
//# sourceMappingURL=chunk-
|
|
6263
|
+
//# sourceMappingURL=chunk-FINXMRCH.js.map
|