kordoc 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/dist/{chunk-XJYM2AUA.js → chunk-GJ2S6IMC.js} +457 -35
- package/dist/chunk-GJ2S6IMC.js.map +1 -0
- package/dist/{chunk-EVWOJ4T5.js → chunk-PKIJLEV6.js} +2 -2
- package/dist/cli.js +4 -4
- package/dist/index.cjs +456 -33
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +456 -33
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -2
- package/dist/{provider-A4FHJSID.js → provider-7H4CPZYS.js} +2 -1
- package/dist/provider-7H4CPZYS.js.map +1 -0
- package/dist/{utils-6JEIFBCJ.js → utils-BWQ2RGUD.js} +2 -2
- package/dist/{watch-BCPDLGOE.js → watch-X7IC7MLF.js} +9 -5
- package/dist/watch-X7IC7MLF.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-XJYM2AUA.js.map +0 -1
- package/dist/provider-A4FHJSID.js.map +0 -1
- package/dist/watch-BCPDLGOE.js.map +0 -1
- /package/dist/{chunk-EVWOJ4T5.js.map → chunk-PKIJLEV6.js.map} +0 -0
- /package/dist/{utils-6JEIFBCJ.js.map → utils-BWQ2RGUD.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -63,6 +63,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
|
|
|
63
63
|
blocks.push({ type: "paragraph", text: text.trim(), pageNumber: i });
|
|
64
64
|
}
|
|
65
65
|
} catch {
|
|
66
|
+
blocks.push({ type: "paragraph", text: `[OCR \uC2E4\uD328: \uD398\uC774\uC9C0 ${i}]` });
|
|
66
67
|
}
|
|
67
68
|
}
|
|
68
69
|
return blocks;
|
|
@@ -138,7 +139,7 @@ import { inflateRawSync } from "zlib";
|
|
|
138
139
|
import { DOMParser } from "@xmldom/xmldom";
|
|
139
140
|
|
|
140
141
|
// src/utils.ts
|
|
141
|
-
var VERSION = true ? "2.0
|
|
142
|
+
var VERSION = true ? "2.1.0" : "0.0.0-dev";
|
|
142
143
|
function toArrayBuffer(buf) {
|
|
143
144
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
144
145
|
return buf.buffer;
|
|
@@ -327,6 +328,47 @@ function sanitizeText(text) {
|
|
|
327
328
|
}
|
|
328
329
|
return result;
|
|
329
330
|
}
|
|
331
|
+
/**
 * Replaces small "layout" tables with plain paragraph blocks and leaves
 * every other block untouched.
 *
 * A table block is flattened when it is not 1x1, has at most three rows, and
 * either contains more than five line breaks across its cells or has at most
 * two rows with more than 300 characters of cell text in total. Each
 * non-empty, trimmed line of each cell becomes one paragraph block that
 * carries the table block's `pageNumber`.
 *
 * @param {Array<object>} blocks - Document blocks, in order.
 * @returns {Array<object>} A new array; the input array is not modified.
 */
function flattenLayoutTables(blocks) {
  // Returns the replacement paragraphs for a layout table, or null when the
  // table must be kept as-is.
  const explodeIfLayout = (tableBlock) => {
    const { rows: rowCount, cols: colCount, cells } = tableBlock.table;
    if (rowCount === 1 && colCount === 1) return null;
    if (!(rowCount <= 3)) return null;

    const rawTexts = [];
    let newlineCount = 0;
    let charCount = 0;
    for (let r = 0; r < rowCount; r++) {
      for (let c = 0; c < colCount; c++) {
        const raw = cells[r]?.[c]?.text;
        rawTexts.push(raw);
        const text = raw || "";
        newlineCount += (text.match(/\n/g) || []).length;
        charCount += text.length;
      }
    }

    const looksLikeLayout = newlineCount > 5 || (rowCount <= 2 && charCount > 300);
    if (!looksLikeLayout) return null;

    const paragraphs = [];
    for (const raw of rawTexts) {
      const body = raw?.trim();
      if (!body) continue;
      for (const piece of body.split("\n")) {
        const line = piece.trim();
        if (line) {
          paragraphs.push({ type: "paragraph", text: line, pageNumber: tableBlock.pageNumber });
        }
      }
    }
    return paragraphs;
  };

  const flattened = [];
  for (const block of blocks) {
    const isTable = block.type === "table" && Boolean(block.table);
    const paragraphs = isTable ? explodeIfLayout(block) : null;
    if (paragraphs === null) {
      flattened.push(block);
    } else {
      for (const paragraph of paragraphs) flattened.push(paragraph);
    }
  }
  return flattened;
}
|
|
330
372
|
function blocksToMarkdown(blocks) {
|
|
331
373
|
const lines = [];
|
|
332
374
|
for (let i = 0; i < blocks.length; i++) {
|
|
@@ -427,6 +469,9 @@ function tableToMarkdown(table) {
|
|
|
427
469
|
if (dr === 0 && dc === 0) continue;
|
|
428
470
|
if (r + dr < numRows && c + dc < numCols) {
|
|
429
471
|
skip.add(`${r + dr},${c + dc}`);
|
|
472
|
+
if (dr === 0) {
|
|
473
|
+
display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
|
|
474
|
+
}
|
|
430
475
|
}
|
|
431
476
|
}
|
|
432
477
|
}
|
|
@@ -522,7 +567,12 @@ function parseCharProperties(doc, map) {
|
|
|
522
567
|
if (!id) continue;
|
|
523
568
|
const prop = {};
|
|
524
569
|
const height = el.getAttribute("height");
|
|
525
|
-
if (height)
|
|
570
|
+
if (height) {
|
|
571
|
+
const parsedHeight = parseInt(height, 10);
|
|
572
|
+
if (!isNaN(parsedHeight) && parsedHeight > 0) {
|
|
573
|
+
prop.fontSize = parsedHeight / 100;
|
|
574
|
+
}
|
|
575
|
+
}
|
|
526
576
|
const bold = el.getAttribute("bold");
|
|
527
577
|
if (bold === "true" || bold === "1") prop.bold = true;
|
|
528
578
|
const italic = el.getAttribute("italic");
|
|
@@ -662,7 +712,7 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
662
712
|
const data = await file.async("uint8array");
|
|
663
713
|
decompressed.total += data.length;
|
|
664
714
|
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
665
|
-
const ext = ref.includes(".") ? ref.split(".").pop() : "png";
|
|
715
|
+
const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
|
|
666
716
|
const mimeType = imageExtToMime(ext);
|
|
667
717
|
imageIndex++;
|
|
668
718
|
const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
|
|
@@ -956,8 +1006,10 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
956
1006
|
break;
|
|
957
1007
|
case "cellSpan":
|
|
958
1008
|
if (tableCtx?.cell) {
|
|
959
|
-
const
|
|
960
|
-
const
|
|
1009
|
+
const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
|
|
1010
|
+
const cs = isNaN(rawCs) ? 1 : rawCs;
|
|
1011
|
+
const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
|
|
1012
|
+
const rs = isNaN(rawRs) ? 1 : rawRs;
|
|
961
1013
|
tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
|
|
962
1014
|
tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
|
|
963
1015
|
}
|
|
@@ -1049,6 +1101,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1049
1101
|
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
1050
1102
|
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
1051
1103
|
walkChildren(el, d + 1);
|
|
1104
|
+
} else if (localTag === "run") {
|
|
1105
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
|
|
1052
1106
|
}
|
|
1053
1107
|
}
|
|
1054
1108
|
};
|
|
@@ -1225,8 +1279,9 @@ var TAG_CHAR_SHAPE = 68;
|
|
|
1225
1279
|
var TAG_CTRL_HEADER = 71;
|
|
1226
1280
|
var TAG_LIST_HEADER = 72;
|
|
1227
1281
|
var TAG_TABLE = 77;
|
|
1228
|
-
var TAG_DOC_CHAR_SHAPE =
|
|
1229
|
-
var
|
|
1282
|
+
var TAG_DOC_CHAR_SHAPE = 21;
|
|
1283
|
+
var TAG_DOC_PARA_SHAPE = 25;
|
|
1284
|
+
var TAG_DOC_STYLE = 26;
|
|
1230
1285
|
var CHAR_LINE = 0;
|
|
1231
1286
|
var CHAR_SECTION_BREAK = 10;
|
|
1232
1287
|
var CHAR_PARA = 13;
|
|
@@ -1282,8 +1337,14 @@ function parseFileHeader(data) {
|
|
|
1282
1337
|
}
|
|
1283
1338
|
function parseDocInfo(records) {
|
|
1284
1339
|
const charShapes = [];
|
|
1340
|
+
const paraShapes = [];
|
|
1285
1341
|
const styles = [];
|
|
1286
1342
|
for (const rec of records) {
|
|
1343
|
+
if (rec.tagId === TAG_DOC_PARA_SHAPE && rec.data.length >= 4) {
|
|
1344
|
+
const flags = rec.data.readUInt32LE(0);
|
|
1345
|
+
const outlineLevel = flags >> 25 & 7;
|
|
1346
|
+
paraShapes.push({ outlineLevel });
|
|
1347
|
+
}
|
|
1287
1348
|
if (rec.tagId === TAG_DOC_CHAR_SHAPE && rec.data.length >= 18) {
|
|
1288
1349
|
if (rec.data.length >= 50) {
|
|
1289
1350
|
const fontSize = rec.data.readUInt32LE(42);
|
|
@@ -1323,7 +1384,7 @@ function parseDocInfo(records) {
|
|
|
1323
1384
|
}
|
|
1324
1385
|
}
|
|
1325
1386
|
}
|
|
1326
|
-
return { charShapes, styles };
|
|
1387
|
+
return { charShapes, paraShapes, styles };
|
|
1327
1388
|
}
|
|
1328
1389
|
function extractText(data) {
|
|
1329
1390
|
let result = "";
|
|
@@ -2334,12 +2395,13 @@ function parseHwp5Document(buffer, options) {
|
|
|
2334
2395
|
}
|
|
2335
2396
|
}
|
|
2336
2397
|
const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
|
|
2398
|
+
const flatBlocks = flattenLayoutTables(blocks);
|
|
2337
2399
|
if (docInfo) {
|
|
2338
|
-
detectHwp5Headings(
|
|
2400
|
+
detectHwp5Headings(flatBlocks, docInfo);
|
|
2339
2401
|
}
|
|
2340
|
-
const outline =
|
|
2341
|
-
const markdown = blocksToMarkdown(
|
|
2342
|
-
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
2402
|
+
const outline = flatBlocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
2403
|
+
const markdown = blocksToMarkdown(flatBlocks);
|
|
2404
|
+
return { markdown, blocks: flatBlocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
2343
2405
|
}
|
|
2344
2406
|
function parseDocInfoStream(cfb, compressed) {
|
|
2345
2407
|
try {
|
|
@@ -2390,16 +2452,21 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
2390
2452
|
}
|
|
2391
2453
|
if (baseFontSize <= 0) return;
|
|
2392
2454
|
for (const block of blocks) {
|
|
2393
|
-
if (block.type
|
|
2455
|
+
if (block.type === "heading") continue;
|
|
2456
|
+
if (block.type !== "paragraph" || !block.text) continue;
|
|
2394
2457
|
const text = block.text.trim();
|
|
2395
2458
|
if (text.length === 0 || text.length > 200) continue;
|
|
2396
2459
|
if (/^\d+$/.test(text)) continue;
|
|
2397
|
-
const ratio = block.style.fontSize / baseFontSize;
|
|
2398
2460
|
let level = 0;
|
|
2399
|
-
if (
|
|
2400
|
-
|
|
2401
|
-
|
|
2402
|
-
|
|
2461
|
+
if (block.style?.fontSize && baseFontSize > 0) {
|
|
2462
|
+
const ratio = block.style.fontSize / baseFontSize;
|
|
2463
|
+
if (ratio >= HEADING_RATIO_H1) level = 1;
|
|
2464
|
+
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
2465
|
+
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
2466
|
+
}
|
|
2467
|
+
if (/^제\d+[장절편]\s/.test(text) && text.length <= 50) {
|
|
2468
|
+
if (level === 0) level = 2;
|
|
2469
|
+
} else if (/^제\d+(조의?\d*)\s*[\((]/.test(text) && text.length <= 80) {
|
|
2403
2470
|
if (level === 0) level = 3;
|
|
2404
2471
|
}
|
|
2405
2472
|
if (level > 0) {
|
|
@@ -2631,13 +2698,20 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2631
2698
|
while (i < records.length) {
|
|
2632
2699
|
const rec = records[i];
|
|
2633
2700
|
if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
|
|
2634
|
-
const { paragraph, tables, nextIdx, charShapeIds } = parseParagraphWithTables(records, i);
|
|
2701
|
+
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
|
|
2635
2702
|
if (paragraph) {
|
|
2636
2703
|
const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
|
|
2637
2704
|
if (docInfo && charShapeIds.length > 0) {
|
|
2638
2705
|
const style = resolveCharStyle(charShapeIds, docInfo);
|
|
2639
2706
|
if (style) block.style = style;
|
|
2640
2707
|
}
|
|
2708
|
+
if (docInfo && paraShapeId >= 0 && paraShapeId < docInfo.paraShapes.length) {
|
|
2709
|
+
const ol = docInfo.paraShapes[paraShapeId].outlineLevel;
|
|
2710
|
+
if (ol >= 1 && ol <= 6) {
|
|
2711
|
+
block.type = "heading";
|
|
2712
|
+
block.level = ol;
|
|
2713
|
+
}
|
|
2714
|
+
}
|
|
2641
2715
|
blocks.push(block);
|
|
2642
2716
|
}
|
|
2643
2717
|
for (const t of tables) blocks.push({ type: "table", table: t, pageNumber: sectionNum });
|
|
@@ -2757,6 +2831,8 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2757
2831
|
let text = "";
|
|
2758
2832
|
const tables = [];
|
|
2759
2833
|
const charShapeIds = [];
|
|
2834
|
+
const paraHeaderData = records[startIdx].data;
|
|
2835
|
+
const paraShapeId = paraHeaderData.length >= 10 ? paraHeaderData.readUInt16LE(8) : -1;
|
|
2760
2836
|
let i = startIdx + 1;
|
|
2761
2837
|
while (i < records.length) {
|
|
2762
2838
|
const rec = records[i];
|
|
@@ -2781,7 +2857,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2781
2857
|
i++;
|
|
2782
2858
|
}
|
|
2783
2859
|
const trimmed = text.trim();
|
|
2784
|
-
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds };
|
|
2860
|
+
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
|
|
2785
2861
|
}
|
|
2786
2862
|
function parseTableBlock(records, startIdx) {
|
|
2787
2863
|
const tableLevel = records[startIdx].level;
|
|
@@ -2894,10 +2970,33 @@ var MIN_LINE_LENGTH = 10;
|
|
|
2894
2970
|
var COORD_MERGE_TOL = 3;
|
|
2895
2971
|
var CONNECT_TOL = 5;
|
|
2896
2972
|
var CELL_PADDING = 2;
|
|
2973
|
+
var MAX_LINE_WIDTH = 5;
|
|
2974
|
+
var IDENTITY = [1, 0, 0, 1, 0, 0];
|
|
2975
|
+
/**
 * Multiplies two 2D affine matrices given as six-element arrays
 * `[a, b, c, d, e, f]`, each standing for the 3x3 matrix
 * `[[a, c, e], [b, d, f], [0, 0, 1]]`, and returns `m1 x m2` in the same
 * six-element form.
 *
 * @param {ArrayLike<number>} m1 - Left-hand matrix.
 * @param {ArrayLike<number>} m2 - Right-hand matrix.
 * @returns {number[]} The product as a new six-element array.
 */
function matMultiply(m1, m2) {
  const a1 = m1[0], b1 = m1[1], c1 = m1[2], d1 = m1[3], e1 = m1[4], f1 = m1[5];
  const a2 = m2[0], b2 = m2[1], c2 = m2[2], d2 = m2[3], e2 = m2[4], f2 = m2[5];

  const a = a1 * a2 + c1 * b2;
  const b = b1 * a2 + d1 * b2;
  const c = a1 * c2 + c1 * d2;
  const d = b1 * c2 + d1 * d2;
  // The translation of m2 is mapped through the linear part of m1, then
  // m1's own translation is added.
  const e = a1 * e2 + c1 * f2 + e1;
  const f = b1 * e2 + d1 * f2 + f1;
  return [a, b, c, d, e, f];
}
|
|
2985
|
+
/**
 * Applies the affine matrix `m = [a, b, c, d, e, f]` to the point `(x, y)`.
 *
 * @param {ArrayLike<number>} m - Six-element affine matrix.
 * @param {number} x - Input x coordinate.
 * @param {number} y - Input y coordinate.
 * @returns {[number, number]} `[a*x + c*y + e, b*x + d*y + f]`.
 */
function matTransformPoint(m, x, y) {
  const outX = m[0] * x + m[2] * y + m[4];
  const outY = m[1] * x + m[3] * y + m[5];
  return [outX, outY];
}
|
|
2988
|
+
/**
 * Returns a single scale factor for the affine matrix `m`: the larger of
 * `sqrt(m[1]^2 + m[3]^2)` and `sqrt(m[0]^2 + m[2]^2)`.
 *
 * @param {ArrayLike<number>} m - Six-element affine matrix.
 * @returns {number} The larger of the two magnitudes.
 */
function matScale(m) {
  const magnitudeBD = Math.sqrt(m[1] * m[1] + m[3] * m[3]);
  const magnitudeAC = Math.sqrt(m[0] * m[0] + m[2] * m[2]);
  return Math.max(magnitudeBD, magnitudeAC);
}
|
|
2897
2994
|
function extractLines(fnArray, argsArray) {
|
|
2898
2995
|
const horizontals = [];
|
|
2899
2996
|
const verticals = [];
|
|
2997
|
+
let ctm = [...IDENTITY];
|
|
2900
2998
|
let lineWidth = 1;
|
|
2999
|
+
const stateStack = [];
|
|
2901
3000
|
let currentPath = [];
|
|
2902
3001
|
let pathStartX = 0, pathStartY = 0;
|
|
2903
3002
|
let curX = 0, curY = 0;
|
|
@@ -2915,13 +3014,53 @@ function extractLines(fnArray, argsArray) {
|
|
|
2915
3014
|
);
|
|
2916
3015
|
}
|
|
2917
3016
|
}
|
|
2918
|
-
function
|
|
2919
|
-
if (
|
|
3017
|
+
/**
 * Tries to reinterpret a closed path of 3-5 line segments as a rectangle and
 * rewrites `path` in place when it succeeds.
 *
 * The path counts as closed when its first start point and last end point
 * are less than 1 unit apart on both axes. The bounding box of all segment
 * end points is then turned into one of:
 *   - a single horizontal segment through the vertical middle of the box,
 *   - a single vertical segment through the horizontal middle of the box,
 *   - whatever `pushRectangle(path, minX, minY, w, h)` appends.
 *
 * @param {Array<{x1:number,y1:number,x2:number,y2:number}>} path - Segments;
 *   emptied and refilled on success, left untouched on failure.
 * @returns {boolean} `true` when the path was converted, `false` otherwise.
 */
function tryConvertLinesToRectangle(path) {
  const segmentCount = path.length;
  if (segmentCount < 3 || segmentCount > 5) return false;

  const head = path[0];
  const tail = path[segmentCount - 1];
  const endsMeet = Math.abs(head.x1 - tail.x2) < 1 && Math.abs(head.y1 - tail.y2) < 1;
  if (!endsMeet) return false;

  let left = Infinity;
  let bottom = Infinity;
  let right = -Infinity;
  let top = -Infinity;
  for (const { x1, y1, x2, y2 } of path) {
    left = Math.min(left, x1, x2);
    bottom = Math.min(bottom, y1, y2);
    right = Math.max(right, x1, x2);
    top = Math.max(top, y1, y2);
  }

  const w = right - left;
  const h = top - bottom;
  // Boxes shorter than the minimum line length on both axes are rejected.
  if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;

  path.length = 0;
  const flat = h < ORIENTATION_TOL * 2 || (w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH);
  const narrow = w < ORIENTATION_TOL * 2 || (h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH);
  if (flat) {
    const midY = (bottom + top) / 2;
    path.push({ x1: left, y1: midY, x2: right, y2: midY });
  } else if (narrow) {
    const midX = (left + right) / 2;
    path.push({ x1: midX, y1: bottom, x2: midX, y2: top });
  } else {
    pushRectangle(path, left, bottom, w, h);
  }
  return true;
}
|
|
3041
|
+
/**
 * Emits the segments collected in `currentPath` and then resets it.
 *
 * Reads and writes variables from the enclosing scope, which is not part of
 * this block: `currentPath`, `ctm`, `lineWidth`, `horizontals`, `verticals`.
 *
 * - When the path is neither stroked nor filled, it is dropped.
 * - A fill-only path with at least three segments is first offered to
 *   `tryConvertLinesToRectangle`.
 * - A stroke-only path whose line width, scaled by `matScale(ctm)`, exceeds
 *   `MAX_LINE_WIDTH` is dropped.
 * - Otherwise every segment is transformed through `ctm` and handed to
 *   `classifyAndAdd` together with the scaled line width.
 *
 * @param {boolean} isStroke - Whether the paint operation strokes the path.
 * @param {boolean} isFill - Whether the paint operation fills the path.
 * @returns {void}
 */
function flushPath(isStroke, isFill) {
  if (isStroke || isFill) {
    const fillOnly = isFill && !isStroke;
    if (fillOnly && currentPath.length >= 3) {
      tryConvertLinesToRectangle(currentPath);
    }

    const scaledWidth = lineWidth * matScale(ctm);
    const strokeOnly = isStroke && !isFill;
    const tooThick = scaledWidth > MAX_LINE_WIDTH && strokeOnly;
    if (!tooThick) {
      for (const seg of currentPath) {
        const [startX, startY] = matTransformPoint(ctm, seg.x1, seg.y1);
        const [endX, endY] = matTransformPoint(ctm, seg.x2, seg.y2);
        classifyAndAdd(
          { x1: startX, y1: startY, x2: endX, y2: endY },
          scaledWidth,
          horizontals,
          verticals
        );
      }
    }
  }
  currentPath = [];
}
|
|
@@ -2929,9 +3068,28 @@ function extractLines(fnArray, argsArray) {
|
|
|
2929
3068
|
const op = fnArray[i];
|
|
2930
3069
|
const args = argsArray[i];
|
|
2931
3070
|
switch (op) {
|
|
3071
|
+
// ── Graphics State ──
|
|
3072
|
+
case OPS.save:
|
|
3073
|
+
stateStack.push({ ctm: [...ctm], lineWidth });
|
|
3074
|
+
break;
|
|
3075
|
+
case OPS.restore:
|
|
3076
|
+
if (stateStack.length > 0) {
|
|
3077
|
+
const state = stateStack.pop();
|
|
3078
|
+
ctm = state.ctm;
|
|
3079
|
+
lineWidth = state.lineWidth;
|
|
3080
|
+
}
|
|
3081
|
+
break;
|
|
3082
|
+
case OPS.transform: {
|
|
3083
|
+
const m = args;
|
|
3084
|
+
if (m.length >= 6) {
|
|
3085
|
+
ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
|
|
3086
|
+
}
|
|
3087
|
+
break;
|
|
3088
|
+
}
|
|
2932
3089
|
case OPS.setLineWidth:
|
|
2933
3090
|
lineWidth = args[0] || 1;
|
|
2934
3091
|
break;
|
|
3092
|
+
// ── Path Construction ──
|
|
2935
3093
|
case OPS.constructPath: {
|
|
2936
3094
|
const arg0 = args[0];
|
|
2937
3095
|
if (Array.isArray(arg0)) {
|
|
@@ -2999,34 +3157,60 @@ function extractLines(fnArray, argsArray) {
|
|
|
2999
3157
|
}
|
|
3000
3158
|
}
|
|
3001
3159
|
}
|
|
3002
|
-
|
|
3003
|
-
|
|
3004
|
-
|
|
3005
|
-
|
|
3160
|
+
const isStroke5 = afterOp === OPS.stroke || afterOp === OPS.closeStroke;
|
|
3161
|
+
const isFill5 = afterOp === OPS.fill || afterOp === OPS.eoFill;
|
|
3162
|
+
const isBoth5 = afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke;
|
|
3163
|
+
if (isStroke5 || isFill5 || isBoth5) {
|
|
3164
|
+
flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
|
|
3006
3165
|
} else if (afterOp === OPS.endPath) {
|
|
3007
|
-
flushPath(false);
|
|
3166
|
+
flushPath(false, false);
|
|
3008
3167
|
}
|
|
3009
3168
|
}
|
|
3010
3169
|
break;
|
|
3011
3170
|
}
|
|
3171
|
+
// ── Paint Operations ──
|
|
3012
3172
|
case OPS.stroke:
|
|
3013
3173
|
case OPS.closeStroke:
|
|
3014
|
-
flushPath(true);
|
|
3174
|
+
flushPath(true, false);
|
|
3015
3175
|
break;
|
|
3016
3176
|
case OPS.fill:
|
|
3017
3177
|
case OPS.eoFill:
|
|
3178
|
+
flushPath(false, true);
|
|
3179
|
+
break;
|
|
3018
3180
|
case OPS.fillStroke:
|
|
3019
3181
|
case OPS.eoFillStroke:
|
|
3020
3182
|
case OPS.closeFillStroke:
|
|
3021
3183
|
case OPS.closeEOFillStroke:
|
|
3022
|
-
flushPath(true);
|
|
3184
|
+
flushPath(true, true);
|
|
3023
3185
|
break;
|
|
3024
3186
|
case OPS.endPath:
|
|
3025
|
-
flushPath(false);
|
|
3187
|
+
flushPath(false, false);
|
|
3026
3188
|
break;
|
|
3027
3189
|
}
|
|
3028
3190
|
}
|
|
3029
|
-
return {
|
|
3191
|
+
return {
|
|
3192
|
+
horizontals: deduplicateLines(horizontals),
|
|
3193
|
+
verticals: deduplicateLines(verticals)
|
|
3194
|
+
};
|
|
3195
|
+
}
|
|
3196
|
+
/**
 * Collapses line segments that coincide within `COORD_MERGE_TOL` on all four
 * coordinates. The first occurrence is kept; when a later duplicate has a
 * larger `lineWidth`, the kept segment's `lineWidth` is raised to it.
 *
 * @param {Array<{x1:number,y1:number,x2:number,y2:number,lineWidth:number}>} lines
 * @returns {Array<object>} The input array itself when it has at most one
 *   element, otherwise a new array of the surviving segment objects.
 */
function deduplicateLines(lines) {
  if (lines.length <= 1) return lines;

  const tol = COORD_MERGE_TOL;
  const near = (p, q) => Math.abs(p - q) <= tol;
  const kept = [];

  for (const candidate of lines) {
    const twin = kept.find(
      (seen) =>
        near(candidate.y1, seen.y1) &&
        near(candidate.y2, seen.y2) &&
        near(candidate.x1, seen.x1) &&
        near(candidate.x2, seen.x2)
    );
    if (twin === undefined) {
      kept.push(candidate);
      continue;
    }
    // Keep the widest stroke seen for this position.
    if (candidate.lineWidth > twin.lineWidth) {
      twin.lineWidth = candidate.lineWidth;
    }
  }
  return kept;
}
|
|
3031
3215
|
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
3032
3216
|
const dx = Math.abs(seg.x2 - seg.x1);
|
|
@@ -3622,6 +3806,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3622
3806
|
const medianFontSize = computeMedianFontSize(allFontSizes);
|
|
3623
3807
|
if (medianFontSize > 0) {
|
|
3624
3808
|
detectHeadings(blocks, medianFontSize);
|
|
3809
|
+
mergeAdjacentHeadings(blocks);
|
|
3625
3810
|
}
|
|
3626
3811
|
detectMarkerHeadings(blocks);
|
|
3627
3812
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
@@ -3696,6 +3881,46 @@ function detectHeadings(blocks, medianFontSize) {
|
|
|
3696
3881
|
}
|
|
3697
3882
|
}
|
|
3698
3883
|
}
|
|
3884
|
+
/**
 * Merges neighbouring heading blocks that sit on the same line, in place.
 *
 * Two consecutive blocks are merged when both are headings with a `bbox` and
 * non-empty `text`, share `bbox.page` and `level`, and their reference y
 * values (`bbox.y` plus the font size, or plus `bbox.height` when no font
 * size is set) differ by less than 1.5x the larger font size (a missing font
 * size counts as 12). The block with the smaller `bbox.x` contributes the
 * leading text. The first block receives the joined text and a bbox covering
 * both; the second block is removed from `blocks`. The merged block is then
 * compared against its new neighbour, so runs of fragments collapse into one.
 *
 * @param {Array<object>} blocks - Blocks to scan; mutated in place.
 * @returns {void}
 */
function mergeAdjacentHeadings(blocks) {
  const referenceY = (blk) => blk.bbox.y + (blk.style?.fontSize || blk.bbox.height);
  const fontSizeOf = (blk) => blk.style?.fontSize || 12;

  const onSameLine = (first, second) => {
    if (first.type !== "heading" || second.type !== "heading") return false;
    if (!first.bbox || !second.bbox || !first.text || !second.text) return false;
    const distance = Math.abs(referenceY(first) - referenceY(second));
    const limit = Math.max(fontSizeOf(first), fontSizeOf(second)) * 1.5;
    const sameRow = first.bbox.page === second.bbox.page && distance < limit;
    return sameRow && first.level === second.level;
  };

  let idx = 0;
  while (idx < blocks.length - 1) {
    const head = blocks[idx];
    const tail = blocks[idx + 1];
    if (!onSameLine(head, tail)) {
      idx += 1;
      continue;
    }

    const headBox = head.bbox;
    const tailBox = tail.bbox;
    const headLeads = headBox.x <= tailBox.x;
    head.text = headLeads ? head.text + " " + tail.text : tail.text + " " + head.text;

    const leftEdge = Math.min(headBox.x, tailBox.x);
    const rightEdge = Math.max(headBox.x + headBox.width, tailBox.x + tailBox.width);
    head.bbox = {
      page: headBox.page,
      x: leftEdge,
      y: Math.min(headBox.y, tailBox.y),
      width: rightEdge - leftEdge,
      height: Math.max(headBox.height, tailBox.height)
    };
    // idx is not advanced: the merged block is re-checked against what follows.
    blocks.splice(idx + 1, 1);
  }
}
|
|
3699
3924
|
function collapseEvenSpacing(text) {
|
|
3700
3925
|
const tokens = text.split(" ");
|
|
3701
3926
|
const singleCharCount = tokens.filter((t) => t.length === 1).length;
|
|
@@ -3704,6 +3929,169 @@ function collapseEvenSpacing(text) {
|
|
|
3704
3929
|
}
|
|
3705
3930
|
return text;
|
|
3706
3931
|
}
|
|
3932
|
+
/**
 * Builds paragraph blocks for one page from the groups that `xyCutOrder`
 * returns.
 *
 * The gap threshold handed to `xyCutOrder` is 3% of the spread between the
 * smallest and largest item `y`, but never less than 15. Each group is split
 * into lines with `groupByY`; every line whose merged text is not blank
 * becomes one paragraph block with a bbox and a dominant style.
 *
 * @param {Array<object>} items - Text items of the page (each with a `y`).
 * @param {number} pageNum - Page number stored on each produced block.
 * @returns {Array<object>|null} The paragraph blocks, or `null` when none
 *   were produced.
 */
function buildXyCutBlocks(items, pageNum) {
  const ys = items.map((item) => item.y);
  const ySpread = Math.max(...ys) - Math.min(...ys);
  const gapThreshold = Math.max(15, ySpread * 0.03);

  const paragraphs = [];
  for (const group of xyCutOrder(items, gapThreshold)) {
    if (group.length === 0) continue;
    for (const line of groupByY(group)) {
      const text = mergeLineSimple(line);
      if (text.trim()) {
        paragraphs.push({
          type: "paragraph",
          text,
          pageNumber: pageNum,
          bbox: computeBBox(line, pageNum),
          style: dominantStyle(line)
        });
      }
    }
  }
  return paragraphs.length > 0 ? paragraphs : null;
}
|
|
3955
|
+
/**
 * Re-derives the content of a grid table that appears under-segmented, i.e.
 * one that swallowed far more text than its cell count suggests.
 *
 * A table is treated as under-segmented when it is 1x1, or when it has at
 * most two non-empty cells and either at least eight text lines across those
 * cells or at least six source items. For such a table the function tries,
 * in order:
 *   1. `buildXyCutBlocks` when `hasMultiColumnLayout(items)` is true,
 *   2. `buildTableFromTextLayout`,
 *   3. `detectClusterTables`, emitting the detected tables plus one paragraph
 *      per unused non-blank item, sorted by descending `bbox.y + bbox.height`.
 *
 * @param {{rows:number, cols:number, cells:Array<Array<{text:string}>>}} table
 * @param {Array<object>} items - Text items that fall inside the table.
 * @param {number} pageNum - Page number stored on produced blocks.
 * @param {object} bbox - Bounding box forwarded to `buildTableFromTextLayout`.
 * @returns {Array<object>|null} Replacement blocks, or `null` when the table
 *   should be kept or no replacement could be built. When the multi-column
 *   path is taken, the result of `buildXyCutBlocks` is returned as-is.
 */
function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
  let filledCells = 0;
  let textLines = 0;
  table.cells.forEach((row) => {
    row.forEach((cell) => {
      if (!cell.text.trim()) return;
      filledCells += 1;
      textLines += cell.text.split("\n").length;
    });
  });

  const singleCell = table.rows === 1 && table.cols === 1;
  const sparse = filledCells <= 2;
  const underSegmented = singleCell || (sparse && textLines >= 8) || (sparse && items.length >= 6);
  if (!underSegmented) return null;

  if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);

  const rebuilt = buildTableFromTextLayout(items, pageNum, bbox);
  if (rebuilt) return rebuilt;

  const clusterItems = items.map(({ text, x, y, w, h, fontSize, fontName }) => ({
    text,
    x,
    y,
    w,
    h,
    fontSize,
    fontName
  }));
  const clusters = detectClusterTables(clusterItems, pageNum);
  if (!(clusters.length > 0)) return null;

  // clusterItems[k] mirrors items[k], so a used cluster item marks items[k] as consumed.
  const positionOf = new Map(clusterItems.map((entry, idx) => [entry, idx]));
  const consumed = new Set();
  const out = [];
  for (const cluster of clusters) {
    for (const used of cluster.usedItems) {
      const idx = positionOf.get(used);
      if (idx !== undefined) consumed.add(idx);
    }
    out.push({ type: "table", table: cluster.table, pageNumber: pageNum, bbox: cluster.bbox });
  }

  items.forEach((item, idx) => {
    if (consumed.has(idx)) return;
    if (!item.text.trim()) return;
    out.push({
      type: "paragraph",
      text: item.text,
      pageNumber: pageNum,
      bbox: computeBBox([item], pageNum),
      style: { fontSize: item.fontSize, fontName: item.fontName }
    });
  });

  const sortKey = (blk) => (blk.bbox ? blk.bbox.y + blk.bbox.height : 0);
  out.sort((a, b) => {
    const keyA = sortKey(a);
    const keyB = sortKey(b);
    return keyB - keyA;
  });
  return out.length > 0 ? out : null;
}
|
|
4005
|
+
/**
 * Tries to reconstruct a table purely from the positions of text items.
 *
 * Steps:
 *   1. Sort items by descending `y` (then ascending `x`) and group them into
 *      rows; an item joins the current row while its `y` is within 3 units of
 *      the row's first item.
 *   2. In every row with two or more items, record the midpoint of each
 *      horizontal gap that is at least 1.5x the row's average font size.
 *   3. Cluster the sorted midpoints (within 15 units of the running cluster
 *      mean); clusters with at least two members become column boundaries.
 *   4. Assign each item to a column by its horizontal centre. A row whose
 *      first column is empty is merged into the previous row as a
 *      continuation.
 *   5. Give up unless at least two rows remain and at least 30% of the cells
 *      hold text.
 *
 * @param {Array<{text:string,x:number,y:number,w:number,fontSize:number}>} items
 * @param {number} pageNum - Page number stored on the produced block.
 * @param {object} bbox - Bounding box stored on the produced block.
 * @returns {Array<object>|null} A one-element array holding the table block,
 *   or `null` when no table could be derived.
 */
function buildTableFromTextLayout(items, pageNum, bbox) {
  if (items.length < 4) return null;

  const ROW_Y_TOLERANCE = 3;
  const BOUNDARY_CLUSTER_RADIUS = 15;
  const byX = (a, b) => a.x - b.x;

  // 1. Group items into rows.
  const ordered = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
  const rows = [];
  let anchorY;
  for (const item of ordered) {
    const current = rows[rows.length - 1];
    if (current !== undefined && Math.abs(item.y - anchorY) <= ROW_Y_TOLERANCE) {
      current.push(item);
    } else {
      rows.push([item]);
      anchorY = item.y;
    }
  }
  if (rows.length < 2) return null;

  // 2. Collect midpoints of wide horizontal gaps.
  const gapCenters = [];
  for (const row of rows) {
    if (row.length < 2) continue;
    const line = [...row].sort(byX);
    let fontTotal = 0;
    for (const item of line) fontTotal += item.fontSize;
    const averageFont = fontTotal / line.length;
    for (let j = 1; j < line.length; j++) {
      const prevRight = line[j - 1].x + line[j - 1].w;
      const gap = line[j].x - prevRight;
      if (gap >= averageFont * 1.5) gapCenters.push(prevRight + gap / 2);
    }
  }
  if (gapCenters.length < 2) return null;
  gapCenters.sort((a, b) => a - b);

  // 3. Cluster midpoints into column boundaries; a boundary needs support
  //    from at least two gaps.
  const boundaries = [];
  let runSum = gapCenters[0];
  let runCount = 1;
  const closeRun = () => {
    if (runCount >= 2) boundaries.push(runSum / runCount);
  };
  for (let k = 1; k < gapCenters.length; k++) {
    const center = gapCenters[k];
    if (Math.abs(center - runSum / runCount) <= BOUNDARY_CLUSTER_RADIUS) {
      runSum += center;
      runCount += 1;
    } else {
      closeRun();
      runSum = center;
      runCount = 1;
    }
  }
  closeRun();
  if (boundaries.length === 0) return null;
  const numCols = boundaries.length + 1;

  // Column index = one past the last boundary lying left of the item's centre.
  const columnOf = (item) => {
    const center = item.x + item.w / 2;
    for (let b = boundaries.length - 1; b >= 0; b--) {
      if (center > boundaries[b]) return b + 1;
    }
    return 0;
  };

  // 4. Fill the grid, folding continuation rows into the row above.
  const grid = [];
  for (const row of rows) {
    const cells = new Array(numCols).fill("");
    for (const item of [...row].sort(byX)) {
      const col = columnOf(item);
      cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
    }
    if (cells[0].trim() === "" && grid.length > 0) {
      const above = grid[grid.length - 1];
      cells.forEach((value, c) => {
        const extra = value.trim();
        if (extra) above[c] = above[c] ? above[c] + " " + extra : extra;
      });
    } else {
      grid.push(cells);
    }
  }
  if (grid.length < 2) return null;

  // 5. Reject grids that are mostly empty.
  let filled = 0;
  for (const cells of grid) {
    for (const value of cells) {
      if (value.trim()) filled += 1;
    }
  }
  const totalCount = grid.length * numCols;
  if (filled < totalCount * 0.3) return null;

  const irCells = grid.map((cells) =>
    cells.map((value, colIdx) => {
      const trimmed = value.trim();
      // Leading bullet markers are stripped from every column except the first.
      const text = colIdx > 0 ? trimmed.replace(/^[•○·\-]\s*/, "") : trimmed;
      return { text, colSpan: 1, rowSpan: 1 };
    })
  );

  const irTable = {
    rows: grid.length,
    cols: numCols,
    cells: irCells,
    hasHeader: grid.length > 1
  };
  return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
}
|
|
3707
4095
|
function shouldDemoteTable(table) {
|
|
3708
4096
|
const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
|
|
3709
4097
|
const allText = allCells.join(" ");
|
|
@@ -3750,6 +4138,32 @@ function detectMarkerHeadings(blocks) {
|
|
|
3750
4138
|
}
|
|
3751
4139
|
}
|
|
3752
4140
|
}
|
|
4141
|
+
/**
 * Heuristically decides whether the text items form two side-by-side columns.
 *
 * Returns `true` only when all of the following hold:
 *   - there are at least 30 items,
 *   - the horizontal extent of the items is at least 200 units,
 *   - the widest gap between x-sorted neighbours is at least 20 units,
 *   - the middle of that gap lies between 35% and 65% of the extent,
 *   - each side of the gap holds at least 15 items (by horizontal centre),
 *   - the smaller side holds at least 35% as many items as the larger side.
 *
 * @param {Array<{x:number,w:number}>} items - Text items of one region.
 * @returns {boolean} Whether a two-column layout was detected.
 */
function hasMultiColumnLayout(items) {
  if (items.length < 30) return false;

  const byLeft = [...items].sort((a, b) => a.x - b.x);
  const leftMost = byLeft[0].x;
  const rightMost = byLeft.reduce(
    (edge, item) => (item.x + item.w > edge ? item.x + item.w : edge),
    leftMost
  );
  const extent = rightMost - leftMost;
  if (extent < 200) return false;

  // Widest gap between each item's right edge and the next item's left edge.
  let widestGap = 0;
  let gutter = 0;
  for (let k = 1; k < byLeft.length; k++) {
    const prevRight = byLeft[k - 1].x + byLeft[k - 1].w;
    const nextLeft = byLeft[k].x;
    const gap = nextLeft - prevRight;
    if (gap > widestGap) {
      widestGap = gap;
      gutter = (prevRight + nextLeft) / 2;
    }
  }
  if (widestGap < 20) return false;

  const gutterRatio = (gutter - leftMost) / extent;
  if (gutterRatio < 0.35 || gutterRatio > 0.65) return false;

  let leftCount = 0;
  let rightCount = 0;
  items.forEach((item) => {
    const center = item.x + item.w / 2;
    if (center < gutter) leftCount += 1;
    else if (center >= gutter) rightCount += 1;
  });
  if (leftCount < 15 || rightCount < 15) return false;

  const balance = Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount);
  return !(balance < 0.35);
}
|
|
3753
4167
|
var MAX_XYCUT_DEPTH = 50;
|
|
3754
4168
|
function xyCutOrder(items, gapThreshold, depth = 0) {
|
|
3755
4169
|
if (items.length === 0) return [];
|
|
@@ -3880,6 +4294,11 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
3880
4294
|
width: grid.bbox.x2 - grid.bbox.x1,
|
|
3881
4295
|
height: grid.bbox.y2 - grid.bbox.y1
|
|
3882
4296
|
};
|
|
4297
|
+
const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
|
|
4298
|
+
if (normalized) {
|
|
4299
|
+
blocks.push(...normalized);
|
|
4300
|
+
continue;
|
|
4301
|
+
}
|
|
3883
4302
|
if (shouldDemoteTable(irTable)) {
|
|
3884
4303
|
const demoted = demoteTableToText(irTable);
|
|
3885
4304
|
if (demoted) {
|
|
@@ -3925,6 +4344,10 @@ function mergeAdjacentTableBlocks(blocks) {
|
|
|
3925
4344
|
}
|
|
3926
4345
|
function extractPageBlocksFallback(items, pageNum) {
|
|
3927
4346
|
if (items.length === 0) return [];
|
|
4347
|
+
if (hasMultiColumnLayout(items)) {
|
|
4348
|
+
const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
|
|
4349
|
+
return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
|
|
4350
|
+
}
|
|
3928
4351
|
const blocks = [];
|
|
3929
4352
|
const allYLines = groupByY(items);
|
|
3930
4353
|
const columns = detectColumns(allYLines);
|
|
@@ -3942,7 +4365,7 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
3942
4365
|
fontSize: i.fontSize,
|
|
3943
4366
|
fontName: i.fontName
|
|
3944
4367
|
}));
|
|
3945
|
-
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4368
|
+
const clusterResults = hasMultiColumnLayout(items) ? [] : detectClusterTables(clusterItems, pageNum);
|
|
3946
4369
|
if (clusterResults.length > 0) {
|
|
3947
4370
|
const ciToIdx = /* @__PURE__ */ new Map();
|
|
3948
4371
|
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|