kordoc 2.1.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +318 -302
- package/dist/{chunk-MOL7MDBG.js → chunk-3TBUDJDE.js} +1 -1
- package/dist/chunk-3TBUDJDE.js.map +1 -0
- package/dist/{chunk-GJ2S6IMC.js → chunk-FINXMRCH.js} +978 -586
- package/dist/chunk-FINXMRCH.js.map +1 -0
- package/dist/chunk-MUAWCQDY.js +52 -0
- package/dist/chunk-MUAWCQDY.js.map +1 -0
- package/dist/cli.js +13 -9
- package/dist/cli.js.map +1 -1
- package/dist/detect-63IGCXTH.js +18 -0
- package/dist/index.cjs +1003 -553
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +1003 -553
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +41 -12
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-OF5I4PQY.js +8 -0
- package/dist/{watch-X7IC7MLF.js → watch-Q6L4UBTC.js} +32 -16
- package/dist/watch-Q6L4UBTC.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-GJ2S6IMC.js.map +0 -1
- package/dist/chunk-MOL7MDBG.js.map +0 -1
- package/dist/chunk-PKIJLEV6.js +0 -93
- package/dist/chunk-PKIJLEV6.js.map +0 -1
- package/dist/page-range-737B4EZW.js +0 -8
- package/dist/utils-BWQ2RGUD.js +0 -22
- package/dist/watch-X7IC7MLF.js.map +0 -1
- /package/dist/{page-range-737B4EZW.js.map → detect-63IGCXTH.js.map} +0 -0
- /package/dist/{utils-BWQ2RGUD.js.map → page-range-OF5I4PQY.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -183,7 +183,7 @@ var import_zlib = require("zlib");
|
|
|
183
183
|
var import_xmldom = require("@xmldom/xmldom");
|
|
184
184
|
|
|
185
185
|
// src/utils.ts
|
|
186
|
-
var VERSION = true ? "2.1.0" : "0.0.0-dev";
|
|
186
|
+
var VERSION = true ? "2.2.1" : "0.0.0-dev";
|
|
187
187
|
function toArrayBuffer(buf) {
|
|
188
188
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
189
189
|
return buf.buffer;
|
|
@@ -199,7 +199,8 @@ var KordocError = class extends Error {
|
|
|
199
199
|
function isPathTraversal(name) {
|
|
200
200
|
if (name.includes("\0")) return true;
|
|
201
201
|
const normalized = name.replace(/\\/g, "/");
|
|
202
|
-
|
|
202
|
+
const segments = normalized.split("/");
|
|
203
|
+
return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
203
204
|
}
|
|
204
205
|
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
205
206
|
try {
|
|
@@ -239,12 +240,25 @@ function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEnt
|
|
|
239
240
|
return { totalUncompressed: 0, entryCount: 0 };
|
|
240
241
|
}
|
|
241
242
|
}
|
|
243
|
+
function stripDtd(xml) {
|
|
244
|
+
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
245
|
+
}
|
|
242
246
|
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
243
247
|
function sanitizeHref(href) {
|
|
244
248
|
const trimmed = href.trim();
|
|
245
249
|
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
246
250
|
return trimmed;
|
|
247
251
|
}
|
|
252
|
+
function safeMin(arr) {
|
|
253
|
+
let min = Infinity;
|
|
254
|
+
for (let i = 0; i < arr.length; i++) if (arr[i] < min) min = arr[i];
|
|
255
|
+
return min;
|
|
256
|
+
}
|
|
257
|
+
function safeMax(arr) {
|
|
258
|
+
let max = -Infinity;
|
|
259
|
+
for (let i = 0; i < arr.length; i++) if (arr[i] > max) max = arr[i];
|
|
260
|
+
return max;
|
|
261
|
+
}
|
|
248
262
|
function classifyError(err) {
|
|
249
263
|
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
250
264
|
const msg = err.message;
|
|
@@ -319,6 +333,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
319
333
|
if (end > maxCols) maxCols = end;
|
|
320
334
|
}
|
|
321
335
|
}
|
|
336
|
+
if (maxCols > MAX_COLS) maxCols = MAX_COLS;
|
|
322
337
|
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
323
338
|
const grid = Array.from(
|
|
324
339
|
{ length: numRows },
|
|
@@ -328,7 +343,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
328
343
|
for (const cell of row) {
|
|
329
344
|
const r = cell.rowAddr ?? 0;
|
|
330
345
|
const c = cell.colAddr ?? 0;
|
|
331
|
-
if (r >= numRows || c >= maxCols) continue;
|
|
346
|
+
if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
|
|
332
347
|
grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
|
|
333
348
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
334
349
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
@@ -357,9 +372,12 @@ function trimAndReturn(grid, numRows, maxCols) {
|
|
|
357
372
|
}
|
|
358
373
|
function convertTableToText(rows) {
|
|
359
374
|
return rows.map(
|
|
360
|
-
(row) => row.map((c) => c.text.trim().replace(/\n/g, " ")).filter(Boolean).join("
|
|
375
|
+
(row) => row.map((c) => c.text.trim().replace(/\n/g, " ").replace(/\|/g, "\\|")).filter(Boolean).join(" / ")
|
|
361
376
|
).filter(Boolean).join("\n");
|
|
362
377
|
}
|
|
378
|
+
function escapeGfm(text) {
|
|
379
|
+
return text.replace(/~/g, "\\~");
|
|
380
|
+
}
|
|
363
381
|
var HWP_SHAPE_ALT_TEXT_RE = /(?:모서리가 둥근 |둥근 )?(?:사각형|직사각형|정사각형|원|타원|삼각형|이등변 삼각형|직각 삼각형|선|직선|곡선|화살표|굵은 화살표|이중 화살표|오각형|육각형|팔각형|별|[4-8]점별|십자|십자형|구름|구름형|마름모|도넛|평행사변형|사다리꼴|부채꼴|호|반원|물결|번개|하트|빗금|블록 화살표|수식|표|그림|개체|그리기\s?개체|묶음\s?개체|글상자|수식\s?개체|OLE\s?개체)\s?입니다\.?/g;
|
|
364
382
|
function sanitizeText(text) {
|
|
365
383
|
let result = text.replace(/[\u{F0000}-\u{FFFFD}]/gu, "").replace(HWP_SHAPE_ALT_TEXT_RE, "").replace(/ +/g, " ").trim();
|
|
@@ -469,7 +487,7 @@ function blocksToMarkdown(blocks) {
|
|
|
469
487
|
if (block.footnoteText) {
|
|
470
488
|
text += ` (\uC8FC: ${block.footnoteText})`;
|
|
471
489
|
}
|
|
472
|
-
lines.push(text);
|
|
490
|
+
lines.push(escapeGfm(text), "");
|
|
473
491
|
} else if (block.type === "table" && block.table) {
|
|
474
492
|
if (lines.length > 0 && lines[lines.length - 1] !== "") {
|
|
475
493
|
lines.push("");
|
|
@@ -492,13 +510,13 @@ function tableToMarkdown(table) {
|
|
|
492
510
|
return content.split(/\n/).map((line) => {
|
|
493
511
|
const trimmed = line.trim();
|
|
494
512
|
if (!trimmed) return "";
|
|
495
|
-
if (/^\d+\.\s/.test(trimmed)) return `**${trimmed}**`;
|
|
496
|
-
if (/^[가-힣]\.\s/.test(trimmed)) return ` ${trimmed}`;
|
|
497
|
-
return trimmed;
|
|
513
|
+
if (/^\d+\.\s/.test(trimmed)) return `**${escapeGfm(trimmed)}**`;
|
|
514
|
+
if (/^[가-힣]\.\s/.test(trimmed)) return ` ${escapeGfm(trimmed)}`;
|
|
515
|
+
return escapeGfm(trimmed);
|
|
498
516
|
}).filter(Boolean).join("\n");
|
|
499
517
|
}
|
|
500
518
|
if (numCols === 1 && numRows >= 2) {
|
|
501
|
-
return cells.map((row) => sanitizeText(row[0].text).replace(/\n/g, " ")).filter(Boolean).join("\n");
|
|
519
|
+
return cells.map((row) => escapeGfm(sanitizeText(row[0].text)).replace(/\n/g, " ")).filter(Boolean).join("\n");
|
|
502
520
|
}
|
|
503
521
|
const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
|
|
504
522
|
const skip = /* @__PURE__ */ new Set();
|
|
@@ -507,15 +525,12 @@ function tableToMarkdown(table) {
|
|
|
507
525
|
if (skip.has(`${r},${c}`)) continue;
|
|
508
526
|
const cell = cells[r]?.[c];
|
|
509
527
|
if (!cell) continue;
|
|
510
|
-
display[r][c] = sanitizeText(cell.text).replace(/\n/g, "<br>");
|
|
528
|
+
display[r][c] = escapeGfm(sanitizeText(cell.text)).replace(/\|/g, "\\|").replace(/\n/g, "<br>");
|
|
511
529
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
512
530
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
513
531
|
if (dr === 0 && dc === 0) continue;
|
|
514
532
|
if (r + dr < numRows && c + dc < numCols) {
|
|
515
533
|
skip.add(`${r + dr},${c + dc}`);
|
|
516
|
-
if (dr === 0) {
|
|
517
|
-
display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
|
|
518
|
-
}
|
|
519
534
|
}
|
|
520
535
|
}
|
|
521
536
|
}
|
|
@@ -651,9 +666,6 @@ function parseStyleElements(doc, map) {
|
|
|
651
666
|
}
|
|
652
667
|
}
|
|
653
668
|
}
|
|
654
|
-
function stripDtd(xml) {
|
|
655
|
-
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
656
|
-
}
|
|
657
669
|
async function parseHwpxDocument(buffer, options) {
|
|
658
670
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
659
671
|
let zip;
|
|
@@ -1003,7 +1015,8 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
1003
1015
|
if (newTable.rows.length > 0) {
|
|
1004
1016
|
if (tableStack.length > 0) {
|
|
1005
1017
|
const parentTable = tableStack.pop();
|
|
1006
|
-
|
|
1018
|
+
let nestedCols = 0;
|
|
1019
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
1007
1020
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
1008
1021
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
1009
1022
|
} else {
|
|
@@ -1112,7 +1125,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1112
1125
|
if (newTable.rows.length > 0) {
|
|
1113
1126
|
if (tableStack.length > 0) {
|
|
1114
1127
|
const parentTable = tableStack.pop();
|
|
1115
|
-
|
|
1128
|
+
let nestedCols = 0;
|
|
1129
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
1116
1130
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
1117
1131
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
1118
1132
|
} else {
|
|
@@ -2210,6 +2224,7 @@ function parseLenientCfb(data) {
|
|
|
2210
2224
|
if (miniSectorSizeShift > 16) throw new Error("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uBBF8\uB2C8 \uC139\uD130 \uD06C\uAE30 \uC2DC\uD504\uD2B8: " + miniSectorSizeShift);
|
|
2211
2225
|
const miniSectorSize = 1 << miniSectorSizeShift;
|
|
2212
2226
|
const fatSectorCount = data.readUInt32LE(44);
|
|
2227
|
+
if (fatSectorCount > 1e4) throw new Error("FAT \uC139\uD130 \uC218\uAC00 \uB108\uBB34 \uB9CE\uC2B5\uB2C8\uB2E4: " + fatSectorCount);
|
|
2213
2228
|
const firstDirSector = data.readUInt32LE(48);
|
|
2214
2229
|
const miniStreamCutoff = data.readUInt32LE(56);
|
|
2215
2230
|
const firstMiniFatSector = data.readUInt32LE(60);
|
|
@@ -2586,10 +2601,14 @@ function findSections(cfb) {
|
|
|
2586
2601
|
}
|
|
2587
2602
|
function findSectionsLenient(lcfb, compressed) {
|
|
2588
2603
|
const sections = [];
|
|
2604
|
+
let totalDecompressed = 0;
|
|
2589
2605
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2590
2606
|
const raw = lcfb.findStream(`/BodyText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2591
2607
|
if (!raw) break;
|
|
2592
|
-
|
|
2608
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2609
|
+
totalDecompressed += content.length;
|
|
2610
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2611
|
+
sections.push({ idx: i, content });
|
|
2593
2612
|
}
|
|
2594
2613
|
if (sections.length === 0) {
|
|
2595
2614
|
for (const e of lcfb.entries()) {
|
|
@@ -2597,7 +2616,12 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2597
2616
|
if (e.name.startsWith("Section")) {
|
|
2598
2617
|
const idx = parseInt(e.name.replace("Section", ""), 10) || 0;
|
|
2599
2618
|
const raw = lcfb.findStream(e.name);
|
|
2600
|
-
if (raw)
|
|
2619
|
+
if (raw) {
|
|
2620
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2621
|
+
totalDecompressed += content.length;
|
|
2622
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2623
|
+
sections.push({ idx, content });
|
|
2624
|
+
}
|
|
2601
2625
|
}
|
|
2602
2626
|
}
|
|
2603
2627
|
}
|
|
@@ -2605,11 +2629,15 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2605
2629
|
}
|
|
2606
2630
|
function findViewTextSectionsLenient(lcfb, compressed) {
|
|
2607
2631
|
const sections = [];
|
|
2632
|
+
let totalDecompressed = 0;
|
|
2608
2633
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2609
2634
|
const raw = lcfb.findStream(`/ViewText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2610
2635
|
if (!raw) break;
|
|
2611
2636
|
try {
|
|
2612
|
-
|
|
2637
|
+
const content = decryptViewText(raw, compressed);
|
|
2638
|
+
totalDecompressed += content.length;
|
|
2639
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2640
|
+
sections.push({ idx: i, content });
|
|
2613
2641
|
} catch {
|
|
2614
2642
|
break;
|
|
2615
2643
|
}
|
|
@@ -3011,37 +3039,18 @@ init_page_range();
|
|
|
3011
3039
|
// src/pdf/line-detector.ts
|
|
3012
3040
|
var import_pdf = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
3013
3041
|
var ORIENTATION_TOL = 2;
|
|
3014
|
-
var MIN_LINE_LENGTH =
|
|
3015
|
-
var
|
|
3042
|
+
var MIN_LINE_LENGTH = 15;
|
|
3043
|
+
var MAX_LINE_WIDTH = 5;
|
|
3016
3044
|
var CONNECT_TOL = 5;
|
|
3017
3045
|
var CELL_PADDING = 2;
|
|
3018
|
-
var
|
|
3019
|
-
var
|
|
3020
|
-
|
|
3021
|
-
|
|
3022
|
-
m1[0] * m2[0] + m1[2] * m2[1],
|
|
3023
|
-
m1[1] * m2[0] + m1[3] * m2[1],
|
|
3024
|
-
m1[0] * m2[2] + m1[2] * m2[3],
|
|
3025
|
-
m1[1] * m2[2] + m1[3] * m2[3],
|
|
3026
|
-
m1[0] * m2[4] + m1[2] * m2[5] + m1[4],
|
|
3027
|
-
m1[1] * m2[4] + m1[3] * m2[5] + m1[5]
|
|
3028
|
-
];
|
|
3029
|
-
}
|
|
3030
|
-
function matTransformPoint(m, x, y) {
|
|
3031
|
-
return [m[0] * x + m[2] * y + m[4], m[1] * x + m[3] * y + m[5]];
|
|
3032
|
-
}
|
|
3033
|
-
function matScale(m) {
|
|
3034
|
-
return Math.max(
|
|
3035
|
-
Math.sqrt(m[1] * m[1] + m[3] * m[3]),
|
|
3036
|
-
Math.sqrt(m[0] * m[0] + m[2] * m[2])
|
|
3037
|
-
);
|
|
3038
|
-
}
|
|
3046
|
+
var MIN_COL_WIDTH = 15;
|
|
3047
|
+
var MIN_ROW_HEIGHT = 6;
|
|
3048
|
+
var VERTEX_MERGE_FACTOR = 4;
|
|
3049
|
+
var MIN_COORD_MERGE_TOL = 8;
|
|
3039
3050
|
function extractLines(fnArray, argsArray) {
|
|
3040
3051
|
const horizontals = [];
|
|
3041
3052
|
const verticals = [];
|
|
3042
|
-
let ctm = [...IDENTITY];
|
|
3043
3053
|
let lineWidth = 1;
|
|
3044
|
-
const stateStack = [];
|
|
3045
3054
|
let currentPath = [];
|
|
3046
3055
|
let pathStartX = 0, pathStartY = 0;
|
|
3047
3056
|
let curX = 0, curY = 0;
|
|
@@ -3059,53 +3068,13 @@ function extractLines(fnArray, argsArray) {
|
|
|
3059
3068
|
);
|
|
3060
3069
|
}
|
|
3061
3070
|
}
|
|
3062
|
-
function
|
|
3063
|
-
if (
|
|
3064
|
-
const first = path[0], last = path[path.length - 1];
|
|
3065
|
-
const closed = Math.abs(first.x1 - last.x2) < 1 && Math.abs(first.y1 - last.y2) < 1;
|
|
3066
|
-
if (!closed) return false;
|
|
3067
|
-
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
3068
|
-
for (const seg of path) {
|
|
3069
|
-
minX = Math.min(minX, seg.x1, seg.x2);
|
|
3070
|
-
minY = Math.min(minY, seg.y1, seg.y2);
|
|
3071
|
-
maxX = Math.max(maxX, seg.x1, seg.x2);
|
|
3072
|
-
maxY = Math.max(maxY, seg.y1, seg.y2);
|
|
3073
|
-
}
|
|
3074
|
-
const w = maxX - minX, h = maxY - minY;
|
|
3075
|
-
if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;
|
|
3076
|
-
path.length = 0;
|
|
3077
|
-
if (h < ORIENTATION_TOL * 2 || w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH) {
|
|
3078
|
-
path.push({ x1: minX, y1: (minY + maxY) / 2, x2: maxX, y2: (minY + maxY) / 2 });
|
|
3079
|
-
} else if (w < ORIENTATION_TOL * 2 || h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH) {
|
|
3080
|
-
path.push({ x1: (minX + maxX) / 2, y1: minY, x2: (minX + maxX) / 2, y2: maxY });
|
|
3081
|
-
} else {
|
|
3082
|
-
pushRectangle(path, minX, minY, w, h);
|
|
3083
|
-
}
|
|
3084
|
-
return true;
|
|
3085
|
-
}
|
|
3086
|
-
function flushPath(isStroke, isFill) {
|
|
3087
|
-
if (!isStroke && !isFill) {
|
|
3088
|
-
currentPath = [];
|
|
3089
|
-
return;
|
|
3090
|
-
}
|
|
3091
|
-
if (isFill && !isStroke && currentPath.length >= 3) {
|
|
3092
|
-
tryConvertLinesToRectangle(currentPath);
|
|
3093
|
-
}
|
|
3094
|
-
const scale = matScale(ctm);
|
|
3095
|
-
const effectiveLW = lineWidth * scale;
|
|
3096
|
-
if (effectiveLW > MAX_LINE_WIDTH && isStroke && !isFill) {
|
|
3071
|
+
function flushPath(isStroke) {
|
|
3072
|
+
if (!isStroke) {
|
|
3097
3073
|
currentPath = [];
|
|
3098
3074
|
return;
|
|
3099
3075
|
}
|
|
3100
3076
|
for (const seg of currentPath) {
|
|
3101
|
-
|
|
3102
|
-
const [px2, py2] = matTransformPoint(ctm, seg.x2, seg.y2);
|
|
3103
|
-
classifyAndAdd(
|
|
3104
|
-
{ x1: px1, y1: py1, x2: px2, y2: py2 },
|
|
3105
|
-
effectiveLW,
|
|
3106
|
-
horizontals,
|
|
3107
|
-
verticals
|
|
3108
|
-
);
|
|
3077
|
+
classifyAndAdd(seg, lineWidth, horizontals, verticals);
|
|
3109
3078
|
}
|
|
3110
3079
|
currentPath = [];
|
|
3111
3080
|
}
|
|
@@ -3113,28 +3082,9 @@ function extractLines(fnArray, argsArray) {
|
|
|
3113
3082
|
const op = fnArray[i];
|
|
3114
3083
|
const args = argsArray[i];
|
|
3115
3084
|
switch (op) {
|
|
3116
|
-
// ── Graphics State ──
|
|
3117
|
-
case import_pdf.OPS.save:
|
|
3118
|
-
stateStack.push({ ctm: [...ctm], lineWidth });
|
|
3119
|
-
break;
|
|
3120
|
-
case import_pdf.OPS.restore:
|
|
3121
|
-
if (stateStack.length > 0) {
|
|
3122
|
-
const state = stateStack.pop();
|
|
3123
|
-
ctm = state.ctm;
|
|
3124
|
-
lineWidth = state.lineWidth;
|
|
3125
|
-
}
|
|
3126
|
-
break;
|
|
3127
|
-
case import_pdf.OPS.transform: {
|
|
3128
|
-
const m = args;
|
|
3129
|
-
if (m.length >= 6) {
|
|
3130
|
-
ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
|
|
3131
|
-
}
|
|
3132
|
-
break;
|
|
3133
|
-
}
|
|
3134
3085
|
case import_pdf.OPS.setLineWidth:
|
|
3135
3086
|
lineWidth = args[0] || 1;
|
|
3136
3087
|
break;
|
|
3137
|
-
// ── Path Construction ──
|
|
3138
3088
|
case import_pdf.OPS.constructPath: {
|
|
3139
3089
|
const arg0 = args[0];
|
|
3140
3090
|
if (Array.isArray(arg0)) {
|
|
@@ -3202,60 +3152,34 @@ function extractLines(fnArray, argsArray) {
|
|
|
3202
3152
|
}
|
|
3203
3153
|
}
|
|
3204
3154
|
}
|
|
3205
|
-
|
|
3206
|
-
|
|
3207
|
-
|
|
3208
|
-
|
|
3209
|
-
flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
|
|
3155
|
+
if (afterOp === import_pdf.OPS.stroke || afterOp === import_pdf.OPS.closeStroke) {
|
|
3156
|
+
flushPath(true);
|
|
3157
|
+
} else if (afterOp === import_pdf.OPS.fill || afterOp === import_pdf.OPS.eoFill || afterOp === import_pdf.OPS.fillStroke || afterOp === import_pdf.OPS.eoFillStroke || afterOp === import_pdf.OPS.closeFillStroke || afterOp === import_pdf.OPS.closeEOFillStroke) {
|
|
3158
|
+
flushPath(true);
|
|
3210
3159
|
} else if (afterOp === import_pdf.OPS.endPath) {
|
|
3211
|
-
flushPath(false, false);
|
|
3160
|
+
flushPath(false);
|
|
3212
3161
|
}
|
|
3213
3162
|
}
|
|
3214
3163
|
break;
|
|
3215
3164
|
}
|
|
3216
|
-
// ── Paint Operations ──
|
|
3217
3165
|
case import_pdf.OPS.stroke:
|
|
3218
3166
|
case import_pdf.OPS.closeStroke:
|
|
3219
|
-
flushPath(true, false);
|
|
3167
|
+
flushPath(true);
|
|
3220
3168
|
break;
|
|
3221
3169
|
case import_pdf.OPS.fill:
|
|
3222
3170
|
case import_pdf.OPS.eoFill:
|
|
3223
|
-
flushPath(false, true);
|
|
3224
|
-
break;
|
|
3225
3171
|
case import_pdf.OPS.fillStroke:
|
|
3226
3172
|
case import_pdf.OPS.eoFillStroke:
|
|
3227
3173
|
case import_pdf.OPS.closeFillStroke:
|
|
3228
3174
|
case import_pdf.OPS.closeEOFillStroke:
|
|
3229
|
-
flushPath(true, true);
|
|
3175
|
+
flushPath(true);
|
|
3230
3176
|
break;
|
|
3231
3177
|
case import_pdf.OPS.endPath:
|
|
3232
|
-
flushPath(false, false);
|
|
3233
|
-
break;
|
|
3234
|
-
}
|
|
3235
|
-
}
|
|
3236
|
-
return {
|
|
3237
|
-
horizontals: deduplicateLines(horizontals),
|
|
3238
|
-
verticals: deduplicateLines(verticals)
|
|
3239
|
-
};
|
|
3240
|
-
}
|
|
3241
|
-
function deduplicateLines(lines) {
|
|
3242
|
-
if (lines.length <= 1) return lines;
|
|
3243
|
-
const result = [];
|
|
3244
|
-
const tol = COORD_MERGE_TOL;
|
|
3245
|
-
for (const line of lines) {
|
|
3246
|
-
let isDuplicate = false;
|
|
3247
|
-
for (const existing of result) {
|
|
3248
|
-
if (Math.abs(line.y1 - existing.y1) <= tol && Math.abs(line.y2 - existing.y2) <= tol && Math.abs(line.x1 - existing.x1) <= tol && Math.abs(line.x2 - existing.x2) <= tol) {
|
|
3249
|
-
if (line.lineWidth > existing.lineWidth) {
|
|
3250
|
-
existing.lineWidth = line.lineWidth;
|
|
3251
|
-
}
|
|
3252
|
-
isDuplicate = true;
|
|
3178
|
+
flushPath(false);
|
|
3253
3179
|
break;
|
|
3254
|
-
}
|
|
3255
3180
|
}
|
|
3256
|
-
if (!isDuplicate) result.push(line);
|
|
3257
3181
|
}
|
|
3258
|
-
return result;
|
|
3182
|
+
return { horizontals, verticals };
|
|
3259
3183
|
}
|
|
3260
3184
|
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
3261
3185
|
const dx = Math.abs(seg.x2 - seg.x1);
|
|
@@ -3274,6 +3198,55 @@ function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
|
3274
3198
|
verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
|
|
3275
3199
|
}
|
|
3276
3200
|
}
|
|
3201
|
+
function preprocessLines(horizontals, verticals) {
|
|
3202
|
+
let h = horizontals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3203
|
+
let v = verticals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3204
|
+
h = mergeParallelLines(h, "h");
|
|
3205
|
+
v = mergeParallelLines(v, "v");
|
|
3206
|
+
return { horizontals: h, verticals: v };
|
|
3207
|
+
}
|
|
3208
|
+
function mergeParallelLines(lines, dir) {
|
|
3209
|
+
if (lines.length <= 1) return lines;
|
|
3210
|
+
const sorted = [...lines].sort((a, b) => {
|
|
3211
|
+
const posA = dir === "h" ? a.y1 : a.x1;
|
|
3212
|
+
const posB = dir === "h" ? b.y1 : b.x1;
|
|
3213
|
+
if (Math.abs(posA - posB) > 0.1) return posA - posB;
|
|
3214
|
+
return dir === "h" ? a.x1 - b.x1 : a.y1 - b.y1;
|
|
3215
|
+
});
|
|
3216
|
+
const MERGE_TOL = 3;
|
|
3217
|
+
const result = [sorted[0]];
|
|
3218
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
3219
|
+
const prev = result[result.length - 1];
|
|
3220
|
+
const curr = sorted[i];
|
|
3221
|
+
const prevPos = dir === "h" ? prev.y1 : prev.x1;
|
|
3222
|
+
const currPos = dir === "h" ? curr.y1 : curr.x1;
|
|
3223
|
+
if (Math.abs(prevPos - currPos) <= MERGE_TOL) {
|
|
3224
|
+
const prevStart = dir === "h" ? prev.x1 : prev.y1;
|
|
3225
|
+
const prevEnd = dir === "h" ? prev.x2 : prev.y2;
|
|
3226
|
+
const currStart = dir === "h" ? curr.x1 : curr.y1;
|
|
3227
|
+
const currEnd = dir === "h" ? curr.x2 : curr.y2;
|
|
3228
|
+
const overlap = Math.min(prevEnd, currEnd) - Math.max(prevStart, currStart);
|
|
3229
|
+
const minLen = Math.min(prevEnd - prevStart, currEnd - currStart);
|
|
3230
|
+
if (overlap > minLen * 0.3) {
|
|
3231
|
+
if (dir === "h") {
|
|
3232
|
+
prev.x1 = Math.min(prev.x1, curr.x1);
|
|
3233
|
+
prev.x2 = Math.max(prev.x2, curr.x2);
|
|
3234
|
+
prev.y1 = (prev.y1 + curr.y1) / 2;
|
|
3235
|
+
prev.y2 = prev.y1;
|
|
3236
|
+
} else {
|
|
3237
|
+
prev.y1 = Math.min(prev.y1, curr.y1);
|
|
3238
|
+
prev.y2 = Math.max(prev.y2, curr.y2);
|
|
3239
|
+
prev.x1 = (prev.x1 + curr.x1) / 2;
|
|
3240
|
+
prev.x2 = prev.x1;
|
|
3241
|
+
}
|
|
3242
|
+
prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
|
|
3243
|
+
continue;
|
|
3244
|
+
}
|
|
3245
|
+
}
|
|
3246
|
+
result.push(curr);
|
|
3247
|
+
}
|
|
3248
|
+
return result;
|
|
3249
|
+
}
|
|
3277
3250
|
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
3278
3251
|
const margin = 5;
|
|
3279
3252
|
return {
|
|
@@ -3285,8 +3258,49 @@ function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
|
3285
3258
|
)
|
|
3286
3259
|
};
|
|
3287
3260
|
}
|
|
3261
|
+
function buildVertices(horizontals, verticals) {
|
|
3262
|
+
const vertices = [];
|
|
3263
|
+
const tol = CONNECT_TOL;
|
|
3264
|
+
for (const h of horizontals) {
|
|
3265
|
+
for (const v of verticals) {
|
|
3266
|
+
if (v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol && h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol) {
|
|
3267
|
+
const radius = Math.max(h.lineWidth, v.lineWidth, 1);
|
|
3268
|
+
vertices.push({ x: v.x1, y: h.y1, radius });
|
|
3269
|
+
}
|
|
3270
|
+
}
|
|
3271
|
+
}
|
|
3272
|
+
return vertices;
|
|
3273
|
+
}
|
|
3274
|
+
function mergeVertices(vertices) {
|
|
3275
|
+
if (vertices.length <= 1) return vertices;
|
|
3276
|
+
const merged = [];
|
|
3277
|
+
const used = new Array(vertices.length).fill(false);
|
|
3278
|
+
for (let i = 0; i < vertices.length; i++) {
|
|
3279
|
+
if (used[i]) continue;
|
|
3280
|
+
let sumX = vertices[i].x, sumY = vertices[i].y;
|
|
3281
|
+
let maxRadius = vertices[i].radius;
|
|
3282
|
+
let count = 1;
|
|
3283
|
+
for (let j = i + 1; j < vertices.length; j++) {
|
|
3284
|
+
if (used[j]) continue;
|
|
3285
|
+
const mergeTol = VERTEX_MERGE_FACTOR * Math.max(maxRadius, vertices[j].radius);
|
|
3286
|
+
if (Math.abs(vertices[i].x - vertices[j].x) <= mergeTol && Math.abs(vertices[i].y - vertices[j].y) <= mergeTol) {
|
|
3287
|
+
sumX += vertices[j].x;
|
|
3288
|
+
sumY += vertices[j].y;
|
|
3289
|
+
maxRadius = Math.max(maxRadius, vertices[j].radius);
|
|
3290
|
+
count++;
|
|
3291
|
+
used[j] = true;
|
|
3292
|
+
}
|
|
3293
|
+
}
|
|
3294
|
+
merged.push({ x: sumX / count, y: sumY / count, radius: maxRadius });
|
|
3295
|
+
}
|
|
3296
|
+
return merged;
|
|
3297
|
+
}
|
|
3288
3298
|
function buildTableGrids(horizontals, verticals) {
|
|
3289
3299
|
if (horizontals.length < 2 || verticals.length < 2) return [];
|
|
3300
|
+
const allVertices = buildVertices(horizontals, verticals);
|
|
3301
|
+
const vertices = mergeVertices(allVertices);
|
|
3302
|
+
if (vertices.length < 4) return [];
|
|
3303
|
+
const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
|
|
3290
3304
|
const allLines = [
|
|
3291
3305
|
...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
|
|
3292
3306
|
...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
|
|
@@ -3297,21 +3311,74 @@ function buildTableGrids(horizontals, verticals) {
|
|
|
3297
3311
|
const hLines = group.filter((l) => l.type === "h");
|
|
3298
3312
|
const vLines = group.filter((l) => l.type === "v");
|
|
3299
3313
|
if (hLines.length < 2 || vLines.length < 2) continue;
|
|
3300
|
-
|
|
3301
|
-
const
|
|
3302
|
-
|
|
3303
|
-
|
|
3314
|
+
let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
|
|
3315
|
+
for (const l of vLines) {
|
|
3316
|
+
if (l.x1 < gx1) gx1 = l.x1;
|
|
3317
|
+
if (l.x1 > gx2) gx2 = l.x1;
|
|
3318
|
+
}
|
|
3319
|
+
for (const l of hLines) {
|
|
3320
|
+
if (l.y1 < gy1) gy1 = l.y1;
|
|
3321
|
+
if (l.y1 > gy2) gy2 = l.y1;
|
|
3322
|
+
}
|
|
3323
|
+
const groupBbox = {
|
|
3324
|
+
x1: gx1 - CONNECT_TOL,
|
|
3325
|
+
y1: gy1 - CONNECT_TOL,
|
|
3326
|
+
x2: gx2 + CONNECT_TOL,
|
|
3327
|
+
y2: gy2 + CONNECT_TOL
|
|
3328
|
+
};
|
|
3329
|
+
const groupVertices = vertices.filter(
|
|
3330
|
+
(v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
|
|
3331
|
+
);
|
|
3332
|
+
const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
|
|
3333
|
+
const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
|
|
3334
|
+
const rawYs = [
|
|
3335
|
+
...hLines.map((l) => l.y1),
|
|
3336
|
+
...groupVertices.map((v) => v.y)
|
|
3337
|
+
];
|
|
3338
|
+
const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
|
|
3339
|
+
const rawXs = [
|
|
3340
|
+
...vLines.map((l) => l.x1),
|
|
3341
|
+
...groupVertices.map((v) => v.x)
|
|
3342
|
+
];
|
|
3343
|
+
const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
|
|
3304
3344
|
if (rowYs.length < 2 || colXs.length < 2) continue;
|
|
3345
|
+
const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
|
|
3346
|
+
const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
|
|
3347
|
+
if (validRowYs.length < 2 || validColXs.length < 2) continue;
|
|
3305
3348
|
const bbox = {
|
|
3306
|
-
x1:
|
|
3307
|
-
y1:
|
|
3308
|
-
x2:
|
|
3309
|
-
y2:
|
|
3349
|
+
x1: validColXs[0],
|
|
3350
|
+
y1: validRowYs[validRowYs.length - 1],
|
|
3351
|
+
x2: validColXs[validColXs.length - 1],
|
|
3352
|
+
y2: validRowYs[0]
|
|
3310
3353
|
};
|
|
3311
|
-
grids.push({ rowYs, colXs, bbox });
|
|
3354
|
+
grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
|
|
3312
3355
|
}
|
|
3313
3356
|
return mergeAdjacentGrids(grids);
|
|
3314
3357
|
}
|
|
3358
|
+
function enforceMinWidth(colXs, minWidth) {
|
|
3359
|
+
if (colXs.length <= 2) return colXs;
|
|
3360
|
+
const result = [colXs[0]];
|
|
3361
|
+
for (let i = 1; i < colXs.length; i++) {
|
|
3362
|
+
const prevX = result[result.length - 1];
|
|
3363
|
+
if (colXs[i] - prevX < minWidth && i < colXs.length - 1) {
|
|
3364
|
+
continue;
|
|
3365
|
+
}
|
|
3366
|
+
result.push(colXs[i]);
|
|
3367
|
+
}
|
|
3368
|
+
return result;
|
|
3369
|
+
}
|
|
3370
|
+
function enforceMinHeight(rowYs, minHeight) {
|
|
3371
|
+
if (rowYs.length <= 2) return rowYs;
|
|
3372
|
+
const result = [rowYs[0]];
|
|
3373
|
+
for (let i = 1; i < rowYs.length; i++) {
|
|
3374
|
+
const prevY = result[result.length - 1];
|
|
3375
|
+
if (prevY - rowYs[i] < minHeight && i < rowYs.length - 1) {
|
|
3376
|
+
continue;
|
|
3377
|
+
}
|
|
3378
|
+
result.push(rowYs[i]);
|
|
3379
|
+
}
|
|
3380
|
+
return result;
|
|
3381
|
+
}
|
|
3315
3382
|
function mergeAdjacentGrids(grids) {
|
|
3316
3383
|
if (grids.length <= 1) return grids;
|
|
3317
3384
|
const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
@@ -3320,9 +3387,10 @@ function mergeAdjacentGrids(grids) {
|
|
|
3320
3387
|
const prev = merged[merged.length - 1];
|
|
3321
3388
|
const curr = sorted[i];
|
|
3322
3389
|
if (prev.colXs.length === curr.colXs.length) {
|
|
3323
|
-
const
|
|
3390
|
+
const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
|
|
3391
|
+
const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
|
|
3324
3392
|
const verticalGap = prev.bbox.y1 - curr.bbox.y2;
|
|
3325
|
-
if (colMatch && verticalGap >= -
|
|
3393
|
+
if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
|
|
3326
3394
|
const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
|
|
3327
3395
|
merged[merged.length - 1] = {
|
|
3328
3396
|
rowYs: allRowYs,
|
|
@@ -3332,7 +3400,8 @@ function mergeAdjacentGrids(grids) {
|
|
|
3332
3400
|
y1: Math.min(prev.bbox.y1, curr.bbox.y1),
|
|
3333
3401
|
x2: Math.max(prev.bbox.x2, curr.bbox.x2),
|
|
3334
3402
|
y2: Math.max(prev.bbox.y2, curr.bbox.y2)
|
|
3335
|
-
}
|
|
3403
|
+
},
|
|
3404
|
+
vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
|
|
3336
3405
|
};
|
|
3337
3406
|
continue;
|
|
3338
3407
|
}
|
|
@@ -3341,14 +3410,14 @@ function mergeAdjacentGrids(grids) {
|
|
|
3341
3410
|
}
|
|
3342
3411
|
return merged;
|
|
3343
3412
|
}
|
|
3344
|
-
function clusterCoordinates(values) {
|
|
3413
|
+
function clusterCoordinates(values, tolerance) {
|
|
3345
3414
|
if (values.length === 0) return [];
|
|
3346
3415
|
const sorted = [...values].sort((a, b) => a - b);
|
|
3347
3416
|
const clusters = [{ sum: sorted[0], count: 1 }];
|
|
3348
3417
|
for (let i = 1; i < sorted.length; i++) {
|
|
3349
3418
|
const last = clusters[clusters.length - 1];
|
|
3350
3419
|
const avg = last.sum / last.count;
|
|
3351
|
-
if (Math.abs(sorted[i] - avg) <=
|
|
3420
|
+
if (Math.abs(sorted[i] - avg) <= tolerance) {
|
|
3352
3421
|
last.sum += sorted[i];
|
|
3353
3422
|
last.count++;
|
|
3354
3423
|
} else {
|
|
@@ -3405,6 +3474,20 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3405
3474
|
const numRows = rowYs.length - 1;
|
|
3406
3475
|
const numCols = colXs.length - 1;
|
|
3407
3476
|
if (numRows <= 0 || numCols <= 0) return [];
|
|
3477
|
+
const vBorders = Array.from(
|
|
3478
|
+
{ length: numRows },
|
|
3479
|
+
(_, r) => Array.from(
|
|
3480
|
+
{ length: numCols + 1 },
|
|
3481
|
+
(_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
|
|
3482
|
+
)
|
|
3483
|
+
);
|
|
3484
|
+
const hBorders = Array.from(
|
|
3485
|
+
{ length: numRows + 1 },
|
|
3486
|
+
(_, r) => Array.from(
|
|
3487
|
+
{ length: numCols },
|
|
3488
|
+
(_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
|
|
3489
|
+
)
|
|
3490
|
+
);
|
|
3408
3491
|
const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
|
|
3409
3492
|
const cells = [];
|
|
3410
3493
|
for (let r = 0; r < numRows; r++) {
|
|
@@ -3412,18 +3495,26 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3412
3495
|
if (occupied[r][c]) continue;
|
|
3413
3496
|
let colSpan = 1;
|
|
3414
3497
|
let rowSpan = 1;
|
|
3415
|
-
while (c + colSpan < numCols) {
|
|
3416
|
-
|
|
3417
|
-
|
|
3418
|
-
|
|
3419
|
-
|
|
3498
|
+
while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
|
|
3499
|
+
let canExpand = true;
|
|
3500
|
+
for (let dr = 0; dr < rowSpan; dr++) {
|
|
3501
|
+
if (vBorders[r + dr][c + colSpan]) {
|
|
3502
|
+
canExpand = false;
|
|
3503
|
+
break;
|
|
3504
|
+
}
|
|
3505
|
+
}
|
|
3506
|
+
if (!canExpand) break;
|
|
3420
3507
|
colSpan++;
|
|
3421
3508
|
}
|
|
3422
3509
|
while (r + rowSpan < numRows) {
|
|
3423
|
-
|
|
3424
|
-
|
|
3425
|
-
|
|
3426
|
-
|
|
3510
|
+
let hasLine = false;
|
|
3511
|
+
for (let dc = 0; dc < colSpan; dc++) {
|
|
3512
|
+
if (hBorders[r + rowSpan][c + dc]) {
|
|
3513
|
+
hasLine = true;
|
|
3514
|
+
break;
|
|
3515
|
+
}
|
|
3516
|
+
}
|
|
3517
|
+
if (hasLine) break;
|
|
3427
3518
|
rowSpan++;
|
|
3428
3519
|
}
|
|
3429
3520
|
for (let dr = 0; dr < rowSpan; dr++) {
|
|
@@ -3447,28 +3538,30 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3447
3538
|
}
|
|
3448
3539
|
return cells;
|
|
3449
3540
|
}
|
|
3450
|
-
function hasVerticalLine(verticals, x, topY, botY) {
|
|
3451
|
-
const tol =
|
|
3541
|
+
function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
|
|
3542
|
+
const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
|
|
3452
3543
|
for (const v of verticals) {
|
|
3453
3544
|
if (Math.abs(v.x1 - x) <= tol) {
|
|
3454
3545
|
const cellH = Math.abs(topY - botY);
|
|
3546
|
+
if (cellH < 0.1) continue;
|
|
3455
3547
|
const overlapTop = Math.min(v.y2, topY);
|
|
3456
3548
|
const overlapBot = Math.max(v.y1, botY);
|
|
3457
3549
|
const overlap = overlapTop - overlapBot;
|
|
3458
|
-
if (overlap >= cellH * 0.
|
|
3550
|
+
if (overlap >= cellH * 0.75) return true;
|
|
3459
3551
|
}
|
|
3460
3552
|
}
|
|
3461
3553
|
return false;
|
|
3462
3554
|
}
|
|
3463
|
-
function hasHorizontalLine(horizontals, y, leftX, rightX) {
|
|
3464
|
-
const tol =
|
|
3555
|
+
function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
|
|
3556
|
+
const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
|
|
3465
3557
|
for (const h of horizontals) {
|
|
3466
3558
|
if (Math.abs(h.y1 - y) <= tol) {
|
|
3467
3559
|
const cellW = Math.abs(rightX - leftX);
|
|
3560
|
+
if (cellW < 0.1) continue;
|
|
3468
3561
|
const overlapLeft = Math.max(h.x1, leftX);
|
|
3469
3562
|
const overlapRight = Math.min(h.x2, rightX);
|
|
3470
3563
|
const overlap = overlapRight - overlapLeft;
|
|
3471
|
-
if (overlap >= cellW * 0.
|
|
3564
|
+
if (overlap >= cellW * 0.75) return true;
|
|
3472
3565
|
}
|
|
3473
3566
|
}
|
|
3474
3567
|
return false;
|
|
@@ -3479,23 +3572,24 @@ function mapTextToCells(items, cells) {
|
|
|
3479
3572
|
result.set(cell, []);
|
|
3480
3573
|
}
|
|
3481
3574
|
for (const item of items) {
|
|
3482
|
-
const cx = item.x + item.w / 2;
|
|
3483
|
-
const cy = item.y;
|
|
3484
3575
|
const pad = CELL_PADDING;
|
|
3485
3576
|
let bestCell = null;
|
|
3486
|
-
let
|
|
3577
|
+
let bestScore = 0;
|
|
3487
3578
|
for (const cell of cells) {
|
|
3488
|
-
|
|
3489
|
-
|
|
3490
|
-
|
|
3491
|
-
|
|
3492
|
-
|
|
3493
|
-
|
|
3494
|
-
|
|
3495
|
-
|
|
3579
|
+
const ix1 = Math.max(item.x, cell.bbox.x1 - pad);
|
|
3580
|
+
const ix2 = Math.min(item.x + item.w, cell.bbox.x2 + pad);
|
|
3581
|
+
const iy1 = Math.max(item.y, cell.bbox.y1 - pad);
|
|
3582
|
+
const iy2 = Math.min(item.y + (item.h || item.fontSize), cell.bbox.y2 + pad);
|
|
3583
|
+
if (ix1 >= ix2 || iy1 >= iy2) continue;
|
|
3584
|
+
const intersectArea = (ix2 - ix1) * (iy2 - iy1);
|
|
3585
|
+
const itemArea = Math.max(item.w, 1) * Math.max(item.h || item.fontSize, 1);
|
|
3586
|
+
const score = intersectArea / itemArea;
|
|
3587
|
+
if (score > bestScore) {
|
|
3588
|
+
bestScore = score;
|
|
3589
|
+
bestCell = cell;
|
|
3496
3590
|
}
|
|
3497
3591
|
}
|
|
3498
|
-
if (bestCell) {
|
|
3592
|
+
if (bestCell && bestScore > 0.3) {
|
|
3499
3593
|
result.get(bestCell).push(item);
|
|
3500
3594
|
}
|
|
3501
3595
|
}
|
|
@@ -3522,8 +3616,13 @@ function cellTextToString(items) {
|
|
|
3522
3616
|
const textLines = lines.map((line) => {
|
|
3523
3617
|
const s = line.sort((a, b) => a.x - b.x);
|
|
3524
3618
|
if (s.length === 1) return s[0].text;
|
|
3619
|
+
const evenSpaced = detectEvenSpacedItems(s);
|
|
3525
3620
|
let result = s[0].text;
|
|
3526
3621
|
for (let j = 1; j < s.length; j++) {
|
|
3622
|
+
if (evenSpaced[j]) {
|
|
3623
|
+
result += s[j].text;
|
|
3624
|
+
continue;
|
|
3625
|
+
}
|
|
3527
3626
|
const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
|
|
3528
3627
|
const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
|
|
3529
3628
|
const prevIsKorean = /[가-힣]$/.test(result);
|
|
@@ -3538,6 +3637,57 @@ function cellTextToString(items) {
|
|
|
3538
3637
|
}
|
|
3539
3638
|
return result;
|
|
3540
3639
|
});
|
|
3640
|
+
return mergeCellTextLines(textLines);
|
|
3641
|
+
}
|
|
3642
|
+
function detectEvenSpacedItems(items) {
|
|
3643
|
+
const result = new Array(items.length).fill(false);
|
|
3644
|
+
if (items.length < 3) return result;
|
|
3645
|
+
let runStart = -1;
|
|
3646
|
+
for (let i = 0; i < items.length; i++) {
|
|
3647
|
+
const isShortKorean = /^[가-힣]{1}$/.test(items[i].text) || /^[\d]{1}$/.test(items[i].text);
|
|
3648
|
+
if (isShortKorean && runStart >= 0 && i > 0) {
|
|
3649
|
+
const gap = items[i].x - (items[i - 1].x + items[i - 1].w);
|
|
3650
|
+
const maxRunGap = Math.max(items[i].fontSize * 3, 30);
|
|
3651
|
+
if (gap > maxRunGap) {
|
|
3652
|
+
if (i - runStart >= 3) markEvenRun(items, result, runStart, i);
|
|
3653
|
+
runStart = i;
|
|
3654
|
+
continue;
|
|
3655
|
+
}
|
|
3656
|
+
}
|
|
3657
|
+
if (isShortKorean) {
|
|
3658
|
+
if (runStart < 0) runStart = i;
|
|
3659
|
+
} else {
|
|
3660
|
+
if (runStart >= 0 && i - runStart >= 3) {
|
|
3661
|
+
markEvenRun(items, result, runStart, i);
|
|
3662
|
+
}
|
|
3663
|
+
runStart = -1;
|
|
3664
|
+
}
|
|
3665
|
+
}
|
|
3666
|
+
if (runStart >= 0 && items.length - runStart >= 3) {
|
|
3667
|
+
markEvenRun(items, result, runStart, items.length);
|
|
3668
|
+
}
|
|
3669
|
+
return result;
|
|
3670
|
+
}
|
|
3671
|
+
function markEvenRun(items, result, start, end) {
|
|
3672
|
+
const gaps = [];
|
|
3673
|
+
for (let i = start + 1; i < end; i++) {
|
|
3674
|
+
gaps.push(items[i].x - (items[i - 1].x + items[i - 1].w));
|
|
3675
|
+
}
|
|
3676
|
+
const posGaps = gaps.filter((g2) => g2 > 0);
|
|
3677
|
+
if (posGaps.length < 2) return;
|
|
3678
|
+
let minGap = Infinity, maxGap = -Infinity;
|
|
3679
|
+
for (const g2 of posGaps) {
|
|
3680
|
+
if (g2 < minGap) minGap = g2;
|
|
3681
|
+
if (g2 > maxGap) maxGap = g2;
|
|
3682
|
+
}
|
|
3683
|
+
const avgFs = items[start].fontSize;
|
|
3684
|
+
if (minGap >= avgFs * 0.1 && maxGap <= avgFs * 3 && maxGap / Math.max(minGap, 0.1) <= 3) {
|
|
3685
|
+
for (let i = start + 1; i < end; i++) {
|
|
3686
|
+
result[i] = true;
|
|
3687
|
+
}
|
|
3688
|
+
}
|
|
3689
|
+
}
|
|
3690
|
+
function mergeCellTextLines(textLines) {
|
|
3541
3691
|
if (textLines.length <= 1) return textLines[0] || "";
|
|
3542
3692
|
const merged = [textLines[0]];
|
|
3543
3693
|
for (let i = 1; i < textLines.length; i++) {
|
|
@@ -3563,24 +3713,172 @@ var Y_TOL = 3;
|
|
|
3563
3713
|
var COL_CLUSTER_TOL = 15;
|
|
3564
3714
|
var MIN_ROWS = 3;
|
|
3565
3715
|
var MIN_COLS = 2;
|
|
3566
|
-
var MIN_GAP_FACTOR =
|
|
3567
|
-
var
|
|
3716
|
+
var MIN_GAP_FACTOR = 2;
|
|
3717
|
+
var MIN_GAP_ABSOLUTE = 20;
|
|
3718
|
+
var MIN_COL_FILL_RATIO = 0.4;
|
|
3568
3719
|
function detectClusterTables(items, pageNum) {
|
|
3569
3720
|
if (items.length < MIN_ROWS * MIN_COLS) return [];
|
|
3570
|
-
const
|
|
3721
|
+
const { merged, originMap } = mergeEvenSpacedClusters(items);
|
|
3722
|
+
const rows = groupByBaseline(merged);
|
|
3571
3723
|
if (rows.length < MIN_ROWS) return [];
|
|
3572
|
-
const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
|
|
3573
|
-
if (suspiciousRows.length < MIN_ROWS) return [];
|
|
3574
|
-
const columns = extractColumnClusters(suspiciousRows);
|
|
3575
|
-
if (columns.length < MIN_COLS) return [];
|
|
3576
|
-
const tableRegions = findTableRegions(rows, columns);
|
|
3577
3724
|
const results = [];
|
|
3578
|
-
|
|
3579
|
-
|
|
3580
|
-
|
|
3725
|
+
const headerResult = detectHeaderRow(rows);
|
|
3726
|
+
if (headerResult) {
|
|
3727
|
+
const { columns, headerIdx } = headerResult;
|
|
3728
|
+
const headerRow = rows[headerIdx];
|
|
3729
|
+
const headerItems = [...headerRow.items].sort((a, b) => a.x - b.x);
|
|
3730
|
+
const headerAndBelow = rows.slice(headerIdx);
|
|
3731
|
+
const mergedRows = mergeMultiLineRows(headerAndBelow, columns);
|
|
3732
|
+
const tableRegions = findTableRegionsByHeader(mergedRows, columns, headerItems);
|
|
3733
|
+
for (const region of tableRegions) {
|
|
3734
|
+
const table = buildClusterTable(region.rows, columns, pageNum);
|
|
3735
|
+
if (table) {
|
|
3736
|
+
expandUsedItems(table.usedItems, originMap);
|
|
3737
|
+
results.push(table);
|
|
3738
|
+
}
|
|
3739
|
+
}
|
|
3740
|
+
}
|
|
3741
|
+
if (results.length === 0) {
|
|
3742
|
+
const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
|
|
3743
|
+
if (suspiciousRows.length >= MIN_ROWS) {
|
|
3744
|
+
const columns = extractColumnClusters(suspiciousRows);
|
|
3745
|
+
if (columns.length >= MIN_COLS) {
|
|
3746
|
+
const tableRegions = findTableRegions(rows, columns);
|
|
3747
|
+
for (const region of tableRegions) {
|
|
3748
|
+
const mergedRows = mergeMultiLineRows(region.rows, columns);
|
|
3749
|
+
const table = buildClusterTable(mergedRows, columns, pageNum);
|
|
3750
|
+
if (table) {
|
|
3751
|
+
expandUsedItems(table.usedItems, originMap);
|
|
3752
|
+
results.push(table);
|
|
3753
|
+
}
|
|
3754
|
+
}
|
|
3755
|
+
}
|
|
3756
|
+
}
|
|
3581
3757
|
}
|
|
3582
3758
|
return results;
|
|
3583
3759
|
}
|
|
3760
|
+
function mergeEvenSpacedClusters(items) {
|
|
3761
|
+
const originMap = /* @__PURE__ */ new Map();
|
|
3762
|
+
const rows = groupByBaseline(items);
|
|
3763
|
+
const merged = [];
|
|
3764
|
+
for (const row of rows) {
|
|
3765
|
+
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3766
|
+
let i = 0;
|
|
3767
|
+
while (i < sorted.length) {
|
|
3768
|
+
if (/^[가-힣\d]$/.test(sorted[i].text)) {
|
|
3769
|
+
let runEnd = i + 1;
|
|
3770
|
+
while (runEnd < sorted.length && /^[가-힣\d]$/.test(sorted[runEnd].text)) {
|
|
3771
|
+
const gap = sorted[runEnd].x - (sorted[runEnd - 1].x + sorted[runEnd - 1].w);
|
|
3772
|
+
const fs = sorted[runEnd].fontSize;
|
|
3773
|
+
if (gap < fs * 0.1 || gap > fs * 3) break;
|
|
3774
|
+
runEnd++;
|
|
3775
|
+
}
|
|
3776
|
+
if (runEnd - i >= 3) {
|
|
3777
|
+
const gaps = [];
|
|
3778
|
+
for (let g2 = i + 1; g2 < runEnd; g2++) {
|
|
3779
|
+
gaps.push(sorted[g2].x - (sorted[g2 - 1].x + sorted[g2 - 1].w));
|
|
3780
|
+
}
|
|
3781
|
+
let minG = Infinity, maxG = -Infinity;
|
|
3782
|
+
for (const g2 of gaps) {
|
|
3783
|
+
if (g2 < minG) minG = g2;
|
|
3784
|
+
if (g2 > maxG) maxG = g2;
|
|
3785
|
+
}
|
|
3786
|
+
if (minG > 0 && maxG / minG <= 3) {
|
|
3787
|
+
const run = sorted.slice(i, runEnd);
|
|
3788
|
+
const text = run.map((r) => r.text).join("");
|
|
3789
|
+
const first = run[0], last = run[runEnd - i - 1];
|
|
3790
|
+
const item = {
|
|
3791
|
+
text,
|
|
3792
|
+
x: first.x,
|
|
3793
|
+
y: first.y,
|
|
3794
|
+
w: last.x + last.w - first.x,
|
|
3795
|
+
h: first.h,
|
|
3796
|
+
fontSize: first.fontSize,
|
|
3797
|
+
fontName: first.fontName
|
|
3798
|
+
};
|
|
3799
|
+
originMap.set(item, run);
|
|
3800
|
+
merged.push(item);
|
|
3801
|
+
i = runEnd;
|
|
3802
|
+
continue;
|
|
3803
|
+
}
|
|
3804
|
+
}
|
|
3805
|
+
}
|
|
3806
|
+
merged.push(sorted[i]);
|
|
3807
|
+
i++;
|
|
3808
|
+
}
|
|
3809
|
+
}
|
|
3810
|
+
return { merged, originMap };
|
|
3811
|
+
}
|
|
3812
|
+
function expandUsedItems(usedItems, originMap) {
|
|
3813
|
+
const toAdd = [];
|
|
3814
|
+
for (const item of usedItems) {
|
|
3815
|
+
const origins = originMap.get(item);
|
|
3816
|
+
if (origins) for (const o of origins) toAdd.push(o);
|
|
3817
|
+
}
|
|
3818
|
+
for (const a of toAdd) usedItems.add(a);
|
|
3819
|
+
}
|
|
3820
|
+
function detectHeaderRow(rows) {
|
|
3821
|
+
const allItems = rows.flatMap((r) => r.items);
|
|
3822
|
+
if (allItems.length === 0) return null;
|
|
3823
|
+
let allMinX = Infinity, allMaxX = -Infinity;
|
|
3824
|
+
for (const i of allItems) {
|
|
3825
|
+
if (i.x < allMinX) allMinX = i.x;
|
|
3826
|
+
const r = i.x + i.w;
|
|
3827
|
+
if (r > allMaxX) allMaxX = r;
|
|
3828
|
+
}
|
|
3829
|
+
const pageSpan = allMaxX - allMinX;
|
|
3830
|
+
if (pageSpan <= 0) return null;
|
|
3831
|
+
for (let ri = 0; ri < rows.length; ri++) {
|
|
3832
|
+
const row = rows[ri];
|
|
3833
|
+
if (row.items.length < MIN_COLS || row.items.length > 6) continue;
|
|
3834
|
+
if (row.items.some((i) => i.text.length > 8)) continue;
|
|
3835
|
+
if (!row.items.some((i) => /[가-힣]/.test(i.text))) continue;
|
|
3836
|
+
if (row.items.some((i) => /^[□■○●·※▶▷◆◇\-]/.test(i.text))) continue;
|
|
3837
|
+
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3838
|
+
const xSpan = sorted[sorted.length - 1].x + sorted[sorted.length - 1].w - sorted[0].x;
|
|
3839
|
+
if (xSpan / pageSpan < 0.4) continue;
|
|
3840
|
+
const avgFs = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
|
|
3841
|
+
let hasLargeGap = false;
|
|
3842
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
3843
|
+
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
3844
|
+
if (gap >= avgFs * 2.5) {
|
|
3845
|
+
hasLargeGap = true;
|
|
3846
|
+
break;
|
|
3847
|
+
}
|
|
3848
|
+
}
|
|
3849
|
+
if (!hasLargeGap) continue;
|
|
3850
|
+
const columns = sorted.map((item) => ({ x: item.x, count: 0 }));
|
|
3851
|
+
let matchCount = 0;
|
|
3852
|
+
for (let j = ri + 1; j < rows.length && matchCount < MIN_ROWS + 2; j++) {
|
|
3853
|
+
const matched = countMatchedColumnsRange(rows[j], columns, sorted);
|
|
3854
|
+
if (matched >= MIN_COLS) matchCount++;
|
|
3855
|
+
}
|
|
3856
|
+
if (matchCount < MIN_ROWS) continue;
|
|
3857
|
+
return { columns, headerIdx: ri };
|
|
3858
|
+
}
|
|
3859
|
+
return null;
|
|
3860
|
+
}
|
|
3861
|
+
function mergeMultiLineRows(rows, columns) {
|
|
3862
|
+
if (rows.length <= 1) return rows;
|
|
3863
|
+
const result = [rows[0]];
|
|
3864
|
+
const allFontSizes = rows.flatMap((r) => r.items).map((i) => i.fontSize);
|
|
3865
|
+
const avgFontSize = allFontSizes.length > 0 ? allFontSizes.reduce((s, v) => s + v, 0) / allFontSizes.length : 12;
|
|
3866
|
+
for (let i = 1; i < rows.length; i++) {
|
|
3867
|
+
const prev = result[result.length - 1];
|
|
3868
|
+
const curr = rows[i];
|
|
3869
|
+
const yGap = Math.abs(prev.y - curr.y);
|
|
3870
|
+
const matchedCols = countMatchedColumns(curr, columns);
|
|
3871
|
+
if (yGap < avgFontSize * 1.8 && curr.items.length <= 2 && (matchedCols < MIN_COLS || curr.items.length === 1)) {
|
|
3872
|
+
result[result.length - 1] = {
|
|
3873
|
+
y: prev.y,
|
|
3874
|
+
items: [...prev.items, ...curr.items]
|
|
3875
|
+
};
|
|
3876
|
+
} else {
|
|
3877
|
+
result.push(curr);
|
|
3878
|
+
}
|
|
3879
|
+
}
|
|
3880
|
+
return result;
|
|
3881
|
+
}
|
|
3584
3882
|
function groupByBaseline(items) {
|
|
3585
3883
|
if (items.length === 0) return [];
|
|
3586
3884
|
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
@@ -3602,8 +3900,9 @@ function groupByBaseline(items) {
|
|
|
3602
3900
|
function hasSuspiciousGaps(row) {
|
|
3603
3901
|
if (row.items.length < 2) return false;
|
|
3604
3902
|
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3903
|
+
if (sorted.length === 2 && sorted[1].text.length > 20) return false;
|
|
3605
3904
|
const avgFontSize = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
|
|
3606
|
-
const minGap = avgFontSize * MIN_GAP_FACTOR;
|
|
3905
|
+
const minGap = Math.max(avgFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
|
|
3607
3906
|
for (let i = 1; i < sorted.length; i++) {
|
|
3608
3907
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
3609
3908
|
if (gap >= minGap) return true;
|
|
@@ -3630,6 +3929,41 @@ function extractColumnClusters(rows) {
|
|
|
3630
3929
|
const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
|
|
3631
3930
|
return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
|
|
3632
3931
|
}
|
|
3932
|
+
function findTableRegionsByHeader(allRows, columns, headerItems) {
|
|
3933
|
+
const regions = [];
|
|
3934
|
+
let currentRegion = [];
|
|
3935
|
+
let missStreak = 0;
|
|
3936
|
+
for (const row of allRows) {
|
|
3937
|
+
const matchedCols = countMatchedColumnsRange(row, columns, headerItems);
|
|
3938
|
+
if (matchedCols >= MIN_COLS) {
|
|
3939
|
+
currentRegion.push(row);
|
|
3940
|
+
missStreak = 0;
|
|
3941
|
+
} else if (currentRegion.length > 0 && (row.items.length <= 2 || missStreak === 0)) {
|
|
3942
|
+
currentRegion.push(row);
|
|
3943
|
+
missStreak++;
|
|
3944
|
+
} else {
|
|
3945
|
+
while (currentRegion.length > 0) {
|
|
3946
|
+
const last = currentRegion[currentRegion.length - 1];
|
|
3947
|
+
if (countMatchedColumnsRange(last, columns, headerItems) >= MIN_COLS) break;
|
|
3948
|
+
currentRegion.pop();
|
|
3949
|
+
}
|
|
3950
|
+
if (currentRegion.length >= MIN_ROWS) {
|
|
3951
|
+
regions.push({ rows: [...currentRegion] });
|
|
3952
|
+
}
|
|
3953
|
+
currentRegion = [];
|
|
3954
|
+
missStreak = 0;
|
|
3955
|
+
}
|
|
3956
|
+
}
|
|
3957
|
+
while (currentRegion.length > 0) {
|
|
3958
|
+
const last = currentRegion[currentRegion.length - 1];
|
|
3959
|
+
if (countMatchedColumnsRange(last, columns, headerItems) >= MIN_COLS) break;
|
|
3960
|
+
currentRegion.pop();
|
|
3961
|
+
}
|
|
3962
|
+
if (currentRegion.length >= MIN_ROWS) {
|
|
3963
|
+
regions.push({ rows: currentRegion });
|
|
3964
|
+
}
|
|
3965
|
+
return regions;
|
|
3966
|
+
}
|
|
3633
3967
|
function findTableRegions(allRows, columns) {
|
|
3634
3968
|
const regions = [];
|
|
3635
3969
|
let currentRegion = [];
|
|
@@ -3665,18 +3999,81 @@ function countMatchedColumns(row, columns) {
|
|
|
3665
3999
|
}
|
|
3666
4000
|
return matched.size;
|
|
3667
4001
|
}
|
|
3668
|
-
function
|
|
3669
|
-
const
|
|
3670
|
-
let
|
|
3671
|
-
|
|
3672
|
-
|
|
3673
|
-
|
|
3674
|
-
|
|
3675
|
-
|
|
3676
|
-
|
|
4002
|
+
function countMatchedColumnsRange(row, columns, headerItems) {
|
|
4003
|
+
const boundaries = [];
|
|
4004
|
+
for (let ci = 0; ci < headerItems.length; ci++) {
|
|
4005
|
+
const left = ci === 0 ? 0 : (headerItems[ci - 1].x + headerItems[ci - 1].w + headerItems[ci].x) / 2;
|
|
4006
|
+
const right = ci === headerItems.length - 1 ? Infinity : (headerItems[ci].x + headerItems[ci].w + headerItems[ci + 1].x) / 2;
|
|
4007
|
+
boundaries.push({ left, right });
|
|
4008
|
+
}
|
|
4009
|
+
const matched = /* @__PURE__ */ new Set();
|
|
4010
|
+
for (const item of row.items) {
|
|
4011
|
+
for (let ci = 0; ci < boundaries.length; ci++) {
|
|
4012
|
+
if (item.x >= boundaries[ci].left && item.x < boundaries[ci].right) {
|
|
4013
|
+
matched.add(ci);
|
|
4014
|
+
break;
|
|
4015
|
+
}
|
|
4016
|
+
}
|
|
4017
|
+
}
|
|
4018
|
+
return matched.size;
|
|
4019
|
+
}
|
|
4020
|
+
function assignRowItems(items, columns, numCols) {
|
|
4021
|
+
if (items.length === 0) return [];
|
|
4022
|
+
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
4023
|
+
const colCenters = columns.map((c) => c.x);
|
|
4024
|
+
const gaps = [];
|
|
4025
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
4026
|
+
gaps.push({ idx: i, size: sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w) });
|
|
4027
|
+
}
|
|
4028
|
+
const gapSizes = gaps.map((g2) => g2.size).sort((a, b) => a - b);
|
|
4029
|
+
const medianGap = gapSizes.length > 0 ? gapSizes[Math.floor(gapSizes.length / 2)] : 0;
|
|
4030
|
+
const gapThreshold = sorted.length <= numCols + 1 ? 12 : Math.max(medianGap * 2.5, 12);
|
|
4031
|
+
const significantGaps = gaps.filter((g2) => g2.size >= gapThreshold).sort((a, b) => b.size - a.size).slice(0, numCols - 1).sort((a, b) => a.idx - b.idx);
|
|
4032
|
+
const groups = [];
|
|
4033
|
+
let start = 0;
|
|
4034
|
+
for (const gap of significantGaps) {
|
|
4035
|
+
groups.push(sorted.slice(start, gap.idx));
|
|
4036
|
+
start = gap.idx;
|
|
4037
|
+
}
|
|
4038
|
+
groups.push(sorted.slice(start));
|
|
4039
|
+
const result = [];
|
|
4040
|
+
const usedCols = /* @__PURE__ */ new Set();
|
|
4041
|
+
const groupCenters = groups.map((g2) => {
|
|
4042
|
+
let minX = Infinity, maxX = -Infinity;
|
|
4043
|
+
for (const i of g2) {
|
|
4044
|
+
if (i.x < minX) minX = i.x;
|
|
4045
|
+
const r = i.x + i.w;
|
|
4046
|
+
if (r > maxX) maxX = r;
|
|
4047
|
+
}
|
|
4048
|
+
return (minX + maxX) / 2;
|
|
4049
|
+
});
|
|
4050
|
+
const assignments = [];
|
|
4051
|
+
for (let gi = 0; gi < groups.length; gi++) {
|
|
4052
|
+
for (let ci = 0; ci < numCols; ci++) {
|
|
4053
|
+
assignments.push({ gi, ci, dist: Math.abs(groupCenters[gi] - colCenters[ci]) });
|
|
4054
|
+
}
|
|
4055
|
+
}
|
|
4056
|
+
assignments.sort((a, b) => a.dist - b.dist);
|
|
4057
|
+
const assignedGroups = /* @__PURE__ */ new Set();
|
|
4058
|
+
for (const { gi, ci } of assignments) {
|
|
4059
|
+
if (assignedGroups.has(gi) || usedCols.has(ci)) continue;
|
|
4060
|
+
result.push({ col: ci, items: groups[gi] });
|
|
4061
|
+
assignedGroups.add(gi);
|
|
4062
|
+
usedCols.add(ci);
|
|
4063
|
+
}
|
|
4064
|
+
for (let gi = 0; gi < groups.length; gi++) {
|
|
4065
|
+
if (assignedGroups.has(gi)) continue;
|
|
4066
|
+
let bestCol = 0, bestDist = Infinity;
|
|
4067
|
+
for (let ci = 0; ci < numCols; ci++) {
|
|
4068
|
+
const d = Math.abs(groupCenters[gi] - colCenters[ci]);
|
|
4069
|
+
if (d < bestDist) {
|
|
4070
|
+
bestDist = d;
|
|
4071
|
+
bestCol = ci;
|
|
4072
|
+
}
|
|
3677
4073
|
}
|
|
4074
|
+
result.push({ col: bestCol, items: groups[gi] });
|
|
3678
4075
|
}
|
|
3679
|
-
return
|
|
4076
|
+
return result;
|
|
3680
4077
|
}
|
|
3681
4078
|
function buildClusterTable(rows, columns, pageNum) {
|
|
3682
4079
|
const numCols = columns.length;
|
|
@@ -3694,12 +4091,12 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3694
4091
|
usedItems.add(row.items[0]);
|
|
3695
4092
|
continue;
|
|
3696
4093
|
}
|
|
3697
|
-
|
|
3698
|
-
|
|
3699
|
-
|
|
4094
|
+
const assignments = assignRowItems(row.items, columns, numCols);
|
|
4095
|
+
for (const { col, items } of assignments) {
|
|
4096
|
+
const text = items.map((i) => i.text).join(" ");
|
|
3700
4097
|
const existing = cells[r][col].text;
|
|
3701
|
-
cells[r][col].text = existing ? existing + " " +
|
|
3702
|
-
usedItems.add(item);
|
|
4098
|
+
cells[r][col].text = existing ? existing + " " + text : text;
|
|
4099
|
+
for (const item of items) usedItems.add(item);
|
|
3703
4100
|
}
|
|
3704
4101
|
}
|
|
3705
4102
|
let emptyRows = 0;
|
|
@@ -3711,11 +4108,48 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3711
4108
|
const hasValue = cells.some((row) => row[c].text !== "");
|
|
3712
4109
|
if (!hasValue) return null;
|
|
3713
4110
|
}
|
|
4111
|
+
for (let r = numRows - 1; r >= 1; r--) {
|
|
4112
|
+
const nonEmptyCols = cells[r].filter((c) => c.text.trim()).length;
|
|
4113
|
+
if (nonEmptyCols !== 1) continue;
|
|
4114
|
+
if (cells[r][0].text.trim() !== "") continue;
|
|
4115
|
+
const contentText = cells[r].find((c) => c.text.trim())?.text.trim() || "";
|
|
4116
|
+
if (/^[○●▶\-·]/.test(contentText)) continue;
|
|
4117
|
+
for (let pr = r - 1; pr >= 0; pr--) {
|
|
4118
|
+
if (cells[pr].some((c) => c.text.trim())) {
|
|
4119
|
+
for (let c = 0; c < numCols; c++) {
|
|
4120
|
+
const prev = cells[pr][c].text.trim();
|
|
4121
|
+
const curr = cells[r][c].text.trim();
|
|
4122
|
+
if (curr) cells[pr][c].text = prev ? prev + " " + curr : curr;
|
|
4123
|
+
}
|
|
4124
|
+
for (let c = 0; c < numCols; c++) cells[r][c].text = "";
|
|
4125
|
+
break;
|
|
4126
|
+
}
|
|
4127
|
+
}
|
|
4128
|
+
}
|
|
4129
|
+
for (let r = 0; r < cells.length - 1; r++) {
|
|
4130
|
+
const row = cells[r];
|
|
4131
|
+
const hasCol0 = row[0].text.trim() !== "";
|
|
4132
|
+
const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
|
|
4133
|
+
const midEmpty = row.slice(1, numCols - 1).every((c) => c.text.trim() === "");
|
|
4134
|
+
if (hasCol0 && hasColLast && midEmpty) {
|
|
4135
|
+
const next = cells[r + 1];
|
|
4136
|
+
if (next[0].text.trim() === "" && next.some((c) => c.text.trim())) {
|
|
4137
|
+
for (let c = 1; c < numCols; c++) {
|
|
4138
|
+
const curr = next[c].text.trim();
|
|
4139
|
+
if (curr) row[c].text = row[c].text.trim() ? row[c].text.trim() + " " + curr : curr;
|
|
4140
|
+
}
|
|
4141
|
+
for (let c = 0; c < numCols; c++) next[c].text = "";
|
|
4142
|
+
}
|
|
4143
|
+
}
|
|
4144
|
+
}
|
|
4145
|
+
const filteredCells = cells.filter((row) => row.some((c) => c.text.trim()));
|
|
4146
|
+
const finalRowCount = filteredCells.length;
|
|
4147
|
+
if (finalRowCount < MIN_ROWS) return null;
|
|
3714
4148
|
const irTable = {
|
|
3715
|
-
rows:
|
|
4149
|
+
rows: finalRowCount,
|
|
3716
4150
|
cols: numCols,
|
|
3717
|
-
cells,
|
|
3718
|
-
hasHeader:
|
|
4151
|
+
cells: filteredCells,
|
|
4152
|
+
hasHeader: finalRowCount > 1
|
|
3719
4153
|
};
|
|
3720
4154
|
const allItems = rows.flatMap((r) => r.items);
|
|
3721
4155
|
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
@@ -3792,7 +4226,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3792
4226
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
3793
4227
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
3794
4228
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
3795
|
-
const
|
|
4229
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
3796
4230
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
3797
4231
|
let parsedPages = 0;
|
|
3798
4232
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
@@ -3809,7 +4243,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3809
4243
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
3810
4244
|
}
|
|
3811
4245
|
for (const item of visible) {
|
|
3812
|
-
if (item.fontSize > 0)
|
|
4246
|
+
if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
|
|
3813
4247
|
}
|
|
3814
4248
|
const opList = await page.getOperatorList();
|
|
3815
4249
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -3848,10 +4282,9 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3848
4282
|
blocks.splice(removed[ri], 1);
|
|
3849
4283
|
}
|
|
3850
4284
|
}
|
|
3851
|
-
const medianFontSize =
|
|
4285
|
+
const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
|
|
3852
4286
|
if (medianFontSize > 0) {
|
|
3853
4287
|
detectHeadings(blocks, medianFontSize);
|
|
3854
|
-
mergeAdjacentHeadings(blocks);
|
|
3855
4288
|
}
|
|
3856
4289
|
detectMarkerHeadings(blocks);
|
|
3857
4290
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
@@ -3902,11 +4335,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
3902
4335
|
}
|
|
3903
4336
|
return { visible, hiddenCount };
|
|
3904
4337
|
}
|
|
3905
|
-
function
|
|
3906
|
-
if (
|
|
3907
|
-
|
|
3908
|
-
const
|
|
3909
|
-
|
|
4338
|
+
function computeMedianFontSizeFromFreq(freq) {
|
|
4339
|
+
if (freq.size === 0) return 0;
|
|
4340
|
+
let total = 0;
|
|
4341
|
+
for (const count of freq.values()) total += count;
|
|
4342
|
+
const sorted = [...freq.entries()].sort((a, b) => a[0] - b[0]);
|
|
4343
|
+
const mid = Math.floor(total / 2);
|
|
4344
|
+
let cumulative = 0;
|
|
4345
|
+
for (const [size, count] of sorted) {
|
|
4346
|
+
cumulative += count;
|
|
4347
|
+
if (cumulative > mid) return size;
|
|
4348
|
+
}
|
|
4349
|
+
return sorted[sorted.length - 1][0];
|
|
3910
4350
|
}
|
|
3911
4351
|
function detectHeadings(blocks, medianFontSize) {
|
|
3912
4352
|
for (const block of blocks) {
|
|
@@ -3926,220 +4366,27 @@ function detectHeadings(blocks, medianFontSize) {
|
|
|
3926
4366
|
}
|
|
3927
4367
|
}
|
|
3928
4368
|
}
|
|
3929
|
-
function mergeAdjacentHeadings(blocks) {
|
|
3930
|
-
let i = 0;
|
|
3931
|
-
while (i < blocks.length - 1) {
|
|
3932
|
-
const curr = blocks[i];
|
|
3933
|
-
const next = blocks[i + 1];
|
|
3934
|
-
if (curr.type !== "heading" || next.type !== "heading") {
|
|
3935
|
-
i++;
|
|
3936
|
-
continue;
|
|
3937
|
-
}
|
|
3938
|
-
if (!curr.bbox || !next.bbox || !curr.text || !next.text) {
|
|
3939
|
-
i++;
|
|
3940
|
-
continue;
|
|
3941
|
-
}
|
|
3942
|
-
const currBaseline = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
|
|
3943
|
-
const nextBaseline = next.bbox.y + (next.style?.fontSize || next.bbox.height);
|
|
3944
|
-
const yDiff = Math.abs(currBaseline - nextBaseline);
|
|
3945
|
-
const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
|
|
3946
|
-
const sameY = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
|
|
3947
|
-
const sameLevel = curr.level === next.level;
|
|
3948
|
-
if (sameY && sameLevel) {
|
|
3949
|
-
const currX = curr.bbox.x;
|
|
3950
|
-
const nextX = next.bbox.x;
|
|
3951
|
-
if (currX <= nextX) {
|
|
3952
|
-
curr.text = curr.text + " " + next.text;
|
|
3953
|
-
} else {
|
|
3954
|
-
curr.text = next.text + " " + curr.text;
|
|
3955
|
-
}
|
|
3956
|
-
curr.bbox = {
|
|
3957
|
-
page: curr.bbox.page,
|
|
3958
|
-
x: Math.min(curr.bbox.x, next.bbox.x),
|
|
3959
|
-
y: Math.min(curr.bbox.y, next.bbox.y),
|
|
3960
|
-
width: Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width) - Math.min(curr.bbox.x, next.bbox.x),
|
|
3961
|
-
height: Math.max(curr.bbox.height, next.bbox.height)
|
|
3962
|
-
};
|
|
3963
|
-
blocks.splice(i + 1, 1);
|
|
3964
|
-
} else {
|
|
3965
|
-
i++;
|
|
3966
|
-
}
|
|
3967
|
-
}
|
|
3968
|
-
}
|
|
3969
4369
|
function collapseEvenSpacing(text) {
|
|
3970
4370
|
const tokens = text.split(" ");
|
|
3971
4371
|
const singleCharCount = tokens.filter((t) => t.length === 1).length;
|
|
3972
4372
|
if (tokens.length >= 3 && singleCharCount / tokens.length >= 0.7) {
|
|
3973
4373
|
return tokens.join("");
|
|
3974
4374
|
}
|
|
3975
|
-
return text
|
|
3976
|
-
}
|
|
3977
|
-
|
|
3978
|
-
const allY = items.map((i) => i.y);
|
|
3979
|
-
const pageHeight = Math.max(...allY) - Math.min(...allY);
|
|
3980
|
-
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
3981
|
-
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
3982
|
-
const blocks = [];
|
|
3983
|
-
for (const group of orderedGroups) {
|
|
3984
|
-
if (group.length === 0) continue;
|
|
3985
|
-
const yLines = groupByY(group);
|
|
3986
|
-
for (const line of yLines) {
|
|
3987
|
-
const text = mergeLineSimple(line);
|
|
3988
|
-
if (!text.trim()) continue;
|
|
3989
|
-
blocks.push({
|
|
3990
|
-
type: "paragraph",
|
|
3991
|
-
text,
|
|
3992
|
-
pageNumber: pageNum,
|
|
3993
|
-
bbox: computeBBox(line, pageNum),
|
|
3994
|
-
style: dominantStyle(line)
|
|
3995
|
-
});
|
|
3996
|
-
}
|
|
3997
|
-
}
|
|
3998
|
-
return blocks.length > 0 ? blocks : null;
|
|
3999
|
-
}
|
|
4000
|
-
function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
|
|
4001
|
-
const totalCells = table.cells.reduce((sum, row) => sum + row.filter((c) => c.text.trim()).length, 0);
|
|
4002
|
-
const totalTextLines = table.cells.reduce((sum, row) => sum + row.reduce((s, c) => s + (c.text.trim() ? c.text.split("\n").length : 0), 0), 0);
|
|
4003
|
-
const isUnderSegmented = table.rows === 1 && table.cols === 1 || totalCells <= 2 && totalTextLines >= 8 || totalCells <= 2 && items.length >= 6;
|
|
4004
|
-
if (!isUnderSegmented) return null;
|
|
4005
|
-
if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);
|
|
4006
|
-
const directTable = buildTableFromTextLayout(items, pageNum, bbox);
|
|
4007
|
-
if (directTable) return directTable;
|
|
4008
|
-
const clusterItems = items.map((i) => ({
|
|
4009
|
-
text: i.text,
|
|
4010
|
-
x: i.x,
|
|
4011
|
-
y: i.y,
|
|
4012
|
-
w: i.w,
|
|
4013
|
-
h: i.h,
|
|
4014
|
-
fontSize: i.fontSize,
|
|
4015
|
-
fontName: i.fontName
|
|
4016
|
-
}));
|
|
4017
|
-
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4018
|
-
if (clusterResults.length > 0) {
|
|
4019
|
-
const blocks = [];
|
|
4020
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4021
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4022
|
-
const usedIndices = /* @__PURE__ */ new Set();
|
|
4023
|
-
for (const cr of clusterResults) {
|
|
4024
|
-
for (const ci of cr.usedItems) {
|
|
4025
|
-
const idx = ciToIdx.get(ci);
|
|
4026
|
-
if (idx !== void 0) usedIndices.add(idx);
|
|
4027
|
-
}
|
|
4028
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4029
|
-
}
|
|
4030
|
-
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
4031
|
-
for (const item of remaining) {
|
|
4032
|
-
if (!item.text.trim()) continue;
|
|
4033
|
-
blocks.push({
|
|
4034
|
-
type: "paragraph",
|
|
4035
|
-
text: item.text,
|
|
4036
|
-
pageNumber: pageNum,
|
|
4037
|
-
bbox: computeBBox([item], pageNum),
|
|
4038
|
-
style: { fontSize: item.fontSize, fontName: item.fontName }
|
|
4039
|
-
});
|
|
4040
|
-
}
|
|
4041
|
-
blocks.sort((a, b) => {
|
|
4042
|
-
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4043
|
-
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4044
|
-
return by - ay;
|
|
4045
|
-
});
|
|
4046
|
-
return blocks.length > 0 ? blocks : null;
|
|
4047
|
-
}
|
|
4048
|
-
return null;
|
|
4049
|
-
}
|
|
4050
|
-
function buildTableFromTextLayout(items, pageNum, bbox) {
|
|
4051
|
-
if (items.length < 4) return null;
|
|
4052
|
-
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4053
|
-
const yTol = 3;
|
|
4054
|
-
const rows = [];
|
|
4055
|
-
let curRow = [sorted[0]];
|
|
4056
|
-
let curY = sorted[0].y;
|
|
4057
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
4058
|
-
if (Math.abs(sorted[i].y - curY) <= yTol) {
|
|
4059
|
-
curRow.push(sorted[i]);
|
|
4060
|
-
} else {
|
|
4061
|
-
rows.push(curRow);
|
|
4062
|
-
curRow = [sorted[i]];
|
|
4063
|
-
curY = sorted[i].y;
|
|
4064
|
-
}
|
|
4065
|
-
}
|
|
4066
|
-
rows.push(curRow);
|
|
4067
|
-
if (rows.length < 2) return null;
|
|
4068
|
-
const gapPositions = [];
|
|
4069
|
-
for (const row of rows) {
|
|
4070
|
-
if (row.length < 2) continue;
|
|
4071
|
-
const sortedX = [...row].sort((a, b) => a.x - b.x);
|
|
4072
|
-
const avgFs = sortedX.reduce((s, i) => s + i.fontSize, 0) / sortedX.length;
|
|
4073
|
-
for (let j = 1; j < sortedX.length; j++) {
|
|
4074
|
-
const gap = sortedX[j].x - (sortedX[j - 1].x + sortedX[j - 1].w);
|
|
4075
|
-
if (gap >= avgFs * 1.5) {
|
|
4076
|
-
gapPositions.push(sortedX[j - 1].x + sortedX[j - 1].w + gap / 2);
|
|
4077
|
-
}
|
|
4078
|
-
}
|
|
4079
|
-
}
|
|
4080
|
-
if (gapPositions.length < 2) return null;
|
|
4081
|
-
gapPositions.sort((a, b) => a - b);
|
|
4082
|
-
const colBoundaries = [];
|
|
4083
|
-
let clusterSum = gapPositions[0], clusterCount = 1;
|
|
4084
|
-
for (let i = 1; i < gapPositions.length; i++) {
|
|
4085
|
-
const avg = clusterSum / clusterCount;
|
|
4086
|
-
if (Math.abs(gapPositions[i] - avg) <= 15) {
|
|
4087
|
-
clusterSum += gapPositions[i];
|
|
4088
|
-
clusterCount++;
|
|
4089
|
-
} else {
|
|
4090
|
-
if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
|
|
4091
|
-
clusterSum = gapPositions[i];
|
|
4092
|
-
clusterCount = 1;
|
|
4093
|
-
}
|
|
4094
|
-
}
|
|
4095
|
-
if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
|
|
4096
|
-
if (colBoundaries.length === 0) return null;
|
|
4097
|
-
const numCols = colBoundaries.length + 1;
|
|
4098
|
-
const tableRows = [];
|
|
4099
|
-
for (const row of rows) {
|
|
4100
|
-
const cells = Array(numCols).fill("");
|
|
4101
|
-
const sortedX = [...row].sort((a, b) => a.x - b.x);
|
|
4102
|
-
for (const item of sortedX) {
|
|
4103
|
-
const cx = item.x + item.w / 2;
|
|
4104
|
-
let col = 0;
|
|
4105
|
-
for (let b = 0; b < colBoundaries.length; b++) {
|
|
4106
|
-
if (cx > colBoundaries[b]) col = b + 1;
|
|
4107
|
-
}
|
|
4108
|
-
cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
|
|
4109
|
-
}
|
|
4110
|
-
if (cells[0].trim() === "" && tableRows.length > 0) {
|
|
4111
|
-
const prevCells = tableRows[tableRows.length - 1].cells;
|
|
4112
|
-
for (let c = 0; c < numCols; c++) {
|
|
4113
|
-
if (cells[c].trim()) {
|
|
4114
|
-
prevCells[c] = prevCells[c] ? prevCells[c] + " " + cells[c].trim() : cells[c].trim();
|
|
4115
|
-
}
|
|
4116
|
-
}
|
|
4117
|
-
} else {
|
|
4118
|
-
tableRows.push({ cells });
|
|
4119
|
-
}
|
|
4120
|
-
}
|
|
4121
|
-
if (tableRows.length < 2) return null;
|
|
4122
|
-
const nonEmptyCount = tableRows.reduce((sum, r) => sum + r.cells.filter((c) => c.trim()).length, 0);
|
|
4123
|
-
const totalCount = tableRows.length * numCols;
|
|
4124
|
-
if (nonEmptyCount < totalCount * 0.3) return null;
|
|
4125
|
-
const irCells = tableRows.map(
|
|
4126
|
-
(r) => r.cells.map((text, colIdx) => {
|
|
4127
|
-
let cleaned = text.trim();
|
|
4128
|
-
if (colIdx > 0) cleaned = cleaned.replace(/^[•○·\-]\s*/, "");
|
|
4129
|
-
return { text: cleaned, colSpan: 1, rowSpan: 1 };
|
|
4130
|
-
})
|
|
4375
|
+
return text.replace(
|
|
4376
|
+
/(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
|
|
4377
|
+
(match) => match.replace(/ /g, "")
|
|
4131
4378
|
);
|
|
4132
|
-
const irTable = {
|
|
4133
|
-
rows: tableRows.length,
|
|
4134
|
-
cols: numCols,
|
|
4135
|
-
cells: irCells,
|
|
4136
|
-
hasHeader: tableRows.length > 1
|
|
4137
|
-
};
|
|
4138
|
-
return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
|
|
4139
4379
|
}
|
|
4140
4380
|
function shouldDemoteTable(table) {
|
|
4141
4381
|
const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
|
|
4142
4382
|
const allText = allCells.join(" ");
|
|
4383
|
+
if (table.rows <= 3 && table.cols <= 3) {
|
|
4384
|
+
const totalCells2 = table.rows * table.cols;
|
|
4385
|
+
const emptyCells2 = totalCells2 - allCells.length;
|
|
4386
|
+
if (emptyCells2 >= totalCells2 * 0.3) return true;
|
|
4387
|
+
if (/[□■◆○●▶ㅇ]/.test(allText)) return true;
|
|
4388
|
+
if (/<[^>]+>/.test(allText)) return true;
|
|
4389
|
+
}
|
|
4143
4390
|
if (allText.length > 200) return false;
|
|
4144
4391
|
if (/[□■◆○●▶]/.test(allText) && table.rows <= 3) return true;
|
|
4145
4392
|
const totalCells = table.rows * table.cols;
|
|
@@ -4183,32 +4430,6 @@ function detectMarkerHeadings(blocks) {
|
|
|
4183
4430
|
}
|
|
4184
4431
|
}
|
|
4185
4432
|
}
|
|
4186
|
-
function hasMultiColumnLayout(items) {
|
|
4187
|
-
if (items.length < 30) return false;
|
|
4188
|
-
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
4189
|
-
const minX = sorted[0].x;
|
|
4190
|
-
let maxX = minX;
|
|
4191
|
-
for (const i of sorted) if (i.x + i.w > maxX) maxX = i.x + i.w;
|
|
4192
|
-
const pageWidth = maxX - minX;
|
|
4193
|
-
if (pageWidth < 200) return false;
|
|
4194
|
-
let bestGap = 0;
|
|
4195
|
-
let bestSplit = 0;
|
|
4196
|
-
for (let j = 1; j < sorted.length; j++) {
|
|
4197
|
-
const gap = sorted[j].x - (sorted[j - 1].x + sorted[j - 1].w);
|
|
4198
|
-
if (gap > bestGap) {
|
|
4199
|
-
bestGap = gap;
|
|
4200
|
-
bestSplit = (sorted[j - 1].x + sorted[j - 1].w + sorted[j].x) / 2;
|
|
4201
|
-
}
|
|
4202
|
-
}
|
|
4203
|
-
if (bestGap < 20) return false;
|
|
4204
|
-
const splitRatio = (bestSplit - minX) / pageWidth;
|
|
4205
|
-
if (splitRatio < 0.35 || splitRatio > 0.65) return false;
|
|
4206
|
-
const leftCount = items.filter((i) => i.x + i.w / 2 < bestSplit).length;
|
|
4207
|
-
const rightCount = items.filter((i) => i.x + i.w / 2 >= bestSplit).length;
|
|
4208
|
-
if (leftCount < 15 || rightCount < 15) return false;
|
|
4209
|
-
if (Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount) < 0.35) return false;
|
|
4210
|
-
return true;
|
|
4211
|
-
}
|
|
4212
4433
|
var MAX_XYCUT_DEPTH = 50;
|
|
4213
4434
|
function xyCutOrder(items, gapThreshold, depth = 0) {
|
|
4214
4435
|
if (items.length === 0) return [];
|
|
@@ -4276,6 +4497,7 @@ function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeigh
|
|
|
4276
4497
|
if (items.length === 0) return [];
|
|
4277
4498
|
let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
|
|
4278
4499
|
({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
|
|
4500
|
+
({ horizontals, verticals } = preprocessLines(horizontals, verticals));
|
|
4279
4501
|
const grids = buildTableGrids(horizontals, verticals);
|
|
4280
4502
|
if (grids.length > 0) {
|
|
4281
4503
|
return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
|
|
@@ -4287,14 +4509,19 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4287
4509
|
const usedItems = /* @__PURE__ */ new Set();
|
|
4288
4510
|
const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
4289
4511
|
for (const grid of sortedGrids) {
|
|
4512
|
+
const numGridRows = grid.rowYs.length - 1;
|
|
4513
|
+
const numGridCols = grid.colXs.length - 1;
|
|
4514
|
+
if (numGridRows === 1 && numGridCols >= 2) continue;
|
|
4290
4515
|
const tableItems = [];
|
|
4291
4516
|
const pad = 3;
|
|
4517
|
+
const gridW = grid.bbox.x2 - grid.bbox.x1;
|
|
4292
4518
|
for (const item of items) {
|
|
4293
4519
|
if (usedItems.has(item)) continue;
|
|
4294
|
-
if (item.
|
|
4295
|
-
|
|
4296
|
-
|
|
4297
|
-
|
|
4520
|
+
if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
|
|
4521
|
+
if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
|
|
4522
|
+
if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
|
|
4523
|
+
tableItems.push(item);
|
|
4524
|
+
usedItems.add(item);
|
|
4298
4525
|
}
|
|
4299
4526
|
const cells = extractCells(grid, horizontals, verticals);
|
|
4300
4527
|
if (cells.length === 0) continue;
|
|
@@ -4318,6 +4545,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4318
4545
|
const cellItems = cellTextMap.get(cell) || [];
|
|
4319
4546
|
let text = cellTextToString(cellItems);
|
|
4320
4547
|
text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
|
|
4548
|
+
text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
|
|
4321
4549
|
irGrid[cell.row][cell.col] = {
|
|
4322
4550
|
text,
|
|
4323
4551
|
colSpan: cell.colSpan,
|
|
@@ -4339,31 +4567,61 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4339
4567
|
width: grid.bbox.x2 - grid.bbox.x1,
|
|
4340
4568
|
height: grid.bbox.y2 - grid.bbox.y1
|
|
4341
4569
|
};
|
|
4342
|
-
const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
|
|
4343
|
-
if (normalized) {
|
|
4344
|
-
blocks.push(...normalized);
|
|
4345
|
-
continue;
|
|
4346
|
-
}
|
|
4347
4570
|
if (shouldDemoteTable(irTable)) {
|
|
4348
4571
|
const demoted = demoteTableToText(irTable);
|
|
4349
4572
|
if (demoted) {
|
|
4350
|
-
|
|
4573
|
+
const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
|
|
4574
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
|
|
4351
4575
|
}
|
|
4352
4576
|
continue;
|
|
4353
4577
|
}
|
|
4354
4578
|
blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
|
|
4355
4579
|
}
|
|
4356
|
-
|
|
4580
|
+
let remaining = items.filter((i) => !usedItems.has(i));
|
|
4357
4581
|
if (remaining.length > 0) {
|
|
4358
4582
|
remaining.sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4359
|
-
const
|
|
4360
|
-
|
|
4361
|
-
|
|
4583
|
+
const clusterItems = remaining.map((i) => ({
|
|
4584
|
+
text: i.text,
|
|
4585
|
+
x: i.x,
|
|
4586
|
+
y: i.y,
|
|
4587
|
+
w: i.w,
|
|
4588
|
+
h: i.h,
|
|
4589
|
+
fontSize: i.fontSize,
|
|
4590
|
+
fontName: i.fontName
|
|
4591
|
+
}));
|
|
4592
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4593
|
+
if (clusterResults.length > 0) {
|
|
4594
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4595
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4596
|
+
const usedClusterIndices = /* @__PURE__ */ new Set();
|
|
4597
|
+
for (const cr of clusterResults) {
|
|
4598
|
+
for (const ci of cr.usedItems) {
|
|
4599
|
+
const idx = ciToIdx.get(ci);
|
|
4600
|
+
if (idx !== void 0) usedClusterIndices.add(idx);
|
|
4601
|
+
}
|
|
4602
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4603
|
+
}
|
|
4604
|
+
remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
|
|
4605
|
+
}
|
|
4606
|
+
if (remaining.length > 0) {
|
|
4607
|
+
const allY = remaining.map((i) => i.y);
|
|
4608
|
+
const pageH = safeMax(allY) - safeMin(allY);
|
|
4609
|
+
const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
|
|
4610
|
+
const textBlocks = [];
|
|
4611
|
+
for (const group of groups) {
|
|
4612
|
+
if (group.length === 0) continue;
|
|
4613
|
+
const groupBlocks = extractPageBlocksFallback(group, pageNum);
|
|
4614
|
+
for (const b of groupBlocks) textBlocks.push(b);
|
|
4615
|
+
}
|
|
4616
|
+
const finalTextBlocks = detectListBlocks(textBlocks);
|
|
4617
|
+
for (const b of finalTextBlocks) blocks.push(b);
|
|
4618
|
+
}
|
|
4619
|
+
blocks.sort((a, b) => {
|
|
4362
4620
|
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4363
4621
|
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4364
4622
|
return by - ay;
|
|
4365
4623
|
});
|
|
4366
|
-
return mergeAdjacentTableBlocks(
|
|
4624
|
+
return mergeAdjacentTableBlocks(blocks);
|
|
4367
4625
|
}
|
|
4368
4626
|
return mergeAdjacentTableBlocks(blocks);
|
|
4369
4627
|
}
|
|
@@ -4389,57 +4647,53 @@ function mergeAdjacentTableBlocks(blocks) {
|
|
|
4389
4647
|
}
|
|
4390
4648
|
function extractPageBlocksFallback(items, pageNum) {
|
|
4391
4649
|
if (items.length === 0) return [];
|
|
4392
|
-
if (hasMultiColumnLayout(items)) {
|
|
4393
|
-
const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
|
|
4394
|
-
return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
|
|
4395
|
-
}
|
|
4396
4650
|
const blocks = [];
|
|
4397
|
-
const
|
|
4398
|
-
|
|
4399
|
-
|
|
4400
|
-
|
|
4401
|
-
|
|
4402
|
-
|
|
4403
|
-
|
|
4404
|
-
|
|
4405
|
-
|
|
4406
|
-
|
|
4407
|
-
|
|
4408
|
-
|
|
4409
|
-
|
|
4410
|
-
|
|
4411
|
-
|
|
4412
|
-
|
|
4413
|
-
|
|
4414
|
-
|
|
4415
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4416
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4417
|
-
const usedIndices = /* @__PURE__ */ new Set();
|
|
4418
|
-
for (const cr of clusterResults) {
|
|
4419
|
-
for (const ci of cr.usedItems) {
|
|
4420
|
-
const idx = ciToIdx.get(ci);
|
|
4421
|
-
if (idx !== void 0) usedIndices.add(idx);
|
|
4422
|
-
}
|
|
4423
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4651
|
+
const clusterItems = items.map((i) => ({
|
|
4652
|
+
text: i.text,
|
|
4653
|
+
x: i.x,
|
|
4654
|
+
y: i.y,
|
|
4655
|
+
w: i.w,
|
|
4656
|
+
h: i.h,
|
|
4657
|
+
fontSize: i.fontSize,
|
|
4658
|
+
fontName: i.fontName
|
|
4659
|
+
}));
|
|
4660
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4661
|
+
if (clusterResults.length > 0) {
|
|
4662
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4663
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4664
|
+
const usedIndices = /* @__PURE__ */ new Set();
|
|
4665
|
+
for (const cr of clusterResults) {
|
|
4666
|
+
for (const ci of cr.usedItems) {
|
|
4667
|
+
const idx = ciToIdx.get(ci);
|
|
4668
|
+
if (idx !== void 0) usedIndices.add(idx);
|
|
4424
4669
|
}
|
|
4425
|
-
|
|
4426
|
-
|
|
4427
|
-
|
|
4428
|
-
|
|
4429
|
-
|
|
4430
|
-
|
|
4431
|
-
|
|
4432
|
-
|
|
4433
|
-
|
|
4670
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4671
|
+
}
|
|
4672
|
+
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
4673
|
+
if (remaining.length > 0) {
|
|
4674
|
+
const yLines = groupByY(remaining);
|
|
4675
|
+
for (const line of yLines) {
|
|
4676
|
+
const text = mergeLineSimple(line);
|
|
4677
|
+
if (!text.trim()) continue;
|
|
4678
|
+
const bbox = computeBBox(line, pageNum);
|
|
4679
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
|
|
4434
4680
|
}
|
|
4435
|
-
|
|
4436
|
-
|
|
4437
|
-
|
|
4438
|
-
|
|
4439
|
-
|
|
4681
|
+
}
|
|
4682
|
+
blocks.sort((a, b) => {
|
|
4683
|
+
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4684
|
+
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4685
|
+
return by - ay;
|
|
4686
|
+
});
|
|
4687
|
+
} else {
|
|
4688
|
+
const allYLines = groupByY(items);
|
|
4689
|
+
const columns = detectColumns(allYLines);
|
|
4690
|
+
if (columns && columns.length >= 3) {
|
|
4691
|
+
const tableText = extractWithColumns(allYLines, columns);
|
|
4692
|
+
const bbox = computeBBox(items, pageNum);
|
|
4693
|
+
blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
|
|
4440
4694
|
} else {
|
|
4441
4695
|
const allY = items.map((i) => i.y);
|
|
4442
|
-
const pageHeight =
|
|
4696
|
+
const pageHeight = safeMax(allY) - safeMin(allY);
|
|
4443
4697
|
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
4444
4698
|
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
4445
4699
|
for (const group of orderedGroups) {
|
|
@@ -4492,22 +4746,76 @@ function dominantStyle(items) {
|
|
|
4492
4746
|
return { fontSize: dominantSize, fontName };
|
|
4493
4747
|
}
|
|
4494
4748
|
function normalizeItems(rawItems) {
|
|
4495
|
-
|
|
4749
|
+
const items = [];
|
|
4750
|
+
const spacePositions = [];
|
|
4751
|
+
for (const i of rawItems) {
|
|
4752
|
+
if (typeof i.str !== "string") continue;
|
|
4753
|
+
const x = Math.round(i.transform[4]);
|
|
4754
|
+
const y = Math.round(i.transform[5]);
|
|
4755
|
+
if (!i.str.trim()) {
|
|
4756
|
+
spacePositions.push({ x, y });
|
|
4757
|
+
continue;
|
|
4758
|
+
}
|
|
4496
4759
|
const scaleY = Math.abs(i.transform[3]);
|
|
4497
4760
|
const scaleX = Math.abs(i.transform[0]);
|
|
4498
4761
|
const fontSize = Math.round(Math.max(scaleY, scaleX));
|
|
4499
|
-
|
|
4500
|
-
|
|
4501
|
-
|
|
4502
|
-
|
|
4503
|
-
|
|
4504
|
-
|
|
4505
|
-
|
|
4506
|
-
|
|
4507
|
-
|
|
4508
|
-
|
|
4509
|
-
|
|
4510
|
-
|
|
4762
|
+
const w = Math.round(i.width);
|
|
4763
|
+
const h = Math.round(i.height);
|
|
4764
|
+
const isHidden = fontSize === 0 || i.width === 0 && i.str.trim().length > 0;
|
|
4765
|
+
let text = i.str.trim();
|
|
4766
|
+
if (/^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text)) {
|
|
4767
|
+
text = text.replace(/ /g, "");
|
|
4768
|
+
}
|
|
4769
|
+
const split = splitEvenSpacedItem(text, x, w, fontSize);
|
|
4770
|
+
if (split) {
|
|
4771
|
+
for (const s of split) {
|
|
4772
|
+
items.push({ text: s.text, x: s.x, y, w: s.w, h, fontSize, fontName: i.fontName || "", isHidden });
|
|
4773
|
+
}
|
|
4774
|
+
} else {
|
|
4775
|
+
items.push({ text, x, y, w, h, fontSize, fontName: i.fontName || "", isHidden });
|
|
4776
|
+
}
|
|
4777
|
+
}
|
|
4778
|
+
const sorted = items.sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4779
|
+
const deduped = [];
|
|
4780
|
+
for (let i = 0; i < sorted.length; i++) {
|
|
4781
|
+
let isDup = false;
|
|
4782
|
+
for (let j = deduped.length - 1; j >= 0; j--) {
|
|
4783
|
+
const prev = deduped[j];
|
|
4784
|
+
if (prev.y - sorted[i].y > 3) break;
|
|
4785
|
+
if (Math.abs(prev.y - sorted[i].y) <= 3 && prev.text === sorted[i].text && Math.abs(prev.x - sorted[i].x) <= 3) {
|
|
4786
|
+
isDup = true;
|
|
4787
|
+
break;
|
|
4788
|
+
}
|
|
4789
|
+
}
|
|
4790
|
+
if (!isDup) deduped.push(sorted[i]);
|
|
4791
|
+
}
|
|
4792
|
+
if (spacePositions.length > 0) {
|
|
4793
|
+
for (const item of deduped) {
|
|
4794
|
+
for (const sp of spacePositions) {
|
|
4795
|
+
if (Math.abs(sp.y - item.y) <= 3) {
|
|
4796
|
+
const dist = item.x - sp.x;
|
|
4797
|
+
if (dist >= 0 && dist <= 20) {
|
|
4798
|
+
item.hasSpaceBefore = true;
|
|
4799
|
+
break;
|
|
4800
|
+
}
|
|
4801
|
+
}
|
|
4802
|
+
}
|
|
4803
|
+
}
|
|
4804
|
+
}
|
|
4805
|
+
return deduped;
|
|
4806
|
+
}
|
|
4807
|
+
function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
|
|
4808
|
+
if (!/^[가-힣\d](?: [가-힣\d]){2,}$/.test(text)) return null;
|
|
4809
|
+
const chars = text.split(" ");
|
|
4810
|
+
if (chars.length < 3) return null;
|
|
4811
|
+
const charW = itemW / chars.length;
|
|
4812
|
+
if (charW > fontSize * 2) return null;
|
|
4813
|
+
return chars.map((ch, idx) => ({
|
|
4814
|
+
text: ch,
|
|
4815
|
+
x: Math.round(itemX + idx * charW),
|
|
4816
|
+
w: Math.round(charW * 0.8)
|
|
4817
|
+
// 실제 글자 폭은 간격보다 좁음
|
|
4818
|
+
}));
|
|
4511
4819
|
}
|
|
4512
4820
|
function groupByY(items) {
|
|
4513
4821
|
if (items.length === 0) return [];
|
|
@@ -4532,14 +4840,14 @@ function isProseSpread(items) {
|
|
|
4532
4840
|
for (let i = 1; i < sorted.length; i++) {
|
|
4533
4841
|
gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
|
|
4534
4842
|
}
|
|
4535
|
-
const maxGap =
|
|
4843
|
+
const maxGap = safeMax(gaps);
|
|
4536
4844
|
const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
|
|
4537
4845
|
return maxGap < 40 && avgLen < 5;
|
|
4538
4846
|
}
|
|
4539
4847
|
function detectColumns(yLines) {
|
|
4540
4848
|
const allItems = yLines.flat();
|
|
4541
4849
|
if (allItems.length === 0) return null;
|
|
4542
|
-
const pageWidth =
|
|
4850
|
+
const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
|
|
4543
4851
|
if (pageWidth < 100) return null;
|
|
4544
4852
|
let bigoLineIdx = -1;
|
|
4545
4853
|
for (let i = 0; i < yLines.length; i++) {
|
|
@@ -4571,7 +4879,7 @@ function detectColumns(yLines) {
|
|
|
4571
4879
|
}
|
|
4572
4880
|
const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
|
|
4573
4881
|
if (peaks.length < 3) return null;
|
|
4574
|
-
const MERGE_TOL =
|
|
4882
|
+
const MERGE_TOL = 40;
|
|
4575
4883
|
const merged = [peaks[0]];
|
|
4576
4884
|
for (let i = 1; i < peaks.length; i++) {
|
|
4577
4885
|
const prev = merged[merged.length - 1];
|
|
@@ -4585,7 +4893,14 @@ function detectColumns(yLines) {
|
|
|
4585
4893
|
merged.push({ ...peaks[i] });
|
|
4586
4894
|
}
|
|
4587
4895
|
}
|
|
4588
|
-
const
|
|
4896
|
+
const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
|
|
4897
|
+
if (rawColumns.length < 3) return null;
|
|
4898
|
+
const MIN_DETECT_COL_WIDTH = 30;
|
|
4899
|
+
const columns = [rawColumns[0]];
|
|
4900
|
+
for (let i = 1; i < rawColumns.length; i++) {
|
|
4901
|
+
if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
|
|
4902
|
+
columns.push(rawColumns[i]);
|
|
4903
|
+
}
|
|
4589
4904
|
return columns.length >= 3 ? columns : null;
|
|
4590
4905
|
}
|
|
4591
4906
|
function findColumn(x, columns) {
|
|
@@ -4713,6 +5028,16 @@ function buildGridTable(lines, columns) {
|
|
|
4713
5028
|
}
|
|
4714
5029
|
merged.splice(0, headerEnd, headerRow);
|
|
4715
5030
|
}
|
|
5031
|
+
for (const row of merged) {
|
|
5032
|
+
for (let c = 0; c < row.length; c++) {
|
|
5033
|
+
if (row[c]) row[c] = collapseEvenSpacing(row[c]);
|
|
5034
|
+
}
|
|
5035
|
+
}
|
|
5036
|
+
const totalCells = merged.length * numCols;
|
|
5037
|
+
const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
|
|
5038
|
+
if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
|
|
5039
|
+
return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
|
|
5040
|
+
}
|
|
4716
5041
|
const md = [];
|
|
4717
5042
|
md.push("| " + merged[0].join(" | ") + " |");
|
|
4718
5043
|
md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
|
|
@@ -4724,12 +5049,32 @@ function buildGridTable(lines, columns) {
|
|
|
4724
5049
|
function mergeLineSimple(items) {
|
|
4725
5050
|
if (items.length <= 1) return items[0]?.text || "";
|
|
4726
5051
|
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
5052
|
+
const isEvenSpaced = detectEvenSpacedItems(sorted);
|
|
4727
5053
|
let result = sorted[0].text;
|
|
4728
5054
|
for (let i = 1; i < sorted.length; i++) {
|
|
4729
5055
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
4730
5056
|
const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
|
|
4731
|
-
|
|
4732
|
-
|
|
5057
|
+
const tabThreshold = Math.max(avgFs * 2, 30);
|
|
5058
|
+
if (gap > tabThreshold) {
|
|
5059
|
+
result += " ";
|
|
5060
|
+
result += sorted[i].text;
|
|
5061
|
+
continue;
|
|
5062
|
+
}
|
|
5063
|
+
if (isEvenSpaced[i]) {
|
|
5064
|
+
result += sorted[i].text;
|
|
5065
|
+
continue;
|
|
5066
|
+
}
|
|
5067
|
+
if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
|
|
5068
|
+
result += " ";
|
|
5069
|
+
result += sorted[i].text;
|
|
5070
|
+
continue;
|
|
5071
|
+
}
|
|
5072
|
+
if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
|
|
5073
|
+
result += " ";
|
|
5074
|
+
result += sorted[i].text;
|
|
5075
|
+
continue;
|
|
5076
|
+
}
|
|
5077
|
+
if (gap < avgFs * 0.15) {
|
|
4733
5078
|
} else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
|
|
4734
5079
|
} else if (gap > 3) result += " ";
|
|
4735
5080
|
result += sorted[i].text;
|
|
@@ -4738,8 +5083,8 @@ function mergeLineSimple(items) {
|
|
|
4738
5083
|
}
|
|
4739
5084
|
function cleanPdfText(text) {
|
|
4740
5085
|
return mergeKoreanLines(
|
|
4741
|
-
text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
|
|
4742
|
-
).replace(/^(?!\|)
|
|
5086
|
+
text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
|
|
5087
|
+
).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
|
|
4743
5088
|
}
|
|
4744
5089
|
function startsWithMarker(line) {
|
|
4745
5090
|
const t = line.trimStart();
|
|
@@ -4931,7 +5276,7 @@ function mergeKoreanLines(text) {
|
|
|
4931
5276
|
result[result.length - 1] = prev + " " + currTrimmed;
|
|
4932
5277
|
continue;
|
|
4933
5278
|
}
|
|
4934
|
-
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
|
|
5279
|
+
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev) && !startsWithMarker(prev)) {
|
|
4935
5280
|
result[result.length - 1] = prev + " " + curr;
|
|
4936
5281
|
} else {
|
|
4937
5282
|
result.push(curr);
|
|
@@ -4979,7 +5324,7 @@ function getTextContent(el) {
|
|
|
4979
5324
|
return el.textContent?.trim() ?? "";
|
|
4980
5325
|
}
|
|
4981
5326
|
function parseXml(text) {
|
|
4982
|
-
return new import_xmldom2.DOMParser().parseFromString(text, "text/xml");
|
|
5327
|
+
return new import_xmldom2.DOMParser().parseFromString(stripDtd(text), "text/xml");
|
|
4983
5328
|
}
|
|
4984
5329
|
function parseSharedStrings(xml) {
|
|
4985
5330
|
const doc = parseXml(xml);
|
|
@@ -5266,7 +5611,7 @@ function getAttr(el, localName) {
|
|
|
5266
5611
|
return null;
|
|
5267
5612
|
}
|
|
5268
5613
|
function parseXml2(text) {
|
|
5269
|
-
return new import_xmldom3.DOMParser().parseFromString(text, "text/xml");
|
|
5614
|
+
return new import_xmldom3.DOMParser().parseFromString(stripDtd(text), "text/xml");
|
|
5270
5615
|
}
|
|
5271
5616
|
function parseStyles(xml) {
|
|
5272
5617
|
const doc = parseXml2(xml);
|
|
@@ -5666,7 +6011,13 @@ function normalize(s) {
|
|
|
5666
6011
|
}
|
|
5667
6012
|
var MAX_LEVENSHTEIN_LEN = 1e4;
|
|
5668
6013
|
function levenshtein(a, b) {
|
|
5669
|
-
if (a.length + b.length > MAX_LEVENSHTEIN_LEN)
|
|
6014
|
+
if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
|
|
6015
|
+
const sampleLen = Math.min(500, a.length, b.length);
|
|
6016
|
+
let diffs = 0;
|
|
6017
|
+
for (let i = 0; i < sampleLen; i++) if (a[i] !== b[i]) diffs++;
|
|
6018
|
+
const sampleRate = sampleLen > 0 ? diffs / sampleLen : 1;
|
|
6019
|
+
return Math.abs(a.length - b.length) + Math.round(Math.min(a.length, b.length) * sampleRate);
|
|
6020
|
+
}
|
|
5670
6021
|
if (a.length > b.length) [a, b] = [b, a];
|
|
5671
6022
|
const m = a.length;
|
|
5672
6023
|
const n = b.length;
|
|
@@ -5949,13 +6300,20 @@ function extractInlineFields(text) {
|
|
|
5949
6300
|
|
|
5950
6301
|
// src/hwpx/generator.ts
|
|
5951
6302
|
var import_jszip5 = __toESM(require("jszip"), 1);
|
|
5952
|
-
var
|
|
6303
|
+
var NS_SECTION = "http://www.hancom.co.kr/hwpml/2011/section";
|
|
6304
|
+
var NS_PARA = "http://www.hancom.co.kr/hwpml/2011/paragraph";
|
|
6305
|
+
var NS_HEAD = "http://www.hancom.co.kr/hwpml/2011/head";
|
|
6306
|
+
var NS_OPF = "http://www.idpf.org/2007/opf/";
|
|
6307
|
+
var NS_HPF = "http://www.hancom.co.kr/schema/2011/hpf";
|
|
6308
|
+
var NS_OCF = "urn:oasis:names:tc:opendocument:xmlns:container";
|
|
5953
6309
|
async function markdownToHwpx(markdown) {
|
|
5954
6310
|
const blocks = parseMarkdownToBlocks(markdown);
|
|
5955
6311
|
const sectionXml = blocksToSectionXml(blocks);
|
|
5956
6312
|
const zip = new import_jszip5.default();
|
|
5957
6313
|
zip.file("mimetype", "application/hwp+zip", { compression: "STORE" });
|
|
6314
|
+
zip.file("META-INF/container.xml", generateContainerXml());
|
|
5958
6315
|
zip.file("Contents/content.hpf", generateManifest());
|
|
6316
|
+
zip.file("Contents/header.xml", generateHeaderXml());
|
|
5959
6317
|
zip.file("Contents/section0.xml", sectionXml);
|
|
5960
6318
|
return await zip.generateAsync({ type: "arraybuffer" });
|
|
5961
6319
|
}
|
|
@@ -6000,8 +6358,111 @@ function parseMarkdownToBlocks(md) {
|
|
|
6000
6358
|
function escapeXml(text) {
|
|
6001
6359
|
return text.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, """);
|
|
6002
6360
|
}
|
|
6361
|
+
function generateContainerXml() {
|
|
6362
|
+
return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
|
6363
|
+
<ocf:container xmlns:ocf="${NS_OCF}" xmlns:hpf="${NS_HPF}">
|
|
6364
|
+
<ocf:rootfiles>
|
|
6365
|
+
<ocf:rootfile full-path="Contents/content.hpf" media-type="application/hwpml-package+xml"/>
|
|
6366
|
+
</ocf:rootfiles>
|
|
6367
|
+
</ocf:container>`;
|
|
6368
|
+
}
|
|
6369
|
+
function generateManifest() {
|
|
6370
|
+
return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
|
6371
|
+
<opf:package xmlns:opf="${NS_OPF}" xmlns:hpf="${NS_HPF}" xmlns:hh="${NS_HEAD}">
|
|
6372
|
+
<opf:manifest>
|
|
6373
|
+
<opf:item id="header" href="Contents/header.xml" media-type="application/xml"/>
|
|
6374
|
+
<opf:item id="section0" href="Contents/section0.xml" media-type="application/xml"/>
|
|
6375
|
+
</opf:manifest>
|
|
6376
|
+
<opf:spine>
|
|
6377
|
+
<opf:itemref idref="header" linear="no"/>
|
|
6378
|
+
<opf:itemref idref="section0" linear="yes"/>
|
|
6379
|
+
</opf:spine>
|
|
6380
|
+
</opf:package>`;
|
|
6381
|
+
}
|
|
6382
|
+
function generateHeaderXml() {
|
|
6383
|
+
return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
|
6384
|
+
<hh:head xmlns:hh="${NS_HEAD}" xmlns:hp="${NS_PARA}" version="1.4" secCnt="1">
|
|
6385
|
+
<hh:beginNum page="1" footnote="1" endnote="1" pic="1" tbl="1" equation="1"/>
|
|
6386
|
+
<hh:refList>
|
|
6387
|
+
<hh:fontfaces itemCnt="7">
|
|
6388
|
+
<hh:fontface lang="HANGUL" fontCnt="1">
|
|
6389
|
+
<hh:font id="0" face="\uD568\uCD08\uB86C\uBC14\uD0D5" type="TTF" isEmbedded="0">
|
|
6390
|
+
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="4" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
|
|
6391
|
+
</hh:font>
|
|
6392
|
+
</hh:fontface>
|
|
6393
|
+
<hh:fontface lang="LATIN" fontCnt="1">
|
|
6394
|
+
<hh:font id="0" face="Times New Roman" type="TTF" isEmbedded="0">
|
|
6395
|
+
<hh:typeInfo familyType="FCAT_OLDSTYLE" weight="5" proportion="4" contrast="2" strokeVariation="0" armStyle="0" letterform="0" midline="0" xHeight="4"/>
|
|
6396
|
+
</hh:font>
|
|
6397
|
+
</hh:fontface>
|
|
6398
|
+
<hh:fontface lang="HANJA" fontCnt="1">
|
|
6399
|
+
<hh:font id="0" face="\uD568\uCD08\uB86C\uBC14\uD0D5" type="TTF" isEmbedded="0">
|
|
6400
|
+
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="4" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
|
|
6401
|
+
</hh:font>
|
|
6402
|
+
</hh:fontface>
|
|
6403
|
+
<hh:fontface lang="JAPANESE" fontCnt="1">
|
|
6404
|
+
<hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
|
|
6405
|
+
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
|
|
6406
|
+
</hh:font>
|
|
6407
|
+
</hh:fontface>
|
|
6408
|
+
<hh:fontface lang="OTHER" fontCnt="1">
|
|
6409
|
+
<hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
|
|
6410
|
+
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
|
|
6411
|
+
</hh:font>
|
|
6412
|
+
</hh:fontface>
|
|
6413
|
+
<hh:fontface lang="SYMBOL" fontCnt="1">
|
|
6414
|
+
<hh:font id="0" face="Symbol" type="TTF" isEmbedded="0">
|
|
6415
|
+
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
|
|
6416
|
+
</hh:font>
|
|
6417
|
+
</hh:fontface>
|
|
6418
|
+
<hh:fontface lang="USER" fontCnt="1">
|
|
6419
|
+
<hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
|
|
6420
|
+
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
|
|
6421
|
+
</hh:font>
|
|
6422
|
+
</hh:fontface>
|
|
6423
|
+
</hh:fontfaces>
|
|
6424
|
+
<hh:borderFills itemCnt="1">
|
|
6425
|
+
<hh:borderFill id="0" threeD="0" shadow="0" centerLine="0" breakCellSeparateLine="0">
|
|
6426
|
+
<hh:slash type="NONE" Crooked="0" isCounter="0"/>
|
|
6427
|
+
<hh:backSlash type="NONE" Crooked="0" isCounter="0"/>
|
|
6428
|
+
<hh:leftBorder type="NONE" width="0.1mm" color="0"/>
|
|
6429
|
+
<hh:rightBorder type="NONE" width="0.1mm" color="0"/>
|
|
6430
|
+
<hh:topBorder type="NONE" width="0.1mm" color="0"/>
|
|
6431
|
+
<hh:bottomBorder type="NONE" width="0.1mm" color="0"/>
|
|
6432
|
+
<hh:diagonal type="NONE" width="0.1mm" color="0"/>
|
|
6433
|
+
<hh:fillInfo/>
|
|
6434
|
+
</hh:borderFill>
|
|
6435
|
+
</hh:borderFills>
|
|
6436
|
+
<hh:charProperties itemCnt="1">
|
|
6437
|
+
<hh:charPr id="0" height="1000" textColor="0" shadeColor="-1" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="0">
|
|
6438
|
+
<hh:fontRef hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
|
|
6439
|
+
<hh:ratio hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
|
|
6440
|
+
<hh:spacing hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
|
|
6441
|
+
<hh:relSz hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
|
|
6442
|
+
<hh:offset hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
|
|
6443
|
+
</hh:charPr>
|
|
6444
|
+
</hh:charProperties>
|
|
6445
|
+
<hh:tabProperties itemCnt="0"/>
|
|
6446
|
+
<hh:numberings itemCnt="0"/>
|
|
6447
|
+
<hh:bullets itemCnt="0"/>
|
|
6448
|
+
<hh:paraProperties itemCnt="1">
|
|
6449
|
+
<hh:paraPr id="0" tabIDRef="0" condense="0" fontLineHeight="0" snapToGrid="0" suppressOverlap="0" checked="0">
|
|
6450
|
+
<hh:parLineBreak lineBreak="BREAK_LINE" wordBreak="BREAK_WORD" breakLatinWord="BREAK_WORD" breakNonLatinWord="BREAK_WORD"/>
|
|
6451
|
+
<hh:parMargin left="0" right="0" prev="0" next="0" indent="0"/>
|
|
6452
|
+
<hh:parBorder borderFillIDRef="0" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>
|
|
6453
|
+
<hh:parShade borderFillIDRef="0"/>
|
|
6454
|
+
<hh:parTabList/>
|
|
6455
|
+
</hh:paraPr>
|
|
6456
|
+
</hh:paraProperties>
|
|
6457
|
+
<hh:styles itemCnt="1">
|
|
6458
|
+
<hh:style id="0" type="PARA" name="\uBC14\uD0D5\uAE00" engName="Normal" paraPrIDRef="0" charPrIDRef="0" nextStyleIDRef="0" langIDRef="1042" lockForm="0"/>
|
|
6459
|
+
</hh:styles>
|
|
6460
|
+
</hh:refList>
|
|
6461
|
+
<hh:compatibleDocument targetProgram="HWP2018"/>
|
|
6462
|
+
</hh:head>`;
|
|
6463
|
+
}
|
|
6003
6464
|
function generateParagraph(text) {
|
|
6004
|
-
return `<hp:p><hp:run><hp:t>${escapeXml(text)}</hp:t></hp:run></hp:p>`;
|
|
6465
|
+
return `<hp:p paraPrIDRef="0" styleIDRef="0"><hp:run charPrIDRef="0"><hp:t>${escapeXml(text)}</hp:t></hp:run></hp:p>`;
|
|
6005
6466
|
}
|
|
6006
6467
|
function generateTable(rows) {
|
|
6007
6468
|
const trElements = rows.map((row) => {
|
|
@@ -6025,22 +6486,11 @@ function blocksToSectionXml(blocks) {
|
|
|
6025
6486
|
return "";
|
|
6026
6487
|
}
|
|
6027
6488
|
}).join("\n ");
|
|
6028
|
-
return `<?xml version="1.0" encoding="UTF-8"?>
|
|
6029
|
-
<hs:sec xmlns:hs="${
|
|
6489
|
+
return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
|
6490
|
+
<hs:sec xmlns:hs="${NS_SECTION}" xmlns:hp="${NS_PARA}">
|
|
6030
6491
|
${body}
|
|
6031
6492
|
</hs:sec>`;
|
|
6032
6493
|
}
|
|
6033
|
-
function generateManifest() {
|
|
6034
|
-
return `<?xml version="1.0" encoding="UTF-8"?>
|
|
6035
|
-
<opf:package xmlns:opf="http://www.idpf.org/2007/opf">
|
|
6036
|
-
<opf:manifest>
|
|
6037
|
-
<opf:item id="s0" href="section0.xml" media-type="application/xml"/>
|
|
6038
|
-
</opf:manifest>
|
|
6039
|
-
<opf:spine>
|
|
6040
|
-
<opf:itemref idref="s0"/>
|
|
6041
|
-
</opf:spine>
|
|
6042
|
-
</opf:package>`;
|
|
6043
|
-
}
|
|
6044
6494
|
|
|
6045
6495
|
// src/index.ts
|
|
6046
6496
|
async function parse(input, options) {
|