kordoc 2.1.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +318 -302
- package/dist/{chunk-MOL7MDBG.js → chunk-3TBUDJDE.js} +1 -1
- package/dist/chunk-3TBUDJDE.js.map +1 -0
- package/dist/{chunk-GJ2S6IMC.js → chunk-FINXMRCH.js} +978 -586
- package/dist/chunk-FINXMRCH.js.map +1 -0
- package/dist/chunk-MUAWCQDY.js +52 -0
- package/dist/chunk-MUAWCQDY.js.map +1 -0
- package/dist/cli.js +13 -9
- package/dist/cli.js.map +1 -1
- package/dist/detect-63IGCXTH.js +18 -0
- package/dist/index.cjs +1003 -553
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +1003 -553
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +41 -12
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-OF5I4PQY.js +8 -0
- package/dist/{watch-X7IC7MLF.js → watch-Q6L4UBTC.js} +32 -16
- package/dist/watch-Q6L4UBTC.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-GJ2S6IMC.js.map +0 -1
- package/dist/chunk-MOL7MDBG.js.map +0 -1
- package/dist/chunk-PKIJLEV6.js +0 -93
- package/dist/chunk-PKIJLEV6.js.map +0 -1
- package/dist/page-range-737B4EZW.js +0 -8
- package/dist/utils-BWQ2RGUD.js +0 -22
- package/dist/watch-X7IC7MLF.js.map +0 -1
- /package/dist/{page-range-737B4EZW.js.map → detect-63IGCXTH.js.map} +0 -0
- /package/dist/{utils-BWQ2RGUD.js.map → page-range-OF5I4PQY.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -139,7 +139,7 @@ import { inflateRawSync } from "zlib";
|
|
|
139
139
|
import { DOMParser } from "@xmldom/xmldom";
|
|
140
140
|
|
|
141
141
|
// src/utils.ts
|
|
142
|
-
var VERSION = true ? "2.1.0" : "0.0.0-dev";
|
|
142
|
+
var VERSION = true ? "2.2.1" : "0.0.0-dev";
|
|
143
143
|
function toArrayBuffer(buf) {
|
|
144
144
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
145
145
|
return buf.buffer;
|
|
@@ -155,7 +155,8 @@ var KordocError = class extends Error {
|
|
|
155
155
|
function isPathTraversal(name) {
|
|
156
156
|
if (name.includes("\0")) return true;
|
|
157
157
|
const normalized = name.replace(/\\/g, "/");
|
|
158
|
-
|
|
158
|
+
const segments = normalized.split("/");
|
|
159
|
+
return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
159
160
|
}
|
|
160
161
|
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
161
162
|
try {
|
|
@@ -195,12 +196,25 @@ function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEnt
|
|
|
195
196
|
return { totalUncompressed: 0, entryCount: 0 };
|
|
196
197
|
}
|
|
197
198
|
}
|
|
199
|
+
function stripDtd(xml) {
|
|
200
|
+
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
201
|
+
}
|
|
198
202
|
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
199
203
|
function sanitizeHref(href) {
|
|
200
204
|
const trimmed = href.trim();
|
|
201
205
|
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
202
206
|
return trimmed;
|
|
203
207
|
}
|
|
208
|
+
function safeMin(arr) {
|
|
209
|
+
let min = Infinity;
|
|
210
|
+
for (let i = 0; i < arr.length; i++) if (arr[i] < min) min = arr[i];
|
|
211
|
+
return min;
|
|
212
|
+
}
|
|
213
|
+
function safeMax(arr) {
|
|
214
|
+
let max = -Infinity;
|
|
215
|
+
for (let i = 0; i < arr.length; i++) if (arr[i] > max) max = arr[i];
|
|
216
|
+
return max;
|
|
217
|
+
}
|
|
204
218
|
function classifyError(err) {
|
|
205
219
|
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
206
220
|
const msg = err.message;
|
|
@@ -275,6 +289,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
275
289
|
if (end > maxCols) maxCols = end;
|
|
276
290
|
}
|
|
277
291
|
}
|
|
292
|
+
if (maxCols > MAX_COLS) maxCols = MAX_COLS;
|
|
278
293
|
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
279
294
|
const grid = Array.from(
|
|
280
295
|
{ length: numRows },
|
|
@@ -284,7 +299,7 @@ function buildTableDirect(rows, numRows) {
|
|
|
284
299
|
for (const cell of row) {
|
|
285
300
|
const r = cell.rowAddr ?? 0;
|
|
286
301
|
const c = cell.colAddr ?? 0;
|
|
287
|
-
if (r >= numRows || c >= maxCols) continue;
|
|
302
|
+
if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
|
|
288
303
|
grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
|
|
289
304
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
290
305
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
@@ -313,9 +328,12 @@ function trimAndReturn(grid, numRows, maxCols) {
|
|
|
313
328
|
}
|
|
314
329
|
function convertTableToText(rows) {
|
|
315
330
|
return rows.map(
|
|
316
|
-
(row) => row.map((c) => c.text.trim().replace(/\n/g, " ")).filter(Boolean).join("
|
|
331
|
+
(row) => row.map((c) => c.text.trim().replace(/\n/g, " ").replace(/\|/g, "\\|")).filter(Boolean).join(" / ")
|
|
317
332
|
).filter(Boolean).join("\n");
|
|
318
333
|
}
|
|
334
|
+
function escapeGfm(text) {
|
|
335
|
+
return text.replace(/~/g, "\\~");
|
|
336
|
+
}
|
|
319
337
|
var HWP_SHAPE_ALT_TEXT_RE = /(?:모서리가 둥근 |둥근 )?(?:사각형|직사각형|정사각형|원|타원|삼각형|이등변 삼각형|직각 삼각형|선|직선|곡선|화살표|굵은 화살표|이중 화살표|오각형|육각형|팔각형|별|[4-8]점별|십자|십자형|구름|구름형|마름모|도넛|평행사변형|사다리꼴|부채꼴|호|반원|물결|번개|하트|빗금|블록 화살표|수식|표|그림|개체|그리기\s?개체|묶음\s?개체|글상자|수식\s?개체|OLE\s?개체)\s?입니다\.?/g;
|
|
320
338
|
function sanitizeText(text) {
|
|
321
339
|
let result = text.replace(/[\u{F0000}-\u{FFFFD}]/gu, "").replace(HWP_SHAPE_ALT_TEXT_RE, "").replace(/ +/g, " ").trim();
|
|
@@ -425,7 +443,7 @@ function blocksToMarkdown(blocks) {
|
|
|
425
443
|
if (block.footnoteText) {
|
|
426
444
|
text += ` (\uC8FC: ${block.footnoteText})`;
|
|
427
445
|
}
|
|
428
|
-
lines.push(text);
|
|
446
|
+
lines.push(escapeGfm(text), "");
|
|
429
447
|
} else if (block.type === "table" && block.table) {
|
|
430
448
|
if (lines.length > 0 && lines[lines.length - 1] !== "") {
|
|
431
449
|
lines.push("");
|
|
@@ -448,13 +466,13 @@ function tableToMarkdown(table) {
|
|
|
448
466
|
return content.split(/\n/).map((line) => {
|
|
449
467
|
const trimmed = line.trim();
|
|
450
468
|
if (!trimmed) return "";
|
|
451
|
-
if (/^\d+\.\s/.test(trimmed)) return `**${trimmed}**`;
|
|
452
|
-
if (/^[가-힣]\.\s/.test(trimmed)) return ` ${trimmed}`;
|
|
453
|
-
return trimmed;
|
|
469
|
+
if (/^\d+\.\s/.test(trimmed)) return `**${escapeGfm(trimmed)}**`;
|
|
470
|
+
if (/^[가-힣]\.\s/.test(trimmed)) return ` ${escapeGfm(trimmed)}`;
|
|
471
|
+
return escapeGfm(trimmed);
|
|
454
472
|
}).filter(Boolean).join("\n");
|
|
455
473
|
}
|
|
456
474
|
if (numCols === 1 && numRows >= 2) {
|
|
457
|
-
return cells.map((row) => sanitizeText(row[0].text).replace(/\n/g, " ")).filter(Boolean).join("\n");
|
|
475
|
+
return cells.map((row) => escapeGfm(sanitizeText(row[0].text)).replace(/\n/g, " ")).filter(Boolean).join("\n");
|
|
458
476
|
}
|
|
459
477
|
const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
|
|
460
478
|
const skip = /* @__PURE__ */ new Set();
|
|
@@ -463,15 +481,12 @@ function tableToMarkdown(table) {
|
|
|
463
481
|
if (skip.has(`${r},${c}`)) continue;
|
|
464
482
|
const cell = cells[r]?.[c];
|
|
465
483
|
if (!cell) continue;
|
|
466
|
-
display[r][c] = sanitizeText(cell.text).replace(/\n/g, "<br>");
|
|
484
|
+
display[r][c] = escapeGfm(sanitizeText(cell.text)).replace(/\|/g, "\\|").replace(/\n/g, "<br>");
|
|
467
485
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
468
486
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
469
487
|
if (dr === 0 && dc === 0) continue;
|
|
470
488
|
if (r + dr < numRows && c + dc < numCols) {
|
|
471
489
|
skip.add(`${r + dr},${c + dc}`);
|
|
472
|
-
if (dr === 0) {
|
|
473
|
-
display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
|
|
474
|
-
}
|
|
475
490
|
}
|
|
476
491
|
}
|
|
477
492
|
}
|
|
@@ -607,9 +622,6 @@ function parseStyleElements(doc, map) {
|
|
|
607
622
|
}
|
|
608
623
|
}
|
|
609
624
|
}
|
|
610
|
-
function stripDtd(xml) {
|
|
611
|
-
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
612
|
-
}
|
|
613
625
|
async function parseHwpxDocument(buffer, options) {
|
|
614
626
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
615
627
|
let zip;
|
|
@@ -959,7 +971,8 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
959
971
|
if (newTable.rows.length > 0) {
|
|
960
972
|
if (tableStack.length > 0) {
|
|
961
973
|
const parentTable = tableStack.pop();
|
|
962
|
-
|
|
974
|
+
let nestedCols = 0;
|
|
975
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
963
976
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
964
977
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
965
978
|
} else {
|
|
@@ -1068,7 +1081,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1068
1081
|
if (newTable.rows.length > 0) {
|
|
1069
1082
|
if (tableStack.length > 0) {
|
|
1070
1083
|
const parentTable = tableStack.pop();
|
|
1071
|
-
|
|
1084
|
+
let nestedCols = 0;
|
|
1085
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
1072
1086
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
1073
1087
|
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
1074
1088
|
} else {
|
|
@@ -2166,6 +2180,7 @@ function parseLenientCfb(data) {
|
|
|
2166
2180
|
if (miniSectorSizeShift > 16) throw new Error("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uBBF8\uB2C8 \uC139\uD130 \uD06C\uAE30 \uC2DC\uD504\uD2B8: " + miniSectorSizeShift);
|
|
2167
2181
|
const miniSectorSize = 1 << miniSectorSizeShift;
|
|
2168
2182
|
const fatSectorCount = data.readUInt32LE(44);
|
|
2183
|
+
if (fatSectorCount > 1e4) throw new Error("FAT \uC139\uD130 \uC218\uAC00 \uB108\uBB34 \uB9CE\uC2B5\uB2C8\uB2E4: " + fatSectorCount);
|
|
2169
2184
|
const firstDirSector = data.readUInt32LE(48);
|
|
2170
2185
|
const miniStreamCutoff = data.readUInt32LE(56);
|
|
2171
2186
|
const firstMiniFatSector = data.readUInt32LE(60);
|
|
@@ -2541,10 +2556,14 @@ function findSections(cfb) {
|
|
|
2541
2556
|
}
|
|
2542
2557
|
function findSectionsLenient(lcfb, compressed) {
|
|
2543
2558
|
const sections = [];
|
|
2559
|
+
let totalDecompressed = 0;
|
|
2544
2560
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2545
2561
|
const raw = lcfb.findStream(`/BodyText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2546
2562
|
if (!raw) break;
|
|
2547
|
-
|
|
2563
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2564
|
+
totalDecompressed += content.length;
|
|
2565
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2566
|
+
sections.push({ idx: i, content });
|
|
2548
2567
|
}
|
|
2549
2568
|
if (sections.length === 0) {
|
|
2550
2569
|
for (const e of lcfb.entries()) {
|
|
@@ -2552,7 +2571,12 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2552
2571
|
if (e.name.startsWith("Section")) {
|
|
2553
2572
|
const idx = parseInt(e.name.replace("Section", ""), 10) || 0;
|
|
2554
2573
|
const raw = lcfb.findStream(e.name);
|
|
2555
|
-
if (raw)
|
|
2574
|
+
if (raw) {
|
|
2575
|
+
const content = compressed ? decompressStream(raw) : raw;
|
|
2576
|
+
totalDecompressed += content.length;
|
|
2577
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2578
|
+
sections.push({ idx, content });
|
|
2579
|
+
}
|
|
2556
2580
|
}
|
|
2557
2581
|
}
|
|
2558
2582
|
}
|
|
@@ -2560,11 +2584,15 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2560
2584
|
}
|
|
2561
2585
|
function findViewTextSectionsLenient(lcfb, compressed) {
|
|
2562
2586
|
const sections = [];
|
|
2587
|
+
let totalDecompressed = 0;
|
|
2563
2588
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2564
2589
|
const raw = lcfb.findStream(`/ViewText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
|
|
2565
2590
|
if (!raw) break;
|
|
2566
2591
|
try {
|
|
2567
|
-
|
|
2592
|
+
const content = decryptViewText(raw, compressed);
|
|
2593
|
+
totalDecompressed += content.length;
|
|
2594
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2595
|
+
sections.push({ idx: i, content });
|
|
2568
2596
|
} catch {
|
|
2569
2597
|
break;
|
|
2570
2598
|
}
|
|
@@ -2966,37 +2994,18 @@ init_page_range();
|
|
|
2966
2994
|
// src/pdf/line-detector.ts
|
|
2967
2995
|
import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
2968
2996
|
var ORIENTATION_TOL = 2;
|
|
2969
|
-
var MIN_LINE_LENGTH =
|
|
2970
|
-
var
|
|
2997
|
+
var MIN_LINE_LENGTH = 15;
|
|
2998
|
+
var MAX_LINE_WIDTH = 5;
|
|
2971
2999
|
var CONNECT_TOL = 5;
|
|
2972
3000
|
var CELL_PADDING = 2;
|
|
2973
|
-
var
|
|
2974
|
-
var
|
|
2975
|
-
|
|
2976
|
-
|
|
2977
|
-
m1[0] * m2[0] + m1[2] * m2[1],
|
|
2978
|
-
m1[1] * m2[0] + m1[3] * m2[1],
|
|
2979
|
-
m1[0] * m2[2] + m1[2] * m2[3],
|
|
2980
|
-
m1[1] * m2[2] + m1[3] * m2[3],
|
|
2981
|
-
m1[0] * m2[4] + m1[2] * m2[5] + m1[4],
|
|
2982
|
-
m1[1] * m2[4] + m1[3] * m2[5] + m1[5]
|
|
2983
|
-
];
|
|
2984
|
-
}
|
|
2985
|
-
function matTransformPoint(m, x, y) {
|
|
2986
|
-
return [m[0] * x + m[2] * y + m[4], m[1] * x + m[3] * y + m[5]];
|
|
2987
|
-
}
|
|
2988
|
-
function matScale(m) {
|
|
2989
|
-
return Math.max(
|
|
2990
|
-
Math.sqrt(m[1] * m[1] + m[3] * m[3]),
|
|
2991
|
-
Math.sqrt(m[0] * m[0] + m[2] * m[2])
|
|
2992
|
-
);
|
|
2993
|
-
}
|
|
3001
|
+
var MIN_COL_WIDTH = 15;
|
|
3002
|
+
var MIN_ROW_HEIGHT = 6;
|
|
3003
|
+
var VERTEX_MERGE_FACTOR = 4;
|
|
3004
|
+
var MIN_COORD_MERGE_TOL = 8;
|
|
2994
3005
|
function extractLines(fnArray, argsArray) {
|
|
2995
3006
|
const horizontals = [];
|
|
2996
3007
|
const verticals = [];
|
|
2997
|
-
let ctm = [...IDENTITY];
|
|
2998
3008
|
let lineWidth = 1;
|
|
2999
|
-
const stateStack = [];
|
|
3000
3009
|
let currentPath = [];
|
|
3001
3010
|
let pathStartX = 0, pathStartY = 0;
|
|
3002
3011
|
let curX = 0, curY = 0;
|
|
@@ -3014,53 +3023,13 @@ function extractLines(fnArray, argsArray) {
|
|
|
3014
3023
|
);
|
|
3015
3024
|
}
|
|
3016
3025
|
}
|
|
3017
|
-
function
|
|
3018
|
-
if (
|
|
3019
|
-
const first = path[0], last = path[path.length - 1];
|
|
3020
|
-
const closed = Math.abs(first.x1 - last.x2) < 1 && Math.abs(first.y1 - last.y2) < 1;
|
|
3021
|
-
if (!closed) return false;
|
|
3022
|
-
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
3023
|
-
for (const seg of path) {
|
|
3024
|
-
minX = Math.min(minX, seg.x1, seg.x2);
|
|
3025
|
-
minY = Math.min(minY, seg.y1, seg.y2);
|
|
3026
|
-
maxX = Math.max(maxX, seg.x1, seg.x2);
|
|
3027
|
-
maxY = Math.max(maxY, seg.y1, seg.y2);
|
|
3028
|
-
}
|
|
3029
|
-
const w = maxX - minX, h = maxY - minY;
|
|
3030
|
-
if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;
|
|
3031
|
-
path.length = 0;
|
|
3032
|
-
if (h < ORIENTATION_TOL * 2 || w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH) {
|
|
3033
|
-
path.push({ x1: minX, y1: (minY + maxY) / 2, x2: maxX, y2: (minY + maxY) / 2 });
|
|
3034
|
-
} else if (w < ORIENTATION_TOL * 2 || h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH) {
|
|
3035
|
-
path.push({ x1: (minX + maxX) / 2, y1: minY, x2: (minX + maxX) / 2, y2: maxY });
|
|
3036
|
-
} else {
|
|
3037
|
-
pushRectangle(path, minX, minY, w, h);
|
|
3038
|
-
}
|
|
3039
|
-
return true;
|
|
3040
|
-
}
|
|
3041
|
-
function flushPath(isStroke, isFill) {
|
|
3042
|
-
if (!isStroke && !isFill) {
|
|
3043
|
-
currentPath = [];
|
|
3044
|
-
return;
|
|
3045
|
-
}
|
|
3046
|
-
if (isFill && !isStroke && currentPath.length >= 3) {
|
|
3047
|
-
tryConvertLinesToRectangle(currentPath);
|
|
3048
|
-
}
|
|
3049
|
-
const scale = matScale(ctm);
|
|
3050
|
-
const effectiveLW = lineWidth * scale;
|
|
3051
|
-
if (effectiveLW > MAX_LINE_WIDTH && isStroke && !isFill) {
|
|
3026
|
+
function flushPath(isStroke) {
|
|
3027
|
+
if (!isStroke) {
|
|
3052
3028
|
currentPath = [];
|
|
3053
3029
|
return;
|
|
3054
3030
|
}
|
|
3055
3031
|
for (const seg of currentPath) {
|
|
3056
|
-
|
|
3057
|
-
const [px2, py2] = matTransformPoint(ctm, seg.x2, seg.y2);
|
|
3058
|
-
classifyAndAdd(
|
|
3059
|
-
{ x1: px1, y1: py1, x2: px2, y2: py2 },
|
|
3060
|
-
effectiveLW,
|
|
3061
|
-
horizontals,
|
|
3062
|
-
verticals
|
|
3063
|
-
);
|
|
3032
|
+
classifyAndAdd(seg, lineWidth, horizontals, verticals);
|
|
3064
3033
|
}
|
|
3065
3034
|
currentPath = [];
|
|
3066
3035
|
}
|
|
@@ -3068,28 +3037,9 @@ function extractLines(fnArray, argsArray) {
|
|
|
3068
3037
|
const op = fnArray[i];
|
|
3069
3038
|
const args = argsArray[i];
|
|
3070
3039
|
switch (op) {
|
|
3071
|
-
// ── Graphics State ──
|
|
3072
|
-
case OPS.save:
|
|
3073
|
-
stateStack.push({ ctm: [...ctm], lineWidth });
|
|
3074
|
-
break;
|
|
3075
|
-
case OPS.restore:
|
|
3076
|
-
if (stateStack.length > 0) {
|
|
3077
|
-
const state = stateStack.pop();
|
|
3078
|
-
ctm = state.ctm;
|
|
3079
|
-
lineWidth = state.lineWidth;
|
|
3080
|
-
}
|
|
3081
|
-
break;
|
|
3082
|
-
case OPS.transform: {
|
|
3083
|
-
const m = args;
|
|
3084
|
-
if (m.length >= 6) {
|
|
3085
|
-
ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
|
|
3086
|
-
}
|
|
3087
|
-
break;
|
|
3088
|
-
}
|
|
3089
3040
|
case OPS.setLineWidth:
|
|
3090
3041
|
lineWidth = args[0] || 1;
|
|
3091
3042
|
break;
|
|
3092
|
-
// ── Path Construction ──
|
|
3093
3043
|
case OPS.constructPath: {
|
|
3094
3044
|
const arg0 = args[0];
|
|
3095
3045
|
if (Array.isArray(arg0)) {
|
|
@@ -3157,60 +3107,34 @@ function extractLines(fnArray, argsArray) {
|
|
|
3157
3107
|
}
|
|
3158
3108
|
}
|
|
3159
3109
|
}
|
|
3160
|
-
|
|
3161
|
-
|
|
3162
|
-
|
|
3163
|
-
|
|
3164
|
-
flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
|
|
3110
|
+
if (afterOp === OPS.stroke || afterOp === OPS.closeStroke) {
|
|
3111
|
+
flushPath(true);
|
|
3112
|
+
} else if (afterOp === OPS.fill || afterOp === OPS.eoFill || afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke) {
|
|
3113
|
+
flushPath(true);
|
|
3165
3114
|
} else if (afterOp === OPS.endPath) {
|
|
3166
|
-
flushPath(false
|
|
3115
|
+
flushPath(false);
|
|
3167
3116
|
}
|
|
3168
3117
|
}
|
|
3169
3118
|
break;
|
|
3170
3119
|
}
|
|
3171
|
-
// ── Paint Operations ──
|
|
3172
3120
|
case OPS.stroke:
|
|
3173
3121
|
case OPS.closeStroke:
|
|
3174
|
-
flushPath(true
|
|
3122
|
+
flushPath(true);
|
|
3175
3123
|
break;
|
|
3176
3124
|
case OPS.fill:
|
|
3177
3125
|
case OPS.eoFill:
|
|
3178
|
-
flushPath(false, true);
|
|
3179
|
-
break;
|
|
3180
3126
|
case OPS.fillStroke:
|
|
3181
3127
|
case OPS.eoFillStroke:
|
|
3182
3128
|
case OPS.closeFillStroke:
|
|
3183
3129
|
case OPS.closeEOFillStroke:
|
|
3184
|
-
flushPath(true
|
|
3130
|
+
flushPath(true);
|
|
3185
3131
|
break;
|
|
3186
3132
|
case OPS.endPath:
|
|
3187
|
-
flushPath(false
|
|
3188
|
-
break;
|
|
3189
|
-
}
|
|
3190
|
-
}
|
|
3191
|
-
return {
|
|
3192
|
-
horizontals: deduplicateLines(horizontals),
|
|
3193
|
-
verticals: deduplicateLines(verticals)
|
|
3194
|
-
};
|
|
3195
|
-
}
|
|
3196
|
-
function deduplicateLines(lines) {
|
|
3197
|
-
if (lines.length <= 1) return lines;
|
|
3198
|
-
const result = [];
|
|
3199
|
-
const tol = COORD_MERGE_TOL;
|
|
3200
|
-
for (const line of lines) {
|
|
3201
|
-
let isDuplicate = false;
|
|
3202
|
-
for (const existing of result) {
|
|
3203
|
-
if (Math.abs(line.y1 - existing.y1) <= tol && Math.abs(line.y2 - existing.y2) <= tol && Math.abs(line.x1 - existing.x1) <= tol && Math.abs(line.x2 - existing.x2) <= tol) {
|
|
3204
|
-
if (line.lineWidth > existing.lineWidth) {
|
|
3205
|
-
existing.lineWidth = line.lineWidth;
|
|
3206
|
-
}
|
|
3207
|
-
isDuplicate = true;
|
|
3133
|
+
flushPath(false);
|
|
3208
3134
|
break;
|
|
3209
|
-
}
|
|
3210
3135
|
}
|
|
3211
|
-
if (!isDuplicate) result.push(line);
|
|
3212
3136
|
}
|
|
3213
|
-
return
|
|
3137
|
+
return { horizontals, verticals };
|
|
3214
3138
|
}
|
|
3215
3139
|
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
3216
3140
|
const dx = Math.abs(seg.x2 - seg.x1);
|
|
@@ -3229,6 +3153,55 @@ function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
|
3229
3153
|
verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
|
|
3230
3154
|
}
|
|
3231
3155
|
}
|
|
3156
|
+
function preprocessLines(horizontals, verticals) {
|
|
3157
|
+
let h = horizontals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3158
|
+
let v = verticals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3159
|
+
h = mergeParallelLines(h, "h");
|
|
3160
|
+
v = mergeParallelLines(v, "v");
|
|
3161
|
+
return { horizontals: h, verticals: v };
|
|
3162
|
+
}
|
|
3163
|
+
function mergeParallelLines(lines, dir) {
|
|
3164
|
+
if (lines.length <= 1) return lines;
|
|
3165
|
+
const sorted = [...lines].sort((a, b) => {
|
|
3166
|
+
const posA = dir === "h" ? a.y1 : a.x1;
|
|
3167
|
+
const posB = dir === "h" ? b.y1 : b.x1;
|
|
3168
|
+
if (Math.abs(posA - posB) > 0.1) return posA - posB;
|
|
3169
|
+
return dir === "h" ? a.x1 - b.x1 : a.y1 - b.y1;
|
|
3170
|
+
});
|
|
3171
|
+
const MERGE_TOL = 3;
|
|
3172
|
+
const result = [sorted[0]];
|
|
3173
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
3174
|
+
const prev = result[result.length - 1];
|
|
3175
|
+
const curr = sorted[i];
|
|
3176
|
+
const prevPos = dir === "h" ? prev.y1 : prev.x1;
|
|
3177
|
+
const currPos = dir === "h" ? curr.y1 : curr.x1;
|
|
3178
|
+
if (Math.abs(prevPos - currPos) <= MERGE_TOL) {
|
|
3179
|
+
const prevStart = dir === "h" ? prev.x1 : prev.y1;
|
|
3180
|
+
const prevEnd = dir === "h" ? prev.x2 : prev.y2;
|
|
3181
|
+
const currStart = dir === "h" ? curr.x1 : curr.y1;
|
|
3182
|
+
const currEnd = dir === "h" ? curr.x2 : curr.y2;
|
|
3183
|
+
const overlap = Math.min(prevEnd, currEnd) - Math.max(prevStart, currStart);
|
|
3184
|
+
const minLen = Math.min(prevEnd - prevStart, currEnd - currStart);
|
|
3185
|
+
if (overlap > minLen * 0.3) {
|
|
3186
|
+
if (dir === "h") {
|
|
3187
|
+
prev.x1 = Math.min(prev.x1, curr.x1);
|
|
3188
|
+
prev.x2 = Math.max(prev.x2, curr.x2);
|
|
3189
|
+
prev.y1 = (prev.y1 + curr.y1) / 2;
|
|
3190
|
+
prev.y2 = prev.y1;
|
|
3191
|
+
} else {
|
|
3192
|
+
prev.y1 = Math.min(prev.y1, curr.y1);
|
|
3193
|
+
prev.y2 = Math.max(prev.y2, curr.y2);
|
|
3194
|
+
prev.x1 = (prev.x1 + curr.x1) / 2;
|
|
3195
|
+
prev.x2 = prev.x1;
|
|
3196
|
+
}
|
|
3197
|
+
prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
|
|
3198
|
+
continue;
|
|
3199
|
+
}
|
|
3200
|
+
}
|
|
3201
|
+
result.push(curr);
|
|
3202
|
+
}
|
|
3203
|
+
return result;
|
|
3204
|
+
}
|
|
3232
3205
|
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
3233
3206
|
const margin = 5;
|
|
3234
3207
|
return {
|
|
@@ -3240,8 +3213,49 @@ function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
|
3240
3213
|
)
|
|
3241
3214
|
};
|
|
3242
3215
|
}
|
|
3216
|
+
function buildVertices(horizontals, verticals) {
|
|
3217
|
+
const vertices = [];
|
|
3218
|
+
const tol = CONNECT_TOL;
|
|
3219
|
+
for (const h of horizontals) {
|
|
3220
|
+
for (const v of verticals) {
|
|
3221
|
+
if (v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol && h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol) {
|
|
3222
|
+
const radius = Math.max(h.lineWidth, v.lineWidth, 1);
|
|
3223
|
+
vertices.push({ x: v.x1, y: h.y1, radius });
|
|
3224
|
+
}
|
|
3225
|
+
}
|
|
3226
|
+
}
|
|
3227
|
+
return vertices;
|
|
3228
|
+
}
|
|
3229
|
+
function mergeVertices(vertices) {
|
|
3230
|
+
if (vertices.length <= 1) return vertices;
|
|
3231
|
+
const merged = [];
|
|
3232
|
+
const used = new Array(vertices.length).fill(false);
|
|
3233
|
+
for (let i = 0; i < vertices.length; i++) {
|
|
3234
|
+
if (used[i]) continue;
|
|
3235
|
+
let sumX = vertices[i].x, sumY = vertices[i].y;
|
|
3236
|
+
let maxRadius = vertices[i].radius;
|
|
3237
|
+
let count = 1;
|
|
3238
|
+
for (let j = i + 1; j < vertices.length; j++) {
|
|
3239
|
+
if (used[j]) continue;
|
|
3240
|
+
const mergeTol = VERTEX_MERGE_FACTOR * Math.max(maxRadius, vertices[j].radius);
|
|
3241
|
+
if (Math.abs(vertices[i].x - vertices[j].x) <= mergeTol && Math.abs(vertices[i].y - vertices[j].y) <= mergeTol) {
|
|
3242
|
+
sumX += vertices[j].x;
|
|
3243
|
+
sumY += vertices[j].y;
|
|
3244
|
+
maxRadius = Math.max(maxRadius, vertices[j].radius);
|
|
3245
|
+
count++;
|
|
3246
|
+
used[j] = true;
|
|
3247
|
+
}
|
|
3248
|
+
}
|
|
3249
|
+
merged.push({ x: sumX / count, y: sumY / count, radius: maxRadius });
|
|
3250
|
+
}
|
|
3251
|
+
return merged;
|
|
3252
|
+
}
|
|
3243
3253
|
function buildTableGrids(horizontals, verticals) {
|
|
3244
3254
|
if (horizontals.length < 2 || verticals.length < 2) return [];
|
|
3255
|
+
const allVertices = buildVertices(horizontals, verticals);
|
|
3256
|
+
const vertices = mergeVertices(allVertices);
|
|
3257
|
+
if (vertices.length < 4) return [];
|
|
3258
|
+
const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
|
|
3245
3259
|
const allLines = [
|
|
3246
3260
|
...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
|
|
3247
3261
|
...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
|
|
@@ -3252,21 +3266,74 @@ function buildTableGrids(horizontals, verticals) {
|
|
|
3252
3266
|
const hLines = group.filter((l) => l.type === "h");
|
|
3253
3267
|
const vLines = group.filter((l) => l.type === "v");
|
|
3254
3268
|
if (hLines.length < 2 || vLines.length < 2) continue;
|
|
3255
|
-
|
|
3256
|
-
const
|
|
3257
|
-
|
|
3258
|
-
|
|
3269
|
+
let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
|
|
3270
|
+
for (const l of vLines) {
|
|
3271
|
+
if (l.x1 < gx1) gx1 = l.x1;
|
|
3272
|
+
if (l.x1 > gx2) gx2 = l.x1;
|
|
3273
|
+
}
|
|
3274
|
+
for (const l of hLines) {
|
|
3275
|
+
if (l.y1 < gy1) gy1 = l.y1;
|
|
3276
|
+
if (l.y1 > gy2) gy2 = l.y1;
|
|
3277
|
+
}
|
|
3278
|
+
const groupBbox = {
|
|
3279
|
+
x1: gx1 - CONNECT_TOL,
|
|
3280
|
+
y1: gy1 - CONNECT_TOL,
|
|
3281
|
+
x2: gx2 + CONNECT_TOL,
|
|
3282
|
+
y2: gy2 + CONNECT_TOL
|
|
3283
|
+
};
|
|
3284
|
+
const groupVertices = vertices.filter(
|
|
3285
|
+
(v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
|
|
3286
|
+
);
|
|
3287
|
+
const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
|
|
3288
|
+
const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
|
|
3289
|
+
const rawYs = [
|
|
3290
|
+
...hLines.map((l) => l.y1),
|
|
3291
|
+
...groupVertices.map((v) => v.y)
|
|
3292
|
+
];
|
|
3293
|
+
const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
|
|
3294
|
+
const rawXs = [
|
|
3295
|
+
...vLines.map((l) => l.x1),
|
|
3296
|
+
...groupVertices.map((v) => v.x)
|
|
3297
|
+
];
|
|
3298
|
+
const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
|
|
3259
3299
|
if (rowYs.length < 2 || colXs.length < 2) continue;
|
|
3300
|
+
const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
|
|
3301
|
+
const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
|
|
3302
|
+
if (validRowYs.length < 2 || validColXs.length < 2) continue;
|
|
3260
3303
|
const bbox = {
|
|
3261
|
-
x1:
|
|
3262
|
-
y1:
|
|
3263
|
-
x2:
|
|
3264
|
-
y2:
|
|
3304
|
+
x1: validColXs[0],
|
|
3305
|
+
y1: validRowYs[validRowYs.length - 1],
|
|
3306
|
+
x2: validColXs[validColXs.length - 1],
|
|
3307
|
+
y2: validRowYs[0]
|
|
3265
3308
|
};
|
|
3266
|
-
grids.push({ rowYs, colXs, bbox });
|
|
3309
|
+
grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
|
|
3267
3310
|
}
|
|
3268
3311
|
return mergeAdjacentGrids(grids);
|
|
3269
3312
|
}
|
|
3313
|
+
function enforceMinWidth(colXs, minWidth) {
|
|
3314
|
+
if (colXs.length <= 2) return colXs;
|
|
3315
|
+
const result = [colXs[0]];
|
|
3316
|
+
for (let i = 1; i < colXs.length; i++) {
|
|
3317
|
+
const prevX = result[result.length - 1];
|
|
3318
|
+
if (colXs[i] - prevX < minWidth && i < colXs.length - 1) {
|
|
3319
|
+
continue;
|
|
3320
|
+
}
|
|
3321
|
+
result.push(colXs[i]);
|
|
3322
|
+
}
|
|
3323
|
+
return result;
|
|
3324
|
+
}
|
|
3325
|
+
function enforceMinHeight(rowYs, minHeight) {
|
|
3326
|
+
if (rowYs.length <= 2) return rowYs;
|
|
3327
|
+
const result = [rowYs[0]];
|
|
3328
|
+
for (let i = 1; i < rowYs.length; i++) {
|
|
3329
|
+
const prevY = result[result.length - 1];
|
|
3330
|
+
if (prevY - rowYs[i] < minHeight && i < rowYs.length - 1) {
|
|
3331
|
+
continue;
|
|
3332
|
+
}
|
|
3333
|
+
result.push(rowYs[i]);
|
|
3334
|
+
}
|
|
3335
|
+
return result;
|
|
3336
|
+
}
|
|
3270
3337
|
function mergeAdjacentGrids(grids) {
|
|
3271
3338
|
if (grids.length <= 1) return grids;
|
|
3272
3339
|
const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
@@ -3275,9 +3342,10 @@ function mergeAdjacentGrids(grids) {
|
|
|
3275
3342
|
const prev = merged[merged.length - 1];
|
|
3276
3343
|
const curr = sorted[i];
|
|
3277
3344
|
if (prev.colXs.length === curr.colXs.length) {
|
|
3278
|
-
const
|
|
3345
|
+
const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
|
|
3346
|
+
const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
|
|
3279
3347
|
const verticalGap = prev.bbox.y1 - curr.bbox.y2;
|
|
3280
|
-
if (colMatch && verticalGap >= -
|
|
3348
|
+
if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
|
|
3281
3349
|
const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
|
|
3282
3350
|
merged[merged.length - 1] = {
|
|
3283
3351
|
rowYs: allRowYs,
|
|
@@ -3287,7 +3355,8 @@ function mergeAdjacentGrids(grids) {
|
|
|
3287
3355
|
y1: Math.min(prev.bbox.y1, curr.bbox.y1),
|
|
3288
3356
|
x2: Math.max(prev.bbox.x2, curr.bbox.x2),
|
|
3289
3357
|
y2: Math.max(prev.bbox.y2, curr.bbox.y2)
|
|
3290
|
-
}
|
|
3358
|
+
},
|
|
3359
|
+
vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
|
|
3291
3360
|
};
|
|
3292
3361
|
continue;
|
|
3293
3362
|
}
|
|
@@ -3296,14 +3365,14 @@ function mergeAdjacentGrids(grids) {
|
|
|
3296
3365
|
}
|
|
3297
3366
|
return merged;
|
|
3298
3367
|
}
|
|
3299
|
-
function clusterCoordinates(values) {
|
|
3368
|
+
function clusterCoordinates(values, tolerance) {
|
|
3300
3369
|
if (values.length === 0) return [];
|
|
3301
3370
|
const sorted = [...values].sort((a, b) => a - b);
|
|
3302
3371
|
const clusters = [{ sum: sorted[0], count: 1 }];
|
|
3303
3372
|
for (let i = 1; i < sorted.length; i++) {
|
|
3304
3373
|
const last = clusters[clusters.length - 1];
|
|
3305
3374
|
const avg = last.sum / last.count;
|
|
3306
|
-
if (Math.abs(sorted[i] - avg) <=
|
|
3375
|
+
if (Math.abs(sorted[i] - avg) <= tolerance) {
|
|
3307
3376
|
last.sum += sorted[i];
|
|
3308
3377
|
last.count++;
|
|
3309
3378
|
} else {
|
|
@@ -3360,6 +3429,20 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3360
3429
|
const numRows = rowYs.length - 1;
|
|
3361
3430
|
const numCols = colXs.length - 1;
|
|
3362
3431
|
if (numRows <= 0 || numCols <= 0) return [];
|
|
3432
|
+
const vBorders = Array.from(
|
|
3433
|
+
{ length: numRows },
|
|
3434
|
+
(_, r) => Array.from(
|
|
3435
|
+
{ length: numCols + 1 },
|
|
3436
|
+
(_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
|
|
3437
|
+
)
|
|
3438
|
+
);
|
|
3439
|
+
const hBorders = Array.from(
|
|
3440
|
+
{ length: numRows + 1 },
|
|
3441
|
+
(_, r) => Array.from(
|
|
3442
|
+
{ length: numCols },
|
|
3443
|
+
(_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
|
|
3444
|
+
)
|
|
3445
|
+
);
|
|
3363
3446
|
const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
|
|
3364
3447
|
const cells = [];
|
|
3365
3448
|
for (let r = 0; r < numRows; r++) {
|
|
@@ -3367,18 +3450,26 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3367
3450
|
if (occupied[r][c]) continue;
|
|
3368
3451
|
let colSpan = 1;
|
|
3369
3452
|
let rowSpan = 1;
|
|
3370
|
-
while (c + colSpan < numCols) {
|
|
3371
|
-
|
|
3372
|
-
|
|
3373
|
-
|
|
3374
|
-
|
|
3453
|
+
while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
|
|
3454
|
+
let canExpand = true;
|
|
3455
|
+
for (let dr = 0; dr < rowSpan; dr++) {
|
|
3456
|
+
if (vBorders[r + dr][c + colSpan]) {
|
|
3457
|
+
canExpand = false;
|
|
3458
|
+
break;
|
|
3459
|
+
}
|
|
3460
|
+
}
|
|
3461
|
+
if (!canExpand) break;
|
|
3375
3462
|
colSpan++;
|
|
3376
3463
|
}
|
|
3377
3464
|
while (r + rowSpan < numRows) {
|
|
3378
|
-
|
|
3379
|
-
|
|
3380
|
-
|
|
3381
|
-
|
|
3465
|
+
let hasLine = false;
|
|
3466
|
+
for (let dc = 0; dc < colSpan; dc++) {
|
|
3467
|
+
if (hBorders[r + rowSpan][c + dc]) {
|
|
3468
|
+
hasLine = true;
|
|
3469
|
+
break;
|
|
3470
|
+
}
|
|
3471
|
+
}
|
|
3472
|
+
if (hasLine) break;
|
|
3382
3473
|
rowSpan++;
|
|
3383
3474
|
}
|
|
3384
3475
|
for (let dr = 0; dr < rowSpan; dr++) {
|
|
@@ -3402,28 +3493,30 @@ function extractCells(grid, horizontals, verticals) {
|
|
|
3402
3493
|
}
|
|
3403
3494
|
return cells;
|
|
3404
3495
|
}
|
|
3405
|
-
/**
 * Reports whether some vertical line lies at column position `x` and covers
 * at least 75% of the cell span between `topY` and `botY`.
 *
 * The overlap is computed as `min(v.y2, topY) - max(v.y1, botY)`, so it can
 * only be positive when `topY` and `v.y2` are the larger coordinate of their
 * respective pairs.
 *
 * @param {Array<{x1: number, y1: number, y2: number}>} verticals Candidate vertical lines.
 * @param {number} x Column position to test against each line's `x1`.
 * @param {number} topY Upper bound of the cell span (larger value in the overlap formula).
 * @param {number} botY Lower bound of the cell span.
 * @param {number} [vertexRadius=0] Multiplied by VERTEX_MERGE_FACTOR to obtain the
 *   horizontal tolerance; the tolerance never drops below 4. Defaulting to 0 keeps
 *   four-argument callers working (an undefined radius would yield a NaN tolerance
 *   and therefore never match).
 * @returns {boolean} True when a matching line covers enough of the span.
 */
function hasVerticalLine(verticals, x, topY, botY, vertexRadius = 0) {
  const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
  // The cell height does not depend on the candidate line, so compute it once.
  const cellH = Math.abs(topY - botY);
  // A degenerate (near zero-height) cell can never be bordered.
  if (cellH < 0.1) return false;
  const requiredOverlap = cellH * 0.75;
  for (const v of verticals) {
    if (Math.abs(v.x1 - x) > tol) continue;
    const overlap = Math.min(v.y2, topY) - Math.max(v.y1, botY);
    if (overlap >= requiredOverlap) return true;
  }
  return false;
}
|
|
3418
|
-
function hasHorizontalLine(horizontals, y, leftX, rightX) {
|
|
3419
|
-
const tol =
|
|
3510
|
+
function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
|
|
3511
|
+
const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
|
|
3420
3512
|
for (const h of horizontals) {
|
|
3421
3513
|
if (Math.abs(h.y1 - y) <= tol) {
|
|
3422
3514
|
const cellW = Math.abs(rightX - leftX);
|
|
3515
|
+
if (cellW < 0.1) continue;
|
|
3423
3516
|
const overlapLeft = Math.max(h.x1, leftX);
|
|
3424
3517
|
const overlapRight = Math.min(h.x2, rightX);
|
|
3425
3518
|
const overlap = overlapRight - overlapLeft;
|
|
3426
|
-
if (overlap >= cellW * 0.
|
|
3519
|
+
if (overlap >= cellW * 0.75) return true;
|
|
3427
3520
|
}
|
|
3428
3521
|
}
|
|
3429
3522
|
return false;
|
|
@@ -3434,23 +3527,24 @@ function mapTextToCells(items, cells) {
|
|
|
3434
3527
|
result.set(cell, []);
|
|
3435
3528
|
}
|
|
3436
3529
|
for (const item of items) {
|
|
3437
|
-
const cx = item.x + item.w / 2;
|
|
3438
|
-
const cy = item.y;
|
|
3439
3530
|
const pad = CELL_PADDING;
|
|
3440
3531
|
let bestCell = null;
|
|
3441
|
-
let
|
|
3532
|
+
let bestScore = 0;
|
|
3442
3533
|
for (const cell of cells) {
|
|
3443
|
-
|
|
3444
|
-
|
|
3445
|
-
|
|
3446
|
-
|
|
3447
|
-
|
|
3448
|
-
|
|
3449
|
-
|
|
3450
|
-
|
|
3534
|
+
const ix1 = Math.max(item.x, cell.bbox.x1 - pad);
|
|
3535
|
+
const ix2 = Math.min(item.x + item.w, cell.bbox.x2 + pad);
|
|
3536
|
+
const iy1 = Math.max(item.y, cell.bbox.y1 - pad);
|
|
3537
|
+
const iy2 = Math.min(item.y + (item.h || item.fontSize), cell.bbox.y2 + pad);
|
|
3538
|
+
if (ix1 >= ix2 || iy1 >= iy2) continue;
|
|
3539
|
+
const intersectArea = (ix2 - ix1) * (iy2 - iy1);
|
|
3540
|
+
const itemArea = Math.max(item.w, 1) * Math.max(item.h || item.fontSize, 1);
|
|
3541
|
+
const score = intersectArea / itemArea;
|
|
3542
|
+
if (score > bestScore) {
|
|
3543
|
+
bestScore = score;
|
|
3544
|
+
bestCell = cell;
|
|
3451
3545
|
}
|
|
3452
3546
|
}
|
|
3453
|
-
if (bestCell) {
|
|
3547
|
+
if (bestCell && bestScore > 0.3) {
|
|
3454
3548
|
result.get(bestCell).push(item);
|
|
3455
3549
|
}
|
|
3456
3550
|
}
|
|
@@ -3477,8 +3571,13 @@ function cellTextToString(items) {
|
|
|
3477
3571
|
const textLines = lines.map((line) => {
|
|
3478
3572
|
const s = line.sort((a, b) => a.x - b.x);
|
|
3479
3573
|
if (s.length === 1) return s[0].text;
|
|
3574
|
+
const evenSpaced = detectEvenSpacedItems(s);
|
|
3480
3575
|
let result = s[0].text;
|
|
3481
3576
|
for (let j = 1; j < s.length; j++) {
|
|
3577
|
+
if (evenSpaced[j]) {
|
|
3578
|
+
result += s[j].text;
|
|
3579
|
+
continue;
|
|
3580
|
+
}
|
|
3482
3581
|
const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
|
|
3483
3582
|
const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
|
|
3484
3583
|
const prevIsKorean = /[가-힣]$/.test(result);
|
|
@@ -3493,6 +3592,57 @@ function cellTextToString(items) {
|
|
|
3493
3592
|
}
|
|
3494
3593
|
return result;
|
|
3495
3594
|
});
|
|
3595
|
+
return mergeCellTextLines(textLines);
|
|
3596
|
+
}
|
|
3597
|
+
/**
 * Flags items that belong to a run of evenly spaced single glyphs (one Hangul
 * syllable or one digit each) so the caller can join them without spaces.
 *
 * A run is a maximal sequence of consecutive single-glyph items in which no
 * gap to the previous item exceeds `max(fontSize * 3, 30)`. Runs of three or
 * more items are handed to `markEvenRun`, which decides whether to set flags.
 *
 * @param {Array<{text: string, x: number, w: number, fontSize: number}>} items
 *   Items of one line; gaps are measured between neighbours in array order.
 * @returns {boolean[]} One flag per item, all false when fewer than 3 items.
 */
function detectEvenSpacedItems(items) {
  const result = new Array(items.length).fill(false);
  if (items.length < 3) return result;

  // Same character class the cluster merger uses: one Hangul syllable or one digit.
  const singleGlyph = /^[가-힣\d]$/;
  let runStart = -1;

  // Hands the run [runStart, end) to markEvenRun when it is long enough.
  const closeRun = (end) => {
    if (runStart >= 0 && end - runStart >= 3) {
      markEvenRun(items, result, runStart, end);
    }
  };

  for (let i = 0; i < items.length; i++) {
    const item = items[i];
    if (!singleGlyph.test(item.text)) {
      closeRun(i);
      runStart = -1;
      continue;
    }
    if (runStart < 0) {
      runStart = i;
      continue;
    }
    const prev = items[i - 1];
    const gap = item.x - (prev.x + prev.w);
    // An oversized gap ends the current run and starts a new one at this item.
    if (gap > Math.max(item.fontSize * 3, 30)) {
      closeRun(i);
      runStart = i;
    }
  }
  closeRun(items.length);
  return result;
}
|
|
3626
|
+
/**
 * Marks every item after the first in `items[start..end)` as part of an
 * evenly spaced run, provided the positive gaps between neighbours are
 * uniform enough.
 *
 * The run qualifies when there are at least two positive gaps, the smallest
 * is at least 10% of the reference font size, the largest is at most 3x the
 * reference font size, and the largest is at most 3x the smallest.
 *
 * @param {Array<{x: number, w: number, fontSize: number}>} items Line items.
 * @param {boolean[]} result Flag array mutated in place (indexes start+1 .. end-1).
 * @param {number} start Index of the first item in the run (inclusive).
 * @param {number} end Index one past the last item in the run.
 * @returns {void}
 */
function markEvenRun(items, result, start, end) {
  const positiveGaps = [];
  for (let i = start + 1; i < end; i++) {
    const gap = items[i].x - (items[i - 1].x + items[i - 1].w);
    // Overlapping or touching neighbours (gap <= 0) are ignored for the spacing test.
    if (gap > 0) positiveGaps.push(gap);
  }
  if (positiveGaps.length < 2) return;

  const minGap = Math.min(...positiveGaps);
  const maxGap = Math.max(...positiveGaps);
  // The font size of the run's first item is the reference, not an average.
  const refFontSize = items[start].fontSize;

  const evenlySpaced =
    minGap >= refFontSize * 0.1 &&
    maxGap <= refFontSize * 3 &&
    maxGap / Math.max(minGap, 0.1) <= 3;
  if (!evenlySpaced) return;

  for (let i = start + 1; i < end; i++) {
    result[i] = true;
  }
}
|
|
3645
|
+
function mergeCellTextLines(textLines) {
|
|
3496
3646
|
if (textLines.length <= 1) return textLines[0] || "";
|
|
3497
3647
|
const merged = [textLines[0]];
|
|
3498
3648
|
for (let i = 1; i < textLines.length; i++) {
|
|
@@ -3518,24 +3668,172 @@ var Y_TOL = 3;
|
|
|
3518
3668
|
var COL_CLUSTER_TOL = 15;
|
|
3519
3669
|
var MIN_ROWS = 3;
|
|
3520
3670
|
var MIN_COLS = 2;
|
|
3521
|
-
var MIN_GAP_FACTOR =
|
|
3522
|
-
var
|
|
3671
|
+
var MIN_GAP_FACTOR = 2;
|
|
3672
|
+
var MIN_GAP_ABSOLUTE = 20;
|
|
3673
|
+
var MIN_COL_FILL_RATIO = 0.4;
|
|
3523
3674
|
/**
 * Detects borderless tables on a page from the layout of its text items.
 *
 * Two strategies are tried in order. The header-driven one looks for a header
 * row and builds tables from the rows at and below it. Only if that yields no
 * table does the gap-driven one run: it derives columns from rows with
 * suspicious gaps and builds tables from the regions matching those columns.
 *
 * @param {Array<object>} items Text items of one page.
 * @param {number} pageNum Page number forwarded to `buildClusterTable`.
 * @returns {Array<object>} The tables produced by `buildClusterTable`; empty
 *   when there are too few items or rows.
 */
function detectClusterTables(items, pageNum) {
  if (items.length < MIN_ROWS * MIN_COLS) return [];

  const { merged, originMap } = mergeEvenSpacedClusters(items);
  const rows = groupByBaseline(merged);
  if (rows.length < MIN_ROWS) return [];

  const results = [];
  // Keeps a built table and maps merged glyph runs back to their source items.
  const keep = (table) => {
    if (!table) return;
    expandUsedItems(table.usedItems, originMap);
    results.push(table);
  };

  // Strategy 1: a detected header row fixes the columns.
  const header = detectHeaderRow(rows);
  if (header) {
    const headerItems = [...rows[header.headerIdx].items].sort((a, b) => a.x - b.x);
    const candidateRows = mergeMultiLineRows(rows.slice(header.headerIdx), header.columns);
    for (const region of findTableRegionsByHeader(candidateRows, header.columns, headerItems)) {
      keep(buildClusterTable(region.rows, header.columns, pageNum));
    }
  }
  if (results.length > 0) return results;

  // Strategy 2: infer columns from rows that contain wide gaps.
  const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
  if (suspiciousRows.length < MIN_ROWS) return results;

  const columns = extractColumnClusters(suspiciousRows);
  if (columns.length < MIN_COLS) return results;

  for (const region of findTableRegions(rows, columns)) {
    keep(buildClusterTable(mergeMultiLineRows(region.rows, columns), columns, pageNum));
  }
  return results;
}
|
|
3715
|
+
/**
 * Collapses runs of evenly spaced single glyphs (one Hangul syllable or one
 * digit per item) into a single synthetic item per run.
 *
 * Items are grouped into rows with `groupByBaseline` and scanned left to
 * right. A run needs at least three glyph items, every gap between 10% and
 * 300% of the following item's font size, a positive smallest gap, and a
 * largest gap no more than 3x the smallest. Anything else is passed through
 * unchanged.
 *
 * @param {Array<{text: string, x: number, y: number, w: number, h: number,
 *   fontSize: number, fontName: string}>} items Text items of one page.
 * @returns {{merged: Array<object>, originMap: Map<object, Array<object>>}}
 *   `merged` holds pass-through and synthetic items in row order; `originMap`
 *   maps each synthetic item to the source items it replaced.
 */
function mergeEvenSpacedClusters(items) {
  const singleGlyph = /^[가-힣\d]$/;
  const originMap = new Map();
  const merged = [];

  for (const row of groupByBaseline(items)) {
    const sorted = [...row.items].sort((a, b) => a.x - b.x);
    let i = 0;
    while (i < sorted.length) {
      let runEnd = i + 1;
      // Track the gap extremes while scanning instead of in a second pass.
      let minGap = Infinity;
      let maxGap = -Infinity;

      if (singleGlyph.test(sorted[i].text)) {
        while (runEnd < sorted.length && singleGlyph.test(sorted[runEnd].text)) {
          const prev = sorted[runEnd - 1];
          const gap = sorted[runEnd].x - (prev.x + prev.w);
          const fs = sorted[runEnd].fontSize;
          if (gap < fs * 0.1 || gap > fs * 3) break;
          if (gap < minGap) minGap = gap;
          if (gap > maxGap) maxGap = gap;
          runEnd++;
        }
      }

      const isEvenRun = runEnd - i >= 3 && minGap > 0 && maxGap / minGap <= 3;
      if (!isEvenRun) {
        // Advance by one only, so a later item can still start its own run.
        merged.push(sorted[i]);
        i++;
        continue;
      }

      const run = sorted.slice(i, runEnd);
      const first = run[0];
      const last = run[run.length - 1];
      const item = {
        text: run.map((r) => r.text).join(""),
        x: first.x,
        y: first.y,
        w: last.x + last.w - first.x,
        h: first.h,
        fontSize: first.fontSize,
        fontName: first.fontName
      };
      originMap.set(item, run);
      merged.push(item);
      i = runEnd;
    }
  }
  return { merged, originMap };
}
|
|
3767
|
+
/**
 * Adds to `usedItems` the source items behind every entry that has a record
 * in `originMap`. Only entries present when the call starts are looked up;
 * newly added items are not expanded again.
 *
 * @param {Set<object>} usedItems Set mutated in place.
 * @param {Map<object, Iterable<object>>} originMap Entry -> source items.
 * @returns {void}
 */
function expandUsedItems(usedItems, originMap) {
  // Snapshot first so the set is not grown while it is being iterated.
  const snapshot = Array.from(usedItems);
  for (const entry of snapshot) {
    const sources = originMap.get(entry);
    if (!sources) continue;
    for (const source of sources) {
      usedItems.add(source);
    }
  }
}
|
|
3775
|
+
/**
 * Finds the first row that looks like a table header and derives column
 * anchors from it.
 *
 * A row qualifies when it has between MIN_COLS and 6 items, no item text
 * longer than 8 characters, at least one Hangul syllable, no item starting
 * with a bullet-like marker, spans at least 40% of the overall horizontal
 * extent, contains a gap of at least 2.5x the row's average font size, and is
 * followed by at least MIN_ROWS rows matching at least MIN_COLS of its columns.
 *
 * @param {Array<{items: Array<{text: string, x: number, w: number, fontSize: number}>}>} rows
 *   Rows in the order they should be scanned.
 * @returns {{columns: Array<{x: number, count: number}>, headerIdx: number} | null}
 *   Column anchors (left x of each header item) and the header's row index,
 *   or null when no row qualifies.
 */
function detectHeaderRow(rows) {
  let itemCount = 0;
  let allMinX = Infinity;
  let allMaxX = -Infinity;
  for (const row of rows) {
    for (const item of row.items) {
      itemCount++;
      if (item.x < allMinX) allMinX = item.x;
      const right = item.x + item.w;
      if (right > allMaxX) allMaxX = right;
    }
  }
  if (itemCount === 0) return null;
  const pageSpan = allMaxX - allMinX;
  if (pageSpan <= 0) return null;

  const hangul = /[가-힣]/;
  const bulletStart = /^[□■○●·※▶▷◆◇\-]/;

  for (let ri = 0; ri < rows.length; ri++) {
    const { items } = rows[ri];
    if (items.length < MIN_COLS || items.length > 6) continue;
    if (items.some((i) => i.text.length > 8)) continue;
    if (!items.some((i) => hangul.test(i.text))) continue;
    if (items.some((i) => bulletStart.test(i.text))) continue;

    const sorted = [...items].sort((a, b) => a.x - b.x);
    const rightmost = sorted[sorted.length - 1];
    const xSpan = rightmost.x + rightmost.w - sorted[0].x;
    if (xSpan / pageSpan < 0.4) continue;

    const avgFs = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
    const hasLargeGap = sorted.some(
      (item, idx) => idx > 0 && item.x - (sorted[idx - 1].x + sorted[idx - 1].w) >= avgFs * 2.5
    );
    if (!hasLargeGap) continue;

    const columns = sorted.map((item) => ({ x: item.x, count: 0 }));
    let matchCount = 0;
    // Only MIN_ROWS confirmations are needed, so stop scanning once reached.
    for (let j = ri + 1; j < rows.length && matchCount < MIN_ROWS; j++) {
      if (countMatchedColumnsRange(rows[j], columns, sorted) >= MIN_COLS) matchCount++;
    }
    if (matchCount < MIN_ROWS) continue;
    return { columns, headerIdx: ri };
  }
  return null;
}
|
|
3816
|
+
/**
 * Folds continuation lines into the row above them.
 *
 * A row is treated as a continuation when it is closer than 1.8x the average
 * font size (of all items in `rows`, 12 when there are none) to the previous
 * output row, has at most two items, and either matches fewer than MIN_COLS
 * columns or consists of a single item. The merged row keeps the previous
 * row's `y` and gets both item lists concatenated.
 *
 * @param {Array<{y: number, items: Array<{fontSize: number}>}>} rows Rows in order.
 * @param {Array<object>} columns Column descriptors passed to `countMatchedColumns`.
 * @returns {Array<{y: number, items: Array<object>}>} New row list; the input
 *   array itself is returned when it has at most one row.
 */
function mergeMultiLineRows(rows, columns) {
  if (rows.length <= 1) return rows;

  let fontSizeSum = 0;
  let fontSizeCount = 0;
  for (const row of rows) {
    for (const item of row.items) {
      fontSizeSum += item.fontSize;
      fontSizeCount++;
    }
  }
  const avgFontSize = fontSizeCount > 0 ? fontSizeSum / fontSizeCount : 12;
  const maxLineGap = avgFontSize * 1.8;

  const [head, ...rest] = rows;
  const output = [head];
  for (const candidate of rest) {
    const tailIdx = output.length - 1;
    const above = output[tailIdx];
    const lineGap = Math.abs(above.y - candidate.y);
    const matchedCols = countMatchedColumns(candidate, columns);
    const itemTotal = candidate.items.length;

    const isContinuation =
      lineGap < maxLineGap && itemTotal <= 2 && (matchedCols < MIN_COLS || itemTotal === 1);
    if (!isContinuation) {
      output.push(candidate);
      continue;
    }
    output[tailIdx] = { y: above.y, items: [...above.items, ...candidate.items] };
  }
  return output;
}
|
|
3539
3837
|
function groupByBaseline(items) {
|
|
3540
3838
|
if (items.length === 0) return [];
|
|
3541
3839
|
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
@@ -3557,8 +3855,9 @@ function groupByBaseline(items) {
|
|
|
3557
3855
|
function hasSuspiciousGaps(row) {
|
|
3558
3856
|
if (row.items.length < 2) return false;
|
|
3559
3857
|
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3858
|
+
if (sorted.length === 2 && sorted[1].text.length > 20) return false;
|
|
3560
3859
|
const avgFontSize = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
|
|
3561
|
-
const minGap = avgFontSize * MIN_GAP_FACTOR;
|
|
3860
|
+
const minGap = Math.max(avgFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
|
|
3562
3861
|
for (let i = 1; i < sorted.length; i++) {
|
|
3563
3862
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
3564
3863
|
if (gap >= minGap) return true;
|
|
@@ -3585,6 +3884,41 @@ function extractColumnClusters(rows) {
|
|
|
3585
3884
|
const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
|
|
3586
3885
|
return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
|
|
3587
3886
|
}
|
|
3887
|
+
/**
 * Splits `allRows` into table regions using header-derived column ranges.
 *
 * A row joins the current region when it matches at least MIN_COLS columns.
 * A non-matching row is still tolerated inside an open region when it has at
 * most two items or directly follows a matching row. Any other row closes the
 * region (and is itself discarded). When a region closes, trailing
 * non-matching rows are trimmed and the region is kept only if at least
 * MIN_ROWS rows remain.
 *
 * @param {Array<{items: Array<object>}>} allRows Rows in order.
 * @param {Array<object>} columns Column descriptors forwarded to `countMatchedColumnsRange`.
 * @param {Array<object>} headerItems Header items forwarded to `countMatchedColumnsRange`.
 * @returns {Array<{rows: Array<object>}>} Detected regions in document order.
 */
function findTableRegionsByHeader(allRows, columns, headerItems) {
  const regions = [];
  // Each entry remembers whether its row matched, so trimming needs no re-scoring.
  let currentRegion = [];
  let missStreak = 0;

  // Trims trailing non-matching rows, records the region if long enough, and resets.
  const closeRegion = () => {
    let end = currentRegion.length;
    while (end > 0 && !currentRegion[end - 1].matched) end--;
    if (end >= MIN_ROWS) {
      regions.push({ rows: currentRegion.slice(0, end).map((entry) => entry.row) });
    }
    currentRegion = [];
    missStreak = 0;
  };

  for (const row of allRows) {
    const matched = countMatchedColumnsRange(row, columns, headerItems) >= MIN_COLS;
    if (matched) {
      currentRegion.push({ row, matched });
      missStreak = 0;
    } else if (currentRegion.length > 0 && (row.items.length <= 2 || missStreak === 0)) {
      currentRegion.push({ row, matched });
      missStreak++;
    } else {
      closeRegion();
    }
  }
  closeRegion();
  return regions;
}
|
|
3588
3922
|
function findTableRegions(allRows, columns) {
|
|
3589
3923
|
const regions = [];
|
|
3590
3924
|
let currentRegion = [];
|
|
@@ -3620,18 +3954,81 @@ function countMatchedColumns(row, columns) {
|
|
|
3620
3954
|
}
|
|
3621
3955
|
return matched.size;
|
|
3622
3956
|
}
|
|
3623
|
-
function
|
|
3624
|
-
const
|
|
3625
|
-
let
|
|
3626
|
-
|
|
3627
|
-
|
|
3628
|
-
|
|
3629
|
-
|
|
3630
|
-
|
|
3631
|
-
|
|
3957
|
+
/**
 * Counts how many distinct header columns are hit by the items of `row`.
 *
 * Each header item owns a half-open x band `[left, right)`. Neighbouring
 * bands meet halfway between one header's right edge and the next header's
 * left edge. The first band is open to the left and the last band is open to
 * the right, so an item is never rejected merely for lying outside the
 * headers' horizontal extent (including at negative x). An item is assigned
 * to the first band containing its left edge `x`.
 *
 * @param {{items: Array<{x: number}>}} row Row whose items are tested.
 * @param {Array<object>} columns Not read by this function; kept so the
 *   signature parallels `countMatchedColumns(row, columns)`.
 * @param {Array<{x: number, w: number}>} headerItems Header items, left to right.
 * @returns {number} Number of distinct bands containing at least one item.
 */
function countMatchedColumnsRange(row, columns, headerItems) {
  const lastIdx = headerItems.length - 1;
  const bands = headerItems.map((header, ci) => {
    const left = ci === 0
      ? -Infinity
      : (headerItems[ci - 1].x + headerItems[ci - 1].w + header.x) / 2;
    const right = ci === lastIdx
      ? Infinity
      : (header.x + header.w + headerItems[ci + 1].x) / 2;
    return { left, right };
  });

  const matched = new Set();
  for (const item of row.items) {
    const ci = bands.findIndex((band) => item.x >= band.left && item.x < band.right);
    if (ci !== -1) matched.add(ci);
  }
  return matched.size;
}
|
|
3975
|
+
/**
 * Splits one row's items into groups at its widest gaps and assigns each
 * group to a column.
 *
 * Items are sorted by x. A gap splits the row when it is at least the
 * threshold: 12 when the row has at most `numCols + 1` items, otherwise
 * `max(2.5 * median gap, 12)`. At most `numCols - 1` of the widest such gaps
 * are used. Groups are then paired with columns greedily by increasing
 * distance between the group's horizontal midpoint and the column's `x`, each
 * column taking at most one group; any group left over goes to its nearest
 * column.
 *
 * @param {Array<{x: number, w: number}>} items Items of one row.
 * @param {Array<{x: number}>} columns Column descriptors.
 * @param {number} numCols Number of columns to consider.
 * @returns {Array<{col: number, items: Array<object>}>} Assignments, in the
 *   order they were made (closest pairs first).
 */
function assignRowItems(items, columns, numCols) {
  if (items.length === 0) return [];

  const ordered = [...items].sort((a, b) => a.x - b.x);
  const columnXs = columns.map((column) => column.x);

  // Gap k+1 is the space between ordered[k] and ordered[k + 1].
  const gaps = ordered.slice(1).map((item, k) => {
    const prev = ordered[k];
    return { idx: k + 1, size: item.x - (prev.x + prev.w) };
  });

  let gapThreshold = 12;
  if (ordered.length > numCols + 1) {
    const ascending = gaps.map((gap) => gap.size).sort((a, b) => a - b);
    const medianGap = ascending.length > 0 ? ascending[Math.floor(ascending.length / 2)] : 0;
    gapThreshold = Math.max(medianGap * 2.5, 12);
  }

  const cutPoints = gaps
    .filter((gap) => gap.size >= gapThreshold)
    .sort((a, b) => b.size - a.size)
    .slice(0, numCols - 1)
    .map((gap) => gap.idx)
    .sort((a, b) => a - b);

  const groups = [];
  let from = 0;
  for (const cut of cutPoints) {
    groups.push(ordered.slice(from, cut));
    from = cut;
  }
  groups.push(ordered.slice(from));

  const centers = groups.map((group) => {
    let left = Infinity;
    let right = -Infinity;
    for (const member of group) {
      if (member.x < left) left = member.x;
      const edge = member.x + member.w;
      if (edge > right) right = edge;
    }
    return (left + right) / 2;
  });
  const distance = (gi, ci) => Math.abs(centers[gi] - columnXs[ci]);

  // Candidate pairs are generated group-major so equal distances keep that order.
  const candidates = [];
  groups.forEach((_group, gi) => {
    for (let ci = 0; ci < numCols; ci++) {
      candidates.push({ gi, ci, dist: distance(gi, ci) });
    }
  });
  candidates.sort((a, b) => a.dist - b.dist);

  const placed = [];
  const takenGroups = new Set();
  const takenCols = new Set();
  for (const { gi, ci } of candidates) {
    if (takenGroups.has(gi) || takenCols.has(ci)) continue;
    placed.push({ col: ci, items: groups[gi] });
    takenGroups.add(gi);
    takenCols.add(ci);
  }

  // Groups that found no free column share their nearest column.
  groups.forEach((group, gi) => {
    if (takenGroups.has(gi)) return;
    let nearestCol = 0;
    let nearestDist = Infinity;
    for (let ci = 0; ci < numCols; ci++) {
      const d = distance(gi, ci);
      if (d < nearestDist) {
        nearestDist = d;
        nearestCol = ci;
      }
    }
    placed.push({ col: nearestCol, items: group });
  });
  return placed;
}
|
|
3636
4033
|
function buildClusterTable(rows, columns, pageNum) {
|
|
3637
4034
|
const numCols = columns.length;
|
|
@@ -3649,12 +4046,12 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3649
4046
|
usedItems.add(row.items[0]);
|
|
3650
4047
|
continue;
|
|
3651
4048
|
}
|
|
3652
|
-
|
|
3653
|
-
|
|
3654
|
-
|
|
4049
|
+
const assignments = assignRowItems(row.items, columns, numCols);
|
|
4050
|
+
for (const { col, items } of assignments) {
|
|
4051
|
+
const text = items.map((i) => i.text).join(" ");
|
|
3655
4052
|
const existing = cells[r][col].text;
|
|
3656
|
-
cells[r][col].text = existing ? existing + " " +
|
|
3657
|
-
usedItems.add(item);
|
|
4053
|
+
cells[r][col].text = existing ? existing + " " + text : text;
|
|
4054
|
+
for (const item of items) usedItems.add(item);
|
|
3658
4055
|
}
|
|
3659
4056
|
}
|
|
3660
4057
|
let emptyRows = 0;
|
|
@@ -3666,11 +4063,48 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
3666
4063
|
const hasValue = cells.some((row) => row[c].text !== "");
|
|
3667
4064
|
if (!hasValue) return null;
|
|
3668
4065
|
}
|
|
4066
|
+
for (let r = numRows - 1; r >= 1; r--) {
|
|
4067
|
+
const nonEmptyCols = cells[r].filter((c) => c.text.trim()).length;
|
|
4068
|
+
if (nonEmptyCols !== 1) continue;
|
|
4069
|
+
if (cells[r][0].text.trim() !== "") continue;
|
|
4070
|
+
const contentText = cells[r].find((c) => c.text.trim())?.text.trim() || "";
|
|
4071
|
+
if (/^[○●▶\-·]/.test(contentText)) continue;
|
|
4072
|
+
for (let pr = r - 1; pr >= 0; pr--) {
|
|
4073
|
+
if (cells[pr].some((c) => c.text.trim())) {
|
|
4074
|
+
for (let c = 0; c < numCols; c++) {
|
|
4075
|
+
const prev = cells[pr][c].text.trim();
|
|
4076
|
+
const curr = cells[r][c].text.trim();
|
|
4077
|
+
if (curr) cells[pr][c].text = prev ? prev + " " + curr : curr;
|
|
4078
|
+
}
|
|
4079
|
+
for (let c = 0; c < numCols; c++) cells[r][c].text = "";
|
|
4080
|
+
break;
|
|
4081
|
+
}
|
|
4082
|
+
}
|
|
4083
|
+
}
|
|
4084
|
+
for (let r = 0; r < cells.length - 1; r++) {
|
|
4085
|
+
const row = cells[r];
|
|
4086
|
+
const hasCol0 = row[0].text.trim() !== "";
|
|
4087
|
+
const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
|
|
4088
|
+
const midEmpty = row.slice(1, numCols - 1).every((c) => c.text.trim() === "");
|
|
4089
|
+
if (hasCol0 && hasColLast && midEmpty) {
|
|
4090
|
+
const next = cells[r + 1];
|
|
4091
|
+
if (next[0].text.trim() === "" && next.some((c) => c.text.trim())) {
|
|
4092
|
+
for (let c = 1; c < numCols; c++) {
|
|
4093
|
+
const curr = next[c].text.trim();
|
|
4094
|
+
if (curr) row[c].text = row[c].text.trim() ? row[c].text.trim() + " " + curr : curr;
|
|
4095
|
+
}
|
|
4096
|
+
for (let c = 0; c < numCols; c++) next[c].text = "";
|
|
4097
|
+
}
|
|
4098
|
+
}
|
|
4099
|
+
}
|
|
4100
|
+
const filteredCells = cells.filter((row) => row.some((c) => c.text.trim()));
|
|
4101
|
+
const finalRowCount = filteredCells.length;
|
|
4102
|
+
if (finalRowCount < MIN_ROWS) return null;
|
|
3669
4103
|
const irTable = {
|
|
3670
|
-
rows:
|
|
4104
|
+
rows: finalRowCount,
|
|
3671
4105
|
cols: numCols,
|
|
3672
|
-
cells,
|
|
3673
|
-
hasHeader:
|
|
4106
|
+
cells: filteredCells,
|
|
4107
|
+
hasHeader: finalRowCount > 1
|
|
3674
4108
|
};
|
|
3675
4109
|
const allItems = rows.flatMap((r) => r.items);
|
|
3676
4110
|
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
@@ -3747,7 +4181,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3747
4181
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
3748
4182
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
3749
4183
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
3750
|
-
const
|
|
4184
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
3751
4185
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
3752
4186
|
let parsedPages = 0;
|
|
3753
4187
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
@@ -3764,7 +4198,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3764
4198
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
3765
4199
|
}
|
|
3766
4200
|
for (const item of visible) {
|
|
3767
|
-
if (item.fontSize > 0)
|
|
4201
|
+
if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
|
|
3768
4202
|
}
|
|
3769
4203
|
const opList = await page.getOperatorList();
|
|
3770
4204
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -3803,10 +4237,9 @@ async function parsePdfDocument(buffer, options) {
|
|
|
3803
4237
|
blocks.splice(removed[ri], 1);
|
|
3804
4238
|
}
|
|
3805
4239
|
}
|
|
3806
|
-
const medianFontSize =
|
|
4240
|
+
const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
|
|
3807
4241
|
if (medianFontSize > 0) {
|
|
3808
4242
|
detectHeadings(blocks, medianFontSize);
|
|
3809
|
-
mergeAdjacentHeadings(blocks);
|
|
3810
4243
|
}
|
|
3811
4244
|
detectMarkerHeadings(blocks);
|
|
3812
4245
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
@@ -3857,11 +4290,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
3857
4290
|
}
|
|
3858
4291
|
return { visible, hiddenCount };
|
|
3859
4292
|
}
|
|
3860
|
-
function
|
|
3861
|
-
if (
|
|
3862
|
-
|
|
3863
|
-
const
|
|
3864
|
-
|
|
4293
|
+
/**
 * Returns the median font size from a size -> occurrence-count histogram.
 *
 * Sizes are walked in ascending order; the result is the first size whose
 * cumulative count exceeds `floor(total / 2)`. For an even total this is the
 * upper of the two middle values.
 *
 * @param {Map<number, number>} freq Font size -> number of occurrences.
 * @returns {number} The median size, 0 for an empty map, or the largest size
 *   if no cumulative count ever exceeds the midpoint (e.g. all counts are 0).
 */
function computeMedianFontSizeFromFreq(freq) {
  if (freq.size === 0) return 0;

  let total = 0;
  freq.forEach((count) => {
    total += count;
  });
  const midpoint = Math.floor(total / 2);

  const ascending = Array.from(freq).sort(([sizeA], [sizeB]) => sizeA - sizeB);
  let seen = 0;
  const hit = ascending.find(([, count]) => {
    seen += count;
    return seen > midpoint;
  });
  return hit ? hit[0] : ascending[ascending.length - 1][0];
}
|
|
3866
4306
|
function detectHeadings(blocks, medianFontSize) {
|
|
3867
4307
|
for (const block of blocks) {
|
|
@@ -3881,220 +4321,27 @@ function detectHeadings(blocks, medianFontSize) {
|
|
|
3881
4321
|
}
|
|
3882
4322
|
}
|
|
3883
4323
|
}
|
|
3884
|
-
function mergeAdjacentHeadings(blocks) {
|
|
3885
|
-
let i = 0;
|
|
3886
|
-
while (i < blocks.length - 1) {
|
|
3887
|
-
const curr = blocks[i];
|
|
3888
|
-
const next = blocks[i + 1];
|
|
3889
|
-
if (curr.type !== "heading" || next.type !== "heading") {
|
|
3890
|
-
i++;
|
|
3891
|
-
continue;
|
|
3892
|
-
}
|
|
3893
|
-
if (!curr.bbox || !next.bbox || !curr.text || !next.text) {
|
|
3894
|
-
i++;
|
|
3895
|
-
continue;
|
|
3896
|
-
}
|
|
3897
|
-
const currBaseline = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
|
|
3898
|
-
const nextBaseline = next.bbox.y + (next.style?.fontSize || next.bbox.height);
|
|
3899
|
-
const yDiff = Math.abs(currBaseline - nextBaseline);
|
|
3900
|
-
const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
|
|
3901
|
-
const sameY = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
|
|
3902
|
-
const sameLevel = curr.level === next.level;
|
|
3903
|
-
if (sameY && sameLevel) {
|
|
3904
|
-
const currX = curr.bbox.x;
|
|
3905
|
-
const nextX = next.bbox.x;
|
|
3906
|
-
if (currX <= nextX) {
|
|
3907
|
-
curr.text = curr.text + " " + next.text;
|
|
3908
|
-
} else {
|
|
3909
|
-
curr.text = next.text + " " + curr.text;
|
|
3910
|
-
}
|
|
3911
|
-
curr.bbox = {
|
|
3912
|
-
page: curr.bbox.page,
|
|
3913
|
-
x: Math.min(curr.bbox.x, next.bbox.x),
|
|
3914
|
-
y: Math.min(curr.bbox.y, next.bbox.y),
|
|
3915
|
-
width: Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width) - Math.min(curr.bbox.x, next.bbox.x),
|
|
3916
|
-
height: Math.max(curr.bbox.height, next.bbox.height)
|
|
3917
|
-
};
|
|
3918
|
-
blocks.splice(i + 1, 1);
|
|
3919
|
-
} else {
|
|
3920
|
-
i++;
|
|
3921
|
-
}
|
|
3922
|
-
}
|
|
3923
|
-
}
|
|
3924
4324
|
function collapseEvenSpacing(text) {
|
|
3925
4325
|
const tokens = text.split(" ");
|
|
3926
4326
|
const singleCharCount = tokens.filter((t) => t.length === 1).length;
|
|
3927
4327
|
if (tokens.length >= 3 && singleCharCount / tokens.length >= 0.7) {
|
|
3928
4328
|
return tokens.join("");
|
|
3929
4329
|
}
|
|
3930
|
-
return text
|
|
3931
|
-
}
|
|
3932
|
-
|
|
3933
|
-
const allY = items.map((i) => i.y);
|
|
3934
|
-
const pageHeight = Math.max(...allY) - Math.min(...allY);
|
|
3935
|
-
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
3936
|
-
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
3937
|
-
const blocks = [];
|
|
3938
|
-
for (const group of orderedGroups) {
|
|
3939
|
-
if (group.length === 0) continue;
|
|
3940
|
-
const yLines = groupByY(group);
|
|
3941
|
-
for (const line of yLines) {
|
|
3942
|
-
const text = mergeLineSimple(line);
|
|
3943
|
-
if (!text.trim()) continue;
|
|
3944
|
-
blocks.push({
|
|
3945
|
-
type: "paragraph",
|
|
3946
|
-
text,
|
|
3947
|
-
pageNumber: pageNum,
|
|
3948
|
-
bbox: computeBBox(line, pageNum),
|
|
3949
|
-
style: dominantStyle(line)
|
|
3950
|
-
});
|
|
3951
|
-
}
|
|
3952
|
-
}
|
|
3953
|
-
return blocks.length > 0 ? blocks : null;
|
|
3954
|
-
}
|
|
3955
|
-
function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
|
|
3956
|
-
const totalCells = table.cells.reduce((sum, row) => sum + row.filter((c) => c.text.trim()).length, 0);
|
|
3957
|
-
const totalTextLines = table.cells.reduce((sum, row) => sum + row.reduce((s, c) => s + (c.text.trim() ? c.text.split("\n").length : 0), 0), 0);
|
|
3958
|
-
const isUnderSegmented = table.rows === 1 && table.cols === 1 || totalCells <= 2 && totalTextLines >= 8 || totalCells <= 2 && items.length >= 6;
|
|
3959
|
-
if (!isUnderSegmented) return null;
|
|
3960
|
-
if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);
|
|
3961
|
-
const directTable = buildTableFromTextLayout(items, pageNum, bbox);
|
|
3962
|
-
if (directTable) return directTable;
|
|
3963
|
-
const clusterItems = items.map((i) => ({
|
|
3964
|
-
text: i.text,
|
|
3965
|
-
x: i.x,
|
|
3966
|
-
y: i.y,
|
|
3967
|
-
w: i.w,
|
|
3968
|
-
h: i.h,
|
|
3969
|
-
fontSize: i.fontSize,
|
|
3970
|
-
fontName: i.fontName
|
|
3971
|
-
}));
|
|
3972
|
-
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
3973
|
-
if (clusterResults.length > 0) {
|
|
3974
|
-
const blocks = [];
|
|
3975
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
3976
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
3977
|
-
const usedIndices = /* @__PURE__ */ new Set();
|
|
3978
|
-
for (const cr of clusterResults) {
|
|
3979
|
-
for (const ci of cr.usedItems) {
|
|
3980
|
-
const idx = ciToIdx.get(ci);
|
|
3981
|
-
if (idx !== void 0) usedIndices.add(idx);
|
|
3982
|
-
}
|
|
3983
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
3984
|
-
}
|
|
3985
|
-
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
3986
|
-
for (const item of remaining) {
|
|
3987
|
-
if (!item.text.trim()) continue;
|
|
3988
|
-
blocks.push({
|
|
3989
|
-
type: "paragraph",
|
|
3990
|
-
text: item.text,
|
|
3991
|
-
pageNumber: pageNum,
|
|
3992
|
-
bbox: computeBBox([item], pageNum),
|
|
3993
|
-
style: { fontSize: item.fontSize, fontName: item.fontName }
|
|
3994
|
-
});
|
|
3995
|
-
}
|
|
3996
|
-
blocks.sort((a, b) => {
|
|
3997
|
-
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
3998
|
-
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
3999
|
-
return by - ay;
|
|
4000
|
-
});
|
|
4001
|
-
return blocks.length > 0 ? blocks : null;
|
|
4002
|
-
}
|
|
4003
|
-
return null;
|
|
4004
|
-
}
|
|
4005
|
-
function buildTableFromTextLayout(items, pageNum, bbox) {
|
|
4006
|
-
if (items.length < 4) return null;
|
|
4007
|
-
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4008
|
-
const yTol = 3;
|
|
4009
|
-
const rows = [];
|
|
4010
|
-
let curRow = [sorted[0]];
|
|
4011
|
-
let curY = sorted[0].y;
|
|
4012
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
4013
|
-
if (Math.abs(sorted[i].y - curY) <= yTol) {
|
|
4014
|
-
curRow.push(sorted[i]);
|
|
4015
|
-
} else {
|
|
4016
|
-
rows.push(curRow);
|
|
4017
|
-
curRow = [sorted[i]];
|
|
4018
|
-
curY = sorted[i].y;
|
|
4019
|
-
}
|
|
4020
|
-
}
|
|
4021
|
-
rows.push(curRow);
|
|
4022
|
-
if (rows.length < 2) return null;
|
|
4023
|
-
const gapPositions = [];
|
|
4024
|
-
for (const row of rows) {
|
|
4025
|
-
if (row.length < 2) continue;
|
|
4026
|
-
const sortedX = [...row].sort((a, b) => a.x - b.x);
|
|
4027
|
-
const avgFs = sortedX.reduce((s, i) => s + i.fontSize, 0) / sortedX.length;
|
|
4028
|
-
for (let j = 1; j < sortedX.length; j++) {
|
|
4029
|
-
const gap = sortedX[j].x - (sortedX[j - 1].x + sortedX[j - 1].w);
|
|
4030
|
-
if (gap >= avgFs * 1.5) {
|
|
4031
|
-
gapPositions.push(sortedX[j - 1].x + sortedX[j - 1].w + gap / 2);
|
|
4032
|
-
}
|
|
4033
|
-
}
|
|
4034
|
-
}
|
|
4035
|
-
if (gapPositions.length < 2) return null;
|
|
4036
|
-
gapPositions.sort((a, b) => a - b);
|
|
4037
|
-
const colBoundaries = [];
|
|
4038
|
-
let clusterSum = gapPositions[0], clusterCount = 1;
|
|
4039
|
-
for (let i = 1; i < gapPositions.length; i++) {
|
|
4040
|
-
const avg = clusterSum / clusterCount;
|
|
4041
|
-
if (Math.abs(gapPositions[i] - avg) <= 15) {
|
|
4042
|
-
clusterSum += gapPositions[i];
|
|
4043
|
-
clusterCount++;
|
|
4044
|
-
} else {
|
|
4045
|
-
if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
|
|
4046
|
-
clusterSum = gapPositions[i];
|
|
4047
|
-
clusterCount = 1;
|
|
4048
|
-
}
|
|
4049
|
-
}
|
|
4050
|
-
if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
|
|
4051
|
-
if (colBoundaries.length === 0) return null;
|
|
4052
|
-
const numCols = colBoundaries.length + 1;
|
|
4053
|
-
const tableRows = [];
|
|
4054
|
-
for (const row of rows) {
|
|
4055
|
-
const cells = Array(numCols).fill("");
|
|
4056
|
-
const sortedX = [...row].sort((a, b) => a.x - b.x);
|
|
4057
|
-
for (const item of sortedX) {
|
|
4058
|
-
const cx = item.x + item.w / 2;
|
|
4059
|
-
let col = 0;
|
|
4060
|
-
for (let b = 0; b < colBoundaries.length; b++) {
|
|
4061
|
-
if (cx > colBoundaries[b]) col = b + 1;
|
|
4062
|
-
}
|
|
4063
|
-
cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
|
|
4064
|
-
}
|
|
4065
|
-
if (cells[0].trim() === "" && tableRows.length > 0) {
|
|
4066
|
-
const prevCells = tableRows[tableRows.length - 1].cells;
|
|
4067
|
-
for (let c = 0; c < numCols; c++) {
|
|
4068
|
-
if (cells[c].trim()) {
|
|
4069
|
-
prevCells[c] = prevCells[c] ? prevCells[c] + " " + cells[c].trim() : cells[c].trim();
|
|
4070
|
-
}
|
|
4071
|
-
}
|
|
4072
|
-
} else {
|
|
4073
|
-
tableRows.push({ cells });
|
|
4074
|
-
}
|
|
4075
|
-
}
|
|
4076
|
-
if (tableRows.length < 2) return null;
|
|
4077
|
-
const nonEmptyCount = tableRows.reduce((sum, r) => sum + r.cells.filter((c) => c.trim()).length, 0);
|
|
4078
|
-
const totalCount = tableRows.length * numCols;
|
|
4079
|
-
if (nonEmptyCount < totalCount * 0.3) return null;
|
|
4080
|
-
const irCells = tableRows.map(
|
|
4081
|
-
(r) => r.cells.map((text, colIdx) => {
|
|
4082
|
-
let cleaned = text.trim();
|
|
4083
|
-
if (colIdx > 0) cleaned = cleaned.replace(/^[•○·\-]\s*/, "");
|
|
4084
|
-
return { text: cleaned, colSpan: 1, rowSpan: 1 };
|
|
4085
|
-
})
|
|
4330
|
+
return text.replace(
|
|
4331
|
+
/(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
|
|
4332
|
+
(match) => match.replace(/ /g, "")
|
|
4086
4333
|
);
|
|
4087
|
-
const irTable = {
|
|
4088
|
-
rows: tableRows.length,
|
|
4089
|
-
cols: numCols,
|
|
4090
|
-
cells: irCells,
|
|
4091
|
-
hasHeader: tableRows.length > 1
|
|
4092
|
-
};
|
|
4093
|
-
return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
|
|
4094
4334
|
}
|
|
4095
4335
|
function shouldDemoteTable(table) {
|
|
4096
4336
|
const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
|
|
4097
4337
|
const allText = allCells.join(" ");
|
|
4338
|
+
if (table.rows <= 3 && table.cols <= 3) {
|
|
4339
|
+
const totalCells2 = table.rows * table.cols;
|
|
4340
|
+
const emptyCells2 = totalCells2 - allCells.length;
|
|
4341
|
+
if (emptyCells2 >= totalCells2 * 0.3) return true;
|
|
4342
|
+
if (/[□■◆○●▶ㅇ]/.test(allText)) return true;
|
|
4343
|
+
if (/<[^>]+>/.test(allText)) return true;
|
|
4344
|
+
}
|
|
4098
4345
|
if (allText.length > 200) return false;
|
|
4099
4346
|
if (/[□■◆○●▶]/.test(allText) && table.rows <= 3) return true;
|
|
4100
4347
|
const totalCells = table.rows * table.cols;
|
|
@@ -4138,32 +4385,6 @@ function detectMarkerHeadings(blocks) {
|
|
|
4138
4385
|
}
|
|
4139
4386
|
}
|
|
4140
4387
|
}
|
|
4141
|
-
function hasMultiColumnLayout(items) {
|
|
4142
|
-
if (items.length < 30) return false;
|
|
4143
|
-
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
4144
|
-
const minX = sorted[0].x;
|
|
4145
|
-
let maxX = minX;
|
|
4146
|
-
for (const i of sorted) if (i.x + i.w > maxX) maxX = i.x + i.w;
|
|
4147
|
-
const pageWidth = maxX - minX;
|
|
4148
|
-
if (pageWidth < 200) return false;
|
|
4149
|
-
let bestGap = 0;
|
|
4150
|
-
let bestSplit = 0;
|
|
4151
|
-
for (let j = 1; j < sorted.length; j++) {
|
|
4152
|
-
const gap = sorted[j].x - (sorted[j - 1].x + sorted[j - 1].w);
|
|
4153
|
-
if (gap > bestGap) {
|
|
4154
|
-
bestGap = gap;
|
|
4155
|
-
bestSplit = (sorted[j - 1].x + sorted[j - 1].w + sorted[j].x) / 2;
|
|
4156
|
-
}
|
|
4157
|
-
}
|
|
4158
|
-
if (bestGap < 20) return false;
|
|
4159
|
-
const splitRatio = (bestSplit - minX) / pageWidth;
|
|
4160
|
-
if (splitRatio < 0.35 || splitRatio > 0.65) return false;
|
|
4161
|
-
const leftCount = items.filter((i) => i.x + i.w / 2 < bestSplit).length;
|
|
4162
|
-
const rightCount = items.filter((i) => i.x + i.w / 2 >= bestSplit).length;
|
|
4163
|
-
if (leftCount < 15 || rightCount < 15) return false;
|
|
4164
|
-
if (Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount) < 0.35) return false;
|
|
4165
|
-
return true;
|
|
4166
|
-
}
|
|
4167
4388
|
var MAX_XYCUT_DEPTH = 50;
|
|
4168
4389
|
function xyCutOrder(items, gapThreshold, depth = 0) {
|
|
4169
4390
|
if (items.length === 0) return [];
|
|
@@ -4231,6 +4452,7 @@ function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeigh
|
|
|
4231
4452
|
if (items.length === 0) return [];
|
|
4232
4453
|
let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
|
|
4233
4454
|
({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
|
|
4455
|
+
({ horizontals, verticals } = preprocessLines(horizontals, verticals));
|
|
4234
4456
|
const grids = buildTableGrids(horizontals, verticals);
|
|
4235
4457
|
if (grids.length > 0) {
|
|
4236
4458
|
return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
|
|
@@ -4242,14 +4464,19 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4242
4464
|
const usedItems = /* @__PURE__ */ new Set();
|
|
4243
4465
|
const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
4244
4466
|
for (const grid of sortedGrids) {
|
|
4467
|
+
const numGridRows = grid.rowYs.length - 1;
|
|
4468
|
+
const numGridCols = grid.colXs.length - 1;
|
|
4469
|
+
if (numGridRows === 1 && numGridCols >= 2) continue;
|
|
4245
4470
|
const tableItems = [];
|
|
4246
4471
|
const pad = 3;
|
|
4472
|
+
const gridW = grid.bbox.x2 - grid.bbox.x1;
|
|
4247
4473
|
for (const item of items) {
|
|
4248
4474
|
if (usedItems.has(item)) continue;
|
|
4249
|
-
if (item.
|
|
4250
|
-
|
|
4251
|
-
|
|
4252
|
-
|
|
4475
|
+
if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
|
|
4476
|
+
if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
|
|
4477
|
+
if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
|
|
4478
|
+
tableItems.push(item);
|
|
4479
|
+
usedItems.add(item);
|
|
4253
4480
|
}
|
|
4254
4481
|
const cells = extractCells(grid, horizontals, verticals);
|
|
4255
4482
|
if (cells.length === 0) continue;
|
|
@@ -4273,6 +4500,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4273
4500
|
const cellItems = cellTextMap.get(cell) || [];
|
|
4274
4501
|
let text = cellTextToString(cellItems);
|
|
4275
4502
|
text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
|
|
4503
|
+
text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
|
|
4276
4504
|
irGrid[cell.row][cell.col] = {
|
|
4277
4505
|
text,
|
|
4278
4506
|
colSpan: cell.colSpan,
|
|
@@ -4294,31 +4522,61 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
4294
4522
|
width: grid.bbox.x2 - grid.bbox.x1,
|
|
4295
4523
|
height: grid.bbox.y2 - grid.bbox.y1
|
|
4296
4524
|
};
|
|
4297
|
-
const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
|
|
4298
|
-
if (normalized) {
|
|
4299
|
-
blocks.push(...normalized);
|
|
4300
|
-
continue;
|
|
4301
|
-
}
|
|
4302
4525
|
if (shouldDemoteTable(irTable)) {
|
|
4303
4526
|
const demoted = demoteTableToText(irTable);
|
|
4304
4527
|
if (demoted) {
|
|
4305
|
-
|
|
4528
|
+
const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
|
|
4529
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
|
|
4306
4530
|
}
|
|
4307
4531
|
continue;
|
|
4308
4532
|
}
|
|
4309
4533
|
blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
|
|
4310
4534
|
}
|
|
4311
|
-
|
|
4535
|
+
let remaining = items.filter((i) => !usedItems.has(i));
|
|
4312
4536
|
if (remaining.length > 0) {
|
|
4313
4537
|
remaining.sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4314
|
-
const
|
|
4315
|
-
|
|
4316
|
-
|
|
4538
|
+
const clusterItems = remaining.map((i) => ({
|
|
4539
|
+
text: i.text,
|
|
4540
|
+
x: i.x,
|
|
4541
|
+
y: i.y,
|
|
4542
|
+
w: i.w,
|
|
4543
|
+
h: i.h,
|
|
4544
|
+
fontSize: i.fontSize,
|
|
4545
|
+
fontName: i.fontName
|
|
4546
|
+
}));
|
|
4547
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4548
|
+
if (clusterResults.length > 0) {
|
|
4549
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4550
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4551
|
+
const usedClusterIndices = /* @__PURE__ */ new Set();
|
|
4552
|
+
for (const cr of clusterResults) {
|
|
4553
|
+
for (const ci of cr.usedItems) {
|
|
4554
|
+
const idx = ciToIdx.get(ci);
|
|
4555
|
+
if (idx !== void 0) usedClusterIndices.add(idx);
|
|
4556
|
+
}
|
|
4557
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4558
|
+
}
|
|
4559
|
+
remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
|
|
4560
|
+
}
|
|
4561
|
+
if (remaining.length > 0) {
|
|
4562
|
+
const allY = remaining.map((i) => i.y);
|
|
4563
|
+
const pageH = safeMax(allY) - safeMin(allY);
|
|
4564
|
+
const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
|
|
4565
|
+
const textBlocks = [];
|
|
4566
|
+
for (const group of groups) {
|
|
4567
|
+
if (group.length === 0) continue;
|
|
4568
|
+
const groupBlocks = extractPageBlocksFallback(group, pageNum);
|
|
4569
|
+
for (const b of groupBlocks) textBlocks.push(b);
|
|
4570
|
+
}
|
|
4571
|
+
const finalTextBlocks = detectListBlocks(textBlocks);
|
|
4572
|
+
for (const b of finalTextBlocks) blocks.push(b);
|
|
4573
|
+
}
|
|
4574
|
+
blocks.sort((a, b) => {
|
|
4317
4575
|
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4318
4576
|
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4319
4577
|
return by - ay;
|
|
4320
4578
|
});
|
|
4321
|
-
return mergeAdjacentTableBlocks(
|
|
4579
|
+
return mergeAdjacentTableBlocks(blocks);
|
|
4322
4580
|
}
|
|
4323
4581
|
return mergeAdjacentTableBlocks(blocks);
|
|
4324
4582
|
}
|
|
@@ -4344,57 +4602,53 @@ function mergeAdjacentTableBlocks(blocks) {
|
|
|
4344
4602
|
}
|
|
4345
4603
|
function extractPageBlocksFallback(items, pageNum) {
|
|
4346
4604
|
if (items.length === 0) return [];
|
|
4347
|
-
if (hasMultiColumnLayout(items)) {
|
|
4348
|
-
const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
|
|
4349
|
-
return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
|
|
4350
|
-
}
|
|
4351
4605
|
const blocks = [];
|
|
4352
|
-
const
|
|
4353
|
-
|
|
4354
|
-
|
|
4355
|
-
|
|
4356
|
-
|
|
4357
|
-
|
|
4358
|
-
|
|
4359
|
-
|
|
4360
|
-
|
|
4361
|
-
|
|
4362
|
-
|
|
4363
|
-
|
|
4364
|
-
|
|
4365
|
-
|
|
4366
|
-
|
|
4367
|
-
|
|
4368
|
-
|
|
4369
|
-
|
|
4370
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4371
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4372
|
-
const usedIndices = /* @__PURE__ */ new Set();
|
|
4373
|
-
for (const cr of clusterResults) {
|
|
4374
|
-
for (const ci of cr.usedItems) {
|
|
4375
|
-
const idx = ciToIdx.get(ci);
|
|
4376
|
-
if (idx !== void 0) usedIndices.add(idx);
|
|
4377
|
-
}
|
|
4378
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4606
|
+
const clusterItems = items.map((i) => ({
|
|
4607
|
+
text: i.text,
|
|
4608
|
+
x: i.x,
|
|
4609
|
+
y: i.y,
|
|
4610
|
+
w: i.w,
|
|
4611
|
+
h: i.h,
|
|
4612
|
+
fontSize: i.fontSize,
|
|
4613
|
+
fontName: i.fontName
|
|
4614
|
+
}));
|
|
4615
|
+
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4616
|
+
if (clusterResults.length > 0) {
|
|
4617
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4618
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4619
|
+
const usedIndices = /* @__PURE__ */ new Set();
|
|
4620
|
+
for (const cr of clusterResults) {
|
|
4621
|
+
for (const ci of cr.usedItems) {
|
|
4622
|
+
const idx = ciToIdx.get(ci);
|
|
4623
|
+
if (idx !== void 0) usedIndices.add(idx);
|
|
4379
4624
|
}
|
|
4380
|
-
|
|
4381
|
-
|
|
4382
|
-
|
|
4383
|
-
|
|
4384
|
-
|
|
4385
|
-
|
|
4386
|
-
|
|
4387
|
-
|
|
4388
|
-
|
|
4625
|
+
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4626
|
+
}
|
|
4627
|
+
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
4628
|
+
if (remaining.length > 0) {
|
|
4629
|
+
const yLines = groupByY(remaining);
|
|
4630
|
+
for (const line of yLines) {
|
|
4631
|
+
const text = mergeLineSimple(line);
|
|
4632
|
+
if (!text.trim()) continue;
|
|
4633
|
+
const bbox = computeBBox(line, pageNum);
|
|
4634
|
+
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
|
|
4389
4635
|
}
|
|
4390
|
-
|
|
4391
|
-
|
|
4392
|
-
|
|
4393
|
-
|
|
4394
|
-
|
|
4636
|
+
}
|
|
4637
|
+
blocks.sort((a, b) => {
|
|
4638
|
+
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4639
|
+
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4640
|
+
return by - ay;
|
|
4641
|
+
});
|
|
4642
|
+
} else {
|
|
4643
|
+
const allYLines = groupByY(items);
|
|
4644
|
+
const columns = detectColumns(allYLines);
|
|
4645
|
+
if (columns && columns.length >= 3) {
|
|
4646
|
+
const tableText = extractWithColumns(allYLines, columns);
|
|
4647
|
+
const bbox = computeBBox(items, pageNum);
|
|
4648
|
+
blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
|
|
4395
4649
|
} else {
|
|
4396
4650
|
const allY = items.map((i) => i.y);
|
|
4397
|
-
const pageHeight =
|
|
4651
|
+
const pageHeight = safeMax(allY) - safeMin(allY);
|
|
4398
4652
|
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
4399
4653
|
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
4400
4654
|
for (const group of orderedGroups) {
|
|
@@ -4447,22 +4701,76 @@ function dominantStyle(items) {
|
|
|
4447
4701
|
return { fontSize: dominantSize, fontName };
|
|
4448
4702
|
}
|
|
4449
4703
|
function normalizeItems(rawItems) {
|
|
4450
|
-
|
|
4704
|
+
const items = [];
|
|
4705
|
+
const spacePositions = [];
|
|
4706
|
+
for (const i of rawItems) {
|
|
4707
|
+
if (typeof i.str !== "string") continue;
|
|
4708
|
+
const x = Math.round(i.transform[4]);
|
|
4709
|
+
const y = Math.round(i.transform[5]);
|
|
4710
|
+
if (!i.str.trim()) {
|
|
4711
|
+
spacePositions.push({ x, y });
|
|
4712
|
+
continue;
|
|
4713
|
+
}
|
|
4451
4714
|
const scaleY = Math.abs(i.transform[3]);
|
|
4452
4715
|
const scaleX = Math.abs(i.transform[0]);
|
|
4453
4716
|
const fontSize = Math.round(Math.max(scaleY, scaleX));
|
|
4454
|
-
|
|
4455
|
-
|
|
4456
|
-
|
|
4457
|
-
|
|
4458
|
-
|
|
4459
|
-
|
|
4460
|
-
|
|
4461
|
-
|
|
4462
|
-
|
|
4463
|
-
|
|
4464
|
-
|
|
4465
|
-
|
|
4717
|
+
const w = Math.round(i.width);
|
|
4718
|
+
const h = Math.round(i.height);
|
|
4719
|
+
const isHidden = fontSize === 0 || i.width === 0 && i.str.trim().length > 0;
|
|
4720
|
+
let text = i.str.trim();
|
|
4721
|
+
if (/^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text)) {
|
|
4722
|
+
text = text.replace(/ /g, "");
|
|
4723
|
+
}
|
|
4724
|
+
const split = splitEvenSpacedItem(text, x, w, fontSize);
|
|
4725
|
+
if (split) {
|
|
4726
|
+
for (const s of split) {
|
|
4727
|
+
items.push({ text: s.text, x: s.x, y, w: s.w, h, fontSize, fontName: i.fontName || "", isHidden });
|
|
4728
|
+
}
|
|
4729
|
+
} else {
|
|
4730
|
+
items.push({ text, x, y, w, h, fontSize, fontName: i.fontName || "", isHidden });
|
|
4731
|
+
}
|
|
4732
|
+
}
|
|
4733
|
+
const sorted = items.sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4734
|
+
const deduped = [];
|
|
4735
|
+
for (let i = 0; i < sorted.length; i++) {
|
|
4736
|
+
let isDup = false;
|
|
4737
|
+
for (let j = deduped.length - 1; j >= 0; j--) {
|
|
4738
|
+
const prev = deduped[j];
|
|
4739
|
+
if (prev.y - sorted[i].y > 3) break;
|
|
4740
|
+
if (Math.abs(prev.y - sorted[i].y) <= 3 && prev.text === sorted[i].text && Math.abs(prev.x - sorted[i].x) <= 3) {
|
|
4741
|
+
isDup = true;
|
|
4742
|
+
break;
|
|
4743
|
+
}
|
|
4744
|
+
}
|
|
4745
|
+
if (!isDup) deduped.push(sorted[i]);
|
|
4746
|
+
}
|
|
4747
|
+
if (spacePositions.length > 0) {
|
|
4748
|
+
for (const item of deduped) {
|
|
4749
|
+
for (const sp of spacePositions) {
|
|
4750
|
+
if (Math.abs(sp.y - item.y) <= 3) {
|
|
4751
|
+
const dist = item.x - sp.x;
|
|
4752
|
+
if (dist >= 0 && dist <= 20) {
|
|
4753
|
+
item.hasSpaceBefore = true;
|
|
4754
|
+
break;
|
|
4755
|
+
}
|
|
4756
|
+
}
|
|
4757
|
+
}
|
|
4758
|
+
}
|
|
4759
|
+
}
|
|
4760
|
+
return deduped;
|
|
4761
|
+
}
|
|
4762
|
+
function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
|
|
4763
|
+
if (!/^[가-힣\d](?: [가-힣\d]){2,}$/.test(text)) return null;
|
|
4764
|
+
const chars = text.split(" ");
|
|
4765
|
+
if (chars.length < 3) return null;
|
|
4766
|
+
const charW = itemW / chars.length;
|
|
4767
|
+
if (charW > fontSize * 2) return null;
|
|
4768
|
+
return chars.map((ch, idx) => ({
|
|
4769
|
+
text: ch,
|
|
4770
|
+
x: Math.round(itemX + idx * charW),
|
|
4771
|
+
w: Math.round(charW * 0.8)
|
|
4772
|
+
// 실제 글자 폭은 간격보다 좁음
|
|
4773
|
+
}));
|
|
4466
4774
|
}
|
|
4467
4775
|
function groupByY(items) {
|
|
4468
4776
|
if (items.length === 0) return [];
|
|
@@ -4487,14 +4795,14 @@ function isProseSpread(items) {
|
|
|
4487
4795
|
for (let i = 1; i < sorted.length; i++) {
|
|
4488
4796
|
gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
|
|
4489
4797
|
}
|
|
4490
|
-
const maxGap =
|
|
4798
|
+
const maxGap = safeMax(gaps);
|
|
4491
4799
|
const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
|
|
4492
4800
|
return maxGap < 40 && avgLen < 5;
|
|
4493
4801
|
}
|
|
4494
4802
|
function detectColumns(yLines) {
|
|
4495
4803
|
const allItems = yLines.flat();
|
|
4496
4804
|
if (allItems.length === 0) return null;
|
|
4497
|
-
const pageWidth =
|
|
4805
|
+
const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
|
|
4498
4806
|
if (pageWidth < 100) return null;
|
|
4499
4807
|
let bigoLineIdx = -1;
|
|
4500
4808
|
for (let i = 0; i < yLines.length; i++) {
|
|
@@ -4526,7 +4834,7 @@ function detectColumns(yLines) {
|
|
|
4526
4834
|
}
|
|
4527
4835
|
const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
|
|
4528
4836
|
if (peaks.length < 3) return null;
|
|
4529
|
-
const MERGE_TOL =
|
|
4837
|
+
const MERGE_TOL = 40;
|
|
4530
4838
|
const merged = [peaks[0]];
|
|
4531
4839
|
for (let i = 1; i < peaks.length; i++) {
|
|
4532
4840
|
const prev = merged[merged.length - 1];
|
|
@@ -4540,7 +4848,14 @@ function detectColumns(yLines) {
|
|
|
4540
4848
|
merged.push({ ...peaks[i] });
|
|
4541
4849
|
}
|
|
4542
4850
|
}
|
|
4543
|
-
const
|
|
4851
|
+
const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
|
|
4852
|
+
if (rawColumns.length < 3) return null;
|
|
4853
|
+
const MIN_DETECT_COL_WIDTH = 30;
|
|
4854
|
+
const columns = [rawColumns[0]];
|
|
4855
|
+
for (let i = 1; i < rawColumns.length; i++) {
|
|
4856
|
+
if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
|
|
4857
|
+
columns.push(rawColumns[i]);
|
|
4858
|
+
}
|
|
4544
4859
|
return columns.length >= 3 ? columns : null;
|
|
4545
4860
|
}
|
|
4546
4861
|
function findColumn(x, columns) {
|
|
@@ -4668,6 +4983,16 @@ function buildGridTable(lines, columns) {
|
|
|
4668
4983
|
}
|
|
4669
4984
|
merged.splice(0, headerEnd, headerRow);
|
|
4670
4985
|
}
|
|
4986
|
+
for (const row of merged) {
|
|
4987
|
+
for (let c = 0; c < row.length; c++) {
|
|
4988
|
+
if (row[c]) row[c] = collapseEvenSpacing(row[c]);
|
|
4989
|
+
}
|
|
4990
|
+
}
|
|
4991
|
+
const totalCells = merged.length * numCols;
|
|
4992
|
+
const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
|
|
4993
|
+
if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
|
|
4994
|
+
return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
|
|
4995
|
+
}
|
|
4671
4996
|
const md = [];
|
|
4672
4997
|
md.push("| " + merged[0].join(" | ") + " |");
|
|
4673
4998
|
md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
|
|
@@ -4679,12 +5004,32 @@ function buildGridTable(lines, columns) {
|
|
|
4679
5004
|
function mergeLineSimple(items) {
|
|
4680
5005
|
if (items.length <= 1) return items[0]?.text || "";
|
|
4681
5006
|
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
5007
|
+
const isEvenSpaced = detectEvenSpacedItems(sorted);
|
|
4682
5008
|
let result = sorted[0].text;
|
|
4683
5009
|
for (let i = 1; i < sorted.length; i++) {
|
|
4684
5010
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
4685
5011
|
const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
|
|
4686
|
-
|
|
4687
|
-
|
|
5012
|
+
const tabThreshold = Math.max(avgFs * 2, 30);
|
|
5013
|
+
if (gap > tabThreshold) {
|
|
5014
|
+
result += " ";
|
|
5015
|
+
result += sorted[i].text;
|
|
5016
|
+
continue;
|
|
5017
|
+
}
|
|
5018
|
+
if (isEvenSpaced[i]) {
|
|
5019
|
+
result += sorted[i].text;
|
|
5020
|
+
continue;
|
|
5021
|
+
}
|
|
5022
|
+
if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
|
|
5023
|
+
result += " ";
|
|
5024
|
+
result += sorted[i].text;
|
|
5025
|
+
continue;
|
|
5026
|
+
}
|
|
5027
|
+
if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
|
|
5028
|
+
result += " ";
|
|
5029
|
+
result += sorted[i].text;
|
|
5030
|
+
continue;
|
|
5031
|
+
}
|
|
5032
|
+
if (gap < avgFs * 0.15) {
|
|
4688
5033
|
} else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
|
|
4689
5034
|
} else if (gap > 3) result += " ";
|
|
4690
5035
|
result += sorted[i].text;
|
|
@@ -4693,8 +5038,8 @@ function mergeLineSimple(items) {
|
|
|
4693
5038
|
}
|
|
4694
5039
|
function cleanPdfText(text) {
|
|
4695
5040
|
return mergeKoreanLines(
|
|
4696
|
-
text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
|
|
4697
|
-
).replace(/^(?!\|)
|
|
5041
|
+
text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
|
|
5042
|
+
).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
|
|
4698
5043
|
}
|
|
4699
5044
|
function startsWithMarker(line) {
|
|
4700
5045
|
const t = line.trimStart();
|
|
@@ -4886,7 +5231,7 @@ function mergeKoreanLines(text) {
|
|
|
4886
5231
|
result[result.length - 1] = prev + " " + currTrimmed;
|
|
4887
5232
|
continue;
|
|
4888
5233
|
}
|
|
4889
|
-
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
|
|
5234
|
+
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev) && !startsWithMarker(prev)) {
|
|
4890
5235
|
result[result.length - 1] = prev + " " + curr;
|
|
4891
5236
|
} else {
|
|
4892
5237
|
result.push(curr);
|
|
@@ -4934,7 +5279,7 @@ function getTextContent(el) {
|
|
|
4934
5279
|
return el.textContent?.trim() ?? "";
|
|
4935
5280
|
}
|
|
4936
5281
|
function parseXml(text) {
|
|
4937
|
-
return new DOMParser2().parseFromString(text, "text/xml");
|
|
5282
|
+
return new DOMParser2().parseFromString(stripDtd(text), "text/xml");
|
|
4938
5283
|
}
|
|
4939
5284
|
function parseSharedStrings(xml) {
|
|
4940
5285
|
const doc = parseXml(xml);
|
|
@@ -5221,7 +5566,7 @@ function getAttr(el, localName) {
|
|
|
5221
5566
|
return null;
|
|
5222
5567
|
}
|
|
5223
5568
|
function parseXml2(text) {
|
|
5224
|
-
return new DOMParser3().parseFromString(text, "text/xml");
|
|
5569
|
+
return new DOMParser3().parseFromString(stripDtd(text), "text/xml");
|
|
5225
5570
|
}
|
|
5226
5571
|
function parseStyles(xml) {
|
|
5227
5572
|
const doc = parseXml2(xml);
|
|
@@ -5621,7 +5966,13 @@ function normalize(s) {
|
|
|
5621
5966
|
}
|
|
5622
5967
|
var MAX_LEVENSHTEIN_LEN = 1e4;
|
|
5623
5968
|
function levenshtein(a, b) {
|
|
5624
|
-
if (a.length + b.length > MAX_LEVENSHTEIN_LEN)
|
|
5969
|
+
if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
|
|
5970
|
+
const sampleLen = Math.min(500, a.length, b.length);
|
|
5971
|
+
let diffs = 0;
|
|
5972
|
+
for (let i = 0; i < sampleLen; i++) if (a[i] !== b[i]) diffs++;
|
|
5973
|
+
const sampleRate = sampleLen > 0 ? diffs / sampleLen : 1;
|
|
5974
|
+
return Math.abs(a.length - b.length) + Math.round(Math.min(a.length, b.length) * sampleRate);
|
|
5975
|
+
}
|
|
5625
5976
|
if (a.length > b.length) [a, b] = [b, a];
|
|
5626
5977
|
const m = a.length;
|
|
5627
5978
|
const n = b.length;
|
|
@@ -5904,13 +6255,20 @@ function extractInlineFields(text) {
|
|
|
5904
6255
|
|
|
5905
6256
|
// src/hwpx/generator.ts
|
|
5906
6257
|
import JSZip5 from "jszip";
|
|
5907
|
-
var
|
|
6258
|
+
var NS_SECTION = "http://www.hancom.co.kr/hwpml/2011/section";
|
|
6259
|
+
var NS_PARA = "http://www.hancom.co.kr/hwpml/2011/paragraph";
|
|
6260
|
+
var NS_HEAD = "http://www.hancom.co.kr/hwpml/2011/head";
|
|
6261
|
+
var NS_OPF = "http://www.idpf.org/2007/opf/";
|
|
6262
|
+
var NS_HPF = "http://www.hancom.co.kr/schema/2011/hpf";
|
|
6263
|
+
var NS_OCF = "urn:oasis:names:tc:opendocument:xmlns:container";
|
|
5908
6264
|
async function markdownToHwpx(markdown) {
|
|
5909
6265
|
const blocks = parseMarkdownToBlocks(markdown);
|
|
5910
6266
|
const sectionXml = blocksToSectionXml(blocks);
|
|
5911
6267
|
const zip = new JSZip5();
|
|
5912
6268
|
zip.file("mimetype", "application/hwp+zip", { compression: "STORE" });
|
|
6269
|
+
zip.file("META-INF/container.xml", generateContainerXml());
|
|
5913
6270
|
zip.file("Contents/content.hpf", generateManifest());
|
|
6271
|
+
zip.file("Contents/header.xml", generateHeaderXml());
|
|
5914
6272
|
zip.file("Contents/section0.xml", sectionXml);
|
|
5915
6273
|
return await zip.generateAsync({ type: "arraybuffer" });
|
|
5916
6274
|
}
|
|
@@ -5955,8 +6313,111 @@ function parseMarkdownToBlocks(md) {
|
|
|
5955
6313
|
function escapeXml(text) {
|
|
5956
6314
|
return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;");
|
|
5957
6315
|
}
|
|
6316
|
+
function generateContainerXml() {
|
|
6317
|
+
return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
|
6318
|
+
<ocf:container xmlns:ocf="${NS_OCF}" xmlns:hpf="${NS_HPF}">
|
|
6319
|
+
<ocf:rootfiles>
|
|
6320
|
+
<ocf:rootfile full-path="Contents/content.hpf" media-type="application/hwpml-package+xml"/>
|
|
6321
|
+
</ocf:rootfiles>
|
|
6322
|
+
</ocf:container>`;
|
|
6323
|
+
}
|
|
6324
|
+
function generateManifest() {
|
|
6325
|
+
return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
|
6326
|
+
<opf:package xmlns:opf="${NS_OPF}" xmlns:hpf="${NS_HPF}" xmlns:hh="${NS_HEAD}">
|
|
6327
|
+
<opf:manifest>
|
|
6328
|
+
<opf:item id="header" href="Contents/header.xml" media-type="application/xml"/>
|
|
6329
|
+
<opf:item id="section0" href="Contents/section0.xml" media-type="application/xml"/>
|
|
6330
|
+
</opf:manifest>
|
|
6331
|
+
<opf:spine>
|
|
6332
|
+
<opf:itemref idref="header" linear="no"/>
|
|
6333
|
+
<opf:itemref idref="section0" linear="yes"/>
|
|
6334
|
+
</opf:spine>
|
|
6335
|
+
</opf:package>`;
|
|
6336
|
+
}
|
|
6337
|
+
/**
 * Build the XML written to `Contents/header.xml`.
 *
 * The header declares seven font faces (one per language group), one border
 * fill, one character property set, one paragraph property set and one
 * paragraph style, all with id 0.
 *
 * @returns {string} The header XML document.
 */
function generateHeaderXml() {
  // Font face names, kept as escapes exactly as in the original literal.
  const HAMCHOROM_BATANG = "\uD568\uCD08\uB86C\uBC14\uD0D5";
  const GULIM = "\uAD74\uB9BC";

  // The three distinct <hh:typeInfo> attribute sets used by the font faces.
  const TYPE_GOTHIC_P4 = `familyType="FCAT_GOTHIC" weight="6" proportion="4" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"`;
  const TYPE_GOTHIC_P0 = `familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"`;
  const TYPE_OLDSTYLE = `familyType="FCAT_OLDSTYLE" weight="5" proportion="4" contrast="2" strokeVariation="0" armStyle="0" letterform="0" midline="0" xHeight="4"`;

  // One <hh:fontface> element (five lines) for a language group.
  const fontFace = (lang, face, typeInfo) => [
    `<hh:fontface lang="${lang}" fontCnt="1">`,
    `<hh:font id="0" face="${face}" type="TTF" isEmbedded="0">`,
    `<hh:typeInfo ${typeInfo}/>`,
    `</hh:font>`,
    `</hh:fontface>`
  ];

  // The same attribute value repeated for each of the seven language groups.
  const languageKeys = ["hangul", "latin", "hanja", "japanese", "other", "symbol", "user"];
  const perLanguage = (value) => languageKeys.map((key) => `${key}="${value}"`).join(" ");

  // Border elements that share identical attributes.
  const plainBorders = ["leftBorder", "rightBorder", "topBorder", "bottomBorder", "diagonal"].map(
    (tag) => `<hh:${tag} type="NONE" width="0.1mm" color="0"/>`
  );

  const lines = [
    `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>`,
    `<hh:head xmlns:hh="${NS_HEAD}" xmlns:hp="${NS_PARA}" version="1.4" secCnt="1">`,
    `<hh:beginNum page="1" footnote="1" endnote="1" pic="1" tbl="1" equation="1"/>`,
    `<hh:refList>`,
    `<hh:fontfaces itemCnt="7">`,
    ...fontFace("HANGUL", HAMCHOROM_BATANG, TYPE_GOTHIC_P4),
    ...fontFace("LATIN", "Times New Roman", TYPE_OLDSTYLE),
    ...fontFace("HANJA", HAMCHOROM_BATANG, TYPE_GOTHIC_P4),
    ...fontFace("JAPANESE", GULIM, TYPE_GOTHIC_P0),
    ...fontFace("OTHER", GULIM, TYPE_GOTHIC_P0),
    ...fontFace("SYMBOL", "Symbol", TYPE_GOTHIC_P0),
    ...fontFace("USER", GULIM, TYPE_GOTHIC_P0),
    `</hh:fontfaces>`,
    `<hh:borderFills itemCnt="1">`,
    `<hh:borderFill id="0" threeD="0" shadow="0" centerLine="0" breakCellSeparateLine="0">`,
    `<hh:slash type="NONE" Crooked="0" isCounter="0"/>`,
    `<hh:backSlash type="NONE" Crooked="0" isCounter="0"/>`,
    ...plainBorders,
    `<hh:fillInfo/>`,
    `</hh:borderFill>`,
    `</hh:borderFills>`,
    `<hh:charProperties itemCnt="1">`,
    `<hh:charPr id="0" height="1000" textColor="0" shadeColor="-1" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="0">`,
    `<hh:fontRef ${perLanguage("0")}/>`,
    `<hh:ratio ${perLanguage("100")}/>`,
    `<hh:spacing ${perLanguage("0")}/>`,
    `<hh:relSz ${perLanguage("100")}/>`,
    `<hh:offset ${perLanguage("0")}/>`,
    `</hh:charPr>`,
    `</hh:charProperties>`,
    `<hh:tabProperties itemCnt="0"/>`,
    `<hh:numberings itemCnt="0"/>`,
    `<hh:bullets itemCnt="0"/>`,
    `<hh:paraProperties itemCnt="1">`,
    `<hh:paraPr id="0" tabIDRef="0" condense="0" fontLineHeight="0" snapToGrid="0" suppressOverlap="0" checked="0">`,
    `<hh:parLineBreak lineBreak="BREAK_LINE" wordBreak="BREAK_WORD" breakLatinWord="BREAK_WORD" breakNonLatinWord="BREAK_WORD"/>`,
    `<hh:parMargin left="0" right="0" prev="0" next="0" indent="0"/>`,
    `<hh:parBorder borderFillIDRef="0" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>`,
    `<hh:parShade borderFillIDRef="0"/>`,
    `<hh:parTabList/>`,
    `</hh:paraPr>`,
    `</hh:paraProperties>`,
    `<hh:styles itemCnt="1">`,
    `<hh:style id="0" type="PARA" name="\uBC14\uD0D5\uAE00" engName="Normal" paraPrIDRef="0" charPrIDRef="0" nextStyleIDRef="0" langIDRef="1042" lockForm="0"/>`,
    `</hh:styles>`,
    `</hh:refList>`,
    `<hh:compatibleDocument targetProgram="HWP2018"/>`,
    `</hh:head>`
  ];
  return lines.join("\n");
}
|
|
5958
6419
|
/**
 * Render one paragraph of plain text as an `<hp:p>` element.
 *
 * The paragraph, its style and its single run all reference id 0. The text
 * is passed through `escapeXml` before being embedded.
 *
 * @param {string} text - Paragraph text.
 * @returns {string} The `<hp:p>` element markup.
 */
function generateParagraph(text) {
  const content = escapeXml(text);
  const run = `<hp:run charPrIDRef="0"><hp:t>${content}</hp:t></hp:run>`;
  return `<hp:p paraPrIDRef="0" styleIDRef="0">${run}</hp:p>`;
}
|
|
5961
6422
|
function generateTable(rows) {
|
|
5962
6423
|
const trElements = rows.map((row) => {
|
|
@@ -5980,22 +6441,11 @@ function blocksToSectionXml(blocks) {
|
|
|
5980
6441
|
return "";
|
|
5981
6442
|
}
|
|
5982
6443
|
}).join("\n ");
|
|
5983
|
-
return `<?xml version="1.0" encoding="UTF-8"?>
|
|
5984
|
-
<hs:sec xmlns:hs="${
|
|
6444
|
+
return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
|
6445
|
+
<hs:sec xmlns:hs="${NS_SECTION}" xmlns:hp="${NS_PARA}">
|
|
5985
6446
|
${body}
|
|
5986
6447
|
</hs:sec>`;
|
|
5987
6448
|
}
|
|
5988
|
-
function generateManifest() {
|
|
5989
|
-
return `<?xml version="1.0" encoding="UTF-8"?>
|
|
5990
|
-
<opf:package xmlns:opf="http://www.idpf.org/2007/opf">
|
|
5991
|
-
<opf:manifest>
|
|
5992
|
-
<opf:item id="s0" href="section0.xml" media-type="application/xml"/>
|
|
5993
|
-
</opf:manifest>
|
|
5994
|
-
<opf:spine>
|
|
5995
|
-
<opf:itemref idref="s0"/>
|
|
5996
|
-
</opf:spine>
|
|
5997
|
-
</opf:package>`;
|
|
5998
|
-
}
|
|
5999
6449
|
|
|
6000
6450
|
// src/index.ts
|
|
6001
6451
|
async function parse(input, options) {
|