kordoc 2.2.3 → 2.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +42 -3
- package/dist/chunk-JH5XLWJQ.js +457 -0
- package/dist/chunk-JH5XLWJQ.js.map +1 -0
- package/dist/chunk-MUOQXDZ4.cjs +33 -0
- package/dist/chunk-MUOQXDZ4.cjs.map +1 -0
- package/dist/chunk-OJ4QR33V.cjs +450 -0
- package/dist/chunk-OJ4QR33V.cjs.map +1 -0
- package/dist/{chunk-AIG7SDWU.js → chunk-RQWICKON.js} +964 -2732
- package/dist/chunk-RQWICKON.js.map +1 -0
- package/dist/chunk-SBVRCJFH.js +33 -0
- package/dist/chunk-SBVRCJFH.js.map +1 -0
- package/dist/chunk-UU2O6D3R.js +450 -0
- package/dist/chunk-UU2O6D3R.js.map +1 -0
- package/dist/cli.js +154 -7
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1095 -3324
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +98 -8
- package/dist/index.d.ts +98 -8
- package/dist/index.js +917 -3100
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +140 -14
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-3C7UGGEK.cjs +7 -0
- package/dist/page-range-3C7UGGEK.cjs.map +1 -0
- package/dist/page-range-H35FN3OQ.js +7 -0
- package/dist/page-range-H35FN3OQ.js.map +1 -0
- package/dist/parser-CYBX5MP4.cjs +2278 -0
- package/dist/parser-CYBX5MP4.cjs.map +1 -0
- package/dist/parser-OIRWPKIQ.js +2278 -0
- package/dist/parser-OIRWPKIQ.js.map +1 -0
- package/dist/parser-PXD73E4H.js +2279 -0
- package/dist/parser-PXD73E4H.js.map +1 -0
- package/dist/provider-WPIYEALY.js +37 -0
- package/dist/provider-WPIYEALY.js.map +1 -0
- package/dist/provider-YN2SSK4X.cjs +37 -0
- package/dist/provider-YN2SSK4X.cjs.map +1 -0
- package/dist/{watch-H672QAW2.js → watch-NSBABJ4A.js} +6 -4
- package/dist/{watch-H672QAW2.js.map → watch-NSBABJ4A.js.map} +1 -1
- package/package.json +1 -1
- package/dist/chunk-AIG7SDWU.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,93 +1,25 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
}
|
|
24
|
-
return result;
|
|
25
|
-
}
|
|
26
|
-
if (typeof spec !== "string" || spec.trim() === "") return result;
|
|
27
|
-
const parts = spec.split(",");
|
|
28
|
-
for (const part of parts) {
|
|
29
|
-
const trimmed = part.trim();
|
|
30
|
-
if (!trimmed) continue;
|
|
31
|
-
const rangeMatch = trimmed.match(/^(\d+)\s*-\s*(\d+)$/);
|
|
32
|
-
if (rangeMatch) {
|
|
33
|
-
const start = Math.max(1, parseInt(rangeMatch[1], 10));
|
|
34
|
-
const end = Math.min(maxPages, parseInt(rangeMatch[2], 10));
|
|
35
|
-
for (let i = start; i <= end; i++) result.add(i);
|
|
36
|
-
} else {
|
|
37
|
-
const page = parseInt(trimmed, 10);
|
|
38
|
-
if (!isNaN(page) && page >= 1 && page <= maxPages) result.add(page);
|
|
39
|
-
}
|
|
40
|
-
}
|
|
41
|
-
return result;
|
|
42
|
-
}
|
|
43
|
-
var init_page_range = __esm({
|
|
44
|
-
"src/page-range.ts"() {
|
|
45
|
-
"use strict";
|
|
46
|
-
}
|
|
47
|
-
});
|
|
48
|
-
|
|
49
|
-
// src/ocr/provider.ts
|
|
50
|
-
var provider_exports = {};
|
|
51
|
-
__export(provider_exports, {
|
|
52
|
-
ocrPages: () => ocrPages
|
|
53
|
-
});
|
|
54
|
-
async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
|
|
55
|
-
const blocks = [];
|
|
56
|
-
for (let i = 1; i <= effectivePageCount; i++) {
|
|
57
|
-
if (pageFilter && !pageFilter.has(i)) continue;
|
|
58
|
-
const page = await doc.getPage(i);
|
|
59
|
-
try {
|
|
60
|
-
const imageData = await renderPageToPng(page);
|
|
61
|
-
const text = await provider(imageData, i, "image/png");
|
|
62
|
-
if (text.trim()) {
|
|
63
|
-
blocks.push({ type: "paragraph", text: text.trim(), pageNumber: i });
|
|
64
|
-
}
|
|
65
|
-
} catch {
|
|
66
|
-
blocks.push({ type: "paragraph", text: `[OCR \uC2E4\uD328: \uD398\uC774\uC9C0 ${i}]` });
|
|
67
|
-
}
|
|
68
|
-
}
|
|
69
|
-
return blocks;
|
|
70
|
-
}
|
|
71
|
-
async function renderPageToPng(page) {
|
|
72
|
-
let createCanvas;
|
|
73
|
-
try {
|
|
74
|
-
const canvasModule = await import("canvas");
|
|
75
|
-
createCanvas = canvasModule.createCanvas;
|
|
76
|
-
} catch {
|
|
77
|
-
throw new Error("OCR\uC744 \uC0AC\uC6A9\uD558\uB824\uBA74 'canvas' \uD328\uD0A4\uC9C0\uB97C \uC124\uCE58\uD558\uC138\uC694: npm install canvas");
|
|
78
|
-
}
|
|
79
|
-
const scale = 2;
|
|
80
|
-
const viewport = page.getViewport({ scale });
|
|
81
|
-
const canvas = createCanvas(Math.floor(viewport.width), Math.floor(viewport.height));
|
|
82
|
-
const ctx = canvas.getContext("2d");
|
|
83
|
-
await page.render({ canvasContext: ctx, viewport }).promise;
|
|
84
|
-
return new Uint8Array(canvas.toBuffer("image/png"));
|
|
85
|
-
}
|
|
86
|
-
var init_provider = __esm({
|
|
87
|
-
"src/ocr/provider.ts"() {
|
|
88
|
-
"use strict";
|
|
89
|
-
}
|
|
90
|
-
});
|
|
1
|
+
import {
|
|
2
|
+
HEADING_RATIO_H1,
|
|
3
|
+
HEADING_RATIO_H2,
|
|
4
|
+
HEADING_RATIO_H3,
|
|
5
|
+
KordocError,
|
|
6
|
+
MAX_COLS,
|
|
7
|
+
MAX_ROWS,
|
|
8
|
+
VERSION,
|
|
9
|
+
blocksToMarkdown,
|
|
10
|
+
buildTable,
|
|
11
|
+
classifyError,
|
|
12
|
+
convertTableToText,
|
|
13
|
+
flattenLayoutTables,
|
|
14
|
+
isPathTraversal,
|
|
15
|
+
precheckZipSize,
|
|
16
|
+
sanitizeHref,
|
|
17
|
+
stripDtd,
|
|
18
|
+
toArrayBuffer
|
|
19
|
+
} from "./chunk-UU2O6D3R.js";
|
|
20
|
+
import {
|
|
21
|
+
parsePageRange
|
|
22
|
+
} from "./chunk-SBVRCJFH.js";
|
|
91
23
|
|
|
92
24
|
// src/index.ts
|
|
93
25
|
import { readFile } from "fs/promises";
|
|
@@ -137,437 +69,6 @@ async function detectZipFormat(buffer) {
|
|
|
137
69
|
import JSZip2 from "jszip";
|
|
138
70
|
import { inflateRawSync } from "zlib";
|
|
139
71
|
import { DOMParser } from "@xmldom/xmldom";
|
|
140
|
-
|
|
141
|
-
// src/utils.ts
|
|
142
|
-
var VERSION = true ? "2.2.3" : "0.0.0-dev";
|
|
143
|
-
function toArrayBuffer(buf) {
|
|
144
|
-
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
145
|
-
return buf.buffer;
|
|
146
|
-
}
|
|
147
|
-
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
148
|
-
}
|
|
149
|
-
var KordocError = class extends Error {
|
|
150
|
-
constructor(message) {
|
|
151
|
-
super(message);
|
|
152
|
-
this.name = "KordocError";
|
|
153
|
-
}
|
|
154
|
-
};
|
|
155
|
-
function isPathTraversal(name) {
|
|
156
|
-
if (name.includes("\0")) return true;
|
|
157
|
-
const normalized = name.replace(/\\/g, "/");
|
|
158
|
-
const segments = normalized.split("/");
|
|
159
|
-
return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
160
|
-
}
|
|
161
|
-
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
162
|
-
try {
|
|
163
|
-
const data = new DataView(buffer);
|
|
164
|
-
const len = buffer.byteLength;
|
|
165
|
-
let eocdOffset = -1;
|
|
166
|
-
for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
|
|
167
|
-
if (data.getUint32(i, true) === 101010256) {
|
|
168
|
-
eocdOffset = i;
|
|
169
|
-
break;
|
|
170
|
-
}
|
|
171
|
-
}
|
|
172
|
-
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
173
|
-
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
174
|
-
if (entryCount > maxEntries) {
|
|
175
|
-
throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
|
|
176
|
-
}
|
|
177
|
-
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
178
|
-
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
179
|
-
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
180
|
-
let totalUncompressed = 0;
|
|
181
|
-
let pos = cdOffset;
|
|
182
|
-
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
183
|
-
if (data.getUint32(pos, true) !== 33639248) break;
|
|
184
|
-
totalUncompressed += data.getUint32(pos + 24, true);
|
|
185
|
-
const nameLen = data.getUint16(pos + 28, true);
|
|
186
|
-
const extraLen = data.getUint16(pos + 30, true);
|
|
187
|
-
const commentLen = data.getUint16(pos + 32, true);
|
|
188
|
-
pos += 46 + nameLen + extraLen + commentLen;
|
|
189
|
-
}
|
|
190
|
-
if (totalUncompressed > maxUncompressedSize) {
|
|
191
|
-
throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
|
|
192
|
-
}
|
|
193
|
-
return { totalUncompressed, entryCount };
|
|
194
|
-
} catch (err) {
|
|
195
|
-
if (err instanceof KordocError) throw err;
|
|
196
|
-
return { totalUncompressed: 0, entryCount: 0 };
|
|
197
|
-
}
|
|
198
|
-
}
|
|
199
|
-
function stripDtd(xml) {
|
|
200
|
-
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
201
|
-
}
|
|
202
|
-
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
203
|
-
function sanitizeHref(href) {
|
|
204
|
-
const trimmed = href.trim();
|
|
205
|
-
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
206
|
-
return trimmed;
|
|
207
|
-
}
|
|
208
|
-
function safeMin(arr) {
|
|
209
|
-
let min = Infinity;
|
|
210
|
-
for (let i = 0; i < arr.length; i++) if (arr[i] < min) min = arr[i];
|
|
211
|
-
return min;
|
|
212
|
-
}
|
|
213
|
-
function safeMax(arr) {
|
|
214
|
-
let max = -Infinity;
|
|
215
|
-
for (let i = 0; i < arr.length; i++) if (arr[i] > max) max = arr[i];
|
|
216
|
-
return max;
|
|
217
|
-
}
|
|
218
|
-
function classifyError(err) {
|
|
219
|
-
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
220
|
-
const msg = err.message;
|
|
221
|
-
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
222
|
-
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
223
|
-
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
224
|
-
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
225
|
-
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
226
|
-
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
227
|
-
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
228
|
-
return "PARSE_ERROR";
|
|
229
|
-
}
|
|
230
|
-
|
|
231
|
-
// src/table/builder.ts
|
|
232
|
-
var MAX_COLS = 200;
|
|
233
|
-
var MAX_ROWS = 1e4;
|
|
234
|
-
function buildTable(rows) {
|
|
235
|
-
if (rows.length > MAX_ROWS) rows = rows.slice(0, MAX_ROWS);
|
|
236
|
-
const numRows = rows.length;
|
|
237
|
-
const hasAddr = rows.some((row) => row.some((c) => c.colAddr !== void 0 && c.rowAddr !== void 0));
|
|
238
|
-
if (hasAddr) return buildTableDirect(rows, numRows);
|
|
239
|
-
let maxCols = 0;
|
|
240
|
-
const tempOccupied = Array.from({ length: numRows }, () => []);
|
|
241
|
-
for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
|
|
242
|
-
let colIdx = 0;
|
|
243
|
-
for (const cell of rows[rowIdx]) {
|
|
244
|
-
while (colIdx < MAX_COLS && tempOccupied[rowIdx][colIdx]) colIdx++;
|
|
245
|
-
if (colIdx >= MAX_COLS) break;
|
|
246
|
-
for (let r = rowIdx; r < Math.min(rowIdx + cell.rowSpan, numRows); r++) {
|
|
247
|
-
for (let c = colIdx; c < Math.min(colIdx + cell.colSpan, MAX_COLS); c++) {
|
|
248
|
-
tempOccupied[r][c] = true;
|
|
249
|
-
}
|
|
250
|
-
}
|
|
251
|
-
colIdx += cell.colSpan;
|
|
252
|
-
if (colIdx > maxCols) maxCols = colIdx;
|
|
253
|
-
}
|
|
254
|
-
}
|
|
255
|
-
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
256
|
-
const grid = Array.from(
|
|
257
|
-
{ length: numRows },
|
|
258
|
-
() => Array.from({ length: maxCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
|
|
259
|
-
);
|
|
260
|
-
const occupied = Array.from({ length: numRows }, () => Array(maxCols).fill(false));
|
|
261
|
-
for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
|
|
262
|
-
let colIdx = 0;
|
|
263
|
-
let cellIdx = 0;
|
|
264
|
-
while (colIdx < maxCols && cellIdx < rows[rowIdx].length) {
|
|
265
|
-
while (colIdx < maxCols && occupied[rowIdx][colIdx]) colIdx++;
|
|
266
|
-
if (colIdx >= maxCols) break;
|
|
267
|
-
const cell = rows[rowIdx][cellIdx];
|
|
268
|
-
grid[rowIdx][colIdx] = {
|
|
269
|
-
text: cell.text.trim(),
|
|
270
|
-
colSpan: cell.colSpan,
|
|
271
|
-
rowSpan: cell.rowSpan
|
|
272
|
-
};
|
|
273
|
-
for (let r = rowIdx; r < Math.min(rowIdx + cell.rowSpan, numRows); r++) {
|
|
274
|
-
for (let c = colIdx; c < Math.min(colIdx + cell.colSpan, maxCols); c++) {
|
|
275
|
-
occupied[r][c] = true;
|
|
276
|
-
}
|
|
277
|
-
}
|
|
278
|
-
colIdx += cell.colSpan;
|
|
279
|
-
cellIdx++;
|
|
280
|
-
}
|
|
281
|
-
}
|
|
282
|
-
return trimAndReturn(grid, numRows, maxCols);
|
|
283
|
-
}
|
|
284
|
-
function buildTableDirect(rows, numRows) {
|
|
285
|
-
let maxCols = 0;
|
|
286
|
-
for (const row of rows) {
|
|
287
|
-
for (const cell of row) {
|
|
288
|
-
const end = (cell.colAddr ?? 0) + cell.colSpan;
|
|
289
|
-
if (end > maxCols) maxCols = end;
|
|
290
|
-
}
|
|
291
|
-
}
|
|
292
|
-
if (maxCols > MAX_COLS) maxCols = MAX_COLS;
|
|
293
|
-
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
294
|
-
const grid = Array.from(
|
|
295
|
-
{ length: numRows },
|
|
296
|
-
() => Array.from({ length: maxCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
|
|
297
|
-
);
|
|
298
|
-
for (const row of rows) {
|
|
299
|
-
for (const cell of row) {
|
|
300
|
-
const r = cell.rowAddr ?? 0;
|
|
301
|
-
const c = cell.colAddr ?? 0;
|
|
302
|
-
if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
|
|
303
|
-
grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
|
|
304
|
-
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
305
|
-
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
306
|
-
if (dr === 0 && dc === 0) continue;
|
|
307
|
-
if (r + dr < numRows && c + dc < maxCols) {
|
|
308
|
-
grid[r + dr][c + dc] = { text: "", colSpan: 1, rowSpan: 1 };
|
|
309
|
-
}
|
|
310
|
-
}
|
|
311
|
-
}
|
|
312
|
-
}
|
|
313
|
-
}
|
|
314
|
-
return trimAndReturn(grid, numRows, maxCols);
|
|
315
|
-
}
|
|
316
|
-
function trimAndReturn(grid, numRows, maxCols) {
|
|
317
|
-
let effectiveCols = maxCols;
|
|
318
|
-
while (effectiveCols > 0) {
|
|
319
|
-
const colEmpty = grid.every((row) => !row[effectiveCols - 1]?.text?.trim());
|
|
320
|
-
if (!colEmpty) break;
|
|
321
|
-
effectiveCols--;
|
|
322
|
-
}
|
|
323
|
-
if (effectiveCols < maxCols && effectiveCols > 0) {
|
|
324
|
-
const trimmed = grid.map((row) => row.slice(0, effectiveCols));
|
|
325
|
-
return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
|
|
326
|
-
}
|
|
327
|
-
return { rows: numRows, cols: maxCols, cells: grid, hasHeader: numRows > 1 };
|
|
328
|
-
}
|
|
329
|
-
function convertTableToText(rows) {
|
|
330
|
-
return rows.map(
|
|
331
|
-
(row) => row.map((c) => c.text.trim().replace(/\n/g, " ").replace(/\|/g, "\\|")).filter(Boolean).join(" / ")
|
|
332
|
-
).filter(Boolean).join("\n");
|
|
333
|
-
}
|
|
334
|
-
function escapeGfm(text) {
|
|
335
|
-
return text.replace(/~/g, "\\~");
|
|
336
|
-
}
|
|
337
|
-
var HWP_SHAPE_ALT_TEXT_RE = /(?:모서리가 둥근 |둥근 )?(?:사각형|직사각형|정사각형|원|타원|삼각형|이등변 삼각형|직각 삼각형|선|직선|곡선|화살표|굵은 화살표|이중 화살표|오각형|육각형|팔각형|별|[4-8]점별|십자|십자형|구름|구름형|마름모|도넛|평행사변형|사다리꼴|부채꼴|호|반원|물결|번개|하트|빗금|블록 화살표|수식|표|그림|개체|그리기\s?개체|묶음\s?개체|글상자|수식\s?개체|OLE\s?개체)\s?입니다\.?/g;
|
|
338
|
-
function sanitizeText(text) {
|
|
339
|
-
let result = text.replace(/[\u{F0000}-\u{FFFFD}]/gu, "").replace(HWP_SHAPE_ALT_TEXT_RE, "").replace(/ +/g, " ").trim();
|
|
340
|
-
if (result.length <= 30 && result.includes(" ")) {
|
|
341
|
-
const tokens = result.split(" ");
|
|
342
|
-
const koreanSingleCharCount = tokens.filter((t) => t.length === 1 && /[\uAC00-\uD7AF\u3131-\u318E]/.test(t)).length;
|
|
343
|
-
if (tokens.length >= 3 && koreanSingleCharCount / tokens.length >= 0.7) {
|
|
344
|
-
result = tokens.join("");
|
|
345
|
-
}
|
|
346
|
-
}
|
|
347
|
-
return result;
|
|
348
|
-
}
|
|
349
|
-
function flattenLayoutTables(blocks) {
|
|
350
|
-
const result = [];
|
|
351
|
-
for (const block of blocks) {
|
|
352
|
-
if (block.type !== "table" || !block.table) {
|
|
353
|
-
result.push(block);
|
|
354
|
-
continue;
|
|
355
|
-
}
|
|
356
|
-
const { rows: numRows, cols: numCols, cells } = block.table;
|
|
357
|
-
if (numRows === 1 && numCols === 1) {
|
|
358
|
-
result.push(block);
|
|
359
|
-
continue;
|
|
360
|
-
}
|
|
361
|
-
if (numRows <= 3) {
|
|
362
|
-
let totalNewlines = 0;
|
|
363
|
-
let totalTextLen = 0;
|
|
364
|
-
for (let r = 0; r < numRows; r++) {
|
|
365
|
-
for (let c = 0; c < numCols; c++) {
|
|
366
|
-
const t = cells[r]?.[c]?.text || "";
|
|
367
|
-
totalNewlines += (t.match(/\n/g) || []).length;
|
|
368
|
-
totalTextLen += t.length;
|
|
369
|
-
}
|
|
370
|
-
}
|
|
371
|
-
if (totalNewlines > 5 || numRows <= 2 && totalTextLen > 300) {
|
|
372
|
-
for (let r = 0; r < numRows; r++) {
|
|
373
|
-
for (let c = 0; c < numCols; c++) {
|
|
374
|
-
const cellText = cells[r]?.[c]?.text?.trim();
|
|
375
|
-
if (!cellText) continue;
|
|
376
|
-
for (const line of cellText.split("\n")) {
|
|
377
|
-
const trimmed = line.trim();
|
|
378
|
-
if (!trimmed) continue;
|
|
379
|
-
result.push({ type: "paragraph", text: trimmed, pageNumber: block.pageNumber });
|
|
380
|
-
}
|
|
381
|
-
}
|
|
382
|
-
}
|
|
383
|
-
continue;
|
|
384
|
-
}
|
|
385
|
-
}
|
|
386
|
-
result.push(block);
|
|
387
|
-
}
|
|
388
|
-
return result;
|
|
389
|
-
}
|
|
390
|
-
function blocksToMarkdown(blocks) {
|
|
391
|
-
const lines = [];
|
|
392
|
-
for (let i = 0; i < blocks.length; i++) {
|
|
393
|
-
const block = blocks[i];
|
|
394
|
-
if (block.type === "heading" && block.text) {
|
|
395
|
-
const prefix = "#".repeat(Math.min(block.level || 2, 6));
|
|
396
|
-
const headingText = sanitizeText(block.text);
|
|
397
|
-
if (headingText) lines.push("", `${prefix} ${headingText}`, "");
|
|
398
|
-
continue;
|
|
399
|
-
}
|
|
400
|
-
if (block.type === "image" && block.text) {
|
|
401
|
-
lines.push("", ``, "");
|
|
402
|
-
continue;
|
|
403
|
-
}
|
|
404
|
-
if (block.type === "separator") {
|
|
405
|
-
lines.push("", "---", "");
|
|
406
|
-
continue;
|
|
407
|
-
}
|
|
408
|
-
if (block.type === "list" && block.text) {
|
|
409
|
-
const listText = sanitizeText(block.text);
|
|
410
|
-
if (!listText) continue;
|
|
411
|
-
const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(listText);
|
|
412
|
-
const prefix = alreadyNumbered ? "" : block.listType === "ordered" ? "1. " : "- ";
|
|
413
|
-
lines.push(`${prefix}${listText}`);
|
|
414
|
-
if (block.children) {
|
|
415
|
-
for (const child of block.children) {
|
|
416
|
-
const childPrefix = child.listType === "ordered" ? "1." : "-";
|
|
417
|
-
lines.push(` ${childPrefix} ${child.text || ""}`);
|
|
418
|
-
}
|
|
419
|
-
}
|
|
420
|
-
continue;
|
|
421
|
-
}
|
|
422
|
-
if (block.type === "paragraph" && block.text) {
|
|
423
|
-
let text = sanitizeText(block.text);
|
|
424
|
-
if (!text) continue;
|
|
425
|
-
if (/^\[별표\s*\d+/.test(text)) {
|
|
426
|
-
const nextBlock = blocks[i + 1];
|
|
427
|
-
if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
|
|
428
|
-
lines.push("", `## ${text} ${nextBlock.text}`, "");
|
|
429
|
-
i++;
|
|
430
|
-
} else {
|
|
431
|
-
lines.push("", `## ${text}`, "");
|
|
432
|
-
}
|
|
433
|
-
continue;
|
|
434
|
-
}
|
|
435
|
-
if (/^\([^)]*조[^)]*관련\)$/.test(text)) {
|
|
436
|
-
lines.push(`*${text}*`, "");
|
|
437
|
-
continue;
|
|
438
|
-
}
|
|
439
|
-
if (block.href) {
|
|
440
|
-
const href = sanitizeHref(block.href);
|
|
441
|
-
if (href) text = `[${text}](${href})`;
|
|
442
|
-
}
|
|
443
|
-
if (block.footnoteText) {
|
|
444
|
-
text += ` (\uC8FC: ${block.footnoteText})`;
|
|
445
|
-
}
|
|
446
|
-
lines.push(escapeGfm(text), "");
|
|
447
|
-
} else if (block.type === "table" && block.table) {
|
|
448
|
-
if (lines.length > 0 && lines[lines.length - 1] !== "") {
|
|
449
|
-
lines.push("");
|
|
450
|
-
}
|
|
451
|
-
const tableMd = tableToMarkdown(block.table);
|
|
452
|
-
if (tableMd) {
|
|
453
|
-
lines.push(tableMd);
|
|
454
|
-
lines.push("");
|
|
455
|
-
}
|
|
456
|
-
}
|
|
457
|
-
}
|
|
458
|
-
return lines.join("\n").trim();
|
|
459
|
-
}
|
|
460
|
-
function hasMergedCells(table) {
|
|
461
|
-
for (const row of table.cells) {
|
|
462
|
-
for (const cell of row) {
|
|
463
|
-
if (cell.colSpan > 1 || cell.rowSpan > 1) return true;
|
|
464
|
-
}
|
|
465
|
-
}
|
|
466
|
-
return false;
|
|
467
|
-
}
|
|
468
|
-
function tableToHtml(table) {
|
|
469
|
-
const { cells, rows: numRows, cols: numCols } = table;
|
|
470
|
-
const skip = /* @__PURE__ */ new Set();
|
|
471
|
-
const lines = ["<table>"];
|
|
472
|
-
for (let r = 0; r < numRows; r++) {
|
|
473
|
-
const tag = r === 0 ? "th" : "td";
|
|
474
|
-
const rowHtml = [];
|
|
475
|
-
for (let c = 0; c < numCols; c++) {
|
|
476
|
-
if (skip.has(`${r},${c}`)) continue;
|
|
477
|
-
const cell = cells[r]?.[c];
|
|
478
|
-
if (!cell) continue;
|
|
479
|
-
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
480
|
-
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
481
|
-
if (dr === 0 && dc === 0) continue;
|
|
482
|
-
if (r + dr < numRows && c + dc < numCols) skip.add(`${r + dr},${c + dc}`);
|
|
483
|
-
}
|
|
484
|
-
}
|
|
485
|
-
const text = sanitizeText(cell.text).replace(/\n/g, "<br>");
|
|
486
|
-
const attrs = [];
|
|
487
|
-
if (cell.colSpan > 1) attrs.push(`colspan="${cell.colSpan}"`);
|
|
488
|
-
if (cell.rowSpan > 1) attrs.push(`rowspan="${cell.rowSpan}"`);
|
|
489
|
-
const attrStr = attrs.length ? " " + attrs.join(" ") : "";
|
|
490
|
-
rowHtml.push(`<${tag}${attrStr}>${text}</${tag}>`);
|
|
491
|
-
}
|
|
492
|
-
if (rowHtml.length) lines.push(`<tr>${rowHtml.join("")}</tr>`);
|
|
493
|
-
}
|
|
494
|
-
lines.push("</table>");
|
|
495
|
-
return lines.join("\n");
|
|
496
|
-
}
|
|
497
|
-
function tableToMarkdown(table) {
|
|
498
|
-
if (table.rows === 0 || table.cols === 0) return "";
|
|
499
|
-
const { cells, rows: numRows, cols: numCols } = table;
|
|
500
|
-
if (hasMergedCells(table)) return tableToHtml(table);
|
|
501
|
-
if (numRows === 1 && numCols === 1) {
|
|
502
|
-
const content = sanitizeText(cells[0][0].text);
|
|
503
|
-
if (!content) return "";
|
|
504
|
-
return content.split(/\n/).map((line) => {
|
|
505
|
-
const trimmed = line.trim();
|
|
506
|
-
if (!trimmed) return "";
|
|
507
|
-
if (/^\d+\.\s/.test(trimmed)) return `**${escapeGfm(trimmed)}**`;
|
|
508
|
-
if (/^[가-힣]\.\s/.test(trimmed)) return ` ${escapeGfm(trimmed)}`;
|
|
509
|
-
return escapeGfm(trimmed);
|
|
510
|
-
}).filter(Boolean).join("\n");
|
|
511
|
-
}
|
|
512
|
-
if (numCols === 1 && numRows >= 2) {
|
|
513
|
-
return cells.map((row) => escapeGfm(sanitizeText(row[0].text)).replace(/\n/g, " ")).filter(Boolean).join("\n");
|
|
514
|
-
}
|
|
515
|
-
const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
|
|
516
|
-
const skip = /* @__PURE__ */ new Set();
|
|
517
|
-
for (let r = 0; r < numRows; r++) {
|
|
518
|
-
for (let c = 0; c < numCols; c++) {
|
|
519
|
-
if (skip.has(`${r},${c}`)) continue;
|
|
520
|
-
const cell = cells[r]?.[c];
|
|
521
|
-
if (!cell) continue;
|
|
522
|
-
display[r][c] = escapeGfm(sanitizeText(cell.text)).replace(/\|/g, "\\|").replace(/\n/g, "<br>");
|
|
523
|
-
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
524
|
-
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
525
|
-
if (dr === 0 && dc === 0) continue;
|
|
526
|
-
if (r + dr < numRows && c + dc < numCols) {
|
|
527
|
-
skip.add(`${r + dr},${c + dc}`);
|
|
528
|
-
}
|
|
529
|
-
}
|
|
530
|
-
}
|
|
531
|
-
c += cell.colSpan - 1;
|
|
532
|
-
}
|
|
533
|
-
}
|
|
534
|
-
const uniqueRows = [];
|
|
535
|
-
let pendingFirstCol = "";
|
|
536
|
-
for (let r = 0; r < display.length; r++) {
|
|
537
|
-
const row = display[r];
|
|
538
|
-
const isEmptyPlaceholder = row.every((cell) => cell === "");
|
|
539
|
-
if (isEmptyPlaceholder) continue;
|
|
540
|
-
const nonEmptyCols = row.filter((cell) => cell !== "");
|
|
541
|
-
const hasSkipInRow = row.some((_, c) => skip.has(`${r},${c}`));
|
|
542
|
-
if (!hasSkipInRow && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
|
|
543
|
-
pendingFirstCol = row[0];
|
|
544
|
-
continue;
|
|
545
|
-
}
|
|
546
|
-
if (pendingFirstCol && row[0] === "") {
|
|
547
|
-
row[0] = pendingFirstCol;
|
|
548
|
-
pendingFirstCol = "";
|
|
549
|
-
} else {
|
|
550
|
-
pendingFirstCol = "";
|
|
551
|
-
}
|
|
552
|
-
uniqueRows.push(row);
|
|
553
|
-
}
|
|
554
|
-
if (uniqueRows.length === 0) return "";
|
|
555
|
-
const md = [];
|
|
556
|
-
md.push("| " + uniqueRows[0].join(" | ") + " |");
|
|
557
|
-
md.push("| " + uniqueRows[0].map(() => "---").join(" | ") + " |");
|
|
558
|
-
for (let i = 1; i < uniqueRows.length; i++) {
|
|
559
|
-
md.push("| " + uniqueRows[i].join(" | ") + " |");
|
|
560
|
-
}
|
|
561
|
-
return md.join("\n");
|
|
562
|
-
}
|
|
563
|
-
|
|
564
|
-
// src/types.ts
|
|
565
|
-
var HEADING_RATIO_H1 = 1.5;
|
|
566
|
-
var HEADING_RATIO_H2 = 1.3;
|
|
567
|
-
var HEADING_RATIO_H3 = 1.15;
|
|
568
|
-
|
|
569
|
-
// src/hwpx/parser.ts
|
|
570
|
-
init_page_range();
|
|
571
72
|
var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
|
|
572
73
|
var MAX_ZIP_ENTRIES = 500;
|
|
573
74
|
function clampSpan(val, max) {
|
|
@@ -2382,7 +1883,6 @@ function parseLenientCfb(data) {
|
|
|
2382
1883
|
}
|
|
2383
1884
|
|
|
2384
1885
|
// src/hwp5/parser.ts
|
|
2385
|
-
init_page_range();
|
|
2386
1886
|
import { createRequire } from "module";
|
|
2387
1887
|
var require2 = createRequire(import.meta.url);
|
|
2388
1888
|
var CFB = require2("cfb");
|
|
@@ -3026,2308 +2526,56 @@ function arrangeCells(rows, cols, cells) {
|
|
|
3026
2526
|
return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
|
|
3027
2527
|
}
|
|
3028
2528
|
|
|
3029
|
-
// src/
|
|
3030
|
-
|
|
3031
|
-
|
|
3032
|
-
|
|
3033
|
-
|
|
3034
|
-
var
|
|
3035
|
-
var
|
|
3036
|
-
|
|
3037
|
-
|
|
3038
|
-
|
|
3039
|
-
|
|
3040
|
-
|
|
3041
|
-
|
|
3042
|
-
var MIN_COORD_MERGE_TOL = 8;
|
|
3043
|
-
function extractLines(fnArray, argsArray) {
|
|
3044
|
-
const horizontals = [];
|
|
3045
|
-
const verticals = [];
|
|
3046
|
-
let lineWidth = 1;
|
|
3047
|
-
let currentPath = [];
|
|
3048
|
-
let pathStartX = 0, pathStartY = 0;
|
|
3049
|
-
let curX = 0, curY = 0;
|
|
3050
|
-
function pushRectangle(path, rx, ry, rw, rh) {
|
|
3051
|
-
if (Math.abs(rh) < ORIENTATION_TOL * 2) {
|
|
3052
|
-
path.push({ x1: rx, y1: ry + rh / 2, x2: rx + rw, y2: ry + rh / 2 });
|
|
3053
|
-
} else if (Math.abs(rw) < ORIENTATION_TOL * 2) {
|
|
3054
|
-
path.push({ x1: rx + rw / 2, y1: ry, x2: rx + rw / 2, y2: ry + rh });
|
|
3055
|
-
} else {
|
|
3056
|
-
path.push(
|
|
3057
|
-
{ x1: rx, y1: ry, x2: rx + rw, y2: ry },
|
|
3058
|
-
{ x1: rx + rw, y1: ry, x2: rx + rw, y2: ry + rh },
|
|
3059
|
-
{ x1: rx + rw, y1: ry + rh, x2: rx, y2: ry + rh },
|
|
3060
|
-
{ x1: rx, y1: ry + rh, x2: rx, y2: ry }
|
|
3061
|
-
);
|
|
3062
|
-
}
|
|
3063
|
-
}
|
|
3064
|
-
function flushPath(isStroke) {
|
|
3065
|
-
if (!isStroke) {
|
|
3066
|
-
currentPath = [];
|
|
3067
|
-
return;
|
|
3068
|
-
}
|
|
3069
|
-
for (const seg of currentPath) {
|
|
3070
|
-
classifyAndAdd(seg, lineWidth, horizontals, verticals);
|
|
3071
|
-
}
|
|
3072
|
-
currentPath = [];
|
|
3073
|
-
}
|
|
3074
|
-
for (let i = 0; i < fnArray.length; i++) {
|
|
3075
|
-
const op = fnArray[i];
|
|
3076
|
-
const args = argsArray[i];
|
|
3077
|
-
switch (op) {
|
|
3078
|
-
case OPS.setLineWidth:
|
|
3079
|
-
lineWidth = args[0] || 1;
|
|
3080
|
-
break;
|
|
3081
|
-
case OPS.constructPath: {
|
|
3082
|
-
const arg0 = args[0];
|
|
3083
|
-
if (Array.isArray(arg0)) {
|
|
3084
|
-
const subOps = arg0;
|
|
3085
|
-
const coords = args[1];
|
|
3086
|
-
let ci = 0;
|
|
3087
|
-
for (const subOp of subOps) {
|
|
3088
|
-
if (subOp === OPS.moveTo) {
|
|
3089
|
-
curX = coords[ci++];
|
|
3090
|
-
curY = coords[ci++];
|
|
3091
|
-
pathStartX = curX;
|
|
3092
|
-
pathStartY = curY;
|
|
3093
|
-
} else if (subOp === OPS.lineTo) {
|
|
3094
|
-
const x2 = coords[ci++], y2 = coords[ci++];
|
|
3095
|
-
currentPath.push({ x1: curX, y1: curY, x2, y2 });
|
|
3096
|
-
curX = x2;
|
|
3097
|
-
curY = y2;
|
|
3098
|
-
} else if (subOp === OPS.rectangle) {
|
|
3099
|
-
const rx = coords[ci++], ry = coords[ci++];
|
|
3100
|
-
const rw = coords[ci++], rh = coords[ci++];
|
|
3101
|
-
pushRectangle(currentPath, rx, ry, rw, rh);
|
|
3102
|
-
} else if (subOp === OPS.closePath) {
|
|
3103
|
-
if (curX !== pathStartX || curY !== pathStartY) {
|
|
3104
|
-
currentPath.push({ x1: curX, y1: curY, x2: pathStartX, y2: pathStartY });
|
|
3105
|
-
}
|
|
3106
|
-
curX = pathStartX;
|
|
3107
|
-
curY = pathStartY;
|
|
3108
|
-
} else if (subOp === OPS.curveTo) {
|
|
3109
|
-
ci += 6;
|
|
3110
|
-
} else if (subOp === OPS.curveTo2 || subOp === OPS.curveTo3) {
|
|
3111
|
-
ci += 4;
|
|
3112
|
-
}
|
|
3113
|
-
}
|
|
3114
|
-
} else {
|
|
3115
|
-
const afterOp = arg0;
|
|
3116
|
-
const dataArr = args[1];
|
|
3117
|
-
const pathData = dataArr?.[0];
|
|
3118
|
-
if (pathData && typeof pathData === "object") {
|
|
3119
|
-
const len = Object.keys(pathData).length;
|
|
3120
|
-
let di = 0;
|
|
3121
|
-
while (di < len) {
|
|
3122
|
-
const drawOp = pathData[di++];
|
|
3123
|
-
if (drawOp === 0 /* moveTo */) {
|
|
3124
|
-
curX = pathData[di++];
|
|
3125
|
-
curY = pathData[di++];
|
|
3126
|
-
pathStartX = curX;
|
|
3127
|
-
pathStartY = curY;
|
|
3128
|
-
} else if (drawOp === 1 /* lineTo */) {
|
|
3129
|
-
const x2 = pathData[di++], y2 = pathData[di++];
|
|
3130
|
-
currentPath.push({ x1: curX, y1: curY, x2, y2 });
|
|
3131
|
-
curX = x2;
|
|
3132
|
-
curY = y2;
|
|
3133
|
-
} else if (drawOp === 2 /* curveTo */) {
|
|
3134
|
-
di += 6;
|
|
3135
|
-
} else if (drawOp === 3 /* quadraticCurveTo */) {
|
|
3136
|
-
di += 4;
|
|
3137
|
-
} else if (drawOp === 4 /* closePath */) {
|
|
3138
|
-
if (curX !== pathStartX || curY !== pathStartY) {
|
|
3139
|
-
currentPath.push({ x1: curX, y1: curY, x2: pathStartX, y2: pathStartY });
|
|
3140
|
-
}
|
|
3141
|
-
curX = pathStartX;
|
|
3142
|
-
curY = pathStartY;
|
|
3143
|
-
} else {
|
|
3144
|
-
break;
|
|
3145
|
-
}
|
|
3146
|
-
}
|
|
3147
|
-
}
|
|
3148
|
-
if (afterOp === OPS.stroke || afterOp === OPS.closeStroke) {
|
|
3149
|
-
flushPath(true);
|
|
3150
|
-
} else if (afterOp === OPS.fill || afterOp === OPS.eoFill || afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke) {
|
|
3151
|
-
flushPath(true);
|
|
3152
|
-
} else if (afterOp === OPS.endPath) {
|
|
3153
|
-
flushPath(false);
|
|
3154
|
-
}
|
|
3155
|
-
}
|
|
3156
|
-
break;
|
|
3157
|
-
}
|
|
3158
|
-
case OPS.stroke:
|
|
3159
|
-
case OPS.closeStroke:
|
|
3160
|
-
flushPath(true);
|
|
3161
|
-
break;
|
|
3162
|
-
case OPS.fill:
|
|
3163
|
-
case OPS.eoFill:
|
|
3164
|
-
case OPS.fillStroke:
|
|
3165
|
-
case OPS.eoFillStroke:
|
|
3166
|
-
case OPS.closeFillStroke:
|
|
3167
|
-
case OPS.closeEOFillStroke:
|
|
3168
|
-
flushPath(true);
|
|
3169
|
-
break;
|
|
3170
|
-
case OPS.endPath:
|
|
3171
|
-
flushPath(false);
|
|
3172
|
-
break;
|
|
3173
|
-
}
|
|
3174
|
-
}
|
|
3175
|
-
return { horizontals, verticals };
|
|
3176
|
-
}
|
|
3177
|
-
/**
 * Sorts a raw path segment into the horizontal or vertical bucket.
 *
 * Segments shorter than MIN_LINE_LENGTH are dropped. A segment whose vertical
 * extent is within ORIENTATION_TOL is stored as a horizontal line (y averaged,
 * x ordered ascending); otherwise one whose horizontal extent is within
 * ORIENTATION_TOL is stored as a vertical line. Anything else is ignored.
 *
 * @param {{x1:number,y1:number,x2:number,y2:number}} seg segment end points
 * @param {number} lineWidth stroke width recorded on the stored line
 * @param {Array} horizontals receives normalised horizontal lines (mutated)
 * @param {Array} verticals receives normalised vertical lines (mutated)
 * @returns {void}
 */
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
  const { x1: ax, y1: ay, x2: bx, y2: by } = seg;
  const spanX = Math.abs(bx - ax);
  const spanY = Math.abs(by - ay);
  if (Math.sqrt(spanX * spanX + spanY * spanY) < MIN_LINE_LENGTH) return;

  // Horizontal takes precedence when a segment is tiny in both directions.
  if (spanY <= ORIENTATION_TOL) {
    const midY = (ay + by) / 2;
    horizontals.push({ x1: Math.min(ax, bx), y1: midY, x2: Math.max(ax, bx), y2: midY, lineWidth });
    return;
  }
  if (spanX <= ORIENTATION_TOL) {
    const midX = (ax + bx) / 2;
    verticals.push({ x1: midX, y1: Math.min(ay, by), x2: midX, y2: Math.max(ay, by), lineWidth });
  }
}
|
|
3194
|
-
/**
 * Drops lines thicker than MAX_LINE_WIDTH and then merges near-duplicate
 * parallel lines in each orientation.
 *
 * @param {Array} horizontals horizontal line records
 * @param {Array} verticals vertical line records
 * @returns {{horizontals: Array, verticals: Array}} the filtered, merged lines
 */
function preprocessLines(horizontals, verticals) {
  const isThinEnough = (line) => line.lineWidth <= MAX_LINE_WIDTH;
  // Both filters run before either merge, matching the original ordering.
  const thinHorizontals = horizontals.filter(isThinEnough);
  const thinVerticals = verticals.filter(isThinEnough);
  return {
    horizontals: mergeParallelLines(thinHorizontals, "h"),
    verticals: mergeParallelLines(thinVerticals, "v")
  };
}
|
|
3201
|
-
/**
 * Collapses near-coincident parallel lines (e.g. the two edges of a thick
 * rule, or a line drawn twice) into single lines.
 *
 * Lines are sorted by their fixed coordinate (y for "h", x for "v") and then
 * by their start. A line is folded into the previously kept one when their
 * fixed coordinates differ by at most 3 units and their extents overlap by
 * more than 30% of the shorter line. The merged line spans the union of both
 * extents, sits at the midpoint of the two positions and keeps the larger
 * line width.
 *
 * The caller's line objects are never modified: every line placed in the
 * result is a shallow copy. Previously the merge wrote into the input
 * objects themselves.
 *
 * @param {Array<{x1:number,y1:number,x2:number,y2:number,lineWidth:number}>} lines
 * @param {"h"|"v"} dir orientation shared by every entry in `lines`
 * @returns {Array} merged lines; `lines` itself when it has at most one entry
 */
function mergeParallelLines(lines, dir) {
  if (lines.length <= 1) return lines;
  const horizontal = dir === "h";
  const positionOf = (l) => (horizontal ? l.y1 : l.x1);
  const startOf = (l) => (horizontal ? l.x1 : l.y1);
  const endOf = (l) => (horizontal ? l.x2 : l.y2);

  const sorted = [...lines].sort((a, b) => {
    const posA = positionOf(a);
    const posB = positionOf(b);
    if (Math.abs(posA - posB) > 0.1) return posA - posB;
    return startOf(a) - startOf(b);
  });

  const MERGE_TOL = 3;
  // Copy before storing so the merge below only ever mutates our own objects.
  const result = [{ ...sorted[0] }];
  for (let i = 1; i < sorted.length; i++) {
    const prev = result[result.length - 1];
    const curr = sorted[i];
    if (Math.abs(positionOf(prev) - positionOf(curr)) <= MERGE_TOL) {
      const overlap = Math.min(endOf(prev), endOf(curr)) - Math.max(startOf(prev), startOf(curr));
      const minLen = Math.min(endOf(prev) - startOf(prev), endOf(curr) - startOf(curr));
      if (overlap > minLen * 0.3) {
        if (horizontal) {
          prev.x1 = Math.min(prev.x1, curr.x1);
          prev.x2 = Math.max(prev.x2, curr.x2);
          prev.y1 = (prev.y1 + curr.y1) / 2;
          prev.y2 = prev.y1;
        } else {
          prev.y1 = Math.min(prev.y1, curr.y1);
          prev.y2 = Math.max(prev.y2, curr.y2);
          prev.x1 = (prev.x1 + curr.x1) / 2;
          prev.x2 = prev.x1;
        }
        prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
        continue;
      }
    }
    result.push({ ...curr });
  }
  return result;
}
|
|
3243
|
-
/**
 * Removes lines that look like a page frame rather than table rules.
 *
 * A line is discarded only when it lies within 5 units of a page edge
 * (coordinate 0 or the page extent) AND covers at least 90% of the page in
 * its own direction. Everything else is kept.
 *
 * @param {Array} horizontals horizontal lines (uses y1, x1, x2)
 * @param {Array} verticals vertical lines (uses x1, y1, y2)
 * @param {number} pageWidth page width in the same units as the lines
 * @param {number} pageHeight page height in the same units as the lines
 * @returns {{horizontals: Array, verticals: Array}} filtered copies
 */
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
  const margin = 5;
  const hugsEdge = (pos, extent) => Math.abs(pos) < margin || Math.abs(pos - extent) < margin;

  const keptHorizontals = horizontals.filter((line) => {
    if (!hugsEdge(line.y1, pageHeight)) return true;
    return line.x2 - line.x1 < pageWidth * 0.9;
  });
  const keptVerticals = verticals.filter((line) => {
    if (!hugsEdge(line.x1, pageWidth)) return true;
    return line.y2 - line.y1 < pageHeight * 0.9;
  });
  return { horizontals: keptHorizontals, verticals: keptVerticals };
}
|
|
2529
|
+
// src/xlsx/parser.ts
|
|
2530
|
+
import JSZip3 from "jszip";
|
|
2531
|
+
import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
|
|
2532
|
+
// Hard limits for the XLSX parser. Their use sites are outside this excerpt;
// the descriptions below are inferred from the names only — TODO confirm.
var MAX_SHEETS = 100; // presumably the maximum number of worksheets processed
var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024; // 100 MiB; presumably a cap on unzipped size
var MAX_ROWS2 = 1e4; // presumably the per-sheet row cap
var MAX_COLS2 = 200; // presumably the per-sheet column cap
|
|
2536
|
+
/**
 * Removes binary floating-point noise from a plain decimal string, e.g.
 * "0.30000000000000004" becomes "0.3".
 *
 * Only strings of the form `-?digits.digits` are touched; anything else
 * (integers, text, values already in exponent form) is returned unchanged.
 * The value is rounded to 15 significant digits and re-serialised.
 *
 * If re-serialising would switch to exponential notation (very small or very
 * large magnitudes, e.g. "0.0000001" -> "1e-7"), the original text is
 * returned instead so a plain decimal never turns into scientific notation.
 *
 * @param {string} raw cell text as stored in the sheet XML
 * @returns {string} the cleaned decimal string, or `raw` when not applicable
 */
function cleanNumericValue(raw) {
  if (!/^-?\d+\.\d+$/.test(raw)) return raw;
  const num = Number.parseFloat(raw);
  if (!Number.isFinite(num)) return raw;
  const cleaned = Number.parseFloat(num.toPrecision(15)).toString();
  // Number#toString uses exponent form below 1e-6 and from 1e21 upwards.
  if (cleaned.includes("e")) return raw;
  return cleaned;
}
|
|
3254
|
-
function
|
|
3255
|
-
const
|
|
3256
|
-
|
|
3257
|
-
|
|
3258
|
-
|
|
3259
|
-
|
|
3260
|
-
const radius = Math.max(h.lineWidth, v.lineWidth, 1);
|
|
3261
|
-
vertices.push({ x: v.x1, y: h.y1, radius });
|
|
3262
|
-
}
|
|
3263
|
-
}
|
|
3264
|
-
}
|
|
3265
|
-
return vertices;
|
|
3266
|
-
}
|
|
3267
|
-
/**
 * Greedily clusters vertices that lie close to one another.
 *
 * Each not-yet-consumed vertex seeds a cluster. Every later vertex whose x
 * and y both lie within `VERTEX_MERGE_FACTOR * max(cluster radius, its own
 * radius)` of the SEED (not of the running centroid) joins the cluster. A
 * cluster is emitted as the mean position of its members with the largest
 * member radius.
 *
 * @param {Array<{x:number,y:number,radius:number}>} vertices
 * @returns {Array<{x:number,y:number,radius:number}>} merged vertices; the
 *   input array itself when it has at most one entry
 */
function mergeVertices(vertices) {
  if (vertices.length <= 1) return vertices;
  const consumed = new Set();
  const clusters = [];
  for (const [seedIdx, seed] of vertices.entries()) {
    if (consumed.has(seedIdx)) continue;
    let totalX = seed.x;
    let totalY = seed.y;
    let radius = seed.radius;
    let members = 1;
    for (let k = seedIdx + 1; k < vertices.length; k++) {
      if (consumed.has(k)) continue;
      const candidate = vertices[k];
      // The tolerance grows as larger-radius members join the cluster.
      const tol = VERTEX_MERGE_FACTOR * Math.max(radius, candidate.radius);
      const closeInX = Math.abs(seed.x - candidate.x) <= tol;
      const closeInY = Math.abs(seed.y - candidate.y) <= tol;
      if (!(closeInX && closeInY)) continue;
      totalX += candidate.x;
      totalY += candidate.y;
      radius = Math.max(radius, candidate.radius);
      members++;
      consumed.add(k);
    }
    clusters.push({ x: totalX / members, y: totalY / members, radius });
  }
  return clusters;
}
|
|
2543
|
+
/**
 * Converts an A1-style cell reference into zero-based coordinates.
 *
 * Only upper-case column letters followed by digits are accepted (no `$`
 * anchors, no lower case); anything else yields null.
 *
 * @param {string} ref reference such as "B7" or "AA10"
 * @returns {{col:number,row:number}|null} zero-based column and row, or null
 */
function parseCellRef(ref) {
  const match = ref.match(/^([A-Z]+)(\d+)$/);
  if (!match) return null;
  const [, letters, digits] = match;
  // Column letters are bijective base-26: A=1 ... Z=26, AA=27.
  const oneBasedCol = [...letters].reduce((acc, ch) => acc * 26 + (ch.charCodeAt(0) - 64), 0);
  return { col: oneBasedCol - 1, row: parseInt(digits, 10) - 1 };
}
|
|
3291
|
-
function
|
|
3292
|
-
|
|
3293
|
-
|
|
3294
|
-
const
|
|
3295
|
-
|
|
3296
|
-
|
|
3297
|
-
|
|
3298
|
-
...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
|
|
3299
|
-
...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
|
|
3300
|
-
];
|
|
3301
|
-
const groups = groupConnectedLines(allLines);
|
|
3302
|
-
const grids = [];
|
|
3303
|
-
for (const group of groups) {
|
|
3304
|
-
const hLines = group.filter((l) => l.type === "h");
|
|
3305
|
-
const vLines = group.filter((l) => l.type === "v");
|
|
3306
|
-
if (hLines.length < 2 || vLines.length < 2) continue;
|
|
3307
|
-
let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
|
|
3308
|
-
for (const l of vLines) {
|
|
3309
|
-
if (l.x1 < gx1) gx1 = l.x1;
|
|
3310
|
-
if (l.x1 > gx2) gx2 = l.x1;
|
|
3311
|
-
}
|
|
3312
|
-
for (const l of hLines) {
|
|
3313
|
-
if (l.y1 < gy1) gy1 = l.y1;
|
|
3314
|
-
if (l.y1 > gy2) gy2 = l.y1;
|
|
3315
|
-
}
|
|
3316
|
-
const groupBbox = {
|
|
3317
|
-
x1: gx1 - CONNECT_TOL,
|
|
3318
|
-
y1: gy1 - CONNECT_TOL,
|
|
3319
|
-
x2: gx2 + CONNECT_TOL,
|
|
3320
|
-
y2: gy2 + CONNECT_TOL
|
|
3321
|
-
};
|
|
3322
|
-
const groupVertices = vertices.filter(
|
|
3323
|
-
(v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
|
|
3324
|
-
);
|
|
3325
|
-
const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
|
|
3326
|
-
const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
|
|
3327
|
-
const rawYs = [
|
|
3328
|
-
...hLines.map((l) => l.y1),
|
|
3329
|
-
...groupVertices.map((v) => v.y)
|
|
3330
|
-
];
|
|
3331
|
-
const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
|
|
3332
|
-
const rawXs = [
|
|
3333
|
-
...vLines.map((l) => l.x1),
|
|
3334
|
-
...groupVertices.map((v) => v.x)
|
|
3335
|
-
];
|
|
3336
|
-
const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
|
|
3337
|
-
if (rowYs.length < 2 || colXs.length < 2) continue;
|
|
3338
|
-
const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
|
|
3339
|
-
const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
|
|
3340
|
-
if (validRowYs.length < 2 || validColXs.length < 2) continue;
|
|
3341
|
-
const bbox = {
|
|
3342
|
-
x1: validColXs[0],
|
|
3343
|
-
y1: validRowYs[validRowYs.length - 1],
|
|
3344
|
-
x2: validColXs[validColXs.length - 1],
|
|
3345
|
-
y2: validRowYs[0]
|
|
3346
|
-
};
|
|
3347
|
-
grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
|
|
3348
|
-
}
|
|
3349
|
-
return mergeAdjacentGrids(grids);
|
|
2550
|
+
/**
 * Parses a merged-range reference such as "A1:C3".
 *
 * The text must consist of exactly two cell references separated by a single
 * colon, and both halves must be accepted by parseCellRef.
 *
 * @param {string} ref range reference
 * @returns {{startCol:number,startRow:number,endCol:number,endRow:number}|null}
 *   zero-based corners of the range, or null when the text is not a valid range
 */
function parseMergeRef(ref) {
  const corners = ref.split(":");
  if (corners.length !== 2) return null;
  const topLeft = parseCellRef(corners[0]);
  const bottomRight = parseCellRef(corners[1]);
  if (!(topLeft && bottomRight)) return null;
  const { col: startCol, row: startRow } = topLeft;
  const { col: endCol, row: endRow } = bottomRight;
  return { startCol, startRow, endCol, endRow };
}
|
|
3351
|
-
function
|
|
3352
|
-
|
|
3353
|
-
const result = [
|
|
3354
|
-
for (let i =
|
|
3355
|
-
const prevX = result[result.length - 1];
|
|
3356
|
-
if (colXs[i] - prevX < minWidth && i < colXs.length - 1) {
|
|
3357
|
-
continue;
|
|
3358
|
-
}
|
|
3359
|
-
result.push(colXs[i]);
|
|
3360
|
-
}
|
|
2558
|
+
/**
 * Returns the descendants of `parent` with the given tag name as a plain
 * array, so callers can use array methods and `for...of`.
 *
 * @param {{getElementsByTagName: Function}} parent DOM node to search under
 * @param {string} tagName tag name passed to getElementsByTagName
 * @returns {Array} the matching nodes, in the order the node list reports them
 */
function getElements(parent, tagName) {
  const nodes = parent.getElementsByTagName(tagName);
  // Index access only: the node list is treated as array-like, not iterable.
  return Array.from({ length: nodes.length }, (_, index) => nodes[index]);
}
|
|
3363
|
-
function
|
|
3364
|
-
|
|
3365
|
-
const result = [rowYs[0]];
|
|
3366
|
-
for (let i = 1; i < rowYs.length; i++) {
|
|
3367
|
-
const prevY = result[result.length - 1];
|
|
3368
|
-
if (prevY - rowYs[i] < minHeight && i < rowYs.length - 1) {
|
|
3369
|
-
continue;
|
|
3370
|
-
}
|
|
3371
|
-
result.push(rowYs[i]);
|
|
3372
|
-
}
|
|
3373
|
-
return result;
|
|
2564
|
+
/**
 * Returns the trimmed text content of a node, or "" when it has none.
 *
 * @param {{textContent?: string|null}} el node to read
 * @returns {string} trimmed text, or the empty string
 */
function getTextContent(el) {
  const { textContent } = el;
  if (textContent === null || textContent === undefined) return "";
  const trimmed = textContent.trim();
  // Mirrors the original `?? ""` fallback applied to the trimmed value.
  return trimmed ?? "";
}
|
|
3375
|
-
function
|
|
3376
|
-
|
|
3377
|
-
const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
3378
|
-
const merged = [sorted[0]];
|
|
3379
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
3380
|
-
const prev = merged[merged.length - 1];
|
|
3381
|
-
const curr = sorted[i];
|
|
3382
|
-
if (prev.colXs.length === curr.colXs.length) {
|
|
3383
|
-
const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
|
|
3384
|
-
const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
|
|
3385
|
-
const verticalGap = prev.bbox.y1 - curr.bbox.y2;
|
|
3386
|
-
if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
|
|
3387
|
-
const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
|
|
3388
|
-
merged[merged.length - 1] = {
|
|
3389
|
-
rowYs: allRowYs,
|
|
3390
|
-
colXs: prev.colXs,
|
|
3391
|
-
bbox: {
|
|
3392
|
-
x1: Math.min(prev.bbox.x1, curr.bbox.x1),
|
|
3393
|
-
y1: Math.min(prev.bbox.y1, curr.bbox.y1),
|
|
3394
|
-
x2: Math.max(prev.bbox.x2, curr.bbox.x2),
|
|
3395
|
-
y2: Math.max(prev.bbox.y2, curr.bbox.y2)
|
|
3396
|
-
},
|
|
3397
|
-
vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
|
|
3398
|
-
};
|
|
3399
|
-
continue;
|
|
3400
|
-
}
|
|
3401
|
-
}
|
|
3402
|
-
merged.push(curr);
|
|
3403
|
-
}
|
|
3404
|
-
return merged;
|
|
2567
|
+
/**
 * Parses an XML string into a DOM document after passing it through
 * stripDtd (defined elsewhere in this file).
 *
 * @param {string} text raw XML
 * @returns {Document} the document produced by DOMParser with "text/xml"
 */
function parseXml(text) {
  const parser = new DOMParser2();
  const sanitized = stripDtd(text);
  return parser.parseFromString(sanitized, "text/xml");
}
|
|
3406
|
-
function
|
|
3407
|
-
|
|
3408
|
-
const
|
|
3409
|
-
const
|
|
3410
|
-
for (
|
|
3411
|
-
const
|
|
3412
|
-
|
|
3413
|
-
if (Math.abs(sorted[i] - avg) <= tolerance) {
|
|
3414
|
-
last.sum += sorted[i];
|
|
3415
|
-
last.count++;
|
|
3416
|
-
} else {
|
|
3417
|
-
clusters.push({ sum: sorted[i], count: 1 });
|
|
3418
|
-
}
|
|
2570
|
+
function parseSharedStrings(xml) {
|
|
2571
|
+
const doc = parseXml(xml);
|
|
2572
|
+
const strings = [];
|
|
2573
|
+
const siList = getElements(doc.documentElement, "si");
|
|
2574
|
+
for (const si of siList) {
|
|
2575
|
+
const tElements = getElements(si, "t");
|
|
2576
|
+
strings.push(tElements.map((t) => t.textContent ?? "").join(""));
|
|
3419
2577
|
}
|
|
3420
|
-
return
|
|
3421
|
-
}
|
|
3422
|
-
/**
 * Partitions lines into connected components, where two lines are connected
 * when linesIntersect reports they touch (directly or through other lines).
 *
 * Every pair is tested once; components are tracked with a union-find array.
 *
 * @param {Array} lines line records accepted by linesIntersect
 * @returns {Array<Array>} one array of lines per component, each in input order
 */
function groupConnectedLines(lines) {
  const leader = lines.map((_, idx) => idx);

  // Finds the representative of a node, shortening the chain as it walks.
  const rootOf = (node) => {
    let cur = node;
    while (leader[cur] !== cur) {
      leader[cur] = leader[leader[cur]];
      cur = leader[cur];
    }
    return cur;
  };
  const link = (a, b) => {
    const rootA = rootOf(a);
    const rootB = rootOf(b);
    if (rootA !== rootB) leader[rootA] = rootB;
  };

  for (let i = 0; i < lines.length; i++) {
    for (let j = i + 1; j < lines.length; j++) {
      if (linesIntersect(lines[i], lines[j])) link(i, j);
    }
  }

  const buckets = /* @__PURE__ */ new Map();
  for (let i = 0; i < lines.length; i++) {
    const root = rootOf(i);
    const bucket = buckets.get(root);
    if (bucket) {
      bucket.push(lines[i]);
    } else {
      buckets.set(root, [lines[i]]);
    }
  }
  return Array.from(buckets.values());
}
|
|
3450
|
-
/**
 * Tells whether two axis-aligned lines touch, allowing CONNECT_TOL of slack.
 *
 * Same orientation: the lines must sit within CONNECT_TOL of each other
 * across their axis and their extents must overlap (or nearly meet).
 * Different orientation: the vertical line's x must fall inside the
 * horizontal line's x-range and the horizontal line's y inside the vertical
 * line's y-range, each widened by CONNECT_TOL.
 *
 * @param {{type:string,x1:number,y1:number,x2:number,y2:number}} a
 * @param {{type:string,x1:number,y1:number,x2:number,y2:number}} b
 * @returns {boolean} true when the lines are considered connected
 */
function linesIntersect(a, b) {
  if (a.type === b.type) {
    // `fixed` is the coordinate shared along the line; lo/hi bound its extent.
    const [fixed, lo, hi] = a.type === "h" ? ["y1", "x1", "x2"] : ["x1", "y1", "y2"];
    if (Math.abs(a[fixed] - b[fixed]) > CONNECT_TOL) return false;
    return Math.min(a[hi], b[hi]) >= Math.max(a[lo], b[lo]) - CONNECT_TOL;
  }
  const [h, v] = a.type === "h" ? [a, b] : [b, a];
  const tol = CONNECT_TOL;
  const xInside = v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol;
  const yInside = h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol;
  return xInside && yInside;
}
|
|
3465
|
-
/**
 * Derives table cells (with row/column spans) from a grid and the ruling
 * lines that are actually drawn.
 *
 * For every grid slot it first records whether a vertical border exists at
 * each column boundary and whether a horizontal border exists at each row
 * boundary. Cells are then grown from the top-left: a cell extends to the
 * right while its own row has no vertical border at the next boundary, then
 * extends downwards while none of its columns has a horizontal border at the
 * next row boundary.
 *
 * @param {{rowYs:number[],colXs:number[],vertexRadius:number}} grid
 * @param {Array} horizontals horizontal lines passed to hasHorizontalLine
 * @param {Array} verticals vertical lines passed to hasVerticalLine
 * @returns {Array<{row:number,col:number,rowSpan:number,colSpan:number,bbox:object}>}
 *   cells in row-major order; empty when the grid has no rows or columns
 */
function extractCells(grid, horizontals, verticals) {
  const { rowYs, colXs } = grid;
  const rowCount = rowYs.length - 1;
  const colCount = colXs.length - 1;
  if (rowCount <= 0 || colCount <= 0) return [];

  // vBorders[r][c]: vertical border at column boundary c alongside row r.
  const vBorders = [];
  for (let r = 0; r < rowCount; r++) {
    const flags = [];
    for (let c = 0; c <= colCount; c++) {
      flags.push(hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius));
    }
    vBorders.push(flags);
  }
  // hBorders[r][c]: horizontal border at row boundary r across column c.
  const hBorders = [];
  for (let r = 0; r <= rowCount; r++) {
    const flags = [];
    for (let c = 0; c < colCount; c++) {
      flags.push(hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius));
    }
    hBorders.push(flags);
  }

  const taken = Array.from({ length: rowCount }, () => Array(colCount).fill(false));
  const cells = [];
  for (let r = 0; r < rowCount; r++) {
    for (let c = 0; c < colCount; c++) {
      if (taken[r][c]) continue;

      // rowSpan is still 1 while widening, so only row r needs checking.
      let colSpan = 1;
      while (c + colSpan < colCount && !vBorders[r][c + colSpan]) colSpan++;

      let rowSpan = 1;
      while (r + rowSpan < rowCount) {
        const boundary = hBorders[r + rowSpan];
        let blocked = false;
        for (let k = c; k < c + colSpan; k++) {
          if (boundary[k]) {
            blocked = true;
            break;
          }
        }
        if (blocked) break;
        rowSpan++;
      }

      for (let rr = r; rr < r + rowSpan; rr++) {
        for (let cc = c; cc < c + colSpan; cc++) taken[rr][cc] = true;
      }
      cells.push({
        row: r,
        col: c,
        rowSpan,
        colSpan,
        bbox: {
          x1: colXs[c],
          y1: rowYs[r + rowSpan],
          x2: colXs[c + colSpan],
          y2: rowYs[r]
        }
      });
    }
  }
  return cells;
}
|
|
3534
|
-
/**
 * Checks whether some vertical line forms the border at `x` between `topY`
 * and `botY`.
 *
 * A line qualifies when its x is within max(VERTEX_MERGE_FACTOR *
 * vertexRadius, 4) of `x` and it covers at least 75% of the cell height.
 * Cells flatter than 0.1 never have a border.
 *
 * @param {Array<{x1:number,y1:number,y2:number}>} verticals
 * @param {number} x column boundary position
 * @param {number} topY one vertical end of the cell
 * @param {number} botY the other vertical end of the cell
 * @param {number} vertexRadius radius used to scale the position tolerance
 * @returns {boolean}
 */
function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
  const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
  const cellH = Math.abs(topY - botY);
  if (cellH < 0.1) return false;
  const required = cellH * 0.75;
  return verticals.some((line) => {
    if (Math.abs(line.x1 - x) > tol) return false;
    const covered = Math.min(line.y2, topY) - Math.max(line.y1, botY);
    return covered >= required;
  });
}
|
|
3548
|
-
/**
 * Checks whether some horizontal line forms the border at `y` between
 * `leftX` and `rightX`.
 *
 * A line qualifies when its y is within max(VERTEX_MERGE_FACTOR *
 * vertexRadius, 4) of `y` and it covers at least 75% of the cell width.
 * Cells narrower than 0.1 never have a border.
 *
 * @param {Array<{y1:number,x1:number,x2:number}>} horizontals
 * @param {number} y row boundary position
 * @param {number} leftX left end of the cell
 * @param {number} rightX right end of the cell
 * @param {number} vertexRadius radius used to scale the position tolerance
 * @returns {boolean}
 */
function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
  const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
  const cellW = Math.abs(rightX - leftX);
  if (cellW < 0.1) return false;
  const required = cellW * 0.75;
  return horizontals.some((line) => {
    if (Math.abs(line.y1 - y) > tol) return false;
    const covered = Math.min(line.x2, rightX) - Math.max(line.x1, leftX);
    return covered >= required;
  });
}
|
|
3562
|
-
/**
 * Assigns each text item to the cell it overlaps most.
 *
 * Cell boxes are widened by CELL_PADDING on every side. An item's score for
 * a cell is the overlapping area divided by the item's own area (width and
 * height each floored at 1; height falls back to fontSize when `h` is
 * falsy). The item goes to the best-scoring cell if that score exceeds 0.3,
 * otherwise it is left unassigned.
 *
 * @param {Array<{x:number,y:number,w:number,h?:number,fontSize:number}>} items
 * @param {Array<{bbox:{x1:number,y1:number,x2:number,y2:number}}>} cells
 * @returns {Map<object, Array>} every cell mapped to its (possibly empty) items
 */
function mapTextToCells(items, cells) {
  const assignments = /* @__PURE__ */ new Map();
  cells.forEach((cell) => assignments.set(cell, []));

  for (const item of items) {
    const pad = CELL_PADDING;
    const itemHeight = item.h || item.fontSize;
    const itemArea = Math.max(item.w, 1) * Math.max(itemHeight, 1);
    let winner = null;
    let winnerScore = 0;
    for (const cell of cells) {
      const { x1, y1, x2, y2 } = cell.bbox;
      const left = Math.max(item.x, x1 - pad);
      const right = Math.min(item.x + item.w, x2 + pad);
      const low = Math.max(item.y, y1 - pad);
      const high = Math.min(item.y + itemHeight, y2 + pad);
      if (left >= right || low >= high) continue;
      const score = ((right - left) * (high - low)) / itemArea;
      if (score > winnerScore) {
        winnerScore = score;
        winner = cell;
      }
    }
    if (winner && winnerScore > 0.3) assignments.get(winner).push(item);
  }
  return assignments;
}
|
|
3591
|
-
/**
 * Builds the text of one table cell from its positioned text items.
 *
 * Items are ordered by descending y, then ascending x, and grouped into
 * visual lines: an item joins the current line when its y is within
 * max(3, 0.6 * the smaller of its font size and the line's first font size)
 * of the line's starting y. Within a line, items are joined left to right:
 * items flagged by detectEvenSpacedItems are glued without a space, as are
 * items separated by less than 15% of the average font size, or by less than
 * 35% when either side of the join is Hangul. Otherwise a single space is
 * inserted. The lines are finally combined by mergeCellTextLines.
 *
 * @param {Array<{text:string,x:number,y:number,w:number,fontSize:number}>} items
 * @returns {string} the cell text ("" for no items)
 */
function cellTextToString(items) {
  if (items.length === 0) return "";
  if (items.length === 1) return items[0].text;

  const ordered = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
  const visualLines = [];
  let current = [ordered[0]];
  let baseline = ordered[0].y;
  for (const item of ordered.slice(1)) {
    const tol = Math.max(3, Math.min(item.fontSize, current[0].fontSize) * 0.6);
    if (Math.abs(item.y - baseline) <= tol) {
      current.push(item);
    } else {
      visualLines.push(current);
      current = [item];
      baseline = item.y;
    }
  }
  visualLines.push(current);

  const joinLine = (line) => {
    const s = line.sort((a, b) => a.x - b.x);
    if (s.length === 1) return s[0].text;
    const evenSpaced = detectEvenSpacedItems(s);
    let text = s[0].text;
    for (let j = 1; j < s.length; j++) {
      const piece = s[j].text;
      if (evenSpaced[j]) {
        text += piece;
        continue;
      }
      const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
      const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
      const touchesKorean = /[가-힣]$/.test(text) || /^[가-힣]/.test(piece);
      const glue = gap < avgFs * 0.15 || (gap < avgFs * 0.35 && touchesKorean);
      text += glue ? piece : " " + piece;
    }
    return text;
  };

  return mergeCellTextLines(visualLines.map((line) => joinLine(line)));
}
|
|
3635
|
-
/**
 * Flags items that belong to a run of evenly spaced single characters
 * (one Hangul syllable or one digit each).
 *
 * Consecutive single-character items form a run. A run is cut when the gap
 * to the previous item exceeds max(3 * fontSize, 30) or when a
 * non-single-character item appears. Runs of at least three items are handed
 * to markEvenRun, which decides whether to set the flags.
 *
 * @param {Array<{text:string,x:number,w:number,fontSize:number}>} items
 *   items of one visual line, ordered left to right
 * @returns {boolean[]} one flag per item; true means "join to the previous
 *   item without a space"
 */
function detectEvenSpacedItems(items) {
  const result = new Array(items.length).fill(false);
  if (items.length < 3) return result;

  const isSingleGlyph = (text) => /^[가-힣]{1}$/.test(text) || /^[\d]{1}$/.test(text);
  const closeRun = (from, to) => {
    if (to - from >= 3) markEvenRun(items, result, from, to);
  };

  let runStart = -1;
  for (let i = 0; i < items.length; i++) {
    if (!isSingleGlyph(items[i].text)) {
      if (runStart >= 0) closeRun(runStart, i);
      runStart = -1;
      continue;
    }
    if (runStart < 0) {
      runStart = i;
      continue;
    }
    // Already inside a run: an oversized gap ends it and starts a new one here.
    const gap = items[i].x - (items[i - 1].x + items[i - 1].w);
    const maxRunGap = Math.max(items[i].fontSize * 3, 30);
    if (gap > maxRunGap) {
      closeRun(runStart, i);
      runStart = i;
    }
  }
  if (runStart >= 0) closeRun(runStart, items.length);
  return result;
}
|
|
3664
|
-
/**
 * Marks items[start+1 .. end-1] in `result` when the run items[start .. end-1]
 * is evenly spaced.
 *
 * Only strictly positive gaps are considered, and at least two are required.
 * With fs = items[start].fontSize the run qualifies when the smallest gap is
 * at least 0.1 * fs, the largest is at most 3 * fs, and the largest is no
 * more than three times the smallest.
 *
 * @param {Array<{x:number,w:number,fontSize:number}>} items
 * @param {boolean[]} result flag array, mutated in place
 * @param {number} start index of the first item in the run
 * @param {number} end index one past the last item in the run
 * @returns {void}
 */
function markEvenRun(items, result, start, end) {
  const positiveGaps = [];
  for (let k = start + 1; k < end; k++) {
    const gap = items[k].x - (items[k - 1].x + items[k - 1].w);
    if (gap > 0) positiveGaps.push(gap);
  }
  if (positiveGaps.length < 2) return;

  const smallest = positiveGaps.reduce((lo, g) => (g < lo ? g : lo), Infinity);
  const largest = positiveGaps.reduce((hi, g) => (g > hi ? g : hi), -Infinity);
  const fs = items[start].fontSize;
  const evenlySpaced =
    smallest >= fs * 0.1 && largest <= fs * 3 && largest / Math.max(smallest, 0.1) <= 3;
  if (!evenlySpaced) return;

  for (let k = start + 1; k < end; k++) result[k] = true;
}
|
|
3683
|
-
/**
 * Joins the visual lines of a cell, gluing a line onto the previous one when
 * it looks like a wrapped continuation rather than a new line.
 *
 * A line is appended to the previous one when any of these holds (checked in
 * order):
 *  1. the previous line ends in Hangul and the line is pure Hangul of at
 *     most 8 characters;
 *  2. the trimmed line is at most 3 characters and starts with a closing
 *     bracket or `%`;
 *  3. the trimmed previous line ends with `,` or `(` and the trimmed line is
 *     at most 15 characters;
 *  4. the previous line ends with a digit or comma and the trimmed line is a
 *     digit/comma group (optionally closed by `)` or `]`) of at most 10
 *     characters.
 * All remaining lines are separated by "\n".
 *
 * @param {string[]} textLines lines from top to bottom
 * @returns {string} the combined cell text
 */
function mergeCellTextLines(textLines) {
  if (textLines.length <= 1) return textLines[0] || "";

  // Returns the text to glue onto `prev`, or null when `curr` starts a new line.
  const continuationOf = (prev, curr) => {
    const hangulWrap =
      /[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <= 8 && !curr.includes(" ");
    if (hangulWrap) return curr;
    const tight = curr.trim();
    if (tight.length <= 3 && /^[)\]%}]/.test(tight)) return tight;
    if (/[,(]$/.test(prev.trim()) && tight.length <= 15) return tight;
    if (/[\d,]$/.test(prev) && /^[\d,]+[)\]]?$/.test(tight) && tight.length <= 10) return tight;
    return null;
  };

  const merged = [textLines[0]];
  for (const curr of textLines.slice(1)) {
    const lastIdx = merged.length - 1;
    const tail = continuationOf(merged[lastIdx], curr);
    if (tail === null) {
      merged.push(curr);
    } else {
      merged[lastIdx] = merged[lastIdx] + tail;
    }
  }
  return merged.join("\n");
}
|
|
3703
|
-
|
|
3704
|
-
// src/pdf/cluster-detector.ts
|
|
3705
|
-
// Tuning constants for the border-less ("cluster") table detector.
var Y_TOL = 3; // max y distance for items to share a baseline row
var COL_CLUSTER_TOL = 15; // max gap between sorted x values inside one column cluster
var MIN_ROWS = 3; // fewest rows a candidate table must have
var MIN_COLS = 2; // fewest columns a candidate table must have
var MIN_GAP_FACTOR = 2; // suspicious gap threshold, in multiples of the row's average font size
var MIN_GAP_ABSOLUTE = 20; // lower bound for the suspicious gap threshold
var MIN_COL_FILL_RATIO = 0.4; // share of rows that must contribute to a column cluster
|
|
3712
|
-
/**
 * Detects tables that have no ruling lines, purely from text positions.
 *
 * Two strategies are tried in order:
 *  1. Header-driven: find a header row (detectHeaderRow), merge wrapped rows
 *     beneath it and cut table regions using the header's columns.
 *  2. Gap-driven (only when strategy 1 produced nothing): take rows with
 *     suspiciously wide gaps, cluster their x positions into columns and cut
 *     regions from all rows.
 * Every table that buildClusterTable accepts has its `usedItems` expanded so
 * that it also contains the original items behind any merged character run.
 *
 * @param {Array} items positioned text items of one page
 * @param {number} pageNum page number forwarded to buildClusterTable
 * @returns {Array} detected tables (possibly empty)
 */
function detectClusterTables(items, pageNum) {
  if (items.length < MIN_ROWS * MIN_COLS) return [];
  const { merged, originMap } = mergeEvenSpacedClusters(items);
  const rows = groupByBaseline(merged);
  if (rows.length < MIN_ROWS) return [];

  const results = [];
  const collect = (tableRows, columns) => {
    const table = buildClusterTable(tableRows, columns, pageNum);
    if (!table) return;
    expandUsedItems(table.usedItems, originMap);
    results.push(table);
  };

  const headerResult = detectHeaderRow(rows);
  if (headerResult) {
    const { columns, headerIdx } = headerResult;
    const headerItems = [...rows[headerIdx].items].sort((a, b) => a.x - b.x);
    const mergedRows = mergeMultiLineRows(rows.slice(headerIdx), columns);
    for (const region of findTableRegionsByHeader(mergedRows, columns, headerItems)) {
      collect(region.rows, columns);
    }
  }
  if (results.length > 0) return results;

  // Fallback: infer columns from rows that contain wide gaps.
  const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
  if (suspiciousRows.length < MIN_ROWS) return results;
  const columns = extractColumnClusters(suspiciousRows);
  if (columns.length < MIN_COLS) return results;
  for (const region of findTableRegions(rows, columns)) {
    collect(mergeMultiLineRows(region.rows, columns), columns);
  }
  return results;
}
|
|
3753
|
-
/**
 * Fuses runs of evenly spaced single characters (one Hangul syllable or one
 * digit per item) into single text items.
 *
 * Items are grouped into baseline rows and scanned left to right. A run
 * grows while the next item is a single character and its gap to the
 * previous item is between 0.1 and 3 times its font size. A run of three or
 * more items is merged when all gaps are positive and the largest is at most
 * three times the smallest. The merged item takes its position, height and
 * font from the first item and spans to the right edge of the last.
 *
 * @param {Array} items positioned text items
 * @returns {{merged: Array, originMap: Map<object, Array>}} the resulting
 *   items, and for every synthesised item the original items it replaced
 */
function mergeEvenSpacedClusters(items) {
  const originMap = /* @__PURE__ */ new Map();
  const merged = [];
  const isSingleGlyph = (item) => /^[가-힣\d]$/.test(item.text);
  const gapBefore = (list, k) => list[k].x - (list[k - 1].x + list[k - 1].w);

  for (const row of groupByBaseline(items)) {
    const sorted = [...row.items].sort((a, b) => a.x - b.x);
    let pos = 0;
    while (pos < sorted.length) {
      let runEnd = pos + 1;
      if (isSingleGlyph(sorted[pos])) {
        while (runEnd < sorted.length && isSingleGlyph(sorted[runEnd])) {
          const gap = gapBefore(sorted, runEnd);
          const fs = sorted[runEnd].fontSize;
          if (gap < fs * 0.1 || gap > fs * 3) break;
          runEnd++;
        }
      }

      // A run of 3+ can only arise from the single-glyph branch above.
      if (runEnd - pos >= 3) {
        let minG = Infinity;
        let maxG = -Infinity;
        for (let k = pos + 1; k < runEnd; k++) {
          const gap = gapBefore(sorted, k);
          if (gap < minG) minG = gap;
          if (gap > maxG) maxG = gap;
        }
        if (minG > 0 && maxG / minG <= 3) {
          const run = sorted.slice(pos, runEnd);
          const first = run[0];
          const last = run[run.length - 1];
          const item = {
            text: run.map((r) => r.text).join(""),
            x: first.x,
            y: first.y,
            w: last.x + last.w - first.x,
            h: first.h,
            fontSize: first.fontSize,
            fontName: first.fontName
          };
          originMap.set(item, run);
          merged.push(item);
          pos = runEnd;
          continue;
        }
      }

      merged.push(sorted[pos]);
      pos++;
    }
  }
  return { merged, originMap };
}
|
|
3805
|
-
/**
 * Adds to `usedItems` the original items behind every synthesised item it
 * already contains.
 *
 * @param {Set<object>} usedItems set of consumed items, mutated in place
 * @param {Map<object, Iterable<object>>} originMap synthesised item -> originals
 * @returns {void}
 */
function expandUsedItems(usedItems, originMap) {
  // Snapshot first so the set is not modified while it is being iterated.
  const originals = [...usedItems].flatMap((item) => {
    const origins = originMap.get(item);
    return origins ? [...origins] : [];
  });
  originals.forEach((original) => usedItems.add(original));
}
|
|
3813
|
-
/**
 * Looks for the first row that resembles a table header and is followed by
 * enough rows matching its columns.
 *
 * A header candidate must: have between MIN_COLS and 6 items; have no item
 * longer than 8 characters; contain at least one Hangul character; have no
 * item starting with a bullet-like symbol; span at least 40% of the overall
 * text width; and contain at least one gap of 2.5x the average font size or
 * more. It is accepted when at least MIN_ROWS of the following rows match
 * MIN_COLS or more of its columns (counting stops at MIN_ROWS + 2 matches).
 *
 * @param {Array<{y:number,items:Array}>} rows baseline rows, top to bottom
 * @returns {{columns: Array<{x:number,count:number}>, headerIdx: number}|null}
 */
function detectHeaderRow(rows) {
  const allItems = rows.flatMap((r) => r.items);
  if (allItems.length === 0) return null;

  let leftEdge = Infinity;
  let rightEdge = -Infinity;
  for (const item of allItems) {
    if (item.x < leftEdge) leftEdge = item.x;
    const itemRight = item.x + item.w;
    if (itemRight > rightEdge) rightEdge = itemRight;
  }
  const pageSpan = rightEdge - leftEdge;
  if (pageSpan <= 0) return null;

  const hasHeaderLikeText = (cells) => {
    if (cells.length < MIN_COLS || cells.length > 6) return false;
    if (cells.some((i) => i.text.length > 8)) return false;
    if (!cells.some((i) => /[가-힣]/.test(i.text))) return false;
    return !cells.some((i) => /^[□■○●·※▶▷◆◇\-]/.test(i.text));
  };

  for (const [ri, row] of rows.entries()) {
    if (!hasHeaderLikeText(row.items)) continue;

    const sorted = [...row.items].sort((a, b) => a.x - b.x);
    const rightmost = sorted[sorted.length - 1];
    const xSpan = rightmost.x + rightmost.w - sorted[0].x;
    if (xSpan / pageSpan < 0.4) continue;

    const avgFs = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
    const hasLargeGap = sorted.some(
      (item, k) => k > 0 && item.x - (sorted[k - 1].x + sorted[k - 1].w) >= avgFs * 2.5
    );
    if (!hasLargeGap) continue;

    const columns = sorted.map((item) => ({ x: item.x, count: 0 }));
    let matchCount = 0;
    for (let j = ri + 1; j < rows.length && matchCount < MIN_ROWS + 2; j++) {
      if (countMatchedColumnsRange(rows[j], columns, sorted) >= MIN_COLS) matchCount++;
    }
    if (matchCount >= MIN_ROWS) return { columns, headerIdx: ri };
  }
  return null;
}
|
|
3854
|
-
/**
 * Folds wrapped continuation rows into the row above them.
 *
 * A row is treated as a continuation when it lies within 1.8x the average
 * font size of the previous kept row, has at most two items, and either
 * matches fewer than MIN_COLS columns or consists of a single item. The
 * merged row keeps the previous row's y and the items of both rows.
 *
 * @param {Array<{y:number,items:Array}>} rows rows in reading order
 * @param {Array} columns column descriptors passed to countMatchedColumns
 * @returns {Array<{y:number,items:Array}>} rows with continuations merged;
 *   the input array itself when it has at most one row
 */
function mergeMultiLineRows(rows, columns) {
  if (rows.length <= 1) return rows;

  let fontSizeTotal = 0;
  let fontSizeCount = 0;
  for (const row of rows) {
    for (const item of row.items) {
      fontSizeTotal += item.fontSize;
      fontSizeCount++;
    }
  }
  // 12 is the fallback when no row carries any item.
  const avgFontSize = fontSizeCount > 0 ? fontSizeTotal / fontSizeCount : 12;

  const result = [rows[0]];
  for (const curr of rows.slice(1)) {
    const lastIdx = result.length - 1;
    const prev = result[lastIdx];
    const yGap = Math.abs(prev.y - curr.y);
    const matchedCols = countMatchedColumns(curr, columns);
    const itemCount = curr.items.length;
    const isContinuation =
      yGap < avgFontSize * 1.8 && itemCount <= 2 && (matchedCols < MIN_COLS || itemCount === 1);
    if (isContinuation) {
      result[lastIdx] = { y: prev.y, items: [...prev.items, ...curr.items] };
    } else {
      result.push(curr);
    }
  }
  return result;
}
|
|
3875
|
-
/**
 * Groups text items into rows that share (approximately) the same baseline.
 *
 * Items are ordered by descending y, then ascending x. An item joins the
 * current row when its y is within Y_TOL of the y of the row's FIRST item;
 * otherwise it opens a new row.
 *
 * @param {Array<{x:number,y:number}>} items positioned text items
 * @returns {Array<{y:number,items:Array}>} rows ordered by descending y
 */
function groupByBaseline(items) {
  if (items.length === 0) return [];
  const ordered = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
  const rows = [];
  for (const item of ordered) {
    const open = rows[rows.length - 1];
    if (open !== undefined && Math.abs(item.y - open.y) <= Y_TOL) {
      open.items.push(item);
    } else {
      rows.push({ y: item.y, items: [item] });
    }
  }
  return rows;
}
|
|
3893
|
-
/**
 * Reports whether a row contains a horizontal gap wide enough to suggest
 * column separation.
 *
 * The gap threshold is the larger of (average font size * MIN_GAP_FACTOR)
 * and MIN_GAP_ABSOLUTE. Rows with fewer than two items never qualify, and
 * neither does a two-item row whose right-hand item has more than 20
 * characters of text.
 *
 * @param {{items: Array<{x: number, w: number, fontSize: number, text: string}>}} row
 * @returns {boolean} True when any neighbouring pair is at least the threshold apart.
 */
function hasSuspiciousGaps(row) {
  if (row.items.length < 2) return false;
  const byX = row.items.slice().sort((a, b) => a.x - b.x);
  if (byX.length === 2 && byX[1].text.length > 20) return false;

  let fontSum = 0;
  for (const item of byX) fontSum += item.fontSize;
  const threshold = Math.max(fontSum / byX.length * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);

  return byX.some((item, k) => {
    if (k === 0) return false;
    const left = byX[k - 1];
    return item.x - (left.x + left.w) >= threshold;
  });
}
|
|
3905
|
-
/**
 * Clusters the x coordinates of every item in `rows` into column candidates.
 *
 * Sorted x values are cut into runs wherever two neighbours differ by more
 * than COL_CLUSTER_TOL. Each run becomes `{ x: rounded mean, count }`. Runs
 * with fewer than max(2, floor(rows.length * MIN_COL_FILL_RATIO)) members
 * are dropped.
 *
 * @param {Array<{items: Array<{x: number}>}>} rows
 * @returns {Array<{x: number, count: number}>} Clusters sorted by ascending x.
 */
function extractColumnClusters(rows) {
  const xs = [];
  rows.forEach((row) => {
    row.items.forEach((item) => {
      xs.push(item.x);
    });
  });
  if (xs.length === 0) return [];
  xs.sort((a, b) => a - b);

  const clusters = [];
  let runSum = 0;
  let runLen = 0;
  const closeRun = () => {
    clusters.push({ x: Math.round(runSum / runLen), count: runLen });
    runSum = 0;
    runLen = 0;
  };
  xs.forEach((x, i) => {
    // A jump larger than the tolerance ends the current run.
    if (i > 0 && x - xs[i - 1] > COL_CLUSTER_TOL) closeRun();
    runSum += x;
    runLen += 1;
  });
  closeRun();

  const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
  const kept = clusters.filter((cluster) => cluster.count >= minCount);
  kept.sort((a, b) => a.x - b.x);
  return kept;
}
|
|
3925
|
-
/**
 * Splits rows into table regions using header-derived column ranges.
 *
 * A row "matches" when countMatchedColumnsRange reports at least MIN_COLS
 * columns. Matching rows extend the open region. A non-matching row is
 * still absorbed when a region is open and the row has at most two items
 * or no miss has been absorbed since the last match. Any other row closes
 * the region.
 *
 * When a region closes (or input ends), trailing non-matching rows are
 * trimmed and the region is kept only if at least MIN_ROWS rows remain.
 *
 * @param {Array<{items: Array}>} allRows Rows to scan, in order.
 * @param {Array} columns Forwarded to countMatchedColumnsRange.
 * @param {Array<{x: number, w: number}>} headerItems Header items defining the ranges.
 * @returns {Array<{rows: Array}>} Detected regions.
 */
function findTableRegionsByHeader(allRows, columns, headerItems) {
  const matches = (row) => countMatchedColumnsRange(row, columns, headerItems) >= MIN_COLS;

  const regions = [];
  let pending = [];
  let missStreak = 0;

  const closeRegion = () => {
    // Drop absorbed rows at the tail that never got followed by a match.
    while (pending.length > 0 && !matches(pending[pending.length - 1])) {
      pending.pop();
    }
    if (pending.length >= MIN_ROWS) regions.push({ rows: pending });
    pending = [];
    missStreak = 0;
  };

  for (const row of allRows) {
    if (matches(row)) {
      pending.push(row);
      missStreak = 0;
      continue;
    }
    const absorbable = pending.length > 0 && (row.items.length <= 2 || missStreak === 0);
    if (absorbable) {
      pending.push(row);
      missStreak += 1;
    } else {
      closeRegion();
    }
  }
  closeRegion();
  return regions;
}
|
|
3960
|
-
/**
 * Splits rows into table regions using column-cluster matching.
 *
 * Rows matching at least MIN_COLS columns extend the open region. A
 * single-item row is absorbed only when a region is already open. Any other
 * row closes the region, which is kept when it has at least MIN_ROWS rows.
 *
 * @param {Array<{items: Array}>} allRows Rows to scan, in order.
 * @param {Array<{x: number}>} columns Column clusters.
 * @returns {Array<{rows: Array}>} Detected regions.
 */
function findTableRegions(allRows, columns) {
  const regions = [];
  let pending = [];

  const closeRegion = () => {
    if (pending.length >= MIN_ROWS) regions.push({ rows: pending });
    pending = [];
  };

  for (const row of allRows) {
    const hits = countMatchedColumns(row, columns);
    if (hits >= MIN_COLS) {
      pending.push(row);
    } else if (row.items.length === 1) {
      // Lone items never open a region; they only ride along with one.
      if (pending.length > 0) pending.push(row);
    } else {
      closeRegion();
    }
  }
  closeRegion();
  return regions;
}
|
|
3983
|
-
/**
 * Counts how many distinct columns are hit by the items of a row.
 *
 * An item hits the first column whose x lies within COL_CLUSTER_TOL * 2 of
 * the item's x; each column is counted once.
 *
 * @param {{items: Array<{x: number}>}} row
 * @param {Array<{x: number}>} columns
 * @returns {number} Number of distinct columns hit.
 */
function countMatchedColumns(row, columns) {
  const reach = COL_CLUSTER_TOL * 2;
  const hit = new Set();
  for (const { x } of row.items) {
    const ci = columns.findIndex((column) => Math.abs(x - column.x) <= reach);
    if (ci !== -1) hit.add(ci);
  }
  return hit.size;
}
|
|
3995
|
-
/**
 * Counts how many header-defined column ranges are hit by the items of a row.
 *
 * Each header item owns the half-open range [left, right). Boundaries sit
 * midway between the right edge of one header item and the x of the next;
 * the first range starts at 0 and the last extends to Infinity.
 *
 * @param {{items: Array<{x: number}>}} row
 * @param {Array} columns Not used by this function.
 * @param {Array<{x: number, w: number}>} headerItems Header items in column order.
 * @returns {number} Number of distinct ranges containing at least one item x.
 */
function countMatchedColumnsRange(row, columns, headerItems) {
  const lastIdx = headerItems.length - 1;
  const bands = headerItems.map((header, ci) => {
    const before = headerItems[ci - 1];
    const after = headerItems[ci + 1];
    return {
      left: ci === 0 ? 0 : (before.x + before.w + header.x) / 2,
      right: ci === lastIdx ? Infinity : (header.x + header.w + after.x) / 2
    };
  });

  const hit = new Set();
  for (const { x } of row.items) {
    const ci = bands.findIndex((band) => x >= band.left && x < band.right);
    if (ci !== -1) hit.add(ci);
  }
  return hit.size;
}
|
|
4013
|
-
/**
 * Splits the items of one row into groups and assigns each group a column.
 *
 * Steps:
 *  1. Sort items by x and measure the whitespace between neighbours.
 *  2. Keep gaps that reach the threshold (12, or max(2.5 * median gap, 12)
 *     when the row has more than numCols + 1 items); of those, keep the
 *     numCols - 1 widest and cut the row there.
 *  3. Greedily pair groups with columns by ascending distance between the
 *     group's horizontal centre and the column x, one group per column.
 *  4. Groups left without a column go to their nearest column, which may
 *     already be taken.
 *
 * @param {Array<{x: number, w: number}>} items Items of a single row.
 * @param {Array<{x: number}>} columns Column clusters.
 * @param {number} numCols Number of columns to consider.
 * @returns {Array<{col: number, items: Array}>} Group-to-column assignments.
 */
function assignRowItems(items, columns, numCols) {
  if (items.length === 0) return [];
  const ordered = items.slice().sort((a, b) => a.x - b.x);
  const columnXs = columns.map((column) => column.x);

  // Whitespace between each item and the one to its left.
  const gaps = ordered.slice(1).map((item, k) => {
    const left = ordered[k];
    return { idx: k + 1, size: item.x - (left.x + left.w) };
  });
  const ascendingSizes = gaps.map((gap) => gap.size).sort((a, b) => a - b);
  const medianGap = ascendingSizes.length > 0 ? ascendingSizes[Math.floor(ascendingSizes.length / 2)] : 0;
  const gapThreshold = ordered.length <= numCols + 1 ? 12 : Math.max(medianGap * 2.5, 12);
  const cuts = gaps
    .filter((gap) => gap.size >= gapThreshold)
    .sort((a, b) => b.size - a.size)
    .slice(0, numCols - 1)
    .sort((a, b) => a.idx - b.idx);

  const groups = [];
  let from = 0;
  for (const { idx } of cuts) {
    groups.push(ordered.slice(from, idx));
    from = idx;
  }
  groups.push(ordered.slice(from));

  // Horizontal centre of a group: midpoint of its leftmost x and rightmost edge.
  const centreOf = (group) => {
    let lo = Infinity;
    let hi = -Infinity;
    for (const member of group) {
      if (member.x < lo) lo = member.x;
      const edge = member.x + member.w;
      if (edge > hi) hi = edge;
    }
    return (lo + hi) / 2;
  };
  const groupCentres = groups.map(centreOf);

  const candidates = [];
  groups.forEach((_, gi) => {
    for (let ci = 0; ci < numCols; ci++) {
      candidates.push({ gi, ci, dist: Math.abs(groupCentres[gi] - columnXs[ci]) });
    }
  });
  candidates.sort((a, b) => a.dist - b.dist);

  const result = [];
  const takenCols = new Set();
  const placedGroups = new Set();
  for (const { gi, ci } of candidates) {
    if (placedGroups.has(gi) || takenCols.has(ci)) continue;
    result.push({ col: ci, items: groups[gi] });
    placedGroups.add(gi);
    takenCols.add(ci);
  }

  // Leftover groups share the closest column even if it is already used.
  groups.forEach((group, gi) => {
    if (placedGroups.has(gi)) return;
    let bestCol = 0;
    let bestDist = Infinity;
    for (let ci = 0; ci < numCols; ci++) {
      const dist = Math.abs(groupCentres[gi] - columnXs[ci]);
      if (dist < bestDist) {
        bestDist = dist;
        bestCol = ci;
      }
    }
    result.push({ col: bestCol, items: group });
  });
  return result;
}
|
|
4071
|
-
/**
 * Builds a table from rows of text items and a set of column clusters.
 *
 * Pipeline:
 *  1. Place items: a single-item row becomes one cell spanning all columns;
 *     other rows are distributed with assignRowItems.
 *  2. Reject when more than half the rows are empty or any column is
 *     entirely empty.
 *  3. Bottom-up: a row whose only text is outside column 0 (and does not
 *     start with a bullet-like character) is appended to the nearest
 *     earlier row that has text.
 *  4. Top-down: a row with text only in the first and last columns absorbs
 *     a following row whose first column is empty.
 *  5. Drop rows left empty and reject when fewer than MIN_ROWS remain.
 *
 * @param {Array<{items: Array}>} rows Rows of the candidate region.
 * @param {Array<{x: number}>} columns Column clusters.
 * @param {number} pageNum Page number recorded in the bounding box.
 * @returns {{table: object, bbox: object, usedItems: Set}|null} The table,
 *   its bounding box and the items consumed, or null when rejected.
 */
function buildClusterTable(rows, columns, pageNum) {
  const numCols = columns.length;
  const numRows = rows.length;
  if (numRows < MIN_ROWS || numCols < MIN_COLS) return null;

  const hasText = (cell) => cell.text.trim();
  const clearRow = (row) => {
    for (let c = 0; c < numCols; c++) row[c].text = "";
  };
  const cells = Array.from({ length: numRows }, () => {
    const row = [];
    for (let c = 0; c < numCols; c++) row.push({ text: "", colSpan: 1, rowSpan: 1 });
    return row;
  });

  // 1. Place items into cells.
  const usedItems = new Set();
  for (let r = 0; r < numRows; r++) {
    const sourceItems = rows[r].items;
    if (sourceItems.length === 1 && numCols > 1) {
      const only = sourceItems[0];
      cells[r][0] = { text: only.text, colSpan: numCols, rowSpan: 1 };
      usedItems.add(only);
      continue;
    }
    for (const { col, items } of assignRowItems(sourceItems, columns, numCols)) {
      const joined = items.map((item) => item.text).join(" ");
      const cell = cells[r][col];
      cell.text = cell.text ? cell.text + " " + joined : joined;
      items.forEach((item) => usedItems.add(item));
    }
  }

  // 2. Sanity checks on emptiness.
  const emptyRows = cells.filter((row) => row.every((cell) => cell.text === "")).length;
  if (emptyRows > numRows * 0.5) return null;
  for (let c = 0; c < numCols; c++) {
    if (!cells.some((row) => row[c].text !== "")) return null;
  }

  // 3. Bottom-up: fold orphan continuation rows into the nearest row above.
  for (let r = numRows - 1; r >= 1; r--) {
    const orphan = cells[r];
    const filled = orphan.filter(hasText);
    if (filled.length !== 1) continue;
    if (orphan[0].text.trim() !== "") continue;
    const contentText = filled[0].text.trim() || "";
    if (/^[○●▶\-·]/.test(contentText)) continue;

    let pr = r - 1;
    while (pr >= 0 && !cells[pr].some(hasText)) pr--;
    if (pr < 0) continue;

    const target = cells[pr];
    for (let c = 0; c < numCols; c++) {
      const prev = target[c].text.trim();
      const curr = orphan[c].text.trim();
      if (curr) target[c].text = prev ? prev + " " + curr : curr;
    }
    clearRow(orphan);
  }

  // 4. Top-down: first+last-only rows absorb a following row lacking column 0.
  for (let r = 0; r < cells.length - 1; r++) {
    const row = cells[r];
    const hasCol0 = row[0].text.trim() !== "";
    const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
    const midEmpty = row.slice(1, numCols - 1).every((cell) => cell.text.trim() === "");
    if (!(hasCol0 && hasColLast && midEmpty)) continue;

    const next = cells[r + 1];
    if (next[0].text.trim() !== "" || !next.some(hasText)) continue;
    for (let c = 1; c < numCols; c++) {
      const curr = next[c].text.trim();
      if (!curr) continue;
      const own = row[c].text.trim();
      row[c].text = own ? own + " " + curr : curr;
    }
    clearRow(next);
  }

  // 5. Drop emptied rows and assemble the result.
  const filteredCells = cells.filter((row) => row.some(hasText));
  const finalRowCount = filteredCells.length;
  if (finalRowCount < MIN_ROWS) return null;

  const table = {
    rows: finalRowCount,
    cols: numCols,
    cells: filteredCells,
    hasHeader: finalRowCount > 1
  };

  let minX = Infinity;
  let minY = Infinity;
  let maxX = -Infinity;
  let maxY = -Infinity;
  for (const row of rows) {
    for (const item of row.items) {
      // Fall back to the font size when the item carries no positive height.
      const height = item.h > 0 ? item.h : item.fontSize;
      const right = item.x + item.w;
      const top = item.y + height;
      if (item.x < minX) minX = item.x;
      if (item.y < minY) minY = item.y;
      if (right > maxX) maxX = right;
      if (top > maxY) maxY = top;
    }
  }

  return {
    table,
    bbox: { page: pageNum, x: minX, y: minY, width: maxX - minX, height: maxY - minY },
    usedItems
  };
}
|
|
4162
|
-
|
|
4163
|
-
// src/pdf/polyfill.ts
|
|
4164
|
-
import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
|
|
4165
|
-
// Alias for the global object; the shims below are installed on it.
var g = globalThis;
// Install a minimal DOMMatrix stand-in only when the runtime has none.
if (typeof g.DOMMatrix === "undefined") {
  g.DOMMatrix = class DOMMatrix {
    // Six-number matrix state; presumably the 2D identity transform by default (confirm against the DOMMatrix spec).
    m = [1, 0, 0, 1, 0, 0];
    constructor(init) {
      // A truthy init replaces the default values as-is; no validation is performed.
      if (init) this.m = init;
    }
  };
}
// Install an empty Path2D placeholder only when the runtime has none.
if (typeof g.Path2D === "undefined") {
  g.Path2D = class Path2D {
  };
}
// Expose the imported worker module on the global object.
// NOTE(review): presumably lets pdf.js find its worker code without a worker file; confirm against pdf.js docs.
g.pdfjsWorker = pdfjsWorker;
|
|
4179
|
-
|
|
4180
|
-
// src/pdf/parser.ts
|
|
4181
|
-
import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
4182
|
-
// Worker source is set to an empty string.
// NOTE(review): presumably relies on the worker module exposed on globalThis by the polyfill; confirm.
GlobalWorkerOptions.workerSrc = "";
// Upper bound on the number of pages parsePdfDocument will visit.
var MAX_PAGES = 5e3;
// Upper bound for the running text-size estimate in parsePdfDocument (100 * 1024 * 1024).
var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
// Milliseconds loadPdfWithTimeout waits for the document to load before giving up.
var PDF_LOAD_TIMEOUT_MS = 3e4;
|
|
4186
|
-
/**
 * Loads a PDF document, giving up after PDF_LOAD_TIMEOUT_MS.
 *
 * @param {ArrayBuffer|ArrayLike<number>} buffer Raw PDF bytes; copied into a Uint8Array.
 * @returns {Promise<object>} The loaded pdf.js document.
 * @throws {KordocError} When loading does not finish before the timeout.
 *   Errors from the loading task itself are propagated unchanged.
 */
async function loadPdfWithTimeout(buffer) {
  const loadingTask = getDocument({
    data: new Uint8Array(buffer),
    useSystemFonts: true,
    disableFontFace: true,
    isEvalSupported: false
  });
  let timer;
  const timeout = new Promise((_, reject) => {
    timer = setTimeout(() => {
      // Reject before tearing down so a failing destroy() can never leave
      // the race unsettled.
      reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
      // Teardown is best-effort: swallow both a synchronous throw and an
      // asynchronous rejection instead of surfacing them as uncaught errors.
      Promise.resolve().then(() => loadingTask.destroy()).catch(() => {
      });
    }, PDF_LOAD_TIMEOUT_MS);
  });
  try {
    return await Promise.race([loadingTask.promise, timeout]);
  } finally {
    if (timer !== void 0) clearTimeout(timer);
  }
}
|
|
4208
|
-
/**
 * Parses a PDF buffer into blocks, markdown, metadata and warnings.
 *
 * Pages are visited in order (at most MAX_PAGES, optionally restricted by
 * `options.pages`). A page that fails to parse adds a PARTIAL_PARSE warning
 * and is skipped; a KordocError raised while handling a page aborts the run.
 *
 * When the average non-whitespace character count per page is below 10 the
 * document is treated as image based: if `options.ocr` is set, OCR is tried
 * and its blocks are returned; otherwise (or when OCR yields nothing or
 * fails) a KordocError flagged with `isImageBased: true` is thrown. An OCR
 * failure is attached to that error as `cause`.
 *
 * @param {ArrayBuffer|ArrayLike<number>} buffer Raw PDF bytes.
 * @param {object} [options] Reads `pages`, `onProgress`, `ocr` and
 *   `removeHeaderFooter`.
 * @returns {Promise<object>} `{ markdown, blocks, metadata, outline, warnings }`,
 *   or `{ markdown, blocks, metadata, warnings, isImageBased: true }` when
 *   the OCR path produced the result.
 * @throws {KordocError} For an empty document, when the text-size estimate
 *   exceeds MAX_TOTAL_TEXT, or for an image-based document without usable OCR.
 */
async function parsePdfDocument(buffer, options) {
  const doc = await loadPdfWithTimeout(buffer);
  try {
    const pageCount = doc.numPages;
    if (pageCount === 0) throw new KordocError("PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
    const metadata = { pageCount };
    await extractPdfMetadata(doc, metadata);
    const blocks = [];
    const warnings = [];
    let totalChars = 0;
    let totalTextBytes = 0;
    const effectivePageCount = Math.min(pageCount, MAX_PAGES);
    const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
    const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
    // Font-size histogram over all visible items; feeds heading detection.
    const fontSizeFreq = /* @__PURE__ */ new Map();
    const pageHeights = /* @__PURE__ */ new Map();
    let parsedPages = 0;
    for (let i = 1; i <= effectivePageCount; i++) {
      if (pageFilter && !pageFilter.has(i)) continue;
      try {
        const page = await doc.getPage(i);
        const tc = await page.getTextContent();
        const viewport = page.getViewport({ scale: 1 });
        pageHeights.set(i, viewport.height);
        const rawItems = tc.items;
        const items = normalizeItems(rawItems);
        const { visible, hiddenCount } = filterHiddenText(items, viewport.width, viewport.height);
        if (hiddenCount > 0) {
          warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
        }
        for (const item of visible) {
          if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
        }
        const opList = await page.getOperatorList();
        const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
        for (const b of pageBlocks) blocks.push(b);
        for (const b of pageBlocks) {
          const t = b.text || "";
          totalChars += t.replace(/\s/g, "").length;
          // Size estimate: two bytes per UTF-16 code unit.
          totalTextBytes += t.length * 2;
        }
        if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
        parsedPages++;
        options?.onProgress?.(parsedPages, totalTarget);
      } catch (pageErr) {
        // Domain errors (e.g. the size limit) abort; anything else only costs this page.
        if (pageErr instanceof KordocError) throw pageErr;
        warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
      }
    }
    // When no page parsed at all, fall back to the number of pages that were targeted.
    const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
    if (totalChars / Math.max(parsedPageCount, 1) < 10) {
      let ocrFailure;
      if (options?.ocr) {
        try {
          const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
          const ocrBlocks = await ocrPages2(doc, options.ocr, pageFilter, effectivePageCount);
          if (ocrBlocks.length > 0) {
            const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
            return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true };
          }
        } catch (ocrErr) {
          // OCR stays best-effort, but remember why it failed so the error
          // thrown below can report it instead of hiding it.
          ocrFailure = ocrErr;
        }
      }
      const imageErr = Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
      if (ocrFailure !== void 0) imageErr.cause = ocrFailure;
      throw imageErr;
    }
    if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
      const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
      // Splice from the back so earlier indices stay valid.
      // NOTE(review): assumes `removed` is sorted ascending — confirm in removeHeaderFooterBlocks.
      for (let ri = removed.length - 1; ri >= 0; ri--) {
        blocks.splice(removed[ri], 1);
      }
    }
    const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
    if (medianFontSize > 0) {
      detectHeadings(blocks, medianFontSize);
    }
    detectMarkerHeadings(blocks);
    const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
    const markdown = cleanPdfText(blocksToMarkdown(blocks));
    return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
  } finally {
    // Always release the document; a failing teardown must not mask the real outcome.
    await doc.destroy().catch(() => {
    });
  }
}
|
|
4291
|
-
/**
 * Copies selected entries of the PDF info dictionary onto `metadata`.
 *
 * Title, Author, Creator and Subject are stored trimmed (as title, author,
 * creator, description) when they are non-blank strings. Keywords are split
 * on commas/semicolons. CreationDate and ModDate go through parsePdfDate.
 * Any failure is ignored: metadata is optional.
 *
 * @param {{getMetadata: function(): Promise<object>}} doc Loaded document.
 * @param {object} metadata Target object, mutated in place.
 * @returns {Promise<void>}
 */
async function extractPdfMetadata(doc, metadata) {
  const plainFields = [
    ["Title", "title"],
    ["Author", "author"],
    ["Creator", "creator"],
    ["Subject", "description"]
  ];
  try {
    const info = (await doc.getMetadata())?.info;
    if (!info) return;

    for (const [pdfKey, metaKey] of plainFields) {
      const raw = info[pdfKey];
      if (typeof raw !== "string") continue;
      const trimmed = raw.trim();
      if (trimmed) metadata[metaKey] = trimmed;
    }

    const { Keywords, CreationDate, ModDate } = info;
    if (typeof Keywords === "string" && Keywords.trim()) {
      metadata.keywords = Keywords.split(/[,;]/).map((word) => word.trim()).filter(Boolean);
    }
    if (typeof CreationDate === "string") metadata.createdAt = parsePdfDate(CreationDate);
    if (typeof ModDate === "string") metadata.modifiedAt = parsePdfDate(ModDate);
  } catch {
    // Intentionally ignored: missing or broken metadata must not fail parsing.
  }
}
|
|
4308
|
-
/**
 * Converts a PDF date string ("D:YYYYMMDDHHmmSS...") into
 * "YYYY-MM-DDTHH:mm:SS".
 *
 * Only the year is required; missing month/day default to "01" and missing
 * time parts to "00". Anything after the seconds is not read.
 *
 * @param {string} dateStr PDF date string.
 * @returns {string|undefined} The formatted timestamp, or undefined when the
 *   string holds no "D:" followed by a four-digit year.
 */
function parsePdfDate(dateStr) {
  const parts = dateStr.match(/D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?/);
  if (!parts) return void 0;
  const pick = (index, fallback) => parts[index] === void 0 ? fallback : parts[index];
  const datePart = [parts[1], pick(2, "01"), pick(3, "01")].join("-");
  const timePart = [pick(4, "00"), pick(5, "00"), pick(6, "00")].join(":");
  return `${datePart}T${timePart}`;
}
|
|
4314
|
-
/**
 * Separates visible text items from hidden ones.
 *
 * An item counts as hidden when it is flagged `isHidden`, or when its x/y
 * lies outside the page by more than 10% of the larger page dimension.
 *
 * @param {Iterable<{x: number, y: number, isHidden?: boolean}>} items
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {{visible: Array, hiddenCount: number}} Visible items in input
 *   order and the number of items dropped.
 */
function filterHiddenText(items, pageWidth, pageHeight) {
  const slack = Math.max(pageWidth, pageHeight) * 0.1;
  const isOffPage = ({ x, y }) => x < -slack || x > pageWidth + slack || y < -slack || y > pageHeight + slack;

  const visible = [];
  let hiddenCount = 0;
  for (const item of items) {
    if (item.isHidden || isOffPage(item)) {
      hiddenCount += 1;
    } else {
      visible.push(item);
    }
  }
  return { visible, hiddenCount };
}
|
|
4331
|
-
/**
 * Finds the median font size from a size -> occurrence-count histogram.
 *
 * Sizes are walked in ascending order; the first size whose cumulative
 * count exceeds floor(total / 2) is returned.
 *
 * @param {Map<number, number>} freq Font size mapped to how often it occurs.
 * @returns {number} The median size, or 0 for an empty histogram.
 */
function computeMedianFontSizeFromFreq(freq) {
  if (freq.size === 0) return 0;

  let total = 0;
  freq.forEach((count) => {
    total += count;
  });
  const half = Math.floor(total / 2);

  const ascending = Array.from(freq).sort(([sizeA], [sizeB]) => sizeA - sizeB);
  let seen = 0;
  const hit = ascending.find(([, count]) => {
    seen += count;
    return seen > half;
  });
  // Fall back to the largest size if the threshold is never crossed.
  return hit ? hit[0] : ascending[ascending.length - 1][0];
}
|
|
4344
|
-
/**
 * Promotes paragraphs to headings based on font size relative to the median.
 *
 * Only paragraphs with text and a style font size are considered. Blank
 * text, text longer than 200 characters and digit-only text are skipped.
 * The size ratio is compared against HEADING_RATIO_H1/H2/H3 to pick level
 * 1-3. Promoted blocks are mutated in place and their text is passed
 * through collapseEvenSpacing.
 *
 * @param {Array<object>} blocks Blocks to inspect; mutated in place.
 * @param {number} medianFontSize Reference font size.
 * @returns {void}
 */
function detectHeadings(blocks, medianFontSize) {
  const levelFor = (ratio) => {
    if (ratio >= HEADING_RATIO_H1) return 1;
    if (ratio >= HEADING_RATIO_H2) return 2;
    if (ratio >= HEADING_RATIO_H3) return 3;
    return 0;
  };

  for (const block of blocks) {
    const eligible = block.type === "paragraph" && block.text && block.style?.fontSize;
    if (!eligible) continue;

    const trimmed = block.text.trim();
    const badLength = trimmed.length === 0 || trimmed.length > 200;
    if (badLength || /^\d+$/.test(trimmed)) continue;

    const level = levelFor(block.style.fontSize / medianFontSize);
    if (level === 0) continue;

    block.type = "heading";
    block.level = level;
    block.text = collapseEvenSpacing(trimmed);
  }
}
|
|
4362
|
-
/**
 * Removes letter-spacing blanks from text set with spaced-out characters.
 *
 * When the text has at least three space-separated tokens and 70% or more
 * of them are a single character, every space is removed. Otherwise only
 * runs of three or more single Hangul syllables (digits allowed after the
 * first) separated by single spaces are joined.
 *
 * @param {string} text
 * @returns {string} Text with the spacing collapsed.
 */
function collapseEvenSpacing(text) {
  const words = text.split(" ");
  let singles = 0;
  for (const word of words) {
    if (word.length === 1) singles += 1;
  }
  const mostlySingles = words.length >= 3 && singles / words.length >= 0.7;
  if (mostlySingles) return words.join("");

  const spacedRun = /(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g;
  return text.replace(spacedRun, (run) => run.split(" ").join(""));
}
|
|
4373
|
-
/**
 * Decides whether a detected table should be rendered as plain text.
 *
 * Small tables (at most 3x3) are demoted when at least 30% of their cells
 * are blank, or when their text contains a bullet-like marker or something
 * shaped like an angle-bracket tag. Beyond that, tables with more than 200
 * characters of text are always kept; the rest are demoted when they have
 * at most 3 rows and contain a marker, at most 2 rows and are more than
 * half blank, or a single row without a run of two or more digits.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {boolean} True when the table should be demoted.
 */
function shouldDemoteTable(table) {
  const filled = [];
  for (const row of table.cells) {
    for (const cell of row) {
      const trimmed = cell.text.trim();
      if (trimmed) filled.push(trimmed);
    }
  }
  const joined = filled.join(" ");
  const { rows, cols } = table;
  const capacity = rows * cols;
  const blanks = capacity - filled.length;

  const isSmall = rows <= 3 && cols <= 3;
  if (isSmall) {
    const sparse = blanks >= capacity * 0.3;
    if (sparse || /[□■◆○●▶ㅇ]/.test(joined) || /<[^>]+>/.test(joined)) return true;
  }
  if (joined.length > 200) return false;
  if (/[□■◆○●▶]/.test(joined) && rows <= 3) return true;
  if (rows <= 2 && blanks > capacity * 0.5) return true;
  return rows === 1 && !/\d{2,}/.test(joined);
}
|
|
4391
|
-
/**
 * Renders a table as plain text, one line per non-empty row.
 *
 * In a two-column table a row with both cells filled becomes
 * "left : right"; every other row is its non-blank cells joined by spaces.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {string} Lines joined with "\n" (empty string when no row has text).
 */
function demoteTableToText(table) {
  const lines = [];
  for (let r = 0; r < table.rows; r++) {
    const parts = [];
    for (const cell of table.cells[r]) {
      const trimmed = cell.text.trim();
      if (trimmed) parts.push(trimmed);
    }
    if (parts.length === 0) continue;
    const isPair = table.cols === 2 && parts.length === 2;
    lines.push(isPair ? `${parts[0]} : ${parts[1]}` : parts.join(" "));
  }
  return lines.join("\n");
}
|
|
4404
|
-
/**
 * Promotes marker-led and short stand-alone paragraphs to headings.
 *
 * - Text shorter than 50 characters that starts with one of □■◆◇▶ followed
 *   by a Hangul syllable becomes a level-4 heading.
 * - Text made of 2-6 Hangul syllables, on a block with a style font size,
 *   becomes a level-3 heading when the previous block is missing or is a
 *   table/heading/separator, or when the next block is missing, is a
 *   table/heading, or is a paragraph starting with one of □■◆○●.
 *
 * Blocks are mutated in place and processed in order, so an earlier
 * promotion is visible when its neighbour is examined.
 *
 * @param {Array<object>} blocks Blocks to inspect; mutated in place.
 * @returns {void}
 */
function detectMarkerHeadings(blocks) {
  const structuralBefore = (b) => !b || b.type === "table" || b.type === "heading" || b.type === "separator";
  const structuralAfter = (b) => {
    if (!b || b.type === "table" || b.type === "heading") return true;
    return b.type === "paragraph" && b.text && /^[□■◆○●]/.test(b.text.trim());
  };

  blocks.forEach((block, idx) => {
    if (block.type !== "paragraph" || !block.text) return;
    const label = block.text.trim();

    if (label.length < 50 && /^[□■◆◇▶]\s*[가-힣]/.test(label)) {
      block.type = "heading";
      block.level = 4;
      return;
    }

    const shortKoreanLabel = /^[가-힣]{2,6}$/.test(label) && block.style?.fontSize;
    if (!shortKoreanLabel) return;
    if (structuralBefore(blocks[idx - 1]) || structuralAfter(blocks[idx + 1])) {
      block.type = "heading";
      block.level = 3;
    }
  });
}
|
|
4426
|
-
// Recursion limit for xyCutOrder.
var MAX_XYCUT_DEPTH = 50;
/**
 * Orders items by recursive XY-cut: split at the widest vertical gap first,
 * otherwise at the widest horizontal gap, and recurse into both halves.
 *
 * A Y split sends items with y above the cut first. An X split sends items
 * whose horizontal centre is left of the cut first. Recursion stops for
 * groups of at most two items, at MAX_XYCUT_DEPTH, or when no usable split
 * exists.
 *
 * @param {Array<{x: number, y: number, w: number, h: number}>} items
 * @param {number} gapThreshold Minimum gap a split has to exceed.
 * @param {number} [depth=0] Current recursion depth.
 * @returns {Array<Array>} Groups of items in reading order.
 */
function xyCutOrder(items, gapThreshold, depth = 0) {
  if (items.length === 0) return [];
  if (items.length <= 2 || depth >= MAX_XYCUT_DEPTH) return [items];

  const region = computeRegion(items);
  const isUsable = (first, second) => first.length > 0 && second.length > 0 && first.length < items.length;
  const descend = (first, second) => [
    ...xyCutOrder(first, gapThreshold, depth + 1),
    ...xyCutOrder(second, gapThreshold, depth + 1)
  ];

  const yCut = findYSplit(items, region, gapThreshold);
  if (yCut !== null) {
    const above = items.filter((item) => item.y > yCut);
    const below = items.filter((item) => item.y <= yCut);
    if (isUsable(above, below)) return descend(above, below);
  }

  const xCut = findXSplit(items, region, gapThreshold);
  if (xCut !== null) {
    const centreX = (item) => item.x + item.w / 2;
    const leftSide = items.filter((item) => centreX(item) < xCut);
    const rightSide = items.filter((item) => centreX(item) >= xCut);
    if (isUsable(leftSide, rightSide)) return descend(leftSide, rightSide);
  }

  return [items];
}
|
|
4449
|
-
/**
 * Computes the bounding extents of a set of items.
 *
 * @param {Array<{x: number, y: number, w: number, h: number}>} items
 * @returns {{items: Array, minX: number, minY: number, maxX: number, maxY: number}}
 *   The same items plus min x/y and max x+w / y+h. For an empty input the
 *   minima are Infinity and the maxima -Infinity.
 */
function computeRegion(items) {
  const bounds = { minX: Infinity, minY: Infinity, maxX: -Infinity, maxY: -Infinity };
  for (const { x, y, w, h } of items) {
    const farX = x + w;
    const farY = y + h;
    if (x < bounds.minX) bounds.minX = x;
    if (y < bounds.minY) bounds.minY = y;
    if (farX > bounds.maxX) bounds.maxX = farX;
    if (farY > bounds.maxY) bounds.maxY = farY;
  }
  return { items, ...bounds };
}
|
|
4459
|
-
function findYSplit(items, _region, gapThreshold) {
|
|
4460
|
-
const sorted = [...items].sort((a, b) => b.y - a.y);
|
|
4461
|
-
let bestGap = gapThreshold;
|
|
4462
|
-
let bestSplit = null;
|
|
4463
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
4464
|
-
const prevBottom = sorted[i - 1].y - sorted[i - 1].h;
|
|
4465
|
-
const currTop = sorted[i].y;
|
|
4466
|
-
const gap = prevBottom - currTop;
|
|
4467
|
-
if (gap > bestGap) {
|
|
4468
|
-
bestGap = gap;
|
|
4469
|
-
bestSplit = (prevBottom + currTop) / 2;
|
|
4470
|
-
}
|
|
4471
|
-
}
|
|
4472
|
-
return bestSplit;
|
|
4473
|
-
}
|
|
4474
|
-
/**
 * Finds the widest horizontal gap between consecutive items (sorted by
 * ascending x) that exceeds the threshold.
 *
 * The gap between two neighbours is right.x - (left.x + left.w).
 *
 * @param {Array<{x: number, w: number}>} items
 * @param {object} _region Unused.
 * @param {number} gapThreshold Minimum gap a split has to exceed.
 * @returns {number|null} The x midway through the widest gap, or null.
 */
function findXSplit(items, _region, gapThreshold) {
  const leftToRight = items.slice().sort((a, b) => a.x - b.x);
  let widest = gapThreshold;
  let cut = null;
  leftToRight.forEach((right, k) => {
    if (k === 0) return;
    const left = leftToRight[k - 1];
    const leftEdge = left.x + left.w;
    const gap = right.x - leftEdge;
    if (gap > widest) {
      widest = gap;
      cut = (leftEdge + right.x) / 2;
    }
  });
  return cut;
}
|
|
4489
|
-
/**
 * Extracts the blocks of one page, using ruled lines when they form grids.
 *
 * Lines are taken from the operator list, passed through
 * filterPageBorderLines and preprocessLines, and assembled into table
 * grids. With at least one grid the grid-aware extractor is used;
 * otherwise the fallback extractor runs.
 *
 * @param {Array} items Visible text items of the page.
 * @param {number} pageNum Page number.
 * @param {{fnArray: Array, argsArray: Array}} opList Page operator list.
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {Array} Blocks of the page (empty when there are no items).
 */
function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeight) {
  if (items.length === 0) return [];

  const drawn = extractLines(opList.fnArray, opList.argsArray);
  const withoutBorders = filterPageBorderLines(drawn.horizontals, drawn.verticals, pageWidth, pageHeight);
  const { horizontals, verticals } = preprocessLines(withoutBorders.horizontals, withoutBorders.verticals);

  const grids = buildTableGrids(horizontals, verticals);
  return grids.length > 0
    ? extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals)
    : extractPageBlocksFallback(items, pageNum);
}
|
|
4500
|
-
function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
4501
|
-
const blocks = [];
|
|
4502
|
-
const usedItems = /* @__PURE__ */ new Set();
|
|
4503
|
-
const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
4504
|
-
for (const grid of sortedGrids) {
|
|
4505
|
-
const numGridRows = grid.rowYs.length - 1;
|
|
4506
|
-
const numGridCols = grid.colXs.length - 1;
|
|
4507
|
-
if (numGridRows === 1 && numGridCols >= 2) continue;
|
|
4508
|
-
const tableItems = [];
|
|
4509
|
-
const pad = 3;
|
|
4510
|
-
const gridW = grid.bbox.x2 - grid.bbox.x1;
|
|
4511
|
-
for (const item of items) {
|
|
4512
|
-
if (usedItems.has(item)) continue;
|
|
4513
|
-
if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
|
|
4514
|
-
if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
|
|
4515
|
-
if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
|
|
4516
|
-
tableItems.push(item);
|
|
4517
|
-
usedItems.add(item);
|
|
4518
|
-
}
|
|
4519
|
-
const cells = extractCells(grid, horizontals, verticals);
|
|
4520
|
-
if (cells.length === 0) continue;
|
|
4521
|
-
const textItems = tableItems.map((i) => ({
|
|
4522
|
-
text: i.text,
|
|
4523
|
-
x: i.x,
|
|
4524
|
-
y: i.y,
|
|
4525
|
-
w: i.w,
|
|
4526
|
-
h: i.h,
|
|
4527
|
-
fontSize: i.fontSize,
|
|
4528
|
-
fontName: i.fontName
|
|
4529
|
-
}));
|
|
4530
|
-
const cellTextMap = mapTextToCells(textItems, cells);
|
|
4531
|
-
const numRows = grid.rowYs.length - 1;
|
|
4532
|
-
const numCols = grid.colXs.length - 1;
|
|
4533
|
-
const irGrid = Array.from(
|
|
4534
|
-
{ length: numRows },
|
|
4535
|
-
() => Array.from({ length: numCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
|
|
4536
|
-
);
|
|
4537
|
-
for (const cell of cells) {
|
|
4538
|
-
const cellItems = cellTextMap.get(cell) || [];
|
|
4539
|
-
let text = cellTextToString(cellItems);
|
|
4540
|
-
text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
|
|
4541
|
-
text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
|
|
4542
|
-
irGrid[cell.row][cell.col] = {
|
|
4543
|
-
text,
|
|
4544
|
-
colSpan: cell.colSpan,
|
|
4545
|
-
rowSpan: cell.rowSpan
|
|
4546
|
-
};
|
|
4547
|
-
}
|
|
4548
|
-
const irTable = {
|
|
4549
|
-
rows: numRows,
|
|
4550
|
-
cols: numCols,
|
|
4551
|
-
cells: irGrid,
|
|
4552
|
-
hasHeader: numRows > 1
|
|
4553
|
-
};
|
|
4554
|
-
const hasContent = irGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
|
|
4555
|
-
if (!hasContent) continue;
|
|
4556
|
-
const tableBbox = {
|
|
4557
|
-
page: pageNum,
|
|
4558
|
-
x: grid.bbox.x1,
|
|
4559
|
-
y: grid.bbox.y1,
|
|
4560
|
-
width: grid.bbox.x2 - grid.bbox.x1,
|
|
4561
|
-
height: grid.bbox.y2 - grid.bbox.y1
|
|
4562
|
-
};
|
|
4563
|
-
if (shouldDemoteTable(irTable)) {
|
|
4564
|
-
const demoted = demoteTableToText(irTable);
|
|
4565
|
-
if (demoted) {
|
|
4566
|
-
const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
|
|
4567
|
-
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
|
|
4568
|
-
}
|
|
4569
|
-
continue;
|
|
4570
|
-
}
|
|
4571
|
-
blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
|
|
4572
|
-
}
|
|
4573
|
-
let remaining = items.filter((i) => !usedItems.has(i));
|
|
4574
|
-
if (remaining.length > 0) {
|
|
4575
|
-
remaining.sort((a, b) => b.y - a.y || a.x - b.x);
|
|
4576
|
-
const clusterItems = remaining.map((i) => ({
|
|
4577
|
-
text: i.text,
|
|
4578
|
-
x: i.x,
|
|
4579
|
-
y: i.y,
|
|
4580
|
-
w: i.w,
|
|
4581
|
-
h: i.h,
|
|
4582
|
-
fontSize: i.fontSize,
|
|
4583
|
-
fontName: i.fontName
|
|
4584
|
-
}));
|
|
4585
|
-
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4586
|
-
if (clusterResults.length > 0) {
|
|
4587
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4588
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4589
|
-
const usedClusterIndices = /* @__PURE__ */ new Set();
|
|
4590
|
-
for (const cr of clusterResults) {
|
|
4591
|
-
for (const ci of cr.usedItems) {
|
|
4592
|
-
const idx = ciToIdx.get(ci);
|
|
4593
|
-
if (idx !== void 0) usedClusterIndices.add(idx);
|
|
4594
|
-
}
|
|
4595
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4596
|
-
}
|
|
4597
|
-
remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
|
|
4598
|
-
}
|
|
4599
|
-
if (remaining.length > 0) {
|
|
4600
|
-
const allY = remaining.map((i) => i.y);
|
|
4601
|
-
const pageH = safeMax(allY) - safeMin(allY);
|
|
4602
|
-
const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
|
|
4603
|
-
const textBlocks = [];
|
|
4604
|
-
for (const group of groups) {
|
|
4605
|
-
if (group.length === 0) continue;
|
|
4606
|
-
const groupBlocks = extractPageBlocksFallback(group, pageNum);
|
|
4607
|
-
for (const b of groupBlocks) textBlocks.push(b);
|
|
4608
|
-
}
|
|
4609
|
-
const finalTextBlocks = detectListBlocks(textBlocks);
|
|
4610
|
-
for (const b of finalTextBlocks) blocks.push(b);
|
|
4611
|
-
}
|
|
4612
|
-
blocks.sort((a, b) => {
|
|
4613
|
-
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4614
|
-
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4615
|
-
return by - ay;
|
|
4616
|
-
});
|
|
4617
|
-
return mergeAdjacentTableBlocks(blocks);
|
|
4618
|
-
}
|
|
4619
|
-
return mergeAdjacentTableBlocks(blocks);
|
|
4620
|
-
}
|
|
4621
|
-
/**
 * Fuse runs of consecutive table blocks that share the same column count.
 *
 * The fused block keeps every property of the first block in the run
 * (including its bbox and hasHeader flag); only `table` is replaced by one
 * whose rows are the concatenation of both tables.
 *
 * @param {Array<object>} blocks - Blocks in output order.
 * @returns {Array<object>} The same array when it holds 0 or 1 blocks,
 *   otherwise a new array with adjacent compatible tables merged.
 */
function mergeAdjacentTableBlocks(blocks) {
  if (blocks.length <= 1) return blocks;

  const canFuse = (left, right) =>
    left.type === "table" &&
    right.type === "table" &&
    Boolean(left.table) &&
    Boolean(right.table) &&
    left.table.cols === right.table.cols;

  const [first, ...others] = blocks;
  const out = [first];
  for (const next of others) {
    const tail = out[out.length - 1];
    if (!canFuse(tail, next)) {
      out.push(next);
      continue;
    }
    // Replace the tail in place so a third table can be fused onto it too.
    out[out.length - 1] = {
      ...tail,
      table: {
        rows: tail.table.rows + next.table.rows,
        cols: tail.table.cols,
        cells: [...tail.table.cells, ...next.table.cells],
        hasHeader: tail.table.hasHeader
      }
    };
  }
  return out;
}
|
|
4641
|
-
/**
 * Build blocks for a set of text items without relying on drawn grid lines.
 *
 * Strategy, in order:
 *  1. Ask detectClusterTables() for tables; anything it did not consume is
 *     emitted line by line as paragraphs.
 *  2. Otherwise try detectColumns() on all lines; with 3+ columns the whole
 *     item set becomes ONE paragraph whose text comes from extractWithColumns().
 *  3. Otherwise split the items with xyCutOrder() and repeat the column check
 *     per group, falling back to one paragraph per text line.
 *
 * The result is finally passed through detectSpecialKoreanTables().
 *
 * @param {Array<object>} items - Text items (text, x, y, w, h, fontSize, fontName are read).
 * @param {number} pageNum - Page number stamped on every produced block.
 * @returns {Array<object>} Blocks for this item set; [] when `items` is empty.
 */
function extractPageBlocksFallback(items, pageNum) {
  if (items.length === 0) return [];
  const blocks = [];
  // Plain copies holding only the fields listed below, index-aligned with `items`.
  const clusterItems = items.map((i) => ({
    text: i.text,
    x: i.x,
    y: i.y,
    w: i.w,
    h: i.h,
    fontSize: i.fontSize,
    fontName: i.fontName
  }));
  const clusterResults = detectClusterTables(clusterItems, pageNum);
  if (clusterResults.length > 0) {
    // Map each copy back to its index so consumed items can be removed from `items`.
    const ciToIdx = /* @__PURE__ */ new Map();
    for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
    const usedIndices = /* @__PURE__ */ new Set();
    for (const cr of clusterResults) {
      for (const ci of cr.usedItems) {
        const idx = ciToIdx.get(ci);
        if (idx !== void 0) usedIndices.add(idx);
      }
      blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
    }
    // Items not consumed by any cluster table become one paragraph per text line.
    const remaining = items.filter((_, idx) => !usedIndices.has(idx));
    if (remaining.length > 0) {
      const yLines = groupByY(remaining);
      for (const line of yLines) {
        const text = mergeLineSimple(line);
        if (!text.trim()) continue;
        const bbox = computeBBox(line, pageNum);
        blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
      }
    }
    // Order by descending (y + height); blocks without a bbox sort as 0.
    blocks.sort((a, b) => {
      const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
      const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
      return by - ay;
    });
  } else {
    const allYLines = groupByY(items);
    const columns = detectColumns(allYLines);
    if (columns && columns.length >= 3) {
      // Column layout found over the whole set: emit a single paragraph.
      const tableText = extractWithColumns(allYLines, columns);
      const bbox = computeBBox(items, pageNum);
      blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
    } else {
      const allY = items.map((i) => i.y);
      // NOTE: despite the name this is the vertical spread of the items' y values,
      // not the physical page height.
      const pageHeight = safeMax(allY) - safeMin(allY);
      const gapThreshold = Math.max(15, pageHeight * 0.03);
      const orderedGroups = xyCutOrder(items, gapThreshold);
      for (const group of orderedGroups) {
        if (group.length === 0) continue;
        const yLines = groupByY(group);
        const groupColumns = detectColumns(yLines);
        if (groupColumns && groupColumns.length >= 3) {
          const tableText = extractWithColumns(yLines, groupColumns);
          const bbox = computeBBox(group, pageNum);
          blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(group) });
        } else {
          for (const line of yLines) {
            const text = mergeLineSimple(line);
            if (!text.trim()) continue;
            const bbox = computeBBox(line, pageNum);
            blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
          }
        }
      }
    }
  }
  return detectSpecialKoreanTables(blocks);
}
|
|
4713
|
-
/**
 * Compute the bounding box that encloses a set of text items.
 *
 * An item's vertical extent is `h` when positive, otherwise its `fontSize`
 * (some items report a zero height).
 *
 * @param {Array<{x:number,y:number,w:number,h:number,fontSize:number}>} items
 * @param {number} pageNum - Stored verbatim in the `page` field.
 * @returns {{page:number,x:number,y:number,width:number,height:number}}
 *   The enclosing box. For an empty `items` array a zero-sized box at the
 *   origin is returned instead of the Infinity/-Infinity values the bare
 *   min/max scan would produce (those turn into NaN inside the bbox sort
 *   comparators used elsewhere in this file).
 */
function computeBBox(items, pageNum) {
  if (items.length === 0) {
    return { page: pageNum, x: 0, y: 0, width: 0, height: 0 };
  }
  let minX = Infinity;
  let minY = Infinity;
  let maxX = -Infinity;
  let maxY = -Infinity;
  for (const item of items) {
    const effectiveH = item.h > 0 ? item.h : item.fontSize;
    if (item.x < minX) minX = item.x;
    if (item.y < minY) minY = item.y;
    if (item.x + item.w > maxX) maxX = item.x + item.w;
    if (item.y + effectiveH > maxY) maxY = item.y + effectiveH;
  }
  return { page: pageNum, x: minX, y: minY, width: maxX - minX, height: maxY - minY };
}
|
|
4724
|
-
/**
 * Pick the most frequent font size among `items` and a font name that goes
 * with it.
 *
 * Items whose fontSize is <= 0 are ignored. On a tie the size that reached
 * the winning count first (in iteration order) wins. The font name is taken
 * from the first item carrying the winning size; an empty name is reported
 * as undefined.
 *
 * @param {Array<{fontSize:number,fontName?:string}>} items
 * @returns {{fontSize:number,fontName:(string|undefined)}|undefined}
 *   undefined when there are no items or no usable font size.
 */
function dominantStyle(items) {
  if (items.length === 0) return void 0;

  const tally = /* @__PURE__ */ new Map();
  let winner = { size: 0, hits: 0 };
  for (const { fontSize } of items) {
    if (fontSize <= 0) continue;
    const hits = (tally.get(fontSize) || 0) + 1;
    tally.set(fontSize, hits);
    if (hits > winner.hits) winner = { size: fontSize, hits };
  }

  if (winner.size === 0) return void 0;
  const representative = items.find((candidate) => candidate.fontSize === winner.size);
  return { fontSize: winner.size, fontName: representative?.fontName || void 0 };
}
|
|
4741
|
-
/**
 * Convert raw text items into normalized, de-duplicated layout items.
 *
 * Reads `str`, `transform`, `width`, `height` and `fontName` from each raw
 * item (presumably pdf.js text-content items — confirm against the caller).
 *
 * @param {Array<object>} rawItems
 * @returns {Array<object>} Items with text, x, y, w, h, fontSize, fontName,
 *   isHidden and (optionally) hasSpaceBefore, sorted by descending y, then
 *   ascending x.
 */
function normalizeItems(rawItems) {
  const items = [];
  // Positions of whitespace-only items; used later to flag hasSpaceBefore.
  const spacePositions = [];
  for (const i of rawItems) {
    if (typeof i.str !== "string") continue;
    const x = Math.round(i.transform[4]);
    const y = Math.round(i.transform[5]);
    if (!i.str.trim()) {
      spacePositions.push({ x, y });
      continue;
    }
    // Font size is approximated by the larger of the two absolute scale components.
    const scaleY = Math.abs(i.transform[3]);
    const scaleX = Math.abs(i.transform[0]);
    const fontSize = Math.round(Math.max(scaleY, scaleX));
    const w = Math.round(i.width);
    const h = Math.round(i.height);
    // `&&` binds tighter than `||`: hidden = zero font size, OR (zero width AND non-blank text).
    const isHidden = fontSize === 0 || i.width === 0 && i.str.trim().length > 0;
    let text = i.str.trim();
    // Strings made only of digits, whitespace and - ( ) . · , ☎ that contain a digit
    // and a space get all spaces removed.
    if (/^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text)) {
      text = text.replace(/ /g, "");
    }
    // Letter-spaced text ("가 나 다") is exploded into one item per character.
    const split = splitEvenSpacedItem(text, x, w, fontSize);
    if (split) {
      for (const s of split) {
        items.push({ text: s.text, x: s.x, y, w: s.w, h, fontSize, fontName: i.fontName || "", isHidden });
      }
    } else {
      items.push({ text, x, y, w, h, fontSize, fontName: i.fontName || "", isHidden });
    }
  }
  // In-place sort: descending y first, then ascending x.
  const sorted = items.sort((a, b) => b.y - a.y || a.x - b.x);
  // Drop items that repeat the same text within 3 units in both x and y of an
  // already kept item.
  const deduped = [];
  for (let i = 0; i < sorted.length; i++) {
    let isDup = false;
    for (let j = deduped.length - 1; j >= 0; j--) {
      const prev = deduped[j];
      // Kept items are in descending-y order, so once one is more than 3 above
      // the candidate, all earlier ones are too.
      if (prev.y - sorted[i].y > 3) break;
      if (Math.abs(prev.y - sorted[i].y) <= 3 && prev.text === sorted[i].text && Math.abs(prev.x - sorted[i].x) <= 3) {
        isDup = true;
        break;
      }
    }
    if (!isDup) deduped.push(sorted[i]);
  }
  if (spacePositions.length > 0) {
    for (const item of deduped) {
      for (const sp of spacePositions) {
        if (Math.abs(sp.y - item.y) <= 3) {
          // A whitespace item on the same line that starts 0..20 units to the left
          // of this item marks it as preceded by a space.
          const dist = item.x - sp.x;
          if (dist >= 0 && dist <= 20) {
            item.hasSpaceBefore = true;
            break;
          }
        }
      }
    }
  }
  return deduped;
}
|
|
4800
|
-
/**
 * Split a letter-spaced string such as "가 나 다" into one entry per character.
 *
 * Only strings consisting of three or more single Hangul syllables or digits
 * separated by single spaces qualify. The item width is divided evenly among
 * the characters; when a slot would be wider than twice the font size the
 * string is left alone.
 *
 * @param {string} text - Trimmed item text.
 * @param {number} itemX - Left edge of the item.
 * @param {number} itemW - Width of the item.
 * @param {number} fontSize - Font size of the item.
 * @returns {Array<{text:string,x:number,w:number}>|null} Per-character
 *   pieces, or null when the text does not qualify.
 */
function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
  const looksLetterSpaced = /^[가-힣\d](?: [가-힣\d]){2,}$/.test(text);
  if (!looksLetterSpaced) return null;

  const glyphs = text.split(" ");
  if (glyphs.length < 3) return null;

  const slotWidth = itemW / glyphs.length;
  if (slotWidth > fontSize * 2) return null;

  // The actual glyph is narrower than the slot it sits in.
  const glyphWidth = Math.round(slotWidth * 0.8);
  const pieces = [];
  for (let k = 0; k < glyphs.length; k += 1) {
    pieces.push({ text: glyphs[k], x: Math.round(itemX + k * slotWidth), w: glyphWidth });
  }
  return pieces;
}
|
|
4813
|
-
/**
 * Partition items, in the order given, into text lines by their y value.
 *
 * A new line starts whenever an item's y differs by more than 3 from the y
 * of the FIRST item of the current line. Items are not re-sorted.
 *
 * @param {Array<{y:number}>} items
 * @returns {Array<Array<object>>} Lines in encounter order; [] for no items.
 */
function groupByY(items) {
  const lines = [];
  let anchorY;
  for (const item of items) {
    const open = lines[lines.length - 1];
    if (open === undefined || Math.abs(item.y - anchorY) > 3) {
      lines.push([item]);
      anchorY = item.y;
    } else {
      open.push(item);
    }
  }
  return lines;
}
|
|
4829
|
-
/**
 * Decide whether a line looks like short fragments laid out close together
 * rather than cells sitting in separate columns.
 *
 * True when the line has at least four items, the largest horizontal gap
 * between neighbouring items (sorted by x) is below 40, and the average
 * text length per item is below 5 characters.
 *
 * @param {Array<{x:number,w:number,text:string}>} items - Items of one line.
 * @returns {boolean}
 */
function isProseSpread(items) {
  if (items.length < 4) return false;

  const leftToRight = [...items].sort((a, b) => a.x - b.x);
  const gaps = leftToRight
    .slice(1)
    .map((item, k) => item.x - (leftToRight[k].x + leftToRight[k].w));

  const widestGap = safeMax(gaps);
  let totalChars = 0;
  for (const item of items) totalChars += item.text.length;
  const meanChars = totalChars / items.length;

  return widestGap < 40 && meanChars < 5;
}
|
|
4840
|
-
/**
 * Infer column start positions from the left edges (x) of text items.
 *
 * @param {Array<Array<object>>} yLines - Text lines, each an array of items.
 * @returns {Array<number>|null} Three or more ascending column x positions,
 *   or null when no convincing column layout is found.
 */
function detectColumns(yLines) {
  const allItems = yLines.flat();
  if (allItems.length === 0) return null;
  // Horizontal extent covered by the items (not the physical page width).
  const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
  if (pageWidth < 100) return null;
  // First line of at most two items containing "비고" (U+BE44 U+ACE0);
  // only the lines before it take part in column detection.
  let bigoLineIdx = -1;
  for (let i = 0; i < yLines.length; i++) {
    if (yLines[i].length <= 2 && yLines[i].some((item) => item.text === "\uBE44\uACE0")) {
      bigoLineIdx = i;
      break;
    }
  }
  const tableYLines = bigoLineIdx >= 0 ? yLines.slice(0, bigoLineIdx) : yLines;
  // Greedy 1-D clustering of item.x: an item joins the first cluster whose
  // running-average center is within CLUSTER_TOL.
  const CLUSTER_TOL = 22;
  const xClusters = [];
  for (const line of tableYLines) {
    if (isProseSpread(line)) continue;
    for (const item of line) {
      let found = false;
      for (const c of xClusters) {
        if (Math.abs(item.x - c.center) <= CLUSTER_TOL) {
          c.center = Math.round((c.center * c.count + item.x) / (c.count + 1));
          c.minX = Math.min(c.minX, item.x);
          c.count++;
          found = true;
          break;
        }
      }
      if (!found) {
        xClusters.push({ center: item.x, count: 1, minX: item.x });
      }
    }
  }
  // Keep clusters with at least 3 members, ordered left to right by their smallest x.
  const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
  if (peaks.length < 3) return null;
  // Fold a peak into the previous one when their minX values are less than MERGE_TOL apart.
  // Note: merged[0] is peaks[0] itself (not a copy), so it is mutated in place.
  const MERGE_TOL = 40;
  const merged = [peaks[0]];
  for (let i = 1; i < peaks.length; i++) {
    const prev = merged[merged.length - 1];
    if (peaks[i].minX - prev.minX < MERGE_TOL) {
      if (peaks[i].count > prev.count) {
        prev.center = peaks[i].center;
      }
      prev.count += peaks[i].count;
      prev.minX = Math.min(prev.minX, peaks[i].minX);
    } else {
      merged.push({ ...peaks[i] });
    }
  }
  const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
  if (rawColumns.length < 3) return null;
  // Drop any column that starts less than MIN_DETECT_COL_WIDTH after the last kept one.
  const MIN_DETECT_COL_WIDTH = 30;
  const columns = [rawColumns[0]];
  for (let i = 1; i < rawColumns.length; i++) {
    if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
    columns.push(rawColumns[i]);
  }
  return columns.length >= 3 ? columns : null;
}
|
|
4899
|
-
/**
 * Map an x position to a column index.
 *
 * Scans the column starts from right to left and returns the first column
 * whose start, relaxed by 10 units to the left, is at or before `x`.
 * Positions left of every column fall back to column 0.
 *
 * @param {number} x - Left edge of an item.
 * @param {Array<number>} columns - Ascending column start positions.
 * @returns {number} Zero-based column index.
 */
function findColumn(x, columns) {
  let idx = columns.length;
  while (idx-- > 0) {
    if (columns[idx] - 10 <= x) return idx;
  }
  return 0;
}
|
|
4905
|
-
/**
 * Render text lines as text, turning the column-aligned part into tables.
 *
 * Layout of the output (joined with "\n"):
 *  - lines before the table start, as plain merged lines;
 *  - the table region, where runs of in-range, non-prose lines are rendered
 *    by buildGridTable() and all other lines as plain merged lines;
 *  - if a "비고" line exists: an empty line, then that line and everything
 *    after it as plain merged lines.
 *
 * @param {Array<Array<object>>} yLines - Text lines, each an array of items.
 * @param {Array<number>} columns - Ascending column start positions.
 * @returns {string}
 */
function extractWithColumns(yLines, columns) {
  const result = [];
  const colMin = columns[0];
  const colMax = columns[columns.length - 1];
  // First line of at most two items containing "비고" (U+BE44 U+ACE0).
  let bigoIdx = -1;
  for (let i = 0; i < yLines.length; i++) {
    if (yLines[i].length <= 2 && yLines[i].some((item) => item.text === "\uBE44\uACE0")) {
      bigoIdx = i;
      break;
    }
  }
  // The table starts at the first line (before the "비고" line) whose items
  // land in at least three distinct columns.
  let tableStart = -1;
  for (let i = 0; i < (bigoIdx >= 0 ? bigoIdx : yLines.length); i++) {
    const usedCols = new Set(yLines[i].map((item) => findColumn(item.x, columns)));
    if (usedCols.size >= 3) {
      tableStart = i;
      break;
    }
  }
  const tableEnd = bigoIdx >= 0 ? bigoIdx : yLines.length;
  // Everything before the table (or up to tableEnd when no table start exists) is plain text.
  for (let i = 0; i < (tableStart >= 0 ? tableStart : tableEnd); i++) {
    result.push(mergeLineSimple(yLines[i]));
  }
  if (tableStart >= 0) {
    const tableLines = yLines.slice(tableStart, tableEnd);
    const gridLines = [];
    for (const line of tableLines) {
      // A line qualifies if any item starts within [colMin - 20, colMax + 200].
      const inRange = line.some(
        (item) => item.x >= colMin - 20 && item.x <= colMax + 200
      );
      if (inRange && !isProseSpread(line)) {
        gridLines.push(line);
      } else {
        if (gridLines.length > 0) {
          // splice(0) empties gridLines and hands the accumulated run to the builder.
          result.push(buildGridTable(gridLines.splice(0), columns));
        }
        result.push(mergeLineSimple(line));
      }
    }
    if (gridLines.length > 0) {
      result.push(buildGridTable(gridLines, columns));
    }
  }
  if (bigoIdx >= 0) {
    result.push("");
    for (let i = bigoIdx; i < yLines.length; i++) {
      result.push(mergeLineSimple(yLines[i]));
    }
  }
  return result.join("\n");
}
|
|
4956
|
-
/**
 * Turn column-aligned text lines into a Markdown table, or into plain text
 * when the result would be too sparse to be a convincing table.
 *
 * @param {Array<Array<object>>} lines - Text lines, each an array of items.
 * @param {Array<number>} columns - Ascending column start positions.
 * @returns {string} Markdown table text, or space-joined rows separated by "\n".
 */
function buildGridTable(lines, columns) {
  const numCols = columns.length;
  // One string per column for every physical line; items sharing a column are space-joined.
  const yRows = lines.map((items) => {
    const row = Array(numCols).fill("");
    for (const item of items) {
      const col = findColumn(item.x, columns);
      row[col] = row[col] ? row[col] + " " + item.text : item.text;
    }
    return row;
  });
  // Columns from this index on are treated as "data" columns by the heuristics below.
  const dataColStart = Math.max(2, Math.floor(numCols / 2));
  // Fold continuation lines into the logical row above them.
  const merged = [];
  for (const row of yRows) {
    if (row.every((c) => c === "")) continue;
    if (merged.length === 0) {
      merged.push([...row]);
      continue;
    }
    const prev = merged[merged.length - 1];
    const filledCols = row.map((c, i) => c ? i : -1).filter((i) => i >= 0);
    const filledCount = filledCols.length;
    let isNewRow = false;
    // Rule 1: a first cell of 3+ characters starts a new row.
    if (row[0] && row[0].length >= 3) {
      isNewRow = true;
    }
    // Rule 2: any content in the second column starts a new row.
    if (!isNewRow && numCols > 1 && row[1]) {
      isNewRow = true;
    }
    // Rule 3: data in the data columns of both this line and the previous row starts a new row.
    if (!isNewRow) {
      const hasData = row.slice(dataColStart).some((c) => c !== "");
      const prevHasData = prev.slice(dataColStart).some((c) => c !== "");
      if (hasData && prevHasData) {
        isNewRow = true;
      }
    }
    // Override: a line whose only content is a first cell of at most 2 characters is a continuation.
    if (isNewRow && filledCount === 1 && row[0] && row[0].length <= 2) {
      isNewRow = false;
    }
    if (isNewRow) {
      merged.push([...row]);
    } else {
      for (let c = 0; c < numCols; c++) {
        if (row[c]) {
          prev[c] = prev[c] ? prev[c] + " " + row[c] : row[c];
        }
      }
    }
  }
  if (merged.length < 2) {
    return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
  }
  // Header rows are the leading rows with no digit in any data column.
  let headerEnd = 0;
  for (let r = 0; r < merged.length; r++) {
    const hasDataValues = merged[r].slice(dataColStart).some((c) => c && /\d/.test(c));
    if (hasDataValues) break;
    headerEnd = r + 1;
  }
  // Collapse a multi-line header into a single header row.
  if (headerEnd > 1) {
    const headerRow = Array(numCols).fill("");
    for (let r = 0; r < headerEnd; r++) {
      for (let c = 0; c < numCols; c++) {
        if (merged[r][c]) {
          headerRow[c] = headerRow[c] ? headerRow[c] + " " + merged[r][c] : merged[r][c];
        }
      }
    }
    merged.splice(0, headerEnd, headerRow);
  }
  for (const row of merged) {
    for (let c = 0; c < row.length; c++) {
      if (row[c]) row[c] = collapseEvenSpacing(row[c]);
    }
  }
  // Fall back to plain text when under 35% of the cells are filled, fewer than
  // two rows remain, or there are at most 3 rows with 7 or more columns.
  const totalCells = merged.length * numCols;
  const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
  if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
    return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
  }
  // Markdown output: header row, separator row, then the data rows.
  // NOTE(review): cell text is inserted as-is; a "|" inside a cell is not escaped.
  const md = [];
  md.push("| " + merged[0].join(" | ") + " |");
  md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
  for (let r = 1; r < merged.length; r++) {
    md.push("| " + merged[r].join(" | ") + " |");
  }
  return md.join("\n");
}
|
|
5042
|
-
/**
 * Join the items of one text line into a string, deciding per neighbouring
 * pair whether to insert a space. The checks run in a fixed priority order;
 * the first one that applies wins.
 *
 * @param {Array<object>} items - Items of one line (text, x, w, fontSize, hasSpaceBefore are read).
 * @returns {string} The merged text; "" for an empty line.
 */
function mergeLineSimple(items) {
  if (items.length <= 1) return items[0]?.text || "";
  const sorted = [...items].sort((a, b) => a.x - b.x);
  const isEvenSpaced = detectEvenSpacedItems(sorted);
  let result = sorted[0].text;
  for (let i = 1; i < sorted.length; i++) {
    // Horizontal distance from the previous item's right edge to this item's left edge.
    const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
    const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
    const tabThreshold = Math.max(avgFs * 2, 30);
    // 1. A very wide gap always yields a space.
    if (gap > tabThreshold) {
      result += " ";
      result += sorted[i].text;
      continue;
    }
    // 2. Items flagged by detectEvenSpacedItems() are glued together without a space.
    if (isEvenSpaced[i]) {
      result += sorted[i].text;
      continue;
    }
    // 3. An explicit whitespace marker before the item, plus a non-trivial gap, yields a space.
    if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
      result += " ";
      result += sorted[i].text;
      continue;
    }
    // 4. A bullet-like glyph followed by Hangul gets a space when the gap exceeds 1.
    if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
      result += " ";
      result += sorted[i].text;
      continue;
    }
    // 5. Gap thresholds: the two empty branches intentionally add no space —
    //    tiny gaps, and small gaps next to Hangul. Otherwise any gap over 3 gets one.
    if (gap < avgFs * 0.15) {
    } else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
    } else if (gap > 3) result += " ";
    result += sorted[i].text;
  }
  return result;
}
|
|
5077
|
-
/**
 * Post-process extracted text: strip page-number artefacts, join wrapped
 * lines, normalise letter-spacing and collapse blank-line runs.
 *
 * @param {string} text - Raw extracted text.
 * @returns {string} Cleaned, trimmed text.
 */
function cleanPdfText(text) {
  // Page-number artefacts, removed in this order:
  //   a leading line of 1-4 digits, "- 3 -" style lines, "3 / 10" style lines,
  //   lone 1-4 digit lines in the middle or at the end, and headings that are only a number.
  const withoutPageNumbers = text
    .replace(/^\d{1,4}\n/, "")
    .replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "")
    .replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "")
    .replace(/\n\d{1,4}\n/g, "\n")
    .replace(/\n\d{1,4}$/, "")
    .replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "");

  const rejoined = mergeKoreanLines(withoutPageNumbers);

  // Every line except Markdown table separator rows goes through collapseEvenSpacing().
  const respaced = rejoined.replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line));

  // "□ 가 나" -> "□ 가나": glue the first two Hangul syllables after a bullet glyph.
  const bulletsTightened = respaced.replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3");

  return bulletsTightened.replace(/\n{3,}/g, "\n\n").trim();
}
|
|
5082
|
-
/**
 * Report whether a line (ignoring leading whitespace) begins with a list or
 * clause marker.
 *
 * Recognised forms: a Hangul syllable/consonant followed by "." or ")",
 * digits followed by "." or ")", a parenthesised Hangul/digit label, a bullet
 * glyph followed by whitespace, and "제<digits>" followed by one of 조/항/호/장/절.
 *
 * @param {string} line
 * @returns {boolean}
 */
function startsWithMarker(line) {
  const head = line.trimStart();
  const markerPatterns = [
    /^[가-힣ㄱ-ㅎ][.)]/,
    /^\d+[.)]/,
    /^\([가-힣ㄱ-ㅎ\d]+\)/,
    /^[○●※▶▷◆◇■□★☆\-·]\s/,
    /^제\d+[조항호장절]/
  ];
  return markerPatterns.some((pattern) => pattern.test(head));
}
|
|
5086
|
-
/**
 * Report whether a trimmed line is a short clause heading: "제<digits>"
 * plus one of 조/항/호/장/절, an optional parenthesised title, and at most
 * seven further whitespace-separated words.
 *
 * @param {string} line
 * @returns {boolean}
 */
function isStandaloneHeader(line) {
  const candidate = line.trim();
  const headerPattern = /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/;
  return headerPattern.test(candidate);
}
|
|
5089
|
-
/**
 * Re-tag paragraph blocks that begin with a list marker as list blocks.
 *
 * "<digits>. " marks an ordered item; a bullet glyph (○ ● · ※ ▶ ▷ ◆ ◇ -)
 * followed by whitespace marks an unordered item. The marker is looked for
 * in the trimmed text, but the block keeps its original text. All other
 * blocks are passed through untouched.
 *
 * @param {Array<object>} blocks
 * @returns {Array<object>} New array, same length and order as `blocks`.
 */
function detectListBlocks(blocks) {
  const ORDERED_MARKER = /^\d+\.\s/;
  const UNORDERED_MARKER = /^[○●·※▶▷◆◇\-]\s/;

  const listTypeOf = (block) => {
    if (block.type !== "paragraph" || !block.text) return null;
    const head = block.text.trim();
    if (ORDERED_MARKER.test(head)) return "ordered";
    if (UNORDERED_MARKER.test(head)) return "unordered";
    return null;
  };

  const out = [];
  for (const block of blocks) {
    const listType = listTypeOf(block);
    out.push(listType ? { ...block, type: "list", listType, text: block.text } : block);
  }
  return out;
}
|
|
5108
|
-
// A line that opens a key/value run: one of the listed Korean labels (optionally
// parenthesised) followed by a colon or whitespace.
var KOREAN_TABLE_HEADER_RE = /^\(?(구분|항목|종류|분류|유형|대상|내용|기간|금액|비율|방법|절차|요건|조건|근거|목적|범위|기준)\)?[:\s]/;
// Colons that are not key/value separators: clock times, URLs, digit:digit.
var KV_FALSE_POSITIVE_RE = /\d{1,2}:\d{2}|:\/\/|\d+:\d+/;
/**
 * Convert runs of consecutive "key: value" paragraphs into two-column tables.
 *
 * A run starts at a paragraph matching KOREAN_TABLE_HEADER_RE and continues
 * with paragraphs of the form "<2-8 Hangul syllables>: value" that contain
 * no parentheses and no time/URL-like colon. Runs of fewer than two lines
 * are emitted unchanged. The generated table block takes pageNumber and bbox
 * from the first paragraph of the run.
 *
 * @param {Array<object>} blocks
 * @returns {Array<object>} Blocks with qualifying runs replaced by table blocks.
 */
function detectSpecialKoreanTables(blocks) {
  const output = [];
  let pending = [];

  const makeCell = (text, colSpan) => ({ text, colSpan, rowSpan: 1 });
  const toRow = (kv) =>
    kv.value
      ? [makeCell(kv.key, 1), makeCell(kv.value, 1)]
      : [makeCell(kv.key, 2), makeCell("", 1)];

  // Emit whatever is pending: a table for 2+ lines, the original blocks otherwise.
  const flush = () => {
    const run = pending;
    pending = [];
    if (run.length < 2) {
      for (const kv of run) output.push(kv.block);
      return;
    }
    const cells = run.map(toRow);
    const anchor = run[0].block;
    output.push({
      type: "table",
      table: { rows: cells.length, cols: 2, cells, hasHeader: true },
      pageNumber: anchor.pageNumber,
      bbox: anchor.bbox
    });
  };

  const splitAt = (text, at, block) => ({
    key: text.slice(0, at).trim(),
    value: text.slice(at + 1).trim(),
    block
  });

  // Header lines split on the first colon, else on the first whitespace.
  const parseHeaderLine = (text, block) => {
    const colonAt = text.indexOf(":");
    if (colonAt >= 0) return splitAt(text, colonAt, block);
    const spaceAt = text.search(/\s/);
    return spaceAt > 0 ? splitAt(text, spaceAt, block) : { key: text, value: "", block };
  };

  // Follow-up lines must look like "<Hangul key>: value".
  const parseFollowUpLine = (text, block) => {
    if (!text.includes(":")) return null;
    if (KV_FALSE_POSITIVE_RE.test(text) || text.includes("(") || text.includes(")")) return null;
    const colonAt = text.indexOf(":");
    const key = text.slice(0, colonAt).trim();
    const keyOk = /^[가-힣]+$/.test(key) && key.length >= 2 && key.length <= 8;
    return keyOk ? { key, value: text.slice(colonAt + 1).trim(), block } : null;
  };

  for (const block of blocks) {
    let entry = null;
    if (block.type === "paragraph" && block.text) {
      const text = block.text.trim();
      if (KOREAN_TABLE_HEADER_RE.test(text)) {
        entry = parseHeaderLine(text, block);
      } else if (pending.length > 0) {
        entry = parseFollowUpLine(text, block);
      }
    }
    if (entry) {
      pending.push(entry);
    } else {
      flush();
      output.push(block);
    }
  }
  flush();
  return output;
}
|
|
5195
|
-
/**
 * Find blocks that are running headers or footers: text that sits in the
 * outer 10% band at the top or bottom of a page and whose digit-normalised
 * form recurs on at least three different pages.
 *
 * A pattern is counted once per page and per band. Several matching lines on
 * the same page (for example a row of numbers near the page edge) therefore
 * no longer qualify as a repeating header/footer on their own.
 *
 * @param {Array<object>} blocks - Blocks with `text`, `pageNumber` and `bbox`
 *   ({page, y, height} are read). Blocks missing any of these are ignored.
 * @param {Map<number, number>} pageHeights - Page height by page number.
 * @param {Array<object>} warnings - Receives one warning entry when anything
 *   is selected for removal.
 * @returns {Array<number>} Indices into `blocks` of the blocks to remove.
 */
function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
  const ZONE_RATIO = 0.1;
  const MIN_REPEAT = 3;

  // Page numbers and similar counters differ per page; compare with digits masked.
  const toPattern = (text) => text.trim().replace(/\d+/g, "#");

  // Returns "top" / "bottom" for a block lying in an edge band, else null.
  // bbox.y grows upwards here: (ph - y) is the distance from the page top to
  // the block's lower edge, (ph - (y + height)) the distance to its upper edge.
  const zoneOf = (b) => {
    if (!b.bbox || !b.pageNumber || !b.text?.trim()) return null;
    const ph = pageHeights.get(b.bbox.page) || pageHeights.get(b.pageNumber);
    if (!ph) return null;
    const lowerEdgeFromTop = ph - b.bbox.y;
    const upperEdgeFromTop = ph - (b.bbox.y + b.bbox.height);
    if (lowerEdgeFromTop <= ph * ZONE_RATIO) return "top";
    if (upperEdgeFromTop >= ph * (1 - ZONE_RATIO)) return "bottom";
    return null;
  };

  // zone -> pattern -> set of pages the pattern was seen on.
  const pagesByPattern = {
    top: /* @__PURE__ */ new Map(),
    bottom: /* @__PURE__ */ new Map()
  };
  for (const b of blocks) {
    const zone = zoneOf(b);
    if (!zone) continue;
    const pattern = toPattern(b.text);
    let pages = pagesByPattern[zone].get(pattern);
    if (!pages) {
      pages = /* @__PURE__ */ new Set();
      pagesByPattern[zone].set(pattern, pages);
    }
    pages.add(b.pageNumber);
  }

  const repeatedPatterns = /* @__PURE__ */ new Set();
  for (const patterns of [pagesByPattern.top, pagesByPattern.bottom]) {
    for (const [pattern, pages] of patterns) {
      if (pages.size >= MIN_REPEAT) repeatedPatterns.add(pattern);
    }
  }
  if (repeatedPatterns.size === 0) return [];

  const removeIndices = [];
  blocks.forEach((b, bi) => {
    if (zoneOf(b) && repeatedPatterns.has(toPattern(b.text))) removeIndices.push(bi);
  });
  if (removeIndices.length > 0) {
    warnings.push({ message: `${removeIndices.length}\uAC1C \uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00 \uC694\uC18C \uC81C\uAC70\uB428`, code: "HIDDEN_TEXT_FILTERED" });
  }
  return removeIndices;
}
|
|
5251
|
-
/**
 * Re-join lines that were wrapped in the middle of a Korean sentence.
 *
 * Working line by line against the previously emitted entry:
 *  - Markdown headings, table rows ("|...") and "---" lines are never joined;
 *  - after an entry ending in "," the next non-blank line is attached to it
 *    with a newline (it stays part of the same entry);
 *  - a line starting with "(※" is appended to the entry with a space;
 *  - an entry ending in Hangul, "·", "," or "-" followed by a line starting
 *    with Hangul or "(" is joined with a space, unless either side starts
 *    with a list/clause marker or the entry is a standalone clause heading.
 *
 * @param {string} text
 * @returns {string} The text with wrapped lines merged; "" for falsy input.
 */
function mergeKoreanLines(text) {
  if (!text) return "";
  const [head, ...tail] = text.split("\n");
  if (tail.length === 0) return text;

  const HEADING = /^#{1,6}\s/;
  const merged = [head];
  const replaceLast = (value) => {
    merged[merged.length - 1] = value;
  };

  for (const line of tail) {
    const last = merged[merged.length - 1];
    const stripped = line.trim();

    const keepSeparate =
      HEADING.test(last) || HEADING.test(line) || /^\|/.test(stripped) || /^---/.test(stripped);

    if (keepSeparate) {
      merged.push(line);
    } else if (/,$/.test(last.trim()) && stripped.length > 0) {
      replaceLast(last + "\n" + line);
    } else if (/^\(※/.test(stripped)) {
      replaceLast(last + " " + stripped);
    } else if (
      /[가-힣·,\-]$/.test(last) &&
      /^[가-힣(]/.test(line) &&
      !startsWithMarker(line) &&
      !isStandaloneHeader(last) &&
      !startsWithMarker(last)
    ) {
      replaceLast(last + " " + line);
    } else {
      merged.push(line);
    }
  }
  return merged.join("\n");
}
|
|
5280
|
-
|
|
5281
|
-
// src/xlsx/parser.ts
|
|
5282
|
-
import JSZip3 from "jszip";
|
|
5283
|
-
import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
|
|
5284
|
-
// Numeric limits declared for the XLSX parser. The code that enforces them lies
// outside this excerpt — NOTE(review): confirm the exact semantics at the use sites.
var MAX_SHEETS = 100;
// 100 * 1024 * 1024 = 104,857,600 (100 MiB when interpreted as bytes).
var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024;
// 1e4 = 10,000.
var MAX_ROWS2 = 1e4;
var MAX_COLS2 = 200;
|
|
5288
|
-
/**
 * Tidy a raw decimal cell value by removing binary floating-point noise,
 * e.g. "0.30000000000000004" -> "0.3".
 *
 * Only plain decimals of the form [-]digits.digits are touched; everything
 * else is returned unchanged. The value is rounded to 15 significant digits
 * and re-serialised, which also drops trailing zeros ("1.50" -> "1.5").
 *
 * When re-serialising would switch to exponent notation (magnitudes below
 * 1e-6 or at/above 1e21, where Number#toString emits "1e-7"-style output),
 * the raw text is kept so a plain decimal is never turned into scientific
 * notation.
 *
 * @param {string} raw - Cell value text.
 * @returns {string} Cleaned value, or `raw` when no safe cleanup applies.
 */
function cleanNumericValue(raw) {
  if (!/^-?\d+\.\d+$/.test(raw)) return raw;
  const num = Number.parseFloat(raw);
  if (!Number.isFinite(num)) return raw;
  const cleaned = Number.parseFloat(num.toPrecision(15)).toString();
  // Keep the original text rather than emit exponent notation.
  return /e/i.test(cleaned) ? raw : cleaned;
}
|
|
5295
|
-
/**
 * Parse an A1-style cell reference into zero-based coordinates.
 *
 * Only upper-case column letters directly followed by a row number are
 * accepted ("B7"); anything else yields null. Column letters are read as a
 * bijective base-26 number (A=1 ... Z=26, AA=27).
 *
 * @param {string} ref
 * @returns {{col:number,row:number}|null}
 */
function parseCellRef(ref) {
  const parts = /^([A-Z]+)(\d+)$/.exec(ref);
  if (parts === null) return null;

  const [, letters, digits] = parts;
  const oneBasedCol = [...letters].reduce(
    (acc, letter) => acc * 26 + (letter.charCodeAt(0) - 64),
    0
  );
  return { col: oneBasedCol - 1, row: parseInt(digits, 10) - 1 };
}
|
|
5302
|
-
/**
 * Parse a merged-range reference such as "A1:C3" into zero-based corner
 * coordinates.
 *
 * @param {string} ref - Exactly two cell references separated by ":".
 * @returns {{startCol:number,startRow:number,endCol:number,endRow:number}|null}
 *   null when the reference does not have two parts or either part fails to parse.
 */
function parseMergeRef(ref) {
  const corners = ref.split(":");
  if (corners.length !== 2) return null;

  const [topLeft, bottomRight] = corners.map((corner) => parseCellRef(corner));
  if (!topLeft || !bottomRight) return null;

  return {
    startCol: topLeft.col,
    startRow: topLeft.row,
    endCol: bottomRight.col,
    endRow: bottomRight.row
  };
}
|
|
5310
|
-
function getElements(parent, tagName) {
|
|
5311
|
-
const nodes = parent.getElementsByTagName(tagName);
|
|
5312
|
-
const result = [];
|
|
5313
|
-
for (let i = 0; i < nodes.length; i++) result.push(nodes[i]);
|
|
5314
|
-
return result;
|
|
5315
|
-
}
|
|
5316
|
-
function getTextContent(el) {
|
|
5317
|
-
return el.textContent?.trim() ?? "";
|
|
5318
|
-
}
|
|
5319
|
-
function parseXml(text) {
|
|
5320
|
-
return new DOMParser2().parseFromString(stripDtd(text), "text/xml");
|
|
5321
|
-
}
|
|
5322
|
-
function parseSharedStrings(xml) {
|
|
5323
|
-
const doc = parseXml(xml);
|
|
5324
|
-
const strings = [];
|
|
5325
|
-
const siList = getElements(doc.documentElement, "si");
|
|
5326
|
-
for (const si of siList) {
|
|
5327
|
-
const tElements = getElements(si, "t");
|
|
5328
|
-
strings.push(tElements.map((t) => t.textContent ?? "").join(""));
|
|
5329
|
-
}
|
|
5330
|
-
return strings;
|
|
2578
|
+
return strings;
|
|
5331
2579
|
}
|
|
5332
2580
|
function parseWorkbook(xml) {
|
|
5333
2581
|
const doc = parseXml(xml);
|
|
@@ -5492,7 +2740,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
5492
2740
|
}
|
|
5493
2741
|
let pageFilter = null;
|
|
5494
2742
|
if (options?.pages) {
|
|
5495
|
-
const { parsePageRange: parsePageRange2 } = await
|
|
2743
|
+
const { parsePageRange: parsePageRange2 } = await import("./page-range-H35FN3OQ.js");
|
|
5496
2744
|
pageFilter = parsePageRange2(options.pages, sheets.length);
|
|
5497
2745
|
}
|
|
5498
2746
|
const blocks = [];
|
|
@@ -5563,21 +2811,21 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
5563
2811
|
import JSZip4 from "jszip";
|
|
5564
2812
|
import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
|
|
5565
2813
|
var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
|
|
5566
|
-
function getChildElements(parent,
|
|
2814
|
+
function getChildElements(parent, localName2) {
|
|
5567
2815
|
const result = [];
|
|
5568
2816
|
const children = parent.childNodes;
|
|
5569
2817
|
for (let i = 0; i < children.length; i++) {
|
|
5570
2818
|
const node = children[i];
|
|
5571
2819
|
if (node.nodeType === 1) {
|
|
5572
2820
|
const el = node;
|
|
5573
|
-
if (el.localName ===
|
|
2821
|
+
if (el.localName === localName2 || el.tagName?.endsWith(`:${localName2}`)) {
|
|
5574
2822
|
result.push(el);
|
|
5575
2823
|
}
|
|
5576
2824
|
}
|
|
5577
2825
|
}
|
|
5578
2826
|
return result;
|
|
5579
2827
|
}
|
|
5580
|
-
function findElements(parent,
|
|
2828
|
+
function findElements(parent, localName2) {
|
|
5581
2829
|
const result = [];
|
|
5582
2830
|
const walk = (node) => {
|
|
5583
2831
|
const children = node.childNodes;
|
|
@@ -5585,7 +2833,7 @@ function findElements(parent, localName) {
|
|
|
5585
2833
|
const child = children[i];
|
|
5586
2834
|
if (child.nodeType === 1) {
|
|
5587
2835
|
const el = child;
|
|
5588
|
-
if (el.localName ===
|
|
2836
|
+
if (el.localName === localName2 || el.tagName?.endsWith(`:${localName2}`)) {
|
|
5589
2837
|
result.push(el);
|
|
5590
2838
|
}
|
|
5591
2839
|
walk(el);
|
|
@@ -5595,11 +2843,11 @@ function findElements(parent, localName) {
|
|
|
5595
2843
|
walk(parent);
|
|
5596
2844
|
return result;
|
|
5597
2845
|
}
|
|
5598
|
-
function getAttr(el,
|
|
2846
|
+
function getAttr(el, localName2) {
|
|
5599
2847
|
const attrs = el.attributes;
|
|
5600
2848
|
for (let i = 0; i < attrs.length; i++) {
|
|
5601
2849
|
const attr = attrs[i];
|
|
5602
|
-
if (attr.localName ===
|
|
2850
|
+
if (attr.localName === localName2 || attr.name === localName2) return attr.value;
|
|
5603
2851
|
}
|
|
5604
2852
|
return null;
|
|
5605
2853
|
}
|
|
@@ -5911,258 +3159,81 @@ async function parseDocxDocument(buffer, options) {
|
|
|
5911
3159
|
}
|
|
5912
3160
|
let styles = /* @__PURE__ */ new Map();
|
|
5913
3161
|
const stylesFile = zip.file("word/styles.xml");
|
|
5914
|
-
if (stylesFile) {
|
|
5915
|
-
try {
|
|
5916
|
-
styles = parseStyles(await stylesFile.async("text"));
|
|
5917
|
-
} catch {
|
|
5918
|
-
}
|
|
5919
|
-
}
|
|
5920
|
-
let numbering = /* @__PURE__ */ new Map();
|
|
5921
|
-
const numFile = zip.file("word/numbering.xml");
|
|
5922
|
-
if (numFile) {
|
|
5923
|
-
try {
|
|
5924
|
-
numbering = parseNumbering(await numFile.async("text"));
|
|
5925
|
-
} catch {
|
|
5926
|
-
}
|
|
5927
|
-
}
|
|
5928
|
-
let footnotes = /* @__PURE__ */ new Map();
|
|
5929
|
-
const fnFile = zip.file("word/footnotes.xml");
|
|
5930
|
-
if (fnFile) {
|
|
5931
|
-
try {
|
|
5932
|
-
footnotes = parseFootnotes(await fnFile.async("text"));
|
|
5933
|
-
} catch {
|
|
5934
|
-
}
|
|
5935
|
-
}
|
|
5936
|
-
const docXml = await docFile.async("text");
|
|
5937
|
-
const doc = parseXml2(docXml);
|
|
5938
|
-
const body = findElements(doc, "body");
|
|
5939
|
-
if (body.length === 0) {
|
|
5940
|
-
throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
5941
|
-
}
|
|
5942
|
-
const blocks = [];
|
|
5943
|
-
const bodyEl = body[0];
|
|
5944
|
-
const children = bodyEl.childNodes;
|
|
5945
|
-
for (let i = 0; i < children.length; i++) {
|
|
5946
|
-
const node = children[i];
|
|
5947
|
-
if (node.nodeType !== 1) continue;
|
|
5948
|
-
const el = node;
|
|
5949
|
-
const localName = el.localName ?? el.tagName?.split(":").pop();
|
|
5950
|
-
if (localName === "p") {
|
|
5951
|
-
const block = parseParagraph(el, styles, numbering, footnotes, rels);
|
|
5952
|
-
if (block) blocks.push(block);
|
|
5953
|
-
} else if (localName === "tbl") {
|
|
5954
|
-
const block = parseTable(el, styles, numbering, footnotes, rels);
|
|
5955
|
-
if (block) blocks.push(block);
|
|
5956
|
-
}
|
|
5957
|
-
}
|
|
5958
|
-
const { blocks: imgBlocks, images } = await extractImages(zip, rels, doc);
|
|
5959
|
-
const metadata = {};
|
|
5960
|
-
const coreFile = zip.file("docProps/core.xml");
|
|
5961
|
-
if (coreFile) {
|
|
5962
|
-
try {
|
|
5963
|
-
const coreXml = await coreFile.async("text");
|
|
5964
|
-
const coreDoc = parseXml2(coreXml);
|
|
5965
|
-
const getFirst = (tag) => {
|
|
5966
|
-
const els = coreDoc.getElementsByTagName(tag);
|
|
5967
|
-
return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
|
|
5968
|
-
};
|
|
5969
|
-
metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
|
|
5970
|
-
metadata.author = getFirst("dc:creator");
|
|
5971
|
-
metadata.description = getFirst("dc:description");
|
|
5972
|
-
const created = getFirst("dcterms:created");
|
|
5973
|
-
if (created) metadata.createdAt = created;
|
|
5974
|
-
const modified = getFirst("dcterms:modified");
|
|
5975
|
-
if (modified) metadata.modifiedAt = modified;
|
|
5976
|
-
} catch {
|
|
5977
|
-
}
|
|
5978
|
-
}
|
|
5979
|
-
const outline = blocks.filter((b) => b.type === "heading").map((b) => ({ level: b.level ?? 2, text: b.text ?? "" }));
|
|
5980
|
-
const markdown = blocksToMarkdown(blocks);
|
|
5981
|
-
return {
|
|
5982
|
-
markdown,
|
|
5983
|
-
blocks,
|
|
5984
|
-
metadata,
|
|
5985
|
-
outline: outline.length > 0 ? outline : void 0,
|
|
5986
|
-
warnings: warnings.length > 0 ? warnings : void 0,
|
|
5987
|
-
images: images.length > 0 ? images : void 0
|
|
5988
|
-
};
|
|
5989
|
-
}
|
|
5990
|
-
|
|
5991
|
-
// src/diff/text-diff.ts
|
|
5992
|
-
function similarity(a, b) {
|
|
5993
|
-
if (a === b) return 1;
|
|
5994
|
-
if (!a || !b) return 0;
|
|
5995
|
-
const maxLen = Math.max(a.length, b.length);
|
|
5996
|
-
if (maxLen === 0) return 1;
|
|
5997
|
-
return 1 - levenshtein(a, b) / maxLen;
|
|
5998
|
-
}
|
|
5999
|
-
function normalizedSimilarity(a, b) {
|
|
6000
|
-
return similarity(normalize(a), normalize(b));
|
|
6001
|
-
}
|
|
6002
|
-
function normalize(s) {
|
|
6003
|
-
return s.replace(/\s+/g, " ").trim();
|
|
6004
|
-
}
|
|
6005
|
-
var MAX_LEVENSHTEIN_LEN = 1e4;
|
|
6006
|
-
function levenshtein(a, b) {
|
|
6007
|
-
if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
|
|
6008
|
-
const sampleLen = Math.min(500, a.length, b.length);
|
|
6009
|
-
let diffs = 0;
|
|
6010
|
-
for (let i = 0; i < sampleLen; i++) if (a[i] !== b[i]) diffs++;
|
|
6011
|
-
const sampleRate = sampleLen > 0 ? diffs / sampleLen : 1;
|
|
6012
|
-
return Math.abs(a.length - b.length) + Math.round(Math.min(a.length, b.length) * sampleRate);
|
|
6013
|
-
}
|
|
6014
|
-
if (a.length > b.length) [a, b] = [b, a];
|
|
6015
|
-
const m = a.length;
|
|
6016
|
-
const n = b.length;
|
|
6017
|
-
let prev = Array.from({ length: m + 1 }, (_, i) => i);
|
|
6018
|
-
let curr = new Array(m + 1);
|
|
6019
|
-
for (let j = 1; j <= n; j++) {
|
|
6020
|
-
curr[0] = j;
|
|
6021
|
-
for (let i = 1; i <= m; i++) {
|
|
6022
|
-
if (a[i - 1] === b[j - 1]) {
|
|
6023
|
-
curr[i] = prev[i - 1];
|
|
6024
|
-
} else {
|
|
6025
|
-
curr[i] = 1 + Math.min(prev[i - 1], prev[i], curr[i - 1]);
|
|
6026
|
-
}
|
|
6027
|
-
}
|
|
6028
|
-
;
|
|
6029
|
-
[prev, curr] = [curr, prev];
|
|
6030
|
-
}
|
|
6031
|
-
return prev[m];
|
|
6032
|
-
}
|
|
6033
|
-
|
|
6034
|
-
// src/diff/compare.ts
|
|
6035
|
-
var SIMILARITY_THRESHOLD = 0.4;
|
|
6036
|
-
async function compare(bufferA, bufferB, options) {
|
|
6037
|
-
const [resultA, resultB] = await Promise.all([
|
|
6038
|
-
parse(bufferA, options),
|
|
6039
|
-
parse(bufferB, options)
|
|
6040
|
-
]);
|
|
6041
|
-
if (!resultA.success) throw new Error(`\uBB38\uC11CA \uD30C\uC2F1 \uC2E4\uD328: ${resultA.error}`);
|
|
6042
|
-
if (!resultB.success) throw new Error(`\uBB38\uC11CB \uD30C\uC2F1 \uC2E4\uD328: ${resultB.error}`);
|
|
6043
|
-
return diffBlocks(resultA.blocks, resultB.blocks);
|
|
6044
|
-
}
|
|
6045
|
-
function diffBlocks(blocksA, blocksB) {
|
|
6046
|
-
const aligned = alignBlocks(blocksA, blocksB);
|
|
6047
|
-
const stats = { added: 0, removed: 0, modified: 0, unchanged: 0 };
|
|
6048
|
-
const diffs = [];
|
|
6049
|
-
for (const [a, b] of aligned) {
|
|
6050
|
-
if (a && b) {
|
|
6051
|
-
const sim = blockSimilarity(a, b);
|
|
6052
|
-
if (sim >= 0.99) {
|
|
6053
|
-
diffs.push({ type: "unchanged", before: a, after: b, similarity: 1 });
|
|
6054
|
-
stats.unchanged++;
|
|
6055
|
-
} else {
|
|
6056
|
-
const diff = { type: "modified", before: a, after: b, similarity: sim };
|
|
6057
|
-
if (a.type === "table" && b.type === "table" && a.table && b.table) {
|
|
6058
|
-
diff.cellDiffs = diffTableCells(a.table, b.table);
|
|
6059
|
-
}
|
|
6060
|
-
diffs.push(diff);
|
|
6061
|
-
stats.modified++;
|
|
6062
|
-
}
|
|
6063
|
-
} else if (a) {
|
|
6064
|
-
diffs.push({ type: "removed", before: a });
|
|
6065
|
-
stats.removed++;
|
|
6066
|
-
} else if (b) {
|
|
6067
|
-
diffs.push({ type: "added", after: b });
|
|
6068
|
-
stats.added++;
|
|
6069
|
-
}
|
|
6070
|
-
}
|
|
6071
|
-
return { stats, diffs };
|
|
6072
|
-
}
|
|
6073
|
-
function alignBlocks(a, b) {
|
|
6074
|
-
const m = a.length, n = b.length;
|
|
6075
|
-
if (m * n > 1e7) return fallbackAlign(a, b);
|
|
6076
|
-
const simCache = /* @__PURE__ */ new Map();
|
|
6077
|
-
const getSim = (i2, j2) => {
|
|
6078
|
-
const key = `${i2},${j2}`;
|
|
6079
|
-
let v = simCache.get(key);
|
|
6080
|
-
if (v === void 0) {
|
|
6081
|
-
v = blockSimilarity(a[i2], b[j2]);
|
|
6082
|
-
simCache.set(key, v);
|
|
6083
|
-
}
|
|
6084
|
-
return v;
|
|
6085
|
-
};
|
|
6086
|
-
const dp = Array.from({ length: m + 1 }, () => new Array(n + 1).fill(0));
|
|
6087
|
-
for (let i2 = 1; i2 <= m; i2++) {
|
|
6088
|
-
for (let j2 = 1; j2 <= n; j2++) {
|
|
6089
|
-
if (getSim(i2 - 1, j2 - 1) >= SIMILARITY_THRESHOLD) {
|
|
6090
|
-
dp[i2][j2] = dp[i2 - 1][j2 - 1] + 1;
|
|
6091
|
-
} else {
|
|
6092
|
-
dp[i2][j2] = Math.max(dp[i2 - 1][j2], dp[i2][j2 - 1]);
|
|
6093
|
-
}
|
|
6094
|
-
}
|
|
6095
|
-
}
|
|
6096
|
-
const pairs = [];
|
|
6097
|
-
let i = m, j = n;
|
|
6098
|
-
while (i > 0 && j > 0) {
|
|
6099
|
-
if (getSim(i - 1, j - 1) >= SIMILARITY_THRESHOLD && dp[i][j] === dp[i - 1][j - 1] + 1) {
|
|
6100
|
-
pairs.push([i - 1, j - 1]);
|
|
6101
|
-
i--;
|
|
6102
|
-
j--;
|
|
6103
|
-
} else if (dp[i - 1][j] >= dp[i][j - 1]) {
|
|
6104
|
-
i--;
|
|
6105
|
-
} else {
|
|
6106
|
-
j--;
|
|
3162
|
+
if (stylesFile) {
|
|
3163
|
+
try {
|
|
3164
|
+
styles = parseStyles(await stylesFile.async("text"));
|
|
3165
|
+
} catch {
|
|
6107
3166
|
}
|
|
6108
3167
|
}
|
|
6109
|
-
|
|
6110
|
-
const
|
|
6111
|
-
|
|
6112
|
-
|
|
6113
|
-
|
|
6114
|
-
|
|
6115
|
-
|
|
3168
|
+
let numbering = /* @__PURE__ */ new Map();
|
|
3169
|
+
const numFile = zip.file("word/numbering.xml");
|
|
3170
|
+
if (numFile) {
|
|
3171
|
+
try {
|
|
3172
|
+
numbering = parseNumbering(await numFile.async("text"));
|
|
3173
|
+
} catch {
|
|
3174
|
+
}
|
|
6116
3175
|
}
|
|
6117
|
-
|
|
6118
|
-
|
|
6119
|
-
|
|
6120
|
-
|
|
6121
|
-
|
|
6122
|
-
|
|
6123
|
-
|
|
6124
|
-
for (let i = 0; i < len; i++) {
|
|
6125
|
-
result.push([a[i] || null, b[i] || null]);
|
|
3176
|
+
let footnotes = /* @__PURE__ */ new Map();
|
|
3177
|
+
const fnFile = zip.file("word/footnotes.xml");
|
|
3178
|
+
if (fnFile) {
|
|
3179
|
+
try {
|
|
3180
|
+
footnotes = parseFootnotes(await fnFile.async("text"));
|
|
3181
|
+
} catch {
|
|
3182
|
+
}
|
|
6126
3183
|
}
|
|
6127
|
-
|
|
6128
|
-
|
|
6129
|
-
|
|
6130
|
-
if (
|
|
6131
|
-
|
|
6132
|
-
return normalizedSimilarity(a.text || "", b.text || "");
|
|
3184
|
+
const docXml = await docFile.async("text");
|
|
3185
|
+
const doc = parseXml2(docXml);
|
|
3186
|
+
const body = findElements(doc, "body");
|
|
3187
|
+
if (body.length === 0) {
|
|
3188
|
+
throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
6133
3189
|
}
|
|
6134
|
-
|
|
6135
|
-
|
|
3190
|
+
const blocks = [];
|
|
3191
|
+
const bodyEl = body[0];
|
|
3192
|
+
const children = bodyEl.childNodes;
|
|
3193
|
+
for (let i = 0; i < children.length; i++) {
|
|
3194
|
+
const node = children[i];
|
|
3195
|
+
if (node.nodeType !== 1) continue;
|
|
3196
|
+
const el = node;
|
|
3197
|
+
const localName2 = el.localName ?? el.tagName?.split(":").pop();
|
|
3198
|
+
if (localName2 === "p") {
|
|
3199
|
+
const block = parseParagraph(el, styles, numbering, footnotes, rels);
|
|
3200
|
+
if (block) blocks.push(block);
|
|
3201
|
+
} else if (localName2 === "tbl") {
|
|
3202
|
+
const block = parseTable(el, styles, numbering, footnotes, rels);
|
|
3203
|
+
if (block) blocks.push(block);
|
|
3204
|
+
}
|
|
6136
3205
|
}
|
|
6137
|
-
|
|
6138
|
-
|
|
6139
|
-
|
|
6140
|
-
|
|
6141
|
-
|
|
6142
|
-
|
|
6143
|
-
|
|
6144
|
-
|
|
6145
|
-
|
|
6146
|
-
|
|
6147
|
-
|
|
6148
|
-
|
|
6149
|
-
|
|
6150
|
-
|
|
6151
|
-
|
|
6152
|
-
|
|
6153
|
-
|
|
6154
|
-
|
|
6155
|
-
|
|
6156
|
-
let type;
|
|
6157
|
-
if (cellA === void 0) type = "added";
|
|
6158
|
-
else if (cellB === void 0) type = "removed";
|
|
6159
|
-
else if (cellA === cellB) type = "unchanged";
|
|
6160
|
-
else type = "modified";
|
|
6161
|
-
row.push({ type, before: cellA, after: cellB });
|
|
3206
|
+
const { blocks: imgBlocks, images } = await extractImages(zip, rels, doc);
|
|
3207
|
+
const metadata = {};
|
|
3208
|
+
const coreFile = zip.file("docProps/core.xml");
|
|
3209
|
+
if (coreFile) {
|
|
3210
|
+
try {
|
|
3211
|
+
const coreXml = await coreFile.async("text");
|
|
3212
|
+
const coreDoc = parseXml2(coreXml);
|
|
3213
|
+
const getFirst = (tag) => {
|
|
3214
|
+
const els = coreDoc.getElementsByTagName(tag);
|
|
3215
|
+
return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
|
|
3216
|
+
};
|
|
3217
|
+
metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
|
|
3218
|
+
metadata.author = getFirst("dc:creator");
|
|
3219
|
+
metadata.description = getFirst("dc:description");
|
|
3220
|
+
const created = getFirst("dcterms:created");
|
|
3221
|
+
if (created) metadata.createdAt = created;
|
|
3222
|
+
const modified = getFirst("dcterms:modified");
|
|
3223
|
+
if (modified) metadata.modifiedAt = modified;
|
|
3224
|
+
} catch {
|
|
6162
3225
|
}
|
|
6163
|
-
result.push(row);
|
|
6164
3226
|
}
|
|
6165
|
-
|
|
3227
|
+
const outline = blocks.filter((b) => b.type === "heading").map((b) => ({ level: b.level ?? 2, text: b.text ?? "" }));
|
|
3228
|
+
const markdown = blocksToMarkdown(blocks);
|
|
3229
|
+
return {
|
|
3230
|
+
markdown,
|
|
3231
|
+
blocks,
|
|
3232
|
+
metadata,
|
|
3233
|
+
outline: outline.length > 0 ? outline : void 0,
|
|
3234
|
+
warnings: warnings.length > 0 ? warnings : void 0,
|
|
3235
|
+
images: images.length > 0 ? images : void 0
|
|
3236
|
+
};
|
|
6166
3237
|
}
|
|
6167
3238
|
|
|
6168
3239
|
// src/form/recognize.ts
|
|
@@ -6205,15 +3276,20 @@ var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
|
|
|
6205
3276
|
"\uB2E8\uAC00",
|
|
6206
3277
|
"\uD569\uACC4",
|
|
6207
3278
|
"\uACC4",
|
|
6208
|
-
"\uC18C\uACC4"
|
|
3279
|
+
"\uC18C\uACC4",
|
|
3280
|
+
"\uB4F1\uB85D\uAE30\uC900\uC9C0",
|
|
3281
|
+
"\uBCF8\uC801",
|
|
3282
|
+
"\uC704\uC784\uC778",
|
|
3283
|
+
"\uCCAD\uAD6C\uC0AC\uC720",
|
|
3284
|
+
"\uC18C\uBA85\uC790\uB8CC"
|
|
6209
3285
|
]);
|
|
6210
3286
|
function isLabelCell(text) {
|
|
6211
|
-
const trimmed = text.trim();
|
|
3287
|
+
const trimmed = text.trim().replace(/[¹²³⁴⁵⁶⁷⁸⁹⁰*※]+$/g, "").trim();
|
|
6212
3288
|
if (!trimmed || trimmed.length > 30) return false;
|
|
6213
3289
|
for (const kw of LABEL_KEYWORDS) {
|
|
6214
3290
|
if (trimmed.includes(kw)) return true;
|
|
6215
3291
|
}
|
|
6216
|
-
if (/^[가-힣\s()
|
|
3292
|
+
if (/^[가-힣\s()()·::]+$/.test(trimmed) && trimmed.replace(/\s/g, "").length >= 2 && trimmed.replace(/\s/g, "").length <= 8 && !/\d/.test(trimmed)) return true;
|
|
6217
3293
|
if (/^[가-힣A-Za-z\s]+[::]$/.test(trimmed)) return true;
|
|
6218
3294
|
return false;
|
|
6219
3295
|
}
|
|
@@ -6246,7 +3322,7 @@ function extractFromTable(table) {
|
|
|
6246
3322
|
for (let c = 0; c < table.cols - 1; c++) {
|
|
6247
3323
|
const labelCell = table.cells[r][c];
|
|
6248
3324
|
const valueCell = table.cells[r][c + 1];
|
|
6249
|
-
if (isLabelCell(labelCell.text)
|
|
3325
|
+
if (isLabelCell(labelCell.text)) {
|
|
6250
3326
|
fields.push({
|
|
6251
3327
|
label: labelCell.text.trim().replace(/[::]\s*$/, ""),
|
|
6252
3328
|
value: valueCell.text.trim(),
|
|
@@ -6256,43 +3332,552 @@ function extractFromTable(table) {
|
|
|
6256
3332
|
}
|
|
6257
3333
|
}
|
|
6258
3334
|
}
|
|
6259
|
-
}
|
|
6260
|
-
if (fields.length === 0 && table.rows >= 2 && table.cols >= 2) {
|
|
6261
|
-
const headerRow = table.cells[0];
|
|
6262
|
-
const allLabels = headerRow.every((cell) => {
|
|
6263
|
-
const t = cell.text.trim();
|
|
6264
|
-
return t.length > 0 && t.length <= 20;
|
|
6265
|
-
});
|
|
6266
|
-
if (allLabels) {
|
|
6267
|
-
for (let r = 1; r < table.rows; r++) {
|
|
6268
|
-
for (let c = 0; c < table.cols; c++) {
|
|
6269
|
-
const label = headerRow[c].text.trim();
|
|
6270
|
-
const value = table.cells[r][c].text.trim();
|
|
6271
|
-
if (label && value) {
|
|
6272
|
-
fields.push({ label, value, row: r, col: c });
|
|
6273
|
-
}
|
|
6274
|
-
}
|
|
3335
|
+
}
|
|
3336
|
+
if (fields.length === 0 && table.rows >= 2 && table.cols >= 2) {
|
|
3337
|
+
const headerRow = table.cells[0];
|
|
3338
|
+
const allLabels = headerRow.every((cell) => {
|
|
3339
|
+
const t = cell.text.trim();
|
|
3340
|
+
return t.length > 0 && t.length <= 20;
|
|
3341
|
+
});
|
|
3342
|
+
if (allLabels) {
|
|
3343
|
+
for (let r = 1; r < table.rows; r++) {
|
|
3344
|
+
for (let c = 0; c < table.cols; c++) {
|
|
3345
|
+
const label = headerRow[c].text.trim();
|
|
3346
|
+
const value = table.cells[r][c].text.trim();
|
|
3347
|
+
if (label && value) {
|
|
3348
|
+
fields.push({ label, value, row: r, col: c });
|
|
3349
|
+
}
|
|
3350
|
+
}
|
|
3351
|
+
}
|
|
3352
|
+
}
|
|
3353
|
+
}
|
|
3354
|
+
return fields;
|
|
3355
|
+
}
|
|
3356
|
+
function extractInlineFields(text) {
|
|
3357
|
+
const fields = [];
|
|
3358
|
+
const pattern = /([가-힣A-Za-z]{2,10})\s*[::]\s*([^\n,;]{1,100})/g;
|
|
3359
|
+
let match;
|
|
3360
|
+
while ((match = pattern.exec(text)) !== null) {
|
|
3361
|
+
const label = match[1].trim();
|
|
3362
|
+
const value = match[2].trim();
|
|
3363
|
+
if (value) {
|
|
3364
|
+
fields.push({ label, value, row: -1, col: -1 });
|
|
3365
|
+
}
|
|
3366
|
+
}
|
|
3367
|
+
return fields;
|
|
3368
|
+
}
|
|
3369
|
+
|
|
3370
|
+
// src/form/match.ts
|
|
3371
|
+
function normalizeLabel(label) {
|
|
3372
|
+
return label.trim().replace(/[::\s()()·]/g, "");
|
|
3373
|
+
}
|
|
3374
|
+
function findMatchingKey(cellLabel, values) {
|
|
3375
|
+
if (values.has(cellLabel)) return cellLabel;
|
|
3376
|
+
let bestKey;
|
|
3377
|
+
let bestLen = 0;
|
|
3378
|
+
for (const key of values.keys()) {
|
|
3379
|
+
if (cellLabel.startsWith(key)) {
|
|
3380
|
+
if (key.length >= cellLabel.length * 0.6 && key.length > bestLen) {
|
|
3381
|
+
bestLen = key.length;
|
|
3382
|
+
bestKey = key;
|
|
3383
|
+
}
|
|
3384
|
+
} else if (key.startsWith(cellLabel)) {
|
|
3385
|
+
if (cellLabel.length >= key.length * 0.6 && cellLabel.length > bestLen) {
|
|
3386
|
+
bestLen = cellLabel.length;
|
|
3387
|
+
bestKey = key;
|
|
3388
|
+
}
|
|
3389
|
+
}
|
|
3390
|
+
}
|
|
3391
|
+
return bestKey;
|
|
3392
|
+
}
|
|
3393
|
+
function isKeywordLabel(text) {
|
|
3394
|
+
const trimmed = text.trim().replace(/[¹²³⁴⁵⁶⁷⁸⁹⁰*※]+$/g, "").trim();
|
|
3395
|
+
if (!trimmed || trimmed.length > 15) return false;
|
|
3396
|
+
for (const kw of LABEL_KEYWORDS) {
|
|
3397
|
+
if (trimmed.includes(kw)) return true;
|
|
3398
|
+
}
|
|
3399
|
+
return false;
|
|
3400
|
+
}
|
|
3401
|
+
function fillInCellPatterns(cellText, values, matchedLabels) {
|
|
3402
|
+
let text = cellText;
|
|
3403
|
+
const matches = [];
|
|
3404
|
+
text = text.replace(
|
|
3405
|
+
/([가-힣A-Za-z]+)\(\s{1,}\)([가-힣A-Za-z]*)/g,
|
|
3406
|
+
(match, prefix, suffix) => {
|
|
3407
|
+
const label = prefix + suffix;
|
|
3408
|
+
const normalizedLabel = normalizeLabel(label);
|
|
3409
|
+
const matchKey = values.has(normalizedLabel) ? normalizedLabel : values.has(normalizeLabel(prefix)) ? normalizeLabel(prefix) : void 0;
|
|
3410
|
+
if (matchKey === void 0) return match;
|
|
3411
|
+
const newValue = values.get(matchKey);
|
|
3412
|
+
matchedLabels.add(matchKey);
|
|
3413
|
+
matches.push({ key: matchKey, label, value: newValue });
|
|
3414
|
+
return `${prefix}(${newValue})${suffix}`;
|
|
3415
|
+
}
|
|
3416
|
+
);
|
|
3417
|
+
text = text.replace(
|
|
3418
|
+
/□([가-힣A-Za-z]+)/g,
|
|
3419
|
+
(match, keyword) => {
|
|
3420
|
+
const normalizedKw = normalizeLabel(keyword);
|
|
3421
|
+
const matchKey = values.has(normalizedKw) ? normalizedKw : void 0;
|
|
3422
|
+
if (matchKey === void 0) return match;
|
|
3423
|
+
const val = values.get(matchKey);
|
|
3424
|
+
const isTruthy = ["\u2611", "\u2713", "\u2714", "v", "V", "true", "1", "yes", "o", "O"].includes(val.trim()) || val.trim() === "";
|
|
3425
|
+
if (!isTruthy) return match;
|
|
3426
|
+
matchedLabels.add(matchKey);
|
|
3427
|
+
matches.push({ key: matchKey, label: `\u25A1${keyword}`, value: "\u2611" });
|
|
3428
|
+
return `\u2611${keyword}`;
|
|
3429
|
+
}
|
|
3430
|
+
);
|
|
3431
|
+
text = text.replace(
|
|
3432
|
+
/\(([가-힣A-Za-z]+)[::]\s{1,}\)/g,
|
|
3433
|
+
(match, keyword) => {
|
|
3434
|
+
const normalizedKw = normalizeLabel(keyword);
|
|
3435
|
+
const matchKey = values.has(normalizedKw) ? normalizedKw : void 0;
|
|
3436
|
+
if (matchKey === void 0) return match;
|
|
3437
|
+
const newValue = values.get(matchKey);
|
|
3438
|
+
matchedLabels.add(matchKey);
|
|
3439
|
+
matches.push({ key: matchKey, label: keyword, value: newValue });
|
|
3440
|
+
return `(${keyword}\uFF1A${newValue})`;
|
|
3441
|
+
}
|
|
3442
|
+
);
|
|
3443
|
+
return matches.length > 0 ? { text, matches } : null;
|
|
3444
|
+
}
|
|
3445
|
+
function normalizeValues(values) {
|
|
3446
|
+
const map = /* @__PURE__ */ new Map();
|
|
3447
|
+
for (const [label, value] of Object.entries(values)) {
|
|
3448
|
+
map.set(normalizeLabel(label), value);
|
|
3449
|
+
}
|
|
3450
|
+
return map;
|
|
3451
|
+
}
|
|
3452
|
+
function resolveUnmatched(normalizedValues, matchedLabels, originalValues) {
|
|
3453
|
+
return [...normalizedValues.keys()].filter((k) => !matchedLabels.has(k)).map((k) => {
|
|
3454
|
+
for (const orig of Object.keys(originalValues)) {
|
|
3455
|
+
if (normalizeLabel(orig) === k) return orig;
|
|
3456
|
+
}
|
|
3457
|
+
return k;
|
|
3458
|
+
});
|
|
3459
|
+
}
|
|
3460
|
+
|
|
3461
|
+
// src/form/filler.ts
|
|
3462
|
+
function fillFormFields(blocks, values) {
|
|
3463
|
+
const cloned = structuredClone(blocks);
|
|
3464
|
+
const filled = [];
|
|
3465
|
+
const matchedLabels = /* @__PURE__ */ new Set();
|
|
3466
|
+
const normalizedValues = normalizeValues(values);
|
|
3467
|
+
const patternFilledCells = /* @__PURE__ */ new Set();
|
|
3468
|
+
for (const block of cloned) {
|
|
3469
|
+
if (block.type !== "table" || !block.table) continue;
|
|
3470
|
+
for (let r = 0; r < block.table.rows; r++) {
|
|
3471
|
+
for (let c = 0; c < block.table.cols; c++) {
|
|
3472
|
+
const cell = block.table.cells[r]?.[c];
|
|
3473
|
+
if (!cell) continue;
|
|
3474
|
+
const result = fillInCellPatterns(cell.text, normalizedValues, matchedLabels);
|
|
3475
|
+
if (result) {
|
|
3476
|
+
cell.text = result.text;
|
|
3477
|
+
patternFilledCells.add(`${r},${c}`);
|
|
3478
|
+
for (const m of result.matches) {
|
|
3479
|
+
filled.push({ label: m.label, value: m.value, row: r, col: c });
|
|
3480
|
+
}
|
|
3481
|
+
}
|
|
3482
|
+
}
|
|
3483
|
+
}
|
|
3484
|
+
}
|
|
3485
|
+
for (const block of cloned) {
|
|
3486
|
+
if (block.type !== "table" || !block.table) continue;
|
|
3487
|
+
fillTable(block.table, normalizedValues, filled, matchedLabels, patternFilledCells);
|
|
3488
|
+
}
|
|
3489
|
+
for (const block of cloned) {
|
|
3490
|
+
if (block.type !== "paragraph" || !block.text) continue;
|
|
3491
|
+
const newText = fillInlineFields(block.text, normalizedValues, filled, matchedLabels);
|
|
3492
|
+
if (newText !== block.text) block.text = newText;
|
|
3493
|
+
}
|
|
3494
|
+
const unmatched = resolveUnmatched(normalizedValues, matchedLabels, values);
|
|
3495
|
+
return { blocks: cloned, filled, unmatched };
|
|
3496
|
+
}
|
|
3497
|
+
function fillTable(table, values, filled, matchedLabels, patternFilledCells) {
|
|
3498
|
+
if (table.cols < 2) return;
|
|
3499
|
+
for (let r = 0; r < table.rows; r++) {
|
|
3500
|
+
for (let c = 0; c < table.cols - 1; c++) {
|
|
3501
|
+
const labelCell = table.cells[r][c];
|
|
3502
|
+
const valueCell = table.cells[r][c + 1];
|
|
3503
|
+
if (!labelCell || !valueCell) continue;
|
|
3504
|
+
if (!isLabelCell(labelCell.text)) continue;
|
|
3505
|
+
if (isKeywordLabel(valueCell.text)) continue;
|
|
3506
|
+
const normalizedCellLabel = normalizeLabel(labelCell.text);
|
|
3507
|
+
if (!normalizedCellLabel) continue;
|
|
3508
|
+
const matchKey = findMatchingKey(normalizedCellLabel, values);
|
|
3509
|
+
if (matchKey === void 0) continue;
|
|
3510
|
+
const newValue = values.get(matchKey);
|
|
3511
|
+
if (patternFilledCells?.has(`${r},${c + 1}`)) {
|
|
3512
|
+
valueCell.text = newValue + " " + valueCell.text;
|
|
3513
|
+
} else {
|
|
3514
|
+
valueCell.text = newValue;
|
|
3515
|
+
}
|
|
3516
|
+
matchedLabels.add(matchKey);
|
|
3517
|
+
filled.push({
|
|
3518
|
+
label: labelCell.text.trim().replace(/[::]\s*$/, ""),
|
|
3519
|
+
value: newValue,
|
|
3520
|
+
row: r,
|
|
3521
|
+
col: c
|
|
3522
|
+
});
|
|
3523
|
+
}
|
|
3524
|
+
}
|
|
3525
|
+
if (table.rows >= 2 && table.cols >= 2) {
|
|
3526
|
+
const headerRow = table.cells[0];
|
|
3527
|
+
const allLabels = headerRow.every((cell) => {
|
|
3528
|
+
const t = cell.text.trim();
|
|
3529
|
+
return t.length > 0 && t.length <= 20 && isLabelCell(t);
|
|
3530
|
+
});
|
|
3531
|
+
if (!allLabels) return;
|
|
3532
|
+
for (let r = 1; r < table.rows; r++) {
|
|
3533
|
+
for (let c = 0; c < table.cols; c++) {
|
|
3534
|
+
const headerLabel = normalizeLabel(headerRow[c].text);
|
|
3535
|
+
const matchKey = findMatchingKey(headerLabel, values);
|
|
3536
|
+
if (matchKey === void 0) continue;
|
|
3537
|
+
if (matchedLabels.has(matchKey)) continue;
|
|
3538
|
+
const newValue = values.get(matchKey);
|
|
3539
|
+
table.cells[r][c].text = newValue;
|
|
3540
|
+
matchedLabels.add(matchKey);
|
|
3541
|
+
filled.push({
|
|
3542
|
+
label: headerRow[c].text.trim(),
|
|
3543
|
+
value: newValue,
|
|
3544
|
+
row: r,
|
|
3545
|
+
col: c
|
|
3546
|
+
});
|
|
3547
|
+
}
|
|
3548
|
+
}
|
|
3549
|
+
}
|
|
3550
|
+
}
|
|
3551
|
+
function fillInlineFields(text, values, filled, matchedLabels) {
|
|
3552
|
+
return text.replace(
|
|
3553
|
+
/([가-힣A-Za-z]{2,10})\s*[::]\s*([^\n,;]{0,100})/g,
|
|
3554
|
+
(match, rawLabel, _oldValue) => {
|
|
3555
|
+
const normalized = normalizeLabel(rawLabel);
|
|
3556
|
+
const matchKey = findMatchingKey(normalized, values);
|
|
3557
|
+
if (matchKey === void 0) return match;
|
|
3558
|
+
const newValue = values.get(matchKey);
|
|
3559
|
+
matchedLabels.add(matchKey);
|
|
3560
|
+
filled.push({
|
|
3561
|
+
label: rawLabel.trim(),
|
|
3562
|
+
value: newValue,
|
|
3563
|
+
row: -1,
|
|
3564
|
+
col: -1
|
|
3565
|
+
});
|
|
3566
|
+
return `${rawLabel}: ${newValue}`;
|
|
3567
|
+
}
|
|
3568
|
+
);
|
|
3569
|
+
}
|
|
3570
|
+
|
|
3571
|
+
// src/form/filler-hwpx.ts
|
|
3572
|
+
import JSZip5 from "jszip";
|
|
3573
|
+
import { DOMParser as DOMParser4, XMLSerializer } from "@xmldom/xmldom";
|
|
3574
|
+
async function fillHwpx(hwpxBuffer, values) {
|
|
3575
|
+
const zip = await JSZip5.loadAsync(hwpxBuffer);
|
|
3576
|
+
const filled = [];
|
|
3577
|
+
const matchedLabels = /* @__PURE__ */ new Set();
|
|
3578
|
+
const normalizedValues = normalizeValues(values);
|
|
3579
|
+
const sectionFiles = Object.keys(zip.files).filter((name) => /[Ss]ection\d+\.xml$/i.test(name)).sort();
|
|
3580
|
+
if (sectionFiles.length === 0) {
|
|
3581
|
+
throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
3582
|
+
}
|
|
3583
|
+
const xmlParser = new DOMParser4();
|
|
3584
|
+
const xmlSerializer = new XMLSerializer();
|
|
3585
|
+
for (const sectionPath of sectionFiles) {
|
|
3586
|
+
const zipEntry = zip.file(sectionPath);
|
|
3587
|
+
if (!zipEntry) continue;
|
|
3588
|
+
const rawXml = await zipEntry.async("text");
|
|
3589
|
+
const doc = xmlParser.parseFromString(stripDtd(rawXml), "text/xml");
|
|
3590
|
+
if (!doc.documentElement) continue;
|
|
3591
|
+
let modified = false;
|
|
3592
|
+
const tables = findAllElements(doc.documentElement, "tbl");
|
|
3593
|
+
const cellPatternApplied = /* @__PURE__ */ new Set();
|
|
3594
|
+
for (const tblEl of tables) {
|
|
3595
|
+
const allCells = findAllElements(tblEl, "tc");
|
|
3596
|
+
for (const tcEl of allCells) {
|
|
3597
|
+
const tNodes = collectCellTextNodes(tcEl);
|
|
3598
|
+
const fullText = tNodes.map((n) => n.text).join("");
|
|
3599
|
+
const result = fillInCellPatterns(fullText, normalizedValues, matchedLabels);
|
|
3600
|
+
if (!result) continue;
|
|
3601
|
+
applyTextReplacements(tNodes, fullText, result.text);
|
|
3602
|
+
cellPatternApplied.add(tcEl);
|
|
3603
|
+
for (const m of result.matches) {
|
|
3604
|
+
filled.push({ label: m.label, value: m.value, row: -1, col: -1 });
|
|
3605
|
+
}
|
|
3606
|
+
modified = true;
|
|
3607
|
+
}
|
|
3608
|
+
}
|
|
3609
|
+
for (const tblEl of tables) {
|
|
3610
|
+
const rows = findDirectChildren(tblEl, "tr");
|
|
3611
|
+
for (let rowIdx = 0; rowIdx < rows.length; rowIdx++) {
|
|
3612
|
+
const trEl = rows[rowIdx];
|
|
3613
|
+
const cells = findDirectChildren(trEl, "tc");
|
|
3614
|
+
for (let colIdx = 0; colIdx < cells.length - 1; colIdx++) {
|
|
3615
|
+
const labelText = extractCellText(cells[colIdx]);
|
|
3616
|
+
if (!isLabelCell(labelText)) continue;
|
|
3617
|
+
const valueCell = cells[colIdx + 1];
|
|
3618
|
+
const valueText = extractCellText(valueCell);
|
|
3619
|
+
if (isKeywordLabel(valueText)) continue;
|
|
3620
|
+
const normalizedCellLabel = normalizeLabel(labelText);
|
|
3621
|
+
if (!normalizedCellLabel) continue;
|
|
3622
|
+
const matchKey = findMatchingKey(normalizedCellLabel, normalizedValues);
|
|
3623
|
+
if (matchKey === void 0) continue;
|
|
3624
|
+
const newValue = normalizedValues.get(matchKey);
|
|
3625
|
+
if (cellPatternApplied.has(valueCell)) {
|
|
3626
|
+
prependCellText(valueCell, newValue);
|
|
3627
|
+
} else {
|
|
3628
|
+
replaceCellText(valueCell, newValue);
|
|
3629
|
+
}
|
|
3630
|
+
matchedLabels.add(matchKey);
|
|
3631
|
+
filled.push({
|
|
3632
|
+
label: labelText.trim().replace(/[::]\s*$/, ""),
|
|
3633
|
+
value: newValue,
|
|
3634
|
+
row: rowIdx,
|
|
3635
|
+
col: colIdx
|
|
3636
|
+
});
|
|
3637
|
+
modified = true;
|
|
3638
|
+
}
|
|
3639
|
+
}
|
|
3640
|
+
if (rows.length >= 2) {
|
|
3641
|
+
const headerCells = findDirectChildren(rows[0], "tc");
|
|
3642
|
+
const allLabels = headerCells.every((cell) => {
|
|
3643
|
+
const t = extractCellText(cell).trim();
|
|
3644
|
+
return t.length > 0 && t.length <= 20 && isLabelCell(t);
|
|
3645
|
+
});
|
|
3646
|
+
if (allLabels) {
|
|
3647
|
+
for (let rowIdx = 1; rowIdx < rows.length; rowIdx++) {
|
|
3648
|
+
const dataCells = findDirectChildren(rows[rowIdx], "tc");
|
|
3649
|
+
for (let colIdx = 0; colIdx < Math.min(headerCells.length, dataCells.length); colIdx++) {
|
|
3650
|
+
const headerLabel = normalizeLabel(extractCellText(headerCells[colIdx]));
|
|
3651
|
+
const matchKey = findMatchingKey(headerLabel, normalizedValues);
|
|
3652
|
+
if (matchKey === void 0) continue;
|
|
3653
|
+
if (matchedLabels.has(matchKey)) continue;
|
|
3654
|
+
const newValue = normalizedValues.get(matchKey);
|
|
3655
|
+
replaceCellText(dataCells[colIdx], newValue);
|
|
3656
|
+
matchedLabels.add(matchKey);
|
|
3657
|
+
filled.push({
|
|
3658
|
+
label: extractCellText(headerCells[colIdx]).trim(),
|
|
3659
|
+
value: newValue,
|
|
3660
|
+
row: rowIdx,
|
|
3661
|
+
col: colIdx
|
|
3662
|
+
});
|
|
3663
|
+
modified = true;
|
|
3664
|
+
}
|
|
3665
|
+
}
|
|
3666
|
+
}
|
|
3667
|
+
}
|
|
3668
|
+
}
|
|
3669
|
+
const allParagraphs = findAllElements(doc.documentElement, "p");
|
|
3670
|
+
for (const pEl of allParagraphs) {
|
|
3671
|
+
if (isInsideTable(pEl)) continue;
|
|
3672
|
+
const tNodes = collectTextNodes(pEl);
|
|
3673
|
+
const fullText = tNodes.map((n) => n.text).join("");
|
|
3674
|
+
const pattern = /([가-힣A-Za-z]{2,10})\s*[::]\s*([^\n,;]{0,100})/g;
|
|
3675
|
+
let match;
|
|
3676
|
+
while ((match = pattern.exec(fullText)) !== null) {
|
|
3677
|
+
const rawLabel = match[1];
|
|
3678
|
+
const normalized = normalizeLabel(rawLabel);
|
|
3679
|
+
const matchKey = findMatchingKey(normalized, normalizedValues);
|
|
3680
|
+
if (matchKey === void 0) continue;
|
|
3681
|
+
const newValue = normalizedValues.get(matchKey);
|
|
3682
|
+
const valueStart = match.index + match[0].length - match[2].length;
|
|
3683
|
+
const valueEnd = match.index + match[0].length;
|
|
3684
|
+
replaceTextRange(tNodes, valueStart, valueEnd, newValue);
|
|
3685
|
+
matchedLabels.add(matchKey);
|
|
3686
|
+
filled.push({ label: rawLabel.trim(), value: newValue, row: -1, col: -1 });
|
|
3687
|
+
modified = true;
|
|
3688
|
+
break;
|
|
3689
|
+
}
|
|
3690
|
+
}
|
|
3691
|
+
if (modified) {
|
|
3692
|
+
const newXml = xmlSerializer.serializeToString(doc);
|
|
3693
|
+
zip.file(sectionPath, newXml);
|
|
3694
|
+
}
|
|
3695
|
+
}
|
|
3696
|
+
const unmatched = resolveUnmatched(normalizedValues, matchedLabels, values);
|
|
3697
|
+
const buffer = await zip.generateAsync({ type: "arraybuffer" });
|
|
3698
|
+
return { buffer, filled, unmatched };
|
|
3699
|
+
}
|
|
3700
|
+
/**
 * Returns an element's name with any namespace prefix removed
 * (for example "hp:tbl" becomes "tbl").
 *
 * @param {{tagName?: string, localName?: string}} el - Node to inspect; `tagName`
 *   is preferred and `localName` is used only when `tagName` is falsy.
 * @returns {string} The unprefixed name, or "" when neither property is set.
 */
function localName(el) {
  const qualifiedName = el.tagName || el.localName || "";
  // Drop everything up to and including the first colon.
  return qualifiedName.replace(/^[^:]+:/, "");
}
|
|
3703
|
+
/**
 * Collects every descendant element of `node` whose unprefixed tag name equals
 * `tagLocalName`, in document (pre-order) order. `node` itself is never included.
 *
 * @param {object} node - Root of the subtree to search.
 * @param {string} tagLocalName - Tag name to match, without namespace prefix.
 * @returns {object[]} Matching descendant elements.
 */
function findAllElements(node, tagLocalName) {
  const matches = [];
  // Explicit stack instead of recursion; children are pushed in reverse so that
  // they are popped (and therefore visited) left to right.
  const pending = [node];
  while (pending.length > 0) {
    const current = pending.pop();
    if (current !== node && localName(current) === tagLocalName) {
      matches.push(current);
    }
    const kids = current.childNodes;
    if (!kids) continue;
    for (let idx = kids.length - 1; idx >= 0; idx--) {
      // Only element nodes (nodeType 1) are matched or descended into.
      if (kids[idx].nodeType === 1) pending.push(kids[idx]);
    }
  }
  return matches;
}
|
|
3718
|
+
/**
 * Returns the immediate child elements of `parent` whose unprefixed tag name
 * equals `tagLocalName`. Grandchildren and deeper descendants are not searched.
 *
 * @param {object} parent - Node whose `childNodes` are scanned.
 * @param {string} tagLocalName - Tag name to match, without namespace prefix.
 * @returns {object[]} Matching children in their original order.
 */
function findDirectChildren(parent, tagLocalName) {
  const kids = parent.childNodes;
  if (!kids) return [];
  const matches = [];
  for (let idx = 0; idx < kids.length; idx++) {
    const kid = kids[idx];
    if (kid.nodeType !== 1) continue; // skip text, comments, etc.
    if (localName(kid) !== tagLocalName) continue;
    matches.push(kid);
  }
  return matches;
}
|
|
3730
|
+
/**
 * Reports whether `el` has an ancestor element whose unprefixed tag name is "tbl".
 * `el` itself is not tested, only the nodes reached through `parentNode`.
 *
 * @param {object} el - Node whose ancestor chain is walked.
 * @returns {boolean} True when a "tbl" element ancestor exists.
 */
function isInsideTable(el) {
  for (let ancestor = el.parentNode; ancestor; ancestor = ancestor.parentNode) {
    if (ancestor.nodeType === 1 && localName(ancestor) === "tbl") return true;
  }
  return false;
}
|
|
3738
|
+
/**
 * Builds the plain text of a table cell by walking its subtree.
 *
 * Text nodes contribute their `textContent`; "t", "run", "r", "p" and "subList"
 * elements are descended into; a "tab" element contributes a single space and a
 * "br" element a newline. Any other element (and everything beneath it) is ignored.
 *
 * @param {object} tcEl - Cell element to read.
 * @returns {string} Concatenated cell text.
 */
function extractCellText(tcEl) {
  const descendInto = new Set(["t", "run", "r", "p", "subList"]);
  let collected = "";
  const visit = (node) => {
    const kids = node.childNodes;
    if (!kids) return;
    for (let idx = 0; idx < kids.length; idx++) {
      const kid = kids[idx];
      if (kid.nodeType === 3) {
        collected += kid.textContent || "";
        continue;
      }
      if (kid.nodeType !== 1) continue;
      const tag = localName(kid);
      if (descendInto.has(tag)) {
        visit(kid);
      } else if (tag === "tab") {
        collected += " ";
      } else if (tag === "br") {
        collected += "\n";
      }
    }
  };
  visit(tcEl);
  return collected;
}
|
|
3759
|
+
/**
 * Inserts `text`, followed by a single space, in front of the existing content
 * of the first "t" element found under `tcEl`. That element's children are
 * replaced by one text node holding the combined string.
 *
 * Does nothing when the cell contains no "t" element.
 *
 * @param {object} tcEl - Cell element to modify.
 * @param {string} text - Text to place before the existing content.
 * @returns {void}
 */
function prependCellText(tcEl, text) {
  const [firstT] = findAllElements(tcEl, "t");
  if (!firstT) return;
  // Read the current content before the children are removed.
  const combined = text + " " + (firstT.textContent || "");
  clearChildren(firstT);
  firstT.appendChild(firstT.ownerDocument.createTextNode(combined));
}
|
|
3767
|
+
/**
 * Replaces the text of a table cell with `newValue`.
 *
 * The value is written into the first "p" element found under the cell:
 * into its first "run"/"r" element via `setRunText` (remaining runs are set to
 * ""), or, when that paragraph has no runs, into its first "t" element (remaining
 * "t" elements are emptied). Every later "p" element that still has a parent has
 * its runs set to "" and its "t" elements emptied.
 *
 * Does nothing when the cell contains no "p" element.
 *
 * @param {object} tcEl - Cell element to modify.
 * @param {string} newValue - Text to store in the cell.
 * @returns {void}
 */
function replaceCellText(tcEl, newValue) {
  const [firstParagraph, ...laterParagraphs] = findAllElements(tcEl, "p");
  if (!firstParagraph) return;

  // "run" elements first, then "r" elements — same ordering as a concat of both lookups.
  const runsOf = (paragraph) => [
    ...findAllElements(paragraph, "run"),
    ...findAllElements(paragraph, "r"),
  ];

  const firstRuns = runsOf(firstParagraph);
  if (firstRuns.length > 0) {
    firstRuns.forEach((run, idx) => setRunText(run, idx === 0 ? newValue : ""));
  } else {
    const [firstT, ...otherTs] = findAllElements(firstParagraph, "t");
    if (firstT) {
      clearChildren(firstT);
      firstT.appendChild(firstT.ownerDocument.createTextNode(newValue));
      otherTs.forEach((t) => clearChildren(t));
    }
  }

  for (const paragraph of laterParagraphs) {
    if (!paragraph.parentNode) continue;
    runsOf(paragraph).forEach((run) => setRunText(run, ""));
    findAllElements(paragraph, "t").forEach((t) => clearChildren(t));
  }
}
|
|
6280
|
-
function
|
|
6281
|
-
const
|
|
6282
|
-
|
|
6283
|
-
|
|
6284
|
-
|
|
6285
|
-
|
|
6286
|
-
|
|
6287
|
-
if (value) {
|
|
6288
|
-
fields.push({ label, value, row: -1, col: -1 });
|
|
3797
|
+
/**
 * Sets the text of a run element.
 *
 * The first "t" element under `runEl` receives a single text node holding
 * `text`; every further "t" element under the run is emptied.
 *
 * When the run has no "t" element at all and `text` is non-empty, a new "t"
 * element is created (same namespace and prefix as the run) and appended to the
 * run. Previously the text was silently dropped in that case, so callers that
 * blank all other runs and then report the field as filled would lose the value.
 * NOTE(review): empty HWPX cells presumably serialize as a run without a "t"
 * child — confirm against real documents / the OWPML schema.
 *
 * @param {object} runEl - Run element ("run"/"r") to modify.
 * @param {string} text - Text to store; "" clears the run's text.
 * @returns {void}
 */
function setRunText(runEl, text) {
  const doc = runEl.ownerDocument;
  const [firstT, ...otherTs] = findAllElements(runEl, "t");

  if (!firstT) {
    // Nothing to clear, and no reason to materialize an empty text element.
    if (text === "") return;
    const runName = runEl.tagName || "";
    const colonAt = runName.indexOf(":");
    const tName = colonAt > 0 ? `${runName.slice(0, colonAt)}:t` : "t";
    const created =
      runEl.namespaceURI && typeof doc.createElementNS === "function"
        ? doc.createElementNS(runEl.namespaceURI, tName)
        : doc.createElement(tName);
    created.appendChild(doc.createTextNode(text));
    runEl.appendChild(created);
    return;
  }

  clearChildren(firstT);
  firstT.appendChild(firstT.ownerDocument.createTextNode(text));
  for (const extra of otherTs) {
    clearChildren(extra);
  }
}
|
|
3807
|
+
/**
 * Removes every child node of `el`, one at a time, starting from the first.
 *
 * @param {object} el - Node to empty; must expose `firstChild` and `removeChild`.
 * @returns {void}
 */
function clearChildren(el) {
  for (let child = el.firstChild; child; child = el.firstChild) {
    el.removeChild(child);
  }
}
|
|
3810
|
+
/**
 * Lists the "t" elements under `pEl` together with their text and the position
 * of that text inside the concatenation of all of them.
 *
 * @param {object} pEl - Element whose descendant "t" elements are collected.
 * @returns {{element: object, text: string, offset: number}[]} One entry per
 *   "t" element, in the order returned by `findAllElements`; `offset` is the
 *   summed length of the texts of all preceding entries.
 */
function collectTextNodes(pEl) {
  const tElements = findAllElements(pEl, "t");
  const entries = [];
  let runningOffset = 0;
  for (let idx = 0; idx < tElements.length; idx++) {
    const element = tElements[idx];
    const text = element.textContent || "";
    entries.push({ element, text, offset: runningOffset });
    runningOffset += text.length;
  }
  return entries;
}
|
|
3821
|
+
/**
 * Replaces the characters in `[globalStart, globalEnd)` of the text formed by
 * concatenating `tNodes` with `newValue`, rewriting only the affected elements.
 *
 * For a non-empty range, the first overlapping node receives its untouched
 * prefix + `newValue` + its untouched suffix; every further overlapping node
 * just loses the part of its text that lies inside the range.
 *
 * For an empty range (`globalStart === globalEnd`) the call is a pure
 * insertion: `newValue` is inserted at that position in the first node whose
 * span contains it (at a boundary between two nodes, the earlier node wins).
 * Previously an empty range matched no node at all, so filling a label that had
 * no existing value text wrote nothing.
 *
 * Only the DOM is updated; the `text`/`offset` fields of `tNodes` are not, so
 * the entries are stale after the call.
 *
 * @param {{element: object, text: string, offset: number}[]} tNodes - Entries
 *   in the shape produced by `collectTextNodes`.
 * @param {number} globalStart - Start index (inclusive) in the concatenated text.
 * @param {number} globalEnd - End index (exclusive) in the concatenated text.
 * @param {string} newValue - Replacement text.
 * @returns {void}
 */
function replaceTextRange(tNodes, globalStart, globalEnd, newValue) {
  // Replace all children of the node's element with a single text node.
  const writeText = (node, text) => {
    clearChildren(node.element);
    node.element.appendChild(node.element.ownerDocument.createTextNode(text));
  };

  if (globalStart === globalEnd) {
    if (newValue === "") return; // inserting nothing: leave the DOM untouched
    const host = tNodes.find(
      (node) => globalStart >= node.offset && globalStart <= node.offset + node.text.length
    );
    if (!host) return;
    const at = globalStart - host.offset;
    writeText(host, host.text.slice(0, at) + newValue + host.text.slice(at));
    return;
  }

  let inserted = false;
  for (const node of tNodes) {
    const nodeStart = node.offset;
    const nodeEnd = nodeStart + node.text.length;
    if (nodeEnd <= globalStart || nodeStart >= globalEnd) continue; // no overlap
    const localStart = Math.max(0, globalStart - nodeStart);
    const localEnd = Math.min(node.text.length, globalEnd - nodeStart);
    const kept =
      node.text.slice(0, localStart) + (inserted ? "" : newValue) + node.text.slice(localEnd);
    writeText(node, kept);
    inserted = true;
  }
}
|
|
3845
|
+
/**
 * Lists the "t" elements under a table cell together with their text and the
 * offset of that text inside the concatenation of all of them.
 *
 * The cell variant needs exactly the same traversal as the paragraph variant,
 * so it delegates to `collectTextNodes` instead of duplicating its body.
 *
 * @param {object} tcEl - Cell element whose descendant "t" elements are collected.
 * @returns {{element: object, text: string, offset: number}[]} One entry per
 *   "t" element, in document order.
 */
function collectCellTextNodes(tcEl) {
  return collectTextNodes(tcEl);
}
|
|
3856
|
+
/**
 * Applies the change from `originalFull` to `replacedFull` to the elements
 * behind `tNodes`.
 *
 * - Identical strings: nothing happens.
 * - Exactly one node: its element's children are replaced by one text node
 *   holding `replacedFull`.
 * - Otherwise: the longest common prefix and (non-overlapping) common suffix of
 *   the two strings are trimmed off and the differing middle part is handed to
 *   `replaceTextRange`.
 *
 * @param {{element: object, text: string, offset: number}[]} tNodes - Entries
 *   whose concatenated text is `originalFull`.
 * @param {string} originalFull - Text before the change.
 * @param {string} replacedFull - Text after the change.
 * @returns {void}
 */
function applyTextReplacements(tNodes, originalFull, replacedFull) {
  if (originalFull === replacedFull) return;

  if (tNodes.length === 1) {
    const only = tNodes[0].element;
    clearChildren(only);
    only.appendChild(only.ownerDocument.createTextNode(replacedFull));
    return;
  }

  const sharedLimit = Math.min(originalFull.length, replacedFull.length);

  let prefixLen = 0;
  while (prefixLen < sharedLimit && originalFull[prefixLen] === replacedFull[prefixLen]) {
    prefixLen++;
  }

  // The suffix may not reach back into the already-matched prefix of either string.
  let suffixLen = 0;
  while (
    suffixLen < sharedLimit - prefixLen &&
    originalFull[originalFull.length - 1 - suffixLen] ===
      replacedFull[replacedFull.length - 1 - suffixLen]
  ) {
    suffixLen++;
  }

  const changedText = replacedFull.slice(prefixLen, replacedFull.length - suffixLen);
  replaceTextRange(tNodes, prefixLen, originalFull.length - suffixLen, changedText);
}
|
|
6293
3878
|
|
|
6294
3879
|
// src/hwpx/generator.ts
|
|
6295
|
-
import
|
|
3880
|
+
import JSZip6 from "jszip";
|
|
6296
3881
|
var NS_SECTION = "http://www.hancom.co.kr/hwpml/2011/section";
|
|
6297
3882
|
var NS_PARA = "http://www.hancom.co.kr/hwpml/2011/paragraph";
|
|
6298
3883
|
var NS_HEAD = "http://www.hancom.co.kr/hwpml/2011/head";
|
|
@@ -6319,7 +3904,7 @@ var PARA_LIST = 7;
|
|
|
6319
3904
|
async function markdownToHwpx(markdown) {
|
|
6320
3905
|
const blocks = parseMarkdownToBlocks(markdown);
|
|
6321
3906
|
const sectionXml = blocksToSectionXml(blocks);
|
|
6322
|
-
const zip = new
|
|
3907
|
+
const zip = new JSZip6();
|
|
6323
3908
|
zip.file("mimetype", "application/hwp+zip", { compression: "STORE" });
|
|
6324
3909
|
zip.file("META-INF/container.xml", generateContainerXml());
|
|
6325
3910
|
zip.file("Contents/content.hpf", generateManifest());
|
|
@@ -6679,6 +4264,183 @@ function blocksToSectionXml(blocks) {
|
|
|
6679
4264
|
</hs:sec>`;
|
|
6680
4265
|
}
|
|
6681
4266
|
|
|
4267
|
+
// src/diff/text-diff.ts
|
|
4268
|
+
/**
 * Computes a similarity score between two strings from their edit distance:
 * `1 - levenshtein(a, b) / max(a.length, b.length)`.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} 1 for strictly equal inputs, 0 when either input is falsy
 *   (and they are not equal), otherwise the distance-based score.
 */
function similarity(a, b) {
  if (a === b) return 1;
  if (!(a && b)) return 0;
  const longest = Math.max(a.length, b.length);
  return longest === 0 ? 1 : 1 - levenshtein(a, b) / longest;
}
|
|
4275
|
+
/**
 * Similarity of two strings after both have been passed through `normalize`.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Result of `similarity` on the normalized inputs.
 */
function normalizedSimilarity(a, b) {
  const left = normalize(a);
  const right = normalize(b);
  return similarity(left, right);
}
|
|
4278
|
+
/**
 * Collapses every run of whitespace in `s` to a single space and trims the ends.
 *
 * @param {string} s - Text to normalize.
 * @returns {string} Whitespace-normalized text.
 */
function normalize(s) {
  const collapsed = s.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
4281
|
+
// Combined input length above which the exact edit distance is not computed.
var MAX_LEVENSHTEIN_LEN = 1e4;

/**
 * Edit distance between two strings (insert / delete / substitute, each cost 1),
 * compared by UTF-16 code unit.
 *
 * When `a.length + b.length` exceeds `MAX_LEVENSHTEIN_LEN` an estimate is
 * returned instead: the mismatch rate over the first (up to) 500 positions is
 * extrapolated to the shorter length and added to the length difference.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Exact distance, or the sampled estimate for long inputs.
 */
function levenshtein(a, b) {
  if (a.length + b.length > MAX_LEVENSHTEIN_LEN) {
    const shorterLen = Math.min(a.length, b.length);
    const sampleLen = Math.min(500, a.length, b.length);
    let mismatches = 0;
    for (let k = 0; k < sampleLen; k++) {
      if (a[k] !== b[k]) mismatches++;
    }
    const mismatchRate = sampleLen > 0 ? mismatches / sampleLen : 1;
    return Math.abs(a.length - b.length) + Math.round(shorterLen * mismatchRate);
  }

  // Keep the DP rows as short as possible: index them by the shorter string.
  const [shortStr, longStr] = a.length > b.length ? [b, a] : [a, b];
  const shortLen = shortStr.length;
  const longLen = longStr.length;

  let previousRow = Array.from({ length: shortLen + 1 }, (_, idx) => idx);
  let currentRow = new Array(shortLen + 1);

  for (let j = 1; j <= longLen; j++) {
    currentRow[0] = j;
    for (let i = 1; i <= shortLen; i++) {
      currentRow[i] =
        shortStr[i - 1] === longStr[j - 1]
          ? previousRow[i - 1]
          : 1 + Math.min(previousRow[i - 1], previousRow[i], currentRow[i - 1]);
    }
    // Reuse the two row buffers instead of allocating a new one per column.
    const finishedRow = currentRow;
    currentRow = previousRow;
    previousRow = finishedRow;
  }
  return previousRow[shortLen];
}
|
|
4309
|
+
|
|
4310
|
+
// src/diff/compare.ts
|
|
4311
|
+
var SIMILARITY_THRESHOLD = 0.4;
|
|
4312
|
+
/**
 * Parses two documents concurrently and returns the block-level diff between them.
 *
 * @param {*} bufferA - First document, in any form accepted by `parse`.
 * @param {*} bufferB - Second document, in any form accepted by `parse`.
 * @param {object} [options] - Options forwarded unchanged to both `parse` calls.
 * @returns {Promise<object>} Result of `diffBlocks` on the two parsed block lists.
 * @throws {Error} When either parse result has `success` false; the message
 *   includes that result's `error`.
 */
async function compare(bufferA, bufferB, options) {
  const pendingParses = [bufferA, bufferB].map((source) => parse(source, options));
  const [resultA, resultB] = await Promise.all(pendingParses);
  if (!resultA.success) {
    throw new Error(`\uBB38\uC11CA \uD30C\uC2F1 \uC2E4\uD328: ${resultA.error}`);
  }
  if (!resultB.success) {
    throw new Error(`\uBB38\uC11CB \uD30C\uC2F1 \uC2E4\uD328: ${resultB.error}`);
  }
  return diffBlocks(resultA.blocks, resultB.blocks);
}
|
|
4321
|
+
/**
 * Produces a block-level diff of two block lists.
 *
 * Blocks are first paired up by `alignBlocks`. A pair with both sides present is
 * "unchanged" when `blockSimilarity` is at least 0.99 (reported with similarity 1)
 * and "modified" otherwise (reported with the measured similarity, plus
 * `cellDiffs` when both sides are tables carrying a `table`). A pair with only a
 * left side is "removed", with only a right side "added".
 *
 * @param {object[]} blocksA - Blocks of the "before" document.
 * @param {object[]} blocksB - Blocks of the "after" document.
 * @returns {{stats: {added: number, removed: number, modified: number, unchanged: number}, diffs: object[]}}
 *   Per-type counters and the ordered list of diff entries.
 */
function diffBlocks(blocksA, blocksB) {
  const stats = { added: 0, removed: 0, modified: 0, unchanged: 0 };
  const diffs = [];
  // Append an entry and bump the counter named after its type.
  const record = (entry) => {
    diffs.push(entry);
    stats[entry.type]++;
  };

  for (const [before, after] of alignBlocks(blocksA, blocksB)) {
    if (before && after) {
      const sim = blockSimilarity(before, after);
      if (sim >= 0.99) {
        record({ type: "unchanged", before, after, similarity: 1 });
        continue;
      }
      const entry = { type: "modified", before, after, similarity: sim };
      const bothTables =
        before.type === "table" && after.type === "table" && before.table && after.table;
      if (bothTables) {
        entry.cellDiffs = diffTableCells(before.table, after.table);
      }
      record(entry);
    } else if (before) {
      record({ type: "removed", before });
    } else if (after) {
      record({ type: "added", after });
    }
  }
  return { stats, diffs };
}
|
|
4349
|
+
/**
 * Pairs up the blocks of two lists with an LCS-style dynamic program in which
 * two blocks "match" when `blockSimilarity` is at least `SIMILARITY_THRESHOLD`.
 *
 * Matched blocks become `[blockA, blockB]` pairs; unmatched blocks of `a`
 * become `[blockA, null]` and unmatched blocks of `b` become `[null, blockB]`,
 * with the relative order of both lists preserved. When `a.length * b.length`
 * exceeds 1e7 the table is not built and `fallbackAlign` is used instead.
 *
 * @param {object[]} a - Blocks of the first document.
 * @param {object[]} b - Blocks of the second document.
 * @returns {Array<[object|null, object|null]>} Aligned pairs.
 */
function alignBlocks(a, b) {
  const countA = a.length;
  const countB = b.length;
  if (countA * countB > 1e7) return fallbackAlign(a, b);

  // Similarities are memoized because the backtrack revisits cells of the fill.
  const memo = /* @__PURE__ */ new Map();
  const similarityAt = (idxA, idxB) => {
    const key = `${idxA},${idxB}`;
    let cached = memo.get(key);
    if (cached === void 0) {
      cached = blockSimilarity(a[idxA], b[idxB]);
      memo.set(key, cached);
    }
    return cached;
  };
  const isMatch = (idxA, idxB) => similarityAt(idxA, idxB) >= SIMILARITY_THRESHOLD;

  // table[r][c] = match count for the first r blocks of a and first c blocks of b.
  const table = Array.from({ length: countA + 1 }, () => new Array(countB + 1).fill(0));
  for (let r = 1; r <= countA; r++) {
    for (let c = 1; c <= countB; c++) {
      table[r][c] = isMatch(r - 1, c - 1)
        ? table[r - 1][c - 1] + 1
        : Math.max(table[r - 1][c], table[r][c - 1]);
    }
  }

  // Walk back from the bottom-right corner to recover the matched index pairs.
  const matchedPairs = [];
  let r = countA;
  let c = countB;
  while (r > 0 && c > 0) {
    if (isMatch(r - 1, c - 1) && table[r][c] === table[r - 1][c - 1] + 1) {
      matchedPairs.push([r - 1, c - 1]);
      r--;
      c--;
    } else if (table[r - 1][c] >= table[r][c - 1]) {
      r--;
    } else {
      c--;
    }
  }
  matchedPairs.reverse();

  // Interleave unmatched blocks around each matched pair (a's first, then b's).
  const aligned = [];
  let nextA = 0;
  let nextB = 0;
  for (const [matchA, matchB] of matchedPairs) {
    for (; nextA < matchA; nextA++) aligned.push([a[nextA], null]);
    for (; nextB < matchB; nextB++) aligned.push([null, b[nextB]]);
    aligned.push([a[nextA++], b[nextB++]]);
  }
  for (; nextA < countA; nextA++) aligned.push([a[nextA], null]);
  for (; nextB < countB; nextB++) aligned.push([null, b[nextB]]);
  return aligned;
}
|
|
4397
|
+
/**
 * Pairs blocks purely by position: entry `i` is `[a[i], b[i]]`, with `null`
 * substituted for a missing (or otherwise falsy) side. The result is as long as
 * the longer input.
 *
 * @param {object[]} a - Blocks of the first document.
 * @param {object[]} b - Blocks of the second document.
 * @returns {Array<[object|null, object|null]>} Position-aligned pairs.
 */
function fallbackAlign(a, b) {
  const pairCount = Math.max(a.length, b.length);
  return Array.from({ length: pairCount }, (_, idx) => [a[idx] || null, b[idx] || null]);
}
|
|
4405
|
+
/**
 * Scores how alike two blocks are, in the range produced by the helpers it
 * delegates to.
 *
 * - Different `type`: 0.
 * - Both have a defined `text`: `normalizedSimilarity` of the texts
 *   (a falsy text is treated as "").
 * - Both are tables carrying a `table`: `tableSimilarity` of the tables.
 * - Otherwise (same type, nothing comparable): 1.
 *
 * @param {object} a - First block.
 * @param {object} b - Second block.
 * @returns {number} Similarity score.
 */
function blockSimilarity(a, b) {
  if (a.type !== b.type) return 0;

  const bothHaveText = a.text !== void 0 && b.text !== void 0;
  if (bothHaveText) {
    return normalizedSimilarity(a.text || "", b.text || "");
  }

  const bothAreTables = a.type === "table" && a.table && b.table;
  if (bothAreTables) {
    return tableSimilarity(a.table, b.table);
  }

  // Types are equal here (guarded above), so blocks without comparable
  // content count as identical.
  return 1;
}
|
|
4416
|
+
/**
 * Scores how alike two tables are as a weighted sum: 30% from how close their
 * cell counts (`rows * cols`) are, 70% from the `normalizedSimilarity` of all
 * cell texts joined with spaces.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} a - First table.
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} b - Second table.
 * @returns {number} Weighted similarity score.
 */
function tableSimilarity(a, b) {
  const areaA = a.rows * a.cols;
  const areaB = b.rows * b.cols;
  // The extra 1 keeps the divisor non-zero for two empty tables.
  const dimSim = 1 - Math.abs(areaA - areaB) / Math.max(areaA, areaB, 1);

  const joinedText = (table) => table.cells.flat().map((cell) => cell.text).join(" ");
  const contentSim = normalizedSimilarity(joinedText(a), joinedText(b));

  return dimSim * 0.3 + contentSim * 0.7;
}
|
|
4423
|
+
/**
 * Compares two tables cell by cell over the union of their dimensions.
 *
 * Each grid position yields `{ type, before, after }` where `before`/`after`
 * are the cell texts (undefined when the position does not exist in that table):
 * - present only in `b`: "added"
 * - present only in `a`: "removed"
 * - present in both with equal text: "unchanged"
 * - present in both with different text: "modified"
 * - present in neither (possible when one table is taller and the other wider):
 *   "unchanged" — such a position used to be reported as "added".
 *
 * A position is also treated as absent when the `cells` grid is shorter than
 * `rows`/`cols` claims (ragged or truncated rows) instead of throwing.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} a - "Before" table.
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} b - "After" table.
 * @returns {Array<Array<{type: string, before: (string|undefined), after: (string|undefined)}>>}
 *   A `max(rows) x max(cols)` grid of cell diffs.
 */
function diffTableCells(a, b) {
  // Text at (r, c), or undefined when the position is outside the table or the
  // backing grid has no cell there.
  const textAt = (table, r, c) => {
    if (r >= table.rows || c >= table.cols) return void 0;
    const row = table.cells[r];
    const cell = row ? row[c] : void 0;
    return cell ? cell.text : void 0;
  };

  const classify = (before, after) => {
    if (before === void 0 && after === void 0) return "unchanged";
    if (before === void 0) return "added";
    if (after === void 0) return "removed";
    return before === after ? "unchanged" : "modified";
  };

  const maxRows = Math.max(a.rows, b.rows);
  const maxCols = Math.max(a.cols, b.cols);
  const grid = [];
  for (let r = 0; r < maxRows; r++) {
    const row = [];
    for (let c = 0; c < maxCols; c++) {
      const before = textAt(a, r, c);
      const after = textAt(b, r, c);
      row.push({ type: classify(before, after), before, after });
    }
    grid.push(row);
  }
  return grid;
}
|
|
4443
|
+
|
|
6682
4444
|
// src/index.ts
|
|
6683
4445
|
async function parse(input, options) {
|
|
6684
4446
|
let buffer;
|
|
@@ -6731,6 +4493,18 @@ async function parseHwp(buffer, options) {
|
|
|
6731
4493
|
}
|
|
6732
4494
|
}
|
|
6733
4495
|
async function parsePdf(buffer, options) {
|
|
4496
|
+
let parsePdfDocument;
|
|
4497
|
+
try {
|
|
4498
|
+
const mod = await import("./parser-OIRWPKIQ.js");
|
|
4499
|
+
parsePdfDocument = mod.parsePdfDocument;
|
|
4500
|
+
} catch {
|
|
4501
|
+
return {
|
|
4502
|
+
success: false,
|
|
4503
|
+
fileType: "pdf",
|
|
4504
|
+
error: "PDF \uD30C\uC2F1\uC5D0 pdfjs-dist\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4. \uC124\uCE58: npm install pdfjs-dist",
|
|
4505
|
+
code: "MISSING_DEPENDENCY"
|
|
4506
|
+
};
|
|
4507
|
+
}
|
|
6734
4508
|
try {
|
|
6735
4509
|
const { markdown, blocks, metadata, outline, warnings, isImageBased } = await parsePdfDocument(buffer, options);
|
|
6736
4510
|
return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased };
|
|
@@ -6755,6 +4529,45 @@ async function parseDocx(buffer, options) {
|
|
|
6755
4529
|
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
6756
4530
|
}
|
|
6757
4531
|
}
|
|
4532
|
+
/**
 * Fills a form document with `values` and returns it in the requested format.
 *
 * `input` may be a file path (read with `readFile`), a Node `Buffer`, or a value
 * passed through as-is (presumably an ArrayBuffer — TODO confirm against callers).
 *
 * - "hwpx-preserve": the input must be detected as HWPX by both `detectFormat`
 *   and `detectZipFormat`; the original package is edited in place by `fillHwpx`.
 * - "hwpx": the document is parsed, filled at block level, rendered to Markdown
 *   and converted to a new HWPX via `markdownToHwpx`.
 * - anything else (default "markdown"): the filled Markdown is returned.
 *
 * @param {string|Buffer|*} input - Path, Buffer, or raw buffer of the form.
 * @param {object} values - Label-to-value mapping forwarded to the fill helpers.
 * @param {string} [outputFormat="markdown"] - "markdown", "hwpx" or "hwpx-preserve".
 * @returns {Promise<{output: *, format: string, fill: object}>} The produced
 *   document, the format label, and the fill report.
 * @throws {Error} When "hwpx-preserve" is requested for a non-HWPX input, or
 *   when parsing the form fails.
 */
async function fillForm(input, values, outputFormat = "markdown") {
  let buffer;
  if (typeof input === "string") {
    buffer = toArrayBuffer(await readFile(input));
  } else if (Buffer.isBuffer(input)) {
    buffer = toArrayBuffer(input);
  } else {
    buffer = input;
  }

  if (outputFormat === "hwpx-preserve") {
    // Two-stage detection: the ZIP-level check only runs once the first check
    // says "hwpx"; whichever stage disagrees supplies the format named in the error.
    let detected = detectFormat(buffer);
    if (detected === "hwpx") {
      detected = await detectZipFormat(buffer);
    }
    if (detected !== "hwpx") {
      throw new Error(`hwpx-preserve \uD3EC\uB9F7\uC740 HWPX \uC785\uB825\uB9CC \uC9C0\uC6D0\uD569\uB2C8\uB2E4 (\uAC10\uC9C0\uB41C \uD3EC\uB9F7: ${detected})`);
    }
    const { buffer: output, filled, unmatched } = await fillHwpx(buffer, values);
    return { output, format: "hwpx-preserve", fill: { filled, unmatched } };
  }

  const parsed = await parse(buffer);
  if (!parsed.success) {
    throw new Error(`\uC11C\uC2DD \uD30C\uC2F1 \uC2E4\uD328: ${parsed.error}`);
  }
  const fill = fillFormFields(parsed.blocks, values);
  const markdown = blocksToMarkdown(fill.blocks);
  if (outputFormat !== "hwpx") {
    return { output: markdown, format: "markdown", fill };
  }
  const hwpxBuffer = await markdownToHwpx(markdown);
  return { output: hwpxBuffer, format: "hwpx", fill };
}
|
|
6758
4571
|
export {
|
|
6759
4572
|
VERSION,
|
|
6760
4573
|
blocksToMarkdown,
|
|
@@ -6763,7 +4576,11 @@ export {
|
|
|
6763
4576
|
detectZipFormat,
|
|
6764
4577
|
diffBlocks,
|
|
6765
4578
|
extractFormFields,
|
|
4579
|
+
fillForm,
|
|
4580
|
+
fillFormFields,
|
|
4581
|
+
fillHwpx,
|
|
6766
4582
|
isHwpxFile,
|
|
4583
|
+
isLabelCell,
|
|
6767
4584
|
isOldHwpFile,
|
|
6768
4585
|
isPdfFile,
|
|
6769
4586
|
isZipFile,
|