kordoc 2.2.4 → 2.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -4
- package/dist/chunk-FCQEF2ZM.js +457 -0
- package/dist/chunk-FCQEF2ZM.js.map +1 -0
- package/dist/chunk-HXUCZ2IL.cjs +450 -0
- package/dist/chunk-HXUCZ2IL.cjs.map +1 -0
- package/dist/chunk-MUOQXDZ4.cjs +33 -0
- package/dist/chunk-MUOQXDZ4.cjs.map +1 -0
- package/dist/chunk-NL5XLN5R.js +450 -0
- package/dist/chunk-NL5XLN5R.js.map +1 -0
- package/dist/{chunk-SY2RFVLW.js → chunk-RF6UJXR3.js} +135 -2805
- package/dist/chunk-RF6UJXR3.js.map +1 -0
- package/dist/chunk-SBVRCJFH.js +33 -0
- package/dist/chunk-SBVRCJFH.js.map +1 -0
- package/dist/cli.js +12 -7
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +294 -3084
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +77 -2817
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +15 -9
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-3C7UGGEK.cjs +7 -0
- package/dist/page-range-3C7UGGEK.cjs.map +1 -0
- package/dist/page-range-H35FN3OQ.js +7 -0
- package/dist/page-range-H35FN3OQ.js.map +1 -0
- package/dist/parser-43IAQ5KE.js +2278 -0
- package/dist/parser-43IAQ5KE.js.map +1 -0
- package/dist/parser-AMP7MAOH.js +2279 -0
- package/dist/parser-AMP7MAOH.js.map +1 -0
- package/dist/parser-KOWPTDJU.cjs +2278 -0
- package/dist/parser-KOWPTDJU.cjs.map +1 -0
- package/dist/provider-WPIYEALY.js +37 -0
- package/dist/provider-WPIYEALY.js.map +1 -0
- package/dist/provider-YN2SSK4X.cjs +37 -0
- package/dist/provider-YN2SSK4X.cjs.map +1 -0
- package/dist/{watch-5P7DJ3HG.js → watch-IUQXOXW3.js} +6 -4
- package/dist/{watch-5P7DJ3HG.js.map → watch-IUQXOXW3.js.map} +1 -1
- package/package.json +1 -1
- package/dist/chunk-SY2RFVLW.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1,147 +1,31 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
if (from && typeof from === "object" || typeof from === "function") {
|
|
17
|
-
for (let key of __getOwnPropNames(from))
|
|
18
|
-
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
19
|
-
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
20
|
-
}
|
|
21
|
-
return to;
|
|
22
|
-
};
|
|
23
|
-
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
24
|
-
// If the importer is in node compatibility mode or this is not an ESM
|
|
25
|
-
// file that has been converted to a CommonJS file using a Babel-
|
|
26
|
-
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
27
|
-
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
28
|
-
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
29
|
-
mod
|
|
30
|
-
));
|
|
31
|
-
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
1
|
+
"use strict";Object.defineProperty(exports, "__esModule", {value: true}); function _interopRequireWildcard(obj) { if (obj && obj.__esModule) { return obj; } else { var newObj = {}; if (obj != null) { for (var key in obj) { if (Object.prototype.hasOwnProperty.call(obj, key)) { newObj[key] = obj[key]; } } } newObj.default = obj; return newObj; } } function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { default: obj }; } function _nullishCoalesce(lhs, rhsFn) { if (lhs != null) { return lhs; } else { return rhsFn(); } } function _optionalChain(ops) { let lastAccessLHS = undefined; let value = ops[0]; let i = 1; while (i < ops.length) { const op = ops[i]; const fn = ops[i + 1]; i += 2; if ((op === 'optionalAccess' || op === 'optionalCall') && value == null) { return undefined; } if (op === 'access' || op === 'optionalAccess') { lastAccessLHS = value; value = fn(value); } else if (op === 'call' || op === 'optionalCall') { value = fn((...args) => value.call(lastAccessLHS, ...args)); lastAccessLHS = undefined; } } return value; }
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
32
16
|
|
|
33
|
-
// src/page-range.ts
|
|
34
|
-
var page_range_exports = {};
|
|
35
|
-
__export(page_range_exports, {
|
|
36
|
-
parsePageRange: () => parsePageRange
|
|
37
|
-
});
|
|
38
|
-
function parsePageRange(spec, maxPages) {
|
|
39
|
-
const result = /* @__PURE__ */ new Set();
|
|
40
|
-
if (maxPages <= 0) return result;
|
|
41
|
-
if (Array.isArray(spec)) {
|
|
42
|
-
for (const n of spec) {
|
|
43
|
-
const page = Math.round(n);
|
|
44
|
-
if (page >= 1 && page <= maxPages) result.add(page);
|
|
45
|
-
}
|
|
46
|
-
return result;
|
|
47
|
-
}
|
|
48
|
-
if (typeof spec !== "string" || spec.trim() === "") return result;
|
|
49
|
-
const parts = spec.split(",");
|
|
50
|
-
for (const part of parts) {
|
|
51
|
-
const trimmed = part.trim();
|
|
52
|
-
if (!trimmed) continue;
|
|
53
|
-
const rangeMatch = trimmed.match(/^(\d+)\s*-\s*(\d+)$/);
|
|
54
|
-
if (rangeMatch) {
|
|
55
|
-
const start = Math.max(1, parseInt(rangeMatch[1], 10));
|
|
56
|
-
const end = Math.min(maxPages, parseInt(rangeMatch[2], 10));
|
|
57
|
-
for (let i = start; i <= end; i++) result.add(i);
|
|
58
|
-
} else {
|
|
59
|
-
const page = parseInt(trimmed, 10);
|
|
60
|
-
if (!isNaN(page) && page >= 1 && page <= maxPages) result.add(page);
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
|
-
return result;
|
|
64
|
-
}
|
|
65
|
-
var init_page_range = __esm({
|
|
66
|
-
"src/page-range.ts"() {
|
|
67
|
-
"use strict";
|
|
68
|
-
}
|
|
69
|
-
});
|
|
70
17
|
|
|
71
|
-
|
|
72
|
-
var
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
|
|
77
|
-
const blocks = [];
|
|
78
|
-
for (let i = 1; i <= effectivePageCount; i++) {
|
|
79
|
-
if (pageFilter && !pageFilter.has(i)) continue;
|
|
80
|
-
const page = await doc.getPage(i);
|
|
81
|
-
try {
|
|
82
|
-
const imageData = await renderPageToPng(page);
|
|
83
|
-
const text = await provider(imageData, i, "image/png");
|
|
84
|
-
if (text.trim()) {
|
|
85
|
-
blocks.push({ type: "paragraph", text: text.trim(), pageNumber: i });
|
|
86
|
-
}
|
|
87
|
-
} catch {
|
|
88
|
-
blocks.push({ type: "paragraph", text: `[OCR \uC2E4\uD328: \uD398\uC774\uC9C0 ${i}]` });
|
|
89
|
-
}
|
|
90
|
-
}
|
|
91
|
-
return blocks;
|
|
92
|
-
}
|
|
93
|
-
async function renderPageToPng(page) {
|
|
94
|
-
let createCanvas;
|
|
95
|
-
try {
|
|
96
|
-
const canvasModule = await import("canvas");
|
|
97
|
-
createCanvas = canvasModule.createCanvas;
|
|
98
|
-
} catch {
|
|
99
|
-
throw new Error("OCR\uC744 \uC0AC\uC6A9\uD558\uB824\uBA74 'canvas' \uD328\uD0A4\uC9C0\uB97C \uC124\uCE58\uD558\uC138\uC694: npm install canvas");
|
|
100
|
-
}
|
|
101
|
-
const scale = 2;
|
|
102
|
-
const viewport = page.getViewport({ scale });
|
|
103
|
-
const canvas = createCanvas(Math.floor(viewport.width), Math.floor(viewport.height));
|
|
104
|
-
const ctx = canvas.getContext("2d");
|
|
105
|
-
await page.render({ canvasContext: ctx, viewport }).promise;
|
|
106
|
-
return new Uint8Array(canvas.toBuffer("image/png"));
|
|
107
|
-
}
|
|
108
|
-
var init_provider = __esm({
|
|
109
|
-
"src/ocr/provider.ts"() {
|
|
110
|
-
"use strict";
|
|
111
|
-
}
|
|
112
|
-
});
|
|
18
|
+
|
|
19
|
+
var _chunkHXUCZ2ILcjs = require('./chunk-HXUCZ2IL.cjs');
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
var _chunkMUOQXDZ4cjs = require('./chunk-MUOQXDZ4.cjs');
|
|
113
23
|
|
|
114
24
|
// src/index.ts
|
|
115
|
-
var
|
|
116
|
-
__export(index_exports, {
|
|
117
|
-
VERSION: () => VERSION,
|
|
118
|
-
blocksToMarkdown: () => blocksToMarkdown,
|
|
119
|
-
compare: () => compare,
|
|
120
|
-
detectFormat: () => detectFormat,
|
|
121
|
-
detectZipFormat: () => detectZipFormat,
|
|
122
|
-
diffBlocks: () => diffBlocks,
|
|
123
|
-
extractFormFields: () => extractFormFields,
|
|
124
|
-
fillForm: () => fillForm,
|
|
125
|
-
fillFormFields: () => fillFormFields,
|
|
126
|
-
fillHwpx: () => fillHwpx,
|
|
127
|
-
isHwpxFile: () => isHwpxFile,
|
|
128
|
-
isLabelCell: () => isLabelCell,
|
|
129
|
-
isOldHwpFile: () => isOldHwpFile,
|
|
130
|
-
isPdfFile: () => isPdfFile,
|
|
131
|
-
isZipFile: () => isZipFile,
|
|
132
|
-
markdownToHwpx: () => markdownToHwpx,
|
|
133
|
-
parse: () => parse,
|
|
134
|
-
parseDocx: () => parseDocx,
|
|
135
|
-
parseHwp: () => parseHwp,
|
|
136
|
-
parseHwpx: () => parseHwpx,
|
|
137
|
-
parsePdf: () => parsePdf,
|
|
138
|
-
parseXlsx: () => parseXlsx
|
|
139
|
-
});
|
|
140
|
-
module.exports = __toCommonJS(index_exports);
|
|
141
|
-
var import_promises = require("fs/promises");
|
|
25
|
+
var _promises = require('fs/promises');
|
|
142
26
|
|
|
143
27
|
// src/detect.ts
|
|
144
|
-
var
|
|
28
|
+
var _jszip = require('jszip'); var _jszip2 = _interopRequireDefault(_jszip);
|
|
145
29
|
function magicBytes(buffer) {
|
|
146
30
|
return new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
|
|
147
31
|
}
|
|
@@ -169,453 +53,22 @@ function detectFormat(buffer) {
|
|
|
169
53
|
}
|
|
170
54
|
async function detectZipFormat(buffer) {
|
|
171
55
|
try {
|
|
172
|
-
const zip = await
|
|
56
|
+
const zip = await _jszip2.default.loadAsync(buffer);
|
|
173
57
|
if (zip.file("xl/workbook.xml")) return "xlsx";
|
|
174
58
|
if (zip.file("word/document.xml")) return "docx";
|
|
175
59
|
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
|
|
176
60
|
const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
|
|
177
61
|
if (hasSection) return "hwpx";
|
|
178
62
|
return "unknown";
|
|
179
|
-
} catch {
|
|
63
|
+
} catch (e2) {
|
|
180
64
|
return "unknown";
|
|
181
65
|
}
|
|
182
66
|
}
|
|
183
67
|
|
|
184
68
|
// src/hwpx/parser.ts
|
|
185
|
-
var import_jszip2 = __toESM(require("jszip"), 1);
|
|
186
|
-
var import_zlib = require("zlib");
|
|
187
|
-
var import_xmldom = require("@xmldom/xmldom");
|
|
188
|
-
|
|
189
|
-
// src/utils.ts
|
|
190
|
-
var VERSION = true ? "2.2.4" : "0.0.0-dev";
|
|
191
|
-
function toArrayBuffer(buf) {
|
|
192
|
-
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
193
|
-
return buf.buffer;
|
|
194
|
-
}
|
|
195
|
-
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
196
|
-
}
|
|
197
|
-
var KordocError = class extends Error {
|
|
198
|
-
constructor(message) {
|
|
199
|
-
super(message);
|
|
200
|
-
this.name = "KordocError";
|
|
201
|
-
}
|
|
202
|
-
};
|
|
203
|
-
function isPathTraversal(name) {
|
|
204
|
-
if (name.includes("\0")) return true;
|
|
205
|
-
const normalized = name.replace(/\\/g, "/");
|
|
206
|
-
const segments = normalized.split("/");
|
|
207
|
-
return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
208
|
-
}
|
|
209
|
-
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
210
|
-
try {
|
|
211
|
-
const data = new DataView(buffer);
|
|
212
|
-
const len = buffer.byteLength;
|
|
213
|
-
let eocdOffset = -1;
|
|
214
|
-
for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
|
|
215
|
-
if (data.getUint32(i, true) === 101010256) {
|
|
216
|
-
eocdOffset = i;
|
|
217
|
-
break;
|
|
218
|
-
}
|
|
219
|
-
}
|
|
220
|
-
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
221
|
-
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
222
|
-
if (entryCount > maxEntries) {
|
|
223
|
-
throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
|
|
224
|
-
}
|
|
225
|
-
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
226
|
-
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
227
|
-
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
228
|
-
let totalUncompressed = 0;
|
|
229
|
-
let pos = cdOffset;
|
|
230
|
-
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
231
|
-
if (data.getUint32(pos, true) !== 33639248) break;
|
|
232
|
-
totalUncompressed += data.getUint32(pos + 24, true);
|
|
233
|
-
const nameLen = data.getUint16(pos + 28, true);
|
|
234
|
-
const extraLen = data.getUint16(pos + 30, true);
|
|
235
|
-
const commentLen = data.getUint16(pos + 32, true);
|
|
236
|
-
pos += 46 + nameLen + extraLen + commentLen;
|
|
237
|
-
}
|
|
238
|
-
if (totalUncompressed > maxUncompressedSize) {
|
|
239
|
-
throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
|
|
240
|
-
}
|
|
241
|
-
return { totalUncompressed, entryCount };
|
|
242
|
-
} catch (err) {
|
|
243
|
-
if (err instanceof KordocError) throw err;
|
|
244
|
-
return { totalUncompressed: 0, entryCount: 0 };
|
|
245
|
-
}
|
|
246
|
-
}
|
|
247
|
-
function stripDtd(xml) {
|
|
248
|
-
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
249
|
-
}
|
|
250
|
-
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
251
|
-
function sanitizeHref(href) {
|
|
252
|
-
const trimmed = href.trim();
|
|
253
|
-
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
254
|
-
return trimmed;
|
|
255
|
-
}
|
|
256
|
-
function safeMin(arr) {
|
|
257
|
-
let min = Infinity;
|
|
258
|
-
for (let i = 0; i < arr.length; i++) if (arr[i] < min) min = arr[i];
|
|
259
|
-
return min;
|
|
260
|
-
}
|
|
261
|
-
function safeMax(arr) {
|
|
262
|
-
let max = -Infinity;
|
|
263
|
-
for (let i = 0; i < arr.length; i++) if (arr[i] > max) max = arr[i];
|
|
264
|
-
return max;
|
|
265
|
-
}
|
|
266
|
-
function classifyError(err) {
|
|
267
|
-
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
268
|
-
const msg = err.message;
|
|
269
|
-
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
270
|
-
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
271
|
-
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
272
|
-
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
273
|
-
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
274
|
-
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
275
|
-
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
276
|
-
return "PARSE_ERROR";
|
|
277
|
-
}
|
|
278
|
-
|
|
279
|
-
// src/table/builder.ts
|
|
280
|
-
var MAX_COLS = 200;
|
|
281
|
-
var MAX_ROWS = 1e4;
|
|
282
|
-
function buildTable(rows) {
|
|
283
|
-
if (rows.length > MAX_ROWS) rows = rows.slice(0, MAX_ROWS);
|
|
284
|
-
const numRows = rows.length;
|
|
285
|
-
const hasAddr = rows.some((row) => row.some((c) => c.colAddr !== void 0 && c.rowAddr !== void 0));
|
|
286
|
-
if (hasAddr) return buildTableDirect(rows, numRows);
|
|
287
|
-
let maxCols = 0;
|
|
288
|
-
const tempOccupied = Array.from({ length: numRows }, () => []);
|
|
289
|
-
for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
|
|
290
|
-
let colIdx = 0;
|
|
291
|
-
for (const cell of rows[rowIdx]) {
|
|
292
|
-
while (colIdx < MAX_COLS && tempOccupied[rowIdx][colIdx]) colIdx++;
|
|
293
|
-
if (colIdx >= MAX_COLS) break;
|
|
294
|
-
for (let r = rowIdx; r < Math.min(rowIdx + cell.rowSpan, numRows); r++) {
|
|
295
|
-
for (let c = colIdx; c < Math.min(colIdx + cell.colSpan, MAX_COLS); c++) {
|
|
296
|
-
tempOccupied[r][c] = true;
|
|
297
|
-
}
|
|
298
|
-
}
|
|
299
|
-
colIdx += cell.colSpan;
|
|
300
|
-
if (colIdx > maxCols) maxCols = colIdx;
|
|
301
|
-
}
|
|
302
|
-
}
|
|
303
|
-
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
304
|
-
const grid = Array.from(
|
|
305
|
-
{ length: numRows },
|
|
306
|
-
() => Array.from({ length: maxCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
|
|
307
|
-
);
|
|
308
|
-
const occupied = Array.from({ length: numRows }, () => Array(maxCols).fill(false));
|
|
309
|
-
for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
|
|
310
|
-
let colIdx = 0;
|
|
311
|
-
let cellIdx = 0;
|
|
312
|
-
while (colIdx < maxCols && cellIdx < rows[rowIdx].length) {
|
|
313
|
-
while (colIdx < maxCols && occupied[rowIdx][colIdx]) colIdx++;
|
|
314
|
-
if (colIdx >= maxCols) break;
|
|
315
|
-
const cell = rows[rowIdx][cellIdx];
|
|
316
|
-
grid[rowIdx][colIdx] = {
|
|
317
|
-
text: cell.text.trim(),
|
|
318
|
-
colSpan: cell.colSpan,
|
|
319
|
-
rowSpan: cell.rowSpan
|
|
320
|
-
};
|
|
321
|
-
for (let r = rowIdx; r < Math.min(rowIdx + cell.rowSpan, numRows); r++) {
|
|
322
|
-
for (let c = colIdx; c < Math.min(colIdx + cell.colSpan, maxCols); c++) {
|
|
323
|
-
occupied[r][c] = true;
|
|
324
|
-
}
|
|
325
|
-
}
|
|
326
|
-
colIdx += cell.colSpan;
|
|
327
|
-
cellIdx++;
|
|
328
|
-
}
|
|
329
|
-
}
|
|
330
|
-
return trimAndReturn(grid, numRows, maxCols);
|
|
331
|
-
}
|
|
332
|
-
function buildTableDirect(rows, numRows) {
|
|
333
|
-
let maxCols = 0;
|
|
334
|
-
for (const row of rows) {
|
|
335
|
-
for (const cell of row) {
|
|
336
|
-
const end = (cell.colAddr ?? 0) + cell.colSpan;
|
|
337
|
-
if (end > maxCols) maxCols = end;
|
|
338
|
-
}
|
|
339
|
-
}
|
|
340
|
-
if (maxCols > MAX_COLS) maxCols = MAX_COLS;
|
|
341
|
-
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
342
|
-
const grid = Array.from(
|
|
343
|
-
{ length: numRows },
|
|
344
|
-
() => Array.from({ length: maxCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
|
|
345
|
-
);
|
|
346
|
-
for (const row of rows) {
|
|
347
|
-
for (const cell of row) {
|
|
348
|
-
const r = cell.rowAddr ?? 0;
|
|
349
|
-
const c = cell.colAddr ?? 0;
|
|
350
|
-
if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
|
|
351
|
-
grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
|
|
352
|
-
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
353
|
-
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
354
|
-
if (dr === 0 && dc === 0) continue;
|
|
355
|
-
if (r + dr < numRows && c + dc < maxCols) {
|
|
356
|
-
grid[r + dr][c + dc] = { text: "", colSpan: 1, rowSpan: 1 };
|
|
357
|
-
}
|
|
358
|
-
}
|
|
359
|
-
}
|
|
360
|
-
}
|
|
361
|
-
}
|
|
362
|
-
return trimAndReturn(grid, numRows, maxCols);
|
|
363
|
-
}
|
|
364
|
-
function trimAndReturn(grid, numRows, maxCols) {
|
|
365
|
-
let effectiveCols = maxCols;
|
|
366
|
-
while (effectiveCols > 0) {
|
|
367
|
-
const colEmpty = grid.every((row) => !row[effectiveCols - 1]?.text?.trim());
|
|
368
|
-
if (!colEmpty) break;
|
|
369
|
-
effectiveCols--;
|
|
370
|
-
}
|
|
371
|
-
if (effectiveCols < maxCols && effectiveCols > 0) {
|
|
372
|
-
const trimmed = grid.map((row) => row.slice(0, effectiveCols));
|
|
373
|
-
return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
|
|
374
|
-
}
|
|
375
|
-
return { rows: numRows, cols: maxCols, cells: grid, hasHeader: numRows > 1 };
|
|
376
|
-
}
|
|
377
|
-
function convertTableToText(rows) {
|
|
378
|
-
return rows.map(
|
|
379
|
-
(row) => row.map((c) => c.text.trim().replace(/\n/g, " ").replace(/\|/g, "\\|")).filter(Boolean).join(" / ")
|
|
380
|
-
).filter(Boolean).join("\n");
|
|
381
|
-
}
|
|
382
|
-
function escapeGfm(text) {
|
|
383
|
-
return text.replace(/~/g, "\\~");
|
|
384
|
-
}
|
|
385
|
-
var HWP_SHAPE_ALT_TEXT_RE = /(?:모서리가 둥근 |둥근 )?(?:사각형|직사각형|정사각형|원|타원|삼각형|이등변 삼각형|직각 삼각형|선|직선|곡선|화살표|굵은 화살표|이중 화살표|오각형|육각형|팔각형|별|[4-8]점별|십자|십자형|구름|구름형|마름모|도넛|평행사변형|사다리꼴|부채꼴|호|반원|물결|번개|하트|빗금|블록 화살표|수식|표|그림|개체|그리기\s?개체|묶음\s?개체|글상자|수식\s?개체|OLE\s?개체)\s?입니다\.?/g;
|
|
386
|
-
function sanitizeText(text) {
|
|
387
|
-
let result = text.replace(/[\u{F0000}-\u{FFFFD}]/gu, "").replace(HWP_SHAPE_ALT_TEXT_RE, "").replace(/ +/g, " ").trim();
|
|
388
|
-
if (result.length <= 30 && result.includes(" ")) {
|
|
389
|
-
const tokens = result.split(" ");
|
|
390
|
-
const koreanSingleCharCount = tokens.filter((t) => t.length === 1 && /[\uAC00-\uD7AF\u3131-\u318E]/.test(t)).length;
|
|
391
|
-
if (tokens.length >= 3 && koreanSingleCharCount / tokens.length >= 0.7) {
|
|
392
|
-
result = tokens.join("");
|
|
393
|
-
}
|
|
394
|
-
}
|
|
395
|
-
return result;
|
|
396
|
-
}
|
|
397
|
-
function flattenLayoutTables(blocks) {
|
|
398
|
-
const result = [];
|
|
399
|
-
for (const block of blocks) {
|
|
400
|
-
if (block.type !== "table" || !block.table) {
|
|
401
|
-
result.push(block);
|
|
402
|
-
continue;
|
|
403
|
-
}
|
|
404
|
-
const { rows: numRows, cols: numCols, cells } = block.table;
|
|
405
|
-
if (numRows === 1 && numCols === 1) {
|
|
406
|
-
result.push(block);
|
|
407
|
-
continue;
|
|
408
|
-
}
|
|
409
|
-
if (numRows <= 3) {
|
|
410
|
-
let totalNewlines = 0;
|
|
411
|
-
let totalTextLen = 0;
|
|
412
|
-
for (let r = 0; r < numRows; r++) {
|
|
413
|
-
for (let c = 0; c < numCols; c++) {
|
|
414
|
-
const t = cells[r]?.[c]?.text || "";
|
|
415
|
-
totalNewlines += (t.match(/\n/g) || []).length;
|
|
416
|
-
totalTextLen += t.length;
|
|
417
|
-
}
|
|
418
|
-
}
|
|
419
|
-
if (totalNewlines > 5 || numRows <= 2 && totalTextLen > 300) {
|
|
420
|
-
for (let r = 0; r < numRows; r++) {
|
|
421
|
-
for (let c = 0; c < numCols; c++) {
|
|
422
|
-
const cellText = cells[r]?.[c]?.text?.trim();
|
|
423
|
-
if (!cellText) continue;
|
|
424
|
-
for (const line of cellText.split("\n")) {
|
|
425
|
-
const trimmed = line.trim();
|
|
426
|
-
if (!trimmed) continue;
|
|
427
|
-
result.push({ type: "paragraph", text: trimmed, pageNumber: block.pageNumber });
|
|
428
|
-
}
|
|
429
|
-
}
|
|
430
|
-
}
|
|
431
|
-
continue;
|
|
432
|
-
}
|
|
433
|
-
}
|
|
434
|
-
result.push(block);
|
|
435
|
-
}
|
|
436
|
-
return result;
|
|
437
|
-
}
|
|
438
|
-
function blocksToMarkdown(blocks) {
|
|
439
|
-
const lines = [];
|
|
440
|
-
for (let i = 0; i < blocks.length; i++) {
|
|
441
|
-
const block = blocks[i];
|
|
442
|
-
if (block.type === "heading" && block.text) {
|
|
443
|
-
const prefix = "#".repeat(Math.min(block.level || 2, 6));
|
|
444
|
-
const headingText = sanitizeText(block.text);
|
|
445
|
-
if (headingText) lines.push("", `${prefix} ${headingText}`, "");
|
|
446
|
-
continue;
|
|
447
|
-
}
|
|
448
|
-
if (block.type === "image" && block.text) {
|
|
449
|
-
lines.push("", ``, "");
|
|
450
|
-
continue;
|
|
451
|
-
}
|
|
452
|
-
if (block.type === "separator") {
|
|
453
|
-
lines.push("", "---", "");
|
|
454
|
-
continue;
|
|
455
|
-
}
|
|
456
|
-
if (block.type === "list" && block.text) {
|
|
457
|
-
const listText = sanitizeText(block.text);
|
|
458
|
-
if (!listText) continue;
|
|
459
|
-
const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(listText);
|
|
460
|
-
const prefix = alreadyNumbered ? "" : block.listType === "ordered" ? "1. " : "- ";
|
|
461
|
-
lines.push(`${prefix}${listText}`);
|
|
462
|
-
if (block.children) {
|
|
463
|
-
for (const child of block.children) {
|
|
464
|
-
const childPrefix = child.listType === "ordered" ? "1." : "-";
|
|
465
|
-
lines.push(` ${childPrefix} ${child.text || ""}`);
|
|
466
|
-
}
|
|
467
|
-
}
|
|
468
|
-
continue;
|
|
469
|
-
}
|
|
470
|
-
if (block.type === "paragraph" && block.text) {
|
|
471
|
-
let text = sanitizeText(block.text);
|
|
472
|
-
if (!text) continue;
|
|
473
|
-
if (/^\[별표\s*\d+/.test(text)) {
|
|
474
|
-
const nextBlock = blocks[i + 1];
|
|
475
|
-
if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
|
|
476
|
-
lines.push("", `## ${text} ${nextBlock.text}`, "");
|
|
477
|
-
i++;
|
|
478
|
-
} else {
|
|
479
|
-
lines.push("", `## ${text}`, "");
|
|
480
|
-
}
|
|
481
|
-
continue;
|
|
482
|
-
}
|
|
483
|
-
if (/^\([^)]*조[^)]*관련\)$/.test(text)) {
|
|
484
|
-
lines.push(`*${text}*`, "");
|
|
485
|
-
continue;
|
|
486
|
-
}
|
|
487
|
-
if (block.href) {
|
|
488
|
-
const href = sanitizeHref(block.href);
|
|
489
|
-
if (href) text = `[${text}](${href})`;
|
|
490
|
-
}
|
|
491
|
-
if (block.footnoteText) {
|
|
492
|
-
text += ` (\uC8FC: ${block.footnoteText})`;
|
|
493
|
-
}
|
|
494
|
-
lines.push(escapeGfm(text), "");
|
|
495
|
-
} else if (block.type === "table" && block.table) {
|
|
496
|
-
if (lines.length > 0 && lines[lines.length - 1] !== "") {
|
|
497
|
-
lines.push("");
|
|
498
|
-
}
|
|
499
|
-
const tableMd = tableToMarkdown(block.table);
|
|
500
|
-
if (tableMd) {
|
|
501
|
-
lines.push(tableMd);
|
|
502
|
-
lines.push("");
|
|
503
|
-
}
|
|
504
|
-
}
|
|
505
|
-
}
|
|
506
|
-
return lines.join("\n").trim();
|
|
507
|
-
}
|
|
508
|
-
function hasMergedCells(table) {
|
|
509
|
-
for (const row of table.cells) {
|
|
510
|
-
for (const cell of row) {
|
|
511
|
-
if (cell.colSpan > 1 || cell.rowSpan > 1) return true;
|
|
512
|
-
}
|
|
513
|
-
}
|
|
514
|
-
return false;
|
|
515
|
-
}
|
|
516
|
-
function tableToHtml(table) {
|
|
517
|
-
const { cells, rows: numRows, cols: numCols } = table;
|
|
518
|
-
const skip = /* @__PURE__ */ new Set();
|
|
519
|
-
const lines = ["<table>"];
|
|
520
|
-
for (let r = 0; r < numRows; r++) {
|
|
521
|
-
const tag = r === 0 ? "th" : "td";
|
|
522
|
-
const rowHtml = [];
|
|
523
|
-
for (let c = 0; c < numCols; c++) {
|
|
524
|
-
if (skip.has(`${r},${c}`)) continue;
|
|
525
|
-
const cell = cells[r]?.[c];
|
|
526
|
-
if (!cell) continue;
|
|
527
|
-
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
528
|
-
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
529
|
-
if (dr === 0 && dc === 0) continue;
|
|
530
|
-
if (r + dr < numRows && c + dc < numCols) skip.add(`${r + dr},${c + dc}`);
|
|
531
|
-
}
|
|
532
|
-
}
|
|
533
|
-
const text = sanitizeText(cell.text).replace(/\n/g, "<br>");
|
|
534
|
-
const attrs = [];
|
|
535
|
-
if (cell.colSpan > 1) attrs.push(`colspan="${cell.colSpan}"`);
|
|
536
|
-
if (cell.rowSpan > 1) attrs.push(`rowspan="${cell.rowSpan}"`);
|
|
537
|
-
const attrStr = attrs.length ? " " + attrs.join(" ") : "";
|
|
538
|
-
rowHtml.push(`<${tag}${attrStr}>${text}</${tag}>`);
|
|
539
|
-
}
|
|
540
|
-
if (rowHtml.length) lines.push(`<tr>${rowHtml.join("")}</tr>`);
|
|
541
|
-
}
|
|
542
|
-
lines.push("</table>");
|
|
543
|
-
return lines.join("\n");
|
|
544
|
-
}
|
|
545
|
-
function tableToMarkdown(table) {
|
|
546
|
-
if (table.rows === 0 || table.cols === 0) return "";
|
|
547
|
-
const { cells, rows: numRows, cols: numCols } = table;
|
|
548
|
-
if (hasMergedCells(table)) return tableToHtml(table);
|
|
549
|
-
if (numRows === 1 && numCols === 1) {
|
|
550
|
-
const content = sanitizeText(cells[0][0].text);
|
|
551
|
-
if (!content) return "";
|
|
552
|
-
return content.split(/\n/).map((line) => {
|
|
553
|
-
const trimmed = line.trim();
|
|
554
|
-
if (!trimmed) return "";
|
|
555
|
-
if (/^\d+\.\s/.test(trimmed)) return `**${escapeGfm(trimmed)}**`;
|
|
556
|
-
if (/^[가-힣]\.\s/.test(trimmed)) return ` ${escapeGfm(trimmed)}`;
|
|
557
|
-
return escapeGfm(trimmed);
|
|
558
|
-
}).filter(Boolean).join("\n");
|
|
559
|
-
}
|
|
560
|
-
if (numCols === 1 && numRows >= 2) {
|
|
561
|
-
return cells.map((row) => escapeGfm(sanitizeText(row[0].text)).replace(/\n/g, " ")).filter(Boolean).join("\n");
|
|
562
|
-
}
|
|
563
|
-
const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
|
|
564
|
-
const skip = /* @__PURE__ */ new Set();
|
|
565
|
-
for (let r = 0; r < numRows; r++) {
|
|
566
|
-
for (let c = 0; c < numCols; c++) {
|
|
567
|
-
if (skip.has(`${r},${c}`)) continue;
|
|
568
|
-
const cell = cells[r]?.[c];
|
|
569
|
-
if (!cell) continue;
|
|
570
|
-
display[r][c] = escapeGfm(sanitizeText(cell.text)).replace(/\|/g, "\\|").replace(/\n/g, "<br>");
|
|
571
|
-
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
572
|
-
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
573
|
-
if (dr === 0 && dc === 0) continue;
|
|
574
|
-
if (r + dr < numRows && c + dc < numCols) {
|
|
575
|
-
skip.add(`${r + dr},${c + dc}`);
|
|
576
|
-
}
|
|
577
|
-
}
|
|
578
|
-
}
|
|
579
|
-
c += cell.colSpan - 1;
|
|
580
|
-
}
|
|
581
|
-
}
|
|
582
|
-
const uniqueRows = [];
|
|
583
|
-
let pendingFirstCol = "";
|
|
584
|
-
for (let r = 0; r < display.length; r++) {
|
|
585
|
-
const row = display[r];
|
|
586
|
-
const isEmptyPlaceholder = row.every((cell) => cell === "");
|
|
587
|
-
if (isEmptyPlaceholder) continue;
|
|
588
|
-
const nonEmptyCols = row.filter((cell) => cell !== "");
|
|
589
|
-
const hasSkipInRow = row.some((_, c) => skip.has(`${r},${c}`));
|
|
590
|
-
if (!hasSkipInRow && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
|
|
591
|
-
pendingFirstCol = row[0];
|
|
592
|
-
continue;
|
|
593
|
-
}
|
|
594
|
-
if (pendingFirstCol && row[0] === "") {
|
|
595
|
-
row[0] = pendingFirstCol;
|
|
596
|
-
pendingFirstCol = "";
|
|
597
|
-
} else {
|
|
598
|
-
pendingFirstCol = "";
|
|
599
|
-
}
|
|
600
|
-
uniqueRows.push(row);
|
|
601
|
-
}
|
|
602
|
-
if (uniqueRows.length === 0) return "";
|
|
603
|
-
const md = [];
|
|
604
|
-
md.push("| " + uniqueRows[0].join(" | ") + " |");
|
|
605
|
-
md.push("| " + uniqueRows[0].map(() => "---").join(" | ") + " |");
|
|
606
|
-
for (let i = 1; i < uniqueRows.length; i++) {
|
|
607
|
-
md.push("| " + uniqueRows[i].join(" | ") + " |");
|
|
608
|
-
}
|
|
609
|
-
return md.join("\n");
|
|
610
|
-
}
|
|
611
69
|
|
|
612
|
-
|
|
613
|
-
var
|
|
614
|
-
var HEADING_RATIO_H2 = 1.3;
|
|
615
|
-
var HEADING_RATIO_H3 = 1.15;
|
|
616
|
-
|
|
617
|
-
// src/hwpx/parser.ts
|
|
618
|
-
init_page_range();
|
|
70
|
+
var _zlib = require('zlib');
|
|
71
|
+
var _xmldom = require('@xmldom/xmldom');
|
|
619
72
|
var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
|
|
620
73
|
var MAX_ZIP_ENTRIES = 500;
|
|
621
74
|
function clampSpan(val, max) {
|
|
@@ -623,10 +76,10 @@ function clampSpan(val, max) {
|
|
|
623
76
|
}
|
|
624
77
|
var MAX_XML_DEPTH = 200;
|
|
625
78
|
function createXmlParser(warnings) {
|
|
626
|
-
return new
|
|
79
|
+
return new (0, _xmldom.DOMParser)({
|
|
627
80
|
onError(level, msg) {
|
|
628
|
-
if (level === "fatalError") throw new KordocError(`XML \uD30C\uC2F1 \uC2E4\uD328: ${msg}`);
|
|
629
|
-
warnings
|
|
81
|
+
if (level === "fatalError") throw new (0, _chunkHXUCZ2ILcjs.KordocError)(`XML \uD30C\uC2F1 \uC2E4\uD328: ${msg}`);
|
|
82
|
+
_optionalChain([warnings, 'optionalAccess', _2 => _2.push, 'call', _3 => _3({ code: "MALFORMED_XML", message: `XML ${level === "warn" ? "\uACBD\uACE0" : "\uC624\uB958"}: ${msg}` })]);
|
|
630
83
|
}
|
|
631
84
|
});
|
|
632
85
|
}
|
|
@@ -644,15 +97,15 @@ async function extractHwpxStyles(zip, decompressed) {
|
|
|
644
97
|
const xml = await file.async("text");
|
|
645
98
|
if (decompressed) {
|
|
646
99
|
decompressed.total += xml.length * 2;
|
|
647
|
-
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
100
|
+
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
648
101
|
}
|
|
649
102
|
const parser = createXmlParser();
|
|
650
|
-
const doc = parser.parseFromString(stripDtd(xml), "text/xml");
|
|
103
|
+
const doc = parser.parseFromString(_chunkHXUCZ2ILcjs.stripDtd.call(void 0, xml), "text/xml");
|
|
651
104
|
if (!doc.documentElement) continue;
|
|
652
105
|
parseCharProperties(doc, result.charProperties);
|
|
653
106
|
parseStyleElements(doc, result.styles);
|
|
654
107
|
break;
|
|
655
|
-
} catch {
|
|
108
|
+
} catch (e3) {
|
|
656
109
|
continue;
|
|
657
110
|
}
|
|
658
111
|
}
|
|
@@ -709,16 +162,16 @@ function parseStyleElements(doc, map) {
|
|
|
709
162
|
}
|
|
710
163
|
}
|
|
711
164
|
async function parseHwpxDocument(buffer, options) {
|
|
712
|
-
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
165
|
+
_chunkHXUCZ2ILcjs.precheckZipSize.call(void 0, buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
713
166
|
let zip;
|
|
714
167
|
try {
|
|
715
|
-
zip = await
|
|
716
|
-
} catch {
|
|
168
|
+
zip = await _jszip2.default.loadAsync(buffer);
|
|
169
|
+
} catch (e4) {
|
|
717
170
|
return extractFromBrokenZip(buffer);
|
|
718
171
|
}
|
|
719
172
|
const actualEntryCount = Object.keys(zip.files).length;
|
|
720
173
|
if (actualEntryCount > MAX_ZIP_ENTRIES) {
|
|
721
|
-
throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
174
|
+
throw new (0, _chunkHXUCZ2ILcjs.KordocError)("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
722
175
|
}
|
|
723
176
|
const decompressed = { total: 0 };
|
|
724
177
|
const metadata = {};
|
|
@@ -726,9 +179,9 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
726
179
|
const styleMap = await extractHwpxStyles(zip, decompressed);
|
|
727
180
|
const warnings = [];
|
|
728
181
|
const sectionPaths = await resolveSectionPaths(zip);
|
|
729
|
-
if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
182
|
+
if (sectionPaths.length === 0) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
730
183
|
metadata.pageCount = sectionPaths.length;
|
|
731
|
-
const pageFilter = options
|
|
184
|
+
const pageFilter = _optionalChain([options, 'optionalAccess', _4 => _4.pages]) ? _chunkMUOQXDZ4cjs.parsePageRange.call(void 0, options.pages, sectionPaths.length) : null;
|
|
732
185
|
const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
|
|
733
186
|
const blocks = [];
|
|
734
187
|
let parsedSections = 0;
|
|
@@ -739,19 +192,19 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
739
192
|
try {
|
|
740
193
|
const xml = await file.async("text");
|
|
741
194
|
decompressed.total += xml.length * 2;
|
|
742
|
-
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
195
|
+
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
743
196
|
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
|
|
744
197
|
parsedSections++;
|
|
745
|
-
options
|
|
198
|
+
_optionalChain([options, 'optionalAccess', _5 => _5.onProgress, 'optionalCall', _6 => _6(parsedSections, totalTarget)]);
|
|
746
199
|
} catch (secErr) {
|
|
747
|
-
if (secErr instanceof KordocError) throw secErr;
|
|
200
|
+
if (secErr instanceof _chunkHXUCZ2ILcjs.KordocError) throw secErr;
|
|
748
201
|
warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
749
202
|
}
|
|
750
203
|
}
|
|
751
204
|
const images = await extractImagesFromZip(zip, blocks, decompressed, warnings);
|
|
752
205
|
detectHwpxHeadings(blocks, styleMap);
|
|
753
206
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
754
|
-
const markdown = blocksToMarkdown(blocks);
|
|
207
|
+
const markdown = _chunkHXUCZ2ILcjs.blocksToMarkdown.call(void 0, blocks);
|
|
755
208
|
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
756
209
|
}
|
|
757
210
|
function imageExtToMime(ext) {
|
|
@@ -803,13 +256,13 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
803
256
|
];
|
|
804
257
|
let found = false;
|
|
805
258
|
for (const path of candidates) {
|
|
806
|
-
if (isPathTraversal(path)) continue;
|
|
259
|
+
if (_chunkHXUCZ2ILcjs.isPathTraversal.call(void 0, path)) continue;
|
|
807
260
|
const file = zip.file(path);
|
|
808
261
|
if (!file) continue;
|
|
809
262
|
try {
|
|
810
263
|
const data = await file.async("uint8array");
|
|
811
264
|
decompressed.total += data.length;
|
|
812
|
-
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
265
|
+
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
813
266
|
const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
|
|
814
267
|
const mimeType = imageExtToMime(ext);
|
|
815
268
|
imageIndex++;
|
|
@@ -820,11 +273,11 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
820
273
|
found = true;
|
|
821
274
|
break;
|
|
822
275
|
} catch (err) {
|
|
823
|
-
if (err instanceof KordocError) throw err;
|
|
276
|
+
if (err instanceof _chunkHXUCZ2ILcjs.KordocError) throw err;
|
|
824
277
|
}
|
|
825
278
|
}
|
|
826
279
|
if (!found) {
|
|
827
|
-
warnings
|
|
280
|
+
_optionalChain([warnings, 'optionalAccess', _7 => _7.push, 'call', _8 => _8({ page: block.pageNumber, message: `\uC774\uBBF8\uC9C0 \uD30C\uC77C \uC5C6\uC74C: ${ref}`, code: "SKIPPED_IMAGE" })]);
|
|
828
281
|
block.type = "paragraph";
|
|
829
282
|
block.text = `[\uC774\uBBF8\uC9C0: ${ref}]`;
|
|
830
283
|
}
|
|
@@ -840,23 +293,23 @@ async function extractHwpxMetadata(zip, metadata, decompressed) {
|
|
|
840
293
|
const xml = await file.async("text");
|
|
841
294
|
if (decompressed) {
|
|
842
295
|
decompressed.total += xml.length * 2;
|
|
843
|
-
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
296
|
+
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
844
297
|
}
|
|
845
298
|
parseDublinCoreMetadata(xml, metadata);
|
|
846
299
|
if (metadata.title || metadata.author) return;
|
|
847
300
|
}
|
|
848
|
-
} catch {
|
|
301
|
+
} catch (e5) {
|
|
849
302
|
}
|
|
850
303
|
}
|
|
851
304
|
function parseDublinCoreMetadata(xml, metadata) {
|
|
852
305
|
const parser = createXmlParser();
|
|
853
|
-
const doc = parser.parseFromString(stripDtd(xml), "text/xml");
|
|
306
|
+
const doc = parser.parseFromString(_chunkHXUCZ2ILcjs.stripDtd.call(void 0, xml), "text/xml");
|
|
854
307
|
if (!doc.documentElement) return;
|
|
855
308
|
const getText = (tagNames) => {
|
|
856
309
|
for (const tag of tagNames) {
|
|
857
310
|
const els = doc.getElementsByTagName(tag);
|
|
858
311
|
if (els.length > 0) {
|
|
859
|
-
const text = els[0].textContent
|
|
312
|
+
const text = _optionalChain([els, 'access', _9 => _9[0], 'access', _10 => _10.textContent, 'optionalAccess', _11 => _11.trim, 'call', _12 => _12()]);
|
|
860
313
|
if (text) return text;
|
|
861
314
|
}
|
|
862
315
|
}
|
|
@@ -909,7 +362,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
909
362
|
}
|
|
910
363
|
const nameBytes = data.slice(pos + 30, pos + 30 + nameLen);
|
|
911
364
|
const name = new TextDecoder().decode(nameBytes);
|
|
912
|
-
if (isPathTraversal(name)) {
|
|
365
|
+
if (_chunkHXUCZ2ILcjs.isPathTraversal.call(void 0, name)) {
|
|
913
366
|
pos = fileStart + compSize;
|
|
914
367
|
continue;
|
|
915
368
|
}
|
|
@@ -921,21 +374,21 @@ function extractFromBrokenZip(buffer) {
|
|
|
921
374
|
if (method === 0) {
|
|
922
375
|
content = new TextDecoder().decode(fileData);
|
|
923
376
|
} else if (method === 8) {
|
|
924
|
-
const decompressed = (0,
|
|
377
|
+
const decompressed = _zlib.inflateRawSync.call(void 0, Buffer.from(fileData), { maxOutputLength: MAX_DECOMPRESS_SIZE });
|
|
925
378
|
content = new TextDecoder().decode(decompressed);
|
|
926
379
|
} else {
|
|
927
380
|
continue;
|
|
928
381
|
}
|
|
929
382
|
totalDecompressed += content.length * 2;
|
|
930
|
-
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
|
|
383
|
+
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
|
|
931
384
|
sectionNum++;
|
|
932
385
|
blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum));
|
|
933
|
-
} catch {
|
|
386
|
+
} catch (e6) {
|
|
934
387
|
continue;
|
|
935
388
|
}
|
|
936
389
|
}
|
|
937
|
-
if (blocks.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
938
|
-
const markdown = blocksToMarkdown(blocks);
|
|
390
|
+
if (blocks.length === 0) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
391
|
+
const markdown = _chunkHXUCZ2ILcjs.blocksToMarkdown.call(void 0, blocks);
|
|
939
392
|
return { markdown, blocks, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
940
393
|
}
|
|
941
394
|
async function resolveSectionPaths(zip) {
|
|
@@ -953,7 +406,7 @@ async function resolveSectionPaths(zip) {
|
|
|
953
406
|
}
|
|
954
407
|
function parseSectionPathsFromManifest(xml) {
|
|
955
408
|
const parser = createXmlParser();
|
|
956
|
-
const doc = parser.parseFromString(stripDtd(xml), "text/xml");
|
|
409
|
+
const doc = parser.parseFromString(_chunkHXUCZ2ILcjs.stripDtd.call(void 0, xml), "text/xml");
|
|
957
410
|
const items = doc.getElementsByTagName("opf:item");
|
|
958
411
|
const spine = doc.getElementsByTagName("opf:itemref");
|
|
959
412
|
const isSectionId = (id) => /^s/i.test(id) || id.toLowerCase().includes("section");
|
|
@@ -982,7 +435,7 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
982
435
|
let baseFontSize = 0;
|
|
983
436
|
const sizeFreq = /* @__PURE__ */ new Map();
|
|
984
437
|
for (const b of blocks) {
|
|
985
|
-
if (b.style
|
|
438
|
+
if (_optionalChain([b, 'access', _13 => _13.style, 'optionalAccess', _14 => _14.fontSize])) {
|
|
986
439
|
sizeFreq.set(b.style.fontSize, (sizeFreq.get(b.style.fontSize) || 0) + 1);
|
|
987
440
|
}
|
|
988
441
|
}
|
|
@@ -998,11 +451,11 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
998
451
|
const text = block.text.trim();
|
|
999
452
|
if (text.length === 0 || text.length > 200 || /^\d+$/.test(text)) continue;
|
|
1000
453
|
let level = 0;
|
|
1001
|
-
if (baseFontSize > 0 && block.style
|
|
454
|
+
if (baseFontSize > 0 && _optionalChain([block, 'access', _15 => _15.style, 'optionalAccess', _16 => _16.fontSize])) {
|
|
1002
455
|
const ratio = block.style.fontSize / baseFontSize;
|
|
1003
|
-
if (ratio >= HEADING_RATIO_H1) level = 1;
|
|
1004
|
-
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
1005
|
-
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
456
|
+
if (ratio >= _chunkHXUCZ2ILcjs.HEADING_RATIO_H1) level = 1;
|
|
457
|
+
else if (ratio >= _chunkHXUCZ2ILcjs.HEADING_RATIO_H2) level = 2;
|
|
458
|
+
else if (ratio >= _chunkHXUCZ2ILcjs.HEADING_RATIO_H3) level = 3;
|
|
1006
459
|
}
|
|
1007
460
|
const compactText = text.replace(/\s+/g, "");
|
|
1008
461
|
if (/^제\d+[조장절편]/.test(compactText) && text.length <= 50) {
|
|
@@ -1016,7 +469,7 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
1016
469
|
}
|
|
1017
470
|
function parseSectionXml(xml, styleMap, warnings, sectionNum) {
|
|
1018
471
|
const parser = createXmlParser(warnings);
|
|
1019
|
-
const doc = parser.parseFromString(stripDtd(xml), "text/xml");
|
|
472
|
+
const doc = parser.parseFromString(_chunkHXUCZ2ILcjs.stripDtd.call(void 0, xml), "text/xml");
|
|
1020
473
|
if (!doc.documentElement) return [];
|
|
1021
474
|
const blocks = [];
|
|
1022
475
|
walkSection(doc.documentElement, blocks, null, [], styleMap, warnings, sectionNum);
|
|
@@ -1060,16 +513,16 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
1060
513
|
let nestedCols = 0;
|
|
1061
514
|
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
1062
515
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
1063
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
516
|
+
blocks.push({ type: "table", table: _chunkHXUCZ2ILcjs.buildTable.call(void 0, newTable.rows), pageNumber: sectionNum });
|
|
1064
517
|
} else {
|
|
1065
|
-
const nestedText = convertTableToText(newTable.rows);
|
|
518
|
+
const nestedText = _chunkHXUCZ2ILcjs.convertTableToText.call(void 0, newTable.rows);
|
|
1066
519
|
if (parentTable.cell) {
|
|
1067
520
|
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
1068
521
|
}
|
|
1069
522
|
}
|
|
1070
523
|
tableCtx = parentTable;
|
|
1071
524
|
} else {
|
|
1072
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
525
|
+
blocks.push({ type: "table", table: _chunkHXUCZ2ILcjs.buildTable.call(void 0, newTable.rows), pageNumber: sectionNum });
|
|
1073
526
|
tableCtx = null;
|
|
1074
527
|
}
|
|
1075
528
|
} else {
|
|
@@ -1096,7 +549,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
1096
549
|
}
|
|
1097
550
|
break;
|
|
1098
551
|
case "cellAddr":
|
|
1099
|
-
if (tableCtx
|
|
552
|
+
if (_optionalChain([tableCtx, 'optionalAccess', _17 => _17.cell])) {
|
|
1100
553
|
const ca = parseInt(el.getAttribute("colAddr") || "", 10);
|
|
1101
554
|
const ra = parseInt(el.getAttribute("rowAddr") || "", 10);
|
|
1102
555
|
if (!isNaN(ca)) tableCtx.cell.colAddr = ca;
|
|
@@ -1104,19 +557,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
1104
557
|
}
|
|
1105
558
|
break;
|
|
1106
559
|
case "cellSpan":
|
|
1107
|
-
if (tableCtx
|
|
560
|
+
if (_optionalChain([tableCtx, 'optionalAccess', _18 => _18.cell])) {
|
|
1108
561
|
const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
|
|
1109
562
|
const cs = isNaN(rawCs) ? 1 : rawCs;
|
|
1110
563
|
const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
|
|
1111
564
|
const rs = isNaN(rawRs) ? 1 : rawRs;
|
|
1112
|
-
tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
|
|
1113
|
-
tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
|
|
565
|
+
tableCtx.cell.colSpan = clampSpan(cs, _chunkHXUCZ2ILcjs.MAX_COLS);
|
|
566
|
+
tableCtx.cell.rowSpan = clampSpan(rs, _chunkHXUCZ2ILcjs.MAX_ROWS);
|
|
1114
567
|
}
|
|
1115
568
|
break;
|
|
1116
569
|
case "p": {
|
|
1117
570
|
const { text, href, footnote, style } = extractParagraphInfo(el, styleMap);
|
|
1118
571
|
if (text) {
|
|
1119
|
-
if (tableCtx
|
|
572
|
+
if (_optionalChain([tableCtx, 'optionalAccess', _19 => _19.cell])) {
|
|
1120
573
|
tableCtx.cell.text += (tableCtx.cell.text ? "\n" : "") + text;
|
|
1121
574
|
} else if (!tableCtx) {
|
|
1122
575
|
const block = { type: "paragraph", text, pageNumber: sectionNum };
|
|
@@ -1170,16 +623,16 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1170
623
|
let nestedCols = 0;
|
|
1171
624
|
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
1172
625
|
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
1173
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
626
|
+
blocks.push({ type: "table", table: _chunkHXUCZ2ILcjs.buildTable.call(void 0, newTable.rows), pageNumber: sectionNum });
|
|
1174
627
|
} else {
|
|
1175
|
-
const nestedText = convertTableToText(newTable.rows);
|
|
628
|
+
const nestedText = _chunkHXUCZ2ILcjs.convertTableToText.call(void 0, newTable.rows);
|
|
1176
629
|
if (parentTable.cell) {
|
|
1177
630
|
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
1178
631
|
}
|
|
1179
632
|
}
|
|
1180
633
|
tableCtx = parentTable;
|
|
1181
634
|
} else {
|
|
1182
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
635
|
+
blocks.push({ type: "table", table: _chunkHXUCZ2ILcjs.buildTable.call(void 0, newTable.rows), pageNumber: sectionNum });
|
|
1183
636
|
tableCtx = null;
|
|
1184
637
|
}
|
|
1185
638
|
} else {
|
|
@@ -1237,7 +690,7 @@ function extractDrawTextBlocks(drawTextNode, blocks, styleMap, sectionNum) {
|
|
|
1237
690
|
const info = extractParagraphInfo(child, styleMap);
|
|
1238
691
|
const text = info.text.trim();
|
|
1239
692
|
if (text) {
|
|
1240
|
-
blocks.push({ type: "paragraph", text, style: info.style
|
|
693
|
+
blocks.push({ type: "paragraph", text, style: _nullishCoalesce(info.style, () => ( void 0)), pageNumber: sectionNum });
|
|
1241
694
|
}
|
|
1242
695
|
}
|
|
1243
696
|
}
|
|
@@ -1287,7 +740,7 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
1287
740
|
case "hyperlink": {
|
|
1288
741
|
const url = child.getAttribute("url") || child.getAttribute("href") || "";
|
|
1289
742
|
if (url) {
|
|
1290
|
-
const safe = sanitizeHref(url);
|
|
743
|
+
const safe = _chunkHXUCZ2ILcjs.sanitizeHref.call(void 0, url);
|
|
1291
744
|
if (safe) href = safe;
|
|
1292
745
|
}
|
|
1293
746
|
walk(child);
|
|
@@ -1372,7 +825,7 @@ function extractTextFromNode(node) {
|
|
|
1372
825
|
}
|
|
1373
826
|
|
|
1374
827
|
// src/hwp5/record.ts
|
|
1375
|
-
|
|
828
|
+
|
|
1376
829
|
var TAG_PARA_HEADER = 66;
|
|
1377
830
|
var TAG_PARA_TEXT = 67;
|
|
1378
831
|
var TAG_CHAR_SHAPE = 68;
|
|
@@ -1420,14 +873,14 @@ function decompressStream(data) {
|
|
|
1420
873
|
const opts = { maxOutputLength: MAX_DECOMPRESS_SIZE2 };
|
|
1421
874
|
if (data.length >= 2 && data[0] === 120) {
|
|
1422
875
|
try {
|
|
1423
|
-
return (0,
|
|
1424
|
-
} catch {
|
|
876
|
+
return _zlib.inflateSync.call(void 0, data, opts);
|
|
877
|
+
} catch (e7) {
|
|
1425
878
|
}
|
|
1426
879
|
}
|
|
1427
|
-
return (0,
|
|
880
|
+
return _zlib.inflateRawSync.call(void 0, data, opts);
|
|
1428
881
|
}
|
|
1429
882
|
function parseFileHeader(data) {
|
|
1430
|
-
if (data.length < 40) throw new KordocError("FileHeader\uAC00 \uB108\uBB34 \uC9E7\uC2B5\uB2C8\uB2E4 (\uCD5C\uC18C 40\uBC14\uC774\uD2B8)");
|
|
883
|
+
if (data.length < 40) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("FileHeader\uAC00 \uB108\uBB34 \uC9E7\uC2B5\uB2C8\uB2E4 (\uCD5C\uC18C 40\uBC14\uC774\uD2B8)");
|
|
1431
884
|
const sig = data.subarray(0, 32).toString("utf8").replace(/\0+$/, "");
|
|
1432
885
|
return {
|
|
1433
886
|
signature: sig,
|
|
@@ -1480,7 +933,7 @@ function parseDocInfo(records) {
|
|
|
1480
933
|
offset += 2;
|
|
1481
934
|
const charShapeId = offset + 2 <= rec.data.length ? rec.data.readUInt16LE(offset) : 0;
|
|
1482
935
|
styles.push({ name, nameKo, charShapeId, paraShapeId, type });
|
|
1483
|
-
} catch {
|
|
936
|
+
} catch (e8) {
|
|
1484
937
|
}
|
|
1485
938
|
}
|
|
1486
939
|
}
|
|
@@ -2162,7 +1615,7 @@ function aes128EcbDecrypt(data, key) {
|
|
|
2162
1615
|
|
|
2163
1616
|
// src/hwp5/crypto.ts
|
|
2164
1617
|
var MsvcLcg = class {
|
|
2165
|
-
|
|
1618
|
+
|
|
2166
1619
|
constructor(seed) {
|
|
2167
1620
|
this.seed = seed >>> 0;
|
|
2168
1621
|
}
|
|
@@ -2242,7 +1695,7 @@ function decryptViewText(viewTextRaw, compressed) {
|
|
|
2242
1695
|
if (compressed) {
|
|
2243
1696
|
try {
|
|
2244
1697
|
return decompressStream(Buffer.from(decrypted));
|
|
2245
|
-
} catch {
|
|
1698
|
+
} catch (e9) {
|
|
2246
1699
|
return Buffer.from(decrypted);
|
|
2247
1700
|
}
|
|
2248
1701
|
}
|
|
@@ -2403,7 +1856,7 @@ function parseLenientCfb(data) {
|
|
|
2403
1856
|
function findEntryByPath(path) {
|
|
2404
1857
|
const parts = path.replace(/^\//, "").split("/");
|
|
2405
1858
|
if (parts.length === 1) {
|
|
2406
|
-
return dirEntries.find((e) => e.name === parts[0] && e.type === 2)
|
|
1859
|
+
return _nullishCoalesce(dirEntries.find((e) => e.name === parts[0] && e.type === 2), () => ( null));
|
|
2407
1860
|
}
|
|
2408
1861
|
const storageName = parts[0];
|
|
2409
1862
|
const streamName = parts.slice(1).join("/");
|
|
@@ -2413,7 +1866,7 @@ function parseLenientCfb(data) {
|
|
|
2413
1866
|
}
|
|
2414
1867
|
}
|
|
2415
1868
|
const lastPart = parts[parts.length - 1];
|
|
2416
|
-
return dirEntries.find((e) => e.type === 2 && e.name === lastPart)
|
|
1869
|
+
return _nullishCoalesce(dirEntries.find((e) => e.type === 2 && e.name === lastPart), () => ( null));
|
|
2417
1870
|
}
|
|
2418
1871
|
return {
|
|
2419
1872
|
findStream(path) {
|
|
@@ -2430,10 +1883,8 @@ function parseLenientCfb(data) {
|
|
|
2430
1883
|
}
|
|
2431
1884
|
|
|
2432
1885
|
// src/hwp5/parser.ts
|
|
2433
|
-
|
|
2434
|
-
var
|
|
2435
|
-
var import_meta = {};
|
|
2436
|
-
var require2 = (0, import_module.createRequire)(import_meta.url);
|
|
1886
|
+
var _module = require('module');
|
|
1887
|
+
var require2 = _module.createRequire.call(void 0, import.meta.url);
|
|
2437
1888
|
var CFB = require2("cfb");
|
|
2438
1889
|
var MAX_SECTIONS = 100;
|
|
2439
1890
|
var MAX_TOTAL_DECOMPRESS = 100 * 1024 * 1024;
|
|
@@ -2443,27 +1894,27 @@ function parseHwp5Document(buffer, options) {
|
|
|
2443
1894
|
const warnings = [];
|
|
2444
1895
|
try {
|
|
2445
1896
|
cfb = CFB.parse(buffer);
|
|
2446
|
-
} catch {
|
|
1897
|
+
} catch (e10) {
|
|
2447
1898
|
try {
|
|
2448
1899
|
lenientCfb = parseLenientCfb(buffer);
|
|
2449
1900
|
warnings.push({ message: "\uC190\uC0C1\uB41C CFB \uCEE8\uD14C\uC774\uB108 \u2014 lenient \uBAA8\uB4DC\uB85C \uBCF5\uAD6C", code: "LENIENT_CFB_RECOVERY" });
|
|
2450
|
-
} catch {
|
|
2451
|
-
throw new KordocError("CFB \uCEE8\uD14C\uC774\uB108 \uD30C\uC2F1 \uC2E4\uD328 (strict \uBC0F lenient \uBAA8\uB450)");
|
|
1901
|
+
} catch (e11) {
|
|
1902
|
+
throw new (0, _chunkHXUCZ2ILcjs.KordocError)("CFB \uCEE8\uD14C\uC774\uB108 \uD30C\uC2F1 \uC2E4\uD328 (strict \uBC0F lenient \uBAA8\uB450)");
|
|
2452
1903
|
}
|
|
2453
1904
|
}
|
|
2454
1905
|
const findStream = (path) => {
|
|
2455
1906
|
if (cfb) {
|
|
2456
1907
|
const entry = CFB.find(cfb, path);
|
|
2457
|
-
return entry
|
|
1908
|
+
return _optionalChain([entry, 'optionalAccess', _20 => _20.content]) ? Buffer.from(entry.content) : null;
|
|
2458
1909
|
}
|
|
2459
1910
|
return lenientCfb.findStream(path);
|
|
2460
1911
|
};
|
|
2461
1912
|
const headerData = findStream("/FileHeader");
|
|
2462
|
-
if (!headerData) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
|
|
1913
|
+
if (!headerData) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
|
|
2463
1914
|
const header = parseFileHeader(headerData);
|
|
2464
|
-
if (header.signature !== "HWP Document File") throw new KordocError("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
|
|
2465
|
-
if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
2466
|
-
if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
1915
|
+
if (header.signature !== "HWP Document File") throw new (0, _chunkHXUCZ2ILcjs.KordocError)("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
|
|
1916
|
+
if (header.flags & FLAG_ENCRYPTED) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
1917
|
+
if (header.flags & FLAG_DRM) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
2467
1918
|
const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
|
|
2468
1919
|
const distribution = (header.flags & FLAG_DISTRIBUTION) !== 0;
|
|
2469
1920
|
const metadata = {
|
|
@@ -2472,9 +1923,9 @@ function parseHwp5Document(buffer, options) {
|
|
|
2472
1923
|
if (cfb) extractHwp5Metadata(cfb, metadata);
|
|
2473
1924
|
const docInfo = cfb ? parseDocInfoStream(cfb, compressed) : parseDocInfoFromStream(findStream("/DocInfo"), compressed);
|
|
2474
1925
|
const sections = distribution ? cfb ? findViewTextSections(cfb, compressed) : findViewTextSectionsLenient(lenientCfb, compressed) : cfb ? findSections(cfb) : findSectionsLenient(lenientCfb, compressed);
|
|
2475
|
-
if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
1926
|
+
if (sections.length === 0) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
2476
1927
|
metadata.pageCount = sections.length;
|
|
2477
|
-
const pageFilter = options
|
|
1928
|
+
const pageFilter = _optionalChain([options, 'optionalAccess', _21 => _21.pages]) ? _chunkMUOQXDZ4cjs.parsePageRange.call(void 0, options.pages, sections.length) : null;
|
|
2478
1929
|
const totalTarget = pageFilter ? pageFilter.size : sections.length;
|
|
2479
1930
|
const blocks = [];
|
|
2480
1931
|
let totalDecompressed = 0;
|
|
@@ -2485,34 +1936,34 @@ function parseHwp5Document(buffer, options) {
|
|
|
2485
1936
|
const sectionData = sections[si];
|
|
2486
1937
|
const data = !distribution && compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
|
|
2487
1938
|
totalDecompressed += data.length;
|
|
2488
|
-
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
1939
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2489
1940
|
const records = readRecords(data);
|
|
2490
1941
|
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
|
|
2491
1942
|
blocks.push(...sectionBlocks);
|
|
2492
1943
|
parsedSections++;
|
|
2493
|
-
options
|
|
1944
|
+
_optionalChain([options, 'optionalAccess', _22 => _22.onProgress, 'optionalCall', _23 => _23(parsedSections, totalTarget)]);
|
|
2494
1945
|
} catch (secErr) {
|
|
2495
|
-
if (secErr instanceof KordocError) throw secErr;
|
|
1946
|
+
if (secErr instanceof _chunkHXUCZ2ILcjs.KordocError) throw secErr;
|
|
2496
1947
|
warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
2497
1948
|
}
|
|
2498
1949
|
}
|
|
2499
1950
|
const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
|
|
2500
|
-
const flatBlocks = flattenLayoutTables(blocks);
|
|
1951
|
+
const flatBlocks = _chunkHXUCZ2ILcjs.flattenLayoutTables.call(void 0, blocks);
|
|
2501
1952
|
if (docInfo) {
|
|
2502
1953
|
detectHwp5Headings(flatBlocks, docInfo);
|
|
2503
1954
|
}
|
|
2504
1955
|
const outline = flatBlocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
2505
|
-
const markdown = blocksToMarkdown(flatBlocks);
|
|
1956
|
+
const markdown = _chunkHXUCZ2ILcjs.blocksToMarkdown.call(void 0, flatBlocks);
|
|
2506
1957
|
return { markdown, blocks: flatBlocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
2507
1958
|
}
|
|
2508
1959
|
function parseDocInfoStream(cfb, compressed) {
|
|
2509
1960
|
try {
|
|
2510
1961
|
const entry = CFB.find(cfb, "/DocInfo");
|
|
2511
|
-
if (!entry
|
|
1962
|
+
if (!_optionalChain([entry, 'optionalAccess', _24 => _24.content])) return null;
|
|
2512
1963
|
const data = compressed ? decompressStream(Buffer.from(entry.content)) : Buffer.from(entry.content);
|
|
2513
1964
|
const records = readRecords(data);
|
|
2514
1965
|
return parseDocInfo(records);
|
|
2515
|
-
} catch {
|
|
1966
|
+
} catch (e12) {
|
|
2516
1967
|
return null;
|
|
2517
1968
|
}
|
|
2518
1969
|
}
|
|
@@ -2521,7 +1972,7 @@ function parseDocInfoFromStream(raw, compressed) {
|
|
|
2521
1972
|
try {
|
|
2522
1973
|
const data = compressed ? decompressStream(raw) : raw;
|
|
2523
1974
|
return parseDocInfo(readRecords(data));
|
|
2524
|
-
} catch {
|
|
1975
|
+
} catch (e13) {
|
|
2525
1976
|
return null;
|
|
2526
1977
|
}
|
|
2527
1978
|
}
|
|
@@ -2531,7 +1982,7 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
2531
1982
|
const name = (style.nameKo || style.name).toLowerCase();
|
|
2532
1983
|
if (name.includes("\uBC14\uD0D5") || name.includes("\uBCF8\uBB38") || name === "normal" || name === "body") {
|
|
2533
1984
|
const cs = docInfo.charShapes[style.charShapeId];
|
|
2534
|
-
if (cs
|
|
1985
|
+
if (_optionalChain([cs, 'optionalAccess', _25 => _25.fontSize]) > 0) {
|
|
2535
1986
|
baseFontSize = cs.fontSize / 10;
|
|
2536
1987
|
break;
|
|
2537
1988
|
}
|
|
@@ -2540,7 +1991,7 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
2540
1991
|
if (baseFontSize === 0) {
|
|
2541
1992
|
const sizeFreq = /* @__PURE__ */ new Map();
|
|
2542
1993
|
for (const b of blocks) {
|
|
2543
|
-
if (b.style
|
|
1994
|
+
if (_optionalChain([b, 'access', _26 => _26.style, 'optionalAccess', _27 => _27.fontSize])) {
|
|
2544
1995
|
sizeFreq.set(b.style.fontSize, (sizeFreq.get(b.style.fontSize) || 0) + 1);
|
|
2545
1996
|
}
|
|
2546
1997
|
}
|
|
@@ -2560,11 +2011,11 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
2560
2011
|
if (text.length === 0 || text.length > 200) continue;
|
|
2561
2012
|
if (/^\d+$/.test(text)) continue;
|
|
2562
2013
|
let level = 0;
|
|
2563
|
-
if (block.style
|
|
2014
|
+
if (_optionalChain([block, 'access', _28 => _28.style, 'optionalAccess', _29 => _29.fontSize]) && baseFontSize > 0) {
|
|
2564
2015
|
const ratio = block.style.fontSize / baseFontSize;
|
|
2565
|
-
if (ratio >= HEADING_RATIO_H1) level = 1;
|
|
2566
|
-
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
2567
|
-
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
2016
|
+
if (ratio >= _chunkHXUCZ2ILcjs.HEADING_RATIO_H1) level = 1;
|
|
2017
|
+
else if (ratio >= _chunkHXUCZ2ILcjs.HEADING_RATIO_H2) level = 2;
|
|
2018
|
+
else if (ratio >= _chunkHXUCZ2ILcjs.HEADING_RATIO_H3) level = 3;
|
|
2568
2019
|
}
|
|
2569
2020
|
if (/^제\d+[장절편]\s/.test(text) && text.length <= 50) {
|
|
2570
2021
|
if (level === 0) level = 2;
|
|
@@ -2580,7 +2031,7 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
2580
2031
|
function extractHwp5Metadata(cfb, metadata) {
|
|
2581
2032
|
try {
|
|
2582
2033
|
const summaryEntry = CFB.find(cfb, "/HwpSummaryInformation") || CFB.find(cfb, "/SummaryInformation");
|
|
2583
|
-
if (!summaryEntry
|
|
2034
|
+
if (!_optionalChain([summaryEntry, 'optionalAccess', _30 => _30.content])) return;
|
|
2584
2035
|
const data = Buffer.from(summaryEntry.content);
|
|
2585
2036
|
if (data.length < 48) return;
|
|
2586
2037
|
const numSets = data.readUInt32LE(24);
|
|
@@ -2606,18 +2057,18 @@ function extractHwp5Metadata(cfb, metadata) {
|
|
|
2606
2057
|
else if (propId === 4) metadata.author = str;
|
|
2607
2058
|
else if (propId === 6) metadata.description = str;
|
|
2608
2059
|
}
|
|
2609
|
-
} catch {
|
|
2060
|
+
} catch (e14) {
|
|
2610
2061
|
}
|
|
2611
2062
|
}
|
|
2612
2063
|
function findViewTextSections(cfb, compressed) {
|
|
2613
2064
|
const sections = [];
|
|
2614
2065
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2615
2066
|
const entry = CFB.find(cfb, `/ViewText/Section${i}`);
|
|
2616
|
-
if (!entry
|
|
2067
|
+
if (!_optionalChain([entry, 'optionalAccess', _31 => _31.content])) break;
|
|
2617
2068
|
try {
|
|
2618
2069
|
const decrypted = decryptViewText(Buffer.from(entry.content), compressed);
|
|
2619
2070
|
sections.push({ idx: i, content: decrypted });
|
|
2620
|
-
} catch {
|
|
2071
|
+
} catch (e15) {
|
|
2621
2072
|
break;
|
|
2622
2073
|
}
|
|
2623
2074
|
}
|
|
@@ -2627,13 +2078,13 @@ function findSections(cfb) {
|
|
|
2627
2078
|
const sections = [];
|
|
2628
2079
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2629
2080
|
const entry = CFB.find(cfb, `/BodyText/Section${i}`);
|
|
2630
|
-
if (!entry
|
|
2081
|
+
if (!_optionalChain([entry, 'optionalAccess', _32 => _32.content])) break;
|
|
2631
2082
|
sections.push({ idx: i, content: Buffer.from(entry.content) });
|
|
2632
2083
|
}
|
|
2633
2084
|
if (sections.length === 0 && cfb.FileIndex) {
|
|
2634
2085
|
for (const entry of cfb.FileIndex) {
|
|
2635
2086
|
if (sections.length >= MAX_SECTIONS) break;
|
|
2636
|
-
if (entry.name
|
|
2087
|
+
if (_optionalChain([entry, 'access', _33 => _33.name, 'optionalAccess', _34 => _34.startsWith, 'call', _35 => _35("Section")]) && entry.content) {
|
|
2637
2088
|
const idx = parseInt(entry.name.replace("Section", ""), 10) || 0;
|
|
2638
2089
|
sections.push({ idx, content: Buffer.from(entry.content) });
|
|
2639
2090
|
}
|
|
@@ -2645,11 +2096,11 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2645
2096
|
const sections = [];
|
|
2646
2097
|
let totalDecompressed = 0;
|
|
2647
2098
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2648
|
-
const raw = lcfb.findStream(`/BodyText/Section${i}`)
|
|
2099
|
+
const raw = _nullishCoalesce(lcfb.findStream(`/BodyText/Section${i}`), () => ( lcfb.findStream(`Section${i}`)));
|
|
2649
2100
|
if (!raw) break;
|
|
2650
2101
|
const content = compressed ? decompressStream(raw) : raw;
|
|
2651
2102
|
totalDecompressed += content.length;
|
|
2652
|
-
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2103
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2653
2104
|
sections.push({ idx: i, content });
|
|
2654
2105
|
}
|
|
2655
2106
|
if (sections.length === 0) {
|
|
@@ -2661,7 +2112,7 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2661
2112
|
if (raw) {
|
|
2662
2113
|
const content = compressed ? decompressStream(raw) : raw;
|
|
2663
2114
|
totalDecompressed += content.length;
|
|
2664
|
-
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2115
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2665
2116
|
sections.push({ idx, content });
|
|
2666
2117
|
}
|
|
2667
2118
|
}
|
|
@@ -2673,14 +2124,14 @@ function findViewTextSectionsLenient(lcfb, compressed) {
|
|
|
2673
2124
|
const sections = [];
|
|
2674
2125
|
let totalDecompressed = 0;
|
|
2675
2126
|
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
2676
|
-
const raw = lcfb.findStream(`/ViewText/Section${i}`)
|
|
2127
|
+
const raw = _nullishCoalesce(lcfb.findStream(`/ViewText/Section${i}`), () => ( lcfb.findStream(`Section${i}`)));
|
|
2677
2128
|
if (!raw) break;
|
|
2678
2129
|
try {
|
|
2679
2130
|
const content = decryptViewText(raw, compressed);
|
|
2680
2131
|
totalDecompressed += content.length;
|
|
2681
|
-
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2132
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2682
2133
|
sections.push({ idx: i, content });
|
|
2683
|
-
} catch {
|
|
2134
|
+
} catch (e16) {
|
|
2684
2135
|
break;
|
|
2685
2136
|
}
|
|
2686
2137
|
}
|
|
@@ -2716,7 +2167,7 @@ function extractHwp5Images(cfb, blocks, compressed, warnings) {
|
|
|
2716
2167
|
const binDataRe = /\/BinData\/[Bb][Ii][Nn](\d{4})$/;
|
|
2717
2168
|
if (cfb.FileIndex) {
|
|
2718
2169
|
for (const entry of cfb.FileIndex) {
|
|
2719
|
-
if (!entry
|
|
2170
|
+
if (!_optionalChain([entry, 'optionalAccess', _36 => _36.name]) || !entry.content) continue;
|
|
2720
2171
|
const match = entry.name.match(binDataRe);
|
|
2721
2172
|
if (!match) continue;
|
|
2722
2173
|
const idx = parseInt(match[1], 10);
|
|
@@ -2724,7 +2175,7 @@ function extractHwp5Images(cfb, blocks, compressed, warnings) {
|
|
|
2724
2175
|
if (compressed) {
|
|
2725
2176
|
try {
|
|
2726
2177
|
data = decompressStream(data);
|
|
2727
|
-
} catch {
|
|
2178
|
+
} catch (e17) {
|
|
2728
2179
|
}
|
|
2729
2180
|
}
|
|
2730
2181
|
binDataMap.set(idx, { data, name: entry.name });
|
|
@@ -2772,7 +2223,7 @@ function extractHwp5ImagesLenient(lcfb, blocks, compressed, warnings) {
|
|
|
2772
2223
|
if (compressed) {
|
|
2773
2224
|
try {
|
|
2774
2225
|
raw = decompressStream(raw);
|
|
2775
|
-
} catch {
|
|
2226
|
+
} catch (e18) {
|
|
2776
2227
|
}
|
|
2777
2228
|
}
|
|
2778
2229
|
binDataMap.set(idx, { data: raw, name: e.name });
|
|
@@ -2866,7 +2317,7 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2866
2317
|
if (url && blocks.length > 0) {
|
|
2867
2318
|
const lastBlock = blocks[blocks.length - 1];
|
|
2868
2319
|
if (lastBlock.type === "paragraph" && !lastBlock.href) {
|
|
2869
|
-
lastBlock.href = sanitizeHref(url)
|
|
2320
|
+
lastBlock.href = _nullishCoalesce(_chunkHXUCZ2ILcjs.sanitizeHref.call(void 0, url), () => ( void 0));
|
|
2870
2321
|
}
|
|
2871
2322
|
}
|
|
2872
2323
|
}
|
|
@@ -2917,7 +2368,7 @@ function extractHyperlinkUrl(data) {
|
|
|
2917
2368
|
return url;
|
|
2918
2369
|
}
|
|
2919
2370
|
}
|
|
2920
|
-
} catch {
|
|
2371
|
+
} catch (e19) {
|
|
2921
2372
|
}
|
|
2922
2373
|
return null;
|
|
2923
2374
|
}
|
|
@@ -2984,8 +2435,8 @@ function parseTableBlock(records, startIdx) {
|
|
|
2984
2435
|
if (rec.tagId === TAG_PARA_HEADER && rec.level <= tableLevel) break;
|
|
2985
2436
|
if (rec.tagId === TAG_CTRL_HEADER && rec.level <= tableLevel) break;
|
|
2986
2437
|
if (rec.tagId === TAG_TABLE && rec.data.length >= 8) {
|
|
2987
|
-
rows = Math.min(rec.data.readUInt16LE(4), MAX_ROWS);
|
|
2988
|
-
cols = Math.min(rec.data.readUInt16LE(6), MAX_COLS);
|
|
2438
|
+
rows = Math.min(rec.data.readUInt16LE(4), _chunkHXUCZ2ILcjs.MAX_ROWS);
|
|
2439
|
+
cols = Math.min(rec.data.readUInt16LE(6), _chunkHXUCZ2ILcjs.MAX_COLS);
|
|
2989
2440
|
}
|
|
2990
2441
|
if (rec.tagId === TAG_LIST_HEADER) {
|
|
2991
2442
|
const { cell, nextIdx } = parseCellBlock(records, i, tableLevel);
|
|
@@ -3007,7 +2458,7 @@ function parseTableBlock(records, startIdx) {
|
|
|
3007
2458
|
return { table: { rows, cols, cells: irCells, hasHeader: rows > 1 }, nextIdx: i };
|
|
3008
2459
|
}
|
|
3009
2460
|
const cellRows = arrangeCells(rows, cols, cells);
|
|
3010
|
-
return { table: buildTable(cellRows), nextIdx: i };
|
|
2461
|
+
return { table: _chunkHXUCZ2ILcjs.buildTable.call(void 0, cellRows), nextIdx: i };
|
|
3011
2462
|
}
|
|
3012
2463
|
function parseCellBlock(records, startIdx, tableLevel) {
|
|
3013
2464
|
const rec = records[startIdx];
|
|
@@ -3022,8 +2473,8 @@ function parseCellBlock(records, startIdx, tableLevel) {
|
|
|
3022
2473
|
rowAddr = rec.data.readUInt16LE(10);
|
|
3023
2474
|
const cs = rec.data.readUInt16LE(12);
|
|
3024
2475
|
const rs = rec.data.readUInt16LE(14);
|
|
3025
|
-
if (cs > 0) colSpan = Math.min(cs, MAX_COLS);
|
|
3026
|
-
if (rs > 0) rowSpan = Math.min(rs, MAX_ROWS);
|
|
2476
|
+
if (cs > 0) colSpan = Math.min(cs, _chunkHXUCZ2ILcjs.MAX_COLS);
|
|
2477
|
+
if (rs > 0) rowSpan = Math.min(rs, _chunkHXUCZ2ILcjs.MAX_ROWS);
|
|
3027
2478
|
}
|
|
3028
2479
|
let i = startIdx + 1;
|
|
3029
2480
|
while (i < records.length) {
|
|
@@ -3043,8 +2494,8 @@ function arrangeCells(rows, cols, cells) {
|
|
|
3043
2494
|
const hasAddr = cells.some((c) => c.colAddr !== void 0 && c.rowAddr !== void 0);
|
|
3044
2495
|
if (hasAddr) {
|
|
3045
2496
|
for (const cell of cells) {
|
|
3046
|
-
const r = cell.rowAddr
|
|
3047
|
-
const c = cell.colAddr
|
|
2497
|
+
const r = _nullishCoalesce(cell.rowAddr, () => ( 0));
|
|
2498
|
+
const c = _nullishCoalesce(cell.colAddr, () => ( 0));
|
|
3048
2499
|
if (r >= rows || c >= cols) continue;
|
|
3049
2500
|
grid[r][c] = cell;
|
|
3050
2501
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
@@ -3075,2308 +2526,56 @@ function arrangeCells(rows, cols, cells) {
|
|
|
3075
2526
|
return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
|
|
3076
2527
|
}
|
|
3077
2528
|
|
|
3078
|
-
// src/
|
|
3079
|
-
init_page_range();
|
|
2529
|
+
// src/xlsx/parser.ts
|
|
3080
2530
|
|
|
3081
|
-
|
|
3082
|
-
var
|
|
3083
|
-
var
|
|
3084
|
-
var
|
|
3085
|
-
var
|
|
3086
|
-
|
|
3087
|
-
|
|
3088
|
-
|
|
3089
|
-
|
|
3090
|
-
|
|
3091
|
-
|
|
3092
|
-
function extractLines(fnArray, argsArray) {
|
|
3093
|
-
const horizontals = [];
|
|
3094
|
-
const verticals = [];
|
|
3095
|
-
let lineWidth = 1;
|
|
3096
|
-
let currentPath = [];
|
|
3097
|
-
let pathStartX = 0, pathStartY = 0;
|
|
3098
|
-
let curX = 0, curY = 0;
|
|
3099
|
-
function pushRectangle(path, rx, ry, rw, rh) {
|
|
3100
|
-
if (Math.abs(rh) < ORIENTATION_TOL * 2) {
|
|
3101
|
-
path.push({ x1: rx, y1: ry + rh / 2, x2: rx + rw, y2: ry + rh / 2 });
|
|
3102
|
-
} else if (Math.abs(rw) < ORIENTATION_TOL * 2) {
|
|
3103
|
-
path.push({ x1: rx + rw / 2, y1: ry, x2: rx + rw / 2, y2: ry + rh });
|
|
3104
|
-
} else {
|
|
3105
|
-
path.push(
|
|
3106
|
-
{ x1: rx, y1: ry, x2: rx + rw, y2: ry },
|
|
3107
|
-
{ x1: rx + rw, y1: ry, x2: rx + rw, y2: ry + rh },
|
|
3108
|
-
{ x1: rx + rw, y1: ry + rh, x2: rx, y2: ry + rh },
|
|
3109
|
-
{ x1: rx, y1: ry + rh, x2: rx, y2: ry }
|
|
3110
|
-
);
|
|
3111
|
-
}
|
|
3112
|
-
}
|
|
3113
|
-
function flushPath(isStroke) {
|
|
3114
|
-
if (!isStroke) {
|
|
3115
|
-
currentPath = [];
|
|
3116
|
-
return;
|
|
3117
|
-
}
|
|
3118
|
-
for (const seg of currentPath) {
|
|
3119
|
-
classifyAndAdd(seg, lineWidth, horizontals, verticals);
|
|
3120
|
-
}
|
|
3121
|
-
currentPath = [];
|
|
3122
|
-
}
|
|
3123
|
-
for (let i = 0; i < fnArray.length; i++) {
|
|
3124
|
-
const op = fnArray[i];
|
|
3125
|
-
const args = argsArray[i];
|
|
3126
|
-
switch (op) {
|
|
3127
|
-
case import_pdf.OPS.setLineWidth:
|
|
3128
|
-
lineWidth = args[0] || 1;
|
|
3129
|
-
break;
|
|
3130
|
-
case import_pdf.OPS.constructPath: {
|
|
3131
|
-
const arg0 = args[0];
|
|
3132
|
-
if (Array.isArray(arg0)) {
|
|
3133
|
-
const subOps = arg0;
|
|
3134
|
-
const coords = args[1];
|
|
3135
|
-
let ci = 0;
|
|
3136
|
-
for (const subOp of subOps) {
|
|
3137
|
-
if (subOp === import_pdf.OPS.moveTo) {
|
|
3138
|
-
curX = coords[ci++];
|
|
3139
|
-
curY = coords[ci++];
|
|
3140
|
-
pathStartX = curX;
|
|
3141
|
-
pathStartY = curY;
|
|
3142
|
-
} else if (subOp === import_pdf.OPS.lineTo) {
|
|
3143
|
-
const x2 = coords[ci++], y2 = coords[ci++];
|
|
3144
|
-
currentPath.push({ x1: curX, y1: curY, x2, y2 });
|
|
3145
|
-
curX = x2;
|
|
3146
|
-
curY = y2;
|
|
3147
|
-
} else if (subOp === import_pdf.OPS.rectangle) {
|
|
3148
|
-
const rx = coords[ci++], ry = coords[ci++];
|
|
3149
|
-
const rw = coords[ci++], rh = coords[ci++];
|
|
3150
|
-
pushRectangle(currentPath, rx, ry, rw, rh);
|
|
3151
|
-
} else if (subOp === import_pdf.OPS.closePath) {
|
|
3152
|
-
if (curX !== pathStartX || curY !== pathStartY) {
|
|
3153
|
-
currentPath.push({ x1: curX, y1: curY, x2: pathStartX, y2: pathStartY });
|
|
3154
|
-
}
|
|
3155
|
-
curX = pathStartX;
|
|
3156
|
-
curY = pathStartY;
|
|
3157
|
-
} else if (subOp === import_pdf.OPS.curveTo) {
|
|
3158
|
-
ci += 6;
|
|
3159
|
-
} else if (subOp === import_pdf.OPS.curveTo2 || subOp === import_pdf.OPS.curveTo3) {
|
|
3160
|
-
ci += 4;
|
|
3161
|
-
}
|
|
3162
|
-
}
|
|
3163
|
-
} else {
|
|
3164
|
-
const afterOp = arg0;
|
|
3165
|
-
const dataArr = args[1];
|
|
3166
|
-
const pathData = dataArr?.[0];
|
|
3167
|
-
if (pathData && typeof pathData === "object") {
|
|
3168
|
-
const len = Object.keys(pathData).length;
|
|
3169
|
-
let di = 0;
|
|
3170
|
-
while (di < len) {
|
|
3171
|
-
const drawOp = pathData[di++];
|
|
3172
|
-
if (drawOp === 0 /* moveTo */) {
|
|
3173
|
-
curX = pathData[di++];
|
|
3174
|
-
curY = pathData[di++];
|
|
3175
|
-
pathStartX = curX;
|
|
3176
|
-
pathStartY = curY;
|
|
3177
|
-
} else if (drawOp === 1 /* lineTo */) {
|
|
3178
|
-
const x2 = pathData[di++], y2 = pathData[di++];
|
|
3179
|
-
currentPath.push({ x1: curX, y1: curY, x2, y2 });
|
|
3180
|
-
curX = x2;
|
|
3181
|
-
curY = y2;
|
|
3182
|
-
} else if (drawOp === 2 /* curveTo */) {
|
|
3183
|
-
di += 6;
|
|
3184
|
-
} else if (drawOp === 3 /* quadraticCurveTo */) {
|
|
3185
|
-
di += 4;
|
|
3186
|
-
} else if (drawOp === 4 /* closePath */) {
|
|
3187
|
-
if (curX !== pathStartX || curY !== pathStartY) {
|
|
3188
|
-
currentPath.push({ x1: curX, y1: curY, x2: pathStartX, y2: pathStartY });
|
|
3189
|
-
}
|
|
3190
|
-
curX = pathStartX;
|
|
3191
|
-
curY = pathStartY;
|
|
3192
|
-
} else {
|
|
3193
|
-
break;
|
|
3194
|
-
}
|
|
3195
|
-
}
|
|
3196
|
-
}
|
|
3197
|
-
if (afterOp === import_pdf.OPS.stroke || afterOp === import_pdf.OPS.closeStroke) {
|
|
3198
|
-
flushPath(true);
|
|
3199
|
-
} else if (afterOp === import_pdf.OPS.fill || afterOp === import_pdf.OPS.eoFill || afterOp === import_pdf.OPS.fillStroke || afterOp === import_pdf.OPS.eoFillStroke || afterOp === import_pdf.OPS.closeFillStroke || afterOp === import_pdf.OPS.closeEOFillStroke) {
|
|
3200
|
-
flushPath(true);
|
|
3201
|
-
} else if (afterOp === import_pdf.OPS.endPath) {
|
|
3202
|
-
flushPath(false);
|
|
3203
|
-
}
|
|
3204
|
-
}
|
|
3205
|
-
break;
|
|
3206
|
-
}
|
|
3207
|
-
case import_pdf.OPS.stroke:
|
|
3208
|
-
case import_pdf.OPS.closeStroke:
|
|
3209
|
-
flushPath(true);
|
|
3210
|
-
break;
|
|
3211
|
-
case import_pdf.OPS.fill:
|
|
3212
|
-
case import_pdf.OPS.eoFill:
|
|
3213
|
-
case import_pdf.OPS.fillStroke:
|
|
3214
|
-
case import_pdf.OPS.eoFillStroke:
|
|
3215
|
-
case import_pdf.OPS.closeFillStroke:
|
|
3216
|
-
case import_pdf.OPS.closeEOFillStroke:
|
|
3217
|
-
flushPath(true);
|
|
3218
|
-
break;
|
|
3219
|
-
case import_pdf.OPS.endPath:
|
|
3220
|
-
flushPath(false);
|
|
3221
|
-
break;
|
|
3222
|
-
}
|
|
3223
|
-
}
|
|
3224
|
-
return { horizontals, verticals };
|
|
3225
|
-
}
|
|
3226
|
-
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
3227
|
-
const dx = Math.abs(seg.x2 - seg.x1);
|
|
3228
|
-
const dy = Math.abs(seg.y2 - seg.y1);
|
|
3229
|
-
const length = Math.sqrt(dx * dx + dy * dy);
|
|
3230
|
-
if (length < MIN_LINE_LENGTH) return;
|
|
3231
|
-
if (dy <= ORIENTATION_TOL) {
|
|
3232
|
-
const y = (seg.y1 + seg.y2) / 2;
|
|
3233
|
-
const x1 = Math.min(seg.x1, seg.x2);
|
|
3234
|
-
const x2 = Math.max(seg.x1, seg.x2);
|
|
3235
|
-
horizontals.push({ x1, y1: y, x2, y2: y, lineWidth });
|
|
3236
|
-
} else if (dx <= ORIENTATION_TOL) {
|
|
3237
|
-
const x = (seg.x1 + seg.x2) / 2;
|
|
3238
|
-
const y1 = Math.min(seg.y1, seg.y2);
|
|
3239
|
-
const y2 = Math.max(seg.y1, seg.y2);
|
|
3240
|
-
verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
|
|
3241
|
-
}
|
|
3242
|
-
}
|
|
3243
|
-
function preprocessLines(horizontals, verticals) {
|
|
3244
|
-
let h = horizontals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3245
|
-
let v = verticals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3246
|
-
h = mergeParallelLines(h, "h");
|
|
3247
|
-
v = mergeParallelLines(v, "v");
|
|
3248
|
-
return { horizontals: h, verticals: v };
|
|
3249
|
-
}
|
|
3250
|
-
function mergeParallelLines(lines, dir) {
|
|
3251
|
-
if (lines.length <= 1) return lines;
|
|
3252
|
-
const sorted = [...lines].sort((a, b) => {
|
|
3253
|
-
const posA = dir === "h" ? a.y1 : a.x1;
|
|
3254
|
-
const posB = dir === "h" ? b.y1 : b.x1;
|
|
3255
|
-
if (Math.abs(posA - posB) > 0.1) return posA - posB;
|
|
3256
|
-
return dir === "h" ? a.x1 - b.x1 : a.y1 - b.y1;
|
|
3257
|
-
});
|
|
3258
|
-
const MERGE_TOL = 3;
|
|
3259
|
-
const result = [sorted[0]];
|
|
3260
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
3261
|
-
const prev = result[result.length - 1];
|
|
3262
|
-
const curr = sorted[i];
|
|
3263
|
-
const prevPos = dir === "h" ? prev.y1 : prev.x1;
|
|
3264
|
-
const currPos = dir === "h" ? curr.y1 : curr.x1;
|
|
3265
|
-
if (Math.abs(prevPos - currPos) <= MERGE_TOL) {
|
|
3266
|
-
const prevStart = dir === "h" ? prev.x1 : prev.y1;
|
|
3267
|
-
const prevEnd = dir === "h" ? prev.x2 : prev.y2;
|
|
3268
|
-
const currStart = dir === "h" ? curr.x1 : curr.y1;
|
|
3269
|
-
const currEnd = dir === "h" ? curr.x2 : curr.y2;
|
|
3270
|
-
const overlap = Math.min(prevEnd, currEnd) - Math.max(prevStart, currStart);
|
|
3271
|
-
const minLen = Math.min(prevEnd - prevStart, currEnd - currStart);
|
|
3272
|
-
if (overlap > minLen * 0.3) {
|
|
3273
|
-
if (dir === "h") {
|
|
3274
|
-
prev.x1 = Math.min(prev.x1, curr.x1);
|
|
3275
|
-
prev.x2 = Math.max(prev.x2, curr.x2);
|
|
3276
|
-
prev.y1 = (prev.y1 + curr.y1) / 2;
|
|
3277
|
-
prev.y2 = prev.y1;
|
|
3278
|
-
} else {
|
|
3279
|
-
prev.y1 = Math.min(prev.y1, curr.y1);
|
|
3280
|
-
prev.y2 = Math.max(prev.y2, curr.y2);
|
|
3281
|
-
prev.x1 = (prev.x1 + curr.x1) / 2;
|
|
3282
|
-
prev.x2 = prev.x1;
|
|
3283
|
-
}
|
|
3284
|
-
prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
|
|
3285
|
-
continue;
|
|
3286
|
-
}
|
|
3287
|
-
}
|
|
3288
|
-
result.push(curr);
|
|
3289
|
-
}
|
|
3290
|
-
return result;
|
|
3291
|
-
}
|
|
3292
|
-
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
3293
|
-
const margin = 5;
|
|
3294
|
-
return {
|
|
3295
|
-
horizontals: horizontals.filter(
|
|
3296
|
-
(l) => !(Math.abs(l.y1) < margin || Math.abs(l.y1 - pageHeight) < margin) || l.x2 - l.x1 < pageWidth * 0.9
|
|
3297
|
-
),
|
|
3298
|
-
verticals: verticals.filter(
|
|
3299
|
-
(l) => !(Math.abs(l.x1) < margin || Math.abs(l.x1 - pageWidth) < margin) || l.y2 - l.y1 < pageHeight * 0.9
|
|
3300
|
-
)
|
|
3301
|
-
};
|
|
2531
|
+
|
|
2532
|
+
var MAX_SHEETS = 100;
|
|
2533
|
+
var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024;
|
|
2534
|
+
var MAX_ROWS2 = 1e4;
|
|
2535
|
+
var MAX_COLS2 = 200;
|
|
2536
|
+
function cleanNumericValue(raw) {
|
|
2537
|
+
if (!/^-?\d+\.\d+$/.test(raw)) return raw;
|
|
2538
|
+
const num = parseFloat(raw);
|
|
2539
|
+
if (!isFinite(num)) return raw;
|
|
2540
|
+
const cleaned = parseFloat(num.toPrecision(15)).toString();
|
|
2541
|
+
return cleaned;
|
|
3302
2542
|
}
|
|
3303
|
-
function
|
|
3304
|
-
const
|
|
3305
|
-
|
|
3306
|
-
|
|
3307
|
-
|
|
3308
|
-
|
|
3309
|
-
const radius = Math.max(h.lineWidth, v.lineWidth, 1);
|
|
3310
|
-
vertices.push({ x: v.x1, y: h.y1, radius });
|
|
3311
|
-
}
|
|
3312
|
-
}
|
|
3313
|
-
}
|
|
3314
|
-
return vertices;
|
|
3315
|
-
}
|
|
3316
|
-
function mergeVertices(vertices) {
|
|
3317
|
-
if (vertices.length <= 1) return vertices;
|
|
3318
|
-
const merged = [];
|
|
3319
|
-
const used = new Array(vertices.length).fill(false);
|
|
3320
|
-
for (let i = 0; i < vertices.length; i++) {
|
|
3321
|
-
if (used[i]) continue;
|
|
3322
|
-
let sumX = vertices[i].x, sumY = vertices[i].y;
|
|
3323
|
-
let maxRadius = vertices[i].radius;
|
|
3324
|
-
let count = 1;
|
|
3325
|
-
for (let j = i + 1; j < vertices.length; j++) {
|
|
3326
|
-
if (used[j]) continue;
|
|
3327
|
-
const mergeTol = VERTEX_MERGE_FACTOR * Math.max(maxRadius, vertices[j].radius);
|
|
3328
|
-
if (Math.abs(vertices[i].x - vertices[j].x) <= mergeTol && Math.abs(vertices[i].y - vertices[j].y) <= mergeTol) {
|
|
3329
|
-
sumX += vertices[j].x;
|
|
3330
|
-
sumY += vertices[j].y;
|
|
3331
|
-
maxRadius = Math.max(maxRadius, vertices[j].radius);
|
|
3332
|
-
count++;
|
|
3333
|
-
used[j] = true;
|
|
3334
|
-
}
|
|
3335
|
-
}
|
|
3336
|
-
merged.push({ x: sumX / count, y: sumY / count, radius: maxRadius });
|
|
3337
|
-
}
|
|
3338
|
-
return merged;
|
|
2543
|
+
function parseCellRef(ref) {
|
|
2544
|
+
const m = ref.match(/^([A-Z]+)(\d+)$/);
|
|
2545
|
+
if (!m) return null;
|
|
2546
|
+
let col = 0;
|
|
2547
|
+
for (const ch of m[1]) col = col * 26 + (ch.charCodeAt(0) - 64);
|
|
2548
|
+
return { col: col - 1, row: parseInt(m[2], 10) - 1 };
|
|
3339
2549
|
}
|
|
3340
|
-
function
|
|
3341
|
-
|
|
3342
|
-
|
|
3343
|
-
const
|
|
3344
|
-
|
|
3345
|
-
|
|
3346
|
-
|
|
3347
|
-
...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
|
|
3348
|
-
...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
|
|
3349
|
-
];
|
|
3350
|
-
const groups = groupConnectedLines(allLines);
|
|
3351
|
-
const grids = [];
|
|
3352
|
-
for (const group of groups) {
|
|
3353
|
-
const hLines = group.filter((l) => l.type === "h");
|
|
3354
|
-
const vLines = group.filter((l) => l.type === "v");
|
|
3355
|
-
if (hLines.length < 2 || vLines.length < 2) continue;
|
|
3356
|
-
let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
|
|
3357
|
-
for (const l of vLines) {
|
|
3358
|
-
if (l.x1 < gx1) gx1 = l.x1;
|
|
3359
|
-
if (l.x1 > gx2) gx2 = l.x1;
|
|
3360
|
-
}
|
|
3361
|
-
for (const l of hLines) {
|
|
3362
|
-
if (l.y1 < gy1) gy1 = l.y1;
|
|
3363
|
-
if (l.y1 > gy2) gy2 = l.y1;
|
|
3364
|
-
}
|
|
3365
|
-
const groupBbox = {
|
|
3366
|
-
x1: gx1 - CONNECT_TOL,
|
|
3367
|
-
y1: gy1 - CONNECT_TOL,
|
|
3368
|
-
x2: gx2 + CONNECT_TOL,
|
|
3369
|
-
y2: gy2 + CONNECT_TOL
|
|
3370
|
-
};
|
|
3371
|
-
const groupVertices = vertices.filter(
|
|
3372
|
-
(v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
|
|
3373
|
-
);
|
|
3374
|
-
const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
|
|
3375
|
-
const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
|
|
3376
|
-
const rawYs = [
|
|
3377
|
-
...hLines.map((l) => l.y1),
|
|
3378
|
-
...groupVertices.map((v) => v.y)
|
|
3379
|
-
];
|
|
3380
|
-
const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
|
|
3381
|
-
const rawXs = [
|
|
3382
|
-
...vLines.map((l) => l.x1),
|
|
3383
|
-
...groupVertices.map((v) => v.x)
|
|
3384
|
-
];
|
|
3385
|
-
const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
|
|
3386
|
-
if (rowYs.length < 2 || colXs.length < 2) continue;
|
|
3387
|
-
const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
|
|
3388
|
-
const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
|
|
3389
|
-
if (validRowYs.length < 2 || validColXs.length < 2) continue;
|
|
3390
|
-
const bbox = {
|
|
3391
|
-
x1: validColXs[0],
|
|
3392
|
-
y1: validRowYs[validRowYs.length - 1],
|
|
3393
|
-
x2: validColXs[validColXs.length - 1],
|
|
3394
|
-
y2: validRowYs[0]
|
|
3395
|
-
};
|
|
3396
|
-
grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
|
|
3397
|
-
}
|
|
3398
|
-
return mergeAdjacentGrids(grids);
|
|
2550
|
+
function parseMergeRef(ref) {
|
|
2551
|
+
const parts = ref.split(":");
|
|
2552
|
+
if (parts.length !== 2) return null;
|
|
2553
|
+
const start = parseCellRef(parts[0]);
|
|
2554
|
+
const end = parseCellRef(parts[1]);
|
|
2555
|
+
if (!start || !end) return null;
|
|
2556
|
+
return { startCol: start.col, startRow: start.row, endCol: end.col, endRow: end.row };
|
|
3399
2557
|
}
|
|
3400
|
-
function
|
|
3401
|
-
|
|
3402
|
-
const result = [
|
|
3403
|
-
for (let i =
|
|
3404
|
-
const prevX = result[result.length - 1];
|
|
3405
|
-
if (colXs[i] - prevX < minWidth && i < colXs.length - 1) {
|
|
3406
|
-
continue;
|
|
3407
|
-
}
|
|
3408
|
-
result.push(colXs[i]);
|
|
3409
|
-
}
|
|
2558
|
+
function getElements(parent, tagName) {
|
|
2559
|
+
const nodes = parent.getElementsByTagName(tagName);
|
|
2560
|
+
const result = [];
|
|
2561
|
+
for (let i = 0; i < nodes.length; i++) result.push(nodes[i]);
|
|
3410
2562
|
return result;
|
|
3411
2563
|
}
|
|
3412
|
-
function
|
|
3413
|
-
|
|
3414
|
-
const result = [rowYs[0]];
|
|
3415
|
-
for (let i = 1; i < rowYs.length; i++) {
|
|
3416
|
-
const prevY = result[result.length - 1];
|
|
3417
|
-
if (prevY - rowYs[i] < minHeight && i < rowYs.length - 1) {
|
|
3418
|
-
continue;
|
|
3419
|
-
}
|
|
3420
|
-
result.push(rowYs[i]);
|
|
3421
|
-
}
|
|
3422
|
-
return result;
|
|
2564
|
+
function getTextContent(el) {
|
|
2565
|
+
return _nullishCoalesce(_optionalChain([el, 'access', _37 => _37.textContent, 'optionalAccess', _38 => _38.trim, 'call', _39 => _39()]), () => ( ""));
|
|
3423
2566
|
}
|
|
3424
|
-
function
|
|
3425
|
-
|
|
3426
|
-
const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
3427
|
-
const merged = [sorted[0]];
|
|
3428
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
3429
|
-
const prev = merged[merged.length - 1];
|
|
3430
|
-
const curr = sorted[i];
|
|
3431
|
-
if (prev.colXs.length === curr.colXs.length) {
|
|
3432
|
-
const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
|
|
3433
|
-
const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
|
|
3434
|
-
const verticalGap = prev.bbox.y1 - curr.bbox.y2;
|
|
3435
|
-
if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
|
|
3436
|
-
const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
|
|
3437
|
-
merged[merged.length - 1] = {
|
|
3438
|
-
rowYs: allRowYs,
|
|
3439
|
-
colXs: prev.colXs,
|
|
3440
|
-
bbox: {
|
|
3441
|
-
x1: Math.min(prev.bbox.x1, curr.bbox.x1),
|
|
3442
|
-
y1: Math.min(prev.bbox.y1, curr.bbox.y1),
|
|
3443
|
-
x2: Math.max(prev.bbox.x2, curr.bbox.x2),
|
|
3444
|
-
y2: Math.max(prev.bbox.y2, curr.bbox.y2)
|
|
3445
|
-
},
|
|
3446
|
-
vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
|
|
3447
|
-
};
|
|
3448
|
-
continue;
|
|
3449
|
-
}
|
|
3450
|
-
}
|
|
3451
|
-
merged.push(curr);
|
|
3452
|
-
}
|
|
3453
|
-
return merged;
|
|
2567
|
+
function parseXml(text) {
|
|
2568
|
+
return new (0, _xmldom.DOMParser)().parseFromString(_chunkHXUCZ2ILcjs.stripDtd.call(void 0, text), "text/xml");
|
|
3454
2569
|
}
|
|
3455
|
-
function
|
|
3456
|
-
|
|
3457
|
-
const
|
|
3458
|
-
const
|
|
3459
|
-
for (
|
|
3460
|
-
const
|
|
3461
|
-
|
|
3462
|
-
if (Math.abs(sorted[i] - avg) <= tolerance) {
|
|
3463
|
-
last.sum += sorted[i];
|
|
3464
|
-
last.count++;
|
|
3465
|
-
} else {
|
|
3466
|
-
clusters.push({ sum: sorted[i], count: 1 });
|
|
3467
|
-
}
|
|
2570
|
+
function parseSharedStrings(xml) {
|
|
2571
|
+
const doc = parseXml(xml);
|
|
2572
|
+
const strings = [];
|
|
2573
|
+
const siList = getElements(doc.documentElement, "si");
|
|
2574
|
+
for (const si of siList) {
|
|
2575
|
+
const tElements = getElements(si, "t");
|
|
2576
|
+
strings.push(tElements.map((t) => _nullishCoalesce(t.textContent, () => ( ""))).join(""));
|
|
3468
2577
|
}
|
|
3469
|
-
return
|
|
3470
|
-
}
|
|
3471
|
-
function groupConnectedLines(lines) {
|
|
3472
|
-
const parent = lines.map((_, i) => i);
|
|
3473
|
-
function find(x) {
|
|
3474
|
-
while (parent[x] !== x) {
|
|
3475
|
-
parent[x] = parent[parent[x]];
|
|
3476
|
-
x = parent[x];
|
|
3477
|
-
}
|
|
3478
|
-
return x;
|
|
3479
|
-
}
|
|
3480
|
-
function union(a, b) {
|
|
3481
|
-
const ra = find(a), rb = find(b);
|
|
3482
|
-
if (ra !== rb) parent[ra] = rb;
|
|
3483
|
-
}
|
|
3484
|
-
for (let i = 0; i < lines.length; i++) {
|
|
3485
|
-
for (let j = i + 1; j < lines.length; j++) {
|
|
3486
|
-
if (linesIntersect(lines[i], lines[j])) {
|
|
3487
|
-
union(i, j);
|
|
3488
|
-
}
|
|
3489
|
-
}
|
|
3490
|
-
}
|
|
3491
|
-
const groups = /* @__PURE__ */ new Map();
|
|
3492
|
-
for (let i = 0; i < lines.length; i++) {
|
|
3493
|
-
const root = find(i);
|
|
3494
|
-
if (!groups.has(root)) groups.set(root, []);
|
|
3495
|
-
groups.get(root).push(lines[i]);
|
|
3496
|
-
}
|
|
3497
|
-
return [...groups.values()];
|
|
3498
|
-
}
|
|
3499
|
-
function linesIntersect(a, b) {
|
|
3500
|
-
if (a.type === b.type) {
|
|
3501
|
-
if (a.type === "h") {
|
|
3502
|
-
if (Math.abs(a.y1 - b.y1) > CONNECT_TOL) return false;
|
|
3503
|
-
return Math.min(a.x2, b.x2) >= Math.max(a.x1, b.x1) - CONNECT_TOL;
|
|
3504
|
-
} else {
|
|
3505
|
-
if (Math.abs(a.x1 - b.x1) > CONNECT_TOL) return false;
|
|
3506
|
-
return Math.min(a.y2, b.y2) >= Math.max(a.y1, b.y1) - CONNECT_TOL;
|
|
3507
|
-
}
|
|
3508
|
-
}
|
|
3509
|
-
const h = a.type === "h" ? a : b;
|
|
3510
|
-
const v = a.type === "h" ? b : a;
|
|
3511
|
-
const tol = CONNECT_TOL;
|
|
3512
|
-
return v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol && h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol;
|
|
3513
|
-
}
|
|
3514
|
-
function extractCells(grid, horizontals, verticals) {
|
|
3515
|
-
const { rowYs, colXs } = grid;
|
|
3516
|
-
const numRows = rowYs.length - 1;
|
|
3517
|
-
const numCols = colXs.length - 1;
|
|
3518
|
-
if (numRows <= 0 || numCols <= 0) return [];
|
|
3519
|
-
const vBorders = Array.from(
|
|
3520
|
-
{ length: numRows },
|
|
3521
|
-
(_, r) => Array.from(
|
|
3522
|
-
{ length: numCols + 1 },
|
|
3523
|
-
(_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
|
|
3524
|
-
)
|
|
3525
|
-
);
|
|
3526
|
-
const hBorders = Array.from(
|
|
3527
|
-
{ length: numRows + 1 },
|
|
3528
|
-
(_, r) => Array.from(
|
|
3529
|
-
{ length: numCols },
|
|
3530
|
-
(_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
|
|
3531
|
-
)
|
|
3532
|
-
);
|
|
3533
|
-
const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
|
|
3534
|
-
const cells = [];
|
|
3535
|
-
for (let r = 0; r < numRows; r++) {
|
|
3536
|
-
for (let c = 0; c < numCols; c++) {
|
|
3537
|
-
if (occupied[r][c]) continue;
|
|
3538
|
-
let colSpan = 1;
|
|
3539
|
-
let rowSpan = 1;
|
|
3540
|
-
while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
|
|
3541
|
-
let canExpand = true;
|
|
3542
|
-
for (let dr = 0; dr < rowSpan; dr++) {
|
|
3543
|
-
if (vBorders[r + dr][c + colSpan]) {
|
|
3544
|
-
canExpand = false;
|
|
3545
|
-
break;
|
|
3546
|
-
}
|
|
3547
|
-
}
|
|
3548
|
-
if (!canExpand) break;
|
|
3549
|
-
colSpan++;
|
|
3550
|
-
}
|
|
3551
|
-
while (r + rowSpan < numRows) {
|
|
3552
|
-
let hasLine = false;
|
|
3553
|
-
for (let dc = 0; dc < colSpan; dc++) {
|
|
3554
|
-
if (hBorders[r + rowSpan][c + dc]) {
|
|
3555
|
-
hasLine = true;
|
|
3556
|
-
break;
|
|
3557
|
-
}
|
|
3558
|
-
}
|
|
3559
|
-
if (hasLine) break;
|
|
3560
|
-
rowSpan++;
|
|
3561
|
-
}
|
|
3562
|
-
for (let dr = 0; dr < rowSpan; dr++) {
|
|
3563
|
-
for (let dc = 0; dc < colSpan; dc++) {
|
|
3564
|
-
occupied[r + dr][c + dc] = true;
|
|
3565
|
-
}
|
|
3566
|
-
}
|
|
3567
|
-
cells.push({
|
|
3568
|
-
row: r,
|
|
3569
|
-
col: c,
|
|
3570
|
-
rowSpan,
|
|
3571
|
-
colSpan,
|
|
3572
|
-
bbox: {
|
|
3573
|
-
x1: colXs[c],
|
|
3574
|
-
y1: rowYs[r + rowSpan],
|
|
3575
|
-
x2: colXs[c + colSpan],
|
|
3576
|
-
y2: rowYs[r]
|
|
3577
|
-
}
|
|
3578
|
-
});
|
|
3579
|
-
}
|
|
3580
|
-
}
|
|
3581
|
-
return cells;
|
|
3582
|
-
}
|
|
3583
|
-
function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
|
|
3584
|
-
const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
|
|
3585
|
-
for (const v of verticals) {
|
|
3586
|
-
if (Math.abs(v.x1 - x) <= tol) {
|
|
3587
|
-
const cellH = Math.abs(topY - botY);
|
|
3588
|
-
if (cellH < 0.1) continue;
|
|
3589
|
-
const overlapTop = Math.min(v.y2, topY);
|
|
3590
|
-
const overlapBot = Math.max(v.y1, botY);
|
|
3591
|
-
const overlap = overlapTop - overlapBot;
|
|
3592
|
-
if (overlap >= cellH * 0.75) return true;
|
|
3593
|
-
}
|
|
3594
|
-
}
|
|
3595
|
-
return false;
|
|
3596
|
-
}
|
|
3597
|
-
/**
 * Reports whether any horizontal segment sits at `y` and covers most of the
 * span between `leftX` and `rightX`.
 *
 * A segment qualifies when its `y1` is within the snapping tolerance of `y`
 * and its overlap with the span is at least 75% of the span's width.
 * The overlap is computed as `min(x2, rightX) - max(x1, leftX)`.
 *
 * @param {Iterable<{y1: number, x1: number, x2: number}>} horizontals - Candidate segments.
 * @param {number} y - Y position of the border being probed.
 * @param {number} leftX - Left bound of the span.
 * @param {number} rightX - Right bound of the span.
 * @param {number} vertexRadius - Scaled by VERTEX_MERGE_FACTOR to get the tolerance (minimum 4).
 * @returns {boolean} True when a qualifying segment exists.
 */
function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
  const snapDistance = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
  const spanWidth = Math.abs(rightX - leftX);
  for (const segment of horizontals) {
    const alignedWithY = Math.abs(segment.y1 - y) <= snapDistance;
    // Spans narrower than 0.1 are degenerate and can never be "covered".
    if (!alignedWithY || spanWidth < 0.1) {
      continue;
    }
    const covered = Math.min(segment.x2, rightX) - Math.max(segment.x1, leftX);
    if (covered >= spanWidth * 0.75) {
      return true;
    }
  }
  return false;
}
|
|
3611
|
-
/**
 * Assigns each text item to the cell that covers the largest share of it.
 *
 * Coverage is the area of the item's box clipped to the cell's bbox (grown by
 * CELL_PADDING on every side), divided by the item's own area. The item's
 * height falls back to `fontSize` when `h` is falsy, and both sides of the
 * item area are floored at 1. An item is kept only when its best coverage
 * exceeds 0.3; ties keep the earlier cell.
 *
 * @param {Iterable<object>} items - Text items with `x`, `y`, `w`, and `h` or `fontSize`.
 * @param {Iterable<object>} cells - Cells carrying `bbox: {x1, y1, x2, y2}`.
 * @returns {Map<object, object[]>} Every cell mapped to its (possibly empty) item list.
 */
function mapTextToCells(items, cells) {
  const assignment = /* @__PURE__ */ new Map();
  for (const cell of cells) {
    assignment.set(cell, []);
  }
  for (const item of items) {
    const margin = CELL_PADDING;
    const itemHeight = item.h || item.fontSize;
    const itemRight = item.x + item.w;
    const itemTop = item.y + itemHeight;
    const itemArea = Math.max(item.w, 1) * Math.max(itemHeight, 1);
    let winner = null;
    let winnerCoverage = 0;
    for (const candidate of cells) {
      const box = candidate.bbox;
      const left = Math.max(item.x, box.x1 - margin);
      const right = Math.min(itemRight, box.x2 + margin);
      const bottom = Math.max(item.y, box.y1 - margin);
      const top = Math.min(itemTop, box.y2 + margin);
      if (left >= right || bottom >= top) {
        continue; // no overlap with this cell
      }
      const coverage = ((right - left) * (top - bottom)) / itemArea;
      if (coverage > winnerCoverage) {
        winnerCoverage = coverage;
        winner = candidate;
      }
    }
    if (winner && winnerCoverage > 0.3) {
      assignment.get(winner).push(item);
    }
  }
  return assignment;
}
|
|
3640
|
-
/**
 * Renders the text items of one cell as a string.
 *
 * Items are sorted by descending `y` (then ascending `x`) and split into
 * visual lines: an item joins the current line when its `y` is within
 * max(3, 0.6 * the smaller font size) of the line's first `y`. Within a line,
 * neighbours are concatenated directly when flagged by detectEvenSpacedItems,
 * when the gap is under 15% of the mean font size, or when the gap is under
 * 35% and either side is Hangul; otherwise a single space is inserted.
 * The per-line strings are then passed to mergeCellTextLines.
 *
 * @param {object[]} items - Items with `text`, `x`, `y`, `w`, `fontSize`.
 * @returns {string} "" for no items, the lone item's text for one item.
 */
function cellTextToString(items) {
  if (items.length === 0) return "";
  if (items.length === 1) return items[0].text;

  const ordered = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
  const visualLines = [];
  let openLine = [ordered[0]];
  let anchorY = ordered[0].y;
  for (const item of ordered.slice(1)) {
    const tolerance = Math.max(3, Math.min(item.fontSize, openLine[0].fontSize) * 0.6);
    if (Math.abs(item.y - anchorY) <= tolerance) {
      openLine.push(item);
      continue;
    }
    visualLines.push(openLine);
    openLine = [item];
    anchorY = item.y;
  }
  visualLines.push(openLine);

  const rendered = visualLines.map((line) => {
    const run = line.sort((a, b) => a.x - b.x);
    if (run.length === 1) return run[0].text;
    const glued = detectEvenSpacedItems(run);
    let text = run[0].text;
    for (let k = 1; k < run.length; k++) {
      const piece = run[k];
      const before = run[k - 1];
      if (glued[k]) {
        text += piece.text;
        continue;
      }
      const gap = piece.x - (before.x + before.w);
      const meanSize = (piece.fontSize + before.fontSize) / 2;
      const endsKorean = /[가-힣]$/.test(text);
      const startsKorean = /^[가-힣]/.test(piece.text);
      const isTight = gap < meanSize * 0.15;
      const isTightKorean = gap < meanSize * 0.35 && (endsKorean || startsKorean);
      if (isTight || isTightKorean) {
        text += piece.text;
      } else {
        text += " " + piece.text;
      }
    }
    return text;
  });
  return mergeCellTextLines(rendered);
}
|
|
3684
|
-
/**
 * Flags items that belong to a run of single-character pieces.
 *
 * A run is a stretch of consecutive items whose text is exactly one Hangul
 * syllable or one digit. A run is cut when the gap to the previous item
 * exceeds max(3 * fontSize, 30). Runs of three or more items are handed to
 * markEvenRun, which decides whether to set the flags.
 *
 * @param {object[]} items - Items of one line, with `text`, `x`, `w`, `fontSize`.
 * @returns {boolean[]} One flag per item; all false when fewer than 3 items.
 */
function detectEvenSpacedItems(items) {
  const flags = new Array(items.length).fill(false);
  if (items.length < 3) return flags;

  const isSingleGlyph = (text) => /^[가-힣]{1}$/.test(text) || /^[\d]{1}$/.test(text);
  // Hands the run [from, to) to markEvenRun when it is open and long enough.
  const closeRun = (from, to) => {
    if (from >= 0 && to - from >= 3) markEvenRun(items, flags, from, to);
  };

  let runStart = -1;
  for (const [idx, item] of items.entries()) {
    if (!isSingleGlyph(item.text)) {
      closeRun(runStart, idx);
      runStart = -1;
      continue;
    }
    if (runStart < 0) {
      runStart = idx;
      continue;
    }
    const previous = items[idx - 1];
    const gap = item.x - (previous.x + previous.w);
    if (gap > Math.max(item.fontSize * 3, 30)) {
      // Too far from its neighbour: finish the old run and start a new one here.
      closeRun(runStart, idx);
      runStart = idx;
    }
  }
  closeRun(runStart, items.length);
  return flags;
}
|
|
3713
|
-
/**
 * Sets `result[start + 1 .. end - 1]` to true when the items in
 * `[start, end)` are evenly spaced.
 *
 * Only positive gaps are considered and at least two are required. The run
 * counts as even when the smallest gap is at least 10% of the first item's
 * font size, the largest is at most 3x that font size, and the largest is at
 * most 3x the smallest. `result[start]` itself is never touched.
 *
 * @param {object[]} items - Items with `x`, `w`, `fontSize`.
 * @param {boolean[]} result - Flag array, mutated in place.
 * @param {number} start - Index of the first item of the run.
 * @param {number} end - Index one past the last item of the run.
 * @returns {void}
 */
function markEvenRun(items, result, start, end) {
  const positiveGaps = [];
  for (let idx = start + 1; idx < end; idx++) {
    const previous = items[idx - 1];
    const gap = items[idx].x - (previous.x + previous.w);
    if (gap > 0) positiveGaps.push(gap);
  }
  if (positiveGaps.length < 2) return;

  const narrowest = Math.min(...positiveGaps);
  const widest = Math.max(...positiveGaps);
  const referenceSize = items[start].fontSize;
  const isEven =
    narrowest >= referenceSize * 0.1 &&
    widest <= referenceSize * 3 &&
    widest / Math.max(narrowest, 0.1) <= 3;
  if (!isEven) return;

  let idx = start + 1;
  while (idx < end) {
    result[idx] = true;
    idx += 1;
  }
}
|
|
3732
|
-
/**
 * Joins the text lines of a cell, gluing a line onto the previous one when it
 * looks like a wrapped continuation.
 *
 * A line is appended (trimmed, without separator) to the previous line when:
 *  - the previous line ends in Hangul and the line is 1-8 Hangul characters;
 *  - the trimmed line is at most 3 characters and starts with `)`, `]`, `%` or `}`;
 *  - the trimmed previous line ends with `,` or `(` and the trimmed line is at most 15 characters;
 *  - the previous line ends with a digit or comma and the trimmed line is at most
 *    10 characters of digits/commas with an optional closing `)` or `]`.
 * All other lines are kept separate and joined with "\n".
 *
 * @param {string[]} textLines - Lines in reading order.
 * @returns {string} The merged text; "" for an empty list.
 */
function mergeCellTextLines(textLines) {
  if (textLines.length <= 1) return textLines[0] || "";

  const isHangulWrap = (prev, curr) =>
    /[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <= 8 && !curr.includes(" ");
  const isClosingFragment = (tail) => tail.length <= 3 && /^[)\]%}]/.test(tail);
  const followsOpenList = (prev, tail) => /[,(]$/.test(prev.trim()) && tail.length <= 15;
  const isNumberWrap = (prev, tail) =>
    /[\d,]$/.test(prev) && /^[\d,]+[)\]]?$/.test(tail) && tail.length <= 10;

  const merged = [textLines[0]];
  for (const curr of textLines.slice(1)) {
    const lastIdx = merged.length - 1;
    const prev = merged[lastIdx];
    if (isHangulWrap(prev, curr)) {
      merged[lastIdx] = prev + curr;
      continue;
    }
    const tail = curr.trim();
    if (isClosingFragment(tail) || followsOpenList(prev, tail) || isNumberWrap(prev, tail)) {
      merged[lastIdx] = prev + tail;
    } else {
      merged.push(curr);
    }
  }
  return merged.join("\n");
}
|
|
3752
|
-
|
|
3753
|
-
// src/pdf/cluster-detector.ts
|
|
3754
|
-
// Largest |y| difference for two items to share a row in groupByBaseline.
var Y_TOL = 3;
// Largest x gap inside one column cluster (extractColumnClusters);
// countMatchedColumns accepts items within twice this distance of a column.
var COL_CLUSTER_TOL = 15;
// Smallest number of rows a region or table must have.
var MIN_ROWS = 3;
// Smallest number of columns a row must match / a table must have.
var MIN_COLS = 2;
// hasSuspiciousGaps: a gap counts when it reaches
// max(average font size * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE).
var MIN_GAP_FACTOR = 2;
var MIN_GAP_ABSOLUTE = 20;
// extractColumnClusters keeps clusters holding at least
// max(2, floor(row count * MIN_COL_FILL_RATIO)) x positions.
var MIN_COL_FILL_RATIO = 0.4;
|
|
3761
|
-
/**
 * Detects tables on a page from text positions alone.
 *
 * Two strategies are tried in order:
 *  1. Header-driven: detectHeaderRow picks a header; rows from the header down
 *     are merged (mergeMultiLineRows) and split into regions
 *     (findTableRegionsByHeader).
 *  2. Gap-driven, only when strategy 1 produced nothing: rows with suspicious
 *     gaps define column clusters, regions come from findTableRegions and are
 *     merged per region.
 * Each region is built with buildClusterTable; for accepted tables the
 * `usedItems` set is expanded back to the original, un-merged items.
 *
 * @param {object[]} items - Text items of one page.
 * @param {number} pageNum - Page number forwarded to buildClusterTable.
 * @returns {object[]} Tables as returned by buildClusterTable; possibly empty.
 */
function detectClusterTables(items, pageNum) {
  if (items.length < MIN_ROWS * MIN_COLS) return [];
  const { merged, originMap } = mergeEvenSpacedClusters(items);
  const rows = groupByBaseline(merged);
  if (rows.length < MIN_ROWS) return [];

  const tables = [];
  // Keeps a built table and maps its merged items back to their origins.
  const accept = (table) => {
    if (!table) return;
    expandUsedItems(table.usedItems, originMap);
    tables.push(table);
  };

  const header = detectHeaderRow(rows);
  if (header) {
    const { columns, headerIdx } = header;
    const headerItems = [...rows[headerIdx].items].sort((a, b) => a.x - b.x);
    const candidateRows = mergeMultiLineRows(rows.slice(headerIdx), columns);
    for (const region of findTableRegionsByHeader(candidateRows, columns, headerItems)) {
      accept(buildClusterTable(region.rows, columns, pageNum));
    }
  }
  if (tables.length !== 0) return tables;

  const gappedRows = rows.filter((row) => hasSuspiciousGaps(row));
  if (gappedRows.length < MIN_ROWS) return tables;
  const columns = extractColumnClusters(gappedRows);
  if (columns.length < MIN_COLS) return tables;
  for (const region of findTableRegions(rows, columns)) {
    const regionRows = mergeMultiLineRows(region.rows, columns);
    accept(buildClusterTable(regionRows, columns, pageNum));
  }
  return tables;
}
|
|
3802
|
-
/**
 * Collapses evenly spaced runs of single characters into one synthetic item.
 *
 * Items are grouped into rows with groupByBaseline and sorted by `x`. A run is
 * a stretch of items whose text is one Hangul syllable or one digit, where
 * each gap is between 10% and 300% of the item's font size. A run of three or
 * more items is merged when its smallest gap is positive and its largest gap
 * is at most 3x the smallest. The merged item takes position, height and font
 * from the first item and spans to the right edge of the last one.
 *
 * @param {object[]} items - Text items with `text`, `x`, `y`, `w`, `h`, `fontSize`, `fontName`.
 * @returns {{merged: object[], originMap: Map<object, object[]>}} The item list
 *   after merging, and a map from each synthetic item to the items it replaced.
 */
function mergeEvenSpacedClusters(items) {
  const originMap = /* @__PURE__ */ new Map();
  const merged = [];
  const isSingleGlyph = (entry) => /^[가-힣\d]$/.test(entry.text);
  const gapBefore = (list, idx) => list[idx].x - (list[idx - 1].x + list[idx - 1].w);

  for (const { items: rowItems } of groupByBaseline(items)) {
    const ordered = [...rowItems].sort((a, b) => a.x - b.x);
    let cursor = 0;
    while (cursor < ordered.length) {
      let runEnd = cursor + 1;
      if (isSingleGlyph(ordered[cursor])) {
        while (runEnd < ordered.length && isSingleGlyph(ordered[runEnd])) {
          const gap = gapBefore(ordered, runEnd);
          const size = ordered[runEnd].fontSize;
          if (gap < size * 0.1 || gap > size * 3) break;
          runEnd++;
        }
      }
      const run = ordered.slice(cursor, runEnd);
      if (run.length >= 3) {
        const gaps = run.slice(1).map((_, k) => gapBefore(run, k + 1));
        const smallest = gaps.reduce((best, g2) => (g2 < best ? g2 : best), Infinity);
        const largest = gaps.reduce((best, g2) => (g2 > best ? g2 : best), -Infinity);
        if (smallest > 0 && largest / smallest <= 3) {
          const first = run[0];
          const last = run[run.length - 1];
          const item = {
            text: run.map((r) => r.text).join(""),
            x: first.x,
            y: first.y,
            w: last.x + last.w - first.x,
            h: first.h,
            fontSize: first.fontSize,
            fontName: first.fontName
          };
          originMap.set(item, run);
          merged.push(item);
          cursor = runEnd;
          continue;
        }
      }
      // Not part of a mergeable run: keep the item and move on by one.
      merged.push(ordered[cursor]);
      cursor++;
    }
  }
  return { merged, originMap };
}
|
|
3854
|
-
/**
 * Adds to `usedItems` the original items behind every merged item it contains.
 *
 * The originals are collected first and added afterwards, so the set is not
 * modified while it is being iterated.
 *
 * @param {Set<object>} usedItems - Set to extend in place.
 * @param {Map<object, Iterable<object>>} originMap - Merged item -> items it replaced.
 * @returns {void}
 */
function expandUsedItems(usedItems, originMap) {
  const additions = [];
  for (const member of usedItems) {
    const sources = originMap.get(member);
    if (!sources) continue;
    for (const source of sources) additions.push(source);
  }
  additions.forEach((source) => {
    usedItems.add(source);
  });
}
|
|
3862
|
-
/**
 * Finds the first row that looks like a table header.
 *
 * A row qualifies when it has between MIN_COLS and 6 items, no item text is
 * longer than 8 characters, at least one item contains Hangul, no item starts
 * with a bullet-like marker, the row spans at least 40% of the horizontal
 * extent of all items, at least one gap is 2.5x the row's mean font size or
 * more, and at least MIN_ROWS of the following rows match MIN_COLS or more of
 * its columns (counting stops after MIN_ROWS + 2 matches).
 *
 * @param {{items: object[]}[]} rows - Rows in reading order.
 * @returns {{columns: {x: number, count: number}[], headerIdx: number} | null}
 *   One column per header item (left to right) and the header's row index,
 *   or null when no row qualifies.
 */
function detectHeaderRow(rows) {
  const everyItem = rows.flatMap((row) => row.items);
  if (everyItem.length === 0) return null;

  let leftEdge = Infinity;
  let rightEdge = -Infinity;
  for (const item of everyItem) {
    if (item.x < leftEdge) leftEdge = item.x;
    const itemEnd = item.x + item.w;
    if (itemEnd > rightEdge) rightEdge = itemEnd;
  }
  const pageSpan = rightEdge - leftEdge;
  if (pageSpan <= 0) return null;

  const isTooLong = (item) => item.text.length > 8;
  const hasHangul = (item) => /[가-힣]/.test(item.text);
  const startsWithMarker = (item) => /^[□■○●·※▶▷◆◇\-]/.test(item.text);

  for (const [headerIdx, row] of rows.entries()) {
    const candidates = row.items;
    if (candidates.length < MIN_COLS || candidates.length > 6) continue;
    if (candidates.some((item) => isTooLong(item))) continue;
    if (!candidates.some((item) => hasHangul(item))) continue;
    if (candidates.some((item) => startsWithMarker(item))) continue;

    const ordered = [...candidates].sort((a, b) => a.x - b.x);
    const rightmost = ordered[ordered.length - 1];
    const rowSpan = rightmost.x + rightmost.w - ordered[0].x;
    if (rowSpan / pageSpan < 0.4) continue;

    let sizeTotal = 0;
    for (const item of ordered) sizeTotal += item.fontSize;
    const meanSize = sizeTotal / ordered.length;
    const hasWideGap = ordered.some(
      (item, k) => k > 0 && item.x - (ordered[k - 1].x + ordered[k - 1].w) >= meanSize * 2.5
    );
    if (!hasWideGap) continue;

    const columns = ordered.map((item) => ({ x: item.x, count: 0 }));
    let confirmations = 0;
    let probe = headerIdx + 1;
    while (probe < rows.length && confirmations < MIN_ROWS + 2) {
      if (countMatchedColumnsRange(rows[probe], columns, ordered) >= MIN_COLS) confirmations++;
      probe++;
    }
    if (confirmations < MIN_ROWS) continue;
    return { columns, headerIdx };
  }
  return null;
}
|
|
3903
|
-
/**
 * Folds continuation rows into the row above them.
 *
 * A row is folded into the previous output row when it is vertically closer
 * than 1.8x the average font size of all input items (12 when there are no
 * items), has at most two items, and either matches fewer than MIN_COLS
 * columns or consists of a single item. The folded row keeps the previous
 * row's `y`.
 *
 * @param {{y: number, items: object[]}[]} rows - Rows in reading order.
 * @param {object[]} columns - Column descriptors passed to countMatchedColumns.
 * @returns {{y: number, items: object[]}[]} The input array itself when it has
 *   at most one row, otherwise a new array.
 */
function mergeMultiLineRows(rows, columns) {
  if (rows.length <= 1) return rows;

  const fontSizes = rows.flatMap((row) => row.items).map((item) => item.fontSize);
  let avgFontSize = 12;
  if (fontSizes.length > 0) {
    avgFontSize = fontSizes.reduce((sum, size) => sum + size, 0) / fontSizes.length;
  }

  const output = [rows[0]];
  for (const curr of rows.slice(1)) {
    const lastIdx = output.length - 1;
    const prev = output[lastIdx];
    const verticalGap = Math.abs(prev.y - curr.y);
    const matchedCols = countMatchedColumns(curr, columns);
    const isContinuation =
      verticalGap < avgFontSize * 1.8 &&
      curr.items.length <= 2 &&
      (matchedCols < MIN_COLS || curr.items.length === 1);
    if (isContinuation) {
      output[lastIdx] = { y: prev.y, items: [...prev.items, ...curr.items] };
    } else {
      output.push(curr);
    }
  }
  return output;
}
|
|
3924
|
-
/**
 * Groups items into rows by their `y` coordinate.
 *
 * Items are sorted by descending `y` (then ascending `x`). An item joins the
 * current row when its `y` is within Y_TOL of the `y` of the row's first
 * item; otherwise it starts a new row.
 *
 * @param {object[]} items - Items with `x` and `y`.
 * @returns {{y: number, items: object[]}[]} Rows in sorted order; `y` is the
 *   `y` of the row's first item.
 */
function groupByBaseline(items) {
  if (items.length === 0) return [];
  const ordered = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
  const rows = [];
  let openRow = null;
  for (const item of ordered) {
    if (openRow !== null && Math.abs(item.y - openRow.y) <= Y_TOL) {
      openRow.items.push(item);
      continue;
    }
    openRow = { y: item.y, items: [item] };
    rows.push(openRow);
  }
  return rows;
}
|
|
3942
|
-
/**
 * Tells whether a row contains a horizontal gap wide enough to suggest
 * separate columns.
 *
 * The threshold is max(average font size * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE).
 * Rows with fewer than two items never qualify, and neither does a two-item
 * row whose right-hand item has more than 20 characters.
 *
 * @param {{items: object[]}} row - Row whose items carry `text`, `x`, `w`, `fontSize`.
 * @returns {boolean} True when some gap between x-sorted neighbours reaches the threshold.
 */
function hasSuspiciousGaps(row) {
  if (row.items.length < 2) return false;
  const ordered = [...row.items].sort((a, b) => a.x - b.x);
  if (ordered.length === 2 && ordered[1].text.length > 20) return false;

  let sizeTotal = 0;
  for (const item of ordered) sizeTotal += item.fontSize;
  const threshold = Math.max((sizeTotal / ordered.length) * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);

  return ordered.some((item, idx) => {
    if (idx === 0) return false;
    const previous = ordered[idx - 1];
    return item.x - (previous.x + previous.w) >= threshold;
  });
}
|
|
3954
|
-
/**
 * Derives column positions from the `x` coordinates of all items in `rows`.
 *
 * The sorted x values are split wherever two neighbours are more than
 * COL_CLUSTER_TOL apart. Each cluster is reported with its rounded mean x and
 * its size; clusters smaller than max(2, floor(rows.length * MIN_COL_FILL_RATIO))
 * are dropped.
 *
 * @param {{items: {x: number}[]}[]} rows - Rows to sample.
 * @returns {{x: number, count: number}[]} Surviving clusters, ascending by x.
 */
function extractColumnClusters(rows) {
  const xs = [];
  for (const row of rows) {
    for (const item of row.items) xs.push(item.x);
  }
  if (xs.length === 0) return [];
  xs.sort((a, b) => a - b);

  const clusters = [];
  let bucket = [xs[0]];
  const closeBucket = () => {
    const mean = bucket.reduce((sum, value) => sum + value, 0) / bucket.length;
    clusters.push({ x: Math.round(mean), count: bucket.length });
  };
  for (const value of xs.slice(1)) {
    if (value - bucket[bucket.length - 1] > COL_CLUSTER_TOL) {
      closeBucket();
      bucket = [value];
    } else {
      bucket.push(value);
    }
  }
  closeBucket();

  const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
  return clusters.filter((cluster) => cluster.count >= minCount).sort((a, b) => a.x - b.x);
}
|
|
3974
|
-
/**
 * Splits rows into table regions using header-derived column ranges.
 *
 * A row "fits" when countMatchedColumnsRange reports at least MIN_COLS
 * columns. Fitting rows extend the open region. A non-fitting row is still
 * absorbed when a region is open and the row either has at most two items or
 * directly follows a fitting row. Any other row closes the region. On close,
 * trailing non-fitting rows are removed and the region is kept only if at
 * least MIN_ROWS rows remain.
 *
 * @param {{items: object[]}[]} allRows - Rows in reading order.
 * @param {object[]} columns - Column descriptors, forwarded to countMatchedColumnsRange.
 * @param {object[]} headerItems - Header items sorted by x, forwarded likewise.
 * @returns {{rows: object[]}[]} Regions in document order.
 */
function findTableRegionsByHeader(allRows, columns, headerItems) {
  const regions = [];
  const fitsColumns = (row) => countMatchedColumnsRange(row, columns, headerItems) >= MIN_COLS;

  let pending = [];
  let misses = 0;
  // Trims trailing non-fitting rows, stores the region if it is big enough, and resets.
  const closePending = () => {
    while (pending.length > 0 && !fitsColumns(pending[pending.length - 1])) {
      pending.pop();
    }
    if (pending.length >= MIN_ROWS) {
      regions.push({ rows: pending });
    }
    pending = [];
    misses = 0;
  };

  for (const row of allRows) {
    if (fitsColumns(row)) {
      pending.push(row);
      misses = 0;
    } else if (pending.length > 0 && (row.items.length <= 2 || misses === 0)) {
      pending.push(row);
      misses++;
    } else {
      closePending();
    }
  }
  closePending();
  return regions;
}
|
|
4009
|
-
/**
 * Splits rows into table regions using clustered column positions.
 *
 * Rows matching at least MIN_COLS columns (countMatchedColumns) extend the
 * open region. A single-item row is absorbed only while a region is open.
 * Any other row closes the region, which is kept when it holds at least
 * MIN_ROWS rows. Trailing rows are not trimmed.
 *
 * @param {{items: object[]}[]} allRows - Rows in reading order.
 * @param {object[]} columns - Column descriptors for countMatchedColumns.
 * @returns {{rows: object[]}[]} Regions in document order.
 */
function findTableRegions(allRows, columns) {
  const regions = [];
  let pending = [];
  const closePending = () => {
    if (pending.length >= MIN_ROWS) {
      regions.push({ rows: pending });
    }
    pending = [];
  };

  for (const row of allRows) {
    const matchedCols = countMatchedColumns(row, columns);
    if (matchedCols >= MIN_COLS) {
      pending.push(row);
      continue;
    }
    if (row.items.length !== 1) {
      closePending();
      continue;
    }
    // Lone item: part of the table only if one is already open.
    if (pending.length > 0) {
      pending.push(row);
    }
  }
  if (pending.length >= MIN_ROWS) {
    regions.push({ rows: pending });
  }
  return regions;
}
|
|
4032
|
-
/**
 * Counts how many distinct columns are hit by the items of a row.
 *
 * An item hits the first column whose `x` lies within 2 * COL_CLUSTER_TOL of
 * the item's `x`; items hitting no column are ignored.
 *
 * @param {{items: {x: number}[]}} row - Row to test.
 * @param {{x: number}[]} columns - Column descriptors.
 * @returns {number} Number of distinct columns hit.
 */
function countMatchedColumns(row, columns) {
  const hits = /* @__PURE__ */ new Set();
  for (const item of row.items) {
    const columnIndex = columns.findIndex(
      (column) => Math.abs(item.x - column.x) <= COL_CLUSTER_TOL * 2
    );
    if (columnIndex !== -1) {
      hits.add(columnIndex);
    }
  }
  return hits.size;
}
|
|
4044
|
-
/**
 * Counts how many header-defined column bands are hit by the items of a row.
 *
 * Each header item owns a half-open band [left, right). Bands meet halfway
 * between the right edge of one header item and the left edge of the next;
 * the first band starts at 0 and the last one is unbounded to the right.
 * An item is placed by its `x` alone.
 *
 * @param {{items: {x: number}[]}} row - Row to test.
 * @param {object[]} columns - Not read by this function.
 * @param {{x: number, w: number}[]} headerItems - Header items sorted by x.
 * @returns {number} Number of distinct bands hit.
 */
function countMatchedColumnsRange(row, columns, headerItems) {
  const lastIndex = headerItems.length - 1;
  const bands = headerItems.map((header, idx) => {
    const previous = headerItems[idx - 1];
    const following = headerItems[idx + 1];
    return {
      left: idx === 0 ? 0 : (previous.x + previous.w + header.x) / 2,
      right: idx === lastIndex ? Infinity : (header.x + header.w + following.x) / 2
    };
  });

  const hits = /* @__PURE__ */ new Set();
  for (const item of row.items) {
    const bandIndex = bands.findIndex((band) => item.x >= band.left && item.x < band.right);
    if (bandIndex !== -1) {
      hits.add(bandIndex);
    }
  }
  return hits.size;
}
|
|
4062
|
-
/**
 * Splits the items of one row into groups and assigns each group to a column.
 *
 * Items are sorted by `x` and cut at the widest gaps: a gap qualifies when it
 * is at least 12, or at least max(2.5 * median gap, 12) when the row has more
 * than numCols + 1 items, and at most numCols - 1 cuts are kept. Each group's
 * centre (midpoint of its extent) is compared with every column's `x`;
 * group/column pairs are taken greedily by ascending distance so that each
 * group and each column is used once. Groups left over are appended afterwards
 * with their nearest column, which may already be in use.
 *
 * @param {object[]} items - Items with `x` and `w`.
 * @param {{x: number}[]} columns - Column descriptors.
 * @param {number} numCols - Number of columns to consider.
 * @returns {{col: number, items: object[]}[]} Assignments in the order they were made.
 */
function assignRowItems(items, columns, numCols) {
  if (items.length === 0) return [];
  const ordered = [...items].sort((a, b) => a.x - b.x);
  const columnXs = columns.map((column) => column.x);

  const gaps = ordered.slice(1).map((item, k) => ({
    idx: k + 1,
    size: item.x - (ordered[k].x + ordered[k].w)
  }));
  const ascendingSizes = gaps.map((g2) => g2.size).sort((a, b) => a - b);
  const medianGap = ascendingSizes.length > 0 ? ascendingSizes[Math.floor(ascendingSizes.length / 2)] : 0;
  let gapThreshold = 12;
  if (!(ordered.length <= numCols + 1)) {
    gapThreshold = Math.max(medianGap * 2.5, 12);
  }
  const cutPoints = gaps
    .filter((g2) => g2.size >= gapThreshold)
    .sort((a, b) => b.size - a.size)
    .slice(0, numCols - 1)
    .map((g2) => g2.idx)
    .sort((a, b) => a - b);

  const groups = [];
  let from = 0;
  for (const cut of cutPoints) {
    groups.push(ordered.slice(from, cut));
    from = cut;
  }
  groups.push(ordered.slice(from));

  const groupCenters = groups.map((group) => {
    const left = group.reduce((best, item) => (item.x < best ? item.x : best), Infinity);
    const right = group.reduce((best, item) => (item.x + item.w > best ? item.x + item.w : best), -Infinity);
    return (left + right) / 2;
  });

  const candidates = [];
  groupCenters.forEach((center, gi) => {
    for (let ci = 0; ci < numCols; ci++) {
      candidates.push({ gi, ci, dist: Math.abs(center - columnXs[ci]) });
    }
  });
  candidates.sort((a, b) => a.dist - b.dist);

  const result = [];
  const takenGroups = /* @__PURE__ */ new Set();
  const takenCols = /* @__PURE__ */ new Set();
  for (const { gi, ci } of candidates) {
    if (takenGroups.has(gi) || takenCols.has(ci)) continue;
    result.push({ col: ci, items: groups[gi] });
    takenGroups.add(gi);
    takenCols.add(ci);
  }

  // More groups than columns: place the rest on their nearest column.
  groups.forEach((group, gi) => {
    if (takenGroups.has(gi)) return;
    let nearestCol = 0;
    let nearestDist = Infinity;
    for (let ci = 0; ci < numCols; ci++) {
      const dist = Math.abs(groupCenters[gi] - columnXs[ci]);
      if (dist < nearestDist) {
        nearestDist = dist;
        nearestCol = ci;
      }
    }
    result.push({ col: nearestCol, items: group });
  });
  return result;
}
|
|
4120
|
-
/**
 * Builds a table from rows of text items and a set of columns.
 *
 * Steps, in order:
 *  1. Create a numRows x numCols grid of empty cells. A row holding a single
 *     item (with more than one column) puts that item in cell 0 with
 *     `colSpan` = numCols; other rows are distributed by assignRowItems.
 *  2. Give up (null) when more than half of the rows are empty or any column
 *     is empty in every row.
 *  3. Bottom-up: a row whose only text sits outside column 0 and does not
 *     start with a bullet-like marker is appended to the nearest non-empty
 *     row above it and then cleared.
 *  4. Top-down: when a row has text only in its first and last column and the
 *     next row has text but none in column 0, the next row's columns 1.. are
 *     appended to it and the next row is cleared.
 *  5. Drop empty rows; give up when fewer than MIN_ROWS remain.
 *
 * @param {{items: object[]}[]} rows - Rows of the candidate region.
 * @param {{x: number}[]} columns - Column descriptors.
 * @param {number} pageNum - Stored as `bbox.page`.
 * @returns {{table: object, bbox: object, usedItems: Set<object>} | null}
 *   The table, the bounding box of all input items, and the set of items
 *   placed into cells; null when the region is rejected.
 */
function buildClusterTable(rows, columns, pageNum) {
  const numCols = columns.length;
  const numRows = rows.length;
  if (numRows < MIN_ROWS || numCols < MIN_COLS) return null;

  const hasText = (cell) => cell.text.trim() !== "";
  const appendText = (base, extra) => (base ? base + " " + extra : extra);
  const clearRow = (row) => {
    for (let c = 0; c < numCols; c++) row[c].text = "";
  };

  const grid = [];
  for (let r = 0; r < numRows; r++) {
    const gridRow = [];
    for (let c = 0; c < numCols; c++) {
      gridRow.push({ text: "", colSpan: 1, rowSpan: 1 });
    }
    grid.push(gridRow);
  }

  const usedItems = /* @__PURE__ */ new Set();
  rows.forEach((row, r) => {
    if (row.items.length === 1 && numCols > 1) {
      const lone = row.items[0];
      grid[r][0] = { text: lone.text, colSpan: numCols, rowSpan: 1 };
      usedItems.add(lone);
      return;
    }
    for (const { col, items } of assignRowItems(row.items, columns, numCols)) {
      const joined = items.map((i) => i.text).join(" ");
      grid[r][col].text = appendText(grid[r][col].text, joined);
      for (const item of items) usedItems.add(item);
    }
  });

  const blankRowCount = grid.filter((row) => row.every((cell) => cell.text === "")).length;
  if (blankRowCount > numRows * 0.5) return null;
  for (let c = 0; c < numCols; c++) {
    if (!grid.some((row) => row[c].text !== "")) return null;
  }

  // Pass 1 (bottom-up): fold stray one-cell rows into the row above.
  for (let r = numRows - 1; r >= 1; r--) {
    const filled = grid[r].filter(hasText);
    if (filled.length !== 1) continue;
    if (hasText(grid[r][0])) continue;
    if (/^[○●▶\-·]/.test(filled[0].text.trim())) continue;
    let target = r - 1;
    while (target >= 0 && !grid[target].some(hasText)) target--;
    if (target < 0) continue;
    for (let c = 0; c < numCols; c++) {
      const addition = grid[r][c].text.trim();
      if (addition) {
        grid[target][c].text = appendText(grid[target][c].text.trim(), addition);
      }
    }
    clearRow(grid[r]);
  }

  // Pass 2 (top-down): pull the next row up when this row only has its edge columns.
  for (let r = 0; r < grid.length - 1; r++) {
    const row = grid[r];
    const onlyEdges =
      hasText(row[0]) &&
      numCols > 1 &&
      hasText(row[numCols - 1]) &&
      row.slice(1, numCols - 1).every((cell) => !hasText(cell));
    if (!onlyEdges) continue;
    const next = grid[r + 1];
    if (hasText(next[0]) || !next.some(hasText)) continue;
    for (let c = 1; c < numCols; c++) {
      const addition = next[c].text.trim();
      if (addition) {
        row[c].text = appendText(row[c].text.trim(), addition);
      }
    }
    clearRow(next);
  }

  const keptRows = grid.filter((row) => row.some(hasText));
  const finalRowCount = keptRows.length;
  if (finalRowCount < MIN_ROWS) return null;

  let minX = Infinity;
  let minY = Infinity;
  let maxX = -Infinity;
  let maxY = -Infinity;
  for (const item of rows.flatMap((row) => row.items)) {
    const height = item.h > 0 ? item.h : item.fontSize;
    if (item.x < minX) minX = item.x;
    if (item.y < minY) minY = item.y;
    if (item.x + item.w > maxX) maxX = item.x + item.w;
    if (item.y + height > maxY) maxY = item.y + height;
  }

  return {
    table: {
      rows: finalRowCount,
      cols: numCols,
      cells: keptRows,
      hasHeader: finalRowCount > 1
    },
    bbox: { page: pageNum, x: minX, y: minY, width: maxX - minX, height: maxY - minY },
    usedItems
  };
}
|
|
4211
|
-
|
|
4212
|
-
// src/pdf/polyfill.ts
|
|
4213
|
-
var pdfjsWorker = __toESM(require("pdfjs-dist/legacy/build/pdf.worker.mjs"), 1);
|
|
4214
|
-
// Install minimal stand-ins for browser globals that are missing in this
// runtime, and expose the pdf.js worker module on the global object.
var g = globalThis;
if (typeof g.DOMMatrix === "undefined") {
  // Stub only: stores the six matrix values in `m`, no matrix operations.
  g.DOMMatrix = class DOMMatrix {
    constructor(init) {
      this.m = init || [1, 0, 0, 1, 0, 0];
    }
  };
}
if (typeof g.Path2D === "undefined") {
  // Empty placeholder class.
  g.Path2D = class Path2D {
  };
}
g.pdfjsWorker = pdfjsWorker;
|
|
4228
|
-
|
|
4229
|
-
// src/pdf/parser.ts
|
|
4230
|
-
var import_pdf2 = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
4231
|
-
import_pdf2.GlobalWorkerOptions.workerSrc = "";
|
|
4232
|
-
// Upper bound on the number of pages parsePdfDocument will visit.
var MAX_PAGES = 5000;
// Budget for extracted text, compared against 2 * character count (100 MiB).
var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
// Time allowed for pdf.js to load a document, in milliseconds (30 seconds).
var PDF_LOAD_TIMEOUT_MS = 30000;
|
|
4235
|
-
/**
 * Loads a PDF with pdf.js, giving up after PDF_LOAD_TIMEOUT_MS.
 *
 * On timeout the loading task is destroyed and the returned promise rejects
 * with a KordocError. The timer is always cleared once loading settles.
 *
 * @param {ArrayBuffer | ArrayLike<number>} buffer - Raw PDF bytes; copied into a Uint8Array.
 * @returns {Promise<object>} The loaded pdf.js document.
 * @throws {KordocError} When loading does not finish in time.
 * @throws Whatever the pdf.js loading task rejects with.
 */
async function loadPdfWithTimeout(buffer) {
  const loadingTask = (0, import_pdf2.getDocument)({
    data: new Uint8Array(buffer),
    useSystemFonts: true,
    disableFontFace: true,
    isEvalSupported: false
  });
  // Best-effort teardown: destroy() is asynchronous in pdf.js, so a failure
  // must be absorbed here or it surfaces as an unhandled rejection (or, if it
  // throws synchronously, prevents the timeout error from being reported).
  const abandonLoadingTask = () => {
    try {
      Promise.resolve(loadingTask.destroy()).catch(() => {
      });
    } catch {
      // Teardown failure is irrelevant: the caller is told about the timeout.
    }
  };
  let timer;
  try {
    return await Promise.race([
      loadingTask.promise,
      new Promise((_, reject) => {
        timer = setTimeout(() => {
          abandonLoadingTask();
          reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
        }, PDF_LOAD_TIMEOUT_MS);
      })
    ]);
  } finally {
    if (timer !== void 0) clearTimeout(timer);
  }
}
|
|
4257
|
-
/**
 * Parses a PDF into blocks and Markdown.
 *
 * Pages are visited in order (at most MAX_PAGES, optionally restricted by
 * `options.pages`). For each page the text items are normalized, hidden text
 * is filtered out, and blocks are extracted together with the page's operator
 * list. A page that fails with anything other than a KordocError is recorded
 * as a PARTIAL_PARSE warning and skipped.
 *
 * When fewer than 10 non-whitespace characters per parsed page were found,
 * the document is treated as image based: with `options.ocr` set, OCR is
 * attempted and its result returned; otherwise, or when OCR yields nothing or
 * fails, a KordocError flagged `isImageBased` is thrown. If OCR itself failed,
 * that failure is attached to the error as `cause`.
 *
 * The pdf.js document is always destroyed before returning or throwing.
 *
 * @param {ArrayBuffer | ArrayLike<number>} buffer - Raw PDF bytes.
 * @param {object} [options] - Reads `pages`, `ocr`, `removeHeaderFooter`
 *   (header/footer removal is skipped only when strictly `false`), and the
 *   `onProgress(parsedPages, totalTarget)` callback.
 * @returns {Promise<object>} `{ markdown, blocks, metadata, outline, warnings }`;
 *   `outline` and `warnings` are undefined when empty. The OCR path returns
 *   `{ markdown, blocks, metadata, warnings, isImageBased: true }` instead.
 * @throws {KordocError} For an empty document, for exceeding MAX_TOTAL_TEXT,
 *   and for image-based documents without usable OCR output.
 */
async function parsePdfDocument(buffer, options) {
  const doc = await loadPdfWithTimeout(buffer);
  try {
    const pageCount = doc.numPages;
    if (pageCount === 0) throw new KordocError("PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
    const metadata = { pageCount };
    await extractPdfMetadata(doc, metadata);
    const blocks = [];
    const warnings = [];
    let totalChars = 0;
    let totalTextBytes = 0;
    const effectivePageCount = Math.min(pageCount, MAX_PAGES);
    const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
    const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
    // Font size -> occurrence count over all visible items, for the median below.
    const fontSizeFreq = /* @__PURE__ */ new Map();
    const pageHeights = /* @__PURE__ */ new Map();
    let parsedPages = 0;
    for (let i = 1; i <= effectivePageCount; i++) {
      if (pageFilter && !pageFilter.has(i)) continue;
      try {
        const page = await doc.getPage(i);
        const tc = await page.getTextContent();
        const viewport = page.getViewport({ scale: 1 });
        pageHeights.set(i, viewport.height);
        const rawItems = tc.items;
        const items = normalizeItems(rawItems);
        const { visible, hiddenCount } = filterHiddenText(items, viewport.width, viewport.height);
        if (hiddenCount > 0) {
          warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
        }
        for (const item of visible) {
          if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
        }
        const opList = await page.getOperatorList();
        const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
        for (const b of pageBlocks) blocks.push(b);
        for (const b of pageBlocks) {
          const t = b.text || "";
          totalChars += t.replace(/\s/g, "").length;
          // Two bytes per UTF-16 code unit.
          totalTextBytes += t.length * 2;
        }
        if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
        parsedPages++;
        options?.onProgress?.(parsedPages, totalTarget);
      } catch (pageErr) {
        // KordocErrors abort the whole parse; anything else only costs this page.
        if (pageErr instanceof KordocError) throw pageErr;
        warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
      }
    }
    const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
    if (totalChars / Math.max(parsedPageCount, 1) < 10) {
      let ocrFailure;
      if (options?.ocr) {
        try {
          const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
          const ocrBlocks = await ocrPages2(doc, options.ocr, pageFilter, effectivePageCount);
          if (ocrBlocks.length > 0) {
            const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
            return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true };
          }
        } catch (ocrErr) {
          // OCR is best effort: fall through to the image-based error below,
          // but keep the reason so callers can see why OCR did not help.
          ocrFailure = ocrErr;
        }
      }
      const imageBasedError = Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
      if (ocrFailure !== void 0) imageBasedError.cause = ocrFailure;
      throw imageBasedError;
    }
    if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
      const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
      // Splice from the back so earlier indices stay valid.
      for (let ri = removed.length - 1; ri >= 0; ri--) {
        blocks.splice(removed[ri], 1);
      }
    }
    const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
    if (medianFontSize > 0) {
      detectHeadings(blocks, medianFontSize);
    }
    detectMarkerHeadings(blocks);
    const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
    const markdown = cleanPdfText(blocksToMarkdown(blocks));
    return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
  } finally {
    await doc.destroy().catch(() => {
    });
  }
}
|
|
4340
|
-
/**
 * Copies selected entries of the PDF information dictionary into `metadata`.
 *
 * Title, Author, Creator and Subject are stored trimmed (Subject is stored as
 * `description`), and only when they are non-blank strings. Keywords are split
 * on `,` or `;` into a list of non-empty trimmed entries. CreationDate and
 * ModDate are run through `parsePdfDate` whenever they are strings.
 *
 * This is best-effort: any error raised while reading the metadata is
 * deliberately ignored and `metadata` keeps whatever was written before it.
 *
 * @param {object} doc - Object exposing an async `getMetadata()` method.
 * @param {object} metadata - Target object, mutated in place.
 * @returns {Promise<void>}
 */
async function extractPdfMetadata(doc, metadata) {
  // [PDF info key, metadata property] pairs that are copied as trimmed text.
  const textFields = [
    ["Title", "title"],
    ["Author", "author"],
    ["Creator", "creator"],
    ["Subject", "description"]
  ];
  // [PDF info key, metadata property] pairs that hold PDF date strings.
  const dateFields = [
    ["CreationDate", "createdAt"],
    ["ModDate", "modifiedAt"]
  ];
  try {
    const info = (await doc.getMetadata())?.info;
    if (!info) return;

    for (const [pdfKey, target] of textFields) {
      const raw = info[pdfKey];
      if (typeof raw !== "string") continue;
      const cleaned = raw.trim();
      if (cleaned) metadata[target] = cleaned;
    }

    const rawKeywords = info.Keywords;
    if (typeof rawKeywords === "string" && rawKeywords.trim()) {
      metadata.keywords = rawKeywords
        .split(/[,;]/)
        .map((entry) => entry.trim())
        .filter(Boolean);
    }

    for (const [pdfKey, target] of dateFields) {
      const raw = info[pdfKey];
      if (typeof raw === "string") metadata[target] = parsePdfDate(raw);
    }
  } catch {
    // Metadata is optional; failures here are intentionally not reported.
  }
}
|
|
4357
|
-
/**
 * Converts a PDF date string (`D:YYYYMMDDHHmmSS...`) into
 * `YYYY-MM-DDTHH:mm:SS`.
 *
 * Only the year is mandatory; a missing month or day defaults to "01" and a
 * missing hour, minute or second defaults to "00". Anything after the seconds
 * (for example a time-zone suffix) is not carried into the result.
 *
 * @param {string} dateStr - Raw PDF date string.
 * @returns {string|undefined} The formatted timestamp, or `undefined` when the
 *   string contains no `D:` followed by four digits.
 */
function parsePdfDate(dateStr) {
  const parts = dateStr.match(/D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?/);
  if (parts === null) return void 0;

  // Defaults for month, day, hour, minute, second (in that order).
  const fallbacks = ["01", "01", "00", "00", "00"];
  const year = parts[1];
  const [month, day, hour, min, sec] = fallbacks.map(
    (fallback, idx) => (parts[idx + 2] === void 0 ? fallback : parts[idx + 2])
  );
  return `${year}-${month}-${day}T${hour}:${min}:${sec}`;
}
|
|
4363
|
-
/**
 * Separates visible text items from hidden ones.
 *
 * An item counts as hidden when its `isHidden` flag is truthy, or when its
 * `x`/`y` position lies outside the page by more than 10% of the larger page
 * dimension.
 *
 * @param {Iterable<object>} items - Text items with `x`, `y` and optional `isHidden`.
 * @param {number} pageWidth - Page width.
 * @param {number} pageHeight - Page height.
 * @returns {{visible: object[], hiddenCount: number}} Visible items in input
 *   order plus the number of items dropped.
 */
function filterHiddenText(items, pageWidth, pageHeight) {
  // Tolerance band around the page; identical for every item.
  const margin = Math.max(pageWidth, pageHeight) * 0.1;
  const isOffPage = ({ x, y }) =>
    x < -margin || x > pageWidth + margin || y < -margin || y > pageHeight + margin;

  const visible = [];
  let hiddenCount = 0;
  for (const item of items) {
    if (item.isHidden || isOffPage(item)) {
      hiddenCount += 1;
    } else {
      visible.push(item);
    }
  }
  return { visible, hiddenCount };
}
|
|
4380
|
-
/**
 * Computes the median font size from a size -> occurrence-count map.
 *
 * Sizes are walked in ascending order while accumulating their counts; the
 * first size whose running count exceeds half of the total (rounded down) is
 * returned.
 *
 * @param {Map<number, number>} freq - Font size mapped to how often it occurs.
 * @returns {number} The median size, or 0 for an empty map.
 */
function computeMedianFontSizeFromFreq(freq) {
  if (freq.size === 0) return 0;

  const ascending = [...freq].sort(([sizeA], [sizeB]) => sizeA - sizeB);
  const total = ascending.reduce((sum, [, count]) => sum + count, 0);
  const half = Math.floor(total / 2);

  let seen = 0;
  for (const [size, count] of ascending) {
    seen += count;
    if (seen > half) return size;
  }
  // Fallback when no running count exceeds the midpoint: use the largest size.
  return ascending[ascending.length - 1][0];
}
|
|
4393
|
-
/**
 * Promotes paragraph blocks to headings based on their font size relative to
 * the document's median font size.
 *
 * A block is considered only when it is a paragraph with text and a
 * `style.fontSize`, its trimmed text is 1-200 characters long and it is not
 * purely digits. The size ratio is compared against HEADING_RATIO_H1/H2/H3 (in
 * that order) to pick level 1, 2 or 3. Promoted blocks are mutated in place
 * and their text is passed through `collapseEvenSpacing`.
 *
 * @param {object[]} blocks - Blocks to inspect and mutate.
 * @param {number} medianFontSize - Reference font size.
 * @returns {void}
 */
function detectHeadings(blocks, medianFontSize) {
  const levelForRatio = (ratio) => {
    if (ratio >= HEADING_RATIO_H1) return 1;
    if (ratio >= HEADING_RATIO_H2) return 2;
    if (ratio >= HEADING_RATIO_H3) return 3;
    return 0;
  };

  for (const block of blocks) {
    const isCandidate = block.type === "paragraph" && block.text && block.style?.fontSize;
    if (!isCandidate) continue;

    const text = block.text.trim();
    if (text.length === 0 || text.length > 200) continue;
    // Text made only of digits is never treated as a heading.
    if (/^\d+$/.test(text)) continue;

    const level = levelForRatio(block.style.fontSize / medianFontSize);
    if (level === 0) continue;

    block.type = "heading";
    block.level = level;
    block.text = collapseEvenSpacing(text);
  }
}
|
|
4411
|
-
/**
 * Removes the spaces from text whose characters were spread out with single
 * spaces (e.g. "가 나 다" -> "가나다").
 *
 * When the text has at least three space-separated tokens and 70% or more of
 * them are a single character, every space is removed. Otherwise only runs of
 * three or more spaced characters that start with a Hangul syllable (and are
 * not adjacent to another Hangul syllable) are collapsed.
 *
 * @param {string} text - Input text.
 * @returns {string} Text with spaced-out runs joined.
 */
function collapseEvenSpacing(text) {
  const tokens = text.split(" ");
  let singleCharTokens = 0;
  for (const token of tokens) {
    if (token.length === 1) singleCharTokens += 1;
  }

  const mostlySingleChars = tokens.length >= 3 && singleCharTokens / tokens.length >= 0.7;
  if (mostlySingleChars) return tokens.join("");

  const spacedRun = /(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g;
  return text.replace(spacedRun, (run) => run.split(" ").join(""));
}
|
|
4422
|
-
/**
 * Decides whether a detected table should be rendered as plain text instead.
 *
 * Checks, in order:
 *  1. Tables of at most 3x3 are demoted when 30% or more of their cells are
 *     empty, or their text contains a bullet-like marker or something shaped
 *     like an HTML tag.
 *  2. Tables with more than 200 characters of text are kept.
 *  3. Tables of at most 3 rows containing a bullet-like marker are demoted.
 *  4. Tables of at most 2 rows that are more than half empty are demoted.
 *  5. Single-row tables without a run of two or more digits are demoted.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {boolean} `true` when the table should become text.
 */
function shouldDemoteTable(table) {
  const { rows, cols } = table;
  const filledTexts = table.cells
    .flatMap((row) => row.map((cell) => cell.text.trim()))
    .filter(Boolean);
  const joined = filledTexts.join(" ");
  const totalCells = rows * cols;
  const emptyCells = totalCells - filledTexts.length;

  if (rows <= 3 && cols <= 3) {
    const looksLikeTextBox =
      emptyCells >= totalCells * 0.3 ||
      /[□■◆○●▶ㅇ]/.test(joined) ||
      /<[^>]+>/.test(joined);
    if (looksLikeTextBox) return true;
  }

  if (joined.length > 200) return false;
  if (rows <= 3 && /[□■◆○●▶]/.test(joined)) return true;
  if (rows <= 2 && emptyCells > totalCells * 0.5) return true;
  return rows === 1 && !/\d{2,}/.test(joined);
}
|
|
4440
|
-
/**
 * Renders a table as plain text, one line per non-empty row.
 *
 * Empty cells are dropped. In a two-column table, a row with both cells filled
 * becomes `left : right`; every other row is its filled cells joined by a
 * single space. Rows without any text are omitted.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {string} The rows joined with newlines.
 */
function demoteTableToText(table) {
  const renderRow = (row) => {
    const filled = row.map((cell) => cell.text.trim()).filter(Boolean);
    if (filled.length === 0) return null;
    const isKeyValue = table.cols === 2 && filled.length === 2;
    return isKeyValue ? `${filled[0]} : ${filled[1]}` : filled.join(" ");
  };

  return Array.from({ length: table.rows }, (_, r) => renderRow(table.cells[r]))
    .filter((line) => line !== null)
    .join("\n");
}
|
|
4453
|
-
/**
 * Promotes paragraphs to headings using textual cues rather than font size.
 *
 * - A paragraph shorter than 50 characters that starts with one of the markers
 *   □ ■ ◆ ◇ ▶ followed (after optional whitespace) by a Hangul syllable becomes
 *   a level-4 heading.
 * - A paragraph made of 2-6 Hangul syllables that has a `style.fontSize`
 *   becomes a level-3 heading when the block before it is missing, a table, a
 *   heading or a separator, or when the block after it is missing, a table, a
 *   heading or a paragraph starting with □ ■ ◆ ○ ●.
 *
 * Blocks are mutated in place; promotions made earlier in the pass are visible
 * to the neighbour checks of later blocks.
 *
 * @param {object[]} blocks - Blocks in reading order.
 * @returns {void}
 */
function detectMarkerHeadings(blocks) {
  const MARKER_LEAD = /^[□■◆◇▶]\s*[가-힣]/;
  const SHORT_HANGUL = /^[가-힣]{2,6}$/;
  const BULLET_LEAD = /^[□■◆○●]/;
  const promote = (block, level) => {
    block.type = "heading";
    block.level = level;
  };

  for (const [idx, block] of blocks.entries()) {
    if (block.type !== "paragraph" || !block.text) continue;
    const text = block.text.trim();

    if (text.length < 50 && MARKER_LEAD.test(text)) {
      promote(block, 4);
      continue;
    }
    if (!SHORT_HANGUL.test(text) || !block.style?.fontSize) continue;

    const before = blocks[idx - 1];
    const after = blocks[idx + 1];
    const anchoredAbove =
      !before || before.type === "table" || before.type === "heading" || before.type === "separator";
    const anchoredBelow =
      !after ||
      after.type === "table" ||
      after.type === "heading" ||
      (after.type === "paragraph" && after.text && BULLET_LEAD.test(after.text.trim()));

    if (anchoredAbove || anchoredBelow) promote(block, 3);
  }
}
|
|
4475
|
-
// Upper bound on the recursion depth of xyCutOrder.
var MAX_XYCUT_DEPTH = 50;

/**
 * Recursively partitions items into groups by cutting along whitespace gaps.
 *
 * A horizontal cut (from `findYSplit`) is tried first: items with `y` above
 * the split come before those at or below it. If no usable horizontal cut
 * exists, a vertical cut (from `findXSplit`) is tried, assigning each item by
 * its horizontal centre, left group first. A cut is used only when both sides
 * are non-empty. Recursion stops for two or fewer items, at MAX_XYCUT_DEPTH,
 * or when no cut applies; the items are then returned as one group.
 *
 * @param {object[]} items - Items with `x`, `y`, `w`, `h`.
 * @param {number} gapThreshold - Minimum gap handed to the split finders.
 * @param {number} [depth=0] - Current recursion depth.
 * @returns {object[][]} Ordered list of item groups.
 */
function xyCutOrder(items, gapThreshold, depth = 0) {
  if (items.length === 0) return [];
  if (items.length <= 2 || depth >= MAX_XYCUT_DEPTH) return [items];

  const region = computeRegion(items);
  const isUsableCut = (first, second) =>
    first.length > 0 && second.length > 0 && first.length < items.length;
  const descend = (first, second) => [
    ...xyCutOrder(first, gapThreshold, depth + 1),
    ...xyCutOrder(second, gapThreshold, depth + 1)
  ];

  const ySplit = findYSplit(items, region, gapThreshold);
  if (ySplit !== null) {
    const upper = items.filter((item) => item.y > ySplit);
    const lower = items.filter((item) => item.y <= ySplit);
    if (isUsableCut(upper, lower)) return descend(upper, lower);
  }

  const xSplit = findXSplit(items, region, gapThreshold);
  if (xSplit !== null) {
    const left = items.filter((item) => item.x + item.w / 2 < xSplit);
    const right = items.filter((item) => item.x + item.w / 2 >= xSplit);
    if (isUsableCut(left, right)) return descend(left, right);
  }

  return [items];
}
|
|
4498
|
-
/**
 * Computes the extents of a set of items.
 *
 * The minimums come from each item's `x`/`y`, the maximums from `x + w` and
 * `y + h`. For an empty list the minimums stay `Infinity` and the maximums
 * `-Infinity`.
 *
 * @param {object[]} items - Items with `x`, `y`, `w`, `h`.
 * @returns {{items: object[], minX: number, minY: number, maxX: number, maxY: number}}
 */
function computeRegion(items) {
  const extent = items.reduce(
    (acc, item) => {
      const right = item.x + item.w;
      const top = item.y + item.h;
      if (item.x < acc.minX) acc.minX = item.x;
      if (item.y < acc.minY) acc.minY = item.y;
      if (right > acc.maxX) acc.maxX = right;
      if (top > acc.maxY) acc.maxY = top;
      return acc;
    },
    { minX: Infinity, minY: Infinity, maxX: -Infinity, maxY: -Infinity }
  );
  return { items, minX: extent.minX, minY: extent.minY, maxX: extent.maxX, maxY: extent.maxY };
}
|
|
4508
|
-
/**
 * Finds the largest vertical gap between consecutive items (ordered by
 * descending `y`) that is strictly wider than `gapThreshold`.
 *
 * The gap between two neighbours is `(prev.y - prev.h) - curr.y`; the returned
 * split is the midpoint of that gap. On equal gaps the first one encountered
 * wins.
 *
 * @param {object[]} items - Items with `y` and `h`; not mutated.
 * @param {*} _region - Unused.
 * @param {number} gapThreshold - Gap a candidate must exceed.
 * @returns {number|null} The split coordinate, or `null` when no gap qualifies.
 */
function findYSplit(items, _region, gapThreshold) {
  const topDown = [...items].sort((a, b) => b.y - a.y);
  let widest = gapThreshold;
  let split = null;

  topDown.forEach((curr, idx) => {
    if (idx === 0) return;
    const above = topDown[idx - 1];
    const lowerEdge = above.y - above.h;
    const gap = lowerEdge - curr.y;
    if (gap > widest) {
      widest = gap;
      split = (lowerEdge + curr.y) / 2;
    }
  });
  return split;
}
|
|
4523
|
-
/**
 * Finds the largest horizontal gap between consecutive items (ordered by
 * ascending `x`) that is strictly wider than `gapThreshold`.
 *
 * The gap between two neighbours is `curr.x - (prev.x + prev.w)`; the returned
 * split is the midpoint of that gap. On equal gaps the first one encountered
 * wins.
 *
 * @param {object[]} items - Items with `x` and `w`; not mutated.
 * @param {*} _region - Unused.
 * @param {number} gapThreshold - Gap a candidate must exceed.
 * @returns {number|null} The split coordinate, or `null` when no gap qualifies.
 */
function findXSplit(items, _region, gapThreshold) {
  const leftToRight = [...items].sort((a, b) => a.x - b.x);
  let widest = gapThreshold;
  let split = null;

  leftToRight.forEach((curr, idx) => {
    if (idx === 0) return;
    const before = leftToRight[idx - 1];
    const rightEdge = before.x + before.w;
    const gap = curr.x - rightEdge;
    if (gap > widest) {
      widest = gap;
      split = (rightEdge + curr.x) / 2;
    }
  });
  return split;
}
|
|
4538
|
-
/**
 * Extracts the blocks of one page, using drawn lines to find tables.
 *
 * Lines are taken from the operator list, passed through
 * `filterPageBorderLines` and `preprocessLines`, and turned into table grids.
 * When at least one grid is found the page goes through
 * `extractBlocksWithGrids`; otherwise through `extractPageBlocksFallback`.
 *
 * @param {object[]} items - Visible text items of the page.
 * @param {number} pageNum - Page number recorded on the blocks.
 * @param {{fnArray: Array, argsArray: Array}} opList - Page operator list.
 * @param {number} pageWidth - Page width.
 * @param {number} pageHeight - Page height.
 * @returns {object[]} Blocks for the page; empty when there are no items.
 */
function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeight) {
  if (items.length === 0) return [];

  const drawn = extractLines(opList.fnArray, opList.argsArray);
  const inner = filterPageBorderLines(drawn.horizontals, drawn.verticals, pageWidth, pageHeight);
  const { horizontals, verticals } = preprocessLines(inner.horizontals, inner.verticals);

  const grids = buildTableGrids(horizontals, verticals);
  return grids.length > 0
    ? extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals)
    : extractPageBlocksFallback(items, pageNum);
}
|
|
4549
|
-
/**
 * Builds the block list for a page on which line-based table grids were found.
 *
 * Grids are processed from the highest `bbox.y2` downwards. Grids with exactly
 * one row and two or more columns are skipped. For each remaining grid, the
 * not-yet-claimed items lying inside its bounding box (3-unit padding) are
 * claimed, mapped to cells and cleaned up (page-number-like lines such as
 * "- 3 -" are removed and evenly spaced text is collapsed). A grid whose cells
 * hold no text is dropped. Otherwise the result is pushed either as a table
 * block or, when `shouldDemoteTable` says so, as a paragraph block.
 *
 * Items not claimed by any grid are then scanned for cluster tables, and what
 * is left is ordered with `xyCutOrder`, turned into text blocks through
 * `extractPageBlocksFallback` and post-processed by `detectListBlocks`. In
 * that case all blocks are sorted by the top edge of their bbox (descending).
 * The final list is always passed through `mergeAdjacentTableBlocks`.
 *
 * @param {object[]} items - Visible text items of the page.
 * @param {number} pageNum - Page number recorded on the blocks.
 * @param {object[]} grids - Table grids with `bbox`, `rowYs`, `colXs`.
 * @param {object[]} horizontals - Horizontal lines, forwarded to `extractCells`.
 * @param {object[]} verticals - Vertical lines, forwarded to `extractCells`.
 * @returns {object[]} Blocks of the page.
 */
function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
  const PAD = 3;
  const blocks = [];
  const claimed = /* @__PURE__ */ new Set();

  // Positional copy of an item, as handed to the cell-mapping/cluster helpers.
  const snapshot = (src) => ({
    text: src.text,
    x: src.x,
    y: src.y,
    w: src.w,
    h: src.h,
    fontSize: src.fontSize,
    fontName: src.fontName
  });
  const topEdge = (block) => (block.bbox ? block.bbox.y + block.bbox.height : 0);
  const liesInside = (item, bbox, gridWidth) => {
    if (item.y < bbox.y1 - PAD || item.y > bbox.y2 + PAD) return false;
    if (item.x < bbox.x1 - PAD || item.x + item.w > bbox.x2 + PAD) return false;
    // Narrow grids additionally reject items reaching their right border.
    if (gridWidth < 120 && item.x + item.w > bbox.x2 - 2) return false;
    return true;
  };

  const gridsTopDown = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
  for (const grid of gridsTopDown) {
    const bbox = grid.bbox;
    const rowCount = grid.rowYs.length - 1;
    const colCount = grid.colXs.length - 1;
    if (rowCount === 1 && colCount >= 2) continue;

    const gridWidth = bbox.x2 - bbox.x1;
    const tableItems = [];
    for (const item of items) {
      if (claimed.has(item) || !liesInside(item, bbox, gridWidth)) continue;
      tableItems.push(item);
      claimed.add(item);
    }

    const cells = extractCells(grid, horizontals, verticals);
    if (cells.length === 0) continue;

    const cellTextMap = mapTextToCells(tableItems.map(snapshot), cells);
    const irGrid = Array.from(
      { length: rowCount },
      () => Array.from({ length: colCount }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
    );
    for (const cell of cells) {
      const cleaned = cellTextToString(cellTextMap.get(cell) || [])
        .replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "")
        .trim()
        .split("\n")
        .map((line) => collapseEvenSpacing(line))
        .join("\n");
      irGrid[cell.row][cell.col] = { text: cleaned, colSpan: cell.colSpan, rowSpan: cell.rowSpan };
    }

    const isBlank = irGrid.every((row) => row.every((cell) => cell.text.trim() === ""));
    if (isBlank) continue;

    const irTable = { rows: rowCount, cols: colCount, cells: irGrid, hasHeader: rowCount > 1 };
    const tableBbox = {
      page: pageNum,
      x: bbox.x1,
      y: bbox.y1,
      width: bbox.x2 - bbox.x1,
      height: bbox.y2 - bbox.y1
    };

    if (!shouldDemoteTable(irTable)) {
      blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
      continue;
    }
    const demoted = demoteTableToText(irTable);
    if (demoted) {
      const text = rowCount === 1 ? `\n${demoted}\n` : demoted;
      blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
    }
  }

  let remaining = items.filter((item) => !claimed.has(item));
  if (remaining.length === 0) return mergeAdjacentTableBlocks(blocks);

  remaining.sort((a, b) => b.y - a.y || a.x - b.x);
  const clusterItems = remaining.map(snapshot);
  const clusterResults = detectClusterTables(clusterItems, pageNum);
  if (clusterResults.length > 0) {
    // Cluster results refer to the copies; translate them back to positions
    // in `remaining`.
    const positionOfCopy = /* @__PURE__ */ new Map();
    clusterItems.forEach((copy, idx) => positionOfCopy.set(copy, idx));
    const consumed = /* @__PURE__ */ new Set();
    for (const cluster of clusterResults) {
      for (const used of cluster.usedItems) {
        const idx = positionOfCopy.get(used);
        if (idx !== void 0) consumed.add(idx);
      }
      blocks.push({ type: "table", table: cluster.table, pageNumber: pageNum, bbox: cluster.bbox });
    }
    remaining = remaining.filter((_, idx) => !consumed.has(idx));
  }

  if (remaining.length > 0) {
    const ys = remaining.map((item) => item.y);
    const verticalSpan = safeMax(ys) - safeMin(ys);
    const groups = xyCutOrder(remaining, Math.max(15, verticalSpan * 0.03));
    const textBlocks = [];
    for (const group of groups) {
      if (group.length === 0) continue;
      for (const block of extractPageBlocksFallback(group, pageNum)) textBlocks.push(block);
    }
    for (const block of detectListBlocks(textBlocks)) blocks.push(block);
  }

  blocks.sort((a, b) => topEdge(b) - topEdge(a));
  return mergeAdjacentTableBlocks(blocks);
}
|
|
4670
|
-
/**
 * Merges consecutive table blocks that have the same number of columns.
 *
 * The merged block keeps every property of the first block (including its
 * bbox); its table gets the summed row count, the concatenated cell rows and
 * the first table's `hasHeader`. Input blocks are not mutated.
 *
 * @param {object[]} blocks - Blocks in output order.
 * @returns {object[]} A new list with adjacent tables merged; the input array
 *   itself when it has at most one element.
 */
function mergeAdjacentTableBlocks(blocks) {
  if (blocks.length <= 1) return blocks;

  const canMerge = (a, b) =>
    a.type === "table" && b.type === "table" && a.table && b.table && a.table.cols === b.table.cols;

  return blocks.slice(1).reduce((acc, curr) => {
    const tail = acc[acc.length - 1];
    if (!canMerge(tail, curr)) {
      acc.push(curr);
      return acc;
    }
    acc[acc.length - 1] = {
      ...tail,
      table: {
        rows: tail.table.rows + curr.table.rows,
        cols: tail.table.cols,
        cells: [...tail.table.cells, ...curr.table.cells],
        hasHeader: tail.table.hasHeader
      }
    };
    return acc;
  }, [blocks[0]]);
}
|
|
4690
|
-
/**
 * Builds blocks for a set of items without relying on drawn table lines.
 *
 * 1. If `detectClusterTables` finds tables, each becomes a table block; the
 *    items it did not use are grouped into lines and emitted as paragraphs,
 *    and all blocks are sorted by the top edge of their bbox (descending).
 * 2. Otherwise, if `detectColumns` finds three or more columns over all lines,
 *    the whole item set is emitted as one paragraph produced by
 *    `extractWithColumns`.
 * 3. Otherwise the items are split with `xyCutOrder`; each group is emitted
 *    either as a column-layout paragraph (three or more columns detected in
 *    the group) or as one paragraph per line.
 *
 * Lines whose merged text is blank are skipped. The result is passed through
 * `detectSpecialKoreanTables`.
 *
 * @param {object[]} items - Text items.
 * @param {number} pageNum - Page number recorded on the blocks.
 * @returns {object[]} Blocks; empty when there are no items.
 */
function extractPageBlocksFallback(items, pageNum) {
  if (items.length === 0) return [];
  const blocks = [];

  const topEdge = (block) => (block.bbox ? block.bbox.y + block.bbox.height : 0);
  const pushParagraph = (text, source) => {
    const bbox = computeBBox(source, pageNum);
    blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(source) });
  };
  const pushLineParagraphs = (yLines) => {
    for (const line of yLines) {
      const text = mergeLineSimple(line);
      if (!text.trim()) continue;
      pushParagraph(text, line);
    }
  };
  const hasColumnLayout = (columns) => Boolean(columns && columns.length >= 3);

  const clusterItems = items.map((src) => ({
    text: src.text,
    x: src.x,
    y: src.y,
    w: src.w,
    h: src.h,
    fontSize: src.fontSize,
    fontName: src.fontName
  }));
  const clusterResults = detectClusterTables(clusterItems, pageNum);

  if (clusterResults.length > 0) {
    // Cluster results refer to the copies; translate them back to positions
    // in `items`.
    const positionOfCopy = /* @__PURE__ */ new Map();
    clusterItems.forEach((copy, idx) => positionOfCopy.set(copy, idx));
    const consumed = /* @__PURE__ */ new Set();
    for (const cluster of clusterResults) {
      for (const used of cluster.usedItems) {
        const idx = positionOfCopy.get(used);
        if (idx !== void 0) consumed.add(idx);
      }
      blocks.push({ type: "table", table: cluster.table, pageNumber: pageNum, bbox: cluster.bbox });
    }
    const leftover = items.filter((_, idx) => !consumed.has(idx));
    if (leftover.length > 0) pushLineParagraphs(groupByY(leftover));
    blocks.sort((a, b) => topEdge(b) - topEdge(a));
    return detectSpecialKoreanTables(blocks);
  }

  const allYLines = groupByY(items);
  const pageColumns = detectColumns(allYLines);
  if (hasColumnLayout(pageColumns)) {
    pushParagraph(extractWithColumns(allYLines, pageColumns), items);
    return detectSpecialKoreanTables(blocks);
  }

  const ys = items.map((item) => item.y);
  const verticalSpan = safeMax(ys) - safeMin(ys);
  const gapThreshold = Math.max(15, verticalSpan * 0.03);
  for (const group of xyCutOrder(items, gapThreshold)) {
    if (group.length === 0) continue;
    const yLines = groupByY(group);
    const groupColumns = detectColumns(yLines);
    if (hasColumnLayout(groupColumns)) {
      pushParagraph(extractWithColumns(yLines, groupColumns), group);
    } else {
      pushLineParagraphs(yLines);
    }
  }
  return detectSpecialKoreanTables(blocks);
}
|
|
4762
|
-
/**
 * Computes the bounding box of a set of items.
 *
 * The box spans from the smallest `x`/`y` to the largest `x + w` and
 * `y + height`, where an item's height is its `h` when positive and its
 * `fontSize` otherwise.
 *
 * @param {object[]} items - Items with `x`, `y`, `w`, `h`, `fontSize`.
 * @param {number} pageNum - Page number stored in the result.
 * @returns {{page: number, x: number, y: number, width: number, height: number}}
 */
function computeBBox(items, pageNum) {
  let left = Infinity;
  let bottom = Infinity;
  let right = -Infinity;
  let top = -Infinity;

  for (const item of items) {
    const itemRight = item.x + item.w;
    // Items reporting no height fall back to their font size.
    const itemTop = item.y + (item.h > 0 ? item.h : item.fontSize);
    if (item.x < left) left = item.x;
    if (item.y < bottom) bottom = item.y;
    if (itemRight > right) right = itemRight;
    if (itemTop > top) top = itemTop;
  }

  return { page: pageNum, x: left, y: bottom, width: right - left, height: top - bottom };
}
|
|
4773
|
-
/**
 * Determines the most frequent positive font size among the items and the
 * font name of the first item using that size.
 *
 * On a tie, the size that reached the highest count first (in item order)
 * wins. Items whose font size is zero or negative are ignored.
 *
 * @param {object[]} items - Items with `fontSize` and `fontName`.
 * @returns {{fontSize: number, fontName: (string|undefined)}|undefined}
 *   `undefined` when there are no items or no item has a positive font size;
 *   `fontName` is `undefined` when the matching item's name is empty.
 */
function dominantStyle(items) {
  if (items.length === 0) return void 0;

  const tally = /* @__PURE__ */ new Map();
  let bestCount = 0;
  let bestSize = 0;
  for (const { fontSize } of items) {
    if (fontSize <= 0) continue;
    const seen = (tally.get(fontSize) || 0) + 1;
    tally.set(fontSize, seen);
    if (seen > bestCount) {
      bestCount = seen;
      bestSize = fontSize;
    }
  }
  if (bestSize === 0) return void 0;

  const representative = items.find((item) => item.fontSize === bestSize);
  return { fontSize: bestSize, fontName: representative?.fontName || void 0 };
}
|
|
4790
|
-
/**
 * Converts raw text-content items into normalized items
 * (`text, x, y, w, h, fontSize, fontName, isHidden`).
 *
 * Steps:
 *  - Items whose `str` is not a string are ignored; whitespace-only items are
 *    remembered only as space positions.
 *  - Position comes from `transform[4]`/`transform[5]`, font size from the
 *    larger absolute value of `transform[0]`/`transform[3]`; all are rounded.
 *  - An item is flagged hidden when its font size rounds to 0 or its raw
 *    width is exactly 0.
 *  - Text made only of digits, whitespace and `-().·,☎` that contains a digit
 *    and a space has its spaces removed.
 *  - Text that `splitEvenSpacedItem` can split is emitted as one item per
 *    piece.
 *  - Items are sorted top-to-bottom then left-to-right, and an item is dropped
 *    when an already kept item has the same text within 3 units in x and y.
 *  - `hasSpaceBefore = true` is set on items that have a remembered space on
 *    the same line (within 3 units in y) up to 20 units to their left.
 *
 * @param {Iterable<object>} rawItems - Items with `str`, `transform`, `width`,
 *   `height`, `fontName`.
 * @returns {object[]} Normalized, sorted and de-duplicated items.
 */
function normalizeItems(rawItems) {
  const PHONE_LIKE = /^[\d\s\-().·,☎]+$/;
  const collected = [];
  const blankPositions = [];

  for (const raw of rawItems) {
    if (typeof raw.str !== "string") continue;
    const x = Math.round(raw.transform[4]);
    const y = Math.round(raw.transform[5]);
    let text = raw.str.trim();
    if (text === "") {
      blankPositions.push({ x, y });
      continue;
    }

    const fontSize = Math.round(Math.max(Math.abs(raw.transform[3]), Math.abs(raw.transform[0])));
    const w = Math.round(raw.width);
    const h = Math.round(raw.height);
    // `text` is known to be non-empty here, so only size and width decide.
    const isHidden = fontSize === 0 || raw.width === 0;

    if (PHONE_LIKE.test(text) && /\d/.test(text) && / /.test(text)) {
      text = text.replace(/ /g, "");
    }

    const pieces = splitEvenSpacedItem(text, x, w, fontSize) || [{ text, x, w }];
    for (const piece of pieces) {
      collected.push({
        text: piece.text,
        x: piece.x,
        y,
        w: piece.w,
        h,
        fontSize,
        fontName: raw.fontName || "",
        isHidden
      });
    }
  }

  collected.sort((a, b) => b.y - a.y || a.x - b.x);

  const unique = [];
  for (const candidate of collected) {
    let duplicate = false;
    for (let j = unique.length - 1; j >= 0; j -= 1) {
      const kept = unique[j];
      // Kept items further than 3 units above cannot match; stop scanning.
      if (kept.y - candidate.y > 3) break;
      const samePlace =
        Math.abs(kept.y - candidate.y) <= 3 && Math.abs(kept.x - candidate.x) <= 3;
      if (samePlace && kept.text === candidate.text) {
        duplicate = true;
        break;
      }
    }
    if (!duplicate) unique.push(candidate);
  }

  for (const item of unique) {
    const precededBySpace = blankPositions.some((blank) => {
      const gap = item.x - blank.x;
      return Math.abs(blank.y - item.y) <= 3 && gap >= 0 && gap <= 20;
    });
    if (precededBySpace) item.hasSpaceBefore = true;
  }
  return unique;
}
|
|
4849
|
-
/**
 * Splits an evenly spaced string such as "가 나 다" into one positioned piece
 * per character.
 *
 * The text must consist solely of single Hangul syllables or digits separated
 * by single spaces, with at least three characters. The item width is divided
 * evenly among the characters; the split is rejected when that per-character
 * slot is more than twice the font size.
 *
 * @param {string} text - Item text.
 * @param {number} itemX - Left edge of the item.
 * @param {number} itemW - Width of the item.
 * @param {number} fontSize - Font size of the item.
 * @returns {Array<{text: string, x: number, w: number}>|null} One entry per
 *   character, or `null` when the text does not qualify.
 */
function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
  // The pattern itself guarantees at least three characters.
  const isEvenlySpaced = /^[가-힣\d](?: [가-힣\d]){2,}$/.test(text);
  if (!isEvenlySpaced) return null;

  const chars = text.split(" ");
  const slotWidth = itemW / chars.length;
  if (slotWidth > fontSize * 2) return null;

  // The actual glyph is narrower than its slot, hence the 0.8 factor.
  const glyphWidth = Math.round(slotWidth * 0.8);
  return chars.map((ch, idx) => ({
    text: ch,
    x: Math.round(itemX + idx * slotWidth),
    w: glyphWidth
  }));
}
|
|
4862
|
-
/**
 * Groups consecutive items into lines by their `y` coordinate.
 *
 * Items are taken in the given order. A new line starts whenever an item's
 * `y` differs by more than 3 from the `y` of the first item of the current
 * line.
 *
 * @param {object[]} items - Items with `y`.
 * @returns {object[][]} Lines in input order; empty for empty input.
 */
function groupByY(items) {
  const lines = [];
  let anchorY;
  for (const item of items) {
    const current = lines[lines.length - 1];
    const startsNewLine = current === void 0 || Math.abs(item.y - anchorY) > 3;
    if (startsNewLine) {
      lines.push([item]);
      anchorY = item.y;
    } else {
      current.push(item);
    }
  }
  return lines;
}
|
|
4878
|
-
/**
 * Tells whether a line looks like spread-out prose rather than table cells.
 *
 * Requires at least four items. The line qualifies when the largest gap
 * between horizontally neighbouring items is below 40 and the average text
 * length per item is below 5.
 *
 * @param {object[]} items - Items of one line with `x`, `w`, `text`.
 * @returns {boolean}
 */
function isProseSpread(items) {
  if (items.length < 4) return false;

  const leftToRight = [...items].sort((a, b) => a.x - b.x);
  const gaps = leftToRight
    .slice(1)
    .map((curr, idx) => curr.x - (leftToRight[idx].x + leftToRight[idx].w));
  const widestGap = safeMax(gaps);

  const totalChars = items.reduce((sum, item) => sum + item.text.length, 0);
  const averageLength = totalChars / items.length;
  return widestGap < 40 && averageLength < 5;
}
|
|
4889
|
-
/**
 * Detects column start positions from the x coordinates of items across lines.
 *
 * Procedure:
 *  - Returns `null` when there are no items or their horizontal extent is
 *    below 100.
 *  - Only lines before the first short line (at most two items) containing the
 *    text "비고" are analysed; lines judged by `isProseSpread` are skipped.
 *  - Item x positions are clustered with a tolerance of 22 around a running
 *    (rounded) mean; clusters with fewer than 3 members are discarded.
 *  - Clusters whose leftmost positions are less than 40 apart are merged.
 *  - Columns closer than 30 to the previously accepted column are dropped.
 *
 * @param {object[][]} yLines - Lines of items with `x`, `w`, `text`.
 * @returns {number[]|null} Ascending column start positions (at least three),
 *   or `null` when no column layout is found.
 */
function detectColumns(yLines) {
  const CLUSTER_TOL = 22;
  const MERGE_TOL = 40;
  const MIN_DETECT_COL_WIDTH = 30;

  const everyItem = yLines.flat();
  if (everyItem.length === 0) return null;
  const extent = safeMax(everyItem.map((item) => item.x + item.w)) - safeMin(everyItem.map((item) => item.x));
  if (extent < 100) return null;

  const remarkLineIdx = yLines.findIndex(
    (line) => line.length <= 2 && line.some((item) => item.text === "\uBE44\uACE0")
  );
  const candidateLines = remarkLineIdx >= 0 ? yLines.slice(0, remarkLineIdx) : yLines;

  const clusters = [];
  for (const line of candidateLines) {
    if (isProseSpread(line)) continue;
    for (const item of line) {
      const home = clusters.find((c) => Math.abs(item.x - c.center) <= CLUSTER_TOL);
      if (home === void 0) {
        clusters.push({ center: item.x, count: 1, minX: item.x });
        continue;
      }
      // Update the running mean before the member count is bumped.
      home.center = Math.round((home.center * home.count + item.x) / (home.count + 1));
      home.minX = Math.min(home.minX, item.x);
      home.count += 1;
    }
  }

  const peaks = clusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
  if (peaks.length < 3) return null;

  const merged = [peaks[0]];
  for (const peak of peaks.slice(1)) {
    const last = merged[merged.length - 1];
    if (peak.minX - last.minX < MERGE_TOL) {
      if (peak.count > last.count) last.center = peak.center;
      last.count += peak.count;
      last.minX = Math.min(last.minX, peak.minX);
    } else {
      merged.push({ ...peak });
    }
  }

  const starts = merged.filter((c) => c.count >= 3).map((c) => c.minX);
  if (starts.length < 3) return null;

  const columns = [starts[0]];
  for (const start of starts.slice(1)) {
    if (start - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
    columns.push(start);
  }
  return columns.length >= 3 ? columns : null;
}
|
|
4948
|
-
/**
 * Returns the index of the column an x position belongs to.
 *
 * Columns are scanned from last to first; the first one whose start, minus a
 * tolerance of 10, is not greater than `x` wins. Index 0 is returned when no
 * column matches.
 *
 * @param {number} x - Position to classify.
 * @param {number[]} columns - Column start positions.
 * @returns {number} Column index.
 */
function findColumn(x, columns) {
  let idx = columns.length - 1;
  while (idx >= 0) {
    if (x >= columns[idx] - 10) return idx;
    idx -= 1;
  }
  return 0;
}
|
|
4954
|
-
/**
 * Renders lines as text, formatting the part that follows a column layout as
 * grid tables.
 *
 * - The table region ends at the first short line (at most two items)
 *   containing the text "비고", or at the end of the lines.
 * - It starts at the first line before that end whose items fall into three
 *   or more distinct columns; earlier lines are emitted as plain merged lines.
 * - Inside the region, consecutive lines that have an item between
 *   `columns[0] - 20` and `lastColumn + 200` and are not prose spreads are
 *   collected and rendered with `buildGridTable`; any other line flushes the
 *   pending grid and is emitted as a plain merged line.
 * - The "비고" line and everything after it are emitted as plain merged lines,
 *   preceded by an empty line.
 *
 * @param {object[][]} yLines - Lines of items.
 * @param {number[]} columns - Column start positions.
 * @returns {string} The rendered text, parts joined by newlines.
 */
function extractWithColumns(yLines, columns) {
  const out = [];
  const leftmost = columns[0];
  const rightmost = columns[columns.length - 1];

  const remarkIdx = yLines.findIndex(
    (line) => line.length <= 2 && line.some((item) => item.text === "\uBE44\uACE0")
  );
  const tableEnd = remarkIdx >= 0 ? remarkIdx : yLines.length;
  const tableStart = yLines
    .slice(0, tableEnd)
    .findIndex((line) => new Set(line.map((item) => findColumn(item.x, columns))).size >= 3);

  const leadingEnd = tableStart >= 0 ? tableStart : tableEnd;
  for (const line of yLines.slice(0, leadingEnd)) out.push(mergeLineSimple(line));

  if (tableStart >= 0) {
    let pending = [];
    const flushPending = () => {
      if (pending.length === 0) return;
      out.push(buildGridTable(pending, columns));
      pending = [];
    };
    for (const line of yLines.slice(tableStart, tableEnd)) {
      const touchesColumns = line.some(
        (item) => item.x >= leftmost - 20 && item.x <= rightmost + 200
      );
      if (touchesColumns && !isProseSpread(line)) {
        pending.push(line);
        continue;
      }
      flushPending();
      out.push(mergeLineSimple(line));
    }
    flushPending();
  }

  if (remarkIdx >= 0) {
    out.push("");
    for (const line of yLines.slice(remarkIdx)) out.push(mergeLineSimple(line));
  }
  return out.join("\n");
}
|
|
5005
|
-
/**
 * Builds a Markdown table (or plain text when the grid is too sparse) from
 * lines of positioned text items.
 *
 * Steps: bucket every item into a column, fold continuation lines into the
 * previous logical row, collapse leading header rows into one, normalise cell
 * spacing, then either emit a pipe table or fall back to space-joined text.
 *
 * @param {Array<Array<object>>} lines - Lines of items; each item is read for `x` and `text`.
 * @param {number[]} columns - Column positions; its length is the table width.
 * @returns {string} A Markdown table, or the rows as plain text lines.
 */
function buildGridTable(lines, columns) {
  const numCols = columns.length;
  // One raw row per input line: item texts are appended (space separated) to their column.
  const yRows = lines.map((items) => {
    const row = Array(numCols).fill("");
    for (const item of items) {
      const col = findColumn(item.x, columns);
      row[col] = row[col] ? row[col] + " " + item.text : item.text;
    }
    return row;
  });
  // Columns from this index onwards are treated as "data" columns by the heuristics below.
  const dataColStart = Math.max(2, Math.floor(numCols / 2));
  const merged = [];
  for (const row of yRows) {
    // Completely empty rows contribute nothing.
    if (row.every((c) => c === "")) continue;
    if (merged.length === 0) {
      merged.push([...row]);
      continue;
    }
    const prev = merged[merged.length - 1];
    const filledCols = row.map((c, i) => c ? i : -1).filter((i) => i >= 0);
    const filledCount = filledCols.length;
    let isNewRow = false;
    // A first-column text of three or more characters starts a new row.
    if (row[0] && row[0].length >= 3) {
      isNewRow = true;
    }
    // Any text in the second column also starts a new row.
    if (!isNewRow && numCols > 1 && row[1]) {
      isNewRow = true;
    }
    // Data in this line while the previous row already has data: new row.
    if (!isNewRow) {
      const hasData = row.slice(dataColStart).some((c) => c !== "");
      const prevHasData = prev.slice(dataColStart).some((c) => c !== "");
      if (hasData && prevHasData) {
        isNewRow = true;
      }
    }
    // NOTE(review): this reset looks unreachable — none of the three rules above
    // can set isNewRow when only column 0 is filled with at most two characters
    // (dataColStart is always >= 2). Confirm before relying on it.
    if (isNewRow && filledCount === 1 && row[0] && row[0].length <= 2) {
      isNewRow = false;
    }
    if (isNewRow) {
      merged.push([...row]);
    } else {
      // Continuation line: append each filled cell to the same column of the previous row.
      for (let c = 0; c < numCols; c++) {
        if (row[c]) {
          prev[c] = prev[c] ? prev[c] + " " + row[c] : row[c];
        }
      }
    }
  }
  // Fewer than two logical rows is not a table; emit the non-empty cells as text.
  if (merged.length < 2) {
    return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
  }
  // Leading rows whose data columns contain no digit are counted as header rows.
  let headerEnd = 0;
  for (let r = 0; r < merged.length; r++) {
    const hasDataValues = merged[r].slice(dataColStart).some((c) => c && /\d/.test(c));
    if (hasDataValues) break;
    headerEnd = r + 1;
  }
  // Several header rows are flattened into one, column by column.
  if (headerEnd > 1) {
    const headerRow = Array(numCols).fill("");
    for (let r = 0; r < headerEnd; r++) {
      for (let c = 0; c < numCols; c++) {
        if (merged[r][c]) {
          headerRow[c] = headerRow[c] ? headerRow[c] + " " + merged[r][c] : merged[r][c];
        }
      }
    }
    merged.splice(0, headerEnd, headerRow);
  }
  // Every non-empty cell is passed through collapseEvenSpacing.
  for (const row of merged) {
    for (let c = 0; c < row.length; c++) {
      if (row[c]) row[c] = collapseEvenSpacing(row[c]);
    }
  }
  const totalCells = merged.length * numCols;
  const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
  // Fall back to plain text when under 35% of cells are filled, when header
  // flattening left fewer than two rows, or when a grid of seven or more
  // columns has at most three rows.
  if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
    return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
  }
  // Markdown pipe table: first row is the header, followed by the separator row.
  const md = [];
  md.push("| " + merged[0].join(" | ") + " |");
  md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
  for (let r = 1; r < merged.length; r++) {
    md.push("| " + merged[r].join(" | ") + " |");
  }
  return md.join("\n");
}
|
|
5091
|
-
/**
 * Joins the text items of one line into a single string, deciding for each
 * neighbouring pair whether a space belongs between them.
 *
 * @param {Array<object>} items - Items read for `x`, `w`, `fontSize`, `text` and `hasSpaceBefore`.
 * @returns {string} The merged line; "" for an empty line or a single item without text.
 */
function mergeLineSimple(items) {
  if (items.length <= 1) return items[0]?.text || "";

  // Work on a copy sorted left to right so the caller's array is untouched.
  const ordered = [...items].sort((a, b) => a.x - b.x);
  const evenSpaced = detectEvenSpacedItems(ordered);

  // Decides whether a space goes between item idx-1 and item idx.
  // The rules are checked in priority order; the first one that applies wins.
  const needsSpace = (idx, textSoFar) => {
    const left = ordered[idx - 1];
    const right = ordered[idx];
    const gap = right.x - (left.x + left.w);
    const avgFs = (right.fontSize + left.fontSize) / 2;

    // A gap wider than two font sizes (and at least 30 units) always separates.
    if (gap > Math.max(avgFs * 2, 30)) return true;
    // Items flagged as evenly spaced are glued together.
    if (evenSpaced[idx]) return false;
    // Honour an explicit space flag unless the items practically touch.
    if (right.hasSpaceBefore && gap >= avgFs * 0.05) return true;
    // A bullet glyph followed by Hangul gets a space when there is any visible gap.
    if (/[□■○●▶◆◇ㅇ]$/.test(left.text) && /^[가-힣]/.test(right.text) && gap > 1) return true;
    // Tight gaps never produce a space.
    if (gap < avgFs * 0.15) return false;
    // Slightly wider gaps are still glued when Hangul is on either side.
    if (gap < avgFs * 0.35 && (/[가-힣]$/.test(textSoFar) || /^[가-힣]/.test(right.text))) return false;
    return gap > 3;
  };

  let merged = ordered[0].text;
  for (let idx = 1; idx < ordered.length; idx++) {
    if (needsSpace(idx, merged)) merged += " ";
    merged += ordered[idx].text;
  }
  return merged;
}
|
|
5126
|
-
/**
 * Post-processes text extracted from a PDF: removes page-number artefacts,
 * merges wrapped lines, normalises spacing and collapses blank-line runs.
 *
 * @param {string} text - Raw extracted text.
 * @returns {string} The cleaned, trimmed text.
 */
function cleanPdfText(text) {
  // Applied in order, before line merging.
  const pageNumberRules = [
    // A 1-4 digit line at the very start of the text.
    [/^\d{1,4}\n/, ""],
    // Lines made of a number decorated with dashes, e.g. "- 3 -".
    [/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, ""],
    // Lines of the form "3 / 12".
    [/^\s*\d+\s*\/\s*\d+\s*$/gm, ""],
    // A 1-4 digit line between two line breaks.
    [/\n\d{1,4}\n/g, "\n"],
    // A 1-4 digit line at the very end of the text.
    [/\n\d{1,4}$/, ""],
    // A heading whose whole content is a 1-4 digit number.
    [/^#{1,6}\s*\d{1,4}\s*$/gm, ""]
  ];
  let stripped = text;
  for (const [pattern, replacement] of pageNumberRules) {
    stripped = stripped.replace(pattern, replacement);
  }

  const joined = mergeKoreanLines(stripped);
  // Every line except Markdown table separator rows goes through collapseEvenSpacing.
  const respaced = joined.replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line));
  // After a bullet glyph, two Hangul characters split by whitespace are rejoined.
  const bulletsFixed = respaced.replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3");
  // Three or more consecutive line breaks become one blank line.
  return bulletsFixed.replace(/\n{3,}/g, "\n\n").trim();
}
|
|
5131
|
-
/**
 * Reports whether a line, ignoring leading whitespace, begins with a list or
 * clause marker.
 *
 * @param {string} line - The line to inspect.
 * @returns {boolean} True when any marker pattern matches the start of the line.
 */
function startsWithMarker(line) {
  const markerPatterns = [
    /^[가-힣ㄱ-ㅎ][.)]/, // one Hangul syllable or consonant followed by "." or ")"
    /^\d+[.)]/, // digits followed by "." or ")"
    /^\([가-힣ㄱ-ㅎ\d]+\)/, // parenthesised Hangul or digits
    /^[○●※▶▷◆◇■□★☆\-·]\s/, // bullet glyph followed by whitespace
    /^제\d+[조항호장절]/ // "제" + digits + one of 조/항/호/장/절
  ];
  const candidate = line.trimStart();
  return markerPatterns.some((pattern) => pattern.test(candidate));
}
|
|
5135
|
-
/**
 * Reports whether a trimmed line consists only of a clause header: "제" +
 * digits + one of 조/항/호/장/절, an optional parenthesised title, and at most
 * seven further whitespace-separated words.
 *
 * @param {string} line - The line to inspect.
 * @returns {boolean} True when the whole line matches the header shape.
 */
function isStandaloneHeader(line) {
  const headerPattern = /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/;
  const candidate = line.trim();
  return headerPattern.test(candidate);
}
|
|
5138
|
-
/**
 * Reclassifies paragraph blocks that start with a list marker as list blocks.
 *
 * A paragraph whose trimmed text starts with "<digits>. " becomes an ordered
 * list block; one that starts with a bullet glyph followed by whitespace
 * becomes an unordered list block. All other blocks are passed through as-is.
 *
 * @param {Array<object>} blocks - Blocks read for `type` and `text`.
 * @returns {Array<object>} A new array; converted blocks are shallow copies.
 */
function detectListBlocks(blocks) {
  const orderedPrefix = /^\d+\.\s/;
  const bulletPrefix = /^[○●·※▶▷◆◇\-]\s/;

  return blocks.map((block) => {
    if (block.type !== "paragraph" || !block.text) return block;

    const trimmed = block.text.trim();
    let listType = null;
    if (orderedPrefix.test(trimmed)) {
      listType = "ordered";
    } else if (bulletPrefix.test(trimmed)) {
      listType = "unordered";
    }
    // The original (untrimmed) text is kept on the converted block.
    return listType ? { ...block, type: "list", listType, text: block.text } : block;
  });
}
|
|
5157
|
-
// Matches text that starts with one of the listed Korean field labels
// (optionally parenthesised) followed by a colon or whitespace.
var KOREAN_TABLE_HEADER_RE = /^\(?(구분|항목|종류|분류|유형|대상|내용|기간|금액|비율|방법|절차|요건|조건|근거|목적|범위|기준)\)?[:\s]/;
// Colons that are not key/value separators: digit:digit pairs and "://".
var KV_FALSE_POSITIVE_RE = /\d{1,2}:\d{2}|:\/\/|\d+:\d+/;
/**
 * Converts runs of consecutive "label: value" paragraphs into two-column
 * table blocks.
 *
 * A run starts with a paragraph matching KOREAN_TABLE_HEADER_RE and is
 * extended by further matching paragraphs, or by paragraphs of the form
 * "<2-8 Hangul characters>: value" that contain no parentheses and no
 * false-positive colon. A run of fewer than two entries is emitted as the
 * original blocks.
 *
 * @param {Array<object>} blocks - Blocks read for `type`, `text`, `pageNumber` and `bbox`.
 * @returns {Array<object>} Blocks with qualifying runs replaced by table blocks.
 */
function detectSpecialKoreanTables(blocks) {
  const output = [];
  let pending = [];

  const makeCell = (text, colSpan) => ({ text, colSpan, rowSpan: 1 });

  // Splits text around the separator at idx into trimmed key and value.
  const splitAt = (text, idx) => ({
    key: text.slice(0, idx).trim(),
    value: text.slice(idx + 1).trim()
  });

  // Emits the pending run, as a table when it has two or more entries.
  const flushPending = () => {
    const entries = pending;
    pending = [];
    if (entries.length < 2) {
      entries.forEach((entry) => output.push(entry.block));
      return;
    }
    // An entry without a value gets a key cell with colSpan 2 plus an empty cell.
    const cells = entries.map(({ key, value }) =>
      value ? [makeCell(key, 1), makeCell(value, 1)] : [makeCell(key, 2), makeCell("", 1)]
    );
    // The table takes its position from the first block of the run.
    const { pageNumber, bbox } = entries[0].block;
    output.push({
      type: "table",
      table: { rows: cells.length, cols: 2, cells, hasHeader: true },
      pageNumber,
      bbox
    });
  };

  // Returns a key/value entry for the paragraph text, or null when it is not one.
  const matchEntry = (text, block) => {
    if (KOREAN_TABLE_HEADER_RE.test(text)) {
      const colonIdx = text.indexOf(":");
      if (colonIdx >= 0) return { ...splitAt(text, colonIdx), block };
      // No colon: fall back to the first whitespace as the separator.
      const spaceIdx = text.search(/\s/);
      return spaceIdx > 0 ? { ...splitAt(text, spaceIdx), block } : { key: text, value: "", block };
    }
    // Other labels only extend a run that is already open.
    const mayContinue =
      pending.length > 0 &&
      text.includes(":") &&
      !KV_FALSE_POSITIVE_RE.test(text) &&
      !text.includes("(") &&
      !text.includes(")");
    if (!mayContinue) return null;
    const { key, value } = splitAt(text, text.indexOf(":"));
    const isShortHangulKey = /^[가-힣]+$/.test(key) && key.length >= 2 && key.length <= 8;
    return isShortHangulKey ? { key, value, block } : null;
  };

  for (const block of blocks) {
    const entry =
      block.type === "paragraph" && block.text ? matchEntry(block.text.trim(), block) : null;
    if (entry) {
      pending.push(entry);
      continue;
    }
    // Anything else ends the current run and is passed through.
    flushPending();
    output.push(block);
  }
  flushPending();
  return output;
}
|
|
5244
|
-
/**
 * Finds blocks in the top or bottom 10% band of their page whose text, with
 * every digit run replaced by "#", occurs at least three times within the
 * same band across the document.
 *
 * When at least one block is selected, a warning entry is appended to
 * `warnings`. The `blocks` array itself is not modified.
 *
 * @param {Array<object>} blocks - Blocks read for `bbox` ({page, y, height}), `pageNumber` and `text`.
 * @param {Map<number, number>} pageHeights - Page height looked up by `bbox.page`, then by `pageNumber`.
 * @param {Array<object>} warnings - Receives a `{ message, code }` entry when blocks are selected.
 * @returns {number[]} Indices into `blocks` of the repeated header/footer blocks.
 */
function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
  const ZONE_RATIO = 0.1;
  const MIN_REPEAT = 3;

  const normalize = (text) => text.trim().replace(/\d+/g, "#");

  // Classifies a block as "footer", "header" or null (not in a band / not usable).
  const zoneOf = (block) => {
    if (!block.bbox || !block.pageNumber || !block.text?.trim()) return null;
    const pageHeight = pageHeights.get(block.bbox.page) || pageHeights.get(block.pageNumber);
    if (!pageHeight) return null;
    // Both block edges mirrored against the page height.
    const mirroredFarEdge = pageHeight - (block.bbox.y + block.bbox.height);
    const mirroredNearEdge = pageHeight - block.bbox.y;
    // The footer test takes precedence over the header test.
    if (mirroredNearEdge <= pageHeight * ZONE_RATIO) return "footer";
    if (mirroredFarEdge >= pageHeight * (1 - ZONE_RATIO)) return "header";
    return null;
  };

  const zones = blocks.map(zoneOf);

  // Count normalised texts separately for the header band and the footer band.
  const tallies = { header: /* @__PURE__ */ new Map(), footer: /* @__PURE__ */ new Map() };
  zones.forEach((zone, idx) => {
    if (!zone) return;
    const pattern = normalize(blocks[idx].text);
    tallies[zone].set(pattern, (tallies[zone].get(pattern) || 0) + 1);
  });

  const repeatedPatterns = /* @__PURE__ */ new Set();
  for (const tally of [tallies.header, tallies.footer]) {
    for (const [pattern, count] of tally) {
      if (count >= MIN_REPEAT) repeatedPatterns.add(pattern);
    }
  }
  if (repeatedPatterns.size === 0) return [];

  // A block in either band is selected when its pattern repeats in either band.
  const removeIndices = [];
  zones.forEach((zone, idx) => {
    if (zone && repeatedPatterns.has(normalize(blocks[idx].text))) {
      removeIndices.push(idx);
    }
  });

  if (removeIndices.length > 0) {
    warnings.push({ message: `${removeIndices.length}\uAC1C \uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00 \uC694\uC18C \uC81C\uAC70\uB428`, code: "HIDDEN_TEXT_FILTERED" });
  }
  return removeIndices;
}
|
|
5300
|
-
/**
 * Re-joins lines that were wrapped in the middle of a Korean sentence.
 *
 * Each line is compared with the most recent output line. The first rule
 * that applies decides what happens to it:
 *   1. Markdown structure (heading on either side, table row, "---") is kept
 *      on its own line.
 *   2. After a line ending in a comma, a non-blank line is attached to the
 *      previous entry with the line break preserved.
 *   3. A line starting with "(※" is appended to the previous line with a space.
 *   4. A line starting with Hangul or "(" that follows a line ending in
 *      Hangul, "·", "," or "-" is appended with a space, unless either line
 *      starts with a marker or the previous line is a standalone header.
 *   5. Otherwise the line starts a new entry.
 *
 * @param {string} text - Multi-line text.
 * @returns {string} The text with wrapped lines merged; "" for empty input.
 */
function mergeKoreanLines(text) {
  if (!text) return "";
  const lines = text.split("\n");
  if (lines.length <= 1) return text;

  const merged = [lines[0]];
  for (const curr of lines.slice(1)) {
    const lastIdx = merged.length - 1;
    const prev = merged[lastIdx];
    const currTrimmed = curr.trim();

    const isStructural =
      /^#{1,6}\s/.test(prev) ||
      /^#{1,6}\s/.test(curr) ||
      /^\|/.test(currTrimmed) ||
      /^---/.test(currTrimmed);

    if (isStructural) {
      merged.push(curr);
    } else if (/,$/.test(prev.trim()) && currTrimmed.length > 0) {
      // Stays on its own line but becomes part of the previous entry.
      merged[lastIdx] = prev + "\n" + curr;
    } else if (/^\(※/.test(currTrimmed)) {
      merged[lastIdx] = prev + " " + currTrimmed;
    } else if (
      /[가-힣·,\-]$/.test(prev) &&
      /^[가-힣(]/.test(curr) &&
      !startsWithMarker(curr) &&
      !isStandaloneHeader(prev) &&
      !startsWithMarker(prev)
    ) {
      merged[lastIdx] = prev + " " + curr;
    } else {
      merged.push(curr);
    }
  }
  return merged.join("\n");
}
|
|
5329
|
-
|
|
5330
|
-
// src/xlsx/parser.ts
|
|
5331
|
-
var import_jszip3 = __toESM(require("jszip"), 1);
|
|
5332
|
-
var import_xmldom2 = require("@xmldom/xmldom");
|
|
5333
|
-
// Limits used by the XLSX parser.
// NOTE(review): the uses of MAX_SHEETS and MAX_COLS2 are not visible in this
// part of the file; presumably they cap the number of sheets and columns read — confirm.
var MAX_SHEETS = 100;
// 100 MiB; passed to precheckZipSize before the archive is opened.
var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024;
// Worksheet rows whose zero-based index is at or above this value are skipped.
var MAX_ROWS2 = 1e4;
var MAX_COLS2 = 200;
|
|
5337
|
-
/**
 * Removes floating-point noise from a plain decimal string by rounding it to
 * 15 significant digits (e.g. "0.30000000000000004" becomes "0.3").
 *
 * @param {string} raw - Raw cell value.
 * @returns {string} The cleaned number as a string, or `raw` unchanged when it
 *   is not of the form "[-]digits.digits" or does not parse to a finite number.
 */
function cleanNumericValue(raw) {
  const isPlainDecimal = /^-?\d+\.\d+$/.test(raw);
  if (!isPlainDecimal) return raw;

  const parsed = parseFloat(raw);
  if (!Number.isFinite(parsed)) return raw;

  // Round-trip through toPrecision(15) to drop the trailing binary noise.
  const rounded = parseFloat(parsed.toPrecision(15));
  return String(rounded);
}
|
|
5344
|
-
/**
 * Parses an A1-style cell reference into zero-based coordinates.
 *
 * @param {string} ref - Reference made of uppercase letters followed by digits, e.g. "AA10".
 * @returns {{col: number, row: number} | null} Zero-based column and row, or
 *   null when `ref` does not have that shape.
 */
function parseCellRef(ref) {
  const match = ref.match(/^([A-Z]+)(\d+)$/);
  if (!match) return null;

  const [, letters, digits] = match;
  // Column letters are a base-26 number with digits A=1 .. Z=26.
  const colNumber = [...letters].reduce(
    (acc, ch) => acc * 26 + (ch.charCodeAt(0) - 64),
    0
  );
  return { col: colNumber - 1, row: parseInt(digits, 10) - 1 };
}
|
|
5351
|
-
/**
 * Parses a merged-range reference such as "A1:C3" into zero-based bounds.
 *
 * @param {string} ref - Two cell references separated by a single ":".
 * @returns {{startCol: number, startRow: number, endCol: number, endRow: number} | null}
 *   The range bounds, or null when `ref` is not exactly two valid cell references.
 */
function parseMergeRef(ref) {
  const corners = ref.split(":");
  if (corners.length !== 2) return null;

  const [first, last] = corners.map((corner) => parseCellRef(corner));
  if (!first || !last) return null;

  return {
    startCol: first.col,
    startRow: first.row,
    endCol: last.col,
    endRow: last.row
  };
}
|
|
5359
|
-
/**
 * Returns the elements found by `parent.getElementsByTagName(tagName)` as a
 * plain array.
 *
 * @param {object} parent - Node exposing `getElementsByTagName`.
 * @param {string} tagName - Tag name to look up.
 * @returns {Array<object>} The matching elements, in list order.
 */
function getElements(parent, tagName) {
  const found = parent.getElementsByTagName(tagName);
  // Copy by index so the result is a real array independent of the node list.
  return Array.from({ length: found.length }, (_, idx) => found[idx]);
}
|
|
5365
|
-
/**
 * Returns an element's text content with surrounding whitespace removed.
 *
 * @param {object} el - Node read for `textContent`.
 * @returns {string} The trimmed text, or "" when `textContent` is null or undefined.
 */
function getTextContent(el) {
  const raw = el.textContent;
  if (raw === null || raw === undefined) return "";
  return raw.trim();
}
|
|
5368
|
-
/**
 * Parses XML text into a DOM document after passing it through `stripDtd`.
 *
 * @param {string} text - XML source.
 * @returns {object} The document returned by the xmldom `DOMParser`.
 */
function parseXml(text) {
  const parser = new import_xmldom2.DOMParser();
  const sanitized = stripDtd(text);
  return parser.parseFromString(sanitized, "text/xml");
}
|
|
5371
|
-
/**
 * Reads the shared string table of a workbook.
 *
 * Every `<si>` element yields one string: the concatenated text of its `<t>`
 * descendants. `<t>` elements that sit directly inside a phonetic run
 * (`<rPh>`) are skipped, because they hold a pronunciation hint for the
 * string rather than part of its value.
 *
 * @param {string} xml - Contents of the shared strings part.
 * @returns {string[]} Strings in table order, so a cell's shared-string index
 *   can be used directly as an array index.
 */
function parseSharedStrings(xml) {
  const doc = parseXml(xml);
  const strings = [];
  for (const si of getElements(doc.documentElement, "si")) {
    // Plain <t> and rich-text <r><t> carry the value; <rPh><t> does not.
    const valueParts = getElements(si, "t").filter((t) => t.parentNode?.nodeName !== "rPh");
    strings.push(valueParts.map((t) => t.textContent ?? "").join(""));
  }
  return strings;
}
|
|
5381
2580
|
function parseWorkbook(xml) {
|
|
5382
2581
|
const doc = parseXml(xml);
|
|
@@ -5384,9 +2583,9 @@ function parseWorkbook(xml) {
|
|
|
5384
2583
|
const sheetElements = getElements(doc.documentElement, "sheet");
|
|
5385
2584
|
for (const el of sheetElements) {
|
|
5386
2585
|
sheets.push({
|
|
5387
|
-
name: el.getAttribute("name")
|
|
5388
|
-
sheetId: el.getAttribute("sheetId")
|
|
5389
|
-
rId: el.getAttribute("r:id")
|
|
2586
|
+
name: _nullishCoalesce(el.getAttribute("name"), () => ( `Sheet${sheets.length + 1}`)),
|
|
2587
|
+
sheetId: _nullishCoalesce(el.getAttribute("sheetId"), () => ( "")),
|
|
2588
|
+
rId: _nullishCoalesce(el.getAttribute("r:id"), () => ( ""))
|
|
5390
2589
|
});
|
|
5391
2590
|
}
|
|
5392
2591
|
return sheets;
|
|
@@ -5409,7 +2608,7 @@ function parseWorksheet(xml, sharedStrings) {
|
|
|
5409
2608
|
let maxCol = 0;
|
|
5410
2609
|
const rows = getElements(doc.documentElement, "row");
|
|
5411
2610
|
for (const rowEl of rows) {
|
|
5412
|
-
const rowNum = parseInt(rowEl.getAttribute("r")
|
|
2611
|
+
const rowNum = parseInt(_nullishCoalesce(rowEl.getAttribute("r"), () => ( "0")), 10) - 1;
|
|
5413
2612
|
if (rowNum < 0 || rowNum >= MAX_ROWS2) continue;
|
|
5414
2613
|
const cells = getElements(rowEl, "c");
|
|
5415
2614
|
for (const cellEl of cells) {
|
|
@@ -5425,7 +2624,7 @@ function parseWorksheet(xml, sharedStrings) {
|
|
|
5425
2624
|
const raw = getTextContent(vElements[0]);
|
|
5426
2625
|
if (type === "s") {
|
|
5427
2626
|
const idx = parseInt(raw, 10);
|
|
5428
|
-
value = sharedStrings[idx]
|
|
2627
|
+
value = _nullishCoalesce(sharedStrings[idx], () => ( ""));
|
|
5429
2628
|
} else if (type === "b") {
|
|
5430
2629
|
value = raw === "1" ? "TRUE" : "FALSE";
|
|
5431
2630
|
} else {
|
|
@@ -5435,7 +2634,7 @@ function parseWorksheet(xml, sharedStrings) {
|
|
|
5435
2634
|
const isEl = getElements(cellEl, "is");
|
|
5436
2635
|
if (isEl.length > 0) {
|
|
5437
2636
|
const tElements = getElements(isEl[0], "t");
|
|
5438
|
-
value = tElements.map((t) => t.textContent
|
|
2637
|
+
value = tElements.map((t) => _nullishCoalesce(t.textContent, () => ( ""))).join("");
|
|
5439
2638
|
}
|
|
5440
2639
|
}
|
|
5441
2640
|
if (!value && fElements.length > 0) {
|
|
@@ -5499,18 +2698,18 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
5499
2698
|
for (let c = 0; c <= maxCol; c++) {
|
|
5500
2699
|
const key = `${r},${c}`;
|
|
5501
2700
|
if (mergeSkip.has(key)) continue;
|
|
5502
|
-
const text = (grid[r] && grid[r][c])
|
|
2701
|
+
const text = _nullishCoalesce((grid[r] && grid[r][c]), () => ( ""));
|
|
5503
2702
|
const merge = mergeMap.get(key);
|
|
5504
2703
|
row.push({
|
|
5505
2704
|
text,
|
|
5506
|
-
colSpan: merge
|
|
5507
|
-
rowSpan: merge
|
|
2705
|
+
colSpan: _nullishCoalesce(_optionalChain([merge, 'optionalAccess', _40 => _40.colSpan]), () => ( 1)),
|
|
2706
|
+
rowSpan: _nullishCoalesce(_optionalChain([merge, 'optionalAccess', _41 => _41.rowSpan]), () => ( 1))
|
|
5508
2707
|
});
|
|
5509
2708
|
}
|
|
5510
2709
|
cellRows.push(row);
|
|
5511
2710
|
}
|
|
5512
2711
|
if (cellRows.length > 0) {
|
|
5513
|
-
const table = buildTable(cellRows);
|
|
2712
|
+
const table = _chunkHXUCZ2ILcjs.buildTable.call(void 0, cellRows);
|
|
5514
2713
|
if (table.rows > 0) {
|
|
5515
2714
|
blocks.push({ type: "table", table, pageNumber: sheetIndex + 1 });
|
|
5516
2715
|
}
|
|
@@ -5518,12 +2717,12 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
5518
2717
|
return blocks;
|
|
5519
2718
|
}
|
|
5520
2719
|
async function parseXlsxDocument(buffer, options) {
|
|
5521
|
-
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
|
|
5522
|
-
const zip = await
|
|
2720
|
+
_chunkHXUCZ2ILcjs.precheckZipSize.call(void 0, buffer, MAX_DECOMPRESS_SIZE3);
|
|
2721
|
+
const zip = await _jszip2.default.loadAsync(buffer);
|
|
5523
2722
|
const warnings = [];
|
|
5524
2723
|
const workbookFile = zip.file("xl/workbook.xml");
|
|
5525
2724
|
if (!workbookFile) {
|
|
5526
|
-
throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
2725
|
+
throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
5527
2726
|
}
|
|
5528
2727
|
let sharedStrings = [];
|
|
5529
2728
|
const ssFile = zip.file("xl/sharedStrings.xml");
|
|
@@ -5532,7 +2731,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
5532
2731
|
}
|
|
5533
2732
|
const sheets = parseWorkbook(await workbookFile.async("text"));
|
|
5534
2733
|
if (sheets.length === 0) {
|
|
5535
|
-
throw new KordocError("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
2734
|
+
throw new (0, _chunkHXUCZ2ILcjs.KordocError)("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
5536
2735
|
}
|
|
5537
2736
|
let relsMap = /* @__PURE__ */ new Map();
|
|
5538
2737
|
const relsFile = zip.file("xl/_rels/workbook.xml.rels");
|
|
@@ -5540,8 +2739,8 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
5540
2739
|
relsMap = parseRels(await relsFile.async("text"));
|
|
5541
2740
|
}
|
|
5542
2741
|
let pageFilter = null;
|
|
5543
|
-
if (options
|
|
5544
|
-
const { parsePageRange: parsePageRange2 } = await Promise.resolve().then(() => (
|
|
2742
|
+
if (_optionalChain([options, 'optionalAccess', _42 => _42.pages])) {
|
|
2743
|
+
const { parsePageRange: parsePageRange2 } = await Promise.resolve().then(() => _interopRequireWildcard(require("./page-range-3C7UGGEK.cjs")));
|
|
5545
2744
|
pageFilter = parsePageRange2(options.pages, sheets.length);
|
|
5546
2745
|
}
|
|
5547
2746
|
const blocks = [];
|
|
@@ -5549,7 +2748,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
5549
2748
|
for (let i = 0; i < processedSheets; i++) {
|
|
5550
2749
|
if (pageFilter && !pageFilter.has(i + 1)) continue;
|
|
5551
2750
|
const sheet = sheets[i];
|
|
5552
|
-
options
|
|
2751
|
+
_optionalChain([options, 'optionalAccess', _43 => _43.onProgress, 'optionalCall', _44 => _44(i + 1, processedSheets)]);
|
|
5553
2752
|
let sheetPath = relsMap.get(sheet.rId);
|
|
5554
2753
|
if (sheetPath) {
|
|
5555
2754
|
if (!sheetPath.startsWith("xl/") && !sheetPath.startsWith("/")) {
|
|
@@ -5592,7 +2791,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
5592
2791
|
const doc = parseXml(coreXml);
|
|
5593
2792
|
const getFirst = (tag) => {
|
|
5594
2793
|
const els = doc.getElementsByTagName(tag);
|
|
5595
|
-
return els.length > 0 ? (els[0].textContent
|
|
2794
|
+
return els.length > 0 ? (_nullishCoalesce(els[0].textContent, () => ( ""))).trim() : void 0;
|
|
5596
2795
|
};
|
|
5597
2796
|
metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
|
|
5598
2797
|
metadata.author = getFirst("dc:creator");
|
|
@@ -5601,16 +2800,16 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
5601
2800
|
if (created) metadata.createdAt = created;
|
|
5602
2801
|
const modified = getFirst("dcterms:modified");
|
|
5603
2802
|
if (modified) metadata.modifiedAt = modified;
|
|
5604
|
-
} catch {
|
|
2803
|
+
} catch (e20) {
|
|
5605
2804
|
}
|
|
5606
2805
|
}
|
|
5607
|
-
const markdown = blocksToMarkdown(blocks);
|
|
2806
|
+
const markdown = _chunkHXUCZ2ILcjs.blocksToMarkdown.call(void 0, blocks);
|
|
5608
2807
|
return { markdown, blocks, metadata, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
5609
2808
|
}
|
|
5610
2809
|
|
|
5611
2810
|
// src/docx/parser.ts
|
|
5612
|
-
|
|
5613
|
-
|
|
2811
|
+
|
|
2812
|
+
|
|
5614
2813
|
var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
|
|
5615
2814
|
function getChildElements(parent, localName2) {
|
|
5616
2815
|
const result = [];
|
|
@@ -5619,7 +2818,7 @@ function getChildElements(parent, localName2) {
|
|
|
5619
2818
|
const node = children[i];
|
|
5620
2819
|
if (node.nodeType === 1) {
|
|
5621
2820
|
const el = node;
|
|
5622
|
-
if (el.localName === localName2 || el.tagName
|
|
2821
|
+
if (el.localName === localName2 || _optionalChain([el, 'access', _45 => _45.tagName, 'optionalAccess', _46 => _46.endsWith, 'call', _47 => _47(`:${localName2}`)])) {
|
|
5623
2822
|
result.push(el);
|
|
5624
2823
|
}
|
|
5625
2824
|
}
|
|
@@ -5634,7 +2833,7 @@ function findElements(parent, localName2) {
|
|
|
5634
2833
|
const child = children[i];
|
|
5635
2834
|
if (child.nodeType === 1) {
|
|
5636
2835
|
const el = child;
|
|
5637
|
-
if (el.localName === localName2 || el.tagName
|
|
2836
|
+
if (el.localName === localName2 || _optionalChain([el, 'access', _48 => _48.tagName, 'optionalAccess', _49 => _49.endsWith, 'call', _50 => _50(`:${localName2}`)])) {
|
|
5638
2837
|
result.push(el);
|
|
5639
2838
|
}
|
|
5640
2839
|
walk(el);
|
|
@@ -5653,7 +2852,7 @@ function getAttr(el, localName2) {
|
|
|
5653
2852
|
return null;
|
|
5654
2853
|
}
|
|
5655
2854
|
function parseXml2(text) {
|
|
5656
|
-
return new
|
|
2855
|
+
return new (0, _xmldom.DOMParser)().parseFromString(_chunkHXUCZ2ILcjs.stripDtd.call(void 0, text), "text/xml");
|
|
5657
2856
|
}
|
|
5658
2857
|
function parseStyles(xml) {
|
|
5659
2858
|
const doc = parseXml2(xml);
|
|
@@ -5663,9 +2862,9 @@ function parseStyles(xml) {
|
|
|
5663
2862
|
const styleId = getAttr(el, "styleId");
|
|
5664
2863
|
if (!styleId) continue;
|
|
5665
2864
|
const nameEls = getChildElements(el, "name");
|
|
5666
|
-
const name = nameEls.length > 0 ? getAttr(nameEls[0], "val")
|
|
2865
|
+
const name = nameEls.length > 0 ? _nullishCoalesce(getAttr(nameEls[0], "val"), () => ( "")) : "";
|
|
5667
2866
|
const basedOnEls = getChildElements(el, "basedOn");
|
|
5668
|
-
const basedOn = basedOnEls.length > 0 ? getAttr(basedOnEls[0], "val")
|
|
2867
|
+
const basedOn = basedOnEls.length > 0 ? _nullishCoalesce(getAttr(basedOnEls[0], "val"), () => ( void 0)) : void 0;
|
|
5669
2868
|
const pPrEls = getChildElements(el, "pPr");
|
|
5670
2869
|
let outlineLevel;
|
|
5671
2870
|
if (pPrEls.length > 0) {
|
|
@@ -5693,9 +2892,9 @@ function parseNumbering(xml) {
|
|
|
5693
2892
|
const levels = /* @__PURE__ */ new Map();
|
|
5694
2893
|
const lvlElements = getChildElements(el, "lvl");
|
|
5695
2894
|
for (const lvl of lvlElements) {
|
|
5696
|
-
const ilvl = parseInt(getAttr(lvl, "ilvl")
|
|
2895
|
+
const ilvl = parseInt(_nullishCoalesce(getAttr(lvl, "ilvl"), () => ( "0")), 10);
|
|
5697
2896
|
const numFmtEls = getChildElements(lvl, "numFmt");
|
|
5698
|
-
const numFmt = numFmtEls.length > 0 ? getAttr(numFmtEls[0], "val")
|
|
2897
|
+
const numFmt = numFmtEls.length > 0 ? _nullishCoalesce(getAttr(numFmtEls[0], "val"), () => ( "bullet")) : "bullet";
|
|
5699
2898
|
levels.set(ilvl, { numFmt, level: ilvl });
|
|
5700
2899
|
}
|
|
5701
2900
|
abstractNums.set(abstractNumId, levels);
|
|
@@ -5739,7 +2938,7 @@ function parseFootnotes(xml) {
|
|
|
5739
2938
|
const runs = findElements(p, "r");
|
|
5740
2939
|
for (const r of runs) {
|
|
5741
2940
|
const tElements = getChildElements(r, "t");
|
|
5742
|
-
for (const t of tElements) texts.push(t.textContent
|
|
2941
|
+
for (const t of tElements) texts.push(_nullishCoalesce(t.textContent, () => ( "")));
|
|
5743
2942
|
}
|
|
5744
2943
|
}
|
|
5745
2944
|
notes.set(id, texts.join("").trim());
|
|
@@ -5748,7 +2947,7 @@ function parseFootnotes(xml) {
|
|
|
5748
2947
|
}
|
|
5749
2948
|
function extractRun(r) {
|
|
5750
2949
|
const tElements = getChildElements(r, "t");
|
|
5751
|
-
const text = tElements.map((t) => t.textContent
|
|
2950
|
+
const text = tElements.map((t) => _nullishCoalesce(t.textContent, () => ( ""))).join("");
|
|
5752
2951
|
let bold = false;
|
|
5753
2952
|
let italic = false;
|
|
5754
2953
|
const rPrEls = getChildElements(r, "rPr");
|
|
@@ -5765,13 +2964,13 @@ function parseParagraph(p, styles, numbering, footnotes, rels) {
|
|
|
5765
2964
|
let ilvl = 0;
|
|
5766
2965
|
if (pPrEls.length > 0) {
|
|
5767
2966
|
const pStyleEls = getChildElements(pPrEls[0], "pStyle");
|
|
5768
|
-
if (pStyleEls.length > 0) styleId = getAttr(pStyleEls[0], "val")
|
|
2967
|
+
if (pStyleEls.length > 0) styleId = _nullishCoalesce(getAttr(pStyleEls[0], "val"), () => ( ""));
|
|
5769
2968
|
const numPrEls = getChildElements(pPrEls[0], "numPr");
|
|
5770
2969
|
if (numPrEls.length > 0) {
|
|
5771
2970
|
const numIdEls = getChildElements(numPrEls[0], "numId");
|
|
5772
2971
|
const ilvlEls = getChildElements(numPrEls[0], "ilvl");
|
|
5773
|
-
numId = numIdEls.length > 0 ? getAttr(numIdEls[0], "val")
|
|
5774
|
-
ilvl = ilvlEls.length > 0 ? parseInt(getAttr(ilvlEls[0], "val")
|
|
2972
|
+
numId = numIdEls.length > 0 ? _nullishCoalesce(getAttr(numIdEls[0], "val"), () => ( "")) : "";
|
|
2973
|
+
ilvl = ilvlEls.length > 0 ? parseInt(_nullishCoalesce(getAttr(ilvlEls[0], "val"), () => ( "0")), 10) : 0;
|
|
5775
2974
|
}
|
|
5776
2975
|
}
|
|
5777
2976
|
const parts = [];
|
|
@@ -5818,7 +3017,7 @@ function parseParagraph(p, styles, numbering, footnotes, rels) {
|
|
|
5818
3017
|
const text = parts.join("").trim();
|
|
5819
3018
|
if (!text) return null;
|
|
5820
3019
|
const style = styles.get(styleId);
|
|
5821
|
-
if (style
|
|
3020
|
+
if (_optionalChain([style, 'optionalAccess', _51 => _51.outlineLevel]) !== void 0 && style.outlineLevel >= 0 && style.outlineLevel <= 5) {
|
|
5822
3021
|
return {
|
|
5823
3022
|
type: "heading",
|
|
5824
3023
|
text,
|
|
@@ -5827,8 +3026,8 @@ function parseParagraph(p, styles, numbering, footnotes, rels) {
|
|
|
5827
3026
|
}
|
|
5828
3027
|
if (numId && numId !== "0") {
|
|
5829
3028
|
const numDef = numbering.get(numId);
|
|
5830
|
-
const levelInfo = numDef
|
|
5831
|
-
const listType = levelInfo
|
|
3029
|
+
const levelInfo = _optionalChain([numDef, 'optionalAccess', _52 => _52.get, 'call', _53 => _53(ilvl)]);
|
|
3030
|
+
const listType = _optionalChain([levelInfo, 'optionalAccess', _54 => _54.numFmt]) === "bullet" ? "unordered" : "ordered";
|
|
5832
3031
|
return { type: "list", text, listType };
|
|
5833
3032
|
}
|
|
5834
3033
|
const block = { type: "paragraph", text };
|
|
@@ -5854,7 +3053,7 @@ function parseTable(tbl, styles, numbering, footnotes, rels) {
|
|
|
5854
3053
|
if (tcPrEls.length > 0) {
|
|
5855
3054
|
const gridSpanEls = getChildElements(tcPrEls[0], "gridSpan");
|
|
5856
3055
|
if (gridSpanEls.length > 0) {
|
|
5857
|
-
colSpan = parseInt(getAttr(gridSpanEls[0], "val")
|
|
3056
|
+
colSpan = parseInt(_nullishCoalesce(getAttr(gridSpanEls[0], "val"), () => ( "1")), 10);
|
|
5858
3057
|
}
|
|
5859
3058
|
const vMergeEls = getChildElements(tcPrEls[0], "vMerge");
|
|
5860
3059
|
if (vMergeEls.length > 0) {
|
|
@@ -5869,7 +3068,7 @@ function parseTable(tbl, styles, numbering, footnotes, rels) {
|
|
|
5869
3068
|
const pElements = getChildElements(tc, "p");
|
|
5870
3069
|
for (const p of pElements) {
|
|
5871
3070
|
const block = parseParagraph(p, styles, numbering, footnotes, rels);
|
|
5872
|
-
if (block
|
|
3071
|
+
if (_optionalChain([block, 'optionalAccess', _55 => _55.text])) cellTexts.push(block.text);
|
|
5873
3072
|
}
|
|
5874
3073
|
row.push({ text: cellTexts.join("\n"), colSpan, rowSpan });
|
|
5875
3074
|
}
|
|
@@ -5882,7 +3081,7 @@ function parseTable(tbl, styles, numbering, footnotes, rels) {
|
|
|
5882
3081
|
if (!cell || cell.rowSpan === 0) continue;
|
|
5883
3082
|
let span = 1;
|
|
5884
3083
|
for (let nr = r + 1; nr < rows.length; nr++) {
|
|
5885
|
-
if (rows[nr][c]
|
|
3084
|
+
if (_optionalChain([rows, 'access', _56 => _56[nr], 'access', _57 => _57[c], 'optionalAccess', _58 => _58.rowSpan]) === 0) span++;
|
|
5886
3085
|
else break;
|
|
5887
3086
|
}
|
|
5888
3087
|
cell.rowSpan = span;
|
|
@@ -5926,7 +3125,7 @@ async function extractImages(zip, rels, doc) {
|
|
|
5926
3125
|
try {
|
|
5927
3126
|
const data = await imgFile.async("uint8array");
|
|
5928
3127
|
imgIdx++;
|
|
5929
|
-
const ext = imgPath.split(".").pop()
|
|
3128
|
+
const ext = _nullishCoalesce(_optionalChain([imgPath, 'access', _59 => _59.split, 'call', _60 => _60("."), 'access', _61 => _61.pop, 'call', _62 => _62(), 'optionalAccess', _63 => _63.toLowerCase, 'call', _64 => _64()]), () => ( "png"));
|
|
5930
3129
|
const mimeMap = {
|
|
5931
3130
|
png: "image/png",
|
|
5932
3131
|
jpg: "image/jpeg",
|
|
@@ -5937,21 +3136,21 @@ async function extractImages(zip, rels, doc) {
|
|
|
5937
3136
|
emf: "image/emf"
|
|
5938
3137
|
};
|
|
5939
3138
|
const filename = `image_${String(imgIdx).padStart(3, "0")}.${ext}`;
|
|
5940
|
-
images.push({ filename, data, mimeType: mimeMap[ext]
|
|
3139
|
+
images.push({ filename, data, mimeType: _nullishCoalesce(mimeMap[ext], () => ( "image/png")) });
|
|
5941
3140
|
blocks.push({ type: "image", text: filename });
|
|
5942
|
-
} catch {
|
|
3141
|
+
} catch (e21) {
|
|
5943
3142
|
}
|
|
5944
3143
|
}
|
|
5945
3144
|
}
|
|
5946
3145
|
return { blocks, images };
|
|
5947
3146
|
}
|
|
5948
3147
|
async function parseDocxDocument(buffer, options) {
|
|
5949
|
-
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
|
|
5950
|
-
const zip = await
|
|
3148
|
+
_chunkHXUCZ2ILcjs.precheckZipSize.call(void 0, buffer, MAX_DECOMPRESS_SIZE4);
|
|
3149
|
+
const zip = await _jszip2.default.loadAsync(buffer);
|
|
5951
3150
|
const warnings = [];
|
|
5952
3151
|
const docFile = zip.file("word/document.xml");
|
|
5953
3152
|
if (!docFile) {
|
|
5954
|
-
throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
3153
|
+
throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
5955
3154
|
}
|
|
5956
3155
|
let rels = /* @__PURE__ */ new Map();
|
|
5957
3156
|
const relsFile = zip.file("word/_rels/document.xml.rels");
|
|
@@ -5963,7 +3162,7 @@ async function parseDocxDocument(buffer, options) {
|
|
|
5963
3162
|
if (stylesFile) {
|
|
5964
3163
|
try {
|
|
5965
3164
|
styles = parseStyles(await stylesFile.async("text"));
|
|
5966
|
-
} catch {
|
|
3165
|
+
} catch (e22) {
|
|
5967
3166
|
}
|
|
5968
3167
|
}
|
|
5969
3168
|
let numbering = /* @__PURE__ */ new Map();
|
|
@@ -5971,7 +3170,7 @@ async function parseDocxDocument(buffer, options) {
|
|
|
5971
3170
|
if (numFile) {
|
|
5972
3171
|
try {
|
|
5973
3172
|
numbering = parseNumbering(await numFile.async("text"));
|
|
5974
|
-
} catch {
|
|
3173
|
+
} catch (e23) {
|
|
5975
3174
|
}
|
|
5976
3175
|
}
|
|
5977
3176
|
let footnotes = /* @__PURE__ */ new Map();
|
|
@@ -5979,14 +3178,14 @@ async function parseDocxDocument(buffer, options) {
|
|
|
5979
3178
|
if (fnFile) {
|
|
5980
3179
|
try {
|
|
5981
3180
|
footnotes = parseFootnotes(await fnFile.async("text"));
|
|
5982
|
-
} catch {
|
|
3181
|
+
} catch (e24) {
|
|
5983
3182
|
}
|
|
5984
3183
|
}
|
|
5985
3184
|
const docXml = await docFile.async("text");
|
|
5986
3185
|
const doc = parseXml2(docXml);
|
|
5987
3186
|
const body = findElements(doc, "body");
|
|
5988
3187
|
if (body.length === 0) {
|
|
5989
|
-
throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
3188
|
+
throw new (0, _chunkHXUCZ2ILcjs.KordocError)("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
5990
3189
|
}
|
|
5991
3190
|
const blocks = [];
|
|
5992
3191
|
const bodyEl = body[0];
|
|
@@ -5995,7 +3194,7 @@ async function parseDocxDocument(buffer, options) {
|
|
|
5995
3194
|
const node = children[i];
|
|
5996
3195
|
if (node.nodeType !== 1) continue;
|
|
5997
3196
|
const el = node;
|
|
5998
|
-
const localName2 = el.localName
|
|
3197
|
+
const localName2 = _nullishCoalesce(el.localName, () => ( _optionalChain([el, 'access', _65 => _65.tagName, 'optionalAccess', _66 => _66.split, 'call', _67 => _67(":"), 'access', _68 => _68.pop, 'call', _69 => _69()])));
|
|
5999
3198
|
if (localName2 === "p") {
|
|
6000
3199
|
const block = parseParagraph(el, styles, numbering, footnotes, rels);
|
|
6001
3200
|
if (block) blocks.push(block);
|
|
@@ -6013,7 +3212,7 @@ async function parseDocxDocument(buffer, options) {
|
|
|
6013
3212
|
const coreDoc = parseXml2(coreXml);
|
|
6014
3213
|
const getFirst = (tag) => {
|
|
6015
3214
|
const els = coreDoc.getElementsByTagName(tag);
|
|
6016
|
-
return els.length > 0 ? (els[0].textContent
|
|
3215
|
+
return els.length > 0 ? (_nullishCoalesce(els[0].textContent, () => ( ""))).trim() : void 0;
|
|
6017
3216
|
};
|
|
6018
3217
|
metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
|
|
6019
3218
|
metadata.author = getFirst("dc:creator");
|
|
@@ -6022,11 +3221,11 @@ async function parseDocxDocument(buffer, options) {
|
|
|
6022
3221
|
if (created) metadata.createdAt = created;
|
|
6023
3222
|
const modified = getFirst("dcterms:modified");
|
|
6024
3223
|
if (modified) metadata.modifiedAt = modified;
|
|
6025
|
-
} catch {
|
|
3224
|
+
} catch (e25) {
|
|
6026
3225
|
}
|
|
6027
3226
|
}
|
|
6028
|
-
const outline = blocks.filter((b) => b.type === "heading").map((b) => ({ level: b.level
|
|
6029
|
-
const markdown = blocksToMarkdown(blocks);
|
|
3227
|
+
const outline = blocks.filter((b) => b.type === "heading").map((b) => ({ level: _nullishCoalesce(b.level, () => ( 2)), text: _nullishCoalesce(b.text, () => ( "")) }));
|
|
3228
|
+
const markdown = _chunkHXUCZ2ILcjs.blocksToMarkdown.call(void 0, blocks);
|
|
6030
3229
|
return {
|
|
6031
3230
|
markdown,
|
|
6032
3231
|
blocks,
|
|
@@ -6270,7 +3469,7 @@ function fillFormFields(blocks, values) {
|
|
|
6270
3469
|
if (block.type !== "table" || !block.table) continue;
|
|
6271
3470
|
for (let r = 0; r < block.table.rows; r++) {
|
|
6272
3471
|
for (let c = 0; c < block.table.cols; c++) {
|
|
6273
|
-
const cell = block.table.cells[r]
|
|
3472
|
+
const cell = _optionalChain([block, 'access', _70 => _70.table, 'access', _71 => _71.cells, 'access', _72 => _72[r], 'optionalAccess', _73 => _73[c]]);
|
|
6274
3473
|
if (!cell) continue;
|
|
6275
3474
|
const result = fillInCellPatterns(cell.text, normalizedValues, matchedLabels);
|
|
6276
3475
|
if (result) {
|
|
@@ -6309,7 +3508,7 @@ function fillTable(table, values, filled, matchedLabels, patternFilledCells) {
|
|
|
6309
3508
|
const matchKey = findMatchingKey(normalizedCellLabel, values);
|
|
6310
3509
|
if (matchKey === void 0) continue;
|
|
6311
3510
|
const newValue = values.get(matchKey);
|
|
6312
|
-
if (patternFilledCells
|
|
3511
|
+
if (_optionalChain([patternFilledCells, 'optionalAccess', _74 => _74.has, 'call', _75 => _75(`${r},${c + 1}`)])) {
|
|
6313
3512
|
valueCell.text = newValue + " " + valueCell.text;
|
|
6314
3513
|
} else {
|
|
6315
3514
|
valueCell.text = newValue;
|
|
@@ -6370,24 +3569,24 @@ function fillInlineFields(text, values, filled, matchedLabels) {
|
|
|
6370
3569
|
}
|
|
6371
3570
|
|
|
6372
3571
|
// src/form/filler-hwpx.ts
|
|
6373
|
-
|
|
6374
|
-
|
|
3572
|
+
|
|
3573
|
+
|
|
6375
3574
|
async function fillHwpx(hwpxBuffer, values) {
|
|
6376
|
-
const zip = await
|
|
3575
|
+
const zip = await _jszip2.default.loadAsync(hwpxBuffer);
|
|
6377
3576
|
const filled = [];
|
|
6378
3577
|
const matchedLabels = /* @__PURE__ */ new Set();
|
|
6379
3578
|
const normalizedValues = normalizeValues(values);
|
|
6380
3579
|
const sectionFiles = Object.keys(zip.files).filter((name) => /[Ss]ection\d+\.xml$/i.test(name)).sort();
|
|
6381
3580
|
if (sectionFiles.length === 0) {
|
|
6382
|
-
throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
3581
|
+
throw new (0, _chunkHXUCZ2ILcjs.KordocError)("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
6383
3582
|
}
|
|
6384
|
-
const xmlParser = new
|
|
6385
|
-
const xmlSerializer = new
|
|
3583
|
+
const xmlParser = new (0, _xmldom.DOMParser)();
|
|
3584
|
+
const xmlSerializer = new (0, _xmldom.XMLSerializer)();
|
|
6386
3585
|
for (const sectionPath of sectionFiles) {
|
|
6387
3586
|
const zipEntry = zip.file(sectionPath);
|
|
6388
3587
|
if (!zipEntry) continue;
|
|
6389
3588
|
const rawXml = await zipEntry.async("text");
|
|
6390
|
-
const doc = xmlParser.parseFromString(stripDtd(rawXml), "text/xml");
|
|
3589
|
+
const doc = xmlParser.parseFromString(_chunkHXUCZ2ILcjs.stripDtd.call(void 0, rawXml), "text/xml");
|
|
6391
3590
|
if (!doc.documentElement) continue;
|
|
6392
3591
|
let modified = false;
|
|
6393
3592
|
const tables = findAllElements(doc.documentElement, "tbl");
|
|
@@ -6678,7 +3877,7 @@ function applyTextReplacements(tNodes, originalFull, replacedFull) {
|
|
|
6678
3877
|
}
|
|
6679
3878
|
|
|
6680
3879
|
// src/hwpx/generator.ts
|
|
6681
|
-
|
|
3880
|
+
|
|
6682
3881
|
var NS_SECTION = "http://www.hancom.co.kr/hwpml/2011/section";
|
|
6683
3882
|
var NS_PARA = "http://www.hancom.co.kr/hwpml/2011/paragraph";
|
|
6684
3883
|
var NS_HEAD = "http://www.hancom.co.kr/hwpml/2011/head";
|
|
@@ -6705,7 +3904,7 @@ var PARA_LIST = 7;
|
|
|
6705
3904
|
async function markdownToHwpx(markdown) {
|
|
6706
3905
|
const blocks = parseMarkdownToBlocks(markdown);
|
|
6707
3906
|
const sectionXml = blocksToSectionXml(blocks);
|
|
6708
|
-
const zip = new
|
|
3907
|
+
const zip = new (0, _jszip2.default)();
|
|
6709
3908
|
zip.file("mimetype", "application/hwp+zip", { compression: "STORE" });
|
|
6710
3909
|
zip.file("META-INF/container.xml", generateContainerXml());
|
|
6711
3910
|
zip.file("Contents/content.hpf", generateManifest());
|
|
@@ -7247,14 +4446,14 @@ async function parse(input, options) {
|
|
|
7247
4446
|
let buffer;
|
|
7248
4447
|
if (typeof input === "string") {
|
|
7249
4448
|
try {
|
|
7250
|
-
const buf = await (0,
|
|
7251
|
-
buffer = toArrayBuffer(buf);
|
|
4449
|
+
const buf = await _promises.readFile.call(void 0, input);
|
|
4450
|
+
buffer = _chunkHXUCZ2ILcjs.toArrayBuffer.call(void 0, buf);
|
|
7252
4451
|
} catch (err) {
|
|
7253
4452
|
const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
|
|
7254
4453
|
return { success: false, fileType: "unknown", error: msg, code: "PARSE_ERROR" };
|
|
7255
4454
|
}
|
|
7256
4455
|
} else if (Buffer.isBuffer(input)) {
|
|
7257
|
-
buffer = toArrayBuffer(input);
|
|
4456
|
+
buffer = _chunkHXUCZ2ILcjs.toArrayBuffer.call(void 0, input);
|
|
7258
4457
|
} else {
|
|
7259
4458
|
buffer = input;
|
|
7260
4459
|
}
|
|
@@ -7280,26 +4479,38 @@ async function parse(input, options) {
|
|
|
7280
4479
|
async function parseHwpx(buffer, options) {
|
|
7281
4480
|
try {
|
|
7282
4481
|
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
|
|
7283
|
-
return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images
|
|
4482
|
+
return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: _optionalChain([images, 'optionalAccess', _76 => _76.length]) ? images : void 0 };
|
|
7284
4483
|
} catch (err) {
|
|
7285
|
-
return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
4484
|
+
return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: _chunkHXUCZ2ILcjs.classifyError.call(void 0, err) };
|
|
7286
4485
|
}
|
|
7287
4486
|
}
|
|
7288
4487
|
async function parseHwp(buffer, options) {
|
|
7289
4488
|
try {
|
|
7290
4489
|
const { markdown, blocks, metadata, outline, warnings, images } = parseHwp5Document(Buffer.from(buffer), options);
|
|
7291
|
-
return { success: true, fileType: "hwp", markdown, blocks, metadata, outline, warnings, images: images
|
|
4490
|
+
return { success: true, fileType: "hwp", markdown, blocks, metadata, outline, warnings, images: _optionalChain([images, 'optionalAccess', _77 => _77.length]) ? images : void 0 };
|
|
7292
4491
|
} catch (err) {
|
|
7293
|
-
return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
4492
|
+
return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code: _chunkHXUCZ2ILcjs.classifyError.call(void 0, err) };
|
|
7294
4493
|
}
|
|
7295
4494
|
}
|
|
7296
4495
|
async function parsePdf(buffer, options) {
|
|
4496
|
+
let parsePdfDocument;
|
|
4497
|
+
try {
|
|
4498
|
+
const mod = await Promise.resolve().then(() => _interopRequireWildcard(require("./parser-KOWPTDJU.cjs")));
|
|
4499
|
+
parsePdfDocument = mod.parsePdfDocument;
|
|
4500
|
+
} catch (e26) {
|
|
4501
|
+
return {
|
|
4502
|
+
success: false,
|
|
4503
|
+
fileType: "pdf",
|
|
4504
|
+
error: "PDF \uD30C\uC2F1\uC5D0 pdfjs-dist\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4. \uC124\uCE58: npm install pdfjs-dist",
|
|
4505
|
+
code: "MISSING_DEPENDENCY"
|
|
4506
|
+
};
|
|
4507
|
+
}
|
|
7297
4508
|
try {
|
|
7298
4509
|
const { markdown, blocks, metadata, outline, warnings, isImageBased } = await parsePdfDocument(buffer, options);
|
|
7299
4510
|
return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased };
|
|
7300
4511
|
} catch (err) {
|
|
7301
4512
|
const isImageBased = err instanceof Error && "isImageBased" in err ? true : void 0;
|
|
7302
|
-
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
|
|
4513
|
+
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: _chunkHXUCZ2ILcjs.classifyError.call(void 0, err), isImageBased };
|
|
7303
4514
|
}
|
|
7304
4515
|
}
|
|
7305
4516
|
async function parseXlsx(buffer, options) {
|
|
@@ -7307,24 +4518,24 @@ async function parseXlsx(buffer, options) {
|
|
|
7307
4518
|
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
|
|
7308
4519
|
return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
|
|
7309
4520
|
} catch (err) {
|
|
7310
|
-
return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
4521
|
+
return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: _chunkHXUCZ2ILcjs.classifyError.call(void 0, err) };
|
|
7311
4522
|
}
|
|
7312
4523
|
}
|
|
7313
4524
|
async function parseDocx(buffer, options) {
|
|
7314
4525
|
try {
|
|
7315
4526
|
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
|
|
7316
|
-
return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images
|
|
4527
|
+
return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: _optionalChain([images, 'optionalAccess', _78 => _78.length]) ? images : void 0 };
|
|
7317
4528
|
} catch (err) {
|
|
7318
|
-
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
4529
|
+
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: _chunkHXUCZ2ILcjs.classifyError.call(void 0, err) };
|
|
7319
4530
|
}
|
|
7320
4531
|
}
|
|
7321
4532
|
async function fillForm(input, values, outputFormat = "markdown") {
|
|
7322
4533
|
let buffer;
|
|
7323
4534
|
if (typeof input === "string") {
|
|
7324
|
-
const buf = await (0,
|
|
7325
|
-
buffer = toArrayBuffer(buf);
|
|
4535
|
+
const buf = await _promises.readFile.call(void 0, input);
|
|
4536
|
+
buffer = _chunkHXUCZ2ILcjs.toArrayBuffer.call(void 0, buf);
|
|
7326
4537
|
} else if (Buffer.isBuffer(input)) {
|
|
7327
|
-
buffer = toArrayBuffer(input);
|
|
4538
|
+
buffer = _chunkHXUCZ2ILcjs.toArrayBuffer.call(void 0, input);
|
|
7328
4539
|
} else {
|
|
7329
4540
|
buffer = input;
|
|
7330
4541
|
}
|
|
@@ -7350,36 +4561,35 @@ async function fillForm(input, values, outputFormat = "markdown") {
|
|
|
7350
4561
|
throw new Error(`\uC11C\uC2DD \uD30C\uC2F1 \uC2E4\uD328: ${parsed.error}`);
|
|
7351
4562
|
}
|
|
7352
4563
|
const fill = fillFormFields(parsed.blocks, values);
|
|
7353
|
-
const markdown = blocksToMarkdown(fill.blocks);
|
|
4564
|
+
const markdown = _chunkHXUCZ2ILcjs.blocksToMarkdown.call(void 0, fill.blocks);
|
|
7354
4565
|
if (outputFormat === "hwpx") {
|
|
7355
4566
|
const hwpxBuffer = await markdownToHwpx(markdown);
|
|
7356
4567
|
return { output: hwpxBuffer, format: "hwpx", fill };
|
|
7357
4568
|
}
|
|
7358
4569
|
return { output: markdown, format: "markdown", fill };
|
|
7359
4570
|
}
|
|
7360
|
-
|
|
7361
|
-
|
|
7362
|
-
|
|
7363
|
-
|
|
7364
|
-
|
|
7365
|
-
|
|
7366
|
-
|
|
7367
|
-
|
|
7368
|
-
|
|
7369
|
-
|
|
7370
|
-
|
|
7371
|
-
|
|
7372
|
-
|
|
7373
|
-
|
|
7374
|
-
|
|
7375
|
-
|
|
7376
|
-
|
|
7377
|
-
|
|
7378
|
-
|
|
7379
|
-
|
|
7380
|
-
|
|
7381
|
-
|
|
7382
|
-
|
|
7383
|
-
|
|
7384
|
-
});
|
|
4571
|
+
|
|
4572
|
+
|
|
4573
|
+
|
|
4574
|
+
|
|
4575
|
+
|
|
4576
|
+
|
|
4577
|
+
|
|
4578
|
+
|
|
4579
|
+
|
|
4580
|
+
|
|
4581
|
+
|
|
4582
|
+
|
|
4583
|
+
|
|
4584
|
+
|
|
4585
|
+
|
|
4586
|
+
|
|
4587
|
+
|
|
4588
|
+
|
|
4589
|
+
|
|
4590
|
+
|
|
4591
|
+
|
|
4592
|
+
|
|
4593
|
+
|
|
4594
|
+
exports.VERSION = _chunkHXUCZ2ILcjs.VERSION; exports.blocksToMarkdown = _chunkHXUCZ2ILcjs.blocksToMarkdown; exports.compare = compare; exports.detectFormat = detectFormat; exports.detectZipFormat = detectZipFormat; exports.diffBlocks = diffBlocks; exports.extractFormFields = extractFormFields; exports.fillForm = fillForm; exports.fillFormFields = fillFormFields; exports.fillHwpx = fillHwpx; exports.isHwpxFile = isHwpxFile; exports.isLabelCell = isLabelCell; exports.isOldHwpFile = isOldHwpFile; exports.isPdfFile = isPdfFile; exports.isZipFile = isZipFile; exports.markdownToHwpx = markdownToHwpx; exports.parse = parse; exports.parseDocx = parseDocx; exports.parseHwp = parseHwp; exports.parseHwpx = parseHwpx; exports.parsePdf = parsePdf; exports.parseXlsx = parseXlsx;
|
|
7385
4595
|
//# sourceMappingURL=index.cjs.map
|