kordoc 2.2.4 → 2.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/README.md +43 -4
  2. package/dist/chunk-FCQEF2ZM.js +457 -0
  3. package/dist/chunk-FCQEF2ZM.js.map +1 -0
  4. package/dist/chunk-HXUCZ2IL.cjs +450 -0
  5. package/dist/chunk-HXUCZ2IL.cjs.map +1 -0
  6. package/dist/chunk-MUOQXDZ4.cjs +33 -0
  7. package/dist/chunk-MUOQXDZ4.cjs.map +1 -0
  8. package/dist/chunk-NL5XLN5R.js +450 -0
  9. package/dist/chunk-NL5XLN5R.js.map +1 -0
  10. package/dist/{chunk-SY2RFVLW.js → chunk-RF6UJXR3.js} +135 -2805
  11. package/dist/chunk-RF6UJXR3.js.map +1 -0
  12. package/dist/chunk-SBVRCJFH.js +33 -0
  13. package/dist/chunk-SBVRCJFH.js.map +1 -0
  14. package/dist/cli.js +12 -7
  15. package/dist/cli.js.map +1 -1
  16. package/dist/index.cjs +294 -3084
  17. package/dist/index.cjs.map +1 -1
  18. package/dist/index.d.cts +1 -1
  19. package/dist/index.d.ts +1 -1
  20. package/dist/index.js +77 -2817
  21. package/dist/index.js.map +1 -1
  22. package/dist/mcp.js +15 -9
  23. package/dist/mcp.js.map +1 -1
  24. package/dist/page-range-3C7UGGEK.cjs +7 -0
  25. package/dist/page-range-3C7UGGEK.cjs.map +1 -0
  26. package/dist/page-range-H35FN3OQ.js +7 -0
  27. package/dist/page-range-H35FN3OQ.js.map +1 -0
  28. package/dist/parser-43IAQ5KE.js +2278 -0
  29. package/dist/parser-43IAQ5KE.js.map +1 -0
  30. package/dist/parser-AMP7MAOH.js +2279 -0
  31. package/dist/parser-AMP7MAOH.js.map +1 -0
  32. package/dist/parser-KOWPTDJU.cjs +2278 -0
  33. package/dist/parser-KOWPTDJU.cjs.map +1 -0
  34. package/dist/provider-WPIYEALY.js +37 -0
  35. package/dist/provider-WPIYEALY.js.map +1 -0
  36. package/dist/provider-YN2SSK4X.cjs +37 -0
  37. package/dist/provider-YN2SSK4X.cjs.map +1 -0
  38. package/dist/{watch-5P7DJ3HG.js → watch-IUQXOXW3.js} +6 -4
  39. package/dist/{watch-5P7DJ3HG.js.map → watch-IUQXOXW3.js.map} +1 -1
  40. package/package.json +1 -1
  41. package/dist/chunk-SY2RFVLW.js.map +0 -1
package/dist/index.cjs CHANGED
@@ -1,147 +1,31 @@
1
- "use strict";
2
- var __create = Object.create;
3
- var __defProp = Object.defineProperty;
4
- var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
- var __getOwnPropNames = Object.getOwnPropertyNames;
6
- var __getProtoOf = Object.getPrototypeOf;
7
- var __hasOwnProp = Object.prototype.hasOwnProperty;
8
- var __esm = (fn, res) => function __init() {
9
- return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
10
- };
11
- var __export = (target, all) => {
12
- for (var name in all)
13
- __defProp(target, name, { get: all[name], enumerable: true });
14
- };
15
- var __copyProps = (to, from, except, desc) => {
16
- if (from && typeof from === "object" || typeof from === "function") {
17
- for (let key of __getOwnPropNames(from))
18
- if (!__hasOwnProp.call(to, key) && key !== except)
19
- __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
20
- }
21
- return to;
22
- };
23
- var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
24
- // If the importer is in node compatibility mode or this is not an ESM
25
- // file that has been converted to a CommonJS file using a Babel-
26
- // compatible transform (i.e. "__esModule" has not been set), then set
27
- // "default" to the CommonJS "module.exports" for node compatibility.
28
- isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
29
- mod
30
- ));
31
- var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
1
+ "use strict";Object.defineProperty(exports, "__esModule", {value: true}); function _interopRequireWildcard(obj) { if (obj && obj.__esModule) { return obj; } else { var newObj = {}; if (obj != null) { for (var key in obj) { if (Object.prototype.hasOwnProperty.call(obj, key)) { newObj[key] = obj[key]; } } } newObj.default = obj; return newObj; } } function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { default: obj }; } function _nullishCoalesce(lhs, rhsFn) { if (lhs != null) { return lhs; } else { return rhsFn(); } } function _optionalChain(ops) { let lastAccessLHS = undefined; let value = ops[0]; let i = 1; while (i < ops.length) { const op = ops[i]; const fn = ops[i + 1]; i += 2; if ((op === 'optionalAccess' || op === 'optionalCall') && value == null) { return undefined; } if (op === 'access' || op === 'optionalAccess') { lastAccessLHS = value; value = fn(value); } else if (op === 'call' || op === 'optionalCall') { value = fn((...args) => value.call(lastAccessLHS, ...args)); lastAccessLHS = undefined; } } return value; }
2
+
3
+
4
+
5
+
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
32
16
 
33
- // src/page-range.ts
34
- var page_range_exports = {};
35
- __export(page_range_exports, {
36
- parsePageRange: () => parsePageRange
37
- });
38
- function parsePageRange(spec, maxPages) {
39
- const result = /* @__PURE__ */ new Set();
40
- if (maxPages <= 0) return result;
41
- if (Array.isArray(spec)) {
42
- for (const n of spec) {
43
- const page = Math.round(n);
44
- if (page >= 1 && page <= maxPages) result.add(page);
45
- }
46
- return result;
47
- }
48
- if (typeof spec !== "string" || spec.trim() === "") return result;
49
- const parts = spec.split(",");
50
- for (const part of parts) {
51
- const trimmed = part.trim();
52
- if (!trimmed) continue;
53
- const rangeMatch = trimmed.match(/^(\d+)\s*-\s*(\d+)$/);
54
- if (rangeMatch) {
55
- const start = Math.max(1, parseInt(rangeMatch[1], 10));
56
- const end = Math.min(maxPages, parseInt(rangeMatch[2], 10));
57
- for (let i = start; i <= end; i++) result.add(i);
58
- } else {
59
- const page = parseInt(trimmed, 10);
60
- if (!isNaN(page) && page >= 1 && page <= maxPages) result.add(page);
61
- }
62
- }
63
- return result;
64
- }
65
- var init_page_range = __esm({
66
- "src/page-range.ts"() {
67
- "use strict";
68
- }
69
- });
70
17
 
71
- // src/ocr/provider.ts
72
- var provider_exports = {};
73
- __export(provider_exports, {
74
- ocrPages: () => ocrPages
75
- });
76
- async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
77
- const blocks = [];
78
- for (let i = 1; i <= effectivePageCount; i++) {
79
- if (pageFilter && !pageFilter.has(i)) continue;
80
- const page = await doc.getPage(i);
81
- try {
82
- const imageData = await renderPageToPng(page);
83
- const text = await provider(imageData, i, "image/png");
84
- if (text.trim()) {
85
- blocks.push({ type: "paragraph", text: text.trim(), pageNumber: i });
86
- }
87
- } catch {
88
- blocks.push({ type: "paragraph", text: `[OCR \uC2E4\uD328: \uD398\uC774\uC9C0 ${i}]` });
89
- }
90
- }
91
- return blocks;
92
- }
93
- async function renderPageToPng(page) {
94
- let createCanvas;
95
- try {
96
- const canvasModule = await import("canvas");
97
- createCanvas = canvasModule.createCanvas;
98
- } catch {
99
- throw new Error("OCR\uC744 \uC0AC\uC6A9\uD558\uB824\uBA74 'canvas' \uD328\uD0A4\uC9C0\uB97C \uC124\uCE58\uD558\uC138\uC694: npm install canvas");
100
- }
101
- const scale = 2;
102
- const viewport = page.getViewport({ scale });
103
- const canvas = createCanvas(Math.floor(viewport.width), Math.floor(viewport.height));
104
- const ctx = canvas.getContext("2d");
105
- await page.render({ canvasContext: ctx, viewport }).promise;
106
- return new Uint8Array(canvas.toBuffer("image/png"));
107
- }
108
- var init_provider = __esm({
109
- "src/ocr/provider.ts"() {
110
- "use strict";
111
- }
112
- });
18
+
19
+ var _chunkHXUCZ2ILcjs = require('./chunk-HXUCZ2IL.cjs');
20
+
21
+
22
+ var _chunkMUOQXDZ4cjs = require('./chunk-MUOQXDZ4.cjs');
113
23
 
114
24
  // src/index.ts
115
- var index_exports = {};
116
- __export(index_exports, {
117
- VERSION: () => VERSION,
118
- blocksToMarkdown: () => blocksToMarkdown,
119
- compare: () => compare,
120
- detectFormat: () => detectFormat,
121
- detectZipFormat: () => detectZipFormat,
122
- diffBlocks: () => diffBlocks,
123
- extractFormFields: () => extractFormFields,
124
- fillForm: () => fillForm,
125
- fillFormFields: () => fillFormFields,
126
- fillHwpx: () => fillHwpx,
127
- isHwpxFile: () => isHwpxFile,
128
- isLabelCell: () => isLabelCell,
129
- isOldHwpFile: () => isOldHwpFile,
130
- isPdfFile: () => isPdfFile,
131
- isZipFile: () => isZipFile,
132
- markdownToHwpx: () => markdownToHwpx,
133
- parse: () => parse,
134
- parseDocx: () => parseDocx,
135
- parseHwp: () => parseHwp,
136
- parseHwpx: () => parseHwpx,
137
- parsePdf: () => parsePdf,
138
- parseXlsx: () => parseXlsx
139
- });
140
- module.exports = __toCommonJS(index_exports);
141
- var import_promises = require("fs/promises");
25
+ var _promises = require('fs/promises');
142
26
 
143
27
  // src/detect.ts
144
- var import_jszip = __toESM(require("jszip"), 1);
28
+ var _jszip = require('jszip'); var _jszip2 = _interopRequireDefault(_jszip);
145
29
  function magicBytes(buffer) {
146
30
  return new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
147
31
  }
@@ -169,453 +53,22 @@ function detectFormat(buffer) {
169
53
  }
170
54
  async function detectZipFormat(buffer) {
171
55
  try {
172
- const zip = await import_jszip.default.loadAsync(buffer);
56
+ const zip = await _jszip2.default.loadAsync(buffer);
173
57
  if (zip.file("xl/workbook.xml")) return "xlsx";
174
58
  if (zip.file("word/document.xml")) return "docx";
175
59
  if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
176
60
  const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
177
61
  if (hasSection) return "hwpx";
178
62
  return "unknown";
179
- } catch {
63
+ } catch (e2) {
180
64
  return "unknown";
181
65
  }
182
66
  }
183
67
 
184
68
  // src/hwpx/parser.ts
185
- var import_jszip2 = __toESM(require("jszip"), 1);
186
- var import_zlib = require("zlib");
187
- var import_xmldom = require("@xmldom/xmldom");
188
-
189
- // src/utils.ts
190
- var VERSION = true ? "2.2.4" : "0.0.0-dev";
191
- function toArrayBuffer(buf) {
192
- if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
193
- return buf.buffer;
194
- }
195
- return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
196
- }
197
- var KordocError = class extends Error {
198
- constructor(message) {
199
- super(message);
200
- this.name = "KordocError";
201
- }
202
- };
203
- function isPathTraversal(name) {
204
- if (name.includes("\0")) return true;
205
- const normalized = name.replace(/\\/g, "/");
206
- const segments = normalized.split("/");
207
- return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
208
- }
209
- function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
210
- try {
211
- const data = new DataView(buffer);
212
- const len = buffer.byteLength;
213
- let eocdOffset = -1;
214
- for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
215
- if (data.getUint32(i, true) === 101010256) {
216
- eocdOffset = i;
217
- break;
218
- }
219
- }
220
- if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
221
- const entryCount = data.getUint16(eocdOffset + 10, true);
222
- if (entryCount > maxEntries) {
223
- throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
224
- }
225
- const cdSize = data.getUint32(eocdOffset + 12, true);
226
- const cdOffset = data.getUint32(eocdOffset + 16, true);
227
- if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
228
- let totalUncompressed = 0;
229
- let pos = cdOffset;
230
- for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
231
- if (data.getUint32(pos, true) !== 33639248) break;
232
- totalUncompressed += data.getUint32(pos + 24, true);
233
- const nameLen = data.getUint16(pos + 28, true);
234
- const extraLen = data.getUint16(pos + 30, true);
235
- const commentLen = data.getUint16(pos + 32, true);
236
- pos += 46 + nameLen + extraLen + commentLen;
237
- }
238
- if (totalUncompressed > maxUncompressedSize) {
239
- throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
240
- }
241
- return { totalUncompressed, entryCount };
242
- } catch (err) {
243
- if (err instanceof KordocError) throw err;
244
- return { totalUncompressed: 0, entryCount: 0 };
245
- }
246
- }
247
- function stripDtd(xml) {
248
- return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
249
- }
250
- var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
251
- function sanitizeHref(href) {
252
- const trimmed = href.trim();
253
- if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
254
- return trimmed;
255
- }
256
- function safeMin(arr) {
257
- let min = Infinity;
258
- for (let i = 0; i < arr.length; i++) if (arr[i] < min) min = arr[i];
259
- return min;
260
- }
261
- function safeMax(arr) {
262
- let max = -Infinity;
263
- for (let i = 0; i < arr.length; i++) if (arr[i] > max) max = arr[i];
264
- return max;
265
- }
266
- function classifyError(err) {
267
- if (!(err instanceof Error)) return "PARSE_ERROR";
268
- const msg = err.message;
269
- if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
270
- if (msg.includes("DRM")) return "DRM_PROTECTED";
271
- if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
272
- if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
273
- if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
274
- if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
275
- if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
276
- return "PARSE_ERROR";
277
- }
278
-
279
- // src/table/builder.ts
280
- var MAX_COLS = 200;
281
- var MAX_ROWS = 1e4;
282
- function buildTable(rows) {
283
- if (rows.length > MAX_ROWS) rows = rows.slice(0, MAX_ROWS);
284
- const numRows = rows.length;
285
- const hasAddr = rows.some((row) => row.some((c) => c.colAddr !== void 0 && c.rowAddr !== void 0));
286
- if (hasAddr) return buildTableDirect(rows, numRows);
287
- let maxCols = 0;
288
- const tempOccupied = Array.from({ length: numRows }, () => []);
289
- for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
290
- let colIdx = 0;
291
- for (const cell of rows[rowIdx]) {
292
- while (colIdx < MAX_COLS && tempOccupied[rowIdx][colIdx]) colIdx++;
293
- if (colIdx >= MAX_COLS) break;
294
- for (let r = rowIdx; r < Math.min(rowIdx + cell.rowSpan, numRows); r++) {
295
- for (let c = colIdx; c < Math.min(colIdx + cell.colSpan, MAX_COLS); c++) {
296
- tempOccupied[r][c] = true;
297
- }
298
- }
299
- colIdx += cell.colSpan;
300
- if (colIdx > maxCols) maxCols = colIdx;
301
- }
302
- }
303
- if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
304
- const grid = Array.from(
305
- { length: numRows },
306
- () => Array.from({ length: maxCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
307
- );
308
- const occupied = Array.from({ length: numRows }, () => Array(maxCols).fill(false));
309
- for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
310
- let colIdx = 0;
311
- let cellIdx = 0;
312
- while (colIdx < maxCols && cellIdx < rows[rowIdx].length) {
313
- while (colIdx < maxCols && occupied[rowIdx][colIdx]) colIdx++;
314
- if (colIdx >= maxCols) break;
315
- const cell = rows[rowIdx][cellIdx];
316
- grid[rowIdx][colIdx] = {
317
- text: cell.text.trim(),
318
- colSpan: cell.colSpan,
319
- rowSpan: cell.rowSpan
320
- };
321
- for (let r = rowIdx; r < Math.min(rowIdx + cell.rowSpan, numRows); r++) {
322
- for (let c = colIdx; c < Math.min(colIdx + cell.colSpan, maxCols); c++) {
323
- occupied[r][c] = true;
324
- }
325
- }
326
- colIdx += cell.colSpan;
327
- cellIdx++;
328
- }
329
- }
330
- return trimAndReturn(grid, numRows, maxCols);
331
- }
332
- function buildTableDirect(rows, numRows) {
333
- let maxCols = 0;
334
- for (const row of rows) {
335
- for (const cell of row) {
336
- const end = (cell.colAddr ?? 0) + cell.colSpan;
337
- if (end > maxCols) maxCols = end;
338
- }
339
- }
340
- if (maxCols > MAX_COLS) maxCols = MAX_COLS;
341
- if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
342
- const grid = Array.from(
343
- { length: numRows },
344
- () => Array.from({ length: maxCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
345
- );
346
- for (const row of rows) {
347
- for (const cell of row) {
348
- const r = cell.rowAddr ?? 0;
349
- const c = cell.colAddr ?? 0;
350
- if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
351
- grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
352
- for (let dr = 0; dr < cell.rowSpan; dr++) {
353
- for (let dc = 0; dc < cell.colSpan; dc++) {
354
- if (dr === 0 && dc === 0) continue;
355
- if (r + dr < numRows && c + dc < maxCols) {
356
- grid[r + dr][c + dc] = { text: "", colSpan: 1, rowSpan: 1 };
357
- }
358
- }
359
- }
360
- }
361
- }
362
- return trimAndReturn(grid, numRows, maxCols);
363
- }
364
- function trimAndReturn(grid, numRows, maxCols) {
365
- let effectiveCols = maxCols;
366
- while (effectiveCols > 0) {
367
- const colEmpty = grid.every((row) => !row[effectiveCols - 1]?.text?.trim());
368
- if (!colEmpty) break;
369
- effectiveCols--;
370
- }
371
- if (effectiveCols < maxCols && effectiveCols > 0) {
372
- const trimmed = grid.map((row) => row.slice(0, effectiveCols));
373
- return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
374
- }
375
- return { rows: numRows, cols: maxCols, cells: grid, hasHeader: numRows > 1 };
376
- }
377
- function convertTableToText(rows) {
378
- return rows.map(
379
- (row) => row.map((c) => c.text.trim().replace(/\n/g, " ").replace(/\|/g, "\\|")).filter(Boolean).join(" / ")
380
- ).filter(Boolean).join("\n");
381
- }
382
- function escapeGfm(text) {
383
- return text.replace(/~/g, "\\~");
384
- }
385
- var HWP_SHAPE_ALT_TEXT_RE = /(?:모서리가 둥근 |둥근 )?(?:사각형|직사각형|정사각형|원|타원|삼각형|이등변 삼각형|직각 삼각형|선|직선|곡선|화살표|굵은 화살표|이중 화살표|오각형|육각형|팔각형|별|[4-8]점별|십자|십자형|구름|구름형|마름모|도넛|평행사변형|사다리꼴|부채꼴|호|반원|물결|번개|하트|빗금|블록 화살표|수식|표|그림|개체|그리기\s?개체|묶음\s?개체|글상자|수식\s?개체|OLE\s?개체)\s?입니다\.?/g;
386
- function sanitizeText(text) {
387
- let result = text.replace(/[\u{F0000}-\u{FFFFD}]/gu, "").replace(HWP_SHAPE_ALT_TEXT_RE, "").replace(/ +/g, " ").trim();
388
- if (result.length <= 30 && result.includes(" ")) {
389
- const tokens = result.split(" ");
390
- const koreanSingleCharCount = tokens.filter((t) => t.length === 1 && /[\uAC00-\uD7AF\u3131-\u318E]/.test(t)).length;
391
- if (tokens.length >= 3 && koreanSingleCharCount / tokens.length >= 0.7) {
392
- result = tokens.join("");
393
- }
394
- }
395
- return result;
396
- }
397
- function flattenLayoutTables(blocks) {
398
- const result = [];
399
- for (const block of blocks) {
400
- if (block.type !== "table" || !block.table) {
401
- result.push(block);
402
- continue;
403
- }
404
- const { rows: numRows, cols: numCols, cells } = block.table;
405
- if (numRows === 1 && numCols === 1) {
406
- result.push(block);
407
- continue;
408
- }
409
- if (numRows <= 3) {
410
- let totalNewlines = 0;
411
- let totalTextLen = 0;
412
- for (let r = 0; r < numRows; r++) {
413
- for (let c = 0; c < numCols; c++) {
414
- const t = cells[r]?.[c]?.text || "";
415
- totalNewlines += (t.match(/\n/g) || []).length;
416
- totalTextLen += t.length;
417
- }
418
- }
419
- if (totalNewlines > 5 || numRows <= 2 && totalTextLen > 300) {
420
- for (let r = 0; r < numRows; r++) {
421
- for (let c = 0; c < numCols; c++) {
422
- const cellText = cells[r]?.[c]?.text?.trim();
423
- if (!cellText) continue;
424
- for (const line of cellText.split("\n")) {
425
- const trimmed = line.trim();
426
- if (!trimmed) continue;
427
- result.push({ type: "paragraph", text: trimmed, pageNumber: block.pageNumber });
428
- }
429
- }
430
- }
431
- continue;
432
- }
433
- }
434
- result.push(block);
435
- }
436
- return result;
437
- }
438
- function blocksToMarkdown(blocks) {
439
- const lines = [];
440
- for (let i = 0; i < blocks.length; i++) {
441
- const block = blocks[i];
442
- if (block.type === "heading" && block.text) {
443
- const prefix = "#".repeat(Math.min(block.level || 2, 6));
444
- const headingText = sanitizeText(block.text);
445
- if (headingText) lines.push("", `${prefix} ${headingText}`, "");
446
- continue;
447
- }
448
- if (block.type === "image" && block.text) {
449
- lines.push("", `![image](${block.text})`, "");
450
- continue;
451
- }
452
- if (block.type === "separator") {
453
- lines.push("", "---", "");
454
- continue;
455
- }
456
- if (block.type === "list" && block.text) {
457
- const listText = sanitizeText(block.text);
458
- if (!listText) continue;
459
- const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(listText);
460
- const prefix = alreadyNumbered ? "" : block.listType === "ordered" ? "1. " : "- ";
461
- lines.push(`${prefix}${listText}`);
462
- if (block.children) {
463
- for (const child of block.children) {
464
- const childPrefix = child.listType === "ordered" ? "1." : "-";
465
- lines.push(` ${childPrefix} ${child.text || ""}`);
466
- }
467
- }
468
- continue;
469
- }
470
- if (block.type === "paragraph" && block.text) {
471
- let text = sanitizeText(block.text);
472
- if (!text) continue;
473
- if (/^\[별표\s*\d+/.test(text)) {
474
- const nextBlock = blocks[i + 1];
475
- if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
476
- lines.push("", `## ${text} ${nextBlock.text}`, "");
477
- i++;
478
- } else {
479
- lines.push("", `## ${text}`, "");
480
- }
481
- continue;
482
- }
483
- if (/^\([^)]*조[^)]*관련\)$/.test(text)) {
484
- lines.push(`*${text}*`, "");
485
- continue;
486
- }
487
- if (block.href) {
488
- const href = sanitizeHref(block.href);
489
- if (href) text = `[${text}](${href})`;
490
- }
491
- if (block.footnoteText) {
492
- text += ` (\uC8FC: ${block.footnoteText})`;
493
- }
494
- lines.push(escapeGfm(text), "");
495
- } else if (block.type === "table" && block.table) {
496
- if (lines.length > 0 && lines[lines.length - 1] !== "") {
497
- lines.push("");
498
- }
499
- const tableMd = tableToMarkdown(block.table);
500
- if (tableMd) {
501
- lines.push(tableMd);
502
- lines.push("");
503
- }
504
- }
505
- }
506
- return lines.join("\n").trim();
507
- }
508
- function hasMergedCells(table) {
509
- for (const row of table.cells) {
510
- for (const cell of row) {
511
- if (cell.colSpan > 1 || cell.rowSpan > 1) return true;
512
- }
513
- }
514
- return false;
515
- }
516
- function tableToHtml(table) {
517
- const { cells, rows: numRows, cols: numCols } = table;
518
- const skip = /* @__PURE__ */ new Set();
519
- const lines = ["<table>"];
520
- for (let r = 0; r < numRows; r++) {
521
- const tag = r === 0 ? "th" : "td";
522
- const rowHtml = [];
523
- for (let c = 0; c < numCols; c++) {
524
- if (skip.has(`${r},${c}`)) continue;
525
- const cell = cells[r]?.[c];
526
- if (!cell) continue;
527
- for (let dr = 0; dr < cell.rowSpan; dr++) {
528
- for (let dc = 0; dc < cell.colSpan; dc++) {
529
- if (dr === 0 && dc === 0) continue;
530
- if (r + dr < numRows && c + dc < numCols) skip.add(`${r + dr},${c + dc}`);
531
- }
532
- }
533
- const text = sanitizeText(cell.text).replace(/\n/g, "<br>");
534
- const attrs = [];
535
- if (cell.colSpan > 1) attrs.push(`colspan="${cell.colSpan}"`);
536
- if (cell.rowSpan > 1) attrs.push(`rowspan="${cell.rowSpan}"`);
537
- const attrStr = attrs.length ? " " + attrs.join(" ") : "";
538
- rowHtml.push(`<${tag}${attrStr}>${text}</${tag}>`);
539
- }
540
- if (rowHtml.length) lines.push(`<tr>${rowHtml.join("")}</tr>`);
541
- }
542
- lines.push("</table>");
543
- return lines.join("\n");
544
- }
545
- function tableToMarkdown(table) {
546
- if (table.rows === 0 || table.cols === 0) return "";
547
- const { cells, rows: numRows, cols: numCols } = table;
548
- if (hasMergedCells(table)) return tableToHtml(table);
549
- if (numRows === 1 && numCols === 1) {
550
- const content = sanitizeText(cells[0][0].text);
551
- if (!content) return "";
552
- return content.split(/\n/).map((line) => {
553
- const trimmed = line.trim();
554
- if (!trimmed) return "";
555
- if (/^\d+\.\s/.test(trimmed)) return `**${escapeGfm(trimmed)}**`;
556
- if (/^[가-힣]\.\s/.test(trimmed)) return ` ${escapeGfm(trimmed)}`;
557
- return escapeGfm(trimmed);
558
- }).filter(Boolean).join("\n");
559
- }
560
- if (numCols === 1 && numRows >= 2) {
561
- return cells.map((row) => escapeGfm(sanitizeText(row[0].text)).replace(/\n/g, " ")).filter(Boolean).join("\n");
562
- }
563
- const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
564
- const skip = /* @__PURE__ */ new Set();
565
- for (let r = 0; r < numRows; r++) {
566
- for (let c = 0; c < numCols; c++) {
567
- if (skip.has(`${r},${c}`)) continue;
568
- const cell = cells[r]?.[c];
569
- if (!cell) continue;
570
- display[r][c] = escapeGfm(sanitizeText(cell.text)).replace(/\|/g, "\\|").replace(/\n/g, "<br>");
571
- for (let dr = 0; dr < cell.rowSpan; dr++) {
572
- for (let dc = 0; dc < cell.colSpan; dc++) {
573
- if (dr === 0 && dc === 0) continue;
574
- if (r + dr < numRows && c + dc < numCols) {
575
- skip.add(`${r + dr},${c + dc}`);
576
- }
577
- }
578
- }
579
- c += cell.colSpan - 1;
580
- }
581
- }
582
- const uniqueRows = [];
583
- let pendingFirstCol = "";
584
- for (let r = 0; r < display.length; r++) {
585
- const row = display[r];
586
- const isEmptyPlaceholder = row.every((cell) => cell === "");
587
- if (isEmptyPlaceholder) continue;
588
- const nonEmptyCols = row.filter((cell) => cell !== "");
589
- const hasSkipInRow = row.some((_, c) => skip.has(`${r},${c}`));
590
- if (!hasSkipInRow && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
591
- pendingFirstCol = row[0];
592
- continue;
593
- }
594
- if (pendingFirstCol && row[0] === "") {
595
- row[0] = pendingFirstCol;
596
- pendingFirstCol = "";
597
- } else {
598
- pendingFirstCol = "";
599
- }
600
- uniqueRows.push(row);
601
- }
602
- if (uniqueRows.length === 0) return "";
603
- const md = [];
604
- md.push("| " + uniqueRows[0].join(" | ") + " |");
605
- md.push("| " + uniqueRows[0].map(() => "---").join(" | ") + " |");
606
- for (let i = 1; i < uniqueRows.length; i++) {
607
- md.push("| " + uniqueRows[i].join(" | ") + " |");
608
- }
609
- return md.join("\n");
610
- }
611
69
 
612
- // src/types.ts
613
- var HEADING_RATIO_H1 = 1.5;
614
- var HEADING_RATIO_H2 = 1.3;
615
- var HEADING_RATIO_H3 = 1.15;
616
-
617
- // src/hwpx/parser.ts
618
- init_page_range();
70
+ var _zlib = require('zlib');
71
+ var _xmldom = require('@xmldom/xmldom');
619
72
  var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
620
73
  var MAX_ZIP_ENTRIES = 500;
621
74
  function clampSpan(val, max) {
@@ -623,10 +76,10 @@ function clampSpan(val, max) {
623
76
  }
624
77
  var MAX_XML_DEPTH = 200;
625
78
  function createXmlParser(warnings) {
626
- return new import_xmldom.DOMParser({
79
+ return new (0, _xmldom.DOMParser)({
627
80
  onError(level, msg) {
628
- if (level === "fatalError") throw new KordocError(`XML \uD30C\uC2F1 \uC2E4\uD328: ${msg}`);
629
- warnings?.push({ code: "MALFORMED_XML", message: `XML ${level === "warn" ? "\uACBD\uACE0" : "\uC624\uB958"}: ${msg}` });
81
+ if (level === "fatalError") throw new (0, _chunkHXUCZ2ILcjs.KordocError)(`XML \uD30C\uC2F1 \uC2E4\uD328: ${msg}`);
82
+ _optionalChain([warnings, 'optionalAccess', _2 => _2.push, 'call', _3 => _3({ code: "MALFORMED_XML", message: `XML ${level === "warn" ? "\uACBD\uACE0" : "\uC624\uB958"}: ${msg}` })]);
630
83
  }
631
84
  });
632
85
  }
@@ -644,15 +97,15 @@ async function extractHwpxStyles(zip, decompressed) {
644
97
  const xml = await file.async("text");
645
98
  if (decompressed) {
646
99
  decompressed.total += xml.length * 2;
647
- if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
100
+ if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
648
101
  }
649
102
  const parser = createXmlParser();
650
- const doc = parser.parseFromString(stripDtd(xml), "text/xml");
103
+ const doc = parser.parseFromString(_chunkHXUCZ2ILcjs.stripDtd.call(void 0, xml), "text/xml");
651
104
  if (!doc.documentElement) continue;
652
105
  parseCharProperties(doc, result.charProperties);
653
106
  parseStyleElements(doc, result.styles);
654
107
  break;
655
- } catch {
108
+ } catch (e3) {
656
109
  continue;
657
110
  }
658
111
  }
@@ -709,16 +162,16 @@ function parseStyleElements(doc, map) {
709
162
  }
710
163
  }
711
164
  async function parseHwpxDocument(buffer, options) {
712
- precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
165
+ _chunkHXUCZ2ILcjs.precheckZipSize.call(void 0, buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
713
166
  let zip;
714
167
  try {
715
- zip = await import_jszip2.default.loadAsync(buffer);
716
- } catch {
168
+ zip = await _jszip2.default.loadAsync(buffer);
169
+ } catch (e4) {
717
170
  return extractFromBrokenZip(buffer);
718
171
  }
719
172
  const actualEntryCount = Object.keys(zip.files).length;
720
173
  if (actualEntryCount > MAX_ZIP_ENTRIES) {
721
- throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
174
+ throw new (0, _chunkHXUCZ2ILcjs.KordocError)("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
722
175
  }
723
176
  const decompressed = { total: 0 };
724
177
  const metadata = {};
@@ -726,9 +179,9 @@ async function parseHwpxDocument(buffer, options) {
726
179
  const styleMap = await extractHwpxStyles(zip, decompressed);
727
180
  const warnings = [];
728
181
  const sectionPaths = await resolveSectionPaths(zip);
729
- if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
182
+ if (sectionPaths.length === 0) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
730
183
  metadata.pageCount = sectionPaths.length;
731
- const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
184
+ const pageFilter = _optionalChain([options, 'optionalAccess', _4 => _4.pages]) ? _chunkMUOQXDZ4cjs.parsePageRange.call(void 0, options.pages, sectionPaths.length) : null;
732
185
  const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
733
186
  const blocks = [];
734
187
  let parsedSections = 0;
@@ -739,19 +192,19 @@ async function parseHwpxDocument(buffer, options) {
739
192
  try {
740
193
  const xml = await file.async("text");
741
194
  decompressed.total += xml.length * 2;
742
- if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
195
+ if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
743
196
  blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
744
197
  parsedSections++;
745
- options?.onProgress?.(parsedSections, totalTarget);
198
+ _optionalChain([options, 'optionalAccess', _5 => _5.onProgress, 'optionalCall', _6 => _6(parsedSections, totalTarget)]);
746
199
  } catch (secErr) {
747
- if (secErr instanceof KordocError) throw secErr;
200
+ if (secErr instanceof _chunkHXUCZ2ILcjs.KordocError) throw secErr;
748
201
  warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
749
202
  }
750
203
  }
751
204
  const images = await extractImagesFromZip(zip, blocks, decompressed, warnings);
752
205
  detectHwpxHeadings(blocks, styleMap);
753
206
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
754
- const markdown = blocksToMarkdown(blocks);
207
+ const markdown = _chunkHXUCZ2ILcjs.blocksToMarkdown.call(void 0, blocks);
755
208
  return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
756
209
  }
757
210
  function imageExtToMime(ext) {
@@ -803,13 +256,13 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
803
256
  ];
804
257
  let found = false;
805
258
  for (const path of candidates) {
806
- if (isPathTraversal(path)) continue;
259
+ if (_chunkHXUCZ2ILcjs.isPathTraversal.call(void 0, path)) continue;
807
260
  const file = zip.file(path);
808
261
  if (!file) continue;
809
262
  try {
810
263
  const data = await file.async("uint8array");
811
264
  decompressed.total += data.length;
812
- if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
265
+ if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
813
266
  const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
814
267
  const mimeType = imageExtToMime(ext);
815
268
  imageIndex++;
@@ -820,11 +273,11 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
820
273
  found = true;
821
274
  break;
822
275
  } catch (err) {
823
- if (err instanceof KordocError) throw err;
276
+ if (err instanceof _chunkHXUCZ2ILcjs.KordocError) throw err;
824
277
  }
825
278
  }
826
279
  if (!found) {
827
- warnings?.push({ page: block.pageNumber, message: `\uC774\uBBF8\uC9C0 \uD30C\uC77C \uC5C6\uC74C: ${ref}`, code: "SKIPPED_IMAGE" });
280
+ _optionalChain([warnings, 'optionalAccess', _7 => _7.push, 'call', _8 => _8({ page: block.pageNumber, message: `\uC774\uBBF8\uC9C0 \uD30C\uC77C \uC5C6\uC74C: ${ref}`, code: "SKIPPED_IMAGE" })]);
828
281
  block.type = "paragraph";
829
282
  block.text = `[\uC774\uBBF8\uC9C0: ${ref}]`;
830
283
  }
@@ -840,23 +293,23 @@ async function extractHwpxMetadata(zip, metadata, decompressed) {
840
293
  const xml = await file.async("text");
841
294
  if (decompressed) {
842
295
  decompressed.total += xml.length * 2;
843
- if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
296
+ if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
844
297
  }
845
298
  parseDublinCoreMetadata(xml, metadata);
846
299
  if (metadata.title || metadata.author) return;
847
300
  }
848
- } catch {
301
+ } catch (e5) {
849
302
  }
850
303
  }
851
304
  function parseDublinCoreMetadata(xml, metadata) {
852
305
  const parser = createXmlParser();
853
- const doc = parser.parseFromString(stripDtd(xml), "text/xml");
306
+ const doc = parser.parseFromString(_chunkHXUCZ2ILcjs.stripDtd.call(void 0, xml), "text/xml");
854
307
  if (!doc.documentElement) return;
855
308
  const getText = (tagNames) => {
856
309
  for (const tag of tagNames) {
857
310
  const els = doc.getElementsByTagName(tag);
858
311
  if (els.length > 0) {
859
- const text = els[0].textContent?.trim();
312
+ const text = _optionalChain([els, 'access', _9 => _9[0], 'access', _10 => _10.textContent, 'optionalAccess', _11 => _11.trim, 'call', _12 => _12()]);
860
313
  if (text) return text;
861
314
  }
862
315
  }
@@ -909,7 +362,7 @@ function extractFromBrokenZip(buffer) {
909
362
  }
910
363
  const nameBytes = data.slice(pos + 30, pos + 30 + nameLen);
911
364
  const name = new TextDecoder().decode(nameBytes);
912
- if (isPathTraversal(name)) {
365
+ if (_chunkHXUCZ2ILcjs.isPathTraversal.call(void 0, name)) {
913
366
  pos = fileStart + compSize;
914
367
  continue;
915
368
  }
@@ -921,21 +374,21 @@ function extractFromBrokenZip(buffer) {
921
374
  if (method === 0) {
922
375
  content = new TextDecoder().decode(fileData);
923
376
  } else if (method === 8) {
924
- const decompressed = (0, import_zlib.inflateRawSync)(Buffer.from(fileData), { maxOutputLength: MAX_DECOMPRESS_SIZE });
377
+ const decompressed = _zlib.inflateRawSync.call(void 0, Buffer.from(fileData), { maxOutputLength: MAX_DECOMPRESS_SIZE });
925
378
  content = new TextDecoder().decode(decompressed);
926
379
  } else {
927
380
  continue;
928
381
  }
929
382
  totalDecompressed += content.length * 2;
930
- if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
383
+ if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
931
384
  sectionNum++;
932
385
  blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum));
933
- } catch {
386
+ } catch (e6) {
934
387
  continue;
935
388
  }
936
389
  }
937
- if (blocks.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
938
- const markdown = blocksToMarkdown(blocks);
390
+ if (blocks.length === 0) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
391
+ const markdown = _chunkHXUCZ2ILcjs.blocksToMarkdown.call(void 0, blocks);
939
392
  return { markdown, blocks, warnings: warnings.length > 0 ? warnings : void 0 };
940
393
  }
941
394
  async function resolveSectionPaths(zip) {
@@ -953,7 +406,7 @@ async function resolveSectionPaths(zip) {
953
406
  }
954
407
  function parseSectionPathsFromManifest(xml) {
955
408
  const parser = createXmlParser();
956
- const doc = parser.parseFromString(stripDtd(xml), "text/xml");
409
+ const doc = parser.parseFromString(_chunkHXUCZ2ILcjs.stripDtd.call(void 0, xml), "text/xml");
957
410
  const items = doc.getElementsByTagName("opf:item");
958
411
  const spine = doc.getElementsByTagName("opf:itemref");
959
412
  const isSectionId = (id) => /^s/i.test(id) || id.toLowerCase().includes("section");
@@ -982,7 +435,7 @@ function detectHwpxHeadings(blocks, styleMap) {
982
435
  let baseFontSize = 0;
983
436
  const sizeFreq = /* @__PURE__ */ new Map();
984
437
  for (const b of blocks) {
985
- if (b.style?.fontSize) {
438
+ if (_optionalChain([b, 'access', _13 => _13.style, 'optionalAccess', _14 => _14.fontSize])) {
986
439
  sizeFreq.set(b.style.fontSize, (sizeFreq.get(b.style.fontSize) || 0) + 1);
987
440
  }
988
441
  }
@@ -998,11 +451,11 @@ function detectHwpxHeadings(blocks, styleMap) {
998
451
  const text = block.text.trim();
999
452
  if (text.length === 0 || text.length > 200 || /^\d+$/.test(text)) continue;
1000
453
  let level = 0;
1001
- if (baseFontSize > 0 && block.style?.fontSize) {
454
+ if (baseFontSize > 0 && _optionalChain([block, 'access', _15 => _15.style, 'optionalAccess', _16 => _16.fontSize])) {
1002
455
  const ratio = block.style.fontSize / baseFontSize;
1003
- if (ratio >= HEADING_RATIO_H1) level = 1;
1004
- else if (ratio >= HEADING_RATIO_H2) level = 2;
1005
- else if (ratio >= HEADING_RATIO_H3) level = 3;
456
+ if (ratio >= _chunkHXUCZ2ILcjs.HEADING_RATIO_H1) level = 1;
457
+ else if (ratio >= _chunkHXUCZ2ILcjs.HEADING_RATIO_H2) level = 2;
458
+ else if (ratio >= _chunkHXUCZ2ILcjs.HEADING_RATIO_H3) level = 3;
1006
459
  }
1007
460
  const compactText = text.replace(/\s+/g, "");
1008
461
  if (/^제\d+[조장절편]/.test(compactText) && text.length <= 50) {
@@ -1016,7 +469,7 @@ function detectHwpxHeadings(blocks, styleMap) {
1016
469
  }
1017
470
  function parseSectionXml(xml, styleMap, warnings, sectionNum) {
1018
471
  const parser = createXmlParser(warnings);
1019
- const doc = parser.parseFromString(stripDtd(xml), "text/xml");
472
+ const doc = parser.parseFromString(_chunkHXUCZ2ILcjs.stripDtd.call(void 0, xml), "text/xml");
1020
473
  if (!doc.documentElement) return [];
1021
474
  const blocks = [];
1022
475
  walkSection(doc.documentElement, blocks, null, [], styleMap, warnings, sectionNum);
@@ -1060,16 +513,16 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
1060
513
  let nestedCols = 0;
1061
514
  for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
1062
515
  if (newTable.rows.length >= 3 && nestedCols >= 2) {
1063
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
516
+ blocks.push({ type: "table", table: _chunkHXUCZ2ILcjs.buildTable.call(void 0, newTable.rows), pageNumber: sectionNum });
1064
517
  } else {
1065
- const nestedText = convertTableToText(newTable.rows);
518
+ const nestedText = _chunkHXUCZ2ILcjs.convertTableToText.call(void 0, newTable.rows);
1066
519
  if (parentTable.cell) {
1067
520
  parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
1068
521
  }
1069
522
  }
1070
523
  tableCtx = parentTable;
1071
524
  } else {
1072
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
525
+ blocks.push({ type: "table", table: _chunkHXUCZ2ILcjs.buildTable.call(void 0, newTable.rows), pageNumber: sectionNum });
1073
526
  tableCtx = null;
1074
527
  }
1075
528
  } else {
@@ -1096,7 +549,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
1096
549
  }
1097
550
  break;
1098
551
  case "cellAddr":
1099
- if (tableCtx?.cell) {
552
+ if (_optionalChain([tableCtx, 'optionalAccess', _17 => _17.cell])) {
1100
553
  const ca = parseInt(el.getAttribute("colAddr") || "", 10);
1101
554
  const ra = parseInt(el.getAttribute("rowAddr") || "", 10);
1102
555
  if (!isNaN(ca)) tableCtx.cell.colAddr = ca;
@@ -1104,19 +557,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
1104
557
  }
1105
558
  break;
1106
559
  case "cellSpan":
1107
- if (tableCtx?.cell) {
560
+ if (_optionalChain([tableCtx, 'optionalAccess', _18 => _18.cell])) {
1108
561
  const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
1109
562
  const cs = isNaN(rawCs) ? 1 : rawCs;
1110
563
  const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
1111
564
  const rs = isNaN(rawRs) ? 1 : rawRs;
1112
- tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
1113
- tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
565
+ tableCtx.cell.colSpan = clampSpan(cs, _chunkHXUCZ2ILcjs.MAX_COLS);
566
+ tableCtx.cell.rowSpan = clampSpan(rs, _chunkHXUCZ2ILcjs.MAX_ROWS);
1114
567
  }
1115
568
  break;
1116
569
  case "p": {
1117
570
  const { text, href, footnote, style } = extractParagraphInfo(el, styleMap);
1118
571
  if (text) {
1119
- if (tableCtx?.cell) {
572
+ if (_optionalChain([tableCtx, 'optionalAccess', _19 => _19.cell])) {
1120
573
  tableCtx.cell.text += (tableCtx.cell.text ? "\n" : "") + text;
1121
574
  } else if (!tableCtx) {
1122
575
  const block = { type: "paragraph", text, pageNumber: sectionNum };
@@ -1170,16 +623,16 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
1170
623
  let nestedCols = 0;
1171
624
  for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
1172
625
  if (newTable.rows.length >= 3 && nestedCols >= 2) {
1173
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
626
+ blocks.push({ type: "table", table: _chunkHXUCZ2ILcjs.buildTable.call(void 0, newTable.rows), pageNumber: sectionNum });
1174
627
  } else {
1175
- const nestedText = convertTableToText(newTable.rows);
628
+ const nestedText = _chunkHXUCZ2ILcjs.convertTableToText.call(void 0, newTable.rows);
1176
629
  if (parentTable.cell) {
1177
630
  parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
1178
631
  }
1179
632
  }
1180
633
  tableCtx = parentTable;
1181
634
  } else {
1182
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
635
+ blocks.push({ type: "table", table: _chunkHXUCZ2ILcjs.buildTable.call(void 0, newTable.rows), pageNumber: sectionNum });
1183
636
  tableCtx = null;
1184
637
  }
1185
638
  } else {
@@ -1237,7 +690,7 @@ function extractDrawTextBlocks(drawTextNode, blocks, styleMap, sectionNum) {
1237
690
  const info = extractParagraphInfo(child, styleMap);
1238
691
  const text = info.text.trim();
1239
692
  if (text) {
1240
- blocks.push({ type: "paragraph", text, style: info.style ?? void 0, pageNumber: sectionNum });
693
+ blocks.push({ type: "paragraph", text, style: _nullishCoalesce(info.style, () => ( void 0)), pageNumber: sectionNum });
1241
694
  }
1242
695
  }
1243
696
  }
@@ -1287,7 +740,7 @@ function extractParagraphInfo(para, styleMap) {
1287
740
  case "hyperlink": {
1288
741
  const url = child.getAttribute("url") || child.getAttribute("href") || "";
1289
742
  if (url) {
1290
- const safe = sanitizeHref(url);
743
+ const safe = _chunkHXUCZ2ILcjs.sanitizeHref.call(void 0, url);
1291
744
  if (safe) href = safe;
1292
745
  }
1293
746
  walk(child);
@@ -1372,7 +825,7 @@ function extractTextFromNode(node) {
1372
825
  }
1373
826
 
1374
827
  // src/hwp5/record.ts
1375
- var import_zlib2 = require("zlib");
828
+
1376
829
  var TAG_PARA_HEADER = 66;
1377
830
  var TAG_PARA_TEXT = 67;
1378
831
  var TAG_CHAR_SHAPE = 68;
@@ -1420,14 +873,14 @@ function decompressStream(data) {
1420
873
  const opts = { maxOutputLength: MAX_DECOMPRESS_SIZE2 };
1421
874
  if (data.length >= 2 && data[0] === 120) {
1422
875
  try {
1423
- return (0, import_zlib2.inflateSync)(data, opts);
1424
- } catch {
876
+ return _zlib.inflateSync.call(void 0, data, opts);
877
+ } catch (e7) {
1425
878
  }
1426
879
  }
1427
- return (0, import_zlib2.inflateRawSync)(data, opts);
880
+ return _zlib.inflateRawSync.call(void 0, data, opts);
1428
881
  }
1429
882
  function parseFileHeader(data) {
1430
- if (data.length < 40) throw new KordocError("FileHeader\uAC00 \uB108\uBB34 \uC9E7\uC2B5\uB2C8\uB2E4 (\uCD5C\uC18C 40\uBC14\uC774\uD2B8)");
883
+ if (data.length < 40) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("FileHeader\uAC00 \uB108\uBB34 \uC9E7\uC2B5\uB2C8\uB2E4 (\uCD5C\uC18C 40\uBC14\uC774\uD2B8)");
1431
884
  const sig = data.subarray(0, 32).toString("utf8").replace(/\0+$/, "");
1432
885
  return {
1433
886
  signature: sig,
@@ -1480,7 +933,7 @@ function parseDocInfo(records) {
1480
933
  offset += 2;
1481
934
  const charShapeId = offset + 2 <= rec.data.length ? rec.data.readUInt16LE(offset) : 0;
1482
935
  styles.push({ name, nameKo, charShapeId, paraShapeId, type });
1483
- } catch {
936
+ } catch (e8) {
1484
937
  }
1485
938
  }
1486
939
  }
@@ -2162,7 +1615,7 @@ function aes128EcbDecrypt(data, key) {
2162
1615
 
2163
1616
  // src/hwp5/crypto.ts
2164
1617
  var MsvcLcg = class {
2165
- seed;
1618
+
2166
1619
  constructor(seed) {
2167
1620
  this.seed = seed >>> 0;
2168
1621
  }
@@ -2242,7 +1695,7 @@ function decryptViewText(viewTextRaw, compressed) {
2242
1695
  if (compressed) {
2243
1696
  try {
2244
1697
  return decompressStream(Buffer.from(decrypted));
2245
- } catch {
1698
+ } catch (e9) {
2246
1699
  return Buffer.from(decrypted);
2247
1700
  }
2248
1701
  }
@@ -2403,7 +1856,7 @@ function parseLenientCfb(data) {
2403
1856
  function findEntryByPath(path) {
2404
1857
  const parts = path.replace(/^\//, "").split("/");
2405
1858
  if (parts.length === 1) {
2406
- return dirEntries.find((e) => e.name === parts[0] && e.type === 2) ?? null;
1859
+ return _nullishCoalesce(dirEntries.find((e) => e.name === parts[0] && e.type === 2), () => ( null));
2407
1860
  }
2408
1861
  const storageName = parts[0];
2409
1862
  const streamName = parts.slice(1).join("/");
@@ -2413,7 +1866,7 @@ function parseLenientCfb(data) {
2413
1866
  }
2414
1867
  }
2415
1868
  const lastPart = parts[parts.length - 1];
2416
- return dirEntries.find((e) => e.type === 2 && e.name === lastPart) ?? null;
1869
+ return _nullishCoalesce(dirEntries.find((e) => e.type === 2 && e.name === lastPart), () => ( null));
2417
1870
  }
2418
1871
  return {
2419
1872
  findStream(path) {
@@ -2430,10 +1883,8 @@ function parseLenientCfb(data) {
2430
1883
  }
2431
1884
 
2432
1885
  // src/hwp5/parser.ts
2433
- init_page_range();
2434
- var import_module = require("module");
2435
- var import_meta = {};
2436
- var require2 = (0, import_module.createRequire)(import_meta.url);
1886
+ var _module = require('module');
1887
+ var require2 = _module.createRequire.call(void 0, import.meta.url);
2437
1888
  var CFB = require2("cfb");
2438
1889
  var MAX_SECTIONS = 100;
2439
1890
  var MAX_TOTAL_DECOMPRESS = 100 * 1024 * 1024;
@@ -2443,27 +1894,27 @@ function parseHwp5Document(buffer, options) {
2443
1894
  const warnings = [];
2444
1895
  try {
2445
1896
  cfb = CFB.parse(buffer);
2446
- } catch {
1897
+ } catch (e10) {
2447
1898
  try {
2448
1899
  lenientCfb = parseLenientCfb(buffer);
2449
1900
  warnings.push({ message: "\uC190\uC0C1\uB41C CFB \uCEE8\uD14C\uC774\uB108 \u2014 lenient \uBAA8\uB4DC\uB85C \uBCF5\uAD6C", code: "LENIENT_CFB_RECOVERY" });
2450
- } catch {
2451
- throw new KordocError("CFB \uCEE8\uD14C\uC774\uB108 \uD30C\uC2F1 \uC2E4\uD328 (strict \uBC0F lenient \uBAA8\uB450)");
1901
+ } catch (e11) {
1902
+ throw new (0, _chunkHXUCZ2ILcjs.KordocError)("CFB \uCEE8\uD14C\uC774\uB108 \uD30C\uC2F1 \uC2E4\uD328 (strict \uBC0F lenient \uBAA8\uB450)");
2452
1903
  }
2453
1904
  }
2454
1905
  const findStream = (path) => {
2455
1906
  if (cfb) {
2456
1907
  const entry = CFB.find(cfb, path);
2457
- return entry?.content ? Buffer.from(entry.content) : null;
1908
+ return _optionalChain([entry, 'optionalAccess', _20 => _20.content]) ? Buffer.from(entry.content) : null;
2458
1909
  }
2459
1910
  return lenientCfb.findStream(path);
2460
1911
  };
2461
1912
  const headerData = findStream("/FileHeader");
2462
- if (!headerData) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
1913
+ if (!headerData) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
2463
1914
  const header = parseFileHeader(headerData);
2464
- if (header.signature !== "HWP Document File") throw new KordocError("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
2465
- if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
2466
- if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
1915
+ if (header.signature !== "HWP Document File") throw new (0, _chunkHXUCZ2ILcjs.KordocError)("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
1916
+ if (header.flags & FLAG_ENCRYPTED) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
1917
+ if (header.flags & FLAG_DRM) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
2467
1918
  const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
2468
1919
  const distribution = (header.flags & FLAG_DISTRIBUTION) !== 0;
2469
1920
  const metadata = {
@@ -2472,9 +1923,9 @@ function parseHwp5Document(buffer, options) {
2472
1923
  if (cfb) extractHwp5Metadata(cfb, metadata);
2473
1924
  const docInfo = cfb ? parseDocInfoStream(cfb, compressed) : parseDocInfoFromStream(findStream("/DocInfo"), compressed);
2474
1925
  const sections = distribution ? cfb ? findViewTextSections(cfb, compressed) : findViewTextSectionsLenient(lenientCfb, compressed) : cfb ? findSections(cfb) : findSectionsLenient(lenientCfb, compressed);
2475
- if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
1926
+ if (sections.length === 0) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
2476
1927
  metadata.pageCount = sections.length;
2477
- const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
1928
+ const pageFilter = _optionalChain([options, 'optionalAccess', _21 => _21.pages]) ? _chunkMUOQXDZ4cjs.parsePageRange.call(void 0, options.pages, sections.length) : null;
2478
1929
  const totalTarget = pageFilter ? pageFilter.size : sections.length;
2479
1930
  const blocks = [];
2480
1931
  let totalDecompressed = 0;
@@ -2485,34 +1936,34 @@ function parseHwp5Document(buffer, options) {
2485
1936
  const sectionData = sections[si];
2486
1937
  const data = !distribution && compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
2487
1938
  totalDecompressed += data.length;
2488
- if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
1939
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2489
1940
  const records = readRecords(data);
2490
1941
  const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
2491
1942
  blocks.push(...sectionBlocks);
2492
1943
  parsedSections++;
2493
- options?.onProgress?.(parsedSections, totalTarget);
1944
+ _optionalChain([options, 'optionalAccess', _22 => _22.onProgress, 'optionalCall', _23 => _23(parsedSections, totalTarget)]);
2494
1945
  } catch (secErr) {
2495
- if (secErr instanceof KordocError) throw secErr;
1946
+ if (secErr instanceof _chunkHXUCZ2ILcjs.KordocError) throw secErr;
2496
1947
  warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
2497
1948
  }
2498
1949
  }
2499
1950
  const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
2500
- const flatBlocks = flattenLayoutTables(blocks);
1951
+ const flatBlocks = _chunkHXUCZ2ILcjs.flattenLayoutTables.call(void 0, blocks);
2501
1952
  if (docInfo) {
2502
1953
  detectHwp5Headings(flatBlocks, docInfo);
2503
1954
  }
2504
1955
  const outline = flatBlocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2505
- const markdown = blocksToMarkdown(flatBlocks);
1956
+ const markdown = _chunkHXUCZ2ILcjs.blocksToMarkdown.call(void 0, flatBlocks);
2506
1957
  return { markdown, blocks: flatBlocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
2507
1958
  }
2508
1959
  function parseDocInfoStream(cfb, compressed) {
2509
1960
  try {
2510
1961
  const entry = CFB.find(cfb, "/DocInfo");
2511
- if (!entry?.content) return null;
1962
+ if (!_optionalChain([entry, 'optionalAccess', _24 => _24.content])) return null;
2512
1963
  const data = compressed ? decompressStream(Buffer.from(entry.content)) : Buffer.from(entry.content);
2513
1964
  const records = readRecords(data);
2514
1965
  return parseDocInfo(records);
2515
- } catch {
1966
+ } catch (e12) {
2516
1967
  return null;
2517
1968
  }
2518
1969
  }
@@ -2521,7 +1972,7 @@ function parseDocInfoFromStream(raw, compressed) {
2521
1972
  try {
2522
1973
  const data = compressed ? decompressStream(raw) : raw;
2523
1974
  return parseDocInfo(readRecords(data));
2524
- } catch {
1975
+ } catch (e13) {
2525
1976
  return null;
2526
1977
  }
2527
1978
  }
@@ -2531,7 +1982,7 @@ function detectHwp5Headings(blocks, docInfo) {
2531
1982
  const name = (style.nameKo || style.name).toLowerCase();
2532
1983
  if (name.includes("\uBC14\uD0D5") || name.includes("\uBCF8\uBB38") || name === "normal" || name === "body") {
2533
1984
  const cs = docInfo.charShapes[style.charShapeId];
2534
- if (cs?.fontSize > 0) {
1985
+ if (_optionalChain([cs, 'optionalAccess', _25 => _25.fontSize]) > 0) {
2535
1986
  baseFontSize = cs.fontSize / 10;
2536
1987
  break;
2537
1988
  }
@@ -2540,7 +1991,7 @@ function detectHwp5Headings(blocks, docInfo) {
2540
1991
  if (baseFontSize === 0) {
2541
1992
  const sizeFreq = /* @__PURE__ */ new Map();
2542
1993
  for (const b of blocks) {
2543
- if (b.style?.fontSize) {
1994
+ if (_optionalChain([b, 'access', _26 => _26.style, 'optionalAccess', _27 => _27.fontSize])) {
2544
1995
  sizeFreq.set(b.style.fontSize, (sizeFreq.get(b.style.fontSize) || 0) + 1);
2545
1996
  }
2546
1997
  }
@@ -2560,11 +2011,11 @@ function detectHwp5Headings(blocks, docInfo) {
2560
2011
  if (text.length === 0 || text.length > 200) continue;
2561
2012
  if (/^\d+$/.test(text)) continue;
2562
2013
  let level = 0;
2563
- if (block.style?.fontSize && baseFontSize > 0) {
2014
+ if (_optionalChain([block, 'access', _28 => _28.style, 'optionalAccess', _29 => _29.fontSize]) && baseFontSize > 0) {
2564
2015
  const ratio = block.style.fontSize / baseFontSize;
2565
- if (ratio >= HEADING_RATIO_H1) level = 1;
2566
- else if (ratio >= HEADING_RATIO_H2) level = 2;
2567
- else if (ratio >= HEADING_RATIO_H3) level = 3;
2016
+ if (ratio >= _chunkHXUCZ2ILcjs.HEADING_RATIO_H1) level = 1;
2017
+ else if (ratio >= _chunkHXUCZ2ILcjs.HEADING_RATIO_H2) level = 2;
2018
+ else if (ratio >= _chunkHXUCZ2ILcjs.HEADING_RATIO_H3) level = 3;
2568
2019
  }
2569
2020
  if (/^제\d+[장절편]\s/.test(text) && text.length <= 50) {
2570
2021
  if (level === 0) level = 2;
@@ -2580,7 +2031,7 @@ function detectHwp5Headings(blocks, docInfo) {
2580
2031
  function extractHwp5Metadata(cfb, metadata) {
2581
2032
  try {
2582
2033
  const summaryEntry = CFB.find(cfb, "/HwpSummaryInformation") || CFB.find(cfb, "/SummaryInformation");
2583
- if (!summaryEntry?.content) return;
2034
+ if (!_optionalChain([summaryEntry, 'optionalAccess', _30 => _30.content])) return;
2584
2035
  const data = Buffer.from(summaryEntry.content);
2585
2036
  if (data.length < 48) return;
2586
2037
  const numSets = data.readUInt32LE(24);
@@ -2606,18 +2057,18 @@ function extractHwp5Metadata(cfb, metadata) {
2606
2057
  else if (propId === 4) metadata.author = str;
2607
2058
  else if (propId === 6) metadata.description = str;
2608
2059
  }
2609
- } catch {
2060
+ } catch (e14) {
2610
2061
  }
2611
2062
  }
2612
2063
  function findViewTextSections(cfb, compressed) {
2613
2064
  const sections = [];
2614
2065
  for (let i = 0; i < MAX_SECTIONS; i++) {
2615
2066
  const entry = CFB.find(cfb, `/ViewText/Section${i}`);
2616
- if (!entry?.content) break;
2067
+ if (!_optionalChain([entry, 'optionalAccess', _31 => _31.content])) break;
2617
2068
  try {
2618
2069
  const decrypted = decryptViewText(Buffer.from(entry.content), compressed);
2619
2070
  sections.push({ idx: i, content: decrypted });
2620
- } catch {
2071
+ } catch (e15) {
2621
2072
  break;
2622
2073
  }
2623
2074
  }
@@ -2627,13 +2078,13 @@ function findSections(cfb) {
2627
2078
  const sections = [];
2628
2079
  for (let i = 0; i < MAX_SECTIONS; i++) {
2629
2080
  const entry = CFB.find(cfb, `/BodyText/Section${i}`);
2630
- if (!entry?.content) break;
2081
+ if (!_optionalChain([entry, 'optionalAccess', _32 => _32.content])) break;
2631
2082
  sections.push({ idx: i, content: Buffer.from(entry.content) });
2632
2083
  }
2633
2084
  if (sections.length === 0 && cfb.FileIndex) {
2634
2085
  for (const entry of cfb.FileIndex) {
2635
2086
  if (sections.length >= MAX_SECTIONS) break;
2636
- if (entry.name?.startsWith("Section") && entry.content) {
2087
+ if (_optionalChain([entry, 'access', _33 => _33.name, 'optionalAccess', _34 => _34.startsWith, 'call', _35 => _35("Section")]) && entry.content) {
2637
2088
  const idx = parseInt(entry.name.replace("Section", ""), 10) || 0;
2638
2089
  sections.push({ idx, content: Buffer.from(entry.content) });
2639
2090
  }
@@ -2645,11 +2096,11 @@ function findSectionsLenient(lcfb, compressed) {
2645
2096
  const sections = [];
2646
2097
  let totalDecompressed = 0;
2647
2098
  for (let i = 0; i < MAX_SECTIONS; i++) {
2648
- const raw = lcfb.findStream(`/BodyText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
2099
+ const raw = _nullishCoalesce(lcfb.findStream(`/BodyText/Section${i}`), () => ( lcfb.findStream(`Section${i}`)));
2649
2100
  if (!raw) break;
2650
2101
  const content = compressed ? decompressStream(raw) : raw;
2651
2102
  totalDecompressed += content.length;
2652
- if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2103
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2653
2104
  sections.push({ idx: i, content });
2654
2105
  }
2655
2106
  if (sections.length === 0) {
@@ -2661,7 +2112,7 @@ function findSectionsLenient(lcfb, compressed) {
2661
2112
  if (raw) {
2662
2113
  const content = compressed ? decompressStream(raw) : raw;
2663
2114
  totalDecompressed += content.length;
2664
- if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2115
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2665
2116
  sections.push({ idx, content });
2666
2117
  }
2667
2118
  }
@@ -2673,14 +2124,14 @@ function findViewTextSectionsLenient(lcfb, compressed) {
2673
2124
  const sections = [];
2674
2125
  let totalDecompressed = 0;
2675
2126
  for (let i = 0; i < MAX_SECTIONS; i++) {
2676
- const raw = lcfb.findStream(`/ViewText/Section${i}`) ?? lcfb.findStream(`Section${i}`);
2127
+ const raw = _nullishCoalesce(lcfb.findStream(`/ViewText/Section${i}`), () => ( lcfb.findStream(`Section${i}`)));
2677
2128
  if (!raw) break;
2678
2129
  try {
2679
2130
  const content = decryptViewText(raw, compressed);
2680
2131
  totalDecompressed += content.length;
2681
- if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2132
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
2682
2133
  sections.push({ idx: i, content });
2683
- } catch {
2134
+ } catch (e16) {
2684
2135
  break;
2685
2136
  }
2686
2137
  }
@@ -2716,7 +2167,7 @@ function extractHwp5Images(cfb, blocks, compressed, warnings) {
2716
2167
  const binDataRe = /\/BinData\/[Bb][Ii][Nn](\d{4})$/;
2717
2168
  if (cfb.FileIndex) {
2718
2169
  for (const entry of cfb.FileIndex) {
2719
- if (!entry?.name || !entry.content) continue;
2170
+ if (!_optionalChain([entry, 'optionalAccess', _36 => _36.name]) || !entry.content) continue;
2720
2171
  const match = entry.name.match(binDataRe);
2721
2172
  if (!match) continue;
2722
2173
  const idx = parseInt(match[1], 10);
@@ -2724,7 +2175,7 @@ function extractHwp5Images(cfb, blocks, compressed, warnings) {
2724
2175
  if (compressed) {
2725
2176
  try {
2726
2177
  data = decompressStream(data);
2727
- } catch {
2178
+ } catch (e17) {
2728
2179
  }
2729
2180
  }
2730
2181
  binDataMap.set(idx, { data, name: entry.name });
@@ -2772,7 +2223,7 @@ function extractHwp5ImagesLenient(lcfb, blocks, compressed, warnings) {
2772
2223
  if (compressed) {
2773
2224
  try {
2774
2225
  raw = decompressStream(raw);
2775
- } catch {
2226
+ } catch (e18) {
2776
2227
  }
2777
2228
  }
2778
2229
  binDataMap.set(idx, { data: raw, name: e.name });
@@ -2866,7 +2317,7 @@ function parseSection(records, docInfo, warnings, sectionNum) {
2866
2317
  if (url && blocks.length > 0) {
2867
2318
  const lastBlock = blocks[blocks.length - 1];
2868
2319
  if (lastBlock.type === "paragraph" && !lastBlock.href) {
2869
- lastBlock.href = sanitizeHref(url) ?? void 0;
2320
+ lastBlock.href = _nullishCoalesce(_chunkHXUCZ2ILcjs.sanitizeHref.call(void 0, url), () => ( void 0));
2870
2321
  }
2871
2322
  }
2872
2323
  }
@@ -2917,7 +2368,7 @@ function extractHyperlinkUrl(data) {
2917
2368
  return url;
2918
2369
  }
2919
2370
  }
2920
- } catch {
2371
+ } catch (e19) {
2921
2372
  }
2922
2373
  return null;
2923
2374
  }
@@ -2984,8 +2435,8 @@ function parseTableBlock(records, startIdx) {
2984
2435
  if (rec.tagId === TAG_PARA_HEADER && rec.level <= tableLevel) break;
2985
2436
  if (rec.tagId === TAG_CTRL_HEADER && rec.level <= tableLevel) break;
2986
2437
  if (rec.tagId === TAG_TABLE && rec.data.length >= 8) {
2987
- rows = Math.min(rec.data.readUInt16LE(4), MAX_ROWS);
2988
- cols = Math.min(rec.data.readUInt16LE(6), MAX_COLS);
2438
+ rows = Math.min(rec.data.readUInt16LE(4), _chunkHXUCZ2ILcjs.MAX_ROWS);
2439
+ cols = Math.min(rec.data.readUInt16LE(6), _chunkHXUCZ2ILcjs.MAX_COLS);
2989
2440
  }
2990
2441
  if (rec.tagId === TAG_LIST_HEADER) {
2991
2442
  const { cell, nextIdx } = parseCellBlock(records, i, tableLevel);
@@ -3007,7 +2458,7 @@ function parseTableBlock(records, startIdx) {
3007
2458
  return { table: { rows, cols, cells: irCells, hasHeader: rows > 1 }, nextIdx: i };
3008
2459
  }
3009
2460
  const cellRows = arrangeCells(rows, cols, cells);
3010
- return { table: buildTable(cellRows), nextIdx: i };
2461
+ return { table: _chunkHXUCZ2ILcjs.buildTable.call(void 0, cellRows), nextIdx: i };
3011
2462
  }
3012
2463
  function parseCellBlock(records, startIdx, tableLevel) {
3013
2464
  const rec = records[startIdx];
@@ -3022,8 +2473,8 @@ function parseCellBlock(records, startIdx, tableLevel) {
3022
2473
  rowAddr = rec.data.readUInt16LE(10);
3023
2474
  const cs = rec.data.readUInt16LE(12);
3024
2475
  const rs = rec.data.readUInt16LE(14);
3025
- if (cs > 0) colSpan = Math.min(cs, MAX_COLS);
3026
- if (rs > 0) rowSpan = Math.min(rs, MAX_ROWS);
2476
+ if (cs > 0) colSpan = Math.min(cs, _chunkHXUCZ2ILcjs.MAX_COLS);
2477
+ if (rs > 0) rowSpan = Math.min(rs, _chunkHXUCZ2ILcjs.MAX_ROWS);
3027
2478
  }
3028
2479
  let i = startIdx + 1;
3029
2480
  while (i < records.length) {
@@ -3043,8 +2494,8 @@ function arrangeCells(rows, cols, cells) {
3043
2494
  const hasAddr = cells.some((c) => c.colAddr !== void 0 && c.rowAddr !== void 0);
3044
2495
  if (hasAddr) {
3045
2496
  for (const cell of cells) {
3046
- const r = cell.rowAddr ?? 0;
3047
- const c = cell.colAddr ?? 0;
2497
+ const r = _nullishCoalesce(cell.rowAddr, () => ( 0));
2498
+ const c = _nullishCoalesce(cell.colAddr, () => ( 0));
3048
2499
  if (r >= rows || c >= cols) continue;
3049
2500
  grid[r][c] = cell;
3050
2501
  for (let dr = 0; dr < cell.rowSpan; dr++) {
@@ -3075,2308 +2526,56 @@ function arrangeCells(rows, cols, cells) {
3075
2526
  return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
3076
2527
  }
3077
2528
 
3078
- // src/pdf/parser.ts
3079
- init_page_range();
2529
+ // src/xlsx/parser.ts
3080
2530
 
3081
- // src/pdf/line-detector.ts
3082
- var import_pdf = require("pdfjs-dist/legacy/build/pdf.mjs");
3083
- var ORIENTATION_TOL = 2;
3084
- var MIN_LINE_LENGTH = 15;
3085
- var MAX_LINE_WIDTH = 5;
3086
- var CONNECT_TOL = 5;
3087
- var CELL_PADDING = 2;
3088
- var MIN_COL_WIDTH = 15;
3089
- var MIN_ROW_HEIGHT = 6;
3090
- var VERTEX_MERGE_FACTOR = 4;
3091
- var MIN_COORD_MERGE_TOL = 8;
3092
- function extractLines(fnArray, argsArray) {
3093
- const horizontals = [];
3094
- const verticals = [];
3095
- let lineWidth = 1;
3096
- let currentPath = [];
3097
- let pathStartX = 0, pathStartY = 0;
3098
- let curX = 0, curY = 0;
3099
- function pushRectangle(path, rx, ry, rw, rh) {
3100
- if (Math.abs(rh) < ORIENTATION_TOL * 2) {
3101
- path.push({ x1: rx, y1: ry + rh / 2, x2: rx + rw, y2: ry + rh / 2 });
3102
- } else if (Math.abs(rw) < ORIENTATION_TOL * 2) {
3103
- path.push({ x1: rx + rw / 2, y1: ry, x2: rx + rw / 2, y2: ry + rh });
3104
- } else {
3105
- path.push(
3106
- { x1: rx, y1: ry, x2: rx + rw, y2: ry },
3107
- { x1: rx + rw, y1: ry, x2: rx + rw, y2: ry + rh },
3108
- { x1: rx + rw, y1: ry + rh, x2: rx, y2: ry + rh },
3109
- { x1: rx, y1: ry + rh, x2: rx, y2: ry }
3110
- );
3111
- }
3112
- }
3113
- function flushPath(isStroke) {
3114
- if (!isStroke) {
3115
- currentPath = [];
3116
- return;
3117
- }
3118
- for (const seg of currentPath) {
3119
- classifyAndAdd(seg, lineWidth, horizontals, verticals);
3120
- }
3121
- currentPath = [];
3122
- }
3123
- for (let i = 0; i < fnArray.length; i++) {
3124
- const op = fnArray[i];
3125
- const args = argsArray[i];
3126
- switch (op) {
3127
- case import_pdf.OPS.setLineWidth:
3128
- lineWidth = args[0] || 1;
3129
- break;
3130
- case import_pdf.OPS.constructPath: {
3131
- const arg0 = args[0];
3132
- if (Array.isArray(arg0)) {
3133
- const subOps = arg0;
3134
- const coords = args[1];
3135
- let ci = 0;
3136
- for (const subOp of subOps) {
3137
- if (subOp === import_pdf.OPS.moveTo) {
3138
- curX = coords[ci++];
3139
- curY = coords[ci++];
3140
- pathStartX = curX;
3141
- pathStartY = curY;
3142
- } else if (subOp === import_pdf.OPS.lineTo) {
3143
- const x2 = coords[ci++], y2 = coords[ci++];
3144
- currentPath.push({ x1: curX, y1: curY, x2, y2 });
3145
- curX = x2;
3146
- curY = y2;
3147
- } else if (subOp === import_pdf.OPS.rectangle) {
3148
- const rx = coords[ci++], ry = coords[ci++];
3149
- const rw = coords[ci++], rh = coords[ci++];
3150
- pushRectangle(currentPath, rx, ry, rw, rh);
3151
- } else if (subOp === import_pdf.OPS.closePath) {
3152
- if (curX !== pathStartX || curY !== pathStartY) {
3153
- currentPath.push({ x1: curX, y1: curY, x2: pathStartX, y2: pathStartY });
3154
- }
3155
- curX = pathStartX;
3156
- curY = pathStartY;
3157
- } else if (subOp === import_pdf.OPS.curveTo) {
3158
- ci += 6;
3159
- } else if (subOp === import_pdf.OPS.curveTo2 || subOp === import_pdf.OPS.curveTo3) {
3160
- ci += 4;
3161
- }
3162
- }
3163
- } else {
3164
- const afterOp = arg0;
3165
- const dataArr = args[1];
3166
- const pathData = dataArr?.[0];
3167
- if (pathData && typeof pathData === "object") {
3168
- const len = Object.keys(pathData).length;
3169
- let di = 0;
3170
- while (di < len) {
3171
- const drawOp = pathData[di++];
3172
- if (drawOp === 0 /* moveTo */) {
3173
- curX = pathData[di++];
3174
- curY = pathData[di++];
3175
- pathStartX = curX;
3176
- pathStartY = curY;
3177
- } else if (drawOp === 1 /* lineTo */) {
3178
- const x2 = pathData[di++], y2 = pathData[di++];
3179
- currentPath.push({ x1: curX, y1: curY, x2, y2 });
3180
- curX = x2;
3181
- curY = y2;
3182
- } else if (drawOp === 2 /* curveTo */) {
3183
- di += 6;
3184
- } else if (drawOp === 3 /* quadraticCurveTo */) {
3185
- di += 4;
3186
- } else if (drawOp === 4 /* closePath */) {
3187
- if (curX !== pathStartX || curY !== pathStartY) {
3188
- currentPath.push({ x1: curX, y1: curY, x2: pathStartX, y2: pathStartY });
3189
- }
3190
- curX = pathStartX;
3191
- curY = pathStartY;
3192
- } else {
3193
- break;
3194
- }
3195
- }
3196
- }
3197
- if (afterOp === import_pdf.OPS.stroke || afterOp === import_pdf.OPS.closeStroke) {
3198
- flushPath(true);
3199
- } else if (afterOp === import_pdf.OPS.fill || afterOp === import_pdf.OPS.eoFill || afterOp === import_pdf.OPS.fillStroke || afterOp === import_pdf.OPS.eoFillStroke || afterOp === import_pdf.OPS.closeFillStroke || afterOp === import_pdf.OPS.closeEOFillStroke) {
3200
- flushPath(true);
3201
- } else if (afterOp === import_pdf.OPS.endPath) {
3202
- flushPath(false);
3203
- }
3204
- }
3205
- break;
3206
- }
3207
- case import_pdf.OPS.stroke:
3208
- case import_pdf.OPS.closeStroke:
3209
- flushPath(true);
3210
- break;
3211
- case import_pdf.OPS.fill:
3212
- case import_pdf.OPS.eoFill:
3213
- case import_pdf.OPS.fillStroke:
3214
- case import_pdf.OPS.eoFillStroke:
3215
- case import_pdf.OPS.closeFillStroke:
3216
- case import_pdf.OPS.closeEOFillStroke:
3217
- flushPath(true);
3218
- break;
3219
- case import_pdf.OPS.endPath:
3220
- flushPath(false);
3221
- break;
3222
- }
3223
- }
3224
- return { horizontals, verticals };
3225
- }
3226
- function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
3227
- const dx = Math.abs(seg.x2 - seg.x1);
3228
- const dy = Math.abs(seg.y2 - seg.y1);
3229
- const length = Math.sqrt(dx * dx + dy * dy);
3230
- if (length < MIN_LINE_LENGTH) return;
3231
- if (dy <= ORIENTATION_TOL) {
3232
- const y = (seg.y1 + seg.y2) / 2;
3233
- const x1 = Math.min(seg.x1, seg.x2);
3234
- const x2 = Math.max(seg.x1, seg.x2);
3235
- horizontals.push({ x1, y1: y, x2, y2: y, lineWidth });
3236
- } else if (dx <= ORIENTATION_TOL) {
3237
- const x = (seg.x1 + seg.x2) / 2;
3238
- const y1 = Math.min(seg.y1, seg.y2);
3239
- const y2 = Math.max(seg.y1, seg.y2);
3240
- verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
3241
- }
3242
- }
3243
- function preprocessLines(horizontals, verticals) {
3244
- let h = horizontals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
3245
- let v = verticals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
3246
- h = mergeParallelLines(h, "h");
3247
- v = mergeParallelLines(v, "v");
3248
- return { horizontals: h, verticals: v };
3249
- }
3250
- function mergeParallelLines(lines, dir) {
3251
- if (lines.length <= 1) return lines;
3252
- const sorted = [...lines].sort((a, b) => {
3253
- const posA = dir === "h" ? a.y1 : a.x1;
3254
- const posB = dir === "h" ? b.y1 : b.x1;
3255
- if (Math.abs(posA - posB) > 0.1) return posA - posB;
3256
- return dir === "h" ? a.x1 - b.x1 : a.y1 - b.y1;
3257
- });
3258
- const MERGE_TOL = 3;
3259
- const result = [sorted[0]];
3260
- for (let i = 1; i < sorted.length; i++) {
3261
- const prev = result[result.length - 1];
3262
- const curr = sorted[i];
3263
- const prevPos = dir === "h" ? prev.y1 : prev.x1;
3264
- const currPos = dir === "h" ? curr.y1 : curr.x1;
3265
- if (Math.abs(prevPos - currPos) <= MERGE_TOL) {
3266
- const prevStart = dir === "h" ? prev.x1 : prev.y1;
3267
- const prevEnd = dir === "h" ? prev.x2 : prev.y2;
3268
- const currStart = dir === "h" ? curr.x1 : curr.y1;
3269
- const currEnd = dir === "h" ? curr.x2 : curr.y2;
3270
- const overlap = Math.min(prevEnd, currEnd) - Math.max(prevStart, currStart);
3271
- const minLen = Math.min(prevEnd - prevStart, currEnd - currStart);
3272
- if (overlap > minLen * 0.3) {
3273
- if (dir === "h") {
3274
- prev.x1 = Math.min(prev.x1, curr.x1);
3275
- prev.x2 = Math.max(prev.x2, curr.x2);
3276
- prev.y1 = (prev.y1 + curr.y1) / 2;
3277
- prev.y2 = prev.y1;
3278
- } else {
3279
- prev.y1 = Math.min(prev.y1, curr.y1);
3280
- prev.y2 = Math.max(prev.y2, curr.y2);
3281
- prev.x1 = (prev.x1 + curr.x1) / 2;
3282
- prev.x2 = prev.x1;
3283
- }
3284
- prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
3285
- continue;
3286
- }
3287
- }
3288
- result.push(curr);
3289
- }
3290
- return result;
3291
- }
3292
- function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
3293
- const margin = 5;
3294
- return {
3295
- horizontals: horizontals.filter(
3296
- (l) => !(Math.abs(l.y1) < margin || Math.abs(l.y1 - pageHeight) < margin) || l.x2 - l.x1 < pageWidth * 0.9
3297
- ),
3298
- verticals: verticals.filter(
3299
- (l) => !(Math.abs(l.x1) < margin || Math.abs(l.x1 - pageWidth) < margin) || l.y2 - l.y1 < pageHeight * 0.9
3300
- )
3301
- };
2531
+
2532
+ var MAX_SHEETS = 100;
2533
+ var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024;
2534
+ var MAX_ROWS2 = 1e4;
2535
+ var MAX_COLS2 = 200;
2536
+ function cleanNumericValue(raw) {
2537
+ if (!/^-?\d+\.\d+$/.test(raw)) return raw;
2538
+ const num = parseFloat(raw);
2539
+ if (!isFinite(num)) return raw;
2540
+ const cleaned = parseFloat(num.toPrecision(15)).toString();
2541
+ return cleaned;
3302
2542
  }
3303
- function buildVertices(horizontals, verticals) {
3304
- const vertices = [];
3305
- const tol = CONNECT_TOL;
3306
- for (const h of horizontals) {
3307
- for (const v of verticals) {
3308
- if (v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol && h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol) {
3309
- const radius = Math.max(h.lineWidth, v.lineWidth, 1);
3310
- vertices.push({ x: v.x1, y: h.y1, radius });
3311
- }
3312
- }
3313
- }
3314
- return vertices;
3315
- }
3316
- function mergeVertices(vertices) {
3317
- if (vertices.length <= 1) return vertices;
3318
- const merged = [];
3319
- const used = new Array(vertices.length).fill(false);
3320
- for (let i = 0; i < vertices.length; i++) {
3321
- if (used[i]) continue;
3322
- let sumX = vertices[i].x, sumY = vertices[i].y;
3323
- let maxRadius = vertices[i].radius;
3324
- let count = 1;
3325
- for (let j = i + 1; j < vertices.length; j++) {
3326
- if (used[j]) continue;
3327
- const mergeTol = VERTEX_MERGE_FACTOR * Math.max(maxRadius, vertices[j].radius);
3328
- if (Math.abs(vertices[i].x - vertices[j].x) <= mergeTol && Math.abs(vertices[i].y - vertices[j].y) <= mergeTol) {
3329
- sumX += vertices[j].x;
3330
- sumY += vertices[j].y;
3331
- maxRadius = Math.max(maxRadius, vertices[j].radius);
3332
- count++;
3333
- used[j] = true;
3334
- }
3335
- }
3336
- merged.push({ x: sumX / count, y: sumY / count, radius: maxRadius });
3337
- }
3338
- return merged;
2543
+ function parseCellRef(ref) {
2544
+ const m = ref.match(/^([A-Z]+)(\d+)$/);
2545
+ if (!m) return null;
2546
+ let col = 0;
2547
+ for (const ch of m[1]) col = col * 26 + (ch.charCodeAt(0) - 64);
2548
+ return { col: col - 1, row: parseInt(m[2], 10) - 1 };
3339
2549
  }
3340
- function buildTableGrids(horizontals, verticals) {
3341
- if (horizontals.length < 2 || verticals.length < 2) return [];
3342
- const allVertices = buildVertices(horizontals, verticals);
3343
- const vertices = mergeVertices(allVertices);
3344
- if (vertices.length < 4) return [];
3345
- const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
3346
- const allLines = [
3347
- ...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
3348
- ...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
3349
- ];
3350
- const groups = groupConnectedLines(allLines);
3351
- const grids = [];
3352
- for (const group of groups) {
3353
- const hLines = group.filter((l) => l.type === "h");
3354
- const vLines = group.filter((l) => l.type === "v");
3355
- if (hLines.length < 2 || vLines.length < 2) continue;
3356
- let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
3357
- for (const l of vLines) {
3358
- if (l.x1 < gx1) gx1 = l.x1;
3359
- if (l.x1 > gx2) gx2 = l.x1;
3360
- }
3361
- for (const l of hLines) {
3362
- if (l.y1 < gy1) gy1 = l.y1;
3363
- if (l.y1 > gy2) gy2 = l.y1;
3364
- }
3365
- const groupBbox = {
3366
- x1: gx1 - CONNECT_TOL,
3367
- y1: gy1 - CONNECT_TOL,
3368
- x2: gx2 + CONNECT_TOL,
3369
- y2: gy2 + CONNECT_TOL
3370
- };
3371
- const groupVertices = vertices.filter(
3372
- (v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
3373
- );
3374
- const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
3375
- const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
3376
- const rawYs = [
3377
- ...hLines.map((l) => l.y1),
3378
- ...groupVertices.map((v) => v.y)
3379
- ];
3380
- const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
3381
- const rawXs = [
3382
- ...vLines.map((l) => l.x1),
3383
- ...groupVertices.map((v) => v.x)
3384
- ];
3385
- const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
3386
- if (rowYs.length < 2 || colXs.length < 2) continue;
3387
- const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
3388
- const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
3389
- if (validRowYs.length < 2 || validColXs.length < 2) continue;
3390
- const bbox = {
3391
- x1: validColXs[0],
3392
- y1: validRowYs[validRowYs.length - 1],
3393
- x2: validColXs[validColXs.length - 1],
3394
- y2: validRowYs[0]
3395
- };
3396
- grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
3397
- }
3398
- return mergeAdjacentGrids(grids);
2550
+ function parseMergeRef(ref) {
2551
+ const parts = ref.split(":");
2552
+ if (parts.length !== 2) return null;
2553
+ const start = parseCellRef(parts[0]);
2554
+ const end = parseCellRef(parts[1]);
2555
+ if (!start || !end) return null;
2556
+ return { startCol: start.col, startRow: start.row, endCol: end.col, endRow: end.row };
3399
2557
  }
3400
- function enforceMinWidth(colXs, minWidth) {
3401
- if (colXs.length <= 2) return colXs;
3402
- const result = [colXs[0]];
3403
- for (let i = 1; i < colXs.length; i++) {
3404
- const prevX = result[result.length - 1];
3405
- if (colXs[i] - prevX < minWidth && i < colXs.length - 1) {
3406
- continue;
3407
- }
3408
- result.push(colXs[i]);
3409
- }
2558
+ function getElements(parent, tagName) {
2559
+ const nodes = parent.getElementsByTagName(tagName);
2560
+ const result = [];
2561
+ for (let i = 0; i < nodes.length; i++) result.push(nodes[i]);
3410
2562
  return result;
3411
2563
  }
3412
- function enforceMinHeight(rowYs, minHeight) {
3413
- if (rowYs.length <= 2) return rowYs;
3414
- const result = [rowYs[0]];
3415
- for (let i = 1; i < rowYs.length; i++) {
3416
- const prevY = result[result.length - 1];
3417
- if (prevY - rowYs[i] < minHeight && i < rowYs.length - 1) {
3418
- continue;
3419
- }
3420
- result.push(rowYs[i]);
3421
- }
3422
- return result;
2564
+ function getTextContent(el) {
2565
+ return _nullishCoalesce(_optionalChain([el, 'access', _37 => _37.textContent, 'optionalAccess', _38 => _38.trim, 'call', _39 => _39()]), () => ( ""));
3423
2566
  }
3424
- function mergeAdjacentGrids(grids) {
3425
- if (grids.length <= 1) return grids;
3426
- const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
3427
- const merged = [sorted[0]];
3428
- for (let i = 1; i < sorted.length; i++) {
3429
- const prev = merged[merged.length - 1];
3430
- const curr = sorted[i];
3431
- if (prev.colXs.length === curr.colXs.length) {
3432
- const mergeTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(prev.vertexRadius, curr.vertexRadius), 6) * 3;
3433
- const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= mergeTol);
3434
- const verticalGap = prev.bbox.y1 - curr.bbox.y2;
3435
- if (colMatch && verticalGap >= -CONNECT_TOL && verticalGap <= 20) {
3436
- const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
3437
- merged[merged.length - 1] = {
3438
- rowYs: allRowYs,
3439
- colXs: prev.colXs,
3440
- bbox: {
3441
- x1: Math.min(prev.bbox.x1, curr.bbox.x1),
3442
- y1: Math.min(prev.bbox.y1, curr.bbox.y1),
3443
- x2: Math.max(prev.bbox.x2, curr.bbox.x2),
3444
- y2: Math.max(prev.bbox.y2, curr.bbox.y2)
3445
- },
3446
- vertexRadius: Math.max(prev.vertexRadius, curr.vertexRadius)
3447
- };
3448
- continue;
3449
- }
3450
- }
3451
- merged.push(curr);
3452
- }
3453
- return merged;
2567
+ function parseXml(text) {
2568
+ return new (0, _xmldom.DOMParser)().parseFromString(_chunkHXUCZ2ILcjs.stripDtd.call(void 0, text), "text/xml");
3454
2569
  }
3455
- function clusterCoordinates(values, tolerance) {
3456
- if (values.length === 0) return [];
3457
- const sorted = [...values].sort((a, b) => a - b);
3458
- const clusters = [{ sum: sorted[0], count: 1 }];
3459
- for (let i = 1; i < sorted.length; i++) {
3460
- const last = clusters[clusters.length - 1];
3461
- const avg = last.sum / last.count;
3462
- if (Math.abs(sorted[i] - avg) <= tolerance) {
3463
- last.sum += sorted[i];
3464
- last.count++;
3465
- } else {
3466
- clusters.push({ sum: sorted[i], count: 1 });
3467
- }
2570
+ function parseSharedStrings(xml) {
2571
+ const doc = parseXml(xml);
2572
+ const strings = [];
2573
+ const siList = getElements(doc.documentElement, "si");
2574
+ for (const si of siList) {
2575
+ const tElements = getElements(si, "t");
2576
+ strings.push(tElements.map((t) => _nullishCoalesce(t.textContent, () => ( ""))).join(""));
3468
2577
  }
3469
- return clusters.map((c) => c.sum / c.count);
3470
- }
3471
- function groupConnectedLines(lines) {
3472
- const parent = lines.map((_, i) => i);
3473
- function find(x) {
3474
- while (parent[x] !== x) {
3475
- parent[x] = parent[parent[x]];
3476
- x = parent[x];
3477
- }
3478
- return x;
3479
- }
3480
- function union(a, b) {
3481
- const ra = find(a), rb = find(b);
3482
- if (ra !== rb) parent[ra] = rb;
3483
- }
3484
- for (let i = 0; i < lines.length; i++) {
3485
- for (let j = i + 1; j < lines.length; j++) {
3486
- if (linesIntersect(lines[i], lines[j])) {
3487
- union(i, j);
3488
- }
3489
- }
3490
- }
3491
- const groups = /* @__PURE__ */ new Map();
3492
- for (let i = 0; i < lines.length; i++) {
3493
- const root = find(i);
3494
- if (!groups.has(root)) groups.set(root, []);
3495
- groups.get(root).push(lines[i]);
3496
- }
3497
- return [...groups.values()];
3498
- }
3499
- function linesIntersect(a, b) {
3500
- if (a.type === b.type) {
3501
- if (a.type === "h") {
3502
- if (Math.abs(a.y1 - b.y1) > CONNECT_TOL) return false;
3503
- return Math.min(a.x2, b.x2) >= Math.max(a.x1, b.x1) - CONNECT_TOL;
3504
- } else {
3505
- if (Math.abs(a.x1 - b.x1) > CONNECT_TOL) return false;
3506
- return Math.min(a.y2, b.y2) >= Math.max(a.y1, b.y1) - CONNECT_TOL;
3507
- }
3508
- }
3509
- const h = a.type === "h" ? a : b;
3510
- const v = a.type === "h" ? b : a;
3511
- const tol = CONNECT_TOL;
3512
- return v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol && h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol;
3513
- }
3514
- function extractCells(grid, horizontals, verticals) {
3515
- const { rowYs, colXs } = grid;
3516
- const numRows = rowYs.length - 1;
3517
- const numCols = colXs.length - 1;
3518
- if (numRows <= 0 || numCols <= 0) return [];
3519
- const vBorders = Array.from(
3520
- { length: numRows },
3521
- (_, r) => Array.from(
3522
- { length: numCols + 1 },
3523
- (_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
3524
- )
3525
- );
3526
- const hBorders = Array.from(
3527
- { length: numRows + 1 },
3528
- (_, r) => Array.from(
3529
- { length: numCols },
3530
- (_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
3531
- )
3532
- );
3533
- const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
3534
- const cells = [];
3535
- for (let r = 0; r < numRows; r++) {
3536
- for (let c = 0; c < numCols; c++) {
3537
- if (occupied[r][c]) continue;
3538
- let colSpan = 1;
3539
- let rowSpan = 1;
3540
- while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
3541
- let canExpand = true;
3542
- for (let dr = 0; dr < rowSpan; dr++) {
3543
- if (vBorders[r + dr][c + colSpan]) {
3544
- canExpand = false;
3545
- break;
3546
- }
3547
- }
3548
- if (!canExpand) break;
3549
- colSpan++;
3550
- }
3551
- while (r + rowSpan < numRows) {
3552
- let hasLine = false;
3553
- for (let dc = 0; dc < colSpan; dc++) {
3554
- if (hBorders[r + rowSpan][c + dc]) {
3555
- hasLine = true;
3556
- break;
3557
- }
3558
- }
3559
- if (hasLine) break;
3560
- rowSpan++;
3561
- }
3562
- for (let dr = 0; dr < rowSpan; dr++) {
3563
- for (let dc = 0; dc < colSpan; dc++) {
3564
- occupied[r + dr][c + dc] = true;
3565
- }
3566
- }
3567
- cells.push({
3568
- row: r,
3569
- col: c,
3570
- rowSpan,
3571
- colSpan,
3572
- bbox: {
3573
- x1: colXs[c],
3574
- y1: rowYs[r + rowSpan],
3575
- x2: colXs[c + colSpan],
3576
- y2: rowYs[r]
3577
- }
3578
- });
3579
- }
3580
- }
3581
- return cells;
3582
- }
3583
- function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
3584
- const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
3585
- for (const v of verticals) {
3586
- if (Math.abs(v.x1 - x) <= tol) {
3587
- const cellH = Math.abs(topY - botY);
3588
- if (cellH < 0.1) continue;
3589
- const overlapTop = Math.min(v.y2, topY);
3590
- const overlapBot = Math.max(v.y1, botY);
3591
- const overlap = overlapTop - overlapBot;
3592
- if (overlap >= cellH * 0.75) return true;
3593
- }
3594
- }
3595
- return false;
3596
- }
3597
- function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
3598
- const tol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
3599
- for (const h of horizontals) {
3600
- if (Math.abs(h.y1 - y) <= tol) {
3601
- const cellW = Math.abs(rightX - leftX);
3602
- if (cellW < 0.1) continue;
3603
- const overlapLeft = Math.max(h.x1, leftX);
3604
- const overlapRight = Math.min(h.x2, rightX);
3605
- const overlap = overlapRight - overlapLeft;
3606
- if (overlap >= cellW * 0.75) return true;
3607
- }
3608
- }
3609
- return false;
3610
- }
3611
- function mapTextToCells(items, cells) {
3612
- const result = /* @__PURE__ */ new Map();
3613
- for (const cell of cells) {
3614
- result.set(cell, []);
3615
- }
3616
- for (const item of items) {
3617
- const pad = CELL_PADDING;
3618
- let bestCell = null;
3619
- let bestScore = 0;
3620
- for (const cell of cells) {
3621
- const ix1 = Math.max(item.x, cell.bbox.x1 - pad);
3622
- const ix2 = Math.min(item.x + item.w, cell.bbox.x2 + pad);
3623
- const iy1 = Math.max(item.y, cell.bbox.y1 - pad);
3624
- const iy2 = Math.min(item.y + (item.h || item.fontSize), cell.bbox.y2 + pad);
3625
- if (ix1 >= ix2 || iy1 >= iy2) continue;
3626
- const intersectArea = (ix2 - ix1) * (iy2 - iy1);
3627
- const itemArea = Math.max(item.w, 1) * Math.max(item.h || item.fontSize, 1);
3628
- const score = intersectArea / itemArea;
3629
- if (score > bestScore) {
3630
- bestScore = score;
3631
- bestCell = cell;
3632
- }
3633
- }
3634
- if (bestCell && bestScore > 0.3) {
3635
- result.get(bestCell).push(item);
3636
- }
3637
- }
3638
- return result;
3639
- }
3640
- function cellTextToString(items) {
3641
- if (items.length === 0) return "";
3642
- if (items.length === 1) return items[0].text;
3643
- const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
3644
- const lines = [];
3645
- let curLine = [sorted[0]];
3646
- let curY = sorted[0].y;
3647
- for (let i = 1; i < sorted.length; i++) {
3648
- const tol = Math.max(3, Math.min(sorted[i].fontSize, curLine[0].fontSize) * 0.6);
3649
- if (Math.abs(sorted[i].y - curY) <= tol) {
3650
- curLine.push(sorted[i]);
3651
- } else {
3652
- lines.push(curLine);
3653
- curLine = [sorted[i]];
3654
- curY = sorted[i].y;
3655
- }
3656
- }
3657
- lines.push(curLine);
3658
- const textLines = lines.map((line) => {
3659
- const s = line.sort((a, b) => a.x - b.x);
3660
- if (s.length === 1) return s[0].text;
3661
- const evenSpaced = detectEvenSpacedItems(s);
3662
- let result = s[0].text;
3663
- for (let j = 1; j < s.length; j++) {
3664
- if (evenSpaced[j]) {
3665
- result += s[j].text;
3666
- continue;
3667
- }
3668
- const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
3669
- const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
3670
- const prevIsKorean = /[가-힣]$/.test(result);
3671
- const currIsKorean = /^[가-힣]/.test(s[j].text);
3672
- if (gap < avgFs * 0.15) {
3673
- result += s[j].text;
3674
- } else if (gap < avgFs * 0.35 && (prevIsKorean || currIsKorean)) {
3675
- result += s[j].text;
3676
- } else {
3677
- result += " " + s[j].text;
3678
- }
3679
- }
3680
- return result;
3681
- });
3682
- return mergeCellTextLines(textLines);
3683
- }
3684
- function detectEvenSpacedItems(items) {
3685
- const result = new Array(items.length).fill(false);
3686
- if (items.length < 3) return result;
3687
- let runStart = -1;
3688
- for (let i = 0; i < items.length; i++) {
3689
- const isShortKorean = /^[가-힣]{1}$/.test(items[i].text) || /^[\d]{1}$/.test(items[i].text);
3690
- if (isShortKorean && runStart >= 0 && i > 0) {
3691
- const gap = items[i].x - (items[i - 1].x + items[i - 1].w);
3692
- const maxRunGap = Math.max(items[i].fontSize * 3, 30);
3693
- if (gap > maxRunGap) {
3694
- if (i - runStart >= 3) markEvenRun(items, result, runStart, i);
3695
- runStart = i;
3696
- continue;
3697
- }
3698
- }
3699
- if (isShortKorean) {
3700
- if (runStart < 0) runStart = i;
3701
- } else {
3702
- if (runStart >= 0 && i - runStart >= 3) {
3703
- markEvenRun(items, result, runStart, i);
3704
- }
3705
- runStart = -1;
3706
- }
3707
- }
3708
- if (runStart >= 0 && items.length - runStart >= 3) {
3709
- markEvenRun(items, result, runStart, items.length);
3710
- }
3711
- return result;
3712
- }
3713
- function markEvenRun(items, result, start, end) {
3714
- const gaps = [];
3715
- for (let i = start + 1; i < end; i++) {
3716
- gaps.push(items[i].x - (items[i - 1].x + items[i - 1].w));
3717
- }
3718
- const posGaps = gaps.filter((g2) => g2 > 0);
3719
- if (posGaps.length < 2) return;
3720
- let minGap = Infinity, maxGap = -Infinity;
3721
- for (const g2 of posGaps) {
3722
- if (g2 < minGap) minGap = g2;
3723
- if (g2 > maxGap) maxGap = g2;
3724
- }
3725
- const avgFs = items[start].fontSize;
3726
- if (minGap >= avgFs * 0.1 && maxGap <= avgFs * 3 && maxGap / Math.max(minGap, 0.1) <= 3) {
3727
- for (let i = start + 1; i < end; i++) {
3728
- result[i] = true;
3729
- }
3730
- }
3731
- }
3732
- function mergeCellTextLines(textLines) {
3733
- if (textLines.length <= 1) return textLines[0] || "";
3734
- const merged = [textLines[0]];
3735
- for (let i = 1; i < textLines.length; i++) {
3736
- const prev = merged[merged.length - 1];
3737
- const curr = textLines[i];
3738
- if (/[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <= 8 && !curr.includes(" ")) {
3739
- merged[merged.length - 1] = prev + curr;
3740
- } else if (curr.trim().length <= 3 && /^[)\]%}]/.test(curr.trim())) {
3741
- merged[merged.length - 1] = prev + curr.trim();
3742
- } else if (/[,(]$/.test(prev.trim()) && curr.trim().length <= 15) {
3743
- merged[merged.length - 1] = prev + curr.trim();
3744
- } else if (/[\d,]$/.test(prev) && /^[\d,]+[)\]]?$/.test(curr.trim()) && curr.trim().length <= 10) {
3745
- merged[merged.length - 1] = prev + curr.trim();
3746
- } else {
3747
- merged.push(curr);
3748
- }
3749
- }
3750
- return merged.join("\n");
3751
- }
3752
-
3753
- // src/pdf/cluster-detector.ts
3754
- var Y_TOL = 3;
3755
- var COL_CLUSTER_TOL = 15;
3756
- var MIN_ROWS = 3;
3757
- var MIN_COLS = 2;
3758
- var MIN_GAP_FACTOR = 2;
3759
- var MIN_GAP_ABSOLUTE = 20;
3760
- var MIN_COL_FILL_RATIO = 0.4;
3761
/**
 * Detects tables on a page from text positions alone.
 *
 * Strategy 1 looks for a header row and grows table regions beneath it.
 * Strategy 2 runs only when strategy 1 produced nothing: it clusters the x
 * positions of rows that contain suspiciously wide gaps and uses those as columns.
 *
 * @param {Array<object>} items positioned text items of one page
 * @param {number} pageNum page number copied into each result's bbox
 * @returns {Array<object>} results produced by buildClusterTable (possibly empty)
 */
function detectClusterTables(items, pageNum) {
  if (items.length < MIN_ROWS * MIN_COLS) return [];
  const { merged, originMap } = mergeEvenSpacedClusters(items);
  const rows = groupByBaseline(merged);
  if (rows.length < MIN_ROWS) return [];

  const results = [];
  // Records a built table and marks the original glyph items behind any merged item as used.
  const collect = (table) => {
    if (!table) return;
    expandUsedItems(table.usedItems, originMap);
    results.push(table);
  };

  const headerResult = detectHeaderRow(rows);
  if (headerResult) {
    const { columns, headerIdx } = headerResult;
    const headerItems = [...rows[headerIdx].items].sort((a, b) => a.x - b.x);
    const mergedRows = mergeMultiLineRows(rows.slice(headerIdx), columns);
    for (const region of findTableRegionsByHeader(mergedRows, columns, headerItems)) {
      collect(buildClusterTable(region.rows, columns, pageNum));
    }
  }
  if (results.length > 0) return results;

  const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
  if (suspiciousRows.length < MIN_ROWS) return results;
  const columns = extractColumnClusters(suspiciousRows);
  if (columns.length < MIN_COLS) return results;
  for (const region of findTableRegions(rows, columns)) {
    collect(buildClusterTable(mergeMultiLineRows(region.rows, columns), columns, pageNum));
  }
  return results;
}
3802
/**
 * Collapses runs of evenly spaced single-glyph items (one Hangul syllable or
 * one digit each) into a single synthetic item per run.
 *
 * A run needs at least three glyphs on the same baseline row, every gap within
 * [0.1, 3] x the font size of the glyph after it, all gaps positive, and the
 * largest gap at most three times the smallest.
 *
 * @param {Array<object>} items positioned text items
 * @returns {{merged: Array<object>, originMap: Map<object, Array<object>>}}
 *   `merged` is the item list with runs replaced; `originMap` maps each
 *   synthetic item to the original items it replaced.
 */
function mergeEvenSpacedClusters(items) {
  const originMap = new Map();
  const merged = [];
  const isLoneGlyph = (item) => /^[가-힣\d]$/.test(item.text);

  for (const row of groupByBaseline(items)) {
    const sorted = [...row.items].sort((a, b) => a.x - b.x);
    // Horizontal space between item k and the item to its left.
    const gapBefore = (k) => sorted[k].x - (sorted[k - 1].x + sorted[k - 1].w);

    let i = 0;
    while (i < sorted.length) {
      // Extend [i, runEnd) over consecutive lone glyphs with plausible spacing.
      let runEnd = i;
      if (isLoneGlyph(sorted[i])) {
        runEnd = i + 1;
        while (runEnd < sorted.length && isLoneGlyph(sorted[runEnd])) {
          const gap = gapBefore(runEnd);
          const fs = sorted[runEnd].fontSize;
          if (gap < fs * 0.1 || gap > fs * 3) break;
          runEnd++;
        }
      }

      if (runEnd - i >= 3) {
        let minG = Infinity;
        let maxG = -Infinity;
        for (let k = i + 1; k < runEnd; k++) {
          const gap = gapBefore(k);
          if (gap < minG) minG = gap;
          if (gap > maxG) maxG = gap;
        }
        if (minG > 0 && maxG / minG <= 3) {
          const run = sorted.slice(i, runEnd);
          const first = run[0];
          const last = run[run.length - 1];
          const item = {
            text: run.map((r) => r.text).join(""),
            x: first.x,
            y: first.y,
            w: last.x + last.w - first.x,
            h: first.h,
            fontSize: first.fontSize,
            fontName: first.fontName
          };
          originMap.set(item, run);
          merged.push(item);
          i = runEnd;
          continue;
        }
      }

      // Not part of a mergeable run: keep the item as is.
      merged.push(sorted[i]);
      i++;
    }
  }
  return { merged, originMap };
}
3854
/**
 * Adds to `usedItems` the original items behind every synthetic item it contains.
 *
 * @param {Set<object>} usedItems mutated in place
 * @param {Map<object, Iterable<object>>} originMap synthetic item -> original items
 */
function expandUsedItems(usedItems, originMap) {
  // Iterate over a snapshot so the additions below are not visited themselves.
  for (const item of Array.from(usedItems)) {
    const origins = originMap.get(item);
    if (!origins) continue;
    for (const origin of origins) usedItems.add(origin);
  }
}
3862
/**
 * Finds the first row that looks like a table header.
 *
 * A header candidate has between MIN_COLS and 6 items, none longer than 8
 * characters, at least one containing Hangul, none starting with a bullet-like
 * marker, spans at least 40% of the overall horizontal extent, and contains a
 * gap of at least 2.5 average font sizes. It is accepted when at least
 * MIN_ROWS of the rows after it hit MIN_COLS or more header columns; the scan
 * stops once MIN_ROWS + 2 such rows have been seen.
 *
 * @param {Array<{y: number, items: Array<object>}>} rows baseline rows
 * @returns {{columns: Array<{x: number, count: number}>, headerIdx: number} | null}
 */
function detectHeaderRow(rows) {
  const allItems = rows.flatMap((r) => r.items);
  if (allItems.length === 0) return null;

  let allMinX = Infinity;
  let allMaxX = -Infinity;
  for (const item of allItems) {
    if (item.x < allMinX) allMinX = item.x;
    const right = item.x + item.w;
    if (right > allMaxX) allMaxX = right;
  }
  const pageSpan = allMaxX - allMinX;
  if (pageSpan <= 0) return null;

  for (let ri = 0; ri < rows.length; ri++) {
    const { items } = rows[ri];
    if (items.length < MIN_COLS || items.length > 6) continue;
    const rejected =
      items.some((i) => i.text.length > 8) ||
      !items.some((i) => /[가-힣]/.test(i.text)) ||
      items.some((i) => /^[□■○●·※▶▷◆◇\-]/.test(i.text));
    if (rejected) continue;

    const sorted = [...items].sort((a, b) => a.x - b.x);
    const rightmost = sorted[sorted.length - 1];
    const xSpan = rightmost.x + rightmost.w - sorted[0].x;
    if (xSpan / pageSpan < 0.4) continue;

    const avgFs = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
    const hasLargeGap = sorted.some(
      (item, k) => k > 0 && item.x - (sorted[k - 1].x + sorted[k - 1].w) >= avgFs * 2.5
    );
    if (!hasLargeGap) continue;

    const columns = sorted.map((item) => ({ x: item.x, count: 0 }));
    let matchCount = 0;
    for (let j = ri + 1; j < rows.length && matchCount < MIN_ROWS + 2; j++) {
      if (countMatchedColumnsRange(rows[j], columns, sorted) >= MIN_COLS) matchCount++;
    }
    if (matchCount >= MIN_ROWS) return { columns, headerIdx: ri };
  }
  return null;
}
3903
/**
 * Folds wrapped continuation lines into the row above them.
 *
 * A row is folded into the previous kept row when it is vertically close
 * (< 1.8 average font sizes), has at most two items, and either hits fewer
 * than MIN_COLS columns or consists of a single item.
 *
 * @param {Array<{y: number, items: Array<object>}>} rows rows in order
 * @param {Array<{x: number}>} columns column anchors
 * @returns {Array<{y: number, items: Array<object>}>} rows after folding
 *   (the input array itself when it has at most one row)
 */
function mergeMultiLineRows(rows, columns) {
  if (rows.length <= 1) return rows;

  let fontTotal = 0;
  let fontCount = 0;
  for (const row of rows) {
    for (const item of row.items) {
      fontTotal += item.fontSize;
      fontCount++;
    }
  }
  // 12 is the fallback used when no item carries a font size.
  const avgFontSize = fontCount > 0 ? fontTotal / fontCount : 12;

  const result = [rows[0]];
  for (const curr of rows.slice(1)) {
    const last = result.length - 1;
    const prev = result[last];
    const isClose = Math.abs(prev.y - curr.y) < avgFontSize * 1.8;
    const isSparse = curr.items.length <= 2;
    const isOffGrid = countMatchedColumns(curr, columns) < MIN_COLS || curr.items.length === 1;
    if (isClose && isSparse && isOffGrid) {
      result[last] = { y: prev.y, items: [...prev.items, ...curr.items] };
    } else {
      result.push(curr);
    }
  }
  return result;
}
3924
/**
 * Groups items into rows by y coordinate.
 *
 * Items are ordered by descending y (ties by ascending x). An item joins the
 * current row while its y is within Y_TOL of the y of the row's first item;
 * otherwise it starts a new row.
 *
 * @param {Array<object>} items positioned text items
 * @returns {Array<{y: number, items: Array<object>}>} rows in descending-y order
 */
function groupByBaseline(items) {
  if (items.length === 0) return [];
  const ordered = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
  const rows = [];
  let current = null;
  for (const item of ordered) {
    if (current !== null && Math.abs(item.y - current.y) <= Y_TOL) {
      current.items.push(item);
    } else {
      current = { y: item.y, items: [item] };
      rows.push(current);
    }
  }
  return rows;
}
3942
/**
 * Tells whether a row contains a horizontal gap wide enough to suggest columns.
 *
 * The threshold is max(average font size * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE).
 * Two-item rows whose right-hand item is longer than 20 characters are never
 * considered suspicious.
 *
 * @param {{items: Array<object>}} row
 * @returns {boolean}
 */
function hasSuspiciousGaps(row) {
  const count = row.items.length;
  if (count < 2) return false;
  const ordered = [...row.items].sort((a, b) => a.x - b.x);
  if (count === 2 && ordered[1].text.length > 20) return false;
  const meanFontSize = ordered.reduce((sum, item) => sum + item.fontSize, 0) / count;
  const threshold = Math.max(meanFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
  return ordered.some(
    (item, k) => k > 0 && item.x - (ordered[k - 1].x + ordered[k - 1].w) >= threshold
  );
}
3954
/**
 * Derives column anchors by clustering the left x positions of all items.
 *
 * Sorted x values are chained into one cluster while consecutive values differ
 * by at most COL_CLUSTER_TOL. Each cluster is reported as its rounded mean x
 * plus its size; clusters with fewer than
 * max(2, floor(rows.length * MIN_COL_FILL_RATIO)) members are discarded.
 *
 * @param {Array<{items: Array<object>}>} rows
 * @returns {Array<{x: number, count: number}>} clusters sorted by x
 */
function extractColumnClusters(rows) {
  const xs = [];
  for (const row of rows) {
    for (const item of row.items) xs.push(item.x);
  }
  if (xs.length === 0) return [];
  xs.sort((a, b) => a - b);

  const summarize = (bucket) => ({
    x: Math.round(bucket.reduce((sum, v) => sum + v, 0) / bucket.length),
    count: bucket.length
  });

  const clusters = [];
  let bucket = [xs[0]];
  for (let i = 1; i < xs.length; i++) {
    if (xs[i] - xs[i - 1] > COL_CLUSTER_TOL) {
      clusters.push(summarize(bucket));
      bucket = [];
    }
    bucket.push(xs[i]);
  }
  clusters.push(summarize(bucket));

  const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
  return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
}
3974
/**
 * Splits rows into table regions using header-derived column ranges.
 *
 * A row that hits at least MIN_COLS columns always extends the current region.
 * A non-matching row is tolerated inside an open region when it has at most
 * two items or directly follows a matching row. Any other row closes the
 * region. On close, trailing non-matching rows are trimmed and the region is
 * kept only if at least MIN_ROWS rows remain.
 *
 * @param {Array<{items: Array<object>}>} allRows rows starting at the header
 * @param {Array<object>} columns passed through to countMatchedColumnsRange
 * @param {Array<object>} headerItems header items sorted by x
 * @returns {Array<{rows: Array<object>}>}
 */
function findTableRegionsByHeader(allRows, columns, headerItems) {
  const regions = [];
  let currentRegion = [];
  let missStreak = 0;
  const matches = (row) => countMatchedColumnsRange(row, columns, headerItems) >= MIN_COLS;

  // Trim the non-matching tail, emit the region if it is large enough, then reset.
  const closeRegion = () => {
    while (currentRegion.length > 0 && !matches(currentRegion[currentRegion.length - 1])) {
      currentRegion.pop();
    }
    if (currentRegion.length >= MIN_ROWS) regions.push({ rows: currentRegion });
    currentRegion = [];
    missStreak = 0;
  };

  for (const row of allRows) {
    if (matches(row)) {
      currentRegion.push(row);
      missStreak = 0;
      continue;
    }
    const tolerated = currentRegion.length > 0 && (row.items.length <= 2 || missStreak === 0);
    if (tolerated) {
      currentRegion.push(row);
      missStreak++;
    } else {
      closeRegion();
    }
  }
  closeRegion();
  return regions;
}
4009
/**
 * Splits rows into table regions using clustered column anchors.
 *
 * Rows hitting at least MIN_COLS columns extend the current region.
 * Single-item rows are absorbed into an already open region and otherwise
 * ignored. Any other row closes the region, which is kept only when it holds
 * at least MIN_ROWS rows.
 *
 * @param {Array<{items: Array<object>}>} allRows
 * @param {Array<{x: number}>} columns
 * @returns {Array<{rows: Array<object>}>}
 */
function findTableRegions(allRows, columns) {
  const regions = [];
  let open = [];
  for (const row of allRows) {
    if (countMatchedColumns(row, columns) >= MIN_COLS) {
      open.push(row);
      continue;
    }
    if (row.items.length === 1) {
      if (open.length > 0) open.push(row);
      continue;
    }
    if (open.length >= MIN_ROWS) regions.push({ rows: open });
    open = [];
  }
  if (open.length >= MIN_ROWS) regions.push({ rows: open });
  return regions;
}
4032
/**
 * Counts how many distinct columns are hit by the items of a row.
 * An item hits the first column whose x lies within 2 * COL_CLUSTER_TOL of the item's x.
 *
 * @param {{items: Array<{x: number}>}} row
 * @param {Array<{x: number}>} columns
 * @returns {number} number of distinct columns hit
 */
function countMatchedColumns(row, columns) {
  const reach = COL_CLUSTER_TOL * 2;
  const hit = new Set();
  for (const item of row.items) {
    const ci = columns.findIndex((col) => Math.abs(item.x - col.x) <= reach);
    if (ci !== -1) hit.add(ci);
  }
  return hit.size;
}
4044
/**
 * Counts how many distinct header columns are hit by the items of a row.
 *
 * Each header item owns a half-open x range [left, right). Interior boundaries
 * sit halfway between one header item's right edge and the next header item's
 * left edge. The first range is open to the left and the last is open to the
 * right, so every finite x falls into exactly one range. Previously the first
 * range started at 0, so items with a negative x matched no column.
 *
 * @param {{items: Array<{x: number}>}} row
 * @param {Array<object>} columns unused; kept so the signature matches existing callers
 * @param {Array<{x: number, w: number}>} headerItems header items sorted by x
 * @returns {number} number of distinct header columns hit
 */
function countMatchedColumnsRange(row, columns, headerItems) {
  const lastIdx = headerItems.length - 1;
  const boundaries = headerItems.map((header, ci) => {
    const left = ci === 0
      ? -Infinity
      : (headerItems[ci - 1].x + headerItems[ci - 1].w + header.x) / 2;
    const right = ci === lastIdx
      ? Infinity
      : (header.x + header.w + headerItems[ci + 1].x) / 2;
    return { left, right };
  });

  const matched = new Set();
  for (const item of row.items) {
    const ci = boundaries.findIndex((b) => item.x >= b.left && item.x < b.right);
    if (ci !== -1) matched.add(ci);
  }
  return matched.size;
}
4062
/**
 * Splits the items of one row into groups and assigns each group to a column.
 *
 * Items are sorted by x and cut at the widest gaps (at most numCols - 1 cuts,
 * each at least the gap threshold). Groups are then matched to columns
 * greedily by the distance between the group's horizontal midpoint and the
 * column's x, closest pair first, one group per column. Groups left without a
 * column fall back to their nearest column.
 *
 * @param {Array<{x: number, w: number}>} items items of one row
 * @param {Array<{x: number}>} columns column anchors
 * @param {number} numCols number of columns to assign to
 * @returns {Array<{col: number, items: Array<object>}>} greedy matches first,
 *   in order of increasing distance, then fallback groups in group order
 */
function assignRowItems(items, columns, numCols) {
  if (items.length === 0) return [];
  const ordered = [...items].sort((a, b) => a.x - b.x);
  const columnXs = columns.map((c) => c.x);

  // Gap to the left of every item except the first.
  const gaps = ordered.slice(1).map((item, k) => ({
    idx: k + 1,
    size: item.x - (ordered[k].x + ordered[k].w)
  }));
  const ascendingSizes = gaps.map((gap) => gap.size).sort((a, b) => a - b);
  const medianGap = ascendingSizes.length > 0 ? ascendingSizes[Math.floor(ascendingSizes.length / 2)] : 0;
  // Rows with few items use the fixed floor; denser rows scale with their median gap.
  const gapThreshold = ordered.length <= numCols + 1 ? 12 : Math.max(medianGap * 2.5, 12);

  const cutPoints = gaps
    .filter((gap) => gap.size >= gapThreshold)
    .sort((a, b) => b.size - a.size)
    .slice(0, numCols - 1)
    .map((gap) => gap.idx)
    .sort((a, b) => a - b);

  const groups = [];
  let from = 0;
  for (const cut of cutPoints) {
    groups.push(ordered.slice(from, cut));
    from = cut;
  }
  groups.push(ordered.slice(from));

  const groupCenters = groups.map((group) => {
    let minX = Infinity;
    let maxX = -Infinity;
    for (const item of group) {
      if (item.x < minX) minX = item.x;
      const right = item.x + item.w;
      if (right > maxX) maxX = right;
    }
    return (minX + maxX) / 2;
  });
  const distance = (gi, ci) => Math.abs(groupCenters[gi] - columnXs[ci]);

  const candidates = [];
  for (let gi = 0; gi < groups.length; gi++) {
    for (let ci = 0; ci < numCols; ci++) {
      candidates.push({ gi, ci, dist: distance(gi, ci) });
    }
  }
  candidates.sort((a, b) => a.dist - b.dist);

  const result = [];
  const takenCols = new Set();
  const placedGroups = new Set();
  for (const { gi, ci } of candidates) {
    if (placedGroups.has(gi) || takenCols.has(ci)) continue;
    result.push({ col: ci, items: groups[gi] });
    placedGroups.add(gi);
    takenCols.add(ci);
  }

  groups.forEach((group, gi) => {
    if (placedGroups.has(gi)) return;
    let bestCol = 0;
    let bestDist = Infinity;
    for (let ci = 0; ci < numCols; ci++) {
      const d = distance(gi, ci);
      if (d < bestDist) {
        bestDist = d;
        bestCol = ci;
      }
    }
    result.push({ col: bestCol, items: group });
  });
  return result;
}
4120
/**
 * Builds a table from rows and column anchors, or returns null when the
 * result does not look like a table.
 *
 * Steps:
 *  1. Fill a numRows x numCols grid; a row with a single item becomes one cell
 *     spanning all columns.
 *  2. Reject when more than half of the rows are empty or any column is empty.
 *  3. Bottom-up, fold a row whose only text sits outside the first column (and
 *     does not start with a bullet-like marker) into the nearest non-empty row above.
 *  4. Top-down, when a row has text only in its first and last columns and the
 *     next row has an empty first column, pull the next row's text up.
 *  5. Drop rows left empty; reject when fewer than MIN_ROWS remain.
 *
 * @param {Array<{items: Array<object>}>} rows
 * @param {Array<{x: number}>} columns
 * @param {number} pageNum copied into the returned bbox
 * @returns {{table: object, bbox: object, usedItems: Set<object>} | null}
 */
function buildClusterTable(rows, columns, pageNum) {
  const numCols = columns.length;
  const numRows = rows.length;
  if (numRows < MIN_ROWS || numCols < MIN_COLS) return null;

  const hasText = (cell) => Boolean(cell.text.trim());
  const cells = Array.from({ length: numRows }, () =>
    Array.from({ length: numCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
  );
  const usedItems = new Set();

  // 1. Populate the grid.
  rows.forEach((row, r) => {
    if (row.items.length === 1 && numCols > 1) {
      const only = row.items[0];
      cells[r][0] = { text: only.text, colSpan: numCols, rowSpan: 1 };
      usedItems.add(only);
      return;
    }
    for (const { col, items } of assignRowItems(row.items, columns, numCols)) {
      const text = items.map((i) => i.text).join(" ");
      const cell = cells[r][col];
      cell.text = cell.text ? cell.text + " " + text : text;
      for (const item of items) usedItems.add(item);
    }
  });

  // 2. Sanity checks on emptiness.
  const emptyRows = cells.filter((row) => row.every((c) => c.text === "")).length;
  if (emptyRows > numRows * 0.5) return null;
  for (let c = 0; c < numCols; c++) {
    if (cells.every((row) => row[c].text === "")) return null;
  }

  // 3. Fold orphan continuation rows upward, scanning from the bottom.
  for (let r = numRows - 1; r >= 1; r--) {
    const filled = cells[r].filter(hasText);
    if (filled.length !== 1 || hasText(cells[r][0])) continue;
    if (/^[○●▶\-·]/.test(filled[0].text.trim())) continue;
    let target = r - 1;
    while (target >= 0 && !cells[target].some(hasText)) target--;
    if (target < 0) continue;
    for (let c = 0; c < numCols; c++) {
      const prev = cells[target][c].text.trim();
      const curr = cells[r][c].text.trim();
      if (curr) cells[target][c].text = prev ? prev + " " + curr : curr;
    }
    for (const cell of cells[r]) cell.text = "";
  }

  // 4. Pull up the row after a "first and last column only" row.
  for (let r = 0; r < cells.length - 1; r++) {
    const row = cells[r];
    const next = cells[r + 1];
    const edgesOnly =
      hasText(row[0]) &&
      numCols > 1 &&
      hasText(row[numCols - 1]) &&
      row.slice(1, numCols - 1).every((c) => !hasText(c));
    if (!edgesOnly) continue;
    if (hasText(next[0]) || !next.some(hasText)) continue;
    for (let c = 1; c < numCols; c++) {
      const curr = next[c].text.trim();
      if (!curr) continue;
      const base = row[c].text.trim();
      row[c].text = base ? base + " " + curr : curr;
    }
    for (const cell of next) cell.text = "";
  }

  // 5. Drop emptied rows and validate the final size.
  const filteredCells = cells.filter((row) => row.some(hasText));
  const finalRowCount = filteredCells.length;
  if (finalRowCount < MIN_ROWS) return null;

  // Bounding box over every item of every input row; items without a positive
  // height fall back to their font size.
  let minX = Infinity;
  let minY = Infinity;
  let maxX = -Infinity;
  let maxY = -Infinity;
  for (const row of rows) {
    for (const item of row.items) {
      if (item.x < minX) minX = item.x;
      if (item.y < minY) minY = item.y;
      if (item.x + item.w > maxX) maxX = item.x + item.w;
      const height = item.h > 0 ? item.h : item.fontSize;
      if (item.y + height > maxY) maxY = item.y + height;
    }
  }

  return {
    table: {
      rows: finalRowCount,
      cols: numCols,
      cells: filteredCells,
      hasHeader: finalRowCount > 1
    },
    bbox: { page: pageNum, x: minX, y: minY, width: maxX - minX, height: maxY - minY },
    usedItems
  };
}
4211
-
4212
// src/pdf/polyfill.ts
// Installs minimal stand-ins for DOMMatrix and Path2D when the runtime lacks
// them, and publishes the pdf.js worker module on globalThis.
var pdfjsWorker = __toESM(require("pdfjs-dist/legacy/build/pdf.worker.mjs"), 1);
var g = globalThis;
if (typeof g.DOMMatrix === "undefined") {
  // Stub only: stores the six matrix values and implements no operations.
  g.DOMMatrix = class DOMMatrix {
    constructor(init) {
      this.m = init ? init : [1, 0, 0, 1, 0, 0];
    }
  };
}
if (typeof g.Path2D === "undefined") {
  // Empty placeholder class.
  g.Path2D = class Path2D {};
}
g.pdfjsWorker = pdfjsWorker;
4228
-
4229
// src/pdf/parser.ts
var import_pdf2 = require("pdfjs-dist/legacy/build/pdf.mjs");
// Cleared worker source; the worker module itself is published on globalThis by the polyfill section.
import_pdf2.GlobalWorkerOptions.workerSrc = "";
// Pages beyond this index are never parsed.
var MAX_PAGES = 5000;
// Upper bound on extracted text, estimated at two bytes per character (100 MiB).
var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
// Time allowed for pdf.js to open a document, in milliseconds.
var PDF_LOAD_TIMEOUT_MS = 30000;
4235
/**
 * Opens a PDF with pdf.js, giving up after PDF_LOAD_TIMEOUT_MS.
 *
 * On timeout the loading task is destroyed and a KordocError is raised. The
 * timer is always cleared once the race settles.
 *
 * @param {ArrayBuffer|ArrayLike<number>} buffer raw PDF bytes
 * @returns {Promise<object>} the loaded pdf.js document
 * @throws {KordocError} when loading exceeds the timeout
 */
async function loadPdfWithTimeout(buffer) {
  const loadingTask = (0, import_pdf2.getDocument)({
    data: new Uint8Array(buffer),
    useSystemFonts: true,
    disableFontFace: true,
    isEvalSupported: false
  });
  let timer;
  const timeout = new Promise((_, reject) => {
    timer = setTimeout(() => {
      loadingTask.destroy();
      reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
    }, PDF_LOAD_TIMEOUT_MS);
  });
  try {
    return await Promise.race([loadingTask.promise, timeout]);
  } finally {
    if (timer !== void 0) clearTimeout(timer);
  }
}
4257
/**
 * Parses a PDF buffer into markdown plus structured blocks.
 *
 * Pages are processed one by one (at most MAX_PAGES, optionally restricted by
 * `options.pages`). A failure on a single page is recorded as a PARTIAL_PARSE
 * warning; a KordocError aborts the whole parse. When the document averages
 * fewer than 10 non-whitespace characters per parsed page it is treated as
 * image-based: OCR is attempted if `options.ocr` is set, otherwise (or if OCR
 * yields nothing or fails) a KordocError flagged `isImageBased` is thrown.
 *
 * @param {ArrayBuffer|ArrayLike<number>} buffer raw PDF bytes
 * @param {object} [options] reads `pages`, `ocr`, `removeHeaderFooter`, `onProgress`
 * @returns {Promise<object>} `{ markdown, blocks, metadata, outline, warnings }`
 *   (`outline`/`warnings` are undefined when empty), or the OCR result
 *   `{ markdown, blocks, metadata, warnings, isImageBased: true }`
 * @throws {KordocError} for empty documents, oversized text, or image-based PDFs
 */
async function parsePdfDocument(buffer, options) {
  const doc = await loadPdfWithTimeout(buffer);
  try {
    const pageCount = doc.numPages;
    if (pageCount === 0) throw new KordocError("PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
    const metadata = { pageCount };
    await extractPdfMetadata(doc, metadata);

    const blocks = [];
    const warnings = [];
    const fontSizeFreq = new Map();
    const pageHeights = new Map();
    const effectivePageCount = Math.min(pageCount, MAX_PAGES);
    const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
    const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
    let totalChars = 0;
    let totalTextBytes = 0;
    let parsedPages = 0;

    for (let pageNo = 1; pageNo <= effectivePageCount; pageNo++) {
      if (pageFilter && !pageFilter.has(pageNo)) continue;
      try {
        const page = await doc.getPage(pageNo);
        const tc = await page.getTextContent();
        const viewport = page.getViewport({ scale: 1 });
        pageHeights.set(pageNo, viewport.height);

        const items = normalizeItems(tc.items);
        const { visible, hiddenCount } = filterHiddenText(items, viewport.width, viewport.height);
        if (hiddenCount > 0) {
          warnings.push({ page: pageNo, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
        }
        for (const { fontSize } of visible) {
          if (fontSize > 0) fontSizeFreq.set(fontSize, (fontSizeFreq.get(fontSize) || 0) + 1);
        }

        const opList = await page.getOperatorList();
        const pageBlocks = extractPageBlocksWithLines(visible, pageNo, opList, viewport.width, viewport.height);
        for (const block of pageBlocks) {
          blocks.push(block);
          const text = block.text || "";
          totalChars += text.replace(/\s/g, "").length;
          totalTextBytes += text.length * 2;
        }
        if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
        parsedPages++;
        options?.onProgress?.(parsedPages, totalTarget);
      } catch (pageErr) {
        // Library-level errors abort the parse; anything else only costs this page.
        if (pageErr instanceof KordocError) throw pageErr;
        warnings.push({ page: pageNo, message: `\uD398\uC774\uC9C0 ${pageNo} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
      }
    }

    // When no page parsed successfully, fall back to the number of pages that were targeted.
    const parsedPageCount = parsedPages || totalTarget;
    const looksImageBased = totalChars / Math.max(parsedPageCount, 1) < 10;
    if (looksImageBased) {
      if (options?.ocr) {
        try {
          const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
          const ocrBlocks = await ocrPages2(doc, options.ocr, pageFilter, effectivePageCount);
          if (ocrBlocks.length > 0) {
            const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
            return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true };
          }
        } catch {
          // OCR failures are ignored here; the image-based error below is raised instead.
        }
      }
      throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
    }

    if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
      const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
      // Splice from the end so earlier indices stay valid.
      for (let ri = removed.length - 1; ri >= 0; ri--) {
        blocks.splice(removed[ri], 1);
      }
    }

    const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
    if (medianFontSize > 0) detectHeadings(blocks, medianFontSize);
    detectMarkerHeadings(blocks);

    const outline = blocks
      .filter((b) => b.type === "heading" && b.level && b.text)
      .map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
    const markdown = cleanPdfText(blocksToMarkdown(blocks));
    return {
      markdown,
      blocks,
      metadata,
      outline: outline.length > 0 ? outline : void 0,
      warnings: warnings.length > 0 ? warnings : void 0
    };
  } finally {
    await doc.destroy().catch(() => {
    });
  }
}
4340
/**
 * Copies document information from pdf.js into `metadata` (best effort).
 *
 * Title, Author, Creator and Subject are copied trimmed when they are
 * non-blank strings (Subject becomes `description`). Keywords are split on
 * `,` or `;`. CreationDate and ModDate are converted with parsePdfDate.
 * Any error raised while reading metadata is ignored.
 *
 * @param {{getMetadata: function(): Promise<object>}} doc pdf.js document
 * @param {object} metadata mutated in place
 * @returns {Promise<void>}
 */
async function extractPdfMetadata(doc, metadata) {
  const isFilled = (value) => typeof value === "string" && Boolean(value.trim());
  try {
    const result = await doc.getMetadata();
    const info = result?.info;
    if (!info) return;
    if (isFilled(info.Title)) metadata.title = info.Title.trim();
    if (isFilled(info.Author)) metadata.author = info.Author.trim();
    if (isFilled(info.Creator)) metadata.creator = info.Creator.trim();
    if (isFilled(info.Subject)) metadata.description = info.Subject.trim();
    if (isFilled(info.Keywords)) {
      metadata.keywords = info.Keywords.split(/[,;]/).map((k) => k.trim()).filter(Boolean);
    }
    if (typeof info.CreationDate === "string") metadata.createdAt = parsePdfDate(info.CreationDate);
    if (typeof info.ModDate === "string") metadata.modifiedAt = parsePdfDate(info.ModDate);
  } catch {
    // Metadata is optional; a failure here must not fail the parse.
  }
}
4357
/**
 * Converts a PDF date string (`D:YYYYMMDDHHmmSS...`) into `YYYY-MM-DDTHH:mm:SS`.
 *
 * Only the year is required; missing month/day default to "01" and missing
 * time fields to "00". Any timezone suffix is ignored. The `D:` prefix is
 * optional in the PDF date format, so a string that starts directly with the
 * year is accepted as well (previously such dates were dropped).
 *
 * @param {string} dateStr raw value of CreationDate / ModDate
 * @returns {string|undefined} the formatted timestamp, or undefined when no date is found
 */
function parsePdfDate(dateStr) {
  const FIELDS = "(\\d{4})(\\d{2})?(\\d{2})?(\\d{2})?(\\d{2})?(\\d{2})?";
  // Prefer the prefixed form anywhere in the string (original behaviour), then
  // fall back to a bare date at the start of the string.
  const m = dateStr.match(new RegExp(`D:${FIELDS}`)) || dateStr.match(new RegExp(`^\\s*${FIELDS}`));
  if (!m) return void 0;
  const [, year, month = "01", day = "01", hour = "00", min = "00", sec = "00"] = m;
  return `${year}-${month}-${day}T${hour}:${min}:${sec}`;
}
4363
/**
 * Separates visible items from hidden ones.
 *
 * An item counts as hidden when it is flagged `isHidden` or when its x/y lies
 * more than 10% of the larger page dimension outside the page rectangle.
 *
 * @param {Array<{x: number, y: number, isHidden?: boolean}>} items
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {{visible: Array<object>, hiddenCount: number}}
 */
function filterHiddenText(items, pageWidth, pageHeight) {
  const margin = Math.max(pageWidth, pageHeight) * 0.1;
  const isOffPage = (item) =>
    item.x < -margin || item.x > pageWidth + margin || item.y < -margin || item.y > pageHeight + margin;
  const visible = items.filter((item) => !item.isHidden && !isOffPage(item));
  return { visible, hiddenCount: items.length - visible.length };
}
4380
/**
 * Computes the median font size from a size -> occurrence-count histogram.
 *
 * Walks the sizes in ascending order and returns the first size at which the
 * cumulative count exceeds floor(total / 2).
 *
 * @param {Map<number, number>} freq font size -> number of items
 * @returns {number} the median size, or 0 for an empty histogram
 */
function computeMedianFontSizeFromFreq(freq) {
  if (freq.size === 0) return 0;
  let total = 0;
  freq.forEach((count) => {
    total += count;
  });
  const buckets = [...freq.entries()].sort((a, b) => a[0] - b[0]);
  const mid = Math.floor(total / 2);
  let seen = 0;
  const hit = buckets.find(([, count]) => {
    seen += count;
    return seen > mid;
  });
  return hit ? hit[0] : buckets[buckets.length - 1][0];
}
4393
/**
 * Promotes paragraphs to headings based on font size relative to the median.
 *
 * Only paragraphs with text and a style font size are considered; blank text,
 * text longer than 200 characters and digit-only text are skipped. The level
 * (1-3) is chosen by comparing fontSize / medianFontSize against the
 * HEADING_RATIO_H1/H2/H3 thresholds. Promoted text is trimmed and passed
 * through collapseEvenSpacing.
 *
 * @param {Array<object>} blocks mutated in place
 * @param {number} medianFontSize
 */
function detectHeadings(blocks, medianFontSize) {
  const levelForRatio = (ratio) => {
    if (ratio >= HEADING_RATIO_H1) return 1;
    if (ratio >= HEADING_RATIO_H2) return 2;
    if (ratio >= HEADING_RATIO_H3) return 3;
    return 0;
  };
  for (const block of blocks) {
    if (block.type !== "paragraph" || !block.text || !block.style?.fontSize) continue;
    const text = block.text.trim();
    if (text.length === 0 || text.length > 200 || /^\d+$/.test(text)) continue;
    const level = levelForRatio(block.style.fontSize / medianFontSize);
    if (level === 0) continue;
    block.type = "heading";
    block.level = level;
    block.text = collapseEvenSpacing(text);
  }
}
4411
/**
 * Removes the spaces from letter-spaced text.
 *
 * When the text has at least three space-separated tokens and 70% or more of
 * them are a single character, all spaces are removed. Otherwise only runs of
 * a Hangul syllable followed by two or more single space-separated Hangul
 * syllables/digits are collapsed.
 *
 * @param {string} text
 * @returns {string}
 */
function collapseEvenSpacing(text) {
  const tokens = text.split(" ");
  if (tokens.length >= 3) {
    const singles = tokens.reduce((n, token) => n + (token.length === 1 ? 1 : 0), 0);
    if (singles / tokens.length >= 0.7) return tokens.join("");
  }
  return text.replace(
    /(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
    (run) => run.split(" ").join("")
  );
}
4422
/**
 * Decides whether a detected table should be rendered as plain text instead.
 *
 * Small tables (<= 3 x 3) are demoted when 30% or more of their cells are
 * empty, or when their text contains a list marker or an angle-bracket tag.
 * Tables with more than 200 characters of text are otherwise kept. Remaining
 * demotion cases: <= 3 rows with a list marker, <= 2 rows that are more than
 * half empty, and single-row tables without a number of two or more digits.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {boolean} true when the table should be demoted
 */
function shouldDemoteTable(table) {
  const filled = [];
  for (const row of table.cells) {
    for (const cell of row) {
      const text = cell.text.trim();
      if (text) filled.push(text);
    }
  }
  const allText = filled.join(" ");
  const totalCells = table.rows * table.cols;
  const emptyCells = totalCells - filled.length;

  if (table.rows <= 3 && table.cols <= 3) {
    if (emptyCells >= totalCells * 0.3) return true;
    if (/[□■◆○●▶ㅇ]/.test(allText) || /<[^>]+>/.test(allText)) return true;
  }
  if (allText.length > 200) return false;
  if (table.rows <= 3 && /[□■◆○●▶]/.test(allText)) return true;
  if (table.rows <= 2 && emptyCells > totalCells * 0.5) return true;
  return table.rows === 1 && !/\d{2,}/.test(allText);
}
4440
/**
 * Renders a table as plain text, one line per non-empty row.
 *
 * Rows of a two-column table with both cells filled are written as
 * "left : right"; every other row joins its non-empty cells with a space.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {string} lines joined with "\n"
 */
function demoteTableToText(table) {
  const lines = [];
  for (let r = 0; r < table.rows; r += 1) {
    const texts = [];
    for (const cell of table.cells[r]) {
      const text = cell.text.trim();
      if (text) texts.push(text);
    }
    if (texts.length === 0) continue;
    const isPair = table.cols === 2 && texts.length === 2;
    lines.push(isPair ? `${texts[0]} : ${texts[1]}` : texts.join(" "));
  }
  return lines.join("\n");
}
4453
/**
 * Promotes paragraphs to headings based on markers and context.
 *
 *  - A paragraph shorter than 50 characters that starts with one of □■◆◇▶
 *    followed by Hangul becomes a level-4 heading.
 *  - A paragraph of 2-6 Hangul syllables with a style font size becomes a
 *    level-3 heading when the block before it is missing, a table, a heading
 *    or a separator, or when the block after it is missing, a table, a
 *    heading, or a paragraph starting with one of □■◆○●.
 *
 * Blocks are processed in order, so a promotion made earlier in the pass is
 * visible when later blocks look at their predecessor.
 *
 * @param {Array<object>} blocks mutated in place
 */
function detectMarkerHeadings(blocks) {
  const opensSection = (b) => !b || b.type === "table" || b.type === "heading" || b.type === "separator";
  const followsLabel = (b) =>
    !b ||
    b.type === "table" ||
    b.type === "heading" ||
    (b.type === "paragraph" && b.text && /^[□■◆○●]/.test(b.text.trim()));

  blocks.forEach((block, i) => {
    if (block.type !== "paragraph" || !block.text) return;
    const text = block.text.trim();
    if (text.length < 50 && /^[□■◆◇▶]\s*[가-힣]/.test(text)) {
      block.type = "heading";
      block.level = 4;
      return;
    }
    if (!/^[가-힣]{2,6}$/.test(text) || !block.style?.fontSize) return;
    if (opensSection(blocks[i - 1]) || followsLabel(blocks[i + 1])) {
      block.type = "heading";
      block.level = 3;
    }
  });
}
4475
// Recursion limit for xyCutOrder.
var MAX_XYCUT_DEPTH = 50;
/**
 * Orders items into groups by recursive XY-cut.
 *
 * A horizontal cut (by y) is tried first; the part with larger y comes first.
 * If no usable horizontal cut exists, a vertical cut (by x, comparing item
 * midpoints) is tried; the left part comes first. Recursion stops at two or
 * fewer items, at MAX_XYCUT_DEPTH, or when no cut separates the items.
 *
 * @param {Array<object>} items positioned items
 * @param {number} gapThreshold minimum gap a cut must exceed
 * @param {number} [depth=0] current recursion depth
 * @returns {Array<Array<object>>} groups in order
 */
function xyCutOrder(items, gapThreshold, depth = 0) {
  if (items.length === 0) return [];
  if (items.length <= 2 || depth >= MAX_XYCUT_DEPTH) return [items];

  const region = computeRegion(items);
  const descend = (subset) => xyCutOrder(subset, gapThreshold, depth + 1);
  // A split is usable only when both halves are non-empty and the first is a strict subset.
  const separates = (first, second) =>
    first.length > 0 && second.length > 0 && first.length < items.length;

  const ySplit = findYSplit(items, region, gapThreshold);
  if (ySplit !== null) {
    const upper = items.filter((item) => item.y > ySplit);
    const lower = items.filter((item) => item.y <= ySplit);
    if (separates(upper, lower)) return [...descend(upper), ...descend(lower)];
  }

  const xSplit = findXSplit(items, region, gapThreshold);
  if (xSplit !== null) {
    const left = items.filter((item) => item.x + item.w / 2 < xSplit);
    const right = items.filter((item) => item.x + item.w / 2 >= xSplit);
    if (separates(left, right)) return [...descend(left), ...descend(right)];
  }
  return [items];
}
4498
/**
 * Computes the bounding extents of a set of items.
 *
 * @param {Array<{x: number, y: number, w: number, h: number}>} items
 * @returns {{items: Array<object>, minX: number, minY: number, maxX: number, maxY: number}}
 *   extents are +/-Infinity for an empty list
 */
function computeRegion(items) {
  const bounds = { minX: Infinity, minY: Infinity, maxX: -Infinity, maxY: -Infinity };
  for (const { x, y, w, h } of items) {
    if (x < bounds.minX) bounds.minX = x;
    if (y < bounds.minY) bounds.minY = y;
    if (x + w > bounds.maxX) bounds.maxX = x + w;
    if (y + h > bounds.maxY) bounds.maxY = y + h;
  }
  return { items, ...bounds };
}
4508
/**
 * Finds the y coordinate of the widest vertical gap between consecutive items.
 *
 * Items are ordered by descending y. For each neighbouring pair the gap is
 * (upper.y - upper.h) - lower.y; the split is placed in the middle of the
 * widest gap that exceeds `gapThreshold`.
 *
 * @param {Array<{y: number, h: number}>} items
 * @param {object} _region unused
 * @param {number} gapThreshold
 * @returns {number|null} split position, or null when no gap exceeds the threshold
 */
function findYSplit(items, _region, gapThreshold) {
  const ordered = [...items].sort((a, b) => b.y - a.y);
  let widest = gapThreshold;
  let cut = null;
  for (let i = 0; i + 1 < ordered.length; i++) {
    const upperEdge = ordered[i].y - ordered[i].h;
    const lowerEdge = ordered[i + 1].y;
    const gap = upperEdge - lowerEdge;
    if (gap > widest) {
      widest = gap;
      cut = (upperEdge + lowerEdge) / 2;
    }
  }
  return cut;
}
4523
/**
 * Finds the x coordinate of the widest horizontal gap in the items'
 * projection onto the x axis.
 *
 * Items are scanned in ascending x while tracking the furthest right edge seen
 * so far. A gap exists only where the next item starts beyond that edge, so a
 * wide item that spans a would-be gap suppresses it. (Measuring against only
 * the immediately preceding item made the result depend on the sort order of
 * items sharing the same x, and could cut through a wide item.)
 *
 * @param {Array<{x: number, w: number}>} items
 * @param {object} _region unused
 * @param {number} gapThreshold a gap must be strictly wider than this
 * @returns {number|null} middle of the widest qualifying gap, or null
 */
function findXSplit(items, _region, gapThreshold) {
  const sorted = [...items].sort((a, b) => a.x - b.x);
  let bestGap = gapThreshold;
  let bestSplit = null;
  let coveredRight = -Infinity;
  for (let i = 1; i < sorted.length; i++) {
    const prevRight = sorted[i - 1].x + sorted[i - 1].w;
    // Comparison (not Math.max) so a NaN edge cannot poison the running value.
    if (prevRight > coveredRight) coveredRight = prevRight;
    const currLeft = sorted[i].x;
    const gap = currLeft - coveredRight;
    if (gap > bestGap) {
      bestGap = gap;
      bestSplit = (coveredRight + currLeft) / 2;
    }
  }
  return bestSplit;
}
4538
/**
 * Extracts the blocks of one page, using drawn lines to find table grids.
 *
 * Lines are taken from the page's operator list, page-border lines are
 * removed, the rest is preprocessed and turned into grids. With at least one
 * grid the grid-based extractor is used; otherwise the fallback extractor.
 *
 * @param {Array<object>} items visible text items of the page
 * @param {number} pageNum
 * @param {{fnArray: Array, argsArray: Array}} opList pdf.js operator list
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {Array<object>} blocks of the page ([] when there are no items)
 */
function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeight) {
  if (items.length === 0) return [];
  const raw = extractLines(opList.fnArray, opList.argsArray);
  const inner = filterPageBorderLines(raw.horizontals, raw.verticals, pageWidth, pageHeight);
  const { horizontals, verticals } = preprocessLines(inner.horizontals, inner.verticals);
  const grids = buildTableGrids(horizontals, verticals);
  return grids.length > 0
    ? extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals)
    : extractPageBlocksFallback(items, pageNum);
}
4549
- function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
4550
- const blocks = [];
4551
- const usedItems = /* @__PURE__ */ new Set();
4552
- const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
4553
- for (const grid of sortedGrids) {
4554
- const numGridRows = grid.rowYs.length - 1;
4555
- const numGridCols = grid.colXs.length - 1;
4556
- if (numGridRows === 1 && numGridCols >= 2) continue;
4557
- const tableItems = [];
4558
- const pad = 3;
4559
- const gridW = grid.bbox.x2 - grid.bbox.x1;
4560
- for (const item of items) {
4561
- if (usedItems.has(item)) continue;
4562
- if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
4563
- if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
4564
- if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
4565
- tableItems.push(item);
4566
- usedItems.add(item);
4567
- }
4568
- const cells = extractCells(grid, horizontals, verticals);
4569
- if (cells.length === 0) continue;
4570
- const textItems = tableItems.map((i) => ({
4571
- text: i.text,
4572
- x: i.x,
4573
- y: i.y,
4574
- w: i.w,
4575
- h: i.h,
4576
- fontSize: i.fontSize,
4577
- fontName: i.fontName
4578
- }));
4579
- const cellTextMap = mapTextToCells(textItems, cells);
4580
- const numRows = grid.rowYs.length - 1;
4581
- const numCols = grid.colXs.length - 1;
4582
- const irGrid = Array.from(
4583
- { length: numRows },
4584
- () => Array.from({ length: numCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
4585
- );
4586
- for (const cell of cells) {
4587
- const cellItems = cellTextMap.get(cell) || [];
4588
- let text = cellTextToString(cellItems);
4589
- text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
4590
- text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
4591
- irGrid[cell.row][cell.col] = {
4592
- text,
4593
- colSpan: cell.colSpan,
4594
- rowSpan: cell.rowSpan
4595
- };
4596
- }
4597
- const irTable = {
4598
- rows: numRows,
4599
- cols: numCols,
4600
- cells: irGrid,
4601
- hasHeader: numRows > 1
4602
- };
4603
- const hasContent = irGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
4604
- if (!hasContent) continue;
4605
- const tableBbox = {
4606
- page: pageNum,
4607
- x: grid.bbox.x1,
4608
- y: grid.bbox.y1,
4609
- width: grid.bbox.x2 - grid.bbox.x1,
4610
- height: grid.bbox.y2 - grid.bbox.y1
4611
- };
4612
- if (shouldDemoteTable(irTable)) {
4613
- const demoted = demoteTableToText(irTable);
4614
- if (demoted) {
4615
- const text = numGridRows === 1 ? "\n" + demoted + "\n" : demoted;
4616
- blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
4617
- }
4618
- continue;
4619
- }
4620
- blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
4621
- }
4622
- let remaining = items.filter((i) => !usedItems.has(i));
4623
- if (remaining.length > 0) {
4624
- remaining.sort((a, b) => b.y - a.y || a.x - b.x);
4625
- const clusterItems = remaining.map((i) => ({
4626
- text: i.text,
4627
- x: i.x,
4628
- y: i.y,
4629
- w: i.w,
4630
- h: i.h,
4631
- fontSize: i.fontSize,
4632
- fontName: i.fontName
4633
- }));
4634
- const clusterResults = detectClusterTables(clusterItems, pageNum);
4635
- if (clusterResults.length > 0) {
4636
- const ciToIdx = /* @__PURE__ */ new Map();
4637
- for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4638
- const usedClusterIndices = /* @__PURE__ */ new Set();
4639
- for (const cr of clusterResults) {
4640
- for (const ci of cr.usedItems) {
4641
- const idx = ciToIdx.get(ci);
4642
- if (idx !== void 0) usedClusterIndices.add(idx);
4643
- }
4644
- blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4645
- }
4646
- remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
4647
- }
4648
- if (remaining.length > 0) {
4649
- const allY = remaining.map((i) => i.y);
4650
- const pageH = safeMax(allY) - safeMin(allY);
4651
- const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
4652
- const textBlocks = [];
4653
- for (const group of groups) {
4654
- if (group.length === 0) continue;
4655
- const groupBlocks = extractPageBlocksFallback(group, pageNum);
4656
- for (const b of groupBlocks) textBlocks.push(b);
4657
- }
4658
- const finalTextBlocks = detectListBlocks(textBlocks);
4659
- for (const b of finalTextBlocks) blocks.push(b);
4660
- }
4661
- blocks.sort((a, b) => {
4662
- const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4663
- const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4664
- return by - ay;
4665
- });
4666
- return mergeAdjacentTableBlocks(blocks);
4667
- }
4668
- return mergeAdjacentTableBlocks(blocks);
4669
- }
4670
- function mergeAdjacentTableBlocks(blocks) {
4671
- if (blocks.length <= 1) return blocks;
4672
- const result = [blocks[0]];
4673
- for (let i = 1; i < blocks.length; i++) {
4674
- const prev = result[result.length - 1];
4675
- const curr = blocks[i];
4676
- if (prev.type === "table" && curr.type === "table" && prev.table && curr.table && prev.table.cols === curr.table.cols) {
4677
- const merged = {
4678
- rows: prev.table.rows + curr.table.rows,
4679
- cols: prev.table.cols,
4680
- cells: [...prev.table.cells, ...curr.table.cells],
4681
- hasHeader: prev.table.hasHeader
4682
- };
4683
- result[result.length - 1] = { ...prev, table: merged };
4684
- } else {
4685
- result.push(curr);
4686
- }
4687
- }
4688
- return result;
4689
- }
4690
- function extractPageBlocksFallback(items, pageNum) {
4691
- if (items.length === 0) return [];
4692
- const blocks = [];
4693
- const clusterItems = items.map((i) => ({
4694
- text: i.text,
4695
- x: i.x,
4696
- y: i.y,
4697
- w: i.w,
4698
- h: i.h,
4699
- fontSize: i.fontSize,
4700
- fontName: i.fontName
4701
- }));
4702
- const clusterResults = detectClusterTables(clusterItems, pageNum);
4703
- if (clusterResults.length > 0) {
4704
- const ciToIdx = /* @__PURE__ */ new Map();
4705
- for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
4706
- const usedIndices = /* @__PURE__ */ new Set();
4707
- for (const cr of clusterResults) {
4708
- for (const ci of cr.usedItems) {
4709
- const idx = ciToIdx.get(ci);
4710
- if (idx !== void 0) usedIndices.add(idx);
4711
- }
4712
- blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
4713
- }
4714
- const remaining = items.filter((_, idx) => !usedIndices.has(idx));
4715
- if (remaining.length > 0) {
4716
- const yLines = groupByY(remaining);
4717
- for (const line of yLines) {
4718
- const text = mergeLineSimple(line);
4719
- if (!text.trim()) continue;
4720
- const bbox = computeBBox(line, pageNum);
4721
- blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
4722
- }
4723
- }
4724
- blocks.sort((a, b) => {
4725
- const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
4726
- const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
4727
- return by - ay;
4728
- });
4729
- } else {
4730
- const allYLines = groupByY(items);
4731
- const columns = detectColumns(allYLines);
4732
- if (columns && columns.length >= 3) {
4733
- const tableText = extractWithColumns(allYLines, columns);
4734
- const bbox = computeBBox(items, pageNum);
4735
- blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
4736
- } else {
4737
- const allY = items.map((i) => i.y);
4738
- const pageHeight = safeMax(allY) - safeMin(allY);
4739
- const gapThreshold = Math.max(15, pageHeight * 0.03);
4740
- const orderedGroups = xyCutOrder(items, gapThreshold);
4741
- for (const group of orderedGroups) {
4742
- if (group.length === 0) continue;
4743
- const yLines = groupByY(group);
4744
- const groupColumns = detectColumns(yLines);
4745
- if (groupColumns && groupColumns.length >= 3) {
4746
- const tableText = extractWithColumns(yLines, groupColumns);
4747
- const bbox = computeBBox(group, pageNum);
4748
- blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(group) });
4749
- } else {
4750
- for (const line of yLines) {
4751
- const text = mergeLineSimple(line);
4752
- if (!text.trim()) continue;
4753
- const bbox = computeBBox(line, pageNum);
4754
- blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
4755
- }
4756
- }
4757
- }
4758
- }
4759
- }
4760
- return detectSpecialKoreanTables(blocks);
4761
- }
4762
- function computeBBox(items, pageNum) {
4763
- let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
4764
- for (const i of items) {
4765
- if (i.x < minX) minX = i.x;
4766
- if (i.y < minY) minY = i.y;
4767
- if (i.x + i.w > maxX) maxX = i.x + i.w;
4768
- const effectiveH = i.h > 0 ? i.h : i.fontSize;
4769
- if (i.y + effectiveH > maxY) maxY = i.y + effectiveH;
4770
- }
4771
- return { page: pageNum, x: minX, y: minY, width: maxX - minX, height: maxY - minY };
4772
- }
4773
- function dominantStyle(items) {
4774
- if (items.length === 0) return void 0;
4775
- const freq = /* @__PURE__ */ new Map();
4776
- let maxCount = 0, dominantSize = 0;
4777
- for (const i of items) {
4778
- if (i.fontSize <= 0) continue;
4779
- const count = (freq.get(i.fontSize) || 0) + 1;
4780
- freq.set(i.fontSize, count);
4781
- if (count > maxCount) {
4782
- maxCount = count;
4783
- dominantSize = i.fontSize;
4784
- }
4785
- }
4786
- if (dominantSize === 0) return void 0;
4787
- const fontName = items.find((i) => i.fontSize === dominantSize)?.fontName || void 0;
4788
- return { fontSize: dominantSize, fontName };
4789
- }
4790
- function normalizeItems(rawItems) {
4791
- const items = [];
4792
- const spacePositions = [];
4793
- for (const i of rawItems) {
4794
- if (typeof i.str !== "string") continue;
4795
- const x = Math.round(i.transform[4]);
4796
- const y = Math.round(i.transform[5]);
4797
- if (!i.str.trim()) {
4798
- spacePositions.push({ x, y });
4799
- continue;
4800
- }
4801
- const scaleY = Math.abs(i.transform[3]);
4802
- const scaleX = Math.abs(i.transform[0]);
4803
- const fontSize = Math.round(Math.max(scaleY, scaleX));
4804
- const w = Math.round(i.width);
4805
- const h = Math.round(i.height);
4806
- const isHidden = fontSize === 0 || i.width === 0 && i.str.trim().length > 0;
4807
- let text = i.str.trim();
4808
- if (/^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text)) {
4809
- text = text.replace(/ /g, "");
4810
- }
4811
- const split = splitEvenSpacedItem(text, x, w, fontSize);
4812
- if (split) {
4813
- for (const s of split) {
4814
- items.push({ text: s.text, x: s.x, y, w: s.w, h, fontSize, fontName: i.fontName || "", isHidden });
4815
- }
4816
- } else {
4817
- items.push({ text, x, y, w, h, fontSize, fontName: i.fontName || "", isHidden });
4818
- }
4819
- }
4820
- const sorted = items.sort((a, b) => b.y - a.y || a.x - b.x);
4821
- const deduped = [];
4822
- for (let i = 0; i < sorted.length; i++) {
4823
- let isDup = false;
4824
- for (let j = deduped.length - 1; j >= 0; j--) {
4825
- const prev = deduped[j];
4826
- if (prev.y - sorted[i].y > 3) break;
4827
- if (Math.abs(prev.y - sorted[i].y) <= 3 && prev.text === sorted[i].text && Math.abs(prev.x - sorted[i].x) <= 3) {
4828
- isDup = true;
4829
- break;
4830
- }
4831
- }
4832
- if (!isDup) deduped.push(sorted[i]);
4833
- }
4834
- if (spacePositions.length > 0) {
4835
- for (const item of deduped) {
4836
- for (const sp of spacePositions) {
4837
- if (Math.abs(sp.y - item.y) <= 3) {
4838
- const dist = item.x - sp.x;
4839
- if (dist >= 0 && dist <= 20) {
4840
- item.hasSpaceBefore = true;
4841
- break;
4842
- }
4843
- }
4844
- }
4845
- }
4846
- }
4847
- return deduped;
4848
- }
4849
- function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
4850
- if (!/^[가-힣\d](?: [가-힣\d]){2,}$/.test(text)) return null;
4851
- const chars = text.split(" ");
4852
- if (chars.length < 3) return null;
4853
- const charW = itemW / chars.length;
4854
- if (charW > fontSize * 2) return null;
4855
- return chars.map((ch, idx) => ({
4856
- text: ch,
4857
- x: Math.round(itemX + idx * charW),
4858
- w: Math.round(charW * 0.8)
4859
- // 실제 글자 폭은 간격보다 좁음
4860
- }));
4861
- }
4862
- function groupByY(items) {
4863
- if (items.length === 0) return [];
4864
- const lines = [];
4865
- let curY = items[0].y;
4866
- let curLine = [items[0]];
4867
- for (let i = 1; i < items.length; i++) {
4868
- if (Math.abs(items[i].y - curY) > 3) {
4869
- lines.push(curLine);
4870
- curLine = [];
4871
- curY = items[i].y;
4872
- }
4873
- curLine.push(items[i]);
4874
- }
4875
- if (curLine.length > 0) lines.push(curLine);
4876
- return lines;
4877
- }
4878
- function isProseSpread(items) {
4879
- if (items.length < 4) return false;
4880
- const sorted = [...items].sort((a, b) => a.x - b.x);
4881
- const gaps = [];
4882
- for (let i = 1; i < sorted.length; i++) {
4883
- gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
4884
- }
4885
- const maxGap = safeMax(gaps);
4886
- const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
4887
- return maxGap < 40 && avgLen < 5;
4888
- }
4889
- function detectColumns(yLines) {
4890
- const allItems = yLines.flat();
4891
- if (allItems.length === 0) return null;
4892
- const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
4893
- if (pageWidth < 100) return null;
4894
- let bigoLineIdx = -1;
4895
- for (let i = 0; i < yLines.length; i++) {
4896
- if (yLines[i].length <= 2 && yLines[i].some((item) => item.text === "\uBE44\uACE0")) {
4897
- bigoLineIdx = i;
4898
- break;
4899
- }
4900
- }
4901
- const tableYLines = bigoLineIdx >= 0 ? yLines.slice(0, bigoLineIdx) : yLines;
4902
- const CLUSTER_TOL = 22;
4903
- const xClusters = [];
4904
- for (const line of tableYLines) {
4905
- if (isProseSpread(line)) continue;
4906
- for (const item of line) {
4907
- let found = false;
4908
- for (const c of xClusters) {
4909
- if (Math.abs(item.x - c.center) <= CLUSTER_TOL) {
4910
- c.center = Math.round((c.center * c.count + item.x) / (c.count + 1));
4911
- c.minX = Math.min(c.minX, item.x);
4912
- c.count++;
4913
- found = true;
4914
- break;
4915
- }
4916
- }
4917
- if (!found) {
4918
- xClusters.push({ center: item.x, count: 1, minX: item.x });
4919
- }
4920
- }
4921
- }
4922
- const peaks = xClusters.filter((c) => c.count >= 3).sort((a, b) => a.minX - b.minX);
4923
- if (peaks.length < 3) return null;
4924
- const MERGE_TOL = 40;
4925
- const merged = [peaks[0]];
4926
- for (let i = 1; i < peaks.length; i++) {
4927
- const prev = merged[merged.length - 1];
4928
- if (peaks[i].minX - prev.minX < MERGE_TOL) {
4929
- if (peaks[i].count > prev.count) {
4930
- prev.center = peaks[i].center;
4931
- }
4932
- prev.count += peaks[i].count;
4933
- prev.minX = Math.min(prev.minX, peaks[i].minX);
4934
- } else {
4935
- merged.push({ ...peaks[i] });
4936
- }
4937
- }
4938
- const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
4939
- if (rawColumns.length < 3) return null;
4940
- const MIN_DETECT_COL_WIDTH = 30;
4941
- const columns = [rawColumns[0]];
4942
- for (let i = 1; i < rawColumns.length; i++) {
4943
- if (rawColumns[i] - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
4944
- columns.push(rawColumns[i]);
4945
- }
4946
- return columns.length >= 3 ? columns : null;
4947
- }
4948
- function findColumn(x, columns) {
4949
- for (let i = columns.length - 1; i >= 0; i--) {
4950
- if (x >= columns[i] - 10) return i;
4951
- }
4952
- return 0;
4953
- }
4954
- function extractWithColumns(yLines, columns) {
4955
- const result = [];
4956
- const colMin = columns[0];
4957
- const colMax = columns[columns.length - 1];
4958
- let bigoIdx = -1;
4959
- for (let i = 0; i < yLines.length; i++) {
4960
- if (yLines[i].length <= 2 && yLines[i].some((item) => item.text === "\uBE44\uACE0")) {
4961
- bigoIdx = i;
4962
- break;
4963
- }
4964
- }
4965
- let tableStart = -1;
4966
- for (let i = 0; i < (bigoIdx >= 0 ? bigoIdx : yLines.length); i++) {
4967
- const usedCols = new Set(yLines[i].map((item) => findColumn(item.x, columns)));
4968
- if (usedCols.size >= 3) {
4969
- tableStart = i;
4970
- break;
4971
- }
4972
- }
4973
- const tableEnd = bigoIdx >= 0 ? bigoIdx : yLines.length;
4974
- for (let i = 0; i < (tableStart >= 0 ? tableStart : tableEnd); i++) {
4975
- result.push(mergeLineSimple(yLines[i]));
4976
- }
4977
- if (tableStart >= 0) {
4978
- const tableLines = yLines.slice(tableStart, tableEnd);
4979
- const gridLines = [];
4980
- for (const line of tableLines) {
4981
- const inRange = line.some(
4982
- (item) => item.x >= colMin - 20 && item.x <= colMax + 200
4983
- );
4984
- if (inRange && !isProseSpread(line)) {
4985
- gridLines.push(line);
4986
- } else {
4987
- if (gridLines.length > 0) {
4988
- result.push(buildGridTable(gridLines.splice(0), columns));
4989
- }
4990
- result.push(mergeLineSimple(line));
4991
- }
4992
- }
4993
- if (gridLines.length > 0) {
4994
- result.push(buildGridTable(gridLines, columns));
4995
- }
4996
- }
4997
- if (bigoIdx >= 0) {
4998
- result.push("");
4999
- for (let i = bigoIdx; i < yLines.length; i++) {
5000
- result.push(mergeLineSimple(yLines[i]));
5001
- }
5002
- }
5003
- return result.join("\n");
5004
- }
5005
- function buildGridTable(lines, columns) {
5006
- const numCols = columns.length;
5007
- const yRows = lines.map((items) => {
5008
- const row = Array(numCols).fill("");
5009
- for (const item of items) {
5010
- const col = findColumn(item.x, columns);
5011
- row[col] = row[col] ? row[col] + " " + item.text : item.text;
5012
- }
5013
- return row;
5014
- });
5015
- const dataColStart = Math.max(2, Math.floor(numCols / 2));
5016
- const merged = [];
5017
- for (const row of yRows) {
5018
- if (row.every((c) => c === "")) continue;
5019
- if (merged.length === 0) {
5020
- merged.push([...row]);
5021
- continue;
5022
- }
5023
- const prev = merged[merged.length - 1];
5024
- const filledCols = row.map((c, i) => c ? i : -1).filter((i) => i >= 0);
5025
- const filledCount = filledCols.length;
5026
- let isNewRow = false;
5027
- if (row[0] && row[0].length >= 3) {
5028
- isNewRow = true;
5029
- }
5030
- if (!isNewRow && numCols > 1 && row[1]) {
5031
- isNewRow = true;
5032
- }
5033
- if (!isNewRow) {
5034
- const hasData = row.slice(dataColStart).some((c) => c !== "");
5035
- const prevHasData = prev.slice(dataColStart).some((c) => c !== "");
5036
- if (hasData && prevHasData) {
5037
- isNewRow = true;
5038
- }
5039
- }
5040
- if (isNewRow && filledCount === 1 && row[0] && row[0].length <= 2) {
5041
- isNewRow = false;
5042
- }
5043
- if (isNewRow) {
5044
- merged.push([...row]);
5045
- } else {
5046
- for (let c = 0; c < numCols; c++) {
5047
- if (row[c]) {
5048
- prev[c] = prev[c] ? prev[c] + " " + row[c] : row[c];
5049
- }
5050
- }
5051
- }
5052
- }
5053
- if (merged.length < 2) {
5054
- return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
5055
- }
5056
- let headerEnd = 0;
5057
- for (let r = 0; r < merged.length; r++) {
5058
- const hasDataValues = merged[r].slice(dataColStart).some((c) => c && /\d/.test(c));
5059
- if (hasDataValues) break;
5060
- headerEnd = r + 1;
5061
- }
5062
- if (headerEnd > 1) {
5063
- const headerRow = Array(numCols).fill("");
5064
- for (let r = 0; r < headerEnd; r++) {
5065
- for (let c = 0; c < numCols; c++) {
5066
- if (merged[r][c]) {
5067
- headerRow[c] = headerRow[c] ? headerRow[c] + " " + merged[r][c] : merged[r][c];
5068
- }
5069
- }
5070
- }
5071
- merged.splice(0, headerEnd, headerRow);
5072
- }
5073
- for (const row of merged) {
5074
- for (let c = 0; c < row.length; c++) {
5075
- if (row[c]) row[c] = collapseEvenSpacing(row[c]);
5076
- }
5077
- }
5078
- const totalCells = merged.length * numCols;
5079
- const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
5080
- if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
5081
- return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
5082
- }
5083
- const md = [];
5084
- md.push("| " + merged[0].join(" | ") + " |");
5085
- md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
5086
- for (let r = 1; r < merged.length; r++) {
5087
- md.push("| " + merged[r].join(" | ") + " |");
5088
- }
5089
- return md.join("\n");
5090
- }
5091
- function mergeLineSimple(items) {
5092
- if (items.length <= 1) return items[0]?.text || "";
5093
- const sorted = [...items].sort((a, b) => a.x - b.x);
5094
- const isEvenSpaced = detectEvenSpacedItems(sorted);
5095
- let result = sorted[0].text;
5096
- for (let i = 1; i < sorted.length; i++) {
5097
- const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
5098
- const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
5099
- const tabThreshold = Math.max(avgFs * 2, 30);
5100
- if (gap > tabThreshold) {
5101
- result += " ";
5102
- result += sorted[i].text;
5103
- continue;
5104
- }
5105
- if (isEvenSpaced[i]) {
5106
- result += sorted[i].text;
5107
- continue;
5108
- }
5109
- if (sorted[i].hasSpaceBefore && gap >= avgFs * 0.05) {
5110
- result += " ";
5111
- result += sorted[i].text;
5112
- continue;
5113
- }
5114
- if (/[□■○●▶◆◇ㅇ]$/.test(sorted[i - 1].text) && /^[가-힣]/.test(sorted[i].text) && gap > 1) {
5115
- result += " ";
5116
- result += sorted[i].text;
5117
- continue;
5118
- }
5119
- if (gap < avgFs * 0.15) {
5120
- } else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
5121
- } else if (gap > 3) result += " ";
5122
- result += sorted[i].text;
5123
- }
5124
- return result;
5125
- }
5126
- function cleanPdfText(text) {
5127
- return mergeKoreanLines(
5128
- text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
5129
- ).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
5130
- }
5131
- function startsWithMarker(line) {
5132
- const t = line.trimStart();
5133
- return /^[가-힣ㄱ-ㅎ][.)]/.test(t) || /^\d+[.)]/.test(t) || /^\([가-힣ㄱ-ㅎ\d]+\)/.test(t) || /^[○●※▶▷◆◇■□★☆\-·]\s/.test(t) || /^제\d+[조항호장절]/.test(t);
5134
- }
5135
- function isStandaloneHeader(line) {
5136
- return /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/.test(line.trim());
5137
- }
5138
- function detectListBlocks(blocks) {
5139
- const result = [];
5140
- for (let i = 0; i < blocks.length; i++) {
5141
- const block = blocks[i];
5142
- if (block.type === "paragraph" && block.text) {
5143
- const text = block.text.trim();
5144
- if (/^\d+\.\s/.test(text)) {
5145
- result.push({ ...block, type: "list", listType: "ordered", text: block.text });
5146
- continue;
5147
- }
5148
- if (/^[○●·※▶▷◆◇\-]\s/.test(text)) {
5149
- result.push({ ...block, type: "list", listType: "unordered", text: block.text });
5150
- continue;
5151
- }
5152
- }
5153
- result.push(block);
5154
- }
5155
- return result;
5156
- }
5157
- var KOREAN_TABLE_HEADER_RE = /^\(?(구분|항목|종류|분류|유형|대상|내용|기간|금액|비율|방법|절차|요건|조건|근거|목적|범위|기준)\)?[:\s]/;
5158
- var KV_FALSE_POSITIVE_RE = /\d{1,2}:\d{2}|:\/\/|\d+:\d+/;
5159
- function detectSpecialKoreanTables(blocks) {
5160
- const result = [];
5161
- let kvLines = [];
5162
- const flushKvTable = () => {
5163
- if (kvLines.length < 2) {
5164
- for (const kv of kvLines) result.push(kv.block);
5165
- kvLines = [];
5166
- return;
5167
- }
5168
- const cells = kvLines.map((kv) => {
5169
- if (kv.value) {
5170
- return [
5171
- { text: kv.key, colSpan: 1, rowSpan: 1 },
5172
- { text: kv.value, colSpan: 1, rowSpan: 1 }
5173
- ];
5174
- }
5175
- return [
5176
- { text: kv.key, colSpan: 2, rowSpan: 1 },
5177
- { text: "", colSpan: 1, rowSpan: 1 }
5178
- ];
5179
- });
5180
- const irTable = {
5181
- rows: cells.length,
5182
- cols: 2,
5183
- cells,
5184
- hasHeader: true
5185
- };
5186
- const firstBlock = kvLines[0].block;
5187
- result.push({
5188
- type: "table",
5189
- table: irTable,
5190
- pageNumber: firstBlock.pageNumber,
5191
- bbox: firstBlock.bbox
5192
- });
5193
- kvLines = [];
5194
- };
5195
- for (const block of blocks) {
5196
- if (block.type !== "paragraph" || !block.text) {
5197
- flushKvTable();
5198
- result.push(block);
5199
- continue;
5200
- }
5201
- const text = block.text.trim();
5202
- if (KOREAN_TABLE_HEADER_RE.test(text)) {
5203
- const colonIdx = text.indexOf(":");
5204
- if (colonIdx >= 0) {
5205
- kvLines.push({
5206
- key: text.slice(0, colonIdx).trim(),
5207
- value: text.slice(colonIdx + 1).trim(),
5208
- block
5209
- });
5210
- } else {
5211
- const spaceIdx = text.search(/\s/);
5212
- if (spaceIdx > 0) {
5213
- kvLines.push({
5214
- key: text.slice(0, spaceIdx).trim(),
5215
- value: text.slice(spaceIdx + 1).trim(),
5216
- block
5217
- });
5218
- } else {
5219
- kvLines.push({ key: text, value: "", block });
5220
- }
5221
- }
5222
- continue;
5223
- }
5224
- if (kvLines.length > 0 && text.includes(":")) {
5225
- if (!KV_FALSE_POSITIVE_RE.test(text) && !text.includes("(") && !text.includes(")")) {
5226
- const colonIdx = text.indexOf(":");
5227
- const key = text.slice(0, colonIdx).trim();
5228
- if (/^[가-힣]+$/.test(key) && key.length >= 2 && key.length <= 8) {
5229
- kvLines.push({
5230
- key,
5231
- value: text.slice(colonIdx + 1).trim(),
5232
- block
5233
- });
5234
- continue;
5235
- }
5236
- }
5237
- }
5238
- flushKvTable();
5239
- result.push(block);
5240
- }
5241
- flushKvTable();
5242
- return result;
5243
- }
5244
- function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
5245
- const ZONE_RATIO = 0.1;
5246
- const MIN_REPEAT = 3;
5247
- const headerTexts = /* @__PURE__ */ new Map();
5248
- const footerTexts = /* @__PURE__ */ new Map();
5249
- for (let bi = 0; bi < blocks.length; bi++) {
5250
- const b = blocks[bi];
5251
- if (!b.bbox || !b.pageNumber || !b.text?.trim()) continue;
5252
- const ph = pageHeights.get(b.bbox.page) || pageHeights.get(b.pageNumber);
5253
- if (!ph) continue;
5254
- const blockTop = ph - (b.bbox.y + b.bbox.height);
5255
- const blockBottom = ph - b.bbox.y;
5256
- if (blockBottom <= ph * ZONE_RATIO) {
5257
- const arr = footerTexts.get(b.pageNumber) || [];
5258
- arr.push(b.text.trim());
5259
- footerTexts.set(b.pageNumber, arr);
5260
- } else if (blockTop >= ph * (1 - ZONE_RATIO)) {
5261
- const arr = headerTexts.get(b.pageNumber) || [];
5262
- arr.push(b.text.trim());
5263
- headerTexts.set(b.pageNumber, arr);
5264
- }
5265
- }
5266
- const repeatedPatterns = /* @__PURE__ */ new Set();
5267
- for (const textsMap of [headerTexts, footerTexts]) {
5268
- const patternCount = /* @__PURE__ */ new Map();
5269
- for (const [, texts] of textsMap) {
5270
- for (const t of texts) {
5271
- const normalized = t.replace(/\d+/g, "#");
5272
- patternCount.set(normalized, (patternCount.get(normalized) || 0) + 1);
5273
- }
5274
- }
5275
- for (const [pattern, count] of patternCount) {
5276
- if (count >= MIN_REPEAT) repeatedPatterns.add(pattern);
5277
- }
5278
- }
5279
- if (repeatedPatterns.size === 0) return [];
5280
- const removeIndices = [];
5281
- for (let bi = 0; bi < blocks.length; bi++) {
5282
- const b = blocks[bi];
5283
- if (!b.bbox || !b.pageNumber || !b.text?.trim()) continue;
5284
- const ph = pageHeights.get(b.bbox.page) || pageHeights.get(b.pageNumber);
5285
- if (!ph) continue;
5286
- const blockTop = ph - (b.bbox.y + b.bbox.height);
5287
- const blockBottom = ph - b.bbox.y;
5288
- const inZone = blockBottom <= ph * ZONE_RATIO || blockTop >= ph * (1 - ZONE_RATIO);
5289
- if (!inZone) continue;
5290
- const normalized = b.text.trim().replace(/\d+/g, "#");
5291
- if (repeatedPatterns.has(normalized)) {
5292
- removeIndices.push(bi);
5293
- }
5294
- }
5295
- if (removeIndices.length > 0) {
5296
- warnings.push({ message: `${removeIndices.length}\uAC1C \uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00 \uC694\uC18C \uC81C\uAC70\uB428`, code: "HIDDEN_TEXT_FILTERED" });
5297
- }
5298
- return removeIndices;
5299
- }
5300
- function mergeKoreanLines(text) {
5301
- if (!text) return "";
5302
- const lines = text.split("\n");
5303
- if (lines.length <= 1) return text;
5304
- const result = [lines[0]];
5305
- for (let i = 1; i < lines.length; i++) {
5306
- const prev = result[result.length - 1];
5307
- const curr = lines[i];
5308
- const currTrimmed = curr.trim();
5309
- if (/^#{1,6}\s/.test(prev) || /^#{1,6}\s/.test(curr) || /^\|/.test(currTrimmed) || /^---/.test(currTrimmed)) {
5310
- result.push(curr);
5311
- continue;
5312
- }
5313
- if (/,$/.test(prev.trim()) && currTrimmed.length > 0) {
5314
- result[result.length - 1] = prev + "\n" + curr;
5315
- continue;
5316
- }
5317
- if (/^\(※/.test(currTrimmed)) {
5318
- result[result.length - 1] = prev + " " + currTrimmed;
5319
- continue;
5320
- }
5321
- if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev) && !startsWithMarker(prev)) {
5322
- result[result.length - 1] = prev + " " + curr;
5323
- } else {
5324
- result.push(curr);
5325
- }
5326
- }
5327
- return result.join("\n");
5328
- }
5329
-
5330
- // src/xlsx/parser.ts
5331
- var import_jszip3 = __toESM(require("jszip"), 1);
5332
- var import_xmldom2 = require("@xmldom/xmldom");
5333
- var MAX_SHEETS = 100;
5334
- var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024;
5335
- var MAX_ROWS2 = 1e4;
5336
- var MAX_COLS2 = 200;
5337
- function cleanNumericValue(raw) {
5338
- if (!/^-?\d+\.\d+$/.test(raw)) return raw;
5339
- const num = parseFloat(raw);
5340
- if (!isFinite(num)) return raw;
5341
- const cleaned = parseFloat(num.toPrecision(15)).toString();
5342
- return cleaned;
5343
- }
5344
- function parseCellRef(ref) {
5345
- const m = ref.match(/^([A-Z]+)(\d+)$/);
5346
- if (!m) return null;
5347
- let col = 0;
5348
- for (const ch of m[1]) col = col * 26 + (ch.charCodeAt(0) - 64);
5349
- return { col: col - 1, row: parseInt(m[2], 10) - 1 };
5350
- }
5351
- function parseMergeRef(ref) {
5352
- const parts = ref.split(":");
5353
- if (parts.length !== 2) return null;
5354
- const start = parseCellRef(parts[0]);
5355
- const end = parseCellRef(parts[1]);
5356
- if (!start || !end) return null;
5357
- return { startCol: start.col, startRow: start.row, endCol: end.col, endRow: end.row };
5358
- }
5359
- function getElements(parent, tagName) {
5360
- const nodes = parent.getElementsByTagName(tagName);
5361
- const result = [];
5362
- for (let i = 0; i < nodes.length; i++) result.push(nodes[i]);
5363
- return result;
5364
- }
5365
- function getTextContent(el) {
5366
- return el.textContent?.trim() ?? "";
5367
- }
5368
- function parseXml(text) {
5369
- return new import_xmldom2.DOMParser().parseFromString(stripDtd(text), "text/xml");
5370
- }
5371
- function parseSharedStrings(xml) {
5372
- const doc = parseXml(xml);
5373
- const strings = [];
5374
- const siList = getElements(doc.documentElement, "si");
5375
- for (const si of siList) {
5376
- const tElements = getElements(si, "t");
5377
- strings.push(tElements.map((t) => t.textContent ?? "").join(""));
5378
- }
5379
- return strings;
2578
+ return strings;
5380
2579
  }
5381
2580
  function parseWorkbook(xml) {
5382
2581
  const doc = parseXml(xml);
@@ -5384,9 +2583,9 @@ function parseWorkbook(xml) {
5384
2583
  const sheetElements = getElements(doc.documentElement, "sheet");
5385
2584
  for (const el of sheetElements) {
5386
2585
  sheets.push({
5387
- name: el.getAttribute("name") ?? `Sheet${sheets.length + 1}`,
5388
- sheetId: el.getAttribute("sheetId") ?? "",
5389
- rId: el.getAttribute("r:id") ?? ""
2586
+ name: _nullishCoalesce(el.getAttribute("name"), () => ( `Sheet${sheets.length + 1}`)),
2587
+ sheetId: _nullishCoalesce(el.getAttribute("sheetId"), () => ( "")),
2588
+ rId: _nullishCoalesce(el.getAttribute("r:id"), () => ( ""))
5390
2589
  });
5391
2590
  }
5392
2591
  return sheets;
@@ -5409,7 +2608,7 @@ function parseWorksheet(xml, sharedStrings) {
5409
2608
  let maxCol = 0;
5410
2609
  const rows = getElements(doc.documentElement, "row");
5411
2610
  for (const rowEl of rows) {
5412
- const rowNum = parseInt(rowEl.getAttribute("r") ?? "0", 10) - 1;
2611
+ const rowNum = parseInt(_nullishCoalesce(rowEl.getAttribute("r"), () => ( "0")), 10) - 1;
5413
2612
  if (rowNum < 0 || rowNum >= MAX_ROWS2) continue;
5414
2613
  const cells = getElements(rowEl, "c");
5415
2614
  for (const cellEl of cells) {
@@ -5425,7 +2624,7 @@ function parseWorksheet(xml, sharedStrings) {
5425
2624
  const raw = getTextContent(vElements[0]);
5426
2625
  if (type === "s") {
5427
2626
  const idx = parseInt(raw, 10);
5428
- value = sharedStrings[idx] ?? "";
2627
+ value = _nullishCoalesce(sharedStrings[idx], () => ( ""));
5429
2628
  } else if (type === "b") {
5430
2629
  value = raw === "1" ? "TRUE" : "FALSE";
5431
2630
  } else {
@@ -5435,7 +2634,7 @@ function parseWorksheet(xml, sharedStrings) {
5435
2634
  const isEl = getElements(cellEl, "is");
5436
2635
  if (isEl.length > 0) {
5437
2636
  const tElements = getElements(isEl[0], "t");
5438
- value = tElements.map((t) => t.textContent ?? "").join("");
2637
+ value = tElements.map((t) => _nullishCoalesce(t.textContent, () => ( ""))).join("");
5439
2638
  }
5440
2639
  }
5441
2640
  if (!value && fElements.length > 0) {
@@ -5499,18 +2698,18 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
5499
2698
  for (let c = 0; c <= maxCol; c++) {
5500
2699
  const key = `${r},${c}`;
5501
2700
  if (mergeSkip.has(key)) continue;
5502
- const text = (grid[r] && grid[r][c]) ?? "";
2701
+ const text = _nullishCoalesce((grid[r] && grid[r][c]), () => ( ""));
5503
2702
  const merge = mergeMap.get(key);
5504
2703
  row.push({
5505
2704
  text,
5506
- colSpan: merge?.colSpan ?? 1,
5507
- rowSpan: merge?.rowSpan ?? 1
2705
+ colSpan: _nullishCoalesce(_optionalChain([merge, 'optionalAccess', _40 => _40.colSpan]), () => ( 1)),
2706
+ rowSpan: _nullishCoalesce(_optionalChain([merge, 'optionalAccess', _41 => _41.rowSpan]), () => ( 1))
5508
2707
  });
5509
2708
  }
5510
2709
  cellRows.push(row);
5511
2710
  }
5512
2711
  if (cellRows.length > 0) {
5513
- const table = buildTable(cellRows);
2712
+ const table = _chunkHXUCZ2ILcjs.buildTable.call(void 0, cellRows);
5514
2713
  if (table.rows > 0) {
5515
2714
  blocks.push({ type: "table", table, pageNumber: sheetIndex + 1 });
5516
2715
  }
@@ -5518,12 +2717,12 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
5518
2717
  return blocks;
5519
2718
  }
5520
2719
  async function parseXlsxDocument(buffer, options) {
5521
- precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
5522
- const zip = await import_jszip3.default.loadAsync(buffer);
2720
+ _chunkHXUCZ2ILcjs.precheckZipSize.call(void 0, buffer, MAX_DECOMPRESS_SIZE3);
2721
+ const zip = await _jszip2.default.loadAsync(buffer);
5523
2722
  const warnings = [];
5524
2723
  const workbookFile = zip.file("xl/workbook.xml");
5525
2724
  if (!workbookFile) {
5526
- throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
2725
+ throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
5527
2726
  }
5528
2727
  let sharedStrings = [];
5529
2728
  const ssFile = zip.file("xl/sharedStrings.xml");
@@ -5532,7 +2731,7 @@ async function parseXlsxDocument(buffer, options) {
5532
2731
  }
5533
2732
  const sheets = parseWorkbook(await workbookFile.async("text"));
5534
2733
  if (sheets.length === 0) {
5535
- throw new KordocError("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
2734
+ throw new (0, _chunkHXUCZ2ILcjs.KordocError)("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
5536
2735
  }
5537
2736
  let relsMap = /* @__PURE__ */ new Map();
5538
2737
  const relsFile = zip.file("xl/_rels/workbook.xml.rels");
@@ -5540,8 +2739,8 @@ async function parseXlsxDocument(buffer, options) {
5540
2739
  relsMap = parseRels(await relsFile.async("text"));
5541
2740
  }
5542
2741
  let pageFilter = null;
5543
- if (options?.pages) {
5544
- const { parsePageRange: parsePageRange2 } = await Promise.resolve().then(() => (init_page_range(), page_range_exports));
2742
+ if (_optionalChain([options, 'optionalAccess', _42 => _42.pages])) {
2743
+ const { parsePageRange: parsePageRange2 } = await Promise.resolve().then(() => _interopRequireWildcard(require("./page-range-3C7UGGEK.cjs")));
5545
2744
  pageFilter = parsePageRange2(options.pages, sheets.length);
5546
2745
  }
5547
2746
  const blocks = [];
@@ -5549,7 +2748,7 @@ async function parseXlsxDocument(buffer, options) {
5549
2748
  for (let i = 0; i < processedSheets; i++) {
5550
2749
  if (pageFilter && !pageFilter.has(i + 1)) continue;
5551
2750
  const sheet = sheets[i];
5552
- options?.onProgress?.(i + 1, processedSheets);
2751
+ _optionalChain([options, 'optionalAccess', _43 => _43.onProgress, 'optionalCall', _44 => _44(i + 1, processedSheets)]);
5553
2752
  let sheetPath = relsMap.get(sheet.rId);
5554
2753
  if (sheetPath) {
5555
2754
  if (!sheetPath.startsWith("xl/") && !sheetPath.startsWith("/")) {
@@ -5592,7 +2791,7 @@ async function parseXlsxDocument(buffer, options) {
5592
2791
  const doc = parseXml(coreXml);
5593
2792
  const getFirst = (tag) => {
5594
2793
  const els = doc.getElementsByTagName(tag);
5595
- return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
2794
+ return els.length > 0 ? (_nullishCoalesce(els[0].textContent, () => ( ""))).trim() : void 0;
5596
2795
  };
5597
2796
  metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
5598
2797
  metadata.author = getFirst("dc:creator");
@@ -5601,16 +2800,16 @@ async function parseXlsxDocument(buffer, options) {
5601
2800
  if (created) metadata.createdAt = created;
5602
2801
  const modified = getFirst("dcterms:modified");
5603
2802
  if (modified) metadata.modifiedAt = modified;
5604
- } catch {
2803
+ } catch (e20) {
5605
2804
  }
5606
2805
  }
5607
- const markdown = blocksToMarkdown(blocks);
2806
+ const markdown = _chunkHXUCZ2ILcjs.blocksToMarkdown.call(void 0, blocks);
5608
2807
  return { markdown, blocks, metadata, warnings: warnings.length > 0 ? warnings : void 0 };
5609
2808
  }
5610
2809
 
5611
2810
  // src/docx/parser.ts
5612
- var import_jszip4 = __toESM(require("jszip"), 1);
5613
- var import_xmldom3 = require("@xmldom/xmldom");
2811
+
2812
+
5614
2813
  var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
5615
2814
  function getChildElements(parent, localName2) {
5616
2815
  const result = [];
@@ -5619,7 +2818,7 @@ function getChildElements(parent, localName2) {
5619
2818
  const node = children[i];
5620
2819
  if (node.nodeType === 1) {
5621
2820
  const el = node;
5622
- if (el.localName === localName2 || el.tagName?.endsWith(`:${localName2}`)) {
2821
+ if (el.localName === localName2 || _optionalChain([el, 'access', _45 => _45.tagName, 'optionalAccess', _46 => _46.endsWith, 'call', _47 => _47(`:${localName2}`)])) {
5623
2822
  result.push(el);
5624
2823
  }
5625
2824
  }
@@ -5634,7 +2833,7 @@ function findElements(parent, localName2) {
5634
2833
  const child = children[i];
5635
2834
  if (child.nodeType === 1) {
5636
2835
  const el = child;
5637
- if (el.localName === localName2 || el.tagName?.endsWith(`:${localName2}`)) {
2836
+ if (el.localName === localName2 || _optionalChain([el, 'access', _48 => _48.tagName, 'optionalAccess', _49 => _49.endsWith, 'call', _50 => _50(`:${localName2}`)])) {
5638
2837
  result.push(el);
5639
2838
  }
5640
2839
  walk(el);
@@ -5653,7 +2852,7 @@ function getAttr(el, localName2) {
5653
2852
  return null;
5654
2853
  }
5655
2854
  function parseXml2(text) {
5656
- return new import_xmldom3.DOMParser().parseFromString(stripDtd(text), "text/xml");
2855
+ return new (0, _xmldom.DOMParser)().parseFromString(_chunkHXUCZ2ILcjs.stripDtd.call(void 0, text), "text/xml");
5657
2856
  }
5658
2857
  function parseStyles(xml) {
5659
2858
  const doc = parseXml2(xml);
@@ -5663,9 +2862,9 @@ function parseStyles(xml) {
5663
2862
  const styleId = getAttr(el, "styleId");
5664
2863
  if (!styleId) continue;
5665
2864
  const nameEls = getChildElements(el, "name");
5666
- const name = nameEls.length > 0 ? getAttr(nameEls[0], "val") ?? "" : "";
2865
+ const name = nameEls.length > 0 ? _nullishCoalesce(getAttr(nameEls[0], "val"), () => ( "")) : "";
5667
2866
  const basedOnEls = getChildElements(el, "basedOn");
5668
- const basedOn = basedOnEls.length > 0 ? getAttr(basedOnEls[0], "val") ?? void 0 : void 0;
2867
+ const basedOn = basedOnEls.length > 0 ? _nullishCoalesce(getAttr(basedOnEls[0], "val"), () => ( void 0)) : void 0;
5669
2868
  const pPrEls = getChildElements(el, "pPr");
5670
2869
  let outlineLevel;
5671
2870
  if (pPrEls.length > 0) {
@@ -5693,9 +2892,9 @@ function parseNumbering(xml) {
5693
2892
  const levels = /* @__PURE__ */ new Map();
5694
2893
  const lvlElements = getChildElements(el, "lvl");
5695
2894
  for (const lvl of lvlElements) {
5696
- const ilvl = parseInt(getAttr(lvl, "ilvl") ?? "0", 10);
2895
+ const ilvl = parseInt(_nullishCoalesce(getAttr(lvl, "ilvl"), () => ( "0")), 10);
5697
2896
  const numFmtEls = getChildElements(lvl, "numFmt");
5698
- const numFmt = numFmtEls.length > 0 ? getAttr(numFmtEls[0], "val") ?? "bullet" : "bullet";
2897
+ const numFmt = numFmtEls.length > 0 ? _nullishCoalesce(getAttr(numFmtEls[0], "val"), () => ( "bullet")) : "bullet";
5699
2898
  levels.set(ilvl, { numFmt, level: ilvl });
5700
2899
  }
5701
2900
  abstractNums.set(abstractNumId, levels);
@@ -5739,7 +2938,7 @@ function parseFootnotes(xml) {
5739
2938
  const runs = findElements(p, "r");
5740
2939
  for (const r of runs) {
5741
2940
  const tElements = getChildElements(r, "t");
5742
- for (const t of tElements) texts.push(t.textContent ?? "");
2941
+ for (const t of tElements) texts.push(_nullishCoalesce(t.textContent, () => ( "")));
5743
2942
  }
5744
2943
  }
5745
2944
  notes.set(id, texts.join("").trim());
@@ -5748,7 +2947,7 @@ function parseFootnotes(xml) {
5748
2947
  }
5749
2948
  function extractRun(r) {
5750
2949
  const tElements = getChildElements(r, "t");
5751
- const text = tElements.map((t) => t.textContent ?? "").join("");
2950
+ const text = tElements.map((t) => _nullishCoalesce(t.textContent, () => ( ""))).join("");
5752
2951
  let bold = false;
5753
2952
  let italic = false;
5754
2953
  const rPrEls = getChildElements(r, "rPr");
@@ -5765,13 +2964,13 @@ function parseParagraph(p, styles, numbering, footnotes, rels) {
5765
2964
  let ilvl = 0;
5766
2965
  if (pPrEls.length > 0) {
5767
2966
  const pStyleEls = getChildElements(pPrEls[0], "pStyle");
5768
- if (pStyleEls.length > 0) styleId = getAttr(pStyleEls[0], "val") ?? "";
2967
+ if (pStyleEls.length > 0) styleId = _nullishCoalesce(getAttr(pStyleEls[0], "val"), () => ( ""));
5769
2968
  const numPrEls = getChildElements(pPrEls[0], "numPr");
5770
2969
  if (numPrEls.length > 0) {
5771
2970
  const numIdEls = getChildElements(numPrEls[0], "numId");
5772
2971
  const ilvlEls = getChildElements(numPrEls[0], "ilvl");
5773
- numId = numIdEls.length > 0 ? getAttr(numIdEls[0], "val") ?? "" : "";
5774
- ilvl = ilvlEls.length > 0 ? parseInt(getAttr(ilvlEls[0], "val") ?? "0", 10) : 0;
2972
+ numId = numIdEls.length > 0 ? _nullishCoalesce(getAttr(numIdEls[0], "val"), () => ( "")) : "";
2973
+ ilvl = ilvlEls.length > 0 ? parseInt(_nullishCoalesce(getAttr(ilvlEls[0], "val"), () => ( "0")), 10) : 0;
5775
2974
  }
5776
2975
  }
5777
2976
  const parts = [];
@@ -5818,7 +3017,7 @@ function parseParagraph(p, styles, numbering, footnotes, rels) {
5818
3017
  const text = parts.join("").trim();
5819
3018
  if (!text) return null;
5820
3019
  const style = styles.get(styleId);
5821
- if (style?.outlineLevel !== void 0 && style.outlineLevel >= 0 && style.outlineLevel <= 5) {
3020
+ if (_optionalChain([style, 'optionalAccess', _51 => _51.outlineLevel]) !== void 0 && style.outlineLevel >= 0 && style.outlineLevel <= 5) {
5822
3021
  return {
5823
3022
  type: "heading",
5824
3023
  text,
@@ -5827,8 +3026,8 @@ function parseParagraph(p, styles, numbering, footnotes, rels) {
5827
3026
  }
5828
3027
  if (numId && numId !== "0") {
5829
3028
  const numDef = numbering.get(numId);
5830
- const levelInfo = numDef?.get(ilvl);
5831
- const listType = levelInfo?.numFmt === "bullet" ? "unordered" : "ordered";
3029
+ const levelInfo = _optionalChain([numDef, 'optionalAccess', _52 => _52.get, 'call', _53 => _53(ilvl)]);
3030
+ const listType = _optionalChain([levelInfo, 'optionalAccess', _54 => _54.numFmt]) === "bullet" ? "unordered" : "ordered";
5832
3031
  return { type: "list", text, listType };
5833
3032
  }
5834
3033
  const block = { type: "paragraph", text };
@@ -5854,7 +3053,7 @@ function parseTable(tbl, styles, numbering, footnotes, rels) {
5854
3053
  if (tcPrEls.length > 0) {
5855
3054
  const gridSpanEls = getChildElements(tcPrEls[0], "gridSpan");
5856
3055
  if (gridSpanEls.length > 0) {
5857
- colSpan = parseInt(getAttr(gridSpanEls[0], "val") ?? "1", 10);
3056
+ colSpan = parseInt(_nullishCoalesce(getAttr(gridSpanEls[0], "val"), () => ( "1")), 10);
5858
3057
  }
5859
3058
  const vMergeEls = getChildElements(tcPrEls[0], "vMerge");
5860
3059
  if (vMergeEls.length > 0) {
@@ -5869,7 +3068,7 @@ function parseTable(tbl, styles, numbering, footnotes, rels) {
5869
3068
  const pElements = getChildElements(tc, "p");
5870
3069
  for (const p of pElements) {
5871
3070
  const block = parseParagraph(p, styles, numbering, footnotes, rels);
5872
- if (block?.text) cellTexts.push(block.text);
3071
+ if (_optionalChain([block, 'optionalAccess', _55 => _55.text])) cellTexts.push(block.text);
5873
3072
  }
5874
3073
  row.push({ text: cellTexts.join("\n"), colSpan, rowSpan });
5875
3074
  }
@@ -5882,7 +3081,7 @@ function parseTable(tbl, styles, numbering, footnotes, rels) {
5882
3081
  if (!cell || cell.rowSpan === 0) continue;
5883
3082
  let span = 1;
5884
3083
  for (let nr = r + 1; nr < rows.length; nr++) {
5885
- if (rows[nr][c]?.rowSpan === 0) span++;
3084
+ if (_optionalChain([rows, 'access', _56 => _56[nr], 'access', _57 => _57[c], 'optionalAccess', _58 => _58.rowSpan]) === 0) span++;
5886
3085
  else break;
5887
3086
  }
5888
3087
  cell.rowSpan = span;
@@ -5926,7 +3125,7 @@ async function extractImages(zip, rels, doc) {
5926
3125
  try {
5927
3126
  const data = await imgFile.async("uint8array");
5928
3127
  imgIdx++;
5929
- const ext = imgPath.split(".").pop()?.toLowerCase() ?? "png";
3128
+ const ext = _nullishCoalesce(_optionalChain([imgPath, 'access', _59 => _59.split, 'call', _60 => _60("."), 'access', _61 => _61.pop, 'call', _62 => _62(), 'optionalAccess', _63 => _63.toLowerCase, 'call', _64 => _64()]), () => ( "png"));
5930
3129
  const mimeMap = {
5931
3130
  png: "image/png",
5932
3131
  jpg: "image/jpeg",
@@ -5937,21 +3136,21 @@ async function extractImages(zip, rels, doc) {
5937
3136
  emf: "image/emf"
5938
3137
  };
5939
3138
  const filename = `image_${String(imgIdx).padStart(3, "0")}.${ext}`;
5940
- images.push({ filename, data, mimeType: mimeMap[ext] ?? "image/png" });
3139
+ images.push({ filename, data, mimeType: _nullishCoalesce(mimeMap[ext], () => ( "image/png")) });
5941
3140
  blocks.push({ type: "image", text: filename });
5942
- } catch {
3141
+ } catch (e21) {
5943
3142
  }
5944
3143
  }
5945
3144
  }
5946
3145
  return { blocks, images };
5947
3146
  }
5948
3147
  async function parseDocxDocument(buffer, options) {
5949
- precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
5950
- const zip = await import_jszip4.default.loadAsync(buffer);
3148
+ _chunkHXUCZ2ILcjs.precheckZipSize.call(void 0, buffer, MAX_DECOMPRESS_SIZE4);
3149
+ const zip = await _jszip2.default.loadAsync(buffer);
5951
3150
  const warnings = [];
5952
3151
  const docFile = zip.file("word/document.xml");
5953
3152
  if (!docFile) {
5954
- throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
3153
+ throw new (0, _chunkHXUCZ2ILcjs.KordocError)("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
5955
3154
  }
5956
3155
  let rels = /* @__PURE__ */ new Map();
5957
3156
  const relsFile = zip.file("word/_rels/document.xml.rels");
@@ -5963,7 +3162,7 @@ async function parseDocxDocument(buffer, options) {
5963
3162
  if (stylesFile) {
5964
3163
  try {
5965
3164
  styles = parseStyles(await stylesFile.async("text"));
5966
- } catch {
3165
+ } catch (e22) {
5967
3166
  }
5968
3167
  }
5969
3168
  let numbering = /* @__PURE__ */ new Map();
@@ -5971,7 +3170,7 @@ async function parseDocxDocument(buffer, options) {
5971
3170
  if (numFile) {
5972
3171
  try {
5973
3172
  numbering = parseNumbering(await numFile.async("text"));
5974
- } catch {
3173
+ } catch (e23) {
5975
3174
  }
5976
3175
  }
5977
3176
  let footnotes = /* @__PURE__ */ new Map();
@@ -5979,14 +3178,14 @@ async function parseDocxDocument(buffer, options) {
5979
3178
  if (fnFile) {
5980
3179
  try {
5981
3180
  footnotes = parseFootnotes(await fnFile.async("text"));
5982
- } catch {
3181
+ } catch (e24) {
5983
3182
  }
5984
3183
  }
5985
3184
  const docXml = await docFile.async("text");
5986
3185
  const doc = parseXml2(docXml);
5987
3186
  const body = findElements(doc, "body");
5988
3187
  if (body.length === 0) {
5989
- throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
3188
+ throw new (0, _chunkHXUCZ2ILcjs.KordocError)("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
5990
3189
  }
5991
3190
  const blocks = [];
5992
3191
  const bodyEl = body[0];
@@ -5995,7 +3194,7 @@ async function parseDocxDocument(buffer, options) {
5995
3194
  const node = children[i];
5996
3195
  if (node.nodeType !== 1) continue;
5997
3196
  const el = node;
5998
- const localName2 = el.localName ?? el.tagName?.split(":").pop();
3197
+ const localName2 = _nullishCoalesce(el.localName, () => ( _optionalChain([el, 'access', _65 => _65.tagName, 'optionalAccess', _66 => _66.split, 'call', _67 => _67(":"), 'access', _68 => _68.pop, 'call', _69 => _69()])));
5999
3198
  if (localName2 === "p") {
6000
3199
  const block = parseParagraph(el, styles, numbering, footnotes, rels);
6001
3200
  if (block) blocks.push(block);
@@ -6013,7 +3212,7 @@ async function parseDocxDocument(buffer, options) {
6013
3212
  const coreDoc = parseXml2(coreXml);
6014
3213
  const getFirst = (tag) => {
6015
3214
  const els = coreDoc.getElementsByTagName(tag);
6016
- return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
3215
+ return els.length > 0 ? (_nullishCoalesce(els[0].textContent, () => ( ""))).trim() : void 0;
6017
3216
  };
6018
3217
  metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
6019
3218
  metadata.author = getFirst("dc:creator");
@@ -6022,11 +3221,11 @@ async function parseDocxDocument(buffer, options) {
6022
3221
  if (created) metadata.createdAt = created;
6023
3222
  const modified = getFirst("dcterms:modified");
6024
3223
  if (modified) metadata.modifiedAt = modified;
6025
- } catch {
3224
+ } catch (e25) {
6026
3225
  }
6027
3226
  }
6028
- const outline = blocks.filter((b) => b.type === "heading").map((b) => ({ level: b.level ?? 2, text: b.text ?? "" }));
6029
- const markdown = blocksToMarkdown(blocks);
3227
+ const outline = blocks.filter((b) => b.type === "heading").map((b) => ({ level: _nullishCoalesce(b.level, () => ( 2)), text: _nullishCoalesce(b.text, () => ( "")) }));
3228
+ const markdown = _chunkHXUCZ2ILcjs.blocksToMarkdown.call(void 0, blocks);
6030
3229
  return {
6031
3230
  markdown,
6032
3231
  blocks,
@@ -6270,7 +3469,7 @@ function fillFormFields(blocks, values) {
6270
3469
  if (block.type !== "table" || !block.table) continue;
6271
3470
  for (let r = 0; r < block.table.rows; r++) {
6272
3471
  for (let c = 0; c < block.table.cols; c++) {
6273
- const cell = block.table.cells[r]?.[c];
3472
+ const cell = _optionalChain([block, 'access', _70 => _70.table, 'access', _71 => _71.cells, 'access', _72 => _72[r], 'optionalAccess', _73 => _73[c]]);
6274
3473
  if (!cell) continue;
6275
3474
  const result = fillInCellPatterns(cell.text, normalizedValues, matchedLabels);
6276
3475
  if (result) {
@@ -6309,7 +3508,7 @@ function fillTable(table, values, filled, matchedLabels, patternFilledCells) {
6309
3508
  const matchKey = findMatchingKey(normalizedCellLabel, values);
6310
3509
  if (matchKey === void 0) continue;
6311
3510
  const newValue = values.get(matchKey);
6312
- if (patternFilledCells?.has(`${r},${c + 1}`)) {
3511
+ if (_optionalChain([patternFilledCells, 'optionalAccess', _74 => _74.has, 'call', _75 => _75(`${r},${c + 1}`)])) {
6313
3512
  valueCell.text = newValue + " " + valueCell.text;
6314
3513
  } else {
6315
3514
  valueCell.text = newValue;
@@ -6370,24 +3569,24 @@ function fillInlineFields(text, values, filled, matchedLabels) {
6370
3569
  }
6371
3570
 
6372
3571
  // src/form/filler-hwpx.ts
6373
- var import_jszip5 = __toESM(require("jszip"), 1);
6374
- var import_xmldom4 = require("@xmldom/xmldom");
3572
+
3573
+
6375
3574
  async function fillHwpx(hwpxBuffer, values) {
6376
- const zip = await import_jszip5.default.loadAsync(hwpxBuffer);
3575
+ const zip = await _jszip2.default.loadAsync(hwpxBuffer);
6377
3576
  const filled = [];
6378
3577
  const matchedLabels = /* @__PURE__ */ new Set();
6379
3578
  const normalizedValues = normalizeValues(values);
6380
3579
  const sectionFiles = Object.keys(zip.files).filter((name) => /[Ss]ection\d+\.xml$/i.test(name)).sort();
6381
3580
  if (sectionFiles.length === 0) {
6382
- throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
3581
+ throw new (0, _chunkHXUCZ2ILcjs.KordocError)("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
6383
3582
  }
6384
- const xmlParser = new import_xmldom4.DOMParser();
6385
- const xmlSerializer = new import_xmldom4.XMLSerializer();
3583
+ const xmlParser = new (0, _xmldom.DOMParser)();
3584
+ const xmlSerializer = new (0, _xmldom.XMLSerializer)();
6386
3585
  for (const sectionPath of sectionFiles) {
6387
3586
  const zipEntry = zip.file(sectionPath);
6388
3587
  if (!zipEntry) continue;
6389
3588
  const rawXml = await zipEntry.async("text");
6390
- const doc = xmlParser.parseFromString(stripDtd(rawXml), "text/xml");
3589
+ const doc = xmlParser.parseFromString(_chunkHXUCZ2ILcjs.stripDtd.call(void 0, rawXml), "text/xml");
6391
3590
  if (!doc.documentElement) continue;
6392
3591
  let modified = false;
6393
3592
  const tables = findAllElements(doc.documentElement, "tbl");
@@ -6678,7 +3877,7 @@ function applyTextReplacements(tNodes, originalFull, replacedFull) {
6678
3877
  }
6679
3878
 
6680
3879
  // src/hwpx/generator.ts
6681
- var import_jszip6 = __toESM(require("jszip"), 1);
3880
+
6682
3881
  var NS_SECTION = "http://www.hancom.co.kr/hwpml/2011/section";
6683
3882
  var NS_PARA = "http://www.hancom.co.kr/hwpml/2011/paragraph";
6684
3883
  var NS_HEAD = "http://www.hancom.co.kr/hwpml/2011/head";
@@ -6705,7 +3904,7 @@ var PARA_LIST = 7;
6705
3904
  async function markdownToHwpx(markdown) {
6706
3905
  const blocks = parseMarkdownToBlocks(markdown);
6707
3906
  const sectionXml = blocksToSectionXml(blocks);
6708
- const zip = new import_jszip6.default();
3907
+ const zip = new (0, _jszip2.default)();
6709
3908
  zip.file("mimetype", "application/hwp+zip", { compression: "STORE" });
6710
3909
  zip.file("META-INF/container.xml", generateContainerXml());
6711
3910
  zip.file("Contents/content.hpf", generateManifest());
@@ -7247,14 +4446,14 @@ async function parse(input, options) {
7247
4446
  let buffer;
7248
4447
  if (typeof input === "string") {
7249
4448
  try {
7250
- const buf = await (0, import_promises.readFile)(input);
7251
- buffer = toArrayBuffer(buf);
4449
+ const buf = await _promises.readFile.call(void 0, input);
4450
+ buffer = _chunkHXUCZ2ILcjs.toArrayBuffer.call(void 0, buf);
7252
4451
  } catch (err) {
7253
4452
  const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
7254
4453
  return { success: false, fileType: "unknown", error: msg, code: "PARSE_ERROR" };
7255
4454
  }
7256
4455
  } else if (Buffer.isBuffer(input)) {
7257
- buffer = toArrayBuffer(input);
4456
+ buffer = _chunkHXUCZ2ILcjs.toArrayBuffer.call(void 0, input);
7258
4457
  } else {
7259
4458
  buffer = input;
7260
4459
  }
@@ -7280,26 +4479,38 @@ async function parse(input, options) {
7280
4479
  async function parseHwpx(buffer, options) {
7281
4480
  try {
7282
4481
  const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
7283
- return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
4482
+ return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: _optionalChain([images, 'optionalAccess', _76 => _76.length]) ? images : void 0 };
7284
4483
  } catch (err) {
7285
- return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
4484
+ return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: _chunkHXUCZ2ILcjs.classifyError.call(void 0, err) };
7286
4485
  }
7287
4486
  }
7288
4487
  async function parseHwp(buffer, options) {
7289
4488
  try {
7290
4489
  const { markdown, blocks, metadata, outline, warnings, images } = parseHwp5Document(Buffer.from(buffer), options);
7291
- return { success: true, fileType: "hwp", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
4490
+ return { success: true, fileType: "hwp", markdown, blocks, metadata, outline, warnings, images: _optionalChain([images, 'optionalAccess', _77 => _77.length]) ? images : void 0 };
7292
4491
  } catch (err) {
7293
- return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
4492
+ return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code: _chunkHXUCZ2ILcjs.classifyError.call(void 0, err) };
7294
4493
  }
7295
4494
  }
7296
4495
  async function parsePdf(buffer, options) {
4496
+ let parsePdfDocument;
4497
+ try {
4498
+ const mod = await Promise.resolve().then(() => _interopRequireWildcard(require("./parser-KOWPTDJU.cjs")));
4499
+ parsePdfDocument = mod.parsePdfDocument;
4500
+ } catch (e26) {
4501
+ return {
4502
+ success: false,
4503
+ fileType: "pdf",
4504
+ error: "PDF \uD30C\uC2F1\uC5D0 pdfjs-dist\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4. \uC124\uCE58: npm install pdfjs-dist",
4505
+ code: "MISSING_DEPENDENCY"
4506
+ };
4507
+ }
7297
4508
  try {
7298
4509
  const { markdown, blocks, metadata, outline, warnings, isImageBased } = await parsePdfDocument(buffer, options);
7299
4510
  return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased };
7300
4511
  } catch (err) {
7301
4512
  const isImageBased = err instanceof Error && "isImageBased" in err ? true : void 0;
7302
- return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
4513
+ return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: _chunkHXUCZ2ILcjs.classifyError.call(void 0, err), isImageBased };
7303
4514
  }
7304
4515
  }
7305
4516
  async function parseXlsx(buffer, options) {
@@ -7307,24 +4518,24 @@ async function parseXlsx(buffer, options) {
7307
4518
  const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
7308
4519
  return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
7309
4520
  } catch (err) {
7310
- return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
4521
+ return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: _chunkHXUCZ2ILcjs.classifyError.call(void 0, err) };
7311
4522
  }
7312
4523
  }
7313
4524
  async function parseDocx(buffer, options) {
7314
4525
  try {
7315
4526
  const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
7316
- return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
4527
+ return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: _optionalChain([images, 'optionalAccess', _78 => _78.length]) ? images : void 0 };
7317
4528
  } catch (err) {
7318
- return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
4529
+ return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: _chunkHXUCZ2ILcjs.classifyError.call(void 0, err) };
7319
4530
  }
7320
4531
  }
7321
4532
  async function fillForm(input, values, outputFormat = "markdown") {
7322
4533
  let buffer;
7323
4534
  if (typeof input === "string") {
7324
- const buf = await (0, import_promises.readFile)(input);
7325
- buffer = toArrayBuffer(buf);
4535
+ const buf = await _promises.readFile.call(void 0, input);
4536
+ buffer = _chunkHXUCZ2ILcjs.toArrayBuffer.call(void 0, buf);
7326
4537
  } else if (Buffer.isBuffer(input)) {
7327
- buffer = toArrayBuffer(input);
4538
+ buffer = _chunkHXUCZ2ILcjs.toArrayBuffer.call(void 0, input);
7328
4539
  } else {
7329
4540
  buffer = input;
7330
4541
  }
@@ -7350,36 +4561,35 @@ async function fillForm(input, values, outputFormat = "markdown") {
7350
4561
  throw new Error(`\uC11C\uC2DD \uD30C\uC2F1 \uC2E4\uD328: ${parsed.error}`);
7351
4562
  }
7352
4563
  const fill = fillFormFields(parsed.blocks, values);
7353
- const markdown = blocksToMarkdown(fill.blocks);
4564
+ const markdown = _chunkHXUCZ2ILcjs.blocksToMarkdown.call(void 0, fill.blocks);
7354
4565
  if (outputFormat === "hwpx") {
7355
4566
  const hwpxBuffer = await markdownToHwpx(markdown);
7356
4567
  return { output: hwpxBuffer, format: "hwpx", fill };
7357
4568
  }
7358
4569
  return { output: markdown, format: "markdown", fill };
7359
4570
  }
7360
- // Annotate the CommonJS export names for ESM import in node:
7361
- 0 && (module.exports = {
7362
- VERSION,
7363
- blocksToMarkdown,
7364
- compare,
7365
- detectFormat,
7366
- detectZipFormat,
7367
- diffBlocks,
7368
- extractFormFields,
7369
- fillForm,
7370
- fillFormFields,
7371
- fillHwpx,
7372
- isHwpxFile,
7373
- isLabelCell,
7374
- isOldHwpFile,
7375
- isPdfFile,
7376
- isZipFile,
7377
- markdownToHwpx,
7378
- parse,
7379
- parseDocx,
7380
- parseHwp,
7381
- parseHwpx,
7382
- parsePdf,
7383
- parseXlsx
7384
- });
4571
+
4572
+
4573
+
4574
+
4575
+
4576
+
4577
+
4578
+
4579
+
4580
+
4581
+
4582
+
4583
+
4584
+
4585
+
4586
+
4587
+
4588
+
4589
+
4590
+
4591
+
4592
+
4593
+
4594
+ exports.VERSION = _chunkHXUCZ2ILcjs.VERSION; exports.blocksToMarkdown = _chunkHXUCZ2ILcjs.blocksToMarkdown; exports.compare = compare; exports.detectFormat = detectFormat; exports.detectZipFormat = detectZipFormat; exports.diffBlocks = diffBlocks; exports.extractFormFields = extractFormFields; exports.fillForm = fillForm; exports.fillFormFields = fillFormFields; exports.fillHwpx = fillHwpx; exports.isHwpxFile = isHwpxFile; exports.isLabelCell = isLabelCell; exports.isOldHwpFile = isOldHwpFile; exports.isPdfFile = isPdfFile; exports.isZipFile = isZipFile; exports.markdownToHwpx = markdownToHwpx; exports.parse = parse; exports.parseDocx = parseDocx; exports.parseHwp = parseHwp; exports.parseHwpx = parseHwpx; exports.parsePdf = parsePdf; exports.parseXlsx = parseXlsx;
7385
4595
  //# sourceMappingURL=index.cjs.map