kordoc 1.7.2 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -15
- package/dist/chunk-AHW56LNX.js +93 -0
- package/dist/chunk-AHW56LNX.js.map +1 -0
- package/dist/{chunk-NJ3R7LNR.js → chunk-MDRW3HYC.js} +1165 -234
- package/dist/chunk-MDRW3HYC.js.map +1 -0
- package/dist/chunk-MOL7MDBG.js +35 -0
- package/dist/chunk-MOL7MDBG.js.map +1 -0
- package/dist/cli.js +11 -7
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1253 -195
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +17 -5
- package/dist/index.d.ts +17 -5
- package/dist/index.js +1248 -194
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +10 -7
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-737B4EZW.js +8 -0
- package/dist/page-range-737B4EZW.js.map +1 -0
- package/dist/provider-A4FHJSID.js +0 -0
- package/dist/utils-VU6Z7HNR.js +22 -0
- package/dist/utils-VU6Z7HNR.js.map +1 -0
- package/dist/{watch-AKTZTPVF.js → watch-5IOZWFDD.js} +13 -5
- package/dist/watch-5IOZWFDD.js.map +1 -0
- package/package.json +77 -75
- package/dist/chunk-NJ3R7LNR.js.map +0 -1
- package/dist/watch-AKTZTPVF.js.map +0 -1
|
@@ -1,10 +1,22 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
KordocError,
|
|
4
|
+
classifyError,
|
|
5
|
+
isPathTraversal,
|
|
6
|
+
precheckZipSize,
|
|
7
|
+
sanitizeHref,
|
|
8
|
+
toArrayBuffer
|
|
9
|
+
} from "./chunk-AHW56LNX.js";
|
|
10
|
+
import {
|
|
11
|
+
parsePageRange
|
|
12
|
+
} from "./chunk-MOL7MDBG.js";
|
|
2
13
|
|
|
3
14
|
// src/detect.ts
|
|
15
|
+
import JSZip from "jszip";
|
|
4
16
|
function magicBytes(buffer) {
|
|
5
17
|
return new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
|
|
6
18
|
}
|
|
7
|
-
function
|
|
19
|
+
function isZipFile(buffer) {
|
|
8
20
|
const b = magicBytes(buffer);
|
|
9
21
|
return b[0] === 80 && b[1] === 75 && b[2] === 3 && b[3] === 4;
|
|
10
22
|
}
|
|
@@ -18,15 +30,28 @@ function isPdfFile(buffer) {
|
|
|
18
30
|
}
|
|
19
31
|
function detectFormat(buffer) {
|
|
20
32
|
if (buffer.byteLength < 4) return "unknown";
|
|
21
|
-
if (
|
|
33
|
+
if (isZipFile(buffer)) return "hwpx";
|
|
22
34
|
if (isOldHwpFile(buffer)) return "hwp";
|
|
23
35
|
if (isPdfFile(buffer)) return "pdf";
|
|
24
36
|
return "unknown";
|
|
25
37
|
}
|
|
38
|
+
async function detectZipFormat(buffer) {
|
|
39
|
+
try {
|
|
40
|
+
const zip = await JSZip.loadAsync(buffer);
|
|
41
|
+
if (zip.file("xl/workbook.xml")) return "xlsx";
|
|
42
|
+
if (zip.file("word/document.xml")) return "docx";
|
|
43
|
+
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
|
|
44
|
+
const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
|
|
45
|
+
if (hasSection) return "hwpx";
|
|
46
|
+
return "unknown";
|
|
47
|
+
} catch {
|
|
48
|
+
return "unknown";
|
|
49
|
+
}
|
|
50
|
+
}
|
|
26
51
|
|
|
27
52
|
// src/table/builder.ts
|
|
28
53
|
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
29
|
-
function
|
|
54
|
+
function sanitizeHref2(href) {
|
|
30
55
|
const trimmed = href.trim();
|
|
31
56
|
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
32
57
|
return trimmed;
|
|
@@ -36,23 +61,24 @@ var MAX_ROWS = 1e4;
|
|
|
36
61
|
function buildTable(rows) {
|
|
37
62
|
if (rows.length > MAX_ROWS) rows = rows.slice(0, MAX_ROWS);
|
|
38
63
|
const numRows = rows.length;
|
|
39
|
-
const
|
|
64
|
+
const hasAddr = rows.some((row) => row.some((c) => c.colAddr !== void 0 && c.rowAddr !== void 0));
|
|
65
|
+
if (hasAddr) return buildTableDirect(rows, numRows);
|
|
40
66
|
let maxCols = 0;
|
|
67
|
+
const tempOccupied = Array.from({ length: numRows }, () => []);
|
|
41
68
|
for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
|
|
42
69
|
let colIdx = 0;
|
|
43
70
|
for (const cell of rows[rowIdx]) {
|
|
44
|
-
while (colIdx < MAX_COLS && tempOccupied
|
|
71
|
+
while (colIdx < MAX_COLS && tempOccupied[rowIdx][colIdx]) colIdx++;
|
|
45
72
|
if (colIdx >= MAX_COLS) break;
|
|
46
73
|
for (let r = rowIdx; r < Math.min(rowIdx + cell.rowSpan, numRows); r++) {
|
|
47
74
|
for (let c = colIdx; c < Math.min(colIdx + cell.colSpan, MAX_COLS); c++) {
|
|
48
|
-
tempOccupied
|
|
75
|
+
tempOccupied[r][c] = true;
|
|
49
76
|
}
|
|
50
77
|
}
|
|
51
78
|
colIdx += cell.colSpan;
|
|
52
79
|
if (colIdx > maxCols) maxCols = colIdx;
|
|
53
80
|
}
|
|
54
81
|
}
|
|
55
|
-
tempOccupied.clear();
|
|
56
82
|
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
57
83
|
const grid = Array.from(
|
|
58
84
|
{ length: numRows },
|
|
@@ -80,6 +106,50 @@ function buildTable(rows) {
|
|
|
80
106
|
cellIdx++;
|
|
81
107
|
}
|
|
82
108
|
}
|
|
109
|
+
return trimAndReturn(grid, numRows, maxCols);
|
|
110
|
+
}
|
|
111
|
+
function buildTableDirect(rows, numRows) {
|
|
112
|
+
let maxCols = 0;
|
|
113
|
+
for (const row of rows) {
|
|
114
|
+
for (const cell of row) {
|
|
115
|
+
const end = (cell.colAddr ?? 0) + cell.colSpan;
|
|
116
|
+
if (end > maxCols) maxCols = end;
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
120
|
+
const grid = Array.from(
|
|
121
|
+
{ length: numRows },
|
|
122
|
+
() => Array.from({ length: maxCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
|
|
123
|
+
);
|
|
124
|
+
for (const row of rows) {
|
|
125
|
+
for (const cell of row) {
|
|
126
|
+
const r = cell.rowAddr ?? 0;
|
|
127
|
+
const c = cell.colAddr ?? 0;
|
|
128
|
+
if (r >= numRows || c >= maxCols) continue;
|
|
129
|
+
grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
|
|
130
|
+
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
131
|
+
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
132
|
+
if (dr === 0 && dc === 0) continue;
|
|
133
|
+
if (r + dr < numRows && c + dc < maxCols) {
|
|
134
|
+
grid[r + dr][c + dc] = { text: "", colSpan: 1, rowSpan: 1 };
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
return trimAndReturn(grid, numRows, maxCols);
|
|
141
|
+
}
|
|
142
|
+
function trimAndReturn(grid, numRows, maxCols) {
|
|
143
|
+
let effectiveCols = maxCols;
|
|
144
|
+
while (effectiveCols > 0) {
|
|
145
|
+
const colEmpty = grid.every((row) => !row[effectiveCols - 1]?.text?.trim());
|
|
146
|
+
if (!colEmpty) break;
|
|
147
|
+
effectiveCols--;
|
|
148
|
+
}
|
|
149
|
+
if (effectiveCols < maxCols && effectiveCols > 0) {
|
|
150
|
+
const trimmed = grid.map((row) => row.slice(0, effectiveCols));
|
|
151
|
+
return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
|
|
152
|
+
}
|
|
83
153
|
return { rows: numRows, cols: maxCols, cells: grid, hasHeader: numRows > 1 };
|
|
84
154
|
}
|
|
85
155
|
function convertTableToText(rows) {
|
|
@@ -87,13 +157,26 @@ function convertTableToText(rows) {
|
|
|
87
157
|
(row) => row.map((c) => c.text.trim().replace(/\n/g, " ")).filter(Boolean).join(" | ")
|
|
88
158
|
).filter(Boolean).join("\n");
|
|
89
159
|
}
|
|
160
|
+
var HWP_SHAPE_ALT_TEXT_RE = /(?:모서리가 둥근 |둥근 )?(?:사각형|직사각형|정사각형|원|타원|삼각형|이등변 삼각형|직각 삼각형|선|직선|곡선|화살표|굵은 화살표|이중 화살표|오각형|육각형|팔각형|별|[4-8]점별|십자|십자형|구름|구름형|마름모|도넛|평행사변형|사다리꼴|부채꼴|호|반원|물결|번개|하트|빗금|블록 화살표|수식|표|그림|개체|그리기\s?개체|묶음\s?개체|글상자|수식\s?개체|OLE\s?개체)\s?입니다\.?/g;
|
|
161
|
+
function sanitizeText(text) {
|
|
162
|
+
let result = text.replace(/[\u{F0000}-\u{FFFFD}]/gu, "").replace(HWP_SHAPE_ALT_TEXT_RE, "").replace(/ +/g, " ").trim();
|
|
163
|
+
if (result.length <= 30 && result.includes(" ")) {
|
|
164
|
+
const tokens = result.split(" ");
|
|
165
|
+
const koreanSingleCharCount = tokens.filter((t) => t.length === 1 && /[\uAC00-\uD7AF\u3131-\u318E]/.test(t)).length;
|
|
166
|
+
if (tokens.length >= 3 && koreanSingleCharCount / tokens.length >= 0.7) {
|
|
167
|
+
result = tokens.join("");
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
return result;
|
|
171
|
+
}
|
|
90
172
|
function blocksToMarkdown(blocks) {
|
|
91
173
|
const lines = [];
|
|
92
174
|
for (let i = 0; i < blocks.length; i++) {
|
|
93
175
|
const block = blocks[i];
|
|
94
176
|
if (block.type === "heading" && block.text) {
|
|
95
177
|
const prefix = "#".repeat(Math.min(block.level || 2, 6));
|
|
96
|
-
|
|
178
|
+
const headingText = sanitizeText(block.text);
|
|
179
|
+
if (headingText) lines.push("", `${prefix} ${headingText}`, "");
|
|
97
180
|
continue;
|
|
98
181
|
}
|
|
99
182
|
if (block.type === "image" && block.text) {
|
|
@@ -105,9 +188,11 @@ function blocksToMarkdown(blocks) {
|
|
|
105
188
|
continue;
|
|
106
189
|
}
|
|
107
190
|
if (block.type === "list" && block.text) {
|
|
108
|
-
const
|
|
191
|
+
const listText = sanitizeText(block.text);
|
|
192
|
+
if (!listText) continue;
|
|
193
|
+
const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(listText);
|
|
109
194
|
const prefix = alreadyNumbered ? "" : block.listType === "ordered" ? "1. " : "- ";
|
|
110
|
-
lines.push(`${prefix}${
|
|
195
|
+
lines.push(`${prefix}${listText}`);
|
|
111
196
|
if (block.children) {
|
|
112
197
|
for (const child of block.children) {
|
|
113
198
|
const childPrefix = child.listType === "ordered" ? "1." : "-";
|
|
@@ -117,7 +202,8 @@ function blocksToMarkdown(blocks) {
|
|
|
117
202
|
continue;
|
|
118
203
|
}
|
|
119
204
|
if (block.type === "paragraph" && block.text) {
|
|
120
|
-
let text = block.text;
|
|
205
|
+
let text = sanitizeText(block.text);
|
|
206
|
+
if (!text) continue;
|
|
121
207
|
if (/^\[별표\s*\d+/.test(text)) {
|
|
122
208
|
const nextBlock = blocks[i + 1];
|
|
123
209
|
if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
|
|
@@ -133,7 +219,7 @@ function blocksToMarkdown(blocks) {
|
|
|
133
219
|
continue;
|
|
134
220
|
}
|
|
135
221
|
if (block.href) {
|
|
136
|
-
const href =
|
|
222
|
+
const href = sanitizeHref2(block.href);
|
|
137
223
|
if (href) text = `[${text}](${href})`;
|
|
138
224
|
}
|
|
139
225
|
if (block.footnoteText) {
|
|
@@ -154,7 +240,7 @@ function tableToMarkdown(table) {
|
|
|
154
240
|
if (table.rows === 0 || table.cols === 0) return "";
|
|
155
241
|
const { cells, rows: numRows, cols: numCols } = table;
|
|
156
242
|
if (numRows === 1 && numCols === 1) {
|
|
157
|
-
const content = cells[0][0].text;
|
|
243
|
+
const content = sanitizeText(cells[0][0].text);
|
|
158
244
|
return content.split(/\n/).map((line) => {
|
|
159
245
|
const trimmed = line.trim();
|
|
160
246
|
if (!trimmed) return "";
|
|
@@ -163,13 +249,17 @@ function tableToMarkdown(table) {
|
|
|
163
249
|
return trimmed;
|
|
164
250
|
}).filter(Boolean).join("\n");
|
|
165
251
|
}
|
|
252
|
+
if (numCols === 1 && numRows >= 2) {
|
|
253
|
+
return cells.map((row) => sanitizeText(row[0].text).replace(/\n/g, " ")).filter(Boolean).join("\n");
|
|
254
|
+
}
|
|
166
255
|
const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
|
|
167
256
|
const skip = /* @__PURE__ */ new Set();
|
|
168
257
|
for (let r = 0; r < numRows; r++) {
|
|
169
258
|
for (let c = 0; c < numCols; c++) {
|
|
170
259
|
if (skip.has(`${r},${c}`)) continue;
|
|
171
|
-
const cell = cells[r][c];
|
|
172
|
-
|
|
260
|
+
const cell = cells[r]?.[c];
|
|
261
|
+
if (!cell) continue;
|
|
262
|
+
display[r][c] = sanitizeText(cell.text).replace(/\n/g, "<br>");
|
|
173
263
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
174
264
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
175
265
|
if (dr === 0 && dc === 0) continue;
|
|
@@ -178,12 +268,28 @@ function tableToMarkdown(table) {
|
|
|
178
268
|
}
|
|
179
269
|
}
|
|
180
270
|
}
|
|
271
|
+
c += cell.colSpan - 1;
|
|
181
272
|
}
|
|
182
273
|
}
|
|
183
274
|
const uniqueRows = [];
|
|
184
|
-
|
|
275
|
+
let pendingFirstCol = "";
|
|
276
|
+
for (let r = 0; r < display.length; r++) {
|
|
277
|
+
const row = display[r];
|
|
185
278
|
const isEmptyPlaceholder = row.every((cell) => cell === "");
|
|
186
|
-
if (
|
|
279
|
+
if (isEmptyPlaceholder) continue;
|
|
280
|
+
const hasSkippedCols = row.some((cell, c) => cell === "" && skip.has(`${r},${c}`));
|
|
281
|
+
const nonEmptyCols = row.filter((cell) => cell !== "");
|
|
282
|
+
if (!hasSkippedCols && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
|
|
283
|
+
pendingFirstCol = row[0];
|
|
284
|
+
continue;
|
|
285
|
+
}
|
|
286
|
+
if (pendingFirstCol && row[0] === "") {
|
|
287
|
+
row[0] = pendingFirstCol;
|
|
288
|
+
pendingFirstCol = "";
|
|
289
|
+
} else {
|
|
290
|
+
pendingFirstCol = "";
|
|
291
|
+
}
|
|
292
|
+
uniqueRows.push(row);
|
|
187
293
|
}
|
|
188
294
|
if (uniqueRows.length === 0) return "";
|
|
189
295
|
const md = [];
|
|
@@ -195,75 +301,15 @@ function tableToMarkdown(table) {
|
|
|
195
301
|
return md.join("\n");
|
|
196
302
|
}
|
|
197
303
|
|
|
198
|
-
// src/utils.ts
|
|
199
|
-
var VERSION = true ? "1.7.2" : "0.0.0-dev";
|
|
200
|
-
function toArrayBuffer(buf) {
|
|
201
|
-
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
202
|
-
return buf.buffer;
|
|
203
|
-
}
|
|
204
|
-
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
205
|
-
}
|
|
206
|
-
var KordocError = class extends Error {
|
|
207
|
-
constructor(message) {
|
|
208
|
-
super(message);
|
|
209
|
-
this.name = "KordocError";
|
|
210
|
-
}
|
|
211
|
-
};
|
|
212
|
-
function sanitizeError(err) {
|
|
213
|
-
if (err instanceof KordocError) return err.message;
|
|
214
|
-
return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
|
|
215
|
-
}
|
|
216
|
-
function isPathTraversal(name) {
|
|
217
|
-
if (name.includes("\0")) return true;
|
|
218
|
-
const normalized = name.replace(/\\/g, "/");
|
|
219
|
-
return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
220
|
-
}
|
|
221
|
-
function classifyError(err) {
|
|
222
|
-
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
223
|
-
const msg = err.message;
|
|
224
|
-
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
225
|
-
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
226
|
-
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
227
|
-
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
228
|
-
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
229
|
-
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
230
|
-
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
231
|
-
return "PARSE_ERROR";
|
|
232
|
-
}
|
|
233
|
-
|
|
234
304
|
// src/hwpx/parser.ts
|
|
235
|
-
import
|
|
305
|
+
import JSZip2 from "jszip";
|
|
236
306
|
import { inflateRawSync } from "zlib";
|
|
237
307
|
import { DOMParser } from "@xmldom/xmldom";
|
|
238
308
|
|
|
239
|
-
// src/
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
if (Array.isArray(spec)) {
|
|
244
|
-
for (const n of spec) {
|
|
245
|
-
const page = Math.round(n);
|
|
246
|
-
if (page >= 1 && page <= maxPages) result.add(page);
|
|
247
|
-
}
|
|
248
|
-
return result;
|
|
249
|
-
}
|
|
250
|
-
if (typeof spec !== "string" || spec.trim() === "") return result;
|
|
251
|
-
const parts = spec.split(",");
|
|
252
|
-
for (const part of parts) {
|
|
253
|
-
const trimmed = part.trim();
|
|
254
|
-
if (!trimmed) continue;
|
|
255
|
-
const rangeMatch = trimmed.match(/^(\d+)\s*-\s*(\d+)$/);
|
|
256
|
-
if (rangeMatch) {
|
|
257
|
-
const start = Math.max(1, parseInt(rangeMatch[1], 10));
|
|
258
|
-
const end = Math.min(maxPages, parseInt(rangeMatch[2], 10));
|
|
259
|
-
for (let i = start; i <= end; i++) result.add(i);
|
|
260
|
-
} else {
|
|
261
|
-
const page = parseInt(trimmed, 10);
|
|
262
|
-
if (!isNaN(page) && page >= 1 && page <= maxPages) result.add(page);
|
|
263
|
-
}
|
|
264
|
-
}
|
|
265
|
-
return result;
|
|
266
|
-
}
|
|
309
|
+
// src/types.ts
|
|
310
|
+
var HEADING_RATIO_H1 = 1.5;
|
|
311
|
+
var HEADING_RATIO_H2 = 1.3;
|
|
312
|
+
var HEADING_RATIO_H3 = 1.15;
|
|
267
313
|
|
|
268
314
|
// src/hwpx/parser.ts
|
|
269
315
|
var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
|
|
@@ -357,16 +403,10 @@ function stripDtd(xml) {
|
|
|
357
403
|
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
358
404
|
}
|
|
359
405
|
async function parseHwpxDocument(buffer, options) {
|
|
360
|
-
|
|
361
|
-
if (precheck.totalUncompressed > MAX_DECOMPRESS_SIZE) {
|
|
362
|
-
throw new KordocError("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
363
|
-
}
|
|
364
|
-
if (precheck.entryCount > MAX_ZIP_ENTRIES) {
|
|
365
|
-
throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
366
|
-
}
|
|
406
|
+
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
367
407
|
let zip;
|
|
368
408
|
try {
|
|
369
|
-
zip = await
|
|
409
|
+
zip = await JSZip2.loadAsync(buffer);
|
|
370
410
|
} catch {
|
|
371
411
|
return extractFromBrokenZip(buffer);
|
|
372
412
|
}
|
|
@@ -529,7 +569,7 @@ function parseDublinCoreMetadata(xml, metadata) {
|
|
|
529
569
|
async function extractHwpxMetadataOnly(buffer) {
|
|
530
570
|
let zip;
|
|
531
571
|
try {
|
|
532
|
-
zip = await
|
|
572
|
+
zip = await JSZip2.loadAsync(buffer);
|
|
533
573
|
} catch {
|
|
534
574
|
throw new KordocError("HWPX ZIP\uC744 \uC5F4 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
535
575
|
}
|
|
@@ -539,46 +579,17 @@ async function extractHwpxMetadataOnly(buffer) {
|
|
|
539
579
|
metadata.pageCount = sectionPaths.length;
|
|
540
580
|
return metadata;
|
|
541
581
|
}
|
|
542
|
-
function precheckZipSize(buffer) {
|
|
543
|
-
try {
|
|
544
|
-
const data = new DataView(buffer);
|
|
545
|
-
const len = buffer.byteLength;
|
|
546
|
-
if (len < 22) return { totalUncompressed: 0, entryCount: 0 };
|
|
547
|
-
const searchStart = Math.max(0, len - 22 - 65535);
|
|
548
|
-
let eocdOffset = -1;
|
|
549
|
-
for (let i = len - 22; i >= searchStart; i--) {
|
|
550
|
-
if (data.getUint32(i, true) === 101010256) {
|
|
551
|
-
eocdOffset = i;
|
|
552
|
-
break;
|
|
553
|
-
}
|
|
554
|
-
}
|
|
555
|
-
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
556
|
-
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
557
|
-
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
558
|
-
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
559
|
-
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
560
|
-
let totalUncompressed = 0;
|
|
561
|
-
let pos = cdOffset;
|
|
562
|
-
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
563
|
-
if (data.getUint32(pos, true) !== 33639248) break;
|
|
564
|
-
totalUncompressed += data.getUint32(pos + 24, true);
|
|
565
|
-
const nameLen = data.getUint16(pos + 28, true);
|
|
566
|
-
const extraLen = data.getUint16(pos + 30, true);
|
|
567
|
-
const commentLen = data.getUint16(pos + 32, true);
|
|
568
|
-
pos += 46 + nameLen + extraLen + commentLen;
|
|
569
|
-
}
|
|
570
|
-
return { totalUncompressed, entryCount };
|
|
571
|
-
} catch {
|
|
572
|
-
return { totalUncompressed: 0, entryCount: 0 };
|
|
573
|
-
}
|
|
574
|
-
}
|
|
575
582
|
function extractFromBrokenZip(buffer) {
|
|
576
583
|
const data = new Uint8Array(buffer);
|
|
577
584
|
const view = new DataView(buffer);
|
|
578
585
|
let pos = 0;
|
|
579
586
|
const blocks = [];
|
|
587
|
+
const warnings = [
|
|
588
|
+
{ code: "BROKEN_ZIP_RECOVERY", message: "\uC190\uC0C1\uB41C ZIP \uAD6C\uC870 \u2014 Local File Header \uAE30\uBC18 \uBCF5\uAD6C \uBAA8\uB4DC" }
|
|
589
|
+
];
|
|
580
590
|
let totalDecompressed = 0;
|
|
581
591
|
let entryCount = 0;
|
|
592
|
+
let sectionNum = 0;
|
|
582
593
|
while (pos < data.length - 30) {
|
|
583
594
|
if (data[pos] !== 80 || data[pos + 1] !== 75 || data[pos + 2] !== 3 || data[pos + 3] !== 4) {
|
|
584
595
|
pos++;
|
|
@@ -624,14 +635,15 @@ function extractFromBrokenZip(buffer) {
|
|
|
624
635
|
}
|
|
625
636
|
totalDecompressed += content.length * 2;
|
|
626
637
|
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
|
|
627
|
-
|
|
638
|
+
sectionNum++;
|
|
639
|
+
blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum));
|
|
628
640
|
} catch {
|
|
629
641
|
continue;
|
|
630
642
|
}
|
|
631
643
|
}
|
|
632
644
|
if (blocks.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
633
645
|
const markdown = blocksToMarkdown(blocks);
|
|
634
|
-
return { markdown, blocks };
|
|
646
|
+
return { markdown, blocks, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
635
647
|
}
|
|
636
648
|
async function resolveSectionPaths(zip) {
|
|
637
649
|
const manifestPaths = ["Contents/content.hpf", "content.hpf"];
|
|
@@ -695,9 +707,9 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
695
707
|
let level = 0;
|
|
696
708
|
if (baseFontSize > 0 && block.style?.fontSize) {
|
|
697
709
|
const ratio = block.style.fontSize / baseFontSize;
|
|
698
|
-
if (ratio >=
|
|
699
|
-
else if (ratio >=
|
|
700
|
-
else if (ratio >=
|
|
710
|
+
if (ratio >= HEADING_RATIO_H1) level = 1;
|
|
711
|
+
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
712
|
+
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
701
713
|
}
|
|
702
714
|
if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
|
|
703
715
|
if (level === 0) level = 3;
|
|
@@ -783,6 +795,14 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
783
795
|
}
|
|
784
796
|
}
|
|
785
797
|
break;
|
|
798
|
+
case "cellAddr":
|
|
799
|
+
if (tableCtx?.cell) {
|
|
800
|
+
const ca = parseInt(el.getAttribute("colAddr") || "", 10);
|
|
801
|
+
const ra = parseInt(el.getAttribute("rowAddr") || "", 10);
|
|
802
|
+
if (!isNaN(ca)) tableCtx.cell.colAddr = ca;
|
|
803
|
+
if (!isNaN(ra)) tableCtx.cell.rowAddr = ra;
|
|
804
|
+
}
|
|
805
|
+
break;
|
|
786
806
|
case "cellSpan":
|
|
787
807
|
if (tableCtx?.cell) {
|
|
788
808
|
const cs = parseInt(el.getAttribute("colSpan") || "1", 10);
|
|
@@ -829,39 +849,47 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
829
849
|
if (depth > MAX_XML_DEPTH) return tableCtx;
|
|
830
850
|
const children = node.childNodes;
|
|
831
851
|
if (!children) return tableCtx;
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
if (
|
|
839
|
-
const
|
|
840
|
-
|
|
841
|
-
if (
|
|
842
|
-
if (tableStack.
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
852
|
+
const walkChildren = (parent, d) => {
|
|
853
|
+
if (d > MAX_XML_DEPTH) return;
|
|
854
|
+
const kids = parent.childNodes;
|
|
855
|
+
if (!kids) return;
|
|
856
|
+
for (let i = 0; i < kids.length; i++) {
|
|
857
|
+
const el = kids[i];
|
|
858
|
+
if (el.nodeType !== 1) continue;
|
|
859
|
+
const tag = el.tagName || el.localName || "";
|
|
860
|
+
const localTag = tag.replace(/^[^:]+:/, "");
|
|
861
|
+
if (localTag === "tbl") {
|
|
862
|
+
if (tableCtx) tableStack.push(tableCtx);
|
|
863
|
+
const newTable = { rows: [], currentRow: [], cell: null };
|
|
864
|
+
walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum, d + 1);
|
|
865
|
+
if (newTable.rows.length > 0) {
|
|
866
|
+
if (tableStack.length > 0) {
|
|
867
|
+
const parentTable = tableStack.pop();
|
|
868
|
+
const nestedText = convertTableToText(newTable.rows);
|
|
869
|
+
if (parentTable.cell) {
|
|
870
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
871
|
+
}
|
|
872
|
+
tableCtx = parentTable;
|
|
873
|
+
} else {
|
|
874
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
875
|
+
tableCtx = null;
|
|
847
876
|
}
|
|
848
|
-
tableCtx = parentTable;
|
|
849
877
|
} else {
|
|
850
|
-
|
|
851
|
-
tableCtx = null;
|
|
878
|
+
tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
|
|
852
879
|
}
|
|
853
|
-
} else {
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
} else if (
|
|
861
|
-
|
|
880
|
+
} else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
|
|
881
|
+
const imgRef = extractImageRef(el);
|
|
882
|
+
if (imgRef) {
|
|
883
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
884
|
+
} else if (warnings && sectionNum) {
|
|
885
|
+
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
886
|
+
}
|
|
887
|
+
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl") {
|
|
888
|
+
walkChildren(el, d + 1);
|
|
862
889
|
}
|
|
863
890
|
}
|
|
864
|
-
}
|
|
891
|
+
};
|
|
892
|
+
walkChildren(node, depth);
|
|
865
893
|
return tableCtx;
|
|
866
894
|
}
|
|
867
895
|
function extractParagraphInfo(para, styleMap) {
|
|
@@ -900,7 +928,10 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
900
928
|
// 하이퍼링크
|
|
901
929
|
case "hyperlink": {
|
|
902
930
|
const url = child.getAttribute("url") || child.getAttribute("href") || "";
|
|
903
|
-
if (url)
|
|
931
|
+
if (url) {
|
|
932
|
+
const safe = sanitizeHref(url);
|
|
933
|
+
if (safe) href = safe;
|
|
934
|
+
}
|
|
904
935
|
walk(child);
|
|
905
936
|
break;
|
|
906
937
|
}
|
|
@@ -913,6 +944,29 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
913
944
|
if (noteText) footnote = (footnote ? footnote + "; " : "") + noteText;
|
|
914
945
|
break;
|
|
915
946
|
}
|
|
947
|
+
// 제어 요소 — 필드, 컨트롤, 매개변수 등 스킵
|
|
948
|
+
case "ctrl":
|
|
949
|
+
case "fieldBegin":
|
|
950
|
+
case "fieldEnd":
|
|
951
|
+
case "parameters":
|
|
952
|
+
case "stringParam":
|
|
953
|
+
case "integerParam":
|
|
954
|
+
case "boolParam":
|
|
955
|
+
case "floatParam":
|
|
956
|
+
case "secPr":
|
|
957
|
+
// 섹션 속성 (페이지 설정 등)
|
|
958
|
+
case "colPr":
|
|
959
|
+
// 다단 속성
|
|
960
|
+
case "linesegarray":
|
|
961
|
+
case "lineseg":
|
|
962
|
+
// 레이아웃 정보
|
|
963
|
+
// 도형/이미지 요소 — 대체텍스트("사각형입니다." 등) 누출 방지
|
|
964
|
+
case "pic":
|
|
965
|
+
case "shape":
|
|
966
|
+
case "drawingObject":
|
|
967
|
+
case "shapeComment":
|
|
968
|
+
case "drawText":
|
|
969
|
+
break;
|
|
916
970
|
// run 요소에서 charPrIDRef 추출
|
|
917
971
|
case "r": {
|
|
918
972
|
const runCharPr = child.getAttribute("charPrIDRef");
|
|
@@ -927,7 +981,10 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
927
981
|
}
|
|
928
982
|
};
|
|
929
983
|
walk(para);
|
|
930
|
-
|
|
984
|
+
let cleanText = text.replace(/[ \t]+/g, " ").trim();
|
|
985
|
+
if (/^그림입니다\.?\s*원본\s*그림의\s*(이름|크기)/.test(cleanText)) cleanText = "";
|
|
986
|
+
cleanText = cleanText.replace(/그림입니다\.?\s*원본\s*그림의\s*(이름|크기)[^\n]*(\n[^\n]*원본\s*그림의\s*(이름|크기)[^\n]*)*/g, "").trim();
|
|
987
|
+
cleanText = cleanText.replace(/(?:모서리가 둥근 |둥근 )?(?:사각형|직사각형|정사각형|원|타원|삼각형|선|직선|곡선|화살표|오각형|육각형|팔각형|별|십자|구름|마름모|도넛|평행사변형|사다리꼴|개체|그리기\s?개체|묶음\s?개체|글상자|수식|표|그림|OLE\s?개체)\s?입니다\.?/g, "").trim();
|
|
931
988
|
let style;
|
|
932
989
|
if (styleMap && charPrId) {
|
|
933
990
|
const charProp = styleMap.charProperties.get(charPrId);
|
|
@@ -1205,9 +1262,9 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
1205
1262
|
if (/^\d+$/.test(text)) continue;
|
|
1206
1263
|
const ratio = block.style.fontSize / baseFontSize;
|
|
1207
1264
|
let level = 0;
|
|
1208
|
-
if (ratio >=
|
|
1209
|
-
else if (ratio >=
|
|
1210
|
-
else if (ratio >=
|
|
1265
|
+
if (ratio >= HEADING_RATIO_H1) level = 1;
|
|
1266
|
+
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
1267
|
+
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
1211
1268
|
if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
|
|
1212
1269
|
if (level === 0) level = 3;
|
|
1213
1270
|
}
|
|
@@ -1308,20 +1365,22 @@ function detectImageMime(data) {
|
|
|
1308
1365
|
}
|
|
1309
1366
|
function extractHwp5Images(cfb, blocks, compressed, warnings) {
|
|
1310
1367
|
const binDataMap = /* @__PURE__ */ new Map();
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
if (
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1368
|
+
const binDataRe = /\/BinData\/[Bb][Ii][Nn](\d{4})$/;
|
|
1369
|
+
if (cfb.FileIndex) {
|
|
1370
|
+
for (const entry of cfb.FileIndex) {
|
|
1371
|
+
if (!entry?.name || !entry.content) continue;
|
|
1372
|
+
const match = entry.name.match(binDataRe);
|
|
1373
|
+
if (!match) continue;
|
|
1374
|
+
const idx = parseInt(match[1], 10);
|
|
1375
|
+
let data = Buffer.from(entry.content);
|
|
1376
|
+
if (compressed) {
|
|
1377
|
+
try {
|
|
1378
|
+
data = decompressStream(data);
|
|
1379
|
+
} catch {
|
|
1380
|
+
}
|
|
1322
1381
|
}
|
|
1382
|
+
binDataMap.set(idx, { data, name: entry.name });
|
|
1323
1383
|
}
|
|
1324
|
-
binDataMap.set(idx, { data, name: entry.name || `BIN${idx}` });
|
|
1325
1384
|
}
|
|
1326
1385
|
if (binDataMap.size === 0) return [];
|
|
1327
1386
|
const images = [];
|
|
@@ -1468,6 +1527,16 @@ function parseTableBlock(records, startIdx) {
|
|
|
1468
1527
|
i++;
|
|
1469
1528
|
}
|
|
1470
1529
|
if (rows === 0 || cols === 0 || cells.length === 0) return { table: null, nextIdx: i };
|
|
1530
|
+
const hasAddr = cells.some((c) => c.colAddr !== void 0 && c.rowAddr !== void 0);
|
|
1531
|
+
if (hasAddr) {
|
|
1532
|
+
const cellRows2 = arrangeCells(rows, cols, cells);
|
|
1533
|
+
const irCells = cellRows2.map((row) => row.map((c) => ({
|
|
1534
|
+
text: c.text.trim(),
|
|
1535
|
+
colSpan: c.colSpan,
|
|
1536
|
+
rowSpan: c.rowSpan
|
|
1537
|
+
})));
|
|
1538
|
+
return { table: { rows, cols, cells: irCells, hasHeader: rows > 1 }, nextIdx: i };
|
|
1539
|
+
}
|
|
1471
1540
|
const cellRows = arrangeCells(rows, cols, cells);
|
|
1472
1541
|
return { table: buildTable(cellRows), nextIdx: i };
|
|
1473
1542
|
}
|
|
@@ -1731,7 +1800,36 @@ function buildTableGrids(horizontals, verticals) {
|
|
|
1731
1800
|
};
|
|
1732
1801
|
grids.push({ rowYs, colXs, bbox });
|
|
1733
1802
|
}
|
|
1734
|
-
return grids;
|
|
1803
|
+
return mergeAdjacentGrids(grids);
|
|
1804
|
+
}
|
|
1805
|
+
function mergeAdjacentGrids(grids) {
|
|
1806
|
+
if (grids.length <= 1) return grids;
|
|
1807
|
+
const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
1808
|
+
const merged = [sorted[0]];
|
|
1809
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
1810
|
+
const prev = merged[merged.length - 1];
|
|
1811
|
+
const curr = sorted[i];
|
|
1812
|
+
if (prev.colXs.length === curr.colXs.length) {
|
|
1813
|
+
const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= COORD_MERGE_TOL * 3);
|
|
1814
|
+
const verticalGap = prev.bbox.y1 - curr.bbox.y2;
|
|
1815
|
+
if (colMatch && verticalGap >= -COORD_MERGE_TOL && verticalGap <= 20) {
|
|
1816
|
+
const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
|
|
1817
|
+
merged[merged.length - 1] = {
|
|
1818
|
+
rowYs: allRowYs,
|
|
1819
|
+
colXs: prev.colXs,
|
|
1820
|
+
bbox: {
|
|
1821
|
+
x1: Math.min(prev.bbox.x1, curr.bbox.x1),
|
|
1822
|
+
y1: Math.min(prev.bbox.y1, curr.bbox.y1),
|
|
1823
|
+
x2: Math.max(prev.bbox.x2, curr.bbox.x2),
|
|
1824
|
+
y2: Math.max(prev.bbox.y2, curr.bbox.y2)
|
|
1825
|
+
}
|
|
1826
|
+
};
|
|
1827
|
+
continue;
|
|
1828
|
+
}
|
|
1829
|
+
}
|
|
1830
|
+
merged.push(curr);
|
|
1831
|
+
}
|
|
1832
|
+
return merged;
|
|
1735
1833
|
}
|
|
1736
1834
|
function clusterCoordinates(values) {
|
|
1737
1835
|
if (values.length === 0) return [];
|
|
@@ -1918,7 +2016,11 @@ function cellTextToString(items) {
|
|
|
1918
2016
|
for (let j = 1; j < s.length; j++) {
|
|
1919
2017
|
const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
|
|
1920
2018
|
const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
|
|
1921
|
-
|
|
2019
|
+
const prevIsKorean = /[가-힣]$/.test(result);
|
|
2020
|
+
const currIsKorean = /^[가-힣]/.test(s[j].text);
|
|
2021
|
+
if (gap < avgFs * 0.15) {
|
|
2022
|
+
result += s[j].text;
|
|
2023
|
+
} else if (gap < avgFs * 0.35 && (prevIsKorean || currIsKorean)) {
|
|
1922
2024
|
result += s[j].text;
|
|
1923
2025
|
} else {
|
|
1924
2026
|
result += " " + s[j].text;
|
|
@@ -1933,6 +2035,12 @@ function cellTextToString(items) {
|
|
|
1933
2035
|
const curr = textLines[i];
|
|
1934
2036
|
if (/[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <= 8 && !curr.includes(" ")) {
|
|
1935
2037
|
merged[merged.length - 1] = prev + curr;
|
|
2038
|
+
} else if (curr.trim().length <= 3 && /^[)\]%}]/.test(curr.trim())) {
|
|
2039
|
+
merged[merged.length - 1] = prev + curr.trim();
|
|
2040
|
+
} else if (/[,(]$/.test(prev.trim()) && curr.trim().length <= 15) {
|
|
2041
|
+
merged[merged.length - 1] = prev + curr.trim();
|
|
2042
|
+
} else if (/[\d,]$/.test(prev) && /^[\d,]+[)\]]?$/.test(curr.trim()) && curr.trim().length <= 10) {
|
|
2043
|
+
merged[merged.length - 1] = prev + curr.trim();
|
|
1936
2044
|
} else {
|
|
1937
2045
|
merged.push(curr);
|
|
1938
2046
|
}
|
|
@@ -2145,21 +2253,26 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
2145
2253
|
disableFontFace: true,
|
|
2146
2254
|
isEvalSupported: false
|
|
2147
2255
|
});
|
|
2148
|
-
|
|
2149
|
-
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
|
|
2153
|
-
|
|
2154
|
-
|
|
2155
|
-
|
|
2156
|
-
|
|
2256
|
+
let timer;
|
|
2257
|
+
try {
|
|
2258
|
+
return await Promise.race([
|
|
2259
|
+
loadingTask.promise,
|
|
2260
|
+
new Promise((_, reject) => {
|
|
2261
|
+
timer = setTimeout(() => {
|
|
2262
|
+
loadingTask.destroy();
|
|
2263
|
+
reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
|
|
2264
|
+
}, PDF_LOAD_TIMEOUT_MS);
|
|
2265
|
+
})
|
|
2266
|
+
]);
|
|
2267
|
+
} finally {
|
|
2268
|
+
if (timer !== void 0) clearTimeout(timer);
|
|
2269
|
+
}
|
|
2157
2270
|
}
|
|
2158
2271
|
async function parsePdfDocument(buffer, options) {
|
|
2159
2272
|
const doc = await loadPdfWithTimeout(buffer);
|
|
2160
2273
|
try {
|
|
2161
2274
|
const pageCount = doc.numPages;
|
|
2162
|
-
if (pageCount === 0)
|
|
2275
|
+
if (pageCount === 0) throw new KordocError("PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
|
|
2163
2276
|
const metadata = { pageCount };
|
|
2164
2277
|
await extractPdfMetadata(doc, metadata);
|
|
2165
2278
|
const blocks = [];
|
|
@@ -2212,14 +2325,14 @@ async function parsePdfDocument(buffer, options) {
|
|
|
2212
2325
|
const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
|
|
2213
2326
|
if (ocrBlocks.length > 0) {
|
|
2214
2327
|
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
2215
|
-
return {
|
|
2328
|
+
return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true };
|
|
2216
2329
|
}
|
|
2217
2330
|
} catch {
|
|
2218
2331
|
}
|
|
2219
2332
|
}
|
|
2220
|
-
|
|
2333
|
+
throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
|
|
2221
2334
|
}
|
|
2222
|
-
if (options?.removeHeaderFooter && parsedPageCount >= 3) {
|
|
2335
|
+
if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
|
|
2223
2336
|
const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
|
|
2224
2337
|
for (let ri = removed.length - 1; ri >= 0; ri--) {
|
|
2225
2338
|
blocks.splice(removed[ri], 1);
|
|
@@ -2229,9 +2342,10 @@ async function parsePdfDocument(buffer, options) {
|
|
|
2229
2342
|
if (medianFontSize > 0) {
|
|
2230
2343
|
detectHeadings(blocks, medianFontSize);
|
|
2231
2344
|
}
|
|
2345
|
+
detectMarkerHeadings(blocks);
|
|
2232
2346
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
2233
2347
|
let markdown = cleanPdfText(blocksToMarkdown(blocks));
|
|
2234
|
-
return {
|
|
2348
|
+
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
2235
2349
|
} finally {
|
|
2236
2350
|
await doc.destroy().catch(() => {
|
|
2237
2351
|
});
|
|
@@ -2302,12 +2416,67 @@ function detectHeadings(blocks, medianFontSize) {
|
|
|
2302
2416
|
if (/^\d+$/.test(text)) continue;
|
|
2303
2417
|
const ratio = block.style.fontSize / medianFontSize;
|
|
2304
2418
|
let level = 0;
|
|
2305
|
-
if (ratio >=
|
|
2306
|
-
else if (ratio >=
|
|
2307
|
-
else if (ratio >=
|
|
2419
|
+
if (ratio >= HEADING_RATIO_H1) level = 1;
|
|
2420
|
+
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
2421
|
+
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
2308
2422
|
if (level > 0) {
|
|
2309
2423
|
block.type = "heading";
|
|
2310
2424
|
block.level = level;
|
|
2425
|
+
block.text = collapseEvenSpacing(text);
|
|
2426
|
+
}
|
|
2427
|
+
}
|
|
2428
|
+
}
|
|
2429
|
+
/**
 * Removes the spaces from text that was letter-spaced (e.g. "제 1 장").
 *
 * The text is split on single spaces; when there are at least three tokens and
 * 70% or more of them are exactly one character long, all tokens are glued
 * together. Otherwise the text is returned untouched.
 *
 * @param {string} text
 * @returns {string}
 */
function collapseEvenSpacing(text) {
  const parts = text.split(" ");
  if (parts.length < 3) return text;
  let singles = 0;
  for (const part of parts) {
    if (part.length === 1) singles += 1;
  }
  return singles / parts.length >= 0.7 ? parts.join("") : text;
}
|
|
2437
|
+
/**
 * Heuristic deciding whether a detected table is really just framed text.
 *
 * Returns true when the combined cell text is at most 200 characters and one of:
 *  - the text contains a bullet-like marker and the table has at most 3 rows;
 *  - the table has at most 2 rows and more than half of its cells are empty;
 *  - the table has exactly 1 row and contains no run of two or more digits.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {boolean}
 */
function shouldDemoteTable(table) {
  const filled = [];
  for (const row of table.cells) {
    for (const cell of row) {
      const trimmed = cell.text.trim();
      if (trimmed) filled.push(trimmed);
    }
  }
  const joined = filled.join(" ");
  if (joined.length > 200) return false;

  const { rows, cols } = table;
  if (rows <= 3 && /[□■◆○●▶]/.test(joined)) return true;

  const cellCount = rows * cols;
  const blankCount = cellCount - filled.length;
  if (rows <= 2 && blankCount > cellCount * 0.5) return true;

  return rows === 1 && !/\d{2,}/.test(joined);
}
|
|
2448
|
+
/**
 * Flattens a table into plain text, one line per non-empty row.
 *
 * Cell texts are trimmed and empty cells dropped. In a two-column table a row
 * with both cells filled is rendered as "left : right"; every other row joins
 * its cells with a single space. Rows with no text are omitted.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {string} Lines joined with "\n" (empty string when nothing has text).
 */
function demoteTableToText(table) {
  const out = [];
  let r = 0;
  while (r < table.rows) {
    const texts = table.cells[r].map((c) => c.text.trim()).filter((t) => t !== "");
    r += 1;
    if (texts.length === 0) continue;
    const separator = table.cols === 2 && texts.length === 2 ? " : " : " ";
    out.push(texts.join(separator));
  }
  return out.join("\n");
}
|
|
2461
|
+
/**
 * Promotes paragraphs to headings based on textual markers (mutates `blocks` in place).
 *
 *  - A paragraph shorter than 50 characters that starts with one of □■◆◇▶
 *    followed by a Hangul syllable becomes a level-4 heading.
 *  - A paragraph made of 2 to 6 Hangul syllables that has a font size becomes a
 *    level-3 heading when its neighbour before it is missing, a table, a
 *    heading or a separator, or when its neighbour after it is missing, a
 *    table, a heading or a paragraph starting with one of □■◆○●.
 *
 * Blocks are processed in order, so a block promoted earlier counts as a
 * heading when its successor is examined.
 *
 * @param {Array<object>} blocks
 * @returns {void}
 */
function detectMarkerHeadings(blocks) {
  for (const [index, block] of blocks.entries()) {
    if (block.type !== "paragraph" || !block.text) continue;
    const text = block.text.trim();

    if (text.length < 50 && /^[□■◆◇▶]\s*[가-힣]/.test(text)) {
      block.type = "heading";
      block.level = 4;
      continue;
    }

    if (!(/^[가-힣]{2,6}$/.test(text) && block.style?.fontSize)) continue;

    const before = blocks[index - 1];
    const after = blocks[index + 1];
    const beforeIsStructural = !before || before.type === "table" || before.type === "heading" || before.type === "separator";
    const afterIsStructural = !after || after.type === "table" || after.type === "heading" || after.type === "paragraph" && after.text && /^[□■◆○●]/.test(after.text.trim());
    if (beforeIsStructural || afterIsStructural) {
      block.type = "heading";
      block.level = 3;
    }
  }
}
|
|
@@ -2344,7 +2513,7 @@ function computeRegion(items) {
|
|
|
2344
2513
|
}
|
|
2345
2514
|
return { items, minX, minY, maxX, maxY };
|
|
2346
2515
|
}
|
|
2347
|
-
function findYSplit(items,
|
|
2516
|
+
function findYSplit(items, _region, gapThreshold) {
|
|
2348
2517
|
const sorted = [...items].sort((a, b) => b.y - a.y);
|
|
2349
2518
|
let bestGap = gapThreshold;
|
|
2350
2519
|
let bestSplit = null;
|
|
@@ -2359,7 +2528,7 @@ function findYSplit(items, region, gapThreshold) {
|
|
|
2359
2528
|
}
|
|
2360
2529
|
return bestSplit;
|
|
2361
2530
|
}
|
|
2362
|
-
function findXSplit(items,
|
|
2531
|
+
function findXSplit(items, _region, gapThreshold) {
|
|
2363
2532
|
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
2364
2533
|
let bestGap = gapThreshold;
|
|
2365
2534
|
let bestSplit = null;
|
|
@@ -2418,7 +2587,8 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
2418
2587
|
);
|
|
2419
2588
|
for (const cell of cells) {
|
|
2420
2589
|
const cellItems = cellTextMap.get(cell) || [];
|
|
2421
|
-
|
|
2590
|
+
let text = cellTextToString(cellItems);
|
|
2591
|
+
text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
|
|
2422
2592
|
irGrid[cell.row][cell.col] = {
|
|
2423
2593
|
text,
|
|
2424
2594
|
colSpan: cell.colSpan,
|
|
@@ -2433,18 +2603,21 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
2433
2603
|
};
|
|
2434
2604
|
const hasContent = irGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
|
|
2435
2605
|
if (!hasContent) continue;
|
|
2436
|
-
|
|
2437
|
-
|
|
2438
|
-
|
|
2439
|
-
|
|
2440
|
-
|
|
2441
|
-
|
|
2442
|
-
|
|
2443
|
-
|
|
2444
|
-
|
|
2445
|
-
|
|
2606
|
+
const tableBbox = {
|
|
2607
|
+
page: pageNum,
|
|
2608
|
+
x: grid.bbox.x1,
|
|
2609
|
+
y: grid.bbox.y1,
|
|
2610
|
+
width: grid.bbox.x2 - grid.bbox.x1,
|
|
2611
|
+
height: grid.bbox.y2 - grid.bbox.y1
|
|
2612
|
+
};
|
|
2613
|
+
if (shouldDemoteTable(irTable)) {
|
|
2614
|
+
const demoted = demoteTableToText(irTable);
|
|
2615
|
+
if (demoted) {
|
|
2616
|
+
blocks.push({ type: "paragraph", text: demoted, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
|
|
2446
2617
|
}
|
|
2447
|
-
|
|
2618
|
+
continue;
|
|
2619
|
+
}
|
|
2620
|
+
blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
|
|
2448
2621
|
}
|
|
2449
2622
|
const remaining = items.filter((i) => !usedItems.has(i));
|
|
2450
2623
|
if (remaining.length > 0) {
|
|
@@ -2456,9 +2629,29 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
2456
2629
|
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
2457
2630
|
return by - ay;
|
|
2458
2631
|
});
|
|
2459
|
-
return allBlocks;
|
|
2632
|
+
return mergeAdjacentTableBlocks(allBlocks);
|
|
2460
2633
|
}
|
|
2461
|
-
return blocks;
|
|
2634
|
+
return mergeAdjacentTableBlocks(blocks);
|
|
2635
|
+
}
|
|
2636
|
+
/**
 * Joins consecutive table blocks that have the same number of columns.
 *
 * When two neighbouring blocks are both tables with equal `cols`, the rows of
 * the later table are appended to the earlier one. The merged block keeps
 * every other property of the earlier block (and its `hasHeader` flag); only
 * `table` is replaced. Input blocks are not mutated.
 *
 * @param {Array<object>} blocks
 * @returns {Array<object>} The input array itself when it has at most one block, otherwise a new array.
 */
function mergeAdjacentTableBlocks(blocks) {
  if (blocks.length <= 1) return blocks;
  const [first, ...rest] = blocks;
  const out = [first];
  for (const curr of rest) {
    const prev = out[out.length - 1];
    const bothTables = prev.type === "table" && curr.type === "table" && prev.table && curr.table;
    if (!bothTables || prev.table.cols !== curr.table.cols) {
      out.push(curr);
      continue;
    }
    const upper = prev.table;
    const lower = curr.table;
    out[out.length - 1] = {
      ...prev,
      table: {
        rows: upper.rows + lower.rows,
        cols: upper.cols,
        cells: [...upper.cells, ...lower.cells],
        hasHeader: upper.hasHeader
      }
    };
  }
  return out;
}
|
|
2463
2656
|
function extractPageBlocksFallback(items, pageNum) {
|
|
2464
2657
|
if (items.length === 0) return [];
|
|
@@ -2481,11 +2674,13 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
2481
2674
|
}));
|
|
2482
2675
|
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
2483
2676
|
if (clusterResults.length > 0) {
|
|
2677
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
2678
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
2484
2679
|
const usedIndices = /* @__PURE__ */ new Set();
|
|
2485
2680
|
for (const cr of clusterResults) {
|
|
2486
2681
|
for (const ci of cr.usedItems) {
|
|
2487
|
-
const idx =
|
|
2488
|
-
if (idx
|
|
2682
|
+
const idx = ciToIdx.get(ci);
|
|
2683
|
+
if (idx !== void 0) usedIndices.add(idx);
|
|
2489
2684
|
}
|
|
2490
2685
|
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
2491
2686
|
}
|
|
@@ -2796,7 +2991,8 @@ function mergeLineSimple(items) {
|
|
|
2796
2991
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
2797
2992
|
const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
|
|
2798
2993
|
if (gap > 15) result += " ";
|
|
2799
|
-
else if (gap < avgFs * 0.
|
|
2994
|
+
else if (gap < avgFs * 0.15) {
|
|
2995
|
+
} else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
|
|
2800
2996
|
} else if (gap > 3) result += " ";
|
|
2801
2997
|
result += sorted[i].text;
|
|
2802
2998
|
}
|
|
@@ -2804,8 +3000,8 @@ function mergeLineSimple(items) {
|
|
|
2804
3000
|
}
|
|
2805
3001
|
/**
 * Post-processes markdown extracted from a PDF.
 *
 * Steps, in order:
 *  1. blank out lines that look like dash-wrapped page numbers (e.g. "- 3 -");
 *  2. blank out "N / M" page counters;
 *  3. drop lines that consist only of 1-4 digits (also at the very end);
 *  4. merge wrapped lines via `mergeKoreanLines`;
 *  5. run `collapseEvenSpacing` on every 3-30 character line that does not start with "|";
 *  6. squeeze runs of three or more newlines down to two and trim.
 *
 * @param {string} text
 * @returns {string}
 */
function cleanPdfText(text) {
  const withoutPageMarks = text
    .replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "")
    .replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "")
    .replace(/\n\d{1,4}\n/g, "\n")
    .replace(/\n\d{1,4}$/, "");
  const mergedLines = mergeKoreanLines(withoutPageMarks);
  const respaced = mergedLines.replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line));
  return respaced.replace(/\n{3,}/g, "\n\n").trim();
}
|
|
2810
3006
|
function startsWithMarker(line) {
|
|
2811
3007
|
const t = line.trimStart();
|
|
@@ -2819,15 +3015,13 @@ function detectListBlocks(blocks) {
|
|
|
2819
3015
|
for (let i = 0; i < blocks.length; i++) {
|
|
2820
3016
|
const block = blocks[i];
|
|
2821
3017
|
if (block.type === "paragraph" && block.text) {
|
|
2822
|
-
const
|
|
2823
|
-
if (
|
|
2824
|
-
result.push({
|
|
2825
|
-
|
|
2826
|
-
|
|
2827
|
-
|
|
2828
|
-
|
|
2829
|
-
text: block.text
|
|
2830
|
-
});
|
|
3018
|
+
const text = block.text.trim();
|
|
3019
|
+
if (/^\d+\.\s/.test(text)) {
|
|
3020
|
+
result.push({ ...block, type: "list", listType: "ordered", text: block.text });
|
|
3021
|
+
continue;
|
|
3022
|
+
}
|
|
3023
|
+
if (/^[○●·※▶▷◆◇\-]\s/.test(text)) {
|
|
3024
|
+
result.push({ ...block, type: "list", listType: "unordered", text: block.text });
|
|
2831
3025
|
continue;
|
|
2832
3026
|
}
|
|
2833
3027
|
}
|
|
@@ -2986,11 +3180,20 @@ function mergeKoreanLines(text) {
|
|
|
2986
3180
|
for (let i = 1; i < lines.length; i++) {
|
|
2987
3181
|
const prev = result[result.length - 1];
|
|
2988
3182
|
const curr = lines[i];
|
|
2989
|
-
|
|
3183
|
+
const currTrimmed = curr.trim();
|
|
3184
|
+
if (/^#{1,6}\s/.test(prev) || /^#{1,6}\s/.test(curr) || /^\|/.test(currTrimmed) || /^---/.test(currTrimmed)) {
|
|
2990
3185
|
result.push(curr);
|
|
2991
3186
|
continue;
|
|
2992
3187
|
}
|
|
2993
|
-
if (
|
|
3188
|
+
if (/,$/.test(prev.trim()) && currTrimmed.length > 0) {
|
|
3189
|
+
result[result.length - 1] = prev + "\n" + curr;
|
|
3190
|
+
continue;
|
|
3191
|
+
}
|
|
3192
|
+
if (/^\(※/.test(currTrimmed)) {
|
|
3193
|
+
result[result.length - 1] = prev + " " + currTrimmed;
|
|
3194
|
+
continue;
|
|
3195
|
+
}
|
|
3196
|
+
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
|
|
2994
3197
|
result[result.length - 1] = prev + " " + curr;
|
|
2995
3198
|
} else {
|
|
2996
3199
|
result.push(curr);
|
|
@@ -3002,6 +3205,716 @@ function mergeKoreanLines(text) {
|
|
|
3002
3205
|
// src/index.ts
|
|
3003
3206
|
import { readFile } from "fs/promises";
|
|
3004
3207
|
|
|
3208
|
+
// src/xlsx/parser.ts
|
|
3209
|
+
import JSZip3 from "jszip";
|
|
3210
|
+
import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
|
|
3211
|
+
// Upper bound on the number of worksheets parseXlsxDocument will process; sheets beyond it are not read.
var MAX_SHEETS = 100;
|
|
3212
|
+
// Size limit handed to precheckZipSize() before an XLSX archive is opened (100 * 1024 * 1024; presumably bytes — confirm against precheckZipSize).
var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024;
|
|
3213
|
+
// parseWorksheet skips any <row> whose zero-based row index is at or above this limit.
var MAX_ROWS2 = 1e4;
|
|
3214
|
+
// parseWorksheet skips any cell whose zero-based column index is at or above this limit.
var MAX_COLS2 = 200;
|
|
3215
|
+
/**
 * Strips binary floating-point noise from a plain decimal string
 * (e.g. "0.30000000000000004" -> "0.3").
 *
 * Only strings of the form `-?digits.digits` are touched; everything else
 * (integers, text, exponent notation) is returned unchanged.
 *
 * @param {string} raw Raw cell value as stored in the worksheet XML.
 * @returns {string} The value rounded to 15 significant digits, or `raw` when
 *   it is not a plain decimal, is not finite, or cannot be rendered without
 *   exponent notation.
 */
function cleanNumericValue(raw) {
  if (!/^-?\d+\.\d+$/.test(raw)) return raw;
  const num = parseFloat(raw);
  if (!isFinite(num)) return raw;
  const cleaned = parseFloat(num.toPrecision(15)).toString();
  // Number#toString switches to exponent notation below 1e-6 and from 1e21 up.
  // Turning "0.0000001" into "1e-7" would make the cell harder to read than
  // the original, so keep the raw text in that case.
  return /e/i.test(cleaned) ? raw : cleaned;
}
|
|
3222
|
+
/**
 * Converts an A1-style cell reference into zero-based coordinates.
 *
 * Column letters are read as a bijective base-26 number (A=1 ... Z=26, AA=27).
 * Only upper-case letters followed by digits are accepted.
 *
 * @param {string} ref e.g. "B3".
 * @returns {{col: number, row: number} | null} Zero-based position, or null when `ref` is not of that form.
 */
function parseCellRef(ref) {
  const match = ref.match(/^([A-Z]+)(\d+)$/);
  if (!match) return null;
  const [, letters, digits] = match;
  const columnNumber = [...letters].reduce((acc, ch) => acc * 26 + (ch.charCodeAt(0) - 64), 0);
  return { col: columnNumber - 1, row: parseInt(digits, 10) - 1 };
}
|
|
3229
|
+
/**
 * Parses a merged-cell range such as "A1:C4" into zero-based corner coordinates.
 *
 * @param {string} ref Range made of exactly two cell references separated by ":".
 * @returns {{startCol: number, startRow: number, endCol: number, endRow: number} | null}
 *   null when the range does not have exactly two parts or either part is not a valid cell reference.
 */
function parseMergeRef(ref) {
  const [from, to, ...extra] = ref.split(":");
  if (to === undefined || extra.length > 0) return null;
  const first = parseCellRef(from);
  const last = parseCellRef(to);
  if (!first || !last) return null;
  return {
    startCol: first.col,
    startRow: first.row,
    endCol: last.col,
    endRow: last.row
  };
}
|
|
3237
|
+
/**
 * Copies the result of `parent.getElementsByTagName(tagName)` into a plain array.
 *
 * @param {{getElementsByTagName: Function}} parent Document or element to search under.
 * @param {string} tagName Tag name passed straight to `getElementsByTagName`.
 * @returns {Array} Matching nodes in the order the node list reports them.
 */
function getElements(parent, tagName) {
  const matches = parent.getElementsByTagName(tagName);
  const collected = [];
  let index = 0;
  while (index < matches.length) {
    collected.push(matches[index]);
    index += 1;
  }
  return collected;
}
|
|
3243
|
+
/**
 * Returns the trimmed text content of a node, or "" when it has none.
 *
 * @param {{textContent?: string | null}} el
 * @returns {string}
 */
function getTextContent(el) {
  const content = el.textContent;
  if (content === null || content === undefined) return "";
  return content.trim();
}
|
|
3246
|
+
/**
 * Parses an XML string into a DOM document using the bundled xmldom parser.
 *
 * @param {string} text XML source.
 * @returns {Document} Document produced by `parseFromString(text, "text/xml")`.
 */
function parseXml(text) {
  const parser = new DOMParser2();
  return parser.parseFromString(text, "text/xml");
}
|
|
3249
|
+
/**
 * Extracts the shared string table from `xl/sharedStrings.xml`.
 *
 * Each `<si>` entry becomes one string: the concatenation of all of its `<t>`
 * text runs, in document order. `<t>` elements that belong to a phonetic run
 * (`<rPh>`) are skipped; they carry pronunciation hints rather than cell text,
 * and including them would append the reading to the visible value.
 *
 * @param {string} xml Contents of sharedStrings.xml.
 * @returns {string[]} Strings indexed the way `t="s"` cells reference them.
 */
function parseSharedStrings(xml) {
  const doc = parseXml(xml);
  const strings = [];
  for (const si of getElements(doc.documentElement, "si")) {
    const visibleRuns = getElements(si, "t").filter((t) => t.parentNode?.nodeName !== "rPh");
    strings.push(visibleRuns.map((t) => t.textContent ?? "").join(""));
  }
  return strings;
}
|
|
3259
|
+
/**
 * Reads the sheet list from `xl/workbook.xml`.
 *
 * @param {string} xml Contents of workbook.xml.
 * @returns {Array<{name: string, sheetId: string, rId: string}>} One entry per
 *   `<sheet>` element, in document order. A missing `name` falls back to
 *   "Sheet<N>" (1-based position); missing `sheetId` / `r:id` become "".
 */
function parseWorkbook(xml) {
  const root = parseXml(xml).documentElement;
  return getElements(root, "sheet").map((el, position) => ({
    name: el.getAttribute("name") ?? `Sheet${position + 1}`,
    sheetId: el.getAttribute("sheetId") ?? "",
    rId: el.getAttribute("r:id") ?? ""
  }));
}
|
|
3272
|
+
/**
 * Builds a relationship-id -> target lookup from a `.rels` part.
 *
 * @param {string} xml Contents of the relationships file.
 * @returns {Map<string, string>} Maps each `Id` to its `Target`; `<Relationship>`
 *   elements missing either attribute are ignored.
 */
function parseRels(xml) {
  const root = parseXml(xml).documentElement;
  const targets = new Map();
  getElements(root, "Relationship").forEach((rel) => {
    const id = rel.getAttribute("Id");
    const target = rel.getAttribute("Target");
    if (!id || !target) return;
    targets.set(id, target);
  });
  return targets;
}
|
|
3283
|
+
/**
 * Parses one worksheet XML part into a dense grid of cell strings plus merge ranges.
 *
 * Cell values are resolved as follows:
 *  - `t="s"`: index into `sharedStrings` (missing index -> "");
 *  - `t="b"`: "TRUE" when the stored value is "1", otherwise "FALSE";
 *  - any other cell with a `<v>`: the value, passed through `cleanNumericValue`;
 *  - `t="inlineStr"` without `<v>`: concatenated `<t>` text of the `<is>` element;
 *  - a cell that is still empty but has a formula: "=" followed by the formula text.
 *
 * Rows and cells outside `MAX_ROWS2` / `MAX_COLS2` are skipped. The limit is
 * enforced on the coordinates taken from each cell reference as well as on the
 * `<row r>` attribute, because the grid is sized from the cell reference: a
 * crafted reference such as "A999999999" would otherwise force the allocation
 * of that many rows, and a row number of 0 would index the grid at -1.
 *
 * @param {string} xml Worksheet XML.
 * @param {string[]} sharedStrings Shared string table for `t="s"` cells.
 * @returns {{grid: string[][], merges: Array<object>, maxRow: number, maxCol: number}}
 *   `maxRow` / `maxCol` are the largest zero-based indices written (0 when the sheet is empty).
 */
function parseWorksheet(xml, sharedStrings) {
  const doc = parseXml(xml);
  const grid = [];
  let maxRow = 0;
  let maxCol = 0;
  for (const rowEl of getElements(doc.documentElement, "row")) {
    const rowNum = parseInt(rowEl.getAttribute("r") ?? "0", 10) - 1;
    if (rowNum < 0 || rowNum >= MAX_ROWS2) continue;
    for (const cellEl of getElements(rowEl, "c")) {
      const ref = cellEl.getAttribute("r");
      if (!ref) continue;
      const pos = parseCellRef(ref);
      if (!pos || pos.col >= MAX_COLS2) continue;
      // The grid is indexed by the cell's own reference, so bound that too.
      if (pos.row < 0 || pos.row >= MAX_ROWS2) continue;
      const type = cellEl.getAttribute("t");
      const vElements = getElements(cellEl, "v");
      const fElements = getElements(cellEl, "f");
      let value = "";
      if (vElements.length > 0) {
        const raw = getTextContent(vElements[0]);
        if (type === "s") {
          const idx = parseInt(raw, 10);
          value = sharedStrings[idx] ?? "";
        } else if (type === "b") {
          value = raw === "1" ? "TRUE" : "FALSE";
        } else {
          value = cleanNumericValue(raw);
        }
      } else if (type === "inlineStr") {
        const isEl = getElements(cellEl, "is");
        if (isEl.length > 0) {
          const tElements = getElements(isEl[0], "t");
          value = tElements.map((t) => t.textContent ?? "").join("");
        }
      }
      // No cached value: show the formula itself so the cell is not silently blank.
      if (!value && fElements.length > 0) {
        value = `=${getTextContent(fElements[0])}`;
      }
      // Grow the grid on demand; gaps are padded with empty strings.
      while (grid.length <= pos.row) grid.push([]);
      while (grid[pos.row].length <= pos.col) grid[pos.row].push("");
      grid[pos.row][pos.col] = value;
      if (pos.row > maxRow) maxRow = pos.row;
      if (pos.col > maxCol) maxCol = pos.col;
    }
  }
  const merges = [];
  for (const el of getElements(doc.documentElement, "mergeCell")) {
    const ref = el.getAttribute("ref");
    if (!ref) continue;
    const m = parseMergeRef(ref);
    if (m) merges.push(m);
  }
  return { grid, merges, maxRow, maxCol };
}
|
|
3339
|
+
/**
 * Converts one parsed worksheet into document blocks: an optional level-2
 * heading carrying the sheet name, followed by a single table block.
 *
 * Leading and trailing rows that contain no text are trimmed. For every merge
 * range the top-left cell receives the range's colSpan/rowSpan and the
 * remaining covered cells are omitted from their rows.
 *
 * Merge ranges come from the file and are not trusted: ranges whose end lies
 * before their start are ignored, and the covered-cell bookkeeping is limited
 * to the populated area (`maxRow` x `maxCol`). Cells outside that area are
 * never emitted, so a range like "A1:XFD1048576" no longer makes the function
 * walk billions of coordinates. Reported spans are left as declared.
 *
 * @param {string} sheetName Sheet title; a falsy name suppresses the heading.
 * @param {string[][]} grid Cell texts indexed as grid[row][col]; may be ragged.
 * @param {Array<{startCol: number, startRow: number, endCol: number, endRow: number}>} merges
 * @param {number} maxRow Largest populated zero-based row index.
 * @param {number} maxCol Largest populated zero-based column index.
 * @param {number} sheetIndex Zero-based sheet position; blocks get pageNumber `sheetIndex + 1`.
 * @returns {Array<object>} Heading and/or table blocks (possibly empty).
 */
function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
  const blocks = [];
  if (sheetName) {
    blocks.push({
      type: "heading",
      text: sheetName,
      level: 2,
      pageNumber: sheetIndex + 1
    });
  }
  if (maxRow < 0 || maxCol < 0 || grid.length === 0) return blocks;
  const mergeMap = new Map();
  const mergeSkip = new Set();
  for (const m of merges) {
    // A reversed range is malformed; treating it as "no merge" avoids zero or negative spans.
    if (m.endRow < m.startRow || m.endCol < m.startCol) continue;
    const colSpan = m.endCol - m.startCol + 1;
    const rowSpan = m.endRow - m.startRow + 1;
    mergeMap.set(`${m.startRow},${m.startCol}`, { colSpan, rowSpan });
    // Only coordinates inside the populated area are ever looked up below.
    const rowLimit = Math.min(m.endRow, maxRow);
    const colLimit = Math.min(m.endCol, maxCol);
    for (let r = m.startRow; r <= rowLimit; r++) {
      for (let c = m.startCol; c <= colLimit; c++) {
        if (r !== m.startRow || c !== m.startCol) {
          mergeSkip.add(`${r},${c}`);
        }
      }
    }
  }
  // Find the first and last rows that actually contain text.
  let firstRow = -1;
  let lastRow = -1;
  for (let r = 0; r <= maxRow; r++) {
    const row = grid[r];
    if (row && row.some((cell) => cell !== "")) {
      if (firstRow === -1) firstRow = r;
      lastRow = r;
    }
  }
  if (firstRow === -1) return blocks;
  const cellRows = [];
  for (let r = firstRow; r <= lastRow; r++) {
    const row = [];
    for (let c = 0; c <= maxCol; c++) {
      const key = `${r},${c}`;
      if (mergeSkip.has(key)) continue;
      const text = (grid[r] && grid[r][c]) ?? "";
      const merge = mergeMap.get(key);
      row.push({
        text,
        colSpan: merge?.colSpan ?? 1,
        rowSpan: merge?.rowSpan ?? 1
      });
    }
    cellRows.push(row);
  }
  if (cellRows.length > 0) {
    const table = buildTable(cellRows);
    if (table.rows > 0) {
      blocks.push({ type: "table", table, pageNumber: sheetIndex + 1 });
    }
  }
  return blocks;
}
|
|
3398
|
+
/**
 * Parses an XLSX workbook into markdown, blocks and metadata.
 *
 * Each worksheet is treated as one "page": `options.pages` selects sheets by
 * 1-based position and block page numbers are sheet positions. At most
 * `MAX_SHEETS` sheets are processed. A sheet whose part is missing or fails to
 * parse is skipped and reported as a "PARTIAL_PARSE" warning instead of
 * aborting the whole document.
 *
 * @param {ArrayBuffer} buffer Raw file contents (checked with `precheckZipSize` first).
 * @param {{pages?: unknown, onProgress?: (current: number, total: number) => void}} [options]
 * @returns {Promise<{markdown: string, blocks: Array<object>, metadata: object, warnings: Array<object> | undefined}>}
 * @throws {KordocError} When `xl/workbook.xml` is missing or lists no sheets.
 */
async function parseXlsxDocument(buffer, options) {
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
  const zip = await JSZip3.loadAsync(buffer);
  const warnings = [];

  const workbookEntry = zip.file("xl/workbook.xml");
  if (!workbookEntry) {
    throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
  }

  const sharedStringsEntry = zip.file("xl/sharedStrings.xml");
  const sharedStrings = sharedStringsEntry ? parseSharedStrings(await sharedStringsEntry.async("text")) : [];

  const sheets = parseWorkbook(await workbookEntry.async("text"));
  if (sheets.length === 0) {
    throw new KordocError("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
  }

  const relsEntry = zip.file("xl/_rels/workbook.xml.rels");
  const relsMap = relsEntry ? parseRels(await relsEntry.async("text")) : new Map();

  let pageFilter = null;
  if (options?.pages) {
    const { parsePageRange: parsePageRange2 } = await import("./page-range-737B4EZW.js");
    pageFilter = parsePageRange2(options.pages, sheets.length);
  }

  const blocks = [];
  const processedSheets = Math.min(sheets.length, MAX_SHEETS);
  for (let i = 0; i < processedSheets; i++) {
    const sheetNumber = i + 1;
    if (pageFilter && !pageFilter.has(sheetNumber)) continue;
    const sheet = sheets[i];
    options?.onProgress?.(sheetNumber, processedSheets);

    // Relationship targets are relative to xl/ unless they start with "/";
    // without a relationship, fall back to the conventional part name.
    const relTarget = relsMap.get(sheet.rId);
    let sheetPath;
    if (!relTarget) {
      sheetPath = `xl/worksheets/sheet${i + 1}.xml`;
    } else if (relTarget.startsWith("/")) {
      sheetPath = relTarget.slice(1);
    } else if (relTarget.startsWith("xl/")) {
      sheetPath = relTarget;
    } else {
      sheetPath = `xl/${relTarget}`;
    }

    const sheetEntry = zip.file(sheetPath);
    if (!sheetEntry) {
      warnings.push({
        page: sheetNumber,
        message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${sheetPath}`,
        code: "PARTIAL_PARSE"
      });
      continue;
    }

    try {
      const sheetXml = await sheetEntry.async("text");
      const parsed = parseWorksheet(sheetXml, sharedStrings);
      blocks.push(...sheetToBlocks(sheet.name, parsed.grid, parsed.merges, parsed.maxRow, parsed.maxCol, i));
    } catch (err) {
      warnings.push({
        page: sheetNumber,
        message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC2F1 \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
        code: "PARTIAL_PARSE"
      });
    }
  }

  const metadata = { pageCount: processedSheets };
  const coreEntry = zip.file("docProps/core.xml");
  if (coreEntry) {
    try {
      const coreDoc = parseXml(await coreEntry.async("text"));
      // Trimmed text of the first element with this tag, or undefined when absent.
      const firstText = (tag) => {
        const found = coreDoc.getElementsByTagName(tag);
        return found.length > 0 ? (found[0].textContent ?? "").trim() : void 0;
      };
      metadata.title = firstText("dc:title") || firstText("dcterms:title");
      metadata.author = firstText("dc:creator");
      metadata.description = firstText("dc:description");
      const created = firstText("dcterms:created");
      if (created) metadata.createdAt = created;
      const modified = firstText("dcterms:modified");
      if (modified) metadata.modifiedAt = modified;
    } catch {
      // Core properties are optional; an unreadable part leaves the metadata partial.
    }
  }

  const markdown = blocksToMarkdown(blocks);
  return { markdown, blocks, metadata, warnings: warnings.length > 0 ? warnings : void 0 };
}
|
|
3488
|
+
|
|
3489
|
+
// src/docx/parser.ts
|
|
3490
|
+
import JSZip4 from "jszip";
|
|
3491
|
+
import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
|
|
3492
|
+
// Size limit for DOCX archives (100 * 1024 * 1024). NOTE(review): presumably passed to precheckZipSize like the XLSX limit — its use is not visible here, confirm.
var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
|
|
3493
|
+
/**
 * Returns the direct element children of `parent` with the given local name.
 *
 * A child matches when its `localName` equals `localName` or its `tagName`
 * ends with ":<localName>" (prefixed form such as "w:p"). Non-element nodes
 * are ignored and descendants deeper than one level are not visited.
 *
 * @param {{childNodes: ArrayLike<object>}} parent
 * @param {string} localName
 * @returns {Array<object>} Matching children in document order.
 */
function getChildElements(parent, localName) {
  const prefixedSuffix = `:${localName}`;
  const matches = [];
  const { childNodes } = parent;
  for (let idx = 0; idx < childNodes.length; idx += 1) {
    const candidate = childNodes[idx];
    if (candidate.nodeType !== 1) continue;
    if (candidate.localName === localName || candidate.tagName?.endsWith(prefixedSuffix)) {
      matches.push(candidate);
    }
  }
  return matches;
}
|
|
3507
|
+
/**
 * Collects every descendant element of `parent` with the given local name.
 *
 * An element matches when its `localName` equals `localName` or its `tagName`
 * ends with ":<localName>". Results are in document (pre-order) order;
 * `parent` itself is never included and only element nodes are descended into.
 *
 * The traversal uses an explicit stack instead of recursion so that a deeply
 * nested document cannot exhaust the call stack.
 *
 * @param {{childNodes: ArrayLike<object>}} parent Document or element to search under.
 * @param {string} localName
 * @returns {Array<object>} Matching descendants.
 */
function findElements(parent, localName) {
  const prefixedSuffix = `:${localName}`;
  const result = [];
  const pending = [];
  // Push element children right-to-left so the leftmost child is popped first.
  const queueChildren = (node) => {
    const children = node.childNodes;
    for (let i = children.length - 1; i >= 0; i--) {
      if (children[i].nodeType === 1) pending.push(children[i]);
    }
  };
  queueChildren(parent);
  while (pending.length > 0) {
    const el = pending.pop();
    if (el.localName === localName || el.tagName?.endsWith(prefixedSuffix)) {
      result.push(el);
    }
    queueChildren(el);
  }
  return result;
}
|
|
3525
|
+
/**
 * Looks up an attribute by local name, ignoring any namespace prefix.
 *
 * The first attribute whose `localName` or full `name` equals `localName` wins.
 *
 * @param {{attributes: ArrayLike<{localName?: string, name?: string, value: string}>}} el
 * @param {string} localName
 * @returns {string | null} The attribute value, or null when no attribute matches.
 */
function getAttr(el, localName) {
  const { attributes } = el;
  let position = 0;
  while (position < attributes.length) {
    const candidate = attributes[position];
    if (candidate.localName === localName || candidate.name === localName) {
      return candidate.value;
    }
    position += 1;
  }
  return null;
}
|
|
3533
|
+
/**
 * Parses an XML string into a DOM document using the bundled xmldom parser.
 *
 * @param {string} text XML source.
 * @returns {Document} Document produced by `parseFromString(text, "text/xml")`.
 */
function parseXml2(text) {
  const parser = new DOMParser3();
  return parser.parseFromString(text, "text/xml");
}
|
|
3536
|
+
/**
 * Builds a style table from a DOCX styles part.
 *
 * For every `style` element with a `styleId` it records the style name, the
 * id of the style it is based on, and an outline level. The outline level is
 * taken from `pPr/outlineLvl` when present; otherwise a name of the form
 * "heading N" (case-insensitive) yields level N - 1.
 *
 * @param {string} xml Styles XML.
 * @returns {Map<string, {name: string, basedOn: string | undefined, outlineLevel: number | undefined}>}
 */
function parseStyles(xml) {
  const doc = parseXml2(xml);
  const styles = new Map();

  // `val` attribute of the first direct child called `childName`, or null without such a child.
  const firstChildVal = (owner, childName) => {
    const [child] = getChildElements(owner, childName);
    return child ? getAttr(child, "val") : null;
  };

  for (const styleEl of findElements(doc, "style")) {
    const styleId = getAttr(styleEl, "styleId");
    if (!styleId) continue;

    const name = firstChildVal(styleEl, "name") ?? "";
    const basedOn = firstChildVal(styleEl, "basedOn") ?? void 0;

    let outlineLevel;
    const [pPr] = getChildElements(styleEl, "pPr");
    if (pPr) {
      const declared = firstChildVal(pPr, "outlineLvl");
      if (declared !== null) outlineLevel = parseInt(declared, 10);
    }
    if (outlineLevel === void 0) {
      const headingMatch = name.match(/^(?:heading|Heading)\s*(\d+)$/i);
      if (headingMatch) outlineLevel = parseInt(headingMatch[1], 10) - 1;
    }

    styles.set(styleId, { name, basedOn, outlineLevel });
  }
  return styles;
}
|
|
3564
|
+
/**
 * Resolves DOCX list numbering definitions.
 *
 * First every `abstractNum` is indexed by its `abstractNumId`, mapping each
 * level index (`ilvl`, default 0) to its number format (`numFmt`, default
 * "bullet"). Then every `num` element is linked to the level table of the
 * abstract definition it references; `num` elements without a `numId`, without
 * an `abstractNumId` child, or pointing at an unknown definition are left out.
 *
 * @param {string} xml Numbering XML.
 * @returns {Map<string, Map<number, {numFmt: string, level: number}>>} numId -> level table.
 */
function parseNumbering(xml) {
  const doc = parseXml2(xml);

  const abstractNums = new Map();
  for (const abstractEl of findElements(doc, "abstractNum")) {
    const abstractId = getAttr(abstractEl, "abstractNumId");
    if (!abstractId) continue;
    const levels = new Map();
    for (const lvlEl of getChildElements(abstractEl, "lvl")) {
      const ilvl = parseInt(getAttr(lvlEl, "ilvl") ?? "0", 10);
      const [fmtEl] = getChildElements(lvlEl, "numFmt");
      const numFmt = fmtEl ? getAttr(fmtEl, "val") ?? "bullet" : "bullet";
      levels.set(ilvl, { numFmt, level: ilvl });
    }
    abstractNums.set(abstractId, levels);
  }

  const nums = new Map();
  for (const numEl of findElements(doc, "num")) {
    const numId = getAttr(numEl, "numId");
    if (!numId) continue;
    const [refEl] = getChildElements(numEl, "abstractNumId");
    if (!refEl) continue;
    const referencedId = getAttr(refEl, "val");
    if (referencedId && abstractNums.has(referencedId)) {
      nums.set(numId, abstractNums.get(referencedId));
    }
  }
  return nums;
}
|
|
3596
|
+
/**
 * Parse a relationships part into an id -> target lookup.
 *
 * @param {string} xml - Raw XML text of the `.rels` part.
 * @returns {Map<string, string>} Relationship `Id` -> `Target`. Entries that
 *   lack either attribute are skipped.
 */
function parseRels2(xml) {
  const doc = parseXml2(xml);
  const targetsById = new Map();
  for (const relEl of findElements(doc, "Relationship")) {
    const relId = getAttr(relEl, "Id");
    const relTarget = getAttr(relEl, "Target");
    if (!relId || !relTarget) continue;
    targetsById.set(relId, relTarget);
  }
  return targetsById;
}
|
|
3607
|
+
/**
 * Parse the footnotes part into an id -> plain-text lookup.
 *
 * @param {string} xml - Raw XML text of the footnotes part.
 * @returns {Map<string, string>} Footnote id -> trimmed text, built by
 *   concatenating every `t` child of every run inside the footnote.
 */
function parseFootnotes(xml) {
  const doc = parseXml2(xml);
  const textById = new Map();

  for (const noteEl of findElements(doc, "footnote")) {
    const noteId = getAttr(noteEl, "id");
    // Ids "0" and "-1" are skipped — presumably the separator entries Word
    // emits rather than user-authored notes (TODO confirm).
    const isSkippedId = noteId === "0" || noteId === "-1";
    if (!noteId || isSkippedId) continue;

    let combined = "";
    for (const paraEl of findElements(noteEl, "p")) {
      for (const runEl of findElements(paraEl, "r")) {
        for (const textEl of getChildElements(runEl, "t")) {
          combined += textEl.textContent ?? "";
        }
      }
    }
    textById.set(noteId, combined.trim());
  }
  return textById;
}
|
|
3627
|
+
/**
 * Extract the text and basic character formatting of a single run element.
 *
 * Bold (`b`) and italic (`i`) are on/off toggle properties: the element being
 * present with no `val` means "on", but an explicit `val` of "0", "false" or
 * "off" switches the property off. Checking only for the element's presence
 * would report such runs as bold/italic, so the attribute is honoured here.
 *
 * @param {Element} r - A run (`r`) element.
 * @returns {{text: string, bold: boolean, italic: boolean}} Concatenated text
 *   of the run's direct `t` children plus its effective bold/italic flags.
 */
function extractRun(r) {
  const text = getChildElements(r, "t").map((t) => t.textContent ?? "").join("");
  const rPrEls = getChildElements(r, "rPr");
  if (rPrEls.length === 0) return { text, bold: false, italic: false };

  const isToggleOn = (propName) => {
    const propEls = getChildElements(rPrEls[0], propName);
    if (propEls.length === 0) return false;
    const val = getAttr(propEls[0], "val");
    // No attribute => property is on; otherwise only the "off" spellings disable it.
    if (val == null) return true;
    return val !== "0" && val !== "false" && val !== "off";
  };

  return { text, bold: isToggleOn("b"), italic: isToggleOn("i") };
}
|
|
3639
|
+
/**
 * Convert one paragraph element into a heading, list or paragraph block.
 *
 * The paragraph's direct children are walked in document order so that text
 * inside hyperlinks stays where it appears in the sentence (collecting all
 * hyperlinks first and plain runs afterwards would reorder the text).
 *
 * @param {Element} p - The paragraph (`p`) element.
 * @param {Map<string, {name: string, basedOn?: string, outlineLevel?: number}>} styles
 *   Style table keyed by style id.
 * @param {Map<string, Map<number, {numFmt: string, level: number}>>} numbering
 *   Numbering table keyed by numId, then by level.
 * @param {Map<string, string>} footnotes - Footnote text keyed by footnote id.
 * @param {Map<string, string>} rels - Relationship targets keyed by relationship id.
 * @returns {object|null} `null` when the paragraph has no text; otherwise a
 *   `heading` block (outline level 0-5 mapped to level 1-6), a `list` block
 *   (when a non-"0" numId is set), or a `paragraph` block that may carry
 *   `style`, `href` and `footnoteText`.
 */
function parseParagraph(p, styles, numbering, footnotes, rels) {
  let styleId = "";
  let numId = "";
  let ilvl = 0;
  const pPrEls = getChildElements(p, "pPr");
  if (pPrEls.length > 0) {
    const pStyleEls = getChildElements(pPrEls[0], "pStyle");
    if (pStyleEls.length > 0) styleId = getAttr(pStyleEls[0], "val") ?? "";
    const numPrEls = getChildElements(pPrEls[0], "numPr");
    if (numPrEls.length > 0) {
      const numIdEls = getChildElements(numPrEls[0], "numId");
      const ilvlEls = getChildElements(numPrEls[0], "ilvl");
      numId = numIdEls.length > 0 ? getAttr(numIdEls[0], "val") ?? "" : "";
      if (ilvlEls.length > 0) {
        const parsedLevel = parseInt(getAttr(ilvlEls[0], "val") ?? "0", 10);
        // A non-numeric level falls back to the top level instead of NaN.
        ilvl = Number.isNaN(parsedLevel) ? 0 : parsedLevel;
      }
    }
  }

  const parts = [];
  let hasBold = false;
  let hasItalic = false;
  let href;
  let footnoteText;

  const children = p.childNodes ?? [];
  for (let i = 0; i < children.length; i++) {
    const node = children[i];
    if (node.nodeType !== 1) continue;
    const localName = node.localName ?? node.tagName?.split(":").pop();

    if (localName === "hyperlink") {
      const linkText = findElements(node, "r").map((r) => extractRun(r).text).join("");
      if (!linkText) continue;
      parts.push(linkText);
      const rId = getAttr(node, "id");
      // Only one href is kept per block: the last resolvable link wins.
      // NOTE(review): the target is taken verbatim from the relationships
      // part; consider validating its scheme before it is rendered.
      if (rId && rels.has(rId)) href = rels.get(rId);
    } else if (localName === "r") {
      const result = extractRun(node);
      if (result.bold) hasBold = true;
      if (result.italic) hasItalic = true;
      const fnRefEls = getChildElements(node, "footnoteReference");
      if (fnRefEls.length > 0) {
        const fnId = getAttr(fnRefEls[0], "id");
        if (fnId && footnotes.has(fnId)) footnoteText = footnotes.get(fnId);
      }
      if (result.text) parts.push(result.text);
    }
  }

  const text = parts.join("").trim();
  if (!text) return null;

  const style = styles.get(styleId);
  if (style?.outlineLevel !== void 0 && style.outlineLevel >= 0 && style.outlineLevel <= 5) {
    return {
      type: "heading",
      text,
      level: style.outlineLevel + 1
    };
  }

  if (numId && numId !== "0") {
    const levelInfo = numbering.get(numId)?.get(ilvl);
    // Anything that is not explicitly a bullet (including an unknown
    // definition) is reported as an ordered list.
    const listType = levelInfo?.numFmt === "bullet" ? "unordered" : "ordered";
    return { type: "list", text, listType };
  }

  const block = { type: "paragraph", text };
  if (hasBold || hasItalic) {
    block.style = { bold: hasBold || void 0, italic: hasItalic || void 0 };
  }
  if (href) block.href = href;
  if (footnoteText) block.footnoteText = footnoteText;
  return block;
}
|
|
3720
|
+
/**
 * Convert a table element into a `table` block with merged cells resolved.
 *
 * Vertical merges are tracked by grid column (the running sum of the column
 * spans to the left), not by the cell's index within its row, so rows that
 * contain horizontally merged cells still line up. A `vMerge` element whose
 * `val` is anything other than "restart" — including a missing `val`, which
 * the schema defines as "continue" — extends the cell directly above it and
 * is not emitted as a cell of its own.
 *
 * @param {Element} tbl - The table (`tbl`) element.
 * @param {Map} styles - Passed through to `parseParagraph`.
 * @param {Map} numbering - Passed through to `parseParagraph`.
 * @param {Map} footnotes - Passed through to `parseParagraph`.
 * @param {Map} rels - Passed through to `parseParagraph`.
 * @returns {{type: "table", table: {rows: number, cols: number, cells: Array<Array<{text: string, colSpan: number, rowSpan: number}>>, hasHeader: boolean}}|null}
 *   `null` when the table has no rows.
 */
function parseTable(tbl, styles, numbering, footnotes, rels) {
  const trElements = getChildElements(tbl, "tr");
  if (trElements.length === 0) return null;

  const cellRows = [];
  // grid column -> { cell, lastRow }: the cell a continuation in that column
  // would extend, and the last row index that cell currently reaches.
  const mergeAnchors = new Map();

  for (let rowIdx = 0; rowIdx < trElements.length; rowIdx++) {
    const row = [];
    let gridCol = 0;

    for (const tc of getChildElements(trElements[rowIdx], "tc")) {
      let colSpan = 1;
      let isContinuation = false;

      const tcPrEls = getChildElements(tc, "tcPr");
      if (tcPrEls.length > 0) {
        const gridSpanEls = getChildElements(tcPrEls[0], "gridSpan");
        if (gridSpanEls.length > 0) {
          const parsedSpan = parseInt(getAttr(gridSpanEls[0], "val") ?? "1", 10);
          // Ignore missing, non-numeric or non-positive spans.
          if (Number.isInteger(parsedSpan) && parsedSpan > 0) colSpan = parsedSpan;
        }
        const vMergeEls = getChildElements(tcPrEls[0], "vMerge");
        if (vMergeEls.length > 0) {
          isContinuation = getAttr(vMergeEls[0], "val") !== "restart";
        }
      }

      const startCol = gridCol;
      gridCol += colSpan;

      if (isContinuation) {
        const anchor = mergeAnchors.get(startCol);
        // Only extend a cell that reaches the row immediately above.
        if (anchor && anchor.lastRow === rowIdx - 1) {
          anchor.cell.rowSpan++;
          anchor.lastRow = rowIdx;
          continue;
        }
        // A continuation with nothing above it is kept as an ordinary cell
        // rather than dropped, so the row does not lose a column.
      }

      const cellTexts = [];
      for (const p of getChildElements(tc, "p")) {
        const block = parseParagraph(p, styles, numbering, footnotes, rels);
        if (block?.text) cellTexts.push(block.text);
      }
      const cell = { text: cellTexts.join("\n"), colSpan, rowSpan: 1 };
      row.push(cell);

      // This cell replaces whatever previously anchored the columns it covers.
      for (const col of mergeAnchors.keys()) {
        if (col >= startCol && col < gridCol) mergeAnchors.delete(col);
      }
      mergeAnchors.set(startCol, { cell, lastRow: rowIdx });
    }

    cellRows.push(row);
  }

  // Table width is the widest row measured in grid columns.
  let cols = 0;
  for (const row of cellRows) {
    let width = 0;
    for (const cell of row) width += cell.colSpan;
    if (width > cols) cols = width;
  }

  const table = {
    rows: cellRows.length,
    cols,
    cells: cellRows,
    hasHeader: cellRows.length > 1
  };
  return { type: "table", table };
}
|
|
3789
|
+
// Extension -> MIME type for the image formats recognised in a DOCX package.
const DOCX_IMAGE_MIME_TYPES = {
  png: "image/png",
  jpg: "image/jpeg",
  jpeg: "image/jpeg",
  gif: "image/gif",
  bmp: "image/bmp",
  wmf: "image/wmf",
  emf: "image/emf"
};

// Resolve a relationship target to a path inside the archive. Relative
// targets are taken relative to "word/"; "." and ".." segments are collapsed.
function resolveDocxImagePath(target) {
  const raw = target.startsWith("/") ? target.slice(1) : target.startsWith("word/") ? target : `word/${target}`;
  const segments = [];
  for (const segment of raw.split("/")) {
    if (segment === "" || segment === ".") continue;
    if (segment === "..") segments.pop();
    else segments.push(segment);
  }
  return segments.join("/");
}

// Lower-cased extension of the last path segment, or "png" when the name has
// no extension or the extension contains anything but letters and digits.
function safeDocxImageExtension(path) {
  const baseName = path.slice(path.lastIndexOf("/") + 1);
  const dotAt = baseName.lastIndexOf(".");
  if (dotAt < 0) return "png";
  const ext = baseName.slice(dotAt + 1).toLowerCase();
  return /^[a-z0-9]{1,8}$/.test(ext) ? ext : "png";
}

/**
 * Collect the embedded images referenced by `drawing` elements.
 *
 * Output filenames are generated (`image_001.<ext>`, ...) and never reuse the
 * archive path: the extension is validated so that a target without a dot, or
 * with unusual characters, cannot inject path separators into the filename.
 *
 * @param {object} zip - Loaded archive exposing `file(path)` whose entries
 *   provide `async("uint8array")`.
 * @param {Map<string, string>} rels - Relationship targets keyed by id.
 * @param {Document} doc - Parsed main document.
 * @returns {Promise<{blocks: Array<{type: "image", text: string}>, images: Array<{filename: string, data: Uint8Array, mimeType: string}>}>}
 *   One block and one image record per image that could be read.
 */
async function extractImages(zip, rels, doc) {
  const blocks = [];
  const images = [];
  let imgIdx = 0;

  for (const drawing of findElements(doc.documentElement, "drawing")) {
    for (const blip of findElements(drawing, "blip")) {
      const embedId = getAttr(blip, "embed");
      if (!embedId) continue;
      const target = rels.get(embedId);
      if (!target) continue;

      const imgPath = resolveDocxImagePath(target);
      const imgFile = zip.file(imgPath);
      if (!imgFile) continue;

      try {
        const data = await imgFile.async("uint8array");
        // Number only the images that were actually read.
        imgIdx++;
        const ext = safeDocxImageExtension(imgPath);
        const filename = `image_${String(imgIdx).padStart(3, "0")}.${ext}`;
        images.push({ filename, data, mimeType: DOCX_IMAGE_MIME_TYPES[ext] ?? "image/png" });
        blocks.push({ type: "image", text: filename });
      } catch {
        // Best effort: an unreadable image entry is skipped so the rest of
        // the document can still be returned.
      }
    }
  }
  return { blocks, images };
}
|
|
3826
|
+
/**
 * Parse a DOCX archive into markdown, blocks, metadata, outline and images.
 *
 * The main document part is required. The relationships part is parsed
 * without error handling, whereas styles, numbering, footnotes and core
 * properties are best effort: if one of them fails to parse, an empty table
 * (or partially filled metadata) is used instead.
 *
 * @param {ArrayBuffer} buffer - Raw DOCX bytes — assumed ArrayBuffer-like; TODO confirm against callers.
 * @param {object} [options] - Accepted for interface parity; not read here.
 * @returns {Promise<{markdown: string, blocks: object[], metadata: object, outline?: Array<{level: number, text: string}>, warnings?: any[], images?: object[]}>}
 * @throws {KordocError} When `word/document.xml` or its body element is missing.
 */
async function parseDocxDocument(buffer, options) {
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
  const zip = await JSZip4.loadAsync(buffer);
  // Nothing in this function currently records a warning; the list is kept
  // so the result shape stays stable.
  const warnings = [];

  const docFile = zip.file("word/document.xml");
  if (!docFile) {
    throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
  }

  const relsFile = zip.file("word/_rels/document.xml.rels");
  const rels = relsFile ? parseRels2(await relsFile.async("text")) : new Map();

  // Read an optional part; a missing or unparsable part yields an empty Map.
  const readOptionalPart = async (path, parser) => {
    const entry = zip.file(path);
    if (entry) {
      try {
        return parser(await entry.async("text"));
      } catch {
        // Deliberately ignored: the document is still usable without this part.
      }
    }
    return new Map();
  };
  const styles = await readOptionalPart("word/styles.xml", parseStyles);
  const numbering = await readOptionalPart("word/numbering.xml", parseNumbering);
  const footnotes = await readOptionalPart("word/footnotes.xml", parseFootnotes);

  const doc = parseXml2(await docFile.async("text"));
  const bodyMatches = findElements(doc, "body");
  if (bodyMatches.length === 0) {
    throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
  }

  // Only direct children of the body are turned into blocks.
  const blocks = [];
  const bodyNodes = bodyMatches[0].childNodes;
  for (let idx = 0; idx < bodyNodes.length; idx++) {
    const el = bodyNodes[idx];
    if (el.nodeType !== 1) continue;
    const tag = el.localName ?? el.tagName?.split(":").pop();
    let parsedBlock = null;
    if (tag === "p") {
      parsedBlock = parseParagraph(el, styles, numbering, footnotes, rels);
    } else if (tag === "tbl") {
      parsedBlock = parseTable(el, styles, numbering, footnotes, rels);
    }
    if (parsedBlock) blocks.push(parsedBlock);
  }

  // Only the image payloads are used; the image blocks returned alongside
  // them are not merged into `blocks`.
  const { images } = await extractImages(zip, rels, doc);

  const metadata = {};
  const coreFile = zip.file("docProps/core.xml");
  if (coreFile) {
    try {
      const coreDoc = parseXml2(await coreFile.async("text"));
      const firstText = (tag) => {
        const matches = coreDoc.getElementsByTagName(tag);
        if (matches.length === 0) return void 0;
        return (matches[0].textContent ?? "").trim();
      };
      metadata.title = firstText("dc:title") || firstText("dcterms:title");
      metadata.author = firstText("dc:creator");
      metadata.description = firstText("dc:description");
      const createdAt = firstText("dcterms:created");
      if (createdAt) metadata.createdAt = createdAt;
      const modifiedAt = firstText("dcterms:modified");
      if (modifiedAt) metadata.modifiedAt = modifiedAt;
    } catch {
      // Deliberately ignored: metadata is optional.
    }
  }

  const outline = [];
  for (const b of blocks) {
    if (b.type !== "heading") continue;
    outline.push({ level: b.level ?? 2, text: b.text ?? "" });
  }

  const markdown = blocksToMarkdown(blocks);
  return {
    markdown,
    blocks,
    metadata,
    outline: outline.length > 0 ? outline : void 0,
    warnings: warnings.length > 0 ? warnings : void 0,
    images: images.length > 0 ? images : void 0
  };
}
|
|
3917
|
+
|
|
3005
3918
|
// src/form/recognize.ts
|
|
3006
3919
|
var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
|
|
3007
3920
|
"\uC131\uBA85",
|
|
@@ -3129,7 +4042,7 @@ function extractInlineFields(text) {
|
|
|
3129
4042
|
}
|
|
3130
4043
|
|
|
3131
4044
|
// src/hwpx/generator.ts
|
|
3132
|
-
import
|
|
4045
|
+
import JSZip5 from "jszip";
|
|
3133
4046
|
|
|
3134
4047
|
// src/index.ts
|
|
3135
4048
|
async function parse(input, options) {
|
|
@@ -3152,8 +4065,12 @@ async function parse(input, options) {
|
|
|
3152
4065
|
}
|
|
3153
4066
|
const format = detectFormat(buffer);
|
|
3154
4067
|
switch (format) {
|
|
3155
|
-
case "hwpx":
|
|
4068
|
+
case "hwpx": {
|
|
4069
|
+
const zipFormat = await detectZipFormat(buffer);
|
|
4070
|
+
if (zipFormat === "xlsx") return parseXlsx(buffer, options);
|
|
4071
|
+
if (zipFormat === "docx") return parseDocx(buffer, options);
|
|
3156
4072
|
return parseHwpx(buffer, options);
|
|
4073
|
+
}
|
|
3157
4074
|
case "hwp":
|
|
3158
4075
|
return parseHwp(buffer, options);
|
|
3159
4076
|
case "pdf":
|
|
@@ -3180,9 +4097,27 @@ async function parseHwp(buffer, options) {
|
|
|
3180
4097
|
}
|
|
3181
4098
|
/**
 * Parse a PDF buffer and wrap the outcome in a success/failure result.
 *
 * @param {ArrayBuffer} buffer - Raw PDF bytes.
 * @param {object} [options] - Forwarded unchanged to `parsePdfDocument`.
 * @returns {Promise<object>} On success the parsed fields plus
 *   `success: true`; on failure `success: false` with `error`, `code` and,
 *   when the thrown Error carries an `isImageBased` property, `isImageBased: true`.
 */
async function parsePdf(buffer, options) {
  try {
    const parsed = await parsePdfDocument(buffer, options);
    return {
      success: true,
      fileType: "pdf",
      markdown: parsed.markdown,
      blocks: parsed.blocks,
      metadata: parsed.metadata,
      outline: parsed.outline,
      warnings: parsed.warnings,
      isImageBased: parsed.isImageBased
    };
  } catch (err) {
    const isError = err instanceof Error;
    // The flag is set whenever the property exists on the error, regardless of its value.
    const isImageBased = isError && "isImageBased" in err ? true : void 0;
    const error = isError ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "pdf", error, code: classifyError(err), isImageBased };
  }
}
|
|
4107
|
+
/**
 * Parse an XLSX buffer and wrap the outcome in a success/failure result.
 *
 * @param {ArrayBuffer} buffer - Raw XLSX bytes.
 * @param {object} [options] - Forwarded unchanged to `parseXlsxDocument`.
 * @returns {Promise<object>} `success: true` with markdown, blocks, metadata
 *   and warnings, or `success: false` with `error` and `code`.
 */
async function parseXlsx(buffer, options) {
  try {
    const parsed = await parseXlsxDocument(buffer, options);
    return {
      success: true,
      fileType: "xlsx",
      markdown: parsed.markdown,
      blocks: parsed.blocks,
      metadata: parsed.metadata,
      warnings: parsed.warnings
    };
  } catch (err) {
    const error = err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "xlsx", error, code: classifyError(err) };
  }
}
|
|
4115
|
+
/**
 * Parse a DOCX buffer and wrap the outcome in a success/failure result.
 *
 * @param {ArrayBuffer} buffer - Raw DOCX bytes.
 * @param {object} [options] - Forwarded unchanged to `parseDocxDocument`.
 * @returns {Promise<object>} `success: true` with the parsed fields (`images`
 *   is omitted when empty), or `success: false` with `error` and `code`.
 */
async function parseDocx(buffer, options) {
  try {
    const parsed = await parseDocxDocument(buffer, options);
    const { images } = parsed;
    return {
      success: true,
      fileType: "docx",
      markdown: parsed.markdown,
      blocks: parsed.blocks,
      metadata: parsed.metadata,
      outline: parsed.outline,
      warnings: parsed.warnings,
      images: images?.length ? images : void 0
    };
  } catch (err) {
    const error = err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "docx", error, code: classifyError(err) };
  }
}
|
|
3188
4123
|
|
|
@@ -3360,10 +4295,6 @@ function diffTableCells(a, b) {
|
|
|
3360
4295
|
export {
|
|
3361
4296
|
detectFormat,
|
|
3362
4297
|
blocksToMarkdown,
|
|
3363
|
-
VERSION,
|
|
3364
|
-
toArrayBuffer,
|
|
3365
|
-
KordocError,
|
|
3366
|
-
sanitizeError,
|
|
3367
4298
|
extractHwpxMetadataOnly,
|
|
3368
4299
|
extractHwp5MetadataOnly,
|
|
3369
4300
|
extractPdfMetadataOnly,
|
|
@@ -3371,4 +4302,4 @@ export {
|
|
|
3371
4302
|
extractFormFields,
|
|
3372
4303
|
parse
|
|
3373
4304
|
};
|
|
3374
|
-
//# sourceMappingURL=chunk-
|
|
4305
|
+
//# sourceMappingURL=chunk-MDRW3HYC.js.map
|