kordoc 1.7.2 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -15
- package/dist/chunk-MOL7MDBG.js +35 -0
- package/dist/chunk-MOL7MDBG.js.map +1 -0
- package/dist/{chunk-NJ3R7LNR.js → chunk-QQ6PZADA.js} +1120 -230
- package/dist/chunk-QQ6PZADA.js.map +1 -0
- package/dist/chunk-UUKFY5P5.js +93 -0
- package/dist/chunk-UUKFY5P5.js.map +1 -0
- package/dist/cli.js +11 -7
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1208 -191
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +17 -5
- package/dist/index.d.ts +17 -5
- package/dist/index.js +1203 -190
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +10 -7
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-737B4EZW.js +8 -0
- package/dist/page-range-737B4EZW.js.map +1 -0
- package/dist/provider-A4FHJSID.js +0 -0
- package/dist/utils-OTCR2KMY.js +22 -0
- package/dist/utils-OTCR2KMY.js.map +1 -0
- package/dist/{watch-AKTZTPVF.js → watch-JFDOENIO.js} +13 -5
- package/dist/watch-JFDOENIO.js.map +1 -0
- package/package.json +77 -75
- package/dist/chunk-NJ3R7LNR.js.map +0 -1
- package/dist/watch-AKTZTPVF.js.map +0 -1
|
@@ -1,10 +1,22 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
KordocError,
|
|
4
|
+
classifyError,
|
|
5
|
+
isPathTraversal,
|
|
6
|
+
precheckZipSize,
|
|
7
|
+
sanitizeHref,
|
|
8
|
+
toArrayBuffer
|
|
9
|
+
} from "./chunk-UUKFY5P5.js";
|
|
10
|
+
import {
|
|
11
|
+
parsePageRange
|
|
12
|
+
} from "./chunk-MOL7MDBG.js";
|
|
2
13
|
|
|
3
14
|
// src/detect.ts
|
|
15
|
+
import JSZip from "jszip";
|
|
4
16
|
/**
 * Returns a byte view over the leading bytes of `buffer` (at most four).
 *
 * The result is a view onto the same ArrayBuffer, not a copy.
 *
 * @param {ArrayBuffer} buffer - Raw file contents.
 * @returns {Uint8Array} View of the first `min(4, byteLength)` bytes.
 */
function magicBytes(buffer) {
  const available = buffer.byteLength;
  const headerLength = available < 4 ? available : 4;
  return new Uint8Array(buffer, 0, headerLength);
}
|
|
7
|
-
function
|
|
19
|
+
/**
 * Reports whether `buffer` begins with the ZIP local-file-header signature
 * `50 4B 03 04` ("PK\x03\x04").
 *
 * @param {ArrayBuffer} buffer - Raw file contents.
 * @returns {boolean} True when the first four bytes equal the signature.
 */
function isZipFile(buffer) {
  const signature = [80, 75, 3, 4];
  const head = magicBytes(buffer);
  // A buffer shorter than four bytes yields `undefined` entries and so fails.
  return signature.every((expected, i) => head[i] === expected);
}
|
|
@@ -18,15 +30,28 @@ function isPdfFile(buffer) {
|
|
|
18
30
|
}
|
|
19
31
|
/**
 * Classifies a document by its leading magic bytes.
 *
 * Any ZIP container is reported as "hwpx" here.
 *
 * @param {ArrayBuffer} buffer - Raw file contents.
 * @returns {"hwpx" | "hwp" | "pdf" | "unknown"} Detected format, or "unknown"
 *   when the buffer is shorter than four bytes or no probe matches.
 */
function detectFormat(buffer) {
  if (buffer.byteLength < 4) return "unknown";
  // Probes run in order; the first one that matches decides the format.
  const probes = [
    [isZipFile, "hwpx"],
    [isOldHwpFile, "hwp"],
    [isPdfFile, "pdf"]
  ];
  const hit = probes.find(([matches]) => matches(buffer));
  return hit ? hit[1] : "unknown";
}
|
|
38
|
+
/**
 * Opens `buffer` as a ZIP archive and infers the document format from the
 * entries it contains.
 *
 * Checks, in order: `xl/workbook.xml` (xlsx), `word/document.xml` (docx),
 * `Contents/content.hpf` or `mimetype` (hwpx), then any entry whose path
 * starts with `Contents/` (hwpx).
 *
 * @param {ArrayBuffer} buffer - Raw ZIP file contents.
 * @returns {Promise<"xlsx" | "docx" | "hwpx" | "unknown">} Detected format;
 *   "unknown" when nothing matches or the archive cannot be loaded.
 */
async function detectZipFormat(buffer) {
  // Marker entry -> format, evaluated top to bottom.
  const markers = [
    ["xl/workbook.xml", "xlsx"],
    ["word/document.xml", "docx"],
    ["Contents/content.hpf", "hwpx"],
    ["mimetype", "hwpx"]
  ];
  try {
    const archive = await JSZip.loadAsync(buffer);
    for (const [entryPath, format] of markers) {
      if (archive.file(entryPath)) return format;
    }
    const entryNames = Object.keys(archive.files);
    const hasContentsEntry = entryNames.some((name) => name.startsWith("Contents/"));
    return hasContentsEntry ? "hwpx" : "unknown";
  } catch {
    // Any failure while loading or inspecting the archive means "not recognised".
    return "unknown";
  }
}
|
|
26
51
|
|
|
27
52
|
// src/table/builder.ts
|
|
28
53
|
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
29
|
-
function
|
|
54
|
+
function sanitizeHref2(href) {
|
|
30
55
|
const trimmed = href.trim();
|
|
31
56
|
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
32
57
|
return trimmed;
|
|
@@ -80,6 +105,16 @@ function buildTable(rows) {
|
|
|
80
105
|
cellIdx++;
|
|
81
106
|
}
|
|
82
107
|
}
|
|
108
|
+
let effectiveCols = maxCols;
|
|
109
|
+
while (effectiveCols > 0) {
|
|
110
|
+
const colEmpty = grid.every((row) => !row[effectiveCols - 1]?.text?.trim());
|
|
111
|
+
if (!colEmpty) break;
|
|
112
|
+
effectiveCols--;
|
|
113
|
+
}
|
|
114
|
+
if (effectiveCols < maxCols && effectiveCols > 0) {
|
|
115
|
+
const trimmed = grid.map((row) => row.slice(0, effectiveCols));
|
|
116
|
+
return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
|
|
117
|
+
}
|
|
83
118
|
return { rows: numRows, cols: maxCols, cells: grid, hasHeader: numRows > 1 };
|
|
84
119
|
}
|
|
85
120
|
function convertTableToText(rows) {
|
|
@@ -87,13 +122,26 @@ function convertTableToText(rows) {
|
|
|
87
122
|
(row) => row.map((c) => c.text.trim().replace(/\n/g, " ")).filter(Boolean).join(" | ")
|
|
88
123
|
).filter(Boolean).join("\n");
|
|
89
124
|
}
|
|
125
|
+
/**
 * Matches shape/object alt-text sentences such as "사각형입니다." or
 * "모서리가 둥근 사각형입니다." so they can be stripped from extracted text.
 *
 * The leading negative lookbehind stops a match from starting directly after
 * a Hangul syllable. Without it the one-syllable shape names (원, 선, 별, 호,
 * 표 ...) match inside ordinary words and truncate real content, e.g.
 * "공무원입니다" -> "공무".
 */
var HWP_SHAPE_ALT_TEXT_RE = /(?<![가-힣])(?:모서리가 둥근 |둥근 )?(?:사각형|직사각형|정사각형|원|타원|삼각형|이등변 삼각형|직각 삼각형|선|직선|곡선|화살표|굵은 화살표|이중 화살표|오각형|육각형|팔각형|별|[4-8]점별|십자|십자형|구름|구름형|마름모|도넛|평행사변형|사다리꼴|부채꼴|호|반원|물결|번개|하트|빗금|블록 화살표|수식|표|그림|개체|그리기\s?개체|묶음\s?개체|글상자|수식\s?개체|OLE\s?개체)\s?입니다\.?/g;

/**
 * Cleans a piece of extracted text before it is rendered.
 *
 * Steps, in order:
 *  1. remove code points U+F0000..U+FFFFD,
 *  2. remove shape/object alt-text sentences,
 *  3. collapse runs of spaces to one space and trim,
 *  4. for short results (30 UTF-16 units or fewer) that look letter-spaced --
 *     at least three space-separated tokens of which 70% or more are a single
 *     Hangul character -- remove the spaces ("공 고 문" -> "공고문").
 *
 * @param {string} text - Raw text.
 * @returns {string} Cleaned text; may be empty.
 */
function sanitizeText(text) {
  let result = text
    .replace(/[\u{F0000}-\u{FFFFD}]/gu, "")
    .replace(HWP_SHAPE_ALT_TEXT_RE, "")
    .replace(/ +/g, " ")
    .trim();
  if (result.length <= 30 && result.includes(" ")) {
    const tokens = result.split(" ");
    const koreanSingleCharCount = tokens.filter(
      (t) => t.length === 1 && /[\uAC00-\uD7AF\u3131-\u318E]/.test(t)
    ).length;
    if (tokens.length >= 3 && koreanSingleCharCount / tokens.length >= 0.7) {
      result = tokens.join("");
    }
  }
  return result;
}
|
|
90
137
|
function blocksToMarkdown(blocks) {
|
|
91
138
|
const lines = [];
|
|
92
139
|
for (let i = 0; i < blocks.length; i++) {
|
|
93
140
|
const block = blocks[i];
|
|
94
141
|
if (block.type === "heading" && block.text) {
|
|
95
142
|
const prefix = "#".repeat(Math.min(block.level || 2, 6));
|
|
96
|
-
|
|
143
|
+
const headingText = sanitizeText(block.text);
|
|
144
|
+
if (headingText) lines.push("", `${prefix} ${headingText}`, "");
|
|
97
145
|
continue;
|
|
98
146
|
}
|
|
99
147
|
if (block.type === "image" && block.text) {
|
|
@@ -105,9 +153,11 @@ function blocksToMarkdown(blocks) {
|
|
|
105
153
|
continue;
|
|
106
154
|
}
|
|
107
155
|
if (block.type === "list" && block.text) {
|
|
108
|
-
const
|
|
156
|
+
const listText = sanitizeText(block.text);
|
|
157
|
+
if (!listText) continue;
|
|
158
|
+
const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(listText);
|
|
109
159
|
const prefix = alreadyNumbered ? "" : block.listType === "ordered" ? "1. " : "- ";
|
|
110
|
-
lines.push(`${prefix}${
|
|
160
|
+
lines.push(`${prefix}${listText}`);
|
|
111
161
|
if (block.children) {
|
|
112
162
|
for (const child of block.children) {
|
|
113
163
|
const childPrefix = child.listType === "ordered" ? "1." : "-";
|
|
@@ -117,7 +167,8 @@ function blocksToMarkdown(blocks) {
|
|
|
117
167
|
continue;
|
|
118
168
|
}
|
|
119
169
|
if (block.type === "paragraph" && block.text) {
|
|
120
|
-
let text = block.text;
|
|
170
|
+
let text = sanitizeText(block.text);
|
|
171
|
+
if (!text) continue;
|
|
121
172
|
if (/^\[별표\s*\d+/.test(text)) {
|
|
122
173
|
const nextBlock = blocks[i + 1];
|
|
123
174
|
if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
|
|
@@ -133,7 +184,7 @@ function blocksToMarkdown(blocks) {
|
|
|
133
184
|
continue;
|
|
134
185
|
}
|
|
135
186
|
if (block.href) {
|
|
136
|
-
const href =
|
|
187
|
+
const href = sanitizeHref2(block.href);
|
|
137
188
|
if (href) text = `[${text}](${href})`;
|
|
138
189
|
}
|
|
139
190
|
if (block.footnoteText) {
|
|
@@ -154,7 +205,7 @@ function tableToMarkdown(table) {
|
|
|
154
205
|
if (table.rows === 0 || table.cols === 0) return "";
|
|
155
206
|
const { cells, rows: numRows, cols: numCols } = table;
|
|
156
207
|
if (numRows === 1 && numCols === 1) {
|
|
157
|
-
const content = cells[0][0].text;
|
|
208
|
+
const content = sanitizeText(cells[0][0].text);
|
|
158
209
|
return content.split(/\n/).map((line) => {
|
|
159
210
|
const trimmed = line.trim();
|
|
160
211
|
if (!trimmed) return "";
|
|
@@ -163,13 +214,19 @@ function tableToMarkdown(table) {
|
|
|
163
214
|
return trimmed;
|
|
164
215
|
}).filter(Boolean).join("\n");
|
|
165
216
|
}
|
|
217
|
+
if (numCols === 1 && numRows >= 2) {
|
|
218
|
+
return cells.map((row) => sanitizeText(row[0].text).replace(/\n/g, " ")).filter(Boolean).join("\n");
|
|
219
|
+
}
|
|
166
220
|
const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
|
|
167
221
|
const skip = /* @__PURE__ */ new Set();
|
|
168
222
|
for (let r = 0; r < numRows; r++) {
|
|
223
|
+
let cellIdx = 0;
|
|
169
224
|
for (let c = 0; c < numCols; c++) {
|
|
170
225
|
if (skip.has(`${r},${c}`)) continue;
|
|
171
|
-
const cell = cells[r][
|
|
172
|
-
|
|
226
|
+
const cell = cells[r]?.[cellIdx];
|
|
227
|
+
if (!cell) break;
|
|
228
|
+
cellIdx++;
|
|
229
|
+
display[r][c] = sanitizeText(cell.text).replace(/\n/g, "<br>");
|
|
173
230
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
174
231
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
175
232
|
if (dr === 0 && dc === 0) continue;
|
|
@@ -178,12 +235,28 @@ function tableToMarkdown(table) {
|
|
|
178
235
|
}
|
|
179
236
|
}
|
|
180
237
|
}
|
|
238
|
+
c += cell.colSpan - 1;
|
|
181
239
|
}
|
|
182
240
|
}
|
|
183
241
|
const uniqueRows = [];
|
|
184
|
-
|
|
242
|
+
let pendingFirstCol = "";
|
|
243
|
+
for (let r = 0; r < display.length; r++) {
|
|
244
|
+
const row = display[r];
|
|
185
245
|
const isEmptyPlaceholder = row.every((cell) => cell === "");
|
|
186
|
-
if (
|
|
246
|
+
if (isEmptyPlaceholder) continue;
|
|
247
|
+
const hasSkippedCols = row.some((cell, c) => cell === "" && skip.has(`${r},${c}`));
|
|
248
|
+
const nonEmptyCols = row.filter((cell) => cell !== "");
|
|
249
|
+
if (!hasSkippedCols && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
|
|
250
|
+
pendingFirstCol = row[0];
|
|
251
|
+
continue;
|
|
252
|
+
}
|
|
253
|
+
if (pendingFirstCol && row[0] === "") {
|
|
254
|
+
row[0] = pendingFirstCol;
|
|
255
|
+
pendingFirstCol = "";
|
|
256
|
+
} else {
|
|
257
|
+
pendingFirstCol = "";
|
|
258
|
+
}
|
|
259
|
+
uniqueRows.push(row);
|
|
187
260
|
}
|
|
188
261
|
if (uniqueRows.length === 0) return "";
|
|
189
262
|
const md = [];
|
|
@@ -195,75 +268,15 @@ function tableToMarkdown(table) {
|
|
|
195
268
|
return md.join("\n");
|
|
196
269
|
}
|
|
197
270
|
|
|
198
|
-
// src/utils.ts
|
|
199
|
-
var VERSION = true ? "1.7.2" : "0.0.0-dev";
|
|
200
|
-
function toArrayBuffer(buf) {
|
|
201
|
-
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
202
|
-
return buf.buffer;
|
|
203
|
-
}
|
|
204
|
-
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
205
|
-
}
|
|
206
|
-
var KordocError = class extends Error {
|
|
207
|
-
constructor(message) {
|
|
208
|
-
super(message);
|
|
209
|
-
this.name = "KordocError";
|
|
210
|
-
}
|
|
211
|
-
};
|
|
212
|
-
function sanitizeError(err) {
|
|
213
|
-
if (err instanceof KordocError) return err.message;
|
|
214
|
-
return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
|
|
215
|
-
}
|
|
216
|
-
function isPathTraversal(name) {
|
|
217
|
-
if (name.includes("\0")) return true;
|
|
218
|
-
const normalized = name.replace(/\\/g, "/");
|
|
219
|
-
return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
220
|
-
}
|
|
221
|
-
function classifyError(err) {
|
|
222
|
-
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
223
|
-
const msg = err.message;
|
|
224
|
-
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
225
|
-
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
226
|
-
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
227
|
-
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
228
|
-
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
229
|
-
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
230
|
-
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
231
|
-
return "PARSE_ERROR";
|
|
232
|
-
}
|
|
233
|
-
|
|
234
271
|
// src/hwpx/parser.ts
|
|
235
|
-
import
|
|
272
|
+
import JSZip2 from "jszip";
|
|
236
273
|
import { inflateRawSync } from "zlib";
|
|
237
274
|
import { DOMParser } from "@xmldom/xmldom";
|
|
238
275
|
|
|
239
|
-
// src/
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
if (Array.isArray(spec)) {
|
|
244
|
-
for (const n of spec) {
|
|
245
|
-
const page = Math.round(n);
|
|
246
|
-
if (page >= 1 && page <= maxPages) result.add(page);
|
|
247
|
-
}
|
|
248
|
-
return result;
|
|
249
|
-
}
|
|
250
|
-
if (typeof spec !== "string" || spec.trim() === "") return result;
|
|
251
|
-
const parts = spec.split(",");
|
|
252
|
-
for (const part of parts) {
|
|
253
|
-
const trimmed = part.trim();
|
|
254
|
-
if (!trimmed) continue;
|
|
255
|
-
const rangeMatch = trimmed.match(/^(\d+)\s*-\s*(\d+)$/);
|
|
256
|
-
if (rangeMatch) {
|
|
257
|
-
const start = Math.max(1, parseInt(rangeMatch[1], 10));
|
|
258
|
-
const end = Math.min(maxPages, parseInt(rangeMatch[2], 10));
|
|
259
|
-
for (let i = start; i <= end; i++) result.add(i);
|
|
260
|
-
} else {
|
|
261
|
-
const page = parseInt(trimmed, 10);
|
|
262
|
-
if (!isNaN(page) && page >= 1 && page <= maxPages) result.add(page);
|
|
263
|
-
}
|
|
264
|
-
}
|
|
265
|
-
return result;
|
|
266
|
-
}
|
|
276
|
+
// src/types.ts
|
|
277
|
+
var HEADING_RATIO_H1 = 1.5;
|
|
278
|
+
var HEADING_RATIO_H2 = 1.3;
|
|
279
|
+
var HEADING_RATIO_H3 = 1.15;
|
|
267
280
|
|
|
268
281
|
// src/hwpx/parser.ts
|
|
269
282
|
var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
|
|
@@ -357,16 +370,10 @@ function stripDtd(xml) {
|
|
|
357
370
|
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
358
371
|
}
|
|
359
372
|
async function parseHwpxDocument(buffer, options) {
|
|
360
|
-
|
|
361
|
-
if (precheck.totalUncompressed > MAX_DECOMPRESS_SIZE) {
|
|
362
|
-
throw new KordocError("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
363
|
-
}
|
|
364
|
-
if (precheck.entryCount > MAX_ZIP_ENTRIES) {
|
|
365
|
-
throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
366
|
-
}
|
|
373
|
+
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
367
374
|
let zip;
|
|
368
375
|
try {
|
|
369
|
-
zip = await
|
|
376
|
+
zip = await JSZip2.loadAsync(buffer);
|
|
370
377
|
} catch {
|
|
371
378
|
return extractFromBrokenZip(buffer);
|
|
372
379
|
}
|
|
@@ -529,7 +536,7 @@ function parseDublinCoreMetadata(xml, metadata) {
|
|
|
529
536
|
async function extractHwpxMetadataOnly(buffer) {
|
|
530
537
|
let zip;
|
|
531
538
|
try {
|
|
532
|
-
zip = await
|
|
539
|
+
zip = await JSZip2.loadAsync(buffer);
|
|
533
540
|
} catch {
|
|
534
541
|
throw new KordocError("HWPX ZIP\uC744 \uC5F4 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
535
542
|
}
|
|
@@ -539,46 +546,17 @@ async function extractHwpxMetadataOnly(buffer) {
|
|
|
539
546
|
metadata.pageCount = sectionPaths.length;
|
|
540
547
|
return metadata;
|
|
541
548
|
}
|
|
542
|
-
function precheckZipSize(buffer) {
|
|
543
|
-
try {
|
|
544
|
-
const data = new DataView(buffer);
|
|
545
|
-
const len = buffer.byteLength;
|
|
546
|
-
if (len < 22) return { totalUncompressed: 0, entryCount: 0 };
|
|
547
|
-
const searchStart = Math.max(0, len - 22 - 65535);
|
|
548
|
-
let eocdOffset = -1;
|
|
549
|
-
for (let i = len - 22; i >= searchStart; i--) {
|
|
550
|
-
if (data.getUint32(i, true) === 101010256) {
|
|
551
|
-
eocdOffset = i;
|
|
552
|
-
break;
|
|
553
|
-
}
|
|
554
|
-
}
|
|
555
|
-
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
556
|
-
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
557
|
-
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
558
|
-
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
559
|
-
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
560
|
-
let totalUncompressed = 0;
|
|
561
|
-
let pos = cdOffset;
|
|
562
|
-
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
563
|
-
if (data.getUint32(pos, true) !== 33639248) break;
|
|
564
|
-
totalUncompressed += data.getUint32(pos + 24, true);
|
|
565
|
-
const nameLen = data.getUint16(pos + 28, true);
|
|
566
|
-
const extraLen = data.getUint16(pos + 30, true);
|
|
567
|
-
const commentLen = data.getUint16(pos + 32, true);
|
|
568
|
-
pos += 46 + nameLen + extraLen + commentLen;
|
|
569
|
-
}
|
|
570
|
-
return { totalUncompressed, entryCount };
|
|
571
|
-
} catch {
|
|
572
|
-
return { totalUncompressed: 0, entryCount: 0 };
|
|
573
|
-
}
|
|
574
|
-
}
|
|
575
549
|
function extractFromBrokenZip(buffer) {
|
|
576
550
|
const data = new Uint8Array(buffer);
|
|
577
551
|
const view = new DataView(buffer);
|
|
578
552
|
let pos = 0;
|
|
579
553
|
const blocks = [];
|
|
554
|
+
const warnings = [
|
|
555
|
+
{ code: "BROKEN_ZIP_RECOVERY", message: "\uC190\uC0C1\uB41C ZIP \uAD6C\uC870 \u2014 Local File Header \uAE30\uBC18 \uBCF5\uAD6C \uBAA8\uB4DC" }
|
|
556
|
+
];
|
|
580
557
|
let totalDecompressed = 0;
|
|
581
558
|
let entryCount = 0;
|
|
559
|
+
let sectionNum = 0;
|
|
582
560
|
while (pos < data.length - 30) {
|
|
583
561
|
if (data[pos] !== 80 || data[pos + 1] !== 75 || data[pos + 2] !== 3 || data[pos + 3] !== 4) {
|
|
584
562
|
pos++;
|
|
@@ -624,14 +602,15 @@ function extractFromBrokenZip(buffer) {
|
|
|
624
602
|
}
|
|
625
603
|
totalDecompressed += content.length * 2;
|
|
626
604
|
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
|
|
627
|
-
|
|
605
|
+
sectionNum++;
|
|
606
|
+
blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum));
|
|
628
607
|
} catch {
|
|
629
608
|
continue;
|
|
630
609
|
}
|
|
631
610
|
}
|
|
632
611
|
if (blocks.length === 0) throw new KordocError("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
633
612
|
const markdown = blocksToMarkdown(blocks);
|
|
634
|
-
return { markdown, blocks };
|
|
613
|
+
return { markdown, blocks, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
635
614
|
}
|
|
636
615
|
async function resolveSectionPaths(zip) {
|
|
637
616
|
const manifestPaths = ["Contents/content.hpf", "content.hpf"];
|
|
@@ -695,9 +674,9 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
695
674
|
let level = 0;
|
|
696
675
|
if (baseFontSize > 0 && block.style?.fontSize) {
|
|
697
676
|
const ratio = block.style.fontSize / baseFontSize;
|
|
698
|
-
if (ratio >=
|
|
699
|
-
else if (ratio >=
|
|
700
|
-
else if (ratio >=
|
|
677
|
+
if (ratio >= HEADING_RATIO_H1) level = 1;
|
|
678
|
+
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
679
|
+
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
701
680
|
}
|
|
702
681
|
if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
|
|
703
682
|
if (level === 0) level = 3;
|
|
@@ -829,39 +808,47 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
829
808
|
if (depth > MAX_XML_DEPTH) return tableCtx;
|
|
830
809
|
const children = node.childNodes;
|
|
831
810
|
if (!children) return tableCtx;
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
if (
|
|
839
|
-
const
|
|
840
|
-
|
|
841
|
-
if (
|
|
842
|
-
if (tableStack.
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
811
|
+
const walkChildren = (parent, d) => {
|
|
812
|
+
if (d > MAX_XML_DEPTH) return;
|
|
813
|
+
const kids = parent.childNodes;
|
|
814
|
+
if (!kids) return;
|
|
815
|
+
for (let i = 0; i < kids.length; i++) {
|
|
816
|
+
const el = kids[i];
|
|
817
|
+
if (el.nodeType !== 1) continue;
|
|
818
|
+
const tag = el.tagName || el.localName || "";
|
|
819
|
+
const localTag = tag.replace(/^[^:]+:/, "");
|
|
820
|
+
if (localTag === "tbl") {
|
|
821
|
+
if (tableCtx) tableStack.push(tableCtx);
|
|
822
|
+
const newTable = { rows: [], currentRow: [], cell: null };
|
|
823
|
+
walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum, d + 1);
|
|
824
|
+
if (newTable.rows.length > 0) {
|
|
825
|
+
if (tableStack.length > 0) {
|
|
826
|
+
const parentTable = tableStack.pop();
|
|
827
|
+
const nestedText = convertTableToText(newTable.rows);
|
|
828
|
+
if (parentTable.cell) {
|
|
829
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
830
|
+
}
|
|
831
|
+
tableCtx = parentTable;
|
|
832
|
+
} else {
|
|
833
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
834
|
+
tableCtx = null;
|
|
847
835
|
}
|
|
848
|
-
tableCtx = parentTable;
|
|
849
836
|
} else {
|
|
850
|
-
|
|
851
|
-
tableCtx = null;
|
|
837
|
+
tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
|
|
852
838
|
}
|
|
853
|
-
} else {
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
} else if (
|
|
861
|
-
|
|
839
|
+
} else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
|
|
840
|
+
const imgRef = extractImageRef(el);
|
|
841
|
+
if (imgRef) {
|
|
842
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
843
|
+
} else if (warnings && sectionNum) {
|
|
844
|
+
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
845
|
+
}
|
|
846
|
+
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl") {
|
|
847
|
+
walkChildren(el, d + 1);
|
|
862
848
|
}
|
|
863
849
|
}
|
|
864
|
-
}
|
|
850
|
+
};
|
|
851
|
+
walkChildren(node, depth);
|
|
865
852
|
return tableCtx;
|
|
866
853
|
}
|
|
867
854
|
function extractParagraphInfo(para, styleMap) {
|
|
@@ -900,7 +887,10 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
900
887
|
// 하이퍼링크
|
|
901
888
|
case "hyperlink": {
|
|
902
889
|
const url = child.getAttribute("url") || child.getAttribute("href") || "";
|
|
903
|
-
if (url)
|
|
890
|
+
if (url) {
|
|
891
|
+
const safe = sanitizeHref(url);
|
|
892
|
+
if (safe) href = safe;
|
|
893
|
+
}
|
|
904
894
|
walk(child);
|
|
905
895
|
break;
|
|
906
896
|
}
|
|
@@ -913,6 +903,29 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
913
903
|
if (noteText) footnote = (footnote ? footnote + "; " : "") + noteText;
|
|
914
904
|
break;
|
|
915
905
|
}
|
|
906
|
+
// 제어 요소 — 필드, 컨트롤, 매개변수 등 스킵
|
|
907
|
+
case "ctrl":
|
|
908
|
+
case "fieldBegin":
|
|
909
|
+
case "fieldEnd":
|
|
910
|
+
case "parameters":
|
|
911
|
+
case "stringParam":
|
|
912
|
+
case "integerParam":
|
|
913
|
+
case "boolParam":
|
|
914
|
+
case "floatParam":
|
|
915
|
+
case "secPr":
|
|
916
|
+
// 섹션 속성 (페이지 설정 등)
|
|
917
|
+
case "colPr":
|
|
918
|
+
// 다단 속성
|
|
919
|
+
case "linesegarray":
|
|
920
|
+
case "lineseg":
|
|
921
|
+
// 레이아웃 정보
|
|
922
|
+
// 도형/이미지 요소 — 대체텍스트("사각형입니다." 등) 누출 방지
|
|
923
|
+
case "pic":
|
|
924
|
+
case "shape":
|
|
925
|
+
case "drawingObject":
|
|
926
|
+
case "shapeComment":
|
|
927
|
+
case "drawText":
|
|
928
|
+
break;
|
|
916
929
|
// run 요소에서 charPrIDRef 추출
|
|
917
930
|
case "r": {
|
|
918
931
|
const runCharPr = child.getAttribute("charPrIDRef");
|
|
@@ -927,7 +940,10 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
927
940
|
}
|
|
928
941
|
};
|
|
929
942
|
walk(para);
|
|
930
|
-
|
|
943
|
+
let cleanText = text.replace(/[ \t]+/g, " ").trim();
|
|
944
|
+
if (/^그림입니다\.?\s*원본\s*그림의\s*(이름|크기)/.test(cleanText)) cleanText = "";
|
|
945
|
+
cleanText = cleanText.replace(/그림입니다\.?\s*원본\s*그림의\s*(이름|크기)[^\n]*(\n[^\n]*원본\s*그림의\s*(이름|크기)[^\n]*)*/g, "").trim();
|
|
946
|
+
cleanText = cleanText.replace(/(?:모서리가 둥근 |둥근 )?(?:사각형|직사각형|정사각형|원|타원|삼각형|선|직선|곡선|화살표|오각형|육각형|팔각형|별|십자|구름|마름모|도넛|평행사변형|사다리꼴|개체|그리기\s?개체|묶음\s?개체|글상자|수식|표|그림|OLE\s?개체)\s?입니다\.?/g, "").trim();
|
|
931
947
|
let style;
|
|
932
948
|
if (styleMap && charPrId) {
|
|
933
949
|
const charProp = styleMap.charProperties.get(charPrId);
|
|
@@ -1205,9 +1221,9 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
1205
1221
|
if (/^\d+$/.test(text)) continue;
|
|
1206
1222
|
const ratio = block.style.fontSize / baseFontSize;
|
|
1207
1223
|
let level = 0;
|
|
1208
|
-
if (ratio >=
|
|
1209
|
-
else if (ratio >=
|
|
1210
|
-
else if (ratio >=
|
|
1224
|
+
if (ratio >= HEADING_RATIO_H1) level = 1;
|
|
1225
|
+
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
1226
|
+
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
1211
1227
|
if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
|
|
1212
1228
|
if (level === 0) level = 3;
|
|
1213
1229
|
}
|
|
@@ -1308,20 +1324,22 @@ function detectImageMime(data) {
|
|
|
1308
1324
|
}
|
|
1309
1325
|
function extractHwp5Images(cfb, blocks, compressed, warnings) {
|
|
1310
1326
|
const binDataMap = /* @__PURE__ */ new Map();
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
if (
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1327
|
+
const binDataRe = /\/BinData\/[Bb][Ii][Nn](\d{4})$/;
|
|
1328
|
+
if (cfb.FileIndex) {
|
|
1329
|
+
for (const entry of cfb.FileIndex) {
|
|
1330
|
+
if (!entry?.name || !entry.content) continue;
|
|
1331
|
+
const match = entry.name.match(binDataRe);
|
|
1332
|
+
if (!match) continue;
|
|
1333
|
+
const idx = parseInt(match[1], 10);
|
|
1334
|
+
let data = Buffer.from(entry.content);
|
|
1335
|
+
if (compressed) {
|
|
1336
|
+
try {
|
|
1337
|
+
data = decompressStream(data);
|
|
1338
|
+
} catch {
|
|
1339
|
+
}
|
|
1322
1340
|
}
|
|
1341
|
+
binDataMap.set(idx, { data, name: entry.name });
|
|
1323
1342
|
}
|
|
1324
|
-
binDataMap.set(idx, { data, name: entry.name || `BIN${idx}` });
|
|
1325
1343
|
}
|
|
1326
1344
|
if (binDataMap.size === 0) return [];
|
|
1327
1345
|
const images = [];
|
|
@@ -1468,6 +1486,16 @@ function parseTableBlock(records, startIdx) {
|
|
|
1468
1486
|
i++;
|
|
1469
1487
|
}
|
|
1470
1488
|
if (rows === 0 || cols === 0 || cells.length === 0) return { table: null, nextIdx: i };
|
|
1489
|
+
const hasAddr = cells.some((c) => c.colAddr !== void 0 && c.rowAddr !== void 0);
|
|
1490
|
+
if (hasAddr) {
|
|
1491
|
+
const cellRows2 = arrangeCells(rows, cols, cells);
|
|
1492
|
+
const irCells = cellRows2.map((row) => row.map((c) => ({
|
|
1493
|
+
text: c.text.trim(),
|
|
1494
|
+
colSpan: c.colSpan,
|
|
1495
|
+
rowSpan: c.rowSpan
|
|
1496
|
+
})));
|
|
1497
|
+
return { table: { rows, cols, cells: irCells, hasHeader: rows > 1 }, nextIdx: i };
|
|
1498
|
+
}
|
|
1471
1499
|
const cellRows = arrangeCells(rows, cols, cells);
|
|
1472
1500
|
return { table: buildTable(cellRows), nextIdx: i };
|
|
1473
1501
|
}
|
|
@@ -1731,7 +1759,36 @@ function buildTableGrids(horizontals, verticals) {
|
|
|
1731
1759
|
};
|
|
1732
1760
|
grids.push({ rowYs, colXs, bbox });
|
|
1733
1761
|
}
|
|
1734
|
-
return grids;
|
|
1762
|
+
return mergeAdjacentGrids(grids);
|
|
1763
|
+
}
|
|
1764
|
+
function mergeAdjacentGrids(grids) {
|
|
1765
|
+
if (grids.length <= 1) return grids;
|
|
1766
|
+
const sorted = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
|
|
1767
|
+
const merged = [sorted[0]];
|
|
1768
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
1769
|
+
const prev = merged[merged.length - 1];
|
|
1770
|
+
const curr = sorted[i];
|
|
1771
|
+
if (prev.colXs.length === curr.colXs.length) {
|
|
1772
|
+
const colMatch = prev.colXs.every((x, ci) => Math.abs(x - curr.colXs[ci]) <= COORD_MERGE_TOL * 3);
|
|
1773
|
+
const verticalGap = prev.bbox.y1 - curr.bbox.y2;
|
|
1774
|
+
if (colMatch && verticalGap >= -COORD_MERGE_TOL && verticalGap <= 20) {
|
|
1775
|
+
const allRowYs = [.../* @__PURE__ */ new Set([...prev.rowYs, ...curr.rowYs])].sort((a, b) => b - a);
|
|
1776
|
+
merged[merged.length - 1] = {
|
|
1777
|
+
rowYs: allRowYs,
|
|
1778
|
+
colXs: prev.colXs,
|
|
1779
|
+
bbox: {
|
|
1780
|
+
x1: Math.min(prev.bbox.x1, curr.bbox.x1),
|
|
1781
|
+
y1: Math.min(prev.bbox.y1, curr.bbox.y1),
|
|
1782
|
+
x2: Math.max(prev.bbox.x2, curr.bbox.x2),
|
|
1783
|
+
y2: Math.max(prev.bbox.y2, curr.bbox.y2)
|
|
1784
|
+
}
|
|
1785
|
+
};
|
|
1786
|
+
continue;
|
|
1787
|
+
}
|
|
1788
|
+
}
|
|
1789
|
+
merged.push(curr);
|
|
1790
|
+
}
|
|
1791
|
+
return merged;
|
|
1735
1792
|
}
|
|
1736
1793
|
function clusterCoordinates(values) {
|
|
1737
1794
|
if (values.length === 0) return [];
|
|
@@ -1918,7 +1975,11 @@ function cellTextToString(items) {
|
|
|
1918
1975
|
for (let j = 1; j < s.length; j++) {
|
|
1919
1976
|
const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
|
|
1920
1977
|
const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
|
|
1921
|
-
|
|
1978
|
+
const prevIsKorean = /[가-힣]$/.test(result);
|
|
1979
|
+
const currIsKorean = /^[가-힣]/.test(s[j].text);
|
|
1980
|
+
if (gap < avgFs * 0.15) {
|
|
1981
|
+
result += s[j].text;
|
|
1982
|
+
} else if (gap < avgFs * 0.35 && (prevIsKorean || currIsKorean)) {
|
|
1922
1983
|
result += s[j].text;
|
|
1923
1984
|
} else {
|
|
1924
1985
|
result += " " + s[j].text;
|
|
@@ -1933,6 +1994,12 @@ function cellTextToString(items) {
|
|
|
1933
1994
|
const curr = textLines[i];
|
|
1934
1995
|
if (/[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <= 8 && !curr.includes(" ")) {
|
|
1935
1996
|
merged[merged.length - 1] = prev + curr;
|
|
1997
|
+
} else if (curr.trim().length <= 3 && /^[)\]%}]/.test(curr.trim())) {
|
|
1998
|
+
merged[merged.length - 1] = prev + curr.trim();
|
|
1999
|
+
} else if (/[,(]$/.test(prev.trim()) && curr.trim().length <= 15) {
|
|
2000
|
+
merged[merged.length - 1] = prev + curr.trim();
|
|
2001
|
+
} else if (/[\d,]$/.test(prev) && /^[\d,]+[)\]]?$/.test(curr.trim()) && curr.trim().length <= 10) {
|
|
2002
|
+
merged[merged.length - 1] = prev + curr.trim();
|
|
1936
2003
|
} else {
|
|
1937
2004
|
merged.push(curr);
|
|
1938
2005
|
}
|
|
@@ -2145,21 +2212,26 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
2145
2212
|
disableFontFace: true,
|
|
2146
2213
|
isEvalSupported: false
|
|
2147
2214
|
});
|
|
2148
|
-
|
|
2149
|
-
|
|
2150
|
-
|
|
2151
|
-
|
|
2152
|
-
|
|
2153
|
-
|
|
2154
|
-
|
|
2155
|
-
|
|
2156
|
-
|
|
2215
|
+
let timer;
|
|
2216
|
+
try {
|
|
2217
|
+
return await Promise.race([
|
|
2218
|
+
loadingTask.promise,
|
|
2219
|
+
new Promise((_, reject) => {
|
|
2220
|
+
timer = setTimeout(() => {
|
|
2221
|
+
loadingTask.destroy();
|
|
2222
|
+
reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
|
|
2223
|
+
}, PDF_LOAD_TIMEOUT_MS);
|
|
2224
|
+
})
|
|
2225
|
+
]);
|
|
2226
|
+
} finally {
|
|
2227
|
+
if (timer !== void 0) clearTimeout(timer);
|
|
2228
|
+
}
|
|
2157
2229
|
}
|
|
2158
2230
|
async function parsePdfDocument(buffer, options) {
|
|
2159
2231
|
const doc = await loadPdfWithTimeout(buffer);
|
|
2160
2232
|
try {
|
|
2161
2233
|
const pageCount = doc.numPages;
|
|
2162
|
-
if (pageCount === 0)
|
|
2234
|
+
if (pageCount === 0) throw new KordocError("PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
|
|
2163
2235
|
const metadata = { pageCount };
|
|
2164
2236
|
await extractPdfMetadata(doc, metadata);
|
|
2165
2237
|
const blocks = [];
|
|
@@ -2212,14 +2284,14 @@ async function parsePdfDocument(buffer, options) {
|
|
|
2212
2284
|
const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
|
|
2213
2285
|
if (ocrBlocks.length > 0) {
|
|
2214
2286
|
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
2215
|
-
return {
|
|
2287
|
+
return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true };
|
|
2216
2288
|
}
|
|
2217
2289
|
} catch {
|
|
2218
2290
|
}
|
|
2219
2291
|
}
|
|
2220
|
-
|
|
2292
|
+
throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
|
|
2221
2293
|
}
|
|
2222
|
-
if (options?.removeHeaderFooter && parsedPageCount >= 3) {
|
|
2294
|
+
if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
|
|
2223
2295
|
const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
|
|
2224
2296
|
for (let ri = removed.length - 1; ri >= 0; ri--) {
|
|
2225
2297
|
blocks.splice(removed[ri], 1);
|
|
@@ -2229,9 +2301,10 @@ async function parsePdfDocument(buffer, options) {
|
|
|
2229
2301
|
if (medianFontSize > 0) {
|
|
2230
2302
|
detectHeadings(blocks, medianFontSize);
|
|
2231
2303
|
}
|
|
2304
|
+
detectMarkerHeadings(blocks);
|
|
2232
2305
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
2233
2306
|
let markdown = cleanPdfText(blocksToMarkdown(blocks));
|
|
2234
|
-
return {
|
|
2307
|
+
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
2235
2308
|
} finally {
|
|
2236
2309
|
await doc.destroy().catch(() => {
|
|
2237
2310
|
});
|
|
@@ -2302,12 +2375,67 @@ function detectHeadings(blocks, medianFontSize) {
|
|
|
2302
2375
|
if (/^\d+$/.test(text)) continue;
|
|
2303
2376
|
const ratio = block.style.fontSize / medianFontSize;
|
|
2304
2377
|
let level = 0;
|
|
2305
|
-
if (ratio >=
|
|
2306
|
-
else if (ratio >=
|
|
2307
|
-
else if (ratio >=
|
|
2378
|
+
if (ratio >= HEADING_RATIO_H1) level = 1;
|
|
2379
|
+
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
2380
|
+
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
2308
2381
|
if (level > 0) {
|
|
2309
2382
|
block.type = "heading";
|
|
2310
2383
|
block.level = level;
|
|
2384
|
+
block.text = collapseEvenSpacing(text);
|
|
2385
|
+
}
|
|
2386
|
+
}
|
|
2387
|
+
}
|
|
2388
|
+
/**
 * Collapses text whose characters were spread out with single spaces
 * (e.g. "a b c") back into one word.
 *
 * The text is split on single spaces; when there are at least three pieces and
 * at least 70% of them are exactly one character long, every space is removed.
 * Otherwise the text is returned unchanged.
 *
 * @param {string} text - Line or heading text to inspect.
 * @returns {string} The collapsed text, or the original text when the
 *   heuristic does not apply.
 */
function collapseEvenSpacing(text) {
  const pieces = text.split(" ");
  if (pieces.length < 3) return text;
  let loneChars = 0;
  for (const piece of pieces) {
    if (piece.length === 1) loneChars += 1;
  }
  const mostlySingleChars = loneChars / pieces.length >= 0.7;
  return mostlySingleChars ? pieces.join("") : text;
}
|
|
2396
|
+
/**
 * Decides whether a detected table is really decorative layout that should be
 * emitted as plain text instead.
 *
 * Rules, evaluated in order on the non-empty trimmed cell texts:
 *  1. more than 200 characters of joined text -> keep as table;
 *  2. contains a bullet-like marker and has at most 3 rows -> demote;
 *  3. at most 2 rows and more than half of the cells are empty -> demote;
 *  4. a single row with no run of two or more digits -> demote;
 *  5. otherwise keep as table.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {boolean} true when the table should be demoted to text.
 */
function shouldDemoteTable(table) {
  const filled = [];
  table.cells.forEach((row) => {
    row.forEach((cell) => {
      const trimmed = cell.text.trim();
      if (trimmed) filled.push(trimmed);
    });
  });
  const joined = filled.join(" ");
  if (joined.length > 200) return false;

  const { rows, cols } = table;
  const hasMarker = /[□■◆○●▶]/.test(joined);
  if (hasMarker && rows <= 3) return true;

  const capacity = rows * cols;
  const blanks = capacity - filled.length;
  const mostlyEmpty = rows <= 2 && blanks > capacity * 0.5;
  const singleRowWithoutNumbers = rows === 1 && !/\d{2,}/.test(joined);
  return mostlyEmpty || singleRowWithoutNumbers;
}
|
|
2407
|
+
/**
 * Flattens a table into plain text, one output line per non-empty row.
 *
 * Empty cells are dropped. In a two-column table a row with both cells filled
 * is rendered as "left : right"; every other row is rendered as its cell
 * texts joined by single spaces. Rows with no text are skipped.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {string} The rows joined with "\n" (empty string when nothing has text).
 */
function demoteTableToText(table) {
  const isTwoColumn = table.cols === 2;
  const output = [];
  let rowIndex = 0;
  while (rowIndex < table.rows) {
    const texts = [];
    table.cells[rowIndex].forEach((cell) => {
      const trimmed = cell.text.trim();
      if (trimmed) texts.push(trimmed);
    });
    rowIndex += 1;
    if (texts.length === 0) continue;
    const [first, second] = texts;
    output.push(isTwoColumn && texts.length === 2 ? `${first} : ${second}` : texts.join(" "));
  }
  return output.join("\n");
}
|
|
2420
|
+
/**
 * Promotes paragraph blocks to headings in place, using two text heuristics.
 *
 *  - A paragraph shorter than 50 characters that starts with a box/diamond/
 *    arrow marker followed by a Hangul syllable becomes a level-4 heading.
 *  - A paragraph made of 2-6 Hangul syllables that carries a font size becomes
 *    a level-3 heading when the block before it is missing or is a table,
 *    heading or separator, or when the block after it is missing, is a table
 *    or heading, or is a paragraph starting with a bullet marker.
 *
 * Blocks are visited in order, so a block promoted earlier counts as a heading
 * when its successor is examined.
 *
 * @param {Array<object>} blocks - Document blocks; mutated in place.
 * @returns {void}
 */
function detectMarkerHeadings(blocks) {
  for (const [index, block] of blocks.entries()) {
    if (block.type !== "paragraph" || !block.text) continue;
    const text = block.text.trim();

    const startsWithMarker = /^[□■◆◇▶]\s*[가-힣]/.test(text);
    if (text.length < 50 && startsWithMarker) {
      block.type = "heading";
      block.level = 4;
      continue;
    }

    const isShortHangulLabel = /^[가-힣]{2,6}$/.test(text);
    if (!isShortHangulLabel || !block.style?.fontSize) continue;

    const before = blocks[index - 1];
    const after = blocks[index + 1];
    const beforeIsStructural = !before || before.type === "table" || before.type === "heading" || before.type === "separator";
    const afterIsStructural = !after || after.type === "table" || after.type === "heading" || after.type === "paragraph" && after.text && /^[□■◆○●]/.test(after.text.trim());
    if (beforeIsStructural || afterIsStructural) {
      block.type = "heading";
      block.level = 3;
    }
  }
}
|
|
@@ -2344,7 +2472,7 @@ function computeRegion(items) {
|
|
|
2344
2472
|
}
|
|
2345
2473
|
return { items, minX, minY, maxX, maxY };
|
|
2346
2474
|
}
|
|
2347
|
-
function findYSplit(items,
|
|
2475
|
+
function findYSplit(items, _region, gapThreshold) {
|
|
2348
2476
|
const sorted = [...items].sort((a, b) => b.y - a.y);
|
|
2349
2477
|
let bestGap = gapThreshold;
|
|
2350
2478
|
let bestSplit = null;
|
|
@@ -2359,7 +2487,7 @@ function findYSplit(items, region, gapThreshold) {
|
|
|
2359
2487
|
}
|
|
2360
2488
|
return bestSplit;
|
|
2361
2489
|
}
|
|
2362
|
-
function findXSplit(items,
|
|
2490
|
+
function findXSplit(items, _region, gapThreshold) {
|
|
2363
2491
|
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
2364
2492
|
let bestGap = gapThreshold;
|
|
2365
2493
|
let bestSplit = null;
|
|
@@ -2418,7 +2546,8 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
2418
2546
|
);
|
|
2419
2547
|
for (const cell of cells) {
|
|
2420
2548
|
const cellItems = cellTextMap.get(cell) || [];
|
|
2421
|
-
|
|
2549
|
+
let text = cellTextToString(cellItems);
|
|
2550
|
+
text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
|
|
2422
2551
|
irGrid[cell.row][cell.col] = {
|
|
2423
2552
|
text,
|
|
2424
2553
|
colSpan: cell.colSpan,
|
|
@@ -2433,18 +2562,21 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
2433
2562
|
};
|
|
2434
2563
|
const hasContent = irGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
|
|
2435
2564
|
if (!hasContent) continue;
|
|
2436
|
-
|
|
2437
|
-
|
|
2438
|
-
|
|
2439
|
-
|
|
2440
|
-
|
|
2441
|
-
|
|
2442
|
-
|
|
2443
|
-
|
|
2444
|
-
|
|
2445
|
-
|
|
2565
|
+
const tableBbox = {
|
|
2566
|
+
page: pageNum,
|
|
2567
|
+
x: grid.bbox.x1,
|
|
2568
|
+
y: grid.bbox.y1,
|
|
2569
|
+
width: grid.bbox.x2 - grid.bbox.x1,
|
|
2570
|
+
height: grid.bbox.y2 - grid.bbox.y1
|
|
2571
|
+
};
|
|
2572
|
+
if (shouldDemoteTable(irTable)) {
|
|
2573
|
+
const demoted = demoteTableToText(irTable);
|
|
2574
|
+
if (demoted) {
|
|
2575
|
+
blocks.push({ type: "paragraph", text: demoted, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
|
|
2446
2576
|
}
|
|
2447
|
-
|
|
2577
|
+
continue;
|
|
2578
|
+
}
|
|
2579
|
+
blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
|
|
2448
2580
|
}
|
|
2449
2581
|
const remaining = items.filter((i) => !usedItems.has(i));
|
|
2450
2582
|
if (remaining.length > 0) {
|
|
@@ -2456,9 +2588,29 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
2456
2588
|
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
2457
2589
|
return by - ay;
|
|
2458
2590
|
});
|
|
2459
|
-
return allBlocks;
|
|
2591
|
+
return mergeAdjacentTableBlocks(allBlocks);
|
|
2460
2592
|
}
|
|
2461
|
-
return blocks;
|
|
2593
|
+
return mergeAdjacentTableBlocks(blocks);
|
|
2594
|
+
}
|
|
2595
|
+
/**
 * Merges consecutive table blocks that have the same column count into one
 * table block.
 *
 * The merged block is a copy of the first fragment with the rows of the later
 * fragment(s) appended; `hasHeader` is taken from the first fragment. When both
 * fragments carry a bounding box, the merged block's `bbox` is the union of the
 * two, so it covers every merged row rather than only the first fragment.
 * Input blocks are not mutated.
 *
 * @param {Array<object>} blocks - Blocks in reading order.
 * @returns {Array<object>} The same array when it has at most one element,
 *   otherwise a new array with adjacent compatible tables merged.
 */
function mergeAdjacentTableBlocks(blocks) {
  if (blocks.length <= 1) return blocks;

  // Smallest box containing both inputs.
  // NOTE(review): assumes width/height are non-negative — confirm against producers.
  const unionBbox = (a, b) => {
    const x = Math.min(a.x, b.x);
    const y = Math.min(a.y, b.y);
    const right = Math.max(a.x + a.width, b.x + b.width);
    const top = Math.max(a.y + a.height, b.y + b.height);
    return { ...a, x, y, width: right - x, height: top - y };
  };

  const result = [blocks[0]];
  for (let i = 1; i < blocks.length; i++) {
    const prev = result[result.length - 1];
    const curr = blocks[i];
    const mergeable = prev.type === "table" && curr.type === "table" && prev.table && curr.table && prev.table.cols === curr.table.cols;
    if (!mergeable) {
      result.push(curr);
      continue;
    }
    const merged = {
      rows: prev.table.rows + curr.table.rows,
      cols: prev.table.cols,
      cells: [...prev.table.cells, ...curr.table.cells],
      hasHeader: prev.table.hasHeader
    };
    const combined = { ...prev, table: merged };
    if (prev.bbox && curr.bbox) {
      combined.bbox = unionBbox(prev.bbox, curr.bbox);
    }
    result[result.length - 1] = combined;
  }
  return result;
}
|
|
2463
2615
|
function extractPageBlocksFallback(items, pageNum) {
|
|
2464
2616
|
if (items.length === 0) return [];
|
|
@@ -2481,11 +2633,13 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
2481
2633
|
}));
|
|
2482
2634
|
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
2483
2635
|
if (clusterResults.length > 0) {
|
|
2636
|
+
const ciToIdx = /* @__PURE__ */ new Map();
|
|
2637
|
+
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
2484
2638
|
const usedIndices = /* @__PURE__ */ new Set();
|
|
2485
2639
|
for (const cr of clusterResults) {
|
|
2486
2640
|
for (const ci of cr.usedItems) {
|
|
2487
|
-
const idx =
|
|
2488
|
-
if (idx
|
|
2641
|
+
const idx = ciToIdx.get(ci);
|
|
2642
|
+
if (idx !== void 0) usedIndices.add(idx);
|
|
2489
2643
|
}
|
|
2490
2644
|
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
2491
2645
|
}
|
|
@@ -2796,7 +2950,8 @@ function mergeLineSimple(items) {
|
|
|
2796
2950
|
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
2797
2951
|
const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
|
|
2798
2952
|
if (gap > 15) result += " ";
|
|
2799
|
-
else if (gap < avgFs * 0.
|
|
2953
|
+
else if (gap < avgFs * 0.15) {
|
|
2954
|
+
} else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
|
|
2800
2955
|
} else if (gap > 3) result += " ";
|
|
2801
2956
|
result += sorted[i].text;
|
|
2802
2957
|
}
|
|
@@ -2804,8 +2959,8 @@ function mergeLineSimple(items) {
|
|
|
2804
2959
|
}
|
|
2805
2960
|
function cleanPdfText(text) {
|
|
2806
2961
|
return mergeKoreanLines(
|
|
2807
|
-
text.replace(/^[\s]*[-–—]\s
|
|
2808
|
-
).replace(/\n{3,}/g, "\n\n").trim();
|
|
2962
|
+
text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
|
|
2963
|
+
).replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line)).replace(/\n{3,}/g, "\n\n").trim();
|
|
2809
2964
|
}
|
|
2810
2965
|
function startsWithMarker(line) {
|
|
2811
2966
|
const t = line.trimStart();
|
|
@@ -2819,15 +2974,13 @@ function detectListBlocks(blocks) {
|
|
|
2819
2974
|
for (let i = 0; i < blocks.length; i++) {
|
|
2820
2975
|
const block = blocks[i];
|
|
2821
2976
|
if (block.type === "paragraph" && block.text) {
|
|
2822
|
-
const
|
|
2823
|
-
if (
|
|
2824
|
-
result.push({
|
|
2825
|
-
|
|
2826
|
-
|
|
2827
|
-
|
|
2828
|
-
|
|
2829
|
-
text: block.text
|
|
2830
|
-
});
|
|
2977
|
+
const text = block.text.trim();
|
|
2978
|
+
if (/^\d+\.\s/.test(text)) {
|
|
2979
|
+
result.push({ ...block, type: "list", listType: "ordered", text: block.text });
|
|
2980
|
+
continue;
|
|
2981
|
+
}
|
|
2982
|
+
if (/^[○●·※▶▷◆◇\-]\s/.test(text)) {
|
|
2983
|
+
result.push({ ...block, type: "list", listType: "unordered", text: block.text });
|
|
2831
2984
|
continue;
|
|
2832
2985
|
}
|
|
2833
2986
|
}
|
|
@@ -2986,11 +3139,20 @@ function mergeKoreanLines(text) {
|
|
|
2986
3139
|
for (let i = 1; i < lines.length; i++) {
|
|
2987
3140
|
const prev = result[result.length - 1];
|
|
2988
3141
|
const curr = lines[i];
|
|
2989
|
-
|
|
3142
|
+
const currTrimmed = curr.trim();
|
|
3143
|
+
if (/^#{1,6}\s/.test(prev) || /^#{1,6}\s/.test(curr) || /^\|/.test(currTrimmed) || /^---/.test(currTrimmed)) {
|
|
2990
3144
|
result.push(curr);
|
|
2991
3145
|
continue;
|
|
2992
3146
|
}
|
|
2993
|
-
if (
|
|
3147
|
+
if (/,$/.test(prev.trim()) && currTrimmed.length > 0) {
|
|
3148
|
+
result[result.length - 1] = prev + "\n" + curr;
|
|
3149
|
+
continue;
|
|
3150
|
+
}
|
|
3151
|
+
if (/^\(※/.test(currTrimmed)) {
|
|
3152
|
+
result[result.length - 1] = prev + " " + currTrimmed;
|
|
3153
|
+
continue;
|
|
3154
|
+
}
|
|
3155
|
+
if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
|
|
2994
3156
|
result[result.length - 1] = prev + " " + curr;
|
|
2995
3157
|
} else {
|
|
2996
3158
|
result.push(curr);
|
|
@@ -3002,6 +3164,716 @@ function mergeKoreanLines(text) {
|
|
|
3002
3164
|
// src/index.ts
|
|
3003
3165
|
import { readFile } from "fs/promises";
|
|
3004
3166
|
|
|
3167
|
+
// src/xlsx/parser.ts
|
|
3168
|
+
import JSZip3 from "jszip";
|
|
3169
|
+
import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
|
|
3170
|
+
// Upper bound on how many worksheets parseXlsxDocument processes per workbook.
var MAX_SHEETS = 100;
|
|
3171
|
+
// Size limit (100 * 1024 * 1024 bytes) handed to precheckZipSize before an XLSX archive is opened.
var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024;
|
|
3172
|
+
// Worksheet rows whose zero-based index is >= this value are ignored by parseWorksheet.
var MAX_ROWS2 = 1e4;
|
|
3173
|
+
// Worksheet cells whose zero-based column index is >= this value are ignored by parseWorksheet.
var MAX_COLS2 = 200;
|
|
3174
|
+
/**
 * Removes binary floating-point noise from a decimal cell value
 * (e.g. "0.30000000000000004" -> "0.3").
 *
 * Only plain decimal literals (optional minus sign, digits, a dot, digits) are
 * touched; anything else is returned unchanged. The value is rounded to 15
 * significant digits and re-serialised.
 *
 * If re-serialising would switch to exponent notation (very small or very large
 * magnitudes, e.g. "0.0000001" -> "1e-7"), the original text is returned so the
 * workbook's own plain-decimal representation is kept.
 *
 * @param {string} raw - Raw text of a cell's value.
 * @returns {string} The cleaned decimal string, or `raw` when no safe cleanup applies.
 */
function cleanNumericValue(raw) {
  if (!/^-?\d+\.\d+$/.test(raw)) return raw;
  const num = parseFloat(raw);
  if (!Number.isFinite(num)) return raw;
  const cleaned = parseFloat(num.toPrecision(15)).toString();
  // Number#toString uses exponent form below 1e-6 and from 1e21 upward.
  return /e/i.test(cleaned) ? raw : cleaned;
}
|
|
3181
|
+
/**
 * Converts an A1-style cell reference into zero-based coordinates.
 *
 * The reference must be upper-case column letters followed by a decimal row
 * number. Column letters are read as a bijective base-26 number
 * (A = 0, Z = 25, AA = 26, ...).
 *
 * @param {string} ref - Cell reference such as "B7".
 * @returns {{col: number, row: number} | null} Zero-based position, or null
 *   when the reference does not match the expected form.
 */
function parseCellRef(ref) {
  const match = ref.match(/^([A-Z]+)(\d+)$/);
  if (!match) return null;
  const [, letters, digits] = match;
  const columnNumber = [...letters].reduce(
    (acc, letter) => acc * 26 + (letter.charCodeAt(0) - 64),
    0
  );
  return { col: columnNumber - 1, row: parseInt(digits, 10) - 1 };
}
|
|
3188
|
+
/**
 * Parses a merged-range reference of the form "A1:C3".
 *
 * @param {string} ref - Range reference with exactly one ":" separator.
 * @returns {{startCol: number, startRow: number, endCol: number, endRow: number} | null}
 *   Zero-based corners of the range, or null when the reference does not have
 *   exactly two parts or either part is not a valid cell reference.
 */
function parseMergeRef(ref) {
  const corners = ref.split(":");
  if (corners.length !== 2) return null;
  const [topLeft, bottomRight] = corners.map((corner) => parseCellRef(corner));
  if (!topLeft || !bottomRight) return null;
  return {
    startCol: topLeft.col,
    startRow: topLeft.row,
    endCol: bottomRight.col,
    endRow: bottomRight.row
  };
}
|
|
3196
|
+
/**
 * Returns the result of `parent.getElementsByTagName(tagName)` copied into a
 * plain array, in the order the collection reports them.
 *
 * @param {{getElementsByTagName: Function}} parent - Node to search under.
 * @param {string} tagName - Tag name passed through to getElementsByTagName.
 * @returns {Array} Array of the matched elements.
 */
function getElements(parent, tagName) {
  const matches = parent.getElementsByTagName(tagName);
  return Array.from({ length: matches.length }, (_unused, index) => matches[index]);
}
|
|
3202
|
+
/**
 * Returns an element's text content with surrounding whitespace removed.
 *
 * @param {{textContent?: string | null}} el - Element to read.
 * @returns {string} Trimmed text, or "" when textContent is null or undefined.
 */
function getTextContent(el) {
  const content = el.textContent;
  if (content === null || content === undefined) return "";
  return content.trim();
}
|
|
3205
|
+
/**
 * Parses an XML string into a DOM document using the xmldom DOMParser.
 *
 * @param {string} text - XML source.
 * @returns {Document} The document produced by parseFromString with the
 *   "text/xml" MIME type.
 */
function parseXml(text) {
  const parser = new DOMParser2();
  return parser.parseFromString(text, "text/xml");
}
|
|
3208
|
+
/**
 * Reads the workbook's shared-string table.
 *
 * Each `<si>` entry becomes one string, built by concatenating the text of its
 * `<t>` elements (plain text and rich-text runs). `<t>` elements that sit
 * directly inside a phonetic run (`<rPh>`) are skipped: they hold reading
 * hints, not cell content, and would otherwise be appended to the visible text.
 *
 * @param {string} xml - Contents of xl/sharedStrings.xml.
 * @returns {string[]} Strings in table order; the array index is the
 *   shared-string index used by cells.
 */
function parseSharedStrings(xml) {
  const doc = parseXml(xml);
  const isVisibleText = (t) => {
    const holder = t.parentNode;
    const holderName = holder ? holder.localName || holder.nodeName : "";
    return holderName !== "rPh";
  };
  const strings = [];
  for (const si of getElements(doc.documentElement, "si")) {
    const visible = getElements(si, "t").filter(isVisibleText);
    strings.push(visible.map((t) => t.textContent ?? "").join(""));
  }
  return strings;
}
|
|
3218
|
+
/**
 * Lists the sheets declared in workbook.xml.
 *
 * For every `<sheet>` element the `name`, `sheetId` and `r:id` attributes are
 * read. A nullish name falls back to "Sheet<N>" (N = 1-based position); a
 * nullish sheetId or r:id falls back to "".
 *
 * @param {string} xml - Contents of xl/workbook.xml.
 * @returns {Array<{name: string, sheetId: string, rId: string}>} Sheets in
 *   document order.
 */
function parseWorkbook(xml) {
  const root = parseXml(xml).documentElement;
  return getElements(root, "sheet").map((el, position) => ({
    name: el.getAttribute("name") ?? `Sheet${position + 1}`,
    sheetId: el.getAttribute("sheetId") ?? "",
    rId: el.getAttribute("r:id") ?? ""
  }));
}
|
|
3231
|
+
/**
 * Builds a relationship-id -> target lookup from a .rels part.
 *
 * `<Relationship>` elements missing either `Id` or `Target` are ignored; when
 * an id appears more than once the last target wins.
 *
 * @param {string} xml - Contents of the .rels file.
 * @returns {Map<string, string>} Map from relationship id to target path.
 */
function parseRels(xml) {
  const relationships = getElements(parseXml(xml).documentElement, "Relationship");
  const pairs = relationships
    .map((rel) => [rel.getAttribute("Id"), rel.getAttribute("Target")])
    .filter(([id, target]) => id && target);
  return new Map(pairs);
}
|
|
3242
|
+
/**
 * Parses one worksheet part into a sparse text grid plus its merged ranges.
 *
 * Cell values are resolved by the cell's `t` attribute:
 *  - "s": index into `sharedStrings`;
 *  - "b": "TRUE" / "FALSE";
 *  - no type or "n": numeric text passed through cleanNumericValue;
 *  - any other type with a `<v>` (e.g. "str", "e", "d"): the stored text as-is,
 *    so text that merely looks like a decimal is not rewritten;
 *  - "inlineStr" without `<v>`: concatenated `<t>` text of `<is>`, skipping
 *    phonetic runs (`<rPh>`).
 * A cell with no value but a formula is rendered as "=<formula>".
 *
 * Limits: rows are filtered on the `<row r>` attribute, and cells are filtered
 * on the position encoded in their own `r` reference. The grid is indexed by
 * the cell reference, so a reference with a row below 0 or at/after MAX_ROWS2
 * is dropped; otherwise a crafted reference could grow the grid without bound
 * or index a negative row.
 *
 * @param {string} xml - Worksheet XML.
 * @param {string[]} sharedStrings - Shared-string table for "s" cells.
 * @returns {{grid: string[][], merges: Array<object>, maxRow: number, maxCol: number}}
 *   The grid (rows may differ in length), the parsed merge ranges, and the
 *   largest zero-based row/column index that received a cell (0 when empty).
 */
function parseWorksheet(xml, sharedStrings) {
  const doc = parseXml(xml);
  const grid = [];
  let maxRow = 0;
  let maxCol = 0;

  // Text of a rich-string container without its phonetic (reading-hint) runs.
  const visibleText = (container) => getElements(container, "t").filter((t) => {
    const holder = t.parentNode;
    const holderName = holder ? holder.localName || holder.nodeName : "";
    return holderName !== "rPh";
  }).map((t) => t.textContent ?? "").join("");

  const rows = getElements(doc.documentElement, "row");
  for (const rowEl of rows) {
    const rowNum = parseInt(rowEl.getAttribute("r") ?? "0", 10) - 1;
    if (rowNum < 0 || rowNum >= MAX_ROWS2) continue;
    const cells = getElements(rowEl, "c");
    for (const cellEl of cells) {
      const ref = cellEl.getAttribute("r");
      if (!ref) continue;
      const pos = parseCellRef(ref);
      if (!pos || pos.col >= MAX_COLS2) continue;
      // The grid is indexed by the cell's own reference, so bound that too.
      if (pos.row < 0 || pos.row >= MAX_ROWS2) continue;
      const type = cellEl.getAttribute("t");
      const vElements = getElements(cellEl, "v");
      const fElements = getElements(cellEl, "f");
      let value = "";
      if (vElements.length > 0) {
        const raw = getTextContent(vElements[0]);
        if (type === "s") {
          const idx = parseInt(raw, 10);
          value = sharedStrings[idx] ?? "";
        } else if (type === "b") {
          value = raw === "1" ? "TRUE" : "FALSE";
        } else if (!type || type === "n") {
          value = cleanNumericValue(raw);
        } else {
          // Text-like cell types: keep the stored text untouched.
          value = raw;
        }
      } else if (type === "inlineStr") {
        const isEl = getElements(cellEl, "is");
        if (isEl.length > 0) {
          value = visibleText(isEl[0]);
        }
      }
      if (!value && fElements.length > 0) {
        value = `=${getTextContent(fElements[0])}`;
      }
      while (grid.length <= pos.row) grid.push([]);
      while (grid[pos.row].length <= pos.col) grid[pos.row].push("");
      grid[pos.row][pos.col] = value;
      if (pos.row > maxRow) maxRow = pos.row;
      if (pos.col > maxCol) maxCol = pos.col;
    }
  }

  const merges = [];
  for (const el of getElements(doc.documentElement, "mergeCell")) {
    const ref = el.getAttribute("ref");
    if (!ref) continue;
    const m = parseMergeRef(ref);
    if (m) merges.push(m);
  }
  return { grid, merges, maxRow, maxCol };
}
|
|
3298
|
+
/**
 * Converts one parsed worksheet into document blocks: an optional level-2
 * heading with the sheet name, followed by one table covering the rows from
 * the first to the last non-empty row.
 *
 * Merged ranges: the top-left cell of each range carries the range's
 * colSpan/rowSpan and the remaining cells of the range are omitted from their
 * rows. Only positions inside [0..maxRow] x [0..maxCol] are ever emitted, so
 * the bookkeeping of omitted cells is limited to that area. A workbook that
 * declares an enormous merge range (for example the whole sheet) therefore
 * costs work proportional to the populated grid, not to the declared range.
 *
 * @param {string} sheetName - Sheet title; no heading is emitted when falsy.
 * @param {string[][]} grid - Sparse cell-text grid indexed [row][col].
 * @param {Array<{startRow: number, startCol: number, endRow: number, endCol: number}>} merges
 * @param {number} maxRow - Largest populated zero-based row index.
 * @param {number} maxCol - Largest populated zero-based column index.
 * @param {number} sheetIndex - Zero-based sheet position; pageNumber is sheetIndex + 1.
 * @returns {Array<object>} Heading and/or table blocks (possibly empty).
 */
function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
  const blocks = [];
  if (sheetName) {
    blocks.push({
      type: "heading",
      text: sheetName,
      level: 2,
      pageNumber: sheetIndex + 1
    });
  }
  if (maxRow < 0 || maxCol < 0 || grid.length === 0) return blocks;

  const mergeMap = /* @__PURE__ */ new Map();
  const mergeSkip = /* @__PURE__ */ new Set();
  for (const m of merges) {
    const colSpan = m.endCol - m.startCol + 1;
    const rowSpan = m.endRow - m.startRow + 1;
    mergeMap.set(`${m.startRow},${m.startCol}`, { colSpan, rowSpan });
    // Cells beyond maxRow/maxCol are never looked up below, so do not enumerate them.
    const lastSkipRow = Math.min(m.endRow, maxRow);
    const lastSkipCol = Math.min(m.endCol, maxCol);
    for (let r = m.startRow; r <= lastSkipRow; r++) {
      for (let c = m.startCol; c <= lastSkipCol; c++) {
        if (r !== m.startRow || c !== m.startCol) {
          mergeSkip.add(`${r},${c}`);
        }
      }
    }
  }

  // Trim leading and trailing rows that contain no text at all.
  let firstRow = -1;
  let lastRow = -1;
  for (let r = 0; r <= maxRow; r++) {
    const row = grid[r];
    if (row && row.some((cell) => cell !== "")) {
      if (firstRow === -1) firstRow = r;
      lastRow = r;
    }
  }
  if (firstRow === -1) return blocks;

  const cellRows = [];
  for (let r = firstRow; r <= lastRow; r++) {
    const row = [];
    for (let c = 0; c <= maxCol; c++) {
      const key = `${r},${c}`;
      if (mergeSkip.has(key)) continue;
      const text = (grid[r] && grid[r][c]) ?? "";
      const merge = mergeMap.get(key);
      row.push({
        text,
        colSpan: merge?.colSpan ?? 1,
        rowSpan: merge?.rowSpan ?? 1
      });
    }
    cellRows.push(row);
  }
  if (cellRows.length > 0) {
    const table = buildTable(cellRows);
    if (table.rows > 0) {
      blocks.push({ type: "table", table, pageNumber: sheetIndex + 1 });
    }
  }
  return blocks;
}
|
|
3357
|
+
/**
 * Parses an XLSX workbook into markdown, blocks and metadata.
 *
 * Steps: pre-check the archive size, open the zip, read the optional
 * shared-string table, list the sheets from xl/workbook.xml, resolve each
 * sheet's part via xl/_rels/workbook.xml.rels (falling back to
 * xl/worksheets/sheet<N>.xml), and convert every selected sheet to blocks.
 * A sheet that is missing or fails to parse is reported as a "PARTIAL_PARSE"
 * warning instead of aborting. Title/author/description/dates are read from
 * docProps/core.xml on a best-effort basis.
 *
 * @param {ArrayBuffer} buffer - Workbook bytes (type as accepted by precheckZipSize / JSZip).
 * @param {{pages?: *, onProgress?: Function}} [options] - `pages` restricts
 *   processing to selected 1-based sheet numbers; `onProgress(current, total)`
 *   is called before each selected sheet is read.
 * @returns {Promise<{markdown: string, blocks: Array<object>, metadata: object, warnings: (Array<object>|undefined)}>}
 *   `warnings` is undefined when there are none.
 * @throws {KordocError} When xl/workbook.xml is missing or declares no sheets.
 */
async function parseXlsxDocument(buffer, options) {
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
  const zip = await JSZip3.loadAsync(buffer);
  const warnings = [];

  const workbookFile = zip.file("xl/workbook.xml");
  if (!workbookFile) {
    throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
  }

  const ssFile = zip.file("xl/sharedStrings.xml");
  const sharedStrings = ssFile ? parseSharedStrings(await ssFile.async("text")) : [];

  const sheets = parseWorkbook(await workbookFile.async("text"));
  if (sheets.length === 0) {
    throw new KordocError("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
  }

  const relsFile = zip.file("xl/_rels/workbook.xml.rels");
  const relsMap = relsFile ? parseRels(await relsFile.async("text")) : /* @__PURE__ */ new Map();

  let pageFilter = null;
  if (options?.pages) {
    const { parsePageRange: toPageSet } = await import("./page-range-737B4EZW.js");
    pageFilter = toPageSet(options.pages, sheets.length);
  }

  // Turns a relationship target into a path inside the archive.
  const locateSheet = (target, ordinal) => {
    if (!target) return `xl/worksheets/sheet${ordinal}.xml`;
    if (target.startsWith("/")) return target.slice(1);
    if (target.startsWith("xl/")) return target;
    return `xl/${target}`;
  };

  const blocks = [];
  const processedSheets = Math.min(sheets.length, MAX_SHEETS);
  for (let i = 0; i < processedSheets; i++) {
    const pageNo = i + 1;
    if (pageFilter && !pageFilter.has(pageNo)) continue;
    const sheet = sheets[i];
    options?.onProgress?.(pageNo, processedSheets);

    const sheetPath = locateSheet(relsMap.get(sheet.rId), pageNo);
    const sheetFile = zip.file(sheetPath);
    if (!sheetFile) {
      warnings.push({
        page: pageNo,
        message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${sheetPath}`,
        code: "PARTIAL_PARSE"
      });
      continue;
    }

    try {
      const parsed = parseWorksheet(await sheetFile.async("text"), sharedStrings);
      blocks.push(...sheetToBlocks(sheet.name, parsed.grid, parsed.merges, parsed.maxRow, parsed.maxCol, i));
    } catch (err) {
      const reason = err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958";
      warnings.push({
        page: pageNo,
        message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC2F1 \uC2E4\uD328: ${reason}`,
        code: "PARTIAL_PARSE"
      });
    }
  }

  const metadata = { pageCount: processedSheets };
  const coreFile = zip.file("docProps/core.xml");
  if (coreFile) {
    try {
      const coreDoc = parseXml(await coreFile.async("text"));
      const firstText = (tag) => {
        const found = coreDoc.getElementsByTagName(tag);
        if (found.length === 0) return void 0;
        return (found[0].textContent ?? "").trim();
      };
      metadata.title = firstText("dc:title") || firstText("dcterms:title");
      metadata.author = firstText("dc:creator");
      metadata.description = firstText("dc:description");
      const created = firstText("dcterms:created");
      if (created) metadata.createdAt = created;
      const modified = firstText("dcterms:modified");
      if (modified) metadata.modifiedAt = modified;
    } catch {
      // Core properties are optional; a malformed part must not fail the parse.
    }
  }

  const markdown = blocksToMarkdown(blocks);
  return { markdown, blocks, metadata, warnings: warnings.length > 0 ? warnings : void 0 };
}
|
|
3447
|
+
|
|
3448
|
+
// src/docx/parser.ts
|
|
3449
|
+
import JSZip4 from "jszip";
|
|
3450
|
+
import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
|
|
3451
|
+
// 100 * 1024 * 1024 bytes. NOTE(review): presumably the size limit for the DOCX archive pre-check, mirroring the XLSX parser — confirm at the call site.
var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
|
|
3452
|
+
/**
 * Returns the direct element children of `parent` whose name matches
 * `localName`, ignoring any namespace prefix.
 *
 * A child matches when its `localName` equals `localName` or its `tagName`
 * ends with ":<localName>". Non-element nodes (nodeType !== 1) are skipped and
 * descendants below the first level are not searched.
 *
 * @param {{childNodes: ArrayLike<object>}} parent - Node whose children are scanned.
 * @param {string} localName - Unprefixed element name.
 * @returns {Array<object>} Matching children in document order.
 */
function getChildElements(parent, localName) {
  const prefixedSuffix = `:${localName}`;
  const kids = parent.childNodes;
  const matched = [];
  for (let idx = 0; idx < kids.length; idx += 1) {
    const candidate = kids[idx];
    if (candidate.nodeType !== 1) continue;
    const nameMatches = candidate.localName === localName || candidate.tagName?.endsWith(prefixedSuffix);
    if (nameMatches) matched.push(candidate);
  }
  return matched;
}
|
|
3466
|
+
/**
 * Collects every descendant element of `parent` whose name matches
 * `localName`, ignoring any namespace prefix.
 *
 * The tree is walked depth-first in document order; a matching element is
 * recorded before its own descendants are visited. An element matches when its
 * `localName` equals `localName` or its `tagName` ends with ":<localName>".
 * Only element nodes (nodeType === 1) are tested and descended into.
 *
 * @param {{childNodes: ArrayLike<object>}} parent - Root of the search (not itself tested).
 * @param {string} localName - Unprefixed element name.
 * @returns {Array<object>} Matching descendants in document order.
 */
function findElements(parent, localName) {
  const prefixedSuffix = `:${localName}`;
  const found = [];
  function visit(node) {
    const kids = node.childNodes;
    for (let idx = 0; idx < kids.length; idx += 1) {
      const kid = kids[idx];
      if (kid.nodeType !== 1) continue;
      if (kid.localName === localName || kid.tagName?.endsWith(prefixedSuffix)) {
        found.push(kid);
      }
      visit(kid);
    }
  }
  visit(parent);
  return found;
}
|
|
3484
|
+
/**
 * Looks up an attribute value by unprefixed name.
 *
 * Attributes are scanned in order; the first one whose `localName` or full
 * `name` equals `localName` wins.
 *
 * @param {{attributes: ArrayLike<{localName?: string, name?: string, value: string}>}} el
 * @param {string} localName - Attribute name without namespace prefix.
 * @returns {string | null} The attribute's value, or null when none matches.
 */
function getAttr(el, localName) {
  const { attributes } = el;
  let position = 0;
  while (position < attributes.length) {
    const { localName: local, name, value } = attributes[position];
    if (local === localName || name === localName) return value;
    position += 1;
  }
  return null;
}
|
|
3492
|
+
/**
 * Parses an XML string into a DOM document using the xmldom DOMParser.
 *
 * @param {string} text - XML source.
 * @returns {Document} The document produced by parseFromString with the
 *   "text/xml" MIME type.
 */
function parseXml2(text) {
  const parser = new DOMParser3();
  return parser.parseFromString(text, "text/xml");
}
|
|
3495
|
+
/**
 * Builds a style-id lookup from a styles part.
 *
 * For each `<style>` element with a `styleId`, records:
 *  - `name`: the `val` of its `<name>` child ("" when absent);
 *  - `basedOn`: the `val` of its `<basedOn>` child (undefined when absent);
 *  - `outlineLevel`: the `val` of `<pPr><outlineLvl>` parsed as an integer, or,
 *    when that is not present, N - 1 for a style named "heading N"
 *    (case-insensitive); undefined otherwise.
 *
 * @param {string} xml - Styles XML.
 * @returns {Map<string, {name: string, basedOn: (string|undefined), outlineLevel: (number|undefined)}>}
 */
function parseStyles(xml) {
  // `val` of the first direct child named `tag`, or null when there is no such child.
  const valOfFirst = (owner, tag) => {
    const [first] = getChildElements(owner, tag);
    return first ? getAttr(first, "val") : null;
  };

  const styles = /* @__PURE__ */ new Map();
  for (const el of findElements(parseXml2(xml), "style")) {
    const styleId = getAttr(el, "styleId");
    if (!styleId) continue;

    const name = valOfFirst(el, "name") ?? "";
    const basedOn = valOfFirst(el, "basedOn") ?? void 0;

    let outlineLevel;
    const [pPr] = getChildElements(el, "pPr");
    const declaredLevel = pPr ? valOfFirst(pPr, "outlineLvl") : null;
    if (declaredLevel !== null) {
      outlineLevel = parseInt(declaredLevel, 10);
    }
    if (outlineLevel === void 0) {
      const headingMatch = name.match(/^(?:heading|Heading)\s*(\d+)$/i);
      if (headingMatch) outlineLevel = parseInt(headingMatch[1], 10) - 1;
    }

    styles.set(styleId, { name, basedOn, outlineLevel });
  }
  return styles;
}
|
|
3523
|
+
/**
 * Builds a numbering-id lookup from a numbering part.
 *
 * First every `<abstractNum>` is read into a map of indent level ->
 * `{ numFmt, level }`, where `numFmt` is the `val` of the level's `<numFmt>`
 * child ("bullet" when absent) and the level comes from the `ilvl` attribute
 * (0 when absent). Then each `<num>` is linked to the level map of the abstract
 * definition named by its `<abstractNumId>` child; `<num>` elements without a
 * resolvable reference are omitted.
 *
 * @param {string} xml - Numbering XML.
 * @returns {Map<string, Map<number, {numFmt: string, level: number}>>} Map from numId to its levels.
 */
function parseNumbering(xml) {
  const doc = parseXml2(xml);

  const abstractNums = /* @__PURE__ */ new Map();
  for (const abstractEl of findElements(doc, "abstractNum")) {
    const abstractNumId = getAttr(abstractEl, "abstractNumId");
    if (!abstractNumId) continue;
    const levels = /* @__PURE__ */ new Map();
    for (const lvl of getChildElements(abstractEl, "lvl")) {
      const ilvl = parseInt(getAttr(lvl, "ilvl") ?? "0", 10);
      const [fmtEl] = getChildElements(lvl, "numFmt");
      const numFmt = (fmtEl ? getAttr(fmtEl, "val") : null) ?? "bullet";
      levels.set(ilvl, { numFmt, level: ilvl });
    }
    abstractNums.set(abstractNumId, levels);
  }

  const nums = /* @__PURE__ */ new Map();
  for (const numEl of findElements(doc, "num")) {
    const numId = getAttr(numEl, "numId");
    if (!numId) continue;
    const [abstractRef] = getChildElements(numEl, "abstractNumId");
    if (!abstractRef) continue;
    const ref = getAttr(abstractRef, "val");
    if (ref && abstractNums.has(ref)) {
      nums.set(numId, abstractNums.get(ref));
    }
  }
  return nums;
}
|
|
3555
|
+
/**
 * Builds a relationship-id -> target lookup from a .rels part.
 *
 * `<Relationship>` elements missing either `Id` or `Target` are ignored; when
 * an id appears more than once the last target wins.
 *
 * @param {string} xml - Contents of the .rels file.
 * @returns {Map<string, string>} Map from relationship id to target.
 */
function parseRels2(xml) {
  const targetsById = /* @__PURE__ */ new Map();
  findElements(parseXml2(xml), "Relationship").forEach((rel) => {
    const id = getAttr(rel, "Id");
    const target = getAttr(rel, "Target");
    if (!id || !target) return;
    targetsById.set(id, target);
  });
  return targetsById;
}
|
|
3566
|
+
/**
 * Extracts footnote texts keyed by footnote id.
 *
 * Footnotes whose id is missing, "0" or "-1" are skipped. For the others, the
 * text of every `<t>` that is a direct child of a run (`<r>`) found under any
 * paragraph (`<p>`) of the footnote is concatenated in document order and
 * trimmed.
 *
 * @param {string} xml - Footnotes XML.
 * @returns {Map<string, string>} Map from footnote id to its text.
 */
function parseFootnotes(xml) {
  const notes = /* @__PURE__ */ new Map();
  for (const fn of findElements(parseXml2(xml), "footnote")) {
    const id = getAttr(fn, "id");
    if (!id || id === "0" || id === "-1") continue;
    const body = findElements(fn, "p")
      .flatMap((p) => findElements(p, "r"))
      .flatMap((r) => getChildElements(r, "t"))
      .map((t) => t.textContent ?? "")
      .join("")
      .trim();
    notes.set(id, body);
  }
  return notes;
}
|
|
3586
|
+
/**
 * Extracts the text and bold/italic state of one run (`<r>`).
 *
 * Text is the concatenation of the run's direct `<t>` children. Bold and italic
 * come from the `<b>` / `<i>` children of the run's `<rPr>`. These are toggle
 * properties: the element alone (or with a truthy `val`) switches the property
 * on, while `val` of "0", "false" or "off" switches it off, so an explicit
 * "not bold" marker is no longer reported as bold.
 *
 * @param {object} r - Run element.
 * @returns {{text: string, bold: boolean, italic: boolean}}
 */
function extractRun(r) {
  const text = getChildElements(r, "t").map((t) => t.textContent ?? "").join("");

  // True when the first `tag` child of `props` exists and is not explicitly disabled.
  const isToggleOn = (props, tag) => {
    const [toggle] = getChildElements(props, tag);
    if (!toggle) return false;
    const val = getAttr(toggle, "val");
    if (val === null || val === undefined) return true;
    return !["0", "false", "off"].includes(val);
  };

  const [rPr] = getChildElements(r, "rPr");
  const bold = rPr ? isToggleOn(rPr, "b") : false;
  const italic = rPr ? isToggleOn(rPr, "i") : false;
  return { text, bold, italic };
}
|
|
3598
|
+
/**
 * Converts one paragraph element (`w:p`) into an intermediate block.
 *
 * The paragraph's direct children are walked in document order, so text from
 * hyperlinks and plain runs is concatenated in the order it appears. Collecting
 * all hyperlink text first would move link text to the front of the paragraph.
 *
 * @param {Element} p - The paragraph element.
 * @param {Map<string, {outlineLevel?: number}>} styles - Style id -> style info; only `outlineLevel` is read here.
 * @param {Map<string, Map<number, {numFmt?: string}>>} numbering - numId -> (level -> level info); only `numFmt` is read here.
 * @param {Map<string, string>} footnotes - Footnote id -> footnote text.
 * @param {Map<string, string>} rels - Relationship id -> target.
 * @returns {object|null} `null` when the paragraph has no text after trimming;
 *   otherwise a `heading` block (style outline level 0-5), a `list` block
 *   (non-zero numId), or a `paragraph` block with optional `style`, `href`
 *   and `footnoteText`.
 */
function parseParagraph(p, styles, numbering, footnotes, rels) {
  let styleId = "";
  let numId = "";
  let ilvl = 0;
  const pPr = getChildElements(p, "pPr")[0];
  if (pPr) {
    const pStyle = getChildElements(pPr, "pStyle")[0];
    if (pStyle) styleId = getAttr(pStyle, "val") ?? "";
    const numPr = getChildElements(pPr, "numPr")[0];
    if (numPr) {
      const numIdEl = getChildElements(numPr, "numId")[0];
      const ilvlEl = getChildElements(numPr, "ilvl")[0];
      numId = numIdEl ? getAttr(numIdEl, "val") ?? "" : "";
      const parsedLevel = ilvlEl ? parseInt(getAttr(ilvlEl, "val") ?? "0", 10) : 0;
      // A malformed level must not turn into a NaN map key; fall back to level 0.
      ilvl = Number.isNaN(parsedLevel) ? 0 : parsedLevel;
    }
  }
  const parts = [];
  let hasBold = false;
  let hasItalic = false;
  let href;
  let footnoteText;
  const children = p.childNodes;
  for (let i = 0; i < children.length; i++) {
    const node = children[i];
    if (node.nodeType !== 1) continue;
    const localName = node.localName ?? node.tagName?.split(":").pop();
    if (localName === "hyperlink") {
      let linkText = "";
      for (const r of findElements(node, "r")) {
        linkText += extractRun(r).text;
      }
      if (!linkText) continue;
      parts.push(linkText);
      const rId = getAttr(node, "id");
      // Only one href is kept per block: the last resolvable hyperlink wins.
      if (rId && rels.has(rId)) href = rels.get(rId);
    } else if (localName === "r") {
      const result = extractRun(node);
      if (result.bold) hasBold = true;
      if (result.italic) hasItalic = true;
      const fnRef = getChildElements(node, "footnoteReference")[0];
      if (fnRef) {
        const fnId = getAttr(fnRef, "id");
        // As with href, the last footnote referenced in the paragraph wins.
        if (fnId && footnotes.has(fnId)) footnoteText = footnotes.get(fnId);
      }
      if (result.text) parts.push(result.text);
    }
  }
  const text = parts.join("").trim();
  if (!text) return null;
  const style = styles.get(styleId);
  if (style?.outlineLevel !== void 0 && style.outlineLevel >= 0 && style.outlineLevel <= 5) {
    return {
      type: "heading",
      text,
      // outlineLevel is zero-based; heading levels are one-based.
      level: style.outlineLevel + 1
    };
  }
  if (numId && numId !== "0") {
    const levelInfo = numbering.get(numId)?.get(ilvl);
    // Anything that is not explicitly a bullet format is reported as ordered.
    const listType = levelInfo?.numFmt === "bullet" ? "unordered" : "ordered";
    return { type: "list", text, listType };
  }
  const block = { type: "paragraph", text };
  if (hasBold || hasItalic) {
    block.style = { bold: hasBold || void 0, italic: hasItalic || void 0 };
  }
  if (href) block.href = href;
  if (footnoteText) block.footnoteText = footnoteText;
  return block;
}
|
|
3679
|
+
/**
 * Converts a table element (`w:tbl`) into a `table` block.
 *
 * Vertical merges: a cell whose `vMerge` has `val="restart"` starts a merged
 * region. Any other `vMerge`, including the bare element with no `val`
 * (the spec default is "continue"), continues the region above. Continuation
 * cells are dropped from the output and counted into the `rowSpan` of the
 * cell that started the region.
 *
 * Continuations are matched by grid column (the running sum of `colSpan`
 * within a row), not by the cell's index in the row, so a preceding cell with
 * `gridSpan > 1` does not misalign the match.
 *
 * @param {Element} tbl - The table element.
 * @param {Map} styles - Passed through to `parseParagraph` for cell paragraphs.
 * @param {Map} numbering - Passed through to `parseParagraph`.
 * @param {Map} footnotes - Passed through to `parseParagraph`.
 * @param {Map} rels - Passed through to `parseParagraph`.
 * @returns {{type: "table", table: {rows: number, cols: number, cells: Array<Array<{text: string, colSpan: number, rowSpan: number}>>, hasHeader: boolean}}|null}
 *   `null` when the table has no rows.
 */
function parseTable(tbl, styles, numbering, footnotes, rels) {
  const trElements = getChildElements(tbl, "tr");
  if (trElements.length === 0) return null;
  // One entry per source cell: the output cell plus the grid column it starts at.
  // rowSpan === 0 marks a vertical-merge continuation placeholder.
  const grid = [];
  for (const tr of trElements) {
    const entries = [];
    let gridCol = 0;
    for (const tc of getChildElements(tr, "tc")) {
      let colSpan = 1;
      let continuesMerge = false;
      const tcPr = getChildElements(tc, "tcPr")[0];
      if (tcPr) {
        const gridSpan = getChildElements(tcPr, "gridSpan")[0];
        if (gridSpan) {
          const parsed = parseInt(getAttr(gridSpan, "val") ?? "1", 10);
          // Ignore NaN / non-positive spans so column arithmetic stays sane.
          if (parsed >= 1) colSpan = parsed;
        }
        const vMerge = getChildElements(tcPr, "vMerge")[0];
        if (vMerge) continuesMerge = getAttr(vMerge, "val") !== "restart";
      }
      let cell;
      if (continuesMerge) {
        cell = { text: "", colSpan, rowSpan: 0 };
      } else {
        const cellTexts = [];
        for (const p of getChildElements(tc, "p")) {
          const block = parseParagraph(p, styles, numbering, footnotes, rels);
          if (block?.text) cellTexts.push(block.text);
        }
        cell = { text: cellTexts.join("\n"), colSpan, rowSpan: 1 };
      }
      entries.push({ cell, col: gridCol });
      gridCol += colSpan;
    }
    grid.push(entries);
  }
  // Count how many consecutive rows below continue each real cell's merge.
  for (let r = 0; r < grid.length; r++) {
    for (const { cell, col } of grid[r]) {
      if (cell.rowSpan === 0) continue;
      let span = 1;
      for (let nr = r + 1; nr < grid.length; nr++) {
        const below = grid[nr].find((entry) => entry.col === col);
        if (below?.cell.rowSpan === 0) span++;
        else break;
      }
      cell.rowSpan = span;
    }
  }
  // Drop continuation placeholders; a row made only of continuations becomes [].
  const cleanRows = grid.map(
    (entries) => entries.filter((entry) => entry.cell.rowSpan !== 0).map((entry) => entry.cell)
  );
  if (cleanRows.length === 0) return null;
  let cols = 0;
  for (const row of cleanRows) {
    let width = 0;
    for (const cell of row) width += cell.colSpan;
    if (width > cols) cols = width;
  }
  const table = {
    rows: cleanRows.length,
    cols,
    cells: cleanRows,
    hasHeader: cleanRows.length > 1
  };
  return { type: "table", table };
}
|
|
3748
|
+
/**
 * Collects the embedded images referenced by `drawing` elements in the document.
 *
 * For every `blip` under a `drawing`, the `embed` relationship id is resolved
 * through `rels` to a path inside the zip. Images that load are renamed
 * sequentially (`image_001.<ext>`, ...) and returned with an `image` block each.
 *
 * Best-effort by design: a missing relationship, a missing zip entry or a
 * failed read skips that image rather than failing the whole parse.
 *
 * @param {object} zip - Loaded zip archive; `zip.file(path)` must return an entry with `async("uint8array")`, or a falsy value.
 * @param {Map<string, string>} rels - Relationship id -> target path.
 * @param {Document} doc - Parsed main document; `doc.documentElement` is searched.
 * @returns {Promise<{blocks: Array<{type: "image", text: string}>, images: Array<{filename: string, data: Uint8Array, mimeType: string}>}>}
 */
async function extractImages(zip, rels, doc) {
  // A Map (not an object literal) so extensions like "constructor" cannot hit prototype keys.
  const mimeByExt = /* @__PURE__ */ new Map([
    ["png", "image/png"],
    ["jpg", "image/jpeg"],
    ["jpeg", "image/jpeg"],
    ["gif", "image/gif"],
    ["bmp", "image/bmp"],
    ["wmf", "image/wmf"],
    ["emf", "image/emf"]
  ]);
  // Turns a relationship target into a zip entry path. Targets are relative to
  // `word/` unless they start with "/" or already start with "word/". "." and
  // ".." segments are collapsed because zip entry names are matched literally;
  // a target that climbs above the archive root yields null.
  const resolvePartPath = (target) => {
    const raw = target.startsWith("/") ? target.slice(1) : target.startsWith("word/") ? target : `word/${target}`;
    const segments = [];
    for (const seg of raw.split("/")) {
      if (seg === "" || seg === ".") continue;
      if (seg === "..") {
        if (segments.length === 0) return null;
        segments.pop();
        continue;
      }
      segments.push(seg);
    }
    return segments.length > 0 ? segments.join("/") : null;
  };
  const blocks = [];
  const images = [];
  let imgIdx = 0;
  for (const drawing of findElements(doc.documentElement, "drawing")) {
    for (const blip of findElements(drawing, "blip")) {
      const embedId = getAttr(blip, "embed");
      if (!embedId) continue;
      const target = rels.get(embedId);
      if (!target) continue;
      const imgPath = resolvePartPath(target);
      if (!imgPath) continue;
      const imgFile = zip.file(imgPath);
      if (!imgFile) continue;
      let data;
      try {
        data = await imgFile.async("uint8array");
      } catch {
        // Unreadable/corrupt entry: skip this image, keep the rest of the document.
        continue;
      }
      imgIdx++;
      // Take the extension from the file name only; default to png when there is none.
      const baseName = imgPath.slice(imgPath.lastIndexOf("/") + 1);
      const dot = baseName.lastIndexOf(".");
      const ext = dot >= 0 && dot < baseName.length - 1 ? baseName.slice(dot + 1).toLowerCase() : "png";
      const filename = `image_${String(imgIdx).padStart(3, "0")}.${ext}`;
      images.push({ filename, data, mimeType: mimeByExt.get(ext) ?? "image/png" });
      blocks.push({ type: "image", text: filename });
    }
  }
  return { blocks, images };
}
|
|
3785
|
+
/**
 * Parses a DOCX buffer into markdown plus structured blocks, metadata,
 * heading outline and embedded images.
 *
 * `word/document.xml` is required. The relationships part is parsed when
 * present (parse errors propagate). Styles, numbering and footnotes are
 * optional: a missing part or a parse failure leaves an empty map.
 * `docProps/core.xml` is read best-effort for metadata.
 *
 * @param {ArrayBuffer} buffer - Raw DOCX bytes.
 * @param {object} [options] - Accepted for signature parity with the other parsers; not read in this function.
 * @returns {Promise<{markdown: string, blocks: object[], metadata: object, outline: (object[]|undefined), warnings: (object[]|undefined), images: (object[]|undefined)}>}
 * @throws {KordocError} When `word/document.xml` or the document body is missing.
 */
async function parseDocxDocument(buffer, options) {
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
  const zip = await JSZip4.loadAsync(buffer);
  const warnings = [];
  // Loads an optional XML part into a Map; absent or unparsable parts yield an empty Map.
  const loadOptionalMap = async (partPath, parsePart) => {
    const entry = zip.file(partPath);
    if (!entry) return /* @__PURE__ */ new Map();
    try {
      const xml = await entry.async("text");
      return parsePart(xml);
    } catch {
      return /* @__PURE__ */ new Map();
    }
  };
  const docFile = zip.file("word/document.xml");
  if (!docFile) {
    // Message reads: "Invalid DOCX file: word/document.xml is missing".
    throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
  }
  // Relationships are not wrapped in try/catch: a broken rels part fails the parse.
  const relsEntry = zip.file("word/_rels/document.xml.rels");
  const rels = relsEntry ? parseRels2(await relsEntry.async("text")) : /* @__PURE__ */ new Map();
  const styles = await loadOptionalMap("word/styles.xml", (xml) => parseStyles(xml));
  const numbering = await loadOptionalMap("word/numbering.xml", (xml) => parseNumbering(xml));
  const footnotes = await loadOptionalMap("word/footnotes.xml", (xml) => parseFootnotes(xml));
  const doc = parseXml2(await docFile.async("text"));
  const bodyMatches = findElements(doc, "body");
  if (bodyMatches.length === 0) {
    // Message reads: "DOCX body (w:body) could not be found".
    throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
  }
  const blocks = [];
  const bodyChildren = bodyMatches[0].childNodes;
  for (let idx = 0; idx < bodyChildren.length; idx++) {
    const child = bodyChildren[idx];
    // Only element nodes (nodeType 1) can be paragraphs or tables.
    if (child.nodeType !== 1) continue;
    const tag = child.localName ?? child.tagName?.split(":").pop();
    let parsed = null;
    if (tag === "p") {
      parsed = parseParagraph(child, styles, numbering, footnotes, rels);
    } else if (tag === "tbl") {
      parsed = parseTable(child, styles, numbering, footnotes, rels);
    }
    if (parsed) blocks.push(parsed);
  }
  // Only the image payloads are used here; the image blocks are not merged into `blocks`.
  const { images } = await extractImages(zip, rels, doc);
  const metadata = {};
  const coreFile = zip.file("docProps/core.xml");
  if (coreFile) {
    try {
      const coreDoc = parseXml2(await coreFile.async("text"));
      // Trimmed text of the first element with the given qualified tag name, or undefined.
      const firstText = (tag) => {
        const matches = coreDoc.getElementsByTagName(tag);
        if (matches.length > 0) return (matches[0].textContent ?? "").trim();
        return void 0;
      };
      metadata.title = firstText("dc:title") || firstText("dcterms:title");
      metadata.author = firstText("dc:creator");
      metadata.description = firstText("dc:description");
      const created = firstText("dcterms:created");
      if (created) metadata.createdAt = created;
      const modified = firstText("dcterms:modified");
      if (modified) metadata.modifiedAt = modified;
    } catch {
      // Metadata is optional; keep whatever was collected before the failure.
    }
  }
  const outline = [];
  for (const b of blocks) {
    if (b.type === "heading") outline.push({ level: b.level ?? 2, text: b.text ?? "" });
  }
  const markdown = blocksToMarkdown(blocks);
  return {
    markdown,
    blocks,
    metadata,
    outline: outline.length > 0 ? outline : void 0,
    warnings: warnings.length > 0 ? warnings : void 0,
    images: images.length > 0 ? images : void 0
  };
}
|
|
3876
|
+
|
|
3005
3877
|
// src/form/recognize.ts
|
|
3006
3878
|
var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
|
|
3007
3879
|
"\uC131\uBA85",
|
|
@@ -3129,7 +4001,7 @@ function extractInlineFields(text) {
|
|
|
3129
4001
|
}
|
|
3130
4002
|
|
|
3131
4003
|
// src/hwpx/generator.ts
|
|
3132
|
-
import
|
|
4004
|
+
import JSZip5 from "jszip";
|
|
3133
4005
|
|
|
3134
4006
|
// src/index.ts
|
|
3135
4007
|
async function parse(input, options) {
|
|
@@ -3152,8 +4024,12 @@ async function parse(input, options) {
|
|
|
3152
4024
|
}
|
|
3153
4025
|
const format = detectFormat(buffer);
|
|
3154
4026
|
switch (format) {
|
|
3155
|
-
case "hwpx":
|
|
4027
|
+
case "hwpx": {
|
|
4028
|
+
const zipFormat = await detectZipFormat(buffer);
|
|
4029
|
+
if (zipFormat === "xlsx") return parseXlsx(buffer, options);
|
|
4030
|
+
if (zipFormat === "docx") return parseDocx(buffer, options);
|
|
3156
4031
|
return parseHwpx(buffer, options);
|
|
4032
|
+
}
|
|
3157
4033
|
case "hwp":
|
|
3158
4034
|
return parseHwp(buffer, options);
|
|
3159
4035
|
case "pdf":
|
|
@@ -3180,9 +4056,27 @@ async function parseHwp(buffer, options) {
|
|
|
3180
4056
|
}
|
|
3181
4057
|
/**
 * Parses a PDF buffer and wraps the outcome in the common result envelope.
 *
 * Never rejects: any failure from `parsePdfDocument` becomes a
 * `{ success: false }` result. When the thrown Error carries an
 * `isImageBased` property, the failure result reports `isImageBased: true`.
 *
 * @param {ArrayBuffer} buffer - Raw PDF bytes.
 * @param {object} [options] - Forwarded to `parsePdfDocument`.
 * @returns {Promise<object>} Success or failure result with `fileType: "pdf"`.
 */
async function parsePdf(buffer, options) {
  let parsed;
  try {
    parsed = await parsePdfDocument(buffer, options);
  } catch (err) {
    const failedAsError = err instanceof Error;
    // Only the presence of the property matters, not its value.
    const isImageBased = failedAsError && "isImageBased" in err ? true : void 0;
    return {
      success: false,
      fileType: "pdf",
      error: failedAsError ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328",
      code: classifyError(err),
      isImageBased
    };
  }
  const { markdown, blocks, metadata, outline, warnings, isImageBased } = parsed;
  return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased };
}
|
|
4066
|
+
/**
 * Parses an XLSX buffer and wraps the outcome in the common result envelope.
 *
 * Never rejects: any failure from `parseXlsxDocument` becomes a
 * `{ success: false }` result with a classified error code.
 *
 * @param {ArrayBuffer} buffer - Raw XLSX bytes.
 * @param {object} [options] - Forwarded to `parseXlsxDocument`.
 * @returns {Promise<object>} Success or failure result with `fileType: "xlsx"`.
 */
async function parseXlsx(buffer, options) {
  let parsed;
  try {
    parsed = await parseXlsxDocument(buffer, options);
  } catch (err) {
    const message = err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "xlsx", error: message, code: classifyError(err) };
  }
  const { markdown, blocks, metadata, warnings } = parsed;
  return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
}
|
|
4074
|
+
/**
 * Parses a DOCX buffer and wraps the outcome in the common result envelope.
 *
 * Never rejects: any failure from `parseDocxDocument` becomes a
 * `{ success: false }` result with a classified error code. `images` is
 * reported as undefined when there are none.
 *
 * @param {ArrayBuffer} buffer - Raw DOCX bytes.
 * @param {object} [options] - Forwarded to `parseDocxDocument`.
 * @returns {Promise<object>} Success or failure result with `fileType: "docx"`.
 */
async function parseDocx(buffer, options) {
  let parsed;
  try {
    parsed = await parseDocxDocument(buffer, options);
  } catch (err) {
    const message = err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "docx", error: message, code: classifyError(err) };
  }
  const { markdown, blocks, metadata, outline, warnings, images } = parsed;
  const nonEmptyImages = images?.length ? images : void 0;
  return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: nonEmptyImages };
}
|
|
3188
4082
|
|
|
@@ -3360,10 +4254,6 @@ function diffTableCells(a, b) {
|
|
|
3360
4254
|
export {
|
|
3361
4255
|
detectFormat,
|
|
3362
4256
|
blocksToMarkdown,
|
|
3363
|
-
VERSION,
|
|
3364
|
-
toArrayBuffer,
|
|
3365
|
-
KordocError,
|
|
3366
|
-
sanitizeError,
|
|
3367
4257
|
extractHwpxMetadataOnly,
|
|
3368
4258
|
extractHwp5MetadataOnly,
|
|
3369
4259
|
extractPdfMetadataOnly,
|
|
@@ -3371,4 +4261,4 @@ export {
|
|
|
3371
4261
|
extractFormFields,
|
|
3372
4262
|
parse
|
|
3373
4263
|
};
|
|
3374
|
-
//# sourceMappingURL=chunk-
|
|
4264
|
+
//# sourceMappingURL=chunk-QQ6PZADA.js.map
|