kordoc 2.2.5 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -4
- package/dist/{chunk-UU2O6D3R.js → chunk-JFTFC2BB.js} +2 -2
- package/dist/{chunk-JH5XLWJQ.js.map → chunk-JFTFC2BB.js.map} +1 -1
- package/dist/{chunk-5Y2Q3BRW.js → chunk-M3E3C5GS.js} +8 -1
- package/dist/chunk-M3E3C5GS.js.map +1 -0
- package/dist/{chunk-RQWICKON.js → chunk-OEJJPCMM.js} +369 -73
- package/dist/chunk-OEJJPCMM.js.map +1 -0
- package/dist/{chunk-JH5XLWJQ.js → chunk-Z7UPTVMX.js} +2 -2
- package/dist/{chunk-UU2O6D3R.js.map → chunk-Z7UPTVMX.js.map} +1 -1
- package/dist/{chunk-OJ4QR33V.cjs → chunk-ZNJPRRIA.cjs} +2 -2
- package/dist/{chunk-OJ4QR33V.cjs.map → chunk-ZNJPRRIA.cjs.map} +1 -1
- package/dist/cli.js +7 -4
- package/dist/cli.js.map +1 -1
- package/dist/{detect-GYK3HKD5.js → detect-I7YIS4Q6.js} +4 -2
- package/dist/index.cjs +463 -160
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -2
- package/dist/index.d.ts +4 -2
- package/dist/index.js +387 -84
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -5
- package/dist/{parser-OIRWPKIQ.js → parser-25LF2S2J.js} +45 -42
- package/dist/{parser-OIRWPKIQ.js.map → parser-25LF2S2J.js.map} +1 -1
- package/dist/{parser-PXD73E4H.js → parser-4LKJXBPP.js} +45 -42
- package/dist/{parser-PXD73E4H.js.map → parser-4LKJXBPP.js.map} +1 -1
- package/dist/{parser-CYBX5MP4.cjs → parser-KBQZB3QY.cjs} +61 -58
- package/dist/{parser-CYBX5MP4.cjs.map → parser-KBQZB3QY.cjs.map} +1 -1
- package/dist/{watch-NSBABJ4A.js → watch-GXRBLW3Y.js} +4 -4
- package/package.json +2 -2
- package/dist/chunk-5Y2Q3BRW.js.map +0 -1
- package/dist/chunk-RQWICKON.js.map +0 -1
- /package/dist/{detect-GYK3HKD5.js.map → detect-I7YIS4Q6.js.map} +0 -0
- /package/dist/{watch-NSBABJ4A.js.map → watch-GXRBLW3Y.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
var
|
|
19
|
+
var _chunkZNJPRRIAcjs = require('./chunk-ZNJPRRIA.cjs');
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
var _chunkMUOQXDZ4cjs = require('./chunk-MUOQXDZ4.cjs');
|
|
@@ -44,11 +44,17 @@ function isPdfFile(buffer) {
|
|
|
44
44
|
const b = magicBytes(buffer);
|
|
45
45
|
return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
|
|
46
46
|
}
|
|
47
|
+
function isHwpmlFile(buffer) {
|
|
48
|
+
const bytes = new Uint8Array(buffer, 0, Math.min(512, buffer.byteLength));
|
|
49
|
+
const head = new TextDecoder("utf-8", { fatal: false }).decode(bytes).replace(/^\uFEFF/, "");
|
|
50
|
+
return head.trimStart().startsWith("<?xml") && head.includes("<HWPML");
|
|
51
|
+
}
|
|
47
52
|
function detectFormat(buffer) {
|
|
48
53
|
if (buffer.byteLength < 4) return "unknown";
|
|
49
54
|
if (isZipFile(buffer)) return "hwpx";
|
|
50
55
|
if (isOldHwpFile(buffer)) return "hwp";
|
|
51
56
|
if (isPdfFile(buffer)) return "pdf";
|
|
57
|
+
if (isHwpmlFile(buffer)) return "hwpml";
|
|
52
58
|
return "unknown";
|
|
53
59
|
}
|
|
54
60
|
async function detectZipFormat(buffer) {
|
|
@@ -78,7 +84,7 @@ var MAX_XML_DEPTH = 200;
|
|
|
78
84
|
function createXmlParser(warnings) {
|
|
79
85
|
return new (0, _xmldom.DOMParser)({
|
|
80
86
|
onError(level, msg) {
|
|
81
|
-
if (level === "fatalError") throw new (0,
|
|
87
|
+
if (level === "fatalError") throw new (0, _chunkZNJPRRIAcjs.KordocError)(`XML \uD30C\uC2F1 \uC2E4\uD328: ${msg}`);
|
|
82
88
|
_optionalChain([warnings, 'optionalAccess', _2 => _2.push, 'call', _3 => _3({ code: "MALFORMED_XML", message: `XML ${level === "warn" ? "\uACBD\uACE0" : "\uC624\uB958"}: ${msg}` })]);
|
|
83
89
|
}
|
|
84
90
|
});
|
|
@@ -97,10 +103,10 @@ async function extractHwpxStyles(zip, decompressed) {
|
|
|
97
103
|
const xml = await file.async("text");
|
|
98
104
|
if (decompressed) {
|
|
99
105
|
decompressed.total += xml.length * 2;
|
|
100
|
-
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0,
|
|
106
|
+
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0, _chunkZNJPRRIAcjs.KordocError)("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
101
107
|
}
|
|
102
108
|
const parser = createXmlParser();
|
|
103
|
-
const doc = parser.parseFromString(
|
|
109
|
+
const doc = parser.parseFromString(_chunkZNJPRRIAcjs.stripDtd.call(void 0, xml), "text/xml");
|
|
104
110
|
if (!doc.documentElement) continue;
|
|
105
111
|
parseCharProperties(doc, result.charProperties);
|
|
106
112
|
parseStyleElements(doc, result.styles);
|
|
@@ -162,7 +168,7 @@ function parseStyleElements(doc, map) {
|
|
|
162
168
|
}
|
|
163
169
|
}
|
|
164
170
|
async function parseHwpxDocument(buffer, options) {
|
|
165
|
-
|
|
171
|
+
_chunkZNJPRRIAcjs.precheckZipSize.call(void 0, buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
166
172
|
let zip;
|
|
167
173
|
try {
|
|
168
174
|
zip = await _jszip2.default.loadAsync(buffer);
|
|
@@ -171,7 +177,7 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
171
177
|
}
|
|
172
178
|
const actualEntryCount = Object.keys(zip.files).length;
|
|
173
179
|
if (actualEntryCount > MAX_ZIP_ENTRIES) {
|
|
174
|
-
throw new (0,
|
|
180
|
+
throw new (0, _chunkZNJPRRIAcjs.KordocError)("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
175
181
|
}
|
|
176
182
|
const decompressed = { total: 0 };
|
|
177
183
|
const metadata = {};
|
|
@@ -179,11 +185,12 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
179
185
|
const styleMap = await extractHwpxStyles(zip, decompressed);
|
|
180
186
|
const warnings = [];
|
|
181
187
|
const sectionPaths = await resolveSectionPaths(zip);
|
|
182
|
-
if (sectionPaths.length === 0) throw new (0,
|
|
188
|
+
if (sectionPaths.length === 0) throw new (0, _chunkZNJPRRIAcjs.KordocError)("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
183
189
|
metadata.pageCount = sectionPaths.length;
|
|
184
190
|
const pageFilter = _optionalChain([options, 'optionalAccess', _4 => _4.pages]) ? _chunkMUOQXDZ4cjs.parsePageRange.call(void 0, options.pages, sectionPaths.length) : null;
|
|
185
191
|
const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
|
|
186
192
|
const blocks = [];
|
|
193
|
+
const nestedTableCounter = { count: 0 };
|
|
187
194
|
let parsedSections = 0;
|
|
188
195
|
for (let si = 0; si < sectionPaths.length; si++) {
|
|
189
196
|
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
@@ -192,19 +199,19 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
192
199
|
try {
|
|
193
200
|
const xml = await file.async("text");
|
|
194
201
|
decompressed.total += xml.length * 2;
|
|
195
|
-
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0,
|
|
196
|
-
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
|
|
202
|
+
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0, _chunkZNJPRRIAcjs.KordocError)("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
203
|
+
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1, nestedTableCounter));
|
|
197
204
|
parsedSections++;
|
|
198
205
|
_optionalChain([options, 'optionalAccess', _5 => _5.onProgress, 'optionalCall', _6 => _6(parsedSections, totalTarget)]);
|
|
199
206
|
} catch (secErr) {
|
|
200
|
-
if (secErr instanceof
|
|
207
|
+
if (secErr instanceof _chunkZNJPRRIAcjs.KordocError) throw secErr;
|
|
201
208
|
warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
202
209
|
}
|
|
203
210
|
}
|
|
204
211
|
const images = await extractImagesFromZip(zip, blocks, decompressed, warnings);
|
|
205
212
|
detectHwpxHeadings(blocks, styleMap);
|
|
206
213
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
207
|
-
const markdown =
|
|
214
|
+
const markdown = _chunkZNJPRRIAcjs.blocksToMarkdown.call(void 0, blocks);
|
|
208
215
|
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
209
216
|
}
|
|
210
217
|
function imageExtToMime(ext) {
|
|
@@ -254,16 +261,29 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
254
261
|
ref
|
|
255
262
|
// 절대 경로일 수도 있음
|
|
256
263
|
];
|
|
264
|
+
let resolvedPath = null;
|
|
265
|
+
if (!ref.includes(".")) {
|
|
266
|
+
const prefixes = [`BinData/${ref}`, `Contents/BinData/${ref}`];
|
|
267
|
+
for (const prefix of prefixes) {
|
|
268
|
+
const match = zip.file(new RegExp(`^${prefix.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\.[a-zA-Z0-9]+$`));
|
|
269
|
+
if (match.length > 0) {
|
|
270
|
+
resolvedPath = match[0].name;
|
|
271
|
+
break;
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
}
|
|
257
275
|
let found = false;
|
|
258
|
-
|
|
259
|
-
|
|
276
|
+
const allCandidates = resolvedPath ? [resolvedPath, ...candidates] : candidates;
|
|
277
|
+
for (const path of allCandidates) {
|
|
278
|
+
if (_chunkZNJPRRIAcjs.isPathTraversal.call(void 0, path)) continue;
|
|
260
279
|
const file = zip.file(path);
|
|
261
280
|
if (!file) continue;
|
|
262
281
|
try {
|
|
263
282
|
const data = await file.async("uint8array");
|
|
264
283
|
decompressed.total += data.length;
|
|
265
|
-
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0,
|
|
266
|
-
const
|
|
284
|
+
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0, _chunkZNJPRRIAcjs.KordocError)("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
285
|
+
const actualPath = path;
|
|
286
|
+
const ext = actualPath.includes(".") ? actualPath.split(".").pop() || "png" : "png";
|
|
267
287
|
const mimeType = imageExtToMime(ext);
|
|
268
288
|
imageIndex++;
|
|
269
289
|
const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
|
|
@@ -273,7 +293,7 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
273
293
|
found = true;
|
|
274
294
|
break;
|
|
275
295
|
} catch (err) {
|
|
276
|
-
if (err instanceof
|
|
296
|
+
if (err instanceof _chunkZNJPRRIAcjs.KordocError) throw err;
|
|
277
297
|
}
|
|
278
298
|
}
|
|
279
299
|
if (!found) {
|
|
@@ -293,7 +313,7 @@ async function extractHwpxMetadata(zip, metadata, decompressed) {
|
|
|
293
313
|
const xml = await file.async("text");
|
|
294
314
|
if (decompressed) {
|
|
295
315
|
decompressed.total += xml.length * 2;
|
|
296
|
-
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0,
|
|
316
|
+
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new (0, _chunkZNJPRRIAcjs.KordocError)("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
297
317
|
}
|
|
298
318
|
parseDublinCoreMetadata(xml, metadata);
|
|
299
319
|
if (metadata.title || metadata.author) return;
|
|
@@ -303,7 +323,7 @@ async function extractHwpxMetadata(zip, metadata, decompressed) {
|
|
|
303
323
|
}
|
|
304
324
|
function parseDublinCoreMetadata(xml, metadata) {
|
|
305
325
|
const parser = createXmlParser();
|
|
306
|
-
const doc = parser.parseFromString(
|
|
326
|
+
const doc = parser.parseFromString(_chunkZNJPRRIAcjs.stripDtd.call(void 0, xml), "text/xml");
|
|
307
327
|
if (!doc.documentElement) return;
|
|
308
328
|
const getText = (tagNames) => {
|
|
309
329
|
for (const tag of tagNames) {
|
|
@@ -336,6 +356,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
336
356
|
let totalDecompressed = 0;
|
|
337
357
|
let entryCount = 0;
|
|
338
358
|
let sectionNum = 0;
|
|
359
|
+
const nestedTableCounter = { count: 0 };
|
|
339
360
|
while (pos < data.length - 30) {
|
|
340
361
|
if (data[pos] !== 80 || data[pos + 1] !== 75 || data[pos + 2] !== 3 || data[pos + 3] !== 4) {
|
|
341
362
|
pos++;
|
|
@@ -362,7 +383,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
362
383
|
}
|
|
363
384
|
const nameBytes = data.slice(pos + 30, pos + 30 + nameLen);
|
|
364
385
|
const name = new TextDecoder().decode(nameBytes);
|
|
365
|
-
if (
|
|
386
|
+
if (_chunkZNJPRRIAcjs.isPathTraversal.call(void 0, name)) {
|
|
366
387
|
pos = fileStart + compSize;
|
|
367
388
|
continue;
|
|
368
389
|
}
|
|
@@ -380,15 +401,15 @@ function extractFromBrokenZip(buffer) {
|
|
|
380
401
|
continue;
|
|
381
402
|
}
|
|
382
403
|
totalDecompressed += content.length * 2;
|
|
383
|
-
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new (0,
|
|
404
|
+
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new (0, _chunkZNJPRRIAcjs.KordocError)("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
|
|
384
405
|
sectionNum++;
|
|
385
|
-
blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum));
|
|
406
|
+
blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum, nestedTableCounter));
|
|
386
407
|
} catch (e6) {
|
|
387
408
|
continue;
|
|
388
409
|
}
|
|
389
410
|
}
|
|
390
|
-
if (blocks.length === 0) throw new (0,
|
|
391
|
-
const markdown =
|
|
411
|
+
if (blocks.length === 0) throw new (0, _chunkZNJPRRIAcjs.KordocError)("\uC190\uC0C1\uB41C HWPX\uC5D0\uC11C \uC139\uC158 \uB370\uC774\uD130\uB97C \uBCF5\uAD6C\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
412
|
+
const markdown = _chunkZNJPRRIAcjs.blocksToMarkdown.call(void 0, blocks);
|
|
392
413
|
return { markdown, blocks, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
393
414
|
}
|
|
394
415
|
async function resolveSectionPaths(zip) {
|
|
@@ -406,7 +427,7 @@ async function resolveSectionPaths(zip) {
|
|
|
406
427
|
}
|
|
407
428
|
function parseSectionPathsFromManifest(xml) {
|
|
408
429
|
const parser = createXmlParser();
|
|
409
|
-
const doc = parser.parseFromString(
|
|
430
|
+
const doc = parser.parseFromString(_chunkZNJPRRIAcjs.stripDtd.call(void 0, xml), "text/xml");
|
|
410
431
|
const items = doc.getElementsByTagName("opf:item");
|
|
411
432
|
const spine = doc.getElementsByTagName("opf:itemref");
|
|
412
433
|
const isSectionId = (id) => /^s/i.test(id) || id.toLowerCase().includes("section");
|
|
@@ -453,9 +474,9 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
453
474
|
let level = 0;
|
|
454
475
|
if (baseFontSize > 0 && _optionalChain([block, 'access', _15 => _15.style, 'optionalAccess', _16 => _16.fontSize])) {
|
|
455
476
|
const ratio = block.style.fontSize / baseFontSize;
|
|
456
|
-
if (ratio >=
|
|
457
|
-
else if (ratio >=
|
|
458
|
-
else if (ratio >=
|
|
477
|
+
if (ratio >= _chunkZNJPRRIAcjs.HEADING_RATIO_H1) level = 1;
|
|
478
|
+
else if (ratio >= _chunkZNJPRRIAcjs.HEADING_RATIO_H2) level = 2;
|
|
479
|
+
else if (ratio >= _chunkZNJPRRIAcjs.HEADING_RATIO_H3) level = 3;
|
|
459
480
|
}
|
|
460
481
|
const compactText = text.replace(/\s+/g, "");
|
|
461
482
|
if (/^제\d+[조장절편]/.test(compactText) && text.length <= 50) {
|
|
@@ -467,12 +488,40 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
467
488
|
}
|
|
468
489
|
}
|
|
469
490
|
}
|
|
470
|
-
function
|
|
491
|
+
function makeNestedTableMarker(counter, rows) {
|
|
492
|
+
counter.count++;
|
|
493
|
+
const firstRow = _nullishCoalesce(rows[0], () => ( []));
|
|
494
|
+
const hint = firstRow.map((c) => c.text.trim().replace(/\n/g, " ")).filter(Boolean).join(" | ");
|
|
495
|
+
const hintChars = [...hint];
|
|
496
|
+
const truncated = hintChars.length > 60 ? hintChars.slice(0, 60).join("") + "\u2026" : hint;
|
|
497
|
+
return truncated ? `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}: ${truncated}]` : `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}]`;
|
|
498
|
+
}
|
|
499
|
+
function handleNestedTable(newTable, tableStack, blocks, ctx) {
|
|
500
|
+
const parentTable = tableStack.pop();
|
|
501
|
+
let nestedCols = 0;
|
|
502
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
503
|
+
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
504
|
+
blocks.push({ type: "table", table: _chunkZNJPRRIAcjs.buildTable.call(void 0, newTable.rows), pageNumber: ctx.sectionNum });
|
|
505
|
+
if (parentTable.cell) {
|
|
506
|
+
const marker = ctx.counter ? makeNestedTableMarker(ctx.counter, newTable.rows) : "[\uC911\uCCA9 \uD14C\uC774\uBE14]";
|
|
507
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + marker;
|
|
508
|
+
}
|
|
509
|
+
} else {
|
|
510
|
+
const nestedText = _chunkZNJPRRIAcjs.convertTableToText.call(void 0, newTable.rows);
|
|
511
|
+
if (parentTable.cell) {
|
|
512
|
+
const marker = ctx.counter ? makeNestedTableMarker(ctx.counter, newTable.rows) : "[\uC911\uCCA9 \uD14C\uC774\uBE14]";
|
|
513
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + marker + "\n" + nestedText;
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
return parentTable;
|
|
517
|
+
}
|
|
518
|
+
function parseSectionXml(xml, styleMap, warnings, sectionNum, counter) {
|
|
471
519
|
const parser = createXmlParser(warnings);
|
|
472
|
-
const doc = parser.parseFromString(
|
|
520
|
+
const doc = parser.parseFromString(_chunkZNJPRRIAcjs.stripDtd.call(void 0, xml), "text/xml");
|
|
473
521
|
if (!doc.documentElement) return [];
|
|
474
522
|
const blocks = [];
|
|
475
|
-
|
|
523
|
+
const ctx = { styleMap, warnings, sectionNum, counter };
|
|
524
|
+
walkSection(doc.documentElement, blocks, null, [], ctx);
|
|
476
525
|
return blocks;
|
|
477
526
|
}
|
|
478
527
|
function extractImageRef(el) {
|
|
@@ -493,7 +542,7 @@ function extractImageRef(el) {
|
|
|
493
542
|
if (directRef) return directRef;
|
|
494
543
|
return null;
|
|
495
544
|
}
|
|
496
|
-
function walkSection(node, blocks, tableCtx, tableStack,
|
|
545
|
+
function walkSection(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
|
|
497
546
|
if (depth > MAX_XML_DEPTH) return;
|
|
498
547
|
const children = node.childNodes;
|
|
499
548
|
if (!children) return;
|
|
@@ -506,23 +555,12 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
506
555
|
case "tbl": {
|
|
507
556
|
if (tableCtx) tableStack.push(tableCtx);
|
|
508
557
|
const newTable = { rows: [], currentRow: [], cell: null };
|
|
509
|
-
walkSection(el, blocks, newTable, tableStack,
|
|
558
|
+
walkSection(el, blocks, newTable, tableStack, ctx, depth + 1);
|
|
510
559
|
if (newTable.rows.length > 0) {
|
|
511
560
|
if (tableStack.length > 0) {
|
|
512
|
-
|
|
513
|
-
let nestedCols = 0;
|
|
514
|
-
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
515
|
-
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
516
|
-
blocks.push({ type: "table", table: _chunkOJ4QR33Vcjs.buildTable.call(void 0, newTable.rows), pageNumber: sectionNum });
|
|
517
|
-
} else {
|
|
518
|
-
const nestedText = _chunkOJ4QR33Vcjs.convertTableToText.call(void 0, newTable.rows);
|
|
519
|
-
if (parentTable.cell) {
|
|
520
|
-
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
521
|
-
}
|
|
522
|
-
}
|
|
523
|
-
tableCtx = parentTable;
|
|
561
|
+
tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
|
|
524
562
|
} else {
|
|
525
|
-
blocks.push({ type: "table", table:
|
|
563
|
+
blocks.push({ type: "table", table: _chunkZNJPRRIAcjs.buildTable.call(void 0, newTable.rows), pageNumber: ctx.sectionNum });
|
|
526
564
|
tableCtx = null;
|
|
527
565
|
}
|
|
528
566
|
} else {
|
|
@@ -533,7 +571,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
533
571
|
case "tr":
|
|
534
572
|
if (tableCtx) {
|
|
535
573
|
tableCtx.currentRow = [];
|
|
536
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
574
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
537
575
|
if (tableCtx.currentRow.length > 0) tableCtx.rows.push(tableCtx.currentRow);
|
|
538
576
|
tableCtx.currentRow = [];
|
|
539
577
|
}
|
|
@@ -541,7 +579,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
541
579
|
case "tc":
|
|
542
580
|
if (tableCtx) {
|
|
543
581
|
tableCtx.cell = { text: "", colSpan: 1, rowSpan: 1 };
|
|
544
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
582
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
545
583
|
if (tableCtx.cell) {
|
|
546
584
|
tableCtx.currentRow.push(tableCtx.cell);
|
|
547
585
|
tableCtx.cell = null;
|
|
@@ -562,24 +600,24 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
562
600
|
const cs = isNaN(rawCs) ? 1 : rawCs;
|
|
563
601
|
const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
|
|
564
602
|
const rs = isNaN(rawRs) ? 1 : rawRs;
|
|
565
|
-
tableCtx.cell.colSpan = clampSpan(cs,
|
|
566
|
-
tableCtx.cell.rowSpan = clampSpan(rs,
|
|
603
|
+
tableCtx.cell.colSpan = clampSpan(cs, _chunkZNJPRRIAcjs.MAX_COLS);
|
|
604
|
+
tableCtx.cell.rowSpan = clampSpan(rs, _chunkZNJPRRIAcjs.MAX_ROWS);
|
|
567
605
|
}
|
|
568
606
|
break;
|
|
569
607
|
case "p": {
|
|
570
|
-
const { text, href, footnote, style } = extractParagraphInfo(el, styleMap);
|
|
608
|
+
const { text, href, footnote, style } = extractParagraphInfo(el, ctx.styleMap);
|
|
571
609
|
if (text) {
|
|
572
610
|
if (_optionalChain([tableCtx, 'optionalAccess', _19 => _19.cell])) {
|
|
573
611
|
tableCtx.cell.text += (tableCtx.cell.text ? "\n" : "") + text;
|
|
574
612
|
} else if (!tableCtx) {
|
|
575
|
-
const block = { type: "paragraph", text, pageNumber: sectionNum };
|
|
613
|
+
const block = { type: "paragraph", text, pageNumber: ctx.sectionNum };
|
|
576
614
|
if (style) block.style = style;
|
|
577
615
|
if (href) block.href = href;
|
|
578
616
|
if (footnote) block.footnoteText = footnote;
|
|
579
617
|
blocks.push(block);
|
|
580
618
|
}
|
|
581
619
|
}
|
|
582
|
-
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack,
|
|
620
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
583
621
|
break;
|
|
584
622
|
}
|
|
585
623
|
// 이미지/그림 — 경로 추출 또는 경고
|
|
@@ -588,19 +626,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
588
626
|
case "drawingObject": {
|
|
589
627
|
const imgRef = extractImageRef(el);
|
|
590
628
|
if (imgRef) {
|
|
591
|
-
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
592
|
-
} else if (warnings && sectionNum) {
|
|
593
|
-
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
629
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
|
|
630
|
+
} else if (ctx.warnings && ctx.sectionNum) {
|
|
631
|
+
ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
594
632
|
}
|
|
595
633
|
break;
|
|
596
634
|
}
|
|
597
635
|
default:
|
|
598
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
636
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
599
637
|
break;
|
|
600
638
|
}
|
|
601
639
|
}
|
|
602
640
|
}
|
|
603
|
-
function walkParagraphChildren(node, blocks, tableCtx, tableStack,
|
|
641
|
+
function walkParagraphChildren(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
|
|
604
642
|
if (depth > MAX_XML_DEPTH) return tableCtx;
|
|
605
643
|
const children = node.childNodes;
|
|
606
644
|
if (!children) return tableCtx;
|
|
@@ -616,23 +654,12 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
616
654
|
if (localTag === "tbl") {
|
|
617
655
|
if (tableCtx) tableStack.push(tableCtx);
|
|
618
656
|
const newTable = { rows: [], currentRow: [], cell: null };
|
|
619
|
-
walkSection(el, blocks, newTable, tableStack,
|
|
657
|
+
walkSection(el, blocks, newTable, tableStack, ctx, d + 1);
|
|
620
658
|
if (newTable.rows.length > 0) {
|
|
621
659
|
if (tableStack.length > 0) {
|
|
622
|
-
|
|
623
|
-
let nestedCols = 0;
|
|
624
|
-
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
625
|
-
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
626
|
-
blocks.push({ type: "table", table: _chunkOJ4QR33Vcjs.buildTable.call(void 0, newTable.rows), pageNumber: sectionNum });
|
|
627
|
-
} else {
|
|
628
|
-
const nestedText = _chunkOJ4QR33Vcjs.convertTableToText.call(void 0, newTable.rows);
|
|
629
|
-
if (parentTable.cell) {
|
|
630
|
-
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
631
|
-
}
|
|
632
|
-
}
|
|
633
|
-
tableCtx = parentTable;
|
|
660
|
+
tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
|
|
634
661
|
} else {
|
|
635
|
-
blocks.push({ type: "table", table:
|
|
662
|
+
blocks.push({ type: "table", table: _chunkZNJPRRIAcjs.buildTable.call(void 0, newTable.rows), pageNumber: ctx.sectionNum });
|
|
636
663
|
tableCtx = null;
|
|
637
664
|
}
|
|
638
665
|
} else {
|
|
@@ -641,21 +668,21 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
641
668
|
} else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
|
|
642
669
|
const drawTextChild = findDescendant(el, "drawText");
|
|
643
670
|
if (drawTextChild) {
|
|
644
|
-
extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
|
|
671
|
+
extractDrawTextBlocks(drawTextChild, blocks, ctx.styleMap, ctx.sectionNum);
|
|
645
672
|
} else {
|
|
646
673
|
const imgRef = extractImageRef(el);
|
|
647
674
|
if (imgRef) {
|
|
648
|
-
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
649
|
-
} else if (warnings && sectionNum) {
|
|
650
|
-
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
675
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
|
|
676
|
+
} else if (ctx.warnings && ctx.sectionNum) {
|
|
677
|
+
ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
651
678
|
}
|
|
652
679
|
}
|
|
653
680
|
} else if (localTag === "drawText") {
|
|
654
|
-
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
681
|
+
extractDrawTextBlocks(el, blocks, ctx.styleMap, ctx.sectionNum);
|
|
655
682
|
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
656
683
|
walkChildren(el, d + 1);
|
|
657
684
|
} else if (localTag === "run") {
|
|
658
|
-
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack,
|
|
685
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
659
686
|
}
|
|
660
687
|
}
|
|
661
688
|
};
|
|
@@ -740,7 +767,7 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
740
767
|
case "hyperlink": {
|
|
741
768
|
const url = child.getAttribute("url") || child.getAttribute("href") || "";
|
|
742
769
|
if (url) {
|
|
743
|
-
const safe =
|
|
770
|
+
const safe = _chunkZNJPRRIAcjs.sanitizeHref.call(void 0, url);
|
|
744
771
|
if (safe) href = safe;
|
|
745
772
|
}
|
|
746
773
|
walk(child);
|
|
@@ -880,7 +907,7 @@ function decompressStream(data) {
|
|
|
880
907
|
return _zlib.inflateRawSync.call(void 0, data, opts);
|
|
881
908
|
}
|
|
882
909
|
function parseFileHeader(data) {
|
|
883
|
-
if (data.length < 40) throw new (0,
|
|
910
|
+
if (data.length < 40) throw new (0, _chunkZNJPRRIAcjs.KordocError)("FileHeader\uAC00 \uB108\uBB34 \uC9E7\uC2B5\uB2C8\uB2E4 (\uCD5C\uC18C 40\uBC14\uC774\uD2B8)");
|
|
884
911
|
const sig = data.subarray(0, 32).toString("utf8").replace(/\0+$/, "");
|
|
885
912
|
return {
|
|
886
913
|
signature: sig,
|
|
@@ -1899,7 +1926,7 @@ function parseHwp5Document(buffer, options) {
|
|
|
1899
1926
|
lenientCfb = parseLenientCfb(buffer);
|
|
1900
1927
|
warnings.push({ message: "\uC190\uC0C1\uB41C CFB \uCEE8\uD14C\uC774\uB108 \u2014 lenient \uBAA8\uB4DC\uB85C \uBCF5\uAD6C", code: "LENIENT_CFB_RECOVERY" });
|
|
1901
1928
|
} catch (e11) {
|
|
1902
|
-
throw new (0,
|
|
1929
|
+
throw new (0, _chunkZNJPRRIAcjs.KordocError)("CFB \uCEE8\uD14C\uC774\uB108 \uD30C\uC2F1 \uC2E4\uD328 (strict \uBC0F lenient \uBAA8\uB450)");
|
|
1903
1930
|
}
|
|
1904
1931
|
}
|
|
1905
1932
|
const findStream = (path) => {
|
|
@@ -1910,11 +1937,11 @@ function parseHwp5Document(buffer, options) {
|
|
|
1910
1937
|
return lenientCfb.findStream(path);
|
|
1911
1938
|
};
|
|
1912
1939
|
const headerData = findStream("/FileHeader");
|
|
1913
|
-
if (!headerData) throw new (0,
|
|
1940
|
+
if (!headerData) throw new (0, _chunkZNJPRRIAcjs.KordocError)("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
|
|
1914
1941
|
const header = parseFileHeader(headerData);
|
|
1915
|
-
if (header.signature !== "HWP Document File") throw new (0,
|
|
1916
|
-
if (header.flags & FLAG_ENCRYPTED) throw new (0,
|
|
1917
|
-
if (header.flags & FLAG_DRM) throw new (0,
|
|
1942
|
+
if (header.signature !== "HWP Document File") throw new (0, _chunkZNJPRRIAcjs.KordocError)("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
|
|
1943
|
+
if (header.flags & FLAG_ENCRYPTED) throw new (0, _chunkZNJPRRIAcjs.KordocError)("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
1944
|
+
if (header.flags & FLAG_DRM) throw new (0, _chunkZNJPRRIAcjs.KordocError)("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
|
|
1918
1945
|
const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
|
|
1919
1946
|
const distribution = (header.flags & FLAG_DISTRIBUTION) !== 0;
|
|
1920
1947
|
const metadata = {
|
|
@@ -1923,11 +1950,12 @@ function parseHwp5Document(buffer, options) {
|
|
|
1923
1950
|
if (cfb) extractHwp5Metadata(cfb, metadata);
|
|
1924
1951
|
const docInfo = cfb ? parseDocInfoStream(cfb, compressed) : parseDocInfoFromStream(findStream("/DocInfo"), compressed);
|
|
1925
1952
|
const sections = distribution ? cfb ? findViewTextSections(cfb, compressed) : findViewTextSectionsLenient(lenientCfb, compressed) : cfb ? findSections(cfb) : findSectionsLenient(lenientCfb, compressed);
|
|
1926
|
-
if (sections.length === 0) throw new (0,
|
|
1953
|
+
if (sections.length === 0) throw new (0, _chunkZNJPRRIAcjs.KordocError)("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
1927
1954
|
metadata.pageCount = sections.length;
|
|
1928
1955
|
const pageFilter = _optionalChain([options, 'optionalAccess', _21 => _21.pages]) ? _chunkMUOQXDZ4cjs.parsePageRange.call(void 0, options.pages, sections.length) : null;
|
|
1929
1956
|
const totalTarget = pageFilter ? pageFilter.size : sections.length;
|
|
1930
1957
|
const blocks = [];
|
|
1958
|
+
const nestedTableCounter = { count: 0 };
|
|
1931
1959
|
let totalDecompressed = 0;
|
|
1932
1960
|
let parsedSections = 0;
|
|
1933
1961
|
for (let si = 0; si < sections.length; si++) {
|
|
@@ -1936,24 +1964,24 @@ function parseHwp5Document(buffer, options) {
|
|
|
1936
1964
|
const sectionData = sections[si];
|
|
1937
1965
|
const data = !distribution && compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
|
|
1938
1966
|
totalDecompressed += data.length;
|
|
1939
|
-
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0,
|
|
1967
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0, _chunkZNJPRRIAcjs.KordocError)("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
1940
1968
|
const records = readRecords(data);
|
|
1941
|
-
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
|
|
1969
|
+
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1, nestedTableCounter);
|
|
1942
1970
|
blocks.push(...sectionBlocks);
|
|
1943
1971
|
parsedSections++;
|
|
1944
1972
|
_optionalChain([options, 'optionalAccess', _22 => _22.onProgress, 'optionalCall', _23 => _23(parsedSections, totalTarget)]);
|
|
1945
1973
|
} catch (secErr) {
|
|
1946
|
-
if (secErr instanceof
|
|
1974
|
+
if (secErr instanceof _chunkZNJPRRIAcjs.KordocError) throw secErr;
|
|
1947
1975
|
warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
1948
1976
|
}
|
|
1949
1977
|
}
|
|
1950
1978
|
const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
|
|
1951
|
-
const flatBlocks =
|
|
1979
|
+
const flatBlocks = _chunkZNJPRRIAcjs.flattenLayoutTables.call(void 0, blocks);
|
|
1952
1980
|
if (docInfo) {
|
|
1953
1981
|
detectHwp5Headings(flatBlocks, docInfo);
|
|
1954
1982
|
}
|
|
1955
1983
|
const outline = flatBlocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
1956
|
-
const markdown =
|
|
1984
|
+
const markdown = _chunkZNJPRRIAcjs.blocksToMarkdown.call(void 0, flatBlocks);
|
|
1957
1985
|
return { markdown, blocks: flatBlocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
1958
1986
|
}
|
|
1959
1987
|
function parseDocInfoStream(cfb, compressed) {
|
|
@@ -2013,9 +2041,9 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
2013
2041
|
let level = 0;
|
|
2014
2042
|
if (_optionalChain([block, 'access', _28 => _28.style, 'optionalAccess', _29 => _29.fontSize]) && baseFontSize > 0) {
|
|
2015
2043
|
const ratio = block.style.fontSize / baseFontSize;
|
|
2016
|
-
if (ratio >=
|
|
2017
|
-
else if (ratio >=
|
|
2018
|
-
else if (ratio >=
|
|
2044
|
+
if (ratio >= _chunkZNJPRRIAcjs.HEADING_RATIO_H1) level = 1;
|
|
2045
|
+
else if (ratio >= _chunkZNJPRRIAcjs.HEADING_RATIO_H2) level = 2;
|
|
2046
|
+
else if (ratio >= _chunkZNJPRRIAcjs.HEADING_RATIO_H3) level = 3;
|
|
2019
2047
|
}
|
|
2020
2048
|
if (/^제\d+[장절편]\s/.test(text) && text.length <= 50) {
|
|
2021
2049
|
if (level === 0) level = 2;
|
|
@@ -2100,7 +2128,7 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2100
2128
|
if (!raw) break;
|
|
2101
2129
|
const content = compressed ? decompressStream(raw) : raw;
|
|
2102
2130
|
totalDecompressed += content.length;
|
|
2103
|
-
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0,
|
|
2131
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0, _chunkZNJPRRIAcjs.KordocError)("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2104
2132
|
sections.push({ idx: i, content });
|
|
2105
2133
|
}
|
|
2106
2134
|
if (sections.length === 0) {
|
|
@@ -2112,7 +2140,7 @@ function findSectionsLenient(lcfb, compressed) {
|
|
|
2112
2140
|
if (raw) {
|
|
2113
2141
|
const content = compressed ? decompressStream(raw) : raw;
|
|
2114
2142
|
totalDecompressed += content.length;
|
|
2115
|
-
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0,
|
|
2143
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0, _chunkZNJPRRIAcjs.KordocError)("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2116
2144
|
sections.push({ idx, content });
|
|
2117
2145
|
}
|
|
2118
2146
|
}
|
|
@@ -2129,7 +2157,7 @@ function findViewTextSectionsLenient(lcfb, compressed) {
|
|
|
2129
2157
|
try {
|
|
2130
2158
|
const content = decryptViewText(raw, compressed);
|
|
2131
2159
|
totalDecompressed += content.length;
|
|
2132
|
-
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0,
|
|
2160
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new (0, _chunkZNJPRRIAcjs.KordocError)("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
2133
2161
|
sections.push({ idx: i, content });
|
|
2134
2162
|
} catch (e16) {
|
|
2135
2163
|
break;
|
|
@@ -2258,13 +2286,13 @@ function extractHwp5ImagesLenient(lcfb, blocks, compressed, warnings) {
|
|
|
2258
2286
|
}
|
|
2259
2287
|
return images;
|
|
2260
2288
|
}
|
|
2261
|
-
function parseSection(records, docInfo, warnings, sectionNum) {
|
|
2289
|
+
function parseSection(records, docInfo, warnings, sectionNum, counter) {
|
|
2262
2290
|
const blocks = [];
|
|
2263
2291
|
let i = 0;
|
|
2264
2292
|
while (i < records.length) {
|
|
2265
2293
|
const rec = records[i];
|
|
2266
2294
|
if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
|
|
2267
|
-
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
|
|
2295
|
+
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i, counter);
|
|
2268
2296
|
if (paragraph) {
|
|
2269
2297
|
const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
|
|
2270
2298
|
if (docInfo && charShapeIds.length > 0) {
|
|
@@ -2287,7 +2315,7 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2287
2315
|
if (rec.tagId === TAG_CTRL_HEADER && rec.level <= 1 && rec.data.length >= 4) {
|
|
2288
2316
|
const ctrlId = rec.data.subarray(0, 4).toString("ascii");
|
|
2289
2317
|
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2290
|
-
const { table, nextIdx } = parseTableBlock(records, i);
|
|
2318
|
+
const { table, nextIdx } = parseTableBlock(records, i, counter);
|
|
2291
2319
|
if (table) blocks.push({ type: "table", table, pageNumber: sectionNum });
|
|
2292
2320
|
i = nextIdx;
|
|
2293
2321
|
continue;
|
|
@@ -2317,7 +2345,7 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2317
2345
|
if (url && blocks.length > 0) {
|
|
2318
2346
|
const lastBlock = blocks[blocks.length - 1];
|
|
2319
2347
|
if (lastBlock.type === "paragraph" && !lastBlock.href) {
|
|
2320
|
-
lastBlock.href = _nullishCoalesce(
|
|
2348
|
+
lastBlock.href = _nullishCoalesce(_chunkZNJPRRIAcjs.sanitizeHref.call(void 0, url), () => ( void 0));
|
|
2321
2349
|
}
|
|
2322
2350
|
}
|
|
2323
2351
|
}
|
|
@@ -2392,7 +2420,7 @@ function resolveCharStyle(charShapeIds, docInfo) {
|
|
|
2392
2420
|
if (cs.attrFlags & 2) style.bold = true;
|
|
2393
2421
|
return style.fontSize || style.bold || style.italic ? style : void 0;
|
|
2394
2422
|
}
|
|
2395
|
-
function parseParagraphWithTables(records, startIdx) {
|
|
2423
|
+
function parseParagraphWithTables(records, startIdx, counter) {
|
|
2396
2424
|
const startLevel = records[startIdx].level;
|
|
2397
2425
|
let text = "";
|
|
2398
2426
|
const tables = [];
|
|
@@ -2414,7 +2442,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2414
2442
|
if (rec.tagId === TAG_CTRL_HEADER && rec.data.length >= 4) {
|
|
2415
2443
|
const ctrlId = rec.data.subarray(0, 4).toString("ascii");
|
|
2416
2444
|
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2417
|
-
const { table, nextIdx } = parseTableBlock(records, i);
|
|
2445
|
+
const { table, nextIdx } = parseTableBlock(records, i, counter);
|
|
2418
2446
|
if (table) tables.push(table);
|
|
2419
2447
|
i = nextIdx;
|
|
2420
2448
|
continue;
|
|
@@ -2425,7 +2453,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2425
2453
|
const trimmed = text.trim();
|
|
2426
2454
|
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
|
|
2427
2455
|
}
|
|
2428
|
-
function parseTableBlock(records, startIdx) {
|
|
2456
|
+
function parseTableBlock(records, startIdx, counter) {
|
|
2429
2457
|
const tableLevel = records[startIdx].level;
|
|
2430
2458
|
let i = startIdx + 1;
|
|
2431
2459
|
let rows = 0, cols = 0;
|
|
@@ -2435,11 +2463,11 @@ function parseTableBlock(records, startIdx) {
|
|
|
2435
2463
|
if (rec.tagId === TAG_PARA_HEADER && rec.level <= tableLevel) break;
|
|
2436
2464
|
if (rec.tagId === TAG_CTRL_HEADER && rec.level <= tableLevel) break;
|
|
2437
2465
|
if (rec.tagId === TAG_TABLE && rec.data.length >= 8) {
|
|
2438
|
-
rows = Math.min(rec.data.readUInt16LE(4),
|
|
2439
|
-
cols = Math.min(rec.data.readUInt16LE(6),
|
|
2466
|
+
rows = Math.min(rec.data.readUInt16LE(4), _chunkZNJPRRIAcjs.MAX_ROWS);
|
|
2467
|
+
cols = Math.min(rec.data.readUInt16LE(6), _chunkZNJPRRIAcjs.MAX_COLS);
|
|
2440
2468
|
}
|
|
2441
2469
|
if (rec.tagId === TAG_LIST_HEADER) {
|
|
2442
|
-
const { cell, nextIdx } = parseCellBlock(records, i, tableLevel);
|
|
2470
|
+
const { cell, nextIdx } = parseCellBlock(records, i, tableLevel, counter);
|
|
2443
2471
|
if (cell) cells.push(cell);
|
|
2444
2472
|
i = nextIdx;
|
|
2445
2473
|
continue;
|
|
@@ -2458,9 +2486,9 @@ function parseTableBlock(records, startIdx) {
|
|
|
2458
2486
|
return { table: { rows, cols, cells: irCells, hasHeader: rows > 1 }, nextIdx: i };
|
|
2459
2487
|
}
|
|
2460
2488
|
const cellRows = arrangeCells(rows, cols, cells);
|
|
2461
|
-
return { table:
|
|
2489
|
+
return { table: _chunkZNJPRRIAcjs.buildTable.call(void 0, cellRows), nextIdx: i };
|
|
2462
2490
|
}
|
|
2463
|
-
function parseCellBlock(records, startIdx, tableLevel) {
|
|
2491
|
+
function parseCellBlock(records, startIdx, tableLevel, counter) {
|
|
2464
2492
|
const rec = records[startIdx];
|
|
2465
2493
|
const cellLevel = rec.level;
|
|
2466
2494
|
const texts = [];
|
|
@@ -2473,8 +2501,8 @@ function parseCellBlock(records, startIdx, tableLevel) {
|
|
|
2473
2501
|
rowAddr = rec.data.readUInt16LE(10);
|
|
2474
2502
|
const cs = rec.data.readUInt16LE(12);
|
|
2475
2503
|
const rs = rec.data.readUInt16LE(14);
|
|
2476
|
-
if (cs > 0) colSpan = Math.min(cs,
|
|
2477
|
-
if (rs > 0) rowSpan = Math.min(rs,
|
|
2504
|
+
if (cs > 0) colSpan = Math.min(cs, _chunkZNJPRRIAcjs.MAX_COLS);
|
|
2505
|
+
if (rs > 0) rowSpan = Math.min(rs, _chunkZNJPRRIAcjs.MAX_ROWS);
|
|
2478
2506
|
}
|
|
2479
2507
|
let i = startIdx + 1;
|
|
2480
2508
|
while (i < records.length) {
|
|
@@ -2485,6 +2513,17 @@ function parseCellBlock(records, startIdx, tableLevel) {
|
|
|
2485
2513
|
const t = extractText(r.data).trim();
|
|
2486
2514
|
if (t) texts.push(t);
|
|
2487
2515
|
}
|
|
2516
|
+
if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
|
|
2517
|
+
const ctrlId = r.data.subarray(0, 4).toString("ascii");
|
|
2518
|
+
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2519
|
+
if (counter) {
|
|
2520
|
+
counter.count++;
|
|
2521
|
+
texts.push(`[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}]`);
|
|
2522
|
+
} else {
|
|
2523
|
+
texts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
|
|
2524
|
+
}
|
|
2525
|
+
}
|
|
2526
|
+
}
|
|
2488
2527
|
i++;
|
|
2489
2528
|
}
|
|
2490
2529
|
return { cell: { text: texts.join("\n"), colSpan, rowSpan, colAddr, rowAddr }, nextIdx: i };
|
|
@@ -2565,7 +2604,7 @@ function getTextContent(el) {
|
|
|
2565
2604
|
return _nullishCoalesce(_optionalChain([el, 'access', _37 => _37.textContent, 'optionalAccess', _38 => _38.trim, 'call', _39 => _39()]), () => ( ""));
|
|
2566
2605
|
}
|
|
2567
2606
|
function parseXml(text) {
|
|
2568
|
-
return new (0, _xmldom.DOMParser)().parseFromString(
|
|
2607
|
+
return new (0, _xmldom.DOMParser)().parseFromString(_chunkZNJPRRIAcjs.stripDtd.call(void 0, text), "text/xml");
|
|
2569
2608
|
}
|
|
2570
2609
|
function parseSharedStrings(xml) {
|
|
2571
2610
|
const doc = parseXml(xml);
|
|
@@ -2709,7 +2748,7 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
2709
2748
|
cellRows.push(row);
|
|
2710
2749
|
}
|
|
2711
2750
|
if (cellRows.length > 0) {
|
|
2712
|
-
const table =
|
|
2751
|
+
const table = _chunkZNJPRRIAcjs.buildTable.call(void 0, cellRows);
|
|
2713
2752
|
if (table.rows > 0) {
|
|
2714
2753
|
blocks.push({ type: "table", table, pageNumber: sheetIndex + 1 });
|
|
2715
2754
|
}
|
|
@@ -2717,12 +2756,12 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
2717
2756
|
return blocks;
|
|
2718
2757
|
}
|
|
2719
2758
|
async function parseXlsxDocument(buffer, options) {
|
|
2720
|
-
|
|
2759
|
+
_chunkZNJPRRIAcjs.precheckZipSize.call(void 0, buffer, MAX_DECOMPRESS_SIZE3);
|
|
2721
2760
|
const zip = await _jszip2.default.loadAsync(buffer);
|
|
2722
2761
|
const warnings = [];
|
|
2723
2762
|
const workbookFile = zip.file("xl/workbook.xml");
|
|
2724
2763
|
if (!workbookFile) {
|
|
2725
|
-
throw new (0,
|
|
2764
|
+
throw new (0, _chunkZNJPRRIAcjs.KordocError)("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
2726
2765
|
}
|
|
2727
2766
|
let sharedStrings = [];
|
|
2728
2767
|
const ssFile = zip.file("xl/sharedStrings.xml");
|
|
@@ -2731,7 +2770,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
2731
2770
|
}
|
|
2732
2771
|
const sheets = parseWorkbook(await workbookFile.async("text"));
|
|
2733
2772
|
if (sheets.length === 0) {
|
|
2734
|
-
throw new (0,
|
|
2773
|
+
throw new (0, _chunkZNJPRRIAcjs.KordocError)("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
2735
2774
|
}
|
|
2736
2775
|
let relsMap = /* @__PURE__ */ new Map();
|
|
2737
2776
|
const relsFile = zip.file("xl/_rels/workbook.xml.rels");
|
|
@@ -2803,7 +2842,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
2803
2842
|
} catch (e20) {
|
|
2804
2843
|
}
|
|
2805
2844
|
}
|
|
2806
|
-
const markdown =
|
|
2845
|
+
const markdown = _chunkZNJPRRIAcjs.blocksToMarkdown.call(void 0, blocks);
|
|
2807
2846
|
return { markdown, blocks, metadata, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
2808
2847
|
}
|
|
2809
2848
|
|
|
@@ -2811,21 +2850,21 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
2811
2850
|
|
|
2812
2851
|
|
|
2813
2852
|
var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
|
|
2814
|
-
function getChildElements(parent,
|
|
2853
|
+
function getChildElements(parent, localName3) {
|
|
2815
2854
|
const result = [];
|
|
2816
2855
|
const children = parent.childNodes;
|
|
2817
2856
|
for (let i = 0; i < children.length; i++) {
|
|
2818
2857
|
const node = children[i];
|
|
2819
2858
|
if (node.nodeType === 1) {
|
|
2820
2859
|
const el = node;
|
|
2821
|
-
if (el.localName ===
|
|
2860
|
+
if (el.localName === localName3 || _optionalChain([el, 'access', _45 => _45.tagName, 'optionalAccess', _46 => _46.endsWith, 'call', _47 => _47(`:${localName3}`)])) {
|
|
2822
2861
|
result.push(el);
|
|
2823
2862
|
}
|
|
2824
2863
|
}
|
|
2825
2864
|
}
|
|
2826
2865
|
return result;
|
|
2827
2866
|
}
|
|
2828
|
-
function findElements(parent,
|
|
2867
|
+
function findElements(parent, localName3) {
|
|
2829
2868
|
const result = [];
|
|
2830
2869
|
const walk = (node) => {
|
|
2831
2870
|
const children = node.childNodes;
|
|
@@ -2833,7 +2872,7 @@ function findElements(parent, localName2) {
|
|
|
2833
2872
|
const child = children[i];
|
|
2834
2873
|
if (child.nodeType === 1) {
|
|
2835
2874
|
const el = child;
|
|
2836
|
-
if (el.localName ===
|
|
2875
|
+
if (el.localName === localName3 || _optionalChain([el, 'access', _48 => _48.tagName, 'optionalAccess', _49 => _49.endsWith, 'call', _50 => _50(`:${localName3}`)])) {
|
|
2837
2876
|
result.push(el);
|
|
2838
2877
|
}
|
|
2839
2878
|
walk(el);
|
|
@@ -2843,16 +2882,16 @@ function findElements(parent, localName2) {
|
|
|
2843
2882
|
walk(parent);
|
|
2844
2883
|
return result;
|
|
2845
2884
|
}
|
|
2846
|
-
function getAttr(el,
|
|
2885
|
+
function getAttr(el, localName3) {
|
|
2847
2886
|
const attrs = el.attributes;
|
|
2848
2887
|
for (let i = 0; i < attrs.length; i++) {
|
|
2849
2888
|
const attr = attrs[i];
|
|
2850
|
-
if (attr.localName ===
|
|
2889
|
+
if (attr.localName === localName3 || attr.name === localName3) return attr.value;
|
|
2851
2890
|
}
|
|
2852
2891
|
return null;
|
|
2853
2892
|
}
|
|
2854
2893
|
function parseXml2(text) {
|
|
2855
|
-
return new (0, _xmldom.DOMParser)().parseFromString(
|
|
2894
|
+
return new (0, _xmldom.DOMParser)().parseFromString(_chunkZNJPRRIAcjs.stripDtd.call(void 0, text), "text/xml");
|
|
2856
2895
|
}
|
|
2857
2896
|
function parseStyles(xml) {
|
|
2858
2897
|
const doc = parseXml2(xml);
|
|
@@ -3145,12 +3184,12 @@ async function extractImages(zip, rels, doc) {
|
|
|
3145
3184
|
return { blocks, images };
|
|
3146
3185
|
}
|
|
3147
3186
|
async function parseDocxDocument(buffer, options) {
|
|
3148
|
-
|
|
3187
|
+
_chunkZNJPRRIAcjs.precheckZipSize.call(void 0, buffer, MAX_DECOMPRESS_SIZE4);
|
|
3149
3188
|
const zip = await _jszip2.default.loadAsync(buffer);
|
|
3150
3189
|
const warnings = [];
|
|
3151
3190
|
const docFile = zip.file("word/document.xml");
|
|
3152
3191
|
if (!docFile) {
|
|
3153
|
-
throw new (0,
|
|
3192
|
+
throw new (0, _chunkZNJPRRIAcjs.KordocError)("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
3154
3193
|
}
|
|
3155
3194
|
let rels = /* @__PURE__ */ new Map();
|
|
3156
3195
|
const relsFile = zip.file("word/_rels/document.xml.rels");
|
|
@@ -3185,7 +3224,7 @@ async function parseDocxDocument(buffer, options) {
|
|
|
3185
3224
|
const doc = parseXml2(docXml);
|
|
3186
3225
|
const body = findElements(doc, "body");
|
|
3187
3226
|
if (body.length === 0) {
|
|
3188
|
-
throw new (0,
|
|
3227
|
+
throw new (0, _chunkZNJPRRIAcjs.KordocError)("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
3189
3228
|
}
|
|
3190
3229
|
const blocks = [];
|
|
3191
3230
|
const bodyEl = body[0];
|
|
@@ -3194,11 +3233,11 @@ async function parseDocxDocument(buffer, options) {
|
|
|
3194
3233
|
const node = children[i];
|
|
3195
3234
|
if (node.nodeType !== 1) continue;
|
|
3196
3235
|
const el = node;
|
|
3197
|
-
const
|
|
3198
|
-
if (
|
|
3236
|
+
const localName3 = _nullishCoalesce(el.localName, () => ( _optionalChain([el, 'access', _65 => _65.tagName, 'optionalAccess', _66 => _66.split, 'call', _67 => _67(":"), 'access', _68 => _68.pop, 'call', _69 => _69()])));
|
|
3237
|
+
if (localName3 === "p") {
|
|
3199
3238
|
const block = parseParagraph(el, styles, numbering, footnotes, rels);
|
|
3200
3239
|
if (block) blocks.push(block);
|
|
3201
|
-
} else if (
|
|
3240
|
+
} else if (localName3 === "tbl") {
|
|
3202
3241
|
const block = parseTable(el, styles, numbering, footnotes, rels);
|
|
3203
3242
|
if (block) blocks.push(block);
|
|
3204
3243
|
}
|
|
@@ -3225,7 +3264,7 @@ async function parseDocxDocument(buffer, options) {
|
|
|
3225
3264
|
}
|
|
3226
3265
|
}
|
|
3227
3266
|
const outline = blocks.filter((b) => b.type === "heading").map((b) => ({ level: _nullishCoalesce(b.level, () => ( 2)), text: _nullishCoalesce(b.text, () => ( "")) }));
|
|
3228
|
-
const markdown =
|
|
3267
|
+
const markdown = _chunkZNJPRRIAcjs.blocksToMarkdown.call(void 0, blocks);
|
|
3229
3268
|
return {
|
|
3230
3269
|
markdown,
|
|
3231
3270
|
blocks,
|
|
@@ -3236,6 +3275,259 @@ async function parseDocxDocument(buffer, options) {
|
|
|
3236
3275
|
};
|
|
3237
3276
|
}
|
|
3238
3277
|
|
|
3278
|
+
// src/hwpml/parser.ts
|
|
3279
|
+
|
|
3280
|
+
var MAX_XML_DEPTH2 = 200;
|
|
3281
|
+
var MAX_TABLE_ROWS = 5e3;
|
|
3282
|
+
var MAX_TABLE_COLS = 500;
|
|
3283
|
+
var MAX_HWPML_BYTES = 50 * 1024 * 1024;
|
|
3284
|
+
function parseHwpmlDocument(buffer, options) {
|
|
3285
|
+
if (buffer.byteLength > MAX_HWPML_BYTES) {
|
|
3286
|
+
throw new Error(`HWPML \uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC (${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB > 50MB)`);
|
|
3287
|
+
}
|
|
3288
|
+
const text = new TextDecoder("utf-8").decode(buffer).replace(/^\uFEFF/, "");
|
|
3289
|
+
const normalized = text.replace(/ /g, " ");
|
|
3290
|
+
const xml = _chunkZNJPRRIAcjs.stripDtd.call(void 0, normalized);
|
|
3291
|
+
const warnings = [];
|
|
3292
|
+
const parser = new (0, _xmldom.DOMParser)({
|
|
3293
|
+
onError: (_level, msg) => {
|
|
3294
|
+
warnings.push({ message: `HWPML XML \uD30C\uC2F1 \uACBD\uACE0: ${msg}`, code: "MALFORMED_XML" });
|
|
3295
|
+
}
|
|
3296
|
+
});
|
|
3297
|
+
const doc = parser.parseFromString(xml, "text/xml");
|
|
3298
|
+
if (!doc.documentElement) {
|
|
3299
|
+
return { markdown: "", blocks: [], warnings };
|
|
3300
|
+
}
|
|
3301
|
+
const root = doc.documentElement;
|
|
3302
|
+
const metadata = {};
|
|
3303
|
+
const docSummary = findChild(root, "DOCSUMMARY");
|
|
3304
|
+
if (docSummary) {
|
|
3305
|
+
const title = findChild(docSummary, "TITLE");
|
|
3306
|
+
const author = findChild(docSummary, "AUTHOR");
|
|
3307
|
+
const date = findChild(docSummary, "DATE");
|
|
3308
|
+
if (title) metadata.title = textContent(title).trim();
|
|
3309
|
+
if (author) metadata.author = textContent(author).trim();
|
|
3310
|
+
if (date) metadata.createdAt = textContent(date).trim() || void 0;
|
|
3311
|
+
}
|
|
3312
|
+
const paraShapeMap = buildParaShapeMap(root);
|
|
3313
|
+
const body = findChild(root, "BODY");
|
|
3314
|
+
if (!body) {
|
|
3315
|
+
return { markdown: "", blocks: [], metadata, warnings };
|
|
3316
|
+
}
|
|
3317
|
+
const blocks = [];
|
|
3318
|
+
const pageFilter = _optionalChain([options, 'optionalAccess', _70 => _70.pages]) ? _chunkMUOQXDZ4cjs.parsePageRange.call(void 0, options.pages, countSections(body)) : null;
|
|
3319
|
+
let sectionIdx = 0;
|
|
3320
|
+
const children = body.childNodes;
|
|
3321
|
+
for (let i = 0; i < children.length; i++) {
|
|
3322
|
+
const el = children[i];
|
|
3323
|
+
if (el.nodeType !== 1) continue;
|
|
3324
|
+
if (localName(el) !== "SECTION") continue;
|
|
3325
|
+
sectionIdx++;
|
|
3326
|
+
if (pageFilter && !pageFilter.has(sectionIdx)) continue;
|
|
3327
|
+
parseSection2(el, blocks, paraShapeMap, sectionIdx, warnings);
|
|
3328
|
+
}
|
|
3329
|
+
const outline = blocks.filter((b) => b.type === "heading" && b.text).map((b) => ({ level: _nullishCoalesce(b.level, () => ( 1)), text: b.text, pageNumber: b.pageNumber }));
|
|
3330
|
+
const markdown = _chunkZNJPRRIAcjs.blocksToMarkdown.call(void 0, blocks);
|
|
3331
|
+
return {
|
|
3332
|
+
markdown,
|
|
3333
|
+
blocks,
|
|
3334
|
+
metadata: Object.keys(metadata).length > 0 ? metadata : void 0,
|
|
3335
|
+
outline: outline.length > 0 ? outline : void 0,
|
|
3336
|
+
warnings: warnings.length > 0 ? warnings : void 0
|
|
3337
|
+
};
|
|
3338
|
+
}
|
|
3339
|
+
function buildParaShapeMap(root) {
|
|
3340
|
+
const map = /* @__PURE__ */ new Map();
|
|
3341
|
+
const head = findChild(root, "HEAD");
|
|
3342
|
+
if (!head) return map;
|
|
3343
|
+
const mappingTable = findChild(head, "MAPPINGTABLE");
|
|
3344
|
+
if (!mappingTable) return map;
|
|
3345
|
+
const paraShapeList = findChild(mappingTable, "PARASHAPELIST");
|
|
3346
|
+
if (!paraShapeList) return map;
|
|
3347
|
+
const children = paraShapeList.childNodes;
|
|
3348
|
+
for (let i = 0; i < children.length; i++) {
|
|
3349
|
+
const el = children[i];
|
|
3350
|
+
if (el.nodeType !== 1 || localName(el) !== "PARASHAPE") continue;
|
|
3351
|
+
const id = _nullishCoalesce(el.getAttribute("Id"), () => ( ""));
|
|
3352
|
+
const headingType = _nullishCoalesce(el.getAttribute("HeadingType"), () => ( "None"));
|
|
3353
|
+
const level = parseInt(_nullishCoalesce(el.getAttribute("Level"), () => ( "0")), 10);
|
|
3354
|
+
let headingLevel = null;
|
|
3355
|
+
if (headingType === "Outline") {
|
|
3356
|
+
const safeLevel = isNaN(level) ? 0 : Math.max(0, level);
|
|
3357
|
+
headingLevel = Math.min(safeLevel + 1, 6);
|
|
3358
|
+
}
|
|
3359
|
+
map.set(id, { headingLevel });
|
|
3360
|
+
}
|
|
3361
|
+
return map;
|
|
3362
|
+
}
|
|
3363
|
+
function parseSection2(section, blocks, paraShapeMap, sectionNum, warnings) {
|
|
3364
|
+
walkContent(section, blocks, paraShapeMap, sectionNum, warnings, false);
|
|
3365
|
+
}
|
|
3366
|
+
function walkContent(node, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth = 0) {
|
|
3367
|
+
if (depth > MAX_XML_DEPTH2) return;
|
|
3368
|
+
const children = node.childNodes;
|
|
3369
|
+
for (let i = 0; i < children.length; i++) {
|
|
3370
|
+
const el = children[i];
|
|
3371
|
+
if (el.nodeType !== 1) continue;
|
|
3372
|
+
const tag = localName(el);
|
|
3373
|
+
if (tag === "HEADER" || tag === "FOOTER") {
|
|
3374
|
+
continue;
|
|
3375
|
+
}
|
|
3376
|
+
if (tag === "P") {
|
|
3377
|
+
if (!inHeaderFooter) {
|
|
3378
|
+
parseParagraph2(el, blocks, paraShapeMap, sectionNum);
|
|
3379
|
+
}
|
|
3380
|
+
continue;
|
|
3381
|
+
}
|
|
3382
|
+
if (tag === "TABLE") {
|
|
3383
|
+
if (!inHeaderFooter) {
|
|
3384
|
+
parseTable2(el, blocks, paraShapeMap, sectionNum, warnings);
|
|
3385
|
+
}
|
|
3386
|
+
continue;
|
|
3387
|
+
}
|
|
3388
|
+
if (tag === "PARALIST" || tag === "SECTION" || tag === "COLDEF") {
|
|
3389
|
+
walkContent(el, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth + 1);
|
|
3390
|
+
continue;
|
|
3391
|
+
}
|
|
3392
|
+
walkContent(el, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth + 1);
|
|
3393
|
+
}
|
|
3394
|
+
}
|
|
3395
|
+
function parseParagraph2(el, blocks, paraShapeMap, sectionNum) {
|
|
3396
|
+
const paraShapeId = _nullishCoalesce(el.getAttribute("ParaShape"), () => ( ""));
|
|
3397
|
+
const shapeInfo = paraShapeMap.get(paraShapeId);
|
|
3398
|
+
const text = extractParagraphText(el);
|
|
3399
|
+
if (!text) return;
|
|
3400
|
+
if (_optionalChain([shapeInfo, 'optionalAccess', _71 => _71.headingLevel]) != null) {
|
|
3401
|
+
blocks.push({ type: "heading", text, level: shapeInfo.headingLevel, pageNumber: sectionNum });
|
|
3402
|
+
} else {
|
|
3403
|
+
blocks.push({ type: "paragraph", text, pageNumber: sectionNum });
|
|
3404
|
+
}
|
|
3405
|
+
}
|
|
3406
|
+
function extractParagraphText(p) {
|
|
3407
|
+
const parts = [];
|
|
3408
|
+
collectCharText(p, parts);
|
|
3409
|
+
return parts.join("").trim();
|
|
3410
|
+
}
|
|
3411
|
+
function collectCharText(node, parts, depth = 0) {
|
|
3412
|
+
if (depth > MAX_XML_DEPTH2) return;
|
|
3413
|
+
const children = node.childNodes;
|
|
3414
|
+
for (let i = 0; i < children.length; i++) {
|
|
3415
|
+
const el = children[i];
|
|
3416
|
+
if (el.nodeType !== 1) continue;
|
|
3417
|
+
const tag = localName(el);
|
|
3418
|
+
if (tag === "CHAR") {
|
|
3419
|
+
const t = textContent(el);
|
|
3420
|
+
if (t) parts.push(t);
|
|
3421
|
+
} else if (tag === "TABLE" || tag === "PICTURE" || tag === "SHAPEOBJECT") {
|
|
3422
|
+
} else if (tag === "AUTONUM") {
|
|
3423
|
+
} else {
|
|
3424
|
+
collectCharText(el, parts, depth + 1);
|
|
3425
|
+
}
|
|
3426
|
+
}
|
|
3427
|
+
}
|
|
3428
|
+
function parseTable2(el, blocks, paraShapeMap, sectionNum, warnings) {
|
|
3429
|
+
const cells = [];
|
|
3430
|
+
const rowCount = parseInt(_nullishCoalesce(el.getAttribute("RowCount"), () => ( "0")), 10);
|
|
3431
|
+
const colCount = parseInt(_nullishCoalesce(el.getAttribute("ColCount"), () => ( "0")), 10);
|
|
3432
|
+
if (isNaN(rowCount) || isNaN(colCount) || rowCount === 0 || colCount === 0) return;
|
|
3433
|
+
if (rowCount > MAX_TABLE_ROWS || colCount > MAX_TABLE_COLS) {
|
|
3434
|
+
warnings.push({ message: `\uD14C\uC774\uBE14 \uD06C\uAE30 \uCD08\uACFC (${rowCount}x${colCount}) \u2014 \uC2A4\uD0B5`, code: "TRUNCATED_TABLE" });
|
|
3435
|
+
return;
|
|
3436
|
+
}
|
|
3437
|
+
const children = el.childNodes;
|
|
3438
|
+
for (let i = 0; i < children.length; i++) {
|
|
3439
|
+
const rowEl = children[i];
|
|
3440
|
+
if (rowEl.nodeType !== 1 || localName(rowEl) !== "ROW") continue;
|
|
3441
|
+
const rowCells = rowEl.childNodes;
|
|
3442
|
+
for (let j = 0; j < rowCells.length; j++) {
|
|
3443
|
+
const cellEl = rowCells[j];
|
|
3444
|
+
if (cellEl.nodeType !== 1 || localName(cellEl) !== "CELL") continue;
|
|
3445
|
+
const colAddr = parseInt(_nullishCoalesce(cellEl.getAttribute("ColAddr"), () => ( "0")), 10);
|
|
3446
|
+
const rowAddr = parseInt(_nullishCoalesce(cellEl.getAttribute("RowAddr"), () => ( "0")), 10);
|
|
3447
|
+
const colSpan = Math.min(Math.max(1, parseInt(_nullishCoalesce(cellEl.getAttribute("ColSpan"), () => ( "1")), 10) || 1), MAX_TABLE_COLS);
|
|
3448
|
+
const rowSpan = Math.min(Math.max(1, parseInt(_nullishCoalesce(cellEl.getAttribute("RowSpan"), () => ( "1")), 10) || 1), MAX_TABLE_ROWS);
|
|
3449
|
+
const cellText = extractCellText(cellEl);
|
|
3450
|
+
cells.push({ text: cellText, colSpan, rowSpan, colAddr, rowAddr });
|
|
3451
|
+
}
|
|
3452
|
+
}
|
|
3453
|
+
if (cells.length === 0) return;
|
|
3454
|
+
const grid = Array.from({ length: rowCount }, () => Array(colCount).fill(null));
|
|
3455
|
+
for (const cell of cells) {
|
|
3456
|
+
const r = _nullishCoalesce(cell.rowAddr, () => ( 0));
|
|
3457
|
+
const c = _nullishCoalesce(cell.colAddr, () => ( 0));
|
|
3458
|
+
if (isNaN(r) || isNaN(c) || r >= rowCount || c >= colCount) continue;
|
|
3459
|
+
grid[r][c] = cell;
|
|
3460
|
+
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
3461
|
+
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
3462
|
+
if (dr === 0 && dc === 0) continue;
|
|
3463
|
+
if (r + dr < rowCount && c + dc < colCount) {
|
|
3464
|
+
grid[r + dr][c + dc] = { text: "", colSpan: 1, rowSpan: 1 };
|
|
3465
|
+
}
|
|
3466
|
+
}
|
|
3467
|
+
}
|
|
3468
|
+
}
|
|
3469
|
+
const cellRows = grid.map(
|
|
3470
|
+
(row) => row.map((cell) => _nullishCoalesce(cell, () => ( { text: "", colSpan: 1, rowSpan: 1 })))
|
|
3471
|
+
);
|
|
3472
|
+
const table = _chunkZNJPRRIAcjs.buildTable.call(void 0, cellRows);
|
|
3473
|
+
blocks.push({ type: "table", table, pageNumber: sectionNum });
|
|
3474
|
+
}
|
|
3475
|
+
function extractCellText(cellEl) {
|
|
3476
|
+
const textParts = [];
|
|
3477
|
+
collectCellText(cellEl, textParts, 0);
|
|
3478
|
+
return textParts.filter(Boolean).join("\n").trim();
|
|
3479
|
+
}
|
|
3480
|
+
function collectCellText(node, parts, depth) {
|
|
3481
|
+
if (depth > 20) return;
|
|
3482
|
+
const children = node.childNodes;
|
|
3483
|
+
for (let i = 0; i < children.length; i++) {
|
|
3484
|
+
const el = children[i];
|
|
3485
|
+
if (el.nodeType !== 1) continue;
|
|
3486
|
+
const tag = localName(el);
|
|
3487
|
+
if (tag === "P") {
|
|
3488
|
+
const t = extractParagraphText(el);
|
|
3489
|
+
if (t) parts.push(t);
|
|
3490
|
+
} else if (tag === "TABLE") {
|
|
3491
|
+
parts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
|
|
3492
|
+
} else {
|
|
3493
|
+
collectCellText(el, parts, depth + 1);
|
|
3494
|
+
}
|
|
3495
|
+
}
|
|
3496
|
+
}
|
|
3497
|
+
function localName(el) {
|
|
3498
|
+
return (el.tagName || el.localName || "").replace(/^[^:]+:/, "");
|
|
3499
|
+
}
|
|
3500
|
+
function findChild(parent, tag) {
|
|
3501
|
+
const children = parent.childNodes;
|
|
3502
|
+
for (let i = 0; i < children.length; i++) {
|
|
3503
|
+
const el = children[i];
|
|
3504
|
+
if (el.nodeType === 1 && localName(el) === tag) return el;
|
|
3505
|
+
}
|
|
3506
|
+
return null;
|
|
3507
|
+
}
|
|
3508
|
+
function textContent(el) {
|
|
3509
|
+
const children = el.childNodes;
|
|
3510
|
+
const parts = [];
|
|
3511
|
+
for (let i = 0; i < children.length; i++) {
|
|
3512
|
+
const node = children[i];
|
|
3513
|
+
if (node.nodeType === 3) {
|
|
3514
|
+
parts.push(node.nodeValue || "");
|
|
3515
|
+
} else if (node.nodeType === 1) {
|
|
3516
|
+
parts.push(textContent(node));
|
|
3517
|
+
}
|
|
3518
|
+
}
|
|
3519
|
+
return parts.join("");
|
|
3520
|
+
}
|
|
3521
|
+
function countSections(body) {
|
|
3522
|
+
let count = 0;
|
|
3523
|
+
const children = body.childNodes;
|
|
3524
|
+
for (let i = 0; i < children.length; i++) {
|
|
3525
|
+
const el = children[i];
|
|
3526
|
+
if (el.nodeType === 1 && localName(el) === "SECTION") count++;
|
|
3527
|
+
}
|
|
3528
|
+
return count;
|
|
3529
|
+
}
|
|
3530
|
+
|
|
3239
3531
|
// src/form/recognize.ts
|
|
3240
3532
|
var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
|
|
3241
3533
|
"\uC131\uBA85",
|
|
@@ -3469,7 +3761,7 @@ function fillFormFields(blocks, values) {
|
|
|
3469
3761
|
if (block.type !== "table" || !block.table) continue;
|
|
3470
3762
|
for (let r = 0; r < block.table.rows; r++) {
|
|
3471
3763
|
for (let c = 0; c < block.table.cols; c++) {
|
|
3472
|
-
const cell = _optionalChain([block, 'access',
|
|
3764
|
+
const cell = _optionalChain([block, 'access', _72 => _72.table, 'access', _73 => _73.cells, 'access', _74 => _74[r], 'optionalAccess', _75 => _75[c]]);
|
|
3473
3765
|
if (!cell) continue;
|
|
3474
3766
|
const result = fillInCellPatterns(cell.text, normalizedValues, matchedLabels);
|
|
3475
3767
|
if (result) {
|
|
@@ -3508,7 +3800,7 @@ function fillTable(table, values, filled, matchedLabels, patternFilledCells) {
|
|
|
3508
3800
|
const matchKey = findMatchingKey(normalizedCellLabel, values);
|
|
3509
3801
|
if (matchKey === void 0) continue;
|
|
3510
3802
|
const newValue = values.get(matchKey);
|
|
3511
|
-
if (_optionalChain([patternFilledCells, 'optionalAccess',
|
|
3803
|
+
if (_optionalChain([patternFilledCells, 'optionalAccess', _76 => _76.has, 'call', _77 => _77(`${r},${c + 1}`)])) {
|
|
3512
3804
|
valueCell.text = newValue + " " + valueCell.text;
|
|
3513
3805
|
} else {
|
|
3514
3806
|
valueCell.text = newValue;
|
|
@@ -3578,7 +3870,7 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3578
3870
|
const normalizedValues = normalizeValues(values);
|
|
3579
3871
|
const sectionFiles = Object.keys(zip.files).filter((name) => /[Ss]ection\d+\.xml$/i.test(name)).sort();
|
|
3580
3872
|
if (sectionFiles.length === 0) {
|
|
3581
|
-
throw new (0,
|
|
3873
|
+
throw new (0, _chunkZNJPRRIAcjs.KordocError)("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
3582
3874
|
}
|
|
3583
3875
|
const xmlParser = new (0, _xmldom.DOMParser)();
|
|
3584
3876
|
const xmlSerializer = new (0, _xmldom.XMLSerializer)();
|
|
@@ -3586,7 +3878,7 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3586
3878
|
const zipEntry = zip.file(sectionPath);
|
|
3587
3879
|
if (!zipEntry) continue;
|
|
3588
3880
|
const rawXml = await zipEntry.async("text");
|
|
3589
|
-
const doc = xmlParser.parseFromString(
|
|
3881
|
+
const doc = xmlParser.parseFromString(_chunkZNJPRRIAcjs.stripDtd.call(void 0, rawXml), "text/xml");
|
|
3590
3882
|
if (!doc.documentElement) continue;
|
|
3591
3883
|
let modified = false;
|
|
3592
3884
|
const tables = findAllElements(doc.documentElement, "tbl");
|
|
@@ -3612,10 +3904,10 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3612
3904
|
const trEl = rows[rowIdx];
|
|
3613
3905
|
const cells = findDirectChildren(trEl, "tc");
|
|
3614
3906
|
for (let colIdx = 0; colIdx < cells.length - 1; colIdx++) {
|
|
3615
|
-
const labelText =
|
|
3907
|
+
const labelText = extractCellText2(cells[colIdx]);
|
|
3616
3908
|
if (!isLabelCell(labelText)) continue;
|
|
3617
3909
|
const valueCell = cells[colIdx + 1];
|
|
3618
|
-
const valueText =
|
|
3910
|
+
const valueText = extractCellText2(valueCell);
|
|
3619
3911
|
if (isKeywordLabel(valueText)) continue;
|
|
3620
3912
|
const normalizedCellLabel = normalizeLabel(labelText);
|
|
3621
3913
|
if (!normalizedCellLabel) continue;
|
|
@@ -3640,14 +3932,14 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3640
3932
|
if (rows.length >= 2) {
|
|
3641
3933
|
const headerCells = findDirectChildren(rows[0], "tc");
|
|
3642
3934
|
const allLabels = headerCells.every((cell) => {
|
|
3643
|
-
const t =
|
|
3935
|
+
const t = extractCellText2(cell).trim();
|
|
3644
3936
|
return t.length > 0 && t.length <= 20 && isLabelCell(t);
|
|
3645
3937
|
});
|
|
3646
3938
|
if (allLabels) {
|
|
3647
3939
|
for (let rowIdx = 1; rowIdx < rows.length; rowIdx++) {
|
|
3648
3940
|
const dataCells = findDirectChildren(rows[rowIdx], "tc");
|
|
3649
3941
|
for (let colIdx = 0; colIdx < Math.min(headerCells.length, dataCells.length); colIdx++) {
|
|
3650
|
-
const headerLabel = normalizeLabel(
|
|
3942
|
+
const headerLabel = normalizeLabel(extractCellText2(headerCells[colIdx]));
|
|
3651
3943
|
const matchKey = findMatchingKey(headerLabel, normalizedValues);
|
|
3652
3944
|
if (matchKey === void 0) continue;
|
|
3653
3945
|
if (matchedLabels.has(matchKey)) continue;
|
|
@@ -3655,7 +3947,7 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3655
3947
|
replaceCellText(dataCells[colIdx], newValue);
|
|
3656
3948
|
matchedLabels.add(matchKey);
|
|
3657
3949
|
filled.push({
|
|
3658
|
-
label:
|
|
3950
|
+
label: extractCellText2(headerCells[colIdx]).trim(),
|
|
3659
3951
|
value: newValue,
|
|
3660
3952
|
row: rowIdx,
|
|
3661
3953
|
col: colIdx
|
|
@@ -3697,7 +3989,7 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3697
3989
|
const buffer = await zip.generateAsync({ type: "arraybuffer" });
|
|
3698
3990
|
return { buffer, filled, unmatched };
|
|
3699
3991
|
}
|
|
3700
|
-
function
|
|
3992
|
+
/**
 * Returns an element's tag name without its leading namespace prefix.
 *
 * Uses `tagName` when set, otherwise `localName`, otherwise "". Only the first
 * `prefix:` segment is removed.
 *
 * @param {{tagName?: string, localName?: string}} el - DOM-like element.
 * @returns {string} The unprefixed name, or "" when the element has no name.
 */
function localName2(el) {
  const rawName = el.tagName || el.localName || "";
  return rawName.replace(/^[^:]+:/, "");
}
|
|
3703
3995
|
function findAllElements(node, tagLocalName) {
|
|
@@ -3708,7 +4000,7 @@ function findAllElements(node, tagLocalName) {
|
|
|
3708
4000
|
for (let i = 0; i < children.length; i++) {
|
|
3709
4001
|
const child = children[i];
|
|
3710
4002
|
if (child.nodeType !== 1) continue;
|
|
3711
|
-
if (
|
|
4003
|
+
if (localName2(child) === tagLocalName) result.push(child);
|
|
3712
4004
|
walk(child);
|
|
3713
4005
|
}
|
|
3714
4006
|
};
|
|
@@ -3721,7 +4013,7 @@ function findDirectChildren(parent, tagLocalName) {
|
|
|
3721
4013
|
if (!children) return result;
|
|
3722
4014
|
for (let i = 0; i < children.length; i++) {
|
|
3723
4015
|
const child = children[i];
|
|
3724
|
-
if (child.nodeType === 1 &&
|
|
4016
|
+
if (child.nodeType === 1 && localName2(child) === tagLocalName) {
|
|
3725
4017
|
result.push(child);
|
|
3726
4018
|
}
|
|
3727
4019
|
}
|
|
@@ -3730,12 +4022,12 @@ function findDirectChildren(parent, tagLocalName) {
|
|
|
3730
4022
|
/**
 * Reports whether `el` has an ancestor element whose unprefixed tag name is
 * "tbl". The element itself is not tested, only its `parentNode` chain.
 *
 * @param {{parentNode?: object|null}} el - Node to start from.
 * @returns {boolean} True when a "tbl" ancestor exists, false otherwise.
 */
function isInsideTable(el) {
  for (let ancestor = el.parentNode; ancestor; ancestor = ancestor.parentNode) {
    // Skip non-element ancestors such as the document node.
    if (ancestor.nodeType !== 1) continue;
    if (localName2(ancestor) === "tbl") return true;
  }
  return false;
}
|
|
3738
|
-
function
|
|
4030
|
+
function extractCellText2(tcEl) {
|
|
3739
4031
|
const parts = [];
|
|
3740
4032
|
const walk = (node) => {
|
|
3741
4033
|
const children = node.childNodes;
|
|
@@ -3745,7 +4037,7 @@ function extractCellText(tcEl) {
|
|
|
3745
4037
|
if (child.nodeType === 3) {
|
|
3746
4038
|
parts.push(child.textContent || "");
|
|
3747
4039
|
} else if (child.nodeType === 1) {
|
|
3748
|
-
const tag =
|
|
4040
|
+
const tag = localName2(child);
|
|
3749
4041
|
if (tag === "t") walk(child);
|
|
3750
4042
|
else if (tag === "run" || tag === "r" || tag === "p" || tag === "subList") walk(child);
|
|
3751
4043
|
else if (tag === "tab") parts.push(" ");
|
|
@@ -4447,13 +4739,13 @@ async function parse(input, options) {
|
|
|
4447
4739
|
if (typeof input === "string") {
|
|
4448
4740
|
try {
|
|
4449
4741
|
const buf = await _promises.readFile.call(void 0, input);
|
|
4450
|
-
buffer =
|
|
4742
|
+
buffer = _chunkZNJPRRIAcjs.toArrayBuffer.call(void 0, buf);
|
|
4451
4743
|
} catch (err) {
|
|
4452
4744
|
const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
|
|
4453
4745
|
return { success: false, fileType: "unknown", error: msg, code: "PARSE_ERROR" };
|
|
4454
4746
|
}
|
|
4455
4747
|
} else if (Buffer.isBuffer(input)) {
|
|
4456
|
-
buffer =
|
|
4748
|
+
buffer = _chunkZNJPRRIAcjs.toArrayBuffer.call(void 0, input);
|
|
4457
4749
|
} else {
|
|
4458
4750
|
buffer = input;
|
|
4459
4751
|
}
|
|
@@ -4470,6 +4762,8 @@ async function parse(input, options) {
|
|
|
4470
4762
|
}
|
|
4471
4763
|
case "hwp":
|
|
4472
4764
|
return parseHwp(buffer, options);
|
|
4765
|
+
case "hwpml":
|
|
4766
|
+
return parseHwpml(buffer, options);
|
|
4473
4767
|
case "pdf":
|
|
4474
4768
|
return parsePdf(buffer, options);
|
|
4475
4769
|
default:
|
|
@@ -4479,23 +4773,23 @@ async function parse(input, options) {
|
|
|
4479
4773
|
/**
 * Parses an HWPX document and wraps the outcome in a result object.
 *
 * On success the result carries the fields returned by `parseHwpxDocument`;
 * `images` is included only when the list is non-empty, otherwise it is
 * `undefined`. Errors thrown while parsing are converted into a failure
 * result whose `code` comes from `classifyError`.
 *
 * @param {ArrayBuffer} buffer - Document bytes, forwarded to `parseHwpxDocument`.
 * @param {object} [options] - Parser options, forwarded unchanged.
 * @returns {Promise<object>} `{ success: true, fileType: "hwpx", ... }` or
 *   `{ success: false, fileType: "hwpx", error, code }`.
 */
async function parseHwpx(buffer, options) {
  try {
    const parsed = await parseHwpxDocument(buffer, options);
    const { markdown, blocks, metadata, outline, warnings, images } = parsed;
    const hasImages = images != null && Boolean(images.length);
    return {
      success: true,
      fileType: "hwpx",
      markdown,
      blocks,
      metadata,
      outline,
      warnings,
      images: hasImages ? images : void 0
    };
  } catch (err) {
    // Non-Error throwables get the generic "HWPX parsing failed" message.
    const message = err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328";
    return {
      success: false,
      fileType: "hwpx",
      error: message,
      code: (0, _chunkZNJPRRIAcjs.classifyError)(err)
    };
  }
}
|
|
4487
4781
|
/**
 * Parses a binary HWP document and wraps the outcome in a result object.
 *
 * The input is converted with `Buffer.from` before being handed to
 * `parseHwp5Document`, which is called synchronously. `images` is included in
 * the success result only when the list is non-empty. Errors thrown while
 * parsing are converted into a failure result whose `code` comes from
 * `classifyError`.
 *
 * @param {ArrayBuffer} buffer - Document bytes.
 * @param {object} [options] - Parser options, forwarded unchanged.
 * @returns {Promise<object>} `{ success: true, fileType: "hwp", ... }` or
 *   `{ success: false, fileType: "hwp", error, code }`.
 */
async function parseHwp(buffer, options) {
  try {
    const parsed = parseHwp5Document(Buffer.from(buffer), options);
    const { markdown, blocks, metadata, outline, warnings, images } = parsed;
    const hasImages = images != null && Boolean(images.length);
    return {
      success: true,
      fileType: "hwp",
      markdown,
      blocks,
      metadata,
      outline,
      warnings,
      images: hasImages ? images : void 0
    };
  } catch (err) {
    // Non-Error throwables get the generic "HWP parsing failed" message.
    const message = err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328";
    return {
      success: false,
      fileType: "hwp",
      error: message,
      code: (0, _chunkZNJPRRIAcjs.classifyError)(err)
    };
  }
}
|
|
4495
4789
|
async function parsePdf(buffer, options) {
|
|
4496
4790
|
let parsePdfDocument;
|
|
4497
4791
|
try {
|
|
4498
|
-
const mod = await Promise.resolve().then(() => _interopRequireWildcard(require("./parser-
|
|
4792
|
+
const mod = await Promise.resolve().then(() => _interopRequireWildcard(require("./parser-KBQZB3QY.cjs")));
|
|
4499
4793
|
parsePdfDocument = mod.parsePdfDocument;
|
|
4500
4794
|
} catch (e26) {
|
|
4501
4795
|
return {
|
|
@@ -4510,7 +4804,7 @@ async function parsePdf(buffer, options) {
|
|
|
4510
4804
|
return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased };
|
|
4511
4805
|
} catch (err) {
|
|
4512
4806
|
const isImageBased = err instanceof Error && "isImageBased" in err ? true : void 0;
|
|
4513
|
-
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code:
|
|
4807
|
+
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: _chunkZNJPRRIAcjs.classifyError.call(void 0, err), isImageBased };
|
|
4514
4808
|
}
|
|
4515
4809
|
}
|
|
4516
4810
|
async function parseXlsx(buffer, options) {
|
|
@@ -4518,24 +4812,32 @@ async function parseXlsx(buffer, options) {
|
|
|
4518
4812
|
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
|
|
4519
4813
|
return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
|
|
4520
4814
|
} catch (err) {
|
|
4521
|
-
return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code:
|
|
4815
|
+
return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: _chunkZNJPRRIAcjs.classifyError.call(void 0, err) };
|
|
4522
4816
|
}
|
|
4523
4817
|
}
|
|
4524
4818
|
/**
 * Parses a DOCX document and wraps the outcome in a result object.
 *
 * On success the result carries the fields returned by `parseDocxDocument`;
 * `images` is included only when the list is non-empty, otherwise it is
 * `undefined`. Errors thrown while parsing are converted into a failure
 * result whose `code` comes from `classifyError`.
 *
 * @param {ArrayBuffer} buffer - Document bytes, forwarded to `parseDocxDocument`.
 * @param {object} [options] - Parser options, forwarded unchanged.
 * @returns {Promise<object>} `{ success: true, fileType: "docx", ... }` or
 *   `{ success: false, fileType: "docx", error, code }`.
 */
async function parseDocx(buffer, options) {
  try {
    const parsed = await parseDocxDocument(buffer, options);
    const { markdown, blocks, metadata, outline, warnings, images } = parsed;
    const hasImages = images != null && Boolean(images.length);
    return {
      success: true,
      fileType: "docx",
      markdown,
      blocks,
      metadata,
      outline,
      warnings,
      images: hasImages ? images : void 0
    };
  } catch (err) {
    // Non-Error throwables get the generic "DOCX parsing failed" message.
    const message = err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328";
    return {
      success: false,
      fileType: "docx",
      error: message,
      code: (0, _chunkZNJPRRIAcjs.classifyError)(err)
    };
  }
}
|
|
4826
|
+
/**
 * Parses an HWPML document and wraps the outcome in a result object.
 *
 * `parseHwpmlDocument` is called synchronously; the function is still `async`
 * so it returns a promise like the other format parsers. Unlike those, the
 * success result has no `images` field. Errors thrown while parsing are
 * converted into a failure result whose `code` comes from `classifyError`.
 *
 * @param {ArrayBuffer} buffer - Document bytes, forwarded to `parseHwpmlDocument`.
 * @param {object} [options] - Parser options, forwarded unchanged.
 * @returns {Promise<object>} `{ success: true, fileType: "hwpml", ... }` or
 *   `{ success: false, fileType: "hwpml", error, code }`.
 */
async function parseHwpml(buffer, options) {
  try {
    const parsed = parseHwpmlDocument(buffer, options);
    const { markdown, blocks, metadata, outline, warnings } = parsed;
    return {
      success: true,
      fileType: "hwpml",
      markdown,
      blocks,
      metadata,
      outline,
      warnings
    };
  } catch (err) {
    // Non-Error throwables get the generic "HWPML parsing failed" message.
    const message = err instanceof Error ? err.message : "HWPML \uD30C\uC2F1 \uC2E4\uD328";
    return {
      success: false,
      fileType: "hwpml",
      error: message,
      code: (0, _chunkZNJPRRIAcjs.classifyError)(err)
    };
  }
}
|
|
4532
4834
|
async function fillForm(input, values, outputFormat = "markdown") {
|
|
4533
4835
|
let buffer;
|
|
4534
4836
|
if (typeof input === "string") {
|
|
4535
4837
|
const buf = await _promises.readFile.call(void 0, input);
|
|
4536
|
-
buffer =
|
|
4838
|
+
buffer = _chunkZNJPRRIAcjs.toArrayBuffer.call(void 0, buf);
|
|
4537
4839
|
} else if (Buffer.isBuffer(input)) {
|
|
4538
|
-
buffer =
|
|
4840
|
+
buffer = _chunkZNJPRRIAcjs.toArrayBuffer.call(void 0, input);
|
|
4539
4841
|
} else {
|
|
4540
4842
|
buffer = input;
|
|
4541
4843
|
}
|
|
@@ -4561,7 +4863,7 @@ async function fillForm(input, values, outputFormat = "markdown") {
|
|
|
4561
4863
|
throw new Error(`\uC11C\uC2DD \uD30C\uC2F1 \uC2E4\uD328: ${parsed.error}`);
|
|
4562
4864
|
}
|
|
4563
4865
|
const fill = fillFormFields(parsed.blocks, values);
|
|
4564
|
-
const markdown =
|
|
4866
|
+
const markdown = _chunkZNJPRRIAcjs.blocksToMarkdown.call(void 0, fill.blocks);
|
|
4565
4867
|
if (outputFormat === "hwpx") {
|
|
4566
4868
|
const hwpxBuffer = await markdownToHwpx(markdown);
|
|
4567
4869
|
return { output: hwpxBuffer, format: "hwpx", fill };
|
|
@@ -4591,5 +4893,6 @@ async function fillForm(input, values, outputFormat = "markdown") {
|
|
|
4591
4893
|
|
|
4592
4894
|
|
|
4593
4895
|
|
|
4594
|
-
|
|
4896
|
+
|
|
4897
|
+
// Public CommonJS API of the package entry point.
// Each export stays a plain `exports.name = value;` assignment (rather than
// e.g. Object.assign) so that tools which statically scan CommonJS files for
// this pattern can still discover the export names.
exports.VERSION = _chunkZNJPRRIAcjs.VERSION;
exports.blocksToMarkdown = _chunkZNJPRRIAcjs.blocksToMarkdown;
exports.compare = compare;
exports.detectFormat = detectFormat;
exports.detectZipFormat = detectZipFormat;
exports.diffBlocks = diffBlocks;
exports.extractFormFields = extractFormFields;
exports.fillForm = fillForm;
exports.fillFormFields = fillFormFields;
exports.fillHwpx = fillHwpx;
exports.isHwpxFile = isHwpxFile;
exports.isLabelCell = isLabelCell;
exports.isOldHwpFile = isOldHwpFile;
exports.isPdfFile = isPdfFile;
exports.isZipFile = isZipFile;
exports.markdownToHwpx = markdownToHwpx;
exports.parse = parse;
exports.parseDocx = parseDocx;
exports.parseHwp = parseHwp;
exports.parseHwpml = parseHwpml;
exports.parseHwpx = parseHwpx;
exports.parsePdf = parsePdf;
exports.parseXlsx = parseXlsx;
|
|
4595
4898
|
//# sourceMappingURL=index.cjs.map
|