kordoc 2.2.5 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -4
- package/dist/{chunk-UU2O6D3R.js → chunk-JFTFC2BB.js} +2 -2
- package/dist/{chunk-JH5XLWJQ.js.map → chunk-JFTFC2BB.js.map} +1 -1
- package/dist/{chunk-5Y2Q3BRW.js → chunk-M3E3C5GS.js} +8 -1
- package/dist/chunk-M3E3C5GS.js.map +1 -0
- package/dist/{chunk-RQWICKON.js → chunk-OEJJPCMM.js} +369 -73
- package/dist/chunk-OEJJPCMM.js.map +1 -0
- package/dist/{chunk-JH5XLWJQ.js → chunk-Z7UPTVMX.js} +2 -2
- package/dist/{chunk-UU2O6D3R.js.map → chunk-Z7UPTVMX.js.map} +1 -1
- package/dist/{chunk-OJ4QR33V.cjs → chunk-ZNJPRRIA.cjs} +2 -2
- package/dist/{chunk-OJ4QR33V.cjs.map → chunk-ZNJPRRIA.cjs.map} +1 -1
- package/dist/cli.js +7 -4
- package/dist/cli.js.map +1 -1
- package/dist/{detect-GYK3HKD5.js → detect-I7YIS4Q6.js} +4 -2
- package/dist/index.cjs +463 -160
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -2
- package/dist/index.d.ts +4 -2
- package/dist/index.js +387 -84
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -5
- package/dist/{parser-OIRWPKIQ.js → parser-25LF2S2J.js} +45 -42
- package/dist/{parser-OIRWPKIQ.js.map → parser-25LF2S2J.js.map} +1 -1
- package/dist/{parser-PXD73E4H.js → parser-4LKJXBPP.js} +45 -42
- package/dist/{parser-PXD73E4H.js.map → parser-4LKJXBPP.js.map} +1 -1
- package/dist/{parser-CYBX5MP4.cjs → parser-KBQZB3QY.cjs} +61 -58
- package/dist/{parser-CYBX5MP4.cjs.map → parser-KBQZB3QY.cjs.map} +1 -1
- package/dist/{watch-NSBABJ4A.js → watch-GXRBLW3Y.js} +4 -4
- package/package.json +2 -2
- package/dist/chunk-5Y2Q3BRW.js.map +0 -1
- package/dist/chunk-RQWICKON.js.map +0 -1
- /package/dist/{detect-GYK3HKD5.js.map → detect-I7YIS4Q6.js.map} +0 -0
- /package/dist/{watch-NSBABJ4A.js.map → watch-GXRBLW3Y.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -16,7 +16,7 @@ import {
|
|
|
16
16
|
sanitizeHref,
|
|
17
17
|
stripDtd,
|
|
18
18
|
toArrayBuffer
|
|
19
|
-
} from "./chunk-
|
|
19
|
+
} from "./chunk-JFTFC2BB.js";
|
|
20
20
|
import {
|
|
21
21
|
parsePageRange
|
|
22
22
|
} from "./chunk-SBVRCJFH.js";
|
|
@@ -44,11 +44,17 @@ function isPdfFile(buffer) {
|
|
|
44
44
|
const b = magicBytes(buffer);
|
|
45
45
|
return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
|
|
46
46
|
}
|
|
47
|
+
function isHwpmlFile(buffer) {
|
|
48
|
+
const bytes = new Uint8Array(buffer, 0, Math.min(512, buffer.byteLength));
|
|
49
|
+
const head = new TextDecoder("utf-8", { fatal: false }).decode(bytes).replace(/^\uFEFF/, "");
|
|
50
|
+
return head.trimStart().startsWith("<?xml") && head.includes("<HWPML");
|
|
51
|
+
}
|
|
47
52
|
function detectFormat(buffer) {
|
|
48
53
|
if (buffer.byteLength < 4) return "unknown";
|
|
49
54
|
if (isZipFile(buffer)) return "hwpx";
|
|
50
55
|
if (isOldHwpFile(buffer)) return "hwp";
|
|
51
56
|
if (isPdfFile(buffer)) return "pdf";
|
|
57
|
+
if (isHwpmlFile(buffer)) return "hwpml";
|
|
52
58
|
return "unknown";
|
|
53
59
|
}
|
|
54
60
|
async function detectZipFormat(buffer) {
|
|
@@ -184,6 +190,7 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
184
190
|
const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
|
|
185
191
|
const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
|
|
186
192
|
const blocks = [];
|
|
193
|
+
const nestedTableCounter = { count: 0 };
|
|
187
194
|
let parsedSections = 0;
|
|
188
195
|
for (let si = 0; si < sectionPaths.length; si++) {
|
|
189
196
|
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
@@ -193,7 +200,7 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
193
200
|
const xml = await file.async("text");
|
|
194
201
|
decompressed.total += xml.length * 2;
|
|
195
202
|
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
196
|
-
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
|
|
203
|
+
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1, nestedTableCounter));
|
|
197
204
|
parsedSections++;
|
|
198
205
|
options?.onProgress?.(parsedSections, totalTarget);
|
|
199
206
|
} catch (secErr) {
|
|
@@ -254,8 +261,20 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
254
261
|
ref
|
|
255
262
|
// 절대 경로일 수도 있음
|
|
256
263
|
];
|
|
264
|
+
let resolvedPath = null;
|
|
265
|
+
if (!ref.includes(".")) {
|
|
266
|
+
const prefixes = [`BinData/${ref}`, `Contents/BinData/${ref}`];
|
|
267
|
+
for (const prefix of prefixes) {
|
|
268
|
+
const match = zip.file(new RegExp(`^${prefix.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\.[a-zA-Z0-9]+$`));
|
|
269
|
+
if (match.length > 0) {
|
|
270
|
+
resolvedPath = match[0].name;
|
|
271
|
+
break;
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
}
|
|
257
275
|
let found = false;
|
|
258
|
-
|
|
276
|
+
const allCandidates = resolvedPath ? [resolvedPath, ...candidates] : candidates;
|
|
277
|
+
for (const path of allCandidates) {
|
|
259
278
|
if (isPathTraversal(path)) continue;
|
|
260
279
|
const file = zip.file(path);
|
|
261
280
|
if (!file) continue;
|
|
@@ -263,7 +282,8 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
263
282
|
const data = await file.async("uint8array");
|
|
264
283
|
decompressed.total += data.length;
|
|
265
284
|
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
266
|
-
const
|
|
285
|
+
const actualPath = path;
|
|
286
|
+
const ext = actualPath.includes(".") ? actualPath.split(".").pop() || "png" : "png";
|
|
267
287
|
const mimeType = imageExtToMime(ext);
|
|
268
288
|
imageIndex++;
|
|
269
289
|
const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
|
|
@@ -336,6 +356,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
336
356
|
let totalDecompressed = 0;
|
|
337
357
|
let entryCount = 0;
|
|
338
358
|
let sectionNum = 0;
|
|
359
|
+
const nestedTableCounter = { count: 0 };
|
|
339
360
|
while (pos < data.length - 30) {
|
|
340
361
|
if (data[pos] !== 80 || data[pos + 1] !== 75 || data[pos + 2] !== 3 || data[pos + 3] !== 4) {
|
|
341
362
|
pos++;
|
|
@@ -382,7 +403,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
382
403
|
totalDecompressed += content.length * 2;
|
|
383
404
|
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
|
|
384
405
|
sectionNum++;
|
|
385
|
-
blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum));
|
|
406
|
+
blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum, nestedTableCounter));
|
|
386
407
|
} catch {
|
|
387
408
|
continue;
|
|
388
409
|
}
|
|
@@ -467,12 +488,40 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
467
488
|
}
|
|
468
489
|
}
|
|
469
490
|
}
|
|
470
|
-
function
|
|
491
|
+
function makeNestedTableMarker(counter, rows) {
|
|
492
|
+
counter.count++;
|
|
493
|
+
const firstRow = rows[0] ?? [];
|
|
494
|
+
const hint = firstRow.map((c) => c.text.trim().replace(/\n/g, " ")).filter(Boolean).join(" | ");
|
|
495
|
+
const hintChars = [...hint];
|
|
496
|
+
const truncated = hintChars.length > 60 ? hintChars.slice(0, 60).join("") + "\u2026" : hint;
|
|
497
|
+
return truncated ? `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}: ${truncated}]` : `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}]`;
|
|
498
|
+
}
|
|
499
|
+
function handleNestedTable(newTable, tableStack, blocks, ctx) {
|
|
500
|
+
const parentTable = tableStack.pop();
|
|
501
|
+
let nestedCols = 0;
|
|
502
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
503
|
+
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
504
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
|
|
505
|
+
if (parentTable.cell) {
|
|
506
|
+
const marker = ctx.counter ? makeNestedTableMarker(ctx.counter, newTable.rows) : "[\uC911\uCCA9 \uD14C\uC774\uBE14]";
|
|
507
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + marker;
|
|
508
|
+
}
|
|
509
|
+
} else {
|
|
510
|
+
const nestedText = convertTableToText(newTable.rows);
|
|
511
|
+
if (parentTable.cell) {
|
|
512
|
+
const marker = ctx.counter ? makeNestedTableMarker(ctx.counter, newTable.rows) : "[\uC911\uCCA9 \uD14C\uC774\uBE14]";
|
|
513
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + marker + "\n" + nestedText;
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
return parentTable;
|
|
517
|
+
}
|
|
518
|
+
function parseSectionXml(xml, styleMap, warnings, sectionNum, counter) {
|
|
471
519
|
const parser = createXmlParser(warnings);
|
|
472
520
|
const doc = parser.parseFromString(stripDtd(xml), "text/xml");
|
|
473
521
|
if (!doc.documentElement) return [];
|
|
474
522
|
const blocks = [];
|
|
475
|
-
|
|
523
|
+
const ctx = { styleMap, warnings, sectionNum, counter };
|
|
524
|
+
walkSection(doc.documentElement, blocks, null, [], ctx);
|
|
476
525
|
return blocks;
|
|
477
526
|
}
|
|
478
527
|
function extractImageRef(el) {
|
|
@@ -493,7 +542,7 @@ function extractImageRef(el) {
|
|
|
493
542
|
if (directRef) return directRef;
|
|
494
543
|
return null;
|
|
495
544
|
}
|
|
496
|
-
function walkSection(node, blocks, tableCtx, tableStack,
|
|
545
|
+
function walkSection(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
|
|
497
546
|
if (depth > MAX_XML_DEPTH) return;
|
|
498
547
|
const children = node.childNodes;
|
|
499
548
|
if (!children) return;
|
|
@@ -506,23 +555,12 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
506
555
|
case "tbl": {
|
|
507
556
|
if (tableCtx) tableStack.push(tableCtx);
|
|
508
557
|
const newTable = { rows: [], currentRow: [], cell: null };
|
|
509
|
-
walkSection(el, blocks, newTable, tableStack,
|
|
558
|
+
walkSection(el, blocks, newTable, tableStack, ctx, depth + 1);
|
|
510
559
|
if (newTable.rows.length > 0) {
|
|
511
560
|
if (tableStack.length > 0) {
|
|
512
|
-
|
|
513
|
-
let nestedCols = 0;
|
|
514
|
-
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
515
|
-
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
516
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
517
|
-
} else {
|
|
518
|
-
const nestedText = convertTableToText(newTable.rows);
|
|
519
|
-
if (parentTable.cell) {
|
|
520
|
-
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
521
|
-
}
|
|
522
|
-
}
|
|
523
|
-
tableCtx = parentTable;
|
|
561
|
+
tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
|
|
524
562
|
} else {
|
|
525
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
563
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
|
|
526
564
|
tableCtx = null;
|
|
527
565
|
}
|
|
528
566
|
} else {
|
|
@@ -533,7 +571,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
533
571
|
case "tr":
|
|
534
572
|
if (tableCtx) {
|
|
535
573
|
tableCtx.currentRow = [];
|
|
536
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
574
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
537
575
|
if (tableCtx.currentRow.length > 0) tableCtx.rows.push(tableCtx.currentRow);
|
|
538
576
|
tableCtx.currentRow = [];
|
|
539
577
|
}
|
|
@@ -541,7 +579,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
541
579
|
case "tc":
|
|
542
580
|
if (tableCtx) {
|
|
543
581
|
tableCtx.cell = { text: "", colSpan: 1, rowSpan: 1 };
|
|
544
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
582
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
545
583
|
if (tableCtx.cell) {
|
|
546
584
|
tableCtx.currentRow.push(tableCtx.cell);
|
|
547
585
|
tableCtx.cell = null;
|
|
@@ -567,19 +605,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
567
605
|
}
|
|
568
606
|
break;
|
|
569
607
|
case "p": {
|
|
570
|
-
const { text, href, footnote, style } = extractParagraphInfo(el, styleMap);
|
|
608
|
+
const { text, href, footnote, style } = extractParagraphInfo(el, ctx.styleMap);
|
|
571
609
|
if (text) {
|
|
572
610
|
if (tableCtx?.cell) {
|
|
573
611
|
tableCtx.cell.text += (tableCtx.cell.text ? "\n" : "") + text;
|
|
574
612
|
} else if (!tableCtx) {
|
|
575
|
-
const block = { type: "paragraph", text, pageNumber: sectionNum };
|
|
613
|
+
const block = { type: "paragraph", text, pageNumber: ctx.sectionNum };
|
|
576
614
|
if (style) block.style = style;
|
|
577
615
|
if (href) block.href = href;
|
|
578
616
|
if (footnote) block.footnoteText = footnote;
|
|
579
617
|
blocks.push(block);
|
|
580
618
|
}
|
|
581
619
|
}
|
|
582
|
-
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack,
|
|
620
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
583
621
|
break;
|
|
584
622
|
}
|
|
585
623
|
// 이미지/그림 — 경로 추출 또는 경고
|
|
@@ -588,19 +626,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
588
626
|
case "drawingObject": {
|
|
589
627
|
const imgRef = extractImageRef(el);
|
|
590
628
|
if (imgRef) {
|
|
591
|
-
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
592
|
-
} else if (warnings && sectionNum) {
|
|
593
|
-
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
629
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
|
|
630
|
+
} else if (ctx.warnings && ctx.sectionNum) {
|
|
631
|
+
ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
594
632
|
}
|
|
595
633
|
break;
|
|
596
634
|
}
|
|
597
635
|
default:
|
|
598
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
636
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
599
637
|
break;
|
|
600
638
|
}
|
|
601
639
|
}
|
|
602
640
|
}
|
|
603
|
-
function walkParagraphChildren(node, blocks, tableCtx, tableStack,
|
|
641
|
+
function walkParagraphChildren(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
|
|
604
642
|
if (depth > MAX_XML_DEPTH) return tableCtx;
|
|
605
643
|
const children = node.childNodes;
|
|
606
644
|
if (!children) return tableCtx;
|
|
@@ -616,23 +654,12 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
616
654
|
if (localTag === "tbl") {
|
|
617
655
|
if (tableCtx) tableStack.push(tableCtx);
|
|
618
656
|
const newTable = { rows: [], currentRow: [], cell: null };
|
|
619
|
-
walkSection(el, blocks, newTable, tableStack,
|
|
657
|
+
walkSection(el, blocks, newTable, tableStack, ctx, d + 1);
|
|
620
658
|
if (newTable.rows.length > 0) {
|
|
621
659
|
if (tableStack.length > 0) {
|
|
622
|
-
|
|
623
|
-
let nestedCols = 0;
|
|
624
|
-
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
625
|
-
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
626
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
627
|
-
} else {
|
|
628
|
-
const nestedText = convertTableToText(newTable.rows);
|
|
629
|
-
if (parentTable.cell) {
|
|
630
|
-
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
631
|
-
}
|
|
632
|
-
}
|
|
633
|
-
tableCtx = parentTable;
|
|
660
|
+
tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
|
|
634
661
|
} else {
|
|
635
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
662
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
|
|
636
663
|
tableCtx = null;
|
|
637
664
|
}
|
|
638
665
|
} else {
|
|
@@ -641,21 +668,21 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
641
668
|
} else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
|
|
642
669
|
const drawTextChild = findDescendant(el, "drawText");
|
|
643
670
|
if (drawTextChild) {
|
|
644
|
-
extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
|
|
671
|
+
extractDrawTextBlocks(drawTextChild, blocks, ctx.styleMap, ctx.sectionNum);
|
|
645
672
|
} else {
|
|
646
673
|
const imgRef = extractImageRef(el);
|
|
647
674
|
if (imgRef) {
|
|
648
|
-
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
649
|
-
} else if (warnings && sectionNum) {
|
|
650
|
-
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
675
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
|
|
676
|
+
} else if (ctx.warnings && ctx.sectionNum) {
|
|
677
|
+
ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
651
678
|
}
|
|
652
679
|
}
|
|
653
680
|
} else if (localTag === "drawText") {
|
|
654
|
-
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
681
|
+
extractDrawTextBlocks(el, blocks, ctx.styleMap, ctx.sectionNum);
|
|
655
682
|
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
656
683
|
walkChildren(el, d + 1);
|
|
657
684
|
} else if (localTag === "run") {
|
|
658
|
-
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack,
|
|
685
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
659
686
|
}
|
|
660
687
|
}
|
|
661
688
|
};
|
|
@@ -1928,6 +1955,7 @@ function parseHwp5Document(buffer, options) {
|
|
|
1928
1955
|
const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
|
|
1929
1956
|
const totalTarget = pageFilter ? pageFilter.size : sections.length;
|
|
1930
1957
|
const blocks = [];
|
|
1958
|
+
const nestedTableCounter = { count: 0 };
|
|
1931
1959
|
let totalDecompressed = 0;
|
|
1932
1960
|
let parsedSections = 0;
|
|
1933
1961
|
for (let si = 0; si < sections.length; si++) {
|
|
@@ -1938,7 +1966,7 @@ function parseHwp5Document(buffer, options) {
|
|
|
1938
1966
|
totalDecompressed += data.length;
|
|
1939
1967
|
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
1940
1968
|
const records = readRecords(data);
|
|
1941
|
-
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
|
|
1969
|
+
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1, nestedTableCounter);
|
|
1942
1970
|
blocks.push(...sectionBlocks);
|
|
1943
1971
|
parsedSections++;
|
|
1944
1972
|
options?.onProgress?.(parsedSections, totalTarget);
|
|
@@ -2258,13 +2286,13 @@ function extractHwp5ImagesLenient(lcfb, blocks, compressed, warnings) {
|
|
|
2258
2286
|
}
|
|
2259
2287
|
return images;
|
|
2260
2288
|
}
|
|
2261
|
-
function parseSection(records, docInfo, warnings, sectionNum) {
|
|
2289
|
+
function parseSection(records, docInfo, warnings, sectionNum, counter) {
|
|
2262
2290
|
const blocks = [];
|
|
2263
2291
|
let i = 0;
|
|
2264
2292
|
while (i < records.length) {
|
|
2265
2293
|
const rec = records[i];
|
|
2266
2294
|
if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
|
|
2267
|
-
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
|
|
2295
|
+
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i, counter);
|
|
2268
2296
|
if (paragraph) {
|
|
2269
2297
|
const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
|
|
2270
2298
|
if (docInfo && charShapeIds.length > 0) {
|
|
@@ -2287,7 +2315,7 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2287
2315
|
if (rec.tagId === TAG_CTRL_HEADER && rec.level <= 1 && rec.data.length >= 4) {
|
|
2288
2316
|
const ctrlId = rec.data.subarray(0, 4).toString("ascii");
|
|
2289
2317
|
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2290
|
-
const { table, nextIdx } = parseTableBlock(records, i);
|
|
2318
|
+
const { table, nextIdx } = parseTableBlock(records, i, counter);
|
|
2291
2319
|
if (table) blocks.push({ type: "table", table, pageNumber: sectionNum });
|
|
2292
2320
|
i = nextIdx;
|
|
2293
2321
|
continue;
|
|
@@ -2392,7 +2420,7 @@ function resolveCharStyle(charShapeIds, docInfo) {
|
|
|
2392
2420
|
if (cs.attrFlags & 2) style.bold = true;
|
|
2393
2421
|
return style.fontSize || style.bold || style.italic ? style : void 0;
|
|
2394
2422
|
}
|
|
2395
|
-
function parseParagraphWithTables(records, startIdx) {
|
|
2423
|
+
function parseParagraphWithTables(records, startIdx, counter) {
|
|
2396
2424
|
const startLevel = records[startIdx].level;
|
|
2397
2425
|
let text = "";
|
|
2398
2426
|
const tables = [];
|
|
@@ -2414,7 +2442,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2414
2442
|
if (rec.tagId === TAG_CTRL_HEADER && rec.data.length >= 4) {
|
|
2415
2443
|
const ctrlId = rec.data.subarray(0, 4).toString("ascii");
|
|
2416
2444
|
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2417
|
-
const { table, nextIdx } = parseTableBlock(records, i);
|
|
2445
|
+
const { table, nextIdx } = parseTableBlock(records, i, counter);
|
|
2418
2446
|
if (table) tables.push(table);
|
|
2419
2447
|
i = nextIdx;
|
|
2420
2448
|
continue;
|
|
@@ -2425,7 +2453,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2425
2453
|
const trimmed = text.trim();
|
|
2426
2454
|
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
|
|
2427
2455
|
}
|
|
2428
|
-
function parseTableBlock(records, startIdx) {
|
|
2456
|
+
function parseTableBlock(records, startIdx, counter) {
|
|
2429
2457
|
const tableLevel = records[startIdx].level;
|
|
2430
2458
|
let i = startIdx + 1;
|
|
2431
2459
|
let rows = 0, cols = 0;
|
|
@@ -2439,7 +2467,7 @@ function parseTableBlock(records, startIdx) {
|
|
|
2439
2467
|
cols = Math.min(rec.data.readUInt16LE(6), MAX_COLS);
|
|
2440
2468
|
}
|
|
2441
2469
|
if (rec.tagId === TAG_LIST_HEADER) {
|
|
2442
|
-
const { cell, nextIdx } = parseCellBlock(records, i, tableLevel);
|
|
2470
|
+
const { cell, nextIdx } = parseCellBlock(records, i, tableLevel, counter);
|
|
2443
2471
|
if (cell) cells.push(cell);
|
|
2444
2472
|
i = nextIdx;
|
|
2445
2473
|
continue;
|
|
@@ -2460,7 +2488,7 @@ function parseTableBlock(records, startIdx) {
|
|
|
2460
2488
|
const cellRows = arrangeCells(rows, cols, cells);
|
|
2461
2489
|
return { table: buildTable(cellRows), nextIdx: i };
|
|
2462
2490
|
}
|
|
2463
|
-
function parseCellBlock(records, startIdx, tableLevel) {
|
|
2491
|
+
function parseCellBlock(records, startIdx, tableLevel, counter) {
|
|
2464
2492
|
const rec = records[startIdx];
|
|
2465
2493
|
const cellLevel = rec.level;
|
|
2466
2494
|
const texts = [];
|
|
@@ -2485,6 +2513,17 @@ function parseCellBlock(records, startIdx, tableLevel) {
|
|
|
2485
2513
|
const t = extractText(r.data).trim();
|
|
2486
2514
|
if (t) texts.push(t);
|
|
2487
2515
|
}
|
|
2516
|
+
if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
|
|
2517
|
+
const ctrlId = r.data.subarray(0, 4).toString("ascii");
|
|
2518
|
+
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2519
|
+
if (counter) {
|
|
2520
|
+
counter.count++;
|
|
2521
|
+
texts.push(`[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}]`);
|
|
2522
|
+
} else {
|
|
2523
|
+
texts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
|
|
2524
|
+
}
|
|
2525
|
+
}
|
|
2526
|
+
}
|
|
2488
2527
|
i++;
|
|
2489
2528
|
}
|
|
2490
2529
|
return { cell: { text: texts.join("\n"), colSpan, rowSpan, colAddr, rowAddr }, nextIdx: i };
|
|
@@ -2811,21 +2850,21 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
2811
2850
|
import JSZip4 from "jszip";
|
|
2812
2851
|
import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
|
|
2813
2852
|
var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
|
|
2814
|
-
function getChildElements(parent,
|
|
2853
|
+
function getChildElements(parent, localName3) {
|
|
2815
2854
|
const result = [];
|
|
2816
2855
|
const children = parent.childNodes;
|
|
2817
2856
|
for (let i = 0; i < children.length; i++) {
|
|
2818
2857
|
const node = children[i];
|
|
2819
2858
|
if (node.nodeType === 1) {
|
|
2820
2859
|
const el = node;
|
|
2821
|
-
if (el.localName ===
|
|
2860
|
+
if (el.localName === localName3 || el.tagName?.endsWith(`:${localName3}`)) {
|
|
2822
2861
|
result.push(el);
|
|
2823
2862
|
}
|
|
2824
2863
|
}
|
|
2825
2864
|
}
|
|
2826
2865
|
return result;
|
|
2827
2866
|
}
|
|
2828
|
-
function findElements(parent,
|
|
2867
|
+
function findElements(parent, localName3) {
|
|
2829
2868
|
const result = [];
|
|
2830
2869
|
const walk = (node) => {
|
|
2831
2870
|
const children = node.childNodes;
|
|
@@ -2833,7 +2872,7 @@ function findElements(parent, localName2) {
|
|
|
2833
2872
|
const child = children[i];
|
|
2834
2873
|
if (child.nodeType === 1) {
|
|
2835
2874
|
const el = child;
|
|
2836
|
-
if (el.localName ===
|
|
2875
|
+
if (el.localName === localName3 || el.tagName?.endsWith(`:${localName3}`)) {
|
|
2837
2876
|
result.push(el);
|
|
2838
2877
|
}
|
|
2839
2878
|
walk(el);
|
|
@@ -2843,11 +2882,11 @@ function findElements(parent, localName2) {
|
|
|
2843
2882
|
walk(parent);
|
|
2844
2883
|
return result;
|
|
2845
2884
|
}
|
|
2846
|
-
function getAttr(el,
|
|
2885
|
+
function getAttr(el, localName3) {
|
|
2847
2886
|
const attrs = el.attributes;
|
|
2848
2887
|
for (let i = 0; i < attrs.length; i++) {
|
|
2849
2888
|
const attr = attrs[i];
|
|
2850
|
-
if (attr.localName ===
|
|
2889
|
+
if (attr.localName === localName3 || attr.name === localName3) return attr.value;
|
|
2851
2890
|
}
|
|
2852
2891
|
return null;
|
|
2853
2892
|
}
|
|
@@ -3194,11 +3233,11 @@ async function parseDocxDocument(buffer, options) {
|
|
|
3194
3233
|
const node = children[i];
|
|
3195
3234
|
if (node.nodeType !== 1) continue;
|
|
3196
3235
|
const el = node;
|
|
3197
|
-
const
|
|
3198
|
-
if (
|
|
3236
|
+
const localName3 = el.localName ?? el.tagName?.split(":").pop();
|
|
3237
|
+
if (localName3 === "p") {
|
|
3199
3238
|
const block = parseParagraph(el, styles, numbering, footnotes, rels);
|
|
3200
3239
|
if (block) blocks.push(block);
|
|
3201
|
-
} else if (
|
|
3240
|
+
} else if (localName3 === "tbl") {
|
|
3202
3241
|
const block = parseTable(el, styles, numbering, footnotes, rels);
|
|
3203
3242
|
if (block) blocks.push(block);
|
|
3204
3243
|
}
|
|
@@ -3236,6 +3275,259 @@ async function parseDocxDocument(buffer, options) {
|
|
|
3236
3275
|
};
|
|
3237
3276
|
}
|
|
3238
3277
|
|
|
3278
|
+
// src/hwpml/parser.ts
|
|
3279
|
+
import { DOMParser as DOMParser4 } from "@xmldom/xmldom";
|
|
3280
|
+
var MAX_XML_DEPTH2 = 200;
|
|
3281
|
+
var MAX_TABLE_ROWS = 5e3;
|
|
3282
|
+
var MAX_TABLE_COLS = 500;
|
|
3283
|
+
var MAX_HWPML_BYTES = 50 * 1024 * 1024;
|
|
3284
|
+
function parseHwpmlDocument(buffer, options) {
|
|
3285
|
+
if (buffer.byteLength > MAX_HWPML_BYTES) {
|
|
3286
|
+
throw new Error(`HWPML \uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC (${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB > 50MB)`);
|
|
3287
|
+
}
|
|
3288
|
+
const text = new TextDecoder("utf-8").decode(buffer).replace(/^\uFEFF/, "");
|
|
3289
|
+
const normalized = text.replace(/ /g, " ");
|
|
3290
|
+
const xml = stripDtd(normalized);
|
|
3291
|
+
const warnings = [];
|
|
3292
|
+
const parser = new DOMParser4({
|
|
3293
|
+
onError: (_level, msg) => {
|
|
3294
|
+
warnings.push({ message: `HWPML XML \uD30C\uC2F1 \uACBD\uACE0: ${msg}`, code: "MALFORMED_XML" });
|
|
3295
|
+
}
|
|
3296
|
+
});
|
|
3297
|
+
const doc = parser.parseFromString(xml, "text/xml");
|
|
3298
|
+
if (!doc.documentElement) {
|
|
3299
|
+
return { markdown: "", blocks: [], warnings };
|
|
3300
|
+
}
|
|
3301
|
+
const root = doc.documentElement;
|
|
3302
|
+
const metadata = {};
|
|
3303
|
+
const docSummary = findChild(root, "DOCSUMMARY");
|
|
3304
|
+
if (docSummary) {
|
|
3305
|
+
const title = findChild(docSummary, "TITLE");
|
|
3306
|
+
const author = findChild(docSummary, "AUTHOR");
|
|
3307
|
+
const date = findChild(docSummary, "DATE");
|
|
3308
|
+
if (title) metadata.title = textContent(title).trim();
|
|
3309
|
+
if (author) metadata.author = textContent(author).trim();
|
|
3310
|
+
if (date) metadata.createdAt = textContent(date).trim() || void 0;
|
|
3311
|
+
}
|
|
3312
|
+
const paraShapeMap = buildParaShapeMap(root);
|
|
3313
|
+
const body = findChild(root, "BODY");
|
|
3314
|
+
if (!body) {
|
|
3315
|
+
return { markdown: "", blocks: [], metadata, warnings };
|
|
3316
|
+
}
|
|
3317
|
+
const blocks = [];
|
|
3318
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, countSections(body)) : null;
|
|
3319
|
+
let sectionIdx = 0;
|
|
3320
|
+
const children = body.childNodes;
|
|
3321
|
+
for (let i = 0; i < children.length; i++) {
|
|
3322
|
+
const el = children[i];
|
|
3323
|
+
if (el.nodeType !== 1) continue;
|
|
3324
|
+
if (localName(el) !== "SECTION") continue;
|
|
3325
|
+
sectionIdx++;
|
|
3326
|
+
if (pageFilter && !pageFilter.has(sectionIdx)) continue;
|
|
3327
|
+
parseSection2(el, blocks, paraShapeMap, sectionIdx, warnings);
|
|
3328
|
+
}
|
|
3329
|
+
const outline = blocks.filter((b) => b.type === "heading" && b.text).map((b) => ({ level: b.level ?? 1, text: b.text, pageNumber: b.pageNumber }));
|
|
3330
|
+
const markdown = blocksToMarkdown(blocks);
|
|
3331
|
+
return {
|
|
3332
|
+
markdown,
|
|
3333
|
+
blocks,
|
|
3334
|
+
metadata: Object.keys(metadata).length > 0 ? metadata : void 0,
|
|
3335
|
+
outline: outline.length > 0 ? outline : void 0,
|
|
3336
|
+
warnings: warnings.length > 0 ? warnings : void 0
|
|
3337
|
+
};
|
|
3338
|
+
}
|
|
3339
|
+
function buildParaShapeMap(root) {
|
|
3340
|
+
const map = /* @__PURE__ */ new Map();
|
|
3341
|
+
const head = findChild(root, "HEAD");
|
|
3342
|
+
if (!head) return map;
|
|
3343
|
+
const mappingTable = findChild(head, "MAPPINGTABLE");
|
|
3344
|
+
if (!mappingTable) return map;
|
|
3345
|
+
const paraShapeList = findChild(mappingTable, "PARASHAPELIST");
|
|
3346
|
+
if (!paraShapeList) return map;
|
|
3347
|
+
const children = paraShapeList.childNodes;
|
|
3348
|
+
for (let i = 0; i < children.length; i++) {
|
|
3349
|
+
const el = children[i];
|
|
3350
|
+
if (el.nodeType !== 1 || localName(el) !== "PARASHAPE") continue;
|
|
3351
|
+
const id = el.getAttribute("Id") ?? "";
|
|
3352
|
+
const headingType = el.getAttribute("HeadingType") ?? "None";
|
|
3353
|
+
const level = parseInt(el.getAttribute("Level") ?? "0", 10);
|
|
3354
|
+
let headingLevel = null;
|
|
3355
|
+
if (headingType === "Outline") {
|
|
3356
|
+
const safeLevel = isNaN(level) ? 0 : Math.max(0, level);
|
|
3357
|
+
headingLevel = Math.min(safeLevel + 1, 6);
|
|
3358
|
+
}
|
|
3359
|
+
map.set(id, { headingLevel });
|
|
3360
|
+
}
|
|
3361
|
+
return map;
|
|
3362
|
+
}
|
|
3363
|
+
function parseSection2(section, blocks, paraShapeMap, sectionNum, warnings) {
|
|
3364
|
+
walkContent(section, blocks, paraShapeMap, sectionNum, warnings, false);
|
|
3365
|
+
}
|
|
3366
|
+
/**
 * Recursively walks the element children of `node` and dispatches on tag name.
 *
 * - HEADER / FOOTER subtrees are skipped entirely.
 * - P is handed to `parseParagraph2`, TABLE to `parseTable2`; both are
 *   suppressed (but still not descended into) while `inHeaderFooter` is true.
 * - Every other element (including PARALIST, SECTION and COLDEF) is descended
 *   into with `depth + 1`.
 *
 * NOTE(review): P elements are never descended into by this walker, so a TABLE
 * nested inside a P is not reached from here — confirm that is intended.
 *
 * @param {object} node - Node whose children are visited.
 * @param {Array} blocks - Output array that parsed blocks are appended to.
 * @param {Map} paraShapeMap - Paragraph-shape lookup.
 * @param {number} sectionNum - Value stored as `pageNumber` on emitted blocks.
 * @param {Array} warnings - Output array for warnings.
 * @param {boolean} inHeaderFooter - When true, paragraphs and tables are not emitted.
 * @param {number} [depth=0] - Current recursion depth; the walk stops silently
 *   once it exceeds MAX_XML_DEPTH2.
 */
function walkContent(node, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth = 0) {
  if (depth > MAX_XML_DEPTH2) return;

  const kids = node.childNodes;
  for (let idx = 0; idx < kids.length; idx += 1) {
    const child = kids[idx];
    if (child.nodeType !== 1) continue;

    switch (localName(child)) {
      case "HEADER":
      case "FOOTER":
        // Header/footer content is not part of the body output.
        break;
      case "P":
        if (!inHeaderFooter) parseParagraph2(child, blocks, paraShapeMap, sectionNum);
        break;
      case "TABLE":
        if (!inHeaderFooter) parseTable2(child, blocks, paraShapeMap, sectionNum, warnings);
        break;
      default:
        // Containers such as PARALIST / SECTION / COLDEF and any unknown element.
        walkContent(child, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth + 1);
    }
  }
}
|
|
3395
|
+
/**
 * Converts one P element into a heading or paragraph block.
 *
 * Nothing is emitted when the paragraph's extracted text is empty. A heading
 * block is produced when the paragraph's "ParaShape" id resolves to an entry
 * with a non-null `headingLevel`; otherwise a paragraph block is produced.
 *
 * @param {object} el - The P element.
 * @param {Array} blocks - Output array the block is appended to.
 * @param {Map<string, {headingLevel: (number|null)}>} paraShapeMap - Shape lookup.
 * @param {number} sectionNum - Value stored as `pageNumber` on the block.
 */
function parseParagraph2(el, blocks, paraShapeMap, sectionNum) {
  const shapeId = el.getAttribute("ParaShape") ?? "";
  const headingLevel = paraShapeMap.get(shapeId)?.headingLevel;

  const text = extractParagraphText(el);
  if (!text) return;

  const block = headingLevel != null
    ? { type: "heading", text, level: headingLevel, pageNumber: sectionNum }
    : { type: "paragraph", text, pageNumber: sectionNum };
  blocks.push(block);
}
|
|
3406
|
+
/**
 * Returns the text of a paragraph: every fragment gathered by
 * `collectCharText`, concatenated without a separator and trimmed.
 *
 * @param {object} p - Paragraph node.
 * @returns {string} Trimmed paragraph text ("" when nothing was collected).
 */
function extractParagraphText(p) {
  const pieces = [];
  collectCharText(p, pieces);
  const joined = pieces.join("");
  return joined.trim();
}
|
|
3411
|
+
/**
 * Appends the text of every CHAR element found under `node` to `parts`.
 *
 * TABLE, PICTURE, SHAPEOBJECT and AUTONUM subtrees are skipped; any other
 * element is searched recursively.
 *
 * @param {object} node - Node whose element children are visited.
 * @param {string[]} parts - Output array; non-empty CHAR texts are pushed in
 *   document order.
 * @param {number} [depth=0] - Current recursion depth; collection stops
 *   silently once it exceeds MAX_XML_DEPTH2.
 */
function collectCharText(node, parts, depth = 0) {
  if (depth > MAX_XML_DEPTH2) return;

  const kids = node.childNodes;
  for (let idx = 0; idx < kids.length; idx += 1) {
    const child = kids[idx];
    if (child.nodeType !== 1) continue;

    switch (localName(child)) {
      case "CHAR": {
        const chunk = textContent(child);
        if (chunk) parts.push(chunk);
        break;
      }
      case "TABLE":
      case "PICTURE":
      case "SHAPEOBJECT":
      case "AUTONUM":
        // Contributes no paragraph text; do not descend.
        break;
      default:
        collectCharText(child, parts, depth + 1);
    }
  }
}
|
|
3428
|
+
/**
 * Converts a TABLE element into a table block and appends it to `blocks`.
 *
 * The declared RowCount x ColCount fixes the grid size. Each ROW > CELL child
 * is placed at its RowAddr / ColAddr, and the remaining positions covered by
 * its RowSpan / ColSpan are filled with empty placeholder cells. Positions
 * left unset are also filled with empty cells before `buildTable` is called.
 *
 * Nothing is emitted when the declared dimensions are missing, unparseable or
 * not positive, or when no CELL is found. Tables larger than MAX_TABLE_ROWS /
 * MAX_TABLE_COLS are skipped with a TRUNCATED_TABLE warning.
 *
 * Cells whose address is unparseable, negative, or outside the declared grid
 * are ignored. (A negative address used to index `grid[-1]` and throw.)
 *
 * @param {object} el - The TABLE element.
 * @param {Array} blocks - Output array the table block is appended to.
 * @param {Map} paraShapeMap - Unused here; kept for a signature parallel to the
 *   other parse helpers.
 * @param {number} sectionNum - Value stored as `pageNumber` on the block.
 * @param {Array<{message: string, code: string}>} warnings - Output array for warnings.
 */
function parseTable2(el, blocks, paraShapeMap, sectionNum, warnings) {
  const rowCount = Number.parseInt(el.getAttribute("RowCount") ?? "0", 10);
  const colCount = Number.parseInt(el.getAttribute("ColCount") ?? "0", 10);
  // `!(x > 0)` rejects NaN, zero and negative declarations in one test.
  if (!(rowCount > 0) || !(colCount > 0)) return;
  if (rowCount > MAX_TABLE_ROWS || colCount > MAX_TABLE_COLS) {
    warnings.push({ message: `\uD14C\uC774\uBE14 \uD06C\uAE30 \uCD08\uACFC (${rowCount}x${colCount}) \u2014 \uC2A4\uD0B5`, code: "TRUNCATED_TABLE" });
    return;
  }

  // Pass 1: collect every ROW > CELL with its address and clamped spans.
  const cells = [];
  const rowNodes = el.childNodes;
  for (let i = 0; i < rowNodes.length; i++) {
    const rowEl = rowNodes[i];
    if (rowEl.nodeType !== 1 || localName(rowEl) !== "ROW") continue;
    const cellNodes = rowEl.childNodes;
    for (let j = 0; j < cellNodes.length; j++) {
      const cellEl = cellNodes[j];
      if (cellEl.nodeType !== 1 || localName(cellEl) !== "CELL") continue;
      const colAddr = Number.parseInt(cellEl.getAttribute("ColAddr") ?? "0", 10);
      const rowAddr = Number.parseInt(cellEl.getAttribute("RowAddr") ?? "0", 10);
      // Spans fall back to 1 when missing/invalid and are capped at the table limits.
      const colSpan = Math.min(Math.max(1, Number.parseInt(cellEl.getAttribute("ColSpan") ?? "1", 10) || 1), MAX_TABLE_COLS);
      const rowSpan = Math.min(Math.max(1, Number.parseInt(cellEl.getAttribute("RowSpan") ?? "1", 10) || 1), MAX_TABLE_ROWS);
      const cellText = extractCellText(cellEl);
      cells.push({ text: cellText, colSpan, rowSpan, colAddr, rowAddr });
    }
  }
  if (cells.length === 0) return;

  // Pass 2: place cells on a rowCount x colCount grid.
  const grid = Array.from({ length: rowCount }, () => Array(colCount).fill(null));
  for (const cell of cells) {
    const r = cell.rowAddr;
    const c = cell.colAddr;
    // NaN fails every comparison, so it is rejected together with out-of-range values.
    const insideGrid = r >= 0 && r < rowCount && c >= 0 && c < colCount;
    if (!insideGrid) continue;
    grid[r][c] = cell;
    for (let dr = 0; dr < cell.rowSpan; dr++) {
      for (let dc = 0; dc < cell.colSpan; dc++) {
        if (dr === 0 && dc === 0) continue;
        if (r + dr < rowCount && c + dc < colCount) {
          grid[r + dr][c + dc] = { text: "", colSpan: 1, rowSpan: 1 };
        }
      }
    }
  }

  const cellRows = grid.map(
    (row) => row.map((cell) => cell ?? { text: "", colSpan: 1, rowSpan: 1 })
  );
  const table = buildTable(cellRows);
  blocks.push({ type: "table", table, pageNumber: sectionNum });
}
|
|
3475
|
+
/**
 * Returns the text of a table cell: the non-empty fragments gathered by
 * `collectCellText`, joined with newlines and trimmed.
 *
 * @param {object} cellEl - Cell node.
 * @returns {string} Cell text ("" when nothing was collected).
 */
function extractCellText(cellEl) {
  const collected = [];
  collectCellText(cellEl, collected, 0);
  const nonEmpty = collected.filter((part) => Boolean(part));
  return nonEmpty.join("\n").trim();
}
|
|
3480
|
+
/**
 * Gathers the text fragments of a cell subtree into `parts`.
 *
 * Each P element contributes its paragraph text (when non-empty), each TABLE
 * element contributes a fixed nested-table marker without being descended
 * into, and any other element is searched recursively.
 *
 * @param {object} node - Node whose element children are visited.
 * @param {string[]} parts - Output array; fragments are pushed in document order.
 * @param {number} depth - Current recursion depth; collection stops silently
 *   once it exceeds 20.
 */
function collectCellText(node, parts, depth) {
  if (depth > 20) return;

  const kids = node.childNodes;
  for (let idx = 0; idx < kids.length; idx += 1) {
    const child = kids[idx];
    if (child.nodeType !== 1) continue;

    switch (localName(child)) {
      case "P": {
        const paragraphText = extractParagraphText(child);
        if (paragraphText) parts.push(paragraphText);
        break;
      }
      case "TABLE":
        parts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
        break;
      default:
        collectCellText(child, parts, depth + 1);
    }
  }
}
|
|
3497
|
+
/**
 * Returns an element's name with a leading "prefix:" removed.
 *
 * Uses `tagName` when truthy, otherwise `localName`, otherwise "". Only the
 * first colon-terminated prefix is stripped, and only when at least one
 * character precedes the colon.
 *
 * @param {{tagName?: string, localName?: string}} el - Element-like node.
 * @returns {string} The unprefixed name.
 */
function localName(el) {
  const qualified = el.tagName || el.localName || "";
  const leadingPrefix = /^[^:]+:/;
  return qualified.replace(leadingPrefix, "");
}
|
|
3500
|
+
/**
 * Finds the first direct element child of `parent` whose unprefixed name
 * equals `tag`.
 *
 * @param {object} parent - Node whose `childNodes` are scanned (not recursive).
 * @param {string} tag - Unprefixed tag name to match exactly.
 * @returns {object|null} The matching child element, or null when none matches.
 */
function findChild(parent, tag) {
  const kids = parent.childNodes;
  let idx = 0;
  while (idx < kids.length) {
    const candidate = kids[idx];
    idx += 1;
    if (candidate.nodeType !== 1) continue;
    if (localName(candidate) === tag) return candidate;
  }
  return null;
}
|
|
3508
|
+
/**
 * Concatenates, in document order, the character data found under `el`.
 *
 * Text nodes (nodeType 3) and CDATA sections (nodeType 4) contribute their
 * `nodeValue`; element nodes (nodeType 1) are descended into; every other node
 * type (comments, processing instructions, ...) is ignored.
 *
 * The traversal uses an explicit stack rather than recursion so that very
 * deeply nested input cannot overflow the call stack.
 *
 * @param {object} el - Node whose descendants are read.
 * @returns {string} The concatenated text ("" when there is none).
 */
function textContent(el) {
  const parts = [];
  const pending = [];

  // Children are pushed in reverse so that pop() yields them in document order.
  const pushChildren = (parent) => {
    const kids = parent.childNodes;
    for (let i = kids.length - 1; i >= 0; i--) {
      pending.push(kids[i]);
    }
  };

  pushChildren(el);
  while (pending.length > 0) {
    const node = pending.pop();
    if (node.nodeType === 3 || node.nodeType === 4) {
      parts.push(node.nodeValue || "");
    } else if (node.nodeType === 1) {
      pushChildren(node);
    }
  }
  return parts.join("");
}
|
|
3521
|
+
/**
 * Counts the direct element children of `body` whose unprefixed name is
 * "SECTION".
 *
 * @param {object} body - Node whose `childNodes` are scanned (not recursive).
 * @returns {number} Number of SECTION children.
 */
function countSections(body) {
  const kids = body.childNodes;
  let total = 0;
  for (let idx = 0; idx < kids.length; idx += 1) {
    const child = kids[idx];
    const isSection = child.nodeType === 1 && localName(child) === "SECTION";
    if (isSection) total += 1;
  }
  return total;
}
|
|
3530
|
+
|
|
3239
3531
|
// src/form/recognize.ts
|
|
3240
3532
|
var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
|
|
3241
3533
|
"\uC131\uBA85",
|
|
@@ -3570,7 +3862,7 @@ function fillInlineFields(text, values, filled, matchedLabels) {
|
|
|
3570
3862
|
|
|
3571
3863
|
// src/form/filler-hwpx.ts
|
|
3572
3864
|
import JSZip5 from "jszip";
|
|
3573
|
-
import { DOMParser as
|
|
3865
|
+
import { DOMParser as DOMParser5, XMLSerializer } from "@xmldom/xmldom";
|
|
3574
3866
|
async function fillHwpx(hwpxBuffer, values) {
|
|
3575
3867
|
const zip = await JSZip5.loadAsync(hwpxBuffer);
|
|
3576
3868
|
const filled = [];
|
|
@@ -3580,7 +3872,7 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3580
3872
|
if (sectionFiles.length === 0) {
|
|
3581
3873
|
throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
3582
3874
|
}
|
|
3583
|
-
const xmlParser = new
|
|
3875
|
+
const xmlParser = new DOMParser5();
|
|
3584
3876
|
const xmlSerializer = new XMLSerializer();
|
|
3585
3877
|
for (const sectionPath of sectionFiles) {
|
|
3586
3878
|
const zipEntry = zip.file(sectionPath);
|
|
@@ -3612,10 +3904,10 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3612
3904
|
const trEl = rows[rowIdx];
|
|
3613
3905
|
const cells = findDirectChildren(trEl, "tc");
|
|
3614
3906
|
for (let colIdx = 0; colIdx < cells.length - 1; colIdx++) {
|
|
3615
|
-
const labelText =
|
|
3907
|
+
const labelText = extractCellText2(cells[colIdx]);
|
|
3616
3908
|
if (!isLabelCell(labelText)) continue;
|
|
3617
3909
|
const valueCell = cells[colIdx + 1];
|
|
3618
|
-
const valueText =
|
|
3910
|
+
const valueText = extractCellText2(valueCell);
|
|
3619
3911
|
if (isKeywordLabel(valueText)) continue;
|
|
3620
3912
|
const normalizedCellLabel = normalizeLabel(labelText);
|
|
3621
3913
|
if (!normalizedCellLabel) continue;
|
|
@@ -3640,14 +3932,14 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3640
3932
|
if (rows.length >= 2) {
|
|
3641
3933
|
const headerCells = findDirectChildren(rows[0], "tc");
|
|
3642
3934
|
const allLabels = headerCells.every((cell) => {
|
|
3643
|
-
const t =
|
|
3935
|
+
const t = extractCellText2(cell).trim();
|
|
3644
3936
|
return t.length > 0 && t.length <= 20 && isLabelCell(t);
|
|
3645
3937
|
});
|
|
3646
3938
|
if (allLabels) {
|
|
3647
3939
|
for (let rowIdx = 1; rowIdx < rows.length; rowIdx++) {
|
|
3648
3940
|
const dataCells = findDirectChildren(rows[rowIdx], "tc");
|
|
3649
3941
|
for (let colIdx = 0; colIdx < Math.min(headerCells.length, dataCells.length); colIdx++) {
|
|
3650
|
-
const headerLabel = normalizeLabel(
|
|
3942
|
+
const headerLabel = normalizeLabel(extractCellText2(headerCells[colIdx]));
|
|
3651
3943
|
const matchKey = findMatchingKey(headerLabel, normalizedValues);
|
|
3652
3944
|
if (matchKey === void 0) continue;
|
|
3653
3945
|
if (matchedLabels.has(matchKey)) continue;
|
|
@@ -3655,7 +3947,7 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3655
3947
|
replaceCellText(dataCells[colIdx], newValue);
|
|
3656
3948
|
matchedLabels.add(matchKey);
|
|
3657
3949
|
filled.push({
|
|
3658
|
-
label:
|
|
3950
|
+
label: extractCellText2(headerCells[colIdx]).trim(),
|
|
3659
3951
|
value: newValue,
|
|
3660
3952
|
row: rowIdx,
|
|
3661
3953
|
col: colIdx
|
|
@@ -3697,7 +3989,7 @@ async function fillHwpx(hwpxBuffer, values) {
|
|
|
3697
3989
|
const buffer = await zip.generateAsync({ type: "arraybuffer" });
|
|
3698
3990
|
return { buffer, filled, unmatched };
|
|
3699
3991
|
}
|
|
3700
|
-
function
|
|
3992
|
+
/**
 * Returns an element's name without its namespace prefix.
 *
 * Reads `tagName`, falling back to `localName` and then to "", and removes a
 * single leading "prefix:" when one or more characters precede the colon.
 *
 * @param {{tagName?: string, localName?: string}} el - Element-like node.
 * @returns {string} The unprefixed name.
 */
function localName2(el) {
  const rawName = el.tagName || el.localName || "";
  return rawName.replace(/^[^:]+:/, "");
}
|
|
3703
3995
|
function findAllElements(node, tagLocalName) {
|
|
@@ -3708,7 +4000,7 @@ function findAllElements(node, tagLocalName) {
|
|
|
3708
4000
|
for (let i = 0; i < children.length; i++) {
|
|
3709
4001
|
const child = children[i];
|
|
3710
4002
|
if (child.nodeType !== 1) continue;
|
|
3711
|
-
if (
|
|
4003
|
+
if (localName2(child) === tagLocalName) result.push(child);
|
|
3712
4004
|
walk(child);
|
|
3713
4005
|
}
|
|
3714
4006
|
};
|
|
@@ -3721,7 +4013,7 @@ function findDirectChildren(parent, tagLocalName) {
|
|
|
3721
4013
|
if (!children) return result;
|
|
3722
4014
|
for (let i = 0; i < children.length; i++) {
|
|
3723
4015
|
const child = children[i];
|
|
3724
|
-
if (child.nodeType === 1 &&
|
|
4016
|
+
if (child.nodeType === 1 && localName2(child) === tagLocalName) {
|
|
3725
4017
|
result.push(child);
|
|
3726
4018
|
}
|
|
3727
4019
|
}
|
|
@@ -3730,12 +4022,12 @@ function findDirectChildren(parent, tagLocalName) {
|
|
|
3730
4022
|
/**
 * Reports whether any ancestor of `el` is an element whose unprefixed name is
 * "tbl". The node itself is not tested, only its `parentNode` chain.
 *
 * @param {object} el - Node to start from.
 * @returns {boolean} True when a "tbl" ancestor exists, false otherwise.
 */
function isInsideTable(el) {
  for (let ancestor = el.parentNode; ancestor; ancestor = ancestor.parentNode) {
    const isTable = ancestor.nodeType === 1 && localName2(ancestor) === "tbl";
    if (isTable) return true;
  }
  return false;
}
|
|
3738
|
-
function
|
|
4030
|
+
function extractCellText2(tcEl) {
|
|
3739
4031
|
const parts = [];
|
|
3740
4032
|
const walk = (node) => {
|
|
3741
4033
|
const children = node.childNodes;
|
|
@@ -3745,7 +4037,7 @@ function extractCellText(tcEl) {
|
|
|
3745
4037
|
if (child.nodeType === 3) {
|
|
3746
4038
|
parts.push(child.textContent || "");
|
|
3747
4039
|
} else if (child.nodeType === 1) {
|
|
3748
|
-
const tag =
|
|
4040
|
+
const tag = localName2(child);
|
|
3749
4041
|
if (tag === "t") walk(child);
|
|
3750
4042
|
else if (tag === "run" || tag === "r" || tag === "p" || tag === "subList") walk(child);
|
|
3751
4043
|
else if (tag === "tab") parts.push(" ");
|
|
@@ -4470,6 +4762,8 @@ async function parse(input, options) {
|
|
|
4470
4762
|
}
|
|
4471
4763
|
case "hwp":
|
|
4472
4764
|
return parseHwp(buffer, options);
|
|
4765
|
+
case "hwpml":
|
|
4766
|
+
return parseHwpml(buffer, options);
|
|
4473
4767
|
case "pdf":
|
|
4474
4768
|
return parsePdf(buffer, options);
|
|
4475
4769
|
default:
|
|
@@ -4495,7 +4789,7 @@ async function parseHwp(buffer, options) {
|
|
|
4495
4789
|
async function parsePdf(buffer, options) {
|
|
4496
4790
|
let parsePdfDocument;
|
|
4497
4791
|
try {
|
|
4498
|
-
const mod = await import("./parser-
|
|
4792
|
+
const mod = await import("./parser-25LF2S2J.js");
|
|
4499
4793
|
parsePdfDocument = mod.parsePdfDocument;
|
|
4500
4794
|
} catch {
|
|
4501
4795
|
return {
|
|
@@ -4529,6 +4823,14 @@ async function parseDocx(buffer, options) {
|
|
|
4529
4823
|
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
4530
4824
|
}
|
|
4531
4825
|
}
|
|
4826
|
+
/**
 * Parses an HWPML document and wraps the outcome in a result object.
 *
 * Delegates to `parseHwpmlDocument`. Anything it throws is caught and reported
 * as a failure result instead of rejecting the returned promise.
 *
 * @param {*} buffer - Document data, passed through to `parseHwpmlDocument`.
 * @param {*} options - Parse options, passed through unchanged.
 * @returns {Promise<object>} On success:
 *   `{ success: true, fileType: "hwpml", markdown, blocks, metadata, outline, warnings }`.
 *   On failure: `{ success: false, fileType: "hwpml", error, code }`, where
 *   `error` is the Error's message (or a fixed fallback for non-Error throws)
 *   and `code` comes from `classifyError`.
 */
async function parseHwpml(buffer, options) {
  const fileType = "hwpml";
  try {
    const parsed = parseHwpmlDocument(buffer, options);
    return {
      success: true,
      fileType,
      markdown: parsed.markdown,
      blocks: parsed.blocks,
      metadata: parsed.metadata,
      outline: parsed.outline,
      warnings: parsed.warnings
    };
  } catch (err) {
    const error = err instanceof Error ? err.message : "HWPML \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType, error, code: classifyError(err) };
  }
}
|
|
4532
4834
|
async function fillForm(input, values, outputFormat = "markdown") {
|
|
4533
4835
|
let buffer;
|
|
4534
4836
|
if (typeof input === "string") {
|
|
@@ -4588,6 +4890,7 @@ export {
|
|
|
4588
4890
|
parse,
|
|
4589
4891
|
parseDocx,
|
|
4590
4892
|
parseHwp,
|
|
4893
|
+
parseHwpml,
|
|
4591
4894
|
parseHwpx,
|
|
4592
4895
|
parsePdf,
|
|
4593
4896
|
parseXlsx
|