kordoc 2.2.5 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -4
- package/dist/{chunk-UU2O6D3R.js → chunk-JFTFC2BB.js} +2 -2
- package/dist/{chunk-JH5XLWJQ.js.map → chunk-JFTFC2BB.js.map} +1 -1
- package/dist/{chunk-5Y2Q3BRW.js → chunk-M3E3C5GS.js} +8 -1
- package/dist/chunk-M3E3C5GS.js.map +1 -0
- package/dist/{chunk-RQWICKON.js → chunk-OEJJPCMM.js} +369 -73
- package/dist/chunk-OEJJPCMM.js.map +1 -0
- package/dist/{chunk-JH5XLWJQ.js → chunk-Z7UPTVMX.js} +2 -2
- package/dist/{chunk-UU2O6D3R.js.map → chunk-Z7UPTVMX.js.map} +1 -1
- package/dist/{chunk-OJ4QR33V.cjs → chunk-ZNJPRRIA.cjs} +2 -2
- package/dist/{chunk-OJ4QR33V.cjs.map → chunk-ZNJPRRIA.cjs.map} +1 -1
- package/dist/cli.js +7 -4
- package/dist/cli.js.map +1 -1
- package/dist/{detect-GYK3HKD5.js → detect-I7YIS4Q6.js} +4 -2
- package/dist/index.cjs +463 -160
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -2
- package/dist/index.d.ts +4 -2
- package/dist/index.js +387 -84
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -5
- package/dist/{parser-OIRWPKIQ.js → parser-25LF2S2J.js} +45 -42
- package/dist/{parser-OIRWPKIQ.js.map → parser-25LF2S2J.js.map} +1 -1
- package/dist/{parser-PXD73E4H.js → parser-4LKJXBPP.js} +45 -42
- package/dist/{parser-PXD73E4H.js.map → parser-4LKJXBPP.js.map} +1 -1
- package/dist/{parser-CYBX5MP4.cjs → parser-KBQZB3QY.cjs} +61 -58
- package/dist/{parser-CYBX5MP4.cjs.map → parser-KBQZB3QY.cjs.map} +1 -1
- package/dist/{watch-NSBABJ4A.js → watch-GXRBLW3Y.js} +4 -4
- package/package.json +2 -2
- package/dist/chunk-5Y2Q3BRW.js.map +0 -1
- package/dist/chunk-RQWICKON.js.map +0 -1
- /package/dist/{detect-GYK3HKD5.js.map → detect-I7YIS4Q6.js.map} +0 -0
- /package/dist/{watch-NSBABJ4A.js.map → watch-GXRBLW3Y.js.map} +0 -0
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import {
|
|
3
3
|
detectFormat,
|
|
4
4
|
detectZipFormat
|
|
5
|
-
} from "./chunk-5Y2Q3BRW.js";
|
|
5
|
+
} from "./chunk-M3E3C5GS.js";
|
|
6
6
|
import {
|
|
7
7
|
HEADING_RATIO_H1,
|
|
8
8
|
HEADING_RATIO_H2,
|
|
@@ -20,7 +20,7 @@ import {
|
|
|
20
20
|
sanitizeHref,
|
|
21
21
|
stripDtd,
|
|
22
22
|
toArrayBuffer
|
|
23
|
-
} from "./chunk-JH5XLWJQ.js";
|
|
23
|
+
} from "./chunk-Z7UPTVMX.js";
|
|
24
24
|
import {
|
|
25
25
|
parsePageRange
|
|
26
26
|
} from "./chunk-MOL7MDBG.js";
|
|
@@ -144,6 +144,7 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
144
144
|
const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
|
|
145
145
|
const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
|
|
146
146
|
const blocks = [];
|
|
147
|
+
const nestedTableCounter = { count: 0 };
|
|
147
148
|
let parsedSections = 0;
|
|
148
149
|
for (let si = 0; si < sectionPaths.length; si++) {
|
|
149
150
|
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
@@ -153,7 +154,7 @@ async function parseHwpxDocument(buffer, options) {
|
|
|
153
154
|
const xml = await file.async("text");
|
|
154
155
|
decompressed.total += xml.length * 2;
|
|
155
156
|
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
156
|
-
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
|
|
157
|
+
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1, nestedTableCounter));
|
|
157
158
|
parsedSections++;
|
|
158
159
|
options?.onProgress?.(parsedSections, totalTarget);
|
|
159
160
|
} catch (secErr) {
|
|
@@ -214,8 +215,20 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
214
215
|
ref
|
|
215
216
|
// 절대 경로일 수도 있음
|
|
216
217
|
];
|
|
218
|
+
let resolvedPath = null;
|
|
219
|
+
if (!ref.includes(".")) {
|
|
220
|
+
const prefixes = [`BinData/${ref}`, `Contents/BinData/${ref}`];
|
|
221
|
+
for (const prefix of prefixes) {
|
|
222
|
+
const match = zip.file(new RegExp(`^${prefix.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\.[a-zA-Z0-9]+$`));
|
|
223
|
+
if (match.length > 0) {
|
|
224
|
+
resolvedPath = match[0].name;
|
|
225
|
+
break;
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
}
|
|
217
229
|
let found = false;
|
|
218
|
-
|
|
230
|
+
const allCandidates = resolvedPath ? [resolvedPath, ...candidates] : candidates;
|
|
231
|
+
for (const path of allCandidates) {
|
|
219
232
|
if (isPathTraversal(path)) continue;
|
|
220
233
|
const file = zip.file(path);
|
|
221
234
|
if (!file) continue;
|
|
@@ -223,7 +236,8 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
|
|
|
223
236
|
const data = await file.async("uint8array");
|
|
224
237
|
decompressed.total += data.length;
|
|
225
238
|
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
226
|
-
const
|
|
239
|
+
const actualPath = path;
|
|
240
|
+
const ext = actualPath.includes(".") ? actualPath.split(".").pop() || "png" : "png";
|
|
227
241
|
const mimeType = imageExtToMime(ext);
|
|
228
242
|
imageIndex++;
|
|
229
243
|
const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
|
|
@@ -309,6 +323,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
309
323
|
let totalDecompressed = 0;
|
|
310
324
|
let entryCount = 0;
|
|
311
325
|
let sectionNum = 0;
|
|
326
|
+
const nestedTableCounter = { count: 0 };
|
|
312
327
|
while (pos < data.length - 30) {
|
|
313
328
|
if (data[pos] !== 80 || data[pos + 1] !== 75 || data[pos + 2] !== 3 || data[pos + 3] !== 4) {
|
|
314
329
|
pos++;
|
|
@@ -355,7 +370,7 @@ function extractFromBrokenZip(buffer) {
|
|
|
355
370
|
totalDecompressed += content.length * 2;
|
|
356
371
|
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
|
|
357
372
|
sectionNum++;
|
|
358
|
-
blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum));
|
|
373
|
+
blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum, nestedTableCounter));
|
|
359
374
|
} catch {
|
|
360
375
|
continue;
|
|
361
376
|
}
|
|
@@ -440,12 +455,40 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
440
455
|
}
|
|
441
456
|
}
|
|
442
457
|
}
|
|
443
|
-
function parseSectionXml(xml, styleMap, warnings, sectionNum) {
|
|
458
|
+
function makeNestedTableMarker(counter, rows) {
|
|
459
|
+
counter.count++;
|
|
460
|
+
const firstRow = rows[0] ?? [];
|
|
461
|
+
const hint = firstRow.map((c) => c.text.trim().replace(/\n/g, " ")).filter(Boolean).join(" | ");
|
|
462
|
+
const hintChars = [...hint];
|
|
463
|
+
const truncated = hintChars.length > 60 ? hintChars.slice(0, 60).join("") + "\u2026" : hint;
|
|
464
|
+
return truncated ? `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}: ${truncated}]` : `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}]`;
|
|
465
|
+
}
|
|
466
|
+
function handleNestedTable(newTable, tableStack, blocks, ctx) {
|
|
467
|
+
const parentTable = tableStack.pop();
|
|
468
|
+
let nestedCols = 0;
|
|
469
|
+
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
470
|
+
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
471
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
|
|
472
|
+
if (parentTable.cell) {
|
|
473
|
+
const marker = ctx.counter ? makeNestedTableMarker(ctx.counter, newTable.rows) : "[\uC911\uCCA9 \uD14C\uC774\uBE14]";
|
|
474
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + marker;
|
|
475
|
+
}
|
|
476
|
+
} else {
|
|
477
|
+
const nestedText = convertTableToText(newTable.rows);
|
|
478
|
+
if (parentTable.cell) {
|
|
479
|
+
const marker = ctx.counter ? makeNestedTableMarker(ctx.counter, newTable.rows) : "[\uC911\uCCA9 \uD14C\uC774\uBE14]";
|
|
480
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + marker + "\n" + nestedText;
|
|
481
|
+
}
|
|
482
|
+
}
|
|
483
|
+
return parentTable;
|
|
484
|
+
}
|
|
485
|
+
function parseSectionXml(xml, styleMap, warnings, sectionNum, counter) {
|
|
444
486
|
const parser = createXmlParser(warnings);
|
|
445
487
|
const doc = parser.parseFromString(stripDtd(xml), "text/xml");
|
|
446
488
|
if (!doc.documentElement) return [];
|
|
447
489
|
const blocks = [];
|
|
448
|
-
|
|
490
|
+
const ctx = { styleMap, warnings, sectionNum, counter };
|
|
491
|
+
walkSection(doc.documentElement, blocks, null, [], ctx);
|
|
449
492
|
return blocks;
|
|
450
493
|
}
|
|
451
494
|
function extractImageRef(el) {
|
|
@@ -466,7 +509,7 @@ function extractImageRef(el) {
|
|
|
466
509
|
if (directRef) return directRef;
|
|
467
510
|
return null;
|
|
468
511
|
}
|
|
469
|
-
function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth = 0) {
|
|
512
|
+
function walkSection(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
|
|
470
513
|
if (depth > MAX_XML_DEPTH) return;
|
|
471
514
|
const children = node.childNodes;
|
|
472
515
|
if (!children) return;
|
|
@@ -479,23 +522,12 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
479
522
|
case "tbl": {
|
|
480
523
|
if (tableCtx) tableStack.push(tableCtx);
|
|
481
524
|
const newTable = { rows: [], currentRow: [], cell: null };
|
|
482
|
-
walkSection(el, blocks, newTable, tableStack,
|
|
525
|
+
walkSection(el, blocks, newTable, tableStack, ctx, depth + 1);
|
|
483
526
|
if (newTable.rows.length > 0) {
|
|
484
527
|
if (tableStack.length > 0) {
|
|
485
|
-
|
|
486
|
-
let nestedCols = 0;
|
|
487
|
-
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
488
|
-
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
489
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
490
|
-
} else {
|
|
491
|
-
const nestedText = convertTableToText(newTable.rows);
|
|
492
|
-
if (parentTable.cell) {
|
|
493
|
-
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
494
|
-
}
|
|
495
|
-
}
|
|
496
|
-
tableCtx = parentTable;
|
|
528
|
+
tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
|
|
497
529
|
} else {
|
|
498
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
530
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
|
|
499
531
|
tableCtx = null;
|
|
500
532
|
}
|
|
501
533
|
} else {
|
|
@@ -506,7 +538,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
506
538
|
case "tr":
|
|
507
539
|
if (tableCtx) {
|
|
508
540
|
tableCtx.currentRow = [];
|
|
509
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
541
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
510
542
|
if (tableCtx.currentRow.length > 0) tableCtx.rows.push(tableCtx.currentRow);
|
|
511
543
|
tableCtx.currentRow = [];
|
|
512
544
|
}
|
|
@@ -514,7 +546,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
514
546
|
case "tc":
|
|
515
547
|
if (tableCtx) {
|
|
516
548
|
tableCtx.cell = { text: "", colSpan: 1, rowSpan: 1 };
|
|
517
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
549
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
518
550
|
if (tableCtx.cell) {
|
|
519
551
|
tableCtx.currentRow.push(tableCtx.cell);
|
|
520
552
|
tableCtx.cell = null;
|
|
@@ -540,19 +572,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
540
572
|
}
|
|
541
573
|
break;
|
|
542
574
|
case "p": {
|
|
543
|
-
const { text, href, footnote, style } = extractParagraphInfo(el, styleMap);
|
|
575
|
+
const { text, href, footnote, style } = extractParagraphInfo(el, ctx.styleMap);
|
|
544
576
|
if (text) {
|
|
545
577
|
if (tableCtx?.cell) {
|
|
546
578
|
tableCtx.cell.text += (tableCtx.cell.text ? "\n" : "") + text;
|
|
547
579
|
} else if (!tableCtx) {
|
|
548
|
-
const block = { type: "paragraph", text, pageNumber: sectionNum };
|
|
580
|
+
const block = { type: "paragraph", text, pageNumber: ctx.sectionNum };
|
|
549
581
|
if (style) block.style = style;
|
|
550
582
|
if (href) block.href = href;
|
|
551
583
|
if (footnote) block.footnoteText = footnote;
|
|
552
584
|
blocks.push(block);
|
|
553
585
|
}
|
|
554
586
|
}
|
|
555
|
-
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack,
|
|
587
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
556
588
|
break;
|
|
557
589
|
}
|
|
558
590
|
// 이미지/그림 — 경로 추출 또는 경고
|
|
@@ -561,19 +593,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
561
593
|
case "drawingObject": {
|
|
562
594
|
const imgRef = extractImageRef(el);
|
|
563
595
|
if (imgRef) {
|
|
564
|
-
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
565
|
-
} else if (warnings && sectionNum) {
|
|
566
|
-
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
596
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
|
|
597
|
+
} else if (ctx.warnings && ctx.sectionNum) {
|
|
598
|
+
ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
567
599
|
}
|
|
568
600
|
break;
|
|
569
601
|
}
|
|
570
602
|
default:
|
|
571
|
-
walkSection(el, blocks, tableCtx, tableStack,
|
|
603
|
+
walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
572
604
|
break;
|
|
573
605
|
}
|
|
574
606
|
}
|
|
575
607
|
}
|
|
576
|
-
function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth = 0) {
|
|
608
|
+
function walkParagraphChildren(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
|
|
577
609
|
if (depth > MAX_XML_DEPTH) return tableCtx;
|
|
578
610
|
const children = node.childNodes;
|
|
579
611
|
if (!children) return tableCtx;
|
|
@@ -589,23 +621,12 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
589
621
|
if (localTag === "tbl") {
|
|
590
622
|
if (tableCtx) tableStack.push(tableCtx);
|
|
591
623
|
const newTable = { rows: [], currentRow: [], cell: null };
|
|
592
|
-
walkSection(el, blocks, newTable, tableStack,
|
|
624
|
+
walkSection(el, blocks, newTable, tableStack, ctx, d + 1);
|
|
593
625
|
if (newTable.rows.length > 0) {
|
|
594
626
|
if (tableStack.length > 0) {
|
|
595
|
-
|
|
596
|
-
let nestedCols = 0;
|
|
597
|
-
for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
|
|
598
|
-
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
599
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
600
|
-
} else {
|
|
601
|
-
const nestedText = convertTableToText(newTable.rows);
|
|
602
|
-
if (parentTable.cell) {
|
|
603
|
-
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
604
|
-
}
|
|
605
|
-
}
|
|
606
|
-
tableCtx = parentTable;
|
|
627
|
+
tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
|
|
607
628
|
} else {
|
|
608
|
-
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
629
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
|
|
609
630
|
tableCtx = null;
|
|
610
631
|
}
|
|
611
632
|
} else {
|
|
@@ -614,21 +635,21 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
614
635
|
} else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
|
|
615
636
|
const drawTextChild = findDescendant(el, "drawText");
|
|
616
637
|
if (drawTextChild) {
|
|
617
|
-
extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
|
|
638
|
+
extractDrawTextBlocks(drawTextChild, blocks, ctx.styleMap, ctx.sectionNum);
|
|
618
639
|
} else {
|
|
619
640
|
const imgRef = extractImageRef(el);
|
|
620
641
|
if (imgRef) {
|
|
621
|
-
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
622
|
-
} else if (warnings && sectionNum) {
|
|
623
|
-
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
642
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
|
|
643
|
+
} else if (ctx.warnings && ctx.sectionNum) {
|
|
644
|
+
ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
624
645
|
}
|
|
625
646
|
}
|
|
626
647
|
} else if (localTag === "drawText") {
|
|
627
|
-
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
648
|
+
extractDrawTextBlocks(el, blocks, ctx.styleMap, ctx.sectionNum);
|
|
628
649
|
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
629
650
|
walkChildren(el, d + 1);
|
|
630
651
|
} else if (localTag === "run") {
|
|
631
|
-
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack,
|
|
652
|
+
tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
|
|
632
653
|
}
|
|
633
654
|
}
|
|
634
655
|
};
|
|
@@ -1901,6 +1922,7 @@ function parseHwp5Document(buffer, options) {
|
|
|
1901
1922
|
const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
|
|
1902
1923
|
const totalTarget = pageFilter ? pageFilter.size : sections.length;
|
|
1903
1924
|
const blocks = [];
|
|
1925
|
+
const nestedTableCounter = { count: 0 };
|
|
1904
1926
|
let totalDecompressed = 0;
|
|
1905
1927
|
let parsedSections = 0;
|
|
1906
1928
|
for (let si = 0; si < sections.length; si++) {
|
|
@@ -1911,7 +1933,7 @@ function parseHwp5Document(buffer, options) {
|
|
|
1911
1933
|
totalDecompressed += data.length;
|
|
1912
1934
|
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
1913
1935
|
const records = readRecords(data);
|
|
1914
|
-
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
|
|
1936
|
+
const sectionBlocks = parseSection(records, docInfo, warnings, si + 1, nestedTableCounter);
|
|
1915
1937
|
blocks.push(...sectionBlocks);
|
|
1916
1938
|
parsedSections++;
|
|
1917
1939
|
options?.onProgress?.(parsedSections, totalTarget);
|
|
@@ -2245,13 +2267,13 @@ function extractHwp5ImagesLenient(lcfb, blocks, compressed, warnings) {
|
|
|
2245
2267
|
}
|
|
2246
2268
|
return images;
|
|
2247
2269
|
}
|
|
2248
|
-
function parseSection(records, docInfo, warnings, sectionNum) {
|
|
2270
|
+
function parseSection(records, docInfo, warnings, sectionNum, counter) {
|
|
2249
2271
|
const blocks = [];
|
|
2250
2272
|
let i = 0;
|
|
2251
2273
|
while (i < records.length) {
|
|
2252
2274
|
const rec = records[i];
|
|
2253
2275
|
if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
|
|
2254
|
-
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
|
|
2276
|
+
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i, counter);
|
|
2255
2277
|
if (paragraph) {
|
|
2256
2278
|
const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
|
|
2257
2279
|
if (docInfo && charShapeIds.length > 0) {
|
|
@@ -2274,7 +2296,7 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2274
2296
|
if (rec.tagId === TAG_CTRL_HEADER && rec.level <= 1 && rec.data.length >= 4) {
|
|
2275
2297
|
const ctrlId = rec.data.subarray(0, 4).toString("ascii");
|
|
2276
2298
|
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2277
|
-
const { table, nextIdx } = parseTableBlock(records, i);
|
|
2299
|
+
const { table, nextIdx } = parseTableBlock(records, i, counter);
|
|
2278
2300
|
if (table) blocks.push({ type: "table", table, pageNumber: sectionNum });
|
|
2279
2301
|
i = nextIdx;
|
|
2280
2302
|
continue;
|
|
@@ -2379,7 +2401,7 @@ function resolveCharStyle(charShapeIds, docInfo) {
|
|
|
2379
2401
|
if (cs.attrFlags & 2) style.bold = true;
|
|
2380
2402
|
return style.fontSize || style.bold || style.italic ? style : void 0;
|
|
2381
2403
|
}
|
|
2382
|
-
function parseParagraphWithTables(records, startIdx) {
|
|
2404
|
+
function parseParagraphWithTables(records, startIdx, counter) {
|
|
2383
2405
|
const startLevel = records[startIdx].level;
|
|
2384
2406
|
let text = "";
|
|
2385
2407
|
const tables = [];
|
|
@@ -2401,7 +2423,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2401
2423
|
if (rec.tagId === TAG_CTRL_HEADER && rec.data.length >= 4) {
|
|
2402
2424
|
const ctrlId = rec.data.subarray(0, 4).toString("ascii");
|
|
2403
2425
|
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2404
|
-
const { table, nextIdx } = parseTableBlock(records, i);
|
|
2426
|
+
const { table, nextIdx } = parseTableBlock(records, i, counter);
|
|
2405
2427
|
if (table) tables.push(table);
|
|
2406
2428
|
i = nextIdx;
|
|
2407
2429
|
continue;
|
|
@@ -2412,7 +2434,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2412
2434
|
const trimmed = text.trim();
|
|
2413
2435
|
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
|
|
2414
2436
|
}
|
|
2415
|
-
function parseTableBlock(records, startIdx) {
|
|
2437
|
+
function parseTableBlock(records, startIdx, counter) {
|
|
2416
2438
|
const tableLevel = records[startIdx].level;
|
|
2417
2439
|
let i = startIdx + 1;
|
|
2418
2440
|
let rows = 0, cols = 0;
|
|
@@ -2426,7 +2448,7 @@ function parseTableBlock(records, startIdx) {
|
|
|
2426
2448
|
cols = Math.min(rec.data.readUInt16LE(6), MAX_COLS);
|
|
2427
2449
|
}
|
|
2428
2450
|
if (rec.tagId === TAG_LIST_HEADER) {
|
|
2429
|
-
const { cell, nextIdx } = parseCellBlock(records, i, tableLevel);
|
|
2451
|
+
const { cell, nextIdx } = parseCellBlock(records, i, tableLevel, counter);
|
|
2430
2452
|
if (cell) cells.push(cell);
|
|
2431
2453
|
i = nextIdx;
|
|
2432
2454
|
continue;
|
|
@@ -2447,7 +2469,7 @@ function parseTableBlock(records, startIdx) {
|
|
|
2447
2469
|
const cellRows = arrangeCells(rows, cols, cells);
|
|
2448
2470
|
return { table: buildTable(cellRows), nextIdx: i };
|
|
2449
2471
|
}
|
|
2450
|
-
function parseCellBlock(records, startIdx, tableLevel) {
|
|
2472
|
+
function parseCellBlock(records, startIdx, tableLevel, counter) {
|
|
2451
2473
|
const rec = records[startIdx];
|
|
2452
2474
|
const cellLevel = rec.level;
|
|
2453
2475
|
const texts = [];
|
|
@@ -2472,6 +2494,17 @@ function parseCellBlock(records, startIdx, tableLevel) {
|
|
|
2472
2494
|
const t = extractText(r.data).trim();
|
|
2473
2495
|
if (t) texts.push(t);
|
|
2474
2496
|
}
|
|
2497
|
+
if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
|
|
2498
|
+
const ctrlId = r.data.subarray(0, 4).toString("ascii");
|
|
2499
|
+
if (ctrlId === " lbt" || ctrlId === "tbl ") {
|
|
2500
|
+
if (counter) {
|
|
2501
|
+
counter.count++;
|
|
2502
|
+
texts.push(`[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}]`);
|
|
2503
|
+
} else {
|
|
2504
|
+
texts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
|
|
2505
|
+
}
|
|
2506
|
+
}
|
|
2507
|
+
}
|
|
2475
2508
|
i++;
|
|
2476
2509
|
}
|
|
2477
2510
|
return { cell: { text: texts.join("\n"), colSpan, rowSpan, colAddr, rowAddr }, nextIdx: i };
|
|
@@ -3829,21 +3862,21 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
3829
3862
|
import JSZip5 from "jszip";
|
|
3830
3863
|
import { DOMParser as DOMParser4 } from "@xmldom/xmldom";
|
|
3831
3864
|
var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
|
|
3832
|
-
function getChildElements(parent, localName2) {
|
|
3865
|
+
function getChildElements(parent, localName3) {
|
|
3833
3866
|
const result = [];
|
|
3834
3867
|
const children = parent.childNodes;
|
|
3835
3868
|
for (let i = 0; i < children.length; i++) {
|
|
3836
3869
|
const node = children[i];
|
|
3837
3870
|
if (node.nodeType === 1) {
|
|
3838
3871
|
const el = node;
|
|
3839
|
-
if (el.localName ===
|
|
3872
|
+
if (el.localName === localName3 || el.tagName?.endsWith(`:${localName3}`)) {
|
|
3840
3873
|
result.push(el);
|
|
3841
3874
|
}
|
|
3842
3875
|
}
|
|
3843
3876
|
}
|
|
3844
3877
|
return result;
|
|
3845
3878
|
}
|
|
3846
|
-
function findElements(parent, localName2) {
|
|
3879
|
+
function findElements(parent, localName3) {
|
|
3847
3880
|
const result = [];
|
|
3848
3881
|
const walk = (node) => {
|
|
3849
3882
|
const children = node.childNodes;
|
|
@@ -3851,7 +3884,7 @@ function findElements(parent, localName2) {
|
|
|
3851
3884
|
const child = children[i];
|
|
3852
3885
|
if (child.nodeType === 1) {
|
|
3853
3886
|
const el = child;
|
|
3854
|
-
if (el.localName ===
|
|
3887
|
+
if (el.localName === localName3 || el.tagName?.endsWith(`:${localName3}`)) {
|
|
3855
3888
|
result.push(el);
|
|
3856
3889
|
}
|
|
3857
3890
|
walk(el);
|
|
@@ -3861,11 +3894,11 @@ function findElements(parent, localName2) {
|
|
|
3861
3894
|
walk(parent);
|
|
3862
3895
|
return result;
|
|
3863
3896
|
}
|
|
3864
|
-
function getAttr(el,
|
|
3897
|
+
function getAttr(el, localName3) {
|
|
3865
3898
|
const attrs = el.attributes;
|
|
3866
3899
|
for (let i = 0; i < attrs.length; i++) {
|
|
3867
3900
|
const attr = attrs[i];
|
|
3868
|
-
if (attr.localName ===
|
|
3901
|
+
if (attr.localName === localName3 || attr.name === localName3) return attr.value;
|
|
3869
3902
|
}
|
|
3870
3903
|
return null;
|
|
3871
3904
|
}
|
|
@@ -4212,11 +4245,11 @@ async function parseDocxDocument(buffer, options) {
|
|
|
4212
4245
|
const node = children[i];
|
|
4213
4246
|
if (node.nodeType !== 1) continue;
|
|
4214
4247
|
const el = node;
|
|
4215
|
-
const
|
|
4216
|
-
if (
|
|
4248
|
+
const localName3 = el.localName ?? el.tagName?.split(":").pop();
|
|
4249
|
+
if (localName3 === "p") {
|
|
4217
4250
|
const block = parseParagraph(el, styles, numbering, footnotes, rels);
|
|
4218
4251
|
if (block) blocks.push(block);
|
|
4219
|
-
} else if (
|
|
4252
|
+
} else if (localName3 === "tbl") {
|
|
4220
4253
|
const block = parseTable(el, styles, numbering, footnotes, rels);
|
|
4221
4254
|
if (block) blocks.push(block);
|
|
4222
4255
|
}
|
|
@@ -4254,6 +4287,259 @@ async function parseDocxDocument(buffer, options) {
|
|
|
4254
4287
|
};
|
|
4255
4288
|
}
|
|
4256
4289
|
|
|
4290
|
+
// src/hwpml/parser.ts
|
|
4291
|
+
import { DOMParser as DOMParser5 } from "@xmldom/xmldom";
|
|
4292
|
+
var MAX_XML_DEPTH2 = 200;
|
|
4293
|
+
var MAX_TABLE_ROWS = 5e3;
|
|
4294
|
+
var MAX_TABLE_COLS = 500;
|
|
4295
|
+
var MAX_HWPML_BYTES = 50 * 1024 * 1024;
|
|
4296
|
+
function parseHwpmlDocument(buffer, options) {
|
|
4297
|
+
if (buffer.byteLength > MAX_HWPML_BYTES) {
|
|
4298
|
+
throw new Error(`HWPML \uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC (${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB > 50MB)`);
|
|
4299
|
+
}
|
|
4300
|
+
const text = new TextDecoder("utf-8").decode(buffer).replace(/^\uFEFF/, "");
|
|
4301
|
+
const normalized = text.replace(/ /g, " ");
|
|
4302
|
+
const xml = stripDtd(normalized);
|
|
4303
|
+
const warnings = [];
|
|
4304
|
+
const parser = new DOMParser5({
|
|
4305
|
+
onError: (_level, msg) => {
|
|
4306
|
+
warnings.push({ message: `HWPML XML \uD30C\uC2F1 \uACBD\uACE0: ${msg}`, code: "MALFORMED_XML" });
|
|
4307
|
+
}
|
|
4308
|
+
});
|
|
4309
|
+
const doc = parser.parseFromString(xml, "text/xml");
|
|
4310
|
+
if (!doc.documentElement) {
|
|
4311
|
+
return { markdown: "", blocks: [], warnings };
|
|
4312
|
+
}
|
|
4313
|
+
const root = doc.documentElement;
|
|
4314
|
+
const metadata = {};
|
|
4315
|
+
const docSummary = findChild(root, "DOCSUMMARY");
|
|
4316
|
+
if (docSummary) {
|
|
4317
|
+
const title = findChild(docSummary, "TITLE");
|
|
4318
|
+
const author = findChild(docSummary, "AUTHOR");
|
|
4319
|
+
const date = findChild(docSummary, "DATE");
|
|
4320
|
+
if (title) metadata.title = textContent(title).trim();
|
|
4321
|
+
if (author) metadata.author = textContent(author).trim();
|
|
4322
|
+
if (date) metadata.createdAt = textContent(date).trim() || void 0;
|
|
4323
|
+
}
|
|
4324
|
+
const paraShapeMap = buildParaShapeMap(root);
|
|
4325
|
+
const body = findChild(root, "BODY");
|
|
4326
|
+
if (!body) {
|
|
4327
|
+
return { markdown: "", blocks: [], metadata, warnings };
|
|
4328
|
+
}
|
|
4329
|
+
const blocks = [];
|
|
4330
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, countSections(body)) : null;
|
|
4331
|
+
let sectionIdx = 0;
|
|
4332
|
+
const children = body.childNodes;
|
|
4333
|
+
for (let i = 0; i < children.length; i++) {
|
|
4334
|
+
const el = children[i];
|
|
4335
|
+
if (el.nodeType !== 1) continue;
|
|
4336
|
+
if (localName2(el) !== "SECTION") continue;
|
|
4337
|
+
sectionIdx++;
|
|
4338
|
+
if (pageFilter && !pageFilter.has(sectionIdx)) continue;
|
|
4339
|
+
parseSection2(el, blocks, paraShapeMap, sectionIdx, warnings);
|
|
4340
|
+
}
|
|
4341
|
+
const outline = blocks.filter((b) => b.type === "heading" && b.text).map((b) => ({ level: b.level ?? 1, text: b.text, pageNumber: b.pageNumber }));
|
|
4342
|
+
const markdown = blocksToMarkdown(blocks);
|
|
4343
|
+
return {
|
|
4344
|
+
markdown,
|
|
4345
|
+
blocks,
|
|
4346
|
+
metadata: Object.keys(metadata).length > 0 ? metadata : void 0,
|
|
4347
|
+
outline: outline.length > 0 ? outline : void 0,
|
|
4348
|
+
warnings: warnings.length > 0 ? warnings : void 0
|
|
4349
|
+
};
|
|
4350
|
+
}
|
|
4351
|
+
function buildParaShapeMap(root) {
|
|
4352
|
+
const map = /* @__PURE__ */ new Map();
|
|
4353
|
+
const head = findChild(root, "HEAD");
|
|
4354
|
+
if (!head) return map;
|
|
4355
|
+
const mappingTable = findChild(head, "MAPPINGTABLE");
|
|
4356
|
+
if (!mappingTable) return map;
|
|
4357
|
+
const paraShapeList = findChild(mappingTable, "PARASHAPELIST");
|
|
4358
|
+
if (!paraShapeList) return map;
|
|
4359
|
+
const children = paraShapeList.childNodes;
|
|
4360
|
+
for (let i = 0; i < children.length; i++) {
|
|
4361
|
+
const el = children[i];
|
|
4362
|
+
if (el.nodeType !== 1 || localName2(el) !== "PARASHAPE") continue;
|
|
4363
|
+
const id = el.getAttribute("Id") ?? "";
|
|
4364
|
+
const headingType = el.getAttribute("HeadingType") ?? "None";
|
|
4365
|
+
const level = parseInt(el.getAttribute("Level") ?? "0", 10);
|
|
4366
|
+
let headingLevel = null;
|
|
4367
|
+
if (headingType === "Outline") {
|
|
4368
|
+
const safeLevel = isNaN(level) ? 0 : Math.max(0, level);
|
|
4369
|
+
headingLevel = Math.min(safeLevel + 1, 6);
|
|
4370
|
+
}
|
|
4371
|
+
map.set(id, { headingLevel });
|
|
4372
|
+
}
|
|
4373
|
+
return map;
|
|
4374
|
+
}
|
|
4375
|
+
function parseSection2(section, blocks, paraShapeMap, sectionNum, warnings) {
|
|
4376
|
+
walkContent(section, blocks, paraShapeMap, sectionNum, warnings, false);
|
|
4377
|
+
}
|
|
4378
|
+
function walkContent(node, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth = 0) {
|
|
4379
|
+
if (depth > MAX_XML_DEPTH2) return;
|
|
4380
|
+
const children = node.childNodes;
|
|
4381
|
+
for (let i = 0; i < children.length; i++) {
|
|
4382
|
+
const el = children[i];
|
|
4383
|
+
if (el.nodeType !== 1) continue;
|
|
4384
|
+
const tag = localName2(el);
|
|
4385
|
+
if (tag === "HEADER" || tag === "FOOTER") {
|
|
4386
|
+
continue;
|
|
4387
|
+
}
|
|
4388
|
+
if (tag === "P") {
|
|
4389
|
+
if (!inHeaderFooter) {
|
|
4390
|
+
parseParagraph2(el, blocks, paraShapeMap, sectionNum);
|
|
4391
|
+
}
|
|
4392
|
+
continue;
|
|
4393
|
+
}
|
|
4394
|
+
if (tag === "TABLE") {
|
|
4395
|
+
if (!inHeaderFooter) {
|
|
4396
|
+
parseTable2(el, blocks, paraShapeMap, sectionNum, warnings);
|
|
4397
|
+
}
|
|
4398
|
+
continue;
|
|
4399
|
+
}
|
|
4400
|
+
if (tag === "PARALIST" || tag === "SECTION" || tag === "COLDEF") {
|
|
4401
|
+
walkContent(el, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth + 1);
|
|
4402
|
+
continue;
|
|
4403
|
+
}
|
|
4404
|
+
walkContent(el, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth + 1);
|
|
4405
|
+
}
|
|
4406
|
+
}
|
|
4407
|
+
/**
 * Turns one paragraph element into a "heading" or "paragraph" block.
 *
 * The element's "ParaShape" attribute (empty string when absent) is looked up
 * in `paraShapeMap`; when the matching entry has a non-null `headingLevel`
 * the block is emitted as a heading at that level. Paragraphs whose extracted
 * text is empty produce no block.
 *
 * @param {object} el - Paragraph element.
 * @param {Array<object>} blocks - Output list; at most one block is appended.
 * @param {Map<string, {headingLevel: (number|null)}>} paraShapeMap - Shape id to heading info.
 * @param {number} sectionNum - Stored on the block as `pageNumber`.
 * @returns {void}
 */
function parseParagraph2(el, blocks, paraShapeMap, sectionNum) {
  const shapeKey = el.getAttribute("ParaShape") ?? "";
  const headingLevel = paraShapeMap.get(shapeKey)?.headingLevel;
  const text = extractParagraphText(el);
  if (!text) return;
  const block = headingLevel != null
    ? { type: "heading", text, level: headingLevel, pageNumber: sectionNum }
    : { type: "paragraph", text, pageNumber: sectionNum };
  blocks.push(block);
}
|
|
4418
|
+
/**
 * Returns the text of a paragraph element: every fragment gathered by
 * collectCharText, concatenated without a separator and trimmed.
 *
 * @param {object} p - Paragraph element.
 * @returns {string} Trimmed paragraph text (possibly empty).
 */
function extractParagraphText(p) {
  const fragments = [];
  collectCharText(p, fragments);
  const joined = fragments.join("");
  return joined.trim();
}
|
|
4423
|
+
/**
 * Gathers text fragments from the CHAR elements below `node`.
 *
 * - CHAR: its textContent() is appended to `parts` when non-empty.
 * - TABLE, PICTURE, SHAPEOBJECT, AUTONUM: ignored, subtree included.
 * - Any other element: searched recursively with `depth + 1`.
 *
 * @param {object} node - Node whose childNodes are inspected.
 * @param {string[]} parts - Output list of text fragments (mutated).
 * @param {number} [depth=0] - Current recursion depth; nothing is collected once it exceeds MAX_XML_DEPTH2.
 * @returns {void}
 */
function collectCharText(node, parts, depth = 0) {
  if (depth > MAX_XML_DEPTH2) return;
  const kids = node.childNodes;
  for (let idx = 0; idx < kids.length; idx++) {
    const child = kids[idx];
    if (child.nodeType !== 1) continue;
    switch (localName2(child)) {
      case "CHAR": {
        const chunk = textContent(child);
        if (chunk) parts.push(chunk);
        break;
      }
      case "TABLE":
      case "PICTURE":
      case "SHAPEOBJECT":
      case "AUTONUM":
        // These contribute no paragraph text.
        break;
      default:
        collectCharText(child, parts, depth + 1);
    }
  }
}
|
|
4440
|
+
/**
 * Converts a TABLE element into a "table" block.
 *
 * The grid size comes from the "RowCount" / "ColCount" attributes. Each
 * ROW > CELL child is placed at its "RowAddr" / "ColAddr"; positions covered
 * by a cell's row/column span are filled with empty placeholder cells, and
 * positions never addressed by any cell become empty cells as well. The
 * resulting rows are handed to buildTable().
 *
 * Nothing is emitted when:
 * - either count is missing, non-numeric, zero or negative;
 * - either count exceeds MAX_TABLE_ROWS / MAX_TABLE_COLS (a "TRUNCATED_TABLE"
 *   warning is pushed instead);
 * - the element has no ROW > CELL children.
 *
 * @param {object} el - TABLE element.
 * @param {Array<object>} blocks - Output list; at most one block is appended.
 * @param {Map<string, object>} paraShapeMap - Unused here; kept for a uniform parser signature.
 * @param {number} sectionNum - Stored on the block as `pageNumber`.
 * @param {Array<object>} warnings - Receives the oversize warning.
 * @returns {void}
 */
function parseTable2(el, blocks, paraShapeMap, sectionNum, warnings) {
  const rowCount = Number.parseInt(el.getAttribute("RowCount") ?? "0", 10);
  const colCount = Number.parseInt(el.getAttribute("ColCount") ?? "0", 10);
  // `!(x > 0)` rejects NaN, zero AND negative counts. A negative count would
  // otherwise reach Array(colCount) below and throw a RangeError.
  if (!(rowCount > 0) || !(colCount > 0)) return;
  if (rowCount > MAX_TABLE_ROWS || colCount > MAX_TABLE_COLS) {
    warnings.push({ message: `\uD14C\uC774\uBE14 \uD06C\uAE30 \uCD08\uACFC (${rowCount}x${colCount}) \u2014 \uC2A4\uD0B5`, code: "TRUNCATED_TABLE" });
    return;
  }

  const cells = [];
  const rowNodes = el.childNodes;
  for (let i = 0; i < rowNodes.length; i++) {
    const rowEl = rowNodes[i];
    if (rowEl.nodeType !== 1 || localName2(rowEl) !== "ROW") continue;
    const cellNodes = rowEl.childNodes;
    for (let j = 0; j < cellNodes.length; j++) {
      const cellEl = cellNodes[j];
      if (cellEl.nodeType !== 1 || localName2(cellEl) !== "CELL") continue;
      const colAddr = Number.parseInt(cellEl.getAttribute("ColAddr") ?? "0", 10);
      const rowAddr = Number.parseInt(cellEl.getAttribute("RowAddr") ?? "0", 10);
      // Spans default to 1 and are clamped to [1, MAX_TABLE_*].
      const colSpan = Math.min(Math.max(1, Number.parseInt(cellEl.getAttribute("ColSpan") ?? "1", 10) || 1), MAX_TABLE_COLS);
      const rowSpan = Math.min(Math.max(1, Number.parseInt(cellEl.getAttribute("RowSpan") ?? "1", 10) || 1), MAX_TABLE_ROWS);
      const cellText = extractCellText2(cellEl);
      cells.push({ text: cellText, colSpan, rowSpan, colAddr, rowAddr });
    }
  }
  if (cells.length === 0) return;

  const grid = Array.from({ length: rowCount }, () => Array(colCount).fill(null));
  for (const cell of cells) {
    const r = cell.rowAddr;
    const c = cell.colAddr;
    // Written as a positive range test so NaN and negative addresses are
    // skipped too; a negative row would make grid[r] undefined and throw.
    const insideGrid = r >= 0 && r < rowCount && c >= 0 && c < colCount;
    if (!insideGrid) continue;
    grid[r][c] = cell;
    // Fill the spanned area with placeholders, clamped to the grid bounds.
    const rowEnd = Math.min(r + cell.rowSpan, rowCount);
    const colEnd = Math.min(c + cell.colSpan, colCount);
    for (let rr = r; rr < rowEnd; rr++) {
      for (let cc = c; cc < colEnd; cc++) {
        if (rr === r && cc === c) continue;
        grid[rr][cc] = { text: "", colSpan: 1, rowSpan: 1 };
      }
    }
  }

  const cellRows = grid.map(
    (row) => row.map((cell) => cell ?? { text: "", colSpan: 1, rowSpan: 1 })
  );
  const table = buildTable(cellRows);
  blocks.push({ type: "table", table, pageNumber: sectionNum });
}
|
|
4487
|
+
/**
 * Returns the text of a table cell: the non-empty fragments gathered by
 * collectCellText (starting at depth 0), joined with newlines and trimmed.
 *
 * @param {object} cellEl - CELL element.
 * @returns {string} Cell text, one fragment per line.
 */
function extractCellText2(cellEl) {
  const collected = [];
  collectCellText(cellEl, collected, 0);
  const nonEmpty = collected.filter((piece) => Boolean(piece));
  return nonEmpty.join("\n").trim();
}
|
|
4492
|
+
/**
 * Gathers the text fragments of a table cell.
 *
 * - P: its extractParagraphText() result is appended when non-empty.
 * - TABLE: a fixed placeholder string is appended; the nested table is not descended into.
 * - Any other element: searched recursively with `depth + 1`.
 *
 * @param {object} node - Node whose childNodes are inspected.
 * @param {string[]} parts - Output list of fragments (mutated).
 * @param {number} depth - Current recursion depth; nothing is collected beyond depth 20.
 * @returns {void}
 */
function collectCellText(node, parts, depth) {
  if (depth > 20) return;
  const kids = node.childNodes;
  for (let idx = 0; idx < kids.length; idx++) {
    const child = kids[idx];
    if (child.nodeType !== 1) continue;
    switch (localName2(child)) {
      case "P": {
        const paragraph = extractParagraphText(child);
        if (paragraph) parts.push(paragraph);
        break;
      }
      case "TABLE":
        parts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
        break;
      default:
        collectCellText(child, parts, depth + 1);
    }
  }
}
|
|
4509
|
+
/**
 * Returns an element's name without its namespace prefix.
 *
 * Uses `tagName`, falling back to `localName`, then to the empty string, and
 * strips everything up to and including the first ":" when at least one
 * character precedes it.
 *
 * @param {{tagName?: string, localName?: string}} el - Element-like node.
 * @returns {string} Unprefixed element name.
 */
function localName2(el) {
  const qualifiedName = el.tagName || el.localName || "";
  const prefixPattern = /^[^:]+:/;
  return qualifiedName.replace(prefixPattern, "");
}
|
|
4512
|
+
/**
 * Finds the first direct element child of `parent` whose unprefixed name
 * (per localName2) equals `tag`.
 *
 * @param {object} parent - Node whose childNodes are searched (not recursive).
 * @param {string} tag - Unprefixed element name to match.
 * @returns {object|null} The matching child element, or null when none exists.
 */
function findChild(parent, tag) {
  const kids = parent.childNodes;
  let idx = 0;
  while (idx < kids.length) {
    const candidate = kids[idx];
    idx += 1;
    if (candidate.nodeType !== 1) continue;
    if (localName2(candidate) === tag) return candidate;
  }
  return null;
}
|
|
4520
|
+
/**
 * Returns the concatenated value of every text node below `el`, in document
 * order. Text nodes (nodeType 3) contribute their `nodeValue` (empty string
 * when falsy), element nodes (nodeType 1) are descended into, and every other
 * node type is ignored.
 *
 * The traversal uses an explicit stack instead of recursion so that
 * arbitrarily deep nesting in an input document cannot exhaust the call stack.
 *
 * @param {object} el - Node whose descendants are read.
 * @returns {string} Concatenated descendant text.
 */
function textContent(el) {
  const parts = [];
  // Each frame remembers a child list and the next index to visit in it.
  const stack = [{ nodes: el.childNodes, next: 0 }];
  while (stack.length > 0) {
    const frame = stack[stack.length - 1];
    if (frame.next >= frame.nodes.length) {
      stack.pop();
      continue;
    }
    const node = frame.nodes[frame.next];
    frame.next += 1;
    if (node.nodeType === 3) {
      parts.push(node.nodeValue || "");
    } else if (node.nodeType === 1) {
      // Descend first; the parent frame resumes once this one is exhausted.
      stack.push({ nodes: node.childNodes, next: 0 });
    }
  }
  return parts.join("");
}
|
|
4533
|
+
/**
 * Counts the direct element children of `body` whose unprefixed name
 * (per localName2) is "SECTION".
 *
 * @param {object} body - Node whose childNodes are inspected (not recursive).
 * @returns {number} Number of SECTION children.
 */
function countSections(body) {
  const kids = body.childNodes;
  let total = 0;
  for (let idx = 0; idx < kids.length; idx += 1) {
    const child = kids[idx];
    const isSection = child.nodeType === 1 && localName2(child) === "SECTION";
    if (isSection) total += 1;
  }
  return total;
}
|
|
4542
|
+
|
|
4257
4543
|
// src/index.ts
|
|
4258
4544
|
async function parse(input, options) {
|
|
4259
4545
|
let buffer;
|
|
@@ -4283,6 +4569,8 @@ async function parse(input, options) {
|
|
|
4283
4569
|
}
|
|
4284
4570
|
case "hwp":
|
|
4285
4571
|
return parseHwp(buffer, options);
|
|
4572
|
+
case "hwpml":
|
|
4573
|
+
return parseHwpml(buffer, options);
|
|
4286
4574
|
case "pdf":
|
|
4287
4575
|
return parsePdf(buffer, options);
|
|
4288
4576
|
default:
|
|
@@ -4308,7 +4596,7 @@ async function parseHwp(buffer, options) {
|
|
|
4308
4596
|
async function parsePdf(buffer, options) {
|
|
4309
4597
|
let parsePdfDocument;
|
|
4310
4598
|
try {
|
|
4311
|
-
const mod = await import("./parser-
|
|
4599
|
+
const mod = await import("./parser-4LKJXBPP.js");
|
|
4312
4600
|
parsePdfDocument = mod.parsePdfDocument;
|
|
4313
4601
|
} catch {
|
|
4314
4602
|
return {
|
|
@@ -4342,6 +4630,14 @@ async function parseDocx(buffer, options) {
|
|
|
4342
4630
|
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
4343
4631
|
}
|
|
4344
4632
|
}
|
|
4633
|
+
/**
 * Parses an HWPML document and wraps the outcome in a result object.
 *
 * On success the fields returned by parseHwpmlDocument() are copied into a
 * `{ success: true, fileType: "hwpml", ... }` result. Anything thrown while
 * parsing is converted into a `{ success: false, ... }` result carrying the
 * error message (or a fixed fallback message for non-Error values) and the
 * code produced by classifyError().
 *
 * @param {*} buffer - Document data, forwarded to parseHwpmlDocument.
 * @param {object} [options] - Parse options, forwarded unchanged.
 * @returns {Promise<object>} Success or failure result; this function does not reject for parser errors.
 */
async function parseHwpml(buffer, options) {
  const fileType = "hwpml";
  try {
    const { markdown, blocks, metadata, outline, warnings } = parseHwpmlDocument(buffer, options);
    const result = { success: true, fileType, markdown, blocks, metadata, outline, warnings };
    return result;
  } catch (err) {
    const error = err instanceof Error ? err.message : "HWPML \uD30C\uC2F1 \uC2E4\uD328";
    const code = classifyError(err);
    return { success: false, fileType, error, code };
  }
}
|
|
4345
4641
|
|
|
4346
4642
|
// src/diff/text-diff.ts
|
|
4347
4643
|
function similarity(a, b) {
|
|
@@ -4530,4 +4826,4 @@ export {
|
|
|
4530
4826
|
compare,
|
|
4531
4827
|
parse
|
|
4532
4828
|
};
|
|
4533
|
-
//# sourceMappingURL=chunk-
|
|
4829
|
+
//# sourceMappingURL=chunk-OEJJPCMM.js.map
|