kordoc 2.2.6 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. package/README.md +22 -3
  2. package/dist/{chunk-RF6UJXR3.js → chunk-KSBPABBQ.js} +482 -78
  3. package/dist/chunk-KSBPABBQ.js.map +1 -0
  4. package/dist/{chunk-5Y2Q3BRW.js → chunk-M3E3C5GS.js} +8 -1
  5. package/dist/chunk-M3E3C5GS.js.map +1 -0
  6. package/dist/{chunk-FCQEF2ZM.js → chunk-VJPDY4YT.js} +2 -2
  7. package/dist/{chunk-NL5XLN5R.js.map → chunk-VJPDY4YT.js.map} +1 -1
  8. package/dist/{chunk-HXUCZ2IL.cjs → chunk-VLSATRNQ.cjs} +2 -2
  9. package/dist/{chunk-HXUCZ2IL.cjs.map → chunk-VLSATRNQ.cjs.map} +1 -1
  10. package/dist/{chunk-NL5XLN5R.js → chunk-XG5CQUSC.js} +2 -2
  11. package/dist/{chunk-FCQEF2ZM.js.map → chunk-XG5CQUSC.js.map} +1 -1
  12. package/dist/cli.js +5 -5
  13. package/dist/cli.js.map +1 -1
  14. package/dist/{detect-GYK3HKD5.js → detect-I7YIS4Q6.js} +4 -2
  15. package/dist/index.cjs +608 -197
  16. package/dist/index.cjs.map +1 -1
  17. package/dist/index.d.cts +6 -2
  18. package/dist/index.d.ts +6 -2
  19. package/dist/index.js +500 -89
  20. package/dist/index.js.map +1 -1
  21. package/dist/mcp.js +5 -5
  22. package/dist/{parser-AMP7MAOH.js → parser-4275GJRB.js} +45 -42
  23. package/dist/{parser-AMP7MAOH.js.map → parser-4275GJRB.js.map} +1 -1
  24. package/dist/{parser-KOWPTDJU.cjs → parser-STAOZMUC.cjs} +61 -58
  25. package/dist/{parser-KOWPTDJU.cjs.map → parser-STAOZMUC.cjs.map} +1 -1
  26. package/dist/{parser-43IAQ5KE.js → parser-XRUZEFZT.js} +45 -42
  27. package/dist/{parser-43IAQ5KE.js.map → parser-XRUZEFZT.js.map} +1 -1
  28. package/dist/{watch-IUQXOXW3.js → watch-BFLNFJBE.js} +4 -4
  29. package/package.json +2 -2
  30. package/dist/chunk-5Y2Q3BRW.js.map +0 -1
  31. package/dist/chunk-RF6UJXR3.js.map +0 -1
  32. /package/dist/{detect-GYK3HKD5.js.map → detect-I7YIS4Q6.js.map} +0 -0
  33. /package/dist/{watch-IUQXOXW3.js.map → watch-BFLNFJBE.js.map} +0 -0
@@ -2,7 +2,7 @@
2
2
  import {
3
3
  detectFormat,
4
4
  detectZipFormat
5
- } from "./chunk-5Y2Q3BRW.js";
5
+ } from "./chunk-M3E3C5GS.js";
6
6
  import {
7
7
  HEADING_RATIO_H1,
8
8
  HEADING_RATIO_H2,
@@ -20,7 +20,7 @@ import {
20
20
  sanitizeHref,
21
21
  stripDtd,
22
22
  toArrayBuffer
23
- } from "./chunk-FCQEF2ZM.js";
23
+ } from "./chunk-VJPDY4YT.js";
24
24
  import {
25
25
  parsePageRange
26
26
  } from "./chunk-MOL7MDBG.js";
@@ -29,6 +29,100 @@ import {
29
29
  import JSZip from "jszip";
30
30
  import { inflateRawSync } from "zlib";
31
31
  import { DOMParser } from "@xmldom/xmldom";
32
+
33
+ // src/hwpx/com-fallback.ts
34
+ import { execFileSync } from "child_process";
35
+ import { platform } from "os";
36
/**
 * Reports whether the COM-based text extraction fallback can be attempted.
 *
 * The fallback shells out to PowerShell and a Windows COM object, so the
 * only thing checked here is the operating system reported by Node.
 *
 * @returns {boolean} `true` when the current platform is `"win32"`.
 */
function isComFallbackAvailable() {
  const currentPlatform = platform();
  return currentPlatform === "win32";
}
39
/**
 * Detects whether an HWPX manifest declares encrypted content.
 *
 * This is a plain substring check for the marker `encryption-data`
 * anywhere in the manifest text; the XML is not parsed.
 *
 * @param {string} manifestXml - Text of `META-INF/manifest.xml`.
 * @returns {boolean} `true` when the marker is present.
 */
function isEncryptedHwpx(manifestXml) {
  const encryptionMarker = "encryption-data";
  return manifestXml.includes(encryptionMarker);
}
42
/**
 * Extracts per-page plain text from a document by driving the
 * `HWPFrame.HwpObject` COM object through PowerShell.
 *
 * Security: the file path is handed to PowerShell through the
 * `KORDOC_HWP_PATH` environment variable instead of being spliced into the
 * script text. Doubling ASCII `'` is not a sufficient escape because
 * PowerShell also accepts typographic quotes (U+2018, U+2019, U+201A,
 * U+201B) as single-quote delimiters, so a crafted filename could otherwise
 * terminate the literal and inject commands.
 *
 * @param {string} filePath - Path of the document to open.
 * @returns {{ pages: Array, pageCount: number, warnings: Array<{message: string, code: string}> }}
 *   Page texts as reported by the COM object, the page count, and any warnings.
 * @throws {Error} When not running on Windows, when PowerShell emits no JSON,
 *   or when the COM script reports an error. Errors raised by
 *   `execFileSync` (timeout, non-zero exit, buffer overflow) and by
 *   `JSON.parse` propagate unchanged.
 */
function extractTextViaCom(filePath) {
  if (!isComFallbackAvailable()) {
    throw new Error("COM fallback\uC740 Windows\uC5D0\uC11C\uB9CC \uC0AC\uC6A9 \uAC00\uB2A5\uD569\uB2C8\uB2E4");
  }
  // The script reads the path from $env:KORDOC_HWP_PATH (no string
  // interpolation) and always releases the COM object in `finally`, so a
  // failing Open/GetPageText does not leave the automation object alive.
  // Cleanup output is piped to Out-Null so nothing trails the JSON payload.
  const ps1 = `
[Console]::OutputEncoding = [System.Text.Encoding]::UTF8
$ErrorActionPreference = 'Stop'
$hwp = $null
try {
  $hwp = New-Object -ComObject HWPFrame.HwpObject
  $hwp.RegisterModule('FilePathCheckerModule', 'FilePathCheckerModuleExample') | Out-Null
  $hwp.Open($env:KORDOC_HWP_PATH, '', '') | Out-Null
  $pc = $hwp.PageCount
  $result = @{ pageCount = $pc; pages = @() }
  for ($p = 1; $p -le $pc; $p++) {
    $t = $hwp.GetPageText($p, 0)
    $result.pages += @($t)
  }
  $result | ConvertTo-Json -Depth 3 -Compress
} catch {
  @{ error = $_.Exception.Message } | ConvertTo-Json -Compress
} finally {
  if ($null -ne $hwp) {
    try { $hwp.Clear(1) | Out-Null } catch { }
    try { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($hwp) | Out-Null } catch { }
  }
}
`;
  const stdout = execFileSync("powershell", [
    "-NoProfile",
    "-NonInteractive",
    "-ExecutionPolicy",
    "Bypass",
    "-Command",
    ps1
  ], {
    encoding: "utf-8",
    env: { ...process.env, KORDOC_HWP_PATH: filePath },
    timeout: 12e4,
    // 2-minute timeout
    windowsHide: true,
    maxBuffer: 50 * 1024 * 1024
    // 50 MB
  });
  const trimmed = stdout.trim();
  const jsonStart = trimmed.indexOf("{");
  if (jsonStart < 0) throw new Error(`COM \uCD9C\uB825\uC5D0 JSON\uC774 \uC5C6\uC2B5\uB2C8\uB2E4: ${trimmed.slice(0, 200)}`);
  // Tolerate stray output after the payload as well as before it.
  const jsonEnd = trimmed.lastIndexOf("}");
  const json = JSON.parse(trimmed.slice(jsonStart, jsonEnd + 1));
  if (json.error) {
    throw new Error(`COM \uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uC2E4\uD328: ${json.error}`);
  }
  const warnings = [];
  const pages = Array.isArray(json.pages) ? json.pages : [];
  const pageCount = json.pageCount ?? pages.length;
  if (pages.length === 0) {
    warnings.push({ message: "COM\uC73C\uB85C \uD14D\uC2A4\uD2B8\uB97C \uCD94\uCD9C\uD558\uC9C0 \uBABB\uD588\uC2B5\uB2C8\uB2E4", code: "COM_EMPTY" });
  }
  return { pages, pageCount, warnings };
}
97
/**
 * Converts per-page text from the COM fallback into a parse result.
 *
 * Each page is split into lines; every non-blank line becomes a
 * `paragraph` block tagged with its 1-based page number, and the same lines
 * joined by blank lines form the markdown. A `DRM_COM_FALLBACK` warning is
 * always appended so consumers know formatting/table data is limited.
 *
 * The caller's `warnings` array is copied, never mutated.
 *
 * @param {Array} pages - Page texts; non-string entries (e.g. `null`) are skipped.
 * @param {number} pageCount - Page count stored in `metadata.pageCount`.
 * @param {Array<{message: string, code: string}>} [warnings] - Warnings collected so far.
 * @returns {{ markdown: string, blocks: Array, metadata: { pageCount: number }, warnings: Array }}
 */
function comResultToParseResult(pages, pageCount, warnings) {
  const pageTexts = Array.isArray(pages) ? pages : [];
  const blocks = [];
  const lines = [];
  pageTexts.forEach((page, pageIndex) => {
    if (typeof page !== "string") return;
    // Accept \r\n, \n and bare \r as line breaks.
    for (const rawLine of page.split(/\r\n|\r|\n/)) {
      const line = rawLine.trim();
      if (!line) continue;
      blocks.push({ type: "paragraph", text: line, pageNumber: pageIndex + 1 });
      lines.push(line);
    }
  });
  const allWarnings = [
    ...(Array.isArray(warnings) ? warnings : []),
    {
      message: "DRM \uBB38\uC11C: \uD55C\uCEF4 COM API\uB85C \uD14D\uC2A4\uD2B8 \uCD94\uCD9C (\uC11C\uC2DD/\uD45C \uC815\uBCF4 \uC81C\uD55C\uC801)",
      code: "DRM_COM_FALLBACK"
    }
  ];
  return {
    markdown: lines.join("\n\n"),
    blocks,
    metadata: { pageCount },
    warnings: allWarnings
  };
}
124
+
125
+ // src/hwpx/parser.ts
32
126
  var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
33
127
  var MAX_ZIP_ENTRIES = 500;
34
128
  function clampSpan(val, max) {
@@ -133,6 +227,19 @@ async function parseHwpxDocument(buffer, options) {
133
227
  if (actualEntryCount > MAX_ZIP_ENTRIES) {
134
228
  throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
135
229
  }
230
+ const manifestFile = zip.file("META-INF/manifest.xml");
231
+ if (manifestFile) {
232
+ const manifestXml = await manifestFile.async("text");
233
+ if (isEncryptedHwpx(manifestXml)) {
234
+ if (isComFallbackAvailable() && options?.filePath) {
235
+ const { pages, pageCount, warnings: warnings2 } = extractTextViaCom(options.filePath);
236
+ if (pages.some((p) => p && p.trim().length > 0)) {
237
+ return comResultToParseResult(pages, pageCount, warnings2);
238
+ }
239
+ }
240
+ throw new KordocError("DRM \uC554\uD638\uD654\uB41C HWPX \uD30C\uC77C\uC785\uB2C8\uB2E4. Windows + \uD55C\uCEF4 \uC624\uD53C\uC2A4 \uC124\uCE58 \uC2DC \uC790\uB3D9 \uCD94\uCD9C\uB429\uB2C8\uB2E4.");
241
+ }
242
+ }
136
243
  const decompressed = { total: 0 };
137
244
  const metadata = {};
138
245
  await extractHwpxMetadata(zip, metadata, decompressed);
@@ -144,6 +251,7 @@ async function parseHwpxDocument(buffer, options) {
144
251
  const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
145
252
  const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
146
253
  const blocks = [];
254
+ const nestedTableCounter = { count: 0 };
147
255
  let parsedSections = 0;
148
256
  for (let si = 0; si < sectionPaths.length; si++) {
149
257
  if (pageFilter && !pageFilter.has(si + 1)) continue;
@@ -153,7 +261,7 @@ async function parseHwpxDocument(buffer, options) {
153
261
  const xml = await file.async("text");
154
262
  decompressed.total += xml.length * 2;
155
263
  if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
156
- blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
264
+ blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1, nestedTableCounter));
157
265
  parsedSections++;
158
266
  options?.onProgress?.(parsedSections, totalTarget);
159
267
  } catch (secErr) {
@@ -214,8 +322,20 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
214
322
  ref
215
323
  // 절대 경로일 수도 있음
216
324
  ];
325
+ let resolvedPath = null;
326
+ if (!ref.includes(".")) {
327
+ const prefixes = [`BinData/${ref}`, `Contents/BinData/${ref}`];
328
+ for (const prefix of prefixes) {
329
+ const match = zip.file(new RegExp(`^${prefix.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\.[a-zA-Z0-9]+$`));
330
+ if (match.length > 0) {
331
+ resolvedPath = match[0].name;
332
+ break;
333
+ }
334
+ }
335
+ }
217
336
  let found = false;
218
- for (const path of candidates) {
337
+ const allCandidates = resolvedPath ? [resolvedPath, ...candidates] : candidates;
338
+ for (const path of allCandidates) {
219
339
  if (isPathTraversal(path)) continue;
220
340
  const file = zip.file(path);
221
341
  if (!file) continue;
@@ -223,7 +343,8 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
223
343
  const data = await file.async("uint8array");
224
344
  decompressed.total += data.length;
225
345
  if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
226
- const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
346
+ const actualPath = path;
347
+ const ext = actualPath.includes(".") ? actualPath.split(".").pop() || "png" : "png";
227
348
  const mimeType = imageExtToMime(ext);
228
349
  imageIndex++;
229
350
  const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
@@ -309,6 +430,7 @@ function extractFromBrokenZip(buffer) {
309
430
  let totalDecompressed = 0;
310
431
  let entryCount = 0;
311
432
  let sectionNum = 0;
433
+ const nestedTableCounter = { count: 0 };
312
434
  while (pos < data.length - 30) {
313
435
  if (data[pos] !== 80 || data[pos + 1] !== 75 || data[pos + 2] !== 3 || data[pos + 3] !== 4) {
314
436
  pos++;
@@ -355,7 +477,7 @@ function extractFromBrokenZip(buffer) {
355
477
  totalDecompressed += content.length * 2;
356
478
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
357
479
  sectionNum++;
358
- blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum));
480
+ blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum, nestedTableCounter));
359
481
  } catch {
360
482
  continue;
361
483
  }
@@ -440,12 +562,40 @@ function detectHwpxHeadings(blocks, styleMap) {
440
562
  }
441
563
  }
442
564
  }
443
- function parseSectionXml(xml, styleMap, warnings, sectionNum) {
565
/**
 * Builds the placeholder text inserted where a nested table was found.
 *
 * Increments `counter.count` and returns `[중첩 테이블 #N]`, optionally
 * followed by a preview of the nested table's first row: non-empty cell
 * texts (trimmed, newlines flattened to spaces) joined with ` | ` and cut
 * to 60 code points plus an ellipsis.
 *
 * @param {{ count: number }} counter - Running nested-table counter (mutated).
 * @param {Array<Array<{ text: string }>>} rows - Rows of the nested table.
 * @returns {string} The marker text.
 */
function makeNestedTableMarker(counter, rows) {
  counter.count++;
  const label = `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}`;
  const headerCells = rows[0] ?? [];
  const pieces = [];
  for (const cell of headerCells) {
    const flattened = cell.text.trim().replace(/\n/g, " ");
    if (flattened) pieces.push(flattened);
  }
  const preview = pieces.join(" | ");
  if (!preview) {
    return `${label}]`;
  }
  // Spread to count and cut by code point rather than UTF-16 unit.
  const codePoints = [...preview];
  let shown = preview;
  if (codePoints.length > 60) {
    shown = codePoints.slice(0, 60).join("") + "\u2026";
  }
  return `${label}: ${shown}]`;
}
573
/**
 * Finishes a table that was parsed while another table was still open.
 *
 * Pops the enclosing table context off `tableStack`. A nested table with at
 * least 3 rows and at least 2 columns is pushed to `blocks` as its own
 * `table` block and only a marker is appended to the parent's current cell;
 * smaller nested tables are flattened with `convertTableToText` and appended
 * to the parent's current cell after the marker. Nothing is appended when
 * the parent has no current cell.
 *
 * @param {{ rows: Array<Array<object>> }} newTable - The nested table just parsed.
 * @param {Array<object>} tableStack - Stack of enclosing table contexts (mutated).
 * @param {Array<object>} blocks - Output block list (mutated).
 * @param {{ sectionNum: number, counter?: { count: number } }} ctx - Parse context.
 * @returns {object} The popped parent table context.
 */
function handleNestedTable(newTable, tableStack, blocks, ctx) {
  const parentTable = tableStack.pop();
  const nestedRows = newTable.rows;
  const widestRow = nestedRows.reduce(
    (widest, row) => (row.length > widest ? row.length : widest),
    0
  );

  // Numbered marker when a counter is available, otherwise the plain label.
  const buildMarker = () => {
    if (ctx.counter) return makeNestedTableMarker(ctx.counter, nestedRows);
    return "[\uC911\uCCA9 \uD14C\uC774\uBE14]";
  };
  // Appends to the parent's current cell, separating from existing text.
  const appendToParentCell = (addition) => {
    const cell = parentTable.cell;
    const separator = cell.text ? "\n" : "";
    cell.text += separator + addition;
  };

  const isStandalone = nestedRows.length >= 3 && widestRow >= 2;
  if (isStandalone) {
    blocks.push({ type: "table", table: buildTable(nestedRows), pageNumber: ctx.sectionNum });
    if (parentTable.cell) {
      appendToParentCell(buildMarker());
    }
    return parentTable;
  }

  const flattened = convertTableToText(nestedRows);
  if (parentTable.cell) {
    const marker = buildMarker();
    appendToParentCell(marker + "\n" + flattened);
  }
  return parentTable;
}
592
+ function parseSectionXml(xml, styleMap, warnings, sectionNum, counter) {
444
593
  const parser = createXmlParser(warnings);
445
594
  const doc = parser.parseFromString(stripDtd(xml), "text/xml");
446
595
  if (!doc.documentElement) return [];
447
596
  const blocks = [];
448
- walkSection(doc.documentElement, blocks, null, [], styleMap, warnings, sectionNum);
597
+ const ctx = { styleMap, warnings, sectionNum, counter };
598
+ walkSection(doc.documentElement, blocks, null, [], ctx);
449
599
  return blocks;
450
600
  }
451
601
  function extractImageRef(el) {
@@ -466,7 +616,7 @@ function extractImageRef(el) {
466
616
  if (directRef) return directRef;
467
617
  return null;
468
618
  }
469
- function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth = 0) {
619
+ function walkSection(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
470
620
  if (depth > MAX_XML_DEPTH) return;
471
621
  const children = node.childNodes;
472
622
  if (!children) return;
@@ -479,23 +629,12 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
479
629
  case "tbl": {
480
630
  if (tableCtx) tableStack.push(tableCtx);
481
631
  const newTable = { rows: [], currentRow: [], cell: null };
482
- walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum, depth + 1);
632
+ walkSection(el, blocks, newTable, tableStack, ctx, depth + 1);
483
633
  if (newTable.rows.length > 0) {
484
634
  if (tableStack.length > 0) {
485
- const parentTable = tableStack.pop();
486
- let nestedCols = 0;
487
- for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
488
- if (newTable.rows.length >= 3 && nestedCols >= 2) {
489
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
490
- } else {
491
- const nestedText = convertTableToText(newTable.rows);
492
- if (parentTable.cell) {
493
- parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
494
- }
495
- }
496
- tableCtx = parentTable;
635
+ tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
497
636
  } else {
498
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
637
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
499
638
  tableCtx = null;
500
639
  }
501
640
  } else {
@@ -506,7 +645,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
506
645
  case "tr":
507
646
  if (tableCtx) {
508
647
  tableCtx.currentRow = [];
509
- walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
648
+ walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
510
649
  if (tableCtx.currentRow.length > 0) tableCtx.rows.push(tableCtx.currentRow);
511
650
  tableCtx.currentRow = [];
512
651
  }
@@ -514,7 +653,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
514
653
  case "tc":
515
654
  if (tableCtx) {
516
655
  tableCtx.cell = { text: "", colSpan: 1, rowSpan: 1 };
517
- walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
656
+ walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
518
657
  if (tableCtx.cell) {
519
658
  tableCtx.currentRow.push(tableCtx.cell);
520
659
  tableCtx.cell = null;
@@ -540,19 +679,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
540
679
  }
541
680
  break;
542
681
  case "p": {
543
- const { text, href, footnote, style } = extractParagraphInfo(el, styleMap);
682
+ const { text, href, footnote, style } = extractParagraphInfo(el, ctx.styleMap);
544
683
  if (text) {
545
684
  if (tableCtx?.cell) {
546
685
  tableCtx.cell.text += (tableCtx.cell.text ? "\n" : "") + text;
547
686
  } else if (!tableCtx) {
548
- const block = { type: "paragraph", text, pageNumber: sectionNum };
687
+ const block = { type: "paragraph", text, pageNumber: ctx.sectionNum };
549
688
  if (style) block.style = style;
550
689
  if (href) block.href = href;
551
690
  if (footnote) block.footnoteText = footnote;
552
691
  blocks.push(block);
553
692
  }
554
693
  }
555
- tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
694
+ tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
556
695
  break;
557
696
  }
558
697
  // 이미지/그림 — 경로 추출 또는 경고
@@ -561,19 +700,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
561
700
  case "drawingObject": {
562
701
  const imgRef = extractImageRef(el);
563
702
  if (imgRef) {
564
- blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
565
- } else if (warnings && sectionNum) {
566
- warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
703
+ blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
704
+ } else if (ctx.warnings && ctx.sectionNum) {
705
+ ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
567
706
  }
568
707
  break;
569
708
  }
570
709
  default:
571
- walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
710
+ walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
572
711
  break;
573
712
  }
574
713
  }
575
714
  }
576
- function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth = 0) {
715
+ function walkParagraphChildren(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
577
716
  if (depth > MAX_XML_DEPTH) return tableCtx;
578
717
  const children = node.childNodes;
579
718
  if (!children) return tableCtx;
@@ -589,23 +728,12 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
589
728
  if (localTag === "tbl") {
590
729
  if (tableCtx) tableStack.push(tableCtx);
591
730
  const newTable = { rows: [], currentRow: [], cell: null };
592
- walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum, d + 1);
731
+ walkSection(el, blocks, newTable, tableStack, ctx, d + 1);
593
732
  if (newTable.rows.length > 0) {
594
733
  if (tableStack.length > 0) {
595
- const parentTable = tableStack.pop();
596
- let nestedCols = 0;
597
- for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
598
- if (newTable.rows.length >= 3 && nestedCols >= 2) {
599
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
600
- } else {
601
- const nestedText = convertTableToText(newTable.rows);
602
- if (parentTable.cell) {
603
- parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
604
- }
605
- }
606
- tableCtx = parentTable;
734
+ tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
607
735
  } else {
608
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
736
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
609
737
  tableCtx = null;
610
738
  }
611
739
  } else {
@@ -614,21 +742,21 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
614
742
  } else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
615
743
  const drawTextChild = findDescendant(el, "drawText");
616
744
  if (drawTextChild) {
617
- extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
745
+ extractDrawTextBlocks(drawTextChild, blocks, ctx.styleMap, ctx.sectionNum);
618
746
  } else {
619
747
  const imgRef = extractImageRef(el);
620
748
  if (imgRef) {
621
- blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
622
- } else if (warnings && sectionNum) {
623
- warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
749
+ blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
750
+ } else if (ctx.warnings && ctx.sectionNum) {
751
+ ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
624
752
  }
625
753
  }
626
754
  } else if (localTag === "drawText") {
627
- extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
755
+ extractDrawTextBlocks(el, blocks, ctx.styleMap, ctx.sectionNum);
628
756
  } else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
629
757
  walkChildren(el, d + 1);
630
758
  } else if (localTag === "run") {
631
- tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
759
+ tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
632
760
  }
633
761
  }
634
762
  };
@@ -1901,6 +2029,7 @@ function parseHwp5Document(buffer, options) {
1901
2029
  const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
1902
2030
  const totalTarget = pageFilter ? pageFilter.size : sections.length;
1903
2031
  const blocks = [];
2032
+ const nestedTableCounter = { count: 0 };
1904
2033
  let totalDecompressed = 0;
1905
2034
  let parsedSections = 0;
1906
2035
  for (let si = 0; si < sections.length; si++) {
@@ -1911,7 +2040,7 @@ function parseHwp5Document(buffer, options) {
1911
2040
  totalDecompressed += data.length;
1912
2041
  if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
1913
2042
  const records = readRecords(data);
1914
- const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
2043
+ const sectionBlocks = parseSection(records, docInfo, warnings, si + 1, nestedTableCounter);
1915
2044
  blocks.push(...sectionBlocks);
1916
2045
  parsedSections++;
1917
2046
  options?.onProgress?.(parsedSections, totalTarget);
@@ -2245,13 +2374,13 @@ function extractHwp5ImagesLenient(lcfb, blocks, compressed, warnings) {
2245
2374
  }
2246
2375
  return images;
2247
2376
  }
2248
- function parseSection(records, docInfo, warnings, sectionNum) {
2377
+ function parseSection(records, docInfo, warnings, sectionNum, counter) {
2249
2378
  const blocks = [];
2250
2379
  let i = 0;
2251
2380
  while (i < records.length) {
2252
2381
  const rec = records[i];
2253
2382
  if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
2254
- const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
2383
+ const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i, counter);
2255
2384
  if (paragraph) {
2256
2385
  const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
2257
2386
  if (docInfo && charShapeIds.length > 0) {
@@ -2274,7 +2403,7 @@ function parseSection(records, docInfo, warnings, sectionNum) {
2274
2403
  if (rec.tagId === TAG_CTRL_HEADER && rec.level <= 1 && rec.data.length >= 4) {
2275
2404
  const ctrlId = rec.data.subarray(0, 4).toString("ascii");
2276
2405
  if (ctrlId === " lbt" || ctrlId === "tbl ") {
2277
- const { table, nextIdx } = parseTableBlock(records, i);
2406
+ const { table, nextIdx } = parseTableBlock(records, i, counter);
2278
2407
  if (table) blocks.push({ type: "table", table, pageNumber: sectionNum });
2279
2408
  i = nextIdx;
2280
2409
  continue;
@@ -2379,7 +2508,7 @@ function resolveCharStyle(charShapeIds, docInfo) {
2379
2508
  if (cs.attrFlags & 2) style.bold = true;
2380
2509
  return style.fontSize || style.bold || style.italic ? style : void 0;
2381
2510
  }
2382
- function parseParagraphWithTables(records, startIdx) {
2511
+ function parseParagraphWithTables(records, startIdx, counter) {
2383
2512
  const startLevel = records[startIdx].level;
2384
2513
  let text = "";
2385
2514
  const tables = [];
@@ -2401,7 +2530,7 @@ function parseParagraphWithTables(records, startIdx) {
2401
2530
  if (rec.tagId === TAG_CTRL_HEADER && rec.data.length >= 4) {
2402
2531
  const ctrlId = rec.data.subarray(0, 4).toString("ascii");
2403
2532
  if (ctrlId === " lbt" || ctrlId === "tbl ") {
2404
- const { table, nextIdx } = parseTableBlock(records, i);
2533
+ const { table, nextIdx } = parseTableBlock(records, i, counter);
2405
2534
  if (table) tables.push(table);
2406
2535
  i = nextIdx;
2407
2536
  continue;
@@ -2412,7 +2541,7 @@ function parseParagraphWithTables(records, startIdx) {
2412
2541
  const trimmed = text.trim();
2413
2542
  return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
2414
2543
  }
2415
- function parseTableBlock(records, startIdx) {
2544
+ function parseTableBlock(records, startIdx, counter) {
2416
2545
  const tableLevel = records[startIdx].level;
2417
2546
  let i = startIdx + 1;
2418
2547
  let rows = 0, cols = 0;
@@ -2426,7 +2555,7 @@ function parseTableBlock(records, startIdx) {
2426
2555
  cols = Math.min(rec.data.readUInt16LE(6), MAX_COLS);
2427
2556
  }
2428
2557
  if (rec.tagId === TAG_LIST_HEADER) {
2429
- const { cell, nextIdx } = parseCellBlock(records, i, tableLevel);
2558
+ const { cell, nextIdx } = parseCellBlock(records, i, tableLevel, counter);
2430
2559
  if (cell) cells.push(cell);
2431
2560
  i = nextIdx;
2432
2561
  continue;
@@ -2447,7 +2576,7 @@ function parseTableBlock(records, startIdx) {
2447
2576
  const cellRows = arrangeCells(rows, cols, cells);
2448
2577
  return { table: buildTable(cellRows), nextIdx: i };
2449
2578
  }
2450
- function parseCellBlock(records, startIdx, tableLevel) {
2579
+ function parseCellBlock(records, startIdx, tableLevel, counter) {
2451
2580
  const rec = records[startIdx];
2452
2581
  const cellLevel = rec.level;
2453
2582
  const texts = [];
@@ -2472,6 +2601,17 @@ function parseCellBlock(records, startIdx, tableLevel) {
2472
2601
  const t = extractText(r.data).trim();
2473
2602
  if (t) texts.push(t);
2474
2603
  }
2604
+ if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
2605
+ const ctrlId = r.data.subarray(0, 4).toString("ascii");
2606
+ if (ctrlId === " lbt" || ctrlId === "tbl ") {
2607
+ if (counter) {
2608
+ counter.count++;
2609
+ texts.push(`[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}]`);
2610
+ } else {
2611
+ texts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
2612
+ }
2613
+ }
2614
+ }
2475
2615
  i++;
2476
2616
  }
2477
2617
  return { cell: { text: texts.join("\n"), colSpan, rowSpan, colAddr, rowAddr }, nextIdx: i };
@@ -3829,21 +3969,21 @@ async function parseXlsxDocument(buffer, options) {
3829
3969
  import JSZip5 from "jszip";
3830
3970
  import { DOMParser as DOMParser4 } from "@xmldom/xmldom";
3831
3971
  var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
3832
- function getChildElements(parent, localName2) {
3972
+ function getChildElements(parent, localName3) {
3833
3973
  const result = [];
3834
3974
  const children = parent.childNodes;
3835
3975
  for (let i = 0; i < children.length; i++) {
3836
3976
  const node = children[i];
3837
3977
  if (node.nodeType === 1) {
3838
3978
  const el = node;
3839
- if (el.localName === localName2 || el.tagName?.endsWith(`:${localName2}`)) {
3979
+ if (el.localName === localName3 || el.tagName?.endsWith(`:${localName3}`)) {
3840
3980
  result.push(el);
3841
3981
  }
3842
3982
  }
3843
3983
  }
3844
3984
  return result;
3845
3985
  }
3846
- function findElements(parent, localName2) {
3986
+ function findElements(parent, localName3) {
3847
3987
  const result = [];
3848
3988
  const walk = (node) => {
3849
3989
  const children = node.childNodes;
@@ -3851,7 +3991,7 @@ function findElements(parent, localName2) {
3851
3991
  const child = children[i];
3852
3992
  if (child.nodeType === 1) {
3853
3993
  const el = child;
3854
- if (el.localName === localName2 || el.tagName?.endsWith(`:${localName2}`)) {
3994
+ if (el.localName === localName3 || el.tagName?.endsWith(`:${localName3}`)) {
3855
3995
  result.push(el);
3856
3996
  }
3857
3997
  walk(el);
@@ -3861,11 +4001,11 @@ function findElements(parent, localName2) {
3861
4001
  walk(parent);
3862
4002
  return result;
3863
4003
  }
3864
- function getAttr(el, localName2) {
4004
+ function getAttr(el, localName3) {
3865
4005
  const attrs = el.attributes;
3866
4006
  for (let i = 0; i < attrs.length; i++) {
3867
4007
  const attr = attrs[i];
3868
- if (attr.localName === localName2 || attr.name === localName2) return attr.value;
4008
+ if (attr.localName === localName3 || attr.name === localName3) return attr.value;
3869
4009
  }
3870
4010
  return null;
3871
4011
  }
@@ -4212,11 +4352,11 @@ async function parseDocxDocument(buffer, options) {
4212
4352
  const node = children[i];
4213
4353
  if (node.nodeType !== 1) continue;
4214
4354
  const el = node;
4215
- const localName2 = el.localName ?? el.tagName?.split(":").pop();
4216
- if (localName2 === "p") {
4355
+ const localName3 = el.localName ?? el.tagName?.split(":").pop();
4356
+ if (localName3 === "p") {
4217
4357
  const block = parseParagraph(el, styles, numbering, footnotes, rels);
4218
4358
  if (block) blocks.push(block);
4219
- } else if (localName2 === "tbl") {
4359
+ } else if (localName3 === "tbl") {
4220
4360
  const block = parseTable(el, styles, numbering, footnotes, rels);
4221
4361
  if (block) blocks.push(block);
4222
4362
  }
@@ -4254,9 +4394,263 @@ async function parseDocxDocument(buffer, options) {
4254
4394
  };
4255
4395
  }
4256
4396
 
4397
+ // src/hwpml/parser.ts
4398
+ import { DOMParser as DOMParser5 } from "@xmldom/xmldom";
4399
// Limits applied while parsing HWPML input.

// Deepest nesting level the recursive element walkers (walkContent, collectCharText) follow.
var MAX_XML_DEPTH2 = 200;

// Largest declared table dimensions parseTable2 accepts; also the ceilings for RowSpan / ColSpan.
var MAX_TABLE_ROWS = 5000;
var MAX_TABLE_COLS = 500;

// Largest input buffer, in bytes, that parseHwpmlDocument accepts (50 MiB).
var MAX_HWPML_BYTES = 50 * 1024 * 1024;
4403
/**
 * Parse an HWPML (XML) document into content blocks and markdown.
 *
 * The buffer is decoded as UTF-8, a leading BOM is removed, `&nbsp;` entities are
 * rewritten to `&#160;`, and the text is passed through `stripDtd` before being
 * handed to the XML parser. Parser diagnostics are collected as warnings with
 * code "MALFORMED_XML" rather than thrown by this function.
 *
 * Each direct SECTION child of BODY is numbered from 1 and that number is used as
 * the `pageNumber` of the blocks it yields; `options.pages` selects sections by
 * that number.
 *
 * @param {ArrayBuffer | ArrayBufferView} buffer Raw HWPML file contents.
 * @param {{ pages?: unknown }} [options] Optional settings; only `pages` is read here.
 * @returns {{ markdown: string, blocks: object[], metadata?: object, outline?: object[], warnings?: object[] }}
 *   Parsed result. On the full path, empty `metadata`, `outline` and `warnings`
 *   are reported as undefined.
 * @throws {Error} When the buffer is larger than MAX_HWPML_BYTES.
 */
function parseHwpmlDocument(buffer, options) {
  if (buffer.byteLength > MAX_HWPML_BYTES) {
    const sizeMb = (buffer.byteLength / 1024 / 1024).toFixed(1);
    throw new Error(`HWPML \uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC (${sizeMb}MB > 50MB)`);
  }

  const decoded = new TextDecoder("utf-8").decode(buffer);
  const withoutBom = decoded.replace(/^\uFEFF/, "");
  // &nbsp; is not a predefined XML entity, so it is replaced by its numeric form.
  const source = stripDtd(withoutBom.replace(/&nbsp;/g, "&#160;"));

  const warnings = [];
  const domParser = new DOMParser5({
    onError: (_level, msg) => {
      warnings.push({ message: `HWPML XML \uD30C\uC2F1 \uACBD\uACE0: ${msg}`, code: "MALFORMED_XML" });
    }
  });
  const root = domParser.parseFromString(source, "text/xml").documentElement;
  if (!root) {
    return { markdown: "", blocks: [], warnings };
  }

  // Document summary -> metadata (title / author / creation date).
  const metadata = {};
  const summary = findChild(root, "DOCSUMMARY");
  if (summary) {
    const titleEl = findChild(summary, "TITLE");
    const authorEl = findChild(summary, "AUTHOR");
    const dateEl = findChild(summary, "DATE");
    if (titleEl) metadata.title = textContent(titleEl).trim();
    if (authorEl) metadata.author = textContent(authorEl).trim();
    if (dateEl) metadata.createdAt = textContent(dateEl).trim() || void 0;
  }

  const paraShapeMap = buildParaShapeMap(root);
  const body = findChild(root, "BODY");
  if (!body) {
    return { markdown: "", blocks: [], metadata, warnings };
  }

  const blocks = [];
  const pageFilter = options?.pages ? parsePageRange(options.pages, countSections(body)) : null;
  const bodyNodes = body.childNodes;
  let sectionNo = 0;
  for (let idx = 0; idx < bodyNodes.length; idx++) {
    const node = bodyNodes[idx];
    const isSection = node.nodeType === 1 && localName2(node) === "SECTION";
    if (!isSection) continue;
    // Sections are counted even when filtered out so numbering stays stable.
    sectionNo += 1;
    if (!pageFilter || pageFilter.has(sectionNo)) {
      parseSection2(node, blocks, paraShapeMap, sectionNo, warnings);
    }
  }

  // Outline = every heading block that has non-empty text.
  const outline = [];
  for (const block of blocks) {
    if (block.type === "heading" && block.text) {
      outline.push({ level: block.level ?? 1, text: block.text, pageNumber: block.pageNumber });
    }
  }

  const markdown = blocksToMarkdown(blocks);
  return {
    markdown,
    blocks,
    metadata: Object.keys(metadata).length > 0 ? metadata : void 0,
    outline: outline.length > 0 ? outline : void 0,
    warnings: warnings.length > 0 ? warnings : void 0
  };
}
4458
/**
 * Build a lookup from paragraph-shape id to heading information.
 *
 * Reads HEAD > MAPPINGTABLE > PARASHAPELIST > PARASHAPE under `root`. A shape
 * whose `HeadingType` attribute is "Outline" gets heading level `Level + 1`,
 * clamped to the range 1..6 (an unparsable or negative `Level` counts as 0).
 * Every other shape is stored with `headingLevel: null`.
 *
 * @param {object} root Document element of the parsed HWPML file.
 * @returns {Map<string, { headingLevel: number | null }>} Keyed by the `Id`
 *   attribute (empty string when absent). Empty when any ancestor is missing.
 */
function buildParaShapeMap(root) {
  const shapes = new Map();
  const head = findChild(root, "HEAD");
  const mappingTable = head && findChild(head, "MAPPINGTABLE");
  const shapeList = mappingTable && findChild(mappingTable, "PARASHAPELIST");
  if (!shapeList) return shapes;

  const entries = shapeList.childNodes;
  for (let idx = 0; idx < entries.length; idx++) {
    const entry = entries[idx];
    if (entry.nodeType === 1 && localName2(entry) === "PARASHAPE") {
      const id = entry.getAttribute("Id") ?? "";
      const isOutline = (entry.getAttribute("HeadingType") ?? "None") === "Outline";
      const rawLevel = parseInt(entry.getAttribute("Level") ?? "0", 10);
      const baseLevel = isNaN(rawLevel) ? 0 : Math.max(0, rawLevel);
      const headingLevel = isOutline ? Math.min(baseLevel + 1, 6) : null;
      shapes.set(id, { headingLevel });
    }
  }
  return shapes;
}
4482
/**
 * Collect the blocks of one SECTION element by walking its content.
 *
 * @param {object} section SECTION element to walk.
 * @param {object[]} blocks Output array that receives the parsed blocks.
 * @param {Map<string, object>} paraShapeMap Paragraph-shape lookup passed through to the walker.
 * @param {number} sectionNum Section number recorded as each block's pageNumber.
 * @param {object[]} warnings Output array for parser warnings.
 */
function parseSection2(section, blocks, paraShapeMap, sectionNum, warnings) {
  // The walk always starts outside any header/footer context.
  const inHeaderFooter = false;
  walkContent(section, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter);
}
4485
/**
 * Recursively walk the element children of `node` and emit blocks.
 *
 * - HEADER and FOOTER subtrees are skipped entirely.
 * - P elements are handed to parseParagraph2 and TABLE elements to parseTable2
 *   (both suppressed when `inHeaderFooter` is true); neither is descended into here.
 * - Every other element is walked recursively.
 *
 * NOTE(review): because P is not descended into by this walker, a TABLE nested
 * inside a P is not reached from here — confirm that matches the HWPML layout.
 *
 * @param {object} node Element whose children are visited.
 * @param {object[]} blocks Output array that receives the parsed blocks.
 * @param {Map<string, object>} paraShapeMap Paragraph-shape lookup.
 * @param {number} sectionNum Section number recorded on emitted blocks.
 * @param {object[]} warnings Output array for parser warnings.
 * @param {boolean} inHeaderFooter When true, paragraphs and tables are not emitted.
 * @param {number} [depth=0] Current recursion depth; the walk stops silently past MAX_XML_DEPTH2.
 */
function walkContent(node, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth = 0) {
  if (depth > MAX_XML_DEPTH2) return;
  const kids = node.childNodes;
  for (let k = 0; k < kids.length; k++) {
    const child = kids[k];
    if (child.nodeType !== 1) continue;
    switch (localName2(child)) {
      case "HEADER":
      case "FOOTER":
        // Page furniture: contributes nothing to the body content.
        break;
      case "P":
        if (!inHeaderFooter) {
          parseParagraph2(child, blocks, paraShapeMap, sectionNum);
        }
        break;
      case "TABLE":
        if (!inHeaderFooter) {
          parseTable2(child, blocks, paraShapeMap, sectionNum, warnings);
        }
        break;
      default:
        // Containers (PARALIST, SECTION, COLDEF, ...) and unknown tags alike.
        walkContent(child, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth + 1);
    }
  }
}
4514
/**
 * Convert one P element into a heading or paragraph block.
 *
 * The element's `ParaShape` attribute is looked up in `paraShapeMap`; when the
 * matching entry carries a non-null `headingLevel` the block is a heading at that
 * level, otherwise a plain paragraph. Paragraphs with no text emit nothing.
 *
 * @param {object} el P element.
 * @param {object[]} blocks Output array; at most one block is appended.
 * @param {Map<string, { headingLevel: number | null }>} paraShapeMap Paragraph-shape lookup.
 * @param {number} sectionNum Section number stored as the block's pageNumber.
 */
function parseParagraph2(el, blocks, paraShapeMap, sectionNum) {
  const shape = paraShapeMap.get(el.getAttribute("ParaShape") ?? "");
  const text = extractParagraphText(el);
  if (!text) return;

  const level = shape?.headingLevel;
  const block = level != null
    ? { type: "heading", text, level, pageNumber: sectionNum }
    : { type: "paragraph", text, pageNumber: sectionNum };
  blocks.push(block);
}
4525
/**
 * Return the text of a paragraph element.
 *
 * The fragments gathered by collectCharText are concatenated without a
 * separator and the result is trimmed.
 *
 * @param {object} p Paragraph element.
 * @returns {string} Trimmed paragraph text; empty string when nothing was collected.
 */
function extractParagraphText(p) {
  const fragments = [];
  collectCharText(p, fragments);
  const joined = fragments.join("");
  return joined.trim();
}
4530
/**
 * Append the text of every CHAR element under `node` to `parts`, in document order.
 *
 * TABLE, PICTURE, SHAPEOBJECT and AUTONUM subtrees are skipped entirely; any
 * other element is searched recursively. CHAR elements with empty text add nothing.
 *
 * @param {object} node Element whose descendants are scanned.
 * @param {string[]} parts Output array that receives the text fragments.
 * @param {number} [depth=0] Current recursion depth; scanning stops silently past MAX_XML_DEPTH2.
 */
function collectCharText(node, parts, depth = 0) {
  if (depth > MAX_XML_DEPTH2) return;
  const kids = node.childNodes;
  for (let k = 0; k < kids.length; k++) {
    const child = kids[k];
    if (child.nodeType !== 1) continue;
    switch (localName2(child)) {
      case "CHAR": {
        const chunk = textContent(child);
        if (chunk) parts.push(chunk);
        break;
      }
      case "TABLE":
      case "PICTURE":
      case "SHAPEOBJECT":
      case "AUTONUM":
        // These contribute no paragraph text.
        break;
      default:
        collectCharText(child, parts, depth + 1);
    }
  }
}
4547
/**
 * Convert one TABLE element into a table block.
 *
 * The grid size comes from the `RowCount` / `ColCount` attributes. Each
 * ROW > CELL is placed at its `RowAddr` / `ColAddr`, and the positions covered
 * by its `RowSpan` / `ColSpan` are filled with empty placeholder cells. Grid
 * positions that no cell claims are filled with empty cells as well.
 *
 * Nothing is emitted when a declared dimension is missing, unparsable or not
 * positive, or when the table has no cells. Tables larger than
 * MAX_TABLE_ROWS x MAX_TABLE_COLS are skipped with a "TRUNCATED_TABLE" warning.
 * Cells whose address is unparsable, negative, or outside the declared grid are
 * ignored.
 *
 * @param {object} el TABLE element.
 * @param {object[]} blocks Output array; at most one table block is appended.
 * @param {Map<string, object>} paraShapeMap Unused here; kept for signature parity with the other parsers.
 * @param {number} sectionNum Section number stored as the block's pageNumber.
 * @param {object[]} warnings Output array for parser warnings.
 */
function parseTable2(el, blocks, paraShapeMap, sectionNum, warnings) {
  const rowCount = parseInt(el.getAttribute("RowCount") ?? "0", 10);
  const colCount = parseInt(el.getAttribute("ColCount") ?? "0", 10);
  // Non-positive (including negative) dimensions cannot describe a grid.
  if (Number.isNaN(rowCount) || Number.isNaN(colCount) || rowCount <= 0 || colCount <= 0) return;
  if (rowCount > MAX_TABLE_ROWS || colCount > MAX_TABLE_COLS) {
    warnings.push({ message: `\uD14C\uC774\uBE14 \uD06C\uAE30 \uCD08\uACFC (${rowCount}x${colCount}) \u2014 \uC2A4\uD0B5`, code: "TRUNCATED_TABLE" });
    return;
  }

  // Pass 1: read every ROW > CELL with its address and (clamped) spans.
  const cells = [];
  const rows = el.childNodes;
  for (let i = 0; i < rows.length; i++) {
    const rowEl = rows[i];
    if (rowEl.nodeType !== 1 || localName2(rowEl) !== "ROW") continue;
    const rowCells = rowEl.childNodes;
    for (let j = 0; j < rowCells.length; j++) {
      const cellEl = rowCells[j];
      if (cellEl.nodeType !== 1 || localName2(cellEl) !== "CELL") continue;
      const colAddr = parseInt(cellEl.getAttribute("ColAddr") ?? "0", 10);
      const rowAddr = parseInt(cellEl.getAttribute("RowAddr") ?? "0", 10);
      // `|| 1` turns NaN / 0 into a span of one before clamping.
      const colSpan = Math.min(Math.max(1, parseInt(cellEl.getAttribute("ColSpan") ?? "1", 10) || 1), MAX_TABLE_COLS);
      const rowSpan = Math.min(Math.max(1, parseInt(cellEl.getAttribute("RowSpan") ?? "1", 10) || 1), MAX_TABLE_ROWS);
      const cellText = extractCellText2(cellEl);
      cells.push({ text: cellText, colSpan, rowSpan, colAddr, rowAddr });
    }
  }
  if (cells.length === 0) return;

  // Pass 2: place cells on a rowCount x colCount grid.
  const grid = Array.from({ length: rowCount }, () => Array(colCount).fill(null));
  for (const cell of cells) {
    const r = cell.rowAddr;
    const c = cell.colAddr;
    if (Number.isNaN(r) || Number.isNaN(c)) continue;
    // A negative row would index `grid[-1]` (undefined) and throw; a negative
    // column would let the span loop overwrite unrelated real cells.
    if (r < 0 || c < 0 || r >= rowCount || c >= colCount) continue;
    grid[r][c] = cell;
    for (let dr = 0; dr < cell.rowSpan; dr++) {
      for (let dc = 0; dc < cell.colSpan; dc++) {
        if (dr === 0 && dc === 0) continue;
        if (r + dr < rowCount && c + dc < colCount) {
          grid[r + dr][c + dc] = { text: "", colSpan: 1, rowSpan: 1 };
        }
      }
    }
  }

  const cellRows = grid.map(
    (row) => row.map((cell) => cell ?? { text: "", colSpan: 1, rowSpan: 1 })
  );
  const table = buildTable(cellRows);
  blocks.push({ type: "table", table, pageNumber: sectionNum });
}
4594
/**
 * Return the text of a table cell.
 *
 * The fragments gathered by collectCellText are stripped of empty entries,
 * joined with newlines, and the result is trimmed.
 *
 * @param {object} cellEl CELL element.
 * @returns {string} Cell text with one line per collected fragment.
 */
function extractCellText2(cellEl) {
  const lines = [];
  collectCellText(cellEl, lines, 0);
  const nonEmpty = lines.filter((line) => Boolean(line));
  return nonEmpty.join("\n").trim();
}
4599
/**
 * Gather the text fragments of a table cell into `parts`.
 *
 * Each P element contributes its paragraph text (when non-empty), each TABLE
 * element contributes a fixed nested-table placeholder, and every other element
 * is searched recursively. Recursion stops silently once `depth` exceeds 20.
 *
 * @param {object} node Element whose children are scanned.
 * @param {string[]} parts Output array that receives the fragments.
 * @param {number} depth Current recursion depth.
 */
function collectCellText(node, parts, depth) {
  if (depth > 20) return;
  const kids = node.childNodes;
  for (let k = 0; k < kids.length; k++) {
    const child = kids[k];
    if (child.nodeType !== 1) continue;
    switch (localName2(child)) {
      case "P": {
        const paragraph = extractParagraphText(child);
        if (paragraph) parts.push(paragraph);
        break;
      }
      case "TABLE":
        // Nested tables are not expanded; a marker is emitted instead.
        parts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
        break;
      default:
        collectCellText(child, parts, depth + 1);
    }
  }
}
4616
/**
 * Return an element's tag name without its namespace prefix.
 *
 * `tagName` is preferred, then `localName`; everything up to and including the
 * first colon is removed when a non-empty prefix precedes it.
 *
 * @param {{ tagName?: string, localName?: string }} el Element-like object.
 * @returns {string} Unprefixed name, or an empty string when neither property is set.
 */
function localName2(el) {
  const qualified = el.tagName || el.localName || "";
  return qualified.replace(/^[^:]+:/, "");
}
4619
/**
 * Find the first direct element child of `parent` with the given unprefixed tag name.
 *
 * @param {{ childNodes: ArrayLike<object> }} parent Node whose direct children are searched.
 * @param {string} tag Tag name to match, without namespace prefix.
 * @returns {object | null} The matching child element, or null when there is none.
 */
function findChild(parent, tag) {
  const kids = parent.childNodes;
  let idx = 0;
  while (idx < kids.length) {
    const candidate = kids[idx];
    idx += 1;
    if (candidate.nodeType !== 1) continue;
    if (localName2(candidate) === tag) return candidate;
  }
  return null;
}
4627
/**
 * Concatenate the text of all text-node descendants of `el`, in document order.
 *
 * Only text nodes (nodeType 3) contribute; element nodes (nodeType 1) are
 * descended into and every other node type is ignored. The traversal uses an
 * explicit stack instead of recursion so that arbitrarily deep nesting in
 * untrusted input cannot overflow the call stack.
 *
 * @param {{ childNodes: ArrayLike<object> }} el Node whose descendants are read.
 * @returns {string} Concatenated text; empty string when there is none.
 */
function textContent(el) {
  const parts = [];
  const pending = [];
  // Children are pushed last-to-first so that pop() yields them in document order.
  const pushChildren = (parent) => {
    const children = parent.childNodes;
    for (let i = children.length - 1; i >= 0; i--) {
      pending.push(children[i]);
    }
  };

  pushChildren(el);
  while (pending.length > 0) {
    const node = pending.pop();
    if (node.nodeType === 3) {
      parts.push(node.nodeValue || "");
    } else if (node.nodeType === 1) {
      pushChildren(node);
    }
  }
  return parts.join("");
}
4640
/**
 * Count the direct SECTION element children of `body`.
 *
 * @param {{ childNodes: ArrayLike<object> }} body Node whose direct children are inspected.
 * @returns {number} Number of direct children that are elements named SECTION (prefix ignored).
 */
function countSections(body) {
  const kids = body.childNodes;
  let total = 0;
  for (let idx = kids.length - 1; idx >= 0; idx--) {
    const kid = kids[idx];
    if (kid.nodeType !== 1) continue;
    if (localName2(kid) === "SECTION") total += 1;
  }
  return total;
}
4649
+
4257
4650
  // src/index.ts
4258
4651
  async function parse(input, options) {
4259
4652
  let buffer;
4653
+ const opts = typeof input === "string" && !options?.filePath ? { ...options, filePath: input } : options;
4260
4654
  if (typeof input === "string") {
4261
4655
  try {
4262
4656
  const buf = await readFile(input);
@@ -4277,14 +4671,16 @@ async function parse(input, options) {
4277
4671
  switch (format) {
4278
4672
  case "hwpx": {
4279
4673
  const zipFormat = await detectZipFormat(buffer);
4280
- if (zipFormat === "xlsx") return parseXlsx(buffer, options);
4281
- if (zipFormat === "docx") return parseDocx(buffer, options);
4282
- return parseHwpx(buffer, options);
4674
+ if (zipFormat === "xlsx") return parseXlsx(buffer, opts);
4675
+ if (zipFormat === "docx") return parseDocx(buffer, opts);
4676
+ return parseHwpx(buffer, opts);
4283
4677
  }
4284
4678
  case "hwp":
4285
- return parseHwp(buffer, options);
4679
+ return parseHwp(buffer, opts);
4680
+ case "hwpml":
4681
+ return parseHwpml(buffer, opts);
4286
4682
  case "pdf":
4287
- return parsePdf(buffer, options);
4683
+ return parsePdf(buffer, opts);
4288
4684
  default:
4289
4685
  return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
4290
4686
  }
@@ -4308,7 +4704,7 @@ async function parseHwp(buffer, options) {
4308
4704
  async function parsePdf(buffer, options) {
4309
4705
  let parsePdfDocument;
4310
4706
  try {
4311
- const mod = await import("./parser-AMP7MAOH.js");
4707
+ const mod = await import("./parser-4275GJRB.js");
4312
4708
  parsePdfDocument = mod.parsePdfDocument;
4313
4709
  } catch {
4314
4710
  return {
@@ -4342,6 +4738,14 @@ async function parseDocx(buffer, options) {
4342
4738
  return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
4343
4739
  }
4344
4740
  }
4741
/**
 * Parse an HWPML buffer and wrap the outcome in a result object.
 *
 * Any error thrown by parseHwpmlDocument is converted into a failure result
 * instead of propagating to the caller.
 *
 * @param {ArrayBuffer | ArrayBufferView} buffer Raw HWPML file contents.
 * @param {object} [options] Parse options forwarded to parseHwpmlDocument.
 * @returns {Promise<object>} On success `{ success: true, fileType: "hwpml", markdown,
 *   blocks, metadata, outline, warnings }`; on failure `{ success: false,
 *   fileType: "hwpml", error, code }` where `code` comes from classifyError.
 */
async function parseHwpml(buffer, options) {
  try {
    const parsed = parseHwpmlDocument(buffer, options);
    return {
      success: true,
      fileType: "hwpml",
      markdown: parsed.markdown,
      blocks: parsed.blocks,
      metadata: parsed.metadata,
      outline: parsed.outline,
      warnings: parsed.warnings
    };
  } catch (err) {
    // Non-Error throwables get a generic failure message.
    const message = err instanceof Error ? err.message : "HWPML \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwpml", error: message, code: classifyError(err) };
  }
}
4345
4749
 
4346
4750
  // src/diff/text-diff.ts
4347
4751
  function similarity(a, b) {
@@ -4530,4 +4934,4 @@ export {
4530
4934
  compare,
4531
4935
  parse
4532
4936
  };
4533
- //# sourceMappingURL=chunk-RF6UJXR3.js.map
4937
+ //# sourceMappingURL=chunk-KSBPABBQ.js.map