kordoc 2.2.6 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/README.md +22 -3
  2. package/dist/{chunk-RF6UJXR3.js → chunk-KSBPABBQ.js} +482 -78
  3. package/dist/chunk-KSBPABBQ.js.map +1 -0
  4. package/dist/{chunk-5Y2Q3BRW.js → chunk-M3E3C5GS.js} +8 -1
  5. package/dist/chunk-M3E3C5GS.js.map +1 -0
  6. package/dist/{chunk-FCQEF2ZM.js → chunk-VJPDY4YT.js} +2 -2
  7. package/dist/{chunk-NL5XLN5R.js.map → chunk-VJPDY4YT.js.map} +1 -1
  8. package/dist/{chunk-HXUCZ2IL.cjs → chunk-VLSATRNQ.cjs} +2 -2
  9. package/dist/{chunk-HXUCZ2IL.cjs.map → chunk-VLSATRNQ.cjs.map} +1 -1
  10. package/dist/{chunk-NL5XLN5R.js → chunk-XG5CQUSC.js} +2 -2
  11. package/dist/{chunk-FCQEF2ZM.js.map → chunk-XG5CQUSC.js.map} +1 -1
  12. package/dist/cli.js +5 -5
  13. package/dist/cli.js.map +1 -1
  14. package/dist/{detect-GYK3HKD5.js → detect-I7YIS4Q6.js} +4 -2
  15. package/dist/index.cjs +608 -197
  16. package/dist/index.cjs.map +1 -1
  17. package/dist/index.d.cts +6 -2
  18. package/dist/index.d.ts +6 -2
  19. package/dist/index.js +500 -89
  20. package/dist/index.js.map +1 -1
  21. package/dist/mcp.js +5 -5
  22. package/dist/{parser-AMP7MAOH.js → parser-4275GJRB.js} +45 -42
  23. package/dist/{parser-AMP7MAOH.js.map → parser-4275GJRB.js.map} +1 -1
  24. package/dist/{parser-KOWPTDJU.cjs → parser-STAOZMUC.cjs} +61 -58
  25. package/dist/{parser-KOWPTDJU.cjs.map → parser-STAOZMUC.cjs.map} +1 -1
  26. package/dist/{parser-43IAQ5KE.js → parser-XRUZEFZT.js} +45 -42
  27. package/dist/{parser-43IAQ5KE.js.map → parser-XRUZEFZT.js.map} +1 -1
  28. package/dist/{watch-IUQXOXW3.js → watch-BFLNFJBE.js} +4 -4
  29. package/package.json +2 -2
  30. package/dist/chunk-5Y2Q3BRW.js.map +0 -1
  31. package/dist/chunk-RF6UJXR3.js.map +0 -1
  32. package/dist/{detect-GYK3HKD5.js.map → detect-I7YIS4Q6.js.map} +0 -0
  33. package/dist/{watch-IUQXOXW3.js.map → watch-BFLNFJBE.js.map} +0 -0
package/dist/index.js CHANGED
@@ -16,7 +16,7 @@ import {
16
16
  sanitizeHref,
17
17
  stripDtd,
18
18
  toArrayBuffer
19
- } from "./chunk-NL5XLN5R.js";
19
+ } from "./chunk-XG5CQUSC.js";
20
20
  import {
21
21
  parsePageRange
22
22
  } from "./chunk-SBVRCJFH.js";
@@ -44,11 +44,17 @@ function isPdfFile(buffer) {
44
44
  const b = magicBytes(buffer);
45
45
  return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
46
46
  }
47
+ function isHwpmlFile(buffer) {
48
+ const bytes = new Uint8Array(buffer, 0, Math.min(512, buffer.byteLength));
49
+ const head = new TextDecoder("utf-8", { fatal: false }).decode(bytes).replace(/^\uFEFF/, "");
50
+ return head.trimStart().startsWith("<?xml") && head.includes("<HWPML");
51
+ }
47
52
  function detectFormat(buffer) {
48
53
  if (buffer.byteLength < 4) return "unknown";
49
54
  if (isZipFile(buffer)) return "hwpx";
50
55
  if (isOldHwpFile(buffer)) return "hwp";
51
56
  if (isPdfFile(buffer)) return "pdf";
57
+ if (isHwpmlFile(buffer)) return "hwpml";
52
58
  return "unknown";
53
59
  }
54
60
  async function detectZipFormat(buffer) {
@@ -69,6 +75,100 @@ async function detectZipFormat(buffer) {
69
75
  import JSZip2 from "jszip";
70
76
  import { inflateRawSync } from "zlib";
71
77
  import { DOMParser } from "@xmldom/xmldom";
78
+
79
+ // src/hwpx/com-fallback.ts
80
+ import { execFileSync } from "child_process";
81
+ import { platform } from "os";
82
+ function isComFallbackAvailable() {
83
+ return platform() === "win32";
84
+ }
85
+ function isEncryptedHwpx(manifestXml) {
86
+ return manifestXml.includes("encryption-data");
87
+ }
88
+ function extractTextViaCom(filePath) {
89
+ if (!isComFallbackAvailable()) {
90
+ throw new Error("COM fallback\uC740 Windows\uC5D0\uC11C\uB9CC \uC0AC\uC6A9 \uAC00\uB2A5\uD569\uB2C8\uB2E4");
91
+ }
92
+ const escaped = filePath.replace(/'/g, "''");
93
+ const ps1 = `
94
+ [Console]::OutputEncoding = [System.Text.Encoding]::UTF8
95
+ $ErrorActionPreference = 'Stop'
96
+ try {
97
+ $hwp = New-Object -ComObject HWPFrame.HwpObject
98
+ $hwp.RegisterModule('FilePathCheckerModule', 'FilePathCheckerModuleExample') | Out-Null
99
+ $hwp.Open('${escaped}', '', '') | Out-Null
100
+ $pc = $hwp.PageCount
101
+ $result = @{ pageCount = $pc; pages = @() }
102
+ for ($p = 1; $p -le $pc; $p++) {
103
+ $t = $hwp.GetPageText($p, 0)
104
+ $result.pages += @($t)
105
+ }
106
+ $hwp.Clear(1)
107
+ [System.Runtime.InteropServices.Marshal]::ReleaseComObject($hwp) | Out-Null
108
+ $result | ConvertTo-Json -Depth 3 -Compress
109
+ } catch {
110
+ @{ error = $_.Exception.Message } | ConvertTo-Json -Compress
111
+ }
112
+ `;
113
+ const stdout = execFileSync("powershell", [
114
+ "-NoProfile",
115
+ "-NonInteractive",
116
+ "-ExecutionPolicy",
117
+ "Bypass",
118
+ "-Command",
119
+ ps1
120
+ ], {
121
+ encoding: "utf-8",
122
+ timeout: 12e4,
123
+ // 2분 타임아웃
124
+ windowsHide: true,
125
+ maxBuffer: 50 * 1024 * 1024
126
+ // 50MB
127
+ });
128
+ const trimmed = stdout.trim();
129
+ const jsonStart = trimmed.indexOf("{");
130
+ if (jsonStart < 0) throw new Error(`COM \uCD9C\uB825\uC5D0 JSON\uC774 \uC5C6\uC2B5\uB2C8\uB2E4: ${trimmed.slice(0, 200)}`);
131
+ const json = JSON.parse(trimmed.slice(jsonStart));
132
+ if (json.error) {
133
+ throw new Error(`COM \uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uC2E4\uD328: ${json.error}`);
134
+ }
135
+ const warnings = [];
136
+ const pages = Array.isArray(json.pages) ? json.pages : [];
137
+ const pageCount = json.pageCount ?? pages.length;
138
+ if (pages.length === 0) {
139
+ warnings.push({ message: "COM\uC73C\uB85C \uD14D\uC2A4\uD2B8\uB97C \uCD94\uCD9C\uD558\uC9C0 \uBABB\uD588\uC2B5\uB2C8\uB2E4", code: "COM_EMPTY" });
140
+ }
141
+ return { pages, pageCount, warnings };
142
+ }
143
+ function comResultToParseResult(pages, pageCount, warnings) {
144
+ const blocks = [];
145
+ const lines = [];
146
+ for (let i = 0; i < pages.length; i++) {
147
+ const text = (pages[i] ?? "").trim();
148
+ if (!text) continue;
149
+ const paragraphs = text.split(/\n/);
150
+ for (const para of paragraphs) {
151
+ const trimmed = para.trim();
152
+ if (!trimmed) continue;
153
+ blocks.push({ type: "paragraph", text: trimmed, pageNumber: i + 1 });
154
+ lines.push(trimmed);
155
+ }
156
+ }
157
+ const markdown = lines.join("\n\n");
158
+ const metadata = { pageCount };
159
+ warnings.push({
160
+ message: "DRM \uBB38\uC11C: \uD55C\uCEF4 COM API\uB85C \uD14D\uC2A4\uD2B8 \uCD94\uCD9C (\uC11C\uC2DD/\uD45C \uC815\uBCF4 \uC81C\uD55C\uC801)",
161
+ code: "DRM_COM_FALLBACK"
162
+ });
163
+ return {
164
+ markdown,
165
+ blocks,
166
+ metadata,
167
+ warnings: warnings.length > 0 ? warnings : void 0
168
+ };
169
+ }
170
+
171
+ // src/hwpx/parser.ts
72
172
  var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
73
173
  var MAX_ZIP_ENTRIES = 500;
74
174
  function clampSpan(val, max) {
@@ -173,6 +273,19 @@ async function parseHwpxDocument(buffer, options) {
173
273
  if (actualEntryCount > MAX_ZIP_ENTRIES) {
174
274
  throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
175
275
  }
276
+ const manifestFile = zip.file("META-INF/manifest.xml");
277
+ if (manifestFile) {
278
+ const manifestXml = await manifestFile.async("text");
279
+ if (isEncryptedHwpx(manifestXml)) {
280
+ if (isComFallbackAvailable() && options?.filePath) {
281
+ const { pages, pageCount, warnings: warnings2 } = extractTextViaCom(options.filePath);
282
+ if (pages.some((p) => p && p.trim().length > 0)) {
283
+ return comResultToParseResult(pages, pageCount, warnings2);
284
+ }
285
+ }
286
+ throw new KordocError("DRM \uC554\uD638\uD654\uB41C HWPX \uD30C\uC77C\uC785\uB2C8\uB2E4. Windows + \uD55C\uCEF4 \uC624\uD53C\uC2A4 \uC124\uCE58 \uC2DC \uC790\uB3D9 \uCD94\uCD9C\uB429\uB2C8\uB2E4.");
287
+ }
288
+ }
176
289
  const decompressed = { total: 0 };
177
290
  const metadata = {};
178
291
  await extractHwpxMetadata(zip, metadata, decompressed);
@@ -184,6 +297,7 @@ async function parseHwpxDocument(buffer, options) {
184
297
  const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
185
298
  const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
186
299
  const blocks = [];
300
+ const nestedTableCounter = { count: 0 };
187
301
  let parsedSections = 0;
188
302
  for (let si = 0; si < sectionPaths.length; si++) {
189
303
  if (pageFilter && !pageFilter.has(si + 1)) continue;
@@ -193,7 +307,7 @@ async function parseHwpxDocument(buffer, options) {
193
307
  const xml = await file.async("text");
194
308
  decompressed.total += xml.length * 2;
195
309
  if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
196
- blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
310
+ blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1, nestedTableCounter));
197
311
  parsedSections++;
198
312
  options?.onProgress?.(parsedSections, totalTarget);
199
313
  } catch (secErr) {
@@ -254,8 +368,20 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
254
368
  ref
255
369
  // 절대 경로일 수도 있음
256
370
  ];
371
+ let resolvedPath = null;
372
+ if (!ref.includes(".")) {
373
+ const prefixes = [`BinData/${ref}`, `Contents/BinData/${ref}`];
374
+ for (const prefix of prefixes) {
375
+ const match = zip.file(new RegExp(`^${prefix.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\.[a-zA-Z0-9]+$`));
376
+ if (match.length > 0) {
377
+ resolvedPath = match[0].name;
378
+ break;
379
+ }
380
+ }
381
+ }
257
382
  let found = false;
258
- for (const path of candidates) {
383
+ const allCandidates = resolvedPath ? [resolvedPath, ...candidates] : candidates;
384
+ for (const path of allCandidates) {
259
385
  if (isPathTraversal(path)) continue;
260
386
  const file = zip.file(path);
261
387
  if (!file) continue;
@@ -263,7 +389,8 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
263
389
  const data = await file.async("uint8array");
264
390
  decompressed.total += data.length;
265
391
  if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
266
- const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
392
+ const actualPath = path;
393
+ const ext = actualPath.includes(".") ? actualPath.split(".").pop() || "png" : "png";
267
394
  const mimeType = imageExtToMime(ext);
268
395
  imageIndex++;
269
396
  const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
@@ -336,6 +463,7 @@ function extractFromBrokenZip(buffer) {
336
463
  let totalDecompressed = 0;
337
464
  let entryCount = 0;
338
465
  let sectionNum = 0;
466
+ const nestedTableCounter = { count: 0 };
339
467
  while (pos < data.length - 30) {
340
468
  if (data[pos] !== 80 || data[pos + 1] !== 75 || data[pos + 2] !== 3 || data[pos + 3] !== 4) {
341
469
  pos++;
@@ -382,7 +510,7 @@ function extractFromBrokenZip(buffer) {
382
510
  totalDecompressed += content.length * 2;
383
511
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
384
512
  sectionNum++;
385
- blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum));
513
+ blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum, nestedTableCounter));
386
514
  } catch {
387
515
  continue;
388
516
  }
@@ -467,12 +595,40 @@ function detectHwpxHeadings(blocks, styleMap) {
467
595
  }
468
596
  }
469
597
  }
470
- function parseSectionXml(xml, styleMap, warnings, sectionNum) {
598
+ function makeNestedTableMarker(counter, rows) {
599
+ counter.count++;
600
+ const firstRow = rows[0] ?? [];
601
+ const hint = firstRow.map((c) => c.text.trim().replace(/\n/g, " ")).filter(Boolean).join(" | ");
602
+ const hintChars = [...hint];
603
+ const truncated = hintChars.length > 60 ? hintChars.slice(0, 60).join("") + "\u2026" : hint;
604
+ return truncated ? `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}: ${truncated}]` : `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}]`;
605
+ }
606
+ function handleNestedTable(newTable, tableStack, blocks, ctx) {
607
+ const parentTable = tableStack.pop();
608
+ let nestedCols = 0;
609
+ for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
610
+ if (newTable.rows.length >= 3 && nestedCols >= 2) {
611
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
612
+ if (parentTable.cell) {
613
+ const marker = ctx.counter ? makeNestedTableMarker(ctx.counter, newTable.rows) : "[\uC911\uCCA9 \uD14C\uC774\uBE14]";
614
+ parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + marker;
615
+ }
616
+ } else {
617
+ const nestedText = convertTableToText(newTable.rows);
618
+ if (parentTable.cell) {
619
+ const marker = ctx.counter ? makeNestedTableMarker(ctx.counter, newTable.rows) : "[\uC911\uCCA9 \uD14C\uC774\uBE14]";
620
+ parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + marker + "\n" + nestedText;
621
+ }
622
+ }
623
+ return parentTable;
624
+ }
625
+ function parseSectionXml(xml, styleMap, warnings, sectionNum, counter) {
471
626
  const parser = createXmlParser(warnings);
472
627
  const doc = parser.parseFromString(stripDtd(xml), "text/xml");
473
628
  if (!doc.documentElement) return [];
474
629
  const blocks = [];
475
- walkSection(doc.documentElement, blocks, null, [], styleMap, warnings, sectionNum);
630
+ const ctx = { styleMap, warnings, sectionNum, counter };
631
+ walkSection(doc.documentElement, blocks, null, [], ctx);
476
632
  return blocks;
477
633
  }
478
634
  function extractImageRef(el) {
@@ -493,7 +649,7 @@ function extractImageRef(el) {
493
649
  if (directRef) return directRef;
494
650
  return null;
495
651
  }
496
- function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth = 0) {
652
+ function walkSection(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
497
653
  if (depth > MAX_XML_DEPTH) return;
498
654
  const children = node.childNodes;
499
655
  if (!children) return;
@@ -506,23 +662,12 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
506
662
  case "tbl": {
507
663
  if (tableCtx) tableStack.push(tableCtx);
508
664
  const newTable = { rows: [], currentRow: [], cell: null };
509
- walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum, depth + 1);
665
+ walkSection(el, blocks, newTable, tableStack, ctx, depth + 1);
510
666
  if (newTable.rows.length > 0) {
511
667
  if (tableStack.length > 0) {
512
- const parentTable = tableStack.pop();
513
- let nestedCols = 0;
514
- for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
515
- if (newTable.rows.length >= 3 && nestedCols >= 2) {
516
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
517
- } else {
518
- const nestedText = convertTableToText(newTable.rows);
519
- if (parentTable.cell) {
520
- parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
521
- }
522
- }
523
- tableCtx = parentTable;
668
+ tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
524
669
  } else {
525
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
670
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
526
671
  tableCtx = null;
527
672
  }
528
673
  } else {
@@ -533,7 +678,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
533
678
  case "tr":
534
679
  if (tableCtx) {
535
680
  tableCtx.currentRow = [];
536
- walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
681
+ walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
537
682
  if (tableCtx.currentRow.length > 0) tableCtx.rows.push(tableCtx.currentRow);
538
683
  tableCtx.currentRow = [];
539
684
  }
@@ -541,7 +686,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
541
686
  case "tc":
542
687
  if (tableCtx) {
543
688
  tableCtx.cell = { text: "", colSpan: 1, rowSpan: 1 };
544
- walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
689
+ walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
545
690
  if (tableCtx.cell) {
546
691
  tableCtx.currentRow.push(tableCtx.cell);
547
692
  tableCtx.cell = null;
@@ -567,19 +712,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
567
712
  }
568
713
  break;
569
714
  case "p": {
570
- const { text, href, footnote, style } = extractParagraphInfo(el, styleMap);
715
+ const { text, href, footnote, style } = extractParagraphInfo(el, ctx.styleMap);
571
716
  if (text) {
572
717
  if (tableCtx?.cell) {
573
718
  tableCtx.cell.text += (tableCtx.cell.text ? "\n" : "") + text;
574
719
  } else if (!tableCtx) {
575
- const block = { type: "paragraph", text, pageNumber: sectionNum };
720
+ const block = { type: "paragraph", text, pageNumber: ctx.sectionNum };
576
721
  if (style) block.style = style;
577
722
  if (href) block.href = href;
578
723
  if (footnote) block.footnoteText = footnote;
579
724
  blocks.push(block);
580
725
  }
581
726
  }
582
- tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
727
+ tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
583
728
  break;
584
729
  }
585
730
  // 이미지/그림 — 경로 추출 또는 경고
@@ -588,19 +733,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
588
733
  case "drawingObject": {
589
734
  const imgRef = extractImageRef(el);
590
735
  if (imgRef) {
591
- blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
592
- } else if (warnings && sectionNum) {
593
- warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
736
+ blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
737
+ } else if (ctx.warnings && ctx.sectionNum) {
738
+ ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
594
739
  }
595
740
  break;
596
741
  }
597
742
  default:
598
- walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
743
+ walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
599
744
  break;
600
745
  }
601
746
  }
602
747
  }
603
- function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth = 0) {
748
+ function walkParagraphChildren(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
604
749
  if (depth > MAX_XML_DEPTH) return tableCtx;
605
750
  const children = node.childNodes;
606
751
  if (!children) return tableCtx;
@@ -616,23 +761,12 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
616
761
  if (localTag === "tbl") {
617
762
  if (tableCtx) tableStack.push(tableCtx);
618
763
  const newTable = { rows: [], currentRow: [], cell: null };
619
- walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum, d + 1);
764
+ walkSection(el, blocks, newTable, tableStack, ctx, d + 1);
620
765
  if (newTable.rows.length > 0) {
621
766
  if (tableStack.length > 0) {
622
- const parentTable = tableStack.pop();
623
- let nestedCols = 0;
624
- for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
625
- if (newTable.rows.length >= 3 && nestedCols >= 2) {
626
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
627
- } else {
628
- const nestedText = convertTableToText(newTable.rows);
629
- if (parentTable.cell) {
630
- parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
631
- }
632
- }
633
- tableCtx = parentTable;
767
+ tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
634
768
  } else {
635
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
769
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
636
770
  tableCtx = null;
637
771
  }
638
772
  } else {
@@ -641,21 +775,21 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
641
775
  } else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
642
776
  const drawTextChild = findDescendant(el, "drawText");
643
777
  if (drawTextChild) {
644
- extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
778
+ extractDrawTextBlocks(drawTextChild, blocks, ctx.styleMap, ctx.sectionNum);
645
779
  } else {
646
780
  const imgRef = extractImageRef(el);
647
781
  if (imgRef) {
648
- blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
649
- } else if (warnings && sectionNum) {
650
- warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
782
+ blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
783
+ } else if (ctx.warnings && ctx.sectionNum) {
784
+ ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
651
785
  }
652
786
  }
653
787
  } else if (localTag === "drawText") {
654
- extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
788
+ extractDrawTextBlocks(el, blocks, ctx.styleMap, ctx.sectionNum);
655
789
  } else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
656
790
  walkChildren(el, d + 1);
657
791
  } else if (localTag === "run") {
658
- tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
792
+ tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
659
793
  }
660
794
  }
661
795
  };
@@ -1928,6 +2062,7 @@ function parseHwp5Document(buffer, options) {
1928
2062
  const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
1929
2063
  const totalTarget = pageFilter ? pageFilter.size : sections.length;
1930
2064
  const blocks = [];
2065
+ const nestedTableCounter = { count: 0 };
1931
2066
  let totalDecompressed = 0;
1932
2067
  let parsedSections = 0;
1933
2068
  for (let si = 0; si < sections.length; si++) {
@@ -1938,7 +2073,7 @@ function parseHwp5Document(buffer, options) {
1938
2073
  totalDecompressed += data.length;
1939
2074
  if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
1940
2075
  const records = readRecords(data);
1941
- const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
2076
+ const sectionBlocks = parseSection(records, docInfo, warnings, si + 1, nestedTableCounter);
1942
2077
  blocks.push(...sectionBlocks);
1943
2078
  parsedSections++;
1944
2079
  options?.onProgress?.(parsedSections, totalTarget);
@@ -2258,13 +2393,13 @@ function extractHwp5ImagesLenient(lcfb, blocks, compressed, warnings) {
2258
2393
  }
2259
2394
  return images;
2260
2395
  }
2261
- function parseSection(records, docInfo, warnings, sectionNum) {
2396
+ function parseSection(records, docInfo, warnings, sectionNum, counter) {
2262
2397
  const blocks = [];
2263
2398
  let i = 0;
2264
2399
  while (i < records.length) {
2265
2400
  const rec = records[i];
2266
2401
  if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
2267
- const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
2402
+ const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i, counter);
2268
2403
  if (paragraph) {
2269
2404
  const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
2270
2405
  if (docInfo && charShapeIds.length > 0) {
@@ -2287,7 +2422,7 @@ function parseSection(records, docInfo, warnings, sectionNum) {
2287
2422
  if (rec.tagId === TAG_CTRL_HEADER && rec.level <= 1 && rec.data.length >= 4) {
2288
2423
  const ctrlId = rec.data.subarray(0, 4).toString("ascii");
2289
2424
  if (ctrlId === " lbt" || ctrlId === "tbl ") {
2290
- const { table, nextIdx } = parseTableBlock(records, i);
2425
+ const { table, nextIdx } = parseTableBlock(records, i, counter);
2291
2426
  if (table) blocks.push({ type: "table", table, pageNumber: sectionNum });
2292
2427
  i = nextIdx;
2293
2428
  continue;
@@ -2392,7 +2527,7 @@ function resolveCharStyle(charShapeIds, docInfo) {
2392
2527
  if (cs.attrFlags & 2) style.bold = true;
2393
2528
  return style.fontSize || style.bold || style.italic ? style : void 0;
2394
2529
  }
2395
- function parseParagraphWithTables(records, startIdx) {
2530
+ function parseParagraphWithTables(records, startIdx, counter) {
2396
2531
  const startLevel = records[startIdx].level;
2397
2532
  let text = "";
2398
2533
  const tables = [];
@@ -2414,7 +2549,7 @@ function parseParagraphWithTables(records, startIdx) {
2414
2549
  if (rec.tagId === TAG_CTRL_HEADER && rec.data.length >= 4) {
2415
2550
  const ctrlId = rec.data.subarray(0, 4).toString("ascii");
2416
2551
  if (ctrlId === " lbt" || ctrlId === "tbl ") {
2417
- const { table, nextIdx } = parseTableBlock(records, i);
2552
+ const { table, nextIdx } = parseTableBlock(records, i, counter);
2418
2553
  if (table) tables.push(table);
2419
2554
  i = nextIdx;
2420
2555
  continue;
@@ -2425,7 +2560,7 @@ function parseParagraphWithTables(records, startIdx) {
2425
2560
  const trimmed = text.trim();
2426
2561
  return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
2427
2562
  }
2428
- function parseTableBlock(records, startIdx) {
2563
+ function parseTableBlock(records, startIdx, counter) {
2429
2564
  const tableLevel = records[startIdx].level;
2430
2565
  let i = startIdx + 1;
2431
2566
  let rows = 0, cols = 0;
@@ -2439,7 +2574,7 @@ function parseTableBlock(records, startIdx) {
2439
2574
  cols = Math.min(rec.data.readUInt16LE(6), MAX_COLS);
2440
2575
  }
2441
2576
  if (rec.tagId === TAG_LIST_HEADER) {
2442
- const { cell, nextIdx } = parseCellBlock(records, i, tableLevel);
2577
+ const { cell, nextIdx } = parseCellBlock(records, i, tableLevel, counter);
2443
2578
  if (cell) cells.push(cell);
2444
2579
  i = nextIdx;
2445
2580
  continue;
@@ -2460,7 +2595,7 @@ function parseTableBlock(records, startIdx) {
2460
2595
  const cellRows = arrangeCells(rows, cols, cells);
2461
2596
  return { table: buildTable(cellRows), nextIdx: i };
2462
2597
  }
2463
- function parseCellBlock(records, startIdx, tableLevel) {
2598
+ function parseCellBlock(records, startIdx, tableLevel, counter) {
2464
2599
  const rec = records[startIdx];
2465
2600
  const cellLevel = rec.level;
2466
2601
  const texts = [];
@@ -2485,6 +2620,17 @@ function parseCellBlock(records, startIdx, tableLevel) {
2485
2620
  const t = extractText(r.data).trim();
2486
2621
  if (t) texts.push(t);
2487
2622
  }
2623
+ if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
2624
+ const ctrlId = r.data.subarray(0, 4).toString("ascii");
2625
+ if (ctrlId === " lbt" || ctrlId === "tbl ") {
2626
+ if (counter) {
2627
+ counter.count++;
2628
+ texts.push(`[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}]`);
2629
+ } else {
2630
+ texts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
2631
+ }
2632
+ }
2633
+ }
2488
2634
  i++;
2489
2635
  }
2490
2636
  return { cell: { text: texts.join("\n"), colSpan, rowSpan, colAddr, rowAddr }, nextIdx: i };
@@ -2811,21 +2957,21 @@ async function parseXlsxDocument(buffer, options) {
2811
2957
  import JSZip4 from "jszip";
2812
2958
  import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
2813
2959
  var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
2814
- function getChildElements(parent, localName2) {
2960
+ function getChildElements(parent, localName3) {
2815
2961
  const result = [];
2816
2962
  const children = parent.childNodes;
2817
2963
  for (let i = 0; i < children.length; i++) {
2818
2964
  const node = children[i];
2819
2965
  if (node.nodeType === 1) {
2820
2966
  const el = node;
2821
- if (el.localName === localName2 || el.tagName?.endsWith(`:${localName2}`)) {
2967
+ if (el.localName === localName3 || el.tagName?.endsWith(`:${localName3}`)) {
2822
2968
  result.push(el);
2823
2969
  }
2824
2970
  }
2825
2971
  }
2826
2972
  return result;
2827
2973
  }
2828
- function findElements(parent, localName2) {
2974
+ function findElements(parent, localName3) {
2829
2975
  const result = [];
2830
2976
  const walk = (node) => {
2831
2977
  const children = node.childNodes;
@@ -2833,7 +2979,7 @@ function findElements(parent, localName2) {
2833
2979
  const child = children[i];
2834
2980
  if (child.nodeType === 1) {
2835
2981
  const el = child;
2836
- if (el.localName === localName2 || el.tagName?.endsWith(`:${localName2}`)) {
2982
+ if (el.localName === localName3 || el.tagName?.endsWith(`:${localName3}`)) {
2837
2983
  result.push(el);
2838
2984
  }
2839
2985
  walk(el);
@@ -2843,11 +2989,11 @@ function findElements(parent, localName2) {
2843
2989
  walk(parent);
2844
2990
  return result;
2845
2991
  }
2846
- function getAttr(el, localName2) {
2992
+ function getAttr(el, localName3) {
2847
2993
  const attrs = el.attributes;
2848
2994
  for (let i = 0; i < attrs.length; i++) {
2849
2995
  const attr = attrs[i];
2850
- if (attr.localName === localName2 || attr.name === localName2) return attr.value;
2996
+ if (attr.localName === localName3 || attr.name === localName3) return attr.value;
2851
2997
  }
2852
2998
  return null;
2853
2999
  }
@@ -3194,11 +3340,11 @@ async function parseDocxDocument(buffer, options) {
3194
3340
  const node = children[i];
3195
3341
  if (node.nodeType !== 1) continue;
3196
3342
  const el = node;
3197
- const localName2 = el.localName ?? el.tagName?.split(":").pop();
3198
- if (localName2 === "p") {
3343
+ const localName3 = el.localName ?? el.tagName?.split(":").pop();
3344
+ if (localName3 === "p") {
3199
3345
  const block = parseParagraph(el, styles, numbering, footnotes, rels);
3200
3346
  if (block) blocks.push(block);
3201
- } else if (localName2 === "tbl") {
3347
+ } else if (localName3 === "tbl") {
3202
3348
  const block = parseTable(el, styles, numbering, footnotes, rels);
3203
3349
  if (block) blocks.push(block);
3204
3350
  }
@@ -3236,6 +3382,259 @@ async function parseDocxDocument(buffer, options) {
3236
3382
  };
3237
3383
  }
3238
3384
 
3385
+ // src/hwpml/parser.ts
3386
+ import { DOMParser as DOMParser4 } from "@xmldom/xmldom";
3387
+ var MAX_XML_DEPTH2 = 200;
3388
+ var MAX_TABLE_ROWS = 5e3;
3389
+ var MAX_TABLE_COLS = 500;
3390
+ var MAX_HWPML_BYTES = 50 * 1024 * 1024;
3391
/**
 * Parse an HWPML (XML) document into blocks, Markdown, metadata and an outline.
 *
 * Direct SECTION children of BODY are numbered from 1; that number is handed to
 * the section parser and is what `options.pages` filters on.
 *
 * @param {ArrayBuffer|ArrayBufferView} buffer - Raw file bytes, decoded as UTF-8.
 * @param {{ pages?: * }} [options] - When `pages` is set it is passed to
 *   `parsePageRange` together with the section count to build the section filter.
 * @returns {{ markdown: string, blocks: Array, metadata?: object, outline?: Array, warnings?: Array }}
 *   Parse result; on the normal path empty metadata/outline/warnings are reported as `undefined`.
 * @throws {Error} When `buffer.byteLength` exceeds MAX_HWPML_BYTES.
 */
function parseHwpmlDocument(buffer, options) {
  if (buffer.byteLength > MAX_HWPML_BYTES) {
    throw new Error(`HWPML \uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC (${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB > 50MB)`);
  }
  // Drop a leading BOM, then rewrite the HTML-only &nbsp; entity as a numeric
  // character reference before the text reaches the XML parser.
  const text = new TextDecoder("utf-8").decode(buffer).replace(/^\uFEFF/, "");
  const normalized = text.replace(/&nbsp;/g, "&#160;");
  const xml = stripDtd(normalized);
  const warnings = [];
  const parser = new DOMParser4({
    // Parser diagnostics are collected as warnings instead of aborting here.
    onError: (_level, msg) => {
      warnings.push({ message: `HWPML XML \uD30C\uC2F1 \uACBD\uACE0: ${msg}`, code: "MALFORMED_XML" });
    }
  });
  const doc = parser.parseFromString(xml, "text/xml");
  if (!doc.documentElement) {
    return { markdown: "", blocks: [], warnings };
  }
  const root = doc.documentElement;
  const metadata = {};
  const docSummary = findChild(root, "DOCSUMMARY");
  if (docSummary) {
    const title = findChild(docSummary, "TITLE");
    const author = findChild(docSummary, "AUTHOR");
    const date = findChild(docSummary, "DATE");
    if (title) metadata.title = textContent(title).trim();
    if (author) metadata.author = textContent(author).trim();
    if (date) {
      // Only create the key for a non-blank date. Assigning `undefined` would
      // still add an own key and defeat the "metadata is empty" check below.
      const createdAt = textContent(date).trim();
      if (createdAt) metadata.createdAt = createdAt;
    }
  }
  const paraShapeMap = buildParaShapeMap(root);
  const body = findChild(root, "BODY");
  if (!body) {
    return { markdown: "", blocks: [], metadata, warnings };
  }
  const blocks = [];
  const pageFilter = options?.pages ? parsePageRange(options.pages, countSections(body)) : null;
  let sectionIdx = 0;
  const children = body.childNodes;
  for (let i = 0; i < children.length; i++) {
    const el = children[i];
    if (el.nodeType !== 1) continue;
    if (localName(el) !== "SECTION") continue;
    // Count every section, including filtered-out ones, so numbering stays stable.
    sectionIdx++;
    if (pageFilter && !pageFilter.has(sectionIdx)) continue;
    parseSection2(el, blocks, paraShapeMap, sectionIdx, warnings);
  }
  const outline = blocks
    .filter((b) => b.type === "heading" && b.text)
    .map((b) => ({ level: b.level ?? 1, text: b.text, pageNumber: b.pageNumber }));
  const markdown = blocksToMarkdown(blocks);
  return {
    markdown,
    blocks,
    metadata: Object.keys(metadata).length > 0 ? metadata : void 0,
    outline: outline.length > 0 ? outline : void 0,
    warnings: warnings.length > 0 ? warnings : void 0
  };
}
3446
/**
 * Build a lookup from PARASHAPE `Id` to heading information.
 *
 * Descends HEAD → MAPPINGTABLE → PARASHAPELIST and records one entry per
 * PARASHAPE child. Shapes whose `HeadingType` is "Outline" get a heading level
 * of `Level + 1`, clamped to 1..6; every other shape gets `headingLevel: null`.
 *
 * @param {object} root - Document element to search.
 * @returns {Map<string, { headingLevel: number|null }>} Empty when any step of the path is missing.
 */
function buildParaShapeMap(root) {
  const shapes = /* @__PURE__ */ new Map();
  let container = root;
  for (const tag of ["HEAD", "MAPPINGTABLE", "PARASHAPELIST"]) {
    container = findChild(container, tag);
    if (!container) return shapes;
  }
  const entries = container.childNodes;
  for (let idx = 0; idx < entries.length; idx += 1) {
    const shape = entries[idx];
    const isParaShape = shape.nodeType === 1 && localName(shape) === "PARASHAPE";
    if (!isParaShape) continue;
    const id = shape.getAttribute("Id") ?? "";
    const headingType = shape.getAttribute("HeadingType") ?? "None";
    const rawLevel = Number.parseInt(shape.getAttribute("Level") ?? "0", 10);
    let headingLevel = null;
    if (headingType === "Outline") {
      // Unparseable or negative levels are treated as level 0.
      const baseLevel = Number.isNaN(rawLevel) ? 0 : Math.max(0, rawLevel);
      headingLevel = Math.min(baseLevel + 1, 6);
    }
    shapes.set(id, { headingLevel });
  }
  return shapes;
}
3470
/**
 * Collect the blocks of one SECTION element by walking its content tree.
 *
 * @param {object} section - SECTION element to walk.
 * @param {Array} blocks - Output list that receives the parsed blocks.
 * @param {Map} paraShapeMap - Paragraph-shape lookup forwarded to the walker.
 * @param {number} sectionNum - Section number forwarded to the walker.
 * @param {Array} warnings - Output list that receives parse warnings.
 * @returns {void}
 */
function parseSection2(section, blocks, paraShapeMap, sectionNum, warnings) {
  // A section always starts outside any header/footer context.
  const insideHeaderFooter = false;
  walkContent(section, blocks, paraShapeMap, sectionNum, warnings, insideHeaderFooter);
}
3473
/**
 * Recursively walk element children, dispatching paragraphs and tables.
 *
 * HEADER and FOOTER subtrees are skipped entirely. P and TABLE elements are
 * handed to their parsers (unless `inHeaderFooter` is set) and are not
 * descended into. Every other element is walked one level deeper.
 *
 * @param {object} node - Node whose `childNodes` are visited.
 * @param {Array} blocks - Output list passed to the paragraph/table parsers.
 * @param {Map} paraShapeMap - Paragraph-shape lookup passed through.
 * @param {number} sectionNum - Section number passed through.
 * @param {Array} warnings - Output list passed to the table parser.
 * @param {boolean} inHeaderFooter - When true, P and TABLE elements are ignored.
 * @param {number} [depth=0] - Current recursion depth; the walk stops beyond MAX_XML_DEPTH2.
 * @returns {void}
 */
function walkContent(node, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth = 0) {
  if (depth > MAX_XML_DEPTH2) return;
  const kids = node.childNodes;
  for (let idx = 0; idx < kids.length; idx += 1) {
    const child = kids[idx];
    if (child.nodeType !== 1) continue;
    switch (localName(child)) {
      case "HEADER":
      case "FOOTER":
        // Running headers/footers contribute nothing to the body content.
        break;
      case "P":
        if (!inHeaderFooter) parseParagraph2(child, blocks, paraShapeMap, sectionNum);
        break;
      case "TABLE":
        if (!inHeaderFooter) parseTable2(child, blocks, paraShapeMap, sectionNum, warnings);
        break;
      default:
        // Containers (PARALIST, SECTION, COLDEF, ...) and unknown tags alike are descended.
        walkContent(child, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth + 1);
    }
  }
}
3502
/**
 * Turn a P element into a heading or paragraph block and append it.
 *
 * The element's `ParaShape` attribute is looked up in `paraShapeMap`; a shape
 * with a non-null `headingLevel` produces a heading, anything else a paragraph.
 * Paragraphs with no extracted text produce nothing.
 *
 * @param {object} el - P element.
 * @param {Array} blocks - Output list that receives the new block.
 * @param {Map<string, { headingLevel: number|null }>} paraShapeMap - Shape lookup.
 * @param {number} sectionNum - Stored as the block's `pageNumber`.
 * @returns {void}
 */
function parseParagraph2(el, blocks, paraShapeMap, sectionNum) {
  const shapeKey = el.getAttribute("ParaShape") ?? "";
  const shape = paraShapeMap.get(shapeKey);
  const text = extractParagraphText(el);
  if (!text) return;
  const level = shape?.headingLevel;
  const block =
    level != null
      ? { type: "heading", text, level, pageNumber: sectionNum }
      : { type: "paragraph", text, pageNumber: sectionNum };
  blocks.push(block);
}
3513
/**
 * Gather the character text of a paragraph into one trimmed string.
 *
 * @param {object} p - Paragraph element handed to `collectCharText`.
 * @returns {string} Collected pieces joined without a separator, then trimmed.
 */
function extractParagraphText(p) {
  const pieces = [];
  collectCharText(p, pieces);
  const joined = pieces.join("");
  return joined.trim();
}
3518
/**
 * Append the text of every CHAR element found under `node` to `parts`.
 *
 * TABLE, PICTURE, SHAPEOBJECT and AUTONUM subtrees are not entered; any other
 * non-CHAR element is searched recursively.
 *
 * @param {object} node - Node whose `childNodes` are scanned.
 * @param {string[]} parts - Output list; only non-empty texts are pushed.
 * @param {number} [depth=0] - Current recursion depth; scanning stops beyond MAX_XML_DEPTH2.
 * @returns {void}
 */
function collectCharText(node, parts, depth = 0) {
  if (depth > MAX_XML_DEPTH2) return;
  const kids = node.childNodes;
  for (let idx = 0; idx < kids.length; idx += 1) {
    const child = kids[idx];
    if (child.nodeType !== 1) continue;
    const tag = localName(child);
    if (tag === "CHAR") {
      const chunk = textContent(child);
      if (chunk) parts.push(chunk);
      continue;
    }
    // Embedded objects and auto-numbering are left out of the paragraph text.
    const isSkipped = tag === "TABLE" || tag === "PICTURE" || tag === "SHAPEOBJECT" || tag === "AUTONUM";
    if (!isSkipped) collectCharText(child, parts, depth + 1);
  }
}
3535
/**
 * Convert a TABLE element into a table block and append it to `blocks`.
 *
 * The grid size comes from the `RowCount`/`ColCount` attributes. Each
 * ROW > CELL is placed at its `RowAddr`/`ColAddr`, and positions covered by
 * its row/column span are filled with empty placeholder cells. Positions that
 * no cell claims are also emitted as empty cells.
 *
 * Malformed input is tolerated: non-positive or unparseable dimensions make
 * the table a no-op, and cells whose address is unparseable, negative or
 * outside the declared grid are ignored instead of throwing.
 *
 * @param {object} el - TABLE element.
 * @param {Array} blocks - Output list that receives `{ type: "table", table, pageNumber }`.
 * @param {Map} paraShapeMap - Not used by this function.
 * @param {number} sectionNum - Stored as the block's `pageNumber`.
 * @param {Array} warnings - Receives a TRUNCATED_TABLE warning when the declared
 *   size exceeds MAX_TABLE_ROWS / MAX_TABLE_COLS (the table is then skipped).
 * @returns {void}
 */
function parseTable2(el, blocks, paraShapeMap, sectionNum, warnings) {
  const rowCount = Number.parseInt(el.getAttribute("RowCount") ?? "0", 10);
  const colCount = Number.parseInt(el.getAttribute("ColCount") ?? "0", 10);
  // `<= 0` rather than `=== 0`: Array(colCount) throws a RangeError for a negative length.
  if (Number.isNaN(rowCount) || Number.isNaN(colCount) || rowCount <= 0 || colCount <= 0) return;
  if (rowCount > MAX_TABLE_ROWS || colCount > MAX_TABLE_COLS) {
    warnings.push({ message: `\uD14C\uC774\uBE14 \uD06C\uAE30 \uCD08\uACFC (${rowCount}x${colCount}) \u2014 \uC2A4\uD0B5`, code: "TRUNCATED_TABLE" });
    return;
  }
  const cells = [];
  const children = el.childNodes;
  for (let i = 0; i < children.length; i++) {
    const rowEl = children[i];
    if (rowEl.nodeType !== 1 || localName(rowEl) !== "ROW") continue;
    const rowCells = rowEl.childNodes;
    for (let j = 0; j < rowCells.length; j++) {
      const cellEl = rowCells[j];
      if (cellEl.nodeType !== 1 || localName(cellEl) !== "CELL") continue;
      const colAddr = Number.parseInt(cellEl.getAttribute("ColAddr") ?? "0", 10);
      const rowAddr = Number.parseInt(cellEl.getAttribute("RowAddr") ?? "0", 10);
      // Spans fall back to 1 when missing/unparseable and are capped at the table limits.
      const colSpan = Math.min(Math.max(1, Number.parseInt(cellEl.getAttribute("ColSpan") ?? "1", 10) || 1), MAX_TABLE_COLS);
      const rowSpan = Math.min(Math.max(1, Number.parseInt(cellEl.getAttribute("RowSpan") ?? "1", 10) || 1), MAX_TABLE_ROWS);
      const cellText = extractCellText(cellEl);
      cells.push({ text: cellText, colSpan, rowSpan, colAddr, rowAddr });
    }
  }
  if (cells.length === 0) return;
  const grid = Array.from({ length: rowCount }, () => Array(colCount).fill(null));
  for (const cell of cells) {
    const r = cell.rowAddr;
    const c = cell.colAddr;
    // A negative address used to index grid[-1] (TypeError) or add a stray "-1" property.
    if (Number.isNaN(r) || Number.isNaN(c) || r < 0 || c < 0 || r >= rowCount || c >= colCount) continue;
    grid[r][c] = cell;
    // Clamp the span to the grid up front so oversized spans cost nothing extra.
    const rowEnd = Math.min(r + cell.rowSpan, rowCount);
    const colEnd = Math.min(c + cell.colSpan, colCount);
    for (let rr = r; rr < rowEnd; rr++) {
      for (let cc = c; cc < colEnd; cc++) {
        if (rr === r && cc === c) continue;
        grid[rr][cc] = { text: "", colSpan: 1, rowSpan: 1 };
      }
    }
  }
  const cellRows = grid.map(
    (row) => row.map((cell) => cell ?? { text: "", colSpan: 1, rowSpan: 1 })
  );
  const table = buildTable(cellRows);
  blocks.push({ type: "table", table, pageNumber: sectionNum });
}
3582
/**
 * Collect the text of a table cell as newline-separated paragraphs.
 *
 * @param {object} cellEl - CELL element handed to `collectCellText`.
 * @returns {string} Non-empty collected parts joined with "\n", then trimmed.
 */
function extractCellText(cellEl) {
  const collected = [];
  collectCellText(cellEl, collected, 0);
  const nonEmpty = collected.filter((part) => Boolean(part));
  return nonEmpty.join("\n").trim();
}
3587
/**
 * Append the paragraph texts found under a cell node to `parts`.
 *
 * P elements contribute their extracted text (when non-empty), a nested TABLE
 * contributes a fixed placeholder string and is not entered, and any other
 * element is searched recursively.
 *
 * @param {object} node - Node whose `childNodes` are scanned.
 * @param {string[]} parts - Output list.
 * @param {number} depth - Current recursion depth; scanning stops beyond 20.
 * @returns {void}
 */
function collectCellText(node, parts, depth) {
  if (depth > 20) return;
  const kids = node.childNodes;
  for (let idx = 0; idx < kids.length; idx += 1) {
    const child = kids[idx];
    if (child.nodeType !== 1) continue;
    switch (localName(child)) {
      case "P": {
        const paragraph = extractParagraphText(child);
        if (paragraph) parts.push(paragraph);
        break;
      }
      case "TABLE":
        parts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
        break;
      default:
        collectCellText(child, parts, depth + 1);
    }
  }
}
3604
/**
 * Return an element's tag name with one leading namespace prefix removed.
 *
 * @param {{ tagName?: string, localName?: string }} el - `tagName` is preferred, `localName` is the fallback.
 * @returns {string} The name after the first "prefix:", or "" when the element has no name.
 */
function localName(el) {
  const qualified = el.tagName || el.localName || "";
  const colonAt = qualified.indexOf(":");
  // A prefix must have at least one character, so a leading ":" is left alone.
  return colonAt > 0 ? qualified.slice(colonAt + 1) : qualified;
}
3607
/**
 * Find the first direct element child with the given prefix-stripped tag name.
 *
 * @param {object} parent - Node whose `childNodes` are searched (direct children only).
 * @param {string} tag - Name compared against `localName(child)`.
 * @returns {object|null} The matching element, or `null` when there is none.
 */
function findChild(parent, tag) {
  const match = Array.from(parent.childNodes).find(
    (candidate) => candidate.nodeType === 1 && localName(candidate) === tag
  );
  return match ?? null;
}
3615
/**
 * Concatenate the text of all text nodes beneath an element, in document order.
 *
 * Only text nodes (nodeType 3) and element nodes (nodeType 1) are considered;
 * every other node type is ignored.
 *
 * @param {object} el - Node whose `childNodes` are read.
 * @returns {string} The combined text, or "" when there is none.
 */
function textContent(el) {
  let combined = "";
  const kids = el.childNodes;
  for (let idx = 0; idx < kids.length; idx += 1) {
    const kid = kids[idx];
    if (kid.nodeType === 3) {
      combined += kid.nodeValue || "";
    } else if (kid.nodeType === 1) {
      combined += textContent(kid);
    }
  }
  return combined;
}
3628
/**
 * Count the direct SECTION element children of a BODY node.
 *
 * @param {object} body - Node whose `childNodes` are inspected (direct children only).
 * @returns {number} Number of children that are elements named SECTION.
 */
function countSections(body) {
  const sections = Array.from(body.childNodes).filter(
    (candidate) => candidate.nodeType === 1 && localName(candidate) === "SECTION"
  );
  return sections.length;
}
3637
+
3239
3638
  // src/form/recognize.ts
3240
3639
  var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
3241
3640
  "\uC131\uBA85",
@@ -3570,7 +3969,7 @@ function fillInlineFields(text, values, filled, matchedLabels) {
3570
3969
 
3571
3970
  // src/form/filler-hwpx.ts
3572
3971
  import JSZip5 from "jszip";
3573
- import { DOMParser as DOMParser4, XMLSerializer } from "@xmldom/xmldom";
3972
+ import { DOMParser as DOMParser5, XMLSerializer } from "@xmldom/xmldom";
3574
3973
  async function fillHwpx(hwpxBuffer, values) {
3575
3974
  const zip = await JSZip5.loadAsync(hwpxBuffer);
3576
3975
  const filled = [];
@@ -3580,7 +3979,7 @@ async function fillHwpx(hwpxBuffer, values) {
3580
3979
  if (sectionFiles.length === 0) {
3581
3980
  throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
3582
3981
  }
3583
- const xmlParser = new DOMParser4();
3982
+ const xmlParser = new DOMParser5();
3584
3983
  const xmlSerializer = new XMLSerializer();
3585
3984
  for (const sectionPath of sectionFiles) {
3586
3985
  const zipEntry = zip.file(sectionPath);
@@ -3612,10 +4011,10 @@ async function fillHwpx(hwpxBuffer, values) {
3612
4011
  const trEl = rows[rowIdx];
3613
4012
  const cells = findDirectChildren(trEl, "tc");
3614
4013
  for (let colIdx = 0; colIdx < cells.length - 1; colIdx++) {
3615
- const labelText = extractCellText(cells[colIdx]);
4014
+ const labelText = extractCellText2(cells[colIdx]);
3616
4015
  if (!isLabelCell(labelText)) continue;
3617
4016
  const valueCell = cells[colIdx + 1];
3618
- const valueText = extractCellText(valueCell);
4017
+ const valueText = extractCellText2(valueCell);
3619
4018
  if (isKeywordLabel(valueText)) continue;
3620
4019
  const normalizedCellLabel = normalizeLabel(labelText);
3621
4020
  if (!normalizedCellLabel) continue;
@@ -3640,14 +4039,14 @@ async function fillHwpx(hwpxBuffer, values) {
3640
4039
  if (rows.length >= 2) {
3641
4040
  const headerCells = findDirectChildren(rows[0], "tc");
3642
4041
  const allLabels = headerCells.every((cell) => {
3643
- const t = extractCellText(cell).trim();
4042
+ const t = extractCellText2(cell).trim();
3644
4043
  return t.length > 0 && t.length <= 20 && isLabelCell(t);
3645
4044
  });
3646
4045
  if (allLabels) {
3647
4046
  for (let rowIdx = 1; rowIdx < rows.length; rowIdx++) {
3648
4047
  const dataCells = findDirectChildren(rows[rowIdx], "tc");
3649
4048
  for (let colIdx = 0; colIdx < Math.min(headerCells.length, dataCells.length); colIdx++) {
3650
- const headerLabel = normalizeLabel(extractCellText(headerCells[colIdx]));
4049
+ const headerLabel = normalizeLabel(extractCellText2(headerCells[colIdx]));
3651
4050
  const matchKey = findMatchingKey(headerLabel, normalizedValues);
3652
4051
  if (matchKey === void 0) continue;
3653
4052
  if (matchedLabels.has(matchKey)) continue;
@@ -3655,7 +4054,7 @@ async function fillHwpx(hwpxBuffer, values) {
3655
4054
  replaceCellText(dataCells[colIdx], newValue);
3656
4055
  matchedLabels.add(matchKey);
3657
4056
  filled.push({
3658
- label: extractCellText(headerCells[colIdx]).trim(),
4057
+ label: extractCellText2(headerCells[colIdx]).trim(),
3659
4058
  value: newValue,
3660
4059
  row: rowIdx,
3661
4060
  col: colIdx
@@ -3697,7 +4096,7 @@ async function fillHwpx(hwpxBuffer, values) {
3697
4096
  const buffer = await zip.generateAsync({ type: "arraybuffer" });
3698
4097
  return { buffer, filled, unmatched };
3699
4098
  }
3700
/**
 * Return an element's tag name with one leading namespace prefix removed.
 *
 * @param {{ tagName?: string, localName?: string }} el - `tagName` is preferred, `localName` is the fallback.
 * @returns {string} The name after the first "prefix:", or "" when the element has no name.
 */
function localName2(el) {
  const qualified = el.tagName || el.localName || "";
  const colonAt = qualified.indexOf(":");
  // A prefix must have at least one character, so a leading ":" is left alone.
  return colonAt > 0 ? qualified.slice(colonAt + 1) : qualified;
}
3703
4102
  function findAllElements(node, tagLocalName) {
@@ -3708,7 +4107,7 @@ function findAllElements(node, tagLocalName) {
3708
4107
  for (let i = 0; i < children.length; i++) {
3709
4108
  const child = children[i];
3710
4109
  if (child.nodeType !== 1) continue;
3711
- if (localName(child) === tagLocalName) result.push(child);
4110
+ if (localName2(child) === tagLocalName) result.push(child);
3712
4111
  walk(child);
3713
4112
  }
3714
4113
  };
@@ -3721,7 +4120,7 @@ function findDirectChildren(parent, tagLocalName) {
3721
4120
  if (!children) return result;
3722
4121
  for (let i = 0; i < children.length; i++) {
3723
4122
  const child = children[i];
3724
- if (child.nodeType === 1 && localName(child) === tagLocalName) {
4123
+ if (child.nodeType === 1 && localName2(child) === tagLocalName) {
3725
4124
  result.push(child);
3726
4125
  }
3727
4126
  }
@@ -3730,12 +4129,12 @@ function findDirectChildren(parent, tagLocalName) {
3730
4129
  function isInsideTable(el) {
3731
4130
  let parent = el.parentNode;
3732
4131
  while (parent) {
3733
- if (parent.nodeType === 1 && localName(parent) === "tbl") return true;
4132
+ if (parent.nodeType === 1 && localName2(parent) === "tbl") return true;
3734
4133
  parent = parent.parentNode;
3735
4134
  }
3736
4135
  return false;
3737
4136
  }
3738
- function extractCellText(tcEl) {
4137
+ function extractCellText2(tcEl) {
3739
4138
  const parts = [];
3740
4139
  const walk = (node) => {
3741
4140
  const children = node.childNodes;
@@ -3745,7 +4144,7 @@ function extractCellText(tcEl) {
3745
4144
  if (child.nodeType === 3) {
3746
4145
  parts.push(child.textContent || "");
3747
4146
  } else if (child.nodeType === 1) {
3748
- const tag = localName(child);
4147
+ const tag = localName2(child);
3749
4148
  if (tag === "t") walk(child);
3750
4149
  else if (tag === "run" || tag === "r" || tag === "p" || tag === "subList") walk(child);
3751
4150
  else if (tag === "tab") parts.push(" ");
@@ -4444,6 +4843,7 @@ function diffTableCells(a, b) {
4444
4843
  // src/index.ts
4445
4844
  async function parse(input, options) {
4446
4845
  let buffer;
4846
+ const opts = typeof input === "string" && !options?.filePath ? { ...options, filePath: input } : options;
4447
4847
  if (typeof input === "string") {
4448
4848
  try {
4449
4849
  const buf = await readFile(input);
@@ -4464,14 +4864,16 @@ async function parse(input, options) {
4464
4864
  switch (format) {
4465
4865
  case "hwpx": {
4466
4866
  const zipFormat = await detectZipFormat(buffer);
4467
- if (zipFormat === "xlsx") return parseXlsx(buffer, options);
4468
- if (zipFormat === "docx") return parseDocx(buffer, options);
4469
- return parseHwpx(buffer, options);
4867
+ if (zipFormat === "xlsx") return parseXlsx(buffer, opts);
4868
+ if (zipFormat === "docx") return parseDocx(buffer, opts);
4869
+ return parseHwpx(buffer, opts);
4470
4870
  }
4471
4871
  case "hwp":
4472
- return parseHwp(buffer, options);
4872
+ return parseHwp(buffer, opts);
4873
+ case "hwpml":
4874
+ return parseHwpml(buffer, opts);
4473
4875
  case "pdf":
4474
- return parsePdf(buffer, options);
4876
+ return parsePdf(buffer, opts);
4475
4877
  default:
4476
4878
  return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
4477
4879
  }
@@ -4495,7 +4897,7 @@ async function parseHwp(buffer, options) {
4495
4897
  async function parsePdf(buffer, options) {
4496
4898
  let parsePdfDocument;
4497
4899
  try {
4498
- const mod = await import("./parser-43IAQ5KE.js");
4900
+ const mod = await import("./parser-XRUZEFZT.js");
4499
4901
  parsePdfDocument = mod.parsePdfDocument;
4500
4902
  } catch {
4501
4903
  return {
@@ -4529,6 +4931,14 @@ async function parseDocx(buffer, options) {
4529
4931
  return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
4530
4932
  }
4531
4933
  }
4934
/**
 * Parse an HWPML buffer and wrap the outcome in a success/failure result object.
 *
 * Any exception raised while parsing is converted into a failure result rather
 * than propagated.
 *
 * @param {*} buffer - Input forwarded to `parseHwpmlDocument`.
 * @param {object} [options] - Options forwarded to `parseHwpmlDocument`.
 * @returns {Promise<object>} On success: `{ success: true, fileType: "hwpml", markdown, blocks,
 *   metadata, outline, warnings }`. On failure: `{ success: false, fileType: "hwpml", error, code }`.
 */
async function parseHwpml(buffer, options) {
  try {
    const parsed = parseHwpmlDocument(buffer, options);
    return {
      success: true,
      fileType: "hwpml",
      markdown: parsed.markdown,
      blocks: parsed.blocks,
      metadata: parsed.metadata,
      outline: parsed.outline,
      warnings: parsed.warnings
    };
  } catch (err) {
    // Non-Error throwables get the generic failure message.
    const error = err instanceof Error ? err.message : "HWPML \uD30C\uC2F1 \uC2E4\uD328";
    return { success: false, fileType: "hwpml", error, code: classifyError(err) };
  }
}
4532
4942
  async function fillForm(input, values, outputFormat = "markdown") {
4533
4943
  let buffer;
4534
4944
  if (typeof input === "string") {
@@ -4588,6 +4998,7 @@ export {
4588
4998
  parse,
4589
4999
  parseDocx,
4590
5000
  parseHwp,
5001
+ parseHwpml,
4591
5002
  parseHwpx,
4592
5003
  parsePdf,
4593
5004
  parseXlsx