kordoc 2.2.5 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. package/README.md +16 -4
  2. package/dist/{chunk-UU2O6D3R.js → chunk-JFTFC2BB.js} +2 -2
  3. package/dist/{chunk-JH5XLWJQ.js.map → chunk-JFTFC2BB.js.map} +1 -1
  4. package/dist/{chunk-5Y2Q3BRW.js → chunk-M3E3C5GS.js} +8 -1
  5. package/dist/chunk-M3E3C5GS.js.map +1 -0
  6. package/dist/{chunk-RQWICKON.js → chunk-OEJJPCMM.js} +369 -73
  7. package/dist/chunk-OEJJPCMM.js.map +1 -0
  8. package/dist/{chunk-JH5XLWJQ.js → chunk-Z7UPTVMX.js} +2 -2
  9. package/dist/{chunk-UU2O6D3R.js.map → chunk-Z7UPTVMX.js.map} +1 -1
  10. package/dist/{chunk-OJ4QR33V.cjs → chunk-ZNJPRRIA.cjs} +2 -2
  11. package/dist/{chunk-OJ4QR33V.cjs.map → chunk-ZNJPRRIA.cjs.map} +1 -1
  12. package/dist/cli.js +7 -4
  13. package/dist/cli.js.map +1 -1
  14. package/dist/{detect-GYK3HKD5.js → detect-I7YIS4Q6.js} +4 -2
  15. package/dist/index.cjs +463 -160
  16. package/dist/index.cjs.map +1 -1
  17. package/dist/index.d.cts +4 -2
  18. package/dist/index.d.ts +4 -2
  19. package/dist/index.js +387 -84
  20. package/dist/index.js.map +1 -1
  21. package/dist/mcp.js +5 -5
  22. package/dist/{parser-OIRWPKIQ.js → parser-25LF2S2J.js} +45 -42
  23. package/dist/{parser-OIRWPKIQ.js.map → parser-25LF2S2J.js.map} +1 -1
  24. package/dist/{parser-PXD73E4H.js → parser-4LKJXBPP.js} +45 -42
  25. package/dist/{parser-PXD73E4H.js.map → parser-4LKJXBPP.js.map} +1 -1
  26. package/dist/{parser-CYBX5MP4.cjs → parser-KBQZB3QY.cjs} +61 -58
  27. package/dist/{parser-CYBX5MP4.cjs.map → parser-KBQZB3QY.cjs.map} +1 -1
  28. package/dist/{watch-NSBABJ4A.js → watch-GXRBLW3Y.js} +4 -4
  29. package/package.json +2 -2
  30. package/dist/chunk-5Y2Q3BRW.js.map +0 -1
  31. package/dist/chunk-RQWICKON.js.map +0 -1
  32. /package/dist/{detect-GYK3HKD5.js.map → detect-I7YIS4Q6.js.map} +0 -0
  33. /package/dist/{watch-NSBABJ4A.js.map → watch-GXRBLW3Y.js.map} +0 -0
@@ -2,7 +2,7 @@
2
2
  import {
3
3
  detectFormat,
4
4
  detectZipFormat
5
- } from "./chunk-5Y2Q3BRW.js";
5
+ } from "./chunk-M3E3C5GS.js";
6
6
  import {
7
7
  HEADING_RATIO_H1,
8
8
  HEADING_RATIO_H2,
@@ -20,7 +20,7 @@ import {
20
20
  sanitizeHref,
21
21
  stripDtd,
22
22
  toArrayBuffer
23
- } from "./chunk-JH5XLWJQ.js";
23
+ } from "./chunk-Z7UPTVMX.js";
24
24
  import {
25
25
  parsePageRange
26
26
  } from "./chunk-MOL7MDBG.js";
@@ -144,6 +144,7 @@ async function parseHwpxDocument(buffer, options) {
144
144
  const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
145
145
  const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
146
146
  const blocks = [];
147
+ const nestedTableCounter = { count: 0 };
147
148
  let parsedSections = 0;
148
149
  for (let si = 0; si < sectionPaths.length; si++) {
149
150
  if (pageFilter && !pageFilter.has(si + 1)) continue;
@@ -153,7 +154,7 @@ async function parseHwpxDocument(buffer, options) {
153
154
  const xml = await file.async("text");
154
155
  decompressed.total += xml.length * 2;
155
156
  if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
156
- blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
157
+ blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1, nestedTableCounter));
157
158
  parsedSections++;
158
159
  options?.onProgress?.(parsedSections, totalTarget);
159
160
  } catch (secErr) {
@@ -214,8 +215,20 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
214
215
  ref
215
216
  // 절대 경로일 수도 있음
216
217
  ];
218
+ let resolvedPath = null;
219
+ if (!ref.includes(".")) {
220
+ const prefixes = [`BinData/${ref}`, `Contents/BinData/${ref}`];
221
+ for (const prefix of prefixes) {
222
+ const match = zip.file(new RegExp(`^${prefix.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\.[a-zA-Z0-9]+$`));
223
+ if (match.length > 0) {
224
+ resolvedPath = match[0].name;
225
+ break;
226
+ }
227
+ }
228
+ }
217
229
  let found = false;
218
- for (const path of candidates) {
230
+ const allCandidates = resolvedPath ? [resolvedPath, ...candidates] : candidates;
231
+ for (const path of allCandidates) {
219
232
  if (isPathTraversal(path)) continue;
220
233
  const file = zip.file(path);
221
234
  if (!file) continue;
@@ -223,7 +236,8 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
223
236
  const data = await file.async("uint8array");
224
237
  decompressed.total += data.length;
225
238
  if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
226
- const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
239
+ const actualPath = path;
240
+ const ext = actualPath.includes(".") ? actualPath.split(".").pop() || "png" : "png";
227
241
  const mimeType = imageExtToMime(ext);
228
242
  imageIndex++;
229
243
  const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
@@ -309,6 +323,7 @@ function extractFromBrokenZip(buffer) {
309
323
  let totalDecompressed = 0;
310
324
  let entryCount = 0;
311
325
  let sectionNum = 0;
326
+ const nestedTableCounter = { count: 0 };
312
327
  while (pos < data.length - 30) {
313
328
  if (data[pos] !== 80 || data[pos + 1] !== 75 || data[pos + 2] !== 3 || data[pos + 3] !== 4) {
314
329
  pos++;
@@ -355,7 +370,7 @@ function extractFromBrokenZip(buffer) {
355
370
  totalDecompressed += content.length * 2;
356
371
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
357
372
  sectionNum++;
358
- blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum));
373
+ blocks.push(...parseSectionXml(content, void 0, warnings, sectionNum, nestedTableCounter));
359
374
  } catch {
360
375
  continue;
361
376
  }
@@ -440,12 +455,40 @@ function detectHwpxHeadings(blocks, styleMap) {
440
455
  }
441
456
  }
442
457
  }
443
- function parseSectionXml(xml, styleMap, warnings, sectionNum) {
458
+ function makeNestedTableMarker(counter, rows) {
459
+ counter.count++;
460
+ const firstRow = rows[0] ?? [];
461
+ const hint = firstRow.map((c) => c.text.trim().replace(/\n/g, " ")).filter(Boolean).join(" | ");
462
+ const hintChars = [...hint];
463
+ const truncated = hintChars.length > 60 ? hintChars.slice(0, 60).join("") + "\u2026" : hint;
464
+ return truncated ? `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}: ${truncated}]` : `[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}]`;
465
+ }
466
+ function handleNestedTable(newTable, tableStack, blocks, ctx) {
467
+ const parentTable = tableStack.pop();
468
+ let nestedCols = 0;
469
+ for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
470
+ if (newTable.rows.length >= 3 && nestedCols >= 2) {
471
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
472
+ if (parentTable.cell) {
473
+ const marker = ctx.counter ? makeNestedTableMarker(ctx.counter, newTable.rows) : "[\uC911\uCCA9 \uD14C\uC774\uBE14]";
474
+ parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + marker;
475
+ }
476
+ } else {
477
+ const nestedText = convertTableToText(newTable.rows);
478
+ if (parentTable.cell) {
479
+ const marker = ctx.counter ? makeNestedTableMarker(ctx.counter, newTable.rows) : "[\uC911\uCCA9 \uD14C\uC774\uBE14]";
480
+ parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + marker + "\n" + nestedText;
481
+ }
482
+ }
483
+ return parentTable;
484
+ }
485
+ function parseSectionXml(xml, styleMap, warnings, sectionNum, counter) {
444
486
  const parser = createXmlParser(warnings);
445
487
  const doc = parser.parseFromString(stripDtd(xml), "text/xml");
446
488
  if (!doc.documentElement) return [];
447
489
  const blocks = [];
448
- walkSection(doc.documentElement, blocks, null, [], styleMap, warnings, sectionNum);
490
+ const ctx = { styleMap, warnings, sectionNum, counter };
491
+ walkSection(doc.documentElement, blocks, null, [], ctx);
449
492
  return blocks;
450
493
  }
451
494
  function extractImageRef(el) {
@@ -466,7 +509,7 @@ function extractImageRef(el) {
466
509
  if (directRef) return directRef;
467
510
  return null;
468
511
  }
469
- function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth = 0) {
512
+ function walkSection(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
470
513
  if (depth > MAX_XML_DEPTH) return;
471
514
  const children = node.childNodes;
472
515
  if (!children) return;
@@ -479,23 +522,12 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
479
522
  case "tbl": {
480
523
  if (tableCtx) tableStack.push(tableCtx);
481
524
  const newTable = { rows: [], currentRow: [], cell: null };
482
- walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum, depth + 1);
525
+ walkSection(el, blocks, newTable, tableStack, ctx, depth + 1);
483
526
  if (newTable.rows.length > 0) {
484
527
  if (tableStack.length > 0) {
485
- const parentTable = tableStack.pop();
486
- let nestedCols = 0;
487
- for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
488
- if (newTable.rows.length >= 3 && nestedCols >= 2) {
489
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
490
- } else {
491
- const nestedText = convertTableToText(newTable.rows);
492
- if (parentTable.cell) {
493
- parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
494
- }
495
- }
496
- tableCtx = parentTable;
528
+ tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
497
529
  } else {
498
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
530
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
499
531
  tableCtx = null;
500
532
  }
501
533
  } else {
@@ -506,7 +538,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
506
538
  case "tr":
507
539
  if (tableCtx) {
508
540
  tableCtx.currentRow = [];
509
- walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
541
+ walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
510
542
  if (tableCtx.currentRow.length > 0) tableCtx.rows.push(tableCtx.currentRow);
511
543
  tableCtx.currentRow = [];
512
544
  }
@@ -514,7 +546,7 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
514
546
  case "tc":
515
547
  if (tableCtx) {
516
548
  tableCtx.cell = { text: "", colSpan: 1, rowSpan: 1 };
517
- walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
549
+ walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
518
550
  if (tableCtx.cell) {
519
551
  tableCtx.currentRow.push(tableCtx.cell);
520
552
  tableCtx.cell = null;
@@ -540,19 +572,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
540
572
  }
541
573
  break;
542
574
  case "p": {
543
- const { text, href, footnote, style } = extractParagraphInfo(el, styleMap);
575
+ const { text, href, footnote, style } = extractParagraphInfo(el, ctx.styleMap);
544
576
  if (text) {
545
577
  if (tableCtx?.cell) {
546
578
  tableCtx.cell.text += (tableCtx.cell.text ? "\n" : "") + text;
547
579
  } else if (!tableCtx) {
548
- const block = { type: "paragraph", text, pageNumber: sectionNum };
580
+ const block = { type: "paragraph", text, pageNumber: ctx.sectionNum };
549
581
  if (style) block.style = style;
550
582
  if (href) block.href = href;
551
583
  if (footnote) block.footnoteText = footnote;
552
584
  blocks.push(block);
553
585
  }
554
586
  }
555
- tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
587
+ tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
556
588
  break;
557
589
  }
558
590
  // 이미지/그림 — 경로 추출 또는 경고
@@ -561,19 +593,19 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
561
593
  case "drawingObject": {
562
594
  const imgRef = extractImageRef(el);
563
595
  if (imgRef) {
564
- blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
565
- } else if (warnings && sectionNum) {
566
- warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
596
+ blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
597
+ } else if (ctx.warnings && ctx.sectionNum) {
598
+ ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
567
599
  }
568
600
  break;
569
601
  }
570
602
  default:
571
- walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
603
+ walkSection(el, blocks, tableCtx, tableStack, ctx, depth + 1);
572
604
  break;
573
605
  }
574
606
  }
575
607
  }
576
- function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth = 0) {
608
+ function walkParagraphChildren(node, blocks, tableCtx, tableStack, ctx, depth = 0) {
577
609
  if (depth > MAX_XML_DEPTH) return tableCtx;
578
610
  const children = node.childNodes;
579
611
  if (!children) return tableCtx;
@@ -589,23 +621,12 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
589
621
  if (localTag === "tbl") {
590
622
  if (tableCtx) tableStack.push(tableCtx);
591
623
  const newTable = { rows: [], currentRow: [], cell: null };
592
- walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum, d + 1);
624
+ walkSection(el, blocks, newTable, tableStack, ctx, d + 1);
593
625
  if (newTable.rows.length > 0) {
594
626
  if (tableStack.length > 0) {
595
- const parentTable = tableStack.pop();
596
- let nestedCols = 0;
597
- for (const r of newTable.rows) if (r.length > nestedCols) nestedCols = r.length;
598
- if (newTable.rows.length >= 3 && nestedCols >= 2) {
599
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
600
- } else {
601
- const nestedText = convertTableToText(newTable.rows);
602
- if (parentTable.cell) {
603
- parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
604
- }
605
- }
606
- tableCtx = parentTable;
627
+ tableCtx = handleNestedTable(newTable, tableStack, blocks, ctx);
607
628
  } else {
608
- blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
629
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: ctx.sectionNum });
609
630
  tableCtx = null;
610
631
  }
611
632
  } else {
@@ -614,21 +635,21 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
614
635
  } else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
615
636
  const drawTextChild = findDescendant(el, "drawText");
616
637
  if (drawTextChild) {
617
- extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
638
+ extractDrawTextBlocks(drawTextChild, blocks, ctx.styleMap, ctx.sectionNum);
618
639
  } else {
619
640
  const imgRef = extractImageRef(el);
620
641
  if (imgRef) {
621
- blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
622
- } else if (warnings && sectionNum) {
623
- warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
642
+ blocks.push({ type: "image", text: imgRef, pageNumber: ctx.sectionNum });
643
+ } else if (ctx.warnings && ctx.sectionNum) {
644
+ ctx.warnings.push({ page: ctx.sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
624
645
  }
625
646
  }
626
647
  } else if (localTag === "drawText") {
627
- extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
648
+ extractDrawTextBlocks(el, blocks, ctx.styleMap, ctx.sectionNum);
628
649
  } else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
629
650
  walkChildren(el, d + 1);
630
651
  } else if (localTag === "run") {
631
- tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
652
+ tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, ctx, depth + 1);
632
653
  }
633
654
  }
634
655
  };
@@ -1901,6 +1922,7 @@ function parseHwp5Document(buffer, options) {
1901
1922
  const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
1902
1923
  const totalTarget = pageFilter ? pageFilter.size : sections.length;
1903
1924
  const blocks = [];
1925
+ const nestedTableCounter = { count: 0 };
1904
1926
  let totalDecompressed = 0;
1905
1927
  let parsedSections = 0;
1906
1928
  for (let si = 0; si < sections.length; si++) {
@@ -1911,7 +1933,7 @@ function parseHwp5Document(buffer, options) {
1911
1933
  totalDecompressed += data.length;
1912
1934
  if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
1913
1935
  const records = readRecords(data);
1914
- const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
1936
+ const sectionBlocks = parseSection(records, docInfo, warnings, si + 1, nestedTableCounter);
1915
1937
  blocks.push(...sectionBlocks);
1916
1938
  parsedSections++;
1917
1939
  options?.onProgress?.(parsedSections, totalTarget);
@@ -2245,13 +2267,13 @@ function extractHwp5ImagesLenient(lcfb, blocks, compressed, warnings) {
2245
2267
  }
2246
2268
  return images;
2247
2269
  }
2248
- function parseSection(records, docInfo, warnings, sectionNum) {
2270
+ function parseSection(records, docInfo, warnings, sectionNum, counter) {
2249
2271
  const blocks = [];
2250
2272
  let i = 0;
2251
2273
  while (i < records.length) {
2252
2274
  const rec = records[i];
2253
2275
  if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
2254
- const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
2276
+ const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i, counter);
2255
2277
  if (paragraph) {
2256
2278
  const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
2257
2279
  if (docInfo && charShapeIds.length > 0) {
@@ -2274,7 +2296,7 @@ function parseSection(records, docInfo, warnings, sectionNum) {
2274
2296
  if (rec.tagId === TAG_CTRL_HEADER && rec.level <= 1 && rec.data.length >= 4) {
2275
2297
  const ctrlId = rec.data.subarray(0, 4).toString("ascii");
2276
2298
  if (ctrlId === " lbt" || ctrlId === "tbl ") {
2277
- const { table, nextIdx } = parseTableBlock(records, i);
2299
+ const { table, nextIdx } = parseTableBlock(records, i, counter);
2278
2300
  if (table) blocks.push({ type: "table", table, pageNumber: sectionNum });
2279
2301
  i = nextIdx;
2280
2302
  continue;
@@ -2379,7 +2401,7 @@ function resolveCharStyle(charShapeIds, docInfo) {
2379
2401
  if (cs.attrFlags & 2) style.bold = true;
2380
2402
  return style.fontSize || style.bold || style.italic ? style : void 0;
2381
2403
  }
2382
- function parseParagraphWithTables(records, startIdx) {
2404
+ function parseParagraphWithTables(records, startIdx, counter) {
2383
2405
  const startLevel = records[startIdx].level;
2384
2406
  let text = "";
2385
2407
  const tables = [];
@@ -2401,7 +2423,7 @@ function parseParagraphWithTables(records, startIdx) {
2401
2423
  if (rec.tagId === TAG_CTRL_HEADER && rec.data.length >= 4) {
2402
2424
  const ctrlId = rec.data.subarray(0, 4).toString("ascii");
2403
2425
  if (ctrlId === " lbt" || ctrlId === "tbl ") {
2404
- const { table, nextIdx } = parseTableBlock(records, i);
2426
+ const { table, nextIdx } = parseTableBlock(records, i, counter);
2405
2427
  if (table) tables.push(table);
2406
2428
  i = nextIdx;
2407
2429
  continue;
@@ -2412,7 +2434,7 @@ function parseParagraphWithTables(records, startIdx) {
2412
2434
  const trimmed = text.trim();
2413
2435
  return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
2414
2436
  }
2415
- function parseTableBlock(records, startIdx) {
2437
+ function parseTableBlock(records, startIdx, counter) {
2416
2438
  const tableLevel = records[startIdx].level;
2417
2439
  let i = startIdx + 1;
2418
2440
  let rows = 0, cols = 0;
@@ -2426,7 +2448,7 @@ function parseTableBlock(records, startIdx) {
2426
2448
  cols = Math.min(rec.data.readUInt16LE(6), MAX_COLS);
2427
2449
  }
2428
2450
  if (rec.tagId === TAG_LIST_HEADER) {
2429
- const { cell, nextIdx } = parseCellBlock(records, i, tableLevel);
2451
+ const { cell, nextIdx } = parseCellBlock(records, i, tableLevel, counter);
2430
2452
  if (cell) cells.push(cell);
2431
2453
  i = nextIdx;
2432
2454
  continue;
@@ -2447,7 +2469,7 @@ function parseTableBlock(records, startIdx) {
2447
2469
  const cellRows = arrangeCells(rows, cols, cells);
2448
2470
  return { table: buildTable(cellRows), nextIdx: i };
2449
2471
  }
2450
- function parseCellBlock(records, startIdx, tableLevel) {
2472
+ function parseCellBlock(records, startIdx, tableLevel, counter) {
2451
2473
  const rec = records[startIdx];
2452
2474
  const cellLevel = rec.level;
2453
2475
  const texts = [];
@@ -2472,6 +2494,17 @@ function parseCellBlock(records, startIdx, tableLevel) {
2472
2494
  const t = extractText(r.data).trim();
2473
2495
  if (t) texts.push(t);
2474
2496
  }
2497
+ if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
2498
+ const ctrlId = r.data.subarray(0, 4).toString("ascii");
2499
+ if (ctrlId === " lbt" || ctrlId === "tbl ") {
2500
+ if (counter) {
2501
+ counter.count++;
2502
+ texts.push(`[\uC911\uCCA9 \uD14C\uC774\uBE14 #${counter.count}]`);
2503
+ } else {
2504
+ texts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
2505
+ }
2506
+ }
2507
+ }
2475
2508
  i++;
2476
2509
  }
2477
2510
  return { cell: { text: texts.join("\n"), colSpan, rowSpan, colAddr, rowAddr }, nextIdx: i };
@@ -3829,21 +3862,21 @@ async function parseXlsxDocument(buffer, options) {
3829
3862
  import JSZip5 from "jszip";
3830
3863
  import { DOMParser as DOMParser4 } from "@xmldom/xmldom";
3831
3864
  var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
3832
- function getChildElements(parent, localName2) {
3865
+ function getChildElements(parent, localName3) {
3833
3866
  const result = [];
3834
3867
  const children = parent.childNodes;
3835
3868
  for (let i = 0; i < children.length; i++) {
3836
3869
  const node = children[i];
3837
3870
  if (node.nodeType === 1) {
3838
3871
  const el = node;
3839
- if (el.localName === localName2 || el.tagName?.endsWith(`:${localName2}`)) {
3872
+ if (el.localName === localName3 || el.tagName?.endsWith(`:${localName3}`)) {
3840
3873
  result.push(el);
3841
3874
  }
3842
3875
  }
3843
3876
  }
3844
3877
  return result;
3845
3878
  }
3846
- function findElements(parent, localName2) {
3879
+ function findElements(parent, localName3) {
3847
3880
  const result = [];
3848
3881
  const walk = (node) => {
3849
3882
  const children = node.childNodes;
@@ -3851,7 +3884,7 @@ function findElements(parent, localName2) {
3851
3884
  const child = children[i];
3852
3885
  if (child.nodeType === 1) {
3853
3886
  const el = child;
3854
- if (el.localName === localName2 || el.tagName?.endsWith(`:${localName2}`)) {
3887
+ if (el.localName === localName3 || el.tagName?.endsWith(`:${localName3}`)) {
3855
3888
  result.push(el);
3856
3889
  }
3857
3890
  walk(el);
@@ -3861,11 +3894,11 @@ function findElements(parent, localName2) {
3861
3894
  walk(parent);
3862
3895
  return result;
3863
3896
  }
3864
- function getAttr(el, localName2) {
3897
+ function getAttr(el, localName3) {
3865
3898
  const attrs = el.attributes;
3866
3899
  for (let i = 0; i < attrs.length; i++) {
3867
3900
  const attr = attrs[i];
3868
- if (attr.localName === localName2 || attr.name === localName2) return attr.value;
3901
+ if (attr.localName === localName3 || attr.name === localName3) return attr.value;
3869
3902
  }
3870
3903
  return null;
3871
3904
  }
@@ -4212,11 +4245,11 @@ async function parseDocxDocument(buffer, options) {
4212
4245
  const node = children[i];
4213
4246
  if (node.nodeType !== 1) continue;
4214
4247
  const el = node;
4215
- const localName2 = el.localName ?? el.tagName?.split(":").pop();
4216
- if (localName2 === "p") {
4248
+ const localName3 = el.localName ?? el.tagName?.split(":").pop();
4249
+ if (localName3 === "p") {
4217
4250
  const block = parseParagraph(el, styles, numbering, footnotes, rels);
4218
4251
  if (block) blocks.push(block);
4219
- } else if (localName2 === "tbl") {
4252
+ } else if (localName3 === "tbl") {
4220
4253
  const block = parseTable(el, styles, numbering, footnotes, rels);
4221
4254
  if (block) blocks.push(block);
4222
4255
  }
@@ -4254,6 +4287,259 @@ async function parseDocxDocument(buffer, options) {
4254
4287
  };
4255
4288
  }
4256
4289
 
4290
+ // src/hwpml/parser.ts
4291
+ import { DOMParser as DOMParser5 } from "@xmldom/xmldom";
4292
+ var MAX_XML_DEPTH2 = 200;
4293
+ var MAX_TABLE_ROWS = 5e3;
4294
+ var MAX_TABLE_COLS = 500;
4295
+ var MAX_HWPML_BYTES = 50 * 1024 * 1024;
4296
+ function parseHwpmlDocument(buffer, options) {
4297
+ if (buffer.byteLength > MAX_HWPML_BYTES) {
4298
+ throw new Error(`HWPML \uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC (${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB > 50MB)`);
4299
+ }
4300
+ const text = new TextDecoder("utf-8").decode(buffer).replace(/^\uFEFF/, "");
4301
+ const normalized = text.replace(/&nbsp;/g, "&#160;");
4302
+ const xml = stripDtd(normalized);
4303
+ const warnings = [];
4304
+ const parser = new DOMParser5({
4305
+ onError: (_level, msg) => {
4306
+ warnings.push({ message: `HWPML XML \uD30C\uC2F1 \uACBD\uACE0: ${msg}`, code: "MALFORMED_XML" });
4307
+ }
4308
+ });
4309
+ const doc = parser.parseFromString(xml, "text/xml");
4310
+ if (!doc.documentElement) {
4311
+ return { markdown: "", blocks: [], warnings };
4312
+ }
4313
+ const root = doc.documentElement;
4314
+ const metadata = {};
4315
+ const docSummary = findChild(root, "DOCSUMMARY");
4316
+ if (docSummary) {
4317
+ const title = findChild(docSummary, "TITLE");
4318
+ const author = findChild(docSummary, "AUTHOR");
4319
+ const date = findChild(docSummary, "DATE");
4320
+ if (title) metadata.title = textContent(title).trim();
4321
+ if (author) metadata.author = textContent(author).trim();
4322
+ if (date) metadata.createdAt = textContent(date).trim() || void 0;
4323
+ }
4324
+ const paraShapeMap = buildParaShapeMap(root);
4325
+ const body = findChild(root, "BODY");
4326
+ if (!body) {
4327
+ return { markdown: "", blocks: [], metadata, warnings };
4328
+ }
4329
+ const blocks = [];
4330
+ const pageFilter = options?.pages ? parsePageRange(options.pages, countSections(body)) : null;
4331
+ let sectionIdx = 0;
4332
+ const children = body.childNodes;
4333
+ for (let i = 0; i < children.length; i++) {
4334
+ const el = children[i];
4335
+ if (el.nodeType !== 1) continue;
4336
+ if (localName2(el) !== "SECTION") continue;
4337
+ sectionIdx++;
4338
+ if (pageFilter && !pageFilter.has(sectionIdx)) continue;
4339
+ parseSection2(el, blocks, paraShapeMap, sectionIdx, warnings);
4340
+ }
4341
+ const outline = blocks.filter((b) => b.type === "heading" && b.text).map((b) => ({ level: b.level ?? 1, text: b.text, pageNumber: b.pageNumber }));
4342
+ const markdown = blocksToMarkdown(blocks);
4343
+ return {
4344
+ markdown,
4345
+ blocks,
4346
+ metadata: Object.keys(metadata).length > 0 ? metadata : void 0,
4347
+ outline: outline.length > 0 ? outline : void 0,
4348
+ warnings: warnings.length > 0 ? warnings : void 0
4349
+ };
4350
+ }
4351
+ function buildParaShapeMap(root) {
4352
+ const map = /* @__PURE__ */ new Map();
4353
+ const head = findChild(root, "HEAD");
4354
+ if (!head) return map;
4355
+ const mappingTable = findChild(head, "MAPPINGTABLE");
4356
+ if (!mappingTable) return map;
4357
+ const paraShapeList = findChild(mappingTable, "PARASHAPELIST");
4358
+ if (!paraShapeList) return map;
4359
+ const children = paraShapeList.childNodes;
4360
+ for (let i = 0; i < children.length; i++) {
4361
+ const el = children[i];
4362
+ if (el.nodeType !== 1 || localName2(el) !== "PARASHAPE") continue;
4363
+ const id = el.getAttribute("Id") ?? "";
4364
+ const headingType = el.getAttribute("HeadingType") ?? "None";
4365
+ const level = parseInt(el.getAttribute("Level") ?? "0", 10);
4366
+ let headingLevel = null;
4367
+ if (headingType === "Outline") {
4368
+ const safeLevel = isNaN(level) ? 0 : Math.max(0, level);
4369
+ headingLevel = Math.min(safeLevel + 1, 6);
4370
+ }
4371
+ map.set(id, { headingLevel });
4372
+ }
4373
+ return map;
4374
+ }
4375
+ function parseSection2(section, blocks, paraShapeMap, sectionNum, warnings) {
4376
+ walkContent(section, blocks, paraShapeMap, sectionNum, warnings, false);
4377
+ }
4378
+ function walkContent(node, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth = 0) {
4379
+ if (depth > MAX_XML_DEPTH2) return;
4380
+ const children = node.childNodes;
4381
+ for (let i = 0; i < children.length; i++) {
4382
+ const el = children[i];
4383
+ if (el.nodeType !== 1) continue;
4384
+ const tag = localName2(el);
4385
+ if (tag === "HEADER" || tag === "FOOTER") {
4386
+ continue;
4387
+ }
4388
+ if (tag === "P") {
4389
+ if (!inHeaderFooter) {
4390
+ parseParagraph2(el, blocks, paraShapeMap, sectionNum);
4391
+ }
4392
+ continue;
4393
+ }
4394
+ if (tag === "TABLE") {
4395
+ if (!inHeaderFooter) {
4396
+ parseTable2(el, blocks, paraShapeMap, sectionNum, warnings);
4397
+ }
4398
+ continue;
4399
+ }
4400
+ if (tag === "PARALIST" || tag === "SECTION" || tag === "COLDEF") {
4401
+ walkContent(el, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth + 1);
4402
+ continue;
4403
+ }
4404
+ walkContent(el, blocks, paraShapeMap, sectionNum, warnings, inHeaderFooter, depth + 1);
4405
+ }
4406
+ }
4407
+ function parseParagraph2(el, blocks, paraShapeMap, sectionNum) {
4408
+ const paraShapeId = el.getAttribute("ParaShape") ?? "";
4409
+ const shapeInfo = paraShapeMap.get(paraShapeId);
4410
+ const text = extractParagraphText(el);
4411
+ if (!text) return;
4412
+ if (shapeInfo?.headingLevel != null) {
4413
+ blocks.push({ type: "heading", text, level: shapeInfo.headingLevel, pageNumber: sectionNum });
4414
+ } else {
4415
+ blocks.push({ type: "paragraph", text, pageNumber: sectionNum });
4416
+ }
4417
+ }
4418
+ function extractParagraphText(p) {
4419
+ const parts = [];
4420
+ collectCharText(p, parts);
4421
+ return parts.join("").trim();
4422
+ }
4423
+ function collectCharText(node, parts, depth = 0) {
4424
+ if (depth > MAX_XML_DEPTH2) return;
4425
+ const children = node.childNodes;
4426
+ for (let i = 0; i < children.length; i++) {
4427
+ const el = children[i];
4428
+ if (el.nodeType !== 1) continue;
4429
+ const tag = localName2(el);
4430
+ if (tag === "CHAR") {
4431
+ const t = textContent(el);
4432
+ if (t) parts.push(t);
4433
+ } else if (tag === "TABLE" || tag === "PICTURE" || tag === "SHAPEOBJECT") {
4434
+ } else if (tag === "AUTONUM") {
4435
+ } else {
4436
+ collectCharText(el, parts, depth + 1);
4437
+ }
4438
+ }
4439
+ }
4440
/**
 * Converts a TABLE element into a `table` block and appends it to `blocks`.
 *
 * The grid size is read from the element's RowCount/ColCount attributes.
 * Every CELL found under a ROW child is placed at its RowAddr/ColAddr
 * position; the other positions covered by its spans, and positions no cell
 * claims, are filled with empty 1x1 cells. The grid is then handed to
 * `buildTable`.
 *
 * Nothing is emitted when the dimensions are missing, non-numeric or not
 * positive, or when no CELL is found. Tables larger than
 * MAX_TABLE_ROWS/MAX_TABLE_COLS are skipped with a TRUNCATED_TABLE warning.
 * Cells whose address is non-numeric, negative or outside the declared grid
 * are ignored instead of aborting the parse.
 *
 * @param {object} el - TABLE element.
 * @param {object[]} blocks - Output list; mutated in place.
 * @param {Map<string, object>} paraShapeMap - Not used by this function.
 * @param {number} sectionNum - Stored as `pageNumber` on the emitted block.
 * @param {object[]} warnings - Receives a warning when the table is skipped for size.
 * @returns {void}
 */
function parseTable2(el, blocks, paraShapeMap, sectionNum, warnings) {
  const rowCount = parseInt(el.getAttribute("RowCount") ?? "0", 10);
  const colCount = parseInt(el.getAttribute("ColCount") ?? "0", 10);
  // Negative counts must be rejected too: Array(colCount) throws a RangeError
  // for a negative length.
  if (Number.isNaN(rowCount) || Number.isNaN(colCount) || rowCount <= 0 || colCount <= 0) return;
  if (rowCount > MAX_TABLE_ROWS || colCount > MAX_TABLE_COLS) {
    warnings.push({ message: `\uD14C\uC774\uBE14 \uD06C\uAE30 \uCD08\uACFC (${rowCount}x${colCount}) \u2014 \uC2A4\uD0B5`, code: "TRUNCATED_TABLE" });
    return;
  }

  const cells = [];
  const children = el.childNodes;
  for (let i = 0; i < children.length; i++) {
    const rowEl = children[i];
    if (rowEl.nodeType !== 1 || localName2(rowEl) !== "ROW") continue;
    const rowCells = rowEl.childNodes;
    for (let j = 0; j < rowCells.length; j++) {
      const cellEl = rowCells[j];
      if (cellEl.nodeType !== 1 || localName2(cellEl) !== "CELL") continue;
      const colAddr = parseInt(cellEl.getAttribute("ColAddr") ?? "0", 10);
      const rowAddr = parseInt(cellEl.getAttribute("RowAddr") ?? "0", 10);
      // Spans are clamped to [1, MAX] so a hostile value cannot blow up the fill loop.
      const colSpan = Math.min(Math.max(1, parseInt(cellEl.getAttribute("ColSpan") ?? "1", 10) || 1), MAX_TABLE_COLS);
      const rowSpan = Math.min(Math.max(1, parseInt(cellEl.getAttribute("RowSpan") ?? "1", 10) || 1), MAX_TABLE_ROWS);
      const cellText = extractCellText2(cellEl);
      cells.push({ text: cellText, colSpan, rowSpan, colAddr, rowAddr });
    }
  }
  if (cells.length === 0) return;

  const grid = Array.from({ length: rowCount }, () => Array(colCount).fill(null));
  for (const cell of cells) {
    const r = cell.rowAddr;
    const c = cell.colAddr;
    // The comparison is false for NaN as well, so one check covers non-numeric,
    // negative and out-of-range addresses. A negative row used to index
    // grid[-1] (undefined) and throw.
    const insideGrid = r >= 0 && r < rowCount && c >= 0 && c < colCount;
    if (!insideGrid) continue;
    grid[r][c] = cell;
    for (let dr = 0; dr < cell.rowSpan; dr++) {
      for (let dc = 0; dc < cell.colSpan; dc++) {
        if (dr === 0 && dc === 0) continue;
        if (r + dr < rowCount && c + dc < colCount) {
          grid[r + dr][c + dc] = { text: "", colSpan: 1, rowSpan: 1 };
        }
      }
    }
  }

  const cellRows = grid.map(
    (row) => row.map((cell) => cell ?? { text: "", colSpan: 1, rowSpan: 1 })
  );
  const table = buildTable(cellRows);
  blocks.push({ type: "table", table, pageNumber: sectionNum });
}
4487
/**
 * Returns the text of a table cell, one collected entry per line.
 *
 * Entries are gathered by `collectCellText`; empty entries are discarded, the
 * rest are joined with "\n" and the result is trimmed.
 *
 * @param {object} cellEl - Cell element to read.
 * @returns {string} Cell text, possibly empty.
 */
function extractCellText2(cellEl) {
  const collected = [];
  collectCellText(cellEl, collected, 0);
  const nonEmpty = collected.filter(Boolean);
  return nonEmpty.join("\n").trim();
}
4492
/**
 * Gathers the text of a cell's content into `parts`, in document order.
 *
 * Each P element contributes its paragraph text (when non-empty). A TABLE
 * element is not descended into and contributes a fixed placeholder string.
 * Any other element is searched recursively, up to a nesting depth of 20.
 *
 * @param {object} node - Node whose `childNodes` are scanned.
 * @param {string[]} parts - Output list; mutated in place.
 * @param {number} depth - Current nesting depth.
 * @returns {void}
 */
function collectCellText(node, parts, depth) {
  if (depth > 20) return;

  const kids = node.childNodes;
  for (let idx = 0; idx < kids.length; idx += 1) {
    const child = kids[idx];
    if (child.nodeType !== 1) continue;

    const name = localName2(child);
    if (name === "P") {
      const paragraph = extractParagraphText(child);
      if (paragraph) parts.push(paragraph);
      continue;
    }
    if (name === "TABLE") {
      parts.push("[\uC911\uCCA9 \uD14C\uC774\uBE14]");
      continue;
    }
    collectCellText(child, parts, depth + 1);
  }
}
4509
/**
 * Returns an element's name with a leading "prefix:" removed.
 *
 * `tagName` is preferred, then `localName`; when both are missing or empty
 * the result is an empty string.
 *
 * @param {object} el - Element-like object.
 * @returns {string} Name without the first namespace prefix.
 */
function localName2(el) {
  const qualifiedName = el.tagName || el.localName || "";
  return qualifiedName.replace(/^[^:]+:/, "");
}
4512
/**
 * Finds the first direct element child of `parent` whose prefix-stripped name
 * equals `tag`.
 *
 * @param {object} parent - Node whose `childNodes` are searched.
 * @param {string} tag - Name to match, without namespace prefix.
 * @returns {object|null} The matching child, or null when there is none.
 */
function findChild(parent, tag) {
  const match = Array.from(parent.childNodes).find(
    (child) => child.nodeType === 1 && localName2(child) === tag
  );
  return match ?? null;
}
4520
/**
 * Concatenates, in document order, the values of all text nodes found
 * anywhere beneath `el`.
 *
 * Only text nodes (nodeType 3) contribute text and only element nodes
 * (nodeType 1) are descended into; every other node type is ignored. A text
 * node with an empty or missing `nodeValue` contributes nothing.
 *
 * The traversal uses an explicit stack instead of recursion so that very
 * deeply nested markup cannot exhaust the call stack.
 *
 * @param {object} el - Node whose descendants are read.
 * @returns {string} Concatenated text; empty when there is none.
 */
function textContent(el) {
  const parts = [];
  const pending = [];

  // Children are pushed last-to-first so that popping yields document order.
  const pushChildren = (node) => {
    const children = node.childNodes;
    for (let i = children.length - 1; i >= 0; i--) {
      pending.push(children[i]);
    }
  };

  pushChildren(el);
  while (pending.length > 0) {
    const node = pending.pop();
    if (node.nodeType === 3) {
      parts.push(node.nodeValue || "");
    } else if (node.nodeType === 1) {
      pushChildren(node);
    }
  }
  return parts.join("");
}
4533
/**
 * Counts the direct element children of `body` whose prefix-stripped name is
 * "SECTION".
 *
 * @param {object} body - Node whose `childNodes` are inspected.
 * @returns {number} Number of SECTION children.
 */
function countSections(body) {
  const isSection = (child) => child.nodeType === 1 && localName2(child) === "SECTION";
  return Array.from(body.childNodes).filter(isSection).length;
}
4542
+
4257
4543
  // src/index.ts
4258
4544
  async function parse(input, options) {
4259
4545
  let buffer;
@@ -4283,6 +4569,8 @@ async function parse(input, options) {
4283
4569
  }
4284
4570
  case "hwp":
4285
4571
  return parseHwp(buffer, options);
4572
+ case "hwpml":
4573
+ return parseHwpml(buffer, options);
4286
4574
  case "pdf":
4287
4575
  return parsePdf(buffer, options);
4288
4576
  default:
@@ -4308,7 +4596,7 @@ async function parseHwp(buffer, options) {
4308
4596
  async function parsePdf(buffer, options) {
4309
4597
  let parsePdfDocument;
4310
4598
  try {
4311
- const mod = await import("./parser-PXD73E4H.js");
4599
+ const mod = await import("./parser-4LKJXBPP.js");
4312
4600
  parsePdfDocument = mod.parsePdfDocument;
4313
4601
  } catch {
4314
4602
  return {
@@ -4342,6 +4630,14 @@ async function parseDocx(buffer, options) {
4342
4630
  return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
4343
4631
  }
4344
4632
  }
4633
/**
 * Parses an HWPML buffer and wraps the outcome in a result object.
 *
 * On success the markdown, blocks, metadata, outline and warnings produced by
 * `parseHwpmlDocument` are returned with `success: true`. Any thrown value is
 * converted into a failure result carrying the error message (or a fixed
 * fallback message for non-Error values) and the code from `classifyError`.
 *
 * @param {*} buffer - Document contents, forwarded to `parseHwpmlDocument`.
 * @param {*} options - Parse options, forwarded unchanged.
 * @returns {Promise<object>} Success or failure result with `fileType: "hwpml"`.
 */
async function parseHwpml(buffer, options) {
  let parsed;
  try {
    parsed = parseHwpmlDocument(buffer, options);
    return {
      success: true,
      fileType: "hwpml",
      markdown: parsed.markdown,
      blocks: parsed.blocks,
      metadata: parsed.metadata,
      outline: parsed.outline,
      warnings: parsed.warnings
    };
  } catch (err) {
    let message = "HWPML \uD30C\uC2F1 \uC2E4\uD328";
    if (err instanceof Error) {
      message = err.message;
    }
    return { success: false, fileType: "hwpml", error: message, code: classifyError(err) };
  }
}
4345
4641
 
4346
4642
  // src/diff/text-diff.ts
4347
4643
  function similarity(a, b) {
@@ -4530,4 +4826,4 @@ export {
4530
4826
  compare,
4531
4827
  parse
4532
4828
  };
4533
- //# sourceMappingURL=chunk-RQWICKON.js.map
4829
+ //# sourceMappingURL=chunk-OEJJPCMM.js.map