kordoc 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -44,7 +44,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
44
44
  const imageData = await renderPageToPng(page);
45
45
  const text = await provider(imageData, i, "image/png");
46
46
  if (text.trim()) {
47
- blocks.push({ type: "paragraph", text: text.trim() });
47
+ blocks.push({ type: "paragraph", text: text.trim(), pageNumber: i });
48
48
  }
49
49
  } catch {
50
50
  }
@@ -182,8 +182,29 @@ function blocksToMarkdown(blocks) {
182
182
  const lines = [];
183
183
  for (let i = 0; i < blocks.length; i++) {
184
184
  const block = blocks[i];
185
+ if (block.type === "heading" && block.text) {
186
+ const prefix = "#".repeat(Math.min(block.level || 2, 6));
187
+ lines.push("", `${prefix} ${block.text}`, "");
188
+ continue;
189
+ }
190
+ if (block.type === "separator") {
191
+ lines.push("", "---", "");
192
+ continue;
193
+ }
194
+ if (block.type === "list" && block.text) {
195
+ const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(block.text);
196
+ const prefix = alreadyNumbered ? "" : block.listType === "ordered" ? "1. " : "- ";
197
+ lines.push(`${prefix}${block.text}`);
198
+ if (block.children) {
199
+ for (const child of block.children) {
200
+ const childPrefix = child.listType === "ordered" ? "1." : "-";
201
+ lines.push(` ${childPrefix} ${child.text || ""}`);
202
+ }
203
+ }
204
+ continue;
205
+ }
185
206
  if (block.type === "paragraph" && block.text) {
186
- const text = block.text;
207
+ let text = block.text;
187
208
  if (/^\[별표\s*\d+/.test(text)) {
188
209
  const nextBlock = blocks[i + 1];
189
210
  if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
@@ -198,9 +219,19 @@ function blocksToMarkdown(blocks) {
198
219
  lines.push(`*${text}*`, "");
199
220
  continue;
200
221
  }
222
+ if (block.href) {
223
+ text = `[${text}](${block.href})`;
224
+ }
225
+ if (block.footnoteText) {
226
+ text += ` (\uC8FC: ${block.footnoteText})`;
227
+ }
201
228
  lines.push(text);
202
229
  } else if (block.type === "table" && block.table) {
230
+ if (lines.length > 0 && lines[lines.length - 1] !== "") {
231
+ lines.push("");
232
+ }
203
233
  lines.push(tableToMarkdown(block.table));
234
+ lines.push("");
204
235
  }
205
236
  }
206
237
  return lines.join("\n").trim();
@@ -251,7 +282,7 @@ function tableToMarkdown(table) {
251
282
  }
252
283
 
253
284
  // src/utils.ts
254
- var VERSION = true ? "1.4.0" : "0.0.0-dev";
285
+ var VERSION = true ? "1.5.0" : "0.0.0-dev";
255
286
  var KordocError = class extends Error {
256
287
  constructor(message) {
257
288
  super(message);
@@ -310,6 +341,75 @@ var MAX_ZIP_ENTRIES = 500;
310
341
  function clampSpan(val, max) {
311
342
  return Math.max(1, Math.min(val, max));
312
343
  }
344
+ async function extractHwpxStyles(zip) {
345
+ const result = {
346
+ charProperties: /* @__PURE__ */ new Map(),
347
+ styles: /* @__PURE__ */ new Map()
348
+ };
349
+ const headerPaths = ["Contents/header.xml", "header.xml", "Contents/head.xml", "head.xml"];
350
+ for (const hp of headerPaths) {
351
+ const hpLower = hp.toLowerCase();
352
+ const file = zip.file(hp) || Object.values(zip.files).find((f) => f.name.toLowerCase() === hpLower) || null;
353
+ if (!file) continue;
354
+ try {
355
+ const xml = await file.async("text");
356
+ const parser = new import_xmldom.DOMParser();
357
+ const doc = parser.parseFromString(stripDtd(xml), "text/xml");
358
+ if (!doc.documentElement) continue;
359
+ parseCharProperties(doc, result.charProperties);
360
+ parseStyleElements(doc, result.styles);
361
+ break;
362
+ } catch {
363
+ continue;
364
+ }
365
+ }
366
+ return result;
367
+ }
368
+ function parseCharProperties(doc, map) {
369
+ const tagNames = ["hh:charPr", "charPr", "hp:charPr"];
370
+ for (const tagName of tagNames) {
371
+ const elements = doc.getElementsByTagName(tagName);
372
+ for (let i = 0; i < elements.length; i++) {
373
+ const el = elements[i];
374
+ const id = el.getAttribute("id") || el.getAttribute("IDRef") || "";
375
+ if (!id) continue;
376
+ const prop = {};
377
+ const height = el.getAttribute("height");
378
+ if (height) prop.fontSize = parseInt(height, 10) / 100;
379
+ const bold = el.getAttribute("bold");
380
+ if (bold === "true" || bold === "1") prop.bold = true;
381
+ const italic = el.getAttribute("italic");
382
+ if (italic === "true" || italic === "1") prop.italic = true;
383
+ const fontFaces = el.getElementsByTagName("*");
384
+ for (let j = 0; j < fontFaces.length; j++) {
385
+ const ff = fontFaces[j];
386
+ const localTag = (ff.tagName || "").replace(/^[^:]+:/, "");
387
+ if (localTag === "fontface" || localTag === "fontRef") {
388
+ const face = ff.getAttribute("face") || ff.getAttribute("FontFace");
389
+ if (face) {
390
+ prop.fontName = face;
391
+ break;
392
+ }
393
+ }
394
+ }
395
+ map.set(id, prop);
396
+ }
397
+ }
398
+ }
399
+ function parseStyleElements(doc, map) {
400
+ const tagNames = ["hh:style", "style", "hp:style"];
401
+ for (const tagName of tagNames) {
402
+ const elements = doc.getElementsByTagName(tagName);
403
+ for (let i = 0; i < elements.length; i++) {
404
+ const el = elements[i];
405
+ const id = el.getAttribute("id") || el.getAttribute("IDRef") || String(i);
406
+ const name = el.getAttribute("name") || el.getAttribute("engName") || "";
407
+ const charPrId = el.getAttribute("charPrIDRef") || void 0;
408
+ const paraPrId = el.getAttribute("paraPrIDRef") || void 0;
409
+ map.set(id, { name, charPrId, paraPrId });
410
+ }
411
+ }
412
+ }
313
413
  function stripDtd(xml) {
314
414
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
315
415
  }
@@ -333,6 +433,8 @@ async function parseHwpxDocument(buffer, options) {
333
433
  }
334
434
  const metadata = {};
335
435
  await extractHwpxMetadata(zip, metadata);
436
+ const styleMap = await extractHwpxStyles(zip);
437
+ const warnings = [];
336
438
  const sectionPaths = await resolveSectionPaths(zip);
337
439
  if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
338
440
  metadata.pageCount = sectionPaths.length;
@@ -346,10 +448,12 @@ async function parseHwpxDocument(buffer, options) {
346
448
  const xml = await file.async("text");
347
449
  totalDecompressed += xml.length * 2;
348
450
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
349
- blocks.push(...parseSectionXml(xml));
451
+ blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
350
452
  }
453
+ detectHwpxHeadings(blocks, styleMap);
454
+ const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
351
455
  const markdown = blocksToMarkdown(blocks);
352
- return { markdown, blocks, metadata };
456
+ return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
353
457
  }
354
458
  async function extractHwpxMetadata(zip, metadata) {
355
459
  try {
@@ -515,15 +619,50 @@ function parseSectionPathsFromManifest(xml) {
515
619
  }
516
620
  return Array.from(idToHref.entries()).filter(([id]) => isSectionId(id)).sort((a, b) => a[0].localeCompare(b[0])).map(([, href]) => href);
517
621
  }
518
- function parseSectionXml(xml) {
622
+ function detectHwpxHeadings(blocks, styleMap) {
623
+ let baseFontSize = 0;
624
+ const sizeFreq = /* @__PURE__ */ new Map();
625
+ for (const b of blocks) {
626
+ if (b.style?.fontSize) {
627
+ sizeFreq.set(b.style.fontSize, (sizeFreq.get(b.style.fontSize) || 0) + 1);
628
+ }
629
+ }
630
+ let maxCount = 0;
631
+ for (const [size, count] of sizeFreq) {
632
+ if (count > maxCount) {
633
+ maxCount = count;
634
+ baseFontSize = size;
635
+ }
636
+ }
637
+ for (const block of blocks) {
638
+ if (block.type !== "paragraph" || !block.text) continue;
639
+ const text = block.text.trim();
640
+ if (text.length === 0 || text.length > 200 || /^\d+$/.test(text)) continue;
641
+ let level = 0;
642
+ if (baseFontSize > 0 && block.style?.fontSize) {
643
+ const ratio = block.style.fontSize / baseFontSize;
644
+ if (ratio >= 1.5) level = 1;
645
+ else if (ratio >= 1.3) level = 2;
646
+ else if (ratio >= 1.15) level = 3;
647
+ }
648
+ if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
649
+ if (level === 0) level = 3;
650
+ }
651
+ if (level > 0) {
652
+ block.type = "heading";
653
+ block.level = level;
654
+ }
655
+ }
656
+ }
657
+ function parseSectionXml(xml, styleMap, warnings, sectionNum) {
519
658
  const parser = new import_xmldom.DOMParser();
520
659
  const doc = parser.parseFromString(stripDtd(xml), "text/xml");
521
660
  if (!doc.documentElement) return [];
522
661
  const blocks = [];
523
- walkSection(doc.documentElement, blocks, null, []);
662
+ walkSection(doc.documentElement, blocks, null, [], styleMap, warnings, sectionNum);
524
663
  return blocks;
525
664
  }
526
- function walkSection(node, blocks, tableCtx, tableStack) {
665
+ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum) {
527
666
  const children = node.childNodes;
528
667
  if (!children) return;
529
668
  for (let i = 0; i < children.length; i++) {
@@ -535,7 +674,7 @@ function walkSection(node, blocks, tableCtx, tableStack) {
535
674
  case "tbl": {
536
675
  if (tableCtx) tableStack.push(tableCtx);
537
676
  const newTable = { rows: [], currentRow: [], cell: null };
538
- walkSection(el, blocks, newTable, tableStack);
677
+ walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum);
539
678
  if (newTable.rows.length > 0) {
540
679
  if (tableStack.length > 0) {
541
680
  const parentTable = tableStack.pop();
@@ -545,7 +684,7 @@ function walkSection(node, blocks, tableCtx, tableStack) {
545
684
  }
546
685
  tableCtx = parentTable;
547
686
  } else {
548
- blocks.push({ type: "table", table: buildTable(newTable.rows) });
687
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
549
688
  tableCtx = null;
550
689
  }
551
690
  } else {
@@ -556,7 +695,7 @@ function walkSection(node, blocks, tableCtx, tableStack) {
556
695
  case "tr":
557
696
  if (tableCtx) {
558
697
  tableCtx.currentRow = [];
559
- walkSection(el, blocks, tableCtx, tableStack);
698
+ walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum);
560
699
  if (tableCtx.currentRow.length > 0) tableCtx.rows.push(tableCtx.currentRow);
561
700
  tableCtx.currentRow = [];
562
701
  }
@@ -564,7 +703,7 @@ function walkSection(node, blocks, tableCtx, tableStack) {
564
703
  case "tc":
565
704
  if (tableCtx) {
566
705
  tableCtx.cell = { text: "", colSpan: 1, rowSpan: 1 };
567
- walkSection(el, blocks, tableCtx, tableStack);
706
+ walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum);
568
707
  if (tableCtx.cell) {
569
708
  tableCtx.currentRow.push(tableCtx.cell);
570
709
  tableCtx.cell = null;
@@ -580,25 +719,75 @@ function walkSection(node, blocks, tableCtx, tableStack) {
580
719
  }
581
720
  break;
582
721
  case "p": {
583
- const text = extractParagraphText(el);
722
+ const { text, href, footnote, style } = extractParagraphInfo(el, styleMap);
584
723
  if (text) {
585
724
  if (tableCtx?.cell) {
586
725
  tableCtx.cell.text += (tableCtx.cell.text ? "\n" : "") + text;
587
726
  } else if (!tableCtx) {
588
- blocks.push({ type: "paragraph", text });
727
+ const block = { type: "paragraph", text, pageNumber: sectionNum };
728
+ if (style) block.style = style;
729
+ if (href) block.href = href;
730
+ if (footnote) block.footnoteText = footnote;
731
+ blocks.push(block);
589
732
  }
590
733
  }
591
- walkSection(el, blocks, tableCtx, tableStack);
734
+ tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum);
592
735
  break;
593
736
  }
737
+ // 이미지/그림 — 경고 수집
738
+ case "pic":
739
+ case "shape":
740
+ case "drawingObject":
741
+ if (warnings && sectionNum) {
742
+ warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
743
+ }
744
+ break;
594
745
  default:
595
- walkSection(el, blocks, tableCtx, tableStack);
746
+ walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum);
596
747
  break;
597
748
  }
598
749
  }
599
750
  }
600
- function extractParagraphText(para) {
751
+ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum) {
752
+ const children = node.childNodes;
753
+ if (!children) return tableCtx;
754
+ for (let i = 0; i < children.length; i++) {
755
+ const el = children[i];
756
+ if (el.nodeType !== 1) continue;
757
+ const tag = el.tagName || el.localName || "";
758
+ const localTag = tag.replace(/^[^:]+:/, "");
759
+ if (localTag === "tbl") {
760
+ if (tableCtx) tableStack.push(tableCtx);
761
+ const newTable = { rows: [], currentRow: [], cell: null };
762
+ walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum);
763
+ if (newTable.rows.length > 0) {
764
+ if (tableStack.length > 0) {
765
+ const parentTable = tableStack.pop();
766
+ const nestedText = convertTableToText(newTable.rows);
767
+ if (parentTable.cell) {
768
+ parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
769
+ }
770
+ tableCtx = parentTable;
771
+ } else {
772
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
773
+ tableCtx = null;
774
+ }
775
+ } else {
776
+ tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
777
+ }
778
+ } else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
779
+ if (warnings && sectionNum) {
780
+ warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
781
+ }
782
+ }
783
+ }
784
+ return tableCtx;
785
+ }
786
+ function extractParagraphInfo(para, styleMap) {
601
787
  let text = "";
788
+ let href;
789
+ let footnote;
790
+ let charPrId;
602
791
  const walk = (node) => {
603
792
  const children = node.childNodes;
604
793
  if (!children) return;
@@ -627,6 +816,29 @@ function extractParagraphText(para) {
627
816
  case "tbl":
628
817
  break;
629
818
  // 테이블은 walkSection에서 처리
819
+ // 하이퍼링크
820
+ case "hyperlink": {
821
+ const url = child.getAttribute("url") || child.getAttribute("href") || "";
822
+ if (url) href = url;
823
+ walk(child);
824
+ break;
825
+ }
826
+ // 각주/미주
827
+ case "footNote":
828
+ case "endNote":
829
+ case "fn":
830
+ case "en": {
831
+ const noteText = extractTextFromNode(child);
832
+ if (noteText) footnote = (footnote ? footnote + "; " : "") + noteText;
833
+ break;
834
+ }
835
+ // run 요소에서 charPrIDRef 추출
836
+ case "r": {
837
+ const runCharPr = child.getAttribute("charPrIDRef");
838
+ if (runCharPr && !charPrId) charPrId = runCharPr;
839
+ walk(child);
840
+ break;
841
+ }
630
842
  default:
631
843
  walk(child);
632
844
  break;
@@ -634,16 +846,43 @@ function extractParagraphText(para) {
634
846
  }
635
847
  };
636
848
  walk(para);
637
- return text.replace(/[ \t]+/g, " ").trim();
849
+ const cleanText = text.replace(/[ \t]+/g, " ").trim();
850
+ let style;
851
+ if (styleMap && charPrId) {
852
+ const charProp = styleMap.charProperties.get(charPrId);
853
+ if (charProp) {
854
+ style = {};
855
+ if (charProp.fontSize) style.fontSize = charProp.fontSize;
856
+ if (charProp.bold) style.bold = true;
857
+ if (charProp.italic) style.italic = true;
858
+ if (charProp.fontName) style.fontName = charProp.fontName;
859
+ if (!style.fontSize && !style.bold && !style.italic) style = void 0;
860
+ }
861
+ }
862
+ return { text: cleanText, href, footnote, style };
863
+ }
864
+ function extractTextFromNode(node) {
865
+ let result = "";
866
+ const children = node.childNodes;
867
+ if (!children) return result;
868
+ for (let i = 0; i < children.length; i++) {
869
+ const child = children[i];
870
+ if (child.nodeType === 3) result += child.textContent || "";
871
+ else if (child.nodeType === 1) result += extractTextFromNode(child);
872
+ }
873
+ return result.trim();
638
874
  }
639
875
 
640
876
  // src/hwp5/record.ts
641
877
  var import_zlib2 = require("zlib");
642
878
  var TAG_PARA_HEADER = 66;
643
879
  var TAG_PARA_TEXT = 67;
880
+ var TAG_CHAR_SHAPE = 68;
644
881
  var TAG_CTRL_HEADER = 71;
645
882
  var TAG_LIST_HEADER = 72;
646
883
  var TAG_TABLE = 77;
884
+ var TAG_DOC_CHAR_SHAPE = 55;
885
+ var TAG_DOC_STYLE = 58;
647
886
  var CHAR_LINE = 0;
648
887
  var CHAR_PARA = 13;
649
888
  var CHAR_TAB = 9;
@@ -694,6 +933,51 @@ function parseFileHeader(data) {
694
933
  flags: data.readUInt32LE(36)
695
934
  };
696
935
  }
936
+ function parseDocInfo(records) {
937
+ const charShapes = [];
938
+ const styles = [];
939
+ for (const rec of records) {
940
+ if (rec.tagId === TAG_DOC_CHAR_SHAPE && rec.data.length >= 18) {
941
+ if (rec.data.length >= 50) {
942
+ const fontSize = rec.data.readUInt32LE(42);
943
+ const attrFlags = rec.data.readUInt32LE(46);
944
+ charShapes.push({ fontSize, attrFlags });
945
+ } else {
946
+ charShapes.push({ fontSize: 0, attrFlags: 0 });
947
+ }
948
+ }
949
+ if (rec.tagId === TAG_DOC_STYLE && rec.data.length >= 8) {
950
+ try {
951
+ let offset = 0;
952
+ const nameLen = rec.data.readUInt16LE(offset);
953
+ offset += 2;
954
+ const nameBytes = nameLen * 2;
955
+ const name = nameBytes > 0 && offset + nameBytes <= rec.data.length ? rec.data.subarray(offset, offset + nameBytes).toString("utf16le") : "";
956
+ offset += nameBytes;
957
+ let nameKo = "";
958
+ if (offset + 2 <= rec.data.length) {
959
+ const nameKoLen = rec.data.readUInt16LE(offset);
960
+ offset += 2;
961
+ const nameKoBytes = nameKoLen * 2;
962
+ if (nameKoBytes > 0 && offset + nameKoBytes <= rec.data.length) {
963
+ nameKo = rec.data.subarray(offset, offset + nameKoBytes).toString("utf16le");
964
+ }
965
+ offset += nameKoBytes;
966
+ }
967
+ const type = offset < rec.data.length ? rec.data.readUInt8(offset) : 0;
968
+ offset += 1;
969
+ offset += 2;
970
+ offset += 2;
971
+ const paraShapeId = offset + 2 <= rec.data.length ? rec.data.readUInt16LE(offset) : 0;
972
+ offset += 2;
973
+ const charShapeId = offset + 2 <= rec.data.length ? rec.data.readUInt16LE(offset) : 0;
974
+ styles.push({ name, nameKo, charShapeId, paraShapeId, type });
975
+ } catch {
976
+ }
977
+ }
978
+ }
979
+ return { charShapes, styles };
980
+ }
697
981
  function extractText(data) {
698
982
  let result = "";
699
983
  let i = 0;
@@ -759,6 +1043,8 @@ function parseHwp5Document(buffer, options) {
759
1043
  version: `${header.versionMajor}.x`
760
1044
  };
761
1045
  extractHwp5Metadata(cfb, metadata);
1046
+ const docInfo = parseDocInfoStream(cfb, compressed);
1047
+ const warnings = [];
762
1048
  const sections = findSections(cfb);
763
1049
  if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
764
1050
  metadata.pageCount = sections.length;
@@ -772,10 +1058,73 @@ function parseHwp5Document(buffer, options) {
772
1058
  totalDecompressed += data.length;
773
1059
  if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
774
1060
  const records = readRecords(data);
775
- blocks.push(...parseSection(records));
1061
+ const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
1062
+ blocks.push(...sectionBlocks);
776
1063
  }
1064
+ if (docInfo) {
1065
+ detectHwp5Headings(blocks, docInfo);
1066
+ }
1067
+ const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
777
1068
  const markdown = blocksToMarkdown(blocks);
778
- return { markdown, blocks, metadata };
1069
+ return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
1070
+ }
1071
+ function parseDocInfoStream(cfb, compressed) {
1072
+ try {
1073
+ const entry = CFB.find(cfb, "/DocInfo");
1074
+ if (!entry?.content) return null;
1075
+ const data = compressed ? decompressStream(Buffer.from(entry.content)) : Buffer.from(entry.content);
1076
+ const records = readRecords(data);
1077
+ return parseDocInfo(records);
1078
+ } catch {
1079
+ return null;
1080
+ }
1081
+ }
1082
+ function detectHwp5Headings(blocks, docInfo) {
1083
+ let baseFontSize = 0;
1084
+ for (const style of docInfo.styles) {
1085
+ const name = (style.nameKo || style.name).toLowerCase();
1086
+ if (name.includes("\uBC14\uD0D5") || name.includes("\uBCF8\uBB38") || name === "normal" || name === "body") {
1087
+ const cs = docInfo.charShapes[style.charShapeId];
1088
+ if (cs?.fontSize > 0) {
1089
+ baseFontSize = cs.fontSize / 10;
1090
+ break;
1091
+ }
1092
+ }
1093
+ }
1094
+ if (baseFontSize === 0) {
1095
+ const sizeFreq = /* @__PURE__ */ new Map();
1096
+ for (const b of blocks) {
1097
+ if (b.style?.fontSize) {
1098
+ sizeFreq.set(b.style.fontSize, (sizeFreq.get(b.style.fontSize) || 0) + 1);
1099
+ }
1100
+ }
1101
+ let maxCount = 0;
1102
+ for (const [size, count] of sizeFreq) {
1103
+ if (count > maxCount) {
1104
+ maxCount = count;
1105
+ baseFontSize = size;
1106
+ }
1107
+ }
1108
+ }
1109
+ if (baseFontSize <= 0) return;
1110
+ for (const block of blocks) {
1111
+ if (block.type !== "paragraph" || !block.text || !block.style?.fontSize) continue;
1112
+ const text = block.text.trim();
1113
+ if (text.length === 0 || text.length > 200) continue;
1114
+ if (/^\d+$/.test(text)) continue;
1115
+ const ratio = block.style.fontSize / baseFontSize;
1116
+ let level = 0;
1117
+ if (ratio >= 1.5) level = 1;
1118
+ else if (ratio >= 1.3) level = 2;
1119
+ else if (ratio >= 1.15) level = 3;
1120
+ if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
1121
+ if (level === 0) level = 3;
1122
+ }
1123
+ if (level > 0) {
1124
+ block.type = "heading";
1125
+ block.level = level;
1126
+ }
1127
+ }
779
1128
  }
780
1129
  function extractHwp5Metadata(cfb, metadata) {
781
1130
  try {
@@ -827,15 +1176,22 @@ function findSections(cfb) {
827
1176
  }
828
1177
  return sections.sort((a, b) => a.idx - b.idx).map((s) => s.content);
829
1178
  }
830
- function parseSection(records) {
1179
+ function parseSection(records, docInfo, warnings, sectionNum) {
831
1180
  const blocks = [];
832
1181
  let i = 0;
833
1182
  while (i < records.length) {
834
1183
  const rec = records[i];
835
1184
  if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
836
- const { paragraph, tables, nextIdx } = parseParagraphWithTables(records, i);
837
- if (paragraph) blocks.push({ type: "paragraph", text: paragraph });
838
- for (const t of tables) blocks.push({ type: "table", table: t });
1185
+ const { paragraph, tables, nextIdx, charShapeIds } = parseParagraphWithTables(records, i);
1186
+ if (paragraph) {
1187
+ const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
1188
+ if (docInfo && charShapeIds.length > 0) {
1189
+ const style = resolveCharStyle(charShapeIds, docInfo);
1190
+ if (style) block.style = style;
1191
+ }
1192
+ blocks.push(block);
1193
+ }
1194
+ for (const t of tables) blocks.push({ type: "table", table: t, pageNumber: sectionNum });
839
1195
  i = nextIdx;
840
1196
  continue;
841
1197
  }
@@ -843,19 +1199,43 @@ function parseSection(records) {
843
1199
  const ctrlId = rec.data.subarray(0, 4).toString("ascii");
844
1200
  if (ctrlId === " lbt" || ctrlId === "tbl ") {
845
1201
  const { table, nextIdx } = parseTableBlock(records, i);
846
- if (table) blocks.push({ type: "table", table });
1202
+ if (table) blocks.push({ type: "table", table, pageNumber: sectionNum });
847
1203
  i = nextIdx;
848
1204
  continue;
849
1205
  }
1206
+ if (ctrlId === "gso " || ctrlId === " osg" || ctrlId === " elo" || ctrlId === "ole ") {
1207
+ warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
1208
+ }
850
1209
  }
851
1210
  i++;
852
1211
  }
853
1212
  return blocks;
854
1213
  }
1214
+ function resolveCharStyle(charShapeIds, docInfo) {
1215
+ if (charShapeIds.length === 0 || docInfo.charShapes.length === 0) return void 0;
1216
+ const freq = /* @__PURE__ */ new Map();
1217
+ let maxCount = 0, dominantId = charShapeIds[0];
1218
+ for (const id of charShapeIds) {
1219
+ const count = (freq.get(id) || 0) + 1;
1220
+ freq.set(id, count);
1221
+ if (count > maxCount) {
1222
+ maxCount = count;
1223
+ dominantId = id;
1224
+ }
1225
+ }
1226
+ const cs = docInfo.charShapes[dominantId];
1227
+ if (!cs) return void 0;
1228
+ const style = {};
1229
+ if (cs.fontSize > 0) style.fontSize = cs.fontSize / 10;
1230
+ if (cs.attrFlags & 1) style.italic = true;
1231
+ if (cs.attrFlags & 2) style.bold = true;
1232
+ return style.fontSize || style.bold || style.italic ? style : void 0;
1233
+ }
855
1234
  function parseParagraphWithTables(records, startIdx) {
856
1235
  const startLevel = records[startIdx].level;
857
1236
  let text = "";
858
1237
  const tables = [];
1238
+ const charShapeIds = [];
859
1239
  let i = startIdx + 1;
860
1240
  while (i < records.length) {
861
1241
  const rec = records[i];
@@ -863,6 +1243,11 @@ function parseParagraphWithTables(records, startIdx) {
863
1243
  if (rec.tagId === TAG_PARA_TEXT) {
864
1244
  text = extractText(rec.data);
865
1245
  }
1246
+ if (rec.tagId === TAG_CHAR_SHAPE && rec.data.length >= 8) {
1247
+ for (let offset = 0; offset + 7 < rec.data.length; offset += 8) {
1248
+ charShapeIds.push(rec.data.readUInt32LE(offset + 4));
1249
+ }
1250
+ }
866
1251
  if (rec.tagId === TAG_CTRL_HEADER && rec.data.length >= 4) {
867
1252
  const ctrlId = rec.data.subarray(0, 4).toString("ascii");
868
1253
  if (ctrlId === " lbt" || ctrlId === "tbl ") {
@@ -875,7 +1260,7 @@ function parseParagraphWithTables(records, startIdx) {
875
1260
  i++;
876
1261
  }
877
1262
  const trimmed = text.trim();
878
- return { paragraph: trimmed || null, tables, nextIdx: i };
1263
+ return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds };
879
1264
  }
880
1265
  function parseTableBlock(records, startIdx) {
881
1266
  const tableLevel = records[startIdx].level;
@@ -947,6 +1332,355 @@ function arrangeCells(rows, cols, cells) {
947
1332
  return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
948
1333
  }
949
1334
 
1335
+ // src/pdf/line-detector.ts
1336
+ var import_pdf = require("pdfjs-dist/legacy/build/pdf.mjs");
1337
// Largest minor-axis difference between a segment's endpoints for classifyAndAdd
// to treat it as horizontal/vertical; extractLines also collapses rectangles
// thinner than twice this value into a single centre line.
var ORIENTATION_TOL = 2;
// Segments shorter than this are discarded by classifyAndAdd.
var MIN_LINE_LENGTH = 10;
// clusterCoordinates merges a coordinate into the current cluster when it lies
// within this distance of the cluster's running mean.
var COORD_MERGE_TOL = 3;
// Slack used by linesIntersect when deciding whether two lines touch.
var CONNECT_TOL = 5;
// Slack added around a cell's bounding box when mapTextToCells assigns text.
var CELL_PADDING = 2;
1342
/**
 * Collects straight path segments from a pdf.js operator list.
 *
 * Segments are buffered while `constructPath` operators are read. A painting
 * operator (stroke or any fill variant) hands the buffered segments to
 * `classifyAndAdd`; `endPath` discards them.
 *
 * The current point is tracked through every sub-operator, including curves
 * and rectangles, so that a following `lineTo`/`closePath` starts from the
 * position the PDF path model defines.
 *
 * @param {ArrayLike<number>} fnArray - Operator codes, compared against `import_pdf.OPS`.
 * @param {Array} argsArray - Operator arguments, index-aligned with `fnArray`.
 * @returns {{horizontals: Array, verticals: Array}} Arrays filled by `classifyAndAdd`.
 */
function extractLines(fnArray, argsArray) {
  const { OPS } = import_pdf;
  const horizontals = [];
  const verticals = [];
  let lineWidth = 1;
  let currentPath = [];
  let pathStartX = 0;
  let pathStartY = 0;
  let curX = 0;
  let curY = 0;

  // Emits (when painted) and then clears the buffered segments.
  function flushPath(isStroke) {
    if (isStroke) {
      for (const seg of currentPath) {
        classifyAndAdd(seg, lineWidth, horizontals, verticals);
      }
    }
    currentPath = [];
  }

  for (let i = 0; i < fnArray.length; i++) {
    const op = fnArray[i];
    const args = argsArray[i];
    switch (op) {
      case OPS.setLineWidth:
        lineWidth = args[0] || 1;
        break;
      case OPS.constructPath: {
        const subOps = args[0];
        const coords = args[1];
        let ci = 0;
        for (const subOp of subOps) {
          if (subOp === OPS.moveTo) {
            curX = coords[ci++];
            curY = coords[ci++];
            pathStartX = curX;
            pathStartY = curY;
          } else if (subOp === OPS.lineTo) {
            const x2 = coords[ci++];
            const y2 = coords[ci++];
            currentPath.push({ x1: curX, y1: curY, x2, y2 });
            curX = x2;
            curY = y2;
          } else if (subOp === OPS.rectangle) {
            const rx = coords[ci++];
            const ry = coords[ci++];
            const rw = coords[ci++];
            const rh = coords[ci++];
            if (Math.abs(rh) < ORIENTATION_TOL * 2) {
              // Very flat rectangle: treat it as one horizontal centre line.
              currentPath.push({ x1: rx, y1: ry + rh / 2, x2: rx + rw, y2: ry + rh / 2 });
            } else if (Math.abs(rw) < ORIENTATION_TOL * 2) {
              // Very narrow rectangle: treat it as one vertical centre line.
              currentPath.push({ x1: rx + rw / 2, y1: ry, x2: rx + rw / 2, y2: ry + rh });
            } else {
              currentPath.push(
                { x1: rx, y1: ry, x2: rx + rw, y2: ry }, // bottom
                { x1: rx + rw, y1: ry, x2: rx + rw, y2: ry + rh }, // right
                { x1: rx + rw, y1: ry + rh, x2: rx, y2: ry + rh }, // top
                { x1: rx, y1: ry + rh, x2: rx, y2: ry } // left
              );
            }
            // A rectangle is its own closed subpath starting at (rx, ry), so the
            // current point and subpath start both move there.
            curX = rx;
            curY = ry;
            pathStartX = rx;
            pathStartY = ry;
          } else if (subOp === OPS.closePath) {
            if (curX !== pathStartX || curY !== pathStartY) {
              currentPath.push({ x1: curX, y1: curY, x2: pathStartX, y2: pathStartY });
            }
            curX = pathStartX;
            curY = pathStartY;
          } else if (subOp === OPS.curveTo || subOp === OPS.curveTo2 || subOp === OPS.curveTo3) {
            // Curves contribute no straight segment, but their end point (the
            // last coordinate pair) becomes the new current point.
            ci += subOp === OPS.curveTo ? 6 : 4;
            curX = coords[ci - 2];
            curY = coords[ci - 1];
          }
        }
        break;
      }
      case OPS.stroke:
      case OPS.closeStroke:
      case OPS.fill:
      case OPS.eoFill:
      case OPS.fillStroke:
      case OPS.eoFillStroke:
      case OPS.closeFillStroke:
      case OPS.closeEOFillStroke:
        // Fills are emitted too, since rules are often drawn as thin filled rectangles.
        flushPath(true);
        break;
      case OPS.endPath:
        flushPath(false);
        break;
    }
  }
  return { horizontals, verticals };
}
1433
/**
 * Files a segment under `horizontals` or `verticals` when it is long enough
 * and close enough to axis-aligned; anything else is ignored.
 *
 * Accepted segments are normalised: the minor-axis coordinate is averaged and
 * the major-axis endpoints are ordered ascending.
 *
 * @param {{x1: number, y1: number, x2: number, y2: number}} seg - Candidate segment.
 * @param {number} lineWidth - Stroke width recorded on the stored line.
 * @param {Array} horizontals - Receives accepted horizontal lines.
 * @param {Array} verticals - Receives accepted vertical lines.
 */
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
  const spanX = Math.abs(seg.x2 - seg.x1);
  const spanY = Math.abs(seg.y2 - seg.y1);
  if (Math.sqrt(spanX * spanX + spanY * spanY) < MIN_LINE_LENGTH) return;

  if (spanY <= ORIENTATION_TOL) {
    const midY = (seg.y1 + seg.y2) / 2;
    horizontals.push({
      x1: Math.min(seg.x1, seg.x2),
      y1: midY,
      x2: Math.max(seg.x1, seg.x2),
      y2: midY,
      lineWidth
    });
    return;
  }

  if (spanX <= ORIENTATION_TOL) {
    const midX = (seg.x1 + seg.x2) / 2;
    verticals.push({
      x1: midX,
      y1: Math.min(seg.y1, seg.y2),
      x2: midX,
      y2: Math.max(seg.y1, seg.y2),
      lineWidth
    });
  }
}
1450
/**
 * Drops lines that look like a page frame: within 5 units of a page edge and
 * spanning at least 90% of the page along their own axis.
 *
 * @param {Array} horizontals - Horizontal lines (`x1 <= x2`, constant `y1`).
 * @param {Array} verticals - Vertical lines (`y1 <= y2`, constant `x1`).
 * @param {number} pageWidth - Page width.
 * @param {number} pageHeight - Page height.
 * @returns {{horizontals: Array, verticals: Array}} The lines that were kept.
 */
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
  const edgeSlack = 5;
  const hugsEdge = (coord, extent) =>
    Math.abs(coord) < edgeSlack || Math.abs(coord - extent) < edgeSlack;

  const keepHorizontal = (line) => {
    if (!hugsEdge(line.y1, pageHeight)) return true;
    return line.x2 - line.x1 < pageWidth * 0.9;
  };
  const keepVertical = (line) => {
    if (!hugsEdge(line.x1, pageWidth)) return true;
    return line.y2 - line.y1 < pageHeight * 0.9;
  };

  return {
    horizontals: horizontals.filter(keepHorizontal),
    verticals: verticals.filter(keepVertical)
  };
}
1461
/**
 * Builds one grid description per connected group of ruling lines.
 *
 * Lines are tagged with their orientation, grouped with `groupConnectedLines`,
 * and each group's row/column coordinates are merged with `clusterCoordinates`.
 * Groups with fewer than two distinct rows or columns are skipped.
 *
 * @param {Array} horizontals - Horizontal lines.
 * @param {Array} verticals - Vertical lines.
 * @returns {Array<{rowYs: number[], colXs: number[], bbox: object}>} Grids with
 *   `rowYs` sorted descending and `colXs` sorted ascending.
 */
function buildTableGrids(horizontals, verticals) {
  if (horizontals.length < 2 || verticals.length < 2) return [];

  const tagged = [];
  horizontals.forEach((line, idx) => {
    tagged.push({ ...line, type: "h", id: idx });
  });
  verticals.forEach((line, idx) => {
    tagged.push({ ...line, type: "v", id: idx + horizontals.length });
  });

  return groupConnectedLines(tagged).flatMap((group) => {
    const ys = [];
    const xs = [];
    for (const line of group) {
      if (line.type === "h") ys.push(line.y1);
      else if (line.type === "v") xs.push(line.x1);
    }
    if (ys.length < 2 || xs.length < 2) return [];

    const rowYs = clusterCoordinates(ys).sort((a, b) => b - a);
    const colXs = clusterCoordinates(xs).sort((a, b) => a - b);
    if (rowYs.length < 2 || colXs.length < 2) return [];

    return [{
      rowYs,
      colXs,
      bbox: {
        x1: colXs[0],
        y1: rowYs[rowYs.length - 1],
        x2: colXs[colXs.length - 1],
        y2: rowYs[0]
      }
    }];
  });
}
1488
/**
 * Merges nearby coordinates into clusters and returns each cluster's mean.
 *
 * Values are visited in ascending order; a value joins the latest cluster when
 * it is within `COORD_MERGE_TOL` of that cluster's current mean, otherwise it
 * opens a new cluster.
 *
 * @param {number[]} values - Raw coordinates (not modified).
 * @returns {number[]} Cluster means in ascending order.
 */
function clusterCoordinates(values) {
  if (values.length === 0) return [];
  const ascending = values.slice().sort((a, b) => a - b);
  const sums = [ascending[0]];
  const counts = [1];
  for (const value of ascending.slice(1)) {
    const tail = sums.length - 1;
    const mean = sums[tail] / counts[tail];
    if (Math.abs(value - mean) <= COORD_MERGE_TOL) {
      sums[tail] += value;
      counts[tail] += 1;
    } else {
      sums.push(value);
      counts.push(1);
    }
  }
  return sums.map((sum, idx) => sum / counts[idx]);
}
1504
/**
 * Partitions lines into connected groups, where two lines are connected when
 * `linesIntersect` says so (directly or through a chain of other lines).
 *
 * Every pair is tested once and merged with a union-find structure.
 *
 * @param {Array} lines - Lines to group.
 * @returns {Array<Array>} Groups of the original line objects; members keep
 *   their input order.
 */
function groupConnectedLines(lines) {
  const total = lines.length;
  const parent = Array.from({ length: total }, (_, idx) => idx);

  // Finds the representative of `node`, flattening the visited chain.
  const rootOf = (node) => {
    let root = node;
    while (parent[root] !== root) root = parent[root];
    while (parent[node] !== root) {
      const next = parent[node];
      parent[node] = root;
      node = next;
    }
    return root;
  };

  for (let a = 0; a < total; a++) {
    for (let b = a + 1; b < total; b++) {
      if (!linesIntersect(lines[a], lines[b])) continue;
      const rootA = rootOf(a);
      const rootB = rootOf(b);
      if (rootA !== rootB) parent[rootA] = rootB;
    }
  }

  const byRoot = /* @__PURE__ */ new Map();
  for (let idx = 0; idx < total; idx++) {
    const root = rootOf(idx);
    const bucket = byRoot.get(root);
    if (bucket) bucket.push(lines[idx]);
    else byRoot.set(root, [lines[idx]]);
  }
  return Array.from(byRoot.values());
}
1532
/**
 * Tells whether two ruling lines touch, allowing `CONNECT_TOL` of slack.
 *
 * Lines of the same orientation touch when they are roughly collinear and
 * their extents overlap or nearly abut. A horizontal and a vertical line touch
 * when each one's fixed coordinate falls within the other's extent.
 *
 * @param {{type: string, x1: number, y1: number, x2: number, y2: number}} a
 * @param {{type: string, x1: number, y1: number, x2: number, y2: number}} b
 * @returns {boolean} True when the lines are considered connected.
 */
function linesIntersect(a, b) {
  if (a.type !== b.type) {
    const [h, v] = a.type === "h" ? [a, b] : [b, a];
    const xInside = v.x1 >= h.x1 - CONNECT_TOL && v.x1 <= h.x2 + CONNECT_TOL;
    const yInside = h.y1 >= v.y1 - CONNECT_TOL && h.y1 <= v.y2 + CONNECT_TOL;
    return xInside && yInside;
  }

  const horizontal = a.type === "h";
  const offset = horizontal ? Math.abs(a.y1 - b.y1) : Math.abs(a.x1 - b.x1);
  if (offset > CONNECT_TOL) return false;

  const sharedEnd = horizontal ? Math.min(a.x2, b.x2) : Math.min(a.y2, b.y2);
  const sharedStart = horizontal ? Math.max(a.x1, b.x1) : Math.max(a.y1, b.y1);
  return sharedEnd >= sharedStart - CONNECT_TOL;
}
1547
/**
 * Derives the cells of a grid, merging neighbouring slots that have no ruling
 * line between them.
 *
 * Slots are visited row by row. From each free slot the cell grows to the
 * right until `hasVerticalLine` finds a separator (checked against the first
 * row only), then downwards until `hasHorizontalLine` finds one across the
 * cell's full width. Covered slots are marked so they are not visited again.
 *
 * @param {{rowYs: number[], colXs: number[]}} grid - Row and column boundaries.
 * @param {Array} horizontals - Horizontal ruling lines.
 * @param {Array} verticals - Vertical ruling lines.
 * @returns {Array<{row: number, col: number, rowSpan: number, colSpan: number, bbox: object}>}
 */
function extractCells(grid, horizontals, verticals) {
  const { rowYs, colXs } = grid;
  const rowCount = rowYs.length - 1;
  const colCount = colXs.length - 1;
  if (rowCount <= 0 || colCount <= 0) return [];

  const taken = [];
  for (let r = 0; r < rowCount; r++) {
    taken.push(new Array(colCount).fill(false));
  }

  const cells = [];
  for (let row = 0; row < rowCount; row++) {
    for (let col = 0; col < colCount; col++) {
      if (taken[row][col]) continue;

      let colSpan = 1;
      while (
        col + colSpan < colCount &&
        !hasVerticalLine(verticals, colXs[col + colSpan], rowYs[row], rowYs[row + 1])
      ) {
        colSpan++;
      }

      let rowSpan = 1;
      while (
        row + rowSpan < rowCount &&
        !hasHorizontalLine(horizontals, rowYs[row + rowSpan], colXs[col], colXs[col + colSpan])
      ) {
        rowSpan++;
      }

      for (let covered = row; covered < row + rowSpan; covered++) {
        taken[covered].fill(true, col, col + colSpan);
      }

      cells.push({
        row,
        col,
        rowSpan,
        colSpan,
        bbox: {
          x1: colXs[col],
          y1: rowYs[row + rowSpan],
          x2: colXs[col + colSpan],
          y2: rowYs[row]
        }
      });
    }
  }
  return cells;
}
1594
/**
 * Checks whether a vertical line separates two cells at column boundary `x`.
 *
 * A line qualifies when its x is within `COORD_MERGE_TOL + 1` of `x` and it
 * covers at least half of the span between `topY` and `botY`.
 *
 * @param {Array} verticals - Vertical lines (`y1 <= y2`).
 * @param {number} x - Column boundary to test.
 * @param {number} topY - Upper row boundary (larger y).
 * @param {number} botY - Lower row boundary (smaller y).
 * @returns {boolean} True when a separating line exists.
 */
function hasVerticalLine(verticals, x, topY, botY) {
  const xSlack = COORD_MERGE_TOL + 1;
  return verticals.some((line) => {
    if (Math.abs(line.x1 - x) > xSlack) return false;
    const covered = Math.min(line.y2, topY) - Math.max(line.y1, botY);
    return covered >= Math.abs(topY - botY) * 0.5;
  });
}
1607
/**
 * Checks whether a horizontal line separates two cells at row boundary `y`.
 *
 * A line qualifies when its y is within `COORD_MERGE_TOL + 1` of `y` and it
 * covers at least half of the span between `leftX` and `rightX`.
 *
 * @param {Array} horizontals - Horizontal lines (`x1 <= x2`).
 * @param {number} y - Row boundary to test.
 * @param {number} leftX - Left column boundary.
 * @param {number} rightX - Right column boundary.
 * @returns {boolean} True when a separating line exists.
 */
function hasHorizontalLine(horizontals, y, leftX, rightX) {
  const ySlack = COORD_MERGE_TOL + 1;
  return horizontals.some((line) => {
    if (Math.abs(line.y1 - y) > ySlack) return false;
    const covered = Math.min(line.x2, rightX) - Math.max(line.x1, leftX);
    return covered >= Math.abs(rightX - leftX) * 0.5;
  });
}
1620
/**
 * Assigns each text item to the cell that contains its anchor point.
 *
 * The anchor is the item's horizontal centre at its `y`. A cell contains the
 * anchor when it lies inside the cell's bbox widened by `CELL_PADDING`; when
 * several cells qualify, the one whose centre is nearest (Manhattan distance)
 * wins, and the earliest cell wins ties. Items outside every cell are dropped.
 *
 * @param {Array} items - Text items with `x`, `y`, `w`.
 * @param {Array} cells - Cells with a `bbox` of `x1`, `y1`, `x2`, `y2`.
 * @returns {Map<object, Array>} Every cell mapped to its (possibly empty) items.
 */
function mapTextToCells(items, cells) {
  const byCell = /* @__PURE__ */ new Map();
  cells.forEach((cell) => byCell.set(cell, []));

  items.forEach((item) => {
    const px = item.x + item.w / 2;
    const py = item.y;
    let winner = null;
    let winnerDist = Infinity;
    for (const cell of cells) {
      const { x1, y1, x2, y2 } = cell.bbox;
      const inside =
        px >= x1 - CELL_PADDING && px <= x2 + CELL_PADDING &&
        py >= y1 - CELL_PADDING && py <= y2 + CELL_PADDING;
      if (!inside) continue;
      const dist = Math.abs(px - (x1 + x2) / 2) + Math.abs(py - (y1 + y2) / 2);
      if (dist < winnerDist) {
        winnerDist = dist;
        winner = cell;
      }
    }
    if (winner) byCell.get(winner).push(item);
  });

  return byCell;
}
1648
/**
 * Joins the text items of one cell into a string.
 *
 * Items are ordered by descending `y` then ascending `x` and grouped into
 * visual lines: an item stays on the current line while its `y` is within a
 * font-size based tolerance of the line's first item. Items on a line are
 * joined with spaces and lines with newlines, except that a short Hangul-only
 * line (at most 5 characters) following a line ending in Hangul is glued on
 * without a separator.
 *
 * @param {Array} items - Text items with `text`, `x`, `y`, `fontSize`.
 * @returns {string} The cell text.
 */
function cellTextToString(items) {
  if (items.length === 0) return "";
  if (items.length === 1) return items[0].text;

  const ordered = items.slice().sort((a, b) => b.y - a.y || a.x - b.x);
  const rows = [[ordered[0]]];
  for (const item of ordered.slice(1)) {
    const row = rows[rows.length - 1];
    const anchor = row[0];
    const tolerance = Math.max(3, Math.min(item.fontSize, anchor.fontSize) * 0.6);
    if (Math.abs(item.y - anchor.y) <= tolerance) {
      row.push(item);
    } else {
      rows.push([item]);
    }
  }

  const rowTexts = rows.map((row) =>
    row.sort((a, b) => a.x - b.x).map((entry) => entry.text).join(" ")
  );

  const merged = [];
  rowTexts.forEach((curr, idx) => {
    if (idx === 0) {
      merged.push(curr);
      return;
    }
    const prev = merged[merged.length - 1];
    const isHangulTail =
      /[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <= 5 && !curr.includes(" ");
    if (isHangulTail) {
      merged[merged.length - 1] = prev + curr;
    } else {
      merged.push(curr);
    }
  });
  return merged.join("\n");
}
1683
+
950
1684
  // src/pdf/polyfill.ts
951
1685
  var pdfjsWorker = __toESM(require("pdfjs-dist/legacy/build/pdf.worker.mjs"), 1);
952
1686
  var g = globalThis;
@@ -965,12 +1699,12 @@ if (typeof g.Path2D === "undefined") {
965
1699
  g.pdfjsWorker = pdfjsWorker;
966
1700
 
967
1701
  // src/pdf/parser.ts
968
- var import_pdf = require("pdfjs-dist/legacy/build/pdf.mjs");
969
- import_pdf.GlobalWorkerOptions.workerSrc = "";
1702
+ var import_pdf2 = require("pdfjs-dist/legacy/build/pdf.mjs");
1703
+ import_pdf2.GlobalWorkerOptions.workerSrc = "";
970
1704
  var MAX_PAGES = 5e3;
971
1705
  var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
972
1706
  async function parsePdfDocument(buffer, options) {
973
- const doc = await (0, import_pdf.getDocument)({
1707
+ const doc = await (0, import_pdf2.getDocument)({
974
1708
  data: new Uint8Array(buffer),
975
1709
  useSystemFonts: true,
976
1710
  disableFontFace: true,
@@ -978,25 +1712,39 @@ async function parsePdfDocument(buffer, options) {
978
1712
  }).promise;
979
1713
  try {
980
1714
  const pageCount = doc.numPages;
981
- if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", blocks: [] };
1715
+ if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
982
1716
  const metadata = { pageCount };
983
1717
  await extractPdfMetadata(doc, metadata);
984
- const pageTexts = [];
985
1718
  const blocks = [];
1719
+ const warnings = [];
986
1720
  let totalChars = 0;
987
1721
  let totalTextBytes = 0;
988
1722
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
989
1723
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
1724
+ const allFontSizes = [];
990
1725
  for (let i = 1; i <= effectivePageCount; i++) {
991
1726
  if (pageFilter && !pageFilter.has(i)) continue;
992
1727
  const page = await doc.getPage(i);
993
1728
  const tc = await page.getTextContent();
994
- const pageText = extractPageContent(tc.items);
995
- totalChars += pageText.replace(/\s/g, "").length;
996
- totalTextBytes += pageText.length * 2;
1729
+ const viewport = page.getViewport({ scale: 1 });
1730
+ const rawItems = tc.items;
1731
+ const items = normalizeItems(rawItems);
1732
+ const { visible, hiddenCount } = filterHiddenText(items, viewport.width, viewport.height);
1733
+ if (hiddenCount > 0) {
1734
+ warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
1735
+ }
1736
+ for (const item of visible) {
1737
+ if (item.fontSize > 0) allFontSizes.push(item.fontSize);
1738
+ }
1739
+ const opList = await page.getOperatorList();
1740
+ const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
1741
+ for (const b of pageBlocks) blocks.push(b);
1742
+ for (const b of pageBlocks) {
1743
+ const t = b.text || "";
1744
+ totalChars += t.replace(/\s/g, "").length;
1745
+ totalTextBytes += t.length * 2;
1746
+ }
997
1747
  if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
998
- pageTexts.push(pageText);
999
- blocks.push({ type: "paragraph", text: pageText });
1000
1748
  }
1001
1749
  const parsedPageCount = pageFilter ? pageFilter.size : effectivePageCount;
1002
1750
  if (totalChars / Math.max(parsedPageCount, 1) < 10) {
@@ -1006,16 +1754,20 @@ async function parsePdfDocument(buffer, options) {
1006
1754
  const ocrBlocks = await ocrPages2(doc, options.ocr, pageFilter, effectivePageCount);
1007
1755
  if (ocrBlocks.length > 0) {
1008
1756
  const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
1009
- return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true };
1757
+ return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true, warnings };
1010
1758
  }
1011
1759
  } catch {
1012
1760
  }
1013
1761
  }
1014
1762
  return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`, code: "IMAGE_BASED_PDF" };
1015
1763
  }
1016
- let markdown = pageTexts.filter((t) => t.trim()).join("\n\n");
1017
- markdown = cleanPdfText(markdown);
1018
- return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata };
1764
+ const medianFontSize = computeMedianFontSize(allFontSizes);
1765
+ if (medianFontSize > 0) {
1766
+ detectHeadings(blocks, medianFontSize);
1767
+ }
1768
+ const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
1769
+ let markdown = cleanPdfText(blocksToMarkdown(blocks));
1770
+ return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
1019
1771
  } finally {
1020
1772
  await doc.destroy().catch(() => {
1021
1773
  });
@@ -1044,24 +1796,272 @@ function parsePdfDate(dateStr) {
1044
1796
  const [, year, month = "01", day = "01", hour = "00", min = "00", sec = "00"] = m;
1045
1797
  return `${year}-${month}-${day}T${hour}:${min}:${sec}`;
1046
1798
  }
1047
- function extractPageContent(rawItems) {
1048
- const items = normalizeItems(rawItems);
1049
- if (items.length === 0) return "";
1050
- const yLines = groupByY(items);
1051
- const columns = detectColumns(yLines);
1799
/**
 * Separates visible text items from hidden ones.
 *
 * An item counts as hidden when its `isHidden` flag is set or when its origin
 * lies more than 10% of the larger page dimension outside the page rectangle.
 *
 * @param {Array} items - Text items with `x`, `y`, `isHidden`.
 * @param {number} pageWidth - Page width.
 * @param {number} pageHeight - Page height.
 * @returns {{visible: Array, hiddenCount: number}} Kept items (input order) and
 *   the number of items removed.
 */
function filterHiddenText(items, pageWidth, pageHeight) {
  const slack = Math.max(pageWidth, pageHeight) * 0.1;
  const isOffPage = (item) =>
    item.x < -slack || item.x > pageWidth + slack ||
    item.y < -slack || item.y > pageHeight + slack;

  const visible = items.filter((item) => !item.isHidden && !isOffPage(item));
  return { visible, hiddenCount: items.length - visible.length };
}
1816
/**
 * Returns the median of a list of font sizes, or 0 for an empty list.
 *
 * @param {number[]} sizes - Font sizes (not modified).
 * @returns {number} The middle value, or the mean of the two middle values.
 */
function computeMedianFontSize(sizes) {
  const count = sizes.length;
  if (count === 0) return 0;
  const ascending = sizes.slice().sort((a, b) => a - b);
  const upper = Math.trunc(count / 2);
  if (count % 2 !== 0) return ascending[upper];
  return (ascending[upper - 1] + ascending[upper]) / 2;
}
1822
/**
 * Promotes paragraphs set in a larger font to headings, in place.
 *
 * Only paragraphs with text and a `style.fontSize` are considered; text that is
 * empty after trimming, longer than 200 characters, or digits only is left
 * alone. The font-size ratio to `medianFontSize` picks the level:
 * >= 1.5 -> 1, >= 1.3 -> 2, >= 1.15 -> 3.
 *
 * @param {Array} blocks - Blocks to inspect; matching ones get `type: "heading"` and `level`.
 * @param {number} medianFontSize - Reference body font size.
 */
function detectHeadings(blocks, medianFontSize) {
  const levelFor = (ratio) => {
    if (ratio >= 1.5) return 1;
    if (ratio >= 1.3) return 2;
    if (ratio >= 1.15) return 3;
    return 0;
  };

  blocks.forEach((block) => {
    const eligible = block.type === "paragraph" && block.text && block.style?.fontSize;
    if (!eligible) return;

    const trimmed = block.text.trim();
    if (trimmed.length === 0 || trimmed.length > 200) return;
    if (/^\d+$/.test(trimmed)) return;

    const level = levelFor(block.style.fontSize / medianFontSize);
    if (level > 0) {
      block.type = "heading";
      block.level = level;
    }
  });
}
1839
// Recursion limit for xyCutOrder.
var MAX_XYCUT_DEPTH = 50;
/**
 * Splits text items into groups by recursive cutting.
 *
 * A horizontal cut (from `findYSplit`) is tried first; items above the cut
 * come before those at or below it. If no usable horizontal cut exists, a
 * vertical cut (from `findXSplit`) is tried, splitting by item centre with the
 * left part first. Recursion stops at two or fewer items, at
 * `MAX_XYCUT_DEPTH`, or when no cut separates the items.
 *
 * @param {Array} items - Text items with `x`, `y`, `w`, `h`.
 * @param {number} gapThreshold - Minimum gap that qualifies as a cut.
 * @param {number} [depth=0] - Current recursion depth.
 * @returns {Array<Array>} Item groups in the resulting order.
 */
function xyCutOrder(items, gapThreshold, depth = 0) {
  if (items.length === 0) return [];
  if (items.length <= 2 || depth >= MAX_XYCUT_DEPTH) return [items];

  const region = computeRegion(items);
  const recurse = (subset) => xyCutOrder(subset, gapThreshold, depth + 1);

  const yCut = findYSplit(items, region, gapThreshold);
  if (yCut !== null) {
    const above = items.filter((item) => item.y > yCut);
    const below = items.filter((item) => item.y <= yCut);
    if (above.length > 0 && below.length > 0) {
      return recurse(above).concat(recurse(below));
    }
  }

  const xCut = findXSplit(items, region, gapThreshold);
  if (xCut !== null) {
    const leftSide = items.filter((item) => item.x + item.w / 2 < xCut);
    const rightSide = items.filter((item) => item.x + item.w / 2 >= xCut);
    if (leftSide.length > 0 && rightSide.length > 0) {
      return recurse(leftSide).concat(recurse(rightSide));
    }
  }

  return [items];
}
1862
/**
 * Computes the bounding extents of a set of items.
 *
 * @param {Array} items - Items with `x`, `y`, `w`, `h`.
 * @returns {{items: Array, minX: number, minY: number, maxX: number, maxY: number}}
 *   The same `items` array plus the extents (infinite when `items` is empty).
 */
function computeRegion(items) {
  const extent = { minX: Infinity, minY: Infinity, maxX: -Infinity, maxY: -Infinity };
  items.forEach(({ x, y, w, h }) => {
    const right = x + w;
    const top = y + h;
    extent.minX = x < extent.minX ? x : extent.minX;
    extent.minY = y < extent.minY ? y : extent.minY;
    extent.maxX = right > extent.maxX ? right : extent.maxX;
    extent.maxY = top > extent.maxY ? top : extent.maxY;
  });
  return { items, minX: extent.minX, minY: extent.minY, maxX: extent.maxX, maxY: extent.maxY };
}
1872
/**
 * Finds the largest vertical gap between consecutive items (by descending `y`)
 * that exceeds `gapThreshold`.
 *
 * The gap is measured from the upper item's `y - h` down to the lower item's
 * `y`; the returned cut is the midpoint of that gap.
 *
 * @param {Array} items - Items with `y` and `h` (not modified).
 * @param {object} region - Unused; kept for signature parity with findXSplit.
 * @param {number} gapThreshold - Minimum gap that qualifies.
 * @returns {number|null} The cut position, or null when no gap qualifies.
 */
function findYSplit(items, region, gapThreshold) {
  const topDown = items.slice().sort((a, b) => b.y - a.y);
  let widest = gapThreshold;
  let cut = null;
  topDown.forEach((curr, idx) => {
    if (idx === 0) return;
    const prev = topDown[idx - 1];
    const prevBottom = prev.y - prev.h;
    const gap = prevBottom - curr.y;
    if (gap > widest) {
      widest = gap;
      cut = (prevBottom + curr.y) / 2;
    }
  });
  return cut;
}
1887
/**
 * Finds the widest empty vertical strip between items that exceeds
 * `gapThreshold`.
 *
 * Items are swept in ascending `x` while tracking the furthest right edge seen
 * so far. A gap is only counted when the next item starts beyond that edge, so
 * a wide item that spans across two narrower ones prevents a cut through it.
 * Comparing only against the immediately preceding item would miss that case
 * and report a cut that slices through the wide item.
 *
 * @param {Array} items - Items with `x` and `w` (not modified).
 * @param {object} region - Unused; kept for signature parity with findYSplit.
 * @param {number} gapThreshold - Minimum gap that qualifies.
 * @returns {number|null} The midpoint of the widest qualifying gap, or null.
 */
function findXSplit(items, region, gapThreshold) {
  if (items.length < 2) return null;
  const sorted = [...items].sort((a, b) => a.x - b.x);
  let bestGap = gapThreshold;
  let bestSplit = null;
  // Right-most edge covered by any item swept so far.
  let coveredRight = sorted[0].x + sorted[0].w;
  for (let i = 1; i < sorted.length; i++) {
    const currLeft = sorted[i].x;
    const gap = currLeft - coveredRight;
    if (gap > bestGap) {
      bestGap = gap;
      bestSplit = (coveredRight + currLeft) / 2;
    }
    const currRight = currLeft + sorted[i].w;
    if (currRight > coveredRight) coveredRight = currRight;
  }
  return bestSplit;
}
1902
/**
 * Turns one page's text items into blocks, using drawn ruling lines to find
 * tables when there are any.
 *
 * Lines are read from the operator list, page-frame lines are removed, and
 * grids are built from the rest. With at least one grid the grid-aware
 * extraction is used; otherwise the text-only fallback runs.
 *
 * @param {Array} items - Visible text items of the page.
 * @param {number} pageNum - Page number stored on the blocks.
 * @param {{fnArray: ArrayLike<number>, argsArray: Array}} opList - Page operator list.
 * @param {number} pageWidth - Page width.
 * @param {number} pageHeight - Page height.
 * @returns {Array} Blocks for the page (empty when there are no items).
 */
function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeight) {
  if (items.length === 0) return [];

  const drawn = extractLines(opList.fnArray, opList.argsArray);
  const rules = filterPageBorderLines(drawn.horizontals, drawn.verticals, pageWidth, pageHeight);
  const grids = buildTableGrids(rules.horizontals, rules.verticals);

  return grids.length > 0
    ? extractBlocksWithGrids(items, pageNum, grids, rules.horizontals, rules.verticals)
    : extractPageBlocksFallback(items, pageNum);
}
1912
/**
 * Builds table blocks from detected grids and text blocks from whatever text
 * is left over.
 *
 * @param {Array} items - Text items of the page.
 * @param {number} pageNum - Page number stored on every produced block.
 * @param {Array} grids - Grids with `rowYs`, `colXs` and `bbox`.
 * @param {Array} horizontals - Horizontal ruling lines, passed to extractCells.
 * @param {Array} verticals - Vertical ruling lines, passed to extractCells.
 * @returns {Array} Table blocks, merged with the remaining text blocks and
 *   sorted by descending upper edge when any text is left outside the tables.
 */
function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
  const blocks = [];
  // Items already claimed by a table; each item belongs to at most one grid.
  const usedItems = /* @__PURE__ */ new Set();
  // Visit grids by descending y2 (larger y first; presumably top of page first
  // in PDF coordinates — confirm against the caller).
  const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
  for (const grid of sortedGrids) {
    const tableItems = [];
    const pad = 3;
    // Claim every unclaimed item whose box lies inside the grid's bbox (with slack).
    for (const item of items) {
      if (usedItems.has(item)) continue;
      if (item.x >= grid.bbox.x1 - pad && item.x + item.w <= grid.bbox.x2 + pad && item.y >= grid.bbox.y1 - pad && item.y <= grid.bbox.y2 + pad) {
        tableItems.push(item);
        usedItems.add(item);
      }
    }
    const cells = extractCells(grid, horizontals, verticals);
    // NOTE(review): items claimed above stay in usedItems even when this grid
    // yields no cells, so they are emitted neither as table nor as text.
    if (cells.length === 0) continue;
    // Copy only the fields the cell mapping needs.
    const textItems = tableItems.map((i) => ({
      text: i.text,
      x: i.x,
      y: i.y,
      w: i.w,
      h: i.h,
      fontSize: i.fontSize,
      fontName: i.fontName
    }));
    const cellTextMap = mapTextToCells(textItems, cells);
    const numRows = grid.rowYs.length - 1;
    const numCols = grid.colXs.length - 1;
    // Start from a full grid of empty 1x1 cells; slots covered by a merged cell
    // keep this placeholder.
    const irGrid = Array.from(
      { length: numRows },
      () => Array.from({ length: numCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
    );
    // Write each detected cell into its top-left slot.
    for (const cell of cells) {
      const textItems2 = cellTextMap.get(cell) || [];
      const text = cellTextToString(textItems2);
      irGrid[cell.row][cell.col] = {
        text,
        colSpan: cell.colSpan,
        rowSpan: cell.rowSpan
      };
    }
    const irTable = {
      rows: numRows,
      cols: numCols,
      cells: irGrid,
      // Any table with more than one row is flagged as having a header.
      hasHeader: numRows > 1
    };
    blocks.push({
      type: "table",
      table: irTable,
      pageNumber: pageNum,
      bbox: {
        page: pageNum,
        x: grid.bbox.x1,
        y: grid.bbox.y1,
        width: grid.bbox.x2 - grid.bbox.x1,
        height: grid.bbox.y2 - grid.bbox.y1
      }
    });
  }
  // Text that no table claimed goes through the text-only pipeline.
  const remaining = items.filter((i) => !usedItems.has(i));
  if (remaining.length > 0) {
    remaining.sort((a, b) => b.y - a.y || a.x - b.x);
    const textBlocks = detectListBlocks(extractPageBlocksFallback(remaining, pageNum));
    const allBlocks = [...blocks, ...textBlocks];
    // Interleave tables and text by upper edge (y + height), largest first;
    // blocks without a bbox sort as 0.
    allBlocks.sort((a, b) => {
      const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
      const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
      return by - ay;
    });
    return allBlocks;
  }
  return blocks;
}
1986
/**
 * Text-only block extraction, used when no ruling-line grid applies.
 *
 * If the whole item set shows three or more columns, it is emitted as a single
 * paragraph produced by `extractWithColumns`. Otherwise the items are split
 * with `xyCutOrder`; each group is emitted either as one column-formatted
 * paragraph or as one paragraph per text line.
 *
 * The vertical extent is computed with a loop rather than
 * `Math.max(...ys)`: spreading one argument per item throws a RangeError once
 * the item count exceeds the engine's argument limit.
 *
 * @param {Array} items - Text items with `x`, `y`, `w`, `h`, `fontSize`.
 * @param {number} pageNum - Page number stored on every block.
 * @returns {Array} Paragraph blocks with `text`, `pageNumber`, `bbox`, `style`.
 */
function extractPageBlocksFallback(items, pageNum) {
  if (items.length === 0) return [];
  const blocks = [];
  // Appends one paragraph block whose bbox/style come from `sourceItems`.
  const pushParagraph = (text, sourceItems) => {
    const bbox = computeBBox(sourceItems, pageNum);
    blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(sourceItems) });
  };

  const allYLines = groupByY(items);
  const columns = detectColumns(allYLines);
  if (columns && columns.length >= 3) {
    pushParagraph(extractWithColumns(allYLines, columns), items);
    return blocks;
  }

  let minY = Infinity;
  let maxY = -Infinity;
  for (const item of items) {
    minY = Math.min(minY, item.y);
    maxY = Math.max(maxY, item.y);
  }
  // Despite the name used upstream, this is the vertical extent of the text,
  // not the page height.
  const textExtent = maxY - minY;
  const gapThreshold = Math.max(15, textExtent * 0.03);

  for (const group of xyCutOrder(items, gapThreshold)) {
    if (group.length === 0) continue;
    const yLines = groupByY(group);
    const groupColumns = detectColumns(yLines);
    if (groupColumns && groupColumns.length >= 3) {
      pushParagraph(extractWithColumns(yLines, groupColumns), group);
      continue;
    }
    for (const line of yLines) {
      const text = mergeLineSimple(line);
      if (!text.trim()) continue;
      pushParagraph(text, line);
    }
  }
  return blocks;
}
2020
/**
 * Computes the bounding box of a set of text items.
 *
 * An item's height falls back to its font size when `h` is not positive.
 *
 * @param {Array} items - Items with `x`, `y`, `w`, `h`, `fontSize`.
 * @param {number} pageNum - Page number stored on the box.
 * @returns {{page: number, x: number, y: number, width: number, height: number}}
 */
function computeBBox(items, pageNum) {
  let left = Infinity;
  let bottom = Infinity;
  let right = -Infinity;
  let top = -Infinity;
  items.forEach((item) => {
    const itemRight = item.x + item.w;
    const itemTop = item.y + (item.h > 0 ? item.h : item.fontSize);
    left = item.x < left ? item.x : left;
    bottom = item.y < bottom ? item.y : bottom;
    right = itemRight > right ? itemRight : right;
    top = itemTop > top ? itemTop : top;
  });
  return { page: pageNum, x: left, y: bottom, width: right - left, height: top - bottom };
}
2031
/**
 * Picks the most frequent positive font size among the items, together with
 * the font name of the first item using that size.
 *
 * When several sizes are equally frequent, the one that reached the top count
 * first while scanning wins.
 *
 * @param {Array} items - Items with `fontSize` and `fontName`.
 * @returns {{fontSize: number, fontName: (string|undefined)}|undefined}
 *   Undefined when there are no items with a positive font size.
 */
function dominantStyle(items) {
  if (items.length === 0) return void 0;

  const tally = /* @__PURE__ */ new Map();
  let leaderCount = 0;
  let leaderSize = 0;
  items.forEach(({ fontSize }) => {
    if (fontSize <= 0) return;
    const seen = (tally.get(fontSize) || 0) + 1;
    tally.set(fontSize, seen);
    if (seen > leaderCount) {
      leaderCount = seen;
      leaderSize = fontSize;
    }
  });
  if (leaderSize === 0) return void 0;

  const sample = items.find((item) => item.fontSize === leaderSize);
  return { fontSize: leaderSize, fontName: sample?.fontName || void 0 };
}
1057
2048
  function normalizeItems(rawItems) {
1058
- return rawItems.filter((i) => typeof i.str === "string" && i.str.trim() !== "").map((i) => ({
1059
- text: i.str.trim(),
1060
- x: Math.round(i.transform[4]),
1061
- y: Math.round(i.transform[5]),
1062
- w: Math.round(i.width),
1063
- h: Math.round(i.height)
1064
- })).sort((a, b) => b.y - a.y || a.x - b.x);
2049
+ return rawItems.filter((i) => typeof i.str === "string" && i.str.trim() !== "").map((i) => {
2050
+ const scaleY = Math.abs(i.transform[3]);
2051
+ const scaleX = Math.abs(i.transform[0]);
2052
+ const fontSize = Math.round(Math.max(scaleY, scaleX));
2053
+ return {
2054
+ text: i.str.trim(),
2055
+ x: Math.round(i.transform[4]),
2056
+ y: Math.round(i.transform[5]),
2057
+ w: Math.round(i.width),
2058
+ h: Math.round(i.height),
2059
+ fontSize,
2060
+ fontName: i.fontName || "",
2061
+ // 0pt 폰트이거나 너비 0 → hidden text (prompt injection 의심)
2062
+ isHidden: fontSize === 0 || i.width === 0 && i.str.trim().length > 0
2063
+ };
2064
+ }).sort((a, b) => b.y - a.y || a.x - b.x);
1065
2065
  }
1066
2066
  function groupByY(items) {
1067
2067
  if (items.length === 0) return [];
@@ -1299,6 +2299,27 @@ function startsWithMarker(line) {
1299
2299
  function isStandaloneHeader(line) {
1300
2300
  return /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/.test(line.trim());
1301
2301
  }
2302
/**
 * Re-tags paragraphs that start with "<digits>. " as ordered list items.
 *
 * Matching blocks are copied with `type: "list"` and `listType: "ordered"`;
 * the text, including its original number, is kept as is. All other blocks are
 * passed through unchanged (same object).
 *
 * @param {Array} blocks - Blocks to scan (not modified).
 * @returns {Array} A new array of blocks in the same order.
 */
function detectListBlocks(blocks) {
  const numberedStart = /^(\d+)\.\s/;
  return blocks.map((block) => {
    const isNumbered =
      block.type === "paragraph" && block.text && numberedStart.test(block.text);
    if (!isNumbered) return block;
    return {
      ...block,
      type: "list",
      listType: "ordered",
      // Keep the original number in the text; it is meant to be emitted verbatim.
      text: block.text
    };
  });
}
1302
2323
  function mergeKoreanLines(text) {
1303
2324
  if (!text) return "";
1304
2325
  const lines = text.split("\n");
@@ -1307,6 +2328,10 @@ function mergeKoreanLines(text) {
1307
2328
  for (let i = 1; i < lines.length; i++) {
1308
2329
  const prev = result[result.length - 1];
1309
2330
  const curr = lines[i];
2331
+ if (/^#{1,6}\s/.test(prev) || /^#{1,6}\s/.test(curr)) {
2332
+ result.push(curr);
2333
+ continue;
2334
+ }
1310
2335
  if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
1311
2336
  result[result.length - 1] = prev + " " + curr;
1312
2337
  } else {
@@ -1448,12 +2473,13 @@ function fallbackAlign(a, b) {
1448
2473
  }
1449
2474
  function blockSimilarity(a, b) {
1450
2475
  if (a.type !== b.type) return 0;
1451
- if (a.type === "paragraph") {
2476
+ if (a.text !== void 0 && b.text !== void 0) {
1452
2477
  return normalizedSimilarity(a.text || "", b.text || "");
1453
2478
  }
1454
2479
  if (a.type === "table" && a.table && b.table) {
1455
2480
  return tableSimilarity(a.table, b.table);
1456
2481
  }
2482
+ if (a.type === b.type) return 1;
1457
2483
  return 0;
1458
2484
  }
1459
2485
  function tableSimilarity(a, b) {
@@ -1724,16 +2750,16 @@ async function parse(buffer, options) {
1724
2750
  }
1725
2751
  async function parseHwpx(buffer, options) {
1726
2752
  try {
1727
- const { markdown, blocks, metadata } = await parseHwpxDocument(buffer, options);
1728
- return { success: true, fileType: "hwpx", markdown, blocks, metadata };
2753
+ const { markdown, blocks, metadata, outline, warnings } = await parseHwpxDocument(buffer, options);
2754
+ return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings };
1729
2755
  } catch (err) {
1730
2756
  return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
1731
2757
  }
1732
2758
  }
1733
2759
  async function parseHwp(buffer, options) {
1734
2760
  try {
1735
- const { markdown, blocks, metadata } = parseHwp5Document(Buffer.from(buffer), options);
1736
- return { success: true, fileType: "hwp", markdown, blocks, metadata };
2761
+ const { markdown, blocks, metadata, outline, warnings } = parseHwp5Document(Buffer.from(buffer), options);
2762
+ return { success: true, fileType: "hwp", markdown, blocks, metadata, outline, warnings };
1737
2763
  } catch (err) {
1738
2764
  return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
1739
2765
  }