kordoc 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -85,8 +85,29 @@ function blocksToMarkdown(blocks) {
85
85
  const lines = [];
86
86
  for (let i = 0; i < blocks.length; i++) {
87
87
  const block = blocks[i];
88
+ if (block.type === "heading" && block.text) {
89
+ const prefix = "#".repeat(Math.min(block.level || 2, 6));
90
+ lines.push("", `${prefix} ${block.text}`, "");
91
+ continue;
92
+ }
93
+ if (block.type === "separator") {
94
+ lines.push("", "---", "");
95
+ continue;
96
+ }
97
+ if (block.type === "list" && block.text) {
98
+ const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(block.text);
99
+ const prefix = alreadyNumbered ? "" : block.listType === "ordered" ? "1. " : "- ";
100
+ lines.push(`${prefix}${block.text}`);
101
+ if (block.children) {
102
+ for (const child of block.children) {
103
+ const childPrefix = child.listType === "ordered" ? "1." : "-";
104
+ lines.push(` ${childPrefix} ${child.text || ""}`);
105
+ }
106
+ }
107
+ continue;
108
+ }
88
109
  if (block.type === "paragraph" && block.text) {
89
- const text = block.text;
110
+ let text = block.text;
90
111
  if (/^\[별표\s*\d+/.test(text)) {
91
112
  const nextBlock = blocks[i + 1];
92
113
  if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
@@ -101,9 +122,19 @@ function blocksToMarkdown(blocks) {
101
122
  lines.push(`*${text}*`, "");
102
123
  continue;
103
124
  }
125
+ if (block.href) {
126
+ text = `[${text}](${block.href})`;
127
+ }
128
+ if (block.footnoteText) {
129
+ text += ` (\uC8FC: ${block.footnoteText})`;
130
+ }
104
131
  lines.push(text);
105
132
  } else if (block.type === "table" && block.table) {
133
+ if (lines.length > 0 && lines[lines.length - 1] !== "") {
134
+ lines.push("");
135
+ }
106
136
  lines.push(tableToMarkdown(block.table));
137
+ lines.push("");
107
138
  }
108
139
  }
109
140
  return lines.join("\n").trim();
@@ -154,7 +185,7 @@ function tableToMarkdown(table) {
154
185
  }
155
186
 
156
187
  // src/utils.ts
157
- var VERSION = true ? "1.4.0" : "0.0.0-dev";
188
+ var VERSION = true ? "1.5.0" : "0.0.0-dev";
158
189
  function toArrayBuffer(buf) {
159
190
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
160
191
  return buf.buffer;
@@ -228,6 +259,75 @@ var MAX_ZIP_ENTRIES = 500;
228
259
  function clampSpan(val, max) {
229
260
  return Math.max(1, Math.min(val, max));
230
261
  }
262
+ async function extractHwpxStyles(zip) {
263
+ const result = {
264
+ charProperties: /* @__PURE__ */ new Map(),
265
+ styles: /* @__PURE__ */ new Map()
266
+ };
267
+ const headerPaths = ["Contents/header.xml", "header.xml", "Contents/head.xml", "head.xml"];
268
+ for (const hp of headerPaths) {
269
+ const hpLower = hp.toLowerCase();
270
+ const file = zip.file(hp) || Object.values(zip.files).find((f) => f.name.toLowerCase() === hpLower) || null;
271
+ if (!file) continue;
272
+ try {
273
+ const xml = await file.async("text");
274
+ const parser = new DOMParser();
275
+ const doc = parser.parseFromString(stripDtd(xml), "text/xml");
276
+ if (!doc.documentElement) continue;
277
+ parseCharProperties(doc, result.charProperties);
278
+ parseStyleElements(doc, result.styles);
279
+ break;
280
+ } catch {
281
+ continue;
282
+ }
283
+ }
284
+ return result;
285
+ }
286
+ function parseCharProperties(doc, map) {
287
+ const tagNames = ["hh:charPr", "charPr", "hp:charPr"];
288
+ for (const tagName of tagNames) {
289
+ const elements = doc.getElementsByTagName(tagName);
290
+ for (let i = 0; i < elements.length; i++) {
291
+ const el = elements[i];
292
+ const id = el.getAttribute("id") || el.getAttribute("IDRef") || "";
293
+ if (!id) continue;
294
+ const prop = {};
295
+ const height = el.getAttribute("height");
296
+ if (height) prop.fontSize = parseInt(height, 10) / 100;
297
+ const bold = el.getAttribute("bold");
298
+ if (bold === "true" || bold === "1") prop.bold = true;
299
+ const italic = el.getAttribute("italic");
300
+ if (italic === "true" || italic === "1") prop.italic = true;
301
+ const fontFaces = el.getElementsByTagName("*");
302
+ for (let j = 0; j < fontFaces.length; j++) {
303
+ const ff = fontFaces[j];
304
+ const localTag = (ff.tagName || "").replace(/^[^:]+:/, "");
305
+ if (localTag === "fontface" || localTag === "fontRef") {
306
+ const face = ff.getAttribute("face") || ff.getAttribute("FontFace");
307
+ if (face) {
308
+ prop.fontName = face;
309
+ break;
310
+ }
311
+ }
312
+ }
313
+ map.set(id, prop);
314
+ }
315
+ }
316
+ }
317
+ function parseStyleElements(doc, map) {
318
+ const tagNames = ["hh:style", "style", "hp:style"];
319
+ for (const tagName of tagNames) {
320
+ const elements = doc.getElementsByTagName(tagName);
321
+ for (let i = 0; i < elements.length; i++) {
322
+ const el = elements[i];
323
+ const id = el.getAttribute("id") || el.getAttribute("IDRef") || String(i);
324
+ const name = el.getAttribute("name") || el.getAttribute("engName") || "";
325
+ const charPrId = el.getAttribute("charPrIDRef") || void 0;
326
+ const paraPrId = el.getAttribute("paraPrIDRef") || void 0;
327
+ map.set(id, { name, charPrId, paraPrId });
328
+ }
329
+ }
330
+ }
231
331
  function stripDtd(xml) {
232
332
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
233
333
  }
@@ -251,6 +351,8 @@ async function parseHwpxDocument(buffer, options) {
251
351
  }
252
352
  const metadata = {};
253
353
  await extractHwpxMetadata(zip, metadata);
354
+ const styleMap = await extractHwpxStyles(zip);
355
+ const warnings = [];
254
356
  const sectionPaths = await resolveSectionPaths(zip);
255
357
  if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
256
358
  metadata.pageCount = sectionPaths.length;
@@ -264,10 +366,12 @@ async function parseHwpxDocument(buffer, options) {
264
366
  const xml = await file.async("text");
265
367
  totalDecompressed += xml.length * 2;
266
368
  if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
267
- blocks.push(...parseSectionXml(xml));
369
+ blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
268
370
  }
371
+ detectHwpxHeadings(blocks, styleMap);
372
+ const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
269
373
  const markdown = blocksToMarkdown(blocks);
270
- return { markdown, blocks, metadata };
374
+ return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
271
375
  }
272
376
  async function extractHwpxMetadata(zip, metadata) {
273
377
  try {
@@ -446,15 +550,50 @@ function parseSectionPathsFromManifest(xml) {
446
550
  }
447
551
  return Array.from(idToHref.entries()).filter(([id]) => isSectionId(id)).sort((a, b) => a[0].localeCompare(b[0])).map(([, href]) => href);
448
552
  }
449
- function parseSectionXml(xml) {
553
+ function detectHwpxHeadings(blocks, styleMap) {
554
+ let baseFontSize = 0;
555
+ const sizeFreq = /* @__PURE__ */ new Map();
556
+ for (const b of blocks) {
557
+ if (b.style?.fontSize) {
558
+ sizeFreq.set(b.style.fontSize, (sizeFreq.get(b.style.fontSize) || 0) + 1);
559
+ }
560
+ }
561
+ let maxCount = 0;
562
+ for (const [size, count] of sizeFreq) {
563
+ if (count > maxCount) {
564
+ maxCount = count;
565
+ baseFontSize = size;
566
+ }
567
+ }
568
+ for (const block of blocks) {
569
+ if (block.type !== "paragraph" || !block.text) continue;
570
+ const text = block.text.trim();
571
+ if (text.length === 0 || text.length > 200 || /^\d+$/.test(text)) continue;
572
+ let level = 0;
573
+ if (baseFontSize > 0 && block.style?.fontSize) {
574
+ const ratio = block.style.fontSize / baseFontSize;
575
+ if (ratio >= 1.5) level = 1;
576
+ else if (ratio >= 1.3) level = 2;
577
+ else if (ratio >= 1.15) level = 3;
578
+ }
579
+ if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
580
+ if (level === 0) level = 3;
581
+ }
582
+ if (level > 0) {
583
+ block.type = "heading";
584
+ block.level = level;
585
+ }
586
+ }
587
+ }
588
+ function parseSectionXml(xml, styleMap, warnings, sectionNum) {
450
589
  const parser = new DOMParser();
451
590
  const doc = parser.parseFromString(stripDtd(xml), "text/xml");
452
591
  if (!doc.documentElement) return [];
453
592
  const blocks = [];
454
- walkSection(doc.documentElement, blocks, null, []);
593
+ walkSection(doc.documentElement, blocks, null, [], styleMap, warnings, sectionNum);
455
594
  return blocks;
456
595
  }
457
- function walkSection(node, blocks, tableCtx, tableStack) {
596
+ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum) {
458
597
  const children = node.childNodes;
459
598
  if (!children) return;
460
599
  for (let i = 0; i < children.length; i++) {
@@ -466,7 +605,7 @@ function walkSection(node, blocks, tableCtx, tableStack) {
466
605
  case "tbl": {
467
606
  if (tableCtx) tableStack.push(tableCtx);
468
607
  const newTable = { rows: [], currentRow: [], cell: null };
469
- walkSection(el, blocks, newTable, tableStack);
608
+ walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum);
470
609
  if (newTable.rows.length > 0) {
471
610
  if (tableStack.length > 0) {
472
611
  const parentTable = tableStack.pop();
@@ -476,7 +615,7 @@ function walkSection(node, blocks, tableCtx, tableStack) {
476
615
  }
477
616
  tableCtx = parentTable;
478
617
  } else {
479
- blocks.push({ type: "table", table: buildTable(newTable.rows) });
618
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
480
619
  tableCtx = null;
481
620
  }
482
621
  } else {
@@ -487,7 +626,7 @@ function walkSection(node, blocks, tableCtx, tableStack) {
487
626
  case "tr":
488
627
  if (tableCtx) {
489
628
  tableCtx.currentRow = [];
490
- walkSection(el, blocks, tableCtx, tableStack);
629
+ walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum);
491
630
  if (tableCtx.currentRow.length > 0) tableCtx.rows.push(tableCtx.currentRow);
492
631
  tableCtx.currentRow = [];
493
632
  }
@@ -495,7 +634,7 @@ function walkSection(node, blocks, tableCtx, tableStack) {
495
634
  case "tc":
496
635
  if (tableCtx) {
497
636
  tableCtx.cell = { text: "", colSpan: 1, rowSpan: 1 };
498
- walkSection(el, blocks, tableCtx, tableStack);
637
+ walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum);
499
638
  if (tableCtx.cell) {
500
639
  tableCtx.currentRow.push(tableCtx.cell);
501
640
  tableCtx.cell = null;
@@ -511,25 +650,75 @@ function walkSection(node, blocks, tableCtx, tableStack) {
511
650
  }
512
651
  break;
513
652
  case "p": {
514
- const text = extractParagraphText(el);
653
+ const { text, href, footnote, style } = extractParagraphInfo(el, styleMap);
515
654
  if (text) {
516
655
  if (tableCtx?.cell) {
517
656
  tableCtx.cell.text += (tableCtx.cell.text ? "\n" : "") + text;
518
657
  } else if (!tableCtx) {
519
- blocks.push({ type: "paragraph", text });
658
+ const block = { type: "paragraph", text, pageNumber: sectionNum };
659
+ if (style) block.style = style;
660
+ if (href) block.href = href;
661
+ if (footnote) block.footnoteText = footnote;
662
+ blocks.push(block);
520
663
  }
521
664
  }
522
- walkSection(el, blocks, tableCtx, tableStack);
665
+ tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum);
523
666
  break;
524
667
  }
668
+ // 이미지/그림 — 경고 수집
669
+ case "pic":
670
+ case "shape":
671
+ case "drawingObject":
672
+ if (warnings && sectionNum) {
673
+ warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
674
+ }
675
+ break;
525
676
  default:
526
- walkSection(el, blocks, tableCtx, tableStack);
677
+ walkSection(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum);
527
678
  break;
528
679
  }
529
680
  }
530
681
  }
531
- function extractParagraphText(para) {
682
+ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum) {
683
+ const children = node.childNodes;
684
+ if (!children) return tableCtx;
685
+ for (let i = 0; i < children.length; i++) {
686
+ const el = children[i];
687
+ if (el.nodeType !== 1) continue;
688
+ const tag = el.tagName || el.localName || "";
689
+ const localTag = tag.replace(/^[^:]+:/, "");
690
+ if (localTag === "tbl") {
691
+ if (tableCtx) tableStack.push(tableCtx);
692
+ const newTable = { rows: [], currentRow: [], cell: null };
693
+ walkSection(el, blocks, newTable, tableStack, styleMap, warnings, sectionNum);
694
+ if (newTable.rows.length > 0) {
695
+ if (tableStack.length > 0) {
696
+ const parentTable = tableStack.pop();
697
+ const nestedText = convertTableToText(newTable.rows);
698
+ if (parentTable.cell) {
699
+ parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
700
+ }
701
+ tableCtx = parentTable;
702
+ } else {
703
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
704
+ tableCtx = null;
705
+ }
706
+ } else {
707
+ tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
708
+ }
709
+ } else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
710
+ if (warnings && sectionNum) {
711
+ warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
712
+ }
713
+ }
714
+ }
715
+ return tableCtx;
716
+ }
717
+ function extractParagraphInfo(para, styleMap) {
532
718
  let text = "";
719
+ let href;
720
+ let footnote;
721
+ let charPrId;
533
722
  const walk = (node) => {
534
723
  const children = node.childNodes;
535
724
  if (!children) return;
@@ -558,6 +747,29 @@ function extractParagraphText(para) {
558
747
  case "tbl":
559
748
  break;
560
749
  // 테이블은 walkSection에서 처리
750
+ // 하이퍼링크
751
+ case "hyperlink": {
752
+ const url = child.getAttribute("url") || child.getAttribute("href") || "";
753
+ if (url) href = url;
754
+ walk(child);
755
+ break;
756
+ }
757
+ // 각주/미주
758
+ case "footNote":
759
+ case "endNote":
760
+ case "fn":
761
+ case "en": {
762
+ const noteText = extractTextFromNode(child);
763
+ if (noteText) footnote = (footnote ? footnote + "; " : "") + noteText;
764
+ break;
765
+ }
766
+ // run 요소에서 charPrIDRef 추출
767
+ case "r": {
768
+ const runCharPr = child.getAttribute("charPrIDRef");
769
+ if (runCharPr && !charPrId) charPrId = runCharPr;
770
+ walk(child);
771
+ break;
772
+ }
561
773
  default:
562
774
  walk(child);
563
775
  break;
@@ -565,16 +777,43 @@ function extractParagraphText(para) {
565
777
  }
566
778
  };
567
779
  walk(para);
568
- return text.replace(/[ \t]+/g, " ").trim();
780
+ const cleanText = text.replace(/[ \t]+/g, " ").trim();
781
+ let style;
782
+ if (styleMap && charPrId) {
783
+ const charProp = styleMap.charProperties.get(charPrId);
784
+ if (charProp) {
785
+ style = {};
786
+ if (charProp.fontSize) style.fontSize = charProp.fontSize;
787
+ if (charProp.bold) style.bold = true;
788
+ if (charProp.italic) style.italic = true;
789
+ if (charProp.fontName) style.fontName = charProp.fontName;
790
+ if (!style.fontSize && !style.bold && !style.italic) style = void 0;
791
+ }
792
+ }
793
+ return { text: cleanText, href, footnote, style };
794
+ }
795
+ function extractTextFromNode(node) {
796
+ let result = "";
797
+ const children = node.childNodes;
798
+ if (!children) return result;
799
+ for (let i = 0; i < children.length; i++) {
800
+ const child = children[i];
801
+ if (child.nodeType === 3) result += child.textContent || "";
802
+ else if (child.nodeType === 1) result += extractTextFromNode(child);
803
+ }
804
+ return result.trim();
569
805
  }
570
806
 
571
807
  // src/hwp5/record.ts
572
808
  import { inflateRawSync as inflateRawSync2, inflateSync } from "zlib";
573
809
  var TAG_PARA_HEADER = 66;
574
810
  var TAG_PARA_TEXT = 67;
811
+ var TAG_CHAR_SHAPE = 68;
575
812
  var TAG_CTRL_HEADER = 71;
576
813
  var TAG_LIST_HEADER = 72;
577
814
  var TAG_TABLE = 77;
815
+ var TAG_DOC_CHAR_SHAPE = 55;
816
+ var TAG_DOC_STYLE = 58;
578
817
  var CHAR_LINE = 0;
579
818
  var CHAR_PARA = 13;
580
819
  var CHAR_TAB = 9;
@@ -625,6 +864,51 @@ function parseFileHeader(data) {
625
864
  flags: data.readUInt32LE(36)
626
865
  };
627
866
  }
867
+ function parseDocInfo(records) {
868
+ const charShapes = [];
869
+ const styles = [];
870
+ for (const rec of records) {
871
+ if (rec.tagId === TAG_DOC_CHAR_SHAPE && rec.data.length >= 18) {
872
+ if (rec.data.length >= 50) {
873
+ const fontSize = rec.data.readUInt32LE(42);
874
+ const attrFlags = rec.data.readUInt32LE(46);
875
+ charShapes.push({ fontSize, attrFlags });
876
+ } else {
877
+ charShapes.push({ fontSize: 0, attrFlags: 0 });
878
+ }
879
+ }
880
+ if (rec.tagId === TAG_DOC_STYLE && rec.data.length >= 8) {
881
+ try {
882
+ let offset = 0;
883
+ const nameLen = rec.data.readUInt16LE(offset);
884
+ offset += 2;
885
+ const nameBytes = nameLen * 2;
886
+ const name = nameBytes > 0 && offset + nameBytes <= rec.data.length ? rec.data.subarray(offset, offset + nameBytes).toString("utf16le") : "";
887
+ offset += nameBytes;
888
+ let nameKo = "";
889
+ if (offset + 2 <= rec.data.length) {
890
+ const nameKoLen = rec.data.readUInt16LE(offset);
891
+ offset += 2;
892
+ const nameKoBytes = nameKoLen * 2;
893
+ if (nameKoBytes > 0 && offset + nameKoBytes <= rec.data.length) {
894
+ nameKo = rec.data.subarray(offset, offset + nameKoBytes).toString("utf16le");
895
+ }
896
+ offset += nameKoBytes;
897
+ }
898
+ const type = offset < rec.data.length ? rec.data.readUInt8(offset) : 0;
899
+ offset += 1;
900
+ offset += 2;
901
+ offset += 2;
902
+ const paraShapeId = offset + 2 <= rec.data.length ? rec.data.readUInt16LE(offset) : 0;
903
+ offset += 2;
904
+ const charShapeId = offset + 2 <= rec.data.length ? rec.data.readUInt16LE(offset) : 0;
905
+ styles.push({ name, nameKo, charShapeId, paraShapeId, type });
906
+ } catch {
907
+ }
908
+ }
909
+ }
910
+ return { charShapes, styles };
911
+ }
628
912
  function extractText(data) {
629
913
  let result = "";
630
914
  let i = 0;
@@ -689,6 +973,8 @@ function parseHwp5Document(buffer, options) {
689
973
  version: `${header.versionMajor}.x`
690
974
  };
691
975
  extractHwp5Metadata(cfb, metadata);
976
+ const docInfo = parseDocInfoStream(cfb, compressed);
977
+ const warnings = [];
692
978
  const sections = findSections(cfb);
693
979
  if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
694
980
  metadata.pageCount = sections.length;
@@ -702,10 +988,73 @@ function parseHwp5Document(buffer, options) {
702
988
  totalDecompressed += data.length;
703
989
  if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
704
990
  const records = readRecords(data);
705
- blocks.push(...parseSection(records));
991
+ const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
992
+ blocks.push(...sectionBlocks);
706
993
  }
994
+ if (docInfo) {
995
+ detectHwp5Headings(blocks, docInfo);
996
+ }
997
+ const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
707
998
  const markdown = blocksToMarkdown(blocks);
708
- return { markdown, blocks, metadata };
999
+ return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
1000
+ }
1001
+ function parseDocInfoStream(cfb, compressed) {
1002
+ try {
1003
+ const entry = CFB.find(cfb, "/DocInfo");
1004
+ if (!entry?.content) return null;
1005
+ const data = compressed ? decompressStream(Buffer.from(entry.content)) : Buffer.from(entry.content);
1006
+ const records = readRecords(data);
1007
+ return parseDocInfo(records);
1008
+ } catch {
1009
+ return null;
1010
+ }
1011
+ }
1012
+ function detectHwp5Headings(blocks, docInfo) {
1013
+ let baseFontSize = 0;
1014
+ for (const style of docInfo.styles) {
1015
+ const name = (style.nameKo || style.name).toLowerCase();
1016
+ if (name.includes("\uBC14\uD0D5") || name.includes("\uBCF8\uBB38") || name === "normal" || name === "body") {
1017
+ const cs = docInfo.charShapes[style.charShapeId];
1018
+ if (cs?.fontSize > 0) {
1019
+ baseFontSize = cs.fontSize / 10;
1020
+ break;
1021
+ }
1022
+ }
1023
+ }
1024
+ if (baseFontSize === 0) {
1025
+ const sizeFreq = /* @__PURE__ */ new Map();
1026
+ for (const b of blocks) {
1027
+ if (b.style?.fontSize) {
1028
+ sizeFreq.set(b.style.fontSize, (sizeFreq.get(b.style.fontSize) || 0) + 1);
1029
+ }
1030
+ }
1031
+ let maxCount = 0;
1032
+ for (const [size, count] of sizeFreq) {
1033
+ if (count > maxCount) {
1034
+ maxCount = count;
1035
+ baseFontSize = size;
1036
+ }
1037
+ }
1038
+ }
1039
+ if (baseFontSize <= 0) return;
1040
+ for (const block of blocks) {
1041
+ if (block.type !== "paragraph" || !block.text || !block.style?.fontSize) continue;
1042
+ const text = block.text.trim();
1043
+ if (text.length === 0 || text.length > 200) continue;
1044
+ if (/^\d+$/.test(text)) continue;
1045
+ const ratio = block.style.fontSize / baseFontSize;
1046
+ let level = 0;
1047
+ if (ratio >= 1.5) level = 1;
1048
+ else if (ratio >= 1.3) level = 2;
1049
+ else if (ratio >= 1.15) level = 3;
1050
+ if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
1051
+ if (level === 0) level = 3;
1052
+ }
1053
+ if (level > 0) {
1054
+ block.type = "heading";
1055
+ block.level = level;
1056
+ }
1057
+ }
709
1058
  }
710
1059
  function extractHwp5Metadata(cfb, metadata) {
711
1060
  try {
@@ -771,15 +1120,22 @@ function findSections(cfb) {
771
1120
  }
772
1121
  return sections.sort((a, b) => a.idx - b.idx).map((s) => s.content);
773
1122
  }
774
- function parseSection(records) {
1123
+ function parseSection(records, docInfo, warnings, sectionNum) {
775
1124
  const blocks = [];
776
1125
  let i = 0;
777
1126
  while (i < records.length) {
778
1127
  const rec = records[i];
779
1128
  if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
780
- const { paragraph, tables, nextIdx } = parseParagraphWithTables(records, i);
781
- if (paragraph) blocks.push({ type: "paragraph", text: paragraph });
782
- for (const t of tables) blocks.push({ type: "table", table: t });
1129
+ const { paragraph, tables, nextIdx, charShapeIds } = parseParagraphWithTables(records, i);
1130
+ if (paragraph) {
1131
+ const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
1132
+ if (docInfo && charShapeIds.length > 0) {
1133
+ const style = resolveCharStyle(charShapeIds, docInfo);
1134
+ if (style) block.style = style;
1135
+ }
1136
+ blocks.push(block);
1137
+ }
1138
+ for (const t of tables) blocks.push({ type: "table", table: t, pageNumber: sectionNum });
783
1139
  i = nextIdx;
784
1140
  continue;
785
1141
  }
@@ -787,19 +1143,43 @@ function parseSection(records) {
787
1143
  const ctrlId = rec.data.subarray(0, 4).toString("ascii");
788
1144
  if (ctrlId === " lbt" || ctrlId === "tbl ") {
789
1145
  const { table, nextIdx } = parseTableBlock(records, i);
790
- if (table) blocks.push({ type: "table", table });
1146
+ if (table) blocks.push({ type: "table", table, pageNumber: sectionNum });
791
1147
  i = nextIdx;
792
1148
  continue;
793
1149
  }
1150
+ if (ctrlId === "gso " || ctrlId === " osg" || ctrlId === " elo" || ctrlId === "ole ") {
1151
+ warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
1152
+ }
794
1153
  }
795
1154
  i++;
796
1155
  }
797
1156
  return blocks;
798
1157
  }
1158
+ function resolveCharStyle(charShapeIds, docInfo) {
1159
+ if (charShapeIds.length === 0 || docInfo.charShapes.length === 0) return void 0;
1160
+ const freq = /* @__PURE__ */ new Map();
1161
+ let maxCount = 0, dominantId = charShapeIds[0];
1162
+ for (const id of charShapeIds) {
1163
+ const count = (freq.get(id) || 0) + 1;
1164
+ freq.set(id, count);
1165
+ if (count > maxCount) {
1166
+ maxCount = count;
1167
+ dominantId = id;
1168
+ }
1169
+ }
1170
+ const cs = docInfo.charShapes[dominantId];
1171
+ if (!cs) return void 0;
1172
+ const style = {};
1173
+ if (cs.fontSize > 0) style.fontSize = cs.fontSize / 10;
1174
+ if (cs.attrFlags & 1) style.italic = true;
1175
+ if (cs.attrFlags & 2) style.bold = true;
1176
+ return style.fontSize || style.bold || style.italic ? style : void 0;
1177
+ }
799
1178
  function parseParagraphWithTables(records, startIdx) {
800
1179
  const startLevel = records[startIdx].level;
801
1180
  let text = "";
802
1181
  const tables = [];
1182
+ const charShapeIds = [];
803
1183
  let i = startIdx + 1;
804
1184
  while (i < records.length) {
805
1185
  const rec = records[i];
@@ -807,6 +1187,11 @@ function parseParagraphWithTables(records, startIdx) {
807
1187
  if (rec.tagId === TAG_PARA_TEXT) {
808
1188
  text = extractText(rec.data);
809
1189
  }
1190
+ if (rec.tagId === TAG_CHAR_SHAPE && rec.data.length >= 8) {
1191
+ for (let offset = 0; offset + 7 < rec.data.length; offset += 8) {
1192
+ charShapeIds.push(rec.data.readUInt32LE(offset + 4));
1193
+ }
1194
+ }
810
1195
  if (rec.tagId === TAG_CTRL_HEADER && rec.data.length >= 4) {
811
1196
  const ctrlId = rec.data.subarray(0, 4).toString("ascii");
812
1197
  if (ctrlId === " lbt" || ctrlId === "tbl ") {
@@ -819,7 +1204,7 @@ function parseParagraphWithTables(records, startIdx) {
819
1204
  i++;
820
1205
  }
821
1206
  const trimmed = text.trim();
822
- return { paragraph: trimmed || null, tables, nextIdx: i };
1207
+ return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds };
823
1208
  }
824
1209
  function parseTableBlock(records, startIdx) {
825
1210
  const tableLevel = records[startIdx].level;
@@ -891,6 +1276,355 @@ function arrangeCells(rows, cols, cells) {
891
1276
  return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
892
1277
  }
893
1278
 
1279
+ // src/pdf/line-detector.ts
1280
+ import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
1281
// Largest deviation from a pure horizontal/vertical for a segment to count as
// axis-aligned; twice this value is the "thin rectangle" cut-off below.
var ORIENTATION_TOL = 2;
// Segments shorter than this are discarded by classifyAndAdd.
var MIN_LINE_LENGTH = 10;
// Coordinates closer than this are merged into one grid line.
var COORD_MERGE_TOL = 3;
// Slack allowed when deciding whether two lines touch.
var CONNECT_TOL = 5;
// Slack added around a cell's bbox when assigning text to it.
var CELL_PADDING = 2;

/**
 * Walks a pdf.js operator list and collects the horizontal and vertical line
 * segments of every painted path.
 *
 * Path segments are accumulated while `constructPath` operators are read and
 * handed to `classifyAndAdd` when a painting operator is reached; `endPath`
 * discards the pending segments instead.
 *
 * @param {number[]} fnArray - Operator codes (compared against `OPS`).
 * @param {Array} argsArray - Arguments for each operator, parallel to `fnArray`.
 *   For `constructPath`, `args[0]` is the list of sub-operators and `args[1]`
 *   the flat coordinate list they consume.
 * @returns {{horizontals: object[], verticals: object[]}} Collected segments.
 */
function extractLines(fnArray, argsArray) {
  const horizontals = [];
  const verticals = [];
  let lineWidth = 1;
  let currentPath = [];
  let pathStartX = 0;
  let pathStartY = 0;
  let curX = 0;
  let curY = 0;

  // Emits (isStroke) or drops (!isStroke) the pending segments, then resets.
  function flushPath(isStroke) {
    if (isStroke) {
      for (const seg of currentPath) {
        classifyAndAdd(seg, lineWidth, horizontals, verticals);
      }
    }
    currentPath = [];
  }

  for (let i = 0; i < fnArray.length; i++) {
    const op = fnArray[i];
    const args = argsArray[i];
    switch (op) {
      case OPS.setLineWidth:
        lineWidth = args[0] || 1;
        break;
      case OPS.constructPath: {
        const subOps = args[0];
        const coords = args[1];
        let ci = 0;
        for (const subOp of subOps) {
          if (subOp === OPS.moveTo) {
            curX = coords[ci++];
            curY = coords[ci++];
            pathStartX = curX;
            pathStartY = curY;
          } else if (subOp === OPS.lineTo) {
            const x2 = coords[ci++];
            const y2 = coords[ci++];
            currentPath.push({ x1: curX, y1: curY, x2, y2 });
            curX = x2;
            curY = y2;
          } else if (subOp === OPS.rectangle) {
            const rx = coords[ci++];
            const ry = coords[ci++];
            const rw = coords[ci++];
            const rh = coords[ci++];
            if (Math.abs(rh) < ORIENTATION_TOL * 2) {
              // Very flat rectangle: treat it as one horizontal rule through its middle.
              currentPath.push({ x1: rx, y1: ry + rh / 2, x2: rx + rw, y2: ry + rh / 2 });
            } else if (Math.abs(rw) < ORIENTATION_TOL * 2) {
              // Very narrow rectangle: treat it as one vertical rule through its middle.
              currentPath.push({ x1: rx + rw / 2, y1: ry, x2: rx + rw / 2, y2: ry + rh });
            } else {
              currentPath.push(
                { x1: rx, y1: ry, x2: rx + rw, y2: ry }, // bottom
                { x1: rx + rw, y1: ry, x2: rx + rw, y2: ry + rh }, // right
                { x1: rx + rw, y1: ry + rh, x2: rx, y2: ry + rh }, // top
                { x1: rx, y1: ry + rh, x2: rx, y2: ry }, // left
              );
            }
            // A rectangle is a closed subpath that starts and ends at its
            // origin corner, so later segments must continue from there.
            curX = rx;
            curY = ry;
            pathStartX = rx;
            pathStartY = ry;
          } else if (subOp === OPS.closePath) {
            if (curX !== pathStartX || curY !== pathStartY) {
              currentPath.push({ x1: curX, y1: curY, x2: pathStartX, y2: pathStartY });
            }
            curX = pathStartX;
            curY = pathStartY;
          } else if (subOp === OPS.curveTo) {
            // Curves are not recorded as segments, but the current point still
            // has to move to the curve's end (the last coordinate pair);
            // otherwise a following lineTo would start from a stale position.
            ci += 4;
            curX = coords[ci++];
            curY = coords[ci++];
          } else if (subOp === OPS.curveTo2 || subOp === OPS.curveTo3) {
            ci += 2;
            curX = coords[ci++];
            curY = coords[ci++];
          }
        }
        break;
      }
      // Fill operators are flushed exactly like strokes. NOTE(review):
      // presumably because rules are often drawn as thin filled rectangles
      // (see the rectangle branch above) - confirm.
      case OPS.stroke:
      case OPS.closeStroke:
      case OPS.fill:
      case OPS.eoFill:
      case OPS.fillStroke:
      case OPS.eoFillStroke:
      case OPS.closeFillStroke:
      case OPS.closeEOFillStroke:
        flushPath(true);
        break;
      case OPS.endPath:
        flushPath(false);
        break;
    }
  }
  return { horizontals, verticals };
}
1377
/**
 * Files one path segment under `horizontals` or `verticals`.
 *
 * Segments shorter than MIN_LINE_LENGTH, and segments that deviate from both
 * axes by more than ORIENTATION_TOL, are ignored. Stored lines are normalised
 * so that x1 <= x2 (horizontal) or y1 <= y2 (vertical), with the fixed
 * coordinate set to the midpoint of the segment's two ends.
 *
 * @param {{x1: number, y1: number, x2: number, y2: number}} seg - Raw segment.
 * @param {number} lineWidth - Stroke width recorded on the stored line.
 * @param {object[]} horizontals - Output list, mutated in place.
 * @param {object[]} verticals - Output list, mutated in place.
 */
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
  const { x1: ax, y1: ay, x2: bx, y2: by } = seg;
  const spanX = Math.abs(bx - ax);
  const spanY = Math.abs(by - ay);
  if (Math.sqrt(spanX * spanX + spanY * spanY) < MIN_LINE_LENGTH) return;

  if (spanY <= ORIENTATION_TOL) {
    const midY = (ay + by) / 2;
    horizontals.push({
      x1: Math.min(ax, bx),
      y1: midY,
      x2: Math.max(ax, bx),
      y2: midY,
      lineWidth,
    });
    return;
  }

  if (spanX <= ORIENTATION_TOL) {
    const midX = (ax + bx) / 2;
    verticals.push({
      x1: midX,
      y1: Math.min(ay, by),
      x2: midX,
      y2: Math.max(ay, by),
      lineWidth,
    });
  }
}
1394
/**
 * Removes lines that look like a page frame.
 *
 * A line is dropped only when it lies within 5 units of a page edge AND is not
 * shorter than 90% of the page dimension it runs along; everything else is kept.
 *
 * @param {object[]} horizontals - Horizontal lines ({x1, y1, x2, y2, ...}).
 * @param {object[]} verticals - Vertical lines.
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {{horizontals: object[], verticals: object[]}} Filtered copies.
 */
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
  const margin = 5;
  const hugsEdge = (pos, extent) => Math.abs(pos) < margin || Math.abs(pos - extent) < margin;

  const keptHorizontals = [];
  for (const line of horizontals) {
    const isShort = line.x2 - line.x1 < pageWidth * 0.9;
    if (isShort || !hugsEdge(line.y1, pageHeight)) keptHorizontals.push(line);
  }

  const keptVerticals = [];
  for (const line of verticals) {
    const isShort = line.y2 - line.y1 < pageHeight * 0.9;
    if (isShort || !hugsEdge(line.x1, pageWidth)) keptVerticals.push(line);
  }

  return { horizontals: keptHorizontals, verticals: keptVerticals };
}
1405
/**
 * Builds one grid description per group of connected lines.
 *
 * Lines are tagged with their orientation, grouped by `groupConnectedLines`,
 * and each group with at least two distinct row and column coordinates (after
 * `clusterCoordinates`) becomes a grid.
 *
 * @param {object[]} horizontals - Horizontal lines.
 * @param {object[]} verticals - Vertical lines.
 * @returns {Array<{rowYs: number[], colXs: number[], bbox: object}>} Grids;
 *   `rowYs` is sorted descending, `colXs` ascending.
 */
function buildTableGrids(horizontals, verticals) {
  if (horizontals.length < 2 || verticals.length < 2) return [];

  const tagged = [];
  horizontals.forEach((line, idx) => {
    tagged.push({ ...line, type: "h", id: idx });
  });
  verticals.forEach((line, idx) => {
    tagged.push({ ...line, type: "v", id: idx + horizontals.length });
  });

  const grids = [];
  for (const component of groupConnectedLines(tagged)) {
    const ys = [];
    const xs = [];
    for (const line of component) {
      if (line.type === "h") ys.push(line.y1);
      else if (line.type === "v") xs.push(line.x1);
    }
    if (ys.length < 2 || xs.length < 2) continue;

    const rowYs = clusterCoordinates(ys).sort((p, q) => q - p);
    const colXs = clusterCoordinates(xs).sort((p, q) => p - q);
    if (rowYs.length < 2 || colXs.length < 2) continue;

    grids.push({
      rowYs,
      colXs,
      bbox: {
        x1: colXs[0],
        y1: rowYs[rowYs.length - 1],
        x2: colXs[colXs.length - 1],
        y2: rowYs[0],
      },
    });
  }
  return grids;
}
1432
/**
 * Collapses nearly equal coordinates into cluster averages.
 *
 * Values are visited in ascending order; a value joins the latest cluster when
 * it is within COORD_MERGE_TOL of that cluster's running mean, otherwise it
 * opens a new cluster.
 *
 * @param {number[]} values - Raw coordinates (not modified).
 * @returns {number[]} Mean of each cluster, in ascending order.
 */
function clusterCoordinates(values) {
  if (values.length === 0) return [];
  const ascending = values.slice().sort((p, q) => p - q);
  const sums = [ascending[0]];
  const counts = [1];
  for (const value of ascending.slice(1)) {
    const k = sums.length - 1;
    if (Math.abs(value - sums[k] / counts[k]) <= COORD_MERGE_TOL) {
      sums[k] += value;
      counts[k] += 1;
    } else {
      sums.push(value);
      counts.push(1);
    }
  }
  return sums.map((sum, k) => sum / counts[k]);
}
1448
/**
 * Partitions lines into groups of mutually connected lines.
 *
 * Every pair is tested with `linesIntersect`; pairs that intersect are merged
 * with a union-find structure, so connectivity is transitive.
 *
 * @param {object[]} lines - Lines to group.
 * @returns {object[][]} Groups, ordered by the index of their first member;
 *   members keep their input order.
 */
function groupConnectedLines(lines) {
  const parent = lines.map((_, idx) => idx);

  // Root lookup with path halving.
  const rootOf = (node) => {
    let cur = node;
    while (parent[cur] !== cur) {
      parent[cur] = parent[parent[cur]];
      cur = parent[cur];
    }
    return cur;
  };

  for (let a = 0; a < lines.length; a++) {
    for (let b = a + 1; b < lines.length; b++) {
      if (!linesIntersect(lines[a], lines[b])) continue;
      const rootA = rootOf(a);
      const rootB = rootOf(b);
      if (rootA !== rootB) parent[rootA] = rootB;
    }
  }

  const buckets = new Map();
  for (let idx = 0; idx < lines.length; idx++) {
    const root = rootOf(idx);
    const bucket = buckets.get(root);
    if (bucket) {
      bucket.push(lines[idx]);
    } else {
      buckets.set(root, [lines[idx]]);
    }
  }
  return Array.from(buckets.values());
}
1476
/**
 * Tells whether two tagged lines touch, allowing CONNECT_TOL of slack.
 *
 * - Same orientation: the lines must sit on (almost) the same axis coordinate
 *   and their ranges must overlap or nearly abut.
 * - Different orientation: the vertical's x must fall within the horizontal's
 *   x-range and the horizontal's y within the vertical's y-range.
 *
 * @param {{type: string, x1: number, y1: number, x2: number, y2: number}} a
 * @param {{type: string, x1: number, y1: number, x2: number, y2: number}} b
 * @returns {boolean}
 */
function linesIntersect(a, b) {
  const tol = CONNECT_TOL;

  if (a.type !== b.type) {
    const [h, v] = a.type === "h" ? [a, b] : [b, a];
    const xInside = v.x1 >= h.x1 - tol && v.x1 <= h.x2 + tol;
    return xInside && h.y1 >= v.y1 - tol && h.y1 <= v.y2 + tol;
  }

  if (a.type === "h") {
    const sameRow = !(Math.abs(a.y1 - b.y1) > tol);
    return sameRow && Math.min(a.x2, b.x2) >= Math.max(a.x1, b.x1) - tol;
  }

  const sameColumn = !(Math.abs(a.x1 - b.x1) > tol);
  return sameColumn && Math.min(a.y2, b.y2) >= Math.max(a.y1, b.y1) - tol;
}
1491
/**
 * Derives table cells (with spans) from a grid and the detected lines.
 *
 * Scanning row by row, each free grid slot is grown to the right while no
 * vertical line separates it from the next column, then downward while no
 * horizontal line closes it across its full (already widened) width.
 *
 * @param {{rowYs: number[], colXs: number[]}} grid - Grid boundaries; the code
 *   reads `rowYs[r]` as the top and `rowYs[r + 1]` as the bottom of row r.
 * @param {object[]} horizontals - Horizontal lines.
 * @param {object[]} verticals - Vertical lines.
 * @returns {Array<{row: number, col: number, rowSpan: number, colSpan: number, bbox: object}>}
 */
function extractCells(grid, horizontals, verticals) {
  const { rowYs, colXs } = grid;
  const rowCount = rowYs.length - 1;
  const colCount = colXs.length - 1;
  if (rowCount <= 0 || colCount <= 0) return [];

  const taken = [];
  for (let r = 0; r < rowCount; r++) {
    taken.push(new Array(colCount).fill(false));
  }

  const cells = [];
  for (let row = 0; row < rowCount; row++) {
    for (let col = 0; col < colCount; col++) {
      if (taken[row][col]) continue;

      let colSpan = 1;
      while (
        col + colSpan < colCount &&
        !hasVerticalLine(verticals, colXs[col + colSpan], rowYs[row], rowYs[row + 1])
      ) {
        colSpan += 1;
      }

      let rowSpan = 1;
      while (
        row + rowSpan < rowCount &&
        !hasHorizontalLine(horizontals, rowYs[row + rowSpan], colXs[col], colXs[col + colSpan])
      ) {
        rowSpan += 1;
      }

      for (let r = row; r < row + rowSpan; r++) {
        taken[r].fill(true, col, col + colSpan);
      }

      cells.push({
        row,
        col,
        rowSpan,
        colSpan,
        bbox: {
          x1: colXs[col],
          y1: rowYs[row + rowSpan],
          x2: colXs[col + colSpan],
          y2: rowYs[row],
        },
      });
    }
  }
  return cells;
}
1538
/**
 * Checks whether a vertical line separates two cells at column boundary `x`.
 *
 * A line qualifies when its x is within COORD_MERGE_TOL + 1 of `x` and it
 * overlaps at least half of the span between `topY` and `botY`.
 *
 * @param {object[]} verticals - Vertical lines ({x1, y1, y2}, y1 <= y2).
 * @param {number} x - Column boundary.
 * @param {number} topY - Upper edge of the row.
 * @param {number} botY - Lower edge of the row.
 * @returns {boolean}
 */
function hasVerticalLine(verticals, x, topY, botY) {
  const tol = COORD_MERGE_TOL + 1;
  const needed = Math.abs(topY - botY) * 0.5;
  return verticals.some((line) => {
    if (!(Math.abs(line.x1 - x) <= tol)) return false;
    const covered = Math.min(line.y2, topY) - Math.max(line.y1, botY);
    return covered >= needed;
  });
}
1551
/**
 * Checks whether a horizontal line separates two cells at row boundary `y`.
 *
 * A line qualifies when its y is within COORD_MERGE_TOL + 1 of `y` and it
 * overlaps at least half of the span between `leftX` and `rightX`.
 *
 * @param {object[]} horizontals - Horizontal lines ({y1, x1, x2}, x1 <= x2).
 * @param {number} y - Row boundary.
 * @param {number} leftX - Left edge of the cell.
 * @param {number} rightX - Right edge of the cell.
 * @returns {boolean}
 */
function hasHorizontalLine(horizontals, y, leftX, rightX) {
  const tol = COORD_MERGE_TOL + 1;
  const needed = Math.abs(rightX - leftX) * 0.5;
  return horizontals.some((line) => {
    if (!(Math.abs(line.y1 - y) <= tol)) return false;
    const covered = Math.min(line.x2, rightX) - Math.max(line.x1, leftX);
    return covered >= needed;
  });
}
1564
/**
 * Assigns each text item to the cell that contains its anchor point.
 *
 * The anchor is the horizontal centre of the item at its `y`. Among all cells
 * whose bbox (grown by CELL_PADDING) contains the anchor, the one whose centre
 * is nearest in Manhattan distance wins; ties go to the earlier cell. Items
 * that fall in no cell are left out.
 *
 * @param {object[]} items - Text items ({x, y, w, ...}).
 * @param {object[]} cells - Cells with a `bbox` ({x1, y1, x2, y2}).
 * @returns {Map<object, object[]>} Every cell mapped to its (possibly empty) items.
 */
function mapTextToCells(items, cells) {
  const assignment = new Map(cells.map((cell) => [cell, []]));
  const pad = CELL_PADDING;

  for (const item of items) {
    const anchorX = item.x + item.w / 2;
    const anchorY = item.y;
    let winner = null;
    let winnerDist = Infinity;

    for (const cell of cells) {
      const { x1, y1, x2, y2 } = cell.bbox;
      const inside =
        anchorX >= x1 - pad && anchorX <= x2 + pad && anchorY >= y1 - pad && anchorY <= y2 + pad;
      if (!inside) continue;

      const dist = Math.abs(anchorX - (x1 + x2) / 2) + Math.abs(anchorY - (y1 + y2) / 2);
      if (dist < winnerDist) {
        winnerDist = dist;
        winner = cell;
      }
    }

    if (winner) {
      assignment.get(winner).push(item);
    }
  }
  return assignment;
}
1592
/**
 * Turns the text items of one cell into a string.
 *
 * Items are ordered by descending y, then ascending x, and grouped into visual
 * lines: an item joins the current line while its y stays within
 * max(3, 0.6 * smaller font size) of the line's first y. Items in a line are
 * joined with spaces, lines with "\n" - except that a short (<= 5 chars)
 * all-Hangul line following a line that ends in Hangul is glued on directly.
 *
 * @param {Array<{text: string, x: number, y: number, fontSize: number}>} items
 * @returns {string}
 */
function cellTextToString(items) {
  if (items.length === 0) return "";
  if (items.length === 1) return items[0].text;

  const ordered = items.slice().sort((p, q) => q.y - p.y || p.x - q.x);
  const rows = [[ordered[0]]];
  let rowY = ordered[0].y;
  for (const item of ordered.slice(1)) {
    const row = rows[rows.length - 1];
    const tol = Math.max(3, Math.min(item.fontSize, row[0].fontSize) * 0.6);
    if (Math.abs(item.y - rowY) <= tol) {
      row.push(item);
    } else {
      rows.push([item]);
      rowY = item.y;
    }
  }

  const rowTexts = rows.map((row) =>
    row
      .sort((p, q) => p.x - q.x)
      .map((entry) => entry.text)
      .join(" "),
  );
  if (rowTexts.length <= 1) return rowTexts[0] || "";

  const merged = [rowTexts[0]];
  for (const curr of rowTexts.slice(1)) {
    const prev = merged[merged.length - 1];
    const isWrappedWord =
      /[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <= 5 && !curr.includes(" ");
    if (isWrappedWord) {
      merged[merged.length - 1] = prev + curr;
    } else {
      merged.push(curr);
    }
  }
  return merged.join("\n");
}
1627
+
894
1628
  // src/pdf/polyfill.ts
895
1629
  import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
896
1630
  var g = globalThis;
@@ -922,44 +1656,62 @@ async function parsePdfDocument(buffer, options) {
922
1656
  }).promise;
923
1657
  try {
924
1658
  const pageCount = doc.numPages;
925
- if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", blocks: [] };
1659
+ if (pageCount === 0) return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
926
1660
  const metadata = { pageCount };
927
1661
  await extractPdfMetadata(doc, metadata);
928
- const pageTexts = [];
929
1662
  const blocks = [];
1663
+ const warnings = [];
930
1664
  let totalChars = 0;
931
1665
  let totalTextBytes = 0;
932
1666
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
933
1667
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
1668
+ const allFontSizes = [];
934
1669
  for (let i = 1; i <= effectivePageCount; i++) {
935
1670
  if (pageFilter && !pageFilter.has(i)) continue;
936
1671
  const page = await doc.getPage(i);
937
1672
  const tc = await page.getTextContent();
938
- const pageText = extractPageContent(tc.items);
939
- totalChars += pageText.replace(/\s/g, "").length;
940
- totalTextBytes += pageText.length * 2;
1673
+ const viewport = page.getViewport({ scale: 1 });
1674
+ const rawItems = tc.items;
1675
+ const items = normalizeItems(rawItems);
1676
+ const { visible, hiddenCount } = filterHiddenText(items, viewport.width, viewport.height);
1677
+ if (hiddenCount > 0) {
1678
+ warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
1679
+ }
1680
+ for (const item of visible) {
1681
+ if (item.fontSize > 0) allFontSizes.push(item.fontSize);
1682
+ }
1683
+ const opList = await page.getOperatorList();
1684
+ const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
1685
+ for (const b of pageBlocks) blocks.push(b);
1686
+ for (const b of pageBlocks) {
1687
+ const t = b.text || "";
1688
+ totalChars += t.replace(/\s/g, "").length;
1689
+ totalTextBytes += t.length * 2;
1690
+ }
941
1691
  if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
942
- pageTexts.push(pageText);
943
- blocks.push({ type: "paragraph", text: pageText });
944
1692
  }
945
1693
  const parsedPageCount = pageFilter ? pageFilter.size : effectivePageCount;
946
1694
  if (totalChars / Math.max(parsedPageCount, 1) < 10) {
947
1695
  if (options?.ocr) {
948
1696
  try {
949
- const { ocrPages } = await import("./provider-JB7SY74K.js");
1697
+ const { ocrPages } = await import("./provider-A4FHJSID.js");
950
1698
  const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
951
1699
  if (ocrBlocks.length > 0) {
952
1700
  const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
953
- return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true };
1701
+ return { success: true, fileType: "pdf", markdown: ocrMarkdown, pageCount: parsedPageCount, blocks: ocrBlocks, metadata, isImageBased: true, warnings };
954
1702
  }
955
1703
  } catch {
956
1704
  }
957
1705
  }
958
1706
  return { success: false, fileType: "pdf", pageCount, isImageBased: true, error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`, code: "IMAGE_BASED_PDF" };
959
1707
  }
960
- let markdown = pageTexts.filter((t) => t.trim()).join("\n\n");
961
- markdown = cleanPdfText(markdown);
962
- return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata };
1708
+ const medianFontSize = computeMedianFontSize(allFontSizes);
1709
+ if (medianFontSize > 0) {
1710
+ detectHeadings(blocks, medianFontSize);
1711
+ }
1712
+ const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
1713
+ let markdown = cleanPdfText(blocksToMarkdown(blocks));
1714
+ return { success: true, fileType: "pdf", markdown, pageCount: parsedPageCount, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
963
1715
  } finally {
964
1716
  await doc.destroy().catch(() => {
965
1717
  });
@@ -1004,24 +1756,272 @@ async function extractPdfMetadataOnly(buffer) {
1004
1756
  });
1005
1757
  }
1006
1758
  }
1007
- function extractPageContent(rawItems) {
1008
- const items = normalizeItems(rawItems);
1009
- if (items.length === 0) return "";
1010
- const yLines = groupByY(items);
1011
- const columns = detectColumns(yLines);
1759
/**
 * Separates visible text items from hidden ones.
 *
 * An item counts as hidden when its `isHidden` flag is set or when its
 * position lies outside the page by more than 10% of the larger page dimension.
 *
 * @param {object[]} items - Normalised text items ({x, y, isHidden, ...}).
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {{visible: object[], hiddenCount: number}}
 */
function filterHiddenText(items, pageWidth, pageHeight) {
  const margin = Math.max(pageWidth, pageHeight) * 0.1;
  const isOffPage = ({ x, y }) =>
    x < -margin || x > pageWidth + margin || y < -margin || y > pageHeight + margin;

  const visible = items.filter((item) => !item.isHidden && !isOffPage(item));
  return { visible, hiddenCount: items.length - visible.length };
}
1776
/**
 * Returns the median of the given font sizes, or 0 for an empty list.
 * With an even count the two middle values are averaged. The input array is
 * not modified.
 *
 * @param {number[]} sizes
 * @returns {number}
 */
function computeMedianFontSize(sizes) {
  const count = sizes.length;
  if (count === 0) return 0;
  const ascending = sizes.slice().sort((p, q) => p - q);
  const half = Math.floor(count / 2);
  if (count % 2 !== 0) return ascending[half];
  return (ascending[half - 1] + ascending[half]) / 2;
}
1782
/**
 * Promotes paragraph blocks with an enlarged font to headings, in place.
 *
 * Only paragraphs with text and a `style.fontSize` are considered; empty
 * text, text longer than 200 characters and digit-only text are skipped.
 * The heading level follows the font-size ratio to the document median:
 * >= 1.5 -> 1, >= 1.3 -> 2, >= 1.15 -> 3.
 *
 * @param {object[]} blocks - Blocks to inspect; matching ones get
 *   `type = "heading"` and a `level`.
 * @param {number} medianFontSize - Reference font size.
 */
function detectHeadings(blocks, medianFontSize) {
  const levelFor = (ratio) => {
    if (ratio >= 1.5) return 1;
    if (ratio >= 1.3) return 2;
    if (ratio >= 1.15) return 3;
    return 0;
  };

  for (const block of blocks) {
    const eligible = block.type === "paragraph" && block.text && block.style?.fontSize;
    if (!eligible) continue;

    const trimmed = block.text.trim();
    if (trimmed.length === 0 || trimmed.length > 200) continue;
    if (/^\d+$/.test(trimmed)) continue;

    const level = levelFor(block.style.fontSize / medianFontSize);
    if (level === 0) continue;
    block.type = "heading";
    block.level = level;
  }
}
1799
// Recursion limit for xyCutOrder.
var MAX_XYCUT_DEPTH = 50;

/**
 * Recursively splits items into groups in reading order.
 *
 * A horizontal cut (from `findYSplit`) is tried first, yielding the upper
 * part before the lower part; if none applies, a vertical cut (from
 * `findXSplit`) yields the left part before the right part. Groups of at most
 * two items, or groups reached at the depth limit, are returned unsplit.
 *
 * @param {object[]} items - Text items ({x, y, w, ...}).
 * @param {number} gapThreshold - Minimum gap that justifies a cut.
 * @param {number} [depth=0] - Current recursion depth.
 * @returns {object[][]} Ordered groups of items.
 */
function xyCutOrder(items, gapThreshold, depth = 0) {
  if (items.length === 0) return [];
  if (items.length <= 2 || depth >= MAX_XYCUT_DEPTH) return [items];

  const region = computeRegion(items);
  const descend = (first, second) => [
    ...xyCutOrder(first, gapThreshold, depth + 1),
    ...xyCutOrder(second, gapThreshold, depth + 1),
  ];

  const ySplit = findYSplit(items, region, gapThreshold);
  if (ySplit !== null) {
    const upper = [];
    const lower = [];
    for (const item of items) {
      if (item.y > ySplit) upper.push(item);
      else if (item.y <= ySplit) lower.push(item);
    }
    if (upper.length > 0 && lower.length > 0 && upper.length < items.length) {
      return descend(upper, lower);
    }
  }

  const xSplit = findXSplit(items, region, gapThreshold);
  if (xSplit !== null) {
    const left = [];
    const right = [];
    for (const item of items) {
      const centre = item.x + item.w / 2;
      if (centre < xSplit) left.push(item);
      else if (centre >= xSplit) right.push(item);
    }
    if (left.length > 0 && right.length > 0 && left.length < items.length) {
      return descend(left, right);
    }
  }

  return [items];
}
1822
/**
 * Computes the bounding extent of a set of items.
 *
 * The maxima use each item's `x + w` and `y + h`. For an empty list the
 * minima stay at Infinity and the maxima at -Infinity.
 *
 * @param {Array<{x: number, y: number, w: number, h: number}>} items
 * @returns {{items: object[], minX: number, minY: number, maxX: number, maxY: number}}
 */
function computeRegion(items) {
  let minX = Infinity;
  let minY = Infinity;
  let maxX = -Infinity;
  let maxY = -Infinity;
  items.forEach(({ x, y, w, h }) => {
    if (x < minX) minX = x;
    if (y < minY) minY = y;
    if (x + w > maxX) maxX = x + w;
    if (y + h > maxY) maxY = y + h;
  });
  return { items, minX, minY, maxX, maxY };
}
1832
/**
 * Looks for the widest vertical gap between consecutive items.
 *
 * Items are visited by descending y. For each neighbouring pair the gap is
 * `(prev.y - prev.h) - curr.y`; the largest gap strictly above `gapThreshold`
 * wins and the cut is placed in the middle of it.
 *
 * @param {Array<{y: number, h: number}>} items
 * @param {object} region - Not used by this function.
 * @param {number} gapThreshold - Minimum gap to accept.
 * @returns {number|null} Y coordinate of the cut, or null when no gap qualifies.
 */
function findYSplit(items, region, gapThreshold) {
  const topDown = items.slice().sort((p, q) => q.y - p.y);
  let widest = gapThreshold;
  let cut = null;
  topDown.forEach((curr, idx) => {
    if (idx === 0) return;
    const prev = topDown[idx - 1];
    const prevBottom = prev.y - prev.h;
    const gap = prevBottom - curr.y;
    if (gap > widest) {
      widest = gap;
      cut = (prevBottom + curr.y) / 2;
    }
  });
  return cut;
}
1847
/**
 * Looks for the widest horizontal gap that no item crosses.
 *
 * Items are visited by ascending x while tracking the right-most edge covered
 * so far. Measuring each gap against that running edge (rather than against
 * only the previous item's right edge) matters when a short item is sorted
 * after a wide one that starts at the same x: the wide item still covers the
 * space to its right, so no column gap exists there.
 *
 * @param {Array<{x: number, w: number}>} items
 * @param {object} region - Not used by this function.
 * @param {number} gapThreshold - Minimum gap to accept.
 * @returns {number|null} X coordinate of the cut (middle of the widest gap
 *   strictly above the threshold), or null when no gap qualifies.
 */
function findXSplit(items, region, gapThreshold) {
  const sorted = [...items].sort((a, b) => a.x - b.x);
  if (sorted.length === 0) return null;

  let bestGap = gapThreshold;
  let bestSplit = null;
  let coveredRight = sorted[0].x + sorted[0].w;
  for (let i = 1; i < sorted.length; i++) {
    const currLeft = sorted[i].x;
    const gap = currLeft - coveredRight;
    if (gap > bestGap) {
      bestGap = gap;
      bestSplit = (coveredRight + currLeft) / 2;
    }
    const currRight = currLeft + sorted[i].w;
    if (currRight > coveredRight) coveredRight = currRight;
  }
  return bestSplit;
}
1862
/**
 * Converts one page's text items into blocks, using drawn lines when they
 * form a table grid.
 *
 * Lines are extracted from the operator list and passed through
 * `filterPageBorderLines`; if `buildTableGrids` finds at least one grid the
 * page goes through `extractBlocksWithGrids`, otherwise through
 * `extractPageBlocksFallback`.
 *
 * @param {object[]} items - Visible text items of the page.
 * @param {number} pageNum - Page number recorded on the blocks.
 * @param {{fnArray: Array, argsArray: Array}} opList - Operator list of the page.
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {object[]} Blocks for the page (empty when there are no items).
 */
function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeight) {
  if (items.length === 0) return [];

  const detected = extractLines(opList.fnArray, opList.argsArray);
  const { horizontals, verticals } = filterPageBorderLines(
    detected.horizontals,
    detected.verticals,
    pageWidth,
    pageHeight,
  );
  const grids = buildTableGrids(horizontals, verticals);

  return grids.length > 0
    ? extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals)
    : extractPageBlocksFallback(items, pageNum);
}
1872
/**
 * Builds the blocks of a page that contains line-drawn tables.
 *
 * Grids are processed from the highest `bbox.y2` down. Every text item lying
 * inside a grid's bbox (with 3 units of slack) is claimed by that grid and
 * placed into its cells; each grid becomes one "table" block. Unclaimed items
 * go through `extractPageBlocksFallback` and `detectListBlocks`, and the
 * combined blocks are then ordered by the top edge of their bbox, highest first.
 *
 * @param {object[]} items - Visible text items of the page.
 * @param {number} pageNum - Page number recorded on the blocks.
 * @param {object[]} grids - Grids from `buildTableGrids`.
 * @param {object[]} horizontals - Horizontal lines (for span detection).
 * @param {object[]} verticals - Vertical lines (for span detection).
 * @returns {object[]} Table blocks, mixed with text blocks when items remain.
 */
function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
  const pad = 3;
  const claimed = new Set();
  const tableBlocks = [];

  const liesInside = (item, box) =>
    item.x >= box.x1 - pad &&
    item.x + item.w <= box.x2 + pad &&
    item.y >= box.y1 - pad &&
    item.y <= box.y2 + pad;

  const topDownGrids = grids.slice().sort((p, q) => q.bbox.y2 - p.bbox.y2);
  for (const grid of topDownGrids) {
    const { bbox } = grid;

    // Items are claimed before the cell check, so they stay out of the
    // leftover text even if the grid yields no cells.
    const inTable = [];
    for (const item of items) {
      if (claimed.has(item) || !liesInside(item, bbox)) continue;
      inTable.push(item);
      claimed.add(item);
    }

    const cells = extractCells(grid, horizontals, verticals);
    if (cells.length === 0) continue;

    const slimItems = inTable.map(({ text, x, y, w, h, fontSize, fontName }) => ({
      text,
      x,
      y,
      w,
      h,
      fontSize,
      fontName,
    }));
    const textByCell = mapTextToCells(slimItems, cells);

    const rows = grid.rowYs.length - 1;
    const cols = grid.colXs.length - 1;
    const matrix = [];
    for (let r = 0; r < rows; r++) {
      const row = [];
      for (let c = 0; c < cols; c++) {
        row.push({ text: "", colSpan: 1, rowSpan: 1 });
      }
      matrix.push(row);
    }
    // Only the anchor slot of a spanning cell is overwritten; the slots it
    // covers keep the empty placeholder.
    for (const cell of cells) {
      matrix[cell.row][cell.col] = {
        text: cellTextToString(textByCell.get(cell) || []),
        colSpan: cell.colSpan,
        rowSpan: cell.rowSpan,
      };
    }

    tableBlocks.push({
      type: "table",
      table: { rows, cols, cells: matrix, hasHeader: rows > 1 },
      pageNumber: pageNum,
      bbox: {
        page: pageNum,
        x: bbox.x1,
        y: bbox.y1,
        width: bbox.x2 - bbox.x1,
        height: bbox.y2 - bbox.y1,
      },
    });
  }

  const leftover = items.filter((item) => !claimed.has(item));
  if (leftover.length === 0) return tableBlocks;

  leftover.sort((p, q) => q.y - p.y || p.x - q.x);
  const textBlocks = detectListBlocks(extractPageBlocksFallback(leftover, pageNum));
  const topEdge = (block) => (block.bbox ? block.bbox.y + block.bbox.height : 0);
  return [...tableBlocks, ...textBlocks].sort((p, q) => topEdge(q) - topEdge(p));
}
1946
/**
 * Builds paragraph blocks for text that is not covered by a line-drawn table.
 *
 * If the whole item set forms at least three columns (per `detectColumns`),
 * it is emitted as a single paragraph produced by `extractWithColumns`.
 * Otherwise the items are ordered with `xyCutOrder`; each resulting group is
 * either emitted as one column-formatted paragraph (three or more columns) or
 * as one paragraph per non-blank text line.
 *
 * @param {object[]} items - Text items ({x, y, w, h, fontSize, ...}).
 * @param {number} pageNum - Page number recorded on the blocks.
 * @returns {object[]} Paragraph blocks with `pageNumber`, `bbox` and `style`.
 */
function extractPageBlocksFallback(items, pageNum) {
  if (items.length === 0) return [];
  const blocks = [];

  // Appends one paragraph whose bbox/style are derived from `source`.
  const pushParagraph = (text, source) => {
    blocks.push({
      type: "paragraph",
      text,
      pageNumber: pageNum,
      bbox: computeBBox(source, pageNum),
      style: dominantStyle(source),
    });
  };

  const allYLines = groupByY(items);
  const columns = detectColumns(allYLines);
  if (columns && columns.length >= 3) {
    pushParagraph(extractWithColumns(allYLines, columns), items);
    return blocks;
  }

  // Vertical extent of the items. Computed with a loop instead of
  // Math.max(...ys): spreading one argument per item throws a RangeError once
  // a page carries more items than the engine's argument limit.
  let highestY = -Infinity;
  let lowestY = Infinity;
  for (const item of items) {
    highestY = Math.max(highestY, item.y);
    lowestY = Math.min(lowestY, item.y);
  }
  const gapThreshold = Math.max(15, (highestY - lowestY) * 0.03);

  for (const group of xyCutOrder(items, gapThreshold)) {
    if (group.length === 0) continue;
    const yLines = groupByY(group);
    const groupColumns = detectColumns(yLines);
    if (groupColumns && groupColumns.length >= 3) {
      pushParagraph(extractWithColumns(yLines, groupColumns), group);
      continue;
    }
    for (const line of yLines) {
      const text = mergeLineSimple(line);
      if (!text.trim()) continue;
      pushParagraph(text, line);
    }
  }
  return blocks;
}
1980
/**
 * Computes the bounding box of a set of text items.
 *
 * The upper edge uses each item's height, falling back to its font size when
 * the height is not positive.
 *
 * @param {Array<{x: number, y: number, w: number, h: number, fontSize: number}>} items
 * @param {number} pageNum - Page number stored in the result.
 * @returns {{page: number, x: number, y: number, width: number, height: number}}
 */
function computeBBox(items, pageNum) {
  let left = Infinity;
  let bottom = Infinity;
  let right = -Infinity;
  let top = -Infinity;
  for (const { x, y, w, h, fontSize } of items) {
    if (x < left) left = x;
    if (y < bottom) bottom = y;
    if (x + w > right) right = x + w;
    const tall = h > 0 ? h : fontSize;
    if (y + tall > top) top = y + tall;
  }
  return { page: pageNum, x: left, y: bottom, width: right - left, height: top - bottom };
}
1991
/**
 * Determines the most frequent positive font size among the items.
 *
 * On equal counts the size that reached the count first wins. The font name is
 * taken from the first item carrying the winning size (an empty name becomes
 * undefined).
 *
 * @param {Array<{fontSize: number, fontName?: string}>} items
 * @returns {{fontSize: number, fontName: (string|undefined)}|undefined}
 *   Undefined when there are no items or none has a positive font size.
 */
function dominantStyle(items) {
  if (items.length === 0) return void 0;

  const tally = new Map();
  let winner = 0;
  let winnerCount = 0;
  for (const { fontSize } of items) {
    if (fontSize <= 0) continue;
    const seen = (tally.get(fontSize) || 0) + 1;
    tally.set(fontSize, seen);
    if (seen > winnerCount) {
      winnerCount = seen;
      winner = fontSize;
    }
  }
  if (winner === 0) return void 0;

  const carrier = items.find((item) => item.fontSize === winner);
  return { fontSize: winner, fontName: carrier?.fontName || void 0 };
}
1017
2008
  function normalizeItems(rawItems) {
1018
- return rawItems.filter((i) => typeof i.str === "string" && i.str.trim() !== "").map((i) => ({
1019
- text: i.str.trim(),
1020
- x: Math.round(i.transform[4]),
1021
- y: Math.round(i.transform[5]),
1022
- w: Math.round(i.width),
1023
- h: Math.round(i.height)
1024
- })).sort((a, b) => b.y - a.y || a.x - b.x);
2009
+ return rawItems.filter((i) => typeof i.str === "string" && i.str.trim() !== "").map((i) => {
2010
+ const scaleY = Math.abs(i.transform[3]);
2011
+ const scaleX = Math.abs(i.transform[0]);
2012
+ const fontSize = Math.round(Math.max(scaleY, scaleX));
2013
+ return {
2014
+ text: i.str.trim(),
2015
+ x: Math.round(i.transform[4]),
2016
+ y: Math.round(i.transform[5]),
2017
+ w: Math.round(i.width),
2018
+ h: Math.round(i.height),
2019
+ fontSize,
2020
+ fontName: i.fontName || "",
2021
+ // 0pt 폰트이거나 너비 0 → hidden text (prompt injection 의심)
2022
+ isHidden: fontSize === 0 || i.width === 0 && i.str.trim().length > 0
2023
+ };
2024
+ }).sort((a, b) => b.y - a.y || a.x - b.x);
1025
2025
  }
1026
2026
  function groupByY(items) {
1027
2027
  if (items.length === 0) return [];
@@ -1259,6 +2259,27 @@ function startsWithMarker(line) {
1259
2259
  function isStandaloneHeader(line) {
1260
2260
  return /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/.test(line.trim());
1261
2261
  }
2262
/**
 * Re-tags paragraphs that start with "<digits>. " as ordered list items.
 *
 * Matching blocks are copied with `type: "list"` and `listType: "ordered"`;
 * the text, including its original number, is kept as is. All other blocks
 * are passed through unchanged (same object).
 *
 * @param {object[]} blocks
 * @returns {object[]} New array of blocks in the same order.
 */
function detectListBlocks(blocks) {
  const isNumberedParagraph = (block) =>
    Boolean(block.type === "paragraph" && block.text && block.text.match(/^(\d+)\.\s/));

  return blocks.map((block) => {
    if (!isNumberedParagraph(block)) return block;
    // The original number stays in `text`.
    return { ...block, type: "list", listType: "ordered", text: block.text };
  });
}
1262
2283
  function mergeKoreanLines(text) {
1263
2284
  if (!text) return "";
1264
2285
  const lines = text.split("\n");
@@ -1267,6 +2288,10 @@ function mergeKoreanLines(text) {
1267
2288
  for (let i = 1; i < lines.length; i++) {
1268
2289
  const prev = result[result.length - 1];
1269
2290
  const curr = lines[i];
2291
+ if (/^#{1,6}\s/.test(prev) || /^#{1,6}\s/.test(curr)) {
2292
+ result.push(curr);
2293
+ continue;
2294
+ }
1270
2295
  if (/[가-힣·,\-]$/.test(prev) && /^[가-힣(]/.test(curr) && !startsWithMarker(curr) && !isStandaloneHeader(prev)) {
1271
2296
  result[result.length - 1] = prev + " " + curr;
1272
2297
  } else {
@@ -1424,16 +2449,16 @@ async function parse(buffer, options) {
1424
2449
  }
1425
2450
  async function parseHwpx(buffer, options) {
1426
2451
  try {
1427
- const { markdown, blocks, metadata } = await parseHwpxDocument(buffer, options);
1428
- return { success: true, fileType: "hwpx", markdown, blocks, metadata };
2452
+ const { markdown, blocks, metadata, outline, warnings } = await parseHwpxDocument(buffer, options);
2453
+ return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings };
1429
2454
  } catch (err) {
1430
2455
  return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
1431
2456
  }
1432
2457
  }
1433
2458
  async function parseHwp(buffer, options) {
1434
2459
  try {
1435
- const { markdown, blocks, metadata } = parseHwp5Document(Buffer.from(buffer), options);
1436
- return { success: true, fileType: "hwp", markdown, blocks, metadata };
2460
+ const { markdown, blocks, metadata, outline, warnings } = parseHwp5Document(Buffer.from(buffer), options);
2461
+ return { success: true, fileType: "hwp", markdown, blocks, metadata, outline, warnings };
1437
2462
  } catch (err) {
1438
2463
  return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
1439
2464
  }
@@ -1578,12 +2603,13 @@ function fallbackAlign(a, b) {
1578
2603
  }
1579
2604
  function blockSimilarity(a, b) {
1580
2605
  if (a.type !== b.type) return 0;
1581
- if (a.type === "paragraph") {
2606
+ if (a.text !== void 0 && b.text !== void 0) {
1582
2607
  return normalizedSimilarity(a.text || "", b.text || "");
1583
2608
  }
1584
2609
  if (a.type === "table" && a.table && b.table) {
1585
2610
  return tableSimilarity(a.table, b.table);
1586
2611
  }
2612
+ if (a.type === b.type) return 1;
1587
2613
  return 0;
1588
2614
  }
1589
2615
  function tableSimilarity(a, b) {
@@ -1628,4 +2654,4 @@ export {
1628
2654
  extractFormFields,
1629
2655
  parse
1630
2656
  };
1631
- //# sourceMappingURL=chunk-BWZW234S.js.map
2657
+ //# sourceMappingURL=chunk-5SZWGBNL.js.map