kordoc 2.0.1 → 2.0.2

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
package/dist/index.js CHANGED
@@ -138,7 +138,7 @@ import { inflateRawSync } from "zlib";
138
138
  import { DOMParser } from "@xmldom/xmldom";
139
139
 
140
140
  // src/utils.ts
141
- var VERSION = true ? "2.0.1" : "0.0.0-dev";
141
+ var VERSION = true ? "2.0.2" : "0.0.0-dev";
142
142
  function toArrayBuffer(buf) {
143
143
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
144
144
  return buf.buffer;
@@ -388,8 +388,11 @@ function blocksToMarkdown(blocks) {
388
388
  if (lines.length > 0 && lines[lines.length - 1] !== "") {
389
389
  lines.push("");
390
390
  }
391
- lines.push(tableToMarkdown(block.table));
392
- lines.push("");
391
+ const tableMd = tableToMarkdown(block.table);
392
+ if (tableMd) {
393
+ lines.push(tableMd);
394
+ lines.push("");
395
+ }
393
396
  }
394
397
  }
395
398
  return lines.join("\n").trim();
@@ -399,6 +402,7 @@ function tableToMarkdown(table) {
399
402
  const { cells, rows: numRows, cols: numCols } = table;
400
403
  if (numRows === 1 && numCols === 1) {
401
404
  const content = sanitizeText(cells[0][0].text);
405
+ if (!content) return "";
402
406
  return content.split(/\n/).map((line) => {
403
407
  const trimmed = line.trim();
404
408
  if (!trimmed) return "";
@@ -435,9 +439,9 @@ function tableToMarkdown(table) {
435
439
  const row = display[r];
436
440
  const isEmptyPlaceholder = row.every((cell) => cell === "");
437
441
  if (isEmptyPlaceholder) continue;
438
- const hasSkippedCols = row.some((cell, c) => cell === "" && skip.has(`${r},${c}`));
439
442
  const nonEmptyCols = row.filter((cell) => cell !== "");
440
- if (!hasSkippedCols && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
443
+ const hasSkipInRow = row.some((_, c) => skip.has(`${r},${c}`));
444
+ if (!hasSkipInRow && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
441
445
  pendingFirstCol = row[0];
442
446
  continue;
443
447
  }
@@ -852,7 +856,8 @@ function detectHwpxHeadings(blocks, styleMap) {
852
856
  else if (ratio >= HEADING_RATIO_H2) level = 2;
853
857
  else if (ratio >= HEADING_RATIO_H3) level = 3;
854
858
  }
855
- if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
859
+ const compactText = text.replace(/\s+/g, "");
860
+ if (/^제\d+[조장절편]/.test(compactText) && text.length <= 50) {
856
861
  if (level === 0) level = 3;
857
862
  }
858
863
  if (level > 0) {
@@ -904,9 +909,14 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
904
909
  if (newTable.rows.length > 0) {
905
910
  if (tableStack.length > 0) {
906
911
  const parentTable = tableStack.pop();
907
- const nestedText = convertTableToText(newTable.rows);
908
- if (parentTable.cell) {
909
- parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
912
+ const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
913
+ if (newTable.rows.length >= 3 && nestedCols >= 2) {
914
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
915
+ } else {
916
+ const nestedText = convertTableToText(newTable.rows);
917
+ if (parentTable.cell) {
918
+ parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
919
+ }
910
920
  }
911
921
  tableCtx = parentTable;
912
922
  } else {
@@ -1006,9 +1016,14 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
1006
1016
  if (newTable.rows.length > 0) {
1007
1017
  if (tableStack.length > 0) {
1008
1018
  const parentTable = tableStack.pop();
1009
- const nestedText = convertTableToText(newTable.rows);
1010
- if (parentTable.cell) {
1011
- parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
1019
+ const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
1020
+ if (newTable.rows.length >= 3 && nestedCols >= 2) {
1021
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
1022
+ } else {
1023
+ const nestedText = convertTableToText(newTable.rows);
1024
+ if (parentTable.cell) {
1025
+ parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
1026
+ }
1012
1027
  }
1013
1028
  tableCtx = parentTable;
1014
1029
  } else {
@@ -1019,13 +1034,20 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
1019
1034
  tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
1020
1035
  }
1021
1036
  } else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
1022
- const imgRef = extractImageRef(el);
1023
- if (imgRef) {
1024
- blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
1025
- } else if (warnings && sectionNum) {
1026
- warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
1037
+ const drawTextChild = findDescendant(el, "drawText");
1038
+ if (drawTextChild) {
1039
+ extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
1040
+ } else {
1041
+ const imgRef = extractImageRef(el);
1042
+ if (imgRef) {
1043
+ blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
1044
+ } else if (warnings && sectionNum) {
1045
+ warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
1046
+ }
1027
1047
  }
1028
- } else if (localTag === "r" || localTag === "run" || localTag === "ctrl") {
1048
+ } else if (localTag === "drawText") {
1049
+ extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
1050
+ } else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
1029
1051
  walkChildren(el, d + 1);
1030
1052
  }
1031
1053
  }
@@ -1033,6 +1055,40 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
1033
1055
  walkChildren(node, depth);
1034
1056
  return tableCtx;
1035
1057
  }
1058
+ function findDescendant(node, targetTag, depth = 0) {
1059
+ if (depth > 5) return null;
1060
+ const children = node.childNodes;
1061
+ if (!children) return null;
1062
+ for (let i = 0; i < children.length; i++) {
1063
+ const child = children[i];
1064
+ if (child.nodeType !== 1) continue;
1065
+ const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
1066
+ if (tag === targetTag) return child;
1067
+ const found = findDescendant(child, targetTag, depth + 1);
1068
+ if (found) return found;
1069
+ }
1070
+ return null;
1071
+ }
1072
+ function extractDrawTextBlocks(drawTextNode, blocks, styleMap, sectionNum) {
1073
+ const children = drawTextNode.childNodes;
1074
+ if (!children) return;
1075
+ for (let i = 0; i < children.length; i++) {
1076
+ const child = children[i];
1077
+ if (child.nodeType !== 1) continue;
1078
+ const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
1079
+ if (tag === "subList" || tag === "p" || tag === "para") {
1080
+ if (tag === "subList") {
1081
+ extractDrawTextBlocks(child, blocks, styleMap, sectionNum);
1082
+ } else {
1083
+ const info = extractParagraphInfo(child, styleMap);
1084
+ const text = info.text.trim();
1085
+ if (text) {
1086
+ blocks.push({ type: "paragraph", text, style: info.style ?? void 0, pageNumber: sectionNum });
1087
+ }
1088
+ }
1089
+ }
1090
+ }
1091
+ }
1036
1092
  function extractParagraphInfo(para, styleMap) {
1037
1093
  let text = "";
1038
1094
  let href;
@@ -1051,11 +1107,18 @@ function extractParagraphInfo(para, styleMap) {
1051
1107
  const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
1052
1108
  switch (tag) {
1053
1109
  case "t":
1054
- text += child.textContent || "";
1110
+ walk(child);
1055
1111
  break;
1056
- case "tab":
1057
- text += " ";
1112
+ // 자식 순회 (tab 등 하위 요소 처리)
1113
+ case "tab": {
1114
+ const leader = child.getAttribute("leader");
1115
+ if (leader && leader !== "0") {
1116
+ text += "";
1117
+ } else {
1118
+ text += " ";
1119
+ }
1058
1120
  break;
1121
+ }
1059
1122
  case "br":
1060
1123
  if ((child.getAttribute("type") || "line") === "line") text += "\n";
1061
1124
  break;
@@ -1122,6 +1185,8 @@ function extractParagraphInfo(para, styleMap) {
1122
1185
  }
1123
1186
  };
1124
1187
  walk(para);
1188
+ const leaderIdx = text.indexOf("");
1189
+ if (leaderIdx >= 0) text = text.substring(0, leaderIdx);
1125
1190
  let cleanText = text.replace(/[ \t]+/g, " ").trim();
1126
1191
  if (/^그림입니다\.?\s*원본\s*그림의\s*(이름|크기)/.test(cleanText)) cleanText = "";
1127
1192
  cleanText = cleanText.replace(/그림입니다\.?\s*원본\s*그림의\s*(이름|크기)[^\n]*(\n[^\n]*원본\s*그림의\s*(이름|크기)[^\n]*)*/g, "").trim();
@@ -2592,7 +2657,10 @@ function parseSection(records, docInfo, warnings, sectionNum) {
2592
2657
  if (binId >= 0) {
2593
2658
  blocks.push({ type: "image", text: String(binId), pageNumber: sectionNum });
2594
2659
  } else {
2595
- warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
2660
+ const boxText = extractTextBoxText(records, i);
2661
+ if (boxText) {
2662
+ blocks.push({ type: "paragraph", text: boxText, pageNumber: sectionNum });
2663
+ }
2596
2664
  }
2597
2665
  } else if (ctrlId === " elo" || ctrlId === "ole ") {
2598
2666
  warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
@@ -2631,6 +2699,19 @@ function extractNoteText(records, ctrlIdx) {
2631
2699
  }
2632
2700
  return texts.length > 0 ? texts.join(" ") : null;
2633
2701
  }
2702
+ function extractTextBoxText(records, ctrlIdx) {
2703
+ const ctrlLevel = records[ctrlIdx].level;
2704
+ const texts = [];
2705
+ for (let j = ctrlIdx + 1; j < records.length && j < ctrlIdx + 200; j++) {
2706
+ const r = records[j];
2707
+ if (r.level <= ctrlLevel) break;
2708
+ if (r.tagId === TAG_PARA_TEXT) {
2709
+ const t = extractText(r.data).trim();
2710
+ if (t) texts.push(t);
2711
+ }
2712
+ }
2713
+ return texts.length > 0 ? texts.join("\n") : null;
2714
+ }
2634
2715
  function extractHyperlinkUrl(data) {
2635
2716
  try {
2636
2717
  const httpSig = Buffer.from("http", "utf16le");