kordoc 2.0.1 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -138,7 +138,7 @@ import { inflateRawSync } from "zlib";
138
138
  import { DOMParser } from "@xmldom/xmldom";
139
139
 
140
140
  // src/utils.ts
141
- var VERSION = true ? "2.0.1" : "0.0.0-dev";
141
+ var VERSION = true ? "2.0.3" : "0.0.0-dev";
142
142
  function toArrayBuffer(buf) {
143
143
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
144
144
  return buf.buffer;
@@ -327,6 +327,47 @@ function sanitizeText(text) {
327
327
  }
328
328
  return result;
329
329
  }
330
/**
 * Returns a new block list in which certain small, text-heavy tables are
 * replaced by plain paragraph blocks (presumably tables used purely for page
 * layout rather than tabular data — the heuristic below is the only evidence).
 *
 * A table is flattened only when it has at most 3 rows AND either
 *   - its cells contain more than 5 newline characters in total, or
 *   - it has at most 2 rows and more than 300 characters of cell text in total.
 * Every other block (non-tables, table blocks without a `table` payload,
 * 1x1 tables, tables that miss the thresholds) is passed through unchanged.
 *
 * @param {Array<object>} blocks - Parsed blocks; table blocks carry
 *   `table: { rows, cols, cells }` where `cells[r][c].text` is the cell text.
 * @returns {Array<object>} A new array; the input array is not modified.
 */
+ function flattenLayoutTables(blocks) {
331
+ const result = [];
332
+ for (const block of blocks) {
333
// Anything that is not a table with a payload is copied through as-is.
+ if (block.type !== "table" || !block.table) {
334
+ result.push(block);
335
+ continue;
336
+ }
337
+ const { rows: numRows, cols: numCols, cells } = block.table;
338
// Single-cell tables are never flattened here.
+ if (numRows === 1 && numCols === 1) {
339
+ result.push(block);
340
+ continue;
341
+ }
342
// Only tables with at most 3 rows are candidates for flattening.
+ if (numRows <= 3) {
343
+ let totalNewlines = 0;
344
+ let totalTextLen = 0;
345
// First pass: measure how much multi-line text the cells hold.
+ for (let r = 0; r < numRows; r++) {
346
+ for (let c = 0; c < numCols; c++) {
347
// Missing rows/cells/text count as empty text.
+ const t = cells[r]?.[c]?.text || "";
348
+ totalNewlines += (t.match(/\n/g) || []).length;
349
+ totalTextLen += t.length;
350
+ }
351
+ }
352
// `&&` binds tighter than `||`: (newlines > 5) OR (rows <= 2 AND length > 300).
+ if (totalNewlines > 5 || numRows <= 2 && totalTextLen > 300) {
353
// Second pass: emit one paragraph per non-empty line, in row-major cell order.
+ for (let r = 0; r < numRows; r++) {
354
+ for (let c = 0; c < numCols; c++) {
355
+ const cellText = cells[r]?.[c]?.text?.trim();
356
+ if (!cellText) continue;
357
+ for (const line of cellText.split("\n")) {
358
+ const trimmed = line.trim();
359
+ if (!trimmed) continue;
360
// The new paragraph carries the table's page number; no other fields are copied.
+ result.push({ type: "paragraph", text: trimmed, pageNumber: block.pageNumber });
361
+ }
362
+ }
363
+ }
364
// The table block itself is dropped once its text has been emitted.
+ continue;
365
+ }
366
+ }
367
+ result.push(block);
368
+ }
369
+ return result;
370
+ }
330
371
  function blocksToMarkdown(blocks) {
331
372
  const lines = [];
332
373
  for (let i = 0; i < blocks.length; i++) {
@@ -388,8 +429,11 @@ function blocksToMarkdown(blocks) {
388
429
  if (lines.length > 0 && lines[lines.length - 1] !== "") {
389
430
  lines.push("");
390
431
  }
391
- lines.push(tableToMarkdown(block.table));
392
- lines.push("");
432
+ const tableMd = tableToMarkdown(block.table);
433
+ if (tableMd) {
434
+ lines.push(tableMd);
435
+ lines.push("");
436
+ }
393
437
  }
394
438
  }
395
439
  return lines.join("\n").trim();
@@ -399,6 +443,7 @@ function tableToMarkdown(table) {
399
443
  const { cells, rows: numRows, cols: numCols } = table;
400
444
  if (numRows === 1 && numCols === 1) {
401
445
  const content = sanitizeText(cells[0][0].text);
446
+ if (!content) return "";
402
447
  return content.split(/\n/).map((line) => {
403
448
  const trimmed = line.trim();
404
449
  if (!trimmed) return "";
@@ -435,9 +480,9 @@ function tableToMarkdown(table) {
435
480
  const row = display[r];
436
481
  const isEmptyPlaceholder = row.every((cell) => cell === "");
437
482
  if (isEmptyPlaceholder) continue;
438
- const hasSkippedCols = row.some((cell, c) => cell === "" && skip.has(`${r},${c}`));
439
483
  const nonEmptyCols = row.filter((cell) => cell !== "");
440
- if (!hasSkippedCols && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
484
+ const hasSkipInRow = row.some((_, c) => skip.has(`${r},${c}`));
485
+ if (!hasSkipInRow && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
441
486
  pendingFirstCol = row[0];
442
487
  continue;
443
488
  }
@@ -852,7 +897,8 @@ function detectHwpxHeadings(blocks, styleMap) {
852
897
  else if (ratio >= HEADING_RATIO_H2) level = 2;
853
898
  else if (ratio >= HEADING_RATIO_H3) level = 3;
854
899
  }
855
- if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
900
+ const compactText = text.replace(/\s+/g, "");
901
+ if (/^제\d+[조장절편]/.test(compactText) && text.length <= 50) {
856
902
  if (level === 0) level = 3;
857
903
  }
858
904
  if (level > 0) {
@@ -904,9 +950,14 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
904
950
  if (newTable.rows.length > 0) {
905
951
  if (tableStack.length > 0) {
906
952
  const parentTable = tableStack.pop();
907
- const nestedText = convertTableToText(newTable.rows);
908
- if (parentTable.cell) {
909
- parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
953
+ const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
954
+ if (newTable.rows.length >= 3 && nestedCols >= 2) {
955
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
956
+ } else {
957
+ const nestedText = convertTableToText(newTable.rows);
958
+ if (parentTable.cell) {
959
+ parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
960
+ }
910
961
  }
911
962
  tableCtx = parentTable;
912
963
  } else {
@@ -1006,9 +1057,14 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
1006
1057
  if (newTable.rows.length > 0) {
1007
1058
  if (tableStack.length > 0) {
1008
1059
  const parentTable = tableStack.pop();
1009
- const nestedText = convertTableToText(newTable.rows);
1010
- if (parentTable.cell) {
1011
- parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
1060
+ const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
1061
+ if (newTable.rows.length >= 3 && nestedCols >= 2) {
1062
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
1063
+ } else {
1064
+ const nestedText = convertTableToText(newTable.rows);
1065
+ if (parentTable.cell) {
1066
+ parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
1067
+ }
1012
1068
  }
1013
1069
  tableCtx = parentTable;
1014
1070
  } else {
@@ -1019,13 +1075,20 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
1019
1075
  tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
1020
1076
  }
1021
1077
  } else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
1022
- const imgRef = extractImageRef(el);
1023
- if (imgRef) {
1024
- blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
1025
- } else if (warnings && sectionNum) {
1026
- warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
1078
+ const drawTextChild = findDescendant(el, "drawText");
1079
+ if (drawTextChild) {
1080
+ extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
1081
+ } else {
1082
+ const imgRef = extractImageRef(el);
1083
+ if (imgRef) {
1084
+ blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
1085
+ } else if (warnings && sectionNum) {
1086
+ warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
1087
+ }
1027
1088
  }
1028
- } else if (localTag === "r" || localTag === "run" || localTag === "ctrl") {
1089
+ } else if (localTag === "drawText") {
1090
+ extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
1091
+ } else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
1029
1092
  walkChildren(el, d + 1);
1030
1093
  }
1031
1094
  }
@@ -1033,6 +1096,40 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
1033
1096
  walkChildren(node, depth);
1034
1097
  return tableCtx;
1035
1098
  }
1099
/**
 * Depth-first, document-order search for the first element below `node` whose
 * tag name, with any namespace prefix stripped, equals `targetTag`.
 *
 * @param {object} node - DOM-like node exposing `childNodes`.
 * @param {string} targetTag - Local tag name to look for, without a prefix.
 * @param {number} [depth=0] - Current recursion depth; callers normally omit it.
 * @returns {object|null} The first matching element, or null when none is
 *   found within the depth limit or `node` has no `childNodes`.
 */
+ function findDescendant(node, targetTag, depth = 0) {
1100
// Recursion guard: children are only inspected while depth is 0..5.
+ if (depth > 5) return null;
1101
+ const children = node.childNodes;
1102
+ if (!children) return null;
1103
+ for (let i = 0; i < children.length; i++) {
1104
+ const child = children[i];
1105
// nodeType 1 is an element node; text, comment and other nodes are skipped.
+ if (child.nodeType !== 1) continue;
1106
// Drop a leading "prefix:" so the comparison uses the local tag name.
+ const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
1107
+ if (tag === targetTag) return child;
1108
// A child is tested before its own subtree is searched (pre-order).
+ const found = findDescendant(child, targetTag, depth + 1);
1109
+ if (found) return found;
1110
+ }
1111
+ return null;
1112
+ }
1113
/**
 * Appends one paragraph block to `blocks` for every non-empty `p`/`para`
 * element found directly under `drawTextNode`, recursing through `subList`
 * wrapper elements. Other child elements are ignored.
 *
 * @param {object} drawTextNode - DOM-like element exposing `childNodes`
 *   (a `drawText` element at the call sites visible in this diff).
 * @param {Array<object>} blocks - Output list; mutated in place.
 * @param {*} styleMap - Passed through unchanged to `extractParagraphInfo`.
 * @param {*} sectionNum - Stored as `pageNumber` on each emitted block.
 * @returns {void}
 */
+ function extractDrawTextBlocks(drawTextNode, blocks, styleMap, sectionNum) {
1114
+ const children = drawTextNode.childNodes;
1115
+ if (!children) return;
1116
+ for (let i = 0; i < children.length; i++) {
1117
+ const child = children[i];
1118
// Only element nodes (nodeType 1) are considered.
+ if (child.nodeType !== 1) continue;
1119
// Compare on the local tag name, ignoring any "prefix:".
+ const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
1120
+ if (tag === "subList" || tag === "p" || tag === "para") {
1121
+ if (tag === "subList") {
1122
// A subList is only a container: recurse to reach the paragraphs inside it.
+ extractDrawTextBlocks(child, blocks, styleMap, sectionNum);
1123
+ } else {
1124
+ const info = extractParagraphInfo(child, styleMap);
1125
+ const text = info.text.trim();
1126
// Paragraphs whose text is empty after trimming produce no block.
+ if (text) {
1127
// A null/undefined style is normalised to undefined.
+ blocks.push({ type: "paragraph", text, style: info.style ?? void 0, pageNumber: sectionNum });
1128
+ }
1129
+ }
1130
+ }
1131
+ }
1132
+ }
1036
1133
  function extractParagraphInfo(para, styleMap) {
1037
1134
  let text = "";
1038
1135
  let href;
@@ -1051,11 +1148,18 @@ function extractParagraphInfo(para, styleMap) {
1051
1148
  const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
1052
1149
  switch (tag) {
1053
1150
  case "t":
1054
- text += child.textContent || "";
1151
+ walk(child);
1055
1152
  break;
1056
- case "tab":
1057
- text += " ";
1153
+ // 자식 순회 (tab 등 하위 요소 처리)
1154
+ case "tab": {
1155
+ const leader = child.getAttribute("leader");
1156
+ if (leader && leader !== "0") {
1157
+ text += "";
1158
+ } else {
1159
+ text += " ";
1160
+ }
1058
1161
  break;
1162
+ }
1059
1163
  case "br":
1060
1164
  if ((child.getAttribute("type") || "line") === "line") text += "\n";
1061
1165
  break;
@@ -1122,6 +1226,8 @@ function extractParagraphInfo(para, styleMap) {
1122
1226
  }
1123
1227
  };
1124
1228
  walk(para);
1229
+ const leaderIdx = text.indexOf("");
1230
+ if (leaderIdx >= 0) text = text.substring(0, leaderIdx);
1125
1231
  let cleanText = text.replace(/[ \t]+/g, " ").trim();
1126
1232
  if (/^그림입니다\.?\s*원본\s*그림의\s*(이름|크기)/.test(cleanText)) cleanText = "";
1127
1233
  cleanText = cleanText.replace(/그림입니다\.?\s*원본\s*그림의\s*(이름|크기)[^\n]*(\n[^\n]*원본\s*그림의\s*(이름|크기)[^\n]*)*/g, "").trim();
@@ -1160,8 +1266,9 @@ var TAG_CHAR_SHAPE = 68;
1160
1266
  var TAG_CTRL_HEADER = 71;
1161
1267
  var TAG_LIST_HEADER = 72;
1162
1268
  var TAG_TABLE = 77;
1163
- var TAG_DOC_CHAR_SHAPE = 55;
1164
- var TAG_DOC_STYLE = 58;
1269
+ var TAG_DOC_CHAR_SHAPE = 21;
1270
+ var TAG_DOC_PARA_SHAPE = 25;
1271
+ var TAG_DOC_STYLE = 26;
1165
1272
  var CHAR_LINE = 0;
1166
1273
  var CHAR_SECTION_BREAK = 10;
1167
1274
  var CHAR_PARA = 13;
@@ -1217,8 +1324,14 @@ function parseFileHeader(data) {
1217
1324
  }
1218
1325
  function parseDocInfo(records) {
1219
1326
  const charShapes = [];
1327
+ const paraShapes = [];
1220
1328
  const styles = [];
1221
1329
  for (const rec of records) {
1330
+ if (rec.tagId === TAG_DOC_PARA_SHAPE && rec.data.length >= 4) {
1331
+ const flags = rec.data.readUInt32LE(0);
1332
+ const outlineLevel = flags >> 25 & 7;
1333
+ paraShapes.push({ outlineLevel });
1334
+ }
1222
1335
  if (rec.tagId === TAG_DOC_CHAR_SHAPE && rec.data.length >= 18) {
1223
1336
  if (rec.data.length >= 50) {
1224
1337
  const fontSize = rec.data.readUInt32LE(42);
@@ -1258,7 +1371,7 @@ function parseDocInfo(records) {
1258
1371
  }
1259
1372
  }
1260
1373
  }
1261
- return { charShapes, styles };
1374
+ return { charShapes, paraShapes, styles };
1262
1375
  }
1263
1376
  function extractText(data) {
1264
1377
  let result = "";
@@ -2269,12 +2382,13 @@ function parseHwp5Document(buffer, options) {
2269
2382
  }
2270
2383
  }
2271
2384
  const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
2385
+ const flatBlocks = flattenLayoutTables(blocks);
2272
2386
  if (docInfo) {
2273
- detectHwp5Headings(blocks, docInfo);
2387
+ detectHwp5Headings(flatBlocks, docInfo);
2274
2388
  }
2275
- const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2276
- const markdown = blocksToMarkdown(blocks);
2277
- return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
2389
+ const outline = flatBlocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2390
+ const markdown = blocksToMarkdown(flatBlocks);
2391
+ return { markdown, blocks: flatBlocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
2278
2392
  }
2279
2393
  function parseDocInfoStream(cfb, compressed) {
2280
2394
  try {
@@ -2325,16 +2439,21 @@ function detectHwp5Headings(blocks, docInfo) {
2325
2439
  }
2326
2440
  if (baseFontSize <= 0) return;
2327
2441
  for (const block of blocks) {
2328
- if (block.type !== "paragraph" || !block.text || !block.style?.fontSize) continue;
2442
+ if (block.type === "heading") continue;
2443
+ if (block.type !== "paragraph" || !block.text) continue;
2329
2444
  const text = block.text.trim();
2330
2445
  if (text.length === 0 || text.length > 200) continue;
2331
2446
  if (/^\d+$/.test(text)) continue;
2332
- const ratio = block.style.fontSize / baseFontSize;
2333
2447
  let level = 0;
2334
- if (ratio >= HEADING_RATIO_H1) level = 1;
2335
- else if (ratio >= HEADING_RATIO_H2) level = 2;
2336
- else if (ratio >= HEADING_RATIO_H3) level = 3;
2337
- if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
2448
+ if (block.style?.fontSize && baseFontSize > 0) {
2449
+ const ratio = block.style.fontSize / baseFontSize;
2450
+ if (ratio >= HEADING_RATIO_H1) level = 1;
2451
+ else if (ratio >= HEADING_RATIO_H2) level = 2;
2452
+ else if (ratio >= HEADING_RATIO_H3) level = 3;
2453
+ }
2454
+ if (/^제\d+[장절편]\s/.test(text) && text.length <= 50) {
2455
+ if (level === 0) level = 2;
2456
+ } else if (/^제\d+(조의?\d*)\s*[\((]/.test(text) && text.length <= 80) {
2338
2457
  if (level === 0) level = 3;
2339
2458
  }
2340
2459
  if (level > 0) {
@@ -2566,13 +2685,20 @@ function parseSection(records, docInfo, warnings, sectionNum) {
2566
2685
  while (i < records.length) {
2567
2686
  const rec = records[i];
2568
2687
  if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
2569
- const { paragraph, tables, nextIdx, charShapeIds } = parseParagraphWithTables(records, i);
2688
+ const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
2570
2689
  if (paragraph) {
2571
2690
  const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
2572
2691
  if (docInfo && charShapeIds.length > 0) {
2573
2692
  const style = resolveCharStyle(charShapeIds, docInfo);
2574
2693
  if (style) block.style = style;
2575
2694
  }
2695
+ if (docInfo && paraShapeId >= 0 && paraShapeId < docInfo.paraShapes.length) {
2696
+ const ol = docInfo.paraShapes[paraShapeId].outlineLevel;
2697
+ if (ol >= 1 && ol <= 6) {
2698
+ block.type = "heading";
2699
+ block.level = ol;
2700
+ }
2701
+ }
2576
2702
  blocks.push(block);
2577
2703
  }
2578
2704
  for (const t of tables) blocks.push({ type: "table", table: t, pageNumber: sectionNum });
@@ -2592,7 +2718,10 @@ function parseSection(records, docInfo, warnings, sectionNum) {
2592
2718
  if (binId >= 0) {
2593
2719
  blocks.push({ type: "image", text: String(binId), pageNumber: sectionNum });
2594
2720
  } else {
2595
- warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
2721
+ const boxText = extractTextBoxText(records, i);
2722
+ if (boxText) {
2723
+ blocks.push({ type: "paragraph", text: boxText, pageNumber: sectionNum });
2724
+ }
2596
2725
  }
2597
2726
  } else if (ctrlId === " elo" || ctrlId === "ole ") {
2598
2727
  warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
@@ -2631,6 +2760,19 @@ function extractNoteText(records, ctrlIdx) {
2631
2760
  }
2632
2761
  return texts.length > 0 ? texts.join(" ") : null;
2633
2762
  }
2763
/**
 * Collects the text of the records nested under the control record at
 * `ctrlIdx` and returns it as a single newline-joined string.
 *
 * @param {Array<object>} records - Records exposing `level`, `tagId` and `data`.
 * @param {number} ctrlIdx - Index of the control record whose children are scanned.
 * @returns {string|null} Trimmed, non-empty texts joined with "\n", or null
 *   when no text was found.
 */
+ function extractTextBoxText(records, ctrlIdx) {
2764
+ const ctrlLevel = records[ctrlIdx].level;
2765
+ const texts = [];
2766
// Scan forward from the record after the control; the `ctrlIdx + 200` bound caps the scan at 199 records.
+ for (let j = ctrlIdx + 1; j < records.length && j < ctrlIdx + 200; j++) {
2767
+ const r = records[j];
2768
// A record at the control's own level or shallower ends the nested run.
+ if (r.level <= ctrlLevel) break;
2769
+ if (r.tagId === TAG_PARA_TEXT) {
2770
+ const t = extractText(r.data).trim();
2771
// Text that is empty after trimming is skipped.
+ if (t) texts.push(t);
2772
+ }
2773
+ }
2774
+ return texts.length > 0 ? texts.join("\n") : null;
2775
+ }
2634
2776
  function extractHyperlinkUrl(data) {
2635
2777
  try {
2636
2778
  const httpSig = Buffer.from("http", "utf16le");
@@ -2676,6 +2818,8 @@ function parseParagraphWithTables(records, startIdx) {
2676
2818
  let text = "";
2677
2819
  const tables = [];
2678
2820
  const charShapeIds = [];
2821
+ const paraHeaderData = records[startIdx].data;
2822
+ const paraShapeId = paraHeaderData.length >= 10 ? paraHeaderData.readUInt16LE(8) : -1;
2679
2823
  let i = startIdx + 1;
2680
2824
  while (i < records.length) {
2681
2825
  const rec = records[i];
@@ -2700,7 +2844,7 @@ function parseParagraphWithTables(records, startIdx) {
2700
2844
  i++;
2701
2845
  }
2702
2846
  const trimmed = text.trim();
2703
- return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds };
2847
+ return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
2704
2848
  }
2705
2849
  function parseTableBlock(records, startIdx) {
2706
2850
  const tableLevel = records[startIdx].level;