kordoc 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  **모두 파싱해버리겠다.**
4
4
 
5
- [![npm version](https://img.shields.io/badge/npm-v2.0.0-cb3837.svg)](https://www.npmjs.com/package/kordoc)
5
+ [![npm version](https://img.shields.io/badge/npm-v2.0.2-cb3837.svg)](https://www.npmjs.com/package/kordoc)
6
6
  [![license](https://img.shields.io/npm/l/kordoc.svg)](https://github.com/chrisryugj/kordoc/blob/main/LICENSE)
7
7
 
8
8
  > *대한민국에서 둘째가라면 서러울 문서지옥. 거기서 7년 버틴 공무원이 만들었습니다.*
@@ -27,10 +27,10 @@ HWP, HWPX, PDF, XLSX, DOCX — 관공서에서 쏟아지는 모든 문서를 파
27
27
 
28
28
  ---
29
29
 
30
- ## v2.0.0 변경사항
30
+ ## v2.0 변경사항
31
31
 
32
- - **HWP5 배포용 문서 복호화**열람 제한 HWP 파일을 AES-128 ECB 복호화. 순수 JS 구현, 네이티브 의존성 없음. [rhwp](https://github.com/pjc0247/rhwp)(MIT) 알고리즘 포팅.
33
- - **손상된 HWP 파일 복구** — 표준 CFB 모듈이 거부하는 파일을 직접 FAT/디렉토리 파싱으로 복구. rhwp LenientCfbReader 포팅.
32
+ - **🔓 배포용(열람 제한) HWP 파싱 지원** — 관공서에서 배포용으로 잠근 HWP 파일도 이제 파싱됩니다. AES-128 ECB 복호화, 순수 JS 구현. [rhwp](https://github.com/edwardkim/rhwp)(MIT) 알고리즘 포팅.
33
+ - **손상된 HWP 파일 복구** — 표준 CFB 모듈이 거부하는 파일을 직접 FAT/디렉토리 파싱으로 복구. rhwp LenientCfbReader 포팅.
34
34
  - **HWP5 각주/미주/하이퍼링크 추출** — 각주 본문 텍스트 연결, 하이퍼링크 URL 추출 및 XSS 살균.
35
35
  - **HWPX 표 병합 밀림 수정** — colspan/rowspan 그리드 계산 버그 수정.
36
36
  - **보안 강화** — CFB 섹터 크기 검증, sanitizeHref 3중 경로 일관 적용.
@@ -282,7 +282,7 @@ import type {
282
282
  [MIT](./LICENSE)
283
283
 
284
284
  이 프로젝트는 아래 오픈소스를 포함합니다:
285
- - **rhwp** (MIT, pjc0247) — HWP5 배포용 복호화 및 lenient CFB 파싱 알고리즘
285
+ - **rhwp** (MIT, edwardkim) — HWP5 배포용 복호화 및 lenient CFB 파싱 알고리즘
286
286
  - **OpenDataLoader PDF** (Apache 2.0, Hancom Inc.) — PDF 테이블 감지 알고리즘
287
287
  - **cfb** (Apache 2.0, SheetJS) — HWP5 OLE2 컨테이너 파싱
288
288
  - **pdfjs-dist** (Apache 2.0, Mozilla) — PDF 텍스트 추출
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
 
3
3
  // src/utils.ts
4
- var VERSION = true ? "2.0.0" : "0.0.0-dev";
4
+ var VERSION = true ? "2.0.2" : "0.0.0-dev";
5
5
  function toArrayBuffer(buf) {
6
6
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
7
7
  return buf.buffer;
@@ -90,4 +90,4 @@ export {
90
90
  sanitizeHref,
91
91
  classifyError
92
92
  };
93
- //# sourceMappingURL=chunk-UMO6QQO5.js.map
93
+ //# sourceMappingURL=chunk-EVWOJ4T5.js.map
@@ -6,7 +6,7 @@ import {
6
6
  precheckZipSize,
7
7
  sanitizeHref,
8
8
  toArrayBuffer
9
- } from "./chunk-UMO6QQO5.js";
9
+ } from "./chunk-EVWOJ4T5.js";
10
10
  import {
11
11
  parsePageRange
12
12
  } from "./chunk-MOL7MDBG.js";
@@ -224,8 +224,11 @@ function blocksToMarkdown(blocks) {
224
224
  if (lines.length > 0 && lines[lines.length - 1] !== "") {
225
225
  lines.push("");
226
226
  }
227
- lines.push(tableToMarkdown(block.table));
228
- lines.push("");
227
+ const tableMd = tableToMarkdown(block.table);
228
+ if (tableMd) {
229
+ lines.push(tableMd);
230
+ lines.push("");
231
+ }
229
232
  }
230
233
  }
231
234
  return lines.join("\n").trim();
@@ -235,6 +238,7 @@ function tableToMarkdown(table) {
235
238
  const { cells, rows: numRows, cols: numCols } = table;
236
239
  if (numRows === 1 && numCols === 1) {
237
240
  const content = sanitizeText(cells[0][0].text);
241
+ if (!content) return "";
238
242
  return content.split(/\n/).map((line) => {
239
243
  const trimmed = line.trim();
240
244
  if (!trimmed) return "";
@@ -271,9 +275,9 @@ function tableToMarkdown(table) {
271
275
  const row = display[r];
272
276
  const isEmptyPlaceholder = row.every((cell) => cell === "");
273
277
  if (isEmptyPlaceholder) continue;
274
- const hasSkippedCols = row.some((cell, c) => cell === "" && skip.has(`${r},${c}`));
275
278
  const nonEmptyCols = row.filter((cell) => cell !== "");
276
- if (!hasSkippedCols && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
279
+ const hasSkipInRow = row.some((_, c) => skip.has(`${r},${c}`));
280
+ if (!hasSkipInRow && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
277
281
  pendingFirstCol = row[0];
278
282
  continue;
279
283
  }
@@ -705,7 +709,8 @@ function detectHwpxHeadings(blocks, styleMap) {
705
709
  else if (ratio >= HEADING_RATIO_H2) level = 2;
706
710
  else if (ratio >= HEADING_RATIO_H3) level = 3;
707
711
  }
708
- if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
712
+ const compactText = text.replace(/\s+/g, "");
713
+ if (/^제\d+[조장절편]/.test(compactText) && text.length <= 50) {
709
714
  if (level === 0) level = 3;
710
715
  }
711
716
  if (level > 0) {
@@ -757,9 +762,14 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
757
762
  if (newTable.rows.length > 0) {
758
763
  if (tableStack.length > 0) {
759
764
  const parentTable = tableStack.pop();
760
- const nestedText = convertTableToText(newTable.rows);
761
- if (parentTable.cell) {
762
- parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
765
+ const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
766
+ if (newTable.rows.length >= 3 && nestedCols >= 2) {
767
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
768
+ } else {
769
+ const nestedText = convertTableToText(newTable.rows);
770
+ if (parentTable.cell) {
771
+ parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
772
+ }
763
773
  }
764
774
  tableCtx = parentTable;
765
775
  } else {
@@ -859,9 +869,14 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
859
869
  if (newTable.rows.length > 0) {
860
870
  if (tableStack.length > 0) {
861
871
  const parentTable = tableStack.pop();
862
- const nestedText = convertTableToText(newTable.rows);
863
- if (parentTable.cell) {
864
- parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
872
+ const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
873
+ if (newTable.rows.length >= 3 && nestedCols >= 2) {
874
+ blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
875
+ } else {
876
+ const nestedText = convertTableToText(newTable.rows);
877
+ if (parentTable.cell) {
878
+ parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
879
+ }
865
880
  }
866
881
  tableCtx = parentTable;
867
882
  } else {
@@ -872,13 +887,20 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
872
887
  tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
873
888
  }
874
889
  } else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
875
- const imgRef = extractImageRef(el);
876
- if (imgRef) {
877
- blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
878
- } else if (warnings && sectionNum) {
879
- warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
890
+ const drawTextChild = findDescendant(el, "drawText");
891
+ if (drawTextChild) {
892
+ extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
893
+ } else {
894
+ const imgRef = extractImageRef(el);
895
+ if (imgRef) {
896
+ blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
897
+ } else if (warnings && sectionNum) {
898
+ warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
899
+ }
880
900
  }
881
- } else if (localTag === "r" || localTag === "run" || localTag === "ctrl") {
901
+ } else if (localTag === "drawText") {
902
+ extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
903
+ } else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
882
904
  walkChildren(el, d + 1);
883
905
  }
884
906
  }
@@ -886,6 +908,40 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
886
908
  walkChildren(node, depth);
887
909
  return tableCtx;
888
910
  }
/**
 * Depth-first search for the first descendant element whose tag name, with
 * any namespace prefix (everything up to the first ":") stripped, equals
 * `targetTag`.
 *
 * Only element nodes (`nodeType === 1`) are inspected. The search is
 * depth-limited: a call made at `depth` greater than `maxDepth` returns null
 * without looking at children, so with the defaults a match is found at most
 * six levels below `node`.
 *
 * @param {object|null|undefined} node - DOM-like node exposing `childNodes`.
 * @param {string} targetTag - Local tag name to look for (no prefix).
 * @param {number} [depth=0] - Current recursion depth; external callers normally omit it.
 * @param {number} [maxDepth=5] - Highest recursion depth that is still searched.
 * @returns {object|null} First matching element in document order, or null.
 */
function findDescendant(node, targetTag, depth = 0, maxDepth = 5) {
  if (depth > maxDepth) return null;
  // Null-safe: a missing node is treated the same as a node without children.
  const children = node?.childNodes;
  if (!children) return null;
  // Index loop on purpose: `childNodes` may be an array-like without an iterator.
  for (let i = 0; i < children.length; i++) {
    const child = children[i];
    if (child.nodeType !== 1) continue;
    const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
    if (tag === targetTag) return child;
    const found = findDescendant(child, targetTag, depth + 1, maxDepth);
    if (found) return found;
  }
  return null;
}
/**
 * Collects the paragraphs contained in a drawText element and appends them to
 * `blocks` as `{ type: "paragraph", ... }` entries.
 *
 * Direct child elements are handled by local tag name (namespace prefix
 * stripped):
 *   - `subList`     → recursed into, since paragraphs may be wrapped in it;
 *   - `p` / `para`  → passed to `extractParagraphInfo`; pushed when the trimmed
 *                     text is non-empty;
 *   - anything else → ignored.
 *
 * Recursion through nested `subList` elements is capped so that a
 * pathologically nested document cannot overflow the call stack; content below
 * the cap is skipped silently.
 *
 * @param {object|null|undefined} drawTextNode - DOM-like node exposing `childNodes`.
 * @param {Array<object>} blocks - Output array; mutated in place.
 * @param {*} styleMap - Forwarded unchanged to `extractParagraphInfo`.
 * @param {number} sectionNum - Stored as `pageNumber` on every pushed block.
 * @param {number} [depth=0] - Current `subList` nesting depth; external callers omit it.
 * @returns {void}
 */
function extractDrawTextBlocks(drawTextNode, blocks, styleMap, sectionNum, depth = 0) {
  // Generous bound: real documents are expected to nest subList far less than this.
  const MAX_SUBLIST_DEPTH = 32;
  if (depth > MAX_SUBLIST_DEPTH) return;
  const children = drawTextNode?.childNodes;
  if (!children) return;
  for (let i = 0; i < children.length; i++) {
    const child = children[i];
    if (child.nodeType !== 1) continue;
    const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
    if (tag === "subList") {
      extractDrawTextBlocks(child, blocks, styleMap, sectionNum, depth + 1);
    } else if (tag === "p" || tag === "para") {
      const info = extractParagraphInfo(child, styleMap);
      const text = info.text.trim();
      if (text) {
        blocks.push({ type: "paragraph", text, style: info.style ?? void 0, pageNumber: sectionNum });
      }
    }
  }
}
889
945
  function extractParagraphInfo(para, styleMap) {
890
946
  let text = "";
891
947
  let href;
@@ -904,11 +960,18 @@ function extractParagraphInfo(para, styleMap) {
904
960
  const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
905
961
  switch (tag) {
906
962
  case "t":
907
- text += child.textContent || "";
963
+ walk(child);
908
964
  break;
909
- case "tab":
910
- text += " ";
965
+ // 자식 순회 (tab 등 하위 요소 처리)
966
+ case "tab": {
967
+ const leader = child.getAttribute("leader");
968
+ if (leader && leader !== "0") {
969
+ text += "";
970
+ } else {
971
+ text += " ";
972
+ }
911
973
  break;
974
+ }
912
975
  case "br":
913
976
  if ((child.getAttribute("type") || "line") === "line") text += "\n";
914
977
  break;
@@ -975,6 +1038,8 @@ function extractParagraphInfo(para, styleMap) {
975
1038
  }
976
1039
  };
977
1040
  walk(para);
1041
+ const leaderIdx = text.indexOf("");
1042
+ if (leaderIdx >= 0) text = text.substring(0, leaderIdx);
978
1043
  let cleanText = text.replace(/[ \t]+/g, " ").trim();
979
1044
  if (/^그림입니다\.?\s*원본\s*그림의\s*(이름|크기)/.test(cleanText)) cleanText = "";
980
1045
  cleanText = cleanText.replace(/그림입니다\.?\s*원본\s*그림의\s*(이름|크기)[^\n]*(\n[^\n]*원본\s*그림의\s*(이름|크기)[^\n]*)*/g, "").trim();
@@ -1803,18 +1868,20 @@ function decryptDistributePayload(payload) {
1803
1868
  if (payload.length < 256) throw new Error("\uBC30\uD3EC\uC6A9 payload\uAC00 256\uBC14\uC774\uD2B8 \uBBF8\uB9CC\uC785\uB2C8\uB2E4");
1804
1869
  const seed = (payload[0] | payload[1] << 8 | payload[2] << 16 | payload[3] << 24) >>> 0;
1805
1870
  const lcg = new MsvcLcg(seed);
1806
- const result = new Uint8Array(256);
1807
- result[0] = payload[0];
1808
- result[1] = payload[1];
1809
- result[2] = payload[2];
1810
- result[3] = payload[3];
1811
- let i = 4;
1871
+ const result = new Uint8Array(payload.subarray(0, 256));
1872
+ let i = 0;
1873
+ let n = 0;
1874
+ let key = 0;
1812
1875
  while (i < 256) {
1813
- const keyByte = lcg.rand() & 255;
1814
- const n = (lcg.rand() & 15) + 1;
1815
- for (let j = 0; j < n && i < 256; j++, i++) {
1816
- result[i] = payload[i] ^ keyByte;
1876
+ if (n === 0) {
1877
+ key = lcg.rand() & 255;
1878
+ n = (lcg.rand() & 15) + 1;
1879
+ }
1880
+ if (i >= 4) {
1881
+ result[i] ^= key;
1817
1882
  }
1883
+ i++;
1884
+ n--;
1818
1885
  }
1819
1886
  return result;
1820
1887
  }
@@ -1838,7 +1905,7 @@ function parseRecordHeader(data, offset) {
1838
1905
  }
1839
1906
  return { tagId, size, headerSize };
1840
1907
  }
1841
- var TAG_DISTRIBUTE_DOC_DATA = 16 + 28;
1908
+ var TAG_DISTRIBUTE_DOC_DATA = 16 + 12;
1842
1909
  function decryptViewText(viewTextRaw, compressed) {
1843
1910
  const data = new Uint8Array(viewTextRaw);
1844
1911
  const rec = parseRecordHeader(data, 0);
@@ -2456,7 +2523,10 @@ function parseSection(records, docInfo, warnings, sectionNum) {
2456
2523
  if (binId >= 0) {
2457
2524
  blocks.push({ type: "image", text: String(binId), pageNumber: sectionNum });
2458
2525
  } else {
2459
- warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
2526
+ const boxText = extractTextBoxText(records, i);
2527
+ if (boxText) {
2528
+ blocks.push({ type: "paragraph", text: boxText, pageNumber: sectionNum });
2529
+ }
2460
2530
  }
2461
2531
  } else if (ctrlId === " elo" || ctrlId === "ole ") {
2462
2532
  warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
@@ -2495,6 +2565,19 @@ function extractNoteText(records, ctrlIdx) {
2495
2565
  }
2496
2566
  return texts.length > 0 ? texts.join(" ") : null;
2497
2567
  }
/**
 * Gathers the text of the records nested under the control record at
 * `ctrlIdx`.
 *
 * Starting right after the control record, records are scanned while their
 * `level` is greater than the control's level. Every record whose `tagId` is
 * `TAG_PARA_TEXT` is decoded with `extractText`, trimmed, and kept when
 * non-empty. The scan also stops at index `ctrlIdx + scanLimit` so a malformed
 * record stream cannot make one control consume the rest of the section.
 *
 * @param {Array<{level: number, tagId: number, data: *}>} records - Flat record list.
 * @param {number} ctrlIdx - Index of the control record in `records`.
 * @param {number} [scanLimit=200] - Exclusive window size measured from `ctrlIdx`.
 * @returns {string|null} Collected texts joined with "\n", or null when there
 *   is no text or `ctrlIdx` does not address a record.
 */
function extractTextBoxText(records, ctrlIdx, scanLimit = 200) {
  const ctrl = records[ctrlIdx];
  // Out-of-range index: report "no text" instead of throwing on `.level`.
  if (!ctrl) return null;
  const ctrlLevel = ctrl.level;
  const end = Math.min(records.length, ctrlIdx + scanLimit);
  const texts = [];
  for (let j = ctrlIdx + 1; j < end; j++) {
    const r = records[j];
    // A record at the same or a shallower level no longer belongs to this control.
    if (r.level <= ctrlLevel) break;
    if (r.tagId === TAG_PARA_TEXT) {
      const t = extractText(r.data).trim();
      if (t) texts.push(t);
    }
  }
  return texts.length > 0 ? texts.join("\n") : null;
}
2498
2581
  function extractHyperlinkUrl(data) {
2499
2582
  try {
2500
2583
  const httpSig = Buffer.from("http", "utf16le");
@@ -5363,4 +5446,4 @@ export {
5363
5446
  extractFormFields,
5364
5447
  parse
5365
5448
  };
5366
- //# sourceMappingURL=chunk-UUHAAZYN.js.map
5449
+ //# sourceMappingURL=chunk-XJYM2AUA.js.map