kordoc 2.0.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/dist/{chunk-UMO6QQO5.js → chunk-EVWOJ4T5.js} +2 -2
- package/dist/{chunk-UUHAAZYN.js → chunk-XJYM2AUA.js} +117 -34
- package/dist/chunk-XJYM2AUA.js.map +1 -0
- package/dist/cli.js +4 -4
- package/dist/index.cjs +116 -33
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +116 -33
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -2
- package/dist/{utils-K23YMTIM.js → utils-6JEIFBCJ.js} +2 -2
- package/dist/{watch-CGG7CCHJ.js → watch-BCPDLGOE.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-UUHAAZYN.js.map +0 -1
- /package/dist/{chunk-UMO6QQO5.js.map → chunk-EVWOJ4T5.js.map} +0 -0
- /package/dist/{utils-K23YMTIM.js.map → utils-6JEIFBCJ.js.map} +0 -0
- /package/dist/{watch-CGG7CCHJ.js.map → watch-BCPDLGOE.js.map} +0 -0
package/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
**모두 파싱해버리겠다.**
|
|
4
4
|
|
|
5
|
-
[](https://www.npmjs.com/package/kordoc)
|
|
6
6
|
[](https://github.com/chrisryugj/kordoc/blob/main/LICENSE)
|
|
7
7
|
|
|
8
8
|
> *대한민국에서 둘째가라면 서러울 문서지옥. 거기서 7년 버틴 공무원이 만들었습니다.*
|
|
@@ -27,10 +27,10 @@ HWP, HWPX, PDF, XLSX, DOCX — 관공서에서 쏟아지는 모든 문서를 파
|
|
|
27
27
|
|
|
28
28
|
---
|
|
29
29
|
|
|
30
|
-
## v2.0
|
|
30
|
+
## v2.0 변경사항
|
|
31
31
|
|
|
32
|
-
-
|
|
33
|
-
- **손상된 HWP 파일 복구** — 표준 CFB 모듈이 거부하는 파일을 직접 FAT/디렉토리 파싱으로 복구. rhwp
|
|
32
|
+
- **🔓 배포용(열람 제한) HWP 파싱 지원** — 관공서에서 배포용으로 잠근 HWP 파일도 이제 파싱됩니다. AES-128 ECB 복호화, 순수 JS 구현. [rhwp](https://github.com/edwardkim/rhwp)(MIT) 알고리즘 포팅.
|
|
33
|
+
- **손상된 HWP 파일 복구** — 표준 CFB 모듈이 거부하는 파일을 직접 FAT/디렉토리 파싱으로 복구. rhwp LenientCfbReader 포팅.
|
|
34
34
|
- **HWP5 각주/미주/하이퍼링크 추출** — 각주 본문 텍스트 연결, 하이퍼링크 URL 추출 및 XSS 살균.
|
|
35
35
|
- **HWPX 표 병합 밀림 수정** — colspan/rowspan 그리드 계산 버그 수정.
|
|
36
36
|
- **보안 강화** — CFB 섹터 크기 검증, sanitizeHref 3중 경로 일관 적용.
|
|
@@ -282,7 +282,7 @@ import type {
|
|
|
282
282
|
[MIT](./LICENSE)
|
|
283
283
|
|
|
284
284
|
이 프로젝트는 아래 오픈소스를 포함합니다:
|
|
285
|
-
- **rhwp** (MIT,
|
|
285
|
+
- **rhwp** (MIT, edwardkim) — HWP5 배포용 복호화 및 lenient CFB 파싱 알고리즘
|
|
286
286
|
- **OpenDataLoader PDF** (Apache 2.0, Hancom Inc.) — PDF 테이블 감지 알고리즘
|
|
287
287
|
- **cfb** (Apache 2.0, SheetJS) — HWP5 OLE2 컨테이너 파싱
|
|
288
288
|
- **pdfjs-dist** (Apache 2.0, Mozilla) — PDF 텍스트 추출
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/utils.ts
|
|
4
|
-
var VERSION = true ? "2.0.
|
|
4
|
+
var VERSION = true ? "2.0.2" : "0.0.0-dev";
|
|
5
5
|
function toArrayBuffer(buf) {
|
|
6
6
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
7
7
|
return buf.buffer;
|
|
@@ -90,4 +90,4 @@ export {
|
|
|
90
90
|
sanitizeHref,
|
|
91
91
|
classifyError
|
|
92
92
|
};
|
|
93
|
-
//# sourceMappingURL=chunk-
|
|
93
|
+
//# sourceMappingURL=chunk-EVWOJ4T5.js.map
|
|
@@ -6,7 +6,7 @@ import {
|
|
|
6
6
|
precheckZipSize,
|
|
7
7
|
sanitizeHref,
|
|
8
8
|
toArrayBuffer
|
|
9
|
-
} from "./chunk-
|
|
9
|
+
} from "./chunk-EVWOJ4T5.js";
|
|
10
10
|
import {
|
|
11
11
|
parsePageRange
|
|
12
12
|
} from "./chunk-MOL7MDBG.js";
|
|
@@ -224,8 +224,11 @@ function blocksToMarkdown(blocks) {
|
|
|
224
224
|
if (lines.length > 0 && lines[lines.length - 1] !== "") {
|
|
225
225
|
lines.push("");
|
|
226
226
|
}
|
|
227
|
-
|
|
228
|
-
|
|
227
|
+
const tableMd = tableToMarkdown(block.table);
|
|
228
|
+
if (tableMd) {
|
|
229
|
+
lines.push(tableMd);
|
|
230
|
+
lines.push("");
|
|
231
|
+
}
|
|
229
232
|
}
|
|
230
233
|
}
|
|
231
234
|
return lines.join("\n").trim();
|
|
@@ -235,6 +238,7 @@ function tableToMarkdown(table) {
|
|
|
235
238
|
const { cells, rows: numRows, cols: numCols } = table;
|
|
236
239
|
if (numRows === 1 && numCols === 1) {
|
|
237
240
|
const content = sanitizeText(cells[0][0].text);
|
|
241
|
+
if (!content) return "";
|
|
238
242
|
return content.split(/\n/).map((line) => {
|
|
239
243
|
const trimmed = line.trim();
|
|
240
244
|
if (!trimmed) return "";
|
|
@@ -271,9 +275,9 @@ function tableToMarkdown(table) {
|
|
|
271
275
|
const row = display[r];
|
|
272
276
|
const isEmptyPlaceholder = row.every((cell) => cell === "");
|
|
273
277
|
if (isEmptyPlaceholder) continue;
|
|
274
|
-
const hasSkippedCols = row.some((cell, c) => cell === "" && skip.has(`${r},${c}`));
|
|
275
278
|
const nonEmptyCols = row.filter((cell) => cell !== "");
|
|
276
|
-
|
|
279
|
+
const hasSkipInRow = row.some((_, c) => skip.has(`${r},${c}`));
|
|
280
|
+
if (!hasSkipInRow && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
|
|
277
281
|
pendingFirstCol = row[0];
|
|
278
282
|
continue;
|
|
279
283
|
}
|
|
@@ -705,7 +709,8 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
705
709
|
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
706
710
|
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
707
711
|
}
|
|
708
|
-
|
|
712
|
+
const compactText = text.replace(/\s+/g, "");
|
|
713
|
+
if (/^제\d+[조장절편]/.test(compactText) && text.length <= 50) {
|
|
709
714
|
if (level === 0) level = 3;
|
|
710
715
|
}
|
|
711
716
|
if (level > 0) {
|
|
@@ -757,9 +762,14 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
757
762
|
if (newTable.rows.length > 0) {
|
|
758
763
|
if (tableStack.length > 0) {
|
|
759
764
|
const parentTable = tableStack.pop();
|
|
760
|
-
const
|
|
761
|
-
if (
|
|
762
|
-
|
|
765
|
+
const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
|
|
766
|
+
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
767
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
768
|
+
} else {
|
|
769
|
+
const nestedText = convertTableToText(newTable.rows);
|
|
770
|
+
if (parentTable.cell) {
|
|
771
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
772
|
+
}
|
|
763
773
|
}
|
|
764
774
|
tableCtx = parentTable;
|
|
765
775
|
} else {
|
|
@@ -859,9 +869,14 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
859
869
|
if (newTable.rows.length > 0) {
|
|
860
870
|
if (tableStack.length > 0) {
|
|
861
871
|
const parentTable = tableStack.pop();
|
|
862
|
-
const
|
|
863
|
-
if (
|
|
864
|
-
|
|
872
|
+
const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
|
|
873
|
+
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
874
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
875
|
+
} else {
|
|
876
|
+
const nestedText = convertTableToText(newTable.rows);
|
|
877
|
+
if (parentTable.cell) {
|
|
878
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
879
|
+
}
|
|
865
880
|
}
|
|
866
881
|
tableCtx = parentTable;
|
|
867
882
|
} else {
|
|
@@ -872,13 +887,20 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
872
887
|
tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
|
|
873
888
|
}
|
|
874
889
|
} else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
|
|
875
|
-
const
|
|
876
|
-
if (
|
|
877
|
-
|
|
878
|
-
} else
|
|
879
|
-
|
|
890
|
+
const drawTextChild = findDescendant(el, "drawText");
|
|
891
|
+
if (drawTextChild) {
|
|
892
|
+
extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
|
|
893
|
+
} else {
|
|
894
|
+
const imgRef = extractImageRef(el);
|
|
895
|
+
if (imgRef) {
|
|
896
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
897
|
+
} else if (warnings && sectionNum) {
|
|
898
|
+
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
899
|
+
}
|
|
880
900
|
}
|
|
881
|
-
} else if (localTag === "
|
|
901
|
+
} else if (localTag === "drawText") {
|
|
902
|
+
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
903
|
+
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
882
904
|
walkChildren(el, d + 1);
|
|
883
905
|
}
|
|
884
906
|
}
|
|
@@ -886,6 +908,40 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
886
908
|
walkChildren(node, depth);
|
|
887
909
|
return tableCtx;
|
|
888
910
|
}
|
|
911
|
+
function findDescendant(node, targetTag, depth = 0) {
|
|
912
|
+
if (depth > 5) return null;
|
|
913
|
+
const children = node.childNodes;
|
|
914
|
+
if (!children) return null;
|
|
915
|
+
for (let i = 0; i < children.length; i++) {
|
|
916
|
+
const child = children[i];
|
|
917
|
+
if (child.nodeType !== 1) continue;
|
|
918
|
+
const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
|
|
919
|
+
if (tag === targetTag) return child;
|
|
920
|
+
const found = findDescendant(child, targetTag, depth + 1);
|
|
921
|
+
if (found) return found;
|
|
922
|
+
}
|
|
923
|
+
return null;
|
|
924
|
+
}
|
|
925
|
+
function extractDrawTextBlocks(drawTextNode, blocks, styleMap, sectionNum) {
|
|
926
|
+
const children = drawTextNode.childNodes;
|
|
927
|
+
if (!children) return;
|
|
928
|
+
for (let i = 0; i < children.length; i++) {
|
|
929
|
+
const child = children[i];
|
|
930
|
+
if (child.nodeType !== 1) continue;
|
|
931
|
+
const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
|
|
932
|
+
if (tag === "subList" || tag === "p" || tag === "para") {
|
|
933
|
+
if (tag === "subList") {
|
|
934
|
+
extractDrawTextBlocks(child, blocks, styleMap, sectionNum);
|
|
935
|
+
} else {
|
|
936
|
+
const info = extractParagraphInfo(child, styleMap);
|
|
937
|
+
const text = info.text.trim();
|
|
938
|
+
if (text) {
|
|
939
|
+
blocks.push({ type: "paragraph", text, style: info.style ?? void 0, pageNumber: sectionNum });
|
|
940
|
+
}
|
|
941
|
+
}
|
|
942
|
+
}
|
|
943
|
+
}
|
|
944
|
+
}
|
|
889
945
|
function extractParagraphInfo(para, styleMap) {
|
|
890
946
|
let text = "";
|
|
891
947
|
let href;
|
|
@@ -904,11 +960,18 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
904
960
|
const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
|
|
905
961
|
switch (tag) {
|
|
906
962
|
case "t":
|
|
907
|
-
|
|
963
|
+
walk(child);
|
|
908
964
|
break;
|
|
909
|
-
|
|
910
|
-
|
|
965
|
+
// 자식 순회 (tab 등 하위 요소 처리)
|
|
966
|
+
case "tab": {
|
|
967
|
+
const leader = child.getAttribute("leader");
|
|
968
|
+
if (leader && leader !== "0") {
|
|
969
|
+
text += "";
|
|
970
|
+
} else {
|
|
971
|
+
text += " ";
|
|
972
|
+
}
|
|
911
973
|
break;
|
|
974
|
+
}
|
|
912
975
|
case "br":
|
|
913
976
|
if ((child.getAttribute("type") || "line") === "line") text += "\n";
|
|
914
977
|
break;
|
|
@@ -975,6 +1038,8 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
975
1038
|
}
|
|
976
1039
|
};
|
|
977
1040
|
walk(para);
|
|
1041
|
+
const leaderIdx = text.indexOf("");
|
|
1042
|
+
if (leaderIdx >= 0) text = text.substring(0, leaderIdx);
|
|
978
1043
|
let cleanText = text.replace(/[ \t]+/g, " ").trim();
|
|
979
1044
|
if (/^그림입니다\.?\s*원본\s*그림의\s*(이름|크기)/.test(cleanText)) cleanText = "";
|
|
980
1045
|
cleanText = cleanText.replace(/그림입니다\.?\s*원본\s*그림의\s*(이름|크기)[^\n]*(\n[^\n]*원본\s*그림의\s*(이름|크기)[^\n]*)*/g, "").trim();
|
|
@@ -1803,18 +1868,20 @@ function decryptDistributePayload(payload) {
|
|
|
1803
1868
|
if (payload.length < 256) throw new Error("\uBC30\uD3EC\uC6A9 payload\uAC00 256\uBC14\uC774\uD2B8 \uBBF8\uB9CC\uC785\uB2C8\uB2E4");
|
|
1804
1869
|
const seed = (payload[0] | payload[1] << 8 | payload[2] << 16 | payload[3] << 24) >>> 0;
|
|
1805
1870
|
const lcg = new MsvcLcg(seed);
|
|
1806
|
-
const result = new Uint8Array(256);
|
|
1807
|
-
|
|
1808
|
-
|
|
1809
|
-
|
|
1810
|
-
result[3] = payload[3];
|
|
1811
|
-
let i = 4;
|
|
1871
|
+
const result = new Uint8Array(payload.subarray(0, 256));
|
|
1872
|
+
let i = 0;
|
|
1873
|
+
let n = 0;
|
|
1874
|
+
let key = 0;
|
|
1812
1875
|
while (i < 256) {
|
|
1813
|
-
|
|
1814
|
-
|
|
1815
|
-
|
|
1816
|
-
|
|
1876
|
+
if (n === 0) {
|
|
1877
|
+
key = lcg.rand() & 255;
|
|
1878
|
+
n = (lcg.rand() & 15) + 1;
|
|
1879
|
+
}
|
|
1880
|
+
if (i >= 4) {
|
|
1881
|
+
result[i] ^= key;
|
|
1817
1882
|
}
|
|
1883
|
+
i++;
|
|
1884
|
+
n--;
|
|
1818
1885
|
}
|
|
1819
1886
|
return result;
|
|
1820
1887
|
}
|
|
@@ -1838,7 +1905,7 @@ function parseRecordHeader(data, offset) {
|
|
|
1838
1905
|
}
|
|
1839
1906
|
return { tagId, size, headerSize };
|
|
1840
1907
|
}
|
|
1841
|
-
var TAG_DISTRIBUTE_DOC_DATA = 16 +
|
|
1908
|
+
var TAG_DISTRIBUTE_DOC_DATA = 16 + 12;
|
|
1842
1909
|
function decryptViewText(viewTextRaw, compressed) {
|
|
1843
1910
|
const data = new Uint8Array(viewTextRaw);
|
|
1844
1911
|
const rec = parseRecordHeader(data, 0);
|
|
@@ -2456,7 +2523,10 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2456
2523
|
if (binId >= 0) {
|
|
2457
2524
|
blocks.push({ type: "image", text: String(binId), pageNumber: sectionNum });
|
|
2458
2525
|
} else {
|
|
2459
|
-
|
|
2526
|
+
const boxText = extractTextBoxText(records, i);
|
|
2527
|
+
if (boxText) {
|
|
2528
|
+
blocks.push({ type: "paragraph", text: boxText, pageNumber: sectionNum });
|
|
2529
|
+
}
|
|
2460
2530
|
}
|
|
2461
2531
|
} else if (ctrlId === " elo" || ctrlId === "ole ") {
|
|
2462
2532
|
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
|
|
@@ -2495,6 +2565,19 @@ function extractNoteText(records, ctrlIdx) {
|
|
|
2495
2565
|
}
|
|
2496
2566
|
return texts.length > 0 ? texts.join(" ") : null;
|
|
2497
2567
|
}
|
|
2568
|
+
function extractTextBoxText(records, ctrlIdx) {
|
|
2569
|
+
const ctrlLevel = records[ctrlIdx].level;
|
|
2570
|
+
const texts = [];
|
|
2571
|
+
for (let j = ctrlIdx + 1; j < records.length && j < ctrlIdx + 200; j++) {
|
|
2572
|
+
const r = records[j];
|
|
2573
|
+
if (r.level <= ctrlLevel) break;
|
|
2574
|
+
if (r.tagId === TAG_PARA_TEXT) {
|
|
2575
|
+
const t = extractText(r.data).trim();
|
|
2576
|
+
if (t) texts.push(t);
|
|
2577
|
+
}
|
|
2578
|
+
}
|
|
2579
|
+
return texts.length > 0 ? texts.join("\n") : null;
|
|
2580
|
+
}
|
|
2498
2581
|
function extractHyperlinkUrl(data) {
|
|
2499
2582
|
try {
|
|
2500
2583
|
const httpSig = Buffer.from("http", "utf16le");
|
|
@@ -5363,4 +5446,4 @@ export {
|
|
|
5363
5446
|
extractFormFields,
|
|
5364
5447
|
parse
|
|
5365
5448
|
};
|
|
5366
|
-
//# sourceMappingURL=chunk-
|
|
5449
|
+
//# sourceMappingURL=chunk-XJYM2AUA.js.map
|