kordoc 2.0.1 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-L4OFASDS.js → chunk-EVWOJ4T5.js} +2 -2
- package/dist/{chunk-JJ65GKUH.js → chunk-XJYM2AUA.js} +104 -23
- package/dist/chunk-XJYM2AUA.js.map +1 -0
- package/dist/cli.js +4 -4
- package/dist/index.cjs +103 -22
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +103 -22
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -2
- package/dist/{utils-4HVKHULU.js → utils-6JEIFBCJ.js} +2 -2
- package/dist/{watch-RNZ3KESY.js → watch-BCPDLGOE.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-JJ65GKUH.js.map +0 -1
- /package/dist/{chunk-L4OFASDS.js.map → chunk-EVWOJ4T5.js.map} +0 -0
- /package/dist/{utils-4HVKHULU.js.map → utils-6JEIFBCJ.js.map} +0 -0
- /package/dist/{watch-RNZ3KESY.js.map → watch-BCPDLGOE.js.map} +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/utils.ts
|
|
4
|
-
var VERSION = true ? "2.0.1" : "0.0.0-dev";
|
|
4
|
+
var VERSION = true ? "2.0.2" : "0.0.0-dev";
|
|
5
5
|
function toArrayBuffer(buf) {
|
|
6
6
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
7
7
|
return buf.buffer;
|
|
@@ -90,4 +90,4 @@ export {
|
|
|
90
90
|
sanitizeHref,
|
|
91
91
|
classifyError
|
|
92
92
|
};
|
|
93
|
-
//# sourceMappingURL=chunk-L4OFASDS.js.map
|
|
93
|
+
//# sourceMappingURL=chunk-EVWOJ4T5.js.map
|
|
@@ -6,7 +6,7 @@ import {
|
|
|
6
6
|
precheckZipSize,
|
|
7
7
|
sanitizeHref,
|
|
8
8
|
toArrayBuffer
|
|
9
|
-
} from "./chunk-L4OFASDS.js";
|
|
9
|
+
} from "./chunk-EVWOJ4T5.js";
|
|
10
10
|
import {
|
|
11
11
|
parsePageRange
|
|
12
12
|
} from "./chunk-MOL7MDBG.js";
|
|
@@ -224,8 +224,11 @@ function blocksToMarkdown(blocks) {
|
|
|
224
224
|
if (lines.length > 0 && lines[lines.length - 1] !== "") {
|
|
225
225
|
lines.push("");
|
|
226
226
|
}
|
|
227
|
-
|
|
228
|
-
|
|
227
|
+
const tableMd = tableToMarkdown(block.table);
|
|
228
|
+
if (tableMd) {
|
|
229
|
+
lines.push(tableMd);
|
|
230
|
+
lines.push("");
|
|
231
|
+
}
|
|
229
232
|
}
|
|
230
233
|
}
|
|
231
234
|
return lines.join("\n").trim();
|
|
@@ -235,6 +238,7 @@ function tableToMarkdown(table) {
|
|
|
235
238
|
const { cells, rows: numRows, cols: numCols } = table;
|
|
236
239
|
if (numRows === 1 && numCols === 1) {
|
|
237
240
|
const content = sanitizeText(cells[0][0].text);
|
|
241
|
+
if (!content) return "";
|
|
238
242
|
return content.split(/\n/).map((line) => {
|
|
239
243
|
const trimmed = line.trim();
|
|
240
244
|
if (!trimmed) return "";
|
|
@@ -271,9 +275,9 @@ function tableToMarkdown(table) {
|
|
|
271
275
|
const row = display[r];
|
|
272
276
|
const isEmptyPlaceholder = row.every((cell) => cell === "");
|
|
273
277
|
if (isEmptyPlaceholder) continue;
|
|
274
|
-
const hasSkippedCols = row.some((cell, c) => cell === "" && skip.has(`${r},${c}`));
|
|
275
278
|
const nonEmptyCols = row.filter((cell) => cell !== "");
|
|
276
|
-
|
|
279
|
+
const hasSkipInRow = row.some((_, c) => skip.has(`${r},${c}`));
|
|
280
|
+
if (!hasSkipInRow && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
|
|
277
281
|
pendingFirstCol = row[0];
|
|
278
282
|
continue;
|
|
279
283
|
}
|
|
@@ -705,7 +709,8 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
705
709
|
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
706
710
|
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
707
711
|
}
|
|
708
|
-
|
|
712
|
+
const compactText = text.replace(/\s+/g, "");
|
|
713
|
+
if (/^제\d+[조장절편]/.test(compactText) && text.length <= 50) {
|
|
709
714
|
if (level === 0) level = 3;
|
|
710
715
|
}
|
|
711
716
|
if (level > 0) {
|
|
@@ -757,9 +762,14 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
757
762
|
if (newTable.rows.length > 0) {
|
|
758
763
|
if (tableStack.length > 0) {
|
|
759
764
|
const parentTable = tableStack.pop();
|
|
760
|
-
const
|
|
761
|
-
if (
|
|
762
|
-
|
|
765
|
+
const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
|
|
766
|
+
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
767
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
768
|
+
} else {
|
|
769
|
+
const nestedText = convertTableToText(newTable.rows);
|
|
770
|
+
if (parentTable.cell) {
|
|
771
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
772
|
+
}
|
|
763
773
|
}
|
|
764
774
|
tableCtx = parentTable;
|
|
765
775
|
} else {
|
|
@@ -859,9 +869,14 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
859
869
|
if (newTable.rows.length > 0) {
|
|
860
870
|
if (tableStack.length > 0) {
|
|
861
871
|
const parentTable = tableStack.pop();
|
|
862
|
-
const
|
|
863
|
-
if (
|
|
864
|
-
|
|
872
|
+
const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
|
|
873
|
+
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
874
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
875
|
+
} else {
|
|
876
|
+
const nestedText = convertTableToText(newTable.rows);
|
|
877
|
+
if (parentTable.cell) {
|
|
878
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
879
|
+
}
|
|
865
880
|
}
|
|
866
881
|
tableCtx = parentTable;
|
|
867
882
|
} else {
|
|
@@ -872,13 +887,20 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
872
887
|
tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
|
|
873
888
|
}
|
|
874
889
|
} else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
|
|
875
|
-
const
|
|
876
|
-
if (
|
|
877
|
-
|
|
878
|
-
} else
|
|
879
|
-
|
|
890
|
+
const drawTextChild = findDescendant(el, "drawText");
|
|
891
|
+
if (drawTextChild) {
|
|
892
|
+
extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
|
|
893
|
+
} else {
|
|
894
|
+
const imgRef = extractImageRef(el);
|
|
895
|
+
if (imgRef) {
|
|
896
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
897
|
+
} else if (warnings && sectionNum) {
|
|
898
|
+
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
899
|
+
}
|
|
880
900
|
}
|
|
881
|
-
} else if (localTag === "
|
|
901
|
+
} else if (localTag === "drawText") {
|
|
902
|
+
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
903
|
+
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
882
904
|
walkChildren(el, d + 1);
|
|
883
905
|
}
|
|
884
906
|
}
|
|
@@ -886,6 +908,40 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
886
908
|
walkChildren(node, depth);
|
|
887
909
|
return tableCtx;
|
|
888
910
|
}
|
|
911
|
+
function findDescendant(node, targetTag, depth = 0) {
|
|
912
|
+
if (depth > 5) return null;
|
|
913
|
+
const children = node.childNodes;
|
|
914
|
+
if (!children) return null;
|
|
915
|
+
for (let i = 0; i < children.length; i++) {
|
|
916
|
+
const child = children[i];
|
|
917
|
+
if (child.nodeType !== 1) continue;
|
|
918
|
+
const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
|
|
919
|
+
if (tag === targetTag) return child;
|
|
920
|
+
const found = findDescendant(child, targetTag, depth + 1);
|
|
921
|
+
if (found) return found;
|
|
922
|
+
}
|
|
923
|
+
return null;
|
|
924
|
+
}
|
|
925
|
+
function extractDrawTextBlocks(drawTextNode, blocks, styleMap, sectionNum) {
|
|
926
|
+
const children = drawTextNode.childNodes;
|
|
927
|
+
if (!children) return;
|
|
928
|
+
for (let i = 0; i < children.length; i++) {
|
|
929
|
+
const child = children[i];
|
|
930
|
+
if (child.nodeType !== 1) continue;
|
|
931
|
+
const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
|
|
932
|
+
if (tag === "subList" || tag === "p" || tag === "para") {
|
|
933
|
+
if (tag === "subList") {
|
|
934
|
+
extractDrawTextBlocks(child, blocks, styleMap, sectionNum);
|
|
935
|
+
} else {
|
|
936
|
+
const info = extractParagraphInfo(child, styleMap);
|
|
937
|
+
const text = info.text.trim();
|
|
938
|
+
if (text) {
|
|
939
|
+
blocks.push({ type: "paragraph", text, style: info.style ?? void 0, pageNumber: sectionNum });
|
|
940
|
+
}
|
|
941
|
+
}
|
|
942
|
+
}
|
|
943
|
+
}
|
|
944
|
+
}
|
|
889
945
|
function extractParagraphInfo(para, styleMap) {
|
|
890
946
|
let text = "";
|
|
891
947
|
let href;
|
|
@@ -904,11 +960,18 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
904
960
|
const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
|
|
905
961
|
switch (tag) {
|
|
906
962
|
case "t":
|
|
907
|
-
|
|
963
|
+
walk(child);
|
|
908
964
|
break;
|
|
909
|
-
|
|
910
|
-
|
|
965
|
+
// 자식 순회 (tab 등 하위 요소 처리)
|
|
966
|
+
case "tab": {
|
|
967
|
+
const leader = child.getAttribute("leader");
|
|
968
|
+
if (leader && leader !== "0") {
|
|
969
|
+
text += "";
|
|
970
|
+
} else {
|
|
971
|
+
text += " ";
|
|
972
|
+
}
|
|
911
973
|
break;
|
|
974
|
+
}
|
|
912
975
|
case "br":
|
|
913
976
|
if ((child.getAttribute("type") || "line") === "line") text += "\n";
|
|
914
977
|
break;
|
|
@@ -975,6 +1038,8 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
975
1038
|
}
|
|
976
1039
|
};
|
|
977
1040
|
walk(para);
|
|
1041
|
+
const leaderIdx = text.indexOf("");
|
|
1042
|
+
if (leaderIdx >= 0) text = text.substring(0, leaderIdx);
|
|
978
1043
|
let cleanText = text.replace(/[ \t]+/g, " ").trim();
|
|
979
1044
|
if (/^그림입니다\.?\s*원본\s*그림의\s*(이름|크기)/.test(cleanText)) cleanText = "";
|
|
980
1045
|
cleanText = cleanText.replace(/그림입니다\.?\s*원본\s*그림의\s*(이름|크기)[^\n]*(\n[^\n]*원본\s*그림의\s*(이름|크기)[^\n]*)*/g, "").trim();
|
|
@@ -2458,7 +2523,10 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2458
2523
|
if (binId >= 0) {
|
|
2459
2524
|
blocks.push({ type: "image", text: String(binId), pageNumber: sectionNum });
|
|
2460
2525
|
} else {
|
|
2461
|
-
|
|
2526
|
+
const boxText = extractTextBoxText(records, i);
|
|
2527
|
+
if (boxText) {
|
|
2528
|
+
blocks.push({ type: "paragraph", text: boxText, pageNumber: sectionNum });
|
|
2529
|
+
}
|
|
2462
2530
|
}
|
|
2463
2531
|
} else if (ctrlId === " elo" || ctrlId === "ole ") {
|
|
2464
2532
|
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
|
|
@@ -2497,6 +2565,19 @@ function extractNoteText(records, ctrlIdx) {
|
|
|
2497
2565
|
}
|
|
2498
2566
|
return texts.length > 0 ? texts.join(" ") : null;
|
|
2499
2567
|
}
|
|
2568
|
+
function extractTextBoxText(records, ctrlIdx) {
|
|
2569
|
+
const ctrlLevel = records[ctrlIdx].level;
|
|
2570
|
+
const texts = [];
|
|
2571
|
+
for (let j = ctrlIdx + 1; j < records.length && j < ctrlIdx + 200; j++) {
|
|
2572
|
+
const r = records[j];
|
|
2573
|
+
if (r.level <= ctrlLevel) break;
|
|
2574
|
+
if (r.tagId === TAG_PARA_TEXT) {
|
|
2575
|
+
const t = extractText(r.data).trim();
|
|
2576
|
+
if (t) texts.push(t);
|
|
2577
|
+
}
|
|
2578
|
+
}
|
|
2579
|
+
return texts.length > 0 ? texts.join("\n") : null;
|
|
2580
|
+
}
|
|
2500
2581
|
function extractHyperlinkUrl(data) {
|
|
2501
2582
|
try {
|
|
2502
2583
|
const httpSig = Buffer.from("http", "utf16le");
|
|
@@ -5365,4 +5446,4 @@ export {
|
|
|
5365
5446
|
extractFormFields,
|
|
5366
5447
|
parse
|
|
5367
5448
|
};
|
|
5368
|
-
//# sourceMappingURL=chunk-JJ65GKUH.js.map
|
|
5449
|
+
//# sourceMappingURL=chunk-XJYM2AUA.js.map
|