kordoc 2.0.1 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +291 -291
- package/dist/{chunk-L4OFASDS.js → chunk-25TXW6EP.js} +2 -2
- package/dist/chunk-25TXW6EP.js.map +1 -0
- package/dist/{chunk-MOL7MDBG.js → chunk-3TBUDJDE.js} +1 -1
- package/dist/chunk-3TBUDJDE.js.map +1 -0
- package/dist/{chunk-JJ65GKUH.js → chunk-4UH6ABAY.js} +185 -41
- package/dist/chunk-4UH6ABAY.js.map +1 -0
- package/dist/cli.js +5 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +181 -37
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +181 -37
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-OF5I4PQY.js +8 -0
- package/dist/{provider-A4FHJSID.js → provider-EU3CG724.js} +1 -1
- package/dist/provider-EU3CG724.js.map +1 -0
- package/dist/{utils-4HVKHULU.js → utils-BTZ4WSYX.js} +2 -2
- package/dist/{watch-RNZ3KESY.js → watch-QD3PDNXQ.js} +4 -4
- package/dist/watch-QD3PDNXQ.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-JJ65GKUH.js.map +0 -1
- package/dist/chunk-L4OFASDS.js.map +0 -1
- package/dist/chunk-MOL7MDBG.js.map +0 -1
- package/dist/page-range-737B4EZW.js +0 -8
- package/dist/provider-A4FHJSID.js.map +0 -1
- package/dist/watch-RNZ3KESY.js.map +0 -1
- /package/dist/{page-range-737B4EZW.js.map → page-range-OF5I4PQY.js.map} +0 -0
- /package/dist/{utils-4HVKHULU.js.map → utils-BTZ4WSYX.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -138,7 +138,7 @@ import { inflateRawSync } from "zlib";
|
|
|
138
138
|
import { DOMParser } from "@xmldom/xmldom";
|
|
139
139
|
|
|
140
140
|
// src/utils.ts
|
|
141
|
-
var VERSION = true ? "2.0.
|
|
141
|
+
var VERSION = true ? "2.0.3" : "0.0.0-dev";
|
|
142
142
|
function toArrayBuffer(buf) {
|
|
143
143
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
144
144
|
return buf.buffer;
|
|
@@ -327,6 +327,47 @@ function sanitizeText(text) {
|
|
|
327
327
|
}
|
|
328
328
|
return result;
|
|
329
329
|
}
|
|
330
|
+
/**
 * Replaces small, text-heavy tables with one paragraph block per text line.
 *
 * A table block is flattened only when it is not 1x1, has at most 3 rows, and
 * either contains more than 5 newline characters across its cells or has at
 * most 2 rows with more than 300 characters of cell text in total. Every
 * other block is passed through unchanged (same object reference).
 *
 * @param {Array<object>} blocks - Blocks to scan; the input array is not mutated.
 * @returns {Array<object>} New array in which qualifying tables are replaced
 *   by `{ type: "paragraph", text, pageNumber }` blocks, emitted in row-major
 *   cell order with blank lines dropped and each line trimmed.
 */
function flattenLayoutTables(blocks) {
  const output = [];
  for (const block of blocks) {
    const table = block.type === "table" ? block.table : null;
    if (!table) {
      output.push(block);
      continue;
    }

    const rowCount = table.rows;
    const colCount = table.cols;
    const grid = table.cells;
    const isSingleCell = rowCount === 1 && colCount === 1;
    // 1x1 tables and tables taller than 3 rows are never flattened.
    if (isSingleCell || !(rowCount <= 3)) {
      output.push(block);
      continue;
    }

    // Gather the raw text of every cell in row-major order; missing rows or
    // cells yield undefined.
    const cellTexts = [];
    for (let r = 0; r < rowCount; r++) {
      for (let c = 0; c < colCount; c++) {
        cellTexts.push(grid[r]?.[c]?.text);
      }
    }

    let newlineTotal = 0;
    let charTotal = 0;
    for (const raw of cellTexts) {
      const t = raw || "";
      newlineTotal += t.match(/\n/g)?.length ?? 0;
      charTotal += t.length;
    }

    const shouldFlatten = newlineTotal > 5 || (rowCount <= 2 && charTotal > 300);
    if (!shouldFlatten) {
      output.push(block);
      continue;
    }

    for (const raw of cellTexts) {
      const cellText = raw?.trim();
      if (!cellText) continue;
      for (const piece of cellText.split("\n")) {
        const line = piece.trim();
        if (line) {
          output.push({ type: "paragraph", text: line, pageNumber: block.pageNumber });
        }
      }
    }
  }
  return output;
}
|
|
330
371
|
function blocksToMarkdown(blocks) {
|
|
331
372
|
const lines = [];
|
|
332
373
|
for (let i = 0; i < blocks.length; i++) {
|
|
@@ -388,8 +429,11 @@ function blocksToMarkdown(blocks) {
|
|
|
388
429
|
if (lines.length > 0 && lines[lines.length - 1] !== "") {
|
|
389
430
|
lines.push("");
|
|
390
431
|
}
|
|
391
|
-
|
|
392
|
-
|
|
432
|
+
const tableMd = tableToMarkdown(block.table);
|
|
433
|
+
if (tableMd) {
|
|
434
|
+
lines.push(tableMd);
|
|
435
|
+
lines.push("");
|
|
436
|
+
}
|
|
393
437
|
}
|
|
394
438
|
}
|
|
395
439
|
return lines.join("\n").trim();
|
|
@@ -399,6 +443,7 @@ function tableToMarkdown(table) {
|
|
|
399
443
|
const { cells, rows: numRows, cols: numCols } = table;
|
|
400
444
|
if (numRows === 1 && numCols === 1) {
|
|
401
445
|
const content = sanitizeText(cells[0][0].text);
|
|
446
|
+
if (!content) return "";
|
|
402
447
|
return content.split(/\n/).map((line) => {
|
|
403
448
|
const trimmed = line.trim();
|
|
404
449
|
if (!trimmed) return "";
|
|
@@ -435,9 +480,9 @@ function tableToMarkdown(table) {
|
|
|
435
480
|
const row = display[r];
|
|
436
481
|
const isEmptyPlaceholder = row.every((cell) => cell === "");
|
|
437
482
|
if (isEmptyPlaceholder) continue;
|
|
438
|
-
const hasSkippedCols = row.some((cell, c) => cell === "" && skip.has(`${r},${c}`));
|
|
439
483
|
const nonEmptyCols = row.filter((cell) => cell !== "");
|
|
440
|
-
|
|
484
|
+
const hasSkipInRow = row.some((_, c) => skip.has(`${r},${c}`));
|
|
485
|
+
if (!hasSkipInRow && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
|
|
441
486
|
pendingFirstCol = row[0];
|
|
442
487
|
continue;
|
|
443
488
|
}
|
|
@@ -852,7 +897,8 @@ function detectHwpxHeadings(blocks, styleMap) {
|
|
|
852
897
|
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
853
898
|
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
854
899
|
}
|
|
855
|
-
|
|
900
|
+
const compactText = text.replace(/\s+/g, "");
|
|
901
|
+
if (/^제\d+[조장절편]/.test(compactText) && text.length <= 50) {
|
|
856
902
|
if (level === 0) level = 3;
|
|
857
903
|
}
|
|
858
904
|
if (level > 0) {
|
|
@@ -904,9 +950,14 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
|
|
|
904
950
|
if (newTable.rows.length > 0) {
|
|
905
951
|
if (tableStack.length > 0) {
|
|
906
952
|
const parentTable = tableStack.pop();
|
|
907
|
-
const
|
|
908
|
-
if (
|
|
909
|
-
|
|
953
|
+
const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
|
|
954
|
+
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
955
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
956
|
+
} else {
|
|
957
|
+
const nestedText = convertTableToText(newTable.rows);
|
|
958
|
+
if (parentTable.cell) {
|
|
959
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
960
|
+
}
|
|
910
961
|
}
|
|
911
962
|
tableCtx = parentTable;
|
|
912
963
|
} else {
|
|
@@ -1006,9 +1057,14 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1006
1057
|
if (newTable.rows.length > 0) {
|
|
1007
1058
|
if (tableStack.length > 0) {
|
|
1008
1059
|
const parentTable = tableStack.pop();
|
|
1009
|
-
const
|
|
1010
|
-
if (
|
|
1011
|
-
|
|
1060
|
+
const nestedCols = Math.max(...newTable.rows.map((r) => r.length));
|
|
1061
|
+
if (newTable.rows.length >= 3 && nestedCols >= 2) {
|
|
1062
|
+
blocks.push({ type: "table", table: buildTable(newTable.rows), pageNumber: sectionNum });
|
|
1063
|
+
} else {
|
|
1064
|
+
const nestedText = convertTableToText(newTable.rows);
|
|
1065
|
+
if (parentTable.cell) {
|
|
1066
|
+
parentTable.cell.text += (parentTable.cell.text ? "\n" : "") + nestedText;
|
|
1067
|
+
}
|
|
1012
1068
|
}
|
|
1013
1069
|
tableCtx = parentTable;
|
|
1014
1070
|
} else {
|
|
@@ -1019,13 +1075,20 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1019
1075
|
tableCtx = tableStack.length > 0 ? tableStack.pop() : null;
|
|
1020
1076
|
}
|
|
1021
1077
|
} else if (localTag === "pic" || localTag === "shape" || localTag === "drawingObject") {
|
|
1022
|
-
const
|
|
1023
|
-
if (
|
|
1024
|
-
|
|
1025
|
-
} else
|
|
1026
|
-
|
|
1078
|
+
const drawTextChild = findDescendant(el, "drawText");
|
|
1079
|
+
if (drawTextChild) {
|
|
1080
|
+
extractDrawTextBlocks(drawTextChild, blocks, styleMap, sectionNum);
|
|
1081
|
+
} else {
|
|
1082
|
+
const imgRef = extractImageRef(el);
|
|
1083
|
+
if (imgRef) {
|
|
1084
|
+
blocks.push({ type: "image", text: imgRef, pageNumber: sectionNum });
|
|
1085
|
+
} else if (warnings && sectionNum) {
|
|
1086
|
+
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC694\uC18C: ${localTag}`, code: "SKIPPED_IMAGE" });
|
|
1087
|
+
}
|
|
1027
1088
|
}
|
|
1028
|
-
} else if (localTag === "
|
|
1089
|
+
} else if (localTag === "drawText") {
|
|
1090
|
+
extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
|
|
1091
|
+
} else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
|
|
1029
1092
|
walkChildren(el, d + 1);
|
|
1030
1093
|
}
|
|
1031
1094
|
}
|
|
@@ -1033,6 +1096,40 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
|
|
|
1033
1096
|
walkChildren(node, depth);
|
|
1034
1097
|
return tableCtx;
|
|
1035
1098
|
}
|
|
1099
|
+
/**
 * Depth-first, pre-order search for the first descendant element whose tag
 * name, with any `prefix:` namespace prefix removed, equals `targetTag`.
 *
 * @param {object} node - Node whose `childNodes` are searched.
 * @param {string} targetTag - Unprefixed tag name to look for.
 * @param {number} [depth=0] - Current recursion depth; the search gives up
 *   once it exceeds 5.
 * @returns {object|null} The matching element, or null when none is found
 *   within the depth limit.
 */
function findDescendant(node, targetTag, depth = 0) {
  const kids = depth > 5 ? null : node.childNodes;
  if (!kids) return null;

  let idx = 0;
  while (idx < kids.length) {
    const candidate = kids[idx++];
    // Only element nodes (nodeType 1) are considered.
    if (candidate.nodeType === 1) {
      const qualifiedName = candidate.tagName || candidate.localName || "";
      const localName = qualifiedName.replace(/^[^:]+:/, "");
      if (localName === targetTag) return candidate;

      const nested = findDescendant(candidate, targetTag, depth + 1);
      if (nested) return nested;
    }
  }
  return null;
}
|
|
1113
|
+
/**
 * Appends a paragraph block for each non-empty `p` / `para` element found
 * directly under `drawTextNode`, descending recursively through `subList`
 * children. Tag names are compared after removing any `prefix:` namespace
 * prefix; other elements and non-element nodes are ignored.
 *
 * @param {object} drawTextNode - Node whose `childNodes` are scanned.
 * @param {Array<object>} blocks - Output array; paragraph blocks are pushed onto it.
 * @param {*} styleMap - Passed through unchanged to `extractParagraphInfo`.
 * @param {*} sectionNum - Stored as `pageNumber` on every block produced.
 * @returns {void}
 */
function extractDrawTextBlocks(drawTextNode, blocks, styleMap, sectionNum) {
  const kids = drawTextNode.childNodes;
  if (!kids) return;

  for (let idx = 0; idx < kids.length; idx++) {
    const element = kids[idx];
    if (element.nodeType !== 1) continue;

    const localName = (element.tagName || element.localName || "").replace(/^[^:]+:/, "");
    switch (localName) {
      case "subList":
        extractDrawTextBlocks(element, blocks, styleMap, sectionNum);
        break;
      case "p":
      case "para": {
        const info = extractParagraphInfo(element, styleMap);
        const text = info.text.trim();
        // Paragraphs that are empty after trimming produce no block.
        if (text) {
          blocks.push({ type: "paragraph", text, style: info.style ?? undefined, pageNumber: sectionNum });
        }
        break;
      }
      default:
        break;
    }
  }
}
|
|
1036
1133
|
function extractParagraphInfo(para, styleMap) {
|
|
1037
1134
|
let text = "";
|
|
1038
1135
|
let href;
|
|
@@ -1051,11 +1148,18 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
1051
1148
|
const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
|
|
1052
1149
|
switch (tag) {
|
|
1053
1150
|
case "t":
|
|
1054
|
-
|
|
1151
|
+
walk(child);
|
|
1055
1152
|
break;
|
|
1056
|
-
|
|
1057
|
-
|
|
1153
|
+
// 자식 순회 (tab 등 하위 요소 처리)
|
|
1154
|
+
case "tab": {
|
|
1155
|
+
const leader = child.getAttribute("leader");
|
|
1156
|
+
if (leader && leader !== "0") {
|
|
1157
|
+
text += "";
|
|
1158
|
+
} else {
|
|
1159
|
+
text += " ";
|
|
1160
|
+
}
|
|
1058
1161
|
break;
|
|
1162
|
+
}
|
|
1059
1163
|
case "br":
|
|
1060
1164
|
if ((child.getAttribute("type") || "line") === "line") text += "\n";
|
|
1061
1165
|
break;
|
|
@@ -1122,6 +1226,8 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
1122
1226
|
}
|
|
1123
1227
|
};
|
|
1124
1228
|
walk(para);
|
|
1229
|
+
const leaderIdx = text.indexOf("");
|
|
1230
|
+
if (leaderIdx >= 0) text = text.substring(0, leaderIdx);
|
|
1125
1231
|
let cleanText = text.replace(/[ \t]+/g, " ").trim();
|
|
1126
1232
|
if (/^그림입니다\.?\s*원본\s*그림의\s*(이름|크기)/.test(cleanText)) cleanText = "";
|
|
1127
1233
|
cleanText = cleanText.replace(/그림입니다\.?\s*원본\s*그림의\s*(이름|크기)[^\n]*(\n[^\n]*원본\s*그림의\s*(이름|크기)[^\n]*)*/g, "").trim();
|
|
@@ -1160,8 +1266,9 @@ var TAG_CHAR_SHAPE = 68;
|
|
|
1160
1266
|
var TAG_CTRL_HEADER = 71;
|
|
1161
1267
|
var TAG_LIST_HEADER = 72;
|
|
1162
1268
|
var TAG_TABLE = 77;
|
|
1163
|
-
var TAG_DOC_CHAR_SHAPE =
|
|
1164
|
-
var
|
|
1269
|
+
var TAG_DOC_CHAR_SHAPE = 21;
|
|
1270
|
+
var TAG_DOC_PARA_SHAPE = 25;
|
|
1271
|
+
var TAG_DOC_STYLE = 26;
|
|
1165
1272
|
var CHAR_LINE = 0;
|
|
1166
1273
|
var CHAR_SECTION_BREAK = 10;
|
|
1167
1274
|
var CHAR_PARA = 13;
|
|
@@ -1217,8 +1324,14 @@ function parseFileHeader(data) {
|
|
|
1217
1324
|
}
|
|
1218
1325
|
function parseDocInfo(records) {
|
|
1219
1326
|
const charShapes = [];
|
|
1327
|
+
const paraShapes = [];
|
|
1220
1328
|
const styles = [];
|
|
1221
1329
|
for (const rec of records) {
|
|
1330
|
+
if (rec.tagId === TAG_DOC_PARA_SHAPE && rec.data.length >= 4) {
|
|
1331
|
+
const flags = rec.data.readUInt32LE(0);
|
|
1332
|
+
const outlineLevel = flags >> 25 & 7;
|
|
1333
|
+
paraShapes.push({ outlineLevel });
|
|
1334
|
+
}
|
|
1222
1335
|
if (rec.tagId === TAG_DOC_CHAR_SHAPE && rec.data.length >= 18) {
|
|
1223
1336
|
if (rec.data.length >= 50) {
|
|
1224
1337
|
const fontSize = rec.data.readUInt32LE(42);
|
|
@@ -1258,7 +1371,7 @@ function parseDocInfo(records) {
|
|
|
1258
1371
|
}
|
|
1259
1372
|
}
|
|
1260
1373
|
}
|
|
1261
|
-
return { charShapes, styles };
|
|
1374
|
+
return { charShapes, paraShapes, styles };
|
|
1262
1375
|
}
|
|
1263
1376
|
function extractText(data) {
|
|
1264
1377
|
let result = "";
|
|
@@ -2269,12 +2382,13 @@ function parseHwp5Document(buffer, options) {
|
|
|
2269
2382
|
}
|
|
2270
2383
|
}
|
|
2271
2384
|
const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
|
|
2385
|
+
const flatBlocks = flattenLayoutTables(blocks);
|
|
2272
2386
|
if (docInfo) {
|
|
2273
|
-
detectHwp5Headings(
|
|
2387
|
+
detectHwp5Headings(flatBlocks, docInfo);
|
|
2274
2388
|
}
|
|
2275
|
-
const outline =
|
|
2276
|
-
const markdown = blocksToMarkdown(
|
|
2277
|
-
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
2389
|
+
const outline = flatBlocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
2390
|
+
const markdown = blocksToMarkdown(flatBlocks);
|
|
2391
|
+
return { markdown, blocks: flatBlocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
2278
2392
|
}
|
|
2279
2393
|
function parseDocInfoStream(cfb, compressed) {
|
|
2280
2394
|
try {
|
|
@@ -2325,16 +2439,21 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
2325
2439
|
}
|
|
2326
2440
|
if (baseFontSize <= 0) return;
|
|
2327
2441
|
for (const block of blocks) {
|
|
2328
|
-
if (block.type
|
|
2442
|
+
if (block.type === "heading") continue;
|
|
2443
|
+
if (block.type !== "paragraph" || !block.text) continue;
|
|
2329
2444
|
const text = block.text.trim();
|
|
2330
2445
|
if (text.length === 0 || text.length > 200) continue;
|
|
2331
2446
|
if (/^\d+$/.test(text)) continue;
|
|
2332
|
-
const ratio = block.style.fontSize / baseFontSize;
|
|
2333
2447
|
let level = 0;
|
|
2334
|
-
if (
|
|
2335
|
-
|
|
2336
|
-
|
|
2337
|
-
|
|
2448
|
+
if (block.style?.fontSize && baseFontSize > 0) {
|
|
2449
|
+
const ratio = block.style.fontSize / baseFontSize;
|
|
2450
|
+
if (ratio >= HEADING_RATIO_H1) level = 1;
|
|
2451
|
+
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
2452
|
+
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
2453
|
+
}
|
|
2454
|
+
if (/^제\d+[장절편]\s/.test(text) && text.length <= 50) {
|
|
2455
|
+
if (level === 0) level = 2;
|
|
2456
|
+
} else if (/^제\d+(조의?\d*)\s*[\((]/.test(text) && text.length <= 80) {
|
|
2338
2457
|
if (level === 0) level = 3;
|
|
2339
2458
|
}
|
|
2340
2459
|
if (level > 0) {
|
|
@@ -2566,13 +2685,20 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2566
2685
|
while (i < records.length) {
|
|
2567
2686
|
const rec = records[i];
|
|
2568
2687
|
if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
|
|
2569
|
-
const { paragraph, tables, nextIdx, charShapeIds } = parseParagraphWithTables(records, i);
|
|
2688
|
+
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
|
|
2570
2689
|
if (paragraph) {
|
|
2571
2690
|
const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
|
|
2572
2691
|
if (docInfo && charShapeIds.length > 0) {
|
|
2573
2692
|
const style = resolveCharStyle(charShapeIds, docInfo);
|
|
2574
2693
|
if (style) block.style = style;
|
|
2575
2694
|
}
|
|
2695
|
+
if (docInfo && paraShapeId >= 0 && paraShapeId < docInfo.paraShapes.length) {
|
|
2696
|
+
const ol = docInfo.paraShapes[paraShapeId].outlineLevel;
|
|
2697
|
+
if (ol >= 1 && ol <= 6) {
|
|
2698
|
+
block.type = "heading";
|
|
2699
|
+
block.level = ol;
|
|
2700
|
+
}
|
|
2701
|
+
}
|
|
2576
2702
|
blocks.push(block);
|
|
2577
2703
|
}
|
|
2578
2704
|
for (const t of tables) blocks.push({ type: "table", table: t, pageNumber: sectionNum });
|
|
@@ -2592,7 +2718,10 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2592
2718
|
if (binId >= 0) {
|
|
2593
2719
|
blocks.push({ type: "image", text: String(binId), pageNumber: sectionNum });
|
|
2594
2720
|
} else {
|
|
2595
|
-
|
|
2721
|
+
const boxText = extractTextBoxText(records, i);
|
|
2722
|
+
if (boxText) {
|
|
2723
|
+
blocks.push({ type: "paragraph", text: boxText, pageNumber: sectionNum });
|
|
2724
|
+
}
|
|
2596
2725
|
}
|
|
2597
2726
|
} else if (ctrlId === " elo" || ctrlId === "ole ") {
|
|
2598
2727
|
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
|
|
@@ -2631,6 +2760,19 @@ function extractNoteText(records, ctrlIdx) {
|
|
|
2631
2760
|
}
|
|
2632
2761
|
return texts.length > 0 ? texts.join(" ") : null;
|
|
2633
2762
|
}
|
|
2763
|
+
/**
 * Collects the text of TAG_PARA_TEXT records that follow the record at
 * `ctrlIdx` and sit at a greater `level` than it.
 *
 * Scanning stops at the first record whose level is less than or equal to
 * the control record's level, at the end of `records`, or before index
 * `ctrlIdx + 200`, whichever comes first.
 *
 * @param {Array<object>} records - Records exposing `level`, `tagId` and `data`.
 * @param {number} ctrlIdx - Index of the control record to scan after.
 * @returns {string|null} Trimmed, non-empty texts joined with "\n", or null
 *   when no text was found.
 */
function extractTextBoxText(records, ctrlIdx) {
  const ownerLevel = records[ctrlIdx].level;
  const scanEnd = Math.min(records.length, ctrlIdx + 200);
  const pieces = [];

  for (let cursor = ctrlIdx + 1; cursor < scanEnd; cursor++) {
    const rec = records[cursor];
    if (rec.level <= ownerLevel) break;
    if (rec.tagId !== TAG_PARA_TEXT) continue;

    const content = extractText(rec.data).trim();
    if (content) pieces.push(content);
  }

  return pieces.length > 0 ? pieces.join("\n") : null;
}
|
|
2634
2776
|
function extractHyperlinkUrl(data) {
|
|
2635
2777
|
try {
|
|
2636
2778
|
const httpSig = Buffer.from("http", "utf16le");
|
|
@@ -2676,6 +2818,8 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2676
2818
|
let text = "";
|
|
2677
2819
|
const tables = [];
|
|
2678
2820
|
const charShapeIds = [];
|
|
2821
|
+
const paraHeaderData = records[startIdx].data;
|
|
2822
|
+
const paraShapeId = paraHeaderData.length >= 10 ? paraHeaderData.readUInt16LE(8) : -1;
|
|
2679
2823
|
let i = startIdx + 1;
|
|
2680
2824
|
while (i < records.length) {
|
|
2681
2825
|
const rec = records[i];
|
|
@@ -2700,7 +2844,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2700
2844
|
i++;
|
|
2701
2845
|
}
|
|
2702
2846
|
const trimmed = text.trim();
|
|
2703
|
-
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds };
|
|
2847
|
+
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
|
|
2704
2848
|
}
|
|
2705
2849
|
function parseTableBlock(records, startIdx) {
|
|
2706
2850
|
const tableLevel = records[startIdx].level;
|