kordoc 2.0.2 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +291 -291
- package/dist/{chunk-EVWOJ4T5.js → chunk-25TXW6EP.js} +2 -2
- package/dist/chunk-25TXW6EP.js.map +1 -0
- package/dist/{chunk-MOL7MDBG.js → chunk-3TBUDJDE.js} +1 -1
- package/dist/chunk-3TBUDJDE.js.map +1 -0
- package/dist/{chunk-XJYM2AUA.js → chunk-4UH6ABAY.js} +83 -20
- package/dist/chunk-4UH6ABAY.js.map +1 -0
- package/dist/cli.js +5 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +79 -16
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +79 -16
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-OF5I4PQY.js +8 -0
- package/dist/{provider-A4FHJSID.js → provider-EU3CG724.js} +1 -1
- package/dist/provider-EU3CG724.js.map +1 -0
- package/dist/{utils-6JEIFBCJ.js → utils-BTZ4WSYX.js} +2 -2
- package/dist/{watch-BCPDLGOE.js → watch-QD3PDNXQ.js} +4 -4
- package/dist/watch-QD3PDNXQ.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-EVWOJ4T5.js.map +0 -1
- package/dist/chunk-MOL7MDBG.js.map +0 -1
- package/dist/chunk-XJYM2AUA.js.map +0 -1
- package/dist/page-range-737B4EZW.js +0 -8
- package/dist/provider-A4FHJSID.js.map +0 -1
- package/dist/watch-BCPDLGOE.js.map +0 -1
- /package/dist/{page-range-737B4EZW.js.map → page-range-OF5I4PQY.js.map} +0 -0
- /package/dist/{utils-6JEIFBCJ.js.map → utils-BTZ4WSYX.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -138,7 +138,7 @@ import { inflateRawSync } from "zlib";
|
|
|
138
138
|
import { DOMParser } from "@xmldom/xmldom";
|
|
139
139
|
|
|
140
140
|
// src/utils.ts
|
|
141
|
-
var VERSION = true ? "2.0.2" : "0.0.0-dev";
|
|
141
|
+
var VERSION = true ? "2.0.3" : "0.0.0-dev";
|
|
142
142
|
function toArrayBuffer(buf) {
|
|
143
143
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
144
144
|
return buf.buffer;
|
|
@@ -327,6 +327,47 @@ function sanitizeText(text) {
|
|
|
327
327
|
}
|
|
328
328
|
return result;
|
|
329
329
|
}
|
|
330
|
+
function flattenLayoutTables(blocks) {
|
|
331
|
+
const result = [];
|
|
332
|
+
for (const block of blocks) {
|
|
333
|
+
if (block.type !== "table" || !block.table) {
|
|
334
|
+
result.push(block);
|
|
335
|
+
continue;
|
|
336
|
+
}
|
|
337
|
+
const { rows: numRows, cols: numCols, cells } = block.table;
|
|
338
|
+
if (numRows === 1 && numCols === 1) {
|
|
339
|
+
result.push(block);
|
|
340
|
+
continue;
|
|
341
|
+
}
|
|
342
|
+
if (numRows <= 3) {
|
|
343
|
+
let totalNewlines = 0;
|
|
344
|
+
let totalTextLen = 0;
|
|
345
|
+
for (let r = 0; r < numRows; r++) {
|
|
346
|
+
for (let c = 0; c < numCols; c++) {
|
|
347
|
+
const t = cells[r]?.[c]?.text || "";
|
|
348
|
+
totalNewlines += (t.match(/\n/g) || []).length;
|
|
349
|
+
totalTextLen += t.length;
|
|
350
|
+
}
|
|
351
|
+
}
|
|
352
|
+
if (totalNewlines > 5 || numRows <= 2 && totalTextLen > 300) {
|
|
353
|
+
for (let r = 0; r < numRows; r++) {
|
|
354
|
+
for (let c = 0; c < numCols; c++) {
|
|
355
|
+
const cellText = cells[r]?.[c]?.text?.trim();
|
|
356
|
+
if (!cellText) continue;
|
|
357
|
+
for (const line of cellText.split("\n")) {
|
|
358
|
+
const trimmed = line.trim();
|
|
359
|
+
if (!trimmed) continue;
|
|
360
|
+
result.push({ type: "paragraph", text: trimmed, pageNumber: block.pageNumber });
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
}
|
|
364
|
+
continue;
|
|
365
|
+
}
|
|
366
|
+
}
|
|
367
|
+
result.push(block);
|
|
368
|
+
}
|
|
369
|
+
return result;
|
|
370
|
+
}
|
|
330
371
|
function blocksToMarkdown(blocks) {
|
|
331
372
|
const lines = [];
|
|
332
373
|
for (let i = 0; i < blocks.length; i++) {
|
|
@@ -1225,8 +1266,9 @@ var TAG_CHAR_SHAPE = 68;
|
|
|
1225
1266
|
var TAG_CTRL_HEADER = 71;
|
|
1226
1267
|
var TAG_LIST_HEADER = 72;
|
|
1227
1268
|
var TAG_TABLE = 77;
|
|
1228
|
-
var TAG_DOC_CHAR_SHAPE =
|
|
1229
|
-
var
|
|
1269
|
+
var TAG_DOC_CHAR_SHAPE = 21;
|
|
1270
|
+
var TAG_DOC_PARA_SHAPE = 25;
|
|
1271
|
+
var TAG_DOC_STYLE = 26;
|
|
1230
1272
|
var CHAR_LINE = 0;
|
|
1231
1273
|
var CHAR_SECTION_BREAK = 10;
|
|
1232
1274
|
var CHAR_PARA = 13;
|
|
@@ -1282,8 +1324,14 @@ function parseFileHeader(data) {
|
|
|
1282
1324
|
}
|
|
1283
1325
|
function parseDocInfo(records) {
|
|
1284
1326
|
const charShapes = [];
|
|
1327
|
+
const paraShapes = [];
|
|
1285
1328
|
const styles = [];
|
|
1286
1329
|
for (const rec of records) {
|
|
1330
|
+
if (rec.tagId === TAG_DOC_PARA_SHAPE && rec.data.length >= 4) {
|
|
1331
|
+
const flags = rec.data.readUInt32LE(0);
|
|
1332
|
+
const outlineLevel = flags >> 25 & 7;
|
|
1333
|
+
paraShapes.push({ outlineLevel });
|
|
1334
|
+
}
|
|
1287
1335
|
if (rec.tagId === TAG_DOC_CHAR_SHAPE && rec.data.length >= 18) {
|
|
1288
1336
|
if (rec.data.length >= 50) {
|
|
1289
1337
|
const fontSize = rec.data.readUInt32LE(42);
|
|
@@ -1323,7 +1371,7 @@ function parseDocInfo(records) {
|
|
|
1323
1371
|
}
|
|
1324
1372
|
}
|
|
1325
1373
|
}
|
|
1326
|
-
return { charShapes, styles };
|
|
1374
|
+
return { charShapes, paraShapes, styles };
|
|
1327
1375
|
}
|
|
1328
1376
|
function extractText(data) {
|
|
1329
1377
|
let result = "";
|
|
@@ -2334,12 +2382,13 @@ function parseHwp5Document(buffer, options) {
|
|
|
2334
2382
|
}
|
|
2335
2383
|
}
|
|
2336
2384
|
const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
|
|
2385
|
+
const flatBlocks = flattenLayoutTables(blocks);
|
|
2337
2386
|
if (docInfo) {
|
|
2338
|
-
detectHwp5Headings(blocks, docInfo);
|
|
2387
|
+
detectHwp5Headings(flatBlocks, docInfo);
|
|
2339
2388
|
}
|
|
2340
|
-
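const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));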
|
|
2341
|
-
const markdown = blocksToMarkdown(blocks);
|
|
2342
|
-
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
2389
|
+
const outline = flatBlocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
2390
|
+
const markdown = blocksToMarkdown(flatBlocks);
|
|
2391
|
+
return { markdown, blocks: flatBlocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
2343
2392
|
}
|
|
2344
2393
|
function parseDocInfoStream(cfb, compressed) {
|
|
2345
2394
|
try {
|
|
@@ -2390,16 +2439,21 @@ function detectHwp5Headings(blocks, docInfo) {
|
|
|
2390
2439
|
}
|
|
2391
2440
|
if (baseFontSize <= 0) return;
|
|
2392
2441
|
for (const block of blocks) {
|
|
2393
|
-
if (block.type
|
|
2442
|
+
if (block.type === "heading") continue;
|
|
2443
|
+
if (block.type !== "paragraph" || !block.text) continue;
|
|
2394
2444
|
const text = block.text.trim();
|
|
2395
2445
|
if (text.length === 0 || text.length > 200) continue;
|
|
2396
2446
|
if (/^\d+$/.test(text)) continue;
|
|
2397
|
-
const ratio = block.style.fontSize / baseFontSize;
|
|
2398
2447
|
let level = 0;
|
|
2399
|
-
if (
|
|
2400
|
-
|
|
2401
|
-
|
|
2402
|
-
|
|
2448
|
+
if (block.style?.fontSize && baseFontSize > 0) {
|
|
2449
|
+
const ratio = block.style.fontSize / baseFontSize;
|
|
2450
|
+
if (ratio >= HEADING_RATIO_H1) level = 1;
|
|
2451
|
+
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
2452
|
+
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
2453
|
+
}
|
|
2454
|
+
if (/^제\d+[장절편]\s/.test(text) && text.length <= 50) {
|
|
2455
|
+
if (level === 0) level = 2;
|
|
2456
|
+
} else if (/^제\d+(조의?\d*)\s*[\((]/.test(text) && text.length <= 80) {
|
|
2403
2457
|
if (level === 0) level = 3;
|
|
2404
2458
|
}
|
|
2405
2459
|
if (level > 0) {
|
|
@@ -2631,13 +2685,20 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
2631
2685
|
while (i < records.length) {
|
|
2632
2686
|
const rec = records[i];
|
|
2633
2687
|
if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
|
|
2634
|
-
const { paragraph, tables, nextIdx, charShapeIds } = parseParagraphWithTables(records, i);
|
|
2688
|
+
const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
|
|
2635
2689
|
if (paragraph) {
|
|
2636
2690
|
const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
|
|
2637
2691
|
if (docInfo && charShapeIds.length > 0) {
|
|
2638
2692
|
const style = resolveCharStyle(charShapeIds, docInfo);
|
|
2639
2693
|
if (style) block.style = style;
|
|
2640
2694
|
}
|
|
2695
|
+
if (docInfo && paraShapeId >= 0 && paraShapeId < docInfo.paraShapes.length) {
|
|
2696
|
+
const ol = docInfo.paraShapes[paraShapeId].outlineLevel;
|
|
2697
|
+
if (ol >= 1 && ol <= 6) {
|
|
2698
|
+
block.type = "heading";
|
|
2699
|
+
block.level = ol;
|
|
2700
|
+
}
|
|
2701
|
+
}
|
|
2641
2702
|
blocks.push(block);
|
|
2642
2703
|
}
|
|
2643
2704
|
for (const t of tables) blocks.push({ type: "table", table: t, pageNumber: sectionNum });
|
|
@@ -2757,6 +2818,8 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2757
2818
|
let text = "";
|
|
2758
2819
|
const tables = [];
|
|
2759
2820
|
const charShapeIds = [];
|
|
2821
|
+
const paraHeaderData = records[startIdx].data;
|
|
2822
|
+
const paraShapeId = paraHeaderData.length >= 10 ? paraHeaderData.readUInt16LE(8) : -1;
|
|
2760
2823
|
let i = startIdx + 1;
|
|
2761
2824
|
while (i < records.length) {
|
|
2762
2825
|
const rec = records[i];
|
|
@@ -2781,7 +2844,7 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
2781
2844
|
i++;
|
|
2782
2845
|
}
|
|
2783
2846
|
const trimmed = text.trim();
|
|
2784
|
-
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds };
|
|
2847
|
+
return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
|
|
2785
2848
|
}
|
|
2786
2849
|
function parseTableBlock(records, startIdx) {
|
|
2787
2850
|
const tableLevel = records[startIdx].level;
|