kordoc 0.1.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +107 -145
- package/dist/{chunk-P2BZDRLZ.js → chunk-KZMWHK72.js} +134 -73
- package/dist/cli.js +19 -7
- package/dist/index.cjs +131 -61
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +17 -9
- package/dist/index.d.ts +17 -9
- package/dist/index.js +130 -68
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +46 -13783
- package/package.json +13 -3
|
@@ -1,28 +1,20 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
var __defProp = Object.defineProperty;
|
|
3
|
-
var __require = /* @__PURE__ */ ((x) => typeof require !== "undefined" ? require : typeof Proxy !== "undefined" ? new Proxy(x, {
|
|
4
|
-
get: (a, b) => (typeof require !== "undefined" ? require : a)[b]
|
|
5
|
-
}) : x)(function(x) {
|
|
6
|
-
if (typeof require !== "undefined") return require.apply(this, arguments);
|
|
7
|
-
throw Error('Dynamic require of "' + x + '" is not supported');
|
|
8
|
-
});
|
|
9
|
-
var __export = (target, all) => {
|
|
10
|
-
for (var name in all)
|
|
11
|
-
__defProp(target, name, { get: all[name], enumerable: true });
|
|
12
|
-
};
|
|
13
2
|
|
|
14
3
|
// src/detect.ts
|
|
4
|
+
function magicBytes(buffer) {
|
|
5
|
+
return new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
|
|
6
|
+
}
|
|
15
7
|
function isHwpxFile(buffer) {
|
|
16
|
-
const
|
|
17
|
-
return
|
|
8
|
+
const b = magicBytes(buffer);
|
|
9
|
+
return b[0] === 80 && b[1] === 75 && b[2] === 3 && b[3] === 4;
|
|
18
10
|
}
|
|
19
11
|
function isOldHwpFile(buffer) {
|
|
20
|
-
const
|
|
21
|
-
return
|
|
12
|
+
const b = magicBytes(buffer);
|
|
13
|
+
return b[0] === 208 && b[1] === 207 && b[2] === 17 && b[3] === 224;
|
|
22
14
|
}
|
|
23
15
|
function isPdfFile(buffer) {
|
|
24
|
-
const
|
|
25
|
-
return
|
|
16
|
+
const b = magicBytes(buffer);
|
|
17
|
+
return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
|
|
26
18
|
}
|
|
27
19
|
function detectFormat(buffer) {
|
|
28
20
|
if (isHwpxFile(buffer)) return "hwpx";
|
|
@@ -31,29 +23,40 @@ function detectFormat(buffer) {
|
|
|
31
23
|
return "unknown";
|
|
32
24
|
}
|
|
33
25
|
|
|
26
|
+
// src/utils.ts
|
|
27
|
+
var VERSION = true ? "0.2.2" : "0.0.0-dev";
|
|
28
|
+
function toArrayBuffer(buf) {
|
|
29
|
+
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
30
|
+
}
|
|
31
|
+
|
|
34
32
|
// src/hwpx/parser.ts
|
|
35
33
|
import JSZip from "jszip";
|
|
34
|
+
import { inflateRawSync } from "zlib";
|
|
36
35
|
import { DOMParser } from "@xmldom/xmldom";
|
|
37
36
|
|
|
38
37
|
// src/table/builder.ts
|
|
38
|
+
var MAX_COLS = 200;
|
|
39
|
+
var MAX_ROWS = 1e4;
|
|
39
40
|
function buildTable(rows) {
|
|
41
|
+
if (rows.length > MAX_ROWS) rows = rows.slice(0, MAX_ROWS);
|
|
40
42
|
const numRows = rows.length;
|
|
41
|
-
const tempOccupied =
|
|
43
|
+
const tempOccupied = /* @__PURE__ */ new Set();
|
|
42
44
|
let maxCols = 0;
|
|
43
45
|
for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
|
|
44
46
|
let colIdx = 0;
|
|
45
47
|
for (const cell of rows[rowIdx]) {
|
|
46
|
-
while (colIdx <
|
|
47
|
-
if (colIdx >=
|
|
48
|
+
while (colIdx < MAX_COLS && tempOccupied.has(rowIdx * MAX_COLS + colIdx)) colIdx++;
|
|
49
|
+
if (colIdx >= MAX_COLS) break;
|
|
48
50
|
for (let r = rowIdx; r < Math.min(rowIdx + cell.rowSpan, numRows); r++) {
|
|
49
|
-
for (let c = colIdx; c < Math.min(colIdx + cell.colSpan,
|
|
50
|
-
tempOccupied
|
|
51
|
+
for (let c = colIdx; c < Math.min(colIdx + cell.colSpan, MAX_COLS); c++) {
|
|
52
|
+
tempOccupied.add(r * MAX_COLS + c);
|
|
51
53
|
}
|
|
52
54
|
}
|
|
53
55
|
colIdx += cell.colSpan;
|
|
54
56
|
if (colIdx > maxCols) maxCols = colIdx;
|
|
55
57
|
}
|
|
56
58
|
}
|
|
59
|
+
tempOccupied.clear();
|
|
57
60
|
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
58
61
|
const grid = Array.from(
|
|
59
62
|
{ length: numRows },
|
|
@@ -146,13 +149,9 @@ function tableToMarkdown(table) {
|
|
|
146
149
|
}
|
|
147
150
|
}
|
|
148
151
|
const uniqueRows = [];
|
|
149
|
-
const seen = /* @__PURE__ */ new Set();
|
|
150
152
|
for (const row of display) {
|
|
151
|
-
const
|
|
152
|
-
if (!
|
|
153
|
-
seen.add(key);
|
|
154
|
-
uniqueRows.push(row);
|
|
155
|
-
}
|
|
153
|
+
const isEmptyPlaceholder = row.every((cell) => cell === "");
|
|
154
|
+
if (!isEmptyPlaceholder) uniqueRows.push(row);
|
|
156
155
|
}
|
|
157
156
|
if (uniqueRows.length === 0) return "";
|
|
158
157
|
const md = [];
|
|
@@ -165,6 +164,14 @@ function tableToMarkdown(table) {
|
|
|
165
164
|
}
|
|
166
165
|
|
|
167
166
|
// src/hwpx/parser.ts
|
|
167
|
+
var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
|
|
168
|
+
var MAX_ZIP_ENTRIES = 500;
|
|
169
|
+
function clampSpan(val, max) {
|
|
170
|
+
return Math.max(1, Math.min(val, max));
|
|
171
|
+
}
|
|
172
|
+
function stripDtd(xml) {
|
|
173
|
+
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
174
|
+
}
|
|
168
175
|
async function parseHwpxDocument(buffer) {
|
|
169
176
|
let zip;
|
|
170
177
|
try {
|
|
@@ -174,11 +181,14 @@ async function parseHwpxDocument(buffer) {
|
|
|
174
181
|
}
|
|
175
182
|
const sectionPaths = await resolveSectionPaths(zip);
|
|
176
183
|
if (sectionPaths.length === 0) throw new Error("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
184
|
+
let totalDecompressed = 0;
|
|
177
185
|
const blocks = [];
|
|
178
186
|
for (const path of sectionPaths) {
|
|
179
187
|
const file = zip.file(path);
|
|
180
188
|
if (!file) continue;
|
|
181
189
|
const xml = await file.async("text");
|
|
190
|
+
totalDecompressed += xml.length * 2;
|
|
191
|
+
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new Error("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
182
192
|
blocks.push(...parseSectionXml(xml));
|
|
183
193
|
}
|
|
184
194
|
return blocksToMarkdown(blocks);
|
|
@@ -188,15 +198,31 @@ function extractFromBrokenZip(buffer) {
|
|
|
188
198
|
const view = new DataView(buffer);
|
|
189
199
|
let pos = 0;
|
|
190
200
|
const texts = [];
|
|
201
|
+
let totalDecompressed = 0;
|
|
202
|
+
let entryCount = 0;
|
|
191
203
|
while (pos < data.length - 30) {
|
|
192
204
|
if (data[pos] !== 80 || data[pos + 1] !== 75 || data[pos + 2] !== 3 || data[pos + 3] !== 4) break;
|
|
205
|
+
if (++entryCount > MAX_ZIP_ENTRIES) break;
|
|
193
206
|
const method = view.getUint16(pos + 8, true);
|
|
194
207
|
const compSize = view.getUint32(pos + 18, true);
|
|
195
208
|
const nameLen = view.getUint16(pos + 26, true);
|
|
196
209
|
const extraLen = view.getUint16(pos + 28, true);
|
|
210
|
+
if (nameLen > 1024 || extraLen > 65535) {
|
|
211
|
+
pos += 30 + nameLen + extraLen;
|
|
212
|
+
continue;
|
|
213
|
+
}
|
|
214
|
+
const fileStart = pos + 30 + nameLen + extraLen;
|
|
215
|
+
if (fileStart + compSize > data.length) break;
|
|
216
|
+
if (compSize === 0 && method !== 0) {
|
|
217
|
+
pos = fileStart;
|
|
218
|
+
continue;
|
|
219
|
+
}
|
|
197
220
|
const nameBytes = data.slice(pos + 30, pos + 30 + nameLen);
|
|
198
221
|
const name = new TextDecoder().decode(nameBytes);
|
|
199
|
-
|
|
222
|
+
if (name.includes("..") || name.startsWith("/")) {
|
|
223
|
+
pos = fileStart + compSize;
|
|
224
|
+
continue;
|
|
225
|
+
}
|
|
200
226
|
const fileData = data.slice(fileStart, fileStart + compSize);
|
|
201
227
|
pos = fileStart + compSize;
|
|
202
228
|
if (!name.toLowerCase().includes("section") || !name.endsWith(".xml")) continue;
|
|
@@ -205,11 +231,13 @@ function extractFromBrokenZip(buffer) {
|
|
|
205
231
|
if (method === 0) {
|
|
206
232
|
content = new TextDecoder().decode(fileData);
|
|
207
233
|
} else if (method === 8) {
|
|
208
|
-
const {
|
|
209
|
-
content = new TextDecoder().decode(
|
|
234
|
+
const decompressed = inflateRawSync(Buffer.from(fileData), { maxOutputLength: MAX_DECOMPRESS_SIZE });
|
|
235
|
+
content = new TextDecoder().decode(decompressed);
|
|
210
236
|
} else {
|
|
211
237
|
continue;
|
|
212
238
|
}
|
|
239
|
+
totalDecompressed += content.length * 2;
|
|
240
|
+
if (totalDecompressed > MAX_DECOMPRESS_SIZE) throw new Error("\uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC");
|
|
213
241
|
const sectionText = blocksToMarkdown(parseSectionXml(content));
|
|
214
242
|
if (sectionText) texts.push(sectionText);
|
|
215
243
|
} catch {
|
|
@@ -233,7 +261,7 @@ async function resolveSectionPaths(zip) {
|
|
|
233
261
|
}
|
|
234
262
|
function parseSectionPathsFromManifest(xml) {
|
|
235
263
|
const parser = new DOMParser();
|
|
236
|
-
const doc = parser.parseFromString(xml, "text/xml");
|
|
264
|
+
const doc = parser.parseFromString(stripDtd(xml), "text/xml");
|
|
237
265
|
const items = doc.getElementsByTagName("opf:item");
|
|
238
266
|
const spine = doc.getElementsByTagName("opf:itemref");
|
|
239
267
|
const isSectionId = (id) => /^s/i.test(id) || id.toLowerCase().includes("section");
|
|
@@ -260,7 +288,7 @@ function parseSectionPathsFromManifest(xml) {
|
|
|
260
288
|
}
|
|
261
289
|
function parseSectionXml(xml) {
|
|
262
290
|
const parser = new DOMParser();
|
|
263
|
-
const doc = parser.parseFromString(xml, "text/xml");
|
|
291
|
+
const doc = parser.parseFromString(stripDtd(xml), "text/xml");
|
|
264
292
|
if (!doc.documentElement) return [];
|
|
265
293
|
const blocks = [];
|
|
266
294
|
walkSection(doc.documentElement, blocks, null, []);
|
|
@@ -318,8 +346,8 @@ function walkSection(node, blocks, tableCtx, tableStack) {
|
|
|
318
346
|
if (tableCtx?.cell) {
|
|
319
347
|
const cs = parseInt(el.getAttribute("colSpan") || "1", 10);
|
|
320
348
|
const rs = parseInt(el.getAttribute("rowSpan") || "1", 10);
|
|
321
|
-
|
|
322
|
-
|
|
349
|
+
tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
|
|
350
|
+
tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
|
|
323
351
|
}
|
|
324
352
|
break;
|
|
325
353
|
case "p": {
|
|
@@ -381,7 +409,7 @@ function extractParagraphText(para) {
|
|
|
381
409
|
}
|
|
382
410
|
|
|
383
411
|
// src/hwp5/record.ts
|
|
384
|
-
import { inflateRawSync, inflateSync } from "zlib";
|
|
412
|
+
import { inflateRawSync as inflateRawSync2, inflateSync } from "zlib";
|
|
385
413
|
var TAG_PARA_HEADER = 66;
|
|
386
414
|
var TAG_PARA_TEXT = 67;
|
|
387
415
|
var TAG_CTRL_HEADER = 71;
|
|
@@ -416,16 +444,19 @@ function readRecords(data) {
|
|
|
416
444
|
}
|
|
417
445
|
return records;
|
|
418
446
|
}
|
|
447
|
+
var MAX_DECOMPRESS_SIZE2 = 100 * 1024 * 1024;
|
|
419
448
|
function decompressStream(data) {
|
|
449
|
+
const opts = { maxOutputLength: MAX_DECOMPRESS_SIZE2 };
|
|
420
450
|
if (data.length >= 2 && data[0] === 120) {
|
|
421
451
|
try {
|
|
422
|
-
return inflateSync(data);
|
|
452
|
+
return inflateSync(data, opts);
|
|
423
453
|
} catch {
|
|
424
454
|
}
|
|
425
455
|
}
|
|
426
|
-
return
|
|
456
|
+
return inflateRawSync2(data, opts);
|
|
427
457
|
}
|
|
428
458
|
function parseFileHeader(data) {
|
|
459
|
+
if (data.length < 40) throw new Error("FileHeader\uAC00 \uB108\uBB34 \uC9E7\uC2B5\uB2C8\uB2E4 (\uCD5C\uC18C 40\uBC14\uC774\uD2B8)");
|
|
429
460
|
const sig = data.subarray(0, 32).toString("utf8").replace(/\0+$/, "");
|
|
430
461
|
return {
|
|
431
462
|
signature: sig,
|
|
@@ -461,6 +492,15 @@ function extractText(data) {
|
|
|
461
492
|
const isInline = ch >= 4 && ch <= 9 || ch >= 19 && ch <= 20;
|
|
462
493
|
if ((isExt || isInline) && i + 14 <= data.length) i += 14;
|
|
463
494
|
} else if (ch >= 32) {
|
|
495
|
+
if (ch >= 55296 && ch <= 56319 && i + 1 < data.length) {
|
|
496
|
+
const lo = data.readUInt16LE(i);
|
|
497
|
+
if (lo >= 56320 && lo <= 57343) {
|
|
498
|
+
i += 2;
|
|
499
|
+
const codePoint = (ch - 55296 << 10) + (lo - 56320) + 65536;
|
|
500
|
+
result += String.fromCodePoint(codePoint);
|
|
501
|
+
break;
|
|
502
|
+
}
|
|
503
|
+
}
|
|
464
504
|
result += String.fromCharCode(ch);
|
|
465
505
|
}
|
|
466
506
|
break;
|
|
@@ -473,6 +513,8 @@ function extractText(data) {
|
|
|
473
513
|
import { createRequire } from "module";
|
|
474
514
|
var require2 = createRequire(import.meta.url);
|
|
475
515
|
var CFB = require2("cfb");
|
|
516
|
+
var MAX_SECTIONS = 100;
|
|
517
|
+
var MAX_TOTAL_DECOMPRESS = 100 * 1024 * 1024;
|
|
476
518
|
function parseHwp5Document(buffer) {
|
|
477
519
|
const cfb = CFB.parse(buffer);
|
|
478
520
|
const headerEntry = CFB.find(cfb, "/FileHeader");
|
|
@@ -485,8 +527,11 @@ function parseHwp5Document(buffer) {
|
|
|
485
527
|
const sections = findSections(cfb);
|
|
486
528
|
if (sections.length === 0) throw new Error("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
487
529
|
const blocks = [];
|
|
530
|
+
let totalDecompressed = 0;
|
|
488
531
|
for (const sectionData of sections) {
|
|
489
532
|
const data = compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
|
|
533
|
+
totalDecompressed += data.length;
|
|
534
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new Error("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
490
535
|
const records = readRecords(data);
|
|
491
536
|
blocks.push(...parseSection(records));
|
|
492
537
|
}
|
|
@@ -494,7 +539,7 @@ function parseHwp5Document(buffer) {
|
|
|
494
539
|
}
|
|
495
540
|
function findSections(cfb) {
|
|
496
541
|
const sections = [];
|
|
497
|
-
for (let i = 0; ; i++) {
|
|
542
|
+
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
498
543
|
const entry = CFB.find(cfb, `/BodyText/Section${i}`);
|
|
499
544
|
if (!entry?.content) break;
|
|
500
545
|
sections.push({ idx: i, content: Buffer.from(entry.content) });
|
|
@@ -585,20 +630,29 @@ function parseTableBlock(records, startIdx) {
|
|
|
585
630
|
return { table: buildTable(cellRows), nextIdx: i };
|
|
586
631
|
}
|
|
587
632
|
function parseCellBlock(records, startIdx, tableLevel) {
|
|
588
|
-
const
|
|
633
|
+
const rec = records[startIdx];
|
|
634
|
+
const cellLevel = rec.level;
|
|
589
635
|
const texts = [];
|
|
636
|
+
let colSpan = 1;
|
|
637
|
+
let rowSpan = 1;
|
|
638
|
+
if (rec.data.length >= 14) {
|
|
639
|
+
const cs = rec.data.readUInt16LE(10);
|
|
640
|
+
const rs = rec.data.readUInt16LE(12);
|
|
641
|
+
if (cs > 0) colSpan = Math.min(cs, MAX_COLS);
|
|
642
|
+
if (rs > 0) rowSpan = Math.min(rs, MAX_ROWS);
|
|
643
|
+
}
|
|
590
644
|
let i = startIdx + 1;
|
|
591
645
|
while (i < records.length) {
|
|
592
|
-
const
|
|
593
|
-
if (
|
|
594
|
-
if (
|
|
595
|
-
if (
|
|
596
|
-
const t = extractText(
|
|
646
|
+
const r = records[i];
|
|
647
|
+
if (r.tagId === TAG_LIST_HEADER && r.level <= cellLevel) break;
|
|
648
|
+
if (r.level <= tableLevel && (r.tagId === TAG_PARA_HEADER || r.tagId === TAG_CTRL_HEADER)) break;
|
|
649
|
+
if (r.tagId === TAG_PARA_TEXT) {
|
|
650
|
+
const t = extractText(r.data).trim();
|
|
597
651
|
if (t) texts.push(t);
|
|
598
652
|
}
|
|
599
653
|
i++;
|
|
600
654
|
}
|
|
601
|
-
return { cell: { text: texts.join("\n"), colSpan
|
|
655
|
+
return { cell: { text: texts.join("\n"), colSpan, rowSpan }, nextIdx: i };
|
|
602
656
|
}
|
|
603
657
|
function arrangeCells(rows, cols, cells) {
|
|
604
658
|
const grid = Array.from({ length: rows }, () => Array(cols).fill(null));
|
|
@@ -627,13 +681,18 @@ var pdfjsModule = null;
|
|
|
627
681
|
async function loadPdfjs() {
|
|
628
682
|
if (pdfjsModule) return pdfjsModule;
|
|
629
683
|
try {
|
|
630
|
-
|
|
684
|
+
const mod = await import("pdfjs-dist/legacy/build/pdf.mjs");
|
|
631
685
|
const req = createRequire2(import.meta.url);
|
|
632
686
|
const workerPath = req.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs");
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
687
|
+
mod.GlobalWorkerOptions.workerSrc = pathToFileURL(workerPath).href;
|
|
688
|
+
pdfjsModule = mod;
|
|
689
|
+
return mod;
|
|
690
|
+
} catch (err) {
|
|
691
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
692
|
+
if (msg.includes("Cannot find") || msg.includes("MODULE_NOT_FOUND")) {
|
|
693
|
+
return null;
|
|
694
|
+
}
|
|
695
|
+
throw new Error(`pdfjs-dist \uB85C\uB529 \uC2E4\uD328: ${msg}`);
|
|
637
696
|
}
|
|
638
697
|
}
|
|
639
698
|
async function parsePdfDocument(buffer) {
|
|
@@ -646,15 +705,14 @@ async function parsePdfDocument(buffer) {
|
|
|
646
705
|
error: "pdfjs-dist\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4. npm install pdfjs-dist"
|
|
647
706
|
};
|
|
648
707
|
}
|
|
708
|
+
const data = new Uint8Array(buffer);
|
|
709
|
+
const doc = await pdfjs.getDocument({
|
|
710
|
+
data,
|
|
711
|
+
useSystemFonts: true,
|
|
712
|
+
disableFontFace: true,
|
|
713
|
+
isEvalSupported: false
|
|
714
|
+
}).promise;
|
|
649
715
|
try {
|
|
650
|
-
const data = new Uint8Array(buffer);
|
|
651
|
-
const doc = await pdfjs.getDocument({
|
|
652
|
-
data,
|
|
653
|
-
useSystemFonts: true,
|
|
654
|
-
disableFontFace: true,
|
|
655
|
-
isEvalSupported: false,
|
|
656
|
-
workerSrc: ""
|
|
657
|
-
}).promise;
|
|
658
716
|
const pageCount = doc.numPages;
|
|
659
717
|
if (pageCount === 0) {
|
|
660
718
|
return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
|
|
@@ -689,20 +747,14 @@ async function parsePdfDocument(buffer) {
|
|
|
689
747
|
}
|
|
690
748
|
markdown = reconstructTables(markdown);
|
|
691
749
|
return { success: true, fileType: "pdf", markdown, pageCount, isImageBased: false };
|
|
692
|
-
}
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
fileType: "pdf",
|
|
696
|
-
pageCount: 0,
|
|
697
|
-
error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328"
|
|
698
|
-
};
|
|
750
|
+
} finally {
|
|
751
|
+
await doc.destroy().catch(() => {
|
|
752
|
+
});
|
|
699
753
|
}
|
|
700
754
|
}
|
|
701
755
|
function groupTextItemsByLine(items) {
|
|
702
756
|
if (items.length === 0) return [];
|
|
703
|
-
const textItems = items.filter(
|
|
704
|
-
(item) => typeof item.str === "string" && item.str.trim() !== ""
|
|
705
|
-
);
|
|
757
|
+
const textItems = items.filter((item) => typeof item.str === "string" && item.str.trim() !== "");
|
|
706
758
|
if (textItems.length === 0) return [];
|
|
707
759
|
textItems.sort((a, b) => {
|
|
708
760
|
const yDiff = b.transform[5] - a.transform[5];
|
|
@@ -760,8 +812,9 @@ function reconstructTables(text) {
|
|
|
760
812
|
function formatAsMarkdownTable(rows) {
|
|
761
813
|
const maxCols = Math.max(...rows.map((r) => r.length));
|
|
762
814
|
const normalized = rows.map((r) => {
|
|
763
|
-
|
|
764
|
-
|
|
815
|
+
const copy = [...r];
|
|
816
|
+
while (copy.length < maxCols) copy.push("");
|
|
817
|
+
return copy;
|
|
765
818
|
});
|
|
766
819
|
const lines = [];
|
|
767
820
|
lines.push("| " + normalized[0].join(" | ") + " |");
|
|
@@ -774,6 +827,9 @@ function formatAsMarkdownTable(rows) {
|
|
|
774
827
|
|
|
775
828
|
// src/index.ts
|
|
776
829
|
async function parse(buffer) {
|
|
830
|
+
if (!buffer || buffer.byteLength === 0) {
|
|
831
|
+
return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4." };
|
|
832
|
+
}
|
|
777
833
|
const format = detectFormat(buffer);
|
|
778
834
|
switch (format) {
|
|
779
835
|
case "hwpx":
|
|
@@ -803,11 +859,16 @@ async function parseHwp(buffer) {
|
|
|
803
859
|
}
|
|
804
860
|
}
|
|
805
861
|
async function parsePdf(buffer) {
|
|
806
|
-
|
|
862
|
+
try {
|
|
863
|
+
return await parsePdfDocument(buffer);
|
|
864
|
+
} catch (err) {
|
|
865
|
+
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328" };
|
|
866
|
+
}
|
|
807
867
|
}
|
|
808
868
|
|
|
809
869
|
export {
|
|
810
|
-
__export,
|
|
811
870
|
detectFormat,
|
|
871
|
+
VERSION,
|
|
872
|
+
toArrayBuffer,
|
|
812
873
|
parse
|
|
813
874
|
};
|
package/dist/cli.js
CHANGED
|
@@ -1,21 +1,31 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import {
|
|
3
|
+
VERSION,
|
|
3
4
|
detectFormat,
|
|
4
|
-
parse
|
|
5
|
-
|
|
5
|
+
parse,
|
|
6
|
+
toArrayBuffer
|
|
7
|
+
} from "./chunk-KZMWHK72.js";
|
|
6
8
|
|
|
7
9
|
// src/cli.ts
|
|
8
|
-
import { readFileSync, writeFileSync } from "fs";
|
|
10
|
+
import { readFileSync, writeFileSync, mkdirSync, statSync } from "fs";
|
|
9
11
|
import { basename, resolve } from "path";
|
|
10
12
|
import { Command } from "commander";
|
|
11
13
|
var program = new Command();
|
|
12
|
-
program.name("kordoc").description("\uBAA8\uB450 \uD30C\uC2F1\uD574\uBC84\uB9AC\uACA0\uB2E4 \u2014 HWP, HWPX, PDF \u2192 Markdown").version(
|
|
14
|
+
program.name("kordoc").description("\uBAA8\uB450 \uD30C\uC2F1\uD574\uBC84\uB9AC\uACA0\uB2E4 \u2014 HWP, HWPX, PDF \u2192 Markdown").version(VERSION).argument("<files...>", "\uBCC0\uD658\uD560 \uD30C\uC77C \uACBD\uB85C (HWP, HWPX, PDF)").option("-o, --output <path>", "\uCD9C\uB825 \uD30C\uC77C \uACBD\uB85C (\uB2E8\uC77C \uD30C\uC77C \uC2DC)").option("-d, --out-dir <dir>", "\uCD9C\uB825 \uB514\uB809\uD1A0\uB9AC (\uB2E4\uC911 \uD30C\uC77C \uC2DC)").option("--format <type>", "\uCD9C\uB825 \uD615\uC2DD: markdown (\uAE30\uBCF8) \uB610\uB294 json", "markdown").option("--silent", "\uC9C4\uD589 \uBA54\uC2DC\uC9C0 \uC228\uAE30\uAE30").action(async (files, opts) => {
|
|
13
15
|
for (const filePath of files) {
|
|
14
16
|
const absPath = resolve(filePath);
|
|
15
17
|
const fileName = basename(absPath);
|
|
16
18
|
try {
|
|
19
|
+
const fileSize = statSync(absPath).size;
|
|
20
|
+
if (fileSize > 500 * 1024 * 1024) {
|
|
21
|
+
process.stderr.write(`
|
|
22
|
+
[kordoc] SKIP: ${fileName} \u2014 \uD30C\uC77C\uC774 \uB108\uBB34 \uD07D\uB2C8\uB2E4 (${(fileSize / 1024 / 1024).toFixed(1)}MB)
|
|
23
|
+
`);
|
|
24
|
+
process.exitCode = 1;
|
|
25
|
+
continue;
|
|
26
|
+
}
|
|
17
27
|
const buffer = readFileSync(absPath);
|
|
18
|
-
const arrayBuffer =
|
|
28
|
+
const arrayBuffer = toArrayBuffer(buffer);
|
|
19
29
|
const format = detectFormat(arrayBuffer);
|
|
20
30
|
if (!opts.silent) {
|
|
21
31
|
process.stderr.write(`[kordoc] ${fileName} (${format}) ...`);
|
|
@@ -31,13 +41,15 @@ program.name("kordoc").description("\uBAA8\uB450 \uD30C\uC2F1\uD574\uBC84\uB9AC\
|
|
|
31
41
|
}
|
|
32
42
|
if (!opts.silent) process.stderr.write(` OK
|
|
33
43
|
`);
|
|
34
|
-
const output = opts.format === "json" ? JSON.stringify(result, null, 2) : result.markdown
|
|
44
|
+
const output = opts.format === "json" ? JSON.stringify(result, null, 2) : result.markdown;
|
|
35
45
|
if (opts.output && files.length === 1) {
|
|
36
46
|
writeFileSync(opts.output, output, "utf-8");
|
|
37
47
|
if (!opts.silent) process.stderr.write(` \u2192 ${opts.output}
|
|
38
48
|
`);
|
|
39
49
|
} else if (opts.outDir) {
|
|
40
|
-
|
|
50
|
+
mkdirSync(opts.outDir, { recursive: true });
|
|
51
|
+
const outExt = opts.format === "json" ? ".json" : ".md";
|
|
52
|
+
const outPath = resolve(opts.outDir, fileName.replace(/\.[^.]+$/, outExt));
|
|
41
53
|
writeFileSync(outPath, output, "utf-8");
|
|
42
54
|
if (!opts.silent) process.stderr.write(` \u2192 ${outPath}
|
|
43
55
|
`);
|